softfloat.c revision 1.9 1 1.9 martin /* $NetBSD: softfloat.c,v 1.9 2012/03/20 21:34:51 martin Exp $ */
2 1.1 bjh21
3 1.1 bjh21 /*
4 1.1 bjh21 * This version hacked for use with gcc -msoft-float by bjh21.
5 1.1 bjh21 * (Mostly a case of #ifdefing out things GCC doesn't need or provides
6 1.1 bjh21 * itself).
7 1.1 bjh21 */
8 1.1 bjh21
9 1.1 bjh21 /*
10 1.1 bjh21 * Things you may want to define:
11 1.1 bjh21 *
12 1.1 bjh21 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13 1.1 bjh21 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them
14 1.1 bjh21 * properly renamed.
15 1.1 bjh21 */
16 1.1 bjh21
17 1.1 bjh21 /*
18 1.1 bjh21 ===============================================================================
19 1.1 bjh21
20 1.1 bjh21 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 1.1 bjh21 Arithmetic Package, Release 2a.
22 1.1 bjh21
23 1.1 bjh21 Written by John R. Hauser. This work was made possible in part by the
24 1.1 bjh21 International Computer Science Institute, located at Suite 600, 1947 Center
25 1.1 bjh21 Street, Berkeley, California 94704. Funding was partially provided by the
26 1.1 bjh21 National Science Foundation under grant MIP-9311980. The original version
27 1.1 bjh21 of this code was written as part of a project to build a fixed-point vector
28 1.1 bjh21 processor in collaboration with the University of California at Berkeley,
29 1.1 bjh21 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 1.1 bjh21 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 1.1 bjh21 arithmetic/SoftFloat.html'.
32 1.1 bjh21
33 1.1 bjh21 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 1.1 bjh21 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 1.1 bjh21 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 1.1 bjh21 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 1.1 bjh21 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 1.1 bjh21
39 1.1 bjh21 Derivative works are acceptable, even for commercial purposes, so long as
40 1.1 bjh21 (1) they include prominent notice that the work is derivative, and (2) they
41 1.1 bjh21 include prominent notice akin to these four paragraphs for those parts of
42 1.1 bjh21 this code that are retained.
43 1.1 bjh21
44 1.1 bjh21 ===============================================================================
45 1.1 bjh21 */
46 1.1 bjh21
47 1.1 bjh21 #include <sys/cdefs.h>
48 1.1 bjh21 #if defined(LIBC_SCCS) && !defined(lint)
49 1.9 martin __RCSID("$NetBSD: softfloat.c,v 1.9 2012/03/20 21:34:51 martin Exp $");
50 1.1 bjh21 #endif /* LIBC_SCCS and not lint */
51 1.1 bjh21
52 1.1 bjh21 #ifdef SOFTFLOAT_FOR_GCC
53 1.1 bjh21 #include "softfloat-for-gcc.h"
54 1.1 bjh21 #endif
55 1.1 bjh21
56 1.1 bjh21 #include "milieu.h"
57 1.1 bjh21 #include "softfloat.h"
58 1.1 bjh21
59 1.1 bjh21 /*
60 1.1 bjh21 * Conversions between floats as stored in memory and floats as
61 1.1 bjh21 * SoftFloat uses them
62 1.1 bjh21 */
63 1.1 bjh21 #ifndef FLOAT64_DEMANGLE
64 1.1 bjh21 #define FLOAT64_DEMANGLE(a) (a)
65 1.1 bjh21 #endif
66 1.1 bjh21 #ifndef FLOAT64_MANGLE
67 1.1 bjh21 #define FLOAT64_MANGLE(a) (a)
68 1.1 bjh21 #endif
69 1.1 bjh21
70 1.1 bjh21 /*
71 1.1 bjh21 -------------------------------------------------------------------------------
72 1.1 bjh21 Floating-point rounding mode, extended double-precision rounding precision,
73 1.1 bjh21 and exception flags.
74 1.1 bjh21 -------------------------------------------------------------------------------
75 1.1 bjh21 */
76 1.1 bjh21 fp_rnd float_rounding_mode = float_round_nearest_even;
77 1.1 bjh21 fp_except float_exception_flags = 0;
78 1.1 bjh21 #ifdef FLOATX80
79 1.1 bjh21 int8 floatx80_rounding_precision = 80;
80 1.1 bjh21 #endif
81 1.1 bjh21
82 1.1 bjh21 /*
83 1.1 bjh21 -------------------------------------------------------------------------------
84 1.1 bjh21 Primitive arithmetic functions, including multi-word arithmetic, and
85 1.1 bjh21 division and square root approximations. (Can be specialized to target if
86 1.1 bjh21 desired.)
87 1.1 bjh21 -------------------------------------------------------------------------------
88 1.1 bjh21 */
89 1.1 bjh21 #include "softfloat-macros"
90 1.1 bjh21
91 1.1 bjh21 /*
92 1.1 bjh21 -------------------------------------------------------------------------------
93 1.1 bjh21 Functions and definitions to determine: (1) whether tininess for underflow
94 1.1 bjh21 is detected before or after rounding by default, (2) what (if anything)
95 1.1 bjh21 happens when exceptions are raised, (3) how signaling NaNs are distinguished
96 1.1 bjh21 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
97 1.1 bjh21 are propagated from function inputs to output. These details are target-
98 1.1 bjh21 specific.
99 1.1 bjh21 -------------------------------------------------------------------------------
100 1.1 bjh21 */
101 1.1 bjh21 #include "softfloat-specialize"
102 1.1 bjh21
103 1.1 bjh21 #if !defined(SOFTFLOAT_FOR_GCC) || defined(FLOATX80) || defined(FLOAT128)
104 1.1 bjh21 /*
105 1.1 bjh21 -------------------------------------------------------------------------------
106 1.1 bjh21 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
107 1.1 bjh21 and 7, and returns the properly rounded 32-bit integer corresponding to the
108 1.1 bjh21 input. If `zSign' is 1, the input is negated before being converted to an
109 1.1 bjh21 integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
110 1.1 bjh21 is simply rounded to an integer, with the inexact exception raised if the
111 1.1 bjh21 input cannot be represented exactly as an integer. However, if the fixed-
112 1.1 bjh21 point input is too large, the invalid exception is raised and the largest
113 1.1 bjh21 positive or negative integer is returned.
114 1.1 bjh21 -------------------------------------------------------------------------------
115 1.1 bjh21 */
116 1.1 bjh21 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
117 1.1 bjh21 {
118 1.1 bjh21 int8 roundingMode;
119 1.1 bjh21 flag roundNearestEven;
120 1.1 bjh21 int8 roundIncrement, roundBits;
121 1.1 bjh21 int32 z;
122 1.1 bjh21
123 1.1 bjh21 roundingMode = float_rounding_mode;
124 1.1 bjh21 roundNearestEven = ( roundingMode == float_round_nearest_even );
125 1.1 bjh21 roundIncrement = 0x40;
126 1.1 bjh21 if ( ! roundNearestEven ) {
127 1.1 bjh21 if ( roundingMode == float_round_to_zero ) {
128 1.1 bjh21 roundIncrement = 0;
129 1.1 bjh21 }
130 1.1 bjh21 else {
131 1.1 bjh21 roundIncrement = 0x7F;
132 1.1 bjh21 if ( zSign ) {
133 1.1 bjh21 if ( roundingMode == float_round_up ) roundIncrement = 0;
134 1.1 bjh21 }
135 1.1 bjh21 else {
136 1.1 bjh21 if ( roundingMode == float_round_down ) roundIncrement = 0;
137 1.1 bjh21 }
138 1.1 bjh21 }
139 1.1 bjh21 }
140 1.1 bjh21 roundBits = absZ & 0x7F;
141 1.1 bjh21 absZ = ( absZ + roundIncrement )>>7;
142 1.1 bjh21 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
143 1.1 bjh21 z = absZ;
144 1.1 bjh21 if ( zSign ) z = - z;
145 1.1 bjh21 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
146 1.1 bjh21 float_raise( float_flag_invalid );
147 1.1 bjh21 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
148 1.1 bjh21 }
149 1.1 bjh21 if ( roundBits ) float_exception_flags |= float_flag_inexact;
150 1.1 bjh21 return z;
151 1.1 bjh21
152 1.1 bjh21 }
153 1.1 bjh21
154 1.1 bjh21 /*
155 1.1 bjh21 -------------------------------------------------------------------------------
156 1.1 bjh21 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
157 1.1 bjh21 `absZ1', with binary point between bits 63 and 64 (between the input words),
158 1.1 bjh21 and returns the properly rounded 64-bit integer corresponding to the input.
159 1.1 bjh21 If `zSign' is 1, the input is negated before being converted to an integer.
160 1.1 bjh21 Ordinarily, the fixed-point input is simply rounded to an integer, with
161 1.1 bjh21 the inexact exception raised if the input cannot be represented exactly as
162 1.1 bjh21 an integer. However, if the fixed-point input is too large, the invalid
163 1.1 bjh21 exception is raised and the largest positive or negative integer is
164 1.1 bjh21 returned.
165 1.1 bjh21 -------------------------------------------------------------------------------
166 1.1 bjh21 */
167 1.1 bjh21 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
168 1.1 bjh21 {
169 1.1 bjh21 int8 roundingMode;
170 1.1 bjh21 flag roundNearestEven, increment;
171 1.1 bjh21 int64 z;
172 1.1 bjh21
173 1.1 bjh21 roundingMode = float_rounding_mode;
174 1.1 bjh21 roundNearestEven = ( roundingMode == float_round_nearest_even );
175 1.1 bjh21 increment = ( (sbits64) absZ1 < 0 );
176 1.1 bjh21 if ( ! roundNearestEven ) {
177 1.1 bjh21 if ( roundingMode == float_round_to_zero ) {
178 1.1 bjh21 increment = 0;
179 1.1 bjh21 }
180 1.1 bjh21 else {
181 1.1 bjh21 if ( zSign ) {
182 1.1 bjh21 increment = ( roundingMode == float_round_down ) && absZ1;
183 1.1 bjh21 }
184 1.1 bjh21 else {
185 1.1 bjh21 increment = ( roundingMode == float_round_up ) && absZ1;
186 1.1 bjh21 }
187 1.1 bjh21 }
188 1.1 bjh21 }
189 1.1 bjh21 if ( increment ) {
190 1.1 bjh21 ++absZ0;
191 1.1 bjh21 if ( absZ0 == 0 ) goto overflow;
192 1.1 bjh21 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
193 1.1 bjh21 }
194 1.1 bjh21 z = absZ0;
195 1.1 bjh21 if ( zSign ) z = - z;
196 1.1 bjh21 if ( z && ( ( z < 0 ) ^ zSign ) ) {
197 1.1 bjh21 overflow:
198 1.1 bjh21 float_raise( float_flag_invalid );
199 1.1 bjh21 return
200 1.1 bjh21 zSign ? (sbits64) LIT64( 0x8000000000000000 )
201 1.1 bjh21 : LIT64( 0x7FFFFFFFFFFFFFFF );
202 1.1 bjh21 }
203 1.1 bjh21 if ( absZ1 ) float_exception_flags |= float_flag_inexact;
204 1.1 bjh21 return z;
205 1.1 bjh21
206 1.1 bjh21 }
207 1.1 bjh21 #endif
208 1.1 bjh21
209 1.1 bjh21 /*
210 1.1 bjh21 -------------------------------------------------------------------------------
211 1.1 bjh21 Returns the fraction bits of the single-precision floating-point value `a'.
212 1.1 bjh21 -------------------------------------------------------------------------------
213 1.1 bjh21 */
214 1.1 bjh21 INLINE bits32 extractFloat32Frac( float32 a )
215 1.1 bjh21 {
216 1.1 bjh21
217 1.1 bjh21 return a & 0x007FFFFF;
218 1.1 bjh21
219 1.1 bjh21 }
220 1.1 bjh21
221 1.1 bjh21 /*
222 1.1 bjh21 -------------------------------------------------------------------------------
223 1.1 bjh21 Returns the exponent bits of the single-precision floating-point value `a'.
224 1.1 bjh21 -------------------------------------------------------------------------------
225 1.1 bjh21 */
226 1.1 bjh21 INLINE int16 extractFloat32Exp( float32 a )
227 1.1 bjh21 {
228 1.1 bjh21
229 1.1 bjh21 return ( a>>23 ) & 0xFF;
230 1.1 bjh21
231 1.1 bjh21 }
232 1.1 bjh21
233 1.1 bjh21 /*
234 1.1 bjh21 -------------------------------------------------------------------------------
235 1.1 bjh21 Returns the sign bit of the single-precision floating-point value `a'.
236 1.1 bjh21 -------------------------------------------------------------------------------
237 1.1 bjh21 */
238 1.1 bjh21 INLINE flag extractFloat32Sign( float32 a )
239 1.1 bjh21 {
240 1.1 bjh21
241 1.1 bjh21 return a>>31;
242 1.1 bjh21
243 1.1 bjh21 }
244 1.1 bjh21
245 1.1 bjh21 /*
246 1.1 bjh21 -------------------------------------------------------------------------------
247 1.1 bjh21 Normalizes the subnormal single-precision floating-point value represented
248 1.1 bjh21 by the denormalized significand `aSig'. The normalized exponent and
249 1.1 bjh21 significand are stored at the locations pointed to by `zExpPtr' and
250 1.1 bjh21 `zSigPtr', respectively.
251 1.1 bjh21 -------------------------------------------------------------------------------
252 1.1 bjh21 */
253 1.1 bjh21 static void
254 1.1 bjh21 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
255 1.1 bjh21 {
256 1.1 bjh21 int8 shiftCount;
257 1.1 bjh21
258 1.1 bjh21 shiftCount = countLeadingZeros32( aSig ) - 8;
259 1.1 bjh21 *zSigPtr = aSig<<shiftCount;
260 1.1 bjh21 *zExpPtr = 1 - shiftCount;
261 1.1 bjh21
262 1.1 bjh21 }
263 1.1 bjh21
264 1.1 bjh21 /*
265 1.1 bjh21 -------------------------------------------------------------------------------
266 1.1 bjh21 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
267 1.1 bjh21 single-precision floating-point value, returning the result. After being
268 1.1 bjh21 shifted into the proper positions, the three fields are simply added
269 1.1 bjh21 together to form the result. This means that any integer portion of `zSig'
270 1.1 bjh21 will be added into the exponent. Since a properly normalized significand
271 1.1 bjh21 will have an integer portion equal to 1, the `zExp' input should be 1 less
272 1.1 bjh21 than the desired result exponent whenever `zSig' is a complete, normalized
273 1.1 bjh21 significand.
274 1.1 bjh21 -------------------------------------------------------------------------------
275 1.1 bjh21 */
276 1.1 bjh21 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
277 1.1 bjh21 {
278 1.1 bjh21
279 1.1 bjh21 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
280 1.1 bjh21
281 1.1 bjh21 }
282 1.1 bjh21
283 1.1 bjh21 /*
284 1.1 bjh21 -------------------------------------------------------------------------------
285 1.1 bjh21 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
286 1.1 bjh21 and significand `zSig', and returns the proper single-precision floating-
287 1.1 bjh21 point value corresponding to the abstract input. Ordinarily, the abstract
288 1.1 bjh21 value is simply rounded and packed into the single-precision format, with
289 1.1 bjh21 the inexact exception raised if the abstract input cannot be represented
290 1.1 bjh21 exactly. However, if the abstract value is too large, the overflow and
291 1.1 bjh21 inexact exceptions are raised and an infinity or maximal finite value is
292 1.1 bjh21 returned. If the abstract value is too small, the input value is rounded to
293 1.1 bjh21 a subnormal number, and the underflow and inexact exceptions are raised if
294 1.1 bjh21 the abstract input cannot be represented exactly as a subnormal single-
295 1.1 bjh21 precision floating-point number.
296 1.1 bjh21 The input significand `zSig' has its binary point between bits 30
297 1.1 bjh21 and 29, which is 7 bits to the left of the usual location. This shifted
298 1.1 bjh21 significand must be normalized or smaller. If `zSig' is not normalized,
299 1.1 bjh21 `zExp' must be 0; in that case, the result returned is a subnormal number,
300 1.1 bjh21 and it must not require rounding. In the usual case that `zSig' is
301 1.1 bjh21 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
302 1.1 bjh21 The handling of underflow and overflow follows the IEC/IEEE Standard for
303 1.1 bjh21 Binary Floating-Point Arithmetic.
304 1.1 bjh21 -------------------------------------------------------------------------------
305 1.1 bjh21 */
306 1.1 bjh21 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
307 1.1 bjh21 {
308 1.1 bjh21 int8 roundingMode;
309 1.1 bjh21 flag roundNearestEven;
310 1.1 bjh21 int8 roundIncrement, roundBits;
311 1.1 bjh21 flag isTiny;
312 1.1 bjh21
313 1.1 bjh21 roundingMode = float_rounding_mode;
314 1.1 bjh21 roundNearestEven = ( roundingMode == float_round_nearest_even );
315 1.1 bjh21 roundIncrement = 0x40;
316 1.1 bjh21 if ( ! roundNearestEven ) {
317 1.1 bjh21 if ( roundingMode == float_round_to_zero ) {
318 1.1 bjh21 roundIncrement = 0;
319 1.1 bjh21 }
320 1.1 bjh21 else {
321 1.1 bjh21 roundIncrement = 0x7F;
322 1.1 bjh21 if ( zSign ) {
323 1.1 bjh21 if ( roundingMode == float_round_up ) roundIncrement = 0;
324 1.1 bjh21 }
325 1.1 bjh21 else {
326 1.1 bjh21 if ( roundingMode == float_round_down ) roundIncrement = 0;
327 1.1 bjh21 }
328 1.1 bjh21 }
329 1.1 bjh21 }
330 1.1 bjh21 roundBits = zSig & 0x7F;
331 1.1 bjh21 if ( 0xFD <= (bits16) zExp ) {
332 1.1 bjh21 if ( ( 0xFD < zExp )
333 1.1 bjh21 || ( ( zExp == 0xFD )
334 1.1 bjh21 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
335 1.1 bjh21 ) {
336 1.1 bjh21 float_raise( float_flag_overflow | float_flag_inexact );
337 1.1 bjh21 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
338 1.1 bjh21 }
339 1.1 bjh21 if ( zExp < 0 ) {
340 1.1 bjh21 isTiny =
341 1.1 bjh21 ( float_detect_tininess == float_tininess_before_rounding )
342 1.1 bjh21 || ( zExp < -1 )
343 1.1 bjh21 || ( zSig + roundIncrement < 0x80000000 );
344 1.1 bjh21 shift32RightJamming( zSig, - zExp, &zSig );
345 1.1 bjh21 zExp = 0;
346 1.1 bjh21 roundBits = zSig & 0x7F;
347 1.1 bjh21 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
348 1.1 bjh21 }
349 1.1 bjh21 }
350 1.1 bjh21 if ( roundBits ) float_exception_flags |= float_flag_inexact;
351 1.1 bjh21 zSig = ( zSig + roundIncrement )>>7;
352 1.1 bjh21 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
353 1.1 bjh21 if ( zSig == 0 ) zExp = 0;
354 1.1 bjh21 return packFloat32( zSign, zExp, zSig );
355 1.1 bjh21
356 1.1 bjh21 }
357 1.1 bjh21
358 1.1 bjh21 /*
359 1.1 bjh21 -------------------------------------------------------------------------------
360 1.1 bjh21 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
361 1.1 bjh21 and significand `zSig', and returns the proper single-precision floating-
362 1.1 bjh21 point value corresponding to the abstract input. This routine is just like
363 1.1 bjh21 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
364 1.1 bjh21 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
365 1.1 bjh21 floating-point exponent.
366 1.1 bjh21 -------------------------------------------------------------------------------
367 1.1 bjh21 */
368 1.1 bjh21 static float32
369 1.1 bjh21 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
370 1.1 bjh21 {
371 1.1 bjh21 int8 shiftCount;
372 1.1 bjh21
373 1.1 bjh21 shiftCount = countLeadingZeros32( zSig ) - 1;
374 1.1 bjh21 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
375 1.1 bjh21
376 1.1 bjh21 }
377 1.1 bjh21
378 1.1 bjh21 /*
379 1.1 bjh21 -------------------------------------------------------------------------------
380 1.1 bjh21 Returns the fraction bits of the double-precision floating-point value `a'.
381 1.1 bjh21 -------------------------------------------------------------------------------
382 1.1 bjh21 */
383 1.1 bjh21 INLINE bits64 extractFloat64Frac( float64 a )
384 1.1 bjh21 {
385 1.1 bjh21
386 1.1 bjh21 return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
387 1.1 bjh21
388 1.1 bjh21 }
389 1.1 bjh21
390 1.1 bjh21 /*
391 1.1 bjh21 -------------------------------------------------------------------------------
392 1.1 bjh21 Returns the exponent bits of the double-precision floating-point value `a'.
393 1.1 bjh21 -------------------------------------------------------------------------------
394 1.1 bjh21 */
395 1.1 bjh21 INLINE int16 extractFloat64Exp( float64 a )
396 1.1 bjh21 {
397 1.1 bjh21
398 1.1 bjh21 return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
399 1.1 bjh21
400 1.1 bjh21 }
401 1.1 bjh21
402 1.1 bjh21 /*
403 1.1 bjh21 -------------------------------------------------------------------------------
404 1.1 bjh21 Returns the sign bit of the double-precision floating-point value `a'.
405 1.1 bjh21 -------------------------------------------------------------------------------
406 1.1 bjh21 */
407 1.1 bjh21 INLINE flag extractFloat64Sign( float64 a )
408 1.1 bjh21 {
409 1.1 bjh21
410 1.1 bjh21 return FLOAT64_DEMANGLE(a)>>63;
411 1.1 bjh21
412 1.1 bjh21 }
413 1.1 bjh21
414 1.1 bjh21 /*
415 1.1 bjh21 -------------------------------------------------------------------------------
416 1.1 bjh21 Normalizes the subnormal double-precision floating-point value represented
417 1.1 bjh21 by the denormalized significand `aSig'. The normalized exponent and
418 1.1 bjh21 significand are stored at the locations pointed to by `zExpPtr' and
419 1.1 bjh21 `zSigPtr', respectively.
420 1.1 bjh21 -------------------------------------------------------------------------------
421 1.1 bjh21 */
422 1.1 bjh21 static void
423 1.1 bjh21 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
424 1.1 bjh21 {
425 1.1 bjh21 int8 shiftCount;
426 1.1 bjh21
427 1.1 bjh21 shiftCount = countLeadingZeros64( aSig ) - 11;
428 1.1 bjh21 *zSigPtr = aSig<<shiftCount;
429 1.1 bjh21 *zExpPtr = 1 - shiftCount;
430 1.1 bjh21
431 1.1 bjh21 }
432 1.1 bjh21
433 1.1 bjh21 /*
434 1.1 bjh21 -------------------------------------------------------------------------------
435 1.1 bjh21 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
436 1.1 bjh21 double-precision floating-point value, returning the result. After being
437 1.1 bjh21 shifted into the proper positions, the three fields are simply added
438 1.1 bjh21 together to form the result. This means that any integer portion of `zSig'
439 1.1 bjh21 will be added into the exponent. Since a properly normalized significand
440 1.1 bjh21 will have an integer portion equal to 1, the `zExp' input should be 1 less
441 1.1 bjh21 than the desired result exponent whenever `zSig' is a complete, normalized
442 1.1 bjh21 significand.
443 1.1 bjh21 -------------------------------------------------------------------------------
444 1.1 bjh21 */
445 1.1 bjh21 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
446 1.1 bjh21 {
447 1.1 bjh21
448 1.1 bjh21 return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
449 1.1 bjh21 ( ( (bits64) zExp )<<52 ) + zSig );
450 1.1 bjh21
451 1.1 bjh21 }
452 1.1 bjh21
453 1.1 bjh21 /*
454 1.1 bjh21 -------------------------------------------------------------------------------
455 1.1 bjh21 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
456 1.1 bjh21 and significand `zSig', and returns the proper double-precision floating-
457 1.1 bjh21 point value corresponding to the abstract input. Ordinarily, the abstract
458 1.1 bjh21 value is simply rounded and packed into the double-precision format, with
459 1.1 bjh21 the inexact exception raised if the abstract input cannot be represented
460 1.1 bjh21 exactly. However, if the abstract value is too large, the overflow and
461 1.1 bjh21 inexact exceptions are raised and an infinity or maximal finite value is
462 1.1 bjh21 returned. If the abstract value is too small, the input value is rounded to
463 1.1 bjh21 a subnormal number, and the underflow and inexact exceptions are raised if
464 1.1 bjh21 the abstract input cannot be represented exactly as a subnormal double-
465 1.1 bjh21 precision floating-point number.
466 1.1 bjh21 The input significand `zSig' has its binary point between bits 62
467 1.1 bjh21 and 61, which is 10 bits to the left of the usual location. This shifted
468 1.1 bjh21 significand must be normalized or smaller. If `zSig' is not normalized,
469 1.1 bjh21 `zExp' must be 0; in that case, the result returned is a subnormal number,
470 1.1 bjh21 and it must not require rounding. In the usual case that `zSig' is
471 1.1 bjh21 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
472 1.1 bjh21 The handling of underflow and overflow follows the IEC/IEEE Standard for
473 1.1 bjh21 Binary Floating-Point Arithmetic.
474 1.1 bjh21 -------------------------------------------------------------------------------
475 1.1 bjh21 */
476 1.1 bjh21 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
477 1.1 bjh21 {
478 1.1 bjh21 int8 roundingMode;
479 1.1 bjh21 flag roundNearestEven;
480 1.1 bjh21 int16 roundIncrement, roundBits;
481 1.1 bjh21 flag isTiny;
482 1.1 bjh21
483 1.1 bjh21 roundingMode = float_rounding_mode;
484 1.1 bjh21 roundNearestEven = ( roundingMode == float_round_nearest_even );
485 1.1 bjh21 roundIncrement = 0x200;
486 1.1 bjh21 if ( ! roundNearestEven ) {
487 1.1 bjh21 if ( roundingMode == float_round_to_zero ) {
488 1.1 bjh21 roundIncrement = 0;
489 1.1 bjh21 }
490 1.1 bjh21 else {
491 1.1 bjh21 roundIncrement = 0x3FF;
492 1.1 bjh21 if ( zSign ) {
493 1.1 bjh21 if ( roundingMode == float_round_up ) roundIncrement = 0;
494 1.1 bjh21 }
495 1.1 bjh21 else {
496 1.1 bjh21 if ( roundingMode == float_round_down ) roundIncrement = 0;
497 1.1 bjh21 }
498 1.1 bjh21 }
499 1.1 bjh21 }
500 1.1 bjh21 roundBits = zSig & 0x3FF;
501 1.1 bjh21 if ( 0x7FD <= (bits16) zExp ) {
502 1.1 bjh21 if ( ( 0x7FD < zExp )
503 1.1 bjh21 || ( ( zExp == 0x7FD )
504 1.1 bjh21 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
505 1.1 bjh21 ) {
506 1.1 bjh21 float_raise( float_flag_overflow | float_flag_inexact );
507 1.1 bjh21 return FLOAT64_MANGLE(
508 1.1 bjh21 FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
509 1.1 bjh21 ( roundIncrement == 0 ));
510 1.1 bjh21 }
511 1.1 bjh21 if ( zExp < 0 ) {
512 1.1 bjh21 isTiny =
513 1.1 bjh21 ( float_detect_tininess == float_tininess_before_rounding )
514 1.1 bjh21 || ( zExp < -1 )
515 1.1 bjh21 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
516 1.1 bjh21 shift64RightJamming( zSig, - zExp, &zSig );
517 1.1 bjh21 zExp = 0;
518 1.1 bjh21 roundBits = zSig & 0x3FF;
519 1.1 bjh21 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
520 1.1 bjh21 }
521 1.1 bjh21 }
522 1.1 bjh21 if ( roundBits ) float_exception_flags |= float_flag_inexact;
523 1.1 bjh21 zSig = ( zSig + roundIncrement )>>10;
524 1.1 bjh21 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
525 1.1 bjh21 if ( zSig == 0 ) zExp = 0;
526 1.1 bjh21 return packFloat64( zSign, zExp, zSig );
527 1.1 bjh21
528 1.1 bjh21 }
529 1.1 bjh21
530 1.1 bjh21 /*
531 1.1 bjh21 -------------------------------------------------------------------------------
532 1.1 bjh21 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
533 1.1 bjh21 and significand `zSig', and returns the proper double-precision floating-
534 1.1 bjh21 point value corresponding to the abstract input. This routine is just like
535 1.1 bjh21 `roundAndPackFloat64' except that `zSig' does not have to be normalized.
536 1.1 bjh21 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
537 1.1 bjh21 floating-point exponent.
538 1.1 bjh21 -------------------------------------------------------------------------------
539 1.1 bjh21 */
540 1.1 bjh21 static float64
541 1.1 bjh21 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
542 1.1 bjh21 {
543 1.1 bjh21 int8 shiftCount;
544 1.1 bjh21
545 1.1 bjh21 shiftCount = countLeadingZeros64( zSig ) - 1;
546 1.1 bjh21 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
547 1.1 bjh21
548 1.1 bjh21 }
549 1.1 bjh21
550 1.1 bjh21 #ifdef FLOATX80
551 1.1 bjh21
552 1.1 bjh21 /*
553 1.1 bjh21 -------------------------------------------------------------------------------
554 1.1 bjh21 Returns the fraction bits of the extended double-precision floating-point
555 1.1 bjh21 value `a'.
556 1.1 bjh21 -------------------------------------------------------------------------------
557 1.1 bjh21 */
558 1.1 bjh21 INLINE bits64 extractFloatx80Frac( floatx80 a )
559 1.1 bjh21 {
560 1.1 bjh21
561 1.1 bjh21 return a.low;
562 1.1 bjh21
563 1.1 bjh21 }
564 1.1 bjh21
565 1.1 bjh21 /*
566 1.1 bjh21 -------------------------------------------------------------------------------
567 1.1 bjh21 Returns the exponent bits of the extended double-precision floating-point
568 1.1 bjh21 value `a'.
569 1.1 bjh21 -------------------------------------------------------------------------------
570 1.1 bjh21 */
571 1.1 bjh21 INLINE int32 extractFloatx80Exp( floatx80 a )
572 1.1 bjh21 {
573 1.1 bjh21
574 1.1 bjh21 return a.high & 0x7FFF;
575 1.1 bjh21
576 1.1 bjh21 }
577 1.1 bjh21
578 1.1 bjh21 /*
579 1.1 bjh21 -------------------------------------------------------------------------------
580 1.1 bjh21 Returns the sign bit of the extended double-precision floating-point value
581 1.1 bjh21 `a'.
582 1.1 bjh21 -------------------------------------------------------------------------------
583 1.1 bjh21 */
584 1.1 bjh21 INLINE flag extractFloatx80Sign( floatx80 a )
585 1.1 bjh21 {
586 1.1 bjh21
587 1.1 bjh21 return a.high>>15;
588 1.1 bjh21
589 1.1 bjh21 }
590 1.1 bjh21
591 1.1 bjh21 /*
592 1.1 bjh21 -------------------------------------------------------------------------------
593 1.1 bjh21 Normalizes the subnormal extended double-precision floating-point value
594 1.1 bjh21 represented by the denormalized significand `aSig'. The normalized exponent
595 1.1 bjh21 and significand are stored at the locations pointed to by `zExpPtr' and
596 1.1 bjh21 `zSigPtr', respectively.
597 1.1 bjh21 -------------------------------------------------------------------------------
598 1.1 bjh21 */
599 1.1 bjh21 static void
600 1.1 bjh21 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
601 1.1 bjh21 {
602 1.1 bjh21 int8 shiftCount;
603 1.1 bjh21
604 1.1 bjh21 shiftCount = countLeadingZeros64( aSig );
605 1.1 bjh21 *zSigPtr = aSig<<shiftCount;
606 1.1 bjh21 *zExpPtr = 1 - shiftCount;
607 1.1 bjh21
608 1.1 bjh21 }
609 1.1 bjh21
610 1.1 bjh21 /*
611 1.1 bjh21 -------------------------------------------------------------------------------
612 1.1 bjh21 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
613 1.1 bjh21 extended double-precision floating-point value, returning the result.
614 1.1 bjh21 -------------------------------------------------------------------------------
615 1.1 bjh21 */
616 1.1 bjh21 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
617 1.1 bjh21 {
618 1.1 bjh21 floatx80 z;
619 1.1 bjh21
620 1.1 bjh21 z.low = zSig;
621 1.1 bjh21 z.high = ( ( (bits16) zSign )<<15 ) + zExp;
622 1.1 bjh21 return z;
623 1.1 bjh21
624 1.1 bjh21 }
625 1.1 bjh21
626 1.1 bjh21 /*
627 1.1 bjh21 -------------------------------------------------------------------------------
628 1.1 bjh21 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
629 1.1 bjh21 and extended significand formed by the concatenation of `zSig0' and `zSig1',
630 1.1 bjh21 and returns the proper extended double-precision floating-point value
631 1.1 bjh21 corresponding to the abstract input. Ordinarily, the abstract value is
632 1.1 bjh21 rounded and packed into the extended double-precision format, with the
633 1.1 bjh21 inexact exception raised if the abstract input cannot be represented
634 1.1 bjh21 exactly. However, if the abstract value is too large, the overflow and
635 1.1 bjh21 inexact exceptions are raised and an infinity or maximal finite value is
636 1.1 bjh21 returned. If the abstract value is too small, the input value is rounded to
637 1.1 bjh21 a subnormal number, and the underflow and inexact exceptions are raised if
638 1.1 bjh21 the abstract input cannot be represented exactly as a subnormal extended
639 1.1 bjh21 double-precision floating-point number.
640 1.1 bjh21 If `roundingPrecision' is 32 or 64, the result is rounded to the same
641 1.1 bjh21 number of bits as single or double precision, respectively. Otherwise, the
642 1.1 bjh21 result is rounded to the full precision of the extended double-precision
643 1.1 bjh21 format.
644 1.1 bjh21 The input significand must be normalized or smaller. If the input
645 1.1 bjh21 significand is not normalized, `zExp' must be 0; in that case, the result
646 1.1 bjh21 returned is a subnormal number, and it must not require rounding. The
647 1.1 bjh21 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
648 1.1 bjh21 Floating-Point Arithmetic.
649 1.1 bjh21 -------------------------------------------------------------------------------
650 1.1 bjh21 */
651 1.1 bjh21 static floatx80
652 1.1 bjh21 roundAndPackFloatx80(
653 1.1 bjh21 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
654 1.1 bjh21 )
655 1.1 bjh21 {
656 1.1 bjh21 int8 roundingMode;
657 1.1 bjh21 flag roundNearestEven, increment, isTiny;
658 1.1 bjh21 int64 roundIncrement, roundMask, roundBits;
659 1.1 bjh21
660 1.1 bjh21 roundingMode = float_rounding_mode;
661 1.1 bjh21 roundNearestEven = ( roundingMode == float_round_nearest_even );
662 1.1 bjh21 if ( roundingPrecision == 80 ) goto precision80;
663 1.1 bjh21 if ( roundingPrecision == 64 ) {
664 1.1 bjh21 roundIncrement = LIT64( 0x0000000000000400 );
665 1.1 bjh21 roundMask = LIT64( 0x00000000000007FF );
666 1.1 bjh21 }
667 1.1 bjh21 else if ( roundingPrecision == 32 ) {
668 1.1 bjh21 roundIncrement = LIT64( 0x0000008000000000 );
669 1.1 bjh21 roundMask = LIT64( 0x000000FFFFFFFFFF );
670 1.1 bjh21 }
671 1.1 bjh21 else {
672 1.1 bjh21 goto precision80;
673 1.1 bjh21 }
674 1.1 bjh21 zSig0 |= ( zSig1 != 0 );
675 1.1 bjh21 if ( ! roundNearestEven ) {
676 1.1 bjh21 if ( roundingMode == float_round_to_zero ) {
677 1.1 bjh21 roundIncrement = 0;
678 1.1 bjh21 }
679 1.1 bjh21 else {
680 1.1 bjh21 roundIncrement = roundMask;
681 1.1 bjh21 if ( zSign ) {
682 1.1 bjh21 if ( roundingMode == float_round_up ) roundIncrement = 0;
683 1.1 bjh21 }
684 1.1 bjh21 else {
685 1.1 bjh21 if ( roundingMode == float_round_down ) roundIncrement = 0;
686 1.1 bjh21 }
687 1.1 bjh21 }
688 1.1 bjh21 }
689 1.1 bjh21 roundBits = zSig0 & roundMask;
690 1.1 bjh21 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
691 1.1 bjh21 if ( ( 0x7FFE < zExp )
692 1.1 bjh21 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
693 1.1 bjh21 ) {
694 1.1 bjh21 goto overflow;
695 1.1 bjh21 }
696 1.1 bjh21 if ( zExp <= 0 ) {
697 1.1 bjh21 isTiny =
698 1.1 bjh21 ( float_detect_tininess == float_tininess_before_rounding )
699 1.1 bjh21 || ( zExp < 0 )
700 1.1 bjh21 || ( zSig0 <= zSig0 + roundIncrement );
701 1.1 bjh21 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
702 1.1 bjh21 zExp = 0;
703 1.1 bjh21 roundBits = zSig0 & roundMask;
704 1.1 bjh21 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
705 1.1 bjh21 if ( roundBits ) float_exception_flags |= float_flag_inexact;
706 1.1 bjh21 zSig0 += roundIncrement;
707 1.1 bjh21 if ( (sbits64) zSig0 < 0 ) zExp = 1;
708 1.1 bjh21 roundIncrement = roundMask + 1;
709 1.1 bjh21 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
710 1.1 bjh21 roundMask |= roundIncrement;
711 1.1 bjh21 }
712 1.1 bjh21 zSig0 &= ~ roundMask;
713 1.1 bjh21 return packFloatx80( zSign, zExp, zSig0 );
714 1.1 bjh21 }
715 1.1 bjh21 }
716 1.1 bjh21 if ( roundBits ) float_exception_flags |= float_flag_inexact;
717 1.1 bjh21 zSig0 += roundIncrement;
718 1.1 bjh21 if ( zSig0 < roundIncrement ) {
719 1.1 bjh21 ++zExp;
720 1.1 bjh21 zSig0 = LIT64( 0x8000000000000000 );
721 1.1 bjh21 }
722 1.1 bjh21 roundIncrement = roundMask + 1;
723 1.1 bjh21 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
724 1.1 bjh21 roundMask |= roundIncrement;
725 1.1 bjh21 }
726 1.1 bjh21 zSig0 &= ~ roundMask;
727 1.1 bjh21 if ( zSig0 == 0 ) zExp = 0;
728 1.1 bjh21 return packFloatx80( zSign, zExp, zSig0 );
729 1.1 bjh21 precision80:
730 1.1 bjh21 increment = ( (sbits64) zSig1 < 0 );
731 1.1 bjh21 if ( ! roundNearestEven ) {
732 1.1 bjh21 if ( roundingMode == float_round_to_zero ) {
733 1.1 bjh21 increment = 0;
734 1.1 bjh21 }
735 1.1 bjh21 else {
736 1.1 bjh21 if ( zSign ) {
737 1.1 bjh21 increment = ( roundingMode == float_round_down ) && zSig1;
738 1.1 bjh21 }
739 1.1 bjh21 else {
740 1.1 bjh21 increment = ( roundingMode == float_round_up ) && zSig1;
741 1.1 bjh21 }
742 1.1 bjh21 }
743 1.1 bjh21 }
744 1.1 bjh21 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
745 1.1 bjh21 if ( ( 0x7FFE < zExp )
746 1.1 bjh21 || ( ( zExp == 0x7FFE )
747 1.1 bjh21 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
748 1.1 bjh21 && increment
749 1.1 bjh21 )
750 1.1 bjh21 ) {
751 1.1 bjh21 roundMask = 0;
752 1.1 bjh21 overflow:
753 1.1 bjh21 float_raise( float_flag_overflow | float_flag_inexact );
754 1.1 bjh21 if ( ( roundingMode == float_round_to_zero )
755 1.1 bjh21 || ( zSign && ( roundingMode == float_round_up ) )
756 1.1 bjh21 || ( ! zSign && ( roundingMode == float_round_down ) )
757 1.1 bjh21 ) {
758 1.1 bjh21 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
759 1.1 bjh21 }
760 1.1 bjh21 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
761 1.1 bjh21 }
762 1.1 bjh21 if ( zExp <= 0 ) {
763 1.1 bjh21 isTiny =
764 1.1 bjh21 ( float_detect_tininess == float_tininess_before_rounding )
765 1.1 bjh21 || ( zExp < 0 )
766 1.1 bjh21 || ! increment
767 1.1 bjh21 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
768 1.1 bjh21 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
769 1.1 bjh21 zExp = 0;
770 1.1 bjh21 if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
771 1.1 bjh21 if ( zSig1 ) float_exception_flags |= float_flag_inexact;
772 1.1 bjh21 if ( roundNearestEven ) {
773 1.1 bjh21 increment = ( (sbits64) zSig1 < 0 );
774 1.1 bjh21 }
775 1.1 bjh21 else {
776 1.1 bjh21 if ( zSign ) {
777 1.1 bjh21 increment = ( roundingMode == float_round_down ) && zSig1;
778 1.1 bjh21 }
779 1.1 bjh21 else {
780 1.1 bjh21 increment = ( roundingMode == float_round_up ) && zSig1;
781 1.1 bjh21 }
782 1.1 bjh21 }
783 1.1 bjh21 if ( increment ) {
784 1.1 bjh21 ++zSig0;
785 1.1 bjh21 zSig0 &=
786 1.1 bjh21 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
787 1.1 bjh21 if ( (sbits64) zSig0 < 0 ) zExp = 1;
788 1.1 bjh21 }
789 1.1 bjh21 return packFloatx80( zSign, zExp, zSig0 );
790 1.1 bjh21 }
791 1.1 bjh21 }
792 1.1 bjh21 if ( zSig1 ) float_exception_flags |= float_flag_inexact;
793 1.1 bjh21 if ( increment ) {
794 1.1 bjh21 ++zSig0;
795 1.1 bjh21 if ( zSig0 == 0 ) {
796 1.1 bjh21 ++zExp;
797 1.1 bjh21 zSig0 = LIT64( 0x8000000000000000 );
798 1.1 bjh21 }
799 1.1 bjh21 else {
800 1.1 bjh21 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
801 1.1 bjh21 }
802 1.1 bjh21 }
803 1.1 bjh21 else {
804 1.1 bjh21 if ( zSig0 == 0 ) zExp = 0;
805 1.1 bjh21 }
806 1.1 bjh21 return packFloatx80( zSign, zExp, zSig0 );
807 1.1 bjh21
808 1.1 bjh21 }
809 1.1 bjh21
810 1.1 bjh21 /*
811 1.1 bjh21 -------------------------------------------------------------------------------
812 1.1 bjh21 Takes an abstract floating-point value having sign `zSign', exponent
813 1.1 bjh21 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
814 1.1 bjh21 and returns the proper extended double-precision floating-point value
815 1.1 bjh21 corresponding to the abstract input. This routine is just like
816 1.1 bjh21 `roundAndPackFloatx80' except that the input significand does not have to be
817 1.1 bjh21 normalized.
818 1.1 bjh21 -------------------------------------------------------------------------------
819 1.1 bjh21 */
820 1.1 bjh21 static floatx80
821 1.1 bjh21 normalizeRoundAndPackFloatx80(
822 1.1 bjh21 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
823 1.1 bjh21 )
824 1.1 bjh21 {
825 1.1 bjh21 int8 shiftCount;
826 1.1 bjh21
827 1.1 bjh21 if ( zSig0 == 0 ) {
828 1.1 bjh21 zSig0 = zSig1;
829 1.1 bjh21 zSig1 = 0;
830 1.1 bjh21 zExp -= 64;
831 1.1 bjh21 }
832 1.1 bjh21 shiftCount = countLeadingZeros64( zSig0 );
833 1.1 bjh21 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
834 1.1 bjh21 zExp -= shiftCount;
835 1.1 bjh21 return
836 1.1 bjh21 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
837 1.1 bjh21
838 1.1 bjh21 }
839 1.1 bjh21
840 1.1 bjh21 #endif
841 1.1 bjh21
842 1.1 bjh21 #ifdef FLOAT128
843 1.1 bjh21
844 1.1 bjh21 /*
845 1.1 bjh21 -------------------------------------------------------------------------------
846 1.1 bjh21 Returns the least-significant 64 fraction bits of the quadruple-precision
847 1.1 bjh21 floating-point value `a'.
848 1.1 bjh21 -------------------------------------------------------------------------------
849 1.1 bjh21 */
850 1.1 bjh21 INLINE bits64 extractFloat128Frac1( float128 a )
851 1.1 bjh21 {
852 1.1 bjh21
853 1.1 bjh21 return a.low;
854 1.1 bjh21
855 1.1 bjh21 }
856 1.1 bjh21
857 1.1 bjh21 /*
858 1.1 bjh21 -------------------------------------------------------------------------------
859 1.1 bjh21 Returns the most-significant 48 fraction bits of the quadruple-precision
860 1.1 bjh21 floating-point value `a'.
861 1.1 bjh21 -------------------------------------------------------------------------------
862 1.1 bjh21 */
863 1.1 bjh21 INLINE bits64 extractFloat128Frac0( float128 a )
864 1.1 bjh21 {
865 1.1 bjh21
866 1.1 bjh21 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
867 1.1 bjh21
868 1.1 bjh21 }
869 1.1 bjh21
870 1.1 bjh21 /*
871 1.1 bjh21 -------------------------------------------------------------------------------
872 1.1 bjh21 Returns the exponent bits of the quadruple-precision floating-point value
873 1.1 bjh21 `a'.
874 1.1 bjh21 -------------------------------------------------------------------------------
875 1.1 bjh21 */
876 1.1 bjh21 INLINE int32 extractFloat128Exp( float128 a )
877 1.1 bjh21 {
878 1.1 bjh21
879 1.1 bjh21 return ( a.high>>48 ) & 0x7FFF;
880 1.1 bjh21
881 1.1 bjh21 }
882 1.1 bjh21
883 1.1 bjh21 /*
884 1.1 bjh21 -------------------------------------------------------------------------------
885 1.1 bjh21 Returns the sign bit of the quadruple-precision floating-point value `a'.
886 1.1 bjh21 -------------------------------------------------------------------------------
887 1.1 bjh21 */
888 1.1 bjh21 INLINE flag extractFloat128Sign( float128 a )
889 1.1 bjh21 {
890 1.1 bjh21
891 1.1 bjh21 return a.high>>63;
892 1.1 bjh21
893 1.1 bjh21 }
894 1.1 bjh21
895 1.1 bjh21 /*
896 1.1 bjh21 -------------------------------------------------------------------------------
897 1.1 bjh21 Normalizes the subnormal quadruple-precision floating-point value
898 1.1 bjh21 represented by the denormalized significand formed by the concatenation of
899 1.1 bjh21 `aSig0' and `aSig1'. The normalized exponent is stored at the location
900 1.1 bjh21 pointed to by `zExpPtr'. The most significant 49 bits of the normalized
901 1.1 bjh21 significand are stored at the location pointed to by `zSig0Ptr', and the
902 1.1 bjh21 least significant 64 bits of the normalized significand are stored at the
903 1.1 bjh21 location pointed to by `zSig1Ptr'.
904 1.1 bjh21 -------------------------------------------------------------------------------
905 1.1 bjh21 */
906 1.1 bjh21 static void
907 1.1 bjh21 normalizeFloat128Subnormal(
908 1.1 bjh21 bits64 aSig0,
909 1.1 bjh21 bits64 aSig1,
910 1.1 bjh21 int32 *zExpPtr,
911 1.1 bjh21 bits64 *zSig0Ptr,
912 1.1 bjh21 bits64 *zSig1Ptr
913 1.1 bjh21 )
914 1.1 bjh21 {
915 1.1 bjh21 int8 shiftCount;
916 1.1 bjh21
917 1.1 bjh21 if ( aSig0 == 0 ) {
918 1.1 bjh21 shiftCount = countLeadingZeros64( aSig1 ) - 15;
919 1.1 bjh21 if ( shiftCount < 0 ) {
920 1.1 bjh21 *zSig0Ptr = aSig1>>( - shiftCount );
921 1.1 bjh21 *zSig1Ptr = aSig1<<( shiftCount & 63 );
922 1.1 bjh21 }
923 1.1 bjh21 else {
924 1.1 bjh21 *zSig0Ptr = aSig1<<shiftCount;
925 1.1 bjh21 *zSig1Ptr = 0;
926 1.1 bjh21 }
927 1.1 bjh21 *zExpPtr = - shiftCount - 63;
928 1.1 bjh21 }
929 1.1 bjh21 else {
930 1.1 bjh21 shiftCount = countLeadingZeros64( aSig0 ) - 15;
931 1.1 bjh21 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
932 1.1 bjh21 *zExpPtr = 1 - shiftCount;
933 1.1 bjh21 }
934 1.1 bjh21
935 1.1 bjh21 }
936 1.1 bjh21
937 1.1 bjh21 /*
938 1.1 bjh21 -------------------------------------------------------------------------------
939 1.1 bjh21 Packs the sign `zSign', the exponent `zExp', and the significand formed
940 1.1 bjh21 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
941 1.1 bjh21 floating-point value, returning the result. After being shifted into the
942 1.1 bjh21 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
943 1.1 bjh21 added together to form the most significant 32 bits of the result. This
944 1.1 bjh21 means that any integer portion of `zSig0' will be added into the exponent.
945 1.1 bjh21 Since a properly normalized significand will have an integer portion equal
946 1.1 bjh21 to 1, the `zExp' input should be 1 less than the desired result exponent
947 1.1 bjh21 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
948 1.1 bjh21 significand.
949 1.1 bjh21 -------------------------------------------------------------------------------
950 1.1 bjh21 */
951 1.1 bjh21 INLINE float128
952 1.1 bjh21 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
953 1.1 bjh21 {
954 1.1 bjh21 float128 z;
955 1.1 bjh21
956 1.1 bjh21 z.low = zSig1;
957 1.1 bjh21 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
958 1.1 bjh21 return z;
959 1.1 bjh21
960 1.1 bjh21 }
961 1.1 bjh21
962 1.1 bjh21 /*
963 1.1 bjh21 -------------------------------------------------------------------------------
964 1.1 bjh21 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
965 1.1 bjh21 and extended significand formed by the concatenation of `zSig0', `zSig1',
966 1.1 bjh21 and `zSig2', and returns the proper quadruple-precision floating-point value
967 1.1 bjh21 corresponding to the abstract input. Ordinarily, the abstract value is
968 1.1 bjh21 simply rounded and packed into the quadruple-precision format, with the
969 1.1 bjh21 inexact exception raised if the abstract input cannot be represented
970 1.1 bjh21 exactly. However, if the abstract value is too large, the overflow and
971 1.1 bjh21 inexact exceptions are raised and an infinity or maximal finite value is
972 1.1 bjh21 returned. If the abstract value is too small, the input value is rounded to
973 1.1 bjh21 a subnormal number, and the underflow and inexact exceptions are raised if
974 1.1 bjh21 the abstract input cannot be represented exactly as a subnormal quadruple-
975 1.1 bjh21 precision floating-point number.
976 1.1 bjh21 The input significand must be normalized or smaller. If the input
977 1.1 bjh21 significand is not normalized, `zExp' must be 0; in that case, the result
978 1.1 bjh21 returned is a subnormal number, and it must not require rounding. In the
979 1.1 bjh21 usual case that the input significand is normalized, `zExp' must be 1 less
980 1.1 bjh21 than the ``true'' floating-point exponent. The handling of underflow and
981 1.1 bjh21 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
982 1.1 bjh21 -------------------------------------------------------------------------------
983 1.1 bjh21 */
984 1.1 bjh21 static float128
985 1.1 bjh21 roundAndPackFloat128(
986 1.1 bjh21 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
987 1.1 bjh21 {
988 1.1 bjh21 int8 roundingMode;
989 1.1 bjh21 flag roundNearestEven, increment, isTiny;
990 1.1 bjh21
991 1.1 bjh21 roundingMode = float_rounding_mode;
992 1.1 bjh21 roundNearestEven = ( roundingMode == float_round_nearest_even );
993 1.1 bjh21 increment = ( (sbits64) zSig2 < 0 );
994 1.1 bjh21 if ( ! roundNearestEven ) {
995 1.1 bjh21 if ( roundingMode == float_round_to_zero ) {
996 1.1 bjh21 increment = 0;
997 1.1 bjh21 }
998 1.1 bjh21 else {
999 1.1 bjh21 if ( zSign ) {
1000 1.1 bjh21 increment = ( roundingMode == float_round_down ) && zSig2;
1001 1.1 bjh21 }
1002 1.1 bjh21 else {
1003 1.1 bjh21 increment = ( roundingMode == float_round_up ) && zSig2;
1004 1.1 bjh21 }
1005 1.1 bjh21 }
1006 1.1 bjh21 }
1007 1.1 bjh21 if ( 0x7FFD <= (bits32) zExp ) {
1008 1.1 bjh21 if ( ( 0x7FFD < zExp )
1009 1.1 bjh21 || ( ( zExp == 0x7FFD )
1010 1.1 bjh21 && eq128(
1011 1.1 bjh21 LIT64( 0x0001FFFFFFFFFFFF ),
1012 1.1 bjh21 LIT64( 0xFFFFFFFFFFFFFFFF ),
1013 1.1 bjh21 zSig0,
1014 1.1 bjh21 zSig1
1015 1.1 bjh21 )
1016 1.1 bjh21 && increment
1017 1.1 bjh21 )
1018 1.1 bjh21 ) {
1019 1.1 bjh21 float_raise( float_flag_overflow | float_flag_inexact );
1020 1.1 bjh21 if ( ( roundingMode == float_round_to_zero )
1021 1.1 bjh21 || ( zSign && ( roundingMode == float_round_up ) )
1022 1.1 bjh21 || ( ! zSign && ( roundingMode == float_round_down ) )
1023 1.1 bjh21 ) {
1024 1.1 bjh21 return
1025 1.1 bjh21 packFloat128(
1026 1.1 bjh21 zSign,
1027 1.1 bjh21 0x7FFE,
1028 1.1 bjh21 LIT64( 0x0000FFFFFFFFFFFF ),
1029 1.1 bjh21 LIT64( 0xFFFFFFFFFFFFFFFF )
1030 1.1 bjh21 );
1031 1.1 bjh21 }
1032 1.1 bjh21 return packFloat128( zSign, 0x7FFF, 0, 0 );
1033 1.1 bjh21 }
1034 1.1 bjh21 if ( zExp < 0 ) {
1035 1.1 bjh21 isTiny =
1036 1.1 bjh21 ( float_detect_tininess == float_tininess_before_rounding )
1037 1.1 bjh21 || ( zExp < -1 )
1038 1.1 bjh21 || ! increment
1039 1.1 bjh21 || lt128(
1040 1.1 bjh21 zSig0,
1041 1.1 bjh21 zSig1,
1042 1.1 bjh21 LIT64( 0x0001FFFFFFFFFFFF ),
1043 1.1 bjh21 LIT64( 0xFFFFFFFFFFFFFFFF )
1044 1.1 bjh21 );
1045 1.1 bjh21 shift128ExtraRightJamming(
1046 1.1 bjh21 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1047 1.1 bjh21 zExp = 0;
1048 1.1 bjh21 if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1049 1.1 bjh21 if ( roundNearestEven ) {
1050 1.1 bjh21 increment = ( (sbits64) zSig2 < 0 );
1051 1.1 bjh21 }
1052 1.1 bjh21 else {
1053 1.1 bjh21 if ( zSign ) {
1054 1.1 bjh21 increment = ( roundingMode == float_round_down ) && zSig2;
1055 1.1 bjh21 }
1056 1.1 bjh21 else {
1057 1.1 bjh21 increment = ( roundingMode == float_round_up ) && zSig2;
1058 1.1 bjh21 }
1059 1.1 bjh21 }
1060 1.1 bjh21 }
1061 1.1 bjh21 }
1062 1.1 bjh21 if ( zSig2 ) float_exception_flags |= float_flag_inexact;
1063 1.1 bjh21 if ( increment ) {
1064 1.1 bjh21 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1065 1.1 bjh21 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1066 1.1 bjh21 }
1067 1.1 bjh21 else {
1068 1.1 bjh21 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1069 1.1 bjh21 }
1070 1.1 bjh21 return packFloat128( zSign, zExp, zSig0, zSig1 );
1071 1.1 bjh21
1072 1.1 bjh21 }
1073 1.1 bjh21
1074 1.1 bjh21 /*
1075 1.1 bjh21 -------------------------------------------------------------------------------
1076 1.1 bjh21 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1077 1.1 bjh21 and significand formed by the concatenation of `zSig0' and `zSig1', and
1078 1.1 bjh21 returns the proper quadruple-precision floating-point value corresponding
1079 1.1 bjh21 to the abstract input. This routine is just like `roundAndPackFloat128'
1080 1.1 bjh21 except that the input significand has fewer bits and does not have to be
1081 1.1 bjh21 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1082 1.1 bjh21 point exponent.
1083 1.1 bjh21 -------------------------------------------------------------------------------
1084 1.1 bjh21 */
1085 1.1 bjh21 static float128
1086 1.1 bjh21 normalizeRoundAndPackFloat128(
1087 1.1 bjh21 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1088 1.1 bjh21 {
1089 1.1 bjh21 int8 shiftCount;
1090 1.1 bjh21 bits64 zSig2;
1091 1.1 bjh21
1092 1.1 bjh21 if ( zSig0 == 0 ) {
1093 1.1 bjh21 zSig0 = zSig1;
1094 1.1 bjh21 zSig1 = 0;
1095 1.1 bjh21 zExp -= 64;
1096 1.1 bjh21 }
1097 1.1 bjh21 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1098 1.1 bjh21 if ( 0 <= shiftCount ) {
1099 1.1 bjh21 zSig2 = 0;
1100 1.1 bjh21 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1101 1.1 bjh21 }
1102 1.1 bjh21 else {
1103 1.1 bjh21 shift128ExtraRightJamming(
1104 1.1 bjh21 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1105 1.1 bjh21 }
1106 1.1 bjh21 zExp -= shiftCount;
1107 1.1 bjh21 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1108 1.1 bjh21
1109 1.1 bjh21 }
1110 1.1 bjh21
1111 1.1 bjh21 #endif
1112 1.1 bjh21
1113 1.1 bjh21 /*
1114 1.1 bjh21 -------------------------------------------------------------------------------
1115 1.1 bjh21 Returns the result of converting the 32-bit two's complement integer `a'
1116 1.1 bjh21 to the single-precision floating-point format. The conversion is performed
1117 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1118 1.1 bjh21 -------------------------------------------------------------------------------
1119 1.1 bjh21 */
1120 1.1 bjh21 float32 int32_to_float32( int32 a )
1121 1.1 bjh21 {
1122 1.1 bjh21 flag zSign;
1123 1.1 bjh21
1124 1.1 bjh21 if ( a == 0 ) return 0;
1125 1.1 bjh21 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1126 1.1 bjh21 zSign = ( a < 0 );
1127 1.1 bjh21 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1128 1.1 bjh21
1129 1.1 bjh21 }
1130 1.1 bjh21
1131 1.6 matt float32 uint32_to_float32( uint32 a )
1132 1.6 matt {
1133 1.6 matt if ( a == 0 ) return 0;
1134 1.6 matt if ( a & (bits32) 0x80000000 )
1135 1.6 matt return normalizeRoundAndPackFloat32( 0, 0x9D, a >> 1 );
1136 1.6 matt return normalizeRoundAndPackFloat32( 0, 0x9C, a );
1137 1.6 matt }
1138 1.6 matt
1139 1.6 matt
1140 1.1 bjh21 /*
1141 1.1 bjh21 -------------------------------------------------------------------------------
1142 1.1 bjh21 Returns the result of converting the 32-bit two's complement integer `a'
1143 1.1 bjh21 to the double-precision floating-point format. The conversion is performed
1144 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1145 1.1 bjh21 -------------------------------------------------------------------------------
1146 1.1 bjh21 */
1147 1.1 bjh21 float64 int32_to_float64( int32 a )
1148 1.1 bjh21 {
1149 1.1 bjh21 flag zSign;
1150 1.1 bjh21 uint32 absA;
1151 1.1 bjh21 int8 shiftCount;
1152 1.1 bjh21 bits64 zSig;
1153 1.1 bjh21
1154 1.1 bjh21 if ( a == 0 ) return 0;
1155 1.1 bjh21 zSign = ( a < 0 );
1156 1.1 bjh21 absA = zSign ? - a : a;
1157 1.1 bjh21 shiftCount = countLeadingZeros32( absA ) + 21;
1158 1.1 bjh21 zSig = absA;
1159 1.1 bjh21 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1160 1.1 bjh21
1161 1.1 bjh21 }
1162 1.1 bjh21
1163 1.6 matt float64 uint32_to_float64( uint32 a )
1164 1.6 matt {
1165 1.6 matt int8 shiftCount;
1166 1.6 matt bits64 zSig = a;
1167 1.6 matt
1168 1.6 matt if ( a == 0 ) return 0;
1169 1.6 matt shiftCount = countLeadingZeros32( a ) + 21;
1170 1.6 matt return packFloat64( 0, 0x432 - shiftCount, zSig<<shiftCount );
1171 1.6 matt
1172 1.6 matt }
1173 1.6 matt
1174 1.1 bjh21 #ifdef FLOATX80
1175 1.1 bjh21
1176 1.1 bjh21 /*
1177 1.1 bjh21 -------------------------------------------------------------------------------
1178 1.1 bjh21 Returns the result of converting the 32-bit two's complement integer `a'
1179 1.1 bjh21 to the extended double-precision floating-point format. The conversion
1180 1.1 bjh21 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1181 1.1 bjh21 Arithmetic.
1182 1.1 bjh21 -------------------------------------------------------------------------------
1183 1.1 bjh21 */
1184 1.1 bjh21 floatx80 int32_to_floatx80( int32 a )
1185 1.1 bjh21 {
1186 1.1 bjh21 flag zSign;
1187 1.1 bjh21 uint32 absA;
1188 1.1 bjh21 int8 shiftCount;
1189 1.1 bjh21 bits64 zSig;
1190 1.1 bjh21
1191 1.1 bjh21 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1192 1.1 bjh21 zSign = ( a < 0 );
1193 1.1 bjh21 absA = zSign ? - a : a;
1194 1.1 bjh21 shiftCount = countLeadingZeros32( absA ) + 32;
1195 1.1 bjh21 zSig = absA;
1196 1.1 bjh21 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1197 1.1 bjh21
1198 1.1 bjh21 }
1199 1.1 bjh21
1200 1.6 matt floatx80 uint32_to_floatx80( uint32 a )
1201 1.6 matt {
1202 1.6 matt int8 shiftCount;
1203 1.6 matt bits64 zSig = a;
1204 1.6 matt
1205 1.6 matt if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1206 1.6 matt shiftCount = countLeadingZeros32( a ) + 32;
1207 1.6 matt return packFloatx80( 0, 0x403E - shiftCount, zSig<<shiftCount );
1208 1.6 matt
1209 1.6 matt }
1210 1.6 matt
1211 1.1 bjh21 #endif
1212 1.1 bjh21
1213 1.1 bjh21 #ifdef FLOAT128
1214 1.1 bjh21
1215 1.1 bjh21 /*
1216 1.1 bjh21 -------------------------------------------------------------------------------
1217 1.1 bjh21 Returns the result of converting the 32-bit two's complement integer `a' to
1218 1.1 bjh21 the quadruple-precision floating-point format. The conversion is performed
1219 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1220 1.1 bjh21 -------------------------------------------------------------------------------
1221 1.1 bjh21 */
1222 1.1 bjh21 float128 int32_to_float128( int32 a )
1223 1.1 bjh21 {
1224 1.1 bjh21 flag zSign;
1225 1.1 bjh21 uint32 absA;
1226 1.1 bjh21 int8 shiftCount;
1227 1.1 bjh21 bits64 zSig0;
1228 1.1 bjh21
1229 1.1 bjh21 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1230 1.1 bjh21 zSign = ( a < 0 );
1231 1.1 bjh21 absA = zSign ? - a : a;
1232 1.1 bjh21 shiftCount = countLeadingZeros32( absA ) + 17;
1233 1.1 bjh21 zSig0 = absA;
1234 1.1 bjh21 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1235 1.1 bjh21
1236 1.1 bjh21 }
1237 1.1 bjh21
1238 1.6 matt float128 uint32_to_float128( uint32 a )
1239 1.6 matt {
1240 1.6 matt int8 shiftCount;
1241 1.6 matt bits64 zSig0 = a;
1242 1.6 matt
1243 1.6 matt if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1244 1.6 matt shiftCount = countLeadingZeros32( a ) + 17;
1245 1.6 matt return packFloat128( 0, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1246 1.6 matt
1247 1.6 matt }
1248 1.6 matt
1249 1.1 bjh21 #endif
1250 1.1 bjh21
1251 1.1 bjh21 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1252 1.1 bjh21 /*
1253 1.1 bjh21 -------------------------------------------------------------------------------
1254 1.1 bjh21 Returns the result of converting the 64-bit two's complement integer `a'
1255 1.1 bjh21 to the single-precision floating-point format. The conversion is performed
1256 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1257 1.1 bjh21 -------------------------------------------------------------------------------
1258 1.1 bjh21 */
1259 1.1 bjh21 float32 int64_to_float32( int64 a )
1260 1.1 bjh21 {
1261 1.1 bjh21 flag zSign;
1262 1.1 bjh21 uint64 absA;
1263 1.1 bjh21 int8 shiftCount;
1264 1.1 bjh21
1265 1.1 bjh21 if ( a == 0 ) return 0;
1266 1.1 bjh21 zSign = ( a < 0 );
1267 1.1 bjh21 absA = zSign ? - a : a;
1268 1.1 bjh21 shiftCount = countLeadingZeros64( absA ) - 40;
1269 1.1 bjh21 if ( 0 <= shiftCount ) {
1270 1.1 bjh21 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1271 1.1 bjh21 }
1272 1.1 bjh21 else {
1273 1.1 bjh21 shiftCount += 7;
1274 1.1 bjh21 if ( shiftCount < 0 ) {
1275 1.1 bjh21 shift64RightJamming( absA, - shiftCount, &absA );
1276 1.1 bjh21 }
1277 1.1 bjh21 else {
1278 1.1 bjh21 absA <<= shiftCount;
1279 1.1 bjh21 }
1280 1.1 bjh21 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1281 1.1 bjh21 }
1282 1.1 bjh21
1283 1.1 bjh21 }
1284 1.1 bjh21
1285 1.1 bjh21 /*
1286 1.1 bjh21 -------------------------------------------------------------------------------
1287 1.1 bjh21 Returns the result of converting the 64-bit two's complement integer `a'
1288 1.1 bjh21 to the double-precision floating-point format. The conversion is performed
1289 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1290 1.1 bjh21 -------------------------------------------------------------------------------
1291 1.1 bjh21 */
1292 1.1 bjh21 float64 int64_to_float64( int64 a )
1293 1.1 bjh21 {
1294 1.1 bjh21 flag zSign;
1295 1.1 bjh21
1296 1.1 bjh21 if ( a == 0 ) return 0;
1297 1.1 bjh21 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1298 1.1 bjh21 return packFloat64( 1, 0x43E, 0 );
1299 1.1 bjh21 }
1300 1.1 bjh21 zSign = ( a < 0 );
1301 1.1 bjh21 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1302 1.1 bjh21
1303 1.1 bjh21 }
1304 1.1 bjh21
1305 1.1 bjh21 #ifdef FLOATX80
1306 1.1 bjh21
1307 1.1 bjh21 /*
1308 1.1 bjh21 -------------------------------------------------------------------------------
1309 1.1 bjh21 Returns the result of converting the 64-bit two's complement integer `a'
1310 1.1 bjh21 to the extended double-precision floating-point format. The conversion
1311 1.1 bjh21 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1312 1.1 bjh21 Arithmetic.
1313 1.1 bjh21 -------------------------------------------------------------------------------
1314 1.1 bjh21 */
1315 1.1 bjh21 floatx80 int64_to_floatx80( int64 a )
1316 1.1 bjh21 {
1317 1.1 bjh21 flag zSign;
1318 1.1 bjh21 uint64 absA;
1319 1.1 bjh21 int8 shiftCount;
1320 1.1 bjh21
1321 1.1 bjh21 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1322 1.1 bjh21 zSign = ( a < 0 );
1323 1.1 bjh21 absA = zSign ? - a : a;
1324 1.1 bjh21 shiftCount = countLeadingZeros64( absA );
1325 1.1 bjh21 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1326 1.1 bjh21
1327 1.1 bjh21 }
1328 1.1 bjh21
1329 1.1 bjh21 #endif
1330 1.1 bjh21
1331 1.1 bjh21 #endif /* !SOFTFLOAT_FOR_GCC */
1332 1.1 bjh21
1333 1.1 bjh21 #ifdef FLOAT128
1334 1.1 bjh21
1335 1.1 bjh21 /*
1336 1.1 bjh21 -------------------------------------------------------------------------------
1337 1.1 bjh21 Returns the result of converting the 64-bit two's complement integer `a' to
1338 1.1 bjh21 the quadruple-precision floating-point format. The conversion is performed
1339 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1340 1.1 bjh21 -------------------------------------------------------------------------------
1341 1.1 bjh21 */
1342 1.1 bjh21 float128 int64_to_float128( int64 a )
1343 1.1 bjh21 {
1344 1.1 bjh21 flag zSign;
1345 1.1 bjh21 uint64 absA;
1346 1.1 bjh21 int8 shiftCount;
1347 1.1 bjh21 int32 zExp;
1348 1.1 bjh21 bits64 zSig0, zSig1;
1349 1.1 bjh21
1350 1.1 bjh21 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1351 1.1 bjh21 zSign = ( a < 0 );
1352 1.1 bjh21 absA = zSign ? - a : a;
1353 1.1 bjh21 shiftCount = countLeadingZeros64( absA ) + 49;
1354 1.1 bjh21 zExp = 0x406E - shiftCount;
1355 1.1 bjh21 if ( 64 <= shiftCount ) {
1356 1.1 bjh21 zSig1 = 0;
1357 1.1 bjh21 zSig0 = absA;
1358 1.1 bjh21 shiftCount -= 64;
1359 1.1 bjh21 }
1360 1.1 bjh21 else {
1361 1.1 bjh21 zSig1 = absA;
1362 1.1 bjh21 zSig0 = 0;
1363 1.1 bjh21 }
1364 1.1 bjh21 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1365 1.1 bjh21 return packFloat128( zSign, zExp, zSig0, zSig1 );
1366 1.1 bjh21
1367 1.1 bjh21 }
1368 1.1 bjh21
1369 1.1 bjh21 #endif
1370 1.1 bjh21
1371 1.1 bjh21 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1372 1.1 bjh21 /*
1373 1.1 bjh21 -------------------------------------------------------------------------------
1374 1.1 bjh21 Returns the result of converting the single-precision floating-point value
1375 1.1 bjh21 `a' to the 32-bit two's complement integer format. The conversion is
1376 1.1 bjh21 performed according to the IEC/IEEE Standard for Binary Floating-Point
1377 1.1 bjh21 Arithmetic---which means in particular that the conversion is rounded
1378 1.1 bjh21 according to the current rounding mode. If `a' is a NaN, the largest
1379 1.1 bjh21 positive integer is returned. Otherwise, if the conversion overflows, the
1380 1.1 bjh21 largest integer with the same sign as `a' is returned.
1381 1.1 bjh21 -------------------------------------------------------------------------------
1382 1.1 bjh21 */
1383 1.1 bjh21 int32 float32_to_int32( float32 a )
1384 1.1 bjh21 {
1385 1.1 bjh21 flag aSign;
1386 1.1 bjh21 int16 aExp, shiftCount;
1387 1.1 bjh21 bits32 aSig;
1388 1.1 bjh21 bits64 aSig64;
1389 1.1 bjh21
1390 1.1 bjh21 aSig = extractFloat32Frac( a );
1391 1.1 bjh21 aExp = extractFloat32Exp( a );
1392 1.1 bjh21 aSign = extractFloat32Sign( a );
1393 1.1 bjh21 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1394 1.1 bjh21 if ( aExp ) aSig |= 0x00800000;
1395 1.1 bjh21 shiftCount = 0xAF - aExp;
1396 1.1 bjh21 aSig64 = aSig;
1397 1.1 bjh21 aSig64 <<= 32;
1398 1.1 bjh21 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1399 1.1 bjh21 return roundAndPackInt32( aSign, aSig64 );
1400 1.1 bjh21
1401 1.1 bjh21 }
1402 1.1 bjh21 #endif /* !SOFTFLOAT_FOR_GCC */
1403 1.1 bjh21
1404 1.1 bjh21 /*
1405 1.1 bjh21 -------------------------------------------------------------------------------
1406 1.1 bjh21 Returns the result of converting the single-precision floating-point value
1407 1.1 bjh21 `a' to the 32-bit two's complement integer format. The conversion is
1408 1.1 bjh21 performed according to the IEC/IEEE Standard for Binary Floating-Point
1409 1.1 bjh21 Arithmetic, except that the conversion is always rounded toward zero.
1410 1.1 bjh21 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1411 1.1 bjh21 the conversion overflows, the largest integer with the same sign as `a' is
1412 1.1 bjh21 returned.
1413 1.1 bjh21 -------------------------------------------------------------------------------
1414 1.1 bjh21 */
1415 1.1 bjh21 int32 float32_to_int32_round_to_zero( float32 a )
1416 1.1 bjh21 {
1417 1.1 bjh21 flag aSign;
1418 1.1 bjh21 int16 aExp, shiftCount;
1419 1.1 bjh21 bits32 aSig;
1420 1.1 bjh21 int32 z;
1421 1.1 bjh21
1422 1.1 bjh21 aSig = extractFloat32Frac( a );
1423 1.1 bjh21 aExp = extractFloat32Exp( a );
1424 1.1 bjh21 aSign = extractFloat32Sign( a );
1425 1.1 bjh21 shiftCount = aExp - 0x9E;
1426 1.1 bjh21 if ( 0 <= shiftCount ) {
1427 1.1 bjh21 if ( a != 0xCF000000 ) {
1428 1.1 bjh21 float_raise( float_flag_invalid );
1429 1.1 bjh21 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1430 1.1 bjh21 }
1431 1.1 bjh21 return (sbits32) 0x80000000;
1432 1.1 bjh21 }
1433 1.1 bjh21 else if ( aExp <= 0x7E ) {
1434 1.1 bjh21 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1435 1.1 bjh21 return 0;
1436 1.1 bjh21 }
1437 1.1 bjh21 aSig = ( aSig | 0x00800000 )<<8;
1438 1.1 bjh21 z = aSig>>( - shiftCount );
1439 1.1 bjh21 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1440 1.1 bjh21 float_exception_flags |= float_flag_inexact;
1441 1.1 bjh21 }
1442 1.1 bjh21 if ( aSign ) z = - z;
1443 1.1 bjh21 return z;
1444 1.1 bjh21
1445 1.1 bjh21 }
1446 1.1 bjh21
1447 1.1 bjh21 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1448 1.1 bjh21 /*
1449 1.1 bjh21 -------------------------------------------------------------------------------
1450 1.1 bjh21 Returns the result of converting the single-precision floating-point value
1451 1.1 bjh21 `a' to the 64-bit two's complement integer format. The conversion is
1452 1.1 bjh21 performed according to the IEC/IEEE Standard for Binary Floating-Point
1453 1.1 bjh21 Arithmetic---which means in particular that the conversion is rounded
1454 1.1 bjh21 according to the current rounding mode. If `a' is a NaN, the largest
1455 1.1 bjh21 positive integer is returned. Otherwise, if the conversion overflows, the
1456 1.1 bjh21 largest integer with the same sign as `a' is returned.
1457 1.1 bjh21 -------------------------------------------------------------------------------
1458 1.1 bjh21 */
1459 1.1 bjh21 int64 float32_to_int64( float32 a )
1460 1.1 bjh21 {
1461 1.1 bjh21 flag aSign;
1462 1.1 bjh21 int16 aExp, shiftCount;
1463 1.1 bjh21 bits32 aSig;
1464 1.1 bjh21 bits64 aSig64, aSigExtra;
1465 1.1 bjh21
1466 1.1 bjh21 aSig = extractFloat32Frac( a );
1467 1.1 bjh21 aExp = extractFloat32Exp( a );
1468 1.1 bjh21 aSign = extractFloat32Sign( a );
1469 1.1 bjh21 shiftCount = 0xBE - aExp;
1470 1.1 bjh21 if ( shiftCount < 0 ) {
1471 1.1 bjh21 float_raise( float_flag_invalid );
1472 1.1 bjh21 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1473 1.1 bjh21 return LIT64( 0x7FFFFFFFFFFFFFFF );
1474 1.1 bjh21 }
1475 1.1 bjh21 return (sbits64) LIT64( 0x8000000000000000 );
1476 1.1 bjh21 }
1477 1.1 bjh21 if ( aExp ) aSig |= 0x00800000;
1478 1.1 bjh21 aSig64 = aSig;
1479 1.1 bjh21 aSig64 <<= 40;
1480 1.1 bjh21 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1481 1.1 bjh21 return roundAndPackInt64( aSign, aSig64, aSigExtra );
1482 1.1 bjh21
1483 1.1 bjh21 }
1484 1.1 bjh21
1485 1.1 bjh21 /*
1486 1.1 bjh21 -------------------------------------------------------------------------------
1487 1.1 bjh21 Returns the result of converting the single-precision floating-point value
1488 1.1 bjh21 `a' to the 64-bit two's complement integer format. The conversion is
1489 1.1 bjh21 performed according to the IEC/IEEE Standard for Binary Floating-Point
1490 1.1 bjh21 Arithmetic, except that the conversion is always rounded toward zero. If
1491 1.1 bjh21 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1492 1.1 bjh21 conversion overflows, the largest integer with the same sign as `a' is
1493 1.1 bjh21 returned.
1494 1.1 bjh21 -------------------------------------------------------------------------------
1495 1.1 bjh21 */
1496 1.1 bjh21 int64 float32_to_int64_round_to_zero( float32 a )
1497 1.1 bjh21 {
1498 1.1 bjh21 flag aSign;
1499 1.1 bjh21 int16 aExp, shiftCount;
1500 1.1 bjh21 bits32 aSig;
1501 1.1 bjh21 bits64 aSig64;
1502 1.1 bjh21 int64 z;
1503 1.1 bjh21
1504 1.1 bjh21 aSig = extractFloat32Frac( a );
1505 1.1 bjh21 aExp = extractFloat32Exp( a );
1506 1.1 bjh21 aSign = extractFloat32Sign( a );
1507 1.1 bjh21 shiftCount = aExp - 0xBE;
1508 1.1 bjh21 if ( 0 <= shiftCount ) {
1509 1.1 bjh21 if ( a != 0xDF000000 ) {
1510 1.1 bjh21 float_raise( float_flag_invalid );
1511 1.1 bjh21 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1512 1.1 bjh21 return LIT64( 0x7FFFFFFFFFFFFFFF );
1513 1.1 bjh21 }
1514 1.1 bjh21 }
1515 1.1 bjh21 return (sbits64) LIT64( 0x8000000000000000 );
1516 1.1 bjh21 }
1517 1.1 bjh21 else if ( aExp <= 0x7E ) {
1518 1.1 bjh21 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1519 1.1 bjh21 return 0;
1520 1.1 bjh21 }
1521 1.1 bjh21 aSig64 = aSig | 0x00800000;
1522 1.1 bjh21 aSig64 <<= 40;
1523 1.1 bjh21 z = aSig64>>( - shiftCount );
1524 1.1 bjh21 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1525 1.1 bjh21 float_exception_flags |= float_flag_inexact;
1526 1.1 bjh21 }
1527 1.1 bjh21 if ( aSign ) z = - z;
1528 1.1 bjh21 return z;
1529 1.1 bjh21
1530 1.1 bjh21 }
1531 1.1 bjh21 #endif /* !SOFTFLOAT_FOR_GCC */
1532 1.1 bjh21
1533 1.1 bjh21 /*
1534 1.1 bjh21 -------------------------------------------------------------------------------
1535 1.1 bjh21 Returns the result of converting the single-precision floating-point value
1536 1.1 bjh21 `a' to the double-precision floating-point format. The conversion is
1537 1.1 bjh21 performed according to the IEC/IEEE Standard for Binary Floating-Point
1538 1.1 bjh21 Arithmetic.
1539 1.1 bjh21 -------------------------------------------------------------------------------
1540 1.1 bjh21 */
1541 1.1 bjh21 float64 float32_to_float64( float32 a )
1542 1.1 bjh21 {
1543 1.1 bjh21 flag aSign;
1544 1.1 bjh21 int16 aExp;
1545 1.1 bjh21 bits32 aSig;
1546 1.1 bjh21
1547 1.1 bjh21 aSig = extractFloat32Frac( a );
1548 1.1 bjh21 aExp = extractFloat32Exp( a );
1549 1.1 bjh21 aSign = extractFloat32Sign( a );
1550 1.1 bjh21 if ( aExp == 0xFF ) {
1551 1.1 bjh21 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1552 1.1 bjh21 return packFloat64( aSign, 0x7FF, 0 );
1553 1.1 bjh21 }
1554 1.1 bjh21 if ( aExp == 0 ) {
1555 1.1 bjh21 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1556 1.1 bjh21 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1557 1.1 bjh21 --aExp;
1558 1.1 bjh21 }
1559 1.1 bjh21 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1560 1.1 bjh21
1561 1.1 bjh21 }
1562 1.1 bjh21
1563 1.1 bjh21 #ifdef FLOATX80
1564 1.1 bjh21
1565 1.1 bjh21 /*
1566 1.1 bjh21 -------------------------------------------------------------------------------
1567 1.1 bjh21 Returns the result of converting the single-precision floating-point value
1568 1.1 bjh21 `a' to the extended double-precision floating-point format. The conversion
1569 1.1 bjh21 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1570 1.1 bjh21 Arithmetic.
1571 1.1 bjh21 -------------------------------------------------------------------------------
1572 1.1 bjh21 */
1573 1.1 bjh21 floatx80 float32_to_floatx80( float32 a )
1574 1.1 bjh21 {
1575 1.1 bjh21 flag aSign;
1576 1.1 bjh21 int16 aExp;
1577 1.1 bjh21 bits32 aSig;
1578 1.1 bjh21
1579 1.1 bjh21 aSig = extractFloat32Frac( a );
1580 1.1 bjh21 aExp = extractFloat32Exp( a );
1581 1.1 bjh21 aSign = extractFloat32Sign( a );
1582 1.1 bjh21 if ( aExp == 0xFF ) {
1583 1.1 bjh21 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1584 1.1 bjh21 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1585 1.1 bjh21 }
1586 1.1 bjh21 if ( aExp == 0 ) {
1587 1.1 bjh21 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1588 1.1 bjh21 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1589 1.1 bjh21 }
1590 1.1 bjh21 aSig |= 0x00800000;
1591 1.1 bjh21 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1592 1.1 bjh21
1593 1.1 bjh21 }
1594 1.1 bjh21
1595 1.1 bjh21 #endif
1596 1.1 bjh21
1597 1.1 bjh21 #ifdef FLOAT128
1598 1.1 bjh21
1599 1.1 bjh21 /*
1600 1.1 bjh21 -------------------------------------------------------------------------------
1601 1.1 bjh21 Returns the result of converting the single-precision floating-point value
1602 1.1 bjh21 `a' to the double-precision floating-point format. The conversion is
1603 1.1 bjh21 performed according to the IEC/IEEE Standard for Binary Floating-Point
1604 1.1 bjh21 Arithmetic.
1605 1.1 bjh21 -------------------------------------------------------------------------------
1606 1.1 bjh21 */
1607 1.1 bjh21 float128 float32_to_float128( float32 a )
1608 1.1 bjh21 {
1609 1.1 bjh21 flag aSign;
1610 1.1 bjh21 int16 aExp;
1611 1.1 bjh21 bits32 aSig;
1612 1.1 bjh21
1613 1.1 bjh21 aSig = extractFloat32Frac( a );
1614 1.1 bjh21 aExp = extractFloat32Exp( a );
1615 1.1 bjh21 aSign = extractFloat32Sign( a );
1616 1.1 bjh21 if ( aExp == 0xFF ) {
1617 1.1 bjh21 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1618 1.1 bjh21 return packFloat128( aSign, 0x7FFF, 0, 0 );
1619 1.1 bjh21 }
1620 1.1 bjh21 if ( aExp == 0 ) {
1621 1.1 bjh21 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1622 1.1 bjh21 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1623 1.1 bjh21 --aExp;
1624 1.1 bjh21 }
1625 1.1 bjh21 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1626 1.1 bjh21
1627 1.1 bjh21 }
1628 1.1 bjh21
1629 1.1 bjh21 #endif
1630 1.1 bjh21
1631 1.1 bjh21 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1632 1.1 bjh21 /*
1633 1.1 bjh21 -------------------------------------------------------------------------------
1634 1.1 bjh21 Rounds the single-precision floating-point value `a' to an integer, and
1635 1.1 bjh21 returns the result as a single-precision floating-point value. The
1636 1.1 bjh21 operation is performed according to the IEC/IEEE Standard for Binary
1637 1.1 bjh21 Floating-Point Arithmetic.
1638 1.1 bjh21 -------------------------------------------------------------------------------
1639 1.1 bjh21 */
1640 1.1 bjh21 float32 float32_round_to_int( float32 a )
1641 1.1 bjh21 {
1642 1.1 bjh21 flag aSign;
1643 1.1 bjh21 int16 aExp;
1644 1.1 bjh21 bits32 lastBitMask, roundBitsMask;
1645 1.1 bjh21 int8 roundingMode;
1646 1.1 bjh21 float32 z;
1647 1.1 bjh21
1648 1.1 bjh21 aExp = extractFloat32Exp( a );
1649 1.1 bjh21 if ( 0x96 <= aExp ) {
1650 1.1 bjh21 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1651 1.1 bjh21 return propagateFloat32NaN( a, a );
1652 1.1 bjh21 }
1653 1.1 bjh21 return a;
1654 1.1 bjh21 }
1655 1.1 bjh21 if ( aExp <= 0x7E ) {
1656 1.1 bjh21 if ( (bits32) ( a<<1 ) == 0 ) return a;
1657 1.1 bjh21 float_exception_flags |= float_flag_inexact;
1658 1.1 bjh21 aSign = extractFloat32Sign( a );
1659 1.1 bjh21 switch ( float_rounding_mode ) {
1660 1.1 bjh21 case float_round_nearest_even:
1661 1.1 bjh21 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1662 1.1 bjh21 return packFloat32( aSign, 0x7F, 0 );
1663 1.1 bjh21 }
1664 1.1 bjh21 break;
1665 1.1 bjh21 case float_round_to_zero:
1666 1.1 bjh21 break;
1667 1.1 bjh21 case float_round_down:
1668 1.1 bjh21 return aSign ? 0xBF800000 : 0;
1669 1.1 bjh21 case float_round_up:
1670 1.1 bjh21 return aSign ? 0x80000000 : 0x3F800000;
1671 1.1 bjh21 }
1672 1.1 bjh21 return packFloat32( aSign, 0, 0 );
1673 1.1 bjh21 }
1674 1.1 bjh21 lastBitMask = 1;
1675 1.1 bjh21 lastBitMask <<= 0x96 - aExp;
1676 1.1 bjh21 roundBitsMask = lastBitMask - 1;
1677 1.1 bjh21 z = a;
1678 1.1 bjh21 roundingMode = float_rounding_mode;
1679 1.1 bjh21 if ( roundingMode == float_round_nearest_even ) {
1680 1.1 bjh21 z += lastBitMask>>1;
1681 1.1 bjh21 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1682 1.1 bjh21 }
1683 1.1 bjh21 else if ( roundingMode != float_round_to_zero ) {
1684 1.1 bjh21 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1685 1.1 bjh21 z += roundBitsMask;
1686 1.1 bjh21 }
1687 1.1 bjh21 }
1688 1.1 bjh21 z &= ~ roundBitsMask;
1689 1.1 bjh21 if ( z != a ) float_exception_flags |= float_flag_inexact;
1690 1.1 bjh21 return z;
1691 1.1 bjh21
1692 1.1 bjh21 }
1693 1.1 bjh21 #endif /* !SOFTFLOAT_FOR_GCC */
1694 1.1 bjh21
1695 1.1 bjh21 /*
1696 1.1 bjh21 -------------------------------------------------------------------------------
1697 1.1 bjh21 Returns the result of adding the absolute values of the single-precision
1698 1.1 bjh21 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1699 1.1 bjh21 before being returned. `zSign' is ignored if the result is a NaN.
1700 1.1 bjh21 The addition is performed according to the IEC/IEEE Standard for Binary
1701 1.1 bjh21 Floating-Point Arithmetic.
1702 1.1 bjh21 -------------------------------------------------------------------------------
1703 1.1 bjh21 */
1704 1.1 bjh21 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1705 1.1 bjh21 {
1706 1.1 bjh21 int16 aExp, bExp, zExp;
1707 1.1 bjh21 bits32 aSig, bSig, zSig;
1708 1.1 bjh21 int16 expDiff;
1709 1.1 bjh21
1710 1.1 bjh21 aSig = extractFloat32Frac( a );
1711 1.1 bjh21 aExp = extractFloat32Exp( a );
1712 1.1 bjh21 bSig = extractFloat32Frac( b );
1713 1.1 bjh21 bExp = extractFloat32Exp( b );
1714 1.1 bjh21 expDiff = aExp - bExp;
1715 1.1 bjh21 aSig <<= 6;
1716 1.1 bjh21 bSig <<= 6;
1717 1.1 bjh21 if ( 0 < expDiff ) {
1718 1.1 bjh21 if ( aExp == 0xFF ) {
1719 1.1 bjh21 if ( aSig ) return propagateFloat32NaN( a, b );
1720 1.1 bjh21 return a;
1721 1.1 bjh21 }
1722 1.1 bjh21 if ( bExp == 0 ) {
1723 1.1 bjh21 --expDiff;
1724 1.1 bjh21 }
1725 1.1 bjh21 else {
1726 1.1 bjh21 bSig |= 0x20000000;
1727 1.1 bjh21 }
1728 1.1 bjh21 shift32RightJamming( bSig, expDiff, &bSig );
1729 1.1 bjh21 zExp = aExp;
1730 1.1 bjh21 }
1731 1.1 bjh21 else if ( expDiff < 0 ) {
1732 1.1 bjh21 if ( bExp == 0xFF ) {
1733 1.1 bjh21 if ( bSig ) return propagateFloat32NaN( a, b );
1734 1.1 bjh21 return packFloat32( zSign, 0xFF, 0 );
1735 1.1 bjh21 }
1736 1.1 bjh21 if ( aExp == 0 ) {
1737 1.1 bjh21 ++expDiff;
1738 1.1 bjh21 }
1739 1.1 bjh21 else {
1740 1.1 bjh21 aSig |= 0x20000000;
1741 1.1 bjh21 }
1742 1.1 bjh21 shift32RightJamming( aSig, - expDiff, &aSig );
1743 1.1 bjh21 zExp = bExp;
1744 1.1 bjh21 }
1745 1.1 bjh21 else {
1746 1.1 bjh21 if ( aExp == 0xFF ) {
1747 1.1 bjh21 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1748 1.1 bjh21 return a;
1749 1.1 bjh21 }
1750 1.1 bjh21 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1751 1.1 bjh21 zSig = 0x40000000 + aSig + bSig;
1752 1.1 bjh21 zExp = aExp;
1753 1.1 bjh21 goto roundAndPack;
1754 1.1 bjh21 }
1755 1.1 bjh21 aSig |= 0x20000000;
1756 1.1 bjh21 zSig = ( aSig + bSig )<<1;
1757 1.1 bjh21 --zExp;
1758 1.1 bjh21 if ( (sbits32) zSig < 0 ) {
1759 1.1 bjh21 zSig = aSig + bSig;
1760 1.1 bjh21 ++zExp;
1761 1.1 bjh21 }
1762 1.1 bjh21 roundAndPack:
1763 1.1 bjh21 return roundAndPackFloat32( zSign, zExp, zSig );
1764 1.1 bjh21
1765 1.1 bjh21 }
1766 1.1 bjh21
1767 1.1 bjh21 /*
1768 1.1 bjh21 -------------------------------------------------------------------------------
1769 1.1 bjh21 Returns the result of subtracting the absolute values of the single-
1770 1.1 bjh21 precision floating-point values `a' and `b'. If `zSign' is 1, the
1771 1.1 bjh21 difference is negated before being returned. `zSign' is ignored if the
1772 1.1 bjh21 result is a NaN. The subtraction is performed according to the IEC/IEEE
1773 1.1 bjh21 Standard for Binary Floating-Point Arithmetic.
1774 1.1 bjh21 -------------------------------------------------------------------------------
1775 1.1 bjh21 */
1776 1.1 bjh21 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1777 1.1 bjh21 {
1778 1.1 bjh21 int16 aExp, bExp, zExp;
1779 1.1 bjh21 bits32 aSig, bSig, zSig;
1780 1.1 bjh21 int16 expDiff;
1781 1.1 bjh21
1782 1.1 bjh21 aSig = extractFloat32Frac( a );
1783 1.1 bjh21 aExp = extractFloat32Exp( a );
1784 1.1 bjh21 bSig = extractFloat32Frac( b );
1785 1.1 bjh21 bExp = extractFloat32Exp( b );
1786 1.1 bjh21 expDiff = aExp - bExp;
1787 1.1 bjh21 aSig <<= 7;
1788 1.1 bjh21 bSig <<= 7;
1789 1.1 bjh21 if ( 0 < expDiff ) goto aExpBigger;
1790 1.1 bjh21 if ( expDiff < 0 ) goto bExpBigger;
1791 1.1 bjh21 if ( aExp == 0xFF ) {
1792 1.1 bjh21 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1793 1.1 bjh21 float_raise( float_flag_invalid );
1794 1.1 bjh21 return float32_default_nan;
1795 1.1 bjh21 }
1796 1.1 bjh21 if ( aExp == 0 ) {
1797 1.1 bjh21 aExp = 1;
1798 1.1 bjh21 bExp = 1;
1799 1.1 bjh21 }
1800 1.1 bjh21 if ( bSig < aSig ) goto aBigger;
1801 1.1 bjh21 if ( aSig < bSig ) goto bBigger;
1802 1.1 bjh21 return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1803 1.1 bjh21 bExpBigger:
1804 1.1 bjh21 if ( bExp == 0xFF ) {
1805 1.1 bjh21 if ( bSig ) return propagateFloat32NaN( a, b );
1806 1.1 bjh21 return packFloat32( zSign ^ 1, 0xFF, 0 );
1807 1.1 bjh21 }
1808 1.1 bjh21 if ( aExp == 0 ) {
1809 1.1 bjh21 ++expDiff;
1810 1.1 bjh21 }
1811 1.1 bjh21 else {
1812 1.1 bjh21 aSig |= 0x40000000;
1813 1.1 bjh21 }
1814 1.1 bjh21 shift32RightJamming( aSig, - expDiff, &aSig );
1815 1.1 bjh21 bSig |= 0x40000000;
1816 1.1 bjh21 bBigger:
1817 1.1 bjh21 zSig = bSig - aSig;
1818 1.1 bjh21 zExp = bExp;
1819 1.1 bjh21 zSign ^= 1;
1820 1.1 bjh21 goto normalizeRoundAndPack;
1821 1.1 bjh21 aExpBigger:
1822 1.1 bjh21 if ( aExp == 0xFF ) {
1823 1.1 bjh21 if ( aSig ) return propagateFloat32NaN( a, b );
1824 1.1 bjh21 return a;
1825 1.1 bjh21 }
1826 1.1 bjh21 if ( bExp == 0 ) {
1827 1.1 bjh21 --expDiff;
1828 1.1 bjh21 }
1829 1.1 bjh21 else {
1830 1.1 bjh21 bSig |= 0x40000000;
1831 1.1 bjh21 }
1832 1.1 bjh21 shift32RightJamming( bSig, expDiff, &bSig );
1833 1.1 bjh21 aSig |= 0x40000000;
1834 1.1 bjh21 aBigger:
1835 1.1 bjh21 zSig = aSig - bSig;
1836 1.1 bjh21 zExp = aExp;
1837 1.1 bjh21 normalizeRoundAndPack:
1838 1.1 bjh21 --zExp;
1839 1.1 bjh21 return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1840 1.1 bjh21
1841 1.1 bjh21 }
1842 1.1 bjh21
1843 1.1 bjh21 /*
1844 1.1 bjh21 -------------------------------------------------------------------------------
1845 1.1 bjh21 Returns the result of adding the single-precision floating-point values `a'
1846 1.1 bjh21 and `b'. The operation is performed according to the IEC/IEEE Standard for
1847 1.1 bjh21 Binary Floating-Point Arithmetic.
1848 1.1 bjh21 -------------------------------------------------------------------------------
1849 1.1 bjh21 */
1850 1.1 bjh21 float32 float32_add( float32 a, float32 b )
1851 1.1 bjh21 {
1852 1.1 bjh21 flag aSign, bSign;
1853 1.1 bjh21
1854 1.1 bjh21 aSign = extractFloat32Sign( a );
1855 1.1 bjh21 bSign = extractFloat32Sign( b );
1856 1.1 bjh21 if ( aSign == bSign ) {
1857 1.1 bjh21 return addFloat32Sigs( a, b, aSign );
1858 1.1 bjh21 }
1859 1.1 bjh21 else {
1860 1.1 bjh21 return subFloat32Sigs( a, b, aSign );
1861 1.1 bjh21 }
1862 1.1 bjh21
1863 1.1 bjh21 }
1864 1.1 bjh21
1865 1.1 bjh21 /*
1866 1.1 bjh21 -------------------------------------------------------------------------------
1867 1.1 bjh21 Returns the result of subtracting the single-precision floating-point values
1868 1.1 bjh21 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1869 1.1 bjh21 for Binary Floating-Point Arithmetic.
1870 1.1 bjh21 -------------------------------------------------------------------------------
1871 1.1 bjh21 */
1872 1.1 bjh21 float32 float32_sub( float32 a, float32 b )
1873 1.1 bjh21 {
1874 1.1 bjh21 flag aSign, bSign;
1875 1.1 bjh21
1876 1.1 bjh21 aSign = extractFloat32Sign( a );
1877 1.1 bjh21 bSign = extractFloat32Sign( b );
1878 1.1 bjh21 if ( aSign == bSign ) {
1879 1.1 bjh21 return subFloat32Sigs( a, b, aSign );
1880 1.1 bjh21 }
1881 1.1 bjh21 else {
1882 1.1 bjh21 return addFloat32Sigs( a, b, aSign );
1883 1.1 bjh21 }
1884 1.1 bjh21
1885 1.1 bjh21 }
1886 1.1 bjh21
1887 1.1 bjh21 /*
1888 1.1 bjh21 -------------------------------------------------------------------------------
1889 1.1 bjh21 Returns the result of multiplying the single-precision floating-point values
1890 1.1 bjh21 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1891 1.1 bjh21 for Binary Floating-Point Arithmetic.
1892 1.1 bjh21 -------------------------------------------------------------------------------
1893 1.1 bjh21 */
1894 1.1 bjh21 float32 float32_mul( float32 a, float32 b )
1895 1.1 bjh21 {
1896 1.1 bjh21 flag aSign, bSign, zSign;
1897 1.1 bjh21 int16 aExp, bExp, zExp;
1898 1.1 bjh21 bits32 aSig, bSig;
1899 1.1 bjh21 bits64 zSig64;
1900 1.1 bjh21 bits32 zSig;
1901 1.1 bjh21
1902 1.1 bjh21 aSig = extractFloat32Frac( a );
1903 1.1 bjh21 aExp = extractFloat32Exp( a );
1904 1.1 bjh21 aSign = extractFloat32Sign( a );
1905 1.1 bjh21 bSig = extractFloat32Frac( b );
1906 1.1 bjh21 bExp = extractFloat32Exp( b );
1907 1.1 bjh21 bSign = extractFloat32Sign( b );
1908 1.1 bjh21 zSign = aSign ^ bSign;
1909 1.1 bjh21 if ( aExp == 0xFF ) {
1910 1.1 bjh21 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1911 1.1 bjh21 return propagateFloat32NaN( a, b );
1912 1.1 bjh21 }
1913 1.1 bjh21 if ( ( bExp | bSig ) == 0 ) {
1914 1.1 bjh21 float_raise( float_flag_invalid );
1915 1.1 bjh21 return float32_default_nan;
1916 1.1 bjh21 }
1917 1.1 bjh21 return packFloat32( zSign, 0xFF, 0 );
1918 1.1 bjh21 }
1919 1.1 bjh21 if ( bExp == 0xFF ) {
1920 1.1 bjh21 if ( bSig ) return propagateFloat32NaN( a, b );
1921 1.1 bjh21 if ( ( aExp | aSig ) == 0 ) {
1922 1.1 bjh21 float_raise( float_flag_invalid );
1923 1.1 bjh21 return float32_default_nan;
1924 1.1 bjh21 }
1925 1.1 bjh21 return packFloat32( zSign, 0xFF, 0 );
1926 1.1 bjh21 }
1927 1.1 bjh21 if ( aExp == 0 ) {
1928 1.1 bjh21 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1929 1.1 bjh21 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1930 1.1 bjh21 }
1931 1.1 bjh21 if ( bExp == 0 ) {
1932 1.1 bjh21 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1933 1.1 bjh21 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1934 1.1 bjh21 }
1935 1.1 bjh21 zExp = aExp + bExp - 0x7F;
1936 1.1 bjh21 aSig = ( aSig | 0x00800000 )<<7;
1937 1.1 bjh21 bSig = ( bSig | 0x00800000 )<<8;
1938 1.1 bjh21 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1939 1.1 bjh21 zSig = zSig64;
1940 1.1 bjh21 if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1941 1.1 bjh21 zSig <<= 1;
1942 1.1 bjh21 --zExp;
1943 1.1 bjh21 }
1944 1.1 bjh21 return roundAndPackFloat32( zSign, zExp, zSig );
1945 1.1 bjh21
1946 1.1 bjh21 }
1947 1.1 bjh21
1948 1.1 bjh21 /*
1949 1.1 bjh21 -------------------------------------------------------------------------------
1950 1.1 bjh21 Returns the result of dividing the single-precision floating-point value `a'
1951 1.1 bjh21 by the corresponding value `b'. The operation is performed according to the
1952 1.1 bjh21 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1953 1.1 bjh21 -------------------------------------------------------------------------------
1954 1.1 bjh21 */
1955 1.1 bjh21 float32 float32_div( float32 a, float32 b )
1956 1.1 bjh21 {
1957 1.1 bjh21 flag aSign, bSign, zSign;
1958 1.1 bjh21 int16 aExp, bExp, zExp;
1959 1.1 bjh21 bits32 aSig, bSig, zSig;
1960 1.1 bjh21
1961 1.1 bjh21 aSig = extractFloat32Frac( a );
1962 1.1 bjh21 aExp = extractFloat32Exp( a );
1963 1.1 bjh21 aSign = extractFloat32Sign( a );
1964 1.1 bjh21 bSig = extractFloat32Frac( b );
1965 1.1 bjh21 bExp = extractFloat32Exp( b );
1966 1.1 bjh21 bSign = extractFloat32Sign( b );
1967 1.1 bjh21 zSign = aSign ^ bSign;
1968 1.1 bjh21 if ( aExp == 0xFF ) {
1969 1.1 bjh21 if ( aSig ) return propagateFloat32NaN( a, b );
1970 1.1 bjh21 if ( bExp == 0xFF ) {
1971 1.1 bjh21 if ( bSig ) return propagateFloat32NaN( a, b );
1972 1.1 bjh21 float_raise( float_flag_invalid );
1973 1.1 bjh21 return float32_default_nan;
1974 1.1 bjh21 }
1975 1.1 bjh21 return packFloat32( zSign, 0xFF, 0 );
1976 1.1 bjh21 }
1977 1.1 bjh21 if ( bExp == 0xFF ) {
1978 1.1 bjh21 if ( bSig ) return propagateFloat32NaN( a, b );
1979 1.1 bjh21 return packFloat32( zSign, 0, 0 );
1980 1.1 bjh21 }
1981 1.1 bjh21 if ( bExp == 0 ) {
1982 1.1 bjh21 if ( bSig == 0 ) {
1983 1.1 bjh21 if ( ( aExp | aSig ) == 0 ) {
1984 1.1 bjh21 float_raise( float_flag_invalid );
1985 1.1 bjh21 return float32_default_nan;
1986 1.1 bjh21 }
1987 1.1 bjh21 float_raise( float_flag_divbyzero );
1988 1.1 bjh21 return packFloat32( zSign, 0xFF, 0 );
1989 1.1 bjh21 }
1990 1.1 bjh21 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1991 1.1 bjh21 }
1992 1.1 bjh21 if ( aExp == 0 ) {
1993 1.1 bjh21 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1994 1.1 bjh21 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1995 1.1 bjh21 }
1996 1.1 bjh21 zExp = aExp - bExp + 0x7D;
1997 1.1 bjh21 aSig = ( aSig | 0x00800000 )<<7;
1998 1.1 bjh21 bSig = ( bSig | 0x00800000 )<<8;
1999 1.1 bjh21 if ( bSig <= ( aSig + aSig ) ) {
2000 1.1 bjh21 aSig >>= 1;
2001 1.1 bjh21 ++zExp;
2002 1.1 bjh21 }
2003 1.1 bjh21 zSig = ( ( (bits64) aSig )<<32 ) / bSig;
2004 1.1 bjh21 if ( ( zSig & 0x3F ) == 0 ) {
2005 1.1 bjh21 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
2006 1.1 bjh21 }
2007 1.1 bjh21 return roundAndPackFloat32( zSign, zExp, zSig );
2008 1.1 bjh21
2009 1.1 bjh21 }
2010 1.1 bjh21
2011 1.1 bjh21 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2012 1.1 bjh21 /*
2013 1.1 bjh21 -------------------------------------------------------------------------------
2014 1.1 bjh21 Returns the remainder of the single-precision floating-point value `a'
2015 1.1 bjh21 with respect to the corresponding value `b'. The operation is performed
2016 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2017 1.1 bjh21 -------------------------------------------------------------------------------
2018 1.1 bjh21 */
2019 1.1 bjh21 float32 float32_rem( float32 a, float32 b )
2020 1.1 bjh21 {
2021 1.1 bjh21 flag aSign, bSign, zSign;
2022 1.1 bjh21 int16 aExp, bExp, expDiff;
2023 1.1 bjh21 bits32 aSig, bSig;
2024 1.1 bjh21 bits32 q;
2025 1.1 bjh21 bits64 aSig64, bSig64, q64;
2026 1.1 bjh21 bits32 alternateASig;
2027 1.1 bjh21 sbits32 sigMean;
2028 1.1 bjh21
2029 1.1 bjh21 aSig = extractFloat32Frac( a );
2030 1.1 bjh21 aExp = extractFloat32Exp( a );
2031 1.1 bjh21 aSign = extractFloat32Sign( a );
2032 1.1 bjh21 bSig = extractFloat32Frac( b );
2033 1.1 bjh21 bExp = extractFloat32Exp( b );
2034 1.1 bjh21 bSign = extractFloat32Sign( b );
2035 1.1 bjh21 if ( aExp == 0xFF ) {
2036 1.1 bjh21 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2037 1.1 bjh21 return propagateFloat32NaN( a, b );
2038 1.1 bjh21 }
2039 1.1 bjh21 float_raise( float_flag_invalid );
2040 1.1 bjh21 return float32_default_nan;
2041 1.1 bjh21 }
2042 1.1 bjh21 if ( bExp == 0xFF ) {
2043 1.1 bjh21 if ( bSig ) return propagateFloat32NaN( a, b );
2044 1.1 bjh21 return a;
2045 1.1 bjh21 }
2046 1.1 bjh21 if ( bExp == 0 ) {
2047 1.1 bjh21 if ( bSig == 0 ) {
2048 1.1 bjh21 float_raise( float_flag_invalid );
2049 1.1 bjh21 return float32_default_nan;
2050 1.1 bjh21 }
2051 1.1 bjh21 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2052 1.1 bjh21 }
2053 1.1 bjh21 if ( aExp == 0 ) {
2054 1.1 bjh21 if ( aSig == 0 ) return a;
2055 1.1 bjh21 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2056 1.1 bjh21 }
2057 1.1 bjh21 expDiff = aExp - bExp;
2058 1.1 bjh21 aSig |= 0x00800000;
2059 1.1 bjh21 bSig |= 0x00800000;
2060 1.1 bjh21 if ( expDiff < 32 ) {
2061 1.1 bjh21 aSig <<= 8;
2062 1.1 bjh21 bSig <<= 8;
2063 1.1 bjh21 if ( expDiff < 0 ) {
2064 1.1 bjh21 if ( expDiff < -1 ) return a;
2065 1.1 bjh21 aSig >>= 1;
2066 1.1 bjh21 }
2067 1.1 bjh21 q = ( bSig <= aSig );
2068 1.1 bjh21 if ( q ) aSig -= bSig;
2069 1.1 bjh21 if ( 0 < expDiff ) {
2070 1.1 bjh21 q = ( ( (bits64) aSig )<<32 ) / bSig;
2071 1.1 bjh21 q >>= 32 - expDiff;
2072 1.1 bjh21 bSig >>= 2;
2073 1.1 bjh21 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2074 1.1 bjh21 }
2075 1.1 bjh21 else {
2076 1.1 bjh21 aSig >>= 2;
2077 1.1 bjh21 bSig >>= 2;
2078 1.1 bjh21 }
2079 1.1 bjh21 }
2080 1.1 bjh21 else {
2081 1.1 bjh21 if ( bSig <= aSig ) aSig -= bSig;
2082 1.1 bjh21 aSig64 = ( (bits64) aSig )<<40;
2083 1.1 bjh21 bSig64 = ( (bits64) bSig )<<40;
2084 1.1 bjh21 expDiff -= 64;
2085 1.1 bjh21 while ( 0 < expDiff ) {
2086 1.1 bjh21 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2087 1.1 bjh21 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2088 1.1 bjh21 aSig64 = - ( ( bSig * q64 )<<38 );
2089 1.1 bjh21 expDiff -= 62;
2090 1.1 bjh21 }
2091 1.1 bjh21 expDiff += 64;
2092 1.1 bjh21 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2093 1.1 bjh21 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2094 1.1 bjh21 q = q64>>( 64 - expDiff );
2095 1.1 bjh21 bSig <<= 6;
2096 1.1 bjh21 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2097 1.1 bjh21 }
2098 1.1 bjh21 do {
2099 1.1 bjh21 alternateASig = aSig;
2100 1.1 bjh21 ++q;
2101 1.1 bjh21 aSig -= bSig;
2102 1.1 bjh21 } while ( 0 <= (sbits32) aSig );
2103 1.1 bjh21 sigMean = aSig + alternateASig;
2104 1.1 bjh21 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2105 1.1 bjh21 aSig = alternateASig;
2106 1.1 bjh21 }
2107 1.1 bjh21 zSign = ( (sbits32) aSig < 0 );
2108 1.1 bjh21 if ( zSign ) aSig = - aSig;
2109 1.1 bjh21 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2110 1.1 bjh21
2111 1.1 bjh21 }
2112 1.1 bjh21 #endif /* !SOFTFLOAT_FOR_GCC */
2113 1.1 bjh21
2114 1.1 bjh21 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2115 1.1 bjh21 /*
2116 1.1 bjh21 -------------------------------------------------------------------------------
2117 1.1 bjh21 Returns the square root of the single-precision floating-point value `a'.
2118 1.1 bjh21 The operation is performed according to the IEC/IEEE Standard for Binary
2119 1.1 bjh21 Floating-Point Arithmetic.
2120 1.1 bjh21 -------------------------------------------------------------------------------
2121 1.1 bjh21 */
2122 1.1 bjh21 float32 float32_sqrt( float32 a )
2123 1.1 bjh21 {
2124 1.1 bjh21 flag aSign;
2125 1.1 bjh21 int16 aExp, zExp;
2126 1.1 bjh21 bits32 aSig, zSig;
2127 1.1 bjh21 bits64 rem, term;
2128 1.1 bjh21
2129 1.1 bjh21 aSig = extractFloat32Frac( a );
2130 1.1 bjh21 aExp = extractFloat32Exp( a );
2131 1.1 bjh21 aSign = extractFloat32Sign( a );
2132 1.1 bjh21 if ( aExp == 0xFF ) {
2133 1.1 bjh21 if ( aSig ) return propagateFloat32NaN( a, 0 );
2134 1.1 bjh21 if ( ! aSign ) return a;
2135 1.1 bjh21 float_raise( float_flag_invalid );
2136 1.1 bjh21 return float32_default_nan;
2137 1.1 bjh21 }
2138 1.1 bjh21 if ( aSign ) {
2139 1.1 bjh21 if ( ( aExp | aSig ) == 0 ) return a;
2140 1.1 bjh21 float_raise( float_flag_invalid );
2141 1.1 bjh21 return float32_default_nan;
2142 1.1 bjh21 }
2143 1.1 bjh21 if ( aExp == 0 ) {
2144 1.1 bjh21 if ( aSig == 0 ) return 0;
2145 1.1 bjh21 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2146 1.1 bjh21 }
2147 1.1 bjh21 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2148 1.1 bjh21 aSig = ( aSig | 0x00800000 )<<8;
2149 1.1 bjh21 zSig = estimateSqrt32( aExp, aSig ) + 2;
2150 1.1 bjh21 if ( ( zSig & 0x7F ) <= 5 ) {
2151 1.1 bjh21 if ( zSig < 2 ) {
2152 1.1 bjh21 zSig = 0x7FFFFFFF;
2153 1.1 bjh21 goto roundAndPack;
2154 1.1 bjh21 }
2155 1.1 bjh21 aSig >>= aExp & 1;
2156 1.1 bjh21 term = ( (bits64) zSig ) * zSig;
2157 1.1 bjh21 rem = ( ( (bits64) aSig )<<32 ) - term;
2158 1.1 bjh21 while ( (sbits64) rem < 0 ) {
2159 1.1 bjh21 --zSig;
2160 1.1 bjh21 rem += ( ( (bits64) zSig )<<1 ) | 1;
2161 1.1 bjh21 }
2162 1.1 bjh21 zSig |= ( rem != 0 );
2163 1.1 bjh21 }
2164 1.1 bjh21 shift32RightJamming( zSig, 1, &zSig );
2165 1.1 bjh21 roundAndPack:
2166 1.1 bjh21 return roundAndPackFloat32( 0, zExp, zSig );
2167 1.1 bjh21
2168 1.1 bjh21 }
2169 1.1 bjh21 #endif /* !SOFTFLOAT_FOR_GCC */
2170 1.1 bjh21
2171 1.1 bjh21 /*
2172 1.1 bjh21 -------------------------------------------------------------------------------
2173 1.1 bjh21 Returns 1 if the single-precision floating-point value `a' is equal to
2174 1.1 bjh21 the corresponding value `b', and 0 otherwise. The comparison is performed
2175 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2176 1.1 bjh21 -------------------------------------------------------------------------------
2177 1.1 bjh21 */
2178 1.1 bjh21 flag float32_eq( float32 a, float32 b )
2179 1.1 bjh21 {
2180 1.1 bjh21
2181 1.1 bjh21 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2182 1.1 bjh21 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2183 1.1 bjh21 ) {
2184 1.1 bjh21 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2185 1.1 bjh21 float_raise( float_flag_invalid );
2186 1.1 bjh21 }
2187 1.1 bjh21 return 0;
2188 1.1 bjh21 }
2189 1.1 bjh21 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2190 1.1 bjh21
2191 1.1 bjh21 }
2192 1.1 bjh21
2193 1.1 bjh21 /*
2194 1.1 bjh21 -------------------------------------------------------------------------------
2195 1.1 bjh21 Returns 1 if the single-precision floating-point value `a' is less than
2196 1.1 bjh21 or equal to the corresponding value `b', and 0 otherwise. The comparison
2197 1.1 bjh21 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2198 1.1 bjh21 Arithmetic.
2199 1.1 bjh21 -------------------------------------------------------------------------------
2200 1.1 bjh21 */
2201 1.1 bjh21 flag float32_le( float32 a, float32 b )
2202 1.1 bjh21 {
2203 1.1 bjh21 flag aSign, bSign;
2204 1.1 bjh21
2205 1.1 bjh21 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2206 1.1 bjh21 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2207 1.1 bjh21 ) {
2208 1.1 bjh21 float_raise( float_flag_invalid );
2209 1.1 bjh21 return 0;
2210 1.1 bjh21 }
2211 1.1 bjh21 aSign = extractFloat32Sign( a );
2212 1.1 bjh21 bSign = extractFloat32Sign( b );
2213 1.1 bjh21 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2214 1.1 bjh21 return ( a == b ) || ( aSign ^ ( a < b ) );
2215 1.1 bjh21
2216 1.1 bjh21 }
2217 1.1 bjh21
2218 1.1 bjh21 /*
2219 1.1 bjh21 -------------------------------------------------------------------------------
2220 1.1 bjh21 Returns 1 if the single-precision floating-point value `a' is less than
2221 1.1 bjh21 the corresponding value `b', and 0 otherwise. The comparison is performed
2222 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2223 1.1 bjh21 -------------------------------------------------------------------------------
2224 1.1 bjh21 */
2225 1.1 bjh21 flag float32_lt( float32 a, float32 b )
2226 1.1 bjh21 {
2227 1.1 bjh21 flag aSign, bSign;
2228 1.1 bjh21
2229 1.1 bjh21 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2230 1.1 bjh21 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2231 1.1 bjh21 ) {
2232 1.1 bjh21 float_raise( float_flag_invalid );
2233 1.1 bjh21 return 0;
2234 1.1 bjh21 }
2235 1.1 bjh21 aSign = extractFloat32Sign( a );
2236 1.1 bjh21 bSign = extractFloat32Sign( b );
2237 1.1 bjh21 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2238 1.1 bjh21 return ( a != b ) && ( aSign ^ ( a < b ) );
2239 1.1 bjh21
2240 1.1 bjh21 }
2241 1.1 bjh21
2242 1.1 bjh21 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2243 1.1 bjh21 /*
2244 1.1 bjh21 -------------------------------------------------------------------------------
2245 1.1 bjh21 Returns 1 if the single-precision floating-point value `a' is equal to
2246 1.1 bjh21 the corresponding value `b', and 0 otherwise. The invalid exception is
2247 1.1 bjh21 raised if either operand is a NaN. Otherwise, the comparison is performed
2248 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2249 1.1 bjh21 -------------------------------------------------------------------------------
2250 1.1 bjh21 */
2251 1.1 bjh21 flag float32_eq_signaling( float32 a, float32 b )
2252 1.1 bjh21 {
2253 1.1 bjh21
2254 1.1 bjh21 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2255 1.1 bjh21 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2256 1.1 bjh21 ) {
2257 1.1 bjh21 float_raise( float_flag_invalid );
2258 1.1 bjh21 return 0;
2259 1.1 bjh21 }
2260 1.1 bjh21 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2261 1.1 bjh21
2262 1.1 bjh21 }
2263 1.1 bjh21
2264 1.1 bjh21 /*
2265 1.1 bjh21 -------------------------------------------------------------------------------
2266 1.1 bjh21 Returns 1 if the single-precision floating-point value `a' is less than or
2267 1.1 bjh21 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2268 1.1 bjh21 cause an exception. Otherwise, the comparison is performed according to the
2269 1.1 bjh21 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2270 1.1 bjh21 -------------------------------------------------------------------------------
2271 1.1 bjh21 */
2272 1.1 bjh21 flag float32_le_quiet( float32 a, float32 b )
2273 1.1 bjh21 {
2274 1.1 bjh21 flag aSign, bSign;
2275 1.1 bjh21
2276 1.1 bjh21 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2277 1.1 bjh21 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2278 1.1 bjh21 ) {
2279 1.1 bjh21 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2280 1.1 bjh21 float_raise( float_flag_invalid );
2281 1.1 bjh21 }
2282 1.1 bjh21 return 0;
2283 1.1 bjh21 }
2284 1.1 bjh21 aSign = extractFloat32Sign( a );
2285 1.1 bjh21 bSign = extractFloat32Sign( b );
2286 1.1 bjh21 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2287 1.1 bjh21 return ( a == b ) || ( aSign ^ ( a < b ) );
2288 1.1 bjh21
2289 1.1 bjh21 }
2290 1.1 bjh21
2291 1.1 bjh21 /*
2292 1.1 bjh21 -------------------------------------------------------------------------------
2293 1.1 bjh21 Returns 1 if the single-precision floating-point value `a' is less than
2294 1.1 bjh21 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2295 1.1 bjh21 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2296 1.1 bjh21 Standard for Binary Floating-Point Arithmetic.
2297 1.1 bjh21 -------------------------------------------------------------------------------
2298 1.1 bjh21 */
2299 1.1 bjh21 flag float32_lt_quiet( float32 a, float32 b )
2300 1.1 bjh21 {
2301 1.1 bjh21 flag aSign, bSign;
2302 1.1 bjh21
2303 1.1 bjh21 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2304 1.1 bjh21 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2305 1.1 bjh21 ) {
2306 1.1 bjh21 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2307 1.1 bjh21 float_raise( float_flag_invalid );
2308 1.1 bjh21 }
2309 1.1 bjh21 return 0;
2310 1.1 bjh21 }
2311 1.1 bjh21 aSign = extractFloat32Sign( a );
2312 1.1 bjh21 bSign = extractFloat32Sign( b );
2313 1.1 bjh21 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2314 1.1 bjh21 return ( a != b ) && ( aSign ^ ( a < b ) );
2315 1.1 bjh21
2316 1.1 bjh21 }
2317 1.1 bjh21 #endif /* !SOFTFLOAT_FOR_GCC */
2318 1.1 bjh21
2319 1.1 bjh21 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2320 1.1 bjh21 /*
2321 1.1 bjh21 -------------------------------------------------------------------------------
2322 1.1 bjh21 Returns the result of converting the double-precision floating-point value
2323 1.1 bjh21 `a' to the 32-bit two's complement integer format. The conversion is
2324 1.1 bjh21 performed according to the IEC/IEEE Standard for Binary Floating-Point
2325 1.1 bjh21 Arithmetic---which means in particular that the conversion is rounded
2326 1.1 bjh21 according to the current rounding mode. If `a' is a NaN, the largest
2327 1.1 bjh21 positive integer is returned. Otherwise, if the conversion overflows, the
2328 1.1 bjh21 largest integer with the same sign as `a' is returned.
2329 1.1 bjh21 -------------------------------------------------------------------------------
2330 1.1 bjh21 */
2331 1.1 bjh21 int32 float64_to_int32( float64 a )
2332 1.1 bjh21 {
2333 1.1 bjh21 flag aSign;
2334 1.1 bjh21 int16 aExp, shiftCount;
2335 1.1 bjh21 bits64 aSig;
2336 1.1 bjh21
2337 1.1 bjh21 aSig = extractFloat64Frac( a );
2338 1.1 bjh21 aExp = extractFloat64Exp( a );
2339 1.1 bjh21 aSign = extractFloat64Sign( a );
2340 1.1 bjh21 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2341 1.1 bjh21 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2342 1.1 bjh21 shiftCount = 0x42C - aExp;
2343 1.1 bjh21 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2344 1.1 bjh21 return roundAndPackInt32( aSign, aSig );
2345 1.1 bjh21
2346 1.1 bjh21 }
2347 1.1 bjh21 #endif /* !SOFTFLOAT_FOR_GCC */
2348 1.1 bjh21
2349 1.1 bjh21 /*
2350 1.1 bjh21 -------------------------------------------------------------------------------
2351 1.1 bjh21 Returns the result of converting the double-precision floating-point value
2352 1.1 bjh21 `a' to the 32-bit two's complement integer format. The conversion is
2353 1.1 bjh21 performed according to the IEC/IEEE Standard for Binary Floating-Point
2354 1.1 bjh21 Arithmetic, except that the conversion is always rounded toward zero.
2355 1.1 bjh21 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2356 1.1 bjh21 the conversion overflows, the largest integer with the same sign as `a' is
2357 1.1 bjh21 returned.
2358 1.1 bjh21 -------------------------------------------------------------------------------
2359 1.1 bjh21 */
2360 1.1 bjh21 int32 float64_to_int32_round_to_zero( float64 a )
2361 1.1 bjh21 {
2362 1.1 bjh21 flag aSign;
2363 1.1 bjh21 int16 aExp, shiftCount;
2364 1.1 bjh21 bits64 aSig, savedASig;
2365 1.1 bjh21 int32 z;
2366 1.1 bjh21
2367 1.1 bjh21 aSig = extractFloat64Frac( a );
2368 1.1 bjh21 aExp = extractFloat64Exp( a );
2369 1.1 bjh21 aSign = extractFloat64Sign( a );
2370 1.1 bjh21 if ( 0x41E < aExp ) {
2371 1.1 bjh21 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2372 1.1 bjh21 goto invalid;
2373 1.1 bjh21 }
2374 1.1 bjh21 else if ( aExp < 0x3FF ) {
2375 1.1 bjh21 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2376 1.1 bjh21 return 0;
2377 1.1 bjh21 }
2378 1.1 bjh21 aSig |= LIT64( 0x0010000000000000 );
2379 1.1 bjh21 shiftCount = 0x433 - aExp;
2380 1.1 bjh21 savedASig = aSig;
2381 1.1 bjh21 aSig >>= shiftCount;
2382 1.1 bjh21 z = aSig;
2383 1.1 bjh21 if ( aSign ) z = - z;
2384 1.1 bjh21 if ( ( z < 0 ) ^ aSign ) {
2385 1.1 bjh21 invalid:
2386 1.1 bjh21 float_raise( float_flag_invalid );
2387 1.1 bjh21 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2388 1.1 bjh21 }
2389 1.1 bjh21 if ( ( aSig<<shiftCount ) != savedASig ) {
2390 1.1 bjh21 float_exception_flags |= float_flag_inexact;
2391 1.1 bjh21 }
2392 1.1 bjh21 return z;
2393 1.1 bjh21
2394 1.1 bjh21 }
2395 1.1 bjh21
2396 1.1 bjh21 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2397 1.1 bjh21 /*
2398 1.1 bjh21 -------------------------------------------------------------------------------
2399 1.1 bjh21 Returns the result of converting the double-precision floating-point value
2400 1.1 bjh21 `a' to the 64-bit two's complement integer format. The conversion is
2401 1.1 bjh21 performed according to the IEC/IEEE Standard for Binary Floating-Point
2402 1.1 bjh21 Arithmetic---which means in particular that the conversion is rounded
2403 1.1 bjh21 according to the current rounding mode. If `a' is a NaN, the largest
2404 1.1 bjh21 positive integer is returned. Otherwise, if the conversion overflows, the
2405 1.1 bjh21 largest integer with the same sign as `a' is returned.
2406 1.1 bjh21 -------------------------------------------------------------------------------
2407 1.1 bjh21 */
2408 1.1 bjh21 int64 float64_to_int64( float64 a )
2409 1.1 bjh21 {
2410 1.1 bjh21 flag aSign;
2411 1.1 bjh21 int16 aExp, shiftCount;
2412 1.1 bjh21 bits64 aSig, aSigExtra;
2413 1.1 bjh21
2414 1.1 bjh21 aSig = extractFloat64Frac( a );
2415 1.1 bjh21 aExp = extractFloat64Exp( a );
2416 1.1 bjh21 aSign = extractFloat64Sign( a );
2417 1.1 bjh21 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2418 1.1 bjh21 shiftCount = 0x433 - aExp;
2419 1.1 bjh21 if ( shiftCount <= 0 ) {
2420 1.1 bjh21 if ( 0x43E < aExp ) {
2421 1.1 bjh21 float_raise( float_flag_invalid );
2422 1.1 bjh21 if ( ! aSign
2423 1.1 bjh21 || ( ( aExp == 0x7FF )
2424 1.1 bjh21 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2425 1.1 bjh21 ) {
2426 1.1 bjh21 return LIT64( 0x7FFFFFFFFFFFFFFF );
2427 1.1 bjh21 }
2428 1.1 bjh21 return (sbits64) LIT64( 0x8000000000000000 );
2429 1.1 bjh21 }
2430 1.1 bjh21 aSigExtra = 0;
2431 1.1 bjh21 aSig <<= - shiftCount;
2432 1.1 bjh21 }
2433 1.1 bjh21 else {
2434 1.1 bjh21 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2435 1.1 bjh21 }
2436 1.1 bjh21 return roundAndPackInt64( aSign, aSig, aSigExtra );
2437 1.1 bjh21
2438 1.1 bjh21 }
2439 1.1 bjh21
2440 1.1 bjh21 /*
2441 1.1 bjh21 -------------------------------------------------------------------------------
2442 1.1 bjh21 Returns the result of converting the double-precision floating-point value
2443 1.1 bjh21 `a' to the 64-bit two's complement integer format. The conversion is
2444 1.1 bjh21 performed according to the IEC/IEEE Standard for Binary Floating-Point
2445 1.1 bjh21 Arithmetic, except that the conversion is always rounded toward zero.
2446 1.1 bjh21 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2447 1.1 bjh21 the conversion overflows, the largest integer with the same sign as `a' is
2448 1.1 bjh21 returned.
2449 1.1 bjh21 -------------------------------------------------------------------------------
2450 1.1 bjh21 */
2451 1.1 bjh21 int64 float64_to_int64_round_to_zero( float64 a )
2452 1.1 bjh21 {
2453 1.1 bjh21 flag aSign;
2454 1.1 bjh21 int16 aExp, shiftCount;
2455 1.1 bjh21 bits64 aSig;
2456 1.1 bjh21 int64 z;
2457 1.1 bjh21
2458 1.1 bjh21 aSig = extractFloat64Frac( a );
2459 1.1 bjh21 aExp = extractFloat64Exp( a );
2460 1.1 bjh21 aSign = extractFloat64Sign( a );
2461 1.1 bjh21 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2462 1.1 bjh21 shiftCount = aExp - 0x433;
2463 1.1 bjh21 if ( 0 <= shiftCount ) {
2464 1.1 bjh21 if ( 0x43E <= aExp ) {
2465 1.1 bjh21 if ( a != LIT64( 0xC3E0000000000000 ) ) {
2466 1.1 bjh21 float_raise( float_flag_invalid );
2467 1.1 bjh21 if ( ! aSign
2468 1.1 bjh21 || ( ( aExp == 0x7FF )
2469 1.1 bjh21 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2470 1.1 bjh21 ) {
2471 1.1 bjh21 return LIT64( 0x7FFFFFFFFFFFFFFF );
2472 1.1 bjh21 }
2473 1.1 bjh21 }
2474 1.1 bjh21 return (sbits64) LIT64( 0x8000000000000000 );
2475 1.1 bjh21 }
2476 1.1 bjh21 z = aSig<<shiftCount;
2477 1.1 bjh21 }
2478 1.1 bjh21 else {
2479 1.1 bjh21 if ( aExp < 0x3FE ) {
2480 1.1 bjh21 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
2481 1.1 bjh21 return 0;
2482 1.1 bjh21 }
2483 1.1 bjh21 z = aSig>>( - shiftCount );
2484 1.1 bjh21 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2485 1.1 bjh21 float_exception_flags |= float_flag_inexact;
2486 1.1 bjh21 }
2487 1.1 bjh21 }
2488 1.1 bjh21 if ( aSign ) z = - z;
2489 1.1 bjh21 return z;
2490 1.1 bjh21
2491 1.1 bjh21 }
2492 1.1 bjh21 #endif /* !SOFTFLOAT_FOR_GCC */
2493 1.1 bjh21
2494 1.1 bjh21 /*
2495 1.1 bjh21 -------------------------------------------------------------------------------
2496 1.1 bjh21 Returns the result of converting the double-precision floating-point value
2497 1.1 bjh21 `a' to the single-precision floating-point format. The conversion is
2498 1.1 bjh21 performed according to the IEC/IEEE Standard for Binary Floating-Point
2499 1.1 bjh21 Arithmetic.
2500 1.1 bjh21 -------------------------------------------------------------------------------
2501 1.1 bjh21 */
2502 1.1 bjh21 float32 float64_to_float32( float64 a )
2503 1.1 bjh21 {
2504 1.1 bjh21 flag aSign;
2505 1.1 bjh21 int16 aExp;
2506 1.1 bjh21 bits64 aSig;
2507 1.1 bjh21 bits32 zSig;
2508 1.1 bjh21
2509 1.1 bjh21 aSig = extractFloat64Frac( a );
2510 1.1 bjh21 aExp = extractFloat64Exp( a );
2511 1.1 bjh21 aSign = extractFloat64Sign( a );
2512 1.1 bjh21 if ( aExp == 0x7FF ) {
2513 1.1 bjh21 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2514 1.1 bjh21 return packFloat32( aSign, 0xFF, 0 );
2515 1.1 bjh21 }
2516 1.1 bjh21 shift64RightJamming( aSig, 22, &aSig );
2517 1.1 bjh21 zSig = aSig;
2518 1.1 bjh21 if ( aExp || zSig ) {
2519 1.1 bjh21 zSig |= 0x40000000;
2520 1.1 bjh21 aExp -= 0x381;
2521 1.1 bjh21 }
2522 1.1 bjh21 return roundAndPackFloat32( aSign, aExp, zSig );
2523 1.1 bjh21
2524 1.1 bjh21 }
2525 1.1 bjh21
2526 1.1 bjh21 #ifdef FLOATX80
2527 1.1 bjh21
2528 1.1 bjh21 /*
2529 1.1 bjh21 -------------------------------------------------------------------------------
2530 1.1 bjh21 Returns the result of converting the double-precision floating-point value
2531 1.1 bjh21 `a' to the extended double-precision floating-point format. The conversion
2532 1.1 bjh21 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2533 1.1 bjh21 Arithmetic.
2534 1.1 bjh21 -------------------------------------------------------------------------------
2535 1.1 bjh21 */
2536 1.1 bjh21 floatx80 float64_to_floatx80( float64 a )
2537 1.1 bjh21 {
2538 1.1 bjh21 flag aSign;
2539 1.1 bjh21 int16 aExp;
2540 1.1 bjh21 bits64 aSig;
2541 1.1 bjh21
2542 1.1 bjh21 aSig = extractFloat64Frac( a );
2543 1.1 bjh21 aExp = extractFloat64Exp( a );
2544 1.1 bjh21 aSign = extractFloat64Sign( a );
2545 1.1 bjh21 if ( aExp == 0x7FF ) {
2546 1.1 bjh21 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2547 1.1 bjh21 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2548 1.1 bjh21 }
2549 1.1 bjh21 if ( aExp == 0 ) {
2550 1.1 bjh21 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2551 1.1 bjh21 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2552 1.1 bjh21 }
2553 1.1 bjh21 return
2554 1.1 bjh21 packFloatx80(
2555 1.1 bjh21 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2556 1.1 bjh21
2557 1.1 bjh21 }
2558 1.1 bjh21
2559 1.1 bjh21 #endif
2560 1.1 bjh21
2561 1.1 bjh21 #ifdef FLOAT128
2562 1.1 bjh21
2563 1.1 bjh21 /*
2564 1.1 bjh21 -------------------------------------------------------------------------------
2565 1.1 bjh21 Returns the result of converting the double-precision floating-point value
2566 1.1 bjh21 `a' to the quadruple-precision floating-point format. The conversion is
2567 1.1 bjh21 performed according to the IEC/IEEE Standard for Binary Floating-Point
2568 1.1 bjh21 Arithmetic.
2569 1.1 bjh21 -------------------------------------------------------------------------------
2570 1.1 bjh21 */
2571 1.1 bjh21 float128 float64_to_float128( float64 a )
2572 1.1 bjh21 {
2573 1.1 bjh21 flag aSign;
2574 1.1 bjh21 int16 aExp;
2575 1.1 bjh21 bits64 aSig, zSig0, zSig1;
2576 1.1 bjh21
2577 1.1 bjh21 aSig = extractFloat64Frac( a );
2578 1.1 bjh21 aExp = extractFloat64Exp( a );
2579 1.1 bjh21 aSign = extractFloat64Sign( a );
2580 1.1 bjh21 if ( aExp == 0x7FF ) {
2581 1.1 bjh21 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2582 1.1 bjh21 return packFloat128( aSign, 0x7FFF, 0, 0 );
2583 1.1 bjh21 }
2584 1.1 bjh21 if ( aExp == 0 ) {
2585 1.1 bjh21 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2586 1.1 bjh21 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2587 1.1 bjh21 --aExp;
2588 1.1 bjh21 }
2589 1.1 bjh21 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2590 1.1 bjh21 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2591 1.1 bjh21
2592 1.1 bjh21 }
2593 1.1 bjh21
2594 1.1 bjh21 #endif
2595 1.1 bjh21
2596 1.1 bjh21 #ifndef SOFTFLOAT_FOR_GCC
2597 1.1 bjh21 /*
2598 1.1 bjh21 -------------------------------------------------------------------------------
2599 1.1 bjh21 Rounds the double-precision floating-point value `a' to an integer, and
2600 1.1 bjh21 returns the result as a double-precision floating-point value. The
2601 1.1 bjh21 operation is performed according to the IEC/IEEE Standard for Binary
2602 1.1 bjh21 Floating-Point Arithmetic.
2603 1.1 bjh21 -------------------------------------------------------------------------------
2604 1.1 bjh21 */
2605 1.1 bjh21 float64 float64_round_to_int( float64 a )
2606 1.1 bjh21 {
2607 1.1 bjh21 flag aSign;
2608 1.1 bjh21 int16 aExp;
2609 1.1 bjh21 bits64 lastBitMask, roundBitsMask;
2610 1.1 bjh21 int8 roundingMode;
2611 1.1 bjh21 float64 z;
2612 1.1 bjh21
2613 1.1 bjh21 aExp = extractFloat64Exp( a );
2614 1.1 bjh21 if ( 0x433 <= aExp ) {
2615 1.1 bjh21 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2616 1.1 bjh21 return propagateFloat64NaN( a, a );
2617 1.1 bjh21 }
2618 1.1 bjh21 return a;
2619 1.1 bjh21 }
2620 1.1 bjh21 if ( aExp < 0x3FF ) {
2621 1.1 bjh21 if ( (bits64) ( a<<1 ) == 0 ) return a;
2622 1.1 bjh21 float_exception_flags |= float_flag_inexact;
2623 1.1 bjh21 aSign = extractFloat64Sign( a );
2624 1.1 bjh21 switch ( float_rounding_mode ) {
2625 1.1 bjh21 case float_round_nearest_even:
2626 1.1 bjh21 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2627 1.1 bjh21 return packFloat64( aSign, 0x3FF, 0 );
2628 1.1 bjh21 }
2629 1.1 bjh21 break;
2630 1.1 bjh21 case float_round_to_zero:
2631 1.1 bjh21 break;
2632 1.1 bjh21 case float_round_down:
2633 1.1 bjh21 return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2634 1.1 bjh21 case float_round_up:
2635 1.1 bjh21 return
2636 1.1 bjh21 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2637 1.1 bjh21 }
2638 1.1 bjh21 return packFloat64( aSign, 0, 0 );
2639 1.1 bjh21 }
2640 1.1 bjh21 lastBitMask = 1;
2641 1.1 bjh21 lastBitMask <<= 0x433 - aExp;
2642 1.1 bjh21 roundBitsMask = lastBitMask - 1;
2643 1.1 bjh21 z = a;
2644 1.1 bjh21 roundingMode = float_rounding_mode;
2645 1.1 bjh21 if ( roundingMode == float_round_nearest_even ) {
2646 1.1 bjh21 z += lastBitMask>>1;
2647 1.1 bjh21 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2648 1.1 bjh21 }
2649 1.1 bjh21 else if ( roundingMode != float_round_to_zero ) {
2650 1.1 bjh21 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2651 1.1 bjh21 z += roundBitsMask;
2652 1.1 bjh21 }
2653 1.1 bjh21 }
2654 1.1 bjh21 z &= ~ roundBitsMask;
2655 1.1 bjh21 if ( z != a ) float_exception_flags |= float_flag_inexact;
2656 1.1 bjh21 return z;
2657 1.1 bjh21
2658 1.1 bjh21 }
2659 1.1 bjh21 #endif
2660 1.1 bjh21
2661 1.1 bjh21 /*
2662 1.1 bjh21 -------------------------------------------------------------------------------
2663 1.1 bjh21 Returns the result of adding the absolute values of the double-precision
2664 1.1 bjh21 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2665 1.1 bjh21 before being returned. `zSign' is ignored if the result is a NaN.
2666 1.1 bjh21 The addition is performed according to the IEC/IEEE Standard for Binary
2667 1.1 bjh21 Floating-Point Arithmetic.
2668 1.1 bjh21 -------------------------------------------------------------------------------
2669 1.1 bjh21 */
2670 1.1 bjh21 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2671 1.1 bjh21 {
2672 1.1 bjh21 int16 aExp, bExp, zExp;
2673 1.1 bjh21 bits64 aSig, bSig, zSig;
2674 1.1 bjh21 int16 expDiff;
2675 1.1 bjh21
2676 1.1 bjh21 aSig = extractFloat64Frac( a );
2677 1.1 bjh21 aExp = extractFloat64Exp( a );
2678 1.1 bjh21 bSig = extractFloat64Frac( b );
2679 1.1 bjh21 bExp = extractFloat64Exp( b );
2680 1.1 bjh21 expDiff = aExp - bExp;
2681 1.1 bjh21 aSig <<= 9;
2682 1.1 bjh21 bSig <<= 9;
2683 1.1 bjh21 if ( 0 < expDiff ) {
2684 1.1 bjh21 if ( aExp == 0x7FF ) {
2685 1.1 bjh21 if ( aSig ) return propagateFloat64NaN( a, b );
2686 1.1 bjh21 return a;
2687 1.1 bjh21 }
2688 1.1 bjh21 if ( bExp == 0 ) {
2689 1.1 bjh21 --expDiff;
2690 1.1 bjh21 }
2691 1.1 bjh21 else {
2692 1.1 bjh21 bSig |= LIT64( 0x2000000000000000 );
2693 1.1 bjh21 }
2694 1.1 bjh21 shift64RightJamming( bSig, expDiff, &bSig );
2695 1.1 bjh21 zExp = aExp;
2696 1.1 bjh21 }
2697 1.1 bjh21 else if ( expDiff < 0 ) {
2698 1.1 bjh21 if ( bExp == 0x7FF ) {
2699 1.1 bjh21 if ( bSig ) return propagateFloat64NaN( a, b );
2700 1.1 bjh21 return packFloat64( zSign, 0x7FF, 0 );
2701 1.1 bjh21 }
2702 1.1 bjh21 if ( aExp == 0 ) {
2703 1.1 bjh21 ++expDiff;
2704 1.1 bjh21 }
2705 1.1 bjh21 else {
2706 1.1 bjh21 aSig |= LIT64( 0x2000000000000000 );
2707 1.1 bjh21 }
2708 1.1 bjh21 shift64RightJamming( aSig, - expDiff, &aSig );
2709 1.1 bjh21 zExp = bExp;
2710 1.1 bjh21 }
2711 1.1 bjh21 else {
2712 1.1 bjh21 if ( aExp == 0x7FF ) {
2713 1.1 bjh21 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2714 1.1 bjh21 return a;
2715 1.1 bjh21 }
2716 1.1 bjh21 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2717 1.1 bjh21 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2718 1.1 bjh21 zExp = aExp;
2719 1.1 bjh21 goto roundAndPack;
2720 1.1 bjh21 }
2721 1.1 bjh21 aSig |= LIT64( 0x2000000000000000 );
2722 1.1 bjh21 zSig = ( aSig + bSig )<<1;
2723 1.1 bjh21 --zExp;
2724 1.1 bjh21 if ( (sbits64) zSig < 0 ) {
2725 1.1 bjh21 zSig = aSig + bSig;
2726 1.1 bjh21 ++zExp;
2727 1.1 bjh21 }
2728 1.1 bjh21 roundAndPack:
2729 1.1 bjh21 return roundAndPackFloat64( zSign, zExp, zSig );
2730 1.1 bjh21
2731 1.1 bjh21 }
2732 1.1 bjh21
2733 1.1 bjh21 /*
2734 1.1 bjh21 -------------------------------------------------------------------------------
2735 1.1 bjh21 Returns the result of subtracting the absolute values of the double-
2736 1.1 bjh21 precision floating-point values `a' and `b'. If `zSign' is 1, the
2737 1.1 bjh21 difference is negated before being returned. `zSign' is ignored if the
2738 1.1 bjh21 result is a NaN. The subtraction is performed according to the IEC/IEEE
2739 1.1 bjh21 Standard for Binary Floating-Point Arithmetic.
2740 1.1 bjh21 -------------------------------------------------------------------------------
2741 1.1 bjh21 */
2742 1.1 bjh21 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2743 1.1 bjh21 {
2744 1.1 bjh21 int16 aExp, bExp, zExp;
2745 1.1 bjh21 bits64 aSig, bSig, zSig;
2746 1.1 bjh21 int16 expDiff;
2747 1.1 bjh21
2748 1.1 bjh21 aSig = extractFloat64Frac( a );
2749 1.1 bjh21 aExp = extractFloat64Exp( a );
2750 1.1 bjh21 bSig = extractFloat64Frac( b );
2751 1.1 bjh21 bExp = extractFloat64Exp( b );
2752 1.1 bjh21 expDiff = aExp - bExp;
2753 1.1 bjh21 aSig <<= 10;
2754 1.1 bjh21 bSig <<= 10;
2755 1.1 bjh21 if ( 0 < expDiff ) goto aExpBigger;
2756 1.1 bjh21 if ( expDiff < 0 ) goto bExpBigger;
2757 1.1 bjh21 if ( aExp == 0x7FF ) {
2758 1.1 bjh21 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2759 1.1 bjh21 float_raise( float_flag_invalid );
2760 1.1 bjh21 return float64_default_nan;
2761 1.1 bjh21 }
2762 1.1 bjh21 if ( aExp == 0 ) {
2763 1.1 bjh21 aExp = 1;
2764 1.1 bjh21 bExp = 1;
2765 1.1 bjh21 }
2766 1.1 bjh21 if ( bSig < aSig ) goto aBigger;
2767 1.1 bjh21 if ( aSig < bSig ) goto bBigger;
2768 1.1 bjh21 return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2769 1.1 bjh21 bExpBigger:
2770 1.1 bjh21 if ( bExp == 0x7FF ) {
2771 1.1 bjh21 if ( bSig ) return propagateFloat64NaN( a, b );
2772 1.1 bjh21 return packFloat64( zSign ^ 1, 0x7FF, 0 );
2773 1.1 bjh21 }
2774 1.1 bjh21 if ( aExp == 0 ) {
2775 1.1 bjh21 ++expDiff;
2776 1.1 bjh21 }
2777 1.1 bjh21 else {
2778 1.1 bjh21 aSig |= LIT64( 0x4000000000000000 );
2779 1.1 bjh21 }
2780 1.1 bjh21 shift64RightJamming( aSig, - expDiff, &aSig );
2781 1.1 bjh21 bSig |= LIT64( 0x4000000000000000 );
2782 1.1 bjh21 bBigger:
2783 1.1 bjh21 zSig = bSig - aSig;
2784 1.1 bjh21 zExp = bExp;
2785 1.1 bjh21 zSign ^= 1;
2786 1.1 bjh21 goto normalizeRoundAndPack;
2787 1.1 bjh21 aExpBigger:
2788 1.1 bjh21 if ( aExp == 0x7FF ) {
2789 1.1 bjh21 if ( aSig ) return propagateFloat64NaN( a, b );
2790 1.1 bjh21 return a;
2791 1.1 bjh21 }
2792 1.1 bjh21 if ( bExp == 0 ) {
2793 1.1 bjh21 --expDiff;
2794 1.1 bjh21 }
2795 1.1 bjh21 else {
2796 1.1 bjh21 bSig |= LIT64( 0x4000000000000000 );
2797 1.1 bjh21 }
2798 1.1 bjh21 shift64RightJamming( bSig, expDiff, &bSig );
2799 1.1 bjh21 aSig |= LIT64( 0x4000000000000000 );
2800 1.1 bjh21 aBigger:
2801 1.1 bjh21 zSig = aSig - bSig;
2802 1.1 bjh21 zExp = aExp;
2803 1.1 bjh21 normalizeRoundAndPack:
2804 1.1 bjh21 --zExp;
2805 1.1 bjh21 return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2806 1.1 bjh21
2807 1.1 bjh21 }
2808 1.1 bjh21
2809 1.1 bjh21 /*
2810 1.1 bjh21 -------------------------------------------------------------------------------
2811 1.1 bjh21 Returns the result of adding the double-precision floating-point values `a'
2812 1.1 bjh21 and `b'. The operation is performed according to the IEC/IEEE Standard for
2813 1.1 bjh21 Binary Floating-Point Arithmetic.
2814 1.1 bjh21 -------------------------------------------------------------------------------
2815 1.1 bjh21 */
2816 1.1 bjh21 float64 float64_add( float64 a, float64 b )
2817 1.1 bjh21 {
2818 1.1 bjh21 flag aSign, bSign;
2819 1.1 bjh21
2820 1.1 bjh21 aSign = extractFloat64Sign( a );
2821 1.1 bjh21 bSign = extractFloat64Sign( b );
2822 1.1 bjh21 if ( aSign == bSign ) {
2823 1.1 bjh21 return addFloat64Sigs( a, b, aSign );
2824 1.1 bjh21 }
2825 1.1 bjh21 else {
2826 1.1 bjh21 return subFloat64Sigs( a, b, aSign );
2827 1.1 bjh21 }
2828 1.1 bjh21
2829 1.1 bjh21 }
2830 1.1 bjh21
2831 1.1 bjh21 /*
2832 1.1 bjh21 -------------------------------------------------------------------------------
2833 1.1 bjh21 Returns the result of subtracting the double-precision floating-point values
2834 1.1 bjh21 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2835 1.1 bjh21 for Binary Floating-Point Arithmetic.
2836 1.1 bjh21 -------------------------------------------------------------------------------
2837 1.1 bjh21 */
2838 1.1 bjh21 float64 float64_sub( float64 a, float64 b )
2839 1.1 bjh21 {
2840 1.1 bjh21 flag aSign, bSign;
2841 1.1 bjh21
2842 1.1 bjh21 aSign = extractFloat64Sign( a );
2843 1.1 bjh21 bSign = extractFloat64Sign( b );
2844 1.1 bjh21 if ( aSign == bSign ) {
2845 1.1 bjh21 return subFloat64Sigs( a, b, aSign );
2846 1.1 bjh21 }
2847 1.1 bjh21 else {
2848 1.1 bjh21 return addFloat64Sigs( a, b, aSign );
2849 1.1 bjh21 }
2850 1.1 bjh21
2851 1.1 bjh21 }
2852 1.1 bjh21
2853 1.1 bjh21 /*
2854 1.1 bjh21 -------------------------------------------------------------------------------
2855 1.1 bjh21 Returns the result of multiplying the double-precision floating-point values
2856 1.1 bjh21 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2857 1.1 bjh21 for Binary Floating-Point Arithmetic.
2858 1.1 bjh21 -------------------------------------------------------------------------------
2859 1.1 bjh21 */
2860 1.1 bjh21 float64 float64_mul( float64 a, float64 b )
2861 1.1 bjh21 {
2862 1.1 bjh21 flag aSign, bSign, zSign;
2863 1.1 bjh21 int16 aExp, bExp, zExp;
2864 1.1 bjh21 bits64 aSig, bSig, zSig0, zSig1;
2865 1.1 bjh21
2866 1.1 bjh21 aSig = extractFloat64Frac( a );
2867 1.1 bjh21 aExp = extractFloat64Exp( a );
2868 1.1 bjh21 aSign = extractFloat64Sign( a );
2869 1.1 bjh21 bSig = extractFloat64Frac( b );
2870 1.1 bjh21 bExp = extractFloat64Exp( b );
2871 1.1 bjh21 bSign = extractFloat64Sign( b );
2872 1.1 bjh21 zSign = aSign ^ bSign;
2873 1.1 bjh21 if ( aExp == 0x7FF ) {
2874 1.1 bjh21 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2875 1.1 bjh21 return propagateFloat64NaN( a, b );
2876 1.1 bjh21 }
2877 1.1 bjh21 if ( ( bExp | bSig ) == 0 ) {
2878 1.1 bjh21 float_raise( float_flag_invalid );
2879 1.1 bjh21 return float64_default_nan;
2880 1.1 bjh21 }
2881 1.1 bjh21 return packFloat64( zSign, 0x7FF, 0 );
2882 1.1 bjh21 }
2883 1.1 bjh21 if ( bExp == 0x7FF ) {
2884 1.1 bjh21 if ( bSig ) return propagateFloat64NaN( a, b );
2885 1.1 bjh21 if ( ( aExp | aSig ) == 0 ) {
2886 1.1 bjh21 float_raise( float_flag_invalid );
2887 1.1 bjh21 return float64_default_nan;
2888 1.1 bjh21 }
2889 1.1 bjh21 return packFloat64( zSign, 0x7FF, 0 );
2890 1.1 bjh21 }
2891 1.1 bjh21 if ( aExp == 0 ) {
2892 1.1 bjh21 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2893 1.1 bjh21 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2894 1.1 bjh21 }
2895 1.1 bjh21 if ( bExp == 0 ) {
2896 1.1 bjh21 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2897 1.1 bjh21 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2898 1.1 bjh21 }
2899 1.1 bjh21 zExp = aExp + bExp - 0x3FF;
2900 1.1 bjh21 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2901 1.1 bjh21 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2902 1.1 bjh21 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2903 1.1 bjh21 zSig0 |= ( zSig1 != 0 );
2904 1.1 bjh21 if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2905 1.1 bjh21 zSig0 <<= 1;
2906 1.1 bjh21 --zExp;
2907 1.1 bjh21 }
2908 1.1 bjh21 return roundAndPackFloat64( zSign, zExp, zSig0 );
2909 1.1 bjh21
2910 1.1 bjh21 }
2911 1.1 bjh21
2912 1.1 bjh21 /*
2913 1.1 bjh21 -------------------------------------------------------------------------------
2914 1.1 bjh21 Returns the result of dividing the double-precision floating-point value `a'
2915 1.1 bjh21 by the corresponding value `b'. The operation is performed according to
2916 1.1 bjh21 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2917 1.1 bjh21 -------------------------------------------------------------------------------
2918 1.1 bjh21 */
2919 1.1 bjh21 float64 float64_div( float64 a, float64 b )
2920 1.1 bjh21 {
2921 1.1 bjh21 flag aSign, bSign, zSign;
2922 1.1 bjh21 int16 aExp, bExp, zExp;
2923 1.1 bjh21 bits64 aSig, bSig, zSig;
2924 1.1 bjh21 bits64 rem0, rem1;
2925 1.1 bjh21 bits64 term0, term1;
2926 1.1 bjh21
2927 1.1 bjh21 aSig = extractFloat64Frac( a );
2928 1.1 bjh21 aExp = extractFloat64Exp( a );
2929 1.1 bjh21 aSign = extractFloat64Sign( a );
2930 1.1 bjh21 bSig = extractFloat64Frac( b );
2931 1.1 bjh21 bExp = extractFloat64Exp( b );
2932 1.1 bjh21 bSign = extractFloat64Sign( b );
2933 1.1 bjh21 zSign = aSign ^ bSign;
2934 1.1 bjh21 if ( aExp == 0x7FF ) {
2935 1.1 bjh21 if ( aSig ) return propagateFloat64NaN( a, b );
2936 1.1 bjh21 if ( bExp == 0x7FF ) {
2937 1.1 bjh21 if ( bSig ) return propagateFloat64NaN( a, b );
2938 1.1 bjh21 float_raise( float_flag_invalid );
2939 1.1 bjh21 return float64_default_nan;
2940 1.1 bjh21 }
2941 1.1 bjh21 return packFloat64( zSign, 0x7FF, 0 );
2942 1.1 bjh21 }
2943 1.1 bjh21 if ( bExp == 0x7FF ) {
2944 1.1 bjh21 if ( bSig ) return propagateFloat64NaN( a, b );
2945 1.1 bjh21 return packFloat64( zSign, 0, 0 );
2946 1.1 bjh21 }
2947 1.1 bjh21 if ( bExp == 0 ) {
2948 1.1 bjh21 if ( bSig == 0 ) {
2949 1.1 bjh21 if ( ( aExp | aSig ) == 0 ) {
2950 1.1 bjh21 float_raise( float_flag_invalid );
2951 1.1 bjh21 return float64_default_nan;
2952 1.1 bjh21 }
2953 1.1 bjh21 float_raise( float_flag_divbyzero );
2954 1.1 bjh21 return packFloat64( zSign, 0x7FF, 0 );
2955 1.1 bjh21 }
2956 1.1 bjh21 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2957 1.1 bjh21 }
2958 1.1 bjh21 if ( aExp == 0 ) {
2959 1.1 bjh21 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2960 1.1 bjh21 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2961 1.1 bjh21 }
2962 1.1 bjh21 zExp = aExp - bExp + 0x3FD;
2963 1.1 bjh21 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2964 1.1 bjh21 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2965 1.1 bjh21 if ( bSig <= ( aSig + aSig ) ) {
2966 1.1 bjh21 aSig >>= 1;
2967 1.1 bjh21 ++zExp;
2968 1.1 bjh21 }
2969 1.1 bjh21 zSig = estimateDiv128To64( aSig, 0, bSig );
2970 1.1 bjh21 if ( ( zSig & 0x1FF ) <= 2 ) {
2971 1.1 bjh21 mul64To128( bSig, zSig, &term0, &term1 );
2972 1.1 bjh21 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2973 1.1 bjh21 while ( (sbits64) rem0 < 0 ) {
2974 1.1 bjh21 --zSig;
2975 1.1 bjh21 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2976 1.1 bjh21 }
2977 1.1 bjh21 zSig |= ( rem1 != 0 );
2978 1.1 bjh21 }
2979 1.1 bjh21 return roundAndPackFloat64( zSign, zExp, zSig );
2980 1.1 bjh21
2981 1.1 bjh21 }
2982 1.1 bjh21
2983 1.1 bjh21 #ifndef SOFTFLOAT_FOR_GCC
2984 1.1 bjh21 /*
2985 1.1 bjh21 -------------------------------------------------------------------------------
2986 1.1 bjh21 Returns the remainder of the double-precision floating-point value `a'
2987 1.1 bjh21 with respect to the corresponding value `b'. The operation is performed
2988 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2989 1.1 bjh21 -------------------------------------------------------------------------------
2990 1.1 bjh21 */
2991 1.1 bjh21 float64 float64_rem( float64 a, float64 b )
2992 1.1 bjh21 {
2993 1.1 bjh21 flag aSign, bSign, zSign;
2994 1.1 bjh21 int16 aExp, bExp, expDiff;
2995 1.1 bjh21 bits64 aSig, bSig;
2996 1.1 bjh21 bits64 q, alternateASig;
2997 1.1 bjh21 sbits64 sigMean;
2998 1.1 bjh21
2999 1.1 bjh21 aSig = extractFloat64Frac( a );
3000 1.1 bjh21 aExp = extractFloat64Exp( a );
3001 1.1 bjh21 aSign = extractFloat64Sign( a );
3002 1.1 bjh21 bSig = extractFloat64Frac( b );
3003 1.1 bjh21 bExp = extractFloat64Exp( b );
3004 1.1 bjh21 bSign = extractFloat64Sign( b );
3005 1.1 bjh21 if ( aExp == 0x7FF ) {
3006 1.1 bjh21 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3007 1.1 bjh21 return propagateFloat64NaN( a, b );
3008 1.1 bjh21 }
3009 1.1 bjh21 float_raise( float_flag_invalid );
3010 1.1 bjh21 return float64_default_nan;
3011 1.1 bjh21 }
3012 1.1 bjh21 if ( bExp == 0x7FF ) {
3013 1.1 bjh21 if ( bSig ) return propagateFloat64NaN( a, b );
3014 1.1 bjh21 return a;
3015 1.1 bjh21 }
3016 1.1 bjh21 if ( bExp == 0 ) {
3017 1.1 bjh21 if ( bSig == 0 ) {
3018 1.1 bjh21 float_raise( float_flag_invalid );
3019 1.1 bjh21 return float64_default_nan;
3020 1.1 bjh21 }
3021 1.1 bjh21 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3022 1.1 bjh21 }
3023 1.1 bjh21 if ( aExp == 0 ) {
3024 1.1 bjh21 if ( aSig == 0 ) return a;
3025 1.1 bjh21 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3026 1.1 bjh21 }
3027 1.1 bjh21 expDiff = aExp - bExp;
3028 1.1 bjh21 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3029 1.1 bjh21 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3030 1.1 bjh21 if ( expDiff < 0 ) {
3031 1.1 bjh21 if ( expDiff < -1 ) return a;
3032 1.1 bjh21 aSig >>= 1;
3033 1.1 bjh21 }
3034 1.1 bjh21 q = ( bSig <= aSig );
3035 1.1 bjh21 if ( q ) aSig -= bSig;
3036 1.1 bjh21 expDiff -= 64;
3037 1.1 bjh21 while ( 0 < expDiff ) {
3038 1.1 bjh21 q = estimateDiv128To64( aSig, 0, bSig );
3039 1.1 bjh21 q = ( 2 < q ) ? q - 2 : 0;
3040 1.1 bjh21 aSig = - ( ( bSig>>2 ) * q );
3041 1.1 bjh21 expDiff -= 62;
3042 1.1 bjh21 }
3043 1.1 bjh21 expDiff += 64;
3044 1.1 bjh21 if ( 0 < expDiff ) {
3045 1.1 bjh21 q = estimateDiv128To64( aSig, 0, bSig );
3046 1.1 bjh21 q = ( 2 < q ) ? q - 2 : 0;
3047 1.1 bjh21 q >>= 64 - expDiff;
3048 1.1 bjh21 bSig >>= 2;
3049 1.1 bjh21 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3050 1.1 bjh21 }
3051 1.1 bjh21 else {
3052 1.1 bjh21 aSig >>= 2;
3053 1.1 bjh21 bSig >>= 2;
3054 1.1 bjh21 }
3055 1.1 bjh21 do {
3056 1.1 bjh21 alternateASig = aSig;
3057 1.1 bjh21 ++q;
3058 1.1 bjh21 aSig -= bSig;
3059 1.1 bjh21 } while ( 0 <= (sbits64) aSig );
3060 1.1 bjh21 sigMean = aSig + alternateASig;
3061 1.1 bjh21 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3062 1.1 bjh21 aSig = alternateASig;
3063 1.1 bjh21 }
3064 1.1 bjh21 zSign = ( (sbits64) aSig < 0 );
3065 1.1 bjh21 if ( zSign ) aSig = - aSig;
3066 1.1 bjh21 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3067 1.1 bjh21
3068 1.1 bjh21 }
3069 1.1 bjh21
3070 1.1 bjh21 /*
3071 1.1 bjh21 -------------------------------------------------------------------------------
3072 1.1 bjh21 Returns the square root of the double-precision floating-point value `a'.
3073 1.1 bjh21 The operation is performed according to the IEC/IEEE Standard for Binary
3074 1.1 bjh21 Floating-Point Arithmetic.
3075 1.1 bjh21 -------------------------------------------------------------------------------
3076 1.1 bjh21 */
3077 1.1 bjh21 float64 float64_sqrt( float64 a )
3078 1.1 bjh21 {
3079 1.1 bjh21 flag aSign;
3080 1.1 bjh21 int16 aExp, zExp;
3081 1.1 bjh21 bits64 aSig, zSig, doubleZSig;
3082 1.1 bjh21 bits64 rem0, rem1, term0, term1;
3083 1.1 bjh21
3084 1.1 bjh21 aSig = extractFloat64Frac( a );
3085 1.1 bjh21 aExp = extractFloat64Exp( a );
3086 1.1 bjh21 aSign = extractFloat64Sign( a );
3087 1.1 bjh21 if ( aExp == 0x7FF ) {
3088 1.1 bjh21 if ( aSig ) return propagateFloat64NaN( a, a );
3089 1.1 bjh21 if ( ! aSign ) return a;
3090 1.1 bjh21 float_raise( float_flag_invalid );
3091 1.1 bjh21 return float64_default_nan;
3092 1.1 bjh21 }
3093 1.1 bjh21 if ( aSign ) {
3094 1.1 bjh21 if ( ( aExp | aSig ) == 0 ) return a;
3095 1.1 bjh21 float_raise( float_flag_invalid );
3096 1.1 bjh21 return float64_default_nan;
3097 1.1 bjh21 }
3098 1.1 bjh21 if ( aExp == 0 ) {
3099 1.1 bjh21 if ( aSig == 0 ) return 0;
3100 1.1 bjh21 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3101 1.1 bjh21 }
3102 1.1 bjh21 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3103 1.1 bjh21 aSig |= LIT64( 0x0010000000000000 );
3104 1.1 bjh21 zSig = estimateSqrt32( aExp, aSig>>21 );
3105 1.1 bjh21 aSig <<= 9 - ( aExp & 1 );
3106 1.1 bjh21 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3107 1.1 bjh21 if ( ( zSig & 0x1FF ) <= 5 ) {
3108 1.1 bjh21 doubleZSig = zSig<<1;
3109 1.1 bjh21 mul64To128( zSig, zSig, &term0, &term1 );
3110 1.1 bjh21 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3111 1.1 bjh21 while ( (sbits64) rem0 < 0 ) {
3112 1.1 bjh21 --zSig;
3113 1.1 bjh21 doubleZSig -= 2;
3114 1.1 bjh21 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3115 1.1 bjh21 }
3116 1.1 bjh21 zSig |= ( ( rem0 | rem1 ) != 0 );
3117 1.1 bjh21 }
3118 1.1 bjh21 return roundAndPackFloat64( 0, zExp, zSig );
3119 1.1 bjh21
3120 1.1 bjh21 }
3121 1.1 bjh21 #endif
3122 1.1 bjh21
3123 1.1 bjh21 /*
3124 1.1 bjh21 -------------------------------------------------------------------------------
3125 1.1 bjh21 Returns 1 if the double-precision floating-point value `a' is equal to the
3126 1.1 bjh21 corresponding value `b', and 0 otherwise. The comparison is performed
3127 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3128 1.1 bjh21 -------------------------------------------------------------------------------
3129 1.1 bjh21 */
3130 1.1 bjh21 flag float64_eq( float64 a, float64 b )
3131 1.1 bjh21 {
3132 1.1 bjh21
3133 1.1 bjh21 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3134 1.1 bjh21 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3135 1.1 bjh21 ) {
3136 1.1 bjh21 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3137 1.1 bjh21 float_raise( float_flag_invalid );
3138 1.1 bjh21 }
3139 1.1 bjh21 return 0;
3140 1.1 bjh21 }
3141 1.1 bjh21 return ( a == b ) ||
3142 1.1 bjh21 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3143 1.1 bjh21
3144 1.1 bjh21 }
3145 1.1 bjh21
3146 1.1 bjh21 /*
3147 1.1 bjh21 -------------------------------------------------------------------------------
3148 1.1 bjh21 Returns 1 if the double-precision floating-point value `a' is less than or
3149 1.1 bjh21 equal to the corresponding value `b', and 0 otherwise. The comparison is
3150 1.1 bjh21 performed according to the IEC/IEEE Standard for Binary Floating-Point
3151 1.1 bjh21 Arithmetic.
3152 1.1 bjh21 -------------------------------------------------------------------------------
3153 1.1 bjh21 */
3154 1.1 bjh21 flag float64_le( float64 a, float64 b )
3155 1.1 bjh21 {
3156 1.1 bjh21 flag aSign, bSign;
3157 1.1 bjh21
3158 1.1 bjh21 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3159 1.1 bjh21 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3160 1.1 bjh21 ) {
3161 1.1 bjh21 float_raise( float_flag_invalid );
3162 1.1 bjh21 return 0;
3163 1.1 bjh21 }
3164 1.1 bjh21 aSign = extractFloat64Sign( a );
3165 1.1 bjh21 bSign = extractFloat64Sign( b );
3166 1.1 bjh21 if ( aSign != bSign )
3167 1.1 bjh21 return aSign ||
3168 1.1 bjh21 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3169 1.1 bjh21 0 );
3170 1.1 bjh21 return ( a == b ) ||
3171 1.1 bjh21 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3172 1.1 bjh21
3173 1.1 bjh21 }
3174 1.1 bjh21
3175 1.1 bjh21 /*
3176 1.1 bjh21 -------------------------------------------------------------------------------
3177 1.1 bjh21 Returns 1 if the double-precision floating-point value `a' is less than
3178 1.1 bjh21 the corresponding value `b', and 0 otherwise. The comparison is performed
3179 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3180 1.1 bjh21 -------------------------------------------------------------------------------
3181 1.1 bjh21 */
3182 1.1 bjh21 flag float64_lt( float64 a, float64 b )
3183 1.1 bjh21 {
3184 1.1 bjh21 flag aSign, bSign;
3185 1.1 bjh21
3186 1.1 bjh21 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3187 1.1 bjh21 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3188 1.1 bjh21 ) {
3189 1.1 bjh21 float_raise( float_flag_invalid );
3190 1.1 bjh21 return 0;
3191 1.1 bjh21 }
3192 1.1 bjh21 aSign = extractFloat64Sign( a );
3193 1.1 bjh21 bSign = extractFloat64Sign( b );
3194 1.1 bjh21 if ( aSign != bSign )
3195 1.1 bjh21 return aSign &&
3196 1.1 bjh21 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3197 1.1 bjh21 0 );
3198 1.1 bjh21 return ( a != b ) &&
3199 1.1 bjh21 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3200 1.1 bjh21
3201 1.1 bjh21 }
3202 1.1 bjh21
3203 1.1 bjh21 #ifndef SOFTFLOAT_FOR_GCC
3204 1.1 bjh21 /*
3205 1.1 bjh21 -------------------------------------------------------------------------------
3206 1.1 bjh21 Returns 1 if the double-precision floating-point value `a' is equal to the
3207 1.1 bjh21 corresponding value `b', and 0 otherwise. The invalid exception is raised
3208 1.1 bjh21 if either operand is a NaN. Otherwise, the comparison is performed
3209 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3210 1.1 bjh21 -------------------------------------------------------------------------------
3211 1.1 bjh21 */
3212 1.1 bjh21 flag float64_eq_signaling( float64 a, float64 b )
3213 1.1 bjh21 {
3214 1.1 bjh21
3215 1.1 bjh21 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3216 1.1 bjh21 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3217 1.1 bjh21 ) {
3218 1.1 bjh21 float_raise( float_flag_invalid );
3219 1.1 bjh21 return 0;
3220 1.1 bjh21 }
3221 1.1 bjh21 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3222 1.1 bjh21
3223 1.1 bjh21 }
3224 1.1 bjh21
3225 1.1 bjh21 /*
3226 1.1 bjh21 -------------------------------------------------------------------------------
3227 1.1 bjh21 Returns 1 if the double-precision floating-point value `a' is less than or
3228 1.1 bjh21 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3229 1.1 bjh21 cause an exception. Otherwise, the comparison is performed according to the
3230 1.1 bjh21 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3231 1.1 bjh21 -------------------------------------------------------------------------------
3232 1.1 bjh21 */
3233 1.1 bjh21 flag float64_le_quiet( float64 a, float64 b )
3234 1.1 bjh21 {
3235 1.1 bjh21 flag aSign, bSign;
3236 1.1 bjh21
3237 1.1 bjh21 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3238 1.1 bjh21 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3239 1.1 bjh21 ) {
3240 1.1 bjh21 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3241 1.1 bjh21 float_raise( float_flag_invalid );
3242 1.1 bjh21 }
3243 1.1 bjh21 return 0;
3244 1.1 bjh21 }
3245 1.1 bjh21 aSign = extractFloat64Sign( a );
3246 1.1 bjh21 bSign = extractFloat64Sign( b );
3247 1.1 bjh21 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3248 1.1 bjh21 return ( a == b ) || ( aSign ^ ( a < b ) );
3249 1.1 bjh21
3250 1.1 bjh21 }
3251 1.1 bjh21
3252 1.1 bjh21 /*
3253 1.1 bjh21 -------------------------------------------------------------------------------
3254 1.1 bjh21 Returns 1 if the double-precision floating-point value `a' is less than
3255 1.1 bjh21 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3256 1.1 bjh21 exception. Otherwise, the comparison is performed according to the IEC/IEEE
3257 1.1 bjh21 Standard for Binary Floating-Point Arithmetic.
3258 1.1 bjh21 -------------------------------------------------------------------------------
3259 1.1 bjh21 */
3260 1.1 bjh21 flag float64_lt_quiet( float64 a, float64 b )
3261 1.1 bjh21 {
3262 1.1 bjh21 flag aSign, bSign;
3263 1.1 bjh21
3264 1.1 bjh21 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3265 1.1 bjh21 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3266 1.1 bjh21 ) {
3267 1.1 bjh21 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3268 1.1 bjh21 float_raise( float_flag_invalid );
3269 1.1 bjh21 }
3270 1.1 bjh21 return 0;
3271 1.1 bjh21 }
3272 1.1 bjh21 aSign = extractFloat64Sign( a );
3273 1.1 bjh21 bSign = extractFloat64Sign( b );
3274 1.1 bjh21 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3275 1.1 bjh21 return ( a != b ) && ( aSign ^ ( a < b ) );
3276 1.1 bjh21
3277 1.1 bjh21 }
3278 1.1 bjh21 #endif
3279 1.1 bjh21
3280 1.1 bjh21 #ifdef FLOATX80
3281 1.1 bjh21
3282 1.1 bjh21 /*
3283 1.1 bjh21 -------------------------------------------------------------------------------
3284 1.1 bjh21 Returns the result of converting the extended double-precision floating-
3285 1.1 bjh21 point value `a' to the 32-bit two's complement integer format. The
3286 1.1 bjh21 conversion is performed according to the IEC/IEEE Standard for Binary
3287 1.1 bjh21 Floating-Point Arithmetic---which means in particular that the conversion
3288 1.1 bjh21 is rounded according to the current rounding mode. If `a' is a NaN, the
3289 1.1 bjh21 largest positive integer is returned. Otherwise, if the conversion
3290 1.1 bjh21 overflows, the largest integer with the same sign as `a' is returned.
3291 1.1 bjh21 -------------------------------------------------------------------------------
3292 1.1 bjh21 */
3293 1.1 bjh21 int32 floatx80_to_int32( floatx80 a )
3294 1.1 bjh21 {
3295 1.1 bjh21 flag aSign;
3296 1.1 bjh21 int32 aExp, shiftCount;
3297 1.1 bjh21 bits64 aSig;
3298 1.1 bjh21
3299 1.1 bjh21 aSig = extractFloatx80Frac( a );
3300 1.1 bjh21 aExp = extractFloatx80Exp( a );
3301 1.1 bjh21 aSign = extractFloatx80Sign( a );
3302 1.1 bjh21 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3303 1.1 bjh21 shiftCount = 0x4037 - aExp;
3304 1.1 bjh21 if ( shiftCount <= 0 ) shiftCount = 1;
3305 1.1 bjh21 shift64RightJamming( aSig, shiftCount, &aSig );
3306 1.1 bjh21 return roundAndPackInt32( aSign, aSig );
3307 1.1 bjh21
3308 1.1 bjh21 }
3309 1.1 bjh21
3310 1.1 bjh21 /*
3311 1.1 bjh21 -------------------------------------------------------------------------------
3312 1.1 bjh21 Returns the result of converting the extended double-precision floating-
3313 1.1 bjh21 point value `a' to the 32-bit two's complement integer format. The
3314 1.1 bjh21 conversion is performed according to the IEC/IEEE Standard for Binary
3315 1.1 bjh21 Floating-Point Arithmetic, except that the conversion is always rounded
3316 1.1 bjh21 toward zero. If `a' is a NaN, the largest positive integer is returned.
3317 1.1 bjh21 Otherwise, if the conversion overflows, the largest integer with the same
3318 1.1 bjh21 sign as `a' is returned.
3319 1.1 bjh21 -------------------------------------------------------------------------------
3320 1.1 bjh21 */
3321 1.1 bjh21 int32 floatx80_to_int32_round_to_zero( floatx80 a )
3322 1.1 bjh21 {
3323 1.1 bjh21 flag aSign;
3324 1.1 bjh21 int32 aExp, shiftCount;
3325 1.1 bjh21 bits64 aSig, savedASig;
3326 1.1 bjh21 int32 z;
3327 1.1 bjh21
3328 1.1 bjh21 aSig = extractFloatx80Frac( a );
3329 1.1 bjh21 aExp = extractFloatx80Exp( a );
3330 1.1 bjh21 aSign = extractFloatx80Sign( a );
3331 1.1 bjh21 if ( 0x401E < aExp ) {
3332 1.1 bjh21 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3333 1.1 bjh21 goto invalid;
3334 1.1 bjh21 }
3335 1.1 bjh21 else if ( aExp < 0x3FFF ) {
3336 1.1 bjh21 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
3337 1.1 bjh21 return 0;
3338 1.1 bjh21 }
3339 1.1 bjh21 shiftCount = 0x403E - aExp;
3340 1.1 bjh21 savedASig = aSig;
3341 1.1 bjh21 aSig >>= shiftCount;
3342 1.1 bjh21 z = aSig;
3343 1.1 bjh21 if ( aSign ) z = - z;
3344 1.1 bjh21 if ( ( z < 0 ) ^ aSign ) {
3345 1.1 bjh21 invalid:
3346 1.1 bjh21 float_raise( float_flag_invalid );
3347 1.1 bjh21 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3348 1.1 bjh21 }
3349 1.1 bjh21 if ( ( aSig<<shiftCount ) != savedASig ) {
3350 1.1 bjh21 float_exception_flags |= float_flag_inexact;
3351 1.1 bjh21 }
3352 1.1 bjh21 return z;
3353 1.1 bjh21
3354 1.1 bjh21 }
3355 1.1 bjh21
3356 1.1 bjh21 /*
3357 1.1 bjh21 -------------------------------------------------------------------------------
3358 1.1 bjh21 Returns the result of converting the extended double-precision floating-
3359 1.1 bjh21 point value `a' to the 64-bit two's complement integer format. The
3360 1.1 bjh21 conversion is performed according to the IEC/IEEE Standard for Binary
3361 1.1 bjh21 Floating-Point Arithmetic---which means in particular that the conversion
3362 1.1 bjh21 is rounded according to the current rounding mode. If `a' is a NaN,
3363 1.1 bjh21 the largest positive integer is returned. Otherwise, if the conversion
3364 1.1 bjh21 overflows, the largest integer with the same sign as `a' is returned.
3365 1.1 bjh21 -------------------------------------------------------------------------------
3366 1.1 bjh21 */
3367 1.1 bjh21 int64 floatx80_to_int64( floatx80 a )
3368 1.1 bjh21 {
3369 1.1 bjh21 flag aSign;
3370 1.1 bjh21 int32 aExp, shiftCount;
3371 1.1 bjh21 bits64 aSig, aSigExtra;
3372 1.1 bjh21
3373 1.1 bjh21 aSig = extractFloatx80Frac( a );
3374 1.1 bjh21 aExp = extractFloatx80Exp( a );
3375 1.1 bjh21 aSign = extractFloatx80Sign( a );
3376 1.1 bjh21 shiftCount = 0x403E - aExp;
3377 1.1 bjh21 if ( shiftCount <= 0 ) {
3378 1.1 bjh21 if ( shiftCount ) {
3379 1.1 bjh21 float_raise( float_flag_invalid );
3380 1.1 bjh21 if ( ! aSign
3381 1.1 bjh21 || ( ( aExp == 0x7FFF )
3382 1.1 bjh21 && ( aSig != LIT64( 0x8000000000000000 ) ) )
3383 1.1 bjh21 ) {
3384 1.1 bjh21 return LIT64( 0x7FFFFFFFFFFFFFFF );
3385 1.1 bjh21 }
3386 1.1 bjh21 return (sbits64) LIT64( 0x8000000000000000 );
3387 1.1 bjh21 }
3388 1.1 bjh21 aSigExtra = 0;
3389 1.1 bjh21 }
3390 1.1 bjh21 else {
3391 1.1 bjh21 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3392 1.1 bjh21 }
3393 1.1 bjh21 return roundAndPackInt64( aSign, aSig, aSigExtra );
3394 1.1 bjh21
3395 1.1 bjh21 }
3396 1.1 bjh21
3397 1.1 bjh21 /*
3398 1.1 bjh21 -------------------------------------------------------------------------------
3399 1.1 bjh21 Returns the result of converting the extended double-precision floating-
3400 1.1 bjh21 point value `a' to the 64-bit two's complement integer format. The
3401 1.1 bjh21 conversion is performed according to the IEC/IEEE Standard for Binary
3402 1.1 bjh21 Floating-Point Arithmetic, except that the conversion is always rounded
3403 1.1 bjh21 toward zero. If `a' is a NaN, the largest positive integer is returned.
3404 1.1 bjh21 Otherwise, if the conversion overflows, the largest integer with the same
3405 1.1 bjh21 sign as `a' is returned.
3406 1.1 bjh21 -------------------------------------------------------------------------------
3407 1.1 bjh21 */
3408 1.1 bjh21 int64 floatx80_to_int64_round_to_zero( floatx80 a )
3409 1.1 bjh21 {
3410 1.1 bjh21 flag aSign;
3411 1.1 bjh21 int32 aExp, shiftCount;
3412 1.1 bjh21 bits64 aSig;
3413 1.1 bjh21 int64 z;
3414 1.1 bjh21
3415 1.1 bjh21 aSig = extractFloatx80Frac( a );
3416 1.1 bjh21 aExp = extractFloatx80Exp( a );
3417 1.1 bjh21 aSign = extractFloatx80Sign( a );
3418 1.1 bjh21 shiftCount = aExp - 0x403E;
3419 1.1 bjh21 if ( 0 <= shiftCount ) {
3420 1.1 bjh21 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3421 1.1 bjh21 if ( ( a.high != 0xC03E ) || aSig ) {
3422 1.1 bjh21 float_raise( float_flag_invalid );
3423 1.1 bjh21 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3424 1.1 bjh21 return LIT64( 0x7FFFFFFFFFFFFFFF );
3425 1.1 bjh21 }
3426 1.1 bjh21 }
3427 1.1 bjh21 return (sbits64) LIT64( 0x8000000000000000 );
3428 1.1 bjh21 }
3429 1.1 bjh21 else if ( aExp < 0x3FFF ) {
3430 1.1 bjh21 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
3431 1.1 bjh21 return 0;
3432 1.1 bjh21 }
3433 1.1 bjh21 z = aSig>>( - shiftCount );
3434 1.1 bjh21 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3435 1.1 bjh21 float_exception_flags |= float_flag_inexact;
3436 1.1 bjh21 }
3437 1.1 bjh21 if ( aSign ) z = - z;
3438 1.1 bjh21 return z;
3439 1.1 bjh21
3440 1.1 bjh21 }
3441 1.1 bjh21
3442 1.1 bjh21 /*
3443 1.1 bjh21 -------------------------------------------------------------------------------
3444 1.1 bjh21 Returns the result of converting the extended double-precision floating-
3445 1.1 bjh21 point value `a' to the single-precision floating-point format. The
3446 1.1 bjh21 conversion is performed according to the IEC/IEEE Standard for Binary
3447 1.1 bjh21 Floating-Point Arithmetic.
3448 1.1 bjh21 -------------------------------------------------------------------------------
3449 1.1 bjh21 */
3450 1.1 bjh21 float32 floatx80_to_float32( floatx80 a )
3451 1.1 bjh21 {
3452 1.1 bjh21 flag aSign;
3453 1.1 bjh21 int32 aExp;
3454 1.1 bjh21 bits64 aSig;
3455 1.1 bjh21
3456 1.1 bjh21 aSig = extractFloatx80Frac( a );
3457 1.1 bjh21 aExp = extractFloatx80Exp( a );
3458 1.1 bjh21 aSign = extractFloatx80Sign( a );
3459 1.1 bjh21 if ( aExp == 0x7FFF ) {
3460 1.1 bjh21 if ( (bits64) ( aSig<<1 ) ) {
3461 1.1 bjh21 return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3462 1.1 bjh21 }
3463 1.1 bjh21 return packFloat32( aSign, 0xFF, 0 );
3464 1.1 bjh21 }
3465 1.1 bjh21 shift64RightJamming( aSig, 33, &aSig );
3466 1.1 bjh21 if ( aExp || aSig ) aExp -= 0x3F81;
3467 1.1 bjh21 return roundAndPackFloat32( aSign, aExp, aSig );
3468 1.1 bjh21
3469 1.1 bjh21 }
3470 1.1 bjh21
3471 1.1 bjh21 /*
3472 1.1 bjh21 -------------------------------------------------------------------------------
3473 1.1 bjh21 Returns the result of converting the extended double-precision floating-
3474 1.1 bjh21 point value `a' to the double-precision floating-point format. The
3475 1.1 bjh21 conversion is performed according to the IEC/IEEE Standard for Binary
3476 1.1 bjh21 Floating-Point Arithmetic.
3477 1.1 bjh21 -------------------------------------------------------------------------------
3478 1.1 bjh21 */
3479 1.1 bjh21 float64 floatx80_to_float64( floatx80 a )
3480 1.1 bjh21 {
3481 1.1 bjh21 flag aSign;
3482 1.1 bjh21 int32 aExp;
3483 1.1 bjh21 bits64 aSig, zSig;
3484 1.1 bjh21
3485 1.1 bjh21 aSig = extractFloatx80Frac( a );
3486 1.1 bjh21 aExp = extractFloatx80Exp( a );
3487 1.1 bjh21 aSign = extractFloatx80Sign( a );
3488 1.1 bjh21 if ( aExp == 0x7FFF ) {
3489 1.1 bjh21 if ( (bits64) ( aSig<<1 ) ) {
3490 1.1 bjh21 return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3491 1.1 bjh21 }
3492 1.1 bjh21 return packFloat64( aSign, 0x7FF, 0 );
3493 1.1 bjh21 }
3494 1.1 bjh21 shift64RightJamming( aSig, 1, &zSig );
3495 1.1 bjh21 if ( aExp || aSig ) aExp -= 0x3C01;
3496 1.1 bjh21 return roundAndPackFloat64( aSign, aExp, zSig );
3497 1.1 bjh21
3498 1.1 bjh21 }
3499 1.1 bjh21
3500 1.1 bjh21 #ifdef FLOAT128
3501 1.1 bjh21
3502 1.1 bjh21 /*
3503 1.1 bjh21 -------------------------------------------------------------------------------
3504 1.1 bjh21 Returns the result of converting the extended double-precision floating-
3505 1.1 bjh21 point value `a' to the quadruple-precision floating-point format. The
3506 1.1 bjh21 conversion is performed according to the IEC/IEEE Standard for Binary
3507 1.1 bjh21 Floating-Point Arithmetic.
3508 1.1 bjh21 -------------------------------------------------------------------------------
3509 1.1 bjh21 */
3510 1.1 bjh21 float128 floatx80_to_float128( floatx80 a )
3511 1.1 bjh21 {
3512 1.1 bjh21 flag aSign;
3513 1.1 bjh21 int16 aExp;
3514 1.1 bjh21 bits64 aSig, zSig0, zSig1;
3515 1.1 bjh21
3516 1.1 bjh21 aSig = extractFloatx80Frac( a );
3517 1.1 bjh21 aExp = extractFloatx80Exp( a );
3518 1.1 bjh21 aSign = extractFloatx80Sign( a );
3519 1.1 bjh21 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3520 1.1 bjh21 return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3521 1.1 bjh21 }
3522 1.1 bjh21 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3523 1.1 bjh21 return packFloat128( aSign, aExp, zSig0, zSig1 );
3524 1.1 bjh21
3525 1.1 bjh21 }
3526 1.1 bjh21
3527 1.1 bjh21 #endif
3528 1.1 bjh21
3529 1.1 bjh21 /*
3530 1.1 bjh21 -------------------------------------------------------------------------------
3531 1.1 bjh21 Rounds the extended double-precision floating-point value `a' to an integer,
3532 1.1 bjh21 and returns the result as an extended quadruple-precision floating-point
3533 1.1 bjh21 value. The operation is performed according to the IEC/IEEE Standard for
3534 1.1 bjh21 Binary Floating-Point Arithmetic.
3535 1.1 bjh21 -------------------------------------------------------------------------------
3536 1.1 bjh21 */
3537 1.1 bjh21 floatx80 floatx80_round_to_int( floatx80 a )
3538 1.1 bjh21 {
3539 1.1 bjh21 flag aSign;
3540 1.1 bjh21 int32 aExp;
3541 1.1 bjh21 bits64 lastBitMask, roundBitsMask;
3542 1.1 bjh21 int8 roundingMode;
3543 1.1 bjh21 floatx80 z;
3544 1.1 bjh21
3545 1.1 bjh21 aExp = extractFloatx80Exp( a );
3546 1.1 bjh21 if ( 0x403E <= aExp ) {
3547 1.1 bjh21 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3548 1.1 bjh21 return propagateFloatx80NaN( a, a );
3549 1.1 bjh21 }
3550 1.1 bjh21 return a;
3551 1.1 bjh21 }
3552 1.1 bjh21 if ( aExp < 0x3FFF ) {
3553 1.1 bjh21 if ( ( aExp == 0 )
3554 1.1 bjh21 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3555 1.1 bjh21 return a;
3556 1.1 bjh21 }
3557 1.1 bjh21 float_exception_flags |= float_flag_inexact;
3558 1.1 bjh21 aSign = extractFloatx80Sign( a );
3559 1.1 bjh21 switch ( float_rounding_mode ) {
3560 1.1 bjh21 case float_round_nearest_even:
3561 1.1 bjh21 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3562 1.1 bjh21 ) {
3563 1.1 bjh21 return
3564 1.1 bjh21 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3565 1.1 bjh21 }
3566 1.1 bjh21 break;
3567 1.1 bjh21 case float_round_to_zero:
3568 1.1 bjh21 break;
3569 1.1 bjh21 case float_round_down:
3570 1.1 bjh21 return
3571 1.1 bjh21 aSign ?
3572 1.1 bjh21 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3573 1.1 bjh21 : packFloatx80( 0, 0, 0 );
3574 1.1 bjh21 case float_round_up:
3575 1.1 bjh21 return
3576 1.1 bjh21 aSign ? packFloatx80( 1, 0, 0 )
3577 1.1 bjh21 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3578 1.1 bjh21 }
3579 1.1 bjh21 return packFloatx80( aSign, 0, 0 );
3580 1.1 bjh21 }
3581 1.1 bjh21 lastBitMask = 1;
3582 1.1 bjh21 lastBitMask <<= 0x403E - aExp;
3583 1.1 bjh21 roundBitsMask = lastBitMask - 1;
3584 1.1 bjh21 z = a;
3585 1.1 bjh21 roundingMode = float_rounding_mode;
3586 1.1 bjh21 if ( roundingMode == float_round_nearest_even ) {
3587 1.1 bjh21 z.low += lastBitMask>>1;
3588 1.1 bjh21 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3589 1.1 bjh21 }
3590 1.1 bjh21 else if ( roundingMode != float_round_to_zero ) {
3591 1.1 bjh21 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3592 1.1 bjh21 z.low += roundBitsMask;
3593 1.1 bjh21 }
3594 1.1 bjh21 }
3595 1.1 bjh21 z.low &= ~ roundBitsMask;
3596 1.1 bjh21 if ( z.low == 0 ) {
3597 1.1 bjh21 ++z.high;
3598 1.1 bjh21 z.low = LIT64( 0x8000000000000000 );
3599 1.1 bjh21 }
3600 1.1 bjh21 if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
3601 1.1 bjh21 return z;
3602 1.1 bjh21
3603 1.1 bjh21 }
3604 1.1 bjh21
3605 1.1 bjh21 /*
3606 1.1 bjh21 -------------------------------------------------------------------------------
3607 1.1 bjh21 Returns the result of adding the absolute values of the extended double-
3608 1.1 bjh21 precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3609 1.1 bjh21 negated before being returned. `zSign' is ignored if the result is a NaN.
3610 1.1 bjh21 The addition is performed according to the IEC/IEEE Standard for Binary
3611 1.1 bjh21 Floating-Point Arithmetic.
3612 1.1 bjh21 -------------------------------------------------------------------------------
3613 1.1 bjh21 */
3614 1.1 bjh21 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3615 1.1 bjh21 {
3616 1.1 bjh21 int32 aExp, bExp, zExp;
3617 1.1 bjh21 bits64 aSig, bSig, zSig0, zSig1;
3618 1.1 bjh21 int32 expDiff;
3619 1.1 bjh21
3620 1.1 bjh21 aSig = extractFloatx80Frac( a );
3621 1.1 bjh21 aExp = extractFloatx80Exp( a );
3622 1.1 bjh21 bSig = extractFloatx80Frac( b );
3623 1.1 bjh21 bExp = extractFloatx80Exp( b );
3624 1.1 bjh21 expDiff = aExp - bExp;
3625 1.1 bjh21 if ( 0 < expDiff ) {
3626 1.1 bjh21 if ( aExp == 0x7FFF ) {
3627 1.1 bjh21 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3628 1.1 bjh21 return a;
3629 1.1 bjh21 }
3630 1.1 bjh21 if ( bExp == 0 ) --expDiff;
3631 1.1 bjh21 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3632 1.1 bjh21 zExp = aExp;
3633 1.1 bjh21 }
3634 1.1 bjh21 else if ( expDiff < 0 ) {
3635 1.1 bjh21 if ( bExp == 0x7FFF ) {
3636 1.1 bjh21 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3637 1.1 bjh21 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3638 1.1 bjh21 }
3639 1.1 bjh21 if ( aExp == 0 ) ++expDiff;
3640 1.1 bjh21 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3641 1.1 bjh21 zExp = bExp;
3642 1.1 bjh21 }
3643 1.1 bjh21 else {
3644 1.1 bjh21 if ( aExp == 0x7FFF ) {
3645 1.1 bjh21 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3646 1.1 bjh21 return propagateFloatx80NaN( a, b );
3647 1.1 bjh21 }
3648 1.1 bjh21 return a;
3649 1.1 bjh21 }
3650 1.1 bjh21 zSig1 = 0;
3651 1.1 bjh21 zSig0 = aSig + bSig;
3652 1.1 bjh21 if ( aExp == 0 ) {
3653 1.1 bjh21 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3654 1.1 bjh21 goto roundAndPack;
3655 1.1 bjh21 }
3656 1.1 bjh21 zExp = aExp;
3657 1.1 bjh21 goto shiftRight1;
3658 1.1 bjh21 }
3659 1.1 bjh21 zSig0 = aSig + bSig;
3660 1.1 bjh21 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3661 1.1 bjh21 shiftRight1:
3662 1.1 bjh21 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3663 1.1 bjh21 zSig0 |= LIT64( 0x8000000000000000 );
3664 1.1 bjh21 ++zExp;
3665 1.1 bjh21 roundAndPack:
3666 1.1 bjh21 return
3667 1.1 bjh21 roundAndPackFloatx80(
3668 1.1 bjh21 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3669 1.1 bjh21
3670 1.1 bjh21 }
3671 1.1 bjh21
3672 1.1 bjh21 /*
3673 1.1 bjh21 -------------------------------------------------------------------------------
3674 1.1 bjh21 Returns the result of subtracting the absolute values of the extended
3675 1.1 bjh21 double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3676 1.1 bjh21 difference is negated before being returned. `zSign' is ignored if the
3677 1.1 bjh21 result is a NaN. The subtraction is performed according to the IEC/IEEE
3678 1.1 bjh21 Standard for Binary Floating-Point Arithmetic.
3679 1.1 bjh21 -------------------------------------------------------------------------------
3680 1.1 bjh21 */
3681 1.1 bjh21 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3682 1.1 bjh21 {
3683 1.1 bjh21 int32 aExp, bExp, zExp;
3684 1.1 bjh21 bits64 aSig, bSig, zSig0, zSig1;
3685 1.1 bjh21 int32 expDiff;
3686 1.1 bjh21 floatx80 z;
3687 1.1 bjh21
3688 1.1 bjh21 aSig = extractFloatx80Frac( a );
3689 1.1 bjh21 aExp = extractFloatx80Exp( a );
3690 1.1 bjh21 bSig = extractFloatx80Frac( b );
3691 1.1 bjh21 bExp = extractFloatx80Exp( b );
3692 1.1 bjh21 expDiff = aExp - bExp;
3693 1.1 bjh21 if ( 0 < expDiff ) goto aExpBigger;
3694 1.1 bjh21 if ( expDiff < 0 ) goto bExpBigger;
3695 1.1 bjh21 if ( aExp == 0x7FFF ) {
3696 1.1 bjh21 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3697 1.1 bjh21 return propagateFloatx80NaN( a, b );
3698 1.1 bjh21 }
3699 1.1 bjh21 float_raise( float_flag_invalid );
3700 1.1 bjh21 z.low = floatx80_default_nan_low;
3701 1.1 bjh21 z.high = floatx80_default_nan_high;
3702 1.1 bjh21 return z;
3703 1.1 bjh21 }
3704 1.1 bjh21 if ( aExp == 0 ) {
3705 1.1 bjh21 aExp = 1;
3706 1.1 bjh21 bExp = 1;
3707 1.1 bjh21 }
3708 1.1 bjh21 zSig1 = 0;
3709 1.1 bjh21 if ( bSig < aSig ) goto aBigger;
3710 1.1 bjh21 if ( aSig < bSig ) goto bBigger;
3711 1.1 bjh21 return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
3712 1.1 bjh21 bExpBigger:
3713 1.1 bjh21 if ( bExp == 0x7FFF ) {
3714 1.1 bjh21 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3715 1.1 bjh21 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3716 1.1 bjh21 }
3717 1.1 bjh21 if ( aExp == 0 ) ++expDiff;
3718 1.1 bjh21 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3719 1.1 bjh21 bBigger:
3720 1.1 bjh21 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3721 1.1 bjh21 zExp = bExp;
3722 1.1 bjh21 zSign ^= 1;
3723 1.1 bjh21 goto normalizeRoundAndPack;
3724 1.1 bjh21 aExpBigger:
3725 1.1 bjh21 if ( aExp == 0x7FFF ) {
3726 1.1 bjh21 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3727 1.1 bjh21 return a;
3728 1.1 bjh21 }
3729 1.1 bjh21 if ( bExp == 0 ) --expDiff;
3730 1.1 bjh21 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3731 1.1 bjh21 aBigger:
3732 1.1 bjh21 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3733 1.1 bjh21 zExp = aExp;
3734 1.1 bjh21 normalizeRoundAndPack:
3735 1.1 bjh21 return
3736 1.1 bjh21 normalizeRoundAndPackFloatx80(
3737 1.1 bjh21 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3738 1.1 bjh21
3739 1.1 bjh21 }
3740 1.1 bjh21
3741 1.1 bjh21 /*
3742 1.1 bjh21 -------------------------------------------------------------------------------
3743 1.1 bjh21 Returns the result of adding the extended double-precision floating-point
3744 1.1 bjh21 values `a' and `b'. The operation is performed according to the IEC/IEEE
3745 1.1 bjh21 Standard for Binary Floating-Point Arithmetic.
3746 1.1 bjh21 -------------------------------------------------------------------------------
3747 1.1 bjh21 */
3748 1.1 bjh21 floatx80 floatx80_add( floatx80 a, floatx80 b )
3749 1.1 bjh21 {
3750 1.1 bjh21 flag aSign, bSign;
3751 1.1 bjh21
3752 1.1 bjh21 aSign = extractFloatx80Sign( a );
3753 1.1 bjh21 bSign = extractFloatx80Sign( b );
3754 1.1 bjh21 if ( aSign == bSign ) {
3755 1.1 bjh21 return addFloatx80Sigs( a, b, aSign );
3756 1.1 bjh21 }
3757 1.1 bjh21 else {
3758 1.1 bjh21 return subFloatx80Sigs( a, b, aSign );
3759 1.1 bjh21 }
3760 1.1 bjh21
3761 1.1 bjh21 }
3762 1.1 bjh21
3763 1.1 bjh21 /*
3764 1.1 bjh21 -------------------------------------------------------------------------------
3765 1.1 bjh21 Returns the result of subtracting the extended double-precision floating-
3766 1.1 bjh21 point values `a' and `b'. The operation is performed according to the
3767 1.1 bjh21 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3768 1.1 bjh21 -------------------------------------------------------------------------------
3769 1.1 bjh21 */
3770 1.1 bjh21 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3771 1.1 bjh21 {
3772 1.1 bjh21 flag aSign, bSign;
3773 1.1 bjh21
3774 1.1 bjh21 aSign = extractFloatx80Sign( a );
3775 1.1 bjh21 bSign = extractFloatx80Sign( b );
3776 1.1 bjh21 if ( aSign == bSign ) {
3777 1.1 bjh21 return subFloatx80Sigs( a, b, aSign );
3778 1.1 bjh21 }
3779 1.1 bjh21 else {
3780 1.1 bjh21 return addFloatx80Sigs( a, b, aSign );
3781 1.1 bjh21 }
3782 1.1 bjh21
3783 1.1 bjh21 }
3784 1.1 bjh21
3785 1.1 bjh21 /*
3786 1.1 bjh21 -------------------------------------------------------------------------------
3787 1.1 bjh21 Returns the result of multiplying the extended double-precision floating-
3788 1.1 bjh21 point values `a' and `b'. The operation is performed according to the
3789 1.1 bjh21 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3790 1.1 bjh21 -------------------------------------------------------------------------------
3791 1.1 bjh21 */
3792 1.1 bjh21 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3793 1.1 bjh21 {
3794 1.1 bjh21 flag aSign, bSign, zSign;
3795 1.1 bjh21 int32 aExp, bExp, zExp;
3796 1.1 bjh21 bits64 aSig, bSig, zSig0, zSig1;
3797 1.1 bjh21 floatx80 z;
3798 1.1 bjh21
3799 1.1 bjh21 aSig = extractFloatx80Frac( a );
3800 1.1 bjh21 aExp = extractFloatx80Exp( a );
3801 1.1 bjh21 aSign = extractFloatx80Sign( a );
3802 1.1 bjh21 bSig = extractFloatx80Frac( b );
3803 1.1 bjh21 bExp = extractFloatx80Exp( b );
3804 1.1 bjh21 bSign = extractFloatx80Sign( b );
3805 1.1 bjh21 zSign = aSign ^ bSign;
3806 1.1 bjh21 if ( aExp == 0x7FFF ) {
3807 1.1 bjh21 if ( (bits64) ( aSig<<1 )
3808 1.1 bjh21 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3809 1.1 bjh21 return propagateFloatx80NaN( a, b );
3810 1.1 bjh21 }
3811 1.1 bjh21 if ( ( bExp | bSig ) == 0 ) goto invalid;
3812 1.1 bjh21 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3813 1.1 bjh21 }
3814 1.1 bjh21 if ( bExp == 0x7FFF ) {
3815 1.1 bjh21 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3816 1.1 bjh21 if ( ( aExp | aSig ) == 0 ) {
3817 1.1 bjh21 invalid:
3818 1.1 bjh21 float_raise( float_flag_invalid );
3819 1.1 bjh21 z.low = floatx80_default_nan_low;
3820 1.1 bjh21 z.high = floatx80_default_nan_high;
3821 1.1 bjh21 return z;
3822 1.1 bjh21 }
3823 1.1 bjh21 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3824 1.1 bjh21 }
3825 1.1 bjh21 if ( aExp == 0 ) {
3826 1.1 bjh21 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3827 1.1 bjh21 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3828 1.1 bjh21 }
3829 1.1 bjh21 if ( bExp == 0 ) {
3830 1.1 bjh21 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3831 1.1 bjh21 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3832 1.1 bjh21 }
3833 1.1 bjh21 zExp = aExp + bExp - 0x3FFE;
3834 1.1 bjh21 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3835 1.1 bjh21 if ( 0 < (sbits64) zSig0 ) {
3836 1.1 bjh21 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3837 1.1 bjh21 --zExp;
3838 1.1 bjh21 }
3839 1.1 bjh21 return
3840 1.1 bjh21 roundAndPackFloatx80(
3841 1.1 bjh21 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3842 1.1 bjh21
3843 1.1 bjh21 }
3844 1.1 bjh21
3845 1.1 bjh21 /*
3846 1.1 bjh21 -------------------------------------------------------------------------------
3847 1.1 bjh21 Returns the result of dividing the extended double-precision floating-point
3848 1.1 bjh21 value `a' by the corresponding value `b'. The operation is performed
3849 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3850 1.1 bjh21 -------------------------------------------------------------------------------
3851 1.1 bjh21 */
3852 1.1 bjh21 floatx80 floatx80_div( floatx80 a, floatx80 b )
3853 1.1 bjh21 {
3854 1.1 bjh21 flag aSign, bSign, zSign;
3855 1.1 bjh21 int32 aExp, bExp, zExp;
3856 1.1 bjh21 bits64 aSig, bSig, zSig0, zSig1;
3857 1.1 bjh21 bits64 rem0, rem1, rem2, term0, term1, term2;
3858 1.1 bjh21 floatx80 z;
3859 1.1 bjh21
3860 1.1 bjh21 aSig = extractFloatx80Frac( a );
3861 1.1 bjh21 aExp = extractFloatx80Exp( a );
3862 1.1 bjh21 aSign = extractFloatx80Sign( a );
3863 1.1 bjh21 bSig = extractFloatx80Frac( b );
3864 1.1 bjh21 bExp = extractFloatx80Exp( b );
3865 1.1 bjh21 bSign = extractFloatx80Sign( b );
3866 1.1 bjh21 zSign = aSign ^ bSign;
3867 1.1 bjh21 if ( aExp == 0x7FFF ) {
3868 1.1 bjh21 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3869 1.1 bjh21 if ( bExp == 0x7FFF ) {
3870 1.1 bjh21 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3871 1.1 bjh21 goto invalid;
3872 1.1 bjh21 }
3873 1.1 bjh21 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3874 1.1 bjh21 }
3875 1.1 bjh21 if ( bExp == 0x7FFF ) {
3876 1.1 bjh21 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3877 1.1 bjh21 return packFloatx80( zSign, 0, 0 );
3878 1.1 bjh21 }
3879 1.1 bjh21 if ( bExp == 0 ) {
3880 1.1 bjh21 if ( bSig == 0 ) {
3881 1.1 bjh21 if ( ( aExp | aSig ) == 0 ) {
3882 1.1 bjh21 invalid:
3883 1.1 bjh21 float_raise( float_flag_invalid );
3884 1.1 bjh21 z.low = floatx80_default_nan_low;
3885 1.1 bjh21 z.high = floatx80_default_nan_high;
3886 1.1 bjh21 return z;
3887 1.1 bjh21 }
3888 1.1 bjh21 float_raise( float_flag_divbyzero );
3889 1.1 bjh21 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3890 1.1 bjh21 }
3891 1.1 bjh21 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3892 1.1 bjh21 }
3893 1.1 bjh21 if ( aExp == 0 ) {
3894 1.1 bjh21 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3895 1.1 bjh21 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3896 1.1 bjh21 }
3897 1.1 bjh21 zExp = aExp - bExp + 0x3FFE;
3898 1.1 bjh21 rem1 = 0;
3899 1.1 bjh21 if ( bSig <= aSig ) {
3900 1.1 bjh21 shift128Right( aSig, 0, 1, &aSig, &rem1 );
3901 1.1 bjh21 ++zExp;
3902 1.1 bjh21 }
3903 1.1 bjh21 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3904 1.1 bjh21 mul64To128( bSig, zSig0, &term0, &term1 );
3905 1.1 bjh21 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3906 1.1 bjh21 while ( (sbits64) rem0 < 0 ) {
3907 1.1 bjh21 --zSig0;
3908 1.1 bjh21 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3909 1.1 bjh21 }
3910 1.1 bjh21 zSig1 = estimateDiv128To64( rem1, 0, bSig );
3911 1.1 bjh21 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3912 1.1 bjh21 mul64To128( bSig, zSig1, &term1, &term2 );
3913 1.1 bjh21 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3914 1.1 bjh21 while ( (sbits64) rem1 < 0 ) {
3915 1.1 bjh21 --zSig1;
3916 1.1 bjh21 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3917 1.1 bjh21 }
3918 1.1 bjh21 zSig1 |= ( ( rem1 | rem2 ) != 0 );
3919 1.1 bjh21 }
3920 1.1 bjh21 return
3921 1.1 bjh21 roundAndPackFloatx80(
3922 1.1 bjh21 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3923 1.1 bjh21
3924 1.1 bjh21 }
3925 1.1 bjh21
3926 1.1 bjh21 /*
3927 1.1 bjh21 -------------------------------------------------------------------------------
3928 1.1 bjh21 Returns the remainder of the extended double-precision floating-point value
3929 1.1 bjh21 `a' with respect to the corresponding value `b'. The operation is performed
3930 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3931 1.1 bjh21 -------------------------------------------------------------------------------
3932 1.1 bjh21 */
3933 1.1 bjh21 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3934 1.1 bjh21 {
3935 1.1 bjh21 flag aSign, bSign, zSign;
3936 1.1 bjh21 int32 aExp, bExp, expDiff;
3937 1.1 bjh21 bits64 aSig0, aSig1, bSig;
3938 1.1 bjh21 bits64 q, term0, term1, alternateASig0, alternateASig1;
3939 1.1 bjh21 floatx80 z;
3940 1.1 bjh21
3941 1.1 bjh21 aSig0 = extractFloatx80Frac( a );
3942 1.1 bjh21 aExp = extractFloatx80Exp( a );
3943 1.1 bjh21 aSign = extractFloatx80Sign( a );
3944 1.1 bjh21 bSig = extractFloatx80Frac( b );
3945 1.1 bjh21 bExp = extractFloatx80Exp( b );
3946 1.1 bjh21 bSign = extractFloatx80Sign( b );
3947 1.1 bjh21 if ( aExp == 0x7FFF ) {
3948 1.1 bjh21 if ( (bits64) ( aSig0<<1 )
3949 1.1 bjh21 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3950 1.1 bjh21 return propagateFloatx80NaN( a, b );
3951 1.1 bjh21 }
3952 1.1 bjh21 goto invalid;
3953 1.1 bjh21 }
3954 1.1 bjh21 if ( bExp == 0x7FFF ) {
3955 1.1 bjh21 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3956 1.1 bjh21 return a;
3957 1.1 bjh21 }
3958 1.1 bjh21 if ( bExp == 0 ) {
3959 1.1 bjh21 if ( bSig == 0 ) {
3960 1.1 bjh21 invalid:
3961 1.1 bjh21 float_raise( float_flag_invalid );
3962 1.1 bjh21 z.low = floatx80_default_nan_low;
3963 1.1 bjh21 z.high = floatx80_default_nan_high;
3964 1.1 bjh21 return z;
3965 1.1 bjh21 }
3966 1.1 bjh21 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3967 1.1 bjh21 }
3968 1.1 bjh21 if ( aExp == 0 ) {
3969 1.1 bjh21 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3970 1.1 bjh21 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3971 1.1 bjh21 }
3972 1.1 bjh21 bSig |= LIT64( 0x8000000000000000 );
3973 1.1 bjh21 zSign = aSign;
3974 1.1 bjh21 expDiff = aExp - bExp;
3975 1.1 bjh21 aSig1 = 0;
3976 1.1 bjh21 if ( expDiff < 0 ) {
3977 1.1 bjh21 if ( expDiff < -1 ) return a;
3978 1.1 bjh21 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3979 1.1 bjh21 expDiff = 0;
3980 1.1 bjh21 }
3981 1.1 bjh21 q = ( bSig <= aSig0 );
3982 1.1 bjh21 if ( q ) aSig0 -= bSig;
3983 1.1 bjh21 expDiff -= 64;
3984 1.1 bjh21 while ( 0 < expDiff ) {
3985 1.1 bjh21 q = estimateDiv128To64( aSig0, aSig1, bSig );
3986 1.1 bjh21 q = ( 2 < q ) ? q - 2 : 0;
3987 1.1 bjh21 mul64To128( bSig, q, &term0, &term1 );
3988 1.1 bjh21 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3989 1.1 bjh21 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3990 1.1 bjh21 expDiff -= 62;
3991 1.1 bjh21 }
3992 1.1 bjh21 expDiff += 64;
3993 1.1 bjh21 if ( 0 < expDiff ) {
3994 1.1 bjh21 q = estimateDiv128To64( aSig0, aSig1, bSig );
3995 1.1 bjh21 q = ( 2 < q ) ? q - 2 : 0;
3996 1.1 bjh21 q >>= 64 - expDiff;
3997 1.1 bjh21 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3998 1.1 bjh21 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3999 1.1 bjh21 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4000 1.1 bjh21 while ( le128( term0, term1, aSig0, aSig1 ) ) {
4001 1.1 bjh21 ++q;
4002 1.1 bjh21 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4003 1.1 bjh21 }
4004 1.1 bjh21 }
4005 1.1 bjh21 else {
4006 1.1 bjh21 term1 = 0;
4007 1.1 bjh21 term0 = bSig;
4008 1.1 bjh21 }
4009 1.1 bjh21 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4010 1.1 bjh21 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4011 1.1 bjh21 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4012 1.1 bjh21 && ( q & 1 ) )
4013 1.1 bjh21 ) {
4014 1.1 bjh21 aSig0 = alternateASig0;
4015 1.1 bjh21 aSig1 = alternateASig1;
4016 1.1 bjh21 zSign = ! zSign;
4017 1.1 bjh21 }
4018 1.1 bjh21 return
4019 1.1 bjh21 normalizeRoundAndPackFloatx80(
4020 1.1 bjh21 80, zSign, bExp + expDiff, aSig0, aSig1 );
4021 1.1 bjh21
4022 1.1 bjh21 }
4023 1.1 bjh21
4024 1.1 bjh21 /*
4025 1.1 bjh21 -------------------------------------------------------------------------------
4026 1.1 bjh21 Returns the square root of the extended double-precision floating-point
4027 1.1 bjh21 value `a'. The operation is performed according to the IEC/IEEE Standard
4028 1.1 bjh21 for Binary Floating-Point Arithmetic.
4029 1.1 bjh21 -------------------------------------------------------------------------------
4030 1.1 bjh21 */
4031 1.1 bjh21 floatx80 floatx80_sqrt( floatx80 a )
4032 1.1 bjh21 {
4033 1.1 bjh21 flag aSign;
4034 1.1 bjh21 int32 aExp, zExp;
4035 1.1 bjh21 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4036 1.1 bjh21 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4037 1.1 bjh21 floatx80 z;
4038 1.1 bjh21
4039 1.1 bjh21 aSig0 = extractFloatx80Frac( a );
4040 1.1 bjh21 aExp = extractFloatx80Exp( a );
4041 1.1 bjh21 aSign = extractFloatx80Sign( a );
4042 1.1 bjh21 if ( aExp == 0x7FFF ) {
4043 1.1 bjh21 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
4044 1.1 bjh21 if ( ! aSign ) return a;
4045 1.1 bjh21 goto invalid;
4046 1.1 bjh21 }
4047 1.1 bjh21 if ( aSign ) {
4048 1.1 bjh21 if ( ( aExp | aSig0 ) == 0 ) return a;
4049 1.1 bjh21 invalid:
4050 1.1 bjh21 float_raise( float_flag_invalid );
4051 1.1 bjh21 z.low = floatx80_default_nan_low;
4052 1.1 bjh21 z.high = floatx80_default_nan_high;
4053 1.1 bjh21 return z;
4054 1.1 bjh21 }
4055 1.1 bjh21 if ( aExp == 0 ) {
4056 1.1 bjh21 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4057 1.1 bjh21 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4058 1.1 bjh21 }
4059 1.1 bjh21 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4060 1.1 bjh21 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4061 1.1 bjh21 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4062 1.1 bjh21 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4063 1.1 bjh21 doubleZSig0 = zSig0<<1;
4064 1.1 bjh21 mul64To128( zSig0, zSig0, &term0, &term1 );
4065 1.1 bjh21 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4066 1.1 bjh21 while ( (sbits64) rem0 < 0 ) {
4067 1.1 bjh21 --zSig0;
4068 1.1 bjh21 doubleZSig0 -= 2;
4069 1.1 bjh21 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4070 1.1 bjh21 }
4071 1.1 bjh21 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4072 1.1 bjh21 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4073 1.1 bjh21 if ( zSig1 == 0 ) zSig1 = 1;
4074 1.1 bjh21 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4075 1.1 bjh21 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4076 1.1 bjh21 mul64To128( zSig1, zSig1, &term2, &term3 );
4077 1.1 bjh21 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4078 1.1 bjh21 while ( (sbits64) rem1 < 0 ) {
4079 1.1 bjh21 --zSig1;
4080 1.1 bjh21 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4081 1.1 bjh21 term3 |= 1;
4082 1.1 bjh21 term2 |= doubleZSig0;
4083 1.1 bjh21 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4084 1.1 bjh21 }
4085 1.1 bjh21 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4086 1.1 bjh21 }
4087 1.1 bjh21 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4088 1.1 bjh21 zSig0 |= doubleZSig0;
4089 1.1 bjh21 return
4090 1.1 bjh21 roundAndPackFloatx80(
4091 1.1 bjh21 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4092 1.1 bjh21
4093 1.1 bjh21 }
4094 1.1 bjh21
4095 1.1 bjh21 /*
4096 1.1 bjh21 -------------------------------------------------------------------------------
4097 1.1 bjh21 Returns 1 if the extended double-precision floating-point value `a' is
4098 1.1 bjh21 equal to the corresponding value `b', and 0 otherwise. The comparison is
4099 1.1 bjh21 performed according to the IEC/IEEE Standard for Binary Floating-Point
4100 1.1 bjh21 Arithmetic.
4101 1.1 bjh21 -------------------------------------------------------------------------------
4102 1.1 bjh21 */
4103 1.1 bjh21 flag floatx80_eq( floatx80 a, floatx80 b )
4104 1.1 bjh21 {
4105 1.1 bjh21
4106 1.1 bjh21 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4107 1.1 bjh21 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4108 1.1 bjh21 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4109 1.1 bjh21 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4110 1.1 bjh21 ) {
4111 1.1 bjh21 if ( floatx80_is_signaling_nan( a )
4112 1.1 bjh21 || floatx80_is_signaling_nan( b ) ) {
4113 1.1 bjh21 float_raise( float_flag_invalid );
4114 1.1 bjh21 }
4115 1.1 bjh21 return 0;
4116 1.1 bjh21 }
4117 1.1 bjh21 return
4118 1.1 bjh21 ( a.low == b.low )
4119 1.1 bjh21 && ( ( a.high == b.high )
4120 1.1 bjh21 || ( ( a.low == 0 )
4121 1.1 bjh21 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4122 1.1 bjh21 );
4123 1.1 bjh21
4124 1.1 bjh21 }
4125 1.1 bjh21
4126 1.1 bjh21 /*
4127 1.1 bjh21 -------------------------------------------------------------------------------
4128 1.1 bjh21 Returns 1 if the extended double-precision floating-point value `a' is
4129 1.1 bjh21 less than or equal to the corresponding value `b', and 0 otherwise. The
4130 1.1 bjh21 comparison is performed according to the IEC/IEEE Standard for Binary
4131 1.1 bjh21 Floating-Point Arithmetic.
4132 1.1 bjh21 -------------------------------------------------------------------------------
4133 1.1 bjh21 */
4134 1.1 bjh21 flag floatx80_le( floatx80 a, floatx80 b )
4135 1.1 bjh21 {
4136 1.1 bjh21 flag aSign, bSign;
4137 1.1 bjh21
4138 1.1 bjh21 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4139 1.1 bjh21 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4140 1.1 bjh21 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4141 1.1 bjh21 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4142 1.1 bjh21 ) {
4143 1.1 bjh21 float_raise( float_flag_invalid );
4144 1.1 bjh21 return 0;
4145 1.1 bjh21 }
4146 1.1 bjh21 aSign = extractFloatx80Sign( a );
4147 1.1 bjh21 bSign = extractFloatx80Sign( b );
4148 1.1 bjh21 if ( aSign != bSign ) {
4149 1.1 bjh21 return
4150 1.1 bjh21 aSign
4151 1.1 bjh21 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4152 1.1 bjh21 == 0 );
4153 1.1 bjh21 }
4154 1.1 bjh21 return
4155 1.1 bjh21 aSign ? le128( b.high, b.low, a.high, a.low )
4156 1.1 bjh21 : le128( a.high, a.low, b.high, b.low );
4157 1.1 bjh21
4158 1.1 bjh21 }
4159 1.1 bjh21
4160 1.1 bjh21 /*
4161 1.1 bjh21 -------------------------------------------------------------------------------
4162 1.1 bjh21 Returns 1 if the extended double-precision floating-point value `a' is
4163 1.1 bjh21 less than the corresponding value `b', and 0 otherwise. The comparison
4164 1.1 bjh21 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4165 1.1 bjh21 Arithmetic.
4166 1.1 bjh21 -------------------------------------------------------------------------------
4167 1.1 bjh21 */
4168 1.1 bjh21 flag floatx80_lt( floatx80 a, floatx80 b )
4169 1.1 bjh21 {
4170 1.1 bjh21 flag aSign, bSign;
4171 1.1 bjh21
4172 1.1 bjh21 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4173 1.1 bjh21 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4174 1.1 bjh21 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4175 1.1 bjh21 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4176 1.1 bjh21 ) {
4177 1.1 bjh21 float_raise( float_flag_invalid );
4178 1.1 bjh21 return 0;
4179 1.1 bjh21 }
4180 1.1 bjh21 aSign = extractFloatx80Sign( a );
4181 1.1 bjh21 bSign = extractFloatx80Sign( b );
4182 1.1 bjh21 if ( aSign != bSign ) {
4183 1.1 bjh21 return
4184 1.1 bjh21 aSign
4185 1.1 bjh21 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4186 1.1 bjh21 != 0 );
4187 1.1 bjh21 }
4188 1.1 bjh21 return
4189 1.1 bjh21 aSign ? lt128( b.high, b.low, a.high, a.low )
4190 1.1 bjh21 : lt128( a.high, a.low, b.high, b.low );
4191 1.1 bjh21
4192 1.1 bjh21 }
4193 1.1 bjh21
4194 1.1 bjh21 /*
4195 1.1 bjh21 -------------------------------------------------------------------------------
4196 1.1 bjh21 Returns 1 if the extended double-precision floating-point value `a' is equal
4197 1.1 bjh21 to the corresponding value `b', and 0 otherwise. The invalid exception is
4198 1.1 bjh21 raised if either operand is a NaN. Otherwise, the comparison is performed
4199 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4200 1.1 bjh21 -------------------------------------------------------------------------------
4201 1.1 bjh21 */
4202 1.1 bjh21 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4203 1.1 bjh21 {
4204 1.1 bjh21
4205 1.1 bjh21 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4206 1.1 bjh21 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4207 1.1 bjh21 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4208 1.1 bjh21 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4209 1.1 bjh21 ) {
4210 1.1 bjh21 float_raise( float_flag_invalid );
4211 1.1 bjh21 return 0;
4212 1.1 bjh21 }
4213 1.1 bjh21 return
4214 1.1 bjh21 ( a.low == b.low )
4215 1.1 bjh21 && ( ( a.high == b.high )
4216 1.1 bjh21 || ( ( a.low == 0 )
4217 1.1 bjh21 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4218 1.1 bjh21 );
4219 1.1 bjh21
4220 1.1 bjh21 }
4221 1.1 bjh21
4222 1.1 bjh21 /*
4223 1.1 bjh21 -------------------------------------------------------------------------------
4224 1.1 bjh21 Returns 1 if the extended double-precision floating-point value `a' is less
4225 1.1 bjh21 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4226 1.1 bjh21 do not cause an exception. Otherwise, the comparison is performed according
4227 1.1 bjh21 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4228 1.1 bjh21 -------------------------------------------------------------------------------
4229 1.1 bjh21 */
4230 1.1 bjh21 flag floatx80_le_quiet( floatx80 a, floatx80 b )
4231 1.1 bjh21 {
4232 1.1 bjh21 flag aSign, bSign;
4233 1.1 bjh21
4234 1.1 bjh21 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4235 1.1 bjh21 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4236 1.1 bjh21 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4237 1.1 bjh21 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4238 1.1 bjh21 ) {
4239 1.1 bjh21 if ( floatx80_is_signaling_nan( a )
4240 1.1 bjh21 || floatx80_is_signaling_nan( b ) ) {
4241 1.1 bjh21 float_raise( float_flag_invalid );
4242 1.1 bjh21 }
4243 1.1 bjh21 return 0;
4244 1.1 bjh21 }
4245 1.1 bjh21 aSign = extractFloatx80Sign( a );
4246 1.1 bjh21 bSign = extractFloatx80Sign( b );
4247 1.1 bjh21 if ( aSign != bSign ) {
4248 1.1 bjh21 return
4249 1.1 bjh21 aSign
4250 1.1 bjh21 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4251 1.1 bjh21 == 0 );
4252 1.1 bjh21 }
4253 1.1 bjh21 return
4254 1.1 bjh21 aSign ? le128( b.high, b.low, a.high, a.low )
4255 1.1 bjh21 : le128( a.high, a.low, b.high, b.low );
4256 1.1 bjh21
4257 1.1 bjh21 }
4258 1.1 bjh21
4259 1.1 bjh21 /*
4260 1.1 bjh21 -------------------------------------------------------------------------------
4261 1.1 bjh21 Returns 1 if the extended double-precision floating-point value `a' is less
4262 1.1 bjh21 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4263 1.1 bjh21 an exception. Otherwise, the comparison is performed according to the
4264 1.1 bjh21 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4265 1.1 bjh21 -------------------------------------------------------------------------------
4266 1.1 bjh21 */
4267 1.1 bjh21 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4268 1.1 bjh21 {
4269 1.1 bjh21 flag aSign, bSign;
4270 1.1 bjh21
4271 1.1 bjh21 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4272 1.1 bjh21 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4273 1.1 bjh21 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4274 1.1 bjh21 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4275 1.1 bjh21 ) {
4276 1.1 bjh21 if ( floatx80_is_signaling_nan( a )
4277 1.1 bjh21 || floatx80_is_signaling_nan( b ) ) {
4278 1.1 bjh21 float_raise( float_flag_invalid );
4279 1.1 bjh21 }
4280 1.1 bjh21 return 0;
4281 1.1 bjh21 }
4282 1.1 bjh21 aSign = extractFloatx80Sign( a );
4283 1.1 bjh21 bSign = extractFloatx80Sign( b );
4284 1.1 bjh21 if ( aSign != bSign ) {
4285 1.1 bjh21 return
4286 1.1 bjh21 aSign
4287 1.1 bjh21 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4288 1.1 bjh21 != 0 );
4289 1.1 bjh21 }
4290 1.1 bjh21 return
4291 1.1 bjh21 aSign ? lt128( b.high, b.low, a.high, a.low )
4292 1.1 bjh21 : lt128( a.high, a.low, b.high, b.low );
4293 1.1 bjh21
4294 1.1 bjh21 }
4295 1.1 bjh21
4296 1.1 bjh21 #endif
4297 1.1 bjh21
4298 1.1 bjh21 #ifdef FLOAT128
4299 1.1 bjh21
4300 1.1 bjh21 /*
4301 1.1 bjh21 -------------------------------------------------------------------------------
4302 1.1 bjh21 Returns the result of converting the quadruple-precision floating-point
4303 1.1 bjh21 value `a' to the 32-bit two's complement integer format. The conversion
4304 1.1 bjh21 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4305 1.1 bjh21 Arithmetic---which means in particular that the conversion is rounded
4306 1.1 bjh21 according to the current rounding mode. If `a' is a NaN, the largest
4307 1.1 bjh21 positive integer is returned. Otherwise, if the conversion overflows, the
4308 1.1 bjh21 largest integer with the same sign as `a' is returned.
4309 1.1 bjh21 -------------------------------------------------------------------------------
4310 1.1 bjh21 */
4311 1.1 bjh21 int32 float128_to_int32( float128 a )
4312 1.1 bjh21 {
4313 1.1 bjh21 flag aSign;
4314 1.1 bjh21 int32 aExp, shiftCount;
4315 1.1 bjh21 bits64 aSig0, aSig1;
4316 1.1 bjh21
4317 1.1 bjh21 aSig1 = extractFloat128Frac1( a );
4318 1.1 bjh21 aSig0 = extractFloat128Frac0( a );
4319 1.1 bjh21 aExp = extractFloat128Exp( a );
4320 1.1 bjh21 aSign = extractFloat128Sign( a );
4321 1.1 bjh21 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4322 1.1 bjh21 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4323 1.1 bjh21 aSig0 |= ( aSig1 != 0 );
4324 1.1 bjh21 shiftCount = 0x4028 - aExp;
4325 1.1 bjh21 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4326 1.1 bjh21 return roundAndPackInt32( aSign, aSig0 );
4327 1.1 bjh21
4328 1.1 bjh21 }
4329 1.1 bjh21
4330 1.1 bjh21 /*
4331 1.1 bjh21 -------------------------------------------------------------------------------
4332 1.1 bjh21 Returns the result of converting the quadruple-precision floating-point
4333 1.1 bjh21 value `a' to the 32-bit two's complement integer format. The conversion
4334 1.1 bjh21 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4335 1.1 bjh21 Arithmetic, except that the conversion is always rounded toward zero. If
4336 1.1 bjh21 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4337 1.1 bjh21 conversion overflows, the largest integer with the same sign as `a' is
4338 1.1 bjh21 returned.
4339 1.1 bjh21 -------------------------------------------------------------------------------
4340 1.1 bjh21 */
4341 1.1 bjh21 int32 float128_to_int32_round_to_zero( float128 a )
4342 1.1 bjh21 {
4343 1.1 bjh21 flag aSign;
4344 1.1 bjh21 int32 aExp, shiftCount;
4345 1.1 bjh21 bits64 aSig0, aSig1, savedASig;
4346 1.1 bjh21 int32 z;
4347 1.1 bjh21
4348 1.1 bjh21 aSig1 = extractFloat128Frac1( a );
4349 1.1 bjh21 aSig0 = extractFloat128Frac0( a );
4350 1.1 bjh21 aExp = extractFloat128Exp( a );
4351 1.1 bjh21 aSign = extractFloat128Sign( a );
4352 1.1 bjh21 aSig0 |= ( aSig1 != 0 );
4353 1.1 bjh21 if ( 0x401E < aExp ) {
4354 1.1 bjh21 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4355 1.1 bjh21 goto invalid;
4356 1.1 bjh21 }
4357 1.1 bjh21 else if ( aExp < 0x3FFF ) {
4358 1.1 bjh21 if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact;
4359 1.1 bjh21 return 0;
4360 1.1 bjh21 }
4361 1.1 bjh21 aSig0 |= LIT64( 0x0001000000000000 );
4362 1.1 bjh21 shiftCount = 0x402F - aExp;
4363 1.1 bjh21 savedASig = aSig0;
4364 1.1 bjh21 aSig0 >>= shiftCount;
4365 1.1 bjh21 z = aSig0;
4366 1.1 bjh21 if ( aSign ) z = - z;
4367 1.1 bjh21 if ( ( z < 0 ) ^ aSign ) {
4368 1.1 bjh21 invalid:
4369 1.1 bjh21 float_raise( float_flag_invalid );
4370 1.1 bjh21 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4371 1.1 bjh21 }
4372 1.1 bjh21 if ( ( aSig0<<shiftCount ) != savedASig ) {
4373 1.1 bjh21 float_exception_flags |= float_flag_inexact;
4374 1.1 bjh21 }
4375 1.1 bjh21 return z;
4376 1.1 bjh21
4377 1.1 bjh21 }
4378 1.1 bjh21
4379 1.1 bjh21 /*
4380 1.1 bjh21 -------------------------------------------------------------------------------
4381 1.1 bjh21 Returns the result of converting the quadruple-precision floating-point
4382 1.1 bjh21 value `a' to the 64-bit two's complement integer format. The conversion
4383 1.1 bjh21 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4384 1.1 bjh21 Arithmetic---which means in particular that the conversion is rounded
4385 1.1 bjh21 according to the current rounding mode. If `a' is a NaN, the largest
4386 1.1 bjh21 positive integer is returned. Otherwise, if the conversion overflows, the
4387 1.1 bjh21 largest integer with the same sign as `a' is returned.
4388 1.1 bjh21 -------------------------------------------------------------------------------
4389 1.1 bjh21 */
4390 1.1 bjh21 int64 float128_to_int64( float128 a )
4391 1.1 bjh21 {
4392 1.1 bjh21 flag aSign;
4393 1.1 bjh21 int32 aExp, shiftCount;
4394 1.1 bjh21 bits64 aSig0, aSig1;
4395 1.1 bjh21
4396 1.1 bjh21 aSig1 = extractFloat128Frac1( a );
4397 1.1 bjh21 aSig0 = extractFloat128Frac0( a );
4398 1.1 bjh21 aExp = extractFloat128Exp( a );
4399 1.1 bjh21 aSign = extractFloat128Sign( a );
4400 1.1 bjh21 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4401 1.1 bjh21 shiftCount = 0x402F - aExp;
4402 1.1 bjh21 if ( shiftCount <= 0 ) {
4403 1.1 bjh21 if ( 0x403E < aExp ) {
4404 1.1 bjh21 float_raise( float_flag_invalid );
4405 1.1 bjh21 if ( ! aSign
4406 1.1 bjh21 || ( ( aExp == 0x7FFF )
4407 1.1 bjh21 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4408 1.1 bjh21 )
4409 1.1 bjh21 ) {
4410 1.1 bjh21 return LIT64( 0x7FFFFFFFFFFFFFFF );
4411 1.1 bjh21 }
4412 1.1 bjh21 return (sbits64) LIT64( 0x8000000000000000 );
4413 1.1 bjh21 }
4414 1.1 bjh21 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4415 1.1 bjh21 }
4416 1.1 bjh21 else {
4417 1.1 bjh21 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4418 1.1 bjh21 }
4419 1.1 bjh21 return roundAndPackInt64( aSign, aSig0, aSig1 );
4420 1.1 bjh21
4421 1.1 bjh21 }
4422 1.1 bjh21
4423 1.1 bjh21 /*
4424 1.1 bjh21 -------------------------------------------------------------------------------
4425 1.1 bjh21 Returns the result of converting the quadruple-precision floating-point
4426 1.1 bjh21 value `a' to the 64-bit two's complement integer format. The conversion
4427 1.1 bjh21 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4428 1.1 bjh21 Arithmetic, except that the conversion is always rounded toward zero.
4429 1.1 bjh21 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4430 1.1 bjh21 the conversion overflows, the largest integer with the same sign as `a' is
4431 1.1 bjh21 returned.
4432 1.1 bjh21 -------------------------------------------------------------------------------
4433 1.1 bjh21 */
4434 1.1 bjh21 int64 float128_to_int64_round_to_zero( float128 a )
4435 1.1 bjh21 {
4436 1.1 bjh21 flag aSign;
4437 1.1 bjh21 int32 aExp, shiftCount;
4438 1.1 bjh21 bits64 aSig0, aSig1;
4439 1.1 bjh21 int64 z;
4440 1.1 bjh21
4441 1.1 bjh21 aSig1 = extractFloat128Frac1( a );
4442 1.1 bjh21 aSig0 = extractFloat128Frac0( a );
4443 1.1 bjh21 aExp = extractFloat128Exp( a );
4444 1.1 bjh21 aSign = extractFloat128Sign( a );
4445 1.1 bjh21 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4446 1.1 bjh21 shiftCount = aExp - 0x402F;
4447 1.1 bjh21 if ( 0 < shiftCount ) {
4448 1.1 bjh21 if ( 0x403E <= aExp ) {
4449 1.1 bjh21 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4450 1.1 bjh21 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4451 1.1 bjh21 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4452 1.1 bjh21 if ( aSig1 ) float_exception_flags |= float_flag_inexact;
4453 1.1 bjh21 }
4454 1.1 bjh21 else {
4455 1.1 bjh21 float_raise( float_flag_invalid );
4456 1.1 bjh21 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4457 1.1 bjh21 return LIT64( 0x7FFFFFFFFFFFFFFF );
4458 1.1 bjh21 }
4459 1.1 bjh21 }
4460 1.1 bjh21 return (sbits64) LIT64( 0x8000000000000000 );
4461 1.1 bjh21 }
4462 1.1 bjh21 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4463 1.1 bjh21 if ( (bits64) ( aSig1<<shiftCount ) ) {
4464 1.1 bjh21 float_exception_flags |= float_flag_inexact;
4465 1.1 bjh21 }
4466 1.1 bjh21 }
4467 1.1 bjh21 else {
4468 1.1 bjh21 if ( aExp < 0x3FFF ) {
4469 1.1 bjh21 if ( aExp | aSig0 | aSig1 ) {
4470 1.1 bjh21 float_exception_flags |= float_flag_inexact;
4471 1.1 bjh21 }
4472 1.1 bjh21 return 0;
4473 1.1 bjh21 }
4474 1.1 bjh21 z = aSig0>>( - shiftCount );
4475 1.1 bjh21 if ( aSig1
4476 1.1 bjh21 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4477 1.1 bjh21 float_exception_flags |= float_flag_inexact;
4478 1.1 bjh21 }
4479 1.1 bjh21 }
4480 1.1 bjh21 if ( aSign ) z = - z;
4481 1.1 bjh21 return z;
4482 1.1 bjh21
4483 1.1 bjh21 }
4484 1.1 bjh21
4485 1.8 matt #if (defined(SOFTFLOATSPARC64_FOR_GCC) || defined(SOFTFLOAT_FOR_GCC)) \
4486 1.8 matt && defined(SOFTFLOAT_NEED_FIXUNS)
4487 1.1 bjh21 /*
4488 1.4 martin * just like above - but do not care for overflow of signed results
4489 1.4 martin */
4490 1.4 martin uint64 float128_to_uint64_round_to_zero( float128 a )
4491 1.4 martin {
4492 1.4 martin flag aSign;
4493 1.4 martin int32 aExp, shiftCount;
4494 1.4 martin bits64 aSig0, aSig1;
4495 1.4 martin uint64 z;
4496 1.4 martin
4497 1.4 martin aSig1 = extractFloat128Frac1( a );
4498 1.4 martin aSig0 = extractFloat128Frac0( a );
4499 1.4 martin aExp = extractFloat128Exp( a );
4500 1.4 martin aSign = extractFloat128Sign( a );
4501 1.4 martin if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4502 1.4 martin shiftCount = aExp - 0x402F;
4503 1.4 martin if ( 0 < shiftCount ) {
4504 1.4 martin if ( 0x403F <= aExp ) {
4505 1.4 martin aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4506 1.4 martin if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4507 1.4 martin && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4508 1.4 martin if ( aSig1 ) float_exception_flags |= float_flag_inexact;
4509 1.4 martin }
4510 1.4 martin else {
4511 1.4 martin float_raise( float_flag_invalid );
4512 1.4 martin }
4513 1.5 martin return LIT64( 0xFFFFFFFFFFFFFFFF );
4514 1.4 martin }
4515 1.4 martin z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4516 1.4 martin if ( (bits64) ( aSig1<<shiftCount ) ) {
4517 1.4 martin float_exception_flags |= float_flag_inexact;
4518 1.4 martin }
4519 1.4 martin }
4520 1.4 martin else {
4521 1.4 martin if ( aExp < 0x3FFF ) {
4522 1.4 martin if ( aExp | aSig0 | aSig1 ) {
4523 1.4 martin float_exception_flags |= float_flag_inexact;
4524 1.4 martin }
4525 1.4 martin return 0;
4526 1.4 martin }
4527 1.4 martin z = aSig0>>( - shiftCount );
4528 1.4 martin if (aSig1 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4529 1.4 martin float_exception_flags |= float_flag_inexact;
4530 1.4 martin }
4531 1.4 martin }
4532 1.4 martin if ( aSign ) z = - z;
4533 1.4 martin return z;
4534 1.4 martin
4535 1.4 martin }
4536 1.8 matt #endif /* (SOFTFLOATSPARC64_FOR_GCC || SOFTFLOAT_FOR_GCC) && SOFTFLOAT_NEED_FIXUNS */
4537 1.4 martin
4538 1.4 martin /*
4539 1.1 bjh21 -------------------------------------------------------------------------------
4540 1.1 bjh21 Returns the result of converting the quadruple-precision floating-point
4541 1.1 bjh21 value `a' to the single-precision floating-point format. The conversion
4542 1.1 bjh21 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4543 1.1 bjh21 Arithmetic.
4544 1.1 bjh21 -------------------------------------------------------------------------------
4545 1.1 bjh21 */
4546 1.1 bjh21 float32 float128_to_float32( float128 a )
4547 1.1 bjh21 {
4548 1.1 bjh21 flag aSign;
4549 1.1 bjh21 int32 aExp;
4550 1.1 bjh21 bits64 aSig0, aSig1;
4551 1.1 bjh21 bits32 zSig;
4552 1.1 bjh21
4553 1.1 bjh21 aSig1 = extractFloat128Frac1( a );
4554 1.1 bjh21 aSig0 = extractFloat128Frac0( a );
4555 1.1 bjh21 aExp = extractFloat128Exp( a );
4556 1.1 bjh21 aSign = extractFloat128Sign( a );
4557 1.1 bjh21 if ( aExp == 0x7FFF ) {
4558 1.1 bjh21 if ( aSig0 | aSig1 ) {
4559 1.1 bjh21 return commonNaNToFloat32( float128ToCommonNaN( a ) );
4560 1.1 bjh21 }
4561 1.1 bjh21 return packFloat32( aSign, 0xFF, 0 );
4562 1.1 bjh21 }
4563 1.1 bjh21 aSig0 |= ( aSig1 != 0 );
4564 1.1 bjh21 shift64RightJamming( aSig0, 18, &aSig0 );
4565 1.1 bjh21 zSig = aSig0;
4566 1.1 bjh21 if ( aExp || zSig ) {
4567 1.1 bjh21 zSig |= 0x40000000;
4568 1.1 bjh21 aExp -= 0x3F81;
4569 1.1 bjh21 }
4570 1.1 bjh21 return roundAndPackFloat32( aSign, aExp, zSig );
4571 1.1 bjh21
4572 1.1 bjh21 }
4573 1.1 bjh21
4574 1.1 bjh21 /*
4575 1.1 bjh21 -------------------------------------------------------------------------------
4576 1.1 bjh21 Returns the result of converting the quadruple-precision floating-point
4577 1.1 bjh21 value `a' to the double-precision floating-point format. The conversion
4578 1.1 bjh21 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4579 1.1 bjh21 Arithmetic.
4580 1.1 bjh21 -------------------------------------------------------------------------------
4581 1.1 bjh21 */
4582 1.1 bjh21 float64 float128_to_float64( float128 a )
4583 1.1 bjh21 {
4584 1.1 bjh21 flag aSign;
4585 1.1 bjh21 int32 aExp;
4586 1.1 bjh21 bits64 aSig0, aSig1;
4587 1.1 bjh21
4588 1.1 bjh21 aSig1 = extractFloat128Frac1( a );
4589 1.1 bjh21 aSig0 = extractFloat128Frac0( a );
4590 1.1 bjh21 aExp = extractFloat128Exp( a );
4591 1.1 bjh21 aSign = extractFloat128Sign( a );
4592 1.1 bjh21 if ( aExp == 0x7FFF ) {
4593 1.1 bjh21 if ( aSig0 | aSig1 ) {
4594 1.1 bjh21 return commonNaNToFloat64( float128ToCommonNaN( a ) );
4595 1.1 bjh21 }
4596 1.1 bjh21 return packFloat64( aSign, 0x7FF, 0 );
4597 1.1 bjh21 }
4598 1.1 bjh21 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4599 1.1 bjh21 aSig0 |= ( aSig1 != 0 );
4600 1.1 bjh21 if ( aExp || aSig0 ) {
4601 1.1 bjh21 aSig0 |= LIT64( 0x4000000000000000 );
4602 1.1 bjh21 aExp -= 0x3C01;
4603 1.1 bjh21 }
4604 1.1 bjh21 return roundAndPackFloat64( aSign, aExp, aSig0 );
4605 1.1 bjh21
4606 1.1 bjh21 }
4607 1.1 bjh21
4608 1.1 bjh21 #ifdef FLOATX80
4609 1.1 bjh21
4610 1.1 bjh21 /*
4611 1.1 bjh21 -------------------------------------------------------------------------------
4612 1.1 bjh21 Returns the result of converting the quadruple-precision floating-point
4613 1.1 bjh21 value `a' to the extended double-precision floating-point format. The
4614 1.1 bjh21 conversion is performed according to the IEC/IEEE Standard for Binary
4615 1.1 bjh21 Floating-Point Arithmetic.
4616 1.1 bjh21 -------------------------------------------------------------------------------
4617 1.1 bjh21 */
4618 1.1 bjh21 floatx80 float128_to_floatx80( float128 a )
4619 1.1 bjh21 {
4620 1.1 bjh21 flag aSign;
4621 1.1 bjh21 int32 aExp;
4622 1.1 bjh21 bits64 aSig0, aSig1;
4623 1.1 bjh21
4624 1.1 bjh21 aSig1 = extractFloat128Frac1( a );
4625 1.1 bjh21 aSig0 = extractFloat128Frac0( a );
4626 1.1 bjh21 aExp = extractFloat128Exp( a );
4627 1.1 bjh21 aSign = extractFloat128Sign( a );
4628 1.1 bjh21 if ( aExp == 0x7FFF ) {
4629 1.1 bjh21 if ( aSig0 | aSig1 ) {
4630 1.1 bjh21 return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4631 1.1 bjh21 }
4632 1.1 bjh21 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4633 1.1 bjh21 }
4634 1.1 bjh21 if ( aExp == 0 ) {
4635 1.1 bjh21 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4636 1.1 bjh21 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4637 1.1 bjh21 }
4638 1.1 bjh21 else {
4639 1.1 bjh21 aSig0 |= LIT64( 0x0001000000000000 );
4640 1.1 bjh21 }
4641 1.1 bjh21 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4642 1.1 bjh21 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4643 1.1 bjh21
4644 1.1 bjh21 }
4645 1.1 bjh21
4646 1.1 bjh21 #endif
4647 1.1 bjh21
4648 1.1 bjh21 /*
4649 1.1 bjh21 -------------------------------------------------------------------------------
4650 1.1 bjh21 Rounds the quadruple-precision floating-point value `a' to an integer, and
4651 1.1 bjh21 returns the result as a quadruple-precision floating-point value. The
4652 1.1 bjh21 operation is performed according to the IEC/IEEE Standard for Binary
4653 1.1 bjh21 Floating-Point Arithmetic.
4654 1.1 bjh21 -------------------------------------------------------------------------------
4655 1.1 bjh21 */
4656 1.1 bjh21 float128 float128_round_to_int( float128 a )
4657 1.1 bjh21 {
4658 1.1 bjh21 flag aSign;
4659 1.1 bjh21 int32 aExp;
4660 1.1 bjh21 bits64 lastBitMask, roundBitsMask;
4661 1.1 bjh21 int8 roundingMode;
4662 1.1 bjh21 float128 z;
4663 1.1 bjh21
4664 1.1 bjh21 aExp = extractFloat128Exp( a );
4665 1.1 bjh21 if ( 0x402F <= aExp ) {
4666 1.1 bjh21 if ( 0x406F <= aExp ) {
4667 1.1 bjh21 if ( ( aExp == 0x7FFF )
4668 1.1 bjh21 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4669 1.1 bjh21 ) {
4670 1.1 bjh21 return propagateFloat128NaN( a, a );
4671 1.1 bjh21 }
4672 1.1 bjh21 return a;
4673 1.1 bjh21 }
4674 1.1 bjh21 lastBitMask = 1;
4675 1.1 bjh21 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4676 1.1 bjh21 roundBitsMask = lastBitMask - 1;
4677 1.1 bjh21 z = a;
4678 1.1 bjh21 roundingMode = float_rounding_mode;
4679 1.1 bjh21 if ( roundingMode == float_round_nearest_even ) {
4680 1.1 bjh21 if ( lastBitMask ) {
4681 1.1 bjh21 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4682 1.1 bjh21 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4683 1.1 bjh21 }
4684 1.1 bjh21 else {
4685 1.1 bjh21 if ( (sbits64) z.low < 0 ) {
4686 1.1 bjh21 ++z.high;
4687 1.1 bjh21 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4688 1.1 bjh21 }
4689 1.1 bjh21 }
4690 1.1 bjh21 }
4691 1.1 bjh21 else if ( roundingMode != float_round_to_zero ) {
4692 1.1 bjh21 if ( extractFloat128Sign( z )
4693 1.1 bjh21 ^ ( roundingMode == float_round_up ) ) {
4694 1.1 bjh21 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4695 1.1 bjh21 }
4696 1.1 bjh21 }
4697 1.1 bjh21 z.low &= ~ roundBitsMask;
4698 1.1 bjh21 }
4699 1.1 bjh21 else {
4700 1.1 bjh21 if ( aExp < 0x3FFF ) {
4701 1.1 bjh21 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4702 1.1 bjh21 float_exception_flags |= float_flag_inexact;
4703 1.1 bjh21 aSign = extractFloat128Sign( a );
4704 1.1 bjh21 switch ( float_rounding_mode ) {
4705 1.1 bjh21 case float_round_nearest_even:
4706 1.1 bjh21 if ( ( aExp == 0x3FFE )
4707 1.1 bjh21 && ( extractFloat128Frac0( a )
4708 1.1 bjh21 | extractFloat128Frac1( a ) )
4709 1.1 bjh21 ) {
4710 1.1 bjh21 return packFloat128( aSign, 0x3FFF, 0, 0 );
4711 1.1 bjh21 }
4712 1.1 bjh21 break;
4713 1.1 bjh21 case float_round_to_zero:
4714 1.1 bjh21 break;
4715 1.1 bjh21 case float_round_down:
4716 1.1 bjh21 return
4717 1.1 bjh21 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4718 1.1 bjh21 : packFloat128( 0, 0, 0, 0 );
4719 1.1 bjh21 case float_round_up:
4720 1.1 bjh21 return
4721 1.1 bjh21 aSign ? packFloat128( 1, 0, 0, 0 )
4722 1.1 bjh21 : packFloat128( 0, 0x3FFF, 0, 0 );
4723 1.1 bjh21 }
4724 1.1 bjh21 return packFloat128( aSign, 0, 0, 0 );
4725 1.1 bjh21 }
4726 1.1 bjh21 lastBitMask = 1;
4727 1.1 bjh21 lastBitMask <<= 0x402F - aExp;
4728 1.1 bjh21 roundBitsMask = lastBitMask - 1;
4729 1.1 bjh21 z.low = 0;
4730 1.1 bjh21 z.high = a.high;
4731 1.1 bjh21 roundingMode = float_rounding_mode;
4732 1.1 bjh21 if ( roundingMode == float_round_nearest_even ) {
4733 1.1 bjh21 z.high += lastBitMask>>1;
4734 1.1 bjh21 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4735 1.1 bjh21 z.high &= ~ lastBitMask;
4736 1.1 bjh21 }
4737 1.1 bjh21 }
4738 1.1 bjh21 else if ( roundingMode != float_round_to_zero ) {
4739 1.1 bjh21 if ( extractFloat128Sign( z )
4740 1.1 bjh21 ^ ( roundingMode == float_round_up ) ) {
4741 1.1 bjh21 z.high |= ( a.low != 0 );
4742 1.1 bjh21 z.high += roundBitsMask;
4743 1.1 bjh21 }
4744 1.1 bjh21 }
4745 1.1 bjh21 z.high &= ~ roundBitsMask;
4746 1.1 bjh21 }
4747 1.1 bjh21 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4748 1.1 bjh21 float_exception_flags |= float_flag_inexact;
4749 1.1 bjh21 }
4750 1.1 bjh21 return z;
4751 1.1 bjh21
4752 1.1 bjh21 }
4753 1.1 bjh21
4754 1.1 bjh21 /*
4755 1.1 bjh21 -------------------------------------------------------------------------------
4756 1.1 bjh21 Returns the result of adding the absolute values of the quadruple-precision
4757 1.1 bjh21 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4758 1.1 bjh21 before being returned. `zSign' is ignored if the result is a NaN.
4759 1.1 bjh21 The addition is performed according to the IEC/IEEE Standard for Binary
4760 1.1 bjh21 Floating-Point Arithmetic.
4761 1.1 bjh21 -------------------------------------------------------------------------------
4762 1.1 bjh21 */
4763 1.1 bjh21 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4764 1.1 bjh21 {
4765 1.1 bjh21 int32 aExp, bExp, zExp;
4766 1.1 bjh21 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4767 1.1 bjh21 int32 expDiff;
4768 1.1 bjh21
4769 1.1 bjh21 aSig1 = extractFloat128Frac1( a );
4770 1.1 bjh21 aSig0 = extractFloat128Frac0( a );
4771 1.1 bjh21 aExp = extractFloat128Exp( a );
4772 1.1 bjh21 bSig1 = extractFloat128Frac1( b );
4773 1.1 bjh21 bSig0 = extractFloat128Frac0( b );
4774 1.1 bjh21 bExp = extractFloat128Exp( b );
4775 1.1 bjh21 expDiff = aExp - bExp;
4776 1.1 bjh21 if ( 0 < expDiff ) {
4777 1.1 bjh21 if ( aExp == 0x7FFF ) {
4778 1.1 bjh21 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4779 1.1 bjh21 return a;
4780 1.1 bjh21 }
4781 1.1 bjh21 if ( bExp == 0 ) {
4782 1.1 bjh21 --expDiff;
4783 1.1 bjh21 }
4784 1.1 bjh21 else {
4785 1.1 bjh21 bSig0 |= LIT64( 0x0001000000000000 );
4786 1.1 bjh21 }
4787 1.1 bjh21 shift128ExtraRightJamming(
4788 1.1 bjh21 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4789 1.1 bjh21 zExp = aExp;
4790 1.1 bjh21 }
4791 1.1 bjh21 else if ( expDiff < 0 ) {
4792 1.1 bjh21 if ( bExp == 0x7FFF ) {
4793 1.1 bjh21 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4794 1.1 bjh21 return packFloat128( zSign, 0x7FFF, 0, 0 );
4795 1.1 bjh21 }
4796 1.1 bjh21 if ( aExp == 0 ) {
4797 1.1 bjh21 ++expDiff;
4798 1.1 bjh21 }
4799 1.1 bjh21 else {
4800 1.1 bjh21 aSig0 |= LIT64( 0x0001000000000000 );
4801 1.1 bjh21 }
4802 1.1 bjh21 shift128ExtraRightJamming(
4803 1.1 bjh21 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4804 1.1 bjh21 zExp = bExp;
4805 1.1 bjh21 }
4806 1.1 bjh21 else {
4807 1.1 bjh21 if ( aExp == 0x7FFF ) {
4808 1.1 bjh21 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4809 1.1 bjh21 return propagateFloat128NaN( a, b );
4810 1.1 bjh21 }
4811 1.1 bjh21 return a;
4812 1.1 bjh21 }
4813 1.1 bjh21 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4814 1.1 bjh21 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4815 1.1 bjh21 zSig2 = 0;
4816 1.1 bjh21 zSig0 |= LIT64( 0x0002000000000000 );
4817 1.1 bjh21 zExp = aExp;
4818 1.1 bjh21 goto shiftRight1;
4819 1.1 bjh21 }
4820 1.1 bjh21 aSig0 |= LIT64( 0x0001000000000000 );
4821 1.1 bjh21 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4822 1.1 bjh21 --zExp;
4823 1.1 bjh21 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4824 1.1 bjh21 ++zExp;
4825 1.1 bjh21 shiftRight1:
4826 1.1 bjh21 shift128ExtraRightJamming(
4827 1.1 bjh21 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4828 1.1 bjh21 roundAndPack:
4829 1.1 bjh21 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4830 1.1 bjh21
4831 1.1 bjh21 }
4832 1.1 bjh21
4833 1.1 bjh21 /*
4834 1.1 bjh21 -------------------------------------------------------------------------------
4835 1.1 bjh21 Returns the result of subtracting the absolute values of the quadruple-
4836 1.1 bjh21 precision floating-point values `a' and `b'. If `zSign' is 1, the
4837 1.1 bjh21 difference is negated before being returned. `zSign' is ignored if the
4838 1.1 bjh21 result is a NaN. The subtraction is performed according to the IEC/IEEE
4839 1.1 bjh21 Standard for Binary Floating-Point Arithmetic.
4840 1.1 bjh21 -------------------------------------------------------------------------------
4841 1.1 bjh21 */
4842 1.1 bjh21 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4843 1.1 bjh21 {
4844 1.1 bjh21 int32 aExp, bExp, zExp;
4845 1.1 bjh21 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4846 1.1 bjh21 int32 expDiff;
4847 1.1 bjh21 float128 z;
4848 1.1 bjh21
4849 1.1 bjh21 aSig1 = extractFloat128Frac1( a );
4850 1.1 bjh21 aSig0 = extractFloat128Frac0( a );
4851 1.1 bjh21 aExp = extractFloat128Exp( a );
4852 1.1 bjh21 bSig1 = extractFloat128Frac1( b );
4853 1.1 bjh21 bSig0 = extractFloat128Frac0( b );
4854 1.1 bjh21 bExp = extractFloat128Exp( b );
4855 1.1 bjh21 expDiff = aExp - bExp;
4856 1.1 bjh21 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4857 1.1 bjh21 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4858 1.1 bjh21 if ( 0 < expDiff ) goto aExpBigger;
4859 1.1 bjh21 if ( expDiff < 0 ) goto bExpBigger;
4860 1.1 bjh21 if ( aExp == 0x7FFF ) {
4861 1.1 bjh21 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4862 1.1 bjh21 return propagateFloat128NaN( a, b );
4863 1.1 bjh21 }
4864 1.1 bjh21 float_raise( float_flag_invalid );
4865 1.1 bjh21 z.low = float128_default_nan_low;
4866 1.1 bjh21 z.high = float128_default_nan_high;
4867 1.1 bjh21 return z;
4868 1.1 bjh21 }
4869 1.1 bjh21 if ( aExp == 0 ) {
4870 1.1 bjh21 aExp = 1;
4871 1.1 bjh21 bExp = 1;
4872 1.1 bjh21 }
4873 1.1 bjh21 if ( bSig0 < aSig0 ) goto aBigger;
4874 1.1 bjh21 if ( aSig0 < bSig0 ) goto bBigger;
4875 1.1 bjh21 if ( bSig1 < aSig1 ) goto aBigger;
4876 1.1 bjh21 if ( aSig1 < bSig1 ) goto bBigger;
4877 1.1 bjh21 return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
4878 1.1 bjh21 bExpBigger:
4879 1.1 bjh21 if ( bExp == 0x7FFF ) {
4880 1.1 bjh21 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4881 1.1 bjh21 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4882 1.1 bjh21 }
4883 1.1 bjh21 if ( aExp == 0 ) {
4884 1.1 bjh21 ++expDiff;
4885 1.1 bjh21 }
4886 1.1 bjh21 else {
4887 1.1 bjh21 aSig0 |= LIT64( 0x4000000000000000 );
4888 1.1 bjh21 }
4889 1.1 bjh21 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4890 1.1 bjh21 bSig0 |= LIT64( 0x4000000000000000 );
4891 1.1 bjh21 bBigger:
4892 1.1 bjh21 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4893 1.1 bjh21 zExp = bExp;
4894 1.1 bjh21 zSign ^= 1;
4895 1.1 bjh21 goto normalizeRoundAndPack;
4896 1.1 bjh21 aExpBigger:
4897 1.1 bjh21 if ( aExp == 0x7FFF ) {
4898 1.1 bjh21 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4899 1.1 bjh21 return a;
4900 1.1 bjh21 }
4901 1.1 bjh21 if ( bExp == 0 ) {
4902 1.1 bjh21 --expDiff;
4903 1.1 bjh21 }
4904 1.1 bjh21 else {
4905 1.1 bjh21 bSig0 |= LIT64( 0x4000000000000000 );
4906 1.1 bjh21 }
4907 1.1 bjh21 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4908 1.1 bjh21 aSig0 |= LIT64( 0x4000000000000000 );
4909 1.1 bjh21 aBigger:
4910 1.1 bjh21 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4911 1.1 bjh21 zExp = aExp;
4912 1.1 bjh21 normalizeRoundAndPack:
4913 1.1 bjh21 --zExp;
4914 1.1 bjh21 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4915 1.1 bjh21
4916 1.1 bjh21 }
4917 1.1 bjh21
4918 1.1 bjh21 /*
4919 1.1 bjh21 -------------------------------------------------------------------------------
4920 1.1 bjh21 Returns the result of adding the quadruple-precision floating-point values
4921 1.1 bjh21 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4922 1.1 bjh21 for Binary Floating-Point Arithmetic.
4923 1.1 bjh21 -------------------------------------------------------------------------------
4924 1.1 bjh21 */
4925 1.1 bjh21 float128 float128_add( float128 a, float128 b )
4926 1.1 bjh21 {
4927 1.1 bjh21 flag aSign, bSign;
4928 1.1 bjh21
4929 1.1 bjh21 aSign = extractFloat128Sign( a );
4930 1.1 bjh21 bSign = extractFloat128Sign( b );
4931 1.1 bjh21 if ( aSign == bSign ) {
4932 1.1 bjh21 return addFloat128Sigs( a, b, aSign );
4933 1.1 bjh21 }
4934 1.1 bjh21 else {
4935 1.1 bjh21 return subFloat128Sigs( a, b, aSign );
4936 1.1 bjh21 }
4937 1.1 bjh21
4938 1.1 bjh21 }
4939 1.1 bjh21
4940 1.1 bjh21 /*
4941 1.1 bjh21 -------------------------------------------------------------------------------
4942 1.1 bjh21 Returns the result of subtracting the quadruple-precision floating-point
4943 1.1 bjh21 values `a' and `b'. The operation is performed according to the IEC/IEEE
4944 1.1 bjh21 Standard for Binary Floating-Point Arithmetic.
4945 1.1 bjh21 -------------------------------------------------------------------------------
4946 1.1 bjh21 */
4947 1.1 bjh21 float128 float128_sub( float128 a, float128 b )
4948 1.1 bjh21 {
4949 1.1 bjh21 flag aSign, bSign;
4950 1.1 bjh21
4951 1.1 bjh21 aSign = extractFloat128Sign( a );
4952 1.1 bjh21 bSign = extractFloat128Sign( b );
4953 1.1 bjh21 if ( aSign == bSign ) {
4954 1.1 bjh21 return subFloat128Sigs( a, b, aSign );
4955 1.1 bjh21 }
4956 1.1 bjh21 else {
4957 1.1 bjh21 return addFloat128Sigs( a, b, aSign );
4958 1.1 bjh21 }
4959 1.1 bjh21
4960 1.1 bjh21 }
4961 1.1 bjh21
4962 1.1 bjh21 /*
4963 1.1 bjh21 -------------------------------------------------------------------------------
4964 1.1 bjh21 Returns the result of multiplying the quadruple-precision floating-point
4965 1.1 bjh21 values `a' and `b'. The operation is performed according to the IEC/IEEE
4966 1.1 bjh21 Standard for Binary Floating-Point Arithmetic.
4967 1.1 bjh21 -------------------------------------------------------------------------------
4968 1.1 bjh21 */
4969 1.1 bjh21 float128 float128_mul( float128 a, float128 b )
4970 1.1 bjh21 {
4971 1.1 bjh21 flag aSign, bSign, zSign;
4972 1.1 bjh21 int32 aExp, bExp, zExp;
4973 1.1 bjh21 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4974 1.1 bjh21 float128 z;
4975 1.1 bjh21
4976 1.1 bjh21 aSig1 = extractFloat128Frac1( a );
4977 1.1 bjh21 aSig0 = extractFloat128Frac0( a );
4978 1.1 bjh21 aExp = extractFloat128Exp( a );
4979 1.1 bjh21 aSign = extractFloat128Sign( a );
4980 1.1 bjh21 bSig1 = extractFloat128Frac1( b );
4981 1.1 bjh21 bSig0 = extractFloat128Frac0( b );
4982 1.1 bjh21 bExp = extractFloat128Exp( b );
4983 1.1 bjh21 bSign = extractFloat128Sign( b );
4984 1.1 bjh21 zSign = aSign ^ bSign;
4985 1.1 bjh21 if ( aExp == 0x7FFF ) {
4986 1.1 bjh21 if ( ( aSig0 | aSig1 )
4987 1.1 bjh21 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4988 1.1 bjh21 return propagateFloat128NaN( a, b );
4989 1.1 bjh21 }
4990 1.1 bjh21 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4991 1.1 bjh21 return packFloat128( zSign, 0x7FFF, 0, 0 );
4992 1.1 bjh21 }
4993 1.1 bjh21 if ( bExp == 0x7FFF ) {
4994 1.1 bjh21 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4995 1.1 bjh21 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4996 1.1 bjh21 invalid:
4997 1.1 bjh21 float_raise( float_flag_invalid );
4998 1.1 bjh21 z.low = float128_default_nan_low;
4999 1.1 bjh21 z.high = float128_default_nan_high;
5000 1.1 bjh21 return z;
5001 1.1 bjh21 }
5002 1.1 bjh21 return packFloat128( zSign, 0x7FFF, 0, 0 );
5003 1.1 bjh21 }
5004 1.1 bjh21 if ( aExp == 0 ) {
5005 1.1 bjh21 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5006 1.1 bjh21 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5007 1.1 bjh21 }
5008 1.1 bjh21 if ( bExp == 0 ) {
5009 1.1 bjh21 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5010 1.1 bjh21 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5011 1.1 bjh21 }
5012 1.1 bjh21 zExp = aExp + bExp - 0x4000;
5013 1.1 bjh21 aSig0 |= LIT64( 0x0001000000000000 );
5014 1.1 bjh21 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5015 1.1 bjh21 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5016 1.1 bjh21 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5017 1.1 bjh21 zSig2 |= ( zSig3 != 0 );
5018 1.1 bjh21 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5019 1.1 bjh21 shift128ExtraRightJamming(
5020 1.1 bjh21 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5021 1.1 bjh21 ++zExp;
5022 1.1 bjh21 }
5023 1.1 bjh21 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5024 1.1 bjh21
5025 1.1 bjh21 }
5026 1.1 bjh21
5027 1.1 bjh21 /*
5028 1.1 bjh21 -------------------------------------------------------------------------------
5029 1.1 bjh21 Returns the result of dividing the quadruple-precision floating-point value
5030 1.1 bjh21 `a' by the corresponding value `b'. The operation is performed according to
5031 1.1 bjh21 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5032 1.1 bjh21 -------------------------------------------------------------------------------
5033 1.1 bjh21 */
5034 1.1 bjh21 float128 float128_div( float128 a, float128 b )
5035 1.1 bjh21 {
5036 1.1 bjh21 flag aSign, bSign, zSign;
5037 1.1 bjh21 int32 aExp, bExp, zExp;
5038 1.1 bjh21 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5039 1.1 bjh21 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5040 1.1 bjh21 float128 z;
5041 1.1 bjh21
5042 1.1 bjh21 aSig1 = extractFloat128Frac1( a );
5043 1.1 bjh21 aSig0 = extractFloat128Frac0( a );
5044 1.1 bjh21 aExp = extractFloat128Exp( a );
5045 1.1 bjh21 aSign = extractFloat128Sign( a );
5046 1.1 bjh21 bSig1 = extractFloat128Frac1( b );
5047 1.1 bjh21 bSig0 = extractFloat128Frac0( b );
5048 1.1 bjh21 bExp = extractFloat128Exp( b );
5049 1.1 bjh21 bSign = extractFloat128Sign( b );
5050 1.1 bjh21 zSign = aSign ^ bSign;
5051 1.1 bjh21 if ( aExp == 0x7FFF ) {
5052 1.1 bjh21 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
5053 1.1 bjh21 if ( bExp == 0x7FFF ) {
5054 1.1 bjh21 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5055 1.1 bjh21 goto invalid;
5056 1.1 bjh21 }
5057 1.1 bjh21 return packFloat128( zSign, 0x7FFF, 0, 0 );
5058 1.1 bjh21 }
5059 1.1 bjh21 if ( bExp == 0x7FFF ) {
5060 1.1 bjh21 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5061 1.1 bjh21 return packFloat128( zSign, 0, 0, 0 );
5062 1.1 bjh21 }
5063 1.1 bjh21 if ( bExp == 0 ) {
5064 1.1 bjh21 if ( ( bSig0 | bSig1 ) == 0 ) {
5065 1.1 bjh21 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5066 1.1 bjh21 invalid:
5067 1.1 bjh21 float_raise( float_flag_invalid );
5068 1.1 bjh21 z.low = float128_default_nan_low;
5069 1.1 bjh21 z.high = float128_default_nan_high;
5070 1.1 bjh21 return z;
5071 1.1 bjh21 }
5072 1.1 bjh21 float_raise( float_flag_divbyzero );
5073 1.1 bjh21 return packFloat128( zSign, 0x7FFF, 0, 0 );
5074 1.1 bjh21 }
5075 1.1 bjh21 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5076 1.1 bjh21 }
5077 1.1 bjh21 if ( aExp == 0 ) {
5078 1.1 bjh21 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5079 1.1 bjh21 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5080 1.1 bjh21 }
5081 1.1 bjh21 zExp = aExp - bExp + 0x3FFD;
5082 1.1 bjh21 shortShift128Left(
5083 1.1 bjh21 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5084 1.1 bjh21 shortShift128Left(
5085 1.1 bjh21 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5086 1.1 bjh21 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5087 1.1 bjh21 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5088 1.1 bjh21 ++zExp;
5089 1.1 bjh21 }
5090 1.1 bjh21 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5091 1.1 bjh21 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5092 1.1 bjh21 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5093 1.1 bjh21 while ( (sbits64) rem0 < 0 ) {
5094 1.1 bjh21 --zSig0;
5095 1.1 bjh21 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5096 1.1 bjh21 }
5097 1.1 bjh21 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5098 1.1 bjh21 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5099 1.1 bjh21 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5100 1.1 bjh21 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5101 1.1 bjh21 while ( (sbits64) rem1 < 0 ) {
5102 1.1 bjh21 --zSig1;
5103 1.1 bjh21 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5104 1.1 bjh21 }
5105 1.1 bjh21 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5106 1.1 bjh21 }
5107 1.1 bjh21 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5108 1.1 bjh21 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5109 1.1 bjh21
5110 1.1 bjh21 }
5111 1.1 bjh21
5112 1.1 bjh21 /*
5113 1.1 bjh21 -------------------------------------------------------------------------------
5114 1.1 bjh21 Returns the remainder of the quadruple-precision floating-point value `a'
5115 1.1 bjh21 with respect to the corresponding value `b'. The operation is performed
5116 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5117 1.1 bjh21 -------------------------------------------------------------------------------
5118 1.1 bjh21 */
5119 1.1 bjh21 float128 float128_rem( float128 a, float128 b )
5120 1.1 bjh21 {
5121 1.9 martin flag aSign, zSign;
5122 1.1 bjh21 int32 aExp, bExp, expDiff;
5123 1.1 bjh21 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5124 1.1 bjh21 bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5125 1.1 bjh21 sbits64 sigMean0;
5126 1.1 bjh21 float128 z;
5127 1.1 bjh21
5128 1.1 bjh21 aSig1 = extractFloat128Frac1( a );
5129 1.1 bjh21 aSig0 = extractFloat128Frac0( a );
5130 1.1 bjh21 aExp = extractFloat128Exp( a );
5131 1.1 bjh21 aSign = extractFloat128Sign( a );
5132 1.1 bjh21 bSig1 = extractFloat128Frac1( b );
5133 1.1 bjh21 bSig0 = extractFloat128Frac0( b );
5134 1.1 bjh21 bExp = extractFloat128Exp( b );
5135 1.1 bjh21 if ( aExp == 0x7FFF ) {
5136 1.1 bjh21 if ( ( aSig0 | aSig1 )
5137 1.1 bjh21 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5138 1.1 bjh21 return propagateFloat128NaN( a, b );
5139 1.1 bjh21 }
5140 1.1 bjh21 goto invalid;
5141 1.1 bjh21 }
5142 1.1 bjh21 if ( bExp == 0x7FFF ) {
5143 1.1 bjh21 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5144 1.1 bjh21 return a;
5145 1.1 bjh21 }
5146 1.1 bjh21 if ( bExp == 0 ) {
5147 1.1 bjh21 if ( ( bSig0 | bSig1 ) == 0 ) {
5148 1.1 bjh21 invalid:
5149 1.1 bjh21 float_raise( float_flag_invalid );
5150 1.1 bjh21 z.low = float128_default_nan_low;
5151 1.1 bjh21 z.high = float128_default_nan_high;
5152 1.1 bjh21 return z;
5153 1.1 bjh21 }
5154 1.1 bjh21 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5155 1.1 bjh21 }
5156 1.1 bjh21 if ( aExp == 0 ) {
5157 1.1 bjh21 if ( ( aSig0 | aSig1 ) == 0 ) return a;
5158 1.1 bjh21 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5159 1.1 bjh21 }
5160 1.1 bjh21 expDiff = aExp - bExp;
5161 1.1 bjh21 if ( expDiff < -1 ) return a;
5162 1.1 bjh21 shortShift128Left(
5163 1.1 bjh21 aSig0 | LIT64( 0x0001000000000000 ),
5164 1.1 bjh21 aSig1,
5165 1.1 bjh21 15 - ( expDiff < 0 ),
5166 1.1 bjh21 &aSig0,
5167 1.1 bjh21 &aSig1
5168 1.1 bjh21 );
5169 1.1 bjh21 shortShift128Left(
5170 1.1 bjh21 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5171 1.1 bjh21 q = le128( bSig0, bSig1, aSig0, aSig1 );
5172 1.1 bjh21 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5173 1.1 bjh21 expDiff -= 64;
5174 1.1 bjh21 while ( 0 < expDiff ) {
5175 1.1 bjh21 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5176 1.1 bjh21 q = ( 4 < q ) ? q - 4 : 0;
5177 1.1 bjh21 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5178 1.1 bjh21 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5179 1.1 bjh21 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5180 1.1 bjh21 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5181 1.1 bjh21 expDiff -= 61;
5182 1.1 bjh21 }
5183 1.1 bjh21 if ( -64 < expDiff ) {
5184 1.1 bjh21 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5185 1.1 bjh21 q = ( 4 < q ) ? q - 4 : 0;
5186 1.1 bjh21 q >>= - expDiff;
5187 1.1 bjh21 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5188 1.1 bjh21 expDiff += 52;
5189 1.1 bjh21 if ( expDiff < 0 ) {
5190 1.1 bjh21 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5191 1.1 bjh21 }
5192 1.1 bjh21 else {
5193 1.1 bjh21 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5194 1.1 bjh21 }
5195 1.1 bjh21 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5196 1.1 bjh21 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5197 1.1 bjh21 }
5198 1.1 bjh21 else {
5199 1.1 bjh21 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5200 1.1 bjh21 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5201 1.1 bjh21 }
5202 1.1 bjh21 do {
5203 1.1 bjh21 alternateASig0 = aSig0;
5204 1.1 bjh21 alternateASig1 = aSig1;
5205 1.1 bjh21 ++q;
5206 1.1 bjh21 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5207 1.1 bjh21 } while ( 0 <= (sbits64) aSig0 );
5208 1.1 bjh21 add128(
5209 1.3 mrg aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5210 1.1 bjh21 if ( ( sigMean0 < 0 )
5211 1.1 bjh21 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5212 1.1 bjh21 aSig0 = alternateASig0;
5213 1.1 bjh21 aSig1 = alternateASig1;
5214 1.1 bjh21 }
5215 1.1 bjh21 zSign = ( (sbits64) aSig0 < 0 );
5216 1.1 bjh21 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5217 1.1 bjh21 return
5218 1.1 bjh21 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5219 1.1 bjh21
5220 1.1 bjh21 }
5221 1.1 bjh21
5222 1.1 bjh21 /*
5223 1.1 bjh21 -------------------------------------------------------------------------------
5224 1.1 bjh21 Returns the square root of the quadruple-precision floating-point value `a'.
5225 1.1 bjh21 The operation is performed according to the IEC/IEEE Standard for Binary
5226 1.1 bjh21 Floating-Point Arithmetic.
5227 1.1 bjh21 -------------------------------------------------------------------------------
5228 1.1 bjh21 */
5229 1.1 bjh21 float128 float128_sqrt( float128 a )
5230 1.1 bjh21 {
5231 1.1 bjh21 flag aSign;
5232 1.1 bjh21 int32 aExp, zExp;
5233 1.1 bjh21 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5234 1.1 bjh21 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5235 1.1 bjh21 float128 z;
5236 1.1 bjh21
5237 1.1 bjh21 aSig1 = extractFloat128Frac1( a );
5238 1.1 bjh21 aSig0 = extractFloat128Frac0( a );
5239 1.1 bjh21 aExp = extractFloat128Exp( a );
5240 1.1 bjh21 aSign = extractFloat128Sign( a );
5241 1.1 bjh21 if ( aExp == 0x7FFF ) {
5242 1.1 bjh21 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5243 1.1 bjh21 if ( ! aSign ) return a;
5244 1.1 bjh21 goto invalid;
5245 1.1 bjh21 }
5246 1.1 bjh21 if ( aSign ) {
5247 1.1 bjh21 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5248 1.1 bjh21 invalid:
5249 1.1 bjh21 float_raise( float_flag_invalid );
5250 1.1 bjh21 z.low = float128_default_nan_low;
5251 1.1 bjh21 z.high = float128_default_nan_high;
5252 1.1 bjh21 return z;
5253 1.1 bjh21 }
5254 1.1 bjh21 if ( aExp == 0 ) {
5255 1.1 bjh21 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5256 1.1 bjh21 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5257 1.1 bjh21 }
5258 1.1 bjh21 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5259 1.1 bjh21 aSig0 |= LIT64( 0x0001000000000000 );
5260 1.1 bjh21 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5261 1.1 bjh21 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5262 1.1 bjh21 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5263 1.1 bjh21 doubleZSig0 = zSig0<<1;
5264 1.1 bjh21 mul64To128( zSig0, zSig0, &term0, &term1 );
5265 1.1 bjh21 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5266 1.1 bjh21 while ( (sbits64) rem0 < 0 ) {
5267 1.1 bjh21 --zSig0;
5268 1.1 bjh21 doubleZSig0 -= 2;
5269 1.1 bjh21 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5270 1.1 bjh21 }
5271 1.1 bjh21 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5272 1.1 bjh21 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5273 1.1 bjh21 if ( zSig1 == 0 ) zSig1 = 1;
5274 1.1 bjh21 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5275 1.1 bjh21 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5276 1.1 bjh21 mul64To128( zSig1, zSig1, &term2, &term3 );
5277 1.1 bjh21 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5278 1.1 bjh21 while ( (sbits64) rem1 < 0 ) {
5279 1.1 bjh21 --zSig1;
5280 1.1 bjh21 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5281 1.1 bjh21 term3 |= 1;
5282 1.1 bjh21 term2 |= doubleZSig0;
5283 1.1 bjh21 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5284 1.1 bjh21 }
5285 1.1 bjh21 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5286 1.1 bjh21 }
5287 1.1 bjh21 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5288 1.1 bjh21 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5289 1.1 bjh21
5290 1.1 bjh21 }
5291 1.1 bjh21
5292 1.1 bjh21 /*
5293 1.1 bjh21 -------------------------------------------------------------------------------
5294 1.1 bjh21 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5295 1.1 bjh21 the corresponding value `b', and 0 otherwise. The comparison is performed
5296 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5297 1.1 bjh21 -------------------------------------------------------------------------------
5298 1.1 bjh21 */
5299 1.1 bjh21 flag float128_eq( float128 a, float128 b )
5300 1.1 bjh21 {
5301 1.1 bjh21
5302 1.1 bjh21 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5303 1.1 bjh21 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5304 1.1 bjh21 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5305 1.1 bjh21 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5306 1.1 bjh21 ) {
5307 1.1 bjh21 if ( float128_is_signaling_nan( a )
5308 1.1 bjh21 || float128_is_signaling_nan( b ) ) {
5309 1.1 bjh21 float_raise( float_flag_invalid );
5310 1.1 bjh21 }
5311 1.1 bjh21 return 0;
5312 1.1 bjh21 }
5313 1.1 bjh21 return
5314 1.1 bjh21 ( a.low == b.low )
5315 1.1 bjh21 && ( ( a.high == b.high )
5316 1.1 bjh21 || ( ( a.low == 0 )
5317 1.1 bjh21 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5318 1.1 bjh21 );
5319 1.1 bjh21
5320 1.1 bjh21 }
5321 1.1 bjh21
5322 1.1 bjh21 /*
5323 1.1 bjh21 -------------------------------------------------------------------------------
5324 1.1 bjh21 Returns 1 if the quadruple-precision floating-point value `a' is less than
5325 1.1 bjh21 or equal to the corresponding value `b', and 0 otherwise. The comparison
5326 1.1 bjh21 is performed according to the IEC/IEEE Standard for Binary Floating-Point
5327 1.1 bjh21 Arithmetic.
5328 1.1 bjh21 -------------------------------------------------------------------------------
5329 1.1 bjh21 */
5330 1.1 bjh21 flag float128_le( float128 a, float128 b )
5331 1.1 bjh21 {
5332 1.1 bjh21 flag aSign, bSign;
5333 1.1 bjh21
5334 1.1 bjh21 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5335 1.1 bjh21 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5336 1.1 bjh21 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5337 1.1 bjh21 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5338 1.1 bjh21 ) {
5339 1.1 bjh21 float_raise( float_flag_invalid );
5340 1.1 bjh21 return 0;
5341 1.1 bjh21 }
5342 1.1 bjh21 aSign = extractFloat128Sign( a );
5343 1.1 bjh21 bSign = extractFloat128Sign( b );
5344 1.1 bjh21 if ( aSign != bSign ) {
5345 1.1 bjh21 return
5346 1.1 bjh21 aSign
5347 1.1 bjh21 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5348 1.1 bjh21 == 0 );
5349 1.1 bjh21 }
5350 1.1 bjh21 return
5351 1.1 bjh21 aSign ? le128( b.high, b.low, a.high, a.low )
5352 1.1 bjh21 : le128( a.high, a.low, b.high, b.low );
5353 1.1 bjh21
5354 1.1 bjh21 }
5355 1.1 bjh21
5356 1.1 bjh21 /*
5357 1.1 bjh21 -------------------------------------------------------------------------------
5358 1.1 bjh21 Returns 1 if the quadruple-precision floating-point value `a' is less than
5359 1.1 bjh21 the corresponding value `b', and 0 otherwise. The comparison is performed
5360 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5361 1.1 bjh21 -------------------------------------------------------------------------------
5362 1.1 bjh21 */
5363 1.1 bjh21 flag float128_lt( float128 a, float128 b )
5364 1.1 bjh21 {
5365 1.1 bjh21 flag aSign, bSign;
5366 1.1 bjh21
5367 1.1 bjh21 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5368 1.1 bjh21 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5369 1.1 bjh21 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5370 1.1 bjh21 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5371 1.1 bjh21 ) {
5372 1.1 bjh21 float_raise( float_flag_invalid );
5373 1.1 bjh21 return 0;
5374 1.1 bjh21 }
5375 1.1 bjh21 aSign = extractFloat128Sign( a );
5376 1.1 bjh21 bSign = extractFloat128Sign( b );
5377 1.1 bjh21 if ( aSign != bSign ) {
5378 1.1 bjh21 return
5379 1.1 bjh21 aSign
5380 1.1 bjh21 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5381 1.1 bjh21 != 0 );
5382 1.1 bjh21 }
5383 1.1 bjh21 return
5384 1.1 bjh21 aSign ? lt128( b.high, b.low, a.high, a.low )
5385 1.1 bjh21 : lt128( a.high, a.low, b.high, b.low );
5386 1.1 bjh21
5387 1.1 bjh21 }
5388 1.1 bjh21
5389 1.1 bjh21 /*
5390 1.1 bjh21 -------------------------------------------------------------------------------
5391 1.1 bjh21 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5392 1.1 bjh21 the corresponding value `b', and 0 otherwise. The invalid exception is
5393 1.1 bjh21 raised if either operand is a NaN. Otherwise, the comparison is performed
5394 1.1 bjh21 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5395 1.1 bjh21 -------------------------------------------------------------------------------
5396 1.1 bjh21 */
5397 1.1 bjh21 flag float128_eq_signaling( float128 a, float128 b )
5398 1.1 bjh21 {
5399 1.1 bjh21
5400 1.1 bjh21 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5401 1.1 bjh21 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5402 1.1 bjh21 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5403 1.1 bjh21 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5404 1.1 bjh21 ) {
5405 1.1 bjh21 float_raise( float_flag_invalid );
5406 1.1 bjh21 return 0;
5407 1.1 bjh21 }
5408 1.1 bjh21 return
5409 1.1 bjh21 ( a.low == b.low )
5410 1.1 bjh21 && ( ( a.high == b.high )
5411 1.1 bjh21 || ( ( a.low == 0 )
5412 1.1 bjh21 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5413 1.1 bjh21 );
5414 1.1 bjh21
5415 1.1 bjh21 }
5416 1.1 bjh21
5417 1.1 bjh21 /*
5418 1.1 bjh21 -------------------------------------------------------------------------------
5419 1.1 bjh21 Returns 1 if the quadruple-precision floating-point value `a' is less than
5420 1.1 bjh21 or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5421 1.1 bjh21 cause an exception. Otherwise, the comparison is performed according to the
5422 1.1 bjh21 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5423 1.1 bjh21 -------------------------------------------------------------------------------
5424 1.1 bjh21 */
5425 1.1 bjh21 flag float128_le_quiet( float128 a, float128 b )
5426 1.1 bjh21 {
5427 1.1 bjh21 flag aSign, bSign;
5428 1.1 bjh21
5429 1.1 bjh21 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5430 1.1 bjh21 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5431 1.1 bjh21 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5432 1.1 bjh21 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5433 1.1 bjh21 ) {
5434 1.1 bjh21 if ( float128_is_signaling_nan( a )
5435 1.1 bjh21 || float128_is_signaling_nan( b ) ) {
5436 1.1 bjh21 float_raise( float_flag_invalid );
5437 1.1 bjh21 }
5438 1.1 bjh21 return 0;
5439 1.1 bjh21 }
5440 1.1 bjh21 aSign = extractFloat128Sign( a );
5441 1.1 bjh21 bSign = extractFloat128Sign( b );
5442 1.1 bjh21 if ( aSign != bSign ) {
5443 1.1 bjh21 return
5444 1.1 bjh21 aSign
5445 1.1 bjh21 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5446 1.1 bjh21 == 0 );
5447 1.1 bjh21 }
5448 1.1 bjh21 return
5449 1.1 bjh21 aSign ? le128( b.high, b.low, a.high, a.low )
5450 1.1 bjh21 : le128( a.high, a.low, b.high, b.low );
5451 1.1 bjh21
5452 1.1 bjh21 }
5453 1.1 bjh21
5454 1.1 bjh21 /*
5455 1.1 bjh21 -------------------------------------------------------------------------------
5456 1.1 bjh21 Returns 1 if the quadruple-precision floating-point value `a' is less than
5457 1.1 bjh21 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5458 1.1 bjh21 exception. Otherwise, the comparison is performed according to the IEC/IEEE
5459 1.1 bjh21 Standard for Binary Floating-Point Arithmetic.
5460 1.1 bjh21 -------------------------------------------------------------------------------
5461 1.1 bjh21 */
5462 1.1 bjh21 flag float128_lt_quiet( float128 a, float128 b )
5463 1.1 bjh21 {
5464 1.1 bjh21 flag aSign, bSign;
5465 1.1 bjh21
5466 1.1 bjh21 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5467 1.1 bjh21 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5468 1.1 bjh21 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5469 1.1 bjh21 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5470 1.1 bjh21 ) {
5471 1.1 bjh21 if ( float128_is_signaling_nan( a )
5472 1.1 bjh21 || float128_is_signaling_nan( b ) ) {
5473 1.1 bjh21 float_raise( float_flag_invalid );
5474 1.1 bjh21 }
5475 1.1 bjh21 return 0;
5476 1.1 bjh21 }
5477 1.1 bjh21 aSign = extractFloat128Sign( a );
5478 1.1 bjh21 bSign = extractFloat128Sign( b );
5479 1.1 bjh21 if ( aSign != bSign ) {
5480 1.1 bjh21 return
5481 1.1 bjh21 aSign
5482 1.1 bjh21 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5483 1.1 bjh21 != 0 );
5484 1.1 bjh21 }
5485 1.1 bjh21 return
5486 1.1 bjh21 aSign ? lt128( b.high, b.low, a.high, a.low )
5487 1.1 bjh21 : lt128( a.high, a.low, b.high, b.low );
5488 1.1 bjh21
5489 1.1 bjh21 }
5490 1.1 bjh21
5491 1.1 bjh21 #endif
5492 1.1 bjh21
5493 1.1 bjh21
5494 1.1 bjh21 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5495 1.1 bjh21
5496 1.1 bjh21 /*
5497 1.1 bjh21 * These two routines are not part of the original softfloat distribution.
5498 1.1 bjh21 *
5499 1.1 bjh21 * They are based on the corresponding conversions to integer but return
5500 1.1 bjh21 * unsigned numbers instead since these functions are required by GCC.
5501 1.1 bjh21 *
5502 1.2 salo * Added by Mark Brinicombe <mark (at) NetBSD.org> 27/09/97
5503 1.1 bjh21 *
5504 1.1 bjh21 * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5505 1.1 bjh21 */
5506 1.1 bjh21
5507 1.1 bjh21 /*
5508 1.1 bjh21 -------------------------------------------------------------------------------
5509 1.1 bjh21 Returns the result of converting the double-precision floating-point value
5510 1.1 bjh21 `a' to the 32-bit unsigned integer format. The conversion is
5511 1.1 bjh21 performed according to the IEC/IEEE Standard for Binary Floating-point
5512 1.1 bjh21 Arithmetic, except that the conversion is always rounded toward zero. If
5513 1.1 bjh21 `a' is a NaN, the largest positive integer is returned. If the conversion
5514 1.1 bjh21 overflows, the largest integer positive is returned.
5515 1.1 bjh21 -------------------------------------------------------------------------------
5516 1.1 bjh21 */
5517 1.1 bjh21 uint32 float64_to_uint32_round_to_zero( float64 a )
5518 1.1 bjh21 {
5519 1.1 bjh21 flag aSign;
5520 1.1 bjh21 int16 aExp, shiftCount;
5521 1.1 bjh21 bits64 aSig, savedASig;
5522 1.1 bjh21 uint32 z;
5523 1.1 bjh21
5524 1.1 bjh21 aSig = extractFloat64Frac( a );
5525 1.1 bjh21 aExp = extractFloat64Exp( a );
5526 1.1 bjh21 aSign = extractFloat64Sign( a );
5527 1.1 bjh21
5528 1.1 bjh21 if (aSign) {
5529 1.1 bjh21 float_raise( float_flag_invalid );
5530 1.1 bjh21 return(0);
5531 1.1 bjh21 }
5532 1.1 bjh21
5533 1.1 bjh21 if ( 0x41E < aExp ) {
5534 1.1 bjh21 float_raise( float_flag_invalid );
5535 1.1 bjh21 return 0xffffffff;
5536 1.1 bjh21 }
5537 1.1 bjh21 else if ( aExp < 0x3FF ) {
5538 1.1 bjh21 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
5539 1.1 bjh21 return 0;
5540 1.1 bjh21 }
5541 1.1 bjh21 aSig |= LIT64( 0x0010000000000000 );
5542 1.1 bjh21 shiftCount = 0x433 - aExp;
5543 1.1 bjh21 savedASig = aSig;
5544 1.1 bjh21 aSig >>= shiftCount;
5545 1.1 bjh21 z = aSig;
5546 1.1 bjh21 if ( ( aSig<<shiftCount ) != savedASig ) {
5547 1.1 bjh21 float_exception_flags |= float_flag_inexact;
5548 1.1 bjh21 }
5549 1.1 bjh21 return z;
5550 1.1 bjh21
5551 1.1 bjh21 }
5552 1.1 bjh21
5553 1.1 bjh21 /*
5554 1.1 bjh21 -------------------------------------------------------------------------------
5555 1.1 bjh21 Returns the result of converting the single-precision floating-point value
5556 1.1 bjh21 `a' to the 32-bit unsigned integer format. The conversion is
5557 1.1 bjh21 performed according to the IEC/IEEE Standard for Binary Floating-point
5558 1.1 bjh21 Arithmetic, except that the conversion is always rounded toward zero. If
5559 1.1 bjh21 `a' is a NaN, the largest positive integer is returned. If the conversion
5560 1.1 bjh21 overflows, the largest positive integer is returned.
5561 1.1 bjh21 -------------------------------------------------------------------------------
5562 1.1 bjh21 */
5563 1.1 bjh21 uint32 float32_to_uint32_round_to_zero( float32 a )
5564 1.1 bjh21 {
5565 1.1 bjh21 flag aSign;
5566 1.1 bjh21 int16 aExp, shiftCount;
5567 1.1 bjh21 bits32 aSig;
5568 1.1 bjh21 uint32 z;
5569 1.1 bjh21
5570 1.1 bjh21 aSig = extractFloat32Frac( a );
5571 1.1 bjh21 aExp = extractFloat32Exp( a );
5572 1.1 bjh21 aSign = extractFloat32Sign( a );
5573 1.1 bjh21 shiftCount = aExp - 0x9E;
5574 1.1 bjh21
5575 1.1 bjh21 if (aSign) {
5576 1.1 bjh21 float_raise( float_flag_invalid );
5577 1.1 bjh21 return(0);
5578 1.1 bjh21 }
5579 1.1 bjh21 if ( 0 < shiftCount ) {
5580 1.1 bjh21 float_raise( float_flag_invalid );
5581 1.1 bjh21 return 0xFFFFFFFF;
5582 1.1 bjh21 }
5583 1.1 bjh21 else if ( aExp <= 0x7E ) {
5584 1.1 bjh21 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
5585 1.1 bjh21 return 0;
5586 1.1 bjh21 }
5587 1.1 bjh21 aSig = ( aSig | 0x800000 )<<8;
5588 1.1 bjh21 z = aSig>>( - shiftCount );
5589 1.1 bjh21 if ( aSig<<( shiftCount & 31 ) ) {
5590 1.1 bjh21 float_exception_flags |= float_flag_inexact;
5591 1.1 bjh21 }
5592 1.1 bjh21 return z;
5593 1.1 bjh21
5594 1.1 bjh21 }
5595 1.1 bjh21
5596 1.1 bjh21 #endif
5597