op-2.h revision 1.1 1 1.1 mrg /* Software floating-point emulation.
2 1.1 mrg Basic two-word fraction declaration and manipulation.
3 1.1 mrg Copyright (C) 1997,1998,1999,2006,2007 Free Software Foundation, Inc.
4 1.1 mrg This file is part of the GNU C Library.
5 1.1 mrg Contributed by Richard Henderson (rth (at) cygnus.com),
6 1.1 mrg Jakub Jelinek (jj (at) ultra.linux.cz),
7 1.1 mrg David S. Miller (davem (at) redhat.com) and
8 1.1 mrg Peter Maydell (pmaydell (at) chiark.greenend.org.uk).
9 1.1 mrg
10 1.1 mrg The GNU C Library is free software; you can redistribute it and/or
11 1.1 mrg modify it under the terms of the GNU Lesser General Public
12 1.1 mrg License as published by the Free Software Foundation; either
13 1.1 mrg version 2.1 of the License, or (at your option) any later version.
14 1.1 mrg
15 1.1 mrg In addition to the permissions in the GNU Lesser General Public
16 1.1 mrg License, the Free Software Foundation gives you unlimited
17 1.1 mrg permission to link the compiled version of this file into
18 1.1 mrg combinations with other programs, and to distribute those
19 1.1 mrg combinations without any restriction coming from the use of this
20 1.1 mrg file. (The Lesser General Public License restrictions do apply in
21 1.1 mrg other respects; for example, they cover modification of the file,
22 1.1 mrg and distribution when not linked into a combine executable.)
23 1.1 mrg
24 1.1 mrg The GNU C Library is distributed in the hope that it will be useful,
25 1.1 mrg but WITHOUT ANY WARRANTY; without even the implied warranty of
26 1.1 mrg MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
27 1.1 mrg Lesser General Public License for more details.
28 1.1 mrg
29 1.1 mrg You should have received a copy of the GNU Lesser General Public
30 1.1 mrg License along with the GNU C Library; if not, see
31 1.1 mrg <http://www.gnu.org/licenses/>. */
32 1.1 mrg
33 1.1 mrg #define _FP_FRAC_DECL_2(X) _FP_W_TYPE X##_f0, X##_f1
34 1.1 mrg #define _FP_FRAC_COPY_2(D,S) (D##_f0 = S##_f0, D##_f1 = S##_f1)
35 1.1 mrg #define _FP_FRAC_SET_2(X,I) __FP_FRAC_SET_2(X, I)
36 1.1 mrg #define _FP_FRAC_HIGH_2(X) (X##_f1)
37 1.1 mrg #define _FP_FRAC_LOW_2(X) (X##_f0)
38 1.1 mrg #define _FP_FRAC_WORD_2(X,w) (X##_f##w)
39 1.1 mrg
40 1.1 mrg #define _FP_FRAC_SLL_2(X,N) \
41 1.1 mrg (void)(((N) < _FP_W_TYPE_SIZE) \
42 1.1 mrg ? ({ \
43 1.1 mrg if (__builtin_constant_p(N) && (N) == 1) \
44 1.1 mrg { \
45 1.1 mrg X##_f1 = X##_f1 + X##_f1 + (((_FP_WS_TYPE)(X##_f0)) < 0); \
46 1.1 mrg X##_f0 += X##_f0; \
47 1.1 mrg } \
48 1.1 mrg else \
49 1.1 mrg { \
50 1.1 mrg X##_f1 = X##_f1 << (N) | X##_f0 >> (_FP_W_TYPE_SIZE - (N)); \
51 1.1 mrg X##_f0 <<= (N); \
52 1.1 mrg } \
53 1.1 mrg 0; \
54 1.1 mrg }) \
55 1.1 mrg : ({ \
56 1.1 mrg X##_f1 = X##_f0 << ((N) - _FP_W_TYPE_SIZE); \
57 1.1 mrg X##_f0 = 0; \
58 1.1 mrg }))
59 1.1 mrg
60 1.1 mrg
61 1.1 mrg #define _FP_FRAC_SRL_2(X,N) \
62 1.1 mrg (void)(((N) < _FP_W_TYPE_SIZE) \
63 1.1 mrg ? ({ \
64 1.1 mrg X##_f0 = X##_f0 >> (N) | X##_f1 << (_FP_W_TYPE_SIZE - (N)); \
65 1.1 mrg X##_f1 >>= (N); \
66 1.1 mrg }) \
67 1.1 mrg : ({ \
68 1.1 mrg X##_f0 = X##_f1 >> ((N) - _FP_W_TYPE_SIZE); \
69 1.1 mrg X##_f1 = 0; \
70 1.1 mrg }))
71 1.1 mrg
72 1.1 mrg /* Right shift with sticky-lsb. */
73 1.1 mrg #define _FP_FRAC_SRST_2(X,S, N,sz) \
74 1.1 mrg (void)(((N) < _FP_W_TYPE_SIZE) \
75 1.1 mrg ? ({ \
76 1.1 mrg S = (__builtin_constant_p(N) && (N) == 1 \
77 1.1 mrg ? X##_f0 & 1 \
78 1.1 mrg : (X##_f0 << (_FP_W_TYPE_SIZE - (N))) != 0); \
79 1.1 mrg X##_f0 = (X##_f1 << (_FP_W_TYPE_SIZE - (N)) | X##_f0 >> (N)); \
80 1.1 mrg X##_f1 >>= (N); \
81 1.1 mrg }) \
82 1.1 mrg : ({ \
83 1.1 mrg S = ((((N) == _FP_W_TYPE_SIZE \
84 1.1 mrg ? 0 \
85 1.1 mrg : (X##_f1 << (2*_FP_W_TYPE_SIZE - (N)))) \
86 1.1 mrg | X##_f0) != 0); \
87 1.1 mrg X##_f0 = (X##_f1 >> ((N) - _FP_W_TYPE_SIZE)); \
88 1.1 mrg X##_f1 = 0; \
89 1.1 mrg }))
90 1.1 mrg
91 1.1 mrg #define _FP_FRAC_SRS_2(X,N,sz) \
92 1.1 mrg (void)(((N) < _FP_W_TYPE_SIZE) \
93 1.1 mrg ? ({ \
94 1.1 mrg X##_f0 = (X##_f1 << (_FP_W_TYPE_SIZE - (N)) | X##_f0 >> (N) | \
95 1.1 mrg (__builtin_constant_p(N) && (N) == 1 \
96 1.1 mrg ? X##_f0 & 1 \
97 1.1 mrg : (X##_f0 << (_FP_W_TYPE_SIZE - (N))) != 0)); \
98 1.1 mrg X##_f1 >>= (N); \
99 1.1 mrg }) \
100 1.1 mrg : ({ \
101 1.1 mrg X##_f0 = (X##_f1 >> ((N) - _FP_W_TYPE_SIZE) | \
102 1.1 mrg ((((N) == _FP_W_TYPE_SIZE \
103 1.1 mrg ? 0 \
104 1.1 mrg : (X##_f1 << (2*_FP_W_TYPE_SIZE - (N)))) \
105 1.1 mrg | X##_f0) != 0)); \
106 1.1 mrg X##_f1 = 0; \
107 1.1 mrg }))
108 1.1 mrg
109 1.1 mrg #define _FP_FRAC_ADDI_2(X,I) \
110 1.1 mrg __FP_FRAC_ADDI_2(X##_f1, X##_f0, I)
111 1.1 mrg
112 1.1 mrg #define _FP_FRAC_ADD_2(R,X,Y) \
113 1.1 mrg __FP_FRAC_ADD_2(R##_f1, R##_f0, X##_f1, X##_f0, Y##_f1, Y##_f0)
114 1.1 mrg
115 1.1 mrg #define _FP_FRAC_SUB_2(R,X,Y) \
116 1.1 mrg __FP_FRAC_SUB_2(R##_f1, R##_f0, X##_f1, X##_f0, Y##_f1, Y##_f0)
117 1.1 mrg
118 1.1 mrg #define _FP_FRAC_DEC_2(X,Y) \
119 1.1 mrg __FP_FRAC_DEC_2(X##_f1, X##_f0, Y##_f1, Y##_f0)
120 1.1 mrg
121 1.1 mrg #define _FP_FRAC_CLZ_2(R,X) \
122 1.1 mrg do { \
123 1.1 mrg if (X##_f1) \
124 1.1 mrg __FP_CLZ(R,X##_f1); \
125 1.1 mrg else \
126 1.1 mrg { \
127 1.1 mrg __FP_CLZ(R,X##_f0); \
128 1.1 mrg R += _FP_W_TYPE_SIZE; \
129 1.1 mrg } \
130 1.1 mrg } while(0)
131 1.1 mrg
132 1.1 mrg /* Predicates */
133 1.1 mrg #define _FP_FRAC_NEGP_2(X) ((_FP_WS_TYPE)X##_f1 < 0)
134 1.1 mrg #define _FP_FRAC_ZEROP_2(X) ((X##_f1 | X##_f0) == 0)
135 1.1 mrg #define _FP_FRAC_OVERP_2(fs,X) (_FP_FRAC_HIGH_##fs(X) & _FP_OVERFLOW_##fs)
136 1.1 mrg #define _FP_FRAC_CLEAR_OVERP_2(fs,X) (_FP_FRAC_HIGH_##fs(X) &= ~_FP_OVERFLOW_##fs)
137 1.1 mrg #define _FP_FRAC_EQ_2(X, Y) (X##_f1 == Y##_f1 && X##_f0 == Y##_f0)
138 1.1 mrg #define _FP_FRAC_GT_2(X, Y) \
139 1.1 mrg (X##_f1 > Y##_f1 || (X##_f1 == Y##_f1 && X##_f0 > Y##_f0))
140 1.1 mrg #define _FP_FRAC_GE_2(X, Y) \
141 1.1 mrg (X##_f1 > Y##_f1 || (X##_f1 == Y##_f1 && X##_f0 >= Y##_f0))
142 1.1 mrg
143 1.1 mrg #define _FP_ZEROFRAC_2 0, 0
144 1.1 mrg #define _FP_MINFRAC_2 0, 1
145 1.1 mrg #define _FP_MAXFRAC_2 (~(_FP_WS_TYPE)0), (~(_FP_WS_TYPE)0)
146 1.1 mrg
147 1.1 mrg /*
148 1.1 mrg * Internals
149 1.1 mrg */
150 1.1 mrg
151 1.1 mrg #define __FP_FRAC_SET_2(X,I1,I0) (X##_f0 = I0, X##_f1 = I1)
152 1.1 mrg
153 1.1 mrg #define __FP_CLZ_2(R, xh, xl) \
154 1.1 mrg do { \
155 1.1 mrg if (xh) \
156 1.1 mrg __FP_CLZ(R,xh); \
157 1.1 mrg else \
158 1.1 mrg { \
159 1.1 mrg __FP_CLZ(R,xl); \
160 1.1 mrg R += _FP_W_TYPE_SIZE; \
161 1.1 mrg } \
162 1.1 mrg } while(0)
163 1.1 mrg
164 1.1 mrg #if 0
165 1.1 mrg
166 1.1 mrg #ifndef __FP_FRAC_ADDI_2
167 1.1 mrg #define __FP_FRAC_ADDI_2(xh, xl, i) \
168 1.1 mrg (xh += ((xl += i) < i))
169 1.1 mrg #endif
170 1.1 mrg #ifndef __FP_FRAC_ADD_2
171 1.1 mrg #define __FP_FRAC_ADD_2(rh, rl, xh, xl, yh, yl) \
172 1.1 mrg (rh = xh + yh + ((rl = xl + yl) < xl))
173 1.1 mrg #endif
174 1.1 mrg #ifndef __FP_FRAC_SUB_2
175 1.1 mrg #define __FP_FRAC_SUB_2(rh, rl, xh, xl, yh, yl) \
176 1.1 mrg (rh = xh - yh - ((rl = xl - yl) > xl))
177 1.1 mrg #endif
178 1.1 mrg #ifndef __FP_FRAC_DEC_2
179 1.1 mrg #define __FP_FRAC_DEC_2(xh, xl, yh, yl) \
180 1.1 mrg do { \
181 1.1 mrg UWtype _t = xl; \
182 1.1 mrg xh -= yh + ((xl -= yl) > _t); \
183 1.1 mrg } while (0)
184 1.1 mrg #endif
185 1.1 mrg
186 1.1 mrg #else
187 1.1 mrg
188 1.1 mrg #undef __FP_FRAC_ADDI_2
189 1.1 mrg #define __FP_FRAC_ADDI_2(xh, xl, i) add_ssaaaa(xh, xl, xh, xl, 0, i)
190 1.1 mrg #undef __FP_FRAC_ADD_2
191 1.1 mrg #define __FP_FRAC_ADD_2 add_ssaaaa
192 1.1 mrg #undef __FP_FRAC_SUB_2
193 1.1 mrg #define __FP_FRAC_SUB_2 sub_ddmmss
194 1.1 mrg #undef __FP_FRAC_DEC_2
195 1.1 mrg #define __FP_FRAC_DEC_2(xh, xl, yh, yl) sub_ddmmss(xh, xl, xh, xl, yh, yl)
196 1.1 mrg
197 1.1 mrg #endif
198 1.1 mrg
199 1.1 mrg /*
200 1.1 mrg * Unpack the raw bits of a native fp value. Do not classify or
201 1.1 mrg * normalize the data.
202 1.1 mrg */
203 1.1 mrg
204 1.1 mrg #define _FP_UNPACK_RAW_2(fs, X, val) \
205 1.1 mrg do { \
206 1.1 mrg union _FP_UNION_##fs _flo; _flo.flt = (val); \
207 1.1 mrg \
208 1.1 mrg X##_f0 = _flo.bits.frac0; \
209 1.1 mrg X##_f1 = _flo.bits.frac1; \
210 1.1 mrg X##_e = _flo.bits.exp; \
211 1.1 mrg X##_s = _flo.bits.sign; \
212 1.1 mrg } while (0)
213 1.1 mrg
214 1.1 mrg #define _FP_UNPACK_RAW_2_P(fs, X, val) \
215 1.1 mrg do { \
216 1.1 mrg union _FP_UNION_##fs *_flo = \
217 1.1 mrg (union _FP_UNION_##fs *)(val); \
218 1.1 mrg \
219 1.1 mrg X##_f0 = _flo->bits.frac0; \
220 1.1 mrg X##_f1 = _flo->bits.frac1; \
221 1.1 mrg X##_e = _flo->bits.exp; \
222 1.1 mrg X##_s = _flo->bits.sign; \
223 1.1 mrg } while (0)
224 1.1 mrg
225 1.1 mrg
226 1.1 mrg /*
227 1.1 mrg * Repack the raw bits of a native fp value.
228 1.1 mrg */
229 1.1 mrg
230 1.1 mrg #define _FP_PACK_RAW_2(fs, val, X) \
231 1.1 mrg do { \
232 1.1 mrg union _FP_UNION_##fs _flo; \
233 1.1 mrg \
234 1.1 mrg _flo.bits.frac0 = X##_f0; \
235 1.1 mrg _flo.bits.frac1 = X##_f1; \
236 1.1 mrg _flo.bits.exp = X##_e; \
237 1.1 mrg _flo.bits.sign = X##_s; \
238 1.1 mrg \
239 1.1 mrg (val) = _flo.flt; \
240 1.1 mrg } while (0)
241 1.1 mrg
242 1.1 mrg #define _FP_PACK_RAW_2_P(fs, val, X) \
243 1.1 mrg do { \
244 1.1 mrg union _FP_UNION_##fs *_flo = \
245 1.1 mrg (union _FP_UNION_##fs *)(val); \
246 1.1 mrg \
247 1.1 mrg _flo->bits.frac0 = X##_f0; \
248 1.1 mrg _flo->bits.frac1 = X##_f1; \
249 1.1 mrg _flo->bits.exp = X##_e; \
250 1.1 mrg _flo->bits.sign = X##_s; \
251 1.1 mrg } while (0)
252 1.1 mrg
253 1.1 mrg
254 1.1 mrg /*
255 1.1 mrg * Multiplication algorithms:
256 1.1 mrg */
257 1.1 mrg
258 1.1 mrg /* Given a 1W * 1W => 2W primitive, do the extended multiplication. */
259 1.1 mrg
260 1.1 mrg #define _FP_MUL_MEAT_2_wide(wfracbits, R, X, Y, doit) \
261 1.1 mrg do { \
262 1.1 mrg _FP_FRAC_DECL_4(_z); _FP_FRAC_DECL_2(_b); _FP_FRAC_DECL_2(_c); \
263 1.1 mrg \
264 1.1 mrg doit(_FP_FRAC_WORD_4(_z,1), _FP_FRAC_WORD_4(_z,0), X##_f0, Y##_f0); \
265 1.1 mrg doit(_b_f1, _b_f0, X##_f0, Y##_f1); \
266 1.1 mrg doit(_c_f1, _c_f0, X##_f1, Y##_f0); \
267 1.1 mrg doit(_FP_FRAC_WORD_4(_z,3), _FP_FRAC_WORD_4(_z,2), X##_f1, Y##_f1); \
268 1.1 mrg \
269 1.1 mrg __FP_FRAC_ADD_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2), \
270 1.1 mrg _FP_FRAC_WORD_4(_z,1), 0, _b_f1, _b_f0, \
271 1.1 mrg _FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2), \
272 1.1 mrg _FP_FRAC_WORD_4(_z,1)); \
273 1.1 mrg __FP_FRAC_ADD_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2), \
274 1.1 mrg _FP_FRAC_WORD_4(_z,1), 0, _c_f1, _c_f0, \
275 1.1 mrg _FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2), \
276 1.1 mrg _FP_FRAC_WORD_4(_z,1)); \
277 1.1 mrg \
278 1.1 mrg /* Normalize since we know where the msb of the multiplicands \
279 1.1 mrg were (bit B), we know that the msb of the of the product is \
280 1.1 mrg at either 2B or 2B-1. */ \
281 1.1 mrg _FP_FRAC_SRS_4(_z, wfracbits-1, 2*wfracbits); \
282 1.1 mrg R##_f0 = _FP_FRAC_WORD_4(_z,0); \
283 1.1 mrg R##_f1 = _FP_FRAC_WORD_4(_z,1); \
284 1.1 mrg } while (0)
285 1.1 mrg
286 1.1 mrg /* Given a 1W * 1W => 2W primitive, do the extended multiplication.
287 1.1 mrg Do only 3 multiplications instead of four. This one is for machines
288 1.1 mrg where multiplication is much more expensive than subtraction. */
289 1.1 mrg
290 1.1 mrg #define _FP_MUL_MEAT_2_wide_3mul(wfracbits, R, X, Y, doit) \
291 1.1 mrg do { \
292 1.1 mrg _FP_FRAC_DECL_4(_z); _FP_FRAC_DECL_2(_b); _FP_FRAC_DECL_2(_c); \
293 1.1 mrg _FP_W_TYPE _d; \
294 1.1 mrg int _c1, _c2; \
295 1.1 mrg \
296 1.1 mrg _b_f0 = X##_f0 + X##_f1; \
297 1.1 mrg _c1 = _b_f0 < X##_f0; \
298 1.1 mrg _b_f1 = Y##_f0 + Y##_f1; \
299 1.1 mrg _c2 = _b_f1 < Y##_f0; \
300 1.1 mrg doit(_d, _FP_FRAC_WORD_4(_z,0), X##_f0, Y##_f0); \
301 1.1 mrg doit(_FP_FRAC_WORD_4(_z,2), _FP_FRAC_WORD_4(_z,1), _b_f0, _b_f1); \
302 1.1 mrg doit(_c_f1, _c_f0, X##_f1, Y##_f1); \
303 1.1 mrg \
304 1.1 mrg _b_f0 &= -_c2; \
305 1.1 mrg _b_f1 &= -_c1; \
306 1.1 mrg __FP_FRAC_ADD_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2), \
307 1.1 mrg _FP_FRAC_WORD_4(_z,1), (_c1 & _c2), 0, _d, \
308 1.1 mrg 0, _FP_FRAC_WORD_4(_z,2), _FP_FRAC_WORD_4(_z,1)); \
309 1.1 mrg __FP_FRAC_ADDI_2(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2), \
310 1.1 mrg _b_f0); \
311 1.1 mrg __FP_FRAC_ADDI_2(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2), \
312 1.1 mrg _b_f1); \
313 1.1 mrg __FP_FRAC_DEC_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2), \
314 1.1 mrg _FP_FRAC_WORD_4(_z,1), \
315 1.1 mrg 0, _d, _FP_FRAC_WORD_4(_z,0)); \
316 1.1 mrg __FP_FRAC_DEC_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2), \
317 1.1 mrg _FP_FRAC_WORD_4(_z,1), 0, _c_f1, _c_f0); \
318 1.1 mrg __FP_FRAC_ADD_2(_FP_FRAC_WORD_4(_z,3), _FP_FRAC_WORD_4(_z,2), \
319 1.1 mrg _c_f1, _c_f0, \
320 1.1 mrg _FP_FRAC_WORD_4(_z,3), _FP_FRAC_WORD_4(_z,2)); \
321 1.1 mrg \
322 1.1 mrg /* Normalize since we know where the msb of the multiplicands \
323 1.1 mrg were (bit B), we know that the msb of the of the product is \
324 1.1 mrg at either 2B or 2B-1. */ \
325 1.1 mrg _FP_FRAC_SRS_4(_z, wfracbits-1, 2*wfracbits); \
326 1.1 mrg R##_f0 = _FP_FRAC_WORD_4(_z,0); \
327 1.1 mrg R##_f1 = _FP_FRAC_WORD_4(_z,1); \
328 1.1 mrg } while (0)
329 1.1 mrg
330 1.1 mrg #define _FP_MUL_MEAT_2_gmp(wfracbits, R, X, Y) \
331 1.1 mrg do { \
332 1.1 mrg _FP_FRAC_DECL_4(_z); \
333 1.1 mrg _FP_W_TYPE _x[2], _y[2]; \
334 1.1 mrg _x[0] = X##_f0; _x[1] = X##_f1; \
335 1.1 mrg _y[0] = Y##_f0; _y[1] = Y##_f1; \
336 1.1 mrg \
337 1.1 mrg mpn_mul_n(_z_f, _x, _y, 2); \
338 1.1 mrg \
339 1.1 mrg /* Normalize since we know where the msb of the multiplicands \
340 1.1 mrg were (bit B), we know that the msb of the of the product is \
341 1.1 mrg at either 2B or 2B-1. */ \
342 1.1 mrg _FP_FRAC_SRS_4(_z, wfracbits-1, 2*wfracbits); \
343 1.1 mrg R##_f0 = _z_f[0]; \
344 1.1 mrg R##_f1 = _z_f[1]; \
345 1.1 mrg } while (0)
346 1.1 mrg
347 1.1 mrg /* Do at most 120x120=240 bits multiplication using double floating
348 1.1 mrg point multiplication. This is useful if floating point
349 1.1 mrg multiplication has much bigger throughput than integer multiply.
350 1.1 mrg It is supposed to work for _FP_W_TYPE_SIZE 64 and wfracbits
351 1.1 mrg between 106 and 120 only.
352 1.1 mrg Caller guarantees that X and Y has (1LLL << (wfracbits - 1)) set.
353 1.1 mrg SETFETZ is a macro which will disable all FPU exceptions and set rounding
354 1.1 mrg towards zero, RESETFE should optionally reset it back. */
355 1.1 mrg
356 1.1 mrg #define _FP_MUL_MEAT_2_120_240_double(wfracbits, R, X, Y, setfetz, resetfe) \
357 1.1 mrg do { \
358 1.1 mrg static const double _const[] = { \
359 1.1 mrg /* 2^-24 */ 5.9604644775390625e-08, \
360 1.1 mrg /* 2^-48 */ 3.5527136788005009e-15, \
361 1.1 mrg /* 2^-72 */ 2.1175823681357508e-22, \
362 1.1 mrg /* 2^-96 */ 1.2621774483536189e-29, \
363 1.1 mrg /* 2^28 */ 2.68435456e+08, \
364 1.1 mrg /* 2^4 */ 1.600000e+01, \
365 1.1 mrg /* 2^-20 */ 9.5367431640625e-07, \
366 1.1 mrg /* 2^-44 */ 5.6843418860808015e-14, \
367 1.1 mrg /* 2^-68 */ 3.3881317890172014e-21, \
368 1.1 mrg /* 2^-92 */ 2.0194839173657902e-28, \
369 1.1 mrg /* 2^-116 */ 1.2037062152420224e-35}; \
370 1.1 mrg double _a240, _b240, _c240, _d240, _e240, _f240, \
371 1.1 mrg _g240, _h240, _i240, _j240, _k240; \
372 1.1 mrg union { double d; UDItype i; } _l240, _m240, _n240, _o240, \
373 1.1 mrg _p240, _q240, _r240, _s240; \
374 1.1 mrg UDItype _t240, _u240, _v240, _w240, _x240, _y240 = 0; \
375 1.1 mrg \
376 1.1 mrg if (wfracbits < 106 || wfracbits > 120) \
377 1.1 mrg abort(); \
378 1.1 mrg \
379 1.1 mrg setfetz; \
380 1.1 mrg \
381 1.1 mrg _e240 = (double)(long)(X##_f0 & 0xffffff); \
382 1.1 mrg _j240 = (double)(long)(Y##_f0 & 0xffffff); \
383 1.1 mrg _d240 = (double)(long)((X##_f0 >> 24) & 0xffffff); \
384 1.1 mrg _i240 = (double)(long)((Y##_f0 >> 24) & 0xffffff); \
385 1.1 mrg _c240 = (double)(long)(((X##_f1 << 16) & 0xffffff) | (X##_f0 >> 48)); \
386 1.1 mrg _h240 = (double)(long)(((Y##_f1 << 16) & 0xffffff) | (Y##_f0 >> 48)); \
387 1.1 mrg _b240 = (double)(long)((X##_f1 >> 8) & 0xffffff); \
388 1.1 mrg _g240 = (double)(long)((Y##_f1 >> 8) & 0xffffff); \
389 1.1 mrg _a240 = (double)(long)(X##_f1 >> 32); \
390 1.1 mrg _f240 = (double)(long)(Y##_f1 >> 32); \
391 1.1 mrg _e240 *= _const[3]; \
392 1.1 mrg _j240 *= _const[3]; \
393 1.1 mrg _d240 *= _const[2]; \
394 1.1 mrg _i240 *= _const[2]; \
395 1.1 mrg _c240 *= _const[1]; \
396 1.1 mrg _h240 *= _const[1]; \
397 1.1 mrg _b240 *= _const[0]; \
398 1.1 mrg _g240 *= _const[0]; \
399 1.1 mrg _s240.d = _e240*_j240;\
400 1.1 mrg _r240.d = _d240*_j240 + _e240*_i240;\
401 1.1 mrg _q240.d = _c240*_j240 + _d240*_i240 + _e240*_h240;\
402 1.1 mrg _p240.d = _b240*_j240 + _c240*_i240 + _d240*_h240 + _e240*_g240;\
403 1.1 mrg _o240.d = _a240*_j240 + _b240*_i240 + _c240*_h240 + _d240*_g240 + _e240*_f240;\
404 1.1 mrg _n240.d = _a240*_i240 + _b240*_h240 + _c240*_g240 + _d240*_f240; \
405 1.1 mrg _m240.d = _a240*_h240 + _b240*_g240 + _c240*_f240; \
406 1.1 mrg _l240.d = _a240*_g240 + _b240*_f240; \
407 1.1 mrg _k240 = _a240*_f240; \
408 1.1 mrg _r240.d += _s240.d; \
409 1.1 mrg _q240.d += _r240.d; \
410 1.1 mrg _p240.d += _q240.d; \
411 1.1 mrg _o240.d += _p240.d; \
412 1.1 mrg _n240.d += _o240.d; \
413 1.1 mrg _m240.d += _n240.d; \
414 1.1 mrg _l240.d += _m240.d; \
415 1.1 mrg _k240 += _l240.d; \
416 1.1 mrg _s240.d -= ((_const[10]+_s240.d)-_const[10]); \
417 1.1 mrg _r240.d -= ((_const[9]+_r240.d)-_const[9]); \
418 1.1 mrg _q240.d -= ((_const[8]+_q240.d)-_const[8]); \
419 1.1 mrg _p240.d -= ((_const[7]+_p240.d)-_const[7]); \
420 1.1 mrg _o240.d += _const[7]; \
421 1.1 mrg _n240.d += _const[6]; \
422 1.1 mrg _m240.d += _const[5]; \
423 1.1 mrg _l240.d += _const[4]; \
424 1.1 mrg if (_s240.d != 0.0) _y240 = 1; \
425 1.1 mrg if (_r240.d != 0.0) _y240 = 1; \
426 1.1 mrg if (_q240.d != 0.0) _y240 = 1; \
427 1.1 mrg if (_p240.d != 0.0) _y240 = 1; \
428 1.1 mrg _t240 = (DItype)_k240; \
429 1.1 mrg _u240 = _l240.i; \
430 1.1 mrg _v240 = _m240.i; \
431 1.1 mrg _w240 = _n240.i; \
432 1.1 mrg _x240 = _o240.i; \
433 1.1 mrg R##_f1 = (_t240 << (128 - (wfracbits - 1))) \
434 1.1 mrg | ((_u240 & 0xffffff) >> ((wfracbits - 1) - 104)); \
435 1.1 mrg R##_f0 = ((_u240 & 0xffffff) << (168 - (wfracbits - 1))) \
436 1.1 mrg | ((_v240 & 0xffffff) << (144 - (wfracbits - 1))) \
437 1.1 mrg | ((_w240 & 0xffffff) << (120 - (wfracbits - 1))) \
438 1.1 mrg | ((_x240 & 0xffffff) >> ((wfracbits - 1) - 96)) \
439 1.1 mrg | _y240; \
440 1.1 mrg resetfe; \
441 1.1 mrg } while (0)
442 1.1 mrg
443 1.1 mrg /*
444 1.1 mrg * Division algorithms:
445 1.1 mrg */
446 1.1 mrg
447 1.1 mrg #define _FP_DIV_MEAT_2_udiv(fs, R, X, Y) \
448 1.1 mrg do { \
449 1.1 mrg _FP_W_TYPE _n_f2, _n_f1, _n_f0, _r_f1, _r_f0, _m_f1, _m_f0; \
450 1.1 mrg if (_FP_FRAC_GT_2(X, Y)) \
451 1.1 mrg { \
452 1.1 mrg _n_f2 = X##_f1 >> 1; \
453 1.1 mrg _n_f1 = X##_f1 << (_FP_W_TYPE_SIZE - 1) | X##_f0 >> 1; \
454 1.1 mrg _n_f0 = X##_f0 << (_FP_W_TYPE_SIZE - 1); \
455 1.1 mrg } \
456 1.1 mrg else \
457 1.1 mrg { \
458 1.1 mrg R##_e--; \
459 1.1 mrg _n_f2 = X##_f1; \
460 1.1 mrg _n_f1 = X##_f0; \
461 1.1 mrg _n_f0 = 0; \
462 1.1 mrg } \
463 1.1 mrg \
464 1.1 mrg /* Normalize, i.e. make the most significant bit of the \
465 1.1 mrg denominator set. */ \
466 1.1 mrg _FP_FRAC_SLL_2(Y, _FP_WFRACXBITS_##fs); \
467 1.1 mrg \
468 1.1 mrg udiv_qrnnd(R##_f1, _r_f1, _n_f2, _n_f1, Y##_f1); \
469 1.1 mrg umul_ppmm(_m_f1, _m_f0, R##_f1, Y##_f0); \
470 1.1 mrg _r_f0 = _n_f0; \
471 1.1 mrg if (_FP_FRAC_GT_2(_m, _r)) \
472 1.1 mrg { \
473 1.1 mrg R##_f1--; \
474 1.1 mrg _FP_FRAC_ADD_2(_r, Y, _r); \
475 1.1 mrg if (_FP_FRAC_GE_2(_r, Y) && _FP_FRAC_GT_2(_m, _r)) \
476 1.1 mrg { \
477 1.1 mrg R##_f1--; \
478 1.1 mrg _FP_FRAC_ADD_2(_r, Y, _r); \
479 1.1 mrg } \
480 1.1 mrg } \
481 1.1 mrg _FP_FRAC_DEC_2(_r, _m); \
482 1.1 mrg \
483 1.1 mrg if (_r_f1 == Y##_f1) \
484 1.1 mrg { \
485 1.1 mrg /* This is a special case, not an optimization \
486 1.1 mrg (_r/Y##_f1 would not fit into UWtype). \
487 1.1 mrg As _r is guaranteed to be < Y, R##_f0 can be either \
488 1.1 mrg (UWtype)-1 or (UWtype)-2. But as we know what kind \
489 1.1 mrg of bits it is (sticky, guard, round), we don't care. \
490 1.1 mrg We also don't care what the reminder is, because the \
491 1.1 mrg guard bit will be set anyway. -jj */ \
492 1.1 mrg R##_f0 = -1; \
493 1.1 mrg } \
494 1.1 mrg else \
495 1.1 mrg { \
496 1.1 mrg udiv_qrnnd(R##_f0, _r_f1, _r_f1, _r_f0, Y##_f1); \
497 1.1 mrg umul_ppmm(_m_f1, _m_f0, R##_f0, Y##_f0); \
498 1.1 mrg _r_f0 = 0; \
499 1.1 mrg if (_FP_FRAC_GT_2(_m, _r)) \
500 1.1 mrg { \
501 1.1 mrg R##_f0--; \
502 1.1 mrg _FP_FRAC_ADD_2(_r, Y, _r); \
503 1.1 mrg if (_FP_FRAC_GE_2(_r, Y) && _FP_FRAC_GT_2(_m, _r)) \
504 1.1 mrg { \
505 1.1 mrg R##_f0--; \
506 1.1 mrg _FP_FRAC_ADD_2(_r, Y, _r); \
507 1.1 mrg } \
508 1.1 mrg } \
509 1.1 mrg if (!_FP_FRAC_EQ_2(_r, _m)) \
510 1.1 mrg R##_f0 |= _FP_WORK_STICKY; \
511 1.1 mrg } \
512 1.1 mrg } while (0)
513 1.1 mrg
514 1.1 mrg
515 1.1 mrg #define _FP_DIV_MEAT_2_gmp(fs, R, X, Y) \
516 1.1 mrg do { \
517 1.1 mrg _FP_W_TYPE _x[4], _y[2], _z[4]; \
518 1.1 mrg _y[0] = Y##_f0; _y[1] = Y##_f1; \
519 1.1 mrg _x[0] = _x[3] = 0; \
520 1.1 mrg if (_FP_FRAC_GT_2(X, Y)) \
521 1.1 mrg { \
522 1.1 mrg R##_e++; \
523 1.1 mrg _x[1] = (X##_f0 << (_FP_WFRACBITS_##fs-1 - _FP_W_TYPE_SIZE) | \
524 1.1 mrg X##_f1 >> (_FP_W_TYPE_SIZE - \
525 1.1 mrg (_FP_WFRACBITS_##fs-1 - _FP_W_TYPE_SIZE))); \
526 1.1 mrg _x[2] = X##_f1 << (_FP_WFRACBITS_##fs-1 - _FP_W_TYPE_SIZE); \
527 1.1 mrg } \
528 1.1 mrg else \
529 1.1 mrg { \
530 1.1 mrg _x[1] = (X##_f0 << (_FP_WFRACBITS_##fs - _FP_W_TYPE_SIZE) | \
531 1.1 mrg X##_f1 >> (_FP_W_TYPE_SIZE - \
532 1.1 mrg (_FP_WFRACBITS_##fs - _FP_W_TYPE_SIZE))); \
533 1.1 mrg _x[2] = X##_f1 << (_FP_WFRACBITS_##fs - _FP_W_TYPE_SIZE); \
534 1.1 mrg } \
535 1.1 mrg \
536 1.1 mrg (void) mpn_divrem (_z, 0, _x, 4, _y, 2); \
537 1.1 mrg R##_f1 = _z[1]; \
538 1.1 mrg R##_f0 = _z[0] | ((_x[0] | _x[1]) != 0); \
539 1.1 mrg } while (0)
540 1.1 mrg
541 1.1 mrg
542 1.1 mrg /*
543 1.1 mrg * Square root algorithms:
544 1.1 mrg * We have just one right now, maybe Newton approximation
545 1.1 mrg * should be added for those machines where division is fast.
546 1.1 mrg */
547 1.1 mrg
548 1.1 mrg #define _FP_SQRT_MEAT_2(R, S, T, X, q) \
549 1.1 mrg do { \
550 1.1 mrg while (q) \
551 1.1 mrg { \
552 1.1 mrg T##_f1 = S##_f1 + q; \
553 1.1 mrg if (T##_f1 <= X##_f1) \
554 1.1 mrg { \
555 1.1 mrg S##_f1 = T##_f1 + q; \
556 1.1 mrg X##_f1 -= T##_f1; \
557 1.1 mrg R##_f1 += q; \
558 1.1 mrg } \
559 1.1 mrg _FP_FRAC_SLL_2(X, 1); \
560 1.1 mrg q >>= 1; \
561 1.1 mrg } \
562 1.1 mrg q = (_FP_W_TYPE)1 << (_FP_W_TYPE_SIZE - 1); \
563 1.1 mrg while (q != _FP_WORK_ROUND) \
564 1.1 mrg { \
565 1.1 mrg T##_f0 = S##_f0 + q; \
566 1.1 mrg T##_f1 = S##_f1; \
567 1.1 mrg if (T##_f1 < X##_f1 || \
568 1.1 mrg (T##_f1 == X##_f1 && T##_f0 <= X##_f0)) \
569 1.1 mrg { \
570 1.1 mrg S##_f0 = T##_f0 + q; \
571 1.1 mrg S##_f1 += (T##_f0 > S##_f0); \
572 1.1 mrg _FP_FRAC_DEC_2(X, T); \
573 1.1 mrg R##_f0 += q; \
574 1.1 mrg } \
575 1.1 mrg _FP_FRAC_SLL_2(X, 1); \
576 1.1 mrg q >>= 1; \
577 1.1 mrg } \
578 1.1 mrg if (X##_f0 | X##_f1) \
579 1.1 mrg { \
580 1.1 mrg if (S##_f1 < X##_f1 || \
581 1.1 mrg (S##_f1 == X##_f1 && S##_f0 < X##_f0)) \
582 1.1 mrg R##_f0 |= _FP_WORK_ROUND; \
583 1.1 mrg R##_f0 |= _FP_WORK_STICKY; \
584 1.1 mrg } \
585 1.1 mrg } while (0)
586 1.1 mrg
587 1.1 mrg
588 1.1 mrg /*
589 1.1 mrg * Assembly/disassembly for converting to/from integral types.
590 1.1 mrg * No shifting or overflow handled here.
591 1.1 mrg */
592 1.1 mrg
593 1.1 mrg #define _FP_FRAC_ASSEMBLE_2(r, X, rsize) \
594 1.1 mrg (void)((rsize <= _FP_W_TYPE_SIZE) \
595 1.1 mrg ? ({ r = X##_f0; }) \
596 1.1 mrg : ({ \
597 1.1 mrg r = X##_f1; \
598 1.1 mrg r <<= _FP_W_TYPE_SIZE; \
599 1.1 mrg r += X##_f0; \
600 1.1 mrg }))
601 1.1 mrg
602 1.1 mrg #define _FP_FRAC_DISASSEMBLE_2(X, r, rsize) \
603 1.1 mrg do { \
604 1.1 mrg X##_f0 = r; \
605 1.1 mrg X##_f1 = (rsize <= _FP_W_TYPE_SIZE ? 0 : r >> _FP_W_TYPE_SIZE); \
606 1.1 mrg } while (0)
607 1.1 mrg
608 1.1 mrg /*
609 1.1 mrg * Convert FP values between word sizes
610 1.1 mrg */
611 1.1 mrg
612 1.1 mrg #define _FP_FRAC_COPY_1_2(D, S) (D##_f = S##_f0)
613 1.1 mrg
614 1.1 mrg #define _FP_FRAC_COPY_2_1(D, S) ((D##_f0 = S##_f), (D##_f1 = 0))
615 1.1 mrg
616 1.1 mrg #define _FP_FRAC_COPY_2_2(D,S) _FP_FRAC_COPY_2(D,S)
617