Home | History | Annotate | Line # | Download | only in soft-fp
op-2.h revision 1.1
      1  1.1  mrg /* Software floating-point emulation.
      2  1.1  mrg    Basic two-word fraction declaration and manipulation.
      3  1.1  mrg    Copyright (C) 1997,1998,1999,2006,2007 Free Software Foundation, Inc.
      4  1.1  mrg    This file is part of the GNU C Library.
      5  1.1  mrg    Contributed by Richard Henderson (rth (at) cygnus.com),
      6  1.1  mrg 		  Jakub Jelinek (jj (at) ultra.linux.cz),
      7  1.1  mrg 		  David S. Miller (davem (at) redhat.com) and
      8  1.1  mrg 		  Peter Maydell (pmaydell (at) chiark.greenend.org.uk).
      9  1.1  mrg 
     10  1.1  mrg    The GNU C Library is free software; you can redistribute it and/or
     11  1.1  mrg    modify it under the terms of the GNU Lesser General Public
     12  1.1  mrg    License as published by the Free Software Foundation; either
     13  1.1  mrg    version 2.1 of the License, or (at your option) any later version.
     14  1.1  mrg 
     15  1.1  mrg    In addition to the permissions in the GNU Lesser General Public
     16  1.1  mrg    License, the Free Software Foundation gives you unlimited
     17  1.1  mrg    permission to link the compiled version of this file into
     18  1.1  mrg    combinations with other programs, and to distribute those
     19  1.1  mrg    combinations without any restriction coming from the use of this
     20  1.1  mrg    file.  (The Lesser General Public License restrictions do apply in
     21  1.1  mrg    other respects; for example, they cover modification of the file,
     22  1.1  mrg    and distribution when not linked into a combine executable.)
     23  1.1  mrg 
     24  1.1  mrg    The GNU C Library is distributed in the hope that it will be useful,
     25  1.1  mrg    but WITHOUT ANY WARRANTY; without even the implied warranty of
     26  1.1  mrg    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     27  1.1  mrg    Lesser General Public License for more details.
     28  1.1  mrg 
     29  1.1  mrg    You should have received a copy of the GNU Lesser General Public
     30  1.1  mrg    License along with the GNU C Library; if not, see
     31  1.1  mrg    <http://www.gnu.org/licenses/>.  */
     32  1.1  mrg 
     33  1.1  mrg #define _FP_FRAC_DECL_2(X)	_FP_W_TYPE X##_f0, X##_f1
     34  1.1  mrg #define _FP_FRAC_COPY_2(D,S)	(D##_f0 = S##_f0, D##_f1 = S##_f1)
     35  1.1  mrg #define _FP_FRAC_SET_2(X,I)	__FP_FRAC_SET_2(X, I)
     36  1.1  mrg #define _FP_FRAC_HIGH_2(X)	(X##_f1)
     37  1.1  mrg #define _FP_FRAC_LOW_2(X)	(X##_f0)
     38  1.1  mrg #define _FP_FRAC_WORD_2(X,w)	(X##_f##w)
     39  1.1  mrg 
     40  1.1  mrg #define _FP_FRAC_SLL_2(X,N)						    \
     41  1.1  mrg (void)(((N) < _FP_W_TYPE_SIZE)						    \
     42  1.1  mrg        ? ({								    \
     43  1.1  mrg 	    if (__builtin_constant_p(N) && (N) == 1)			    \
     44  1.1  mrg 	      {								    \
     45  1.1  mrg 		X##_f1 = X##_f1 + X##_f1 + (((_FP_WS_TYPE)(X##_f0)) < 0);   \
     46  1.1  mrg 		X##_f0 += X##_f0;					    \
     47  1.1  mrg 	      }								    \
     48  1.1  mrg 	    else							    \
     49  1.1  mrg 	      {								    \
     50  1.1  mrg 		X##_f1 = X##_f1 << (N) | X##_f0 >> (_FP_W_TYPE_SIZE - (N)); \
     51  1.1  mrg 		X##_f0 <<= (N);						    \
     52  1.1  mrg 	      }								    \
     53  1.1  mrg 	    0;								    \
     54  1.1  mrg 	  })								    \
     55  1.1  mrg        : ({								    \
     56  1.1  mrg 	    X##_f1 = X##_f0 << ((N) - _FP_W_TYPE_SIZE);			    \
     57  1.1  mrg 	    X##_f0 = 0;							    \
     58  1.1  mrg 	  }))
     59  1.1  mrg 
     60  1.1  mrg 
     61  1.1  mrg #define _FP_FRAC_SRL_2(X,N)						\
     62  1.1  mrg (void)(((N) < _FP_W_TYPE_SIZE)						\
     63  1.1  mrg        ? ({								\
     64  1.1  mrg 	    X##_f0 = X##_f0 >> (N) | X##_f1 << (_FP_W_TYPE_SIZE - (N));	\
     65  1.1  mrg 	    X##_f1 >>= (N);						\
     66  1.1  mrg 	  })								\
     67  1.1  mrg        : ({								\
     68  1.1  mrg 	    X##_f0 = X##_f1 >> ((N) - _FP_W_TYPE_SIZE);			\
     69  1.1  mrg 	    X##_f1 = 0;							\
     70  1.1  mrg 	  }))
     71  1.1  mrg 
     72  1.1  mrg /* Right shift with sticky-lsb.  */
     73  1.1  mrg #define _FP_FRAC_SRST_2(X,S, N,sz)					  \
     74  1.1  mrg (void)(((N) < _FP_W_TYPE_SIZE)						  \
     75  1.1  mrg        ? ({								  \
     76  1.1  mrg 	    S = (__builtin_constant_p(N) && (N) == 1			  \
     77  1.1  mrg 		 ? X##_f0 & 1						  \
     78  1.1  mrg 		 : (X##_f0 << (_FP_W_TYPE_SIZE - (N))) != 0);		  \
     79  1.1  mrg 	    X##_f0 = (X##_f1 << (_FP_W_TYPE_SIZE - (N)) | X##_f0 >> (N)); \
     80  1.1  mrg 	    X##_f1 >>= (N);						  \
     81  1.1  mrg 	  })								  \
     82  1.1  mrg        : ({								  \
     83  1.1  mrg 	    S = ((((N) == _FP_W_TYPE_SIZE				  \
     84  1.1  mrg 		   ? 0							  \
     85  1.1  mrg 		   : (X##_f1 << (2*_FP_W_TYPE_SIZE - (N))))		  \
     86  1.1  mrg 		  | X##_f0) != 0);					  \
     87  1.1  mrg 	    X##_f0 = (X##_f1 >> ((N) - _FP_W_TYPE_SIZE));		  \
     88  1.1  mrg 	    X##_f1 = 0;							  \
     89  1.1  mrg 	  }))
     90  1.1  mrg 
     91  1.1  mrg #define _FP_FRAC_SRS_2(X,N,sz)						  \
     92  1.1  mrg (void)(((N) < _FP_W_TYPE_SIZE)						  \
     93  1.1  mrg        ? ({								  \
     94  1.1  mrg 	    X##_f0 = (X##_f1 << (_FP_W_TYPE_SIZE - (N)) | X##_f0 >> (N) | \
     95  1.1  mrg 		      (__builtin_constant_p(N) && (N) == 1		  \
     96  1.1  mrg 		       ? X##_f0 & 1					  \
     97  1.1  mrg 		       : (X##_f0 << (_FP_W_TYPE_SIZE - (N))) != 0));	  \
     98  1.1  mrg 	    X##_f1 >>= (N);						  \
     99  1.1  mrg 	  })								  \
    100  1.1  mrg        : ({								  \
    101  1.1  mrg 	    X##_f0 = (X##_f1 >> ((N) - _FP_W_TYPE_SIZE) |		  \
    102  1.1  mrg 		      ((((N) == _FP_W_TYPE_SIZE				  \
    103  1.1  mrg 			 ? 0						  \
    104  1.1  mrg 			 : (X##_f1 << (2*_FP_W_TYPE_SIZE - (N))))	  \
    105  1.1  mrg 			| X##_f0) != 0));				  \
    106  1.1  mrg 	    X##_f1 = 0;							  \
    107  1.1  mrg 	  }))
    108  1.1  mrg 
    109  1.1  mrg #define _FP_FRAC_ADDI_2(X,I)	\
    110  1.1  mrg   __FP_FRAC_ADDI_2(X##_f1, X##_f0, I)
    111  1.1  mrg 
    112  1.1  mrg #define _FP_FRAC_ADD_2(R,X,Y)	\
    113  1.1  mrg   __FP_FRAC_ADD_2(R##_f1, R##_f0, X##_f1, X##_f0, Y##_f1, Y##_f0)
    114  1.1  mrg 
    115  1.1  mrg #define _FP_FRAC_SUB_2(R,X,Y)	\
    116  1.1  mrg   __FP_FRAC_SUB_2(R##_f1, R##_f0, X##_f1, X##_f0, Y##_f1, Y##_f0)
    117  1.1  mrg 
    118  1.1  mrg #define _FP_FRAC_DEC_2(X,Y)	\
    119  1.1  mrg   __FP_FRAC_DEC_2(X##_f1, X##_f0, Y##_f1, Y##_f0)
    120  1.1  mrg 
    121  1.1  mrg #define _FP_FRAC_CLZ_2(R,X)	\
    122  1.1  mrg   do {				\
    123  1.1  mrg     if (X##_f1)			\
    124  1.1  mrg       __FP_CLZ(R,X##_f1);	\
    125  1.1  mrg     else 			\
    126  1.1  mrg     {				\
    127  1.1  mrg       __FP_CLZ(R,X##_f0);	\
    128  1.1  mrg       R += _FP_W_TYPE_SIZE;	\
    129  1.1  mrg     }				\
    130  1.1  mrg   } while(0)
    131  1.1  mrg 
    132  1.1  mrg /* Predicates */
    133  1.1  mrg #define _FP_FRAC_NEGP_2(X)	((_FP_WS_TYPE)X##_f1 < 0)
    134  1.1  mrg #define _FP_FRAC_ZEROP_2(X)	((X##_f1 | X##_f0) == 0)
    135  1.1  mrg #define _FP_FRAC_OVERP_2(fs,X)	(_FP_FRAC_HIGH_##fs(X) & _FP_OVERFLOW_##fs)
    136  1.1  mrg #define _FP_FRAC_CLEAR_OVERP_2(fs,X)	(_FP_FRAC_HIGH_##fs(X) &= ~_FP_OVERFLOW_##fs)
    137  1.1  mrg #define _FP_FRAC_EQ_2(X, Y)	(X##_f1 == Y##_f1 && X##_f0 == Y##_f0)
    138  1.1  mrg #define _FP_FRAC_GT_2(X, Y)	\
    139  1.1  mrg   (X##_f1 > Y##_f1 || (X##_f1 == Y##_f1 && X##_f0 > Y##_f0))
    140  1.1  mrg #define _FP_FRAC_GE_2(X, Y)	\
    141  1.1  mrg   (X##_f1 > Y##_f1 || (X##_f1 == Y##_f1 && X##_f0 >= Y##_f0))
    142  1.1  mrg 
    143  1.1  mrg #define _FP_ZEROFRAC_2		0, 0
    144  1.1  mrg #define _FP_MINFRAC_2		0, 1
    145  1.1  mrg #define _FP_MAXFRAC_2		(~(_FP_WS_TYPE)0), (~(_FP_WS_TYPE)0)
    146  1.1  mrg 
    147  1.1  mrg /*
    148  1.1  mrg  * Internals
    149  1.1  mrg  */
    150  1.1  mrg 
    151  1.1  mrg #define __FP_FRAC_SET_2(X,I1,I0)	(X##_f0 = I0, X##_f1 = I1)
    152  1.1  mrg 
    153  1.1  mrg #define __FP_CLZ_2(R, xh, xl)	\
    154  1.1  mrg   do {				\
    155  1.1  mrg     if (xh)			\
    156  1.1  mrg       __FP_CLZ(R,xh);		\
    157  1.1  mrg     else 			\
    158  1.1  mrg     {				\
    159  1.1  mrg       __FP_CLZ(R,xl);		\
    160  1.1  mrg       R += _FP_W_TYPE_SIZE;	\
    161  1.1  mrg     }				\
    162  1.1  mrg   } while(0)
    163  1.1  mrg 
    164  1.1  mrg #if 0
    165  1.1  mrg 
    166  1.1  mrg #ifndef __FP_FRAC_ADDI_2
    167  1.1  mrg #define __FP_FRAC_ADDI_2(xh, xl, i)	\
    168  1.1  mrg   (xh += ((xl += i) < i))
    169  1.1  mrg #endif
    170  1.1  mrg #ifndef __FP_FRAC_ADD_2
    171  1.1  mrg #define __FP_FRAC_ADD_2(rh, rl, xh, xl, yh, yl)	\
    172  1.1  mrg   (rh = xh + yh + ((rl = xl + yl) < xl))
    173  1.1  mrg #endif
    174  1.1  mrg #ifndef __FP_FRAC_SUB_2
    175  1.1  mrg #define __FP_FRAC_SUB_2(rh, rl, xh, xl, yh, yl)	\
    176  1.1  mrg   (rh = xh - yh - ((rl = xl - yl) > xl))
    177  1.1  mrg #endif
    178  1.1  mrg #ifndef __FP_FRAC_DEC_2
    179  1.1  mrg #define __FP_FRAC_DEC_2(xh, xl, yh, yl)	\
    180  1.1  mrg   do {					\
    181  1.1  mrg     UWtype _t = xl;			\
    182  1.1  mrg     xh -= yh + ((xl -= yl) > _t);	\
    183  1.1  mrg   } while (0)
    184  1.1  mrg #endif
    185  1.1  mrg 
    186  1.1  mrg #else
    187  1.1  mrg 
    188  1.1  mrg #undef __FP_FRAC_ADDI_2
    189  1.1  mrg #define __FP_FRAC_ADDI_2(xh, xl, i)	add_ssaaaa(xh, xl, xh, xl, 0, i)
    190  1.1  mrg #undef __FP_FRAC_ADD_2
    191  1.1  mrg #define __FP_FRAC_ADD_2			add_ssaaaa
    192  1.1  mrg #undef __FP_FRAC_SUB_2
    193  1.1  mrg #define __FP_FRAC_SUB_2			sub_ddmmss
    194  1.1  mrg #undef __FP_FRAC_DEC_2
    195  1.1  mrg #define __FP_FRAC_DEC_2(xh, xl, yh, yl)	sub_ddmmss(xh, xl, xh, xl, yh, yl)
    196  1.1  mrg 
    197  1.1  mrg #endif
    198  1.1  mrg 
    199  1.1  mrg /*
    200  1.1  mrg  * Unpack the raw bits of a native fp value.  Do not classify or
    201  1.1  mrg  * normalize the data.
    202  1.1  mrg  */
    203  1.1  mrg 
    204  1.1  mrg #define _FP_UNPACK_RAW_2(fs, X, val)			\
    205  1.1  mrg   do {							\
    206  1.1  mrg     union _FP_UNION_##fs _flo; _flo.flt = (val);	\
    207  1.1  mrg 							\
    208  1.1  mrg     X##_f0 = _flo.bits.frac0;				\
    209  1.1  mrg     X##_f1 = _flo.bits.frac1;				\
    210  1.1  mrg     X##_e  = _flo.bits.exp;				\
    211  1.1  mrg     X##_s  = _flo.bits.sign;				\
    212  1.1  mrg   } while (0)
    213  1.1  mrg 
    214  1.1  mrg #define _FP_UNPACK_RAW_2_P(fs, X, val)			\
    215  1.1  mrg   do {							\
    216  1.1  mrg     union _FP_UNION_##fs *_flo =			\
    217  1.1  mrg       (union _FP_UNION_##fs *)(val);			\
    218  1.1  mrg 							\
    219  1.1  mrg     X##_f0 = _flo->bits.frac0;				\
    220  1.1  mrg     X##_f1 = _flo->bits.frac1;				\
    221  1.1  mrg     X##_e  = _flo->bits.exp;				\
    222  1.1  mrg     X##_s  = _flo->bits.sign;				\
    223  1.1  mrg   } while (0)
    224  1.1  mrg 
    225  1.1  mrg 
    226  1.1  mrg /*
    227  1.1  mrg  * Repack the raw bits of a native fp value.
    228  1.1  mrg  */
    229  1.1  mrg 
    230  1.1  mrg #define _FP_PACK_RAW_2(fs, val, X)			\
    231  1.1  mrg   do {							\
    232  1.1  mrg     union _FP_UNION_##fs _flo;				\
    233  1.1  mrg 							\
    234  1.1  mrg     _flo.bits.frac0 = X##_f0;				\
    235  1.1  mrg     _flo.bits.frac1 = X##_f1;				\
    236  1.1  mrg     _flo.bits.exp   = X##_e;				\
    237  1.1  mrg     _flo.bits.sign  = X##_s;				\
    238  1.1  mrg 							\
    239  1.1  mrg     (val) = _flo.flt;					\
    240  1.1  mrg   } while (0)
    241  1.1  mrg 
    242  1.1  mrg #define _FP_PACK_RAW_2_P(fs, val, X)			\
    243  1.1  mrg   do {							\
    244  1.1  mrg     union _FP_UNION_##fs *_flo =			\
    245  1.1  mrg       (union _FP_UNION_##fs *)(val);			\
    246  1.1  mrg 							\
    247  1.1  mrg     _flo->bits.frac0 = X##_f0;				\
    248  1.1  mrg     _flo->bits.frac1 = X##_f1;				\
    249  1.1  mrg     _flo->bits.exp   = X##_e;				\
    250  1.1  mrg     _flo->bits.sign  = X##_s;				\
    251  1.1  mrg   } while (0)
    252  1.1  mrg 
    253  1.1  mrg 
    254  1.1  mrg /*
    255  1.1  mrg  * Multiplication algorithms:
    256  1.1  mrg  */
    257  1.1  mrg 
    258  1.1  mrg /* Given a 1W * 1W => 2W primitive, do the extended multiplication.  */
    259  1.1  mrg 
    260  1.1  mrg #define _FP_MUL_MEAT_2_wide(wfracbits, R, X, Y, doit)			\
    261  1.1  mrg   do {									\
    262  1.1  mrg     _FP_FRAC_DECL_4(_z); _FP_FRAC_DECL_2(_b); _FP_FRAC_DECL_2(_c);	\
    263  1.1  mrg 									\
    264  1.1  mrg     doit(_FP_FRAC_WORD_4(_z,1), _FP_FRAC_WORD_4(_z,0), X##_f0, Y##_f0);	\
    265  1.1  mrg     doit(_b_f1, _b_f0, X##_f0, Y##_f1);					\
    266  1.1  mrg     doit(_c_f1, _c_f0, X##_f1, Y##_f0);					\
    267  1.1  mrg     doit(_FP_FRAC_WORD_4(_z,3), _FP_FRAC_WORD_4(_z,2), X##_f1, Y##_f1);	\
    268  1.1  mrg 									\
    269  1.1  mrg     __FP_FRAC_ADD_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
    270  1.1  mrg 		    _FP_FRAC_WORD_4(_z,1), 0, _b_f1, _b_f0,		\
    271  1.1  mrg 		    _FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
    272  1.1  mrg 		    _FP_FRAC_WORD_4(_z,1));				\
    273  1.1  mrg     __FP_FRAC_ADD_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
    274  1.1  mrg 		    _FP_FRAC_WORD_4(_z,1), 0, _c_f1, _c_f0,		\
    275  1.1  mrg 		    _FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
    276  1.1  mrg 		    _FP_FRAC_WORD_4(_z,1));				\
    277  1.1  mrg 									\
    278  1.1  mrg     /* Normalize since we know where the msb of the multiplicands	\
    279  1.1  mrg        were (bit B), we know that the msb of the of the product is	\
    280  1.1  mrg        at either 2B or 2B-1.  */					\
    281  1.1  mrg     _FP_FRAC_SRS_4(_z, wfracbits-1, 2*wfracbits);			\
    282  1.1  mrg     R##_f0 = _FP_FRAC_WORD_4(_z,0);					\
    283  1.1  mrg     R##_f1 = _FP_FRAC_WORD_4(_z,1);					\
    284  1.1  mrg   } while (0)
    285  1.1  mrg 
    286  1.1  mrg /* Given a 1W * 1W => 2W primitive, do the extended multiplication.
    287  1.1  mrg    Do only 3 multiplications instead of four. This one is for machines
    288  1.1  mrg    where multiplication is much more expensive than subtraction.  */
    289  1.1  mrg 
    290  1.1  mrg #define _FP_MUL_MEAT_2_wide_3mul(wfracbits, R, X, Y, doit)		\
    291  1.1  mrg   do {									\
    292  1.1  mrg     _FP_FRAC_DECL_4(_z); _FP_FRAC_DECL_2(_b); _FP_FRAC_DECL_2(_c);	\
    293  1.1  mrg     _FP_W_TYPE _d;							\
    294  1.1  mrg     int _c1, _c2;							\
    295  1.1  mrg 									\
    296  1.1  mrg     _b_f0 = X##_f0 + X##_f1;						\
    297  1.1  mrg     _c1 = _b_f0 < X##_f0;						\
    298  1.1  mrg     _b_f1 = Y##_f0 + Y##_f1;						\
    299  1.1  mrg     _c2 = _b_f1 < Y##_f0;						\
    300  1.1  mrg     doit(_d, _FP_FRAC_WORD_4(_z,0), X##_f0, Y##_f0);			\
    301  1.1  mrg     doit(_FP_FRAC_WORD_4(_z,2), _FP_FRAC_WORD_4(_z,1), _b_f0, _b_f1);	\
    302  1.1  mrg     doit(_c_f1, _c_f0, X##_f1, Y##_f1);					\
    303  1.1  mrg 									\
    304  1.1  mrg     _b_f0 &= -_c2;							\
    305  1.1  mrg     _b_f1 &= -_c1;							\
    306  1.1  mrg     __FP_FRAC_ADD_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
    307  1.1  mrg 		    _FP_FRAC_WORD_4(_z,1), (_c1 & _c2), 0, _d,		\
    308  1.1  mrg 		    0, _FP_FRAC_WORD_4(_z,2), _FP_FRAC_WORD_4(_z,1));	\
    309  1.1  mrg     __FP_FRAC_ADDI_2(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
    310  1.1  mrg 		     _b_f0);						\
    311  1.1  mrg     __FP_FRAC_ADDI_2(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
    312  1.1  mrg 		     _b_f1);						\
    313  1.1  mrg     __FP_FRAC_DEC_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
    314  1.1  mrg 		    _FP_FRAC_WORD_4(_z,1),				\
    315  1.1  mrg 		    0, _d, _FP_FRAC_WORD_4(_z,0));			\
    316  1.1  mrg     __FP_FRAC_DEC_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
    317  1.1  mrg 		    _FP_FRAC_WORD_4(_z,1), 0, _c_f1, _c_f0);		\
    318  1.1  mrg     __FP_FRAC_ADD_2(_FP_FRAC_WORD_4(_z,3), _FP_FRAC_WORD_4(_z,2),	\
    319  1.1  mrg 		    _c_f1, _c_f0,					\
    320  1.1  mrg 		    _FP_FRAC_WORD_4(_z,3), _FP_FRAC_WORD_4(_z,2));	\
    321  1.1  mrg 									\
    322  1.1  mrg     /* Normalize since we know where the msb of the multiplicands	\
    323  1.1  mrg        were (bit B), we know that the msb of the of the product is	\
    324  1.1  mrg        at either 2B or 2B-1.  */					\
    325  1.1  mrg     _FP_FRAC_SRS_4(_z, wfracbits-1, 2*wfracbits);			\
    326  1.1  mrg     R##_f0 = _FP_FRAC_WORD_4(_z,0);					\
    327  1.1  mrg     R##_f1 = _FP_FRAC_WORD_4(_z,1);					\
    328  1.1  mrg   } while (0)
    329  1.1  mrg 
    330  1.1  mrg #define _FP_MUL_MEAT_2_gmp(wfracbits, R, X, Y)				\
    331  1.1  mrg   do {									\
    332  1.1  mrg     _FP_FRAC_DECL_4(_z);						\
    333  1.1  mrg     _FP_W_TYPE _x[2], _y[2];						\
    334  1.1  mrg     _x[0] = X##_f0; _x[1] = X##_f1;					\
    335  1.1  mrg     _y[0] = Y##_f0; _y[1] = Y##_f1;					\
    336  1.1  mrg 									\
    337  1.1  mrg     mpn_mul_n(_z_f, _x, _y, 2);						\
    338  1.1  mrg 									\
    339  1.1  mrg     /* Normalize since we know where the msb of the multiplicands	\
    340  1.1  mrg        were (bit B), we know that the msb of the of the product is	\
    341  1.1  mrg        at either 2B or 2B-1.  */					\
    342  1.1  mrg     _FP_FRAC_SRS_4(_z, wfracbits-1, 2*wfracbits);			\
    343  1.1  mrg     R##_f0 = _z_f[0];							\
    344  1.1  mrg     R##_f1 = _z_f[1];							\
    345  1.1  mrg   } while (0)
    346  1.1  mrg 
    347  1.1  mrg /* Do at most 120x120=240 bits multiplication using double floating
    348  1.1  mrg    point multiplication.  This is useful if floating point
    349  1.1  mrg    multiplication has much bigger throughput than integer multiply.
    350  1.1  mrg    It is supposed to work for _FP_W_TYPE_SIZE 64 and wfracbits
    351  1.1  mrg    between 106 and 120 only.
    352  1.1  mrg    Caller guarantees that X and Y has (1LLL << (wfracbits - 1)) set.
    353  1.1  mrg    SETFETZ is a macro which will disable all FPU exceptions and set rounding
    354  1.1  mrg    towards zero,  RESETFE should optionally reset it back.  */
    355  1.1  mrg 
    356  1.1  mrg #define _FP_MUL_MEAT_2_120_240_double(wfracbits, R, X, Y, setfetz, resetfe)	\
    357  1.1  mrg   do {										\
    358  1.1  mrg     static const double _const[] = {						\
    359  1.1  mrg       /* 2^-24 */ 5.9604644775390625e-08,					\
    360  1.1  mrg       /* 2^-48 */ 3.5527136788005009e-15,					\
    361  1.1  mrg       /* 2^-72 */ 2.1175823681357508e-22,					\
    362  1.1  mrg       /* 2^-96 */ 1.2621774483536189e-29,					\
    363  1.1  mrg       /* 2^28 */ 2.68435456e+08,						\
    364  1.1  mrg       /* 2^4 */ 1.600000e+01,							\
    365  1.1  mrg       /* 2^-20 */ 9.5367431640625e-07,						\
    366  1.1  mrg       /* 2^-44 */ 5.6843418860808015e-14,					\
    367  1.1  mrg       /* 2^-68 */ 3.3881317890172014e-21,					\
    368  1.1  mrg       /* 2^-92 */ 2.0194839173657902e-28,					\
    369  1.1  mrg       /* 2^-116 */ 1.2037062152420224e-35};					\
    370  1.1  mrg     double _a240, _b240, _c240, _d240, _e240, _f240, 				\
    371  1.1  mrg 	   _g240, _h240, _i240, _j240, _k240;					\
    372  1.1  mrg     union { double d; UDItype i; } _l240, _m240, _n240, _o240,			\
    373  1.1  mrg 				   _p240, _q240, _r240, _s240;			\
    374  1.1  mrg     UDItype _t240, _u240, _v240, _w240, _x240, _y240 = 0;			\
    375  1.1  mrg 										\
    376  1.1  mrg     if (wfracbits < 106 || wfracbits > 120)					\
    377  1.1  mrg       abort();									\
    378  1.1  mrg 										\
    379  1.1  mrg     setfetz;									\
    380  1.1  mrg 										\
    381  1.1  mrg     _e240 = (double)(long)(X##_f0 & 0xffffff);					\
    382  1.1  mrg     _j240 = (double)(long)(Y##_f0 & 0xffffff);					\
    383  1.1  mrg     _d240 = (double)(long)((X##_f0 >> 24) & 0xffffff);				\
    384  1.1  mrg     _i240 = (double)(long)((Y##_f0 >> 24) & 0xffffff);				\
    385  1.1  mrg     _c240 = (double)(long)(((X##_f1 << 16) & 0xffffff) | (X##_f0 >> 48));	\
    386  1.1  mrg     _h240 = (double)(long)(((Y##_f1 << 16) & 0xffffff) | (Y##_f0 >> 48));	\
    387  1.1  mrg     _b240 = (double)(long)((X##_f1 >> 8) & 0xffffff);				\
    388  1.1  mrg     _g240 = (double)(long)((Y##_f1 >> 8) & 0xffffff);				\
    389  1.1  mrg     _a240 = (double)(long)(X##_f1 >> 32);					\
    390  1.1  mrg     _f240 = (double)(long)(Y##_f1 >> 32);					\
    391  1.1  mrg     _e240 *= _const[3];								\
    392  1.1  mrg     _j240 *= _const[3];								\
    393  1.1  mrg     _d240 *= _const[2];								\
    394  1.1  mrg     _i240 *= _const[2];								\
    395  1.1  mrg     _c240 *= _const[1];								\
    396  1.1  mrg     _h240 *= _const[1];								\
    397  1.1  mrg     _b240 *= _const[0];								\
    398  1.1  mrg     _g240 *= _const[0];								\
    399  1.1  mrg     _s240.d =							      _e240*_j240;\
    400  1.1  mrg     _r240.d =						_d240*_j240 + _e240*_i240;\
    401  1.1  mrg     _q240.d =				  _c240*_j240 + _d240*_i240 + _e240*_h240;\
    402  1.1  mrg     _p240.d =		    _b240*_j240 + _c240*_i240 + _d240*_h240 + _e240*_g240;\
    403  1.1  mrg     _o240.d = _a240*_j240 + _b240*_i240 + _c240*_h240 + _d240*_g240 + _e240*_f240;\
    404  1.1  mrg     _n240.d = _a240*_i240 + _b240*_h240 + _c240*_g240 + _d240*_f240;		\
    405  1.1  mrg     _m240.d = _a240*_h240 + _b240*_g240 + _c240*_f240;				\
    406  1.1  mrg     _l240.d = _a240*_g240 + _b240*_f240;					\
    407  1.1  mrg     _k240 =   _a240*_f240;							\
    408  1.1  mrg     _r240.d += _s240.d;								\
    409  1.1  mrg     _q240.d += _r240.d;								\
    410  1.1  mrg     _p240.d += _q240.d;								\
    411  1.1  mrg     _o240.d += _p240.d;								\
    412  1.1  mrg     _n240.d += _o240.d;								\
    413  1.1  mrg     _m240.d += _n240.d;								\
    414  1.1  mrg     _l240.d += _m240.d;								\
    415  1.1  mrg     _k240 += _l240.d;								\
    416  1.1  mrg     _s240.d -= ((_const[10]+_s240.d)-_const[10]);				\
    417  1.1  mrg     _r240.d -= ((_const[9]+_r240.d)-_const[9]);					\
    418  1.1  mrg     _q240.d -= ((_const[8]+_q240.d)-_const[8]);					\
    419  1.1  mrg     _p240.d -= ((_const[7]+_p240.d)-_const[7]);					\
    420  1.1  mrg     _o240.d += _const[7];							\
    421  1.1  mrg     _n240.d += _const[6];							\
    422  1.1  mrg     _m240.d += _const[5];							\
    423  1.1  mrg     _l240.d += _const[4];							\
    424  1.1  mrg     if (_s240.d != 0.0) _y240 = 1;						\
    425  1.1  mrg     if (_r240.d != 0.0) _y240 = 1;						\
    426  1.1  mrg     if (_q240.d != 0.0) _y240 = 1;						\
    427  1.1  mrg     if (_p240.d != 0.0) _y240 = 1;						\
    428  1.1  mrg     _t240 = (DItype)_k240;							\
    429  1.1  mrg     _u240 = _l240.i;								\
    430  1.1  mrg     _v240 = _m240.i;								\
    431  1.1  mrg     _w240 = _n240.i;								\
    432  1.1  mrg     _x240 = _o240.i;								\
    433  1.1  mrg     R##_f1 = (_t240 << (128 - (wfracbits - 1)))					\
    434  1.1  mrg 	     | ((_u240 & 0xffffff) >> ((wfracbits - 1) - 104));			\
    435  1.1  mrg     R##_f0 = ((_u240 & 0xffffff) << (168 - (wfracbits - 1)))			\
    436  1.1  mrg     	     | ((_v240 & 0xffffff) << (144 - (wfracbits - 1)))			\
    437  1.1  mrg     	     | ((_w240 & 0xffffff) << (120 - (wfracbits - 1)))			\
    438  1.1  mrg     	     | ((_x240 & 0xffffff) >> ((wfracbits - 1) - 96))			\
    439  1.1  mrg     	     | _y240;								\
    440  1.1  mrg     resetfe;									\
    441  1.1  mrg   } while (0)
    442  1.1  mrg 
    443  1.1  mrg /*
    444  1.1  mrg  * Division algorithms:
    445  1.1  mrg  */
    446  1.1  mrg 
    447  1.1  mrg #define _FP_DIV_MEAT_2_udiv(fs, R, X, Y)				\
    448  1.1  mrg   do {									\
    449  1.1  mrg     _FP_W_TYPE _n_f2, _n_f1, _n_f0, _r_f1, _r_f0, _m_f1, _m_f0;		\
    450  1.1  mrg     if (_FP_FRAC_GT_2(X, Y))						\
    451  1.1  mrg       {									\
    452  1.1  mrg 	_n_f2 = X##_f1 >> 1;						\
    453  1.1  mrg 	_n_f1 = X##_f1 << (_FP_W_TYPE_SIZE - 1) | X##_f0 >> 1;		\
    454  1.1  mrg 	_n_f0 = X##_f0 << (_FP_W_TYPE_SIZE - 1);			\
    455  1.1  mrg       }									\
    456  1.1  mrg     else								\
    457  1.1  mrg       {									\
    458  1.1  mrg 	R##_e--;							\
    459  1.1  mrg 	_n_f2 = X##_f1;							\
    460  1.1  mrg 	_n_f1 = X##_f0;							\
    461  1.1  mrg 	_n_f0 = 0;							\
    462  1.1  mrg       }									\
    463  1.1  mrg 									\
    464  1.1  mrg     /* Normalize, i.e. make the most significant bit of the 		\
    465  1.1  mrg        denominator set. */						\
    466  1.1  mrg     _FP_FRAC_SLL_2(Y, _FP_WFRACXBITS_##fs);				\
    467  1.1  mrg 									\
    468  1.1  mrg     udiv_qrnnd(R##_f1, _r_f1, _n_f2, _n_f1, Y##_f1);			\
    469  1.1  mrg     umul_ppmm(_m_f1, _m_f0, R##_f1, Y##_f0);				\
    470  1.1  mrg     _r_f0 = _n_f0;							\
    471  1.1  mrg     if (_FP_FRAC_GT_2(_m, _r))						\
    472  1.1  mrg       {									\
    473  1.1  mrg 	R##_f1--;							\
    474  1.1  mrg 	_FP_FRAC_ADD_2(_r, Y, _r);					\
    475  1.1  mrg 	if (_FP_FRAC_GE_2(_r, Y) && _FP_FRAC_GT_2(_m, _r))		\
    476  1.1  mrg 	  {								\
    477  1.1  mrg 	    R##_f1--;							\
    478  1.1  mrg 	    _FP_FRAC_ADD_2(_r, Y, _r);					\
    479  1.1  mrg 	  }								\
    480  1.1  mrg       }									\
    481  1.1  mrg     _FP_FRAC_DEC_2(_r, _m);						\
    482  1.1  mrg 									\
    483  1.1  mrg     if (_r_f1 == Y##_f1)						\
    484  1.1  mrg       {									\
    485  1.1  mrg 	/* This is a special case, not an optimization			\
    486  1.1  mrg 	   (_r/Y##_f1 would not fit into UWtype).			\
    487  1.1  mrg 	   As _r is guaranteed to be < Y,  R##_f0 can be either		\
    488  1.1  mrg 	   (UWtype)-1 or (UWtype)-2.  But as we know what kind		\
    489  1.1  mrg 	   of bits it is (sticky, guard, round),  we don't care.	\
    490  1.1  mrg 	   We also don't care what the reminder is,  because the	\
    491  1.1  mrg 	   guard bit will be set anyway.  -jj */			\
    492  1.1  mrg 	R##_f0 = -1;							\
    493  1.1  mrg       }									\
    494  1.1  mrg     else								\
    495  1.1  mrg       {									\
    496  1.1  mrg 	udiv_qrnnd(R##_f0, _r_f1, _r_f1, _r_f0, Y##_f1);		\
    497  1.1  mrg 	umul_ppmm(_m_f1, _m_f0, R##_f0, Y##_f0);			\
    498  1.1  mrg 	_r_f0 = 0;							\
    499  1.1  mrg 	if (_FP_FRAC_GT_2(_m, _r))					\
    500  1.1  mrg 	  {								\
    501  1.1  mrg 	    R##_f0--;							\
    502  1.1  mrg 	    _FP_FRAC_ADD_2(_r, Y, _r);					\
    503  1.1  mrg 	    if (_FP_FRAC_GE_2(_r, Y) && _FP_FRAC_GT_2(_m, _r))		\
    504  1.1  mrg 	      {								\
    505  1.1  mrg 		R##_f0--;						\
    506  1.1  mrg 		_FP_FRAC_ADD_2(_r, Y, _r);				\
    507  1.1  mrg 	      }								\
    508  1.1  mrg 	  }								\
    509  1.1  mrg 	if (!_FP_FRAC_EQ_2(_r, _m))					\
    510  1.1  mrg 	  R##_f0 |= _FP_WORK_STICKY;					\
    511  1.1  mrg       }									\
    512  1.1  mrg   } while (0)
    513  1.1  mrg 
    514  1.1  mrg 
    515  1.1  mrg #define _FP_DIV_MEAT_2_gmp(fs, R, X, Y)					\
    516  1.1  mrg   do {									\
    517  1.1  mrg     _FP_W_TYPE _x[4], _y[2], _z[4];					\
    518  1.1  mrg     _y[0] = Y##_f0; _y[1] = Y##_f1;					\
    519  1.1  mrg     _x[0] = _x[3] = 0;							\
    520  1.1  mrg     if (_FP_FRAC_GT_2(X, Y))						\
    521  1.1  mrg       {									\
    522  1.1  mrg 	R##_e++;							\
    523  1.1  mrg 	_x[1] = (X##_f0 << (_FP_WFRACBITS_##fs-1 - _FP_W_TYPE_SIZE) |	\
    524  1.1  mrg 		 X##_f1 >> (_FP_W_TYPE_SIZE -				\
    525  1.1  mrg 			    (_FP_WFRACBITS_##fs-1 - _FP_W_TYPE_SIZE)));	\
    526  1.1  mrg 	_x[2] = X##_f1 << (_FP_WFRACBITS_##fs-1 - _FP_W_TYPE_SIZE);	\
    527  1.1  mrg       }									\
    528  1.1  mrg     else								\
    529  1.1  mrg       {									\
    530  1.1  mrg 	_x[1] = (X##_f0 << (_FP_WFRACBITS_##fs - _FP_W_TYPE_SIZE) |	\
    531  1.1  mrg 		 X##_f1 >> (_FP_W_TYPE_SIZE -				\
    532  1.1  mrg 			    (_FP_WFRACBITS_##fs - _FP_W_TYPE_SIZE)));	\
    533  1.1  mrg 	_x[2] = X##_f1 << (_FP_WFRACBITS_##fs - _FP_W_TYPE_SIZE);	\
    534  1.1  mrg       }									\
    535  1.1  mrg 									\
    536  1.1  mrg     (void) mpn_divrem (_z, 0, _x, 4, _y, 2);				\
    537  1.1  mrg     R##_f1 = _z[1];							\
    538  1.1  mrg     R##_f0 = _z[0] | ((_x[0] | _x[1]) != 0);				\
    539  1.1  mrg   } while (0)
    540  1.1  mrg 
    541  1.1  mrg 
    542  1.1  mrg /*
    543  1.1  mrg  * Square root algorithms:
    544  1.1  mrg  * We have just one right now, maybe Newton approximation
    545  1.1  mrg  * should be added for those machines where division is fast.
    546  1.1  mrg  */
    547  1.1  mrg 
    548  1.1  mrg #define _FP_SQRT_MEAT_2(R, S, T, X, q)			\
    549  1.1  mrg   do {							\
    550  1.1  mrg     while (q)						\
    551  1.1  mrg       {							\
    552  1.1  mrg 	T##_f1 = S##_f1 + q;				\
    553  1.1  mrg 	if (T##_f1 <= X##_f1)				\
    554  1.1  mrg 	  {						\
    555  1.1  mrg 	    S##_f1 = T##_f1 + q;			\
    556  1.1  mrg 	    X##_f1 -= T##_f1;				\
    557  1.1  mrg 	    R##_f1 += q;				\
    558  1.1  mrg 	  }						\
    559  1.1  mrg 	_FP_FRAC_SLL_2(X, 1);				\
    560  1.1  mrg 	q >>= 1;					\
    561  1.1  mrg       }							\
    562  1.1  mrg     q = (_FP_W_TYPE)1 << (_FP_W_TYPE_SIZE - 1);		\
    563  1.1  mrg     while (q != _FP_WORK_ROUND)				\
    564  1.1  mrg       {							\
    565  1.1  mrg 	T##_f0 = S##_f0 + q;				\
    566  1.1  mrg 	T##_f1 = S##_f1;				\
    567  1.1  mrg 	if (T##_f1 < X##_f1 || 				\
    568  1.1  mrg 	    (T##_f1 == X##_f1 && T##_f0 <= X##_f0))	\
    569  1.1  mrg 	  {						\
    570  1.1  mrg 	    S##_f0 = T##_f0 + q;			\
    571  1.1  mrg 	    S##_f1 += (T##_f0 > S##_f0);		\
    572  1.1  mrg 	    _FP_FRAC_DEC_2(X, T);			\
    573  1.1  mrg 	    R##_f0 += q;				\
    574  1.1  mrg 	  }						\
    575  1.1  mrg 	_FP_FRAC_SLL_2(X, 1);				\
    576  1.1  mrg 	q >>= 1;					\
    577  1.1  mrg       }							\
    578  1.1  mrg     if (X##_f0 | X##_f1)				\
    579  1.1  mrg       {							\
    580  1.1  mrg 	if (S##_f1 < X##_f1 || 				\
    581  1.1  mrg 	    (S##_f1 == X##_f1 && S##_f0 < X##_f0))	\
    582  1.1  mrg 	  R##_f0 |= _FP_WORK_ROUND;			\
    583  1.1  mrg 	R##_f0 |= _FP_WORK_STICKY;			\
    584  1.1  mrg       }							\
    585  1.1  mrg   } while (0)
    586  1.1  mrg 
    587  1.1  mrg 
    588  1.1  mrg /*
    589  1.1  mrg  * Assembly/disassembly for converting to/from integral types.
    590  1.1  mrg  * No shifting or overflow handled here.
    591  1.1  mrg  */
    592  1.1  mrg 
    593  1.1  mrg #define _FP_FRAC_ASSEMBLE_2(r, X, rsize)	\
    594  1.1  mrg (void)((rsize <= _FP_W_TYPE_SIZE)		\
    595  1.1  mrg        ? ({ r = X##_f0; })			\
    596  1.1  mrg        : ({					\
    597  1.1  mrg 	    r = X##_f1;				\
    598  1.1  mrg 	    r <<= _FP_W_TYPE_SIZE;		\
    599  1.1  mrg 	    r += X##_f0;			\
    600  1.1  mrg 	  }))
    601  1.1  mrg 
    602  1.1  mrg #define _FP_FRAC_DISASSEMBLE_2(X, r, rsize)				\
    603  1.1  mrg   do {									\
    604  1.1  mrg     X##_f0 = r;								\
    605  1.1  mrg     X##_f1 = (rsize <= _FP_W_TYPE_SIZE ? 0 : r >> _FP_W_TYPE_SIZE);	\
    606  1.1  mrg   } while (0)
    607  1.1  mrg 
    608  1.1  mrg /*
    609  1.1  mrg  * Convert FP values between word sizes
    610  1.1  mrg  */
    611  1.1  mrg 
    612  1.1  mrg #define _FP_FRAC_COPY_1_2(D, S)		(D##_f = S##_f0)
    613  1.1  mrg 
    614  1.1  mrg #define _FP_FRAC_COPY_2_1(D, S)		((D##_f0 = S##_f), (D##_f1 = 0))
    615  1.1  mrg 
    616  1.1  mrg #define _FP_FRAC_COPY_2_2(D,S)		_FP_FRAC_COPY_2(D,S)
    617