Home | History | Annotate | Line # | Download | only in ia64
lib1funcs.S revision 1.1
      1  1.1  mrg /* Copyright (C) 2000-2013 Free Software Foundation, Inc.
      2  1.1  mrg    Contributed by James E. Wilson <wilson (at) cygnus.com>.
      3  1.1  mrg 
      4  1.1  mrg    This file is part of GCC.
      5  1.1  mrg 
      6  1.1  mrg    GCC is free software; you can redistribute it and/or modify
      7  1.1  mrg    it under the terms of the GNU General Public License as published by
      8  1.1  mrg    the Free Software Foundation; either version 3, or (at your option)
      9  1.1  mrg    any later version.
     10  1.1  mrg 
     11  1.1  mrg    GCC is distributed in the hope that it will be useful,
     12  1.1  mrg    but WITHOUT ANY WARRANTY; without even the implied warranty of
     13  1.1  mrg    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     14  1.1  mrg    GNU General Public License for more details.
     15  1.1  mrg 
     16  1.1  mrg    Under Section 7 of GPL version 3, you are granted additional
     17  1.1  mrg    permissions described in the GCC Runtime Library Exception, version
     18  1.1  mrg    3.1, as published by the Free Software Foundation.
     19  1.1  mrg 
     20  1.1  mrg    You should have received a copy of the GNU General Public License and
     21  1.1  mrg    a copy of the GCC Runtime Library Exception along with this program;
     22  1.1  mrg    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
     23  1.1  mrg    <http://www.gnu.org/licenses/>.  */
     24  1.1  mrg 
     25  1.1  mrg #ifdef L__divxf3
     26  1.1  mrg // Compute an 80-bit IEEE double-extended quotient: fret0 = farg0 / farg1.
     27  1.1  mrg //
     28  1.1  mrg // From the Intel IA-64 Optimization Guide, choose the minimum latency
     29  1.1  mrg // alternative.
     30  1.1  mrg //
     31  1.1  mrg // farg0 holds the dividend (a).  farg1 holds the divisor (b).
     32  1.1  mrg //
     33  1.1  mrg // __divtf3 is an alternate symbol name for backward compatibility.
     34  1.1  mrg 
     35  1.1  mrg 	.text
     36  1.1  mrg 	.align 16
     37  1.1  mrg 	.global __divxf3
     38  1.1  mrg 	.proc __divxf3
     39  1.1  mrg __divxf3:
     40  1.1  mrg #ifdef SHARED
     41  1.1  mrg 	.global __divtf3
     42  1.1  mrg __divtf3:
     43  1.1  mrg #endif
     44  1.1  mrg 	cmp.eq p7, p0 = r0, r0	// p7 = 1: assume the frcpa value is returned unrefined
     45  1.1  mrg 	frcpa.s0 f10, p6 = farg0, farg1	// f10 = y0 (reciprocal estimate), p6 = refinement needed
     46  1.1  mrg 	;;
     47  1.1  mrg (p6)	cmp.ne p7, p0 = r0, r0	// p6 set: clear p7, so p7 == !p6 from here on
     48  1.1  mrg 	.pred.rel.mutex p6, p7	// tell the assembler p6 and p7 are mutually exclusive
     49  1.1  mrg (p6)	fnma.s1 f11 = farg1, f10, f1	// e0 = 1 - b*y0
     50  1.1  mrg (p6)	fma.s1 f12 = farg0, f10, f0	// q0 = a*y0
     51  1.1  mrg 	;;
     52  1.1  mrg (p6)	fma.s1 f13 = f11, f11, f0	// f13 = e0^2
     53  1.1  mrg (p6)	fma.s1 f14 = f11, f11, f11	// f14 = e0^2 + e0
     54  1.1  mrg 	;;
     55  1.1  mrg (p6)	fma.s1 f11 = f13, f13, f11	// f11 = e0^4 + e0
     56  1.1  mrg (p6)	fma.s1 f13 = f14, f10, f10	// y1 = y0 * (1 + e0 + e0^2)
     57  1.1  mrg 	;;
     58  1.1  mrg (p6)	fma.s1 f10 = f13, f11, f10	// y2 = y0 + y1 * (e0 + e0^4)
     59  1.1  mrg (p6)	fnma.s1 f11 = farg1, f12, farg0	// r0 = a - b*q0
     60  1.1  mrg 	;;
     61  1.1  mrg (p6)	fma.s1 f11 = f11, f10, f12	// q1 = q0 + r0*y2
     62  1.1  mrg (p6)	fnma.s1 f12 = farg1, f10, f1	// e2 = 1 - b*y2
     63  1.1  mrg 	;;
     64  1.1  mrg (p6)	fma.s1 f10 = f12, f10, f10	// y3 = y2 + e2*y2
     65  1.1  mrg (p6)	fnma.s1 f12 = farg1, f11, farg0	// r1 = a - b*q1
     66  1.1  mrg 	;;
     67  1.1  mrg (p6)	fma.s0 fret0 = f12, f10, f11	// result = q1 + r1*y3, final rounding under status field 0
     68  1.1  mrg (p7)	mov fret0 = f10	// p6 was clear: return the frcpa value as is
     69  1.1  mrg 	br.ret.sptk rp
     70  1.1  mrg 	.endp __divxf3
     71  1.1  mrg #endif
     72  1.1  mrg 
     73  1.1  mrg #ifdef L__divdf3
     74  1.1  mrg // Compute a 64-bit IEEE double quotient: fret0 = farg0 / farg1.
     75  1.1  mrg //
     76  1.1  mrg // From the Intel IA-64 Optimization Guide, choose the minimum latency
     77  1.1  mrg // alternative.
     78  1.1  mrg //
     79  1.1  mrg // farg0 holds the dividend (a).  farg1 holds the divisor (b).
     80  1.1  mrg 
     81  1.1  mrg 	.text
     82  1.1  mrg 	.align 16
     83  1.1  mrg 	.global __divdf3
     84  1.1  mrg 	.proc __divdf3
     85  1.1  mrg __divdf3:
     86  1.1  mrg 	cmp.eq p7, p0 = r0, r0	// p7 = 1: assume the frcpa value is returned unrefined
     87  1.1  mrg 	frcpa.s0 f10, p6 = farg0, farg1	// f10 = y0 (reciprocal estimate), p6 = refinement needed
     88  1.1  mrg 	;;
     89  1.1  mrg (p6)	cmp.ne p7, p0 = r0, r0	// p6 set: clear p7, so p7 == !p6 from here on
     90  1.1  mrg 	.pred.rel.mutex p6, p7	// tell the assembler p6 and p7 are mutually exclusive
     91  1.1  mrg (p6)	fmpy.s1 f11 = farg0, f10	// q0 = a*y0
     92  1.1  mrg (p6)	fnma.s1 f12 = farg1, f10, f1	// e0 = 1 - b*y0
     93  1.1  mrg 	;;
     94  1.1  mrg (p6)	fma.s1 f11 = f12, f11, f11	// q1 = q0 + e0*q0
     95  1.1  mrg (p6)	fmpy.s1 f13 = f12, f12	// e1 = e0^2
     96  1.1  mrg 	;;
     97  1.1  mrg (p6)	fma.s1 f10 = f12, f10, f10	// y1 = y0 + e0*y0
     98  1.1  mrg (p6)	fma.s1 f11 = f13, f11, f11	// q2 = q1 + e1*q1
     99  1.1  mrg 	;;
    100  1.1  mrg (p6)	fmpy.s1 f12 = f13, f13	// e2 = e1^2
    101  1.1  mrg (p6)	fma.s1 f10 = f13, f10, f10	// y2 = y1 + e1*y1
    102  1.1  mrg 	;;
    103  1.1  mrg (p6)	fma.d.s1 f11 = f12, f11, f11	// q3 = q2 + e2*q2, rounded to double precision
    104  1.1  mrg (p6)	fma.s1 f10 = f12, f10, f10	// y3 = y2 + e2*y2
    105  1.1  mrg 	;;
    106  1.1  mrg (p6)	fnma.d.s1 f8 = farg1, f11, farg0	// r = a - b*q3, kept in f8
    107  1.1  mrg 	;;
    108  1.1  mrg (p6)	fma.d fret0 = f8, f10, f11	// result = q3 + r*y3, rounded to double precision
    109  1.1  mrg (p7)	mov fret0 = f10	// p6 was clear: return the frcpa value as is
    110  1.1  mrg 	br.ret.sptk rp
    111  1.1  mrg 	;;
    112  1.1  mrg 	.endp __divdf3
    113  1.1  mrg #endif
    114  1.1  mrg 
    115  1.1  mrg #ifdef L__divsf3
    116  1.1  mrg // Compute a 32-bit IEEE float quotient: fret0 = farg0 / farg1.
    117  1.1  mrg //
    118  1.1  mrg // From the Intel IA-64 Optimization Guide, choose the minimum latency
    119  1.1  mrg // alternative.
    120  1.1  mrg //
    121  1.1  mrg // farg0 holds the dividend (a).  farg1 holds the divisor (b).
    122  1.1  mrg 
    123  1.1  mrg 	.text
    124  1.1  mrg 	.align 16
    125  1.1  mrg 	.global __divsf3
    126  1.1  mrg 	.proc __divsf3
    127  1.1  mrg __divsf3:
    128  1.1  mrg 	cmp.eq p7, p0 = r0, r0	// p7 = 1: assume the frcpa value is returned unrefined
    129  1.1  mrg 	frcpa.s0 f10, p6 = farg0, farg1	// f10 = y0 (reciprocal estimate), p6 = refinement needed
    130  1.1  mrg 	;;
    131  1.1  mrg (p6)	cmp.ne p7, p0 = r0, r0	// p6 set: clear p7, so p7 == !p6 from here on
    132  1.1  mrg 	.pred.rel.mutex p6, p7	// tell the assembler p6 and p7 are mutually exclusive
    133  1.1  mrg (p6)	fmpy.s1 f8 = farg0, f10	// q0 = a*y0
    134  1.1  mrg (p6)	fnma.s1 f9 = farg1, f10, f1	// e0 = 1 - b*y0
    135  1.1  mrg 	;;
    136  1.1  mrg (p6)	fma.s1 f8 = f9, f8, f8	// q1 = q0 + e0*q0
    137  1.1  mrg (p6)	fmpy.s1 f9 = f9, f9	// e1 = e0^2
    138  1.1  mrg 	;;
    139  1.1  mrg (p6)	fma.s1 f8 = f9, f8, f8	// q2 = q1 + e1*q1
    140  1.1  mrg (p6)	fmpy.s1 f9 = f9, f9	// e2 = e1^2
    141  1.1  mrg 	;;
    142  1.1  mrg (p6)	fma.d.s1 f10 = f9, f8, f8	// q3 = q2 + e2*q2, rounded to double precision
    143  1.1  mrg 	;;
    144  1.1  mrg (p6)	fnorm.s.s0 fret0 = f10	// final rounding of q3 to single precision
    145  1.1  mrg (p7)	mov fret0 = f10	// p6 was clear: return the frcpa value as is
    146  1.1  mrg 	br.ret.sptk rp
    147  1.1  mrg 	;;
    148  1.1  mrg 	.endp __divsf3
    149  1.1  mrg #endif
    150  1.1  mrg 
    151  1.1  mrg #ifdef L__divdi3
    152  1.1  mrg // Compute a 64-bit signed integer quotient: ret0 = in0 / in1, truncated.
    153  1.1  mrg //
    154  1.1  mrg // From the Intel IA-64 Optimization Guide, choose the minimum latency
    155  1.1  mrg // alternative.
    156  1.1  mrg //
    157  1.1  mrg // in0 holds the dividend (a).  in1 holds the divisor (b).
    158  1.1  mrg 
    159  1.1  mrg 	.text
    160  1.1  mrg 	.align 16
    161  1.1  mrg 	.global __divdi3
    162  1.1  mrg 	.proc __divdi3
    163  1.1  mrg __divdi3:
    164  1.1  mrg 	.regstk 2,0,0,0
    165  1.1  mrg 	// Transfer inputs to FP registers.
    166  1.1  mrg 	setf.sig f8 = in0
    167  1.1  mrg 	setf.sig f9 = in1
    168  1.1  mrg 	// Check divide by zero.
    169  1.1  mrg 	cmp.ne.unc p0,p7=0,in1	// p7 = (in1 == 0)
    170  1.1  mrg 	;;
    171  1.1  mrg 	// Convert the inputs to FP, so that they won't be treated as unsigned.
    172  1.1  mrg 	fcvt.xf f8 = f8
    173  1.1  mrg 	fcvt.xf f9 = f9
    174  1.1  mrg (p7)	break 1	// divisor is zero: raise a break fault with immediate 1
    175  1.1  mrg 	;;
    176  1.1  mrg 	// Compute the reciprocal approximation.
    177  1.1  mrg 	frcpa.s1 f10, p6 = f8, f9	// f10 = y0, p6 = refinement needed
    178  1.1  mrg 	;;
    179  1.1  mrg 	// 3 Newton-Raphson iterations.
    180  1.1  mrg (p6)	fnma.s1 f11 = f9, f10, f1	// e = 1 - b*y0
    181  1.1  mrg (p6)	fmpy.s1 f12 = f8, f10	// q0 = a*y0
    182  1.1  mrg 	;;
    183  1.1  mrg (p6)	fmpy.s1 f13 = f11, f11	// e2 = e^2
    184  1.1  mrg (p6)	fma.s1 f12 = f11, f12, f12	// q1 = q0 + e*q0
    185  1.1  mrg 	;;
    186  1.1  mrg (p6)	fma.s1 f10 = f11, f10, f10	// y1 = y0 + e*y0
    187  1.1  mrg (p6)	fma.s1 f11 = f13, f12, f12	// q2 = q1 + e2*q1
    188  1.1  mrg 	;;
    189  1.1  mrg (p6)	fma.s1 f10 = f13, f10, f10	// y2 = y1 + e2*y1
    190  1.1  mrg (p6)	fnma.s1 f12 = f9, f11, f8	// r = a - b*q2
    191  1.1  mrg 	;;
    192  1.1  mrg (p6)	fma.s1 f10 = f12, f10, f11	// q3 = q2 + r*y2
    193  1.1  mrg 	;;
    194  1.1  mrg 	// Round quotient to an integer.
    195  1.1  mrg 	fcvt.fx.trunc.s1 f10 = f10	// truncating conversion to a signed integer
    196  1.1  mrg 	;;
    197  1.1  mrg 	// Transfer result to GP registers.
    198  1.1  mrg 	getf.sig ret0 = f10
    199  1.1  mrg 	br.ret.sptk rp
    200  1.1  mrg 	;;
    201  1.1  mrg 	.endp __divdi3
    202  1.1  mrg #endif
    203  1.1  mrg 
    204  1.1  mrg #ifdef L__moddi3
    205  1.1  mrg // Compute a 64-bit signed integer modulus: ret0 = a - trunc(a / b) * b.
    206  1.1  mrg //
    207  1.1  mrg // From the Intel IA-64 Optimization Guide, choose the minimum latency
    208  1.1  mrg // alternative.
    209  1.1  mrg //
    210  1.1  mrg // in0 holds the dividend (a).  in1 holds the divisor (b).
    211  1.1  mrg 
    212  1.1  mrg 	.text
    213  1.1  mrg 	.align 16
    214  1.1  mrg 	.global __moddi3
    215  1.1  mrg 	.proc __moddi3
    216  1.1  mrg __moddi3:
    217  1.1  mrg 	.regstk 2,0,0,0
    218  1.1  mrg 	// Transfer inputs to FP registers.
    219  1.1  mrg 	setf.sig f14 = in0	// f14 keeps the integer a for the final xma
    220  1.1  mrg 	setf.sig f9 = in1
    221  1.1  mrg 	// Check divide by zero.
    222  1.1  mrg 	cmp.ne.unc p0,p7=0,in1	// p7 = (in1 == 0)
    223  1.1  mrg 	;;
    224  1.1  mrg 	// Convert the inputs to FP, so that they won't be treated as unsigned.
    225  1.1  mrg 	fcvt.xf f8 = f14
    226  1.1  mrg 	fcvt.xf f9 = f9
    227  1.1  mrg (p7)	break 1	// divisor is zero: raise a break fault with immediate 1
    228  1.1  mrg 	;;
    229  1.1  mrg 	// Compute the reciprocal approximation.
    230  1.1  mrg 	frcpa.s1 f10, p6 = f8, f9	// f10 = y0, p6 = refinement needed
    231  1.1  mrg 	;;
    232  1.1  mrg 	// 3 Newton-Raphson iterations.
    233  1.1  mrg (p6)	fmpy.s1 f12 = f8, f10	// q0 = a*y0
    234  1.1  mrg (p6)	fnma.s1 f11 = f9, f10, f1	// e = 1 - b*y0
    235  1.1  mrg 	;;
    236  1.1  mrg (p6)	fma.s1 f12 = f11, f12, f12	// q1 = q0 + e*q0
    237  1.1  mrg (p6)	fmpy.s1 f13 = f11, f11	// e2 = e^2
    238  1.1  mrg 	;;
    239  1.1  mrg (p6)	fma.s1 f10 = f11, f10, f10	// y1 = y0 + e*y0
    240  1.1  mrg (p6)	fma.s1 f11 = f13, f12, f12	// q2 = q1 + e2*q1
    241  1.1  mrg 	;;
    242  1.1  mrg 	sub in1 = r0, in1	// in1 = -b, integer form used by the final xma
    243  1.1  mrg (p6)	fma.s1 f10 = f13, f10, f10	// y2 = y1 + e2*y1
    244  1.1  mrg (p6)	fnma.s1 f12 = f9, f11, f8	// r = a - b*q2 (last read of the FP divisor in f9)
    245  1.1  mrg 	;;
    246  1.1  mrg 	setf.sig f9 = in1	// f9 now holds the integer -b
    247  1.1  mrg (p6)	fma.s1 f10 = f12, f10, f11	// q3 = q2 + r*y2
    248  1.1  mrg 	;;
    249  1.1  mrg 	fcvt.fx.trunc.s1 f10 = f10	// q = truncating conversion to a signed integer
    250  1.1  mrg 	;;
    251  1.1  mrg 	// r = q * (-b) + a
    252  1.1  mrg 	xma.l f10 = f10, f9, f14	// integer multiply-add, low 64 bits
    253  1.1  mrg 	;;
    254  1.1  mrg 	// Transfer result to GP registers.
    255  1.1  mrg 	getf.sig ret0 = f10
    256  1.1  mrg 	br.ret.sptk rp
    257  1.1  mrg 	;;
    258  1.1  mrg 	.endp __moddi3
    259  1.1  mrg #endif
    260  1.1  mrg 
    261  1.1  mrg #ifdef L__udivdi3
    262  1.1  mrg // Compute a 64-bit unsigned integer quotient: ret0 = in0 / in1, truncated.
    263  1.1  mrg //
    264  1.1  mrg // From the Intel IA-64 Optimization Guide, choose the minimum latency
    265  1.1  mrg // alternative.
    266  1.1  mrg //
    267  1.1  mrg // in0 holds the dividend (a).  in1 holds the divisor (b).
    268  1.1  mrg 
    269  1.1  mrg 	.text
    270  1.1  mrg 	.align 16
    271  1.1  mrg 	.global __udivdi3
    272  1.1  mrg 	.proc __udivdi3
    273  1.1  mrg __udivdi3:
    274  1.1  mrg 	.regstk 2,0,0,0
    275  1.1  mrg 	// Transfer inputs to FP registers.
    276  1.1  mrg 	setf.sig f8 = in0
    277  1.1  mrg 	setf.sig f9 = in1
    278  1.1  mrg 	// Check divide by zero.
    279  1.1  mrg 	cmp.ne.unc p0,p7=0,in1	// p7 = (in1 == 0)
    280  1.1  mrg 	;;
    281  1.1  mrg 	// Convert the inputs to FP, to avoid FP software-assist faults.
    282  1.1  mrg 	fcvt.xuf.s1 f8 = f8	// unsigned integer to FP
    283  1.1  mrg 	fcvt.xuf.s1 f9 = f9	// unsigned integer to FP
    284  1.1  mrg (p7)	break 1	// divisor is zero: raise a break fault with immediate 1
    285  1.1  mrg 	;;
    286  1.1  mrg 	// Compute the reciprocal approximation.
    287  1.1  mrg 	frcpa.s1 f10, p6 = f8, f9	// f10 = y0, p6 = refinement needed
    288  1.1  mrg 	;;
    289  1.1  mrg 	// 3 Newton-Raphson iterations.
    290  1.1  mrg (p6)	fnma.s1 f11 = f9, f10, f1	// e = 1 - b*y0
    291  1.1  mrg (p6)	fmpy.s1 f12 = f8, f10	// q0 = a*y0
    292  1.1  mrg 	;;
    293  1.1  mrg (p6)	fmpy.s1 f13 = f11, f11	// e2 = e^2
    294  1.1  mrg (p6)	fma.s1 f12 = f11, f12, f12	// q1 = q0 + e*q0
    295  1.1  mrg 	;;
    296  1.1  mrg (p6)	fma.s1 f10 = f11, f10, f10	// y1 = y0 + e*y0
    297  1.1  mrg (p6)	fma.s1 f11 = f13, f12, f12	// q2 = q1 + e2*q1
    298  1.1  mrg 	;;
    299  1.1  mrg (p6)	fma.s1 f10 = f13, f10, f10	// y2 = y1 + e2*y1
    300  1.1  mrg (p6)	fnma.s1 f12 = f9, f11, f8	// r = a - b*q2
    301  1.1  mrg 	;;
    302  1.1  mrg (p6)	fma.s1 f10 = f12, f10, f11	// q3 = q2 + r*y2
    303  1.1  mrg 	;;
    304  1.1  mrg 	// Round quotient to an unsigned integer.
    305  1.1  mrg 	fcvt.fxu.trunc.s1 f10 = f10	// truncating conversion to an unsigned integer
    306  1.1  mrg 	;;
    307  1.1  mrg 	// Transfer result to GP registers.
    308  1.1  mrg 	getf.sig ret0 = f10
    309  1.1  mrg 	br.ret.sptk rp
    310  1.1  mrg 	;;
    311  1.1  mrg 	.endp __udivdi3
    312  1.1  mrg #endif
    313  1.1  mrg 
    314  1.1  mrg #ifdef L__umoddi3
    315  1.1  mrg // Compute a 64-bit unsigned integer modulus: ret0 = a - (a / b) * b.
    316  1.1  mrg //
    317  1.1  mrg // From the Intel IA-64 Optimization Guide, choose the minimum latency
    318  1.1  mrg // alternative.
    319  1.1  mrg //
    320  1.1  mrg // in0 holds the dividend (a).  in1 holds the divisor (b).
    321  1.1  mrg 
    322  1.1  mrg 	.text
    323  1.1  mrg 	.align 16
    324  1.1  mrg 	.global __umoddi3
    325  1.1  mrg 	.proc __umoddi3
    326  1.1  mrg __umoddi3:
    327  1.1  mrg 	.regstk 2,0,0,0
    328  1.1  mrg 	// Transfer inputs to FP registers.
    329  1.1  mrg 	setf.sig f14 = in0	// f14 keeps the integer a for the final xma
    330  1.1  mrg 	setf.sig f9 = in1
    331  1.1  mrg 	// Check divide by zero.
    332  1.1  mrg 	cmp.ne.unc p0,p7=0,in1	// p7 = (in1 == 0)
    333  1.1  mrg 	;;
    334  1.1  mrg 	// Convert the inputs to FP, to avoid FP software assist faults.
    335  1.1  mrg 	fcvt.xuf.s1 f8 = f14	// unsigned integer to FP
    336  1.1  mrg 	fcvt.xuf.s1 f9 = f9	// unsigned integer to FP
    337  1.1  mrg (p7)	break 1;	// divisor is zero: raise a break fault with immediate 1
    338  1.1  mrg 	;;
    339  1.1  mrg 	// Compute the reciprocal approximation.
    340  1.1  mrg 	frcpa.s1 f10, p6 = f8, f9	// f10 = y0, p6 = refinement needed
    341  1.1  mrg 	;;
    342  1.1  mrg 	// 3 Newton-Raphson iterations.
    343  1.1  mrg (p6)	fmpy.s1 f12 = f8, f10	// q0 = a*y0
    344  1.1  mrg (p6)	fnma.s1 f11 = f9, f10, f1	// e = 1 - b*y0
    345  1.1  mrg 	;;
    346  1.1  mrg (p6)	fma.s1 f12 = f11, f12, f12	// q1 = q0 + e*q0
    347  1.1  mrg (p6)	fmpy.s1 f13 = f11, f11	// e2 = e^2
    348  1.1  mrg 	;;
    349  1.1  mrg (p6)	fma.s1 f10 = f11, f10, f10	// y1 = y0 + e*y0
    350  1.1  mrg (p6)	fma.s1 f11 = f13, f12, f12	// q2 = q1 + e2*q1
    351  1.1  mrg 	;;
    352  1.1  mrg 	sub in1 = r0, in1	// in1 = -b, integer form used by the final xma
    353  1.1  mrg (p6)	fma.s1 f10 = f13, f10, f10	// y2 = y1 + e2*y1
    354  1.1  mrg (p6)	fnma.s1 f12 = f9, f11, f8	// r = a - b*q2 (last read of the FP divisor in f9)
    355  1.1  mrg 	;;
    356  1.1  mrg 	setf.sig f9 = in1	// f9 now holds the integer -b
    357  1.1  mrg (p6)	fma.s1 f10 = f12, f10, f11	// q3 = q2 + r*y2
    358  1.1  mrg 	;;
    359  1.1  mrg 	// Round quotient to an unsigned integer.
    360  1.1  mrg 	fcvt.fxu.trunc.s1 f10 = f10	// q = truncating conversion to an unsigned integer
    361  1.1  mrg 	;;
    362  1.1  mrg 	// r = q * (-b) + a
    363  1.1  mrg 	xma.l f10 = f10, f9, f14	// integer multiply-add, low 64 bits
    364  1.1  mrg 	;;
    365  1.1  mrg 	// Transfer result to GP registers.
    366  1.1  mrg 	getf.sig ret0 = f10
    367  1.1  mrg 	br.ret.sptk rp
    368  1.1  mrg 	;;
    369  1.1  mrg 	.endp __umoddi3
    370  1.1  mrg #endif
    371  1.1  mrg 
    372  1.1  mrg #ifdef L__divsi3
    373  1.1  mrg // Compute a 32-bit integer quotient.
    374  1.1  mrg //
    375  1.1  mrg // From the Intel IA-64 Optimization Guide, choose the minimum latency
    376  1.1  mrg // alternative.
    377  1.1  mrg //
    378  1.1  mrg // in0 holds the dividend.  in1 holds the divisor.
    379  1.1  mrg 
    380  1.1  mrg 	.text
    381  1.1  mrg 	.align 16
    382  1.1  mrg 	.global __divsi3
    383  1.1  mrg 	.proc __divsi3
    384  1.1  mrg __divsi3:
    385  1.1  mrg 	.regstk 2,0,0,0
    386  1.1  mrg 	// Check divide by zero.
    387  1.1  mrg 	cmp.ne.unc p0,p7=0,in1
    388  1.1  mrg 	sxt4 in0 = in0
    389  1.1  mrg 	sxt4 in1 = in1
    390  1.1  mrg 	;;
    391  1.1  mrg 	setf.sig f8 = in0
    392  1.1  mrg 	setf.sig f9 = in1
    393  1.1  mrg (p7)	break 1
    394  1.1  mrg 	;;
    395  1.1  mrg 	mov r2 = 0x0ffdd
    396  1.1  mrg 	fcvt.xf f8 = f8
    397  1.1  mrg 	fcvt.xf f9 = f9
    398  1.1  mrg 	;;
    399  1.1  mrg 	setf.exp f11 = r2
    400  1.1  mrg 	frcpa.s1 f10, p6 = f8, f9
    401  1.1  mrg 	;;
    402  1.1  mrg (p6)	fmpy.s1 f8 = f8, f10
    403  1.1  mrg (p6)	fnma.s1 f9 = f9, f10, f1
    404  1.1  mrg 	;;
    405  1.1  mrg (p6)	fma.s1 f8 = f9, f8, f8
    406  1.1  mrg (p6)	fma.s1 f9 = f9, f9, f11
    407  1.1  mrg 	;;
    408  1.1  mrg (p6)	fma.s1 f10 = f9, f8, f8
    409  1.1  mrg 	;;
    410  1.1  mrg 	fcvt.fx.trunc.s1 f10 = f10
    411  1.1  mrg 	;;
    412  1.1  mrg 	getf.sig ret0 = f10
    413  1.1  mrg 	br.ret.sptk rp
    414  1.1  mrg 	;;
    415  1.1  mrg 	.endp __divsi3
    416  1.1  mrg #endif
    417  1.1  mrg 
    418  1.1  mrg #ifdef L__modsi3
    419  1.1  mrg // Compute a 32-bit signed integer modulus: ret0 = a - trunc(a / b) * b.
    420  1.1  mrg //
    421  1.1  mrg // From the Intel IA-64 Optimization Guide, choose the minimum latency
    422  1.1  mrg // alternative.
    423  1.1  mrg //
    424  1.1  mrg // in0 holds the dividend (a).  in1 holds the divisor (b).
    425  1.1  mrg 
    426  1.1  mrg 	.text
    427  1.1  mrg 	.align 16
    428  1.1  mrg 	.global __modsi3
    429  1.1  mrg 	.proc __modsi3
    430  1.1  mrg __modsi3:
    431  1.1  mrg 	.regstk 2,0,0,0
    432  1.1  mrg 	mov r2 = 0x0ffdd	// biased exponent 0xffff - 34, turned into 2^-34 below
    433  1.1  mrg 	sxt4 in0 = in0	// sign-extend the 32-bit dividend
    434  1.1  mrg 	sxt4 in1 = in1	// sign-extend the 32-bit divisor
    435  1.1  mrg 	;;
    436  1.1  mrg 	setf.sig f13 = r32	// r32 is in0, f13 keeps the integer a for the final xma
    437  1.1  mrg 	setf.sig f9 = r33	// r33 is in1
    438  1.1  mrg 	// Check divide by zero.
    439  1.1  mrg 	cmp.ne.unc p0,p7=0,in1	// p7 = (in1 == 0), tested after the sign extension
    440  1.1  mrg 	;;
    441  1.1  mrg 	sub in1 = r0, in1	// in1 = -b, integer form used by the final xma
    442  1.1  mrg 	fcvt.xf f8 = f13
    443  1.1  mrg 	fcvt.xf f9 = f9
    444  1.1  mrg 	;;
    445  1.1  mrg 	setf.exp f11 = r2	// f11 = 2^-34
    446  1.1  mrg 	frcpa.s1 f10, p6 = f8, f9	// f10 = y0, p6 = refinement needed
    447  1.1  mrg (p7)	break 1	// divisor is zero: raise a break fault with immediate 1
    448  1.1  mrg 	;;
    449  1.1  mrg (p6)	fmpy.s1 f12 = f8, f10	// q0 = a*y0
    450  1.1  mrg (p6)	fnma.s1 f10 = f9, f10, f1	// e0 = 1 - b*y0 (last read of the FP divisor in f9)
    451  1.1  mrg 	;;
    452  1.1  mrg 	setf.sig f9 = in1	// f9 now holds the integer -b
    453  1.1  mrg (p6)	fma.s1 f12 = f10, f12, f12	// q1 = q0 + e0*q0
    454  1.1  mrg (p6)	fma.s1 f10 = f10, f10, f11	// e1 = e0^2 + 2^-34
    455  1.1  mrg 	;;
    456  1.1  mrg (p6)	fma.s1 f10 = f10, f12, f12	// q2 = q1 + e1*q1
    457  1.1  mrg 	;;
    458  1.1  mrg 	fcvt.fx.trunc.s1 f10 = f10	// q = truncating conversion to a signed integer
    459  1.1  mrg 	;;
    460  1.1  mrg 	xma.l f10 = f10, f9, f13	// r = q * (-b) + a, low 64 bits
    461  1.1  mrg 	;;
    462  1.1  mrg 	getf.sig ret0 = f10
    463  1.1  mrg 	br.ret.sptk rp
    464  1.1  mrg 	;;
    465  1.1  mrg 	.endp __modsi3
    466  1.1  mrg #endif
    467  1.1  mrg 
    468  1.1  mrg #ifdef L__udivsi3
    469  1.1  mrg // Compute a 32-bit unsigned integer quotient: ret0 = in0 / in1, truncated.
    470  1.1  mrg //
    471  1.1  mrg // From the Intel IA-64 Optimization Guide, choose the minimum latency
    472  1.1  mrg // alternative.
    473  1.1  mrg //
    474  1.1  mrg // in0 holds the dividend (a).  in1 holds the divisor (b).
    475  1.1  mrg 
    476  1.1  mrg 	.text
    477  1.1  mrg 	.align 16
    478  1.1  mrg 	.global __udivsi3
    479  1.1  mrg 	.proc __udivsi3
    480  1.1  mrg __udivsi3:
    481  1.1  mrg 	.regstk 2,0,0,0
    482  1.1  mrg 	mov r2 = 0x0ffdd	// biased exponent 0xffff - 34, turned into 2^-34 below
    483  1.1  mrg 	zxt4 in0 = in0	// zero-extend the 32-bit dividend
    484  1.1  mrg 	zxt4 in1 = in1	// zero-extend the 32-bit divisor
    485  1.1  mrg 	;;
    486  1.1  mrg 	setf.sig f8 = in0
    487  1.1  mrg 	setf.sig f9 = in1
    488  1.1  mrg 	// Check divide by zero.
    489  1.1  mrg 	cmp.ne.unc p0,p7=0,in1	// p7 = (in1 == 0), tested after the zero extension
    490  1.1  mrg 	;;
    491  1.1  mrg 	fcvt.xf f8 = f8	// signed convert is fine: the zero-extended value is non-negative
    492  1.1  mrg 	fcvt.xf f9 = f9
    493  1.1  mrg (p7)	break 1	// divisor is zero: raise a break fault with immediate 1
    494  1.1  mrg 	;;
    495  1.1  mrg 	setf.exp f11 = r2	// f11 = 2^-34
    496  1.1  mrg 	frcpa.s1 f10, p6 = f8, f9	// f10 = y0, p6 = refinement needed
    497  1.1  mrg 	;;
    498  1.1  mrg (p6)	fmpy.s1 f8 = f8, f10	// q0 = a*y0
    499  1.1  mrg (p6)	fnma.s1 f9 = f9, f10, f1	// e0 = 1 - b*y0
    500  1.1  mrg 	;;
    501  1.1  mrg (p6)	fma.s1 f8 = f9, f8, f8	// q1 = q0 + e0*q0
    502  1.1  mrg (p6)	fma.s1 f9 = f9, f9, f11	// e1 = e0^2 + 2^-34
    503  1.1  mrg 	;;
    504  1.1  mrg (p6)	fma.s1 f10 = f9, f8, f8	// q2 = q1 + e1*q1
    505  1.1  mrg 	;;
    506  1.1  mrg 	fcvt.fxu.trunc.s1 f10 = f10	// truncating conversion to an unsigned integer
    507  1.1  mrg 	;;
    508  1.1  mrg 	getf.sig ret0 = f10
    509  1.1  mrg 	br.ret.sptk rp
    510  1.1  mrg 	;;
    511  1.1  mrg 	.endp __udivsi3
    512  1.1  mrg #endif
    513  1.1  mrg 
    514  1.1  mrg #ifdef L__umodsi3
    515  1.1  mrg // Compute a 32-bit unsigned integer modulus: ret0 = a - (a / b) * b.
    516  1.1  mrg //
    517  1.1  mrg // From the Intel IA-64 Optimization Guide, choose the minimum latency
    518  1.1  mrg // alternative.
    519  1.1  mrg //
    520  1.1  mrg // in0 holds the dividend (a).  in1 holds the divisor (b).
    521  1.1  mrg 
    522  1.1  mrg 	.text
    523  1.1  mrg 	.align 16
    524  1.1  mrg 	.global __umodsi3
    525  1.1  mrg 	.proc __umodsi3
    526  1.1  mrg __umodsi3:
    527  1.1  mrg 	.regstk 2,0,0,0
    528  1.1  mrg 	mov r2 = 0x0ffdd	// biased exponent 0xffff - 34, turned into 2^-34 below
    529  1.1  mrg 	zxt4 in0 = in0	// zero-extend the 32-bit dividend
    530  1.1  mrg 	zxt4 in1 = in1	// zero-extend the 32-bit divisor
    531  1.1  mrg 	;;
    532  1.1  mrg 	setf.sig f13 = in0	// f13 keeps the integer a for the final xma
    533  1.1  mrg 	setf.sig f9 = in1
    534  1.1  mrg 	// Check divide by zero.
    535  1.1  mrg 	cmp.ne.unc p0,p7=0,in1	// p7 = (in1 == 0), tested after the zero extension
    536  1.1  mrg 	;;
    537  1.1  mrg 	sub in1 = r0, in1	// in1 = -b, integer form used by the final xma
    538  1.1  mrg 	fcvt.xf f8 = f13	// signed convert is fine: the zero-extended value is non-negative
    539  1.1  mrg 	fcvt.xf f9 = f9
    540  1.1  mrg 	;;
    541  1.1  mrg 	setf.exp f11 = r2	// f11 = 2^-34
    542  1.1  mrg 	frcpa.s1 f10, p6 = f8, f9	// f10 = y0, p6 = refinement needed
    543  1.1  mrg (p7)	break 1;	// divisor is zero: raise a break fault with immediate 1
    544  1.1  mrg 	;;
    545  1.1  mrg (p6)	fmpy.s1 f12 = f8, f10	// q0 = a*y0
    546  1.1  mrg (p6)	fnma.s1 f10 = f9, f10, f1	// e0 = 1 - b*y0 (last read of the FP divisor in f9)
    547  1.1  mrg 	;;
    548  1.1  mrg 	setf.sig f9 = in1	// f9 now holds the integer -b
    549  1.1  mrg (p6)	fma.s1 f12 = f10, f12, f12	// q1 = q0 + e0*q0
    550  1.1  mrg (p6)	fma.s1 f10 = f10, f10, f11	// e1 = e0^2 + 2^-34
    551  1.1  mrg 	;;
    552  1.1  mrg (p6)	fma.s1 f10 = f10, f12, f12	// q2 = q1 + e1*q1
    553  1.1  mrg 	;;
    554  1.1  mrg 	fcvt.fxu.trunc.s1 f10 = f10	// q = truncating conversion to an unsigned integer
    555  1.1  mrg 	;;
    556  1.1  mrg 	xma.l f10 = f10, f9, f13	// r = q * (-b) + a, low 64 bits
    557  1.1  mrg 	;;
    558  1.1  mrg 	getf.sig ret0 = f10
    559  1.1  mrg 	br.ret.sptk rp
    560  1.1  mrg 	;;
    561  1.1  mrg 	.endp __umodsi3
    562  1.1  mrg #endif
    563  1.1  mrg 
    564  1.1  mrg #ifdef L__save_stack_nonlocal
    565  1.1  mrg // Notes on save/restore stack nonlocal: We read ar.bsp but write
    566  1.1  mrg // ar.bspstore.  This is because ar.bsp can be read at all times
    567  1.1  mrg // (independent of the RSE mode) but since it's read-only we need to
    568  1.1  mrg // restore the value via ar.bspstore.  This is OK because
    569  1.1  mrg // ar.bsp==ar.bspstore after executing "flushrs".
    570  1.1  mrg 
    571  1.1  mrg // void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer)
    572  1.1  mrg // Save area layout written below: +0 stack_pointer, +8 ar.bsp, +16 ar.rnat, +24 ar.pfs.
    573  1.1  mrg 	.text
    574  1.1  mrg 	.align 16
    575  1.1  mrg 	.global __ia64_save_stack_nonlocal
    576  1.1  mrg 	.proc __ia64_save_stack_nonlocal
    577  1.1  mrg __ia64_save_stack_nonlocal:
    578  1.1  mrg 	{ .mmf
    579  1.1  mrg 	  alloc r18 = ar.pfs, 2, 0, 0, 0	// r18 = ar.pfs on entry, frame of two inputs
    580  1.1  mrg 	  mov r19 = ar.rsc	// r19 = current RSE configuration
    581  1.1  mrg 	  ;;
    582  1.1  mrg 	}
    583  1.1  mrg 	{ .mmi
    584  1.1  mrg 	  flushrs	// flush dirty stacked registers to the backing store
    585  1.1  mrg 	  st8 [in0] = in1, 24	// save_area[+0] = stack_pointer, then in0 = save_area + 24
    586  1.1  mrg 	  and r19 = 0x1c, r19	// keep only bits 2..4 of ar.rsc, which zeroes the mode bits
    587  1.1  mrg 	  ;;
    588  1.1  mrg 	}
    589  1.1  mrg 	{ .mmi
    590  1.1  mrg 	  st8 [in0] = r18, -16	// save_area[+24] = ar.pfs, then in0 = save_area + 8
    591  1.1  mrg 	  mov ar.rsc = r19	// RSE mode 0 while ar.rnat is read
    592  1.1  mrg 	  or r19 = 0x3, r19	// value written back on exit: mode bits set to 3
    593  1.1  mrg 	  ;;
    594  1.1  mrg 	}
    595  1.1  mrg 	{ .mmi
    596  1.1  mrg 	  mov r16 = ar.bsp
    597  1.1  mrg 	  mov r17 = ar.rnat
    598  1.1  mrg 	  adds r2 = 8, in0	// r2 = save_area + 16
    599  1.1  mrg 	  ;;
    600  1.1  mrg 	}
    601  1.1  mrg 	{ .mmi
    602  1.1  mrg 	  st8 [in0] = r16	// save_area[+8] = ar.bsp
    603  1.1  mrg 	  st8 [r2] = r17	// save_area[+16] = ar.rnat
    604  1.1  mrg 	}
    605  1.1  mrg 	{ .mib
    606  1.1  mrg 	  mov ar.rsc = r19	// re-enable the RSE with mode bits = 3
    607  1.1  mrg 	  br.ret.sptk.few rp
    608  1.1  mrg 	  ;;
    609  1.1  mrg 	}
    610  1.1  mrg 	.endp __ia64_save_stack_nonlocal
    611  1.1  mrg #endif
    612  1.1  mrg 
    613  1.1  mrg #ifdef L__nonlocal_goto
    614  1.1  mrg // void __ia64_nonlocal_goto(void *target_label, void *save_area,
    615  1.1  mrg //			     void *static_chain);
    616  1.1  mrg // Reads the save area as: +0 stack pointer, +8 ar.bspstore value, +16 ar.rnat, +24 ar.pfs.
    617  1.1  mrg 	.text
    618  1.1  mrg 	.align 16
    619  1.1  mrg 	.global __ia64_nonlocal_goto
    620  1.1  mrg 	.proc __ia64_nonlocal_goto
    621  1.1  mrg __ia64_nonlocal_goto:
    622  1.1  mrg 	{ .mmi
    623  1.1  mrg 	  alloc r20 = ar.pfs, 3, 0, 0, 0	// frame of three inputs, r20 is not read afterwards
    624  1.1  mrg 	  ld8 r12 = [in1], 8	// r12 (stack pointer) = save_area[+0]
    625  1.1  mrg 	  mov.ret.sptk rp = in0, .L0	// return address = target_label, with a hint for the br.ret at .L0
    626  1.1  mrg 	  ;;
    627  1.1  mrg 	}
    628  1.1  mrg 	{ .mmf
    629  1.1  mrg 	  ld8 r16 = [in1], 8	// r16 = save_area[+8], the saved ar.bsp
    630  1.1  mrg 	  mov r19 = ar.rsc	// r19 = current RSE configuration
    631  1.1  mrg 	  ;;
    632  1.1  mrg 	}
    633  1.1  mrg 	{ .mmi
    634  1.1  mrg 	  flushrs	// flush dirty stacked registers to the backing store
    635  1.1  mrg 	  ld8 r17 = [in1], 8	// r17 = save_area[+16], the saved ar.rnat
    636  1.1  mrg 	  and r19 = 0x1c, r19	// keep only bits 2..4 of ar.rsc, which zeroes the mode and loadrs fields
    637  1.1  mrg 	  ;;
    638  1.1  mrg 	}
    639  1.1  mrg 	{ .mmi
    640  1.1  mrg 	  ld8 r18 = [in1]	// r18 = save_area[+24], the saved ar.pfs
    641  1.1  mrg 	  mov ar.rsc = r19	// RSE mode 0 while ar.bspstore and ar.rnat are rewritten
    642  1.1  mrg 	  or r19 = 0x3, r19	// value written back on exit: mode bits set to 3
    643  1.1  mrg 	  ;;
    644  1.1  mrg 	}
    645  1.1  mrg 	{ .mmi
    646  1.1  mrg 	  mov ar.bspstore = r16
    647  1.1  mrg 	  ;;
    648  1.1  mrg 	  mov ar.rnat = r17
    649  1.1  mrg 	  ;;
    650  1.1  mrg 	}
    651  1.1  mrg 	{ .mmi
    652  1.1  mrg 	  loadrs
    653  1.1  mrg 	  invala	// invalidate all ALAT entries
    654  1.1  mrg 	  mov r15 = in2	// r15 = static_chain argument
    655  1.1  mrg 	  ;;
    656  1.1  mrg 	}
    657  1.1  mrg .L0:	{ .mib
    658  1.1  mrg 	  mov ar.rsc = r19	// re-enable the RSE with mode bits = 3
    659  1.1  mrg 	  mov ar.pfs = r18	// restore the saved ar.pfs for the return
    660  1.1  mrg 	  br.ret.sptk.few rp	// branch to target_label through rp
    661  1.1  mrg 	  ;;
    662  1.1  mrg 	}
    663  1.1  mrg 	.endp __ia64_nonlocal_goto
    664  1.1  mrg #endif
    665  1.1  mrg 
    666  1.1  mrg #ifdef L__restore_stack_nonlocal
    667  1.1  mrg // This is mostly the same as nonlocal_goto above.
    668  1.1  mrg // ??? This has not been tested yet.
    669  1.1  mrg // Reads the save area as: +0 stack pointer, +8 ar.bspstore value, +16 ar.rnat, +24 ar.pfs.
    670  1.1  mrg // void __ia64_restore_stack_nonlocal(void *save_area)
    671  1.1  mrg 
    672  1.1  mrg 	.text
    673  1.1  mrg 	.align 16
    674  1.1  mrg 	.global __ia64_restore_stack_nonlocal
    675  1.1  mrg 	.proc __ia64_restore_stack_nonlocal
    676  1.1  mrg __ia64_restore_stack_nonlocal:
    677  1.1  mrg 	{ .mmf
    678  1.1  mrg 	  alloc r20 = ar.pfs, 4, 0, 0, 0	// NOTE(review): declares 4 inputs although only in0 is read here
    679  1.1  mrg 	  ld8 r12 = [in0], 8	// r12 (stack pointer) = save_area[+0]
    680  1.1  mrg 	  ;;
    681  1.1  mrg 	}
    682  1.1  mrg 	{ .mmb
    683  1.1  mrg 	  ld8 r16=[in0], 8	// r16 = save_area[+8], the saved ar.bsp
    684  1.1  mrg 	  mov r19 = ar.rsc	// r19 = current RSE configuration
    685  1.1  mrg 	  ;;
    686  1.1  mrg 	}
    687  1.1  mrg 	{ .mmi
    688  1.1  mrg 	  flushrs	// flush dirty stacked registers to the backing store
    689  1.1  mrg 	  ld8 r17 = [in0], 8	// r17 = save_area[+16], the saved ar.rnat
    690  1.1  mrg 	  and r19 = 0x1c, r19	// keep only bits 2..4 of ar.rsc, which zeroes the mode and loadrs fields
    691  1.1  mrg 	  ;;
    692  1.1  mrg 	}
    693  1.1  mrg 	{ .mmf
    694  1.1  mrg 	  ld8 r18 = [in0]	// r18 = save_area[+24], the saved ar.pfs
    695  1.1  mrg 	  mov ar.rsc = r19	// RSE mode 0 while ar.bspstore and ar.rnat are rewritten
    696  1.1  mrg 	  ;;
    697  1.1  mrg 	}
    698  1.1  mrg 	{ .mmi
    699  1.1  mrg 	  mov ar.bspstore = r16
    700  1.1  mrg 	  ;;
    701  1.1  mrg 	  mov ar.rnat = r17
    702  1.1  mrg 	  or r19 = 0x3, r19	// value written back on exit: mode bits set to 3
    703  1.1  mrg 	  ;;
    704  1.1  mrg 	}
    705  1.1  mrg 	{ .mmf
    706  1.1  mrg 	  loadrs
    707  1.1  mrg 	  invala	// invalidate all ALAT entries
    708  1.1  mrg 	  ;;
    709  1.1  mrg 	}
    710  1.1  mrg .L0:	{ .mib
    711  1.1  mrg 	  mov ar.rsc = r19	// re-enable the RSE with mode bits = 3
    712  1.1  mrg 	  mov ar.pfs = r18	// restore the saved ar.pfs for the return
    713  1.1  mrg 	  br.ret.sptk.few rp	// ordinary return to the caller (.L0 is not referenced in this routine)
    714  1.1  mrg 	  ;;
    715  1.1  mrg 	}
    716  1.1  mrg 	.endp __ia64_restore_stack_nonlocal
    717  1.1  mrg #endif
    718  1.1  mrg 
    719  1.1  mrg #ifdef L__trampoline
    720  1.1  mrg // Implement the nested function trampoline.  This is out of line
    721  1.1  mrg // so that we don't have to bother with flushing the icache, as
    722  1.1  mrg // well as making the on-stack trampoline smaller.
    723  1.1  mrg //
    724  1.1  mrg // The trampoline has the following form:
    725  1.1  mrg //
    726  1.1  mrg //		+-------------------+ >
    727  1.1  mrg //	TRAMP:	| __ia64_trampoline | |
    728  1.1  mrg //		+-------------------+  > fake function descriptor
    729  1.1  mrg //		| TRAMP+16          | |
    730  1.1  mrg //		+-------------------+ >
    731  1.1  mrg //		| target descriptor |
    732  1.1  mrg //		+-------------------+
    733  1.1  mrg //		| static link	    |
    734  1.1  mrg //		+-------------------+
    735  1.1  mrg // On entry r1 is expected to hold TRAMP+16, the second word of the fake descriptor above.
    736  1.1  mrg 	.text
    737  1.1  mrg 	.align 16
    738  1.1  mrg 	.global __ia64_trampoline
    739  1.1  mrg 	.proc __ia64_trampoline
    740  1.1  mrg __ia64_trampoline:
    741  1.1  mrg 	{ .mmi
    742  1.1  mrg 	  ld8 r2 = [r1], 8	// r2 = address of the target descriptor, r1 advances to the static link slot
    743  1.1  mrg 	  ;;
    744  1.1  mrg 	  ld8 r15 = [r1]	// r15 = static link
    745  1.1  mrg 	}
    746  1.1  mrg 	{ .mmi
    747  1.1  mrg 	  ld8 r3 = [r2], 8	// r3 = first word of the target descriptor (branch target)
    748  1.1  mrg 	  ;;
    749  1.1  mrg 	  ld8 r1 = [r2]	// r1 = second word of the target descriptor
    750  1.1  mrg 	  mov b6 = r3
    751  1.1  mrg 	}
    752  1.1  mrg 	{ .bbb
    753  1.1  mrg 	  br.sptk.many b6	// jump to the target, rp is left untouched so it returns to our caller
    754  1.1  mrg 	  ;;
    755  1.1  mrg 	}
    756  1.1  mrg 	.endp __ia64_trampoline
    757  1.1  mrg #endif
    758  1.1  mrg 
    759  1.1  mrg #ifdef SHARED
    760  1.1  mrg // Thunks for backward compatibility.
    761  1.1  mrg #ifdef L_fixtfdi
    762  1.1  mrg 	.text
    763  1.1  mrg 	.align 16
    764  1.1  mrg 	.global __fixtfti	// compatibility alias, NOTE(review): guard macro says di while the symbol is the ti variant
    765  1.1  mrg 	.proc __fixtfti
    766  1.1  mrg __fixtfti:
    767  1.1  mrg 	{ .bbb
    768  1.1  mrg 	  br.sptk.many __fixxfti	// forward to __fixxfti (defined elsewhere), arguments and rp untouched
    769  1.1  mrg 	  ;;
    770  1.1  mrg 	}
    771  1.1  mrg 	.endp __fixtfti
    772  1.1  mrg #endif
    773  1.1  mrg #ifdef L_fixunstfdi
    774  1.1  mrg 	.align 16	// NOTE(review): no .text directive here, unlike the __fixtfti thunk
    775  1.1  mrg 	.global __fixunstfti	// compatibility alias, NOTE(review): guard macro says di while the symbol is the ti variant
    776  1.1  mrg 	.proc __fixunstfti
    777  1.1  mrg __fixunstfti:
    778  1.1  mrg 	{ .bbb
    779  1.1  mrg 	  br.sptk.many __fixunsxfti	// forward to __fixunsxfti (defined elsewhere), arguments and rp untouched
    780  1.1  mrg 	  ;;
    781  1.1  mrg 	}
    782  1.1  mrg 	.endp __fixunstfti
    783  1.1  mrg #endif
    784  1.1  mrg #ifdef L_floatditf
    785  1.1  mrg 	.align 16	// NOTE(review): no .text directive here, unlike the __fixtfti thunk
    786  1.1  mrg 	.global __floattitf	// compatibility alias, NOTE(review): guard macro says di while the symbol is the ti variant
    787  1.1  mrg 	.proc __floattitf
    788  1.1  mrg __floattitf:
    789  1.1  mrg 	{ .bbb
    790  1.1  mrg 	  br.sptk.many __floattixf	// forward to __floattixf (defined elsewhere), arguments and rp untouched
    791  1.1  mrg 	  ;;
    792  1.1  mrg 	}
    793  1.1  mrg 	.endp __floattitf
    794  1.1  mrg #endif
    795  1.1  mrg #endif
    796