Home | History | Annotate | Line # | Download | only in ia64
dive_1.asm revision 1.1.1.1.8.1
      1          1.1  mrg dnl  IA-64 mpn_divexact_1 -- mpn by limb exact division.
      2          1.1  mrg 
      3  1.1.1.1.8.1  tls dnl  Contributed to the GNU project by Torbjorn Granlund and Kevin Ryde.
      4  1.1.1.1.8.1  tls 
      5  1.1.1.1.8.1  tls dnl  Copyright 2003, 2004, 2005, 2010 Free Software Foundation, Inc.
      6          1.1  mrg 
      7          1.1  mrg dnl  This file is part of the GNU MP Library.
      8          1.1  mrg 
      9          1.1  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     10          1.1  mrg dnl  it under the terms of the GNU Lesser General Public License as published
     11          1.1  mrg dnl  by the Free Software Foundation; either version 3 of the License, or (at
     12          1.1  mrg dnl  your option) any later version.
     13          1.1  mrg 
     14          1.1  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     15          1.1  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     16          1.1  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
     17          1.1  mrg dnl  License for more details.
     18          1.1  mrg 
     19          1.1  mrg dnl  You should have received a copy of the GNU Lesser General Public License
     20          1.1  mrg dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
     21          1.1  mrg 
     22          1.1  mrg include(`../config.m4')
     23          1.1  mrg 
     24          1.1  mrg C            cycles/limb
     25          1.1  mrg C Itanium:      16
     26          1.1  mrg C Itanium 2:     8
     27          1.1  mrg 
     28          1.1  mrg C INPUT PARAMETERS
     29          1.1  mrg define(`rp', `r32')	C destination, quotient limbs are stored here
     30          1.1  mrg define(`up', `r33')	C source limbs are loaded from here
     31          1.1  mrg define(`n',  `r34')	C number of limbs
     32          1.1  mrg define(`divisor', `r35')	C the limb to divide by
     33          1.1  mrg 
     34          1.1  mrg define(`lshift', `r24')	C 64 - rshift
     35          1.1  mrg define(`rshift', `r25')	C count of low zero bits of divisor, 0 if odd
     36          1.1  mrg 
     37          1.1  mrg C This code is a bit messy, and not as similar to mode1o.asm as desired.
     38          1.1  mrg 
     39          1.1  mrg C The critical path during initialization is for computing the inverse of the
     40          1.1  mrg C divisor.  Since odd divisors are probably common, we conditionally execute
     41          1.1  mrg C the initial count_trailing_zeros code and the downshift.
     42          1.1  mrg 
     43          1.1  mrg C Possible improvement: Merge more of the feed-in code into the inverse
     44          1.1  mrg C computation.
     45          1.1  mrg 
     46          1.1  mrg ASM_START()
     47          1.1  mrg 	.text
     48          1.1  mrg 	.align	32
     49          1.1  mrg .Ltab:	C Byte table, indexed by the low byte of the odd divisor, reached via .Ltab-.Lhere.
     50          1.1  mrg data1	0,0x01, 0,0xAB, 0,0xCD, 0,0xB7, 0,0x39, 0,0xA3, 0,0xC5, 0,0xEF	C Odd index i holds the inverse of i mod 256, even slots are 0.
     51          1.1  mrg data1	0,0xF1, 0,0x1B, 0,0x3D, 0,0xA7, 0,0x29, 0,0x13, 0,0x35, 0,0xDF
     52          1.1  mrg data1	0,0xE1, 0,0x8B, 0,0xAD, 0,0x97, 0,0x19, 0,0x83, 0,0xA5, 0,0xCF
     53          1.1  mrg data1	0,0xD1, 0,0xFB, 0,0x1D, 0,0x87, 0,0x09, 0,0xF3, 0,0x15, 0,0xBF
     54          1.1  mrg data1	0,0xC1, 0,0x6B, 0,0x8D, 0,0x77, 0,0xF9, 0,0x63, 0,0x85, 0,0xAF
     55          1.1  mrg data1	0,0xB1, 0,0xDB, 0,0xFD, 0,0x67, 0,0xE9, 0,0xD3, 0,0xF5, 0,0x9F
     56          1.1  mrg data1	0,0xA1, 0,0x4B, 0,0x6D, 0,0x57, 0,0xD9, 0,0x43, 0,0x65, 0,0x8F
     57          1.1  mrg data1	0,0x91, 0,0xBB, 0,0xDD, 0,0x47, 0,0xC9, 0,0xB3, 0,0xD5, 0,0x7F
     58          1.1  mrg data1	0,0x81, 0,0x2B, 0,0x4D, 0,0x37, 0,0xB9, 0,0x23, 0,0x45, 0,0x6F
     59          1.1  mrg data1	0,0x71, 0,0x9B, 0,0xBD, 0,0x27, 0,0xA9, 0,0x93, 0,0xB5, 0,0x5F
     60          1.1  mrg data1	0,0x61, 0,0x0B, 0,0x2D, 0,0x17, 0,0x99, 0,0x03, 0,0x25, 0,0x4F
     61          1.1  mrg data1	0,0x51, 0,0x7B, 0,0x9D, 0,0x07, 0,0x89, 0,0x73, 0,0x95, 0,0x3F
     62          1.1  mrg data1	0,0x41, 0,0xEB, 0,0x0D, 0,0xF7, 0,0x79, 0,0xE3, 0,0x05, 0,0x2F
     63          1.1  mrg data1	0,0x31, 0,0x5B, 0,0x7D, 0,0xE7, 0,0x69, 0,0x53, 0,0x75, 0,0x1F
     64          1.1  mrg data1	0,0x21, 0,0xCB, 0,0xED, 0,0xD7, 0,0x59, 0,0xC3, 0,0xE5, 0,0x0F
     65          1.1  mrg data1	0,0x11, 0,0x3B, 0,0x5D, 0,0xC7, 0,0x49, 0,0x33, 0,0x55, 0,0xFF
     66          1.1  mrg 
     67          1.1  mrg 
     68          1.1  mrg PROLOGUE(mpn_divexact_1)
     69          1.1  mrg 	.prologue
     70          1.1  mrg 	.save		ar.lc, r2	C ar.lc is copied to r2 below and restored before return
     71          1.1  mrg 	.body
     72          1.1  mrg C Exact division of the n limbs at up by divisor, quotient limbs stored at rp.
     73          1.1  mrg  {.mmi;	add		r8 = -1, divisor	C M0  divisor-1
     74          1.1  mrg 	nop		0			C M1
     75          1.1  mrg 	tbit.z		p8, p9 = divisor, 0	C I0  p8: divisor even, p9: divisor odd
     76          1.1  mrg }
     77          1.1  mrg ifdef(`HAVE_ABI_32',
     78          1.1  mrg `	addp4		rp = 0, rp		C M2  rp extend
     79          1.1  mrg 	addp4		up = 0, up		C M3  up extend
     80          1.1  mrg 	sxt4		n = n')			C I1  size extend
     81          1.1  mrg 	;;
     82          1.1  mrg .Lhere:
     83          1.1  mrg  {.mmi;	ld8		r20 = [up], 8		C M0  up[0]
     84          1.1  mrg   (p8)	andcm		r8 = r8, divisor	C M1  (divisor-1) & ~divisor, mask of low zero bits
     85          1.1  mrg 	mov		r15 = ip		C I0  .Lhere
     86          1.1  mrg 	;;
     87          1.1  mrg }{.mii
     88          1.1  mrg 	.pred.rel "mutex", p8, p9
     89          1.1  mrg   (p9)	mov		rshift = 0		C M0
     90          1.1  mrg   (p8)	popcnt		rshift = r8		C I0  rshift = popcnt(r8) = cnt_lo_zeros(divisor)
     91          1.1  mrg 	cmp.eq		p6, p10 = 1, n		C I1  p6: size = 1, p10: size != 1
     92          1.1  mrg 	;;
     93          1.1  mrg }{.mii;	add		r9 = .Ltab-.Lhere, r15	C M0  r9 = address of .Ltab
     94          1.1  mrg   (p8)	shr.u		divisor = divisor, rshift C I0  strip low zero bits, divisor now odd
     95          1.1  mrg 	nop		0			C I1
     96          1.1  mrg 	;;
     97          1.1  mrg }{.mmi;	add		n = -4, n		C M0  size-4
     98          1.1  mrg   (p10)	ld8		r21 = [up], 8		C M1  up[1]
     99          1.1  mrg 	mov		r14 = 2			C M1  2
    100          1.1  mrg }{.mfi;	setf.sig	f6 = divisor		C M2  divisor
    101          1.1  mrg 	mov		f9 = f0			C M3  carry		FIXME
    102          1.1  mrg 	zxt1		r3 = divisor		C I1  divisor low byte
    103          1.1  mrg 	;;
    104          1.1  mrg }{.mmi;	add		r3 = r9, r3		C M0  table offset ip and index
    105          1.1  mrg 	sub		r16 = 0, divisor	C M1  -divisor
    106          1.1  mrg 	mov		r2 = ar.lc		C I0
    107          1.1  mrg }{.mmi;	sub		lshift = 64, rshift	C M2
    108          1.1  mrg 	setf.sig	f13 = r14		C M3  2 in significand
    109          1.1  mrg 	mov		r17 = -1		C I1  -1
    110          1.1  mrg 	;;
    111          1.1  mrg }{.mmi;	ld1		r3 = [r3]		C M0  inverse, 8 bits
    112          1.1  mrg 	nop		0			C M1
    113          1.1  mrg 	mov		ar.lc = n		C I0  size-4 loop count
    114          1.1  mrg }{.mmi;	setf.sig	f12 = r16		C M2  -divisor
    115          1.1  mrg 	setf.sig	f8 = r17		C M3  -1
    116          1.1  mrg 	cmp.eq		p7, p0 = -2, n		C I1  p7: size = 2
    117          1.1  mrg 	;;
    118          1.1  mrg }{.mmi;	setf.sig	f7 = r3			C M2  inverse, 8 bits
    119          1.1  mrg 	cmp.eq		p8, p0 = -1, n		C M0  p8: size = 3 (p8 reused)
    120          1.1  mrg 	shr.u		r23 = r20, rshift	C I0  up[0] >> rshift
    121          1.1  mrg 	;;
    122          1.1  mrg }
    123          1.1  mrg 
    124          1.1  mrg 	C f6	divisor
    125          1.1  mrg 	C f7	inverse, being calculated
    126          1.1  mrg 	C f8	-1, will be -inverse
    127          1.1  mrg 	C f9	carry
    128          1.1  mrg 	C f12	-divisor
    129          1.1  mrg 	C f13	2
    130          1.1  mrg 	C f14	scratch
    131          1.1  mrg 	C f10 si, f11 shifted source limb; f12 is reused for q once the inverse is done
    132          1.1  mrg 	xmpy.l		f14 = f13, f7		C Newton 2*i
    133          1.1  mrg 	xmpy.l		f7 = f7, f7		C Newton i*i
    134          1.1  mrg 	;;
    135          1.1  mrg 	xma.l		f7 = f7, f12, f14	C Newton i*i*-d + 2*i, 16 bits
    136          1.1  mrg 	;;
    137          1.1  mrg 	setf.sig	f10 = r23		C speculative, used iff n = 1
    138          1.1  mrg 	xmpy.l		f14 = f13, f7		C Newton 2*i
    139          1.1  mrg 	shl		r22 = r21, lshift	C speculative, used iff n > 1
    140          1.1  mrg 	xmpy.l		f7 = f7, f7		C Newton i*i
    141          1.1  mrg 	;;
    142          1.1  mrg 	or		r31 = r22, r23		C speculative, used iff n > 1
    143          1.1  mrg 	xma.l		f7 = f7, f12, f14	C Newton i*i*-d + 2*i, 32 bits
    144          1.1  mrg 	shr.u		r23 = r21, rshift	C speculative, used iff n > 1
    145          1.1  mrg 	;;
    146          1.1  mrg 	setf.sig	f11 = r31		C speculative, used iff n > 1
    147          1.1  mrg 	xmpy.l		f14 = f13, f7		C Newton 2*i
    148          1.1  mrg 	xmpy.l		f7 = f7, f7		C Newton i*i
    149          1.1  mrg 	;;
    150          1.1  mrg 	xma.l		f7 = f7, f12, f14	C Newton i*i*-d + 2*i, 64 bits
    151          1.1  mrg 
    152          1.1  mrg   (p7)	br.cond.dptk	.Ln2	C size = 2
    153          1.1  mrg   (p10)	br.cond.dptk	.grt3	C size != 1, and size = 2 already branched above
    154          1.1  mrg 	;;
    155          1.1  mrg 
    156          1.1  mrg .Ln1:	xmpy.l		f12 = f10, f7		C q = ulimb * inverse
    157          1.1  mrg 	br		.Lx1	C store the single quotient limb
    158          1.1  mrg 
    159          1.1  mrg .Ln2:
    160          1.1  mrg 	xmpy.l		f8 = f7, f8		C -inverse = inverse * -1
    161          1.1  mrg 	xmpy.l		f12 = f11, f7		C q = ulimb * inverse
    162          1.1  mrg 	setf.sig	f11 = r23	C last limb, high part only: up[1] >> rshift
    163          1.1  mrg 	br		.Lx2
    164          1.1  mrg 
    165          1.1  mrg .grt3:
    166          1.1  mrg 	ld8		r21 = [up], 8		C up[2]
    167          1.1  mrg 	xmpy.l		f8 = f7, f8		C -inverse = inverse * -1
    168          1.1  mrg 	;;
    169          1.1  mrg 	shl		r22 = r21, lshift	C up[2] << lshift
    170          1.1  mrg 	;;
    171          1.1  mrg 	xmpy.l		f12 = f11, f7		C q = ulimb * inverse
    172          1.1  mrg 	;;
    173          1.1  mrg 	or		r31 = r22, r23	C (up[2] << lshift) | (up[1] >> rshift)
    174          1.1  mrg 	shr.u		r23 = r21, rshift	C up[2] >> rshift
    175          1.1  mrg 	;;
    176          1.1  mrg 	setf.sig	f11 = r31
    177          1.1  mrg   (p8)	br.cond.dptk	.Lx3			C branch for n = 3
    178          1.1  mrg 	;;
    179          1.1  mrg 	ld8		r21 = [up], 8	C up[3]
    180          1.1  mrg 	br		.Lent
    181          1.1  mrg 
    182  1.1.1.1.8.1  tls .Ltop:	ld8		r21 = [up], 8	C next source limb
    183          1.1  mrg 	xma.l		f12 = f9, f8, f10	C q = c * -inverse + si
    184  1.1.1.1.8.1  tls 	nop.b		0
    185          1.1  mrg 	;;
    186          1.1  mrg .Lent:	add		r16 = 160, up	C prefetch address, 160 bytes past up
    187          1.1  mrg 	shl		r22 = r21, lshift	C new limb << lshift
    188  1.1.1.1.8.1  tls 	nop.b		0
    189          1.1  mrg 	;;
    190          1.1  mrg 	stf8		[rp] = f12, 8
    191          1.1  mrg 	xma.hu		f9 = f12, f6, f9	C c = high(q * divisor + c)
    192  1.1.1.1.8.1  tls 	nop.b		0
    193  1.1.1.1.8.1  tls 	nop.m		0
    194          1.1  mrg 	xmpy.l		f10 = f11, f7		C si = ulimb * inverse
    195  1.1.1.1.8.1  tls 	nop.b		0
    196          1.1  mrg 	;;
    197          1.1  mrg 	or		r31 = r22, r23	C combine with previous limb >> rshift
    198          1.1  mrg 	shr.u		r23 = r21, rshift	C new limb >> rshift, kept for the next round
    199  1.1.1.1.8.1  tls 	nop.b		0
    200          1.1  mrg 	;;
    201          1.1  mrg 	lfetch		[r16]	C prefetch source
    202          1.1  mrg 	setf.sig	f11 = r31	C shifted source limb for the next si
    203  1.1.1.1.8.1  tls 	br.cloop.sptk.few.clr .Ltop	C counted loop on ar.lc
    204          1.1  mrg 
    205          1.1  mrg 
    206          1.1  mrg 	xma.l		f12 = f9, f8, f10	C q = c * -inverse + si
    207          1.1  mrg 	;;
    208          1.1  mrg .Lx3:	stf8		[rp] = f12, 8
    209          1.1  mrg 	xma.hu		f9 = f12, f6, f9	C c = high(q * divisor + c)
    210          1.1  mrg 	xmpy.l		f10 = f11, f7		C si = ulimb * inverse
    211          1.1  mrg 	;;
    212          1.1  mrg 	setf.sig	f11 = r23	C final limb, high part only
    213          1.1  mrg 	;;
    214          1.1  mrg 	xma.l		f12 = f9, f8, f10	C q = c * -inverse + si
    215          1.1  mrg 	;;
    216          1.1  mrg .Lx2:	stf8		[rp] = f12, 8
    217          1.1  mrg 	xma.hu		f9 = f12, f6, f9	C c = high(q * divisor + c)
    218          1.1  mrg 	xmpy.l		f10 = f11, f7		C si = ulimb * inverse
    219          1.1  mrg 	;;
    220          1.1  mrg 	xma.l		f12 = f9, f8, f10	C q = c * -inverse + si
    221          1.1  mrg 	;;
    222          1.1  mrg .Lx1:	stf8		[rp] = f12, 8
    223          1.1  mrg 	mov		ar.lc = r2		C I0  restore ar.lc
    224          1.1  mrg 	br.ret.sptk.many b0
    225          1.1  mrg EPILOGUE()
    226