Home | History | Annotate | Line # | Download | only in ia64
dive_1.asm revision 1.1.1.1.8.1
      1 dnl  IA-64 mpn_divexact_1 -- mpn by limb exact division.
      2 
      3 dnl  Contributed to the GNU project by Torbjorn Granlund and Kevin Ryde.
      4 
      5 dnl  Copyright 2003, 2004, 2005, 2010 Free Software Foundation, Inc.
      6 
      7 dnl  This file is part of the GNU MP Library.
      8 
      9 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     10 dnl  it under the terms of the GNU Lesser General Public License as published
     11 dnl  by the Free Software Foundation; either version 3 of the License, or (at
     12 dnl  your option) any later version.
     13 
     14 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     15 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     16 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
     17 dnl  License for more details.
     18 
     19 dnl  You should have received a copy of the GNU Lesser General Public License
     20 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
     21 
     22 include(`../config.m4')
     23 
     24 C            cycles/limb
     25 C Itanium:      16
     26 C Itanium 2:     8
     27 
     28 C INPUT PARAMETERS
     29 define(`rp', `r32')
     30 define(`up', `r33')
     31 define(`n',  `r34')
     32 define(`divisor', `r35')
     33 
     34 define(`lshift', `r24')
     35 define(`rshift', `r25')
     36 
     37 C This code is a bit messy, and not as similar to mode1o.asm as desired.
     38 
     39 C The critical path during initialization is for computing the inverse of the
     40 C divisor.  Since odd divisors are probably common, we conditionally execute
     41 C the initial count_traling_zeros code and the downshift.
     42 
     43 C Possible improvement: Merge more of the feed-in code into the inverse
     44 C computation.
     45 
     46 ASM_START()
     47 	.text
     48 	.align	32
     49 .Ltab:
     50 data1	0,0x01, 0,0xAB, 0,0xCD, 0,0xB7, 0,0x39, 0,0xA3, 0,0xC5, 0,0xEF
     51 data1	0,0xF1, 0,0x1B, 0,0x3D, 0,0xA7, 0,0x29, 0,0x13, 0,0x35, 0,0xDF
     52 data1	0,0xE1, 0,0x8B, 0,0xAD, 0,0x97, 0,0x19, 0,0x83, 0,0xA5, 0,0xCF
     53 data1	0,0xD1, 0,0xFB, 0,0x1D, 0,0x87, 0,0x09, 0,0xF3, 0,0x15, 0,0xBF
     54 data1	0,0xC1, 0,0x6B, 0,0x8D, 0,0x77, 0,0xF9, 0,0x63, 0,0x85, 0,0xAF
     55 data1	0,0xB1, 0,0xDB, 0,0xFD, 0,0x67, 0,0xE9, 0,0xD3, 0,0xF5, 0,0x9F
     56 data1	0,0xA1, 0,0x4B, 0,0x6D, 0,0x57, 0,0xD9, 0,0x43, 0,0x65, 0,0x8F
     57 data1	0,0x91, 0,0xBB, 0,0xDD, 0,0x47, 0,0xC9, 0,0xB3, 0,0xD5, 0,0x7F
     58 data1	0,0x81, 0,0x2B, 0,0x4D, 0,0x37, 0,0xB9, 0,0x23, 0,0x45, 0,0x6F
     59 data1	0,0x71, 0,0x9B, 0,0xBD, 0,0x27, 0,0xA9, 0,0x93, 0,0xB5, 0,0x5F
     60 data1	0,0x61, 0,0x0B, 0,0x2D, 0,0x17, 0,0x99, 0,0x03, 0,0x25, 0,0x4F
     61 data1	0,0x51, 0,0x7B, 0,0x9D, 0,0x07, 0,0x89, 0,0x73, 0,0x95, 0,0x3F
     62 data1	0,0x41, 0,0xEB, 0,0x0D, 0,0xF7, 0,0x79, 0,0xE3, 0,0x05, 0,0x2F
     63 data1	0,0x31, 0,0x5B, 0,0x7D, 0,0xE7, 0,0x69, 0,0x53, 0,0x75, 0,0x1F
     64 data1	0,0x21, 0,0xCB, 0,0xED, 0,0xD7, 0,0x59, 0,0xC3, 0,0xE5, 0,0x0F
     65 data1	0,0x11, 0,0x3B, 0,0x5D, 0,0xC7, 0,0x49, 0,0x33, 0,0x55, 0,0xFF
     66 
     67 
     68 PROLOGUE(mpn_divexact_1)
     69 	.prologue
     70 	.save		ar.lc, r2
     71 	.body
     72 
     73  {.mmi;	add		r8 = -1, divisor	C M0
     74 	nop		0			C M1
     75 	tbit.z		p8, p9 = divisor, 0	C I0
     76 }
     77 ifdef(`HAVE_ABI_32',
     78 `	addp4		rp = 0, rp		C M2  rp extend
     79 	addp4		up = 0, up		C M3  up extend
     80 	sxt4		n = n')			C I1  size extend
     81 	;;
     82 .Lhere:
     83  {.mmi;	ld8		r20 = [up], 8		C M0  up[0]
     84   (p8)	andcm		r8 = r8, divisor	C M1
     85 	mov		r15 = ip		C I0  .Lhere
     86 	;;
     87 }{.mii
     88 	.pred.rel "mutex", p8, p9
     89   (p9)	mov		rshift = 0		C M0
     90   (p8)	popcnt		rshift = r8		C I0 r8 = cnt_lo_zeros(divisor)
     91 	cmp.eq		p6, p10 = 1, n		C I1
     92 	;;
     93 }{.mii;	add		r9 = .Ltab-.Lhere, r15	C M0
     94   (p8)	shr.u		divisor = divisor, rshift C I0
     95 	nop		0			C I1
     96 	;;
     97 }{.mmi;	add		n = -4, n		C M0  size-1
     98   (p10)	ld8		r21 = [up], 8		C M1  up[1]
     99 	mov		r14 = 2			C M1  2
    100 }{.mfi;	setf.sig	f6 = divisor		C M2  divisor
    101 	mov		f9 = f0			C M3  carry		FIXME
    102 	zxt1		r3 = divisor		C I1  divisor low byte
    103 	;;
    104 }{.mmi;	add		r3 = r9, r3		C M0  table offset ip and index
    105 	sub		r16 = 0, divisor	C M1  -divisor
    106 	mov		r2 = ar.lc		C I0
    107 }{.mmi;	sub		lshift = 64, rshift	C M2
    108 	setf.sig	f13 = r14		C M3  2 in significand
    109 	mov		r17 = -1		C I1  -1
    110 	;;
    111 }{.mmi;	ld1		r3 = [r3]		C M0  inverse, 8 bits
    112 	nop		0			C M1
    113 	mov		ar.lc = n		C I0  size-1 loop count
    114 }{.mmi;	setf.sig	f12 = r16		C M2  -divisor
    115 	setf.sig	f8 = r17		C M3  -1
    116 	cmp.eq		p7, p0 = -2, n		C I1
    117 	;;
    118 }{.mmi;	setf.sig	f7 = r3			C M2  inverse, 8 bits
    119 	cmp.eq		p8, p0 = -1, n		C M0
    120 	shr.u		r23 = r20, rshift	C I0
    121 	;;
    122 }
    123 
    124 	C f6	divisor
    125 	C f7	inverse, being calculated
    126 	C f8	-1, will be -inverse
    127 	C f9	carry
    128 	C f12	-divisor
    129 	C f13	2
    130 	C f14	scratch
    131 
    132 	xmpy.l		f14 = f13, f7		C Newton 2*i
    133 	xmpy.l		f7 = f7, f7		C Newton i*i
    134 	;;
    135 	xma.l		f7 = f7, f12, f14	C Newton i*i*-d + 2*i, 16 bits
    136 	;;
    137 	setf.sig	f10 = r23		C speculative, used iff n = 1
    138 	xmpy.l		f14 = f13, f7		C Newton 2*i
    139 	shl		r22 = r21, lshift	C speculative, used iff n > 1
    140 	xmpy.l		f7 = f7, f7		C Newton i*i
    141 	;;
    142 	or		r31 = r22, r23		C speculative, used iff n > 1
    143 	xma.l		f7 = f7, f12, f14	C Newton i*i*-d + 2*i, 32 bits
    144 	shr.u		r23 = r21, rshift	C speculative, used iff n > 1
    145 	;;
    146 	setf.sig	f11 = r31		C speculative, used iff n > 1
    147 	xmpy.l		f14 = f13, f7		C Newton 2*i
    148 	xmpy.l		f7 = f7, f7		C Newton i*i
    149 	;;
    150 	xma.l		f7 = f7, f12, f14	C Newton i*i*-d + 2*i, 64 bits
    151 
    152   (p7)	br.cond.dptk	.Ln2
    153   (p10)	br.cond.dptk	.grt3
    154 	;;
    155 
    156 .Ln1:	xmpy.l		f12 = f10, f7		C q = ulimb * inverse
    157 	br		.Lx1
    158 
    159 .Ln2:
    160 	xmpy.l		f8 = f7, f8		C -inverse = inverse * -1
    161 	xmpy.l		f12 = f11, f7		C q = ulimb * inverse
    162 	setf.sig	f11 = r23
    163 	br		.Lx2
    164 
    165 .grt3:
    166 	ld8		r21 = [up], 8		C up[2]
    167 	xmpy.l		f8 = f7, f8		C -inverse = inverse * -1
    168 	;;
    169 	shl		r22 = r21, lshift
    170 	;;
    171 	xmpy.l		f12 = f11, f7		C q = ulimb * inverse
    172 	;;
    173 	or		r31 = r22, r23
    174 	shr.u		r23 = r21, rshift
    175 	;;
    176 	setf.sig	f11 = r31
    177   (p8)	br.cond.dptk	.Lx3			C branch for n = 3
    178 	;;
    179 	ld8		r21 = [up], 8
    180 	br		.Lent
    181 
    182 .Ltop:	ld8		r21 = [up], 8
    183 	xma.l		f12 = f9, f8, f10	C q = c * -inverse + si
    184 	nop.b		0
    185 	;;
    186 .Lent:	add		r16 = 160, up
    187 	shl		r22 = r21, lshift
    188 	nop.b		0
    189 	;;
    190 	stf8		[rp] = f12, 8
    191 	xma.hu		f9 = f12, f6, f9	C c = high(q * divisor + c)
    192 	nop.b		0
    193 	nop.m		0
    194 	xmpy.l		f10 = f11, f7		C si = ulimb * inverse
    195 	nop.b		0
    196 	;;
    197 	or		r31 = r22, r23
    198 	shr.u		r23 = r21, rshift
    199 	nop.b		0
    200 	;;
    201 	lfetch		[r16]
    202 	setf.sig	f11 = r31
    203 	br.cloop.sptk.few.clr .Ltop
    204 
    205 
    206 	xma.l		f12 = f9, f8, f10	C q = c * -inverse + si
    207 	;;
    208 .Lx3:	stf8		[rp] = f12, 8
    209 	xma.hu		f9 = f12, f6, f9	C c = high(q * divisor + c)
    210 	xmpy.l		f10 = f11, f7		C si = ulimb * inverse
    211 	;;
    212 	setf.sig	f11 = r23
    213 	;;
    214 	xma.l		f12 = f9, f8, f10	C q = c * -inverse + si
    215 	;;
    216 .Lx2:	stf8		[rp] = f12, 8
    217 	xma.hu		f9 = f12, f6, f9	C c = high(q * divisor + c)
    218 	xmpy.l		f10 = f11, f7		C si = ulimb * inverse
    219 	;;
    220 	xma.l		f12 = f9, f8, f10	C q = c * -inverse + si
    221 	;;
    222 .Lx1:	stf8		[rp] = f12, 8
    223 	mov		ar.lc = r2		C I0
    224 	br.ret.sptk.many b0
    225 EPILOGUE()
    226