Home | History | Annotate | Line # | Download | only in ia64
dive_1.asm revision 1.1
      1 dnl  IA-64 mpn_divexact_1 -- mpn by limb exact division.
      2 
      3 dnl  Copyright 2003, 2004, 2005 Free Software Foundation, Inc.
      4 
      5 dnl  This file is part of the GNU MP Library.
      6 
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8 dnl  it under the terms of the GNU Lesser General Public License as published
      9 dnl  by the Free Software Foundation; either version 3 of the License, or (at
     10 dnl  your option) any later version.
     11 
     12 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     13 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     14 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
     15 dnl  License for more details.
     16 
     17 dnl  You should have received a copy of the GNU Lesser General Public License
     18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
     19 
     20 include(`../config.m4')
     21 
     22 C            cycles/limb
     23 C Itanium:      16
     24 C Itanium 2:     8
     25 
     26 C INPUT PARAMETERS
     27 define(`rp', `r32')
     28 define(`up', `r33')
     29 define(`n',  `r34')
     30 define(`divisor', `r35')
     31 
     32 define(`lshift', `r24')
     33 define(`rshift', `r25')
     34 
     35 C This code is a bit messy, and not as similar to mode1o.asm as desired.
     36 
     37 C The critical path during initialization is for computing the inverse of the
     38 C divisor.  Since odd divisors are probably common, we conditionally execute
     39 C the initial count_traling_zeros code and the downshift.
     40 
     41 C Possible improvement: Merge more of the feed-in code into the inverse
     42 C computation.
     43 
     44 ASM_START()
     45 	.text
     46 	.align	32
     47 .Ltab:
     48 data1	0,0x01, 0,0xAB, 0,0xCD, 0,0xB7, 0,0x39, 0,0xA3, 0,0xC5, 0,0xEF
     49 data1	0,0xF1, 0,0x1B, 0,0x3D, 0,0xA7, 0,0x29, 0,0x13, 0,0x35, 0,0xDF
     50 data1	0,0xE1, 0,0x8B, 0,0xAD, 0,0x97, 0,0x19, 0,0x83, 0,0xA5, 0,0xCF
     51 data1	0,0xD1, 0,0xFB, 0,0x1D, 0,0x87, 0,0x09, 0,0xF3, 0,0x15, 0,0xBF
     52 data1	0,0xC1, 0,0x6B, 0,0x8D, 0,0x77, 0,0xF9, 0,0x63, 0,0x85, 0,0xAF
     53 data1	0,0xB1, 0,0xDB, 0,0xFD, 0,0x67, 0,0xE9, 0,0xD3, 0,0xF5, 0,0x9F
     54 data1	0,0xA1, 0,0x4B, 0,0x6D, 0,0x57, 0,0xD9, 0,0x43, 0,0x65, 0,0x8F
     55 data1	0,0x91, 0,0xBB, 0,0xDD, 0,0x47, 0,0xC9, 0,0xB3, 0,0xD5, 0,0x7F
     56 data1	0,0x81, 0,0x2B, 0,0x4D, 0,0x37, 0,0xB9, 0,0x23, 0,0x45, 0,0x6F
     57 data1	0,0x71, 0,0x9B, 0,0xBD, 0,0x27, 0,0xA9, 0,0x93, 0,0xB5, 0,0x5F
     58 data1	0,0x61, 0,0x0B, 0,0x2D, 0,0x17, 0,0x99, 0,0x03, 0,0x25, 0,0x4F
     59 data1	0,0x51, 0,0x7B, 0,0x9D, 0,0x07, 0,0x89, 0,0x73, 0,0x95, 0,0x3F
     60 data1	0,0x41, 0,0xEB, 0,0x0D, 0,0xF7, 0,0x79, 0,0xE3, 0,0x05, 0,0x2F
     61 data1	0,0x31, 0,0x5B, 0,0x7D, 0,0xE7, 0,0x69, 0,0x53, 0,0x75, 0,0x1F
     62 data1	0,0x21, 0,0xCB, 0,0xED, 0,0xD7, 0,0x59, 0,0xC3, 0,0xE5, 0,0x0F
     63 data1	0,0x11, 0,0x3B, 0,0x5D, 0,0xC7, 0,0x49, 0,0x33, 0,0x55, 0,0xFF
     64 
     65 
     66 PROLOGUE(mpn_divexact_1)
     67 	.prologue
     68 	.save		ar.lc, r2
     69 	.body
     70 
     71  {.mmi;	add		r8 = -1, divisor	C M0
     72 	nop		0			C M1
     73 	tbit.z		p8, p9 = divisor, 0	C I0
     74 }
     75 ifdef(`HAVE_ABI_32',
     76 `	addp4		rp = 0, rp		C M2  rp extend
     77 	addp4		up = 0, up		C M3  up extend
     78 	sxt4		n = n')			C I1  size extend
     79 	;;
     80 .Lhere:
     81  {.mmi;	ld8		r20 = [up], 8		C M0  up[0]
     82   (p8)	andcm		r8 = r8, divisor	C M1
     83 	mov		r15 = ip		C I0  .Lhere
     84 	;;
     85 }{.mii
     86 	.pred.rel "mutex", p8, p9
     87   (p9)	mov		rshift = 0		C M0
     88   (p8)	popcnt		rshift = r8		C I0 r8 = cnt_lo_zeros(divisor)
     89 	cmp.eq		p6, p10 = 1, n		C I1
     90 	;;
     91 }{.mii;	add		r9 = .Ltab-.Lhere, r15	C M0
     92   (p8)	shr.u		divisor = divisor, rshift C I0
     93 	nop		0			C I1
     94 	;;
     95 }{.mmi;	add		n = -4, n		C M0  size-1
     96   (p10)	ld8		r21 = [up], 8		C M1  up[1]
     97 	mov		r14 = 2			C M1  2
     98 }{.mfi;	setf.sig	f6 = divisor		C M2  divisor
     99 	mov		f9 = f0			C M3  carry		FIXME
    100 	zxt1		r3 = divisor		C I1  divisor low byte
    101 	;;
    102 }{.mmi;	add		r3 = r9, r3		C M0  table offset ip and index
    103 	sub		r16 = 0, divisor	C M1  -divisor
    104 	mov		r2 = ar.lc		C I0
    105 }{.mmi;	sub		lshift = 64, rshift	C M2
    106 	setf.sig	f13 = r14		C M3  2 in significand
    107 	mov		r17 = -1		C I1  -1
    108 	;;
    109 }{.mmi;	ld1		r3 = [r3]		C M0  inverse, 8 bits
    110 	nop		0			C M1
    111 	mov		ar.lc = n		C I0  size-1 loop count
    112 }{.mmi;	setf.sig	f12 = r16		C M2  -divisor
    113 	setf.sig	f8 = r17		C M3  -1
    114 	cmp.eq		p7, p0 = -2, n		C I1
    115 	;;
    116 }{.mmi;	setf.sig	f7 = r3			C M2  inverse, 8 bits
    117 	cmp.eq		p8, p0 = -1, n		C M0
    118 	shr.u		r23 = r20, rshift	C I0
    119 	;;
    120 }
    121 
    122 	C f6	divisor
    123 	C f7	inverse, being calculated
    124 	C f8	-1, will be -inverse
    125 	C f9	carry
    126 	C f12	-divisor
    127 	C f13	2
    128 	C f14	scratch
    129 
    130 	xmpy.l		f14 = f13, f7		C Newton 2*i
    131 	xmpy.l		f7 = f7, f7		C Newton i*i
    132 	;;
    133 	xma.l		f7 = f7, f12, f14	C Newton i*i*-d + 2*i, 16 bits
    134 	;;
    135 	setf.sig	f10 = r23		C speculative, used iff n = 1
    136 	xmpy.l		f14 = f13, f7		C Newton 2*i
    137 	shl		r22 = r21, lshift	C speculative, used iff n > 1
    138 	xmpy.l		f7 = f7, f7		C Newton i*i
    139 	;;
    140 	or		r31 = r22, r23		C speculative, used iff n > 1
    141 	xma.l		f7 = f7, f12, f14	C Newton i*i*-d + 2*i, 32 bits
    142 	shr.u		r23 = r21, rshift	C speculative, used iff n > 1
    143 	;;
    144 	setf.sig	f11 = r31		C speculative, used iff n > 1
    145 	xmpy.l		f14 = f13, f7		C Newton 2*i
    146 	xmpy.l		f7 = f7, f7		C Newton i*i
    147 	;;
    148 	xma.l		f7 = f7, f12, f14	C Newton i*i*-d + 2*i, 64 bits
    149 
    150   (p7)	br.cond.dptk	.Ln2
    151   (p10)	br.cond.dptk	.grt3
    152 	;;
    153 
    154 .Ln1:	xmpy.l		f12 = f10, f7		C q = ulimb * inverse
    155 	br		.Lx1
    156 
    157 .Ln2:
    158 	xmpy.l		f8 = f7, f8		C -inverse = inverse * -1
    159 	xmpy.l		f12 = f11, f7		C q = ulimb * inverse
    160 	setf.sig	f11 = r23
    161 	br		.Lx2
    162 
    163 .grt3:
    164 	ld8		r21 = [up], 8		C up[2]
    165 	xmpy.l		f8 = f7, f8		C -inverse = inverse * -1
    166 	;;
    167 	shl		r22 = r21, lshift
    168 	;;
    169 	xmpy.l		f12 = f11, f7		C q = ulimb * inverse
    170 	;;
    171 	or		r31 = r22, r23
    172 	shr.u		r23 = r21, rshift
    173 	;;
    174 	setf.sig	f11 = r31
    175   (p8)	br.cond.dptk	.Lx3			C branch for n = 3
    176 	;;
    177 	ld8		r21 = [up], 8
    178 	br		.Lent
    179 
    180 .Loop:	ld8		r21 = [up], 8
    181 	xma.l		f12 = f9, f8, f10	C q = c * -inverse + si
    182 	;;
    183 .Lent:	add		r16 = 160, up
    184 	shl		r22 = r21, lshift
    185 	;;
    186 	stf8		[rp] = f12, 8
    187 	xma.hu		f9 = f12, f6, f9	C c = high(q * divisor + c)
    188 	xmpy.l		f10 = f11, f7		C si = ulimb * inverse
    189 	;;
    190 	or		r31 = r22, r23
    191 	shr.u		r23 = r21, rshift
    192 	;;
    193 	lfetch		[r16]
    194 	setf.sig	f11 = r31
    195 	br.cloop.sptk.few.clr .Loop
    196 
    197 
    198 	xma.l		f12 = f9, f8, f10	C q = c * -inverse + si
    199 	;;
    200 .Lx3:	stf8		[rp] = f12, 8
    201 	xma.hu		f9 = f12, f6, f9	C c = high(q * divisor + c)
    202 	xmpy.l		f10 = f11, f7		C si = ulimb * inverse
    203 	;;
    204 	setf.sig	f11 = r23
    205 	;;
    206 	xma.l		f12 = f9, f8, f10	C q = c * -inverse + si
    207 	;;
    208 .Lx2:	stf8		[rp] = f12, 8
    209 	xma.hu		f9 = f12, f6, f9	C c = high(q * divisor + c)
    210 	xmpy.l		f10 = f11, f7		C si = ulimb * inverse
    211 	;;
    212 	xma.l		f12 = f9, f8, f10	C q = c * -inverse + si
    213 	;;
    214 .Lx1:	stf8		[rp] = f12, 8
    215 	mov		ar.lc = r2		C I0
    216 	br.ret.sptk.many b0
    217 EPILOGUE()
    218