dive_1.asm revision 1.1 1 dnl IA-64 mpn_divexact_1 -- mpn by limb exact division.
2
3 dnl Copyright 2003, 2004, 2005 Free Software Foundation, Inc.
4
5 dnl This file is part of the GNU MP Library.
6
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of the GNU Lesser General Public License as published
9 dnl by the Free Software Foundation; either version 3 of the License, or (at
10 dnl your option) any later version.
11
12 dnl The GNU MP Library is distributed in the hope that it will be useful, but
13 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15 dnl License for more details.
16
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
19
20 include(`../config.m4')
21
22 C cycles/limb
23 C Itanium: 16
24 C Itanium 2: 8
25
26 C INPUT PARAMETERS
27 define(`rp', `r32')
28 define(`up', `r33')
29 define(`n', `r34')
30 define(`divisor', `r35')
31
32 define(`lshift', `r24')
33 define(`rshift', `r25')
34
35 C This code is a bit messy, and not as similar to mode1o.asm as desired.
36
37 C The critical path during initialization is for computing the inverse of the
38 C divisor. Since odd divisors are probably common, we conditionally execute
39 C the initial count_traling_zeros code and the downshift.
40
41 C Possible improvement: Merge more of the feed-in code into the inverse
42 C computation.
43
44 ASM_START()
45 .text
46 .align 32
47 .Ltab:
48 data1 0,0x01, 0,0xAB, 0,0xCD, 0,0xB7, 0,0x39, 0,0xA3, 0,0xC5, 0,0xEF
49 data1 0,0xF1, 0,0x1B, 0,0x3D, 0,0xA7, 0,0x29, 0,0x13, 0,0x35, 0,0xDF
50 data1 0,0xE1, 0,0x8B, 0,0xAD, 0,0x97, 0,0x19, 0,0x83, 0,0xA5, 0,0xCF
51 data1 0,0xD1, 0,0xFB, 0,0x1D, 0,0x87, 0,0x09, 0,0xF3, 0,0x15, 0,0xBF
52 data1 0,0xC1, 0,0x6B, 0,0x8D, 0,0x77, 0,0xF9, 0,0x63, 0,0x85, 0,0xAF
53 data1 0,0xB1, 0,0xDB, 0,0xFD, 0,0x67, 0,0xE9, 0,0xD3, 0,0xF5, 0,0x9F
54 data1 0,0xA1, 0,0x4B, 0,0x6D, 0,0x57, 0,0xD9, 0,0x43, 0,0x65, 0,0x8F
55 data1 0,0x91, 0,0xBB, 0,0xDD, 0,0x47, 0,0xC9, 0,0xB3, 0,0xD5, 0,0x7F
56 data1 0,0x81, 0,0x2B, 0,0x4D, 0,0x37, 0,0xB9, 0,0x23, 0,0x45, 0,0x6F
57 data1 0,0x71, 0,0x9B, 0,0xBD, 0,0x27, 0,0xA9, 0,0x93, 0,0xB5, 0,0x5F
58 data1 0,0x61, 0,0x0B, 0,0x2D, 0,0x17, 0,0x99, 0,0x03, 0,0x25, 0,0x4F
59 data1 0,0x51, 0,0x7B, 0,0x9D, 0,0x07, 0,0x89, 0,0x73, 0,0x95, 0,0x3F
60 data1 0,0x41, 0,0xEB, 0,0x0D, 0,0xF7, 0,0x79, 0,0xE3, 0,0x05, 0,0x2F
61 data1 0,0x31, 0,0x5B, 0,0x7D, 0,0xE7, 0,0x69, 0,0x53, 0,0x75, 0,0x1F
62 data1 0,0x21, 0,0xCB, 0,0xED, 0,0xD7, 0,0x59, 0,0xC3, 0,0xE5, 0,0x0F
63 data1 0,0x11, 0,0x3B, 0,0x5D, 0,0xC7, 0,0x49, 0,0x33, 0,0x55, 0,0xFF
64
65
66 PROLOGUE(mpn_divexact_1)
67 .prologue
68 .save ar.lc, r2
69 .body
70
71 {.mmi; add r8 = -1, divisor C M0
72 nop 0 C M1
73 tbit.z p8, p9 = divisor, 0 C I0
74 }
75 ifdef(`HAVE_ABI_32',
76 ` addp4 rp = 0, rp C M2 rp extend
77 addp4 up = 0, up C M3 up extend
78 sxt4 n = n') C I1 size extend
79 ;;
80 .Lhere:
81 {.mmi; ld8 r20 = [up], 8 C M0 up[0]
82 (p8) andcm r8 = r8, divisor C M1
83 mov r15 = ip C I0 .Lhere
84 ;;
85 }{.mii
86 .pred.rel "mutex", p8, p9
87 (p9) mov rshift = 0 C M0
88 (p8) popcnt rshift = r8 C I0 r8 = cnt_lo_zeros(divisor)
89 cmp.eq p6, p10 = 1, n C I1
90 ;;
91 }{.mii; add r9 = .Ltab-.Lhere, r15 C M0
92 (p8) shr.u divisor = divisor, rshift C I0
93 nop 0 C I1
94 ;;
95 }{.mmi; add n = -4, n C M0 size-1
96 (p10) ld8 r21 = [up], 8 C M1 up[1]
97 mov r14 = 2 C M1 2
98 }{.mfi; setf.sig f6 = divisor C M2 divisor
99 mov f9 = f0 C M3 carry FIXME
100 zxt1 r3 = divisor C I1 divisor low byte
101 ;;
102 }{.mmi; add r3 = r9, r3 C M0 table offset ip and index
103 sub r16 = 0, divisor C M1 -divisor
104 mov r2 = ar.lc C I0
105 }{.mmi; sub lshift = 64, rshift C M2
106 setf.sig f13 = r14 C M3 2 in significand
107 mov r17 = -1 C I1 -1
108 ;;
109 }{.mmi; ld1 r3 = [r3] C M0 inverse, 8 bits
110 nop 0 C M1
111 mov ar.lc = n C I0 size-1 loop count
112 }{.mmi; setf.sig f12 = r16 C M2 -divisor
113 setf.sig f8 = r17 C M3 -1
114 cmp.eq p7, p0 = -2, n C I1
115 ;;
116 }{.mmi; setf.sig f7 = r3 C M2 inverse, 8 bits
117 cmp.eq p8, p0 = -1, n C M0
118 shr.u r23 = r20, rshift C I0
119 ;;
120 }
121
122 C f6 divisor
123 C f7 inverse, being calculated
124 C f8 -1, will be -inverse
125 C f9 carry
126 C f12 -divisor
127 C f13 2
128 C f14 scratch
129
130 xmpy.l f14 = f13, f7 C Newton 2*i
131 xmpy.l f7 = f7, f7 C Newton i*i
132 ;;
133 xma.l f7 = f7, f12, f14 C Newton i*i*-d + 2*i, 16 bits
134 ;;
135 setf.sig f10 = r23 C speculative, used iff n = 1
136 xmpy.l f14 = f13, f7 C Newton 2*i
137 shl r22 = r21, lshift C speculative, used iff n > 1
138 xmpy.l f7 = f7, f7 C Newton i*i
139 ;;
140 or r31 = r22, r23 C speculative, used iff n > 1
141 xma.l f7 = f7, f12, f14 C Newton i*i*-d + 2*i, 32 bits
142 shr.u r23 = r21, rshift C speculative, used iff n > 1
143 ;;
144 setf.sig f11 = r31 C speculative, used iff n > 1
145 xmpy.l f14 = f13, f7 C Newton 2*i
146 xmpy.l f7 = f7, f7 C Newton i*i
147 ;;
148 xma.l f7 = f7, f12, f14 C Newton i*i*-d + 2*i, 64 bits
149
150 (p7) br.cond.dptk .Ln2
151 (p10) br.cond.dptk .grt3
152 ;;
153
154 .Ln1: xmpy.l f12 = f10, f7 C q = ulimb * inverse
155 br .Lx1
156
157 .Ln2:
158 xmpy.l f8 = f7, f8 C -inverse = inverse * -1
159 xmpy.l f12 = f11, f7 C q = ulimb * inverse
160 setf.sig f11 = r23
161 br .Lx2
162
163 .grt3:
164 ld8 r21 = [up], 8 C up[2]
165 xmpy.l f8 = f7, f8 C -inverse = inverse * -1
166 ;;
167 shl r22 = r21, lshift
168 ;;
169 xmpy.l f12 = f11, f7 C q = ulimb * inverse
170 ;;
171 or r31 = r22, r23
172 shr.u r23 = r21, rshift
173 ;;
174 setf.sig f11 = r31
175 (p8) br.cond.dptk .Lx3 C branch for n = 3
176 ;;
177 ld8 r21 = [up], 8
178 br .Lent
179
180 .Loop: ld8 r21 = [up], 8
181 xma.l f12 = f9, f8, f10 C q = c * -inverse + si
182 ;;
183 .Lent: add r16 = 160, up
184 shl r22 = r21, lshift
185 ;;
186 stf8 [rp] = f12, 8
187 xma.hu f9 = f12, f6, f9 C c = high(q * divisor + c)
188 xmpy.l f10 = f11, f7 C si = ulimb * inverse
189 ;;
190 or r31 = r22, r23
191 shr.u r23 = r21, rshift
192 ;;
193 lfetch [r16]
194 setf.sig f11 = r31
195 br.cloop.sptk.few.clr .Loop
196
197
198 xma.l f12 = f9, f8, f10 C q = c * -inverse + si
199 ;;
200 .Lx3: stf8 [rp] = f12, 8
201 xma.hu f9 = f12, f6, f9 C c = high(q * divisor + c)
202 xmpy.l f10 = f11, f7 C si = ulimb * inverse
203 ;;
204 setf.sig f11 = r23
205 ;;
206 xma.l f12 = f9, f8, f10 C q = c * -inverse + si
207 ;;
208 .Lx2: stf8 [rp] = f12, 8
209 xma.hu f9 = f12, f6, f9 C c = high(q * divisor + c)
210 xmpy.l f10 = f11, f7 C si = ulimb * inverse
211 ;;
212 xma.l f12 = f9, f8, f10 C q = c * -inverse + si
213 ;;
214 .Lx1: stf8 [rp] = f12, 8
215 mov ar.lc = r2 C I0
216 br.ret.sptk.many b0
217 EPILOGUE()
218