dive_1.asm revision 1.1.1.1.2.1 1 dnl IA-64 mpn_divexact_1 -- mpn by limb exact division.
2
3 dnl Contributed to the GNU project by Torbjorn Granlund and Kevin Ryde.
4
5 dnl Copyright 2003, 2004, 2005, 2010 Free Software Foundation, Inc.
6
7 dnl This file is part of the GNU MP Library.
8
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of the GNU Lesser General Public License as published
11 dnl by the Free Software Foundation; either version 3 of the License, or (at
12 dnl your option) any later version.
13
14 dnl The GNU MP Library is distributed in the hope that it will be useful, but
15 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
17 dnl License for more details.
18
19 dnl You should have received a copy of the GNU Lesser General Public License
20 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
21
22 include(`../config.m4')
23
24 C cycles/limb
25 C Itanium: 16
26 C Itanium 2: 8
27
28 C INPUT PARAMETERS
29 define(`rp', `r32')
30 define(`up', `r33')
31 define(`n', `r34')
32 define(`divisor', `r35')
33
34 define(`lshift', `r24')
35 define(`rshift', `r25')
36
37 C This code is a bit messy, and not as similar to mode1o.asm as desired.
38
39 C The critical path during initialization is for computing the inverse of the
40 C divisor. Since odd divisors are probably common, we conditionally execute
41 C the initial count_traling_zeros code and the downshift.
42
43 C Possible improvement: Merge more of the feed-in code into the inverse
44 C computation.
45
46 ASM_START()
47 .text
48 .align 32
49 .Ltab:
50 data1 0,0x01, 0,0xAB, 0,0xCD, 0,0xB7, 0,0x39, 0,0xA3, 0,0xC5, 0,0xEF
51 data1 0,0xF1, 0,0x1B, 0,0x3D, 0,0xA7, 0,0x29, 0,0x13, 0,0x35, 0,0xDF
52 data1 0,0xE1, 0,0x8B, 0,0xAD, 0,0x97, 0,0x19, 0,0x83, 0,0xA5, 0,0xCF
53 data1 0,0xD1, 0,0xFB, 0,0x1D, 0,0x87, 0,0x09, 0,0xF3, 0,0x15, 0,0xBF
54 data1 0,0xC1, 0,0x6B, 0,0x8D, 0,0x77, 0,0xF9, 0,0x63, 0,0x85, 0,0xAF
55 data1 0,0xB1, 0,0xDB, 0,0xFD, 0,0x67, 0,0xE9, 0,0xD3, 0,0xF5, 0,0x9F
56 data1 0,0xA1, 0,0x4B, 0,0x6D, 0,0x57, 0,0xD9, 0,0x43, 0,0x65, 0,0x8F
57 data1 0,0x91, 0,0xBB, 0,0xDD, 0,0x47, 0,0xC9, 0,0xB3, 0,0xD5, 0,0x7F
58 data1 0,0x81, 0,0x2B, 0,0x4D, 0,0x37, 0,0xB9, 0,0x23, 0,0x45, 0,0x6F
59 data1 0,0x71, 0,0x9B, 0,0xBD, 0,0x27, 0,0xA9, 0,0x93, 0,0xB5, 0,0x5F
60 data1 0,0x61, 0,0x0B, 0,0x2D, 0,0x17, 0,0x99, 0,0x03, 0,0x25, 0,0x4F
61 data1 0,0x51, 0,0x7B, 0,0x9D, 0,0x07, 0,0x89, 0,0x73, 0,0x95, 0,0x3F
62 data1 0,0x41, 0,0xEB, 0,0x0D, 0,0xF7, 0,0x79, 0,0xE3, 0,0x05, 0,0x2F
63 data1 0,0x31, 0,0x5B, 0,0x7D, 0,0xE7, 0,0x69, 0,0x53, 0,0x75, 0,0x1F
64 data1 0,0x21, 0,0xCB, 0,0xED, 0,0xD7, 0,0x59, 0,0xC3, 0,0xE5, 0,0x0F
65 data1 0,0x11, 0,0x3B, 0,0x5D, 0,0xC7, 0,0x49, 0,0x33, 0,0x55, 0,0xFF
66
67
68 PROLOGUE(mpn_divexact_1)
69 .prologue
70 .save ar.lc, r2
71 .body
72
73 {.mmi; add r8 = -1, divisor C M0
74 nop 0 C M1
75 tbit.z p8, p9 = divisor, 0 C I0
76 }
77 ifdef(`HAVE_ABI_32',
78 ` addp4 rp = 0, rp C M2 rp extend
79 addp4 up = 0, up C M3 up extend
80 sxt4 n = n') C I1 size extend
81 ;;
82 .Lhere:
83 {.mmi; ld8 r20 = [up], 8 C M0 up[0]
84 (p8) andcm r8 = r8, divisor C M1
85 mov r15 = ip C I0 .Lhere
86 ;;
87 }{.mii
88 .pred.rel "mutex", p8, p9
89 (p9) mov rshift = 0 C M0
90 (p8) popcnt rshift = r8 C I0 r8 = cnt_lo_zeros(divisor)
91 cmp.eq p6, p10 = 1, n C I1
92 ;;
93 }{.mii; add r9 = .Ltab-.Lhere, r15 C M0
94 (p8) shr.u divisor = divisor, rshift C I0
95 nop 0 C I1
96 ;;
97 }{.mmi; add n = -4, n C M0 size-1
98 (p10) ld8 r21 = [up], 8 C M1 up[1]
99 mov r14 = 2 C M1 2
100 }{.mfi; setf.sig f6 = divisor C M2 divisor
101 mov f9 = f0 C M3 carry FIXME
102 zxt1 r3 = divisor C I1 divisor low byte
103 ;;
104 }{.mmi; add r3 = r9, r3 C M0 table offset ip and index
105 sub r16 = 0, divisor C M1 -divisor
106 mov r2 = ar.lc C I0
107 }{.mmi; sub lshift = 64, rshift C M2
108 setf.sig f13 = r14 C M3 2 in significand
109 mov r17 = -1 C I1 -1
110 ;;
111 }{.mmi; ld1 r3 = [r3] C M0 inverse, 8 bits
112 nop 0 C M1
113 mov ar.lc = n C I0 size-1 loop count
114 }{.mmi; setf.sig f12 = r16 C M2 -divisor
115 setf.sig f8 = r17 C M3 -1
116 cmp.eq p7, p0 = -2, n C I1
117 ;;
118 }{.mmi; setf.sig f7 = r3 C M2 inverse, 8 bits
119 cmp.eq p8, p0 = -1, n C M0
120 shr.u r23 = r20, rshift C I0
121 ;;
122 }
123
124 C f6 divisor
125 C f7 inverse, being calculated
126 C f8 -1, will be -inverse
127 C f9 carry
128 C f12 -divisor
129 C f13 2
130 C f14 scratch
131
132 xmpy.l f14 = f13, f7 C Newton 2*i
133 xmpy.l f7 = f7, f7 C Newton i*i
134 ;;
135 xma.l f7 = f7, f12, f14 C Newton i*i*-d + 2*i, 16 bits
136 ;;
137 setf.sig f10 = r23 C speculative, used iff n = 1
138 xmpy.l f14 = f13, f7 C Newton 2*i
139 shl r22 = r21, lshift C speculative, used iff n > 1
140 xmpy.l f7 = f7, f7 C Newton i*i
141 ;;
142 or r31 = r22, r23 C speculative, used iff n > 1
143 xma.l f7 = f7, f12, f14 C Newton i*i*-d + 2*i, 32 bits
144 shr.u r23 = r21, rshift C speculative, used iff n > 1
145 ;;
146 setf.sig f11 = r31 C speculative, used iff n > 1
147 xmpy.l f14 = f13, f7 C Newton 2*i
148 xmpy.l f7 = f7, f7 C Newton i*i
149 ;;
150 xma.l f7 = f7, f12, f14 C Newton i*i*-d + 2*i, 64 bits
151
152 (p7) br.cond.dptk .Ln2
153 (p10) br.cond.dptk .grt3
154 ;;
155
156 .Ln1: xmpy.l f12 = f10, f7 C q = ulimb * inverse
157 br .Lx1
158
159 .Ln2:
160 xmpy.l f8 = f7, f8 C -inverse = inverse * -1
161 xmpy.l f12 = f11, f7 C q = ulimb * inverse
162 setf.sig f11 = r23
163 br .Lx2
164
165 .grt3:
166 ld8 r21 = [up], 8 C up[2]
167 xmpy.l f8 = f7, f8 C -inverse = inverse * -1
168 ;;
169 shl r22 = r21, lshift
170 ;;
171 xmpy.l f12 = f11, f7 C q = ulimb * inverse
172 ;;
173 or r31 = r22, r23
174 shr.u r23 = r21, rshift
175 ;;
176 setf.sig f11 = r31
177 (p8) br.cond.dptk .Lx3 C branch for n = 3
178 ;;
179 ld8 r21 = [up], 8
180 br .Lent
181
182 .Ltop: ld8 r21 = [up], 8
183 xma.l f12 = f9, f8, f10 C q = c * -inverse + si
184 nop.b 0
185 ;;
186 .Lent: add r16 = 160, up
187 shl r22 = r21, lshift
188 nop.b 0
189 ;;
190 stf8 [rp] = f12, 8
191 xma.hu f9 = f12, f6, f9 C c = high(q * divisor + c)
192 nop.b 0
193 nop.m 0
194 xmpy.l f10 = f11, f7 C si = ulimb * inverse
195 nop.b 0
196 ;;
197 or r31 = r22, r23
198 shr.u r23 = r21, rshift
199 nop.b 0
200 ;;
201 lfetch [r16]
202 setf.sig f11 = r31
203 br.cloop.sptk.few.clr .Ltop
204
205
206 xma.l f12 = f9, f8, f10 C q = c * -inverse + si
207 ;;
208 .Lx3: stf8 [rp] = f12, 8
209 xma.hu f9 = f12, f6, f9 C c = high(q * divisor + c)
210 xmpy.l f10 = f11, f7 C si = ulimb * inverse
211 ;;
212 setf.sig f11 = r23
213 ;;
214 xma.l f12 = f9, f8, f10 C q = c * -inverse + si
215 ;;
216 .Lx2: stf8 [rp] = f12, 8
217 xma.hu f9 = f12, f6, f9 C c = high(q * divisor + c)
218 xmpy.l f10 = f11, f7 C si = ulimb * inverse
219 ;;
220 xma.l f12 = f9, f8, f10 C q = c * -inverse + si
221 ;;
222 .Lx1: stf8 [rp] = f12, 8
223 mov ar.lc = r2 C I0
224 br.ret.sptk.many b0
225 EPILOGUE()
226