dive_1.asm revision 1.1.1.1.8.1 1 1.1 mrg dnl IA-64 mpn_divexact_1 -- mpn by limb exact division.
2 1.1 mrg
3 1.1.1.1.8.1 tls dnl Contributed to the GNU project by Torbjorn Granlund and Kevin Ryde.
4 1.1.1.1.8.1 tls
5 1.1.1.1.8.1 tls dnl Copyright 2003, 2004, 2005, 2010 Free Software Foundation, Inc.
6 1.1 mrg
7 1.1 mrg dnl This file is part of the GNU MP Library.
8 1.1 mrg
9 1.1 mrg dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 1.1 mrg dnl it under the terms of the GNU Lesser General Public License as published
11 1.1 mrg dnl by the Free Software Foundation; either version 3 of the License, or (at
12 1.1 mrg dnl your option) any later version.
13 1.1 mrg
14 1.1 mrg dnl The GNU MP Library is distributed in the hope that it will be useful, but
15 1.1 mrg dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16 1.1 mrg dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
17 1.1 mrg dnl License for more details.
18 1.1 mrg
19 1.1 mrg dnl You should have received a copy of the GNU Lesser General Public License
20 1.1 mrg dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
21 1.1 mrg
22 1.1 mrg include(`../config.m4')
23 1.1 mrg
24 1.1 mrg C cycles/limb
25 1.1 mrg C Itanium: 16
26 1.1 mrg C Itanium 2: 8
27 1.1 mrg
28 1.1 mrg C INPUT PARAMETERS
29 1.1 mrg define(`rp', `r32') C quotient limbs are stored here
30 1.1 mrg define(`up', `r33') C source limbs are loaded from here
31 1.1 mrg define(`n', `r34') C size in limbs
32 1.1 mrg define(`divisor', `r35') C single limb divisor
33 1.1 mrg C SHIFT COUNTS
34 1.1 mrg define(`lshift', `r24') C 64 - rshift
35 1.1 mrg define(`rshift', `r25') C number of low zero bits in divisor, 0 if divisor is odd
36 1.1 mrg
37 1.1 mrg C This code is a bit messy, and not as similar to mode1o.asm as desired.
38 1.1 mrg
39 1.1 mrg C The critical path during initialization is for computing the inverse of the
40 1.1 mrg C divisor. Since odd divisors are probably common, we conditionally execute
41 1.1 mrg C the initial count_trailing_zeros code and the downshift.
42 1.1 mrg
43 1.1 mrg C Possible improvement: Merge more of the feed-in code into the inverse
44 1.1 mrg C computation.
45 1.1 mrg
46 1.1 mrg ASM_START()
47 1.1 mrg .text
48 1.1 mrg .align 32
49 1.1 mrg .Ltab: C 8-bit inverse seeds: the byte at odd offset d is the inverse of d mod 2^8, even offsets are zero padding
50 1.1 mrg data1 0,0x01, 0,0xAB, 0,0xCD, 0,0xB7, 0,0x39, 0,0xA3, 0,0xC5, 0,0xEF
51 1.1 mrg data1 0,0xF1, 0,0x1B, 0,0x3D, 0,0xA7, 0,0x29, 0,0x13, 0,0x35, 0,0xDF
52 1.1 mrg data1 0,0xE1, 0,0x8B, 0,0xAD, 0,0x97, 0,0x19, 0,0x83, 0,0xA5, 0,0xCF
53 1.1 mrg data1 0,0xD1, 0,0xFB, 0,0x1D, 0,0x87, 0,0x09, 0,0xF3, 0,0x15, 0,0xBF
54 1.1 mrg data1 0,0xC1, 0,0x6B, 0,0x8D, 0,0x77, 0,0xF9, 0,0x63, 0,0x85, 0,0xAF
55 1.1 mrg data1 0,0xB1, 0,0xDB, 0,0xFD, 0,0x67, 0,0xE9, 0,0xD3, 0,0xF5, 0,0x9F
56 1.1 mrg data1 0,0xA1, 0,0x4B, 0,0x6D, 0,0x57, 0,0xD9, 0,0x43, 0,0x65, 0,0x8F
57 1.1 mrg data1 0,0x91, 0,0xBB, 0,0xDD, 0,0x47, 0,0xC9, 0,0xB3, 0,0xD5, 0,0x7F
58 1.1 mrg data1 0,0x81, 0,0x2B, 0,0x4D, 0,0x37, 0,0xB9, 0,0x23, 0,0x45, 0,0x6F
59 1.1 mrg data1 0,0x71, 0,0x9B, 0,0xBD, 0,0x27, 0,0xA9, 0,0x93, 0,0xB5, 0,0x5F
60 1.1 mrg data1 0,0x61, 0,0x0B, 0,0x2D, 0,0x17, 0,0x99, 0,0x03, 0,0x25, 0,0x4F
61 1.1 mrg data1 0,0x51, 0,0x7B, 0,0x9D, 0,0x07, 0,0x89, 0,0x73, 0,0x95, 0,0x3F
62 1.1 mrg data1 0,0x41, 0,0xEB, 0,0x0D, 0,0xF7, 0,0x79, 0,0xE3, 0,0x05, 0,0x2F
63 1.1 mrg data1 0,0x31, 0,0x5B, 0,0x7D, 0,0xE7, 0,0x69, 0,0x53, 0,0x75, 0,0x1F
64 1.1 mrg data1 0,0x21, 0,0xCB, 0,0xED, 0,0xD7, 0,0x59, 0,0xC3, 0,0xE5, 0,0x0F
65 1.1 mrg data1 0,0x11, 0,0x3B, 0,0x5D, 0,0xC7, 0,0x49, 0,0x33, 0,0x55, 0,0xFF
66 1.1 mrg
67 1.1 mrg
68 1.1 mrg PROLOGUE(mpn_divexact_1)
69 1.1 mrg .prologue
70 1.1 mrg .save ar.lc, r2
71 1.1 mrg .body
72 1.1 mrg C Exact division of the n limbs at up by the limb divisor, quotient limbs stored at rp. Low zero bits of divisor are shifted out first, then an inverse of the odd part is built from an 8-bit table seed.
73 1.1 mrg {.mmi; add r8 = -1, divisor C M0 divisor-1
74 1.1 mrg nop 0 C M1
75 1.1 mrg tbit.z p8, p9 = divisor, 0 C I0 p8: divisor even, p9: divisor odd
76 1.1 mrg }
77 1.1 mrg ifdef(`HAVE_ABI_32',
78 1.1 mrg ` addp4 rp = 0, rp C M2 rp extend
79 1.1 mrg addp4 up = 0, up C M3 up extend
80 1.1 mrg sxt4 n = n') C I1 size extend
81 1.1 mrg ;;
82 1.1 mrg .Lhere:
83 1.1 mrg {.mmi; ld8 r20 = [up], 8 C M0 up[0]
84 1.1 mrg (p8) andcm r8 = r8, divisor C M1 mask of the low zero bits of divisor
85 1.1 mrg mov r15 = ip C I0 .Lhere
86 1.1 mrg ;;
87 1.1 mrg }{.mii
88 1.1 mrg .pred.rel "mutex", p8, p9
89 1.1 mrg (p9) mov rshift = 0 C M0
90 1.1 mrg (p8) popcnt rshift = r8 C I0 rshift = cnt_lo_zeros(divisor)
91 1.1 mrg cmp.eq p6, p10 = 1, n C I1 p6: size = 1, p10: size != 1
92 1.1 mrg ;;
93 1.1 mrg }{.mii; add r9 = .Ltab-.Lhere, r15 C M0 address of .Ltab
94 1.1 mrg (p8) shr.u divisor = divisor, rshift C I0 strip the low zero bits
95 1.1 mrg nop 0 C I1
96 1.1 mrg ;;
97 1.1 mrg }{.mmi; add n = -4, n C M0 size-4
98 1.1 mrg (p10) ld8 r21 = [up], 8 C M1 up[1]
99 1.1 mrg mov r14 = 2 C M1 2
100 1.1 mrg }{.mfi; setf.sig f6 = divisor C M2 divisor
101 1.1 mrg mov f9 = f0 C M3 carry FIXME
102 1.1 mrg zxt1 r3 = divisor C I1 divisor low byte
103 1.1 mrg ;;
104 1.1 mrg }{.mmi; add r3 = r9, r3 C M0 table offset ip and index
105 1.1 mrg sub r16 = 0, divisor C M1 -divisor
106 1.1 mrg mov r2 = ar.lc C I0 save ar.lc, restored before return
107 1.1 mrg }{.mmi; sub lshift = 64, rshift C M2 lshift = 64-rshift
108 1.1 mrg setf.sig f13 = r14 C M3 2 in significand
109 1.1 mrg mov r17 = -1 C I1 -1
110 1.1 mrg ;;
111 1.1 mrg }{.mmi; ld1 r3 = [r3] C M0 inverse, 8 bits
112 1.1 mrg nop 0 C M1
113 1.1 mrg mov ar.lc = n C I0 size-4 loop count
114 1.1 mrg }{.mmi; setf.sig f12 = r16 C M2 -divisor
115 1.1 mrg setf.sig f8 = r17 C M3 -1
116 1.1 mrg cmp.eq p7, p0 = -2, n C I1 p7: size = 2
117 1.1 mrg ;;
118 1.1 mrg }{.mmi; setf.sig f7 = r3 C M2 inverse, 8 bits
119 1.1 mrg cmp.eq p8, p0 = -1, n C M0 p8: size = 3
120 1.1 mrg shr.u r23 = r20, rshift C I0 up[0] >> rshift
121 1.1 mrg ;;
122 1.1 mrg }
123 1.1 mrg
124 1.1 mrg C f6 divisor
125 1.1 mrg C f7 inverse, being calculated
126 1.1 mrg C f8 -1, will be -inverse
127 1.1 mrg C f9 carry
128 1.1 mrg C f12 -divisor
129 1.1 mrg C f13 2
130 1.1 mrg C f14 scratch
131 1.1 mrg C Three Newton steps i = 2*i - i*i*d take the inverse from 8 to 64 bits; the shifting of the first limbs is interleaved with them.
132 1.1 mrg xmpy.l f14 = f13, f7 C Newton 2*i
133 1.1 mrg xmpy.l f7 = f7, f7 C Newton i*i
134 1.1 mrg ;;
135 1.1 mrg xma.l f7 = f7, f12, f14 C Newton i*i*-d + 2*i, 16 bits
136 1.1 mrg ;;
137 1.1 mrg setf.sig f10 = r23 C speculative, used iff n = 1
138 1.1 mrg xmpy.l f14 = f13, f7 C Newton 2*i
139 1.1 mrg shl r22 = r21, lshift C speculative, used iff n > 1
140 1.1 mrg xmpy.l f7 = f7, f7 C Newton i*i
141 1.1 mrg ;;
142 1.1 mrg or r31 = r22, r23 C speculative, used iff n > 1
143 1.1 mrg xma.l f7 = f7, f12, f14 C Newton i*i*-d + 2*i, 32 bits
144 1.1 mrg shr.u r23 = r21, rshift C speculative, used iff n > 1
145 1.1 mrg ;;
146 1.1 mrg setf.sig f11 = r31 C speculative, used iff n > 1
147 1.1 mrg xmpy.l f14 = f13, f7 C Newton 2*i
148 1.1 mrg xmpy.l f7 = f7, f7 C Newton i*i
149 1.1 mrg ;;
150 1.1 mrg xma.l f7 = f7, f12, f14 C Newton i*i*-d + 2*i, 64 bits
151 1.1 mrg C Dispatch on the original size: p7 is size = 2, p10 is size != 1; fall through to .Ln1 for size = 1.
152 1.1 mrg (p7) br.cond.dptk .Ln2
153 1.1 mrg (p10) br.cond.dptk .grt3
154 1.1 mrg ;;
155 1.1 mrg C size = 1: single quotient limb from f10 = up[0] >> rshift.
156 1.1 mrg .Ln1: xmpy.l f12 = f10, f7 C q = ulimb * inverse
157 1.1 mrg br .Lx1
158 1.1 mrg C size = 2: f11 holds the first shifted limb, r23 holds up[1] >> rshift.
159 1.1 mrg .Ln2:
160 1.1 mrg xmpy.l f8 = f7, f8 C -inverse = inverse * -1
161 1.1 mrg xmpy.l f12 = f11, f7 C q = ulimb * inverse
162 1.1 mrg setf.sig f11 = r23
163 1.1 mrg br .Lx2
164 1.1 mrg C size >= 3: fetch up[2] and prepare the second shifted limb; p8 (size = 3) goes straight to the wind-down at .Lx3.
165 1.1 mrg .grt3:
166 1.1 mrg ld8 r21 = [up], 8 C up[2]
167 1.1 mrg xmpy.l f8 = f7, f8 C -inverse = inverse * -1
168 1.1 mrg ;;
169 1.1 mrg shl r22 = r21, lshift
170 1.1 mrg ;;
171 1.1 mrg xmpy.l f12 = f11, f7 C q = ulimb * inverse
172 1.1 mrg ;;
173 1.1 mrg or r31 = r22, r23
174 1.1 mrg shr.u r23 = r21, rshift
175 1.1 mrg ;;
176 1.1 mrg setf.sig f11 = r31
177 1.1 mrg (p8) br.cond.dptk .Lx3 C branch for n = 3
178 1.1 mrg ;;
179 1.1 mrg ld8 r21 = [up], 8
180 1.1 mrg br .Lent
181 1.1 mrg C Main loop, first entered at .Lent and counted by ar.lc; each pass stores one quotient limb and shifts the next source limb into f11.
182 1.1.1.1.8.1 tls .Ltop: ld8 r21 = [up], 8
183 1.1 mrg xma.l f12 = f9, f8, f10 C q = c * -inverse + si
184 1.1.1.1.8.1 tls nop.b 0
185 1.1 mrg ;;
186 1.1 mrg .Lent: add r16 = 160, up
187 1.1 mrg shl r22 = r21, lshift
188 1.1.1.1.8.1 tls nop.b 0
189 1.1 mrg ;;
190 1.1 mrg stf8 [rp] = f12, 8
191 1.1 mrg xma.hu f9 = f12, f6, f9 C c = high(q * divisor + c)
192 1.1.1.1.8.1 tls nop.b 0
193 1.1.1.1.8.1 tls nop.m 0
194 1.1 mrg xmpy.l f10 = f11, f7 C si = ulimb * inverse
195 1.1.1.1.8.1 tls nop.b 0
196 1.1 mrg ;;
197 1.1 mrg or r31 = r22, r23
198 1.1 mrg shr.u r23 = r21, rshift
199 1.1.1.1.8.1 tls nop.b 0
200 1.1 mrg ;;
201 1.1 mrg lfetch [r16]
202 1.1 mrg setf.sig f11 = r31
203 1.1.1.1.8.1 tls br.cloop.sptk.few.clr .Ltop
204 1.1 mrg C Wind-down: the last three quotient limbs are stored at .Lx3, .Lx2 and .Lx1.
205 1.1 mrg
206 1.1 mrg xma.l f12 = f9, f8, f10 C q = c * -inverse + si
207 1.1 mrg ;;
208 1.1 mrg .Lx3: stf8 [rp] = f12, 8
209 1.1 mrg xma.hu f9 = f12, f6, f9 C c = high(q * divisor + c)
210 1.1 mrg xmpy.l f10 = f11, f7 C si = ulimb * inverse
211 1.1 mrg ;;
212 1.1 mrg setf.sig f11 = r23
213 1.1 mrg ;;
214 1.1 mrg xma.l f12 = f9, f8, f10 C q = c * -inverse + si
215 1.1 mrg ;;
216 1.1 mrg .Lx2: stf8 [rp] = f12, 8
217 1.1 mrg xma.hu f9 = f12, f6, f9 C c = high(q * divisor + c)
218 1.1 mrg xmpy.l f10 = f11, f7 C si = ulimb * inverse
219 1.1 mrg ;;
220 1.1 mrg xma.l f12 = f9, f8, f10 C q = c * -inverse + si
221 1.1 mrg ;;
222 1.1 mrg .Lx1: stf8 [rp] = f12, 8
223 1.1 mrg mov ar.lc = r2 C I0 restore the saved ar.lc
224 1.1 mrg br.ret.sptk.many b0
225 1.1 mrg EPILOGUE()
226