1 dnl  x86-64 mpn_divrem_1 -- mpn by limb division.  (divrem_1.asm, revision 1.1.1.1)
2
3 dnl Copyright 2004, 2005, 2007, 2008, 2009 Free Software Foundation, Inc.
4
5 dnl This file is part of the GNU MP Library.
6
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of the GNU Lesser General Public License as published
9 dnl by the Free Software Foundation; either version 3 of the License, or (at
10 dnl your option) any later version.
11
12 dnl The GNU MP Library is distributed in the hope that it will be useful, but
13 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15 dnl License for more details.
16
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
19
20 include(`../config.m4')
21
22
23 C norm unorm frac
24 C K8 13 13 12
25 C P4 44.2 44.2 42.3
26 C P6 core2 25 24.5 19.3
27 C P6 corei7 21.5 20.7 18
28 C P6 atom 42 52 37
29
30 C TODO
31 C * Compute the inverse without relying on the div instruction.
32 C Newton's method and mulq, or perhaps the faster fdiv.
33 C * Tune prologue.
34 C * Optimize for Core 2.
35
36 C The code for unnormalized divisors works also for normalized divisors, but
37 C for some reason it runs really slowly (on K8) for that case. Use special
38 C code until we can address this. The Intel Atom is also affected, but
39 C understandably (shld slowness).
40 define(`SPECIAL_CODE_FOR_NORMALIZED_DIVISOR',1)
41
42 C mp_limb_t
43 C mpn_divrem_1 (mp_ptr qp, mp_size_t fn,
44 C mp_srcptr np, mp_size_t nn, mp_limb_t d)
45
46 C mp_limb_t
47 C mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn,
48 C mp_srcptr np, mp_size_t nn, mp_limb_t d,
49 C mp_limb_t dinv, int cnt)
50
51 C INPUT PARAMETERS
52 define(`qp', `%rdi')
53 define(`fn_param', `%rsi')
54 define(`up_param', `%rdx')
55 define(`un_param', `%rcx')
56 define(`d', `%r8')
57 define(`dinv', `%r9') C only for mpn_preinv_divrem_1
58 C shift passed on stack C only for mpn_preinv_divrem_1
59
60 define(`cnt', `%rcx')
61 define(`up', `%rsi')
62 define(`fn', `%r12')
63 define(`un', `%rbx')
64
65
66 C rax rbx rcx rdx rsi rdi rbp r8 r9 r10 r11 r12 r13 r14 r15
67 C cnt qp d dinv
68
69 ASM_START()
70 	TEXT
71 	ALIGN(16)
72 PROLOGUE(mpn_preinv_divrem_1)
C Entry point for callers that supply a precomputed inverse (dinv) and a
C shift count.  It only sets up registers and pre-shifts the divisor, then
C joins the main loops inside mpn_divrem_1 below (L(nent) / L(uent)).
73 	xor	%eax, %eax		C rax = partial remainder, starts at 0
74 	push	%r13			C save callee-saved registers we use
75 	push	%r12
76 	push	%rbp
77 	push	%rbx
78 
79 	mov	fn_param, fn		C fn = fraction limb count (to r12)
80 	mov	un_param, un		C un = integer limb count (to rbx)
81 	add	fn_param, un_param	C un_param = un + fn = total quotient limbs
82 	mov	up_param, up		C up = dividend pointer (to rsi)
83 
84 	lea	-8(qp,un_param,8), qp	C qp -> most significant quotient limb
85 
86 	test	d, d
87 	js	L(nent)			C divisor already normalized (high bit set)
88 	mov	40(%rsp), R8(cnt)	C 7th arg (shift count) from the stack:
					C 4 pushes + return address = 40 bytes
89 	shl	R8(cnt), d		C pre-shift divisor left by cnt
90 	jmp	L(uent)			C join unnormalized loop in mpn_divrem_1
91 EPILOGUE()
92
93 ALIGN(16)
94 PROLOGUE(mpn_divrem_1)
C Divide the un-limb number at up (plus fn zero fraction limbs) by d,
C storing un+fn quotient limbs at qp (most significant first, qp walks
C downward) and returning the remainder in rax.  One div computes the
C reciprocal dinv; the loops then use only mul/imul plus fixups.
95 	xor	%eax, %eax		C rax = partial remainder, starts at 0
96 	push	%r13			C save callee-saved registers we use
97 	push	%r12
98 	push	%rbp
99 	push	%rbx
100 
101 	mov	fn_param, fn		C fn = fraction limb count (to r12)
102 	mov	un_param, un		C un = integer limb count (to rbx)
103 	add	fn_param, un_param	C un_param = total quotient limbs; sets ZF
104 	mov	up_param, up		C (mov leaves the flags intact)
105 	je	L(ret)			C nothing to do when un + fn == 0
106 
107 	lea	-8(qp,un_param,8), qp	C qp -> most significant quotient limb
108 	xor	R32(%rbp), R32(%rbp)	C rbp = 0, initial high "carry" limb
109 
110 
111 ifdef(`SPECIAL_CODE_FOR_NORMALIZED_DIVISOR',`
112 	test	d, d
113 	jns	L(unnormalized)		C d < 2^63: take the shifting code path
114 
115 L(normalized):
116 	test	un, un
117 	je	L(8)			C no integer limbs, fraction part only
118 	mov	-8(up,un,8), %rbp	C rbp = most significant dividend limb
119 	dec	un
120 	mov	%rbp, %rax
121 	sub	d, %rbp			C trial-subtract d from the top limb
122 	cmovb	%rax, %rbp		C undo if it would have underflowed
123 	sbb	%eax, %eax		C eax = -1 on borrow, else 0
124 	inc	%eax			C top quotient limb is 1 iff top >= d
125 	mov	%rax, (qp)
126 	lea	-8(qp), qp
127 L(8):
128 	mov	d, %rdx
129 	mov	$-1, %rax
130 	not	%rdx			C rdx:rax = B^2 - 1 - B*d  (B = 2^64)
131 	div	d			C dinv = floor((B^2-1)/d) - B; FREE rax rdx rcx r9 r10 r11
132 	mov	%rax, dinv
133 	mov	%rbp, %rax		C rax = partial remainder
134 	jmp	L(nent)
135 
136 	ALIGN(16)
137 L(nloop):				C cycK8 cycP6 cycP4
138 	mov	(up,un,8), %r10		C next dividend limb
139 	lea	1(%rax), %rbp		C candidate quotient high part = r + 1
140 	mul	dinv			C 0,13 0,19 0,45
141 	add	%r10, %rax		C 4 8 12
142 	adc	%rbp, %rdx		C 5 9 13   rdx = candidate quotient limb
143 	mov	%rax, %rbp		C 5 9 13
144 	mov	%rdx, %r13		C 6 11 23
145 	imul	d, %rdx			C 6 11 23
146 	sub	%rdx, %r10		C 10 16 33 candidate remainder (mod B)
147 	mov	d, %rax			C
148 	add	%r10, %rax		C 11 17 34 remainder if q was 1 too large
149 	cmp	%rbp, %r10		C 11 17 34
150 	cmovb	%r10, %rax		C 12 18 35 select the in-range remainder
151 	adc	$-1, %r13		C          and fix the quotient limb to match
152 	cmp	d, %rax			C
153 	jae	L(nfx)			C rare: one further adjustment needed
154 L(nok):	mov	%r13, (qp)		C store quotient limb
155 	sub	$8, qp			C
156 L(nent):dec	un			C
157 	jns	L(nloop)		C
158 
159 	xor	%ecx, %ecx		C normalized path: no unshift needed later
160 	jmp	L(87)			C go develop the fraction limbs
161 
162 L(nfx):	sub	d, %rax			C remainder was still >= d: subtract d
163 	inc	%r13			C and bump the quotient limb
164 	jmp	L(nok)
165 ')
166 
167 L(unnormalized):
168 	test	un, un
169 	je	L(44)
170 	mov	-8(up,un,8), %rax	C top dividend limb
171 	cmp	d, %rax
172 	jae	L(44)			C top limb >= d: it needs real division
173 	mov	%rbp, (qp)		C top limb < d: top quotient limb is 0
174 	mov	%rax, %rbp		C and the limb becomes the remainder
175 	lea	-8(qp), qp		C (lea leaves the flags intact)
176 	je	L(ret)			C NOTE(review): flags are from the cmp at
					C line 171; jae fell through so CF=1 and
					C thus ZF=0 -- branch looks never-taken,
					C confirm against upstream GMP
177 	dec	un
178 L(44):
179 	bsr	d, %rcx			C rcx = bit index of d's highest set bit
180 	not	%ecx			C cl = 63 - index = leading zeros (mod 64)
181 	sal	%cl, d			C normalize the divisor
182 	sal	%cl, %rbp		C shift partial remainder to match
183 	mov	d, %rdx
184 	mov	$-1, %rax
185 	not	%rdx			C rdx:rax = B^2 - 1 - B*d
186 	div	d			C dinv = floor((B^2-1)/d) - B; FREE rax rdx r9 r10 r11
187 	test	un, un
188 	mov	%rax, dinv
189 	mov	%rbp, %rax		C rax = shifted partial remainder
190 	je	L(87)			C no integer limbs: fraction part only
191 L(uent):
192 	mov	-8(up,un,8), %rbp	C (also entered from mpn_preinv_divrem_1)
193 	shr	%cl, %rax		C undo pre-shift of r, then...
194 	shld	%cl, %rbp, %rax		C ...refill with the top bits of rbp
195 	sub	$2, un
196 	js	L(ulast)		C a single limb: skip the main loop
197 
198 	ALIGN(16)
199 L(uloop):
200 	nop				C padding for loop alignment
201 	mov	(up,un,8), %r10		C next-lower raw dividend limb
202 	lea	1(%rax), %r11		C candidate quotient high part = r + 1
203 	shld	%cl, %r10, %rbp		C assemble next shifted dividend limb
204 	mul	dinv
205 	add	%rbp, %rax
206 	adc	%r11, %rdx		C rdx = candidate quotient limb
207 	mov	%rax, %r11
208 	mov	%rdx, %r13
209 	imul	d, %rdx
210 	sub	%rdx, %rbp		C candidate remainder (mod B)
211 	mov	d, %rax
212 	add	%rbp, %rax		C remainder if q was 1 too large
213 	cmp	%r11, %rbp
214 	cmovb	%rbp, %rax		C select the in-range remainder
215 	adc	$-1, %r13		C and fix the quotient limb to match
216 	cmp	d, %rax
217 	jae	L(ufx)			C rare: one further adjustment needed
218 L(uok):	mov	%r13, (qp)		C store quotient limb
219 	sub	$8, qp
220 	dec	un
221 	mov	%r10, %rbp		C keep raw limb for the next shld
222 	jns	L(uloop)
223 L(ulast):				C last limb: same step, no lower bits
224 	lea	1(%rax), %r11
225 	sal	%cl, %rbp		C shift in zeros below the last limb
226 	mul	dinv
227 	add	%rbp, %rax
228 	adc	%r11, %rdx		C rdx = candidate quotient limb
229 	mov	%rax, %r11
230 	mov	%rdx, %r13
231 	imul	d, %rdx
232 	sub	%rdx, %rbp		C candidate remainder (mod B)
233 	mov	d, %rax
234 	add	%rbp, %rax
235 	cmp	%r11, %rbp
236 	cmovb	%rbp, %rax		C select the in-range remainder
237 	adc	$-1, %r13		C and fix the quotient limb to match
238 	cmp	d, %rax
239 	jae	L(93)			C rare: one further adjustment needed
240 L(69):	mov	%r13, (qp)		C store last integer quotient limb
241 	sub	$8, qp
242 	jmp	L(87)
243 
244 L(ufx):	sub	d, %rax			C remainder was still >= d: subtract d
245 	inc	%r13			C and bump the quotient limb
246 	jmp	L(uok)
247 
248 L(93):	sub	d, %rax			C same fixup for the last-limb case
249 	inc	%r13
250 	jmp	L(69)
251 
252 L(87):	mov	d, %rbp			C fraction limbs: dividend limbs are 0,
253 	neg	%rbp			C so use -d to shorten the recurrence
254 	jmp	L(87b)
255 
256 	ALIGN(16)
257 L(floop):				C cycK8 cycP6 cycP4
258 	lea	1(%rax), %r11		C candidate quotient high part = r + 1
259 	mul	dinv			C 0,12
260 	add	%r11, %rdx		C 5   rdx = candidate quotient limb
261 	mov	%rax, %r11		C 4
262 	mov	%rdx, %r13		C 6
263 	imul	%rbp, %rdx		C 6   rdx = candidate remainder (uses -d)
264 	mov	d, %rax			C
265 	add	%rdx, %rax		C 10  remainder if q was 1 too large
266 	cmp	%r11, %rdx		C 10
267 	cmovb	%rdx, %rax		C 11  select the in-range remainder
268 	adc	$-1, %r13		C     and fix the quotient limb to match
269 	mov	%r13, (qp)		C store fraction quotient limb
270 	sub	$8, qp			C
271 L(87b):	dec	fn			C
272 	jns	L(floop)		C
273 
274 	shr	%cl, %rax		C undo the normalization shift (cl = 0
275 L(ret):	pop	%rbx			C   on the normalized path)
276 	pop	%rbp
277 	pop	%r12
278 	pop	%r13
279 	ret
280 EPILOGUE()
281