dnl  x86-64 mpn_divrem_1 -- mpn by limb division.
2
3 dnl Copyright 2004, 2005, 2007, 2008, 2009, 2010, 2011, 2012 Free Software
4 dnl Foundation, Inc.
5
6 dnl This file is part of the GNU MP Library.
7
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of the GNU Lesser General Public License as published
10 dnl by the Free Software Foundation; either version 3 of the License, or (at
11 dnl your option) any later version.
12
13 dnl The GNU MP Library is distributed in the hope that it will be useful, but
14 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 dnl License for more details.
17
18 dnl You should have received a copy of the GNU Lesser General Public License
19 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20
21 include(`../config.m4')
22
23
C		norm	unorm	frac
C AMD K8,K9	13	13	12
C AMD K10	13	13	12
C Intel P4	43	44	43
C Intel core2	24.5	24.5	19.5
C Intel corei	20.5	19.5	18
C Intel atom	43	46	36
C VIA nano	25.5	25.5	24
32
33 C mp_limb_t
34 C mpn_divrem_1 (mp_ptr qp, mp_size_t fn,
35 C mp_srcptr np, mp_size_t nn, mp_limb_t d)
36
37 C mp_limb_t
38 C mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn,
39 C mp_srcptr np, mp_size_t nn, mp_limb_t d,
40 C mp_limb_t dinv, int cnt)
41
C INPUT PARAMETERS
C Names for the registers that carry the arguments listed in the prototypes.
define(`qp',		`%rdi')
define(`fn_param',	`%rsi')
define(`up_param',	`%rdx')
define(`un_param',	`%rcx')
define(`d',		`%r8')
define(`dinv',		`%r9')		C only for mpn_preinv_divrem_1
C shift count passed on stack		C only for mpn_preinv_divrem_1

C WORKING REGISTERS
C fn and un are copied out of fn_param and un_param into %r12 and %rbx.
C up and cnt reuse %rsi and %rcx, the registers of fn_param and un_param.
define(`un',		`%rbx')
define(`fn',		`%r12')
define(`up',		`%rsi')
define(`cnt',		`%rcx')


C rax rbx rcx rdx rsi rdi rbp r8  r9  r10 r11 r12 r13 r14 r15
C     un  cnt     up  qp      d   dinv        fn

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C CNTOFF(reg) is the displacement from reg at which mpn_preinv_divrem_1 reads
C its shift count.  It is applied to %rsp after the four register pushes of
C that function, and differs between the two supported ABIs.
IFSTD(`define(`CNTOFF',		`40($1)')')
IFDOS(`define(`CNTOFF',		`104($1)')')
65
ASM_START()
	TEXT

C mp_limb_t
C mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn,
C		       mp_srcptr np, mp_size_t nn, mp_limb_t d,
C		       mp_limb_t dinv, int cnt)
C
C Entry point for callers that already have dinv and the shift count.  It
C performs the same register setup as mpn_divrem_1 and then jumps straight
C into the loops of that function, so mpn_invert_limb is never called on
C this path:
C   d has its top bit set  -> L(nent), d is used as passed in
C   otherwise              -> d is shifted left by cnt, then L(uent)
C rax is zero when either loop is entered.
C
C NOTE(review): there is no early exit for nn + fn == 0 here, and L(uent)
C loads a source limb unconditionally, so presumably nn >= 1 is required.
C Confirm against the callers.
	ALIGN(16)
PROLOGUE(mpn_preinv_divrem_1)
	FUNC_ENTRY(4)
C On DOS64 the fifth and sixth arguments are fetched from the stack.
IFDOS(`	mov	64(%rsp), %r9	')
IFDOS(`	mov	56(%rsp), %r8	')
	push	%r13
	push	%r12
	push	%rbp
	push	%rbx
	xor	R32(%rax), R32(%rax)	C initial remainder is zero

	mov	un_param, un
	mov	fn_param, fn
	add	fn_param, un_param	C un_param = nn + fn
	lea	-8(qp,un_param,8), qp	C qp -> most significant quotient limb
	mov	up_param, up		C overwrites fn_param, now saved in fn

	test	d, d
	js	L(nent)			C top bit of d set

	mov	CNTOFF(%rsp), R8(cnt)	C shift count from the stack
	shl	R8(cnt), d
	jmp	L(uent)
EPILOGUE()
93
C mp_limb_t
C mpn_divrem_1 (mp_ptr qp, mp_size_t fn,
C		mp_srcptr np, mp_size_t nn, mp_limb_t d)
C
C Divides the nn limbs at np by the single limb d and then develops fn more
C quotient limbs using zero dividend limbs.  Quotient limbs are stored from
C qp[nn+fn-1] downwards; the remainder is returned in rax.  If nn + fn is
C zero nothing is stored and zero is returned.
C
C Overall structure:
C   L(normalized)    d has its top bit set.  Integer limbs are handled by the
C                    L(ntop) loop.
C   L(unnormalized)  d is shifted left until its top bit is set; dividend
C                    limbs are shifted by the same count on the fly in the
C                    L(utop) loop, with the last limb handled at L(uend).
C   L(87)            Common tail: the L(ftop) loop produces the fn quotient
C                    limbs for zero dividend limbs, then the remainder is
C                    shifted back right by the count in rcx.
C
C L(nent) and L(uent) are also jumped to from mpn_preinv_divrem_1.
C
C Register use inside the loops:
C   rax        remainder between steps, low product word inside a step
C   rbp / r11  remainder + 1 for the next step (rbp in L(ntop), r11 elsewhere)
C   r13        quotient limb being formed
C   rcx        normalisation shift count (zero on the normalized path)
C   rbx, r12   remaining counts un and fn
C
C Calls to mpn_invert_limb:  both call sites keep rsp 16-byte aligned at the
C call, and on DOS64 reserve the 32 bytes of shadow space that the callee is
C allowed to overwrite.  The byte counts below rely on the prologue having
C pushed four registers, plus 16 bytes taken by FUNC_ENTRY on DOS64 (which
C is what the 56(%rsp) offset of the fifth argument corresponds to).
	ALIGN(16)
PROLOGUE(mpn_divrem_1)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8	')
	xor	R32(%rax), R32(%rax)	C return value if nn + fn == 0
	push	%r13
	push	%r12
	push	%rbp
	push	%rbx

	mov	fn_param, fn
	mov	un_param, un
	add	fn_param, un_param	C un_param = nn + fn, sets ZF
	mov	up_param, up		C mov keeps the flags of the add
	je	L(ret)			C nn + fn == 0

	lea	-8(qp,un_param,8), qp	C qp -> most significant quotient limb
	xor	R32(%rbp), R32(%rbp)

	test	d, d
	jns	L(unnormalized)		C top bit of d clear

L(normalized):
	test	un, un
	je	L(8)			C un == 0
C The most significant limb needs no multiplication: since d has its top
C bit set the quotient limb is 1 if the limb is >= d, else 0.
	mov	-8(up,un,8), %rbp
	dec	un
	mov	%rbp, %rax
	sub	d, %rbp			C CF set if limb < d
	cmovc	%rax, %rbp		C borrow: keep the limb as remainder
	sbb	R32(%rax), R32(%rax)	C 0 or -1 (cmovc left CF untouched)
	inc	R32(%rax)		C 1 or 0
	mov	%rax, (qp)
	lea	-8(qp), qp
L(8):
C dinv = mpn_invert_limb (d).  rbp (remainder) and rbx, r12 survive the call
C as callee-saved registers; rdi, rsi and r8 are saved here as needed.
C STD64: 4 + 3 pushes leave rsp aligned.  DOS64: 16 + 4*8 + 8 bytes leave
C rsp aligned, and 32 bytes of shadow space are added.
IFSTD(`	push	%rdi		')
IFSTD(`	push	%rsi		')
	push	%r8
IFSTD(`	mov	d, %rdi		')
IFDOS(`	sub	$32, %rsp	')
IFDOS(`	mov	d, %rcx		')
	CALL(	mpn_invert_limb)
IFDOS(`	add	$32, %rsp	')
	pop	%r8
IFSTD(`	pop	%rsi		')
IFSTD(`	pop	%rdi		')

	mov	%rax, dinv
	mov	%rbp, %rax		C rax = remainder so far
	jmp	L(nent)

C One division step per iteration.  With r the remainder in rax, r + 1 in
C rbp and n0 = up[un]:
C   rdx:rax = r * dinv + ((r + 1) : n0)		rdx is the candidate quotient
C   r10     = n0 - rdx * d			candidate remainder
C   if r10 >= low product word:  quotient -= 1, remainder = r10 + d
C   if remainder >= d:           quotient += 1, remainder -= d   (L(nfx))
C The numeric columns are the timing annotations of the original source.
	ALIGN(16)
L(ntop):				C	    K8-K10   P6-CNR P6-NHM  P4
	mov	(up,un,8), %r10		C
	mul	dinv			C	      0,13   0,20   0,18   0,45
	add	%r10, %rax		C	      4      8      3     12
	adc	%rbp, %rdx		C	      5      9     10     13
	mov	%rax, %rbp		C	      5      9      4     13
	mov	%rdx, %r13		C	      6     11     12     23
	imul	d, %rdx			C	      6     11     11     23
	sub	%rdx, %r10		C	     10     16     14     33
	mov	d, %rax			C
	add	%r10, %rax		C	     11     17     15     34
	cmp	%rbp, %r10		C	     11     17     15     34
	cmovc	%r10, %rax		C	     12     18     16     35
	adc	$-1, %r13		C
	cmp	d, %rax			C
	jae	L(nfx)			C
L(nok):	mov	%r13, (qp)		C
	sub	$8, qp			C
L(nent):lea	1(%rax), %rbp		C
	dec	un			C
	jns	L(ntop)			C

	xor	R32(%rcx), R32(%rcx)	C no shift to undo at the end
	jmp	L(87)

L(nfx):	sub	d, %rax
	inc	%r13
	jmp	L(nok)

L(unnormalized):
	test	un, un
	je	L(44)			C no integer limbs
	mov	-8(up,un,8), %rax	C most significant dividend limb
	cmp	d, %rax
	jae	L(44)			C limb >= d: take the general path
C Limb < d: the top quotient limb is zero and the limb itself is the
C initial remainder.  The limb is strictly below d here, so no test for
C equality is needed.
	mov	%rbp, (qp)		C rbp is still zero
	mov	%rax, %rbp
	lea	-8(qp), qp
	dec	un
L(44):
C The low six bits of rcx become 63 - (index of the top set bit of d), the
C shift that moves that bit to bit 63.  Shifts only use those low bits.
	bsr	d, %rcx
	not	R32(%rcx)
	shl	R8(%rcx), d
	shl	R8(%rcx), %rbp		C remainder shifted the same way

C dinv = mpn_invert_limb (shifted d).  rcx is saved as well here, which
C makes the number of pushes even, so extra padding is needed to keep rsp
C 16-byte aligned at the call: 8 bytes on STD64, and 8 bytes on top of the
C 32 bytes of shadow space on DOS64.
	push	%rcx
IFSTD(`	push	%rdi		')
IFSTD(`	push	%rsi		')
	push	%r8
IFSTD(`	sub	$8, %rsp	')
IFSTD(`	mov	d, %rdi		')
IFDOS(`	sub	$40, %rsp	')
IFDOS(`	mov	d, %rcx		')
	CALL(	mpn_invert_limb)
IFSTD(`	add	$8, %rsp	')
IFDOS(`	add	$40, %rsp	')
	pop	%r8
IFSTD(`	pop	%rsi		')
IFSTD(`	pop	%rdi		')
	pop	%rcx

	mov	%rax, dinv
	mov	%rbp, %rax		C rax = shifted remainder so far
	test	un, un
	je	L(87)			C only fraction limbs remain

C Merge the top bits of the next limb into the shifted remainder.  Negating
C rcx turns the left shift count into the complementary right shift count.
L(uent):dec	un
	mov	(up,un,8), %rbp
	neg	R32(%rcx)
	shr	R8(%rcx), %rbp
	neg	R32(%rcx)
	or	%rbp, %rax
	jmp	L(ent)

C Same division step as in L(ntop), except that the dividend limb is built
C in rbp from two adjacent source limbs: rbp holds the upper limb on entry
C and r10 the limb below it.  r11 holds remainder + 1.
	ALIGN(16)
L(utop):mov	(up,un,8), %r10
	shl	R8(%rcx), %rbp
	neg	R32(%rcx)
	shr	R8(%rcx), %r10
	neg	R32(%rcx)
	or	%r10, %rbp
	mul	dinv
	add	%rbp, %rax
	adc	%r11, %rdx
	mov	%rax, %r11
	mov	%rdx, %r13
	imul	d, %rdx
	sub	%rdx, %rbp
	mov	d, %rax
	add	%rbp, %rax
	cmp	%r11, %rbp
	cmovc	%rbp, %rax
	adc	$-1, %r13
	cmp	d, %rax
	jae	L(ufx)
L(uok):	mov	%r13, (qp)
	sub	$8, qp
L(ent):	mov	(up,un,8), %rbp
	dec	un
	lea	1(%rax), %r11		C lea keeps the flags of the dec
	jns	L(utop)

C Last integer limb: there is no lower limb to merge, only the shift.
L(uend):shl	R8(%rcx), %rbp
	mul	dinv
	add	%rbp, %rax
	adc	%r11, %rdx
	mov	%rax, %r11
	mov	%rdx, %r13
	imul	d, %rdx
	sub	%rdx, %rbp
	mov	d, %rax
	add	%rbp, %rax
	cmp	%r11, %rbp
	cmovc	%rbp, %rax
	adc	$-1, %r13
	cmp	d, %rax
	jae	L(efx)
L(eok):	mov	%r13, (qp)
	sub	$8, qp
	jmp	L(87)

L(ufx):	sub	d, %rax
	inc	%r13
	jmp	L(uok)
L(efx):	sub	d, %rax
	inc	%r13
	jmp	L(eok)

C Fraction part.  rbp = -d, so the imul below yields 0 - quotient * d, the
C candidate remainder for a zero dividend limb.
L(87):	mov	d, %rbp
	neg	%rbp
	jmp	L(fent)

C Division step for a zero dividend limb.  Only the first adjustment of
C the L(ntop) step is performed here.
	ALIGN(16)			C	    K8-K10   P6-CNR P6-NHM  P4
L(ftop):mul	dinv			C	      0,12   0,17   0,17
	add	%r11, %rdx		C	      5      8     10
	mov	%rax, %r11		C	      4      8      3
	mov	%rdx, %r13		C	      6      9     11
	imul	%rbp, %rdx		C	      6      9     11
	mov	d, %rax			C
	add	%rdx, %rax		C	     10     14     14
	cmp	%r11, %rdx		C	     10     14     14
	cmovc	%rdx, %rax		C	     11     15     15
	adc	$-1, %r13		C
	mov	%r13, (qp)		C
	sub	$8, qp			C
L(fent):lea	1(%rax), %r11		C
	dec	fn			C
	jns	L(ftop)			C

	shr	R8(%rcx), %rax		C undo the normalisation shift
L(ret):	pop	%rbx
	pop	%rbp
	pop	%r12
	pop	%r13
	FUNC_EXIT()
	ret
EPILOGUE()
298