dnl dive_1.asm revision 1.1.1.2
1 dnl AMD K7 mpn_divexact_1 -- mpn by limb exact division.
2
3 dnl Copyright 2001, 2002, 2004, 2007 Free Software Foundation, Inc.
4
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
30
31 include(`../config.m4')
32
33
34 C cycles/limb
35 C Athlon: 11.0
36 C Hammer: 9.0
37
38
39 C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
40 C mp_limb_t divisor);
41 C
42 C The dependent chain is mul+imul+sub for 11 cycles and that speed is
43 C achieved with no special effort. The load and shrld latencies are hidden
44 C by out of order execution.
45 C
46 C It's a touch faster on size==1 to use the mul-by-inverse than divl.
47
C Stack frame layout for mpn_divexact_1, given as byte offsets to defframe.
C NOTE(review): defframe and deflit are defined outside this file, presumably
C via the included config.m4; the offsets look relative to esp at function
C entry and adjusted through FRAME -- confirm against the m4 definitions.
C
C Incoming arguments, matching the C prototype order dst, src, size, divisor.
C The PARAM_DIVISOR slot is also written by the function: it is overwritten
C with the divisor stripped of its trailing zero bits and then used as the
C memory operand of mull.
48 defframe(PARAM_DIVISOR,16)
49 defframe(PARAM_SIZE, 12)
50 defframe(PARAM_SRC, 8)
51 defframe(PARAM_DST, 4)
52
C Local slots: four register save slots, then two working variables.
C VAR_INVERSE holds the multiplicative inverse of the odd divisor.
C VAR_DST_END holds the dst end pointer while edi is borrowed as scratch.
53 defframe(SAVE_EBX, -4)
54 defframe(SAVE_ESI, -8)
55 defframe(SAVE_EDI, -12)
56 defframe(SAVE_EBP, -16)
57 defframe(VAR_INVERSE, -20)
58 defframe(VAR_DST_END, -24)
59
C Bytes subtracted from esp in the prologue: six 4-byte slots, -4 to -24.
60 deflit(STACK_SPACE, 24)
61
C -----------------------------------------------------------------------------
C mpn_divexact_1 -- divide the size-limb number at src by divisor and store
C the size-limb quotient at dst.  The division is assumed to be exact.
C
C Method, as implemented below:
C   1. Shift the divisor right until its lowest set bit falls out, counting
C      the stripped trailing zero bits in ecx.
C   2. Look up a starting inverse of the odd divisor in binvert_limb_table
C      and refine it with two steps of inv = 2*inv - inv*inv*d.
C   3. For each limb from least to most significant: take the source shifted
C      right by ecx bits, subtract the running carry bit and the high limb
C      of the previous q*d, then multiply by the inverse to get the next q.
C
C Requirements visible in the code:
C   - divisor must be nonzero, otherwise the strip_twos loop never exits
C   - size must be at least 1, since src[0] is loaded unconditionally
C
C Registers: ebx, esi, edi and ebp are saved to the frame and restored on
C both exit paths.  eax, ecx and edx are overwritten without being saved.
C -----------------------------------------------------------------------------
62 TEXT
63
64 ALIGN(16)
65 PROLOGUE(mpn_divexact_1)
66 deflit(`FRAME',0)
67
68 movl PARAM_DIVISOR, %eax C eax = divisor d
69 subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE)
70 movl $-1, %ecx C shift count, starts at -1 so the first incl gives 0
71
72 movl %ebp, SAVE_EBP
73 movl PARAM_SIZE, %ebp C ebp = size in limbs
74
75 movl %esi, SAVE_ESI
76 movl %edi, SAVE_EDI
77
78 C If there's usually only one or two trailing zero bits then this
79 C should be faster than bsfl.
80 L(strip_twos):
81 incl %ecx C count the bit about to be shifted out
82 shrl %eax C low bit of d goes into the carry flag
83 jnc L(strip_twos) C repeat while that bit was zero
84
C Here ecx = number of trailing zero bits of d, and eax = the odd part of d
C with its low 1 bit already shifted out.
85 movl %ebx, SAVE_EBX
86 leal 1(%eax,%eax), %ebx C d without twos
87 andl $127, %eax C d/2, 7 bits
88
C eax is now a 7-bit index into binvert_limb_table, which is defined
C outside this file.
89 ifdef(`PIC',`
90 LEA( binvert_limb_table, %edx)
91 movzbl (%eax,%edx), %eax C inv 8 bits
92 ',`
93 movzbl binvert_limb_table(%eax), %eax C inv 8 bits
94 ')
95
C Two refinement steps inv = 2*inv - inv*inv*d follow, interleaved with the
C pointer setup.  Each step is expected to double the number of correct low
C bits, 8 to 16 to 32; the ASSERT further down states the intended result.
C The odd divisor is also written over the PARAM_DIVISOR argument slot so
C that mull can later take it as a memory operand.
96 leal (%eax,%eax), %edx C 2*inv
97 movl %ebx, PARAM_DIVISOR C d without twos
98
99 imull %eax, %eax C inv*inv
100
101 movl PARAM_SRC, %esi C esi = src
102 movl PARAM_DST, %edi C edi = dst
103
104 imull %ebx, %eax C inv*inv*d
105
106 subl %eax, %edx C inv = 2*inv - inv*inv*d
107 leal (%edx,%edx), %eax C 2*inv
108
109 imull %edx, %edx C inv*inv
110
C Point esi and edi just past the last limb and negate the count, so that
C limbs are addressed as end + 4*ebp with ebp running up towards zero.
111 leal (%esi,%ebp,4), %esi C src end
112 leal (%edi,%ebp,4), %edi C dst end
113 negl %ebp C -size
114
115 imull %ebx, %edx C inv*inv*d
116
117 subl %edx, %eax C inv = 2*inv - inv*inv*d
118
119 ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
120 pushl %eax FRAME_pushl()
121 imull PARAM_DIVISOR, %eax
122 cmpl $1, %eax
123 popl %eax FRAME_popl()')
124
125 movl %eax, VAR_INVERSE C keep the inverse in the frame, eax is reused for limbs
126 movl (%esi,%ebp,4), %eax C src[0]
127
128 incl %ebp C ebp = -(size-1)
129 jz L(one) C size == 1, no second limb to shift in
130
131 movl (%esi,%ebp,4), %edx C src[1]
132
133 shrdl( %cl, %edx, %eax) C eax = src[0] shifted right by cl, top bits from src[1]
134
135 movl %edi, VAR_DST_END C edi is borrowed as scratch inside the loop
136 xorl %ebx, %ebx C carry bit starts at 0
137 jmp L(entry) C the first limb has no carry to apply
138
139 ALIGN(8)
140 L(top):
141 C eax q
142 C ebx carry bit, 0 or 1
143 C ecx shift
144 C edx
145 C esi src end
146 C edi dst end
147 C ebp counter, limbs, negative
148
149 mull PARAM_DIVISOR C carry limb in edx
150
151 movl -4(%esi,%ebp,4), %eax C next source limb, low product in eax is discarded
152 movl (%esi,%ebp,4), %edi C the limb above it, supplies the shifted-in bits
153
154 shrdl( %cl, %edi, %eax)
155
156 subl %ebx, %eax C apply carry bit
157 setc %bl C ebx = borrow out of that subtract
158 movl VAR_DST_END, %edi C edi back to dst end
159
160 subl %edx, %eax C apply carry limb
161 adcl $0, %ebx C add the borrow from the carry limb subtract
162
163 L(entry):
164 imull VAR_INVERSE, %eax C q = adjusted limb * inverse
165
166 movl %eax, -4(%edi,%ebp,4) C store quotient limb
167 incl %ebp
168 jnz L(top) C loop until only the high limb is left
169
170
C Most significant limb.  There is no higher source limb, so a plain shrl is
C used instead of shrdl, and no new carry is computed.  Register restores are
C interleaved with the remaining arithmetic.
171 mull PARAM_DIVISOR C carry limb in edx
172
173 movl -4(%esi), %eax C src high limb
174 shrl %cl, %eax
175 movl SAVE_ESI, %esi
176
177 subl %ebx, %eax C apply carry bit
178 movl SAVE_EBX, %ebx
179 movl SAVE_EBP, %ebp
180
181 subl %edx, %eax C apply carry limb
182
183 imull VAR_INVERSE, %eax
184
185 movl %eax, -4(%edi) C store the high quotient limb
186 movl SAVE_EDI, %edi
187 addl $STACK_SPACE, %esp C release the frame
188
189 ret
190
191
C size == 1: eax already holds src[0] and no carries are involved, so the
C quotient is the shifted limb times the inverse.
192 L(one):
193 shrl %cl, %eax
194 movl SAVE_ESI, %esi
195 movl SAVE_EBX, %ebx
196
197 imull VAR_INVERSE, %eax
198
199 movl SAVE_EBP, %ebp
200 movl %eax, -4(%edi) C dst[0], since edi points one limb past it
201
202 movl SAVE_EDI, %edi
203 addl $STACK_SPACE, %esp C release the frame
204
205 ret
206
207 EPILOGUE()
208 ASM_END()
209