1 dnl AMD K7 mpn_divexact_1 -- mpn by limb exact division.
2
3 dnl Copyright 2001, 2002, 2004, 2007 Free Software Foundation, Inc.
4 dnl
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or
8 dnl modify it under the terms of the GNU Lesser General Public License as
9 dnl published by the Free Software Foundation; either version 3 of the
10 dnl License, or (at your option) any later version.
11 dnl
12 dnl The GNU MP Library is distributed in the hope that it will be useful,
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 dnl Lesser General Public License for more details.
16 dnl
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
19
20 include(`../config.m4')
21
22
23 C cycles/limb
24 C Athlon: 11.0
25 C Hammer: 9.0
26
27
28 C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
29 C mp_limb_t divisor);
30 C
31 C The dependent chain is mul+imul+sub for 11 cycles and that speed is
32 C achieved with no special effort. The load and shrld latencies are hidden
33 C by out of order execution.
34 C
35 C It's a touch faster on size==1 to use the mul-by-inverse than divl.
36
C Stack frame layout used by mpn_divexact_1.
C
C defframe and deflit are m4 macros supplied by the included support
C files (not visible in this file); each defframe gives a symbolic name
C to a stack slot at the stated byte offset.
C
C Positive offsets: the four incoming arguments, 4 bytes apart, in the
C same order as the prototype (dst, src, size, divisor).  Note that the
C PARAM_DIVISOR slot is overwritten by the routine with the divisor
C after its factors of two have been removed.
37 defframe(PARAM_DIVISOR,16)
38 defframe(PARAM_SIZE, 12)
39 defframe(PARAM_SRC, 8)
40 defframe(PARAM_DST, 4)
41
C Negative offsets: local slots.  The four SAVE_ slots hold the caller
C values of ebx, esi, edi and ebp, which are reloaded on every exit path.
42 defframe(SAVE_EBX, -4)
43 defframe(SAVE_ESI, -8)
44 defframe(SAVE_EDI, -12)
45 defframe(SAVE_EBP, -16)
C VAR_INVERSE holds the multiplicative inverse of the odd divisor.
C VAR_DST_END holds the dst end pointer, because edi is reused as a
C scratch register inside the main loop.
46 defframe(VAR_INVERSE, -20)
47 defframe(VAR_DST_END, -24)
48
C Bytes subtracted from esp on entry: six 4-byte local slots.
49 deflit(STACK_SPACE, 24)
50
C ---------------------------------------------------------------------
C mpn_divexact_1 -- divide the size-limb number at src by the single
C limb divisor and store size quotient limbs at dst.  Exact division:
C the method below does not compute or report a remainder.
C
C Method:
C   1. Count and strip the trailing zero bits of the divisor, giving a
C      shift count in ecx and an odd divisor d.
C   2. Build inv with d*inv == 1 mod 2^GMP_LIMB_BITS, starting from an
C      8-bit table value and applying inv = 2*inv - inv*inv*d twice.
C   3. Working from the low limb upwards, each quotient limb is
C         q = (src limb shifted right by ecx - carry) * inv
C      and the carry into the next limb is the high half of q*d plus
C      the borrow from the subtraction.
C
C Arguments are read from the stack through the PARAM_ names.  The
C caller values of ebx, esi, edi and ebp are kept in the SAVE_ slots and
C reloaded before both ret instructions.
C
C NOTE(review): a zero divisor never leaves the strip_twos loop, since
C no 1 bit is ever shifted out; callers presumably guarantee divisor != 0.
C ---------------------------------------------------------------------
51 TEXT
52
53 ALIGN(16)
54 PROLOGUE(mpn_divexact_1)
55 deflit(`FRAME',0)
56
C Fetch the divisor, allocate the local frame, and start the shift
C count at -1 so the first incl in the loop below makes it 0.
57 movl PARAM_DIVISOR, %eax
58 subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE)
59 movl $-1, %ecx C shift count
60
C Save caller registers as they become needed; ebp takes the limb count.
61 movl %ebp, SAVE_EBP
62 movl PARAM_SIZE, %ebp
63
64 movl %esi, SAVE_ESI
65 movl %edi, SAVE_EDI
66
67 C If there's usually only one or two trailing zero bits then this
68 C should be faster than bsfl.
C Each pass shifts one bit out of eax into the carry flag.  The loop
C ends when that bit is a 1, leaving ecx = number of trailing zero bits
C and eax = the odd part of the divisor shifted right by one.
69 L(strip_twos):
70 incl %ecx
71 shrl %eax
72 jnc L(strip_twos)
73
C Rebuild the odd divisor in ebx as 2*eax+1, and reduce eax to the low
C 7 bits of d/2 for use as the index into the inverse table.
74 movl %ebx, SAVE_EBX
75 leal 1(%eax,%eax), %ebx C d without twos
76 andl $127, %eax C d/2, 7 bits
77
C Look up the 8-bit starting inverse.  The PIC variant first obtains the
C table address in edx through the LEA macro.
78 ifdef(`PIC',`
79 LEA( binvert_limb_table, %edx)
80 movzbl (%eax,%edx), %eax C inv 8 bits
81 ',`
82 movzbl binvert_limb_table(%eax), %eax C inv 8 bits
83 ')
84
C Two refinement steps inv = 2*inv - inv*inv*d, interleaved with
C unrelated setup so the independent instructions can overlap the
C multiplies.  The odd divisor replaces the original argument in the
C PARAM_DIVISOR slot for later use by mull.
85 leal (%eax,%eax), %edx C 2*inv
86 movl %ebx, PARAM_DIVISOR C d without twos
87
88 imull %eax, %eax C inv*inv
89
90 movl PARAM_SRC, %esi
91 movl PARAM_DST, %edi
92
93 imull %ebx, %eax C inv*inv*d
94
95 subl %eax, %edx C inv = 2*inv - inv*inv*d
96 leal (%edx,%edx), %eax C 2*inv
97
98 imull %edx, %edx C inv*inv
99
C Point esi and edi just past the last limb of src and dst, and negate
C the count, so that (reg,%ebp,4) walks upward as ebp counts up to zero.
100 leal (%esi,%ebp,4), %esi C src end
101 leal (%edi,%ebp,4), %edi C dst end
102 negl %ebp C -size
103
104 imull %ebx, %edx C inv*inv*d
105
106 subl %edx, %eax C inv = 2*inv - inv*inv*d
107
C Self-check through the ASSERT macro (defined outside this file,
C presumably active only in debugging builds).  eax is preserved around
C the check by the push and pop.
108 ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
109 pushl %eax FRAME_pushl()
110 imull PARAM_DIVISOR, %eax
111 cmpl $1, %eax
112 popl %eax FRAME_popl()')
113
C Store the finished inverse and load the lowest source limb.
114 movl %eax, VAR_INVERSE
115 movl (%esi,%ebp,4), %eax C src[0]
116
C ebp becomes -(size-1).  If that is zero there is only one limb, which
C is handled separately at the one label.
117 incl %ebp
118 jz L(one)
119
120 movl (%esi,%ebp,4), %edx C src[1]
121
C Shift the low limb right by cl, filling the vacated high bits from
C src[1].
122 shrdl( %cl, %edx, %eax)
123
C Keep the dst end pointer in memory, clear the carry bit, and enter
C the loop at its second half: the first limb has no incoming carry.
124 movl %edi, VAR_DST_END
125 xorl %ebx, %ebx
126 jmp L(entry)
127
128 ALIGN(8)
129 L(top):
130 C eax q
131 C ebx carry bit, 0 or 1
132 C ecx shift
133 C edx
134 C esi src end
135 C edi dst end
136 C ebp counter, limbs, negative
137
C edx:eax = q * d.  Only the high half in edx is used, as the carry
C limb to subtract from the next source limb.
138 mull PARAM_DIVISOR C carry limb in edx
139
C Load the current limb and the one above it, using edi as a temporary
C for the upper limb, then shift the pair right by cl.
140 movl -4(%esi,%ebp,4), %eax
141 movl (%esi,%ebp,4), %edi
142
143 shrdl( %cl, %edi, %eax)
144
C Subtract the carry bit from the previous limb and record the borrow
C in bl.  The movl that restores edi does not affect the flags or ebx.
145 subl %ebx, %eax C apply carry bit
146 setc %bl
147 movl VAR_DST_END, %edi
148
C Subtract the carry limb; adcl folds the borrow from this subtraction
C into ebx, which becomes the carry bit for the next limb.
149 subl %edx, %eax C apply carry limb
150 adcl $0, %ebx
151
152 L(entry):
C Multiply by the inverse to get the quotient limb and store it.  The
C -4 displacement compensates for ebp already being one limb ahead.
153 imull VAR_INVERSE, %eax
154
155 movl %eax, -4(%edi,%ebp,4)
156 incl %ebp
157 jnz L(top)
158
159
C Final (highest) limb.  There is no limb above it, so a plain shrl is
C used instead of shrdl, and no borrow is tracked since nothing follows.
C Restores of the saved registers are interleaved with the arithmetic.
160 mull PARAM_DIVISOR C carry limb in edx
161
162 movl -4(%esi), %eax C src high limb
163 shrl %cl, %eax
164 movl SAVE_ESI, %esi
165
166 subl %ebx, %eax C apply carry bit
167 movl SAVE_EBX, %ebx
168 movl SAVE_EBP, %ebp
169
170 subl %edx, %eax C apply carry limb
171
172 imull VAR_INVERSE, %eax
173
C Store the top quotient limb through edi before edi itself is restored,
C then release the local frame.
174 movl %eax, -4(%edi)
175 movl SAVE_EDI, %edi
176 addl $STACK_SPACE, %esp
177
178 ret
179
180
C Single limb case: eax holds src[0] and edi still holds the dst end
C pointer.  The quotient is the shifted limb times the inverse; no
C carry handling is involved.
181 L(one):
182 shrl %cl, %eax
183 movl SAVE_ESI, %esi
184 movl SAVE_EBX, %ebx
185
186 imull VAR_INVERSE, %eax
187
188 movl SAVE_EBP, %ebp
189 movl %eax, -4(%edi)
190
191 movl SAVE_EDI, %edi
192 addl $STACK_SPACE, %esp
193
194 ret
195
196 EPILOGUE()
197