dive_1.asm revision 1.1 1 dnl x86 mpn_divexact_1 -- mpn by limb exact division.
2
3 dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
4 dnl
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or
8 dnl modify it under the terms of the GNU Lesser General Public License as
9 dnl published by the Free Software Foundation; either version 3 of the
10 dnl License, or (at your option) any later version.
11 dnl
12 dnl The GNU MP Library is distributed in the hope that it will be useful,
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 dnl Lesser General Public License for more details.
16 dnl
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
19
20 include(`../config.m4')
21
22
23 C cycles/limb
24 C P54 30.0
25 C P55 29.0
26 C P6 13.0 odd divisor, 12.0 even (strangely)
27 C K6 14.0
28 C K7 12.0
29 C P4 42.0
30
31
32 C mp_limb_t mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
33 C mp_limb_t divisor);
34 C
35
36 defframe(PARAM_DIVISOR,16) C divisor argument; this slot is later overwritten with the divisor stripped of its factors of two
37 defframe(PARAM_SIZE, 12) C size argument, loaded into ebp as a limb count
38 defframe(PARAM_SRC, 8) C src argument, loaded into esi
39 defframe(PARAM_DST, 4) C dst argument, loaded into edi
40
41 dnl re-use parameter space: VAR_INVERSE is the PARAM_SRC slot, written with the computed inverse only after src has been copied into esi
42 define(VAR_INVERSE,`PARAM_SRC')
43
44 TEXT
45 C mpn_divexact_1 divides the size-limb number at src by divisor and
C stores size quotient limbs at dst, lowest limb first.
C
C Outline, as implemented below:
C   1. shift the trailing zero bits out of the divisor, counting them in ecx
C   2. look up an 8-bit inverse of the remaining odd divisor and refine it
C      with two rounds of inv = 2*inv - inv*inv*d
C   3. for each limb from low to high: shift the source right by ecx bits,
C      subtract the carry from the previous step, multiply by the inverse to
C      get the quotient limb, and take the high half of q*d as the next carry
C
C Nothing here checks the inputs.  A zero divisor never leaves L(strip_twos),
C and src[0] is loaded unconditionally so size 0 is not special-cased.
C NOTE(review): presumably callers guarantee divisor != 0, size >= 1 and an
C exact division, as the function name suggests -- confirm against callers.
46 ALIGN(16)
47 PROLOGUE(mpn_divexact_1)
48 deflit(`FRAME',0)
49 C Load divisor and size, interleaved with saving the four registers that are popped again before ret.
50 movl PARAM_DIVISOR, %eax
51 pushl %ebp FRAME_pushl()
52
53 movl PARAM_SIZE, %ebp
54 pushl %edi FRAME_pushl()
55
56 pushl %ebx FRAME_pushl()
57 movl $-1, %ecx C shift count, starts at -1 so the first incl makes it 0
58
59 pushl %esi FRAME_pushl()
60 C Each pass shifts one bit out of eax into the carry flag; the loop ends right after the lowest set bit has been shifted out, leaving ecx = number of trailing zero bits.
61 L(strip_twos):
62 incl %ecx
63
64 shrl %eax
65 jnc L(strip_twos)
66
67 leal 1(%eax,%eax), %ebx C d without twos: 2*eax+1 puts back the set bit that shrl just removed
68 andl $127, %eax C d/2, 7 bits, used as the index into binvert_limb_table
69
70 ifdef(`PIC',`
71 LEA( binvert_limb_table, %edx)
72 movzbl (%eax,%edx), %eax C inv 8 bits
73 ',`
74 movzbl binvert_limb_table(%eax), %eax C inv 8 bits
75 ')
76 C Two rounds of inv = 2*inv - inv*inv*d follow, with the src/dst pointer setup interleaved between the multiplies.
77 leal (%eax,%eax), %edx C 2*inv
78 movl %ebx, PARAM_DIVISOR C d without twos replaces the original divisor in its stack slot
79
80 imull %eax, %eax C inv*inv
81
82 movl PARAM_SRC, %esi C src pointer, read before its slot is reused as VAR_INVERSE
83 movl PARAM_DST, %edi C dst pointer
84
85 imull %ebx, %eax C inv*inv*d
86
87 subl %eax, %edx C inv = 2*inv - inv*inv*d
88 leal (%edx,%edx), %eax C 2*inv
89
90 imull %edx, %edx C inv*inv
91
92 leal (%esi,%ebp,4), %esi C src end
93 leal (%edi,%ebp,4), %edi C dst end
94 negl %ebp C -size, so (esi,ebp,4) addresses src[0] and ebp counts up towards zero
95
96 imull %ebx, %edx C inv*inv*d
97
98 subl %edx, %eax C inv = 2*inv - inv*inv*d
99
100 ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
101 pushl %eax FRAME_pushl()
102 imull PARAM_DIVISOR, %eax
103 cmpl $1, %eax
104 popl %eax FRAME_popl()')
105
106 movl %eax, VAR_INVERSE C keep the inverse in the former src slot
107 movl (%esi,%ebp,4), %eax C src[0]
108
109 xorl %ebx, %ebx C carry bit = 0
110 xorl %edx, %edx C carry limb = 0
111
112 incl %ebp
113 jz L(one) C size == 1: only the high limb step at L(one) is needed
114
115 movl (%esi,%ebp,4), %edx C src[1]
116
117 shrdl( %cl, %edx, %eax) C eax = src[0] shifted right, filled from the low bits of src[1]
118
119 movl VAR_INVERSE, %edx C L(entry) multiplies eax by edx
120 jmp L(entry) C the first limb has no carry to apply
121
122
123 ALIGN(8)
124 nop C k6 code alignment
125 nop
126 L(top):
127 C eax scratch: low half of q*d on arrival, then the shifted src limb, then the next q
128 C ebx carry bit, 0 or -1
129 C ecx shift count, used through cl
130 C edx carry limb, the high half of the previous q*d
131 C esi src end
132 C edi dst end
133 C ebp counter, limbs, negative
134
135 movl -4(%esi,%ebp,4), %eax C src limb for this step
136 subl %ebx, %edx C accumulate carry bit: ebx is 0 or -1, so this adds 0 or 1 to the carry limb
137
138 movl (%esi,%ebp,4), %ebx C next higher src limb, supplies the bits shifted in
139
140 shrdl( %cl, %ebx, %eax)
141
142 subl %edx, %eax C apply carry limb; a borrow is left in the carry flag
143 movl VAR_INVERSE, %edx C movl does not change flags, so the sbbl below still sees that borrow
144
145 sbbl %ebx, %ebx C ebx = 0 or -1 according to the borrow
146
147 L(entry):
148 imull %edx, %eax C q = limb * inverse, low 32 bits
149
150 movl %eax, -4(%edi,%ebp,4) C store quotient limb
151 movl PARAM_DIVISOR, %edx C d without twos
152
153 mull %edx C edx:eax = q*d; edx is the carry limb for the next step
154
155 incl %ebp
156 jnz L(top)
157
158
159 movl -4(%esi), %eax C src high limb
160 L(one):
161 shrl %cl, %eax C plain shift: there is no higher limb to shift in
162 popl %esi FRAME_popl()
163
164 addl %ebx, %eax C apply carry bit: adds 0 or -1
165 popl %ebx FRAME_popl()
166
167 subl %edx, %eax C apply carry limb
168
169 imull VAR_INVERSE, %eax C last frame-relative access; presumably why the pops above adjust FRAME and the pops below do not
170
171 movl %eax, -4(%edi) C dst high limb
172
173 popl %edi
174 popl %ebp
175
176 ret
177
178 EPILOGUE()
179