lshift.asm revision 1.1.1.1 1 dnl AMD64 mpn_lshift -- mpn left shift.
2
3 dnl Copyright 2003, 2005, 2007, 2009 Free Software Foundation, Inc.
4 dnl
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or
8 dnl modify it under the terms of the GNU Lesser General Public License as
9 dnl published by the Free Software Foundation; either version 3 of the
10 dnl License, or (at your option) any later version.
11 dnl
12 dnl The GNU MP Library is distributed in the hope that it will be useful,
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 dnl Lesser General Public License for more details.
16 dnl
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
19
20 include(`../config.m4')
21
22
23 C cycles/limb cycles/limb cnt=1
24 C K8,K9: 2.375 1.375
25 C K10: 2.375 1.375
26 C P4: 8 10.5
27 C P6-15 (Core2): 2.11 4.28
28 C P6-28 (Atom): 5.75 3.5
29
30
31 C INPUT PARAMETERS
C m4 names for the argument registers. The assignment matches the first four
C integer argument registers of the System V AMD64 calling convention.
32 define(`rp', `%rdi') C destination limb pointer (written by the routine)
33 define(`up', `%rsi') C source limb pointer (read by the routine)
34 define(`n', `%rdx') C number of 64-bit limbs
35 define(`cnt', `%rcx') C shift count; the code below names %rcx directly
36
37 ASM_START()
38 TEXT
39 ALIGN(32)
C ---------------------------------------------------------------------------
C mpn_lshift: shift the n-limb number at up left by cnt bits, store the n
C result limbs at rp, and return in %rax the bits shifted out of the top limb.
C
C In:      rp (%rdi), up (%rsi), n (%rdx), shift count in %rcx (only the
C          low byte is examined or used as a shift count).
C Out:     %rax = up[n-1] >> (64-cnt)
C Modifies: %rcx, %rdx, %rsi, %rdi, %r8-%r11 and the flags. The stack is not
C          used and only rp[0..n-1] is stored to.
C
C NOTE(review): nothing here validates the arguments. The code appears to
C assume n >= 1 and 1 <= cnt <= 63 -- confirm against the mpn interface docs.
C NOTE(review): R8() and R32() are defined outside this file. They are
C presumed to give the 8-bit and 32-bit names of a register -- confirm.
C
C Two code paths:
C   cnt == 1 without harmful overlap: add-with-carry doubling, low limb first.
C   everything else, at L(gen):       shl/shr/or merging, high limb first.
C ---------------------------------------------------------------------------
40 PROLOGUE(mpn_lshift)
41 cmp $1, R8(%rcx)
42 jne L(gen)
43
44 C For cnt=1 we want to work from lowest limb towards higher limbs.
45 C Check for bad overlap (rp=up is OK!): rp=up+1..up+n-1 is bad, since a
C low-to-high pass would overwrite source limbs before they are read.
46 C FIXME: this could surely be done more cleverly.
47
48 mov rp, %rax
49 sub up, %rax C rax = rp - up in bytes
50 je L(fwd) C rp = up
51 shr $3, %rax C unsigned distance in limbs (very large when rp is below up)
52 cmp n, %rax
53 jb L(gen) C rp is 1..n-1 limbs above up: use the high-to-low code
54
55 L(fwd): mov R32(n), R32(%rax) C keep a copy of n to count the leftover limbs
56 shr $2, n C n = number of 4-limb loop passes
57 je L(e1) C fewer than 4 limbs: eax already holds the whole count
58 and $3, R32(%rax) C eax = leftover limbs (0..3); and also clears CF for the first adc
59
C Four limbs per pass. adc x,x computes 2*x + CF, that is x << 1 with the bit
C carried out of the limb below shifted in. lea and dec are used for the
C pointer and counter updates because they leave CF untouched, so the carry
C chain survives from one pass to the next.
C NOTE(review): the two nops are presumably padding that places L(t1) at a
C chosen offset after the ALIGN -- confirm before removing them.
60 ALIGN(8)
61 nop
62 nop
63 L(t1): mov (up), %r8
64 mov 8(up), %r9
65 mov 16(up), %r10
66 mov 24(up), %r11
67 lea 32(up), up
68 adc %r8, %r8
69 mov %r8, (rp)
70 adc %r9, %r9
71 mov %r9, 8(rp)
72 adc %r10, %r10
73 mov %r10, 16(rp)
74 adc %r11, %r11
75 mov %r11, 24(rp)
76 lea 32(rp), rp
77 dec n
78 jne L(t1)
79
C inc followed by dec leaves eax unchanged and sets ZF from its value while
C preserving the CF left by the last adc.
80 inc R32(%rax)
81 dec R32(%rax)
82 jne L(n00) C 1..3 leftover limbs
83 adc R32(%rax), R32(%rax) C eax = 0 here, so the result is just CF: the return value
84 ret
85 L(e1): test R32(%rax), R32(%rax) C clear cy
C Leftover limbs. eax counts them and is decremented with dec so that CF keeps
C flowing through the adc chain.
86 L(n00): mov (up), %r8
87 dec R32(%rax)
88 jne L(n01) C more than one leftover limb
89 adc %r8, %r8
90 mov %r8, (rp)
C NOTE(review): no jump to L(ret) is visible in this file.
91 L(ret): adc R32(%rax), R32(%rax) C eax = 0 here: return CF
92 ret
93 L(n01): dec R32(%rax)
94 mov 8(up), %r9
95 jne L(n10) C three leftover limbs
96 adc %r8, %r8
97 adc %r9, %r9
98 mov %r8, (rp)
99 mov %r9, 8(rp)
100 adc R32(%rax), R32(%rax) C eax = 0 here: return CF
101 ret
102 L(n10): mov 16(up), %r10
103 adc %r8, %r8
104 adc %r9, %r9
105 adc %r10, %r10
106 mov %r8, (rp)
107 mov %r9, 8(rp)
108 mov %r10, 16(rp)
109 adc $-1, R32(%rax) C eax = 1 here, so 1 + (-1) + CF leaves just CF
110 ret
111
C General path, also taken for cnt=1 when rp lies 1..n-1 limbs above up.
C Limbs are produced from the most significant end downwards:
C   rp[i] = (up[i] << cnt) | (up[i-1] >> (64-cnt)),   rp[0] = up[0] << cnt
C cl is flipped between cnt and -cnt with neg. A 64-bit shift uses only the
C low 6 bits of cl, so -cnt acts as 64-cnt.
112 L(gen): neg R32(%rcx) C put rsh count in cl
113 mov -8(up,n,8), %rax
114 shr R8(%rcx), %rax C function return value
115
116 neg R32(%rcx) C put lsh count in cl
C Dispatch on (n+1) mod 4 and peel 0, 1, 2 or 3 limbs so that the count left
C for L(rlx) is 3 mod 4.
117 lea 1(n), R32(%r8)
118 and $3, R32(%r8)
119 je L(rlx) C jump for n = 3, 7, 11, ...
120
121 dec R32(%r8)
122 jne L(1)
123 C n = 4, 8, 12, ...
C Peel the top limb.
124 mov -8(up,n,8), %r10
125 shl R8(%rcx), %r10
126 neg R32(%rcx) C put rsh count in cl
127 mov -16(up,n,8), %r8
128 shr R8(%rcx), %r8
129 or %r8, %r10
130 mov %r10, -8(rp,n,8)
131 dec n
132 jmp L(rll)
133
134 L(1): dec R32(%r8)
135 je L(1x) C jump for n = 1, 5, 9, 13, ...
136 C n = 2, 6, 10, 14, ...
C Peel the top limb, then continue as in the 1 mod 4 case.
137 mov -8(up,n,8), %r10
138 shl R8(%rcx), %r10
139 neg R32(%rcx) C put rsh count in cl
140 mov -16(up,n,8), %r8
141 shr R8(%rcx), %r8
142 or %r8, %r10
143 mov %r10, -8(rp,n,8)
144 dec n
145 neg R32(%rcx) C put lsh count in cl
146 L(1x):
147 cmp $1, n
148 je L(ast) C only the lowest limb is left
C Peel two more limbs, leaving a count that is 3 mod 4.
149 mov -8(up,n,8), %r10
150 shl R8(%rcx), %r10
151 mov -16(up,n,8), %r11
152 shl R8(%rcx), %r11
153 neg R32(%rcx) C put rsh count in cl
154 mov -16(up,n,8), %r8
155 mov -24(up,n,8), %r9
156 shr R8(%rcx), %r8
157 or %r8, %r10
158 shr R8(%rcx), %r9
159 or %r9, %r11
160 mov %r10, -8(rp,n,8)
161 mov %r11, -16(rp,n,8)
162 sub $2, n
163
164 L(rll): neg R32(%rcx) C put lsh count in cl
C Here n is 3 mod 4. Start the two highest remaining result limbs: r10 and
C r11 receive the left-shifted parts of rp[n-1] and rp[n-2].
165 L(rlx): mov -8(up,n,8), %r10
166 shl R8(%rcx), %r10
167 mov -16(up,n,8), %r11
168 shl R8(%rcx), %r11
169
170 sub $4, n C 4
171 jb L(end) C 2
C Main loop: each pass stores four result limbs, rp[n+3] down to rp[n], with
C n already lowered by 4. On entry r10 and r11 hold the left-shifted parts of
C rp[n+3] and rp[n+2]. On exit they hold the left-shifted parts of the next
C two limbs down.
172 ALIGN(16)
173 L(top):
174 C finish stuff from lsh block
175 neg R32(%rcx) C put rsh count in cl
176 mov 16(up,n,8), %r8
177 mov 8(up,n,8), %r9
178 shr R8(%rcx), %r8
179 or %r8, %r10
180 shr R8(%rcx), %r9
181 or %r9, %r11
182 mov %r10, 24(rp,n,8)
183 mov %r11, 16(rp,n,8)
184 C start two new rsh
185 mov 0(up,n,8), %r8
186 mov -8(up,n,8), %r9
187 shr R8(%rcx), %r8
188 shr R8(%rcx), %r9
189
190 C finish stuff from rsh block
191 neg R32(%rcx) C put lsh count in cl
192 mov 8(up,n,8), %r10
193 mov 0(up,n,8), %r11
194 shl R8(%rcx), %r10
195 or %r10, %r8
196 shl R8(%rcx), %r11
197 or %r11, %r9
198 mov %r8, 8(rp,n,8)
199 mov %r9, 0(rp,n,8)
200 C start two new lsh
201 mov -8(up,n,8), %r10
202 mov -16(up,n,8), %r11
203 shl R8(%rcx), %r10
204 shl R8(%rcx), %r11
205
206 sub $4, n
207 jae L(top) C 2
C Tail: n is now -1 and three limbs remain. r10 and r11 hold up[2] << cnt and
C up[1] << cnt, so the addressing below uses fixed offsets from up and rp.
208 L(end):
209 neg R32(%rcx) C put rsh count in cl
210 mov 8(up), %r8
211 shr R8(%rcx), %r8
212 or %r8, %r10
213 mov (up), %r9
214 shr R8(%rcx), %r9
215 or %r9, %r11
216 mov %r10, 16(rp)
217 mov %r11, 8(rp)
218
219 neg R32(%rcx) C put lsh count in cl
C The lowest limb has nothing below it to merge in.
220 L(ast): mov (up), %r10
221 shl R8(%rcx), %r10
222 mov %r10, (rp)
223 ret
224 EPILOGUE()
225