lshift.asm revision 1.1 1 1.1 mrg dnl Intel Atom mpn_lshift -- mpn left shift.
2 1.1 mrg
3 1.1 mrg dnl Copyright 2011 Free Software Foundation, Inc.
4 1.1 mrg
5 1.1 mrg dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
6 1.1 mrg
7 1.1 mrg dnl This file is part of the GNU MP Library.
8 1.1 mrg dnl
9 1.1 mrg dnl The GNU MP Library is free software; you can redistribute it and/or
10 1.1 mrg dnl modify it under the terms of the GNU Lesser General Public License as
11 1.1 mrg dnl published by the Free Software Foundation; either version 3 of the
12 1.1 mrg dnl License, or (at your option) any later version.
13 1.1 mrg dnl
14 1.1 mrg dnl The GNU MP Library is distributed in the hope that it will be useful,
15 1.1 mrg dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
16 1.1 mrg dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 1.1 mrg dnl Lesser General Public License for more details.
18 1.1 mrg dnl
19 1.1 mrg dnl You should have received a copy of the GNU Lesser General Public License
20 1.1 mrg dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
21 1.1 mrg
22 1.1 mrg include(`../config.m4')
23 1.1 mrg
24 1.1 mrg C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
25 1.1 mrg C unsigned cnt);
26 1.1 mrg
27 1.1 mrg C cycles/limb
28 1.1 mrg C cnt!=1 cnt==1
29 1.1 mrg C P5
30 1.1 mrg C P6 model 0-8,10-12
31 1.1 mrg C P6 model 9 (Banias)
32 1.1 mrg C P6 model 13 (Dothan)
33 1.1 mrg C P4 model 0 (Willamette)
34 1.1 mrg C P4 model 1 (?)
35 1.1 mrg C P4 model 2 (Northwood)
36 1.1 mrg C P4 model 3 (Prescott)
37 1.1 mrg C P4 model 4 (Nocona)
38 1.1 mrg C Intel Atom 5 2.5
39 1.1 mrg C AMD K6
40 1.1 mrg C AMD K7
41 1.1 mrg C AMD K8
42 1.1 mrg C AMD K10
43 1.1 mrg
44 1.1 mrg defframe(PARAM_CNT, 16) C 4th argument: shift count
45 1.1 mrg defframe(PARAM_SIZE,12) C 3rd argument: size in limbs
46 1.1 mrg defframe(PARAM_SRC, 8) C 2nd argument: source pointer
47 1.1 mrg defframe(PARAM_DST, 4) C 1st argument: destination pointer
48 1.1 mrg
49 1.1 mrg dnl re-use parameter space: each slot is overwritten only after its argument has been loaded into a register
50 1.1 mrg define(SAVE_UP,`PARAM_CNT') C holds the caller value of esi
51 1.1 mrg define(VAR_COUNT,`PARAM_SIZE') C in-memory loop counter, initialised to size/2
52 1.1 mrg define(SAVE_EBX,`PARAM_SRC') C holds the caller value of ebx
53 1.1 mrg define(SAVE_EBP,`PARAM_DST') C holds the caller value of ebp
54 1.1 mrg
55 1.1 mrg define(`rp', `%edi') C destination pointer register
56 1.1 mrg define(`up', `%esi') C source pointer register
57 1.1 mrg define(`cnt', `%ecx') C shift count register, shifts use its low byte cl
58 1.1 mrg
59 1.1 mrg ASM_START()
60 1.1 mrg TEXT
61 1.1 mrg ALIGN(8)
62 1.1 mrg deflit(`FRAME',0)
63 1.1 mrg PROLOGUE(mpn_lshift) C shift size limbs at src left by cnt bits into dst, return the shifted-out bits in eax
64 1.1 mrg mov PARAM_CNT, cnt C cnt = shift count
65 1.1 mrg mov PARAM_SIZE, %edx C edx = size in limbs
66 1.1 mrg mov up, SAVE_UP C save esi in the PARAM_CNT slot, which was read just above
67 1.1 mrg mov PARAM_SRC, up C up = src
68 1.1 mrg push rp FRAME_pushl() C save edi on the stack
69 1.1 mrg mov PARAM_DST, rp C rp = dst
70 1.1 mrg
71 1.1 mrg C We can use faster code for shift-by-1 under certain conditions.
72 1.1 mrg cmp $1,cnt
73 1.1 mrg jne L(normal) C cnt != 1: general code
74 1.1 mrg cmpl rp, up C unsigned compare of the two pointers
75 1.1 mrg jnc L(special) C jump if s_ptr >= res_ptr
76 1.1 mrg leal (up,%edx,4),%eax C eax = s_ptr + size, in limbs of 4 bytes
77 1.1 mrg cmpl %eax,rp
78 1.1 mrg jnc L(special) C jump if res_ptr >= s_ptr + size
79 1.1 mrg
80 1.1 mrg L(normal): C general path: walks from the most significant limb downwards
81 1.1 mrg lea -4(up,%edx,4), up C up = address of the top source limb
82 1.1 mrg mov %ebx, SAVE_EBX C save ebx in the PARAM_SRC slot, already loaded into up
83 1.1 mrg lea -4(rp,%edx,4), rp C rp = address of the top destination limb
84 1.1 mrg
85 1.1 mrg shr %edx C edx = size/2, CF = 1 if size is odd
86 1.1 mrg mov (up), %eax C eax = top source limb, mov leaves CF untouched
87 1.1 mrg mov %edx, VAR_COUNT C loop counter lives in the PARAM_SIZE slot
88 1.1 mrg jnc L(evn) C jump if size is even
89 1.1 mrg
90 1.1 mrg mov %eax, %ebx C odd size: handle the top limb on its own
91 1.1 mrg shl %cl, %ebx C ebx = top limb << cnt
92 1.1 mrg neg cnt C cl = -cnt, the shifter only uses the low 5 bits so this acts as 32-cnt
93 1.1 mrg shr %cl, %eax C eax = bits shifted out of the top limb, the return value
94 1.1 mrg test %edx, %edx
95 1.1 mrg jnz L(gt1) C jump if size > 1
96 1.1 mrg mov %ebx, (rp) C size == 1: store the single result limb
97 1.1 mrg jmp L(quit) C ebp was never touched on this path
98 1.1 mrg
99 1.1 mrg L(gt1): mov %ebp, SAVE_EBP C save ebp in the PARAM_DST slot, already loaded into rp
100 1.1 mrg push %eax C park the return value. NOTE(review): no FRAME_pushl here, presumably because L(evn) is assembled next and still runs one push deep -- confirm
101 1.1 mrg mov -4(up), %eax C next limb down
102 1.1 mrg mov %eax, %ebp C keep a copy for the left shift done at L(top) or L(end)
103 1.1 mrg shr %cl, %eax C cl is -cnt here: bits that move up into the limb above
104 1.1 mrg jmp L(lo1) C enter the loop at its midpoint
105 1.1 mrg
106 1.1 mrg L(evn): mov %ebp, SAVE_EBP C save ebp in the PARAM_DST slot, already loaded into rp
107 1.1 mrg neg cnt C cl = -cnt for the right shifts below
108 1.1 mrg mov %eax, %ebp C ebp = copy of the top limb
109 1.1 mrg mov -4(up), %edx C edx = second limb from the top
110 1.1 mrg shr %cl, %eax C eax = bits shifted out of the top limb, the return value
111 1.1 mrg mov %edx, %ebx C ebx = copy of the second limb
112 1.1 mrg shr %cl, %edx C edx = bits of the second limb that move up into the top limb
113 1.1 mrg neg cnt C cl = cnt again for the left shifts
114 1.1 mrg decl VAR_COUNT C sets ZF when size == 2
115 1.1 mrg lea 4(rp), rp C lea does not modify flags, ZF from decl survives
116 1.1 mrg lea -4(up), up
117 1.1 mrg jz L(end) C size == 2: skip the loop, eax still holds the return value
118 1.1 mrg push %eax FRAME_pushl() C park the return value, eax is reused inside the loop
119 1.1 mrg
120 1.1 mrg ALIGN(8)
121 1.1 mrg L(top): shl %cl, %ebp C two limbs per iteration, cl is cnt at this point
122 1.1 mrg or %ebp, %edx C edx = complete result limb
123 1.1 mrg shl %cl, %ebx
124 1.1 mrg neg cnt C cl = -cnt for the right shifts
125 1.1 mrg mov -4(up), %eax
126 1.1 mrg mov %eax, %ebp
127 1.1 mrg mov %edx, -4(rp)
128 1.1 mrg shr %cl, %eax
129 1.1 mrg lea -8(rp), rp C step the destination down by two limbs
130 1.1 mrg L(lo1): mov -8(up), %edx C mid-loop entry used by the odd-size path
131 1.1 mrg or %ebx, %eax C eax = complete result limb
132 1.1 mrg mov %edx, %ebx
133 1.1 mrg shr %cl, %edx
134 1.1 mrg lea -8(up), up C step the source down by two limbs
135 1.1 mrg neg cnt C cl = cnt again
136 1.1 mrg mov %eax, (rp)
137 1.1 mrg decl VAR_COUNT
138 1.1 mrg jg L(top) C loop while the counter is still positive
139 1.1 mrg
140 1.1 mrg pop %eax FRAME_popl() C recover the return value
141 1.1 mrg L(end): C wind down: finish the two limbs still held in registers
142 1.1 mrg shl %cl, %ebp
143 1.1 mrg shl %cl, %ebx C ebx gets nothing ORed in: it is the lowest result limb
144 1.1 mrg or %ebp, %edx
145 1.1 mrg mov SAVE_EBP, %ebp C restore ebp
146 1.1 mrg mov %edx, -4(rp)
147 1.1 mrg mov %ebx, -8(rp)
148 1.1 mrg
149 1.1 mrg L(quit): C common exit of the general path
150 1.1 mrg mov SAVE_UP, up C restore esi
151 1.1 mrg mov SAVE_EBX, %ebx C restore ebx
152 1.1 mrg pop rp FRAME_popl() C restore edi
153 1.1 mrg ret
154 1.1 mrg
155 1.1 mrg L(special): C cnt == 1 path: walks upwards, doubling each limb with adc so CF carries the bit between limbs
156 1.1 mrg deflit(`FRAME',4)
157 1.1 mrg lea 3(%edx), %eax C size + 3
158 1.1 mrg dec %edx C size - 1
159 1.1 mrg mov (up), %ecx C ecx = lowest source limb, the count register is free since cnt is known to be 1
160 1.1 mrg shr $2, %eax C (size + 3) / 4
161 1.1 mrg and $3, %edx C (size - 1) % 4, and also clears CF
162 1.1 mrg jz L(goloop) C jump if size == 1 (mod 4)
163 1.1 mrg shr %edx C CF = low bit of (size - 1) % 4
164 1.1 mrg jnc L(odd) C jump if size == 3 (mod 4), CF is 0 on this path
165 1.1 mrg
166 1.1 mrg add %ecx, %ecx C size is even: shift the lowest limb now, CF = its top bit
167 1.1 mrg lea 4(up), up
168 1.1 mrg mov %ecx, (rp)
169 1.1 mrg mov (up), %ecx C preload the next limb
170 1.1 mrg lea 4(rp), rp
171 1.1 mrg
172 1.1 mrg dec %edx C dec leaves CF alone; edx was 0 for size == 2 (mod 4) and 1 for size == 0 (mod 4)
173 1.1 mrg jnz L(goloop) C jump if size == 2 (mod 4)
174 1.1 mrg L(odd): lea -8(up), up C bias both pointers by two limbs for the mid-loop entry
175 1.1 mrg lea -8(rp), rp
176 1.1 mrg jmp L(sentry) C reached if size == 0 or 3 (mod 4)
177 1.1 mrg
178 1.1 mrg L(sloop): C four limbs per iteration, ecx already holds the limb at (up)
179 1.1 mrg adc %ecx, %ecx C limb*2 + carry in, CF = bit shifted out
180 1.1 mrg mov 4(up), %edx
181 1.1 mrg mov %ecx, (rp)
182 1.1 mrg adc %edx, %edx
183 1.1 mrg mov 8(up), %ecx
184 1.1 mrg mov %edx, 4(rp)
185 1.1 mrg L(sentry): C mid-loop entry: handles the upper two limbs of the group of four
186 1.1 mrg adc %ecx, %ecx
187 1.1 mrg mov 12(up), %edx
188 1.1 mrg mov %ecx, 8(rp)
189 1.1 mrg adc %edx, %edx
190 1.1 mrg lea 16(up), up C lea keeps CF intact for the next adc
191 1.1 mrg mov %edx, 12(rp)
192 1.1 mrg lea 16(rp), rp
193 1.1 mrg mov (up), %ecx C preload the first limb of the next group
194 1.1 mrg L(goloop):
195 1.1 mrg decl %eax C decl preserves CF
196 1.1 mrg jnz L(sloop)
197 1.1 mrg
198 1.1 mrg L(squit): C one limb is left in ecx
199 1.1 mrg adc %ecx, %ecx
200 1.1 mrg mov %ecx, (rp)
201 1.1 mrg adc %eax, %eax C eax is 0 here, so eax = final carry = return value
202 1.1 mrg
203 1.1 mrg mov SAVE_UP, up C restore esi, this path never touched ebx or ebp
204 1.1 mrg pop rp FRAME_popl() C restore edi
205 1.1 mrg ret
206 1.1 mrg EPILOGUE()
207 1.1 mrg ASM_END()
208