dnl  x86-64 mpn_lshift optimized for Pentium 4.
2
3 dnl Copyright 2003, 2005, 2007, 2008, 2012 Free Software Foundation, Inc.
4
5 dnl This file is part of the GNU MP Library.
6
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of the GNU Lesser General Public License as published
9 dnl by the Free Software Foundation; either version 3 of the License, or (at
10 dnl your option) any later version.
11
12 dnl The GNU MP Library is distributed in the hope that it will be useful, but
13 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15 dnl License for more details.
16
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
19
20 include(`../config.m4')
21
22
23 C cycles/limb
24 C AMD K8,K9 2.5
25 C AMD K10 ?
26 C Intel P4 3.29
27 C Intel core2 2.1 (fluctuates, presumably cache related)
28 C Intel corei ?
29 C Intel atom 14.3
30 C VIA nano ?
31
C INPUT PARAMETERS
define(`rp',`%rdi')		C result limbs are stored through this pointer
define(`up',`%rsi')		C source limbs are loaded through this pointer
define(`n',`%rdx')		C number of limbs
define(`cnt',`%cl')		C left shift count, in bits

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C mpn_lshift -- shift the n limbs at up left by cnt bits, store the n result
C limbs at rp, and return the bits shifted out of the top limb.
C
C Limbs are produced from the most significant end downwards:
C	rp[i] = (up[i] << cnt) | (up[i-1] >> rsh)	for i = n-1 ... 1
C	rp[0] =  up[0] << cnt
C where rsh = -cnt mod 64.  The return value is up[n-1] >> rsh.
C
C Register use:
C	%mm4		cnt, the left shift count
C	%mm5		rsh, the complementary right shift count
C	%mm0 - %mm3	limb parts being shifted and combined
C	%r8		(n + 1) mod 4, selects the entry path
C	%rax		return value
C
C The main loop produces four limbs per iteration and is entered with
C n = 3 (mod 4); the code in front of it first emits 1, 2 or 3 limbs from the
C top to get there.  up[n-1] is read unconditionally, so n >= 1 is assumed.

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_lshift)
	FUNC_ENTRY(4)
	lea	1(n), R32(%r8)		C r8 = n + 1, reduced mod 4 below
	mov	-8(up,n,8), %rax	C rax = up[n-1]
	movd	R32(%rcx), %mm4		C mm4 = cnt
	neg	R32(%rcx)
	and	$63, R32(%rcx)		C cl = rsh = -cnt mod 64
	movd	R32(%rcx), %mm5		C mm5 = rsh
	shr	R8(%rcx), %rax		C return value = up[n-1] >> rsh

	and	$3, R32(%r8)		C r8 = (n + 1) mod 4
	je	L(prime)		C n = 3, 7, 11, ...

	cmp	$2, R32(%r8)
	je	L(mod1)			C n = 1, 5, 9, 13, ...

C Here r8 is 1 (n = 4, 8, 12, ...) or 3 (n = 2, 6, 10, 14, ...).  Either way
C the top limb is emitted on its own.
	movq	-8(up,n,8), %mm2
	movq	-16(up,n,8), %mm0
	psllq	%mm4, %mm2
	psrlq	%mm5, %mm0
	por	%mm0, %mm2
	movq	%mm2, -8(rp,n,8)
	dec	n
	cmp	$1, R32(%r8)
	je	L(prime)		C n was 4, 8, 12, ... and is now 3 mod 4

C n is now 1 mod 4.
L(mod1):
	cmp	$1, n
	je	L(low)			C only rp[0] remains
C Emit two more limbs so that n becomes 3 mod 4.  up[n-2] is loaded twice,
C once for its left-shifted use and once for its right-shifted use.
	movq	-8(up,n,8), %mm2
	movq	-16(up,n,8), %mm3
	psllq	%mm4, %mm2
	psllq	%mm4, %mm3
	movq	-16(up,n,8), %mm0
	movq	-24(up,n,8), %mm1
	psrlq	%mm5, %mm0
	psrlq	%mm5, %mm1
	por	%mm0, %mm2
	por	%mm1, %mm3
	movq	%mm2, -8(rp,n,8)
	movq	%mm3, -16(rp,n,8)
	sub	$2, n

C Start the pipeline: mm2, mm3 = left-shifted parts of the top two limbs.
L(prime):
	movq	-8(up,n,8), %mm2
	psllq	%mm4, %mm2
	movq	-16(up,n,8), %mm3
	psllq	%mm4, %mm3

	sub	$4, n
	jb	L(drain)		C n was 3: no full group of four
	ALIGN(32)
L(quad):
C Complete rp[n+3] and rp[n+2]; their left-shifted parts are in mm2, mm3.
	movq	16(up,n,8), %mm0
	movq	8(up,n,8), %mm1
	psrlq	%mm5, %mm0
	por	%mm0, %mm2
	psrlq	%mm5, %mm1
	movq	(up,n,8), %mm0		C loaded early for rp[n+1]
	por	%mm1, %mm3
	movq	-8(up,n,8), %mm1	C loaded early for rp[n]
	movq	%mm2, 24(rp,n,8)
	movq	%mm3, 16(rp,n,8)
C Right-shifted parts of rp[n+1] and rp[n].
	psrlq	%mm5, %mm0
	psrlq	%mm5, %mm1

C Complete rp[n+1] and rp[n].
	movq	8(up,n,8), %mm2
	movq	(up,n,8), %mm3
	psllq	%mm4, %mm2
	por	%mm2, %mm0
	psllq	%mm4, %mm3
	movq	-8(up,n,8), %mm2	C loaded early for the next pair
	por	%mm3, %mm1
	movq	-16(up,n,8), %mm3
	movq	%mm0, 8(rp,n,8)
	movq	%mm1, (rp,n,8)
C Left-shifted parts of the next pair.
	sub	$4, n
	psllq	%mm4, %mm2
	psllq	%mm4, %mm3

	jae	L(quad)			C tests the borrow from the sub above

C Three limbs remain; mm2, mm3 hold the left-shifted parts of up[2], up[1].
L(drain):
	movq	8(up), %mm0
	movq	(up), %mm1
	psrlq	%mm5, %mm0
	psrlq	%mm5, %mm1
	por	%mm0, %mm2
	por	%mm1, %mm3
	movq	%mm2, 16(rp)
	movq	%mm3, 8(rp)

C Lowest limb: nothing below it to shift in.
L(low):
	movq	(up), %mm2
	psllq	%mm4, %mm2
	movq	%mm2, (rp)
	emms				C done with the MMX registers
	FUNC_EXIT()
	ret
EPILOGUE()
156