lshift.asm revision 1.1.1.1 1 dnl x86-64 mpn_lshift optimized for Pentium 4.
2
3 dnl Copyright 2003, 2005, 2007, 2008 Free Software Foundation, Inc.
4 dnl
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or
8 dnl modify it under the terms of the GNU Lesser General Public License as
9 dnl published by the Free Software Foundation; either version 3 of the
10 dnl License, or (at your option) any later version.
11 dnl
12 dnl The GNU MP Library is distributed in the hope that it will be useful,
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 dnl Lesser General Public License for more details.
16 dnl
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
19
20 include(`../config.m4')
21
22
23 C cycles/limb
24 C K8,K9: 2.5
25 C K10: ?
26 C P4: 3.29
27 C P6-15 (Core2): 2.1 (fluctuates, presumably cache related)
28 C P6-28 (Atom): 14.3
29
30 C INPUT PARAMETERS
C Symbolic names for the argument registers used by mpn_lshift below.
31 define(`rp',`%rdi') C destination limb pointer (target of the movq stores)
32 define(`up',`%rsi') C source limb pointer (base of the movq loads)
33 define(`n',`%rdx') C limb count, also used as the downward-moving index
34 define(`cnt',`%cl') C shift count; the code below names ecx and cl directly
35
C mpn_lshift: shift the n-limb operand at up left by cnt bits, store the
C n result limbs at rp, and return in rax the bits shifted out of the most
C significant limb, i.e. up[n-1] >> rcnt.
C
C Notation used in the comments below:
C   rcnt = (-cnt) mod 64, which equals 64-cnt when cnt is 1..63
C
C Register usage:
C   mm4      left shift count, cnt
C   mm5      right shift count, rcnt
C   mm0-mm3  limb temporaries
C   r8d      (n+1) mod 4, selects the entry path into the unrolled loop
C   ecx      overwritten with rcnt after cnt has been copied to mm4
C
C Limbs are processed from the most significant end downwards.  The entry
C code peels off 0 to 3 limbs so that n = 3 mod 4 at L(rol), the loop at
C L(top) then produces four result limbs per iteration, L(end) produces
C limbs 2 and 1, and L(ast) produces limb 0.
C
C NOTE(review): up[n-1] is read unconditionally, so n = 0 is not handled
C here; presumably callers guarantee n >= 1.  TODO confirm.
36 ASM_START()
37 TEXT
38 ALIGN(32)
39 PROLOGUE(mpn_lshift)
40 mov -8(up,n,8), %rax C rax = up[n-1], the most significant limb
41 movd %ecx, %mm4 C mm4 = cnt, the left shift count
42 neg %ecx C put rsh count in cl
43 and $63, %ecx C ecx = (-cnt) mod 64 = rcnt
44 movd %ecx, %mm5 C mm5 = rcnt, the right shift count
45
46 lea 1(n), %r8d C r8d = n+1, reduced mod 4 below
47
48 shr %cl, %rax C function return value
49
50 and $3, %r8d C r8d = (n+1) mod 4
51 je L(rol) C jump for n = 3, 7, 11, ...
52
53 dec %r8d
54 jne L(1) C taken unless n mod 4 = 0
55 C n = 4, 8, 12, ...
C Produce the top result limb alone, leaving n = 3 mod 4 for L(rol).
56 movq -8(up,n,8), %mm2 C mm2 = up[n-1]
57 psllq %mm4, %mm2
58 movq -16(up,n,8), %mm0 C mm0 = up[n-2]
59 psrlq %mm5, %mm0
60 por %mm0, %mm2 C mm2 = (up[n-1] << cnt) | (up[n-2] >> rcnt)
61 movq %mm2, -8(rp,n,8) C rp[n-1] = mm2
62 dec n
63 jmp L(rol)
64
65 L(1): dec %r8d
66 je L(1x) C jump for n = 1, 5, 9, 13, ...
67 C n = 2, 6, 10, 14, ...
C Produce the top result limb alone, leaving n = 1 mod 4 for L(1x).
68 movq -8(up,n,8), %mm2 C mm2 = up[n-1]
69 psllq %mm4, %mm2
70 movq -16(up,n,8), %mm0 C mm0 = up[n-2]
71 psrlq %mm5, %mm0
72 por %mm0, %mm2 C mm2 = (up[n-1] << cnt) | (up[n-2] >> rcnt)
73 movq %mm2, -8(rp,n,8) C rp[n-1] = mm2
74 dec n
75 L(1x):
C n = 1 mod 4 here.
76 cmp $1, n
77 je L(ast) C only the lowest limb remains
C Produce two result limbs, leaving n = 3 mod 4 for L(rol).
78 movq -8(up,n,8), %mm2 C mm2 = up[n-1]
79 psllq %mm4, %mm2
80 movq -16(up,n,8), %mm3 C mm3 = up[n-2]
81 psllq %mm4, %mm3
82 movq -16(up,n,8), %mm0 C mm0 = up[n-2]
83 movq -24(up,n,8), %mm1 C mm1 = up[n-3]
84 psrlq %mm5, %mm0
85 por %mm0, %mm2 C mm2 = (up[n-1] << cnt) | (up[n-2] >> rcnt)
86 psrlq %mm5, %mm1
87 por %mm1, %mm3 C mm3 = (up[n-2] << cnt) | (up[n-3] >> rcnt)
88 movq %mm2, -8(rp,n,8) C rp[n-1] = mm2
89 movq %mm3, -16(rp,n,8) C rp[n-2] = mm3
90 sub $2, n
91
C n = 3 mod 4 here.  Preload the left-shifted halves of the two topmost
C remaining result limbs; the loop or L(end) ORs in the right-shifted halves.
92 L(rol): movq -8(up,n,8), %mm2 C mm2 = up[n-1]
93 psllq %mm4, %mm2
94 movq -16(up,n,8), %mm3 C mm3 = up[n-2]
95 psllq %mm4, %mm3
96
97 sub $4, n C 4
98 jb L(end) C 2
99 ALIGN(32)
100 L(top):
C On entry mm2 = up[n+3] << cnt and mm3 = up[n+2] << cnt, where n is the
C value already decremented by 4.
101 C finish stuff from lsh block
102 movq 16(up,n,8), %mm0 C mm0 = up[n+2]
103 movq 8(up,n,8), %mm1 C mm1 = up[n+1]
104 psrlq %mm5, %mm0
105 por %mm0, %mm2 C mm2 = result limb n+3
106 psrlq %mm5, %mm1
107 movq (up,n,8), %mm0 C mm0 = up[n]
108 por %mm1, %mm3 C mm3 = result limb n+2
109 movq -8(up,n,8), %mm1 C mm1 = up[n-1]
110 movq %mm2, 24(rp,n,8) C rp[n+3] = mm2
111 movq %mm3, 16(rp,n,8) C rp[n+2] = mm3
112 C start two new rsh
113 psrlq %mm5, %mm0
114 psrlq %mm5, %mm1
115
116 C finish stuff from rsh block
117 movq 8(up,n,8), %mm2 C mm2 = up[n+1]
118 movq (up,n,8), %mm3 C mm3 = up[n]
119 psllq %mm4, %mm2
120 por %mm2, %mm0 C mm0 = result limb n+1
121 psllq %mm4, %mm3
122 movq -8(up,n,8), %mm2 C mm2 = up[n-1], for the next pair of limbs
123 por %mm3, %mm1 C mm1 = result limb n
124 movq -16(up,n,8), %mm3 C mm3 = up[n-2], for the next pair of limbs
125 movq %mm0, 8(rp,n,8) C rp[n+1] = mm0
126 movq %mm1, (rp,n,8) C rp[n] = mm1
127 C start two new lsh
128 sub $4, n C step down four limbs; its carry feeds the jae below
129 psllq %mm4, %mm2 C MMX shifts leave the flags from sub intact
130 psllq %mm4, %mm3
131
132 jae L(top) C 2
133 L(end):
C Reached when sub $4 borrowed from n = 3, so n = -1 here: 16(up,n,8) is
C up[1], 8(up,n,8) is up[0], and the stores go to rp[2] and rp[1].
134 movq 16(up,n,8), %mm0 C mm0 = up[1]
135 psrlq %mm5, %mm0
136 por %mm0, %mm2 C mm2 = (up[2] << cnt) | (up[1] >> rcnt)
137 movq 8(up,n,8), %mm1 C mm1 = up[0]
138 psrlq %mm5, %mm1
139 por %mm1, %mm3 C mm3 = (up[1] << cnt) | (up[0] >> rcnt)
140 movq %mm2, 24(rp,n,8) C rp[2] = mm2
141 movq %mm3, 16(rp,n,8) C rp[1] = mm3
142
C Lowest limb: nothing below it to shift in, so rp[0] = up[0] << cnt.
143 L(ast): movq (up), %mm2
144 psllq %mm4, %mm2
145 movq %mm2, (rp)
146 emms C clear the MMX state before returning
147 ret
148 EPILOGUE()
149