rshift.asm revision 1.1.1.2 1 dnl x86-64 mpn_rshift optimized for Pentium 4.
2
3 dnl Copyright 2003, 2005, 2007, 2008, 2012 Free Software Foundation, Inc.
4
5 dnl This file is part of the GNU MP Library.
6
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of the GNU Lesser General Public License as published
9 dnl by the Free Software Foundation; either version 3 of the License, or (at
10 dnl your option) any later version.
11
12 dnl The GNU MP Library is distributed in the hope that it will be useful, but
13 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15 dnl License for more details.
16
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
19
20 include(`../config.m4')
21
22
23 C cycles/limb
24 C AMD K8,K9 2.5
25 C AMD K10 ?
26 C Intel P4 3.29
27 C Intel core2 2.1 (fluctuates, presumably cache related)
28 C Intel corei ?
29 C Intel atom 14.3
30 C VIA nano ?
31
32 C INPUT PARAMETERS
C The code below refers to its arguments through these register aliases.
33 define(`rp',`%rdi') C base address the result limbs are stored to
34 define(`up',`%rsi') C base address the source limbs are loaded from
35 define(`n',`%rdx') C number of 64-bit limbs (used scaled by 8 as an index)
36 define(`cnt',`%cl') C shift count in bits (low byte of rcx)
37
C ABI_SUPPORT, ASM_START, TEXT and ALIGN are macros defined outside this file.
C NOTE(review): ABI_SUPPORT presumably declares the calling conventions this
C file can be built for (DOS64 and STD64) -- confirm in the m4 definitions.
38 ABI_SUPPORT(DOS64)
39 ABI_SUPPORT(STD64)
40
41 ASM_START()
42 TEXT
43 ALIGN(32)
C mpn_rshift -- shift the n-limb number at up right by cnt bits, store the
C n result limbs at rp.
C
C Result limb i is (up[i] >> cnt) | (up[i+1] << lsh) where lsh = (-cnt) & 63.
C The top limb is stored as up[n-1] >> cnt.  The return value in rax is
C up[0] << lsh, i.e. the bits shifted out at the low end.
C
C Limbs are read and written from index 0 upward.
C NOTE(review): the code appears to assume n >= 1 and 1 <= cnt <= 63 (for
C cnt = 0 the left shift count would also be 0) -- confirm against the
C documented mpn_rshift contract.
C
C Register use after the setup code:
C   mm4       right shift count (cnt)
C   mm5       left shift count (lsh)
C   up, rp    repointed at the LAST source/result limb
C   n         negated, so that 8(up,n,8) addresses the next unprocessed
C             source limb and -n is the number of limbs still to do
C   r8        (n+1) & 3, used only to pick the entry path
C   mm0-mm3   scratch for partial results
C
C The lead-in code handles 0, 1, 2 or 3 limbs so that the number of limbs
C remaining at L(rol) is always 3 mod 4.  The main loop then produces four
C limbs per iteration, and L(end)/L(ast) produce the final three.
C
C R32(), R8(), L(), FUNC_ENTRY and FUNC_EXIT are macros defined outside this
C file.  NOTE(review): FUNC_ENTRY(4)/FUNC_EXIT() presumably adapt the DOS64
C argument registers for a 4-argument function -- confirm in the m4 defs.
44 PROLOGUE(mpn_rshift)
45 FUNC_ENTRY(4)
46 mov (up), %rax C rax = lowest source limb, for the return value
47 movd R32(%rcx), %mm4 C mm4 = right shift count
48 neg R32(%rcx) C put lsh count in cl
49 and $63, R32(%rcx)
50 movd R32(%rcx), %mm5 C mm5 = left shift count
51
52 lea -8(up,n,8), up C up -> last source limb
53 lea -8(rp,n,8), rp C rp -> last result limb
54 lea 1(n), R32(%r8) C r8 = n+1 (only the low 2 bits are used below)
55 neg n C n = -n, counts up towards zero
56
57 shl R8(%rcx), %rax C function return value
58
59 and $3, R32(%r8)
60 je L(rol) C jump for n = 3, 7, 11, ...
61
62 dec R32(%r8)
63 jne L(1)
64 C n = 4, 8, 12, ...
C Do one limb here, leaving a count that is 3 mod 4 for L(rol).
65 movq 8(up,n,8), %mm2
66 psrlq %mm4, %mm2
67 movq 16(up,n,8), %mm0
68 psllq %mm5, %mm0
69 por %mm0, %mm2
70 movq %mm2, 8(rp,n,8)
71 inc n
72 jmp L(rol)
73
74 L(1): dec R32(%r8)
75 je L(1x) C jump for n = 1, 5, 9, 13, ...
76 C n = 2, 6, 10, 14, ...
C Do one limb here, then continue as in the n = 1 mod 4 case.
77 movq 8(up,n,8), %mm2
78 psrlq %mm4, %mm2
79 movq 16(up,n,8), %mm0
80 psllq %mm5, %mm0
81 por %mm0, %mm2
82 movq %mm2, 8(rp,n,8)
83 inc n
84 L(1x):
C Here the remaining count is 1 mod 4.  If exactly one limb is left it is
C the top limb, handled at L(ast).  Otherwise do two limbs so that the
C remaining count becomes 3 mod 4.
85 cmp $-1, n
86 je L(ast)
87 movq 8(up,n,8), %mm2
88 psrlq %mm4, %mm2
89 movq 16(up,n,8), %mm3
90 psrlq %mm4, %mm3
91 movq 16(up,n,8), %mm0
92 movq 24(up,n,8), %mm1
93 psllq %mm5, %mm0
94 por %mm0, %mm2
95 psllq %mm5, %mm1
96 por %mm1, %mm3
97 movq %mm2, 8(rp,n,8)
98 movq %mm3, 16(rp,n,8)
99 add $2, n
100
C Prime the pipeline: mm2 and mm3 get the right-shifted parts of the next
C two result limbs.
101 L(rol): movq 8(up,n,8), %mm2
102 psrlq %mm4, %mm2
103 movq 16(up,n,8), %mm3
104 psrlq %mm4, %mm3
105
C Carry out of this add means n went from -3 to 1, i.e. only the final
C three limbs remain and the loop is skipped.
106 add $4, n C 4
107 jb L(end) C 2
108 ALIGN(32)
109 L(top):
110 C finish the two limbs whose rsh parts are in mm2,mm3 (OR in lsh parts)
111 movq -16(up,n,8), %mm0
112 movq -8(up,n,8), %mm1
113 psllq %mm5, %mm0
114 por %mm0, %mm2
115 psllq %mm5, %mm1
116 movq (up,n,8), %mm0
117 por %mm1, %mm3
118 movq 8(up,n,8), %mm1
119 movq %mm2, -24(rp,n,8)
120 movq %mm3, -16(rp,n,8)
121 C start two new limbs with their lsh parts in mm0,mm1
122 psllq %mm5, %mm0
123 psllq %mm5, %mm1
124
125 C finish those two limbs by ORing in their rsh parts
126 movq -8(up,n,8), %mm2
127 movq (up,n,8), %mm3
128 psrlq %mm4, %mm2
129 por %mm2, %mm0
130 psrlq %mm4, %mm3
131 movq 8(up,n,8), %mm2
132 por %mm3, %mm1
133 movq 16(up,n,8), %mm3
134 movq %mm0, -8(rp,n,8)
135 movq %mm1, (rp,n,8)
136 C start the next two limbs with their rsh parts in mm2,mm3
137 add $4, n
138 psrlq %mm4, %mm2
139 psrlq %mm4, %mm3
140
C The MMX shifts do not modify the flags, so this tests the carry from the
C add above: loop while n has not yet wrapped to a non-negative value.
141 jae L(top) C 2
142 L(end):
C Complete the two pending limbs using fixed offsets from the last limb.
143 movq -8(up), %mm0
144 psllq %mm5, %mm0
145 por %mm0, %mm2
146 movq (up), %mm1
147 psllq %mm5, %mm1
148 por %mm1, %mm3
149 movq %mm2, -16(rp)
150 movq %mm3, -8(rp)
151
C Top limb: there is no higher limb to shift bits in from.
152 L(ast): movq (up), %mm2
153 psrlq %mm4, %mm2
154 movq %mm2, (rp)
155 emms C leave MMX state before returning
156 FUNC_EXIT()
157 ret
158 EPILOGUE()
159