dnl AMD64 mpn_lshift optimised for CPUs with fast SSE including fast movdqu.

dnl Contributed to the GNU project by Torbjorn Granlund.

dnl Copyright 2010, 2011, 2012 Free Software Foundation, Inc.

dnl This file is part of the GNU MP Library.

dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.

dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl License for more details.

dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C              cycles/limb  cycles/limb  cycles/limb  good
C               aligned      unaligned    best seen   for cpu?
C AMD K8,K9        3            3            2.35     no, use shl/shr
C AMD K10          1.5-1.8      1.5-1.8      1.33     yes
C AMD bd1          1.7-1.9      1.7-1.9      1.33     yes
C AMD bobcat       3.17         3.17                  yes, bad for n < 20
C Intel P4         4.67         4.67         2.7      no, slow movdqu
C Intel core2      2.15         2.15         1.25     no, use shld/shrd
C Intel NHM        1.66         1.66         1.25     no, use shld/shrd
C Intel SBR        1.3          1.3          1.25     yes, bad for n = 4-6
C Intel atom      11.7         11.7          4.5      no
C VIA nano         5.7          5.95         2.0      no, slow movdqu

C We try to do as many aligned 16-byte operations as possible.  The top-most
C and bottom-most writes might need 8-byte operations.
C
C This variant relies on fast movdqu loads, and uses movdqu even for aligned
C operands, in order to avoid the need for two separate loops.
C
C TODO
C  * Could the 2-limb wind-down code be simplified?
C  * Improve the basecase code, using shld/shrd for SBR and discrete integer
C    shifts for the other affected CPUs.

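C For reference, a minimal C sketch of what this routine computes, assuming
C 64-bit limbs and 1 <= cnt <= 63 (the function name below is illustrative,
C not part of GMP's build):
C
C   mp_limb_t
C   lshift_ref (mp_limb_t *rp, const mp_limb_t *ap, mp_size_t n, unsigned cnt)
C   {
C     mp_limb_t retval = ap[n - 1] >> (64 - cnt);   /* bits shifted out */
C     for (mp_size_t i = n - 1; i > 0; i--)
C       rp[i] = (ap[i] << cnt) | (ap[i - 1] >> (64 - cnt));
C     rp[0] = ap[0] << cnt;
C     return retval;
C   }
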
C INPUT PARAMETERS
define(`rp',  `%rdi')
define(`ap',  `%rsi')
define(`n',   `%rdx')
define(`cnt', `%rcx')

ASM_START()
        TEXT
        ALIGN(64)
PROLOGUE(mpn_lshift)
        FUNC_ENTRY(4)
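C Keep cnt in xmm4 and 64-cnt in xmm5; each result limb below is formed
C as (ap[i] << cnt) | (ap[i-1] >> (64-cnt)) via psllq/psrlq/por.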
        movd    R32(%rcx), %xmm4
        mov     $64, R32(%rax)
        sub     R32(%rcx), R32(%rax)
        movd    R32(%rax), %xmm5

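C Compute the return value (the bits shifted out of the top limb) up front:
C negating cnt makes the shr count equal 64-cnt modulo 64.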
        neg     R32(%rcx)
        mov     -8(ap,n,8), %rax
        shr     R8(%rcx), %rax

        cmp     $3, n
        jle     L(bc)

        lea     (rp,n,8), R32(%rcx)
        bt      $3, R32(%rcx)
        jnc     L(rp_aligned)

C Do one initial limb in order to make rp aligned
        movq    -8(ap,n,8), %xmm0
        movq    -16(ap,n,8), %xmm1
        psllq   %xmm4, %xmm0
        psrlq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        movq    %xmm0, -8(rp,n,8)
        dec     n

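C Jump into the unrolled loop at a point determined by n mod 8, biasing n
C so all entry paths share the loop's n-indexed addressing; the final one
C or two limbs are left for the wind-down code at L(end)/L(end8).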
L(rp_aligned):
        lea     1(n), %r8d

        and     $6, R32(%r8)
        jz      L(ba0)
        cmp     $4, R32(%r8)
        jz      L(ba4)
        jc      L(ba2)
L(ba6): add     $-4, n
        jmp     L(i56)
L(ba0): add     $-6, n
        jmp     L(i70)
L(ba4): add     $-2, n
        jmp     L(i34)
L(ba2): add     $-8, n
        jle     L(end)

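C Main loop: 8 limbs per iteration, using unaligned 16-byte loads (movdqu)
C and aligned 16-byte stores (movdqa).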
        ALIGN(16)
L(top): movdqu  40(ap,n,8), %xmm1
        movdqu  48(ap,n,8), %xmm0
        psllq   %xmm4, %xmm0
        psrlq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        movdqa  %xmm0, 48(rp,n,8)
L(i70): movdqu  24(ap,n,8), %xmm1
        movdqu  32(ap,n,8), %xmm0
        psllq   %xmm4, %xmm0
        psrlq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        movdqa  %xmm0, 32(rp,n,8)
L(i56): movdqu  8(ap,n,8), %xmm1
        movdqu  16(ap,n,8), %xmm0
        psllq   %xmm4, %xmm0
        psrlq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        movdqa  %xmm0, 16(rp,n,8)
L(i34): movdqu  -8(ap,n,8), %xmm1
        movdqu  (ap,n,8), %xmm0
        psllq   %xmm4, %xmm0
        psrlq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        movdqa  %xmm0, (rp,n,8)
        sub     $8, n
        jg      L(top)

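C Wind down: one or two low limbs remain, according to n's parity.  The
C even case combines ap[0] and ap[1] into a single aligned 16-byte store.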
L(end): bt      $0, R32(n)
        jc      L(end8)

        movdqu  (ap), %xmm1
        pxor    %xmm0, %xmm0
        punpcklqdq %xmm1, %xmm0
        psllq   %xmm4, %xmm1
        psrlq   %xmm5, %xmm0
        por     %xmm1, %xmm0
        movdqa  %xmm0, (rp)
        FUNC_EXIT()
        ret

C Basecase
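C Here n <= 3.  Shift one limb pair at a time with 8-byte operations,
C finishing with the single low limb at L(end8).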
        ALIGN(16)
L(bc):  dec     R32(n)
        jz      L(end8)

        movq    (ap,n,8), %xmm1
        movq    -8(ap,n,8), %xmm0
        psllq   %xmm4, %xmm1
        psrlq   %xmm5, %xmm0
        por     %xmm1, %xmm0
        movq    %xmm0, (rp,n,8)
        sub     $2, R32(n)
        jl      L(end8)
        movq    8(ap), %xmm1
        movq    (ap), %xmm0
        psllq   %xmm4, %xmm1
        psrlq   %xmm5, %xmm0
        por     %xmm1, %xmm0
        movq    %xmm0, 8(rp)

L(end8):movq    (ap), %xmm0
        psllq   %xmm4, %xmm0
        movq    %xmm0, (rp)
        FUNC_EXIT()
        ret
EPILOGUE()