dnl X86-32 and X86-64 mpn_popcount using SSE2.

dnl Copyright 2006, 2007, 2011 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.


include(`../config.m4')


C 32-bit			popcount	hamdist
C				cycles/limb	cycles/limb
C P5					-
C P6 model 0-8,10-12			-
C P6 model 9  (Banias)			?
C P6 model 13 (Dothan)			4
C P4 model 0  (Willamette)		?
C P4 model 1  (?)			?
C P4 model 2  (Northwood)		3.9
C P4 model 3  (Prescott)		?
C P4 model 4  (Nocona)			?
C AMD K6				-
C AMD K7				-
C AMD K8				?

C 64-bit			popcount	hamdist
C				cycles/limb	cycles/limb
C P4 model 4 (Nocona)			8
C AMD K8,K9				7.5
C AMD K10				3.5
C Intel core2				3.68
C Intel corei				3.15
C Intel atom				10.8
C VIA nano				6.5

C TODO
C  * Make an mpn_hamdist based on this.  Alignment could either be handled by
C    using movdqu for one operand and movdqa for the other, or by painfully
C    shifting as we go.  Unfortunately, there seems to be no usable shift
C    instruction, except for one that takes an immediate count.
C  * It would probably be possible to cut a few cycles/limb using software
C    pipelining.
C  * There are 35 decode slots unused by the SSE2 instructions.  Loop control
C    needs just 2 or 3 slots, leaving around 32 slots.  This would allow a
C    parallel integer-based popcount; such a combined loop could handle 6
C    limbs in about 30 cycles on K8.
C  * We could save a byte or two by using 32-bit operations on areg.
C  * Check if using movdqa to a temp register and then register-based pand
C    is faster.

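C The SSE2 loop below uses the classic SWAR bit-counting reduction.  In C
C terms, each vector x is reduced per 64-bit lane roughly as follows
C (illustrative sketch only, never assembled):
C
C   x -= (x >> 1) & 0x5555555555555555;            2-bit field counts
C   x  = (x & 0x3333333333333333)
C      + ((x >> 2) & 0x3333333333333333);          4-bit field counts
C   x  = (x & 0x0f0f0f0f0f0f0f0f)
C      + ((x >> 4) & 0x0f0f0f0f0f0f0f0f);          8-bit field counts
C   total += byte sum of x;                        done below with psadbw
C
C The first step is done with psrld in the code; the 0x55...55 mask makes
C the lane width immaterial for that step.
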
ifelse(GMP_LIMB_BITS,`32',
`	define(`up',  `%edx')
	define(`n',   `%ecx')
	define(`areg',`%eax')
	define(`breg',`%ebx')
	define(`zero',`%xmm4')
	define(`LIMB32',` $1')
	define(`LIMB64',`dnl')
',`
	define(`up',  `%rdi')
	define(`n',   `%rsi')
	define(`areg',`%rax')
	define(`breg',`%rdx')
	define(`zero',`%xmm8')
	define(`LIMB32',`dnl')
	define(`LIMB64',` $1')
')

define(`mm01010101',`%xmm6')
define(`mm00110011',`%xmm7')
define(`mm00001111',`%xmm2')

define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_XMM',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2XMM', eval(32/GMP_LIMB_BYTES))

undefine(`psadbw')			C override inherited m4 version

ASM_START()

C Make cnsts global to work around Apple relocation bug.
ifdef(`DARWIN',`
	define(`cnsts', MPN(popccnsts))
	GLOBL	cnsts')

	TEXT
	ALIGN(32)
PROLOGUE(mpn_popcount)

LIMB32(`mov	4(%esp), up	')
LIMB32(`mov	8(%esp), n	')
LIMB32(`push	%ebx		')

	pxor	%xmm3, %xmm3		C zero grand total count
LIMB64(`pxor	zero, zero	')
ifdef(`PIC',`
	LEA(	cnsts, breg)
',`
LIMB32(`mov	$cnsts, breg	')
LIMB64(`movabs	$cnsts, breg	')
')

	movdqa	-48(breg), mm01010101	C load the three bit-twiddling masks
	movdqa	-32(breg), mm00110011
	movdqa	-16(breg), mm00001111

	mov	up, areg
	and	$-16, up		C round `up' down to 128-bit boundary
	and	$12, areg		C 32:areg = 0, 4, 8, 12
					C 64:areg = 0, 8
	movdqa	(up), %xmm0
	pand	64(breg,areg,4), %xmm0	C mask out limbs below original `up'
	shr	$m4_log2(GMP_LIMB_BYTES), %eax	C byte offset -> limb count
	add	areg, n			C compensate n for rounded down `up'

	pxor	%xmm4, %xmm4
	sub	$LIMBS_PER_XMM, n
	jbe	L(sum)

	sub	$LIMBS_PER_XMM, n
	ja	L(ent)
	jmp	L(lsum)

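C Main loop: consume two 16-byte vectors (LIMBS_PER_2XMM limbs) per
C iteration, reduce them to per-byte counts, and accumulate the byte sums
C into the grand total in %xmm3.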
	ALIGN(16)
L(top):	movdqa	(up), %xmm0
L(ent):	movdqa	16(up), %xmm4

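C Fold bit pairs: x - ((x >> 1) & 0x55...55) leaves 2-bit field counts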
	movdqa	%xmm0, %xmm1
	movdqa	%xmm4, %xmm5
	psrld	$1, %xmm0
	psrld	$1, %xmm4
	pand	mm01010101, %xmm0
	pand	mm01010101, %xmm4
	psubd	%xmm0, %xmm1
	psubd	%xmm4, %xmm5

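C Fold to nibbles: (x & 0x33...33) + ((x >> 2) & 0x33...33) gives 4-bit counts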
	movdqa	%xmm1, %xmm0
	movdqa	%xmm5, %xmm4
	psrlq	$2, %xmm1
	psrlq	$2, %xmm5
	pand	mm00110011, %xmm0
	pand	mm00110011, %xmm4
	pand	mm00110011, %xmm1
	pand	mm00110011, %xmm5
	paddq	%xmm0, %xmm1
	paddq	%xmm4, %xmm5

LIMB32(`pxor	zero, zero	')

	add	$32, up
	sub	$LIMBS_PER_2XMM, n

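C Combine the two vectors, then fold to bytes:
C (x & 0x0f...0f) + ((x >> 4) & 0x0f...0f)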
	paddq	%xmm5, %xmm1
	movdqa	%xmm1, %xmm0
	psrlq	$4, %xmm1
	pand	mm00001111, %xmm0
	pand	mm00001111, %xmm1
	paddq	%xmm0, %xmm1

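C psadbw against all-zero sums the 16 bytes into two 64-bit halves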
	psadbw	zero, %xmm1
	paddq	%xmm1, %xmm3		C add to grand total

	jnc	L(top)
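
C Wind down: up to two vectors remain; the last one is masked off at the
C operand's high end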
L(end):
	add	$LIMBS_PER_2XMM, n
	jz	L(rt)
	movdqa	(up), %xmm0
	pxor	%xmm4, %xmm4
	sub	$LIMBS_PER_XMM, n
	jbe	L(sum)
L(lsum):
	movdqa	%xmm0, %xmm4
	movdqa	16(up), %xmm0
L(sum):
	shl	$m4_log2(GMP_LIMB_BYTES), n	C limb count -> mask table offset
	and	$12, n
	pand	(breg,n,4), %xmm0	C mask out limbs above high end

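C Same 1 -> 2 -> 4 -> 8 bit folding as in the main loop body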
	movdqa	%xmm0, %xmm1
	movdqa	%xmm4, %xmm5
	psrld	$1, %xmm0
	psrld	$1, %xmm4
	pand	mm01010101, %xmm0
	pand	mm01010101, %xmm4
	psubd	%xmm0, %xmm1
	psubd	%xmm4, %xmm5

	movdqa	%xmm1, %xmm0
	movdqa	%xmm5, %xmm4
	psrlq	$2, %xmm1
	psrlq	$2, %xmm5
	pand	mm00110011, %xmm0
	pand	mm00110011, %xmm4
	pand	mm00110011, %xmm1
	pand	mm00110011, %xmm5
	paddq	%xmm0, %xmm1
	paddq	%xmm4, %xmm5

LIMB32(`pxor	zero, zero	')

	paddq	%xmm5, %xmm1
	movdqa	%xmm1, %xmm0
	psrlq	$4, %xmm1
	pand	mm00001111, %xmm0
	pand	mm00001111, %xmm1
	paddq	%xmm0, %xmm1

	psadbw	zero, %xmm1
	paddq	%xmm1, %xmm3		C add to grand total

C Add the two 64-bit halves of the grand total counter
L(rt):	movdqa	%xmm3, %xmm0
	psrldq	$8, %xmm3
	paddq	%xmm3, %xmm0
	movd	%xmm0, areg		C movq avoided due to gas bug

LIMB32(`pop	%ebx		')
	ret

EPILOGUE()
DEF_OBJECT(dummy,16)
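C Layout relative to the cnsts label: the three bit-twiddling masks occupy
C cnsts-48..cnsts-1, the high-end masks cnsts+0..63, and the low-end masks
C cnsts+64..127.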
C Three magic constants used for masking out bits
	.byte	0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
	.byte	0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55

	.byte	0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
	.byte	0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33

	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
cnsts:
C Masks for high end of number
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00

	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00

	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
C Masks for low end of number
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
END_OBJECT(dummy)