dnl  X86-32 and X86-64 mpn_popcount using SSE2.

dnl  Copyright 2006, 2007 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.


include(`../config.m4')


C 32-bit                        popcount        hamdist
C                              cycles/limb     cycles/limb
C P5:                               -
C P6 model 0-8,10-12                -
C P6 model 9  (Banias)              ?
C P6 model 13 (Dothan)              4
C P4 model 0  (Willamette)          ?
C P4 model 1  (?)                   ?
C P4 model 2  (Northwood)           3.9
C P4 model 3  (Prescott)            ?
C P4 model 4  (Nocona)              ?
C K6:                               -
C K7:                               -
C K8:                               ?

C 64-bit                        popcount        hamdist
C                              cycles/limb     cycles/limb
C P4 model 4 (Nocona):              8
C K8:                               7.5
C K10:                              3.5
C P6 core2:                         3.68
C P6 corei7:                        3.15

C TODO
C  * Make an mpn_hamdist based on this.  Alignment could either be handled by
C    using movdqu for one operand and movdqa for the other, or by painfully
C    shifting as we go.  Unfortunately, there seems to be no usable shift
C    instruction, except for one that takes an immediate count.
C  * It would probably be possible to cut a few cycles/limb using software
C    pipelining.
C  * There are 35 decode slots unused by the SSE2 instructions.  Loop control
C    needs just 2 or 3 slots, leaving around 32 slots.  This allows a parallel
C    integer-based popcount.  Such a combined loop would handle 6 limbs in
C    about 30 cycles on K8.
C  * We could save a byte or two by using 32-bit operations on areg.
C  * Check if using movdqa to a temp register and then a register-based pand
C    is faster.

ifelse(GMP_LIMB_BITS,`32',
`       define(`up',  `%edx')
        define(`n',   `%ecx')
        define(`areg',`%eax')
        define(`breg',`%ebx')
        define(`zero',`%xmm4')
        define(`LIMB32',` $1')
        define(`LIMB64',`dnl')
',`
        define(`up',  `%rdi')
        define(`n',   `%rsi')
        define(`areg',`%rax')
        define(`breg',`%rdx')
        define(`zero',`%xmm8')
        define(`LIMB32',`dnl')
        define(`LIMB64',` $1')
')

define(`mm01010101',`%xmm6')
define(`mm00110011',`%xmm7')
define(`mm00001111',`%xmm2')

define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_XMM',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2XMM', eval(32/GMP_LIMB_BYTES))
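
C The loop below uses the classic SWAR bit-counting reduction.  A rough
C per-32-bit-word C sketch of one reduction step (reference only, not part of
C the build; the code interleaves two xmm words and lets psadbw do the final
C byte summation):
C
C   x -= (x >> 1) & 0x55555555;                      /* 2-bit counts, 0..2 */
C   x = (x & 0x33333333) + ((x >> 2) & 0x33333333);  /* 4-bit counts, 0..4 */
C   x = (x & 0x0f0f0f0f) + ((x >> 4) & 0x0f0f0f0f);  /* byte counts,  0..8 */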

undefine(`psadbw')                      C override inherited m4 version

ASM_START()

C Make cnsts global to work around Apple relocation bug.
ifdef(`DARWIN',`
        define(`cnsts', MPN(popccnsts))
        GLOBL   cnsts')

        TEXT
        ALIGN(32)
PROLOGUE(mpn_popcount)

LIMB32(`mov     4(%esp), up     ')
LIMB32(`mov     8(%esp), n      ')
LIMB32(`push    %ebx            ')

        pxor    %xmm3, %xmm3            C zero grand total count
LIMB64(`pxor    zero, zero      ')
ifdef(`PIC',`
        LEA(    cnsts, breg)
',`
LIMB32(`mov     $cnsts, breg    ')
LIMB64(`movabs  $cnsts, breg    ')
')

        movdqa  -48(breg), mm01010101
        movdqa  -32(breg), mm00110011
        movdqa  -16(breg), mm00001111

        mov     up, areg
        and     $-16, up                C round `up' down to 128-bit boundary
        and     $12, areg               C 32:areg = 0, 4, 8, 12
                                        C 64:areg = 0, 8
        movdqa  (up), %xmm0
        pand    64(breg,areg,4), %xmm0
        shr     $m4_log2(GMP_LIMB_BYTES), %eax
        add     areg, n                 C compensate n for rounded down `up'
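C `up' was rounded down to a 16-byte boundary; the low-end mask fetched from
C cnsts+64 (indexed by the discarded byte offset in areg) clears the limbs
C that precede the real start of the operand, and n is grown by that many
C limbs so the loop count still comes out right.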

        pxor    %xmm4, %xmm4
        sub     $LIMBS_PER_XMM, n
        jbe     L(sum)

        sub     $LIMBS_PER_XMM, n
        ja      L(ent)
        jmp     L(lsum)
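C xmm0 already holds the first (head-masked) xmm word, so the loop is entered
C at L(ent), which only needs to fetch the second word.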

        ALIGN(16)
L(top): movdqa  (up), %xmm0
L(ent): movdqa  16(up), %xmm4

        movdqa  %xmm0, %xmm1
        movdqa  %xmm4, %xmm5
        psrld   $1, %xmm0
        psrld   $1, %xmm4
        pand    mm01010101, %xmm0
        pand    mm01010101, %xmm4
        psubd   %xmm0, %xmm1
        psubd   %xmm4, %xmm5
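C xmm1/xmm5 = x - ((x >> 1) & 0x55...); each 2-bit field now holds 0..2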

        movdqa  %xmm1, %xmm0
        movdqa  %xmm5, %xmm4
        psrlq   $2, %xmm1
        psrlq   $2, %xmm5
        pand    mm00110011, %xmm0
        pand    mm00110011, %xmm4
        pand    mm00110011, %xmm1
        pand    mm00110011, %xmm5
        paddq   %xmm0, %xmm1
        paddq   %xmm4, %xmm5
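C each 4-bit field of xmm1/xmm5 now holds a count in 0..4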

LIMB32(`pxor    zero, zero      ')

        add     $32, up
        sub     $LIMBS_PER_2XMM, n

        paddq   %xmm5, %xmm1
        movdqa  %xmm1, %xmm0
        psrlq   $4, %xmm1
        pand    mm00001111, %xmm0
        pand    mm00001111, %xmm1
        paddq   %xmm0, %xmm1

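C each byte of xmm1 now holds a count in 0..16 for 32 bytes of input;
C psadbw against zero sums those bytes into the two 64-bit halves of xmm1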
        psadbw  zero, %xmm1
        paddq   %xmm1, %xmm3            C add to grand total

        jnc     L(top)
L(end):
        add     $LIMBS_PER_2XMM, n
        jz      L(rt)
        movdqa  (up), %xmm0
        pxor    %xmm4, %xmm4
        sub     $LIMBS_PER_XMM, n
        jbe     L(sum)
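C fall through to L(lsum) with a final pair of xmm words; L(sum) handles a
C single final word, which is already in xmm0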
L(lsum):
        movdqa  %xmm0, %xmm4
        movdqa  16(up), %xmm0
L(sum):
        shl     $m4_log2(GMP_LIMB_BYTES), n
        and     $12, n
        pand    (breg,n,4), %xmm0
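C the high-end mask from cnsts clears the limbs read past the end of the
C operand; the rest repeats the reduction from the main loop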

        movdqa  %xmm0, %xmm1
        movdqa  %xmm4, %xmm5
        psrld   $1, %xmm0
        psrld   $1, %xmm4
        pand    mm01010101, %xmm0
        pand    mm01010101, %xmm4
        psubd   %xmm0, %xmm1
        psubd   %xmm4, %xmm5

        movdqa  %xmm1, %xmm0
        movdqa  %xmm5, %xmm4
        psrlq   $2, %xmm1
        psrlq   $2, %xmm5
        pand    mm00110011, %xmm0
        pand    mm00110011, %xmm4
        pand    mm00110011, %xmm1
        pand    mm00110011, %xmm5
        paddq   %xmm0, %xmm1
        paddq   %xmm4, %xmm5

LIMB32(`pxor    zero, zero      ')

        paddq   %xmm5, %xmm1
        movdqa  %xmm1, %xmm0
        psrlq   $4, %xmm1
        pand    mm00001111, %xmm0
        pand    mm00001111, %xmm1
        paddq   %xmm0, %xmm1

        psadbw  zero, %xmm1
        paddq   %xmm1, %xmm3            C add to grand total


C Add the two 64-bit halves of the grand total counter
L(rt):  movdqa  %xmm3, %xmm0
        psrldq  $8, %xmm3
        paddq   %xmm3, %xmm0
        movd    %xmm0, areg             C movq avoided due to gas bug
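C the grand total is returned in areg (%eax or %rax), the return register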

LIMB32(`pop     %ebx            ')
        ret

EPILOGUE()
DEF_OBJECT(dummy,16)
C Three magic constants used for masking out bits
        .byte   0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
        .byte   0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55

        .byte   0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
        .byte   0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33

        .byte   0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
        .byte   0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
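C (the three masks above are fetched at -48, -32 and -16 from cnsts)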
cnsts:
C Masks for high end of number
        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

        .byte   0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
        .byte   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00

        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
        .byte   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00

        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
        .byte   0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
C Masks for low end of number
        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

        .byte   0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

        .byte   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

        .byte   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
        .byte   0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
END_OBJECT(dummy)