dnl  X86-32 and X86-64 mpn_popcount using SSE2.

dnl  Copyright 2006, 2007, 2011, 2015, 2020 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C 32-bit                        popcount        hamdist
C                             cycles/limb     cycles/limb
C P5                              -
C P6 model 0-8,10-12              -
C P6 model 9  (Banias)            ?
C P6 model 13 (Dothan)            4
C P4 model 0  (Willamette)        ?
C P4 model 1  (?)                 ?
C P4 model 2  (Northwood)         3.9
C P4 model 3  (Prescott)          ?
C P4 model 4  (Nocona)            ?
C AMD K6                          -
C AMD K7                          -
C AMD K8                          ?

C 64-bit                        popcount        hamdist
C                             cycles/limb     cycles/limb
C P4 model 4 (Nocona):            8
C AMD K8,K9                       7.5
C AMD K10                         3.5
C Intel core2                     3.68
C Intel corei                     3.15
C Intel atom                     10.8
C VIA nano                        6.5

C TODO
C  * Make an mpn_hamdist based on this.  Alignment could either be handled by
C    using movdqu for one operand and movdqa for the other, or by painfully
C    shifting as we go.  Unfortunately, there seem to be no usable shift
C    instruction, except for one that takes an immediate count.
C  * It would probably be possible to cut a few cycles/limb using software
C    pipelining.
C  * There are 35 decode slots unused by the SSE2 instructions.  Loop control
C    needs just 2 or 3 slots, leaving around 32 slots.  This allows a parallel
C    integer based popcount.  Such a combined loop would handle 6 limbs in
C    about 30 cycles on K8.
C  * We could save a byte or two by using 32-bit operations on areg.
C  * Check if using movdqa to a temp of and then register-based pand is faster.

C Register roles are abstracted behind m4 names so the same body serves both
C the 32-bit and the 64-bit ABI; LIMB32/LIMB64 expand their argument only for
C the matching limb size and discard it (via dnl) for the other.
ifelse(GMP_LIMB_BITS,`32',
`	define(`up',  `%edx')
	define(`n',   `%ecx')
	define(`areg',`%eax')
	define(`breg',`%ebx')
	define(`zero',`%xmm4')
	define(`LIMB32',`$1')
	define(`LIMB64',`dnl')
',`
	define(`up',  `%rdi')
	define(`n',   `%rsi')
	define(`areg',`%rax')
	define(`breg',`%rdx')
	define(`zero',`%xmm8')
	define(`LIMB32',`dnl')
	define(`LIMB64',`$1')
')

define(`mm01010101',`%xmm6')
define(`mm00110011',`%xmm7')
define(`mm00001111',`%xmm2')

define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_XMM',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2XMM', eval(32/GMP_LIMB_BYTES))

undefine(`psadbw')			C override inherited m4 version

C This file is shared between 32-bit and 64-bit builds.  Only the former has
C LEAL.  Default LEAL as an alias of LEA.
ifdef(`LEAL',,`define(`LEAL', `LEA($1,$2)')')

ASM_START()

C Make cnsts global to work around Apple relocation bug.
ifdef(`DARWIN',`
	define(`cnsts', MPN(popccnsts))
	GLOBL	cnsts')

	TEXT
	ALIGN(32)

C mp_limb_t mpn_popcount (mp_srcptr up, mp_size_t n)
C
C Counts bits 128 at a time with the classic bit-slicing reduction using the
C 0x55/0x33/0x0f masks loaded below cnsts, then psadbw against zero to sum the
C byte counts horizontally.  `up' is rounded down to a 16-byte boundary so
C aligned movdqa can be used throughout; the masks at/after cnsts zero out the
C limbs before the real start (and past the real end) of the operand.
PROLOGUE(mpn_popcount)

LIMB32(`	mov	4(%esp), up	')
LIMB32(`	mov	8(%esp), n	')
LIMB32(`	push	%ebx		')

	pxor	%xmm3, %xmm3		C zero grand total count
LIMB64(`	pxor	zero, zero	')

	LEAL(	cnsts, breg)

	movdqa	-48(breg), mm01010101
	movdqa	-32(breg), mm00110011
	movdqa	-16(breg), mm00001111

	mov	up, areg
	and	$-16, up		C round `up' down to 128-bit boundary
	and	$12, areg		C 32:areg = 0, 4, 8, 12
					C 64:areg = 0, 8
	movdqa	(up), %xmm0
	pand	64(breg,areg,4), %xmm0	C mask out limbs before the real start
	shr	$m4_log2(GMP_LIMB_BYTES), %eax
	add	areg, n			C compensate n for rounded down `up'

	pxor	%xmm4, %xmm4
	sub	$LIMBS_PER_XMM, n
	jbe	L(sum)

	sub	$LIMBS_PER_XMM, n
	ja	L(ent)
	jmp	L(lsum)

	ALIGN(16)
L(top):	movdqa	(up), %xmm0
L(ent):	movdqa	16(up), %xmm4

C x - ((x >> 1) & 0x55...) gives 2-bit field sums, for two xmm words in
C parallel (xmm0/xmm1 and xmm4/xmm5).
	movdqa	%xmm0, %xmm1
	movdqa	%xmm4, %xmm5
	psrld	$1, %xmm0
	psrld	$1, %xmm4
	pand	mm01010101, %xmm0
	pand	mm01010101, %xmm4
	psubd	%xmm0, %xmm1
	psubd	%xmm4, %xmm5

C Reduce 2-bit fields to 4-bit sums via the 0x33... mask.
	movdqa	%xmm1, %xmm0
	movdqa	%xmm5, %xmm4
	psrlq	$2, %xmm1
	psrlq	$2, %xmm5
	pand	mm00110011, %xmm0
	pand	mm00110011, %xmm4
	pand	mm00110011, %xmm1
	pand	mm00110011, %xmm5
	paddq	%xmm0, %xmm1
	paddq	%xmm4, %xmm5

LIMB32(`	pxor	zero, zero	')

	add	$32, up
	sub	$LIMBS_PER_2XMM, n

C Reduce 4-bit fields to byte sums via the 0x0f... mask, then psadbw to add
C all bytes into the two 64-bit halves.
	paddq	%xmm5, %xmm1
	movdqa	%xmm1, %xmm0
	psrlq	$4, %xmm1
	pand	mm00001111, %xmm0
	pand	mm00001111, %xmm1
	paddq	%xmm0, %xmm1

	psadbw	zero, %xmm1
	paddq	%xmm1, %xmm3		C add to grand total

	jnc	L(top)
L(end):
	add	$LIMBS_PER_2XMM, n
	jz	L(rt)
	movdqa	(up), %xmm0		C load xmm0 while xmm4 is zero
	pxor	%xmm4, %xmm4
	sub	$LIMBS_PER_XMM, n
	jbe	L(sum)
L(lsum):
	movdqa	%xmm0, %xmm4
	movdqa	16(up), %xmm0
L(sum):
	shl	$m4_log2(GMP_LIMB_BYTES), n
	and	$12, n
	pand	(breg,n,4), %xmm0	C mask out limbs past the real end

C Same bit-slicing reduction as in the main loop, for the final 1-2 words.
	movdqa	%xmm0, %xmm1
	movdqa	%xmm4, %xmm5
	psrld	$1, %xmm0
	psrld	$1, %xmm4
	pand	mm01010101, %xmm0
	pand	mm01010101, %xmm4
	psubd	%xmm0, %xmm1
	psubd	%xmm4, %xmm5

	movdqa	%xmm1, %xmm0
	movdqa	%xmm5, %xmm4
	psrlq	$2, %xmm1
	psrlq	$2, %xmm5
	pand	mm00110011, %xmm0
	pand	mm00110011, %xmm4
	pand	mm00110011, %xmm1
	pand	mm00110011, %xmm5
	paddq	%xmm0, %xmm1
	paddq	%xmm4, %xmm5

LIMB32(`	pxor	zero, zero	')

	paddq	%xmm5, %xmm1
	movdqa	%xmm1, %xmm0
	psrlq	$4, %xmm1
	pand	mm00001111, %xmm0
	pand	mm00001111, %xmm1
	paddq	%xmm0, %xmm1

	psadbw	zero, %xmm1
	paddq	%xmm1, %xmm3		C add to grand total


C Add the two 64-bit halves of the grand total counter
L(rt):	movdqa	%xmm3, %xmm0
	psrldq	$8, %xmm3
	paddq	%xmm3, %xmm0
	movd	%xmm0, areg		C movq avoided due to gas bug

LIMB32(`	pop	%ebx		')
	ret

EPILOGUE()
DEF_OBJECT(dummy,16)
C Three magic constants used for masking out bits
	.byte	0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
	.byte	0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55

	.byte	0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
	.byte	0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33

	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
cnsts:
C Masks for high end of number
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00

	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00

	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
C Masks for low end of number
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
END_OBJECT(dummy)
ASM_END()