popham.asm revision 1.1 1 dnl AMD K6-2 mpn_popcount, mpn_hamdist -- mpn bit population count and
2 dnl hamming distance.
3
4 dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
5 dnl
6 dnl This file is part of the GNU MP Library.
7 dnl
8 dnl The GNU MP Library is free software; you can redistribute it and/or
9 dnl modify it under the terms of the GNU Lesser General Public License as
10 dnl published by the Free Software Foundation; either version 3 of the
11 dnl License, or (at your option) any later version.
12 dnl
13 dnl The GNU MP Library is distributed in the hope that it will be useful,
14 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
15 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 dnl Lesser General Public License for more details.
17 dnl
18 dnl You should have received a copy of the GNU Lesser General Public License
19 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20
21 include(`../config.m4')
22
23
24 C          popcount  hamdist
25 C K6-2:       9.0      11.5    cycles/limb
26 C K6:        12.5      13.0
27
28
29 C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
30 C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
31 C
32 C The code here isn't optimal, but it's already a 2x speedup over the plain
33 C integer mpn/generic/popcount.c,hamdist.c.
34
35 C Stop the m4 run with an error unless OPERATION_popcount or OPERATION_hamdist is defined
36 ifdef(`OPERATION_popcount',,
37 `ifdef(`OPERATION_hamdist',,
38 `m4_error(`Need OPERATION_popcount or OPERATION_hamdist
39 ')m4exit(1)')')
40 C HAM takes one argument and expands to it only when OPERATION_hamdist is defined
41 define(HAM,
42 m4_assert_numargs(1)
43 `ifdef(`OPERATION_hamdist',`$1')')
44 C POP takes one argument and expands to it only when OPERATION_popcount is defined
45 define(POP,
46 m4_assert_numargs(1)
47 `ifdef(`OPERATION_popcount',`$1')')
48 C Per-operation parameter offsets, in prototype argument order, plus the function name later passed to PROLOGUE
49 HAM(`
50 defframe(PARAM_SIZE, 12)
51 defframe(PARAM_SRC2, 8)
52 defframe(PARAM_SRC, 4)
53 define(M4_function,mpn_hamdist)
54 ')
55 POP(`
56 defframe(PARAM_SIZE, 8)
57 defframe(PARAM_SRC, 4)
58 define(M4_function,mpn_popcount)
59 ')
60 C NOTE(review): MULFUNC_PROLOGUE is defined elsewhere; it presumably declares which functions this file can build - confirm
61 MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
62
63 C Non-PIC builds only: 8-byte mask constants that the function loads into mm7..mm4 at entry
64 ifdef(`PIC',,`
65 dnl non-PIC
66 C Each constant is one 32-bit pattern stored twice so that a single movq fills a 64-bit MMX register
67 RODATA
68 ALIGN(8)
69 C Upper bit of every 2-bit field, used by the bit pairs step
70 L(rodata_AAAAAAAAAAAAAAAA):
71 .long 0xAAAAAAAA
72 .long 0xAAAAAAAA
73 C Low two bits of every nibble, used by the nibbles step
74 L(rodata_3333333333333333):
75 .long 0x33333333
76 .long 0x33333333
77 C Low nibble of every byte, used by the bytes step
78 L(rodata_0F0F0F0F0F0F0F0F):
79 .long 0x0F0F0F0F
80 .long 0x0F0F0F0F
81 C Low byte of every dword, used to extract the two per-dword totals
82 L(rodata_000000FF000000FF):
83 .long 0x000000FF
84 .long 0x000000FF
85 ')
86
87 TEXT
88 ALIGN(32)
89 C Shared body for mpn_popcount and mpn_hamdist; M4_function supplies the name handed to PROLOGUE
90 POP(`ifdef(`PIC', `
91 C avoid shrl crossing a 32-byte boundary
92 nop')')
93 C Returns in eax the number of set bits in src (popcount) or in src XOR src2 (hamdist)
94 PROLOGUE(M4_function)
95 deflit(`FRAME',0)
96 C NOTE(review): with size 0 the code reaches the loop with ecx=0 and the loop insn would wrap; callers presumably pass size >= 1 - confirm
97 movl PARAM_SIZE, %ecx C ecx = size, in 32-bit limbs
98
99 ifdef(`PIC',`
100 movl $0xAAAAAAAA, %eax
101 movl $0x33333333, %edx
102 C PIC build: form the four masks from immediates rather than loading them from rodata
103 movd %eax, %mm7
104 movd %edx, %mm6
105
106 movl $0x0F0F0F0F, %eax
107 movl $0x000000FF, %edx
108 C punpckldq of a register with itself copies its low dword into its high dword
109 punpckldq %mm7, %mm7
110 punpckldq %mm6, %mm6
111
112 movd %eax, %mm5
113 movd %edx, %mm4
114
115 punpckldq %mm5, %mm5
116 punpckldq %mm4, %mm4
117 ',`
118 C non-PIC build: load the four 64-bit masks from the rodata table
119 movq L(rodata_AAAAAAAAAAAAAAAA), %mm7
120 movq L(rodata_3333333333333333), %mm6
121 movq L(rodata_0F0F0F0F0F0F0F0F), %mm5
122 movq L(rodata_000000FF000000FF), %mm4
123 ')
124 C Name each mask register after the constant it holds
125 define(REG_AAAAAAAAAAAAAAAA, %mm7)
126 define(REG_3333333333333333, %mm6)
127 define(REG_0F0F0F0F0F0F0F0F, %mm5)
128 define(REG_000000FF000000FF, %mm4)
129
130 C eax = src; for hamdist edx = src2
131 movl PARAM_SRC, %eax
132 HAM(` movl PARAM_SRC2, %edx')
133
134 pxor %mm2, %mm2 C total
135 C ecx = size/2 = number of qwords; carry = low bit of size, set when size is odd
136 shrl %ecx
137 jnc L(top)
138 C Odd size: fetch the top limb on its own; movd zero-extends it to 64 bits in mm1
139 Zdisp( movd, 0,(%eax,%ecx,8), %mm1)
140 C NOTE(review): Zdisp is defined elsewhere; presumably it forces an explicit zero displacement - confirm
141 HAM(`
142 Zdisp( movd, 0,(%edx,%ecx,8), %mm0)
143 pxor %mm0, %mm1
144 ')
145 C Bump ecx so the loop insn at the bottom also counts this extra single-limb pass
146 incl %ecx
147 jmp L(loaded)
148
149
150 ALIGN(16)
151 POP(` nop C alignment to avoid crossing 32-byte boundaries')
152
153 L(top):
154 C eax src
155 C ebx
156 C ecx counter, qwords, decrementing
157 C edx [hamdist] src2
158 C
159 C mm0 (scratch)
160 C mm1 (scratch)
161 C mm2 total (low dword)
162 C mm3
163 C mm4 \
164 C mm5 | special constants
165 C mm6 |
166 C mm7 /
167 C Load the next qword (two limbs), working from the top of src downwards
168 movq -8(%eax,%ecx,8), %mm1
169 HAM(` pxor -8(%edx,%ecx,8), %mm1')
170 C For hamdist mm1 now holds src XOR src2, so the bit count below is the distance
171 L(loaded):
172 movq %mm1, %mm0
173 pand REG_AAAAAAAAAAAAAAAA, %mm1
174 C mm0 = x - ((x AND 0xAA..AA) >> 1): every 2-bit field becomes the count of its two bits
175 psrlq $1, %mm1
176 HAM(` nop C code alignment')
177
178 psubd %mm1, %mm0 C bit pairs
179 HAM(` nop C code alignment')
180
181 C mm0 = (x AND 0x33..33) + ((x >> 2) AND 0x33..33): every nibble holds a count 0..4
182 movq %mm0, %mm1
183 psrlq $2, %mm0
184
185 pand REG_3333333333333333, %mm0
186 pand REG_3333333333333333, %mm1
187
188 paddd %mm1, %mm0 C nibbles
189
190 C mm0 = (x AND 0x0F..0F) + ((x >> 4) AND 0x0F..0F): every byte holds a count 0..8
191 movq %mm0, %mm1
192 psrlq $4, %mm0
193
194 pand REG_0F0F0F0F0F0F0F0F, %mm0
195 pand REG_0F0F0F0F0F0F0F0F, %mm1
196
197 paddd %mm1, %mm0 C bytes
198 C Add neighbouring bytes: the low byte of each 16-bit word receives the count for that word
199 movq %mm0, %mm1
200 psrlq $8, %mm0
201
202
203 paddb %mm1, %mm0 C words
204
205 C Add neighbouring words: the low byte of each dword receives the count for that dword, at most 32
206 movq %mm0, %mm1
207 psrlq $16, %mm0
208
209 paddd %mm1, %mm0 C dwords
210 C Keep only the low byte of each dword; the remaining bytes hold partial sums that are not needed
211 pand REG_000000FF000000FF, %mm0
212 C Accumulate both dword counts into the low dword of mm2
213 paddd %mm0, %mm2 C low to total
214 psrlq $32, %mm0
215
216 paddd %mm0, %mm2 C high to total
217 loop L(top) C decrement ecx and repeat while it is nonzero
218
219 C Return the accumulated total from the low dword of mm2
220 C NOTE(review): emms_or_femms is defined elsewhere; presumably it emits emms or femms to leave MMX state - confirm
221 movd %mm2, %eax
222 emms_or_femms
223 ret
224
225 EPILOGUE()
226