dnl  AMD K6-2 mpn_popcount, mpn_hamdist -- mpn bit population count and
dnl  hamming distance.

dnl  Copyright 2000-2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')


C        popcount  hamdist
C K6-2:    9.0       11.5   cycles/limb
C K6:      12.5      13.0


C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
C
C The code here isn't optimal, but it's already a 2x speedup over the plain
C integer mpn/generic/popcount.c,hamdist.c.
C
C Method: each loop iteration handles one 64-bit qword (two 32-bit limbs)
C in an MMX register.  For hamdist the two sources are XORed first, so both
C entry points share the same counting code.  The count is a parallel
C reduction: set bits are summed into 2-bit pairs, then nibbles, then
C bytes, and finally into one count per dword; both dword counts are then
C added to the running total in the low dword of mm2.
C
C The data is walked from the high address down to src, using ecx both as
C the qword index and as the counter for the loop instruction.
C
C An odd size is handled before the loop: the highest limb is loaded on
C its own with movd (which zero-extends into the 64-bit register, so the
C upper half contributes nothing) and control enters the loop body at
C L(loaded).
C
C NOTE(review): size == 0 is not checked.  With ecx == 0 the loop
C instruction would wrap the counter, so callers are assumed to pass
C size >= 1 -- confirm against the mpn calling conventions.


dnl  Exactly one of OPERATION_popcount or OPERATION_hamdist has to be
dnl  defined when this file is processed; otherwise stop with an error.

ifdef(`OPERATION_popcount',,
`ifdef(`OPERATION_hamdist',,
`m4_error(`Need OPERATION_popcount or OPERATION_hamdist
')m4exit(1)')')

dnl  HAM(text) expands to its argument only in the hamdist build.

define(HAM,
m4_assert_numargs(1)
`ifdef(`OPERATION_hamdist',`$1')')

dnl  POP(text) expands to its argument only in the popcount build.

define(POP,
m4_assert_numargs(1)
`ifdef(`OPERATION_popcount',`$1')')

dnl  Parameter offsets and the function name for the operation being built.
dnl  NOTE(review): the offsets are presumably relative to esp at function
dnl  entry (return address at offset 0) -- confirm against defframe.

HAM(`
defframe(PARAM_SIZE,   12)
defframe(PARAM_SRC2,   8)
defframe(PARAM_SRC,    4)
define(M4_function,mpn_hamdist)
')
POP(`
defframe(PARAM_SIZE,   8)
defframe(PARAM_SRC,    4)
define(M4_function,mpn_popcount)
')

MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)


dnl  Non-PIC builds keep the four 64-bit mask constants in read-only data
dnl  and load them with movq.  PIC builds skip this section and instead
dnl  build the same masks in registers from 32-bit immediates (see the
dnl  function prologue).

ifdef(`PIC',,`
	dnl  non-PIC

	RODATA
	ALIGN(8)

L(rodata_AAAAAAAAAAAAAAAA):
	.long	0xAAAAAAAA
	.long	0xAAAAAAAA

L(rodata_3333333333333333):
	.long	0x33333333
	.long	0x33333333

L(rodata_0F0F0F0F0F0F0F0F):
	.long	0x0F0F0F0F
	.long	0x0F0F0F0F

L(rodata_000000FF000000FF):
	.long	0x000000FF
	.long	0x000000FF
')

	TEXT
	ALIGN(32)

POP(`ifdef(`PIC', `
	C avoid shrl crossing a 32-byte boundary
	nop')')

PROLOGUE(M4_function)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx

	C Load the masks into mm7..mm4.  In the PIC case each 32-bit
	C immediate goes through an integer register into the low dword of
	C an MMX register and punpckldq then copies it to the high dword.

ifdef(`PIC',`
	movl	$0xAAAAAAAA, %eax
	movl	$0x33333333, %edx

	movd	%eax, %mm7
	movd	%edx, %mm6

	movl	$0x0F0F0F0F, %eax
	movl	$0x000000FF, %edx

	punpckldq %mm7, %mm7
	punpckldq %mm6, %mm6

	movd	%eax, %mm5
	movd	%edx, %mm4

	punpckldq %mm5, %mm5
	punpckldq %mm4, %mm4
',`

	movq	L(rodata_AAAAAAAAAAAAAAAA), %mm7
	movq	L(rodata_3333333333333333), %mm6
	movq	L(rodata_0F0F0F0F0F0F0F0F), %mm5
	movq	L(rodata_000000FF000000FF), %mm4
')

dnl  Symbolic names for the mask registers used in the loop below.

define(REG_AAAAAAAAAAAAAAAA, %mm7)
define(REG_3333333333333333, %mm6)
define(REG_0F0F0F0F0F0F0F0F, %mm5)
define(REG_000000FF000000FF, %mm4)


	movl	PARAM_SRC, %eax
HAM(`	movl	PARAM_SRC2, %edx')

	pxor	%mm2, %mm2	C total

	C ecx = size/2, the number of whole qwords.  The carry flag gets
	C the low bit of size: clear means an even size, so go straight
	C into the qword loop.

	shrl	%ecx
	jnc	L(top)

	C Odd size: fetch the single highest limb, at src + 8*(size/2).
	C NOTE(review): Zdisp presumably forces an explicit zero
	C displacement in the encoding to keep the code layout fixed --
	C confirm against its definition.

	Zdisp(	movd,	0,(%eax,%ecx,8), %mm1)

HAM(`
	Zdisp(	movd,	0,(%edx,%ecx,8), %mm0)
	pxor	%mm0, %mm1
')

	C The odd limb uses up one pass through the loop tail, so count
	C it in ecx before jumping in.

	incl	%ecx
	jmp	L(loaded)


	ALIGN(16)
POP(`	nop	C alignment to avoid crossing 32-byte boundaries')

L(top):
	C eax	src
	C ebx
	C ecx	counter, qwords, decrementing
	C edx	[hamdist] src2
	C
	C mm0	(scratch)
	C mm1	(scratch)
	C mm2	total (low dword)
	C mm3
	C mm4	\
	C mm5	| special constants
	C mm6	|
	C mm7	/

	C Fetch qword number ecx-1; for hamdist XOR in the matching qword
	C of src2 so that the set bits are the differing bits.

	movq	-8(%eax,%ecx,8), %mm1
HAM(`	pxor	-8(%edx,%ecx,8), %mm1')

L(loaded):
	C x - ((x & 0xAA..AA) >> 1) leaves in each 2-bit field the number
	C of bits that were set in that field.

	movq	%mm1, %mm0
	pand	REG_AAAAAAAAAAAAAAAA, %mm1

	psrlq	$1, %mm1
HAM(`	nop			C code alignment')

	psubd	%mm1, %mm0	C bit pairs
HAM(`	nop			C code alignment')


	C Add neighbouring 2-bit fields: ((x >> 2) & 0x33..) + (x & 0x33..)

	movq	%mm0, %mm1
	psrlq	$2, %mm0

	pand	REG_3333333333333333, %mm0
	pand	REG_3333333333333333, %mm1

	paddd	%mm1, %mm0	C nibbles


	C Add neighbouring nibbles: ((x >> 4) & 0x0F..) + (x & 0x0F..)

	movq	%mm0, %mm1
	psrlq	$4, %mm0

	pand	REG_0F0F0F0F0F0F0F0F, %mm0
	pand	REG_0F0F0F0F0F0F0F0F, %mm1

	paddd	%mm1, %mm0	C bytes

	C Add each byte to the byte above it; no mask is applied here,
	C the unwanted bytes are cleared by the final pand.

	movq	%mm0, %mm1
	psrlq	$8, %mm0


	paddb	%mm1, %mm0	C words


	C Add the two word sums of each dword into its low byte.

	movq	%mm0, %mm1
	psrlq	$16, %mm0

	paddd	%mm1, %mm0	C dwords

	C Keep only the low byte of each dword: the bit count of that
	C dword.

	pand	REG_000000FF000000FF, %mm0

	paddd	%mm0, %mm2	C low to total
	psrlq	$32, %mm0

	paddd	%mm0, %mm2	C high to total
	loop	L(top)



	C Return the accumulated count in eax.
	C NOTE(review): emms_or_femms presumably expands to emms or femms
	C to leave MMX state before returning -- confirm against its
	C definition.

	movd	%mm2, %eax
	emms_or_femms
	ret

EPILOGUE()