1 1.1 mrg dnl AMD K7 mpn_popcount, mpn_hamdist -- population count and hamming 2 1.1 mrg dnl distance. 3 1.1 mrg 4 1.1.1.3 mrg dnl Copyright 2000-2002 Free Software Foundation, Inc. 5 1.1.1.3 mrg 6 1.1 mrg dnl This file is part of the GNU MP Library. 7 1.1 mrg dnl 8 1.1.1.3 mrg dnl The GNU MP Library is free software; you can redistribute it and/or modify 9 1.1.1.3 mrg dnl it under the terms of either: 10 1.1.1.3 mrg dnl 11 1.1.1.3 mrg dnl * the GNU Lesser General Public License as published by the Free 12 1.1.1.3 mrg dnl Software Foundation; either version 3 of the License, or (at your 13 1.1.1.3 mrg dnl option) any later version. 14 1.1.1.3 mrg dnl 15 1.1.1.3 mrg dnl or 16 1.1.1.3 mrg dnl 17 1.1.1.3 mrg dnl * the GNU General Public License as published by the Free Software 18 1.1.1.3 mrg dnl Foundation; either version 2 of the License, or (at your option) any 19 1.1.1.3 mrg dnl later version. 20 1.1.1.3 mrg dnl 21 1.1.1.3 mrg dnl or both in parallel, as here. 22 1.1.1.3 mrg dnl 23 1.1.1.3 mrg dnl The GNU MP Library is distributed in the hope that it will be useful, but 24 1.1.1.3 mrg dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 25 1.1.1.3 mrg dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 26 1.1.1.3 mrg dnl for more details. 27 1.1 mrg dnl 28 1.1.1.3 mrg dnl You should have received copies of the GNU General Public License and the 29 1.1.1.3 mrg dnl GNU Lesser General Public License along with the GNU MP Library. If not, 30 1.1.1.3 mrg dnl see https://www.gnu.org/licenses/. 31 1.1 mrg 32 1.1 mrg include(`../config.m4') 33 1.1 mrg 34 1.1 mrg 35 1.1 mrg C popcount hamdist 36 1.1 mrg C P3 generic 6.5 7 37 1.1.1.2 mrg C P3 model 9 (Banias) 5.7 6.1 38 1.1 mrg C P3 model 13 (Dothan) 5.75 6 39 1.1 mrg C K7 5 6 40 1.1 mrg 41 1.1 mrg C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size); 42 1.1 mrg C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size); 43 1.1 mrg C 44 1.1 mrg C The code here is almost certainly not optimal, but is already a 3x speedup 45 1.1 mrg C over the generic C code. The main improvement would be to interleave 46 1.1 mrg C processing of two qwords in the loop so as to fully exploit the available 47 1.1 mrg C execution units, possibly leading to 3.25 c/l (13 cycles for 4 limbs). 48 1.1 mrg C 49 1.1 mrg C The loop is based on the example "Efficient 64-bit population count using 50 1.1 mrg C MMX instructions" in the Athlon Optimization Guide, AMD document 22007, 51 1.1 mrg C page 158 of rev E (reference in mpn/x86/k7/README). 52 1.1 mrg 53 1.1 mrg ifdef(`OPERATION_popcount',, 54 1.1 mrg `ifdef(`OPERATION_hamdist',, 55 1.1 mrg `m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined 56 1.1 mrg ')')') 57 1.1 mrg 58 1.1 mrg define(HAM, 59 1.1 mrg m4_assert_numargs(1) 60 1.1 mrg `ifdef(`OPERATION_hamdist',`$1')') 61 1.1 mrg 62 1.1 mrg define(POP, 63 1.1 mrg m4_assert_numargs(1) 64 1.1 mrg `ifdef(`OPERATION_popcount',`$1')') 65 1.1 mrg 66 1.1 mrg HAM(` 67 1.1 mrg defframe(PARAM_SIZE, 12) 68 1.1 mrg defframe(PARAM_SRC2, 8) 69 1.1 mrg defframe(PARAM_SRC, 4) 70 1.1 mrg define(M4_function,mpn_hamdist) 71 1.1 mrg ') 72 1.1 mrg POP(` 73 1.1 mrg defframe(PARAM_SIZE, 8) 74 1.1 mrg defframe(PARAM_SRC, 4) 75 1.1 mrg define(M4_function,mpn_popcount) 76 1.1 mrg ') 77 1.1 mrg 78 1.1 mrg MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist) 79 1.1 mrg 80 1.1 mrg 81 1.1 mrg ifdef(`PIC',,` 82 1.1 mrg dnl non-PIC 83 1.1 mrg 84 1.1 mrg RODATA 85 1.1 mrg ALIGN(8) 86 1.1 mrg 87 1.1 mrg L(rodata_AAAAAAAAAAAAAAAA): 88 1.1 mrg .long 0xAAAAAAAA 89 1.1 mrg .long 0xAAAAAAAA 90 1.1 mrg 91 1.1 mrg L(rodata_3333333333333333): 92 1.1 mrg .long 0x33333333 93 1.1 mrg .long 0x33333333 94 1.1 mrg 95 1.1 mrg L(rodata_0F0F0F0F0F0F0F0F): 96 1.1 mrg .long 0x0F0F0F0F 97 1.1 mrg .long 0x0F0F0F0F 98 1.1 mrg ') 99 1.1 mrg 100 1.1 mrg TEXT 101 1.1 mrg ALIGN(32) 102 1.1 mrg 103 1.1 mrg PROLOGUE(M4_function) 104 1.1 mrg deflit(`FRAME',0) 105 1.1 mrg 106 1.1 mrg movl PARAM_SIZE, %ecx 107 1.1 mrg 108 1.1 mrg ifdef(`PIC',` 109 1.1 mrg movl $0xAAAAAAAA, %eax 110 1.1 mrg movl $0x33333333, %edx 111 1.1 mrg 112 1.1 mrg movd %eax, %mm7 113 1.1 mrg movd %edx, %mm6 114 1.1 mrg 115 1.1 mrg movl $0x0F0F0F0F, %eax 116 1.1 mrg 117 1.1 mrg punpckldq %mm7, %mm7 118 1.1 mrg punpckldq %mm6, %mm6 119 1.1 mrg 120 1.1 mrg movd %eax, %mm5 121 1.1 mrg movd %edx, %mm4 122 1.1 mrg 123 1.1 mrg punpckldq %mm5, %mm5 124 1.1 mrg 125 1.1 mrg ',` 126 1.1 mrg movq L(rodata_AAAAAAAAAAAAAAAA), %mm7 127 1.1 mrg movq L(rodata_3333333333333333), %mm6 128 1.1 mrg movq L(rodata_0F0F0F0F0F0F0F0F), %mm5 129 1.1 mrg ') 130 1.1 mrg pxor %mm4, %mm4 131 1.1 mrg 132 1.1 mrg define(REG_AAAAAAAAAAAAAAAA,%mm7) 133 1.1 mrg define(REG_3333333333333333,%mm6) 134 1.1 mrg define(REG_0F0F0F0F0F0F0F0F,%mm5) 135 1.1 mrg define(REG_0000000000000000,%mm4) 136 1.1 mrg 137 1.1 mrg 138 1.1 mrg movl PARAM_SRC, %eax 139 1.1 mrg HAM(` movl PARAM_SRC2, %edx') 140 1.1 mrg 141 1.1 mrg pxor %mm2, %mm2 C total 142 1.1 mrg 143 1.1 mrg shrl %ecx 144 1.1 mrg jnc L(top) 145 1.1 mrg 146 1.1 mrg movd (%eax,%ecx,8), %mm1 147 1.1 mrg 148 1.1 mrg HAM(` movd (%edx,%ecx,8), %mm0 149 1.1 mrg pxor %mm0, %mm1 150 1.1 mrg ') 151 1.1 mrg orl %ecx, %ecx 152 1.1 mrg jmp L(loaded) 153 1.1 mrg 154 1.1 mrg 155 1.1 mrg ALIGN(16) 156 1.1 mrg L(top): 157 1.1 mrg C eax src 158 1.1 mrg C ebx 159 1.1 mrg C ecx counter, qwords, decrementing 160 1.1 mrg C edx [hamdist] src2 161 1.1 mrg C 162 1.1 mrg C mm0 (scratch) 163 1.1 mrg C mm1 (scratch) 164 1.1 mrg C mm2 total (low dword) 165 1.1 mrg C mm3 166 1.1 mrg C mm4 \ 167 1.1 mrg C mm5 | special constants 168 1.1 mrg C mm6 | 169 1.1 mrg C mm7 / 170 1.1 mrg 171 1.1 mrg movq -8(%eax,%ecx,8), %mm1 172 1.1 mrg 173 1.1 mrg HAM(` pxor -8(%edx,%ecx,8), %mm1') 174 1.1 mrg decl %ecx 175 1.1 mrg 176 1.1 mrg L(loaded): 177 1.1 mrg movq %mm1, %mm0 178 1.1 mrg pand REG_AAAAAAAAAAAAAAAA, %mm1 179 1.1 mrg 180 1.1 mrg psrlq $1, %mm1 181 1.1 mrg 182 1.1 mrg psubd %mm1, %mm0 C bit pairs 183 1.1 mrg 184 1.1 mrg 185 1.1 mrg movq %mm0, %mm1 186 1.1 mrg psrlq $2, %mm0 187 1.1 mrg 188 1.1 mrg pand REG_3333333333333333, %mm0 189 1.1 mrg pand REG_3333333333333333, %mm1 190 1.1 mrg 191 1.1 mrg paddd %mm1, %mm0 C nibbles 192 1.1 mrg 193 1.1 mrg 194 1.1 mrg movq %mm0, %mm1 195 1.1 mrg psrlq $4, %mm0 196 1.1 mrg 197 1.1 mrg pand REG_0F0F0F0F0F0F0F0F, %mm0 198 1.1 mrg pand REG_0F0F0F0F0F0F0F0F, %mm1 199 1.1 mrg 200 1.1 mrg paddd %mm1, %mm0 C bytes 201 1.1 mrg 202 1.1 mrg 203 1.1 mrg psadbw( %mm4, %mm0) 204 1.1 mrg 205 1.1 mrg paddd %mm0, %mm2 C add to total 206 1.1 mrg jnz L(top) 207 1.1 mrg 208 1.1 mrg 209 1.1 mrg movd %mm2, %eax 210 1.1 mrg emms 211 1.1 mrg ret 212 1.1 mrg 213 1.1 mrg EPILOGUE() 214