dnl  Intel Pentium 4 mpn_popcount, mpn_hamdist -- population count and
dnl  hamming distance.

dnl  Copyright 2000-2002, 2007 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C			     popcount	     hamdist
C P3 model 9  (Banias)		?		?
C P3 model 13 (Dothan)		6		6
C P4 model 0  (Willamette)
C P4 model 1  (?)
C P4 model 2  (Northwood)	8		9
C P4 model 3  (Prescott)	8		9
C P4 model 4  (Nocona)

C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
C
C Loading with unaligned movq's costs an extra 1 c/l and hence is avoided.
C Two movd's and a punpckldq seems to be the same speed as an aligned movq,
C and using them saves fiddling about with alignment testing on entry.
C
C For popcount there's 13 mmx instructions in the loop, so perhaps 6.5 c/l
C might be possible, but 8 c/l relying on out-of-order execution is already
C quite reasonable.
C
C Overview: limbs are processed two at a time as one 64-bit MMX value.  For
C hamdist the two sources are XORed first, after which the code is the same
C as popcount.  Set bits are counted by the usual reduction (bit pairs, then
C nibbles, then bytes) and the eight byte counts are summed with psadbw
C against zero.  An odd final limb (or a size of 1) goes through L(last),
C which loads a single limb with movd and joins the same reduction.

dnl  Exactly one of OPERATION_popcount / OPERATION_hamdist must be defined by
dnl  the build; refuse to continue otherwise.
ifdef(`OPERATION_popcount',,
`ifdef(`OPERATION_hamdist',,
`m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
')')')

dnl  HAM(text) expands to its argument only when building mpn_hamdist.
define(HAM,
m4_assert_numargs(1)
`ifdef(`OPERATION_hamdist',`$1')')

dnl  POP(text) expands to its argument only when building mpn_popcount.
define(POP,
m4_assert_numargs(1)
`ifdef(`OPERATION_popcount',`$1')')

dnl  Parameter frame offsets and entry point name for each variant.
HAM(`
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC2,  8)
defframe(PARAM_SRC,   4)
define(M4_function,mpn_hamdist)
')
POP(`
defframe(PARAM_SIZE,  8)
defframe(PARAM_SRC,   4)
define(M4_function,mpn_popcount)
')

MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)


dnl  The non-PIC build loads the three bit masks from read-only data; the PIC
dnl  build constructs them in registers instead (see the function entry).
ifdef(`PIC',,`
	dnl  non-PIC
	RODATA
	ALIGN(8)
L(rodata_AAAAAAAAAAAAAAAA):
	.long	0xAAAAAAAA
	.long	0xAAAAAAAA
L(rodata_3333333333333333):
	.long	0x33333333
	.long	0x33333333
L(rodata_0F0F0F0F0F0F0F0F):
	.long	0x0F0F0F0F
	.long	0x0F0F0F0F
')

	TEXT
	ALIGN(16)

PROLOGUE(M4_function)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	movl	PARAM_SRC, %eax

ifdef(`PIC',`
	C Build each 64-bit mask by duplicating a 32-bit immediate.  edx is
	C free as scratch here because PARAM_SRC2 is only loaded afterwards.
	movl	$0xAAAAAAAA, %edx
	movd	%edx, %mm7
	punpckldq %mm7, %mm7

	movl	$0x33333333, %edx
	movd	%edx, %mm6
	punpckldq %mm6, %mm6

	movl	$0x0F0F0F0F, %edx
	movd	%edx, %mm5
	punpckldq %mm5, %mm5

HAM(`	movl	PARAM_SRC2, %edx')

',`
	dnl non-PIC
HAM(`	movl	PARAM_SRC2, %edx')
	movq	L(rodata_AAAAAAAAAAAAAAAA), %mm7
	movq	L(rodata_3333333333333333), %mm6
	movq	L(rodata_0F0F0F0F0F0F0F0F), %mm5
')

	pxor	%mm4, %mm4	C zero
	pxor	%mm0, %mm0	C total

	C ecx = size-1.  With two or more limbs enter the two-limb loop,
	C otherwise fall through to handle the single limb.
	C NOTE(review): size==0 would also fall through, with ecx = -1, and
	C read the limb below src; presumably callers guarantee size >= 1.
	subl	$1, %ecx
	ja	L(top)

L(last):
	C One limb remains.  ecx is 0 on the paths described above, so the
	C indexed address is simply the current eax (and edx for hamdist).
	C movd clears the high dword of mm1, so it adds nothing to the count.
	movd	(%eax,%ecx,4), %mm1		C src high limb
HAM(`	movd	(%edx,%ecx,4), %mm2
	pxor	%mm2, %mm1
')
	jmp	L(loaded)


L(top):
	C eax	src
	C ebx
	C ecx	counter, size-1 to 2 or 1, inclusive
	C edx	[hamdist] src2
	C
	C mm0	total (low dword)
	C mm1	(scratch)
	C mm2	(scratch)
	C mm3
	C mm4	0x0000000000000000
	C mm5	0x0F0F0F0F0F0F0F0F
	C mm6	0x3333333333333333
	C mm7	0xAAAAAAAAAAAAAAAA

	C Assemble two limbs into mm1 and advance src past them.
	movd	(%eax), %mm1
	movd	4(%eax), %mm2
	punpckldq %mm2, %mm1
	addl	$8, %eax

HAM(`	movd	(%edx), %mm2
	movd	4(%edx), %mm3
	punpckldq %mm3, %mm2
	pxor	%mm2, %mm1
	addl	$8, %edx
')

L(loaded):
	C x -= (x & 0xAA..AA) >> 1, leaving a 0..2 count in each bit pair
	movq	%mm7, %mm2
	pand	%mm1, %mm2
	psrlq	$1, %mm2
	psubd	%mm2, %mm1	C bit pairs

	C x = (x & 0x33..33) + ((x >> 2) & 0x33..33)
	movq	%mm6, %mm2
	pand	%mm1, %mm2
	psrlq	$2, %mm1
	pand	%mm6, %mm1
	paddd	%mm2, %mm1	C nibbles

	C x = (x & 0x0F..0F) + ((x >> 4) & 0x0F..0F)
	movq	%mm5, %mm2
	pand	%mm1, %mm2
	psrlq	$4, %mm1
	pand	%mm5, %mm1
	paddd	%mm2, %mm1	C bytes

	C Sum the eight byte counts by taking absolute differences from zero.
	psadbw(	%mm4, %mm1)
	paddd	%mm1, %mm0	C to total

	subl	$2, %ecx
	jg	L(top)

	C ecx is 0 or -1 representing respectively 1 or 0 further limbs
	C (after a pass through L(last) it is -2, which likewise ends here)
	jz	L(last)


	movd	%mm0, %eax	C return the accumulated count
	emms
	ret

EPILOGUE()