1 1.1 mrg dnl X86-32 and X86-64 mpn_popcount using SSE2. 2 1.1 mrg 3 1.1.1.4 mrg dnl Copyright 2006, 2007, 2011, 2015, 2020 Free Software Foundation, Inc. 4 1.1.1.3 mrg 5 1.1 mrg dnl This file is part of the GNU MP Library. 6 1.1 mrg dnl 7 1.1 mrg dnl The GNU MP Library is free software; you can redistribute it and/or modify 8 1.1.1.3 mrg dnl it under the terms of either: 9 1.1.1.3 mrg dnl 10 1.1.1.3 mrg dnl * the GNU Lesser General Public License as published by the Free 11 1.1.1.3 mrg dnl Software Foundation; either version 3 of the License, or (at your 12 1.1.1.3 mrg dnl option) any later version. 13 1.1.1.3 mrg dnl 14 1.1.1.3 mrg dnl or 15 1.1.1.3 mrg dnl 16 1.1.1.3 mrg dnl * the GNU General Public License as published by the Free Software 17 1.1.1.3 mrg dnl Foundation; either version 2 of the License, or (at your option) any 18 1.1.1.3 mrg dnl later version. 19 1.1.1.3 mrg dnl 20 1.1.1.3 mrg dnl or both in parallel, as here. 21 1.1 mrg dnl 22 1.1 mrg dnl The GNU MP Library is distributed in the hope that it will be useful, but 23 1.1 mrg dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 1.1.1.3 mrg dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 1.1.1.3 mrg dnl for more details. 26 1.1 mrg dnl 27 1.1.1.3 mrg dnl You should have received copies of the GNU General Public License and the 28 1.1.1.3 mrg dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29 1.1.1.3 mrg dnl see https://www.gnu.org/licenses/. 30 1.1 mrg 31 1.1 mrg 32 1.1 mrg include(`../config.m4') 33 1.1 mrg 34 1.1 mrg 35 1.1.1.2 mrg C 32-bit popcount hamdist 36 1.1.1.2 mrg C cycles/limb cycles/limb 37 1.1.1.2 mrg C P5 - 38 1.1.1.2 mrg C P6 model 0-8,10-12 - 39 1.1.1.2 mrg C P6 model 9 (Banias) ? 40 1.1.1.2 mrg C P6 model 13 (Dothan) 4 41 1.1.1.2 mrg C P4 model 0 (Willamette) ? 42 1.1.1.2 mrg C P4 model 1 (?) ? 43 1.1.1.2 mrg C P4 model 2 (Northwood) 3.9 44 1.1.1.2 mrg C P4 model 3 (Prescott) ? 45 1.1.1.2 mrg C P4 model 4 (Nocona) ? 46 1.1.1.2 mrg C AMD K6 - 47 1.1.1.2 mrg C AMD K7 - 48 1.1.1.2 mrg C AMD K8 ? 49 1.1.1.2 mrg 50 1.1.1.2 mrg C 64-bit popcount hamdist 51 1.1.1.2 mrg C cycles/limb cycles/limb 52 1.1.1.2 mrg C P4 model 4 (Nocona): 8 53 1.1.1.2 mrg C AMD K8,K9 7.5 54 1.1.1.2 mrg C AMD K10 3.5 55 1.1.1.2 mrg C Intel core2 3.68 56 1.1.1.2 mrg C Intel corei 3.15 57 1.1.1.2 mrg C Intel atom 10.8 58 1.1.1.2 mrg C VIA nano 6.5 59 1.1 mrg 60 1.1 mrg C TODO 61 1.1.1.3 mrg C * Make an mpn_hamdist based on this. Alignment could either be handled by 62 1.1 mrg C using movdqu for one operand and movdqa for the other, or by painfully 63 1.1.1.3 mrg C shifting as we go. Unfortunately, there seem to be no usable shift 64 1.1 mrg C instruction, except for one that takes an immediate count. 65 1.1 mrg C * It would probably be possible to cut a few cycles/limb using software 66 1.1 mrg C pipelining. 67 1.1 mrg C * There are 35 decode slots unused by the SSE2 instructions. Loop control 68 1.1 mrg C needs just 2 or 3 slots, leaving around 32 slots. This allows a parallel 69 1.1 mrg C integer based popcount. Such a combined loop would handle 6 limbs in 70 1.1 mrg C about 30 cycles on K8. 71 1.1 mrg C * We could save a byte or two by using 32-bit operations on areg. 72 1.1 mrg C * Check if using movdqa to a temp of and then register-based pand is faster. 73 1.1 mrg 74 1.1 mrg ifelse(GMP_LIMB_BITS,`32', 75 1.1 mrg ` define(`up', `%edx') 76 1.1 mrg define(`n', `%ecx') 77 1.1 mrg define(`areg',`%eax') 78 1.1 mrg define(`breg',`%ebx') 79 1.1 mrg define(`zero',`%xmm4') 80 1.1 mrg define(`LIMB32',` $1') 81 1.1 mrg define(`LIMB64',`dnl') 82 1.1 mrg ',` 83 1.1 mrg define(`up', `%rdi') 84 1.1 mrg define(`n', `%rsi') 85 1.1 mrg define(`areg',`%rax') 86 1.1 mrg define(`breg',`%rdx') 87 1.1 mrg define(`zero',`%xmm8') 88 1.1 mrg define(`LIMB32',`dnl') 89 1.1 mrg define(`LIMB64',` $1') 90 1.1 mrg ') 91 1.1 mrg 92 1.1 mrg define(`mm01010101',`%xmm6') 93 1.1 mrg define(`mm00110011',`%xmm7') 94 1.1 mrg define(`mm00001111',`%xmm2') 95 1.1 mrg 96 1.1 mrg define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8)) 97 1.1 mrg define(`LIMBS_PER_XMM', eval(16/GMP_LIMB_BYTES)) 98 1.1 mrg define(`LIMBS_PER_2XMM', eval(32/GMP_LIMB_BYTES)) 99 1.1 mrg 100 1.1 mrg undefine(`psadbw') C override inherited m4 version 101 1.1 mrg 102 1.1.1.3 mrg C This file is shared between 32-bit and 64-bit builds. Only the former has 103 1.1.1.3 mrg C LEAL. Default LEAL as an alias of LEA. 104 1.1.1.3 mrg ifdef(`LEAL',,`define(`LEAL', `LEA($1,$2)')') 105 1.1.1.3 mrg 106 1.1 mrg ASM_START() 107 1.1 mrg 108 1.1 mrg C Make cnsts global to work around Apple relocation bug. 109 1.1 mrg ifdef(`DARWIN',` 110 1.1 mrg define(`cnsts', MPN(popccnsts)) 111 1.1 mrg GLOBL cnsts') 112 1.1 mrg 113 1.1 mrg TEXT 114 1.1 mrg ALIGN(32) 115 1.1 mrg PROLOGUE(mpn_popcount) 116 1.1 mrg 117 1.1 mrg LIMB32(`mov 4(%esp), up ') 118 1.1 mrg LIMB32(`mov 8(%esp), n ') 119 1.1 mrg LIMB32(`push %ebx ') 120 1.1 mrg 121 1.1 mrg pxor %xmm3, %xmm3 C zero grand total count 122 1.1 mrg LIMB64(`pxor zero, zero ') 123 1.1.1.4 mrg 124 1.1.1.3 mrg LEAL( cnsts, breg) 125 1.1 mrg 126 1.1 mrg movdqa -48(breg), mm01010101 127 1.1 mrg movdqa -32(breg), mm00110011 128 1.1 mrg movdqa -16(breg), mm00001111 129 1.1 mrg 130 1.1 mrg mov up, areg 131 1.1 mrg and $-16, up C round `up' down to 128-bit boundary 132 1.1 mrg and $12, areg C 32:areg = 0, 4, 8, 12 133 1.1 mrg C 64:areg = 0, 8 134 1.1 mrg movdqa (up), %xmm0 135 1.1 mrg pand 64(breg,areg,4), %xmm0 136 1.1 mrg shr $m4_log2(GMP_LIMB_BYTES), %eax 137 1.1 mrg add areg, n C compensate n for rounded down `up' 138 1.1 mrg 139 1.1 mrg pxor %xmm4, %xmm4 140 1.1 mrg sub $LIMBS_PER_XMM, n 141 1.1 mrg jbe L(sum) 142 1.1 mrg 143 1.1 mrg sub $LIMBS_PER_XMM, n 144 1.1 mrg ja L(ent) 145 1.1 mrg jmp L(lsum) 146 1.1 mrg 147 1.1 mrg ALIGN(16) 148 1.1 mrg L(top): movdqa (up), %xmm0 149 1.1 mrg L(ent): movdqa 16(up), %xmm4 150 1.1 mrg 151 1.1 mrg movdqa %xmm0, %xmm1 152 1.1 mrg movdqa %xmm4, %xmm5 153 1.1 mrg psrld $1, %xmm0 154 1.1 mrg psrld $1, %xmm4 155 1.1 mrg pand mm01010101, %xmm0 156 1.1 mrg pand mm01010101, %xmm4 157 1.1 mrg psubd %xmm0, %xmm1 158 1.1 mrg psubd %xmm4, %xmm5 159 1.1 mrg 160 1.1 mrg movdqa %xmm1, %xmm0 161 1.1 mrg movdqa %xmm5, %xmm4 162 1.1 mrg psrlq $2, %xmm1 163 1.1 mrg psrlq $2, %xmm5 164 1.1 mrg pand mm00110011, %xmm0 165 1.1 mrg pand mm00110011, %xmm4 166 1.1 mrg pand mm00110011, %xmm1 167 1.1 mrg pand mm00110011, %xmm5 168 1.1 mrg paddq %xmm0, %xmm1 169 1.1 mrg paddq %xmm4, %xmm5 170 1.1 mrg 171 1.1 mrg LIMB32(`pxor zero, zero ') 172 1.1 mrg 173 1.1 mrg add $32, up 174 1.1 mrg sub $LIMBS_PER_2XMM, n 175 1.1 mrg 176 1.1 mrg paddq %xmm5, %xmm1 177 1.1 mrg movdqa %xmm1, %xmm0 178 1.1 mrg psrlq $4, %xmm1 179 1.1 mrg pand mm00001111, %xmm0 180 1.1 mrg pand mm00001111, %xmm1 181 1.1 mrg paddq %xmm0, %xmm1 182 1.1 mrg 183 1.1 mrg psadbw zero, %xmm1 184 1.1 mrg paddq %xmm1, %xmm3 C add to grand total 185 1.1 mrg 186 1.1 mrg jnc L(top) 187 1.1 mrg L(end): 188 1.1 mrg add $LIMBS_PER_2XMM, n 189 1.1 mrg jz L(rt) 190 1.1 mrg movdqa (up), %xmm0 191 1.1 mrg pxor %xmm4, %xmm4 192 1.1 mrg sub $LIMBS_PER_XMM, n 193 1.1 mrg jbe L(sum) 194 1.1 mrg L(lsum): 195 1.1 mrg movdqa %xmm0, %xmm4 196 1.1 mrg movdqa 16(up), %xmm0 197 1.1 mrg L(sum): 198 1.1 mrg shl $m4_log2(GMP_LIMB_BYTES), n 199 1.1 mrg and $12, n 200 1.1 mrg pand (breg,n,4), %xmm0 201 1.1 mrg 202 1.1 mrg movdqa %xmm0, %xmm1 203 1.1 mrg movdqa %xmm4, %xmm5 204 1.1 mrg psrld $1, %xmm0 205 1.1 mrg psrld $1, %xmm4 206 1.1 mrg pand mm01010101, %xmm0 207 1.1 mrg pand mm01010101, %xmm4 208 1.1 mrg psubd %xmm0, %xmm1 209 1.1 mrg psubd %xmm4, %xmm5 210 1.1 mrg 211 1.1 mrg movdqa %xmm1, %xmm0 212 1.1 mrg movdqa %xmm5, %xmm4 213 1.1 mrg psrlq $2, %xmm1 214 1.1 mrg psrlq $2, %xmm5 215 1.1 mrg pand mm00110011, %xmm0 216 1.1 mrg pand mm00110011, %xmm4 217 1.1 mrg pand mm00110011, %xmm1 218 1.1 mrg pand mm00110011, %xmm5 219 1.1 mrg paddq %xmm0, %xmm1 220 1.1 mrg paddq %xmm4, %xmm5 221 1.1 mrg 222 1.1 mrg LIMB32(`pxor zero, zero ') 223 1.1 mrg 224 1.1 mrg paddq %xmm5, %xmm1 225 1.1 mrg movdqa %xmm1, %xmm0 226 1.1 mrg psrlq $4, %xmm1 227 1.1 mrg pand mm00001111, %xmm0 228 1.1 mrg pand mm00001111, %xmm1 229 1.1 mrg paddq %xmm0, %xmm1 230 1.1 mrg 231 1.1 mrg psadbw zero, %xmm1 232 1.1 mrg paddq %xmm1, %xmm3 C add to grand total 233 1.1 mrg 234 1.1 mrg 235 1.1 mrg C Add the two 64-bit halves of the grand total counter 236 1.1 mrg L(rt): movdqa %xmm3, %xmm0 237 1.1 mrg psrldq $8, %xmm3 238 1.1 mrg paddq %xmm3, %xmm0 239 1.1 mrg movd %xmm0, areg C movq avoided due to gas bug 240 1.1 mrg 241 1.1 mrg LIMB32(`pop %ebx ') 242 1.1 mrg ret 243 1.1 mrg 244 1.1 mrg EPILOGUE() 245 1.1 mrg DEF_OBJECT(dummy,16) 246 1.1 mrg C Three magic constants used for masking out bits 247 1.1 mrg .byte 0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55 248 1.1 mrg .byte 0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55 249 1.1 mrg 250 1.1 mrg .byte 0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33 251 1.1 mrg .byte 0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33 252 1.1 mrg 253 1.1 mrg .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f 254 1.1 mrg .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f 255 1.1 mrg cnsts: 256 1.1 mrg C Masks for high end of number 257 1.1 mrg .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff 258 1.1 mrg .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff 259 1.1 mrg 260 1.1 mrg .byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00 261 1.1 mrg .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 262 1.1 mrg 263 1.1 mrg .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff 264 1.1 mrg .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 265 1.1 mrg 266 1.1 mrg .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff 267 1.1 mrg .byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00 268 1.1 mrg C Masks for low end of number 269 1.1 mrg .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff 270 1.1 mrg .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff 271 1.1 mrg 272 1.1 mrg .byte 0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff 273 1.1 mrg .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff 274 1.1 mrg 275 1.1 mrg .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 276 1.1 mrg .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff 277 1.1 mrg 278 1.1 mrg .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 279 1.1 mrg .byte 0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff 280 1.1 mrg END_OBJECT(dummy) 281 1.1.1.3 mrg ASM_END() 282