dnl  AMD64 SSSE3/XOP mpn_hamdist -- hamming distance.

dnl  Copyright 2010-2017 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.


include(`../config.m4')

C                   cycles/limb     good for cpu?
C AMD K8,K9             n/a
C AMD K10               n/a
C AMD bd1            1.51-2.0             y
C AMD bd2            1.50-1.9             y
C AMD bd3                ?
C AMD bd4                ?
C AMD zen               n/a
C AMD bobcat            n/a
C AMD jaguar            n/a
C Intel P4              n/a
C Intel PNR             n/a
C Intel NHM             n/a
C Intel SBR             n/a
C Intel IBR             n/a
C Intel HWL             n/a
C Intel BWL             n/a
C Intel SKL             n/a
C Intel atom            n/a
C Intel SLM             n/a
C VIA nano              n/a

C TODO
C  * We need to use .byte for vpshlb, vpperm, vphaddubq, and all popcnt if we
C    intend to support old systems.
C    NOTE(review): the three XOP instructions are already emitted with .byte
C    below; only popcnt is still written as a mnemonic.

C We use vpshlb and vpperm below, which are XOP extensions to AVX.  Some
C systems, e.g., NetBSD, set OSXSAVE but nevertheless trigger SIGILL for AVX.
C We fall back to the core2 code.
ifdef(`GMP_AVX_NOT_REALLY_AVAILABLE',`
MULFUNC_PROLOGUE(mpn_hamdist)
include_mpn(`x86_64/core2/hamdist.asm')
',`

C mpn_hamdist (up, vp, n)
C
C Returns in %rax the total number of set bits in up[i] XOR vp[i], summed
C over n 64-bit limbs.
C
C Register use on the main path:
C   %r9           address of L(cnsts) until the dispatch jump, then scratch
C   %r8           scratch
C   %r10          running count produced by the scalar popcnt instructions
C   %xmm7         16-entry table holding the bit count of each nibble value
C   %xmm6         per-byte shift counts for vpshlb (all -4)
C   %xmm5         per-byte mask 0x0f
C   %xmm4         per-byte partial counts, folded into %xmm8 at L(4) and L(x)
C   %xmm8         two 64-bit running totals
C   %xmm0-%xmm3   temporaries
C
C NOTE(review): the small operand path handles one limb before testing n, so
C it presumably relies on n >= 1.
C
C NOTE(review): %xmm6, %xmm7 and %xmm8 are overwritten and never restored in
C this file.  The Microsoft x64 convention treats xmm6-xmm15 as callee-saved
C and ABI_SUPPORT(DOS64) is declared below; confirm that FUNC_ENTRY/FUNC_EXIT
C (defined outside this file) account for that.

define(`up',		`%rdi')
define(`vp',		`%rsi')
define(`n',		`%rdx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_hamdist)
	FUNC_ENTRY(3)
	cmp	$5, n
	jl	L(sma)			C fewer than 5 limbs: plain popcnt loop

	lea	L(cnsts)(%rip), %r9	C jump table followed by vector constants

	xor	R32(%r10), R32(%r10)	C scalar count = 0
	test	$8, R8(vp)
	jz	L(ali)
	C Bit 3 of vp is set: consume one limb so that vp advances to the next
	C multiple of 16 (assuming it was a multiple of 8).  Presumably needed
	C because pxor below takes vp as a memory operand.
	mov	(up), %r8
	xor	(vp), %r8
	add	$8, up
	add	$8, vp
	dec	n
	popcnt	%r8, %r10
L(ali):

C The constants follow the four jump table entries, whose size is 4 bytes
C each under PIC and 8 bytes each otherwise (see the dispatch code below).
ifdef(`PIC', `define(`OFF1',16) define(`OFF2',32) define(`OFF3',48)',
	     `define(`OFF1',32) define(`OFF2',48) define(`OFF3',64)')
	movdqa	OFF1`'(%r9), %xmm7	C nibble counts table
	movdqa	OFF2`'(%r9), %xmm6	C splat shift counts
	movdqa	OFF3`'(%r9), %xmm5	C masks
	pxor	%xmm4, %xmm4		C per-byte partial counts = 0
	pxor	%xmm8, %xmm8		C grand total count

	C Dispatch on n AND 6, i.e. 0, 2, 4 or 6.  up and vp are biased so that
	C the fixed displacements at the selected entry point address the first
	C limbs not yet processed.
	mov	R32(n), R32(%rax)
	and	$6, R32(%rax)
	lea	-64(up,%rax,8), up
	lea	-64(vp,%rax,8), vp
ifdef(`PIC',`
	movslq	(%r9,%rax,2), %r11	C 32-bit entry, added to the table base
	add	%r9, %r11
	jmp	*%r11
',`
	jmp	*(%r9,%rax,4)		C 64-bit entry holding the target address
')

L(0):	add	$64, up			C undo the -64 bias applied above
	add	$64, vp
	sub	$2, n			C so one leftover limb cannot start another pass

	ALIGN(32)
C Each pass handles 8 limbs: three 16-byte vector sections counted through
C nibble table lookups, then two limbs counted with scalar popcnt.
L(top):	lddqu	(up), %xmm0
	pxor	(vp), %xmm0		C bits that differ
	.byte	0x8f,0xe9,0x48,0x94,0xc8	C vpshlb %xmm6, %xmm0, %xmm1
	pand	%xmm5, %xmm0		C low nibble of each byte
	pand	%xmm5, %xmm1		C high nibble of each byte
	.byte	0x8f,0xe8,0x40,0xa3,0xd7,0x00	C vpperm %xmm0,%xmm7,%xmm7,%xmm2
	.byte	0x8f,0xe8,0x40,0xa3,0xdf,0x10	C vpperm %xmm1,%xmm7,%xmm7,%xmm3
	paddb	%xmm2, %xmm3		C count per byte
	paddb	%xmm3, %xmm4		C accumulate per byte
L(6):	lddqu	16(up), %xmm0
	pxor	16(vp), %xmm0
	.byte	0x8f,0xe9,0x48,0x94,0xc8	C vpshlb %xmm6, %xmm0, %xmm1
	pand	%xmm5, %xmm0
	pand	%xmm5, %xmm1
	.byte	0x8f,0xe8,0x40,0xa3,0xd7,0x00	C vpperm %xmm0,%xmm7,%xmm7,%xmm2
	.byte	0x8f,0xe8,0x40,0xa3,0xdf,0x10	C vpperm %xmm1,%xmm7,%xmm7,%xmm3
	paddb	%xmm2, %xmm3
	paddb	%xmm3, %xmm4
C This section also flushes the byte accumulator: the old %xmm4 is summed
C into %xmm0 by vphaddubq, then %xmm4 is restarted from the counts of this
C section.
L(4):	lddqu	32(up), %xmm0
	pxor	32(vp), %xmm0
	.byte	0x8f,0xe9,0x48,0x94,0xc8	C vpshlb %xmm6, %xmm0, %xmm1
	pand	%xmm5, %xmm0
	pand	%xmm5, %xmm1
	.byte	0x8f,0xe8,0x40,0xa3,0xd7,0x00	C vpperm %xmm0,%xmm7,%xmm7,%xmm2
	.byte	0x8f,0xe9,0x78,0xd3,0xc4	C vphaddubq %xmm4, %xmm0
	.byte	0x8f,0xe8,0x40,0xa3,0xe7,0x10	C vpperm %xmm1,%xmm7,%xmm7,%xmm4
	C NOTE(review): %xmm3 is rewritten at L(top) and L(6) before it is read
	C again, so the next add looks redundant; kept unchanged.
	paddb	%xmm2, %xmm3
	paddb	%xmm2, %xmm4
	paddq	%xmm0, %xmm8		C sum to 2 x 64-bit counts
L(2):	mov	48(up), %r8
	mov	56(up), %r9		C %r9 is free: constants are already loaded
	add	$64, up
	xor	48(vp), %r8
	xor	56(vp), %r9
	add	$64, vp
	popcnt	%r8, %r8
	popcnt	%r9, %r9
	add	%r8, %r10
	add	%r9, %r10
	sub	$8, n
	jg	L(top)

	test	$1, R8(n)		C only even amounts were subtracted since L(ali)
	jz	L(x)
	mov	(up), %r8		C one odd limb left
	xor	(vp), %r8
	popcnt	%r8, %r8
	add	%r8, %r10
L(x):	.byte	0x8f,0xe9,0x78,0xd3,0xc4	C vphaddubq %xmm4, %xmm0
	paddq	%xmm0, %xmm8		C fold the remaining byte counts
	pshufd	$14, %xmm8, %xmm0	C bring the high 64-bit total down
	paddq	%xmm8, %xmm0
	movq	%xmm0, %rax
	add	%r10, %rax		C vector total + scalar total
	FUNC_EXIT()
	ret

C Small operand path: one popcnt per limb, accumulated in %rax.
L(sma):	mov	(up), %r8
	xor	(vp), %r8
	popcnt	%r8, %rax
	dec	n
	jz	L(ed)
L(tp):	mov	8(up), %r8
	add	$8, up
	xor	8(vp), %r8
	add	$8, vp
	popcnt	%r8, %r8
	add	%r8, %rax
	dec	n
	jnz	L(tp)
L(ed):	FUNC_EXIT()
	ret
EPILOGUE()
C Jump table for the dispatch on n AND 6, followed by three 16-byte constants.
DEF_OBJECT(L(cnsts),16,`JUMPTABSECT')
	JMPENT(	L(0), L(cnsts))
	JMPENT(	L(2), L(cnsts))
	JMPENT(	L(4), L(cnsts))
	JMPENT(	L(6), L(cnsts))
	C bit counts of the nibble values 0..15, loaded into %xmm7
	.byte	0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03
	.byte	0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04
	C vpshlb shift counts, loaded into %xmm6
	.byte	-4,-4,-4,-4,-4,-4,-4,-4
	.byte	-4,-4,-4,-4,-4,-4,-4,-4
	C nibble mask, loaded into %xmm5
	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
END_OBJECT(L(cnsts))
')