Home | History | Annotate | Line # | Download | only in bd1
      1 dnl  AMD64 SSSE3/XOP mpn_hamdist -- hamming distance.
      2 
      3 dnl  Copyright 2010-2017 Free Software Foundation, Inc.
      4 
      5 dnl  This file is part of the GNU MP Library.
      6 dnl
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8 dnl  it under the terms of either:
      9 dnl
     10 dnl    * the GNU Lesser General Public License as published by the Free
     11 dnl      Software Foundation; either version 3 of the License, or (at your
     12 dnl      option) any later version.
     13 dnl
     14 dnl  or
     15 dnl
     16 dnl    * the GNU General Public License as published by the Free Software
     17 dnl      Foundation; either version 2 of the License, or (at your option) any
     18 dnl      later version.
     19 dnl
     20 dnl  or both in parallel, as here.
     21 dnl
     22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25 dnl  for more details.
     26 dnl
     27 dnl  You should have received copies of the GNU General Public License and the
     28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29 dnl  see https://www.gnu.org/licenses/.
     30 
     31 
     32 include(`../config.m4')
     33 
     34 C		    cycles/limb	  good for cpu?
     35 C AMD K8,K9		n/a
     36 C AMD K10		n/a
     37 C AMD bd1	     1.51-2.0		y
     38 C AMD bd2	     1.50-1.9		y
     39 C AMD bd3		 ?
     40 C AMD bd4		 ?
     41 C AMD zen		n/a
     42 C AMD bobcat		n/a
     43 C AMD jaguar		n/a
     44 C Intel P4		n/a
     45 C Intel PNR		n/a
     46 C Intel NHM		n/a
     47 C Intel SBR		n/a
     48 C Intel IBR		n/a
     49 C Intel HWL		n/a
     50 C Intel BWL		n/a
     51 C Intel SKL		n/a
     52 C Intel atom		n/a
     53 C Intel SLM		n/a
     54 C VIA nano		n/a
     55 
     56 C TODO
     57 C  * vpshlb, vpperm, and vphaddubq are already emitted with .byte below; all
     58 C    popcnt insns need the same treatment if we intend to support old systems.
     59 
     60 C We use vpshlb, vpperm, and vphaddubq below, which are XOP extensions to AVX.
     61 C Some systems, e.g., NetBSD, set OSXSAVE but nevertheless trigger SIGILL for
     62 C AVX.  When GMP_AVX_NOT_REALLY_AVAILABLE is defined, we fall back to the core2 code.
     63 ifdef(`GMP_AVX_NOT_REALLY_AVAILABLE',`
     64 MULFUNC_PROLOGUE(mpn_hamdist)
     65 include_mpn(`x86_64/core2/hamdist.asm')
     66 ',`
     67 
        C Symbolic names for the three argument registers used by the code below.
        C Both operands are read one 8-byte limb at a time or in 16-byte chunks;
        C n is decremented once per limb on the scalar paths.
     68 define(`up',		`%rdi')	C first operand pointer
     69 define(`vp',		`%rsi')	C second operand pointer
     70 define(`n',		`%rdx')	C limb count
     71 
        C NOTE(review): ABI_SUPPORT is defined outside this file; presumably it
        C declares which calling conventions this implementation may be built for.
     72 ABI_SUPPORT(DOS64)
     73 ABI_SUPPORT(STD64)
     74 
     75 ASM_START()
     76 	TEXT
     77 	ALIGN(32)
        C mpn_hamdist -- Hamming distance of two limb vectors.
        C
        C In:   up (%rdi), vp (%rsi) = operand pointers, n (%rdx) = limb count.
        C       n must be at least 1: the short path at L(sma) loads a limb before
        C       it tests the count.
        C Out:  %rax = sum over all limbs of popcount(up[i] xor vp[i]).
        C
        C Register use on the vector path (n >= 5):
        C   %r10       scalar popcount total
        C   %r8, %r9   scratch (%r9 first holds the address of L(cnsts))
        C   %r11       dispatch target (PIC only)
        C   %xmm7      16-entry nibble popcount table
        C   %xmm6      per-byte shift counts
        C   %xmm5      low-nibble byte mask
        C   %xmm4      16 byte-wide running counts
        C   %xmm8      two 64-bit totals
        C   %xmm0-3    scratch
        C
        C NOTE(review): %xmm6, %xmm7 and %xmm8 are written here and never saved or
        C restored in this file, yet DOS64 is listed as a supported ABI.  The
        C Microsoft x64 convention treats xmm6-xmm15 as callee-saved; confirm that
        C this is handled by FUNC_ENTRY/FUNC_EXIT or is acceptable to callers.
     78 PROLOGUE(mpn_hamdist)
     79 	FUNC_ENTRY(3)			C NOTE(review): defined elsewhere; presumably ABI entry glue for 3 args
     80 	cmp	$5, n
     81 	jl	L(sma)			C fewer than 5 limbs: plain scalar loop
     82 
     83 	lea	L(cnsts)(%rip), %r9	C base of jump table and vector constants
     84 
     85 	xor	R32(%r10), R32(%r10)	C scalar total = 0
        C The pxor memory operands in the main loop need a 16-byte aligned address.
        C If bit 3 of vp is set, handle one limb with scalar code first (this
        C assumes vp is at least 8-byte aligned on entry).
     86 	test	$8, R8(vp)
     87 	jz	L(ali)
     88 	mov	(up), %r8
     89 	xor	(vp), %r8
     90 	add	$8, up
     91 	add	$8, vp
     92 	dec	n
     93 	popcnt	%r8, %r10		C scalar total = count for this limb
     94 L(ali):
     95 
        C At this point n >= 4.  The three 16-byte constants follow the four-entry
        C jump table in L(cnsts); the table entries are read as 4 bytes each under
        C PIC and as 8 bytes each otherwise, hence the two sets of offsets.
     96 ifdef(`PIC', `define(`OFF1',16) define(`OFF2',32) define(`OFF3',48)',
     97 	     `define(`OFF1',32) define(`OFF2',48) define(`OFF3',64)')
     98 	movdqa	OFF1`'(%r9), %xmm7	C nibble counts table
     99 	movdqa	OFF2`'(%r9), %xmm6	C splat shift counts
    100 	movdqa	OFF3`'(%r9), %xmm5	C masks
    101 	pxor	%xmm4, %xmm4		C byte counters = 0
    102 	pxor	%xmm8, %xmm8		C grand total count
    103 
        C Enter the unrolled loop at the point matching n mod 8.  %rax = n & 6 is
        C 0, 2, 4 or 6; both pointers are biased by 8*%rax - 64 bytes so that the
        C fixed displacement used at the chosen entry addresses the first limb
        C still to be processed.
    104 	mov	R32(n), R32(%rax)
    105 	and	$6, R32(%rax)
    106 	lea	-64(up,%rax,8), up
    107 	lea	-64(vp,%rax,8), vp
    108 ifdef(`PIC',`
    109 	movslq	(%r9,%rax,2), %r11
    110 	add	%r9, %r11
    111 	jmp	*%r11
    112 ',`
    113 	jmp	*(%r9,%rax,4)
    114 ')
    115 
        C Entry for n mod 8 in {0,1}: remove the -64 pointer bias and start with a
        C full pass.  Subtracting 2 from n keeps its parity for the tail test and
        C makes the loop counter drop to zero or below once at most one limb is
        C left, so a lone trailing limb does not trigger another pass.
    116 L(0):	add	$64, up
    117 	add	$64, vp
    118 	sub	$2, n
    119 
    120 	ALIGN(32)
        C A full pass handles 8 limbs: three 16-byte chunks via nibble table
        C lookup, then two limbs with scalar popcnt.  Per chunk: xmm0 = up xor vp;
        C xmm1 = xmm0 with every byte shifted by the count held in xmm6; both are
        C masked with xmm5 and used as vpperm selectors into the table in xmm7.
        C A byte of xmm4 gains at most 8 per chunk, and xmm4 is summed into xmm8
        C and restarted once per pass, so the byte counters cannot overflow.
    121 L(top):	lddqu	(up), %xmm0		C up may be unaligned
    122 	pxor	(vp), %xmm0		C vp is 16-byte aligned here
    123 	.byte	0x8f,0xe9,0x48,0x94,0xc8	C vpshlb %xmm6, %xmm0, %xmm1
    124 	pand	%xmm5, %xmm0		C low nibble of each byte
    125 	pand	%xmm5, %xmm1		C high nibble of each byte, shifted down
    126 	.byte	0x8f,0xe8,0x40,0xa3,0xd7,0x00	C vpperm %xmm0,%xmm7,%xmm7,%xmm2
    127 	.byte	0x8f,0xe8,0x40,0xa3,0xdf,0x10	C vpperm %xmm1,%xmm7,%xmm7,%xmm3
    128 	paddb	%xmm2, %xmm3		C per-byte bit count for this chunk
    129 	paddb	%xmm3, %xmm4		C accumulate into the byte counters
    130 L(6):	lddqu	16(up), %xmm0		C entry for n mod 8 in {6,7}
    131 	pxor	16(vp), %xmm0
    132 	.byte	0x8f,0xe9,0x48,0x94,0xc8	C vpshlb %xmm6, %xmm0, %xmm1
    133 	pand	%xmm5, %xmm0
    134 	pand	%xmm5, %xmm1
    135 	.byte	0x8f,0xe8,0x40,0xa3,0xd7,0x00	C vpperm %xmm0,%xmm7,%xmm7,%xmm2
    136 	.byte	0x8f,0xe8,0x40,0xa3,0xdf,0x10	C vpperm %xmm1,%xmm7,%xmm7,%xmm3
    137 	paddb	%xmm2, %xmm3
    138 	paddb	%xmm3, %xmm4
    139 L(4):	lddqu	32(up), %xmm0		C entry for n mod 8 in {4,5}
    140 	pxor	32(vp), %xmm0
    141 	.byte	0x8f,0xe9,0x48,0x94,0xc8	C vpshlb %xmm6, %xmm0, %xmm1
    142 	pand	%xmm5, %xmm0
    143 	pand	%xmm5, %xmm1
    144 	.byte	0x8f,0xe8,0x40,0xa3,0xd7,0x00	C vpperm %xmm0,%xmm7,%xmm7,%xmm2
        C Flush: the byte counters gathered so far are summed into two qwords in
        C xmm0 (xmm0 is free, its nibbles were consumed by the lookup above), and
        C xmm4 is then reused as the destination of the high-nibble lookup.
    145 	.byte	0x8f,0xe9,0x78,0xd3,0xc4	C vphaddubq %xmm4, %xmm0
    146 	.byte	0x8f,0xe8,0x40,0xa3,0xe7,0x10	C vpperm %xmm1,%xmm7,%xmm7,%xmm4
        C NOTE(review): the next insn updates xmm3, which is not read again before
        C it is overwritten at L(top) or the function returns; it looks like it
        C has no effect on the result.
    147 	paddb	%xmm2, %xmm3
    148 	paddb	%xmm2, %xmm4		C xmm4 = byte counts for this chunk only
    149 	paddq	%xmm0, %xmm8		C sum to 2 x 64-bit counts
    150 L(2):	mov	48(up), %r8		C entry for n mod 8 in {2,3}; last two limbs of the group
    151 	mov	56(up), %r9		C %r9 is free: constants are already in registers
    152 	add	$64, up
    153 	xor	48(vp), %r8
    154 	xor	56(vp), %r9
    155 	add	$64, vp
    156 	popcnt	%r8, %r8
    157 	popcnt	%r9, %r9
    158 	add	%r8, %r10
    159 	add	%r9, %r10
    160 	sub	$8, n
    161 	jg	L(top)			C more than one limb left: another pass
    162 
    163 	test	$1, R8(n)		C odd count: one limb is left at (up)
    164 	jz	L(x)
    165 	mov	(up), %r8
    166 	xor	(vp), %r8
    167 	popcnt	%r8, %r8
    168 	add	%r8, %r10
        C Combine: flush the pending byte counters, fold the two 64-bit totals
        C into one, and add the scalar total.
    169 L(x):	.byte	0x8f,0xe9,0x78,0xd3,0xc4	C vphaddubq %xmm4, %xmm0
    170 	paddq	%xmm0, %xmm8
    171 	pshufd	$14, %xmm8, %xmm0	C high qword of xmm8 into the low qword of xmm0
    172 	paddq	%xmm8, %xmm0
    173 	movq	%xmm0, %rax
    174 	add	%r10, %rax
    175 	FUNC_EXIT()
    176 	ret
    177 
        C Short path for n < 5: the first limb is counted straight into %rax, the
        C remaining n-1 limbs are handled one per iteration.
    178 L(sma):	mov	(up), %r8
    179 	xor	(vp), %r8
    180 	popcnt	%r8, %rax
    181 	dec	n
    182 	jz	L(ed)
    183 L(tp):	mov	8(up), %r8		C next limb, read before the pointer is advanced
    184 	add	$8, up
    185 	xor	8(vp), %r8
    186 	add	$8, vp
    187 	popcnt	%r8, %r8
    188 	add	%r8, %rax
    189 	dec	n
    190 	jnz	L(tp)
    191 L(ed):	FUNC_EXIT()
    192 	ret
    193 EPILOGUE()
    194 DEF_OBJECT(L(cnsts),16,`JUMPTABSECT')
        C Read-only data for mpn_hamdist: four loop-entry dispatch slots selected
        C by n & 6, followed by three 16-byte vector constants.  The code loads the
        C constants from byte offsets 16/32/48 (PIC) or 32/48/64 (non-PIC) from
        C L(cnsts), so the order and count of the entries below must not change.
    195 	JMPENT(	L(0), L(cnsts))
    196 	JMPENT(	L(2), L(cnsts))
    197 	JMPENT(	L(4), L(cnsts))
    198 	JMPENT(	L(6), L(cnsts))
    199 	.byte	0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03	C bit counts of nibble values 0..7
    200 	.byte	0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04	C bit counts of nibble values 8..15
    201 	.byte	-4,-4,-4,-4,-4,-4,-4,-4	C vpshlb count for every byte (loaded into xmm6)
    202 	.byte	-4,-4,-4,-4,-4,-4,-4,-4
    203 	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f	C low-nibble mask (loaded into xmm5)
    204 	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
    205 END_OBJECT(L(cnsts))
    206 ')
    207