Home | History | Annotate | Line # | Download | only in bd1
      1  1.1.1.3  mrg dnl  AMD64 SSSE3/XOP mpn_hamdist -- hamming distance.
      2      1.1  mrg 
      3  1.1.1.3  mrg dnl  Copyright 2010-2017 Free Software Foundation, Inc.
      4      1.1  mrg 
      5      1.1  mrg dnl  This file is part of the GNU MP Library.
      6  1.1.1.2  mrg dnl
      7      1.1  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8  1.1.1.2  mrg dnl  it under the terms of either:
      9  1.1.1.2  mrg dnl
     10  1.1.1.2  mrg dnl    * the GNU Lesser General Public License as published by the Free
     11  1.1.1.2  mrg dnl      Software Foundation; either version 3 of the License, or (at your
     12  1.1.1.2  mrg dnl      option) any later version.
     13  1.1.1.2  mrg dnl
     14  1.1.1.2  mrg dnl  or
     15  1.1.1.2  mrg dnl
     16  1.1.1.2  mrg dnl    * the GNU General Public License as published by the Free Software
     17  1.1.1.2  mrg dnl      Foundation; either version 2 of the License, or (at your option) any
     18  1.1.1.2  mrg dnl      later version.
     19  1.1.1.2  mrg dnl
     20  1.1.1.2  mrg dnl  or both in parallel, as here.
     21  1.1.1.2  mrg dnl
     22      1.1  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23      1.1  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24  1.1.1.2  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25  1.1.1.2  mrg dnl  for more details.
     26  1.1.1.2  mrg dnl
     27  1.1.1.2  mrg dnl  You should have received copies of the GNU General Public License and the
     28  1.1.1.2  mrg dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29  1.1.1.2  mrg dnl  see https://www.gnu.org/licenses/.
     30      1.1  mrg 
     31      1.1  mrg 
     32      1.1  mrg include(`../config.m4')
     33      1.1  mrg 
     34  1.1.1.3  mrg C		    cycles/limb	  good for cpu?
     35  1.1.1.3  mrg C AMD K8,K9		n/a
     36  1.1.1.3  mrg C AMD K10		n/a
     37  1.1.1.3  mrg C AMD bd1	     1.51-2.0		y
     38  1.1.1.3  mrg C AMD bd2	     1.50-1.9		y
     39  1.1.1.3  mrg C AMD bd3		 ?
     40  1.1.1.3  mrg C AMD bd4		 ?
     41  1.1.1.3  mrg C AMD zen		n/a
     42  1.1.1.3  mrg C AMD bobcat		n/a
     43  1.1.1.3  mrg C AMD jaguar		n/a
     44  1.1.1.3  mrg C Intel P4		n/a
     45  1.1.1.3  mrg C Intel PNR		n/a
     46  1.1.1.3  mrg C Intel NHM		n/a
     47  1.1.1.3  mrg C Intel SBR		n/a
     48  1.1.1.3  mrg C Intel IBR		n/a
     49  1.1.1.3  mrg C Intel HWL		n/a
     50  1.1.1.3  mrg C Intel BWL		n/a
     51  1.1.1.3  mrg C Intel SKL		n/a
     52  1.1.1.3  mrg C Intel atom		n/a
     53  1.1.1.3  mrg C Intel SLM		n/a
     54  1.1.1.3  mrg C VIA nano		n/a
     55  1.1.1.3  mrg 
     56  1.1.1.3  mrg C TODO
     57  1.1.1.3  mrg C  * vpshlb, vpperm, and vphaddubq already use .byte below; every popcnt would
     58  1.1.1.3  mrg C    need the same treatment if we intend to support old systems.
     59  1.1.1.3  mrg 
     60  1.1.1.3  mrg C We use vpshlb and vpperm below, which are XOP extensions to AVX.  Some
     61  1.1.1.3  mrg C systems, e.g., NetBSD, set OSXSAVE but nevertheless trigger SIGILL for AVX.
     62  1.1.1.3  mrg C We fall back to the core2 code.
     63  1.1.1.3  mrg ifdef(`GMP_AVX_NOT_REALLY_AVAILABLE',`
     64  1.1.1.3  mrg MULFUNC_PROLOGUE(mpn_hamdist)
     65  1.1.1.3  mrg include_mpn(`x86_64/core2/hamdist.asm')
     66  1.1.1.3  mrg ',`
     67  1.1.1.3  mrg 
     68  1.1.1.3  mrg define(`up',		`%rdi')	C pointer to the first source limb vector
     69  1.1.1.3  mrg define(`vp',		`%rsi')	C pointer to the second source limb vector
     70  1.1.1.3  mrg define(`n',		`%rdx')	C number of 8-byte limbs to compare
     71  1.1.1.3  mrg 
     72      1.1  mrg ABI_SUPPORT(DOS64)
     73      1.1  mrg ABI_SUPPORT(STD64)
     74      1.1  mrg 
     75  1.1.1.3  mrg ASM_START()
     76  1.1.1.3  mrg 	TEXT
     77  1.1.1.3  mrg 	ALIGN(32)
     78  1.1.1.3  mrg PROLOGUE(mpn_hamdist)	C returns in rax the number of bits that differ between the n limbs at up and the n limbs at vp
     79  1.1.1.3  mrg 	FUNC_ENTRY(3)		C NOTE(review): xmm6-xmm8 are overwritten below and are callee-saved in the Microsoft x64 ABI; confirm FUNC_ENTRY/FUNC_EXIT or the callers cover this for DOS64
     80  1.1.1.3  mrg 	cmp	$5, n
     81  1.1.1.3  mrg 	jl	L(sma)			C fewer than 5 limbs: use the plain scalar loop
     82  1.1.1.3  mrg 
     83  1.1.1.3  mrg 	lea	L(cnsts)(%rip), %r9	C r9 = base of the jump table and vector constants
     84  1.1.1.3  mrg 
     85  1.1.1.3  mrg 	xor	R32(%r10), R32(%r10)	C r10 = running count from the scalar popcnt steps
     86  1.1.1.3  mrg 	test	$8, R8(vp)		C pxor below reads vp as a 16-byte memory operand; NOTE(review): presumably limbs are 8-byte aligned so bit 3 decides 16-byte alignment
     87  1.1.1.3  mrg 	jz	L(ali)			C bit 3 clear: no peel needed
     88  1.1.1.3  mrg 	mov	(up), %r8		C peel one limb with scalar code
     89  1.1.1.3  mrg 	xor	(vp), %r8
     90  1.1.1.3  mrg 	add	$8, up
     91  1.1.1.3  mrg 	add	$8, vp
     92  1.1.1.3  mrg 	dec	n
     93  1.1.1.3  mrg 	popcnt	%r8, %r10		C r10 = bit count of the peeled limb
     94  1.1.1.3  mrg L(ali):
     95  1.1.1.3  mrg 
     96  1.1.1.3  mrg ifdef(`PIC', `define(`OFF1',16) define(`OFF2',32) define(`OFF3',48)',
     97  1.1.1.3  mrg 	     `define(`OFF1',32) define(`OFF2',48) define(`OFF3',64)')
     98  1.1.1.3  mrg 	movdqa	OFF1`'(%r9), %xmm7	C nibble counts table
     99  1.1.1.3  mrg 	movdqa	OFF2`'(%r9), %xmm6	C splat shift counts
    100  1.1.1.3  mrg 	movdqa	OFF3`'(%r9), %xmm5	C masks
    101  1.1.1.3  mrg 	pxor	%xmm4, %xmm4		C xmm4 = per-byte partial counts
    102  1.1.1.3  mrg 	pxor	%xmm8, %xmm8		C grand total count
    103  1.1.1.3  mrg 
    104  1.1.1.3  mrg 	mov	R32(n), R32(%rax)
    105  1.1.1.3  mrg 	and	$6, R32(%rax)		C rax = 0, 2, 4 or 6: selects the entry point into the unrolled loop
    106  1.1.1.3  mrg 	lea	-64(up,%rax,8), up	C bias up by 8*rax-64 bytes so the selected entry point starts at the first unprocessed limb
    107  1.1.1.3  mrg 	lea	-64(vp,%rax,8), vp	C same bias for vp
    108  1.1.1.3  mrg ifdef(`PIC',`
    109  1.1.1.3  mrg 	movslq	(%r9,%rax,2), %r11
    110  1.1.1.3  mrg 	add	%r9, %r11
    111  1.1.1.3  mrg 	jmp	*%r11
    112  1.1.1.3  mrg ',`
    113  1.1.1.3  mrg 	jmp	*(%r9,%rax,4)
    114  1.1.1.3  mrg ')
    115  1.1.1.3  mrg 
    116  1.1.1.3  mrg L(0):	add	$64, up			C entry for rax = 0: undo the -64 bias
    117  1.1.1.3  mrg 	add	$64, vp
    118  1.1.1.3  mrg 	sub	$2, n			C bias n so the jg at the loop end falls through when only 0 or 1 limb remains
    119  1.1.1.3  mrg 
    120  1.1.1.3  mrg 	ALIGN(32)
    121  1.1.1.3  mrg L(top):	lddqu	(up), %xmm0		C unaligned 16-byte load of two up limbs
    122  1.1.1.3  mrg 	pxor	(vp), %xmm0		C xor with two vp limbs: set bits mark differences
    123  1.1.1.3  mrg 	.byte	0x8f,0xe9,0x48,0x94,0xc8	C vpshlb %xmm6, %xmm0, %xmm1
    124  1.1.1.3  mrg 	pand	%xmm5, %xmm0		C xmm0 = low nibble of each byte
    125  1.1.1.3  mrg 	pand	%xmm5, %xmm1		C xmm1 = high nibble of each byte (shifted copy, masked)
    126  1.1.1.3  mrg 	.byte	0x8f,0xe8,0x40,0xa3,0xd7,0x00	C vpperm %xmm0,%xmm7,%xmm7,%xmm2
    127  1.1.1.3  mrg 	.byte	0x8f,0xe8,0x40,0xa3,0xdf,0x10	C vpperm %xmm1,%xmm7,%xmm7,%xmm3
    128  1.1.1.3  mrg 	paddb	%xmm2, %xmm3		C per-byte count = low-nibble count + high-nibble count
    129  1.1.1.3  mrg 	paddb	%xmm3, %xmm4		C accumulate into the per-byte partial counts
    130  1.1.1.3  mrg L(6):	lddqu	16(up), %xmm0		C entry for rax = 6; same step on the limbs at offset 16
    131  1.1.1.3  mrg 	pxor	16(vp), %xmm0
    132  1.1.1.3  mrg 	.byte	0x8f,0xe9,0x48,0x94,0xc8	C vpshlb %xmm6, %xmm0, %xmm1
    133  1.1.1.3  mrg 	pand	%xmm5, %xmm0
    134  1.1.1.3  mrg 	pand	%xmm5, %xmm1
    135  1.1.1.3  mrg 	.byte	0x8f,0xe8,0x40,0xa3,0xd7,0x00	C vpperm %xmm0,%xmm7,%xmm7,%xmm2
    136  1.1.1.3  mrg 	.byte	0x8f,0xe8,0x40,0xa3,0xdf,0x10	C vpperm %xmm1,%xmm7,%xmm7,%xmm3
    137  1.1.1.3  mrg 	paddb	%xmm2, %xmm3
    138  1.1.1.3  mrg 	paddb	%xmm3, %xmm4
    139  1.1.1.3  mrg L(4):	lddqu	32(up), %xmm0		C entry for rax = 4; same step on the limbs at offset 32
    140  1.1.1.3  mrg 	pxor	32(vp), %xmm0
    141  1.1.1.3  mrg 	.byte	0x8f,0xe9,0x48,0x94,0xc8	C vpshlb %xmm6, %xmm0, %xmm1
    142  1.1.1.3  mrg 	pand	%xmm5, %xmm0
    143  1.1.1.3  mrg 	pand	%xmm5, %xmm1
    144  1.1.1.3  mrg 	.byte	0x8f,0xe8,0x40,0xa3,0xd7,0x00	C vpperm %xmm0,%xmm7,%xmm7,%xmm2
    145  1.1.1.3  mrg 	.byte	0x8f,0xe9,0x78,0xd3,0xc4	C vphaddubq %xmm4, %xmm0
    146  1.1.1.3  mrg 	.byte	0x8f,0xe8,0x40,0xa3,0xe7,0x10	C vpperm %xmm1,%xmm7,%xmm7,%xmm4
    147  1.1.1.3  mrg 	paddb	%xmm2, %xmm3		C NOTE(review): xmm3 is rewritten by vpperm before any later read, so this add looks dead -- confirm
    148  1.1.1.3  mrg 	paddb	%xmm2, %xmm4		C xmm4 restarts as the byte counts of this block only; its old value was summed into xmm0 above
    149  1.1.1.3  mrg 	paddq	%xmm0, %xmm8		C sum to 2 x 64-bit counts
    150  1.1.1.3  mrg L(2):	mov	48(up), %r8		C entry for rax = 2; the limbs at offsets 48 and 56 use scalar popcnt
    151  1.1.1.3  mrg 	mov	56(up), %r9		C r9 is reused as data here; the table base is no longer needed
    152  1.1.1.3  mrg 	add	$64, up
    153  1.1.1.3  mrg 	xor	48(vp), %r8
    154  1.1.1.3  mrg 	xor	56(vp), %r9
    155  1.1.1.3  mrg 	add	$64, vp
    156  1.1.1.3  mrg 	popcnt	%r8, %r8
    157  1.1.1.3  mrg 	popcnt	%r9, %r9
    158  1.1.1.3  mrg 	add	%r8, %r10
    159  1.1.1.3  mrg 	add	%r9, %r10
    160  1.1.1.3  mrg 	sub	$8, n			C one full pass of the unrolled body covers 8 limbs
    161  1.1.1.3  mrg 	jg	L(top)
    162  1.1.1.3  mrg 
    163  1.1.1.3  mrg 	test	$1, R8(n)		C odd n here means one limb is still unprocessed
    164  1.1.1.3  mrg 	jz	L(x)
    165  1.1.1.3  mrg 	mov	(up), %r8		C handle the final limb with scalar code
    166  1.1.1.3  mrg 	xor	(vp), %r8
    167  1.1.1.3  mrg 	popcnt	%r8, %r8
    168  1.1.1.3  mrg 	add	%r8, %r10
    169  1.1.1.3  mrg L(x):	.byte	0x8f,0xe9,0x78,0xd3,0xc4	C vphaddubq %xmm4, %xmm0
    170  1.1.1.3  mrg 	paddq	%xmm0, %xmm8		C fold the last byte counts into the 64-bit totals
    171  1.1.1.3  mrg 	pshufd	$14, %xmm8, %xmm0	C copy the high qword of xmm8 into the low qword of xmm0
    172  1.1.1.3  mrg 	paddq	%xmm8, %xmm0		C low qword = sum of both 64-bit totals
    173  1.1.1.3  mrg 	movq	%xmm0, %rax		C rax = vector total
    174  1.1.1.3  mrg 	add	%r10, %rax		C plus scalar total = return value
    175  1.1.1.3  mrg 	FUNC_EXIT()
    176  1.1.1.3  mrg 	ret
    177  1.1.1.3  mrg 
    178  1.1.1.3  mrg L(sma):	mov	(up), %r8		C scalar path for n below 5; NOTE(review): the first limb is read unconditionally, so this assumes n >= 1
    179  1.1.1.3  mrg 	xor	(vp), %r8
    180  1.1.1.3  mrg 	popcnt	%r8, %rax		C rax = count for the first limb
    181  1.1.1.3  mrg 	dec	n
    182  1.1.1.3  mrg 	jz	L(ed)			C single limb: done
    183  1.1.1.3  mrg L(tp):	mov	8(up), %r8		C next limb, read before the pointer is advanced
    184  1.1.1.3  mrg 	add	$8, up
    185  1.1.1.3  mrg 	xor	8(vp), %r8
    186  1.1.1.3  mrg 	add	$8, vp
    187  1.1.1.3  mrg 	popcnt	%r8, %r8
    188  1.1.1.3  mrg 	add	%r8, %rax		C accumulate directly into the return value
    189  1.1.1.3  mrg 	dec	n
    190  1.1.1.3  mrg 	jnz	L(tp)
    191  1.1.1.3  mrg L(ed):	FUNC_EXIT()
    192  1.1.1.3  mrg 	ret
    193  1.1.1.3  mrg EPILOGUE()
    194  1.1.1.3  mrg DEF_OBJECT(L(cnsts),16,`JUMPTABSECT')	C jump table followed by three 16-byte vector constants; NOTE(review): 16 is presumably the alignment that the movdqa loads rely on -- confirm against DEF_OBJECT
    195  1.1.1.3  mrg 	JMPENT(	L(0), L(cnsts))		C entry taken when (n and 6) = 0
    196  1.1.1.3  mrg 	JMPENT(	L(2), L(cnsts))		C entry taken when (n and 6) = 2
    197  1.1.1.3  mrg 	JMPENT(	L(4), L(cnsts))		C entry taken when (n and 6) = 4
    198  1.1.1.3  mrg 	JMPENT(	L(6), L(cnsts))		C entry taken when (n and 6) = 6
    199  1.1.1.3  mrg 	.byte	0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03	C bit counts of nibble values 0-7 (loaded at OFF1)
    200  1.1.1.3  mrg 	.byte	0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04	C bit counts of nibble values 8-15
    201  1.1.1.3  mrg 	.byte	-4,-4,-4,-4,-4,-4,-4,-4	C per-byte shift count of -4 used by vpshlb (loaded at OFF2)
    202  1.1.1.3  mrg 	.byte	-4,-4,-4,-4,-4,-4,-4,-4
    203  1.1.1.3  mrg 	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f	C per-byte low-nibble mask (loaded at OFF3)
    204  1.1.1.3  mrg 	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
    205  1.1.1.3  mrg END_OBJECT(L(cnsts))
    206  1.1.1.3  mrg ')
    207