Home | History | Annotate | Line # | Download | only in mmx
      1      1.1  mrg dnl  AMD K6-2 mpn_popcount, mpn_hamdist -- mpn bit population count and
      2      1.1  mrg dnl  hamming distance.
      3      1.1  mrg 
      4  1.1.1.2  mrg dnl  Copyright 2000-2002 Free Software Foundation, Inc.
      5  1.1.1.2  mrg 
      6      1.1  mrg dnl  This file is part of the GNU MP Library.
      7      1.1  mrg dnl
      8  1.1.1.2  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      9  1.1.1.2  mrg dnl  it under the terms of either:
     10  1.1.1.2  mrg dnl
     11  1.1.1.2  mrg dnl    * the GNU Lesser General Public License as published by the Free
     12  1.1.1.2  mrg dnl      Software Foundation; either version 3 of the License, or (at your
     13  1.1.1.2  mrg dnl      option) any later version.
     14  1.1.1.2  mrg dnl
     15  1.1.1.2  mrg dnl  or
     16  1.1.1.2  mrg dnl
     17  1.1.1.2  mrg dnl    * the GNU General Public License as published by the Free Software
     18  1.1.1.2  mrg dnl      Foundation; either version 2 of the License, or (at your option) any
     19  1.1.1.2  mrg dnl      later version.
     20  1.1.1.2  mrg dnl
     21  1.1.1.2  mrg dnl  or both in parallel, as here.
     22  1.1.1.2  mrg dnl
     23  1.1.1.2  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     24  1.1.1.2  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     25  1.1.1.2  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     26  1.1.1.2  mrg dnl  for more details.
     27      1.1  mrg dnl
     28  1.1.1.2  mrg dnl  You should have received copies of the GNU General Public License and the
     29  1.1.1.2  mrg dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     30  1.1.1.2  mrg dnl  see https://www.gnu.org/licenses/.
     31      1.1  mrg 
     32      1.1  mrg include(`../config.m4')
     33      1.1  mrg 
     34      1.1  mrg 
     35      1.1  mrg C        popcount  hamdist
     36      1.1  mrg C K6-2:    9.0       11.5   cycles/limb
     37      1.1  mrg C K6:      12.5      13.0
     38      1.1  mrg 
     39      1.1  mrg 
     40      1.1  mrg C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
     41      1.1  mrg C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
     42      1.1  mrg C
     43      1.1  mrg C The code here isn't optimal, but it's already a 2x speedup over the plain
     44      1.1  mrg C integer mpn/generic/popcount.c,hamdist.c.
     45      1.1  mrg 
     46      1.1  mrg C Stop with an error unless OPERATION_popcount or OPERATION_hamdist is defined.
     47      1.1  mrg ifdef(`OPERATION_popcount',,
     48      1.1  mrg `ifdef(`OPERATION_hamdist',,
     49      1.1  mrg `m4_error(`Need OPERATION_popcount or OPERATION_hamdist
     50      1.1  mrg ')m4exit(1)')')
     51      1.1  mrg C HAM emits its single argument only when OPERATION_hamdist is defined.
     52      1.1  mrg define(HAM,
     53      1.1  mrg m4_assert_numargs(1)
     54      1.1  mrg `ifdef(`OPERATION_hamdist',`$1')')
     55      1.1  mrg C POP emits its single argument only when OPERATION_popcount is defined.
     56      1.1  mrg define(POP,
     57      1.1  mrg m4_assert_numargs(1)
     58      1.1  mrg `ifdef(`OPERATION_popcount',`$1')')
     59      1.1  mrg C Parameter frame entries and function name for whichever operation is built (defframe is defined outside this file).
     60      1.1  mrg HAM(`
     61      1.1  mrg defframe(PARAM_SIZE,   12)
     62      1.1  mrg defframe(PARAM_SRC2,   8)
     63      1.1  mrg defframe(PARAM_SRC,    4)
     64      1.1  mrg define(M4_function,mpn_hamdist)
     65      1.1  mrg ')
     66      1.1  mrg POP(`
     67      1.1  mrg defframe(PARAM_SIZE,   8)
     68      1.1  mrg defframe(PARAM_SRC,    4)
     69      1.1  mrg define(M4_function,mpn_popcount)
     70      1.1  mrg ')
     71      1.1  mrg C Names both functions this file can provide; macro is defined outside this file, presumably read by the build -- confirm.
     72      1.1  mrg MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
     73      1.1  mrg 
     74      1.1  mrg C Non-PIC builds only: the four 64-bit masks as read-only data, each stored as two equal dwords.
     75      1.1  mrg ifdef(`PIC',,`
     76      1.1  mrg 	dnl  non-PIC
     77      1.1  mrg 
     78      1.1  mrg 	RODATA
     79      1.1  mrg 	ALIGN(8)
     80      1.1  mrg 
     81      1.1  mrg L(rodata_AAAAAAAAAAAAAAAA):
     82      1.1  mrg 	.long	0xAAAAAAAA
     83      1.1  mrg 	.long	0xAAAAAAAA
     84      1.1  mrg 
     85      1.1  mrg L(rodata_3333333333333333):
     86      1.1  mrg 	.long	0x33333333
     87      1.1  mrg 	.long	0x33333333
     88      1.1  mrg 
     89      1.1  mrg L(rodata_0F0F0F0F0F0F0F0F):
     90      1.1  mrg 	.long	0x0F0F0F0F
     91      1.1  mrg 	.long	0x0F0F0F0F
     92      1.1  mrg 
     93      1.1  mrg L(rodata_000000FF000000FF):
     94      1.1  mrg 	.long	0x000000FF
     95      1.1  mrg 	.long	0x000000FF
     96      1.1  mrg ')
     97      1.1  mrg C Function body, assembled as mpn_popcount or mpn_hamdist according to M4_function; result is returned in eax.
     98      1.1  mrg 	TEXT
     99      1.1  mrg 	ALIGN(32)
    100      1.1  mrg 
    101      1.1  mrg POP(`ifdef(`PIC', `
    102      1.1  mrg 	C avoid shrl crossing a 32-byte boundary
    103      1.1  mrg 	nop')')
    104      1.1  mrg 
    105      1.1  mrg PROLOGUE(M4_function)
    106      1.1  mrg deflit(`FRAME',0)
    107      1.1  mrg 
    108      1.1  mrg 	movl	PARAM_SIZE, %ecx	C ecx = size, a count of limbs
    109      1.1  mrg 
    110      1.1  mrg ifdef(`PIC',`
    111      1.1  mrg 	movl	$0xAAAAAAAA, %eax
    112      1.1  mrg 	movl	$0x33333333, %edx
    113      1.1  mrg 	C PIC build: form each mask from an immediate instead of loading it from memory
    114      1.1  mrg 	movd	%eax, %mm7
    115      1.1  mrg 	movd	%edx, %mm6
    116      1.1  mrg 
    117      1.1  mrg 	movl	$0x0F0F0F0F, %eax
    118      1.1  mrg 	movl	$0x000000FF, %edx
    119      1.1  mrg 
    120      1.1  mrg 	punpckldq %mm7, %mm7	C copy the low dword into the high dword
    121      1.1  mrg 	punpckldq %mm6, %mm6
    122      1.1  mrg 
    123      1.1  mrg 	movd	%eax, %mm5
    124      1.1  mrg 	movd	%edx, %mm4
    125      1.1  mrg 
    126      1.1  mrg 	punpckldq %mm5, %mm5
    127      1.1  mrg 	punpckldq %mm4, %mm4
    128      1.1  mrg ',`
    129      1.1  mrg 	C non-PIC build: load the masks from the read-only data
    130      1.1  mrg 	movq	L(rodata_AAAAAAAAAAAAAAAA), %mm7
    131      1.1  mrg 	movq	L(rodata_3333333333333333), %mm6
    132      1.1  mrg 	movq	L(rodata_0F0F0F0F0F0F0F0F), %mm5
    133      1.1  mrg 	movq	L(rodata_000000FF000000FF), %mm4
    134      1.1  mrg ')
    135      1.1  mrg C Names for the mask registers used by the loop.
    136      1.1  mrg define(REG_AAAAAAAAAAAAAAAA, %mm7)
    137      1.1  mrg define(REG_3333333333333333, %mm6)
    138      1.1  mrg define(REG_0F0F0F0F0F0F0F0F, %mm5)
    139      1.1  mrg define(REG_000000FF000000FF, %mm4)
    140      1.1  mrg 
    141      1.1  mrg 
    142      1.1  mrg 	movl	PARAM_SRC, %eax
    143      1.1  mrg HAM(`	movl	PARAM_SRC2, %edx')
    144      1.1  mrg 
    145      1.1  mrg 	pxor	%mm2, %mm2	C total
    146      1.1  mrg 
    147      1.1  mrg 	shrl	%ecx		C ecx = qword count, carry set if size is odd
    148      1.1  mrg 	jnc	L(top)		C even size: straight into the qword loop
    149      1.1  mrg 	C Odd size: the last limb goes through the loop body alone as the first pass.
    150      1.1  mrg Zdisp(	movd,	0,(%eax,%ecx,8), %mm1)
    151      1.1  mrg 
    152      1.1  mrg HAM(`
    153      1.1  mrg Zdisp(	movd,	0,(%edx,%ecx,8), %mm0)
    154      1.1  mrg 	pxor	%mm0, %mm1
    155      1.1  mrg ')
    156      1.1  mrg 
    157      1.1  mrg 	incl	%ecx		C the odd limb counts as one more loop pass
    158      1.1  mrg 	jmp	L(loaded)
    159      1.1  mrg 
    160      1.1  mrg 
    161      1.1  mrg 	ALIGN(16)
    162      1.1  mrg POP(`	nop	C alignment to avoid crossing 32-byte boundaries')
    163      1.1  mrg 
    164      1.1  mrg L(top):
    165      1.1  mrg 	C eax	src
    166      1.1  mrg 	C ebx
    167      1.1  mrg 	C ecx	counter, qwords, decrementing
    168      1.1  mrg 	C edx	[hamdist] src2
    169      1.1  mrg 	C
    170      1.1  mrg 	C mm0	(scratch)
    171      1.1  mrg 	C mm1	(scratch)
    172      1.1  mrg 	C mm2	total (low dword)
    173      1.1  mrg 	C mm3
    174      1.1  mrg 	C mm4	\
    175      1.1  mrg 	C mm5	| special constants
    176      1.1  mrg 	C mm6	|
    177      1.1  mrg 	C mm7	/
    178      1.1  mrg 
    179      1.1  mrg 	movq	-8(%eax,%ecx,8), %mm1	C next two limbs, walking from the top of src down
    180      1.1  mrg HAM(`	pxor	-8(%edx,%ecx,8), %mm1')
    181      1.1  mrg 	C Bit count by summing adjacent fields of 1, 2, 4, 8 and then 16 bits.
    182      1.1  mrg L(loaded):
    183      1.1  mrg 	movq	%mm1, %mm0
    184      1.1  mrg 	pand	REG_AAAAAAAAAAAAAAAA, %mm1
    185      1.1  mrg 
    186      1.1  mrg 	psrlq	$1, %mm1
    187      1.1  mrg HAM(`	nop			C code alignment')
    188      1.1  mrg 	C mm0 - mm1 leaves every 2-bit field holding the number of bits set in it
    189      1.1  mrg 	psubd	%mm1, %mm0	C bit pairs
    190      1.1  mrg HAM(`	nop			C code alignment')
    191      1.1  mrg 
    192      1.1  mrg 
    193      1.1  mrg 	movq	%mm0, %mm1
    194      1.1  mrg 	psrlq	$2, %mm0
    195      1.1  mrg 
    196      1.1  mrg 	pand	REG_3333333333333333, %mm0
    197      1.1  mrg 	pand	REG_3333333333333333, %mm1
    198      1.1  mrg 
    199      1.1  mrg 	paddd	%mm1, %mm0	C nibbles
    200      1.1  mrg 
    201      1.1  mrg 
    202      1.1  mrg 	movq	%mm0, %mm1
    203      1.1  mrg 	psrlq	$4, %mm0
    204      1.1  mrg 
    205      1.1  mrg 	pand	REG_0F0F0F0F0F0F0F0F, %mm0
    206      1.1  mrg 	pand	REG_0F0F0F0F0F0F0F0F, %mm1
    207      1.1  mrg 
    208      1.1  mrg 	paddd	%mm1, %mm0	C bytes
    209      1.1  mrg 
    210      1.1  mrg 	movq	%mm0, %mm1
    211      1.1  mrg 	psrlq	$8, %mm0
    212      1.1  mrg 
    213      1.1  mrg 
    214      1.1  mrg 	paddb	%mm1, %mm0	C words
    215      1.1  mrg 
    216      1.1  mrg 
    217      1.1  mrg 	movq	%mm0, %mm1
    218      1.1  mrg 	psrlq	$16, %mm0
    219      1.1  mrg 
    220      1.1  mrg 	paddd	%mm1, %mm0	C dwords
    221      1.1  mrg 
    222      1.1  mrg 	pand	REG_000000FF000000FF, %mm0	C keep only the low byte of each dword, which holds the count for that dword
    223      1.1  mrg 
    224      1.1  mrg 	paddd	%mm0, %mm2	C low to total
    225      1.1  mrg 	psrlq	$32, %mm0
    226      1.1  mrg 
    227      1.1  mrg 	paddd	%mm0, %mm2	C high to total
    228      1.1  mrg 	loop	L(top)		C decrement ecx and repeat until it reaches zero
    229      1.1  mrg 
    230      1.1  mrg 
    231      1.1  mrg 
    232      1.1  mrg 	movd	%mm2, %eax	C return the low dword of the total
    233      1.1  mrg 	emms_or_femms
    234      1.1  mrg 	ret
    235      1.1  mrg 
    236      1.1  mrg EPILOGUE()
    237