Home | History | Annotate | Line # | Download | only in mmx
      1      1.1  mrg dnl  AMD K7 mpn_popcount, mpn_hamdist -- population count and hamming
      2      1.1  mrg dnl  distance.
      3      1.1  mrg 
      4  1.1.1.3  mrg dnl  Copyright 2000-2002 Free Software Foundation, Inc.
      5  1.1.1.3  mrg 
      6      1.1  mrg dnl  This file is part of the GNU MP Library.
      7      1.1  mrg dnl
      8  1.1.1.3  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      9  1.1.1.3  mrg dnl  it under the terms of either:
     10  1.1.1.3  mrg dnl
     11  1.1.1.3  mrg dnl    * the GNU Lesser General Public License as published by the Free
     12  1.1.1.3  mrg dnl      Software Foundation; either version 3 of the License, or (at your
     13  1.1.1.3  mrg dnl      option) any later version.
     14  1.1.1.3  mrg dnl
     15  1.1.1.3  mrg dnl  or
     16  1.1.1.3  mrg dnl
     17  1.1.1.3  mrg dnl    * the GNU General Public License as published by the Free Software
     18  1.1.1.3  mrg dnl      Foundation; either version 2 of the License, or (at your option) any
     19  1.1.1.3  mrg dnl      later version.
     20  1.1.1.3  mrg dnl
     21  1.1.1.3  mrg dnl  or both in parallel, as here.
     22  1.1.1.3  mrg dnl
     23  1.1.1.3  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     24  1.1.1.3  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     25  1.1.1.3  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     26  1.1.1.3  mrg dnl  for more details.
     27      1.1  mrg dnl
     28  1.1.1.3  mrg dnl  You should have received copies of the GNU General Public License and the
     29  1.1.1.3  mrg dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     30  1.1.1.3  mrg dnl  see https://www.gnu.org/licenses/.
     31      1.1  mrg 
     32      1.1  mrg include(`../config.m4')
     33      1.1  mrg 
     34      1.1  mrg 
     35      1.1  mrg C			     popcount	     hamdist
     36      1.1  mrg C P3 generic			6.5		7
     37  1.1.1.2  mrg C P3 model 9  (Banias)          5.7		6.1
     38      1.1  mrg C P3 model 13 (Dothan)		5.75		6
     39      1.1  mrg C K7				5		6
     40      1.1  mrg 
     41      1.1  mrg C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
     42      1.1  mrg C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
     43      1.1  mrg C
     44      1.1  mrg C The code here is almost certainly not optimal, but is already a 3x speedup
     45      1.1  mrg C over the generic C code.  The main improvement would be to interleave
     46      1.1  mrg C processing of two qwords in the loop so as to fully exploit the available
     47      1.1  mrg C execution units, possibly leading to 3.25 c/l (13 cycles for 4 limbs).
     48      1.1  mrg C
     49      1.1  mrg C The loop is based on the example "Efficient 64-bit population count using
     50      1.1  mrg C MMX instructions" in the Athlon Optimization Guide, AMD document 22007,
     51      1.1  mrg C page 158 of rev E (reference in mpn/x86/k7/README).
     52      1.1  mrg 
     53      1.1  mrg ifdef(`OPERATION_popcount',,
     54      1.1  mrg `ifdef(`OPERATION_hamdist',,
     55      1.1  mrg `m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
     56      1.1  mrg ')')')
     57      1.1  mrg 
     58      1.1  mrg define(HAM,
     59      1.1  mrg m4_assert_numargs(1)
     60      1.1  mrg `ifdef(`OPERATION_hamdist',`$1')')
     61      1.1  mrg 
     62      1.1  mrg define(POP,
     63      1.1  mrg m4_assert_numargs(1)
     64      1.1  mrg `ifdef(`OPERATION_popcount',`$1')')
     65      1.1  mrg 
     66      1.1  mrg HAM(`
     67      1.1  mrg defframe(PARAM_SIZE,   12)
     68      1.1  mrg defframe(PARAM_SRC2,   8)
     69      1.1  mrg defframe(PARAM_SRC,    4)
     70      1.1  mrg define(M4_function,mpn_hamdist)
     71      1.1  mrg ')
     72      1.1  mrg POP(`
     73      1.1  mrg defframe(PARAM_SIZE,   8)
     74      1.1  mrg defframe(PARAM_SRC,    4)
     75      1.1  mrg define(M4_function,mpn_popcount)
     76      1.1  mrg ')
     77      1.1  mrg 
     78      1.1  mrg MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
     79      1.1  mrg 
     80      1.1  mrg 
     81      1.1  mrg ifdef(`PIC',,`
     82      1.1  mrg 	dnl  non-PIC
     83      1.1  mrg 
     84      1.1  mrg 	RODATA
     85      1.1  mrg 	ALIGN(8)
     86      1.1  mrg 
     87      1.1  mrg L(rodata_AAAAAAAAAAAAAAAA):
     88      1.1  mrg 	.long	0xAAAAAAAA
     89      1.1  mrg 	.long	0xAAAAAAAA
     90      1.1  mrg 
     91      1.1  mrg L(rodata_3333333333333333):
     92      1.1  mrg 	.long	0x33333333
     93      1.1  mrg 	.long	0x33333333
     94      1.1  mrg 
     95      1.1  mrg L(rodata_0F0F0F0F0F0F0F0F):
     96      1.1  mrg 	.long	0x0F0F0F0F
     97      1.1  mrg 	.long	0x0F0F0F0F
     98      1.1  mrg ')
     99      1.1  mrg 
    100      1.1  mrg 	TEXT
    101      1.1  mrg 	ALIGN(32)
    102      1.1  mrg 
    103      1.1  mrg PROLOGUE(M4_function)
    104      1.1  mrg deflit(`FRAME',0)
    105      1.1  mrg 
    106      1.1  mrg 	movl	PARAM_SIZE, %ecx
    107      1.1  mrg 
    108      1.1  mrg ifdef(`PIC',`
    109      1.1  mrg 	movl	$0xAAAAAAAA, %eax
    110      1.1  mrg 	movl	$0x33333333, %edx
    111      1.1  mrg 
    112      1.1  mrg 	movd	%eax, %mm7
    113      1.1  mrg 	movd	%edx, %mm6
    114      1.1  mrg 
    115      1.1  mrg 	movl	$0x0F0F0F0F, %eax
    116      1.1  mrg 
    117      1.1  mrg 	punpckldq %mm7, %mm7
    118      1.1  mrg 	punpckldq %mm6, %mm6
    119      1.1  mrg 
    120      1.1  mrg 	movd	%eax, %mm5
    121      1.1  mrg 	movd	%edx, %mm4
    122      1.1  mrg 
    123      1.1  mrg 	punpckldq %mm5, %mm5
    124      1.1  mrg 
    125      1.1  mrg ',`
    126      1.1  mrg 	movq	L(rodata_AAAAAAAAAAAAAAAA), %mm7
    127      1.1  mrg 	movq	L(rodata_3333333333333333), %mm6
    128      1.1  mrg 	movq	L(rodata_0F0F0F0F0F0F0F0F), %mm5
    129      1.1  mrg ')
    130      1.1  mrg 	pxor	%mm4, %mm4
    131      1.1  mrg 
    132      1.1  mrg define(REG_AAAAAAAAAAAAAAAA,%mm7)
    133      1.1  mrg define(REG_3333333333333333,%mm6)
    134      1.1  mrg define(REG_0F0F0F0F0F0F0F0F,%mm5)
    135      1.1  mrg define(REG_0000000000000000,%mm4)
    136      1.1  mrg 
    137      1.1  mrg 
    138      1.1  mrg 	movl	PARAM_SRC, %eax
    139      1.1  mrg HAM(`	movl	PARAM_SRC2, %edx')
    140      1.1  mrg 
    141      1.1  mrg 	pxor	%mm2, %mm2	C total
    142      1.1  mrg 
    143      1.1  mrg 	shrl	%ecx
    144      1.1  mrg 	jnc	L(top)
    145      1.1  mrg 
    146      1.1  mrg 	movd	(%eax,%ecx,8), %mm1
    147      1.1  mrg 
    148      1.1  mrg HAM(`	movd	(%edx,%ecx,8), %mm0
    149      1.1  mrg 	pxor	%mm0, %mm1
    150      1.1  mrg ')
    151      1.1  mrg 	orl	%ecx, %ecx
    152      1.1  mrg 	jmp	L(loaded)
    153      1.1  mrg 
    154      1.1  mrg 
    155      1.1  mrg 	ALIGN(16)
    156      1.1  mrg L(top):
    157      1.1  mrg 	C eax	src
    158      1.1  mrg 	C ebx
    159      1.1  mrg 	C ecx	counter, qwords, decrementing
    160      1.1  mrg 	C edx	[hamdist] src2
    161      1.1  mrg 	C
    162      1.1  mrg 	C mm0	(scratch)
    163      1.1  mrg 	C mm1	(scratch)
    164      1.1  mrg 	C mm2	total (low dword)
    165      1.1  mrg 	C mm3
    166      1.1  mrg 	C mm4	\
    167      1.1  mrg 	C mm5	| special constants
    168      1.1  mrg 	C mm6	|
    169      1.1  mrg 	C mm7	/
    170      1.1  mrg 
    171      1.1  mrg 	movq	-8(%eax,%ecx,8), %mm1
    172      1.1  mrg 
    173      1.1  mrg HAM(`	pxor	-8(%edx,%ecx,8), %mm1')
    174      1.1  mrg 	decl	%ecx
    175      1.1  mrg 
    176      1.1  mrg L(loaded):
    177      1.1  mrg 	movq	%mm1, %mm0
    178      1.1  mrg 	pand	REG_AAAAAAAAAAAAAAAA, %mm1
    179      1.1  mrg 
    180      1.1  mrg 	psrlq	$1, %mm1
    181      1.1  mrg 
    182      1.1  mrg 	psubd	%mm1, %mm0	C bit pairs
    183      1.1  mrg 
    184      1.1  mrg 
    185      1.1  mrg 	movq	%mm0, %mm1
    186      1.1  mrg 	psrlq	$2, %mm0
    187      1.1  mrg 
    188      1.1  mrg 	pand	REG_3333333333333333, %mm0
    189      1.1  mrg 	pand	REG_3333333333333333, %mm1
    190      1.1  mrg 
    191      1.1  mrg 	paddd	%mm1, %mm0	C nibbles
    192      1.1  mrg 
    193      1.1  mrg 
    194      1.1  mrg 	movq	%mm0, %mm1
    195      1.1  mrg 	psrlq	$4, %mm0
    196      1.1  mrg 
    197      1.1  mrg 	pand	REG_0F0F0F0F0F0F0F0F, %mm0
    198      1.1  mrg 	pand	REG_0F0F0F0F0F0F0F0F, %mm1
    199      1.1  mrg 
    200      1.1  mrg 	paddd	%mm1, %mm0	C bytes
    201      1.1  mrg 
    202      1.1  mrg 
    203      1.1  mrg 	psadbw(	%mm4, %mm0)
    204      1.1  mrg 
    205      1.1  mrg 	paddd	%mm0, %mm2	C add to total
    206      1.1  mrg 	jnz	L(top)
    207      1.1  mrg 
    208      1.1  mrg 
    209      1.1  mrg 	movd	%mm2, %eax
    210      1.1  mrg 	emms
    211      1.1  mrg 	ret
    212      1.1  mrg 
    213      1.1  mrg EPILOGUE()
    214