Home | History | Annotate | Line # | Download | only in mmx
popham.asm revision 1.1.1.1.8.1
      1 dnl  AMD K7 mpn_popcount, mpn_hamdist -- population count and hamming
      2 dnl  distance.
      3 
      4 dnl  Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
      5 dnl
      6 dnl  This file is part of the GNU MP Library.
      7 dnl
      8 dnl  The GNU MP Library is free software; you can redistribute it and/or
      9 dnl  modify it under the terms of the GNU Lesser General Public License as
     10 dnl  published by the Free Software Foundation; either version 3 of the
     11 dnl  License, or (at your option) any later version.
     12 dnl
     13 dnl  The GNU MP Library is distributed in the hope that it will be useful,
     14 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
     15 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     16 dnl  Lesser General Public License for more details.
     17 dnl
     18 dnl  You should have received a copy of the GNU Lesser General Public License
     19 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
     20 
     21 include(`../config.m4')
     22 
     23 
     24 C			     popcount	     hamdist
     25 C P3 generic			6.5		7
     26 C P3 model 9  (Banias)          5.7		6.1
     27 C P3 model 13 (Dothan)		5.75		6
     28 C K7				5		6
     29 
     30 C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
     31 C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
     32 C
     33 C The code here is almost certainly not optimal, but is already a 3x speedup
     34 C over the generic C code.  The main improvement would be to interleave
     35 C processing of two qwords in the loop so as to fully exploit the available
     36 C execution units, possibly leading to 3.25 c/l (13 cycles for 4 limbs).
     37 C
     38 C The loop is based on the example "Efficient 64-bit population count using
     39 C MMX instructions" in the Athlon Optimization Guide, AMD document 22007,
     40 C page 158 of rev E (reference in mpn/x86/k7/README).
     41 
     42 ifdef(`OPERATION_popcount',,
     43 `ifdef(`OPERATION_hamdist',,
     44 `m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
     45 ')')')
     46 
     47 define(HAM,
     48 m4_assert_numargs(1)
     49 `ifdef(`OPERATION_hamdist',`$1')')
     50 
     51 define(POP,
     52 m4_assert_numargs(1)
     53 `ifdef(`OPERATION_popcount',`$1')')
     54 
     55 HAM(`
     56 defframe(PARAM_SIZE,   12)
     57 defframe(PARAM_SRC2,   8)
     58 defframe(PARAM_SRC,    4)
     59 define(M4_function,mpn_hamdist)
     60 ')
     61 POP(`
     62 defframe(PARAM_SIZE,   8)
     63 defframe(PARAM_SRC,    4)
     64 define(M4_function,mpn_popcount)
     65 ')
     66 
     67 MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
     68 
     69 dnl  Non-PIC only: the 64-bit masks that the function loads with movq.
     70 ifdef(`PIC',,`
     71 	dnl  non-PIC
     72 
     73 	RODATA
     74 	ALIGN(8)
     75 	C Each mask is one qword written as two identical .long values.
     76 L(rodata_AAAAAAAAAAAAAAAA):
     77 	.long	0xAAAAAAAA
     78 	.long	0xAAAAAAAA
     79 
     80 L(rodata_3333333333333333):
     81 	.long	0x33333333
     82 	.long	0x33333333
     83 
     84 L(rodata_0F0F0F0F0F0F0F0F):
     85 	.long	0x0F0F0F0F
     86 	.long	0x0F0F0F0F
     87 ')
     88 
     89 	TEXT
     90 	ALIGN(32)
     91 
     92 PROLOGUE(M4_function)
     93 deflit(`FRAME',0)
     94 
     95 	movl	PARAM_SIZE, %ecx
     96 
     97 ifdef(`PIC',`
     98 	movl	$0xAAAAAAAA, %eax
     99 	movl	$0x33333333, %edx
    100 
    101 	movd	%eax, %mm7
    102 	movd	%edx, %mm6
    103 
    104 	movl	$0x0F0F0F0F, %eax
    105 
    106 	punpckldq %mm7, %mm7
    107 	punpckldq %mm6, %mm6
    108 
    109 	movd	%eax, %mm5
    110 	movd	%edx, %mm4
    111 
    112 	punpckldq %mm5, %mm5
    113 
    114 ',`
    115 	movq	L(rodata_AAAAAAAAAAAAAAAA), %mm7
    116 	movq	L(rodata_3333333333333333), %mm6
    117 	movq	L(rodata_0F0F0F0F0F0F0F0F), %mm5
    118 ')
    119 	pxor	%mm4, %mm4
    120 
    121 define(REG_AAAAAAAAAAAAAAAA,%mm7)
    122 define(REG_3333333333333333,%mm6)
    123 define(REG_0F0F0F0F0F0F0F0F,%mm5)
    124 define(REG_0000000000000000,%mm4)
    125 
    126 
    127 	movl	PARAM_SRC, %eax
    128 HAM(`	movl	PARAM_SRC2, %edx')
    129 
    130 	pxor	%mm2, %mm2	C total
    131 
    132 	shrl	%ecx
    133 	jnc	L(top)
    134 
    135 	movd	(%eax,%ecx,8), %mm1
    136 
    137 HAM(`	movd	(%edx,%ecx,8), %mm0
    138 	pxor	%mm0, %mm1
    139 ')
    140 	orl	%ecx, %ecx
    141 	jmp	L(loaded)
    142 
    143 
    144 	ALIGN(16)
    145 L(top):
    146 	C eax	src
    147 	C ebx
    148 	C ecx	counter, qwords, decrementing
    149 	C edx	[hamdist] src2
    150 	C
    151 	C mm0	(scratch)
    152 	C mm1	(scratch)
    153 	C mm2	total (low dword)
    154 	C mm3
    155 	C mm4	\
    156 	C mm5	| special constants
    157 	C mm6	|
    158 	C mm7	/
    159 
    160 	movq	-8(%eax,%ecx,8), %mm1
    161 
    162 HAM(`	pxor	-8(%edx,%ecx,8), %mm1')
    163 	decl	%ecx
    164 
    165 L(loaded):
    166 	movq	%mm1, %mm0
    167 	pand	REG_AAAAAAAAAAAAAAAA, %mm1
    168 
    169 	psrlq	$1, %mm1
    170 
    171 	psubd	%mm1, %mm0	C bit pairs
    172 
    173 
    174 	movq	%mm0, %mm1
    175 	psrlq	$2, %mm0
    176 
    177 	pand	REG_3333333333333333, %mm0
    178 	pand	REG_3333333333333333, %mm1
    179 
    180 	paddd	%mm1, %mm0	C nibbles
    181 
    182 
    183 	movq	%mm0, %mm1
    184 	psrlq	$4, %mm0
    185 
    186 	pand	REG_0F0F0F0F0F0F0F0F, %mm0
    187 	pand	REG_0F0F0F0F0F0F0F0F, %mm1
    188 
    189 	paddd	%mm1, %mm0	C bytes
    190 
    191 
    192 	psadbw(	%mm4, %mm0)
    193 
    194 	paddd	%mm0, %mm2	C add to total
    195 	jnz	L(top)
    196 
    197 
    198 	movd	%mm2, %eax
    199 	emms
    200 	ret
    201 
    202 EPILOGUE()
    203