Home | History | Annotate | Line # | Download | only in mmx
popham.asm revision 1.1
      1 dnl  AMD K6-2 mpn_popcount, mpn_hamdist -- mpn bit population count and
      2 dnl  hamming distance.
      3 
      4 dnl  Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
      5 dnl
      6 dnl  This file is part of the GNU MP Library.
      7 dnl
      8 dnl  The GNU MP Library is free software; you can redistribute it and/or
      9 dnl  modify it under the terms of the GNU Lesser General Public License as
     10 dnl  published by the Free Software Foundation; either version 3 of the
     11 dnl  License, or (at your option) any later version.
     12 dnl
     13 dnl  The GNU MP Library is distributed in the hope that it will be useful,
     14 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
     15 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     16 dnl  Lesser General Public License for more details.
     17 dnl
     18 dnl  You should have received a copy of the GNU Lesser General Public License
     19 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
     20 
     21 include(`../config.m4')
     22 
     23 
     24 C        popcount  hamdist
     25 C K6-2:    9.0       11.5   cycles/limb
     26 C K6:      12.5      13.0
     27 
     28 
     29 C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
     30 C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
     31 C
     32 C The code here isn't optimal, but it's already a 2x speedup over the plain
     33 C integer mpn/generic/popcount.c,hamdist.c.
     34 
     35 
     36 ifdef(`OPERATION_popcount',,
     37 `ifdef(`OPERATION_hamdist',,
     38 `m4_error(`Need OPERATION_popcount or OPERATION_hamdist
     39 ')m4exit(1)')')	C stop m4 with exit status 1 when neither OPERATION_ symbol is defined
     40 
     41 define(HAM,
     42 m4_assert_numargs(1)
     43 `ifdef(`OPERATION_hamdist',`$1')')	C HAM(text): text is emitted only when OPERATION_hamdist is defined
     44 
     45 define(POP,
     46 m4_assert_numargs(1)
     47 `ifdef(`OPERATION_popcount',`$1')')	C POP(text): text is emitted only when OPERATION_popcount is defined
     48 
     49 HAM(`
     50 defframe(PARAM_SIZE,   12)	C size: third argument of mpn_hamdist
     51 defframe(PARAM_SRC2,   8)	C src2: second argument
     52 defframe(PARAM_SRC,    4)	C src: first argument
     53 define(M4_function,mpn_hamdist)	C function name handed to PROLOGUE further down
     54 ')
     55 POP(`
     56 defframe(PARAM_SIZE,   8)	C size: second argument of mpn_popcount
     57 defframe(PARAM_SRC,    4)	C src: first argument
     58 define(M4_function,mpn_popcount)	C function name handed to PROLOGUE further down
     59 ')
     60 C Names both functions this one source can be built as.  NOTE(review): the macro is defined outside this file and the build may read the next line textually, so it is left free of trailing text -- confirm.
     61 MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
     62 
     63 
     64 ifdef(`PIC',,`
     65 	dnl  non-PIC
     66 	C 64-bit mask constants for the bit counting steps - each one is stored as two equal dwords and fetched with a single movq
     67 	RODATA
     68 	ALIGN(8)
     69 	C selects the upper bit of every 2-bit pair
     70 L(rodata_AAAAAAAAAAAAAAAA):
     71 	.long	0xAAAAAAAA
     72 	.long	0xAAAAAAAA
     73 	C selects the low two bits of every nibble
     74 L(rodata_3333333333333333):
     75 	.long	0x33333333
     76 	.long	0x33333333
     77 	C selects the low nibble of every byte
     78 L(rodata_0F0F0F0F0F0F0F0F):
     79 	.long	0x0F0F0F0F
     80 	.long	0x0F0F0F0F
     81 	C selects the low byte of every dword
     82 L(rodata_000000FF000000FF):
     83 	.long	0x000000FF
     84 	.long	0x000000FF
     85 ')
     86 
     87 	TEXT
     88 	ALIGN(32)
     89 
     90 POP(`ifdef(`PIC', `
     91 	C avoid shrl crossing a 32-byte boundary
     92 	nop')')
     93 
     94 PROLOGUE(M4_function)
     95 deflit(`FRAME',0)
     96 C NOTE(review): size = 0 is not special-cased; ecx would reach the loop as 0 and wrap around.  Presumably callers guarantee size >= 1 -- confirm.
     97 	movl	PARAM_SIZE, %ecx	C ecx = size, counted in 4-byte limbs
     98 C Put the four mask constants in mm7, mm6, mm5 and mm4.
     99 ifdef(`PIC',`
    100 	movl	$0xAAAAAAAA, %eax	C PIC build: make the constants from immediates
    101 	movl	$0x33333333, %edx
    102 
    103 	movd	%eax, %mm7
    104 	movd	%edx, %mm6
    105 
    106 	movl	$0x0F0F0F0F, %eax
    107 	movl	$0x000000FF, %edx
    108 
    109 	punpckldq %mm7, %mm7	C copy the low dword into the high dword
    110 	punpckldq %mm6, %mm6
    111 
    112 	movd	%eax, %mm5
    113 	movd	%edx, %mm4
    114 
    115 	punpckldq %mm5, %mm5
    116 	punpckldq %mm4, %mm4
    117 ',`
    118 	C non-PIC build: fetch the ready-made constants from the read-only table
    119 	movq	L(rodata_AAAAAAAAAAAAAAAA), %mm7
    120 	movq	L(rodata_3333333333333333), %mm6
    121 	movq	L(rodata_0F0F0F0F0F0F0F0F), %mm5
    122 	movq	L(rodata_000000FF000000FF), %mm4
    123 ')
    124 C m4 names for the constant registers, used in the loop body.
    125 define(REG_AAAAAAAAAAAAAAAA, %mm7)
    126 define(REG_3333333333333333, %mm6)
    127 define(REG_0F0F0F0F0F0F0F0F, %mm5)
    128 define(REG_000000FF000000FF, %mm4)
    129 
    130 
    131 	movl	PARAM_SRC, %eax	C eax = src
    132 HAM(`	movl	PARAM_SRC2, %edx')	C edx = src2 [hamdist only]
    133 
    134 	pxor	%mm2, %mm2	C total
    135 C ecx = size / 2 = number of qwords; carry set means one limb is left over.
    136 	shrl	%ecx
    137 	jnc	L(top)	C no leftover limb: go straight to the qword loop
    138 C Leftover limb: ecx*8 is its byte offset, i.e. it is the last limb of the operand; movd loads just those 32 bits.
    139 Zdisp(	movd,	0,(%eax,%ecx,8), %mm1)
    140 C NOTE(review): Zdisp is defined outside this file; presumably it controls how the zero displacement is encoded -- confirm.
    141 HAM(`
    142 Zdisp(	movd,	0,(%edx,%ecx,8), %mm0)
    143 	pxor	%mm0, %mm1	C mm1 = src limb xor src2 limb
    144 ')
    145 C Count the leftover limb as one extra iteration and enter the loop just past its load.
    146 	incl	%ecx
    147 	jmp	L(loaded)
    148 
    149 
    150 	ALIGN(16)
    151 POP(`	nop	C alignment to avoid crossing 32-byte boundaries')
    152 
    153 L(top):
    154 	C eax	src
    155 	C ebx
    156 	C ecx	counter, qwords, decrementing
    157 	C edx	[hamdist] src2
    158 	C
    159 	C mm0	(scratch)
    160 	C mm1	(scratch)
    161 	C mm2	total (low dword)
    162 	C mm3
    163 	C mm4	\
    164 	C mm5	| special constants
    165 	C mm6	|
    166 	C mm7	/
    167 	C fetch qword number ecx-1, so the operand is walked from its high end downwards
    168 	movq	-8(%eax,%ecx,8), %mm1
    169 HAM(`	pxor	-8(%edx,%ecx,8), %mm1')	C hamdist counts the set bits of src xor src2
    170 
    171 L(loaded):
    172 	movq	%mm1, %mm0
    173 	pand	REG_AAAAAAAAAAAAAAAA, %mm1	C keep the upper bit of every 2-bit pair
    174 
    175 	psrlq	$1, %mm1
    176 HAM(`	nop			C code alignment')
    177 	C x minus its shifted-down upper bits: each 2-bit field now holds its own bit count, 0..2
    178 	psubd	%mm1, %mm0	C bit pairs
    179 HAM(`	nop			C code alignment')
    180 
    181 	C sum adjacent 2-bit fields: each nibble now holds 0..4
    182 	movq	%mm0, %mm1
    183 	psrlq	$2, %mm0
    184 
    185 	pand	REG_3333333333333333, %mm0
    186 	pand	REG_3333333333333333, %mm1
    187 
    188 	paddd	%mm1, %mm0	C nibbles
    189 
    190 	C sum adjacent nibbles: each byte now holds 0..8
    191 	movq	%mm0, %mm1
    192 	psrlq	$4, %mm0
    193 
    194 	pand	REG_0F0F0F0F0F0F0F0F, %mm0
    195 	pand	REG_0F0F0F0F0F0F0F0F, %mm1
    196 
    197 	paddd	%mm1, %mm0	C bytes
    198 	C sum adjacent bytes with a bytewise add: the low byte of each 16-bit word holds 0..16
    199 	movq	%mm0, %mm1
    200 	psrlq	$8, %mm0
    201 
    202 
    203 	paddb	%mm1, %mm0	C words
    204 
    205 	C sum adjacent words: the low byte of each dword holds 0..32, the other bytes are junk
    206 	movq	%mm0, %mm1
    207 	psrlq	$16, %mm0
    208 
    209 	paddd	%mm1, %mm0	C dwords
    210 	C mask away the junk bytes, leaving one clean count per dword
    211 	pand	REG_000000FF000000FF, %mm0
    212 
    213 	paddd	%mm0, %mm2	C low to total
    214 	psrlq	$32, %mm0	C bring the high dword count down to the low dword
    215 
    216 	paddd	%mm0, %mm2	C high to total
    217 	loop	L(top)	C decrement ecx and repeat until it reaches zero
    218 
    219 
    220 	C the result is the low dword of the running total
    221 	movd	%mm2, %eax
    222 	emms_or_femms
    223 	ret
    224 
    225 EPILOGUE()
    226