Home | History | Annotate | Line # | Download | only in mmx
popham.asm revision 1.1
      1 dnl  Intel Pentium 4 mpn_popcount, mpn_hamdist -- population count and
      2 dnl  hamming distance.
      3 
      4 dnl  Copyright 2000, 2001, 2002, 2007 Free Software Foundation, Inc.
      5 dnl
      6 dnl  This file is part of the GNU MP Library.
      7 dnl
      8 dnl  The GNU MP Library is free software; you can redistribute it and/or
      9 dnl  modify it under the terms of the GNU Lesser General Public License as
     10 dnl  published by the Free Software Foundation; either version 3 of the
     11 dnl  License, or (at your option) any later version.
     12 dnl
     13 dnl  The GNU MP Library is distributed in the hope that it will be useful,
     14 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
     15 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     16 dnl  Lesser General Public License for more details.
     17 dnl
     18 dnl  You should have received a copy of the GNU Lesser General Public License
     19 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
     20 
     21 include(`../config.m4')
     22 
     23 
     24 C			     popcount	     hamdist
     25 C P3 model 9  (Banias)		?		?
     26 C P3 model 13 (Dothan)		6		6
     27 C P4 model 0  (Willamette)
     28 C P4 model 1  (?)
     29 C P4 model 2  (Northwood)	8		9
     30 C P4 model 3  (Prescott)	8		9
     31 C P4 model 4  (Nocona)
     32 
     33 C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
     34 C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
     35 C
     36 C Loading with unaligned movq's costs an extra 1 c/l and hence is avoided.
     37 C Two movd's and a punpckldq seems to be the same speed as an aligned movq,
     38 C and using them saves fiddling about with alignment testing on entry.
     39 C
     40 C For popcount there's 13 mmx instructions in the loop, so perhaps 6.5 c/l
     41 C might be possible, but 8 c/l relying on out-of-order execution is already
     42 C quite reasonable.
     43 
        dnl  Guard: when neither OPERATION_popcount nor OPERATION_hamdist is
        dnl  defined, m4_error is invoked with the message below.
     44 ifdef(`OPERATION_popcount',,
     45 `ifdef(`OPERATION_hamdist',,
     46 `m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
     47 ')')')
     48 
        dnl  Usage: HAM(`code')
        dnl
        dnl  Expands to its argument when OPERATION_hamdist is defined and to
        dnl  nothing otherwise, so hamdist-only lines can sit inline in the
        dnl  shared code.  m4_assert_numargs is defined outside this file;
        dnl  presumably it checks that exactly one argument was given.
     49 define(HAM,
     50 m4_assert_numargs(1)
     51 `ifdef(`OPERATION_hamdist',`$1')')
     52 
        dnl  Usage: POP(`code')
        dnl
        dnl  Expands to its argument when OPERATION_popcount is defined and to
        dnl  nothing otherwise; the popcount counterpart of HAM.
        dnl  m4_assert_numargs is defined outside this file; presumably it
        dnl  checks that exactly one argument was given.
     53 define(POP,
     54 m4_assert_numargs(1)
     55 `ifdef(`OPERATION_popcount',`$1')')
     56 
        C Per-operation setup: the PARAM_xxx names handed to defframe and the
        C M4_function name later given to PROLOGUE.  The numbers follow the
        C argument order of the prototypes above (src first, size last).
        C defframe is defined outside this file; presumably the second argument
        C is a byte offset into the incoming stack frame -- confirm.
        C
        C hamdist: src, src2, size
     57 HAM(`
     58 defframe(PARAM_SIZE, 12)
     59 defframe(PARAM_SRC2,  8)
     60 defframe(PARAM_SRC,   4)
     61 define(M4_function,mpn_hamdist)
     62 ')
        C popcount: src, size
     63 POP(`
     64 defframe(PARAM_SIZE,  8)
     65 defframe(PARAM_SRC,   4)
     66 define(M4_function,mpn_popcount)
     67 ')
     68 
        C Lists both entry points this one source can be built as; which one is
        C actually emitted depends on the OPERATION_xxx define tested above.
        C MULFUNC_PROLOGUE is defined outside this file.
     69 MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
     70 
     71 
     72 ifdef(`PIC',,`
     73 	dnl  non-PIC
        	dnl  Bit masks for the three reduction steps of the count.  Each is
        	dnl  stored as two identical .long values so the entry code can
        	dnl  fetch all 8 bytes with one movq.  PIC builds omit this section
        	dnl  and form the same values in registers instead.
     74 	RODATA
     75 	ALIGN(8)
        	dnl  odd-numbered bits, i.e. the high bit of every 2-bit field
     76 L(rodata_AAAAAAAAAAAAAAAA):
     77 	.long	0xAAAAAAAA
     78 	.long	0xAAAAAAAA
        	dnl  low two bits of every nibble
     79 L(rodata_3333333333333333):
     80 	.long	0x33333333
     81 	.long	0x33333333
        	dnl  low nibble of every byte
     82 L(rodata_0F0F0F0F0F0F0F0F):
     83 	.long	0x0F0F0F0F
     84 	.long	0x0F0F0F0F
     85 ')
     86 
     87 	TEXT
     88 	ALIGN(16)
     89 
        C M4_function expands to mpn_popcount or mpn_hamdist, as set by the
        C POP/HAM blocks earlier in the file.
        C
        C In:   PARAM_SRC  = limb pointer, PARAM_SIZE = limb count,
        C       PARAM_SRC2 = second limb pointer (hamdist only).
        C Out:  eax = low dword of the running total kept in mm0.
        C Uses: eax, ecx, mm0-mm2, mm4-mm7 in every build; edx and mm3 in
        C       hamdist builds; edx also as a scratch in PIC builds.  emms is
        C       executed before returning.
        C
        C Limbs are handled as 32-bit values, two per loop pass, with an odd
        C final limb going through L(last).
        C
        C NOTE(review): with size 0 the subl below borrows, ja is not taken and
        C L(last) loads the limb at src-4; presumably callers always pass
        C size >= 1 -- confirm.
     90 PROLOGUE(M4_function)
     91 deflit(`FRAME',0)
     92 
     93 	movl	PARAM_SIZE, %ecx
     94 	movl	PARAM_SRC, %eax
     95 
     96 ifdef(`PIC',`
        	C PIC: form each 64-bit mask by moving the 32-bit pattern through
        	C edx and duplicating it into both halves with punpckldq, so no
        	C data address is referenced.
     97 	movl	$0xAAAAAAAA, %edx
     98 	movd	%edx, %mm7
     99 	punpckldq %mm7, %mm7
    100 
    101 	movl	$0x33333333, %edx
    102 	movd	%edx, %mm6
    103 	punpckldq %mm6, %mm6
    104 
    105 	movl	$0x0F0F0F0F, %edx
    106 	movd	%edx, %mm5
    107 	punpckldq %mm5, %mm5
    108 
        	C edx was the scratch above, so src2 is loaded only after the masks
    109 HAM(`	movl	PARAM_SRC2, %edx')
    110 
    111 ',`
    112 	dnl non-PIC
        	C non-PIC: the masks are read directly from the RODATA constants
    113 HAM(`	movl	PARAM_SRC2, %edx')
    114 	movq	L(rodata_AAAAAAAAAAAAAAAA), %mm7
    115 	movq	L(rodata_3333333333333333), %mm6
    116 	movq	L(rodata_0F0F0F0F0F0F0F0F), %mm5
    117 ')
    118 
    119 	pxor	%mm4, %mm4		C zero
    120 	pxor	%mm0, %mm0		C total
    121 
        	C ecx = size-1.  ja is taken only when the subtract neither borrowed
        	C nor gave zero, i.e. size >= 2; a single limb falls into L(last).
    122 	subl	$1, %ecx
    123 	ja	L(top)
    124 
        	C One limb at eax + 4*ecx.  Reached by fall-through on entry, or from
        	C the loop tail with ecx == 0 once eax (and edx for hamdist) point
        	C past the pairs already counted.  movd clears the high half of mm1,
        	C so the shared code at L(loaded) adds nothing for the absent second
        	C limb.
    125 L(last):
    126 	movd	(%eax,%ecx,4), %mm1		C src high limb
        	C hamdist: XOR with the matching src2 limb so set bits mark differences
    127 HAM(`	movd	(%edx,%ecx,4), %mm2
    128 	pxor	%mm2, %mm1
    129 ')
    130 	jmp	L(loaded)
    131 
    132 
    133 L(top):
    134 	C eax	src
    135 	C ebx
    136 	C ecx	counter, size-1 to 2 or 1, inclusive
    137 	C edx	[hamdist] src2
    138 	C
    139 	C mm0	total (low dword)
    140 	C mm1	(scratch)
    141 	C mm2	(scratch)
    142 	C mm3	[hamdist] (scratch)
    143 	C mm4	0x0000000000000000
    144 	C mm5	0x0F0F0F0F0F0F0F0F
    145 	C mm6	0x3333333333333333
    146 	C mm7	0xAAAAAAAAAAAAAAAA
    147 
        	C Fetch two limbs with separate movd loads and pack them into one
        	C qword (first limb in the low half), then step src past them.
    148 	movd	(%eax), %mm1
    149 	movd	4(%eax), %mm2
    150 	punpckldq %mm2, %mm1
    151 	addl	$8, %eax
    152 
        	C hamdist: same two-limb fetch from src2, XORed into mm1
    153 HAM(`	movd	(%edx), %mm2
    154 	movd	4(%edx), %mm3
    155 	punpckldq %mm3, %mm2
    156 	pxor	%mm2, %mm1
    157 	addl	$8, %edx
    158 ')
    159 
        	C mm1 holds the bits to count.  Reduce in three steps, each doubling
        	C the field width: 1-bit -> 2-bit -> 4-bit -> 8-bit counts.
    160 L(loaded):
        	C x -= (x & 0xAA..) >> 1 leaves each 2-bit field holding the number
        	C of set bits it had.
    161 	movq	%mm7, %mm2
    162 	pand	%mm1, %mm2
    163 	psrlq	$1, %mm2
    164 	psubd	%mm2, %mm1	C bit pairs
    165 
        	C (x & 0x33..) + ((x >> 2) & 0x33..): add adjacent 2-bit counts
    166 	movq	%mm6, %mm2
    167 	pand	%mm1, %mm2
    168 	psrlq	$2, %mm1
    169 	pand	%mm6, %mm1
    170 	paddd	%mm2, %mm1	C nibbles
    171 
        	C (x & 0x0F..) + ((x >> 4) & 0x0F..): add adjacent nibble counts
    172 	movq	%mm5, %mm2
    173 	pand	%mm1, %mm2
    174 	psrlq	$4, %mm1
    175 	pand	%mm5, %mm1
    176 	paddd	%mm2, %mm1	C bytes
    177 
        	C Sum of absolute differences against the zero in mm4 adds the eight
        	C byte counts into the low word of mm1.  psadbw is written as an m4
        	C macro call here; the macro is defined outside this file.
    178 	psadbw(	%mm4, %mm1)
    179 	paddd	%mm1, %mm0	C to total
    180 
        	C Two limbs consumed.  ecx > 0 means at least two more remain.
    181 	subl	$2, %ecx
    182 	jg	L(top)
    183 
    184 	C ecx is 0 or -1 representing respectively 1 or 0 further limbs
        	C (after a pass through L(last) it is -2, which also falls through)
    185 	jz	L(last)
    186 
    187 
        	C Return the low dword of the total and leave MMX state via emms
    188 	movd	%mm0, %eax
    189 	emms
    190 	ret
    191 
    192 EPILOGUE()
    193