Home | History | Annotate | Line # | Download | only in mmx
popham.asm revision 1.1.1.2
      1 dnl  Intel Pentium 4 mpn_popcount, mpn_hamdist -- population count and
      2 dnl  hamming distance.
      3 
      4 dnl  Copyright 2000-2002, 2007 Free Software Foundation, Inc.
      5 
      6 dnl  This file is part of the GNU MP Library.
      7 dnl
      8 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      9 dnl  it under the terms of either:
     10 dnl
     11 dnl    * the GNU Lesser General Public License as published by the Free
     12 dnl      Software Foundation; either version 3 of the License, or (at your
     13 dnl      option) any later version.
     14 dnl
     15 dnl  or
     16 dnl
     17 dnl    * the GNU General Public License as published by the Free Software
     18 dnl      Foundation; either version 2 of the License, or (at your option) any
     19 dnl      later version.
     20 dnl
     21 dnl  or both in parallel, as here.
     22 dnl
     23 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     24 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     25 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     26 dnl  for more details.
     27 dnl
     28 dnl  You should have received copies of the GNU General Public License and the
     29 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     30 dnl  see https://www.gnu.org/licenses/.
     31 
     32 include(`../config.m4')
     33 
     34 
     35 C			     popcount	     hamdist
     36 C P3 model 9  (Banias)		?		?
     37 C P3 model 13 (Dothan)		6		6
     38 C P4 model 0  (Willamette)
     39 C P4 model 1  (?)
     40 C P4 model 2  (Northwood)	8		9
     41 C P4 model 3  (Prescott)	8		9
     42 C P4 model 4  (Nocona)
     43 
     44 C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
     45 C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
     46 C
     47 C Loading with unaligned movq's costs an extra 1 c/l and hence is avoided.
     48 C Two movd's and a punpckldq seems to be the same speed as an aligned movq,
     49 C and using them saves fiddling about with alignment testing on entry.
     50 C
     51 C For popcount there are 13 MMX instructions in the loop, so perhaps 6.5 c/l
     52 C might be possible, but 8 c/l relying on out-of-order execution is already
     53 C quite reasonable.
     54 
        C Build-flavour check: m4_error is invoked unless OPERATION_popcount or
        C OPERATION_hamdist is defined.
     55 ifdef(`OPERATION_popcount',,
     56 `ifdef(`OPERATION_hamdist',,
     57 `m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
     58 ')')')
     59 
        C HAM(text) expands to text when OPERATION_hamdist is defined and to
        C nothing otherwise.  It takes exactly one argument.
     60 define(HAM,
     61 m4_assert_numargs(1)
     62 `ifdef(`OPERATION_hamdist',`$1')')
     63 
        C POP(text) is the same selector keyed on OPERATION_popcount.
     64 define(POP,
     65 m4_assert_numargs(1)
     66 `ifdef(`OPERATION_popcount',`$1')')
     67 
        C Per-operation argument slots and the name of the function being built.
        C M4_function is what the PROLOGUE line further down is given.
        C NOTE(review): defframe is defined elsewhere; the numbers look like byte
        C offsets of the stacked arguments (src first, size last) -- confirm.
     68 HAM(`
     69 defframe(PARAM_SIZE, 12)
     70 defframe(PARAM_SRC2,  8)
     71 defframe(PARAM_SRC,   4)
     72 define(M4_function,mpn_hamdist)
     73 ')
     74 POP(`
     75 defframe(PARAM_SIZE,  8)
     76 defframe(PARAM_SRC,   4)
     77 define(M4_function,mpn_popcount)
     78 ')
     79 
     80 MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
        C The line above names both functions this one source can be built as.
        C NOTE(review): the macro is defined elsewhere and the line may be parsed
        C textually by the build, so it is deliberately left without a trailing
        C comment -- confirm before reformatting it.
     81 
     82 
        C Non-PIC builds only: the three 64-bit masks used by the bit-counting
        C steps, placed under ALIGN(8) in read-only data and fetched with one movq
        C each by the function below.  When PIC is defined nothing is emitted
        C here and the function builds the same masks in registers instead.
     83 ifdef(`PIC',,`
     84 	dnl  non-PIC
     85 	RODATA
     86 	ALIGN(8)
     87 L(rodata_AAAAAAAAAAAAAAAA):	C selects the high bit of every bit pair
     88 	.long	0xAAAAAAAA	C two dwords make one 64-bit mask
     89 	.long	0xAAAAAAAA
     90 L(rodata_3333333333333333):	C selects the low two bits of every nibble
     91 	.long	0x33333333
     92 	.long	0x33333333
     93 L(rodata_0F0F0F0F0F0F0F0F):	C selects the low nibble of every byte
     94 	.long	0x0F0F0F0F
     95 	.long	0x0F0F0F0F
     96 ')
     97 
     98 	TEXT
     99 	ALIGN(16)
    100 
        C M4_function is mpn_popcount or mpn_hamdist, as selected by the HAM/POP
        C blocks earlier in the file.
        C
        C In:   PARAM_SRC, PARAM_SIZE and, for hamdist only, PARAM_SRC2
        C Out:  eax = number of set bits in {src,size} (popcount), or in
        C       {src,size} xor {src2,size} (hamdist); taken from the low dword
        C       of the mm0 running total
        C Uses: eax, ecx, edx (hamdist, and as a temporary in PIC builds),
        C       mm0-mm7; emms is executed before returning
        C
        C Shape: limbs are consumed two at a time in L(top); an odd final limb
        C (or the only limb when size is 1) is loaded at L(last) and joins the
        C common counting code at L(loaded).
        C
        C NOTE(review): size is presumably at least 1.  With size 0 the ja after
        C the subl is not taken and L(last) would load the limb at src-4 --
        C confirm that callers never pass 0.
    101 PROLOGUE(M4_function)
    102 deflit(`FRAME',0)
    103 
    104 	movl	PARAM_SIZE, %ecx	C ecx = size
    105 	movl	PARAM_SRC, %eax		C eax = src
    106 
    107 ifdef(`PIC',`
    108 	movl	$0xAAAAAAAA, %edx	C PIC: build each mask in a register via edx
    109 	movd	%edx, %mm7
    110 	punpckldq %mm7, %mm7		C mm7 = 0xAAAAAAAAAAAAAAAA
    111 
    112 	movl	$0x33333333, %edx
    113 	movd	%edx, %mm6
    114 	punpckldq %mm6, %mm6		C mm6 = 0x3333333333333333
    115 
    116 	movl	$0x0F0F0F0F, %edx
    117 	movd	%edx, %mm5
    118 	punpckldq %mm5, %mm5		C mm5 = 0x0F0F0F0F0F0F0F0F
    119 
    120 HAM(`	movl	PARAM_SRC2, %edx')	C edx = src2, loaded only after edx is done as a temporary
    121 
    122 ',`
    123 	dnl non-PIC
    124 HAM(`	movl	PARAM_SRC2, %edx')	C edx = src2
    125 	movq	L(rodata_AAAAAAAAAAAAAAAA), %mm7	C masks come from read-only data
    126 	movq	L(rodata_3333333333333333), %mm6
    127 	movq	L(rodata_0F0F0F0F0F0F0F0F), %mm5
    128 ')
    129 
    130 	pxor	%mm4, %mm4		C zero
    131 	pxor	%mm0, %mm0		C total
    132 
    133 	subl	$1, %ecx		C ecx = size-1
    134 	ja	L(top)			C size-1 above 0: at least two limbs, use the loop
    135 
    136 L(last):				C one limb left; ecx is 0 here for any size of 1 or more
    137 	movd	(%eax,%ecx,4), %mm1		C src high limb
    138 HAM(`	movd	(%edx,%ecx,4), %mm2	C src2 high limb
    139 	pxor	%mm2, %mm1		C bits that differ
    140 ')
    141 	jmp	L(loaded)		C count it; the later subl then leaves ecx negative and the loop exits
    142 
    143 
    144 L(top):
    145 	C eax	src
    146 	C ebx	(not used)
    147 	C ecx	counter, size-1 to 2 or 1, inclusive
    148 	C edx	[hamdist] src2
    149 	C
    150 	C mm0	total (low dword)
    151 	C mm1	(scratch)
    152 	C mm2	(scratch)
    153 	C mm3	[hamdist] (scratch)
    154 	C mm4	0x0000000000000000
    155 	C mm5	0x0F0F0F0F0F0F0F0F
    156 	C mm6	0x3333333333333333
    157 	C mm7	0xAAAAAAAAAAAAAAAA
    158 
    159 	movd	(%eax), %mm1		C first limb of the pair
    160 	movd	4(%eax), %mm2		C second limb of the pair
    161 	punpckldq %mm2, %mm1		C mm1 = both limbs as one qword
    162 	addl	$8, %eax		C src advances by two limbs
    163 
    164 HAM(`	movd	(%edx), %mm2		C same two-limb load from src2
    165 	movd	4(%edx), %mm3
    166 	punpckldq %mm3, %mm2
    167 	pxor	%mm2, %mm1		C bits that differ
    168 	addl	$8, %edx		C src2 advances by two limbs
    169 ')
    170 
    171 L(loaded):				C mm1 = bits to count (call the value x)
    172 	movq	%mm7, %mm2		C x - ((x & 0xAA..) >> 1): each pair becomes its own bit count
    173 	pand	%mm1, %mm2
    174 	psrlq	$1, %mm2
    175 	psubd	%mm2, %mm1	C bit pairs
    176 
    177 	movq	%mm6, %mm2		C (x & 0x33..) + ((x >> 2) & 0x33..): add adjacent pair counts
    178 	pand	%mm1, %mm2
    179 	psrlq	$2, %mm1
    180 	pand	%mm6, %mm1
    181 	paddd	%mm2, %mm1	C nibbles
    182 
    183 	movq	%mm5, %mm2		C (x & 0x0F..) + ((x >> 4) & 0x0F..): add adjacent nibble counts
    184 	pand	%mm1, %mm2
    185 	psrlq	$4, %mm1
    186 	pand	%mm5, %mm1
    187 	paddd	%mm2, %mm1	C bytes
    188 
    189 	psadbw(	%mm4, %mm1)		C against zero mm4: presumably sums the eight byte counts (macro defined elsewhere)
    190 	paddd	%mm1, %mm0	C to total
    191 
    192 	subl	$2, %ecx		C account for the limbs just counted
    193 	jg	L(top)			C ecx above 0: at least two limbs remain
    194 
    195 	C ecx is 0 or -1 representing respectively 1 or 0 further limbs
    196 	jz	L(last)			C flags are still those of the subl above
    197 
    198 
    199 	movd	%mm0, %eax		C return value = low dword of the total
    200 	emms				C leave MMX state before returning
    201 	ret
    202 
    203 EPILOGUE()
    204