Home | History | Annotate | Line # | Download | only in mmx
      1      1.1  mrg dnl  Intel Pentium 4 mpn_popcount, mpn_hamdist -- population count and
      2      1.1  mrg dnl  hamming distance.
      3      1.1  mrg 
      4  1.1.1.2  mrg dnl  Copyright 2000-2002, 2007 Free Software Foundation, Inc.
      5  1.1.1.2  mrg 
      6      1.1  mrg dnl  This file is part of the GNU MP Library.
      7      1.1  mrg dnl
      8  1.1.1.2  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      9  1.1.1.2  mrg dnl  it under the terms of either:
     10  1.1.1.2  mrg dnl
     11  1.1.1.2  mrg dnl    * the GNU Lesser General Public License as published by the Free
     12  1.1.1.2  mrg dnl      Software Foundation; either version 3 of the License, or (at your
     13  1.1.1.2  mrg dnl      option) any later version.
     14  1.1.1.2  mrg dnl
     15  1.1.1.2  mrg dnl  or
     16  1.1.1.2  mrg dnl
     17  1.1.1.2  mrg dnl    * the GNU General Public License as published by the Free Software
     18  1.1.1.2  mrg dnl      Foundation; either version 2 of the License, or (at your option) any
     19  1.1.1.2  mrg dnl      later version.
     20  1.1.1.2  mrg dnl
     21  1.1.1.2  mrg dnl  or both in parallel, as here.
     22  1.1.1.2  mrg dnl
     23  1.1.1.2  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     24  1.1.1.2  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     25  1.1.1.2  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     26  1.1.1.2  mrg dnl  for more details.
     27      1.1  mrg dnl
     28  1.1.1.2  mrg dnl  You should have received copies of the GNU General Public License and the
     29  1.1.1.2  mrg dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     30  1.1.1.2  mrg dnl  see https://www.gnu.org/licenses/.
     31      1.1  mrg 
     32      1.1  mrg include(`../config.m4')
     33      1.1  mrg 
     34      1.1  mrg 
     35      1.1  mrg C			     popcount	     hamdist
     36      1.1  mrg C P3 model 9  (Banias)		?		?
     37      1.1  mrg C P3 model 13 (Dothan)		6		6
     38      1.1  mrg C P4 model 0  (Willamette)
     39      1.1  mrg C P4 model 1  (?)
     40      1.1  mrg C P4 model 2  (Northwood)	8		9
     41      1.1  mrg C P4 model 3  (Prescott)	8		9
     42      1.1  mrg C P4 model 4  (Nocona)
     43      1.1  mrg 
     44      1.1  mrg C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
     45      1.1  mrg C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
     46      1.1  mrg C
     47      1.1  mrg C Loading with unaligned movq's costs an extra 1 c/l and hence is avoided.
     48      1.1  mrg C Two movd's and a punpckldq seems to be the same speed as an aligned movq,
     49      1.1  mrg C and using them saves fiddling about with alignment testing on entry.
     50      1.1  mrg C
     51      1.1  mrg C For popcount there are 13 mmx arithmetic instructions (pand, psrlq, psubd,
     52      1.1  mrg C paddd, psadbw) in the two-limb loop, so perhaps 6.5 c/l might be possible,
     53      1.1  mrg C but 8 c/l relying on out-of-order execution is already quite reasonable.
     54      1.1  mrg 
                      dnl  Report an error through m4_error unless OPERATION_popcount or
                      dnl  OPERATION_hamdist is defined.  The HAM and POP macros and the
                      dnl  parameter layout further down are selected on those two symbols.
     55      1.1  mrg ifdef(`OPERATION_popcount',,
     56      1.1  mrg `ifdef(`OPERATION_hamdist',,
     57      1.1  mrg `m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
     58      1.1  mrg ')')')
     59      1.1  mrg 
                      dnl  Usage: HAM(text)
                      dnl
                      dnl  Expands to text when OPERATION_hamdist is defined and to nothing
                      dnl  otherwise.  It wraps the lines that only the hamdist variant needs.
                      dnl  m4_assert_numargs(1) is defined outside this file; presumably it
                      dnl  rejects calls with other than one argument - confirm.
     60      1.1  mrg define(HAM,
     61      1.1  mrg m4_assert_numargs(1)
     62      1.1  mrg `ifdef(`OPERATION_hamdist',`$1')')
     63      1.1  mrg 
                      dnl  Usage: POP(text)
                      dnl
                      dnl  Expands to text when OPERATION_popcount is defined and to nothing
                      dnl  otherwise.  It is the popcount counterpart of HAM.
     64      1.1  mrg define(POP,
     65      1.1  mrg m4_assert_numargs(1)
     66      1.1  mrg `ifdef(`OPERATION_popcount',`$1')')
     67      1.1  mrg 
                      dnl  Per-operation parameter layout and entry point name.
                      dnl
                      dnl  The numbers given to defframe follow the C argument order shown in
                      dnl  the prototypes above: hamdist has src at 4, src2 at 8 and size at 12,
                      dnl  popcount has src at 4 and size at 8.  M4_function is the name later
                      dnl  handed to PROLOGUE.
                      dnl
                      dnl  NOTE(review): defframe is defined outside this file; the numbers are
                      dnl  presumably byte offsets from the stack pointer at entry - confirm.
     68      1.1  mrg HAM(`
     69      1.1  mrg defframe(PARAM_SIZE, 12)
     70      1.1  mrg defframe(PARAM_SRC2,  8)
     71      1.1  mrg defframe(PARAM_SRC,   4)
     72      1.1  mrg define(M4_function,mpn_hamdist)
     73      1.1  mrg ')
     74      1.1  mrg POP(`
     75      1.1  mrg defframe(PARAM_SIZE,  8)
     76      1.1  mrg defframe(PARAM_SRC,   4)
     77      1.1  mrg define(M4_function,mpn_popcount)
     78      1.1  mrg ')
     79      1.1  mrg 
                      dnl  Names both functions this one source file can produce.
                      dnl  NOTE(review): MULFUNC_PROLOGUE is defined outside this file;
                      dnl  presumably the build machinery reads this list to know which
                      dnl  OPERATION_ variants to build - confirm.
     80      1.1  mrg MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
     81      1.1  mrg 
     82      1.1  mrg 
                      dnl  Non-PIC builds keep the three 64-bit masks used by the counting code
                      dnl  in RODATA; PIC builds emit nothing here and construct the same values
                      dnl  in registers at function entry instead.
                      dnl
                      dnl    0xAAAA...  the upper bit of every 2-bit pair
                      dnl    0x3333...  the low two bits of every nibble
                      dnl    0x0F0F...  the low nibble of every byte
                      dnl
                      dnl  Each mask is written as two identical .long halves.  ALIGN(8) comes
                      dnl  ahead of the three 8-byte constants, presumably so that the movq
                      dnl  loads in the function are aligned.
     83      1.1  mrg ifdef(`PIC',,`
     84      1.1  mrg 	dnl  non-PIC
     85      1.1  mrg 	RODATA
     86      1.1  mrg 	ALIGN(8)
     87      1.1  mrg L(rodata_AAAAAAAAAAAAAAAA):
     88      1.1  mrg 	.long	0xAAAAAAAA
     89      1.1  mrg 	.long	0xAAAAAAAA
     90      1.1  mrg L(rodata_3333333333333333):
     91      1.1  mrg 	.long	0x33333333
     92      1.1  mrg 	.long	0x33333333
     93      1.1  mrg L(rodata_0F0F0F0F0F0F0F0F):
     94      1.1  mrg 	.long	0x0F0F0F0F
     95      1.1  mrg 	.long	0x0F0F0F0F
     96      1.1  mrg ')
     97      1.1  mrg 
     98      1.1  mrg 	TEXT
     99      1.1  mrg 	ALIGN(16)
    100      1.1  mrg 
                      C M4_function is mpn_popcount or mpn_hamdist, as selected by the
                      C OPERATION_ symbol.
                      C
                      C In:   PARAM_SRC   pointer to the limbs of the first operand
                      C       PARAM_SRC2  [hamdist only] pointer to the limbs of the second operand
                      C       PARAM_SIZE  number of limbs
                      C Out:  eax = number of 1 bits in src (popcount), or in src xor src2
                      C       (hamdist); the total is accumulated as a 32-bit value
                      C Uses: eax, ecx, flags, mm0-mm2 and mm4-mm7 always; edx for hamdist
                      C       or PIC; mm3 for hamdist.  emms is executed before returning.
                      C
                      C Two limbs are combined into one qword and counted per pass.  A final
                      C odd limb is loaded by L(last) on its own and then goes through the
                      C same counting code at L(loaded).
                      C
                      C NOTE(review): size 0 is not handled.  The initial subl borrows, so ja
                      C is not taken and L(last) runs with ecx = -1, reading the limb at src-4.
                      C Presumably callers guarantee size >= 1 - confirm.
    101      1.1  mrg PROLOGUE(M4_function)
    102      1.1  mrg deflit(`FRAME',0)
    103      1.1  mrg 
    104      1.1  mrg 	movl	PARAM_SIZE, %ecx	C ecx = size
    105      1.1  mrg 	movl	PARAM_SRC, %eax		C eax = src
    106      1.1  mrg 
    107      1.1  mrg ifdef(`PIC',`
                      	dnl  PIC: build each 64-bit mask by copying a 32-bit immediate into both
                      	dnl  dwords of the register, with edx as scratch, so no data label is
                      	dnl  referenced
    108      1.1  mrg 	movl	$0xAAAAAAAA, %edx
    109      1.1  mrg 	movd	%edx, %mm7
    110      1.1  mrg 	punpckldq %mm7, %mm7
    111      1.1  mrg 
    112      1.1  mrg 	movl	$0x33333333, %edx
    113      1.1  mrg 	movd	%edx, %mm6
    114      1.1  mrg 	punpckldq %mm6, %mm6
    115      1.1  mrg 
    116      1.1  mrg 	movl	$0x0F0F0F0F, %edx
    117      1.1  mrg 	movd	%edx, %mm5
    118      1.1  mrg 	punpckldq %mm5, %mm5
    119      1.1  mrg 
                      	dnl  edx was scratch above, so src2 is loaded only now
    120      1.1  mrg HAM(`	movl	PARAM_SRC2, %edx')
    121      1.1  mrg 
    122      1.1  mrg ',`
    123      1.1  mrg 	dnl non-PIC
    124      1.1  mrg HAM(`	movl	PARAM_SRC2, %edx')
                      	dnl  load the three masks from the RODATA constants
    125      1.1  mrg 	movq	L(rodata_AAAAAAAAAAAAAAAA), %mm7
    126      1.1  mrg 	movq	L(rodata_3333333333333333), %mm6
    127      1.1  mrg 	movq	L(rodata_0F0F0F0F0F0F0F0F), %mm5
    128      1.1  mrg ')
    129      1.1  mrg 
    130      1.1  mrg 	pxor	%mm4, %mm4		C zero, the other operand for psadbw
    131      1.1  mrg 	pxor	%mm0, %mm0		C total
    132      1.1  mrg 
    133      1.1  mrg 	subl	$1, %ecx		C ecx = size-1
    134      1.1  mrg 	ja	L(top)			C size >= 2, go process limbs in pairs
    135      1.1  mrg 
                      	C size == 1 falls through to here with ecx = 0
    136      1.1  mrg L(last):
                      	C One limb remains.  For size >= 1 ecx is 0 here and eax points at
                      	C that limb.  movd leaves the high dword of mm1 zero, so the shared
                      	C code below counts only this limb.
    137      1.1  mrg 	movd	(%eax,%ecx,4), %mm1		C src high limb
    138      1.1  mrg HAM(`	movd	(%edx,%ecx,4), %mm2
    139      1.1  mrg 	pxor	%mm2, %mm1		C src xor src2
    140      1.1  mrg ')
    141      1.1  mrg 	jmp	L(loaded)
    142      1.1  mrg 
    143      1.1  mrg 
    144      1.1  mrg L(top):
    145      1.1  mrg 	C eax	src
    146      1.1  mrg 	C ebx
    147      1.1  mrg 	C ecx	counter = limbs remaining - 1; size-1 down to 2 or 1, inclusive
    148      1.1  mrg 	C edx	[hamdist] src2
    149      1.1  mrg 	C
    150      1.1  mrg 	C mm0	total (low dword)
    151      1.1  mrg 	C mm1	(scratch)
    152      1.1  mrg 	C mm2	(scratch)
    153      1.1  mrg 	C mm3	[hamdist] (scratch)
    154      1.1  mrg 	C mm4	0x0000000000000000
    155      1.1  mrg 	C mm5	0x0F0F0F0F0F0F0F0F
    156      1.1  mrg 	C mm6	0x3333333333333333
    157      1.1  mrg 	C mm7	0xAAAAAAAAAAAAAAAA
    158      1.1  mrg 
    159      1.1  mrg 	movd	(%eax), %mm1		C low limb
    160      1.1  mrg 	movd	4(%eax), %mm2		C high limb
    161      1.1  mrg 	punpckldq %mm2, %mm1		C mm1 = two limbs as one qword
    162      1.1  mrg 	addl	$8, %eax		C src += 2 limbs
    163      1.1  mrg 
    164      1.1  mrg HAM(`	movd	(%edx), %mm2
    165      1.1  mrg 	movd	4(%edx), %mm3
    166      1.1  mrg 	punpckldq %mm3, %mm2
    167      1.1  mrg 	pxor	%mm2, %mm1		C src xor src2
    168      1.1  mrg 	addl	$8, %edx		C src2 += 2 limbs
    169      1.1  mrg ')
    170      1.1  mrg 
    171      1.1  mrg L(loaded):
                      	C mm1 holds the 64 bits to count (high dword zero when coming from
                      	C L(last)).  Each step below doubles the field width being summed.
    172      1.1  mrg 	movq	%mm7, %mm2
    173      1.1  mrg 	pand	%mm1, %mm2
    174      1.1  mrg 	psrlq	$1, %mm2		C upper bit of each pair, moved down
    175      1.1  mrg 	psubd	%mm2, %mm1	C bit pairs: x - ((x and 0xAA..) >> 1)
    176      1.1  mrg 
    177      1.1  mrg 	movq	%mm6, %mm2
    178      1.1  mrg 	pand	%mm1, %mm2		C low pair of each nibble
    179      1.1  mrg 	psrlq	$2, %mm1
    180      1.1  mrg 	pand	%mm6, %mm1		C high pair of each nibble
    181      1.1  mrg 	paddd	%mm2, %mm1	C nibbles
    182      1.1  mrg 
    183      1.1  mrg 	movq	%mm5, %mm2
    184      1.1  mrg 	pand	%mm1, %mm2		C low nibble of each byte
    185      1.1  mrg 	psrlq	$4, %mm1
    186      1.1  mrg 	pand	%mm5, %mm1		C high nibble of each byte
    187      1.1  mrg 	paddd	%mm2, %mm1	C bytes
    188      1.1  mrg 
    189      1.1  mrg 	psadbw(	%mm4, %mm1)	C sum the eight byte counts (mm4 is zero)
    190      1.1  mrg 	paddd	%mm1, %mm0	C to total
    191      1.1  mrg 
    192      1.1  mrg 	subl	$2, %ecx
    193      1.1  mrg 	jg	L(top)			C at least two limbs remain
    194      1.1  mrg 
    195      1.1  mrg 	C ecx is 0 or -1 representing respectively 1 or 0 further limbs
                      	C (or -2 after the pass entered from L(last), which also falls through).
                      	C jz still tests the flags set by the subl above.
    196      1.1  mrg 	jz	L(last)
    197      1.1  mrg 
    198      1.1  mrg 
    199      1.1  mrg 	movd	%mm0, %eax		C return the low dword of the total
    200      1.1  mrg 	emms				C empty the MMX state before returning
    201      1.1  mrg 	ret
    202      1.1  mrg 
    203      1.1  mrg EPILOGUE()
    204