dnl  (VCS viewer header removed; file: sse2/popcount.asm, revision 1.1.1.1.8.1)
      1 dnl  X86-32 and X86-64 mpn_popcount using SSE2.
      2 
      3 dnl  Copyright 2006, 2007, 2011 Free Software Foundation, Inc.
      4 dnl
      5 dnl  This file is part of the GNU MP Library.
      6 dnl
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8 dnl  it under the terms of the GNU Lesser General Public License as published
      9 dnl  by the Free Software Foundation; either version 3 of the License, or (at
     10 dnl  your option) any later version.
     11 dnl
     12 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     13 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     14 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
     15 dnl  License for more details.
     16 dnl
     17 dnl  You should have received a copy of the GNU Lesser General Public License
     18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
     19 
     20 
     21 include(`../config.m4')
     22 
     23 
     24 C 32-bit		     popcount	     hamdist
     25 C			    cycles/limb	    cycles/limb
     26 C P5				-
     27 C P6 model 0-8,10-12		-
     28 C P6 model 9  (Banias)		?
     29 C P6 model 13 (Dothan)		4
     30 C P4 model 0  (Willamette)	?
     31 C P4 model 1  (?)		?
     32 C P4 model 2  (Northwood)	3.9
     33 C P4 model 3  (Prescott)	?
     34 C P4 model 4  (Nocona)		?
     35 C AMD K6			-
     36 C AMD K7			-
     37 C AMD K8			?
     38 
     39 C 64-bit		     popcount	     hamdist
     40 C			    cycles/limb	    cycles/limb
     41 C P4 model 4 (Nocona):		8
     42 C AMD K8,K9			7.5
     43 C AMD K10			3.5
     44 C Intel core2			3.68
     45 C Intel corei			3.15
     46 C Intel atom		       10.8
     47 C VIA nano			6.5
     48 
     49 C TODO
     50 C  * Make a mpn_hamdist based on this.  Alignment could either be handled by
     51 C    using movdqu for one operand and movdqa for the other, or by painfully
      52 C    shifting as we go.  Unfortunately, there seems to be no usable shift
      53 C    instruction, except for one that takes an immediate count.
     54 C  * It would probably be possible to cut a few cycles/limb using software
     55 C    pipelining.
     56 C  * There are 35 decode slots unused by the SSE2 instructions.  Loop control
     57 C    needs just 2 or 3 slots, leaving around 32 slots.  This allows a parallel
     58 C    integer based popcount.  Such a combined loop would handle 6 limbs in
     59 C    about 30 cycles on K8.
     60 C  * We could save a byte or two by using 32-bit operations on areg.
     61 C  * Check if using movdqa to a temp of and then register-based pand is faster.
     62 
dnl  Pick register names and helper macros for the active limb size, so
dnl  a single instruction stream below serves both the 32-bit and the
dnl  64-bit ABI.  LIMB32/LIMB64 emit their argument only when the limb
dnl  size matches, and expand to dnl (discard the line) otherwise.
ifelse(GMP_LIMB_BITS,`32',
`	define(`up',  `%edx')
	define(`n',   `%ecx')
	define(`areg',`%eax')
	define(`breg',`%ebx')
	define(`zero',`%xmm4')
	define(`LIMB32',`	$1')
	define(`LIMB64',`dnl')
',`
	define(`up',  `%rdi')
	define(`n',   `%rsi')
	define(`areg',`%rax')
	define(`breg',`%rdx')
	define(`zero',`%xmm8')
	define(`LIMB32',`dnl')
	define(`LIMB64',`	$1')
')

dnl  xmm registers reserved for the three bit-slice masks loaded from
dnl  the constant table at the end of the file.
define(`mm01010101',`%xmm6')
define(`mm00110011',`%xmm7')
define(`mm00001111',`%xmm2')

dnl  Derived sizes: limbs per 16-byte xmm vector, and limbs consumed by
dnl  one 32-byte main-loop iteration.
define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_XMM',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2XMM', eval(32/GMP_LIMB_BYTES))

undefine(`psadbw')			C override inherited m4 version
     90 
ASM_START()

C Make cnsts global to work around Apple relocation bug.
ifdef(`DARWIN',`
	define(`cnsts', MPN(popccnsts))
	GLOBL	cnsts')

	TEXT
	ALIGN(32)

C mp_limb_t mpn_popcount (mp_srcptr `up', mp_size_t `n')
C
C Count the set bits in the `n'-limb operand at `up'.  The pointer is
C rounded down to a 16-byte boundary so that aligned movdqa loads can
C be used throughout; the edge-mask tables around `cnsts' strip bytes
C before the true start and after the true end.  Bits are counted with
C the classic bit-slice reduction (2-bit sums, then 4-bit sums) and
C psadbw folds the per-byte sums into two 64-bit halves which are
C accumulated in %xmm3.
PROLOGUE(mpn_popcount)

C 32-bit entry: arguments arrive on the stack, and %ebx (breg) is
C callee-saved, so preserve it.
LIMB32(`mov	4(%esp), up	')
LIMB32(`mov	8(%esp), n	')
LIMB32(`push	%ebx		')

	pxor	%xmm3, %xmm3		C zero grand total count
LIMB64(`pxor	zero, zero	')
ifdef(`PIC',`
	LEA(	cnsts, breg)
',`
LIMB32(`mov	$cnsts, breg	')
LIMB64(`movabs	$cnsts, breg	')
')

C Load the three bit-slice masks, stored just below `cnsts'.
	movdqa	-48(breg), mm01010101
	movdqa	-32(breg), mm00110011
	movdqa	-16(breg), mm00001111

	mov	up, areg
	and	$-16, up		C round `up' down to 128-bit boundary
	and	$12, areg		C 32:areg = 0, 4, 8, 12
					C 64:areg = 0, 8
	movdqa	(up), %xmm0		C aligned load of the first vector
	pand	64(breg,areg,4), %xmm0	C mask out limbs below the true start
	shr	$m4_log2(GMP_LIMB_BYTES), %eax	C byte offset -> limb count
	add	areg, n			C compensate n for rounded down `up'

	pxor	%xmm4, %xmm4
	sub	$LIMBS_PER_XMM, n
	jbe	L(sum)			C at most one vector in total

	sub	$LIMBS_PER_XMM, n
	ja	L(ent)			C more than two vectors: main loop
	jmp	L(lsum)			C exactly two vectors

	ALIGN(16)
C Main loop: processes two vectors (32 bytes) per iteration.
L(top):	movdqa	(up), %xmm0
L(ent):	movdqa	16(up), %xmm4

C Step 1: x - ((x >> 1) & 0x55..) leaves the bit count of every 2-bit
C field in place.
	movdqa	%xmm0, %xmm1
	movdqa	%xmm4, %xmm5
	psrld	$1, %xmm0
	psrld	$1, %xmm4
	pand	mm01010101, %xmm0
	pand	mm01010101, %xmm4
	psubd	%xmm0, %xmm1
	psubd	%xmm4, %xmm5

C Step 2: (x & 0x33..) + ((x >> 2) & 0x33..) widens to 4-bit field sums.
	movdqa	%xmm1, %xmm0
	movdqa	%xmm5, %xmm4
	psrlq	$2, %xmm1
	psrlq	$2, %xmm5
	pand	mm00110011, %xmm0
	pand	mm00110011, %xmm4
	pand	mm00110011, %xmm1
	pand	mm00110011, %xmm5
	paddq	%xmm0, %xmm1
	paddq	%xmm4, %xmm5

LIMB32(`pxor	zero, zero	')	C %xmm4 doubled as data above

	add	$32, up
	sub	$LIMBS_PER_2XMM, n

C Step 3: add the two vectors (each 4-bit field now holds at most 8),
C then fold each high nibble onto its low neighbour; every byte ends up
C holding at most 16, well within range.
	paddq	%xmm5, %xmm1
	movdqa	%xmm1, %xmm0
	psrlq	$4, %xmm1
	pand	mm00001111, %xmm0
	pand	mm00001111, %xmm1
	paddq	%xmm0, %xmm1

C psadbw against `zero' sums the 8 bytes of each half into a 64-bit word.
	psadbw	zero, %xmm1
	paddq	%xmm1, %xmm3		C add to grand total

	jnc	L(top)			C SSE insns leave the sub flags intact
L(end):
C Wind-down: 0, 1, or 2 vectors remain.
	add	$LIMBS_PER_2XMM, n
	jz	L(rt)
	movdqa	(up), %xmm0
	pxor	%xmm4, %xmm4
	sub	$LIMBS_PER_XMM, n
	jbe	L(sum)
L(lsum):
	movdqa	%xmm0, %xmm4
	movdqa	16(up), %xmm0
L(sum):
C Mask out limbs beyond the true end, then count exactly as in the
C main loop above.
	shl	$m4_log2(GMP_LIMB_BYTES), n
	and	$12, n
	pand	(breg,n,4), %xmm0

	movdqa	%xmm0, %xmm1
	movdqa	%xmm4, %xmm5
	psrld	$1, %xmm0
	psrld	$1, %xmm4
	pand	mm01010101, %xmm0
	pand	mm01010101, %xmm4
	psubd	%xmm0, %xmm1
	psubd	%xmm4, %xmm5

	movdqa	%xmm1, %xmm0
	movdqa	%xmm5, %xmm4
	psrlq	$2, %xmm1
	psrlq	$2, %xmm5
	pand	mm00110011, %xmm0
	pand	mm00110011, %xmm4
	pand	mm00110011, %xmm1
	pand	mm00110011, %xmm5
	paddq	%xmm0, %xmm1
	paddq	%xmm4, %xmm5

LIMB32(`pxor	zero, zero	')	C %xmm4 doubled as data above

	paddq	%xmm5, %xmm1
	movdqa	%xmm1, %xmm0
	psrlq	$4, %xmm1
	pand	mm00001111, %xmm0
	pand	mm00001111, %xmm1
	paddq	%xmm0, %xmm1

	psadbw	zero, %xmm1
	paddq	%xmm1, %xmm3		C add to grand total


C Add the two 64-bit halves of the grand total counter
L(rt):	movdqa	%xmm3, %xmm0
	psrldq	$8, %xmm3
	paddq	%xmm3, %xmm0
	movd	%xmm0, areg		C movq avoided due to gas bug

LIMB32(`pop	%ebx		')
	ret

EPILOGUE()
C 16-byte aligned mask tables.  The three bit-slice constants sit just
C below `cnsts' and are addressed at -48, -32 and -16 from it; the edge
C masks at and above `cnsts' are indexed by scaled limb counts computed
C in the code above.
DEF_OBJECT(dummy,16)
C Three magic constants used for masking out bits
	.byte	0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
	.byte	0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55

	.byte	0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
	.byte	0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33

	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
cnsts:
C Masks for high end of number
C (keep all, or only the low 4, 8 or 12 bytes of the final vector)
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00

	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00

	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
C Masks for low end of number
C (keep all, or only the high 12, 8 or 4 bytes of the first vector)
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
END_OBJECT(dummy)
    270