Home | History | Annotate | Line # | Download | only in sse2
popcount.asm revision 1.1
      1 dnl  X86-32 and X86-64 mpn_popcount using SSE2.
      2 
      3 dnl  Copyright 2006, 2007 Free Software Foundation, Inc.
      4 dnl
      5 dnl  This file is part of the GNU MP Library.
      6 dnl
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8 dnl  it under the terms of the GNU Lesser General Public License as published
      9 dnl  by the Free Software Foundation; either version 3 of the License, or (at
     10 dnl  your option) any later version.
     11 dnl
     12 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     13 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     14 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
     15 dnl  License for more details.
     16 dnl
     17 dnl  You should have received a copy of the GNU Lesser General Public License
     18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
     19 
     20 
     21 include(`../config.m4')
     22 
     23 
     24 C 32-bit                     popcount        hamdist
     25 C                           cycles/limb     cycles/limb
     26 C P5:                           -
     27 C P6 model 0-8,10-12)           -
     28 C P6 model 9  (Banias)          ?
     29 C P6 model 13 (Dothan)          4
     30 C P4 model 0  (Willamette)      ?
     31 C P4 model 1  (?)               ?
     32 C P4 model 2  (Northwood)       3.9
     33 C P4 model 3  (Prescott)        ?
     34 C P4 model 4  (Nocona)          ?
     35 C K6:                           -
     36 C K7:                           -
     37 C K8:                           ?
     38 
     39 C 64-bit                     popcount        hamdist
     40 C                           cycles/limb     cycles/limb
     41 C P4 model 4 (Nocona):          8
     42 C K8:                           7.5
     43 C K10:				3.5
     44 C P6 core2:			3.68
     45 C P6 corei7:			3.15
     46 
     47 C TODO
     48 C  * Make a mpn_hamdist based on this.  Alignment could either be handled by
     49 C    using movdqu for one operand and movdqa for the other, or by painfully
     50 C    shifting as we go.  Unfortunately, there seem to be no useable shift
     51 C    instruction, except for one that takes an immediate count.
     52 C  * It would probably be possible to cut a few cycles/limb using software
     53 C    pipelining.
     54 C  * There are 35 decode slots unused by the SSE2 instructions.  Loop control
     55 C    needs just 2 or 3 slots, leaving around 32 slots.  This allows a parallel
     56 C    integer based popcount.  Such a combined loop would handle 6 limbs in
     57 C    about 30 cycles on K8.
     58 C  * We could save a byte or two by using 32-bit operations on areg.
     59 C  * Check if using movdqa to a temp of and then register-based pand is faster.
     60 
     61 ifelse(GMP_LIMB_BITS,`32',
     62 `	define(`up',  `%edx')
     63 	define(`n',   `%ecx')
     64 	define(`areg',`%eax')
     65 	define(`breg',`%ebx')
     66 	define(`zero',`%xmm4')
     67 	define(`LIMB32',`	$1')
     68 	define(`LIMB64',`dnl')
     69 ',`
     70 	define(`up',  `%rdi')
     71 	define(`n',   `%rsi')
     72 	define(`areg',`%rax')
     73 	define(`breg',`%rdx')
     74 	define(`zero',`%xmm8')
     75 	define(`LIMB32',`dnl')
     76 	define(`LIMB64',`	$1')
     77 ')
     78 
     79 define(`mm01010101',`%xmm6')
     80 define(`mm00110011',`%xmm7')
     81 define(`mm00001111',`%xmm2')
     82 
     83 define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
     84 define(`LIMBS_PER_XMM',  eval(16/GMP_LIMB_BYTES))
     85 define(`LIMBS_PER_2XMM', eval(32/GMP_LIMB_BYTES))
     86 
     87 undefine(`psadbw')			C override inherited m4 version
     88 
     89 ASM_START()
     90 
     91 C Make cnsts global to work around Apple relocation bug.
     92 ifdef(`DARWIN',`
     93 	define(`cnsts', MPN(popccnsts))
     94 	GLOBL	cnsts')
     95 
     96 	TEXT
     97 	ALIGN(32)
     98 PROLOGUE(mpn_popcount)
     99 
    100 LIMB32(`mov	4(%esp), up	')
    101 LIMB32(`mov	8(%esp), n	')
    102 LIMB32(`push	%ebx		')
    103 
    104 	pxor	%xmm3, %xmm3		C zero grand total count
    105 LIMB64(`pxor	zero, zero	')
    106 ifdef(`PIC',`
    107 	LEA(	cnsts, breg)
    108 ',`
    109 LIMB32(`mov	$cnsts, breg	')
    110 LIMB64(`movabs	$cnsts, breg	')
    111 ')
    112 
    113 	movdqa	-48(breg), mm01010101
    114 	movdqa	-32(breg), mm00110011
    115 	movdqa	-16(breg), mm00001111
    116 
    117 	mov	up, areg
    118 	and	$-16, up		C round `up' down to 128-bit boundary
    119 	and	$12, areg		C 32:areg = 0, 4, 8, 12
    120 					C 64:areg = 0, 8
    121 	movdqa	(up), %xmm0
    122 	pand	64(breg,areg,4), %xmm0
    123 	shr	$m4_log2(GMP_LIMB_BYTES), %eax
    124 	add	areg, n			C compensate n for rounded down `up'
    125 
    126 	pxor	%xmm4, %xmm4
    127 	sub	$LIMBS_PER_XMM, n
    128 	jbe	L(sum)
    129 
    130 	sub	$LIMBS_PER_XMM, n
    131 	ja	L(ent)
    132 	jmp	L(lsum)
    133 
    134 	ALIGN(16)
    135 L(top):	movdqa	(up), %xmm0
    136 L(ent):	movdqa	16(up), %xmm4
    137 
    138 	movdqa	%xmm0, %xmm1
    139 	movdqa	%xmm4, %xmm5
    140 	psrld	$1, %xmm0
    141 	psrld	$1, %xmm4
    142 	pand	mm01010101, %xmm0
    143 	pand	mm01010101, %xmm4
    144 	psubd	%xmm0, %xmm1
    145 	psubd	%xmm4, %xmm5
    146 
    147 	movdqa	%xmm1, %xmm0
    148 	movdqa	%xmm5, %xmm4
    149 	psrlq	$2, %xmm1
    150 	psrlq	$2, %xmm5
    151 	pand	mm00110011, %xmm0
    152 	pand	mm00110011, %xmm4
    153 	pand	mm00110011, %xmm1
    154 	pand	mm00110011, %xmm5
    155 	paddq	%xmm0, %xmm1
    156 	paddq	%xmm4, %xmm5
    157 
    158 LIMB32(`pxor	zero, zero	')
    159 
    160 	add	$32, up
    161 	sub	$LIMBS_PER_2XMM, n
    162 
    163 	paddq	%xmm5, %xmm1
    164 	movdqa	%xmm1, %xmm0
    165 	psrlq	$4, %xmm1
    166 	pand	mm00001111, %xmm0
    167 	pand	mm00001111, %xmm1
    168 	paddq	%xmm0, %xmm1
    169 
    170 	psadbw	zero, %xmm1
    171 	paddq	%xmm1, %xmm3		C add to grand total
    172 
    173 	jnc	L(top)
    174 L(end):
    175 	add	$LIMBS_PER_2XMM, n
    176 	jz	L(rt)
    177 	movdqa	(up), %xmm0
    178 	pxor	%xmm4, %xmm4
    179 	sub	$LIMBS_PER_XMM, n
    180 	jbe	L(sum)
    181 L(lsum):
    182 	movdqa	%xmm0, %xmm4
    183 	movdqa	16(up), %xmm0
    184 L(sum):
    185 	shl	$m4_log2(GMP_LIMB_BYTES), n
    186 	and	$12, n
    187 	pand	(breg,n,4), %xmm0
    188 
    189 	movdqa	%xmm0, %xmm1
    190 	movdqa	%xmm4, %xmm5
    191 	psrld	$1, %xmm0
    192 	psrld	$1, %xmm4
    193 	pand	mm01010101, %xmm0
    194 	pand	mm01010101, %xmm4
    195 	psubd	%xmm0, %xmm1
    196 	psubd	%xmm4, %xmm5
    197 
    198 	movdqa	%xmm1, %xmm0
    199 	movdqa	%xmm5, %xmm4
    200 	psrlq	$2, %xmm1
    201 	psrlq	$2, %xmm5
    202 	pand	mm00110011, %xmm0
    203 	pand	mm00110011, %xmm4
    204 	pand	mm00110011, %xmm1
    205 	pand	mm00110011, %xmm5
    206 	paddq	%xmm0, %xmm1
    207 	paddq	%xmm4, %xmm5
    208 
    209 LIMB32(`pxor	zero, zero	')
    210 
    211 	paddq	%xmm5, %xmm1
    212 	movdqa	%xmm1, %xmm0
    213 	psrlq	$4, %xmm1
    214 	pand	mm00001111, %xmm0
    215 	pand	mm00001111, %xmm1
    216 	paddq	%xmm0, %xmm1
    217 
    218 	psadbw	zero, %xmm1
    219 	paddq	%xmm1, %xmm3		C add to grand total
    220 
    221 
    222 C Add the two 64-bit halves of the grand total counter
    223 L(rt):	movdqa	%xmm3, %xmm0
    224 	psrldq	$8, %xmm3
    225 	paddq	%xmm3, %xmm0
    226 	movd	%xmm0, areg		C movq avoided due to gas bug
    227 
    228 LIMB32(`pop	%ebx		')
    229 	ret
    230 
    231 EPILOGUE()
    232 DEF_OBJECT(dummy,16)
    233 C Three magic constants used for masking out bits
    234 	.byte	0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
    235 	.byte	0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
    236 
    237 	.byte	0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
    238 	.byte	0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
    239 
    240 	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
    241 	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
    242 cnsts:
    243 C Masks for high end of number
    244 	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
    245 	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
    246 
    247 	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
    248 	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
    249 
    250 	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
    251 	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
    252 
    253 	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
    254 	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
    255 C Masks for low end of number
    256 	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
    257 	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
    258 
    259 	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
    260 	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
    261 
    262 	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
    263 	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
    264 
    265 	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
    266 	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
    267 END_OBJECT(dummy)
    268