Home | History | Annotate | Line # | Download | only in sse2
popcount.asm revision 1.1.1.3
      1 dnl  X86-32 and X86-64 mpn_popcount using SSE2.
      2 
      3 dnl  Copyright 2006, 2007, 2011, 2015 Free Software Foundation, Inc.
      4 
      5 dnl  This file is part of the GNU MP Library.
      6 dnl
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8 dnl  it under the terms of either:
      9 dnl
     10 dnl    * the GNU Lesser General Public License as published by the Free
     11 dnl      Software Foundation; either version 3 of the License, or (at your
     12 dnl      option) any later version.
     13 dnl
     14 dnl  or
     15 dnl
     16 dnl    * the GNU General Public License as published by the Free Software
     17 dnl      Foundation; either version 2 of the License, or (at your option) any
     18 dnl      later version.
     19 dnl
     20 dnl  or both in parallel, as here.
     21 dnl
     22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25 dnl  for more details.
     26 dnl
     27 dnl  You should have received copies of the GNU General Public License and the
     28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29 dnl  see https://www.gnu.org/licenses/.
     30 
     31 
     32 include(`../config.m4')
     33 
     34 
     35 C 32-bit		     popcount	     hamdist
     36 C			    cycles/limb	    cycles/limb
     37 C P5				-
     38 C P6 model 0-8,10-12		-
     39 C P6 model 9  (Banias)		?
     40 C P6 model 13 (Dothan)		4
     41 C P4 model 0  (Willamette)	?
     42 C P4 model 1  (?)		?
     43 C P4 model 2  (Northwood)	3.9
     44 C P4 model 3  (Prescott)	?
     45 C P4 model 4  (Nocona)		?
     46 C AMD K6			-
     47 C AMD K7			-
     48 C AMD K8			?
     49 
     50 C 64-bit		     popcount	     hamdist
     51 C			    cycles/limb	    cycles/limb
     52 C P4 model 4 (Nocona):		8
     53 C AMD K8,K9			7.5
     54 C AMD K10			3.5
     55 C Intel core2			3.68
     56 C Intel corei			3.15
     57 C Intel atom		       10.8
     58 C VIA nano			6.5
     59 
     60 C TODO
     61 C  * Make an mpn_hamdist based on this.  Alignment could either be handled by
     62 C    using movdqu for one operand and movdqa for the other, or by painfully
     63 C    shifting as we go.  Unfortunately, there seem to be no usable shift
     64 C    instruction, except for one that takes an immediate count.
     65 C  * It would probably be possible to cut a few cycles/limb using software
     66 C    pipelining.
     67 C  * There are 35 decode slots unused by the SSE2 instructions.  Loop control
     68 C    needs just 2 or 3 slots, leaving around 32 slots.  This allows a parallel
     69 C    integer based popcount.  Such a combined loop would handle 6 limbs in
     70 C    about 30 cycles on K8.
     71 C  * We could save a byte or two by using 32-bit operations on areg.
     72 C  * Check if using movdqa to a temp of and then register-based pand is faster.
     73 
dnl  Per-ABI register roles.  The same instruction body serves both word
dnl  sizes: up = source pointer, n = limb count, areg = return/scratch,
dnl  breg = pointer to the constant table.  The macros named LIMB32 and
dnl  LIMB64 emit their argument only in the matching build and discard it
dnl  in the other, selecting the ABI-specific lines below.
      74 ifelse(GMP_LIMB_BITS,`32',
      75 `	define(`up',  `%edx')
      76 	define(`n',   `%ecx')
      77 	define(`areg',`%eax')
      78 	define(`breg',`%ebx')
      79 	define(`zero',`%xmm4')
      80 	define(`LIMB32',`	$1')
      81 	define(`LIMB64',`dnl')
      82 ',`
      83 	define(`up',  `%rdi')
      84 	define(`n',   `%rsi')
      85 	define(`areg',`%rax')
      86 	define(`breg',`%rdx')
      87 	define(`zero',`%xmm8')
      88 	define(`LIMB32',`dnl')
      89 	define(`LIMB64',`	$1')
      90 ')
      91 
dnl  Registers holding the three bit-slice masks; each name spells out the
dnl  repeating bit pattern the register contains.
      92 define(`mm01010101',`%xmm6')
      93 define(`mm00110011',`%xmm7')
      94 define(`mm00001111',`%xmm2')
      95 
dnl  Derived sizes: bytes per limb, and how many limbs fit in one and in
dnl  two 16-byte xmm registers.
      96 define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
      97 define(`LIMBS_PER_XMM',  eval(16/GMP_LIMB_BYTES))
      98 define(`LIMBS_PER_2XMM', eval(32/GMP_LIMB_BYTES))
      99 
     100 undefine(`psadbw')			C override inherited m4 version
     101 
     102 C This file is shared between 32-bit and 64-bit builds.  Only the former has
     103 C LEAL.  Default LEAL as an alias of LEA.
     104 ifdef(`LEAL',,`define(`LEAL', `LEA($1,$2)')')
    105 
     106 ASM_START()
     107 
     108 C Make cnsts global to work around Apple relocation bug.
     109 ifdef(`DARWIN',`
     110 	define(`cnsts', MPN(popccnsts))
     111 	GLOBL	cnsts')
     112 
     113 	TEXT
     114 	ALIGN(32)
C mp_limb_t mpn_popcount (mp_srcptr up, mp_size_t n)
C
C Return the number of set bits in the n-limb operand at up.  Limbs are
C read with aligned movdqa: up is first rounded down to a 16-byte
C boundary, and edge masks from the constant table clear the bytes that
C fall outside the operand at both ends.
C NOTE(review): the first vector is loaded before n is tested, so n is
C presumably required to be at least 1, as is usual for mpn calls --
C confirm against callers.
     115 PROLOGUE(mpn_popcount)
     116 
C 32-bit: fetch stack args and save callee-saved ebx, which serves as breg.
C 64-bit: args arrive in rdi and rsi already.
     117 LIMB32(`mov	4(%esp), up	')
     118 LIMB32(`mov	8(%esp), n	')
     119 LIMB32(`push	%ebx		')
     120 
     121 	pxor	%xmm3, %xmm3		C zero grand total count
     122 LIMB64(`pxor	zero, zero	')
C Point breg at the constant table.  The three bit-slice masks sit at
C negative offsets -48, -32, -16 from it; the edge masks start at 0.
     123 ifdef(`PIC',`
     124 	LEAL(	cnsts, breg)
     125 ',`
     126 LIMB32(`mov	$cnsts, breg	')
     127 LIMB64(`movabs	$cnsts, breg	')
     128 ')
     129 
     130 	movdqa	-48(breg), mm01010101
     131 	movdqa	-32(breg), mm00110011
     132 	movdqa	-16(breg), mm00001111
     133 
     134 	mov	up, areg
     135 	and	$-16, up		C round `up' down to 128-bit boundary
     136 	and	$12, areg		C 32:areg = 0, 4, 8, 12
     137 					C 64:areg = 0, 8
C Load the first aligned vector and clear the leading bytes that precede
C the true start of the operand, using a low-end mask row at offset
C 64 + 4*areg into the table.
     138 	movdqa	(up), %xmm0
     139 	pand	64(breg,areg,4), %xmm0
C Convert the byte offset in areg to a limb count and widen n by it, so
C the loop counting below accounts for the rounded-down start.
     140 	shr	$m4_log2(GMP_LIMB_BYTES), %eax
     141 	add	areg, n			C compensate n for rounded down `up'
     142 
C Dispatch on remaining size: at most one vector -> straight to the
C summing tail; exactly two -> count the pair via the lsum entry; more
C -> enter the unrolled main loop with xmm0 already loaded and masked.
     143 	pxor	%xmm4, %xmm4
     144 	sub	$LIMBS_PER_XMM, n
     145 	jbe	L(sum)
     146 
     147 	sub	$LIMBS_PER_XMM, n
     148 	ja	L(ent)
     149 	jmp	L(lsum)
     150 
C Main loop, two xmm registers = 32 bytes per iteration, bit-slicing
C popcount: x minus the 0x55-masked x shifted right 1 leaves per-2-bit
C counts; the 0x33 step adds adjacent 2-bit fields into 4-bit fields;
C the 0x0f step folds to per-byte counts; psadbw against zero then sums
C all 16 bytes into the two 64-bit halves of the register.
     151 	ALIGN(16)
     152 L(top):	movdqa	(up), %xmm0
     153 L(ent):	movdqa	16(up), %xmm4
     154 
     155 	movdqa	%xmm0, %xmm1
     156 	movdqa	%xmm4, %xmm5
     157 	psrld	$1, %xmm0
     158 	psrld	$1, %xmm4
     159 	pand	mm01010101, %xmm0
     160 	pand	mm01010101, %xmm4
     161 	psubd	%xmm0, %xmm1
     162 	psubd	%xmm4, %xmm5
     163 
     164 	movdqa	%xmm1, %xmm0
     165 	movdqa	%xmm5, %xmm4
     166 	psrlq	$2, %xmm1
     167 	psrlq	$2, %xmm5
     168 	pand	mm00110011, %xmm0
     169 	pand	mm00110011, %xmm4
     170 	pand	mm00110011, %xmm1
     171 	pand	mm00110011, %xmm5
     172 	paddq	%xmm0, %xmm1
     173 	paddq	%xmm4, %xmm5
     174 
C In the 32-bit build the zero register is %xmm4, which the loop body
C clobbers, so it must be rezeroed here; the 64-bit build uses %xmm8,
C zeroed once at entry.
     175 LIMB32(`pxor	zero, zero	')
     176 
     177 	add	$32, up
     178 	sub	$LIMBS_PER_2XMM, n
     179 
     180 	paddq	%xmm5, %xmm1
     181 	movdqa	%xmm1, %xmm0
     182 	psrlq	$4, %xmm1
     183 	pand	mm00001111, %xmm0
     184 	pand	mm00001111, %xmm1
     185 	paddq	%xmm0, %xmm1
     186 
     187 	psadbw	zero, %xmm1
     188 	paddq	%xmm1, %xmm3		C add to grand total
     189 
C CF is still the borrow from the sub above; loop while it did not borrow.
C The intervening SSE2 ops and paddq leave the integer flags untouched.
     190 	jnc	L(top)
     191 L(end):
C Tail: zero, one, or two vectors remain.  The final vector gets its
C bytes past the high end of the operand cleared with a high-end mask
C before being counted by the same slicing sequence as the main loop.
     192 	add	$LIMBS_PER_2XMM, n
     193 	jz	L(rt)
     194 	movdqa	(up), %xmm0
     195 	pxor	%xmm4, %xmm4
     196 	sub	$LIMBS_PER_XMM, n
     197 	jbe	L(sum)
     198 L(lsum):
     199 	movdqa	%xmm0, %xmm4
     200 	movdqa	16(up), %xmm0
     201 L(sum):
C Scale the leftover limb count to its byte offset within a vector,
C reduce mod 16, and use 4 times that to index the high-end mask rows.
     202 	shl	$m4_log2(GMP_LIMB_BYTES), n
     203 	and	$12, n
     204 	pand	(breg,n,4), %xmm0
     205 
     206 	movdqa	%xmm0, %xmm1
     207 	movdqa	%xmm4, %xmm5
     208 	psrld	$1, %xmm0
     209 	psrld	$1, %xmm4
     210 	pand	mm01010101, %xmm0
     211 	pand	mm01010101, %xmm4
     212 	psubd	%xmm0, %xmm1
     213 	psubd	%xmm4, %xmm5
     214 
     215 	movdqa	%xmm1, %xmm0
     216 	movdqa	%xmm5, %xmm4
     217 	psrlq	$2, %xmm1
     218 	psrlq	$2, %xmm5
     219 	pand	mm00110011, %xmm0
     220 	pand	mm00110011, %xmm4
     221 	pand	mm00110011, %xmm1
     222 	pand	mm00110011, %xmm5
     223 	paddq	%xmm0, %xmm1
     224 	paddq	%xmm4, %xmm5
     225 
     226 LIMB32(`pxor	zero, zero	')
     227 
     228 	paddq	%xmm5, %xmm1
     229 	movdqa	%xmm1, %xmm0
     230 	psrlq	$4, %xmm1
     231 	pand	mm00001111, %xmm0
     232 	pand	mm00001111, %xmm1
     233 	paddq	%xmm0, %xmm1
     234 
     235 	psadbw	zero, %xmm1
     236 	paddq	%xmm1, %xmm3		C add to grand total
     237 
     238 
     239 C Add the two 64-bit halves of the grand total counter
C psrldq shifts by whole bytes, so 8 brings the high half down; the
C combined sum is then moved to the integer return register.
     240 L(rt):	movdqa	%xmm3, %xmm0
     241 	psrldq	$8, %xmm3
     242 	paddq	%xmm3, %xmm0
     243 	movd	%xmm0, areg		C movq avoided due to gas bug
     244 
     245 LIMB32(`pop	%ebx		')
     246 	ret
     247 
     248 EPILOGUE()
     249 DEF_OBJECT(dummy,16)
     250 C Three magic constants used for masking out bits
C Each row below is 16 bytes.  These three bit-slice constants sit just
C below the cnsts label and are loaded at offsets -48, -32 and -16.
     251 	.byte	0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
     252 	.byte	0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
     253 
     254 	.byte	0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
     255 	.byte	0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
     256 
     257 	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
     258 	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
     259 cnsts:
     260 C Masks for high end of number
C Row k keeps the low 4k bytes of the final vector and clears the rest;
C row 0 keeps all 16.  Indexed from the tail code at offset 4n from cnsts.
     261 	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
     262 	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
     263 
     264 	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
     265 	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
     266 
     267 	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
     268 	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
     269 
     270 	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
     271 	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
     272 C Masks for low end of number
C Row k clears the low 4k bytes that precede the operand in the first
C aligned vector; row 0 keeps all 16.  Indexed at offset 64 plus 4 times
C the byte misalignment of up.
     273 	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
     274 	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
     275 
     276 	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
     277 	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
     278 
     279 	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
     280 	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
     281 
     282 	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
     283 	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
     284 END_OBJECT(dummy)
     285 ASM_END()
    286