dnl  popcount.asm revision 1.1.1.4
dnl  X86-32 and X86-64 mpn_popcount using SSE2.

dnl  Copyright 2006, 2007, 2011, 2015, 2020 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.


     32 include(`../config.m4')
     33 
     34 
     35 C 32-bit		     popcount	     hamdist
     36 C			    cycles/limb	    cycles/limb
     37 C P5				-
     38 C P6 model 0-8,10-12		-
     39 C P6 model 9  (Banias)		?
     40 C P6 model 13 (Dothan)		4
     41 C P4 model 0  (Willamette)	?
     42 C P4 model 1  (?)		?
     43 C P4 model 2  (Northwood)	3.9
     44 C P4 model 3  (Prescott)	?
     45 C P4 model 4  (Nocona)		?
     46 C AMD K6			-
     47 C AMD K7			-
     48 C AMD K8			?
     49 
     50 C 64-bit		     popcount	     hamdist
     51 C			    cycles/limb	    cycles/limb
     52 C P4 model 4 (Nocona):		8
     53 C AMD K8,K9			7.5
     54 C AMD K10			3.5
     55 C Intel core2			3.68
     56 C Intel corei			3.15
     57 C Intel atom		       10.8
     58 C VIA nano			6.5
     59 
     60 C TODO
     61 C  * Make an mpn_hamdist based on this.  Alignment could either be handled by
     62 C    using movdqu for one operand and movdqa for the other, or by painfully
     63 C    shifting as we go.  Unfortunately, there seem to be no usable shift
     64 C    instruction, except for one that takes an immediate count.
     65 C  * It would probably be possible to cut a few cycles/limb using software
     66 C    pipelining.
     67 C  * There are 35 decode slots unused by the SSE2 instructions.  Loop control
     68 C    needs just 2 or 3 slots, leaving around 32 slots.  This allows a parallel
     69 C    integer based popcount.  Such a combined loop would handle 6 limbs in
     70 C    about 30 cycles on K8.
     71 C  * We could save a byte or two by using 32-bit operations on areg.
     72 C  * Check if using movdqa to a temp of and then register-based pand is faster.
     73 
     74 ifelse(GMP_LIMB_BITS,`32',
     75 `	define(`up',  `%edx')
     76 	define(`n',   `%ecx')
     77 	define(`areg',`%eax')
     78 	define(`breg',`%ebx')
     79 	define(`zero',`%xmm4')
     80 	define(`LIMB32',`	$1')
     81 	define(`LIMB64',`dnl')
     82 ',`
     83 	define(`up',  `%rdi')
     84 	define(`n',   `%rsi')
     85 	define(`areg',`%rax')
     86 	define(`breg',`%rdx')
     87 	define(`zero',`%xmm8')
     88 	define(`LIMB32',`dnl')
     89 	define(`LIMB64',`	$1')
     90 ')
     91 
     92 define(`mm01010101',`%xmm6')
     93 define(`mm00110011',`%xmm7')
     94 define(`mm00001111',`%xmm2')
     95 
     96 define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
     97 define(`LIMBS_PER_XMM',  eval(16/GMP_LIMB_BYTES))
     98 define(`LIMBS_PER_2XMM', eval(32/GMP_LIMB_BYTES))
     99 
    100 undefine(`psadbw')			C override inherited m4 version
    101 
    102 C This file is shared between 32-bit and 64-bit builds.  Only the former has
    103 C LEAL.  Default LEAL as an alias of LEA.
    104 ifdef(`LEAL',,`define(`LEAL', `LEA($1,$2)')')
    105 
    106 ASM_START()
    107 
    108 C Make cnsts global to work around Apple relocation bug.
    109 ifdef(`DARWIN',`
    110 	define(`cnsts', MPN(popccnsts))
    111 	GLOBL	cnsts')
    112 
    113 	TEXT
    114 	ALIGN(32)
    115 PROLOGUE(mpn_popcount)
    116 
    117 LIMB32(`mov	4(%esp), up	')
    118 LIMB32(`mov	8(%esp), n	')
    119 LIMB32(`push	%ebx		')
    120 
    121 	pxor	%xmm3, %xmm3		C zero grand total count
    122 LIMB64(`pxor	zero, zero	')
    123 
    124 	LEAL(	cnsts, breg)
    125 
    126 	movdqa	-48(breg), mm01010101
    127 	movdqa	-32(breg), mm00110011
    128 	movdqa	-16(breg), mm00001111
    129 
    130 	mov	up, areg
    131 	and	$-16, up		C round `up' down to 128-bit boundary
    132 	and	$12, areg		C 32:areg = 0, 4, 8, 12
    133 					C 64:areg = 0, 8
    134 	movdqa	(up), %xmm0
    135 	pand	64(breg,areg,4), %xmm0
    136 	shr	$m4_log2(GMP_LIMB_BYTES), %eax
    137 	add	areg, n			C compensate n for rounded down `up'
    138 
    139 	pxor	%xmm4, %xmm4
    140 	sub	$LIMBS_PER_XMM, n
    141 	jbe	L(sum)
    142 
    143 	sub	$LIMBS_PER_XMM, n
    144 	ja	L(ent)
    145 	jmp	L(lsum)
    146 
    147 	ALIGN(16)
    148 L(top):	movdqa	(up), %xmm0
    149 L(ent):	movdqa	16(up), %xmm4
    150 
    151 	movdqa	%xmm0, %xmm1
    152 	movdqa	%xmm4, %xmm5
    153 	psrld	$1, %xmm0
    154 	psrld	$1, %xmm4
    155 	pand	mm01010101, %xmm0
    156 	pand	mm01010101, %xmm4
    157 	psubd	%xmm0, %xmm1
    158 	psubd	%xmm4, %xmm5
    159 
    160 	movdqa	%xmm1, %xmm0
    161 	movdqa	%xmm5, %xmm4
    162 	psrlq	$2, %xmm1
    163 	psrlq	$2, %xmm5
    164 	pand	mm00110011, %xmm0
    165 	pand	mm00110011, %xmm4
    166 	pand	mm00110011, %xmm1
    167 	pand	mm00110011, %xmm5
    168 	paddq	%xmm0, %xmm1
    169 	paddq	%xmm4, %xmm5
    170 
    171 LIMB32(`pxor	zero, zero	')
    172 
    173 	add	$32, up
    174 	sub	$LIMBS_PER_2XMM, n
    175 
    176 	paddq	%xmm5, %xmm1
    177 	movdqa	%xmm1, %xmm0
    178 	psrlq	$4, %xmm1
    179 	pand	mm00001111, %xmm0
    180 	pand	mm00001111, %xmm1
    181 	paddq	%xmm0, %xmm1
    182 
    183 	psadbw	zero, %xmm1
    184 	paddq	%xmm1, %xmm3		C add to grand total
    185 
    186 	jnc	L(top)
    187 L(end):
    188 	add	$LIMBS_PER_2XMM, n
    189 	jz	L(rt)
    190 	movdqa	(up), %xmm0
    191 	pxor	%xmm4, %xmm4
    192 	sub	$LIMBS_PER_XMM, n
    193 	jbe	L(sum)
    194 L(lsum):
    195 	movdqa	%xmm0, %xmm4
    196 	movdqa	16(up), %xmm0
    197 L(sum):
    198 	shl	$m4_log2(GMP_LIMB_BYTES), n
    199 	and	$12, n
    200 	pand	(breg,n,4), %xmm0
    201 
    202 	movdqa	%xmm0, %xmm1
    203 	movdqa	%xmm4, %xmm5
    204 	psrld	$1, %xmm0
    205 	psrld	$1, %xmm4
    206 	pand	mm01010101, %xmm0
    207 	pand	mm01010101, %xmm4
    208 	psubd	%xmm0, %xmm1
    209 	psubd	%xmm4, %xmm5
    210 
    211 	movdqa	%xmm1, %xmm0
    212 	movdqa	%xmm5, %xmm4
    213 	psrlq	$2, %xmm1
    214 	psrlq	$2, %xmm5
    215 	pand	mm00110011, %xmm0
    216 	pand	mm00110011, %xmm4
    217 	pand	mm00110011, %xmm1
    218 	pand	mm00110011, %xmm5
    219 	paddq	%xmm0, %xmm1
    220 	paddq	%xmm4, %xmm5
    221 
    222 LIMB32(`pxor	zero, zero	')
    223 
    224 	paddq	%xmm5, %xmm1
    225 	movdqa	%xmm1, %xmm0
    226 	psrlq	$4, %xmm1
    227 	pand	mm00001111, %xmm0
    228 	pand	mm00001111, %xmm1
    229 	paddq	%xmm0, %xmm1
    230 
    231 	psadbw	zero, %xmm1
    232 	paddq	%xmm1, %xmm3		C add to grand total
    233 
    234 
    235 C Add the two 64-bit halves of the grand total counter
    236 L(rt):	movdqa	%xmm3, %xmm0
    237 	psrldq	$8, %xmm3
    238 	paddq	%xmm3, %xmm0
    239 	movd	%xmm0, areg		C movq avoided due to gas bug
    240 
    241 LIMB32(`pop	%ebx		')
    242 	ret
    243 
    244 EPILOGUE()
    245 DEF_OBJECT(dummy,16)
    246 C Three magic constants used for masking out bits
    247 	.byte	0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
    248 	.byte	0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
    249 
    250 	.byte	0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
    251 	.byte	0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
    252 
    253 	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
    254 	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
    255 cnsts:
    256 C Masks for high end of number
    257 	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
    258 	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
    259 
    260 	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
    261 	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
    262 
    263 	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
    264 	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
    265 
    266 	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
    267 	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
    268 C Masks for low end of number
    269 	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
    270 	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
    271 
    272 	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
    273 	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
    274 
    275 	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
    276 	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
    277 
    278 	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
    279 	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
    280 END_OBJECT(dummy)
    281 ASM_END()
    282