Home | History | Annotate | Line # | Download | only in sse2
      1      1.1  mrg dnl  X86-32 and X86-64 mpn_popcount using SSE2.
      2      1.1  mrg 
      3  1.1.1.4  mrg dnl  Copyright 2006, 2007, 2011, 2015, 2020 Free Software Foundation, Inc.
      4  1.1.1.3  mrg 
      5      1.1  mrg dnl  This file is part of the GNU MP Library.
      6      1.1  mrg dnl
      7      1.1  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8  1.1.1.3  mrg dnl  it under the terms of either:
      9  1.1.1.3  mrg dnl
     10  1.1.1.3  mrg dnl    * the GNU Lesser General Public License as published by the Free
     11  1.1.1.3  mrg dnl      Software Foundation; either version 3 of the License, or (at your
     12  1.1.1.3  mrg dnl      option) any later version.
     13  1.1.1.3  mrg dnl
     14  1.1.1.3  mrg dnl  or
     15  1.1.1.3  mrg dnl
     16  1.1.1.3  mrg dnl    * the GNU General Public License as published by the Free Software
     17  1.1.1.3  mrg dnl      Foundation; either version 2 of the License, or (at your option) any
     18  1.1.1.3  mrg dnl      later version.
     19  1.1.1.3  mrg dnl
     20  1.1.1.3  mrg dnl  or both in parallel, as here.
     21      1.1  mrg dnl
     22      1.1  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23      1.1  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24  1.1.1.3  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25  1.1.1.3  mrg dnl  for more details.
     26      1.1  mrg dnl
     27  1.1.1.3  mrg dnl  You should have received copies of the GNU General Public License and the
     28  1.1.1.3  mrg dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29  1.1.1.3  mrg dnl  see https://www.gnu.org/licenses/.
     30      1.1  mrg 
     31      1.1  mrg 
     32      1.1  mrg include(`../config.m4')
     33      1.1  mrg 
     34      1.1  mrg 
     35  1.1.1.2  mrg C 32-bit		     popcount	     hamdist
     36  1.1.1.2  mrg C			    cycles/limb	    cycles/limb
     37  1.1.1.2  mrg C P5				-
     38  1.1.1.2  mrg C P6 model 0-8,10-12		-
     39  1.1.1.2  mrg C P6 model 9  (Banias)		?
     40  1.1.1.2  mrg C P6 model 13 (Dothan)		4
     41  1.1.1.2  mrg C P4 model 0  (Willamette)	?
     42  1.1.1.2  mrg C P4 model 1  (?)		?
     43  1.1.1.2  mrg C P4 model 2  (Northwood)	3.9
     44  1.1.1.2  mrg C P4 model 3  (Prescott)	?
     45  1.1.1.2  mrg C P4 model 4  (Nocona)		?
     46  1.1.1.2  mrg C AMD K6			-
     47  1.1.1.2  mrg C AMD K7			-
     48  1.1.1.2  mrg C AMD K8			?
     49  1.1.1.2  mrg 
     50  1.1.1.2  mrg C 64-bit		     popcount	     hamdist
     51  1.1.1.2  mrg C			    cycles/limb	    cycles/limb
     52  1.1.1.2  mrg C P4 model 4 (Nocona):		8
     53  1.1.1.2  mrg C AMD K8,K9			7.5
     54  1.1.1.2  mrg C AMD K10			3.5
     55  1.1.1.2  mrg C Intel core2			3.68
     56  1.1.1.2  mrg C Intel corei			3.15
     57  1.1.1.2  mrg C Intel atom		       10.8
     58  1.1.1.2  mrg C VIA nano			6.5
     59      1.1  mrg 
     60      1.1  mrg C TODO
     61  1.1.1.3  mrg C  * Make an mpn_hamdist based on this.  Alignment could either be handled by
     62      1.1  mrg C    using movdqu for one operand and movdqa for the other, or by painfully
     63  1.1.1.3  mrg C    shifting as we go.  Unfortunately, there seem to be no usable shift
     64      1.1  mrg C    instruction, except for one that takes an immediate count.
     65      1.1  mrg C  * It would probably be possible to cut a few cycles/limb using software
     66      1.1  mrg C    pipelining.
     67      1.1  mrg C  * There are 35 decode slots unused by the SSE2 instructions.  Loop control
     68      1.1  mrg C    needs just 2 or 3 slots, leaving around 32 slots.  This allows a parallel
     69      1.1  mrg C    integer based popcount.  Such a combined loop would handle 6 limbs in
     70      1.1  mrg C    about 30 cycles on K8.
     71      1.1  mrg C  * We could save a byte or two by using 32-bit operations on areg.
     72      1.1  mrg C  * Check if using movdqa to a temp of and then register-based pand is faster.
     73      1.1  mrg 
     74      1.1  mrg ifelse(GMP_LIMB_BITS,`32',
     75      1.1  mrg `	define(`up',  `%edx')
     76      1.1  mrg 	define(`n',   `%ecx')
     77      1.1  mrg 	define(`areg',`%eax')
     78      1.1  mrg 	define(`breg',`%ebx')
     79      1.1  mrg 	define(`zero',`%xmm4')
     80      1.1  mrg 	define(`LIMB32',`	$1')
     81      1.1  mrg 	define(`LIMB64',`dnl')
     82      1.1  mrg ',`
     83      1.1  mrg 	define(`up',  `%rdi')
     84      1.1  mrg 	define(`n',   `%rsi')
     85      1.1  mrg 	define(`areg',`%rax')
     86      1.1  mrg 	define(`breg',`%rdx')
     87      1.1  mrg 	define(`zero',`%xmm8')
     88      1.1  mrg 	define(`LIMB32',`dnl')
     89      1.1  mrg 	define(`LIMB64',`	$1')
     90      1.1  mrg ')
     91      1.1  mrg 
     92      1.1  mrg define(`mm01010101',`%xmm6')
     93      1.1  mrg define(`mm00110011',`%xmm7')
     94      1.1  mrg define(`mm00001111',`%xmm2')
     95      1.1  mrg 
     96      1.1  mrg define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
     97      1.1  mrg define(`LIMBS_PER_XMM',  eval(16/GMP_LIMB_BYTES))
     98      1.1  mrg define(`LIMBS_PER_2XMM', eval(32/GMP_LIMB_BYTES))
     99      1.1  mrg 
    100      1.1  mrg undefine(`psadbw')			C override inherited m4 version
    101      1.1  mrg 
    102  1.1.1.3  mrg C This file is shared between 32-bit and 64-bit builds.  Only the former has
    103  1.1.1.3  mrg C LEAL.  Default LEAL as an alias of LEA.
    104  1.1.1.3  mrg ifdef(`LEAL',,`define(`LEAL', `LEA($1,$2)')')
    105  1.1.1.3  mrg 
    106      1.1  mrg ASM_START()
    107      1.1  mrg 
    108      1.1  mrg C Make cnsts global to work around Apple relocation bug.
    109      1.1  mrg ifdef(`DARWIN',`
    110      1.1  mrg 	define(`cnsts', MPN(popccnsts))
    111      1.1  mrg 	GLOBL	cnsts')
    112      1.1  mrg 
    113      1.1  mrg 	TEXT
    114      1.1  mrg 	ALIGN(32)
    115      1.1  mrg PROLOGUE(mpn_popcount)
    116      1.1  mrg 
    117      1.1  mrg LIMB32(`mov	4(%esp), up	')
    118      1.1  mrg LIMB32(`mov	8(%esp), n	')
    119      1.1  mrg LIMB32(`push	%ebx		')
    120      1.1  mrg 
    121      1.1  mrg 	pxor	%xmm3, %xmm3		C zero grand total count
    122      1.1  mrg LIMB64(`pxor	zero, zero	')
    123  1.1.1.4  mrg 
    124  1.1.1.3  mrg 	LEAL(	cnsts, breg)
    125      1.1  mrg 
    126      1.1  mrg 	movdqa	-48(breg), mm01010101
    127      1.1  mrg 	movdqa	-32(breg), mm00110011
    128      1.1  mrg 	movdqa	-16(breg), mm00001111
    129      1.1  mrg 
    130      1.1  mrg 	mov	up, areg
    131      1.1  mrg 	and	$-16, up		C round `up' down to 128-bit boundary
    132      1.1  mrg 	and	$12, areg		C 32:areg = 0, 4, 8, 12
    133      1.1  mrg 					C 64:areg = 0, 8
    134      1.1  mrg 	movdqa	(up), %xmm0
    135      1.1  mrg 	pand	64(breg,areg,4), %xmm0
    136      1.1  mrg 	shr	$m4_log2(GMP_LIMB_BYTES), %eax
    137      1.1  mrg 	add	areg, n			C compensate n for rounded down `up'
    138      1.1  mrg 
    139      1.1  mrg 	pxor	%xmm4, %xmm4
    140      1.1  mrg 	sub	$LIMBS_PER_XMM, n
    141      1.1  mrg 	jbe	L(sum)
    142      1.1  mrg 
    143      1.1  mrg 	sub	$LIMBS_PER_XMM, n
    144      1.1  mrg 	ja	L(ent)
    145      1.1  mrg 	jmp	L(lsum)
    146      1.1  mrg 
    147      1.1  mrg 	ALIGN(16)
    148      1.1  mrg L(top):	movdqa	(up), %xmm0
    149      1.1  mrg L(ent):	movdqa	16(up), %xmm4
    150      1.1  mrg 
    151      1.1  mrg 	movdqa	%xmm0, %xmm1
    152      1.1  mrg 	movdqa	%xmm4, %xmm5
    153      1.1  mrg 	psrld	$1, %xmm0
    154      1.1  mrg 	psrld	$1, %xmm4
    155      1.1  mrg 	pand	mm01010101, %xmm0
    156      1.1  mrg 	pand	mm01010101, %xmm4
    157      1.1  mrg 	psubd	%xmm0, %xmm1
    158      1.1  mrg 	psubd	%xmm4, %xmm5
    159      1.1  mrg 
    160      1.1  mrg 	movdqa	%xmm1, %xmm0
    161      1.1  mrg 	movdqa	%xmm5, %xmm4
    162      1.1  mrg 	psrlq	$2, %xmm1
    163      1.1  mrg 	psrlq	$2, %xmm5
    164      1.1  mrg 	pand	mm00110011, %xmm0
    165      1.1  mrg 	pand	mm00110011, %xmm4
    166      1.1  mrg 	pand	mm00110011, %xmm1
    167      1.1  mrg 	pand	mm00110011, %xmm5
    168      1.1  mrg 	paddq	%xmm0, %xmm1
    169      1.1  mrg 	paddq	%xmm4, %xmm5
    170      1.1  mrg 
    171      1.1  mrg LIMB32(`pxor	zero, zero	')
    172      1.1  mrg 
    173      1.1  mrg 	add	$32, up
    174      1.1  mrg 	sub	$LIMBS_PER_2XMM, n
    175      1.1  mrg 
    176      1.1  mrg 	paddq	%xmm5, %xmm1
    177      1.1  mrg 	movdqa	%xmm1, %xmm0
    178      1.1  mrg 	psrlq	$4, %xmm1
    179      1.1  mrg 	pand	mm00001111, %xmm0
    180      1.1  mrg 	pand	mm00001111, %xmm1
    181      1.1  mrg 	paddq	%xmm0, %xmm1
    182      1.1  mrg 
    183      1.1  mrg 	psadbw	zero, %xmm1
    184      1.1  mrg 	paddq	%xmm1, %xmm3		C add to grand total
    185      1.1  mrg 
    186      1.1  mrg 	jnc	L(top)
    187      1.1  mrg L(end):
    188      1.1  mrg 	add	$LIMBS_PER_2XMM, n
    189      1.1  mrg 	jz	L(rt)
    190      1.1  mrg 	movdqa	(up), %xmm0
    191      1.1  mrg 	pxor	%xmm4, %xmm4
    192      1.1  mrg 	sub	$LIMBS_PER_XMM, n
    193      1.1  mrg 	jbe	L(sum)
    194      1.1  mrg L(lsum):
    195      1.1  mrg 	movdqa	%xmm0, %xmm4
    196      1.1  mrg 	movdqa	16(up), %xmm0
    197      1.1  mrg L(sum):
    198      1.1  mrg 	shl	$m4_log2(GMP_LIMB_BYTES), n
    199      1.1  mrg 	and	$12, n
    200      1.1  mrg 	pand	(breg,n,4), %xmm0
    201      1.1  mrg 
    202      1.1  mrg 	movdqa	%xmm0, %xmm1
    203      1.1  mrg 	movdqa	%xmm4, %xmm5
    204      1.1  mrg 	psrld	$1, %xmm0
    205      1.1  mrg 	psrld	$1, %xmm4
    206      1.1  mrg 	pand	mm01010101, %xmm0
    207      1.1  mrg 	pand	mm01010101, %xmm4
    208      1.1  mrg 	psubd	%xmm0, %xmm1
    209      1.1  mrg 	psubd	%xmm4, %xmm5
    210      1.1  mrg 
    211      1.1  mrg 	movdqa	%xmm1, %xmm0
    212      1.1  mrg 	movdqa	%xmm5, %xmm4
    213      1.1  mrg 	psrlq	$2, %xmm1
    214      1.1  mrg 	psrlq	$2, %xmm5
    215      1.1  mrg 	pand	mm00110011, %xmm0
    216      1.1  mrg 	pand	mm00110011, %xmm4
    217      1.1  mrg 	pand	mm00110011, %xmm1
    218      1.1  mrg 	pand	mm00110011, %xmm5
    219      1.1  mrg 	paddq	%xmm0, %xmm1
    220      1.1  mrg 	paddq	%xmm4, %xmm5
    221      1.1  mrg 
    222      1.1  mrg LIMB32(`pxor	zero, zero	')
    223      1.1  mrg 
    224      1.1  mrg 	paddq	%xmm5, %xmm1
    225      1.1  mrg 	movdqa	%xmm1, %xmm0
    226      1.1  mrg 	psrlq	$4, %xmm1
    227      1.1  mrg 	pand	mm00001111, %xmm0
    228      1.1  mrg 	pand	mm00001111, %xmm1
    229      1.1  mrg 	paddq	%xmm0, %xmm1
    230      1.1  mrg 
    231      1.1  mrg 	psadbw	zero, %xmm1
    232      1.1  mrg 	paddq	%xmm1, %xmm3		C add to grand total
    233      1.1  mrg 
    234      1.1  mrg 
    235      1.1  mrg C Add the two 64-bit halves of the grand total counter
    236      1.1  mrg L(rt):	movdqa	%xmm3, %xmm0
    237      1.1  mrg 	psrldq	$8, %xmm3
    238      1.1  mrg 	paddq	%xmm3, %xmm0
    239      1.1  mrg 	movd	%xmm0, areg		C movq avoided due to gas bug
    240      1.1  mrg 
    241      1.1  mrg LIMB32(`pop	%ebx		')
    242      1.1  mrg 	ret
    243      1.1  mrg 
    244      1.1  mrg EPILOGUE()
    245      1.1  mrg DEF_OBJECT(dummy,16)
    246      1.1  mrg C Three magic constants used for masking out bits
    247      1.1  mrg 	.byte	0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
    248      1.1  mrg 	.byte	0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
    249      1.1  mrg 
    250      1.1  mrg 	.byte	0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
    251      1.1  mrg 	.byte	0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
    252      1.1  mrg 
    253      1.1  mrg 	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
    254      1.1  mrg 	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
    255      1.1  mrg cnsts:
    256      1.1  mrg C Masks for high end of number
    257      1.1  mrg 	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
    258      1.1  mrg 	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
    259      1.1  mrg 
    260      1.1  mrg 	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
    261      1.1  mrg 	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
    262      1.1  mrg 
    263      1.1  mrg 	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
    264      1.1  mrg 	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
    265      1.1  mrg 
    266      1.1  mrg 	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
    267      1.1  mrg 	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
    268      1.1  mrg C Masks for low end of number
    269      1.1  mrg 	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
    270      1.1  mrg 	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
    271      1.1  mrg 
    272      1.1  mrg 	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
    273      1.1  mrg 	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
    274      1.1  mrg 
    275      1.1  mrg 	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
    276      1.1  mrg 	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
    277      1.1  mrg 
    278      1.1  mrg 	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
    279      1.1  mrg 	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
    280      1.1  mrg END_OBJECT(dummy)
    281  1.1.1.3  mrg ASM_END()
    282