Home | History | Annotate | Line # | Download | only in arm64
      1      1.1  mrg dnl  ARM64 Neon mpn_popcount -- mpn bit population count.
      2      1.1  mrg 
      3      1.1  mrg dnl  Copyright 2013, 2014 Free Software Foundation, Inc.
      4      1.1  mrg 
      5      1.1  mrg dnl  This file is part of the GNU MP Library.
      6      1.1  mrg dnl
      7      1.1  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8      1.1  mrg dnl  it under the terms of either:
      9      1.1  mrg dnl
     10      1.1  mrg dnl    * the GNU Lesser General Public License as published by the Free
     11      1.1  mrg dnl      Software Foundation; either version 3 of the License, or (at your
     12      1.1  mrg dnl      option) any later version.
     13      1.1  mrg dnl
     14      1.1  mrg dnl  or
     15      1.1  mrg dnl
     16      1.1  mrg dnl    * the GNU General Public License as published by the Free Software
     17      1.1  mrg dnl      Foundation; either version 2 of the License, or (at your option) any
     18      1.1  mrg dnl      later version.
     19      1.1  mrg dnl
     20      1.1  mrg dnl  or both in parallel, as here.
     21      1.1  mrg dnl
     22      1.1  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23      1.1  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24      1.1  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25      1.1  mrg dnl  for more details.
     26      1.1  mrg dnl
     27      1.1  mrg dnl  You should have received copies of the GNU General Public License and the
     28      1.1  mrg dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29      1.1  mrg dnl  see https://www.gnu.org/licenses/.
     30      1.1  mrg 
     31      1.1  mrg include(`../config.m4')
     32      1.1  mrg 
     33      1.1  mrg C	     cycles/limb
     34  1.1.1.2  mrg C Cortex-A53	 2.5
     35  1.1.1.2  mrg C Cortex-A57	 1.14
     36  1.1.1.2  mrg C X-Gene	 3
     37      1.1  mrg 
     38      1.1  mrg C TODO
     39      1.1  mrg C  * Consider greater unrolling.
     40      1.1  mrg C  * Arrange to align the pointer, if that helps performance.  Use the same
     41      1.1  mrg C    read-and-mask trick we use on PCs, for simplicity and performance.  (Sorry
     42      1.1  mrg C    valgrind!)
     43      1.1  mrg C  * Explore if explicit align directives, e.g., "[ptr:128]" help.
     44      1.1  mrg C  * See rth's gmp-devel 2013-02/03 messages about final summation tricks.
     45      1.1  mrg 
     46  1.1.1.2  mrg changecom(blah)		C the hash sign no longer starts an m4 comment; presumably so that macros written after it in immediates get expanded
     47      1.1  mrg 
     48      1.1  mrg C INPUT PARAMETERS
     49      1.1  mrg define(`ap', x0)		C pointer to the limbs; advanced by the post-indexed loads
     50      1.1  mrg define(`n',  x1)		C number of limbs still to process
     51      1.1  mrg 
     52      1.1  mrg C We sum into 16 16-bit counters in v4,v5, but at the end we sum them and end
     53      1.1  mrg C up with 8 16-bit counters.  Therefore, we can sum to 8(2^16-1) bits, or
     54      1.1  mrg C 8(2^16-1)/64 = 0x1fff limbs (rounded down).  We use a chunksize close to
     55      1.1  mrg C that, but a multiple of 8, which allows the huge count code to jump deep into the code (at L(chu)).
     56      1.1  mrg 
     57      1.1  mrg define(`maxsize',  0x1fff)		C largest n summed in a single pass
     58      1.1  mrg define(`chunksize',0x1ff0)		C limbs summed per pass when n > maxsize
     59      1.1  mrg 
     60      1.1  mrg ASM_START()
     61      1.1  mrg PROLOGUE(mpn_popcount)
     62      1.1  mrg C mpn_popcount: return in x0 the number of set bits in the n limbs at ap.
     63      1.1  mrg 	mov	x11, #maxsize			C x11 is read again by the loop at L(gt8k)
     64      1.1  mrg 	cmp	n, x11
     65      1.1  mrg 	b.hi	L(gt8k)				C unsigned n > maxsize: sum chunk by chunk
     66      1.1  mrg C Single pass for n <= maxsize; also the target of the final bl from L(gt8k).
     67      1.1  mrg L(lt8k):
     68      1.1  mrg 	movi	v4.16b, #0			C clear summation register
     69      1.1  mrg 	movi	v5.16b, #0			C clear summation register
     70      1.1  mrg C Bit 0 of n set: count one limb.
     71      1.1  mrg 	tbz	n, #0, L(xx0)
     72      1.1  mrg 	sub	n, n, #1
     73      1.1  mrg 	ld1	{v0.1d}, [ap], #8		C load 1 limb; the 64-bit load leaves the upper half of v0 zero
     74      1.1  mrg 	cnt	v6.16b, v0.16b			C per-byte bit counts
     75      1.1  mrg 	uadalp	v4.8h,  v6.16b			C could also splat
     76      1.1  mrg C Bit 1 of n set: count two limbs.
     77      1.1  mrg L(xx0):	tbz	n, #1, L(x00)
     78      1.1  mrg 	sub	n, n, #2
     79      1.1  mrg 	ld1	{v0.2d}, [ap], #16		C load 2 limbs
     80      1.1  mrg 	cnt	v6.16b, v0.16b
     81      1.1  mrg 	uadalp	v4.8h,  v6.16b			C add byte pairs into the 16-bit counters
     82      1.1  mrg C Bit 2 of n set: load four limbs; they are counted at L(sum) or inside the loop.
     83      1.1  mrg L(x00):	tbz	n, #2, L(000)
     84      1.1  mrg 	subs	n, n, #4
     85      1.1  mrg 	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
     86      1.1  mrg 	b.ls	L(sum)				C taken when n reached 0: only v0,v1 remain to be counted
     87      1.1  mrg C n is now a nonzero multiple of 8: load four more limbs and join the loop at L(mid).
     88      1.1  mrg L(gt4):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
     89      1.1  mrg 	sub	n, n, #4			C the subs at L(mid) then borrows exactly when no further 8 limbs remain
     90      1.1  mrg 	cnt	v6.16b, v0.16b
     91      1.1  mrg 	cnt	v7.16b, v1.16b
     92      1.1  mrg 	b	L(mid)
     93      1.1  mrg C Bits 0-2 of n have been handled, so n is a multiple of 8 here.
     94      1.1  mrg L(000):	subs	n, n, #8
     95      1.1  mrg 	b.lo	L(e0)				C n was 0: all counts so far are in v4, v5 is still zero
     96      1.1  mrg C Also entered by bl from L(gt8k), with n already reduced by 8 and v4,v5 cleared.
     97      1.1  mrg L(chu):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
     98      1.1  mrg 	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
     99      1.1  mrg 	cnt	v6.16b, v2.16b
    100      1.1  mrg 	cnt	v7.16b, v3.16b
    101      1.1  mrg 	subs	n, n, #8
    102      1.1  mrg 	b.lo	L(end)				C no further group of 8 limbs to load
    103      1.1  mrg C Main loop, 8 limbs per iteration; loads, cnt and uadalp of neighbouring groups are interleaved.
    104      1.1  mrg L(top):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
    105      1.1  mrg 	uadalp	v4.8h,  v6.16b
    106      1.1  mrg 	cnt	v6.16b, v0.16b
    107      1.1  mrg 	uadalp	v5.8h,  v7.16b
    108      1.1  mrg 	cnt	v7.16b, v1.16b
    109      1.1  mrg L(mid):	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
    110      1.1  mrg 	subs	n, n, #8
    111      1.1  mrg 	uadalp	v4.8h,  v6.16b
    112      1.1  mrg 	cnt	v6.16b, v2.16b
    113      1.1  mrg 	uadalp	v5.8h,  v7.16b
    114      1.1  mrg 	cnt	v7.16b, v3.16b
    115      1.1  mrg 	b.hs	L(top)				C loop while the subs at L(mid) did not borrow
    116      1.1  mrg C Drain: v6,v7 hold counts not yet accumulated, v0,v1 hold limbs not yet counted.
    117      1.1  mrg L(end):	uadalp	v4.8h,  v6.16b
    118      1.1  mrg 	uadalp	v5.8h,  v7.16b
    119      1.1  mrg L(sum):	cnt	v6.16b, v0.16b
    120      1.1  mrg 	cnt	v7.16b, v1.16b
    121      1.1  mrg 	uadalp	v4.8h,  v6.16b
    122      1.1  mrg 	uadalp	v5.8h,  v7.16b
    123      1.1  mrg 	add	v4.8h, v4.8h, v5.8h		C merge the two sets of counters
    124      1.1  mrg 					C we have 8 16-bit counts
    125      1.1  mrg L(e0):	uaddlp	v4.4s,  v4.8h		C we have 4 32-bit counts
    126      1.1  mrg 	uaddlp	v4.2d,  v4.4s		C we have 2 64-bit counts
    127      1.1  mrg 	mov	x0, v4.d[0]		C x0 is also ap, so the pointer is gone from here on
    128      1.1  mrg 	mov	x1, v4.d[1]		C x1 is also n
    129      1.1  mrg 	add	x0, x0, x1		C result = sum of the two 64-bit counts
    130      1.1  mrg 	ret
    131      1.1  mrg 
    132      1.1  mrg C Code for count > maxsize.  Splits operand and calls above code.
    133      1.1  mrg define(`ap2', x5)			C caller-saves reg not used above
    134      1.1  mrg L(gt8k):
    135      1.1  mrg 	mov	x8, x30			C keep the return address; the bl instructions below overwrite x30
    136      1.1  mrg 	mov	x7, n			C full count (caller-saves reg not used above)
    137      1.1  mrg 	mov	x4, #0			C total sum  (caller-saves reg not used above)
    138      1.1  mrg 	mov	x9, #chunksize*8	C bytes per chunk (caller-saves reg not used above)
    139      1.1  mrg 	mov	x10, #chunksize		C limbs per chunk (caller-saves reg not used above)
    140      1.1  mrg C Sum one chunk of chunksize limbs per iteration while more than maxsize limbs remain.
    141      1.1  mrg 1:	add	ap2, ap, x9		C point at subsequent block
    142      1.1  mrg 	mov	n, #chunksize-8		C count for this invocation, adjusted for entry pt (the subs at L(000) is skipped)
    143      1.1  mrg 	movi	v4.16b, #0		C clear chunk summation register
    144      1.1  mrg 	movi	v5.16b, #0		C clear chunk summation register
    145      1.1  mrg 	bl	L(chu)			C jump deep inside code
    146      1.1  mrg 	add	x4, x4, x0		C add the chunk count to the total
    147      1.1  mrg 	mov	ap, ap2			C put chunk pointer in place for calls (x0 held the chunk count)
    148      1.1  mrg 	sub	x7, x7, x10		C limbs still to do
    149      1.1  mrg 	cmp	x7, x11			C x11 still holds maxsize from the function entry
    150      1.1  mrg 	b.hi	1b
    151      1.1  mrg 
    152      1.1  mrg 	mov	n, x7			C count for final invocation
    153      1.1  mrg 	bl	L(lt8k)
    154      1.1  mrg 	add	x0, x4, x0		C grand total
    155      1.1  mrg 	mov	x30, x8			C restore the return address saved at L(gt8k)
    156      1.1  mrg 	ret
    157      1.1  mrg EPILOGUE()
    158