Home | History | Annotate | Line # | Download | only in arm64
      1      1.1  mrg dnl  ARM64 mpn_mod_34lsub1 -- remainder modulo 2^48-1 (not fully reduced).
      2      1.1  mrg 
      3      1.1  mrg dnl  Copyright 2012-2014 Free Software Foundation, Inc.
      4      1.1  mrg 
      5      1.1  mrg dnl  This file is part of the GNU MP Library.
      6      1.1  mrg dnl
      7      1.1  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8      1.1  mrg dnl  it under the terms of either:
      9      1.1  mrg dnl
     10      1.1  mrg dnl    * the GNU Lesser General Public License as published by the Free
     11      1.1  mrg dnl      Software Foundation; either version 3 of the License, or (at your
     12      1.1  mrg dnl      option) any later version.
     13      1.1  mrg dnl
     14      1.1  mrg dnl  or
     15      1.1  mrg dnl
     16      1.1  mrg dnl    * the GNU General Public License as published by the Free Software
     17      1.1  mrg dnl      Foundation; either version 2 of the License, or (at your option) any
     18      1.1  mrg dnl      later version.
     19      1.1  mrg dnl
     20      1.1  mrg dnl  or both in parallel, as here.
     21      1.1  mrg dnl
     22      1.1  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23      1.1  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24      1.1  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25      1.1  mrg dnl  for more details.
     26      1.1  mrg dnl
     27      1.1  mrg dnl  You should have received copies of the GNU General Public License and the
     28      1.1  mrg dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29      1.1  mrg dnl  see https://www.gnu.org/licenses/.
     30      1.1  mrg 
     31      1.1  mrg include(`../config.m4')
     32      1.1  mrg 
     33      1.1  mrg C	     cycles/limb
     34  1.1.1.2  mrg C Cortex-A53	 2
     35  1.1.1.2  mrg C Cortex-A57	 1
     36  1.1.1.2  mrg C X-Gene	 1.45
     37      1.1  mrg 
     38      1.1  mrg define(`ap',	x0)		C source limb pointer (first argument)
     39      1.1  mrg define(`n',	x1)		C limb count (second argument)
     40      1.1  mrg 
     41  1.1.1.2  mrg changecom(blah)		C NOTE(review): presumably disables the default m4 comment character so operands following an immediate are still macro-expanded -- confirm
     42      1.1  mrg 
     43      1.1  mrg C mp_limb_t mpn_mod_34lsub1 (mp_srcptr ap, mp_size_t n)
     44      1.1  mrg 
     45      1.1  mrg C TODO
     46  1.1.1.2  mrg C  * An alternative inner loop which could run at 0.722 c/l on A57:
     47      1.1  mrg C	adds	x8, x8, x2
     48      1.1  mrg C	adcs	x9, x9, x3
     49      1.1  mrg C	ldp	x2, x3, [ap, #-32]
     50      1.1  mrg C	adcs	x10, x10, x4
     51      1.1  mrg C	adc	x12, x12, xzr
     52      1.1  mrg C	adds	x8, x8, x5
     53      1.1  mrg C	ldp	x4, x5, [ap, #-16]
     54      1.1  mrg C	sub	n, n, #6
     55      1.1  mrg C	adcs	x9, x9, x6
     56      1.1  mrg C	adcs	x10, x10, x7
     57      1.1  mrg C	ldp	x6, x7, [ap], #48
     58      1.1  mrg C	adc	x12, x12, xzr
     59      1.1  mrg C	tbz	n, #63, L(top)
     60      1.1  mrg 
     61      1.1  mrg ASM_START()
     62      1.1  mrg 	TEXT
     63      1.1  mrg 	ALIGN(32)
     64      1.1  mrg PROLOGUE(mpn_mod_34lsub1)	C returns in x0 a value congruent to {ap,n} mod 2^48-1, not fully reduced; scratch x1-x8, no stack use
     65      1.1  mrg 	subs	n, n, #3		C n = original n - 3; flags consumed by the b.lt below
     66      1.1  mrg 	mov	x8, #0			C x8 counts carries out of x4 (weight 2^192 = 1 mod 2^48-1); mov leaves the flags alone
     67      1.1  mrg 	b.lt	L(le2)			C n <= 2 (original n): short path
     68      1.1  mrg C x2, x3, x4 accumulate the limbs at index 0, 1, 2 mod 3; seed them with the first three limbs
     69      1.1  mrg 	ldp	x2, x3, [ap, #0]
     70      1.1  mrg 	ldr	x4, [ap, #16]
     71      1.1  mrg 	add	ap, ap, #24		C step past the three limbs just read
     72      1.1  mrg 	subs	n, n, #3		C n = original n - 6
     73      1.1  mrg 	b.lt	L(sum)			C n <= 5 (original n): no further full group of three
     74      1.1  mrg 	cmn	x0, #0			C clear carry (x0 + 0 cannot carry) for the first adcs in the loop
     75      1.1  mrg C Main loop: three limbs per pass; the carry out of x4 re-enters x2 on the next pass (2^192 = 1 mod 2^48-1)
     76      1.1  mrg L(top):	ldp	x5, x6, [ap, #0]
     77      1.1  mrg 	ldr	x7, [ap, #16]
     78      1.1  mrg 	add	ap, ap, #24		C non-flag-setting add
     79      1.1  mrg 	sub	n, n, #3		C non-flag-setting sub, so the carry chain survives the loop control
     80      1.1  mrg 	adcs	x2, x2, x5		C carry in: cleared before the loop, afterwards the carry out of x4
     81      1.1  mrg 	adcs	x3, x3, x6
     82      1.1  mrg 	adcs	x4, x4, x7
     83      1.1  mrg 	tbz	n, #63, L(top)		C repeat while n >= 0 (sign bit clear); tbz does not alter flags
     84      1.1  mrg 
     85      1.1  mrg 	adc	x8, xzr, xzr		C x8 <= 1: capture the last carry out of x4
     86      1.1  mrg C Tail: n is -3, -2 or -1 here, meaning 0, 1 or 2 limbs remain
     87      1.1  mrg L(sum):	cmn	n, #2			C flags of n + 2: carry clear iff n = -3, zero set iff n = -2
     88      1.1  mrg 	mov	x5, #0			C default when no limb is loaded; flags unchanged
     89      1.1  mrg 	b.lo	1f			C carry clear (n = -3): nothing left to load
     90      1.1  mrg 	ldr	x5, [ap], #8
     91      1.1  mrg 1:	mov	x6, #0
     92      1.1  mrg 	b.ls	1f			C carry clear or zero set (n = -3 or -2): no second limb
     93      1.1  mrg 	ldr	x6, [ap], #8
     94      1.1  mrg 1:	adds	x2, x2, x5		C fresh carry chain; the loop carry is already in x8
     95      1.1  mrg 	adcs	x3, x3, x6
     96      1.1  mrg 	adcs	x4, x4, xzr
     97      1.1  mrg 	adc	x8, x8, xzr		C x8 <= 2
     98      1.1  mrg C Fold mod 2^48-1: x2 has weight 1, x3 weight 2^64 = 2^16, x4 weight 2^128 = 2^32, x8 weight 2^192 = 1
     99      1.1  mrg L(sum2):
    100      1.1  mrg 	and	x0, x2, #0xffffffffffff	C low 48 bits of x2
    101      1.1  mrg 	add	x0, x0, x2, lsr #48	C high 16 bits of x2 (2^48 = 1)
    102      1.1  mrg 	add	x0, x0, x8		C accumulated carries
    103      1.1  mrg 
    104      1.1  mrg 	lsl	x8, x3, #16		C x8 is free again; reuse it as a temporary
    105      1.1  mrg 	and	x1, x8, #0xffffffffffff	C low 32 bits of x3, times 2^16
    106      1.1  mrg 	add	x0, x0, x1
    107      1.1  mrg 	add	x0, x0, x3, lsr #32	C high 32 bits of x3 (2^32 * 2^64 = 2^96 = 1)
    108      1.1  mrg 
    109      1.1  mrg 	lsl	x8, x4, #32
    110      1.1  mrg 	and	x1, x8, #0xffffffffffff	C low 16 bits of x4, times 2^32
    111      1.1  mrg 	add	x0, x0, x1
    112      1.1  mrg 	add	x0, x0, x4, lsr #16	C high 48 bits of x4 (2^16 * 2^128 = 2^144 = 1)
    113      1.1  mrg 	ret
    114      1.1  mrg C Short path for original n <= 2: n is -1 for two limbs, -2 for one limb
    115      1.1  mrg L(le2):	cmn	n, #1			C zero set iff n = -1, i.e. exactly two limbs
    116      1.1  mrg 	b.ne	L(1)			C otherwise one limb; NOTE(review): original n = 0 also lands here and reads ap[0] -- presumably callers guarantee n >= 1
    117      1.1  mrg 	ldp	x2, x3, [ap]
    118      1.1  mrg 	mov	x4, #0			C no third limb; x8 is still 0 from the entry sequence
    119      1.1  mrg 	b	L(sum2)
    120      1.1  mrg L(1):	ldr	x2, [ap]		C single limb: fold its 64 bits as 48 + 16
    121      1.1  mrg 	and	x0, x2, #0xffffffffffff
    122      1.1  mrg 	add	x0, x0, x2, lsr #48
    123      1.1  mrg 	ret
    124      1.1  mrg EPILOGUE()
    125