dnl  Home | History | Annotate | Line # | Download | only in arm64
      1 dnl  ARM64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.
      2 
      3 dnl  Copyright 2012-2014 Free Software Foundation, Inc.
      4 
      5 dnl  This file is part of the GNU MP Library.
      6 dnl
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8 dnl  it under the terms of either:
      9 dnl
     10 dnl    * the GNU Lesser General Public License as published by the Free
     11 dnl      Software Foundation; either version 3 of the License, or (at your
     12 dnl      option) any later version.
     13 dnl
     14 dnl  or
     15 dnl
     16 dnl    * the GNU General Public License as published by the Free Software
     17 dnl      Foundation; either version 2 of the License, or (at your option) any
     18 dnl      later version.
     19 dnl
     20 dnl  or both in parallel, as here.
     21 dnl
     22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25 dnl  for more details.
     26 dnl
     27 dnl  You should have received copies of the GNU General Public License and the
     28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29 dnl  see https://www.gnu.org/licenses/.
     30 
     31 include(`../config.m4')
     32 
     33 C	     cycles/limb
     34 C Cortex-A53	 2
     35 C Cortex-A57	 1
     36 C X-Gene	 1.45
     37 
     38 define(`ap',	x0)
     39 define(`n',	x1)
     40 
     41 changecom(blah)
     42 
     43 C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)
     44 
     45 C TODO
     46 C  * An alternative inner loop which could run at 0.722 c/l on A57:
     47 C	adds	x8, x8, x2
     48 C	adcs	x9, x9, x3
     49 C	ldp	x2, x3, [ap, #-32]
     50 C	adcs	x10, x10, x4
     51 C	adc	x12, x12, xzr
     52 C	adds	x8, x8, x5
     53 C	ldp	x4, x5, [ap, #-16]
     54 C	sub	n, n, #6
     55 C	adcs	x9, x9, x6
     56 C	adcs	x10, x10, x7
     57 C	ldp	x6, x7, [ap], #48
     58 C	adc	x12, x12, xzr
     59 C	tbz	n, #63, L(top)
     60 
      61 ASM_START()
      62 	TEXT
      63 	ALIGN(32)
C Return a value congruent to {ap, n} mod 2^48-1 (see C prototype above).
C Strategy: 2^192 == 1 (mod 2^48-1), so limbs are summed into three
C 64-bit "columns" x2, x3, x4 carrying weights 2^0, 2^64, 2^128.
C Word carries are counted in x8; a carry out of the x4 column has
C weight 2^192 == 1, so it may simply be added back in.  At the end
C each column is folded into 48-bit pieces (since 2^64 == 2^16 and
C 2^128 == 2^32 mod 2^48-1) and everything is added together.
      64 PROLOGUE(mpn_mod_34lsub1)
      65 	subs	n, n, #3		C n -= 3, setting flags for size test
      66 	mov	x8, #0			C x8 = carry count (scratch in fold)
      67 	b.lt	L(le2)			C n <= 2
      68 
      69 	ldp	x2, x3, [ap, #0]	C seed the three column sums
      70 	ldr	x4, [ap, #16]		C ...with the first three limbs
      71 	add	ap, ap, #24
      72 	subs	n, n, #3
      73 	b.lt	L(sum)			C n <= 5
      74 	cmn	x0, #0			C clear carry
      75 
C Main loop: fold three limbs per iteration into the column sums.
C The adcs chain lets the carry out of x4 flow (via the flags) into
C the next iteration's x2 add, where its weight 2^192 == 1 is exact.
      76 L(top):	ldp	x5, x6, [ap, #0]
      77 	ldr	x7, [ap, #16]
      78 	add	ap, ap, #24
      79 	sub	n, n, #3		C plain sub: must not clobber carry
      80 	adcs	x2, x2, x5
      81 	adcs	x3, x3, x6
      82 	adcs	x4, x4, x7
      83 	tbz	n, #63, L(top)		C loop while n >= 0 (sign bit clear)
      84 
      85 	adc	x8, xzr, xzr		C x8 <= 1
      86 
C Here n is in [-3,-1], i.e. 0, 1, or 2 leftover limbs remain.
C cmn n, #2 sets flags from n+2:  n = -3 -> no carry (b.lo taken,
C load nothing);  n = -2 -> zero (b.ls taken, load one limb);
C n = -1 -> load both leftover limbs.
      87 L(sum):	cmn	n, #2
      88 	mov	x5, #0
      89 	b.lo	1f			C 0 leftover limbs
      90 	ldr	x5, [ap], #8
      91 1:	mov	x6, #0
      92 	b.ls	1f			C <= 1 leftover limb
      93 	ldr	x6, [ap], #8
      94 1:	adds	x2, x2, x5
      95 	adcs	x3, x3, x6
      96 	adcs	x4, x4, xzr
      97 	adc	x8, x8, xzr		C x8 <= 2
      98 
C Fold the columns into the result.  2^48 == 1 (mod 2^48-1), so a
C 64-bit column x of weight 2^s contributes
C   ((x << s) & (2^48-1)) + (x >> (48-s)).
C Weights: x2 has s = 0, x3 has s = 16 (2^64 == 2^16), x4 has s = 32
C (2^128 == 2^32); x8 counts weight-1 carries.  x8 and x1 are reused
C as scratch below.
      99 L(sum2):
     100 	and	x0, x2, #0xffffffffffff
     101 	add	x0, x0, x2, lsr #48
     102 	add	x0, x0, x8
     103 
     104 	lsl	x8, x3, #16
     105 	and	x1, x8, #0xffffffffffff
     106 	add	x0, x0, x1
     107 	add	x0, x0, x3, lsr #32
     108 
     109 	lsl	x8, x4, #32
     110 	and	x1, x8, #0xffffffffffff
     111 	add	x0, x0, x1
     112 	add	x0, x0, x4, lsr #16
     113 	ret
     114 
C Small operands: original n was 1 or 2, so n is -2 or -1 here.
C cmn n, #1 gives Z=1 exactly for n = -1 (original n = 2).
     115 L(le2):	cmn	n, #1
     116 	b.ne	L(1)
     117 	ldp	x2, x3, [ap]		C n = 2: two columns, x4 column empty
     118 	mov	x4, #0
     119 	b	L(sum2)
     120 L(1):	ldr	x2, [ap]		C n = 1: fold the single limb directly
     121 	and	x0, x2, #0xffffffffffff
     122 	add	x0, x0, x2, lsr #48
     123 	ret
     124 EPILOGUE()
    125