Home | History | Annotate | Line # | Download | only in arm64
rsh1aors_n.asm revision 1.1.1.1
      1 dnl  ARM64 mpn_rsh1add_n and mpn_rsh1sub_n.
      2 
      3 dnl  Contributed to the GNU project by Torbjörn Granlund.
      4 
      5 dnl  Copyright 2017 Free Software Foundation, Inc.
      6 
      7 dnl  This file is part of the GNU MP Library.
      8 dnl
      9 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     10 dnl  it under the terms of either:
     11 dnl
     12 dnl    * the GNU Lesser General Public License as published by the Free
     13 dnl      Software Foundation; either version 3 of the License, or (at your
     14 dnl      option) any later version.
     15 dnl
     16 dnl  or
     17 dnl
     18 dnl    * the GNU General Public License as published by the Free Software
     19 dnl      Foundation; either version 2 of the License, or (at your option) any
     20 dnl      later version.
     21 dnl
     22 dnl  or both in parallel, as here.
     23 dnl
     24 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     25 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     26 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     27 dnl  for more details.
     28 dnl
     29 dnl  You should have received copies of the GNU General Public License and the
     30 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     31 dnl  see https://www.gnu.org/licenses/.
     32 
     33 include(`../config.m4')
     34 
     35 C	     cycles/limb   assumed optimal c/l
     36 C Cortex-A53	3.25-3.75	 3.0 steady
     37 C Cortex-A57	 2.15		 1.75
     38 C X-Gene	 2.75		 2.5
     39 
     40 changecom(blah)		C make blah the m4 comment delimiter, so # is no longer treated as a comment start
     41 C Symbolic names for the four argument registers used by the code below.
     42 define(`rp', `x0')	C destination pointer: result limbs are stored through it
     43 define(`up', `x1')	C first source pointer: limbs are loaded through it
     44 define(`vp', `x2')	C second source pointer: limbs are loaded through it
     45 define(`n',  `x3')	C limb count: bits 0 and 1 select the entry path, n >> 2 is the loop count
     46 C Select mnemonics, carry condition and function name by OPERATION_* symbol (presumably defined by the build -- confirm).
     47 ifdef(`OPERATION_rsh1add_n', `
     48   define(`ADDSUB',	adds)
     49   define(`ADDSUBC',	adcs)
     50   define(`COND',	`cs')
     51   define(`func_n',	mpn_rsh1add_n)')
     52 ifdef(`OPERATION_rsh1sub_n', `
     53   define(`ADDSUB',	subs)
     54   define(`ADDSUBC',	sbcs)
     55   define(`COND',	`cc')
     56   define(`func_n',	mpn_rsh1sub_n)')
     57 C NOTE(review): MULFUNC_PROLOGUE and ASM_START are defined outside this file; their exact effect is not visible here.
     58 MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n)
     59 
     60 ASM_START()
     61 PROLOGUE(func_n)
     62 	lsr	x18, n, #2
     63 
     64 	tbz	n, #0, L(bx0)
     65 
     66 L(bx1):	ldr	x5, [up],#8
     67 	ldr	x9, [vp],#8
     68 	tbnz	n, #1, L(b11)
     69 
     70 L(b01):	ADDSUB	x13, x5, x9
     71 	and	x10, x13, #1
     72 	cbz	x18, L(1)
     73 	ldp	x4, x5, [up],#48
     74 	ldp	x8, x9, [vp],#48
     75 	ADDSUBC	x14, x4, x8
     76 	ADDSUBC	x15, x5, x9
     77 	ldp	x4, x5, [up,#-32]
     78 	ldp	x8, x9, [vp,#-32]
     79 	extr	x17, x14, x13, #1
     80 	ADDSUBC	x12, x4, x8
     81 	ADDSUBC	x13, x5, x9
     82 	str	x17, [rp], #24
     83 	sub	x18, x18, #1
     84 	cbz	x18, L(end)
     85 	b	L(top)
     86 
     87 L(1):	cset	x14, COND
     88 	extr	x17, x14, x13, #1
     89 	str	x17, [rp]
     90 	mov	x0, x10
     91 	ret
     92 
     93 L(b11):	ADDSUB	x15, x5, x9
     94 	and	x10, x15, #1
     95 
     96 	ldp	x4, x5, [up],#32
     97 	ldp	x8, x9, [vp],#32
     98 	ADDSUBC	x12, x4, x8
     99 	ADDSUBC	x13, x5, x9
    100 	cbz	x18, L(3)
    101 	ldp	x4, x5, [up,#-16]
    102 	ldp	x8, x9, [vp,#-16]
    103 	extr	x17, x12, x15, #1
    104 	ADDSUBC	x14, x4, x8
    105 	ADDSUBC	x15, x5, x9
    106 	str	x17, [rp], #8
    107 	b	L(mid)
    108 
    109 L(3):	extr	x17, x12, x15, #1
    110 	str	x17, [rp], #8
    111 	b	L(2)
    112 
    113 L(bx0):	tbz	n, #1, L(b00)
    114 
    115 L(b10):	ldp	x4, x5, [up],#32
    116 	ldp	x8, x9, [vp],#32
    117 	ADDSUB	x12, x4, x8
    118 	ADDSUBC	x13, x5, x9
    119 	and	x10, x12, #1
    120 	cbz	x18, L(2)
    121 	ldp	x4, x5, [up,#-16]
    122 	ldp	x8, x9, [vp,#-16]
    123 	ADDSUBC	x14, x4, x8
    124 	ADDSUBC	x15, x5, x9
    125 	b	L(mid)
    126 
    127 L(b00):	ldp	x4, x5, [up],#48
    128 	ldp	x8, x9, [vp],#48
    129 	ADDSUB	x14, x4, x8
    130 	ADDSUBC	x15, x5, x9
    131 	and	x10, x14, #1
    132 	ldp	x4, x5, [up,#-32]
    133 	ldp	x8, x9, [vp,#-32]
    134 	ADDSUBC	x12, x4, x8
    135 	ADDSUBC	x13, x5, x9
    136 	add	rp, rp, #16
    137 	sub	x18, x18, #1
    138 	cbz	x18, L(end)
    139 
    140 	ALIGN(16)
    141 L(top):	ldp	x4, x5, [up,#-16]
    142 	ldp	x8, x9, [vp,#-16]
    143 	extr	x16, x15, x14, #1
    144 	extr	x17, x12, x15, #1
    145 	ADDSUBC	x14, x4, x8
    146 	ADDSUBC	x15, x5, x9
    147 	stp	x16, x17, [rp,#-16]
    148 L(mid):	ldp	x4, x5, [up],#32
    149 	ldp	x8, x9, [vp],#32
    150 	extr	x16, x13, x12, #1
    151 	extr	x17, x14, x13, #1
    152 	ADDSUBC	x12, x4, x8
    153 	ADDSUBC	x13, x5, x9
    154 	stp	x16, x17, [rp],#32
    155 	sub	x18, x18, #1
    156 	cbnz	x18, L(top)
    157 
    158 L(end):	extr	x16, x15, x14, #1
    159 	extr	x17, x12, x15, #1
    160 	stp	x16, x17, [rp,#-16]
    161 L(2):	cset	x14, COND
    162 	extr	x16, x13, x12, #1
    163 	extr	x17, x14, x13, #1
    164 	stp	x16, x17, [rp]
    165 
    166 L(ret):	mov	x0, x10
    167 	ret
    168 EPILOGUE()
    169