dnl  ARM64 mpn_rsh1add_n and mpn_rsh1sub_n.
2
dnl  Contributed to the GNU project by Torbjörn Granlund.
4
5 dnl Copyright 2017 Free Software Foundation, Inc.
6
7 dnl This file is part of the GNU MP Library.
8 dnl
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
11 dnl
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
15 dnl
16 dnl or
17 dnl
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
21 dnl
22 dnl or both in parallel, as here.
23 dnl
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
28 dnl
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
32
33 include(`../config.m4')
34
C                 cycles/limb     assumed optimal c/l
C Cortex-A53       3.25-3.75       3.0 steady
C Cortex-A57       2.15            1.75
C X-Gene           2.75            2.5
39
C Turn off the default m4 comment delimiter.  AArch64 immediates are written
C with a leading hash sign, and text that follows one on the same line (for
C instance a local label reference after a bit number) must still be
C macro-expanded.
changecom(blah)

C Registers holding the four arguments, in order: destination pointer, first
C source pointer, second source pointer, limb count.
define(`rp', `x0')
define(`up', `x1')
define(`vp', `x2')
define(`n',  `x3')

C Select the operation being built.
C   ADDSUB    flag-setting add/subtract used for the lowest limb
C   ADDSUBC   flag-setting add/subtract with carry used for all later limbs
C   COND      condition that is true when the final operation carried
C             (add: carry set) or borrowed (sub: carry clear)
C   func_n    name of the entry point
ifdef(`OPERATION_rsh1add_n', `
  define(`ADDSUB',	adds)
  define(`ADDSUBC',	adcs)
  define(`COND',	`cs')
  define(`func_n',	mpn_rsh1add_n)')
ifdef(`OPERATION_rsh1sub_n', `
  define(`ADDSUB',	subs)
  define(`ADDSUBC',	sbcs)
  define(`COND',	`cc')
  define(`func_n',	mpn_rsh1sub_n)')

C Entry points this file can provide (presumably read by the build machinery;
C confirm against config.m4 / configure).
MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n)
59
ASM_START()

C func_n (rp, up, vp, n)
C
C Adds (rsh1add_n) or subtracts (rsh1sub_n) the n-limb operands at up and vp,
C shifts the result right by one bit with the final carry/borrow supplying
C the new top bit, and stores n limbs at rp.  The bit shifted out (bit 0 of
C the least significant sum/difference limb) is returned in x0.
C
C Assumes n >= 1: the n mod 4 = 0 entry unconditionally handles four limbs.
C
C Register use:
C   x6        4-limb groups still to process (n / 4 on entry)
C   x4,x5     limbs loaded from up
C   x8,x9     limbs loaded from vp
C   x12-x15   sum/difference limbs not yet shifted and stored
C   x16,x17   shifted limbs ready to be stored
C   x10       return value
C
C The group counter lives in x6 rather than x18.  AAPCS64 sets x18 aside as
C the platform register, and platforms such as Darwin and Windows reserve it,
C so portable code must not write it.  x6 is caller-saved and otherwise unused
C in this function.
C
C The carry flag links the ADDSUB/ADDSUBC chain across the whole operand, so
C everything placed between those instructions must leave the flags alone
C (and, extr, sub, cbz/cbnz, tbz/tbnz, loads and stores all do).
PROLOGUE(func_n)
        lsr     x6, n, #2

        tbz     n, #0, L(bx0)

C n is odd: fetch the lowest limb pair on its own.
L(bx1): ldr     x5, [up],#8
        ldr     x9, [vp],#8
        tbnz    n, #1, L(b11)

C n mod 4 = 1
L(b01): ADDSUB  x13, x5, x9
        and     x10, x13, #1
        cbz     x6, L(1)
        ldp     x4, x5, [up],#48
        ldp     x8, x9, [vp],#48
        ADDSUBC x14, x4, x8
        ADDSUBC x15, x5, x9
        ldp     x4, x5, [up,#-32]
        ldp     x8, x9, [vp,#-32]
        extr    x17, x14, x13, #1
        ADDSUBC x12, x4, x8
        ADDSUBC x13, x5, x9
        str     x17, [rp], #24
        sub     x6, x6, #1
        cbz     x6, L(end)
        b       L(top)

C n = 1: the carry/borrow becomes the top bit of the single result limb.
L(1):   cset    x14, COND
        extr    x17, x14, x13, #1
        str     x17, [rp]
        mov     x0, x10
        ret

C n mod 4 = 3
L(b11): ADDSUB  x15, x5, x9
        and     x10, x15, #1

        ldp     x4, x5, [up],#32
        ldp     x8, x9, [vp],#32
        ADDSUBC x12, x4, x8
        ADDSUBC x13, x5, x9
        cbz     x6, L(3)
        ldp     x4, x5, [up,#-16]
        ldp     x8, x9, [vp,#-16]
        extr    x17, x12, x15, #1
        ADDSUBC x14, x4, x8
        ADDSUBC x15, x5, x9
        str     x17, [rp], #8
        b       L(mid)

C n = 3: store the lowest limb, then share the two-limb tail.
L(3):   extr    x17, x12, x15, #1
        str     x17, [rp], #8
        b       L(2)

L(bx0): tbz     n, #1, L(b00)

C n mod 4 = 2
L(b10): ldp     x4, x5, [up],#32
        ldp     x8, x9, [vp],#32
        ADDSUB  x12, x4, x8
        ADDSUBC x13, x5, x9
        and     x10, x12, #1
        cbz     x6, L(2)
        ldp     x4, x5, [up,#-16]
        ldp     x8, x9, [vp,#-16]
        ADDSUBC x14, x4, x8
        ADDSUBC x15, x5, x9
        b       L(mid)

C n mod 4 = 0
L(b00): ldp     x4, x5, [up],#48
        ldp     x8, x9, [vp],#48
        ADDSUB  x14, x4, x8
        ADDSUBC x15, x5, x9
        and     x10, x14, #1
        ldp     x4, x5, [up,#-32]
        ldp     x8, x9, [vp,#-32]
        ADDSUBC x12, x4, x8
        ADDSUBC x13, x5, x9
        add     rp, rp, #16
        sub     x6, x6, #1
        cbz     x6, L(end)

C Main loop, four limbs per iteration.  On entry to L(top) the registers
C x14, x15, x12, x13 hold, from least to most significant, four limbs that
C have been added/subtracted but not yet shifted and stored.  Each extr must
C read its operands before the following ADDSUBC overwrites them.
        ALIGN(16)
L(top): ldp     x4, x5, [up,#-16]
        ldp     x8, x9, [vp,#-16]
        extr    x16, x15, x14, #1
        extr    x17, x12, x15, #1
        ADDSUBC x14, x4, x8
        ADDSUBC x15, x5, x9
        stp     x16, x17, [rp,#-16]
L(mid): ldp     x4, x5, [up],#32
        ldp     x8, x9, [vp],#32
        extr    x16, x13, x12, #1
        extr    x17, x14, x13, #1
        ADDSUBC x12, x4, x8
        ADDSUBC x13, x5, x9
        stp     x16, x17, [rp],#32
        sub     x6, x6, #1
        cbnz    x6, L(top)

C Drain the four pending limbs; the final carry/borrow (captured in x14 at
C L(2)) is shifted in as the top bit of the most significant result limb.
L(end): extr    x16, x15, x14, #1
        extr    x17, x12, x15, #1
        stp     x16, x17, [rp,#-16]
L(2):   cset    x14, COND
        extr    x16, x13, x12, #1
        extr    x17, x14, x13, #1
        stp     x16, x17, [rp]

L(ret): mov     x0, x10
        ret
EPILOGUE()
169