mod_34lsub1.asm revision 1.1.1.2 1 1.1 mrg dnl ARM64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.
2 1.1 mrg
3 1.1 mrg dnl Copyright 2012-2014 Free Software Foundation, Inc.
4 1.1 mrg
5 1.1 mrg dnl This file is part of the GNU MP Library.
6 1.1 mrg dnl
7 1.1 mrg dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 1.1 mrg dnl it under the terms of either:
9 1.1 mrg dnl
10 1.1 mrg dnl * the GNU Lesser General Public License as published by the Free
11 1.1 mrg dnl Software Foundation; either version 3 of the License, or (at your
12 1.1 mrg dnl option) any later version.
13 1.1 mrg dnl
14 1.1 mrg dnl or
15 1.1 mrg dnl
16 1.1 mrg dnl * the GNU General Public License as published by the Free Software
17 1.1 mrg dnl Foundation; either version 2 of the License, or (at your option) any
18 1.1 mrg dnl later version.
19 1.1 mrg dnl
20 1.1 mrg dnl or both in parallel, as here.
21 1.1 mrg dnl
22 1.1 mrg dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 1.1 mrg dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 1.1 mrg dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 1.1 mrg dnl for more details.
26 1.1 mrg dnl
27 1.1 mrg dnl You should have received copies of the GNU General Public License and the
28 1.1 mrg dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 1.1 mrg dnl see https://www.gnu.org/licenses/.
30 1.1 mrg
31 1.1 mrg include(`../config.m4')
32 1.1 mrg
33 1.1 mrg C cycles/limb
34 1.1.1.2 mrg C Cortex-A53 2
35 1.1.1.2 mrg C Cortex-A57 1
36 1.1.1.2 mrg C X-Gene 1.45
37 1.1 mrg
38 1.1 mrg define(`ap', x0) C ap: pointer to the source limbs; x0 is also where the result is returned
39 1.1 mrg define(`n', x1) C n: limb count; x1 is reused as scratch once the count is no longer needed
40 1.1 mrg C With the default m4 comment delimiter, text after a # on a line is not macro-expanded; presumably the delimiter is moved out of the way so that operands following an immediate, such as L(top) in the tbz below, still expand.
41 1.1.1.2 mrg changecom(blah)
42 1.1 mrg
43 1.1 mrg C mp_limb_t mpn_mod_34lsub1 (mp_srcptr ap, mp_size_t n)
44 1.1 mrg
45 1.1 mrg C TODO
46 1.1.1.2 mrg C * An alternative inner loop which could run at 0.722 c/l on A57:
47 1.1 mrg C adds x8, x8, x2
48 1.1 mrg C adcs x9, x9, x3
49 1.1 mrg C ldp x2, x3, [ap, #-32]
50 1.1 mrg C adcs x10, x10, x4
51 1.1 mrg C adc x12, x12, xzr
52 1.1 mrg C adds x8, x8, x5
53 1.1 mrg C ldp x4, x5, [ap, #-16]
54 1.1 mrg C sub n, n, #6
55 1.1 mrg C adcs x9, x9, x6
56 1.1 mrg C adcs x10, x10, x7
57 1.1 mrg C ldp x6, x7, [ap], #48
58 1.1 mrg C adc x12, x12, xzr
59 1.1 mrg C tbz n, #63, L(top)
60 1.1 mrg
61 1.1 mrg ASM_START()
62 1.1 mrg TEXT
63 1.1 mrg ALIGN(32)
64 1.1 mrg PROLOGUE(mpn_mod_34lsub1) C in: ap = limb pointer, n = limb count; out: x0 = value congruent to the n-limb number mod 2^48-1 (not fully reduced). Uses only x0-x8 and the flags; no stack access.
65 1.1 mrg subs n, n, #3 C n = count - 3; the flags feed the b.lt two lines down
66 1.1 mrg mov x8, #0 C x8 collects carries out of x4 (weight 2^192 = 1 mod 2^48-1); mov leaves the flags alone
67 1.1 mrg b.lt L(le2) C n <= 2 (original count)
68 1.1 mrg C Three accumulators, one per limb index mod 3: x2 has weight 1, x3 weight 2^64 = 2^16, x4 weight 2^128 = 2^32 (all mod 2^48-1).
69 1.1 mrg ldp x2, x3, [ap, #0] C seed the accumulators with limbs 0, 1, 2
70 1.1 mrg ldr x4, [ap, #16]
71 1.1 mrg add ap, ap, #24 C step past the 3 limbs (24 bytes) just loaded
72 1.1 mrg subs n, n, #3 C n = count - 6
73 1.1 mrg b.lt L(sum) C n <= 5 (original count): no full group of 3 left, x8 is still 0
74 1.1 mrg cmn x0, #0 C clear carry: adding 0 cannot carry out, so the first adcs starts with carry = 0
75 1.1 mrg C Main loop: 3 limbs per pass, entered with n >= 0.
76 1.1 mrg L(top): ldp x5, x6, [ap, #0] C next three limbs -> x5, x6, x7
77 1.1 mrg ldr x7, [ap, #16]
78 1.1 mrg add ap, ap, #24
79 1.1 mrg sub n, n, #3 C plain sub, not subs: the carry flag must survive into the adcs chain
80 1.1 mrg adcs x2, x2, x5 C carry-in is the carry out of x4 from the previous pass, wrapped around because 2^192 = 1 (mod 2^48-1)
81 1.1 mrg adcs x3, x3, x6
82 1.1 mrg adcs x4, x4, x7
83 1.1 mrg tbz n, #63, L(top) C repeat while n >= 0 (sign bit clear); tbz neither reads nor writes the flags
84 1.1 mrg
85 1.1 mrg adc x8, xzr, xzr C x8 <= 1: capture the last carry out of x4
86 1.1 mrg C Tail: n is now -3, -2 or -1, meaning 0, 1 or 2 limbs remain at ap.
87 1.1 mrg L(sum): cmn n, #2 C flags of n + 2: carry clear when n = -3, zero set when n = -2
88 1.1 mrg mov x5, #0 C default for a missing limb; mov and ldr keep the cmn flags for both branches
89 1.1 mrg b.lo 1f C carry clear: nothing left, skip the first load
90 1.1 mrg ldr x5, [ap], #8 C first leftover limb, same weight class as x2
91 1.1 mrg 1: mov x6, #0
92 1.1 mrg b.ls 1f C carry clear or zero set: fewer than two limbs left, skip the second load
93 1.1 mrg ldr x6, [ap], #8 C second leftover limb, same weight class as x3
94 1.1 mrg 1: adds x2, x2, x5 C fresh carry chain for the leftovers
95 1.1 mrg adcs x3, x3, x6
96 1.1 mrg adcs x4, x4, xzr
97 1.1 mrg adc x8, x8, xzr C x8 <= 2
98 1.1 mrg C Fold x2, x3, x4 and x8 into one word using 2^48 = 1 (mod 2^48-1).
99 1.1 mrg L(sum2):
100 1.1 mrg and x0, x2, #0xffffffffffff C low 48 bits of x2
101 1.1 mrg add x0, x0, x2, lsr #48 C + top 16 bits of x2 (weight 2^48 = 1)
102 1.1 mrg add x0, x0, x8 C + saved carries (weight 2^192 = 1)
103 1.1 mrg C x3 carries weight 2^16; x8 and x1 are free to use as scratch from here on.
104 1.1 mrg lsl x8, x3, #16
105 1.1 mrg and x1, x8, #0xffffffffffff C low 32 bits of x3, placed at bits 16..47
106 1.1 mrg add x0, x0, x1
107 1.1 mrg add x0, x0, x3, lsr #32 C + top 32 bits of x3 (weight 2^16 * 2^32 = 2^48 = 1)
108 1.1 mrg C x4 carries weight 2^32.
109 1.1 mrg lsl x8, x4, #32
110 1.1 mrg and x1, x8, #0xffffffffffff C low 16 bits of x4, placed at bits 32..47
111 1.1 mrg add x0, x0, x1
112 1.1 mrg add x0, x0, x4, lsr #16 C + top 48 bits of x4 (weight 2^32 * 2^16 = 2^48 = 1)
113 1.1 mrg ret C x0 is a sum of a few terms each below 2^48, so it cannot overflow 64 bits, but it may exceed 2^48-1
114 1.1 mrg C Small counts: n holds count - 3 and x8 is already 0 from the entry sequence.
115 1.1 mrg L(le2): cmn n, #1 C zero set exactly when the original count is 2
116 1.1 mrg b.ne L(1)
117 1.1 mrg ldp x2, x3, [ap] C two limbs
118 1.1 mrg mov x4, #0 C no third limb
119 1.1 mrg b L(sum2) C share the common fold
120 1.1 mrg L(1): ldr x2, [ap] C reached for every count that took the entry b.lt except 2, so a count of 0 would still load one limb; callers presumably pass n >= 1
121 1.1 mrg and x0, x2, #0xffffffffffff C low 48 bits
122 1.1 mrg add x0, x0, x2, lsr #48 C + top 16 bits (weight 2^48 = 1)
123 1.1 mrg ret
124 1.1 mrg EPILOGUE()
125