dnl  ARM64 mpn_addmul_1 and mpn_submul_1

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2013, 2015, 2017 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C Cortex-A53	9.3-9.8
C Cortex-A57	 7.0
C X-Gene	 5.0

C NOTES
C  * It is possible to keep the carry chain alive between the addition blocks
C    and thus avoid csinc, but only for addmul_1.  Since that saves no time
C    on the tested pipelines, we keep addmul_1 and submul_1 similar.
C  * We could separate feed-in into 4 blocks, one for each residue (mod 4).
C    That is likely to save a few cycles.

C INTERFACE
C   func(rp, up, n, v0), where func is mpn_addmul_1 or mpn_submul_1 depending
C   on which OPERATION_* symbol is defined when this file is preprocessed.
C
C     rp (x0)  limb vector that is read, combined and written back in place
C     up (x1)  limb vector that is only read
C     n  (x2)  number of limbs to process
C     v0 (x3)  single multiplier limb
C
C   For each of the n limbs the 128-bit product up[i] * v0 is formed; its low
C   half plus the running carry limb is added to (addmul_1) or subtracted from
C   (submul_1) rp[i].  The final carry/borrow limb is returned in x0.
C
C   With n = 0 both tbz tests and the cbz fall through to L(le3), so 0 is
C   returned and no memory is accessed.
C
C REGISTER USE
C   x4-x7     limbs loaded from up (x4, x5 also briefly hold rp limbs in the
C             feed-in blocks)
C   x8-x11    low product halves, then the values stored back to rp
C   x12-x14   high product halves (x12, x13 are later reused for rp limbs)
C   x15       running carry limb between blocks
C   x16, x17  rp limbs in the main loop
C   Written:  x0-x2, x4-x17 and the condition flags.  No stack is used.

changecom(blah)

define(`rp', `x0')
define(`up', `x1')
define(`n',  `x2')
define(`v0', `x3')

C ADDSUB/ADDSUBC select the flag-setting add or subtract used against the rp
C limbs.  COND is the csinc condition under which NO extra 1 is folded into
C the carry limb: carry clear after adds/adcs, carry set (no borrow) after
C subs/sbcs.

ifdef(`OPERATION_addmul_1', `
  define(`ADDSUB',	adds)
  define(`ADDSUBC',	adcs)
  define(`COND',	`cc')
  define(`func',	mpn_addmul_1)')
ifdef(`OPERATION_submul_1', `
  define(`ADDSUB',	subs)
  define(`ADDSUBC',	sbcs)
  define(`COND',	`cs')
  define(`func',	mpn_submul_1)')

MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)

PROLOGUE(func)
C Start with a zero carry limb in x15 (this also clears the carry flag).
	adds	x15, xzr, xzr

C Feed-in, bit 0 of n: handle one limb.
	tbz	n, #0, L(1)

	ldr	x4, [up],#8
	mul	x8, x4, v0
	umulh	x12, x4, v0
	ldr	x4, [rp]
	ADDSUB	x8, x4, x8
C x15 = high product half, plus 1 if the add/sub above carried/borrowed.
	csinc	x15, x12, x12, COND
	str	x8, [rp],#8

C Feed-in, bit 1 of n: handle two limbs.
L(1):	tbz	n, #1, L(2)

	ldp	x4, x5, [up],#16
	mul	x8, x4, v0
	umulh	x12, x4, v0
	mul	x9, x5, v0
	umulh	x13, x5, v0
C Chain the incoming carry limb and the high halves into the low halves.
	adds	x8, x8, x15
	adcs	x9, x9, x12
C ldp does not alter the flags, so the adc below still sees the adcs carry.
	ldp	x4, x5, [rp]
	adc	x15, x13, xzr
	ADDSUB	x8, x4, x8
	ADDSUBC	x9, x5, x9
	csinc	x15, x15, x15, COND
	stp	x8, x9, [rp],#16

C From here on n counts remaining groups of four limbs.
L(2):	lsr	n, n, #2
	cbz	n, L(le3)
C Preload the first four up limbs and enter the loop at its multiply half.
	ldp	x4, x5, [up],#32
	ldp	x6, x7, [up,#-16]
	b	L(mid)
C No group of four left: the carry limb from the feed-in is the result.
L(le3):	mov	x0, x15
	ret

C Main loop, four limbs per iteration, software pipelined: the top half loads
C the up limbs for the coming multiplies and applies the products computed in
C the previous pass to rp; the half starting at L(mid) forms the next products.
	ALIGN(16)
L(top):	ldp	x4, x5, [up],#32
	ldp	x6, x7, [up,#-16]
C x16, x17, x12, x13 hold rp[0..3], loaded at the end of the previous pass.
C The stp between the two pairs leaves the flags intact, so this is one
C four-limb carry/borrow chain.
	ADDSUB	x8, x16, x8
	ADDSUBC	x9, x17, x9
	stp	x8, x9, [rp],#32
	ADDSUBC	x10, x12, x10
	ADDSUBC	x11, x13, x11
	stp	x10, x11, [rp,#-16]
C Fold the carry/borrow out of that chain into the carry limb.
	csinc	x15, x15, x15, COND
C sub (not subs) is used so the loop counter update never touches the flags.
L(mid):	sub	n, n, #1
	mul	x8, x4, v0
	umulh	x12, x4, v0
	mul	x9, x5, v0
	umulh	x13, x5, v0
C Incoming carry limb is consumed here; x15 is free to be overwritten by the
C umulh further down.
	adds	x8, x8, x15
	mul	x10, x6, v0
	umulh	x14, x6, v0
	adcs	x9, x9, x12
	mul	x11, x7, v0
	umulh	x15, x7, v0
	adcs	x10, x10, x13
C rp limbs for the pending add/sub; x12 and x13 are reloaded only after the
C adcs instructions that read them as high product halves.
	ldp	x16, x17, [rp]
	adcs	x11, x11, x14
	ldp	x12, x13, [rp,#16]
	adc	x15, x15, xzr
	cbnz	n, L(top)

C Wind-down: apply the last four products to rp and return the carry limb.
	ADDSUB	x8, x16, x8
	ADDSUBC	x9, x17, x9
	ADDSUBC	x10, x12, x10
	ADDSUBC	x11, x13, x11
	stp	x8, x9, [rp]
	stp	x10, x11, [rp,#16]
	csinc	x0, x15, x15, COND
	ret
EPILOGUE()