dnl  SPARC v9 mpn_mul_4 and mpn_addmul_4 for T3/T4/T5.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C		    cycles/limb      cycles/limb
C		       mul_4           addmul_4
C UltraSPARC T3:	21.5		22.0
C UltraSPARC T4:	 2.625		 2.75


C The code is well-scheduled and relies on OoO very little.  There is hope that
C this will run at around 2.5 and 2.75 c/l respectively, on T4.

C INPUT PARAMETERS
C   rp (%i0)   destination limb pointer
C   up (%i1)   multiplicand limb pointer
C   n  (%i2)   limb count of up; rescaled below into a biased byte index
C   vp (%i3)   pointer to the four multiplier limbs, read once at entry
C
C Both entry points share this body.  AM4 wraps the instructions that exist
C only in mpn_addmul_4 (loading the old rp limb into r0 and adding it in).
C ADDX is the first add of the high-product carry chain: a plain addcc for
C mpn_mul_4, and an addxccc for mpn_addmul_4 so that it also absorbs the carry
C left by the preceding AM4 addcc.
C
C The main loop is unrolled two ways.  L(top) retires the products of the limb
C held in u1 while issuing those for u0, and L(mid) does the reverse.  An even
C n enters at L(top) through L(evn); an odd n enters at L(mid) through L(odd).
C
C NOTE(review): tracing the address arithmetic suggests that n+3 limbs are
C stored at rp, that the most significant limb is the return value, and that
C n >= 3 is required (a smaller n would read past the end of up).  Confirm
C against the GMP internals documentation before relying on this.

define(`rp', `%i0')
define(`up', `%i1')
define(`n',  `%i2')
define(`vp', `%i3')

C Multiplier limbs.  v3 lives in the same register as vp, so vp[3] has to be
C the last value loaded through vp.
define(`v0', `%g1')
define(`v1', `%o7')
define(`v2', `%g2')
define(`v3', `%i3')

C Running sum window; w4 only ever receives a carry-out.
define(`w0', `%o0')
define(`w1', `%o1')
define(`w2', `%o2')
define(`w3', `%o3')
define(`w4', `%o4')

C Old destination limb, used by the AM4 (addmul_4) instructions only.
define(`r0', `%o5')

C The two multiplicand limbs in flight.
define(`u0', `%i4')
define(`u1', `%i5')

C Biased base pointers.  rp0 and up0 reuse the incoming rp and up registers.
define(`rp0', `rp')
define(`rp1', `%g3')
define(`rp2', `%g4')
define(`up0', `up')
define(`up1', `%g5')

ifdef(`OPERATION_mul_4',`
  define(`AM4',      `')
  define(`ADDX',     `addcc`'$1')
  define(`func',     `mpn_mul_4')
')
ifdef(`OPERATION_addmul_4',`
  define(`AM4',      `$1')
  define(`ADDX',     `addxccc($1,$2,$3)')
  define(`func',     `mpn_addmul_4')
')


MULFUNC_PROLOGUE(mpn_mul_4 mpn_addmul_4)

ASM_START()
	REGISTER(%g2,#scratch)
	REGISTER(%g3,#scratch)
PROLOGUE(func)
	save	%sp, -176, %sp

	C Entry sequence.  The parity test result set by andcc is consumed by
	C the be further down; nothing in between writes the condition codes.
	C n is turned into a byte offset carrying a -28 bias.  The same bias
	C is folded into up0/up1 and rp0/rp1/rp2, so it cancels in every
	C [base + n] access once n has been negated in the delay slot of be.
	ldx	[up + 0], u1		C load up[0] early
	andcc	n, 1, %g0		C is n odd?
	ldx	[vp + 0], v0
	sllx	n, 3, n
	ldx	[vp + 8], v1
	add	n, -28, n
	ldx	[vp + 16], v2
	add	rp, -16, rp
	ldx	[vp + 24], v3
	add	up, n, up0
	add	rp, n, rp0
	add	up0, 8, up1
	add	rp0, 8, rp1
	add	rp0, 16, rp2
	mulx	u1, v0, %l0
	mov	0, w0
	mulx	u1, v1, %l1
	mov	0, w1
	mulx	u1, v2, %l2
	mov	0, w2
	mulx	u1, v3, %l3
	mov	0, w3

	be	L(evn)
	 neg	n, n

	C Odd n: up[0] moves to u0, the next limb is fetched into u1, and the
	C loop is entered at its second half.
L(odd):	mov	u1, u0
	ldx	[up1 + n], u1
AM4(`	ldx	[rp2 + n], r0')
	umulxhi(u0, v0, %l4)
	umulxhi(u0, v1, %l5)
	umulxhi(u0, v2, %l6)
	umulxhi(u0, v3, %l7)
	b	L(mid)
	 add	n, 8, n

	C Even n: up[0] stays in u1, the next limb is fetched into u0, and
	C execution falls through into the first half of the loop.
L(evn):	ldx	[up1 + n], u0
AM4(`	ldx	[rp2 + n], r0')
	umulxhi(u1, v0, %l4)
	umulxhi(u1, v1, %l5)
	umulxhi(u1, v2, %l6)
	umulxhi(u1, v3, %l7)
	add	n, 16, n

	ALIGN(16)
L(top):	addcc	%l0, w0, w0
	mulx	u0, v0, %l0		C w 0
	addxccc(%l1, w1, w1)
	mulx	u0, v1, %l1		C w 1
	addxccc(%l2, w2, w2)
	mulx	u0, v2, %l2		C w 2
	addxccc(%l3, w3, w3)
	mulx	u0, v3, %l3		C w 3
	ldx	[up0 + n], u1
	addxc(	%g0, %g0, w4)
AM4(`	addcc	r0, w0, w0')
	stx	w0, [rp0 + n]
ADDX(`	%l4, w1, w0')
	umulxhi(u0, v0, %l4)		C w 1
AM4(`	ldx	[rp1 + n], r0')
	addxccc(%l5, w2, w1)
	umulxhi(u0, v1, %l5)		C w 2
	addxccc(%l6, w3, w2)
	umulxhi(u0, v2, %l6)		C w 3
	addxc(	%l7, w4, w3)
	umulxhi(u0, v3, %l7)		C w 4
L(mid):	addcc	%l0, w0, w0
	mulx	u1, v0, %l0		C w 1
	addxccc(%l1, w1, w1)
	mulx	u1, v1, %l1		C w 2
	addxccc(%l2, w2, w2)
	mulx	u1, v2, %l2		C w 3
	addxccc(%l3, w3, w3)
	mulx	u1, v3, %l3		C w 4
	ldx	[up1 + n], u0
	addxc(	%g0, %g0, w4)
AM4(`	addcc	r0, w0, w0')
	stx	w0, [rp1 + n]
ADDX(`	%l4, w1, w0')
	umulxhi(u1, v0, %l4)		C w 2
AM4(`	ldx	[rp2 + n], r0')
	addxccc(%l5, w2, w1)
	umulxhi(u1, v1, %l5)		C w 3
	addxccc(%l6, w3, w2)
	umulxhi(u1, v2, %l6)		C w 4
	addxc(	%l7, w4, w3)
	umulxhi(u1, v3, %l7)		C w 5
	brlz	n, L(top)
	 add	n, 16, n

	C Wind-down: one more half-iteration for the limb already in u0, with
	C no further loads from up, then the pending products are folded in
	C without issuing new multiplies.
L(end):	addcc	%l0, w0, w0
	mulx	u0, v0, %l0
	addxccc(%l1, w1, w1)
	mulx	u0, v1, %l1
	addxccc(%l2, w2, w2)
	mulx	u0, v2, %l2
	addxccc(%l3, w3, w3)
	mulx	u0, v3, %l3
	addxc(	%g0, %g0, w4)
AM4(`	addcc	r0, w0, w0')
	stx	w0, [rp0 + n]
ADDX(`	%l4, w1, w0')
	umulxhi(u0, v0, %l4)
AM4(`	ldx	[rp1 + n], r0')
	addxccc(%l5, w2, w1)
	umulxhi(u0, v1, %l5)
	addxccc(%l6, w3, w2)
	umulxhi(u0, v2, %l6)
	addxc(	%l7, w4, w3)
	umulxhi(u0, v3, %l7)
	addcc	%l0, w0, w0
	addxccc(%l1, w1, w1)
	addxccc(%l2, w2, w2)
	addxccc(%l3, w3, w3)
	addxc(	%g0, %g0, w4)
AM4(`	addcc	r0, w0, w0')
	stx	w0, [rp1 + n]
ADDX(`	%l4, w1, w0')
	addxccc(%l5, w2, w1)
	addxccc(%l6, w3, w2)
	stx	w0, [rp2 + n]
	add	n, 16, n
	stx	w1, [rp1 + n]
	stx	w2, [rp2 + n]
	C The last sum goes to %i0 (the rp0 base is no longer needed); the
	C restore in the delay slot of ret closes the register window.
	addxc(	%l7, w4, %i0)
	ret
	 restore
EPILOGUE()