dnl  SPARC v9 64-bit mpn_sqr_diagonal.

dnl  Copyright 2001, 2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C		   cycles/limb
C UltraSPARC 1&2:     22
C UltraSPARC 3:	      36

C This was generated by the Sun C compiler.  It runs at 22 cycles/limb on the
C UltraSPARC-1/2, three cycles slower than theoretically possible for optimal
C code using the same algorithm.  For 1-3 limbs, a special loop was generated,
C which causes performance problems in particular for 2 and 3 limbs.
C Ultimately, this should be replaced by hand-written code in the same software
C pipeline style as e.g., addmul_1.asm.

C Register use as seen in the code below (after the save):
C   %i0   destination pointer; copied to %o0, which is the pointer stored through
C   %i1   source pointer; copied to %g1, which is the pointer loaded through
C   %i2   limb count, compared against the running index in %g2
C   %o7   0x1fffff, mask for a 21-bit piece
C   %o4   0x3fffff, mask for a 22-bit piece
C Note that %i0 and %i1 are reused as plain scratch inside .Loop.
C
C Each source limb u produces two destination words.  The limb is cut into
C   a0 = u & 0x1fffff,  a1 = (u >> 21) & 0x1fffff,  a2 = u >> 42
C the pieces are converted to double and multiplied in the FPU, and the
C products are converted back to 64-bit integers and recombined with shifts.
C
C Scratch slots in the 240-byte frame (integer <-> FP transfers go via memory;
C the offsets presumably include the 2047-byte V9 stack bias):
C   [%sp+2279]  a2		[%sp+2255]  a2*a2
C   [%sp+2271]  a1		[%sp+2247]  a2*a1
C   [%sp+2263]  a0		[%sp+2239]  a0*a0
C				[%sp+2231]  a1*a0
C				[%sp+2223]  a1*a1 + 2*a2*a0   (called m below)
C
C Recombination:
C   low  = a0*a0 + (a1*a0 << 22) + (m << 42)
C   high = a2*a1 + (a2*a2 << 20) + (m >> 22) + carry
C   carry = top bit of ((low >> 42) - (m & 0x3fffff))

ASM_START()
	REGISTER(%g2,#scratch)
	REGISTER(%g3,#scratch)
PROLOGUE(mpn_sqr_diagonal)
	save	%sp, -240, %sp

	sethi	%hi(0x1ffc00), %o0
	sethi	%hi(0x3ffc00), %o1
	add	%o0, 0x3ff, %o7			C %o7 = 0x1fffff
	cmp	%i2, 4
	add	%o1, 0x3ff, %o4			C %o4 = 0x3fffff
	mov	%i1, %g1			C %g1 = source pointer
	mov	%i0, %o0			C %o0 = destination pointer
	bl,pn	%xcc, .Lsmall			C fewer than 4 limbs: simple loop
	mov	0, %g2				C (delay slot) index = 0

C Pipeline fill.  Limb 0 is split and multiplied, limb 1 is split and its
C pieces are left in %f0/%f2/%f4, limb 2 is loaded into %o2.
	ldx	[%i1], %o1			C limb 0
	add	%i1, 24, %g1
	mov	3, %g2
	srlx	%o1, 42, %g3
	stx	%g3, [%sp+2279]
	and	%o1, %o7, %o2
	stx	%o2, [%sp+2263]
	srlx	%o1, 21, %o1
	ldd	[%sp+2279], %f0
	and	%o1, %o7, %o1
	stx	%o1, [%sp+2271]
	ldx	[%i1+8], %o2			C limb 1
	fxtod	%f0, %f12
	srlx	%o2, 21, %o1
	and	%o2, %o7, %g3
	ldd	[%sp+2263], %f2
	fmuld	%f12, %f12, %f10
	srlx	%o2, 42, %o2
	ldd	[%sp+2271], %f0
	and	%o1, %o7, %o1
	fxtod	%f2, %f8
	stx	%o2, [%sp+2279]
	stx	%o1, [%sp+2271]
	fxtod	%f0, %f0
	stx	%g3, [%sp+2263]
	fdtox	%f10, %f14
	fmuld	%f12, %f8, %f6
	ldx	[%i1+16], %o2			C limb 2
	std	%f14, [%sp+2255]
	fmuld	%f0, %f0, %f2
	fmuld	%f8, %f8, %f10
	srlx	%o2, 42, %o1
	faddd	%f6, %f6, %f6
	fmuld	%f12, %f0, %f12
	fmuld	%f0, %f8, %f8
	ldd	[%sp+2279], %f0
	ldd	[%sp+2263], %f4
	fdtox	%f10, %f10
	std	%f10, [%sp+2239]
	faddd	%f2, %f6, %f6
	ldd	[%sp+2271], %f2
	fdtox	%f12, %f12
	std	%f12, [%sp+2247]
	fdtox	%f8, %f8
	std	%f8, [%sp+2231]
	fdtox	%f6, %f6
	std	%f6, [%sp+2223]

C Main loop.  Per pass: the limb held in %o2 is split to the piece slots, the
C pieces already in %f0/%f2/%f4 are multiplied, the products read back from the
C product slots are recombined and stored, and the next limb is loaded.
.Loop:
	srlx	%o2, 21, %g3
	stx	%o1, [%sp+2279]
	add	%g2, 1, %g2
	and	%g3, %o7, %o1
	ldx	[%sp+2255], %g4
	cmp	%g2, %i2			C condition consumed by bl at loop end
	stx	%o1, [%sp+2271]
	add	%g1, 8, %g1
	add	%o0, 16, %o0
	ldx	[%sp+2239], %o1
	fxtod	%f0, %f10
	fxtod	%f4, %f14
	ldx	[%sp+2231], %i0
	ldx	[%sp+2223], %g5
	ldx	[%sp+2247], %g3
	and	%o2, %o7, %o2
	fxtod	%f2, %f8
	fmuld	%f10, %f10, %f0
	stx	%o2, [%sp+2263]
	fmuld	%f10, %f14, %f6
	ldx	[%g1-8], %o2			C next source limb
	fmuld	%f10, %f8, %f12
	fdtox	%f0, %f2
	ldd	[%sp+2279], %f0
	fmuld	%f8, %f8, %f4
	faddd	%f6, %f6, %f6
	fmuld	%f14, %f14, %f10
	std	%f2, [%sp+2255]
	sllx	%g4, 20, %g4
	ldd	[%sp+2271], %f2
	fmuld	%f8, %f14, %f8
	sllx	%i0, 22, %i1
	fdtox	%f12, %f12
	std	%f12, [%sp+2247]
	sllx	%g5, 42, %i0
	add	%o1, %i1, %o1
	faddd	%f4, %f6, %f6
	ldd	[%sp+2263], %f4
	add	%o1, %i0, %o1			C %o1 = low result word
	add	%g3, %g4, %g3
	fdtox	%f10, %f10
	std	%f10, [%sp+2239]
	srlx	%o1, 42, %g4
	and	%g5, %o4, %i0
	fdtox	%f8, %f8
	std	%f8, [%sp+2231]
	srlx	%g5, 22, %g5
	sub	%g4, %i0, %g4
	fdtox	%f6, %f6
	std	%f6, [%sp+2223]
	srlx	%g4, 63, %g4			C carry out of the low word
	add	%g3, %g5, %g3
	add	%g3, %g4, %g3			C %g3 = high result word
	stx	%o1, [%o0-16]
	srlx	%o2, 42, %o1
	bl,pt	%xcc, .Loop
	stx	%g3, [%o0-8]			C (delay slot)

C Pipeline drain: three more limbs are finished here, giving the six stores
C at [%o0-48] ... [%o0-8].
	stx	%o1, [%sp+2279]
	srlx	%o2, 21, %o1
	fxtod	%f0, %f16
	ldx	[%sp+2223], %g3
	fxtod	%f4, %f6
	and	%o2, %o7, %o3
	stx	%o3, [%sp+2263]
	fxtod	%f2, %f4
	and	%o1, %o7, %o1
	ldx	[%sp+2231], %o2
	sllx	%g3, 42, %g4
	fmuld	%f16, %f16, %f14
	stx	%o1, [%sp+2271]
	fmuld	%f16, %f6, %f8
	add	%o0, 48, %o0
	ldx	[%sp+2239], %o1
	sllx	%o2, 22, %o2
	fmuld	%f4, %f4, %f10
	ldx	[%sp+2255], %o3
	fdtox	%f14, %f14
	fmuld	%f4, %f6, %f2
	std	%f14, [%sp+2255]
	faddd	%f8, %f8, %f12
	add	%o1, %o2, %o2
	fmuld	%f16, %f4, %f4
	ldd	[%sp+2279], %f0
	sllx	%o3, 20, %g5
	add	%o2, %g4, %o2
	fmuld	%f6, %f6, %f6
	srlx	%o2, 42, %o3
	and	%g3, %o4, %g4
	srlx	%g3, 22, %g3
	faddd	%f10, %f12, %f16
	ldd	[%sp+2271], %f12
	ldd	[%sp+2263], %f8
	fxtod	%f0, %f0
	sub	%o3, %g4, %o3
	ldx	[%sp+2247], %o1
	srlx	%o3, 63, %o3
	fdtox	%f2, %f10
	fxtod	%f8, %f8
	std	%f10, [%sp+2231]
	fdtox	%f6, %f6
	std	%f6, [%sp+2239]
	add	%o1, %g5, %o1
	fmuld	%f0, %f0, %f2
	fdtox	%f16, %f16
	std	%f16, [%sp+2223]
	add	%o1, %g3, %o1
	fdtox	%f4, %f4
	std	%f4, [%sp+2247]
	fmuld	%f0, %f8, %f10
	fxtod	%f12, %f12
	add	%o1, %o3, %o1
	stx	%o2, [%o0-48]
	fmuld	%f8, %f8, %f6
	stx	%o1, [%o0-40]
	fdtox	%f2, %f2
	ldx	[%sp+2231], %o2
	faddd	%f10, %f10, %f10
	ldx	[%sp+2223], %g3
	fmuld	%f12, %f12, %f4
	fdtox	%f6, %f6
	ldx	[%sp+2239], %o1
	sllx	%o2, 22, %o2
	fmuld	%f12, %f8, %f8
	sllx	%g3, 42, %g5
	ldx	[%sp+2255], %o3
	fmuld	%f0, %f12, %f0
	add	%o1, %o2, %o2
	faddd	%f4, %f10, %f4
	ldx	[%sp+2247], %o1
	add	%o2, %g5, %o2
	and	%g3, %o4, %g4
	fdtox	%f8, %f8
	sllx	%o3, 20, %g5
	std	%f8, [%sp+2231]
	fdtox	%f0, %f0
	srlx	%o2, 42, %o3
	add	%o1, %g5, %o1
	fdtox	%f4, %f4
	srlx	%g3, 22, %g3
	sub	%o3, %g4, %o3
	std	%f6, [%sp+2239]
	std	%f4, [%sp+2223]
	srlx	%o3, 63, %o3
	add	%o1, %g3, %o1
	std	%f2, [%sp+2255]
	add	%o1, %o3, %o1
	std	%f0, [%sp+2247]
	stx	%o2, [%o0-32]
	stx	%o1, [%o0-24]
	ldx	[%sp+2231], %o2
	ldx	[%sp+2223], %o3
	ldx	[%sp+2239], %o1
	sllx	%o2, 22, %o2
	sllx	%o3, 42, %g5
	ldx	[%sp+2255], %g4
	and	%o3, %o4, %g3
	add	%o1, %o2, %o2
	ldx	[%sp+2247], %o1
	add	%o2, %g5, %o2
	stx	%o2, [%o0-16]
	sllx	%g4, 20, %g4
	srlx	%o2, 42, %o2
	add	%o1, %g4, %o1
	srlx	%o3, 22, %o3
	sub	%o2, %g3, %o2
	srlx	%o2, 63, %o2
	add	%o1, %o3, %o1
	add	%o1, %o2, %o1
	stx	%o1, [%o0-8]
	ret
	restore

C Non-pipelined loop taken when the limb count is below 4: one limb is split,
C multiplied, recombined and stored per pass.
.Lsmall:
	ldx	[%g1], %o2
.Loop0:
	and	%o2, %o7, %o1
	stx	%o1, [%sp+2263]
	add	%g2, 1, %g2
	srlx	%o2, 21, %o1
	add	%g1, 8, %g1
	srlx	%o2, 42, %o2
	stx	%o2, [%sp+2279]
	and	%o1, %o7, %o1
	ldd	[%sp+2263], %f0
	cmp	%g2, %i2			C condition consumed by bl at loop end
	stx	%o1, [%sp+2271]
	fxtod	%f0, %f6
	ldd	[%sp+2279], %f0
	ldd	[%sp+2271], %f4
	fxtod	%f0, %f2
	fmuld	%f6, %f6, %f0
	fxtod	%f4, %f10
	fmuld	%f2, %f6, %f4
	fdtox	%f0, %f0
	std	%f0, [%sp+2239]
	fmuld	%f10, %f6, %f8
	fmuld	%f10, %f10, %f0
	faddd	%f4, %f4, %f6
	fmuld	%f2, %f2, %f4
	fdtox	%f8, %f8
	std	%f8, [%sp+2231]
	fmuld	%f2, %f10, %f2
	faddd	%f0, %f6, %f0
	fdtox	%f4, %f4
	std	%f4, [%sp+2255]
	fdtox	%f2, %f2
	std	%f2, [%sp+2247]
	fdtox	%f0, %f0
	std	%f0, [%sp+2223]
	ldx	[%sp+2239], %o1
	ldx	[%sp+2255], %g4
	ldx	[%sp+2231], %o2
	sllx	%g4, 20, %g4
	ldx	[%sp+2223], %o3
	sllx	%o2, 22, %o2
	sllx	%o3, 42, %g5
	add	%o1, %o2, %o2
	ldx	[%sp+2247], %o1
	add	%o2, %g5, %o2
	stx	%o2, [%o0]			C low result word
	and	%o3, %o4, %g3
	srlx	%o2, 42, %o2
	add	%o1, %g4, %o1
	srlx	%o3, 22, %o3
	sub	%o2, %g3, %o2
	srlx	%o2, 63, %o2			C carry out of the low word
	add	%o1, %o3, %o1
	add	%o1, %o2, %o1
	stx	%o1, [%o0+8]			C high result word
	add	%o0, 16, %o0
	bl,a,pt	%xcc, .Loop0
	ldx	[%g1], %o2			C (annulled delay slot) next limb
	ret
	restore
EPILOGUE(mpn_sqr_diagonal)