dnl  SPARC v9 32-bit mpn_sqr_diagonal.

dnl  Copyright 2001, 2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.


include(`../config.m4')

C INPUT PARAMETERS
C rp	i0
C up	i1
C n	i2

C Function contract (see the non-pipelined reference loop below): for each of
C the n 32-bit limbs up[i], compute the 64-bit square up[i]^2 and store it as
C two 32-bit words at rp[2i] (low) and rp[2i+1] (high).  rp therefore receives
C 2n limbs.  The square is formed via double-precision FP multiplies: the limb
C is split into hi16/lo16 halves, each half times the full limb gives an exact
C product in a double, and the two partial products are recombined in the
C integer unit (p16 << 16) + p0.
C
C This code uses a very deep software pipeline, due to the need for moving data
C forth and back between the integer registers and floating-point registers.
C
C A VIS variant of this code would make the pipeline less deep, since the
C masking now done in the integer unit could take place in the floating-point
C unit using the FAND instruction.  It would be possible to save several cycles
C too.
C
C On UltraSPARC 1 and 2, this code runs at 11 cycles/limb from the Dcache and
C not much slower from the Ecache.  It would perhaps be possible to shave off
C one cycle, but not easily.  We cannot do better than 10 cycles/limb with the
C used instructions, since we have 10 memory operations per limb.  But a VIS
C variant could run three cycles faster than the corresponding non-VIS code.

C This is non-pipelined code showing the algorithm:
C
C .Loop:
C	lduw	[up+0],%g4	C 00000000hhhhllll
C	sllx	%g4,16,%g3	C 0000hhhhllll0000
C	or	%g3,%g4,%g2	C 0000hhhhXXXXllll
C	andn	%g2,%g5,%g2	C 0000hhhh0000llll
C	stx	%g2,[%fp+80]
C	ldd	[%fp+80],%f0
C	fitod	%f0,%f4		C hi16
C	fitod	%f1,%f6		C lo16
C	ld	[up+0],%f9
C	fxtod	%f8,%f2
C	fmuld	%f2,%f4,%f4
C	fmuld	%f2,%f6,%f6
C	fdtox	%f4,%f4
C	fdtox	%f6,%f6
C	std	%f4,[%fp-24]
C	std	%f6,[%fp-16]
C	ldx	[%fp-24],%g2
C	ldx	[%fp-16],%g1
C	sllx	%g2,16,%g2
C	add	%g2,%g1,%g1
C	stw	%g1,[rp+0]
C	srlx	%g1,32,%l0
C	stw	%l0,[rp+4]
C	add	up,4,up
C	subcc	n,1,n
C	bne,pt	%icc,.Loop
C	add	rp,8,rp

define(`fanop',`fitod %f12,%f10') dnl  A quasi nop running in the FA pipe

ASM_START()

	TEXT
	ALIGN(4)
C A 32-bit zero constant.  It is loaded into %f8 once at entry; since %f9 is
C reloaded with the current limb each iteration, fxtod %f8,%f2 then converts
C the register pair %f8:%f9 = zero-extended 64-bit limb value to double.
.Lnoll:
	.word	0

PROLOGUE(mpn_sqr_diagonal)
	save	%sp,-256,%sp

C Load the zero word at .Lnoll into %f8 (PC-relative when PIC, absolute
C otherwise) -- see the note at .Lnoll above.
ifdef(`PIC',
`.Lpc:	rd	%pc,%o7
	ld	[%o7+.Lnoll-.Lpc],%f8',
`	sethi	%hi(.Lnoll),%g1
	ld	[%g1+%lo(.Lnoll)],%f8')

	sethi	%hi(0xffff0000),%g5	C mask used by andn to clear bits 16..31
	add	%i1,-8,%i1		C bias up so that [%i1+8] reads the next limb

C Wind up the software pipeline.  The n=1..4 cases peel off early with the
C pipeline only partially filled, branching into the matching point of the
C common tail code (.L1/.L2/.L3/.L4/.Ltail).  Before each such branch,
C %l3..%l6 are loaded with the addresses of the scratch slots ([%fp+72/80]
C for the masked hi/lo halves, [%fp-24/-16] and [%fp-40/-32] for the two
C double-buffered product pairs) that the tail should use, since the two
C pipeline phases alternate between the buffer pairs.
	lduw	[%i1+8],%g4
	add	%i1,4,%i1		C s1_ptr++
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	bne,pt	%icc,.L_grt_1
	andn	%g2,%g5,%g2		C 0000hhhh0000llll

	add	%i1,4,%i1		C s1_ptr++
	stx	%g2,[%fp+80]
	ld	[%i1],%f9
	ldd	[%fp+80],%f0
	fxtod	%f8,%f2
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	fmuld	%f2,%f6,%f6
	fdtox	%f4,%f4
	fdtox	%f6,%f6
	std	%f4,[%fp-24]
	std	%f6,[%fp-16]

	add	%fp, 80, %l3
	add	%fp, -24, %l4
	add	%fp, 72, %l5
	b	.L1
	add	%fp, -40, %l6

.L_grt_1:
	stx	%g2,[%fp+80]
	lduw	[%i1+8],%g4
	add	%i1,4,%i1		C s1_ptr++
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	bne,pt	%icc,.L_grt_2
	andn	%g2,%g5,%g2		C 0000hhhh0000llll

	stx	%g2,[%fp+72]
	ld	[%i1],%f9
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	fxtod	%f8,%f2
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	ldd	[%fp+72],%f0
	fdtox	%f4,%f4
	fdtox	%f6,%f6
	std	%f4,[%fp-24]
	fxtod	%f8,%f2
	std	%f6,[%fp-16]
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	fmuld	%f2,%f6,%f6
	fdtox	%f4,%f4

	add	%fp, 72, %l3
	add	%fp, -40, %l4
	add	%fp, 80, %l5
	b	.L2
	add	%fp, -24, %l6

.L_grt_2:
	stx	%g2,[%fp+72]
	lduw	[%i1+8],%g4
	ld	[%i1],%f9
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	fxtod	%f8,%f2
	bne,pt	%icc,.L_grt_3
	andn	%g2,%g5,%g2		C 0000hhhh0000llll

	stx	%g2,[%fp+80]
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+72],%f0
	fdtox	%f4,%f4
	fdtox	%f6,%f6
	std	%f4,[%fp-24]
	fxtod	%f8,%f2
	std	%f6,[%fp-16]
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	add	%fp, 80, %l3
	fmuld	%f2,%f6,%f6
	add	%fp, -24, %l4
	ldd	[%fp+80],%f0
	add	%fp, 72, %l5
	fdtox	%f4,%f4
	b	.L3
	add	%fp, -40, %l6

.L_grt_3:
	stx	%g2,[%fp+80]
	fitod	%f0,%f4
	lduw	[%i1+8],%g4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+72],%f0
	fdtox	%f4,%f4
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	fdtox	%f6,%f6
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	std	%f4,[%fp-24]
	fxtod	%f8,%f2
	std	%f6,[%fp-16]
	bne,pt	%icc,.L_grt_4
	andn	%g2,%g5,%g2		C 0000hhhh0000llll

	stx	%g2,[%fp+72]
	fitod	%f0,%f4
	fitod	%f1,%f6
	add	%fp, 72, %l3
	fmuld	%f2,%f4,%f4
	add	%fp, -40, %l4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	add	%fp, 80, %l5
	fdtox	%f4,%f4
	b	.L4
	add	%fp, -24, %l6

.L_grt_4:
	stx	%g2,[%fp+72]
	fitod	%f0,%f4
	lduw	[%i1+8],%g4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	fdtox	%f4,%f4
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	fdtox	%f6,%f6
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	std	%f4,[%fp-40]
	fxtod	%f8,%f2
	std	%f6,[%fp-32]
	be,pn	%icc,.L5
	andn	%g2,%g5,%g2		C 0000hhhh0000llll

	b,a	.Loop

C Main loop: two pipeline phases per pass, each producing one result limb
C pair.  The phases are identical except that they alternate between the
C scratch-slot pairs [%fp+80]/[%fp-24,-16] and [%fp+72]/[%fp-40,-32], so a
C phase can consume the previous phase's products while storing its own.
C The "C ---" markers delimit the intended 4-issue groups; the nops and
C fanop (an FA-pipe quasi-nop, see define above) pad groups for scheduling.
	.align	16
C --- LOOP BEGIN
.Loop:	nop
	nop
	stx	%g2,[%fp+80]
	fitod	%f0,%f4
C ---
	nop
	nop
	lduw	[%i1+8],%g4
	fitod	%f1,%f6
C ---
	nop
	nop
	ldx	[%fp-24],%g2		C p16
	fanop
C ---
	nop
	nop
	ldx	[%fp-16],%g1		C p0
	fmuld	%f2,%f4,%f4
C ---
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
C ---
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+72],%f0
	fanop
C ---
	srlx	%g1,32,%l0
	nop
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
C ---
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	nop
	stw	%l0,[%i0-4]
	fdtox	%f6,%f6
C ---
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	std	%f4,[%fp-24]
	fxtod	%f8,%f2
C ---
	std	%f6,[%fp-16]
	andn	%g2,%g5,%g2		C 0000hhhh0000llll
	be,pn	%icc,.Lend
	fanop
C --- LOOP MIDDLE
	nop
	nop
	stx	%g2,[%fp+72]
	fitod	%f0,%f4
C ---
	nop
	nop
	lduw	[%i1+8],%g4
	fitod	%f1,%f6
C ---
	nop
	nop
	ldx	[%fp-40],%g2		C p16
	fanop
C ---
	nop
	nop
	ldx	[%fp-32],%g1		C p0
	fmuld	%f2,%f4,%f4
C ---
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
C ---
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	fanop
C ---
	srlx	%g1,32,%l0
	nop
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
C ---
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	nop
	stw	%l0,[%i0-4]
	fdtox	%f6,%f6
C ---
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	std	%f4,[%fp-40]
	fxtod	%f8,%f2
C ---
	std	%f6,[%fp-32]
	andn	%g2,%g5,%g2		C 0000hhhh0000llll
	bne,pt	%icc,.Loop
	fanop
C --- LOOP END

C Pipeline wind-down: .L5 is reached when the loop count hit zero during
C wind-up's second phase, .Lend when it hit zero at the loop's midpoint.
C The two entries differ only in which scratch-slot pair (%l3..%l6) the
C shared tail at .Ltail should drain first.
.L5:	add	%fp, 80, %l3
	add	%fp, -24, %l4
	add	%fp, 72, %l5
	b	.Ltail
	add	%fp, -40, %l6

.Lend:	add	%fp, 72, %l3
	add	%fp, -40, %l4
	add	%fp, 80, %l5
	add	%fp, -24, %l6
.Ltail:	stx	%g2,[%l3]
	fitod	%f0,%f4
	fitod	%f1,%f6
	ldx	[%l4],%g2		C p16
	ldx	[%l4+8],%g1		C p0
	fmuld	%f2,%f4,%f4
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%l5],%f0
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
	stw	%l0,[%i0-4]
.L4:	fdtox	%f6,%f6
	std	%f4,[%l4]
	fxtod	%f8,%f2
	std	%f6,[%l4+8]

	fitod	%f0,%f4
	fitod	%f1,%f6
	ldx	[%l6],%g2		C p16
	ldx	[%l6+8],%g1		C p0
	fmuld	%f2,%f4,%f4
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	ldd	[%l3],%f0
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
	stw	%l0,[%i0-4]
.L3:	fdtox	%f6,%f6
	std	%f4,[%l6]
	fxtod	%f8,%f2
	std	%f6,[%l6+8]

	fitod	%f0,%f4
	fitod	%f1,%f6
	ldx	[%l4],%g2		C p16
	ldx	[%l4+8],%g1		C p0
	fmuld	%f2,%f4,%f4
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	fmuld	%f2,%f6,%f6
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
	stw	%l0,[%i0-4]
.L2:	fdtox	%f6,%f6
	std	%f4,[%l4]
	std	%f6,[%l4+8]

	ldx	[%l6],%g2		C p16
	ldx	[%l6+8],%g1		C p0
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	stw	%l0,[%i0-4]

.L1:	ldx	[%l4],%g2		C p16
	ldx	[%l4+8],%g1		C p0
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	stw	%l0,[%i0-4]

	ret
	restore	%g0,%g0,%o0
EPILOGUE(mpn_sqr_diagonal)