#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"

.section	".text",#alloc,#execinstr

! ----------------------------------------------------------------------
! Montgomery multiplication for UltraSPARC using the FPU multiplier.
!
! C-equivalent (standard bn_mul_mont contract — verify against header):
!   int bn_mul_mont_fpu(BN_ULONG *rp, const BN_ULONG *ap,
!                       const BN_ULONG *bp, const BN_ULONG *np,
!                       const BN_ULONG *n0, int num);
! In:    %i0..%i5 per SPARC ABI; presumably %i0=rp, %i1=ap, %i2=bp,
!        %i3=np, %i4=n0 (two 32-bit halves), %i5=num (in 32-bit words).
!        NOTE(review): some pre-existing "ap[j]"/"bp[i]" annotations
!        below look swapped relative to this mapping — confirm against
!        the perlasm source before relying on them.
! Out:   %i0 = 1 on success; 0 ("unsupported input value") when
!        num < 4 or num is odd (the FP path needs an even word count).
!
! Strategy: operands are split into 16-bit limbs, converted to double
! and multiplied/accumulated with fmuld/faddd (the FP multiplier is
! faster than the integer one on these CPUs); partial products are
! dumped to the stack with fdtox/std and reassembled with 16-bit
! shift/mask integer code that also propagates the carry in %g1.
! Locals: %l0=tp (temporary product), %l1/%l2 and %l3/%l4 point at the
! ends of the double-format low/high-limb copies of a[] and n[],
! %l5=i (outer index), %l6=j (inner index), %l7=0xffff limb mask,
! %g4=n0 as 64-bit value, %i4 later recycled as the top-word carry.
! ----------------------------------------------------------------------
.global bn_mul_mont_fpu
.align  32
bn_mul_mont_fpu:
	save	%sp,-STACK_FRAME-64,%sp

	! reject unsupported sizes: num must be >= 4 and even
	cmp	%i5,4
	bl,a,pn %icc,.Lret
	clr	%i0
	andcc	%i5,1,%g0	! %i5 has to be even...
	bnz,a,pn %icc,.Lret
	clr	%i0		! signal "unsupported input value"

	srl	%i5,1,%i5
	sethi	%hi(0xffff),%l7
	ld	[%i4+0],%g4		! %g4 reassigned, remember?
	or	%l7,%lo(0xffff),%l7
	ld	[%i4+4],%o0
	sllx	%o0,32,%o0
	or	%o0,%g4,%g4		! %g4=n0[1].n0[0]

	sll	%i5,3,%i5		! num*=8

	! carve 5*num bytes off the stack: tp[] plus the four
	! double-format limb vectors addressed through %l1..%l4
	add	%sp,STACK_BIAS,%o0	! real top of stack
	sll	%i5,2,%o1
	add	%o1,%i5,%o1		! %o1=num*5
	sub	%o0,%o1,%o0
	and	%o0,-2048,%o0		! optimize TLB utilization
	sub	%o0,STACK_BIAS,%sp	! alloca(5*num*8)

	rd	%asi,%o7		! save %asi
	add	%sp,STACK_BIAS+STACK_FRAME+64,%l0
	add	%l0,%i5,%l1
	add	%l1,%i5,%l1	! [an]p_[lh] point at the vectors' ends !
	add	%l1,%i5,%l2
	add	%l2,%i5,%l3
	add	%l3,%i5,%l4

	wr	%g0,210,%asi		! setup %asi for 16-bit FP loads

	add	%i0,%i5,%i0		! readjust input pointers to point
	add	%i1,%i5,%i1		! at the ends too...
	add	%i2,%i5,%i2
	add	%i3,%i5,%i3

	stx	%o7,[%sp+STACK_BIAS+STACK_FRAME+48]	! save %asi

	sub	%g0,%i5,%l5		! i=-num

	sub	%g0,%i5,%l6		! j=-num

	! ---- first iteration (i=0): compute m = a[0]*b[0]*n0 mod 2^64
	! with the integer multiplier, then seed the FP pipeline
	add	%i1,%l6,%o3
	add	%i2,%l5,%o4

	ld	[%o3+4],%g1		! bp[0]
	ld	[%o3+0],%o0
	ld	[%o4+4],%g5		! ap[0]
	sllx	%g1,32,%g1
	ld	[%o4+0],%o1
	sllx	%g5,32,%g5
	or	%g1,%o0,%o0
	or	%g5,%o1,%o1

	add	%i3,%l6,%o5

	mulx	%o1,%o0,%o0		! ap[0]*bp[0]
	mulx	%g4,%o0,%o0		! ap[0]*bp[0]*n0
	stx	%o0,[%sp+STACK_BIAS+STACK_FRAME+0]

	ld	[%o3+0],%f17	! load a[j] as pair of 32-bit words
	.word	0xa1b00c20	! fzeros %f16
	ld	[%o3+4],%f19
	.word	0xa5b00c20	! fzeros %f18
	ld	[%o5+0],%f21	! load n[j] as pair of 32-bit words
	.word	0xa9b00c20	! fzeros %f20
	ld	[%o5+4],%f23
	.word	0xadb00c20	! fzeros %f22

	! transfer b[i] to FPU as 4x16-bit values
	ldda	[%o4+2]%asi,%f0
	fxtod	%f16,%f16
	ldda	[%o4+0]%asi,%f2
	fxtod	%f18,%f18
	ldda	[%o4+6]%asi,%f4
	fxtod	%f20,%f20
	ldda	[%o4+4]%asi,%f6
	fxtod	%f22,%f22

	! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
	ldda	[%sp+STACK_BIAS+STACK_FRAME+6]%asi,%f8
	fxtod	%f0,%f0
	ldda	[%sp+STACK_BIAS+STACK_FRAME+4]%asi,%f10
	fxtod	%f2,%f2
	ldda	[%sp+STACK_BIAS+STACK_FRAME+2]%asi,%f12
	fxtod	%f4,%f4
	ldda	[%sp+STACK_BIAS+STACK_FRAME+0]%asi,%f14
	fxtod	%f6,%f6

	std	%f16,[%l1+%l6]	! save smashed ap[j] in double format
	fxtod	%f8,%f8
	std	%f18,[%l2+%l6]
	fxtod	%f10,%f10
	std	%f20,[%l3+%l6]	! save smashed np[j] in double format
	fxtod	%f12,%f12
	std	%f22,[%l4+%l6]
	fxtod	%f14,%f14

	! 16x16->exact-double partial products: a[j]*b[i] into %f32..%f46,
	! n[j]*m into %f48..%f62, summed pairwise (the sums fit a double
	! exactly, which is what makes the FP trick work)
	fmuld	%f16,%f0,%f32
	fmuld	%f20,%f8,%f48
	fmuld	%f16,%f2,%f34
	fmuld	%f20,%f10,%f50
	fmuld	%f16,%f4,%f36
	faddd	%f32,%f48,%f48
	fmuld	%f20,%f12,%f52
	fmuld	%f16,%f6,%f38
	faddd	%f34,%f50,%f50
	fmuld	%f20,%f14,%f54
	fmuld	%f18,%f0,%f40
	faddd	%f36,%f52,%f52
	fmuld	%f22,%f8,%f56
	fmuld	%f18,%f2,%f42
	faddd	%f38,%f54,%f54
	fmuld	%f22,%f10,%f58
	fmuld	%f18,%f4,%f44
	faddd	%f40,%f56,%f56
	fmuld	%f22,%f12,%f60
	fmuld	%f18,%f6,%f46
	faddd	%f42,%f58,%f58
	fmuld	%f22,%f14,%f62

	faddd	%f44,%f60,%f24	! %f60
	faddd	%f46,%f62,%f26	! %f62

	faddd	%f52,%f56,%f52
	faddd	%f54,%f58,%f54

	fdtox	%f48,%f48
	fdtox	%f50,%f50
	fdtox	%f52,%f52
	fdtox	%f54,%f54

	std	%f48,[%sp+STACK_BIAS+STACK_FRAME+0]
	add	%l6,8,%l6
	std	%f50,[%sp+STACK_BIAS+STACK_FRAME+8]
	add	%i1,%l6,%o4
	std	%f52,[%sp+STACK_BIAS+STACK_FRAME+16]
	add	%i3,%l6,%o5
	std	%f54,[%sp+STACK_BIAS+STACK_FRAME+24]

	ld	[%o4+0],%f17	! load a[j] as pair of 32-bit words

	.word	0xa1b00c20	! fzeros %f16
	ld	[%o4+4],%f19
	.word	0xa5b00c20	! fzeros %f18
	ld	[%o5+0],%f21	! load n[j] as pair of 32-bit words
	.word	0xa9b00c20	! fzeros %f20
	ld	[%o5+4],%f23
	.word	0xadb00c20	! fzeros %f22

	fxtod	%f16,%f16
	fxtod	%f18,%f18
	fxtod	%f20,%f20
	fxtod	%f22,%f22

	! j=1 iteration, software-pipelined with the integer reassembly
	! of the j=0 partial products dumped above
	ldx	[%sp+STACK_BIAS+STACK_FRAME+0],%o0
	fmuld	%f16,%f0,%f32
	ldx	[%sp+STACK_BIAS+STACK_FRAME+8],%o1
	fmuld	%f20,%f8,%f48
	ldx	[%sp+STACK_BIAS+STACK_FRAME+16],%o2
	fmuld	%f16,%f2,%f34
	ldx	[%sp+STACK_BIAS+STACK_FRAME+24],%o3
	fmuld	%f20,%f10,%f50

	srlx	%o0,16,%o7
	std	%f16,[%l1+%l6]	! save smashed ap[j] in double format
	fmuld	%f16,%f4,%f36
	add	%o7,%o1,%o1
	std	%f18,[%l2+%l6]
	faddd	%f32,%f48,%f48
	fmuld	%f20,%f12,%f52
	srlx	%o1,16,%o7
	std	%f20,[%l3+%l6]	! save smashed np[j] in double format
	fmuld	%f16,%f6,%f38
	add	%o7,%o2,%o2
	std	%f22,[%l4+%l6]
	faddd	%f34,%f50,%f50
	fmuld	%f20,%f14,%f54
	srlx	%o2,16,%o7
	fmuld	%f18,%f0,%f40
	add	%o7,%o3,%o3	! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	faddd	%f36,%f52,%f52
	fmuld	%f22,%f8,%f56
	! the j=0 64-bit result is known to be zero (that is what m was
	! chosen for), so only the carry is kept and the reassembly below
	! stays commented out
	!and	%o0,%l7,%o0
	!and	%o1,%l7,%o1
	!and	%o2,%l7,%o2
	!sllx	%o1,16,%o1
	!sllx	%o2,32,%o2
	!sllx	%o3,48,%o7
	!or	%o1,%o0,%o0
	!or	%o2,%o0,%o0
	!or	%o7,%o0,%o0	! 64-bit result
	srlx	%o3,16,%g1	! 34-bit carry
	fmuld	%f18,%f2,%f42

	faddd	%f38,%f54,%f54
	fmuld	%f22,%f10,%f58
	fmuld	%f18,%f4,%f44
	faddd	%f40,%f56,%f56
	fmuld	%f22,%f12,%f60
	fmuld	%f18,%f6,%f46
	faddd	%f42,%f58,%f58
	fmuld	%f22,%f14,%f62

	faddd	%f24,%f48,%f48
	faddd	%f26,%f50,%f50
	faddd	%f44,%f60,%f24	! %f60
	faddd	%f46,%f62,%f26	! %f62

	faddd	%f52,%f56,%f52
	faddd	%f54,%f58,%f54

	fdtox	%f48,%f48
	fdtox	%f50,%f50
	fdtox	%f52,%f52
	fdtox	%f54,%f54

	std	%f48,[%sp+STACK_BIAS+STACK_FRAME+0]
	std	%f50,[%sp+STACK_BIAS+STACK_FRAME+8]
	addcc	%l6,8,%l6
	std	%f52,[%sp+STACK_BIAS+STACK_FRAME+16]
	bz,pn	%icc,.L1stskip
	std	%f54,[%sp+STACK_BIAS+STACK_FRAME+24]

! ---- first-pass inner loop (i=0, j=2..num-1): FP multiply/accumulate
! interleaved with integer reassembly of the previous j's limbs and
! carry propagation through %g1
.align	32			! incidentally already aligned !

.L1st:
	add	%i1,%l6,%o4
	add	%i3,%l6,%o5
	ld	[%o4+0],%f17	! load a[j] as pair of 32-bit words
	.word	0xa1b00c20	! fzeros %f16
	ld	[%o4+4],%f19
	.word	0xa5b00c20	! fzeros %f18
	ld	[%o5+0],%f21	! load n[j] as pair of 32-bit words
	.word	0xa9b00c20	! fzeros %f20
	ld	[%o5+4],%f23
	.word	0xadb00c20	! fzeros %f22

	fxtod	%f16,%f16
	fxtod	%f18,%f18
	fxtod	%f20,%f20
	fxtod	%f22,%f22

	ldx	[%sp+STACK_BIAS+STACK_FRAME+0],%o0
	fmuld	%f16,%f0,%f32
	ldx	[%sp+STACK_BIAS+STACK_FRAME+8],%o1
	fmuld	%f20,%f8,%f48
	ldx	[%sp+STACK_BIAS+STACK_FRAME+16],%o2
	fmuld	%f16,%f2,%f34
	ldx	[%sp+STACK_BIAS+STACK_FRAME+24],%o3
	fmuld	%f20,%f10,%f50

	srlx	%o0,16,%o7
	std	%f16,[%l1+%l6]	! save smashed ap[j] in double format
	fmuld	%f16,%f4,%f36
	add	%o7,%o1,%o1
	std	%f18,[%l2+%l6]
	faddd	%f32,%f48,%f48
	fmuld	%f20,%f12,%f52
	srlx	%o1,16,%o7
	std	%f20,[%l3+%l6]	! save smashed np[j] in double format
	fmuld	%f16,%f6,%f38
	add	%o7,%o2,%o2
	std	%f22,[%l4+%l6]
	faddd	%f34,%f50,%f50
	fmuld	%f20,%f14,%f54
	srlx	%o2,16,%o7
	fmuld	%f18,%f0,%f40
	add	%o7,%o3,%o3	! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,%l7,%o0
	faddd	%f36,%f52,%f52
	fmuld	%f22,%f8,%f56
	and	%o1,%l7,%o1
	and	%o2,%l7,%o2
	fmuld	%f18,%f2,%f42
	sllx	%o1,16,%o1
	faddd	%f38,%f54,%f54
	fmuld	%f22,%f10,%f58
	sllx	%o2,32,%o2
	fmuld	%f18,%f4,%f44
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	faddd	%f40,%f56,%f56
	fmuld	%f22,%f12,%f60
	or	%o2,%o0,%o0
	fmuld	%f18,%f6,%f46
	or	%o7,%o0,%o0	! 64-bit result
	faddd	%f42,%f58,%f58
	fmuld	%f22,%f14,%f62
	addcc	%g1,%o0,%o0
	faddd	%f24,%f48,%f48
	srlx	%o3,16,%g1	! 34-bit carry
	faddd	%f26,%f50,%f50
	bcs,a	%xcc,.+8	! annulled skip: bump carry on overflow
	add	%g1,1,%g1

	stx	%o0,[%l0]	! tp[j-1]=

	faddd	%f44,%f60,%f24	! %f60
	faddd	%f46,%f62,%f26	! %f62

	faddd	%f52,%f56,%f52
	faddd	%f54,%f58,%f54

	fdtox	%f48,%f48
	fdtox	%f50,%f50
	fdtox	%f52,%f52
	fdtox	%f54,%f54

	std	%f48,[%sp+STACK_BIAS+STACK_FRAME+0]
	std	%f50,[%sp+STACK_BIAS+STACK_FRAME+8]
	std	%f52,[%sp+STACK_BIAS+STACK_FRAME+16]
	std	%f54,[%sp+STACK_BIAS+STACK_FRAME+24]

	addcc	%l6,8,%l6
	bnz,pt	%icc,.L1st
	add	%l0,8,%l0

! ---- drain the pipeline: fold the last two partial-product pairs
! (%f24/%f26) and store the top words tp[num-1], carry into %i4
.L1stskip:

	fdtox	%f24,%f24
	fdtox	%f26,%f26

	ldx	[%sp+STACK_BIAS+STACK_FRAME+0],%o0
	ldx	[%sp+STACK_BIAS+STACK_FRAME+8],%o1
	ldx	[%sp+STACK_BIAS+STACK_FRAME+16],%o2
	ldx	[%sp+STACK_BIAS+STACK_FRAME+24],%o3

	srlx	%o0,16,%o7
	std	%f24,[%sp+STACK_BIAS+STACK_FRAME+32]
	add	%o7,%o1,%o1
	std	%f26,[%sp+STACK_BIAS+STACK_FRAME+40]
	srlx	%o1,16,%o7
	add	%o7,%o2,%o2
	srlx	%o2,16,%o7
	add	%o7,%o3,%o3	! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,%l7,%o0
	and	%o1,%l7,%o1
	and	%o2,%l7,%o2
	sllx	%o1,16,%o1
	sllx	%o2,32,%o2
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	or	%o2,%o0,%o0
	or	%o7,%o0,%o0	! 64-bit result
	ldx	[%sp+STACK_BIAS+STACK_FRAME+32],%o4
	addcc	%g1,%o0,%o0
	ldx	[%sp+STACK_BIAS+STACK_FRAME+40],%o5
	srlx	%o3,16,%g1	! 34-bit carry
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[%l0]	! tp[j-1]=
	add	%l0,8,%l0

	srlx	%o4,16,%o7
	add	%o7,%o5,%o5
	and	%o4,%l7,%o4
	sllx	%o5,16,%o7
	or	%o7,%o4,%o4
	addcc	%g1,%o4,%o4
	srlx	%o5,48,%g1
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	mov	%g1,%i4
	stx	%o4,[%l0]	! tp[num-1]=

	ba	.Louter
	add	%l5,8,%l5

! ---- outer loop (i=1..num-1): same as the first pass, but a[j]/n[j]
! are now reloaded from the cached double-format vectors, and tp[] is
! accumulated into the running result
.align	32
.Louter:
	sub	%g0,%i5,%l6	! j=-num
	add	%sp,STACK_BIAS+STACK_FRAME+64,%l0

	add	%i1,%l6,%o3
	add	%i2,%l5,%o4

	ld	[%o3+4],%g1	! bp[i]
	ld	[%o3+0],%o0
	ld	[%o4+4],%g5	! ap[0]
	sllx	%g1,32,%g1
	ld	[%o4+0],%o1
	sllx	%g5,32,%g5
	or	%g1,%o0,%o0
	or	%g5,%o1,%o1

	ldx	[%l0],%o2	! tp[0]
	mulx	%o1,%o0,%o0
	addcc	%o2,%o0,%o0
	mulx	%g4,%o0,%o0	! (ap[0]*bp[i]+t[0])*n0
	stx	%o0,[%sp+STACK_BIAS+STACK_FRAME+0]

	! transfer b[i] to FPU as 4x16-bit values
	ldda	[%o4+2]%asi,%f0
	ldda	[%o4+0]%asi,%f2
	ldda	[%o4+6]%asi,%f4
	ldda	[%o4+4]%asi,%f6

	! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
	ldda	[%sp+STACK_BIAS+STACK_FRAME+6]%asi,%f8
	fxtod	%f0,%f0
	ldda	[%sp+STACK_BIAS+STACK_FRAME+4]%asi,%f10
	fxtod	%f2,%f2
	ldda	[%sp+STACK_BIAS+STACK_FRAME+2]%asi,%f12
	fxtod	%f4,%f4
	ldda	[%sp+STACK_BIAS+STACK_FRAME+0]%asi,%f14
	fxtod	%f6,%f6
	ldd	[%l1+%l6],%f16	! load a[j] in double format
	fxtod	%f8,%f8
	ldd	[%l2+%l6],%f18
	fxtod	%f10,%f10
	ldd	[%l3+%l6],%f20	! load n[j] in double format
	fxtod	%f12,%f12
	ldd	[%l4+%l6],%f22
	fxtod	%f14,%f14

	fmuld	%f16,%f0,%f32
	fmuld	%f20,%f8,%f48
	fmuld	%f16,%f2,%f34
	fmuld	%f20,%f10,%f50
	fmuld	%f16,%f4,%f36
	faddd	%f32,%f48,%f48
	fmuld	%f20,%f12,%f52
	fmuld	%f16,%f6,%f38
	faddd	%f34,%f50,%f50
	fmuld	%f20,%f14,%f54
	fmuld	%f18,%f0,%f40
	faddd	%f36,%f52,%f52
	fmuld	%f22,%f8,%f56
	fmuld	%f18,%f2,%f42
	faddd	%f38,%f54,%f54
	fmuld	%f22,%f10,%f58
	fmuld	%f18,%f4,%f44
	faddd	%f40,%f56,%f56
	fmuld	%f22,%f12,%f60
	fmuld	%f18,%f6,%f46
	faddd	%f42,%f58,%f58
	fmuld	%f22,%f14,%f62

	faddd	%f44,%f60,%f24	! %f60
	faddd	%f46,%f62,%f26	! %f62

	faddd	%f52,%f56,%f52
	faddd	%f54,%f58,%f54

	fdtox	%f48,%f48
	fdtox	%f50,%f50
	fdtox	%f52,%f52
	fdtox	%f54,%f54

	std	%f48,[%sp+STACK_BIAS+STACK_FRAME+0]
	std	%f50,[%sp+STACK_BIAS+STACK_FRAME+8]
	std	%f52,[%sp+STACK_BIAS+STACK_FRAME+16]
	add	%l6,8,%l6
	std	%f54,[%sp+STACK_BIAS+STACK_FRAME+24]

	ldd	[%l1+%l6],%f16	! load a[j] in double format

	ldd	[%l2+%l6],%f18
	ldd	[%l3+%l6],%f20	! load n[j] in double format
	ldd	[%l4+%l6],%f22

	fmuld	%f16,%f0,%f32
	fmuld	%f20,%f8,%f48
	fmuld	%f16,%f2,%f34
	fmuld	%f20,%f10,%f50
	fmuld	%f16,%f4,%f36
	ldx	[%sp+STACK_BIAS+STACK_FRAME+0],%o0
	faddd	%f32,%f48,%f48
	fmuld	%f20,%f12,%f52
	ldx	[%sp+STACK_BIAS+STACK_FRAME+8],%o1
	fmuld	%f16,%f6,%f38
	ldx	[%sp+STACK_BIAS+STACK_FRAME+16],%o2
	faddd	%f34,%f50,%f50
	fmuld	%f20,%f14,%f54
	ldx	[%sp+STACK_BIAS+STACK_FRAME+24],%o3
	fmuld	%f18,%f0,%f40

	srlx	%o0,16,%o7
	faddd	%f36,%f52,%f52
	fmuld	%f22,%f8,%f56
	add	%o7,%o1,%o1
	fmuld	%f18,%f2,%f42
	srlx	%o1,16,%o7
	faddd	%f38,%f54,%f54
	fmuld	%f22,%f10,%f58
	add	%o7,%o2,%o2
	fmuld	%f18,%f4,%f44
	srlx	%o2,16,%o7
	faddd	%f40,%f56,%f56
	fmuld	%f22,%f12,%f60
	add	%o7,%o3,%o3	! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	! why?
	! NOTE(review): unlike the i=0 pass, the j=0 result here is
	! tp[0]+a[0]*b[i]+n[0]*m, which is 0 mod 2^64 only after adding
	! tp[0] — hence the full reassembly and the ldx [%l0] below
	and	%o0,%l7,%o0
	fmuld	%f18,%f6,%f46
	and	%o1,%l7,%o1
	and	%o2,%l7,%o2
	faddd	%f42,%f58,%f58
	fmuld	%f22,%f14,%f62
	sllx	%o1,16,%o1
	faddd	%f24,%f48,%f48
	sllx	%o2,32,%o2
	faddd	%f26,%f50,%f50
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	faddd	%f44,%f60,%f24	! %f60
	or	%o2,%o0,%o0
	faddd	%f46,%f62,%f26	! %f62
	or	%o7,%o0,%o0	! 64-bit result
	ldx	[%l0],%o7
	faddd	%f52,%f56,%f52
	addcc	%o7,%o0,%o0
	! end-of-why?
	faddd	%f54,%f58,%f54
	srlx	%o3,16,%g1	! 34-bit carry
	fdtox	%f48,%f48
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	fdtox	%f50,%f50
	fdtox	%f52,%f52
	fdtox	%f54,%f54

	std	%f48,[%sp+STACK_BIAS+STACK_FRAME+0]
	std	%f50,[%sp+STACK_BIAS+STACK_FRAME+8]
	addcc	%l6,8,%l6
	std	%f52,[%sp+STACK_BIAS+STACK_FRAME+16]
	bz,pn	%icc,.Linnerskip
	std	%f54,[%sp+STACK_BIAS+STACK_FRAME+24]

	ba	.Linner
	nop
! ---- inner loop (i>0, j=2..num-1): like .L1st but also adds the
! previous pass's tp[j] into each reassembled word
.align	32
.Linner:
	ldd	[%l1+%l6],%f16	! load a[j] in double format
	ldd	[%l2+%l6],%f18
	ldd	[%l3+%l6],%f20	! load n[j] in double format
	ldd	[%l4+%l6],%f22

	fmuld	%f16,%f0,%f32
	fmuld	%f20,%f8,%f48
	fmuld	%f16,%f2,%f34
	fmuld	%f20,%f10,%f50
	fmuld	%f16,%f4,%f36
	ldx	[%sp+STACK_BIAS+STACK_FRAME+0],%o0
	faddd	%f32,%f48,%f48
	fmuld	%f20,%f12,%f52
	ldx	[%sp+STACK_BIAS+STACK_FRAME+8],%o1
	fmuld	%f16,%f6,%f38
	ldx	[%sp+STACK_BIAS+STACK_FRAME+16],%o2
	faddd	%f34,%f50,%f50
	fmuld	%f20,%f14,%f54
	ldx	[%sp+STACK_BIAS+STACK_FRAME+24],%o3
	fmuld	%f18,%f0,%f40

	srlx	%o0,16,%o7
	faddd	%f36,%f52,%f52
	fmuld	%f22,%f8,%f56
	add	%o7,%o1,%o1
	fmuld	%f18,%f2,%f42
	srlx	%o1,16,%o7
	faddd	%f38,%f54,%f54
	fmuld	%f22,%f10,%f58
	add	%o7,%o2,%o2
	fmuld	%f18,%f4,%f44
	srlx	%o2,16,%o7
	faddd	%f40,%f56,%f56
	fmuld	%f22,%f12,%f60
	add	%o7,%o3,%o3	! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,%l7,%o0
	fmuld	%f18,%f6,%f46
	and	%o1,%l7,%o1
	and	%o2,%l7,%o2
	faddd	%f42,%f58,%f58
	fmuld	%f22,%f14,%f62
	sllx	%o1,16,%o1
	faddd	%f24,%f48,%f48
	sllx	%o2,32,%o2
	faddd	%f26,%f50,%f50
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	faddd	%f44,%f60,%f24	! %f60
	or	%o2,%o0,%o0
	faddd	%f46,%f62,%f26	! %f62
	or	%o7,%o0,%o0	! 64-bit result
	faddd	%f52,%f56,%f52
	addcc	%g1,%o0,%o0
	ldx	[%l0+8],%o7	! tp[j]
	faddd	%f54,%f58,%f54
	srlx	%o3,16,%g1	! 34-bit carry
	fdtox	%f48,%f48
	bcs,a	%xcc,.+8	! annulled skip: bump carry on overflow
	add	%g1,1,%g1
	fdtox	%f50,%f50
	addcc	%o7,%o0,%o0	! + tp[j] from previous pass
	fdtox	%f52,%f52
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[%l0]	! tp[j-1]
	fdtox	%f54,%f54

	std	%f48,[%sp+STACK_BIAS+STACK_FRAME+0]
	std	%f50,[%sp+STACK_BIAS+STACK_FRAME+8]
	std	%f52,[%sp+STACK_BIAS+STACK_FRAME+16]
	addcc	%l6,8,%l6
	std	%f54,[%sp+STACK_BIAS+STACK_FRAME+24]
	bnz,pt	%icc,.Linner
	add	%l0,8,%l0

! ---- drain the inner-loop pipeline, add previous carry (%i4) into
! the top word and save the new carry back into %i4
.Linnerskip:

	fdtox	%f24,%f24
	fdtox	%f26,%f26

	ldx	[%sp+STACK_BIAS+STACK_FRAME+0],%o0
	ldx	[%sp+STACK_BIAS+STACK_FRAME+8],%o1
	ldx	[%sp+STACK_BIAS+STACK_FRAME+16],%o2
	ldx	[%sp+STACK_BIAS+STACK_FRAME+24],%o3

	srlx	%o0,16,%o7
	std	%f24,[%sp+STACK_BIAS+STACK_FRAME+32]
	add	%o7,%o1,%o1
	std	%f26,[%sp+STACK_BIAS+STACK_FRAME+40]
	srlx	%o1,16,%o7
	add	%o7,%o2,%o2
	srlx	%o2,16,%o7
	add	%o7,%o3,%o3	! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,%l7,%o0
	and	%o1,%l7,%o1
	and	%o2,%l7,%o2
	sllx	%o1,16,%o1
	sllx	%o2,32,%o2
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	or	%o2,%o0,%o0
	ldx	[%sp+STACK_BIAS+STACK_FRAME+32],%o4
	or	%o7,%o0,%o0	! 64-bit result
	ldx	[%sp+STACK_BIAS+STACK_FRAME+40],%o5
	addcc	%g1,%o0,%o0
	ldx	[%l0+8],%o7	! tp[j]
	srlx	%o3,16,%g1	! 34-bit carry
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	addcc	%o7,%o0,%o0
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[%l0]	! tp[j-1]
	add	%l0,8,%l0

	srlx	%o4,16,%o7
	add	%o7,%o5,%o5
	and	%o4,%l7,%o4
	sllx	%o5,16,%o7
	or	%o7,%o4,%o4
	addcc	%g1,%o4,%o4
	srlx	%o5,48,%g1
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	addcc	%i4,%o4,%o4
	stx	%o4,[%l0]	! tp[num-1]
	mov	%g1,%i4
	bcs,a	%xcc,.+8
	add	%i4,1,%i4

	addcc	%l5,8,%l5
	bnz	%icc,.Louter
	nop

! ---- conditional subtraction of the modulus: tp - n, 32 bits at a
! time through %icc carry; %g4 becomes the select mask for .Lcopy
	add	%l0,8,%l0	! adjust tp to point at the end

	orn	%g0,%g0,%g4
	sub	%g0,%i5,%o7	! n=-num
	ba	.Lsub
	subcc	%g0,%g0,%g0	! clear %icc.c

.align	32
.Lsub:
	ldx	[%l0+%o7],%o0
	add	%i3,%o7,%g1
	ld	[%g1+0],%o2
	ld	[%g1+4],%o3
	srlx	%o0,32,%o1
	subccc	%o0,%o2,%o2
	add	%i0,%o7,%g1
	subccc	%o1,%o3,%o3
	st	%o2,[%g1+0]
	add	%o7,8,%o7
	brnz,pt	%o7,.Lsub
	st	%o3,[%g1+4]
	subc	%i4,0,%g4	! %g4 = mask: all-ones if tp >= n
	sub	%g0,%i5,%o7	! n=-num
	ba	.Lcopy
	nop

! ---- select tp or tp-n via mask %g4, write result to rp, and wipe
! tp[] on the way out
.align	32
.Lcopy:
	ldx	[%l0+%o7],%o0
	add	%i0,%o7,%g1
	ld	[%g1+0],%o2
	ld	[%g1+4],%o3
	stx	%g0,[%l0+%o7]	! zap tp
	and	%o0,%g4,%o0
	srlx	%o0,32,%o1
	andn	%o2,%g4,%o2
	andn	%o3,%g4,%o3
	or	%o2,%o0,%o0
	or	%o3,%o1,%o1
	st	%o0,[%g1+0]
	add	%o7,8,%o7
	brnz,pt	%o7,.Lcopy
	st	%o1,[%g1+4]
	sub	%g0,%i5,%o7	! n=-num

! ---- wipe the stack-resident double-format copies of a[] and n[]
.Lzap:
	stx	%g0,[%l1+%o7]
	stx	%g0,[%l2+%o7]
	stx	%g0,[%l3+%o7]
	stx	%g0,[%l4+%o7]
	add	%o7,8,%o7
	brnz,pt	%o7,.Lzap
	nop

	ldx	[%sp+STACK_BIAS+STACK_FRAME+48],%o7
	wr	%g0,%o7,%asi	! restore %asi

	mov	1,%i0		! success
.Lret:
	ret
	restore
.type	bn_mul_mont_fpu,#function
.size	bn_mul_mont_fpu,(.-bn_mul_mont_fpu)
.asciz	"Montgomery Multiplication for UltraSPARC, CRYPTOGAMS by <appro (at) openssl.org>"
.align	32