1 1.1 christos .text 2 1.1 christos 3 1.1 christos .globl bn_mul_mont 4 1.1 christos .type bn_mul_mont,%function 5 1.1 christos .align 5 6 1.1 christos bn_mul_mont: 7 1.1 christos tst x5,#7 8 1.1 christos b.eq __bn_sqr8x_mont 9 1.1 christos tst x5,#3 10 1.1 christos b.eq __bn_mul4x_mont 11 1.1 christos .Lmul_mont: 12 1.1 christos stp x29,x30,[sp,#-64]! 13 1.1 christos add x29,sp,#0 14 1.1 christos stp x19,x20,[sp,#16] 15 1.1 christos stp x21,x22,[sp,#32] 16 1.1 christos stp x23,x24,[sp,#48] 17 1.1 christos 18 1.1 christos ldr x9,[x2],#8 // bp[0] 19 1.1 christos sub x22,sp,x5,lsl#3 20 1.1 christos ldp x7,x8,[x1],#16 // ap[0..1] 21 1.1 christos lsl x5,x5,#3 22 1.1 christos ldr x4,[x4] // *n0 23 1.1 christos and x22,x22,#-16 // ABI says so 24 1.1 christos ldp x13,x14,[x3],#16 // np[0..1] 25 1.1 christos 26 1.1 christos mul x6,x7,x9 // ap[0]*bp[0] 27 1.1 christos sub x21,x5,#16 // j=num-2 28 1.1 christos umulh x7,x7,x9 29 1.1 christos mul x10,x8,x9 // ap[1]*bp[0] 30 1.1 christos umulh x11,x8,x9 31 1.1 christos 32 1.1 christos mul x15,x6,x4 // "tp[0]"*n0 33 1.1 christos mov sp,x22 // alloca 34 1.1 christos 35 1.1 christos // (*) mul x12,x13,x15 // np[0]*m1 36 1.1 christos umulh x13,x13,x15 37 1.1 christos mul x16,x14,x15 // np[1]*m1 38 1.1 christos // (*) adds x12,x12,x6 // discarded 39 1.1 christos // (*) As for removal of first multiplication and addition 40 1.1 christos // instructions. The outcome of first addition is 41 1.1 christos // guaranteed to be zero, which leaves two computationally 42 1.1 christos // significant outcomes: it either carries or not. Then 43 1.1 christos // question is when does it carry? Is there alternative 44 1.1 christos // way to deduce it? If you follow operations, you can 45 1.1 christos // observe that condition for carry is quite simple: 46 1.1 christos // x6 being non-zero. So that carry can be calculated 47 1.1 christos // by adding -1 to x6. That's what next instruction does. 
48 1.1 christos subs xzr,x6,#1 // (*) 49 1.1 christos umulh x17,x14,x15 50 1.1 christos adc x13,x13,xzr 51 1.1 christos cbz x21,.L1st_skip 52 1.1 christos 53 1.1 christos .L1st: 54 1.1 christos ldr x8,[x1],#8 55 1.1 christos adds x6,x10,x7 56 1.1 christos sub x21,x21,#8 // j-- 57 1.1 christos adc x7,x11,xzr 58 1.1 christos 59 1.1 christos ldr x14,[x3],#8 60 1.1 christos adds x12,x16,x13 61 1.1 christos mul x10,x8,x9 // ap[j]*bp[0] 62 1.1 christos adc x13,x17,xzr 63 1.1 christos umulh x11,x8,x9 64 1.1 christos 65 1.1 christos adds x12,x12,x6 66 1.1 christos mul x16,x14,x15 // np[j]*m1 67 1.1 christos adc x13,x13,xzr 68 1.1 christos umulh x17,x14,x15 69 1.1 christos str x12,[x22],#8 // tp[j-1] 70 1.1 christos cbnz x21,.L1st 71 1.1 christos 72 1.1 christos .L1st_skip: 73 1.1 christos adds x6,x10,x7 74 1.1 christos sub x1,x1,x5 // rewind x1 75 1.1 christos adc x7,x11,xzr 76 1.1 christos 77 1.1 christos adds x12,x16,x13 78 1.1 christos sub x3,x3,x5 // rewind x3 79 1.1 christos adc x13,x17,xzr 80 1.1 christos 81 1.1 christos adds x12,x12,x6 82 1.1 christos sub x20,x5,#8 // i=num-1 83 1.1 christos adcs x13,x13,x7 84 1.1 christos 85 1.1 christos adc x19,xzr,xzr // upmost overflow bit 86 1.1 christos stp x12,x13,[x22] 87 1.1 christos 88 1.1 christos .Louter: 89 1.1 christos ldr x9,[x2],#8 // bp[i] 90 1.1 christos ldp x7,x8,[x1],#16 91 1.1 christos ldr x23,[sp] // tp[0] 92 1.1 christos add x22,sp,#8 93 1.1 christos 94 1.1 christos mul x6,x7,x9 // ap[0]*bp[i] 95 1.1 christos sub x21,x5,#16 // j=num-2 96 1.1 christos umulh x7,x7,x9 97 1.1 christos ldp x13,x14,[x3],#16 98 1.1 christos mul x10,x8,x9 // ap[1]*bp[i] 99 1.1 christos adds x6,x6,x23 100 1.1 christos umulh x11,x8,x9 101 1.1 christos adc x7,x7,xzr 102 1.1 christos 103 1.1 christos mul x15,x6,x4 104 1.1 christos sub x20,x20,#8 // i-- 105 1.1 christos 106 1.1 christos // (*) mul x12,x13,x15 // np[0]*m1 107 1.1 christos umulh x13,x13,x15 108 1.1 christos mul x16,x14,x15 // np[1]*m1 109 1.1 christos // (*) adds x12,x12,x6 
110 1.1 christos subs xzr,x6,#1 // (*) 111 1.1 christos umulh x17,x14,x15 112 1.1 christos cbz x21,.Linner_skip 113 1.1 christos 114 1.1 christos .Linner: 115 1.1 christos ldr x8,[x1],#8 116 1.1 christos adc x13,x13,xzr 117 1.1 christos ldr x23,[x22],#8 // tp[j] 118 1.1 christos adds x6,x10,x7 119 1.1 christos sub x21,x21,#8 // j-- 120 1.1 christos adc x7,x11,xzr 121 1.1 christos 122 1.1 christos adds x12,x16,x13 123 1.1 christos ldr x14,[x3],#8 124 1.1 christos adc x13,x17,xzr 125 1.1 christos 126 1.1 christos mul x10,x8,x9 // ap[j]*bp[i] 127 1.1 christos adds x6,x6,x23 128 1.1 christos umulh x11,x8,x9 129 1.1 christos adc x7,x7,xzr 130 1.1 christos 131 1.1 christos mul x16,x14,x15 // np[j]*m1 132 1.1 christos adds x12,x12,x6 133 1.1 christos umulh x17,x14,x15 134 1.1 christos str x12,[x22,#-16] // tp[j-1] 135 1.1 christos cbnz x21,.Linner 136 1.1 christos 137 1.1 christos .Linner_skip: 138 1.1 christos ldr x23,[x22],#8 // tp[j] 139 1.1 christos adc x13,x13,xzr 140 1.1 christos adds x6,x10,x7 141 1.1 christos sub x1,x1,x5 // rewind x1 142 1.1 christos adc x7,x11,xzr 143 1.1 christos 144 1.1 christos adds x12,x16,x13 145 1.1 christos sub x3,x3,x5 // rewind x3 146 1.1 christos adcs x13,x17,x19 147 1.1 christos adc x19,xzr,xzr 148 1.1 christos 149 1.1 christos adds x6,x6,x23 150 1.1 christos adc x7,x7,xzr 151 1.1 christos 152 1.1 christos adds x12,x12,x6 153 1.1 christos adcs x13,x13,x7 154 1.1 christos adc x19,x19,xzr // upmost overflow bit 155 1.1 christos stp x12,x13,[x22,#-16] 156 1.1 christos 157 1.1 christos cbnz x20,.Louter 158 1.1 christos 159 1.1 christos // Final step. We see if result is larger than modulus, and 160 1.1 christos // if it is, subtract the modulus. But comparison implies 161 1.1 christos // subtraction. So we subtract modulus, see if it borrowed, 162 1.1 christos // and conditionally copy original value. 
163 1.1 christos ldr x23,[sp] // tp[0] 164 1.1 christos add x22,sp,#8 165 1.1 christos ldr x14,[x3],#8 // np[0] 166 1.1 christos subs x21,x5,#8 // j=num-1 and clear borrow 167 1.1 christos mov x1,x0 168 1.1 christos .Lsub: 169 1.1 christos sbcs x8,x23,x14 // tp[j]-np[j] 170 1.1 christos ldr x23,[x22],#8 171 1.1 christos sub x21,x21,#8 // j-- 172 1.1 christos ldr x14,[x3],#8 173 1.1 christos str x8,[x1],#8 // rp[j]=tp[j]-np[j] 174 1.1 christos cbnz x21,.Lsub 175 1.1 christos 176 1.1 christos sbcs x8,x23,x14 177 1.1 christos sbcs x19,x19,xzr // did it borrow? 178 1.1 christos str x8,[x1],#8 // rp[num-1] 179 1.1 christos 180 1.1 christos ldr x23,[sp] // tp[0] 181 1.1 christos add x22,sp,#8 182 1.1 christos ldr x8,[x0],#8 // rp[0] 183 1.1 christos sub x5,x5,#8 // num-- 184 1.1 christos nop 185 1.1 christos .Lcond_copy: 186 1.1 christos sub x5,x5,#8 // num-- 187 1.1 christos csel x14,x23,x8,lo // did it borrow? 188 1.1 christos ldr x23,[x22],#8 189 1.1 christos ldr x8,[x0],#8 190 1.1 christos str xzr,[x22,#-16] // wipe tp 191 1.1 christos str x14,[x0,#-16] 192 1.1 christos cbnz x5,.Lcond_copy 193 1.1 christos 194 1.1 christos csel x14,x23,x8,lo 195 1.1 christos str xzr,[x22,#-8] // wipe tp 196 1.1 christos str x14,[x0,#-8] 197 1.1 christos 198 1.1 christos ldp x19,x20,[x29,#16] 199 1.1 christos mov sp,x29 200 1.1 christos ldp x21,x22,[x29,#32] 201 1.1 christos mov x0,#1 202 1.1 christos ldp x23,x24,[x29,#48] 203 1.1 christos ldr x29,[sp],#64 204 1.1 christos ret 205 1.1 christos .size bn_mul_mont,.-bn_mul_mont 206 1.1 christos .type __bn_sqr8x_mont,%function 207 1.1 christos .align 5 208 1.1 christos __bn_sqr8x_mont: 209 1.1 christos cmp x1,x2 210 1.1 christos b.ne __bn_mul4x_mont 211 1.1 christos .Lsqr8x_mont: 212 1.1 christos .inst 0xd503233f // paciasp 213 1.1 christos stp x29,x30,[sp,#-128]! 
214 1.1 christos add x29,sp,#0 215 1.1 christos stp x19,x20,[sp,#16] 216 1.1 christos stp x21,x22,[sp,#32] 217 1.1 christos stp x23,x24,[sp,#48] 218 1.1 christos stp x25,x26,[sp,#64] 219 1.1 christos stp x27,x28,[sp,#80] 220 1.1 christos stp x0,x3,[sp,#96] // offload rp and np 221 1.1 christos 222 1.1 christos ldp x6,x7,[x1,#8*0] 223 1.1 christos ldp x8,x9,[x1,#8*2] 224 1.1 christos ldp x10,x11,[x1,#8*4] 225 1.1 christos ldp x12,x13,[x1,#8*6] 226 1.1 christos 227 1.1 christos sub x2,sp,x5,lsl#4 228 1.1 christos lsl x5,x5,#3 229 1.1 christos ldr x4,[x4] // *n0 230 1.1 christos mov sp,x2 // alloca 231 1.1 christos sub x27,x5,#8*8 232 1.1 christos b .Lsqr8x_zero_start 233 1.1 christos 234 1.1 christos .Lsqr8x_zero: 235 1.1 christos sub x27,x27,#8*8 236 1.1 christos stp xzr,xzr,[x2,#8*0] 237 1.1 christos stp xzr,xzr,[x2,#8*2] 238 1.1 christos stp xzr,xzr,[x2,#8*4] 239 1.1 christos stp xzr,xzr,[x2,#8*6] 240 1.1 christos .Lsqr8x_zero_start: 241 1.1 christos stp xzr,xzr,[x2,#8*8] 242 1.1 christos stp xzr,xzr,[x2,#8*10] 243 1.1 christos stp xzr,xzr,[x2,#8*12] 244 1.1 christos stp xzr,xzr,[x2,#8*14] 245 1.1 christos add x2,x2,#8*16 246 1.1 christos cbnz x27,.Lsqr8x_zero 247 1.1 christos 248 1.1 christos add x3,x1,x5 249 1.1 christos add x1,x1,#8*8 250 1.1 christos mov x19,xzr 251 1.1 christos mov x20,xzr 252 1.1 christos mov x21,xzr 253 1.1 christos mov x22,xzr 254 1.1 christos mov x23,xzr 255 1.1 christos mov x24,xzr 256 1.1 christos mov x25,xzr 257 1.1 christos mov x26,xzr 258 1.1 christos mov x2,sp 259 1.1 christos str x4,[x29,#112] // offload n0 260 1.1 christos 261 1.1 christos // Multiply everything but a[i]*a[i] 262 1.1 christos .align 4 263 1.1 christos .Lsqr8x_outer_loop: 264 1.1 christos // a[1]a[0] (i) 265 1.1 christos // a[2]a[0] 266 1.1 christos // a[3]a[0] 267 1.1 christos // a[4]a[0] 268 1.1 christos // a[5]a[0] 269 1.1 christos // a[6]a[0] 270 1.1 christos // a[7]a[0] 271 1.1 christos // a[2]a[1] (ii) 272 1.1 christos // a[3]a[1] 273 1.1 christos // a[4]a[1] 
274 1.1 christos // a[5]a[1] 275 1.1 christos // a[6]a[1] 276 1.1 christos // a[7]a[1] 277 1.1 christos // a[3]a[2] (iii) 278 1.1 christos // a[4]a[2] 279 1.1 christos // a[5]a[2] 280 1.1 christos // a[6]a[2] 281 1.1 christos // a[7]a[2] 282 1.1 christos // a[4]a[3] (iv) 283 1.1 christos // a[5]a[3] 284 1.1 christos // a[6]a[3] 285 1.1 christos // a[7]a[3] 286 1.1 christos // a[5]a[4] (v) 287 1.1 christos // a[6]a[4] 288 1.1 christos // a[7]a[4] 289 1.1 christos // a[6]a[5] (vi) 290 1.1 christos // a[7]a[5] 291 1.1 christos // a[7]a[6] (vii) 292 1.1 christos 293 1.1 christos mul x14,x7,x6 // lo(a[1..7]*a[0]) (i) 294 1.1 christos mul x15,x8,x6 295 1.1 christos mul x16,x9,x6 296 1.1 christos mul x17,x10,x6 297 1.1 christos adds x20,x20,x14 // t[1]+lo(a[1]*a[0]) 298 1.1 christos mul x14,x11,x6 299 1.1 christos adcs x21,x21,x15 300 1.1 christos mul x15,x12,x6 301 1.1 christos adcs x22,x22,x16 302 1.1 christos mul x16,x13,x6 303 1.1 christos adcs x23,x23,x17 304 1.1 christos umulh x17,x7,x6 // hi(a[1..7]*a[0]) 305 1.1 christos adcs x24,x24,x14 306 1.1 christos umulh x14,x8,x6 307 1.1 christos adcs x25,x25,x15 308 1.1 christos umulh x15,x9,x6 309 1.1 christos adcs x26,x26,x16 310 1.1 christos umulh x16,x10,x6 311 1.1 christos stp x19,x20,[x2],#8*2 // t[0..1] 312 1.1 christos adc x19,xzr,xzr // t[8] 313 1.1 christos adds x21,x21,x17 // t[2]+lo(a[1]*a[0]) 314 1.1 christos umulh x17,x11,x6 315 1.1 christos adcs x22,x22,x14 316 1.1 christos umulh x14,x12,x6 317 1.1 christos adcs x23,x23,x15 318 1.1 christos umulh x15,x13,x6 319 1.1 christos adcs x24,x24,x16 320 1.1 christos mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii) 321 1.1 christos adcs x25,x25,x17 322 1.1 christos mul x17,x9,x7 323 1.1 christos adcs x26,x26,x14 324 1.1 christos mul x14,x10,x7 325 1.1 christos adc x19,x19,x15 326 1.1 christos 327 1.1 christos mul x15,x11,x7 328 1.1 christos adds x22,x22,x16 329 1.1 christos mul x16,x12,x7 330 1.1 christos adcs x23,x23,x17 331 1.1 christos mul x17,x13,x7 332 1.1 christos adcs 
x24,x24,x14 333 1.1 christos umulh x14,x8,x7 // hi(a[2..7]*a[1]) 334 1.1 christos adcs x25,x25,x15 335 1.1 christos umulh x15,x9,x7 336 1.1 christos adcs x26,x26,x16 337 1.1 christos umulh x16,x10,x7 338 1.1 christos adcs x19,x19,x17 339 1.1 christos umulh x17,x11,x7 340 1.1 christos stp x21,x22,[x2],#8*2 // t[2..3] 341 1.1 christos adc x20,xzr,xzr // t[9] 342 1.1 christos adds x23,x23,x14 343 1.1 christos umulh x14,x12,x7 344 1.1 christos adcs x24,x24,x15 345 1.1 christos umulh x15,x13,x7 346 1.1 christos adcs x25,x25,x16 347 1.1 christos mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii) 348 1.1 christos adcs x26,x26,x17 349 1.1 christos mul x17,x10,x8 350 1.1 christos adcs x19,x19,x14 351 1.1 christos mul x14,x11,x8 352 1.1 christos adc x20,x20,x15 353 1.1 christos 354 1.1 christos mul x15,x12,x8 355 1.1 christos adds x24,x24,x16 356 1.1 christos mul x16,x13,x8 357 1.1 christos adcs x25,x25,x17 358 1.1 christos umulh x17,x9,x8 // hi(a[3..7]*a[2]) 359 1.1 christos adcs x26,x26,x14 360 1.1 christos umulh x14,x10,x8 361 1.1 christos adcs x19,x19,x15 362 1.1 christos umulh x15,x11,x8 363 1.1 christos adcs x20,x20,x16 364 1.1 christos umulh x16,x12,x8 365 1.1 christos stp x23,x24,[x2],#8*2 // t[4..5] 366 1.1 christos adc x21,xzr,xzr // t[10] 367 1.1 christos adds x25,x25,x17 368 1.1 christos umulh x17,x13,x8 369 1.1 christos adcs x26,x26,x14 370 1.1 christos mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv) 371 1.1 christos adcs x19,x19,x15 372 1.1 christos mul x15,x11,x9 373 1.1 christos adcs x20,x20,x16 374 1.1 christos mul x16,x12,x9 375 1.1 christos adc x21,x21,x17 376 1.1 christos 377 1.1 christos mul x17,x13,x9 378 1.1 christos adds x26,x26,x14 379 1.1 christos umulh x14,x10,x9 // hi(a[4..7]*a[3]) 380 1.1 christos adcs x19,x19,x15 381 1.1 christos umulh x15,x11,x9 382 1.1 christos adcs x20,x20,x16 383 1.1 christos umulh x16,x12,x9 384 1.1 christos adcs x21,x21,x17 385 1.1 christos umulh x17,x13,x9 386 1.1 christos stp x25,x26,[x2],#8*2 // t[6..7] 387 1.1 christos adc x22,xzr,xzr // 
t[11] 388 1.1 christos adds x19,x19,x14 389 1.1 christos mul x14,x11,x10 // lo(a[5..7]*a[4]) (v) 390 1.1 christos adcs x20,x20,x15 391 1.1 christos mul x15,x12,x10 392 1.1 christos adcs x21,x21,x16 393 1.1 christos mul x16,x13,x10 394 1.1 christos adc x22,x22,x17 395 1.1 christos 396 1.1 christos umulh x17,x11,x10 // hi(a[5..7]*a[4]) 397 1.1 christos adds x20,x20,x14 398 1.1 christos umulh x14,x12,x10 399 1.1 christos adcs x21,x21,x15 400 1.1 christos umulh x15,x13,x10 401 1.1 christos adcs x22,x22,x16 402 1.1 christos mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi) 403 1.1 christos adc x23,xzr,xzr // t[12] 404 1.1 christos adds x21,x21,x17 405 1.1 christos mul x17,x13,x11 406 1.1 christos adcs x22,x22,x14 407 1.1 christos umulh x14,x12,x11 // hi(a[6..7]*a[5]) 408 1.1 christos adc x23,x23,x15 409 1.1 christos 410 1.1 christos umulh x15,x13,x11 411 1.1 christos adds x22,x22,x16 412 1.1 christos mul x16,x13,x12 // lo(a[7]*a[6]) (vii) 413 1.1 christos adcs x23,x23,x17 414 1.1 christos umulh x17,x13,x12 // hi(a[7]*a[6]) 415 1.1 christos adc x24,xzr,xzr // t[13] 416 1.1 christos adds x23,x23,x14 417 1.1 christos sub x27,x3,x1 // done yet? 
418 1.1 christos adc x24,x24,x15 419 1.1 christos 420 1.1 christos adds x24,x24,x16 421 1.1 christos sub x14,x3,x5 // rewinded ap 422 1.1 christos adc x25,xzr,xzr // t[14] 423 1.1 christos add x25,x25,x17 424 1.1 christos 425 1.1 christos cbz x27,.Lsqr8x_outer_break 426 1.1 christos 427 1.1 christos mov x4,x6 428 1.1 christos ldp x6,x7,[x2,#8*0] 429 1.1 christos ldp x8,x9,[x2,#8*2] 430 1.1 christos ldp x10,x11,[x2,#8*4] 431 1.1 christos ldp x12,x13,[x2,#8*6] 432 1.1 christos adds x19,x19,x6 433 1.1 christos adcs x20,x20,x7 434 1.1 christos ldp x6,x7,[x1,#8*0] 435 1.1 christos adcs x21,x21,x8 436 1.1 christos adcs x22,x22,x9 437 1.1 christos ldp x8,x9,[x1,#8*2] 438 1.1 christos adcs x23,x23,x10 439 1.1 christos adcs x24,x24,x11 440 1.1 christos ldp x10,x11,[x1,#8*4] 441 1.1 christos adcs x25,x25,x12 442 1.1 christos mov x0,x1 443 1.1 christos adcs x26,xzr,x13 444 1.1 christos ldp x12,x13,[x1,#8*6] 445 1.1 christos add x1,x1,#8*8 446 1.1 christos //adc x28,xzr,xzr // moved below 447 1.1 christos mov x27,#-8*8 448 1.1 christos 449 1.1 christos // a[8]a[0] 450 1.1 christos // a[9]a[0] 451 1.1 christos // a[a]a[0] 452 1.1 christos // a[b]a[0] 453 1.1 christos // a[c]a[0] 454 1.1 christos // a[d]a[0] 455 1.1 christos // a[e]a[0] 456 1.1 christos // a[f]a[0] 457 1.1 christos // a[8]a[1] 458 1.1 christos // a[f]a[1]........................ 459 1.1 christos // a[8]a[2] 460 1.1 christos // a[f]a[2]........................ 461 1.1 christos // a[8]a[3] 462 1.1 christos // a[f]a[3]........................ 463 1.1 christos // a[8]a[4] 464 1.1 christos // a[f]a[4]........................ 465 1.1 christos // a[8]a[5] 466 1.1 christos // a[f]a[5]........................ 467 1.1 christos // a[8]a[6] 468 1.1 christos // a[f]a[6]........................ 469 1.1 christos // a[8]a[7] 470 1.1 christos // a[f]a[7]........................ 
471 1.1 christos .Lsqr8x_mul: 472 1.1 christos mul x14,x6,x4 473 1.1 christos adc x28,xzr,xzr // carry bit, modulo-scheduled 474 1.1 christos mul x15,x7,x4 475 1.1 christos add x27,x27,#8 476 1.1 christos mul x16,x8,x4 477 1.1 christos mul x17,x9,x4 478 1.1 christos adds x19,x19,x14 479 1.1 christos mul x14,x10,x4 480 1.1 christos adcs x20,x20,x15 481 1.1 christos mul x15,x11,x4 482 1.1 christos adcs x21,x21,x16 483 1.1 christos mul x16,x12,x4 484 1.1 christos adcs x22,x22,x17 485 1.1 christos mul x17,x13,x4 486 1.1 christos adcs x23,x23,x14 487 1.1 christos umulh x14,x6,x4 488 1.1 christos adcs x24,x24,x15 489 1.1 christos umulh x15,x7,x4 490 1.1 christos adcs x25,x25,x16 491 1.1 christos umulh x16,x8,x4 492 1.1 christos adcs x26,x26,x17 493 1.1 christos umulh x17,x9,x4 494 1.1 christos adc x28,x28,xzr 495 1.1 christos str x19,[x2],#8 496 1.1 christos adds x19,x20,x14 497 1.1 christos umulh x14,x10,x4 498 1.1 christos adcs x20,x21,x15 499 1.1 christos umulh x15,x11,x4 500 1.1 christos adcs x21,x22,x16 501 1.1 christos umulh x16,x12,x4 502 1.1 christos adcs x22,x23,x17 503 1.1 christos umulh x17,x13,x4 504 1.1 christos ldr x4,[x0,x27] 505 1.1 christos adcs x23,x24,x14 506 1.1 christos adcs x24,x25,x15 507 1.1 christos adcs x25,x26,x16 508 1.1 christos adcs x26,x28,x17 509 1.1 christos //adc x28,xzr,xzr // moved above 510 1.1 christos cbnz x27,.Lsqr8x_mul 511 1.1 christos // note that carry flag is guaranteed 512 1.1 christos // to be zero at this point 513 1.1 christos cmp x1,x3 // done yet? 
514 1.1 christos b.eq .Lsqr8x_break 515 1.1 christos 516 1.1 christos ldp x6,x7,[x2,#8*0] 517 1.1 christos ldp x8,x9,[x2,#8*2] 518 1.1 christos ldp x10,x11,[x2,#8*4] 519 1.1 christos ldp x12,x13,[x2,#8*6] 520 1.1 christos adds x19,x19,x6 521 1.1 christos ldr x4,[x0,#-8*8] 522 1.1 christos adcs x20,x20,x7 523 1.1 christos ldp x6,x7,[x1,#8*0] 524 1.1 christos adcs x21,x21,x8 525 1.1 christos adcs x22,x22,x9 526 1.1 christos ldp x8,x9,[x1,#8*2] 527 1.1 christos adcs x23,x23,x10 528 1.1 christos adcs x24,x24,x11 529 1.1 christos ldp x10,x11,[x1,#8*4] 530 1.1 christos adcs x25,x25,x12 531 1.1 christos mov x27,#-8*8 532 1.1 christos adcs x26,x26,x13 533 1.1 christos ldp x12,x13,[x1,#8*6] 534 1.1 christos add x1,x1,#8*8 535 1.1 christos //adc x28,xzr,xzr // moved above 536 1.1 christos b .Lsqr8x_mul 537 1.1 christos 538 1.1 christos .align 4 539 1.1 christos .Lsqr8x_break: 540 1.1 christos ldp x6,x7,[x0,#8*0] 541 1.1 christos add x1,x0,#8*8 542 1.1 christos ldp x8,x9,[x0,#8*2] 543 1.1 christos sub x14,x3,x1 // is it last iteration? 
544 1.1 christos ldp x10,x11,[x0,#8*4] 545 1.1 christos sub x15,x2,x14 546 1.1 christos ldp x12,x13,[x0,#8*6] 547 1.1 christos cbz x14,.Lsqr8x_outer_loop 548 1.1 christos 549 1.1 christos stp x19,x20,[x2,#8*0] 550 1.1 christos ldp x19,x20,[x15,#8*0] 551 1.1 christos stp x21,x22,[x2,#8*2] 552 1.1 christos ldp x21,x22,[x15,#8*2] 553 1.1 christos stp x23,x24,[x2,#8*4] 554 1.1 christos ldp x23,x24,[x15,#8*4] 555 1.1 christos stp x25,x26,[x2,#8*6] 556 1.1 christos mov x2,x15 557 1.1 christos ldp x25,x26,[x15,#8*6] 558 1.1 christos b .Lsqr8x_outer_loop 559 1.1 christos 560 1.1 christos .align 4 561 1.1 christos .Lsqr8x_outer_break: 562 1.1 christos // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0] 563 1.1 christos ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0] 564 1.1 christos ldp x15,x16,[sp,#8*1] 565 1.1 christos ldp x11,x13,[x14,#8*2] 566 1.1 christos add x1,x14,#8*4 567 1.1 christos ldp x17,x14,[sp,#8*3] 568 1.1 christos 569 1.1 christos stp x19,x20,[x2,#8*0] 570 1.1 christos mul x19,x7,x7 571 1.1 christos stp x21,x22,[x2,#8*2] 572 1.1 christos umulh x7,x7,x7 573 1.1 christos stp x23,x24,[x2,#8*4] 574 1.1 christos mul x8,x9,x9 575 1.1 christos stp x25,x26,[x2,#8*6] 576 1.1 christos mov x2,sp 577 1.1 christos umulh x9,x9,x9 578 1.1 christos adds x20,x7,x15,lsl#1 579 1.1 christos extr x15,x16,x15,#63 580 1.1 christos sub x27,x5,#8*4 581 1.1 christos 582 1.1 christos .Lsqr4x_shift_n_add: 583 1.1 christos adcs x21,x8,x15 584 1.1 christos extr x16,x17,x16,#63 585 1.1 christos sub x27,x27,#8*4 586 1.1 christos adcs x22,x9,x16 587 1.1 christos ldp x15,x16,[x2,#8*5] 588 1.1 christos mul x10,x11,x11 589 1.1 christos ldp x7,x9,[x1],#8*2 590 1.1 christos umulh x11,x11,x11 591 1.1 christos mul x12,x13,x13 592 1.1 christos umulh x13,x13,x13 593 1.1 christos extr x17,x14,x17,#63 594 1.1 christos stp x19,x20,[x2,#8*0] 595 1.1 christos adcs x23,x10,x17 596 1.1 christos extr x14,x15,x14,#63 597 1.1 christos stp x21,x22,[x2,#8*2] 598 1.1 christos adcs x24,x11,x14 
599 1.1 christos ldp x17,x14,[x2,#8*7] 600 1.1 christos extr x15,x16,x15,#63 601 1.1 christos adcs x25,x12,x15 602 1.1 christos extr x16,x17,x16,#63 603 1.1 christos adcs x26,x13,x16 604 1.1 christos ldp x15,x16,[x2,#8*9] 605 1.1 christos mul x6,x7,x7 606 1.1 christos ldp x11,x13,[x1],#8*2 607 1.1 christos umulh x7,x7,x7 608 1.1 christos mul x8,x9,x9 609 1.1 christos umulh x9,x9,x9 610 1.1 christos stp x23,x24,[x2,#8*4] 611 1.1 christos extr x17,x14,x17,#63 612 1.1 christos stp x25,x26,[x2,#8*6] 613 1.1 christos add x2,x2,#8*8 614 1.1 christos adcs x19,x6,x17 615 1.1 christos extr x14,x15,x14,#63 616 1.1 christos adcs x20,x7,x14 617 1.1 christos ldp x17,x14,[x2,#8*3] 618 1.1 christos extr x15,x16,x15,#63 619 1.1 christos cbnz x27,.Lsqr4x_shift_n_add 620 1.1 christos ldp x1,x4,[x29,#104] // pull np and n0 621 1.1 christos 622 1.1 christos adcs x21,x8,x15 623 1.1 christos extr x16,x17,x16,#63 624 1.1 christos adcs x22,x9,x16 625 1.1 christos ldp x15,x16,[x2,#8*5] 626 1.1 christos mul x10,x11,x11 627 1.1 christos umulh x11,x11,x11 628 1.1 christos stp x19,x20,[x2,#8*0] 629 1.1 christos mul x12,x13,x13 630 1.1 christos umulh x13,x13,x13 631 1.1 christos stp x21,x22,[x2,#8*2] 632 1.1 christos extr x17,x14,x17,#63 633 1.1 christos adcs x23,x10,x17 634 1.1 christos extr x14,x15,x14,#63 635 1.1 christos ldp x19,x20,[sp,#8*0] 636 1.1 christos adcs x24,x11,x14 637 1.1 christos extr x15,x16,x15,#63 638 1.1 christos ldp x6,x7,[x1,#8*0] 639 1.1 christos adcs x25,x12,x15 640 1.1 christos extr x16,xzr,x16,#63 641 1.1 christos ldp x8,x9,[x1,#8*2] 642 1.1 christos adc x26,x13,x16 643 1.1 christos ldp x10,x11,[x1,#8*4] 644 1.1 christos 645 1.1 christos // Reduce by 512 bits per iteration 646 1.1 christos mul x28,x4,x19 // t[0]*n0 647 1.1 christos ldp x12,x13,[x1,#8*6] 648 1.1 christos add x3,x1,x5 649 1.1 christos ldp x21,x22,[sp,#8*2] 650 1.1 christos stp x23,x24,[x2,#8*4] 651 1.1 christos ldp x23,x24,[sp,#8*4] 652 1.1 christos stp x25,x26,[x2,#8*6] 653 1.1 christos ldp 
x25,x26,[sp,#8*6] 654 1.1 christos add x1,x1,#8*8 655 1.1 christos mov x30,xzr // initial top-most carry 656 1.1 christos mov x2,sp 657 1.1 christos mov x27,#8 658 1.1 christos 659 1.1 christos .Lsqr8x_reduction: 660 1.1 christos // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0) 661 1.1 christos mul x15,x7,x28 662 1.1 christos sub x27,x27,#1 663 1.1 christos mul x16,x8,x28 664 1.1 christos str x28,[x2],#8 // put aside t[0]*n0 for tail processing 665 1.1 christos mul x17,x9,x28 666 1.1 christos // (*) adds xzr,x19,x14 667 1.1 christos subs xzr,x19,#1 // (*) 668 1.1 christos mul x14,x10,x28 669 1.1 christos adcs x19,x20,x15 670 1.1 christos mul x15,x11,x28 671 1.1 christos adcs x20,x21,x16 672 1.1 christos mul x16,x12,x28 673 1.1 christos adcs x21,x22,x17 674 1.1 christos mul x17,x13,x28 675 1.1 christos adcs x22,x23,x14 676 1.1 christos umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0) 677 1.1 christos adcs x23,x24,x15 678 1.1 christos umulh x15,x7,x28 679 1.1 christos adcs x24,x25,x16 680 1.1 christos umulh x16,x8,x28 681 1.1 christos adcs x25,x26,x17 682 1.1 christos umulh x17,x9,x28 683 1.1 christos adc x26,xzr,xzr 684 1.1 christos adds x19,x19,x14 685 1.1 christos umulh x14,x10,x28 686 1.1 christos adcs x20,x20,x15 687 1.1 christos umulh x15,x11,x28 688 1.1 christos adcs x21,x21,x16 689 1.1 christos umulh x16,x12,x28 690 1.1 christos adcs x22,x22,x17 691 1.1 christos umulh x17,x13,x28 692 1.1 christos mul x28,x4,x19 // next t[0]*n0 693 1.1 christos adcs x23,x23,x14 694 1.1 christos adcs x24,x24,x15 695 1.1 christos adcs x25,x25,x16 696 1.1 christos adc x26,x26,x17 697 1.1 christos cbnz x27,.Lsqr8x_reduction 698 1.1 christos 699 1.1 christos ldp x14,x15,[x2,#8*0] 700 1.1 christos ldp x16,x17,[x2,#8*2] 701 1.1 christos mov x0,x2 702 1.1 christos sub x27,x3,x1 // done yet? 
703 1.1 christos adds x19,x19,x14 704 1.1 christos adcs x20,x20,x15 705 1.1 christos ldp x14,x15,[x2,#8*4] 706 1.1 christos adcs x21,x21,x16 707 1.1 christos adcs x22,x22,x17 708 1.1 christos ldp x16,x17,[x2,#8*6] 709 1.1 christos adcs x23,x23,x14 710 1.1 christos adcs x24,x24,x15 711 1.1 christos adcs x25,x25,x16 712 1.1 christos adcs x26,x26,x17 713 1.1 christos //adc x28,xzr,xzr // moved below 714 1.1 christos cbz x27,.Lsqr8x8_post_condition 715 1.1 christos 716 1.1 christos ldr x4,[x2,#-8*8] 717 1.1 christos ldp x6,x7,[x1,#8*0] 718 1.1 christos ldp x8,x9,[x1,#8*2] 719 1.1 christos ldp x10,x11,[x1,#8*4] 720 1.1 christos mov x27,#-8*8 721 1.1 christos ldp x12,x13,[x1,#8*6] 722 1.1 christos add x1,x1,#8*8 723 1.1 christos 724 1.1 christos .Lsqr8x_tail: 725 1.1 christos mul x14,x6,x4 726 1.1 christos adc x28,xzr,xzr // carry bit, modulo-scheduled 727 1.1 christos mul x15,x7,x4 728 1.1 christos add x27,x27,#8 729 1.1 christos mul x16,x8,x4 730 1.1 christos mul x17,x9,x4 731 1.1 christos adds x19,x19,x14 732 1.1 christos mul x14,x10,x4 733 1.1 christos adcs x20,x20,x15 734 1.1 christos mul x15,x11,x4 735 1.1 christos adcs x21,x21,x16 736 1.1 christos mul x16,x12,x4 737 1.1 christos adcs x22,x22,x17 738 1.1 christos mul x17,x13,x4 739 1.1 christos adcs x23,x23,x14 740 1.1 christos umulh x14,x6,x4 741 1.1 christos adcs x24,x24,x15 742 1.1 christos umulh x15,x7,x4 743 1.1 christos adcs x25,x25,x16 744 1.1 christos umulh x16,x8,x4 745 1.1 christos adcs x26,x26,x17 746 1.1 christos umulh x17,x9,x4 747 1.1 christos adc x28,x28,xzr 748 1.1 christos str x19,[x2],#8 749 1.1 christos adds x19,x20,x14 750 1.1 christos umulh x14,x10,x4 751 1.1 christos adcs x20,x21,x15 752 1.1 christos umulh x15,x11,x4 753 1.1 christos adcs x21,x22,x16 754 1.1 christos umulh x16,x12,x4 755 1.1 christos adcs x22,x23,x17 756 1.1 christos umulh x17,x13,x4 757 1.1 christos ldr x4,[x0,x27] 758 1.1 christos adcs x23,x24,x14 759 1.1 christos adcs x24,x25,x15 760 1.1 christos adcs x25,x26,x16 761 1.1 
christos adcs x26,x28,x17 762 1.1 christos //adc x28,xzr,xzr // moved above 763 1.1 christos cbnz x27,.Lsqr8x_tail 764 1.1 christos // note that carry flag is guaranteed 765 1.1 christos // to be zero at this point 766 1.1 christos ldp x6,x7,[x2,#8*0] 767 1.1 christos sub x27,x3,x1 // done yet? 768 1.1 christos sub x16,x3,x5 // rewinded np 769 1.1 christos ldp x8,x9,[x2,#8*2] 770 1.1 christos ldp x10,x11,[x2,#8*4] 771 1.1 christos ldp x12,x13,[x2,#8*6] 772 1.1 christos cbz x27,.Lsqr8x_tail_break 773 1.1 christos 774 1.1 christos ldr x4,[x0,#-8*8] 775 1.1 christos adds x19,x19,x6 776 1.1 christos adcs x20,x20,x7 777 1.1 christos ldp x6,x7,[x1,#8*0] 778 1.1 christos adcs x21,x21,x8 779 1.1 christos adcs x22,x22,x9 780 1.1 christos ldp x8,x9,[x1,#8*2] 781 1.1 christos adcs x23,x23,x10 782 1.1 christos adcs x24,x24,x11 783 1.1 christos ldp x10,x11,[x1,#8*4] 784 1.1 christos adcs x25,x25,x12 785 1.1 christos mov x27,#-8*8 786 1.1 christos adcs x26,x26,x13 787 1.1 christos ldp x12,x13,[x1,#8*6] 788 1.1 christos add x1,x1,#8*8 789 1.1 christos //adc x28,xzr,xzr // moved above 790 1.1 christos b .Lsqr8x_tail 791 1.1 christos 792 1.1 christos .align 4 793 1.1 christos .Lsqr8x_tail_break: 794 1.1 christos ldr x4,[x29,#112] // pull n0 795 1.1 christos add x27,x2,#8*8 // end of current t[num] window 796 1.1 christos 797 1.1 christos subs xzr,x30,#1 // "move" top-most carry to carry bit 798 1.1 christos adcs x14,x19,x6 799 1.1 christos adcs x15,x20,x7 800 1.1 christos ldp x19,x20,[x0,#8*0] 801 1.1 christos adcs x21,x21,x8 802 1.1 christos ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0] 803 1.1 christos adcs x22,x22,x9 804 1.1 christos ldp x8,x9,[x16,#8*2] 805 1.1 christos adcs x23,x23,x10 806 1.1 christos adcs x24,x24,x11 807 1.1 christos ldp x10,x11,[x16,#8*4] 808 1.1 christos adcs x25,x25,x12 809 1.1 christos adcs x26,x26,x13 810 1.1 christos ldp x12,x13,[x16,#8*6] 811 1.1 christos add x1,x16,#8*8 812 1.1 christos adc x30,xzr,xzr // top-most carry 813 1.1 christos mul 
x28,x4,x19 814 1.1 christos stp x14,x15,[x2,#8*0] 815 1.1 christos stp x21,x22,[x2,#8*2] 816 1.1 christos ldp x21,x22,[x0,#8*2] 817 1.1 christos stp x23,x24,[x2,#8*4] 818 1.1 christos ldp x23,x24,[x0,#8*4] 819 1.1 christos cmp x27,x29 // did we hit the bottom? 820 1.1 christos stp x25,x26,[x2,#8*6] 821 1.1 christos mov x2,x0 // slide the window 822 1.1 christos ldp x25,x26,[x0,#8*6] 823 1.1 christos mov x27,#8 824 1.1 christos b.ne .Lsqr8x_reduction 825 1.1 christos 826 1.1 christos // Final step. We see if result is larger than modulus, and 827 1.1 christos // if it is, subtract the modulus. But comparison implies 828 1.1 christos // subtraction. So we subtract modulus, see if it borrowed, 829 1.1 christos // and conditionally copy original value. 830 1.1 christos ldr x0,[x29,#96] // pull rp 831 1.1 christos add x2,x2,#8*8 832 1.1 christos subs x14,x19,x6 833 1.1 christos sbcs x15,x20,x7 834 1.1 christos sub x27,x5,#8*8 835 1.1 christos mov x3,x0 // x0 copy 836 1.1 christos 837 1.1 christos .Lsqr8x_sub: 838 1.1 christos sbcs x16,x21,x8 839 1.1 christos ldp x6,x7,[x1,#8*0] 840 1.1 christos sbcs x17,x22,x9 841 1.1 christos stp x14,x15,[x0,#8*0] 842 1.1 christos sbcs x14,x23,x10 843 1.1 christos ldp x8,x9,[x1,#8*2] 844 1.1 christos sbcs x15,x24,x11 845 1.1 christos stp x16,x17,[x0,#8*2] 846 1.1 christos sbcs x16,x25,x12 847 1.1 christos ldp x10,x11,[x1,#8*4] 848 1.1 christos sbcs x17,x26,x13 849 1.1 christos ldp x12,x13,[x1,#8*6] 850 1.1 christos add x1,x1,#8*8 851 1.1 christos ldp x19,x20,[x2,#8*0] 852 1.1 christos sub x27,x27,#8*8 853 1.1 christos ldp x21,x22,[x2,#8*2] 854 1.1 christos ldp x23,x24,[x2,#8*4] 855 1.1 christos ldp x25,x26,[x2,#8*6] 856 1.1 christos add x2,x2,#8*8 857 1.1 christos stp x14,x15,[x0,#8*4] 858 1.1 christos sbcs x14,x19,x6 859 1.1 christos stp x16,x17,[x0,#8*6] 860 1.1 christos add x0,x0,#8*8 861 1.1 christos sbcs x15,x20,x7 862 1.1 christos cbnz x27,.Lsqr8x_sub 863 1.1 christos 864 1.1 christos sbcs x16,x21,x8 865 1.1 christos mov x2,sp 
866 1.1 christos add x1,sp,x5 867 1.1 christos ldp x6,x7,[x3,#8*0] 868 1.1 christos sbcs x17,x22,x9 869 1.1 christos stp x14,x15,[x0,#8*0] 870 1.1 christos sbcs x14,x23,x10 871 1.1 christos ldp x8,x9,[x3,#8*2] 872 1.1 christos sbcs x15,x24,x11 873 1.1 christos stp x16,x17,[x0,#8*2] 874 1.1 christos sbcs x16,x25,x12 875 1.1 christos ldp x19,x20,[x1,#8*0] 876 1.1 christos sbcs x17,x26,x13 877 1.1 christos ldp x21,x22,[x1,#8*2] 878 1.1 christos sbcs xzr,x30,xzr // did it borrow? 879 1.1 christos ldr x30,[x29,#8] // pull return address 880 1.1 christos stp x14,x15,[x0,#8*4] 881 1.1 christos stp x16,x17,[x0,#8*6] 882 1.1 christos 883 1.1 christos sub x27,x5,#8*4 884 1.1 christos .Lsqr4x_cond_copy: 885 1.1 christos sub x27,x27,#8*4 886 1.1 christos csel x14,x19,x6,lo 887 1.1 christos stp xzr,xzr,[x2,#8*0] 888 1.1 christos csel x15,x20,x7,lo 889 1.1 christos ldp x6,x7,[x3,#8*4] 890 1.1 christos ldp x19,x20,[x1,#8*4] 891 1.1 christos csel x16,x21,x8,lo 892 1.1 christos stp xzr,xzr,[x2,#8*2] 893 1.1 christos add x2,x2,#8*4 894 1.1 christos csel x17,x22,x9,lo 895 1.1 christos ldp x8,x9,[x3,#8*6] 896 1.1 christos ldp x21,x22,[x1,#8*6] 897 1.1 christos add x1,x1,#8*4 898 1.1 christos stp x14,x15,[x3,#8*0] 899 1.1 christos stp x16,x17,[x3,#8*2] 900 1.1 christos add x3,x3,#8*4 901 1.1 christos stp xzr,xzr,[x1,#8*0] 902 1.1 christos stp xzr,xzr,[x1,#8*2] 903 1.1 christos cbnz x27,.Lsqr4x_cond_copy 904 1.1 christos 905 1.1 christos csel x14,x19,x6,lo 906 1.1 christos stp xzr,xzr,[x2,#8*0] 907 1.1 christos csel x15,x20,x7,lo 908 1.1 christos stp xzr,xzr,[x2,#8*2] 909 1.1 christos csel x16,x21,x8,lo 910 1.1 christos csel x17,x22,x9,lo 911 1.1 christos stp x14,x15,[x3,#8*0] 912 1.1 christos stp x16,x17,[x3,#8*2] 913 1.1 christos 914 1.1 christos b .Lsqr8x_done 915 1.1 christos 916 1.1 christos .align 4 917 1.1 christos .Lsqr8x8_post_condition: 918 1.1 christos adc x28,xzr,xzr 919 1.1 christos ldr x30,[x29,#8] // pull return address 920 1.1 christos // x19-7,x28 hold result, x6-7 hold 
// modulus
	subs	x6,x19,x6
	ldr	x1,[x29,#96]		// pull rp
	sbcs	x7,x20,x7
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x8
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x9
	stp	xzr,xzr,[sp,#8*4]
	sbcs	x10,x23,x10
	stp	xzr,xzr,[sp,#8*6]
	sbcs	x11,x24,x11
	stp	xzr,xzr,[sp,#8*8]
	sbcs	x12,x25,x12
	stp	xzr,xzr,[sp,#8*10]
	sbcs	x13,x26,x13
	stp	xzr,xzr,[sp,#8*12]
	sbcs	x28,x28,xzr	// did it borrow?
	stp	xzr,xzr,[sp,#8*14]

	// x6-7 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	csel	x10,x23,x10,lo
	csel	x11,x24,x11,lo
	stp	x8,x9,[x1,#8*2]
	csel	x12,x25,x12,lo
	csel	x13,x26,x13,lo
	stp	x10,x11,[x1,#8*4]
	stp	x12,x13,[x1,#8*6]

.Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
.inst	0xd50323bf		// autiasp
	ret
.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont

//----------------------------------------------------------------------
// __bn_mul4x_mont -- Montgomery multiplication, four limbs per pass.
//
// Entry:  bn_mul_mont (top of file) branches here when num is a multiple
//         of 4 but not of 8 (its "tst x5,#3 / b.eq").  Other entry paths
//         may exist outside the part of the file reviewed here.
//
// In:     x0 = rp   result, num 64-bit limbs
//         x1 = a    first operand
//         x2 = b    second operand
//         x3 = n    modulus
//         x4 = &n0  pointer to the word loaded as n0 ("ldr x4,[x4]")
//         x5 = num  limb count on entry; converted to bytes below
// Out:    x0 = 1
//
// Frame (128 bytes, addressed through x29):
//         [x29,#0]      saved x29,x30
//         [x29,#16..95] saved x19-x28
//         [x29,#96]     rp
//         [x29,#104]    &b[num]
//
// Scratch below the frame: sp = x29 - (8*num + 32)
//         [sp, sp+32)          four t[0]*n0 values of the current pass
//         [sp+32, sp+32+8*num) t[0..num-1], the running result
//         The whole area is zeroed again before returning.
//
// x30 is reused to carry the topmost carry word between passes; the
// return address is reloaded from [x29,#8] before the epilogue.
//
// (*) marks the same shortcut that is explained at length in .Lmul_mont:
//     the low word of n[0]*(t[0]*n0) + t[0] is discarded and only its
//     carry is needed; that carry is set exactly when t[0] (x19) is
//     non-zero, which "subs xzr,x19,#1" reproduces without the multiply.
//----------------------------------------------------------------------
.type	__bn_mul4x_mont,%function
.align	5
__bn_mul4x_mont:
.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	x26,sp,x5,lsl#3
	lsl	x5,x5,#3		// num is in bytes from here on
	ldr	x4,[x4]		// *n0
	sub	sp,x26,#8*4		// alloca

	add	x10,x2,x5
	add	x27,x1,x5		// x27 = &a[num]
	stp	x0,x10,[x29,#96]	// offload rp and &b[num]

	ldr	x24,[x2,#8*0]		// b[0]
	ldp	x6,x7,[x1,#8*0]	// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x28,#0			// byte index into b[]/scratch, cycles 8,16,24,0
	mov	x26,sp

.Loop_mul4x_1st_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[0])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31		// wraps to 0 after four iterations
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
	adcs	x20,x20,x11
	mul	x25,x19,x4	// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	sub	x10,x27,x1		// bytes of a[] still to process
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_reduction

	cbz	x10,.Lmul4x4_post_condition	// num == 4: nothing left of a[]

	ldp	x6,x7,[x1,#8*0]	// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldr	x25,[sp]		// first t[0]*n0 put aside above
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.Loop_mul4x_1st_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[i])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	adcs	x23,x23,x0
	umulh	x13,x17,x25
	adc	x0,xzr,xzr
	ldr	x25,[sp,x28]		// next t[0]*n0
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_tail

	sub	x11,x27,x5		// rewound x1
	cbz	x10,.Lmul4x_proceed

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_1st_tail

.align	5
.Lmul4x_proceed:
	ldr	x24,[x2,#8*4]!		// *++b
	adc	x30,x0,xzr		// topmost carry, kept in x30 across the pass
	ldp	x6,x7,[x11,#8*0]	// a[0..3]
	sub	x3,x3,x5		// rewind np
	ldp	x8,x9,[x11,#8*2]
	add	x1,x11,#8*4

	stp	x19,x20,[x26,#8*0]	// result!!!
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	stp	x21,x22,[x26,#8*2]	// result!!!
	ldp	x21,x22,[sp,#8*6]

	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	mov	x26,sp
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr

.align	4
.Loop_mul4x_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[4])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
	adcs	x20,x20,x11
	mul	x25,x19,x4	// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0)
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_reduction

	adc	x0,x0,xzr
	ldp	x10,x11,[x26,#8*4]	// t[4..7]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]	// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr

	ldr	x25,[sp]		// t[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.align	4
.Loop_mul4x_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[4])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	umulh	x13,x17,x25
	adcs	x23,x23,x0
	ldr	x25,[sp,x28]		// next t[0]*n0
	adc	x0,xzr,xzr
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_tail

	sub	x11,x3,x5		// rewound np
	adc	x0,x0,xzr
	cbz	x10,.Loop_mul4x_break

	ldp	x10,x11,[x26,#8*4]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_tail

.align	4
.Loop_mul4x_break:
	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
	adds	x19,x19,x30		// fold in the carry word from the previous pass
	add	x2,x2,#8*4		// bp++
	adcs	x20,x20,xzr
	sub	x1,x1,x5		// rewind ap
	adcs	x21,x21,xzr
	stp	x19,x20,[x26,#8*0]	// result!!!
	adcs	x22,x22,xzr
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	adc	x30,x0,xzr
	stp	x21,x22,[x26,#8*2]	// result!!!
	cmp	x2,x13			// done yet?
	ldp	x21,x22,[sp,#8*6]
	ldp	x14,x15,[x11,#8*0]	// n[0..3]
	ldp	x16,x17,[x11,#8*2]
	add	x3,x11,#8*4
	b.eq	.Lmul4x_post

	ldr	x24,[x2]
	ldp	x6,x7,[x1,#8*0]	// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	adds	x1,x1,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x26,sp
	b	.Loop_mul4x_reduction

.align	4
.Lmul4x_post:
	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	mov	x0,x12
	mov	x27,x12		// x0 copy
	subs	x10,x19,x14
	add	x26,sp,#8*8
	sbcs	x11,x20,x15
	sub	x28,x5,#8*4

.Lmul4x_sub:
	sbcs	x12,x21,x16
	ldp	x14,x15,[x3,#8*0]
	sub	x28,x28,#8*4
	ldp	x19,x20,[x26,#8*0]
	sbcs	x13,x22,x17
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	ldp	x21,x22,[x26,#8*2]
	add	x26,x26,#8*4
	stp	x10,x11,[x0,#8*0]
	sbcs	x10,x19,x14
	stp	x12,x13,[x0,#8*2]
	add	x0,x0,#8*4
	sbcs	x11,x20,x15
	cbnz	x28,.Lmul4x_sub

	sbcs	x12,x21,x16
	mov	x26,sp			// x26 = wipe pointer, starts at the scratch base
	add	x1,sp,#8*4		// x1 = &t[0]
	ldp	x6,x7,[x27,#8*0]
	sbcs	x13,x22,x17
	stp	x10,x11,[x0,#8*0]
	ldp	x8,x9,[x27,#8*2]
	stp	x12,x13,[x0,#8*2]
	ldp	x19,x20,[x1,#8*0]
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr	// did it borrow?
	ldr	x30,[x29,#8]	// pull return address

	sub	x28,x5,#8*4
.Lmul4x_cond_copy:
	sub	x28,x28,#8*4
	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	ldp	x6,x7,[x27,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*2]
	add	x26,x26,#8*4
	csel	x13,x22,x9,lo
	ldp	x8,x9,[x27,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]
	add	x27,x27,#8*4
	cbnz	x28,.Lmul4x_cond_copy

	// The loop above has zeroed 8*num-32 bytes and x26 = sp+8*num-32.
	// The scratch area is 8*num+32 bytes, so 64 bytes (eight words)
	// remain, ending exactly at x29.  Clear all of them; the previous
	// offsets (#8*0, #8*2, #8*3, #8*4) overlapped and left the last two
	// words, t[num-2] and t[num-1], on the stack.  This now matches the
	// wipe done in .Lmul4x4_post_condition.
	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	stp	xzr,xzr,[x26,#8*2]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*4]
	csel	x13,x22,x9,lo
	stp	xzr,xzr,[x26,#8*6]
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]

	b	.Lmul4x_done

.align	4
.Lmul4x4_post_condition:
	adc	x0,x0,xzr
	ldr	x1,[x29,#96]		// pull rp
	// x19-x22,x0 hold result, x14-x17 hold modulus
	subs	x6,x19,x14
	ldr	x30,[x29,#8]	// pull return address
	sbcs	x7,x20,x15
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x16
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x17
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,x0,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// x6-x9 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	stp	x8,x9,[x1,#8*2]

.Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
.inst	0xd50323bf		// autiasp
	ret
.size	__bn_mul4x_mont,.-__bn_mul4x_mont
// ASCII, NUL-terminated: "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	4