// AArch64 assembly in GNU as syntax. The file is run through the C
// preprocessor first (it relies on the include and ifndef directives below).
#include "arm_asm.h"
#include "arm_arch.h"
#ifndef	__KERNEL__

.hidden	OPENSSL_armv8_rsa_neonized
#endif
.text

//-----------------------------------------------------------------------
// bn_mul_mont
//
// Word-by-word Montgomery-style multiply/reduce over 64-bit limbs,
// followed by one conditional subtraction of the modulus.
// Presumably the AArch64 back end of the C prototype
//   int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
//                   const BN_ULONG *np, const BN_ULONG *n0, int num);
// (the prototype is not visible in this file -- confirm against the header).
//
// In (roles as used by the code and its existing comments):
//   x0 = rp    destination limbs, written only in the final step
//   x1 = ap    first operand
//   x2 = bp    second operand, one limb consumed per outer pass
//   x3 = np    modulus
//   x4 = n0    pointer; only the single limb *n0 is loaded
//   x5 = num   limb count; turned into a byte count (num*8) on the
//              generic path
// Out:
//   x0 = 1     on the generic path below; the tail-branched variants
//              set their own return value (not visible here)
//
// Dispatch (num is tested before any stack frame exists, so the other
// implementations are entered with a plain branch, not a call):
//   num % 4 != 0                      -> generic path (.Lmul_mont)
//   num > 32, non-kernel, little-endian build, and
//     OPENSSL_armv8_rsa_neonized != 0 -> bn_mul8x_mont_neon
//   num % 8 == 0                      -> __bn_sqr8x_mont (defined elsewhere)
//   num % 4 == 0                      -> __bn_mul4x_mont (defined elsewhere)
//
// Generic path register roles:
//   x4      n0 value          x9      bp[i]
//   x15     m1 = tp[0]*n0 (low 64 bits)
//   x6,x7   lo/hi of the running ap[j]*bp[i] column
//   x10,x11 lo/hi of the next ap[j]*bp[i] product
//   x12,x13 lo/hi of the running np[j]*m1 column
//   x16,x17 lo/hi of the next np[j]*m1 product
//   x19     topmost overflow bit carried between outer passes
//   x20     outer counter i, in bytes     x21  inner counter j, in bytes
//   x22     tp cursor                     x23  tp[j] just loaded
//   x29     frame pointer; also the value sp is restored from
// Stack: 64-byte register save area, plus a 16-byte-aligned scratch
//   vector tp of num limbs below it. tp is zeroed before returning.
// Callee-saved x19-x24 are saved and restored; x24 is not referenced
//   anywhere between the save and the restore.
//
// NOTE(review): ap[0..1] and np[0..1] are loaded unconditionally and
//   j starts at num-2, so the generic path appears to require
//   num >= 2 -- confirm that callers guarantee this.
//-----------------------------------------------------------------------
.globl	bn_mul_mont
.type	bn_mul_mont,%function
.align	5
bn_mul_mont:
	// Macro defined outside this file; per the note on bn_mul8x_mont_neon
	// further down it signs the return address.
	AARCH64_SIGN_LINK_REGISTER
.Lbn_mul_mont:
	tst	x5,#3			// num not a multiple of 4?
	b.ne	.Lmul_mont		//   -> generic path
	cmp	x5,#32
	b.le	.Lscalar_impl		// num <= 32 stays on the scalar code
#ifndef	__KERNEL__
#ifndef	__AARCH64EB__
	// Run-time switch: a non-zero word selects the NEON implementation.
	adrp	x17,OPENSSL_armv8_rsa_neonized
	ldr	w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized]
	cbnz	w17, bn_mul8x_mont_neon
#endif
#endif

.Lscalar_impl:
	// num is a multiple of 4 whenever this label is reached, because
	// the first test above already branched away otherwise.
	tst	x5,#7
	b.eq	__bn_sqr8x_mont
	tst	x5,#3
	b.eq	__bn_mul4x_mont

.Lmul_mont:
	// Prologue: 64-byte frame holding x29/x30 and x19-x24.
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	// Set up the first pass (i = 0). The loads, the tp allocation and
	// the first multiplies are interleaved; keep the order as is.
	ldr	x9,[x2],#8		// bp[0]
	sub	x22,sp,x5,lsl#3		// room for num limbs of tp
	ldp	x7,x8,[x1],#16	// ap[0..1]
	lsl	x5,x5,#3		// from here on x5 = num*8 (bytes)
	ldr	x4,[x4]		// *n0
	and	x22,x22,#-16		// ABI says so
	ldp	x13,x14,[x3],#16	// np[0..1]

	mul	x6,x7,x9		// ap[0]*bp[0]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	mul	x10,x8,x9		// ap[1]*bp[0]
	umulh	x11,x8,x9

	mul	x15,x6,x4		// "tp[0]"*n0
	mov	sp,x22			// alloca

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6	// discarded
	// (*)	As for removal of first multiplication and addition
	//	instructions. The outcome of first addition is
	//	guaranteed to be zero, which leaves two computationally
	//	significant outcomes: it either carries or not. Then
	//	question is when does it carry? Is there alternative
	//	way to deduce it? If you follow operations, you can
	//	observe that condition for carry is quite simple:
	//	x6 being non-zero. So that carry can be calculated
	//	by adding -1 to x6. That's what next instruction does.
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	adc	x13,x13,xzr		// fold that carry into hi(np[0]*m1)
	cbz	x21,.L1st_skip		// num == 2: no middle limbs

	// First pass, middle limbs: tp[j-1] = lo(ap[j]*bp[0] + np[j]*m1 + carries).
	// The sub and cbnz used for loop control do not touch the flags, so
	// every adds/adc pair stays intact.
.L1st:
	ldr	x8,[x1],#8
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	ldr	x14,[x3],#8
	adds	x12,x16,x13
	mul	x10,x8,x9		// ap[j]*bp[0]
	adc	x13,x17,xzr
	umulh	x11,x8,x9

	adds	x12,x12,x6
	mul	x16,x14,x15		// np[j]*m1
	adc	x13,x13,xzr
	umulh	x17,x14,x15
	str	x12,[x22],#8		// tp[j-1]
	cbnz	x21,.L1st

	// First pass, last limb: store the two top limbs of tp and keep
	// the final carry in x19.
.L1st_skip:
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adc	x13,x17,xzr

	adds	x12,x12,x6
	sub	x20,x5,#8		// i=num-1
	adcs	x13,x13,x7

	adc	x19,xzr,xzr		// upmost overflow bit
	stp	x12,x13,[x22]

	// Remaining num-1 passes: same column walk as the first pass, but
	// each column also adds the tp[j] left by the previous pass.
.Louter:
	ldr	x9,[x2],#8		// bp[i]
	ldp	x7,x8,[x1],#16
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8

	mul	x6,x7,x9		// ap[0]*bp[i]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	ldp	x13,x14,[x3],#16
	mul	x10,x8,x9		// ap[1]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x15,x6,x4		// m1 for this pass
	sub	x20,x20,#8		// i--

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	cbz	x21,.Linner_skip

	// The adc at the top of the loop consumes the carry produced by the
	// subs above on the first iteration, and by the adds at the bottom
	// of the loop on later iterations; nothing in between writes flags.
.Linner:
	ldr	x8,[x1],#8
	adc	x13,x13,xzr
	ldr	x23,[x22],#8		// tp[j]
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	adds	x12,x16,x13
	ldr	x14,[x3],#8
	adc	x13,x17,xzr

	mul	x10,x8,x9		// ap[j]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x16,x14,x15		// np[j]*m1
	adds	x12,x12,x6
	umulh	x17,x14,x15
	stur	x12,[x22,#-16]		// tp[j-1]
	cbnz	x21,.Linner

	// Last limb of the pass: also absorbs the overflow bit of the
	// previous pass (x19) and produces the new one.
.Linner_skip:
	ldr	x23,[x22],#8		// tp[j]
	adc	x13,x13,xzr
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adcs	x13,x17,x19
	adc	x19,xzr,xzr

	adds	x6,x6,x23
	adc	x7,x7,xzr

	adds	x12,x12,x6
	adcs	x13,x13,x7
	adc	x19,x19,xzr		// upmost overflow bit
	stp	x12,x13,[x22,#-16]

	cbnz	x20,.Louter

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x14,[x3],#8		// np[0]
	subs	x21,x5,#8		// j=num-1 and clear borrow
	mov	x1,x0			// x1 = write cursor into rp; x0 keeps the base
.Lsub:
	// The borrow chain lives in the carry flag across iterations; the
	// sub and cbnz used for loop control leave the flags alone.
	sbcs	x8,x23,x14		// tp[j]-np[j]
	ldr	x23,[x22],#8
	sub	x21,x21,#8		// j--
	ldr	x14,[x3],#8
	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
	cbnz	x21,.Lsub

	sbcs	x8,x23,x14
	sbcs	x19,x19,xzr		// did it borrow?
	str	x8,[x1],#8		// rp[num-1]

	// Select per limb between tp (borrow happened, so tp was already
	// below the modulus) and rp = tp - np, using csel rather than a
	// branch -- presumably so that timing does not depend on the result.
	// The flags from the sbcs above stay live through the whole loop.
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x8,[x0],#8		// rp[0]
	sub	x5,x5,#8		// num--
	nop
.Lcond_copy:
	sub	x5,x5,#8		// num--
	csel	x14,x23,x8,lo		// did it borrow?
	ldr	x23,[x22],#8
	ldr	x8,[x0],#8
	stur	xzr,[x22,#-16]		// wipe tp
	stur	x14,[x0,#-16]
	cbnz	x5,.Lcond_copy

	csel	x14,x23,x8,lo
	stur	xzr,[x22,#-8]		// wipe tp
	stur	x14,[x0,#-8]

	// Epilogue: restoring sp from x29 also releases tp. Only x29 is
	// popped from the frame; x30 is not reloaded.
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1			// return value
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	bn_mul_mont,.-bn_mul_mont
.type	bn_mul8x_mont_neon,%function
.align	5
bn_mul8x_mont_neon:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because bn_mul8x_mont_neon is jumped to
	// only from bn_mul_mont which has already signed the return address.
	stp	x29,x30,[sp,#-80]!
235 1.1 christos mov x16,sp 236 1.1 christos stp d8,d9,[sp,#16] 237 1.1 christos stp d10,d11,[sp,#32] 238 1.1 christos stp d12,d13,[sp,#48] 239 1.1 christos stp d14,d15,[sp,#64] 240 1.1 christos lsl x5,x5,#1 241 1.1 christos eor v14.16b,v14.16b,v14.16b 242 1.1 christos 243 1.1 christos .align 4 244 1.1 christos .LNEON_8n: 245 1.1 christos eor v6.16b,v6.16b,v6.16b 246 1.1 christos sub x7,sp,#128 247 1.1 christos eor v7.16b,v7.16b,v7.16b 248 1.1 christos sub x7,x7,x5,lsl#4 249 1.1 christos eor v8.16b,v8.16b,v8.16b 250 1.1 christos and x7,x7,#-64 251 1.1 christos eor v9.16b,v9.16b,v9.16b 252 1.1 christos mov sp,x7 // alloca 253 1.1 christos eor v10.16b,v10.16b,v10.16b 254 1.1 christos add x7,x7,#256 255 1.1 christos eor v11.16b,v11.16b,v11.16b 256 1.1 christos sub x8,x5,#8 257 1.1 christos eor v12.16b,v12.16b,v12.16b 258 1.1 christos eor v13.16b,v13.16b,v13.16b 259 1.1 christos 260 1.1 christos .LNEON_8n_init: 261 1.1 christos st1 {v6.2d,v7.2d},[x7],#32 262 1.1 christos subs x8,x8,#8 263 1.1 christos st1 {v8.2d,v9.2d},[x7],#32 264 1.1 christos st1 {v10.2d,v11.2d},[x7],#32 265 1.1 christos st1 {v12.2d,v13.2d},[x7],#32 266 1.1 christos bne .LNEON_8n_init 267 1.1 christos 268 1.1 christos add x6,sp,#256 269 1.1 christos ld1 {v0.4s,v1.4s},[x1],#32 270 1.1 christos add x10,sp,#8 271 1.1 christos ldr s30,[x4],#4 272 1.1 christos mov x9,x5 273 1.1 christos b .LNEON_8n_outer 274 1.1 christos 275 1.1 christos .align 4 276 1.1 christos .LNEON_8n_outer: 277 1.1 christos ldr s28,[x2],#4 // *b++ 278 1.1 christos uxtl v28.4s,v28.4h 279 1.1 christos add x7,sp,#128 280 1.1 christos ld1 {v2.4s,v3.4s},[x3],#32 281 1.1 christos 282 1.1 christos umlal v6.2d,v28.2s,v0.s[0] 283 1.1 christos umlal v7.2d,v28.2s,v0.s[1] 284 1.1 christos umlal v8.2d,v28.2s,v0.s[2] 285 1.1 christos shl v29.2d,v6.2d,#16 286 1.1 christos ext v29.16b,v29.16b,v29.16b,#8 287 1.1 christos umlal v9.2d,v28.2s,v0.s[3] 288 1.1 christos add v29.2d,v29.2d,v6.2d 289 1.1 christos umlal v10.2d,v28.2s,v1.s[0] 290 1.1 christos 
mul v29.2s,v29.2s,v30.2s 291 1.1 christos umlal v11.2d,v28.2s,v1.s[1] 292 1.1 christos st1 {v28.2s},[sp] // put aside smashed b[8*i+0] 293 1.1 christos umlal v12.2d,v28.2s,v1.s[2] 294 1.1 christos uxtl v29.4s,v29.4h 295 1.1 christos umlal v13.2d,v28.2s,v1.s[3] 296 1.1 christos ldr s28,[x2],#4 // *b++ 297 1.1 christos umlal v6.2d,v29.2s,v2.s[0] 298 1.1 christos umlal v7.2d,v29.2s,v2.s[1] 299 1.1 christos uxtl v28.4s,v28.4h 300 1.1 christos umlal v8.2d,v29.2s,v2.s[2] 301 1.1 christos ushr v15.2d,v6.2d,#16 302 1.1 christos umlal v9.2d,v29.2s,v2.s[3] 303 1.1 christos umlal v10.2d,v29.2s,v3.s[0] 304 1.1 christos ext v6.16b,v6.16b,v6.16b,#8 305 1.1 christos add v6.2d,v6.2d,v15.2d 306 1.1 christos umlal v11.2d,v29.2s,v3.s[1] 307 1.1 christos ushr v6.2d,v6.2d,#16 308 1.1 christos umlal v12.2d,v29.2s,v3.s[2] 309 1.1 christos umlal v13.2d,v29.2s,v3.s[3] 310 1.1 christos add v16.2d,v7.2d,v6.2d 311 1.1 christos ins v7.d[0],v16.d[0] 312 1.1 christos st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+0] 313 1.1 christos umlal v7.2d,v28.2s,v0.s[0] 314 1.1 christos ld1 {v6.2d},[x6],#16 315 1.1 christos umlal v8.2d,v28.2s,v0.s[1] 316 1.1 christos umlal v9.2d,v28.2s,v0.s[2] 317 1.1 christos shl v29.2d,v7.2d,#16 318 1.1 christos ext v29.16b,v29.16b,v29.16b,#8 319 1.1 christos umlal v10.2d,v28.2s,v0.s[3] 320 1.1 christos add v29.2d,v29.2d,v7.2d 321 1.1 christos umlal v11.2d,v28.2s,v1.s[0] 322 1.1 christos mul v29.2s,v29.2s,v30.2s 323 1.1 christos umlal v12.2d,v28.2s,v1.s[1] 324 1.1 christos st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+1] 325 1.1 christos umlal v13.2d,v28.2s,v1.s[2] 326 1.1 christos uxtl v29.4s,v29.4h 327 1.1 christos umlal v6.2d,v28.2s,v1.s[3] 328 1.1 christos ldr s28,[x2],#4 // *b++ 329 1.1 christos umlal v7.2d,v29.2s,v2.s[0] 330 1.1 christos umlal v8.2d,v29.2s,v2.s[1] 331 1.1 christos uxtl v28.4s,v28.4h 332 1.1 christos umlal v9.2d,v29.2s,v2.s[2] 333 1.1 christos ushr v15.2d,v7.2d,#16 334 1.1 christos umlal v10.2d,v29.2s,v2.s[3] 335 1.1 christos umlal 
v11.2d,v29.2s,v3.s[0] 336 1.1 christos ext v7.16b,v7.16b,v7.16b,#8 337 1.1 christos add v7.2d,v7.2d,v15.2d 338 1.1 christos umlal v12.2d,v29.2s,v3.s[1] 339 1.1 christos ushr v7.2d,v7.2d,#16 340 1.1 christos umlal v13.2d,v29.2s,v3.s[2] 341 1.1 christos umlal v6.2d,v29.2s,v3.s[3] 342 1.1 christos add v16.2d,v8.2d,v7.2d 343 1.1 christos ins v8.d[0],v16.d[0] 344 1.1 christos st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+1] 345 1.1 christos umlal v8.2d,v28.2s,v0.s[0] 346 1.1 christos ld1 {v7.2d},[x6],#16 347 1.1 christos umlal v9.2d,v28.2s,v0.s[1] 348 1.1 christos umlal v10.2d,v28.2s,v0.s[2] 349 1.1 christos shl v29.2d,v8.2d,#16 350 1.1 christos ext v29.16b,v29.16b,v29.16b,#8 351 1.1 christos umlal v11.2d,v28.2s,v0.s[3] 352 1.1 christos add v29.2d,v29.2d,v8.2d 353 1.1 christos umlal v12.2d,v28.2s,v1.s[0] 354 1.1 christos mul v29.2s,v29.2s,v30.2s 355 1.1 christos umlal v13.2d,v28.2s,v1.s[1] 356 1.1 christos st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+2] 357 1.1 christos umlal v6.2d,v28.2s,v1.s[2] 358 1.1 christos uxtl v29.4s,v29.4h 359 1.1 christos umlal v7.2d,v28.2s,v1.s[3] 360 1.1 christos ldr s28,[x2],#4 // *b++ 361 1.1 christos umlal v8.2d,v29.2s,v2.s[0] 362 1.1 christos umlal v9.2d,v29.2s,v2.s[1] 363 1.1 christos uxtl v28.4s,v28.4h 364 1.1 christos umlal v10.2d,v29.2s,v2.s[2] 365 1.1 christos ushr v15.2d,v8.2d,#16 366 1.1 christos umlal v11.2d,v29.2s,v2.s[3] 367 1.1 christos umlal v12.2d,v29.2s,v3.s[0] 368 1.1 christos ext v8.16b,v8.16b,v8.16b,#8 369 1.1 christos add v8.2d,v8.2d,v15.2d 370 1.1 christos umlal v13.2d,v29.2s,v3.s[1] 371 1.1 christos ushr v8.2d,v8.2d,#16 372 1.1 christos umlal v6.2d,v29.2s,v3.s[2] 373 1.1 christos umlal v7.2d,v29.2s,v3.s[3] 374 1.1 christos add v16.2d,v9.2d,v8.2d 375 1.1 christos ins v9.d[0],v16.d[0] 376 1.1 christos st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+2] 377 1.1 christos umlal v9.2d,v28.2s,v0.s[0] 378 1.1 christos ld1 {v8.2d},[x6],#16 379 1.1 christos umlal v10.2d,v28.2s,v0.s[1] 380 1.1 christos umlal 
v11.2d,v28.2s,v0.s[2] 381 1.1 christos shl v29.2d,v9.2d,#16 382 1.1 christos ext v29.16b,v29.16b,v29.16b,#8 383 1.1 christos umlal v12.2d,v28.2s,v0.s[3] 384 1.1 christos add v29.2d,v29.2d,v9.2d 385 1.1 christos umlal v13.2d,v28.2s,v1.s[0] 386 1.1 christos mul v29.2s,v29.2s,v30.2s 387 1.1 christos umlal v6.2d,v28.2s,v1.s[1] 388 1.1 christos st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+3] 389 1.1 christos umlal v7.2d,v28.2s,v1.s[2] 390 1.1 christos uxtl v29.4s,v29.4h 391 1.1 christos umlal v8.2d,v28.2s,v1.s[3] 392 1.1 christos ldr s28,[x2],#4 // *b++ 393 1.1 christos umlal v9.2d,v29.2s,v2.s[0] 394 1.1 christos umlal v10.2d,v29.2s,v2.s[1] 395 1.1 christos uxtl v28.4s,v28.4h 396 1.1 christos umlal v11.2d,v29.2s,v2.s[2] 397 1.1 christos ushr v15.2d,v9.2d,#16 398 1.1 christos umlal v12.2d,v29.2s,v2.s[3] 399 1.1 christos umlal v13.2d,v29.2s,v3.s[0] 400 1.1 christos ext v9.16b,v9.16b,v9.16b,#8 401 1.1 christos add v9.2d,v9.2d,v15.2d 402 1.1 christos umlal v6.2d,v29.2s,v3.s[1] 403 1.1 christos ushr v9.2d,v9.2d,#16 404 1.1 christos umlal v7.2d,v29.2s,v3.s[2] 405 1.1 christos umlal v8.2d,v29.2s,v3.s[3] 406 1.1 christos add v16.2d,v10.2d,v9.2d 407 1.1 christos ins v10.d[0],v16.d[0] 408 1.1 christos st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+3] 409 1.1 christos umlal v10.2d,v28.2s,v0.s[0] 410 1.1 christos ld1 {v9.2d},[x6],#16 411 1.1 christos umlal v11.2d,v28.2s,v0.s[1] 412 1.1 christos umlal v12.2d,v28.2s,v0.s[2] 413 1.1 christos shl v29.2d,v10.2d,#16 414 1.1 christos ext v29.16b,v29.16b,v29.16b,#8 415 1.1 christos umlal v13.2d,v28.2s,v0.s[3] 416 1.1 christos add v29.2d,v29.2d,v10.2d 417 1.1 christos umlal v6.2d,v28.2s,v1.s[0] 418 1.1 christos mul v29.2s,v29.2s,v30.2s 419 1.1 christos umlal v7.2d,v28.2s,v1.s[1] 420 1.1 christos st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+4] 421 1.1 christos umlal v8.2d,v28.2s,v1.s[2] 422 1.1 christos uxtl v29.4s,v29.4h 423 1.1 christos umlal v9.2d,v28.2s,v1.s[3] 424 1.1 christos ldr s28,[x2],#4 // *b++ 425 1.1 christos umlal 
v10.2d,v29.2s,v2.s[0] 426 1.1 christos umlal v11.2d,v29.2s,v2.s[1] 427 1.1 christos uxtl v28.4s,v28.4h 428 1.1 christos umlal v12.2d,v29.2s,v2.s[2] 429 1.1 christos ushr v15.2d,v10.2d,#16 430 1.1 christos umlal v13.2d,v29.2s,v2.s[3] 431 1.1 christos umlal v6.2d,v29.2s,v3.s[0] 432 1.1 christos ext v10.16b,v10.16b,v10.16b,#8 433 1.1 christos add v10.2d,v10.2d,v15.2d 434 1.1 christos umlal v7.2d,v29.2s,v3.s[1] 435 1.1 christos ushr v10.2d,v10.2d,#16 436 1.1 christos umlal v8.2d,v29.2s,v3.s[2] 437 1.1 christos umlal v9.2d,v29.2s,v3.s[3] 438 1.1 christos add v16.2d,v11.2d,v10.2d 439 1.1 christos ins v11.d[0],v16.d[0] 440 1.1 christos st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+4] 441 1.1 christos umlal v11.2d,v28.2s,v0.s[0] 442 1.1 christos ld1 {v10.2d},[x6],#16 443 1.1 christos umlal v12.2d,v28.2s,v0.s[1] 444 1.1 christos umlal v13.2d,v28.2s,v0.s[2] 445 1.1 christos shl v29.2d,v11.2d,#16 446 1.1 christos ext v29.16b,v29.16b,v29.16b,#8 447 1.1 christos umlal v6.2d,v28.2s,v0.s[3] 448 1.1 christos add v29.2d,v29.2d,v11.2d 449 1.1 christos umlal v7.2d,v28.2s,v1.s[0] 450 1.1 christos mul v29.2s,v29.2s,v30.2s 451 1.1 christos umlal v8.2d,v28.2s,v1.s[1] 452 1.1 christos st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+5] 453 1.1 christos umlal v9.2d,v28.2s,v1.s[2] 454 1.1 christos uxtl v29.4s,v29.4h 455 1.1 christos umlal v10.2d,v28.2s,v1.s[3] 456 1.1 christos ldr s28,[x2],#4 // *b++ 457 1.1 christos umlal v11.2d,v29.2s,v2.s[0] 458 1.1 christos umlal v12.2d,v29.2s,v2.s[1] 459 1.1 christos uxtl v28.4s,v28.4h 460 1.1 christos umlal v13.2d,v29.2s,v2.s[2] 461 1.1 christos ushr v15.2d,v11.2d,#16 462 1.1 christos umlal v6.2d,v29.2s,v2.s[3] 463 1.1 christos umlal v7.2d,v29.2s,v3.s[0] 464 1.1 christos ext v11.16b,v11.16b,v11.16b,#8 465 1.1 christos add v11.2d,v11.2d,v15.2d 466 1.1 christos umlal v8.2d,v29.2s,v3.s[1] 467 1.1 christos ushr v11.2d,v11.2d,#16 468 1.1 christos umlal v9.2d,v29.2s,v3.s[2] 469 1.1 christos umlal v10.2d,v29.2s,v3.s[3] 470 1.1 christos add 
v16.2d,v12.2d,v11.2d 471 1.1 christos ins v12.d[0],v16.d[0] 472 1.1 christos st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+5] 473 1.1 christos umlal v12.2d,v28.2s,v0.s[0] 474 1.1 christos ld1 {v11.2d},[x6],#16 475 1.1 christos umlal v13.2d,v28.2s,v0.s[1] 476 1.1 christos umlal v6.2d,v28.2s,v0.s[2] 477 1.1 christos shl v29.2d,v12.2d,#16 478 1.1 christos ext v29.16b,v29.16b,v29.16b,#8 479 1.1 christos umlal v7.2d,v28.2s,v0.s[3] 480 1.1 christos add v29.2d,v29.2d,v12.2d 481 1.1 christos umlal v8.2d,v28.2s,v1.s[0] 482 1.1 christos mul v29.2s,v29.2s,v30.2s 483 1.1 christos umlal v9.2d,v28.2s,v1.s[1] 484 1.1 christos st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+6] 485 1.1 christos umlal v10.2d,v28.2s,v1.s[2] 486 1.1 christos uxtl v29.4s,v29.4h 487 1.1 christos umlal v11.2d,v28.2s,v1.s[3] 488 1.1 christos ldr s28,[x2],#4 // *b++ 489 1.1 christos umlal v12.2d,v29.2s,v2.s[0] 490 1.1 christos umlal v13.2d,v29.2s,v2.s[1] 491 1.1 christos uxtl v28.4s,v28.4h 492 1.1 christos umlal v6.2d,v29.2s,v2.s[2] 493 1.1 christos ushr v15.2d,v12.2d,#16 494 1.1 christos umlal v7.2d,v29.2s,v2.s[3] 495 1.1 christos umlal v8.2d,v29.2s,v3.s[0] 496 1.1 christos ext v12.16b,v12.16b,v12.16b,#8 497 1.1 christos add v12.2d,v12.2d,v15.2d 498 1.1 christos umlal v9.2d,v29.2s,v3.s[1] 499 1.1 christos ushr v12.2d,v12.2d,#16 500 1.1 christos umlal v10.2d,v29.2s,v3.s[2] 501 1.1 christos umlal v11.2d,v29.2s,v3.s[3] 502 1.1 christos add v16.2d,v13.2d,v12.2d 503 1.1 christos ins v13.d[0],v16.d[0] 504 1.1 christos st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+6] 505 1.1 christos umlal v13.2d,v28.2s,v0.s[0] 506 1.1 christos ld1 {v12.2d},[x6],#16 507 1.1 christos umlal v6.2d,v28.2s,v0.s[1] 508 1.1 christos umlal v7.2d,v28.2s,v0.s[2] 509 1.1 christos shl v29.2d,v13.2d,#16 510 1.1 christos ext v29.16b,v29.16b,v29.16b,#8 511 1.1 christos umlal v8.2d,v28.2s,v0.s[3] 512 1.1 christos add v29.2d,v29.2d,v13.2d 513 1.1 christos umlal v9.2d,v28.2s,v1.s[0] 514 1.1 christos mul v29.2s,v29.2s,v30.2s 515 1.1 
christos umlal v10.2d,v28.2s,v1.s[1] 516 1.1 christos st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+7] 517 1.1 christos umlal v11.2d,v28.2s,v1.s[2] 518 1.1 christos uxtl v29.4s,v29.4h 519 1.1 christos umlal v12.2d,v28.2s,v1.s[3] 520 1.1 christos ld1 {v28.2s},[sp] // pull smashed b[8*i+0] 521 1.1 christos umlal v13.2d,v29.2s,v2.s[0] 522 1.1 christos ld1 {v0.4s,v1.4s},[x1],#32 523 1.1 christos umlal v6.2d,v29.2s,v2.s[1] 524 1.1 christos umlal v7.2d,v29.2s,v2.s[2] 525 1.1 christos mov v5.16b,v13.16b 526 1.1 christos ushr v5.2d,v5.2d,#16 527 1.1 christos ext v13.16b,v13.16b,v13.16b,#8 528 1.1 christos umlal v8.2d,v29.2s,v2.s[3] 529 1.1 christos umlal v9.2d,v29.2s,v3.s[0] 530 1.1 christos add v13.2d,v13.2d,v5.2d 531 1.1 christos umlal v10.2d,v29.2s,v3.s[1] 532 1.1 christos ushr v13.2d,v13.2d,#16 533 1.1 christos eor v15.16b,v15.16b,v15.16b 534 1.1 christos ins v13.d[1],v15.d[0] 535 1.1 christos umlal v11.2d,v29.2s,v3.s[2] 536 1.1 christos umlal v12.2d,v29.2s,v3.s[3] 537 1.1 christos add v6.2d,v6.2d,v13.2d 538 1.1 christos st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+7] 539 1.1 christos add x10,sp,#8 // rewind 540 1.1 christos sub x8,x5,#8 541 1.1 christos b .LNEON_8n_inner 542 1.1 christos 543 1.1 christos .align 4 544 1.1 christos .LNEON_8n_inner: 545 1.1 christos subs x8,x8,#8 546 1.1 christos umlal v6.2d,v28.2s,v0.s[0] 547 1.1 christos ld1 {v13.2d},[x6] 548 1.1 christos umlal v7.2d,v28.2s,v0.s[1] 549 1.1 christos ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+0] 550 1.1 christos umlal v8.2d,v28.2s,v0.s[2] 551 1.1 christos ld1 {v2.4s,v3.4s},[x3],#32 552 1.1 christos umlal v9.2d,v28.2s,v0.s[3] 553 1.1 christos b.eq .LInner_jump 554 1.1 christos add x6,x6,#16 // don't advance in last iteration 555 1.1 christos .LInner_jump: 556 1.1 christos umlal v10.2d,v28.2s,v1.s[0] 557 1.1 christos umlal v11.2d,v28.2s,v1.s[1] 558 1.1 christos umlal v12.2d,v28.2s,v1.s[2] 559 1.1 christos umlal v13.2d,v28.2s,v1.s[3] 560 1.1 christos ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+1] 
561 1.1 christos umlal v6.2d,v29.2s,v2.s[0] 562 1.1 christos umlal v7.2d,v29.2s,v2.s[1] 563 1.1 christos umlal v8.2d,v29.2s,v2.s[2] 564 1.1 christos umlal v9.2d,v29.2s,v2.s[3] 565 1.1 christos umlal v10.2d,v29.2s,v3.s[0] 566 1.1 christos umlal v11.2d,v29.2s,v3.s[1] 567 1.1 christos umlal v12.2d,v29.2s,v3.s[2] 568 1.1 christos umlal v13.2d,v29.2s,v3.s[3] 569 1.1 christos st1 {v6.2d},[x7],#16 570 1.1 christos umlal v7.2d,v28.2s,v0.s[0] 571 1.1 christos ld1 {v6.2d},[x6] 572 1.1 christos umlal v8.2d,v28.2s,v0.s[1] 573 1.1 christos ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+1] 574 1.1 christos umlal v9.2d,v28.2s,v0.s[2] 575 1.1 christos b.eq .LInner_jump1 576 1.1 christos add x6,x6,#16 // don't advance in last iteration 577 1.1 christos .LInner_jump1: 578 1.1 christos umlal v10.2d,v28.2s,v0.s[3] 579 1.1 christos umlal v11.2d,v28.2s,v1.s[0] 580 1.1 christos umlal v12.2d,v28.2s,v1.s[1] 581 1.1 christos umlal v13.2d,v28.2s,v1.s[2] 582 1.1 christos umlal v6.2d,v28.2s,v1.s[3] 583 1.1 christos ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+2] 584 1.1 christos umlal v7.2d,v29.2s,v2.s[0] 585 1.1 christos umlal v8.2d,v29.2s,v2.s[1] 586 1.1 christos umlal v9.2d,v29.2s,v2.s[2] 587 1.1 christos umlal v10.2d,v29.2s,v2.s[3] 588 1.1 christos umlal v11.2d,v29.2s,v3.s[0] 589 1.1 christos umlal v12.2d,v29.2s,v3.s[1] 590 1.1 christos umlal v13.2d,v29.2s,v3.s[2] 591 1.1 christos umlal v6.2d,v29.2s,v3.s[3] 592 1.1 christos st1 {v7.2d},[x7],#16 593 1.1 christos umlal v8.2d,v28.2s,v0.s[0] 594 1.1 christos ld1 {v7.2d},[x6] 595 1.1 christos umlal v9.2d,v28.2s,v0.s[1] 596 1.1 christos ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+2] 597 1.1 christos umlal v10.2d,v28.2s,v0.s[2] 598 1.1 christos b.eq .LInner_jump2 599 1.1 christos add x6,x6,#16 // don't advance in last iteration 600 1.1 christos .LInner_jump2: 601 1.1 christos umlal v11.2d,v28.2s,v0.s[3] 602 1.1 christos umlal v12.2d,v28.2s,v1.s[0] 603 1.1 christos umlal v13.2d,v28.2s,v1.s[1] 604 1.1 christos umlal v6.2d,v28.2s,v1.s[2] 605 1.1 
christos umlal v7.2d,v28.2s,v1.s[3] 606 1.1 christos ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+3] 607 1.1 christos umlal v8.2d,v29.2s,v2.s[0] 608 1.1 christos umlal v9.2d,v29.2s,v2.s[1] 609 1.1 christos umlal v10.2d,v29.2s,v2.s[2] 610 1.1 christos umlal v11.2d,v29.2s,v2.s[3] 611 1.1 christos umlal v12.2d,v29.2s,v3.s[0] 612 1.1 christos umlal v13.2d,v29.2s,v3.s[1] 613 1.1 christos umlal v6.2d,v29.2s,v3.s[2] 614 1.1 christos umlal v7.2d,v29.2s,v3.s[3] 615 1.1 christos st1 {v8.2d},[x7],#16 616 1.1 christos umlal v9.2d,v28.2s,v0.s[0] 617 1.1 christos ld1 {v8.2d},[x6] 618 1.1 christos umlal v10.2d,v28.2s,v0.s[1] 619 1.1 christos ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+3] 620 1.1 christos umlal v11.2d,v28.2s,v0.s[2] 621 1.1 christos b.eq .LInner_jump3 622 1.1 christos add x6,x6,#16 // don't advance in last iteration 623 1.1 christos .LInner_jump3: 624 1.1 christos umlal v12.2d,v28.2s,v0.s[3] 625 1.1 christos umlal v13.2d,v28.2s,v1.s[0] 626 1.1 christos umlal v6.2d,v28.2s,v1.s[1] 627 1.1 christos umlal v7.2d,v28.2s,v1.s[2] 628 1.1 christos umlal v8.2d,v28.2s,v1.s[3] 629 1.1 christos ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+4] 630 1.1 christos umlal v9.2d,v29.2s,v2.s[0] 631 1.1 christos umlal v10.2d,v29.2s,v2.s[1] 632 1.1 christos umlal v11.2d,v29.2s,v2.s[2] 633 1.1 christos umlal v12.2d,v29.2s,v2.s[3] 634 1.1 christos umlal v13.2d,v29.2s,v3.s[0] 635 1.1 christos umlal v6.2d,v29.2s,v3.s[1] 636 1.1 christos umlal v7.2d,v29.2s,v3.s[2] 637 1.1 christos umlal v8.2d,v29.2s,v3.s[3] 638 1.1 christos st1 {v9.2d},[x7],#16 639 1.1 christos umlal v10.2d,v28.2s,v0.s[0] 640 1.1 christos ld1 {v9.2d},[x6] 641 1.1 christos umlal v11.2d,v28.2s,v0.s[1] 642 1.1 christos ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+4] 643 1.1 christos umlal v12.2d,v28.2s,v0.s[2] 644 1.1 christos b.eq .LInner_jump4 645 1.1 christos add x6,x6,#16 // don't advance in last iteration 646 1.1 christos .LInner_jump4: 647 1.1 christos umlal v13.2d,v28.2s,v0.s[3] 648 1.1 christos umlal v6.2d,v28.2s,v1.s[0] 649 
1.1 christos umlal v7.2d,v28.2s,v1.s[1] 650 1.1 christos umlal v8.2d,v28.2s,v1.s[2] 651 1.1 christos umlal v9.2d,v28.2s,v1.s[3] 652 1.1 christos ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+5] 653 1.1 christos umlal v10.2d,v29.2s,v2.s[0] 654 1.1 christos umlal v11.2d,v29.2s,v2.s[1] 655 1.1 christos umlal v12.2d,v29.2s,v2.s[2] 656 1.1 christos umlal v13.2d,v29.2s,v2.s[3] 657 1.1 christos umlal v6.2d,v29.2s,v3.s[0] 658 1.1 christos umlal v7.2d,v29.2s,v3.s[1] 659 1.1 christos umlal v8.2d,v29.2s,v3.s[2] 660 1.1 christos umlal v9.2d,v29.2s,v3.s[3] 661 1.1 christos st1 {v10.2d},[x7],#16 662 1.1 christos umlal v11.2d,v28.2s,v0.s[0] 663 1.1 christos ld1 {v10.2d},[x6] 664 1.1 christos umlal v12.2d,v28.2s,v0.s[1] 665 1.1 christos ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+5] 666 1.1 christos umlal v13.2d,v28.2s,v0.s[2] 667 1.1 christos b.eq .LInner_jump5 668 1.1 christos add x6,x6,#16 // don't advance in last iteration 669 1.1 christos .LInner_jump5: 670 1.1 christos umlal v6.2d,v28.2s,v0.s[3] 671 1.1 christos umlal v7.2d,v28.2s,v1.s[0] 672 1.1 christos umlal v8.2d,v28.2s,v1.s[1] 673 1.1 christos umlal v9.2d,v28.2s,v1.s[2] 674 1.1 christos umlal v10.2d,v28.2s,v1.s[3] 675 1.1 christos ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+6] 676 1.1 christos umlal v11.2d,v29.2s,v2.s[0] 677 1.1 christos umlal v12.2d,v29.2s,v2.s[1] 678 1.1 christos umlal v13.2d,v29.2s,v2.s[2] 679 1.1 christos umlal v6.2d,v29.2s,v2.s[3] 680 1.1 christos umlal v7.2d,v29.2s,v3.s[0] 681 1.1 christos umlal v8.2d,v29.2s,v3.s[1] 682 1.1 christos umlal v9.2d,v29.2s,v3.s[2] 683 1.1 christos umlal v10.2d,v29.2s,v3.s[3] 684 1.1 christos st1 {v11.2d},[x7],#16 685 1.1 christos umlal v12.2d,v28.2s,v0.s[0] 686 1.1 christos ld1 {v11.2d},[x6] 687 1.1 christos umlal v13.2d,v28.2s,v0.s[1] 688 1.1 christos ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+6] 689 1.1 christos umlal v6.2d,v28.2s,v0.s[2] 690 1.1 christos b.eq .LInner_jump6 691 1.1 christos add x6,x6,#16 // don't advance in last iteration 692 1.1 christos 
.LInner_jump6: 693 1.1 christos umlal v7.2d,v28.2s,v0.s[3] 694 1.1 christos umlal v8.2d,v28.2s,v1.s[0] 695 1.1 christos umlal v9.2d,v28.2s,v1.s[1] 696 1.1 christos umlal v10.2d,v28.2s,v1.s[2] 697 1.1 christos umlal v11.2d,v28.2s,v1.s[3] 698 1.1 christos ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+7] 699 1.1 christos umlal v12.2d,v29.2s,v2.s[0] 700 1.1 christos umlal v13.2d,v29.2s,v2.s[1] 701 1.1 christos umlal v6.2d,v29.2s,v2.s[2] 702 1.1 christos umlal v7.2d,v29.2s,v2.s[3] 703 1.1 christos umlal v8.2d,v29.2s,v3.s[0] 704 1.1 christos umlal v9.2d,v29.2s,v3.s[1] 705 1.1 christos umlal v10.2d,v29.2s,v3.s[2] 706 1.1 christos umlal v11.2d,v29.2s,v3.s[3] 707 1.1 christos st1 {v12.2d},[x7],#16 708 1.1 christos umlal v13.2d,v28.2s,v0.s[0] 709 1.1 christos ld1 {v12.2d},[x6] 710 1.1 christos umlal v6.2d,v28.2s,v0.s[1] 711 1.1 christos ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+7] 712 1.1 christos umlal v7.2d,v28.2s,v0.s[2] 713 1.1 christos b.eq .LInner_jump7 714 1.1 christos add x6,x6,#16 // don't advance in last iteration 715 1.1 christos .LInner_jump7: 716 1.1 christos umlal v8.2d,v28.2s,v0.s[3] 717 1.1 christos umlal v9.2d,v28.2s,v1.s[0] 718 1.1 christos umlal v10.2d,v28.2s,v1.s[1] 719 1.1 christos umlal v11.2d,v28.2s,v1.s[2] 720 1.1 christos umlal v12.2d,v28.2s,v1.s[3] 721 1.1 christos b.ne .LInner_after_rewind8 722 1.1 christos sub x1,x1,x5,lsl#2 // rewind 723 1.1 christos .LInner_after_rewind8: 724 1.1 christos umlal v13.2d,v29.2s,v2.s[0] 725 1.1 christos ld1 {v28.2s},[sp] // pull smashed b[8*i+0] 726 1.1 christos umlal v6.2d,v29.2s,v2.s[1] 727 1.1 christos ld1 {v0.4s,v1.4s},[x1],#32 728 1.1 christos umlal v7.2d,v29.2s,v2.s[2] 729 1.1 christos add x10,sp,#8 // rewind 730 1.1 christos umlal v8.2d,v29.2s,v2.s[3] 731 1.1 christos umlal v9.2d,v29.2s,v3.s[0] 732 1.1 christos umlal v10.2d,v29.2s,v3.s[1] 733 1.1 christos umlal v11.2d,v29.2s,v3.s[2] 734 1.1 christos st1 {v13.2d},[x7],#16 735 1.1 christos umlal v12.2d,v29.2s,v3.s[3] 736 1.1 christos 737 1.1 christos bne 
.LNEON_8n_inner 738 1.1 christos add x6,sp,#128 739 1.1 christos st1 {v6.2d,v7.2d},[x7],#32 740 1.1 christos eor v2.16b,v2.16b,v2.16b // v2 741 1.1 christos st1 {v8.2d,v9.2d},[x7],#32 742 1.1 christos eor v3.16b,v3.16b,v3.16b // v3 743 1.1 christos st1 {v10.2d,v11.2d},[x7],#32 744 1.1 christos st1 {v12.2d},[x7] 745 1.1 christos 746 1.1 christos subs x9,x9,#8 747 1.1 christos ld1 {v6.2d,v7.2d},[x6],#32 748 1.1 christos ld1 {v8.2d,v9.2d},[x6],#32 749 1.1 christos ld1 {v10.2d,v11.2d},[x6],#32 750 1.1 christos ld1 {v12.2d,v13.2d},[x6],#32 751 1.1 christos 752 1.1 christos b.eq .LInner_8n_jump_2steps 753 1.1 christos sub x3,x3,x5,lsl#2 // rewind 754 1.1 christos b .LNEON_8n_outer 755 1.1 christos 756 1.1 christos .LInner_8n_jump_2steps: 757 1.1 christos add x7,sp,#128 758 1.1 christos st1 {v2.2d,v3.2d}, [sp],#32 // start wiping stack frame 759 1.1 christos mov v5.16b,v6.16b 760 1.1 christos ushr v15.2d,v6.2d,#16 761 1.1 christos ext v6.16b,v6.16b,v6.16b,#8 762 1.1 christos st1 {v2.2d,v3.2d}, [sp],#32 763 1.1 christos add v6.2d,v6.2d,v15.2d 764 1.1 christos st1 {v2.2d,v3.2d}, [sp],#32 765 1.1 christos ushr v15.2d,v6.2d,#16 766 1.1 christos st1 {v2.2d,v3.2d}, [sp],#32 767 1.1 christos zip1 v6.4h,v5.4h,v6.4h 768 1.1 christos ins v15.d[1],v14.d[0] 769 1.1 christos 770 1.1 christos mov x8,x5 771 1.1 christos b .LNEON_tail_entry 772 1.1 christos 773 1.1 christos .align 4 774 1.1 christos .LNEON_tail: 775 1.1 christos add v6.2d,v6.2d,v15.2d 776 1.1 christos mov v5.16b,v6.16b 777 1.1 christos ushr v15.2d,v6.2d,#16 778 1.1 christos ext v6.16b,v6.16b,v6.16b,#8 779 1.1 christos ld1 {v8.2d,v9.2d}, [x6],#32 780 1.1 christos add v6.2d,v6.2d,v15.2d 781 1.1 christos ld1 {v10.2d,v11.2d}, [x6],#32 782 1.1 christos ushr v15.2d,v6.2d,#16 783 1.1 christos ld1 {v12.2d,v13.2d}, [x6],#32 784 1.1 christos zip1 v6.4h,v5.4h,v6.4h 785 1.1 christos ins v15.d[1],v14.d[0] 786 1.1 christos 787 1.1 christos .LNEON_tail_entry: 788 1.1 christos add v7.2d,v7.2d,v15.2d 789 1.1 christos st1 {v6.s}[0], 
[x7],#4 790 1.1 christos ushr v15.2d,v7.2d,#16 791 1.1 christos mov v5.16b,v7.16b 792 1.1 christos ext v7.16b,v7.16b,v7.16b,#8 793 1.1 christos add v7.2d,v7.2d,v15.2d 794 1.1 christos ushr v15.2d,v7.2d,#16 795 1.1 christos zip1 v7.4h,v5.4h,v7.4h 796 1.1 christos ins v15.d[1],v14.d[0] 797 1.1 christos add v8.2d,v8.2d,v15.2d 798 1.1 christos st1 {v7.s}[0], [x7],#4 799 1.1 christos ushr v15.2d,v8.2d,#16 800 1.1 christos mov v5.16b,v8.16b 801 1.1 christos ext v8.16b,v8.16b,v8.16b,#8 802 1.1 christos add v8.2d,v8.2d,v15.2d 803 1.1 christos ushr v15.2d,v8.2d,#16 804 1.1 christos zip1 v8.4h,v5.4h,v8.4h 805 1.1 christos ins v15.d[1],v14.d[0] 806 1.1 christos add v9.2d,v9.2d,v15.2d 807 1.1 christos st1 {v8.s}[0], [x7],#4 808 1.1 christos ushr v15.2d,v9.2d,#16 809 1.1 christos mov v5.16b,v9.16b 810 1.1 christos ext v9.16b,v9.16b,v9.16b,#8 811 1.1 christos add v9.2d,v9.2d,v15.2d 812 1.1 christos ushr v15.2d,v9.2d,#16 813 1.1 christos zip1 v9.4h,v5.4h,v9.4h 814 1.1 christos ins v15.d[1],v14.d[0] 815 1.1 christos add v10.2d,v10.2d,v15.2d 816 1.1 christos st1 {v9.s}[0], [x7],#4 817 1.1 christos ushr v15.2d,v10.2d,#16 818 1.1 christos mov v5.16b,v10.16b 819 1.1 christos ext v10.16b,v10.16b,v10.16b,#8 820 1.1 christos add v10.2d,v10.2d,v15.2d 821 1.1 christos ushr v15.2d,v10.2d,#16 822 1.1 christos zip1 v10.4h,v5.4h,v10.4h 823 1.1 christos ins v15.d[1],v14.d[0] 824 1.1 christos add v11.2d,v11.2d,v15.2d 825 1.1 christos st1 {v10.s}[0], [x7],#4 826 1.1 christos ushr v15.2d,v11.2d,#16 827 1.1 christos mov v5.16b,v11.16b 828 1.1 christos ext v11.16b,v11.16b,v11.16b,#8 829 1.1 christos add v11.2d,v11.2d,v15.2d 830 1.1 christos ushr v15.2d,v11.2d,#16 831 1.1 christos zip1 v11.4h,v5.4h,v11.4h 832 1.1 christos ins v15.d[1],v14.d[0] 833 1.1 christos add v12.2d,v12.2d,v15.2d 834 1.1 christos st1 {v11.s}[0], [x7],#4 835 1.1 christos ushr v15.2d,v12.2d,#16 836 1.1 christos mov v5.16b,v12.16b 837 1.1 christos ext v12.16b,v12.16b,v12.16b,#8 838 1.1 christos add v12.2d,v12.2d,v15.2d 839 1.1 
christos ushr v15.2d,v12.2d,#16 840 1.1 christos zip1 v12.4h,v5.4h,v12.4h 841 1.1 christos ins v15.d[1],v14.d[0] 842 1.1 christos add v13.2d,v13.2d,v15.2d 843 1.1 christos st1 {v12.s}[0], [x7],#4 844 1.1 christos ushr v15.2d,v13.2d,#16 845 1.1 christos mov v5.16b,v13.16b 846 1.1 christos ext v13.16b,v13.16b,v13.16b,#8 847 1.1 christos add v13.2d,v13.2d,v15.2d 848 1.1 christos ushr v15.2d,v13.2d,#16 849 1.1 christos zip1 v13.4h,v5.4h,v13.4h 850 1.1 christos ins v15.d[1],v14.d[0] 851 1.1 christos ld1 {v6.2d,v7.2d}, [x6],#32 852 1.1 christos subs x8,x8,#8 853 1.1 christos st1 {v13.s}[0], [x7],#4 854 1.1 christos bne .LNEON_tail 855 1.1 christos 856 1.1 christos st1 {v15.s}[0], [x7],#4 // top-most bit 857 1.1 christos sub x3,x3,x5,lsl#2 // rewind x3 858 1.1 christos subs x1,sp,#0 // clear carry flag 859 1.1 christos add x2,sp,x5,lsl#2 860 1.1 christos 861 1.1 christos .LNEON_sub: 862 1.1 christos ldp w4,w5,[x1],#8 863 1.1 christos ldp w6,w7,[x1],#8 864 1.1 christos ldp w8,w9,[x3],#8 865 1.1 christos ldp w10,w11,[x3],#8 866 1.1 christos sbcs w8,w4,w8 867 1.1 christos sbcs w9,w5,w9 868 1.1 christos sbcs w10,w6,w10 869 1.1 christos sbcs w11,w7,w11 870 1.1 christos sub x17,x2,x1 871 1.1 christos stp w8,w9,[x0],#8 872 1.1 christos stp w10,w11,[x0],#8 873 1.1 christos cbnz x17,.LNEON_sub 874 1.1 christos 875 1.1 christos ldr w10, [x1] // load top-most bit 876 1.1 christos mov x11,sp 877 1.1 christos eor v0.16b,v0.16b,v0.16b 878 1.1 christos sub x11,x2,x11 // this is num*4 879 1.1 christos eor v1.16b,v1.16b,v1.16b 880 1.1 christos mov x1,sp 881 1.1 christos sub x0,x0,x11 // rewind x0 882 1.1 christos mov x3,x2 // second 3/4th of frame 883 1.1 christos sbcs w10,w10,wzr // result is carry flag 884 1.1 christos 885 1.1 christos .LNEON_copy_n_zap: 886 1.1 christos ldp w4,w5,[x1],#8 887 1.1 christos ldp w6,w7,[x1],#8 888 1.1 christos ldp w8,w9,[x0],#8 889 1.1 christos ldp w10,w11,[x0] 890 1.1 christos sub x0,x0,#8 891 1.1 christos b.cs .LCopy_1 892 1.1 christos mov w8,w4 893 1.1 
christos mov w9,w5 894 1.1 christos mov w10,w6 895 1.1 christos mov w11,w7 896 1.1 christos .LCopy_1: 897 1.1 christos st1 {v0.2d,v1.2d}, [x3],#32 // wipe 898 1.1 christos st1 {v0.2d,v1.2d}, [x3],#32 // wipe 899 1.1 christos ldp w4,w5,[x1],#8 900 1.1 christos ldp w6,w7,[x1],#8 901 1.1 christos stp w8,w9,[x0],#8 902 1.1 christos stp w10,w11,[x0],#8 903 1.1 christos sub x1,x1,#32 904 1.1 christos ldp w8,w9,[x0],#8 905 1.1 christos ldp w10,w11,[x0] 906 1.1 christos sub x0,x0,#8 907 1.1 christos b.cs .LCopy_2 908 1.1 christos mov w8, w4 909 1.1 christos mov w9, w5 910 1.1 christos mov w10, w6 911 1.1 christos mov w11, w7 912 1.1 christos .LCopy_2: 913 1.1 christos st1 {v0.2d,v1.2d}, [x1],#32 // wipe 914 1.1 christos st1 {v0.2d,v1.2d}, [x3],#32 // wipe 915 1.1 christos sub x17,x2,x1 // preserves carry 916 1.1 christos stp w8,w9,[x0],#8 917 1.1 christos stp w10,w11,[x0],#8 918 1.1 christos cbnz x17,.LNEON_copy_n_zap 919 1.1 christos 920 1.1 christos mov sp,x16 921 1.1 christos ldp d14,d15,[sp,#64] 922 1.1 christos ldp d12,d13,[sp,#48] 923 1.1 christos ldp d10,d11,[sp,#32] 924 1.1 christos ldp d8,d9,[sp,#16] 925 1.1 christos ldr x29,[sp],#80 926 1.2 christos AARCH64_VALIDATE_LINK_REGISTER 927 1.1 christos ret // RET 928 1.1 christos 929 1.1 christos .size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
//----------------------------------------------------------------------
// __bn_sqr8x_mont -- Montgomery squaring, eight limbs per pass.
//
// Reached by a branch from bn_mul_mont when num is a multiple of 8.
// If ap != bp this is not a squaring and control goes straight on to
// __bn_mul4x_mont with all arguments untouched.
//
// In:	x0 = rp, x1 = ap, x2 = bp, x3 = np, x4 = &n0, x5 = num (limbs)
// Out:	x0 = 1
//
// Frame (128 bytes at x29):
//	[x29,#0]   x29,x30	[x29,#16..#95]  x19-x28
//	[x29,#96]  rp		[x29,#104] np	[x29,#112] n0
// Scratch t[] of 2*num limbs is carved out below the frame and is
// overwritten with zeros while the result is written back.
//----------------------------------------------------------------------
.type	__bn_sqr8x_mont,%function
.align	5
__bn_sqr8x_mont:
	cmp	x1,x2
	b.ne	__bn_mul4x_mont
.Lsqr8x_mont:
	// No AARCH64_SIGN_LINK_REGISTER here: the only way in is from
	// bn_mul_mont, which has already signed the return address.
	stp	x29,x30,[sp,#-128]!
	mov	x29,sp			// same encoding as add x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	x0,x3,[sp,#96]		// park rp and np in the frame

	ldp	x6,x7,[x1,#8*0]		// a[0..7]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	ldp	x12,x13,[x1,#8*6]

	sub	x2,sp,x5,lsl#4		// room for 2*num limbs
	lsl	x5,x5,#3		// num is a byte count from here on
	ldr	x4,[x4]			// *n0
	mov	sp,x2			// alloca
	sub	x27,x5,#8*8
	b	.Lsqr8x_zero_start

	// Zero t[8..2*num-1]; t[0..7] are stored by the first outer pass.
.Lsqr8x_zero:
	sub	x27,x27,#8*8
	stp	xzr,xzr,[x2,#8*0]
	stp	xzr,xzr,[x2,#8*2]
	stp	xzr,xzr,[x2,#8*4]
	stp	xzr,xzr,[x2,#8*6]
.Lsqr8x_zero_start:
	stp	xzr,xzr,[x2,#8*8]
	stp	xzr,xzr,[x2,#8*10]
	stp	xzr,xzr,[x2,#8*12]
	stp	xzr,xzr,[x2,#8*14]
	add	x2,x2,#8*16
	cbnz	x27,.Lsqr8x_zero

	add	x3,x1,x5		// x3 = &a[num]
	add	x1,x1,#8*8
	mov	x19,xzr			// x19-x26 = running t[] window
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	mov	x23,xzr
	mov	x24,xzr
	mov	x25,xzr
	mov	x26,xzr
	mov	x2,sp
	str	x4,[x29,#112]		// park n0 in the frame

	// Phase 1: accumulate every product except the squares a[i]*a[i].
.align	4
.Lsqr8x_outer_loop:
	// Products formed inside the current 8-limb window:
	//   (i)   a[1..7]*a[0]
	//   (ii)  a[2..7]*a[1]
	//   (iii) a[3..7]*a[2]
	//   (iv)  a[4..7]*a[3]
	//   (v)   a[5..7]*a[4]
	//   (vi)  a[6..7]*a[5]
	//   (vii) a[7]*a[6]

	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
	mul	x15,x8,x6
	mul	x16,x9,x6
	mul	x17,x10,x6
	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
	mul	x14,x11,x6
	adcs	x21,x21,x15
	mul	x15,x12,x6
	adcs	x22,x22,x16
	mul	x16,x13,x6
	adcs	x23,x23,x17
	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
	adcs	x24,x24,x14
	umulh	x14,x8,x6
	adcs	x25,x25,x15
	umulh	x15,x9,x6
	adcs	x26,x26,x16
	umulh	x16,x10,x6
	stp	x19,x20,[x2],#8*2	// t[0..1]
	adc	x19,xzr,xzr		// t[8]
	adds	x21,x21,x17		// t[2]+hi(a[1]*a[0])
	umulh	x17,x11,x6
	adcs	x22,x22,x14
	umulh	x14,x12,x6
	adcs	x23,x23,x15
	umulh	x15,x13,x6
	adcs	x24,x24,x16
	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
	adcs	x25,x25,x17
	mul	x17,x9,x7
	adcs	x26,x26,x14
	mul	x14,x10,x7
	adc	x19,x19,x15

	mul	x15,x11,x7
	adds	x22,x22,x16
	mul	x16,x12,x7
	adcs	x23,x23,x17
	mul	x17,x13,x7
	adcs	x24,x24,x14
	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
	adcs	x25,x25,x15
	umulh	x15,x9,x7
	adcs	x26,x26,x16
	umulh	x16,x10,x7
	adcs	x19,x19,x17
	umulh	x17,x11,x7
	stp	x21,x22,[x2],#8*2	// t[2..3]
	adc	x20,xzr,xzr		// t[9]
	adds	x23,x23,x14
	umulh	x14,x12,x7
	adcs	x24,x24,x15
	umulh	x15,x13,x7
	adcs	x25,x25,x16
	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
	adcs	x26,x26,x17
	mul	x17,x10,x8
	adcs	x19,x19,x14
	mul	x14,x11,x8
	adc	x20,x20,x15

	mul	x15,x12,x8
	adds	x24,x24,x16
	mul	x16,x13,x8
	adcs	x25,x25,x17
	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
	adcs	x26,x26,x14
	umulh	x14,x10,x8
	adcs	x19,x19,x15
	umulh	x15,x11,x8
	adcs	x20,x20,x16
	umulh	x16,x12,x8
	stp	x23,x24,[x2],#8*2	// t[4..5]
	adc	x21,xzr,xzr		// t[10]
	adds	x25,x25,x17
	umulh	x17,x13,x8
	adcs	x26,x26,x14
	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
	adcs	x19,x19,x15
	mul	x15,x11,x9
	adcs	x20,x20,x16
	mul	x16,x12,x9
	adc	x21,x21,x17

	mul	x17,x13,x9
	adds	x26,x26,x14
	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
	adcs	x19,x19,x15
	umulh	x15,x11,x9
	adcs	x20,x20,x16
	umulh	x16,x12,x9
	adcs	x21,x21,x17
	umulh	x17,x13,x9
	stp	x25,x26,[x2],#8*2	// t[6..7]
	adc	x22,xzr,xzr		// t[11]
	adds	x19,x19,x14
	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
	adcs	x20,x20,x15
	mul	x15,x12,x10
	adcs	x21,x21,x16
	mul	x16,x13,x10
	adc	x22,x22,x17

	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
	adds	x20,x20,x14
	umulh	x14,x12,x10
	adcs	x21,x21,x15
	umulh	x15,x13,x10
	adcs	x22,x22,x16
	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
	adc	x23,xzr,xzr		// t[12]
	adds	x21,x21,x17
	mul	x17,x13,x11
	adcs	x22,x22,x14
	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
	adc	x23,x23,x15

	umulh	x15,x13,x11
	adds	x22,x22,x16
	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
	adcs	x23,x23,x17
	umulh	x17,x13,x12		// hi(a[7]*a[6])
	adc	x24,xzr,xzr		// t[13]
	adds	x23,x23,x14
	sub	x27,x3,x1		// zero once the last window is done
	adc	x24,x24,x15

	adds	x24,x24,x16
	sub	x14,x3,x5		// rewound ap, i.e. &a[0]
	adc	x25,xzr,xzr		// t[14]
	add	x25,x25,x17

	cbz	x27,.Lsqr8x_outer_break

	mov	x4,x6			// first multiplier limb
	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x0,x1
	adcs	x26,xzr,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved below
	mov	x27,#-8*8

	// Rectangle of products between the next eight limbs (x6-x13)
	// and the eight multiplier limbs, e.g. a[8..f]*a[0] through
	// a[8..f]*a[7]; one multiplier limb (x4) per iteration.
.Lsqr8x_mul:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]		// next multiplier limb
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,.Lsqr8x_mul
					// note that carry flag is guaranteed
					// to be zero at this point
	cmp	x1,x3			// done yet?
	b.eq	.Lsqr8x_break

	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	ldur	x4,[x0,#-8*8]
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	.Lsqr8x_mul

.align	4
.Lsqr8x_break:
	ldp	x6,x7,[x0,#8*0]
	add	x1,x0,#8*8
	ldp	x8,x9,[x0,#8*2]
	sub	x14,x3,x1		// is it last iteration?
	ldp	x10,x11,[x0,#8*4]
	sub	x15,x2,x14
	ldp	x12,x13,[x0,#8*6]
	cbz	x14,.Lsqr8x_outer_loop

	stp	x19,x20,[x2,#8*0]
	ldp	x19,x20,[x15,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x15,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x15,#8*4]
	stp	x25,x26,[x2,#8*6]
	mov	x2,x15
	ldp	x25,x26,[x15,#8*6]
	b	.Lsqr8x_outer_loop

.align	4
.Lsqr8x_outer_break:
	// Phase 2: double the accumulated sum (shift left by one bit
	// across limbs) and add a[n-1]*a[n-1]|...|a[0]*a[0].
	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
	ldp	x15,x16,[sp,#8*1]
	ldp	x11,x13,[x14,#8*2]
	add	x1,x14,#8*4
	ldp	x17,x14,[sp,#8*3]

	stp	x19,x20,[x2,#8*0]
	mul	x19,x7,x7
	stp	x21,x22,[x2,#8*2]
	umulh	x7,x7,x7
	stp	x23,x24,[x2,#8*4]
	mul	x8,x9,x9
	stp	x25,x26,[x2,#8*6]
	mov	x2,sp
	umulh	x9,x9,x9
	adds	x20,x7,x15,lsl#1
	extr	x15,x16,x15,#63
	sub	x27,x5,#8*4

.Lsqr4x_shift_n_add:
	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	sub	x27,x27,#8*4
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	ldp	x7,x9,[x1],#8*2
	umulh	x11,x11,x11
	mul	x12,x13,x13
	umulh	x13,x13,x13
	extr	x17,x14,x17,#63
	stp	x19,x20,[x2,#8*0]
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	stp	x21,x22,[x2,#8*2]
	adcs	x24,x11,x14
	ldp	x17,x14,[x2,#8*7]
	extr	x15,x16,x15,#63
	adcs	x25,x12,x15
	extr	x16,x17,x16,#63
	adcs	x26,x13,x16
	ldp	x15,x16,[x2,#8*9]
	mul	x6,x7,x7
	ldp	x11,x13,[x1],#8*2
	umulh	x7,x7,x7
	mul	x8,x9,x9
	umulh	x9,x9,x9
	stp	x23,x24,[x2,#8*4]
	extr	x17,x14,x17,#63
	stp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	adcs	x19,x6,x17
	extr	x14,x15,x14,#63
	adcs	x20,x7,x14
	ldp	x17,x14,[x2,#8*3]
	extr	x15,x16,x15,#63
	cbnz	x27,.Lsqr4x_shift_n_add
	ldp	x1,x4,[x29,#104]	// pull np and n0

	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	umulh	x11,x11,x11
	stp	x19,x20,[x2,#8*0]
	mul	x12,x13,x13
	umulh	x13,x13,x13
	stp	x21,x22,[x2,#8*2]
	extr	x17,x14,x17,#63
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	ldp	x19,x20,[sp,#8*0]
	adcs	x24,x11,x14
	extr	x15,x16,x15,#63
	ldp	x6,x7,[x1,#8*0]
	adcs	x25,x12,x15
	extr	x16,xzr,x16,#63
	ldp	x8,x9,[x1,#8*2]
	adc	x26,x13,x16
	ldp	x10,x11,[x1,#8*4]

	// Phase 3: Montgomery reduction, 512 bits per iteration.
	mul	x28,x4,x19		// t[0]*n0
	ldp	x12,x13,[x1,#8*6]
	add	x3,x1,x5		// x3 = &n[num]
	ldp	x21,x22,[sp,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[sp,#8*4]
	stp	x25,x26,[x2,#8*6]
	ldp	x25,x26,[sp,#8*6]
	add	x1,x1,#8*8
	mov	x30,xzr			// initial top-most carry
	mov	x2,sp
	mov	x27,#8			// eight reduction steps per window

.Lsqr8x_reduction:
	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
	mul	x15,x7,x28
	sub	x27,x27,#1
	mul	x16,x8,x28
	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
	mul	x17,x9,x28
	// (*)	adds	xzr,x19,x14
	// (*) The low limb of that sum is discarded and only its carry
	//     matters; the carry is produced as (t[0] != 0) instead.
	cmp	x19,#1			// (*) same encoding as subs xzr,x19,#1
	mul	x14,x10,x28
	adcs	x19,x20,x15
	mul	x15,x11,x28
	adcs	x20,x21,x16
	mul	x16,x12,x28
	adcs	x21,x22,x17
	mul	x17,x13,x28
	adcs	x22,x23,x14
	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
	adcs	x23,x24,x15
	umulh	x15,x7,x28
	adcs	x24,x25,x16
	umulh	x16,x8,x28
	adcs	x25,x26,x17
	umulh	x17,x9,x28
	adc	x26,xzr,xzr
	adds	x19,x19,x14
	umulh	x14,x10,x28
	adcs	x20,x20,x15
	umulh	x15,x11,x28
	adcs	x21,x21,x16
	umulh	x16,x12,x28
	adcs	x22,x22,x17
	umulh	x17,x13,x28
	mul	x28,x4,x19		// next t[0]*n0
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adc	x26,x26,x17
	cbnz	x27,.Lsqr8x_reduction

	ldp	x14,x15,[x2,#8*0]
	ldp	x16,x17,[x2,#8*2]
	mov	x0,x2
	sub	x27,x3,x1		// done yet?
	adds	x19,x19,x14
	adcs	x20,x20,x15
	ldp	x14,x15,[x2,#8*4]
	adcs	x21,x21,x16
	adcs	x22,x22,x17
	ldp	x16,x17,[x2,#8*6]
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adcs	x26,x26,x17
	//adc	x28,xzr,xzr		// moved below
	cbz	x27,.Lsqr8x8_post_condition

	ldur	x4,[x2,#-8*8]		// first of the saved t[0]*n0 values
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	mov	x27,#-8*8
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8

	// Apply the eight saved multipliers to the next eight limbs of n[].
.Lsqr8x_tail:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]		// next saved multiplier
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,.Lsqr8x_tail
					// note that carry flag is guaranteed
					// to be zero at this point
	ldp	x6,x7,[x2,#8*0]
	sub	x27,x3,x1		// done yet?
	sub	x16,x3,x5		// rewound np, i.e. &n[0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	cbz	x27,.Lsqr8x_tail_break

	ldur	x4,[x0,#-8*8]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	.Lsqr8x_tail

.align	4
.Lsqr8x_tail_break:
	ldr	x4,[x29,#112]		// pull n0
	add	x27,x2,#8*8		// end of current t[num] window

	cmp	x30,#1			// "move" top-most carry to carry bit
					// (same encoding as subs xzr,x30,#1)
	adcs	x14,x19,x6
	adcs	x15,x20,x7
	ldp	x19,x20,[x0,#8*0]
	adcs	x21,x21,x8
	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
	adcs	x22,x22,x9
	ldp	x8,x9,[x16,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x16,#8*4]
	adcs	x25,x25,x12
	adcs	x26,x26,x13
	ldp	x12,x13,[x16,#8*6]
	add	x1,x16,#8*8
	adc	x30,xzr,xzr		// top-most carry
	mul	x28,x4,x19
	stp	x14,x15,[x2,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x0,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x0,#8*4]
	cmp	x27,x29			// did we hit the bottom?
	stp	x25,x26,[x2,#8*6]
	mov	x2,x0			// slide the window
	ldp	x25,x26,[x0,#8*6]
	mov	x27,#8
	b.ne	.Lsqr8x_reduction

	// Phase 4: the result may still exceed the modulus. Subtract the
	// modulus into rp, see whether that borrowed, and if it did copy
	// the unsubtracted value over rp instead.
	ldr	x0,[x29,#96]		// pull rp
	add	x2,x2,#8*8
	subs	x14,x19,x6
	sbcs	x15,x20,x7
	sub	x27,x5,#8*8
	mov	x3,x0			// keep the start of rp for the copy pass

.Lsqr8x_sub:
	sbcs	x16,x21,x8
	ldp	x6,x7,[x1,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x1,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x10,x11,[x1,#8*4]
	sbcs	x17,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	ldp	x19,x20,[x2,#8*0]
	sub	x27,x27,#8*8
	ldp	x21,x22,[x2,#8*2]
	ldp	x23,x24,[x2,#8*4]
	ldp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	stp	x14,x15,[x0,#8*4]
	sbcs	x14,x19,x6
	stp	x16,x17,[x0,#8*6]
	add	x0,x0,#8*8
	sbcs	x15,x20,x7
	cbnz	x27,.Lsqr8x_sub

	sbcs	x16,x21,x8
	mov	x2,sp
	add	x1,sp,x5
	ldp	x6,x7,[x3,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x3,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x19,x20,[x1,#8*0]
	sbcs	x17,x26,x13
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr		// did it borrow?
	ldr	x30,[x29,#8]		// pull return address
	stp	x14,x15,[x0,#8*4]
	stp	x16,x17,[x0,#8*6]

	// Select per limb between the value held in scratch (taken on
	// borrow, "lo") and the difference already in rp; zero the
	// scratch as it is consumed.
	sub	x27,x5,#8*4
.Lsqr4x_cond_copy:
	sub	x27,x27,#8*4
	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	ldp	x6,x7,[x3,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x16,x21,x8,lo
	stp	xzr,xzr,[x2,#8*2]
	add	x2,x2,#8*4
	csel	x17,x22,x9,lo
	ldp	x8,x9,[x3,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	stp	xzr,xzr,[x1,#8*0]
	stp	xzr,xzr,[x1,#8*2]
	cbnz	x27,.Lsqr4x_cond_copy

	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	stp	xzr,xzr,[x2,#8*2]
	csel	x16,x21,x8,lo
	csel	x17,x22,x9,lo
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]

	b	.Lsqr8x_done

	// Short path taken when the whole modulus fits one 8-limb window.
.align	4
.Lsqr8x8_post_condition:
	adc	x28,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// x19-x26,x28 hold result, x6-x13 hold modulus
	subs	x6,x19,x6
	ldr	x1,[x29,#96]		// pull rp
	sbcs	x7,x20,x7
	stp	xzr,xzr,[sp,#8*0]	// zero the 16-limb scratch meanwhile
	sbcs	x8,x21,x8
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x9
	stp	xzr,xzr,[sp,#8*4]
	sbcs	x10,x23,x10
	stp	xzr,xzr,[sp,#8*6]
	sbcs	x11,x24,x11
	stp	xzr,xzr,[sp,#8*8]
	sbcs	x12,x25,x12
	stp	xzr,xzr,[sp,#8*10]
	sbcs	x13,x26,x13
	stp	xzr,xzr,[sp,#8*12]
	sbcs	x28,x28,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*14]

	// x6-x13 hold result-modulus; keep x19-x26 instead on borrow
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	csel	x10,x23,x10,lo
	csel	x11,x24,x11,lo
	stp	x8,x9,[x1,#8*2]
	csel	x12,x25,x12,lo
	csel	x13,x26,x13,lo
	stp	x10,x11,[x1,#8*4]
	stp	x12,x13,[x1,#8*6]

.Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29			// drop the scratch area
	ldp	x21,x22,[x29,#32]
	mov	x0,#1			// return value
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	// x30 is loaded earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
1692 1.1 christos .type
__bn_mul4x_mont,%function
//----------------------------------------------------------------------
// __bn_mul4x_mont -- Montgomery multiplication, four 64-bit limbs per
// pass.  Entered by branch (not by call) with the registers still
// holding the caller's arguments:
//
//   x0 = rp           result vector (saved at [x29,#96])
//   x1 = ap           a[]
//   x2 = bp           b[]
//   x3 = np           modulus n[]
//   x4 = &n0          pointer to n0; dereferenced once below
//   x5 = num          length in 64-bit words, turned into a byte
//                     count by the prologue
//
// Returns x0 = 1.  x19-x28 and x29 are saved in a 128-byte frame and
// restored on exit.  x30 is reused as a scratch register (it carries
// the top-most carry between outer iterations) and is reloaded from
// the frame at [x29,#8] before returning.
//
// Stack scratch: num*8 + 32 bytes directly below the frame, i.e. the
// range [sp, x29).  The first four words hold the per-limb t[0]*n0
// factors of the current outer iteration; the following num words
// hold the running result t[].  Every word of that range is zeroed
// before the function returns.
//
// Loop bookkeeping: x28 steps 8,16,24,0 (add #8 / and #31), so every
// inner loop runs four times per pass and doubles as the byte offset
// used to fetch the next b[] word and the next stored t[0]*n0.
//
// (*) The low half of n[0]*(t[0]*n0) is never computed.  Adding it to
// t[0] always yields zero in the low 64 bits, so only the carry is of
// interest, and that carry is set exactly when t[0] is non-zero.
// "subs xzr,x19,#1" produces the same carry flag.
//----------------------------------------------------------------------
.align	5
__bn_mul4x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
	// only from bn_mul_mont (or __bn_sqr8x_mont from bn_mul_mont) which has already signed the return address.
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	x26,sp,x5,lsl#3		// room for num words of t[]
	lsl	x5,x5,#3		// num is a byte count from here on
	ldr	x4,[x4]			// *n0
	sub	sp,x26,#8*4		// alloca, plus 4 words for t[0]*n0

	add	x10,x2,x5
	add	x27,x1,x5		// x27 = &a[num]
	stp	x0,x10,[x29,#96]	// offload rp and &b[num]

	ldr	x24,[x2,#8*0]		// b[0]
	ldp	x6,x7,[x1,#8*0]		// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x28,#0
	mov	x26,sp

	// First pass over b[0..3]: multiply by a[0..3] and reduce by
	// n[0..3], storing each t[0]*n0 at [x26] for the tail loop.
.Loop_mul4x_1st_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[0])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	sub	x10,x27,x1		// zero once ap has reached &a[num]
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_reduction

	cbz	x10,.Lmul4x4_post_condition	// num==4: all limbs are in registers

	ldp	x6,x7,[x1,#8*0]		// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldr	x25,[sp]		// first t[0]*n0 put aside above
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

	// Remaining limbs of the first pass: same b[0..3] and the stored
	// t[0]*n0 values, applied to the next four words of a[] and n[].
.Loop_mul4x_1st_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[i])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	adcs	x23,x23,x0
	umulh	x13,x17,x25
	adc	x0,xzr,xzr
	ldr	x25,[sp,x28]		// next t[0]*n0
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_tail

	sub	x11,x27,x5		// rewound ap (&a[0])
	cbz	x10,.Lmul4x_proceed

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_1st_tail

.align	5
.Lmul4x_proceed:
	ldr	x24,[x2,#8*4]!		// *++b
	adc	x30,x0,xzr		// top-most carry lives in x30 from now on
	ldp	x6,x7,[x11,#8*0]	// a[0..3]
	sub	x3,x3,x5		// rewind np
	ldp	x8,x9,[x11,#8*2]
	add	x1,x11,#8*4

	stp	x19,x20,[x26,#8*0]	// result!!!
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	stp	x21,x22,[x26,#8*2]	// result!!!
	ldp	x21,x22,[sp,#8*6]

	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	mov	x26,sp
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr

	// Subsequent passes: as the first pass, but accumulating onto
	// the t[] produced by the previous pass.
.align	4
.Loop_mul4x_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[i])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[i])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0)
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_reduction

	adc	x0,x0,xzr
	ldp	x10,x11,[x26,#8*4]	// t[4..7]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]		// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr

	ldr	x25,[sp]		// t[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.align	4
.Loop_mul4x_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[i])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	umulh	x13,x17,x25
	adcs	x23,x23,x0
	ldr	x25,[sp,x28]		// next t[0]*n0
	adc	x0,xzr,xzr
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_tail

	sub	x11,x3,x5		// rewound np (&n[0])
	adc	x0,x0,xzr
	cbz	x10,.Loop_mul4x_break

	ldp	x10,x11,[x26,#8*4]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_tail

.align	4
.Loop_mul4x_break:
	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
	adds	x19,x19,x30		// fold in the previous pass's top carry
	add	x2,x2,#8*4		// bp += 4 words
	adcs	x20,x20,xzr
	sub	x1,x1,x5		// rewind ap
	adcs	x21,x21,xzr
	stp	x19,x20,[x26,#8*0]	// result!!!
	adcs	x22,x22,xzr
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	adc	x30,x0,xzr
	stp	x21,x22,[x26,#8*2]	// result!!!
	cmp	x2,x13			// done yet?
	ldp	x21,x22,[sp,#8*6]
	ldp	x14,x15,[x11,#8*0]	// n[0..3]
	ldp	x16,x17,[x11,#8*2]
	add	x3,x11,#8*4
	b.eq	.Lmul4x_post

	ldr	x24,[x2]
	ldp	x6,x7,[x1,#8*0]		// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	adds	x1,x1,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x26,sp
	b	.Loop_mul4x_reduction

.align	4
.Lmul4x_post:
	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	mov	x0,x12
	mov	x27,x12			// x0 copy
	subs	x10,x19,x14
	add	x26,sp,#8*8
	sbcs	x11,x20,x15
	sub	x28,x5,#8*4

	// rp[] = t[] - n[], four words per iteration; the borrow chain
	// is carried in the flags across iterations.
.Lmul4x_sub:
	sbcs	x12,x21,x16
	ldp	x14,x15,[x3,#8*0]
	sub	x28,x28,#8*4
	ldp	x19,x20,[x26,#8*0]
	sbcs	x13,x22,x17
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	ldp	x21,x22,[x26,#8*2]
	add	x26,x26,#8*4
	stp	x10,x11,[x0,#8*0]
	sbcs	x10,x19,x14
	stp	x12,x13,[x0,#8*2]
	add	x0,x0,#8*4
	sbcs	x11,x20,x15
	cbnz	x28,.Lmul4x_sub

	sbcs	x12,x21,x16
	mov	x26,sp			// x26 = wipe pointer, start of scratch
	add	x1,sp,#8*4		// x1  = &t[0]
	ldp	x6,x7,[x27,#8*0]
	sbcs	x13,x22,x17
	stp	x10,x11,[x0,#8*0]
	ldp	x8,x9,[x27,#8*2]
	stp	x12,x13,[x0,#8*2]
	ldp	x19,x20,[x1,#8*0]
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr		// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	// With "lo" (carry clear, i.e. the subtraction borrowed) keep
	// the original t[] word, otherwise keep the t[]-n[] word that
	// is already in rp[].  The scratch area is zeroed along the
	// way, trailing the reads of t[].
	sub	x28,x5,#8*4
.Lmul4x_cond_copy:
	sub	x28,x28,#8*4
	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	ldp	x6,x7,[x27,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*2]
	add	x26,x26,#8*4
	csel	x13,x22,x9,lo
	ldp	x8,x9,[x27,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]
	add	x27,x27,#8*4
	cbnz	x28,.Lmul4x_cond_copy

	// Here x26 = sp + num*8 - 32, so eight scratch words remain
	// below x29.  Zero all eight: offsets #8*4 and #8*6 reach the
	// last two limbs of t[] (offsets #8*3/#8*4 would leave them on
	// the stack) and stop exactly at the saved-register frame.
	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	stp	xzr,xzr,[x26,#8*2]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*4]
	csel	x13,x22,x9,lo
	stp	xzr,xzr,[x26,#8*6]
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]

	b	.Lmul4x_done

.align	4
.Lmul4x4_post_condition:
	adc	x0,x0,xzr
	ldr	x1,[x29,#96]		// pull rp
	// x19-x22,x0 hold result, x14-x17 hold modulus
	subs	x6,x19,x14
	ldr	x30,[x29,#8]		// pull return address
	sbcs	x7,x20,x15
	stp	xzr,xzr,[sp,#8*0]	// zero the eight scratch words
	sbcs	x8,x21,x16
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x17
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,x0,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// x6-x9 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	stp	x8,x9,[x1,#8*2]

.Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29			// drop the scratch area
	ldp	x21,x22,[x29,#32]
	mov	x0,#1			// return value
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	// x30 loaded earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	__bn_mul4x_mont,.-__bn_mul4x_mont
.section	.rodata
// NUL-terminated ASCII: "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	4