#include "arm_arch.h"
#ifndef	__KERNEL__

.hidden	OPENSSL_armv8_rsa_neonized
#endif
.text

//----------------------------------------------------------------------
// bn_mul_mont -- Montgomery multiplication, AArch64 (GNU as syntax).
//
// Presumably matches OpenSSL's C prototype
//   int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
//                   const BN_ULONG *np, const BN_ULONG *n0, int num);
// (prototype is not visible here -- confirm against the C headers).
//
// In:   x0 = rp   result, num 64-bit words
//       x1 = ap   first operand
//       x2 = bp   second operand
//       x3 = np   modulus
//       x4 = n0   pointer; the word it points to is loaded as the
//                 Montgomery constant
//       x5 = num  word count
// Out:  x0 = 1    (generic path below)
//
// NOTE(review): x5 is consumed as a full 64-bit value (cmp/sub/lsl);
// this assumes the caller passes num with clean upper bits -- confirm.
// NOTE(review): the generic path uses num*8-16 as a down-counter that
// is only tested for zero, so it assumes num >= 2 -- confirm callers.
//
// Dispatch:
//   num % 4 != 0                     -> .Lmul_mont (generic, this function)
//   num  > 32, flag set (userland,
//              little-endian only)   -> bn_mul8x_mont_neon
//   num % 8 == 0                     -> __bn_sqr8x_mont
//   otherwise (num % 4 == 0)         -> __bn_mul4x_mont
// All three alternatives are entered by plain branch, i.e. they return
// directly to this function's caller.
//
// Generic path register roles:
//   x4       n0 value            x5       num*8 (bytes)
//   x9       bp[i]               x15      m1 = tp[0]*n0
//   x6,x7    ap[j]*bp[i] lo/hi accumulation
//   x10,x11  next ap[j]*bp[i] lo/hi
//   x12,x13  np[j]*m1 accumulation lo / carry word
//   x16,x17  next np[j]*m1 lo/hi
//   x19      top-most overflow bit
//   x20      outer counter i (bytes)   x21  inner counter j (bytes)
//   x22      tp write/read pointer     x23  tp[j]
// Stack: 64-byte register save area plus a 16-byte aligned scratch
// vector tp[] of num words, which is zeroed again before returning.
//----------------------------------------------------------------------
.globl	bn_mul_mont
.type	bn_mul_mont,%function
.align	5
bn_mul_mont:
	AARCH64_SIGN_LINK_REGISTER
.Lbn_mul_mont:
	tst	x5,#3
	b.ne	.Lmul_mont		// num not a multiple of 4: generic path
	cmp	x5,#32
	b.le	.Lscalar_impl		// num <= 32: never use the NEON path
#ifndef	__KERNEL__
#ifndef	__AARCH64EB__
	adrp	x17,OPENSSL_armv8_rsa_neonized
	ldr	w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized]
	cbnz	w17, bn_mul8x_mont_neon	// flag non-zero: NEON implementation
#endif
#endif

.Lscalar_impl:
	tst	x5,#7
	b.eq	__bn_sqr8x_mont		// num % 8 == 0
	tst	x5,#3
	b.eq	__bn_mul4x_mont		// num % 4 == 0 (always true when reached)

.Lmul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	x9,[x2],#8		// bp[0]
	sub	x22,sp,x5,lsl#3
	ldp	x7,x8,[x1],#16		// ap[0..1]
	lsl	x5,x5,#3		// num is in bytes from here on
	ldr	x4,[x4]			// *n0
	and	x22,x22,#-16		// ABI says so
	ldp	x13,x14,[x3],#16	// np[0..1]

	mul	x6,x7,x9		// ap[0]*bp[0]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	mul	x10,x8,x9		// ap[1]*bp[0]
	umulh	x11,x8,x9

	mul	x15,x6,x4		// "tp[0]"*n0
	mov	sp,x22			// alloca

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6	// discarded
	// (*)	The first multiplication and the addition that would
	//	follow it are omitted. The outcome of that first addition
	//	is guaranteed to be zero, which leaves only two
	//	computationally significant outcomes: it either carries
	//	or it does not. Following the operations shows that the
	//	condition for carry is simply x6 being non-zero, so the
	//	carry can be calculated by adding -1 to x6. That is what
	//	the next instruction does.
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	adc	x13,x13,xzr
	cbz	x21,.L1st_skip

	// First pass (i=0): tp[] is not read yet, only written.
.L1st:
	ldr	x8,[x1],#8
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	ldr	x14,[x3],#8
	adds	x12,x16,x13
	mul	x10,x8,x9		// ap[j]*bp[0]
	adc	x13,x17,xzr
	umulh	x11,x8,x9

	adds	x12,x12,x6
	mul	x16,x14,x15		// np[j]*m1
	adc	x13,x13,xzr
	umulh	x17,x14,x15
	str	x12,[x22],#8		// tp[j-1]
	cbnz	x21,.L1st

.L1st_skip:
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adc	x13,x17,xzr

	adds	x12,x12,x6
	sub	x20,x5,#8		// i=num-1
	adcs	x13,x13,x7

	adc	x19,xzr,xzr		// upmost overflow bit
	stp	x12,x13,[x22]

	// Remaining passes (i=1..num-1): accumulate into tp[].
.Louter:
	ldr	x9,[x2],#8		// bp[i]
	ldp	x7,x8,[x1],#16
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8

	mul	x6,x7,x9		// ap[0]*bp[i]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	ldp	x13,x14,[x3],#16
	mul	x10,x8,x9		// ap[1]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x15,x6,x4		// m1 = (ap[0]*bp[i]+tp[0])*n0
	sub	x20,x20,#8		// i--

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	cbz	x21,.Linner_skip

.Linner:
	ldr	x8,[x1],#8
	adc	x13,x13,xzr		// consumes carry from subs / previous adds
	ldr	x23,[x22],#8		// tp[j]
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	adds	x12,x16,x13
	ldr	x14,[x3],#8
	adc	x13,x17,xzr

	mul	x10,x8,x9		// ap[j]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x16,x14,x15		// np[j]*m1
	adds	x12,x12,x6
	umulh	x17,x14,x15
	stur	x12,[x22,#-16]		// tp[j-1]
	cbnz	x21,.Linner

.Linner_skip:
	ldr	x23,[x22],#8		// tp[j]
	adc	x13,x13,xzr
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adcs	x13,x17,x19		// fold in previous overflow bit
	adc	x19,xzr,xzr

	adds	x6,x6,x23
	adc	x7,x7,xzr

	adds	x12,x12,x6
	adcs	x13,x13,x7
	adc	x19,x19,xzr		// upmost overflow bit
	stp	x12,x13,[x22,#-16]

	cbnz	x20,.Louter

	// Final step. We see if the result is larger than the modulus,
	// and if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract the modulus, see if it borrowed,
	// and conditionally copy the original value.
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x14,[x3],#8		// np[0]
	subs	x21,x5,#8		// j=num-1 and clear borrow
	mov	x1,x0
.Lsub:
	sbcs	x8,x23,x14		// tp[j]-np[j]
	ldr	x23,[x22],#8
	sub	x21,x21,#8		// j--
	ldr	x14,[x3],#8
	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
	cbnz	x21,.Lsub

	sbcs	x8,x23,x14
	sbcs	x19,x19,xzr		// did it borrow?
	str	x8,[x1],#8		// rp[num-1]

	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x8,[x0],#8		// rp[0]
	sub	x5,x5,#8		// num--
	nop
	// Nothing in the loop below modifies the flags, so "lo" keeps
	// reflecting the borrow of the subtraction above: on borrow the
	// original tp[] word is kept, otherwise the already stored
	// difference is rewritten.
.Lcond_copy:
	sub	x5,x5,#8		// num--
	csel	x14,x23,x8,lo		// did it borrow?
	ldr	x23,[x22],#8
	ldr	x8,[x0],#8
	stur	xzr,[x22,#-16]		// wipe tp
	stur	x14,[x0,#-16]
	cbnz	x5,.Lcond_copy

	csel	x14,x23,x8,lo
	stur	xzr,[x22,#-8]		// wipe tp
	stur	x14,[x0,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29			// release tp[]
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64		// x30 was never written on this path
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	bn_mul_mont,.-bn_mul_mont
.type	bn_mul8x_mont_neon,%function
.align	5
bn_mul8x_mont_neon:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because bn_mul8x_mont_neon is jumped to
	// only from bn_mul_mont which has already signed the return address.
	stp	x29,x30,[sp,#-80]!
234 1.1 christos mov x16,sp 235 1.1 christos stp d8,d9,[sp,#16] 236 1.1 christos stp d10,d11,[sp,#32] 237 1.1 christos stp d12,d13,[sp,#48] 238 1.1 christos stp d14,d15,[sp,#64] 239 1.1 christos lsl x5,x5,#1 240 1.1 christos eor v14.16b,v14.16b,v14.16b 241 1.1 christos 242 1.1 christos .align 4 243 1.1 christos .LNEON_8n: 244 1.1 christos eor v6.16b,v6.16b,v6.16b 245 1.1 christos sub x7,sp,#128 246 1.1 christos eor v7.16b,v7.16b,v7.16b 247 1.1 christos sub x7,x7,x5,lsl#4 248 1.1 christos eor v8.16b,v8.16b,v8.16b 249 1.1 christos and x7,x7,#-64 250 1.1 christos eor v9.16b,v9.16b,v9.16b 251 1.1 christos mov sp,x7 // alloca 252 1.1 christos eor v10.16b,v10.16b,v10.16b 253 1.1 christos add x7,x7,#256 254 1.1 christos eor v11.16b,v11.16b,v11.16b 255 1.1 christos sub x8,x5,#8 256 1.1 christos eor v12.16b,v12.16b,v12.16b 257 1.1 christos eor v13.16b,v13.16b,v13.16b 258 1.1 christos 259 1.1 christos .LNEON_8n_init: 260 1.1 christos st1 {v6.2d,v7.2d},[x7],#32 261 1.1 christos subs x8,x8,#8 262 1.1 christos st1 {v8.2d,v9.2d},[x7],#32 263 1.1 christos st1 {v10.2d,v11.2d},[x7],#32 264 1.1 christos st1 {v12.2d,v13.2d},[x7],#32 265 1.1 christos bne .LNEON_8n_init 266 1.1 christos 267 1.1 christos add x6,sp,#256 268 1.1 christos ld1 {v0.4s,v1.4s},[x1],#32 269 1.1 christos add x10,sp,#8 270 1.1 christos ldr s30,[x4],#4 271 1.1 christos mov x9,x5 272 1.1 christos b .LNEON_8n_outer 273 1.1 christos 274 1.1 christos .align 4 275 1.1 christos .LNEON_8n_outer: 276 1.1 christos ldr s28,[x2],#4 // *b++ 277 1.1 christos uxtl v28.4s,v28.4h 278 1.1 christos add x7,sp,#128 279 1.1 christos ld1 {v2.4s,v3.4s},[x3],#32 280 1.1 christos 281 1.1 christos umlal v6.2d,v28.2s,v0.s[0] 282 1.1 christos umlal v7.2d,v28.2s,v0.s[1] 283 1.1 christos umlal v8.2d,v28.2s,v0.s[2] 284 1.1 christos shl v29.2d,v6.2d,#16 285 1.1 christos ext v29.16b,v29.16b,v29.16b,#8 286 1.1 christos umlal v9.2d,v28.2s,v0.s[3] 287 1.1 christos add v29.2d,v29.2d,v6.2d 288 1.1 christos umlal v10.2d,v28.2s,v1.s[0] 289 1.1 christos 
mul v29.2s,v29.2s,v30.2s 290 1.1 christos umlal v11.2d,v28.2s,v1.s[1] 291 1.1 christos st1 {v28.2s},[sp] // put aside smashed b[8*i+0] 292 1.1 christos umlal v12.2d,v28.2s,v1.s[2] 293 1.1 christos uxtl v29.4s,v29.4h 294 1.1 christos umlal v13.2d,v28.2s,v1.s[3] 295 1.1 christos ldr s28,[x2],#4 // *b++ 296 1.1 christos umlal v6.2d,v29.2s,v2.s[0] 297 1.1 christos umlal v7.2d,v29.2s,v2.s[1] 298 1.1 christos uxtl v28.4s,v28.4h 299 1.1 christos umlal v8.2d,v29.2s,v2.s[2] 300 1.1 christos ushr v15.2d,v6.2d,#16 301 1.1 christos umlal v9.2d,v29.2s,v2.s[3] 302 1.1 christos umlal v10.2d,v29.2s,v3.s[0] 303 1.1 christos ext v6.16b,v6.16b,v6.16b,#8 304 1.1 christos add v6.2d,v6.2d,v15.2d 305 1.1 christos umlal v11.2d,v29.2s,v3.s[1] 306 1.1 christos ushr v6.2d,v6.2d,#16 307 1.1 christos umlal v12.2d,v29.2s,v3.s[2] 308 1.1 christos umlal v13.2d,v29.2s,v3.s[3] 309 1.1 christos add v16.2d,v7.2d,v6.2d 310 1.1 christos ins v7.d[0],v16.d[0] 311 1.1 christos st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+0] 312 1.1 christos umlal v7.2d,v28.2s,v0.s[0] 313 1.1 christos ld1 {v6.2d},[x6],#16 314 1.1 christos umlal v8.2d,v28.2s,v0.s[1] 315 1.1 christos umlal v9.2d,v28.2s,v0.s[2] 316 1.1 christos shl v29.2d,v7.2d,#16 317 1.1 christos ext v29.16b,v29.16b,v29.16b,#8 318 1.1 christos umlal v10.2d,v28.2s,v0.s[3] 319 1.1 christos add v29.2d,v29.2d,v7.2d 320 1.1 christos umlal v11.2d,v28.2s,v1.s[0] 321 1.1 christos mul v29.2s,v29.2s,v30.2s 322 1.1 christos umlal v12.2d,v28.2s,v1.s[1] 323 1.1 christos st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+1] 324 1.1 christos umlal v13.2d,v28.2s,v1.s[2] 325 1.1 christos uxtl v29.4s,v29.4h 326 1.1 christos umlal v6.2d,v28.2s,v1.s[3] 327 1.1 christos ldr s28,[x2],#4 // *b++ 328 1.1 christos umlal v7.2d,v29.2s,v2.s[0] 329 1.1 christos umlal v8.2d,v29.2s,v2.s[1] 330 1.1 christos uxtl v28.4s,v28.4h 331 1.1 christos umlal v9.2d,v29.2s,v2.s[2] 332 1.1 christos ushr v15.2d,v7.2d,#16 333 1.1 christos umlal v10.2d,v29.2s,v2.s[3] 334 1.1 christos umlal 
v11.2d,v29.2s,v3.s[0] 335 1.1 christos ext v7.16b,v7.16b,v7.16b,#8 336 1.1 christos add v7.2d,v7.2d,v15.2d 337 1.1 christos umlal v12.2d,v29.2s,v3.s[1] 338 1.1 christos ushr v7.2d,v7.2d,#16 339 1.1 christos umlal v13.2d,v29.2s,v3.s[2] 340 1.1 christos umlal v6.2d,v29.2s,v3.s[3] 341 1.1 christos add v16.2d,v8.2d,v7.2d 342 1.1 christos ins v8.d[0],v16.d[0] 343 1.1 christos st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+1] 344 1.1 christos umlal v8.2d,v28.2s,v0.s[0] 345 1.1 christos ld1 {v7.2d},[x6],#16 346 1.1 christos umlal v9.2d,v28.2s,v0.s[1] 347 1.1 christos umlal v10.2d,v28.2s,v0.s[2] 348 1.1 christos shl v29.2d,v8.2d,#16 349 1.1 christos ext v29.16b,v29.16b,v29.16b,#8 350 1.1 christos umlal v11.2d,v28.2s,v0.s[3] 351 1.1 christos add v29.2d,v29.2d,v8.2d 352 1.1 christos umlal v12.2d,v28.2s,v1.s[0] 353 1.1 christos mul v29.2s,v29.2s,v30.2s 354 1.1 christos umlal v13.2d,v28.2s,v1.s[1] 355 1.1 christos st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+2] 356 1.1 christos umlal v6.2d,v28.2s,v1.s[2] 357 1.1 christos uxtl v29.4s,v29.4h 358 1.1 christos umlal v7.2d,v28.2s,v1.s[3] 359 1.1 christos ldr s28,[x2],#4 // *b++ 360 1.1 christos umlal v8.2d,v29.2s,v2.s[0] 361 1.1 christos umlal v9.2d,v29.2s,v2.s[1] 362 1.1 christos uxtl v28.4s,v28.4h 363 1.1 christos umlal v10.2d,v29.2s,v2.s[2] 364 1.1 christos ushr v15.2d,v8.2d,#16 365 1.1 christos umlal v11.2d,v29.2s,v2.s[3] 366 1.1 christos umlal v12.2d,v29.2s,v3.s[0] 367 1.1 christos ext v8.16b,v8.16b,v8.16b,#8 368 1.1 christos add v8.2d,v8.2d,v15.2d 369 1.1 christos umlal v13.2d,v29.2s,v3.s[1] 370 1.1 christos ushr v8.2d,v8.2d,#16 371 1.1 christos umlal v6.2d,v29.2s,v3.s[2] 372 1.1 christos umlal v7.2d,v29.2s,v3.s[3] 373 1.1 christos add v16.2d,v9.2d,v8.2d 374 1.1 christos ins v9.d[0],v16.d[0] 375 1.1 christos st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+2] 376 1.1 christos umlal v9.2d,v28.2s,v0.s[0] 377 1.1 christos ld1 {v8.2d},[x6],#16 378 1.1 christos umlal v10.2d,v28.2s,v0.s[1] 379 1.1 christos umlal 
v11.2d,v28.2s,v0.s[2] 380 1.1 christos shl v29.2d,v9.2d,#16 381 1.1 christos ext v29.16b,v29.16b,v29.16b,#8 382 1.1 christos umlal v12.2d,v28.2s,v0.s[3] 383 1.1 christos add v29.2d,v29.2d,v9.2d 384 1.1 christos umlal v13.2d,v28.2s,v1.s[0] 385 1.1 christos mul v29.2s,v29.2s,v30.2s 386 1.1 christos umlal v6.2d,v28.2s,v1.s[1] 387 1.1 christos st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+3] 388 1.1 christos umlal v7.2d,v28.2s,v1.s[2] 389 1.1 christos uxtl v29.4s,v29.4h 390 1.1 christos umlal v8.2d,v28.2s,v1.s[3] 391 1.1 christos ldr s28,[x2],#4 // *b++ 392 1.1 christos umlal v9.2d,v29.2s,v2.s[0] 393 1.1 christos umlal v10.2d,v29.2s,v2.s[1] 394 1.1 christos uxtl v28.4s,v28.4h 395 1.1 christos umlal v11.2d,v29.2s,v2.s[2] 396 1.1 christos ushr v15.2d,v9.2d,#16 397 1.1 christos umlal v12.2d,v29.2s,v2.s[3] 398 1.1 christos umlal v13.2d,v29.2s,v3.s[0] 399 1.1 christos ext v9.16b,v9.16b,v9.16b,#8 400 1.1 christos add v9.2d,v9.2d,v15.2d 401 1.1 christos umlal v6.2d,v29.2s,v3.s[1] 402 1.1 christos ushr v9.2d,v9.2d,#16 403 1.1 christos umlal v7.2d,v29.2s,v3.s[2] 404 1.1 christos umlal v8.2d,v29.2s,v3.s[3] 405 1.1 christos add v16.2d,v10.2d,v9.2d 406 1.1 christos ins v10.d[0],v16.d[0] 407 1.1 christos st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+3] 408 1.1 christos umlal v10.2d,v28.2s,v0.s[0] 409 1.1 christos ld1 {v9.2d},[x6],#16 410 1.1 christos umlal v11.2d,v28.2s,v0.s[1] 411 1.1 christos umlal v12.2d,v28.2s,v0.s[2] 412 1.1 christos shl v29.2d,v10.2d,#16 413 1.1 christos ext v29.16b,v29.16b,v29.16b,#8 414 1.1 christos umlal v13.2d,v28.2s,v0.s[3] 415 1.1 christos add v29.2d,v29.2d,v10.2d 416 1.1 christos umlal v6.2d,v28.2s,v1.s[0] 417 1.1 christos mul v29.2s,v29.2s,v30.2s 418 1.1 christos umlal v7.2d,v28.2s,v1.s[1] 419 1.1 christos st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+4] 420 1.1 christos umlal v8.2d,v28.2s,v1.s[2] 421 1.1 christos uxtl v29.4s,v29.4h 422 1.1 christos umlal v9.2d,v28.2s,v1.s[3] 423 1.1 christos ldr s28,[x2],#4 // *b++ 424 1.1 christos umlal 
v10.2d,v29.2s,v2.s[0] 425 1.1 christos umlal v11.2d,v29.2s,v2.s[1] 426 1.1 christos uxtl v28.4s,v28.4h 427 1.1 christos umlal v12.2d,v29.2s,v2.s[2] 428 1.1 christos ushr v15.2d,v10.2d,#16 429 1.1 christos umlal v13.2d,v29.2s,v2.s[3] 430 1.1 christos umlal v6.2d,v29.2s,v3.s[0] 431 1.1 christos ext v10.16b,v10.16b,v10.16b,#8 432 1.1 christos add v10.2d,v10.2d,v15.2d 433 1.1 christos umlal v7.2d,v29.2s,v3.s[1] 434 1.1 christos ushr v10.2d,v10.2d,#16 435 1.1 christos umlal v8.2d,v29.2s,v3.s[2] 436 1.1 christos umlal v9.2d,v29.2s,v3.s[3] 437 1.1 christos add v16.2d,v11.2d,v10.2d 438 1.1 christos ins v11.d[0],v16.d[0] 439 1.1 christos st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+4] 440 1.1 christos umlal v11.2d,v28.2s,v0.s[0] 441 1.1 christos ld1 {v10.2d},[x6],#16 442 1.1 christos umlal v12.2d,v28.2s,v0.s[1] 443 1.1 christos umlal v13.2d,v28.2s,v0.s[2] 444 1.1 christos shl v29.2d,v11.2d,#16 445 1.1 christos ext v29.16b,v29.16b,v29.16b,#8 446 1.1 christos umlal v6.2d,v28.2s,v0.s[3] 447 1.1 christos add v29.2d,v29.2d,v11.2d 448 1.1 christos umlal v7.2d,v28.2s,v1.s[0] 449 1.1 christos mul v29.2s,v29.2s,v30.2s 450 1.1 christos umlal v8.2d,v28.2s,v1.s[1] 451 1.1 christos st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+5] 452 1.1 christos umlal v9.2d,v28.2s,v1.s[2] 453 1.1 christos uxtl v29.4s,v29.4h 454 1.1 christos umlal v10.2d,v28.2s,v1.s[3] 455 1.1 christos ldr s28,[x2],#4 // *b++ 456 1.1 christos umlal v11.2d,v29.2s,v2.s[0] 457 1.1 christos umlal v12.2d,v29.2s,v2.s[1] 458 1.1 christos uxtl v28.4s,v28.4h 459 1.1 christos umlal v13.2d,v29.2s,v2.s[2] 460 1.1 christos ushr v15.2d,v11.2d,#16 461 1.1 christos umlal v6.2d,v29.2s,v2.s[3] 462 1.1 christos umlal v7.2d,v29.2s,v3.s[0] 463 1.1 christos ext v11.16b,v11.16b,v11.16b,#8 464 1.1 christos add v11.2d,v11.2d,v15.2d 465 1.1 christos umlal v8.2d,v29.2s,v3.s[1] 466 1.1 christos ushr v11.2d,v11.2d,#16 467 1.1 christos umlal v9.2d,v29.2s,v3.s[2] 468 1.1 christos umlal v10.2d,v29.2s,v3.s[3] 469 1.1 christos add 
v16.2d,v12.2d,v11.2d 470 1.1 christos ins v12.d[0],v16.d[0] 471 1.1 christos st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+5] 472 1.1 christos umlal v12.2d,v28.2s,v0.s[0] 473 1.1 christos ld1 {v11.2d},[x6],#16 474 1.1 christos umlal v13.2d,v28.2s,v0.s[1] 475 1.1 christos umlal v6.2d,v28.2s,v0.s[2] 476 1.1 christos shl v29.2d,v12.2d,#16 477 1.1 christos ext v29.16b,v29.16b,v29.16b,#8 478 1.1 christos umlal v7.2d,v28.2s,v0.s[3] 479 1.1 christos add v29.2d,v29.2d,v12.2d 480 1.1 christos umlal v8.2d,v28.2s,v1.s[0] 481 1.1 christos mul v29.2s,v29.2s,v30.2s 482 1.1 christos umlal v9.2d,v28.2s,v1.s[1] 483 1.1 christos st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+6] 484 1.1 christos umlal v10.2d,v28.2s,v1.s[2] 485 1.1 christos uxtl v29.4s,v29.4h 486 1.1 christos umlal v11.2d,v28.2s,v1.s[3] 487 1.1 christos ldr s28,[x2],#4 // *b++ 488 1.1 christos umlal v12.2d,v29.2s,v2.s[0] 489 1.1 christos umlal v13.2d,v29.2s,v2.s[1] 490 1.1 christos uxtl v28.4s,v28.4h 491 1.1 christos umlal v6.2d,v29.2s,v2.s[2] 492 1.1 christos ushr v15.2d,v12.2d,#16 493 1.1 christos umlal v7.2d,v29.2s,v2.s[3] 494 1.1 christos umlal v8.2d,v29.2s,v3.s[0] 495 1.1 christos ext v12.16b,v12.16b,v12.16b,#8 496 1.1 christos add v12.2d,v12.2d,v15.2d 497 1.1 christos umlal v9.2d,v29.2s,v3.s[1] 498 1.1 christos ushr v12.2d,v12.2d,#16 499 1.1 christos umlal v10.2d,v29.2s,v3.s[2] 500 1.1 christos umlal v11.2d,v29.2s,v3.s[3] 501 1.1 christos add v16.2d,v13.2d,v12.2d 502 1.1 christos ins v13.d[0],v16.d[0] 503 1.1 christos st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+6] 504 1.1 christos umlal v13.2d,v28.2s,v0.s[0] 505 1.1 christos ld1 {v12.2d},[x6],#16 506 1.1 christos umlal v6.2d,v28.2s,v0.s[1] 507 1.1 christos umlal v7.2d,v28.2s,v0.s[2] 508 1.1 christos shl v29.2d,v13.2d,#16 509 1.1 christos ext v29.16b,v29.16b,v29.16b,#8 510 1.1 christos umlal v8.2d,v28.2s,v0.s[3] 511 1.1 christos add v29.2d,v29.2d,v13.2d 512 1.1 christos umlal v9.2d,v28.2s,v1.s[0] 513 1.1 christos mul v29.2s,v29.2s,v30.2s 514 1.1 
christos umlal v10.2d,v28.2s,v1.s[1] 515 1.1 christos st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+7] 516 1.1 christos umlal v11.2d,v28.2s,v1.s[2] 517 1.1 christos uxtl v29.4s,v29.4h 518 1.1 christos umlal v12.2d,v28.2s,v1.s[3] 519 1.1 christos ld1 {v28.2s},[sp] // pull smashed b[8*i+0] 520 1.1 christos umlal v13.2d,v29.2s,v2.s[0] 521 1.1 christos ld1 {v0.4s,v1.4s},[x1],#32 522 1.1 christos umlal v6.2d,v29.2s,v2.s[1] 523 1.1 christos umlal v7.2d,v29.2s,v2.s[2] 524 1.1 christos mov v5.16b,v13.16b 525 1.1 christos ushr v5.2d,v5.2d,#16 526 1.1 christos ext v13.16b,v13.16b,v13.16b,#8 527 1.1 christos umlal v8.2d,v29.2s,v2.s[3] 528 1.1 christos umlal v9.2d,v29.2s,v3.s[0] 529 1.1 christos add v13.2d,v13.2d,v5.2d 530 1.1 christos umlal v10.2d,v29.2s,v3.s[1] 531 1.1 christos ushr v13.2d,v13.2d,#16 532 1.1 christos eor v15.16b,v15.16b,v15.16b 533 1.1 christos ins v13.d[1],v15.d[0] 534 1.1 christos umlal v11.2d,v29.2s,v3.s[2] 535 1.1 christos umlal v12.2d,v29.2s,v3.s[3] 536 1.1 christos add v6.2d,v6.2d,v13.2d 537 1.1 christos st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+7] 538 1.1 christos add x10,sp,#8 // rewind 539 1.1 christos sub x8,x5,#8 540 1.1 christos b .LNEON_8n_inner 541 1.1 christos 542 1.1 christos .align 4 543 1.1 christos .LNEON_8n_inner: 544 1.1 christos subs x8,x8,#8 545 1.1 christos umlal v6.2d,v28.2s,v0.s[0] 546 1.1 christos ld1 {v13.2d},[x6] 547 1.1 christos umlal v7.2d,v28.2s,v0.s[1] 548 1.1 christos ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+0] 549 1.1 christos umlal v8.2d,v28.2s,v0.s[2] 550 1.1 christos ld1 {v2.4s,v3.4s},[x3],#32 551 1.1 christos umlal v9.2d,v28.2s,v0.s[3] 552 1.1 christos b.eq .LInner_jump 553 1.1 christos add x6,x6,#16 // don't advance in last iteration 554 1.1 christos .LInner_jump: 555 1.1 christos umlal v10.2d,v28.2s,v1.s[0] 556 1.1 christos umlal v11.2d,v28.2s,v1.s[1] 557 1.1 christos umlal v12.2d,v28.2s,v1.s[2] 558 1.1 christos umlal v13.2d,v28.2s,v1.s[3] 559 1.1 christos ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+1] 
560 1.1 christos umlal v6.2d,v29.2s,v2.s[0] 561 1.1 christos umlal v7.2d,v29.2s,v2.s[1] 562 1.1 christos umlal v8.2d,v29.2s,v2.s[2] 563 1.1 christos umlal v9.2d,v29.2s,v2.s[3] 564 1.1 christos umlal v10.2d,v29.2s,v3.s[0] 565 1.1 christos umlal v11.2d,v29.2s,v3.s[1] 566 1.1 christos umlal v12.2d,v29.2s,v3.s[2] 567 1.1 christos umlal v13.2d,v29.2s,v3.s[3] 568 1.1 christos st1 {v6.2d},[x7],#16 569 1.1 christos umlal v7.2d,v28.2s,v0.s[0] 570 1.1 christos ld1 {v6.2d},[x6] 571 1.1 christos umlal v8.2d,v28.2s,v0.s[1] 572 1.1 christos ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+1] 573 1.1 christos umlal v9.2d,v28.2s,v0.s[2] 574 1.1 christos b.eq .LInner_jump1 575 1.1 christos add x6,x6,#16 // don't advance in last iteration 576 1.1 christos .LInner_jump1: 577 1.1 christos umlal v10.2d,v28.2s,v0.s[3] 578 1.1 christos umlal v11.2d,v28.2s,v1.s[0] 579 1.1 christos umlal v12.2d,v28.2s,v1.s[1] 580 1.1 christos umlal v13.2d,v28.2s,v1.s[2] 581 1.1 christos umlal v6.2d,v28.2s,v1.s[3] 582 1.1 christos ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+2] 583 1.1 christos umlal v7.2d,v29.2s,v2.s[0] 584 1.1 christos umlal v8.2d,v29.2s,v2.s[1] 585 1.1 christos umlal v9.2d,v29.2s,v2.s[2] 586 1.1 christos umlal v10.2d,v29.2s,v2.s[3] 587 1.1 christos umlal v11.2d,v29.2s,v3.s[0] 588 1.1 christos umlal v12.2d,v29.2s,v3.s[1] 589 1.1 christos umlal v13.2d,v29.2s,v3.s[2] 590 1.1 christos umlal v6.2d,v29.2s,v3.s[3] 591 1.1 christos st1 {v7.2d},[x7],#16 592 1.1 christos umlal v8.2d,v28.2s,v0.s[0] 593 1.1 christos ld1 {v7.2d},[x6] 594 1.1 christos umlal v9.2d,v28.2s,v0.s[1] 595 1.1 christos ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+2] 596 1.1 christos umlal v10.2d,v28.2s,v0.s[2] 597 1.1 christos b.eq .LInner_jump2 598 1.1 christos add x6,x6,#16 // don't advance in last iteration 599 1.1 christos .LInner_jump2: 600 1.1 christos umlal v11.2d,v28.2s,v0.s[3] 601 1.1 christos umlal v12.2d,v28.2s,v1.s[0] 602 1.1 christos umlal v13.2d,v28.2s,v1.s[1] 603 1.1 christos umlal v6.2d,v28.2s,v1.s[2] 604 1.1 
christos umlal v7.2d,v28.2s,v1.s[3] 605 1.1 christos ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+3] 606 1.1 christos umlal v8.2d,v29.2s,v2.s[0] 607 1.1 christos umlal v9.2d,v29.2s,v2.s[1] 608 1.1 christos umlal v10.2d,v29.2s,v2.s[2] 609 1.1 christos umlal v11.2d,v29.2s,v2.s[3] 610 1.1 christos umlal v12.2d,v29.2s,v3.s[0] 611 1.1 christos umlal v13.2d,v29.2s,v3.s[1] 612 1.1 christos umlal v6.2d,v29.2s,v3.s[2] 613 1.1 christos umlal v7.2d,v29.2s,v3.s[3] 614 1.1 christos st1 {v8.2d},[x7],#16 615 1.1 christos umlal v9.2d,v28.2s,v0.s[0] 616 1.1 christos ld1 {v8.2d},[x6] 617 1.1 christos umlal v10.2d,v28.2s,v0.s[1] 618 1.1 christos ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+3] 619 1.1 christos umlal v11.2d,v28.2s,v0.s[2] 620 1.1 christos b.eq .LInner_jump3 621 1.1 christos add x6,x6,#16 // don't advance in last iteration 622 1.1 christos .LInner_jump3: 623 1.1 christos umlal v12.2d,v28.2s,v0.s[3] 624 1.1 christos umlal v13.2d,v28.2s,v1.s[0] 625 1.1 christos umlal v6.2d,v28.2s,v1.s[1] 626 1.1 christos umlal v7.2d,v28.2s,v1.s[2] 627 1.1 christos umlal v8.2d,v28.2s,v1.s[3] 628 1.1 christos ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+4] 629 1.1 christos umlal v9.2d,v29.2s,v2.s[0] 630 1.1 christos umlal v10.2d,v29.2s,v2.s[1] 631 1.1 christos umlal v11.2d,v29.2s,v2.s[2] 632 1.1 christos umlal v12.2d,v29.2s,v2.s[3] 633 1.1 christos umlal v13.2d,v29.2s,v3.s[0] 634 1.1 christos umlal v6.2d,v29.2s,v3.s[1] 635 1.1 christos umlal v7.2d,v29.2s,v3.s[2] 636 1.1 christos umlal v8.2d,v29.2s,v3.s[3] 637 1.1 christos st1 {v9.2d},[x7],#16 638 1.1 christos umlal v10.2d,v28.2s,v0.s[0] 639 1.1 christos ld1 {v9.2d},[x6] 640 1.1 christos umlal v11.2d,v28.2s,v0.s[1] 641 1.1 christos ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+4] 642 1.1 christos umlal v12.2d,v28.2s,v0.s[2] 643 1.1 christos b.eq .LInner_jump4 644 1.1 christos add x6,x6,#16 // don't advance in last iteration 645 1.1 christos .LInner_jump4: 646 1.1 christos umlal v13.2d,v28.2s,v0.s[3] 647 1.1 christos umlal v6.2d,v28.2s,v1.s[0] 648 
1.1 christos umlal v7.2d,v28.2s,v1.s[1] 649 1.1 christos umlal v8.2d,v28.2s,v1.s[2] 650 1.1 christos umlal v9.2d,v28.2s,v1.s[3] 651 1.1 christos ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+5] 652 1.1 christos umlal v10.2d,v29.2s,v2.s[0] 653 1.1 christos umlal v11.2d,v29.2s,v2.s[1] 654 1.1 christos umlal v12.2d,v29.2s,v2.s[2] 655 1.1 christos umlal v13.2d,v29.2s,v2.s[3] 656 1.1 christos umlal v6.2d,v29.2s,v3.s[0] 657 1.1 christos umlal v7.2d,v29.2s,v3.s[1] 658 1.1 christos umlal v8.2d,v29.2s,v3.s[2] 659 1.1 christos umlal v9.2d,v29.2s,v3.s[3] 660 1.1 christos st1 {v10.2d},[x7],#16 661 1.1 christos umlal v11.2d,v28.2s,v0.s[0] 662 1.1 christos ld1 {v10.2d},[x6] 663 1.1 christos umlal v12.2d,v28.2s,v0.s[1] 664 1.1 christos ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+5] 665 1.1 christos umlal v13.2d,v28.2s,v0.s[2] 666 1.1 christos b.eq .LInner_jump5 667 1.1 christos add x6,x6,#16 // don't advance in last iteration 668 1.1 christos .LInner_jump5: 669 1.1 christos umlal v6.2d,v28.2s,v0.s[3] 670 1.1 christos umlal v7.2d,v28.2s,v1.s[0] 671 1.1 christos umlal v8.2d,v28.2s,v1.s[1] 672 1.1 christos umlal v9.2d,v28.2s,v1.s[2] 673 1.1 christos umlal v10.2d,v28.2s,v1.s[3] 674 1.1 christos ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+6] 675 1.1 christos umlal v11.2d,v29.2s,v2.s[0] 676 1.1 christos umlal v12.2d,v29.2s,v2.s[1] 677 1.1 christos umlal v13.2d,v29.2s,v2.s[2] 678 1.1 christos umlal v6.2d,v29.2s,v2.s[3] 679 1.1 christos umlal v7.2d,v29.2s,v3.s[0] 680 1.1 christos umlal v8.2d,v29.2s,v3.s[1] 681 1.1 christos umlal v9.2d,v29.2s,v3.s[2] 682 1.1 christos umlal v10.2d,v29.2s,v3.s[3] 683 1.1 christos st1 {v11.2d},[x7],#16 684 1.1 christos umlal v12.2d,v28.2s,v0.s[0] 685 1.1 christos ld1 {v11.2d},[x6] 686 1.1 christos umlal v13.2d,v28.2s,v0.s[1] 687 1.1 christos ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+6] 688 1.1 christos umlal v6.2d,v28.2s,v0.s[2] 689 1.1 christos b.eq .LInner_jump6 690 1.1 christos add x6,x6,#16 // don't advance in last iteration 691 1.1 christos 
.LInner_jump6: 692 1.1 christos umlal v7.2d,v28.2s,v0.s[3] 693 1.1 christos umlal v8.2d,v28.2s,v1.s[0] 694 1.1 christos umlal v9.2d,v28.2s,v1.s[1] 695 1.1 christos umlal v10.2d,v28.2s,v1.s[2] 696 1.1 christos umlal v11.2d,v28.2s,v1.s[3] 697 1.1 christos ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+7] 698 1.1 christos umlal v12.2d,v29.2s,v2.s[0] 699 1.1 christos umlal v13.2d,v29.2s,v2.s[1] 700 1.1 christos umlal v6.2d,v29.2s,v2.s[2] 701 1.1 christos umlal v7.2d,v29.2s,v2.s[3] 702 1.1 christos umlal v8.2d,v29.2s,v3.s[0] 703 1.1 christos umlal v9.2d,v29.2s,v3.s[1] 704 1.1 christos umlal v10.2d,v29.2s,v3.s[2] 705 1.1 christos umlal v11.2d,v29.2s,v3.s[3] 706 1.1 christos st1 {v12.2d},[x7],#16 707 1.1 christos umlal v13.2d,v28.2s,v0.s[0] 708 1.1 christos ld1 {v12.2d},[x6] 709 1.1 christos umlal v6.2d,v28.2s,v0.s[1] 710 1.1 christos ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+7] 711 1.1 christos umlal v7.2d,v28.2s,v0.s[2] 712 1.1 christos b.eq .LInner_jump7 713 1.1 christos add x6,x6,#16 // don't advance in last iteration 714 1.1 christos .LInner_jump7: 715 1.1 christos umlal v8.2d,v28.2s,v0.s[3] 716 1.1 christos umlal v9.2d,v28.2s,v1.s[0] 717 1.1 christos umlal v10.2d,v28.2s,v1.s[1] 718 1.1 christos umlal v11.2d,v28.2s,v1.s[2] 719 1.1 christos umlal v12.2d,v28.2s,v1.s[3] 720 1.1 christos b.ne .LInner_after_rewind8 721 1.1 christos sub x1,x1,x5,lsl#2 // rewind 722 1.1 christos .LInner_after_rewind8: 723 1.1 christos umlal v13.2d,v29.2s,v2.s[0] 724 1.1 christos ld1 {v28.2s},[sp] // pull smashed b[8*i+0] 725 1.1 christos umlal v6.2d,v29.2s,v2.s[1] 726 1.1 christos ld1 {v0.4s,v1.4s},[x1],#32 727 1.1 christos umlal v7.2d,v29.2s,v2.s[2] 728 1.1 christos add x10,sp,#8 // rewind 729 1.1 christos umlal v8.2d,v29.2s,v2.s[3] 730 1.1 christos umlal v9.2d,v29.2s,v3.s[0] 731 1.1 christos umlal v10.2d,v29.2s,v3.s[1] 732 1.1 christos umlal v11.2d,v29.2s,v3.s[2] 733 1.1 christos st1 {v13.2d},[x7],#16 734 1.1 christos umlal v12.2d,v29.2s,v3.s[3] 735 1.1 christos 736 1.1 christos bne 
.LNEON_8n_inner 737 1.1 christos add x6,sp,#128 738 1.1 christos st1 {v6.2d,v7.2d},[x7],#32 739 1.1 christos eor v2.16b,v2.16b,v2.16b // v2 740 1.1 christos st1 {v8.2d,v9.2d},[x7],#32 741 1.1 christos eor v3.16b,v3.16b,v3.16b // v3 742 1.1 christos st1 {v10.2d,v11.2d},[x7],#32 743 1.1 christos st1 {v12.2d},[x7] 744 1.1 christos 745 1.1 christos subs x9,x9,#8 746 1.1 christos ld1 {v6.2d,v7.2d},[x6],#32 747 1.1 christos ld1 {v8.2d,v9.2d},[x6],#32 748 1.1 christos ld1 {v10.2d,v11.2d},[x6],#32 749 1.1 christos ld1 {v12.2d,v13.2d},[x6],#32 750 1.1 christos 751 1.1 christos b.eq .LInner_8n_jump_2steps 752 1.1 christos sub x3,x3,x5,lsl#2 // rewind 753 1.1 christos b .LNEON_8n_outer 754 1.1 christos 755 1.1 christos .LInner_8n_jump_2steps: 756 1.1 christos add x7,sp,#128 757 1.1 christos st1 {v2.2d,v3.2d}, [sp],#32 // start wiping stack frame 758 1.1 christos mov v5.16b,v6.16b 759 1.1 christos ushr v15.2d,v6.2d,#16 760 1.1 christos ext v6.16b,v6.16b,v6.16b,#8 761 1.1 christos st1 {v2.2d,v3.2d}, [sp],#32 762 1.1 christos add v6.2d,v6.2d,v15.2d 763 1.1 christos st1 {v2.2d,v3.2d}, [sp],#32 764 1.1 christos ushr v15.2d,v6.2d,#16 765 1.1 christos st1 {v2.2d,v3.2d}, [sp],#32 766 1.1 christos zip1 v6.4h,v5.4h,v6.4h 767 1.1 christos ins v15.d[1],v14.d[0] 768 1.1 christos 769 1.1 christos mov x8,x5 770 1.1 christos b .LNEON_tail_entry 771 1.1 christos 772 1.1 christos .align 4 773 1.1 christos .LNEON_tail: 774 1.1 christos add v6.2d,v6.2d,v15.2d 775 1.1 christos mov v5.16b,v6.16b 776 1.1 christos ushr v15.2d,v6.2d,#16 777 1.1 christos ext v6.16b,v6.16b,v6.16b,#8 778 1.1 christos ld1 {v8.2d,v9.2d}, [x6],#32 779 1.1 christos add v6.2d,v6.2d,v15.2d 780 1.1 christos ld1 {v10.2d,v11.2d}, [x6],#32 781 1.1 christos ushr v15.2d,v6.2d,#16 782 1.1 christos ld1 {v12.2d,v13.2d}, [x6],#32 783 1.1 christos zip1 v6.4h,v5.4h,v6.4h 784 1.1 christos ins v15.d[1],v14.d[0] 785 1.1 christos 786 1.1 christos .LNEON_tail_entry: 787 1.1 christos add v7.2d,v7.2d,v15.2d 788 1.1 christos st1 {v6.s}[0], 
[x7],#4 789 1.1 christos ushr v15.2d,v7.2d,#16 790 1.1 christos mov v5.16b,v7.16b 791 1.1 christos ext v7.16b,v7.16b,v7.16b,#8 792 1.1 christos add v7.2d,v7.2d,v15.2d 793 1.1 christos ushr v15.2d,v7.2d,#16 794 1.1 christos zip1 v7.4h,v5.4h,v7.4h 795 1.1 christos ins v15.d[1],v14.d[0] 796 1.1 christos add v8.2d,v8.2d,v15.2d 797 1.1 christos st1 {v7.s}[0], [x7],#4 798 1.1 christos ushr v15.2d,v8.2d,#16 799 1.1 christos mov v5.16b,v8.16b 800 1.1 christos ext v8.16b,v8.16b,v8.16b,#8 801 1.1 christos add v8.2d,v8.2d,v15.2d 802 1.1 christos ushr v15.2d,v8.2d,#16 803 1.1 christos zip1 v8.4h,v5.4h,v8.4h 804 1.1 christos ins v15.d[1],v14.d[0] 805 1.1 christos add v9.2d,v9.2d,v15.2d 806 1.1 christos st1 {v8.s}[0], [x7],#4 807 1.1 christos ushr v15.2d,v9.2d,#16 808 1.1 christos mov v5.16b,v9.16b 809 1.1 christos ext v9.16b,v9.16b,v9.16b,#8 810 1.1 christos add v9.2d,v9.2d,v15.2d 811 1.1 christos ushr v15.2d,v9.2d,#16 812 1.1 christos zip1 v9.4h,v5.4h,v9.4h 813 1.1 christos ins v15.d[1],v14.d[0] 814 1.1 christos add v10.2d,v10.2d,v15.2d 815 1.1 christos st1 {v9.s}[0], [x7],#4 816 1.1 christos ushr v15.2d,v10.2d,#16 817 1.1 christos mov v5.16b,v10.16b 818 1.1 christos ext v10.16b,v10.16b,v10.16b,#8 819 1.1 christos add v10.2d,v10.2d,v15.2d 820 1.1 christos ushr v15.2d,v10.2d,#16 821 1.1 christos zip1 v10.4h,v5.4h,v10.4h 822 1.1 christos ins v15.d[1],v14.d[0] 823 1.1 christos add v11.2d,v11.2d,v15.2d 824 1.1 christos st1 {v10.s}[0], [x7],#4 825 1.1 christos ushr v15.2d,v11.2d,#16 826 1.1 christos mov v5.16b,v11.16b 827 1.1 christos ext v11.16b,v11.16b,v11.16b,#8 828 1.1 christos add v11.2d,v11.2d,v15.2d 829 1.1 christos ushr v15.2d,v11.2d,#16 830 1.1 christos zip1 v11.4h,v5.4h,v11.4h 831 1.1 christos ins v15.d[1],v14.d[0] 832 1.1 christos add v12.2d,v12.2d,v15.2d 833 1.1 christos st1 {v11.s}[0], [x7],#4 834 1.1 christos ushr v15.2d,v12.2d,#16 835 1.1 christos mov v5.16b,v12.16b 836 1.1 christos ext v12.16b,v12.16b,v12.16b,#8 837 1.1 christos add v12.2d,v12.2d,v15.2d 838 1.1 
christos ushr v15.2d,v12.2d,#16 839 1.1 christos zip1 v12.4h,v5.4h,v12.4h 840 1.1 christos ins v15.d[1],v14.d[0] 841 1.1 christos add v13.2d,v13.2d,v15.2d 842 1.1 christos st1 {v12.s}[0], [x7],#4 843 1.1 christos ushr v15.2d,v13.2d,#16 844 1.1 christos mov v5.16b,v13.16b 845 1.1 christos ext v13.16b,v13.16b,v13.16b,#8 846 1.1 christos add v13.2d,v13.2d,v15.2d 847 1.1 christos ushr v15.2d,v13.2d,#16 848 1.1 christos zip1 v13.4h,v5.4h,v13.4h 849 1.1 christos ins v15.d[1],v14.d[0] 850 1.1 christos ld1 {v6.2d,v7.2d}, [x6],#32 851 1.1 christos subs x8,x8,#8 852 1.1 christos st1 {v13.s}[0], [x7],#4 853 1.1 christos bne .LNEON_tail 854 1.1 christos 855 1.1 christos st1 {v15.s}[0], [x7],#4 // top-most bit 856 1.1 christos sub x3,x3,x5,lsl#2 // rewind x3 857 1.1 christos subs x1,sp,#0 // clear carry flag 858 1.1 christos add x2,sp,x5,lsl#2 859 1.1 christos 860 1.1 christos .LNEON_sub: 861 1.1 christos ldp w4,w5,[x1],#8 862 1.1 christos ldp w6,w7,[x1],#8 863 1.1 christos ldp w8,w9,[x3],#8 864 1.1 christos ldp w10,w11,[x3],#8 865 1.1 christos sbcs w8,w4,w8 866 1.1 christos sbcs w9,w5,w9 867 1.1 christos sbcs w10,w6,w10 868 1.1 christos sbcs w11,w7,w11 869 1.1 christos sub x17,x2,x1 870 1.1 christos stp w8,w9,[x0],#8 871 1.1 christos stp w10,w11,[x0],#8 872 1.1 christos cbnz x17,.LNEON_sub 873 1.1 christos 874 1.1 christos ldr w10, [x1] // load top-most bit 875 1.1 christos mov x11,sp 876 1.1 christos eor v0.16b,v0.16b,v0.16b 877 1.1 christos sub x11,x2,x11 // this is num*4 878 1.1 christos eor v1.16b,v1.16b,v1.16b 879 1.1 christos mov x1,sp 880 1.1 christos sub x0,x0,x11 // rewind x0 881 1.1 christos mov x3,x2 // second 3/4th of frame 882 1.1 christos sbcs w10,w10,wzr // result is carry flag 883 1.1 christos 884 1.1 christos .LNEON_copy_n_zap: 885 1.1 christos ldp w4,w5,[x1],#8 886 1.1 christos ldp w6,w7,[x1],#8 887 1.1 christos ldp w8,w9,[x0],#8 888 1.1 christos ldp w10,w11,[x0] 889 1.1 christos sub x0,x0,#8 890 1.1 christos b.cs .LCopy_1 891 1.1 christos mov w8,w4 892 1.1 
christos mov w9,w5 893 1.1 christos mov w10,w6 894 1.1 christos mov w11,w7 895 1.1 christos .LCopy_1: 896 1.1 christos st1 {v0.2d,v1.2d}, [x3],#32 // wipe 897 1.1 christos st1 {v0.2d,v1.2d}, [x3],#32 // wipe 898 1.1 christos ldp w4,w5,[x1],#8 899 1.1 christos ldp w6,w7,[x1],#8 900 1.1 christos stp w8,w9,[x0],#8 901 1.1 christos stp w10,w11,[x0],#8 902 1.1 christos sub x1,x1,#32 903 1.1 christos ldp w8,w9,[x0],#8 904 1.1 christos ldp w10,w11,[x0] 905 1.1 christos sub x0,x0,#8 906 1.1 christos b.cs .LCopy_2 907 1.1 christos mov w8, w4 908 1.1 christos mov w9, w5 909 1.1 christos mov w10, w6 910 1.1 christos mov w11, w7 911 1.1 christos .LCopy_2: 912 1.1 christos st1 {v0.2d,v1.2d}, [x1],#32 // wipe 913 1.1 christos st1 {v0.2d,v1.2d}, [x3],#32 // wipe 914 1.1 christos sub x17,x2,x1 // preserves carry 915 1.1 christos stp w8,w9,[x0],#8 916 1.1 christos stp w10,w11,[x0],#8 917 1.1 christos cbnz x17,.LNEON_copy_n_zap 918 1.1 christos 919 1.1 christos mov sp,x16 920 1.1 christos ldp d14,d15,[sp,#64] 921 1.1 christos ldp d12,d13,[sp,#48] 922 1.1 christos ldp d10,d11,[sp,#32] 923 1.1 christos ldp d8,d9,[sp,#16] 924 1.1 christos ldr x29,[sp],#80 925 1.2 christos AARCH64_VALIDATE_LINK_REGISTER 926 1.1 christos ret // bx lr 927 1.1 christos 928 1.1 christos .size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon 929 1.1 christos .type __bn_sqr8x_mont,%function 930 1.1 christos .align 5 931 1.1 christos __bn_sqr8x_mont: // Montgomery squaring, 8 words per pass; bn_mul_mont's scalar path branches here when num%8 == 0 932 1.1 christos cmp x1,x2 // the squaring code is only valid when ap == bp 933 1.1 christos b.ne __bn_mul4x_mont // otherwise hand off to __bn_mul4x_mont 934 1.1 christos .Lsqr8x_mont: 935 1.2 christos // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to 936 1.2 christos // only from bn_mul_mont which has already signed the return address. 937 1.1 christos stp x29,x30,[sp,#-128]! // 128-byte frame: x29/x30, x19-x28, then rp, np and n0 at #96/#104/#112 
938 1.1 christos add x29,sp,#0 939 1.1 christos stp x19,x20,[sp,#16] 940 1.1 christos stp x21,x22,[sp,#32] 941 1.1 christos stp x23,x24,[sp,#48] 942 1.1 christos stp x25,x26,[sp,#64] 943 1.1 christos stp x27,x28,[sp,#80] 944 1.1 christos stp x0,x3,[sp,#96] // offload rp and np 945 1.1 christos 946 1.1 christos ldp x6,x7,[x1,#8*0] // a[0..7] 947 1.1 christos ldp x8,x9,[x1,#8*2] 948 1.1 christos ldp x10,x11,[x1,#8*4] 949 1.1 christos ldp x12,x13,[x1,#8*6] 950 1.1 christos 951 1.1 christos sub x2,sp,x5,lsl#4 // room for 2*num 64-bit words below sp 952 1.1 christos lsl x5,x5,#3 // x5 = num*8, i.e. num in bytes 953 1.1 christos ldr x4,[x4] // *n0 954 1.1 christos mov sp,x2 // alloca 955 1.1 christos sub x27,x5,#8*8 956 1.1 christos b .Lsqr8x_zero_start // first pass leaves t[0..7] unzeroed 957 1.1 christos 958 1.1 christos .Lsqr8x_zero: 959 1.1 christos sub x27,x27,#8*8 960 1.1 christos stp xzr,xzr,[x2,#8*0] 961 1.1 christos stp xzr,xzr,[x2,#8*2] 962 1.1 christos stp xzr,xzr,[x2,#8*4] 963 1.1 christos stp xzr,xzr,[x2,#8*6] 964 1.1 christos .Lsqr8x_zero_start: 965 1.1 christos stp xzr,xzr,[x2,#8*8] 966 1.1 christos stp xzr,xzr,[x2,#8*10] 967 1.1 christos stp xzr,xzr,[x2,#8*12] 968 1.1 christos stp xzr,xzr,[x2,#8*14] 969 1.1 christos add x2,x2,#8*16 970 1.1 christos cbnz x27,.Lsqr8x_zero 971 1.1 christos 972 1.1 christos add x3,x1,x5 // x3 = &a[num] 973 1.1 christos add x1,x1,#8*8 974 1.1 christos mov x19,xzr 975 1.1 christos mov x20,xzr 976 1.1 christos mov x21,xzr 977 1.1 christos mov x22,xzr 978 1.1 christos mov x23,xzr 979 1.1 christos mov x24,xzr 980 1.1 christos mov x25,xzr 981 1.1 christos mov x26,xzr 982 1.1 christos mov x2,sp 983 1.1 christos str x4,[x29,#112] // offload n0 984 1.1 christos 985 1.1 christos // Multiply everything but a[i]*a[i] 986 1.1 christos .align 4 987 1.1 christos .Lsqr8x_outer_loop: 988 1.1 christos // a[1]a[0] (i) 989 1.1 christos // a[2]a[0] 990 1.1 christos // a[3]a[0] 991 1.1 christos // a[4]a[0] 992 1.1 christos // a[5]a[0] 993 1.1 christos // a[6]a[0] 994 1.1 christos // a[7]a[0] 995 1.1 christos // a[2]a[1] (ii) 996 1.1 christos // a[3]a[1] 997 1.1 christos // a[4]a[1] 
998 1.1 christos // a[5]a[1] 999 1.1 christos // a[6]a[1] 1000 1.1 christos // a[7]a[1] 1001 1.1 christos // a[3]a[2] (iii) 1002 1.1 christos // a[4]a[2] 1003 1.1 christos // a[5]a[2] 1004 1.1 christos // a[6]a[2] 1005 1.1 christos // a[7]a[2] 1006 1.1 christos // a[4]a[3] (iv) 1007 1.1 christos // a[5]a[3] 1008 1.1 christos // a[6]a[3] 1009 1.1 christos // a[7]a[3] 1010 1.1 christos // a[5]a[4] (v) 1011 1.1 christos // a[6]a[4] 1012 1.1 christos // a[7]a[4] 1013 1.1 christos // a[6]a[5] (vi) 1014 1.1 christos // a[7]a[5] 1015 1.1 christos // a[7]a[6] (vii) 1016 1.1 christos 1017 1.1 christos mul x14,x7,x6 // lo(a[1..7]*a[0]) (i) 1018 1.1 christos mul x15,x8,x6 1019 1.1 christos mul x16,x9,x6 1020 1.1 christos mul x17,x10,x6 1021 1.1 christos adds x20,x20,x14 // t[1]+lo(a[1]*a[0]) 1022 1.1 christos mul x14,x11,x6 1023 1.1 christos adcs x21,x21,x15 1024 1.1 christos mul x15,x12,x6 1025 1.1 christos adcs x22,x22,x16 1026 1.1 christos mul x16,x13,x6 1027 1.1 christos adcs x23,x23,x17 1028 1.1 christos umulh x17,x7,x6 // hi(a[1..7]*a[0]) 1029 1.1 christos adcs x24,x24,x14 1030 1.1 christos umulh x14,x8,x6 1031 1.1 christos adcs x25,x25,x15 1032 1.1 christos umulh x15,x9,x6 1033 1.1 christos adcs x26,x26,x16 1034 1.1 christos umulh x16,x10,x6 1035 1.1 christos stp x19,x20,[x2],#8*2 // t[0..1] 1036 1.1 christos adc x19,xzr,xzr // t[8] 1037 1.1 christos adds x21,x21,x17 // t[2]+hi(a[1]*a[0]) 1038 1.1 christos umulh x17,x11,x6 1039 1.1 christos adcs x22,x22,x14 1040 1.1 christos umulh x14,x12,x6 1041 1.1 christos adcs x23,x23,x15 1042 1.1 christos umulh x15,x13,x6 1043 1.1 christos adcs x24,x24,x16 1044 1.1 christos mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii) 1045 1.1 christos adcs x25,x25,x17 1046 1.1 christos mul x17,x9,x7 1047 1.1 christos adcs x26,x26,x14 1048 1.1 christos mul x14,x10,x7 1049 1.1 christos adc x19,x19,x15 1050 1.1 christos 1051 1.1 christos mul x15,x11,x7 1052 1.1 christos adds x22,x22,x16 1053 1.1 christos mul x16,x12,x7 1054 1.1 christos adcs x23,x23,x17 
1055 1.1 christos mul x17,x13,x7 1056 1.1 christos adcs x24,x24,x14 1057 1.1 christos umulh x14,x8,x7 // hi(a[2..7]*a[1]) 1058 1.1 christos adcs x25,x25,x15 1059 1.1 christos umulh x15,x9,x7 1060 1.1 christos adcs x26,x26,x16 1061 1.1 christos umulh x16,x10,x7 1062 1.1 christos adcs x19,x19,x17 1063 1.1 christos umulh x17,x11,x7 1064 1.1 christos stp x21,x22,[x2],#8*2 // t[2..3] 1065 1.1 christos adc x20,xzr,xzr // t[9] 1066 1.1 christos adds x23,x23,x14 1067 1.1 christos umulh x14,x12,x7 1068 1.1 christos adcs x24,x24,x15 1069 1.1 christos umulh x15,x13,x7 1070 1.1 christos adcs x25,x25,x16 1071 1.1 christos mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii) 1072 1.1 christos adcs x26,x26,x17 1073 1.1 christos mul x17,x10,x8 1074 1.1 christos adcs x19,x19,x14 1075 1.1 christos mul x14,x11,x8 1076 1.1 christos adc x20,x20,x15 1077 1.1 christos 1078 1.1 christos mul x15,x12,x8 1079 1.1 christos adds x24,x24,x16 1080 1.1 christos mul x16,x13,x8 1081 1.1 christos adcs x25,x25,x17 1082 1.1 christos umulh x17,x9,x8 // hi(a[3..7]*a[2]) 1083 1.1 christos adcs x26,x26,x14 1084 1.1 christos umulh x14,x10,x8 1085 1.1 christos adcs x19,x19,x15 1086 1.1 christos umulh x15,x11,x8 1087 1.1 christos adcs x20,x20,x16 1088 1.1 christos umulh x16,x12,x8 1089 1.1 christos stp x23,x24,[x2],#8*2 // t[4..5] 1090 1.1 christos adc x21,xzr,xzr // t[10] 1091 1.1 christos adds x25,x25,x17 1092 1.1 christos umulh x17,x13,x8 1093 1.1 christos adcs x26,x26,x14 1094 1.1 christos mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv) 1095 1.1 christos adcs x19,x19,x15 1096 1.1 christos mul x15,x11,x9 1097 1.1 christos adcs x20,x20,x16 1098 1.1 christos mul x16,x12,x9 1099 1.1 christos adc x21,x21,x17 1100 1.1 christos 1101 1.1 christos mul x17,x13,x9 1102 1.1 christos adds x26,x26,x14 1103 1.1 christos umulh x14,x10,x9 // hi(a[4..7]*a[3]) 1104 1.1 christos adcs x19,x19,x15 1105 1.1 christos umulh x15,x11,x9 1106 1.1 christos adcs x20,x20,x16 1107 1.1 christos umulh x16,x12,x9 1108 1.1 christos adcs x21,x21,x17 1109 1.1 
christos umulh x17,x13,x9 1110 1.1 christos stp x25,x26,[x2],#8*2 // t[6..7] 1111 1.1 christos adc x22,xzr,xzr // t[11] 1112 1.1 christos adds x19,x19,x14 1113 1.1 christos mul x14,x11,x10 // lo(a[5..7]*a[4]) (v) 1114 1.1 christos adcs x20,x20,x15 1115 1.1 christos mul x15,x12,x10 1116 1.1 christos adcs x21,x21,x16 1117 1.1 christos mul x16,x13,x10 1118 1.1 christos adc x22,x22,x17 1119 1.1 christos 1120 1.1 christos umulh x17,x11,x10 // hi(a[5..7]*a[4]) 1121 1.1 christos adds x20,x20,x14 1122 1.1 christos umulh x14,x12,x10 1123 1.1 christos adcs x21,x21,x15 1124 1.1 christos umulh x15,x13,x10 1125 1.1 christos adcs x22,x22,x16 1126 1.1 christos mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi) 1127 1.1 christos adc x23,xzr,xzr // t[12] 1128 1.1 christos adds x21,x21,x17 1129 1.1 christos mul x17,x13,x11 1130 1.1 christos adcs x22,x22,x14 1131 1.1 christos umulh x14,x12,x11 // hi(a[6..7]*a[5]) 1132 1.1 christos adc x23,x23,x15 1133 1.1 christos 1134 1.1 christos umulh x15,x13,x11 1135 1.1 christos adds x22,x22,x16 1136 1.1 christos mul x16,x13,x12 // lo(a[7]*a[6]) (vii) 1137 1.1 christos adcs x23,x23,x17 1138 1.1 christos umulh x17,x13,x12 // hi(a[7]*a[6]) 1139 1.1 christos adc x24,xzr,xzr // t[13] 1140 1.1 christos adds x23,x23,x14 1141 1.1 christos sub x27,x3,x1 // done yet? 
1142 1.1 christos adc x24,x24,x15 1143 1.1 christos 1144 1.1 christos adds x24,x24,x16 1145 1.1 christos sub x14,x3,x5 // rewound ap, i.e. &a[0] 1146 1.1 christos adc x25,xzr,xzr // t[14] 1147 1.1 christos add x25,x25,x17 1148 1.1 christos 1149 1.1 christos cbz x27,.Lsqr8x_outer_break 1150 1.1 christos 1151 1.1 christos mov x4,x6 1152 1.1 christos ldp x6,x7,[x2,#8*0] 1153 1.1 christos ldp x8,x9,[x2,#8*2] 1154 1.1 christos ldp x10,x11,[x2,#8*4] 1155 1.1 christos ldp x12,x13,[x2,#8*6] 1156 1.1 christos adds x19,x19,x6 1157 1.1 christos adcs x20,x20,x7 1158 1.1 christos ldp x6,x7,[x1,#8*0] 1159 1.1 christos adcs x21,x21,x8 1160 1.1 christos adcs x22,x22,x9 1161 1.1 christos ldp x8,x9,[x1,#8*2] 1162 1.1 christos adcs x23,x23,x10 1163 1.1 christos adcs x24,x24,x11 1164 1.1 christos ldp x10,x11,[x1,#8*4] 1165 1.1 christos adcs x25,x25,x12 1166 1.1 christos mov x0,x1 1167 1.1 christos adcs x26,xzr,x13 1168 1.1 christos ldp x12,x13,[x1,#8*6] 1169 1.1 christos add x1,x1,#8*8 1170 1.1 christos //adc x28,xzr,xzr // moved below 1171 1.1 christos mov x27,#-8*8 1172 1.1 christos 1173 1.1 christos // a[8]a[0] 1174 1.1 christos // a[9]a[0] 1175 1.1 christos // a[a]a[0] 1176 1.1 christos // a[b]a[0] 1177 1.1 christos // a[c]a[0] 1178 1.1 christos // a[d]a[0] 1179 1.1 christos // a[e]a[0] 1180 1.1 christos // a[f]a[0] 1181 1.1 christos // a[8]a[1] 1182 1.1 christos // a[f]a[1]........................ 1183 1.1 christos // a[8]a[2] 1184 1.1 christos // a[f]a[2]........................ 1185 1.1 christos // a[8]a[3] 1186 1.1 christos // a[f]a[3]........................ 1187 1.1 christos // a[8]a[4] 1188 1.1 christos // a[f]a[4]........................ 1189 1.1 christos // a[8]a[5] 1190 1.1 christos // a[f]a[5]........................ 1191 1.1 christos // a[8]a[6] 1192 1.1 christos // a[f]a[6]........................ 1193 1.1 christos // a[8]a[7] 1194 1.1 christos // a[f]a[7]........................ 
1195 1.1 christos .Lsqr8x_mul: 1196 1.1 christos mul x14,x6,x4 1197 1.1 christos adc x28,xzr,xzr // carry bit, modulo-scheduled 1198 1.1 christos mul x15,x7,x4 1199 1.1 christos add x27,x27,#8 1200 1.1 christos mul x16,x8,x4 1201 1.1 christos mul x17,x9,x4 1202 1.1 christos adds x19,x19,x14 1203 1.1 christos mul x14,x10,x4 1204 1.1 christos adcs x20,x20,x15 1205 1.1 christos mul x15,x11,x4 1206 1.1 christos adcs x21,x21,x16 1207 1.1 christos mul x16,x12,x4 1208 1.1 christos adcs x22,x22,x17 1209 1.1 christos mul x17,x13,x4 1210 1.1 christos adcs x23,x23,x14 1211 1.1 christos umulh x14,x6,x4 1212 1.1 christos adcs x24,x24,x15 1213 1.1 christos umulh x15,x7,x4 1214 1.1 christos adcs x25,x25,x16 1215 1.1 christos umulh x16,x8,x4 1216 1.1 christos adcs x26,x26,x17 1217 1.1 christos umulh x17,x9,x4 1218 1.1 christos adc x28,x28,xzr 1219 1.1 christos str x19,[x2],#8 1220 1.1 christos adds x19,x20,x14 1221 1.1 christos umulh x14,x10,x4 1222 1.1 christos adcs x20,x21,x15 1223 1.1 christos umulh x15,x11,x4 1224 1.1 christos adcs x21,x22,x16 1225 1.1 christos umulh x16,x12,x4 1226 1.1 christos adcs x22,x23,x17 1227 1.1 christos umulh x17,x13,x4 1228 1.1 christos ldr x4,[x0,x27] 1229 1.1 christos adcs x23,x24,x14 1230 1.1 christos adcs x24,x25,x15 1231 1.1 christos adcs x25,x26,x16 1232 1.1 christos adcs x26,x28,x17 1233 1.1 christos //adc x28,xzr,xzr // moved above 1234 1.1 christos cbnz x27,.Lsqr8x_mul 1235 1.1 christos // note that carry flag is guaranteed 1236 1.1 christos // to be zero at this point 1237 1.1 christos cmp x1,x3 // done yet? 
1238 1.1 christos b.eq .Lsqr8x_break 1239 1.1 christos 1240 1.1 christos ldp x6,x7,[x2,#8*0] 1241 1.1 christos ldp x8,x9,[x2,#8*2] 1242 1.1 christos ldp x10,x11,[x2,#8*4] 1243 1.1 christos ldp x12,x13,[x2,#8*6] 1244 1.1 christos adds x19,x19,x6 1245 1.1 christos ldur x4,[x0,#-8*8] 1246 1.1 christos adcs x20,x20,x7 1247 1.1 christos ldp x6,x7,[x1,#8*0] 1248 1.1 christos adcs x21,x21,x8 1249 1.1 christos adcs x22,x22,x9 1250 1.1 christos ldp x8,x9,[x1,#8*2] 1251 1.1 christos adcs x23,x23,x10 1252 1.1 christos adcs x24,x24,x11 1253 1.1 christos ldp x10,x11,[x1,#8*4] 1254 1.1 christos adcs x25,x25,x12 1255 1.1 christos mov x27,#-8*8 1256 1.1 christos adcs x26,x26,x13 1257 1.1 christos ldp x12,x13,[x1,#8*6] 1258 1.1 christos add x1,x1,#8*8 1259 1.1 christos //adc x28,xzr,xzr // moved above 1260 1.1 christos b .Lsqr8x_mul 1261 1.1 christos 1262 1.1 christos .align 4 1263 1.1 christos .Lsqr8x_break: 1264 1.1 christos ldp x6,x7,[x0,#8*0] 1265 1.1 christos add x1,x0,#8*8 1266 1.1 christos ldp x8,x9,[x0,#8*2] 1267 1.1 christos sub x14,x3,x1 // is it last iteration? 
1268 1.1 christos ldp x10,x11,[x0,#8*4] 1269 1.1 christos sub x15,x2,x14 1270 1.1 christos ldp x12,x13,[x0,#8*6] 1271 1.1 christos cbz x14,.Lsqr8x_outer_loop 1272 1.1 christos 1273 1.1 christos stp x19,x20,[x2,#8*0] 1274 1.1 christos ldp x19,x20,[x15,#8*0] 1275 1.1 christos stp x21,x22,[x2,#8*2] 1276 1.1 christos ldp x21,x22,[x15,#8*2] 1277 1.1 christos stp x23,x24,[x2,#8*4] 1278 1.1 christos ldp x23,x24,[x15,#8*4] 1279 1.1 christos stp x25,x26,[x2,#8*6] 1280 1.1 christos mov x2,x15 1281 1.1 christos ldp x25,x26,[x15,#8*6] 1282 1.1 christos b .Lsqr8x_outer_loop 1283 1.1 christos 1284 1.1 christos .align 4 1285 1.1 christos .Lsqr8x_outer_break: 1286 1.1 christos // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0] 1287 1.1 christos ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0] 1288 1.1 christos ldp x15,x16,[sp,#8*1] 1289 1.1 christos ldp x11,x13,[x14,#8*2] 1290 1.1 christos add x1,x14,#8*4 1291 1.1 christos ldp x17,x14,[sp,#8*3] 1292 1.1 christos 1293 1.1 christos stp x19,x20,[x2,#8*0] 1294 1.1 christos mul x19,x7,x7 1295 1.1 christos stp x21,x22,[x2,#8*2] 1296 1.1 christos umulh x7,x7,x7 1297 1.1 christos stp x23,x24,[x2,#8*4] 1298 1.1 christos mul x8,x9,x9 1299 1.1 christos stp x25,x26,[x2,#8*6] 1300 1.1 christos mov x2,sp 1301 1.1 christos umulh x9,x9,x9 1302 1.1 christos adds x20,x7,x15,lsl#1 1303 1.1 christos extr x15,x16,x15,#63 1304 1.1 christos sub x27,x5,#8*4 1305 1.1 christos 1306 1.1 christos .Lsqr4x_shift_n_add: 1307 1.1 christos adcs x21,x8,x15 1308 1.1 christos extr x16,x17,x16,#63 1309 1.1 christos sub x27,x27,#8*4 1310 1.1 christos adcs x22,x9,x16 1311 1.1 christos ldp x15,x16,[x2,#8*5] 1312 1.1 christos mul x10,x11,x11 1313 1.1 christos ldp x7,x9,[x1],#8*2 1314 1.1 christos umulh x11,x11,x11 1315 1.1 christos mul x12,x13,x13 1316 1.1 christos umulh x13,x13,x13 1317 1.1 christos extr x17,x14,x17,#63 1318 1.1 christos stp x19,x20,[x2,#8*0] 1319 1.1 christos adcs x23,x10,x17 1320 1.1 christos extr x14,x15,x14,#63 1321 1.1 christos 
stp x21,x22,[x2,#8*2] 1322 1.1 christos adcs x24,x11,x14 1323 1.1 christos ldp x17,x14,[x2,#8*7] 1324 1.1 christos extr x15,x16,x15,#63 1325 1.1 christos adcs x25,x12,x15 1326 1.1 christos extr x16,x17,x16,#63 1327 1.1 christos adcs x26,x13,x16 1328 1.1 christos ldp x15,x16,[x2,#8*9] 1329 1.1 christos mul x6,x7,x7 1330 1.1 christos ldp x11,x13,[x1],#8*2 1331 1.1 christos umulh x7,x7,x7 1332 1.1 christos mul x8,x9,x9 1333 1.1 christos umulh x9,x9,x9 1334 1.1 christos stp x23,x24,[x2,#8*4] 1335 1.1 christos extr x17,x14,x17,#63 1336 1.1 christos stp x25,x26,[x2,#8*6] 1337 1.1 christos add x2,x2,#8*8 1338 1.1 christos adcs x19,x6,x17 1339 1.1 christos extr x14,x15,x14,#63 1340 1.1 christos adcs x20,x7,x14 1341 1.1 christos ldp x17,x14,[x2,#8*3] 1342 1.1 christos extr x15,x16,x15,#63 1343 1.1 christos cbnz x27,.Lsqr4x_shift_n_add 1344 1.1 christos ldp x1,x4,[x29,#104] // pull np and n0 1345 1.1 christos 1346 1.1 christos adcs x21,x8,x15 1347 1.1 christos extr x16,x17,x16,#63 1348 1.1 christos adcs x22,x9,x16 1349 1.1 christos ldp x15,x16,[x2,#8*5] 1350 1.1 christos mul x10,x11,x11 1351 1.1 christos umulh x11,x11,x11 1352 1.1 christos stp x19,x20,[x2,#8*0] 1353 1.1 christos mul x12,x13,x13 1354 1.1 christos umulh x13,x13,x13 1355 1.1 christos stp x21,x22,[x2,#8*2] 1356 1.1 christos extr x17,x14,x17,#63 1357 1.1 christos adcs x23,x10,x17 1358 1.1 christos extr x14,x15,x14,#63 1359 1.1 christos ldp x19,x20,[sp,#8*0] 1360 1.1 christos adcs x24,x11,x14 1361 1.1 christos extr x15,x16,x15,#63 1362 1.1 christos ldp x6,x7,[x1,#8*0] 1363 1.1 christos adcs x25,x12,x15 1364 1.1 christos extr x16,xzr,x16,#63 1365 1.1 christos ldp x8,x9,[x1,#8*2] 1366 1.1 christos adc x26,x13,x16 1367 1.1 christos ldp x10,x11,[x1,#8*4] 1368 1.1 christos 1369 1.1 christos // Reduce by 512 bits per iteration 1370 1.1 christos mul x28,x4,x19 // t[0]*n0 1371 1.1 christos ldp x12,x13,[x1,#8*6] 1372 1.1 christos add x3,x1,x5 1373 1.1 christos ldp x21,x22,[sp,#8*2] 1374 1.1 christos stp x23,x24,[x2,#8*4] 
1375 1.1 christos ldp x23,x24,[sp,#8*4] 1376 1.1 christos stp x25,x26,[x2,#8*6] 1377 1.1 christos ldp x25,x26,[sp,#8*6] 1378 1.1 christos add x1,x1,#8*8 1379 1.1 christos mov x30,xzr // initial top-most carry 1380 1.1 christos mov x2,sp 1381 1.1 christos mov x27,#8 1382 1.1 christos 1383 1.1 christos .Lsqr8x_reduction: 1384 1.1 christos // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0) 1385 1.1 christos mul x15,x7,x28 1386 1.1 christos sub x27,x27,#1 1387 1.1 christos mul x16,x8,x28 1388 1.1 christos str x28,[x2],#8 // put aside t[0]*n0 for tail processing 1389 1.1 christos mul x17,x9,x28 1390 1.1 christos // (*) adds xzr,x19,x14 1391 1.1 christos subs xzr,x19,#1 // (*) 1392 1.1 christos mul x14,x10,x28 1393 1.1 christos adcs x19,x20,x15 1394 1.1 christos mul x15,x11,x28 1395 1.1 christos adcs x20,x21,x16 1396 1.1 christos mul x16,x12,x28 1397 1.1 christos adcs x21,x22,x17 1398 1.1 christos mul x17,x13,x28 1399 1.1 christos adcs x22,x23,x14 1400 1.1 christos umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0) 1401 1.1 christos adcs x23,x24,x15 1402 1.1 christos umulh x15,x7,x28 1403 1.1 christos adcs x24,x25,x16 1404 1.1 christos umulh x16,x8,x28 1405 1.1 christos adcs x25,x26,x17 1406 1.1 christos umulh x17,x9,x28 1407 1.1 christos adc x26,xzr,xzr 1408 1.1 christos adds x19,x19,x14 1409 1.1 christos umulh x14,x10,x28 1410 1.1 christos adcs x20,x20,x15 1411 1.1 christos umulh x15,x11,x28 1412 1.1 christos adcs x21,x21,x16 1413 1.1 christos umulh x16,x12,x28 1414 1.1 christos adcs x22,x22,x17 1415 1.1 christos umulh x17,x13,x28 1416 1.1 christos mul x28,x4,x19 // next t[0]*n0 1417 1.1 christos adcs x23,x23,x14 1418 1.1 christos adcs x24,x24,x15 1419 1.1 christos adcs x25,x25,x16 1420 1.1 christos adc x26,x26,x17 1421 1.1 christos cbnz x27,.Lsqr8x_reduction 1422 1.1 christos 1423 1.1 christos ldp x14,x15,[x2,#8*0] 1424 1.1 christos ldp x16,x17,[x2,#8*2] 1425 1.1 christos mov x0,x2 1426 1.1 christos sub x27,x3,x1 // done yet? 
1427 1.1 christos adds x19,x19,x14 1428 1.1 christos adcs x20,x20,x15 1429 1.1 christos ldp x14,x15,[x2,#8*4] 1430 1.1 christos adcs x21,x21,x16 1431 1.1 christos adcs x22,x22,x17 1432 1.1 christos ldp x16,x17,[x2,#8*6] 1433 1.1 christos adcs x23,x23,x14 1434 1.1 christos adcs x24,x24,x15 1435 1.1 christos adcs x25,x25,x16 1436 1.1 christos adcs x26,x26,x17 1437 1.1 christos //adc x28,xzr,xzr // moved below 1438 1.1 christos cbz x27,.Lsqr8x8_post_condition 1439 1.1 christos 1440 1.1 christos ldur x4,[x2,#-8*8] 1441 1.1 christos ldp x6,x7,[x1,#8*0] 1442 1.1 christos ldp x8,x9,[x1,#8*2] 1443 1.1 christos ldp x10,x11,[x1,#8*4] 1444 1.1 christos mov x27,#-8*8 1445 1.1 christos ldp x12,x13,[x1,#8*6] 1446 1.1 christos add x1,x1,#8*8 1447 1.1 christos 1448 1.1 christos .Lsqr8x_tail: 1449 1.1 christos mul x14,x6,x4 1450 1.1 christos adc x28,xzr,xzr // carry bit, modulo-scheduled 1451 1.1 christos mul x15,x7,x4 1452 1.1 christos add x27,x27,#8 1453 1.1 christos mul x16,x8,x4 1454 1.1 christos mul x17,x9,x4 1455 1.1 christos adds x19,x19,x14 1456 1.1 christos mul x14,x10,x4 1457 1.1 christos adcs x20,x20,x15 1458 1.1 christos mul x15,x11,x4 1459 1.1 christos adcs x21,x21,x16 1460 1.1 christos mul x16,x12,x4 1461 1.1 christos adcs x22,x22,x17 1462 1.1 christos mul x17,x13,x4 1463 1.1 christos adcs x23,x23,x14 1464 1.1 christos umulh x14,x6,x4 1465 1.1 christos adcs x24,x24,x15 1466 1.1 christos umulh x15,x7,x4 1467 1.1 christos adcs x25,x25,x16 1468 1.1 christos umulh x16,x8,x4 1469 1.1 christos adcs x26,x26,x17 1470 1.1 christos umulh x17,x9,x4 1471 1.1 christos adc x28,x28,xzr 1472 1.1 christos str x19,[x2],#8 1473 1.1 christos adds x19,x20,x14 1474 1.1 christos umulh x14,x10,x4 1475 1.1 christos adcs x20,x21,x15 1476 1.1 christos umulh x15,x11,x4 1477 1.1 christos adcs x21,x22,x16 1478 1.1 christos umulh x16,x12,x4 1479 1.1 christos adcs x22,x23,x17 1480 1.1 christos umulh x17,x13,x4 1481 1.1 christos ldr x4,[x0,x27] 1482 1.1 christos adcs x23,x24,x14 1483 1.1 christos 
adcs x24,x25,x15 1484 1.1 christos adcs x25,x26,x16 1485 1.1 christos adcs x26,x28,x17 1486 1.1 christos //adc x28,xzr,xzr // moved above 1487 1.1 christos cbnz x27,.Lsqr8x_tail 1488 1.1 christos // note that carry flag is guaranteed 1489 1.1 christos // to be zero at this point 1490 1.1 christos ldp x6,x7,[x2,#8*0] 1491 1.1 christos sub x27,x3,x1 // done yet? 1492 1.1 christos sub x16,x3,x5 // rewound np, i.e. &n[0] 1493 1.1 christos ldp x8,x9,[x2,#8*2] 1494 1.1 christos ldp x10,x11,[x2,#8*4] 1495 1.1 christos ldp x12,x13,[x2,#8*6] 1496 1.1 christos cbz x27,.Lsqr8x_tail_break 1497 1.1 christos 1498 1.1 christos ldur x4,[x0,#-8*8] 1499 1.1 christos adds x19,x19,x6 1500 1.1 christos adcs x20,x20,x7 1501 1.1 christos ldp x6,x7,[x1,#8*0] 1502 1.1 christos adcs x21,x21,x8 1503 1.1 christos adcs x22,x22,x9 1504 1.1 christos ldp x8,x9,[x1,#8*2] 1505 1.1 christos adcs x23,x23,x10 1506 1.1 christos adcs x24,x24,x11 1507 1.1 christos ldp x10,x11,[x1,#8*4] 1508 1.1 christos adcs x25,x25,x12 1509 1.1 christos mov x27,#-8*8 1510 1.1 christos adcs x26,x26,x13 1511 1.1 christos ldp x12,x13,[x1,#8*6] 1512 1.1 christos add x1,x1,#8*8 1513 1.1 christos //adc x28,xzr,xzr // moved above 1514 1.1 christos b .Lsqr8x_tail 1515 1.1 christos 1516 1.1 christos .align 4 1517 1.1 christos .Lsqr8x_tail_break: 1518 1.1 christos ldr x4,[x29,#112] // pull n0 1519 1.1 christos add x27,x2,#8*8 // end of current t[num] window 1520 1.1 christos 1521 1.1 christos subs xzr,x30,#1 // "move" top-most carry to carry bit 1522 1.1 christos adcs x14,x19,x6 1523 1.1 christos adcs x15,x20,x7 1524 1.1 christos ldp x19,x20,[x0,#8*0] 1525 1.1 christos adcs x21,x21,x8 1526 1.1 christos ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0] 1527 1.1 christos adcs x22,x22,x9 1528 1.1 christos ldp x8,x9,[x16,#8*2] 1529 1.1 christos adcs x23,x23,x10 1530 1.1 christos adcs x24,x24,x11 1531 1.1 christos ldp x10,x11,[x16,#8*4] 1532 1.1 christos adcs x25,x25,x12 1533 1.1 christos adcs x26,x26,x13 1534 1.1 christos ldp x12,x13,[x16,#8*6] 
1535 1.1 christos add x1,x16,#8*8 1536 1.1 christos adc x30,xzr,xzr // top-most carry 1537 1.1 christos mul x28,x4,x19 1538 1.1 christos stp x14,x15,[x2,#8*0] 1539 1.1 christos stp x21,x22,[x2,#8*2] 1540 1.1 christos ldp x21,x22,[x0,#8*2] 1541 1.1 christos stp x23,x24,[x2,#8*4] 1542 1.1 christos ldp x23,x24,[x0,#8*4] 1543 1.1 christos cmp x27,x29 // did we hit the bottom? 1544 1.1 christos stp x25,x26,[x2,#8*6] 1545 1.1 christos mov x2,x0 // slide the window 1546 1.1 christos ldp x25,x26,[x0,#8*6] 1547 1.1 christos mov x27,#8 1548 1.1 christos b.ne .Lsqr8x_reduction 1549 1.1 christos 1550 1.1 christos // Final step. We see if result is larger than modulus, and 1551 1.1 christos // if it is, subtract the modulus. But comparison implies 1552 1.1 christos // subtraction. So we subtract modulus, see if it borrowed, 1553 1.1 christos // and conditionally copy original value. 1554 1.1 christos ldr x0,[x29,#96] // pull rp 1555 1.1 christos add x2,x2,#8*8 1556 1.1 christos subs x14,x19,x6 1557 1.1 christos sbcs x15,x20,x7 1558 1.1 christos sub x27,x5,#8*8 1559 1.1 christos mov x3,x0 // x0 copy 1560 1.1 christos 1561 1.1 christos .Lsqr8x_sub: 1562 1.1 christos sbcs x16,x21,x8 1563 1.1 christos ldp x6,x7,[x1,#8*0] 1564 1.1 christos sbcs x17,x22,x9 1565 1.1 christos stp x14,x15,[x0,#8*0] 1566 1.1 christos sbcs x14,x23,x10 1567 1.1 christos ldp x8,x9,[x1,#8*2] 1568 1.1 christos sbcs x15,x24,x11 1569 1.1 christos stp x16,x17,[x0,#8*2] 1570 1.1 christos sbcs x16,x25,x12 1571 1.1 christos ldp x10,x11,[x1,#8*4] 1572 1.1 christos sbcs x17,x26,x13 1573 1.1 christos ldp x12,x13,[x1,#8*6] 1574 1.1 christos add x1,x1,#8*8 1575 1.1 christos ldp x19,x20,[x2,#8*0] 1576 1.1 christos sub x27,x27,#8*8 1577 1.1 christos ldp x21,x22,[x2,#8*2] 1578 1.1 christos ldp x23,x24,[x2,#8*4] 1579 1.1 christos ldp x25,x26,[x2,#8*6] 1580 1.1 christos add x2,x2,#8*8 1581 1.1 christos stp x14,x15,[x0,#8*4] 1582 1.1 christos sbcs x14,x19,x6 1583 1.1 christos stp x16,x17,[x0,#8*6] 1584 1.1 christos add 
x0,x0,#8*8 1585 1.1 christos sbcs x15,x20,x7 1586 1.1 christos cbnz x27,.Lsqr8x_sub 1587 1.1 christos 1588 1.1 christos sbcs x16,x21,x8 1589 1.1 christos mov x2,sp 1590 1.1 christos add x1,sp,x5 1591 1.1 christos ldp x6,x7,[x3,#8*0] 1592 1.1 christos sbcs x17,x22,x9 1593 1.1 christos stp x14,x15,[x0,#8*0] 1594 1.1 christos sbcs x14,x23,x10 1595 1.1 christos ldp x8,x9,[x3,#8*2] 1596 1.1 christos sbcs x15,x24,x11 1597 1.1 christos stp x16,x17,[x0,#8*2] 1598 1.1 christos sbcs x16,x25,x12 1599 1.1 christos ldp x19,x20,[x1,#8*0] 1600 1.1 christos sbcs x17,x26,x13 1601 1.1 christos ldp x21,x22,[x1,#8*2] 1602 1.1 christos sbcs xzr,x30,xzr // did it borrow? 1603 1.1 christos ldr x30,[x29,#8] // pull return address 1604 1.1 christos stp x14,x15,[x0,#8*4] 1605 1.1 christos stp x16,x17,[x0,#8*6] 1606 1.1 christos 1607 1.1 christos sub x27,x5,#8*4 1608 1.1 christos .Lsqr4x_cond_copy: 1609 1.1 christos sub x27,x27,#8*4 1610 1.1 christos csel x14,x19,x6,lo 1611 1.1 christos stp xzr,xzr,[x2,#8*0] 1612 1.1 christos csel x15,x20,x7,lo 1613 1.1 christos ldp x6,x7,[x3,#8*4] 1614 1.1 christos ldp x19,x20,[x1,#8*4] 1615 1.1 christos csel x16,x21,x8,lo 1616 1.1 christos stp xzr,xzr,[x2,#8*2] 1617 1.1 christos add x2,x2,#8*4 1618 1.1 christos csel x17,x22,x9,lo 1619 1.1 christos ldp x8,x9,[x3,#8*6] 1620 1.1 christos ldp x21,x22,[x1,#8*6] 1621 1.1 christos add x1,x1,#8*4 1622 1.1 christos stp x14,x15,[x3,#8*0] 1623 1.1 christos stp x16,x17,[x3,#8*2] 1624 1.1 christos add x3,x3,#8*4 1625 1.1 christos stp xzr,xzr,[x1,#8*0] 1626 1.1 christos stp xzr,xzr,[x1,#8*2] 1627 1.1 christos cbnz x27,.Lsqr4x_cond_copy 1628 1.1 christos 1629 1.1 christos csel x14,x19,x6,lo 1630 1.1 christos stp xzr,xzr,[x2,#8*0] 1631 1.1 christos csel x15,x20,x7,lo 1632 1.1 christos stp xzr,xzr,[x2,#8*2] 1633 1.1 christos csel x16,x21,x8,lo 1634 1.1 christos csel x17,x22,x9,lo 1635 1.1 christos stp x14,x15,[x3,#8*0] 1636 1.1 christos stp x16,x17,[x3,#8*2] 1637 1.1 christos 1638 1.1 christos b .Lsqr8x_done 1639 1.1 
christos 1640 1.1 christos .align 4 1641 1.1 christos .Lsqr8x8_post_condition: 1642 1.1 christos adc x28,xzr,xzr 1643 1.1 christos ldr x30,[x29,#8] // pull return address 1644 1.1 christos // x19-x26,x28 hold result, x6-x13 hold modulus 1645 1.1 christos subs x6,x19,x6 1646 1.1 christos ldr x1,[x29,#96] // pull rp 1647 1.1 christos sbcs x7,x20,x7 1648 1.1 christos stp xzr,xzr,[sp,#8*0] 1649 1.1 christos sbcs x8,x21,x8 1650 1.1 christos stp xzr,xzr,[sp,#8*2] 1651 1.1 christos sbcs x9,x22,x9 1652 1.1 christos stp xzr,xzr,[sp,#8*4] 1653 1.1 christos sbcs x10,x23,x10 1654 1.1 christos stp xzr,xzr,[sp,#8*6] 1655 1.1 christos sbcs x11,x24,x11 1656 1.1 christos stp xzr,xzr,[sp,#8*8] 1657 1.1 christos sbcs x12,x25,x12 1658 1.1 christos stp xzr,xzr,[sp,#8*10] 1659 1.1 christos sbcs x13,x26,x13 1660 1.1 christos stp xzr,xzr,[sp,#8*12] 1661 1.1 christos sbcs x28,x28,xzr // did it borrow? 1662 1.1 christos stp xzr,xzr,[sp,#8*14] 1663 1.1 christos 1664 1.1 christos // x6-x13 hold result-modulus 1665 1.1 christos csel x6,x19,x6,lo 1666 1.1 christos csel x7,x20,x7,lo 1667 1.1 christos csel x8,x21,x8,lo 1668 1.1 christos csel x9,x22,x9,lo 1669 1.1 christos stp x6,x7,[x1,#8*0] 1670 1.1 christos csel x10,x23,x10,lo 1671 1.1 christos csel x11,x24,x11,lo 1672 1.1 christos stp x8,x9,[x1,#8*2] 1673 1.1 christos csel x12,x25,x12,lo 1674 1.1 christos csel x13,x26,x13,lo 1675 1.1 christos stp x10,x11,[x1,#8*4] 1676 1.1 christos stp x12,x13,[x1,#8*6] 1677 1.1 christos 1678 1.1 christos .Lsqr8x_done: 1679 1.1 christos ldp x19,x20,[x29,#16] 1680 1.1 christos mov sp,x29 1681 1.1 christos ldp x21,x22,[x29,#32] 1682 1.1 christos mov x0,#1 // return value is always 1 1683 1.1 christos ldp x23,x24,[x29,#48] 1684 1.1 christos ldp x25,x26,[x29,#64] 1685 1.1 christos ldp x27,x28,[x29,#80] 1686 1.1 christos ldr x29,[sp],#128 1687 1.2 christos // x30 is loaded earlier 1688 1.2 christos AARCH64_VALIDATE_LINK_REGISTER 1689 1.1 christos ret 1690 1.1 christos .size __bn_sqr8x_mont,.-__bn_sqr8x_mont 1691 1.1 christos .type 
__bn_mul4x_mont,%function 1692 1.1 christos .align 5 1693 1.1 christos __bn_mul4x_mont: 1694 1.2 christos // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to 1695 1.2 christos // only from bn_mul_mont (or __bn_sqr8x_mont from bn_mul_mont) which has already signed the return address. Register roles, per the inline comments below: x0=rp, x1=a[], x2=b[], x3=n[], x4=&n0, x5=num (in words). 1696 1.1 christos stp x29,x30,[sp,#-128]! 1697 1.1 christos add x29,sp,#0 // x29 = base of the 128-byte frame 1698 1.1 christos stp x19,x20,[sp,#16] 1699 1.1 christos stp x21,x22,[sp,#32] 1700 1.1 christos stp x23,x24,[sp,#48] 1701 1.1 christos stp x25,x26,[sp,#64] 1702 1.1 christos stp x27,x28,[sp,#80] 1703 1.1 christos 1704 1.1 christos sub x26,sp,x5,lsl#3 // x26 = sp - 8*num 1705 1.1 christos lsl x5,x5,#3 // x5 = 8*num, a byte count from here on 1706 1.1 christos ldr x4,[x4] // *n0 1707 1.1 christos sub sp,x26,#8*4 // alloca: 8*num bytes plus four extra words 1708 1.1 christos 1709 1.1 christos add x10,x2,x5 1710 1.1 christos add x27,x1,x5 // x27 = &a[num], end marker for the "done yet?" tests 1711 1.1 christos stp x0,x10,[x29,#96] // offload rp and &b[num] 1712 1.1 christos 1713 1.1 christos ldr x24,[x2,#8*0] // b[0] 1714 1.1 christos ldp x6,x7,[x1,#8*0] // a[0..3] 1715 1.1 christos ldp x8,x9,[x1,#8*2] 1716 1.1 christos add x1,x1,#8*4 1717 1.1 christos mov x19,xzr 1718 1.1 christos mov x20,xzr 1719 1.1 christos mov x21,xzr 1720 1.1 christos mov x22,xzr 1721 1.1 christos ldp x14,x15,[x3,#8*0] // n[0..3] 1722 1.1 christos ldp x16,x17,[x3,#8*2] 1723 1.1 christos adds x3,x3,#8*4 // clear carry bit 1724 1.1 christos mov x0,xzr 1725 1.1 christos mov x28,#0 // x28 = byte index into the current four-word group of b[] 1726 1.1 christos mov x26,sp // x26 = store cursor into the stack scratch area 1727 1.1 christos 1728 1.1 christos .Loop_mul4x_1st_reduction: 1729 1.1 christos mul x10,x6,x24 // lo(a[0..3]*b[0]) 1730 1.1 christos adc x0,x0,xzr // modulo-scheduled 1731 1.1 christos mul x11,x7,x24 1732 1.1 christos add x28,x28,#8 1733 1.1 christos mul x12,x8,x24 1734 1.1 christos and x28,x28,#31 // index wraps to 0 after four words 1735 1.1 christos mul x13,x9,x24 1736 1.1 christos adds x19,x19,x10 1737 1.1 christos umulh x10,x6,x24 // hi(a[0..3]*b[0]) 1738 1.1 christos adcs x20,x20,x11 1739 1.1 christos mul x25,x19,x4 // t[0]*n0 1740 1.1 christos adcs x21,x21,x12 1741 1.1 christos umulh
x11,x7,x24 1742 1.1 christos adcs x22,x22,x13 1743 1.1 christos umulh x12,x8,x24 1744 1.1 christos adc x23,xzr,xzr 1745 1.1 christos umulh x13,x9,x24 1746 1.1 christos ldr x24,[x2,x28] // next b[i] (or b[0]) 1747 1.1 christos adds x20,x20,x10 1748 1.1 christos // (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0) 1749 1.1 christos str x25,[x26],#8 // put aside t[0]*n0 for tail processing 1750 1.1 christos adcs x21,x21,x11 1751 1.1 christos mul x11,x15,x25 1752 1.1 christos adcs x22,x22,x12 1753 1.1 christos mul x12,x16,x25 1754 1.1 christos adc x23,x23,x13 // can't overflow 1755 1.1 christos mul x13,x17,x25 1756 1.1 christos // (*) adds xzr,x19,x10 1757 1.1 christos subs xzr,x19,#1 // (*) carry is set exactly when x19 is non-zero; stands in for the commented-out mul/adds pair 1758 1.1 christos umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0) 1759 1.1 christos adcs x19,x20,x11 1760 1.1 christos umulh x11,x15,x25 1761 1.1 christos adcs x20,x21,x12 1762 1.1 christos umulh x12,x16,x25 1763 1.1 christos adcs x21,x22,x13 1764 1.1 christos umulh x13,x17,x25 1765 1.1 christos adcs x22,x23,x0 1766 1.1 christos adc x0,xzr,xzr 1767 1.1 christos adds x19,x19,x10 1768 1.1 christos sub x10,x27,x1 // x10 = &a[num] - x1; zero means a[] is fully consumed 1769 1.1 christos adcs x20,x20,x11 1770 1.1 christos adcs x21,x21,x12 1771 1.1 christos adcs x22,x22,x13 1772 1.1 christos //adc x0,x0,xzr 1773 1.1 christos cbnz x28,.Loop_mul4x_1st_reduction // four rounds, until the b[] index wraps to 0 1774 1.1 christos 1775 1.1 christos cbz x10,.Lmul4x4_post_condition // nothing left in a[]: finish on the four-word-only path 1776 1.1 christos 1777 1.1 christos ldp x6,x7,[x1,#8*0] // a[4..7] 1778 1.1 christos ldp x8,x9,[x1,#8*2] 1779 1.1 christos add x1,x1,#8*4 1780 1.1 christos ldr x25,[sp] // first t[0]*n0 put aside by the loop above 1781 1.1 christos ldp x14,x15,[x3,#8*0] // n[4..7] 1782 1.1 christos ldp x16,x17,[x3,#8*2] 1783 1.1 christos add x3,x3,#8*4 1784 1.1 christos 1785 1.1 christos .Loop_mul4x_1st_tail: 1786 1.1 christos mul x10,x6,x24 // lo(a[4..7]*b[i]) 1787 1.1 christos adc x0,x0,xzr // modulo-scheduled 1788 1.1 christos mul x11,x7,x24 1789 1.1 christos add x28,x28,#8 1790 1.1 christos mul x12,x8,x24 1791 1.1 christos and x28,x28,#31 1792 1.1 christos mul x13,x9,x24 1793 1.1
christos adds x19,x19,x10 1794 1.1 christos umulh x10,x6,x24 // hi(a[4..7]*b[i]) 1795 1.1 christos adcs x20,x20,x11 1796 1.1 christos umulh x11,x7,x24 1797 1.1 christos adcs x21,x21,x12 1798 1.1 christos umulh x12,x8,x24 1799 1.1 christos adcs x22,x22,x13 1800 1.1 christos umulh x13,x9,x24 1801 1.1 christos adc x23,xzr,xzr 1802 1.1 christos ldr x24,[x2,x28] // next b[i] (or b[0]) 1803 1.1 christos adds x20,x20,x10 1804 1.1 christos mul x10,x14,x25 // lo(n[4..7]*t[0]*n0) 1805 1.1 christos adcs x21,x21,x11 1806 1.1 christos mul x11,x15,x25 1807 1.1 christos adcs x22,x22,x12 1808 1.1 christos mul x12,x16,x25 1809 1.1 christos adc x23,x23,x13 // can't overflow 1810 1.1 christos mul x13,x17,x25 1811 1.1 christos adds x19,x19,x10 1812 1.1 christos umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0) 1813 1.1 christos adcs x20,x20,x11 1814 1.1 christos umulh x11,x15,x25 1815 1.1 christos adcs x21,x21,x12 1816 1.1 christos umulh x12,x16,x25 1817 1.1 christos adcs x22,x22,x13 1818 1.1 christos adcs x23,x23,x0 1819 1.1 christos umulh x13,x17,x25 1820 1.1 christos adc x0,xzr,xzr 1821 1.1 christos ldr x25,[sp,x28] // next t[0]*n0 1822 1.1 christos str x19,[x26],#8 // result!!! 1823 1.1 christos adds x19,x20,x10 1824 1.1 christos sub x10,x27,x1 // done yet? 1825 1.1 christos adcs x20,x21,x11 1826 1.1 christos adcs x21,x22,x12 1827 1.1 christos adcs x22,x23,x13 1828 1.1 christos //adc x0,x0,xzr 1829 1.1 christos cbnz x28,.Loop_mul4x_1st_tail 1830 1.1 christos 1831 1.1 christos sub x11,x27,x5 // rewound x1: x11 = &a[num] - 8*num 1832 1.1 christos cbz x10,.Lmul4x_proceed // a[] fully consumed: move on to the next four words of b[] 1833 1.1 christos 1834 1.1 christos ldp x6,x7,[x1,#8*0] // next four words of a[] 1835 1.1 christos ldp x8,x9,[x1,#8*2] 1836 1.1 christos add x1,x1,#8*4 1837 1.1 christos ldp x14,x15,[x3,#8*0] // next four words of n[] 1838 1.1 christos ldp x16,x17,[x3,#8*2] 1839 1.1 christos add x3,x3,#8*4 1840 1.1 christos b .Loop_mul4x_1st_tail 1841 1.1 christos 1842 1.1 christos .align 5 1843 1.1 christos .Lmul4x_proceed: 1844 1.1 christos ldr x24,[x2,#8*4]!
// *++b 1845 1.1 christos adc x30,x0,xzr // x30 = top-most carry; the return address is reloaded from [x29,#8] before returning 1846 1.1 christos ldp x6,x7,[x11,#8*0] // a[0..3] 1847 1.1 christos sub x3,x3,x5 // rewind np 1848 1.1 christos ldp x8,x9,[x11,#8*2] 1849 1.1 christos add x1,x11,#8*4 // x1 = &a[4] 1850 1.1 christos 1851 1.1 christos stp x19,x20,[x26,#8*0] // result!!! 1852 1.1 christos ldp x19,x20,[sp,#8*4] // t[0..3] 1853 1.1 christos stp x21,x22,[x26,#8*2] // result!!! 1854 1.1 christos ldp x21,x22,[sp,#8*6] 1855 1.1 christos 1856 1.1 christos ldp x14,x15,[x3,#8*0] // n[0..3] 1857 1.1 christos mov x26,sp // x26 = store cursor, back at the start of the scratch area 1858 1.1 christos ldp x16,x17,[x3,#8*2] 1859 1.1 christos adds x3,x3,#8*4 // clear carry bit 1860 1.1 christos mov x0,xzr 1861 1.1 christos 1862 1.1 christos .align 4 1863 1.1 christos .Loop_mul4x_reduction: 1864 1.1 christos mul x10,x6,x24 // lo(a[0..3]*b[4]) 1865 1.1 christos adc x0,x0,xzr // modulo-scheduled 1866 1.1 christos mul x11,x7,x24 1867 1.1 christos add x28,x28,#8 1868 1.1 christos mul x12,x8,x24 1869 1.1 christos and x28,x28,#31 1870 1.1 christos mul x13,x9,x24 1871 1.1 christos adds x19,x19,x10 1872 1.1 christos umulh x10,x6,x24 // hi(a[0..3]*b[4]) 1873 1.1 christos adcs x20,x20,x11 1874 1.1 christos mul x25,x19,x4 // t[0]*n0 1875 1.1 christos adcs x21,x21,x12 1876 1.1 christos umulh x11,x7,x24 1877 1.1 christos adcs x22,x22,x13 1878 1.1 christos umulh x12,x8,x24 1879 1.1 christos adc x23,xzr,xzr 1880 1.1 christos umulh x13,x9,x24 1881 1.1 christos ldr x24,[x2,x28] // next b[i] 1882 1.1 christos adds x20,x20,x10 1883 1.1 christos // (*) mul x10,x14,x25 1884 1.1 christos str x25,[x26],#8 // put aside t[0]*n0 for tail processing 1885 1.1 christos adcs x21,x21,x11 1886 1.1 christos mul x11,x15,x25 // lo(n[0..3]*t[0]*n0) 1887 1.1 christos adcs x22,x22,x12 1888 1.1 christos mul x12,x16,x25 1889 1.1 christos adc x23,x23,x13 // can't overflow 1890 1.1 christos mul x13,x17,x25 1891 1.1 christos // (*) adds xzr,x19,x10 1892 1.1 christos subs xzr,x19,#1 // (*) carry is set exactly when x19 is non-zero; stands in for the commented-out mul/adds pair 1893 1.1 christos umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0) 1894 1.1 christos adcs
x19,x20,x11 1895 1.1 christos umulh x11,x15,x25 1896 1.1 christos adcs x20,x21,x12 1897 1.1 christos umulh x12,x16,x25 1898 1.1 christos adcs x21,x22,x13 1899 1.1 christos umulh x13,x17,x25 1900 1.1 christos adcs x22,x23,x0 1901 1.1 christos adc x0,xzr,xzr 1902 1.1 christos adds x19,x19,x10 1903 1.1 christos adcs x20,x20,x11 1904 1.1 christos adcs x21,x21,x12 1905 1.1 christos adcs x22,x22,x13 1906 1.1 christos //adc x0,x0,xzr 1907 1.1 christos cbnz x28,.Loop_mul4x_reduction // four rounds, until the b[] index wraps to 0 1908 1.1 christos 1909 1.1 christos adc x0,x0,xzr // absorb the carry the loop leaves pending (its own adc is commented out) 1910 1.1 christos ldp x10,x11,[x26,#8*4] // t[4..7] 1911 1.1 christos ldp x12,x13,[x26,#8*6] 1912 1.1 christos ldp x6,x7,[x1,#8*0] // a[4..7] 1913 1.1 christos ldp x8,x9,[x1,#8*2] 1914 1.1 christos add x1,x1,#8*4 1915 1.1 christos adds x19,x19,x10 // accumulate the t[4..7] words just loaded 1916 1.1 christos adcs x20,x20,x11 1917 1.1 christos adcs x21,x21,x12 1918 1.1 christos adcs x22,x22,x13 1919 1.1 christos //adc x0,x0,xzr 1920 1.1 christos 1921 1.1 christos ldr x25,[sp] // t[0]*n0 1922 1.1 christos ldp x14,x15,[x3,#8*0] // n[4..7] 1923 1.1 christos ldp x16,x17,[x3,#8*2] 1924 1.1 christos add x3,x3,#8*4 1925 1.1 christos 1926 1.1 christos .align 4 1927 1.1 christos .Loop_mul4x_tail: 1928 1.1 christos mul x10,x6,x24 // lo(a[4..7]*b[4]) 1929 1.1 christos adc x0,x0,xzr // modulo-scheduled 1930 1.1 christos mul x11,x7,x24 1931 1.1 christos add x28,x28,#8 1932 1.1 christos mul x12,x8,x24 1933 1.1 christos and x28,x28,#31 1934 1.1 christos mul x13,x9,x24 1935 1.1 christos adds x19,x19,x10 1936 1.1 christos umulh x10,x6,x24 // hi(a[4..7]*b[4]) 1937 1.1 christos adcs x20,x20,x11 1938 1.1 christos umulh x11,x7,x24 1939 1.1 christos adcs x21,x21,x12 1940 1.1 christos umulh x12,x8,x24 1941 1.1 christos adcs x22,x22,x13 1942 1.1 christos umulh x13,x9,x24 1943 1.1 christos adc x23,xzr,xzr 1944 1.1 christos ldr x24,[x2,x28] // next b[i] 1945 1.1 christos adds x20,x20,x10 1946 1.1 christos mul x10,x14,x25 // lo(n[4..7]*t[0]*n0) 1947 1.1 christos adcs x21,x21,x11 1948 1.1 christos mul x11,x15,x25
1949 1.1 christos adcs x22,x22,x12 1950 1.1 christos mul x12,x16,x25 1951 1.1 christos adc x23,x23,x13 // can't overflow 1952 1.1 christos mul x13,x17,x25 1953 1.1 christos adds x19,x19,x10 1954 1.1 christos umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0) 1955 1.1 christos adcs x20,x20,x11 1956 1.1 christos umulh x11,x15,x25 1957 1.1 christos adcs x21,x21,x12 1958 1.1 christos umulh x12,x16,x25 1959 1.1 christos adcs x22,x22,x13 1960 1.1 christos umulh x13,x17,x25 1961 1.1 christos adcs x23,x23,x0 1962 1.1 christos ldr x25,[sp,x28] // next t[0]*n0 1963 1.1 christos adc x0,xzr,xzr 1964 1.1 christos str x19,[x26],#8 // result!!! 1965 1.1 christos adds x19,x20,x10 1966 1.1 christos sub x10,x27,x1 // done yet? 1967 1.1 christos adcs x20,x21,x11 1968 1.1 christos adcs x21,x22,x12 1969 1.1 christos adcs x22,x23,x13 1970 1.1 christos //adc x0,x0,xzr 1971 1.1 christos cbnz x28,.Loop_mul4x_tail 1972 1.1 christos 1973 1.1 christos sub x11,x3,x5 // rewound np: x11 = x3 - 8*num, read as n[0..3] in .Loop_mul4x_break 1974 1.1 christos adc x0,x0,xzr 1975 1.1 christos cbz x10,.Loop_mul4x_break // a[] fully consumed for this group of b[] 1976 1.1 christos 1977 1.1 christos ldp x10,x11,[x26,#8*4] 1978 1.1 christos ldp x12,x13,[x26,#8*6] 1979 1.1 christos ldp x6,x7,[x1,#8*0] 1980 1.1 christos ldp x8,x9,[x1,#8*2] 1981 1.1 christos add x1,x1,#8*4 1982 1.1 christos adds x19,x19,x10 1983 1.1 christos adcs x20,x20,x11 1984 1.1 christos adcs x21,x21,x12 1985 1.1 christos adcs x22,x22,x13 1986 1.1 christos //adc x0,x0,xzr 1987 1.1 christos ldp x14,x15,[x3,#8*0] 1988 1.1 christos ldp x16,x17,[x3,#8*2] 1989 1.1 christos add x3,x3,#8*4 1990 1.1 christos b .Loop_mul4x_tail 1991 1.1 christos 1992 1.1 christos .align 4 1993 1.1 christos .Loop_mul4x_break: 1994 1.1 christos ldp x12,x13,[x29,#96] // pull rp and &b[num] 1995 1.1 christos adds x19,x19,x30 // fold in the carry parked in x30 by the previous pass 1996 1.1 christos add x2,x2,#8*4 // bp += 4 words 1997 1.1 christos adcs x20,x20,xzr 1998 1.1 christos sub x1,x1,x5 // rewind ap 1999 1.1 christos adcs x21,x21,xzr 2000 1.1 christos stp x19,x20,[x26,#8*0] // result!!!
2001 1.1 christos adcs x22,x22,xzr 2002 1.1 christos ldp x19,x20,[sp,#8*4] // t[0..3] 2003 1.1 christos adc x30,x0,xzr // x30 = top-most carry, used by the next pass or by the final borrow test 2004 1.1 christos stp x21,x22,[x26,#8*2] // result!!! 2005 1.1 christos cmp x2,x13 // done yet? 2006 1.1 christos ldp x21,x22,[sp,#8*6] 2007 1.1 christos ldp x14,x15,[x11,#8*0] // n[0..3] 2008 1.1 christos ldp x16,x17,[x11,#8*2] 2009 1.1 christos add x3,x11,#8*4 2010 1.1 christos b.eq .Lmul4x_post // x2 reached &b[num]: all of b[] consumed 2011 1.1 christos 2012 1.1 christos ldr x24,[x2] // first word of the next four-word group of b[] 2013 1.1 christos ldp x6,x7,[x1,#8*0] // a[0..3] 2014 1.1 christos ldp x8,x9,[x1,#8*2] 2015 1.1 christos adds x1,x1,#8*4 // clear carry bit 2016 1.1 christos mov x0,xzr 2017 1.1 christos mov x26,sp 2018 1.1 christos b .Loop_mul4x_reduction 2019 1.1 christos 2020 1.1 christos .align 4 2021 1.1 christos .Lmul4x_post: 2022 1.1 christos // Final step. We see if result is larger than modulus, and 2023 1.1 christos // if it is, subtract the modulus. But comparison implies 2024 1.1 christos // subtraction. So we subtract modulus, see if it borrowed, 2025 1.1 christos // and conditionally copy original value.
2026 1.1 christos mov x0,x12 2027 1.1 christos mov x27,x12 // x0 copy: both start at rp 2028 1.1 christos subs x10,x19,x14 2029 1.1 christos add x26,sp,#8*8 2030 1.1 christos sbcs x11,x20,x15 2031 1.1 christos sub x28,x5,#8*4 // x28 = byte counter for the four-word loop 2032 1.1 christos 2033 1.1 christos .Lmul4x_sub: 2034 1.1 christos sbcs x12,x21,x16 2035 1.1 christos ldp x14,x15,[x3,#8*0] 2036 1.1 christos sub x28,x28,#8*4 2037 1.1 christos ldp x19,x20,[x26,#8*0] 2038 1.1 christos sbcs x13,x22,x17 2039 1.1 christos ldp x16,x17,[x3,#8*2] 2040 1.1 christos add x3,x3,#8*4 2041 1.1 christos ldp x21,x22,[x26,#8*2] 2042 1.1 christos add x26,x26,#8*4 2043 1.1 christos stp x10,x11,[x0,#8*0] // store t[]-n[] words through x0 (rp) 2044 1.1 christos sbcs x10,x19,x14 2045 1.1 christos stp x12,x13,[x0,#8*2] 2046 1.1 christos add x0,x0,#8*4 2047 1.1 christos sbcs x11,x20,x15 2048 1.1 christos cbnz x28,.Lmul4x_sub 2049 1.1 christos 2050 1.1 christos sbcs x12,x21,x16 2051 1.1 christos mov x26,sp 2052 1.1 christos add x1,sp,#8*4 // x1 = start of t[] on the stack 2053 1.1 christos ldp x6,x7,[x27,#8*0] // read the difference back from rp 2054 1.1 christos sbcs x13,x22,x17 2055 1.1 christos stp x10,x11,[x0,#8*0] 2056 1.1 christos ldp x8,x9,[x27,#8*2] 2057 1.1 christos stp x12,x13,[x0,#8*2] 2058 1.1 christos ldp x19,x20,[x1,#8*0] 2059 1.1 christos ldp x21,x22,[x1,#8*2] 2060 1.1 christos sbcs xzr,x30,xzr // did it borrow?
2061 1.1 christos ldr x30,[x29,#8] // pull return address 2062 1.1 christos 2063 1.1 christos sub x28,x5,#8*4 2064 1.1 christos .Lmul4x_cond_copy: 2065 1.1 christos sub x28,x28,#8*4 2066 1.1 christos csel x10,x19,x6,lo // lo (borrowed): keep t[], otherwise keep t[]-n[] 2067 1.1 christos stp xzr,xzr,[x26,#8*0] // zero the stack scratch area as we go 2068 1.1 christos csel x11,x20,x7,lo 2069 1.1 christos ldp x6,x7,[x27,#8*4] 2070 1.1 christos ldp x19,x20,[x1,#8*4] 2071 1.1 christos csel x12,x21,x8,lo 2072 1.1 christos stp xzr,xzr,[x26,#8*2] 2073 1.1 christos add x26,x26,#8*4 2074 1.1 christos csel x13,x22,x9,lo 2075 1.1 christos ldp x8,x9,[x27,#8*6] 2076 1.1 christos ldp x21,x22,[x1,#8*6] 2077 1.1 christos add x1,x1,#8*4 2078 1.1 christos stp x10,x11,[x27,#8*0] 2079 1.1 christos stp x12,x13,[x27,#8*2] 2080 1.1 christos add x27,x27,#8*4 2081 1.1 christos cbnz x28,.Lmul4x_cond_copy 2082 1.1 christos 2083 1.1 christos csel x10,x19,x6,lo 2084 1.1 christos stp xzr,xzr,[x26,#8*0] 2085 1.1 christos csel x11,x20,x7,lo 2086 1.1 christos stp xzr,xzr,[x26,#8*2] 2087 1.1 christos csel x12,x21,x8,lo 2088 1.1 christos stp xzr,xzr,[x26,#8*3] // words 3-4, overlapping the pair stored just above 2089 1.1 christos csel x13,x22,x9,lo 2090 1.1 christos stp xzr,xzr,[x26,#8*4] // words 4-5 2091 1.1 christos stp x10,x11,[x27,#8*0] 2092 1.1 christos stp x12,x13,[x27,#8*2] 2093 1.1 christos 2094 1.1 christos b .Lmul4x_done 2095 1.1 christos 2096 1.1 christos .align 4 2097 1.1 christos .Lmul4x4_post_condition: 2098 1.1 christos adc x0,x0,xzr 2099 1.1 christos ldr x1,[x29,#96] // pull rp 2100 1.1 christos // x19-x22,x0 hold result, x14-x17 hold modulus 2101 1.1 christos subs x6,x19,x14 2102 1.1 christos ldr x30,[x29,#8] // pull return address 2103 1.1 christos sbcs x7,x20,x15 2104 1.1 christos stp xzr,xzr,[sp,#8*0] // zero the stack scratch area 2105 1.1 christos sbcs x8,x21,x16 2106 1.1 christos stp xzr,xzr,[sp,#8*2] 2107 1.1 christos sbcs x9,x22,x17 2108 1.1 christos stp xzr,xzr,[sp,#8*4] 2109 1.1 christos sbcs xzr,x0,xzr // did it borrow?
2110 1.1 christos stp xzr,xzr,[sp,#8*6] 2111 1.1 christos 2112 1.1 christos // x6-x9 hold result-modulus 2113 1.1 christos csel x6,x19,x6,lo // lo (borrowed): keep the unreduced value 2114 1.1 christos csel x7,x20,x7,lo 2115 1.1 christos csel x8,x21,x8,lo 2116 1.1 christos csel x9,x22,x9,lo 2117 1.1 christos stp x6,x7,[x1,#8*0] // write the four result words to rp 2118 1.1 christos stp x8,x9,[x1,#8*2] 2119 1.1 christos 2120 1.1 christos .Lmul4x_done: 2121 1.1 christos ldp x19,x20,[x29,#16] 2122 1.1 christos mov sp,x29 // drop the alloca area 2123 1.1 christos ldp x21,x22,[x29,#32] 2124 1.1 christos mov x0,#1 // return value 2125 1.1 christos ldp x23,x24,[x29,#48] 2126 1.1 christos ldp x25,x26,[x29,#64] 2127 1.1 christos ldp x27,x28,[x29,#80] 2128 1.1 christos ldr x29,[sp],#128 // restore x29 and pop the 128-byte frame 2129 1.2 christos // x30 loaded earlier 2130 1.2 christos AARCH64_VALIDATE_LINK_REGISTER 2131 1.1 christos ret 2132 1.1 christos .size __bn_mul4x_mont,.-__bn_mul4x_mont 2133 1.2 christos .section .rodata 2134 1.1 christos .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 // ASCII, NUL-terminated: "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro@openssl.org>" 2135 1.1 christos .align 2 2136 1.1 christos .align 4 2137