armv8-mont.S revision 1.1 1 .text
2
// bn_mul_mont -- Montgomery multiplication entry point (AArch64, GNU as syntax).
//
// Register roles on entry, as used by the code below:
//   x0 = rp  (result; num words are written)
//   x1 = ap, x2 = bp (multiplicands), x3 = np (modulus)
//   x4 = pointer to n0 (loaded once with ldr x4,[x4])
//   x5 = num (length in 64-bit words)
// Dispatch: num a multiple of 8 -> __bn_sqr8x_mont; otherwise num a multiple
// of 4 -> __bn_mul4x_mont; otherwise the generic word-by-word code at
// .Lmul_mont runs. Returns 1 in x0.
// The temporary vector tp (num words) lives on the stack below the frame
// and is zeroed again during the final conditional copy.
// NOTE(review): x5 is consumed as a full 64-bit register (tst/lsl/sub);
// presumably callers pass num zero-extended and >= 2 -- confirm against
// the C prototype and its callers.
3 .globl bn_mul_mont
4 .type bn_mul_mont,%function
5 .align 5
6 bn_mul_mont:
7 tst x5,#7
8 b.eq __bn_sqr8x_mont
9 tst x5,#3
10 b.eq __bn_mul4x_mont
11 .Lmul_mont:
// Prologue: 64-byte frame holding x29/x30 and callee-saved x19-x24.
12 stp x29,x30,[sp,#-64]!
13 add x29,sp,#0
14 stp x19,x20,[sp,#16]
15 stp x21,x22,[sp,#32]
16 stp x23,x24,[sp,#48]
17
// x22 = 16-byte aligned address num*8 bytes below sp; it becomes the new
// sp (and tp) once the first products are under way. x5 is converted from
// a word count to a byte count here.
18 ldr x9,[x2],#8 // bp[0]
19 sub x22,sp,x5,lsl#3
20 ldp x7,x8,[x1],#16 // ap[0..1]
21 lsl x5,x5,#3
22 ldr x4,[x4] // *n0
23 and x22,x22,#-16 // ABI says so
24 ldp x13,x14,[x3],#16 // np[0..1]
25
26 mul x6,x7,x9 // ap[0]*bp[0]
27 sub x21,x5,#16 // j=num-2
28 umulh x7,x7,x9
29 mul x10,x8,x9 // ap[1]*bp[0]
30 umulh x11,x8,x9
31
32 mul x15,x6,x4 // "tp[0]"*n0
33 mov sp,x22 // alloca
34
35 // (*) mul x12,x13,x15 // np[0]*m1
36 umulh x13,x13,x15
37 mul x16,x14,x15 // np[1]*m1
38 // (*) adds x12,x12,x6 // discarded
39 // (*) As for removal of first multiplication and addition
40 // instructions. The outcome of first addition is
41 // guaranteed to be zero, which leaves two computationally
42 // significant outcomes: it either carries or not. Then
43 // question is when does it carry? Is there alternative
44 // way to deduce it? If you follow operations, you can
45 // observe that condition for carry is quite simple:
46 // x6 being non-zero. So that carry can be calculated
47 // by adding -1 to x6. That's what next instruction does.
48 subs xzr,x6,#1 // (*)
49 umulh x17,x14,x15
50 adc x13,x13,xzr
51 cbz x21,.L1st_skip
52
// First pass (bp[0]): one word of ap[j]*bp[0] + np[j]*m1 per iteration.
// x6/x7 carry the ap product chain, x12/x13 the np product chain; the sum
// is stored one word lower (tp[j-1]). x21 counts the remaining bytes.
53 .L1st:
54 ldr x8,[x1],#8
55 adds x6,x10,x7
56 sub x21,x21,#8 // j--
57 adc x7,x11,xzr
58
59 ldr x14,[x3],#8
60 adds x12,x16,x13
61 mul x10,x8,x9 // ap[j]*bp[0]
62 adc x13,x17,xzr
63 umulh x11,x8,x9
64
65 adds x12,x12,x6
66 mul x16,x14,x15 // np[j]*m1
67 adc x13,x13,xzr
68 umulh x17,x14,x15
69 str x12,[x22],#8 // tp[j-1]
70 cbnz x21,.L1st
71
// Drain the pipeline for the last word of the first pass, rewind ap/np to
// their starts and record the top carry in x19.
72 .L1st_skip:
73 adds x6,x10,x7
74 sub x1,x1,x5 // rewind x1
75 adc x7,x11,xzr
76
77 adds x12,x16,x13
78 sub x3,x3,x5 // rewind x3
79 adc x13,x17,xzr
80
81 adds x12,x12,x6
82 sub x20,x5,#8 // i=num-1
83 adcs x13,x13,x7
84
85 adc x19,xzr,xzr // upmost overflow bit
86 stp x12,x13,[x22]
87
// Outer loop: one iteration per remaining bp[i]. Each iteration adds
// ap*bp[i] and np*m1 to tp, with m1 = (tp[0]+ap[0]*bp[i])*n0, and stores
// the sums one word lower. x20 counts the remaining bytes of bp.
88 .Louter:
89 ldr x9,[x2],#8 // bp[i]
90 ldp x7,x8,[x1],#16
91 ldr x23,[sp] // tp[0]
92 add x22,sp,#8
93
94 mul x6,x7,x9 // ap[0]*bp[i]
95 sub x21,x5,#16 // j=num-2
96 umulh x7,x7,x9
97 ldp x13,x14,[x3],#16
98 mul x10,x8,x9 // ap[1]*bp[i]
99 adds x6,x6,x23
100 umulh x11,x8,x9
101 adc x7,x7,xzr
102
103 mul x15,x6,x4
104 sub x20,x20,#8 // i--
105
106 // (*) mul x12,x13,x15 // np[0]*m1
107 umulh x13,x13,x15
108 mul x16,x14,x15 // np[1]*m1
109 // (*) adds x12,x12,x6
110 subs xzr,x6,#1 // (*)
111 umulh x17,x14,x15
112 cbz x21,.Linner_skip
113
// Inner loop. The carry flag is live across the loop back-edge: the adc at
// the top consumes the carry produced by the last adds of the previous
// iteration (or by the subs above on first entry).
114 .Linner:
115 ldr x8,[x1],#8
116 adc x13,x13,xzr
117 ldr x23,[x22],#8 // tp[j]
118 adds x6,x10,x7
119 sub x21,x21,#8 // j--
120 adc x7,x11,xzr
121
122 adds x12,x16,x13
123 ldr x14,[x3],#8
124 adc x13,x17,xzr
125
126 mul x10,x8,x9 // ap[j]*bp[i]
127 adds x6,x6,x23
128 umulh x11,x8,x9
129 adc x7,x7,xzr
130
131 mul x16,x14,x15 // np[j]*m1
132 adds x12,x12,x6
133 umulh x17,x14,x15
134 str x12,[x22,#-16] // tp[j-1]
135 cbnz x21,.Linner
136
// Last word of this outer iteration: fold in the previous top carry (x19)
// and produce the new one.
137 .Linner_skip:
138 ldr x23,[x22],#8 // tp[j]
139 adc x13,x13,xzr
140 adds x6,x10,x7
141 sub x1,x1,x5 // rewind x1
142 adc x7,x11,xzr
143
144 adds x12,x16,x13
145 sub x3,x3,x5 // rewind x3
146 adcs x13,x17,x19
147 adc x19,xzr,xzr
148
149 adds x6,x6,x23
150 adc x7,x7,xzr
151
152 adds x12,x12,x6
153 adcs x13,x13,x7
154 adc x19,x19,xzr // upmost overflow bit
155 stp x12,x13,[x22,#-16]
156
157 cbnz x20,.Louter
158
159 // Final step. We see if result is larger than modulus, and
160 // if it is, subtract the modulus. But comparison implies
161 // subtraction. So we subtract modulus, see if it borrowed,
162 // and conditionally copy original value.
163 ldr x23,[sp] // tp[0]
164 add x22,sp,#8
165 ldr x14,[x3],#8 // np[0]
166 subs x21,x5,#8 // j=num-1 and clear borrow
167 mov x1,x0
// rp = tp - np, word by word; x1 is a running copy of rp.
168 .Lsub:
169 sbcs x8,x23,x14 // tp[j]-np[j]
170 ldr x23,[x22],#8
171 sub x21,x21,#8 // j--
172 ldr x14,[x3],#8
173 str x8,[x1],#8 // rp[j]=tp[j]-np[j]
174 cbnz x21,.Lsub
175
176 sbcs x8,x23,x14
177 sbcs x19,x19,xzr // did it borrow?
178 str x8,[x1],#8 // rp[num-1]
179
// Conditional copy: the flags from the sbcs above stay live through the
// loop (only sub/csel/ldr/str/cbnz follow, none of which set flags).
// On borrow ("lo") the original tp word is kept, otherwise the difference
// already stored in rp. Every tp word is overwritten with zero.
180 ldr x23,[sp] // tp[0]
181 add x22,sp,#8
182 ldr x8,[x0],#8 // rp[0]
183 sub x5,x5,#8 // num--
184 nop
185 .Lcond_copy:
186 sub x5,x5,#8 // num--
187 csel x14,x23,x8,lo // did it borrow?
188 ldr x23,[x22],#8
189 ldr x8,[x0],#8
190 str xzr,[x22,#-16] // wipe tp
191 str x14,[x0,#-16]
192 cbnz x5,.Lcond_copy
193
194 csel x14,x23,x8,lo
195 str xzr,[x22,#-8] // wipe tp
196 str x14,[x0,#-8]
197
// Epilogue: restore callee-saved registers, drop the alloca and the frame,
// return 1. x30 is never written on this path, so only x29 is reloaded.
198 ldp x19,x20,[x29,#16]
199 mov sp,x29
200 ldp x21,x22,[x29,#32]
201 mov x0,#1
202 ldp x23,x24,[x29,#48]
203 ldr x29,[sp],#64
204 ret
205 .size bn_mul_mont,.-bn_mul_mont
// __bn_sqr8x_mont -- Montgomery squaring, 8 words at a time.
//
// Reached from bn_mul_mont when num (x5) is a multiple of 8. Register
// arguments are the same as for bn_mul_mont (x0=rp, x1=ap, x2=bp, x3=np,
// x4=&n0, x5=num). If ap != bp the operation is not a squaring and control
// transfers to __bn_mul4x_mont.
//
// Frame (128 bytes at x29): x29/x30, x19-x28, rp at #96, np at #104,
// n0 at #112. Scratch of 2*num words is allocated below the frame.
// x30 is used as the top-most carry during reduction and is reloaded from
// the frame before returning. Returns 1 in x0.
//
// Phases: (1) sum of cross products a[i]*a[j], i<j; (2) double that sum
// and add the squares a[i]*a[i]; (3) Montgomery reduction, 8 words per
// pass; (4) subtract the modulus and conditionally keep the original,
// zeroing the scratch area on the way out.
206 .type __bn_sqr8x_mont,%function
207 .align 5
208 __bn_sqr8x_mont:
209 cmp x1,x2
210 b.ne __bn_mul4x_mont
211 .Lsqr8x_mont:
212 .inst 0xd503233f // paciasp
213 stp x29,x30,[sp,#-128]!
214 add x29,sp,#0
215 stp x19,x20,[sp,#16]
216 stp x21,x22,[sp,#32]
217 stp x23,x24,[sp,#48]
218 stp x25,x26,[sp,#64]
219 stp x27,x28,[sp,#80]
220 stp x0,x3,[sp,#96] // offload rp and np
221
222 ldp x6,x7,[x1,#8*0]
223 ldp x8,x9,[x1,#8*2]
224 ldp x10,x11,[x1,#8*4]
225 ldp x12,x13,[x1,#8*6]
226
227 sub x2,sp,x5,lsl#4
228 lsl x5,x5,#3
229 ldr x4,[x4] // *n0
230 mov sp,x2 // alloca
231 sub x27,x5,#8*8
232 b .Lsqr8x_zero_start
233
// Zero the scratch area. Entry is at .Lsqr8x_zero_start, so the first
// 8 words are skipped; they are written by the stores of t[0..7] in the
// first pass of .Lsqr8x_outer_loop.
234 .Lsqr8x_zero:
235 sub x27,x27,#8*8
236 stp xzr,xzr,[x2,#8*0]
237 stp xzr,xzr,[x2,#8*2]
238 stp xzr,xzr,[x2,#8*4]
239 stp xzr,xzr,[x2,#8*6]
240 .Lsqr8x_zero_start:
241 stp xzr,xzr,[x2,#8*8]
242 stp xzr,xzr,[x2,#8*10]
243 stp xzr,xzr,[x2,#8*12]
244 stp xzr,xzr,[x2,#8*14]
245 add x2,x2,#8*16
246 cbnz x27,.Lsqr8x_zero
247
// x3 = end of a[], x1 = next 8-word group of a[], x19-x26 = accumulator
// window (cleared), x2 = t[] pointer.
248 add x3,x1,x5
249 add x1,x1,#8*8
250 mov x19,xzr
251 mov x20,xzr
252 mov x21,xzr
253 mov x22,xzr
254 mov x23,xzr
255 mov x24,xzr
256 mov x25,xzr
257 mov x26,xzr
258 mov x2,sp
259 str x4,[x29,#112] // offload n0
260
261 // Multiply everything but a[i]*a[i]
262 .align 4
263 .Lsqr8x_outer_loop:
264 // a[1]a[0] (i)
265 // a[2]a[0]
266 // a[3]a[0]
267 // a[4]a[0]
268 // a[5]a[0]
269 // a[6]a[0]
270 // a[7]a[0]
271 // a[2]a[1] (ii)
272 // a[3]a[1]
273 // a[4]a[1]
274 // a[5]a[1]
275 // a[6]a[1]
276 // a[7]a[1]
277 // a[3]a[2] (iii)
278 // a[4]a[2]
279 // a[5]a[2]
280 // a[6]a[2]
281 // a[7]a[2]
282 // a[4]a[3] (iv)
283 // a[5]a[3]
284 // a[6]a[3]
285 // a[7]a[3]
286 // a[5]a[4] (v)
287 // a[6]a[4]
288 // a[7]a[4]
289 // a[6]a[5] (vi)
290 // a[7]a[5]
291 // a[7]a[6] (vii)
292
293 mul x14,x7,x6 // lo(a[1..7]*a[0]) (i)
294 mul x15,x8,x6
295 mul x16,x9,x6
296 mul x17,x10,x6
297 adds x20,x20,x14 // t[1]+lo(a[1]*a[0])
298 mul x14,x11,x6
299 adcs x21,x21,x15
300 mul x15,x12,x6
301 adcs x22,x22,x16
302 mul x16,x13,x6
303 adcs x23,x23,x17
304 umulh x17,x7,x6 // hi(a[1..7]*a[0])
305 adcs x24,x24,x14
306 umulh x14,x8,x6
307 adcs x25,x25,x15
308 umulh x15,x9,x6
309 adcs x26,x26,x16
310 umulh x16,x10,x6
311 stp x19,x20,[x2],#8*2 // t[0..1]
312 adc x19,xzr,xzr // t[8]
313 adds x21,x21,x17 // t[2]+hi(a[1]*a[0])
314 umulh x17,x11,x6
315 adcs x22,x22,x14
316 umulh x14,x12,x6
317 adcs x23,x23,x15
318 umulh x15,x13,x6
319 adcs x24,x24,x16
320 mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii)
321 adcs x25,x25,x17
322 mul x17,x9,x7
323 adcs x26,x26,x14
324 mul x14,x10,x7
325 adc x19,x19,x15
326
327 mul x15,x11,x7
328 adds x22,x22,x16
329 mul x16,x12,x7
330 adcs x23,x23,x17
331 mul x17,x13,x7
332 adcs x24,x24,x14
333 umulh x14,x8,x7 // hi(a[2..7]*a[1])
334 adcs x25,x25,x15
335 umulh x15,x9,x7
336 adcs x26,x26,x16
337 umulh x16,x10,x7
338 adcs x19,x19,x17
339 umulh x17,x11,x7
340 stp x21,x22,[x2],#8*2 // t[2..3]
341 adc x20,xzr,xzr // t[9]
342 adds x23,x23,x14
343 umulh x14,x12,x7
344 adcs x24,x24,x15
345 umulh x15,x13,x7
346 adcs x25,x25,x16
347 mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii)
348 adcs x26,x26,x17
349 mul x17,x10,x8
350 adcs x19,x19,x14
351 mul x14,x11,x8
352 adc x20,x20,x15
353
354 mul x15,x12,x8
355 adds x24,x24,x16
356 mul x16,x13,x8
357 adcs x25,x25,x17
358 umulh x17,x9,x8 // hi(a[3..7]*a[2])
359 adcs x26,x26,x14
360 umulh x14,x10,x8
361 adcs x19,x19,x15
362 umulh x15,x11,x8
363 adcs x20,x20,x16
364 umulh x16,x12,x8
365 stp x23,x24,[x2],#8*2 // t[4..5]
366 adc x21,xzr,xzr // t[10]
367 adds x25,x25,x17
368 umulh x17,x13,x8
369 adcs x26,x26,x14
370 mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv)
371 adcs x19,x19,x15
372 mul x15,x11,x9
373 adcs x20,x20,x16
374 mul x16,x12,x9
375 adc x21,x21,x17
376
377 mul x17,x13,x9
378 adds x26,x26,x14
379 umulh x14,x10,x9 // hi(a[4..7]*a[3])
380 adcs x19,x19,x15
381 umulh x15,x11,x9
382 adcs x20,x20,x16
383 umulh x16,x12,x9
384 adcs x21,x21,x17
385 umulh x17,x13,x9
386 stp x25,x26,[x2],#8*2 // t[6..7]
387 adc x22,xzr,xzr // t[11]
388 adds x19,x19,x14
389 mul x14,x11,x10 // lo(a[5..7]*a[4]) (v)
390 adcs x20,x20,x15
391 mul x15,x12,x10
392 adcs x21,x21,x16
393 mul x16,x13,x10
394 adc x22,x22,x17
395
396 umulh x17,x11,x10 // hi(a[5..7]*a[4])
397 adds x20,x20,x14
398 umulh x14,x12,x10
399 adcs x21,x21,x15
400 umulh x15,x13,x10
401 adcs x22,x22,x16
402 mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi)
403 adc x23,xzr,xzr // t[12]
404 adds x21,x21,x17
405 mul x17,x13,x11
406 adcs x22,x22,x14
407 umulh x14,x12,x11 // hi(a[6..7]*a[5])
408 adc x23,x23,x15
409
410 umulh x15,x13,x11
411 adds x22,x22,x16
412 mul x16,x13,x12 // lo(a[7]*a[6]) (vii)
413 adcs x23,x23,x17
414 umulh x17,x13,x12 // hi(a[7]*a[6])
415 adc x24,xzr,xzr // t[13]
416 adds x23,x23,x14
417 sub x27,x3,x1 // done yet?
418 adc x24,x24,x15
419
420 adds x24,x24,x16
421 sub x14,x3,x5 // rewinded ap
422 adc x25,xzr,xzr // t[14]
423 add x25,x25,x17
424
425 cbz x27,.Lsqr8x_outer_break
426
// More a[] words remain: x4 becomes the first word of the group just
// processed (the multiplier), x19-x26 absorb the next 8 words of t[],
// x6-x13 are reloaded with the next group of a[] and x0 remembers where
// that group starts so [x0,x27] (x27 = -64..-8) walks the multiplier group.
427 mov x4,x6
428 ldp x6,x7,[x2,#8*0]
429 ldp x8,x9,[x2,#8*2]
430 ldp x10,x11,[x2,#8*4]
431 ldp x12,x13,[x2,#8*6]
432 adds x19,x19,x6
433 adcs x20,x20,x7
434 ldp x6,x7,[x1,#8*0]
435 adcs x21,x21,x8
436 adcs x22,x22,x9
437 ldp x8,x9,[x1,#8*2]
438 adcs x23,x23,x10
439 adcs x24,x24,x11
440 ldp x10,x11,[x1,#8*4]
441 adcs x25,x25,x12
442 mov x0,x1
443 adcs x26,xzr,x13
444 ldp x12,x13,[x1,#8*6]
445 add x1,x1,#8*8
446 //adc x28,xzr,xzr // moved below
447 mov x27,#-8*8
448
449 // a[8]a[0]
450 // a[9]a[0]
451 // a[a]a[0]
452 // a[b]a[0]
453 // a[c]a[0]
454 // a[d]a[0]
455 // a[e]a[0]
456 // a[f]a[0]
457 // a[8]a[1]
458 // a[f]a[1]........................
459 // a[8]a[2]
460 // a[f]a[2]........................
461 // a[8]a[3]
462 // a[f]a[3]........................
463 // a[8]a[4]
464 // a[f]a[4]........................
465 // a[8]a[5]
466 // a[f]a[5]........................
467 // a[8]a[6]
468 // a[f]a[6]........................
469 // a[8]a[7]
470 // a[f]a[7]........................
// One multiplier word (x4) times eight a[] words (x6-x13) per iteration,
// accumulated into the sliding window x19-x26; one finished word of t[] is
// stored per iteration. x28 collects the carry out of the window.
471 .Lsqr8x_mul:
472 mul x14,x6,x4
473 adc x28,xzr,xzr // carry bit, modulo-scheduled
474 mul x15,x7,x4
475 add x27,x27,#8
476 mul x16,x8,x4
477 mul x17,x9,x4
478 adds x19,x19,x14
479 mul x14,x10,x4
480 adcs x20,x20,x15
481 mul x15,x11,x4
482 adcs x21,x21,x16
483 mul x16,x12,x4
484 adcs x22,x22,x17
485 mul x17,x13,x4
486 adcs x23,x23,x14
487 umulh x14,x6,x4
488 adcs x24,x24,x15
489 umulh x15,x7,x4
490 adcs x25,x25,x16
491 umulh x16,x8,x4
492 adcs x26,x26,x17
493 umulh x17,x9,x4
494 adc x28,x28,xzr
495 str x19,[x2],#8
496 adds x19,x20,x14
497 umulh x14,x10,x4
498 adcs x20,x21,x15
499 umulh x15,x11,x4
500 adcs x21,x22,x16
501 umulh x16,x12,x4
502 adcs x22,x23,x17
503 umulh x17,x13,x4
504 ldr x4,[x0,x27]
505 adcs x23,x24,x14
506 adcs x24,x25,x15
507 adcs x25,x26,x16
508 adcs x26,x28,x17
509 //adc x28,xzr,xzr // moved above
510 cbnz x27,.Lsqr8x_mul
511 // note that carry flag is guaranteed
512 // to be zero at this point
513 cmp x1,x3 // done yet?
514 b.eq .Lsqr8x_break
515
// Same multiplier group, next group of a[]: add the next 8 words of t[]
// into the window and restart the multiplier index at the group's first
// word ([x0,#-64]).
516 ldp x6,x7,[x2,#8*0]
517 ldp x8,x9,[x2,#8*2]
518 ldp x10,x11,[x2,#8*4]
519 ldp x12,x13,[x2,#8*6]
520 adds x19,x19,x6
521 ldr x4,[x0,#-8*8]
522 adcs x20,x20,x7
523 ldp x6,x7,[x1,#8*0]
524 adcs x21,x21,x8
525 adcs x22,x22,x9
526 ldp x8,x9,[x1,#8*2]
527 adcs x23,x23,x10
528 adcs x24,x24,x11
529 ldp x10,x11,[x1,#8*4]
530 adcs x25,x25,x12
531 mov x27,#-8*8
532 adcs x26,x26,x13
533 ldp x12,x13,[x1,#8*6]
534 add x1,x1,#8*8
535 //adc x28,xzr,xzr // moved above
536 b .Lsqr8x_mul
537
// End of a row: reload x6-x13 with the group at x0 (it becomes the next
// diagonal group), set x1 just past it, and rewind the t[] pointer by the
// number of bytes still left in a[] (x14). When nothing is left the window
// registers carry straight over into the outer loop; otherwise the window
// is stored and reloaded from the rewound position.
538 .align 4
539 .Lsqr8x_break:
540 ldp x6,x7,[x0,#8*0]
541 add x1,x0,#8*8
542 ldp x8,x9,[x0,#8*2]
543 sub x14,x3,x1 // is it last iteration?
544 ldp x10,x11,[x0,#8*4]
545 sub x15,x2,x14
546 ldp x12,x13,[x0,#8*6]
547 cbz x14,.Lsqr8x_outer_loop
548
549 stp x19,x20,[x2,#8*0]
550 ldp x19,x20,[x15,#8*0]
551 stp x21,x22,[x2,#8*2]
552 ldp x21,x22,[x15,#8*2]
553 stp x23,x24,[x2,#8*4]
554 ldp x23,x24,[x15,#8*4]
555 stp x25,x26,[x2,#8*6]
556 mov x2,x15
557 ldp x25,x26,[x15,#8*6]
558 b .Lsqr8x_outer_loop
559
560 .align 4
561 .Lsqr8x_outer_break:
562 // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
563 ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0]
564 ldp x15,x16,[sp,#8*1]
565 ldp x11,x13,[x14,#8*2]
566 add x1,x14,#8*4
567 ldp x17,x14,[sp,#8*3]
568
569 stp x19,x20,[x2,#8*0]
570 mul x19,x7,x7
571 stp x21,x22,[x2,#8*2]
572 umulh x7,x7,x7
573 stp x23,x24,[x2,#8*4]
574 mul x8,x9,x9
575 stp x25,x26,[x2,#8*6]
576 mov x2,sp
577 umulh x9,x9,x9
578 adds x20,x7,x15,lsl#1
579 extr x15,x16,x15,#63
580 sub x27,x5,#8*4
581
// Each extr ...,#63 yields one word of the cross-product sum shifted left
// by one bit (the doubling); the squares a[i]*a[i] are added with a single
// carry chain that stays live across the loop back-edge. Four words of
// a[] (eight words of t[]) are handled per iteration.
582 .Lsqr4x_shift_n_add:
583 adcs x21,x8,x15
584 extr x16,x17,x16,#63
585 sub x27,x27,#8*4
586 adcs x22,x9,x16
587 ldp x15,x16,[x2,#8*5]
588 mul x10,x11,x11
589 ldp x7,x9,[x1],#8*2
590 umulh x11,x11,x11
591 mul x12,x13,x13
592 umulh x13,x13,x13
593 extr x17,x14,x17,#63
594 stp x19,x20,[x2,#8*0]
595 adcs x23,x10,x17
596 extr x14,x15,x14,#63
597 stp x21,x22,[x2,#8*2]
598 adcs x24,x11,x14
599 ldp x17,x14,[x2,#8*7]
600 extr x15,x16,x15,#63
601 adcs x25,x12,x15
602 extr x16,x17,x16,#63
603 adcs x26,x13,x16
604 ldp x15,x16,[x2,#8*9]
605 mul x6,x7,x7
606 ldp x11,x13,[x1],#8*2
607 umulh x7,x7,x7
608 mul x8,x9,x9
609 umulh x9,x9,x9
610 stp x23,x24,[x2,#8*4]
611 extr x17,x14,x17,#63
612 stp x25,x26,[x2,#8*6]
613 add x2,x2,#8*8
614 adcs x19,x6,x17
615 extr x14,x15,x14,#63
616 adcs x20,x7,x14
617 ldp x17,x14,[x2,#8*3]
618 extr x15,x16,x15,#63
619 cbnz x27,.Lsqr4x_shift_n_add
620 ldp x1,x4,[x29,#104] // pull np and n0
621
// Last four words of the shift-and-add; meanwhile the first 8 words of the
// modulus (x6-x13) and of t[] (x19-x26) are loaded for the reduction.
622 adcs x21,x8,x15
623 extr x16,x17,x16,#63
624 adcs x22,x9,x16
625 ldp x15,x16,[x2,#8*5]
626 mul x10,x11,x11
627 umulh x11,x11,x11
628 stp x19,x20,[x2,#8*0]
629 mul x12,x13,x13
630 umulh x13,x13,x13
631 stp x21,x22,[x2,#8*2]
632 extr x17,x14,x17,#63
633 adcs x23,x10,x17
634 extr x14,x15,x14,#63
635 ldp x19,x20,[sp,#8*0]
636 adcs x24,x11,x14
637 extr x15,x16,x15,#63
638 ldp x6,x7,[x1,#8*0]
639 adcs x25,x12,x15
640 extr x16,xzr,x16,#63
641 ldp x8,x9,[x1,#8*2]
642 adc x26,x13,x16
643 ldp x10,x11,[x1,#8*4]
644
645 // Reduce by 512 bits per iteration
646 mul x28,x4,x19 // t[0]*n0
647 ldp x12,x13,[x1,#8*6]
648 add x3,x1,x5
649 ldp x21,x22,[sp,#8*2]
650 stp x23,x24,[x2,#8*4]
651 ldp x23,x24,[sp,#8*4]
652 stp x25,x26,[x2,#8*6]
653 ldp x25,x26,[sp,#8*6]
654 add x1,x1,#8*8
655 mov x30,xzr // initial top-most carry
656 mov x2,sp
657 mov x27,#8
658
// Eight reduction steps on the window x19-x26 using n[0..7] (x6-x13).
// Each step's multiplier x28 = t[0]*n0 is saved to t[] for the tail. As in
// bn_mul_mont, the low product word is not computed: its sum with t[0] is
// zero by construction, and "subs xzr,x19,#1" regenerates the carry
// (set exactly when x19 is non-zero).
659 .Lsqr8x_reduction:
660 // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0)
661 mul x15,x7,x28
662 sub x27,x27,#1
663 mul x16,x8,x28
664 str x28,[x2],#8 // put aside t[0]*n0 for tail processing
665 mul x17,x9,x28
666 // (*) adds xzr,x19,x14
667 subs xzr,x19,#1 // (*)
668 mul x14,x10,x28
669 adcs x19,x20,x15
670 mul x15,x11,x28
671 adcs x20,x21,x16
672 mul x16,x12,x28
673 adcs x21,x22,x17
674 mul x17,x13,x28
675 adcs x22,x23,x14
676 umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0)
677 adcs x23,x24,x15
678 umulh x15,x7,x28
679 adcs x24,x25,x16
680 umulh x16,x8,x28
681 adcs x25,x26,x17
682 umulh x17,x9,x28
683 adc x26,xzr,xzr
684 adds x19,x19,x14
685 umulh x14,x10,x28
686 adcs x20,x20,x15
687 umulh x15,x11,x28
688 adcs x21,x21,x16
689 umulh x16,x12,x28
690 adcs x22,x22,x17
691 umulh x17,x13,x28
692 mul x28,x4,x19 // next t[0]*n0
693 adcs x23,x23,x14
694 adcs x24,x24,x15
695 adcs x25,x25,x16
696 adc x26,x26,x17
697 cbnz x27,.Lsqr8x_reduction
698
// Add the next 8 words of t[] into the window. x0 marks the end of the
// eight saved multipliers, so [x0,x27] with x27 = -64..-8 walks them.
// x27 == 0 here means the modulus has only 8 words (num == 8).
699 ldp x14,x15,[x2,#8*0]
700 ldp x16,x17,[x2,#8*2]
701 mov x0,x2
702 sub x27,x3,x1 // done yet?
703 adds x19,x19,x14
704 adcs x20,x20,x15
705 ldp x14,x15,[x2,#8*4]
706 adcs x21,x21,x16
707 adcs x22,x22,x17
708 ldp x16,x17,[x2,#8*6]
709 adcs x23,x23,x14
710 adcs x24,x24,x15
711 adcs x25,x25,x16
712 adcs x26,x26,x17
713 //adc x28,xzr,xzr // moved below
714 cbz x27,.Lsqr8x8_post_condition
715
716 ldr x4,[x2,#-8*8]
717 ldp x6,x7,[x1,#8*0]
718 ldp x8,x9,[x1,#8*2]
719 ldp x10,x11,[x1,#8*4]
720 mov x27,#-8*8
721 ldp x12,x13,[x1,#8*6]
722 add x1,x1,#8*8
723
// Tail: apply the eight saved multipliers (x4) to the remaining words of
// the modulus, 8 words (x6-x13) at a time. Same structure as .Lsqr8x_mul.
724 .Lsqr8x_tail:
725 mul x14,x6,x4
726 adc x28,xzr,xzr // carry bit, modulo-scheduled
727 mul x15,x7,x4
728 add x27,x27,#8
729 mul x16,x8,x4
730 mul x17,x9,x4
731 adds x19,x19,x14
732 mul x14,x10,x4
733 adcs x20,x20,x15
734 mul x15,x11,x4
735 adcs x21,x21,x16
736 mul x16,x12,x4
737 adcs x22,x22,x17
738 mul x17,x13,x4
739 adcs x23,x23,x14
740 umulh x14,x6,x4
741 adcs x24,x24,x15
742 umulh x15,x7,x4
743 adcs x25,x25,x16
744 umulh x16,x8,x4
745 adcs x26,x26,x17
746 umulh x17,x9,x4
747 adc x28,x28,xzr
748 str x19,[x2],#8
749 adds x19,x20,x14
750 umulh x14,x10,x4
751 adcs x20,x21,x15
752 umulh x15,x11,x4
753 adcs x21,x22,x16
754 umulh x16,x12,x4
755 adcs x22,x23,x17
756 umulh x17,x13,x4
757 ldr x4,[x0,x27]
758 adcs x23,x24,x14
759 adcs x24,x25,x15
760 adcs x25,x26,x16
761 adcs x26,x28,x17
762 //adc x28,xzr,xzr // moved above
763 cbnz x27,.Lsqr8x_tail
764 // note that carry flag is guaranteed
765 // to be zero at this point
766 ldp x6,x7,[x2,#8*0]
767 sub x27,x3,x1 // done yet?
768 sub x16,x3,x5 // rewinded np
769 ldp x8,x9,[x2,#8*2]
770 ldp x10,x11,[x2,#8*4]
771 ldp x12,x13,[x2,#8*6]
772 cbz x27,.Lsqr8x_tail_break
773
774 ldr x4,[x0,#-8*8]
775 adds x19,x19,x6
776 adcs x20,x20,x7
777 ldp x6,x7,[x1,#8*0]
778 adcs x21,x21,x8
779 adcs x22,x22,x9
780 ldp x8,x9,[x1,#8*2]
781 adcs x23,x23,x10
782 adcs x24,x24,x11
783 ldp x10,x11,[x1,#8*4]
784 adcs x25,x25,x12
785 mov x27,#-8*8
786 adcs x26,x26,x13
787 ldp x12,x13,[x1,#8*6]
788 add x1,x1,#8*8
789 //adc x28,xzr,xzr // moved above
790 b .Lsqr8x_tail
791
// End of one 512-bit reduction pass: fold in the top-most carry kept in
// x30, store the window, reload n[0..7] and the next window of t[] from
// x0, and repeat until the window end (x27) reaches the frame at x29.
792 .align 4
793 .Lsqr8x_tail_break:
794 ldr x4,[x29,#112] // pull n0
795 add x27,x2,#8*8 // end of current t[num] window
796
797 subs xzr,x30,#1 // "move" top-most carry to carry bit
798 adcs x14,x19,x6
799 adcs x15,x20,x7
800 ldp x19,x20,[x0,#8*0]
801 adcs x21,x21,x8
802 ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0]
803 adcs x22,x22,x9
804 ldp x8,x9,[x16,#8*2]
805 adcs x23,x23,x10
806 adcs x24,x24,x11
807 ldp x10,x11,[x16,#8*4]
808 adcs x25,x25,x12
809 adcs x26,x26,x13
810 ldp x12,x13,[x16,#8*6]
811 add x1,x16,#8*8
812 adc x30,xzr,xzr // top-most carry
813 mul x28,x4,x19
814 stp x14,x15,[x2,#8*0]
815 stp x21,x22,[x2,#8*2]
816 ldp x21,x22,[x0,#8*2]
817 stp x23,x24,[x2,#8*4]
818 ldp x23,x24,[x0,#8*4]
819 cmp x27,x29 // did we hit the bottom?
820 stp x25,x26,[x2,#8*6]
821 mov x2,x0 // slide the window
822 ldp x25,x26,[x0,#8*6]
823 mov x27,#8
824 b.ne .Lsqr8x_reduction
825
826 // Final step. We see if result is larger than modulus, and
827 // if it is, subtract the modulus. But comparison implies
828 // subtraction. So we subtract modulus, see if it borrowed,
829 // and conditionally copy original value.
830 ldr x0,[x29,#96] // pull rp
831 add x2,x2,#8*8
832 subs x14,x19,x6
833 sbcs x15,x20,x7
834 sub x27,x5,#8*8
835 mov x3,x0 // x0 copy
836
// rp = t - n, 8 words per iteration; the borrow chain stays live across
// the loop back-edge.
837 .Lsqr8x_sub:
838 sbcs x16,x21,x8
839 ldp x6,x7,[x1,#8*0]
840 sbcs x17,x22,x9
841 stp x14,x15,[x0,#8*0]
842 sbcs x14,x23,x10
843 ldp x8,x9,[x1,#8*2]
844 sbcs x15,x24,x11
845 stp x16,x17,[x0,#8*2]
846 sbcs x16,x25,x12
847 ldp x10,x11,[x1,#8*4]
848 sbcs x17,x26,x13
849 ldp x12,x13,[x1,#8*6]
850 add x1,x1,#8*8
851 ldp x19,x20,[x2,#8*0]
852 sub x27,x27,#8*8
853 ldp x21,x22,[x2,#8*2]
854 ldp x23,x24,[x2,#8*4]
855 ldp x25,x26,[x2,#8*6]
856 add x2,x2,#8*8
857 stp x14,x15,[x0,#8*4]
858 sbcs x14,x19,x6
859 stp x16,x17,[x0,#8*6]
860 add x0,x0,#8*8
861 sbcs x15,x20,x7
862 cbnz x27,.Lsqr8x_sub
863
// Last 8 words of the subtraction, interleaved with setup for the
// conditional copy: x2 = sp (lower scratch half, wipe only), x1 = sp + num
// words (the un-subtracted value), x3 = rp (the difference).
864 sbcs x16,x21,x8
865 mov x2,sp
866 add x1,sp,x5
867 ldp x6,x7,[x3,#8*0]
868 sbcs x17,x22,x9
869 stp x14,x15,[x0,#8*0]
870 sbcs x14,x23,x10
871 ldp x8,x9,[x3,#8*2]
872 sbcs x15,x24,x11
873 stp x16,x17,[x0,#8*2]
874 sbcs x16,x25,x12
875 ldp x19,x20,[x1,#8*0]
876 sbcs x17,x26,x13
877 ldp x21,x22,[x1,#8*2]
878 sbcs xzr,x30,xzr // did it borrow?
879 ldr x30,[x29,#8] // pull return address
880 stp x14,x15,[x0,#8*4]
881 stp x16,x17,[x0,#8*6]
882
// Conditional copy, 4 words per iteration; flags from the sbcs above stay
// live (no flag-setting instruction follows). On borrow ("lo") the
// un-subtracted word is kept. The loop zeroes 4 words at x2 (before it
// advances) and 4 words at x1 (after it advances), so the first 4 words of
// the upper half are not covered inside the loop.
883 sub x27,x5,#8*4
884 .Lsqr4x_cond_copy:
885 sub x27,x27,#8*4
886 csel x14,x19,x6,lo
887 stp xzr,xzr,[x2,#8*0]
888 csel x15,x20,x7,lo
889 ldp x6,x7,[x3,#8*4]
890 ldp x19,x20,[x1,#8*4]
891 csel x16,x21,x8,lo
892 stp xzr,xzr,[x2,#8*2]
893 add x2,x2,#8*4
894 csel x17,x22,x9,lo
895 ldp x8,x9,[x3,#8*6]
896 ldp x21,x22,[x1,#8*6]
897 add x1,x1,#8*4
898 stp x14,x15,[x3,#8*0]
899 stp x16,x17,[x3,#8*2]
900 add x3,x3,#8*4
901 stp xzr,xzr,[x1,#8*0]
902 stp xzr,xzr,[x1,#8*2]
903 cbnz x27,.Lsqr4x_cond_copy
904
// Final 4 words. Here x2 = sp + (num-4) words, so offsets 0 and 2 finish
// the lower half and offsets 4 and 6 clear scratch words num..num+3, the
// four upper-half words the loop never zeroes. Their values were already
// loaded into x19-x22 before the loop, so clearing them now is safe.
905 csel x14,x19,x6,lo
906 stp xzr,xzr,[x2,#8*0]
907 csel x15,x20,x7,lo
908 stp xzr,xzr,[x2,#8*2]
909 csel x16,x21,x8,lo
stp xzr,xzr,[x2,#8*4] // wipe t[num..num+1]
910 csel x17,x22,x9,lo
stp xzr,xzr,[x2,#8*6] // wipe t[num+2..num+3]
911 stp x14,x15,[x3,#8*0]
912 stp x16,x17,[x3,#8*2]
913
914 b .Lsqr8x_done
915
// num == 8: the whole reduced value is in registers, so subtract, select
// and store directly while zeroing all 16 scratch words.
916 .align 4
917 .Lsqr8x8_post_condition:
918 adc x28,xzr,xzr
919 ldr x30,[x29,#8] // pull return address
920 // x19-x26,x28 hold result, x6-x13 hold modulus
921 subs x6,x19,x6
922 ldr x1,[x29,#96] // pull rp
923 sbcs x7,x20,x7
924 stp xzr,xzr,[sp,#8*0]
925 sbcs x8,x21,x8
926 stp xzr,xzr,[sp,#8*2]
927 sbcs x9,x22,x9
928 stp xzr,xzr,[sp,#8*4]
929 sbcs x10,x23,x10
930 stp xzr,xzr,[sp,#8*6]
931 sbcs x11,x24,x11
932 stp xzr,xzr,[sp,#8*8]
933 sbcs x12,x25,x12
934 stp xzr,xzr,[sp,#8*10]
935 sbcs x13,x26,x13
936 stp xzr,xzr,[sp,#8*12]
937 sbcs x28,x28,xzr // did it borrow?
938 stp xzr,xzr,[sp,#8*14]
939
940 // x6-x13 hold result-modulus
941 csel x6,x19,x6,lo
942 csel x7,x20,x7,lo
943 csel x8,x21,x8,lo
944 csel x9,x22,x9,lo
945 stp x6,x7,[x1,#8*0]
946 csel x10,x23,x10,lo
947 csel x11,x24,x11,lo
948 stp x8,x9,[x1,#8*2]
949 csel x12,x25,x12,lo
950 csel x13,x26,x13,lo
951 stp x10,x11,[x1,#8*4]
952 stp x12,x13,[x1,#8*6]
953
// Epilogue: restore callee-saved registers, drop scratch and frame,
// return 1. x30 was already reloaded on both paths leading here.
954 .Lsqr8x_done:
955 ldp x19,x20,[x29,#16]
956 mov sp,x29
957 ldp x21,x22,[x29,#32]
958 mov x0,#1
959 ldp x23,x24,[x29,#48]
960 ldp x25,x26,[x29,#64]
961 ldp x27,x28,[x29,#80]
962 ldr x29,[sp],#128
963 .inst 0xd50323bf // autiasp
964 ret
965 .size __bn_sqr8x_mont,.-__bn_sqr8x_mont
// __bn_mul4x_mont -- Montgomery multiplication, 4 words at a time.
//
// Reached from bn_mul_mont when num (x5) is a multiple of 4 but not of 8,
// and from __bn_sqr8x_mont when ap != bp. Register arguments are the same
// as for bn_mul_mont (x0=rp, x1=ap, x2=bp, x3=np, x4=&n0, x5=num).
//
// Frame (128 bytes at x29): x29/x30, x19-x28, rp at #96, &b[num] at #104.
// Scratch of num+4 words is allocated below the frame: the first 4 words
// hold the t[0]*n0 multipliers of the current pass, the remaining num
// words hold t[] (t[0] is at [sp,#8*4]).
// x28 cycles through 8,16,24,0 and indexes both b[] ([x2,x28]) and the
// saved multipliers ([sp,x28]). x30 is used as the top carry between
// passes and is reloaded from the frame before returning.
// Returns 1 in x0; the scratch area is zeroed on the way out.
966 .type __bn_mul4x_mont,%function
967 .align 5
968 __bn_mul4x_mont:
969 .inst 0xd503233f // paciasp
970 stp x29,x30,[sp,#-128]!
971 add x29,sp,#0
972 stp x19,x20,[sp,#16]
973 stp x21,x22,[sp,#32]
974 stp x23,x24,[sp,#48]
975 stp x25,x26,[sp,#64]
976 stp x27,x28,[sp,#80]
977
978 sub x26,sp,x5,lsl#3
979 lsl x5,x5,#3
980 ldr x4,[x4] // *n0
981 sub sp,x26,#8*4 // alloca
982
983 add x10,x2,x5
984 add x27,x1,x5
985 stp x0,x10,[x29,#96] // offload rp and &b[num]
986
987 ldr x24,[x2,#8*0] // b[0]
988 ldp x6,x7,[x1,#8*0] // a[0..3]
989 ldp x8,x9,[x1,#8*2]
990 add x1,x1,#8*4
991 mov x19,xzr
992 mov x20,xzr
993 mov x21,xzr
994 mov x22,xzr
995 ldp x14,x15,[x3,#8*0] // n[0..3]
996 ldp x16,x17,[x3,#8*2]
997 adds x3,x3,#8*4 // clear carry bit
998 mov x0,xzr
999 mov x28,#0
1000 mov x26,sp
1001
// First pass, first 4 words: for each of b[0..3] accumulate a[0..3]*b[i]
// and n[0..3]*m into x19-x22, where m = t[0]*n0 is saved for the tail.
// The low word of n[0]*m is not computed; "subs xzr,x19,#1" regenerates
// the carry of the discarded addition (set exactly when x19 is non-zero).
1002 .Loop_mul4x_1st_reduction:
1003 mul x10,x6,x24 // lo(a[0..3]*b[0])
1004 adc x0,x0,xzr // modulo-scheduled
1005 mul x11,x7,x24
1006 add x28,x28,#8
1007 mul x12,x8,x24
1008 and x28,x28,#31
1009 mul x13,x9,x24
1010 adds x19,x19,x10
1011 umulh x10,x6,x24 // hi(a[0..3]*b[0])
1012 adcs x20,x20,x11
1013 mul x25,x19,x4 // t[0]*n0
1014 adcs x21,x21,x12
1015 umulh x11,x7,x24
1016 adcs x22,x22,x13
1017 umulh x12,x8,x24
1018 adc x23,xzr,xzr
1019 umulh x13,x9,x24
1020 ldr x24,[x2,x28] // next b[i] (or b[0])
1021 adds x20,x20,x10
1022 // (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0)
1023 str x25,[x26],#8 // put aside t[0]*n0 for tail processing
1024 adcs x21,x21,x11
1025 mul x11,x15,x25
1026 adcs x22,x22,x12
1027 mul x12,x16,x25
1028 adc x23,x23,x13 // can't overflow
1029 mul x13,x17,x25
1030 // (*) adds xzr,x19,x10
1031 subs xzr,x19,#1 // (*)
1032 umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0)
1033 adcs x19,x20,x11
1034 umulh x11,x15,x25
1035 adcs x20,x21,x12
1036 umulh x12,x16,x25
1037 adcs x21,x22,x13
1038 umulh x13,x17,x25
1039 adcs x22,x23,x0
1040 adc x0,xzr,xzr
1041 adds x19,x19,x10
1042 sub x10,x27,x1
1043 adcs x20,x20,x11
1044 adcs x21,x21,x12
1045 adcs x22,x22,x13
1046 //adc x0,x0,xzr
1047 cbnz x28,.Loop_mul4x_1st_reduction
1048
// x10 == 0 means a[] has only 4 words (num == 4).
1049 cbz x10,.Lmul4x4_post_condition
1050
1051 ldp x6,x7,[x1,#8*0] // a[4..7]
1052 ldp x8,x9,[x1,#8*2]
1053 add x1,x1,#8*4
1054 ldr x25,[sp] // t[0]*n0
1055 ldp x14,x15,[x3,#8*0] // n[4..7]
1056 ldp x16,x17,[x3,#8*2]
1057 add x3,x3,#8*4
1058
// First pass, remaining words: same b[0..3] and saved multipliers applied
// to the next 4 words of a[] and n[]; one finished word of t[] is stored
// per iteration.
1059 .Loop_mul4x_1st_tail:
1060 mul x10,x6,x24 // lo(a[4..7]*b[i])
1061 adc x0,x0,xzr // modulo-scheduled
1062 mul x11,x7,x24
1063 add x28,x28,#8
1064 mul x12,x8,x24
1065 and x28,x28,#31
1066 mul x13,x9,x24
1067 adds x19,x19,x10
1068 umulh x10,x6,x24 // hi(a[4..7]*b[i])
1069 adcs x20,x20,x11
1070 umulh x11,x7,x24
1071 adcs x21,x21,x12
1072 umulh x12,x8,x24
1073 adcs x22,x22,x13
1074 umulh x13,x9,x24
1075 adc x23,xzr,xzr
1076 ldr x24,[x2,x28] // next b[i] (or b[0])
1077 adds x20,x20,x10
1078 mul x10,x14,x25 // lo(n[4..7]*t[0]*n0)
1079 adcs x21,x21,x11
1080 mul x11,x15,x25
1081 adcs x22,x22,x12
1082 mul x12,x16,x25
1083 adc x23,x23,x13 // can't overflow
1084 mul x13,x17,x25
1085 adds x19,x19,x10
1086 umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0)
1087 adcs x20,x20,x11
1088 umulh x11,x15,x25
1089 adcs x21,x21,x12
1090 umulh x12,x16,x25
1091 adcs x22,x22,x13
1092 adcs x23,x23,x0
1093 umulh x13,x17,x25
1094 adc x0,xzr,xzr
1095 ldr x25,[sp,x28] // next t[0]*n0
1096 str x19,[x26],#8 // result!!!
1097 adds x19,x20,x10
1098 sub x10,x27,x1 // done yet?
1099 adcs x20,x21,x11
1100 adcs x21,x22,x12
1101 adcs x22,x23,x13
1102 //adc x0,x0,xzr
1103 cbnz x28,.Loop_mul4x_1st_tail
1104
1105 sub x11,x27,x5 // rewinded x1
1106 cbz x10,.Lmul4x_proceed
1107
1108 ldp x6,x7,[x1,#8*0]
1109 ldp x8,x9,[x1,#8*2]
1110 add x1,x1,#8*4
1111 ldp x14,x15,[x3,#8*0]
1112 ldp x16,x17,[x3,#8*2]
1113 add x3,x3,#8*4
1114 b .Loop_mul4x_1st_tail
1115
// End of the first pass: advance b by 4 words, keep the top carry in x30,
// store the last 4 words of t[], rewind a[] and n[], and reload t[0..3].
1116 .align 5
1117 .Lmul4x_proceed:
1118 ldr x24,[x2,#8*4]! // *++b
1119 adc x30,x0,xzr
1120 ldp x6,x7,[x11,#8*0] // a[0..3]
1121 sub x3,x3,x5 // rewind np
1122 ldp x8,x9,[x11,#8*2]
1123 add x1,x11,#8*4
1124
1125 stp x19,x20,[x26,#8*0] // result!!!
1126 ldp x19,x20,[sp,#8*4] // t[0..3]
1127 stp x21,x22,[x26,#8*2] // result!!!
1128 ldp x21,x22,[sp,#8*6]
1129
1130 ldp x14,x15,[x3,#8*0] // n[0..3]
1131 mov x26,sp
1132 ldp x16,x17,[x3,#8*2]
1133 adds x3,x3,#8*4 // clear carry bit
1134 mov x0,xzr
1135
// Subsequent passes, first 4 words: as .Loop_mul4x_1st_reduction, but the
// accumulator starts from the existing t[0..3].
1136 .align 4
1137 .Loop_mul4x_reduction:
1138 mul x10,x6,x24 // lo(a[0..3]*b[4])
1139 adc x0,x0,xzr // modulo-scheduled
1140 mul x11,x7,x24
1141 add x28,x28,#8
1142 mul x12,x8,x24
1143 and x28,x28,#31
1144 mul x13,x9,x24
1145 adds x19,x19,x10
1146 umulh x10,x6,x24 // hi(a[0..3]*b[4])
1147 adcs x20,x20,x11
1148 mul x25,x19,x4 // t[0]*n0
1149 adcs x21,x21,x12
1150 umulh x11,x7,x24
1151 adcs x22,x22,x13
1152 umulh x12,x8,x24
1153 adc x23,xzr,xzr
1154 umulh x13,x9,x24
1155 ldr x24,[x2,x28] // next b[i]
1156 adds x20,x20,x10
1157 // (*) mul x10,x14,x25
1158 str x25,[x26],#8 // put aside t[0]*n0 for tail processing
1159 adcs x21,x21,x11
1160 mul x11,x15,x25 // lo(n[0..3]*t[0]*n0)
1161 adcs x22,x22,x12
1162 mul x12,x16,x25
1163 adc x23,x23,x13 // can't overflow
1164 mul x13,x17,x25
1165 // (*) adds xzr,x19,x10
1166 subs xzr,x19,#1 // (*)
1167 umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0)
1168 adcs x19,x20,x11
1169 umulh x11,x15,x25
1170 adcs x20,x21,x12
1171 umulh x12,x16,x25
1172 adcs x21,x22,x13
1173 umulh x13,x17,x25
1174 adcs x22,x23,x0
1175 adc x0,xzr,xzr
1176 adds x19,x19,x10
1177 adcs x20,x20,x11
1178 adcs x21,x21,x12
1179 adcs x22,x22,x13
1180 //adc x0,x0,xzr
1181 cbnz x28,.Loop_mul4x_reduction
1182
1183 adc x0,x0,xzr
1184 ldp x10,x11,[x26,#8*4] // t[4..7]
1185 ldp x12,x13,[x26,#8*6]
1186 ldp x6,x7,[x1,#8*0] // a[4..7]
1187 ldp x8,x9,[x1,#8*2]
1188 add x1,x1,#8*4
1189 adds x19,x19,x10
1190 adcs x20,x20,x11
1191 adcs x21,x21,x12
1192 adcs x22,x22,x13
1193 //adc x0,x0,xzr
1194
1195 ldr x25,[sp] // t[0]*n0
1196 ldp x14,x15,[x3,#8*0] // n[4..7]
1197 ldp x16,x17,[x3,#8*2]
1198 add x3,x3,#8*4
1199
// Subsequent passes, remaining words: as .Loop_mul4x_1st_tail, with the
// next 4 words of t[] added into the accumulator between groups.
1200 .align 4
1201 .Loop_mul4x_tail:
1202 mul x10,x6,x24 // lo(a[4..7]*b[4])
1203 adc x0,x0,xzr // modulo-scheduled
1204 mul x11,x7,x24
1205 add x28,x28,#8
1206 mul x12,x8,x24
1207 and x28,x28,#31
1208 mul x13,x9,x24
1209 adds x19,x19,x10
1210 umulh x10,x6,x24 // hi(a[4..7]*b[4])
1211 adcs x20,x20,x11
1212 umulh x11,x7,x24
1213 adcs x21,x21,x12
1214 umulh x12,x8,x24
1215 adcs x22,x22,x13
1216 umulh x13,x9,x24
1217 adc x23,xzr,xzr
1218 ldr x24,[x2,x28] // next b[i]
1219 adds x20,x20,x10
1220 mul x10,x14,x25 // lo(n[4..7]*t[0]*n0)
1221 adcs x21,x21,x11
1222 mul x11,x15,x25
1223 adcs x22,x22,x12
1224 mul x12,x16,x25
1225 adc x23,x23,x13 // can't overflow
1226 mul x13,x17,x25
1227 adds x19,x19,x10
1228 umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0)
1229 adcs x20,x20,x11
1230 umulh x11,x15,x25
1231 adcs x21,x21,x12
1232 umulh x12,x16,x25
1233 adcs x22,x22,x13
1234 umulh x13,x17,x25
1235 adcs x23,x23,x0
1236 ldr x25,[sp,x28] // next t[0]*n0
1237 adc x0,xzr,xzr
1238 str x19,[x26],#8 // result!!!
1239 adds x19,x20,x10
1240 sub x10,x27,x1 // done yet?
1241 adcs x20,x21,x11
1242 adcs x21,x22,x12
1243 adcs x22,x23,x13
1244 //adc x0,x0,xzr
1245 cbnz x28,.Loop_mul4x_tail
1246
1247 sub x11,x3,x5 // rewinded np?
1248 adc x0,x0,xzr
1249 cbz x10,.Loop_mul4x_break
1250
1251 ldp x10,x11,[x26,#8*4]
1252 ldp x12,x13,[x26,#8*6]
1253 ldp x6,x7,[x1,#8*0]
1254 ldp x8,x9,[x1,#8*2]
1255 add x1,x1,#8*4
1256 adds x19,x19,x10
1257 adcs x20,x20,x11
1258 adcs x21,x21,x12
1259 adcs x22,x22,x13
1260 //adc x0,x0,xzr
1261 ldp x14,x15,[x3,#8*0]
1262 ldp x16,x17,[x3,#8*2]
1263 add x3,x3,#8*4
1264 b .Loop_mul4x_tail
1265
// End of a pass: add the previous top carry (x30), store the last 4 words
// of t[], compute the new top carry, advance b by 4 words and either start
// the next pass or fall into the final subtraction.
1266 .align 4
1267 .Loop_mul4x_break:
1268 ldp x12,x13,[x29,#96] // pull rp and &b[num]
1269 adds x19,x19,x30
1270 add x2,x2,#8*4 // bp++
1271 adcs x20,x20,xzr
1272 sub x1,x1,x5 // rewind ap
1273 adcs x21,x21,xzr
1274 stp x19,x20,[x26,#8*0] // result!!!
1275 adcs x22,x22,xzr
1276 ldp x19,x20,[sp,#8*4] // t[0..3]
1277 adc x30,x0,xzr
1278 stp x21,x22,[x26,#8*2] // result!!!
1279 cmp x2,x13 // done yet?
1280 ldp x21,x22,[sp,#8*6]
1281 ldp x14,x15,[x11,#8*0] // n[0..3]
1282 ldp x16,x17,[x11,#8*2]
1283 add x3,x11,#8*4
1284 b.eq .Lmul4x_post
1285
1286 ldr x24,[x2]
1287 ldp x6,x7,[x1,#8*0] // a[0..3]
1288 ldp x8,x9,[x1,#8*2]
1289 adds x1,x1,#8*4 // clear carry bit
1290 mov x0,xzr
1291 mov x26,sp
1292 b .Loop_mul4x_reduction
1293
1294 .align 4
1295 .Lmul4x_post:
1296 // Final step. We see if result is larger than modulus, and
1297 // if it is, subtract the modulus. But comparison implies
1298 // subtraction. So we subtract modulus, see if it borrowed,
1299 // and conditionally copy original value.
1300 mov x0,x12
1301 mov x27,x12 // x0 copy
1302 subs x10,x19,x14
1303 add x26,sp,#8*8
1304 sbcs x11,x20,x15
1305 sub x28,x5,#8*4
1306
// rp = t - n, 4 words per iteration; the borrow chain stays live across
// the loop back-edge.
1307 .Lmul4x_sub:
1308 sbcs x12,x21,x16
1309 ldp x14,x15,[x3,#8*0]
1310 sub x28,x28,#8*4
1311 ldp x19,x20,[x26,#8*0]
1312 sbcs x13,x22,x17
1313 ldp x16,x17,[x3,#8*2]
1314 add x3,x3,#8*4
1315 ldp x21,x22,[x26,#8*2]
1316 add x26,x26,#8*4
1317 stp x10,x11,[x0,#8*0]
1318 sbcs x10,x19,x14
1319 stp x12,x13,[x0,#8*2]
1320 add x0,x0,#8*4
1321 sbcs x11,x20,x15
1322 cbnz x28,.Lmul4x_sub
1323
// Last 4 words of the subtraction, interleaved with setup for the
// conditional copy: x26 = sp (wipe pointer), x1 = sp + 4 words (t[]),
// x27 = rp (the difference).
1324 sbcs x12,x21,x16
1325 mov x26,sp
1326 add x1,sp,#8*4
1327 ldp x6,x7,[x27,#8*0]
1328 sbcs x13,x22,x17
1329 stp x10,x11,[x0,#8*0]
1330 ldp x8,x9,[x27,#8*2]
1331 stp x12,x13,[x0,#8*2]
1332 ldp x19,x20,[x1,#8*0]
1333 ldp x21,x22,[x1,#8*2]
1334 sbcs xzr,x30,xzr // did it borrow?
1335 ldr x30,[x29,#8] // pull return address
1336
// Conditional copy, 4 words per iteration; flags from the sbcs above stay
// live. On borrow ("lo") the original t[] word is kept. The loop zeroes
// scratch words 0..num-5 through x26.
1337 sub x28,x5,#8*4
1338 .Lmul4x_cond_copy:
1339 sub x28,x28,#8*4
1340 csel x10,x19,x6,lo
1341 stp xzr,xzr,[x26,#8*0]
1342 csel x11,x20,x7,lo
1343 ldp x6,x7,[x27,#8*4]
1344 ldp x19,x20,[x1,#8*4]
1345 csel x12,x21,x8,lo
1346 stp xzr,xzr,[x26,#8*2]
1347 add x26,x26,#8*4
1348 csel x13,x22,x9,lo
1349 ldp x8,x9,[x27,#8*6]
1350 ldp x21,x22,[x1,#8*6]
1351 add x1,x1,#8*4
1352 stp x10,x11,[x27,#8*0]
1353 stp x12,x13,[x27,#8*2]
1354 add x27,x27,#8*4
1355 cbnz x28,.Lmul4x_cond_copy
1356
// Final 4 words. Here x26 = sp + (num-4) words and 8 scratch words remain
// (num-4 .. num+3), so the four paired zero stores use offsets 0, 2, 4, 6
// to clear all of them. All t[] words still needed are already in
// x19-x22, and the highest store ends just below the frame at x29.
1357 csel x10,x19,x6,lo
1358 stp xzr,xzr,[x26,#8*0]
1359 csel x11,x20,x7,lo
1360 stp xzr,xzr,[x26,#8*2]
1361 csel x12,x21,x8,lo
1362 stp xzr,xzr,[x26,#8*4]
1363 csel x13,x22,x9,lo
1364 stp xzr,xzr,[x26,#8*6]
1365 stp x10,x11,[x27,#8*0]
1366 stp x12,x13,[x27,#8*2]
1367
1368 b .Lmul4x_done
1369
// num == 4: the whole value is in registers, so subtract, select and
// store directly while zeroing all 8 scratch words.
1370 .align 4
1371 .Lmul4x4_post_condition:
1372 adc x0,x0,xzr
1373 ldr x1,[x29,#96] // pull rp
1374 // x19-x22,x0 hold result, x14-x17 hold modulus
1375 subs x6,x19,x14
1376 ldr x30,[x29,#8] // pull return address
1377 sbcs x7,x20,x15
1378 stp xzr,xzr,[sp,#8*0]
1379 sbcs x8,x21,x16
1380 stp xzr,xzr,[sp,#8*2]
1381 sbcs x9,x22,x17
1382 stp xzr,xzr,[sp,#8*4]
1383 sbcs xzr,x0,xzr // did it borrow?
1384 stp xzr,xzr,[sp,#8*6]
1385
1386 // x6-x9 hold result-modulus
1387 csel x6,x19,x6,lo
1388 csel x7,x20,x7,lo
1389 csel x8,x21,x8,lo
1390 csel x9,x22,x9,lo
1391 stp x6,x7,[x1,#8*0]
1392 stp x8,x9,[x1,#8*2]
1393
// Epilogue: restore callee-saved registers, drop scratch and frame,
// return 1. x30 was already reloaded on both paths leading here.
1394 .Lmul4x_done:
1395 ldp x19,x20,[x29,#16]
1396 mov sp,x29
1397 ldp x21,x22,[x29,#32]
1398 mov x0,#1
1399 ldp x23,x24,[x29,#48]
1400 ldp x25,x26,[x29,#64]
1401 ldp x27,x28,[x29,#80]
1402 ldr x29,[sp],#128
1403 .inst 0xd50323bf // autiasp
1404 ret
1405 .size __bn_mul4x_mont,.-__bn_mul4x_mont
// NUL-terminated ASCII identification string:
// "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
1406 .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1407 .align 2
1408 .align 4
1409