// poly1305-armv8.S revision 1.3.6.1
//
// Poly1305 for AArch64, GNU as syntax, run through the C preprocessor
// (arm_arch.h supplies ARMV7_NEON, tested in poly1305_init).
#include "arm_arch.h"

.text

// forward "declarations" are required for Apple

.hidden	OPENSSL_armcap_P
.globl	poly1305_init
.hidden	poly1305_init
.globl	poly1305_blocks
.hidden	poly1305_blocks
.globl	poly1305_emit
.hidden	poly1305_emit

// poly1305_init: reset a Poly1305 context and load the key.
//
// In:   x0 = context
//       x1 = key pointer or NULL (16 bytes are read from it here)
//       x2 = two-slot table that receives the blocks/emit entry points
// Out:  x0 = 0 if x1 was NULL (only the zeroing below is done), else 1
// Context written: bytes 0..31 zeroed (hash value and is_base2_26),
//       clamped r0:r1 stored at +32.
// Clobbers: x7-x9, x12, x13, x17, flags
.type	poly1305_init,%function
.align	5
poly1305_init:
	cmp	x1,xzr
	stp	xzr,xzr,[x0]		// zero hash value
	stp	xzr,xzr,[x0,#16]	// [along with is_base2_26]

	csel	x0,xzr,x0,eq		// return value 0 when there is no key
	b.eq	.Lno_key

	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]

	ldp	x7,x8,[x1]		// load key
	mov	x9,#0xfffffffc0fffffff
	movk	x9,#0x0fff,lsl#48	// x9 = 0x0ffffffc0fffffff
#ifdef __ARMEB__
	rev	x7,x7			// flip bytes
	rev	x8,x8
#endif
	and	x7,x7,x9		// &=0ffffffc0fffffff
	and	x9,x9,#-4
	and	x8,x8,x9		// &=0ffffffc0ffffffc
	stp	x7,x8,[x0,#32]		// save key value

	tst	w17,#ARMV7_NEON		// Z set -> capability bit clear

	adr	x12,.Lpoly1305_blocks
	adr	x7,.Lpoly1305_blocks_neon
	adr	x13,.Lpoly1305_emit
	adr	x8,.Lpoly1305_emit_neon

	csel	x12,x12,x7,eq		// scalar code unless the NEON bit is set
	csel	x13,x13,x8,eq

#ifdef __ILP32__
	stp	w12,w13,[x2]		// 32-bit pointers
#else
	stp	x12,x13,[x2]
#endif

	mov	x0,#1
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init

// poly1305_blocks: scalar (base 2^64) block function.
//
// In:   x0 = context (hash h0:h1:h2 at +0/+8/+16, key r0:r1 at +32/+40)
//       x1 = input
//       x2 = length in bytes; rounded down to a multiple of 16
//       x3 = value added into h2 with every block (the pad bit)
// Out:  updated hash stored back at +0/+8/+16; nothing is touched when
//       fewer than 16 bytes are supplied
// Clobbers: x1, x2, x4-x14, flags
// NOTE(review): x3 is consumed as a full 64-bit register by the adc below;
// confirm callers never leave garbage in its upper half.
.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
.Lpoly1305_blocks:
	ands	x2,x2,#-16
	b.eq	.Lno_data

	ldp	x4,x5,[x0]		// load hash value
	ldp	x7,x8,[x0,#32]		// load key value
	ldr	x6,[x0,#16]
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
	b	.Loop

.align	5
.Loop:
	ldp	x10,x11,[x1],#16	// load input
	sub	x2,x2,#16
#ifdef __ARMEB__
	rev	x10,x10
	rev	x11,x11
#endif
	adds	x4,x4,x10		// accumulate input
	adcs	x5,x5,x11

	mul	x12,x4,x7		// h0*r0
	adc	x6,x6,x3		// top limb takes the carry plus the pad bit
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	// x12:x13:x14 is the product; fold everything above bit 130 back
	// in as 5*(x14>>2) = (x14 & -4) + (x14 >> 2)
	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	cbnz	x2,.Loop

	stp	x4,x5,[x0]		// store hash value
	str	x6,[x0,#16]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks

// poly1305_emit: finish the hash held in base 2^64 and write the tag.
//
// In:   x0 = context (hash h0:h1:h2 at +0/+8/+16)
//       x1 = output, 16 bytes are written
//       x2 = nonce, 16 bytes are read
// Out:  [x1] = low 128 bits of (fully reduced hash + nonce)
// Clobbers: x4-x6, x10-x14, flags
.type	poly1305_emit,%function
.align	5
poly1305_emit:
.Lpoly1305_emit:
	ldp	x4,x5,[x0]		// load hash base 2^64
	ldr	x6,[x0,#16]
	ldp	x10,x11,[x2]		// load nonce

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq		// keep h when h+5 stayed below 2^130,
	csel	x5,x5,x13,eq		// otherwise take the low bits of h+5

#ifdef __ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef __ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit

// poly1305_mult: file-local helper, multiply and partially reduce.
//
// Private register convention (not AAPCS64), reached with bl:
// In:   x4:x5:x6 = h0:h1:h2
//       x7:x8    = r0:r1
//       x9       = s1 = r1 + (r1 >> 2)
// Out:  x4:x5:x6 = h*r, folded so that only a few bits remain in x6
// Clobbers: x10-x14, flags.  x7-x9 are preserved, so the routine can be
// called back to back to raise the value in x4:x5:x6 to successive powers.
.type	poly1305_mult,%function
.align	5
poly1305_mult:
	mul	x12,x4,x7		// h0*r0
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	// fold bits 130 and up back in: 5*(x14>>2) = (x14 & -4) + (x14 >> 2)
	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	ret
.size	poly1305_mult,.-poly1305_mult

// poly1305_splat: file-local helper, store one power of r into the table.
//
// Private register convention (not AAPCS64), reached with bl:
// In:   x4:x5:x6 = value in base 2^64
//       x0       = address of this value's 32-bit slot in the first row
// Out:  nine 32-bit words written 16 bytes apart starting at [x0], in the
//       order r0, r1, s1, r2, s2, r3, s3, r4, s4, where rN are the 26-bit
//       limbs of the value and sN = 5*rN
// Clobbers: x12-x16.  x0 and x4-x9 are preserved.
.type	poly1305_splat,%function
.align	5
poly1305_splat:
	and	x12,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x13,x4,#26,#26
	extr	x14,x5,x4,#52
	and	x14,x14,#0x03ffffff
	ubfx	x15,x5,#14,#26
	extr	x16,x6,x5,#40

	str	w12,[x0,#16*0]		// r0
	add	w12,w13,w13,lsl#2	// r1*5
	str	w13,[x0,#16*1]		// r1
	add	w13,w14,w14,lsl#2	// r2*5
	str	w12,[x0,#16*2]		// s1
	str	w14,[x0,#16*3]		// r2
	add	w14,w15,w15,lsl#2	// r3*5
	str	w13,[x0,#16*4]		// s2
	str	w15,[x0,#16*5]		// r3
	add	w15,w16,w16,lsl#2	// r4*5
	str	w14,[x0,#16*6]		// s3
	str	w16,[x0,#16*7]		// r4
	str	w15,[x0,#16*8]		// s4

	ret
.size	poly1305_splat,.-poly1305_splat

// poly1305_blocks_neon: NEON block function handed out by poly1305_init.
//
// In:   x0 = context, x1 = input, x2 = length in bytes (low 4 bits are
//       ignored), x3 = pad bit
// Context fields used: hash at +0, is_base2_26 flag at +24, key r0:r1 at
//       +32, table of r^4..r^1 in base 2^26 at +48 (nine rows of four
//       32-bit words, written by poly1305_splat).
//
// Control flow:
//   * length < 128 and is_base2_26 clear: branch to the scalar code.
//   * is_base2_26 clear (.Lbase2_64_neon): if the block count is odd, one
//     block is absorbed with poly1305_mult; then the hash is converted to
//     base 2^26, the r^n table is built and is_base2_26 is set.
//   * is_base2_26 set: if the block count is odd, the hash is converted
//     to base 2^64 and one block is absorbed with poly1305_mult.  With a
//     zero pad bit the result is stored in base 2^64, is_base2_26 is
//     cleared and the function returns; otherwise the hash goes back to
//     base 2^26 and, if input remains, the vector code continues.
//   * .Ldo_neon/.Loop_neon consume 64 bytes per iteration; .Lskip_loop
//     onwards handles the last 32 or 64 bytes and the final reduction.
//
// Frame (80 bytes): x29,x30 at sp+0, d8-d15 at sp+16..sp+79.  x30 is
// reloaded from sp+8 after the bl sequences; d8-d15 are saved only on
// the vector paths and restored in .Lshort_tail.
//
// Vector register roles:
//   v0-v8   table rows r0,r1,s1,r2,s2,r3,s3,r4,s4 (lane 0 = r^4 ... 3 = r^1)
//   v9-v13  limbs of inp[0:1]     v14-v18 limbs of inp[2:3]
//   v19-v23 accumulators d0..d4   v24-v28 hash limbs h0..h4
//   v29,v30 carries               v31     per-lane mask 0x03ffffff
.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
	ldr	x17,[x0,#24]		// is_base2_26
	cmp	x2,#128
	b.hs	.Lblocks_neon
	cbz	x17,.Lpoly1305_blocks	// short input, hash still base 2^64

.Lblocks_neon:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0

	ands	x2,x2,#-16
	b.eq	.Lno_data_neon

	cbz	x17,.Lbase2_64_neon

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	tst	x2,#31
	b.eq	.Leven_neon		// even number of blocks

	ldp	x7,x8,[x0,#32]		// load key value

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x14,x6,xzr		// can be partially reduced...

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)

	and	x10,x14,#-4		// ... so reduce
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x4,x10
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

#ifdef __ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult
	ldr	x30,[sp,#8]		// bl overwrote the link register

	cbz	x3,.Lstore_base2_64_neon

	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	cbnz	x2,.Leven_neon

	stp	w10,w11,[x0]		// store hash value base 2^26
	stp	w12,w13,[x0,#8]
	str	w14,[x0,#16]
	b	.Lno_data_neon

.align	4
.Lstore_base2_64_neon:
	stp	x4,x5,[x0]		// store hash value base 2^64
	stp	x6,xzr,[x0,#16]		// note that is_base2_26 is zeroed
	b	.Lno_data_neon

.align	4
.Lbase2_64_neon:
	ldp	x7,x8,[x0,#32]		// load key value

	ldp	x4,x5,[x0]		// load hash value base 2^64
	ldr	x6,[x0,#16]

	tst	x2,#31
	b.eq	.Linit_neon

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
#ifdef __ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult

.Linit_neon:
	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

	////////////////////////////////// initialize r^n table
	mov	x4,x7			// r^1
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
	mov	x5,x8
	mov	x6,xzr
	add	x0,x0,#48+12		// r^1 goes into word 3 of each row
	bl	poly1305_splat

	bl	poly1305_mult		// r^2
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^3
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^4
	sub	x0,x0,#4
	bl	poly1305_splat
	ldr	x30,[sp,#8]		// bl overwrote the link register

	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo		// fewer than 64 bytes: inp[2:3] is zeros

	mov	x4,#1
	stur	x4,[x0,#-24]		// set is_base2_26
	sub	x0,x0,#48		// restore original x0
	b	.Ldo_neon

.align	4
.Leven_neon:
	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo		// fewer than 64 bytes: inp[2:3] is zeros

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

.Ldo_neon:
	// Nothing from here to b.ls below sets flags: that branch still
	// tests the subs x2,x2,#64 executed just before .Ldo_neon.
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	ldp	x9,x13,[x16],#48

	lsl	x3,x3,#24		// pad bit as seen from the top 26-bit limb
	add	x15,x0,#48		// r^n table

#ifdef __ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	fmov	d14,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	fmov	d15,x6
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	fmov	d16,x8
	fmov	d17,x10
	fmov	d18,x12

	ldp	x8,x12,[x1],#16		// inp[0:1]
	ldp	x9,x13,[x1],#48

	ld1	{v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
	ld1	{v8.4s},[x15]

#ifdef __ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	fmov	d9,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	fmov	d10,x6
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	movi	v31.2d,#-1
	fmov	d11,x8
	fmov	d12,x10
	fmov	d13,x12
	ushr	v31.2d,v31.2d,#38	// 0x03ffffff in each 64-bit lane

	b.ls	.Lskip_loop		// 64 bytes or fewer were left

.align	4
.Loop_neon:
	////////////////////////////////////////////////////////////////
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	// ___________________/
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	// ___________________/ ____________________/
	//
	// Note that we start with inp[2:3]*r^2. This is because it
	// doesn't depend on reduction in previous iteration.
	////////////////////////////////////////////////////////////////
	// d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
	// d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
	// d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
	// d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
	//
	// The scalar instructions interleaved below split the next 64
	// bytes of input into 26-bit limbs while the multiplies run.

	subs	x2,x2,#64
	umull	v23.2d,v14.2s,v7.s[2]
	csel	x16,x17,x16,lo
	umull	v22.2d,v14.2s,v5.s[2]
	umull	v21.2d,v14.2s,v3.s[2]
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	umull	v20.2d,v14.2s,v1.s[2]
	ldp	x9,x13,[x16],#48
	umull	v19.2d,v14.2s,v0.s[2]
#ifdef __ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	umlal	v23.2d,v15.2s,v5.s[2]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v22.2d,v15.2s,v3.s[2]
	and	x5,x9,#0x03ffffff
	umlal	v21.2d,v15.2s,v1.s[2]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v15.2s,v0.s[2]
	ubfx	x7,x9,#26,#26
	umlal	v19.2d,v15.2s,v8.s[2]
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32

	umlal	v23.2d,v16.2s,v3.s[2]
	extr	x8,x12,x8,#52
	umlal	v22.2d,v16.2s,v1.s[2]
	extr	x9,x13,x9,#52
	umlal	v21.2d,v16.2s,v0.s[2]
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	umlal	v20.2d,v16.2s,v8.s[2]
	fmov	d14,x4
	umlal	v19.2d,v16.2s,v6.s[2]
	and	x8,x8,#0x03ffffff

	umlal	v23.2d,v17.2s,v1.s[2]
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v17.2s,v0.s[2]
	ubfx	x10,x12,#14,#26
	umlal	v21.2d,v17.2s,v8.s[2]
	ubfx	x11,x13,#14,#26
	umlal	v20.2d,v17.2s,v6.s[2]
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	umlal	v19.2d,v17.2s,v4.s[2]
	fmov	d15,x6

	add	v11.2s,v11.2s,v26.2s
	add	x12,x3,x12,lsr#40
	umlal	v23.2d,v18.2s,v0.s[2]
	add	x13,x3,x13,lsr#40
	umlal	v22.2d,v18.2s,v8.s[2]
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	umlal	v21.2d,v18.2s,v6.s[2]
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	umlal	v20.2d,v18.2s,v4.s[2]
	fmov	d16,x8
	umlal	v19.2d,v18.2s,v2.s[2]
	fmov	d17,x10

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4 and accumulate

	add	v9.2s,v9.2s,v24.2s
	fmov	d18,x12
	umlal	v22.2d,v11.2s,v1.s[0]
	ldp	x8,x12,[x1],#16		// inp[0:1]
	umlal	v19.2d,v11.2s,v6.s[0]
	ldp	x9,x13,[x1],#48
	umlal	v23.2d,v11.2s,v3.s[0]
	umlal	v20.2d,v11.2s,v8.s[0]
	umlal	v21.2d,v11.2s,v0.s[0]
#ifdef __ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.s[0]
	umlal	v23.2d,v9.2s,v7.s[0]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v21.2d,v9.2s,v3.s[0]
	and	x5,x9,#0x03ffffff
	umlal	v19.2d,v9.2s,v0.s[0]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v9.2s,v1.s[0]
	ubfx	x7,x9,#26,#26

	add	v12.2s,v12.2s,v27.2s
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32
	umlal	v22.2d,v10.2s,v3.s[0]
	extr	x8,x12,x8,#52
	umlal	v23.2d,v10.2s,v5.s[0]
	extr	x9,x13,x9,#52
	umlal	v19.2d,v10.2s,v8.s[0]
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	umlal	v21.2d,v10.2s,v1.s[0]
	fmov	d9,x4
	umlal	v20.2d,v10.2s,v0.s[0]
	and	x8,x8,#0x03ffffff

	add	v13.2s,v13.2s,v28.2s
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v12.2s,v0.s[0]
	ubfx	x10,x12,#14,#26
	umlal	v19.2d,v12.2s,v4.s[0]
	ubfx	x11,x13,#14,#26
	umlal	v23.2d,v12.2s,v1.s[0]
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	umlal	v20.2d,v12.2s,v6.s[0]
	fmov	d10,x6
	umlal	v21.2d,v12.2s,v8.s[0]
	add	x12,x3,x12,lsr#40

	umlal	v22.2d,v13.2s,v8.s[0]
	add	x13,x3,x13,lsr#40
	umlal	v19.2d,v13.2s,v2.s[0]
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	umlal	v23.2d,v13.2s,v0.s[0]
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	umlal	v20.2d,v13.2s,v4.s[0]
	fmov	d11,x8
	umlal	v21.2d,v13.2s,v6.s[0]
	fmov	d12,x10
	fmov	d13,x12

	/////////////////////////////////////////////////////////////////
	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	// and P. Schwabe
	//
	// [see discussion in poly1305-armv4 module]

	ushr	v29.2d,v22.2d,#26
	xtn	v27.2s,v22.2d
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	bic	v27.2s,#0xfc,lsl#24	// &=0x03ffffff
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	xtn	v28.2s,v23.2d
	ushr	v30.2d,v20.2d,#26
	xtn	v25.2s,v20.2d
	bic	v28.2s,#0xfc,lsl#24
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	shrn	v30.2s,v21.2d,#26
	xtn	v26.2s,v21.2d
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	bic	v25.2s,#0xfc,lsl#24
	add	v27.2s,v27.2s,v30.2s	// h2 -> h3
	bic	v26.2s,#0xfc,lsl#24

	shrn	v29.2s,v19.2d,#26
	xtn	v24.2s,v19.2d
	ushr	v30.2s,v27.2s,#26
	bic	v27.2s,#0xfc,lsl#24
	bic	v24.2s,#0xfc,lsl#24
	add	v25.2s,v25.2s,v29.2s	// h0 -> h1
	add	v28.2s,v28.2s,v30.2s	// h3 -> h4

	b.hi	.Loop_neon		// flags from subs at the loop top

.Lskip_loop:
	dup	v16.2d,v16.d[0]
	add	v11.2s,v11.2s,v26.2s

	////////////////////////////////////////////////////////////////
	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	adds	x2,x2,#32
	b.ne	.Long_tail

	// only 32 bytes were left: hash+inp[0:1] takes the r^2:r^1 slot
	dup	v16.2d,v11.d[0]
	add	v14.2s,v9.2s,v24.2s
	add	v17.2s,v12.2s,v27.2s
	add	v15.2s,v10.2s,v25.2s
	add	v18.2s,v13.2s,v28.2s

.Long_tail:
	// Vector instructions leave NZCV alone, so b.eq .Lshort_tail at
	// the end of this block still tests adds x2,x2,#32 above.
	dup	v14.2d,v14.d[0]
	umull2	v19.2d,v16.4s,v6.4s
	umull2	v22.2d,v16.4s,v1.4s
	umull2	v23.2d,v16.4s,v3.4s
	umull2	v21.2d,v16.4s,v0.4s
	umull2	v20.2d,v16.4s,v8.4s

	dup	v15.2d,v15.d[0]
	umlal2	v19.2d,v14.4s,v0.4s
	umlal2	v21.2d,v14.4s,v3.4s
	umlal2	v22.2d,v14.4s,v5.4s
	umlal2	v23.2d,v14.4s,v7.4s
	umlal2	v20.2d,v14.4s,v1.4s

	dup	v17.2d,v17.d[0]
	umlal2	v19.2d,v15.4s,v8.4s
	umlal2	v22.2d,v15.4s,v3.4s
	umlal2	v21.2d,v15.4s,v1.4s
	umlal2	v23.2d,v15.4s,v5.4s
	umlal2	v20.2d,v15.4s,v0.4s

	dup	v18.2d,v18.d[0]
	umlal2	v22.2d,v17.4s,v0.4s
	umlal2	v23.2d,v17.4s,v1.4s
	umlal2	v19.2d,v17.4s,v4.4s
	umlal2	v20.2d,v17.4s,v6.4s
	umlal2	v21.2d,v17.4s,v8.4s

	umlal2	v22.2d,v18.4s,v8.4s
	umlal2	v19.2d,v18.4s,v2.4s
	umlal2	v23.2d,v18.4s,v0.4s
	umlal2	v20.2d,v18.4s,v4.4s
	umlal2	v21.2d,v18.4s,v6.4s

	b.eq	.Lshort_tail

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4:r^3 and accumulate

	add	v9.2s,v9.2s,v24.2s
	umlal	v22.2d,v11.2s,v1.2s
	umlal	v19.2d,v11.2s,v6.2s
	umlal	v23.2d,v11.2s,v3.2s
	umlal	v20.2d,v11.2s,v8.2s
	umlal	v21.2d,v11.2s,v0.2s

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.2s
	umlal	v19.2d,v9.2s,v0.2s
	umlal	v23.2d,v9.2s,v7.2s
	umlal	v20.2d,v9.2s,v1.2s
	umlal	v21.2d,v9.2s,v3.2s

	add	v12.2s,v12.2s,v27.2s
	umlal	v22.2d,v10.2s,v3.2s
	umlal	v19.2d,v10.2s,v8.2s
	umlal	v23.2d,v10.2s,v5.2s
	umlal	v20.2d,v10.2s,v0.2s
	umlal	v21.2d,v10.2s,v1.2s

	add	v13.2s,v13.2s,v28.2s
	umlal	v22.2d,v12.2s,v0.2s
	umlal	v19.2d,v12.2s,v4.2s
	umlal	v23.2d,v12.2s,v1.2s
	umlal	v20.2d,v12.2s,v6.2s
	umlal	v21.2d,v12.2s,v8.2s

	umlal	v22.2d,v13.2s,v8.2s
	umlal	v19.2d,v13.2s,v2.2s
	umlal	v23.2d,v13.2s,v0.2s
	umlal	v20.2d,v13.2s,v4.2s
	umlal	v21.2d,v13.2s,v6.2s

.Lshort_tail:
	////////////////////////////////////////////////////////////////
	// horizontal add

	addp	v22.2d,v22.2d,v22.2d
	ldp	d8,d9,[sp,#16]		// meet ABI requirements
	addp	v19.2d,v19.2d,v19.2d
	ldp	d10,d11,[sp,#32]
	addp	v23.2d,v23.2d,v23.2d
	ldp	d12,d13,[sp,#48]
	addp	v20.2d,v20.2d,v20.2d
	ldp	d14,d15,[sp,#64]
	addp	v21.2d,v21.2d,v21.2d

	////////////////////////////////////////////////////////////////
	// lazy reduction, but without narrowing

	ushr	v29.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b

	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	and	v23.16b,v23.16b,v31.16b
	ushr	v30.2d,v20.2d,#26
	and	v20.16b,v20.16b,v31.16b
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	ushr	v30.2d,v21.2d,#26
	and	v21.16b,v21.16b,v31.16b
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	add	v22.2d,v22.2d,v30.2d	// h2 -> h3

	ushr	v29.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	ushr	v30.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	add	v20.2d,v20.2d,v29.2d	// h0 -> h1
	add	v23.2d,v23.2d,v30.2d	// h3 -> h4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

	st4	{v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
	st1	{v23.s}[0],[x0]

.Lno_data_neon:
	ldr	x29,[sp],#80		// x30 already holds the signed return address
	.inst	0xd50323bf		// autiasp
	ret
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

// poly1305_emit_neon: emit function handed out by poly1305_init together
// with the NEON block function.
//
// In:   x0 = context (hash at +0, is_base2_26 flag at +24)
//       x1 = output, 16 bytes are written
//       x2 = nonce, 16 bytes are read
// Out:  [x1] = low 128 bits of (fully reduced hash + nonce)
// Clobbers: x4-x6, x10-x14, x17, flags
//
// When is_base2_26 is clear the hash is still in base 2^64 and the scalar
// emit code is used as is.  Otherwise the five 26-bit limbs are packed
// back into x4:x5:x6 and partially reduced before the final comparison
// against the modulus.
.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
.Lpoly1305_emit_neon:
	ldr	x17,[x0,#24]
	cbz	x17,.Lpoly1305_emit	// local label, as poly1305_blocks_neon does

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x6,x6,xzr		// can be partially reduced...

	ldp	x10,x11,[x2]		// load nonce

	and	x12,x6,#-4		// ... so reduce
	add	x12,x12,x6,lsr#2
	and	x6,x6,#3
	adds	x4,x4,x12
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq		// keep h when h+5 stayed below 2^130,
	csel	x5,x5,x13,eq		// otherwise take the low bits of h+5

#ifdef __ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef __ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit_neon,.-poly1305_emit_neon

// 32 zero bytes.  poly1305_blocks_neon points x16 here instead of at the
// input when no inp[2:3] block pair is left, so its two ldp loads read
// zeros.
.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0
// "Poly1305 for ARMv8, CRYPTOGAMS by <appro@openssl.org>", NUL-terminated
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2