poly1305-armv8.S revision 1.1.4.1 1 #include "arm_arch.h"
2
3 .text
4
5 // Forward "declarations" are required for Apple: poly1305_blocks and
// poly1305_emit are exported here, ahead of poly1305_init, which takes
// their addresses with adr before either symbol has been defined.
6
7 .globl poly1305_blocks
8 .globl poly1305_emit
9
// poly1305_init
//   In:  x0 = context, x1 = key pointer (may be NULL), x2 = table that
//        receives two function pointers
//   Out: x0 = 0 when no key was given, 1 otherwise
//
// Always clears the hash value and the is_base2_26 flag (context bytes
// 0..31). With a key, the first 16 key bytes are clamped and stored at
// context offset 32, and [x2] receives the addresses of the blocks/emit
// routines: the NEON pair when the ARMV7_NEON bit is set in
// OPENSSL_armcap_P, the scalar pair otherwise.
.globl	poly1305_init
.type	poly1305_init,%function
.align	5
poly1305_init:
	stp	xzr,xzr,[x0]		// h0 = h1 = 0
	stp	xzr,xzr,[x0,#16]	// h2 = 0, is_base2_26 = 0
	cmp	x1,#0			// key == NULL ?
	csel	x0,xzr,x0,eq		// ... then report 0
	b.eq	.Lno_key

	// Fetch OPENSSL_armcap_P through the PC-relative offset kept in
	// .LOPENSSL_armcap_P (address of the slot + offset stored in it).
	adr	x10,.LOPENSSL_armcap_P
#ifdef	__ILP32__
	ldrsw	x11,.LOPENSSL_armcap_P
#else
	ldr	x11,.LOPENSSL_armcap_P
#endif
	ldr	w17,[x10,x11]		// w17 = capability bits

	mov	x9,#0xfffffffc0fffffff
	movk	x9,#0x0fff,lsl#48	// x9 = 0x0ffffffc0fffffff
	ldp	x7,x8,[x1]		// r0:r1 = first 16 key bytes
#ifdef	__ARMEB__
	rev	x7,x7			// flip bytes
	rev	x8,x8
#endif
	and	x7,x7,x9		// r0 &= 0x0ffffffc0fffffff
	and	x9,x9,#-4
	and	x8,x8,x9		// r1 &= 0x0ffffffc0ffffffc
	stp	x7,x8,[x0,#32]		// save clamped key

	tst	w17,#ARMV7_NEON		// ne <=> NEON bit is set

	adr	x12,poly1305_blocks
	adr	x7,poly1305_blocks_neon
	csel	x12,x7,x12,ne		// blocks: NEON flavour if available

	adr	x13,poly1305_emit
	adr	x8,poly1305_emit_neon
	csel	x13,x8,x13,ne		// emit: NEON flavour if available

#ifdef	__ILP32__
	stp	w12,w13,[x2]		// 32-bit pointers
#else
	stp	x12,x13,[x2]
#endif

	mov	x0,#1			// key accepted
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init
61
// poly1305_blocks -- scalar block function, hash kept in base 2^64.
//   In:  x0 = context (h0:h1:h2 at +0, r0:r1 at +32), x1 = input,
//        x2 = length in bytes (rounded down to a multiple of 16),
//        x3 = pad bit, added to h2 (i.e. at 2^128) for every block
//   Out: updated h0:h1:h2 written back to the context
//
// Per block: h = (h + block + padbit*2^128) * r, followed by a partial
// reduction that folds everything above bit 130 back in multiplied by 5.
// NOTE(review): the full x3 is consumed; this presumes the caller passes
// the pad bit zero-extended to 64 bits -- confirm against callers.
.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
	ands	x2,x2,#-16		// whole blocks only
	b.eq	.Lno_data

	ldp	x7,x8,[x0,#32]		// r0, r1
	ldp	x4,x5,[x0]		// h0, h1
	ldr	x6,[x0,#16]		// h2
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
	b	.Loop

.align	5
.Loop:
	ldp	x10,x11,[x1],#16	// next 16 input bytes
#ifdef	__ARMEB__
	rev	x10,x10
	rev	x11,x11
#endif
	adds	x4,x4,x10		// h += block
	adcs	x5,x5,x11
	adc	x6,x6,x3		// ... plus the pad bit on top
	sub	x2,x2,#16		// one block consumed

	mul	x12,x4,x7		// h0*r0
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*s1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*s1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11		// product is now x12:x13:x14

	// Partial reduction: keep the low two bits of x14 as h2 and add
	// (x14 & ~3) + (x14 >> 2), i.e. 5 * (x14 >> 2), at the bottom.
	and	x10,x14,#-4
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	cbnz	x2,.Loop

	stp	x4,x5,[x0]		// write the hash back
	str	x6,[x0,#16]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks
125
// poly1305_emit -- final reduction and tag output, hash in base 2^64.
//   In:  x0 = context (h0:h1:h2 at +0), x1 = output (16 bytes),
//        x2 = nonce (16 bytes)
//   Out: [x1] = low 128 bits of ((h mod 2^130-5) + nonce)
//
// h is expected to be at most partially reduced, so one conditional
// subtraction of the modulus suffices: h+5 is computed, and if that
// reaches 2^130 the low 128 bits of h+5 are used in place of h.
.type	poly1305_emit,%function
.align	5
poly1305_emit:
	ldp	x4,x5,[x0]		// h0, h1
	ldr	x6,[x0,#16]		// h2

	adds	x12,x4,#5		// x12:x13:x14 = h + 5
	adcs	x13,x5,xzr
	adc	x14,x6,xzr
	tst	x14,#-4			// ne <=> h + 5 >= 2^130

	ldp	x10,x11,[x2]		// nonce (loads leave the flags alone)
	csel	x4,x12,x4,ne		// pick h+5 when h >= 2^130-5
	csel	x5,x13,x5,ne

#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// tag = h + nonce (mod 2^128)
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit
// poly1305_mult -- internal helper, private register convention.
//   In:  x4:x5:x6 = h (h0, h1, h2), x7:x8 = r0:r1, x9 = s1 = r1 + (r1>>2)
//   Out: x4:x5:x6 = h * r, partially reduced
//   Scratch: x10-x14, flags.  x7-x9 are left untouched.
//
// Same multiply/reduce step as the body of the poly1305_blocks loop, but
// laid out one partial product at a time.
.type	poly1305_mult,%function
.align	5
poly1305_mult:
	// x12:x13 = h0*r0 + h1*s1
	mul	x12,x4,x7		// h0*r0
	umulh	x13,x4,x7
	mul	x10,x5,x9		// h1*s1
	umulh	x11,x5,x9
	adds	x12,x12,x10
	adc	x13,x13,x11

	// x13:x14 += h0*r1
	mul	x10,x4,x8
	umulh	x14,x4,x8
	adds	x13,x13,x10
	adc	x14,x14,xzr

	// x13:x14 += h1*r0
	mul	x10,x5,x7
	umulh	x11,x5,x7
	adds	x13,x13,x10
	adc	x14,x14,x11

	// x13 += h2*s1, x14 += h2*r0 (low halves only)
	mul	x10,x6,x9
	mul	x11,x6,x7
	adds	x13,x13,x10
	adc	x14,x14,x11

	// Partial reduction: h2 = x14 & 3, and (x14 & ~3) + (x14 >> 2),
	// i.e. 5 * (x14 >> 2), is added back at the bottom.
	and	x10,x14,#-4
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	ret
.size	poly1305_mult,.-poly1305_mult
192
// poly1305_splat -- internal helper, private register convention.
//   In:  x4:x5:x6 = value in base 2^64, x0 = destination
//   Out: nine 32-bit words stored 16 bytes apart starting at [x0]:
//        r0, r1, s1, r2, s2, r3, s3, r4, s4, where r0..r4 are the
//        26-bit limbs of the value and sN = 5*rN
//   Scratch: x12-x16.  x0 and x4-x6 are left untouched.
.type	poly1305_splat,%function
.align	5
poly1305_splat:
	// base 2^64 -> base 2^26
	and	x12,x4,#0x03ffffff	// limb 0: bits   0..25
	ubfx	x13,x4,#26,#26		// limb 1: bits  26..51
	extr	x14,x5,x4,#52
	and	x14,x14,#0x03ffffff	// limb 2: bits  52..77
	ubfx	x15,x5,#14,#26		// limb 3: bits  78..103
	extr	x16,x6,x5,#40		// limb 4: bits 104 and up

	// store in address order, deriving each 5*limb right after its limb
	str	w12,[x0,#0]		// r0
	str	w13,[x0,#16]		// r1
	add	w12,w13,w13,lsl#2
	str	w12,[x0,#32]		// s1 = 5*r1
	str	w14,[x0,#48]		// r2
	add	w13,w14,w14,lsl#2
	str	w13,[x0,#64]		// s2 = 5*r2
	str	w15,[x0,#80]		// r3
	add	w14,w15,w15,lsl#2
	str	w14,[x0,#96]		// s3 = 5*r3
	str	w16,[x0,#112]		// r4
	add	w15,w16,w16,lsl#2
	str	w15,[x0,#128]		// s4 = 5*r4

	ret
.size	poly1305_splat,.-poly1305_splat
219
// poly1305_blocks_neon -- vectorised block function.
//
// Registers as used below: x0 = context, x1 = input, x2 = length in bytes
// (rounded down to a multiple of 16), x3 = pad bit (added at 2^128 of every
// block).
//
// Context offsets touched here:
//   +0          hash value: three 64-bit words (base 2^64) or five 32-bit
//               limbs (base 2^26)
//   +24         is_base2_26 flag, non-zero when the hash is in base 2^26
//   +32         key r0:r1 in base 2^64
//   +48         nine 16-byte rows r0,r1,s1,r2,s2,r3,s3,r4,s4; 32-bit lanes
//               0..3 of each row hold that limb of r^4,r^3,r^2,r^1
//
// Inputs shorter than 128 bytes with the hash still in base 2^64 are handed
// to poly1305_blocks. Otherwise an odd leading block (if any) is processed
// with the scalar multiplier, the hash is brought to base 2^26, and the
// remaining blocks are consumed four at a time in the vector loop, two
// interleaved streams of two blocks each.
//
// Frame: 80 bytes, x29/x30 at [sp], d8-d15 at [sp,#16]..[sp,#79].
// x30 is reloaded from [sp,#8] after the bl calls, so the epilogue only
// restores x29.
220 .type poly1305_blocks_neon,%function
221 .align 5
222 poly1305_blocks_neon:
223 ldr x17,[x0,#24] // is_base2_26
224 cmp x2,#128
225 b.hs .Lblocks_neon
226 cbz x17,poly1305_blocks // short input, hash in base 2^64: scalar code
227
228 .Lblocks_neon:
229 .inst 0xd503233f // paciasp
230 stp x29,x30,[sp,#-80]!
231 add x29,sp,#0
232
233 ands x2,x2,#-16 // whole blocks only
234 b.eq .Lno_data_neon
235
236 cbz x17,.Lbase2_64_neon
237
238 ldp w10,w11,[x0] // load hash value base 2^26
239 ldp w12,w13,[x0,#8]
240 ldr w14,[x0,#16]
241
242 tst x2,#31 // even number of blocks?
243 b.eq .Leven_neon
244
// Odd number of blocks: convert the hash back to base 2^64 and process
// one block with the scalar multiplier.
245 ldp x7,x8,[x0,#32] // load key value
246
247 add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64
248 lsr x5,x12,#12
249 adds x4,x4,x12,lsl#52
250 add x5,x5,x13,lsl#14
251 adc x5,x5,xzr
252 lsr x6,x14,#24
253 adds x5,x5,x14,lsl#40
254 adc x14,x6,xzr // can be partially reduced...
255
256 ldp x12,x13,[x1],#16 // load input
257 sub x2,x2,#16
258 add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
259
260 and x10,x14,#-4 // ... so reduce
261 and x6,x14,#3
262 add x10,x10,x14,lsr#2
263 adds x4,x4,x10
264 adcs x5,x5,xzr
265 adc x6,x6,xzr
266
267 #ifdef __ARMEB__
268 rev x12,x12
269 rev x13,x13
270 #endif
271 adds x4,x4,x12 // accumulate input
272 adcs x5,x5,x13
273 adc x6,x6,x3
274
275 bl poly1305_mult
276 ldr x30,[sp,#8] // bl clobbered the link register
277
// With a zero pad bit the result is stored in base 2^64, is_base2_26 is
// cleared and the function returns without looking at x2 again.
// NOTE(review): presumably a zero pad bit is only ever passed together
// with a single final block -- confirm against callers.
278 cbz x3,.Lstore_base2_64_neon
279
280 and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
281 ubfx x11,x4,#26,#26
282 extr x12,x5,x4,#52
283 and x12,x12,#0x03ffffff
284 ubfx x13,x5,#14,#26
285 extr x14,x6,x5,#40
286
287 cbnz x2,.Leven_neon // more (an even number of) blocks to go
288
289 stp w10,w11,[x0] // store hash value base 2^26
290 stp w12,w13,[x0,#8]
291 str w14,[x0,#16]
292 b .Lno_data_neon
293
294 .align 4
295 .Lstore_base2_64_neon:
296 stp x4,x5,[x0] // store hash value base 2^64
297 stp x6,xzr,[x0,#16] // note that is_base2_26 is zeroed
298 b .Lno_data_neon
299
300 .align 4
// Hash is still in base 2^64. Because of the entry check this is only
// reached with at least 128 bytes of input.
301 .Lbase2_64_neon:
302 ldp x7,x8,[x0,#32] // load key value
303
304 ldp x4,x5,[x0] // load hash value base 2^64
305 ldr x6,[x0,#16]
306
307 tst x2,#31 // even number of blocks?
308 b.eq .Linit_neon
309
310 ldp x12,x13,[x1],#16 // load input
311 sub x2,x2,#16
312 add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
313 #ifdef __ARMEB__
314 rev x12,x12
315 rev x13,x13
316 #endif
317 adds x4,x4,x12 // accumulate input
318 adcs x5,x5,x13
319 adc x6,x6,x3
320
321 bl poly1305_mult
322
323 .Linit_neon:
324 and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
325 ubfx x11,x4,#26,#26
326 extr x12,x5,x4,#52
327 and x12,x12,#0x03ffffff
328 ubfx x13,x5,#14,#26
329 extr x14,x6,x5,#40
330
331 stp d8,d9,[sp,#16] // meet ABI requirements
332 stp d10,d11,[sp,#32]
333 stp d12,d13,[sp,#48]
334 stp d14,d15,[sp,#64]
335
// v24..v28 carry the five hash limbs h0..h4
336 fmov d24,x10
337 fmov d25,x11
338 fmov d26,x12
339 fmov d27,x13
340 fmov d28,x14
341
342 ////////////////////////////////// initialize r^n table
// x0 is pointed at lane 3 of the first table row (context+48+12) and
// stepped back by four bytes per power, so r^1 lands in lane 3 and r^4
// in lane 0. Each poly1305_mult multiplies x4:x5:x6 by r once more.
343 mov x4,x7 // r^1
344 add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
345 mov x5,x8
346 mov x6,xzr
347 add x0,x0,#48+12
348 bl poly1305_splat
349
350 bl poly1305_mult // r^2
351 sub x0,x0,#4
352 bl poly1305_splat
353
354 bl poly1305_mult // r^3
355 sub x0,x0,#4
356 bl poly1305_splat
357
358 bl poly1305_mult // r^4
359 sub x0,x0,#4
360 bl poly1305_splat
361 ldr x30,[sp,#8] // bl clobbered the link register
362
// x16 = source of inp[2:3]; switched to the zero block at .Lzeros when
// fewer than 64 bytes remain. The flags of this subs are still live at
// the b.ls in front of the main loop.
363 add x16,x1,#32
364 adr x17,.Lzeros
365 subs x2,x2,#64
366 csel x16,x17,x16,lo
367
368 mov x4,#1
369 str x4,[x0,#-24] // set is_base2_26
370 sub x0,x0,#48 // restore original x0
371 b .Ldo_neon
372
373 .align 4
// Entered with the hash limbs h0..h4 in x10..x14 and an even number of
// blocks left in x2.
374 .Leven_neon:
375 add x16,x1,#32
376 adr x17,.Lzeros
377 subs x2,x2,#64 // flags consumed by the b.ls ahead of the loop
378 csel x16,x17,x16,lo
379
380 stp d8,d9,[sp,#16] // meet ABI requirements
381 stp d10,d11,[sp,#32]
382 stp d12,d13,[sp,#48]
383 stp d14,d15,[sp,#64]
384
385 fmov d24,x10
386 fmov d25,x11
387 fmov d26,x12
388 fmov d27,x13
389 fmov d28,x14
390
// Split two blocks at a time into 26-bit limbs. For each limb the first
// block goes to the low and the second block to the high 32-bit lane.
// v14..v18 receive inp[2:3], v9..v13 receive inp[0:1].
391 .Ldo_neon:
392 ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
393 ldp x9,x13,[x16],#48
394
395 lsl x3,x3,#24 // pad bit positioned for the top limb (2^104 * 2^24 = 2^128)
396 add x15,x0,#48 // x15 -> table of powers of r
397
398 #ifdef __ARMEB__
399 rev x8,x8
400 rev x12,x12
401 rev x9,x9
402 rev x13,x13
403 #endif
404 and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
405 and x5,x9,#0x03ffffff
406 ubfx x6,x8,#26,#26
407 ubfx x7,x9,#26,#26
408 add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
409 extr x8,x12,x8,#52
410 extr x9,x13,x9,#52
411 add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
412 fmov d14,x4
413 and x8,x8,#0x03ffffff
414 and x9,x9,#0x03ffffff
415 ubfx x10,x12,#14,#26
416 ubfx x11,x13,#14,#26
417 add x12,x3,x12,lsr#40 // top limb plus pad bit
418 add x13,x3,x13,lsr#40
419 add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
420 fmov d15,x6
421 add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
422 add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
423 fmov d16,x8
424 fmov d17,x10
425 fmov d18,x12
426
427 ldp x8,x12,[x1],#16 // inp[0:1]
428 ldp x9,x13,[x1],#48
429
// v0..v8 = table rows r0,r1,s1,r2,s2,r3,s3,r4,s4
430 ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
431 ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
432 ld1 {v8.4s},[x15]
433
434 #ifdef __ARMEB__
435 rev x8,x8
436 rev x12,x12
437 rev x9,x9
438 rev x13,x13
439 #endif
440 and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
441 and x5,x9,#0x03ffffff
442 ubfx x6,x8,#26,#26
443 ubfx x7,x9,#26,#26
444 add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
445 extr x8,x12,x8,#52
446 extr x9,x13,x9,#52
447 add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
448 fmov d9,x4
449 and x8,x8,#0x03ffffff
450 and x9,x9,#0x03ffffff
451 ubfx x10,x12,#14,#26
452 ubfx x11,x13,#14,#26
453 add x12,x3,x12,lsr#40
454 add x13,x3,x13,lsr#40
455 add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
456 fmov d10,x6
457 add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
458 add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
459 movi v31.2d,#-1
460 fmov d11,x8
461 fmov d12,x10
462 fmov d13,x12
463 ushr v31.2d,v31.2d,#38 // v31 = 0x03ffffff in each 64-bit lane
464
// Flags still come from the "subs x2,x2,#64" above (nothing in between
// writes them): skip the loop when at most 64 bytes were left.
465 b.ls .Lskip_loop
466
467 .align 4
468 .Loop_neon:
469 ////////////////////////////////////////////////////////////////
470 // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
471 // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
472 //   \___________________/
473 // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
474 // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
475 //   \___________________/ \____________________/
476 //
477 // Note that we start with inp[2:3]*r^2. This is because it
478 // doesn't depend on reduction in previous iteration.
479 ////////////////////////////////////////////////////////////////
480 // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
481 // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
482 // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
483 // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
484 // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
485
// v19..v23 accumulate d0..d4. Lane [2] of the table rows is r^2, lane [0]
// is r^4. The scalar instructions woven in between load and split the
// next four blocks. The flags set by this subs are consumed by the b.hi
// at the bottom of the loop, so nothing in between may write them.
486 subs x2,x2,#64
487 umull v23.2d,v14.2s,v7.s[2]
488 csel x16,x17,x16,lo
489 umull v22.2d,v14.2s,v5.s[2]
490 umull v21.2d,v14.2s,v3.s[2]
491 ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
492 umull v20.2d,v14.2s,v1.s[2]
493 ldp x9,x13,[x16],#48
494 umull v19.2d,v14.2s,v0.s[2]
495 #ifdef __ARMEB__
496 rev x8,x8
497 rev x12,x12
498 rev x9,x9
499 rev x13,x13
500 #endif
501
502 umlal v23.2d,v15.2s,v5.s[2]
503 and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
504 umlal v22.2d,v15.2s,v3.s[2]
505 and x5,x9,#0x03ffffff
506 umlal v21.2d,v15.2s,v1.s[2]
507 ubfx x6,x8,#26,#26
508 umlal v20.2d,v15.2s,v0.s[2]
509 ubfx x7,x9,#26,#26
510 umlal v19.2d,v15.2s,v8.s[2]
511 add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
512
513 umlal v23.2d,v16.2s,v3.s[2]
514 extr x8,x12,x8,#52
515 umlal v22.2d,v16.2s,v1.s[2]
516 extr x9,x13,x9,#52
517 umlal v21.2d,v16.2s,v0.s[2]
518 add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
519 umlal v20.2d,v16.2s,v8.s[2]
520 fmov d14,x4
521 umlal v19.2d,v16.2s,v6.s[2]
522 and x8,x8,#0x03ffffff
523
524 umlal v23.2d,v17.2s,v1.s[2]
525 and x9,x9,#0x03ffffff
526 umlal v22.2d,v17.2s,v0.s[2]
527 ubfx x10,x12,#14,#26
528 umlal v21.2d,v17.2s,v8.s[2]
529 ubfx x11,x13,#14,#26
530 umlal v20.2d,v17.2s,v6.s[2]
531 add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
532 umlal v19.2d,v17.2s,v4.s[2]
533 fmov d15,x6
534
535 add v11.2s,v11.2s,v26.2s
536 add x12,x3,x12,lsr#40
537 umlal v23.2d,v18.2s,v0.s[2]
538 add x13,x3,x13,lsr#40
539 umlal v22.2d,v18.2s,v8.s[2]
540 add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
541 umlal v21.2d,v18.2s,v6.s[2]
542 add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
543 umlal v20.2d,v18.2s,v4.s[2]
544 fmov d16,x8
545 umlal v19.2d,v18.2s,v2.s[2]
546 fmov d17,x10
547
548 ////////////////////////////////////////////////////////////////
549 // (hash+inp[0:1])*r^4 and accumulate
550
551 add v9.2s,v9.2s,v24.2s
552 fmov d18,x12
553 umlal v22.2d,v11.2s,v1.s[0]
554 ldp x8,x12,[x1],#16 // inp[0:1]
555 umlal v19.2d,v11.2s,v6.s[0]
556 ldp x9,x13,[x1],#48
557 umlal v23.2d,v11.2s,v3.s[0]
558 umlal v20.2d,v11.2s,v8.s[0]
559 umlal v21.2d,v11.2s,v0.s[0]
560 #ifdef __ARMEB__
561 rev x8,x8
562 rev x12,x12
563 rev x9,x9
564 rev x13,x13
565 #endif
566
567 add v10.2s,v10.2s,v25.2s
568 umlal v22.2d,v9.2s,v5.s[0]
569 umlal v23.2d,v9.2s,v7.s[0]
570 and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
571 umlal v21.2d,v9.2s,v3.s[0]
572 and x5,x9,#0x03ffffff
573 umlal v19.2d,v9.2s,v0.s[0]
574 ubfx x6,x8,#26,#26
575 umlal v20.2d,v9.2s,v1.s[0]
576 ubfx x7,x9,#26,#26
577
578 add v12.2s,v12.2s,v27.2s
579 add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
580 umlal v22.2d,v10.2s,v3.s[0]
581 extr x8,x12,x8,#52
582 umlal v23.2d,v10.2s,v5.s[0]
583 extr x9,x13,x9,#52
584 umlal v19.2d,v10.2s,v8.s[0]
585 add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
586 umlal v21.2d,v10.2s,v1.s[0]
587 fmov d9,x4
588 umlal v20.2d,v10.2s,v0.s[0]
589 and x8,x8,#0x03ffffff
590
591 add v13.2s,v13.2s,v28.2s
592 and x9,x9,#0x03ffffff
593 umlal v22.2d,v12.2s,v0.s[0]
594 ubfx x10,x12,#14,#26
595 umlal v19.2d,v12.2s,v4.s[0]
596 ubfx x11,x13,#14,#26
597 umlal v23.2d,v12.2s,v1.s[0]
598 add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
599 umlal v20.2d,v12.2s,v6.s[0]
600 fmov d10,x6
601 umlal v21.2d,v12.2s,v8.s[0]
602 add x12,x3,x12,lsr#40
603
604 umlal v22.2d,v13.2s,v8.s[0]
605 add x13,x3,x13,lsr#40
606 umlal v19.2d,v13.2s,v2.s[0]
607 add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
608 umlal v23.2d,v13.2s,v0.s[0]
609 add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
610 umlal v20.2d,v13.2s,v4.s[0]
611 fmov d11,x8
612 umlal v21.2d,v13.2s,v6.s[0]
613 fmov d12,x10
614 fmov d13,x12
615
616 /////////////////////////////////////////////////////////////////
617 // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
618 // and P. Schwabe
619 //
620 // [see discussion in poly1305-armv4 module]
621
// Carries are propagated between the 64-bit accumulators and the result
// is narrowed back into the 32-bit hash limbs v24..v28.
622 ushr v29.2d,v22.2d,#26
623 xtn v27.2s,v22.2d
624 ushr v30.2d,v19.2d,#26
625 and v19.16b,v19.16b,v31.16b
626 add v23.2d,v23.2d,v29.2d // h3 -> h4
627 bic v27.2s,#0xfc,lsl#24 // &=0x03ffffff
628 add v20.2d,v20.2d,v30.2d // h0 -> h1
629
630 ushr v29.2d,v23.2d,#26
631 xtn v28.2s,v23.2d
632 ushr v30.2d,v20.2d,#26
633 xtn v25.2s,v20.2d
634 bic v28.2s,#0xfc,lsl#24
635 add v21.2d,v21.2d,v30.2d // h1 -> h2
636
637 add v19.2d,v19.2d,v29.2d
638 shl v29.2d,v29.2d,#2
639 shrn v30.2s,v21.2d,#26
640 xtn v26.2s,v21.2d
641 add v19.2d,v19.2d,v29.2d // h4 -> h0
642 bic v25.2s,#0xfc,lsl#24
643 add v27.2s,v27.2s,v30.2s // h2 -> h3
644 bic v26.2s,#0xfc,lsl#24
645
646 shrn v29.2s,v19.2d,#26
647 xtn v24.2s,v19.2d
648 ushr v30.2s,v27.2s,#26
649 bic v27.2s,#0xfc,lsl#24
650 bic v24.2s,#0xfc,lsl#24
651 add v25.2s,v25.2s,v29.2s // h0 -> h1
652 add v28.2s,v28.2s,v30.2s // h3 -> h4
653
654 b.hi .Loop_neon // flags from the subs at the top of the loop
655
656 .Lskip_loop:
657 dup v16.2d,v16.d[0]
658 add v11.2s,v11.2s,v26.2s
659
660 ////////////////////////////////////////////////////////////////
661 // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
662
// x2 becomes zero here when only inp[0:1] is left (32 bytes). In that
// case hash+inp[0:1] takes the place of inp[2:3] below and the r^4:r^3
// step is skipped. The flags of this adds are tested twice: by the b.ne
// right below and by the b.eq that follows the r^2:r^1 multiplication.
663 adds x2,x2,#32
664 b.ne .Long_tail
665
666 dup v16.2d,v11.d[0]
667 add v14.2s,v9.2s,v24.2s
668 add v17.2s,v12.2s,v27.2s
669 add v15.2s,v10.2s,v25.2s
670 add v18.2s,v13.2s,v28.2s
671
672 .Long_tail:
// The data is duplicated into both 64-bit halves and the "2" forms of the
// multiplies read the upper halves, i.e. table lanes 2 and 3 = r^2:r^1.
673 dup v14.2d,v14.d[0]
674 umull2 v19.2d,v16.4s,v6.4s
675 umull2 v22.2d,v16.4s,v1.4s
676 umull2 v23.2d,v16.4s,v3.4s
677 umull2 v21.2d,v16.4s,v0.4s
678 umull2 v20.2d,v16.4s,v8.4s
679
680 dup v15.2d,v15.d[0]
681 umlal2 v19.2d,v14.4s,v0.4s
682 umlal2 v21.2d,v14.4s,v3.4s
683 umlal2 v22.2d,v14.4s,v5.4s
684 umlal2 v23.2d,v14.4s,v7.4s
685 umlal2 v20.2d,v14.4s,v1.4s
686
687 dup v17.2d,v17.d[0]
688 umlal2 v19.2d,v15.4s,v8.4s
689 umlal2 v22.2d,v15.4s,v3.4s
690 umlal2 v21.2d,v15.4s,v1.4s
691 umlal2 v23.2d,v15.4s,v5.4s
692 umlal2 v20.2d,v15.4s,v0.4s
693
694 dup v18.2d,v18.d[0]
695 umlal2 v22.2d,v17.4s,v0.4s
696 umlal2 v23.2d,v17.4s,v1.4s
697 umlal2 v19.2d,v17.4s,v4.4s
698 umlal2 v20.2d,v17.4s,v6.4s
699 umlal2 v21.2d,v17.4s,v8.4s
700
701 umlal2 v22.2d,v18.4s,v8.4s
702 umlal2 v19.2d,v18.4s,v2.4s
703 umlal2 v23.2d,v18.4s,v0.4s
704 umlal2 v20.2d,v18.4s,v4.4s
705 umlal2 v21.2d,v18.4s,v6.4s
706
707 b.eq .Lshort_tail // flags still from "adds x2,x2,#32"
708
709 ////////////////////////////////////////////////////////////////
710 // (hash+inp[0:1])*r^4:r^3 and accumulate
711
712 add v9.2s,v9.2s,v24.2s
713 umlal v22.2d,v11.2s,v1.2s
714 umlal v19.2d,v11.2s,v6.2s
715 umlal v23.2d,v11.2s,v3.2s
716 umlal v20.2d,v11.2s,v8.2s
717 umlal v21.2d,v11.2s,v0.2s
718
719 add v10.2s,v10.2s,v25.2s
720 umlal v22.2d,v9.2s,v5.2s
721 umlal v19.2d,v9.2s,v0.2s
722 umlal v23.2d,v9.2s,v7.2s
723 umlal v20.2d,v9.2s,v1.2s
724 umlal v21.2d,v9.2s,v3.2s
725
726 add v12.2s,v12.2s,v27.2s
727 umlal v22.2d,v10.2s,v3.2s
728 umlal v19.2d,v10.2s,v8.2s
729 umlal v23.2d,v10.2s,v5.2s
730 umlal v20.2d,v10.2s,v0.2s
731 umlal v21.2d,v10.2s,v1.2s
732
733 add v13.2s,v13.2s,v28.2s
734 umlal v22.2d,v12.2s,v0.2s
735 umlal v19.2d,v12.2s,v4.2s
736 umlal v23.2d,v12.2s,v1.2s
737 umlal v20.2d,v12.2s,v6.2s
738 umlal v21.2d,v12.2s,v8.2s
739
740 umlal v22.2d,v13.2s,v8.2s
741 umlal v19.2d,v13.2s,v2.2s
742 umlal v23.2d,v13.2s,v0.2s
743 umlal v20.2d,v13.2s,v4.2s
744 umlal v21.2d,v13.2s,v6.2s
745
746 .Lshort_tail:
747 ////////////////////////////////////////////////////////////////
748 // horizontal add
749
// The two 64-bit lanes of every accumulator are summed; the reloads of
// d8-d15 from the frame are interleaved with the additions.
750 addp v22.2d,v22.2d,v22.2d
751 ldp d8,d9,[sp,#16] // meet ABI requirements
752 addp v19.2d,v19.2d,v19.2d
753 ldp d10,d11,[sp,#32]
754 addp v23.2d,v23.2d,v23.2d
755 ldp d12,d13,[sp,#48]
756 addp v20.2d,v20.2d,v20.2d
757 ldp d14,d15,[sp,#64]
758 addp v21.2d,v21.2d,v21.2d
759
760 ////////////////////////////////////////////////////////////////
761 // lazy reduction, but without narrowing
762
763 ushr v29.2d,v22.2d,#26
764 and v22.16b,v22.16b,v31.16b
765 ushr v30.2d,v19.2d,#26
766 and v19.16b,v19.16b,v31.16b
767
768 add v23.2d,v23.2d,v29.2d // h3 -> h4
769 add v20.2d,v20.2d,v30.2d // h0 -> h1
770
771 ushr v29.2d,v23.2d,#26
772 and v23.16b,v23.16b,v31.16b
773 ushr v30.2d,v20.2d,#26
774 and v20.16b,v20.16b,v31.16b
775 add v21.2d,v21.2d,v30.2d // h1 -> h2
776
777 add v19.2d,v19.2d,v29.2d
778 shl v29.2d,v29.2d,#2
779 ushr v30.2d,v21.2d,#26
780 and v21.16b,v21.16b,v31.16b
781 add v19.2d,v19.2d,v29.2d // h4 -> h0
782 add v22.2d,v22.2d,v30.2d // h2 -> h3
783
784 ushr v29.2d,v19.2d,#26
785 and v19.16b,v19.16b,v31.16b
786 ushr v30.2d,v22.2d,#26
787 and v22.16b,v22.16b,v31.16b
788 add v20.2d,v20.2d,v29.2d // h0 -> h1
789 add v23.2d,v23.2d,v30.2d // h3 -> h4
790
791 ////////////////////////////////////////////////////////////////
792 // write the result, can be partially reduced
793
// h0..h3 go to context+0..+15 as 32-bit limbs, h4 to context+16
// (x0 is advanced by 16 by the first store).
794 st4 {v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
795 st1 {v23.s}[0],[x0]
796
797 .Lno_data_neon:
798 .inst 0xd50323bf // autiasp
799 ldr x29,[sp],#80 // restore x29 and release the 80-byte frame
800 ret
801 .size poly1305_blocks_neon,.-poly1305_blocks_neon
802
// poly1305_emit_neon -- tag output for a hash that may be in base 2^26.
//   In:  x0 = context, x1 = output (16 bytes), x2 = nonce (16 bytes)
//   Out: [x1] = low 128 bits of ((h mod 2^130-5) + nonce)
//
// When is_base2_26 (context+24) is zero the hash is already in base 2^64
// and the work is left to poly1305_emit. Otherwise the five 32-bit limbs
// are recombined into three 64-bit words, partially reduced, and then
// finished exactly as poly1305_emit does.
.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
	ldr	x17,[x0,#24]		// is_base2_26
	cbz	x17,poly1305_emit	// base 2^64: scalar emit handles it

	ldp	w10,w11,[x0]		// h0, h1 (26-bit limbs, zero-extended)
	ldp	w12,w13,[x0,#8]		// h2, h3
	ldr	w14,[x0,#16]		// h4

	// base 2^26 -> base 2^64
	lsr	x5,x12,#12		// part of h2 that belongs to word 1
	add	x5,x5,x13,lsl#14	// ... plus h3
	lsr	x6,x14,#24		// part of h4 that belongs to word 2
	add	x4,x10,x11,lsl#26	// word 0 = h0 + h1<<26 ...
	adds	x4,x4,x12,lsl#52	// ... + h2<<52
	adc	x5,x5,xzr
	adds	x5,x5,x14,lsl#40	// word 1 += h4<<40
	adc	x6,x6,xzr		// can be partially reduced...

	and	x12,x6,#-4		// ... so reduce
	add	x12,x12,x6,lsr#2	// 5 * (word 2 >> 2)
	and	x6,x6,#3
	adds	x4,x4,x12
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

	adds	x12,x4,#5		// x12:x13:x14 = h + 5
	adcs	x13,x5,xzr
	adc	x14,x6,xzr
	tst	x14,#-4			// ne <=> h + 5 >= 2^130

	ldp	x10,x11,[x2]		// nonce (loads leave the flags alone)
	csel	x4,x12,x4,ne		// pick h+5 when h >= 2^130-5
	csel	x5,x13,x5,ne

#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// tag = h + nonce (mod 2^128)
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit_neon,.-poly1305_emit_neon
854
.align	5
// 32 zero bytes; poly1305_blocks_neon reads them in place of inp[2:3]
// when fewer than 64 bytes of input remain.
.Lzeros:
.long	0,0,0,0
.long	0,0,0,0
// Distance from this slot to OPENSSL_armcap_P; poly1305_init adds it to
// the slot's own address to reach the capability word.
.LOPENSSL_armcap_P:
#ifdef	__ILP32__
.long	OPENSSL_armcap_P-.
#else
.quad	OPENSSL_armcap_P-.
#endif
// "Poly1305 for ARMv8, CRYPTOGAMS by <appro@openssl.org>", NUL-terminated
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
867