poly1305-armv8.S revision 1.1 1 #include "arm_arch.h"
2
3 .text
4
5 // forward "declarations" are required for Apple
6
7 .globl poly1305_blocks
8 .globl poly1305_emit
9
//
// poly1305_init: reset a Poly1305 context, install the clamped key half r
// and hand back the block/emit routines that suit this CPU.
//
// In:  x0 = context. Offsets 0..31 are zeroed (hash value and the
//           is_base2_26 flag); offsets 32..47 receive r0,r1.
//      x1 = 16 bytes of key material, or NULL.
//      x2 = two-entry table that receives the addresses of the block
//           routine and the emit routine (32-bit entries under __ILP32__).
// Out: x0 = 0 if x1 was NULL (context zeroed, table left untouched),
//      x0 = 1 otherwise.
// Clobbers: x4-x6, x10-x15, x17, flags. Leaf routine, no stack use.
//
.globl	poly1305_init
.type	poly1305_init,%function
.align	5
poly1305_init:
	stp	xzr,xzr,[x0]		// zero hash value
	stp	xzr,xzr,[x0,#16]	// [along with is_base2_26]

	cmp	x1,#0			// no key supplied?
	csel	x0,xzr,x0,eq		// then the return value is 0 ...
	b.eq	.Lno_key		// ... and there is nothing more to do

	// Clamp r: r0 &= 0x0ffffffc0fffffff, r1 &= 0x0ffffffc0ffffffc.
	ldp	x4,x5,[x1]		// load key
	mov	x6,#0xfffffffc0fffffff
	movk	x6,#0x0fff,lsl#48	// x6 = 0x0ffffffc0fffffff
#ifdef	__ARMEB__
	rev	x4,x4			// flip bytes
	rev	x5,x5
#endif
	and	x4,x4,x6
	and	x6,x6,#-4		// r1 additionally loses its two low bits
	and	x5,x5,x6
	stp	x4,x5,[x0,#32]		// save key value

	// Fetch the capability word. .LOPENSSL_armcap_P holds the distance
	// from itself to OPENSSL_armcap_P, so label address + stored offset
	// gives the variable's address.
	adr	x10,.LOPENSSL_armcap_P
#ifdef	__ILP32__
	ldrsw	x11,.LOPENSSL_armcap_P
#else
	ldr	x11,.LOPENSSL_armcap_P
#endif
	ldr	w17,[x10,x11]

	// Pick the scalar pair when the NEON bit is clear, the NEON pair
	// otherwise.
	tst	w17,#ARMV7_NEON
	adr	x12,poly1305_blocks
	adr	x13,poly1305_emit
	adr	x14,poly1305_blocks_neon
	adr	x15,poly1305_emit_neon
	csel	x12,x12,x14,eq
	csel	x13,x13,x15,eq

#ifdef	__ILP32__
	stp	w12,w13,[x2]
#else
	stp	x12,x13,[x2]
#endif

	mov	x0,#1
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init
61
//
// poly1305_blocks: absorb whole 16-byte blocks into a hash that is kept as
// three 64-bit limbs (base 2^64).
//
// In:  x0 = context (hash limbs at offsets 0..23, clamped key r0,r1 at
//           offsets 32..47)
//      x1 = input
//      x2 = length in bytes; rounded down to a multiple of 16 here
//      x3 = value added to the top hash limb together with every block,
//           i.e. weighted 2^128 (presumably the pad bit - confirm against
//           the caller)
// Out: hash limbs at offsets 0..23 updated. Nothing is loaded or stored
//      when the rounded length is zero.
// Clobbers: x1, x2, x4-x14, flags. Leaf routine, no stack use.
//
// Per block: h = (h + block + x3*2^128) * r, followed by a partial
// reduction modulo 2^130-5.
//
// The instruction order inside .Loop interleaves multiplies with the
// flag-carried add chains; mul/umulh do not touch the flags, adds/adcs/adc
// do, so do not reorder without re-checking every carry.
//
62 .type poly1305_blocks,%function
63 .align 5
64 poly1305_blocks:
65 ands x2,x2,#-16 // keep whole 16-byte blocks only
66 b.eq .Lno_data
67
68 ldp x4,x5,[x0] // load hash value
69 ldp x7,x8,[x0,#32] // load key value
70 ldr x6,[x0,#16]
// poly1305_init cleared the two low bits of r1, so r1>>2 is exact and
// s1 = 5*(r1/4). It replaces r1 wherever a product lands 128 bits higher
// than its accumulator: 2^128*r1 = 2^130*(r1/4), and 2^130 is congruent
// to 5 modulo 2^130-5.
71 add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
72 b .Loop
73
74 .align 5
75 .Loop:
76 ldp x10,x11,[x1],#16 // load input
77 sub x2,x2,#16
78 #ifdef __ARMEB__
79 rev x10,x10
80 rev x11,x11
81 #endif
82 adds x4,x4,x10 // accumulate input
83 adcs x5,x5,x11
84
// The carry out of adcs above is consumed by the adc two lines below; the
// mul in between leaves the flags alone.
85 mul x12,x4,x7 // h0*r0
86 adc x6,x6,x3
87 umulh x13,x4,x7
88
89 mul x10,x5,x9 // h1*5*r1
90 umulh x11,x5,x9
91
// Accumulators from here on: x12 = d0, x13 = d1, x14 = d2.
92 adds x12,x12,x10
93 mul x10,x4,x8 // h0*r1
94 adc x13,x13,x11
95 umulh x14,x4,x8
96
97 adds x13,x13,x10
98 mul x10,x5,x7 // h1*r0
99 adc x14,x14,xzr
100 umulh x11,x5,x7
101
102 adds x13,x13,x10
103 mul x10,x6,x9 // h2*5*r1
104 adc x14,x14,x11
105 mul x11,x6,x7 // h2*r0
106
107 adds x13,x13,x10
108 adc x14,x14,x11
109
// Bits 2 and up of d2 are multiples of 2^130, each worth 5 modulo 2^130-5:
// (d2 & ~3) + (d2 >> 2) = 5*(d2 >> 2) is added back at the bottom and only
// the two low bits of d2 stay in h2.
110 and x10,x14,#-4 // final reduction
111 and x6,x14,#3
112 add x10,x10,x14,lsr#2
113 adds x4,x12,x10
114 adcs x5,x13,xzr
115 adc x6,x6,xzr
116
117 cbnz x2,.Loop
118
119 stp x4,x5,[x0] // store hash value
120 str x6,[x0,#16]
121
122 .Lno_data:
123 ret
124 .size poly1305_blocks,.-poly1305_blocks
125
//
// poly1305_emit: turn a hash held in base 2^64 into the 16-byte tag.
//
// In:  x0 = context (hash limbs h0,h1,h2 at offsets 0, 8, 16)
//      x1 = 16-byte output buffer
//      x2 = 16-byte nonce
// Clobbers: x4-x9, x12, x13, flags. Leaf routine, no stack use.
//
// h + 5 is computed across all three limbs. If that sum has any bit set at
// 2^130 or above, its low 128 bits are used in place of h (this is
// h - (2^130-5) modulo 2^128); otherwise h is used as is. The nonce is then
// added modulo 2^128 and the result stored.
//
.type	poly1305_emit,%function
.align	5
poly1305_emit:
	ldp	x4,x5,[x0]		// load hash base 2^64
	ldr	x6,[x0,#16]

	adds	x7,x4,#5		// compare to modulus
	adcs	x8,x5,xzr
	adc	x9,x6,xzr

	ldp	x12,x13,[x2]		// load nonce
#ifdef	__ARMEB__
	ror	x12,x12,#32		// flip nonce words
	ror	x13,x13,#32
#endif

	tst	x9,#-4			// see if it's carried/borrowed
	csel	x4,x4,x7,eq		// eq: nothing at or above 2^130, keep h
	csel	x5,x5,x8,eq		// ne: take the low 128 bits of h + 5

	adds	x4,x4,x12		// accumulate nonce
	adc	x5,x5,x13
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit
//
// poly1305_mult: internal helper, h = h * r with a partial reduction
// modulo 2^130-5. It is the multiply/reduce part of the poly1305_blocks
// loop body as a bl-callable routine and uses a private register contract
// rather than the C calling convention.
//
// In:  x4,x5,x6 = h0,h1,h2 (base 2^64)
//      x7,x8    = r0,r1
//      x9       = s1 = r1 + (r1 >> 2)
// Out: x4,x5,x6 = product, partially reduced
// Clobbers: x10-x14, flags. x0-x3 and x7-x9 are left untouched, which the
// back-to-back calls in poly1305_blocks_neon depend on.
// Leaf routine, no stack use; returns through x30.
//
156 .type poly1305_mult,%function
157 .align 5
158 poly1305_mult:
159 mul x12,x4,x7 // h0*r0
160 umulh x13,x4,x7
161
162 mul x10,x5,x9 // h1*5*r1
163 umulh x11,x5,x9
164
// Accumulators from here on: x12 = d0, x13 = d1, x14 = d2. The multiplies
// slotted between adds and adc do not affect the carry flag.
165 adds x12,x12,x10
166 mul x10,x4,x8 // h0*r1
167 adc x13,x13,x11
168 umulh x14,x4,x8
169
170 adds x13,x13,x10
171 mul x10,x5,x7 // h1*r0
172 adc x14,x14,xzr
173 umulh x11,x5,x7
174
175 adds x13,x13,x10
176 mul x10,x6,x9 // h2*5*r1
177 adc x14,x14,x11
178 mul x11,x6,x7 // h2*r0
179
180 adds x13,x13,x10
181 adc x14,x14,x11
182
// (d2 & ~3) + (d2 >> 2) = 5*(d2 >> 2): everything at 2^130 and above is
// folded back into the low limb, two bits of d2 remain in h2.
183 and x10,x14,#-4 // final reduction
184 and x6,x14,#3
185 add x10,x10,x14,lsr#2
186 adds x4,x12,x10
187 adcs x5,x13,xzr
188 adc x6,x6,xzr
189
190 ret
191 .size poly1305_mult,.-poly1305_mult
192
//
// poly1305_splat: internal helper. Splits the value in x4,x5,x6 (base 2^64)
// into five base 2^26 limbs r0..r4 and stores them, together with
// s1..s4 = 5*r1..5*r4, as nine 32-bit words spaced 16 bytes apart,
// starting at x0. Row order: r0, r1, s1, r2, s2, r3, s3, r4, s4.
//
// poly1305_blocks_neon calls this four times, stepping x0 down by 4 bytes
// between calls, so every 16-byte row ends up holding the same limb of four
// successive powers of r.
//
// In:  x0 = address of the first word to write
//      x4,x5,x6 = value to split
// Out: nine 32-bit stores; x0 and x4-x6 unchanged
// Clobbers: x12-x16 (on return w12..w15 = s1..s4, x16 = limb 4).
// Leaf routine, no stack use; returns through x30.
//
.type	poly1305_splat,%function
.align	5
poly1305_splat:
	// base 2^64 -> base 2^26
	extr	x16,x6,x5,#40		// limb 4: bits 104 and up, not masked
	ubfx	x15,x5,#14,#26		// limb 3: bits 78..103
	extr	x14,x5,x4,#52
	and	x14,x14,#0x03ffffff	// limb 2: bits 52..77
	ubfx	x13,x4,#26,#26		// limb 1: bits 26..51
	and	x12,x4,#0x03ffffff	// limb 0: bits 0..25

	// the plain limbs go out first ...
	str	w12,[x0,#16*0]		// r0
	str	w13,[x0,#16*1]		// r1
	str	w14,[x0,#16*3]		// r2
	str	w15,[x0,#16*5]		// r3
	str	w16,[x0,#16*7]		// r4

	// ... then each limb times 5, formed as x + (x << 2)
	add	w12,w13,w13,lsl#2	// r1*5
	add	w13,w14,w14,lsl#2	// r2*5
	add	w14,w15,w15,lsl#2	// r3*5
	add	w15,w16,w16,lsl#2	// r4*5
	str	w12,[x0,#16*2]		// s1
	str	w13,[x0,#16*4]		// s2
	str	w14,[x0,#16*6]		// s3
	str	w15,[x0,#16*8]		// s4

	ret
.size	poly1305_splat,.-poly1305_splat
219
//
// poly1305_blocks_neon: NEON block routine; keeps the hash as five 32-bit
// limbs (base 2^26) once it has taken over from the scalar code.
//
// In:  x0 = context, x1 = input, x2 = length in bytes (rounded down to a
//      multiple of 16 here), x3 = value added at weight 2^128 to every
//      block (presumably the pad bit - confirm against the caller)
//
// Context offsets touched here:
//      0..23   hash: three 64-bit limbs while the flag at offset 24 is
//              zero, five 32-bit limbs (offsets 0..19) once it is non-zero
//      24      is_base2_26 flag
//      32..47  key r0,r1
//      48..    nine 16-byte rows written through poly1305_splat, holding
//              the base 2^26 limbs of r^1..r^4 and their multiples of 5
//
// Dispatch: an input shorter than 128 bytes arriving while the flag is
// still zero is handed to poly1305_blocks unchanged.
//
// Stack: one 80-byte frame, x29/x30 at [sp] and d8-d15 at [sp,#16..79].
// x30 is reloaded from [sp,#8] right after the last internal bl, which is
// why the epilogue at .Lno_data_neon restores only x29.
//
// Several conditional branches consume flags set many instructions earlier
// (see the notes at .Ldo_neon, .Loop_neon and .Lskip_loop); nothing in
// between may write the flags.
//
220 .type poly1305_blocks_neon,%function
221 .align 5
222 poly1305_blocks_neon:
223 ldr x17,[x0,#24] // is_base2_26
224 cmp x2,#128
225 b.hs .Lblocks_neon
226 cbz x17,poly1305_blocks
227
228 .Lblocks_neon:
229 stp x29,x30,[sp,#-80]!
230 add x29,sp,#0
231
232 ands x2,x2,#-16
233 b.eq .Lno_data_neon
234
235 cbz x17,.Lbase2_64_neon
236
237 ldp w10,w11,[x0] // load hash value base 2^26
238 ldp w12,w13,[x0,#8]
239 ldr w14,[x0,#16]
240
241 tst x2,#31
242 b.eq .Leven_neon
243
// Odd number of blocks with a base 2^26 hash: convert the hash back to
// base 2^64, absorb one block with the scalar multiply, then either carry
// on in NEON or store and leave.
244 ldp x7,x8,[x0,#32] // load key value
245
246 add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64
247 lsr x5,x12,#12
248 adds x4,x4,x12,lsl#52
249 add x5,x5,x13,lsl#14
250 adc x5,x5,xzr
251 lsr x6,x14,#24
252 adds x5,x5,x14,lsl#40
253 adc x14,x6,xzr // can be partially reduced...
254
255 ldp x12,x13,[x1],#16 // load input
256 sub x2,x2,#16
257 add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
258
259 and x10,x14,#-4 // ... so reduce
260 and x6,x14,#3
261 add x10,x10,x14,lsr#2
262 adds x4,x4,x10
263 adcs x5,x5,xzr
264 adc x6,x6,xzr
265
266 #ifdef __ARMEB__
267 rev x12,x12
268 rev x13,x13
269 #endif
270 adds x4,x4,x12 // accumulate input
271 adcs x5,x5,x13
272 adc x6,x6,x3
273
274 bl poly1305_mult
275 ldr x30,[sp,#8] // bl overwrote the return address
276
277 cbz x3,.Lstore_base2_64_neon
278
279 and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
280 ubfx x11,x4,#26,#26
281 extr x12,x5,x4,#52
282 and x12,x12,#0x03ffffff
283 ubfx x13,x5,#14,#26
284 extr x14,x6,x5,#40
285
286 cbnz x2,.Leven_neon
287
288 stp w10,w11,[x0] // store hash value base 2^26
289 stp w12,w13,[x0,#8]
290 str w14,[x0,#16]
291 b .Lno_data_neon
292
293 .align 4
294 .Lstore_base2_64_neon:
295 stp x4,x5,[x0] // store hash value base 2^64
296 stp x6,xzr,[x0,#16] // note that is_base2_26 is zeroed
297 b .Lno_data_neon
298
299 .align 4
300 .Lbase2_64_neon:
// First NEON call for this context: the hash is still base 2^64 and the
// table of powers of r has not been written yet.
301 ldp x7,x8,[x0,#32] // load key value
302
303 ldp x4,x5,[x0] // load hash value base 2^64
304 ldr x6,[x0,#16]
305
306 tst x2,#31
307 b.eq .Linit_neon
308
309 ldp x12,x13,[x1],#16 // load input
310 sub x2,x2,#16
311 add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
312 #ifdef __ARMEB__
313 rev x12,x12
314 rev x13,x13
315 #endif
316 adds x4,x4,x12 // accumulate input
317 adcs x5,x5,x13
318 adc x6,x6,x3
319
320 bl poly1305_mult
321
322 .Linit_neon:
323 and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
324 ubfx x11,x4,#26,#26
325 extr x12,x5,x4,#52
326 and x12,x12,#0x03ffffff
327 ubfx x13,x5,#14,#26
328 extr x14,x6,x5,#40
329
330 stp d8,d9,[sp,#16] // meet ABI requirements
331 stp d10,d11,[sp,#32]
332 stp d12,d13,[sp,#48]
333 stp d14,d15,[sp,#64]
334
// v24..v28 carry the five hash limbs from here on.
335 fmov d24,x10
336 fmov d25,x11
337 fmov d26,x12
338 fmov d27,x13
339 fmov d28,x14
340
// x0 starts at context+48+12 and is stepped down by 4 after each power, so
// r^1 lands in 32-bit lane 3 of every 16-byte row, r^2 in lane 2, r^3 in
// lane 1 and r^4 in lane 0. poly1305_mult keeps x7-x9 intact, so each call
// multiplies the running power in x4-x6 by r once more.
341 ////////////////////////////////// initialize r^n table
342 mov x4,x7 // r^1
343 add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
344 mov x5,x8
345 mov x6,xzr
346 add x0,x0,#48+12
347 bl poly1305_splat
348
349 bl poly1305_mult // r^2
350 sub x0,x0,#4
351 bl poly1305_splat
352
353 bl poly1305_mult // r^3
354 sub x0,x0,#4
355 bl poly1305_splat
356
357 bl poly1305_mult // r^4
358 sub x0,x0,#4
359 bl poly1305_splat
360 ldr x30,[sp,#8] // last bl done, recover the return address
361
// x16 is the read pointer for inp[2:3]; it is redirected to .Lzeros when
// fewer than 64 bytes are left. The flags from this subs are still live at
// the b.ls in .Ldo_neon.
362 add x16,x1,#32
363 adr x17,.Lzeros
364 subs x2,x2,#64
365 csel x16,x17,x16,lo
366
// x0 is context+48 here, so [x0,#-24] is the flag at context+24.
367 mov x4,#1
368 str x4,[x0,#-24] // set is_base2_26
369 sub x0,x0,#48 // restore original x0
370 b .Ldo_neon
371
372 .align 4
373 .Leven_neon:
// Hash limbs are in x10..x14 (base 2^26) on every path that gets here.
// As in the .Linit_neon path, the flags from this subs are consumed by the
// b.ls in .Ldo_neon.
374 add x16,x1,#32
375 adr x17,.Lzeros
376 subs x2,x2,#64
377 csel x16,x17,x16,lo
378
379 stp d8,d9,[sp,#16] // meet ABI requirements
380 stp d10,d11,[sp,#32]
381 stp d12,d13,[sp,#48]
382 stp d14,d15,[sp,#64]
383
384 fmov d24,x10
385 fmov d25,x11
386 fmov d26,x12
387 fmov d27,x13
388 fmov d28,x14
389
390 .Ldo_neon:
// Nothing between here and the b.ls below writes the flags; they still
// hold the result of the subs x2,x2,#64 executed just before this label
// was reached.
//
// Two blocks are split into base 2^26 limbs at a time and packed pairwise:
// the first block's limb in the low 32 bits, the second block's in the high
// 32 bits. v14..v18 take inp[2:3], v9..v13 take inp[0:1].
391 ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
392 ldp x9,x13,[x16],#48
393
// x3 moves to bit 24 of the top limb; that limb starts at bit 104, so this
// is weight 2^128.
394 lsl x3,x3,#24
395 add x15,x0,#48
396
397 #ifdef __ARMEB__
398 rev x8,x8
399 rev x12,x12
400 rev x9,x9
401 rev x13,x13
402 #endif
403 and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
404 and x5,x9,#0x03ffffff
405 ubfx x6,x8,#26,#26
406 ubfx x7,x9,#26,#26
407 add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
408 extr x8,x12,x8,#52
409 extr x9,x13,x9,#52
410 add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
411 fmov d14,x4
412 and x8,x8,#0x03ffffff
413 and x9,x9,#0x03ffffff
414 ubfx x10,x12,#14,#26
415 ubfx x11,x13,#14,#26
416 add x12,x3,x12,lsr#40
417 add x13,x3,x13,lsr#40
418 add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
419 fmov d15,x6
420 add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
421 add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
422 fmov d16,x8
423 fmov d17,x10
424 fmov d18,x12
425
426 ldp x8,x12,[x1],#16 // inp[0:1]
427 ldp x9,x13,[x1],#48
428
// Load the nine table rows from context+48 in the order poly1305_splat
// wrote them: v0=r0 v1=r1 v2=s1 v3=r2 v4=s2 v5=r3 v6=s3 v7=r4 v8=s4.
// Within each register lane 0 belongs to r^4, lane 1 to r^3, lane 2 to r^2
// and lane 3 to r^1.
429 ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
430 ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
431 ld1 {v8.4s},[x15]
432
433 #ifdef __ARMEB__
434 rev x8,x8
435 rev x12,x12
436 rev x9,x9
437 rev x13,x13
438 #endif
439 and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
440 and x5,x9,#0x03ffffff
441 ubfx x6,x8,#26,#26
442 ubfx x7,x9,#26,#26
443 add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
444 extr x8,x12,x8,#52
445 extr x9,x13,x9,#52
446 add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
447 fmov d9,x4
448 and x8,x8,#0x03ffffff
449 and x9,x9,#0x03ffffff
450 ubfx x10,x12,#14,#26
451 ubfx x11,x13,#14,#26
452 add x12,x3,x12,lsr#40
453 add x13,x3,x13,lsr#40
454 add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
455 fmov d10,x6
456 add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
457 add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
// v31 = all ones shifted right by 38 in each 64-bit lane = 0x03ffffff, the
// 26-bit limb mask.
458 movi v31.2d,#-1
459 fmov d11,x8
460 fmov d12,x10
461 fmov d13,x12
462 ushr v31.2d,v31.2d,#38
463
// Flags: from subs x2,x2,#64 before .Ldo_neon. ls = 64 bytes or fewer were
// left at that point, so there is no full loop iteration to run.
464 b.ls .Lskip_loop
465
466 .align 4
467 .Loop_neon:
468 ////////////////////////////////////////////////////////////////
469 // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
470 // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
471 //   \___________________/
472 // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
473 // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
474 //   \___________________/ \____________________/
475 //
476 // Note that we start with inp[2:3]*r^2. This is because it
477 // doesn't depend on reduction in previous iteration.
478 ////////////////////////////////////////////////////////////////
479 // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
480 // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
481 // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
482 // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
483 // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
484
// The flags set by this subs drive both the csel two lines below and the
// b.hi at the bottom of the loop; no instruction in between writes them.
// v19..v23 accumulate d0..d4. The scalar instructions interleaved with the
// NEON ones split the next 64 bytes of input into limbs for the following
// iteration.
485 subs x2,x2,#64
486 umull v23.2d,v14.2s,v7.s[2]
487 csel x16,x17,x16,lo
488 umull v22.2d,v14.2s,v5.s[2]
489 umull v21.2d,v14.2s,v3.s[2]
490 ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
491 umull v20.2d,v14.2s,v1.s[2]
492 ldp x9,x13,[x16],#48
493 umull v19.2d,v14.2s,v0.s[2]
494 #ifdef __ARMEB__
495 rev x8,x8
496 rev x12,x12
497 rev x9,x9
498 rev x13,x13
499 #endif
500
501 umlal v23.2d,v15.2s,v5.s[2]
502 and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
503 umlal v22.2d,v15.2s,v3.s[2]
504 and x5,x9,#0x03ffffff
505 umlal v21.2d,v15.2s,v1.s[2]
506 ubfx x6,x8,#26,#26
507 umlal v20.2d,v15.2s,v0.s[2]
508 ubfx x7,x9,#26,#26
509 umlal v19.2d,v15.2s,v8.s[2]
510 add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
511
512 umlal v23.2d,v16.2s,v3.s[2]
513 extr x8,x12,x8,#52
514 umlal v22.2d,v16.2s,v1.s[2]
515 extr x9,x13,x9,#52
516 umlal v21.2d,v16.2s,v0.s[2]
517 add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
518 umlal v20.2d,v16.2s,v8.s[2]
519 fmov d14,x4
520 umlal v19.2d,v16.2s,v6.s[2]
521 and x8,x8,#0x03ffffff
522
523 umlal v23.2d,v17.2s,v1.s[2]
524 and x9,x9,#0x03ffffff
525 umlal v22.2d,v17.2s,v0.s[2]
526 ubfx x10,x12,#14,#26
527 umlal v21.2d,v17.2s,v8.s[2]
528 ubfx x11,x13,#14,#26
529 umlal v20.2d,v17.2s,v6.s[2]
530 add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
531 umlal v19.2d,v17.2s,v4.s[2]
532 fmov d15,x6
533
534 add v11.2s,v11.2s,v26.2s
535 add x12,x3,x12,lsr#40
536 umlal v23.2d,v18.2s,v0.s[2]
537 add x13,x3,x13,lsr#40
538 umlal v22.2d,v18.2s,v8.s[2]
539 add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
540 umlal v21.2d,v18.2s,v6.s[2]
541 add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
542 umlal v20.2d,v18.2s,v4.s[2]
543 fmov d16,x8
544 umlal v19.2d,v18.2s,v2.s[2]
545 fmov d17,x10
546
547 ////////////////////////////////////////////////////////////////
548 // (hash+inp[0:1])*r^4 and accumulate
549
550 add v9.2s,v9.2s,v24.2s
551 fmov d18,x12
552 umlal v22.2d,v11.2s,v1.s[0]
553 ldp x8,x12,[x1],#16 // inp[0:1]
554 umlal v19.2d,v11.2s,v6.s[0]
555 ldp x9,x13,[x1],#48
556 umlal v23.2d,v11.2s,v3.s[0]
557 umlal v20.2d,v11.2s,v8.s[0]
558 umlal v21.2d,v11.2s,v0.s[0]
559 #ifdef __ARMEB__
560 rev x8,x8
561 rev x12,x12
562 rev x9,x9
563 rev x13,x13
564 #endif
565
566 add v10.2s,v10.2s,v25.2s
567 umlal v22.2d,v9.2s,v5.s[0]
568 umlal v23.2d,v9.2s,v7.s[0]
569 and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
570 umlal v21.2d,v9.2s,v3.s[0]
571 and x5,x9,#0x03ffffff
572 umlal v19.2d,v9.2s,v0.s[0]
573 ubfx x6,x8,#26,#26
574 umlal v20.2d,v9.2s,v1.s[0]
575 ubfx x7,x9,#26,#26
576
577 add v12.2s,v12.2s,v27.2s
578 add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
579 umlal v22.2d,v10.2s,v3.s[0]
580 extr x8,x12,x8,#52
581 umlal v23.2d,v10.2s,v5.s[0]
582 extr x9,x13,x9,#52
583 umlal v19.2d,v10.2s,v8.s[0]
584 add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
585 umlal v21.2d,v10.2s,v1.s[0]
586 fmov d9,x4
587 umlal v20.2d,v10.2s,v0.s[0]
588 and x8,x8,#0x03ffffff
589
590 add v13.2s,v13.2s,v28.2s
591 and x9,x9,#0x03ffffff
592 umlal v22.2d,v12.2s,v0.s[0]
593 ubfx x10,x12,#14,#26
594 umlal v19.2d,v12.2s,v4.s[0]
595 ubfx x11,x13,#14,#26
596 umlal v23.2d,v12.2s,v1.s[0]
597 add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
598 umlal v20.2d,v12.2s,v6.s[0]
599 fmov d10,x6
600 umlal v21.2d,v12.2s,v8.s[0]
601 add x12,x3,x12,lsr#40
602
603 umlal v22.2d,v13.2s,v8.s[0]
604 add x13,x3,x13,lsr#40
605 umlal v19.2d,v13.2s,v2.s[0]
606 add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
607 umlal v23.2d,v13.2s,v0.s[0]
608 add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
609 umlal v20.2d,v13.2s,v4.s[0]
610 fmov d11,x8
611 umlal v21.2d,v13.2s,v6.s[0]
612 fmov d12,x10
613 fmov d13,x12
614
615 /////////////////////////////////////////////////////////////////
616 // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
617 // and P. Schwabe
618 //
619 // [see discussion in poly1305-armv4 module]
620
// Carries are propagated between the 64-bit accumulators v19..v23 and the
// results narrowed into v24..v28 (h0..h4). bic with #0xfc,lsl#24 clears the
// top six bits of each 32-bit lane, i.e. masks to 26 bits.
621 ushr v29.2d,v22.2d,#26
622 xtn v27.2s,v22.2d
623 ushr v30.2d,v19.2d,#26
624 and v19.16b,v19.16b,v31.16b
625 add v23.2d,v23.2d,v29.2d // h3 -> h4
626 bic v27.2s,#0xfc,lsl#24 // &=0x03ffffff
627 add v20.2d,v20.2d,v30.2d // h0 -> h1
628
629 ushr v29.2d,v23.2d,#26
630 xtn v28.2s,v23.2d
631 ushr v30.2d,v20.2d,#26
632 xtn v25.2s,v20.2d
633 bic v28.2s,#0xfc,lsl#24
634 add v21.2d,v21.2d,v30.2d // h1 -> h2
635
// The carry out of h4 re-enters h0 multiplied by 5: added once, then once
// more shifted left by 2.
636 add v19.2d,v19.2d,v29.2d
637 shl v29.2d,v29.2d,#2
638 shrn v30.2s,v21.2d,#26
639 xtn v26.2s,v21.2d
640 add v19.2d,v19.2d,v29.2d // h4 -> h0
641 bic v25.2s,#0xfc,lsl#24
642 add v27.2s,v27.2s,v30.2s // h2 -> h3
643 bic v26.2s,#0xfc,lsl#24
644
645 shrn v29.2s,v19.2d,#26
646 xtn v24.2s,v19.2d
647 ushr v30.2s,v27.2s,#26
648 bic v27.2s,#0xfc,lsl#24
649 bic v24.2s,#0xfc,lsl#24
650 add v25.2s,v25.2s,v29.2s // h0 -> h1
651 add v28.2s,v28.2s,v30.2s // h3 -> h4
652
// Flags: from the subs x2,x2,#64 at the top of this iteration.
653 b.hi .Loop_neon
654
655 .Lskip_loop:
656 dup v16.2d,v16.d[0]
657 add v11.2s,v11.2s,v26.2s
658
659 ////////////////////////////////////////////////////////////////
660 // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
661
// The flags from this adds select .Long_tail right below and are still
// live at the b.eq .Lshort_tail further down: the dup/add/umull2/umlal2
// instructions in between do not write them.
662 adds x2,x2,#32
663 b.ne .Long_tail
664
665 dup v16.2d,v11.d[0]
666 add v14.2s,v9.2s,v24.2s
667 add v17.2s,v12.2s,v27.2s
668 add v15.2s,v10.2s,v25.2s
669 add v18.2s,v13.2s,v28.2s
670
671 .Long_tail:
// The "2" forms read the upper halves of the registers: 32-bit lanes 2 and
// 3, which hold the r^2 and r^1 entries of the table rows.
672 dup v14.2d,v14.d[0]
673 umull2 v19.2d,v16.4s,v6.4s
674 umull2 v22.2d,v16.4s,v1.4s
675 umull2 v23.2d,v16.4s,v3.4s
676 umull2 v21.2d,v16.4s,v0.4s
677 umull2 v20.2d,v16.4s,v8.4s
678
679 dup v15.2d,v15.d[0]
680 umlal2 v19.2d,v14.4s,v0.4s
681 umlal2 v21.2d,v14.4s,v3.4s
682 umlal2 v22.2d,v14.4s,v5.4s
683 umlal2 v23.2d,v14.4s,v7.4s
684 umlal2 v20.2d,v14.4s,v1.4s
685
686 dup v17.2d,v17.d[0]
687 umlal2 v19.2d,v15.4s,v8.4s
688 umlal2 v22.2d,v15.4s,v3.4s
689 umlal2 v21.2d,v15.4s,v1.4s
690 umlal2 v23.2d,v15.4s,v5.4s
691 umlal2 v20.2d,v15.4s,v0.4s
692
693 dup v18.2d,v18.d[0]
694 umlal2 v22.2d,v17.4s,v0.4s
695 umlal2 v23.2d,v17.4s,v1.4s
696 umlal2 v19.2d,v17.4s,v4.4s
697 umlal2 v20.2d,v17.4s,v6.4s
698 umlal2 v21.2d,v17.4s,v8.4s
699
700 umlal2 v22.2d,v18.4s,v8.4s
701 umlal2 v19.2d,v18.4s,v2.4s
702 umlal2 v23.2d,v18.4s,v0.4s
703 umlal2 v20.2d,v18.4s,v4.4s
704 umlal2 v21.2d,v18.4s,v6.4s
705
// Flags: from adds x2,x2,#32 at .Lskip_loop.
706 b.eq .Lshort_tail
707
708 ////////////////////////////////////////////////////////////////
709 // (hash+inp[0:1])*r^4:r^3 and accumulate
710
711 add v9.2s,v9.2s,v24.2s
712 umlal v22.2d,v11.2s,v1.2s
713 umlal v19.2d,v11.2s,v6.2s
714 umlal v23.2d,v11.2s,v3.2s
715 umlal v20.2d,v11.2s,v8.2s
716 umlal v21.2d,v11.2s,v0.2s
717
718 add v10.2s,v10.2s,v25.2s
719 umlal v22.2d,v9.2s,v5.2s
720 umlal v19.2d,v9.2s,v0.2s
721 umlal v23.2d,v9.2s,v7.2s
722 umlal v20.2d,v9.2s,v1.2s
723 umlal v21.2d,v9.2s,v3.2s
724
725 add v12.2s,v12.2s,v27.2s
726 umlal v22.2d,v10.2s,v3.2s
727 umlal v19.2d,v10.2s,v8.2s
728 umlal v23.2d,v10.2s,v5.2s
729 umlal v20.2d,v10.2s,v0.2s
730 umlal v21.2d,v10.2s,v1.2s
731
732 add v13.2s,v13.2s,v28.2s
733 umlal v22.2d,v12.2s,v0.2s
734 umlal v19.2d,v12.2s,v4.2s
735 umlal v23.2d,v12.2s,v1.2s
736 umlal v20.2d,v12.2s,v6.2s
737 umlal v21.2d,v12.2s,v8.2s
738
739 umlal v22.2d,v13.2s,v8.2s
740 umlal v19.2d,v13.2s,v2.2s
741 umlal v23.2d,v13.2s,v0.2s
742 umlal v20.2d,v13.2s,v4.2s
743 umlal v21.2d,v13.2s,v6.2s
744
745 .Lshort_tail:
746 ////////////////////////////////////////////////////////////////
747 // horizontal add
748
// The two 64-bit lanes of every accumulator are summed; the callee-saved
// d8-d15 are reloaded from the frame in between.
749 addp v22.2d,v22.2d,v22.2d
750 ldp d8,d9,[sp,#16] // meet ABI requirements
751 addp v19.2d,v19.2d,v19.2d
752 ldp d10,d11,[sp,#32]
753 addp v23.2d,v23.2d,v23.2d
754 ldp d12,d13,[sp,#48]
755 addp v20.2d,v20.2d,v20.2d
756 ldp d14,d15,[sp,#64]
757 addp v21.2d,v21.2d,v21.2d
758
759 ////////////////////////////////////////////////////////////////
760 // lazy reduction, but without narrowing
761
762 ushr v29.2d,v22.2d,#26
763 and v22.16b,v22.16b,v31.16b
764 ushr v30.2d,v19.2d,#26
765 and v19.16b,v19.16b,v31.16b
766
767 add v23.2d,v23.2d,v29.2d // h3 -> h4
768 add v20.2d,v20.2d,v30.2d // h0 -> h1
769
770 ushr v29.2d,v23.2d,#26
771 and v23.16b,v23.16b,v31.16b
772 ushr v30.2d,v20.2d,#26
773 and v20.16b,v20.16b,v31.16b
774 add v21.2d,v21.2d,v30.2d // h1 -> h2
775
776 add v19.2d,v19.2d,v29.2d
777 shl v29.2d,v29.2d,#2
778 ushr v30.2d,v21.2d,#26
779 and v21.16b,v21.16b,v31.16b
780 add v19.2d,v19.2d,v29.2d // h4 -> h0
781 add v22.2d,v22.2d,v30.2d // h2 -> h3
782
783 ushr v29.2d,v19.2d,#26
784 and v19.16b,v19.16b,v31.16b
785 ushr v30.2d,v22.2d,#26
786 and v22.16b,v22.16b,v31.16b
787 add v20.2d,v20.2d,v29.2d // h0 -> h1
788 add v23.2d,v23.2d,v30.2d // h3 -> h4
789
790 ////////////////////////////////////////////////////////////////
791 // write the result, can be partially reduced
792
// Five 32-bit limbs go to context offsets 0, 4, 8, 12 and 16.
793 st4 {v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
794 st1 {v23.s}[0],[x0]
795
796 .Lno_data_neon:
// x30 is already valid on every path that reaches this label, so only x29
// is popped while the 80-byte frame is released.
797 ldr x29,[sp],#80
798 ret
799 .size poly1305_blocks_neon,.-poly1305_blocks_neon
800
//
// poly1305_emit_neon: produce the 16-byte tag when the hash may be stored
// as five base 2^26 limbs.
//
// In:  x0 = context (hash at offset 0, is_base2_26 flag at offset 24)
//      x1 = 16-byte output buffer
//      x2 = 16-byte nonce
// Clobbers: x4-x14, x17, flags. Leaf routine, no stack use.
//
// If the flag is zero the hash is still in base 2^64 and control transfers
// to poly1305_emit with the arguments untouched. Otherwise the limbs are
// repacked into three 64-bit words, the excess above bit 130 is folded back
// in, and the same compare-with-modulus / add-nonce tail as poly1305_emit
// is run.
//
.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
	ldr	x17,[x0,#24]		// is_base2_26
	cbz	x17,poly1305_emit

	ldp	w7,w8,[x0]		// load hash value base 2^26
	ldp	w9,w10,[x0,#8]
	ldr	w11,[x0,#16]

	// base 2^26 -> base 2^64. lsr and plain add leave the flags alone,
	// so each adc still sees the carry of the adds before it.
	add	x4,x7,x8,lsl#26
	adds	x4,x4,x9,lsl#52		// low word; carry into the middle word
	lsr	x5,x9,#12
	add	x5,x5,x10,lsl#14
	adc	x5,x5,xzr
	adds	x5,x5,x11,lsl#40	// middle word; carry into the top word
	lsr	x6,x11,#24
	adc	x6,x6,xzr		// can be partially reduced...

	// ... so reduce: bits 2 and up of the top word are multiples of
	// 2^130, each worth 5; (t & ~3) + (t >> 2) = 5*(t >> 2).
	and	x12,x6,#-4
	add	x12,x12,x6,lsr#2
	and	x6,x6,#3
	adds	x4,x4,x12
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	ldp	x10,x11,[x2]		// load nonce
#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif

	tst	x14,#-4			// see if it's carried/borrowed
	csel	x4,x4,x12,eq		// eq: nothing at or above 2^130, keep h
	csel	x5,x5,x13,eq		// ne: take the low 128 bits of h + 5

	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit_neon,.-poly1305_emit_neon
852
// Constant data referenced by the routines above.
853 .align 5
// 32 bytes of zeros. poly1305_blocks_neon points its inp[2:3] read pointer
// here when fewer than 64 bytes of input remain, so the two 16-byte loads
// through that pointer read zeros.
854 .Lzeros:
855 .long 0,0,0,0,0,0,0,0
// Distance from this word to OPENSSL_armcap_P (32-bit under __ILP32__,
// 64-bit otherwise). poly1305_init adds it to the address of this label to
// reach the capability word.
856 .LOPENSSL_armcap_P:
857 #ifdef __ILP32__
858 .long OPENSSL_armcap_P-.
859 #else
860 .quad OPENSSL_armcap_P-.
861 #endif
// NUL-terminated ASCII: "Poly1305 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
862 .byte 80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
863 .align 2
864 .align 2
865