poly1305-armv8.S revision 1.1.6.2 1 #include "arm_arch.h"
2
3 .text
4
5 // forward "declarations" are required for Apple
6
// Symbol visibility.  OPENSSL_armcap_P (the capability word read by
// poly1305_init) and the three entry points are all marked hidden; the
// entry points are additionally declared global so other objects in the
// same link unit can reference them.
7 .hidden OPENSSL_armcap_P
8 .globl poly1305_init
9 .hidden poly1305_init
10 .globl poly1305_blocks
11 .hidden poly1305_blocks
12 .globl poly1305_emit
13 .hidden poly1305_emit
14
//----------------------------------------------------------------------
// poly1305_init
// In:    x0 = context
//        x1 = 16-byte key, or NULL
//        x2 = destination for two code pointers (blocks, emit)
// Out:   x0 = 0 when x1 is NULL, 1 otherwise
// Clobb: x7-x13, x17, flags
//
// Context bytes 0..31 (hash value and the is_base2_26 flag) are always
// cleared.  When a key is given it is clamped and saved at context+32,
// and the scalar or NEON routines are selected by testing ARMV7_NEON in
// OPENSSL_armcap_P.
//----------------------------------------------------------------------
.type	poly1305_init,%function
.align	5
poly1305_init:
	stp	xzr,xzr,[x0,#16]	// clear h2 and is_base2_26
	stp	xzr,xzr,[x0]		// clear h0:h1
	cbnz	x1,.Linit_with_key

	mov	x0,xzr			// no key supplied: return 0
	ret

.Linit_with_key:
	adr	x10,.LOPENSSL_armcap_P	// x10 = address of the offset word
#ifdef	__ILP32__
	ldrsw	x11,[x10]		// x11 = OPENSSL_armcap_P - x10
#else
	ldr	x11,[x10]		// x11 = OPENSSL_armcap_P - x10
#endif
	ldr	w17,[x10,x11]		// w17 = OPENSSL_armcap_P

	ldp	x7,x8,[x1]		// x7:x8 = key words
#ifdef	__ARMEB__
	rev	x7,x7			// flip bytes on big-endian builds
	rev	x8,x8
#endif
	mov	x9,#0x0ffffffc0ffffffc	// clamp mask for the high word
	and	x8,x8,x9		// &=0ffffffc0ffffffc
	orr	x9,x9,#3		// clamp mask for the low word
	and	x7,x7,x9		// &=0ffffffc0fffffff
	stp	x7,x8,[x0,#32]		// save clamped key value

	adr	x12,poly1305_blocks_neon
	adr	x13,poly1305_emit_neon
	adr	x7,poly1305_blocks
	adr	x8,poly1305_emit
	tst	w17,#ARMV7_NEON
	csel	x12,x12,x7,ne		// NEON bit set ? NEON : scalar
	csel	x13,x13,x8,ne

#ifdef	__ILP32__
	stp	w12,w13,[x2]		// 32-bit code pointers
#else
	stp	x12,x13,[x2]
#endif

	mov	x0,#1
	ret
.size	poly1305_init,.-poly1305_init
65
// poly1305_blocks: scalar block routine, hash kept in base 2^64.
// In:    x0 = context (h0,h1,h2 at +0,+8,+16; r0,r1 at +32,+40)
//        x1 = input pointer
//        x2 = length in bytes; only whole 16-byte blocks are consumed
//        x3 = value added into h2 together with every block
//             (presumably the pad bit, 0 or 1 - confirm against callers)
// Out:   hash in the context updated; nothing is written if x2 < 16
// Clobb: x1, x2, x4-x14, flags
//
// Per block: h = (h + block + x3*2^128) * r, followed by a partial
// reduction modulo 2^130-5 that leaves h2 a small value.
66 .type poly1305_blocks,%function
67 .align 5
68 poly1305_blocks:
69 ands x2,x2,#-16 // round length down to a multiple of 16
70 b.eq .Lno_data // no complete block: return untouched
71
72 ldp x4,x5,[x0] // load hash value
73 ldp x7,x8,[x0,#32] // load key value
74 ldr x6,[x0,#16] // h2
75 add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
76 b .Loop
77
78 .align 5
79 .Loop:
80 ldp x10,x11,[x1],#16 // load input
81 sub x2,x2,#16 // one block fewer to go
82 #ifdef __ARMEB__
83 rev x10,x10
84 rev x11,x11
85 #endif
86 adds x4,x4,x10 // accumulate input
87 adcs x5,x5,x11
88
89 mul x12,x4,x7 // h0*r0
90 adc x6,x6,x3 // h2 += x3 + carry (mul/umulh leave flags alone)
91 umulh x13,x4,x7
92
93 mul x10,x5,x9 // h1*5*r1
94 umulh x11,x5,x9
95
96 adds x12,x12,x10
97 mul x10,x4,x8 // h0*r1
98 adc x13,x13,x11
99 umulh x14,x4,x8
100
101 adds x13,x13,x10
102 mul x10,x5,x7 // h1*r0
103 adc x14,x14,xzr
104 umulh x11,x5,x7
105
106 adds x13,x13,x10
107 mul x10,x6,x9 // h2*5*r1
108 adc x14,x14,x11
109 mul x11,x6,x7 // h2*r0
110
111 adds x13,x13,x10
112 adc x14,x14,x11
113
114 and x10,x14,#-4 // final reduction
115 and x6,x14,#3 // new h2 = low two bits of the top word
116 add x10,x10,x14,lsr#2 // x10 = 5 * (top word >> 2)
117 adds x4,x12,x10
118 adcs x5,x13,xzr
119 adc x6,x6,xzr
120
121 cbnz x2,.Loop
122
123 stp x4,x5,[x0] // store hash value
124 str x6,[x0,#16]
125
126 .Lno_data:
127 ret
128 .size poly1305_blocks,.-poly1305_blocks
129
//----------------------------------------------------------------------
// poly1305_emit
// In:    x0 = context, hash in base 2^64 at +0,+8,+16
//        x1 = 16-byte output buffer
//        x2 = 16-byte nonce
// Clobb: x4-x6, x10-x14, flags
//
// Completes the reduction of the hash modulo 2^130-5, adds the nonce
// modulo 2^128 and stores the 16-byte result at x1.
//----------------------------------------------------------------------
.type	poly1305_emit,%function
.align	5
poly1305_emit:
	ldr	x6,[x0,#16]		// h2
	ldp	x4,x5,[x0]		// h0, h1
	ldp	x10,x11,[x2]		// load nonce
#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif

	adds	x12,x4,#5		// g = h + 5, i.e. compare h to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// any bit of g at or above 2^130?
	csel	x4,x12,x4,ne		// yes: take the low words of g
	csel	x5,x13,x5,ne

	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit
// poly1305_mult: file-local helper with a register-based interface.
// In:    x4:x5:x6 = h0:h1:h2 (base 2^64)
//        x7, x8   = r0, r1
//        x9       = s1 = r1 + (r1 >> 2)
// Out:   x4:x5:x6 = h*r, partially reduced modulo 2^130-5
// Clobb: x10-x14, flags.  x7-x9 are preserved; no memory is touched.
// The instruction sequence is the same as one iteration of the
// poly1305_blocks loop without the input load and accumulation.
160 .type poly1305_mult,%function
161 .align 5
162 poly1305_mult:
163 mul x12,x4,x7 // h0*r0
164 umulh x13,x4,x7
165
166 mul x10,x5,x9 // h1*5*r1
167 umulh x11,x5,x9
168
169 adds x12,x12,x10
170 mul x10,x4,x8 // h0*r1
171 adc x13,x13,x11
172 umulh x14,x4,x8
173
174 adds x13,x13,x10
175 mul x10,x5,x7 // h1*r0
176 adc x14,x14,xzr
177 umulh x11,x5,x7
178
179 adds x13,x13,x10
180 mul x10,x6,x9 // h2*5*r1
181 adc x14,x14,x11
182 mul x11,x6,x7 // h2*r0
183
184 adds x13,x13,x10
185 adc x14,x14,x11
186
187 and x10,x14,#-4 // final reduction
188 and x6,x14,#3 // new h2 = low two bits of the top word
189 add x10,x10,x14,lsr#2 // x10 = 5 * (top word >> 2)
190 adds x4,x12,x10
191 adcs x5,x13,xzr
192 adc x6,x6,xzr
193
194 ret
195 .size poly1305_mult,.-poly1305_mult
196
// poly1305_splat: file-local helper with a register-based interface.
// In:    x4:x5:x6 = value in base 2^64
//        x0       = destination
// Out:   nine 32-bit words stored at x0 + 16*k, k = 0..8, in the order
//        r0, r1, s1, r2, s2, r3, s3, r4, s4, where rN are the base 2^26
//        limbs of the input and sN = 5*rN
// Clobb: x12-x16.  x0 and x4-x6 are preserved; flags are not modified.
// Only one word per 16-byte row is written, so a caller can fill the
// other columns by calling again with x0 moved by 4 bytes.
197 .type poly1305_splat,%function
198 .align 5
199 poly1305_splat:
200 and x12,x4,#0x03ffffff // base 2^64 -> base 2^26
201 ubfx x13,x4,#26,#26 // limb 1 = bits 26..51
202 extr x14,x5,x4,#52 // bits 52.. of x5:x4
203 and x14,x14,#0x03ffffff // limb 2 = bits 52..77
204 ubfx x15,x5,#14,#26 // limb 3 = bits 78..103
205 extr x16,x6,x5,#40 // limb 4 = bits 104.. of x6:x5 (not masked)
206
207 str w12,[x0,#16*0] // r0
208 add w12,w13,w13,lsl#2 // r1*5
209 str w13,[x0,#16*1] // r1
210 add w13,w14,w14,lsl#2 // r2*5
211 str w12,[x0,#16*2] // s1
212 str w14,[x0,#16*3] // r2
213 add w14,w15,w15,lsl#2 // r3*5
214 str w13,[x0,#16*4] // s2
215 str w15,[x0,#16*5] // r3
216 add w15,w16,w16,lsl#2 // r4*5
217 str w14,[x0,#16*6] // s3
218 str w16,[x0,#16*7] // r4
219 str w15,[x0,#16*8] // s4
220
221 ret
222 .size poly1305_splat,.-poly1305_splat
223
// poly1305_blocks_neon: NEON block routine; same register arguments as
// poly1305_blocks.
// In:    x0 = context, x1 = input, x2 = length in bytes,
//        x3 = value added to the top limb of every block
//             (presumably the pad bit, 0 or 1 - confirm against callers)
//
// Context fields used here:
//   +0   hash: three 64-bit words (base 2^64) or five 32-bit words
//        (base 2^26), depending on the flag at +24
//   +24  is_base2_26 flag
//   +32  key words r0, r1
//   +48  table written by poly1305_splat: nine 16-byte rows
//        (r0,r1,s1,r2,s2,r3,s3,r4,s4); within each row the four words
//        hold r^4, r^3, r^2, r^1 in that order
//
// Dispatch:
//   - fewer than 128 bytes and hash still base 2^64: tail-call
//     poly1305_blocks
//   - otherwise an odd 16-byte block, if any, is absorbed with
//     poly1305_mult so that a multiple of 32 bytes remains, the hash is
//     converted to base 2^26 and (first time only) the r^n table is built
//
// Frame: 80 bytes; x29/x30 at sp+0/+8, d8-d15 at sp+16..sp+79.  x30 is
// signed with paciasp before the frame is pushed and authenticated with
// autiasp after it is popped; it is reloaded from sp+8 after the bl
// calls inside the function.
224 .type poly1305_blocks_neon,%function
225 .align 5
226 poly1305_blocks_neon:
227 ldr x17,[x0,#24] // x17 = is_base2_26
228 cmp x2,#128
229 b.hs .Lblocks_neon // 128 bytes or more: always take the NEON path
230 cbz x17,poly1305_blocks // short input, hash still base 2^64: scalar code
231
232 .Lblocks_neon:
233 .inst 0xd503233f // paciasp
234 stp x29,x30,[sp,#-80]!
235 add x29,sp,#0
236
237 ands x2,x2,#-16 // round length down to a multiple of 16
238 b.eq .Lno_data_neon
239
240 cbz x17,.Lbase2_64_neon // hash not yet converted to base 2^26
241
242 ldp w10,w11,[x0] // load hash value base 2^26
243 ldp w12,w13,[x0,#8]
244 ldr w14,[x0,#16]
245
246 tst x2,#31
247 b.eq .Leven_neon // length already a multiple of 32
248
249 ldp x7,x8,[x0,#32] // load key value
250
251 add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64
252 lsr x5,x12,#12
253 adds x4,x4,x12,lsl#52
254 add x5,x5,x13,lsl#14
255 adc x5,x5,xzr
256 lsr x6,x14,#24
257 adds x5,x5,x14,lsl#40
258 adc x14,x6,xzr // can be partially reduced...
259
260 ldp x12,x13,[x1],#16 // load input
261 sub x2,x2,#16
262 add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
263
264 and x10,x14,#-4 // ... so reduce
265 and x6,x14,#3
266 add x10,x10,x14,lsr#2
267 adds x4,x4,x10
268 adcs x5,x5,xzr
269 adc x6,x6,xzr
270
271 #ifdef __ARMEB__
272 rev x12,x12
273 rev x13,x13
274 #endif
275 adds x4,x4,x12 // accumulate input
276 adcs x5,x5,x13
277 adc x6,x6,x3
278
279 bl poly1305_mult
280 ldr x30,[sp,#8] // bl overwrote x30: reload the saved return address
281
282 cbz x3,.Lstore_base2_64_neon // x3 == 0: store the hash back in base 2^64
283
284 and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
285 ubfx x11,x4,#26,#26
286 extr x12,x5,x4,#52
287 and x12,x12,#0x03ffffff
288 ubfx x13,x5,#14,#26
289 extr x14,x6,x5,#40
290
291 cbnz x2,.Leven_neon // more data left: continue with NEON
292
293 stp w10,w11,[x0] // store hash value base 2^26
294 stp w12,w13,[x0,#8]
295 str w14,[x0,#16]
296 b .Lno_data_neon
297
298 .align 4
299 .Lstore_base2_64_neon:
300 stp x4,x5,[x0] // store hash value base 2^64
301 stp x6,xzr,[x0,#16] // note that is_base2_26 is zeroed
302 b .Lno_data_neon
303
304 .align 4
305 .Lbase2_64_neon:
306 ldp x7,x8,[x0,#32] // load key value
307
308 ldp x4,x5,[x0] // load hash value base 2^64
309 ldr x6,[x0,#16]
310
311 tst x2,#31
312 b.eq .Linit_neon // length already a multiple of 32
313
314 ldp x12,x13,[x1],#16 // load input
315 sub x2,x2,#16
316 add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
317 #ifdef __ARMEB__
318 rev x12,x12
319 rev x13,x13
320 #endif
321 adds x4,x4,x12 // accumulate input
322 adcs x5,x5,x13
323 adc x6,x6,x3
324
325 bl poly1305_mult // x30 is reloaded after the table is built below
326
327 .Linit_neon:
328 and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
329 ubfx x11,x4,#26,#26
330 extr x12,x5,x4,#52
331 and x12,x12,#0x03ffffff
332 ubfx x13,x5,#14,#26
333 extr x14,x6,x5,#40
334
335 stp d8,d9,[sp,#16] // meet ABI requirements
336 stp d10,d11,[sp,#32]
337 stp d12,d13,[sp,#48]
338 stp d14,d15,[sp,#64]
339
340 fmov d24,x10 // park the hash limbs in v24-v28; x10-x14 are
341 fmov d25,x11 // clobbered by poly1305_mult/poly1305_splat below
342 fmov d26,x12
343 fmov d27,x13
344 fmov d28,x14
345
346 ////////////////////////////////// initialize r^n table
347 mov x4,x7 // r^1
348 add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
349 mov x5,x8
350 mov x6,xzr
351 add x0,x0,#48+12 // last word of each table row
352 bl poly1305_splat
353
354 bl poly1305_mult // r^2
355 sub x0,x0,#4 // next column to the left
356 bl poly1305_splat
357
358 bl poly1305_mult // r^3
359 sub x0,x0,#4
360 bl poly1305_splat
361
362 bl poly1305_mult // r^4
363 sub x0,x0,#4 // x0 = context+48, first word of each row
364 bl poly1305_splat
365 ldr x30,[sp,#8] // reload the saved return address after the bl calls
366
367 add x16,x1,#32 // x16 -> inp[2:3]
368 adr x17,.Lzeros
369 subs x2,x2,#64
370 csel x16,x17,x16,lo // fewer than 64 bytes left: read zeros instead
371
372 mov x4,#1
373 str x4,[x0,#-24] // set is_base2_26
374 sub x0,x0,#48 // restore original x0
375 b .Ldo_neon
376
377 .align 4
378 .Leven_neon:
379 add x16,x1,#32 // x16 -> inp[2:3]
380 adr x17,.Lzeros
381 subs x2,x2,#64
382 csel x16,x17,x16,lo // fewer than 64 bytes left: read zeros instead
383
384 stp d8,d9,[sp,#16] // meet ABI requirements
385 stp d10,d11,[sp,#32]
386 stp d12,d13,[sp,#48]
387 stp d14,d15,[sp,#64]
388
389 fmov d24,x10 // hash limbs h0..h4 -> v24..v28
390 fmov d25,x11
391 fmov d26,x12
392 fmov d27,x13
393 fmov d28,x14
394
395 .Ldo_neon:
396 ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
397 ldp x9,x13,[x16],#48
398
399 lsl x3,x3,#24 // x3 now sits at bit 24 of the top 26-bit limb
400 add x15,x0,#48 // x15 -> r^n table
401
402 #ifdef __ARMEB__
403 rev x8,x8
404 rev x12,x12
405 rev x9,x9
406 rev x13,x13
407 #endif
408 and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
409 and x5,x9,#0x03ffffff
410 ubfx x6,x8,#26,#26
411 ubfx x7,x9,#26,#26
412 add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
413 extr x8,x12,x8,#52
414 extr x9,x13,x9,#52
415 add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
416 fmov d14,x4
417 and x8,x8,#0x03ffffff
418 and x9,x9,#0x03ffffff
419 ubfx x10,x12,#14,#26
420 ubfx x11,x13,#14,#26
421 add x12,x3,x12,lsr#40
422 add x13,x3,x13,lsr#40
423 add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
424 fmov d15,x6
425 add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
426 add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
427 fmov d16,x8
428 fmov d17,x10
429 fmov d18,x12
430
431 ldp x8,x12,[x1],#16 // inp[0:1]
432 ldp x9,x13,[x1],#48
433
434 ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64 // table rows r0,r1,s1,r2
435 ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64 // table rows s2,r3,s3,r4
436 ld1 {v8.4s},[x15] // table row s4
437
438 #ifdef __ARMEB__
439 rev x8,x8
440 rev x12,x12
441 rev x9,x9
442 rev x13,x13
443 #endif
444 and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
445 and x5,x9,#0x03ffffff
446 ubfx x6,x8,#26,#26
447 ubfx x7,x9,#26,#26
448 add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
449 extr x8,x12,x8,#52
450 extr x9,x13,x9,#52
451 add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
452 fmov d9,x4
453 and x8,x8,#0x03ffffff
454 and x9,x9,#0x03ffffff
455 ubfx x10,x12,#14,#26
456 ubfx x11,x13,#14,#26
457 add x12,x3,x12,lsr#40
458 add x13,x3,x13,lsr#40
459 add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
460 fmov d10,x6
461 add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
462 add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
463 movi v31.2d,#-1
464 fmov d11,x8
465 fmov d12,x10
466 fmov d13,x12
467 ushr v31.2d,v31.2d,#38 // v31 = 0x03ffffff in each 64-bit lane
468
469 b.ls .Lskip_loop // flags are still those of the subs x2,x2,#64 above
470
471 .align 4
472 .Loop_neon:
473 ////////////////////////////////////////////////////////////////
474 // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
475 // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
476 //   \___________________/
477 // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
478 // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
479 //   \___________________/ \____________________/
480 //
481 // Note that we start with inp[2:3]*r^2. This is because it
482 // doesn't depend on reduction in previous iteration.
483 ////////////////////////////////////////////////////////////////
484 // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
485 // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
486 // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
487 // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
488 // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
489
490 subs x2,x2,#64 // flags stay live until the b.hi at the loop bottom
491 umull v23.2d,v14.2s,v7.s[2]
492 csel x16,x17,x16,lo // fewer than 64 bytes left: read zeros instead
493 umull v22.2d,v14.2s,v5.s[2]
494 umull v21.2d,v14.2s,v3.s[2]
495 ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
496 umull v20.2d,v14.2s,v1.s[2]
497 ldp x9,x13,[x16],#48
498 umull v19.2d,v14.2s,v0.s[2]
499 #ifdef __ARMEB__
500 rev x8,x8
501 rev x12,x12
502 rev x9,x9
503 rev x13,x13
504 #endif
505
506 umlal v23.2d,v15.2s,v5.s[2]
507 and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
508 umlal v22.2d,v15.2s,v3.s[2]
509 and x5,x9,#0x03ffffff
510 umlal v21.2d,v15.2s,v1.s[2]
511 ubfx x6,x8,#26,#26
512 umlal v20.2d,v15.2s,v0.s[2]
513 ubfx x7,x9,#26,#26
514 umlal v19.2d,v15.2s,v8.s[2]
515 add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
516
517 umlal v23.2d,v16.2s,v3.s[2]
518 extr x8,x12,x8,#52
519 umlal v22.2d,v16.2s,v1.s[2]
520 extr x9,x13,x9,#52
521 umlal v21.2d,v16.2s,v0.s[2]
522 add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
523 umlal v20.2d,v16.2s,v8.s[2]
524 fmov d14,x4
525 umlal v19.2d,v16.2s,v6.s[2]
526 and x8,x8,#0x03ffffff
527
528 umlal v23.2d,v17.2s,v1.s[2]
529 and x9,x9,#0x03ffffff
530 umlal v22.2d,v17.2s,v0.s[2]
531 ubfx x10,x12,#14,#26
532 umlal v21.2d,v17.2s,v8.s[2]
533 ubfx x11,x13,#14,#26
534 umlal v20.2d,v17.2s,v6.s[2]
535 add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
536 umlal v19.2d,v17.2s,v4.s[2]
537 fmov d15,x6
538
539 add v11.2s,v11.2s,v26.2s
540 add x12,x3,x12,lsr#40
541 umlal v23.2d,v18.2s,v0.s[2]
542 add x13,x3,x13,lsr#40
543 umlal v22.2d,v18.2s,v8.s[2]
544 add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
545 umlal v21.2d,v18.2s,v6.s[2]
546 add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
547 umlal v20.2d,v18.2s,v4.s[2]
548 fmov d16,x8
549 umlal v19.2d,v18.2s,v2.s[2]
550 fmov d17,x10
551
552 ////////////////////////////////////////////////////////////////
553 // (hash+inp[0:1])*r^4 and accumulate
554
555 add v9.2s,v9.2s,v24.2s
556 fmov d18,x12
557 umlal v22.2d,v11.2s,v1.s[0]
558 ldp x8,x12,[x1],#16 // inp[0:1]
559 umlal v19.2d,v11.2s,v6.s[0]
560 ldp x9,x13,[x1],#48
561 umlal v23.2d,v11.2s,v3.s[0]
562 umlal v20.2d,v11.2s,v8.s[0]
563 umlal v21.2d,v11.2s,v0.s[0]
564 #ifdef __ARMEB__
565 rev x8,x8
566 rev x12,x12
567 rev x9,x9
568 rev x13,x13
569 #endif
570
571 add v10.2s,v10.2s,v25.2s
572 umlal v22.2d,v9.2s,v5.s[0]
573 umlal v23.2d,v9.2s,v7.s[0]
574 and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
575 umlal v21.2d,v9.2s,v3.s[0]
576 and x5,x9,#0x03ffffff
577 umlal v19.2d,v9.2s,v0.s[0]
578 ubfx x6,x8,#26,#26
579 umlal v20.2d,v9.2s,v1.s[0]
580 ubfx x7,x9,#26,#26
581
582 add v12.2s,v12.2s,v27.2s
583 add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
584 umlal v22.2d,v10.2s,v3.s[0]
585 extr x8,x12,x8,#52
586 umlal v23.2d,v10.2s,v5.s[0]
587 extr x9,x13,x9,#52
588 umlal v19.2d,v10.2s,v8.s[0]
589 add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
590 umlal v21.2d,v10.2s,v1.s[0]
591 fmov d9,x4
592 umlal v20.2d,v10.2s,v0.s[0]
593 and x8,x8,#0x03ffffff
594
595 add v13.2s,v13.2s,v28.2s
596 and x9,x9,#0x03ffffff
597 umlal v22.2d,v12.2s,v0.s[0]
598 ubfx x10,x12,#14,#26
599 umlal v19.2d,v12.2s,v4.s[0]
600 ubfx x11,x13,#14,#26
601 umlal v23.2d,v12.2s,v1.s[0]
602 add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
603 umlal v20.2d,v12.2s,v6.s[0]
604 fmov d10,x6
605 umlal v21.2d,v12.2s,v8.s[0]
606 add x12,x3,x12,lsr#40
607
608 umlal v22.2d,v13.2s,v8.s[0]
609 add x13,x3,x13,lsr#40
610 umlal v19.2d,v13.2s,v2.s[0]
611 add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
612 umlal v23.2d,v13.2s,v0.s[0]
613 add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
614 umlal v20.2d,v13.2s,v4.s[0]
615 fmov d11,x8
616 umlal v21.2d,v13.2s,v6.s[0]
617 fmov d12,x10
618 fmov d13,x12
619
620 /////////////////////////////////////////////////////////////////
621 // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
622 // and P. Schwabe
623 //
624 // [see discussion in poly1305-armv4 module]
625
626 ushr v29.2d,v22.2d,#26
627 xtn v27.2s,v22.2d
628 ushr v30.2d,v19.2d,#26
629 and v19.16b,v19.16b,v31.16b
630 add v23.2d,v23.2d,v29.2d // h3 -> h4
631 bic v27.2s,#0xfc,lsl#24 // &=0x03ffffff
632 add v20.2d,v20.2d,v30.2d // h0 -> h1
633
634 ushr v29.2d,v23.2d,#26
635 xtn v28.2s,v23.2d
636 ushr v30.2d,v20.2d,#26
637 xtn v25.2s,v20.2d
638 bic v28.2s,#0xfc,lsl#24
639 add v21.2d,v21.2d,v30.2d // h1 -> h2
640
641 add v19.2d,v19.2d,v29.2d
642 shl v29.2d,v29.2d,#2
643 shrn v30.2s,v21.2d,#26
644 xtn v26.2s,v21.2d
645 add v19.2d,v19.2d,v29.2d // h4 -> h0
646 bic v25.2s,#0xfc,lsl#24
647 add v27.2s,v27.2s,v30.2s // h2 -> h3
648 bic v26.2s,#0xfc,lsl#24
649
650 shrn v29.2s,v19.2d,#26
651 xtn v24.2s,v19.2d
652 ushr v30.2s,v27.2s,#26
653 bic v27.2s,#0xfc,lsl#24
654 bic v24.2s,#0xfc,lsl#24
655 add v25.2s,v25.2s,v29.2s // h0 -> h1
656 add v28.2s,v28.2s,v30.2s // h3 -> h4
657
658 b.hi .Loop_neon // flags from the subs x2,x2,#64 at the loop top
659
660 .Lskip_loop:
661 dup v16.2d,v16.d[0]
662 add v11.2s,v11.2s,v26.2s
663
664 ////////////////////////////////////////////////////////////////
665 // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
666
667 adds x2,x2,#32 // flags stay live until the b.eq before .Lshort_tail
668 b.ne .Long_tail
669
670 dup v16.2d,v11.d[0]
671 add v14.2s,v9.2s,v24.2s
672 add v17.2s,v12.2s,v27.2s
673 add v15.2s,v10.2s,v25.2s
674 add v18.2s,v13.2s,v28.2s
675
676 .Long_tail:
677 dup v14.2d,v14.d[0]
678 umull2 v19.2d,v16.4s,v6.4s
679 umull2 v22.2d,v16.4s,v1.4s
680 umull2 v23.2d,v16.4s,v3.4s
681 umull2 v21.2d,v16.4s,v0.4s
682 umull2 v20.2d,v16.4s,v8.4s
683
684 dup v15.2d,v15.d[0]
685 umlal2 v19.2d,v14.4s,v0.4s
686 umlal2 v21.2d,v14.4s,v3.4s
687 umlal2 v22.2d,v14.4s,v5.4s
688 umlal2 v23.2d,v14.4s,v7.4s
689 umlal2 v20.2d,v14.4s,v1.4s
690
691 dup v17.2d,v17.d[0]
692 umlal2 v19.2d,v15.4s,v8.4s
693 umlal2 v22.2d,v15.4s,v3.4s
694 umlal2 v21.2d,v15.4s,v1.4s
695 umlal2 v23.2d,v15.4s,v5.4s
696 umlal2 v20.2d,v15.4s,v0.4s
697
698 dup v18.2d,v18.d[0]
699 umlal2 v22.2d,v17.4s,v0.4s
700 umlal2 v23.2d,v17.4s,v1.4s
701 umlal2 v19.2d,v17.4s,v4.4s
702 umlal2 v20.2d,v17.4s,v6.4s
703 umlal2 v21.2d,v17.4s,v8.4s
704
705 umlal2 v22.2d,v18.4s,v8.4s
706 umlal2 v19.2d,v18.4s,v2.4s
707 umlal2 v23.2d,v18.4s,v0.4s
708 umlal2 v20.2d,v18.4s,v4.4s
709 umlal2 v21.2d,v18.4s,v6.4s
710
711 b.eq .Lshort_tail // flags from the adds x2,x2,#32 above
712
713 ////////////////////////////////////////////////////////////////
714 // (hash+inp[0:1])*r^4:r^3 and accumulate
715
716 add v9.2s,v9.2s,v24.2s
717 umlal v22.2d,v11.2s,v1.2s
718 umlal v19.2d,v11.2s,v6.2s
719 umlal v23.2d,v11.2s,v3.2s
720 umlal v20.2d,v11.2s,v8.2s
721 umlal v21.2d,v11.2s,v0.2s
722
723 add v10.2s,v10.2s,v25.2s
724 umlal v22.2d,v9.2s,v5.2s
725 umlal v19.2d,v9.2s,v0.2s
726 umlal v23.2d,v9.2s,v7.2s
727 umlal v20.2d,v9.2s,v1.2s
728 umlal v21.2d,v9.2s,v3.2s
729
730 add v12.2s,v12.2s,v27.2s
731 umlal v22.2d,v10.2s,v3.2s
732 umlal v19.2d,v10.2s,v8.2s
733 umlal v23.2d,v10.2s,v5.2s
734 umlal v20.2d,v10.2s,v0.2s
735 umlal v21.2d,v10.2s,v1.2s
736
737 add v13.2s,v13.2s,v28.2s
738 umlal v22.2d,v12.2s,v0.2s
739 umlal v19.2d,v12.2s,v4.2s
740 umlal v23.2d,v12.2s,v1.2s
741 umlal v20.2d,v12.2s,v6.2s
742 umlal v21.2d,v12.2s,v8.2s
743
744 umlal v22.2d,v13.2s,v8.2s
745 umlal v19.2d,v13.2s,v2.2s
746 umlal v23.2d,v13.2s,v0.2s
747 umlal v20.2d,v13.2s,v4.2s
748 umlal v21.2d,v13.2s,v6.2s
749
750 .Lshort_tail:
751 ////////////////////////////////////////////////////////////////
752 // horizontal add
753
754 addp v22.2d,v22.2d,v22.2d
755 ldp d8,d9,[sp,#16] // meet ABI requirements
756 addp v19.2d,v19.2d,v19.2d
757 ldp d10,d11,[sp,#32]
758 addp v23.2d,v23.2d,v23.2d
759 ldp d12,d13,[sp,#48]
760 addp v20.2d,v20.2d,v20.2d
761 ldp d14,d15,[sp,#64]
762 addp v21.2d,v21.2d,v21.2d
763
764 ////////////////////////////////////////////////////////////////
765 // lazy reduction, but without narrowing
766
767 ushr v29.2d,v22.2d,#26
768 and v22.16b,v22.16b,v31.16b
769 ushr v30.2d,v19.2d,#26
770 and v19.16b,v19.16b,v31.16b
771
772 add v23.2d,v23.2d,v29.2d // h3 -> h4
773 add v20.2d,v20.2d,v30.2d // h0 -> h1
774
775 ushr v29.2d,v23.2d,#26
776 and v23.16b,v23.16b,v31.16b
777 ushr v30.2d,v20.2d,#26
778 and v20.16b,v20.16b,v31.16b
779 add v21.2d,v21.2d,v30.2d // h1 -> h2
780
781 add v19.2d,v19.2d,v29.2d
782 shl v29.2d,v29.2d,#2
783 ushr v30.2d,v21.2d,#26
784 and v21.16b,v21.16b,v31.16b
785 add v19.2d,v19.2d,v29.2d // h4 -> h0
786 add v22.2d,v22.2d,v30.2d // h2 -> h3
787
788 ushr v29.2d,v19.2d,#26
789 and v19.16b,v19.16b,v31.16b
790 ushr v30.2d,v22.2d,#26
791 and v22.16b,v22.16b,v31.16b
792 add v20.2d,v20.2d,v29.2d // h0 -> h1
793 add v23.2d,v23.2d,v30.2d // h3 -> h4
794
795 ////////////////////////////////////////////////////////////////
796 // write the result, can be partially reduced
797
798 st4 {v19.s,v20.s,v21.s,v22.s}[0],[x0],#16 // h0..h3 as 32-bit words
799 st1 {v23.s}[0],[x0] // h4
800
801 .Lno_data_neon:
802 ldr x29,[sp],#80 // pop the frame first so sp matches its value at paciasp
803 .inst 0xd50323bf // autiasp
804 ret
805 .size poly1305_blocks_neon,.-poly1305_blocks_neon
806
//----------------------------------------------------------------------
// poly1305_emit_neon
// In:    x0 = context
//        x1 = 16-byte output buffer
//        x2 = 16-byte nonce
// Clobb: x4-x6, x10-x14, x17, flags
//
// Tail-calls poly1305_emit when the is_base2_26 flag at context+24 is
// zero.  Otherwise the five 32-bit base 2^26 limbs of the hash are
// repacked into base 2^64, the hash is reduced modulo 2^130-5, the nonce
// is added modulo 2^128 and 16 bytes are stored at x1.
//----------------------------------------------------------------------
.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
	ldr	x17,[x0,#24]		// is_base2_26
	cbz	x17,poly1305_emit	// hash is base 2^64: use scalar emit

	ldr	w14,[x0,#16]		// load hash value base 2^26: limb 4
	ldp	w12,w13,[x0,#8]		// limbs 2, 3
	ldp	w10,w11,[x0]		// limbs 0, 1

	lsr	x6,x14,#24		// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	add	x4,x10,x11,lsl#26
	add	x5,x5,x13,lsl#14
	adds	x4,x4,x12,lsl#52
	adc	x5,x5,xzr
	adds	x5,x5,x14,lsl#40
	adc	x6,x6,xzr		// can be partially reduced...

	ldp	x10,x11,[x2]		// load nonce
#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif

	lsr	x12,x6,#2		// ... so reduce: fold bits >= 2^130
	and	x6,x6,#3
	add	x12,x12,x12,lsl#2	// x12 = 5 * (top word >> 2)
	adds	x4,x4,x12
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

	adds	x12,x4,#5		// g = h + 5, i.e. compare h to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// any bit of g at or above 2^130?
	csel	x4,x12,x4,ne		// yes: take the low words of g
	csel	x5,x13,x5,ne

	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit_neon,.-poly1305_emit_neon
858
// Read-only constants placed after the code, reached with adr and
// PC-relative ldr from the functions above.
859 .align 5
// 32 zero bytes: poly1305_blocks_neon points x16 here instead of at
// inp[2:3] when fewer than 64 bytes of input remain.
860 .Lzeros:
861 .long 0,0,0,0,0,0,0,0
// Offset from this word to OPENSSL_armcap_P; poly1305_init adds it to the
// address of this label to locate the capability word.
862 .LOPENSSL_armcap_P:
863 #ifdef __ILP32__
864 .long OPENSSL_armcap_P-.
865 #else
866 .quad OPENSSL_armcap_P-.
867 #endif
// NUL-terminated ASCII: "Poly1305 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
868 .byte 80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
869 .align 2
870 .align 2
871