.text

// Keccak-f[1600] round constants (the per-round value XORed into lane
// A[0][0] by the Iota step), one 64-bit word per round, 24 rounds.
//
// Layout matters: the section position is aligned to 256 bytes and 64
// bytes of zero padding precede the table, so the 192-byte table ends
// exactly on a 256-byte boundary.  The scalar permutation in this file
// relies on that: it post-increments its table pointer and stops when
// the pointer's low 8 bits become zero.
.align	8
	.fill	8,8,0			// 8 zero quadwords of padding
.type	iotas,%object
iotas:
	.quad	0x0000000000000001	// round  0
	.quad	0x0000000000008082	// round  1
	.quad	0x800000000000808a	// round  2
	.quad	0x8000000080008000	// round  3
	.quad	0x000000000000808b	// round  4
	.quad	0x0000000080000001	// round  5
	.quad	0x8000000080008081	// round  6
	.quad	0x8000000000008009	// round  7
	.quad	0x000000000000008a	// round  8
	.quad	0x0000000000000088	// round  9
	.quad	0x0000000080008009	// round 10
	.quad	0x000000008000000a	// round 11
	.quad	0x000000008000808b	// round 12
	.quad	0x800000000000008b	// round 13
	.quad	0x8000000000008089	// round 14
	.quad	0x8000000000008003	// round 15
	.quad	0x8000000000008002	// round 16
	.quad	0x8000000000000080	// round 17
	.quad	0x000000000000800a	// round 18
	.quad	0x800000008000000a	// round 19
	.quad	0x8000000080008081	// round 20
	.quad	0x8000000000008080	// round 21
	.quad	0x0000000080000001	// round 22
	.quad	0x8000000080008008	// round 23
.size	iotas,.-iotas
//-----------------------------------------------------------------------
// KeccakF1600_int -- Keccak-f[1600] rounds on a register-resident state.
//
// In/Out: the 25 state lanes are held in x0-x17, x25 and x19-x24
//         (x18 is never touched).
// Stack:  uses the 32 bytes at [sp,#0]..[sp,#31], which the caller must
//         have reserved:
//           [sp,#0]   temporary spill of the lanes in x4 and x9
//           [sp,#16]  cursor into the iotas table
//           [sp,#24]  return address (x30 doubles as a scratch register)
// Clobb:  x26, x27, x28, x30 (reloaded before return), flags.
// Loop:   one round per iteration; terminates when the post-incremented
//         iotas cursor has its low 8 bits clear.
//-----------------------------------------------------------------------

// keccak_rol dst, src, n  --  dst = src rotated left by n bits.
.macro	keccak_rol	dst, src, n
	ror	\dst,\src,#64-\n
.endm

.type	KeccakF1600_int,%function
.align	5
KeccakF1600_int:
	adr	x28,iotas
	stp	x28,x30,[sp,#16]	// save table cursor and return address
	b	.Loop
.align	4
.Loop:
	//---------------------------------------------------------------
	// Theta: column parities C[0..4] accumulate in x26,x27,x28,x30,x4.
	//---------------------------------------------------------------
	eor	x26,x0,x5
	stp	x4,x9,[sp,#0]		// x4 and x9 are reused as temporaries below
	eor	x27,x1,x6
	eor	x28,x2,x7
	eor	x30,x3,x8
	eor	x4,x4,x9
	eor	x26,x26,x10
	eor	x27,x27,x11
	eor	x28,x28,x12
	eor	x30,x30,x13
	eor	x4,x4,x14
	eor	x26,x26,x15
	eor	x27,x27,x16
	eor	x28,x28,x17
	eor	x30,x30,x25
	eor	x4,x4,x19
	eor	x26,x26,x20
	eor	x28,x28,x22
	eor	x27,x27,x21
	eor	x30,x30,x23
	eor	x4,x4,x24

	eor	x9,x26,x28,ror#63	// D[1] = C[0] ^ rol(C[2],1)

	eor	x1,x1,x9		// column 1 ^= D[1]
	eor	x6,x6,x9
	eor	x11,x11,x9
	eor	x16,x16,x9
	eor	x21,x21,x9

	eor	x9,x27,x30,ror#63	// D[2] = C[1] ^ rol(C[3],1)
	eor	x28,x28,x4,ror#63	// D[3] = C[2] ^ rol(C[4],1)
	eor	x30,x30,x26,ror#63	// D[4] = C[3] ^ rol(C[0],1)
	eor	x4,x4,x27,ror#63	// D[0] = C[4] ^ rol(C[1],1)

	eor	x27,x2,x9		// column 2 ^= D[2]; x2's result parked in x27
	eor	x7,x7,x9
	eor	x12,x12,x9
	eor	x17,x17,x9
	eor	x22,x22,x9

	eor	x0,x0,x4		// column 0 ^= D[0]
	eor	x5,x5,x4
	eor	x10,x10,x4
	eor	x15,x15,x4
	eor	x20,x20,x4
	ldp	x4,x9,[sp,#0]		// bring back the spilled lanes
	eor	x26,x3,x28		// column 3 ^= D[3]; x3's result parked in x26
	eor	x8,x8,x28
	eor	x13,x13,x28
	eor	x25,x25,x28
	eor	x23,x23,x28

	eor	x28,x4,x30		// column 4 ^= D[4]; x4's result parked in x28
	eor	x9,x9,x30
	eor	x14,x14,x30
	eor	x19,x19,x30
	eor	x24,x24,x30

	//---------------------------------------------------------------
	// Rho+Pi: rotate each lane and move it to its new register.  The
	// order matters: every source is read before it is overwritten.
	// x27, x26, x28 carry the parked values from Theta; x30 parks x1.
	//---------------------------------------------------------------
	mov	x30,x1
	keccak_rol	x1,x6,44
	keccak_rol	x2,x12,43
	keccak_rol	x3,x25,21
	keccak_rol	x4,x24,14

	keccak_rol	x6,x9,20
	keccak_rol	x12,x13,25
	keccak_rol	x25,x17,15
	keccak_rol	x24,x21,2

	keccak_rol	x9,x22,61
	keccak_rol	x13,x19,8
	keccak_rol	x17,x11,10
	keccak_rol	x21,x8,55

	keccak_rol	x22,x14,39
	keccak_rol	x19,x23,56
	keccak_rol	x11,x7,6
	keccak_rol	x8,x16,45

	keccak_rol	x14,x20,18
	keccak_rol	x23,x15,41
	keccak_rol	x7,x10,3
	keccak_rol	x16,x5,36

	keccak_rol	x5,x26,28
	keccak_rol	x10,x30,1
	keccak_rol	x15,x28,27
	keccak_rol	x20,x27,62

	//---------------------------------------------------------------
	// Chi+Iota: per row, a[i] ^= ~a[i+1] & a[i+2]; then A[0][0] ^= RC.
	// The iotas cursor handling is interleaved with rows 0 and 1.
	//---------------------------------------------------------------
	bic	x26,x2,x1
	bic	x27,x3,x2
	bic	x28,x0,x4
	bic	x30,x1,x0
	eor	x0,x0,x26
	bic	x26,x4,x3
	eor	x1,x1,x27
	ldr	x27,[sp,#16]		// x27 = iotas cursor
	eor	x3,x3,x28
	eor	x4,x4,x30
	eor	x2,x2,x26
	ldr	x30,[x27],#8		// x30 = round constant; advance cursor

	bic	x26,x7,x6
	tst	x27,#0xff		// cursor at table end? (flags consumed by b.ne below)
	str	x27,[sp,#16]
	bic	x27,x8,x7
	bic	x28,x5,x9
	eor	x0,x0,x30		// A[0][0] ^= round constant
	bic	x30,x6,x5
	eor	x5,x5,x26
	bic	x26,x9,x8
	eor	x6,x6,x27
	eor	x8,x8,x28
	eor	x9,x9,x30
	eor	x7,x7,x26

	bic	x26,x12,x11
	bic	x27,x13,x12
	bic	x28,x10,x14
	bic	x30,x11,x10
	eor	x10,x10,x26
	bic	x26,x14,x13
	eor	x11,x11,x27
	eor	x13,x13,x28
	eor	x14,x14,x30
	eor	x12,x12,x26

	bic	x26,x17,x16
	bic	x27,x25,x17
	bic	x28,x15,x19
	bic	x30,x16,x15
	eor	x15,x15,x26
	bic	x26,x19,x25
	eor	x16,x16,x27
	eor	x25,x25,x28
	eor	x19,x19,x30
	eor	x17,x17,x26

	bic	x26,x22,x21
	bic	x27,x23,x22
	bic	x28,x20,x24
	bic	x30,x21,x20
	eor	x20,x20,x26
	bic	x26,x24,x23
	eor	x21,x21,x27
	eor	x23,x23,x28
	eor	x24,x24,x30
	eor	x22,x22,x26

	b.ne	.Loop			// no flag-setting instruction since the tst above

	ldr	x30,[sp,#24]		// recover return address
	ret
.size	KeccakF1600_int,.-KeccakF1600_int

.purgem	keccak_rol
202
//-----------------------------------------------------------------------
// KeccakF1600 -- apply the scalar permutation to a state in memory.
//
// In:    x0 = pointer to the 25-lane (200-byte) state, updated in place
// Clobb: x0-x17 (they carry the state lanes), flags
// Stack: 128-byte register save area plus 48 bytes of scratch; the low
//        32 scratch bytes belong to KeccakF1600_int, [sp,#32] keeps the
//        state pointer across the call.
//-----------------------------------------------------------------------
.type	KeccakF1600,%function
.align	5
KeccakF1600:
	stp	x29,x30,[sp,#-128]!
	mov	x29,sp
	stp	x19,x20,[sp,#16]	// callee-saved registers used for lanes/temps
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#48

	str	x0,[sp,#32]		// remember the state pointer
	mov	x26,x0			// x0 is about to become lane 0
	ldp	x0,x1,[x0,#8*0]
	ldp	x2,x3,[x26,#8*2]
	ldp	x4,x5,[x26,#8*4]
	ldp	x6,x7,[x26,#8*6]
	ldp	x8,x9,[x26,#8*8]
	ldp	x10,x11,[x26,#8*10]
	ldp	x12,x13,[x26,#8*12]
	ldp	x14,x15,[x26,#8*14]
	ldp	x16,x17,[x26,#8*16]
	ldp	x25,x19,[x26,#8*18]	// lane 18 lives in x25, not x18
	ldp	x20,x21,[x26,#8*20]
	ldp	x22,x23,[x26,#8*22]
	ldr	x24,[x26,#8*24]

	bl	KeccakF1600_int

	ldr	x26,[sp,#32]		// state pointer back
	stp	x0,x1,[x26,#8*0]
	stp	x2,x3,[x26,#8*2]
	stp	x4,x5,[x26,#8*4]
	stp	x6,x7,[x26,#8*6]
	stp	x8,x9,[x26,#8*8]
	stp	x10,x11,[x26,#8*10]
	stp	x12,x13,[x26,#8*12]
	stp	x14,x15,[x26,#8*14]
	stp	x16,x17,[x26,#8*16]
	stp	x25,x19,[x26,#8*18]
	stp	x20,x21,[x26,#8*20]
	stp	x22,x23,[x26,#8*22]
	str	x24,[x26,#8*24]

	ldp	x19,x20,[x29,#16]	// restore via the frame pointer
	add	sp,sp,#48
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#128
	ret
.size	KeccakF1600,.-KeccakF1600
257
//-----------------------------------------------------------------------
// SHA3_absorb -- XOR whole input blocks into the state and permute.
//
// In:    x0 = uint64_t A[5][5]   (state, updated in place)
//        x1 = const void *inp
//        x2 = size_t len
//        x3 = size_t bsz         (block size in bytes, a multiple of 8)
// Out:   x0 = number of trailing input bytes (< bsz) left unabsorbed
// Stack: 128-byte register save area plus 64 bytes of scratch:
//          [sp,#0..#31] owned by KeccakF1600_int
//          [sp,#32] A   [sp,#40] inp   [sp,#48] len   [sp,#56] bsz
//-----------------------------------------------------------------------

// sha3_absorb_lane reg -- reg ^= next 64-bit little-endian input word.
// Uses x27 as the input cursor and x26 as scratch; leaves flags alone.
.macro	sha3_absorb_lane	lane
	ldr	x26,[x27],#8		// x26 = *inp++
#ifdef	__AARCH64EB__
	rev	x26,x26
#endif
	eor	\lane,\lane,x26
.endm

// sha3_absorb_pair even, odd, idx -- absorb lanes idx and idx+1, leaving
// for .Lprocess_block as soon as bsz bytes have been consumed.  A single
// compare serves both exits: "lo" means bsz == 8*(idx+1), "eq" means
// bsz == 8*(idx+2).
.macro	sha3_absorb_pair	even, odd, idx
	sha3_absorb_lane	\even
	cmp	x30,#8*(\idx+2)
	b.lo	.Lprocess_block
	sha3_absorb_lane	\odd
	b.eq	.Lprocess_block
.endm

.globl	SHA3_absorb
.type	SHA3_absorb,%function
.align	5
SHA3_absorb:
	stp	x29,x30,[sp,#-128]!
	mov	x29,sp
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64

	stp	x0,x1,[sp,#32]		// keep the arguments in memory, since
	stp	x2,x3,[sp,#48]		// x0-x3 are about to hold state lanes

	mov	x26,x0			// A
	mov	x27,x1			// inp
	mov	x28,x2			// len
	mov	x30,x3			// bsz
	ldp	x0,x1,[x26,#8*0]
	ldp	x2,x3,[x26,#8*2]
	ldp	x4,x5,[x26,#8*4]
	ldp	x6,x7,[x26,#8*6]
	ldp	x8,x9,[x26,#8*8]
	ldp	x10,x11,[x26,#8*10]
	ldp	x12,x13,[x26,#8*12]
	ldp	x14,x15,[x26,#8*14]
	ldp	x16,x17,[x26,#8*16]
	ldp	x25,x19,[x26,#8*18]	// lane 18 lives in x25, not x18
	ldp	x20,x21,[x26,#8*20]
	ldp	x22,x23,[x26,#8*22]
	ldr	x24,[x26,#8*24]
	b	.Loop_absorb

.align	4
.Loop_absorb:
	subs	x26,x28,x30		// x26 = len - bsz
	b.lo	.Labsorbed		// less than one block left

	str	x26,[sp,#48]		// stored len becomes len - bsz

	sha3_absorb_pair	x0,x1,0
	sha3_absorb_pair	x2,x3,2
	sha3_absorb_pair	x4,x5,4
	sha3_absorb_pair	x6,x7,6
	sha3_absorb_pair	x8,x9,8
	sha3_absorb_pair	x10,x11,10
	sha3_absorb_pair	x12,x13,12
	sha3_absorb_pair	x14,x15,14
	sha3_absorb_pair	x16,x17,16
	sha3_absorb_pair	x25,x19,18
	sha3_absorb_pair	x20,x21,20
	sha3_absorb_pair	x22,x23,22
	sha3_absorb_lane	x24

.Lprocess_block:
	str	x27,[sp,#40]		// x27/x28/x30 are clobbered by the permutation

	bl	KeccakF1600_int

	ldr	x27,[sp,#40]		// inp
	ldp	x28,x30,[sp,#48]	// updated len, bsz
	b	.Loop_absorb

.align	4
.Labsorbed:
	ldr	x27,[sp,#32]		// A
	stp	x0,x1,[x27,#8*0]
	stp	x2,x3,[x27,#8*2]
	stp	x4,x5,[x27,#8*4]
	stp	x6,x7,[x27,#8*6]
	stp	x8,x9,[x27,#8*8]
	stp	x10,x11,[x27,#8*10]
	stp	x12,x13,[x27,#8*12]
	stp	x14,x15,[x27,#8*14]
	stp	x16,x17,[x27,#8*16]
	stp	x25,x19,[x27,#8*18]
	stp	x20,x21,[x27,#8*20]
	stp	x22,x23,[x27,#8*22]
	str	x24,[x27,#8*24]

	mov	x0,x28			// return the leftover byte count
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#128
	ret
.size	SHA3_absorb,.-SHA3_absorb

.purgem	sha3_absorb_pair
.purgem	sha3_absorb_lane
//-----------------------------------------------------------------------
// SHA3_squeeze -- copy output bytes out of the state, permuting the
// state each time a full block of bsz bytes has been emitted and more
// output is still required.
//
// In:    x0 = pointer to the 25-lane state (also handed to KeccakF1600)
//        x1 = output buffer
//        x2 = number of output bytes requested (len)
//        x3 = block size in bytes (bsz)
// Out:   none
// Clobb: x0-x17, flags (x0-x17 via KeccakF1600)
//
// Lanes are emitted in little-endian byte order: whole lanes are
// byte-swapped on big-endian builds, the final partial lane is written
// one byte at a time, least-significant byte first.
//
// len == 0 returns immediately.  Without that guard the code fell into
// the byte tail, where the remaining-length counter wrapped below zero
// and 7 bytes were stored past the caller's buffer.
//-----------------------------------------------------------------------
.globl	SHA3_squeeze
.type	SHA3_squeeze,%function
.align	5
SHA3_squeeze:
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	mov	x19,x0			// state pointer (survives KeccakF1600)
	mov	x20,x1			// output cursor
	mov	x21,x2			// bytes still to produce
	mov	x22,x3			// bsz, to re-arm the per-block counter
	cbz	x2,.Lsqueeze_done	// nothing requested: write nothing

.Loop_squeeze:
	ldr	x4,[x0],#8		// next state lane
	cmp	x21,#8
	blo	.Lsqueeze_tail		// fewer than 8 bytes wanted
#ifdef	__AARCH64EB__
	rev	x4,x4
#endif
	str	x4,[x20],#8
	subs	x21,x21,#8
	beq	.Lsqueeze_done

	subs	x3,x3,#8		// bytes left in the current block
	bhi	.Loop_squeeze

	mov	x0,x19			// block exhausted: permute the state
	bl	KeccakF1600
	mov	x0,x19			// x0 and x3 were clobbered by the call
	mov	x3,x22
	b	.Loop_squeeze

.align	4
.Lsqueeze_tail:				// 1..7 bytes remain, x4 = current lane
.rept	6
	strb	w4,[x20],#1
	lsr	x4,x4,#8
	subs	x21,x21,#1
	beq	.Lsqueeze_done
.endr
	strb	w4,[x20],#1		// seventh and last possible byte

.Lsqueeze_done:
	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x29,x30,[sp],#48
	ret
.size	SHA3_squeeze,.-SHA3_squeeze
//-----------------------------------------------------------------------
// KeccakF1600_ce -- Keccak-f[1600] on a state held in vector registers.
//
// In/Out: lane i of the state is in v<i> for i = 0..24.
// Clobb:  x9 (iteration counter), x10 (iotas cursor), x11 (round
//         constant), v25-v31, flags.  v8-v15 are state lanes and are
//         modified; saving them is left to the caller.
// Loop:   12 iterations, each performing two rounds (two round constants
//         are fetched per iteration).  The first round leaves some lanes
//         in temporaries; the second round reads them from there and
//         writes every lane back to v0-v24.
//
// The eor3/rax1/xar/bcax instructions are emitted as raw .inst words;
// the comment after each word names the instruction it encodes.
//-----------------------------------------------------------------------
.type	KeccakF1600_ce,%function
.align	5
KeccakF1600_ce:
	mov	x9,#12			// 12 double-round iterations
	adr	x10,iotas
	b	.Loop_ce
.align	4
.Loop_ce:
	//======================================================= round 2k
	//------------------------------------------------------- Theta
	.inst	0xce052819	//eor3 v25.16b,v0.16b,v5.16b,v10.16b
	.inst	0xce062c3a	//eor3 v26.16b,v1.16b,v6.16b,v11.16b
	.inst	0xce07305b	//eor3 v27.16b,v2.16b,v7.16b,v12.16b
	.inst	0xce08347c	//eor3 v28.16b,v3.16b,v8.16b,v13.16b
	.inst	0xce09389d	//eor3 v29.16b,v4.16b,v9.16b,v14.16b
	.inst	0xce0f5339	//eor3 v25.16b,v25.16b, v15.16b,v20.16b
	.inst	0xce10575a	//eor3 v26.16b,v26.16b, v16.16b,v21.16b
	.inst	0xce115b7b	//eor3 v27.16b,v27.16b, v17.16b,v22.16b
	.inst	0xce125f9c	//eor3 v28.16b,v28.16b, v18.16b,v23.16b
	.inst	0xce1363bd	//eor3 v29.16b,v29.16b, v19.16b,v24.16b

	.inst	0xce7b8f3e	//rax1 v30.16b,v25.16b,v27.16b // D[1]
	.inst	0xce7c8f5f	//rax1 v31.16b,v26.16b,v28.16b // D[2]
	.inst	0xce7d8f7b	//rax1 v27.16b,v27.16b,v29.16b // D[3]
	.inst	0xce798f9c	//rax1 v28.16b,v28.16b,v25.16b // D[4]
	.inst	0xce7a8fbd	//rax1 v29.16b,v29.16b,v26.16b // D[0]

	//------------------------------------------------------- Theta+Rho+Pi
	.inst	0xce9e50d9	//xar v25.16b, v6.16b,v30.16b,#64-44 // C[0]=A[0][1]
	.inst	0xce9cb126	//xar v6.16b,v9.16b,v28.16b,#64-20
	.inst	0xce9f0ec9	//xar v9.16b,v22.16b,v31.16b,#64-61
	.inst	0xce9c65d6	//xar v22.16b,v14.16b,v28.16b,#64-39
	.inst	0xce9dba8e	//xar v14.16b,v20.16b,v29.16b,#64-18

	.inst	0xce9f0854	//xar v20.16b,v2.16b,v31.16b,#64-62

	.inst	0xce9f5582	//xar v2.16b,v12.16b,v31.16b,#64-43
	.inst	0xce9b9dac	//xar v12.16b,v13.16b,v27.16b,#64-25
	.inst	0xce9ce26d	//xar v13.16b,v19.16b,v28.16b,#64-8
	.inst	0xce9b22f3	//xar v19.16b,v23.16b,v27.16b,#64-56
	.inst	0xce9d5df7	//xar v23.16b,v15.16b,v29.16b,#64-41

	.inst	0xce9c948f	//xar v15.16b,v4.16b,v28.16b,#64-27

	eor	v0.16b,v0.16b,v29.16b	// lane 0 gets Theta only (no rotation)
	ldr	x11,[x10],#8		// round constant for this round

	.inst	0xce9bae5a	//xar v26.16b, v18.16b,v27.16b,#64-21 // C[1]=A[0][3]
	.inst	0xce9fc632	//xar v18.16b,v17.16b,v31.16b,#64-15
	.inst	0xce9ed971	//xar v17.16b,v11.16b,v30.16b,#64-10
	.inst	0xce9fe8eb	//xar v11.16b,v7.16b,v31.16b,#64-6
	.inst	0xce9df547	//xar v7.16b,v10.16b,v29.16b,#64-3

	.inst	0xce9efc2a	//xar v10.16b,v1.16b,v30.16b,#64-1 // *

	.inst	0xce9ccb04	//xar v4.16b,v24.16b,v28.16b,#64-14
	.inst	0xce9efab8	//xar v24.16b,v21.16b,v30.16b,#64-2
	.inst	0xce9b2515	//xar v21.16b,v8.16b,v27.16b,#64-55
	.inst	0xce9e4e08	//xar v8.16b,v16.16b,v30.16b,#64-45
	.inst	0xce9d70b0	//xar v16.16b,v5.16b,v29.16b,#64-36

	.inst	0xce9b907b	//xar v27.16b, v3.16b,v27.16b,#64-28 // C[2]=A[1][0]

	//------------------------------------------------------- Chi+Iota
	dup	v31.2d,x11		// borrow C[6]
	.inst	0xce22641c	//bcax v28.16b, v0.16b,v2.16b,v25.16b // *
	.inst	0xce3a0b21	//bcax v1.16b,v25.16b, v26.16b, v2.16b // *
	.inst	0xce246842	//bcax v2.16b,v2.16b,v4.16b,v26.16b
	.inst	0xce201343	//bcax v3.16b,v26.16b, v0.16b,v4.16b
	.inst	0xce390084	//bcax v4.16b,v4.16b,v25.16b, v0.16b

	.inst	0xce271b65	//bcax v5.16b,v27.16b, v7.16b,v6.16b // *
	.inst	0xce281cd9	//bcax v25.16b, v6.16b,v8.16b,v7.16b // *
	.inst	0xce2920e7	//bcax v7.16b,v7.16b,v9.16b,v8.16b
	.inst	0xce3b2508	//bcax v8.16b,v8.16b,v27.16b, v9.16b
	.inst	0xce266d29	//bcax v9.16b,v9.16b,v6.16b,v27.16b

	eor	v0.16b,v28.16b,v31.16b	// Iota

	.inst	0xce2c2d5a	//bcax v26.16b, v10.16b,v12.16b,v11.16b // *
	.inst	0xce2d317b	//bcax v27.16b, v11.16b,v13.16b,v12.16b // *
	.inst	0xce2e358c	//bcax v12.16b,v12.16b,v14.16b,v13.16b
	.inst	0xce2a39ad	//bcax v13.16b,v13.16b,v10.16b,v14.16b
	.inst	0xce2b29ce	//bcax v14.16b,v14.16b,v11.16b,v10.16b

	.inst	0xce3141fc	//bcax v28.16b, v15.16b,v17.16b,v16.16b // *
	.inst	0xce32461d	//bcax v29.16b, v16.16b,v18.16b,v17.16b // *
	.inst	0xce334a31	//bcax v17.16b,v17.16b,v19.16b,v18.16b
	.inst	0xce2f4e52	//bcax v18.16b,v18.16b,v15.16b,v19.16b
	.inst	0xce303e73	//bcax v19.16b,v19.16b,v16.16b,v15.16b

	.inst	0xce36569e	//bcax v30.16b, v20.16b,v22.16b,v21.16b // *
	.inst	0xce375abf	//bcax v31.16b, v21.16b,v23.16b,v22.16b // *
	.inst	0xce385ed6	//bcax v22.16b,v22.16b,v24.16b,v23.16b
	.inst	0xce3462f7	//bcax v23.16b,v23.16b,v20.16b,v24.16b
	.inst	0xce355318	//bcax v24.16b,v24.16b,v21.16b,v20.16b

	//======================================================= round 2k+1
	//------------------------------------------------------- Theta
	.inst	0xce056806	//eor3 v6.16b,v0.16b,v5.16b,v26.16b
	.inst	0xce196c2a	//eor3 v10.16b,v1.16b,v25.16b,v27.16b
	.inst	0xce07304b	//eor3 v11.16b,v2.16b,v7.16b,v12.16b
	.inst	0xce08346f	//eor3 v15.16b,v3.16b,v8.16b,v13.16b
	.inst	0xce093890	//eor3 v16.16b,v4.16b,v9.16b,v14.16b
	.inst	0xce1c78c6	//eor3 v6.16b,v6.16b, v28.16b,v30.16b
	.inst	0xce1d7d4a	//eor3 v10.16b,v10.16b, v29.16b,v31.16b
	.inst	0xce11596b	//eor3 v11.16b,v11.16b, v17.16b,v22.16b
	.inst	0xce125def	//eor3 v15.16b,v15.16b, v18.16b,v23.16b
	.inst	0xce136210	//eor3 v16.16b,v16.16b, v19.16b,v24.16b

	.inst	0xce6b8cd4	//rax1 v20.16b,v6.16b,v11.16b // D[1]
	.inst	0xce6f8d55	//rax1 v21.16b,v10.16b,v15.16b // D[2]
	.inst	0xce708d6b	//rax1 v11.16b,v11.16b,v16.16b // D[3]
	.inst	0xce668def	//rax1 v15.16b,v15.16b,v6.16b // D[4]
	.inst	0xce6a8e10	//rax1 v16.16b,v16.16b,v10.16b // D[0]

	//------------------------------------------------------- Theta+Rho+Pi
	.inst	0xce945326	//xar v6.16b, v25.16b,v20.16b,#64-44 // C[0]=A[0][1]
	.inst	0xce8fb139	//xar v25.16b,v9.16b,v15.16b,#64-20
	.inst	0xce950ec9	//xar v9.16b,v22.16b,v21.16b,#64-61
	.inst	0xce8f65d6	//xar v22.16b,v14.16b,v15.16b,#64-39
	.inst	0xce90bbce	//xar v14.16b,v30.16b,v16.16b,#64-18

	.inst	0xce95085e	//xar v30.16b,v2.16b,v21.16b,#64-62

	.inst	0xce955582	//xar v2.16b,v12.16b,v21.16b,#64-43
	.inst	0xce8b9dac	//xar v12.16b,v13.16b,v11.16b,#64-25
	.inst	0xce8fe26d	//xar v13.16b,v19.16b,v15.16b,#64-8
	.inst	0xce8b22f3	//xar v19.16b,v23.16b,v11.16b,#64-56
	.inst	0xce905f97	//xar v23.16b,v28.16b,v16.16b,#64-41

	.inst	0xce8f949c	//xar v28.16b,v4.16b,v15.16b,#64-27

	eor	v0.16b,v0.16b,v16.16b	// lane 0 gets Theta only (no rotation)
	ldr	x11,[x10],#8		// round constant for this round

	.inst	0xce8bae4a	//xar v10.16b, v18.16b,v11.16b,#64-21 // C[1]=A[0][3]
	.inst	0xce95c632	//xar v18.16b,v17.16b,v21.16b,#64-15
	.inst	0xce94db71	//xar v17.16b,v27.16b,v20.16b,#64-10
	.inst	0xce95e8fb	//xar v27.16b,v7.16b,v21.16b,#64-6
	.inst	0xce90f747	//xar v7.16b,v26.16b,v16.16b,#64-3

	.inst	0xce94fc3a	//xar v26.16b,v1.16b,v20.16b,#64-1 // *

	.inst	0xce8fcb04	//xar v4.16b,v24.16b,v15.16b,#64-14
	.inst	0xce94fbf8	//xar v24.16b,v31.16b,v20.16b,#64-2
	.inst	0xce8b251f	//xar v31.16b,v8.16b,v11.16b,#64-55
	.inst	0xce944fa8	//xar v8.16b,v29.16b,v20.16b,#64-45
	.inst	0xce9070bd	//xar v29.16b,v5.16b,v16.16b,#64-36

	.inst	0xce8b906b	//xar v11.16b, v3.16b,v11.16b,#64-28 // C[2]=A[1][0]

	//------------------------------------------------------- Chi+Iota
	dup	v21.2d,x11		// borrow C[6]
	.inst	0xce22180f	//bcax v15.16b, v0.16b,v2.16b,v6.16b // *
	.inst	0xce2a08c1	//bcax v1.16b,v6.16b, v10.16b, v2.16b // *
	.inst	0xce242842	//bcax v2.16b,v2.16b,v4.16b,v10.16b
	.inst	0xce201143	//bcax v3.16b,v10.16b, v0.16b,v4.16b
	.inst	0xce260084	//bcax v4.16b,v4.16b,v6.16b, v0.16b

	.inst	0xce276565	//bcax v5.16b,v11.16b, v7.16b,v25.16b // *
	.inst	0xce281f26	//bcax v6.16b, v25.16b,v8.16b,v7.16b // *
	.inst	0xce2920e7	//bcax v7.16b,v7.16b,v9.16b,v8.16b
	.inst	0xce2b2508	//bcax v8.16b,v8.16b,v11.16b, v9.16b
	.inst	0xce392d29	//bcax v9.16b,v9.16b,v25.16b,v11.16b

	eor	v0.16b,v15.16b,v21.16b	// Iota

	.inst	0xce2c6f4a	//bcax v10.16b, v26.16b,v12.16b,v27.16b // *
	.inst	0xce2d336b	//bcax v11.16b, v27.16b,v13.16b,v12.16b // *
	.inst	0xce2e358c	//bcax v12.16b,v12.16b,v14.16b,v13.16b
	.inst	0xce3a39ad	//bcax v13.16b,v13.16b,v26.16b,v14.16b
	.inst	0xce3b69ce	//bcax v14.16b,v14.16b,v27.16b,v26.16b

	.inst	0xce31778f	//bcax v15.16b, v28.16b,v17.16b,v29.16b // *
	.inst	0xce3247b0	//bcax v16.16b, v29.16b,v18.16b,v17.16b // *
	.inst	0xce334a31	//bcax v17.16b,v17.16b,v19.16b,v18.16b
	.inst	0xce3c4e52	//bcax v18.16b,v18.16b,v28.16b,v19.16b
	.inst	0xce3d7273	//bcax v19.16b,v19.16b,v29.16b,v28.16b

	.inst	0xce367fd4	//bcax v20.16b, v30.16b,v22.16b,v31.16b // *
	.inst	0xce375bf5	//bcax v21.16b, v31.16b,v23.16b,v22.16b // *
	.inst	0xce385ed6	//bcax v22.16b,v22.16b,v24.16b,v23.16b
	.inst	0xce3e62f7	//bcax v23.16b,v23.16b,v30.16b,v24.16b
	.inst	0xce3f7b18	//bcax v24.16b,v24.16b,v31.16b,v30.16b
	subs	x9,x9,#1
	b.ne	.Loop_ce

	ret
.size	KeccakF1600_ce,.-KeccakF1600_ce
752
//-----------------------------------------------------------------------
// KeccakF1600_cext -- apply the vector-register permutation to a state
// in memory.
//
// In:    x0 = pointer to the 25-lane (200-byte) state, updated in place
// Keeps: x0-x3 (the squeeze routine in this file relies on that)
// Clobb: x9-x11 and flags (via KeccakF1600_ce), v0-v7, v16-v31
// Stack: 80-byte frame holding x29/x30 and d8-d15
//-----------------------------------------------------------------------
.type	KeccakF1600_cext,%function
.align	5
KeccakF1600_cext:
	stp	x29,x30,[sp,#-80]!
	mov	x29,sp
	stp	d8,d9,[sp,#16]		// d8-d15 must be preserved for the caller
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	// Lane i of the state goes to d<i>.
	ldp	d0,d1,[x0,#8*0]
	ldp	d2,d3,[x0,#8*2]
	ldp	d4,d5,[x0,#8*4]
	ldp	d6,d7,[x0,#8*6]
	ldp	d8,d9,[x0,#8*8]
	ldp	d10,d11,[x0,#8*10]
	ldp	d12,d13,[x0,#8*12]
	ldp	d14,d15,[x0,#8*14]
	ldp	d16,d17,[x0,#8*16]
	ldp	d18,d19,[x0,#8*18]
	ldp	d20,d21,[x0,#8*20]
	ldp	d22,d23,[x0,#8*22]
	ldr	d24,[x0,#8*24]

	bl	KeccakF1600_ce

	stp	d0,d1,[x0,#8*0]
	stp	d2,d3,[x0,#8*2]
	stp	d4,d5,[x0,#8*4]
	stp	d6,d7,[x0,#8*6]
	stp	d8,d9,[x0,#8*8]
	stp	d10,d11,[x0,#8*10]
	stp	d12,d13,[x0,#8*12]
	stp	d14,d15,[x0,#8*14]
	stp	d16,d17,[x0,#8*16]
	stp	d18,d19,[x0,#8*18]
	stp	d20,d21,[x0,#8*20]
	stp	d22,d23,[x0,#8*22]
	str	d24,[x0,#8*24]

	ldp	d8,d9,[sp,#16]
	ldp	d10,d11,[sp,#32]
	ldp	d12,d13,[sp,#48]
	ldp	d14,d15,[sp,#64]
	ldp	x29,x30,[sp],#80	// x30 was overwritten by the bl above
	ret
.size	KeccakF1600_cext,.-KeccakF1600_cext
//-----------------------------------------------------------------------
// SHA3_absorb_cext -- XOR whole input blocks into the state and permute,
// keeping the state in vector registers.
//
// In:    x0 = pointer to the 25-lane state (updated in place)
//        x1 = input pointer
//        x2 = input length in bytes
//        x3 = block size in bytes (a multiple of 8)
// Out:   x0 = number of trailing input bytes (< block size) left
//             unabsorbed
// Clobb: x1, x2, x9-x11 (via KeccakF1600_ce), v0-v7, v16-v31, flags
// Stack: 80-byte frame holding x29/x30 and d8-d15
//
// Big-endian note: every input lane is byte-reversed with rev64.  The
// odd-numbered lanes and lane 24 previously used "rev" with vector
// operands, which is not an A64 instruction, so big-endian builds of
// this routine did not assemble.
//-----------------------------------------------------------------------

// sha3_absorb_lane_ce reg -- reg ^= next 64-bit little-endian input word.
// Uses x1 as the input cursor and v31 as scratch; leaves flags alone.
.macro	sha3_absorb_lane_ce	lane
	ldr	d31,[x1],#8		// d31 = *inp++ (upper half of v31 zeroed)
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b		// byte-swap within the 64-bit lane
#endif
	eor	\lane,\lane,v31.16b
.endm

// sha3_absorb_pair_ce even, odd, idx -- absorb lanes idx and idx+1,
// leaving for .Lprocess_block_ce as soon as the block is consumed.  One
// compare serves both exits: "lo" means bsz == 8*(idx+1), "eq" means
// bsz == 8*(idx+2).
.macro	sha3_absorb_pair_ce	even, odd, idx
	sha3_absorb_lane_ce	\even
	cmp	x3,#8*(\idx+2)
	b.lo	.Lprocess_block_ce
	sha3_absorb_lane_ce	\odd
	b.eq	.Lprocess_block_ce
.endm

.globl	SHA3_absorb_cext
.type	SHA3_absorb_cext,%function
.align	5
SHA3_absorb_cext:
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#16]		// per ABI requirement
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]
	ldp	d0,d1,[x0,#8*0]
	ldp	d2,d3,[x0,#8*2]
	ldp	d4,d5,[x0,#8*4]
	ldp	d6,d7,[x0,#8*6]
	ldp	d8,d9,[x0,#8*8]
	ldp	d10,d11,[x0,#8*10]
	ldp	d12,d13,[x0,#8*12]
	ldp	d14,d15,[x0,#8*14]
	ldp	d16,d17,[x0,#8*16]
	ldp	d18,d19,[x0,#8*18]
	ldp	d20,d21,[x0,#8*20]
	ldp	d22,d23,[x0,#8*22]
	ldr	d24,[x0,#8*24]
	b	.Loop_absorb_ce

.align	4
.Loop_absorb_ce:
	subs	x2,x2,x3		// len -= bsz
	b.lo	.Labsorbed_ce		// less than one block was left

	sha3_absorb_pair_ce	v0.16b,v1.16b,0
	sha3_absorb_pair_ce	v2.16b,v3.16b,2
	sha3_absorb_pair_ce	v4.16b,v5.16b,4
	sha3_absorb_pair_ce	v6.16b,v7.16b,6
	sha3_absorb_pair_ce	v8.16b,v9.16b,8
	sha3_absorb_pair_ce	v10.16b,v11.16b,10
	sha3_absorb_pair_ce	v12.16b,v13.16b,12
	sha3_absorb_pair_ce	v14.16b,v15.16b,14
	sha3_absorb_pair_ce	v16.16b,v17.16b,16
	sha3_absorb_pair_ce	v18.16b,v19.16b,18
	sha3_absorb_pair_ce	v20.16b,v21.16b,20
	sha3_absorb_pair_ce	v22.16b,v23.16b,22
	sha3_absorb_lane_ce	v24.16b

.Lprocess_block_ce:

	bl	KeccakF1600_ce		// x0-x3 are not touched by the permutation

	b	.Loop_absorb_ce

.align	4
.Labsorbed_ce:
	stp	d0,d1,[x0,#8*0]
	stp	d2,d3,[x0,#8*2]
	stp	d4,d5,[x0,#8*4]
	stp	d6,d7,[x0,#8*6]
	stp	d8,d9,[x0,#8*8]
	stp	d10,d11,[x0,#8*10]
	stp	d12,d13,[x0,#8*12]
	stp	d14,d15,[x0,#8*14]
	stp	d16,d17,[x0,#8*16]
	stp	d18,d19,[x0,#8*18]
	stp	d20,d21,[x0,#8*20]
	stp	d22,d23,[x0,#8*22]
	str	d24,[x0,#8*24]
	add	x0,x2,x3		// undo the last subtraction: leftover bytes

	ldp	d8,d9,[sp,#16]
	ldp	d10,d11,[sp,#32]
	ldp	d12,d13,[sp,#48]
	ldp	d14,d15,[sp,#64]
	ldp	x29,x30,[sp],#80
	ret
.size	SHA3_absorb_cext,.-SHA3_absorb_cext

.purgem	sha3_absorb_pair_ce
.purgem	sha3_absorb_lane_ce
//-----------------------------------------------------------------------
// SHA3_squeeze_cext -- copy output bytes out of the state, permuting the
// state (via KeccakF1600_cext) each time a full block of bsz bytes has
// been emitted and more output is still required.
//
// In:    x0 = pointer to the 25-lane state
//        x1 = output buffer
//        x2 = number of output bytes requested (len)
//        x3 = block size in bytes (bsz)
// Out:   none
// Clobb: x1, x2, x4, x9, x10, x11, flags, plus the vector registers
//        clobbered by KeccakF1600_cext
//
// Lanes are emitted in little-endian byte order: whole lanes are
// byte-swapped on big-endian builds, the final partial lane is written
// one byte at a time, least-significant byte first.
//
// len == 0 returns immediately.  Without that guard the code fell into
// the byte tail, where the remaining-length counter wrapped below zero
// and 7 bytes were stored past the caller's buffer.
//-----------------------------------------------------------------------
.globl	SHA3_squeeze_cext
.type	SHA3_squeeze_cext,%function
.align	5
SHA3_squeeze_cext:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	mov	x9,x0			// x9 = read cursor into the state
	mov	x10,x3			// x10 = bytes left in the current block
	cbz	x2,.Lsqueeze_done_ce	// nothing requested: write nothing

.Loop_squeeze_ce:
	ldr	x4,[x9],#8		// next state lane
	cmp	x2,#8
	blo	.Lsqueeze_tail_ce	// fewer than 8 bytes wanted
#ifdef	__AARCH64EB__
	rev	x4,x4
#endif
	str	x4,[x1],#8
	beq	.Lsqueeze_done_ce	// flags still from the cmp: exactly 8 were left

	sub	x2,x2,#8
	subs	x10,x10,#8
	bhi	.Loop_squeeze_ce

	bl	KeccakF1600_cext	// block exhausted: permute; x0-x3 survive
	ldr	x30,[sp,#8]		// our own return address back
	mov	x9,x0
	mov	x10,x3
	b	.Loop_squeeze_ce

.align	4
.Lsqueeze_tail_ce:			// 1..7 bytes remain, x4 = current lane
.rept	6
	strb	w4,[x1],#1
	lsr	x4,x4,#8
	subs	x2,x2,#1
	beq	.Lsqueeze_done_ce
.endr
	strb	w4,[x1],#1		// seventh and last possible byte

.Lsqueeze_done_ce:
	ldr	x29,[sp],#16
	ret
.size	SHA3_squeeze_cext,.-SHA3_squeeze_cext
// NUL-terminated identification string embedded in the object file.
.asciz	"Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.align	2
1082