// keccak1600-armv8.S revision 1.1
.text

// iotas: the 24 64-bit round constants XORed into lane A[0][0], one per
// round, by KeccakF1600_int and KeccakF1600_ce.
//
// Placement matters.  The location counter is first aligned to 256 bytes
// and 64 bytes of zero padding are emitted, so the 192-byte table that
// follows ends exactly on the next 256-byte boundary.  KeccakF1600_int
// tests the low 8 bits of its post-incremented table pointer ("tst #255")
// and leaves its round loop when they are all zero, i.e. right after the
// last constant has been fetched.  Do not insert, remove or reorder
// anything between the alignment directive and the end of the table.
.p2align	8
	.quad	0,0,0,0,0,0,0,0			// 64 bytes of padding
.type	iotas,%object
iotas:
	.quad	0x0000000000000001		// round  0
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b		// round  4
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a		// round  8
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b		// round 12
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002		// round 16
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081		// round 20
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008		// round 23
.size	iotas,.-iotas
// KeccakF1600_int: run the 24-round Keccak-f[1600] permutation on a state
// that is held entirely in general-purpose registers.
//
// In/out:  lanes A[0..24] (flat index) in x0-x17, x25, x19-x24.
//          x25 takes the place of lane 18; x18 is never touched.
// Scratch: x26, x27, x28, x30 and the condition flags.
// Stack:   sp is not moved.  The routine uses the 32 bytes at the top of
//          the stack, which the caller must have set aside:
//            [sp,#0]   spill slot for x4/x9 during Theta
//            [sp,#16]  running pointer into the iotas table
//            [sp,#24]  return address (x30 is reused as scratch)
// This is an internal helper, not an AAPCS64-conformant function: it
// overwrites x19-x28 without saving them, so callers do that themselves.
.type	KeccakF1600_int,%function
.p2align	5
KeccakF1600_int:
	adr	x28,iotas
	hint	#25				// paciasp
	stp	x28,x30,[sp,#16]		// keep table cursor and signed LR in memory
	b	.Lint_round
.p2align	4
.Lint_round:
	////////////////////////////////////////// Theta
	// Column parities build up in x26,x27,x28,x30,x4 (C[0]..C[4]).
	eor	x26,x0,x5
	stp	x4,x9,[sp,#0]			// x4/x9 are about to be reused; park the lanes
	eor	x27,x1,x6
	eor	x28,x2,x7
	eor	x30,x3,x8
	eor	x4,x4,x9
	eor	x26,x26,x10
	eor	x27,x27,x11
	eor	x28,x28,x12
	eor	x30,x30,x13
	eor	x4,x4,x14
	eor	x26,x26,x15
	eor	x27,x27,x16
	eor	x28,x28,x17
	eor	x30,x30,x25
	eor	x4,x4,x19
	eor	x26,x26,x20
	eor	x28,x28,x22
	eor	x27,x27,x21
	eor	x30,x30,x23
	eor	x4,x4,x24

	// "ror #63" is a rotate-left by one bit.
	eor	x9,x26,x28,ror#63		// D[1] = C[0] ^ rol(C[2],1)

	eor	x1,x1,x9			// column 1 ^= D[1]
	eor	x6,x6,x9
	eor	x11,x11,x9
	eor	x16,x16,x9
	eor	x21,x21,x9

	eor	x9,x27,x30,ror#63		// D[2] = C[1] ^ rol(C[3],1)
	eor	x28,x28,x4,ror#63		// D[3] = C[2] ^ rol(C[4],1)
	eor	x30,x30,x26,ror#63		// D[4] = C[3] ^ rol(C[0],1)
	eor	x4,x4,x27,ror#63		// D[0] = C[4] ^ rol(C[1],1)

	eor	x27,x2,x9			// column 2 ^= D[2]; lane 2 moves to x27
	eor	x7,x7,x9
	eor	x12,x12,x9
	eor	x17,x17,x9
	eor	x22,x22,x9

	eor	x0,x0,x4			// column 0 ^= D[0]
	eor	x5,x5,x4
	eor	x10,x10,x4
	eor	x15,x15,x4
	eor	x20,x20,x4
	ldp	x4,x9,[sp,#0]			// bring back the parked lanes 4 and 9
	eor	x26,x3,x28			// column 3 ^= D[3]; lane 3 moves to x26
	eor	x8,x8,x28
	eor	x13,x13,x28
	eor	x25,x25,x28
	eor	x23,x23,x28

	eor	x28,x4,x30			// column 4 ^= D[4]; lane 4 moves to x28
	eor	x9,x9,x30
	eor	x14,x14,x30
	eor	x19,x19,x30
	eor	x24,x24,x30

	////////////////////////////////////////// Rho+Pi
	// "ror #64-r" is a rotate-left by r.  Lanes 1..4 were relocated to
	// x30, x27, x26, x28 (the first by the mov below, the others during
	// Theta) so that x1-x4 can be overwritten straight away.
	mov	x30,x1
	ror	x1,x6,#64-44
	ror	x2,x12,#64-43
	ror	x3,x25,#64-21
	ror	x4,x24,#64-14

	ror	x6,x9,#64-20
	ror	x12,x13,#64-25
	ror	x25,x17,#64-15
	ror	x24,x21,#64-2

	ror	x9,x22,#64-61
	ror	x13,x19,#64-8
	ror	x17,x11,#64-10
	ror	x21,x8,#64-55

	ror	x22,x14,#64-39
	ror	x19,x23,#64-56
	ror	x11,x7,#64-6
	ror	x8,x16,#64-45

	ror	x14,x20,#64-18
	ror	x23,x15,#64-41
	ror	x7,x10,#64-3
	ror	x16,x5,#64-36

	ror	x5,x26,#64-28			// from relocated lane 3
	ror	x10,x30,#64-1			// from relocated lane 1
	ror	x15,x28,#64-27			// from relocated lane 4
	ror	x20,x27,#64-62			// from relocated lane 2

	////////////////////////////////////////// Chi+Iota
	// Per row: A[x] ^= ~A[x+1] & A[x+2]; "bic d,n,m" computes n & ~m.
	// The round-constant fetch and the loop-exit test are interleaved
	// with the first two rows.
	bic	x26,x2,x1
	bic	x27,x3,x2
	bic	x28,x0,x4
	bic	x30,x1,x0
	eor	x0,x0,x26
	bic	x26,x4,x3
	eor	x1,x1,x27
	ldr	x27,[sp,#16]			// table cursor
	eor	x3,x3,x28
	eor	x4,x4,x30
	eor	x2,x2,x26
	ldr	x30,[x27],#8			// round constant, cursor += 8

	bic	x26,x7,x6
	tst	x27,#255			// cursor on a 256-byte boundary => last round
	str	x27,[sp,#16]
	bic	x27,x8,x7
	bic	x28,x5,x9
	eor	x0,x0,x30			// A[0][0] ^= round constant
	bic	x30,x6,x5
	eor	x5,x5,x26
	bic	x26,x9,x8
	eor	x6,x6,x27
	eor	x8,x8,x28
	eor	x9,x9,x30
	eor	x7,x7,x26

	bic	x26,x12,x11
	bic	x27,x13,x12
	bic	x28,x10,x14
	bic	x30,x11,x10
	eor	x10,x10,x26
	bic	x26,x14,x13
	eor	x11,x11,x27
	eor	x13,x13,x28
	eor	x14,x14,x30
	eor	x12,x12,x26

	bic	x26,x17,x16
	bic	x27,x25,x17
	bic	x28,x15,x19
	bic	x30,x16,x15
	eor	x15,x15,x26
	bic	x26,x19,x25
	eor	x16,x16,x27
	eor	x25,x25,x28
	eor	x19,x19,x30
	eor	x17,x17,x26

	bic	x26,x22,x21
	bic	x27,x23,x22
	bic	x28,x20,x24
	bic	x30,x21,x20
	eor	x20,x20,x26
	bic	x26,x24,x23
	eor	x21,x21,x27
	eor	x23,x23,x28
	eor	x24,x24,x30
	eor	x22,x22,x26

	bne	.Lint_round			// flags still come from the tst above

	ldr	x30,[sp,#24]			// recover the return address
	hint	#29				// autiasp
	ret
.size	KeccakF1600_int,.-KeccakF1600_int
204
// KeccakF1600: apply the permutation to a 25-lane state in memory.
//
// In:      x0 = pointer to 25 consecutive 64-bit lanes (200 bytes)
// Out:     the state at that address is replaced by the permuted state
// Clobbers x0-x17, flags; x19-x28, x29 and x30 are saved and restored.
// Frame:   128 bytes for the register save area plus 48 bytes below it:
//          [sp,#0..#31] scratch for KeccakF1600_int, [sp,#32] saved x0.
.type	KeccakF1600,%function
.p2align	5
KeccakF1600:
	hint	#25				// paciasp
	stp	x29,x30,[sp,#-128]!
	mov	x29,sp
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#48

	str	x0,[sp,#32]			// remember the state pointer across the call
	mov	x26,x0				// x0 itself is about to become lane 0
	ldp	x0,x1,[x0,#16*0]
	ldp	x2,x3,[x26,#16*1]
	ldp	x4,x5,[x26,#16*2]
	ldp	x6,x7,[x26,#16*3]
	ldp	x8,x9,[x26,#16*4]
	ldp	x10,x11,[x26,#16*5]
	ldp	x12,x13,[x26,#16*6]
	ldp	x14,x15,[x26,#16*7]
	ldp	x16,x17,[x26,#16*8]
	ldp	x25,x19,[x26,#16*9]		// lane 18 goes to x25, not x18
	ldp	x20,x21,[x26,#16*10]
	ldp	x22,x23,[x26,#16*11]
	ldr	x24,[x26,#16*12]

	bl	KeccakF1600_int

	ldr	x26,[sp,#32]			// state pointer again
	stp	x0,x1,[x26,#16*0]
	stp	x2,x3,[x26,#16*1]
	stp	x4,x5,[x26,#16*2]
	stp	x6,x7,[x26,#16*3]
	stp	x8,x9,[x26,#16*4]
	stp	x10,x11,[x26,#16*5]
	stp	x12,x13,[x26,#16*6]
	stp	x14,x15,[x26,#16*7]
	stp	x16,x17,[x26,#16*8]
	stp	x25,x19,[x26,#16*9]
	stp	x20,x21,[x26,#16*10]
	stp	x22,x23,[x26,#16*11]
	str	x24,[x26,#16*12]

	ldp	x19,x20,[x29,#16]		// save area is addressed through x29
	add	sp,sp,#48
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#128
	hint	#29				// autiasp
	ret
.size	KeccakF1600,.-KeccakF1600
261
// absorb_lane: XOR the next 8 input bytes into one state lane.
//   lane - the x register that holds the lane
// x27 is the input cursor (post-incremented by 8) and x26 is scratch.
// The input is read as little-endian, hence the byte swap on big-endian
// builds.  No instruction in here changes the condition flags.
.macro	absorb_lane lane
	ldr	x26,[x27],#8			// *inp++
#ifdef	__AARCH64EB__
	rev	x26,x26
#endif
	eor	\lane,\lane,x26
.endm

// absorb_pair: absorb lanes idx and idx+1, leaving for the permutation as
// soon as the block size (x30, in bytes) is exhausted.  One cmp serves
// both exits because absorb_lane preserves the flags.
.macro	absorb_pair idx, even, odd
	absorb_lane \even
	cmp	x30,#8*(\idx+2)
	blo	.Labsorb_permute		// bsz ended with lane idx
	absorb_lane \odd
	beq	.Labsorb_permute		// bsz ended with lane idx+1
.endm

// SHA3_absorb: XOR whole input blocks into the state, permuting after each.
//
// In:   x0 = uint64_t A[5][5]   state
//       x1 = const void *inp    input
//       x2 = size_t len         bytes available
//       x3 = size_t bsz         block size in bytes
// Out:  x0 = number of trailing input bytes that did not fill a block
// Frame below the 128-byte register save area (64 bytes):
//       [sp,#0..#31] scratch for KeccakF1600_int
//       [sp,#32] A   [sp,#40] inp   [sp,#48] len   [sp,#56] bsz
.globl	SHA3_absorb
.type	SHA3_absorb,%function
.p2align	5
SHA3_absorb:
	hint	#25				// paciasp
	stp	x29,x30,[sp,#-128]!
	mov	x29,sp
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64

	stp	x0,x1,[sp,#32]			// keep all four arguments in the frame
	stp	x2,x3,[sp,#48]

	mov	x26,x0				// A
	mov	x27,x1				// inp
	mov	x28,x2				// len
	mov	x30,x3				// bsz
	ldp	x0,x1,[x26,#16*0]
	ldp	x2,x3,[x26,#16*1]
	ldp	x4,x5,[x26,#16*2]
	ldp	x6,x7,[x26,#16*3]
	ldp	x8,x9,[x26,#16*4]
	ldp	x10,x11,[x26,#16*5]
	ldp	x12,x13,[x26,#16*6]
	ldp	x14,x15,[x26,#16*7]
	ldp	x16,x17,[x26,#16*8]
	ldp	x25,x19,[x26,#16*9]		// lane 18 lives in x25
	ldp	x20,x21,[x26,#16*10]
	ldp	x22,x23,[x26,#16*11]
	ldr	x24,[x26,#16*12]
	b	.Labsorb_next

.p2align	4
.Labsorb_next:
	subs	x26,x28,x30			// len - bsz
	blo	.Labsorb_done			// less than one block left

	str	x26,[sp,#48]			// becomes len for the next pass
	absorb_pair	0,x0,x1
	absorb_pair	2,x2,x3
	absorb_pair	4,x4,x5
	absorb_pair	6,x6,x7
	absorb_pair	8,x8,x9
	absorb_pair	10,x10,x11
	absorb_pair	12,x12,x13
	absorb_pair	14,x14,x15
	absorb_pair	16,x16,x17
	absorb_pair	18,x25,x19
	absorb_pair	20,x20,x21
	absorb_pair	22,x22,x23
	absorb_lane	x24

.Labsorb_permute:
	str	x27,[sp,#40]			// inp survives the call in the frame

	bl	KeccakF1600_int

	ldr	x27,[sp,#40]			// inp
	ldp	x28,x30,[sp,#48]		// len (already reduced), bsz
	b	.Labsorb_next

.p2align	4
.Labsorb_done:
	ldr	x27,[sp,#32]			// A
	stp	x0,x1,[x27,#16*0]
	stp	x2,x3,[x27,#16*1]
	stp	x4,x5,[x27,#16*2]
	stp	x6,x7,[x27,#16*3]
	stp	x8,x9,[x27,#16*4]
	stp	x10,x11,[x27,#16*5]
	stp	x12,x13,[x27,#16*6]
	stp	x14,x15,[x27,#16*7]
	stp	x16,x17,[x27,#16*8]
	stp	x25,x19,[x27,#16*9]
	stp	x20,x21,[x27,#16*10]
	stp	x22,x23,[x27,#16*11]
	str	x24,[x27,#16*12]

	mov	x0,x28				// return the unabsorbed byte count
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#128
	hint	#29				// autiasp
	ret
.size	SHA3_absorb,.-SHA3_absorb
// SHA3_squeeze: copy output bytes out of the state, re-permuting the state
// with KeccakF1600 every time a full block has been emitted.
//
// In:   x0 = pointer to the 25-lane state
//       x1 = output buffer
//       x2 = number of bytes to produce
//       x3 = block size in bytes (counted down in steps of 8)
// Out:  nothing is returned in a register; the state is updated in place
//       whenever a permutation was needed.
// Lanes are written least-significant byte first.  For whole lanes that
// takes a byte swap on big-endian builds; the byte-wise tail needs none
// because it shifts the value itself.
//
// A request for zero bytes returns at once.  Without that guard the
// length test below would fall into the byte tail, whose countdown would
// wrap past zero and store 7 bytes the caller never asked for.
.globl	SHA3_squeeze
.type	SHA3_squeeze,%function
.align	5
SHA3_squeeze:
	.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	mov	x19,x0				// state pointer, kept across calls
	mov	x20,x1				// output cursor
	mov	x21,x2				// bytes still to produce
	mov	x22,x3				// block size, for reloading x3
	cbz	x2,.Lsqueeze_done		// len == 0: write nothing

.Loop_squeeze:
	ldr	x4,[x0],#8			// next lane of the state
	cmp	x21,#8
	blo	.Lsqueeze_tail			// fewer than 8 bytes wanted
#ifdef	__AARCH64EB__
	rev	x4,x4
#endif
	str	x4,[x20],#8
	subs	x21,x21,#8
	beq	.Lsqueeze_done

	subs	x3,x3,#8			// bytes left in the current block
	bhi	.Loop_squeeze

	mov	x0,x19				// block exhausted: permute the state
	bl	KeccakF1600
	mov	x0,x19				// restart at lane 0
	mov	x3,x22				// with a full block available
	b	.Loop_squeeze

.align	4
.Lsqueeze_tail:
	// 1..7 bytes remain: emit x4 one byte at a time, low byte first.
.rept	6
	strb	w4,[x20],#1
	lsr	x4,x4,#8
	subs	x21,x21,#1
	beq	.Lsqueeze_done
.endr
	strb	w4,[x20],#1			// seventh and last possible byte

.Lsqueeze_done:
	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x29,x30,[sp],#48
	.inst	0xd50323bf			// autiasp
	ret
.size	SHA3_squeeze,.-SHA3_squeeze
// KeccakF1600_ce: Keccak-f[1600] using the SHA-3 vector instructions
// (EOR3, RAX1, XAR, BCAX), emitted as raw .inst words; the comment next to
// each word is its disassembly.
//
// In/out:  lanes A[0..24] in v0-v24.
// Scratch: v25-v31, x9 (loop counter), x10 (iotas cursor), x11 (round
//          constant), flags.  v8-v15 are overwritten, so callers that must
//          preserve them save them first.
// Leaf routine: no stack use, x30 untouched.
//
// The loop body holds two rounds and runs 12 times.  The first round
// leaves part of its result in v25-v31 instead of the home registers
// (lines marked "*"); the second round reads them from there and puts
// every lane back in v0-v24.
.type	KeccakF1600_ce,%function
.p2align	5
KeccakF1600_ce:
	mov	x9,#12				// 12 passes x 2 rounds
	adr	x10,iotas
	b	.Lce_round_pair
.p2align	4
.Lce_round_pair:
	////////////////////////////////////////////////// Theta (first round)
	.inst	0xce052819	//eor3 v25.16b,v0.16b,v5.16b,v10.16b
	.inst	0xce062c3a	//eor3 v26.16b,v1.16b,v6.16b,v11.16b
	.inst	0xce07305b	//eor3 v27.16b,v2.16b,v7.16b,v12.16b
	.inst	0xce08347c	//eor3 v28.16b,v3.16b,v8.16b,v13.16b
	.inst	0xce09389d	//eor3 v29.16b,v4.16b,v9.16b,v14.16b
	.inst	0xce0f5339	//eor3 v25.16b,v25.16b, v15.16b,v20.16b
	.inst	0xce10575a	//eor3 v26.16b,v26.16b, v16.16b,v21.16b
	.inst	0xce115b7b	//eor3 v27.16b,v27.16b, v17.16b,v22.16b
	.inst	0xce125f9c	//eor3 v28.16b,v28.16b, v18.16b,v23.16b
	.inst	0xce1363bd	//eor3 v29.16b,v29.16b, v19.16b,v24.16b

	.inst	0xce7b8f3e	//rax1 v30.16b,v25.16b,v27.16b // D[1]
	.inst	0xce7c8f5f	//rax1 v31.16b,v26.16b,v28.16b // D[2]
	.inst	0xce7d8f7b	//rax1 v27.16b,v27.16b,v29.16b // D[3]
	.inst	0xce798f9c	//rax1 v28.16b,v28.16b,v25.16b // D[4]
	.inst	0xce7a8fbd	//rax1 v29.16b,v29.16b,v26.16b // D[0]

	////////////////////////////////////////////////// Theta+Rho+Pi
	.inst	0xce9e50d9	//xar v25.16b, v6.16b,v30.16b,#64-44 // C[0]=A[0][1]
	.inst	0xce9cb126	//xar v6.16b,v9.16b,v28.16b,#64-20
	.inst	0xce9f0ec9	//xar v9.16b,v22.16b,v31.16b,#64-61
	.inst	0xce9c65d6	//xar v22.16b,v14.16b,v28.16b,#64-39
	.inst	0xce9dba8e	//xar v14.16b,v20.16b,v29.16b,#64-18

	.inst	0xce9f0854	//xar v20.16b,v2.16b,v31.16b,#64-62

	.inst	0xce9f5582	//xar v2.16b,v12.16b,v31.16b,#64-43
	.inst	0xce9b9dac	//xar v12.16b,v13.16b,v27.16b,#64-25
	.inst	0xce9ce26d	//xar v13.16b,v19.16b,v28.16b,#64-8
	.inst	0xce9b22f3	//xar v19.16b,v23.16b,v27.16b,#64-56
	.inst	0xce9d5df7	//xar v23.16b,v15.16b,v29.16b,#64-41

	.inst	0xce9c948f	//xar v15.16b,v4.16b,v28.16b,#64-27

	eor	v0.16b,v0.16b,v29.16b		// lane 0 is not rotated, only ^= D[0]
	ldr	x11,[x10],#8			// round constant for this round

	.inst	0xce9bae5a	//xar v26.16b, v18.16b,v27.16b,#64-21 // C[1]=A[0][3]
	.inst	0xce9fc632	//xar v18.16b,v17.16b,v31.16b,#64-15
	.inst	0xce9ed971	//xar v17.16b,v11.16b,v30.16b,#64-10
	.inst	0xce9fe8eb	//xar v11.16b,v7.16b,v31.16b,#64-6
	.inst	0xce9df547	//xar v7.16b,v10.16b,v29.16b,#64-3

	.inst	0xce9efc2a	//xar v10.16b,v1.16b,v30.16b,#64-1 // *

	.inst	0xce9ccb04	//xar v4.16b,v24.16b,v28.16b,#64-14
	.inst	0xce9efab8	//xar v24.16b,v21.16b,v30.16b,#64-2
	.inst	0xce9b2515	//xar v21.16b,v8.16b,v27.16b,#64-55
	.inst	0xce9e4e08	//xar v8.16b,v16.16b,v30.16b,#64-45
	.inst	0xce9d70b0	//xar v16.16b,v5.16b,v29.16b,#64-36

	.inst	0xce9b907b	//xar v27.16b, v3.16b,v27.16b,#64-28 // C[2]=A[1][0]

	////////////////////////////////////////////////// Chi+Iota
	dup	v31.2d,x11			// borrow C[6]
	.inst	0xce22641c	//bcax v28.16b, v0.16b,v2.16b,v25.16b // *
	.inst	0xce3a0b21	//bcax v1.16b,v25.16b, v26.16b, v2.16b // *
	.inst	0xce246842	//bcax v2.16b,v2.16b,v4.16b,v26.16b
	.inst	0xce201343	//bcax v3.16b,v26.16b, v0.16b,v4.16b
	.inst	0xce390084	//bcax v4.16b,v4.16b,v25.16b, v0.16b

	.inst	0xce271b65	//bcax v5.16b,v27.16b, v7.16b,v6.16b // *
	.inst	0xce281cd9	//bcax v25.16b, v6.16b,v8.16b,v7.16b // *
	.inst	0xce2920e7	//bcax v7.16b,v7.16b,v9.16b,v8.16b
	.inst	0xce3b2508	//bcax v8.16b,v8.16b,v27.16b, v9.16b
	.inst	0xce266d29	//bcax v9.16b,v9.16b,v6.16b,v27.16b

	eor	v0.16b,v28.16b,v31.16b		// Iota

	.inst	0xce2c2d5a	//bcax v26.16b, v10.16b,v12.16b,v11.16b // *
	.inst	0xce2d317b	//bcax v27.16b, v11.16b,v13.16b,v12.16b // *
	.inst	0xce2e358c	//bcax v12.16b,v12.16b,v14.16b,v13.16b
	.inst	0xce2a39ad	//bcax v13.16b,v13.16b,v10.16b,v14.16b
	.inst	0xce2b29ce	//bcax v14.16b,v14.16b,v11.16b,v10.16b

	.inst	0xce3141fc	//bcax v28.16b, v15.16b,v17.16b,v16.16b // *
	.inst	0xce32461d	//bcax v29.16b, v16.16b,v18.16b,v17.16b // *
	.inst	0xce334a31	//bcax v17.16b,v17.16b,v19.16b,v18.16b
	.inst	0xce2f4e52	//bcax v18.16b,v18.16b,v15.16b,v19.16b
	.inst	0xce303e73	//bcax v19.16b,v19.16b,v16.16b,v15.16b

	.inst	0xce36569e	//bcax v30.16b, v20.16b,v22.16b,v21.16b // *
	.inst	0xce375abf	//bcax v31.16b, v21.16b,v23.16b,v22.16b // *
	.inst	0xce385ed6	//bcax v22.16b,v22.16b,v24.16b,v23.16b
	.inst	0xce3462f7	//bcax v23.16b,v23.16b,v20.16b,v24.16b
	.inst	0xce355318	//bcax v24.16b,v24.16b,v21.16b,v20.16b
	////////////////////////////////////////////////// Theta (second round)
	.inst	0xce056806	//eor3 v6.16b,v0.16b,v5.16b,v26.16b
	.inst	0xce196c2a	//eor3 v10.16b,v1.16b,v25.16b,v27.16b
	.inst	0xce07304b	//eor3 v11.16b,v2.16b,v7.16b,v12.16b
	.inst	0xce08346f	//eor3 v15.16b,v3.16b,v8.16b,v13.16b
	.inst	0xce093890	//eor3 v16.16b,v4.16b,v9.16b,v14.16b
	.inst	0xce1c78c6	//eor3 v6.16b,v6.16b, v28.16b,v30.16b
	.inst	0xce1d7d4a	//eor3 v10.16b,v10.16b, v29.16b,v31.16b
	.inst	0xce11596b	//eor3 v11.16b,v11.16b, v17.16b,v22.16b
	.inst	0xce125def	//eor3 v15.16b,v15.16b, v18.16b,v23.16b
	.inst	0xce136210	//eor3 v16.16b,v16.16b, v19.16b,v24.16b

	.inst	0xce6b8cd4	//rax1 v20.16b,v6.16b,v11.16b // D[1]
	.inst	0xce6f8d55	//rax1 v21.16b,v10.16b,v15.16b // D[2]
	.inst	0xce708d6b	//rax1 v11.16b,v11.16b,v16.16b // D[3]
	.inst	0xce668def	//rax1 v15.16b,v15.16b,v6.16b // D[4]
	.inst	0xce6a8e10	//rax1 v16.16b,v16.16b,v10.16b // D[0]

	////////////////////////////////////////////////// Theta+Rho+Pi
	.inst	0xce945326	//xar v6.16b, v25.16b,v20.16b,#64-44 // C[0]=A[0][1]
	.inst	0xce8fb139	//xar v25.16b,v9.16b,v15.16b,#64-20
	.inst	0xce950ec9	//xar v9.16b,v22.16b,v21.16b,#64-61
	.inst	0xce8f65d6	//xar v22.16b,v14.16b,v15.16b,#64-39
	.inst	0xce90bbce	//xar v14.16b,v30.16b,v16.16b,#64-18

	.inst	0xce95085e	//xar v30.16b,v2.16b,v21.16b,#64-62

	.inst	0xce955582	//xar v2.16b,v12.16b,v21.16b,#64-43
	.inst	0xce8b9dac	//xar v12.16b,v13.16b,v11.16b,#64-25
	.inst	0xce8fe26d	//xar v13.16b,v19.16b,v15.16b,#64-8
	.inst	0xce8b22f3	//xar v19.16b,v23.16b,v11.16b,#64-56
	.inst	0xce905f97	//xar v23.16b,v28.16b,v16.16b,#64-41

	.inst	0xce8f949c	//xar v28.16b,v4.16b,v15.16b,#64-27

	eor	v0.16b,v0.16b,v16.16b		// lane 0 is not rotated, only ^= D[0]
	ldr	x11,[x10],#8			// round constant for this round

	.inst	0xce8bae4a	//xar v10.16b, v18.16b,v11.16b,#64-21 // C[1]=A[0][3]
	.inst	0xce95c632	//xar v18.16b,v17.16b,v21.16b,#64-15
	.inst	0xce94db71	//xar v17.16b,v27.16b,v20.16b,#64-10
	.inst	0xce95e8fb	//xar v27.16b,v7.16b,v21.16b,#64-6
	.inst	0xce90f747	//xar v7.16b,v26.16b,v16.16b,#64-3

	.inst	0xce94fc3a	//xar v26.16b,v1.16b,v20.16b,#64-1 // *

	.inst	0xce8fcb04	//xar v4.16b,v24.16b,v15.16b,#64-14
	.inst	0xce94fbf8	//xar v24.16b,v31.16b,v20.16b,#64-2
	.inst	0xce8b251f	//xar v31.16b,v8.16b,v11.16b,#64-55
	.inst	0xce944fa8	//xar v8.16b,v29.16b,v20.16b,#64-45
	.inst	0xce9070bd	//xar v29.16b,v5.16b,v16.16b,#64-36

	.inst	0xce8b906b	//xar v11.16b, v3.16b,v11.16b,#64-28 // C[2]=A[1][0]

	////////////////////////////////////////////////// Chi+Iota
	dup	v21.2d,x11			// borrow C[6]
	.inst	0xce22180f	//bcax v15.16b, v0.16b,v2.16b,v6.16b // *
	.inst	0xce2a08c1	//bcax v1.16b,v6.16b, v10.16b, v2.16b // *
	.inst	0xce242842	//bcax v2.16b,v2.16b,v4.16b,v10.16b
	.inst	0xce201143	//bcax v3.16b,v10.16b, v0.16b,v4.16b
	.inst	0xce260084	//bcax v4.16b,v4.16b,v6.16b, v0.16b

	.inst	0xce276565	//bcax v5.16b,v11.16b, v7.16b,v25.16b // *
	.inst	0xce281f26	//bcax v6.16b, v25.16b,v8.16b,v7.16b // *
	.inst	0xce2920e7	//bcax v7.16b,v7.16b,v9.16b,v8.16b
	.inst	0xce2b2508	//bcax v8.16b,v8.16b,v11.16b, v9.16b
	.inst	0xce392d29	//bcax v9.16b,v9.16b,v25.16b,v11.16b

	eor	v0.16b,v15.16b,v21.16b		// Iota

	.inst	0xce2c6f4a	//bcax v10.16b, v26.16b,v12.16b,v27.16b // *
	.inst	0xce2d336b	//bcax v11.16b, v27.16b,v13.16b,v12.16b // *
	.inst	0xce2e358c	//bcax v12.16b,v12.16b,v14.16b,v13.16b
	.inst	0xce3a39ad	//bcax v13.16b,v13.16b,v26.16b,v14.16b
	.inst	0xce3b69ce	//bcax v14.16b,v14.16b,v27.16b,v26.16b

	.inst	0xce31778f	//bcax v15.16b, v28.16b,v17.16b,v29.16b // *
	.inst	0xce3247b0	//bcax v16.16b, v29.16b,v18.16b,v17.16b // *
	.inst	0xce334a31	//bcax v17.16b,v17.16b,v19.16b,v18.16b
	.inst	0xce3c4e52	//bcax v18.16b,v18.16b,v28.16b,v19.16b
	.inst	0xce3d7273	//bcax v19.16b,v19.16b,v29.16b,v28.16b

	.inst	0xce367fd4	//bcax v20.16b, v30.16b,v22.16b,v31.16b // *
	.inst	0xce375bf5	//bcax v21.16b, v31.16b,v23.16b,v22.16b // *
	.inst	0xce385ed6	//bcax v22.16b,v22.16b,v24.16b,v23.16b
	.inst	0xce3e62f7	//bcax v23.16b,v23.16b,v30.16b,v24.16b
	.inst	0xce3f7b18	//bcax v24.16b,v24.16b,v31.16b,v30.16b
	subs	x9,x9,#1
	bne	.Lce_round_pair

	ret
.size	KeccakF1600_ce,.-KeccakF1600_ce
760
// KeccakF1600_cext: permute a 25-lane state in memory with the
// vector-instruction core KeccakF1600_ce.
//
// In:   x0 = pointer to 25 consecutive 64-bit lanes
// Out:  the state is overwritten with the permuted state; x0 is unchanged
//       (neither this routine nor KeccakF1600_ce writes it).
// Saves d8-d15, which KeccakF1600_ce uses as state lanes, plus x29/x30.
.type	KeccakF1600_cext,%function
.p2align	5
KeccakF1600_cext:
	hint	#25				// paciasp
	stp	x29,x30,[sp,#-80]!
	mov	x29,sp
	stp	d8,d9,[sp,#16]			// callee-saved per ABI
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]
	ldp	d0,d1,[x0,#8*0]			// lane i -> low half of v<i>
	ldp	d2,d3,[x0,#8*2]
	ldp	d4,d5,[x0,#8*4]
	ldp	d6,d7,[x0,#8*6]
	ldp	d8,d9,[x0,#8*8]
	ldp	d10,d11,[x0,#8*10]
	ldp	d12,d13,[x0,#8*12]
	ldp	d14,d15,[x0,#8*14]
	ldp	d16,d17,[x0,#8*16]
	ldp	d18,d19,[x0,#8*18]
	ldp	d20,d21,[x0,#8*20]
	ldp	d22,d23,[x0,#8*22]
	ldr	d24,[x0,#8*24]
	bl	KeccakF1600_ce
	ldr	x30,[sp,#8]			// the bl above replaced our return address
	stp	d0,d1,[x0,#8*0]
	stp	d2,d3,[x0,#8*2]
	stp	d4,d5,[x0,#8*4]
	stp	d6,d7,[x0,#8*6]
	stp	d8,d9,[x0,#8*8]
	stp	d10,d11,[x0,#8*10]
	stp	d12,d13,[x0,#8*12]
	stp	d14,d15,[x0,#8*14]
	stp	d16,d17,[x0,#8*16]
	stp	d18,d19,[x0,#8*18]
	stp	d20,d21,[x0,#8*20]
	stp	d22,d23,[x0,#8*22]
	str	d24,[x0,#8*24]

	ldp	d8,d9,[sp,#16]
	ldp	d10,d11,[sp,#32]
	ldp	d12,d13,[sp,#48]
	ldp	d14,d15,[sp,#64]
	ldr	x29,[sp],#80			// x30 was already reloaded above
	hint	#29				// autiasp
	ret
.size	KeccakF1600_cext,.-KeccakF1600_cext
// absorb_lane_ce: XOR the next 8 input bytes into one state lane.
//   lane - the v register that holds the lane (for example v0)
// x1 is the input cursor (post-incremented by 8) and v31 is scratch.
// The input is read as little-endian, hence the byte reversal on
// big-endian builds.  No instruction in here changes the condition flags.
.macro	absorb_lane_ce lane
	ldr	d31,[x1],#8			// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	\lane\().16b,\lane\().16b,v31.16b
.endm

// absorb_pair_ce: absorb lanes idx and idx+1, leaving for the permutation
// as soon as the block size (x3, in bytes) is exhausted.  One cmp serves
// both exits because absorb_lane_ce preserves the flags.
.macro	absorb_pair_ce idx, even, odd
	absorb_lane_ce \even
	cmp	x3,#8*(\idx+2)
	blo	.Labsorb_ce_permute		// bsz ended with lane idx
	absorb_lane_ce \odd
	beq	.Labsorb_ce_permute		// bsz ended with lane idx+1
.endm

// SHA3_absorb_cext: XOR whole input blocks into the state, permuting with
// KeccakF1600_ce after each one.
//
// In:   x0 = pointer to the 25-lane state
//       x1 = input
//       x2 = bytes available
//       x3 = block size in bytes
// Out:  x0 = number of trailing input bytes that did not fill a block
// The arguments stay in x0-x3 for the whole routine; KeccakF1600_ce only
// writes x9-x11 among the general-purpose registers.
.globl	SHA3_absorb_cext
.type	SHA3_absorb_cext,%function
.p2align	5
SHA3_absorb_cext:
	hint	#25				// paciasp
	stp	x29,x30,[sp,#-80]!
	mov	x29,sp
	stp	d8,d9,[sp,#16]			// callee-saved per ABI
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]
	ldp	d0,d1,[x0,#8*0]
	ldp	d2,d3,[x0,#8*2]
	ldp	d4,d5,[x0,#8*4]
	ldp	d6,d7,[x0,#8*6]
	ldp	d8,d9,[x0,#8*8]
	ldp	d10,d11,[x0,#8*10]
	ldp	d12,d13,[x0,#8*12]
	ldp	d14,d15,[x0,#8*14]
	ldp	d16,d17,[x0,#8*16]
	ldp	d18,d19,[x0,#8*18]
	ldp	d20,d21,[x0,#8*20]
	ldp	d22,d23,[x0,#8*22]
	ldr	d24,[x0,#8*24]
	b	.Labsorb_ce_next

.p2align	4
.Labsorb_ce_next:
	subs	x2,x2,x3			// len - bsz
	blo	.Labsorb_ce_done		// less than one block left
	absorb_pair_ce	0,v0,v1
	absorb_pair_ce	2,v2,v3
	absorb_pair_ce	4,v4,v5
	absorb_pair_ce	6,v6,v7
	absorb_pair_ce	8,v8,v9
	absorb_pair_ce	10,v10,v11
	absorb_pair_ce	12,v12,v13
	absorb_pair_ce	14,v14,v15
	absorb_pair_ce	16,v16,v17
	absorb_pair_ce	18,v18,v19
	absorb_pair_ce	20,v20,v21
	absorb_pair_ce	22,v22,v23
	absorb_lane_ce	v24

.Labsorb_ce_permute:

	bl	KeccakF1600_ce

	b	.Labsorb_ce_next

.p2align	4
.Labsorb_ce_done:
	stp	d0,d1,[x0,#8*0]
	stp	d2,d3,[x0,#8*2]
	stp	d4,d5,[x0,#8*4]
	stp	d6,d7,[x0,#8*6]
	stp	d8,d9,[x0,#8*8]
	stp	d10,d11,[x0,#8*10]
	stp	d12,d13,[x0,#8*12]
	stp	d14,d15,[x0,#8*14]
	stp	d16,d17,[x0,#8*16]
	stp	d18,d19,[x0,#8*18]
	stp	d20,d21,[x0,#8*20]
	stp	d22,d23,[x0,#8*22]
	str	d24,[x0,#8*24]
	add	x0,x2,x3			// undo the final subtraction: leftover bytes

	ldp	d8,d9,[sp,#16]
	ldp	d10,d11,[sp,#32]
	ldp	d12,d13,[sp,#48]
	ldp	d14,d15,[sp,#64]
	ldp	x29,x30,[sp],#80
	hint	#29				// autiasp
	ret
.size	SHA3_absorb_cext,.-SHA3_absorb_cext
// SHA3_squeeze_cext: copy output bytes out of the state, re-permuting the
// state with KeccakF1600_cext every time a full block has been emitted.
//
// In:   x0 = pointer to the 25-lane state
//       x1 = output buffer
//       x2 = number of bytes to produce
//       x3 = block size in bytes
// Out:  nothing is returned in a register.
// x9 walks the state and x10 counts down the current block; both are
// reloaded from x0/x3 after each permutation (KeccakF1600_cext leaves
// x0-x3 intact and only writes x9-x11 through KeccakF1600_ce).
// Lanes are written least-significant byte first.  For whole lanes that
// takes a byte swap on big-endian builds; the byte-wise tail needs none
// because it shifts the value itself.
//
// A request for zero bytes returns at once.  Without that guard the
// length test below would fall into the byte tail, whose countdown would
// wrap past zero and store 7 bytes the caller never asked for.
.globl	SHA3_squeeze_cext
.type	SHA3_squeeze_cext,%function
.align	5
SHA3_squeeze_cext:
	.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	mov	x9,x0				// read cursor into the state
	mov	x10,x3				// bytes left in the current block
	cbz	x2,.Lsqueeze_done_ce		// len == 0: write nothing

.Loop_squeeze_ce:
	ldr	x4,[x9],#8			// next lane of the state
	cmp	x2,#8
	blo	.Lsqueeze_tail_ce		// fewer than 8 bytes wanted
#ifdef	__AARCH64EB__
	rev	x4,x4
#endif
	str	x4,[x1],#8
	beq	.Lsqueeze_done_ce		// flags from the cmp: exactly 8 were left

	sub	x2,x2,#8
	subs	x10,x10,#8
	bhi	.Loop_squeeze_ce

	bl	KeccakF1600_cext		// block exhausted: permute the state
	ldr	x30,[sp,#8]			// the bl above replaced our return address
	mov	x9,x0				// restart at lane 0
	mov	x10,x3				// with a full block available
	b	.Loop_squeeze_ce

.align	4
.Lsqueeze_tail_ce:
	// 1..7 bytes remain: emit x4 one byte at a time, low byte first.
.rept	6
	strb	w4,[x1],#1
	lsr	x4,x4,#8
	subs	x2,x2,#1
	beq	.Lsqueeze_done_ce
.endr
	strb	w4,[x1],#1			// seventh and last possible byte

.Lsqueeze_done_ce:
	ldr	x29,[sp],#16			// x30 is live in the register already
	.inst	0xd50323bf			// autiasp
	ret
.size	SHA3_squeeze_cext,.-SHA3_squeeze_cext
// NUL-terminated identification string embedded in the object code,
// followed by realignment to a 4-byte boundary.
.asciz	"Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.align	2
1096