keccak1600-armv8.S revision 1.2.4.2 1 #include "arm_asm.h"
2 .text
3
// iotas: table of 24 64-bit round constants, read one per round by the
// Iota step of both KeccakF1600_int and KeccakF1600_ce.
//
// Layout: `.align 8` places the padding on a 256-byte boundary; the padding
// is 8 quads (64 bytes) and the table is 24 quads (192 bytes), so the address
// immediately after the last constant has its low 8 bits clear.
// KeccakF1600_int tests the post-incremented table pointer with
// `tst ...,#255` to detect the last round instead of keeping a counter.
4 .align 8 // 256-byte alignment plus the padding below allows using
5 // the table pointer's address value as loop termination condition
6 .quad 0,0,0,0,0,0,0,0
7 .type iotas,%object
8 iotas:
9 .quad 0x0000000000000001
10 .quad 0x0000000000008082
11 .quad 0x800000000000808a
12 .quad 0x8000000080008000
13 .quad 0x000000000000808b
14 .quad 0x0000000080000001
15 .quad 0x8000000080008081
16 .quad 0x8000000000008009
17 .quad 0x000000000000008a
18 .quad 0x0000000000000088
19 .quad 0x0000000080008009
20 .quad 0x000000008000000a
21 .quad 0x000000008000808b
22 .quad 0x800000000000008b
23 .quad 0x8000000000008089
24 .quad 0x8000000000008003
25 .quad 0x8000000000008002
26 .quad 0x8000000000000080
27 .quad 0x000000000000800a
28 .quad 0x800000008000000a
29 .quad 0x8000000080008081
30 .quad 0x8000000000008080
31 .quad 0x0000000080000001
32 .quad 0x8000000080008008
33 .size iotas,.-iotas
// KeccakF1600_int: internal round loop of the Keccak-1600 permutation with the
// whole state kept in general-purpose registers.
//
// In/Out: the 25 state lanes live in x0-x17, x25, x19-x24, in that order
//         (x25 sits where x18 would be; x18 is never touched here).
// Stack:  the caller must provide 32 bytes at [sp,#0..#31]:
//           [sp,#0]  temporary home for x4/x9 during Theta
//           [sp,#16] current pointer into iotas
//           [sp,#24] saved x30 (x30 is reused as a scratch register)
// Clobbers: x26, x27, x28, flags; x30 is reloaded from [sp,#24] before ret.
// Termination: each round post-increments the iotas pointer by 8; the loop
//         exits when the pointer's low 8 bits become zero, which by the
//         table's alignment and padding happens after the last constant.
34 .type KeccakF1600_int,%function
35 .align 5
36 KeccakF1600_int:
37 adr x28,iotas
38 stp x28,x30,[sp,#16] // 32 bytes on top are mine: iotas pointer and return address
39 b .Loop
40 .align 4
41 .Loop:
42 ////////////////////////////////////////// Theta
43 eor x26,x0,x5
44 stp x4,x9,[sp,#0] // offload pair: x4 and x9 are reused as temporaries below
45 eor x27,x1,x6
46 eor x28,x2,x7
47 eor x30,x3,x8
48 eor x4,x4,x9
49 eor x26,x26,x10
50 eor x27,x27,x11
51 eor x28,x28,x12
52 eor x30,x30,x13
53 eor x4,x4,x14
54 eor x26,x26,x15
55 eor x27,x27,x16
56 eor x28,x28,x17
57 eor x30,x30,x25
58 eor x4,x4,x19
59 eor x26,x26,x20
60 eor x28,x28,x22
61 eor x27,x27,x21
62 eor x30,x30,x23
63 eor x4,x4,x24
64
65 eor x9,x26,x28,ror#63
66
67 eor x1,x1,x9
68 eor x6,x6,x9
69 eor x11,x11,x9
70 eor x16,x16,x9
71 eor x21,x21,x9
72
73 eor x9,x27,x30,ror#63
74 eor x28,x28,x4,ror#63
75 eor x30,x30,x26,ror#63
76 eor x4,x4,x27,ror#63
77
78 eor x27, x2,x9 // result kept in x27 (replaces a separate mov x27,x2)
79 eor x7,x7,x9
80 eor x12,x12,x9
81 eor x17,x17,x9
82 eor x22,x22,x9
83
84 eor x0,x0,x4
85 eor x5,x5,x4
86 eor x10,x10,x4
87 eor x15,x15,x4
88 eor x20,x20,x4
89 ldp x4,x9,[sp,#0] // re-load offloaded data
90 eor x26, x3,x28 // result kept in x26 (replaces a separate mov x26,x3)
91 eor x8,x8,x28
92 eor x13,x13,x28
93 eor x25,x25,x28
94 eor x23,x23,x28
95
96 eor x28, x4,x30 // result kept in x28 (replaces a separate mov x28,x4)
97 eor x9,x9,x30
98 eor x14,x14,x30
99 eor x19,x19,x30
100 eor x24,x24,x30
101
102 ////////////////////////////////////////// Rho+Pi
103 mov x30,x1
104 ror x1,x6,#64-44
105 //mov x27,x2
106 ror x2,x12,#64-43
107 //mov x26,x3
108 ror x3,x25,#64-21
109 //mov x28,x4
110 ror x4,x24,#64-14
111
112 ror x6,x9,#64-20
113 ror x12,x13,#64-25
114 ror x25,x17,#64-15
115 ror x24,x21,#64-2
116
117 ror x9,x22,#64-61
118 ror x13,x19,#64-8
119 ror x17,x11,#64-10
120 ror x21,x8,#64-55
121
122 ror x22,x14,#64-39
123 ror x19,x23,#64-56
124 ror x11,x7,#64-6
125 ror x8,x16,#64-45
126
127 ror x14,x20,#64-18
128 ror x23,x15,#64-41
129 ror x7,x10,#64-3
130 ror x16,x5,#64-36
131
132 ror x5,x26,#64-28
133 ror x10,x30,#64-1
134 ror x15,x28,#64-27
135 ror x20,x27,#64-62
136
137 ////////////////////////////////////////// Chi+Iota
138 bic x26,x2,x1
139 bic x27,x3,x2
140 bic x28,x0,x4
141 bic x30,x1,x0
142 eor x0,x0,x26
143 bic x26,x4,x3
144 eor x1,x1,x27
145 ldr x27,[sp,#16] // fetch current iotas pointer
146 eor x3,x3,x28
147 eor x4,x4,x30
148 eor x2,x2,x26
149 ldr x30,[x27],#8 // Iota[i++]
150
151 bic x26,x7,x6
152 tst x27,#255 // are we done? flags stay live until the bne at the loop end
153 str x27,[sp,#16]
154 bic x27,x8,x7
155 bic x28,x5,x9
156 eor x0,x0,x30 // A[0][0] ^= Iota
157 bic x30,x6,x5
158 eor x5,x5,x26
159 bic x26,x9,x8
160 eor x6,x6,x27
161 eor x8,x8,x28
162 eor x9,x9,x30
163 eor x7,x7,x26
164
165 bic x26,x12,x11
166 bic x27,x13,x12
167 bic x28,x10,x14
168 bic x30,x11,x10
169 eor x10,x10,x26
170 bic x26,x14,x13
171 eor x11,x11,x27
172 eor x13,x13,x28
173 eor x14,x14,x30
174 eor x12,x12,x26
175
176 bic x26,x17,x16
177 bic x27,x25,x17
178 bic x28,x15,x19
179 bic x30,x16,x15
180 eor x15,x15,x26
181 bic x26,x19,x25
182 eor x16,x16,x27
183 eor x25,x25,x28
184 eor x19,x19,x30
185 eor x17,x17,x26
186
187 bic x26,x22,x21
188 bic x27,x23,x22
189 bic x28,x20,x24
190 bic x30,x21,x20
191 eor x20,x20,x26
192 bic x26,x24,x23
193 eor x21,x21,x27
194 eor x23,x23,x28
195 eor x24,x24,x30
196 eor x22,x22,x26
197
198 bne .Loop
199
200 ldr x30,[sp,#24] // restore return address saved at entry
201 ret
202 .size KeccakF1600_int,.-KeccakF1600_int
203
// KeccakF1600: file-local wrapper (no .globl) that applies the permutation
// to a state held in memory.
//
// In:  x0 = pointer to 25 consecutive 64-bit lanes (read and written in place).
// Frame: 128 bytes holding x29/x30 and x19-x28, plus 48 bytes below it:
//        [sp,#0..#31] scratch area required by KeccakF1600_int,
//        [sp,#32]     saved state pointer.
// Clobbers: x0-x17 (they carry the state lanes); x19-x28 are restored.
204 .type KeccakF1600,%function
205 .align 5
206 KeccakF1600:
207 stp x29,x30,[sp,#-128]!
208 add x29,sp,#0
209 stp x19,x20,[sp,#16]
210 stp x21,x22,[sp,#32]
211 stp x23,x24,[sp,#48]
212 stp x25,x26,[sp,#64]
213 stp x27,x28,[sp,#80]
214 sub sp,sp,#48
215
216 str x0,[sp,#32] // offload argument
217 mov x26,x0 // keep the base in x26 because x0 is overwritten by the first load
218 ldp x0,x1,[x0,#16*0]
219 ldp x2,x3,[x26,#16*1]
220 ldp x4,x5,[x26,#16*2]
221 ldp x6,x7,[x26,#16*3]
222 ldp x8,x9,[x26,#16*4]
223 ldp x10,x11,[x26,#16*5]
224 ldp x12,x13,[x26,#16*6]
225 ldp x14,x15,[x26,#16*7]
226 ldp x16,x17,[x26,#16*8]
227 ldp x25,x19,[x26,#16*9] // lane 18 goes to x25, not x18
228 ldp x20,x21,[x26,#16*10]
229 ldp x22,x23,[x26,#16*11]
230 ldr x24,[x26,#16*12]
231
232 bl KeccakF1600_int
233
234 ldr x26,[sp,#32] // recover the state pointer
235 stp x0,x1,[x26,#16*0]
236 stp x2,x3,[x26,#16*1]
237 stp x4,x5,[x26,#16*2]
238 stp x6,x7,[x26,#16*3]
239 stp x8,x9,[x26,#16*4]
240 stp x10,x11,[x26,#16*5]
241 stp x12,x13,[x26,#16*6]
242 stp x14,x15,[x26,#16*7]
243 stp x16,x17,[x26,#16*8]
244 stp x25,x19,[x26,#16*9]
245 stp x20,x21,[x26,#16*10]
246 stp x22,x23,[x26,#16*11]
247 str x24,[x26,#16*12]
248
249 ldp x19,x20,[x29,#16] // restores are addressed via x29, so they are independent of the sp adjustment
250 add sp,sp,#48
251 ldp x21,x22,[x29,#32]
252 ldp x23,x24,[x29,#48]
253 ldp x25,x26,[x29,#64]
254 ldp x27,x28,[x29,#80]
255 ldp x29,x30,[sp],#128
256 ret
257 .size KeccakF1600,.-KeccakF1600
258
// SHA3_absorb: XOR whole input blocks into the state and permute after each.
//
// In:  x0 = uint64_t A[5][5] (state), x1 = const void *inp,
//      x2 = size_t len, x3 = size_t bsz (block size in bytes).
// Out: x0 = number of input bytes left unprocessed (the first len < bsz).
// Frame: 128 bytes for x29/x30 and x19-x28, plus 64 bytes below it:
//        [sp,#0..#31] scratch for KeccakF1600_int,
//        [sp,#32] A, [sp,#40] inp, [sp,#48] len, [sp,#56] bsz.
// Per block: input is consumed as 64-bit words (byte-reversed when
//      __AARCH64EB__ is defined) and XORed into successive lanes. After each
//      even-numbered lane bsz is compared with 8*(i+2): `blo` stops when the
//      block ended with that lane; the `beq` after the following lane reuses
//      the same flags (ldr/rev/eor do not alter them) and stops when the
//      block ended there.
// NOTE(review): the ladder only terminates correctly when bsz is a multiple
//      of 8 and at most 200 -- confirm against callers.
259 .globl SHA3_absorb
260 .type SHA3_absorb,%function
261 .align 5
262 SHA3_absorb:
263 stp x29,x30,[sp,#-128]!
264 add x29,sp,#0
265 stp x19,x20,[sp,#16]
266 stp x21,x22,[sp,#32]
267 stp x23,x24,[sp,#48]
268 stp x25,x26,[sp,#64]
269 stp x27,x28,[sp,#80]
270 sub sp,sp,#64
271
272 stp x0,x1,[sp,#32] // offload arguments
273 stp x2,x3,[sp,#48]
274
275 mov x26,x0 // uint64_t A[5][5]
276 mov x27,x1 // const void *inp
277 mov x28,x2 // size_t len
278 mov x30,x3 // size_t bsz
279 ldp x0,x1,[x26,#16*0]
280 ldp x2,x3,[x26,#16*1]
281 ldp x4,x5,[x26,#16*2]
282 ldp x6,x7,[x26,#16*3]
283 ldp x8,x9,[x26,#16*4]
284 ldp x10,x11,[x26,#16*5]
285 ldp x12,x13,[x26,#16*6]
286 ldp x14,x15,[x26,#16*7]
287 ldp x16,x17,[x26,#16*8]
288 ldp x25,x19,[x26,#16*9]
289 ldp x20,x21,[x26,#16*10]
290 ldp x22,x23,[x26,#16*11]
291 ldr x24,[x26,#16*12]
292 b .Loop_absorb
293
294 .align 4
295 .Loop_absorb:
296 subs x26,x28,x30 // len - bsz
297 blo .Labsorbed // fewer than bsz bytes left: stop
298
299 str x26,[sp,#48] // save len - bsz
300 ldr x26,[x27],#8 // *inp++
301 #ifdef __AARCH64EB__
302 rev x26,x26
303 #endif
304 eor x0,x0,x26
305 cmp x30,#8*(0+2)
306 blo .Lprocess_block
307 ldr x26,[x27],#8 // *inp++
308 #ifdef __AARCH64EB__
309 rev x26,x26
310 #endif
311 eor x1,x1,x26
312 beq .Lprocess_block
313 ldr x26,[x27],#8 // *inp++
314 #ifdef __AARCH64EB__
315 rev x26,x26
316 #endif
317 eor x2,x2,x26
318 cmp x30,#8*(2+2)
319 blo .Lprocess_block
320 ldr x26,[x27],#8 // *inp++
321 #ifdef __AARCH64EB__
322 rev x26,x26
323 #endif
324 eor x3,x3,x26
325 beq .Lprocess_block
326 ldr x26,[x27],#8 // *inp++
327 #ifdef __AARCH64EB__
328 rev x26,x26
329 #endif
330 eor x4,x4,x26
331 cmp x30,#8*(4+2)
332 blo .Lprocess_block
333 ldr x26,[x27],#8 // *inp++
334 #ifdef __AARCH64EB__
335 rev x26,x26
336 #endif
337 eor x5,x5,x26
338 beq .Lprocess_block
339 ldr x26,[x27],#8 // *inp++
340 #ifdef __AARCH64EB__
341 rev x26,x26
342 #endif
343 eor x6,x6,x26
344 cmp x30,#8*(6+2)
345 blo .Lprocess_block
346 ldr x26,[x27],#8 // *inp++
347 #ifdef __AARCH64EB__
348 rev x26,x26
349 #endif
350 eor x7,x7,x26
351 beq .Lprocess_block
352 ldr x26,[x27],#8 // *inp++
353 #ifdef __AARCH64EB__
354 rev x26,x26
355 #endif
356 eor x8,x8,x26
357 cmp x30,#8*(8+2)
358 blo .Lprocess_block
359 ldr x26,[x27],#8 // *inp++
360 #ifdef __AARCH64EB__
361 rev x26,x26
362 #endif
363 eor x9,x9,x26
364 beq .Lprocess_block
365 ldr x26,[x27],#8 // *inp++
366 #ifdef __AARCH64EB__
367 rev x26,x26
368 #endif
369 eor x10,x10,x26
370 cmp x30,#8*(10+2)
371 blo .Lprocess_block
372 ldr x26,[x27],#8 // *inp++
373 #ifdef __AARCH64EB__
374 rev x26,x26
375 #endif
376 eor x11,x11,x26
377 beq .Lprocess_block
378 ldr x26,[x27],#8 // *inp++
379 #ifdef __AARCH64EB__
380 rev x26,x26
381 #endif
382 eor x12,x12,x26
383 cmp x30,#8*(12+2)
384 blo .Lprocess_block
385 ldr x26,[x27],#8 // *inp++
386 #ifdef __AARCH64EB__
387 rev x26,x26
388 #endif
389 eor x13,x13,x26
390 beq .Lprocess_block
391 ldr x26,[x27],#8 // *inp++
392 #ifdef __AARCH64EB__
393 rev x26,x26
394 #endif
395 eor x14,x14,x26
396 cmp x30,#8*(14+2)
397 blo .Lprocess_block
398 ldr x26,[x27],#8 // *inp++
399 #ifdef __AARCH64EB__
400 rev x26,x26
401 #endif
402 eor x15,x15,x26
403 beq .Lprocess_block
404 ldr x26,[x27],#8 // *inp++
405 #ifdef __AARCH64EB__
406 rev x26,x26
407 #endif
408 eor x16,x16,x26
409 cmp x30,#8*(16+2)
410 blo .Lprocess_block
411 ldr x26,[x27],#8 // *inp++
412 #ifdef __AARCH64EB__
413 rev x26,x26
414 #endif
415 eor x17,x17,x26
416 beq .Lprocess_block
417 ldr x26,[x27],#8 // *inp++
418 #ifdef __AARCH64EB__
419 rev x26,x26
420 #endif
421 eor x25,x25,x26
422 cmp x30,#8*(18+2)
423 blo .Lprocess_block
424 ldr x26,[x27],#8 // *inp++
425 #ifdef __AARCH64EB__
426 rev x26,x26
427 #endif
428 eor x19,x19,x26
429 beq .Lprocess_block
430 ldr x26,[x27],#8 // *inp++
431 #ifdef __AARCH64EB__
432 rev x26,x26
433 #endif
434 eor x20,x20,x26
435 cmp x30,#8*(20+2)
436 blo .Lprocess_block
437 ldr x26,[x27],#8 // *inp++
438 #ifdef __AARCH64EB__
439 rev x26,x26
440 #endif
441 eor x21,x21,x26
442 beq .Lprocess_block
443 ldr x26,[x27],#8 // *inp++
444 #ifdef __AARCH64EB__
445 rev x26,x26
446 #endif
447 eor x22,x22,x26
448 cmp x30,#8*(22+2)
449 blo .Lprocess_block
450 ldr x26,[x27],#8 // *inp++
451 #ifdef __AARCH64EB__
452 rev x26,x26
453 #endif
454 eor x23,x23,x26
455 beq .Lprocess_block
456 ldr x26,[x27],#8 // *inp++
457 #ifdef __AARCH64EB__
458 rev x26,x26
459 #endif
460 eor x24,x24,x26
461
462 .Lprocess_block:
463 str x27,[sp,#40] // save inp
464
465 bl KeccakF1600_int // clobbers x26-x28 and x30, hence the reloads below
466
467 ldr x27,[sp,#40] // restore arguments
468 ldp x28,x30,[sp,#48] // x28 = len - bsz saved above, x30 = bsz
469 b .Loop_absorb
470
471 .align 4
472 .Labsorbed:
473 ldr x27,[sp,#32] // state pointer
474 stp x0,x1,[x27,#16*0]
475 stp x2,x3,[x27,#16*1]
476 stp x4,x5,[x27,#16*2]
477 stp x6,x7,[x27,#16*3]
478 stp x8,x9,[x27,#16*4]
479 stp x10,x11,[x27,#16*5]
480 stp x12,x13,[x27,#16*6]
481 stp x14,x15,[x27,#16*7]
482 stp x16,x17,[x27,#16*8]
483 stp x25,x19,[x27,#16*9]
484 stp x20,x21,[x27,#16*10]
485 stp x22,x23,[x27,#16*11]
486 str x24,[x27,#16*12]
487
488 mov x0,x28 // return value: bytes left over
489 ldp x19,x20,[x29,#16]
490 add sp,sp,#64
491 ldp x21,x22,[x29,#32]
492 ldp x23,x24,[x29,#48]
493 ldp x25,x26,[x29,#64]
494 ldp x27,x28,[x29,#80]
495 ldp x29,x30,[sp],#128
496 ret
497 .size SHA3_absorb,.-SHA3_absorb
// SHA3_squeeze: copy output bytes out of the state, re-permuting the state
// each time a full block has been emitted.
//
// In:  x0 = pointer to the state lanes, x1 = output pointer,
//      x2 = number of output bytes, x3 = block size in bytes.
// The arguments are parked in x19-x22 because KeccakF1600 overwrites x0-x17.
// Full 64-bit words are stored while at least 8 bytes remain (byte-reversed
// when __AARCH64EB__ is defined); a final partial word is written one byte at
// a time, least-significant byte first.
// NOTE(review): len == 0 is not special-cased -- the first `cmp x21,#8` takes
//      the tail path, which stores before testing the count. Callers
//      presumably guarantee len > 0; confirm.
498 .globl SHA3_squeeze
499 .type SHA3_squeeze,%function
500 .align 5
501 SHA3_squeeze:
502 stp x29,x30,[sp,#-48]!
503 add x29,sp,#0
504 stp x19,x20,[sp,#16]
505 stp x21,x22,[sp,#32]
506
507 mov x19,x0 // put aside arguments
508 mov x20,x1
509 mov x21,x2
510 mov x22,x3
511
512 .Loop_squeeze:
513 ldr x4,[x0],#8 // next state lane
514 cmp x21,#8
515 blo .Lsqueeze_tail // fewer than 8 bytes wanted: emit byte by byte
516 #ifdef __AARCH64EB__
517 rev x4,x4
518 #endif
519 str x4,[x20],#8
520 subs x21,x21,#8
521 beq .Lsqueeze_done
522
523 subs x3,x3,#8 // bytes left in the current block
524 bhi .Loop_squeeze
525
526 mov x0,x19 // block exhausted: permute the state
527 bl KeccakF1600
528 mov x0,x19 // rewind to the start of the state
529 mov x3,x22 // reset the per-block byte counter
530 b .Loop_squeeze
531
532 .align 4
533 .Lsqueeze_tail:
534 strb w4,[x20],#1
535 lsr x4,x4,#8
536 subs x21,x21,#1
537 beq .Lsqueeze_done
538 strb w4,[x20],#1
539 lsr x4,x4,#8
540 subs x21,x21,#1
541 beq .Lsqueeze_done
542 strb w4,[x20],#1
543 lsr x4,x4,#8
544 subs x21,x21,#1
545 beq .Lsqueeze_done
546 strb w4,[x20],#1
547 lsr x4,x4,#8
548 subs x21,x21,#1
549 beq .Lsqueeze_done
550 strb w4,[x20],#1
551 lsr x4,x4,#8
552 subs x21,x21,#1
553 beq .Lsqueeze_done
554 strb w4,[x20],#1
555 lsr x4,x4,#8
556 subs x21,x21,#1
557 beq .Lsqueeze_done
558 strb w4,[x20],#1 // seventh and last possible tail byte
559
560 .Lsqueeze_done:
561 ldp x19,x20,[sp,#16]
562 ldp x21,x22,[sp,#32]
563 ldp x29,x30,[sp],#48
564 ret
565 .size SHA3_squeeze,.-SHA3_squeeze
// KeccakF1600_ce: round loop of the permutation with the state held in SIMD
// registers, using eor3/rax1/xar/bcax emitted as raw .inst words.
//
// In/Out: state lanes in v0-v24 (callers load and store them as d0-d24).
// Clobbers: x9 (iteration counter), x10 (iotas pointer), x11 (current round
//         constant), v25-v31 (temporaries), flags.
// The loop body contains two complete rounds (two Theta / Rho+Pi / Chi+Iota
// sequences, each loading one constant), and x9 counts 12 iterations, so 24
// constants are consumed per call.
// NOTE(review): the trailing comments give the intended disassembly of each
//         .inst word; the encodings were not independently re-derived here.
//         The instructions are presumably spelled as .inst so that assemblers
//         lacking these mnemonics can still build the file.
566 .type KeccakF1600_ce,%function
567 .align 5
568 KeccakF1600_ce:
569 mov x9,#12 // 12 iterations of a two-round body
570 adr x10,iotas
571 b .Loop_ce
572 .align 4
573 .Loop_ce:
574 ////////////////////////////////////////////////// Theta
575 .inst 0xce052819 //eor3 v25.16b,v0.16b,v5.16b,v10.16b
576 .inst 0xce062c3a //eor3 v26.16b,v1.16b,v6.16b,v11.16b
577 .inst 0xce07305b //eor3 v27.16b,v2.16b,v7.16b,v12.16b
578 .inst 0xce08347c //eor3 v28.16b,v3.16b,v8.16b,v13.16b
579 .inst 0xce09389d //eor3 v29.16b,v4.16b,v9.16b,v14.16b
580 .inst 0xce0f5339 //eor3 v25.16b,v25.16b, v15.16b,v20.16b
581 .inst 0xce10575a //eor3 v26.16b,v26.16b, v16.16b,v21.16b
582 .inst 0xce115b7b //eor3 v27.16b,v27.16b, v17.16b,v22.16b
583 .inst 0xce125f9c //eor3 v28.16b,v28.16b, v18.16b,v23.16b
584 .inst 0xce1363bd //eor3 v29.16b,v29.16b, v19.16b,v24.16b
585
586 .inst 0xce7b8f3e //rax1 v30.16b,v25.16b,v27.16b // D[1]
587 .inst 0xce7c8f5f //rax1 v31.16b,v26.16b,v28.16b // D[2]
588 .inst 0xce7d8f7b //rax1 v27.16b,v27.16b,v29.16b // D[3]
589 .inst 0xce798f9c //rax1 v28.16b,v28.16b,v25.16b // D[4]
590 .inst 0xce7a8fbd //rax1 v29.16b,v29.16b,v26.16b // D[0]
591
592 ////////////////////////////////////////////////// Theta+Rho+Pi
593 .inst 0xce9e50d9 //xar v25.16b, v6.16b,v30.16b,#64-44 // C[0]=A[0][1]
594 .inst 0xce9cb126 //xar v6.16b,v9.16b,v28.16b,#64-20
595 .inst 0xce9f0ec9 //xar v9.16b,v22.16b,v31.16b,#64-61
596 .inst 0xce9c65d6 //xar v22.16b,v14.16b,v28.16b,#64-39
597 .inst 0xce9dba8e //xar v14.16b,v20.16b,v29.16b,#64-18
598
599 .inst 0xce9f0854 //xar v20.16b,v2.16b,v31.16b,#64-62
600
601 .inst 0xce9f5582 //xar v2.16b,v12.16b,v31.16b,#64-43
602 .inst 0xce9b9dac //xar v12.16b,v13.16b,v27.16b,#64-25
603 .inst 0xce9ce26d //xar v13.16b,v19.16b,v28.16b,#64-8
604 .inst 0xce9b22f3 //xar v19.16b,v23.16b,v27.16b,#64-56
605 .inst 0xce9d5df7 //xar v23.16b,v15.16b,v29.16b,#64-41
606
607 .inst 0xce9c948f //xar v15.16b,v4.16b,v28.16b,#64-27
608
609 eor v0.16b,v0.16b,v29.16b
610 ldr x11,[x10],#8 // round constant for the first round of this iteration
611
612 .inst 0xce9bae5a //xar v26.16b, v18.16b,v27.16b,#64-21 // C[1]=A[0][3]
613 .inst 0xce9fc632 //xar v18.16b,v17.16b,v31.16b,#64-15
614 .inst 0xce9ed971 //xar v17.16b,v11.16b,v30.16b,#64-10
615 .inst 0xce9fe8eb //xar v11.16b,v7.16b,v31.16b,#64-6
616 .inst 0xce9df547 //xar v7.16b,v10.16b,v29.16b,#64-3
617
618 .inst 0xce9efc2a //xar v10.16b,v1.16b,v30.16b,#64-1 // *
619
620 .inst 0xce9ccb04 //xar v4.16b,v24.16b,v28.16b,#64-14
621 .inst 0xce9efab8 //xar v24.16b,v21.16b,v30.16b,#64-2
622 .inst 0xce9b2515 //xar v21.16b,v8.16b,v27.16b,#64-55
623 .inst 0xce9e4e08 //xar v8.16b,v16.16b,v30.16b,#64-45
624 .inst 0xce9d70b0 //xar v16.16b,v5.16b,v29.16b,#64-36
625
626 .inst 0xce9b907b //xar v27.16b, v3.16b,v27.16b,#64-28 // C[2]=A[1][0]
627
628 ////////////////////////////////////////////////// Chi+Iota
629 dup v31.2d,x11 // borrow C[6]
630 .inst 0xce22641c //bcax v28.16b, v0.16b,v2.16b,v25.16b // *
631 .inst 0xce3a0b21 //bcax v1.16b,v25.16b, v26.16b, v2.16b // *
632 .inst 0xce246842 //bcax v2.16b,v2.16b,v4.16b,v26.16b
633 .inst 0xce201343 //bcax v3.16b,v26.16b, v0.16b,v4.16b
634 .inst 0xce390084 //bcax v4.16b,v4.16b,v25.16b, v0.16b
635
636 .inst 0xce271b65 //bcax v5.16b,v27.16b, v7.16b,v6.16b // *
637 .inst 0xce281cd9 //bcax v25.16b, v6.16b,v8.16b,v7.16b // *
638 .inst 0xce2920e7 //bcax v7.16b,v7.16b,v9.16b,v8.16b
639 .inst 0xce3b2508 //bcax v8.16b,v8.16b,v27.16b, v9.16b
640 .inst 0xce266d29 //bcax v9.16b,v9.16b,v6.16b,v27.16b
641
642 eor v0.16b,v28.16b,v31.16b // Iota
643
644 .inst 0xce2c2d5a //bcax v26.16b, v10.16b,v12.16b,v11.16b // *
645 .inst 0xce2d317b //bcax v27.16b, v11.16b,v13.16b,v12.16b // *
646 .inst 0xce2e358c //bcax v12.16b,v12.16b,v14.16b,v13.16b
647 .inst 0xce2a39ad //bcax v13.16b,v13.16b,v10.16b,v14.16b
648 .inst 0xce2b29ce //bcax v14.16b,v14.16b,v11.16b,v10.16b
649
650 .inst 0xce3141fc //bcax v28.16b, v15.16b,v17.16b,v16.16b // *
651 .inst 0xce32461d //bcax v29.16b, v16.16b,v18.16b,v17.16b // *
652 .inst 0xce334a31 //bcax v17.16b,v17.16b,v19.16b,v18.16b
653 .inst 0xce2f4e52 //bcax v18.16b,v18.16b,v15.16b,v19.16b
654 .inst 0xce303e73 //bcax v19.16b,v19.16b,v16.16b,v15.16b
655
656 .inst 0xce36569e //bcax v30.16b, v20.16b,v22.16b,v21.16b // *
657 .inst 0xce375abf //bcax v31.16b, v21.16b,v23.16b,v22.16b // *
658 .inst 0xce385ed6 //bcax v22.16b,v22.16b,v24.16b,v23.16b
659 .inst 0xce3462f7 //bcax v23.16b,v23.16b,v20.16b,v24.16b
660 .inst 0xce355318 //bcax v24.16b,v24.16b,v21.16b,v20.16b
661 ////////////////////////////////////////////////// Theta (second round of the iteration)
662 .inst 0xce056806 //eor3 v6.16b,v0.16b,v5.16b,v26.16b
663 .inst 0xce196c2a //eor3 v10.16b,v1.16b,v25.16b,v27.16b
664 .inst 0xce07304b //eor3 v11.16b,v2.16b,v7.16b,v12.16b
665 .inst 0xce08346f //eor3 v15.16b,v3.16b,v8.16b,v13.16b
666 .inst 0xce093890 //eor3 v16.16b,v4.16b,v9.16b,v14.16b
667 .inst 0xce1c78c6 //eor3 v6.16b,v6.16b, v28.16b,v30.16b
668 .inst 0xce1d7d4a //eor3 v10.16b,v10.16b, v29.16b,v31.16b
669 .inst 0xce11596b //eor3 v11.16b,v11.16b, v17.16b,v22.16b
670 .inst 0xce125def //eor3 v15.16b,v15.16b, v18.16b,v23.16b
671 .inst 0xce136210 //eor3 v16.16b,v16.16b, v19.16b,v24.16b
672
673 .inst 0xce6b8cd4 //rax1 v20.16b,v6.16b,v11.16b // D[1]
674 .inst 0xce6f8d55 //rax1 v21.16b,v10.16b,v15.16b // D[2]
675 .inst 0xce708d6b //rax1 v11.16b,v11.16b,v16.16b // D[3]
676 .inst 0xce668def //rax1 v15.16b,v15.16b,v6.16b // D[4]
677 .inst 0xce6a8e10 //rax1 v16.16b,v16.16b,v10.16b // D[0]
678
679 ////////////////////////////////////////////////// Theta+Rho+Pi
680 .inst 0xce945326 //xar v6.16b, v25.16b,v20.16b,#64-44 // C[0]=A[0][1]
681 .inst 0xce8fb139 //xar v25.16b,v9.16b,v15.16b,#64-20
682 .inst 0xce950ec9 //xar v9.16b,v22.16b,v21.16b,#64-61
683 .inst 0xce8f65d6 //xar v22.16b,v14.16b,v15.16b,#64-39
684 .inst 0xce90bbce //xar v14.16b,v30.16b,v16.16b,#64-18
685
686 .inst 0xce95085e //xar v30.16b,v2.16b,v21.16b,#64-62
687
688 .inst 0xce955582 //xar v2.16b,v12.16b,v21.16b,#64-43
689 .inst 0xce8b9dac //xar v12.16b,v13.16b,v11.16b,#64-25
690 .inst 0xce8fe26d //xar v13.16b,v19.16b,v15.16b,#64-8
691 .inst 0xce8b22f3 //xar v19.16b,v23.16b,v11.16b,#64-56
692 .inst 0xce905f97 //xar v23.16b,v28.16b,v16.16b,#64-41
693
694 .inst 0xce8f949c //xar v28.16b,v4.16b,v15.16b,#64-27
695
696 eor v0.16b,v0.16b,v16.16b
697 ldr x11,[x10],#8 // round constant for the second round of this iteration
698
699 .inst 0xce8bae4a //xar v10.16b, v18.16b,v11.16b,#64-21 // C[1]=A[0][3]
700 .inst 0xce95c632 //xar v18.16b,v17.16b,v21.16b,#64-15
701 .inst 0xce94db71 //xar v17.16b,v27.16b,v20.16b,#64-10
702 .inst 0xce95e8fb //xar v27.16b,v7.16b,v21.16b,#64-6
703 .inst 0xce90f747 //xar v7.16b,v26.16b,v16.16b,#64-3
704
705 .inst 0xce94fc3a //xar v26.16b,v1.16b,v20.16b,#64-1 // *
706
707 .inst 0xce8fcb04 //xar v4.16b,v24.16b,v15.16b,#64-14
708 .inst 0xce94fbf8 //xar v24.16b,v31.16b,v20.16b,#64-2
709 .inst 0xce8b251f //xar v31.16b,v8.16b,v11.16b,#64-55
710 .inst 0xce944fa8 //xar v8.16b,v29.16b,v20.16b,#64-45
711 .inst 0xce9070bd //xar v29.16b,v5.16b,v16.16b,#64-36
712
713 .inst 0xce8b906b //xar v11.16b, v3.16b,v11.16b,#64-28 // C[2]=A[1][0]
714
715 ////////////////////////////////////////////////// Chi+Iota
716 dup v21.2d,x11 // borrow C[6]
717 .inst 0xce22180f //bcax v15.16b, v0.16b,v2.16b,v6.16b // *
718 .inst 0xce2a08c1 //bcax v1.16b,v6.16b, v10.16b, v2.16b // *
719 .inst 0xce242842 //bcax v2.16b,v2.16b,v4.16b,v10.16b
720 .inst 0xce201143 //bcax v3.16b,v10.16b, v0.16b,v4.16b
721 .inst 0xce260084 //bcax v4.16b,v4.16b,v6.16b, v0.16b
722
723 .inst 0xce276565 //bcax v5.16b,v11.16b, v7.16b,v25.16b // *
724 .inst 0xce281f26 //bcax v6.16b, v25.16b,v8.16b,v7.16b // *
725 .inst 0xce2920e7 //bcax v7.16b,v7.16b,v9.16b,v8.16b
726 .inst 0xce2b2508 //bcax v8.16b,v8.16b,v11.16b, v9.16b
727 .inst 0xce392d29 //bcax v9.16b,v9.16b,v25.16b,v11.16b
728
729 eor v0.16b,v15.16b,v21.16b // Iota
730
731 .inst 0xce2c6f4a //bcax v10.16b, v26.16b,v12.16b,v27.16b // *
732 .inst 0xce2d336b //bcax v11.16b, v27.16b,v13.16b,v12.16b // *
733 .inst 0xce2e358c //bcax v12.16b,v12.16b,v14.16b,v13.16b
734 .inst 0xce3a39ad //bcax v13.16b,v13.16b,v26.16b,v14.16b
735 .inst 0xce3b69ce //bcax v14.16b,v14.16b,v27.16b,v26.16b
736
737 .inst 0xce31778f //bcax v15.16b, v28.16b,v17.16b,v29.16b // *
738 .inst 0xce3247b0 //bcax v16.16b, v29.16b,v18.16b,v17.16b // *
739 .inst 0xce334a31 //bcax v17.16b,v17.16b,v19.16b,v18.16b
740 .inst 0xce3c4e52 //bcax v18.16b,v18.16b,v28.16b,v19.16b
741 .inst 0xce3d7273 //bcax v19.16b,v19.16b,v29.16b,v28.16b
742
743 .inst 0xce367fd4 //bcax v20.16b, v30.16b,v22.16b,v31.16b // *
744 .inst 0xce375bf5 //bcax v21.16b, v31.16b,v23.16b,v22.16b // *
745 .inst 0xce385ed6 //bcax v22.16b,v22.16b,v24.16b,v23.16b
746 .inst 0xce3e62f7 //bcax v23.16b,v23.16b,v30.16b,v24.16b
747 .inst 0xce3f7b18 //bcax v24.16b,v24.16b,v31.16b,v30.16b
748 subs x9,x9,#1
749 bne .Loop_ce
750
751 ret
752 .size KeccakF1600_ce,.-KeccakF1600_ce
753
// KeccakF1600_cext: file-local wrapper (no .globl) that applies
// KeccakF1600_ce to a state held in memory.
//
// In:  x0 = pointer to 25 consecutive 64-bit lanes (read and written in place).
// Frame: 80 bytes holding x29/x30 and d8-d15 (the state occupies d8-d15 too,
//        so they are saved on entry and restored before returning).
// x0 is used unchanged after the call; KeccakF1600_ce only writes x9-x11.
// x30 is reloaded right after the call and x29 is popped alone at the end.
754 .type KeccakF1600_cext,%function
755 .align 5
756 KeccakF1600_cext:
757 stp x29,x30,[sp,#-80]!
758 add x29,sp,#0
759 stp d8,d9,[sp,#16] // per ABI requirement
760 stp d10,d11,[sp,#32]
761 stp d12,d13,[sp,#48]
762 stp d14,d15,[sp,#64]
763 ldp d0,d1,[x0,#8*0]
764 ldp d2,d3,[x0,#8*2]
765 ldp d4,d5,[x0,#8*4]
766 ldp d6,d7,[x0,#8*6]
767 ldp d8,d9,[x0,#8*8]
768 ldp d10,d11,[x0,#8*10]
769 ldp d12,d13,[x0,#8*12]
770 ldp d14,d15,[x0,#8*14]
771 ldp d16,d17,[x0,#8*16]
772 ldp d18,d19,[x0,#8*18]
773 ldp d20,d21,[x0,#8*20]
774 ldp d22,d23,[x0,#8*22]
775 ldr d24,[x0,#8*24]
776 bl KeccakF1600_ce
777 ldr x30,[sp,#8] // bl overwrote x30; restore our own return address
778 stp d0,d1,[x0,#8*0]
779 stp d2,d3,[x0,#8*2]
780 stp d4,d5,[x0,#8*4]
781 stp d6,d7,[x0,#8*6]
782 stp d8,d9,[x0,#8*8]
783 stp d10,d11,[x0,#8*10]
784 stp d12,d13,[x0,#8*12]
785 stp d14,d15,[x0,#8*14]
786 stp d16,d17,[x0,#8*16]
787 stp d18,d19,[x0,#8*18]
788 stp d20,d21,[x0,#8*20]
789 stp d22,d23,[x0,#8*22]
790 str d24,[x0,#8*24]
791
792 ldp d8,d9,[sp,#16]
793 ldp d10,d11,[sp,#32]
794 ldp d12,d13,[sp,#48]
795 ldp d14,d15,[sp,#64]
796 ldr x29,[sp],#80 // x30 was already restored above
797 ret
798 .size KeccakF1600_cext,.-KeccakF1600_cext
// SHA3_absorb_cext: XOR whole input blocks into the state and permute after
// each, keeping the state in SIMD registers.
//
// In:  x0 = pointer to 25 consecutive 64-bit state lanes, x1 = input pointer,
//      x2 = input length in bytes, x3 = block size in bytes.
// Out: x0 = number of input bytes left unprocessed (the first len < bsz).
// Frame: 80 bytes holding x29/x30 and d8-d15 (the state occupies d8-d15).
// x0-x3 stay live across the calls; KeccakF1600_ce only writes x9-x11.
// Per block: input is consumed as 64-bit words into d31 and XORed into
//      successive lanes. After each even-numbered lane bsz is compared with
//      8*(i+2): `blo` stops when the block ended with that lane; the `beq`
//      after the following lane reuses the same flags and stops when the
//      block ended there.
// Big-endian builds (__AARCH64EB__) byte-swap every loaded word with the
//      vector form rev64; the scalar `rev` mnemonic does not accept vector
//      operands, so every lane uses rev64 uniformly.
// NOTE(review): the ladder only terminates correctly when bsz is a multiple
//      of 8 and at most 200 -- confirm against callers.
799 .globl SHA3_absorb_cext
800 .type SHA3_absorb_cext,%function
801 .align 5
802 SHA3_absorb_cext:
803 stp x29,x30,[sp,#-80]!
804 add x29,sp,#0
805 stp d8,d9,[sp,#16] // per ABI requirement
806 stp d10,d11,[sp,#32]
807 stp d12,d13,[sp,#48]
808 stp d14,d15,[sp,#64]
809 ldp d0,d1,[x0,#8*0]
810 ldp d2,d3,[x0,#8*2]
811 ldp d4,d5,[x0,#8*4]
812 ldp d6,d7,[x0,#8*6]
813 ldp d8,d9,[x0,#8*8]
814 ldp d10,d11,[x0,#8*10]
815 ldp d12,d13,[x0,#8*12]
816 ldp d14,d15,[x0,#8*14]
817 ldp d16,d17,[x0,#8*16]
818 ldp d18,d19,[x0,#8*18]
819 ldp d20,d21,[x0,#8*20]
820 ldp d22,d23,[x0,#8*22]
821 ldr d24,[x0,#8*24]
822 b .Loop_absorb_ce
823
824 .align 4
825 .Loop_absorb_ce:
826 subs x2,x2,x3 // len - bsz
827 blo .Labsorbed_ce // fewer than bsz bytes left: stop
828 ldr d31,[x1],#8 // *inp++
829 #ifdef __AARCH64EB__
830 rev64 v31.16b,v31.16b
831 #endif
832 eor v0.16b,v0.16b,v31.16b
833 cmp x3,#8*(0+2)
834 blo .Lprocess_block_ce
835 ldr d31,[x1],#8 // *inp++
836 #ifdef __AARCH64EB__
837 rev64 v31.16b,v31.16b
838 #endif
839 eor v1.16b,v1.16b,v31.16b
840 beq .Lprocess_block_ce
841 ldr d31,[x1],#8 // *inp++
842 #ifdef __AARCH64EB__
843 rev64 v31.16b,v31.16b
844 #endif
845 eor v2.16b,v2.16b,v31.16b
846 cmp x3,#8*(2+2)
847 blo .Lprocess_block_ce
848 ldr d31,[x1],#8 // *inp++
849 #ifdef __AARCH64EB__
850 rev64 v31.16b,v31.16b
851 #endif
852 eor v3.16b,v3.16b,v31.16b
853 beq .Lprocess_block_ce
854 ldr d31,[x1],#8 // *inp++
855 #ifdef __AARCH64EB__
856 rev64 v31.16b,v31.16b
857 #endif
858 eor v4.16b,v4.16b,v31.16b
859 cmp x3,#8*(4+2)
860 blo .Lprocess_block_ce
861 ldr d31,[x1],#8 // *inp++
862 #ifdef __AARCH64EB__
863 rev64 v31.16b,v31.16b
864 #endif
865 eor v5.16b,v5.16b,v31.16b
866 beq .Lprocess_block_ce
867 ldr d31,[x1],#8 // *inp++
868 #ifdef __AARCH64EB__
869 rev64 v31.16b,v31.16b
870 #endif
871 eor v6.16b,v6.16b,v31.16b
872 cmp x3,#8*(6+2)
873 blo .Lprocess_block_ce
874 ldr d31,[x1],#8 // *inp++
875 #ifdef __AARCH64EB__
876 rev64 v31.16b,v31.16b
877 #endif
878 eor v7.16b,v7.16b,v31.16b
879 beq .Lprocess_block_ce
880 ldr d31,[x1],#8 // *inp++
881 #ifdef __AARCH64EB__
882 rev64 v31.16b,v31.16b
883 #endif
884 eor v8.16b,v8.16b,v31.16b
885 cmp x3,#8*(8+2)
886 blo .Lprocess_block_ce
887 ldr d31,[x1],#8 // *inp++
888 #ifdef __AARCH64EB__
889 rev64 v31.16b,v31.16b
890 #endif
891 eor v9.16b,v9.16b,v31.16b
892 beq .Lprocess_block_ce
893 ldr d31,[x1],#8 // *inp++
894 #ifdef __AARCH64EB__
895 rev64 v31.16b,v31.16b
896 #endif
897 eor v10.16b,v10.16b,v31.16b
898 cmp x3,#8*(10+2)
899 blo .Lprocess_block_ce
900 ldr d31,[x1],#8 // *inp++
901 #ifdef __AARCH64EB__
902 rev64 v31.16b,v31.16b
903 #endif
904 eor v11.16b,v11.16b,v31.16b
905 beq .Lprocess_block_ce
906 ldr d31,[x1],#8 // *inp++
907 #ifdef __AARCH64EB__
908 rev64 v31.16b,v31.16b
909 #endif
910 eor v12.16b,v12.16b,v31.16b
911 cmp x3,#8*(12+2)
912 blo .Lprocess_block_ce
913 ldr d31,[x1],#8 // *inp++
914 #ifdef __AARCH64EB__
915 rev64 v31.16b,v31.16b
916 #endif
917 eor v13.16b,v13.16b,v31.16b
918 beq .Lprocess_block_ce
919 ldr d31,[x1],#8 // *inp++
920 #ifdef __AARCH64EB__
921 rev64 v31.16b,v31.16b
922 #endif
923 eor v14.16b,v14.16b,v31.16b
924 cmp x3,#8*(14+2)
925 blo .Lprocess_block_ce
926 ldr d31,[x1],#8 // *inp++
927 #ifdef __AARCH64EB__
928 rev64 v31.16b,v31.16b
929 #endif
930 eor v15.16b,v15.16b,v31.16b
931 beq .Lprocess_block_ce
932 ldr d31,[x1],#8 // *inp++
933 #ifdef __AARCH64EB__
934 rev64 v31.16b,v31.16b
935 #endif
936 eor v16.16b,v16.16b,v31.16b
937 cmp x3,#8*(16+2)
938 blo .Lprocess_block_ce
939 ldr d31,[x1],#8 // *inp++
940 #ifdef __AARCH64EB__
941 rev64 v31.16b,v31.16b
942 #endif
943 eor v17.16b,v17.16b,v31.16b
944 beq .Lprocess_block_ce
945 ldr d31,[x1],#8 // *inp++
946 #ifdef __AARCH64EB__
947 rev64 v31.16b,v31.16b
948 #endif
949 eor v18.16b,v18.16b,v31.16b
950 cmp x3,#8*(18+2)
951 blo .Lprocess_block_ce
952 ldr d31,[x1],#8 // *inp++
953 #ifdef __AARCH64EB__
954 rev64 v31.16b,v31.16b
955 #endif
956 eor v19.16b,v19.16b,v31.16b
957 beq .Lprocess_block_ce
958 ldr d31,[x1],#8 // *inp++
959 #ifdef __AARCH64EB__
960 rev64 v31.16b,v31.16b
961 #endif
962 eor v20.16b,v20.16b,v31.16b
963 cmp x3,#8*(20+2)
964 blo .Lprocess_block_ce
965 ldr d31,[x1],#8 // *inp++
966 #ifdef __AARCH64EB__
967 rev64 v31.16b,v31.16b
968 #endif
969 eor v21.16b,v21.16b,v31.16b
970 beq .Lprocess_block_ce
971 ldr d31,[x1],#8 // *inp++
972 #ifdef __AARCH64EB__
973 rev64 v31.16b,v31.16b
974 #endif
975 eor v22.16b,v22.16b,v31.16b
976 cmp x3,#8*(22+2)
977 blo .Lprocess_block_ce
978 ldr d31,[x1],#8 // *inp++
979 #ifdef __AARCH64EB__
980 rev64 v31.16b,v31.16b
981 #endif
982 eor v23.16b,v23.16b,v31.16b
983 beq .Lprocess_block_ce
984 ldr d31,[x1],#8 // *inp++
985 #ifdef __AARCH64EB__
986 rev64 v31.16b,v31.16b
987 #endif
988 eor v24.16b,v24.16b,v31.16b
989
990 .Lprocess_block_ce:
991
992 bl KeccakF1600_ce
993
994 b .Loop_absorb_ce
995
996 .align 4
997 .Labsorbed_ce:
998 stp d0,d1,[x0,#8*0]
999 stp d2,d3,[x0,#8*2]
1000 stp d4,d5,[x0,#8*4]
1001 stp d6,d7,[x0,#8*6]
1002 stp d8,d9,[x0,#8*8]
1003 stp d10,d11,[x0,#8*10]
1004 stp d12,d13,[x0,#8*12]
1005 stp d14,d15,[x0,#8*14]
1006 stp d16,d17,[x0,#8*16]
1007 stp d18,d19,[x0,#8*18]
1008 stp d20,d21,[x0,#8*20]
1009 stp d22,d23,[x0,#8*22]
1010 str d24,[x0,#8*24]
1011 add x0,x2,x3 // return value: undo the last subtraction to get bytes left over
1012
1013 ldp d8,d9,[sp,#16]
1014 ldp d10,d11,[sp,#32]
1015 ldp d12,d13,[sp,#48]
1016 ldp d14,d15,[sp,#64]
1017 ldp x29,x30,[sp],#80 // x30 comes from the frame; the bl calls above overwrote it
1018 ret
1019 .size SHA3_absorb_cext,.-SHA3_absorb_cext
// SHA3_squeeze_cext: copy output bytes out of the state, re-permuting the
// state via KeccakF1600_cext each time a full block has been emitted.
//
// In:  x0 = pointer to the state lanes, x1 = output pointer,
//      x2 = number of output bytes, x3 = block size in bytes.
// Scratch: x9 = read cursor into the state, x10 = bytes left in the current
//      block, x4 = current word. x0-x3 are used unchanged after the call.
// Full 64-bit words are stored while at least 8 bytes remain (byte-reversed
// when __AARCH64EB__ is defined); a final partial word is written one byte at
// a time, least-significant byte first.
// NOTE(review): len == 0 is not special-cased -- the first `cmp x2,#8` takes
//      the tail path, which stores before testing the count. Callers
//      presumably guarantee len > 0; confirm.
1020 .globl SHA3_squeeze_cext
1021 .type SHA3_squeeze_cext,%function
1022 .align 5
1023 SHA3_squeeze_cext:
1024 stp x29,x30,[sp,#-16]!
1025 add x29,sp,#0
1026 mov x9,x0
1027 mov x10,x3
1028
1029 .Loop_squeeze_ce:
1030 ldr x4,[x9],#8 // next state lane
1031 cmp x2,#8
1032 blo .Lsqueeze_tail_ce // fewer than 8 bytes wanted: emit byte by byte
1033 #ifdef __AARCH64EB__
1034 rev x4,x4
1035 #endif
1036 str x4,[x1],#8
1037 beq .Lsqueeze_done_ce // flags still from cmp above: exactly 8 bytes were left
1038
1039 sub x2,x2,#8
1040 subs x10,x10,#8 // bytes left in the current block
1041 bhi .Loop_squeeze_ce
1042
1043 bl KeccakF1600_cext // block exhausted: permute the state
1044 ldr x30,[sp,#8] // bl overwrote x30; restore our own return address
1045 mov x9,x0 // rewind to the start of the state
1046 mov x10,x3 // reset the per-block byte counter
1047 b .Loop_squeeze_ce
1048
1049 .align 4
1050 .Lsqueeze_tail_ce:
1051 strb w4,[x1],#1
1052 lsr x4,x4,#8
1053 subs x2,x2,#1
1054 beq .Lsqueeze_done_ce
1055 strb w4,[x1],#1
1056 lsr x4,x4,#8
1057 subs x2,x2,#1
1058 beq .Lsqueeze_done_ce
1059 strb w4,[x1],#1
1060 lsr x4,x4,#8
1061 subs x2,x2,#1
1062 beq .Lsqueeze_done_ce
1063 strb w4,[x1],#1
1064 lsr x4,x4,#8
1065 subs x2,x2,#1
1066 beq .Lsqueeze_done_ce
1067 strb w4,[x1],#1
1068 lsr x4,x4,#8
1069 subs x2,x2,#1
1070 beq .Lsqueeze_done_ce
1071 strb w4,[x1],#1
1072 lsr x4,x4,#8
1073 subs x2,x2,#1
1074 beq .Lsqueeze_done_ce
1075 strb w4,[x1],#1 // seventh and last possible tail byte
1076
1077 .Lsqueeze_done_ce:
1078 ldr x29,[sp],#16 // x30 is either untouched or was reloaded after the call
1079 ret
1080 .size SHA3_squeeze_cext,.-SHA3_squeeze_cext
1081 .byte 75,101,99,99,97,107,45,49,54,48,48,32,97,98,115,111,114,98,32,97,110,100,32,115,113,117,101,101,122,101,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1082 .align 2
1083