.text

# __KeccakF1600 applies the 24-round Keccak-f[1600] permutation to a state
# held entirely in registers: %ymm0 carries A[0][0] (broadcast to all four
# lanes), %ymm1-%ymm6 hold the remaining 24 lanes, %ymm16-%ymm21 hold the
# rho rotation counts (loaded by the callers from rhotates_left), %r10 walks
# the per-round constants at iotas, and %ymm7-%ymm15 are clobbered as
# temporaries.

.type	__KeccakF1600,@function
.align	32
__KeccakF1600:
	lea	iotas(%rip),%r10
	mov	$24,%eax
	jmp	.Loop_avx512vl

.align	32
.Loop_avx512vl:
	######################################### Theta
	vpshufd		$0b01001110,%ymm2,%ymm13
	vpxor		%ymm3,%ymm5,%ymm12
	vpxor		%ymm6,%ymm4,%ymm9
	vpternlogq	$0x96,%ymm1,%ymm9,%ymm12	# C[1..4]

	vpxor		%ymm2,%ymm13,%ymm13
	vpermq		$0b01001110,%ymm13,%ymm7

	vpermq		$0b10010011,%ymm12,%ymm11
	vprolq		$1,%ymm12,%ymm8			# ROL64(C[1..4],1)

	vpermq		$0b00111001,%ymm8,%ymm15
	vpxor		%ymm11,%ymm8,%ymm14
	vpermq		$0b00000000,%ymm14,%ymm14	# D[0..0] = ROL64(C[1],1) ^ C[4]

	vpternlogq	$0x96,%ymm7,%ymm0,%ymm13	# C[0..0]
	vprolq		$1,%ymm13,%ymm8			# ROL64(C[0..0],1)

	vpxor		%ymm14,%ymm0,%ymm0		# ^= D[0..0]

	vpblendd	$0b11000000,%ymm8,%ymm15,%ymm15
	vpblendd	$0b00000011,%ymm13,%ymm11,%ymm7

	######################################### Rho + Pi + pre-Chi shuffle
	vpxor		%ymm14,%ymm2,%ymm2		# ^= D[0..0] from Theta
	vprolvq		%ymm16,%ymm2,%ymm2

	vpternlogq	$0x96,%ymm7,%ymm15,%ymm3	# ^= D[1..4] from Theta
	vprolvq		%ymm18,%ymm3,%ymm3

	vpternlogq	$0x96,%ymm7,%ymm15,%ymm4	# ^= D[1..4] from Theta
	vprolvq		%ymm19,%ymm4,%ymm4

	vpternlogq	$0x96,%ymm7,%ymm15,%ymm5	# ^= D[1..4] from Theta
	vprolvq		%ymm20,%ymm5,%ymm5

	vpermq		$0b10001101,%ymm2,%ymm10	# %ymm2 -> future %ymm3
	vpermq		$0b10001101,%ymm3,%ymm11	# %ymm3 -> future %ymm4
	vpternlogq	$0x96,%ymm7,%ymm15,%ymm6	# ^= D[1..4] from Theta
	vprolvq		%ymm21,%ymm6,%ymm8		# %ymm6 -> future %ymm1

	vpermq		$0b00011011,%ymm4,%ymm12	# %ymm4 -> future %ymm5
	vpermq		$0b01110010,%ymm5,%ymm13	# %ymm5 -> future %ymm6
	vpternlogq	$0x96,%ymm7,%ymm15,%ymm1	# ^= D[1..4] from Theta
	vprolvq		%ymm17,%ymm1,%ymm9		# %ymm1 -> future %ymm2

	######################################### Chi
	vpblendd	$0b00001100,%ymm13,%ymm9,%ymm3		# [4][4] [2][0]
	vpblendd	$0b00001100,%ymm9,%ymm11,%ymm15		# [4][0] [2][1]
	vpblendd	$0b00001100,%ymm11,%ymm10,%ymm5		# [4][2] [2][4]
	vpblendd	$0b00001100,%ymm10,%ymm9,%ymm14		# [4][3] [2][0]
	vpblendd	$0b00110000,%ymm11,%ymm3,%ymm3		# [1][3] [4][4] [2][0]
	vpblendd	$0b00110000,%ymm12,%ymm15,%ymm15	# [1][4] [4][0] [2][1]
	vpblendd	$0b00110000,%ymm9,%ymm5,%ymm5		# [1][0] [4][2] [2][4]
	vpblendd	$0b00110000,%ymm13,%ymm14,%ymm14	# [1][1] [4][3] [2][0]
	vpblendd	$0b11000000,%ymm12,%ymm3,%ymm3		# [3][2] [1][3] [4][4] [2][0]
	vpblendd	$0b11000000,%ymm13,%ymm15,%ymm15	# [3][3] [1][4] [4][0] [2][1]
	vpblendd	$0b11000000,%ymm13,%ymm5,%ymm5		# [3][3] [1][0] [4][2] [2][4]
	vpblendd	$0b11000000,%ymm11,%ymm14,%ymm14	# [3][4] [1][1] [4][3] [2][0]
	vpternlogq	$0xC6,%ymm15,%ymm10,%ymm3		# [3][1] [1][2] [4][3] [2][4]
	vpternlogq	$0xC6,%ymm14,%ymm12,%ymm5		# [3][2] [1][4] [4][1] [2][3]

	vpsrldq		$8,%ymm8,%ymm7
	vpandn		%ymm7,%ymm8,%ymm7		# targeting [0][0] [0][0] [0][0] [0][0]

	vpblendd	$0b00001100,%ymm9,%ymm12,%ymm6		# [4][0] [2][3]
	vpblendd	$0b00001100,%ymm12,%ymm10,%ymm15	# [4][1] [2][4]
	vpblendd	$0b00110000,%ymm10,%ymm6,%ymm6		# [1][2] [4][0] [2][3]
	vpblendd	$0b00110000,%ymm11,%ymm15,%ymm15	# [1][3] [4][1] [2][4]
	vpblendd	$0b11000000,%ymm11,%ymm6,%ymm6		# [3][4] [1][2] [4][0] [2][3]
	vpblendd	$0b11000000,%ymm9,%ymm15,%ymm15		# [3][0] [1][3] [4][1] [2][4]
	vpternlogq	$0xC6,%ymm15,%ymm13,%ymm6		# [3][3] [1][1] [4][4] [2][2]

	vpermq		$0b00011110,%ymm8,%ymm4			# [0][1] [0][2] [0][4] [0][3]
	vpblendd	$0b00110000,%ymm0,%ymm4,%ymm15		# [0][1] [0][0] [0][4] [0][3]
	vpermq		$0b00111001,%ymm8,%ymm1			# [0][1] [0][4] [0][3] [0][2]
	vpblendd	$0b11000000,%ymm0,%ymm1,%ymm1		# [0][0] [0][4] [0][3] [0][2]

	vpblendd	$0b00001100,%ymm12,%ymm11,%ymm2		# [4][1] [2][1]
	vpblendd	$0b00001100,%ymm11,%ymm13,%ymm14	# [4][2] [2][2]
	vpblendd	$0b00110000,%ymm13,%ymm2,%ymm2		# [1][1] [4][1] [2][1]
	vpblendd	$0b00110000,%ymm10,%ymm14,%ymm14	# [1][2] [4][2] [2][2]
	vpblendd	$0b11000000,%ymm10,%ymm2,%ymm2		# [3][1] [1][1] [4][1] [2][1]
	vpblendd	$0b11000000,%ymm12,%ymm14,%ymm14	# [3][2] [1][2] [4][2] [2][2]
	vpternlogq	$0xC6,%ymm14,%ymm9,%ymm2		# [3][0] [1][0] [4][0] [2][0]

	vpermq		$0b00000000,%ymm7,%ymm7		# [0][0] [0][0] [0][0] [0][0]
	vpermq		$0b00011011,%ymm3,%ymm3		# post-Chi shuffle
	vpermq		$0b10001101,%ymm5,%ymm5
	vpermq		$0b01110010,%ymm6,%ymm6

	vpblendd	$0b00001100,%ymm10,%ymm13,%ymm4		# [4][3] [2][2]
	vpblendd	$0b00001100,%ymm13,%ymm12,%ymm14	# [4][4] [2][3]
	vpblendd	$0b00110000,%ymm12,%ymm4,%ymm4		# [1][4] [4][3] [2][2]
	vpblendd	$0b00110000,%ymm9,%ymm14,%ymm14		# [1][0] [4][4] [2][3]
	vpblendd	$0b11000000,%ymm9,%ymm4,%ymm4		# [3][0] [1][4] [4][3] [2][2]
	vpblendd	$0b11000000,%ymm10,%ymm14,%ymm14	# [3][1] [1][0] [4][4] [2][3]

	vpternlogq	$0xC6,%ymm15,%ymm8,%ymm1		# [0][4] [0][3] [0][2] [0][1]
	vpternlogq	$0xC6,%ymm14,%ymm11,%ymm4		# [3][4] [1][3] [4][2] [2][1]

	######################################### Iota
	vpternlogq	$0x96,(%r10),%ymm7,%ymm0
	lea	32(%r10),%r10

	dec	%eax
	jnz	.Loop_avx512vl

	ret
.size	__KeccakF1600,.-__KeccakF1600

# SHA3_absorb absorbs input into the state: %rdi = A (the 200-byte Keccak
# state, kept in this module's own lane order), %rsi = inp, %rdx = len in
# bytes, %rcx = bsz (rate in bytes).  It returns in %rax the number of
# trailing bytes (< bsz) left unabsorbed, matching a prototype along the
# lines of
#   size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp,
#                      size_t len, size_t bsz);
.globl	SHA3_absorb
.type	SHA3_absorb,@function
.align	32
SHA3_absorb:
	mov	%rsp,%r11

	lea	-240(%rsp),%rsp
	and	$-32,%rsp

	lea	96(%rdi),%rdi
	lea	96(%rsi),%rsi
	lea	96(%rsp),%r10
	lea	rhotates_left(%rip),%r8

	vzeroupper

	vpbroadcastq	-96(%rdi),%ymm0		# load A[5][5]
	vmovdqu		8+32*0-96(%rdi),%ymm1
	vmovdqu		8+32*1-96(%rdi),%ymm2
	vmovdqu		8+32*2-96(%rdi),%ymm3
	vmovdqu		8+32*3-96(%rdi),%ymm4
	vmovdqu		8+32*4-96(%rdi),%ymm5
	vmovdqu		8+32*5-96(%rdi),%ymm6

	vmovdqa64	0*32(%r8),%ymm16	# load "rhotate" indices
	vmovdqa64	1*32(%r8),%ymm17
	vmovdqa64	2*32(%r8),%ymm18
	vmovdqa64	3*32(%r8),%ymm19
	vmovdqa64	4*32(%r8),%ymm20
	vmovdqa64	5*32(%r8),%ymm21

	vpxor	%ymm7,%ymm7,%ymm7
	vmovdqa	%ymm7,32*2-96(%r10)		# zero transfer area on stack
	vmovdqa	%ymm7,32*3-96(%r10)
	vmovdqa	%ymm7,32*4-96(%r10)
	vmovdqa	%ymm7,32*5-96(%r10)
	vmovdqa	%ymm7,32*6-96(%r10)

.Loop_absorb_avx512vl:
	mov	%rcx,%rax
	sub	%rcx,%rdx
	jc	.Ldone_absorb_avx512vl

	shr	$3,%eax
	vpbroadcastq	0-96(%rsi),%ymm7
	vmovdqu	8-96(%rsi),%ymm8
	sub	$4,%eax
	dec	%eax
	jz	.Labsorved_avx512vl
	mov	8*5-96(%rsi),%r8
	mov	%r8,80-96(%r10)
	dec	%eax
	jz	.Labsorved_avx512vl
	mov	8*6-96(%rsi),%r8
	mov	%r8,192-96(%r10)
	dec	%eax
	jz	.Labsorved_avx512vl
	mov	8*7-96(%rsi),%r8
	mov	%r8,104-96(%r10)
	dec	%eax
	jz	.Labsorved_avx512vl
	mov	8*8-96(%rsi),%r8
	mov	%r8,144-96(%r10)
	dec	%eax
	jz	.Labsorved_avx512vl
	mov	8*9-96(%rsi),%r8
	mov	%r8,184-96(%r10)
	dec	%eax
	jz	.Labsorved_avx512vl
	mov	8*10-96(%rsi),%r8
	mov	%r8,64-96(%r10)
	dec	%eax
	jz	.Labsorved_avx512vl
	mov	8*11-96(%rsi),%r8
	mov	%r8,128-96(%r10)
	dec	%eax
	jz	.Labsorved_avx512vl
	mov	8*12-96(%rsi),%r8
	mov	%r8,200-96(%r10)
	dec	%eax
	jz	.Labsorved_avx512vl
	mov	8*13-96(%rsi),%r8
	mov	%r8,176-96(%r10)
	dec	%eax
	jz	.Labsorved_avx512vl
	mov	8*14-96(%rsi),%r8
	mov	%r8,120-96(%r10)
	dec	%eax
	jz	.Labsorved_avx512vl
	mov	8*15-96(%rsi),%r8
	mov	%r8,88-96(%r10)
	dec	%eax
	jz	.Labsorved_avx512vl
	mov	8*16-96(%rsi),%r8
	mov	%r8,96-96(%r10)
	dec	%eax
	jz	.Labsorved_avx512vl
	mov	8*17-96(%rsi),%r8
	mov	%r8,168-96(%r10)
	dec	%eax
	jz	.Labsorved_avx512vl
	mov	8*18-96(%rsi),%r8
	mov	%r8,208-96(%r10)
	dec	%eax
	jz	.Labsorved_avx512vl
	mov	8*19-96(%rsi),%r8
	mov	%r8,152-96(%r10)
	dec	%eax
	jz	.Labsorved_avx512vl
	mov	8*20-96(%rsi),%r8
	mov	%r8,72-96(%r10)
	dec	%eax
	jz	.Labsorved_avx512vl
	mov	8*21-96(%rsi),%r8
	mov	%r8,160-96(%r10)
	dec	%eax
	jz	.Labsorved_avx512vl
	mov	8*22-96(%rsi),%r8
	mov	%r8,136-96(%r10)
	dec	%eax
	jz	.Labsorved_avx512vl
	mov	8*23-96(%rsi),%r8
	mov	%r8,112-96(%r10)
	dec	%eax
	jz	.Labsorved_avx512vl
	mov	8*24-96(%rsi),%r8
	mov	%r8,216-96(%r10)
.Labsorved_avx512vl:
	lea	(%rsi,%rcx),%rsi

	vpxor	%ymm7,%ymm0,%ymm0
	vpxor	%ymm8,%ymm1,%ymm1
	vpxor	32*2-96(%r10),%ymm2,%ymm2
	vpxor	32*3-96(%r10),%ymm3,%ymm3
	vpxor	32*4-96(%r10),%ymm4,%ymm4
	vpxor	32*5-96(%r10),%ymm5,%ymm5
	vpxor	32*6-96(%r10),%ymm6,%ymm6

	call	__KeccakF1600

	lea	96(%rsp),%r10
	jmp	.Loop_absorb_avx512vl

.Ldone_absorb_avx512vl:
	vmovq	%xmm0,-96(%rdi)
	vmovdqu	%ymm1,8+32*0-96(%rdi)
	vmovdqu	%ymm2,8+32*1-96(%rdi)
	vmovdqu	%ymm3,8+32*2-96(%rdi)
	vmovdqu	%ymm4,8+32*3-96(%rdi)
	vmovdqu	%ymm5,8+32*4-96(%rdi)
	vmovdqu	%ymm6,8+32*5-96(%rdi)

	vzeroupper

	lea	(%r11),%rsp
	lea	(%rdx,%rcx),%rax		# return value
	ret
.size	SHA3_absorb,.-SHA3_absorb

# SHA3_squeeze writes output from the state: %rdi = A (same layout as above),
# %rsi = out, %rdx = len in bytes, %rcx = bsz (rate in bytes), re-running
# __KeccakF1600 whenever a full rate's worth of output has been produced.
# The corresponding prototype is presumably
#   void SHA3_squeeze(uint64_t A[5][5], unsigned char *out,
#                     size_t len, size_t bsz);
.globl	SHA3_squeeze
.type	SHA3_squeeze,@function
.align	32
SHA3_squeeze:
	mov	%rsp,%r11

	lea	96(%rdi),%rdi
	lea	rhotates_left(%rip),%r8
	shr	$3,%rcx

	vzeroupper

	vpbroadcastq	-96(%rdi),%ymm0
	vpxor		%ymm7,%ymm7,%ymm7
	vmovdqu		8+32*0-96(%rdi),%ymm1
	vmovdqu		8+32*1-96(%rdi),%ymm2
	vmovdqu		8+32*2-96(%rdi),%ymm3
	vmovdqu		8+32*3-96(%rdi),%ymm4
	vmovdqu		8+32*4-96(%rdi),%ymm5
	vmovdqu		8+32*5-96(%rdi),%ymm6

	vmovdqa64	0*32(%r8),%ymm16	# load "rhotate" indices
	vmovdqa64	1*32(%r8),%ymm17
	vmovdqa64	2*32(%r8),%ymm18
	vmovdqa64	3*32(%r8),%ymm19
	vmovdqa64	4*32(%r8),%ymm20
	vmovdqa64	5*32(%r8),%ymm21

	mov	%rcx,%rax

.Loop_squeeze_avx512vl:
	mov	0-96(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	32-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	40-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	48-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	56-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	80-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	192-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	104-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	144-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	184-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	64-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	128-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	200-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	176-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	120-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	88-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	96-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	168-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	208-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	152-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	72-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	160-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	136-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	112-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	216-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	-120(%rdi),%r8
.Lextend_output_avx512vl:
	call	__KeccakF1600

	vmovq	%xmm0,-96(%rdi)
	vmovdqu	%ymm1,8+32*0-96(%rdi)
	vmovdqu	%ymm2,8+32*1-96(%rdi)
	vmovdqu	%ymm3,8+32*2-96(%rdi)
	vmovdqu	%ymm4,8+32*3-96(%rdi)
	vmovdqu	%ymm5,8+32*4-96(%rdi)
	vmovdqu	%ymm6,8+32*5-96(%rdi)

	mov	%rcx,%rax
	jmp	.Loop_squeeze_avx512vl


.Ltail_squeeze_avx512vl:
	add	$8,%rdx
.Loop_tail_avx512vl:
	mov	%r8b,(%rsi)
	lea	1(%rsi),%rsi
	shr	$8,%r8
	dec	%rdx
	jnz	.Loop_tail_avx512vl

.Ldone_squeeze_avx512vl:
	vzeroupper

	lea	(%r11),%rsp
	ret
.size	SHA3_squeeze,.-SHA3_squeeze

.section	.rodata
.align	64
rhotates_left:
	.quad	3,	18,	36,	41	# [2][0] [4][0] [1][0] [3][0]
	.quad	1,	62,	28,	27	# [0][1] [0][2] [0][3] [0][4]
	.quad	45,	6,	56,	39	# [3][1] [1][2] [4][3] [2][4]
	.quad	10,	61,	55,	8	# [2][1] [4][2] [1][3] [3][4]
	.quad	2,	15,	25,	20	# [4][1] [3][2] [2][3] [1][4]
	.quad	44,	43,	21,	14	# [1][1] [2][2] [3][3] [4][4]
iotas:
	.quad	0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001
	.quad	0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082
	.quad	0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a
	.quad	0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000
	.quad	0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b
	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
	.quad	0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009
	.quad	0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a
	.quad	0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088
	.quad	0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009
	.quad	0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a
	.quad	0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b
	.quad	0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b
	.quad	0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089
	.quad	0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003
	.quad	0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002
	.quad	0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080
	.quad	0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a
	.quad	0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a
	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
	.quad	0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080
	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
	.quad	0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008

.asciz	"Keccak-1600 absorb and squeeze for AVX512VL, CRYPTOGAMS by <appro (at) openssl.org>"