.text

.type	__KeccakF1600,@function
.align	32
__KeccakF1600:
	lea	rhotates_left+96(%rip),%r8
	lea	rhotates_right+96(%rip),%r9
	lea	iotas(%rip),%r10
	mov	$24,%eax
	jmp	.Loop_avx2

.align	32
.Loop_avx2:
	######################################### Theta
	vpshufd	$0b01001110,%ymm2,%ymm13
	vpxor	%ymm3,%ymm5,%ymm12
	vpxor	%ymm6,%ymm4,%ymm9
	vpxor	%ymm1,%ymm12,%ymm12
	vpxor	%ymm9,%ymm12,%ymm12	# C[1..4]

	vpermq	$0b10010011,%ymm12,%ymm11
	vpxor	%ymm2,%ymm13,%ymm13
	vpermq	$0b01001110,%ymm13,%ymm7

	vpsrlq	$63,%ymm12,%ymm8
	vpaddq	%ymm12,%ymm12,%ymm9
	vpor	%ymm9,%ymm8,%ymm8	# ROL64(C[1..4],1)

	vpermq	$0b00111001,%ymm8,%ymm15
	vpxor	%ymm11,%ymm8,%ymm14
	vpermq	$0b00000000,%ymm14,%ymm14	# D[0..0] = ROL64(C[1],1) ^ C[4]

	vpxor	%ymm0,%ymm13,%ymm13
	vpxor	%ymm7,%ymm13,%ymm13	# C[0..0]

	vpsrlq	$63,%ymm13,%ymm7
	vpaddq	%ymm13,%ymm13,%ymm8
	vpor	%ymm7,%ymm8,%ymm8	# ROL64(C[0..0],1)

	vpxor	%ymm14,%ymm2,%ymm2	# ^= D[0..0]
	vpxor	%ymm14,%ymm0,%ymm0	# ^= D[0..0]

	vpblendd	$0b11000000,%ymm8,%ymm15,%ymm15
	vpblendd	$0b00000011,%ymm13,%ymm11,%ymm11
	vpxor	%ymm11,%ymm15,%ymm15	# D[1..4] = ROL64(C[2..4,0),1) ^ C[0..3]

	######################################### Rho + Pi + pre-Chi shuffle
	vpsllvq	0*32-96(%r8),%ymm2,%ymm10
	vpsrlvq	0*32-96(%r9),%ymm2,%ymm2
	vpor	%ymm10,%ymm2,%ymm2

	vpxor	%ymm15,%ymm3,%ymm3	# ^= D[1..4] from Theta
	vpsllvq	2*32-96(%r8),%ymm3,%ymm11
	vpsrlvq	2*32-96(%r9),%ymm3,%ymm3
	vpor	%ymm11,%ymm3,%ymm3

	vpxor	%ymm15,%ymm4,%ymm4	# ^= D[1..4] from Theta
	vpsllvq	3*32-96(%r8),%ymm4,%ymm12
	vpsrlvq	3*32-96(%r9),%ymm4,%ymm4
	vpor	%ymm12,%ymm4,%ymm4

	vpxor	%ymm15,%ymm5,%ymm5	# ^= D[1..4] from Theta
	vpsllvq	4*32-96(%r8),%ymm5,%ymm13
	vpsrlvq	4*32-96(%r9),%ymm5,%ymm5
	vpor	%ymm13,%ymm5,%ymm5

	vpxor	%ymm15,%ymm6,%ymm6	# ^= D[1..4] from Theta
	vpermq	$0b10001101,%ymm2,%ymm10	# %ymm2 -> future %ymm3
	vpermq	$0b10001101,%ymm3,%ymm11	# %ymm3 -> future %ymm4
	vpsllvq	5*32-96(%r8),%ymm6,%ymm14
	vpsrlvq	5*32-96(%r9),%ymm6,%ymm8
	vpor	%ymm14,%ymm8,%ymm8	# %ymm6 -> future %ymm1

	vpxor	%ymm15,%ymm1,%ymm1	# ^= D[1..4] from Theta
	vpermq	$0b00011011,%ymm4,%ymm12	# %ymm4 -> future %ymm5
	vpermq	$0b01110010,%ymm5,%ymm13	# %ymm5 -> future %ymm6
	vpsllvq	1*32-96(%r8),%ymm1,%ymm15
	vpsrlvq	1*32-96(%r9),%ymm1,%ymm9
	vpor	%ymm15,%ymm9,%ymm9	# %ymm1 -> future %ymm2

	######################################### Chi
	vpsrldq	$8,%ymm8,%ymm14
	vpandn	%ymm14,%ymm8,%ymm7	# tgting [0][0] [0][0] [0][0] [0][0]

	vpblendd	$0b00001100,%ymm13,%ymm9,%ymm3	# [4][4] [2][0]
	vpblendd	$0b00001100,%ymm9,%ymm11,%ymm15	# [4][0] [2][1]
	vpblendd	$0b00001100,%ymm11,%ymm10,%ymm5	# [4][2] [2][4]
	vpblendd	$0b00001100,%ymm10,%ymm9,%ymm14	# [4][3] [2][0]
	vpblendd	$0b00110000,%ymm11,%ymm3,%ymm3	# [1][3] [4][4] [2][0]
	vpblendd	$0b00110000,%ymm12,%ymm15,%ymm15	# [1][4] [4][0] [2][1]
	vpblendd	$0b00110000,%ymm9,%ymm5,%ymm5	# [1][0] [4][2] [2][4]
	vpblendd	$0b00110000,%ymm13,%ymm14,%ymm14	# [1][1] [4][3] [2][0]
	vpblendd	$0b11000000,%ymm12,%ymm3,%ymm3	# [3][2] [1][3] [4][4] [2][0]
	vpblendd	$0b11000000,%ymm13,%ymm15,%ymm15	# [3][3] [1][4] [4][0] [2][1]
	vpblendd	$0b11000000,%ymm13,%ymm5,%ymm5	# [3][3] [1][0] [4][2] [2][4]
	vpblendd	$0b11000000,%ymm11,%ymm14,%ymm14	# [3][4] [1][1] [4][3] [2][0]
	vpandn	%ymm15,%ymm3,%ymm3	# tgting [3][1] [1][2] [4][3] [2][4]
	vpandn	%ymm14,%ymm5,%ymm5	# tgting [3][2] [1][4] [4][1] [2][3]

	vpblendd	$0b00001100,%ymm9,%ymm12,%ymm6	# [4][0] [2][3]
	vpblendd	$0b00001100,%ymm12,%ymm10,%ymm15	# [4][1] [2][4]
	vpxor	%ymm10,%ymm3,%ymm3
	vpblendd	$0b00110000,%ymm10,%ymm6,%ymm6	# [1][2] [4][0] [2][3]
	vpblendd	$0b00110000,%ymm11,%ymm15,%ymm15	# [1][3] [4][1] [2][4]
	vpxor	%ymm12,%ymm5,%ymm5
	vpblendd	$0b11000000,%ymm11,%ymm6,%ymm6	# [3][4] [1][2] [4][0] [2][3]
	vpblendd	$0b11000000,%ymm9,%ymm15,%ymm15	# [3][0] [1][3] [4][1] [2][4]
	vpandn	%ymm15,%ymm6,%ymm6	# tgting [3][3] [1][1] [4][4] [2][2]
	vpxor	%ymm13,%ymm6,%ymm6

	vpermq	$0b00011110,%ymm8,%ymm4	# [0][1] [0][2] [0][4] [0][3]
	vpblendd	$0b00110000,%ymm0,%ymm4,%ymm15	# [0][1] [0][0] [0][4] [0][3]
	vpermq	$0b00111001,%ymm8,%ymm1	# [0][1] [0][4] [0][3] [0][2]
	vpblendd	$0b11000000,%ymm0,%ymm1,%ymm1	# [0][0] [0][4] [0][3] [0][2]
	vpandn	%ymm15,%ymm1,%ymm1	# tgting [0][4] [0][3] [0][2] [0][1]

	vpblendd	$0b00001100,%ymm12,%ymm11,%ymm2	# [4][1] [2][1]
	vpblendd	$0b00001100,%ymm11,%ymm13,%ymm14	# [4][2] [2][2]
	vpblendd	$0b00110000,%ymm13,%ymm2,%ymm2	# [1][1] [4][1] [2][1]
	vpblendd	$0b00110000,%ymm10,%ymm14,%ymm14	# [1][2] [4][2] [2][2]
	vpblendd	$0b11000000,%ymm10,%ymm2,%ymm2	# [3][1] [1][1] [4][1] [2][1]
	vpblendd	$0b11000000,%ymm12,%ymm14,%ymm14	# [3][2] [1][2] [4][2] [2][2]
	vpandn	%ymm14,%ymm2,%ymm2	# tgting [3][0] [1][0] [4][0] [2][0]
	vpxor	%ymm9,%ymm2,%ymm2

	vpermq	$0b00000000,%ymm7,%ymm7	# [0][0] [0][0] [0][0] [0][0]
	vpermq	$0b00011011,%ymm3,%ymm3	# post-Chi shuffle
	vpermq	$0b10001101,%ymm5,%ymm5
	vpermq	$0b01110010,%ymm6,%ymm6

	vpblendd	$0b00001100,%ymm10,%ymm13,%ymm4	# [4][3] [2][2]
	vpblendd	$0b00001100,%ymm13,%ymm12,%ymm14	# [4][4] [2][3]
	vpblendd	$0b00110000,%ymm12,%ymm4,%ymm4	# [1][4] [4][3] [2][2]
	vpblendd	$0b00110000,%ymm9,%ymm14,%ymm14	# [1][0] [4][4] [2][3]
	vpblendd	$0b11000000,%ymm9,%ymm4,%ymm4	# [3][0] [1][4] [4][3] [2][2]
	vpblendd	$0b11000000,%ymm10,%ymm14,%ymm14	# [3][1] [1][0] [4][4] [2][3]
	vpandn	%ymm14,%ymm4,%ymm4	# tgting [3][4] [1][3] [4][2] [2][1]

	vpxor	%ymm7,%ymm0,%ymm0
	vpxor	%ymm8,%ymm1,%ymm1
	vpxor	%ymm11,%ymm4,%ymm4

	######################################### Iota
	vpxor	(%r10),%ymm0,%ymm0
	lea	32(%r10),%r10

	dec	%eax
	jnz	.Loop_avx2

	ret
.size	__KeccakF1600,.-__KeccakF1600
.globl	SHA3_absorb
.type	SHA3_absorb,@function
.align	32
SHA3_absorb:
	mov	%rsp,%r11

	lea	-240(%rsp),%rsp
	and	$-32,%rsp

	lea	96(%rdi),%rdi
	lea	96(%rsi),%rsi
	lea	96(%rsp),%r10

	vzeroupper

	vpbroadcastq	-96(%rdi),%ymm0	# load A[5][5]
	vmovdqu	8+32*0-96(%rdi),%ymm1
	vmovdqu	8+32*1-96(%rdi),%ymm2
	vmovdqu	8+32*2-96(%rdi),%ymm3
	vmovdqu	8+32*3-96(%rdi),%ymm4
	vmovdqu	8+32*4-96(%rdi),%ymm5
	vmovdqu	8+32*5-96(%rdi),%ymm6

	vpxor	%ymm7,%ymm7,%ymm7
	vmovdqa	%ymm7,32*2-96(%r10)	# zero transfer area on stack
	vmovdqa	%ymm7,32*3-96(%r10)
	vmovdqa	%ymm7,32*4-96(%r10)
	vmovdqa	%ymm7,32*5-96(%r10)
	vmovdqa	%ymm7,32*6-96(%r10)

.Loop_absorb_avx2:
	mov	%rcx,%rax
	sub	%rcx,%rdx
	jc	.Ldone_absorb_avx2

	shr	$3,%eax
	vpbroadcastq	0-96(%rsi),%ymm7
	vmovdqu	8-96(%rsi),%ymm8
	sub	$4,%eax
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*5-96(%rsi),%r8
	mov	%r8,80-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*6-96(%rsi),%r8
	mov	%r8,192-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*7-96(%rsi),%r8
	mov	%r8,104-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*8-96(%rsi),%r8
	mov	%r8,144-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*9-96(%rsi),%r8
	mov	%r8,184-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*10-96(%rsi),%r8
	mov	%r8,64-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*11-96(%rsi),%r8
	mov	%r8,128-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*12-96(%rsi),%r8
	mov	%r8,200-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*13-96(%rsi),%r8
	mov	%r8,176-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*14-96(%rsi),%r8
	mov	%r8,120-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*15-96(%rsi),%r8
	mov	%r8,88-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*16-96(%rsi),%r8
	mov	%r8,96-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*17-96(%rsi),%r8
	mov	%r8,168-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*18-96(%rsi),%r8
	mov	%r8,208-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*19-96(%rsi),%r8
	mov	%r8,152-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*20-96(%rsi),%r8
	mov	%r8,72-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*21-96(%rsi),%r8
	mov	%r8,160-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*22-96(%rsi),%r8
	mov	%r8,136-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*23-96(%rsi),%r8
	mov	%r8,112-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*24-96(%rsi),%r8
	mov	%r8,216-96(%r10)
.Labsorved_avx2:
	lea	(%rsi,%rcx),%rsi

	vpxor	%ymm7,%ymm0,%ymm0
	vpxor	%ymm8,%ymm1,%ymm1
	vpxor	32*2-96(%r10),%ymm2,%ymm2
	vpxor	32*3-96(%r10),%ymm3,%ymm3
	vpxor	32*4-96(%r10),%ymm4,%ymm4
	vpxor	32*5-96(%r10),%ymm5,%ymm5
	vpxor	32*6-96(%r10),%ymm6,%ymm6

	call	__KeccakF1600

	lea	96(%rsp),%r10
	jmp	.Loop_absorb_avx2

.Ldone_absorb_avx2:
	vmovq	%xmm0,-96(%rdi)
	vmovdqu	%ymm1,8+32*0-96(%rdi)
	vmovdqu	%ymm2,8+32*1-96(%rdi)
	vmovdqu	%ymm3,8+32*2-96(%rdi)
	vmovdqu	%ymm4,8+32*3-96(%rdi)
	vmovdqu	%ymm5,8+32*4-96(%rdi)
	vmovdqu	%ymm6,8+32*5-96(%rdi)

	vzeroupper

	lea	(%r11),%rsp
	lea	(%rdx,%rcx),%rax	# return value
	ret
.size	SHA3_absorb,.-SHA3_absorb

.globl	SHA3_squeeze
.type	SHA3_squeeze,@function
.align	32
SHA3_squeeze:
	mov	%rsp,%r11

	lea	96(%rdi),%rdi
	shr	$3,%rcx

	vzeroupper

	vpbroadcastq	-96(%rdi),%ymm0
	vpxor	%ymm7,%ymm7,%ymm7
	vmovdqu	8+32*0-96(%rdi),%ymm1
	vmovdqu	8+32*1-96(%rdi),%ymm2
	vmovdqu	8+32*2-96(%rdi),%ymm3
	vmovdqu	8+32*3-96(%rdi),%ymm4
	vmovdqu	8+32*4-96(%rdi),%ymm5
	vmovdqu	8+32*5-96(%rdi),%ymm6

	mov	%rcx,%rax

.Loop_squeeze_avx2:
	mov	0-96(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	32-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	40-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	48-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	56-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	80-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	192-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	104-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	144-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	184-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	64-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	128-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	200-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	176-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	120-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	88-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	96-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	168-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	208-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	152-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	72-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	160-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	136-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	112-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	216-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	-120(%rdi),%r8
.Lextend_output_avx2:
	call	__KeccakF1600

	vmovq	%xmm0,-96(%rdi)
	vmovdqu	%ymm1,8+32*0-96(%rdi)
	vmovdqu	%ymm2,8+32*1-96(%rdi)
	vmovdqu	%ymm3,8+32*2-96(%rdi)
	vmovdqu	%ymm4,8+32*3-96(%rdi)
	vmovdqu	%ymm5,8+32*4-96(%rdi)
	vmovdqu	%ymm6,8+32*5-96(%rdi)

	mov	%rcx,%rax
	jmp	.Loop_squeeze_avx2

.Ltail_squeeze_avx2:
	add	$8,%rdx
.Loop_tail_avx2:
	mov	%r8b,(%rsi)
	lea	1(%rsi),%rsi
	shr	$8,%r8
	dec	%rdx
	jnz	.Loop_tail_avx2

.Ldone_squeeze_avx2:
	vzeroupper

	lea	(%r11),%rsp
	ret
.size	SHA3_squeeze,.-SHA3_squeeze

.section .rodata
.align	64
rhotates_left:
	.quad	3, 18, 36, 41		# [2][0] [4][0] [1][0] [3][0]
	.quad	1, 62, 28, 27		# [0][1] [0][2] [0][3] [0][4]
	.quad	45, 6, 56, 39		# [3][1] [1][2] [4][3] [2][4]
	.quad	10, 61, 55, 8		# [2][1] [4][2] [1][3] [3][4]
	.quad	2, 15, 25, 20		# [4][1] [3][2] [2][3] [1][4]
	.quad	44, 43, 21, 14		# [1][1] [2][2] [3][3] [4][4]
rhotates_right:
	.quad	64-3, 64-18, 64-36, 64-41
	.quad	64-1, 64-62, 64-28, 64-27
	.quad	64-45, 64-6, 64-56, 64-39
	.quad	64-10, 64-61, 64-55, 64-8
	.quad	64-2, 64-15, 64-25, 64-20
	.quad	64-44, 64-43, 64-21, 64-14
iotas:
	.quad	0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001
	.quad	0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082
	.quad	0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a
	.quad	0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000
	.quad	0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b
	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
	.quad	0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009
	.quad	0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a
	.quad	0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088
	.quad	0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009
	.quad	0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a
	.quad	0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b
	.quad	0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b
	.quad	0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089
	.quad	0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003
	.quad	0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002
	.quad	0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080
	.quad	0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a
	.quad	0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a
	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
	.quad	0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080
	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
	.quad	0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008

.asciz	"Keccak-1600 absorb and squeeze for AVX2, CRYPTOGAMS by <appro (at) openssl.org>"