# Keccak-1600 absorb and squeeze for x86_64/AVX512VL (see .asciz tag at end of file)
.text

#------------------------------------------------------------------------
# __KeccakF1600 — 24 rounds of the Keccak-f[1600] permutation.
#
# In:   %ymm0        = lane A[0][0] replicated across all four qwords
#       %ymm1-%ymm6  = remaining 24 state lanes, four lanes per register
#                      (interleaved layout; see rhotates_left comments)
#       %ymm16-%ymm21 = per-lane left-rotate counts, preloaded by caller
#                      from rhotates_left
# Out:  permuted state in %ymm0-%ymm6
# Clobbers: %rax, %r10, %ymm7-%ymm15, flags
#
# Round constants are read from iotas(%rip), 32 bytes (one broadcast
# quad-lane constant) per round.
#------------------------------------------------------------------------
.type	__KeccakF1600,@function
.align	32
__KeccakF1600:
	lea		iotas(%rip),%r10	# %r10 = round-constant cursor
	mov		$24,%eax		# %eax = 24 rounds to go
	jmp		.Loop_avx512vl

.align	32
.Loop_avx512vl:
	######################################### Theta
	vpshufd		$0b01001110,%ymm2,%ymm13
	vpxor		%ymm3,%ymm5,%ymm12
	vpxor		%ymm6,%ymm4,%ymm9
	vpternlogq	$0x96,%ymm1,%ymm9,%ymm12	# C[1..4]

	vpxor		%ymm2,%ymm13,%ymm13
	vpermq		$0b01001110,%ymm13,%ymm7

	vpermq		$0b10010011,%ymm12,%ymm11
	vprolq		$1,%ymm12,%ymm8		# ROL64(C[1..4],1)

	vpermq		$0b00111001,%ymm8,%ymm15
	vpxor		%ymm11,%ymm8,%ymm14
	vpermq		$0b00000000,%ymm14,%ymm14	# D[0..0] = ROL64(C[1],1) ^ C[4]

	vpternlogq	$0x96,%ymm7,%ymm0,%ymm13	# C[0..0]
	vprolq		$1,%ymm13,%ymm8		# ROL64(C[0..0],1)

	vpxor		%ymm14,%ymm0,%ymm0		# ^= D[0..0]

	vpblendd	$0b11000000,%ymm8,%ymm15,%ymm15
	vpblendd	$0b00000011,%ymm13,%ymm11,%ymm7

	######################################### Rho + Pi + pre-Chi shuffle
	 vpxor		%ymm14,%ymm2,%ymm2		# ^= D[0..0] from Theta
	vprolvq		%ymm16,%ymm2,%ymm2

	 vpternlogq	$0x96,%ymm7,%ymm15,%ymm3	# ^= D[1..4] from Theta
	vprolvq		%ymm18,%ymm3,%ymm3

	 vpternlogq	$0x96,%ymm7,%ymm15,%ymm4	# ^= D[1..4] from Theta
	vprolvq		%ymm19,%ymm4,%ymm4

	 vpternlogq	$0x96,%ymm7,%ymm15,%ymm5	# ^= D[1..4] from Theta
	vprolvq		%ymm20,%ymm5,%ymm5

	 vpermq		$0b10001101,%ymm2,%ymm10	# %ymm2 -> future %ymm3
	 vpermq		$0b10001101,%ymm3,%ymm11	# %ymm3 -> future %ymm4
	 vpternlogq	$0x96,%ymm7,%ymm15,%ymm6	# ^= D[1..4] from Theta
	vprolvq		%ymm21,%ymm6,%ymm8		# %ymm6 -> future %ymm1

	 vpermq		$0b00011011,%ymm4,%ymm12	# %ymm4 -> future %ymm5
	 vpermq		$0b01110010,%ymm5,%ymm13	# %ymm5 -> future %ymm6
	 vpternlogq	$0x96,%ymm7,%ymm15,%ymm1	# ^= D[1..4] from Theta
	vprolvq		%ymm17,%ymm1,%ymm9		# %ymm1 -> future %ymm2

	######################################### Chi
	vpblendd	$0b00001100,%ymm13,%ymm9,%ymm3	#               [4][4] [2][0]
	vpblendd	$0b00001100,%ymm9,%ymm11,%ymm15	#               [4][0] [2][1]
	 vpblendd	$0b00001100,%ymm11,%ymm10,%ymm5	#               [4][2] [2][4]
	 vpblendd	$0b00001100,%ymm10,%ymm9,%ymm14	#               [4][3] [2][0]
	vpblendd	$0b00110000,%ymm11,%ymm3,%ymm3	#        [1][3] [4][4] [2][0]
	vpblendd	$0b00110000,%ymm12,%ymm15,%ymm15	#        [1][4] [4][0] [2][1]
	 vpblendd	$0b00110000,%ymm9,%ymm5,%ymm5	#        [1][0] [4][2] [2][4]
	 vpblendd	$0b00110000,%ymm13,%ymm14,%ymm14	#        [1][1] [4][3] [2][0]
	vpblendd	$0b11000000,%ymm12,%ymm3,%ymm3	# [3][2] [1][3] [4][4] [2][0]
	vpblendd	$0b11000000,%ymm13,%ymm15,%ymm15	# [3][3] [1][4] [4][0] [2][1]
	 vpblendd	$0b11000000,%ymm13,%ymm5,%ymm5	# [3][3] [1][0] [4][2] [2][4]
	 vpblendd	$0b11000000,%ymm11,%ymm14,%ymm14	# [3][4] [1][1] [4][3] [2][0]
	vpternlogq	$0xC6,%ymm15,%ymm10,%ymm3		# [3][1] [1][2] [4][3] [2][4]
	 vpternlogq	$0xC6,%ymm14,%ymm12,%ymm5		# [3][2] [1][4] [4][1] [2][3]

	vpsrldq		$8,%ymm8,%ymm7
	vpandn		%ymm7,%ymm8,%ymm7	# tgting  [0][0] [0][0] [0][0] [0][0]

	vpblendd	$0b00001100,%ymm9,%ymm12,%ymm6	#               [4][0] [2][3]
	vpblendd	$0b00001100,%ymm12,%ymm10,%ymm15	#               [4][1] [2][4]
	vpblendd	$0b00110000,%ymm10,%ymm6,%ymm6	#        [1][2] [4][0] [2][3]
	vpblendd	$0b00110000,%ymm11,%ymm15,%ymm15	#        [1][3] [4][1] [2][4]
	vpblendd	$0b11000000,%ymm11,%ymm6,%ymm6	# [3][4] [1][2] [4][0] [2][3]
	vpblendd	$0b11000000,%ymm9,%ymm15,%ymm15	# [3][0] [1][3] [4][1] [2][4]
	vpternlogq	$0xC6,%ymm15,%ymm13,%ymm6		# [3][3] [1][1] [4][4] [2][2]

	  vpermq	$0b00011110,%ymm8,%ymm4		# [0][1] [0][2] [0][4] [0][3]
	  vpblendd	$0b00110000,%ymm0,%ymm4,%ymm15	# [0][1] [0][0] [0][4] [0][3]
	  vpermq	$0b00111001,%ymm8,%ymm1		# [0][1] [0][4] [0][3] [0][2]
	  vpblendd	$0b11000000,%ymm0,%ymm1,%ymm1	# [0][0] [0][4] [0][3] [0][2]

	vpblendd	$0b00001100,%ymm12,%ymm11,%ymm2	#               [4][1] [2][1]
	vpblendd	$0b00001100,%ymm11,%ymm13,%ymm14	#               [4][2] [2][2]
	vpblendd	$0b00110000,%ymm13,%ymm2,%ymm2	#        [1][1] [4][1] [2][1]
	vpblendd	$0b00110000,%ymm10,%ymm14,%ymm14	#        [1][2] [4][2] [2][2]
	vpblendd	$0b11000000,%ymm10,%ymm2,%ymm2	# [3][1] [1][1] [4][1] [2][1]
	vpblendd	$0b11000000,%ymm12,%ymm14,%ymm14	# [3][2] [1][2] [4][2] [2][2]
	vpternlogq	$0xC6,%ymm14,%ymm9,%ymm2		# [3][0] [1][0] [4][0] [2][0]

	 vpermq		$0b00000000,%ymm7,%ymm7	# [0][0] [0][0] [0][0] [0][0]
	 vpermq		$0b00011011,%ymm3,%ymm3		# post-Chi shuffle
	 vpermq		$0b10001101,%ymm5,%ymm5
	 vpermq		$0b01110010,%ymm6,%ymm6

	vpblendd	$0b00001100,%ymm10,%ymm13,%ymm4	#               [4][3] [2][2]
	vpblendd	$0b00001100,%ymm13,%ymm12,%ymm14	#               [4][4] [2][3]
	vpblendd	$0b00110000,%ymm12,%ymm4,%ymm4	#        [1][4] [4][3] [2][2]
	vpblendd	$0b00110000,%ymm9,%ymm14,%ymm14	#        [1][0] [4][4] [2][3]
	vpblendd	$0b11000000,%ymm9,%ymm4,%ymm4	# [3][0] [1][4] [4][3] [2][2]
	vpblendd	$0b11000000,%ymm10,%ymm14,%ymm14	# [3][1] [1][0] [4][4] [2][3]

	vpternlogq	$0xC6,%ymm15,%ymm8,%ymm1		# [0][4] [0][3] [0][2] [0][1]
	vpternlogq	$0xC6,%ymm14,%ymm11,%ymm4		# [3][4] [1][3] [4][2] [2][1]

	######################################### Iota
	vpternlogq	$0x96,(%r10),%ymm7,%ymm0	# A[0][0] ^= RC[round] (masked by Chi term)
	lea		32(%r10),%r10		# advance to next round constant

	dec		%eax
	jnz		.Loop_avx512vl

	ret
.size	__KeccakF1600,.-__KeccakF1600
#------------------------------------------------------------------------
# SHA3_absorb — absorb input into the Keccak sponge state.
# C equivalent (inferred from register usage — confirm against header):
#   size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp,
#                      size_t len, size_t bsz);
# ABI:  SysV AMD64
# In:   %rdi = A (200-byte state), %rsi = inp, %rdx = len (bytes),
#       %rcx = bsz (sponge block/rate size in bytes, multiple of 8)
# Out:  %rax = number of unprocessed trailing bytes (len mod bsz)
# Stack: 240 bytes, realigned to 32; original %rsp preserved in %r11.
# The stack area is a zero-initialized transfer buffer: full-rate qwords
# are scattered into the interleaved lane layout, then XORed into the
# state before each __KeccakF1600 call.
#------------------------------------------------------------------------
.globl	SHA3_absorb
.type	SHA3_absorb,@function
.align	32
SHA3_absorb:
	mov	%rsp,%r11			# save caller %rsp

	lea	-240(%rsp),%rsp
	and	$-32,%rsp			# 32-byte-align transfer area

	lea	96(%rdi),%rdi			# bias pointers by 96 so that
	lea	96(%rsi),%rsi			# disp8 covers the whole state
	lea	96(%rsp),%r10
	lea	rhotates_left(%rip),%r8

	vzeroupper

	vpbroadcastq	-96(%rdi),%ymm0	# load A[5][5]
	vmovdqu		8+32*0-96(%rdi),%ymm1
	vmovdqu		8+32*1-96(%rdi),%ymm2
	vmovdqu		8+32*2-96(%rdi),%ymm3
	vmovdqu		8+32*3-96(%rdi),%ymm4
	vmovdqu		8+32*4-96(%rdi),%ymm5
	vmovdqu		8+32*5-96(%rdi),%ymm6

	vmovdqa64	0*32(%r8),%ymm16		# load "rhotate" indices
	vmovdqa64	1*32(%r8),%ymm17
	vmovdqa64	2*32(%r8),%ymm18
	vmovdqa64	3*32(%r8),%ymm19
	vmovdqa64	4*32(%r8),%ymm20
	vmovdqa64	5*32(%r8),%ymm21

	vpxor		%ymm7,%ymm7,%ymm7
	vmovdqa		%ymm7,32*2-96(%r10)	# zero transfer area on stack
	vmovdqa		%ymm7,32*3-96(%r10)
	vmovdqa		%ymm7,32*4-96(%r10)
	vmovdqa		%ymm7,32*5-96(%r10)
	vmovdqa		%ymm7,32*6-96(%r10)

.Loop_absorb_avx512vl:
	mov		%rcx,%rax
	sub		%rcx,%rdx
	jc		.Ldone_absorb_avx512vl	# less than one full block left

	shr		$3,%eax			# %eax = rate in qwords
	vpbroadcastq	0-96(%rsi),%ymm7	# input qword 0 -> A[0][0] lane
	vmovdqu		8-96(%rsi),%ymm8	# input qwords 1..4
	sub		$4,%eax
	dec	%eax				# count down remaining qwords;
	jz	.Labsorbed_avx512vl		# scatter each into its lane slot
	mov	8*5-96(%rsi),%r8
	mov	%r8,80-96(%r10)
	dec	%eax
	jz	.Labsorbed_avx512vl
	mov	8*6-96(%rsi),%r8
	mov	%r8,192-96(%r10)
	dec	%eax
	jz	.Labsorbed_avx512vl
	mov	8*7-96(%rsi),%r8
	mov	%r8,104-96(%r10)
	dec	%eax
	jz	.Labsorbed_avx512vl
	mov	8*8-96(%rsi),%r8
	mov	%r8,144-96(%r10)
	dec	%eax
	jz	.Labsorbed_avx512vl
	mov	8*9-96(%rsi),%r8
	mov	%r8,184-96(%r10)
	dec	%eax
	jz	.Labsorbed_avx512vl
	mov	8*10-96(%rsi),%r8
	mov	%r8,64-96(%r10)
	dec	%eax
	jz	.Labsorbed_avx512vl
	mov	8*11-96(%rsi),%r8
	mov	%r8,128-96(%r10)
	dec	%eax
	jz	.Labsorbed_avx512vl
	mov	8*12-96(%rsi),%r8
	mov	%r8,200-96(%r10)
	dec	%eax
	jz	.Labsorbed_avx512vl
	mov	8*13-96(%rsi),%r8
	mov	%r8,176-96(%r10)
	dec	%eax
	jz	.Labsorbed_avx512vl
	mov	8*14-96(%rsi),%r8
	mov	%r8,120-96(%r10)
	dec	%eax
	jz	.Labsorbed_avx512vl
	mov	8*15-96(%rsi),%r8
	mov	%r8,88-96(%r10)
	dec	%eax
	jz	.Labsorbed_avx512vl
	mov	8*16-96(%rsi),%r8
	mov	%r8,96-96(%r10)
	dec	%eax
	jz	.Labsorbed_avx512vl
	mov	8*17-96(%rsi),%r8
	mov	%r8,168-96(%r10)
	dec	%eax
	jz	.Labsorbed_avx512vl
	mov	8*18-96(%rsi),%r8
	mov	%r8,208-96(%r10)
	dec	%eax
	jz	.Labsorbed_avx512vl
	mov	8*19-96(%rsi),%r8
	mov	%r8,152-96(%r10)
	dec	%eax
	jz	.Labsorbed_avx512vl
	mov	8*20-96(%rsi),%r8
	mov	%r8,72-96(%r10)
	dec	%eax
	jz	.Labsorbed_avx512vl
	mov	8*21-96(%rsi),%r8
	mov	%r8,160-96(%r10)
	dec	%eax
	jz	.Labsorbed_avx512vl
	mov	8*22-96(%rsi),%r8
	mov	%r8,136-96(%r10)
	dec	%eax
	jz	.Labsorbed_avx512vl
	mov	8*23-96(%rsi),%r8
	mov	%r8,112-96(%r10)
	dec	%eax
	jz	.Labsorbed_avx512vl
	mov	8*24-96(%rsi),%r8
	mov	%r8,216-96(%r10)
.Labsorbed_avx512vl:
	lea	(%rsi,%rcx),%rsi		# advance input by one block

	vpxor	%ymm7,%ymm0,%ymm0		# XOR block into sponge state
	vpxor	%ymm8,%ymm1,%ymm1
	vpxor	32*2-96(%r10),%ymm2,%ymm2
	vpxor	32*3-96(%r10),%ymm3,%ymm3
	vpxor	32*4-96(%r10),%ymm4,%ymm4
	vpxor	32*5-96(%r10),%ymm5,%ymm5
	vpxor	32*6-96(%r10),%ymm6,%ymm6

	call	__KeccakF1600

	lea	96(%rsp),%r10
	jmp	.Loop_absorb_avx512vl

.Ldone_absorb_avx512vl:
	vmovq	%xmm0,-96(%rdi)			# store state back
	vmovdqu	%ymm1,8+32*0-96(%rdi)
	vmovdqu	%ymm2,8+32*1-96(%rdi)
	vmovdqu	%ymm3,8+32*2-96(%rdi)
	vmovdqu	%ymm4,8+32*3-96(%rdi)
	vmovdqu	%ymm5,8+32*4-96(%rdi)
	vmovdqu	%ymm6,8+32*5-96(%rdi)

	vzeroupper				# avoid AVX->SSE transition penalty

	lea	(%r11),%rsp			# restore caller %rsp
	lea	(%rdx,%rcx),%rax		# return value
	ret
.size	SHA3_absorb,.-SHA3_absorb
    281 
#------------------------------------------------------------------------
# SHA3_squeeze — squeeze output bytes from the Keccak sponge state.
# C equivalent (inferred from register usage — confirm against header):
#   void SHA3_squeeze(uint64_t A[5][5], unsigned char *out,
#                     size_t len, size_t bsz);
# ABI:  SysV AMD64
# In:   %rdi = A (200-byte state), %rsi = out, %rdx = len (bytes),
#       %rcx = bsz (sponge block/rate size in bytes, multiple of 8)
# Out:  len bytes written to out; state updated in place when the rate
#       is exhausted (__KeccakF1600 is called to extend output).
# Note: lanes are gathered from the interleaved in-memory layout, hence
#       the scattered -120(%rdi)-relative offsets below; a partial final
#       qword is emitted byte-by-byte in .Loop_tail_avx512vl.
#------------------------------------------------------------------------
.globl	SHA3_squeeze
.type	SHA3_squeeze,@function
.align	32
SHA3_squeeze:
	mov	%rsp,%r11			# save caller %rsp

	lea	96(%rdi),%rdi			# bias state pointer (disp8 reach)
	lea	rhotates_left(%rip),%r8
	shr	$3,%rcx				# %rcx = rate in qwords

	vzeroupper

	vpbroadcastq	-96(%rdi),%ymm0		# load state for __KeccakF1600
	vpxor		%ymm7,%ymm7,%ymm7
	vmovdqu		8+32*0-96(%rdi),%ymm1
	vmovdqu		8+32*1-96(%rdi),%ymm2
	vmovdqu		8+32*2-96(%rdi),%ymm3
	vmovdqu		8+32*3-96(%rdi),%ymm4
	vmovdqu		8+32*4-96(%rdi),%ymm5
	vmovdqu		8+32*5-96(%rdi),%ymm6

	vmovdqa64	0*32(%r8),%ymm16		# load "rhotate" indices
	vmovdqa64	1*32(%r8),%ymm17
	vmovdqa64	2*32(%r8),%ymm18
	vmovdqa64	3*32(%r8),%ymm19
	vmovdqa64	4*32(%r8),%ymm20
	vmovdqa64	5*32(%r8),%ymm21

	mov	%rcx,%rax			# %rax = qwords left in this block

.Loop_squeeze_avx512vl:
	mov	0-96(%rdi),%r8			# next lane in rate order
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl		# fewer than 8 bytes wanted
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl		# exactly len bytes emitted
	dec	%eax
	je	.Lextend_output_avx512vl	# rate exhausted: permute again
	mov	32-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	40-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	48-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	56-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	80-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	192-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	104-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	144-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	184-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	64-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	128-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	200-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	176-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	120-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	88-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	96-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	168-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	208-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	152-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	72-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	160-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	136-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	112-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	216-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	-120(%rdi),%r8
.Lextend_output_avx512vl:
	call	__KeccakF1600			# generate next block of output

	vmovq	%xmm0,-96(%rdi)			# store permuted state
	vmovdqu	%ymm1,8+32*0-96(%rdi)
	vmovdqu	%ymm2,8+32*1-96(%rdi)
	vmovdqu	%ymm3,8+32*2-96(%rdi)
	vmovdqu	%ymm4,8+32*3-96(%rdi)
	vmovdqu	%ymm5,8+32*4-96(%rdi)
	vmovdqu	%ymm6,8+32*5-96(%rdi)

	mov	%rcx,%rax			# refill qword counter
	jmp	.Loop_squeeze_avx512vl


.Ltail_squeeze_avx512vl:
	add	$8,%rdx				# %rdx = 1..7 remaining bytes
.Loop_tail_avx512vl:
	mov	%r8b,(%rsi)			# emit low byte of current lane
	lea	1(%rsi),%rsi
	shr	$8,%r8
	dec	%rdx
	jnz	.Loop_tail_avx512vl

.Ldone_squeeze_avx512vl:
	vzeroupper				# avoid AVX->SSE transition penalty

	lea	(%r11),%rsp			# restore caller %rsp
	ret
.size	SHA3_squeeze,.-SHA3_squeeze
    544 
.section .rodata
.align	64
# Per-lane left-rotate counts for the Rho step, ordered to match the
# interleaved 4-lane register layout used by __KeccakF1600 (one .quad
# per packed lane; bracketed [row][col] comments give the Keccak index).
rhotates_left:
	.quad	3,	18,	36,	41	# [2][0] [4][0] [1][0] [3][0]
	.quad	1,	62,	28,	27	# [0][1] [0][2] [0][3] [0][4]
	.quad	45,	6,	56,	39	# [3][1] [1][2] [4][3] [2][4]
	.quad	10,	61,	55,	8	# [2][1] [4][2] [1][3] [3][4]
	.quad	2,	15,	25,	20	# [4][1] [3][2] [2][3] [1][4]
	.quad	44,	43,	21,	14	# [1][1] [2][2] [3][3] [4][4]
# The 24 Keccak-f[1600] round constants (Iota step), each replicated
# across all four qword lanes so a single 32-byte load broadcasts the
# constant; __KeccakF1600 advances through this table 32 bytes/round.
iotas:
	.quad	0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001
	.quad	0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082
	.quad	0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a
	.quad	0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000
	.quad	0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b
	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
	.quad	0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009
	.quad	0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a
	.quad	0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088
	.quad	0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009
	.quad	0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a
	.quad	0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b
	.quad	0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b
	.quad	0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089
	.quad	0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003
	.quad	0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002
	.quad	0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080
	.quad	0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a
	.quad	0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a
	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
	.quad	0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080
	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
	.quad	0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008

.asciz	"Keccak-1600 absorb and squeeze for AVX512VL, CRYPTOGAMS by <appro (at) openssl.org>"
    581