.text
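
# __KeccakF1600 runs all 24 rounds of the Keccak-f[1600] permutation on a
# state kept entirely in registers: %ymm0 holds the A[0][0] lane broadcast to
# all four quadwords, and %ymm1-%ymm6 hold the remaining 24 lanes, four per
# register, in an interleaved order that keeps the Theta/Rho/Pi/Chi shuffles
# cheap.  %r8/%r9 point at the rotation-count tables, %r10 walks the
# per-round constants, and %eax counts rounds.  The per-step comments below
# sketch the textbook formulas (standard Keccak notation, added for
# orientation; the exact lane-to-slot mapping is specific to this kernel).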
.type	__KeccakF1600,@function
.align	32
__KeccakF1600:
	lea		rhotates_left+96(%rip),%r8
	lea		rhotates_right+96(%rip),%r9
	lea		iotas(%rip),%r10
	mov		$24,%eax
	jmp		.Loop_avx2

.align	32
.Loop_avx2:
	######################################### Theta
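	# Reference for this step (standard Keccak Theta, indices mod 5;
	# added for orientation):
	#   C[x]     = A[x][0] ^ A[x][1] ^ A[x][2] ^ A[x][3] ^ A[x][4]
	#   D[x]     = C[x-1] ^ ROL64(C[x+1], 1)
	#   A[x][y] ^= D[x]
	# C[1..4] and C[0..0] are built separately below (see the comments)
	# because the columns are not packed symmetrically across the registers.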
	vpshufd		$0b01001110,%ymm2,%ymm13
	vpxor		%ymm3,%ymm5,%ymm12
	vpxor		%ymm6,%ymm4,%ymm9
	vpxor		%ymm1,%ymm12,%ymm12
	vpxor		%ymm9,%ymm12,%ymm12		# C[1..4]

	vpermq		$0b10010011,%ymm12,%ymm11
	vpxor		%ymm2,%ymm13,%ymm13
	vpermq		$0b01001110,%ymm13,%ymm7

	vpsrlq		$63,%ymm12,%ymm8
	vpaddq		%ymm12,%ymm12,%ymm9
	vpor		%ymm9,%ymm8,%ymm8	# ROL64(C[1..4],1)

	vpermq		$0b00111001,%ymm8,%ymm15
	vpxor		%ymm11,%ymm8,%ymm14
	vpermq		$0b00000000,%ymm14,%ymm14	# D[0..0] = ROL64(C[1],1) ^ C[4]

	vpxor		%ymm0,%ymm13,%ymm13
	vpxor		%ymm7,%ymm13,%ymm13		# C[0..0]

	vpsrlq		$63,%ymm13,%ymm7
	vpaddq		%ymm13,%ymm13,%ymm8
	vpor		%ymm7,%ymm8,%ymm8	# ROL64(C[0..0],1)

	vpxor		%ymm14,%ymm2,%ymm2		# ^= D[0..0]
	vpxor		%ymm14,%ymm0,%ymm0		# ^= D[0..0]

	vpblendd	$0b11000000,%ymm8,%ymm15,%ymm15
	vpblendd	$0b00000011,%ymm13,%ymm11,%ymm11
	vpxor		%ymm11,%ymm15,%ymm15		# D[1..4] = ROL64(C[2..4,0],1) ^ C[0..3]

	######################################### Rho + Pi + pre-Chi shuffle
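	# Reference (Rho rotates each lane by a fixed amount, Pi moves it to a
	# new position; combined, indices mod 5):
	#   B[y][2*x + 3*y] = ROL64(A[x][y], r[x][y])
	# AVX2 has no 64-bit rotate, so each ROL64 is a vpsllvq/vpsrlvq pair
	# driven by the per-lane counts in rhotates_left/rhotates_right, while
	# the interleaved vpermq ops start moving lanes toward the slots the
	# Chi step expects ("pre-Chi shuffle").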
	vpsllvq		0*32-96(%r8),%ymm2,%ymm10
	vpsrlvq		0*32-96(%r9),%ymm2,%ymm2
	vpor		%ymm10,%ymm2,%ymm2

	 vpxor		%ymm15,%ymm3,%ymm3		# ^= D[1..4] from Theta
	vpsllvq		2*32-96(%r8),%ymm3,%ymm11
	vpsrlvq		2*32-96(%r9),%ymm3,%ymm3
	vpor		%ymm11,%ymm3,%ymm3

	 vpxor		%ymm15,%ymm4,%ymm4		# ^= D[1..4] from Theta
	vpsllvq		3*32-96(%r8),%ymm4,%ymm12
	vpsrlvq		3*32-96(%r9),%ymm4,%ymm4
	vpor		%ymm12,%ymm4,%ymm4

	 vpxor		%ymm15,%ymm5,%ymm5		# ^= D[1..4] from Theta
	vpsllvq		4*32-96(%r8),%ymm5,%ymm13
	vpsrlvq		4*32-96(%r9),%ymm5,%ymm5
	vpor		%ymm13,%ymm5,%ymm5

	 vpxor		%ymm15,%ymm6,%ymm6		# ^= D[1..4] from Theta
	 vpermq		$0b10001101,%ymm2,%ymm10	# %ymm2 -> future %ymm3
	 vpermq		$0b10001101,%ymm3,%ymm11	# %ymm3 -> future %ymm4
	vpsllvq		5*32-96(%r8),%ymm6,%ymm14
	vpsrlvq		5*32-96(%r9),%ymm6,%ymm8
	vpor		%ymm14,%ymm8,%ymm8	# %ymm6 -> future %ymm1

	 vpxor		%ymm15,%ymm1,%ymm1		# ^= D[1..4] from Theta
	 vpermq		$0b00011011,%ymm4,%ymm12	# %ymm4 -> future %ymm5
	 vpermq		$0b01110010,%ymm5,%ymm13	# %ymm5 -> future %ymm6
	vpsllvq		1*32-96(%r8),%ymm1,%ymm15
	vpsrlvq		1*32-96(%r9),%ymm1,%ymm9
	vpor		%ymm15,%ymm9,%ymm9	# %ymm1 -> future %ymm2

	######################################### Chi
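	# Reference (Chi, the only non-linear step; first index mod 5):
	#   A[x][y] = B[x][y] ^ (~B[x+1][y] & B[x+2][y])
	# vpandn computes (~src1 & src2) in one go, so each vpandn below yields
	# the "~B[x+1] & B[x+2]" term for four lanes at once; the vpblendd
	# chains first gather the required B[x+1]/B[x+2] lanes into one
	# register, and the trailing vpxor folds in B[x][y].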
	vpsrldq		$8,%ymm8,%ymm14
	vpandn		%ymm14,%ymm8,%ymm7	# targeting  [0][0] [0][0] [0][0] [0][0]

	vpblendd	$0b00001100,%ymm13,%ymm9,%ymm3	#               [4][4] [2][0]
	vpblendd	$0b00001100,%ymm9,%ymm11,%ymm15	#               [4][0] [2][1]
	 vpblendd	$0b00001100,%ymm11,%ymm10,%ymm5	#               [4][2] [2][4]
	 vpblendd	$0b00001100,%ymm10,%ymm9,%ymm14	#               [4][3] [2][0]
	vpblendd	$0b00110000,%ymm11,%ymm3,%ymm3	#        [1][3] [4][4] [2][0]
	vpblendd	$0b00110000,%ymm12,%ymm15,%ymm15	#        [1][4] [4][0] [2][1]
	 vpblendd	$0b00110000,%ymm9,%ymm5,%ymm5	#        [1][0] [4][2] [2][4]
	 vpblendd	$0b00110000,%ymm13,%ymm14,%ymm14	#        [1][1] [4][3] [2][0]
	vpblendd	$0b11000000,%ymm12,%ymm3,%ymm3	# [3][2] [1][3] [4][4] [2][0]
	vpblendd	$0b11000000,%ymm13,%ymm15,%ymm15	# [3][3] [1][4] [4][0] [2][1]
	 vpblendd	$0b11000000,%ymm13,%ymm5,%ymm5	# [3][3] [1][0] [4][2] [2][4]
	 vpblendd	$0b11000000,%ymm11,%ymm14,%ymm14	# [3][4] [1][1] [4][3] [2][0]
	vpandn		%ymm15,%ymm3,%ymm3		# targeting  [3][1] [1][2] [4][3] [2][4]
	 vpandn		%ymm14,%ymm5,%ymm5		# targeting  [3][2] [1][4] [4][1] [2][3]

	vpblendd	$0b00001100,%ymm9,%ymm12,%ymm6	#               [4][0] [2][3]
	vpblendd	$0b00001100,%ymm12,%ymm10,%ymm15	#               [4][1] [2][4]
	 vpxor		%ymm10,%ymm3,%ymm3
	vpblendd	$0b00110000,%ymm10,%ymm6,%ymm6	#        [1][2] [4][0] [2][3]
	vpblendd	$0b00110000,%ymm11,%ymm15,%ymm15	#        [1][3] [4][1] [2][4]
	 vpxor		%ymm12,%ymm5,%ymm5
	vpblendd	$0b11000000,%ymm11,%ymm6,%ymm6	# [3][4] [1][2] [4][0] [2][3]
	vpblendd	$0b11000000,%ymm9,%ymm15,%ymm15	# [3][0] [1][3] [4][1] [2][4]
	vpandn		%ymm15,%ymm6,%ymm6		# targeting  [3][3] [1][1] [4][4] [2][2]
	vpxor		%ymm13,%ymm6,%ymm6

	  vpermq	$0b00011110,%ymm8,%ymm4		# [0][1] [0][2] [0][4] [0][3]
	  vpblendd	$0b00110000,%ymm0,%ymm4,%ymm15	# [0][1] [0][0] [0][4] [0][3]
	  vpermq	$0b00111001,%ymm8,%ymm1		# [0][1] [0][4] [0][3] [0][2]
	  vpblendd	$0b11000000,%ymm0,%ymm1,%ymm1	# [0][0] [0][4] [0][3] [0][2]
	  vpandn	%ymm15,%ymm1,%ymm1		# targeting  [0][4] [0][3] [0][2] [0][1]

	vpblendd	$0b00001100,%ymm12,%ymm11,%ymm2	#               [4][1] [2][1]
	vpblendd	$0b00001100,%ymm11,%ymm13,%ymm14	#               [4][2] [2][2]
	vpblendd	$0b00110000,%ymm13,%ymm2,%ymm2	#        [1][1] [4][1] [2][1]
	vpblendd	$0b00110000,%ymm10,%ymm14,%ymm14	#        [1][2] [4][2] [2][2]
	vpblendd	$0b11000000,%ymm10,%ymm2,%ymm2	# [3][1] [1][1] [4][1] [2][1]
	vpblendd	$0b11000000,%ymm12,%ymm14,%ymm14	# [3][2] [1][2] [4][2] [2][2]
	vpandn		%ymm14,%ymm2,%ymm2		# targeting  [3][0] [1][0] [4][0] [2][0]
	vpxor		%ymm9,%ymm2,%ymm2

	 vpermq		$0b00000000,%ymm7,%ymm7	# [0][0] [0][0] [0][0] [0][0]
	 vpermq		$0b00011011,%ymm3,%ymm3	# post-Chi shuffle
	 vpermq		$0b10001101,%ymm5,%ymm5
	 vpermq		$0b01110010,%ymm6,%ymm6

	vpblendd	$0b00001100,%ymm10,%ymm13,%ymm4	#               [4][3] [2][2]
	vpblendd	$0b00001100,%ymm13,%ymm12,%ymm14	#               [4][4] [2][3]
	vpblendd	$0b00110000,%ymm12,%ymm4,%ymm4	#        [1][4] [4][3] [2][2]
	vpblendd	$0b00110000,%ymm9,%ymm14,%ymm14	#        [1][0] [4][4] [2][3]
	vpblendd	$0b11000000,%ymm9,%ymm4,%ymm4	# [3][0] [1][4] [4][3] [2][2]
	vpblendd	$0b11000000,%ymm10,%ymm14,%ymm14	# [3][1] [1][0] [4][4] [2][3]
	vpandn		%ymm14,%ymm4,%ymm4		# targeting  [3][4] [1][3] [4][2] [2][1]

	vpxor		%ymm7,%ymm0,%ymm0
	vpxor		%ymm8,%ymm1,%ymm1
	vpxor		%ymm11,%ymm4,%ymm4

	######################################### Iota
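	# Reference: A[0][0] ^= RC[round].  Each 32-byte iotas entry holds one
	# round constant replicated four times, so a single vpxor applies it to
	# the broadcast A[0][0] register; %r10 then advances to the next one.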
	vpxor		(%r10),%ymm0,%ymm0
	lea		32(%r10),%r10

	dec		%eax
	jnz		.Loop_avx2

	ret
.size	__KeccakF1600,.-__KeccakF1600
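
# SHA3_absorb XORs full r-byte blocks of input into the state, permuting
# after each block, and returns the number of leftover bytes that did not
# fill a block.  SysV AMD64 arguments: %rdi = state, %rsi = input,
# %rdx = input length, %rcx = rate r in bytes.  A rough C-level sketch of
# the contract (hedged; the authoritative prototype lives in the caller):
#
#   size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp,
#                      size_t len, size_t r);
#   while (len >= r) {
#       /* XOR the r/8 input words into the matching state lanes; the  */
#       /* lanes sit in this kernel's own interleaved layout, which is */
#       /* why the per-word destinations below look scattered.         */
#       KeccakF1600(state);
#       inp += r;  len -= r;
#   }
#   return len;        /* leftover bytes for the caller to buffer */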
.globl	SHA3_absorb
.type	SHA3_absorb,@function
.align	32
SHA3_absorb:
	mov	%rsp,%r11

	lea	-240(%rsp),%rsp
	and	$-32,%rsp

	lea	96(%rdi),%rdi
	lea	96(%rsi),%rsi
	lea	96(%rsp),%r10

	vzeroupper

	vpbroadcastq	-96(%rdi),%ymm0	# load A[5][5]
	vmovdqu		8+32*0-96(%rdi),%ymm1
	vmovdqu		8+32*1-96(%rdi),%ymm2
	vmovdqu		8+32*2-96(%rdi),%ymm3
	vmovdqu		8+32*3-96(%rdi),%ymm4
	vmovdqu		8+32*4-96(%rdi),%ymm5
	vmovdqu		8+32*5-96(%rdi),%ymm6

	vpxor		%ymm7,%ymm7,%ymm7
	vmovdqa		%ymm7,32*2-96(%r10)	# zero transfer area on stack
	vmovdqa		%ymm7,32*3-96(%r10)
	vmovdqa		%ymm7,32*4-96(%r10)
	vmovdqa		%ymm7,32*5-96(%r10)
	vmovdqa		%ymm7,32*6-96(%r10)

.Loop_absorb_avx2:
	mov		%rcx,%rax
	sub		%rcx,%rdx
	jc		.Ldone_absorb_avx2

	shr		$3,%eax
	vpbroadcastq	0-96(%rsi),%ymm7
	vmovdqu		8-96(%rsi),%ymm8
	sub		$4,%eax
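	# Input word 0 (broadcast into %ymm7) and words 1-4 (%ymm8) are already
	# loaded; the remaining rate words are copied into the stack transfer
	# area at scattered offsets matching the interleaved lane order of
	# %ymm2-%ymm6, so the whole block can be folded in afterwards with just
	# five memory vpxor ops.  %eax now holds rate/8 - 4; each dec/jz pair
	# bails out once the block's last word has been copied.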
	dec	%eax
	jz	.Labsorbed_avx2
	mov	8*5-96(%rsi),%r8
	mov	%r8,80-96(%r10)
	dec	%eax
	jz	.Labsorbed_avx2
	mov	8*6-96(%rsi),%r8
	mov	%r8,192-96(%r10)
	dec	%eax
	jz	.Labsorbed_avx2
	mov	8*7-96(%rsi),%r8
	mov	%r8,104-96(%r10)
	dec	%eax
	jz	.Labsorbed_avx2
	mov	8*8-96(%rsi),%r8
	mov	%r8,144-96(%r10)
	dec	%eax
	jz	.Labsorbed_avx2
	mov	8*9-96(%rsi),%r8
	mov	%r8,184-96(%r10)
	dec	%eax
	jz	.Labsorbed_avx2
	mov	8*10-96(%rsi),%r8
	mov	%r8,64-96(%r10)
	dec	%eax
	jz	.Labsorbed_avx2
	mov	8*11-96(%rsi),%r8
	mov	%r8,128-96(%r10)
	dec	%eax
	jz	.Labsorbed_avx2
	mov	8*12-96(%rsi),%r8
	mov	%r8,200-96(%r10)
	dec	%eax
	jz	.Labsorbed_avx2
	mov	8*13-96(%rsi),%r8
	mov	%r8,176-96(%r10)
	dec	%eax
	jz	.Labsorbed_avx2
	mov	8*14-96(%rsi),%r8
	mov	%r8,120-96(%r10)
	dec	%eax
	jz	.Labsorbed_avx2
	mov	8*15-96(%rsi),%r8
	mov	%r8,88-96(%r10)
	dec	%eax
	jz	.Labsorbed_avx2
	mov	8*16-96(%rsi),%r8
	mov	%r8,96-96(%r10)
	dec	%eax
	jz	.Labsorbed_avx2
	mov	8*17-96(%rsi),%r8
	mov	%r8,168-96(%r10)
	dec	%eax
	jz	.Labsorbed_avx2
	mov	8*18-96(%rsi),%r8
	mov	%r8,208-96(%r10)
	dec	%eax
	jz	.Labsorbed_avx2
	mov	8*19-96(%rsi),%r8
	mov	%r8,152-96(%r10)
	dec	%eax
	jz	.Labsorbed_avx2
	mov	8*20-96(%rsi),%r8
	mov	%r8,72-96(%r10)
	dec	%eax
	jz	.Labsorbed_avx2
	mov	8*21-96(%rsi),%r8
	mov	%r8,160-96(%r10)
	dec	%eax
	jz	.Labsorbed_avx2
	mov	8*22-96(%rsi),%r8
	mov	%r8,136-96(%r10)
	dec	%eax
	jz	.Labsorbed_avx2
	mov	8*23-96(%rsi),%r8
	mov	%r8,112-96(%r10)
	dec	%eax
	jz	.Labsorbed_avx2
	mov	8*24-96(%rsi),%r8
	mov	%r8,216-96(%r10)
.Labsorbed_avx2:
	lea	(%rsi,%rcx),%rsi

	vpxor	%ymm7,%ymm0,%ymm0
	vpxor	%ymm8,%ymm1,%ymm1
	vpxor	32*2-96(%r10),%ymm2,%ymm2
	vpxor	32*3-96(%r10),%ymm3,%ymm3
	vpxor	32*4-96(%r10),%ymm4,%ymm4
	vpxor	32*5-96(%r10),%ymm5,%ymm5
	vpxor	32*6-96(%r10),%ymm6,%ymm6

	call	__KeccakF1600

	lea	96(%rsp),%r10
	jmp	.Loop_absorb_avx2

.Ldone_absorb_avx2:
	vmovq	%xmm0,-96(%rdi)
	vmovdqu	%ymm1,8+32*0-96(%rdi)
	vmovdqu	%ymm2,8+32*1-96(%rdi)
	vmovdqu	%ymm3,8+32*2-96(%rdi)
	vmovdqu	%ymm4,8+32*3-96(%rdi)
	vmovdqu	%ymm5,8+32*4-96(%rdi)
	vmovdqu	%ymm6,8+32*5-96(%rdi)

	vzeroupper

	lea	(%r11),%rsp
	lea	(%rdx,%rcx),%rax		# return value
	ret
.size	SHA3_absorb,.-SHA3_absorb
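
# SHA3_squeeze writes len bytes of output, running the permutation each time
# a full r-byte block of the state has been emitted.  SysV AMD64 arguments:
# %rdi = state, %rsi = output, %rdx = output length, %rcx = rate r in bytes
# (converted to qwords right away).  Roughly, as a hedged C-level sketch of
# the contract (not necessarily the exact library prototype):
#
#   void SHA3_squeeze(uint64_t A[5][5], unsigned char *out,
#                     size_t len, size_t r);
#   for (;;) {
#       n = len < r ? len : r;
#       /* copy n bytes of the state, in canonical lane order, to out */
#       out += n;  len -= n;
#       if (len == 0) break;
#       KeccakF1600(A);
#   }
#
# The unrolled loads below walk the lanes in canonical order, which becomes
# scattered %rdi offsets because of the kernel's interleaved state layout;
# .Ltail_squeeze_avx2 handles a final partial lane byte by byte.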
.globl	SHA3_squeeze
.type	SHA3_squeeze,@function
.align	32
SHA3_squeeze:
	mov	%rsp,%r11

	lea	96(%rdi),%rdi
	shr	$3,%rcx

	vzeroupper

	vpbroadcastq	-96(%rdi),%ymm0
	vpxor		%ymm7,%ymm7,%ymm7
	vmovdqu		8+32*0-96(%rdi),%ymm1
	vmovdqu		8+32*1-96(%rdi),%ymm2
	vmovdqu		8+32*2-96(%rdi),%ymm3
	vmovdqu		8+32*3-96(%rdi),%ymm4
	vmovdqu		8+32*4-96(%rdi),%ymm5
	vmovdqu		8+32*5-96(%rdi),%ymm6

	mov	%rcx,%rax

.Loop_squeeze_avx2:
	mov	0-96(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	32-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	40-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	48-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	56-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	80-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	192-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	104-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	144-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	184-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	64-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	128-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	200-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	176-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	120-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	88-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	96-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	168-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	208-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	152-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	72-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	160-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	136-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	112-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	216-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	-120(%rdi),%r8
.Lextend_output_avx2:
	call	__KeccakF1600

	vmovq	%xmm0,-96(%rdi)
	vmovdqu	%ymm1,8+32*0-96(%rdi)
	vmovdqu	%ymm2,8+32*1-96(%rdi)
	vmovdqu	%ymm3,8+32*2-96(%rdi)
	vmovdqu	%ymm4,8+32*3-96(%rdi)
	vmovdqu	%ymm5,8+32*4-96(%rdi)
	vmovdqu	%ymm6,8+32*5-96(%rdi)

	mov	%rcx,%rax
	jmp	.Loop_squeeze_avx2


.Ltail_squeeze_avx2:
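	# Fewer than 8 output bytes remain: %r8 already holds the next lane,
	# so restore the true byte count and emit it one byte at a time.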
	add	$8,%rdx
.Loop_tail_avx2:
	mov	%r8b,(%rsi)
	lea	1(%rsi),%rsi
	shr	$8,%r8
	dec	%rdx
	jnz	.Loop_tail_avx2

.Ldone_squeeze_avx2:
	vzeroupper

	lea	(%r11),%rsp
	ret
.size	SHA3_squeeze,.-SHA3_squeeze

.section .rodata
.align	64
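# Rho rotation counts, one per lane in the kernel's register layout (the
# bracketed indices name the state lane each count applies to).  AVX2 has no
# 64-bit rotate, so ROL64(x,n) is computed as (x << n) | (x >> (64-n));
# rhotates_left holds the left-shift counts and rhotates_right the matching
# 64-n right-shift counts, consumed by the vpsllvq/vpsrlvq pairs above.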
rhotates_left:
	.quad	3,	18,	36,	41	# [2][0] [4][0] [1][0] [3][0]
	.quad	1,	62,	28,	27	# [0][1] [0][2] [0][3] [0][4]
	.quad	45,	6,	56,	39	# [3][1] [1][2] [4][3] [2][4]
	.quad	10,	61,	55,	8	# [2][1] [4][2] [1][3] [3][4]
	.quad	2,	15,	25,	20	# [4][1] [3][2] [2][3] [1][4]
	.quad	44,	43,	21,	14	# [1][1] [2][2] [3][3] [4][4]
rhotates_right:
	.quad	64-3,	64-18,	64-36,	64-41
	.quad	64-1,	64-62,	64-28,	64-27
	.quad	64-45,	64-6,	64-56,	64-39
	.quad	64-10,	64-61,	64-55,	64-8
	.quad	64-2,	64-15,	64-25,	64-20
	.quad	64-44,	64-43,	64-21,	64-14
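# The 24 Keccak-f[1600] round constants, each replicated four times so that
# one 32-byte vpxor per round applies RC[i] to the broadcast A[0][0] register.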
iotas:
	.quad	0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001
	.quad	0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082
	.quad	0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a
	.quad	0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000
	.quad	0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b
	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
	.quad	0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009
	.quad	0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a
	.quad	0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088
	.quad	0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009
	.quad	0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a
	.quad	0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b
	.quad	0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b
	.quad	0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089
	.quad	0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003
	.quad	0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002
	.quad	0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080
	.quad	0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a
	.quad	0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a
	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
	.quad	0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080
	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
	.quad	0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008

.asciz	"Keccak-1600 absorb and squeeze for AVX2, CRYPTOGAMS by <appro (at) openssl.org>"