# Home | History | Annotate | Line # | Download | only in x86_64  (source-browser header, kept as a comment)
.text

#-----------------------------------------------------------------------
# __KeccakF1600 — 24 rounds of the Keccak-f[1600] permutation (AVX2).
#
# State is held entirely in registers, in the interleaved layout used by
# SHA3_absorb/SHA3_squeeze below: %ymm0 = A[0][0] broadcast to all four
# lanes, %ymm1..%ymm6 = the remaining 24 lanes, four per register, in a
# permuted order (see the [row][col] comments in the Chi section and the
# rhotates_* tables).
#
# In/out:  %ymm0-%ymm6 (state, updated in place)
# Uses:    %r8  -> rhotates_left+96, %r9 -> rhotates_right+96
#          %r10 -> iotas, advanced 32 bytes per round (left past the
#                  table on return — callers must reload it)
#          %eax = round counter (24 down to 0)
# Clobbers: %ymm7-%ymm15, flags.  No stack usage; private (not .globl).
#-----------------------------------------------------------------------
.type	__KeccakF1600,@function
.align	32
__KeccakF1600:
	lea		rhotates_left+96(%rip),%r8
	lea		rhotates_right+96(%rip),%r9
	lea		iotas(%rip),%r10
	mov		$24,%eax
	jmp		.Loop_avx2

.align	32
.Loop_avx2:
	######################################### Theta
	# Column parities: C[x] = A[x][0]^A[x][1]^A[x][2]^A[x][3]^A[x][4]
	vpshufd		$0b01001110,%ymm2,%ymm13
	vpxor		%ymm3,%ymm5,%ymm12
	vpxor		%ymm6,%ymm4,%ymm9
	vpxor		%ymm1,%ymm12,%ymm12
	vpxor		%ymm9,%ymm12,%ymm12		# C[1..4]

	vpermq		$0b10010011,%ymm12,%ymm11
	vpxor		%ymm2,%ymm13,%ymm13
	vpermq		$0b01001110,%ymm13,%ymm7

	# ROL64 by 1 = (x>>63) | (x+x); vpaddq is the 1-bit left shift
	vpsrlq		$63,%ymm12,%ymm8
	vpaddq		%ymm12,%ymm12,%ymm9
	vpor		%ymm9,%ymm8,%ymm8	# ROL64(C[1..4],1)

	vpermq		$0b00111001,%ymm8,%ymm15
	vpxor		%ymm11,%ymm8,%ymm14
	vpermq		$0b00000000,%ymm14,%ymm14	# D[0..0] = ROL64(C[1],1) ^ C[4]

	vpxor		%ymm0,%ymm13,%ymm13
	vpxor		%ymm7,%ymm13,%ymm13		# C[0..0]

	vpsrlq		$63,%ymm13,%ymm7
	vpaddq		%ymm13,%ymm13,%ymm8
	vpor		%ymm7,%ymm8,%ymm8	# ROL64(C[0..0],1)

	vpxor		%ymm14,%ymm2,%ymm2		# ^= D[0..0]
	vpxor		%ymm14,%ymm0,%ymm0		# ^= D[0..0]

	vpblendd	$0b11000000,%ymm8,%ymm15,%ymm15
	vpblendd	$0b00000011,%ymm13,%ymm11,%ymm11
	vpxor		%ymm11,%ymm15,%ymm15		# D[1..4] = ROL64(C[2..4,0),1) ^ C[0..3]

	######################################### Rho + Pi + pre-Chi shuffle
	# Per-lane ROL64 via variable shifts: left counts from rhotates_left,
	# complementary right counts (64-n) from rhotates_right, OR-merged.
	vpsllvq		0*32-96(%r8),%ymm2,%ymm10
	vpsrlvq		0*32-96(%r9),%ymm2,%ymm2
	vpor		%ymm10,%ymm2,%ymm2

	 vpxor		%ymm15,%ymm3,%ymm3		# ^= D[1..4] from Theta
	vpsllvq		2*32-96(%r8),%ymm3,%ymm11
	vpsrlvq		2*32-96(%r9),%ymm3,%ymm3
	vpor		%ymm11,%ymm3,%ymm3

	 vpxor		%ymm15,%ymm4,%ymm4		# ^= D[1..4] from Theta
	vpsllvq		3*32-96(%r8),%ymm4,%ymm12
	vpsrlvq		3*32-96(%r9),%ymm4,%ymm4
	vpor		%ymm12,%ymm4,%ymm4

	 vpxor		%ymm15,%ymm5,%ymm5		# ^= D[1..4] from Theta
	vpsllvq		4*32-96(%r8),%ymm5,%ymm13
	vpsrlvq		4*32-96(%r9),%ymm5,%ymm5
	vpor		%ymm13,%ymm5,%ymm5

	 vpxor		%ymm15,%ymm6,%ymm6		# ^= D[1..4] from Theta
	 vpermq		$0b10001101,%ymm2,%ymm10	# %ymm2 -> future %ymm3
	 vpermq		$0b10001101,%ymm3,%ymm11	# %ymm3 -> future %ymm4
	vpsllvq		5*32-96(%r8),%ymm6,%ymm14
	vpsrlvq		5*32-96(%r9),%ymm6,%ymm8
	vpor		%ymm14,%ymm8,%ymm8	# %ymm6 -> future %ymm1

	 vpxor		%ymm15,%ymm1,%ymm1		# ^= D[1..4] from Theta
	 vpermq		$0b00011011,%ymm4,%ymm12	# %ymm4 -> future %ymm5
	 vpermq		$0b01110010,%ymm5,%ymm13	# %ymm5 -> future %ymm6
	vpsllvq		1*32-96(%r8),%ymm1,%ymm15
	vpsrlvq		1*32-96(%r9),%ymm1,%ymm9
	vpor		%ymm15,%ymm9,%ymm9	# %ymm1 -> future %ymm2

	######################################### Chi
	# A[x] ^= ~A[x+1] & A[x+2], done with vpblendd gathers of the
	# rotated lanes; bracketed comments track which [row][col] lane
	# sits in each 64-bit slot.
	vpsrldq		$8,%ymm8,%ymm14
	vpandn		%ymm14,%ymm8,%ymm7	# tgting  [0][0] [0][0] [0][0] [0][0]

	vpblendd	$0b00001100,%ymm13,%ymm9,%ymm3	#               [4][4] [2][0]
	vpblendd	$0b00001100,%ymm9,%ymm11,%ymm15	#               [4][0] [2][1]
	 vpblendd	$0b00001100,%ymm11,%ymm10,%ymm5	#               [4][2] [2][4]
	 vpblendd	$0b00001100,%ymm10,%ymm9,%ymm14	#               [4][3] [2][0]
	vpblendd	$0b00110000,%ymm11,%ymm3,%ymm3	#        [1][3] [4][4] [2][0]
	vpblendd	$0b00110000,%ymm12,%ymm15,%ymm15	#        [1][4] [4][0] [2][1]
	 vpblendd	$0b00110000,%ymm9,%ymm5,%ymm5	#        [1][0] [4][2] [2][4]
	 vpblendd	$0b00110000,%ymm13,%ymm14,%ymm14	#        [1][1] [4][3] [2][0]
	vpblendd	$0b11000000,%ymm12,%ymm3,%ymm3	# [3][2] [1][3] [4][4] [2][0]
	vpblendd	$0b11000000,%ymm13,%ymm15,%ymm15	# [3][3] [1][4] [4][0] [2][1]
	 vpblendd	$0b11000000,%ymm13,%ymm5,%ymm5	# [3][3] [1][0] [4][2] [2][4]
	 vpblendd	$0b11000000,%ymm11,%ymm14,%ymm14	# [3][4] [1][1] [4][3] [2][0]
	vpandn		%ymm15,%ymm3,%ymm3		# tgting  [3][1] [1][2] [4][3] [2][4]
	 vpandn		%ymm14,%ymm5,%ymm5		# tgting  [3][2] [1][4] [4][1] [2][3]

	vpblendd	$0b00001100,%ymm9,%ymm12,%ymm6	#               [4][0] [2][3]
	vpblendd	$0b00001100,%ymm12,%ymm10,%ymm15	#               [4][1] [2][4]
	 vpxor		%ymm10,%ymm3,%ymm3
	vpblendd	$0b00110000,%ymm10,%ymm6,%ymm6	#        [1][2] [4][0] [2][3]
	vpblendd	$0b00110000,%ymm11,%ymm15,%ymm15	#        [1][3] [4][1] [2][4]
	 vpxor		%ymm12,%ymm5,%ymm5
	vpblendd	$0b11000000,%ymm11,%ymm6,%ymm6	# [3][4] [1][2] [4][0] [2][3]
	vpblendd	$0b11000000,%ymm9,%ymm15,%ymm15	# [3][0] [1][3] [4][1] [2][4]
	vpandn		%ymm15,%ymm6,%ymm6		# tgting  [3][3] [1][1] [4][4] [2][2]
	vpxor		%ymm13,%ymm6,%ymm6

	  vpermq	$0b00011110,%ymm8,%ymm4		# [0][1] [0][2] [0][4] [0][3]
	  vpblendd	$0b00110000,%ymm0,%ymm4,%ymm15	# [0][1] [0][0] [0][4] [0][3]
	  vpermq	$0b00111001,%ymm8,%ymm1		# [0][1] [0][4] [0][3] [0][2]
	  vpblendd	$0b11000000,%ymm0,%ymm1,%ymm1	# [0][0] [0][4] [0][3] [0][2]
	  vpandn	%ymm15,%ymm1,%ymm1		# tgting  [0][4] [0][3] [0][2] [0][1]

	vpblendd	$0b00001100,%ymm12,%ymm11,%ymm2	#               [4][1] [2][1]
	vpblendd	$0b00001100,%ymm11,%ymm13,%ymm14	#               [4][2] [2][2]
	vpblendd	$0b00110000,%ymm13,%ymm2,%ymm2	#        [1][1] [4][1] [2][1]
	vpblendd	$0b00110000,%ymm10,%ymm14,%ymm14	#        [1][2] [4][2] [2][2]
	vpblendd	$0b11000000,%ymm10,%ymm2,%ymm2	# [3][1] [1][1] [4][1] [2][1]
	vpblendd	$0b11000000,%ymm12,%ymm14,%ymm14	# [3][2] [1][2] [4][2] [2][2]
	vpandn		%ymm14,%ymm2,%ymm2		# tgting  [3][0] [1][0] [4][0] [2][0]
	vpxor		%ymm9,%ymm2,%ymm2

	 vpermq		$0b00000000,%ymm7,%ymm7	# [0][0] [0][0] [0][0] [0][0]
	 vpermq		$0b00011011,%ymm3,%ymm3	# post-Chi shuffle
	 vpermq		$0b10001101,%ymm5,%ymm5
	 vpermq		$0b01110010,%ymm6,%ymm6

	vpblendd	$0b00001100,%ymm10,%ymm13,%ymm4	#               [4][3] [2][2]
	vpblendd	$0b00001100,%ymm13,%ymm12,%ymm14	#               [4][4] [2][3]
	vpblendd	$0b00110000,%ymm12,%ymm4,%ymm4	#        [1][4] [4][3] [2][2]
	vpblendd	$0b00110000,%ymm9,%ymm14,%ymm14	#        [1][0] [4][4] [2][3]
	vpblendd	$0b11000000,%ymm9,%ymm4,%ymm4	# [3][0] [1][4] [4][3] [2][2]
	vpblendd	$0b11000000,%ymm10,%ymm14,%ymm14	# [3][1] [1][0] [4][4] [2][3]
	vpandn		%ymm14,%ymm4,%ymm4		# tgting  [3][4] [1][3] [4][2] [2][1]

	vpxor		%ymm7,%ymm0,%ymm0
	vpxor		%ymm8,%ymm1,%ymm1
	vpxor		%ymm11,%ymm4,%ymm4

	######################################### Iota
	# XOR the (4-way broadcast) round constant into A[0][0]'s register
	# and advance to the next round's constant.
	vpxor		(%r10),%ymm0,%ymm0
	lea		32(%r10),%r10

	dec		%eax
	jnz		.Loop_avx2

	ret
.size	__KeccakF1600,.-__KeccakF1600
#-----------------------------------------------------------------------
# size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp,
#                    size_t len, size_t r);
# ABI:   SysV AMD64
# In:    rdi = A   (state in the interleaved AVX2 layout; A[0][0] at A+0,
#                   remaining 24 lanes at A+8 in register order)
#        rsi = inp, rdx = len (bytes), rcx = r (rate in bytes, mult. of 8)
# Out:   rax = bytes of inp left unprocessed (< r)
# Clobb: r8, r10, r11, rax, rdx, rsi, ymm0-ymm15, flags.
# Stack: ~240 bytes, realigned to 32 for vmovdqa; old rsp kept in r11.
#-----------------------------------------------------------------------
.globl	SHA3_absorb
.type	SHA3_absorb,@function
.align	32
SHA3_absorb:
	mov	%rsp,%r11		# save caller rsp (restored at exit)

	lea	-240(%rsp),%rsp
	and	$-32,%rsp		# 32-byte align for vmovdqa below

	lea	96(%rdi),%rdi		# bias pointers so all offsets fit in disp8/disp32 forms
	lea	96(%rsi),%rsi
	lea	96(%rsp),%r10		# r10 -> transfer area on stack

	vzeroupper

	vpbroadcastq	-96(%rdi),%ymm0	# load A[5][5]
	vmovdqu		8+32*0-96(%rdi),%ymm1
	vmovdqu		8+32*1-96(%rdi),%ymm2
	vmovdqu		8+32*2-96(%rdi),%ymm3
	vmovdqu		8+32*3-96(%rdi),%ymm4
	vmovdqu		8+32*4-96(%rdi),%ymm5
	vmovdqu		8+32*5-96(%rdi),%ymm6

	vpxor		%ymm7,%ymm7,%ymm7
	vmovdqa		%ymm7,32*2-96(%r10)	# zero transfer area on stack
	vmovdqa		%ymm7,32*3-96(%r10)
	vmovdqa		%ymm7,32*4-96(%r10)
	vmovdqa		%ymm7,32*5-96(%r10)
	vmovdqa		%ymm7,32*6-96(%r10)

.Loop_absorb_avx2:
	mov		%rcx,%rax
	sub		%rcx,%rdx		# len -= r
	jc		.Ldone_absorb_avx2	# less than one full block left

	shr		$3,%eax			# eax = rate in 64-bit lanes
	vpbroadcastq	0-96(%rsi),%ymm7	# block lane 0 -> all slots (for ymm0)
	vmovdqu		8-96(%rsi),%ymm8	# block lanes 1..4 (for ymm1)
	sub		$4,%eax			# lanes 1..4 consumed by the load above
	# Scatter remaining rate lanes (5..) of the input block into the
	# stack transfer area at the offsets matching the register layout;
	# stop as soon as the rate is exhausted.
	dec	%eax			# lane 0 consumed by the broadcast
	jz	.Labsorved_avx2
	mov	8*5-96(%rsi),%r8
	mov	%r8,80-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*6-96(%rsi),%r8
	mov	%r8,192-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*7-96(%rsi),%r8
	mov	%r8,104-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*8-96(%rsi),%r8
	mov	%r8,144-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*9-96(%rsi),%r8
	mov	%r8,184-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*10-96(%rsi),%r8
	mov	%r8,64-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*11-96(%rsi),%r8
	mov	%r8,128-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*12-96(%rsi),%r8
	mov	%r8,200-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*13-96(%rsi),%r8
	mov	%r8,176-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*14-96(%rsi),%r8
	mov	%r8,120-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*15-96(%rsi),%r8
	mov	%r8,88-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*16-96(%rsi),%r8
	mov	%r8,96-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*17-96(%rsi),%r8
	mov	%r8,168-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*18-96(%rsi),%r8
	mov	%r8,208-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*19-96(%rsi),%r8
	mov	%r8,152-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*20-96(%rsi),%r8
	mov	%r8,72-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*21-96(%rsi),%r8
	mov	%r8,160-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*22-96(%rsi),%r8
	mov	%r8,136-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*23-96(%rsi),%r8
	mov	%r8,112-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*24-96(%rsi),%r8
	mov	%r8,216-96(%r10)
.Labsorved_avx2:				# (sic — label spelling kept from upstream)
	lea	(%rsi,%rcx),%rsi		# inp += r

	# XOR the block into the state: lanes 0..4 directly from registers,
	# the rest via the (pre-zeroed) transfer area, then permute.
	vpxor	%ymm7,%ymm0,%ymm0
	vpxor	%ymm8,%ymm1,%ymm1
	vpxor	32*2-96(%r10),%ymm2,%ymm2
	vpxor	32*3-96(%r10),%ymm3,%ymm3
	vpxor	32*4-96(%r10),%ymm4,%ymm4
	vpxor	32*5-96(%r10),%ymm5,%ymm5
	vpxor	32*6-96(%r10),%ymm6,%ymm6

	call	__KeccakF1600

	lea	96(%rsp),%r10		# reload r10 (advanced past iotas by callee)
	jmp	.Loop_absorb_avx2

.Ldone_absorb_avx2:
	# Store the register state back to A in the same layout it was loaded.
	vmovq	%xmm0,-96(%rdi)
	vmovdqu	%ymm1,8+32*0-96(%rdi)
	vmovdqu	%ymm2,8+32*1-96(%rdi)
	vmovdqu	%ymm3,8+32*2-96(%rdi)
	vmovdqu	%ymm4,8+32*3-96(%rdi)
	vmovdqu	%ymm5,8+32*4-96(%rdi)
	vmovdqu	%ymm6,8+32*5-96(%rdi)

	vzeroupper

	lea	(%r11),%rsp		# restore caller rsp
	lea	(%rdx,%rcx),%rax		# return value
	ret
.size	SHA3_absorb,.-SHA3_absorb
    302  1.1  christos 
#-----------------------------------------------------------------------
# void SHA3_squeeze(uint64_t A[5][5], unsigned char *out,
#                   size_t len, size_t r);
# ABI:   SysV AMD64
# In:    rdi = A (state, interleaved AVX2 layout), rsi = out,
#        rdx = len (bytes to emit), rcx = r (rate in bytes, mult. of 8)
# Out:   nothing in registers; A is updated in place if the output is
#        extended past one rate's worth (extra permutations).
# Clobb: r8, r11, rax, rdx, rsi, ymm0-ymm15, flags.  No stack locals.
#
# Structure: the loop body is one unrolled stanza per state lane, read
# in the lane order of the register layout.  Each stanza:
#   - loads the lane into r8,
#   - sub $8,%rdx: borrow  -> fewer than 8 bytes wanted -> byte tail,
#                  zero    -> exactly done,
#   - dec %eax:    rate exhausted -> permute again (.Lextend_output_avx2).
#-----------------------------------------------------------------------
.globl	SHA3_squeeze
.type	SHA3_squeeze,@function
.align	32
SHA3_squeeze:
	mov	%rsp,%r11		# saved/restored for symmetry with absorb

	lea	96(%rdi),%rdi		# bias state pointer (offsets are X-96 / X-120)
	shr	$3,%rcx			# rcx = rate in 64-bit lanes

	vzeroupper

	# Keep a register copy of the state so .Lextend_output_avx2 can
	# call __KeccakF1600 directly; stored back after each permutation.
	vpbroadcastq	-96(%rdi),%ymm0
	vpxor		%ymm7,%ymm7,%ymm7
	vmovdqu		8+32*0-96(%rdi),%ymm1
	vmovdqu		8+32*1-96(%rdi),%ymm2
	vmovdqu		8+32*2-96(%rdi),%ymm3
	vmovdqu		8+32*3-96(%rdi),%ymm4
	vmovdqu		8+32*4-96(%rdi),%ymm5
	vmovdqu		8+32*5-96(%rdi),%ymm6

	mov	%rcx,%rax		# eax = lanes remaining in this block

.Loop_squeeze_avx2:
	mov	0-96(%rdi),%r8		# A[0][0]
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	32-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	40-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	48-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	56-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	80-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	192-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	104-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	144-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	184-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	64-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	128-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	200-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	176-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	120-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	88-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	96-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	168-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	208-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	152-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	72-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	160-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	136-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	112-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	216-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	-120(%rdi),%r8		# NOTE(review): upstream fall-through load;
					# presumably unreachable for valid rates
					# (eax would hit 0 first) — kept verbatim
.Lextend_output_avx2:
	call	__KeccakF1600

	# Store the permuted state back so the lane reads above see it.
	vmovq	%xmm0,-96(%rdi)
	vmovdqu	%ymm1,8+32*0-96(%rdi)
	vmovdqu	%ymm2,8+32*1-96(%rdi)
	vmovdqu	%ymm3,8+32*2-96(%rdi)
	vmovdqu	%ymm4,8+32*3-96(%rdi)
	vmovdqu	%ymm5,8+32*4-96(%rdi)
	vmovdqu	%ymm6,8+32*5-96(%rdi)

	mov	%rcx,%rax		# refill lane counter for the fresh block
	jmp	.Loop_squeeze_avx2

.Ltail_squeeze_avx2:
	add	$8,%rdx			# undo the sub: rdx = 1..7 bytes wanted
.Loop_tail_avx2:
	mov	%r8b,(%rsi)		# emit lane in r8 one byte at a time (LSB first)
	lea	1(%rsi),%rsi
	shr	$8,%r8
	dec	%rdx
	jnz	.Loop_tail_avx2

.Ldone_squeeze_avx2:
	vzeroupper

	lea	(%r11),%rsp
	ret
.size	SHA3_squeeze,.-SHA3_squeeze
    557  1.1  christos 
.section .rodata
.align	64
# Rho rotation counts, four lanes per row to match the 4-lane register
# groups used in __KeccakF1600 (row n is indexed as n*32-96 off r8/r9);
# bracketed comments give the [row][col] lane each count applies to.
rhotates_left:
	.quad	3,	18,	36,	41	# [2][0] [4][0] [1][0] [3][0]
	.quad	1,	62,	28,	27	# [0][1] [0][2] [0][3] [0][4]
	.quad	45,	6,	56,	39	# [3][1] [1][2] [4][3] [2][4]
	.quad	10,	61,	55,	8	# [2][1] [4][2] [1][3] [3][4]
	.quad	2,	15,	25,	20	# [4][1] [3][2] [2][3] [1][4]
	.quad	44,	43,	21,	14	# [1][1] [2][2] [3][3] [4][4]
# Complementary counts (64-n): ROL64(x,n) = (x<<n)|(x>>(64-n)) is built
# from vpsllvq/vpsrlvq using the two tables.
rhotates_right:
	.quad	64-3,	64-18,	64-36,	64-41
	.quad	64-1,	64-62,	64-28,	64-27
	.quad	64-45,	64-6,	64-56,	64-39
	.quad	64-10,	64-61,	64-55,	64-8
	.quad	64-2,	64-15,	64-25,	64-20
	.quad	64-44,	64-43,	64-21,	64-14
# The 24 Iota round constants (FIPS 202, Keccak-f[1600]), each repeated
# four times so a single 256-bit load XORs the same constant into every
# lane of the broadcast A[0][0] register.
iotas:
	.quad	0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001
	.quad	0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082
	.quad	0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a
	.quad	0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000
	.quad	0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b
	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
	.quad	0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009
	.quad	0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a
	.quad	0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088
	.quad	0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009
	.quad	0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a
	.quad	0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b
	.quad	0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b
	.quad	0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089
	.quad	0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003
	.quad	0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002
	.quad	0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080
	.quad	0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a
	.quad	0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a
	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
	.quad	0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080
	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
	.quad	0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008

.asciz	"Keccak-1600 absorb and squeeze for AVX2, CRYPTOGAMS by <appro (at) openssl.org>"
    601