# (cvsweb navigation residue, neutralized as a comment) Home | History | Annotate | Line # | Download | only in x86_64
      1  1.1  christos .text
      2  1.1  christos 
      3  1.1  christos .type	__KeccakF1600,@function
#-----------------------------------------------------------------------
# __KeccakF1600 -- Keccak-f[1600] permutation, AVX512VL flavour (local
# helper, no .globl; called by SHA3_absorb/SHA3_squeeze below).
# In:    state in %ymm0..%ymm6; %ymm0 = A[0][0] broadcast to all four
#        lanes, the remaining 24 lanes interleaved across %ymm1..%ymm6
#        (lane order per the bracketed [row][col] comments below).
#        %ymm16..%ymm21 must be preloaded with the rhotates_left rotate
#        counts -- both callers do this before the call.
# Out:   permuted state in %ymm0..%ymm6.
# Clobb: %eax (round counter), %r10 (iotas pointer), %ymm7..%ymm15, flags.
# Each round consumes one 32-byte entry of iotas (round constant
# broadcast 4x) via the Iota vpternlogq at the bottom of the loop.
#-----------------------------------------------------------------------
      4  1.1  christos .align	32
      5  1.1  christos __KeccakF1600:
      6  1.1  christos 	lea		iotas(%rip),%r10
      7  1.1  christos 	mov		$24,%eax		# 24 rounds of Keccak-f[1600]
      8  1.1  christos 	jmp		.Loop_avx512vl
      9  1.1  christos 
     10  1.1  christos .align	32
     11  1.1  christos .Loop_avx512vl:
     12  1.1  christos 	######################################### Theta
# vpternlogq $0x96 = three-way XOR (a^b^c): column parities C[] are
# built with two vpxor plus one ternlog per group.
     13  1.1  christos 	vpshufd		$0b01001110,%ymm2,%ymm13
     14  1.1  christos 	vpxor		%ymm3,%ymm5,%ymm12
     15  1.1  christos 	vpxor		%ymm6,%ymm4,%ymm9
     16  1.1  christos 	vpternlogq	$0x96,%ymm1,%ymm9,%ymm12	# C[1..4]
     17  1.1  christos 
     18  1.1  christos 	vpxor		%ymm2,%ymm13,%ymm13
     19  1.1  christos 	vpermq		$0b01001110,%ymm13,%ymm7
     20  1.1  christos 
     21  1.1  christos 	vpermq		$0b10010011,%ymm12,%ymm11
     22  1.1  christos 	vprolq		$1,%ymm12,%ymm8		# ROL64(C[1..4],1)
     23  1.1  christos 
     24  1.1  christos 	vpermq		$0b00111001,%ymm8,%ymm15
     25  1.1  christos 	vpxor		%ymm11,%ymm8,%ymm14
     26  1.1  christos 	vpermq		$0b00000000,%ymm14,%ymm14	# D[0..0] = ROL64(C[1],1) ^ C[4]
     27  1.1  christos 
     28  1.1  christos 	vpternlogq	$0x96,%ymm7,%ymm0,%ymm13	# C[0..0]
     29  1.1  christos 	vprolq		$1,%ymm13,%ymm8		# ROL64(C[0..0],1)
     30  1.1  christos 
     31  1.1  christos 	vpxor		%ymm14,%ymm0,%ymm0		# ^= D[0..0]
     32  1.1  christos 
     33  1.1  christos 	vpblendd	$0b11000000,%ymm8,%ymm15,%ymm15
     34  1.1  christos 	vpblendd	$0b00000011,%ymm13,%ymm11,%ymm7
     35  1.1  christos 
     36  1.1  christos 	######################################### Rho + Pi + pre-Chi shuffle
# Theta's D[] is folded in with vpternlogq (x ^= D), then each register
# is rotated lane-wise by its rhotates_left entry (vprolvq with
# %ymm16..%ymm21); the interleaved vpermq results feed the Chi blends.
     37  1.1  christos 	 vpxor		%ymm14,%ymm2,%ymm2		# ^= D[0..0] from Theta
     38  1.1  christos 	vprolvq		%ymm16,%ymm2,%ymm2
     39  1.1  christos 
     40  1.1  christos 	 vpternlogq	$0x96,%ymm7,%ymm15,%ymm3	# ^= D[1..4] from Theta
     41  1.1  christos 	vprolvq		%ymm18,%ymm3,%ymm3
     42  1.1  christos 
     43  1.1  christos 	 vpternlogq	$0x96,%ymm7,%ymm15,%ymm4	# ^= D[1..4] from Theta
     44  1.1  christos 	vprolvq		%ymm19,%ymm4,%ymm4
     45  1.1  christos 
     46  1.1  christos 	 vpternlogq	$0x96,%ymm7,%ymm15,%ymm5	# ^= D[1..4] from Theta
     47  1.1  christos 	vprolvq		%ymm20,%ymm5,%ymm5
     48  1.1  christos 
     49  1.1  christos 	 vpermq		$0b10001101,%ymm2,%ymm10	# %ymm2 -> future %ymm3
     50  1.1  christos 	 vpermq		$0b10001101,%ymm3,%ymm11	# %ymm3 -> future %ymm4
     51  1.1  christos 	 vpternlogq	$0x96,%ymm7,%ymm15,%ymm6	# ^= D[1..4] from Theta
     52  1.1  christos 	vprolvq		%ymm21,%ymm6,%ymm8		# %ymm6 -> future %ymm1
     53  1.1  christos 
     54  1.1  christos 	 vpermq		$0b00011011,%ymm4,%ymm12	# %ymm4 -> future %ymm5
     55  1.1  christos 	 vpermq		$0b01110010,%ymm5,%ymm13	# %ymm5 -> future %ymm6
     56  1.1  christos 	 vpternlogq	$0x96,%ymm7,%ymm15,%ymm1	# ^= D[1..4] from Theta
     57  1.1  christos 	vprolvq		%ymm17,%ymm1,%ymm9		# %ymm1 -> future %ymm2
     58  1.1  christos 
     59  1.1  christos 	######################################### Chi
# Chi (a ^= ~b & c) is done with vpternlogq $0xC6 on lane groups
# assembled by the vpblendd chains; bracketed comments track which
# [row][col] lane sits in each 64-bit slot after every blend.
     60  1.1  christos 	vpblendd	$0b00001100,%ymm13,%ymm9,%ymm3	#               [4][4] [2][0]
     61  1.1  christos 	vpblendd	$0b00001100,%ymm9,%ymm11,%ymm15	#               [4][0] [2][1]
     62  1.1  christos 	 vpblendd	$0b00001100,%ymm11,%ymm10,%ymm5	#               [4][2] [2][4]
     63  1.1  christos 	 vpblendd	$0b00001100,%ymm10,%ymm9,%ymm14	#               [4][3] [2][0]
     64  1.1  christos 	vpblendd	$0b00110000,%ymm11,%ymm3,%ymm3	#        [1][3] [4][4] [2][0]
     65  1.1  christos 	vpblendd	$0b00110000,%ymm12,%ymm15,%ymm15	#        [1][4] [4][0] [2][1]
     66  1.1  christos 	 vpblendd	$0b00110000,%ymm9,%ymm5,%ymm5	#        [1][0] [4][2] [2][4]
     67  1.1  christos 	 vpblendd	$0b00110000,%ymm13,%ymm14,%ymm14	#        [1][1] [4][3] [2][0]
     68  1.1  christos 	vpblendd	$0b11000000,%ymm12,%ymm3,%ymm3	# [3][2] [1][3] [4][4] [2][0]
     69  1.1  christos 	vpblendd	$0b11000000,%ymm13,%ymm15,%ymm15	# [3][3] [1][4] [4][0] [2][1]
     70  1.1  christos 	 vpblendd	$0b11000000,%ymm13,%ymm5,%ymm5	# [3][3] [1][0] [4][2] [2][4]
     71  1.1  christos 	 vpblendd	$0b11000000,%ymm11,%ymm14,%ymm14	# [3][4] [1][1] [4][3] [2][0]
     72  1.1  christos 	vpternlogq	$0xC6,%ymm15,%ymm10,%ymm3		# [3][1] [1][2] [4][3] [2][4]
     73  1.1  christos 	 vpternlogq	$0xC6,%ymm14,%ymm12,%ymm5		# [3][2] [1][4] [4][1] [2][3]
     74  1.1  christos 
     75  1.1  christos 	vpsrldq		$8,%ymm8,%ymm7
     76  1.1  christos 	vpandn		%ymm7,%ymm8,%ymm7	# tgting  [0][0] [0][0] [0][0] [0][0]
     77  1.1  christos 
     78  1.1  christos 	vpblendd	$0b00001100,%ymm9,%ymm12,%ymm6	#               [4][0] [2][3]
     79  1.1  christos 	vpblendd	$0b00001100,%ymm12,%ymm10,%ymm15	#               [4][1] [2][4]
     80  1.1  christos 	vpblendd	$0b00110000,%ymm10,%ymm6,%ymm6	#        [1][2] [4][0] [2][3]
     81  1.1  christos 	vpblendd	$0b00110000,%ymm11,%ymm15,%ymm15	#        [1][3] [4][1] [2][4]
     82  1.1  christos 	vpblendd	$0b11000000,%ymm11,%ymm6,%ymm6	# [3][4] [1][2] [4][0] [2][3]
     83  1.1  christos 	vpblendd	$0b11000000,%ymm9,%ymm15,%ymm15	# [3][0] [1][3] [4][1] [2][4]
     84  1.1  christos 	vpternlogq	$0xC6,%ymm15,%ymm13,%ymm6		# [3][3] [1][1] [4][4] [2][2]
     85  1.1  christos 
     86  1.1  christos 	  vpermq	$0b00011110,%ymm8,%ymm4		# [0][1] [0][2] [0][4] [0][3]
     87  1.1  christos 	  vpblendd	$0b00110000,%ymm0,%ymm4,%ymm15	# [0][1] [0][0] [0][4] [0][3]
     88  1.1  christos 	  vpermq	$0b00111001,%ymm8,%ymm1		# [0][1] [0][4] [0][3] [0][2]
     89  1.1  christos 	  vpblendd	$0b11000000,%ymm0,%ymm1,%ymm1	# [0][0] [0][4] [0][3] [0][2]
     90  1.1  christos 
     91  1.1  christos 	vpblendd	$0b00001100,%ymm12,%ymm11,%ymm2	#               [4][1] [2][1]
     92  1.1  christos 	vpblendd	$0b00001100,%ymm11,%ymm13,%ymm14	#               [4][2] [2][2]
     93  1.1  christos 	vpblendd	$0b00110000,%ymm13,%ymm2,%ymm2	#        [1][1] [4][1] [2][1]
     94  1.1  christos 	vpblendd	$0b00110000,%ymm10,%ymm14,%ymm14	#        [1][2] [4][2] [2][2]
     95  1.1  christos 	vpblendd	$0b11000000,%ymm10,%ymm2,%ymm2	# [3][1] [1][1] [4][1] [2][1]
     96  1.1  christos 	vpblendd	$0b11000000,%ymm12,%ymm14,%ymm14	# [3][2] [1][2] [4][2] [2][2]
     97  1.1  christos 	vpternlogq	$0xC6,%ymm14,%ymm9,%ymm2		# [3][0] [1][0] [4][0] [2][0]
     98  1.1  christos 
     99  1.1  christos 	 vpermq		$0b00000000,%ymm7,%ymm7	# [0][0] [0][0] [0][0] [0][0]
    100  1.1  christos 	 vpermq		$0b00011011,%ymm3,%ymm3		# post-Chi shuffle
    101  1.1  christos 	 vpermq		$0b10001101,%ymm5,%ymm5
    102  1.1  christos 	 vpermq		$0b01110010,%ymm6,%ymm6
    103  1.1  christos 
    104  1.1  christos 	vpblendd	$0b00001100,%ymm10,%ymm13,%ymm4	#               [4][3] [2][2]
    105  1.1  christos 	vpblendd	$0b00001100,%ymm13,%ymm12,%ymm14	#               [4][4] [2][3]
    106  1.1  christos 	vpblendd	$0b00110000,%ymm12,%ymm4,%ymm4	#        [1][4] [4][3] [2][2]
    107  1.1  christos 	vpblendd	$0b00110000,%ymm9,%ymm14,%ymm14	#        [1][0] [4][4] [2][3]
    108  1.1  christos 	vpblendd	$0b11000000,%ymm9,%ymm4,%ymm4	# [3][0] [1][4] [4][3] [2][2]
    109  1.1  christos 	vpblendd	$0b11000000,%ymm10,%ymm14,%ymm14	# [3][1] [1][0] [4][4] [2][3]
    110  1.1  christos 
    111  1.1  christos 	vpternlogq	$0xC6,%ymm15,%ymm8,%ymm1		# [0][4] [0][3] [0][2] [0][1]
    112  1.1  christos 	vpternlogq	$0xC6,%ymm14,%ymm11,%ymm4		# [3][4] [1][3] [4][2] [2][1]
    113  1.1  christos 
    114  1.1  christos 	######################################### Iota
# XOR the broadcast round constant (and Chi's [0][0] term in %ymm7)
# into %ymm0, then advance %r10 to the next 32-byte iotas entry.
    115  1.1  christos 	vpternlogq	$0x96,(%r10),%ymm7,%ymm0
    116  1.1  christos 	lea		32(%r10),%r10
    117  1.1  christos 
    118  1.1  christos 	dec		%eax
    119  1.1  christos 	jnz		.Loop_avx512vl
    120  1.1  christos 
    121  1.1  christos 	ret
    122  1.1  christos .size	__KeccakF1600,.-__KeccakF1600
    123  1.1  christos .globl	SHA3_absorb
#-----------------------------------------------------------------------
# size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp,
#                    size_t len, size_t bsz)
# SysV AMD64: %rdi = state A, %rsi = input, %rdx = len, %rcx = bsz
# (block/rate size in bytes).  Absorbs full bsz-byte blocks into the
# state, running __KeccakF1600 after each, and returns in %rax the
# number of trailing bytes (< bsz) left unabsorbed (lea (%rdx,%rcx)
# undoes the final underflowing sub).
# %r11 preserves the caller's %rsp across the 240-byte, 32-aligned
# scratch ("transfer") area allocated below; state is loaded once into
# %ymm0..%ymm6 and stored back only at the end.
# NOTE(review): lane scatter offsets below encode the interleaved
# %ymm1..%ymm6 lane order -- presumably generated by the CRYPTOGAMS
# keccak1600 module; confirm against the generator before touching.
#-----------------------------------------------------------------------
    124  1.1  christos .type	SHA3_absorb,@function
    125  1.1  christos .align	32
    126  1.1  christos SHA3_absorb:
    127  1.1  christos 	mov	%rsp,%r11
    128  1.1  christos 
    129  1.1  christos 	lea	-240(%rsp),%rsp
    130  1.1  christos 	and	$-32,%rsp
    131  1.1  christos 
    132  1.1  christos 	lea	96(%rdi),%rdi
    133  1.1  christos 	lea	96(%rsi),%rsi
    134  1.1  christos 	lea	96(%rsp),%r10
    135  1.1  christos 	lea	rhotates_left(%rip),%r8
    136  1.1  christos 
    137  1.1  christos 	vzeroupper
    138  1.1  christos 
    139  1.1  christos 	vpbroadcastq	-96(%rdi),%ymm0	# load A[5][5]
    140  1.1  christos 	vmovdqu		8+32*0-96(%rdi),%ymm1
    141  1.1  christos 	vmovdqu		8+32*1-96(%rdi),%ymm2
    142  1.1  christos 	vmovdqu		8+32*2-96(%rdi),%ymm3
    143  1.1  christos 	vmovdqu		8+32*3-96(%rdi),%ymm4
    144  1.1  christos 	vmovdqu		8+32*4-96(%rdi),%ymm5
    145  1.1  christos 	vmovdqu		8+32*5-96(%rdi),%ymm6
    146  1.1  christos 
    147  1.1  christos 	vmovdqa64	0*32(%r8),%ymm16		# load "rhotate" indices
    148  1.1  christos 	vmovdqa64	1*32(%r8),%ymm17
    149  1.1  christos 	vmovdqa64	2*32(%r8),%ymm18
    150  1.1  christos 	vmovdqa64	3*32(%r8),%ymm19
    151  1.1  christos 	vmovdqa64	4*32(%r8),%ymm20
    152  1.1  christos 	vmovdqa64	5*32(%r8),%ymm21
    153  1.1  christos 
    154  1.1  christos 	vpxor		%ymm7,%ymm7,%ymm7
    155  1.1  christos 	vmovdqa		%ymm7,32*2-96(%r10)	# zero transfer area on stack
    156  1.1  christos 	vmovdqa		%ymm7,32*3-96(%r10)
    157  1.1  christos 	vmovdqa		%ymm7,32*4-96(%r10)
    158  1.1  christos 	vmovdqa		%ymm7,32*5-96(%r10)
    159  1.1  christos 	vmovdqa		%ymm7,32*6-96(%r10)
    160  1.1  christos 
    161  1.1  christos .Loop_absorb_avx512vl:
    162  1.1  christos 	mov		%rcx,%rax
    163  1.1  christos 	sub		%rcx,%rdx
    164  1.1  christos 	jc		.Ldone_absorb_avx512vl	# less than bsz bytes left
    165  1.1  christos 
# One block: %eax = bsz/8 = lane count.  Lane 0 is broadcast into
# %ymm7, lanes 1..4 go straight to %ymm8 (hence the sub $4 / dec = -5
# before the first check); lanes 5..24 are scattered through the stack
# transfer area at the offsets matching the interleaved register layout,
# then XORed into the state 32 bytes at a time below .Labsorved.
    166  1.1  christos 	shr		$3,%eax
    167  1.1  christos 	vpbroadcastq	0-96(%rsi),%ymm7
    168  1.1  christos 	vmovdqu		8-96(%rsi),%ymm8
    169  1.1  christos 	sub		$4,%eax
    170  1.1  christos 	dec	%eax
    171  1.1  christos 	jz	.Labsorved_avx512vl
    172  1.1  christos 	mov	8*5-96(%rsi),%r8
    173  1.1  christos 	mov	%r8,80-96(%r10)
    174  1.1  christos 	dec	%eax
    175  1.1  christos 	jz	.Labsorved_avx512vl
    176  1.1  christos 	mov	8*6-96(%rsi),%r8
    177  1.1  christos 	mov	%r8,192-96(%r10)
    178  1.1  christos 	dec	%eax
    179  1.1  christos 	jz	.Labsorved_avx512vl
    180  1.1  christos 	mov	8*7-96(%rsi),%r8
    181  1.1  christos 	mov	%r8,104-96(%r10)
    182  1.1  christos 	dec	%eax
    183  1.1  christos 	jz	.Labsorved_avx512vl
    184  1.1  christos 	mov	8*8-96(%rsi),%r8
    185  1.1  christos 	mov	%r8,144-96(%r10)
    186  1.1  christos 	dec	%eax
    187  1.1  christos 	jz	.Labsorved_avx512vl
    188  1.1  christos 	mov	8*9-96(%rsi),%r8
    189  1.1  christos 	mov	%r8,184-96(%r10)
    190  1.1  christos 	dec	%eax
    191  1.1  christos 	jz	.Labsorved_avx512vl
    192  1.1  christos 	mov	8*10-96(%rsi),%r8
    193  1.1  christos 	mov	%r8,64-96(%r10)
    194  1.1  christos 	dec	%eax
    195  1.1  christos 	jz	.Labsorved_avx512vl
    196  1.1  christos 	mov	8*11-96(%rsi),%r8
    197  1.1  christos 	mov	%r8,128-96(%r10)
    198  1.1  christos 	dec	%eax
    199  1.1  christos 	jz	.Labsorved_avx512vl
    200  1.1  christos 	mov	8*12-96(%rsi),%r8
    201  1.1  christos 	mov	%r8,200-96(%r10)
    202  1.1  christos 	dec	%eax
    203  1.1  christos 	jz	.Labsorved_avx512vl
    204  1.1  christos 	mov	8*13-96(%rsi),%r8
    205  1.1  christos 	mov	%r8,176-96(%r10)
    206  1.1  christos 	dec	%eax
    207  1.1  christos 	jz	.Labsorved_avx512vl
    208  1.1  christos 	mov	8*14-96(%rsi),%r8
    209  1.1  christos 	mov	%r8,120-96(%r10)
    210  1.1  christos 	dec	%eax
    211  1.1  christos 	jz	.Labsorved_avx512vl
    212  1.1  christos 	mov	8*15-96(%rsi),%r8
    213  1.1  christos 	mov	%r8,88-96(%r10)
    214  1.1  christos 	dec	%eax
    215  1.1  christos 	jz	.Labsorved_avx512vl
    216  1.1  christos 	mov	8*16-96(%rsi),%r8
    217  1.1  christos 	mov	%r8,96-96(%r10)
    218  1.1  christos 	dec	%eax
    219  1.1  christos 	jz	.Labsorved_avx512vl
    220  1.1  christos 	mov	8*17-96(%rsi),%r8
    221  1.1  christos 	mov	%r8,168-96(%r10)
    222  1.1  christos 	dec	%eax
    223  1.1  christos 	jz	.Labsorved_avx512vl
    224  1.1  christos 	mov	8*18-96(%rsi),%r8
    225  1.1  christos 	mov	%r8,208-96(%r10)
    226  1.1  christos 	dec	%eax
    227  1.1  christos 	jz	.Labsorved_avx512vl
    228  1.1  christos 	mov	8*19-96(%rsi),%r8
    229  1.1  christos 	mov	%r8,152-96(%r10)
    230  1.1  christos 	dec	%eax
    231  1.1  christos 	jz	.Labsorved_avx512vl
    232  1.1  christos 	mov	8*20-96(%rsi),%r8
    233  1.1  christos 	mov	%r8,72-96(%r10)
    234  1.1  christos 	dec	%eax
    235  1.1  christos 	jz	.Labsorved_avx512vl
    236  1.1  christos 	mov	8*21-96(%rsi),%r8
    237  1.1  christos 	mov	%r8,160-96(%r10)
    238  1.1  christos 	dec	%eax
    239  1.1  christos 	jz	.Labsorved_avx512vl
    240  1.1  christos 	mov	8*22-96(%rsi),%r8
    241  1.1  christos 	mov	%r8,136-96(%r10)
    242  1.1  christos 	dec	%eax
    243  1.1  christos 	jz	.Labsorved_avx512vl
    244  1.1  christos 	mov	8*23-96(%rsi),%r8
    245  1.1  christos 	mov	%r8,112-96(%r10)
    246  1.1  christos 	dec	%eax
    247  1.1  christos 	jz	.Labsorved_avx512vl
    248  1.1  christos 	mov	8*24-96(%rsi),%r8
    249  1.1  christos 	mov	%r8,216-96(%r10)
# NOTE(review): "absorved" (sic) -- misspelled label inherited from the
# generator; purely internal, kept as-is for byte-identical output.
    250  1.1  christos .Labsorved_avx512vl:
    251  1.1  christos 	lea	(%rsi,%rcx),%rsi	# inp += bsz
    252  1.1  christos 
    253  1.1  christos 	vpxor	%ymm7,%ymm0,%ymm0
    254  1.1  christos 	vpxor	%ymm8,%ymm1,%ymm1
    255  1.1  christos 	vpxor	32*2-96(%r10),%ymm2,%ymm2
    256  1.1  christos 	vpxor	32*3-96(%r10),%ymm3,%ymm3
    257  1.1  christos 	vpxor	32*4-96(%r10),%ymm4,%ymm4
    258  1.1  christos 	vpxor	32*5-96(%r10),%ymm5,%ymm5
    259  1.1  christos 	vpxor	32*6-96(%r10),%ymm6,%ymm6
    260  1.1  christos 
    261  1.1  christos 	call	__KeccakF1600
    262  1.1  christos 
    263  1.1  christos 	lea	96(%rsp),%r10
    264  1.1  christos 	jmp	.Loop_absorb_avx512vl
    265  1.1  christos 
    266  1.1  christos .Ldone_absorb_avx512vl:
    267  1.1  christos 	vmovq	%xmm0,-96(%rdi)
    268  1.1  christos 	vmovdqu	%ymm1,8+32*0-96(%rdi)
    269  1.1  christos 	vmovdqu	%ymm2,8+32*1-96(%rdi)
    270  1.1  christos 	vmovdqu	%ymm3,8+32*2-96(%rdi)
    271  1.1  christos 	vmovdqu	%ymm4,8+32*3-96(%rdi)
    272  1.1  christos 	vmovdqu	%ymm5,8+32*4-96(%rdi)
    273  1.1  christos 	vmovdqu	%ymm6,8+32*5-96(%rdi)
    274  1.1  christos 
    275  1.1  christos 	vzeroupper
    276  1.1  christos 
    277  1.1  christos 	lea	(%r11),%rsp
    278  1.1  christos 	lea	(%rdx,%rcx),%rax		# return value
    279  1.1  christos 	ret
    280  1.1  christos .size	SHA3_absorb,.-SHA3_absorb
    281  1.1  christos 
    282  1.1  christos .globl	SHA3_squeeze
#-----------------------------------------------------------------------
# void SHA3_squeeze(uint64_t A[5][5], unsigned char *out,
#                   size_t len, size_t bsz)
# SysV AMD64: %rdi = state A, %rsi = out, %rdx = len, %rcx = bsz.
# %rcx is converted to a lane count (shr $3); up to bsz bytes of state
# are copied out 8 bytes per lane, reading lanes in the same interleaved
# order the absorb path stores them.  When a full block has been emitted
# and more output is wanted, the state is re-permuted in registers
# (.Lextend_output) and written back.  A sub-8-byte tail is emitted one
# byte at a time from %r8.  No meaningful value is left in %rax for the
# caller (void-style return).
#-----------------------------------------------------------------------
    283  1.1  christos .type	SHA3_squeeze,@function
    284  1.1  christos .align	32
    285  1.1  christos SHA3_squeeze:
    286  1.1  christos 	mov	%rsp,%r11
    287  1.1  christos 
    288  1.1  christos 	lea	96(%rdi),%rdi
    289  1.1  christos 	lea	rhotates_left(%rip),%r8
    290  1.1  christos 	shr	$3,%rcx			# bsz -> lanes per block
    291  1.1  christos 
    292  1.1  christos 	vzeroupper
    293  1.1  christos 
    294  1.1  christos 	vpbroadcastq	-96(%rdi),%ymm0
    295  1.1  christos 	vpxor		%ymm7,%ymm7,%ymm7
    296  1.1  christos 	vmovdqu		8+32*0-96(%rdi),%ymm1
    297  1.1  christos 	vmovdqu		8+32*1-96(%rdi),%ymm2
    298  1.1  christos 	vmovdqu		8+32*2-96(%rdi),%ymm3
    299  1.1  christos 	vmovdqu		8+32*3-96(%rdi),%ymm4
    300  1.1  christos 	vmovdqu		8+32*4-96(%rdi),%ymm5
    301  1.1  christos 	vmovdqu		8+32*5-96(%rdi),%ymm6
    302  1.1  christos 
    303  1.1  christos 	vmovdqa64	0*32(%r8),%ymm16		# load "rhotate" indices
    304  1.1  christos 	vmovdqa64	1*32(%r8),%ymm17
    305  1.1  christos 	vmovdqa64	2*32(%r8),%ymm18
    306  1.1  christos 	vmovdqa64	3*32(%r8),%ymm19
    307  1.1  christos 	vmovdqa64	4*32(%r8),%ymm20
    308  1.1  christos 	vmovdqa64	5*32(%r8),%ymm21
    309  1.1  christos 
    310  1.1  christos 	mov	%rcx,%rax		# %eax = lanes left in this block
    311  1.1  christos 
# Each 8-line step below: load next lane into %r8, sub $8 from len
# (jc -> partial-lane tail), store it, je -> exactly done, dec lane
# counter, je -> block exhausted, permute and continue.
    312  1.1  christos .Loop_squeeze_avx512vl:
    313  1.1  christos 	mov	0-96(%rdi),%r8
    314  1.1  christos 	sub	$8,%rdx
    315  1.1  christos 	jc	.Ltail_squeeze_avx512vl
    316  1.1  christos 	mov	%r8,(%rsi)
    317  1.1  christos 	lea	8(%rsi),%rsi
    318  1.1  christos 	je	.Ldone_squeeze_avx512vl
    319  1.1  christos 	dec	%eax
    320  1.1  christos 	je	.Lextend_output_avx512vl
    321  1.1  christos 	mov	32-120(%rdi),%r8
    322  1.1  christos 	sub	$8,%rdx
    323  1.1  christos 	jc	.Ltail_squeeze_avx512vl
    324  1.1  christos 	mov	%r8,(%rsi)
    325  1.1  christos 	lea	8(%rsi),%rsi
    326  1.1  christos 	je	.Ldone_squeeze_avx512vl
    327  1.1  christos 	dec	%eax
    328  1.1  christos 	je	.Lextend_output_avx512vl
    329  1.1  christos 	mov	40-120(%rdi),%r8
    330  1.1  christos 	sub	$8,%rdx
    331  1.1  christos 	jc	.Ltail_squeeze_avx512vl
    332  1.1  christos 	mov	%r8,(%rsi)
    333  1.1  christos 	lea	8(%rsi),%rsi
    334  1.1  christos 	je	.Ldone_squeeze_avx512vl
    335  1.1  christos 	dec	%eax
    336  1.1  christos 	je	.Lextend_output_avx512vl
    337  1.1  christos 	mov	48-120(%rdi),%r8
    338  1.1  christos 	sub	$8,%rdx
    339  1.1  christos 	jc	.Ltail_squeeze_avx512vl
    340  1.1  christos 	mov	%r8,(%rsi)
    341  1.1  christos 	lea	8(%rsi),%rsi
    342  1.1  christos 	je	.Ldone_squeeze_avx512vl
    343  1.1  christos 	dec	%eax
    344  1.1  christos 	je	.Lextend_output_avx512vl
    345  1.1  christos 	mov	56-120(%rdi),%r8
    346  1.1  christos 	sub	$8,%rdx
    347  1.1  christos 	jc	.Ltail_squeeze_avx512vl
    348  1.1  christos 	mov	%r8,(%rsi)
    349  1.1  christos 	lea	8(%rsi),%rsi
    350  1.1  christos 	je	.Ldone_squeeze_avx512vl
    351  1.1  christos 	dec	%eax
    352  1.1  christos 	je	.Lextend_output_avx512vl
    353  1.1  christos 	mov	80-120(%rdi),%r8
    354  1.1  christos 	sub	$8,%rdx
    355  1.1  christos 	jc	.Ltail_squeeze_avx512vl
    356  1.1  christos 	mov	%r8,(%rsi)
    357  1.1  christos 	lea	8(%rsi),%rsi
    358  1.1  christos 	je	.Ldone_squeeze_avx512vl
    359  1.1  christos 	dec	%eax
    360  1.1  christos 	je	.Lextend_output_avx512vl
    361  1.1  christos 	mov	192-120(%rdi),%r8
    362  1.1  christos 	sub	$8,%rdx
    363  1.1  christos 	jc	.Ltail_squeeze_avx512vl
    364  1.1  christos 	mov	%r8,(%rsi)
    365  1.1  christos 	lea	8(%rsi),%rsi
    366  1.1  christos 	je	.Ldone_squeeze_avx512vl
    367  1.1  christos 	dec	%eax
    368  1.1  christos 	je	.Lextend_output_avx512vl
    369  1.1  christos 	mov	104-120(%rdi),%r8
    370  1.1  christos 	sub	$8,%rdx
    371  1.1  christos 	jc	.Ltail_squeeze_avx512vl
    372  1.1  christos 	mov	%r8,(%rsi)
    373  1.1  christos 	lea	8(%rsi),%rsi
    374  1.1  christos 	je	.Ldone_squeeze_avx512vl
    375  1.1  christos 	dec	%eax
    376  1.1  christos 	je	.Lextend_output_avx512vl
    377  1.1  christos 	mov	144-120(%rdi),%r8
    378  1.1  christos 	sub	$8,%rdx
    379  1.1  christos 	jc	.Ltail_squeeze_avx512vl
    380  1.1  christos 	mov	%r8,(%rsi)
    381  1.1  christos 	lea	8(%rsi),%rsi
    382  1.1  christos 	je	.Ldone_squeeze_avx512vl
    383  1.1  christos 	dec	%eax
    384  1.1  christos 	je	.Lextend_output_avx512vl
    385  1.1  christos 	mov	184-120(%rdi),%r8
    386  1.1  christos 	sub	$8,%rdx
    387  1.1  christos 	jc	.Ltail_squeeze_avx512vl
    388  1.1  christos 	mov	%r8,(%rsi)
    389  1.1  christos 	lea	8(%rsi),%rsi
    390  1.1  christos 	je	.Ldone_squeeze_avx512vl
    391  1.1  christos 	dec	%eax
    392  1.1  christos 	je	.Lextend_output_avx512vl
    393  1.1  christos 	mov	64-120(%rdi),%r8
    394  1.1  christos 	sub	$8,%rdx
    395  1.1  christos 	jc	.Ltail_squeeze_avx512vl
    396  1.1  christos 	mov	%r8,(%rsi)
    397  1.1  christos 	lea	8(%rsi),%rsi
    398  1.1  christos 	je	.Ldone_squeeze_avx512vl
    399  1.1  christos 	dec	%eax
    400  1.1  christos 	je	.Lextend_output_avx512vl
    401  1.1  christos 	mov	128-120(%rdi),%r8
    402  1.1  christos 	sub	$8,%rdx
    403  1.1  christos 	jc	.Ltail_squeeze_avx512vl
    404  1.1  christos 	mov	%r8,(%rsi)
    405  1.1  christos 	lea	8(%rsi),%rsi
    406  1.1  christos 	je	.Ldone_squeeze_avx512vl
    407  1.1  christos 	dec	%eax
    408  1.1  christos 	je	.Lextend_output_avx512vl
    409  1.1  christos 	mov	200-120(%rdi),%r8
    410  1.1  christos 	sub	$8,%rdx
    411  1.1  christos 	jc	.Ltail_squeeze_avx512vl
    412  1.1  christos 	mov	%r8,(%rsi)
    413  1.1  christos 	lea	8(%rsi),%rsi
    414  1.1  christos 	je	.Ldone_squeeze_avx512vl
    415  1.1  christos 	dec	%eax
    416  1.1  christos 	je	.Lextend_output_avx512vl
    417  1.1  christos 	mov	176-120(%rdi),%r8
    418  1.1  christos 	sub	$8,%rdx
    419  1.1  christos 	jc	.Ltail_squeeze_avx512vl
    420  1.1  christos 	mov	%r8,(%rsi)
    421  1.1  christos 	lea	8(%rsi),%rsi
    422  1.1  christos 	je	.Ldone_squeeze_avx512vl
    423  1.1  christos 	dec	%eax
    424  1.1  christos 	je	.Lextend_output_avx512vl
    425  1.1  christos 	mov	120-120(%rdi),%r8
    426  1.1  christos 	sub	$8,%rdx
    427  1.1  christos 	jc	.Ltail_squeeze_avx512vl
    428  1.1  christos 	mov	%r8,(%rsi)
    429  1.1  christos 	lea	8(%rsi),%rsi
    430  1.1  christos 	je	.Ldone_squeeze_avx512vl
    431  1.1  christos 	dec	%eax
    432  1.1  christos 	je	.Lextend_output_avx512vl
    433  1.1  christos 	mov	88-120(%rdi),%r8
    434  1.1  christos 	sub	$8,%rdx
    435  1.1  christos 	jc	.Ltail_squeeze_avx512vl
    436  1.1  christos 	mov	%r8,(%rsi)
    437  1.1  christos 	lea	8(%rsi),%rsi
    438  1.1  christos 	je	.Ldone_squeeze_avx512vl
    439  1.1  christos 	dec	%eax
    440  1.1  christos 	je	.Lextend_output_avx512vl
    441  1.1  christos 	mov	96-120(%rdi),%r8
    442  1.1  christos 	sub	$8,%rdx
    443  1.1  christos 	jc	.Ltail_squeeze_avx512vl
    444  1.1  christos 	mov	%r8,(%rsi)
    445  1.1  christos 	lea	8(%rsi),%rsi
    446  1.1  christos 	je	.Ldone_squeeze_avx512vl
    447  1.1  christos 	dec	%eax
    448  1.1  christos 	je	.Lextend_output_avx512vl
    449  1.1  christos 	mov	168-120(%rdi),%r8
    450  1.1  christos 	sub	$8,%rdx
    451  1.1  christos 	jc	.Ltail_squeeze_avx512vl
    452  1.1  christos 	mov	%r8,(%rsi)
    453  1.1  christos 	lea	8(%rsi),%rsi
    454  1.1  christos 	je	.Ldone_squeeze_avx512vl
    455  1.1  christos 	dec	%eax
    456  1.1  christos 	je	.Lextend_output_avx512vl
    457  1.1  christos 	mov	208-120(%rdi),%r8
    458  1.1  christos 	sub	$8,%rdx
    459  1.1  christos 	jc	.Ltail_squeeze_avx512vl
    460  1.1  christos 	mov	%r8,(%rsi)
    461  1.1  christos 	lea	8(%rsi),%rsi
    462  1.1  christos 	je	.Ldone_squeeze_avx512vl
    463  1.1  christos 	dec	%eax
    464  1.1  christos 	je	.Lextend_output_avx512vl
    465  1.1  christos 	mov	152-120(%rdi),%r8
    466  1.1  christos 	sub	$8,%rdx
    467  1.1  christos 	jc	.Ltail_squeeze_avx512vl
    468  1.1  christos 	mov	%r8,(%rsi)
    469  1.1  christos 	lea	8(%rsi),%rsi
    470  1.1  christos 	je	.Ldone_squeeze_avx512vl
    471  1.1  christos 	dec	%eax
    472  1.1  christos 	je	.Lextend_output_avx512vl
    473  1.1  christos 	mov	72-120(%rdi),%r8
    474  1.1  christos 	sub	$8,%rdx
    475  1.1  christos 	jc	.Ltail_squeeze_avx512vl
    476  1.1  christos 	mov	%r8,(%rsi)
    477  1.1  christos 	lea	8(%rsi),%rsi
    478  1.1  christos 	je	.Ldone_squeeze_avx512vl
    479  1.1  christos 	dec	%eax
    480  1.1  christos 	je	.Lextend_output_avx512vl
    481  1.1  christos 	mov	160-120(%rdi),%r8
    482  1.1  christos 	sub	$8,%rdx
    483  1.1  christos 	jc	.Ltail_squeeze_avx512vl
    484  1.1  christos 	mov	%r8,(%rsi)
    485  1.1  christos 	lea	8(%rsi),%rsi
    486  1.1  christos 	je	.Ldone_squeeze_avx512vl
    487  1.1  christos 	dec	%eax
    488  1.1  christos 	je	.Lextend_output_avx512vl
    489  1.1  christos 	mov	136-120(%rdi),%r8
    490  1.1  christos 	sub	$8,%rdx
    491  1.1  christos 	jc	.Ltail_squeeze_avx512vl
    492  1.1  christos 	mov	%r8,(%rsi)
    493  1.1  christos 	lea	8(%rsi),%rsi
    494  1.1  christos 	je	.Ldone_squeeze_avx512vl
    495  1.1  christos 	dec	%eax
    496  1.1  christos 	je	.Lextend_output_avx512vl
    497  1.1  christos 	mov	112-120(%rdi),%r8
    498  1.1  christos 	sub	$8,%rdx
    499  1.1  christos 	jc	.Ltail_squeeze_avx512vl
    500  1.1  christos 	mov	%r8,(%rsi)
    501  1.1  christos 	lea	8(%rsi),%rsi
    502  1.1  christos 	je	.Ldone_squeeze_avx512vl
    503  1.1  christos 	dec	%eax
    504  1.1  christos 	je	.Lextend_output_avx512vl
    505  1.1  christos 	mov	216-120(%rdi),%r8
    506  1.1  christos 	sub	$8,%rdx
    507  1.1  christos 	jc	.Ltail_squeeze_avx512vl
    508  1.1  christos 	mov	%r8,(%rsi)
    509  1.1  christos 	lea	8(%rsi),%rsi
    510  1.1  christos 	je	.Ldone_squeeze_avx512vl
    511  1.1  christos 	dec	%eax
    512  1.1  christos 	je	.Lextend_output_avx512vl
# NOTE(review): -120(%rdi) addresses 24 bytes BELOW the state base
# (A-24), and this load is only reached after all 25 chain steps, which
# cannot happen for any bsz <= 200 (the 'je .Lextend_output' above
# always fires first) -- apparently dead code; confirm against the
# CRYPTOGAMS keccak1600 generator before removing.
    513  1.1  christos 	mov	-120(%rdi),%r8
    514  1.1  christos .Lextend_output_avx512vl:
# Block exhausted but more output wanted: permute the in-register
# state, write it back to A, and restart the lane chain.
    515  1.1  christos 	call	__KeccakF1600
    516  1.1  christos 
    517  1.1  christos 	vmovq	%xmm0,-96(%rdi)
    518  1.1  christos 	vmovdqu	%ymm1,8+32*0-96(%rdi)
    519  1.1  christos 	vmovdqu	%ymm2,8+32*1-96(%rdi)
    520  1.1  christos 	vmovdqu	%ymm3,8+32*2-96(%rdi)
    521  1.1  christos 	vmovdqu	%ymm4,8+32*3-96(%rdi)
    522  1.1  christos 	vmovdqu	%ymm5,8+32*4-96(%rdi)
    523  1.1  christos 	vmovdqu	%ymm6,8+32*5-96(%rdi)
    524  1.1  christos 
    525  1.1  christos 	mov	%rcx,%rax
    526  1.1  christos 	jmp	.Loop_squeeze_avx512vl
    527  1.1  christos 
    528  1.1  christos 
# Fewer than 8 bytes wanted: emit the remaining bytes of %r8 one at a
# time, shifting the lane right as we go.
    529  1.1  christos .Ltail_squeeze_avx512vl:
    530  1.1  christos 	add	$8,%rdx
    531  1.1  christos .Loop_tail_avx512vl:
    532  1.1  christos 	mov	%r8b,(%rsi)
    533  1.1  christos 	lea	1(%rsi),%rsi
    534  1.1  christos 	shr	$8,%r8
    535  1.1  christos 	dec	%rdx
    536  1.1  christos 	jnz	.Loop_tail_avx512vl
    537  1.1  christos 
    538  1.1  christos .Ldone_squeeze_avx512vl:
    539  1.1  christos 	vzeroupper
    540  1.1  christos 
    541  1.1  christos 	lea	(%r11),%rsp
    542  1.1  christos 	ret
    543  1.1  christos .size	SHA3_squeeze,.-SHA3_squeeze
    544  1.1  christos 
    545  1.1  christos .section .rodata
    546  1.1  christos .align	64
# Per-lane left-rotation counts for the Rho step, grouped to match the
# interleaved %ymm0..%ymm6 lane layout; consumed via %ymm16..%ymm21 by
# the vprolvq instructions in __KeccakF1600.
    547  1.1  christos rhotates_left:
    548  1.1  christos 	.quad	3,	18,	36,	41	# [2][0] [4][0] [1][0] [3][0]
    549  1.1  christos 	.quad	1,	62,	28,	27	# [0][1] [0][2] [0][3] [0][4]
    550  1.1  christos 	.quad	45,	6,	56,	39	# [3][1] [1][2] [4][3] [2][4]
    551  1.1  christos 	.quad	10,	61,	55,	8	# [2][1] [4][2] [1][3] [3][4]
    552  1.1  christos 	.quad	2,	15,	25,	20	# [4][1] [3][2] [2][3] [1][4]
    553  1.1  christos 	.quad	44,	43,	21,	14	# [1][1] [2][2] [3][3] [4][4]
# The 24 Keccak-f[1600] round constants RC[0..23] (FIPS 202, values
# verified), each replicated across all four 64-bit lanes so the Iota
# step can apply them with a single full-width vpternlogq per round.
    554  1.1  christos iotas:
    555  1.1  christos 	.quad	0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001
    556  1.1  christos 	.quad	0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082
    557  1.1  christos 	.quad	0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a
    558  1.1  christos 	.quad	0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000
    559  1.1  christos 	.quad	0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b
    560  1.1  christos 	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
    561  1.1  christos 	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
    562  1.1  christos 	.quad	0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009
    563  1.1  christos 	.quad	0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a
    564  1.1  christos 	.quad	0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088
    565  1.1  christos 	.quad	0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009
    566  1.1  christos 	.quad	0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a
    567  1.1  christos 	.quad	0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b
    568  1.1  christos 	.quad	0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b
    569  1.1  christos 	.quad	0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089
    570  1.1  christos 	.quad	0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003
    571  1.1  christos 	.quad	0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002
    572  1.1  christos 	.quad	0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080
    573  1.1  christos 	.quad	0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a
    574  1.1  christos 	.quad	0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a
    575  1.1  christos 	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
    576  1.1  christos 	.quad	0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080
    577  1.1  christos 	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
    578  1.1  christos 	.quad	0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008
    579  1.1  christos 
    580  1.1  christos .asciz	"Keccak-1600 absorb and squeeze for AVX512VL, CRYPTOGAMS by <appro (at) openssl.org>"
    581