Home | History | Annotate | Line # | Download | only in asm
      1 #!/usr/bin/env perl
      2 # Copyright 2017-2023 The OpenSSL Project Authors. All Rights Reserved.
      3 #
      4 # Licensed under the Apache License 2.0 (the "License").  You may not use
      5 # this file except in compliance with the License.  You can obtain a copy
      6 # in the file LICENSE in the source distribution or at
      7 # https://www.openssl.org/source/license.html
      8 #
      9 # ====================================================================
     10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
     11 # project. The module is, however, dual licensed under OpenSSL and
     12 # CRYPTOGAMS licenses depending on where you obtain it. For further
     13 # details see http://www.openssl.org/~appro/cryptogams/.
     14 # ====================================================================
     15 #
     16 # Keccak-1600 for AVX512VL.
     17 #
     18 # December 2017.
     19 #
     20 # This is an adaptation of AVX2 module that reuses register data
     21 # layout, but utilizes new 256-bit AVX512VL instructions. See AVX2
     22 # module for further information on layout.
     23 #
     24 ########################################################################
     25 # Numbers are cycles per processed byte out of large message.
     26 #
     27 #			r=1088(*)
     28 #
     29 # Skylake-X		6.4/+47%
     30 #
     31 # (*)	Corresponds to SHA3-256. Percentage after slash is improvement
     32 #	coefficient in comparison to scalar keccak1600-x86_64.pl.
     33 
# Keccak state registers.  The digits in each variable name are the
# right-most coordinates of the lanes kept in that register:
#
#   $A00  %ymm0   [0][0] [0][0] [0][0] [0][0]
#   $A01  %ymm1   [0][4] [0][3] [0][2] [0][1]
#   $A20  %ymm2   [3][0] [1][0] [4][0] [2][0]
#   $A31  %ymm3   [2][4] [4][3] [1][2] [3][1]
#   $A21  %ymm4   [3][4] [1][3] [4][2] [2][1]
#   $A41  %ymm5   [1][4] [2][3] [3][2] [4][1]
#   $A11  %ymm6   [4][4] [3][3] [2][2] [1][1]

my ($A00, $A01, $A20, $A31, $A21, $A41, $A11) = map { "%ymm$_" } 0 .. 6;

# Byte offset of every lane A[y][x] (index 5*y+x) inside a buffer made of
# seven consecutive 32-byte rows, one row per state register.  Each pair
# below is [register index, 64-bit element index]; it is turned into the
# linear offset 8*(4*register + element) right away.

my @A_jagged = map { 8 * (4 * $_->[0] + $_->[1]) } (
    [0,0], [1,0], [1,1], [1,2], [1,3],    # [0][0..4]
    [2,2], [6,0], [3,1], [4,2], [5,3],    # [1][0..4]
    [2,0], [4,0], [6,1], [5,2], [3,3],    # [2][0..4]
    [2,3], [3,0], [5,1], [6,2], [4,3],    # [3][0..4]
    [2,1], [5,0], [4,1], [3,2], [6,3],    # [4][0..4]
);

# Scratch registers %ymm7..%ymm15; the last four double as the named
# Theta intermediates.
my @T = map { "%ymm$_" } 7 .. 15;
my ($C14, $C00, $D00, $D14) = @T[-4 .. -1];

# Registers that hold the rotation counts, one per rotated state register.
my ($R20, $R01, $R31, $R21, $R41, $R11) = map { "%ymm$_" } 16 .. 21;

     57 
     # Append the internal routine __KeccakF1600 to $code.
     #
     # All inputs and outputs live in vector registers: the state in
     # $A00..$A11 (%ymm0-%ymm6) is permuted in place, @T (%ymm7-%ymm15) is
     # scratch, and $R20..$R11 (%ymm16-%ymm21) supply the per-lane rotation
     # counts that SHA3_absorb/SHA3_squeeze load from rhotates_left before
     # calling.  The routine points %r10 at the iotas table, sets %eax to
     # 24 and runs .Loop_avx512vl until the counter reaches zero.  Each pass
     # does Theta, Rho + Pi with the pre-Chi shuffle, Chi, and finally Iota,
     # which folds the 32 bytes at (%r10) into $A00 and advances %r10 by 32.
     # %r10 and %eax are clobbered.
     #
     # Inside the heredoc "$T[2]" and "@T[2]" interpolate to the same
     # register name.
     58 $code.=<<___;
     59 .text
     60 
     61 .type	__KeccakF1600,\@function
     62 .align	32
     63 __KeccakF1600:
     64 	lea		iotas(%rip),%r10
     65 	mov		\$24,%eax
     66 	jmp		.Loop_avx512vl
     67 
     68 .align	32
     69 .Loop_avx512vl:
     70 	######################################### Theta
     71 	vpshufd		\$0b01001110,$A20,$C00
     72 	vpxor		$A31,$A41,$C14
     73 	vpxor		$A11,$A21,@T[2]
     74 	vpternlogq	\$0x96,$A01,$T[2],$C14	# C[1..4]
     75 
     76 	vpxor		$A20,$C00,$C00
     77 	vpermq		\$0b01001110,$C00,@T[0]
     78 
     79 	vpermq		\$0b10010011,$C14,@T[4]
     80 	vprolq		\$1,$C14,@T[1]		# ROL64(C[1..4],1)
     81 
     82 	vpermq		\$0b00111001,@T[1],$D14
     83 	vpxor		@T[4],@T[1],$D00
     84 	vpermq		\$0b00000000,$D00,$D00	# D[0..0] = ROL64(C[1],1) ^ C[4]
     85 
     86 	vpternlogq	\$0x96,@T[0],$A00,$C00	# C[0..0]
     87 	vprolq		\$1,$C00,@T[1]		# ROL64(C[0..0],1)
     88 
     89 	vpxor		$D00,$A00,$A00		# ^= D[0..0]
     90 
     91 	vpblendd	\$0b11000000,@T[1],$D14,$D14
     92 	vpblendd	\$0b00000011,$C00,@T[4],@T[0]
     93 
     94 	######################################### Rho + Pi + pre-Chi shuffle
     95 	 vpxor		$D00,$A20,$A20		# ^= D[0..0] from Theta
     96 	vprolvq		$R20,$A20,$A20
     97 
     98 	 vpternlogq	\$0x96,@T[0],$D14,$A31	# ^= D[1..4] from Theta
     99 	vprolvq		$R31,$A31,$A31
    100 
    101 	 vpternlogq	\$0x96,@T[0],$D14,$A21	# ^= D[1..4] from Theta
    102 	vprolvq		$R21,$A21,$A21
    103 
    104 	 vpternlogq	\$0x96,@T[0],$D14,$A41	# ^= D[1..4] from Theta
    105 	vprolvq		$R41,$A41,$A41
    106 
    107 	 vpermq		\$0b10001101,$A20,@T[3]	# $A20 -> future $A31
    108 	 vpermq		\$0b10001101,$A31,@T[4]	# $A31 -> future $A21
    109 	 vpternlogq	\$0x96,@T[0],$D14,$A11	# ^= D[1..4] from Theta
    110 	vprolvq		$R11,$A11,@T[1]		# $A11 -> future $A01
    111 
    112 	 vpermq		\$0b00011011,$A21,@T[5]	# $A21 -> future $A41
    113 	 vpermq		\$0b01110010,$A41,@T[6]	# $A41 -> future $A11
    114 	 vpternlogq	\$0x96,@T[0],$D14,$A01	# ^= D[1..4] from Theta
    115 	vprolvq		$R01,$A01,@T[2]		# $A01 -> future $A20
    116 
    117 	######################################### Chi
    118 	vpblendd	\$0b00001100,@T[6],@T[2],$A31	#               [4][4] [2][0]
    119 	vpblendd	\$0b00001100,@T[2],@T[4],@T[8]	#               [4][0] [2][1]
    120 	 vpblendd	\$0b00001100,@T[4],@T[3],$A41	#               [4][2] [2][4]
    121 	 vpblendd	\$0b00001100,@T[3],@T[2],@T[7]	#               [4][3] [2][0]
    122 	vpblendd	\$0b00110000,@T[4],$A31,$A31	#        [1][3] [4][4] [2][0]
    123 	vpblendd	\$0b00110000,@T[5],@T[8],@T[8]	#        [1][4] [4][0] [2][1]
    124 	 vpblendd	\$0b00110000,@T[2],$A41,$A41	#        [1][0] [4][2] [2][4]
    125 	 vpblendd	\$0b00110000,@T[6],@T[7],@T[7]	#        [1][1] [4][3] [2][0]
    126 	vpblendd	\$0b11000000,@T[5],$A31,$A31	# [3][2] [1][3] [4][4] [2][0]
    127 	vpblendd	\$0b11000000,@T[6],@T[8],@T[8]	# [3][3] [1][4] [4][0] [2][1]
    128 	 vpblendd	\$0b11000000,@T[6],$A41,$A41	# [3][3] [1][0] [4][2] [2][4]
    129 	 vpblendd	\$0b11000000,@T[4],@T[7],@T[7]	# [3][4] [1][1] [4][3] [2][0]
    130 	vpternlogq	\$0xC6,@T[8],@T[3],$A31		# [3][1] [1][2] [4][3] [2][4]
    131 	 vpternlogq	\$0xC6,@T[7],@T[5],$A41		# [3][2] [1][4] [4][1] [2][3]
    132 
    133 	vpsrldq		\$8,@T[1],@T[0]
    134 	vpandn		@T[0],@T[1],@T[0]	# tgting  [0][0] [0][0] [0][0] [0][0]
    135 
    136 	vpblendd	\$0b00001100,@T[2],@T[5],$A11	#               [4][0] [2][3]
    137 	vpblendd	\$0b00001100,@T[5],@T[3],@T[8]	#               [4][1] [2][4]
    138 	vpblendd	\$0b00110000,@T[3],$A11,$A11	#        [1][2] [4][0] [2][3]
    139 	vpblendd	\$0b00110000,@T[4],@T[8],@T[8]	#        [1][3] [4][1] [2][4]
    140 	vpblendd	\$0b11000000,@T[4],$A11,$A11	# [3][4] [1][2] [4][0] [2][3]
    141 	vpblendd	\$0b11000000,@T[2],@T[8],@T[8]	# [3][0] [1][3] [4][1] [2][4]
    142 	vpternlogq	\$0xC6,@T[8],@T[6],$A11		# [3][3] [1][1] [4][4] [2][2]
    143 
    144 	  vpermq	\$0b00011110,@T[1],$A21		# [0][1] [0][2] [0][4] [0][3]
    145 	  vpblendd	\$0b00110000,$A00,$A21,@T[8]	# [0][1] [0][0] [0][4] [0][3]
    146 	  vpermq	\$0b00111001,@T[1],$A01		# [0][1] [0][4] [0][3] [0][2]
    147 	  vpblendd	\$0b11000000,$A00,$A01,$A01	# [0][0] [0][4] [0][3] [0][2]
    148 
    149 	vpblendd	\$0b00001100,@T[5],@T[4],$A20	#               [4][1] [2][1]
    150 	vpblendd	\$0b00001100,@T[4],@T[6],@T[7]	#               [4][2] [2][2]
    151 	vpblendd	\$0b00110000,@T[6],$A20,$A20	#        [1][1] [4][1] [2][1]
    152 	vpblendd	\$0b00110000,@T[3],@T[7],@T[7]	#        [1][2] [4][2] [2][2]
    153 	vpblendd	\$0b11000000,@T[3],$A20,$A20	# [3][1] [1][1] [4][1] [2][1]
    154 	vpblendd	\$0b11000000,@T[5],@T[7],@T[7]	# [3][2] [1][2] [4][2] [2][2]
    155 	vpternlogq	\$0xC6,@T[7],@T[2],$A20		# [3][0] [1][0] [4][0] [2][0]
    156 
    157 	 vpermq		\$0b00000000,@T[0],@T[0]	# [0][0] [0][0] [0][0] [0][0]
    158 	 vpermq		\$0b00011011,$A31,$A31		# post-Chi shuffle
    159 	 vpermq		\$0b10001101,$A41,$A41
    160 	 vpermq		\$0b01110010,$A11,$A11
    161 
    162 	vpblendd	\$0b00001100,@T[3],@T[6],$A21	#               [4][3] [2][2]
    163 	vpblendd	\$0b00001100,@T[6],@T[5],@T[7]	#               [4][4] [2][3]
    164 	vpblendd	\$0b00110000,@T[5],$A21,$A21	#        [1][4] [4][3] [2][2]
    165 	vpblendd	\$0b00110000,@T[2],@T[7],@T[7]	#        [1][0] [4][4] [2][3]
    166 	vpblendd	\$0b11000000,@T[2],$A21,$A21	# [3][0] [1][4] [4][3] [2][2]
    167 	vpblendd	\$0b11000000,@T[3],@T[7],@T[7]	# [3][1] [1][0] [4][4] [2][3]
    168 
    169 	vpternlogq	\$0xC6,@T[8],@T[1],$A01		# [0][4] [0][3] [0][2] [0][1]
    170 	vpternlogq	\$0xC6,@T[7],@T[4],$A21		# [3][4] [1][3] [4][2] [2][1]
    171 
    172 	######################################### Iota
    173 	vpternlogq	\$0x96,(%r10),@T[0],$A00
    174 	lea		32(%r10),%r10
    175 
    176 	dec		%eax
    177 	jnz		.Loop_avx512vl
    178 
    179 	ret
    180 .size	__KeccakF1600,.-__KeccakF1600
    181 ___
    # Register names substituted for the four parameters of SHA3_absorb and
    # SHA3_squeeze in the assembly below.
    # NOTE(review): %rdi/%rsi/%rdx/%rcx look like the first four integer
    # arguments of the System V AMD64 calling convention -- confirm against
    # the C prototypes.
    182 my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
    183 my  $out = $inp;	# in squeeze; the same register as $inp
    184 
    # Append the first part of SHA3_absorb to $code.
    #
    # Prologue: %rsp is saved in %r11, 240 bytes are reserved and %rsp is
    # aligned down to 32 bytes.  $A_flat and $inp are advanced by 96 and
    # %r10 is set to 96(%rsp), so every later displacement is written as
    # "N-96".  The state is loaded from $A_flat (first qword broadcast into
    # $A00, then six unaligned 32-byte loads starting at byte 8), the six
    # rows of rhotates_left are loaded into $R20..$R11, and five 32-byte
    # stack slots (32*2 .. 32*6) are zeroed.
    #
    # .Loop_absorb_avx512vl subtracts $bsz from $len and leaves through
    # .Ldone_absorb_avx512vl on borrow.  Otherwise it broadcasts the first
    # input qword into @T[0], loads the next 32 bytes into @T[1] and leaves
    # $bsz/8 - 4 in %eax.  The heredoc stops in the middle of the loop body;
    # the Perl loop that follows appends the remaining per-lane loads.
    185 $code.=<<___;
    186 .globl	SHA3_absorb
    187 .type	SHA3_absorb,\@function
    188 .align	32
    189 SHA3_absorb:
    190 	mov	%rsp,%r11
    191 
    192 	lea	-240(%rsp),%rsp
    193 	and	\$-32,%rsp
    194 
    195 	lea	96($A_flat),$A_flat
    196 	lea	96($inp),$inp
    197 	lea	96(%rsp),%r10
    198 	lea	rhotates_left(%rip),%r8
    199 
    200 	vzeroupper
    201 
    202 	vpbroadcastq	-96($A_flat),$A00	# load A[5][5]
    203 	vmovdqu		8+32*0-96($A_flat),$A01
    204 	vmovdqu		8+32*1-96($A_flat),$A20
    205 	vmovdqu		8+32*2-96($A_flat),$A31
    206 	vmovdqu		8+32*3-96($A_flat),$A21
    207 	vmovdqu		8+32*4-96($A_flat),$A41
    208 	vmovdqu		8+32*5-96($A_flat),$A11
    209 
    210 	vmovdqa64	0*32(%r8),$R20		# load "rhotate" indices
    211 	vmovdqa64	1*32(%r8),$R01
    212 	vmovdqa64	2*32(%r8),$R31
    213 	vmovdqa64	3*32(%r8),$R21
    214 	vmovdqa64	4*32(%r8),$R41
    215 	vmovdqa64	5*32(%r8),$R11
    216 
    217 	vpxor		@T[0],@T[0],@T[0]
    218 	vmovdqa		@T[0],32*2-96(%r10)	# zero transfer area on stack
    219 	vmovdqa		@T[0],32*3-96(%r10)
    220 	vmovdqa		@T[0],32*4-96(%r10)
    221 	vmovdqa		@T[0],32*5-96(%r10)
    222 	vmovdqa		@T[0],32*6-96(%r10)
    223 
    224 .Loop_absorb_avx512vl:
    225 	mov		$bsz,%rax
    226 	sub		$bsz,$len
    227 	jc		.Ldone_absorb_avx512vl
    228 
    229 	shr		\$3,%eax
    230 	vpbroadcastq	0-96($inp),@T[0]
    231 	vmovdqu		8-96($inp),@T[1]
    232 	sub		\$4,%eax
    233 ___
# Unrolled remainder of the absorb load, one group per lane 5..24:
# count the lane off in %eax, leave for .Labsorved_avx512vl once the
# block is exhausted, otherwise copy input qword number $lane into the
# stack slot that matches its position in the register layout.
foreach my $lane (5 .. 24) {
    my $src_disp = "8*$lane-96";
    my $dst_disp = "$A_jagged[$lane]-96";

    $code .= join("\n",
        "\tdec\t%eax",
        "\tjz\t.Labsorved_avx512vl",
        "\tmov\t$src_disp($inp),%r8",
        "\tmov\t%r8,$dst_disp(%r10)",
    ) . "\n";
}
# Append the rest of SHA3_absorb and the beginning of SHA3_squeeze.
#
# .Labsorved_avx512vl   advances $inp by $bsz, XORs @T[0], @T[1] and the
#     five stack slots 32*2 .. 32*6 into the state registers, calls
#     __KeccakF1600, re-points %r10 at 96(%rsp) (the callee overwrites
#     %r10) and goes back to .Loop_absorb_avx512vl.
# .Ldone_absorb_avx512vl  writes the seven state registers back to
#     $A_flat, restores %rsp from %r11 and returns $len + $bsz in %rax.
#
# SHA3_squeeze   advances $A_flat by 96, divides $bsz by 8, loads the
#     state and the rotation counts, copies $bsz into %rax and enters
#     .Loop_squeeze_avx512vl with the first lane already in %r8.
#
# The first lane's table entry is spelled $A_jagged[0]: no loop index is
# in scope at this point of the script.
$code.=<<___;
.Labsorved_avx512vl:
	lea	($inp,$bsz),$inp

	vpxor	@T[0],$A00,$A00
	vpxor	@T[1],$A01,$A01
	vpxor	32*2-96(%r10),$A20,$A20
	vpxor	32*3-96(%r10),$A31,$A31
	vpxor	32*4-96(%r10),$A21,$A21
	vpxor	32*5-96(%r10),$A41,$A41
	vpxor	32*6-96(%r10),$A11,$A11

	call	__KeccakF1600

	lea	96(%rsp),%r10
	jmp	.Loop_absorb_avx512vl

.Ldone_absorb_avx512vl:
	vmovq	%xmm0,-96($A_flat)
	vmovdqu	$A01,8+32*0-96($A_flat)
	vmovdqu	$A20,8+32*1-96($A_flat)
	vmovdqu	$A31,8+32*2-96($A_flat)
	vmovdqu	$A21,8+32*3-96($A_flat)
	vmovdqu	$A41,8+32*4-96($A_flat)
	vmovdqu	$A11,8+32*5-96($A_flat)

	vzeroupper

	lea	(%r11),%rsp
	lea	($len,$bsz),%rax		# return value
	ret
.size	SHA3_absorb,.-SHA3_absorb

.globl	SHA3_squeeze
.type	SHA3_squeeze,\@function
.align	32
SHA3_squeeze:
	mov	%rsp,%r11

	lea	96($A_flat),$A_flat
	lea	rhotates_left(%rip),%r8
	shr	\$3,$bsz

	vzeroupper

	vpbroadcastq	-96($A_flat),$A00
	vpxor		@T[0],@T[0],@T[0]
	vmovdqu		8+32*0-96($A_flat),$A01
	vmovdqu		8+32*1-96($A_flat),$A20
	vmovdqu		8+32*2-96($A_flat),$A31
	vmovdqu		8+32*3-96($A_flat),$A21
	vmovdqu		8+32*4-96($A_flat),$A41
	vmovdqu		8+32*5-96($A_flat),$A11

	vmovdqa64	0*32(%r8),$R20		# load "rhotate" indices
	vmovdqa64	1*32(%r8),$R01
	vmovdqa64	2*32(%r8),$R31
	vmovdqa64	3*32(%r8),$R21
	vmovdqa64	4*32(%r8),$R41
	vmovdqa64	5*32(%r8),$R11

	mov	$bsz,%rax

.Loop_squeeze_avx512vl:
	mov	$A_jagged[0]-96($A_flat),%r8
___
# Unrolled squeeze body, one group per lane 0..24.  Each group takes 8
# off $len (fewer than 8 bytes left: go to the byte-wise tail), stores
# %r8 to the output and advances $out, leaves if $len hit zero, counts
# the lane off in %eax (block exhausted: go permute again), and finally
# fetches the following lane into %r8.
#
# The table has no entry after lane 24, so the offset in the last
# group's fetch is emitted as an empty string.
foreach my $lane (0 .. 24) {
    my $following = $A_jagged[$lane + 1];
    my $next_disp = defined $following ? $following : "";

    $code .= join("\n",
        "\tsub\t\$8,$len",
        "\tjc\t.Ltail_squeeze_avx512vl",
        "\tmov\t%r8,($out)",
        "\tlea\t8($out),$out",
        "\tje\t.Ldone_squeeze_avx512vl",
        "\tdec\t%eax",
        "\tje\t.Lextend_output_avx512vl",
        "\tmov\t${next_disp}-120($A_flat),%r8",
    ) . "\n";
}
    # Append the rest of SHA3_squeeze and the constant tables to $code.
    #
    # .Lextend_output_avx512vl  calls __KeccakF1600, writes the seven state
    #     registers back to $A_flat, reloads %rax from $bsz and re-enters
    #     .Loop_squeeze_avx512vl.
    # .Ltail_squeeze_avx512vl   adds the 8 back to $len, then stores %r8 one
    #     byte at a time, shifting it right by 8 after every store, until
    #     $len reaches zero.
    # .Ldone_squeeze_avx512vl   restores %rsp from %r11 and returns.
    #
    # The .rodata section holds rhotates_left (six rows of four 64-bit
    # rotation counts, loaded row by row into $R20..$R11) and iotas (24
    # rows, each a single 64-bit constant repeated four times; the loop in
    # __KeccakF1600 reads one 32-byte row per pass), followed by an
    # identification string.
    320 $code.=<<___;
    321 .Lextend_output_avx512vl:
    322 	call	__KeccakF1600
    323 
    324 	vmovq	%xmm0,-96($A_flat)
    325 	vmovdqu	$A01,8+32*0-96($A_flat)
    326 	vmovdqu	$A20,8+32*1-96($A_flat)
    327 	vmovdqu	$A31,8+32*2-96($A_flat)
    328 	vmovdqu	$A21,8+32*3-96($A_flat)
    329 	vmovdqu	$A41,8+32*4-96($A_flat)
    330 	vmovdqu	$A11,8+32*5-96($A_flat)
    331 
    332 	mov	$bsz,%rax
    333 	jmp	.Loop_squeeze_avx512vl
    334 
    335 
    336 .Ltail_squeeze_avx512vl:
    337 	add	\$8,$len
    338 .Loop_tail_avx512vl:
    339 	mov	%r8b,($out)
    340 	lea	1($out),$out
    341 	shr	\$8,%r8
    342 	dec	$len
    343 	jnz	.Loop_tail_avx512vl
    344 
    345 .Ldone_squeeze_avx512vl:
    346 	vzeroupper
    347 
    348 	lea	(%r11),%rsp
    349 	ret
    350 .size	SHA3_squeeze,.-SHA3_squeeze
    351 
    352 .section .rodata
    353 .align	64
    354 rhotates_left:
    355 	.quad	3,	18,	36,	41	# [2][0] [4][0] [1][0] [3][0]
    356 	.quad	1,	62,	28,	27	# [0][1] [0][2] [0][3] [0][4]
    357 	.quad	45,	6,	56,	39	# [3][1] [1][2] [4][3] [2][4]
    358 	.quad	10,	61,	55,	8	# [2][1] [4][2] [1][3] [3][4]
    359 	.quad	2,	15,	25,	20	# [4][1] [3][2] [2][3] [1][4]
    360 	.quad	44,	43,	21,	14	# [1][1] [2][2] [3][3] [4][4]
    361 iotas:
    362 	.quad	0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001
    363 	.quad	0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082
    364 	.quad	0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a
    365 	.quad	0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000
    366 	.quad	0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b
    367 	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
    368 	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
    369 	.quad	0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009
    370 	.quad	0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a
    371 	.quad	0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088
    372 	.quad	0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009
    373 	.quad	0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a
    374 	.quad	0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b
    375 	.quad	0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b
    376 	.quad	0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089
    377 	.quad	0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003
    378 	.quad	0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002
    379 	.quad	0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080
    380 	.quad	0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a
    381 	.quad	0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a
    382 	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
    383 	.quad	0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080
    384 	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
    385 	.quad	0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008
    386 
    387 .asciz	"Keccak-1600 absorb and squeeze for AVX512VL, CRYPTOGAMS by <appro\@openssl.org>"
    388 ___
    389 
# Write the generated assembly.  The last command-line argument, when
# present and true, names the output file and STDOUT is redirected to
# it; otherwise the text goes to the inherited STDOUT.
#
# The file is opened with the three-argument form so that characters in
# the name can never be taken as part of the open mode, and a failed
# open is fatal instead of silently leaving the output on the original
# STDOUT.
my $output = pop;
if ($output) {
    open STDOUT, '>', $output
        or die "can't open $output: $!";
}
print $code;
# Buffered write errors only surface here, so the close is checked too.
close STDOUT or die "error closing STDOUT: $!";
    393