# Keccak-1600 (SHA-3) absorb and squeeze for AVX512VL, x86-64, AT&T/GAS syntax.
# Machine-generated (CRYPTOGAMS-style) code; System V AMD64 ABI.
#
# State layout (established by the loads/stores below):
#   %ymm0        = lane A[0][0] broadcast to all four 64-bit slots
#   %ymm1..%ymm6 = the remaining 24 lanes, four per register, in an
#                  interleaved order chosen for the permutation
#                  (see the [i][j] comments on the Chi section).
.text

.type	__KeccakF1600,@function
.align	32
#-----------------------------------------------------------------------
# __KeccakF1600: apply all 24 rounds of the Keccak-f[1600] permutation
# to the state held in %ymm0..%ymm6.
# In:    state in %ymm0..%ymm6;
#        rotation ("rhotate") constants preloaded in %ymm16..%ymm21
#        by the callers (SHA3_absorb / SHA3_squeeze).
# Out:   permuted state in %ymm0..%ymm6.
# Clobb: %rax, %r10, %ymm7-%ymm15, flags.
#-----------------------------------------------------------------------
__KeccakF1600:
	lea	iotas(%rip),%r10	# %r10 walks the 24 round constants
	mov	$24,%eax		# 24 rounds of Keccak-f[1600]
	jmp	.Loop_avx512vl

.align	32
.Loop_avx512vl:
	######################################### Theta
	# vpternlogq $0x96 is a three-way XOR (a^b^c) in one instruction.
	vpshufd	$0b01001110,%ymm2,%ymm13
	vpxor	%ymm3,%ymm5,%ymm12
	vpxor	%ymm6,%ymm4,%ymm9
	vpternlogq	$0x96,%ymm1,%ymm9,%ymm12	# C[1..4]

	vpxor	%ymm2,%ymm13,%ymm13
	vpermq	$0b01001110,%ymm13,%ymm7

	vpermq	$0b10010011,%ymm12,%ymm11
	vprolq	$1,%ymm12,%ymm8			# ROL64(C[1..4],1)

	vpermq	$0b00111001,%ymm8,%ymm15
	vpxor	%ymm11,%ymm8,%ymm14
	vpermq	$0b00000000,%ymm14,%ymm14	# D[0..0] = ROL64(C[1],1) ^ C[4]

	vpternlogq	$0x96,%ymm7,%ymm0,%ymm13	# C[0..0]
	vprolq	$1,%ymm13,%ymm8			# ROL64(C[0..0],1)

	vpxor	%ymm14,%ymm0,%ymm0		# ^= D[0..0]

	vpblendd	$0b11000000,%ymm8,%ymm15,%ymm15
	vpblendd	$0b00000011,%ymm13,%ymm11,%ymm7

	######################################### Rho + Pi + pre-Chi shuffle
	# Per-register variable rotates (vprolvq) use the rhotates_left
	# rows preloaded in %ymm16..%ymm21.
	vpxor	%ymm14,%ymm2,%ymm2		# ^= D[0..0] from Theta
	vprolvq	%ymm16,%ymm2,%ymm2

	vpternlogq	$0x96,%ymm7,%ymm15,%ymm3	# ^= D[1..4] from Theta
	vprolvq	%ymm18,%ymm3,%ymm3

	vpternlogq	$0x96,%ymm7,%ymm15,%ymm4	# ^= D[1..4] from Theta
	vprolvq	%ymm19,%ymm4,%ymm4

	vpternlogq	$0x96,%ymm7,%ymm15,%ymm5	# ^= D[1..4] from Theta
	vprolvq	%ymm20,%ymm5,%ymm5

	vpermq	$0b10001101,%ymm2,%ymm10	# %ymm2 -> future %ymm3
	vpermq	$0b10001101,%ymm3,%ymm11	# %ymm3 -> future %ymm4
	vpternlogq	$0x96,%ymm7,%ymm15,%ymm6	# ^= D[1..4] from Theta
	vprolvq	%ymm21,%ymm6,%ymm8		# %ymm6 -> future %ymm1

	vpermq	$0b00011011,%ymm4,%ymm12	# %ymm4 -> future %ymm5
	vpermq	$0b01110010,%ymm5,%ymm13	# %ymm5 -> future %ymm6
	vpternlogq	$0x96,%ymm7,%ymm15,%ymm1	# ^= D[1..4] from Theta
	vprolvq	%ymm17,%ymm1,%ymm9		# %ymm1 -> future %ymm2

	######################################### Chi
	# vpternlogq $0xC6 computes a ^ (~b & c), the Chi step, per lane.
	# The vpblendd chains gather the required [i][j] lanes into place
	# (lane positions noted per 64-bit slot, high to low).
	vpblendd	$0b00001100,%ymm13,%ymm9,%ymm3		# [4][4] [2][0]
	vpblendd	$0b00001100,%ymm9,%ymm11,%ymm15		# [4][0] [2][1]
	vpblendd	$0b00001100,%ymm11,%ymm10,%ymm5		# [4][2] [2][4]
	vpblendd	$0b00001100,%ymm10,%ymm9,%ymm14		# [4][3] [2][0]
	vpblendd	$0b00110000,%ymm11,%ymm3,%ymm3		# [1][3] [4][4] [2][0]
	vpblendd	$0b00110000,%ymm12,%ymm15,%ymm15	# [1][4] [4][0] [2][1]
	vpblendd	$0b00110000,%ymm9,%ymm5,%ymm5		# [1][0] [4][2] [2][4]
	vpblendd	$0b00110000,%ymm13,%ymm14,%ymm14	# [1][1] [4][3] [2][0]
	vpblendd	$0b11000000,%ymm12,%ymm3,%ymm3		# [3][2] [1][3] [4][4] [2][0]
	vpblendd	$0b11000000,%ymm13,%ymm15,%ymm15	# [3][3] [1][4] [4][0] [2][1]
	vpblendd	$0b11000000,%ymm13,%ymm5,%ymm5		# [3][3] [1][0] [4][2] [2][4]
	vpblendd	$0b11000000,%ymm11,%ymm14,%ymm14	# [3][4] [1][1] [4][3] [2][0]
	vpternlogq	$0xC6,%ymm15,%ymm10,%ymm3	# [3][1] [1][2] [4][3] [2][4]
	vpternlogq	$0xC6,%ymm14,%ymm12,%ymm5	# [3][2] [1][4] [4][1] [2][3]

	vpsrldq	$8,%ymm8,%ymm7
	vpandn	%ymm7,%ymm8,%ymm7	# targeting [0][0] [0][0] [0][0] [0][0]

	vpblendd	$0b00001100,%ymm9,%ymm12,%ymm6		# [4][0] [2][3]
	vpblendd	$0b00001100,%ymm12,%ymm10,%ymm15	# [4][1] [2][4]
	vpblendd	$0b00110000,%ymm10,%ymm6,%ymm6		# [1][2] [4][0] [2][3]
	vpblendd	$0b00110000,%ymm11,%ymm15,%ymm15	# [1][3] [4][1] [2][4]
	vpblendd	$0b11000000,%ymm11,%ymm6,%ymm6		# [3][4] [1][2] [4][0] [2][3]
	vpblendd	$0b11000000,%ymm9,%ymm15,%ymm15		# [3][0] [1][3] [4][1] [2][4]
	vpternlogq	$0xC6,%ymm15,%ymm13,%ymm6	# [3][3] [1][1] [4][4] [2][2]

	vpermq	$0b00011110,%ymm8,%ymm4		# [0][1] [0][2] [0][4] [0][3]
	vpblendd	$0b00110000,%ymm0,%ymm4,%ymm15	# [0][1] [0][0] [0][4] [0][3]
	vpermq	$0b00111001,%ymm8,%ymm1		# [0][1] [0][4] [0][3] [0][2]
	vpblendd	$0b11000000,%ymm0,%ymm1,%ymm1	# [0][0] [0][4] [0][3] [0][2]

	vpblendd	$0b00001100,%ymm12,%ymm11,%ymm2		# [4][1] [2][1]
	vpblendd	$0b00001100,%ymm11,%ymm13,%ymm14	# [4][2] [2][2]
	vpblendd	$0b00110000,%ymm13,%ymm2,%ymm2		# [1][1] [4][1] [2][1]
	vpblendd	$0b00110000,%ymm10,%ymm14,%ymm14	# [1][2] [4][2] [2][2]
	vpblendd	$0b11000000,%ymm10,%ymm2,%ymm2		# [3][1] [1][1] [4][1] [2][1]
	vpblendd	$0b11000000,%ymm12,%ymm14,%ymm14	# [3][2] [1][2] [4][2] [2][2]
	vpternlogq	$0xC6,%ymm14,%ymm9,%ymm2	# [3][0] [1][0] [4][0] [2][0]

	vpermq	$0b00000000,%ymm7,%ymm7	# [0][0] [0][0] [0][0] [0][0]
	vpermq	$0b00011011,%ymm3,%ymm3	# post-Chi shuffle
	vpermq	$0b10001101,%ymm5,%ymm5
	vpermq	$0b01110010,%ymm6,%ymm6

	vpblendd	$0b00001100,%ymm10,%ymm13,%ymm4		# [4][3] [2][2]
	vpblendd	$0b00001100,%ymm13,%ymm12,%ymm14	# [4][4] [2][3]
	vpblendd	$0b00110000,%ymm12,%ymm4,%ymm4		# [1][4] [4][3] [2][2]
	vpblendd	$0b00110000,%ymm9,%ymm14,%ymm14		# [1][0] [4][4] [2][3]
	vpblendd	$0b11000000,%ymm9,%ymm4,%ymm4		# [3][0] [1][4] [4][3] [2][2]
	vpblendd	$0b11000000,%ymm10,%ymm14,%ymm14	# [3][1] [1][0] [4][4] [2][3]

	vpternlogq	$0xC6,%ymm15,%ymm8,%ymm1	# [0][4] [0][3] [0][2] [0][1]
	vpternlogq	$0xC6,%ymm14,%ymm11,%ymm4	# [3][4] [1][3] [4][2] [2][1]

	######################################### Iota
	# XOR the round constant (broadcast 4x in the iotas table) into A[0][0].
	vpternlogq	$0x96,(%r10),%ymm7,%ymm0
	lea	32(%r10),%r10		# advance to next round constant

	dec	%eax
	jnz	.Loop_avx512vl

	ret
.size	__KeccakF1600,.-__KeccakF1600

.globl	SHA3_absorb
.type	SHA3_absorb,@function
.align	32
#-----------------------------------------------------------------------
# size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp,
#                    size_t len, size_t bsz)
# ABI:   SysV AMD64 — %rdi = A (state), %rsi = inp, %rdx = len,
#        %rcx = bsz (rate in bytes; multiple of 8 — divided by 8 below).
# Out:   %rax = bytes of input left over (< bsz), i.e. len mod bsz
#        semantics; leftover input is NOT absorbed.
# Absorbs as many whole bsz-byte blocks as fit in len, running the
# permutation after each block, then writes the state back.
# Uses a 32-byte-aligned on-stack transfer area to scatter input lanes
# into the interleaved register layout before XOR-ing into the state.
#-----------------------------------------------------------------------
SHA3_absorb:
	mov	%rsp,%r11		# save caller %rsp; restored at exit

	lea	-240(%rsp),%rsp		# reserve transfer area...
	and	$-32,%rsp		# ...and align it for vmovdqa

	lea	96(%rdi),%rdi		# bias pointers so disp fits in a byte
	lea	96(%rsi),%rsi
	lea	96(%rsp),%r10
	lea	rhotates_left(%rip),%r8

	vzeroupper

	vpbroadcastq	-96(%rdi),%ymm0	# load A[5][5]
	vmovdqu	8+32*0-96(%rdi),%ymm1
	vmovdqu	8+32*1-96(%rdi),%ymm2
	vmovdqu	8+32*2-96(%rdi),%ymm3
	vmovdqu	8+32*3-96(%rdi),%ymm4
	vmovdqu	8+32*4-96(%rdi),%ymm5
	vmovdqu	8+32*5-96(%rdi),%ymm6

	vmovdqa64	0*32(%r8),%ymm16	# load "rhotate" indices
	vmovdqa64	1*32(%r8),%ymm17
	vmovdqa64	2*32(%r8),%ymm18
	vmovdqa64	3*32(%r8),%ymm19
	vmovdqa64	4*32(%r8),%ymm20
	vmovdqa64	5*32(%r8),%ymm21

	vpxor	%ymm7,%ymm7,%ymm7
	vmovdqa	%ymm7,32*2-96(%r10)	# zero transfer area on stack
	vmovdqa	%ymm7,32*3-96(%r10)
	vmovdqa	%ymm7,32*4-96(%r10)
	vmovdqa	%ymm7,32*5-96(%r10)
	vmovdqa	%ymm7,32*6-96(%r10)

.Loop_absorb_avx512vl:
	mov	%rcx,%rax
	sub	%rcx,%rdx		# len -= bsz
	jc	.Ldone_absorb_avx512vl	# less than a full block left

	shr	$3,%eax			# %eax = bsz/8 = lanes per block
	vpbroadcastq	0-96(%rsi),%ymm7	# input lane 0
	vmovdqu	8-96(%rsi),%ymm8		# input lanes 1..4
	sub	$4,%eax			# lanes 0..4 consumed above...
	dec	%eax			# ...so count down from bsz/8 - 5
	jz	.Labsorved_avx512vl
	# Scatter remaining input lanes (5..24) into the transfer area at
	# the "jagged" offsets matching the interleaved register layout.
	# NOTE(review): label is spelled "absorved" in the generated code.
	mov	8*5-96(%rsi),%r8
	mov	%r8,80-96(%r10)
	dec	%eax
	jz	.Labsorved_avx512vl
	mov	8*6-96(%rsi),%r8
	mov	%r8,192-96(%r10)
	dec	%eax
	jz	.Labsorved_avx512vl
	mov	8*7-96(%rsi),%r8
	mov	%r8,104-96(%r10)
	dec	%eax
	jz	.Labsorved_avx512vl
	mov	8*8-96(%rsi),%r8
	mov	%r8,144-96(%r10)
	dec	%eax
	jz	.Labsorved_avx512vl
	mov	8*9-96(%rsi),%r8
	mov	%r8,184-96(%r10)
	dec	%eax
	jz	.Labsorved_avx512vl
	mov	8*10-96(%rsi),%r8
	mov	%r8,64-96(%r10)
	dec	%eax
	jz	.Labsorved_avx512vl
	mov	8*11-96(%rsi),%r8
	mov	%r8,128-96(%r10)
	dec	%eax
	jz	.Labsorved_avx512vl
	mov	8*12-96(%rsi),%r8
	mov	%r8,200-96(%r10)
	dec	%eax
	jz	.Labsorved_avx512vl
	mov	8*13-96(%rsi),%r8
	mov	%r8,176-96(%r10)
	dec	%eax
	jz	.Labsorved_avx512vl
	mov	8*14-96(%rsi),%r8
	mov	%r8,120-96(%r10)
	dec	%eax
	jz	.Labsorved_avx512vl
	mov	8*15-96(%rsi),%r8
	mov	%r8,88-96(%r10)
	dec	%eax
	jz	.Labsorved_avx512vl
	mov	8*16-96(%rsi),%r8
	mov	%r8,96-96(%r10)
	dec	%eax
	jz	.Labsorved_avx512vl
	mov	8*17-96(%rsi),%r8
	mov	%r8,168-96(%r10)
	dec	%eax
	jz	.Labsorved_avx512vl
	mov	8*18-96(%rsi),%r8
	mov	%r8,208-96(%r10)
	dec	%eax
	jz	.Labsorved_avx512vl
	mov	8*19-96(%rsi),%r8
	mov	%r8,152-96(%r10)
	dec	%eax
	jz	.Labsorved_avx512vl
	mov	8*20-96(%rsi),%r8
	mov	%r8,72-96(%r10)
	dec	%eax
	jz	.Labsorved_avx512vl
	mov	8*21-96(%rsi),%r8
	mov	%r8,160-96(%r10)
	dec	%eax
	jz	.Labsorved_avx512vl
	mov	8*22-96(%rsi),%r8
	mov	%r8,136-96(%r10)
	dec	%eax
	jz	.Labsorved_avx512vl
	mov	8*23-96(%rsi),%r8
	mov	%r8,112-96(%r10)
	dec	%eax
	jz	.Labsorved_avx512vl
	mov	8*24-96(%rsi),%r8
	mov	%r8,216-96(%r10)
.Labsorved_avx512vl:
	lea	(%rsi,%rcx),%rsi	# advance input pointer by bsz

	# XOR the gathered block into the state, then permute.
	vpxor	%ymm7,%ymm0,%ymm0
	vpxor	%ymm8,%ymm1,%ymm1
	vpxor	32*2-96(%r10),%ymm2,%ymm2
	vpxor	32*3-96(%r10),%ymm3,%ymm3
	vpxor	32*4-96(%r10),%ymm4,%ymm4
	vpxor	32*5-96(%r10),%ymm5,%ymm5
	vpxor	32*6-96(%r10),%ymm6,%ymm6

	call	__KeccakF1600

	lea	96(%rsp),%r10		# reset transfer-area pointer
	jmp	.Loop_absorb_avx512vl

.Ldone_absorb_avx512vl:
	# Write the state back to A[][] in memory.
	vmovq	%xmm0,-96(%rdi)
	vmovdqu	%ymm1,8+32*0-96(%rdi)
	vmovdqu	%ymm2,8+32*1-96(%rdi)
	vmovdqu	%ymm3,8+32*2-96(%rdi)
	vmovdqu	%ymm4,8+32*3-96(%rdi)
	vmovdqu	%ymm5,8+32*4-96(%rdi)
	vmovdqu	%ymm6,8+32*5-96(%rdi)

	vzeroupper			# per SysV ABI before returning to SSE/C code

	lea	(%r11),%rsp		# restore caller stack pointer
	lea	(%rdx,%rcx),%rax	# return value
	ret
.size	SHA3_absorb,.-SHA3_absorb

.globl	SHA3_squeeze
.type	SHA3_squeeze,@function
.align	32
#-----------------------------------------------------------------------
# void SHA3_squeeze(uint64_t A[5][5], unsigned char *out,
#                   size_t len, size_t bsz)
# ABI:   SysV AMD64 — %rdi = A (state), %rsi = out, %rdx = len,
#        %rcx = bsz (rate in bytes; divided by 8 below into lanes).
# Emits len bytes of output, reading lanes from A at "jagged" offsets
# (the X-120 displacements) that undo the interleaved state layout;
# re-permutes the state whenever a full rate's worth has been emitted.
# %eax counts lanes remaining in the current block; the je after each
# store tests ZF still set by `sub $8,%rdx` (mov/lea preserve flags).
#-----------------------------------------------------------------------
SHA3_squeeze:
	mov	%rsp,%r11		# save caller %rsp (restored at exit)

	lea	96(%rdi),%rdi		# bias state pointer as in SHA3_absorb
	lea	rhotates_left(%rip),%r8
	shr	$3,%rcx			# %rcx = bsz/8 = lanes per block

	vzeroupper

	vpbroadcastq	-96(%rdi),%ymm0
	vpxor	%ymm7,%ymm7,%ymm7
	vmovdqu	8+32*0-96(%rdi),%ymm1
	vmovdqu	8+32*1-96(%rdi),%ymm2
	vmovdqu	8+32*2-96(%rdi),%ymm3
	vmovdqu	8+32*3-96(%rdi),%ymm4
	vmovdqu	8+32*4-96(%rdi),%ymm5
	vmovdqu	8+32*5-96(%rdi),%ymm6

	vmovdqa64	0*32(%r8),%ymm16	# load "rhotate" indices
	vmovdqa64	1*32(%r8),%ymm17
	vmovdqa64	2*32(%r8),%ymm18
	vmovdqa64	3*32(%r8),%ymm19
	vmovdqa64	4*32(%r8),%ymm20
	vmovdqa64	5*32(%r8),%ymm21

	mov	%rcx,%rax		# lanes left in current block

.Loop_squeeze_avx512vl:
	# One group per output lane: load lane, check remaining len,
	# store 8 bytes (or fall to tail for a partial final word).
	mov	0-96(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl	# ZF from the sub: len hit exactly 0
	dec	%eax
	je	.Lextend_output_avx512vl	# block exhausted: permute again
	mov	32-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	40-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	48-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	56-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	80-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	192-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	104-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	144-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	184-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	64-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	128-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	200-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	176-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	120-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	88-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	96-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	168-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	208-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	152-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	72-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	160-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	136-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	112-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	216-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	# NOTE(review): offset -120 lies 24 bytes before the state base at
	# -96(%rdi); this load precedes an unconditional fall-through into
	# .Lextend_output and its value is reloaded at the loop top —
	# verify against the perlasm generator if touched.
	mov	-120(%rdi),%r8
.Lextend_output_avx512vl:
	# Block exhausted but more output wanted: permute and refresh A[][].
	call	__KeccakF1600

	vmovq	%xmm0,-96(%rdi)
	vmovdqu	%ymm1,8+32*0-96(%rdi)
	vmovdqu	%ymm2,8+32*1-96(%rdi)
	vmovdqu	%ymm3,8+32*2-96(%rdi)
	vmovdqu	%ymm4,8+32*3-96(%rdi)
	vmovdqu	%ymm5,8+32*4-96(%rdi)
	vmovdqu	%ymm6,8+32*5-96(%rdi)

	mov	%rcx,%rax		# reset lane count for the new block
	jmp	.Loop_squeeze_avx512vl


.Ltail_squeeze_avx512vl:
	add	$8,%rdx			# undo the sub: %rdx = 1..7 bytes left
.Loop_tail_avx512vl:
	mov	%r8b,(%rsi)		# emit remaining bytes of %r8 one at a time
	lea	1(%rsi),%rsi
	shr	$8,%r8
	dec	%rdx
	jnz	.Loop_tail_avx512vl

.Ldone_squeeze_avx512vl:
	vzeroupper			# per SysV ABI before returning to SSE/C code

	lea	(%r11),%rsp		# restore caller stack pointer
	ret
.size	SHA3_squeeze,.-SHA3_squeeze

.section .rodata
.align	64
# Per-register rotation counts for the Rho step, one row per %ymm16..%ymm21
# (row order matches the interleaved lane layout noted per row).
rhotates_left:
	.quad	3,	18,	36,	41	# [2][0] [4][0] [1][0] [3][0]
	.quad	1,	62,	28,	27	# [0][1] [0][2] [0][3] [0][4]
	.quad	45,	6,	56,	39	# [3][1] [1][2] [4][3] [2][4]
	.quad	10,	61,	55,	8	# [2][1] [4][2] [1][3] [3][4]
	.quad	2,	15,	25,	20	# [4][1] [3][2] [2][3] [1][4]
	.quad	44,	43,	21,	14	# [1][1] [2][2] [3][3] [4][4]
# 24 Keccak round constants, each broadcast across four 64-bit slots so a
# single 32-byte load feeds the Iota vpternlogq.
iotas:
	.quad	0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001
	.quad	0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082
	.quad	0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a
	.quad	0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000
	.quad	0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b
	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
	.quad	0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009
	.quad	0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a
	.quad	0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088
	.quad	0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009
	.quad	0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a
	.quad	0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b
	.quad	0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b
	.quad	0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089
	.quad	0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003
	.quad	0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002
	.quad	0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080
	.quad	0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a
	.quad	0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a
	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
	.quad	0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080
	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
	.quad	0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008

.asciz	"Keccak-1600 absorb and squeeze for AVX512VL, CRYPTOGAMS by <appro (at) openssl.org>"