.text

.type	__KeccakF1600,@function
.align	32
__KeccakF1600:
	lea	rhotates_left+96(%rip),%r8
	lea	rhotates_right+96(%rip),%r9
	lea	iotas(%rip),%r10
	mov	$24,%eax
	jmp	.Loop_avx2

.align	32
.Loop_avx2:
######################################### Theta
	vpshufd	$0b01001110,%ymm2,%ymm13
	vpxor	%ymm3,%ymm5,%ymm12
	vpxor	%ymm6,%ymm4,%ymm9
	vpxor	%ymm1,%ymm12,%ymm12
	vpxor	%ymm9,%ymm12,%ymm12		# C[1..4]

	vpermq	$0b10010011,%ymm12,%ymm11
	vpxor	%ymm2,%ymm13,%ymm13
	vpermq	$0b01001110,%ymm13,%ymm7

	vpsrlq	$63,%ymm12,%ymm8
	vpaddq	%ymm12,%ymm12,%ymm9
	vpor	%ymm9,%ymm8,%ymm8		# ROL64(C[1..4],1)

	vpermq	$0b00111001,%ymm8,%ymm15
	vpxor	%ymm11,%ymm8,%ymm14
	vpermq	$0b00000000,%ymm14,%ymm14	# D[0..0] = ROL64(C[1],1) ^ C[4]

	vpxor	%ymm0,%ymm13,%ymm13
	vpxor	%ymm7,%ymm13,%ymm13		# C[0..0]

	vpsrlq	$63,%ymm13,%ymm7
	vpaddq	%ymm13,%ymm13,%ymm8
	vpor	%ymm7,%ymm8,%ymm8		# ROL64(C[0..0],1)

	vpxor	%ymm14,%ymm2,%ymm2		# ^= D[0..0]
	vpxor	%ymm14,%ymm0,%ymm0		# ^= D[0..0]

	vpblendd	$0b11000000,%ymm8,%ymm15,%ymm15
	vpblendd	$0b00000011,%ymm13,%ymm11,%ymm11
	vpxor	%ymm11,%ymm15,%ymm15		# D[1..4] = ROL64(C[2..4,0],1) ^ C[0..3]

######################################### Rho + Pi + pre-Chi shuffle
	vpsllvq	0*32-96(%r8),%ymm2,%ymm10
	vpsrlvq	0*32-96(%r9),%ymm2,%ymm2
	vpor	%ymm10,%ymm2,%ymm2

	vpxor	%ymm15,%ymm3,%ymm3		# ^= D[1..4] from Theta
	vpsllvq	2*32-96(%r8),%ymm3,%ymm11
	vpsrlvq	2*32-96(%r9),%ymm3,%ymm3
	vpor	%ymm11,%ymm3,%ymm3

	vpxor	%ymm15,%ymm4,%ymm4		# ^= D[1..4] from Theta
	vpsllvq	3*32-96(%r8),%ymm4,%ymm12
	vpsrlvq	3*32-96(%r9),%ymm4,%ymm4
	vpor	%ymm12,%ymm4,%ymm4

	vpxor	%ymm15,%ymm5,%ymm5		# ^= D[1..4] from Theta
	vpsllvq	4*32-96(%r8),%ymm5,%ymm13
	vpsrlvq	4*32-96(%r9),%ymm5,%ymm5
	vpor	%ymm13,%ymm5,%ymm5

	vpxor	%ymm15,%ymm6,%ymm6		# ^= D[1..4] from Theta
	vpermq	$0b10001101,%ymm2,%ymm10	# %ymm2 -> future %ymm3
	vpermq	$0b10001101,%ymm3,%ymm11	# %ymm3 -> future %ymm4
	vpsllvq	5*32-96(%r8),%ymm6,%ymm14
	vpsrlvq	5*32-96(%r9),%ymm6,%ymm8
	vpor	%ymm14,%ymm8,%ymm8		# %ymm6 -> future %ymm1

	vpxor	%ymm15,%ymm1,%ymm1		# ^= D[1..4] from Theta
	vpermq	$0b00011011,%ymm4,%ymm12	# %ymm4 -> future %ymm5
	vpermq	$0b01110010,%ymm5,%ymm13	# %ymm5 -> future %ymm6
	vpsllvq	1*32-96(%r8),%ymm1,%ymm15
	vpsrlvq	1*32-96(%r9),%ymm1,%ymm9
	vpor	%ymm15,%ymm9,%ymm9		# %ymm1 -> future %ymm2
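
# Annotation (informal recap of the Keccak-f[1600] Theta/Rho/Pi steps above,
# in this file's lane notation):
#
#	C[x] = A[x][0] ^ A[x][1] ^ A[x][2] ^ A[x][3] ^ A[x][4]
#	D[x] = C[x-1] ^ ROL64(C[x+1],1)
#	A[x][y] ^= D[x]				# Theta
#	A[x][y] = ROL64(A[x][y],rho[x][y])	# Rho
#
# AVX2 has no 64-bit rotate instruction, so each ROL64 by a per-lane constant
# is a vpsllvq/vpsrlvq pair driven by the rhotates_left/rhotates_right tables
# (which hold n and 64-n), combined with vpor; the rotates of C by 1 use
# vpaddq (x+x == x<<1) plus vpsrlq $63 the same way.  The vpermq shuffles
# fold Pi's lane permutation into the data movement that the Chi block below
# expects.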

######################################### Chi
	vpsrldq	$8,%ymm8,%ymm14
	vpandn	%ymm14,%ymm8,%ymm7	# targeting [0][0] [0][0] [0][0] [0][0]

	vpblendd	$0b00001100,%ymm13,%ymm9,%ymm3		#               [4][4] [2][0]
	vpblendd	$0b00001100,%ymm9,%ymm11,%ymm15		#               [4][0] [2][1]
	vpblendd	$0b00001100,%ymm11,%ymm10,%ymm5		#               [4][2] [2][4]
	vpblendd	$0b00001100,%ymm10,%ymm9,%ymm14		#               [4][3] [2][0]
	vpblendd	$0b00110000,%ymm11,%ymm3,%ymm3		#        [1][3] [4][4] [2][0]
	vpblendd	$0b00110000,%ymm12,%ymm15,%ymm15	#        [1][4] [4][0] [2][1]
	vpblendd	$0b00110000,%ymm9,%ymm5,%ymm5		#        [1][0] [4][2] [2][4]
	vpblendd	$0b00110000,%ymm13,%ymm14,%ymm14	#        [1][1] [4][3] [2][0]
	vpblendd	$0b11000000,%ymm12,%ymm3,%ymm3		# [3][2] [1][3] [4][4] [2][0]
	vpblendd	$0b11000000,%ymm13,%ymm15,%ymm15	# [3][3] [1][4] [4][0] [2][1]
	vpblendd	$0b11000000,%ymm13,%ymm5,%ymm5		# [3][3] [1][0] [4][2] [2][4]
	vpblendd	$0b11000000,%ymm11,%ymm14,%ymm14	# [3][4] [1][1] [4][3] [2][0]
	vpandn	%ymm15,%ymm3,%ymm3	# targeting [3][1] [1][2] [4][3] [2][4]
	vpandn	%ymm14,%ymm5,%ymm5	# targeting [3][2] [1][4] [4][1] [2][3]

	vpblendd	$0b00001100,%ymm9,%ymm12,%ymm6		#               [4][0] [2][3]
	vpblendd	$0b00001100,%ymm12,%ymm10,%ymm15	#               [4][1] [2][4]
	vpxor	%ymm10,%ymm3,%ymm3
	vpblendd	$0b00110000,%ymm10,%ymm6,%ymm6		#        [1][2] [4][0] [2][3]
	vpblendd	$0b00110000,%ymm11,%ymm15,%ymm15	#        [1][3] [4][1] [2][4]
	vpxor	%ymm12,%ymm5,%ymm5
	vpblendd	$0b11000000,%ymm11,%ymm6,%ymm6		# [3][4] [1][2] [4][0] [2][3]
	vpblendd	$0b11000000,%ymm9,%ymm15,%ymm15		# [3][0] [1][3] [4][1] [2][4]
	vpandn	%ymm15,%ymm6,%ymm6	# targeting [3][3] [1][1] [4][4] [2][2]
	vpxor	%ymm13,%ymm6,%ymm6

	vpermq	$0b00011110,%ymm8,%ymm4			# [0][1] [0][2] [0][4] [0][3]
	vpblendd	$0b00110000,%ymm0,%ymm4,%ymm15		# [0][1] [0][0] [0][4] [0][3]
	vpermq	$0b00111001,%ymm8,%ymm1			# [0][1] [0][4] [0][3] [0][2]
	vpblendd	$0b11000000,%ymm0,%ymm1,%ymm1		# [0][0] [0][4] [0][3] [0][2]
	vpandn	%ymm15,%ymm1,%ymm1	# targeting [0][4] [0][3] [0][2] [0][1]

	vpblendd	$0b00001100,%ymm12,%ymm11,%ymm2		#               [4][1] [2][1]
	vpblendd	$0b00001100,%ymm11,%ymm13,%ymm14	#               [4][2] [2][2]
	vpblendd	$0b00110000,%ymm13,%ymm2,%ymm2		#        [1][1] [4][1] [2][1]
	vpblendd	$0b00110000,%ymm10,%ymm14,%ymm14	#        [1][2] [4][2] [2][2]
	vpblendd	$0b11000000,%ymm10,%ymm2,%ymm2		# [3][1] [1][1] [4][1] [2][1]
	vpblendd	$0b11000000,%ymm12,%ymm14,%ymm14	# [3][2] [1][2] [4][2] [2][2]
	vpandn	%ymm14,%ymm2,%ymm2	# targeting [3][0] [1][0] [4][0] [2][0]
	vpxor	%ymm9,%ymm2,%ymm2

	vpermq	$0b00000000,%ymm7,%ymm7			# [0][0] [0][0] [0][0] [0][0]
	vpermq	$0b00011011,%ymm3,%ymm3			# post-Chi shuffle
	vpermq	$0b10001101,%ymm5,%ymm5
	vpermq	$0b01110010,%ymm6,%ymm6

	vpblendd	$0b00001100,%ymm10,%ymm13,%ymm4		#               [4][3] [2][2]
	vpblendd	$0b00001100,%ymm13,%ymm12,%ymm14	#               [4][4] [2][3]
	vpblendd	$0b00110000,%ymm12,%ymm4,%ymm4		#        [1][4] [4][3] [2][2]
	vpblendd	$0b00110000,%ymm9,%ymm14,%ymm14		#        [1][0] [4][4] [2][3]
	vpblendd	$0b11000000,%ymm9,%ymm4,%ymm4		# [3][0] [1][4] [4][3] [2][2]
	vpblendd	$0b11000000,%ymm10,%ymm14,%ymm14	# [3][1] [1][0] [4][4] [2][3]
	vpandn	%ymm14,%ymm4,%ymm4	# targeting [3][4] [1][3] [4][2] [2][1]

	vpxor	%ymm7,%ymm0,%ymm0
	vpxor	%ymm8,%ymm1,%ymm1
	vpxor	%ymm11,%ymm4,%ymm4
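
# Annotation: Chi combines each lane with the next two lanes of its row,
# informally A[x][y] ^= ~A[x+1][y] & A[x+2][y].  The vpblendd chains above
# gather the "x+1" operands into one temporary and the "x+2" operands into
# another; vpandn then delivers ~a & b in a single instruction, and the final
# vpxor applies the result.  The "targeting" comments name the state lanes
# each vpandn produces.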

######################################### Iota
	vpxor	(%r10),%ymm0,%ymm0
	lea	32(%r10),%r10

	dec	%eax
	jnz	.Loop_avx2

	ret
.size	__KeccakF1600,.-__KeccakF1600

.globl	SHA3_absorb
.type	SHA3_absorb,@function
.align	32
SHA3_absorb:
	mov	%rsp,%r11

	lea	-240(%rsp),%rsp
	and	$-32,%rsp

	lea	96(%rdi),%rdi
	lea	96(%rsi),%rsi
	lea	96(%rsp),%r10

	vzeroupper

	vpbroadcastq	-96(%rdi),%ymm0		# load A[5][5]
	vmovdqu	8+32*0-96(%rdi),%ymm1
	vmovdqu	8+32*1-96(%rdi),%ymm2
	vmovdqu	8+32*2-96(%rdi),%ymm3
	vmovdqu	8+32*3-96(%rdi),%ymm4
	vmovdqu	8+32*4-96(%rdi),%ymm5
	vmovdqu	8+32*5-96(%rdi),%ymm6

	vpxor	%ymm7,%ymm7,%ymm7
	vmovdqa	%ymm7,32*2-96(%r10)		# zero transfer area on stack
	vmovdqa	%ymm7,32*3-96(%r10)
	vmovdqa	%ymm7,32*4-96(%r10)
	vmovdqa	%ymm7,32*5-96(%r10)
	vmovdqa	%ymm7,32*6-96(%r10)
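
# Annotation: SHA3_absorb appears to correspond to the C-side prototype used
# by OpenSSL's keccak1600 glue code (hedged; the C caller is authoritative):
#
#	size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp,
#	                   size_t len, size_t r);
#
# so %rdi is the state, %rsi the input, %rdx the length and %rcx the rate r
# in bytes.  Each pass through .Loop_absorb_avx2 consumes one r-byte block:
# lanes 0-4 are picked up directly into %ymm7/%ymm8, while the remaining
# lanes are scattered through the zeroed transfer area above at offsets that
# match the interleaved lane layout of %ymm0-%ymm6.  Input that does not fill
# a whole block is left untouched and its byte count is returned in %rax.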

.Loop_absorb_avx2:
	mov	%rcx,%rax
	sub	%rcx,%rdx
	jc	.Ldone_absorb_avx2

	shr	$3,%eax
	vpbroadcastq	0-96(%rsi),%ymm7
	vmovdqu	8-96(%rsi),%ymm8
	sub	$4,%eax
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*5-96(%rsi),%r8
	mov	%r8,80-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*6-96(%rsi),%r8
	mov	%r8,192-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*7-96(%rsi),%r8
	mov	%r8,104-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*8-96(%rsi),%r8
	mov	%r8,144-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*9-96(%rsi),%r8
	mov	%r8,184-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*10-96(%rsi),%r8
	mov	%r8,64-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*11-96(%rsi),%r8
	mov	%r8,128-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*12-96(%rsi),%r8
	mov	%r8,200-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*13-96(%rsi),%r8
	mov	%r8,176-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*14-96(%rsi),%r8
	mov	%r8,120-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*15-96(%rsi),%r8
	mov	%r8,88-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*16-96(%rsi),%r8
	mov	%r8,96-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*17-96(%rsi),%r8
	mov	%r8,168-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*18-96(%rsi),%r8
	mov	%r8,208-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*19-96(%rsi),%r8
	mov	%r8,152-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*20-96(%rsi),%r8
	mov	%r8,72-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*21-96(%rsi),%r8
	mov	%r8,160-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*22-96(%rsi),%r8
	mov	%r8,136-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*23-96(%rsi),%r8
	mov	%r8,112-96(%r10)
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*24-96(%rsi),%r8
	mov	%r8,216-96(%r10)
.Labsorved_avx2:
	lea	(%rsi,%rcx),%rsi

	vpxor	%ymm7,%ymm0,%ymm0
	vpxor	%ymm8,%ymm1,%ymm1
	vpxor	32*2-96(%r10),%ymm2,%ymm2
	vpxor	32*3-96(%r10),%ymm3,%ymm3
	vpxor	32*4-96(%r10),%ymm4,%ymm4
	vpxor	32*5-96(%r10),%ymm5,%ymm5
	vpxor	32*6-96(%r10),%ymm6,%ymm6

	call	__KeccakF1600

	lea	96(%rsp),%r10
	jmp	.Loop_absorb_avx2

.Ldone_absorb_avx2:
	vmovq	%xmm0,-96(%rdi)
	vmovdqu	%ymm1,8+32*0-96(%rdi)
	vmovdqu	%ymm2,8+32*1-96(%rdi)
	vmovdqu	%ymm3,8+32*2-96(%rdi)
	vmovdqu	%ymm4,8+32*3-96(%rdi)
	vmovdqu	%ymm5,8+32*4-96(%rdi)
	vmovdqu	%ymm6,8+32*5-96(%rdi)

	vzeroupper

	lea	(%r11),%rsp
	lea	(%rdx,%rcx),%rax		# return value
	ret
.size	SHA3_absorb,.-SHA3_absorb
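
# Annotation: SHA3_squeeze presumably matches the companion prototype
# (hedged, as above):
#
#	void SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len,
#	                  size_t r);
#
# %rdi is the state, %rsi the output, %rdx the requested byte count and %rcx
# the rate.  It emits up to r bytes per block, one 64-bit lane at a time,
# re-running __KeccakF1600 whenever a full block has been written and more
# output is still wanted.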

.globl	SHA3_squeeze
.type	SHA3_squeeze,@function
.align	32
SHA3_squeeze:
	mov	%rsp,%r11

	lea	96(%rdi),%rdi
	shr	$3,%rcx

	vzeroupper

	vpbroadcastq	-96(%rdi),%ymm0
	vpxor	%ymm7,%ymm7,%ymm7
	vmovdqu	8+32*0-96(%rdi),%ymm1
	vmovdqu	8+32*1-96(%rdi),%ymm2
	vmovdqu	8+32*2-96(%rdi),%ymm3
	vmovdqu	8+32*3-96(%rdi),%ymm4
	vmovdqu	8+32*4-96(%rdi),%ymm5
	vmovdqu	8+32*5-96(%rdi),%ymm6

	mov	%rcx,%rax
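
# Annotation: the unrolled loop below walks the 25 lanes in canonical flat
# A[5][5] order.  The irregular-looking source offsets (80, 192, 104, ...)
# undo the interleaved layout the lanes occupy inside %ymm0-%ymm6; note that
# the sequence is the same jagged one the absorb scatter above stores to.
# %eax counts lanes left in the current block and %rdx output bytes still
# owed; a final partial lane drops into .Ltail_squeeze_avx2.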

.Loop_squeeze_avx2:
	mov	0-96(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	32-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	40-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	48-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	56-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	80-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	192-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	104-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	144-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	184-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	64-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	128-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	200-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	176-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	120-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	88-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	96-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	168-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	208-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	152-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	72-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	160-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	136-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	112-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	216-120(%rdi),%r8
	sub	$8,%rdx
	jc	.Ltail_squeeze_avx2
	mov	%r8,(%rsi)
	lea	8(%rsi),%rsi
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	-120(%rdi),%r8
.Lextend_output_avx2:
	call	__KeccakF1600

	vmovq	%xmm0,-96(%rdi)
	vmovdqu	%ymm1,8+32*0-96(%rdi)
	vmovdqu	%ymm2,8+32*1-96(%rdi)
	vmovdqu	%ymm3,8+32*2-96(%rdi)
	vmovdqu	%ymm4,8+32*3-96(%rdi)
	vmovdqu	%ymm5,8+32*4-96(%rdi)
	vmovdqu	%ymm6,8+32*5-96(%rdi)

	mov	%rcx,%rax
	jmp	.Loop_squeeze_avx2
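
# Annotation: jc branches here when fewer than 8 output bytes remain.  The
# sub $8 above overshot, so add the 8 back and emit the low 1-7 bytes of the
# current lane one at a time, with shr $8 stepping down the lane.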

.Ltail_squeeze_avx2:
	add	$8,%rdx
.Loop_tail_avx2:
	mov	%r8b,(%rsi)
	lea	1(%rsi),%rsi
	shr	$8,%r8
	dec	%rdx
	jnz	.Loop_tail_avx2

.Ldone_squeeze_avx2:
	vzeroupper

	lea	(%r11),%rsp
	ret
.size	SHA3_squeeze,.-SHA3_squeeze

.section .rodata
.align	64
rhotates_left:
	.quad	3,	18,	36,	41	# [2][0] [4][0] [1][0] [3][0]
	.quad	1,	62,	28,	27	# [0][1] [0][2] [0][3] [0][4]
	.quad	45,	6,	56,	39	# [3][1] [1][2] [4][3] [2][4]
	.quad	10,	61,	55,	8	# [2][1] [4][2] [1][3] [3][4]
	.quad	2,	15,	25,	20	# [4][1] [3][2] [2][3] [1][4]
	.quad	44,	43,	21,	14	# [1][1] [2][2] [3][3] [4][4]
rhotates_right:
	.quad	64-3,	64-18,	64-36,	64-41
	.quad	64-1,	64-62,	64-28,	64-27
	.quad	64-45,	64-6,	64-56,	64-39
	.quad	64-10,	64-61,	64-55,	64-8
	.quad	64-2,	64-15,	64-25,	64-20
	.quad	64-44,	64-43,	64-21,	64-14
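
# Annotation: rhotates_right holds 64-n for every rotation n in
# rhotates_left, feeding the vpsrlvq half of the rotate idiom.  The 24 Iota
# round constants below are each stored four times because %ymm0 keeps four
# copies of the A[0][0] lane; the single "vpxor (%r10),%ymm0,%ymm0" in
# __KeccakF1600 thus updates all copies consistently while %r10 advances 32
# bytes per round.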
iotas:
	.quad	0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001
	.quad	0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082
	.quad	0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a
	.quad	0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000
	.quad	0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b
	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
	.quad	0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009
	.quad	0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a
	.quad	0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088
	.quad	0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009
	.quad	0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a
	.quad	0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b
	.quad	0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b
	.quad	0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089
	.quad	0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003
	.quad	0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002
	.quad	0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080
	.quad	0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a
	.quad	0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a
	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
	.quad	0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080
	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
	.quad	0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008

.asciz	"Keccak-1600 absorb and squeeze for AVX2, CRYPTOGAMS by <appro (at) openssl.org>"
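
# Usage sketch (an assumption about the C-side driver, not part of this
# file): padding is the caller's job.  For SHA3-256 (rate r = 136 bytes,
# 32-byte digest) a caller would do roughly:
#
#	left = SHA3_absorb(A, inp, len, 136);	/* leftover bytes, < 136 */
#	/* append 0x06, zero-fill, XOR 0x80 into byte 135, absorb final block */
#	SHA3_squeeze(A, md, 32, 136);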