1 #include <machine/asm.h> 2 .text 3 4 .globl ossl_rsaz_amm52x40_x1_avxifma256 5 .type ossl_rsaz_amm52x40_x1_avxifma256,@function 6 .align 32 7 ossl_rsaz_amm52x40_x1_avxifma256: 8 .cfi_startproc 9 .byte 243,15,30,250 10 pushq %rbx 11 .cfi_adjust_cfa_offset 8 12 .cfi_offset %rbx,-16 13 pushq %rbp 14 .cfi_adjust_cfa_offset 8 15 .cfi_offset %rbp,-24 16 pushq %r12 17 .cfi_adjust_cfa_offset 8 18 .cfi_offset %r12,-32 19 pushq %r13 20 .cfi_adjust_cfa_offset 8 21 .cfi_offset %r13,-40 22 pushq %r14 23 .cfi_adjust_cfa_offset 8 24 .cfi_offset %r14,-48 25 pushq %r15 26 .cfi_adjust_cfa_offset 8 27 .cfi_offset %r15,-56 28 29 vpxor %ymm0,%ymm0,%ymm0 30 vmovapd %ymm0,%ymm3 31 vmovapd %ymm0,%ymm4 32 vmovapd %ymm0,%ymm5 33 vmovapd %ymm0,%ymm6 34 vmovapd %ymm0,%ymm7 35 vmovapd %ymm0,%ymm8 36 vmovapd %ymm0,%ymm9 37 vmovapd %ymm0,%ymm10 38 vmovapd %ymm0,%ymm11 39 vmovapd %ymm0,%ymm12 40 41 xorl %r9d,%r9d 42 43 movq %rdx,%r11 44 movq $0xfffffffffffff,%rax 45 46 47 movl $10,%ebx 48 49 .align 32 50 .Lloop10: 51 movq 0(%r11),%r13 52 53 vpbroadcastq 0(%r11),%ymm1 54 movq 0(%rsi),%rdx 55 mulxq %r13,%r13,%r12 56 addq %r13,%r9 57 movq %r12,%r10 58 adcq $0,%r10 59 60 movq %r8,%r13 61 imulq %r9,%r13 62 andq %rax,%r13 63 64 vmovq %r13,%xmm2 65 vpbroadcastq %xmm2,%ymm2 66 movq 0(%rcx),%rdx 67 mulxq %r13,%r13,%r12 68 addq %r13,%r9 69 adcq %r12,%r10 70 71 shrq $52,%r9 72 salq $12,%r10 73 orq %r10,%r9 74 75 leaq -328(%rsp),%rsp 76 77 {vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3 78 {vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4 79 {vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5 80 {vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6 81 {vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7 82 {vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8 83 {vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9 84 {vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10 85 {vex} vpmadd52luq 256(%rsi),%ymm1,%ymm11 86 {vex} vpmadd52luq 288(%rsi),%ymm1,%ymm12 87 88 {vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3 89 {vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4 90 {vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5 91 {vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6 92 {vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7 93 {vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8 94 {vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9 95 {vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10 96 {vex} vpmadd52luq 256(%rcx),%ymm2,%ymm11 97 {vex} vpmadd52luq 288(%rcx),%ymm2,%ymm12 98 vmovdqu %ymm3,0(%rsp) 99 vmovdqu %ymm4,32(%rsp) 100 vmovdqu %ymm5,64(%rsp) 101 vmovdqu %ymm6,96(%rsp) 102 vmovdqu %ymm7,128(%rsp) 103 vmovdqu %ymm8,160(%rsp) 104 vmovdqu %ymm9,192(%rsp) 105 vmovdqu %ymm10,224(%rsp) 106 vmovdqu %ymm11,256(%rsp) 107 vmovdqu %ymm12,288(%rsp) 108 movq $0,320(%rsp) 109 110 vmovdqu 8(%rsp),%ymm3 111 vmovdqu 40(%rsp),%ymm4 112 vmovdqu 72(%rsp),%ymm5 113 vmovdqu 104(%rsp),%ymm6 114 vmovdqu 136(%rsp),%ymm7 115 vmovdqu 168(%rsp),%ymm8 116 vmovdqu 200(%rsp),%ymm9 117 vmovdqu 232(%rsp),%ymm10 118 vmovdqu 264(%rsp),%ymm11 119 vmovdqu 296(%rsp),%ymm12 120 121 addq 8(%rsp),%r9 122 123 {vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3 124 {vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4 125 {vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5 126 {vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6 127 {vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7 128 {vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8 129 {vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9 130 {vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10 131 {vex} vpmadd52huq 256(%rsi),%ymm1,%ymm11 132 {vex} vpmadd52huq 288(%rsi),%ymm1,%ymm12 133 134 {vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3 135 {vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4 136 {vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5 137 {vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6 138 {vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7 139 {vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8 140 {vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9 141 {vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10 142 {vex} vpmadd52huq 256(%rcx),%ymm2,%ymm11 143 {vex} vpmadd52huq 288(%rcx),%ymm2,%ymm12 144 leaq 328(%rsp),%rsp 145 movq 8(%r11),%r13 146 147 vpbroadcastq 8(%r11),%ymm1 148 movq 0(%rsi),%rdx 149 mulxq %r13,%r13,%r12 150 addq %r13,%r9 151 movq %r12,%r10 152 adcq $0,%r10 153 154 movq %r8,%r13 155 imulq %r9,%r13 156 andq %rax,%r13 157 158 vmovq %r13,%xmm2 159 vpbroadcastq %xmm2,%ymm2 160 movq 0(%rcx),%rdx 161 mulxq %r13,%r13,%r12 162 addq %r13,%r9 163 adcq %r12,%r10 164 165 shrq $52,%r9 166 salq $12,%r10 167 orq %r10,%r9 168 169 leaq -328(%rsp),%rsp 170 171 {vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3 172 {vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4 173 {vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5 174 {vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6 175 {vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7 176 {vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8 177 {vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9 178 {vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10 179 {vex} vpmadd52luq 256(%rsi),%ymm1,%ymm11 180 {vex} vpmadd52luq 288(%rsi),%ymm1,%ymm12 181 182 {vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3 183 {vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4 184 {vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5 185 {vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6 186 {vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7 187 {vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8 188 {vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9 189 {vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10 190 {vex} vpmadd52luq 256(%rcx),%ymm2,%ymm11 191 {vex} vpmadd52luq 288(%rcx),%ymm2,%ymm12 192 vmovdqu %ymm3,0(%rsp) 193 vmovdqu %ymm4,32(%rsp) 194 vmovdqu %ymm5,64(%rsp) 195 vmovdqu %ymm6,96(%rsp) 196 vmovdqu %ymm7,128(%rsp) 197 vmovdqu %ymm8,160(%rsp) 198 vmovdqu %ymm9,192(%rsp) 199 vmovdqu %ymm10,224(%rsp) 200 vmovdqu %ymm11,256(%rsp) 201 vmovdqu %ymm12,288(%rsp) 202 movq $0,320(%rsp) 203 204 vmovdqu 8(%rsp),%ymm3 205 vmovdqu 40(%rsp),%ymm4 206 vmovdqu 72(%rsp),%ymm5 207 vmovdqu 104(%rsp),%ymm6 208 vmovdqu 136(%rsp),%ymm7 209 vmovdqu 168(%rsp),%ymm8 210 vmovdqu 200(%rsp),%ymm9 211 vmovdqu 232(%rsp),%ymm10 212 vmovdqu 264(%rsp),%ymm11 213 vmovdqu 296(%rsp),%ymm12 214 215 addq 8(%rsp),%r9 216 217 {vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3 218 {vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4 219 {vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5 220 {vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6 221 {vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7 222 {vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8 223 {vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9 224 {vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10 225 {vex} vpmadd52huq 256(%rsi),%ymm1,%ymm11 226 {vex} vpmadd52huq 288(%rsi),%ymm1,%ymm12 227 228 {vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3 229 {vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4 230 {vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5 231 {vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6 232 {vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7 233 {vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8 234 {vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9 235 {vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10 236 {vex} vpmadd52huq 256(%rcx),%ymm2,%ymm11 237 {vex} vpmadd52huq 288(%rcx),%ymm2,%ymm12 238 leaq 328(%rsp),%rsp 239 movq 16(%r11),%r13 240 241 vpbroadcastq 16(%r11),%ymm1 242 movq 0(%rsi),%rdx 243 mulxq %r13,%r13,%r12 244 addq %r13,%r9 245 movq %r12,%r10 246 adcq $0,%r10 247 248 movq %r8,%r13 249 imulq %r9,%r13 250 andq %rax,%r13 251 252 vmovq %r13,%xmm2 253 vpbroadcastq %xmm2,%ymm2 254 movq 0(%rcx),%rdx 255 mulxq %r13,%r13,%r12 256 addq %r13,%r9 257 adcq %r12,%r10 258 259 shrq $52,%r9 260 salq $12,%r10 261 orq %r10,%r9 262 263 leaq -328(%rsp),%rsp 264 265 {vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3 266 {vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4 267 {vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5 268 {vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6 269 {vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7 270 {vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8 271 {vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9 272 {vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10 273 {vex} vpmadd52luq 256(%rsi),%ymm1,%ymm11 274 {vex} vpmadd52luq 288(%rsi),%ymm1,%ymm12 275 276 {vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3 277 {vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4 278 {vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5 279 {vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6 280 {vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7 281 {vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8 282 {vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9 283 {vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10 284 {vex} vpmadd52luq 256(%rcx),%ymm2,%ymm11 285 {vex} vpmadd52luq 288(%rcx),%ymm2,%ymm12 286 vmovdqu %ymm3,0(%rsp) 287 vmovdqu %ymm4,32(%rsp) 288 vmovdqu %ymm5,64(%rsp) 289 vmovdqu %ymm6,96(%rsp) 290 vmovdqu %ymm7,128(%rsp) 291 vmovdqu %ymm8,160(%rsp) 292 vmovdqu %ymm9,192(%rsp) 293 vmovdqu %ymm10,224(%rsp) 294 vmovdqu %ymm11,256(%rsp) 295 vmovdqu %ymm12,288(%rsp) 296 movq $0,320(%rsp) 297 298 vmovdqu 8(%rsp),%ymm3 299 vmovdqu 40(%rsp),%ymm4 300 vmovdqu 72(%rsp),%ymm5 301 vmovdqu 104(%rsp),%ymm6 302 vmovdqu 136(%rsp),%ymm7 303 vmovdqu 168(%rsp),%ymm8 304 vmovdqu 200(%rsp),%ymm9 305 vmovdqu 232(%rsp),%ymm10 306 vmovdqu 264(%rsp),%ymm11 307 vmovdqu 296(%rsp),%ymm12 308 309 addq 8(%rsp),%r9 310 311 {vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3 312 {vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4 313 {vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5 314 {vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6 315 {vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7 316 {vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8 317 {vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9 318 {vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10 319 {vex} vpmadd52huq 256(%rsi),%ymm1,%ymm11 320 {vex} vpmadd52huq 288(%rsi),%ymm1,%ymm12 321 322 {vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3 323 {vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4 324 {vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5 325 {vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6 326 {vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7 327 {vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8 328 {vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9 329 {vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10 330 {vex} vpmadd52huq 256(%rcx),%ymm2,%ymm11 331 {vex} vpmadd52huq 288(%rcx),%ymm2,%ymm12 332 leaq 328(%rsp),%rsp 333 movq 24(%r11),%r13 334 335 vpbroadcastq 24(%r11),%ymm1 336 movq 0(%rsi),%rdx 337 mulxq %r13,%r13,%r12 338 addq %r13,%r9 339 movq %r12,%r10 340 adcq $0,%r10 341 342 movq %r8,%r13 343 imulq %r9,%r13 344 andq %rax,%r13 345 346 vmovq %r13,%xmm2 347 vpbroadcastq %xmm2,%ymm2 348 movq 0(%rcx),%rdx 349 mulxq %r13,%r13,%r12 350 addq %r13,%r9 351 adcq %r12,%r10 352 353 shrq $52,%r9 354 salq $12,%r10 355 orq %r10,%r9 356 357 leaq -328(%rsp),%rsp 358 359 {vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3 360 {vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4 361 {vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5 362 {vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6 363 {vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7 364 {vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8 365 {vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9 366 {vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10 367 {vex} vpmadd52luq 256(%rsi),%ymm1,%ymm11 368 {vex} vpmadd52luq 288(%rsi),%ymm1,%ymm12 369 370 {vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3 371 {vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4 372 {vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5 373 {vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6 374 {vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7 375 {vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8 376 {vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9 377 {vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10 378 {vex} vpmadd52luq 256(%rcx),%ymm2,%ymm11 379 {vex} vpmadd52luq 288(%rcx),%ymm2,%ymm12 380 vmovdqu %ymm3,0(%rsp) 381 vmovdqu %ymm4,32(%rsp) 382 vmovdqu %ymm5,64(%rsp) 383 vmovdqu %ymm6,96(%rsp) 384 vmovdqu %ymm7,128(%rsp) 385 vmovdqu %ymm8,160(%rsp) 386 vmovdqu %ymm9,192(%rsp) 387 vmovdqu %ymm10,224(%rsp) 388 vmovdqu %ymm11,256(%rsp) 389 vmovdqu %ymm12,288(%rsp) 390 movq $0,320(%rsp) 391 392 vmovdqu 8(%rsp),%ymm3 393 vmovdqu 40(%rsp),%ymm4 394 vmovdqu 72(%rsp),%ymm5 395 vmovdqu 104(%rsp),%ymm6 396 vmovdqu 136(%rsp),%ymm7 397 vmovdqu 168(%rsp),%ymm8 398 vmovdqu 200(%rsp),%ymm9 399 vmovdqu 232(%rsp),%ymm10 400 vmovdqu 264(%rsp),%ymm11 401 vmovdqu 296(%rsp),%ymm12 402 403 addq 8(%rsp),%r9 404 405 {vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3 406 {vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4 407 {vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5 408 {vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6 409 {vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7 410 {vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8 411 {vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9 412 {vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10 413 {vex} vpmadd52huq 256(%rsi),%ymm1,%ymm11 414 {vex} vpmadd52huq 288(%rsi),%ymm1,%ymm12 415 416 {vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3 417 {vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4 418 {vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5 419 {vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6 420 {vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7 421 {vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8 422 {vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9 423 {vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10 424 {vex} vpmadd52huq 256(%rcx),%ymm2,%ymm11 425 {vex} vpmadd52huq 288(%rcx),%ymm2,%ymm12 426 leaq 328(%rsp),%rsp 427 leaq 32(%r11),%r11 428 decl %ebx 429 jne .Lloop10 430 431 vmovq %r9,%xmm0 432 vpbroadcastq %xmm0,%ymm0 433 vpblendd $3,%ymm0,%ymm3,%ymm3 434 435 leaq -640(%rsp),%rsp 436 vmovupd %ymm3,0(%rsp) 437 vmovupd %ymm4,32(%rsp) 438 vmovupd %ymm5,64(%rsp) 439 vmovupd %ymm6,96(%rsp) 440 vmovupd %ymm7,128(%rsp) 441 vmovupd %ymm8,160(%rsp) 442 vmovupd %ymm9,192(%rsp) 443 vmovupd %ymm10,224(%rsp) 444 vmovupd %ymm11,256(%rsp) 445 vmovupd %ymm12,288(%rsp) 446 447 448 449 vpsrlq $52,%ymm3,%ymm3 450 vpsrlq $52,%ymm4,%ymm4 451 vpsrlq $52,%ymm5,%ymm5 452 vpsrlq $52,%ymm6,%ymm6 453 vpsrlq $52,%ymm7,%ymm7 454 vpsrlq $52,%ymm8,%ymm8 455 vpsrlq $52,%ymm9,%ymm9 456 vpsrlq $52,%ymm10,%ymm10 457 vpsrlq $52,%ymm11,%ymm11 458 vpsrlq $52,%ymm12,%ymm12 459 460 461 vpermq $144,%ymm12,%ymm12 462 vpermq $3,%ymm11,%ymm13 463 vblendpd $1,%ymm13,%ymm12,%ymm12 464 465 vpermq $144,%ymm11,%ymm11 466 vpermq $3,%ymm10,%ymm13 467 vblendpd $1,%ymm13,%ymm11,%ymm11 468 469 vpermq $144,%ymm10,%ymm10 470 vpermq $3,%ymm9,%ymm13 471 vblendpd $1,%ymm13,%ymm10,%ymm10 472 473 vpermq $144,%ymm9,%ymm9 474 vpermq $3,%ymm8,%ymm13 475 vblendpd $1,%ymm13,%ymm9,%ymm9 476 477 vpermq $144,%ymm8,%ymm8 478 vpermq $3,%ymm7,%ymm13 479 vblendpd $1,%ymm13,%ymm8,%ymm8 480 481 vpermq $144,%ymm7,%ymm7 482 vpermq $3,%ymm6,%ymm13 483 vblendpd $1,%ymm13,%ymm7,%ymm7 484 485 vpermq $144,%ymm6,%ymm6 486 vpermq $3,%ymm5,%ymm13 487 vblendpd $1,%ymm13,%ymm6,%ymm6 488 489 vpermq $144,%ymm5,%ymm5 490 vpermq $3,%ymm4,%ymm13 491 vblendpd $1,%ymm13,%ymm5,%ymm5 492 493 vpermq $144,%ymm4,%ymm4 494 vpermq $3,%ymm3,%ymm13 495 vblendpd $1,%ymm13,%ymm4,%ymm4 496 497 vpermq $144,%ymm3,%ymm3 498 vpand .Lhigh64x3(%rip),%ymm3,%ymm3 499 500 vmovupd %ymm3,320(%rsp) 501 vmovupd %ymm4,352(%rsp) 502 vmovupd %ymm5,384(%rsp) 503 vmovupd %ymm6,416(%rsp) 504 vmovupd %ymm7,448(%rsp) 505 vmovupd %ymm8,480(%rsp) 506 vmovupd %ymm9,512(%rsp) 507 vmovupd %ymm10,544(%rsp) 508 vmovupd %ymm11,576(%rsp) 509 vmovupd %ymm12,608(%rsp) 510 511 vmovupd 0(%rsp),%ymm3 512 vmovupd 32(%rsp),%ymm4 513 vmovupd 64(%rsp),%ymm5 514 vmovupd 96(%rsp),%ymm6 515 vmovupd 128(%rsp),%ymm7 516 vmovupd 160(%rsp),%ymm8 517 vmovupd 192(%rsp),%ymm9 518 vmovupd 224(%rsp),%ymm10 519 vmovupd 256(%rsp),%ymm11 520 vmovupd 288(%rsp),%ymm12 521 522 523 vpand .Lmask52x4(%rip),%ymm3,%ymm3 524 vpand .Lmask52x4(%rip),%ymm4,%ymm4 525 vpand .Lmask52x4(%rip),%ymm5,%ymm5 526 vpand .Lmask52x4(%rip),%ymm6,%ymm6 527 vpand .Lmask52x4(%rip),%ymm7,%ymm7 528 vpand .Lmask52x4(%rip),%ymm8,%ymm8 529 vpand .Lmask52x4(%rip),%ymm9,%ymm9 530 vpand .Lmask52x4(%rip),%ymm10,%ymm10 531 vpand .Lmask52x4(%rip),%ymm11,%ymm11 532 vpand .Lmask52x4(%rip),%ymm12,%ymm12 533 534 535 vpaddq 320(%rsp),%ymm3,%ymm3 536 vpaddq 352(%rsp),%ymm4,%ymm4 537 vpaddq 384(%rsp),%ymm5,%ymm5 538 vpaddq 416(%rsp),%ymm6,%ymm6 539 vpaddq 448(%rsp),%ymm7,%ymm7 540 vpaddq 480(%rsp),%ymm8,%ymm8 541 vpaddq 512(%rsp),%ymm9,%ymm9 542 vpaddq 544(%rsp),%ymm10,%ymm10 543 vpaddq 576(%rsp),%ymm11,%ymm11 544 vpaddq 608(%rsp),%ymm12,%ymm12 545 546 leaq 640(%rsp),%rsp 547 548 549 550 vpcmpgtq .Lmask52x4(%rip),%ymm3,%ymm13 551 vmovmskpd %ymm13,%r14d 552 vpcmpgtq .Lmask52x4(%rip),%ymm4,%ymm13 553 vmovmskpd %ymm13,%r13d 554 shlb $4,%r13b 555 orb %r13b,%r14b 556 557 vpcmpgtq .Lmask52x4(%rip),%ymm5,%ymm13 558 vmovmskpd %ymm13,%r13d 559 vpcmpgtq .Lmask52x4(%rip),%ymm6,%ymm13 560 vmovmskpd %ymm13,%r12d 561 shlb $4,%r12b 562 orb %r12b,%r13b 563 564 vpcmpgtq .Lmask52x4(%rip),%ymm7,%ymm13 565 vmovmskpd %ymm13,%r12d 566 vpcmpgtq .Lmask52x4(%rip),%ymm8,%ymm13 567 vmovmskpd %ymm13,%r11d 568 shlb $4,%r11b 569 orb %r11b,%r12b 570 571 vpcmpgtq .Lmask52x4(%rip),%ymm9,%ymm13 572 vmovmskpd %ymm13,%r11d 573 vpcmpgtq .Lmask52x4(%rip),%ymm10,%ymm13 574 vmovmskpd %ymm13,%r10d 575 shlb $4,%r10b 576 orb %r10b,%r11b 577 578 vpcmpgtq .Lmask52x4(%rip),%ymm11,%ymm13 579 vmovmskpd %ymm13,%r10d 580 vpcmpgtq .Lmask52x4(%rip),%ymm12,%ymm13 581 vmovmskpd %ymm13,%r9d 582 shlb $4,%r9b 583 orb %r9b,%r10b 584 585 addb %r14b,%r14b 586 adcb %r13b,%r13b 587 adcb %r12b,%r12b 588 adcb %r11b,%r11b 589 adcb %r10b,%r10b 590 591 592 vpcmpeqq .Lmask52x4(%rip),%ymm3,%ymm13 593 vmovmskpd %ymm13,%r9d 594 vpcmpeqq .Lmask52x4(%rip),%ymm4,%ymm13 595 vmovmskpd %ymm13,%r8d 596 shlb $4,%r8b 597 orb %r8b,%r9b 598 599 vpcmpeqq .Lmask52x4(%rip),%ymm5,%ymm13 600 vmovmskpd %ymm13,%r8d 601 vpcmpeqq .Lmask52x4(%rip),%ymm6,%ymm13 602 vmovmskpd %ymm13,%edx 603 shlb $4,%dl 604 orb %dl,%r8b 605 606 vpcmpeqq .Lmask52x4(%rip),%ymm7,%ymm13 607 vmovmskpd %ymm13,%edx 608 vpcmpeqq .Lmask52x4(%rip),%ymm8,%ymm13 609 vmovmskpd %ymm13,%ecx 610 shlb $4,%cl 611 orb %cl,%dl 612 613 vpcmpeqq .Lmask52x4(%rip),%ymm9,%ymm13 614 vmovmskpd %ymm13,%ecx 615 vpcmpeqq .Lmask52x4(%rip),%ymm10,%ymm13 616 vmovmskpd %ymm13,%ebx 617 shlb $4,%bl 618 orb %bl,%cl 619 620 vpcmpeqq .Lmask52x4(%rip),%ymm11,%ymm13 621 vmovmskpd %ymm13,%ebx 622 vpcmpeqq .Lmask52x4(%rip),%ymm12,%ymm13 623 vmovmskpd %ymm13,%eax 624 shlb $4,%al 625 orb %al,%bl 626 627 addb %r9b,%r14b 628 adcb %r8b,%r13b 629 adcb %dl,%r12b 630 adcb %cl,%r11b 631 adcb %bl,%r10b 632 633 xorb %r9b,%r14b 634 xorb %r8b,%r13b 635 xorb %dl,%r12b 636 xorb %cl,%r11b 637 xorb %bl,%r10b 638 639 pushq %r9 640 pushq %r8 641 642 leaq .Lkmasklut(%rip),%r8 643 644 movb %r14b,%r9b 645 andq $0xf,%r14 646 vpsubq .Lmask52x4(%rip),%ymm3,%ymm13 647 shlq $5,%r14 648 vmovapd (%r8,%r14,1),%ymm14 649 vblendvpd %ymm14,%ymm13,%ymm3,%ymm3 650 651 shrb $4,%r9b 652 andq $0xf,%r9 653 vpsubq .Lmask52x4(%rip),%ymm4,%ymm13 654 shlq $5,%r9 655 vmovapd (%r8,%r9,1),%ymm14 656 vblendvpd %ymm14,%ymm13,%ymm4,%ymm4 657 658 movb %r13b,%r9b 659 andq $0xf,%r13 660 vpsubq .Lmask52x4(%rip),%ymm5,%ymm13 661 shlq $5,%r13 662 vmovapd (%r8,%r13,1),%ymm14 663 vblendvpd %ymm14,%ymm13,%ymm5,%ymm5 664 665 shrb $4,%r9b 666 andq $0xf,%r9 667 vpsubq .Lmask52x4(%rip),%ymm6,%ymm13 668 shlq $5,%r9 669 vmovapd (%r8,%r9,1),%ymm14 670 vblendvpd %ymm14,%ymm13,%ymm6,%ymm6 671 672 movb %r12b,%r9b 673 andq $0xf,%r12 674 vpsubq .Lmask52x4(%rip),%ymm7,%ymm13 675 shlq $5,%r12 676 vmovapd (%r8,%r12,1),%ymm14 677 vblendvpd %ymm14,%ymm13,%ymm7,%ymm7 678 679 shrb $4,%r9b 680 andq $0xf,%r9 681 vpsubq .Lmask52x4(%rip),%ymm8,%ymm13 682 shlq $5,%r9 683 vmovapd (%r8,%r9,1),%ymm14 684 vblendvpd %ymm14,%ymm13,%ymm8,%ymm8 685 686 movb %r11b,%r9b 687 andq $0xf,%r11 688 vpsubq .Lmask52x4(%rip),%ymm9,%ymm13 689 shlq $5,%r11 690 vmovapd (%r8,%r11,1),%ymm14 691 vblendvpd %ymm14,%ymm13,%ymm9,%ymm9 692 693 shrb $4,%r9b 694 andq $0xf,%r9 695 vpsubq .Lmask52x4(%rip),%ymm10,%ymm13 696 shlq $5,%r9 697 vmovapd (%r8,%r9,1),%ymm14 698 vblendvpd %ymm14,%ymm13,%ymm10,%ymm10 699 700 movb %r10b,%r9b 701 andq $0xf,%r10 702 vpsubq .Lmask52x4(%rip),%ymm11,%ymm13 703 shlq $5,%r10 704 vmovapd (%r8,%r10,1),%ymm14 705 vblendvpd %ymm14,%ymm13,%ymm11,%ymm11 706 707 shrb $4,%r9b 708 andq $0xf,%r9 709 vpsubq .Lmask52x4(%rip),%ymm12,%ymm13 710 shlq $5,%r9 711 vmovapd (%r8,%r9,1),%ymm14 712 vblendvpd %ymm14,%ymm13,%ymm12,%ymm12 713 714 popq %r8 715 popq %r9 716 717 vpand .Lmask52x4(%rip),%ymm3,%ymm3 718 vpand .Lmask52x4(%rip),%ymm4,%ymm4 719 vpand .Lmask52x4(%rip),%ymm5,%ymm5 720 vpand .Lmask52x4(%rip),%ymm6,%ymm6 721 vpand .Lmask52x4(%rip),%ymm7,%ymm7 722 vpand .Lmask52x4(%rip),%ymm8,%ymm8 723 vpand .Lmask52x4(%rip),%ymm9,%ymm9 724 725 vpand .Lmask52x4(%rip),%ymm10,%ymm10 726 vpand .Lmask52x4(%rip),%ymm11,%ymm11 727 vpand .Lmask52x4(%rip),%ymm12,%ymm12 728 729 vmovdqu %ymm3,0(%rdi) 730 vmovdqu %ymm4,32(%rdi) 731 vmovdqu %ymm5,64(%rdi) 732 vmovdqu %ymm6,96(%rdi) 733 vmovdqu %ymm7,128(%rdi) 734 vmovdqu %ymm8,160(%rdi) 735 vmovdqu %ymm9,192(%rdi) 736 vmovdqu %ymm10,224(%rdi) 737 vmovdqu %ymm11,256(%rdi) 738 vmovdqu %ymm12,288(%rdi) 739 740 vzeroupper 741 leaq (%rsp),%rax 742 .cfi_def_cfa_register %rax 743 movq 0(%rax),%r15 744 .cfi_restore %r15 745 movq 8(%rax),%r14 746 .cfi_restore %r14 747 movq 16(%rax),%r13 748 .cfi_restore %r13 749 movq 24(%rax),%r12 750 .cfi_restore %r12 751 movq 32(%rax),%rbp 752 .cfi_restore %rbp 753 movq 40(%rax),%rbx 754 .cfi_restore %rbx 755 leaq 48(%rax),%rsp 756 .cfi_def_cfa %rsp,8 757 .Lossl_rsaz_amm52x40_x1_avxifma256_epilogue: 758 759 .byte 0xf3,0xc3 760 .cfi_endproc 761 .size ossl_rsaz_amm52x40_x1_avxifma256, .-ossl_rsaz_amm52x40_x1_avxifma256 762 .section .rodata 763 .align 32 764 .Lmask52x4: 765 .quad 0xfffffffffffff 766 .quad 0xfffffffffffff 767 .quad 0xfffffffffffff 768 .quad 0xfffffffffffff 769 .Lhigh64x3: 770 .quad 0x0 771 .quad 0xffffffffffffffff 772 .quad 0xffffffffffffffff 773 .quad 0xffffffffffffffff 774 .Lkmasklut: 775 776 .quad 0x0 777 .quad 0x0 778 .quad 0x0 779 .quad 0x0 780 781 .quad 0xffffffffffffffff 782 .quad 0x0 783 .quad 0x0 784 .quad 0x0 785 786 .quad 0x0 787 .quad 0xffffffffffffffff 788 .quad 0x0 789 .quad 0x0 790 791 .quad 0xffffffffffffffff 792 .quad 0xffffffffffffffff 793 .quad 0x0 794 .quad 0x0 795 796 .quad 0x0 797 .quad 0x0 798 .quad 0xffffffffffffffff 799 .quad 0x0 800 801 .quad 0xffffffffffffffff 802 .quad 0x0 803 .quad 0xffffffffffffffff 804 .quad 0x0 805 806 .quad 0x0 807 .quad 0xffffffffffffffff 808 .quad 0xffffffffffffffff 809 .quad 0x0 810 811 .quad 0xffffffffffffffff 812 .quad 0xffffffffffffffff 813 .quad 0xffffffffffffffff 814 .quad 0x0 815 816 .quad 0x0 817 .quad 0x0 818 .quad 0x0 819 .quad 0xffffffffffffffff 820 821 .quad 0xffffffffffffffff 822 .quad 0x0 823 .quad 0x0 824 .quad 0xffffffffffffffff 825 826 .quad 0x0 827 .quad 0xffffffffffffffff 828 .quad 0x0 829 .quad 0xffffffffffffffff 830 831 .quad 0xffffffffffffffff 832 .quad 0xffffffffffffffff 833 .quad 0x0 834 .quad 0xffffffffffffffff 835 836 .quad 0x0 837 .quad 0x0 838 .quad 0xffffffffffffffff 839 .quad 0xffffffffffffffff 840 841 .quad 0xffffffffffffffff 842 .quad 0x0 843 .quad 0xffffffffffffffff 844 .quad 0xffffffffffffffff 845 846 .quad 0x0 847 .quad 0xffffffffffffffff 848 .quad 0xffffffffffffffff 849 .quad 0xffffffffffffffff 850 851 .quad 0xffffffffffffffff 852 .quad 0xffffffffffffffff 853 .quad 0xffffffffffffffff 854 .quad 0xffffffffffffffff 855 .text 856 857 .globl ossl_rsaz_amm52x40_x2_avxifma256 858 .type ossl_rsaz_amm52x40_x2_avxifma256,@function 859 .align 32 860 ossl_rsaz_amm52x40_x2_avxifma256: 861 .cfi_startproc 862 .byte 243,15,30,250 863 pushq %rbx 864 .cfi_adjust_cfa_offset 8 865 .cfi_offset %rbx,-16 866 pushq %rbp 867 .cfi_adjust_cfa_offset 8 868 .cfi_offset %rbp,-24 869 pushq %r12 870 .cfi_adjust_cfa_offset 8 871 .cfi_offset %r12,-32 872 pushq %r13 873 .cfi_adjust_cfa_offset 8 874 .cfi_offset %r13,-40 875 pushq %r14 876 .cfi_adjust_cfa_offset 8 877 .cfi_offset %r14,-48 878 pushq %r15 879 .cfi_adjust_cfa_offset 8 880 .cfi_offset %r15,-56 881 882 vpxor %ymm0,%ymm0,%ymm0 883 vmovapd %ymm0,%ymm3 884 vmovapd %ymm0,%ymm4 885 vmovapd %ymm0,%ymm5 886 vmovapd %ymm0,%ymm6 887 vmovapd %ymm0,%ymm7 888 vmovapd %ymm0,%ymm8 889 vmovapd %ymm0,%ymm9 890 vmovapd %ymm0,%ymm10 891 vmovapd %ymm0,%ymm11 892 vmovapd %ymm0,%ymm12 893 894 xorl %r9d,%r9d 895 896 movq %rdx,%r11 897 movq $0xfffffffffffff,%rax 898 899 movl $40,%ebx 900 901 .align 32 902 .Lloop40: 903 movq 0(%r11),%r13 904 905 vpbroadcastq 0(%r11),%ymm1 906 movq 0(%rsi),%rdx 907 mulxq %r13,%r13,%r12 908 addq %r13,%r9 909 movq %r12,%r10 910 adcq $0,%r10 911 912 movq (%r8),%r13 913 imulq %r9,%r13 914 andq %rax,%r13 915 916 vmovq %r13,%xmm2 917 vpbroadcastq %xmm2,%ymm2 918 movq 0(%rcx),%rdx 919 mulxq %r13,%r13,%r12 920 addq %r13,%r9 921 adcq %r12,%r10 922 923 shrq $52,%r9 924 salq $12,%r10 925 orq %r10,%r9 926 927 leaq -328(%rsp),%rsp 928 929 {vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3 930 {vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4 931 {vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5 932 {vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6 933 {vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7 934 {vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8 935 {vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9 936 {vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10 937 {vex} vpmadd52luq 256(%rsi),%ymm1,%ymm11 938 {vex} vpmadd52luq 288(%rsi),%ymm1,%ymm12 939 940 {vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3 941 {vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4 942 {vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5 943 {vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6 944 {vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7 945 {vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8 946 {vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9 947 {vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10 948 {vex} vpmadd52luq 256(%rcx),%ymm2,%ymm11 949 {vex} vpmadd52luq 288(%rcx),%ymm2,%ymm12 950 vmovdqu %ymm3,0(%rsp) 951 vmovdqu %ymm4,32(%rsp) 952 vmovdqu %ymm5,64(%rsp) 953 vmovdqu %ymm6,96(%rsp) 954 vmovdqu %ymm7,128(%rsp) 955 vmovdqu %ymm8,160(%rsp) 956 vmovdqu %ymm9,192(%rsp) 957 vmovdqu %ymm10,224(%rsp) 958 vmovdqu %ymm11,256(%rsp) 959 vmovdqu %ymm12,288(%rsp) 960 movq $0,320(%rsp) 961 962 vmovdqu 8(%rsp),%ymm3 963 vmovdqu 40(%rsp),%ymm4 964 vmovdqu 72(%rsp),%ymm5 965 vmovdqu 104(%rsp),%ymm6 966 vmovdqu 136(%rsp),%ymm7 967 vmovdqu 168(%rsp),%ymm8 968 vmovdqu 200(%rsp),%ymm9 969 vmovdqu 232(%rsp),%ymm10 970 vmovdqu 264(%rsp),%ymm11 971 vmovdqu 296(%rsp),%ymm12 972 973 addq 8(%rsp),%r9 974 975 {vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3 976 {vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4 977 {vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5 978 {vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6 979 {vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7 980 {vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8 981 {vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9 982 {vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10 983 {vex} vpmadd52huq 256(%rsi),%ymm1,%ymm11 984 {vex} vpmadd52huq 288(%rsi),%ymm1,%ymm12 985 986 {vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3 987 {vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4 988 {vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5 989 {vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6 990 {vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7 991 {vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8 992 {vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9 993 {vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10 994 {vex} vpmadd52huq 256(%rcx),%ymm2,%ymm11 995 {vex} vpmadd52huq 288(%rcx),%ymm2,%ymm12 996 leaq 328(%rsp),%rsp 997 leaq 8(%r11),%r11 998 decl %ebx 999 jne .Lloop40 1000 1001 pushq %r11 1002 pushq %rsi 1003 pushq %rcx 1004 pushq %r8 1005 1006 vmovq %r9,%xmm0 1007 vpbroadcastq %xmm0,%ymm0 1008 vpblendd $3,%ymm0,%ymm3,%ymm3 1009 1010 leaq -640(%rsp),%rsp 1011 vmovupd %ymm3,0(%rsp) 1012 vmovupd %ymm4,32(%rsp) 1013 vmovupd %ymm5,64(%rsp) 1014 vmovupd %ymm6,96(%rsp) 1015 vmovupd %ymm7,128(%rsp) 1016 vmovupd %ymm8,160(%rsp) 1017 vmovupd %ymm9,192(%rsp) 1018 vmovupd %ymm10,224(%rsp) 1019 vmovupd %ymm11,256(%rsp) 1020 vmovupd %ymm12,288(%rsp) 1021 1022 1023 1024 vpsrlq $52,%ymm3,%ymm3 1025 vpsrlq $52,%ymm4,%ymm4 1026 vpsrlq $52,%ymm5,%ymm5 1027 vpsrlq $52,%ymm6,%ymm6 1028 vpsrlq $52,%ymm7,%ymm7 1029 vpsrlq $52,%ymm8,%ymm8 1030 vpsrlq $52,%ymm9,%ymm9 1031 vpsrlq $52,%ymm10,%ymm10 1032 vpsrlq $52,%ymm11,%ymm11 1033 vpsrlq $52,%ymm12,%ymm12 1034 1035 1036 vpermq $144,%ymm12,%ymm12 1037 vpermq $3,%ymm11,%ymm13 1038 vblendpd $1,%ymm13,%ymm12,%ymm12 1039 1040 vpermq $144,%ymm11,%ymm11 1041 vpermq $3,%ymm10,%ymm13 1042 vblendpd $1,%ymm13,%ymm11,%ymm11 1043 1044 vpermq $144,%ymm10,%ymm10 1045 vpermq $3,%ymm9,%ymm13 1046 vblendpd $1,%ymm13,%ymm10,%ymm10 1047 1048 vpermq $144,%ymm9,%ymm9 1049 vpermq $3,%ymm8,%ymm13 1050 vblendpd $1,%ymm13,%ymm9,%ymm9 1051 1052 vpermq $144,%ymm8,%ymm8 1053 vpermq $3,%ymm7,%ymm13 1054 vblendpd $1,%ymm13,%ymm8,%ymm8 1055 1056 vpermq $144,%ymm7,%ymm7 1057 vpermq $3,%ymm6,%ymm13 1058 vblendpd $1,%ymm13,%ymm7,%ymm7 1059 1060 vpermq $144,%ymm6,%ymm6 1061 vpermq $3,%ymm5,%ymm13 1062 vblendpd $1,%ymm13,%ymm6,%ymm6 1063 1064 vpermq $144,%ymm5,%ymm5 1065 vpermq $3,%ymm4,%ymm13 1066 vblendpd $1,%ymm13,%ymm5,%ymm5 1067 1068 vpermq $144,%ymm4,%ymm4 1069 vpermq $3,%ymm3,%ymm13 1070 vblendpd $1,%ymm13,%ymm4,%ymm4 1071 1072 vpermq $144,%ymm3,%ymm3 1073 vpand .Lhigh64x3(%rip),%ymm3,%ymm3 1074 1075 vmovupd %ymm3,320(%rsp) 1076 vmovupd %ymm4,352(%rsp) 1077 vmovupd %ymm5,384(%rsp) 1078 vmovupd %ymm6,416(%rsp) 1079 vmovupd %ymm7,448(%rsp) 1080 vmovupd %ymm8,480(%rsp) 1081 vmovupd %ymm9,512(%rsp) 1082 vmovupd %ymm10,544(%rsp) 1083 vmovupd %ymm11,576(%rsp) 1084 vmovupd %ymm12,608(%rsp) 1085 1086 vmovupd 0(%rsp),%ymm3 1087 vmovupd 32(%rsp),%ymm4 1088 vmovupd 64(%rsp),%ymm5 1089 vmovupd 96(%rsp),%ymm6 1090 vmovupd 128(%rsp),%ymm7 1091 vmovupd 160(%rsp),%ymm8 1092 vmovupd 192(%rsp),%ymm9 1093 vmovupd 224(%rsp),%ymm10 1094 vmovupd 256(%rsp),%ymm11 1095 vmovupd 288(%rsp),%ymm12 1096 1097 1098 vpand .Lmask52x4(%rip),%ymm3,%ymm3 1099 vpand .Lmask52x4(%rip),%ymm4,%ymm4 1100 vpand .Lmask52x4(%rip),%ymm5,%ymm5 1101 vpand .Lmask52x4(%rip),%ymm6,%ymm6 1102 vpand .Lmask52x4(%rip),%ymm7,%ymm7 1103 vpand .Lmask52x4(%rip),%ymm8,%ymm8 1104 vpand .Lmask52x4(%rip),%ymm9,%ymm9 1105 vpand .Lmask52x4(%rip),%ymm10,%ymm10 1106 vpand .Lmask52x4(%rip),%ymm11,%ymm11 1107 vpand .Lmask52x4(%rip),%ymm12,%ymm12 1108 1109 1110 vpaddq 320(%rsp),%ymm3,%ymm3 1111 vpaddq 352(%rsp),%ymm4,%ymm4 1112 vpaddq 384(%rsp),%ymm5,%ymm5 1113 vpaddq 416(%rsp),%ymm6,%ymm6 1114 vpaddq 448(%rsp),%ymm7,%ymm7 1115 vpaddq 480(%rsp),%ymm8,%ymm8 1116 vpaddq 512(%rsp),%ymm9,%ymm9 1117 vpaddq 544(%rsp),%ymm10,%ymm10 1118 vpaddq 576(%rsp),%ymm11,%ymm11 1119 vpaddq 608(%rsp),%ymm12,%ymm12 1120 1121 leaq 640(%rsp),%rsp 1122 1123 1124 1125 vpcmpgtq .Lmask52x4(%rip),%ymm3,%ymm13 1126 vmovmskpd %ymm13,%r14d 1127 vpcmpgtq .Lmask52x4(%rip),%ymm4,%ymm13 1128 vmovmskpd %ymm13,%r13d 1129 shlb $4,%r13b 1130 orb %r13b,%r14b 1131 1132 vpcmpgtq .Lmask52x4(%rip),%ymm5,%ymm13 1133 vmovmskpd %ymm13,%r13d 1134 vpcmpgtq .Lmask52x4(%rip),%ymm6,%ymm13 1135 vmovmskpd %ymm13,%r12d 1136 shlb $4,%r12b 1137 orb %r12b,%r13b 1138 1139 vpcmpgtq .Lmask52x4(%rip),%ymm7,%ymm13 1140 vmovmskpd %ymm13,%r12d 1141 vpcmpgtq .Lmask52x4(%rip),%ymm8,%ymm13 1142 vmovmskpd %ymm13,%r11d 1143 shlb $4,%r11b 1144 orb %r11b,%r12b 1145 1146 vpcmpgtq .Lmask52x4(%rip),%ymm9,%ymm13 1147 vmovmskpd %ymm13,%r11d 1148 vpcmpgtq .Lmask52x4(%rip),%ymm10,%ymm13 1149 vmovmskpd %ymm13,%r10d 1150 shlb $4,%r10b 1151 orb %r10b,%r11b 1152 1153 vpcmpgtq .Lmask52x4(%rip),%ymm11,%ymm13 1154 vmovmskpd %ymm13,%r10d 1155 vpcmpgtq .Lmask52x4(%rip),%ymm12,%ymm13 1156 vmovmskpd %ymm13,%r9d 1157 shlb $4,%r9b 1158 orb %r9b,%r10b 1159 1160 addb %r14b,%r14b 1161 adcb %r13b,%r13b 1162 adcb %r12b,%r12b 1163 adcb %r11b,%r11b 1164 adcb %r10b,%r10b 1165 1166 1167 vpcmpeqq .Lmask52x4(%rip),%ymm3,%ymm13 1168 vmovmskpd %ymm13,%r9d 1169 vpcmpeqq .Lmask52x4(%rip),%ymm4,%ymm13 1170 vmovmskpd %ymm13,%r8d 1171 shlb $4,%r8b 1172 orb %r8b,%r9b 1173 1174 vpcmpeqq .Lmask52x4(%rip),%ymm5,%ymm13 1175 vmovmskpd %ymm13,%r8d 1176 vpcmpeqq .Lmask52x4(%rip),%ymm6,%ymm13 1177 vmovmskpd %ymm13,%edx 1178 shlb $4,%dl 1179 orb %dl,%r8b 1180 1181 vpcmpeqq .Lmask52x4(%rip),%ymm7,%ymm13 1182 vmovmskpd %ymm13,%edx 1183 vpcmpeqq .Lmask52x4(%rip),%ymm8,%ymm13 1184 vmovmskpd %ymm13,%ecx 1185 shlb $4,%cl 1186 orb %cl,%dl 1187 1188 vpcmpeqq .Lmask52x4(%rip),%ymm9,%ymm13 1189 vmovmskpd %ymm13,%ecx 1190 vpcmpeqq .Lmask52x4(%rip),%ymm10,%ymm13 1191 vmovmskpd %ymm13,%ebx 1192 shlb $4,%bl 1193 orb %bl,%cl 1194 1195 vpcmpeqq .Lmask52x4(%rip),%ymm11,%ymm13 1196 vmovmskpd %ymm13,%ebx 1197 vpcmpeqq .Lmask52x4(%rip),%ymm12,%ymm13 1198 vmovmskpd %ymm13,%eax 1199 shlb $4,%al 1200 orb %al,%bl 1201 1202 addb %r9b,%r14b 1203 adcb %r8b,%r13b 1204 adcb %dl,%r12b 1205 adcb %cl,%r11b 1206 adcb %bl,%r10b 1207 1208 xorb %r9b,%r14b 1209 xorb %r8b,%r13b 1210 xorb %dl,%r12b 1211 xorb %cl,%r11b 1212 xorb %bl,%r10b 1213 1214 pushq %r9 1215 pushq %r8 1216 1217 leaq .Lkmasklut(%rip),%r8 1218 1219 movb %r14b,%r9b 1220 andq $0xf,%r14 1221 vpsubq .Lmask52x4(%rip),%ymm3,%ymm13 1222 shlq $5,%r14 1223 vmovapd (%r8,%r14,1),%ymm14 1224 vblendvpd %ymm14,%ymm13,%ymm3,%ymm3 1225 1226 shrb $4,%r9b 1227 andq $0xf,%r9 1228 vpsubq .Lmask52x4(%rip),%ymm4,%ymm13 1229 shlq $5,%r9 1230 vmovapd (%r8,%r9,1),%ymm14 1231 vblendvpd %ymm14,%ymm13,%ymm4,%ymm4 1232 1233 movb %r13b,%r9b 1234 andq $0xf,%r13 1235 vpsubq .Lmask52x4(%rip),%ymm5,%ymm13 1236 shlq $5,%r13 1237 vmovapd (%r8,%r13,1),%ymm14 1238 vblendvpd %ymm14,%ymm13,%ymm5,%ymm5 1239 1240 shrb $4,%r9b 1241 andq $0xf,%r9 1242 vpsubq .Lmask52x4(%rip),%ymm6,%ymm13 1243 shlq $5,%r9 1244 vmovapd (%r8,%r9,1),%ymm14 1245 vblendvpd %ymm14,%ymm13,%ymm6,%ymm6 1246 1247 movb %r12b,%r9b 1248 andq $0xf,%r12 1249 vpsubq .Lmask52x4(%rip),%ymm7,%ymm13 1250 shlq $5,%r12 1251 vmovapd (%r8,%r12,1),%ymm14 1252 vblendvpd %ymm14,%ymm13,%ymm7,%ymm7 1253 1254 shrb $4,%r9b 1255 andq $0xf,%r9 1256 vpsubq .Lmask52x4(%rip),%ymm8,%ymm13 1257 shlq $5,%r9 1258 vmovapd (%r8,%r9,1),%ymm14 1259 vblendvpd %ymm14,%ymm13,%ymm8,%ymm8 1260 1261 movb %r11b,%r9b 1262 andq $0xf,%r11 1263 vpsubq .Lmask52x4(%rip),%ymm9,%ymm13 1264 shlq $5,%r11 1265 vmovapd (%r8,%r11,1),%ymm14 1266 vblendvpd %ymm14,%ymm13,%ymm9,%ymm9 1267 1268 shrb $4,%r9b 1269 andq $0xf,%r9 1270 vpsubq .Lmask52x4(%rip),%ymm10,%ymm13 1271 shlq $5,%r9 1272 vmovapd (%r8,%r9,1),%ymm14 1273 vblendvpd %ymm14,%ymm13,%ymm10,%ymm10 1274 1275 movb %r10b,%r9b 1276 andq $0xf,%r10 1277 vpsubq .Lmask52x4(%rip),%ymm11,%ymm13 1278 shlq $5,%r10 1279 vmovapd (%r8,%r10,1),%ymm14 1280 vblendvpd %ymm14,%ymm13,%ymm11,%ymm11 1281 1282 shrb $4,%r9b 1283 andq $0xf,%r9 1284 vpsubq .Lmask52x4(%rip),%ymm12,%ymm13 1285 shlq $5,%r9 1286 vmovapd (%r8,%r9,1),%ymm14 1287 vblendvpd %ymm14,%ymm13,%ymm12,%ymm12 1288 1289 popq %r8 1290 popq %r9 1291 1292 vpand .Lmask52x4(%rip),%ymm3,%ymm3 1293 vpand .Lmask52x4(%rip),%ymm4,%ymm4 1294 vpand .Lmask52x4(%rip),%ymm5,%ymm5 1295 vpand .Lmask52x4(%rip),%ymm6,%ymm6 1296 vpand .Lmask52x4(%rip),%ymm7,%ymm7 1297 vpand .Lmask52x4(%rip),%ymm8,%ymm8 1298 vpand .Lmask52x4(%rip),%ymm9,%ymm9 1299 1300 vpand .Lmask52x4(%rip),%ymm10,%ymm10 1301 vpand .Lmask52x4(%rip),%ymm11,%ymm11 1302 vpand .Lmask52x4(%rip),%ymm12,%ymm12 1303 1304 popq %r8 1305 popq %rcx 1306 popq %rsi 1307 popq %r11 1308 1309 vmovdqu %ymm3,0(%rdi) 1310 vmovdqu %ymm4,32(%rdi) 1311 vmovdqu %ymm5,64(%rdi) 1312 vmovdqu %ymm6,96(%rdi) 1313 vmovdqu %ymm7,128(%rdi) 1314 vmovdqu %ymm8,160(%rdi) 1315 vmovdqu %ymm9,192(%rdi) 1316 vmovdqu %ymm10,224(%rdi) 1317 vmovdqu %ymm11,256(%rdi) 1318 vmovdqu %ymm12,288(%rdi) 1319 1320 xorl %r15d,%r15d 1321 1322 movq $0xfffffffffffff,%rax 1323 1324 movl $40,%ebx 1325 1326 vpxor %ymm0,%ymm0,%ymm0 1327 vmovapd %ymm0,%ymm3 1328 vmovapd %ymm0,%ymm4 1329 vmovapd %ymm0,%ymm5 1330 vmovapd %ymm0,%ymm6 1331 vmovapd %ymm0,%ymm7 1332 vmovapd %ymm0,%ymm8 1333 vmovapd %ymm0,%ymm9 1334 vmovapd %ymm0,%ymm10 1335 vmovapd %ymm0,%ymm11 1336 vmovapd %ymm0,%ymm12 1337 .align 32 1338 .Lloop40_1: 1339 movq 0(%r11),%r13 1340 1341 vpbroadcastq 0(%r11),%ymm1 1342 movq 320(%rsi),%rdx 1343 mulxq %r13,%r13,%r12 1344 addq %r13,%r9 1345 movq %r12,%r10 1346 adcq $0,%r10 1347 1348 movq 8(%r8),%r13 1349 imulq %r9,%r13 1350 andq %rax,%r13 1351 1352 vmovq %r13,%xmm2 1353 vpbroadcastq %xmm2,%ymm2 1354 movq 320(%rcx),%rdx 1355 mulxq %r13,%r13,%r12 1356 addq %r13,%r9 1357 adcq %r12,%r10 1358 1359 shrq $52,%r9 1360 salq $12,%r10 1361 orq %r10,%r9 1362 1363 leaq -328(%rsp),%rsp 1364 1365 {vex} vpmadd52luq 320(%rsi),%ymm1,%ymm3 1366 {vex} vpmadd52luq 352(%rsi),%ymm1,%ymm4 1367 {vex} vpmadd52luq 384(%rsi),%ymm1,%ymm5 1368 {vex} vpmadd52luq 416(%rsi),%ymm1,%ymm6 1369 {vex} vpmadd52luq 448(%rsi),%ymm1,%ymm7 1370 {vex} vpmadd52luq 480(%rsi),%ymm1,%ymm8 1371 {vex} vpmadd52luq 512(%rsi),%ymm1,%ymm9 1372 {vex} vpmadd52luq 544(%rsi),%ymm1,%ymm10 1373 {vex} vpmadd52luq 576(%rsi),%ymm1,%ymm11 1374 {vex} vpmadd52luq 608(%rsi),%ymm1,%ymm12 1375 1376 {vex} vpmadd52luq 320(%rcx),%ymm2,%ymm3 1377 {vex} vpmadd52luq 352(%rcx),%ymm2,%ymm4 1378 {vex} vpmadd52luq 384(%rcx),%ymm2,%ymm5 1379 {vex} vpmadd52luq 416(%rcx),%ymm2,%ymm6 1380 {vex} vpmadd52luq 448(%rcx),%ymm2,%ymm7 1381 {vex} vpmadd52luq 480(%rcx),%ymm2,%ymm8 1382 {vex} vpmadd52luq 512(%rcx),%ymm2,%ymm9 1383 {vex} vpmadd52luq 544(%rcx),%ymm2,%ymm10 1384 {vex} vpmadd52luq 576(%rcx),%ymm2,%ymm11 1385 {vex} vpmadd52luq 608(%rcx),%ymm2,%ymm12 1386 vmovdqu %ymm3,0(%rsp) 1387 vmovdqu %ymm4,32(%rsp) 1388 vmovdqu %ymm5,64(%rsp) 1389 vmovdqu %ymm6,96(%rsp) 1390 vmovdqu %ymm7,128(%rsp) 1391 vmovdqu %ymm8,160(%rsp) 1392 vmovdqu %ymm9,192(%rsp) 1393 vmovdqu %ymm10,224(%rsp) 1394 vmovdqu %ymm11,256(%rsp) 1395 vmovdqu %ymm12,288(%rsp) 1396 movq $0,320(%rsp) 1397 1398 vmovdqu 8(%rsp),%ymm3 1399 vmovdqu 40(%rsp),%ymm4 1400 vmovdqu 72(%rsp),%ymm5 1401 vmovdqu 104(%rsp),%ymm6 1402 vmovdqu 136(%rsp),%ymm7 1403 vmovdqu 168(%rsp),%ymm8 1404 vmovdqu 200(%rsp),%ymm9 1405 vmovdqu 232(%rsp),%ymm10 1406 vmovdqu 264(%rsp),%ymm11 1407 vmovdqu 296(%rsp),%ymm12 1408 1409 addq 8(%rsp),%r9 1410 1411 {vex} vpmadd52huq 320(%rsi),%ymm1,%ymm3 1412 {vex} vpmadd52huq 352(%rsi),%ymm1,%ymm4 1413 {vex} vpmadd52huq 384(%rsi),%ymm1,%ymm5 1414 {vex} vpmadd52huq 416(%rsi),%ymm1,%ymm6 1415 {vex} vpmadd52huq 448(%rsi),%ymm1,%ymm7 1416 {vex} vpmadd52huq 480(%rsi),%ymm1,%ymm8 1417 {vex} vpmadd52huq 512(%rsi),%ymm1,%ymm9 1418 {vex} vpmadd52huq 544(%rsi),%ymm1,%ymm10 1419 {vex} vpmadd52huq 576(%rsi),%ymm1,%ymm11 1420 {vex} vpmadd52huq 608(%rsi),%ymm1,%ymm12 1421 1422 {vex} vpmadd52huq 320(%rcx),%ymm2,%ymm3 1423 {vex} vpmadd52huq 352(%rcx),%ymm2,%ymm4 1424 {vex} vpmadd52huq 384(%rcx),%ymm2,%ymm5 1425 {vex} vpmadd52huq 416(%rcx),%ymm2,%ymm6 1426 {vex} vpmadd52huq 448(%rcx),%ymm2,%ymm7 1427 {vex} vpmadd52huq 480(%rcx),%ymm2,%ymm8 1428 {vex} vpmadd52huq 512(%rcx),%ymm2,%ymm9 1429 {vex} vpmadd52huq 544(%rcx),%ymm2,%ymm10 1430 {vex} vpmadd52huq 576(%rcx),%ymm2,%ymm11 1431 {vex} vpmadd52huq 608(%rcx),%ymm2,%ymm12 1432 leaq 328(%rsp),%rsp 1433 leaq 8(%r11),%r11 1434 decl %ebx 1435 jne .Lloop40_1 1436 1437 vmovq %r9,%xmm0 1438 vpbroadcastq %xmm0,%ymm0 1439 vpblendd $3,%ymm0,%ymm3,%ymm3 1440 1441 leaq -640(%rsp),%rsp 1442 vmovupd %ymm3,0(%rsp) 1443 vmovupd %ymm4,32(%rsp) 1444 vmovupd %ymm5,64(%rsp) 1445 vmovupd %ymm6,96(%rsp) 1446 vmovupd %ymm7,128(%rsp) 1447 vmovupd %ymm8,160(%rsp) 1448 vmovupd %ymm9,192(%rsp) 1449 vmovupd %ymm10,224(%rsp) 1450 vmovupd %ymm11,256(%rsp) 1451 vmovupd %ymm12,288(%rsp) 1452 1453 1454 1455 vpsrlq $52,%ymm3,%ymm3 1456 vpsrlq $52,%ymm4,%ymm4 1457 vpsrlq $52,%ymm5,%ymm5 1458 vpsrlq $52,%ymm6,%ymm6 1459 vpsrlq $52,%ymm7,%ymm7 1460 vpsrlq $52,%ymm8,%ymm8 1461 vpsrlq $52,%ymm9,%ymm9 1462 vpsrlq $52,%ymm10,%ymm10 1463 vpsrlq $52,%ymm11,%ymm11 1464 vpsrlq $52,%ymm12,%ymm12 1465 1466 1467 vpermq $144,%ymm12,%ymm12 1468 vpermq $3,%ymm11,%ymm13 1469 vblendpd $1,%ymm13,%ymm12,%ymm12 1470 1471 vpermq $144,%ymm11,%ymm11 1472 vpermq $3,%ymm10,%ymm13 1473 vblendpd $1,%ymm13,%ymm11,%ymm11 1474 1475 vpermq $144,%ymm10,%ymm10 1476 vpermq $3,%ymm9,%ymm13 1477 vblendpd $1,%ymm13,%ymm10,%ymm10 1478 1479 vpermq $144,%ymm9,%ymm9 1480 vpermq $3,%ymm8,%ymm13 1481 vblendpd $1,%ymm13,%ymm9,%ymm9 1482 1483 vpermq $144,%ymm8,%ymm8 1484 vpermq $3,%ymm7,%ymm13 1485 vblendpd $1,%ymm13,%ymm8,%ymm8 1486 1487 vpermq $144,%ymm7,%ymm7 1488 vpermq $3,%ymm6,%ymm13 1489 vblendpd $1,%ymm13,%ymm7,%ymm7 1490 1491 vpermq $144,%ymm6,%ymm6 1492 vpermq $3,%ymm5,%ymm13 1493 vblendpd $1,%ymm13,%ymm6,%ymm6 1494 1495 vpermq $144,%ymm5,%ymm5 1496 vpermq $3,%ymm4,%ymm13 1497 vblendpd $1,%ymm13,%ymm5,%ymm5 1498 1499 vpermq $144,%ymm4,%ymm4 1500 vpermq $3,%ymm3,%ymm13 1501 vblendpd $1,%ymm13,%ymm4,%ymm4 1502 1503 vpermq $144,%ymm3,%ymm3 1504 vpand .Lhigh64x3(%rip),%ymm3,%ymm3 1505 1506 vmovupd %ymm3,320(%rsp) 1507 vmovupd %ymm4,352(%rsp) 1508 vmovupd %ymm5,384(%rsp) 1509 vmovupd %ymm6,416(%rsp) 1510 vmovupd %ymm7,448(%rsp) 1511 vmovupd %ymm8,480(%rsp) 1512 vmovupd %ymm9,512(%rsp) 1513 vmovupd %ymm10,544(%rsp) 1514 vmovupd %ymm11,576(%rsp) 1515 vmovupd %ymm12,608(%rsp) 1516 1517 vmovupd 0(%rsp),%ymm3 1518 vmovupd 32(%rsp),%ymm4 1519 vmovupd 64(%rsp),%ymm5 1520 vmovupd 96(%rsp),%ymm6 1521 vmovupd 128(%rsp),%ymm7 1522 vmovupd 160(%rsp),%ymm8 1523 vmovupd 192(%rsp),%ymm9 1524 vmovupd 224(%rsp),%ymm10 1525 vmovupd 256(%rsp),%ymm11 1526 vmovupd 288(%rsp),%ymm12 1527 1528 1529 vpand .Lmask52x4(%rip),%ymm3,%ymm3 1530 vpand .Lmask52x4(%rip),%ymm4,%ymm4 1531 vpand .Lmask52x4(%rip),%ymm5,%ymm5 1532 vpand .Lmask52x4(%rip),%ymm6,%ymm6 1533 vpand .Lmask52x4(%rip),%ymm7,%ymm7 1534 vpand .Lmask52x4(%rip),%ymm8,%ymm8 1535 vpand .Lmask52x4(%rip),%ymm9,%ymm9 1536 vpand .Lmask52x4(%rip),%ymm10,%ymm10 1537 vpand .Lmask52x4(%rip),%ymm11,%ymm11 1538 vpand .Lmask52x4(%rip),%ymm12,%ymm12 1539 1540 1541 vpaddq 320(%rsp),%ymm3,%ymm3 1542 vpaddq 352(%rsp),%ymm4,%ymm4 1543 vpaddq 384(%rsp),%ymm5,%ymm5 1544 vpaddq 416(%rsp),%ymm6,%ymm6 1545 vpaddq 448(%rsp),%ymm7,%ymm7 1546 vpaddq 480(%rsp),%ymm8,%ymm8 1547 vpaddq 512(%rsp),%ymm9,%ymm9 1548 vpaddq 544(%rsp),%ymm10,%ymm10 1549 vpaddq 576(%rsp),%ymm11,%ymm11 1550 vpaddq 608(%rsp),%ymm12,%ymm12 1551 1552 leaq 640(%rsp),%rsp 1553 1554 1555 1556 vpcmpgtq .Lmask52x4(%rip),%ymm3,%ymm13 1557 vmovmskpd %ymm13,%r14d 1558 vpcmpgtq .Lmask52x4(%rip),%ymm4,%ymm13 1559 vmovmskpd %ymm13,%r13d 1560 shlb $4,%r13b 1561 orb %r13b,%r14b 1562 1563 vpcmpgtq .Lmask52x4(%rip),%ymm5,%ymm13 1564 vmovmskpd %ymm13,%r13d 1565 vpcmpgtq .Lmask52x4(%rip),%ymm6,%ymm13 1566 vmovmskpd %ymm13,%r12d 1567 shlb $4,%r12b 1568 orb %r12b,%r13b 1569 1570 vpcmpgtq .Lmask52x4(%rip),%ymm7,%ymm13 1571 vmovmskpd %ymm13,%r12d 1572 vpcmpgtq .Lmask52x4(%rip),%ymm8,%ymm13 1573 vmovmskpd %ymm13,%r11d 1574 shlb $4,%r11b 1575 orb %r11b,%r12b 1576 1577 vpcmpgtq .Lmask52x4(%rip),%ymm9,%ymm13 1578 vmovmskpd %ymm13,%r11d 1579 vpcmpgtq .Lmask52x4(%rip),%ymm10,%ymm13 1580 vmovmskpd %ymm13,%r10d 1581 shlb $4,%r10b 1582 orb %r10b,%r11b 1583 1584 vpcmpgtq .Lmask52x4(%rip),%ymm11,%ymm13 1585 vmovmskpd %ymm13,%r10d 1586 vpcmpgtq .Lmask52x4(%rip),%ymm12,%ymm13 1587 vmovmskpd %ymm13,%r9d 1588 shlb $4,%r9b 1589 orb %r9b,%r10b 1590 1591 addb %r14b,%r14b 1592 adcb %r13b,%r13b 1593 adcb %r12b,%r12b 1594 adcb %r11b,%r11b 1595 adcb %r10b,%r10b 1596 1597 1598 vpcmpeqq .Lmask52x4(%rip),%ymm3,%ymm13 1599 vmovmskpd %ymm13,%r9d 1600 vpcmpeqq .Lmask52x4(%rip),%ymm4,%ymm13 1601 vmovmskpd %ymm13,%r8d 1602 shlb $4,%r8b 1603 orb %r8b,%r9b 1604 1605 vpcmpeqq .Lmask52x4(%rip),%ymm5,%ymm13 1606 vmovmskpd %ymm13,%r8d 1607 vpcmpeqq .Lmask52x4(%rip),%ymm6,%ymm13 1608 vmovmskpd %ymm13,%edx 1609 shlb $4,%dl 1610 orb %dl,%r8b 1611 1612 vpcmpeqq .Lmask52x4(%rip),%ymm7,%ymm13 1613 vmovmskpd %ymm13,%edx 1614 vpcmpeqq .Lmask52x4(%rip),%ymm8,%ymm13 1615 vmovmskpd %ymm13,%ecx 1616 shlb $4,%cl 1617 orb %cl,%dl 1618 1619 vpcmpeqq .Lmask52x4(%rip),%ymm9,%ymm13 1620 vmovmskpd %ymm13,%ecx 1621 vpcmpeqq .Lmask52x4(%rip),%ymm10,%ymm13 1622 vmovmskpd %ymm13,%ebx 1623 shlb $4,%bl 1624 orb %bl,%cl 1625 1626 vpcmpeqq .Lmask52x4(%rip),%ymm11,%ymm13 1627 vmovmskpd %ymm13,%ebx 1628 vpcmpeqq .Lmask52x4(%rip),%ymm12,%ymm13 1629 vmovmskpd %ymm13,%eax 1630 shlb $4,%al 1631 orb %al,%bl 1632 1633 addb %r9b,%r14b 1634 adcb %r8b,%r13b 1635 adcb %dl,%r12b 1636 adcb %cl,%r11b 1637 adcb %bl,%r10b 1638 1639 xorb %r9b,%r14b 1640 xorb %r8b,%r13b 1641 xorb %dl,%r12b 1642 xorb %cl,%r11b 1643 xorb %bl,%r10b 1644 1645 pushq %r9 1646 pushq %r8 1647 1648 leaq .Lkmasklut(%rip),%r8 1649 1650 movb %r14b,%r9b 1651 andq $0xf,%r14 1652 vpsubq .Lmask52x4(%rip),%ymm3,%ymm13 1653 shlq $5,%r14 1654 vmovapd (%r8,%r14,1),%ymm14 1655 vblendvpd %ymm14,%ymm13,%ymm3,%ymm3 1656 1657 shrb $4,%r9b 1658 andq $0xf,%r9 1659 vpsubq .Lmask52x4(%rip),%ymm4,%ymm13 1660 shlq $5,%r9 1661 vmovapd (%r8,%r9,1),%ymm14 1662 vblendvpd %ymm14,%ymm13,%ymm4,%ymm4 1663 1664 movb %r13b,%r9b 1665 andq $0xf,%r13 1666 vpsubq .Lmask52x4(%rip),%ymm5,%ymm13 1667 shlq $5,%r13 1668 vmovapd (%r8,%r13,1),%ymm14 1669 vblendvpd %ymm14,%ymm13,%ymm5,%ymm5 1670 1671 shrb $4,%r9b 1672 andq $0xf,%r9 1673 vpsubq .Lmask52x4(%rip),%ymm6,%ymm13 1674 shlq $5,%r9 1675 vmovapd (%r8,%r9,1),%ymm14 1676 vblendvpd %ymm14,%ymm13,%ymm6,%ymm6 1677 1678 movb %r12b,%r9b 1679 andq $0xf,%r12 1680 vpsubq .Lmask52x4(%rip),%ymm7,%ymm13 1681 shlq $5,%r12 1682 vmovapd (%r8,%r12,1),%ymm14 1683 vblendvpd %ymm14,%ymm13,%ymm7,%ymm7 1684 1685 shrb $4,%r9b 1686 andq $0xf,%r9 1687 vpsubq .Lmask52x4(%rip),%ymm8,%ymm13 1688 shlq $5,%r9 1689 vmovapd (%r8,%r9,1),%ymm14 1690 vblendvpd %ymm14,%ymm13,%ymm8,%ymm8 1691 1692 movb %r11b,%r9b 1693 andq $0xf,%r11 1694 vpsubq .Lmask52x4(%rip),%ymm9,%ymm13 1695 shlq $5,%r11 1696 vmovapd (%r8,%r11,1),%ymm14 1697 vblendvpd %ymm14,%ymm13,%ymm9,%ymm9 1698 1699 shrb $4,%r9b 1700 andq $0xf,%r9 1701 vpsubq .Lmask52x4(%rip),%ymm10,%ymm13 1702 shlq $5,%r9 1703 vmovapd (%r8,%r9,1),%ymm14 1704 vblendvpd %ymm14,%ymm13,%ymm10,%ymm10 1705 1706 movb %r10b,%r9b 1707 andq $0xf,%r10 1708 vpsubq .Lmask52x4(%rip),%ymm11,%ymm13 1709 shlq $5,%r10 1710 vmovapd (%r8,%r10,1),%ymm14 1711 vblendvpd %ymm14,%ymm13,%ymm11,%ymm11 1712 1713 shrb $4,%r9b 1714 andq $0xf,%r9 1715 vpsubq .Lmask52x4(%rip),%ymm12,%ymm13 1716 shlq $5,%r9 1717 vmovapd (%r8,%r9,1),%ymm14 1718 vblendvpd %ymm14,%ymm13,%ymm12,%ymm12 1719 1720 popq %r8 1721 popq %r9 1722 1723 vpand .Lmask52x4(%rip),%ymm3,%ymm3 1724 vpand .Lmask52x4(%rip),%ymm4,%ymm4 1725 vpand .Lmask52x4(%rip),%ymm5,%ymm5 1726 vpand .Lmask52x4(%rip),%ymm6,%ymm6 1727 vpand .Lmask52x4(%rip),%ymm7,%ymm7 1728 vpand .Lmask52x4(%rip),%ymm8,%ymm8 1729 vpand .Lmask52x4(%rip),%ymm9,%ymm9 1730 1731 vpand .Lmask52x4(%rip),%ymm10,%ymm10 1732 vpand .Lmask52x4(%rip),%ymm11,%ymm11 1733 vpand .Lmask52x4(%rip),%ymm12,%ymm12 1734 1735 vmovdqu %ymm3,320(%rdi) 1736 vmovdqu %ymm4,352(%rdi) 1737 vmovdqu %ymm5,384(%rdi) 1738 vmovdqu %ymm6,416(%rdi) 1739 vmovdqu %ymm7,448(%rdi) 1740 vmovdqu %ymm8,480(%rdi) 1741 vmovdqu %ymm9,512(%rdi) 1742 vmovdqu %ymm10,544(%rdi) 1743 vmovdqu %ymm11,576(%rdi) 1744 vmovdqu %ymm12,608(%rdi) 1745 1746 vzeroupper 1747 leaq (%rsp),%rax 1748 .cfi_def_cfa_register %rax 1749 movq 0(%rax),%r15 1750 .cfi_restore %r15 1751 movq 8(%rax),%r14 1752 .cfi_restore %r14 1753 movq 16(%rax),%r13 1754 .cfi_restore %r13 1755 movq 24(%rax),%r12 1756 .cfi_restore %r12 1757 movq 32(%rax),%rbp 1758 .cfi_restore %rbp 1759 movq 40(%rax),%rbx 1760 .cfi_restore %rbx 1761 leaq 48(%rax),%rsp 1762 .cfi_def_cfa %rsp,8 1763 .Lossl_rsaz_amm52x40_x2_avxifma256_epilogue: 1764 .byte 0xf3,0xc3 1765 .cfi_endproc 1766 .size ossl_rsaz_amm52x40_x2_avxifma256, .-ossl_rsaz_amm52x40_x2_avxifma256 1767 .text 1768 1769 .align 32 1770 .globl ossl_extract_multiplier_2x40_win5_avx 1771 .type ossl_extract_multiplier_2x40_win5_avx,@function 1772 ossl_extract_multiplier_2x40_win5_avx: 1773 .cfi_startproc 1774 .byte 243,15,30,250 1775 vmovapd .Lones(%rip),%ymm14 1776 vmovq %rdx,%xmm10 1777 vpbroadcastq %xmm10,%ymm12 1778 vmovq %rcx,%xmm10 1779 vpbroadcastq %xmm10,%ymm13 1780 leaq 20480(%rsi),%rax 1781 1782 1783 movq %rsi,%r10 1784 1785 1786 vpxor %xmm0,%xmm0,%xmm0 1787 vmovapd %ymm0,%ymm1 1788 vmovapd %ymm0,%ymm2 1789 vmovapd %ymm0,%ymm3 1790 vmovapd %ymm0,%ymm4 1791 vmovapd %ymm0,%ymm5 1792 vmovapd %ymm0,%ymm6 1793 vmovapd %ymm0,%ymm7 1794 vmovapd %ymm0,%ymm8 1795 vmovapd %ymm0,%ymm9 1796 vpxor %ymm11,%ymm11,%ymm11 1797 .align 32 1798 .Lloop_0: 1799 vpcmpeqq %ymm11,%ymm12,%ymm15 1800 vmovdqu 0(%rsi),%ymm10 1801 1802 vblendvpd %ymm15,%ymm10,%ymm0,%ymm0 1803 vmovdqu 32(%rsi),%ymm10 1804 1805 vblendvpd %ymm15,%ymm10,%ymm1,%ymm1 1806 vmovdqu 64(%rsi),%ymm10 1807 1808 vblendvpd %ymm15,%ymm10,%ymm2,%ymm2 1809 vmovdqu 96(%rsi),%ymm10 1810 1811 vblendvpd %ymm15,%ymm10,%ymm3,%ymm3 1812 vmovdqu 128(%rsi),%ymm10 1813 1814 vblendvpd %ymm15,%ymm10,%ymm4,%ymm4 1815 vmovdqu 160(%rsi),%ymm10 1816 1817 vblendvpd %ymm15,%ymm10,%ymm5,%ymm5 1818 vmovdqu 192(%rsi),%ymm10 1819 1820 vblendvpd %ymm15,%ymm10,%ymm6,%ymm6 1821 vmovdqu 224(%rsi),%ymm10 1822 1823 vblendvpd %ymm15,%ymm10,%ymm7,%ymm7 1824 vmovdqu 256(%rsi),%ymm10 1825 1826 vblendvpd %ymm15,%ymm10,%ymm8,%ymm8 1827 vmovdqu 288(%rsi),%ymm10 1828 1829 vblendvpd %ymm15,%ymm10,%ymm9,%ymm9 1830 vpaddq %ymm14,%ymm11,%ymm11 1831 addq $640,%rsi 1832 cmpq %rsi,%rax 1833 jne .Lloop_0 1834 vmovdqu %ymm0,0(%rdi) 1835 vmovdqu %ymm1,32(%rdi) 1836 vmovdqu %ymm2,64(%rdi) 1837 vmovdqu %ymm3,96(%rdi) 1838 vmovdqu %ymm4,128(%rdi) 1839 vmovdqu %ymm5,160(%rdi) 1840 vmovdqu %ymm6,192(%rdi) 1841 vmovdqu %ymm7,224(%rdi) 1842 vmovdqu %ymm8,256(%rdi) 1843 vmovdqu %ymm9,288(%rdi) 1844 movq %r10,%rsi 1845 vpxor %ymm11,%ymm11,%ymm11 1846 .align 32 1847 .Lloop_320: 1848 vpcmpeqq %ymm11,%ymm13,%ymm15 1849 vmovdqu 320(%rsi),%ymm10 1850 1851 vblendvpd %ymm15,%ymm10,%ymm0,%ymm0 1852 vmovdqu 352(%rsi),%ymm10 1853 1854 vblendvpd %ymm15,%ymm10,%ymm1,%ymm1 1855 vmovdqu 384(%rsi),%ymm10 1856 1857 vblendvpd %ymm15,%ymm10,%ymm2,%ymm2 1858 vmovdqu 416(%rsi),%ymm10 1859 1860 vblendvpd %ymm15,%ymm10,%ymm3,%ymm3 1861 vmovdqu 448(%rsi),%ymm10 1862 1863 vblendvpd %ymm15,%ymm10,%ymm4,%ymm4 1864 vmovdqu 480(%rsi),%ymm10 1865 1866 vblendvpd %ymm15,%ymm10,%ymm5,%ymm5 1867 vmovdqu 512(%rsi),%ymm10 1868 1869 vblendvpd %ymm15,%ymm10,%ymm6,%ymm6 1870 vmovdqu 544(%rsi),%ymm10 1871 1872 vblendvpd %ymm15,%ymm10,%ymm7,%ymm7 1873 vmovdqu 576(%rsi),%ymm10 1874 1875 vblendvpd %ymm15,%ymm10,%ymm8,%ymm8 1876 vmovdqu 608(%rsi),%ymm10 1877 1878 vblendvpd %ymm15,%ymm10,%ymm9,%ymm9 1879 vpaddq %ymm14,%ymm11,%ymm11 1880 addq $640,%rsi 1881 cmpq %rsi,%rax 1882 jne .Lloop_320 1883 vmovdqu %ymm0,320(%rdi) 1884 vmovdqu %ymm1,352(%rdi) 1885 vmovdqu %ymm2,384(%rdi) 1886 vmovdqu %ymm3,416(%rdi) 1887 vmovdqu %ymm4,448(%rdi) 1888 vmovdqu %ymm5,480(%rdi) 1889 vmovdqu %ymm6,512(%rdi) 1890 vmovdqu %ymm7,544(%rdi) 1891 vmovdqu %ymm8,576(%rdi) 1892 vmovdqu %ymm9,608(%rdi) 1893 1894 .byte 0xf3,0xc3 1895 .cfi_endproc 1896 .size ossl_extract_multiplier_2x40_win5_avx, .-ossl_extract_multiplier_2x40_win5_avx 1897 .section .rodata 1898 .align 32 1899 .Lones: 1900 .quad 1,1,1,1 1901 .Lzeros: 1902 .quad 0,0,0,0 1903 .section ".note.gnu.property", "a" 1904 .p2align 3 1905 .long 1f - 0f 1906 .long 4f - 1f 1907 .long 5 1908 0: 1909 # "GNU" encoded with .byte, since .asciz isn't supported 1910 # on Solaris. 1911 .byte 0x47 1912 .byte 0x4e 1913 .byte 0x55 1914 .byte 0 1915 1: 1916 .p2align 3 1917 .long 0xc0000002 1918 .long 3f - 2f 1919 2: 1920 .long 3 1921 3: 1922 .p2align 3 1923 4: 1924