dnl  x86 mpn_sqr_basecase -- square an mpn number, optimised for atom.

dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.

dnl  Copyright 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C TODO
C  * Check if 'jmp N(%esp)' is well-predicted enough to allow us to combine the
C    4 large loops into one; we could use it for the outer loop branch.
C  * Optimise code outside of inner loops.
C  * Write combined addmul_1 feed-in and wind-down code, and use it when
C    iterating each outer loop.  ("Overlapping software pipelining")
C  * Perhaps use caller-saves regs for inlined mul_1, allowing us to postpone
C    all pushes.
C  * Perhaps write special code for n < M, for some small M.
C  * Replace inlined addmul_1 with smaller code from aorsmul_1.asm, or perhaps
C    with even less pipelined code.
C  * We run the outer loop until we have a 2-limb by 1-limb addmul_1 left.
C    Consider breaking out earlier, saving the high cost of short loops.

C void mpn_sqr_basecase (mp_ptr wp,
C                        mp_srcptr xp, mp_size_t xn);
C
C Squares the xn-limb number at xp and stores the result at wp.
C
C Calling convention as used by this code: all three arguments are read from
C the stack (at 12, 16 and 20 bytes above %esp once %edi and %esi have been
C pushed).  %edi and %esi are always saved and restored; %ebx and %ebp are
C saved and restored on every path except the xn = 1 shortcut, which does not
C touch them.  The xn stack slot at 28(%esp) is overwritten and reused as a
C loop counter by the final pass.  MMX registers mm0, mm1, mm6 and mm7 are used
C and emms is executed before each return.
C
C There is no special case for xn = 0; the code proceeds as for xn > 1.
C NOTE(review): presumably callers guarantee xn >= 1 and a 2*xn limb result
C area at wp -- confirm against the mpn interface documentation.
C
C Structure:
C   1. Inlined mul_1: the limbs above xp[0] are multiplied by xp[0] and stored
C      starting at wp[1].  There are four copies of the 4-way unrolled loop,
C      one per value of (xn-1) mod 4; each has its own entry point.
C   2. Inlined addmul_1 outer loops ol1, ol0, ol3, ol2: each pass loads the
C      next invariant limb into mm7 and adds its products into the result.
C      Control falls through ol1 -> ol0 -> ol3 -> ol2 and jumps back to ol1,
C      with n incremented towards zero after every pass.  The tails re2 and
C      re1 handle the last short products.
C   3. L(done): a final pass squares each source limb and adds it in while
C      adding every previously stored result limb to itself.
C
C Register roles:
C   rp   (%edi)  moving pointer into the result
C   up   (%esi)  moving pointer into the source
C   n    (%ecx)  negated outer count, incremented towards zero; in the final
C                pass it is used to save and restore the carry flag
C   un   (%ebp)  inner loop counter, negative and counting up to zero
C   %mm7         invariant multiplier limb for the current pass
C   %mm6         64-bit running sum whose high half is the carry (mul_1 phase)
C   %eax,%ebx    low product halves; %edx high product half (addmul_1 phase)

define(`rp',	`%edi')
define(`up',	`%esi')
define(`n',	`%ecx')

define(`un',	`%ebp')

	TEXT
	ALIGN(16)
PROLOGUE(mpn_sqr_basecase)
	push	%edi
	push	%esi
	mov	12(%esp), rp
	mov	16(%esp), up
	mov	20(%esp), n

	lea	4(rp), rp	C write triangular product starting at rp[1]
	dec	n
	movd	(up), %mm7

	jz	L(one)
	lea	4(up), up
	push	%ebx
	push	%ebp
	mov	n, %eax

	movd	(up), %mm0
	neg	n
	pmuludq	%mm7, %mm0
	pxor	%mm6, %mm6
	mov	n, un

C Dispatch on (xn-1) mod 4, held in eax: 0 -> of0, 1 -> of1, 2 -> of2, 3 -> m3.
C The first product is already in mm0 and the running sum mm6 is zero.
	and	$3, %eax
	jz	L(of0)
	cmp	$2, %eax
	jc	L(of1)
	jz	L(of2)

C ================================================================
C mul_1 loop entered when (xn-1) mod 4 = 3; continues at ol2.
	jmp	L(m3)
	ALIGN(16)
L(lm3):	movd	-4(up), %mm0
	pmuludq	%mm7, %mm0
	psrlq	$32, %mm6
	lea	16(rp), rp
	paddq	%mm0, %mm6
	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -4(rp)
	psrlq	$32, %mm6
L(m3):	paddq	%mm0, %mm6
	movd	4(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, (rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, 4(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	add	$4, un
	movd	%mm6, 8(rp)
	lea	16(up), up
	js	L(lm3)

	psrlq	$32, %mm6
	movd	%mm6, 12(rp)

	inc	n
C	jz	L(done)
	lea	-12(up), up
	lea	4(rp), rp
	jmp	L(ol2)

C ================================================================
C mul_1 loop entered when (xn-1) mod 4 = 0; continues at ol3.
	ALIGN(16)
L(lm0):	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	psrlq	$32, %mm6
	lea	16(rp), rp
L(of0):	paddq	%mm0, %mm6
	movd	4(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, (rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, 4(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	12(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, 8(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	add	$4, un
	movd	%mm6, 12(rp)
	lea	16(up), up
	js	L(lm0)

	psrlq	$32, %mm6
	movd	%mm6, 16(rp)

	inc	n
C	jz	L(done)
	lea	-8(up), up
	lea	8(rp), rp
	jmp	L(ol3)

C ================================================================
C mul_1 loop entered when (xn-1) mod 4 = 1; continues at ol0, or goes
C directly to L(done) when n reaches zero (the xn = 2 case).
	ALIGN(16)
L(lm1):	movd	-12(up), %mm0
	pmuludq	%mm7, %mm0
	psrlq	$32, %mm6
	lea	16(rp), rp
	paddq	%mm0, %mm6
	movd	-8(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -12(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	-4(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -8(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -4(rp)
	psrlq	$32, %mm6
L(of1):	paddq	%mm0, %mm6
	add	$4, un
	movd	%mm6, (rp)
	lea	16(up), up
	js	L(lm1)

	psrlq	$32, %mm6
	movd	%mm6, 4(rp)

	inc	n
	jz	L(done)		C goes away when we add special n=2 code
	lea	-20(up), up
	lea	-4(rp), rp
	jmp	L(ol0)

C ================================================================
C mul_1 loop entered when (xn-1) mod 4 = 2; falls through into ol1.
	ALIGN(16)
L(lm2):	movd	-8(up), %mm0
	pmuludq	%mm7, %mm0
	psrlq	$32, %mm6
	lea	16(rp), rp
	paddq	%mm0, %mm6
	movd	-4(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -8(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -4(rp)
	psrlq	$32, %mm6
L(of2):	paddq	%mm0, %mm6
	movd	4(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, (rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	add	$4, un
	movd	%mm6, 4(rp)
	lea	16(up), up
	js	L(lm2)

	psrlq	$32, %mm6
	movd	%mm6, 8(rp)

	inc	n
C	jz	L(done)
	lea	-16(up), up
C	lea	(rp), rp
C	jmp	L(ol1)

C ================================================================
C addmul_1 pass, variant 1.  Leaves through re1 when the inner count un
C becomes zero right after its first increment; otherwise falls into ol0.

L(ol1):	lea	4(up,n,4), up
	movd	(up), %mm7	C read next U invariant limb
	lea	8(rp,n,4), rp
	mov	n, un

	movd	4(up), %mm1
	pmuludq	%mm7, %mm1
	sar	$2, un
	movd	%mm1, %ebx
	inc	un
	jz	L(re1)

	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	xor	%edx, %edx	C zero edx and CF
	jmp	L(a1)

L(la1):	adc	$0, %edx
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	adc	$0, %edx
	add	%eax, (rp)
L(a1):	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	movd	%mm0, %eax
	movd	12(up), %mm1
	pmuludq	%mm7, %mm1
	adc	$0, %edx
	add	%ebx, 4(rp)
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	lea	16(up), up
	movd	(up), %mm0
	adc	$0, %edx
	add	%eax, 8(rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	pmuludq	%mm7, %mm0
	inc	un
	movd	4(up), %mm1
	jnz	L(la1)

C Wind-down: un serves as a zero operand so that adc adds only the carry.
	adc	un, %edx	C un is zero here
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	adc	un, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %eax
	adc	un, %eax
	add	%ebx, 4(rp)
	adc	un, %eax
	mov	%eax, 8(rp)

	inc	n

C ================================================================
C addmul_1 pass, variant 0.  Falls into ol3.

L(ol0):	lea	(up,n,4), up
	movd	4(up), %mm7	C read next U invariant limb
	lea	4(rp,n,4), rp
	mov	n, un

	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	sar	$2, un
	movd	12(up), %mm1
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	xor	%edx, %edx	C zero edx and CF
	jmp	L(a0)

L(la0):	adc	$0, %edx
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	adc	$0, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	movd	%mm0, %eax
	movd	12(up), %mm1
	pmuludq	%mm7, %mm1
	adc	$0, %edx
	add	%ebx, 4(rp)
L(a0):	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	lea	16(up), up
	movd	(up), %mm0
	adc	$0, %edx
	add	%eax, 8(rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	pmuludq	%mm7, %mm0
	inc	un
	movd	4(up), %mm1
	jnz	L(la0)

	adc	un, %edx	C un is zero here
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	adc	un, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %eax
	adc	un, %eax
	add	%ebx, 4(rp)
	adc	un, %eax
	mov	%eax, 8(rp)

	inc	n

C ================================================================
C addmul_1 pass, variant 3.  Falls into ol2.

L(ol3):	lea	12(up,n,4), up
	movd	-8(up), %mm7	C read next U invariant limb
	lea	(rp,n,4), rp	C put rp back
	mov	n, un

	movd	-4(up), %mm1
	pmuludq	%mm7, %mm1
	sar	$2, un
	movd	%mm1, %ebx
	movd	(up), %mm0
	xor	%edx, %edx	C zero edx and CF
	jmp	L(a3)

L(la3):	adc	$0, %edx
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	adc	$0, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	movd	%mm0, %eax
	movd	12(up), %mm1
	pmuludq	%mm7, %mm1
	adc	$0, %edx
	add	%ebx, 4(rp)
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	lea	16(up), up
	movd	(up), %mm0
	adc	$0, %edx
	add	%eax, 8(rp)
L(a3):	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	pmuludq	%mm7, %mm0
	inc	un
	movd	4(up), %mm1
	jnz	L(la3)

	adc	un, %edx	C un is zero here
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	adc	un, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %eax
	adc	un, %eax
	add	%ebx, 4(rp)
	adc	un, %eax
	mov	%eax, 8(rp)

	inc	n

C ================================================================
C addmul_1 pass, variant 2.  Leaves through re2 when the inner count un
C becomes zero right after its first increment; otherwise jumps back to ol1.

L(ol2):	lea	8(up,n,4), up
	movd	-4(up), %mm7	C read next U invariant limb
	lea	12(rp,n,4), rp
	mov	n, un

	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	xor	%edx, %edx
	sar	$2, un
	movd	4(up), %mm1
	test	un, un		C clear carry
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	inc	un
	jnz	L(a2)
	jmp	L(re2)

L(la2):	adc	$0, %edx
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
L(a2):	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	adc	$0, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	movd	%mm0, %eax
	movd	12(up), %mm1
	pmuludq	%mm7, %mm1
	adc	$0, %edx
	add	%ebx, 4(rp)
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	lea	16(up), up
	movd	(up), %mm0
	adc	$0, %edx
	add	%eax, 8(rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %edx
	pmuludq	%mm7, %mm0
	inc	un
	movd	4(up), %mm1
	jnz	L(la2)

	adc	un, %edx	C un is zero here
	add	%ebx, 12(rp)
	movd	%mm0, %eax
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	adc	un, %edx
	add	%eax, (rp)
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %eax
	adc	un, %eax
	add	%ebx, 4(rp)
	adc	un, %eax
	mov	%eax, 8(rp)

	inc	n
	jmp	L(ol1)

C ================================================================
C Tail of the outer loop: re2 finishes a two-limb addmul_1 and starts the
C last one-limb product, which re1 then completes.  un is zero on both paths.
L(re2):	psrlq	$32, %mm0
	movd	(up), %mm7	C read next U invariant limb
	adc	%edx, %eax
	movd	%mm0, %edx
	movd	%mm1, %ebx
	adc	un, %edx
	add	%eax, (rp)
	lea	4(rp), rp
	psrlq	$32, %mm1
	adc	%edx, %ebx
	movd	%mm1, %eax
	movd	4(up), %mm1
	adc	un, %eax
	add	%ebx, (rp)
	pmuludq	%mm7, %mm1
	adc	un, %eax
	mov	%eax, 4(rp)
	movd	%mm1, %ebx

L(re1):	psrlq	$32, %mm1
	add	%ebx, 4(rp)
	movd	%mm1, %eax
	adc	un, %eax
	xor	n, n		C make n zeroness assumption below true
	mov	%eax, 8(rp)

C Final pass.  With four registers pushed the arguments are now at
C 20(%esp) = wp, 24(%esp) = xp and 28(%esp) = xn.  The xn slot is replaced by
C (xn+1)/2 and counted down as the loop counter; the bit shifted out by shr
C selects the entry point.  Each result limb is added to itself (adc from
C memory, then adc back to memory) together with the limb squares, and the
C carry flag is parked in n with rcr and restored with adc between the two
C carry chains.
L(done):			C n is zero here
	mov	24(%esp), up
	mov	28(%esp), %eax

	movd	(up), %mm0
	inc	%eax
	pmuludq	%mm0, %mm0
	lea	4(up), up
	mov	20(%esp), rp
	shr	%eax
	movd	%mm0, (rp)
	psrlq	$32, %mm0
	lea	-12(rp), rp
	mov	%eax, 28(%esp)
	jnc	L(odd)

	movd	%mm0, %ebp
	movd	(up), %mm0
	lea	8(rp), rp
	pmuludq	%mm0, %mm0
	lea	-4(up), up
	add	8(rp), %ebp
	movd	%mm0, %edx
	adc	12(rp), %edx
	rcr	n
	jmp	L(ent)

C	ALIGN(16)		C alignment seems irrelevant
L(top):	movd	(up), %mm1
	adc	n, n
	movd	%mm0, %eax
	pmuludq	%mm1, %mm1
	movd	4(up), %mm0
	adc	(rp), %eax
	movd	%mm1, %ebx
	pmuludq	%mm0, %mm0
	psrlq	$32, %mm1
	adc	4(rp), %ebx
	movd	%mm1, %ebp
	movd	%mm0, %edx
	adc	8(rp), %ebp
	adc	12(rp), %edx
	rcr	n		C FIXME: isn't this awfully slow on atom???
	adc	%eax, (rp)
	adc	%ebx, 4(rp)
L(ent):	lea	8(up), up
	adc	%ebp, 8(rp)
	psrlq	$32, %mm0
	adc	%edx, 12(rp)
L(odd):	decl	28(%esp)
	lea	16(rp), rp
	jnz	L(top)

L(end):	adc	n, n
	movd	%mm0, %eax
	adc	n, %eax
	mov	%eax, (rp)

C Common exit for xn > 1: leave MMX state and restore the four saved registers.
L(rtn):	emms
	pop	%ebp
	pop	%ebx
	pop	%esi
	pop	%edi
	ret

C xn = 1: square the single limb and store the 64-bit product at wp[0..1]
C (rp was advanced by one limb above, hence the -4).  Only %edi and %esi were
C pushed on this path.
L(one):	pmuludq	%mm7, %mm7
	movq	%mm7, -4(rp)
	emms
	pop	%esi
	pop	%edi
	ret
EPILOGUE()