dnl  AMD K7 mpn_mul_basecase -- multiply two mpn numbers.

dnl  Copyright 1999-2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C K7: approx 4.42 cycles per cross product at around 20x20 limbs (16
C     limbs/loop unrolling).


dnl  K7 UNROLL_COUNT  cycles/product (at around 20x20)
dnl         8             4.67
dnl        16             4.59
dnl        32             4.42
dnl  Maximum possible with the current code is 32.
dnl
dnl  At 32 the typical 13-26 limb sizes from the karatsuba code will get
dnl  done with a straight run through a block of code, no inner loop.  Using
dnl  32 gives 1k of code, but the k7 has a 64k L1 code cache.

deflit(UNROLL_COUNT, 32)


C void mpn_mul_basecase (mp_ptr wp,
C                        mp_srcptr xp, mp_size_t xsize,
C                        mp_srcptr yp, mp_size_t ysize);
C
C Calculate xp,xsize multiplied by yp,ysize, storing the result in
C wp,xsize+ysize.
C
C This routine is essentially the same as mpn/generic/mul_basecase.c, but
C it's faster because it does most of the mpn_addmul_1() startup
C calculations only once.  The saving is 15-25% on typical sizes coming from
C the Karatsuba multiply code.

dnl  Below this xsize the simple addmul loop is used; at or above it the
dnl  computed-jump unrolled code is entered.  PIC and non-PIC happen to use
dnl  the same threshold here, but the ifdef is kept so they can be tuned
dnl  separately.
ifdef(`PIC',`
deflit(UNROLL_THRESHOLD, 5)
',`
deflit(UNROLL_THRESHOLD, 5)
')

dnl  Stack frame offsets of the five parameters (cdecl, return addr at 0).
defframe(PARAM_YSIZE,20)
defframe(PARAM_YP,   16)
defframe(PARAM_XSIZE,12)
defframe(PARAM_XP,   8)
defframe(PARAM_WP,   4)

	TEXT
	ALIGN(32)
PROLOGUE(mpn_mul_basecase)
deflit(`FRAME',0)

	movl	PARAM_XSIZE, %ecx
	movl	PARAM_YP, %eax

	movl	PARAM_XP, %edx
	movl	(%eax), %eax	C yp low limb

	cmpl	$2, %ecx
	ja	L(xsize_more_than_two)
	je	L(two_by_something)


	C one limb by one limb

	mull	(%edx)

	movl	PARAM_WP, %ecx
	movl	%eax, (%ecx)
	movl	%edx, 4(%ecx)
	ret


C -----------------------------------------------------------------------------
L(two_by_something):
deflit(`FRAME',0)
	C decl sets ZF for the jnz below; the intervening pushl/movl
	C instructions don't modify flags.
	decl	PARAM_YSIZE
	pushl	%ebx	defframe_pushl(`SAVE_EBX')
	movl	%eax, %ecx	C yp low limb

	movl	PARAM_WP, %ebx
	pushl	%esi	defframe_pushl(`SAVE_ESI')
	movl	%edx, %esi	C xp

	movl	(%edx), %eax	C xp low limb
	jnz	L(two_by_two)	C ysize was 2 (decl above left non-zero)


	C two limbs by one limb
	C
	C wp[0] = lo(xp[0]*y), then wp[1],wp[2] from xp[1]*y plus carry.

	mull	%ecx

	movl	%eax, (%ebx)
	movl	4(%esi), %eax
	movl	%edx, %esi	C carry

	mull	%ecx

	addl	%eax, %esi

	movl	%esi, 4(%ebx)
	movl	SAVE_ESI, %esi

	adcl	$0, %edx

	movl	%edx, 8(%ebx)
	movl	SAVE_EBX, %ebx
	addl	$FRAME, %esp	C pop the two saved registers

	ret


C -----------------------------------------------------------------------------
C Could load yp earlier into another register.

	ALIGN(16)
L(two_by_two):
	C eax	xp low limb
	C ebx	wp
	C ecx	yp low limb
	C edx
	C esi	xp
	C edi
	C ebp

dnl  FRAME carries on from previous

	mull	%ecx		C xp[0] * yp[0]

	push	%edi		defframe_pushl(`SAVE_EDI')
	movl	%edx, %edi	C carry, for wp[1]

	movl	%eax, (%ebx)
	movl	4(%esi), %eax

	mull	%ecx		C xp[1] * yp[0]

	addl	%eax, %edi
	movl	PARAM_YP, %ecx

	adcl	$0, %edx
	movl	4(%ecx), %ecx	C yp[1]
	movl	%edi, 4(%ebx)

	movl	4(%esi), %eax	C xp[1]
	movl	%edx, %edi	C carry, for wp[2]

	mull	%ecx		C xp[1] * yp[1]

	addl	%eax, %edi

	adcl	$0, %edx
	movl	(%esi), %eax	C xp[0]

	movl	%edx, %esi	C carry, for wp[3]

	mull	%ecx		C xp[0] * yp[1]

	addl	%eax, 4(%ebx)	C add cross product into wp[1]
	adcl	%edx, %edi
	movl	%edi, 8(%ebx)

	adcl	$0, %esi
	movl	SAVE_EDI, %edi
	movl	%esi, 12(%ebx)

	movl	SAVE_ESI, %esi
	movl	SAVE_EBX, %ebx
	addl	$FRAME, %esp

	ret


C -----------------------------------------------------------------------------
	ALIGN(16)
L(xsize_more_than_two):

C The first limb of yp is processed with a simple mpn_mul_1 style loop
C inline.  Unrolling this doesn't seem worthwhile since it's only run once
C (whereas the addmul below is run ysize-1 many times).  A call to the
C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
C popping, and doesn't seem likely to be worthwhile on the typical 13-26
C limb operations the Karatsuba code calls here with.

	C eax	yp[0]
	C ebx
	C ecx	xsize
	C edx	xp
	C esi
	C edi
	C ebp

dnl  FRAME doesn't carry on from previous, no pushes yet here
defframe(`SAVE_EBX',-4)
defframe(`SAVE_ESI',-8)
defframe(`SAVE_EDI',-12)
defframe(`SAVE_EBP',-16)
deflit(`FRAME',0)

	subl	$16, %esp	C space for the four register saves above
deflit(`FRAME',16)

	movl	%edi, SAVE_EDI
	movl	PARAM_WP, %edi

	movl	%ebx, SAVE_EBX
	movl	%ebp, SAVE_EBP
	movl	%eax, %ebp

	movl	%esi, SAVE_ESI
	xorl	%ebx, %ebx
	leal	(%edx,%ecx,4), %esi	C xp end

	leal	(%edi,%ecx,4), %edi	C wp end of mul1
	negl	%ecx			C count up from -xsize to 0


L(mul1):
	C eax	scratch
	C ebx	carry
	C ecx	counter, negative
	C edx	scratch
	C esi	xp end
	C edi	wp end of mul1
	C ebp	multiplier

	movl	(%esi,%ecx,4), %eax

	mull	%ebp

	addl	%ebx, %eax
	movl	%eax, (%edi,%ecx,4)
	movl	$0, %ebx

	adcl	%edx, %ebx	C ebx = high limb + carry from the addl
	incl	%ecx
	jnz	L(mul1)


	movl	PARAM_YSIZE, %edx
	movl	PARAM_XSIZE, %ecx

	movl	%ebx, (%edi)	C final carry
	decl	%edx

	jnz	L(ysize_more_than_one)


	C ysize==1: the mul1 loop was the whole product, restore and return.
	movl	SAVE_EDI, %edi
	movl	SAVE_EBX, %ebx

	movl	SAVE_EBP, %ebp
	movl	SAVE_ESI, %esi
	addl	$FRAME, %esp

	ret


L(ysize_more_than_one):
	cmpl	$UNROLL_THRESHOLD, %ecx
	movl	PARAM_YP, %eax

	jae	L(unroll)


C -----------------------------------------------------------------------------
C simple addmul looping
C
C eax	yp
C ebx
C ecx	xsize
C edx	ysize-1
C esi	xp end
C edi	wp end of mul1
C ebp

	leal	4(%eax,%edx,4), %ebp	C yp end
	negl	%ecx
	negl	%edx

	movl	(%esi,%ecx,4), %eax	C xp low limb
	movl	%edx, PARAM_YSIZE	C -(ysize-1)
	incl	%ecx

	xorl	%ebx, %ebx		C initial carry
	movl	%ecx, PARAM_XSIZE	C -(xsize-1)
	movl	%ebp, PARAM_YP		C reuse param slot for yp end

	movl	(%ebp,%edx,4), %ebp	C yp second lowest limb - multiplier
	jmp	L(simple_outer_entry)


	C this is offset 0x121 so close enough to aligned
L(simple_outer_top):
	C ebp	ysize counter, negative

	movl	PARAM_YP, %edx
	movl	PARAM_XSIZE, %ecx	C -(xsize-1)
	xorl	%ebx, %ebx		C carry

	movl	%ebp, PARAM_YSIZE
	addl	$4, %edi		C next position in wp

	movl	(%edx,%ebp,4), %ebp	C yp limb - multiplier
	movl	-4(%esi,%ecx,4), %eax	C xp low limb


L(simple_outer_entry):

L(simple_inner):
	C eax	xp limb
	C ebx	carry limb
	C ecx	loop counter (negative)
	C edx	scratch
	C esi	xp end
	C edi	wp end
	C ebp	multiplier

	mull	%ebp

	addl	%eax, %ebx
	adcl	$0, %edx

	addl	%ebx, (%edi,%ecx,4)
	movl	(%esi,%ecx,4), %eax	C fetch next xp limb for next mull
	adcl	$0, %edx

	incl	%ecx
	movl	%edx, %ebx
	jnz	L(simple_inner)


	C last xp limb (fetched in the loop above) and final carry store
	mull	%ebp

	movl	PARAM_YSIZE, %ebp
	addl	%eax, %ebx

	adcl	$0, %edx
	addl	%ebx, (%edi)

	adcl	$0, %edx
	incl	%ebp

	movl	%edx, 4(%edi)
	jnz	L(simple_outer_top)


	movl	SAVE_EBX, %ebx
	movl	SAVE_ESI, %esi

	movl	SAVE_EDI, %edi
	movl	SAVE_EBP, %ebp
	addl	$FRAME, %esp

	ret


C -----------------------------------------------------------------------------
C
C The unrolled loop is the same as in mpn_addmul_1(), see that code for some
C comments.
C
C VAR_ADJUST is the negative of how many limbs the leals in the inner loop
C increment xp and wp.  This is used to adjust back xp and wp, and rshifted
C to given an initial VAR_COUNTER at the top of the outer loop.
C
C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
C up to -1, inclusive.
C
C VAR_JMP is the computed jump into the unrolled loop.
C
C VAR_XP_LOW is the least significant limb of xp, which is needed at the
C start of the unrolled loop.
C
C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
C inclusive.
C
C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
C added to give the location of the next limb of yp, which is the multiplier
C in the unrolled loop.
C
C The trick with VAR_ADJUST means it's only necessary to do one fetch in the
C outer loop to take care of xp, wp and the inner loop counter.

defframe(VAR_COUNTER,  -20)
defframe(VAR_ADJUST,   -24)
defframe(VAR_JMP,      -28)
defframe(VAR_XP_LOW,   -32)
deflit(VAR_EXTRA_SPACE, 16)


L(unroll):
	C eax	yp
	C ebx
	C ecx	xsize
	C edx	ysize-1
	C esi	xp end
	C edi	wp end of mul1
	C ebp

	movl	PARAM_XP, %esi
	movl	4(%eax), %ebp		C multiplier (yp second limb)
	leal	4(%eax,%edx,4), %eax	C yp adjust for ysize indexing

	movl	PARAM_WP, %edi
	movl	%eax, PARAM_YP
	negl	%edx

	movl	%edx, PARAM_YSIZE
	leal	UNROLL_COUNT-2(%ecx), %ebx	C (xsize-1)+UNROLL_COUNT-1
	decl	%ecx				C xsize-1

	movl	(%esi), %eax		C xp low limb
	andl	$-UNROLL_MASK-1, %ebx	C round down to multiple of UNROLL_COUNT
	negl	%ecx

	subl	$VAR_EXTRA_SPACE, %esp
deflit(`FRAME',16+VAR_EXTRA_SPACE)
	negl	%ebx
	andl	$UNROLL_MASK, %ecx	C partial-chunk limb count for entry jump

	movl	%ebx, VAR_ADJUST
	movl	%ecx, %edx
	shll	$4, %ecx		C *16, close to the 17 bytes/limb of code

	sarl	$UNROLL_LOG2, %ebx	C initial VAR_COUNTER

	C 17 code bytes per limb
ifdef(`PIC',`
	call	L(pic_calc)
L(unroll_here):
',`
	leal	L(unroll_entry) (%ecx,%edx,1), %ecx	C ecx*16+edx = ecx_old*17
')
	negl	%edx

	movl	%eax, VAR_XP_LOW
	movl	%ecx, VAR_JMP
	leal	4(%edi,%edx,4), %edi	C wp and xp, adjust for unrolling,
	leal	4(%esi,%edx,4), %esi	C  and start at second limb
	jmp	L(unroll_outer_entry)


ifdef(`PIC',`
L(pic_calc):
	C See mpn/x86/README about old gas bugs
	leal	(%ecx,%edx,1), %ecx
	addl	$L(unroll_entry)-L(unroll_here), %ecx
	addl	(%esp), %ecx	C add return address = address of L(unroll_here)
	ret_internal
')


C --------------------------------------------------------------------------
	ALIGN(32)
L(unroll_outer_top):
	C ebp	ysize counter, negative

	movl	VAR_ADJUST, %ebx
	movl	PARAM_YP, %edx

	movl	VAR_XP_LOW, %eax
	movl	%ebp, PARAM_YSIZE	C store incremented ysize counter

	leal	4(%edi,%ebx,4), %edi	C step wp back, plus one limb forward
	leal	(%esi,%ebx,4), %esi	C step xp back to the start
	sarl	$UNROLL_LOG2, %ebx

	movl	(%edx,%ebp,4), %ebp	C yp next multiplier
	movl	VAR_JMP, %ecx

L(unroll_outer_entry):
	mull	%ebp

	C Entry-point parity decides which of the two alternating carry
	C registers receives the first low product.
	testb	$1, %cl		C and clear carry bit
	movl	%ebx, VAR_COUNTER
	movl	$0, %ebx

	movl	$0, %ecx
	cmovz(	%eax, %ecx)	C eax into low carry, zero into high carry limb
	cmovnz(	%eax, %ebx)

	C Extra fetch of VAR_JMP is bad, but registers are tight
	jmp	*VAR_JMP


C -----------------------------------------------------------------------------
	ALIGN(32)
L(unroll_top):
	C eax	xp limb
	C ebx	carry high
	C ecx	carry low
	C edx	scratch
	C esi	xp+8
	C edi	wp
	C ebp	yp multiplier limb
	C
	C VAR_COUNTER  loop counter, negative
	C
	C 17 bytes each limb

L(unroll_entry):

dnl  Two limbs per forloop iteration, with ecx and ebx alternating as the
dnl  low/high carry so no carry moves are needed between limbs.
deflit(CHUNK_COUNT,2)
forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
	deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
	deflit(`disp1', eval(disp0 + 4))

Zdisp(	movl,	disp0,(%esi), %eax)
	adcl	%edx, %ebx

	mull	%ebp

Zdisp(	addl,	%ecx, disp0,(%edi))
	movl	$0, %ecx

	adcl	%eax, %ebx


	movl	disp1(%esi), %eax
	adcl	%edx, %ecx

	mull	%ebp

	addl	%ebx, disp1(%edi)
	movl	$0, %ebx

	adcl	%eax, %ecx
')


	incl	VAR_COUNTER
	leal	UNROLL_BYTES(%esi), %esi
	leal	UNROLL_BYTES(%edi), %edi

	jnz	L(unroll_top)


	C eax
	C ebx	zero
	C ecx	low
	C edx	high
	C esi
	C edi	wp, pointing at second last limb)
	C ebp
	C
	C carry flag to be added to high

deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
deflit(`disp1', eval(disp0-0 + 4))

	movl	PARAM_YSIZE, %ebp
	adcl	$0, %edx
	addl	%ecx, disp0(%edi)

	adcl	$0, %edx
	incl	%ebp

	movl	%edx, disp1(%edi)
	jnz	L(unroll_outer_top)


	movl	SAVE_ESI, %esi
	movl	SAVE_EBP, %ebp

	movl	SAVE_EDI, %edi
	movl	SAVE_EBX, %ebx
	addl	$FRAME, %esp

	ret

EPILOGUE()