Home | History | Annotate | Line # | Download | only in k7
      1 dnl  AMD K7 mpn_mul_basecase -- multiply two mpn numbers.
      2 
      3 dnl  Copyright 1999-2002 Free Software Foundation, Inc.
      4 
      5 dnl  This file is part of the GNU MP Library.
      6 dnl
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8 dnl  it under the terms of either:
      9 dnl
     10 dnl    * the GNU Lesser General Public License as published by the Free
     11 dnl      Software Foundation; either version 3 of the License, or (at your
     12 dnl      option) any later version.
     13 dnl
     14 dnl  or
     15 dnl
     16 dnl    * the GNU General Public License as published by the Free Software
     17 dnl      Foundation; either version 2 of the License, or (at your option) any
     18 dnl      later version.
     19 dnl
     20 dnl  or both in parallel, as here.
     21 dnl
     22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25 dnl  for more details.
     26 dnl
     27 dnl  You should have received copies of the GNU General Public License and the
     28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29 dnl  see https://www.gnu.org/licenses/.
     30 
     31 include(`../config.m4')
     32 
     33 
     34 C K7: approx 4.42 cycles per cross product at around 20x20 limbs (16
     35 C     limbs/loop unrolling).
     36 
     37 
     38 
     39 dnl  K7 UNROLL_COUNT cycles/product (at around 20x20)
     40 dnl           8           4.67
     41 dnl          16           4.59
     42 dnl          32           4.42
     43 dnl  Maximum possible with the current code is 32.
     44 dnl
     45 dnl  At 32 the typical 13-26 limb sizes from the karatsuba code will get
     46 dnl  done with a straight run through a block of code, no inner loop.  Using
     47 dnl  32 gives 1k of code, but the k7 has a 64k L1 code cache.
     48 
     49 deflit(UNROLL_COUNT, 32)
     50 
     51 
     52 C void mpn_mul_basecase (mp_ptr wp,
     53 C                        mp_srcptr xp, mp_size_t xsize,
     54 C                        mp_srcptr yp, mp_size_t ysize);
     55 C
     56 C Calculate xp,xsize multiplied by yp,ysize, storing the result in
     57 C wp,xsize+ysize.
     58 C
     59 C This routine is essentially the same as mpn/generic/mul_basecase.c, but
     60 C it's faster because it does most of the mpn_addmul_1() startup
     61 C calculations only once.  The saving is 15-25% on typical sizes coming from
     62 C the Karatsuba multiply code.
     63 
     64 ifdef(`PIC',`
     65 deflit(UNROLL_THRESHOLD, 5)
     66 ',`
     67 deflit(UNROLL_THRESHOLD, 5)
     68 ')
     69 
     70 defframe(PARAM_YSIZE,20)
     71 defframe(PARAM_YP,   16)
     72 defframe(PARAM_XSIZE,12)
     73 defframe(PARAM_XP,   8)
     74 defframe(PARAM_WP,   4)
     75 
     76 	TEXT
     77 	ALIGN(32)
     78 PROLOGUE(mpn_mul_basecase)
     79 deflit(`FRAME',0)
     80 
     81 	movl	PARAM_XSIZE, %ecx
     82 	movl	PARAM_YP, %eax
     83 
     84 	movl	PARAM_XP, %edx
     85 	movl	(%eax), %eax	C yp low limb
     86 
     87 	cmpl	$2, %ecx
     88 	ja	L(xsize_more_than_two)
     89 	je	L(two_by_something)
     90 
     91 
     92 	C one limb by one limb
     93 
     94 	mull	(%edx)
     95 
     96 	movl	PARAM_WP, %ecx
     97 	movl	%eax, (%ecx)
     98 	movl	%edx, 4(%ecx)
     99 	ret
    100 
    101 
    102 C -----------------------------------------------------------------------------
    103 L(two_by_something):
    104 deflit(`FRAME',0)
    105 	decl	PARAM_YSIZE
    106 	pushl	%ebx		defframe_pushl(`SAVE_EBX')
    107 	movl	%eax, %ecx	C yp low limb
    108 
    109 	movl	PARAM_WP, %ebx
    110 	pushl	%esi		defframe_pushl(`SAVE_ESI')
    111 	movl	%edx, %esi	C xp
    112 
    113 	movl	(%edx), %eax	C xp low limb
    114 	jnz	L(two_by_two)
    115 
    116 
    117 	C two limbs by one limb
    118 
    119 	mull	%ecx
    120 
    121 	movl	%eax, (%ebx)
    122 	movl	4(%esi), %eax
    123 	movl	%edx, %esi	C carry
    124 
    125 	mull	%ecx
    126 
    127 	addl	%eax, %esi
    128 
    129 	movl	%esi, 4(%ebx)
    130 	movl	SAVE_ESI, %esi
    131 
    132 	adcl	$0, %edx
    133 
    134 	movl	%edx, 8(%ebx)
    135 	movl	SAVE_EBX, %ebx
    136 	addl	$FRAME, %esp
    137 
    138 	ret
    139 
    140 
    141 
    142 C -----------------------------------------------------------------------------
    143 C Could load yp earlier into another register.
    144 
    145 	ALIGN(16)
    146 L(two_by_two):
    147 	C eax	xp low limb
    148 	C ebx	wp
    149 	C ecx	yp low limb
    150 	C edx
    151 	C esi	xp
    152 	C edi
    153 	C ebp
    154 
    155 dnl  FRAME carries on from previous
    156 
    157 	mull	%ecx		C xp[0] * yp[0]
    158 
    159 	push	%edi		defframe_pushl(`SAVE_EDI')
    160 	movl	%edx, %edi	C carry, for wp[1]
    161 
    162 	movl	%eax, (%ebx)
    163 	movl	4(%esi), %eax
    164 
    165 	mull	%ecx		C xp[1] * yp[0]
    166 
    167 	addl	%eax, %edi
    168 	movl	PARAM_YP, %ecx
    169 
    170 	adcl	$0, %edx
    171 	movl	4(%ecx), %ecx	C yp[1]
    172 	movl	%edi, 4(%ebx)
    173 
    174 	movl	4(%esi), %eax	C xp[1]
    175 	movl	%edx, %edi	C carry, for wp[2]
    176 
    177 	mull	%ecx		C xp[1] * yp[1]
    178 
    179 	addl	%eax, %edi
    180 
    181 	adcl	$0, %edx
    182 	movl	(%esi), %eax	C xp[0]
    183 
    184 	movl	%edx, %esi	C carry, for wp[3]
    185 
    186 	mull	%ecx		C xp[0] * yp[1]
    187 
    188 	addl	%eax, 4(%ebx)
    189 	adcl	%edx, %edi
    190 	movl	%edi, 8(%ebx)
    191 
    192 	adcl	$0, %esi
    193 	movl	SAVE_EDI, %edi
    194 	movl	%esi, 12(%ebx)
    195 
    196 	movl	SAVE_ESI, %esi
    197 	movl	SAVE_EBX, %ebx
    198 	addl	$FRAME, %esp
    199 
    200 	ret
    201 
    202 
    203 C -----------------------------------------------------------------------------
    204 	ALIGN(16)
    205 L(xsize_more_than_two):
    206 
    207 C The first limb of yp is processed with a simple mpn_mul_1 style loop
    208 C inline.  Unrolling this doesn't seem worthwhile since it's only run once
    209 C (whereas the addmul below is run ysize-1 many times).  A call to the
    210 C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
    211 C popping, and doesn't seem likely to be worthwhile on the typical 13-26
    212 C limb operations the Karatsuba code calls here with.
    213 
    214 	C eax	yp[0]
    215 	C ebx
    216 	C ecx	xsize
    217 	C edx	xp
    218 	C esi
    219 	C edi
    220 	C ebp
    221 
    222 dnl  FRAME doesn't carry on from previous, no pushes yet here
    223 defframe(`SAVE_EBX',-4)
    224 defframe(`SAVE_ESI',-8)
    225 defframe(`SAVE_EDI',-12)
    226 defframe(`SAVE_EBP',-16)
    227 deflit(`FRAME',0)
    228 
    229 	subl	$16, %esp
    230 deflit(`FRAME',16)
    231 
    232 	movl	%edi, SAVE_EDI
    233 	movl	PARAM_WP, %edi
    234 
    235 	movl	%ebx, SAVE_EBX
    236 	movl	%ebp, SAVE_EBP
    237 	movl	%eax, %ebp
    238 
    239 	movl	%esi, SAVE_ESI
    240 	xorl	%ebx, %ebx
    241 	leal	(%edx,%ecx,4), %esi	C xp end
    242 
    243 	leal	(%edi,%ecx,4), %edi	C wp end of mul1
    244 	negl	%ecx
    245 
    246 
    247 L(mul1):
    248 	C eax	scratch
    249 	C ebx	carry
    250 	C ecx	counter, negative
    251 	C edx	scratch
    252 	C esi	xp end
    253 	C edi	wp end of mul1
    254 	C ebp	multiplier
    255 
    256 	movl	(%esi,%ecx,4), %eax
    257 
    258 	mull	%ebp
    259 
    260 	addl	%ebx, %eax
    261 	movl	%eax, (%edi,%ecx,4)
    262 	movl	$0, %ebx
    263 
    264 	adcl	%edx, %ebx
    265 	incl	%ecx
    266 	jnz	L(mul1)
    267 
    268 
    269 	movl	PARAM_YSIZE, %edx
    270 	movl	PARAM_XSIZE, %ecx
    271 
    272 	movl	%ebx, (%edi)		C final carry
    273 	decl	%edx
    274 
    275 	jnz	L(ysize_more_than_one)
    276 
    277 
    278 	movl	SAVE_EDI, %edi
    279 	movl	SAVE_EBX, %ebx
    280 
    281 	movl	SAVE_EBP, %ebp
    282 	movl	SAVE_ESI, %esi
    283 	addl	$FRAME, %esp
    284 
    285 	ret
    286 
    287 
    288 L(ysize_more_than_one):
    289 	cmpl	$UNROLL_THRESHOLD, %ecx
    290 	movl	PARAM_YP, %eax
    291 
    292 	jae	L(unroll)
    293 
    294 
    295 C -----------------------------------------------------------------------------
    296 	C simple addmul looping
    297 	C
    298 	C eax	yp
    299 	C ebx
    300 	C ecx	xsize
    301 	C edx	ysize-1
    302 	C esi	xp end
    303 	C edi	wp end of mul1
    304 	C ebp
    305 
    306 	leal	4(%eax,%edx,4), %ebp	C yp end
    307 	negl	%ecx
    308 	negl	%edx
    309 
    310 	movl	(%esi,%ecx,4), %eax	C xp low limb
    311 	movl	%edx, PARAM_YSIZE	C -(ysize-1)
    312 	incl	%ecx
    313 
    314 	xorl	%ebx, %ebx		C initial carry
    315 	movl	%ecx, PARAM_XSIZE	C -(xsize-1)
    316 	movl	%ebp, PARAM_YP
    317 
    318 	movl	(%ebp,%edx,4), %ebp	C yp second lowest limb - multiplier
    319 	jmp	L(simple_outer_entry)
    320 
    321 
    322 	C this is offset 0x121 so close enough to aligned
    323 L(simple_outer_top):
    324 	C ebp	ysize counter, negative
    325 
    326 	movl	PARAM_YP, %edx
    327 	movl	PARAM_XSIZE, %ecx	C -(xsize-1)
    328 	xorl	%ebx, %ebx		C carry
    329 
    330 	movl	%ebp, PARAM_YSIZE
    331 	addl	$4, %edi		C next position in wp
    332 
    333 	movl	(%edx,%ebp,4), %ebp	C yp limb - multiplier
    334 	movl	-4(%esi,%ecx,4), %eax	C xp low limb
    335 
    336 
    337 L(simple_outer_entry):
    338 
    339 L(simple_inner):
    340 	C eax	xp limb
    341 	C ebx	carry limb
    342 	C ecx	loop counter (negative)
    343 	C edx	scratch
    344 	C esi	xp end
    345 	C edi	wp end
    346 	C ebp	multiplier
    347 
    348 	mull	%ebp
    349 
    350 	addl	%eax, %ebx
    351 	adcl	$0, %edx
    352 
    353 	addl	%ebx, (%edi,%ecx,4)
    354 	movl	(%esi,%ecx,4), %eax
    355 	adcl	$0, %edx
    356 
    357 	incl	%ecx
    358 	movl	%edx, %ebx
    359 	jnz	L(simple_inner)
    360 
    361 
    362 	mull	%ebp
    363 
    364 	movl	PARAM_YSIZE, %ebp
    365 	addl	%eax, %ebx
    366 
    367 	adcl	$0, %edx
    368 	addl	%ebx, (%edi)
    369 
    370 	adcl	$0, %edx
    371 	incl	%ebp
    372 
    373 	movl	%edx, 4(%edi)
    374 	jnz	L(simple_outer_top)
    375 
    376 
    377 	movl	SAVE_EBX, %ebx
    378 	movl	SAVE_ESI, %esi
    379 
    380 	movl	SAVE_EDI, %edi
    381 	movl	SAVE_EBP, %ebp
    382 	addl	$FRAME, %esp
    383 
    384 	ret
    385 
    386 
    387 
    388 C -----------------------------------------------------------------------------
    389 C
    390 C The unrolled loop is the same as in mpn_addmul_1(), see that code for some
    391 C comments.
    392 C
    393 C VAR_ADJUST is the negative of how many limbs the leals in the inner loop
    394 C increment xp and wp.  This is used to adjust back xp and wp, and rshifted
    395 C to given an initial VAR_COUNTER at the top of the outer loop.
    396 C
    397 C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
    398 C up to -1, inclusive.
    399 C
    400 C VAR_JMP is the computed jump into the unrolled loop.
    401 C
    402 C VAR_XP_LOW is the least significant limb of xp, which is needed at the
    403 C start of the unrolled loop.
    404 C
    405 C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
    406 C inclusive.
    407 C
    408 C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
    409 C added to give the location of the next limb of yp, which is the multiplier
    410 C in the unrolled loop.
    411 C
    412 C The trick with VAR_ADJUST means it's only necessary to do one fetch in the
    413 C outer loop to take care of xp, wp and the inner loop counter.
    414 
    415 defframe(VAR_COUNTER,  -20)
    416 defframe(VAR_ADJUST,   -24)
    417 defframe(VAR_JMP,      -28)
    418 defframe(VAR_XP_LOW,   -32)
    419 deflit(VAR_EXTRA_SPACE, 16)
    420 
    421 
    422 L(unroll):
    423 	C eax	yp
    424 	C ebx
    425 	C ecx	xsize
    426 	C edx	ysize-1
    427 	C esi	xp end
    428 	C edi	wp end of mul1
    429 	C ebp
    430 
    431 	movl	PARAM_XP, %esi
    432 	movl	4(%eax), %ebp		C multiplier (yp second limb)
    433 	leal	4(%eax,%edx,4), %eax	C yp adjust for ysize indexing
    434 
    435 	movl	PARAM_WP, %edi
    436 	movl	%eax, PARAM_YP
    437 	negl	%edx
    438 
    439 	movl	%edx, PARAM_YSIZE
    440 	leal	UNROLL_COUNT-2(%ecx), %ebx	C (xsize-1)+UNROLL_COUNT-1
    441 	decl	%ecx				C xsize-1
    442 
    443 	movl	(%esi), %eax		C xp low limb
    444 	andl	$-UNROLL_MASK-1, %ebx
    445 	negl	%ecx
    446 
    447 	subl	$VAR_EXTRA_SPACE, %esp
    448 deflit(`FRAME',16+VAR_EXTRA_SPACE)
    449 	negl	%ebx
    450 	andl	$UNROLL_MASK, %ecx
    451 
    452 	movl	%ebx, VAR_ADJUST
    453 	movl	%ecx, %edx
    454 	shll	$4, %ecx
    455 
    456 	sarl	$UNROLL_LOG2, %ebx
    457 
    458 	C 17 code bytes per limb
    459 ifdef(`PIC',`
    460 	call	L(pic_calc)
    461 L(unroll_here):
    462 ',`
    463 	leal	L(unroll_entry) (%ecx,%edx,1), %ecx
    464 ')
    465 	negl	%edx
    466 
    467 	movl	%eax, VAR_XP_LOW
    468 	movl	%ecx, VAR_JMP
    469 	leal	4(%edi,%edx,4), %edi	C wp and xp, adjust for unrolling,
    470 	leal	4(%esi,%edx,4), %esi	C  and start at second limb
    471 	jmp	L(unroll_outer_entry)
    472 
    473 
    474 ifdef(`PIC',`
    475 L(pic_calc):
    476 	C See mpn/x86/README about old gas bugs
    477 	leal	(%ecx,%edx,1), %ecx
    478 	addl	$L(unroll_entry)-L(unroll_here), %ecx
    479 	addl	(%esp), %ecx
    480 	ret_internal
    481 ')
    482 
    483 
    484 C --------------------------------------------------------------------------
    485 	ALIGN(32)
    486 L(unroll_outer_top):
    487 	C ebp	ysize counter, negative
    488 
    489 	movl	VAR_ADJUST, %ebx
    490 	movl	PARAM_YP, %edx
    491 
    492 	movl	VAR_XP_LOW, %eax
    493 	movl	%ebp, PARAM_YSIZE	C store incremented ysize counter
    494 
    495 	leal	4(%edi,%ebx,4), %edi
    496 	leal	(%esi,%ebx,4), %esi
    497 	sarl	$UNROLL_LOG2, %ebx
    498 
    499 	movl	(%edx,%ebp,4), %ebp	C yp next multiplier
    500 	movl	VAR_JMP, %ecx
    501 
    502 L(unroll_outer_entry):
    503 	mull	%ebp
    504 
    505 	testb	$1, %cl		C and clear carry bit
    506 	movl	%ebx, VAR_COUNTER
    507 	movl	$0, %ebx
    508 
    509 	movl	$0, %ecx
    510 	cmovz(	%eax, %ecx)	C eax into low carry, zero into high carry limb
    511 	cmovnz(	%eax, %ebx)
    512 
    513 	C Extra fetch of VAR_JMP is bad, but registers are tight
    514 	jmp	*VAR_JMP
    515 
    516 
    517 C -----------------------------------------------------------------------------
    518 	ALIGN(32)
    519 L(unroll_top):
    520 	C eax	xp limb
    521 	C ebx	carry high
    522 	C ecx	carry low
    523 	C edx	scratch
    524 	C esi	xp+8
    525 	C edi	wp
    526 	C ebp	yp multiplier limb
    527 	C
    528 	C VAR_COUNTER  loop counter, negative
    529 	C
    530 	C 17 bytes each limb
    531 
    532 L(unroll_entry):
    533 
    534 deflit(CHUNK_COUNT,2)
    535 forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
    536 	deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
    537 	deflit(`disp1', eval(disp0 + 4))
    538 
    539 Zdisp(	movl,	disp0,(%esi), %eax)
    540 	adcl	%edx, %ebx
    541 
    542 	mull	%ebp
    543 
    544 Zdisp(	addl,	%ecx, disp0,(%edi))
    545 	movl	$0, %ecx
    546 
    547 	adcl	%eax, %ebx
    548 
    549 
    550 	movl	disp1(%esi), %eax
    551 	adcl	%edx, %ecx
    552 
    553 	mull	%ebp
    554 
    555 	addl	%ebx, disp1(%edi)
    556 	movl	$0, %ebx
    557 
    558 	adcl	%eax, %ecx
    559 ')
    560 
    561 
    562 	incl	VAR_COUNTER
    563 	leal	UNROLL_BYTES(%esi), %esi
    564 	leal	UNROLL_BYTES(%edi), %edi
    565 
    566 	jnz	L(unroll_top)
    567 
    568 
    569 	C eax
    570 	C ebx	zero
    571 	C ecx	low
    572 	C edx	high
    573 	C esi
    574 	C edi	wp, pointing at second last limb)
    575 	C ebp
    576 	C
    577 	C carry flag to be added to high
    578 
    579 deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
    580 deflit(`disp1', eval(disp0-0 + 4))
    581 
    582 	movl	PARAM_YSIZE, %ebp
    583 	adcl	$0, %edx
    584 	addl	%ecx, disp0(%edi)
    585 
    586 	adcl	$0, %edx
    587 	incl	%ebp
    588 
    589 	movl	%edx, disp1(%edi)
    590 	jnz	L(unroll_outer_top)
    591 
    592 
    593 	movl	SAVE_ESI, %esi
    594 	movl	SAVE_EBP, %ebp
    595 
    596 	movl	SAVE_EDI, %edi
    597 	movl	SAVE_EBX, %ebx
    598 	addl	$FRAME, %esp
    599 
    600 	ret
    601 
    602 EPILOGUE()
    603