Home | History | Annotate | Line # | Download | only in pentium
      1 dnl  Intel P5 mpn_sqr_basecase -- square an mpn number.
      2 
      3 dnl  Copyright 1999-2002 Free Software Foundation, Inc.
      4 
      5 dnl  This file is part of the GNU MP Library.
      6 dnl
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8 dnl  it under the terms of either:
      9 dnl
     10 dnl    * the GNU Lesser General Public License as published by the Free
     11 dnl      Software Foundation; either version 3 of the License, or (at your
     12 dnl      option) any later version.
     13 dnl
     14 dnl  or
     15 dnl
     16 dnl    * the GNU General Public License as published by the Free Software
     17 dnl      Foundation; either version 2 of the License, or (at your option) any
     18 dnl      later version.
     19 dnl
     20 dnl  or both in parallel, as here.
     21 dnl
     22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25 dnl  for more details.
     26 dnl
     27 dnl  You should have received copies of the GNU General Public License and the
     28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29 dnl  see https://www.gnu.org/licenses/.
     30 
     31 include(`../config.m4')
     32 
     33 
     34 C P5: approx 8 cycles per crossproduct, or 15.5 cycles per triangular
     35 C product at around 20x20 limbs.
     36 
     37 
     38 C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
     39 C
     40 C Calculate src,size squared, storing the result in dst,2*size.
     41 C
     42 C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
     43 C lot of function call overheads are avoided, especially when the size is
     44 C small.
     45 
     46 defframe(PARAM_SIZE,12)
     47 defframe(PARAM_SRC, 8)
     48 defframe(PARAM_DST, 4)
     49 
     50 	TEXT
     51 	ALIGN(8)
     52 PROLOGUE(mpn_sqr_basecase)
     53 deflit(`FRAME',0)
     54 
     55 	movl	PARAM_SIZE, %edx
     56 	movl	PARAM_SRC, %eax
     57 
     58 	cmpl	$2, %edx
     59 	movl	PARAM_DST, %ecx
     60 
     61 	je	L(two_limbs)
     62 
     63 	movl	(%eax), %eax
     64 	ja	L(three_or_more)
     65 
     66 C -----------------------------------------------------------------------------
     67 C one limb only
     68 	C eax	src
     69 	C ebx
     70 	C ecx	dst
     71 	C edx
     72 
     73 	mull	%eax
     74 
     75 	movl	%eax, (%ecx)
     76 	movl	%edx, 4(%ecx)
     77 
     78 	ret
     79 
     80 C -----------------------------------------------------------------------------
     81 	ALIGN(8)
     82 L(two_limbs):
     83 	C eax	src
     84 	C ebx
     85 	C ecx	dst
     86 	C edx	size
     87 
     88 	pushl	%ebp
     89 	pushl	%edi
     90 
     91 	pushl	%esi
     92 	pushl	%ebx
     93 
     94 	movl	%eax, %ebx
     95 	movl	(%eax), %eax
     96 
     97 	mull	%eax		C src[0]^2
     98 
     99 	movl	%eax, (%ecx)	C dst[0]
    100 	movl	%edx, %esi	C dst[1]
    101 
    102 	movl	4(%ebx), %eax
    103 
    104 	mull	%eax		C src[1]^2
    105 
    106 	movl	%eax, %edi	C dst[2]
    107 	movl	%edx, %ebp	C dst[3]
    108 
    109 	movl	(%ebx), %eax
    110 
    111 	mull	4(%ebx)		C src[0]*src[1]
    112 
    113 	addl	%eax, %esi
    114 	popl	%ebx
    115 
    116 	adcl	%edx, %edi
    117 
    118 	adcl	$0, %ebp
    119 	addl	%esi, %eax
    120 
    121 	adcl	%edi, %edx
    122 	movl	%eax, 4(%ecx)
    123 
    124 	adcl	$0, %ebp
    125 	popl	%esi
    126 
    127 	movl	%edx, 8(%ecx)
    128 	movl	%ebp, 12(%ecx)
    129 
    130 	popl	%edi
    131 	popl	%ebp
    132 
    133 	ret
    134 
    135 
    136 C -----------------------------------------------------------------------------
    137 	ALIGN(8)
    138 L(three_or_more):
    139 	C eax	src low limb
    140 	C ebx
    141 	C ecx	dst
    142 	C edx	size
    143 
    144 	cmpl	$4, %edx
    145 	pushl	%ebx
    146 deflit(`FRAME',4)
    147 
    148 	movl	PARAM_SRC, %ebx
    149 	jae	L(four_or_more)
    150 
    151 
    152 C -----------------------------------------------------------------------------
    153 C three limbs
    154 	C eax	src low limb
    155 	C ebx	src
    156 	C ecx	dst
    157 	C edx	size
    158 
    159 	pushl	%ebp
    160 	pushl	%edi
    161 
    162 	mull	%eax		C src[0] ^ 2
    163 
    164 	movl	%eax, (%ecx)
    165 	movl	%edx, 4(%ecx)
    166 
    167 	movl	4(%ebx), %eax
    168 	xorl	%ebp, %ebp
    169 
    170 	mull	%eax		C src[1] ^ 2
    171 
    172 	movl	%eax, 8(%ecx)
    173 	movl	%edx, 12(%ecx)
    174 
    175 	movl	8(%ebx), %eax
    176 	pushl	%esi		C risk of cache bank clash
    177 
    178 	mull	%eax		C src[2] ^ 2
    179 
    180 	movl	%eax, 16(%ecx)
    181 	movl	%edx, 20(%ecx)
    182 
    183 	movl	(%ebx), %eax
    184 
    185 	mull	4(%ebx)		C src[0] * src[1]
    186 
    187 	movl	%eax, %esi
    188 	movl	%edx, %edi
    189 
    190 	movl	(%ebx), %eax
    191 
    192 	mull	8(%ebx)		C src[0] * src[2]
    193 
    194 	addl	%eax, %edi
    195 	movl	%edx, %ebp
    196 
    197 	adcl	$0, %ebp
    198 	movl	4(%ebx), %eax
    199 
    200 	mull	8(%ebx)		C src[1] * src[2]
    201 
    202 	xorl	%ebx, %ebx
    203 	addl	%eax, %ebp
    204 
    205 	C eax
    206 	C ebx	zero, will be dst[5]
    207 	C ecx	dst
    208 	C edx	dst[4]
    209 	C esi	dst[1]
    210 	C edi	dst[2]
    211 	C ebp	dst[3]
    212 
    213 	adcl	$0, %edx
    214 	addl	%esi, %esi
    215 
    216 	adcl	%edi, %edi
    217 
    218 	adcl	%ebp, %ebp
    219 
    220 	adcl	%edx, %edx
    221 	movl	4(%ecx), %eax
    222 
    223 	adcl	$0, %ebx
    224 	addl	%esi, %eax
    225 
    226 	movl	%eax, 4(%ecx)
    227 	movl	8(%ecx), %eax
    228 
    229 	adcl	%edi, %eax
    230 	movl	12(%ecx), %esi
    231 
    232 	adcl	%ebp, %esi
    233 	movl	16(%ecx), %edi
    234 
    235 	movl	%eax, 8(%ecx)
    236 	movl	%esi, 12(%ecx)
    237 
    238 	adcl	%edx, %edi
    239 	popl	%esi
    240 
    241 	movl	20(%ecx), %eax
    242 	movl	%edi, 16(%ecx)
    243 
    244 	popl	%edi
    245 	popl	%ebp
    246 
    247 	adcl	%ebx, %eax	C no carry out of this
    248 	popl	%ebx
    249 
    250 	movl	%eax, 20(%ecx)
    251 
    252 	ret
    253 
    254 
    255 C -----------------------------------------------------------------------------
    256 	ALIGN(8)
    257 L(four_or_more):
    258 	C eax	src low limb
    259 	C ebx	src
    260 	C ecx	dst
    261 	C edx	size
    262 	C esi
    263 	C edi
    264 	C ebp
    265 	C
    266 	C First multiply src[0]*src[1..size-1] and store at dst[1..size].
    267 
    268 deflit(`FRAME',4)
    269 
    270 	pushl	%edi
    271 FRAME_pushl()
    272 	pushl	%esi
    273 FRAME_pushl()
    274 
    275 	pushl	%ebp
    276 FRAME_pushl()
    277 	leal	(%ecx,%edx,4), %edi	C dst end of this mul1
    278 
    279 	leal	(%ebx,%edx,4), %esi	C src end
    280 	movl	%ebx, %ebp		C src
    281 
    282 	negl	%edx			C -size
    283 	xorl	%ebx, %ebx		C clear carry limb and carry flag
    284 
    285 	leal	1(%edx), %ecx		C -(size-1)
    286 
    287 L(mul1):
    288 	C eax	scratch
    289 	C ebx	carry
    290 	C ecx	counter, negative
    291 	C edx	scratch
    292 	C esi	&src[size]
    293 	C edi	&dst[size]
    294 	C ebp	src
    295 
    296 	adcl	$0, %ebx
    297 	movl	(%esi,%ecx,4), %eax
    298 
    299 	mull	(%ebp)
    300 
    301 	addl	%eax, %ebx
    302 
    303 	movl	%ebx, (%edi,%ecx,4)
    304 	incl	%ecx
    305 
    306 	movl	%edx, %ebx
    307 	jnz	L(mul1)
    308 
    309 
    310 	C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for
    311 	C n=1..size-2.
    312 	C
    313 	C The last two products, which are the end corner of the product
    314 	C triangle, are handled separately to save looping overhead.  These
    315 	C are src[size-3]*src[size-2,size-1] and src[size-2]*src[size-1].
    316 	C If size is 4 then it's only these that need to be done.
    317 	C
    318 	C In the outer loop %esi is a constant, and %edi just advances by 1
    319 	C limb each time.  The size of the operation decreases by 1 limb
    320 	C each time.
    321 
    322 	C eax
    323 	C ebx	carry (needing carry flag added)
    324 	C ecx
    325 	C edx
    326 	C esi	&src[size]
    327 	C edi	&dst[size]
    328 	C ebp
    329 
    330 	adcl	$0, %ebx
    331 	movl	PARAM_SIZE, %edx
    332 
    333 	movl	%ebx, (%edi)
    334 	subl	$4, %edx
    335 
    336 	negl	%edx
    337 	jz	L(corner)
    338 
    339 
    340 L(outer):
    341 	C ebx	previous carry limb to store
    342 	C edx	outer loop counter (negative)
    343 	C esi	&src[size]
    344 	C edi	dst, pointing at stored carry limb of previous loop
    345 
    346 	pushl	%edx			C new outer loop counter
    347 	leal	-2(%edx), %ecx
    348 
    349 	movl	%ebx, (%edi)
    350 	addl	$4, %edi
    351 
    352 	addl	$4, %ebp
    353 	xorl	%ebx, %ebx		C initial carry limb, clear carry flag
    354 
    355 L(inner):
    356 	C eax	scratch
    357 	C ebx	carry (needing carry flag added)
    358 	C ecx	counter, negative
    359 	C edx	scratch
    360 	C esi	&src[size]
    361 	C edi	dst end of this addmul
    362 	C ebp	&src[j]
    363 
    364 	adcl	$0, %ebx
    365 	movl	(%esi,%ecx,4), %eax
    366 
    367 	mull	(%ebp)
    368 
    369 	addl	%ebx, %eax
    370 	movl	(%edi,%ecx,4), %ebx
    371 
    372 	adcl	$0, %edx
    373 	addl	%eax, %ebx
    374 
    375 	movl	%ebx, (%edi,%ecx,4)
    376 	incl	%ecx
    377 
    378 	movl	%edx, %ebx
    379 	jnz	L(inner)
    380 
    381 
    382 	adcl	$0, %ebx
    383 	popl	%edx		C outer loop counter
    384 
    385 	incl	%edx
    386 	jnz	L(outer)
    387 
    388 
    389 	movl	%ebx, (%edi)
    390 
    391 L(corner):
    392 	C esi	&src[size]
    393 	C edi	&dst[2*size-4]
    394 
    395 	movl	-8(%esi), %eax
    396 	movl	-4(%edi), %ebx		C risk of data cache bank clash here
    397 
    398 	mull	-12(%esi)		C src[size-2]*src[size-3]
    399 
    400 	addl	%eax, %ebx
    401 	movl	%edx, %ecx
    402 
    403 	adcl	$0, %ecx
    404 	movl	-4(%esi), %eax
    405 
    406 	mull	-12(%esi)		C src[size-1]*src[size-3]
    407 
    408 	addl	%ecx, %eax
    409 	movl	(%edi), %ecx
    410 
    411 	adcl	$0, %edx
    412 	movl	%ebx, -4(%edi)
    413 
    414 	addl	%eax, %ecx
    415 	movl	%edx, %ebx
    416 
    417 	adcl	$0, %ebx
    418 	movl	-4(%esi), %eax
    419 
    420 	mull	-8(%esi)		C src[size-1]*src[size-2]
    421 
    422 	movl	%ecx, (%edi)
    423 	addl	%eax, %ebx
    424 
    425 	adcl	$0, %edx
    426 	movl	PARAM_SIZE, %eax
    427 
    428 	negl	%eax
    429 	movl	%ebx, 4(%edi)
    430 
    431 	addl	$1, %eax		C -(size-1) and clear carry
    432 	movl	%edx, 8(%edi)
    433 
    434 
    435 C -----------------------------------------------------------------------------
    436 C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
    437 
    438 L(lshift):
    439 	C eax	counter, negative
    440 	C ebx	next limb
    441 	C ecx
    442 	C edx
    443 	C esi
    444 	C edi	&dst[2*size-4]
    445 	C ebp
    446 
    447 	movl	12(%edi,%eax,8), %ebx
    448 
    449 	rcll	%ebx
    450 	movl	16(%edi,%eax,8), %ecx
    451 
    452 	rcll	%ecx
    453 	movl	%ebx, 12(%edi,%eax,8)
    454 
    455 	movl	%ecx, 16(%edi,%eax,8)
    456 	incl	%eax
    457 
    458 	jnz	L(lshift)
    459 
    460 
    461 	adcl	%eax, %eax		C high bit out
    462 	movl	PARAM_SRC, %esi
    463 
    464 	movl	PARAM_SIZE, %ecx	C risk of cache bank clash
    465 	movl	%eax, 12(%edi)		C dst most significant limb
    466 
    467 
    468 C -----------------------------------------------------------------------------
    469 C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
    470 C src[size-1]^2.  dst[0] hasn't yet been set at all yet, and just gets the
    471 C low limb of src[0]^2.
    472 
    473 	movl	(%esi), %eax		C src[0]
    474 	leal	(%esi,%ecx,4), %esi	C src end
    475 
    476 	negl	%ecx
    477 
    478 	mull	%eax
    479 
    480 	movl	%eax, 16(%edi,%ecx,8)	C dst[0]
    481 	movl	%edx, %ebx
    482 
    483 	addl	$1, %ecx		C size-1 and clear carry
    484 
    485 L(diag):
    486 	C eax	scratch (low product)
    487 	C ebx	carry limb
    488 	C ecx	counter, negative
    489 	C edx	scratch (high product)
    490 	C esi	&src[size]
    491 	C edi	&dst[2*size-4]
    492 	C ebp	scratch (fetched dst limbs)
    493 
    494 	movl	(%esi,%ecx,4), %eax
    495 	adcl	$0, %ebx
    496 
    497 	mull	%eax
    498 
    499 	movl	16-4(%edi,%ecx,8), %ebp
    500 
    501 	addl	%ebp, %ebx
    502 	movl	16(%edi,%ecx,8), %ebp
    503 
    504 	adcl	%eax, %ebp
    505 	movl	%ebx, 16-4(%edi,%ecx,8)
    506 
    507 	movl	%ebp, 16(%edi,%ecx,8)
    508 	incl	%ecx
    509 
    510 	movl	%edx, %ebx
    511 	jnz	L(diag)
    512 
    513 
    514 	adcl	$0, %edx
    515 	movl	16-4(%edi), %eax	C dst most significant limb
    516 
    517 	addl	%eax, %edx
    518 	popl	%ebp
    519 
    520 	movl	%edx, 16-4(%edi)
    521 	popl	%esi		C risk of cache bank clash
    522 
    523 	popl	%edi
    524 	popl	%ebx
    525 
    526 	ret
    527 
    528 EPILOGUE()
    529