Home | History | Annotate | Line # | Download | only in k7
      1      1.1  mrg dnl  AMD K7 mpn_add_n/mpn_sub_n -- mpn add or subtract.
      2      1.1  mrg 
      3  1.1.1.2  mrg dnl  Copyright 1999-2003 Free Software Foundation, Inc.
      4  1.1.1.2  mrg 
      5      1.1  mrg dnl  This file is part of the GNU MP Library.
      6      1.1  mrg dnl
      7  1.1.1.2  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8  1.1.1.2  mrg dnl  it under the terms of either:
      9  1.1.1.2  mrg dnl
     10  1.1.1.2  mrg dnl    * the GNU Lesser General Public License as published by the Free
     11  1.1.1.2  mrg dnl      Software Foundation; either version 3 of the License, or (at your
     12  1.1.1.2  mrg dnl      option) any later version.
     13  1.1.1.2  mrg dnl
     14  1.1.1.2  mrg dnl  or
     15  1.1.1.2  mrg dnl
     16  1.1.1.2  mrg dnl    * the GNU General Public License as published by the Free Software
     17  1.1.1.2  mrg dnl      Foundation; either version 2 of the License, or (at your option) any
     18  1.1.1.2  mrg dnl      later version.
     19  1.1.1.2  mrg dnl
     20  1.1.1.2  mrg dnl  or both in parallel, as here.
     21  1.1.1.2  mrg dnl
     22  1.1.1.2  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23  1.1.1.2  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24  1.1.1.2  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25  1.1.1.2  mrg dnl  for more details.
     26      1.1  mrg dnl
     27  1.1.1.2  mrg dnl  You should have received copies of the GNU General Public License and the
     28  1.1.1.2  mrg dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29  1.1.1.2  mrg dnl  see https://www.gnu.org/licenses/.
     30      1.1  mrg 
     31      1.1  mrg include(`../config.m4')
     32      1.1  mrg 
     33      1.1  mrg 
     34      1.1  mrg C K7: 1.64 cycles/limb (at 16 limbs/loop).
     35      1.1  mrg 
     36      1.1  mrg 
     37      1.1  mrg 
     38      1.1  mrg dnl  K7: UNROLL_COUNT cycles/limb
     39      1.1  mrg dnl           8           1.9
     40      1.1  mrg dnl          16           1.64
     41      1.1  mrg dnl          32           1.7
     42      1.1  mrg dnl          64           2.0
     43      1.1  mrg dnl  Maximum possible with the current code is 64.
     44      1.1  mrg 
     45      1.1  mrg deflit(UNROLL_COUNT, 16)	C limbs handled per pass of the unrolled loop
     46      1.1  mrg 
     47      1.1  mrg 
     48      1.1  mrg ifdef(`OPERATION_add_n', `
     49      1.1  mrg 	define(M4_inst,        adcl)	C instruction applied to each limb
     50      1.1  mrg 	define(M4_function_n,  mpn_add_n)	C name of the plain entry point
     51      1.1  mrg 	define(M4_function_nc, mpn_add_nc)	C name of the carry-in entry point
     52      1.1  mrg 	define(M4_description, add)	C word used in the description comment
     53      1.1  mrg ',`ifdef(`OPERATION_sub_n', `
     54      1.1  mrg 	define(M4_inst,        sbbl)	C instruction applied to each limb
     55      1.1  mrg 	define(M4_function_n,  mpn_sub_n)	C name of the plain entry point
     56      1.1  mrg 	define(M4_function_nc, mpn_sub_nc)	C name of the carry-in entry point
     57      1.1  mrg 	define(M4_description, subtract)	C word used in the description comment
     58      1.1  mrg ',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
     59      1.1  mrg ')')')
     60      1.1  mrg 
     61      1.1  mrg MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
     62      1.1  mrg 
     63      1.1  mrg 
     64      1.1  mrg C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
     65      1.1  mrg C                         mp_size_t size);
     66      1.1  mrg C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
     67      1.1  mrg C	                   mp_size_t size, mp_limb_t carry);
     68      1.1  mrg C
     69      1.1  mrg C Calculate src1,size M4_description src2,size, and store the result in
     70      1.1  mrg C dst,size.  The return value is the carry bit from the top of the result (1
     71      1.1  mrg C or 0).
     72      1.1  mrg C
     73      1.1  mrg C The _nc version accepts 1 or 0 for an initial carry into the low limb of
     74      1.1  mrg C the calculation.  Note values other than 1 or 0 here will lead to garbage
     75      1.1  mrg C results.
     76      1.1  mrg C
     77      1.1  mrg C This code runs at 1.64 cycles/limb, which might be the best possible with
     78      1.1  mrg C plain integer operations.  Each limb is 2 loads and 1 store, any 2 of
     79      1.1  mrg C which can be done each cycle, leading to 1.5 c/l.
     80      1.1  mrg 
     81      1.1  mrg dnl  Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1.
     82      1.1  mrg ifdef(`PIC',`
     83      1.1  mrg deflit(UNROLL_THRESHOLD, 8)	C PIC build: sizes below this use the simple loop
     84      1.1  mrg ',`
     85      1.1  mrg deflit(UNROLL_THRESHOLD, 8)	C non-PIC build: currently the same value
     86      1.1  mrg ')
     87      1.1  mrg 
     88      1.1  mrg defframe(PARAM_CARRY,20)	C fifth argument, read by the carry-in entry point only
     89      1.1  mrg defframe(PARAM_SIZE, 16)
     90      1.1  mrg defframe(PARAM_SRC2, 12)
     91      1.1  mrg defframe(PARAM_SRC1, 8)
     92      1.1  mrg defframe(PARAM_DST,  4)
     93      1.1  mrg C Slots for the registers saved once esp has been lowered by STACK_SPACE.
     94      1.1  mrg defframe(SAVE_EBP, -4)
     95      1.1  mrg defframe(SAVE_ESI, -8)
     96      1.1  mrg defframe(SAVE_EBX, -12)
     97      1.1  mrg defframe(SAVE_EDI, -16)
     98      1.1  mrg deflit(STACK_SPACE, 16)	C four 4 byte save slots
     99      1.1  mrg 
    100      1.1  mrg 	TEXT
    101      1.1  mrg 	ALIGN(32)
    102      1.1  mrg deflit(`FRAME',0)
    103      1.1  mrg 	C Carry-in entry point: load the carry argument and join the plain entry point at L(start).
    104      1.1  mrg PROLOGUE(M4_function_nc)
    105      1.1  mrg 	movl	PARAM_CARRY, %eax	C carry-in, documented above as 1 or 0
    106      1.1  mrg 	jmp	L(start)
    107      1.1  mrg EPILOGUE()
    108      1.1  mrg 
    109      1.1  mrg PROLOGUE(M4_function_n)
    110      1.1  mrg 
    111      1.1  mrg 	xorl	%eax, %eax	C carry
    112      1.1  mrg L(start):
    113      1.1  mrg 	movl	PARAM_SIZE, %ecx
    114      1.1  mrg 	subl	$STACK_SPACE, %esp
    115      1.1  mrg deflit(`FRAME',STACK_SPACE)
    116      1.1  mrg 
    117      1.1  mrg 	movl	%edi, SAVE_EDI
    118      1.1  mrg 	movl	%ebx, SAVE_EBX
    119      1.1  mrg 	cmpl	$UNROLL_THRESHOLD, %ecx
    120      1.1  mrg 
    121      1.1  mrg 	movl	PARAM_SRC2, %edx
    122      1.1  mrg 	movl	PARAM_SRC1, %ebx
    123      1.1  mrg 	jae	L(unroll)
    124      1.1  mrg 
    125      1.1  mrg 	movl	PARAM_DST, %edi
    126      1.1  mrg 	leal	(%ebx,%ecx,4), %ebx
    127      1.1  mrg 	leal	(%edx,%ecx,4), %edx
    128      1.1  mrg 
    129      1.1  mrg 	leal	(%edi,%ecx,4), %edi
    130      1.1  mrg 	negl	%ecx
    131      1.1  mrg 	shrl	%eax
    132      1.1  mrg 
    133      1.1  mrg 	C This loop in in a single 16 byte code block already, so no
    134      1.1  mrg 	C alignment necessary.
    135      1.1  mrg L(simple):
    136      1.1  mrg 	C eax	scratch
    137      1.1  mrg 	C ebx	src1
    138      1.1  mrg 	C ecx	counter
    139      1.1  mrg 	C edx	src2
    140      1.1  mrg 	C esi
    141      1.1  mrg 	C edi	dst
    142      1.1  mrg 	C ebp
    143      1.1  mrg 
    144      1.1  mrg 	movl	(%ebx,%ecx,4), %eax
    145      1.1  mrg 	M4_inst	(%edx,%ecx,4), %eax
    146      1.1  mrg 	movl	%eax, (%edi,%ecx,4)
    147      1.1  mrg 	incl	%ecx
    148      1.1  mrg 	jnz	L(simple)
    149      1.1  mrg 
    150      1.1  mrg 	movl	$0, %eax
    151      1.1  mrg 	movl	SAVE_EDI, %edi
    152      1.1  mrg 
    153      1.1  mrg 	movl	SAVE_EBX, %ebx
    154      1.1  mrg 	setc	%al
    155      1.1  mrg 	addl	$STACK_SPACE, %esp
    156      1.1  mrg 
    157      1.1  mrg 	ret
    158      1.1  mrg 
    159      1.1  mrg 
    160      1.1  mrg C -----------------------------------------------------------------------------
    161      1.1  mrg 	C This is at 0x55, close enough to aligned.
    162      1.1  mrg L(unroll):
    163      1.1  mrg deflit(`FRAME',STACK_SPACE)
    164      1.1  mrg 	movl	%ebp, SAVE_EBP
    165      1.1  mrg 	andl	$-2, %ecx		C size low bit masked out
    166      1.1  mrg 	andl	$1, PARAM_SIZE		C size low bit kept
    167      1.1  mrg 
    168      1.1  mrg 	movl	%ecx, %edi
    169      1.1  mrg 	decl	%ecx
    170      1.1  mrg 	movl	PARAM_DST, %ebp
    171      1.1  mrg 
    172      1.1  mrg 	shrl	$UNROLL_LOG2, %ecx
    173      1.1  mrg 	negl	%edi
    174      1.1  mrg 	movl	%esi, SAVE_ESI
    175      1.1  mrg 
    176      1.1  mrg 	andl	$UNROLL_MASK, %edi
    177      1.1  mrg 
    178      1.1  mrg ifdef(`PIC',`
    179      1.1  mrg 	call	L(pic_calc)
    180      1.1  mrg L(here):
    181      1.1  mrg ',`
    182      1.1  mrg 	leal	L(entry) (%edi,%edi,8), %esi	C 9 bytes per
    183      1.1  mrg ')
    184      1.1  mrg 	negl	%edi
    185      1.1  mrg 	shrl	%eax
    186      1.1  mrg 
    187      1.1  mrg 	leal	ifelse(UNROLL_BYTES,256,128) (%ebx,%edi,4), %ebx
    188      1.1  mrg 	leal	ifelse(UNROLL_BYTES,256,128) (%edx,%edi,4), %edx
    189      1.1  mrg 	leal	ifelse(UNROLL_BYTES,256,128) (%ebp,%edi,4), %edi
    190      1.1  mrg 
    191      1.1  mrg 	jmp	*%esi
    192      1.1  mrg 
    193      1.1  mrg 
    194      1.1  mrg ifdef(`PIC',`
    195      1.1  mrg L(pic_calc):
    196      1.1  mrg 	C See mpn/x86/README about old gas bugs
    197      1.1  mrg 	leal	(%edi,%edi,8), %esi
    198      1.1  mrg 	addl	$L(entry)-L(here), %esi
    199      1.1  mrg 	addl	(%esp), %esi
    200      1.1  mrg 	ret_internal
    201      1.1  mrg ')
    202      1.1  mrg 
    203      1.1  mrg 
    204      1.1  mrg C -----------------------------------------------------------------------------
    205      1.1  mrg 	ALIGN(32)
    206      1.1  mrg L(top):
    207      1.1  mrg 	C eax	zero
    208      1.1  mrg 	C ebx	src1
    209      1.1  mrg 	C ecx	counter
    210      1.1  mrg 	C edx	src2
    211      1.1  mrg 	C esi	scratch (was computed jump)
    212      1.1  mrg 	C edi	dst
    213      1.1  mrg 	C ebp	scratch
    214      1.1  mrg 
    215      1.1  mrg 	leal	UNROLL_BYTES(%edx), %edx
    216      1.1  mrg 
    217      1.1  mrg L(entry):
    218      1.1  mrg deflit(CHUNK_COUNT, 2)
    219      1.1  mrg forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
    220      1.1  mrg 	deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
    221      1.1  mrg 	deflit(`disp1', eval(disp0 + 4))
    222      1.1  mrg 
    223      1.1  mrg Zdisp(	movl,	disp0,(%ebx), %esi)
    224      1.1  mrg 	movl	disp1(%ebx), %ebp
    225      1.1  mrg Zdisp(	M4_inst,disp0,(%edx), %esi)
    226      1.1  mrg Zdisp(	movl,	%esi, disp0,(%edi))
    227      1.1  mrg 	M4_inst	disp1(%edx), %ebp
    228      1.1  mrg 	movl	%ebp, disp1(%edi)
    229      1.1  mrg ')
    230      1.1  mrg 
    231      1.1  mrg 	decl	%ecx
    232      1.1  mrg 	leal	UNROLL_BYTES(%ebx), %ebx
    233      1.1  mrg 	leal	UNROLL_BYTES(%edi), %edi
    234      1.1  mrg 	jns	L(top)
    235      1.1  mrg 
    236      1.1  mrg 
    237      1.1  mrg 	mov	PARAM_SIZE, %esi
    238      1.1  mrg 	movl	SAVE_EBP, %ebp
    239      1.1  mrg 	movl	$0, %eax
    240      1.1  mrg 
    241      1.1  mrg 	decl	%esi
    242      1.1  mrg 	js	L(even)
    243      1.1  mrg 
    244      1.1  mrg 	movl	(%ebx), %ecx
    245      1.1  mrg 	M4_inst	UNROLL_BYTES(%edx), %ecx
    246      1.1  mrg 	movl	%ecx, (%edi)
    247      1.1  mrg L(even):
    248      1.1  mrg 
    249      1.1  mrg 	movl	SAVE_EDI, %edi
    250      1.1  mrg 	movl	SAVE_EBX, %ebx
    251      1.1  mrg 	setc	%al
    252      1.1  mrg 
    253      1.1  mrg 	movl	SAVE_ESI, %esi
    254      1.1  mrg 	addl	$STACK_SPACE, %esp
    255      1.1  mrg 
    256      1.1  mrg 	ret
    257      1.1  mrg 
    258      1.1  mrg EPILOGUE()
    259