Home | History | Annotate | Line # | Download | only in k7
      1 dnl  AMD K7 mpn_add_n/mpn_sub_n -- mpn add or subtract.
      2 
      3 dnl  Copyright 1999-2003 Free Software Foundation, Inc.
      4 
      5 dnl  This file is part of the GNU MP Library.
      6 dnl
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8 dnl  it under the terms of either:
      9 dnl
     10 dnl    * the GNU Lesser General Public License as published by the Free
     11 dnl      Software Foundation; either version 3 of the License, or (at your
     12 dnl      option) any later version.
     13 dnl
     14 dnl  or
     15 dnl
     16 dnl    * the GNU General Public License as published by the Free Software
     17 dnl      Foundation; either version 2 of the License, or (at your option) any
     18 dnl      later version.
     19 dnl
     20 dnl  or both in parallel, as here.
     21 dnl
     22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25 dnl  for more details.
     26 dnl
     27 dnl  You should have received copies of the GNU General Public License and the
     28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29 dnl  see https://www.gnu.org/licenses/.
     30 
     31 include(`../config.m4')
     32 
     33 
     34 C K7: 1.64 cycles/limb (at 16 limbs/loop).
     35 
     36 
     37 
     38 dnl  K7: UNROLL_COUNT cycles/limb
     39 dnl           8           1.9
     40 dnl          16           1.64
     41 dnl          32           1.7
     42 dnl          64           2.0
     43 dnl  Maximum possible with the current code is 64.
     44 dnl  NOTE(review): presumably because 256 bytes per pass is the largest size the 128 byte displacement biasing further down caters for -- confirm.
     45 deflit(UNROLL_COUNT, 16)	C limbs handled per pass of the unrolled loop; the fastest setting in the table above
     46 
     47 
     48 ifdef(`OPERATION_add_n', `
     49 	define(M4_inst,        adcl)	C per-limb instruction: add with carry
     50 	define(M4_function_n,  mpn_add_n)	C entry point without carry-in
     51 	define(M4_function_nc, mpn_add_nc)	C entry point with carry-in
     52 	define(M4_description, add)	C descriptive word, referenced only from the header comment
     53 ',`ifdef(`OPERATION_sub_n', `
     54 	define(M4_inst,        sbbl)	C per-limb instruction: subtract with borrow
     55 	define(M4_function_n,  mpn_sub_n)	C entry point without borrow-in
     56 	define(M4_function_nc, mpn_sub_nc)	C entry point with borrow-in
     57 	define(M4_description, subtract)	C descriptive word, referenced only from the header comment
     58 ',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
     59 ')')')
     60 C The next line lists every entry point this source can be built as.  NOTE(review): presumably the build scans that line textually, so it is kept free of trailing comments -- confirm in configure.
     61 MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
     62 
     63 
     64 C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
     65 C                         mp_size_t size);
     66 C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
     67 C	                   mp_size_t size, mp_limb_t carry);
     68 C
     69 C Calculate src1,size M4_description src2,size, and store the result in
     70 C dst,size.  The return value is the carry bit from the top of the result (1
     71 C or 0).
     72 C
     73 C The _nc version accepts 1 or 0 for an initial carry into the low limb of
     74 C the calculation.  Note values other than 1 or 0 here will lead to garbage
     75 C results.
     76 C
     77 C This code runs at 1.64 cycles/limb, which might be the best possible with
     78 C plain integer operations.  Each limb is 2 loads and 1 store, any 2 of
     79 C which can be done each cycle, leading to 1.5 c/l.
     80 
     81 dnl  Sizes below UNROLL_THRESHOLD take the simple loop, all others take the unrolled loop.  Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1.
     82 ifdef(`PIC',`
     83 deflit(UNROLL_THRESHOLD, 8)	C PIC build
     84 ',`
     85 deflit(UNROLL_THRESHOLD, 8)	C non-PIC build; currently the same value as the PIC build
     86 ')
     87 
     88 defframe(PARAM_CARRY,20)	C 5th argument, read only by the _nc entry point
     89 defframe(PARAM_SIZE, 16)	C 4th argument, limb count; the unrolled path overwrites it with size&1
     90 defframe(PARAM_SRC2, 12)	C 3rd argument
     91 defframe(PARAM_SRC1, 8)	C 2nd argument
     92 defframe(PARAM_DST,  4)	C 1st argument
     93 C The SAVE_xxx slots below occupy the STACK_SPACE bytes that the function subtracts from %esp.
     94 defframe(SAVE_EBP, -4)	C written only on the unrolled path
     95 defframe(SAVE_ESI, -8)	C written only on the unrolled path
     96 defframe(SAVE_EBX, -12)	C written on both paths
     97 defframe(SAVE_EDI, -16)	C written on both paths
     98 deflit(STACK_SPACE, 16)	C four 4-byte save slots
     99 
    100 	TEXT
    101 	ALIGN(32)	C both entry points follow; NOTE(review): the 0x55 remark at L(unroll) presumably counts from this boundary -- confirm
    102 deflit(`FRAME',0)	C no stack adjustment has been made yet at function entry
    103 
    104 PROLOGUE(M4_function_nc)
    105 	movl	PARAM_CARRY, %eax	C eax = carry-in from the caller, expected to be 0 or 1
    106 	jmp	L(start)		C join M4_function_n just after its xorl, so eax is kept
    107 EPILOGUE()
    108 
    109 PROLOGUE(M4_function_n)
    110 
    111 	xorl	%eax, %eax	C carry
    112 L(start):
    113 	movl	PARAM_SIZE, %ecx
    114 	subl	$STACK_SPACE, %esp
    115 deflit(`FRAME',STACK_SPACE)
    116 
    117 	movl	%edi, SAVE_EDI
    118 	movl	%ebx, SAVE_EBX
    119 	cmpl	$UNROLL_THRESHOLD, %ecx
    120 
    121 	movl	PARAM_SRC2, %edx
    122 	movl	PARAM_SRC1, %ebx
    123 	jae	L(unroll)
    124 
    125 	movl	PARAM_DST, %edi
    126 	leal	(%ebx,%ecx,4), %ebx
    127 	leal	(%edx,%ecx,4), %edx
    128 
    129 	leal	(%edi,%ecx,4), %edi
    130 	negl	%ecx
    131 	shrl	%eax
    132 
    133 	C This loop in in a single 16 byte code block already, so no
    134 	C alignment necessary.
    135 L(simple):
    136 	C eax	scratch
    137 	C ebx	src1
    138 	C ecx	counter
    139 	C edx	src2
    140 	C esi
    141 	C edi	dst
    142 	C ebp
    143 
    144 	movl	(%ebx,%ecx,4), %eax
    145 	M4_inst	(%edx,%ecx,4), %eax
    146 	movl	%eax, (%edi,%ecx,4)
    147 	incl	%ecx
    148 	jnz	L(simple)
    149 
    150 	movl	$0, %eax
    151 	movl	SAVE_EDI, %edi
    152 
    153 	movl	SAVE_EBX, %ebx
    154 	setc	%al
    155 	addl	$STACK_SPACE, %esp
    156 
    157 	ret
    158 
    159 
    160 C -----------------------------------------------------------------------------
    161 	C This is at 0x55, close enough to aligned.
    162 L(unroll):
    163 deflit(`FRAME',STACK_SPACE)
    164 	movl	%ebp, SAVE_EBP
    165 	andl	$-2, %ecx		C size low bit masked out
    166 	andl	$1, PARAM_SIZE		C size low bit kept
    167 
    168 	movl	%ecx, %edi
    169 	decl	%ecx
    170 	movl	PARAM_DST, %ebp
    171 
    172 	shrl	$UNROLL_LOG2, %ecx
    173 	negl	%edi
    174 	movl	%esi, SAVE_ESI
    175 
    176 	andl	$UNROLL_MASK, %edi
    177 
    178 ifdef(`PIC',`
    179 	call	L(pic_calc)
    180 L(here):
    181 ',`
    182 	leal	L(entry) (%edi,%edi,8), %esi	C 9 bytes per
    183 ')
    184 	negl	%edi
    185 	shrl	%eax
    186 
    187 	leal	ifelse(UNROLL_BYTES,256,128) (%ebx,%edi,4), %ebx
    188 	leal	ifelse(UNROLL_BYTES,256,128) (%edx,%edi,4), %edx
    189 	leal	ifelse(UNROLL_BYTES,256,128) (%ebp,%edi,4), %edi
    190 
    191 	jmp	*%esi
    192 
    193 
    194 ifdef(`PIC',`
    195 L(pic_calc):
    196 	C See mpn/x86/README about old gas bugs
    197 	leal	(%edi,%edi,8), %esi
    198 	addl	$L(entry)-L(here), %esi
    199 	addl	(%esp), %esi
    200 	ret_internal
    201 ')
    202 
    203 
    204 C -----------------------------------------------------------------------------
    205 	ALIGN(32)
    206 L(top):
    207 	C eax	zero
    208 	C ebx	src1
    209 	C ecx	counter
    210 	C edx	src2
    211 	C esi	scratch (was computed jump)
    212 	C edi	dst
    213 	C ebp	scratch
    214 
    215 	leal	UNROLL_BYTES(%edx), %edx
    216 
    217 L(entry):
    218 deflit(CHUNK_COUNT, 2)
    219 forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
    220 	deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
    221 	deflit(`disp1', eval(disp0 + 4))
    222 
    223 Zdisp(	movl,	disp0,(%ebx), %esi)
    224 	movl	disp1(%ebx), %ebp
    225 Zdisp(	M4_inst,disp0,(%edx), %esi)
    226 Zdisp(	movl,	%esi, disp0,(%edi))
    227 	M4_inst	disp1(%edx), %ebp
    228 	movl	%ebp, disp1(%edi)
    229 ')
    230 
    231 	decl	%ecx
    232 	leal	UNROLL_BYTES(%ebx), %ebx
    233 	leal	UNROLL_BYTES(%edi), %edi
    234 	jns	L(top)
    235 
    236 
    237 	mov	PARAM_SIZE, %esi
    238 	movl	SAVE_EBP, %ebp
    239 	movl	$0, %eax
    240 
    241 	decl	%esi
    242 	js	L(even)
    243 
    244 	movl	(%ebx), %ecx
    245 	M4_inst	UNROLL_BYTES(%edx), %ecx
    246 	movl	%ecx, (%edi)
    247 L(even):
    248 
    249 	movl	SAVE_EDI, %edi
    250 	movl	SAVE_EBX, %ebx
    251 	setc	%al
    252 
    253 	movl	SAVE_ESI, %esi
    254 	addl	$STACK_SPACE, %esp
    255 
    256 	ret
    257 
    258 EPILOGUE()
    259