Home | History | Annotate | Line # | Download | only in k6
      1 dnl  AMD K6 mpn_add/sub_n -- mpn addition or subtraction.
      2 
      3 dnl  Copyright 1999-2002 Free Software Foundation, Inc.
      4 
      5 dnl  This file is part of the GNU MP Library.
      6 dnl
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8 dnl  it under the terms of either:
      9 dnl
     10 dnl    * the GNU Lesser General Public License as published by the Free
     11 dnl      Software Foundation; either version 3 of the License, or (at your
     12 dnl      option) any later version.
     13 dnl
     14 dnl  or
     15 dnl
     16 dnl    * the GNU General Public License as published by the Free Software
     17 dnl      Foundation; either version 2 of the License, or (at your option) any
     18 dnl      later version.
     19 dnl
     20 dnl  or both in parallel, as here.
     21 dnl
     22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25 dnl  for more details.
     26 dnl
     27 dnl  You should have received copies of the GNU General Public License and the
     28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29 dnl  see https://www.gnu.org/licenses/.
     30 
     31 include(`../config.m4')
     32 
     33 
     34 C K6: normal 3.25 cycles/limb, in-place 2.75 cycles/limb.
     35 
     36 
     37 ifdef(`OPERATION_add_n', `
     38 	define(M4_inst,        adcl)		C limb operation: add with carry
     39 	define(M4_function_n,  mpn_add_n)	C name of the entry without carry-in
     40 	define(M4_function_nc, mpn_add_nc)	C name of the entry taking a carry-in
     41 	define(M4_description, add)		C wording used in the interface comment
     42 ',`ifdef(`OPERATION_sub_n', `
     43 	define(M4_inst,        sbbl)		C limb operation: subtract with borrow
     44 	define(M4_function_n,  mpn_sub_n)	C name of the entry without carry-in
     45 	define(M4_function_nc, mpn_sub_nc)	C name of the entry taking a carry-in
     46 	define(M4_description, subtract)	C wording used in the interface comment
     47 ',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
     48 ')')')
     49 
     50 MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
     51 
     52 
     53 C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
     54 C                          mp_size_t size);
     55 C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
     56 C	                      mp_size_t size, mp_limb_t carry);
     57 C
     58 C Calculate src1,size M4_description src2,size, and store the result in
     59 C dst,size.  The return value is the carry bit from the top of the result
     60 C (1 or 0).
     61 C
     62 C The _nc version accepts 1 or 0 for an initial carry into the low limb of
     63 C the calculation.  Note values other than 1 or 0 here will lead to garbage
     64 C results.
     65 C
     66 C Instruction decoding limits a normal dst=src1+src2 operation to 3 c/l, and
     67 C an in-place dst+=src to 2.5 c/l.  The unrolled loops have 1 cycle/loop of
     68 C loop control, which with 4 limbs/loop means an extra 0.25 c/l.
     69 
     70 define(PARAM_CARRY, `FRAME+20(%esp)')	C 5th argument, carry-in, read only by the _nc entry
     71 define(PARAM_SIZE,  `FRAME+16(%esp)')	C 4th argument, size in limbs
     72 define(PARAM_SRC2,  `FRAME+12(%esp)')	C 3rd argument
     73 define(PARAM_SRC1,  `FRAME+8(%esp)')	C 2nd argument
     74 define(PARAM_DST,   `FRAME+4(%esp)')	C 1st argument
     75 deflit(`FRAME',0)			C nothing pushed yet at entry; FRAME_pushl() presumably bumps this after a pushl (defined outside this file)
     76 
     77 dnl  minimum 5 because the unrolled code can't handle less
     78 deflit(UNROLL_THRESHOLD, 5)		C the in-place loop covers size-1 rounded down to a multiple of 4, and its body runs before the count is tested
     79 
     80 	TEXT
     81 	ALIGN(32)
     82 
     83 PROLOGUE(M4_function_nc)
     84 	movl	PARAM_CARRY, %eax	C entry with carry-in: load it (documented as 0 or 1) into eax
     85 	jmp	L(start)		C continue in the body shared with M4_function_n
     86 EPILOGUE()
     87 
     88 
     89 PROLOGUE(M4_function_n)
     90 	xorl	%eax, %eax		C entry without carry-in: start with carry 0
     91 L(start):
     92 	movl	PARAM_SIZE, %ecx
     93 	pushl	%ebx
     94 FRAME_pushl()
     95 
     96 	movl	PARAM_SRC1, %ebx
     97 	pushl	%edi
     98 FRAME_pushl()
     99 
    100 	movl	PARAM_SRC2, %edx
    101 	cmpl	$UNROLL_THRESHOLD, %ecx
    102 
    103 	movl	PARAM_DST, %edi		C movl keeps the cmpl flags for the jae
    104 	jae	L(unroll)		C unsigned compare: size >= UNROLL_THRESHOLD
    105 
    106 
    107 	shrl	%eax		C initial carry flag: bit 0 of eax moves into CF
    108 
    109 	C offset 0x21 here, close enough to aligned
    110 L(simple):
    111 	C eax	scratch
    112 	C ebx	src1
    113 	C ecx	counter, limbs left
    114 	C edx	src2
    115 	C esi
    116 	C edi	dst
    117 	C ebp
    118 	C
    119 	C The store to (%edi) could be done with a stosl; it'd be smaller
    120 	C code, but there's no speed gain and a cld would have to be added
    121 	C (per mpn/x86/README).
    122 
    123 	movl	(%ebx), %eax
    124 	leal	4(%ebx), %ebx		C leal, not addl: pointer bumps must leave CF alone
    125 
    126 	M4_inst	(%edx), %eax		C consumes the carry in CF and produces the next one
    127 
    128 	movl	%eax, (%edi)
    129 	leal	4(%edi), %edi
    130 
    131 	leal	4(%edx), %edx
    132 	loop	L(simple)		C ecx-- with flags untouched; NOTE(review): assumes size >= 1, a size of 0 would wrap the counter
    133 
    134 
    135 	movl	$0, %eax		C movl, not xorl, so CF survives for the setc
    136 	popl	%edi
    137 
    138 	setc	%al			C return the final carry, 0 or 1
    139 
    140 	popl	%ebx
    141 	ret
    142 
    143 
    144 C -----------------------------------------------------------------------------
    145 L(unroll):
    146 	C eax	carry
    147 	C ebx	src1
    148 	C ecx	size, at least UNROLL_THRESHOLD
    149 	C edx	src2
    150 	C esi
    151 	C edi	dst
    152 	C ebp
    153 
    154 	cmpl	%edi, %ebx		C dst == src1 ?
    155 	pushl	%esi			C keeps the cmpl flags; no FRAME_pushl since no PARAM_ is read after this point
    156 
    157 	je	L(inplace)
    158 
    159 ifdef(`OPERATION_add_n',`
    160 	cmpl	%edi, %edx		C dst == src2 ?  addition only, where the operands can be swapped
    161 
    162 	je	L(inplace_reverse)
    163 ')
    164 
    165 	movl	%ecx, %esi
    166 
    167 	andl	$-4, %ecx		C ecx = limbs handled by the unrolled loop, a multiple of 4
    168 	andl	$3, %esi		C esi = leftover limbs, 0 to 3
    169 
    170 	leal	(%ebx,%ecx,4), %ebx	C advance all three pointers past the unrolled part,
    171 	leal	(%edx,%ecx,4), %edx	C so the loop can index them with a negative ecx
    172 	leal	(%edi,%ecx,4), %edi	C that counts up to zero
    173 
    174 	negl	%ecx
    175 	shrl	%eax			C carry-in to CF, placed after the flag-writing andl and negl
    176 
    177 	ALIGN(32)
    178 L(normal_top):
    179 	C eax	scratch, limb being processed
    180 	C ebx	src1
    181 	C ecx	counter, limbs, negative
    182 	C edx	src2
    183 	C esi	leftover limbs, 0 to 3
    184 	C edi	dst
    185 	C ebp
    186 
    187 	movl	(%ebx,%ecx,4), %eax
    188 	leal	5(%ecx), %ecx		C +5 here and -1 from the loop below gives 4 limbs per pass
    189 	M4_inst	-20(%edx,%ecx,4), %eax	C the -20 displacements cancel the +5 (5 limbs of 4 bytes)
    190 	movl	%eax, -20(%edi,%ecx,4)
    191 
    192 	movl	4-20(%ebx,%ecx,4), %eax
    193 	M4_inst	4-20(%edx,%ecx,4), %eax
    194 	movl	%eax, 4-20(%edi,%ecx,4)
    195 
    196 	movl	8-20(%ebx,%ecx,4), %eax
    197 	M4_inst	8-20(%edx,%ecx,4), %eax
    198 	movl	%eax, 8-20(%edi,%ecx,4)
    199 
    200 	movl	12-20(%ebx,%ecx,4), %eax
    201 	M4_inst	12-20(%edx,%ecx,4), %eax
    202 	movl	%eax, 12-20(%edi,%ecx,4)
    203 
    204 	loop	L(normal_top)		C ecx-- and branch while nonzero, flags untouched
    205 
    206 
    207 	decl	%esi			C decl leaves CF alone; ecx is 0 after the loop
    208 	jz	L(normal_finish_one)	C exactly one leftover limb
    209 	js	L(normal_done)		C no leftover limbs
    210 
    211 	C two or three more limbs
    212 
    213 	movl	(%ebx), %eax
    214 	M4_inst	(%edx), %eax
    215 	movl	%eax, (%edi)
    216 
    217 	movl	4(%ebx), %eax
    218 	M4_inst	4(%edx), %eax
    219 	decl	%esi			C ZF set here is tested by the jz below, the movl between keeps it
    220 	movl	%eax, 4(%edi)
    221 
    222 	jz	L(normal_done)		C exactly two leftover limbs
    223 	movl	$2, %ecx		C index of the third leftover limb
    224 
    225 L(normal_finish_one):
    226 	movl	(%ebx,%ecx,4), %eax	C ecx is 0 (one leftover) or 2 (three leftovers)
    227 	M4_inst	(%edx,%ecx,4), %eax
    228 	movl	%eax, (%edi,%ecx,4)
    229 
    230 L(normal_done):
    231 	popl	%esi
    232 	popl	%edi
    233 
    234 	movl	$0, %eax		C movl, not xorl, so CF survives for the setc
    235 	popl	%ebx
    236 
    237 	setc	%al			C return the final carry, 0 or 1
    238 
    239 	ret
    240 
    241 
    242 C -----------------------------------------------------------------------------
    243 
    244 ifdef(`OPERATION_add_n',`
    245 L(inplace_reverse):
    246 	C dst==src2
    247 
    248 	movl	%ebx, %edx		C make src1 the source operand of the in-place loop
    249 ')
    250 
    251 L(inplace):
    252 	C eax	initial carry
    253 	C ebx
    254 	C ecx	size
    255 	C edx	src
    256 	C esi
    257 	C edi	dst
    258 	C ebp
    259 
    260 	leal	-1(%ecx), %esi
    261 	decl	%ecx
    262 
    263 	andl	$-4, %ecx		C ecx = limbs for the loop: size-1 rounded down to a multiple of 4
    264 	andl	$3, %esi		C esi = tail limbs minus 1, 0 to 3
    265 
    266 	movl	(%edx), %ebx		C src low limb
    267 	leal	(%edx,%ecx,4), %edx
    268 
    269 	leal	(%edi,%ecx,4), %edi
    270 	negl	%ecx
    271 
    272 	shrl	%eax			C carry-in to CF, after the flag-writing instructions above
    273 
    274 
    275 	ALIGN(32)
    276 L(inplace_top):
    277 	C eax	scratch, src limb
    278 	C ebx	next src limb
    279 	C ecx	counter, limbs, negative
    280 	C edx	src
    281 	C esi	tail limbs minus 1, 0 to 3
    282 	C edi	dst
    283 	C ebp
    284 
    285 	M4_inst	%ebx, (%edi,%ecx,4)	C read-modify-write of dst with the preloaded src limb
    286 
    287 	movl	4(%edx,%ecx,4), %eax
    288 	leal	5(%ecx), %ecx		C +5 here and -1 from the loop below gives 4 limbs per pass
    289 
    290 	M4_inst	%eax, 4-20(%edi,%ecx,4)
    291 
    292 	movl	8-20(%edx,%ecx,4), %eax
    293 	movl	12-20(%edx,%ecx,4), %ebx
    294 
    295 	M4_inst	%eax, 8-20(%edi,%ecx,4)
    296 	M4_inst	%ebx, 12-20(%edi,%ecx,4)
    297 
    298 	movl	16-20(%edx,%ecx,4), %ebx	C preload the first src limb for the next pass or for the tail
    299 	loop	L(inplace_top)
    300 
    301 
    302 	C now %esi is 0 to 3 representing respectively 1 to 4 limbs more
    303 
    304 	M4_inst	%ebx, (%edi)		C first tail limb, always present
    305 
    306 	decl	%esi			C decl leaves CF alone
    307 	jz	L(inplace_finish_one)	C two tail limbs in total; ecx is 0 after the loop
    308 	js	L(inplace_done)		C one tail limb in total
    309 
    310 	C two or three more limbs
    311 
    312 	movl	4(%edx), %eax
    313 	movl	8(%edx), %ebx
    314 	M4_inst	%eax, 4(%edi)
    315 	M4_inst	%ebx, 8(%edi)
    316 
    317 	decl	%esi
    318 	movl	$2, %ecx		C index used for a fourth tail limb; movl keeps ZF from the decl
    319 
    320 	jz	L(normal_done)		C three tail limbs in total; that epilogue has the same instructions as L(inplace_done)
    321 
    322 L(inplace_finish_one):
    323 	movl	4(%edx,%ecx,4), %eax	C ecx is 0 or 2: second or fourth tail limb
    324 	M4_inst	%eax, 4(%edi,%ecx,4)
    325 
    326 L(inplace_done):
    327 	popl	%esi
    328 	popl	%edi
    329 
    330 	movl	$0, %eax		C movl, not xorl, so CF survives for the setc
    331 	popl	%ebx
    332 
    333 	setc	%al			C return the final carry, 0 or 1
    334 
    335 	ret
    336 
    337 EPILOGUE()
    338