Home | History | Annotate | Line # | Download | only in p6
      1 dnl  Intel P6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
      2 
      3 dnl  Copyright 1999-2002, 2005 Free Software Foundation, Inc.
      4 
      5 dnl  This file is part of the GNU MP Library.
      6 dnl
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8 dnl  it under the terms of either:
      9 dnl
     10 dnl    * the GNU Lesser General Public License as published by the Free
     11 dnl      Software Foundation; either version 3 of the License, or (at your
     12 dnl      option) any later version.
     13 dnl
     14 dnl  or
     15 dnl
     16 dnl    * the GNU General Public License as published by the Free Software
     17 dnl      Foundation; either version 2 of the License, or (at your option) any
     18 dnl      later version.
     19 dnl
     20 dnl  or both in parallel, as here.
     21 dnl
     22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25 dnl  for more details.
     26 dnl
     27 dnl  You should have received copies of the GNU General Public License and the
     28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29 dnl  see https://www.gnu.org/licenses/.
     30 
     31 include(`../config.m4')
     32 
     33 
     34 C			    cycles/limb
     35 C P5
     36 C P6 model 0-8,10-12		 6.44
     37 C P6 model 9  (Banias)		 6.15
     38 C P6 model 13 (Dothan)		 6.11
     39 C P4 model 0  (Willamette)
     40 C P4 model 1  (?)
     41 C P4 model 2  (Northwood)
     42 C P4 model 3  (Prescott)
     43 C P4 model 4  (Nocona)
     44 C AMD K6
     45 C AMD K7
     46 C AMD K8
     47 
     48 
     49 dnl  P6 UNROLL_COUNT cycles/limb
     50 dnl          8           6.7
     51 dnl         16           6.35
     52 dnl         32           6.3
     53 dnl         64           6.3
     54 dnl  Maximum possible with the current code is 64.
     55 dnl  UNROLL_COUNT is the number of limbs handled by one pass of L(top).
     56 deflit(UNROLL_COUNT, 16)
     57 
     58 
     59 ifdef(`OPERATION_addmul_1', `
     60 	define(M4_inst,        addl)	C instruction applied to each dst limb
     61 	define(M4_function_1,  mpn_addmul_1)	C entry point without a carry-in
     62 	define(M4_function_1c, mpn_addmul_1c)	C entry point with a carry-in limb
     63 	define(M4_description, add it to)	C wording for the header comment
     64 	define(M4_desc_retval, carry)	C wording for the header comment
     65 ',`ifdef(`OPERATION_submul_1', `
     66 	define(M4_inst,        subl)	C instruction applied to each dst limb
     67 	define(M4_function_1,  mpn_submul_1)	C entry point without a carry-in
     68 	define(M4_function_1c, mpn_submul_1c)	C entry point with a carry-in limb
     69 	define(M4_description, subtract it from)	C wording for the header comment
     70 	define(M4_desc_retval, borrow)	C wording for the header comment
     71 ',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
     72 ')')')
     73 
     74 MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
     75 
     76 
     77 C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
     78 C                            mp_limb_t mult);
     79 C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
     80 C                             mp_limb_t mult, mp_limb_t carry);
     81 C
     82 C Calculate src,size multiplied by mult and M4_description dst,size.
     83 C Return the M4_desc_retval limb from the top of the result.
     84 C
     85 C This code is pretty much the same as the K6 code.  The unrolled loop is
     86 C the same, but there are just a few scheduling tweaks in the setups and the
     87 C simple loop.
     88 C
     89 C A number of variations have been tried for the unrolled loop, with one or
     90 C two carries, and with loads scheduled earlier, but nothing faster than 6
     91 C cycles/limb has been found.
     92 
     93 ifdef(`PIC',`
     94 deflit(UNROLL_THRESHOLD, 5)	C sizes at or above this take the L(unroll) path
     95 ',`
     96 deflit(UNROLL_THRESHOLD, 5)	C currently the same value as the PIC build
     97 ')
     98 
     99 defframe(PARAM_CARRY,     20)	C carry limb; read only by M4_function_1c
    100 defframe(PARAM_MULTIPLIER,16)	C mult
    101 defframe(PARAM_SIZE,      12)	C size; slot reused later as VAR_COUNTER
    102 defframe(PARAM_SRC,       8)	C src
    103 defframe(PARAM_DST,       4)	C dst; slot reused later as VAR_JUMP
    104 
    105 	TEXT
    106 	ALIGN(32)
    107 
    108 PROLOGUE(M4_function_1c)
    109 	pushl	%ebx		C same first push as M4_function_1 so the frames match
    110 deflit(`FRAME',4)
    111 	movl	PARAM_CARRY, %ebx	C caller-supplied initial carry limb
    112 	jmp	L(start_nc)	C continue in the shared body of M4_function_1
    113 EPILOGUE()
    114 
    115 PROLOGUE(M4_function_1)
    116 	push	%ebx		C restored by the popl before each ret
    117 deflit(`FRAME',4)
    118 	xorl	%ebx, %ebx	C initial carry
    119 	C M4_function_1c jumps in here with its carry argument already in ebx
    120 L(start_nc):
    121 	movl	PARAM_SIZE, %ecx
    122 	pushl	%esi
    123 deflit(`FRAME',8)
    124 
    125 	movl	PARAM_SRC, %esi
    126 	pushl	%edi
    127 deflit(`FRAME',12)
    128 
    129 	movl	PARAM_DST, %edi
    130 	pushl	%ebp
    131 deflit(`FRAME',16)
    132 	cmpl	$UNROLL_THRESHOLD, %ecx	C flags consumed by the jae below
    133 
    134 	movl	PARAM_MULTIPLIER, %ebp	C movl does not disturb the cmpl flags
    135 	jae	L(unroll)	C unsigned size >= UNROLL_THRESHOLD
    136 
    137 
    138 	C simple loop
    139 	C this is offset 0x22, so close enough to aligned
    140 L(simple):
    141 	C eax	scratch
    142 	C ebx	carry
    143 	C ecx	counter
    144 	C edx	scratch
    145 	C esi	src
    146 	C edi	dst
    147 	C ebp	multiplier
    148 	C NOTE(review): one limb per iteration; assumes size is at least 1 - confirm
    149 	movl	(%esi), %eax		C next src limb
    150 	addl	$4, %edi		C advance dst early; the store below uses -4
    151 
    152 	mull	%ebp			C edx:eax = src limb * multiplier
    153 
    154 	addl	%ebx, %eax		C add the carry limb from the previous step
    155 	adcl	$0, %edx
    156 
    157 	M4_inst	%eax, -4(%edi)		C add to or subtract from the dst limb
    158 	movl	%edx, %ebx		C movl keeps the carry flag from M4_inst
    159 
    160 	adcl	$0, %ebx		C next carry = high limb + carry or borrow
    161 	decl	%ecx
    162 
    163 	leal	4(%esi), %esi		C leal keeps the zero flag from decl for jnz
    164 	jnz	L(simple)
    165 
    166 
    167 	popl	%ebp
    168 	popl	%edi
    169 
    170 	popl	%esi
    171 	movl	%ebx, %eax		C return value is the final carry limb
    172 
    173 	popl	%ebx
    174 	ret
    175 
    176 
    177 
    178 C------------------------------------------------------------------------------
    179 C VAR_JUMP holds the computed jump temporarily because there are not enough
    180 C registers when doing the mul for the initial two carry limbs.
    181 C
    182 C The add/adc for the initial carry in %ebx is necessary only for the
    183 C mpn_add/submul_1c entry points.  Duplicating the startup code to
    184 C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
    185 C idea.
    186 
    187 dnl  overlapping with parameters already fetched
    188 define(VAR_COUNTER,`PARAM_SIZE')
    189 define(VAR_JUMP,   `PARAM_DST')
    190 
    191 	C this is offset 0x43, so close enough to aligned
    192 L(unroll):
    193 	C eax
    194 	C ebx	initial carry
    195 	C ecx	size
    196 	C edx
    197 	C esi	src
    198 	C edi	dst
    199 	C ebp	multiplier
    200 	C UNROLL_LOG2, UNROLL_MASK and UNROLL_BYTES are not defined in this file
    201 	movl	%ecx, %edx
    202 	decl	%ecx			C ecx = size-1
    203 
    204 	subl	$2, %edx		C edx = size-2
    205 	negl	%ecx			C ecx = -(size-1)
    206 
    207 	shrl	$UNROLL_LOG2, %edx	C edx = pass counter for the jns at the loop end
    208 	andl	$UNROLL_MASK, %ecx	C ecx = skip, the limb slots bypassed on entry
    209 
    210 	movl	%edx, VAR_COUNTER	C stored in the PARAM_SIZE slot
    211 	movl	%ecx, %edx
    212 
    213 	C 15 code bytes per limb, so the jump target is L(entry) + 16*skip - skip
    214 ifdef(`PIC',`
    215 	call	L(pic_calc)	C returns with edx = jump target and ecx = -skip
    216 L(here):
    217 ',`
    218 	shll	$4, %edx	C edx = 16*skip
    219 	negl	%ecx		C ecx = -skip
    220 
    221 	leal	L(entry) (%edx,%ecx,1), %edx	C edx = L(entry) + 15*skip
    222 ')
    223 	movl	(%esi), %eax		C src low limb
    224 
    225 	movl	%edx, VAR_JUMP		C park the target; mull overwrites edx
    226 	leal	ifelse(UNROLL_BYTES,256,128+) 4(%esi,%ecx,4), %esi	C src + 4 - 4*skip
    227 
    228 	mull	%ebp
    229 
    230 	addl	%ebx, %eax	C initial carry (from _1c)
    231 	adcl	$0, %edx
    232 
    233 	movl	%edx, %ebx	C high carry
    234 	leal	ifelse(UNROLL_BYTES,256,128) (%edi,%ecx,4), %edi	C dst - 4*skip
    235 
    236 	movl	VAR_JUMP, %edx
    237 	testl	$1, %ecx	C odd skip enters at the second limb of a pair
    238 	movl	%eax, %ecx	C low carry
    239 
    240 	cmovnz(	%ebx, %ecx)	C high,low carry other way around
    241 	cmovnz(	%eax, %ebx)
    242 
    243 	jmp	*%edx
    244 
    245 
    246 ifdef(`PIC',`
    247 L(pic_calc):
    248 	shll	$4, %edx	C edx = 16*skip
    249 	negl	%ecx		C ecx = -skip
    250 
    251 	C See mpn/x86/README about old gas bugs
    252 	leal	(%edx,%ecx,1), %edx	C edx = 15*skip
    253 	addl	$L(entry)-L(here), %edx	C plus the distance from L(here) to L(entry)
    254 
    255 	addl	(%esp), %edx	C plus the return address which is L(here)
    256 
    257 	ret_internal
    258 ')
    259 
    260 
    261 C -----------------------------------------------------------
    262 	ALIGN(32)
    263 L(top):
    264 deflit(`FRAME',16)
    265 	C eax	scratch
    266 	C ebx	carry hi
    267 	C ecx	carry lo
    268 	C edx	scratch
    269 	C esi	src
    270 	C edi	dst
    271 	C ebp	multiplier
    272 	C
    273 	C VAR_COUNTER	loop counter
    274 	C
    275 	C 15 code bytes per limb
    276 
    277 	addl	$UNROLL_BYTES, %edi	C dst advances here; src advances at the loop end
    278 
    279 L(entry):
    280 deflit(CHUNK_COUNT,2)
    281 forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
    282 	deflit(`disp0', eval(i*4*CHUNK_COUNT ifelse(UNROLL_BYTES,256,-128)))
    283 	deflit(`disp1', eval(disp0 + 4))
    284 	C NOTE(review): Zdisp presumably keeps the encoding length fixed when disp0 is 0 - confirm
    285 Zdisp(	movl,	disp0,(%esi), %eax)	C next src limb
    286 	mull	%ebp
    287 Zdisp(	M4_inst,%ecx, disp0,(%edi))	C apply the pending low limb and set carry or borrow
    288 	adcl	%eax, %ebx	C ebx = old high + new low + carry
    289 	movl	%edx, %ecx
    290 	adcl	$0, %ecx	C ecx = new high + carry
    291 
    292 	movl	disp1(%esi), %eax	C second limb of the pair with ebx and ecx swapped
    293 	mull	%ebp
    294 	M4_inst	%ebx, disp1(%edi)
    295 	adcl	%eax, %ecx
    296 	movl	%edx, %ebx
    297 	adcl	$0, %ebx
    298 ')
    299 
    300 	decl	VAR_COUNTER	C another pass while the counter stays non-negative
    301 	leal	UNROLL_BYTES(%esi), %esi	C leal keeps the sign flag from decl for jns
    302 
    303 	jns	L(top)
    304 
    305 	C edi was not advanced after the last pass, hence the UNROLL_BYTES displacement
    306 deflit(`disp0',	eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))
    307 
    308 	M4_inst	%ecx, disp0(%edi)	C apply the final pending low limb
    309 	movl	%ebx, %eax
    310 
    311 	popl	%ebp
    312 	popl	%edi
    313 
    314 	popl	%esi
    315 	popl	%ebx
    316 	adcl	$0, %eax	C movl and popl left the carry from M4_inst intact
    317 
    318 	ret
    319 
    320 EPILOGUE()
    321