Home | History | Annotate | Line # | Download | only in p6
      1      1.1  mrg dnl  Intel P6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
      2      1.1  mrg 
      3  1.1.1.3  mrg dnl  Copyright 1999-2002, 2005 Free Software Foundation, Inc.
      4  1.1.1.3  mrg 
      5      1.1  mrg dnl  This file is part of the GNU MP Library.
      6      1.1  mrg dnl
      7  1.1.1.3  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8  1.1.1.3  mrg dnl  it under the terms of either:
      9  1.1.1.3  mrg dnl
     10  1.1.1.3  mrg dnl    * the GNU Lesser General Public License as published by the Free
     11  1.1.1.3  mrg dnl      Software Foundation; either version 3 of the License, or (at your
     12  1.1.1.3  mrg dnl      option) any later version.
     13  1.1.1.3  mrg dnl
     14  1.1.1.3  mrg dnl  or
     15  1.1.1.3  mrg dnl
     16  1.1.1.3  mrg dnl    * the GNU General Public License as published by the Free Software
     17  1.1.1.3  mrg dnl      Foundation; either version 2 of the License, or (at your option) any
     18  1.1.1.3  mrg dnl      later version.
     19  1.1.1.3  mrg dnl
     20  1.1.1.3  mrg dnl  or both in parallel, as here.
     21      1.1  mrg dnl
     22  1.1.1.3  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23  1.1.1.3  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24  1.1.1.3  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25  1.1.1.3  mrg dnl  for more details.
     26      1.1  mrg dnl
     27  1.1.1.3  mrg dnl  You should have received copies of the GNU General Public License and the
     28  1.1.1.3  mrg dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29  1.1.1.3  mrg dnl  see https://www.gnu.org/licenses/.
     30      1.1  mrg 
     31      1.1  mrg include(`../config.m4')
     32      1.1  mrg 
     33      1.1  mrg 
     34  1.1.1.2  mrg C			    cycles/limb
     35  1.1.1.2  mrg C P5
     36  1.1.1.2  mrg C P6 model 0-8,10-12		 6.44
     37  1.1.1.2  mrg C P6 model 9  (Banias)		 6.15
     38  1.1.1.2  mrg C P6 model 13 (Dothan)		 6.11
     39      1.1  mrg C P4 model 0  (Willamette)
     40      1.1  mrg C P4 model 1  (?)
     41      1.1  mrg C P4 model 2  (Northwood)
     42      1.1  mrg C P4 model 3  (Prescott)
     43      1.1  mrg C P4 model 4  (Nocona)
     44  1.1.1.2  mrg C AMD K6
     45  1.1.1.2  mrg C AMD K7
     46  1.1.1.2  mrg C AMD K8
     47      1.1  mrg 
     48      1.1  mrg 
     49      1.1  mrg dnl  P6 UNROLL_COUNT cycles/limb
     50      1.1  mrg dnl          8           6.7
     51      1.1  mrg dnl         16           6.35
     52      1.1  mrg dnl         32           6.3
     53      1.1  mrg dnl         64           6.3
     54      1.1  mrg dnl  Maximum possible with the current code is 64.
     55      1.1  mrg 
     56      1.1  mrg deflit(UNROLL_COUNT, 16)
                      dnl  UNROLL_COUNT is the number of limbs handled per pass of the unrolled
                      dnl  loop at L(top): the forloop there emits UNROLL_COUNT/CHUNK_COUNT chunks
                      dnl  of CHUNK_COUNT (2) limbs each.
     57      1.1  mrg 
     58      1.1  mrg 
                      dnl  Per-operation names, chosen by whichever OPERATION_ symbol is defined:
                      dnl    M4_inst          instruction applying each result limb to dst
                      dnl    M4_function_1    entry point without a carry argument
                      dnl    M4_function_1c   entry point with an initial carry argument
                      dnl    M4_description   wording used in the interface comment below
                      dnl    M4_desc_retval   wording used in the interface comment below
                      dnl  If neither symbol is defined, m4_error is invoked.
     59      1.1  mrg ifdef(`OPERATION_addmul_1', `
     60      1.1  mrg 	define(M4_inst,        addl)
     61      1.1  mrg 	define(M4_function_1,  mpn_addmul_1)
     62      1.1  mrg 	define(M4_function_1c, mpn_addmul_1c)
     63      1.1  mrg 	define(M4_description, add it to)
     64      1.1  mrg 	define(M4_desc_retval, carry)
     65      1.1  mrg ',`ifdef(`OPERATION_submul_1', `
     66      1.1  mrg 	define(M4_inst,        subl)
     67      1.1  mrg 	define(M4_function_1,  mpn_submul_1)
     68      1.1  mrg 	define(M4_function_1c, mpn_submul_1c)
     69      1.1  mrg 	define(M4_description, subtract it from)
     70      1.1  mrg 	define(M4_desc_retval, borrow)
     71      1.1  mrg ',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
     72      1.1  mrg ')')')
     73      1.1  mrg 
                      dnl  The four names this one source can provide, two per OPERATION_ choice.
                      dnl  NOTE(review): MULFUNC_PROLOGUE is defined elsewhere; confirm how the
                      dnl  list is consumed.
     74      1.1  mrg MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
     75      1.1  mrg 
     76      1.1  mrg 
     77      1.1  mrg C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
     78      1.1  mrg C                            mp_limb_t mult);
     79      1.1  mrg C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
     80      1.1  mrg C                             mp_limb_t mult, mp_limb_t carry);
     81      1.1  mrg C
     82      1.1  mrg C Calculate src,size multiplied by mult and M4_description dst,size.
     83      1.1  mrg C Return the M4_desc_retval limb from the top of the result.
     84      1.1  mrg C
     85      1.1  mrg C This code is pretty much the same as the K6 code.  The unrolled loop is
     86      1.1  mrg C the same, but there are just a few scheduling tweaks in the setups and the
     87      1.1  mrg C simple loop.
     88      1.1  mrg C
     89      1.1  mrg C A number of variations have been tried for the unrolled loop, with one or
     90      1.1  mrg C two carries, and with loads scheduled earlier, but nothing faster than 6
     91      1.1  mrg C cycles/limb has been found.
     92      1.1  mrg 
     93      1.1  mrg ifdef(`PIC',`
     94      1.1  mrg deflit(UNROLL_THRESHOLD, 5)
     95      1.1  mrg ',`
     96      1.1  mrg deflit(UNROLL_THRESHOLD, 5)
     97      1.1  mrg ')
                      dnl  UNROLL_THRESHOLD is the smallest size sent to the unrolled code at
                      dnl  L(unroll); smaller sizes stay in the simple loop.  The PIC and non-PIC
                      dnl  values are currently identical (5); presumably the ifdef is kept so the
                      dnl  two cases can be tuned separately.
     98      1.1  mrg 
                      dnl  Argument slots, in 4-byte steps following the C prototype order
                      dnl  (dst, src, size, mult, carry).  PARAM_CARRY is read only by the
                      dnl  M4_function_1c entry point.  NOTE(review): defframe is defined
                      dnl  elsewhere; presumably the numbers are offsets from the stack pointer
                      dnl  at function entry, corrected by the current FRAME value.
     99      1.1  mrg defframe(PARAM_CARRY,     20)
    100      1.1  mrg defframe(PARAM_MULTIPLIER,16)
    101      1.1  mrg defframe(PARAM_SIZE,      12)
    102      1.1  mrg defframe(PARAM_SRC,       8)
    103      1.1  mrg defframe(PARAM_DST,       4)
    104      1.1  mrg 
    105      1.1  mrg 	TEXT
    106      1.1  mrg 	ALIGN(32)
    107      1.1  mrg 
    108      1.1  mrg PROLOGUE(M4_function_1c)
                      	C Entry point taking an explicit initial carry limb.  It saves ebx,
                      	C loads the carry argument into it and joins the shared code at
                      	C L(start_nc) inside M4_function_1, arriving with the same state that
                      	C entry point has there: ebx pushed, FRAME 4, carry in ebx.
    109      1.1  mrg 	pushl	%ebx			C restored by the popl on either exit path
    110      1.1  mrg deflit(`FRAME',4)
    111      1.1  mrg 	movl	PARAM_CARRY, %ebx	C initial carry argument
    112      1.1  mrg 	jmp	L(start_nc)
    113      1.1  mrg EPILOGUE()
    114      1.1  mrg 
    115      1.1  mrg PROLOGUE(M4_function_1)
                      	C Entry point without a carry argument: ebx is saved and then cleared
                      	C to act as a zero initial carry.  Everything from L(start_nc) onwards
                      	C is shared with M4_function_1c.
    116      1.1  mrg 	push	%ebx
    117      1.1  mrg deflit(`FRAME',4)
    118      1.1  mrg 	xorl	%ebx, %ebx	C initial carry
    119      1.1  mrg 
    120      1.1  mrg L(start_nc):
                      	C Here ebx holds the initial carry and only ebx has been pushed.
                      	C The remaining register saves are interleaved with the parameter
                      	C loads, and FRAME is stepped by 4 after each push.
    121      1.1  mrg 	movl	PARAM_SIZE, %ecx
    122      1.1  mrg 	pushl	%esi
    123      1.1  mrg deflit(`FRAME',8)
    124      1.1  mrg 
    125      1.1  mrg 	movl	PARAM_SRC, %esi
    126      1.1  mrg 	pushl	%edi
    127      1.1  mrg deflit(`FRAME',12)
    128      1.1  mrg 
    129      1.1  mrg 	movl	PARAM_DST, %edi
    130      1.1  mrg 	pushl	%ebp
    131      1.1  mrg deflit(`FRAME',16)
    132      1.1  mrg 	cmpl	$UNROLL_THRESHOLD, %ecx
    133      1.1  mrg 
    134      1.1  mrg 	movl	PARAM_MULTIPLIER, %ebp	C movl leaves the cmpl flags intact
    135      1.1  mrg 	jae	L(unroll)		C unsigned size >= UNROLL_THRESHOLD
    136      1.1  mrg 
    137      1.1  mrg 
    138      1.1  mrg 	C simple loop
    139      1.1  mrg 	C this is offset 0x22, so close enough to aligned
                      	C
                      	C One limb per pass.  The body runs before the count is tested, so
                      	C size is assumed to be at least 1.  NOTE(review): confirm against
                      	C the mpn interface rules.
    140      1.1  mrg L(simple):
    141      1.1  mrg 	C eax	scratch
    142      1.1  mrg 	C ebx	carry
    143      1.1  mrg 	C ecx	counter
    144      1.1  mrg 	C edx	scratch
    145      1.1  mrg 	C esi	src
    146      1.1  mrg 	C edi	dst
    147      1.1  mrg 	C ebp	multiplier
    148      1.1  mrg 
    149      1.1  mrg 	movl	(%esi), %eax	C next src limb
    150      1.1  mrg 	addl	$4, %edi	C step dst now, the store below uses -4(%edi)
    151      1.1  mrg 
    152      1.1  mrg 	mull	%ebp		C edx:eax = src limb * multiplier
    153      1.1  mrg 
    154      1.1  mrg 	addl	%ebx, %eax	C add the incoming carry limb to the low half
    155      1.1  mrg 	adcl	$0, %edx	C and propagate into the high half
    156      1.1  mrg 
    157      1.1  mrg 	M4_inst	%eax, -4(%edi)	C apply low half to dst, carry flag = carry or borrow out
    158      1.1  mrg 	movl	%edx, %ebx	C flags unchanged by movl
    159      1.1  mrg 
    160      1.1  mrg 	adcl	$0, %ebx	C outgoing carry limb = high half + carry flag
    161      1.1  mrg 	decl	%ecx
    162      1.1  mrg 
    163      1.1  mrg 	leal	4(%esi), %esi	C leal, so the zero flag from decl reaches jnz
    164      1.1  mrg 	jnz	L(simple)
    165      1.1  mrg 
    166      1.1  mrg 
                      	C Restore the saved registers in reverse push order and return the
                      	C final carry limb in eax.
    167      1.1  mrg 	popl	%ebp
    168      1.1  mrg 	popl	%edi
    169      1.1  mrg 
    170      1.1  mrg 	popl	%esi
    171      1.1  mrg 	movl	%ebx, %eax	C return value
    172      1.1  mrg 
    173      1.1  mrg 	popl	%ebx
    174      1.1  mrg 	ret
    175      1.1  mrg 
    176      1.1  mrg 
    177      1.1  mrg 
    178      1.1  mrg C------------------------------------------------------------------------------
    179      1.1  mrg C VAR_JUMP holds the computed jump temporarily because there are not enough
    180      1.1  mrg C registers when doing the mul for the initial two carry limbs.
    181      1.1  mrg C
    182      1.1  mrg C The add/adc for the initial carry in %ebx is necessary only for the
    183      1.1  mrg C mpn_add/submul_1c entry points.  Duplicating the startup code to
    184      1.1  mrg C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
    185      1.1  mrg C idea.
    186      1.1  mrg 
    187      1.1  mrg dnl  overlapping with parameters already fetched
                      dnl  (size is in ecx and dst is in edi before L(unroll) is reached, so the
                      dnl  stack slots of those two arguments are reused as scratch variables)
    188      1.1  mrg define(VAR_COUNTER,`PARAM_SIZE')
    189      1.1  mrg define(VAR_JUMP,   `PARAM_DST')
    190      1.1  mrg 
    191      1.1  mrg 	C this is offset 0x43, so close enough to aligned
    192      1.1  mrg L(unroll):
                      	C Setup for the unrolled loop.  With k = (-(size-1)) & UNROLL_MASK:
                      	C   - VAR_COUNTER is set to (size-2) >> UNROLL_LOG2
                      	C   - the jump target is L(entry) + 15*k, skipping k limbs of code
                      	C   - the first src limb is multiplied here, with the initial carry
                      	C     added to the product
                      	C   - src is moved to one limb past the start, less k limbs, and dst
                      	C     is moved back by k limbs, to match the skipped code
                      	C   - the two carry limbs are left in ecx and ebx, exchanged when k
                      	C     is odd
    193      1.1  mrg 	C eax
    194      1.1  mrg 	C ebx	initial carry
    195      1.1  mrg 	C ecx	size
    196      1.1  mrg 	C edx
    197      1.1  mrg 	C esi	src
    198      1.1  mrg 	C edi	dst
    199      1.1  mrg 	C ebp
    200      1.1  mrg 
    201      1.1  mrg 	movl	%ecx, %edx
    202      1.1  mrg 	decl	%ecx		C ecx = size-1
    203      1.1  mrg 
    204      1.1  mrg 	subl	$2, %edx	C edx = size-2
    205      1.1  mrg 	negl	%ecx		C ecx = -(size-1)
    206      1.1  mrg 
    207      1.1  mrg 	shrl	$UNROLL_LOG2, %edx
    208      1.1  mrg 	andl	$UNROLL_MASK, %ecx	C ecx = k
    209      1.1  mrg 
    210      1.1  mrg 	movl	%edx, VAR_COUNTER
    211      1.1  mrg 	movl	%ecx, %edx		C edx = k as well
    212      1.1  mrg 
    213      1.1  mrg 	C 15 code bytes per limb
    214      1.1  mrg ifdef(`PIC',`
    215      1.1  mrg 	call	L(pic_calc)	C leaves edx = jump target, ecx = -k
    216      1.1  mrg L(here):
    217      1.1  mrg ',`
    218      1.1  mrg 	shll	$4, %edx	C edx = 16*k
    219      1.1  mrg 	negl	%ecx		C ecx = -k
    220      1.1  mrg 
    221      1.1  mrg 	leal	L(entry) (%edx,%ecx,1), %edx	C L(entry) + 16*k - k
    222      1.1  mrg ')
    223      1.1  mrg 	movl	(%esi), %eax		C src low limb
    224      1.1  mrg 
    225      1.1  mrg 	movl	%edx, VAR_JUMP		C parked, mull is about to overwrite edx
    226      1.1  mrg 	leal	ifelse(UNROLL_BYTES,256,128+) 4(%esi,%ecx,4), %esi
                      	C esi = src + 4 - 4*k, with a further 128 added when UNROLL_BYTES
                      	C is 256 (the loop displacements are then reduced by 128 to match)
    227      1.1  mrg 
    228      1.1  mrg 	mull	%ebp
    229      1.1  mrg 
    230      1.1  mrg 	addl	%ebx, %eax	C initial carry (from _1c)
    231      1.1  mrg 	adcl	$0, %edx
    232      1.1  mrg 
    233      1.1  mrg 	movl	%edx, %ebx	C high carry
    234      1.1  mrg 	leal	ifelse(UNROLL_BYTES,256,128) (%edi,%ecx,4), %edi
                      	C edi = dst - 4*k, again plus 128 when UNROLL_BYTES is 256
    235      1.1  mrg 
    236      1.1  mrg 	movl	VAR_JUMP, %edx
    237      1.1  mrg 	testl	$1, %ecx	C low bit of -k, the same as the low bit of k
    238      1.1  mrg 	movl	%eax, %ecx	C low carry
    239      1.1  mrg 
                      	C The flags from testl are still live here since movl does not
                      	C change them.  For odd k the jump lands on the second limb of a
                      	C chunk, which applies ebx to dst rather than ecx.
    240      1.1  mrg 	cmovnz(	%ebx, %ecx)	C high,low carry other way around
    241      1.1  mrg 	cmovnz(	%eax, %ebx)
    242      1.1  mrg 
    243      1.1  mrg 	jmp	*%edx
    244      1.1  mrg 
    245      1.1  mrg 
    246      1.1  mrg ifdef(`PIC',`
    247      1.1  mrg L(pic_calc):
                      	C Called from L(unroll) with edx = ecx = k.  Produces the same
                      	C results as the non-PIC code, edx = L(entry) + 15*k and ecx = -k,
                      	C taking the address of L(here) from the return address on the stack.
    248      1.1  mrg 	shll	$4, %edx
    249      1.1  mrg 	negl	%ecx
    250      1.1  mrg 
    251      1.1  mrg 	C See mpn/x86/README about old gas bugs
    252      1.1  mrg 	leal	(%edx,%ecx,1), %edx
    253      1.1  mrg 	addl	$L(entry)-L(here), %edx
    254      1.1  mrg 
    255      1.1  mrg 	addl	(%esp), %edx	C return address, which is L(here)
    256      1.1  mrg 
    257      1.1  mrg 	ret_internal
    258      1.1  mrg ')
    259      1.1  mrg 
    260      1.1  mrg 
    261      1.1  mrg C -----------------------------------------------------------
    262      1.1  mrg 	ALIGN(32)
    263      1.1  mrg L(top):
    264      1.1  mrg deflit(`FRAME',16)
    265      1.1  mrg 	C eax	scratch
    266      1.1  mrg 	C ebx	carry hi
    267      1.1  mrg 	C ecx	carry lo
    268      1.1  mrg 	C edx	scratch
    269      1.1  mrg 	C esi	src
    270      1.1  mrg 	C edi	dst
    271      1.1  mrg 	C ebp	multiplier
    272      1.1  mrg 	C
    273      1.1  mrg 	C VAR_COUNTER	loop counter
    274      1.1  mrg 	C
    275      1.1  mrg 	C 15 code bytes per limb
                      	C
                      	C The setup code jumps to L(entry) + 15*k, so each limb of code in
                      	C the forloop below has to assemble to exactly 15 bytes.
                      	C NOTE(review): Zdisp is defined elsewhere; presumably it forces an
                      	C explicit displacement so the disp0 = 0 case has the same size as
                      	C the others.
                      	C
                      	C Each chunk handles two limbs.  On entry to a chunk ecx is the limb
                      	C waiting to be applied to dst and ebx is the high carry above it.
                      	C The first half applies ecx to dst at disp0, adds the low half of
                      	C the next product plus the resulting carry flag into ebx, and
                      	C restarts ecx from the high half of that product.  The second half
                      	C does the same at disp1 with ecx and ebx exchanged.  src runs one
                      	C limb ahead of dst because the first product was formed in the
                      	C setup code.
    276      1.1  mrg 
    277      1.1  mrg 	addl	$UNROLL_BYTES, %edi	C dst is stepped here, not at the loop end
    278      1.1  mrg 
    279      1.1  mrg L(entry):
    280      1.1  mrg deflit(CHUNK_COUNT,2)
    281      1.1  mrg forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
    282      1.1  mrg 	deflit(`disp0', eval(i*4*CHUNK_COUNT ifelse(UNROLL_BYTES,256,-128)))
    283      1.1  mrg 	deflit(`disp1', eval(disp0 + 4))
    284      1.1  mrg 
    285      1.1  mrg Zdisp(	movl,	disp0,(%esi), %eax)
    286      1.1  mrg 	mull	%ebp
    287      1.1  mrg Zdisp(	M4_inst,%ecx, disp0,(%edi))
    288      1.1  mrg 	adcl	%eax, %ebx
    289      1.1  mrg 	movl	%edx, %ecx
    290      1.1  mrg 	adcl	$0, %ecx
    291      1.1  mrg 
    292      1.1  mrg 	movl	disp1(%esi), %eax
    293      1.1  mrg 	mull	%ebp
    294      1.1  mrg 	M4_inst	%ebx, disp1(%edi)
    295      1.1  mrg 	adcl	%eax, %ecx
    296      1.1  mrg 	movl	%edx, %ebx
    297      1.1  mrg 	adcl	$0, %ebx
    298      1.1  mrg ')
    299      1.1  mrg 
    300      1.1  mrg 	decl	VAR_COUNTER
    301      1.1  mrg 	leal	UNROLL_BYTES(%esi), %esi	C leal keeps the sign flag from decl for jns
    302      1.1  mrg 
    303      1.1  mrg 	jns	L(top)
    304      1.1  mrg 
    305      1.1  mrg 
                      	C Loop finished.  edi has not been stepped for the pass just done,
                      	C so the last pending limb belongs at UNROLL_BYTES past edi, less
                      	C 128 when UNROLL_BYTES is 256.
    306      1.1  mrg deflit(`disp0',	eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))
    307      1.1  mrg 
    308      1.1  mrg 	M4_inst	%ecx, disp0(%edi)	C apply the last pending limb
    309      1.1  mrg 	movl	%ebx, %eax		C high carry becomes the return value
    310      1.1  mrg 
    311      1.1  mrg 	popl	%ebp
    312      1.1  mrg 	popl	%edi
    313      1.1  mrg 
    314      1.1  mrg 	popl	%esi
    315      1.1  mrg 	popl	%ebx
    316      1.1  mrg 	adcl	$0, %eax	C add the carry flag from the final M4_inst, untouched by movl and popl
    317      1.1  mrg 
    318      1.1  mrg 	ret
    319      1.1  mrg 
    320      1.1  mrg EPILOGUE()
    321