Home | History | Annotate | Line # | Download | only in k6
      1 dnl  AMD K6 mpn_mul_1 -- mpn by limb multiply.
      2 
      3 dnl  Copyright 1999, 2000, 2002, 2005 Free Software Foundation, Inc.
      4 
      5 dnl  This file is part of the GNU MP Library.
      6 dnl
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8 dnl  it under the terms of either:
      9 dnl
     10 dnl    * the GNU Lesser General Public License as published by the Free
     11 dnl      Software Foundation; either version 3 of the License, or (at your
     12 dnl      option) any later version.
     13 dnl
     14 dnl  or
     15 dnl
     16 dnl    * the GNU General Public License as published by the Free Software
     17 dnl      Foundation; either version 2 of the License, or (at your option) any
     18 dnl      later version.
     19 dnl
     20 dnl  or both in parallel, as here.
     21 dnl
     22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25 dnl  for more details.
     26 dnl
     27 dnl  You should have received copies of the GNU General Public License and the
     28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29 dnl  see https://www.gnu.org/licenses/.
     30 
     31 include(`../config.m4')
     32 
     33 
     34 C			    cycles/limb
     35 C P5
     36 C P6 model 0-8,10-12		 5.5
     37 C P6 model 9  (Banias)
     38 C P6 model 13 (Dothan)		 4.87
     39 C P4 model 0  (Willamette)
     40 C P4 model 1  (?)
     41 C P4 model 2  (Northwood)
     42 C P4 model 3  (Prescott)
     43 C P4 model 4  (Nocona)
     44 C AMD K6			 6.25
     45 C AMD K7
     46 C AMD K8
     47 
     48 
     49 C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
     50 C                      mp_limb_t multiplier);
     51 C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
     52 C                       mp_limb_t multiplier, mp_limb_t carry);
     53 C
     54 C Multiply src,size by multiplier and store the result in dst,size.
     55 C Return the carry limb from the top of the result.
     56 C
     57 C mpn_mul_1c() accepts an initial carry for the calculation; it is added into
     58 C the low limb of the result.
     59 
     60 defframe(PARAM_CARRY,     20)	C read only by the mpn_mul_1c entry point
     61 defframe(PARAM_MULTIPLIER,16)	C the single limb every src limb is multiplied by
     62 defframe(PARAM_SIZE,      12)	C limb count shared by src and dst
     63 defframe(PARAM_SRC,       8)
     64 defframe(PARAM_DST,       4)	C offsets rise by 4 in prototype argument order; presumably relative to the entry %esp (defframe is defined in the included m4 support, confirm there)
     65 
     66 dnl  Sizes below this use the simple loop.  Minimum 5 because the unrolled code always runs one full 4-limb pass and then at least one more limb.
     67 deflit(UNROLL_THRESHOLD, 5)
     68 
     69 	TEXT
     70 	ALIGN(32)	C NOTE(review): the offset and nop remarks further down presumably assume this starting alignment
     71 
     72 PROLOGUE(mpn_mul_1c)
     73 	pushl	%esi		C mpn_mul_1c entry: save esi, which holds the running carry limb
     74 deflit(`FRAME',4)
     75 	movl	PARAM_CARRY, %esi	C start from the caller-supplied carry instead of zero
     76 	jmp	L(start_nc)	C join mpn_mul_1 after its own push and carry setup; it pops esi and returns
     77 EPILOGUE()
     78 
     79 
     80 PROLOGUE(mpn_mul_1)
     81 	push	%esi		C save esi; it holds the carry limb for the whole function
     82 deflit(`FRAME',4)
     83 	xorl	%esi, %esi	C initial carry is zero on this entry point
     84 
     85 L(start_nc):	C shared with mpn_mul_1c: esi = carry in, esi already pushed, FRAME = 4
     86 	mov	PARAM_SIZE, %ecx
     87 	push	%ebx
     88 FRAME_pushl()
     89 
     90 	movl	PARAM_SRC, %ebx
     91 	push	%edi
     92 FRAME_pushl()
     93 
     94 	movl	PARAM_DST, %edi
     95 	pushl	%ebp
     96 FRAME_pushl()
     97 
     98 	cmpl	$UNROLL_THRESHOLD, %ecx
     99 	movl	PARAM_MULTIPLIER, %ebp	C movl leaves the cmpl flags intact for the jae
    100 
    101 	jae	L(unroll)	C unsigned size >= UNROLL_THRESHOLD: use the 4-way unrolled code
    102 
    103 
    104 	C code offset 0x22 here, close enough to aligned
    105 L(simple):
    106 	C eax	scratch
    107 	C ebx	src
    108 	C ecx	counter
    109 	C edx	scratch
    110 	C esi	carry
    111 	C edi	dst
    112 	C ebp	multiplier
    113 	C One limb per iteration.  Entered without a zero check, so size must be at least 1.
    114 	C this loop 8 cycles/limb
    115 
    116 	movl	(%ebx), %eax	C fetch the next src limb
    117 	addl	$4, %ebx
    118 
    119 	mull	%ebp		C edx:eax = limb * multiplier
    120 
    121 	addl	%esi, %eax	C add the incoming carry to the low half; CF = overflow
    122 	movl	$0, %esi	C clear esi with movl, not xorl, so CF survives
    123 
    124 	adcl	%edx, %esi	C new carry = high half + CF
    125 
    126 	movl	%eax, (%edi)	C store the result limb
    127 	addl	$4, %edi
    128 
    129 	loop	L(simple)	C decrement ecx and repeat while it is non-zero
    130 
    131 
    132 	popl	%ebp
    133 
    134 	popl	%edi
    135 	popl	%ebx
    136 
    137 	movl	%esi, %eax	C return value: carry out of the top limb
    138 	popl	%esi
    139 
    140 	ret
    141 
    142 
    143 C -----------------------------------------------------------------------------
    144 C The code for each limb is 6 cycles, with instruction decoding being the
    145 C limiting factor.  At 4 limbs/loop and 1 cycle/loop of overhead it's 6.25
    146 C cycles/limb in total.
    147 C
    148 C The secret ingredient to get 6.25 is to start the loop with the mul and
    149 C have the load/store pair at the end.  Rotating the load/store to the top
    150 C is a 0.5 c/l slowdown.  (Some address generation effect probably.)
    151 C
    152 C The whole unrolled loop fits nicely in exactly 80 bytes.
    153 
    154 
    155 	ALIGN(16)	C already aligned to 16 here actually
    156 L(unroll):
    157 	movl	(%ebx), %eax	C preload src[0] so the loop can open with the mull
    158 	leal	-16(%ebx,%ecx,4), %ebx	C ebx = &src[size-4]
    159 
    160 	leal	-16(%edi,%ecx,4), %edi	C edi = &dst[size-4]
    161 	subl	$4, %ecx
    162 
    163 	negl	%ecx		C ecx = -(size-4): negative index counting up, so (%edi,%ecx,4) is dst[0]
    164 
    165 
    166 	ALIGN(16)	C one byte nop for this alignment
    167 L(top):
    168 	C eax	next src limb on entry, then scratch
    169 	C ebx	&src[size-4]
    170 	C ecx	counter, negative limb index relative to ebx and edi
    171 	C edx	scratch
    172 	C esi	carry
    173 	C edi	&dst[size-4]
    174 	C ebp	multiplier
    175 
    176 	mull	%ebp		C limb at index ecx: edx:eax = limb * multiplier
    177 
    178 	addl	%esi, %eax	C low half + carry in; CF = overflow
    179 	movl	$0, %esi	C movl keeps CF for the adcl below
    180 
    181 	adcl	%edx, %esi	C carry out = high half + CF
    182 
    183 	movl	%eax, (%edi,%ecx,4)	C store the result limb
    184 	movl	4(%ebx,%ecx,4), %eax	C load the next src limb
    185 
    186 
    187 	mull	%ebp		C limb at index ecx+1, same sequence
    188 
    189 	addl	%esi, %eax
    190 	movl	$0, %esi
    191 
    192 	adcl	%edx, %esi
    193 
    194 	movl	%eax, 4(%edi,%ecx,4)
    195 	movl	8(%ebx,%ecx,4), %eax
    196 
    197 
    198 	mull	%ebp		C limb at index ecx+2
    199 
    200 	addl	%esi, %eax
    201 	movl	$0, %esi
    202 
    203 	adcl	%edx, %esi
    204 
    205 	movl	%eax, 8(%edi,%ecx,4)
    206 	movl	12(%ebx,%ecx,4), %eax
    207 
    208 
    209 	mull	%ebp		C limb at index ecx+3
    210 
    211 	addl	%esi, %eax
    212 	movl	$0, %esi
    213 
    214 	adcl	%edx, %esi
    215 
    216 	movl	%eax, 12(%edi,%ecx,4)
    217 	movl	16(%ebx,%ecx,4), %eax	C preload for the next pass or the tail; ecx <= -1 here so this stays at or below src[size-1]
    218 
    219 
    220 	addl	$4, %ecx
    221 	js	L(top)		C repeat while the index is still negative
    222 
    223 
    224 
    225 	C eax	next src limb
    226 	C ebx	&src[size-4]
    227 	C ecx	0 to 3 representing respectively 4 to 1 further limbs
    228 	C edx	scratch
    229 	C esi	carry
    230 	C edi	&dst[size-4]
    231 
    232 	testb	$2, %cl
    233 	jnz	L(finish_not_two)	C ecx is 2 or 3: at most two limbs left, skip the pair
    234 
    235 	mull	%ebp		C ecx is 0 or 1 here: first limb of the pair
    236 
    237 	addl	%esi, %eax
    238 	movl	$0, %esi
    239 
    240 	adcl	%edx, %esi
    241 
    242 	movl	%eax, (%edi,%ecx,4)
    243 	movl	4(%ebx,%ecx,4), %eax
    244 
    245 
    246 	mull	%ebp		C second limb of the pair
    247 
    248 	addl	%esi, %eax
    249 	movl	$0, %esi
    250 
    251 	adcl	%edx, %esi
    252 
    253 	movl	%eax, 4(%edi,%ecx,4)
    254 	movl	8(%ebx,%ecx,4), %eax
    255 
    256 	addl	$2, %ecx	C two limbs consumed; ecx is now 2 or 3
    257 L(finish_not_two):
    258 
    259 
    260 	testb	$1, %cl
    261 	jnz	L(finish_not_one)	C ecx is 3: only the final limb remains
    262 
    263 	mull	%ebp		C ecx is 2: second-to-last limb
    264 
    265 	addl	%esi, %eax
    266 	movl	$0, %esi
    267 
    268 	adcl	%edx, %esi
    269 
    270 	movl	%eax, 8(%edi)	C dst[size-2], fixed offset because ecx is known here
    271 	movl	12(%ebx), %eax	C src[size-1], the last limb
    272 L(finish_not_one):
    273 
    274 
    275 	mull	%ebp		C final limb; last use of the multiplier in ebp
    276 
    277 	addl	%esi, %eax
    278 	popl	%ebp		C pop does not touch flags, so CF from the addl reaches the adcl
    279 
    280 	adcl	$0, %edx	C edx = high half + CF, the carry out of the whole product
    281 
    282 	movl	%eax, 12(%edi)	C dst[size-1]
    283 	popl	%edi
    284 
    285 	popl	%ebx
    286 	movl	%edx, %eax	C return the carry limb
    287 
    288 	popl	%esi
    289 
    290 	ret
    291 
    292 EPILOGUE()
    293