dnl  divrem_1.asm revision 1.1.1.1
      1 dnl  x86-64 mpn_divrem_1 -- mpn by limb division.
      2 
      3 dnl  Copyright 2004, 2005, 2007, 2008, 2009 Free Software Foundation, Inc.
      4 
      5 dnl  This file is part of the GNU MP Library.
      6 
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8 dnl  it under the terms of the GNU Lesser General Public License as published
      9 dnl  by the Free Software Foundation; either version 3 of the License, or (at
     10 dnl  your option) any later version.
     11 
     12 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     13 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     14 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
     15 dnl  License for more details.
     16 
     17 dnl  You should have received a copy of the GNU Lesser General Public License
     18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
     19 
     20 include(`../config.m4')
     21 
     22 
     23 C		norm	unorm	frac
     24 C K8		13	13	12
     25 C P4		44.2	44.2	42.3
     26 C P6 core2	25	24.5	19.3
     27 C P6 corei7	21.5	20.7	18
     28 C P6 atom	42	52	37
     29 
     30 C TODO
     31 C  * Compute the inverse without relying on the div instruction.
     32 C    Newton's method and mulq, or perhaps the faster fdiv.
     33 C  * Tune prologue.
     34 C  * Optimize for Core 2.
     35 
     36 C The code for unnormalized divisors works also for normalized divisors, but
     37 C for some reason it runs really slowly (on K8) for that case.  Use special
     38 C code until we can address this.  The Intel Atom is also affected, but
     39 C understandably (shld slowness).
      40 define(`SPECIAL_CODE_FOR_NORMALIZED_DIVISOR',1)
      41 
      42 C mp_limb_t
      43 C mpn_divrem_1 (mp_ptr qp, mp_size_t fn,
      44 C               mp_srcptr np, mp_size_t nn, mp_limb_t d)
      45 
      46 C mp_limb_t
      47 C mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn,
      48 C                      mp_srcptr np, mp_size_t nn, mp_limb_t d,
      49 C                      mp_limb_t dinv, int cnt)
      50 
      51 C INPUT PARAMETERS
C The *_param names are the SysV AMD64 argument registers.  They are live
C only until the prologues copy them into the working registers below,
C after which the argument registers are reused.
      52 define(`qp',		`%rdi')
      53 define(`fn_param',	`%rsi')
      54 define(`up_param',	`%rdx')
      55 define(`un_param',	`%rcx')
      56 define(`d',		`%r8')
      57 define(`dinv',		`%r9')		C only for mpn_preinv_divrem_1
      58 C       shift passed on stack		C only for mpn_preinv_divrem_1
      59 
C Working registers.  Note the aliasing: up is %rsi (= fn_param) and cnt is
C %rcx (= un_param), so the prologues must save fn and un into the
C callee-saved regs BEFORE writing up or cnt.
      60 define(`cnt',		`%rcx')
      61 define(`up',		`%rsi')
      62 define(`fn',		`%r12')
      63 define(`un',		`%rbx')
     64 
     65 
     66 C rax rbx rcx rdx rsi rdi rbp r8  r9  r10 r11 r12 r13 r14 r15
     67 C         cnt         qp      d  dinv
     68 
      69 ASM_START()
      70 	TEXT
      71 	ALIGN(16)
C mpn_preinv_divrem_1: same job as mpn_divrem_1 below, but the caller
C supplies the precomputed reciprocal dinv (in %r9) and the normalization
C shift count (7th argument, on the stack), so the expensive div used to
C derive the reciprocal is skipped.  After register setup this jumps into
C the shared loops inside mpn_divrem_1; the pops matching the four pushes
C here are at L(ret) in that routine, so control never reaches this
C EPILOGUE marker.
PROLOGUE(mpn_preinv_divrem_1)
      73 	xor	%eax, %eax		C partial remainder starts at 0
      74 	push	%r13
      75 	push	%r12
      76 	push	%rbp
      77 	push	%rbx
      78 
C Copy arguments out of the ABI registers before %rsi/%rcx are reused as
C up/cnt (see the define()s above).
      79 	mov	fn_param, fn
      80 	mov	un_param, un
      81 	add	fn_param, un_param	C un_param = total quotient limbs, un + fn
      82 	mov	up_param, up
      83 
      84 	lea	-8(qp,un_param,8), qp	C qp -> most significant quotient limb;
C limbs are stored working downwards.
      85 
      86 	test	d, d
      87 	js	L(nent)			C d already normalized (high bit set)
C Unnormalized divisor: fetch the shift count from the stack (8 bytes above
C the return address, which sits above the four pushes: 4*8 + 8 = 40) and
C normalize d here before entering the shared loop.
      88 	mov	40(%rsp), R8(cnt)
      89 	shl	R8(cnt), d
      90 	jmp	L(uent)
EPILOGUE()
     92 
     93 	ALIGN(16)
C mpn_divrem_1: divide the un-limb number {up,un} by the single limb d,
C producing un integer quotient limbs plus fn low "fraction" limbs (stored
C at qp, most significant first, working downwards), and return the
C remainder in %rax.  One div instruction computes a pre-inverted
C reciprocal of d; each quotient limb is then derived with mul/imul plus
C small adjustment steps instead of a div.  L(ret) below is also the exit
C path for mpn_preinv_divrem_1 above, whose prologue pushes the same four
C callee-saved registers.
PROLOGUE(mpn_divrem_1)
      95 	xor	%eax, %eax		C partial remainder starts at 0
      96 	push	%r13
      97 	push	%r12
      98 	push	%rbp
      99 	push	%rbx
     100 
     101 	mov	fn_param, fn
     102 	mov	un_param, un
     103 	add	fn_param, un_param	C un_param = total limbs, un + fn
     104 	mov	up_param, up
     105 	je	L(ret)			C un + fn == 0; flags are from the add
C above (mov does not modify flags)
     106 
     107 	lea	-8(qp,un_param,8), qp	C qp -> most significant quotient limb
     108 	xor	R32(%rbp), R32(%rbp)	C rbp = 0
     109 
     110 
     111 ifdef(`SPECIAL_CODE_FOR_NORMALIZED_DIVISOR',`
C Dispatch on whether d is already normalized, i.e. has its high bit set.
     112 	test	d, d
     113 	jns	L(unnormalized)
     114 
C Normalized divisor: no shifting needed anywhere; cnt becomes 0 below.
C First reduce the top limb directly: quotient limb is 1 and d is
C subtracted when up[un-1] >= d, else the quotient limb is 0.
     115 L(normalized):
     116 	test	un, un
     117 	je	L(8)			C un == 0
     118 	mov	-8(up,un,8), %rbp
     119 	dec	un
     120 	mov	%rbp, %rax
     121 	sub	d, %rbp
     122 	cmovb	%rax, %rbp		C restore top limb on borrow
     123 	sbb	%eax, %eax		C eax = -1 on borrow, else 0
     124 	inc	%eax			C top quotient limb: 0 or 1
     125 	mov	%rax, (qp)
     126 	lea	-8(qp), qp
     127 L(8):
C Reciprocal: dinv = floor((B^2-1)/d) - B with B = 2^64, computed by
C dividing the double-limb value (B-1-d)*B + (B-1) by d.
     128 	mov	d, %rdx
     129 	mov	$-1, %rax
     130 	not	%rdx
     131 	div	d			C FREE rax rdx rcx r9 r10 r11
     132 	mov	%rax, dinv
     133 	mov	%rbp, %rax		C rax = partial remainder
     134 	jmp	L(nent)
     135 
     136 	ALIGN(16)
C Main loop, normalized case: one quotient limb per iteration from the
C two-limb value rax:up[un] via the reciprocal.  The cmovb/adc pair and
C the rare L(nfx) fixup correct the candidate quotient held in r13.
     137 L(nloop):				C		    cycK8  cycP6  cycP4
     138 	mov	(up,un,8), %r10		C
     139 	lea	1(%rax), %rbp		C
     140 	mul	dinv			C		     0,13   0,19  0,45
     141 	add	%r10, %rax		C		     4      8     12
     142 	adc	%rbp, %rdx		C		     5      9     13
     143 	mov	%rax, %rbp		C		     5      9     13
     144 	mov	%rdx, %r13		C		     6      11    23
     145 	imul	d, %rdx			C		     6      11    23
     146 	sub	%rdx, %r10		C		     10     16    33
     147 	mov	d, %rax			C
     148 	add	%r10, %rax		C		     11     17    34
     149 	cmp	%rbp, %r10		C		     11     17    34
     150 	cmovb	%r10, %rax		C		     12     18    35
     151 	adc	$-1, %r13		C
     152 	cmp	d, %rax			C
     153 	jae	L(nfx)			C
     154 L(nok):	mov	%r13, (qp)		C
     155 	sub	$8, qp			C
     156 L(nent):dec	un			C
     157 	jns	L(nloop)		C
     158 
C Integer part done.  cnt = 0 since nothing was shifted; fall into the
C fraction development at L(87).
     159 	xor	%ecx, %ecx
     160 	jmp	L(87)
     161 
C Fixup: candidate remainder was still >= d, subtract once more, bump q.
     162 L(nfx):	sub	d, %rax
     163 	inc	%r13
     164 	jmp	L(nok)
     165 ')
     166 
C Unnormalized divisor: shift d left by cnt so its high bit is set, then
C run the same reciprocal scheme on shifted limbs.
     167 L(unnormalized):
     168 	test	un, un
     169 	je	L(44)
C When the top limb is already < d it is simply the initial remainder;
C emit a zero quotient limb (rbp is still 0 here) for it.
     170 	mov	-8(up,un,8), %rax
     171 	cmp	d, %rax
     172 	jae	L(44)
     173 	mov	%rbp, (qp)		C store quotient limb 0
     174 	mov	%rax, %rbp		C rbp = initial remainder
     175 	lea	-8(qp), qp
C NOTE(review): the next je looks unreachable: flags come from the cmp at
C line 171 (mov/lea preserve flags), and jae above already took every case
C with ZF set (rax >= d).  Kept byte-identical to upstream -- confirm
C against current GMP before removing.
     176 	je	L(ret)
     177 	dec	un
     178 L(44):
C cnt = number of leading zeros of d: bsr yields the top bit index b, and
C the shifts use only the low 6 bits of ~b, which equal 63-b.
     179 	bsr	d, %rcx
     180 	not	%ecx
     181 	sal	%cl, d			C normalize d
     182 	sal	%cl, %rbp		C pre-shift the partial remainder
C Same reciprocal computation as in the normalized path.
     183 	mov	d, %rdx
     184 	mov	$-1, %rax
     185 	not	%rdx
     186 	div	d			C FREE rax rdx r9 r10 r11
     187 	test	un, un
     188 	mov	%rax, dinv
     189 	mov	%rbp, %rax
     190 	je	L(87)			C no integer limbs left, only fractions
C Loop entry, shared with mpn_preinv_divrem_1 (which enters with rax = 0
C and d pre-shifted).  Build the first normalized dividend limb from the
C partial remainder and the top bits of up[un-1].
     191 L(uent):
     192 	mov	-8(up,un,8), %rbp
     193 	shr	%cl, %rax		C undo the pre-shift from line 182
     194 	shld	%cl, %rbp, %rax		C rax = rem<<cnt | up[un-1]>>(64-cnt)
     195 	sub	$2, un			C rbp holds up[un-1]; body reads up[un-2]
     196 	js	L(ulast)
     197 
     198 	ALIGN(16)
C Main loop, unnormalized case: same reciprocal division step as L(nloop),
C with shld merging adjacent source limbs into normalized dividend limbs.
     199 L(uloop):
     200 	nop
     201 	mov	(up,un,8), %r10
     202 	lea	1(%rax), %r11
     203 	shld	%cl, %r10, %rbp
     204 	mul	dinv
     205 	add	%rbp, %rax
     206 	adc	%r11, %rdx
     207 	mov	%rax, %r11
     208 	mov	%rdx, %r13
     209 	imul	d, %rdx
     210 	sub	%rdx, %rbp
     211 	mov	d, %rax
     212 	add	%rbp, %rax
     213 	cmp	%r11, %rbp
     214 	cmovb	%rbp, %rax
     215 	adc	$-1, %r13
     216 	cmp	d, %rax
     217 	jae	L(ufx)
     218 L(uok):	mov	%r13, (qp)
     219 	sub	$8, qp
     220 	dec	un
     221 	mov	%r10, %rbp
     222 	jns	L(uloop)
C Last integer limb: only zero bits shift in from below, so a plain sal
C replaces the shld.
     223 L(ulast):
     224 	lea	1(%rax), %r11
     225 	sal	%cl, %rbp
     226 	mul	dinv
     227 	add	%rbp, %rax
     228 	adc	%r11, %rdx
     229 	mov	%rax, %r11
     230 	mov	%rdx, %r13
     231 	imul	d, %rdx
     232 	sub	%rdx, %rbp
     233 	mov	d, %rax
     234 	add	%rbp, %rax
     235 	cmp	%r11, %rbp
     236 	cmovb	%rbp, %rax
     237 	adc	$-1, %r13
     238 	cmp	d, %rax
     239 	jae	L(93)
     240 L(69):	mov	%r13, (qp)
     241 	sub	$8, qp
     242 	jmp	L(87)
     243 
C Fixup for L(uloop): remainder still >= d.
     244 L(ufx):	sub	d, %rax
     245 	inc	%r13
     246 	jmp	L(uok)
     247 
C Fixup for L(ulast): remainder still >= d.
     248 L(93):	sub	d, %rax
     249 	inc	%r13
     250 	jmp	L(69)
     251 
C Fraction development: all further source limbs are implicitly zero, so
C keep rbp = -d and fold the q*d product with a single imul per limb.
     252 L(87):	mov	d, %rbp
     253 	neg	%rbp
     254 	jmp	L(87b)
     255 
     256 	ALIGN(16)
C Produce fn fraction limbs (loop runs while fn decremented stays >= 0).
     257 L(floop):				C		    cycK8  cycP6  cycP4
     258 	lea	1(%rax), %r11		C
     259 	mul	dinv			C		     0,12
     260 	add	%r11, %rdx		C		     5
     261 	mov	%rax, %r11		C		     4
     262 	mov	%rdx, %r13		C		     6
     263 	imul	%rbp, %rdx		C		     6
     264 	mov	d, %rax			C
     265 	add	%rdx, %rax		C		     10
     266 	cmp	%r11, %rdx		C		     10
     267 	cmovb	%rdx, %rax		C		     11
     268 	adc	$-1, %r13		C
     269 	mov	%r13, (qp)		C
     270 	sub	$8, qp			C
     271 L(87b):	dec	fn			C
     272 	jns	L(floop)		C
     273 
     274 	shr	%cl, %rax		C un-normalize the remainder (cnt is 0
C on the normalized path)
     275 L(ret):	pop	%rbx
     276 	pop	%rbp
     277 	pop	%r12
     278 	pop	%r13
     279 	ret
     280 EPILOGUE()
    281