Home | History | Annotate | Line # | Download | only in k8
      1 dnl  AMD64 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor,
      2 dnl  returning quotient only.
      3 
      4 dnl  Copyright 2001, 2002, 2004-2006, 2009, 2011, 2012, 2017 Free Software
      5 dnl  Foundation, Inc.
      6 
      7 dnl  This file is part of the GNU MP Library.
      8 dnl
      9 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     10 dnl  it under the terms of either:
     11 dnl
     12 dnl    * the GNU Lesser General Public License as published by the Free
     13 dnl      Software Foundation; either version 3 of the License, or (at your
     14 dnl      option) any later version.
     15 dnl
     16 dnl  or
     17 dnl
     18 dnl    * the GNU General Public License as published by the Free Software
     19 dnl      Foundation; either version 2 of the License, or (at your option) any
     20 dnl      later version.
     21 dnl
     22 dnl  or both in parallel, as here.
     23 dnl
     24 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     25 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     26 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     27 dnl  for more details.
     28 dnl
     29 dnl  You should have received copies of the GNU General Public License and the
     30 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     31 dnl  see https://www.gnu.org/licenses/.
     32 
     33 include(`../config.m4')
     34 
     35 C	    cycles/limb
     36 C	     norm/unorm
     37 C AMD K8,K9	10	+
     38 C AMD K10	10	+
     39 C AMD bull	13.7	-
     40 C AMD pile	13.7	+
     41 C AMD steam
     42 C AMD excavator
     43 C AMD bobcat	15	-
     44 C AMD jaguar	16	-
     45 C Intel P4	33	=
     46 C Intel core2	13.25	=
     47 C Intel NHM	14	=
     48 C Intel SBR	8.5	-
     49 C Intel IBR	8.5	-
     50 C Intel HWL	8	=
     51 C Intel BWL	8	=
     52 C Intel SKL	8	=
     53 C Intel atom	42	--
     54 C Intel SLM	20.4	--
     55 C VIA nano
     56 
     57 C INPUT PARAMETERS
     58 define(`rp',		`%rdi')		C	destination pointer, quotient limbs are stored through it
     59 define(`up',		`%rsi')		C	source pointer, dividend limbs are loaded through it
     60 define(`n',		`%rdx')		C	limb count; NOTE(review): n = 0 is not handled, presumably n >= 1
     61 define(`d',		`%rcx')		C	divisor limb
     62 define(`di',		`%r8')		C	just mpn_pi1_bdiv_q_1; multiplier applied to each limb, mpn_bdiv_q_1 computes it itself
     63 define(`ncnt',		`%r9')		C	just mpn_pi1_bdiv_q_1; right shift count, mpn_bdiv_q_1 computes it itself
     64 
     65 ABI_SUPPORT(DOS64)		C NOTE(review): presumably declares DOS64 as a supported ABI; macro is defined outside this file
     66 ABI_SUPPORT(STD64)		C NOTE(review): presumably the same for the standard 64-bit ABI
     67 
     68 ASM_START()
     69 	TEXT
     70 	ALIGN(16)		C alignment request for the first entry point below
     71 PROLOGUE(mpn_bdiv_q_1)		C computes ncnt and the 64-bit inverse of d here, then jumps into L(pi1)
     72 	FUNC_ENTRY(4)
     73 	push	%rbx			C saved here, popped on both exit paths after L(pi1)
     74 
     75 	mov	%rcx, %rax		C rax = d
     76 	xor	R32(%rcx), R32(%rcx)	C ncnt count
     77 	mov	%rdx, %r10		C r10 = n, as L(pi1) expects
     78 
     79 	bt	$0, R32(%rax)		C CF = low bit of d
     80 	jnc	L(evn)			C skip bsf unless divisor is even
     81 
     82 L(odd):	mov	%rax, %rbx		C rbx = d with any low zero bits already shifted out
     83 	shr	R32(%rax)
     84 	and	$127, R32(%rax)		C d/2, 7 bits
     85 
     86 	LEA(	binvert_limb_table, %rdx)	C rdx = table base; the table is defined outside this file
     87 
     88 	movzbl	(%rdx,%rax), R32(%rax)	C inv 8 bits
     89 
     90 	mov	%rbx, %r11		C d without twos
     91 C Three Newton steps follow, inv = 2*inv - inv*inv*d, going 8 -> 16 -> 32 -> 64 bits
     92 	lea	(%rax,%rax), R32(%rdx)	C 2*inv
     93 	imul	R32(%rax), R32(%rax)	C inv*inv
     94 	imul	R32(%rbx), R32(%rax)	C inv*inv*d
     95 	sub	R32(%rax), R32(%rdx)	C inv = 2*inv - inv*inv*d, 16 bits
     96 
     97 	lea	(%rdx,%rdx), R32(%rax)	C 2*inv
     98 	imul	R32(%rdx), R32(%rdx)	C inv*inv
     99 	imul	R32(%rbx), R32(%rdx)	C inv*inv*d
    100 	sub	R32(%rdx), R32(%rax)	C inv = 2*inv - inv*inv*d, 32 bits
    101 
    102 	lea	(%rax,%rax), %r8	C 2*inv
    103 	imul	%rax, %rax		C inv*inv
    104 	imul	%rbx, %rax		C inv*inv*d
    105 	sub	%rax, %r8		C inv = 2*inv - inv*inv*d, 64 bits
    106 
    107 	jmp	L(pi1)			C shared body: r8 = inverse, r10 = n, r11 = odd d, rcx = ncnt, rbx already pushed
    108 
    109 L(evn):	bsf	%rax, %rcx		C ncnt = index of lowest set bit of d; NOTE(review): presumably d != 0
    110 	shr	R8(%rcx), %rax		C shift out the low zero bits so the divisor is odd
    111 	jmp	L(odd)
    112 EPILOGUE()
    113 
    114 PROLOGUE(mpn_pi1_bdiv_q_1)	C entry taking di and ncnt from the caller; mpn_bdiv_q_1 joins at L(pi1)
    115 	FUNC_ENTRY(4)
    116 IFDOS(`	mov	56(%rsp), %r8	')	C under IFDOS only: di is loaded from the stack
    117 IFDOS(`	mov	64(%rsp), %r9	')	C under IFDOS only: ncnt is loaded from the stack
    118 	push	%rbx			C rbx holds the carry bit below; restored before each ret
    119 
    120 	mov	%rcx, %r11		C d
    121 	mov	%rdx, %r10		C n
    122 	mov	%r9, %rcx		C ncnt
    123 
    124 L(pi1):	mov	(up), %rax		C up[0]
    125 
    126 	dec	%r10
    127 	jz	L(one)			C n = 1: no limb above up[0] to shift in
    128 
    129 	mov	8(up), %rdx		C up[1]
    130 	lea	(up,%r10,8), up		C up end
    131 	lea	(rp,%r10,8), rp		C rp end
    132 	neg	%r10			C -(n-1), counts up towards zero
    133 
    134 	shrd	R8(%rcx), %rdx, %rax	C rax = up[0] shifted right by ncnt, top bits filled from up[1]
    135 
    136 	xor	R32(%rbx), R32(%rbx)	C carry bit starts at 0
    137 	jmp	L(ent)			C first limb has no carry to apply
    138 
    139 	ALIGN(8)
    140 L(top):
    141 	C rax	q
    142 	C rbx	carry bit, 0 or 1
    143 	C rcx	ncnt
    144 	C rdx
    145 	C r10	counter, limbs, negative
    146 	C r11	d
    147 
    148 	mul	%r11			C carry limb in rdx
    149 	mov	(up,%r10,8), %rax	C current source limb
    150 	mov	8(up,%r10,8), %r9	C next higher source limb, r9 is scratch here
    151 	shrd	R8(%rcx), %r9, %rax	C current limb shifted right by ncnt, top bits filled from r9
    152 	nop				C NOTE(review): no functional effect; presumably scheduling or alignment padding
    153 	sub	%rbx, %rax		C apply carry bit
    154 	setc	R8(%rbx)		C rbx = borrow out of that subtraction
    155 	sub	%rdx, %rax		C apply carry limb
    156 	adc	$0, R32(%rbx)		C add the borrow out of the second subtraction
    157 L(ent):	imul	%r8, %rax		C q = limb * inverse, low 64 bits
    158 	mov	%rax, (rp,%r10,8)	C store quotient limb
    159 	inc	%r10
    160 	jnz	L(top)
    161 C Loop done: r10 = 0, so (up) and (rp) address the last limb, which has no higher limb to shift in
    162 	mul	%r11			C carry limb in rdx
    163 	mov	(up), %rax		C up high limb
    164 	shr	R8(%rcx), %rax		C plain shift, zeros enter at the top
    165 	sub	%rbx, %rax		C apply carry bit
    166 	sub	%rdx, %rax		C apply carry limb
    167 	imul	%r8, %rax		C last quotient limb
    168 	mov	%rax, (rp)
    169 	pop	%rbx
    170 	FUNC_EXIT()
    171 	ret
    172 
    173 L(one):	shr	R8(%rcx), %rax		C single limb: shift up[0] right by ncnt
    174 	imul	%r8, %rax		C q = limb * inverse, low 64 bits
    175 	mov	%rax, (rp)
    176 	pop	%rbx
    177 	FUNC_EXIT()
    178 	ret
    179 EPILOGUE()
    180