Home | History | Annotate | Line # | Download | only in x86_64
      1 dnl  x86-64 mpn_div_qr_1n_pi1
      2 dnl  -- Divide an mpn number by a normalized single-limb number,
      3 dnl     using a single-limb inverse.
      4 
      5 dnl  Contributed to the GNU project by Niels Mller
      6 
      7 dnl  Copyright 2013 Free Software Foundation, Inc.
      8 
      9 dnl  This file is part of the GNU MP Library.
     10 dnl
     11 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     12 dnl  it under the terms of either:
     13 dnl
     14 dnl    * the GNU Lesser General Public License as published by the Free
     15 dnl      Software Foundation; either version 3 of the License, or (at your
     16 dnl      option) any later version.
     17 dnl
     18 dnl  or
     19 dnl
     20 dnl    * the GNU General Public License as published by the Free Software
     21 dnl      Foundation; either version 2 of the License, or (at your option) any
     22 dnl      later version.
     23 dnl
     24 dnl  or both in parallel, as here.
     25 dnl
     26 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     27 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     28 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     29 dnl  for more details.
     30 dnl
     31 dnl  You should have received copies of the GNU General Public License and the
     32 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     33 dnl  see https://www.gnu.org/licenses/.
     34 
     35 include(`../config.m4')
     36 
     37 
     38 C		c/l
     39 C AMD K8,K9	13
     40 C AMD K10	13
     41 C AMD bull	16.5
     42 C AMD pile	15
     43 C AMD steam	 ?
     44 C AMD bobcat	16
     45 C AMD jaguar	 ?
     46 C Intel P4	47	poor
     47 C Intel core	19.25
     48 C Intel NHM	18
     49 C Intel SBR	15	poor
     50 C Intel IBR	13
     51 C Intel HWL	11.7
     52 C Intel BWL	 ?
     53 C Intel atom	52	very poor
     54 C VIA nano	19
     55 
     56 
     57 C INPUT Parameters
     58 define(`QP', `%rdi')
     59 define(`UP', `%rsi')
     60 define(`UN_INPUT', `%rdx')
     61 define(`U1', `%rcx')	C Also in %rax
     62 define(`D', `%r8')
     63 define(`DINV', `%r9')
     64 
     65 C Invariants
     66 define(`B2', `%rbp')
     67 define(`B2md', `%rbx')
     68 
     69 C Variables
     70 define(`UN', `%r8')	C Overlaps D input
     71 define(`T', `%r10')
     72 define(`U0', `%r11')
     73 define(`U2', `%r12')
     74 define(`Q0', `%r13')
     75 define(`Q1', `%r14')
     76 define(`Q2', `%r15')
     77 
     78 ABI_SUPPORT(STD64)
     79 
     80 	ASM_START()
     81 	TEXT
     82 	ALIGN(16)
     83 PROLOGUE(mpn_div_qr_1n_pi1)
     84 	FUNC_ENTRY(4)
     85 IFDOS(`	mov	56(%rsp), %r8	')
     86 IFDOS(`	mov	64(%rsp), %r9	')
     87 	dec	UN_INPUT
     88 	jnz	L(first)
     89 
     90 	C Just a single 2/1 division.
     91 	C T, U0 are allocated in scratch registers
     92 	lea	1(U1), T
     93 	mov	U1, %rax
     94 	mul	DINV
     95 	mov	(UP), U0
     96 	add	U0, %rax
     97 	adc	T, %rdx
     98 	mov	%rdx, T
     99 	imul	D, %rdx
    100 	sub	%rdx, U0
    101 	cmp	U0, %rax
    102 	lea	(U0, D), %rax
    103 	cmovnc	U0, %rax
    104 	sbb	$0, T
    105 	cmp	D, %rax
    106 	jc	L(single_div_done)
    107 	sub	D, %rax
    108 	add	$1, T
    109 L(single_div_done):
    110 	mov	T, (QP)
    111 	FUNC_EXIT()
    112 	ret
    113 L(first):
    114 	C FIXME: Could delay some of these until we enter the loop.
    115 	push	%r15
    116 	push	%r14
    117 	push	%r13
    118 	push	%r12
    119 	push	%rbx
    120 	push	%rbp
    121 
    122 	mov	D, B2
    123 	imul	DINV, B2
    124 	neg	B2
    125 	mov	B2, B2md
    126 	sub	D, B2md
    127 
    128 	C D not needed until final reduction
    129 	push	D
    130 	mov	UN_INPUT, UN	C Clobbers D
    131 
    132 	mov	DINV, %rax
    133 	mul	U1
    134 	mov	%rax, Q0
    135 	add	U1, %rdx
    136 	mov	%rdx, T
    137 
    138 	mov	B2, %rax
    139 	mul	U1
    140 	mov	-8(UP, UN, 8), U0
    141 	mov	(UP, UN, 8), U1
    142 	mov	T, (QP, UN, 8)
    143 	add	%rax, U0
    144 	adc	%rdx, U1
    145 	sbb	U2, U2
    146 	dec	UN
    147 	mov	U1, %rax
    148 	jz	L(final)
    149 
    150 	ALIGN(16)
    151 
    152 	C Loop is 28 instructions, 30 decoder slots, should run in 10 cycles.
    153 	C At entry, %rax holds an extra copy of U1
    154 L(loop):
    155 	C {Q2, Q1, Q0} <-- DINV * U1 + B (Q0 + U2 DINV) + B^2 U2
    156 	C Remains to add in B (U1 + c)
    157 	mov	DINV, Q1
    158 	mov	U2, Q2
    159 	and	U2, Q1
    160 	neg	Q2
    161 	mul	DINV
    162 	add	%rdx, Q1
    163 	adc	$0, Q2
    164 	add	Q0, Q1
    165 	mov	%rax, Q0
    166 	mov	B2, %rax
    167 	lea	(B2md, U0), T
    168 	adc	$0, Q2
    169 
    170 	C {U2, U1, U0} <-- (U0 + U2 B2 -c U) B + U1 B2 + u
    171 	mul	U1
    172 	and	B2, U2
    173 	add	U2, U0
    174 	cmovnc	U0, T
    175 
    176 	C {QP+UN, ...} <-- {QP+UN, ...} + {Q2, Q1} + U1 + c
    177 	adc	U1, Q1
    178 	mov	-8(UP, UN, 8), U0
    179 	adc	Q2, 8(QP, UN, 8)
    180 	jc	L(q_incr)
    181 L(q_incr_done):
    182 	add	%rax, U0
    183 	mov	T, %rax
    184 	adc	%rdx, %rax
    185 	mov	Q1, (QP, UN, 8)
    186 	sbb	U2, U2
    187 	dec	UN
    188 	mov	%rax, U1
    189 	jnz	L(loop)
    190 
    191 L(final):
    192 	pop	D
    193 
    194 	mov	U2, Q1
    195 	and	D, U2
    196 	sub	U2, %rax
    197 	neg	Q1
    198 
    199 	mov	%rax, U1
    200 	sub	D, %rax
    201 	cmovc	U1, %rax
    202 	sbb	$-1, Q1
    203 
    204 	lea	1(%rax), T
    205 	mul	DINV
    206 	add	U0, %rax
    207 	adc	T, %rdx
    208 	mov	%rdx, T
    209 	imul	D, %rdx
    210 	sub	%rdx, U0
    211 	cmp	U0, %rax
    212 	lea	(U0, D), %rax
    213 	cmovnc	U0, %rax
    214 	sbb	$0, T
    215 	cmp	D, %rax
    216 	jc	L(div_done)
    217 	sub	D, %rax
    218 	add	$1, T
    219 L(div_done):
    220 	add	T, Q0
    221 	mov	Q0, (QP)
    222 	adc	Q1, 8(QP)
    223 	jnc	L(done)
    224 L(final_q_incr):
    225 	addq	$1, 16(QP)
    226 	lea	8(QP), QP
    227 	jc	L(final_q_incr)
    228 
    229 L(done):
    230 	pop	%rbp
    231 	pop	%rbx
    232 	pop	%r12
    233 	pop	%r13
    234 	pop	%r14
    235 	pop	%r15
    236 	FUNC_EXIT()
    237 	ret
    238 
    239 L(q_incr):
    240 	C U1 is not live, so use it for indexing
    241 	lea	16(QP, UN, 8), U1
    242 L(q_incr_loop):
    243 	addq	$1, (U1)
    244 	jnc	L(q_incr_done)
    245 	lea	8(U1), U1
    246 	jmp	L(q_incr_loop)
    247 EPILOGUE()
    248