Home | History | Annotate | Line # | Download | only in x86_64
      1 dnl  AMD64 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
      2 dnl  AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[]
      3 
      4 dnl  Copyright 2003, 2005-2009, 2011, 2012 Free Software Foundation, Inc.
      5 
      6 dnl  This file is part of the GNU MP Library.
      7 dnl
      8 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      9 dnl  it under the terms of either:
     10 dnl
     11 dnl    * the GNU Lesser General Public License as published by the Free
     12 dnl      Software Foundation; either version 3 of the License, or (at your
     13 dnl      option) any later version.
     14 dnl
     15 dnl  or
     16 dnl
     17 dnl    * the GNU General Public License as published by the Free Software
     18 dnl      Foundation; either version 2 of the License, or (at your option) any
     19 dnl      later version.
     20 dnl
     21 dnl  or both in parallel, as here.
     22 dnl
     23 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     24 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     25 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     26 dnl  for more details.
     27 dnl
     28 dnl  You should have received copies of the GNU General Public License and the
     29 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     30 dnl  see https://www.gnu.org/licenses/.
     31 
     32 include(`../config.m4')
     33 
     34 
     35 C	     cycles/limb
     36 C AMD K8,K9	 2
     37 C AMD K10	 2
     38 C AMD bd1	 ?
     39 C AMD bobcat	 ?
     40 C Intel P4	 13
     41 C Intel core2	 3.45
     42 C Intel NHM	 ?
     43 C Intel SBR	 ?
     44 C Intel atom	 ?
     45 C VIA nano	 ?
     46 
     47 
     48 C Sometimes speed degenerates, supposedly related to that some operand
     49 C alignments cause cache conflicts.
     50 
     51 C The speed is limited by decoding/issue bandwidth.  There are 22 instructions
     52 C in the loop, which corresponds to (22/3)/4 = 1.83 c/l (ceil(22/3)/4 = 2 c/l).
     53 
     54 C INPUT PARAMETERS
        C Symbolic names for the four argument registers.  The code below stores
        C results through rp, reads the addend/subtrahend through up, reads the
        C operand that gets doubled through vp, and uses n as the limb count
        C (limbs are 8 bytes, see the scale factor in the addressing modes).
     55 define(`rp',`%rdi')
     56 define(`up',`%rsi')
     57 define(`vp',`%rdx')
     58 define(`n', `%rcx')
     59 
        C Operation selection.  Exactly one OPERATION_* symbol is expected to be
        C defined by the build (TODO confirm against the build machinery):
        C   ADDSUB  instruction that starts the add/subtract carry chain
        C   ADCSBB  instruction that continues that chain
        C   func    name of the generated entry point
     60 ifdef(`OPERATION_addlsh1_n', `
     61   define(ADDSUB,	add)
     62   define(ADCSBB,	adc)
     63   define(func,		mpn_addlsh1_n)')
     64 ifdef(`OPERATION_rsblsh1_n', `
     65   define(ADDSUB,	sub)
     66   define(ADCSBB,	sbb)
     67   define(func,		mpn_rsblsh1_n)')
     68 
        C NOTE(review): MULFUNC_PROLOGUE, ABI_SUPPORT, ASM_START, TEXT and ALIGN are
        C macros defined outside this file (via config.m4).  Presumably they
        C announce the two entry points this file can produce, declare the
        C supported calling conventions, and open a 16-byte aligned code section;
        C confirm against their definitions.
     69 MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_rsblsh1_n)
     70 
     71 ABI_SUPPORT(DOS64)
     72 ABI_SUPPORT(STD64)
     73 
     74 ASM_START()
     75 	TEXT
     76 	ALIGN(16)
     77 PROLOGUE(func)
        C func(rp, up, vp, n) -- n-limb operation on 8-byte limbs:
        C   mpn_addlsh1_n:  rp[] = up[] + (vp[] << 1)
        C                   returns shift carry + add carry, i.e. 0, 1 or 2
        C   mpn_rsblsh1_n:  rp[] = (vp[] << 1) - up[]
        C                   returns shift carry - borrow, i.e. -1, 0 or 1,
        C                   sign-extended to 64 bits
        C
        C Register roles:
        C   rp, up, vp  advanced to one past the last limb, indexed with n
        C   n           negated limb count, stepped up towards zero
        C   %r8-%r11    limbs in flight
        C   %rax        0 or -1, parked carry of the shift chain ("scy")
        C   %rbp        0 or -1, parked carry/borrow of the add/sub chain ("acy")
        C
        C Two carry chains share the one CF flag.  Each chain is parked with
        C "sbb r,r" (r = -CF) and resumed with "add r,r" (CF set iff r was -1).
        C
        C The first limb is loaded unconditionally and a count that is a multiple
        C of four enters the 4-limb loop body directly, so n = 0 is not handled
        C here; callers presumably guarantee n >= 1 (TODO confirm).
        C
        C NOTE(review): FUNC_ENTRY/FUNC_EXIT are defined outside this file;
        C presumably they adapt the DOS64 argument registers -- confirm.
     78 	FUNC_ENTRY(4)
     79 	push	%rbp			C callee-saved, used below for acy
     80 
     81 	mov	(vp), %r8		C preload vp[0] before vp is advanced
     82 	mov	R32(n), R32(%rax)
     83 	lea	(rp,n,8), rp		C point all three operands past their ends
     84 	lea	(up,n,8), up
     85 	lea	(vp,n,8), vp
     86 	neg	n			C index runs from -n up to 0
     87 	xor	R32(%rbp), R32(%rbp)	C acy = 0
     88 	and	$3, R32(%rax)		C n mod 4; also leaves CF clear for b00
     89 	je	L(b00)			C n mod 4 = 0
     90 	cmp	$2, R32(%rax)
     91 	jc	L(b01)			C n mod 4 = 1
     92 	je	L(b10)			C n mod 4 = 2
     93 
        C n mod 4 = 3: handle three limbs, then join the loop.
     94 L(b11):	add	%r8, %r8
     95 	mov	8(vp,n,8), %r9
     96 	adc	%r9, %r9
     97 	mov	16(vp,n,8), %r10
     98 	adc	%r10, %r10
     99 	sbb	R32(%rax), R32(%rax)	C save scy
    100 	ADDSUB	(up,n,8), %r8
    101 	ADCSBB	8(up,n,8), %r9
    102 	mov	%r8, (rp,n,8)
    103 	mov	%r9, 8(rp,n,8)
    104 	ADCSBB	16(up,n,8), %r10
    105 	mov	%r10, 16(rp,n,8)
    106 	sbb	R32(%rbp), R32(%rbp)	C save acy
    107 	add	$3, n
    108 	jmp	L(ent)			C jmp keeps the flags from the add above
    109 
        C n mod 4 = 2: handle two limbs, then join the loop.
    110 L(b10):	add	%r8, %r8
    111 	mov	8(vp,n,8), %r9
    112 	adc	%r9, %r9
    113 	sbb	R32(%rax), R32(%rax)	C save scy
    114 	ADDSUB	(up,n,8), %r8
    115 	ADCSBB	8(up,n,8), %r9
    116 	mov	%r8, (rp,n,8)
    117 	mov	%r9, 8(rp,n,8)
    118 	sbb	R32(%rbp), R32(%rbp)	C save acy
    119 	add	$2, n
    120 	jmp	L(ent)			C jmp keeps the flags from the add above
    121 
        C n mod 4 = 1: handle one limb, then fall into the loop test.
    122 L(b01):	add	%r8, %r8
    123 	sbb	R32(%rax), R32(%rax)	C save scy
    124 	ADDSUB	(up,n,8), %r8
    125 	mov	%r8, (rp,n,8)
    126 	sbb	R32(%rbp), R32(%rbp)	C save acy
    127 	inc	n
    128 L(ent):	jns	L(end)			C index reached zero: no full groups left
    129 
        C Main loop, four limbs per iteration.  L(b00) is a second entry point
        C used when n mod 4 = 0: %r8 already holds vp[0], CF is clear and acy = 0.
    130 	ALIGN(16)
    131 L(top):	add	R32(%rax), R32(%rax)	C restore scy
    132 
    133 	mov	(vp,n,8), %r8
    134 L(b00):	adc	%r8, %r8
    135 	mov	8(vp,n,8), %r9
    136 	adc	%r9, %r9
    137 	mov	16(vp,n,8), %r10
    138 	adc	%r10, %r10
    139 	mov	24(vp,n,8), %r11
    140 	adc	%r11, %r11
    141 
    142 	sbb	R32(%rax), R32(%rax)	C save scy
    143 	add	R32(%rbp), R32(%rbp)	C restore acy
    144 
    145 	ADCSBB	(up,n,8), %r8
    146 	nop				C Hammer speedup!
    147 	ADCSBB	8(up,n,8), %r9
    148 	mov	%r8, (rp,n,8)
    149 	mov	%r9, 8(rp,n,8)
    150 	ADCSBB	16(up,n,8), %r10
    151 	ADCSBB	24(up,n,8), %r11
    152 	mov	%r10, 16(rp,n,8)
    153 	mov	%r11, 24(rp,n,8)
    154 
    155 	sbb	R32(%rbp), R32(%rbp)	C save acy
    156 	add	$4, n
    157 	js	L(top)			C loop while the index is still negative
    158 
        C Return value.  On arrival %rax = -scy and %rbp = -acy (low 32 bits).
        C addlsh1: -(-scy + -acy) = scy + acy; the 32-bit neg zero-extends.
    159 L(end):
    160 ifdef(`OPERATION_addlsh1_n',`
    161 	add	R32(%rbp), R32(%rax)
    162 	neg	R32(%rax)')
        C rsblsh1: (-acy) - (-scy) = scy - acy, may be -1, hence the sign extension.
    163 ifdef(`OPERATION_rsblsh1_n',`
    164 	sub	R32(%rax), R32(%rbp)
    165 	movslq	R32(%rbp), %rax')
    166 
    167 	pop	%rbp
    168 	FUNC_EXIT()
    169 	ret
    170 EPILOGUE()
    171