Home | History | Annotate | Line # | Download | only in x86_64
sublsh1_n.asm revision 1.1.1.1.2.1
      1 dnl  AMD64 mpn_sublsh1_n -- rp[] = up[] - (vp[] << 1)
      2 
      3 dnl  Copyright 2003, 2005, 2006, 2007, 2011, 2012 Free Software Foundation,
      4 dnl  Inc.
      5 
      6 dnl  This file is part of the GNU MP Library.
      7 
      8 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      9 dnl  it under the terms of the GNU Lesser General Public License as published
     10 dnl  by the Free Software Foundation; either version 3 of the License, or (at
     11 dnl  your option) any later version.
     12 
     13 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     14 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     15 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
     16 dnl  License for more details.
     17 
     18 dnl  You should have received a copy of the GNU Lesser General Public License
     19 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
     20 
     21 include(`../config.m4')
     22 
     23 
     24 C	     cycles/limb
     25 C AMD K8,K9	 2.2
     26 C AMD K10	 2.2
     27 C Intel P4	12.75
     28 C Intel core2	 3.45
     29 C Intel corei	 ?
     30 C Intel atom	 ?
     31 C VIA nano	 3.25
     32 
     33 C Sometimes speed degenerates, supposedly because some operand alignments
     34 C cause cache conflicts.
     35 
     36 C The speed is limited by decoding/issue bandwidth.  The loop has 26
     37 C instructions for 4 limbs; at 3 instructions/cycle that is 26/3/4 = 2.167 c/l.
     38 
     39 C INPUT PARAMETERS
        C Symbolic names for the four argument registers.  The registers chosen
        C below (rdi, rsi, rdx, rcx) are the first four integer argument registers
        C of the System V AMD64 calling convention.
     40 define(`rp',`%rdi')		C result limbs (written)
     41 define(`up',`%rsi')		C minuend limbs (read)
     42 define(`vp',`%rdx')		C limbs that are doubled, then subtracted (read)
     43 define(`n', `%rcx')		C limb count; also used as the array index
     44 
        C ABI_SUPPORT is defined outside this file; presumably it declares the
        C calling conventions this code can be built for -- confirm.
     45 ABI_SUPPORT(DOS64)
     46 ABI_SUPPORT(STD64)
     47 
     48 ASM_START()
     49 	TEXT
     50 	ALIGN(16)
        C mpn_sublsh1_n (rp, up, vp, n)
        C
        C Computes rp[] = up[] - (vp[] << 1), working from the lowest limb upwards.
        C
        C Two flag chains have to survive across sections of code that need CF
        C for the other chain:
        C   scy  carry out of the one-bit left shift of vp[] (done as "adc x,x"),
        C        parked in eax
        C   acy  borrow out of the subtraction, parked in ebp
        C Each is parked as 0 or -1 with "sbb r,r" and put back into CF with
        C "add r,r" (-1 + -1 carries out, 0 + 0 does not).
        C
        C The first n mod 4 limbs are handled by L(b01), L(b10) or L(b11); the
        C loop at L(top) then handles four limbs per iteration.
        C
        C Return value (eax): scy + acy, that is 0, 1 or 2.
        C
        C R32(), FUNC_ENTRY() and FUNC_EXIT() are macros defined outside this file;
        C R32(reg) appears to name the 32-bit form of reg -- confirm.
        C
        C NOTE(review): the first load from (vp) is unconditional, and n = 0 would
        C enter the four-limb loop body through L(b00), so this presumably requires
        C n >= 1 -- confirm against the documented mpn contract.
     51 PROLOGUE(mpn_sublsh1_n)
        C FUNC_ENTRY presumably adapts DOS64 arguments to the registers named by
        C rp/up/vp/n -- confirm.
     52 	FUNC_ENTRY(4)
     53 	push	%rbx			C rbx and rbp are used as scratch below,
     54 	push	%rbp			C so save them; restored at L(end)
     55 
     56 	mov	(vp), %r8		C preload lowest vp limb before vp is advanced
     57 	mov	R32(n), R32(%rax)	C copy of n, reduced to n mod 4 below
     58 	lea	(rp,n,8), rp		C point rp, up and vp just past their
     59 	lea	(up,n,8), up		C last limb ...
     60 	lea	(vp,n,8), vp
     61 	neg	n			C ... and index with -n, counting up towards 0
     62 	xor	R32(%rbp), R32(%rbp)	C acy = 0
     63 	and	$3, R32(%rax)		C eax = n mod 4; also leaves CF = 0 for L(b00)
     64 	je	L(b00)			C n mod 4 = 0: enter loop with CF = 0, acy = 0
     65 	cmp	$2, R32(%rax)
     66 	jc	L(b01)			C n mod 4 = 1
     67 	je	L(b10)			C n mod 4 = 2
     68 
        C n mod 4 = 3: handle three limbs, then join at L(ent).
     69 L(b11):	add	%r8, %r8		C r8 = lowest vp limb << 1, CF = bit shifted out
     70 	mov	8(vp,n,8), %r9
     71 	adc	%r9, %r9		C shift next limb, pulling in the bit from below
     72 	mov	16(vp,n,8), %r10
     73 	adc	%r10, %r10
     74 	sbb	R32(%rax), R32(%rax)	C save scy
     75 	mov	(up,n,8), %rbp
     76 	mov	8(up,n,8), %rbx
     77 	sub	%r8, %rbp		C lowest limb: plain sub, no incoming borrow
     78 	sbb	%r9, %rbx
     79 	mov	%rbp, (rp,n,8)
     80 	mov	%rbx, 8(rp,n,8)
     81 	mov	16(up,n,8), %rbp	C mov leaves CF alone, borrow chain continues
     82 	sbb	%r10, %rbp
     83 	mov	%rbp, 16(rp,n,8)
     84 	sbb	R32(%rbp), R32(%rbp)	C save acy
     85 	add	$3, n			C three limbs done; sets SF tested at L(ent)
     86 	jmp	L(ent)
     87 
        C n mod 4 = 2: handle two limbs, then join at L(ent).
     88 L(b10):	add	%r8, %r8		C r8 = lowest vp limb << 1, CF = bit shifted out
     89 	mov	8(vp,n,8), %r9
     90 	adc	%r9, %r9
     91 	sbb	R32(%rax), R32(%rax)	C save scy
     92 	mov	(up,n,8), %rbp
     93 	mov	8(up,n,8), %rbx
     94 	sub	%r8, %rbp		C lowest limb: plain sub, no incoming borrow
     95 	sbb	%r9, %rbx
     96 	mov	%rbp, (rp,n,8)
     97 	mov	%rbx, 8(rp,n,8)
     98 	sbb	R32(%rbp), R32(%rbp)	C save acy
     99 	add	$2, n			C two limbs done; sets SF tested at L(ent)
    100 	jmp	L(ent)
    101 
        C n mod 4 = 1: handle one limb, then fall into L(ent).
    102 L(b01):	add	%r8, %r8		C r8 = lowest vp limb << 1, CF = bit shifted out
    103 	sbb	R32(%rax), R32(%rax)	C save scy
    104 	mov	(up,n,8), %rbp
    105 	sub	%r8, %rbp		C lowest limb: plain sub, no incoming borrow
    106 	mov	%rbp, (rp,n,8)
    107 	sbb	R32(%rbp), R32(%rbp)	C save acy
    108 	inc	n			C one limb done; sets SF tested at L(ent)
    109 L(ent):	jns	L(end)			C n no longer negative: no limbs remain
    110 
        C Main loop: four limbs per iteration.
    111 	ALIGN(16)
    112 L(top):	add	R32(%rax), R32(%rax)	C restore scy
    113 
    114 	mov	(vp,n,8), %r8
        C L(b00) is also the entry point for n mod 4 = 0: r8 was preloaded from
        C (vp) and CF is 0.
    115 L(b00):	adc	%r8, %r8
    116 	mov	8(vp,n,8), %r9
    117 	adc	%r9, %r9
    118 	mov	16(vp,n,8), %r10
    119 	adc	%r10, %r10
    120 	mov	24(vp,n,8), %r11
    121 	adc	%r11, %r11
    122 
    123 	sbb	R32(%rax), R32(%rax)	C save scy
    124 	add	R32(%rbp), R32(%rbp)	C restore acy; rbp is reused for data below
    125 
    126 	mov	(up,n,8), %rbp
    127 	mov	8(up,n,8), %rbx
    128 	sbb	%r8, %rbp
    129 	sbb	%r9, %rbx
    130 	mov	%rbp, (rp,n,8)
    131 	mov	%rbx, 8(rp,n,8)
    132 	mov	16(up,n,8), %rbp
    133 	mov	24(up,n,8), %rbx
    134 	sbb	%r10, %rbp
    135 	sbb	%r11, %rbx
    136 	mov	%rbp, 16(rp,n,8)
    137 	mov	%rbx, 24(rp,n,8)
    138 
    139 	sbb	R32(%rbp), R32(%rbp)	C save acy
    140 	add	$4, n
    141 	js	L(top)			C loop while the index is still negative
    142 
        C eax holds -scy and ebp holds -acy (each 0 or -1).
    143 L(end):	add	R32(%rbp), R32(%rax)	C eax = -(scy + acy)
    144 	neg	R32(%rax)		C return scy + acy
    145 
    146 	pop	%rbp
    147 	pop	%rbx
    148 	FUNC_EXIT()
    149 	ret
    150 EPILOGUE()
    151