Home | History | Annotate | Line # | Download | only in x86_64
sublsh1_n.asm revision 1.1.1.1
      1 dnl  AMD64 mpn_sublsh1_n -- rp[] = up[] - (vp[] << 1)
      2 
      3 dnl  Copyright 2003, 2005, 2006, 2007 Free Software Foundation, Inc.
      4 
      5 dnl  This file is part of the GNU MP Library.
      6 
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8 dnl  it under the terms of the GNU Lesser General Public License as published
      9 dnl  by the Free Software Foundation; either version 3 of the License, or (at
     10 dnl  your option) any later version.
     11 
     12 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     13 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     14 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
     15 dnl  License for more details.
     16 
     17 dnl  You should have received a copy of the GNU Lesser General Public License
     18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
     19 
     20 include(`../config.m4')
     21 
     22 
     23 C	     cycles/limb
     24 C K8,K9:	 2.2
     25 C K10:		 2.2
     26 C P4:		12.75
     27 C P6 core2: 	 3.45
     28 C P6 corei7:	 3.45
     29 C P6 atom:	 ?
     30 
     31 
     32 C Speed sometimes degrades, supposedly because certain operand alignments
     33 C cause cache conflicts.
     34 
     35 C The speed is limited by decoding/issue bandwidth.  There are 26 instructions
     36 C in the loop, which corresponds to 26/3/4 = 2.167 c/l.
     37 
     38 C INPUT PARAMETERS
        C Each define gives a symbolic name to the register holding one argument.
        C The operation is rp[] = up[] - (vp[] << 1), so:
        C   rp  pointer to the result limbs (written)
        C   up  pointer to the limbs that are subtracted from (read)
        C   vp  pointer to the limbs that are shifted left one bit and then
        C       subtracted (read)
        C   n   number of limbs; the code scales it by 8 when addressing
        C rdi, rsi, rdx, rcx are the first four integer argument registers of the
        C System V AMD64 calling convention, in that order.
     39 define(`rp',`%rdi')
     40 define(`up',`%rsi')
     41 define(`vp',`%rdx')
     42 define(`n', `%rcx')
     43 
     44 ASM_START()
     45 	TEXT
     46 	ALIGN(16)
        C mpn_sublsh1_n (rp, up, vp, n)
        C
        C Stores up[] - (vp[] << 1) into rp[], working on n limbs of 8 bytes each
        C from limb index 0 upward.  The value returned in eax is the bit shifted
        C out of the last vp limb plus the borrow out of the last subtraction, so
        C it is 0, 1 or 2.
        C
        C Two carry chains share the single carry flag:
        C   scy  carry of the doubling chain (add/adc of a register with itself)
        C   acy  borrow of the subtraction chain (sub/sbb)
        C Each is parked in a register as 0 or -1 with "sbb reg,reg" and put back
        C into CF with "add reg,reg", which carries out exactly when reg is -1.
        C
        C Register use:
        C   rax      n mod 4 during dispatch, then saved scy, finally the result
        C   rbp      saved acy; between restore and save also a scratch limb
        C   rbx      scratch limb
        C   r8-r11   vp limbs, doubled in place
        C   n        negated into a negative limb index that counts up to zero
        C rbx and rbp are pushed on entry and popped again before ret.
        C
        C The first n mod 4 limbs are handled by L(b01), L(b10) or L(b11); the
        C loop at L(top) then handles four limbs per pass.
        C NOTE(review): n = 0 would dispatch to L(b00) and process four limbs, so
        C callers presumably guarantee n >= 1 -- confirm.
     47 PROLOGUE(mpn_sublsh1_n)
     48 	push	%rbx
     49 	push	%rbp
     50 
     51 	mov	(vp), %r8		C r8 = vp[0], read before vp is advanced
     52 	mov	R32(n), R32(%rax)	C low bits of n, for n mod 4 below
        C Advance rp, up, vp to just past their last limb; together with the
        C negated n, (ptr,n,8) then addresses limb 0 and n counts up to zero.
     53 	lea	(rp,n,8), rp
     54 	lea	(up,n,8), up
     55 	lea	(vp,n,8), vp
     56 	neg	n
     57 	xor	R32(%rbp), R32(%rbp)	C acy = 0 for the L(b00) entry
     58 	and	$3, R32(%rax)		C eax = n mod 4; the and leaves CF = 0
     59 	je	L(b00)			C n mod 4 = 0
     60 	cmp	$2, R32(%rax)
     61 	jc	L(b01)			C n mod 4 = 1
     62 	je	L(b10)			C n mod 4 = 2, otherwise fall into L(b11)
     63 
        C n mod 4 = 3: process limbs 0..2 ahead of the loop.
     64 L(b11):	add	%r8, %r8		C double limb 0; CF = bit shifted out
     65 	mov	8(vp,n,8), %r9
     66 	adc	%r9, %r9
     67 	mov	16(vp,n,8), %r10
     68 	adc	%r10, %r10
     69 	sbb	R32(%rax), R32(%rax)	C save scy
     70 	mov	(up,n,8), %rbp
     71 	mov	8(up,n,8), %rbx
     72 	sub	%r8, %rbp		C lowest limb: no incoming borrow
     73 	sbb	%r9, %rbx
     74 	mov	%rbp, (rp,n,8)
     75 	mov	%rbx, 8(rp,n,8)
     76 	mov	16(up,n,8), %rbp
     77 	sbb	%r10, %rbp
     78 	mov	%rbp, 16(rp,n,8)
     79 	sbb	R32(%rbp), R32(%rbp)	C save acy
     80 	add	$3, n			C step index; sign flag is tested at L(ent)
     81 	jmp	L(ent)
     82 
        C n mod 4 = 2: process limbs 0..1 ahead of the loop.
     83 L(b10):	add	%r8, %r8		C double limb 0; CF = bit shifted out
     84 	mov	8(vp,n,8), %r9
     85 	adc	%r9, %r9
     86 	sbb	R32(%rax), R32(%rax)	C save scy
     87 	mov	(up,n,8), %rbp
     88 	mov	8(up,n,8), %rbx
     89 	sub	%r8, %rbp		C lowest limb: no incoming borrow
     90 	sbb	%r9, %rbx
     91 	mov	%rbp, (rp,n,8)
     92 	mov	%rbx, 8(rp,n,8)
     93 	sbb	R32(%rbp), R32(%rbp)	C save acy
     94 	add	$2, n			C step index; sign flag is tested at L(ent)
     95 	jmp	L(ent)
     96 
        C n mod 4 = 1: process limb 0 ahead of the loop.
     97 L(b01):	add	%r8, %r8		C double limb 0; CF = bit shifted out
     98 	sbb	R32(%rax), R32(%rax)	C save scy
     99 	mov	(up,n,8), %rbp
    100 	sub	%r8, %rbp		C lowest limb: no incoming borrow
    101 	mov	%rbp, (rp,n,8)
    102 	sbb	R32(%rbp), R32(%rbp)	C save acy
    103 	inc	n			C step index; sign flag is tested at L(ent)
    104 L(ent):	jns	L(end)			C index no longer negative: no limbs left
    105 
        C Main loop, four limbs per pass.  L(b00) is the entry used when
        C n mod 4 = 0: r8 already holds vp[0], CF is clear and ebp is zero, so the
        C doubling chain and the borrow restore below both start with no carry.
    106 	ALIGN(16)
    107 L(top):	add	R32(%rax), R32(%rax)	C restore scy
    108 
    109 	mov	(vp,n,8), %r8
    110 L(b00):	adc	%r8, %r8
    111 	mov	8(vp,n,8), %r9
    112 	adc	%r9, %r9
    113 	mov	16(vp,n,8), %r10
    114 	adc	%r10, %r10
    115 	mov	24(vp,n,8), %r11
    116 	adc	%r11, %r11
    117 
    118 	sbb	R32(%rax), R32(%rax)	C save scy
    119 	add	R32(%rbp), R32(%rbp)	C restore acy
    120 
        C The loads and stores below are plain mov and leave CF alone, so the
        C borrow runs unbroken through the four sbb instructions.  rbp is reused
        C as a scratch limb here and receives acy again at the end of the pass.
    121 	mov	(up,n,8), %rbp
    122 	mov	8(up,n,8), %rbx
    123 	sbb	%r8, %rbp
    124 	sbb	%r9, %rbx
    125 	mov	%rbp, (rp,n,8)
    126 	mov	%rbx, 8(rp,n,8)
    127 	mov	16(up,n,8), %rbp
    128 	mov	24(up,n,8), %rbx
    129 	sbb	%r10, %rbp
    130 	sbb	%r11, %rbx
    131 	mov	%rbp, 16(rp,n,8)
    132 	mov	%rbx, 24(rp,n,8)
    133 
    134 	sbb	R32(%rbp), R32(%rbp)	C save acy
    135 	add	$4, n
    136 	js	L(top)			C loop while the index is still negative
    137 
        C eax and ebp each hold 0 or -1 here, so the sum is -(scy + acy) and the
        C neg turns it into the return value scy + acy.
    138 L(end):	add	R32(%rbp), R32(%rax)
    139 	neg	R32(%rax)
    140 
    141 	pop	%rbp
    142 	pop	%rbx
    143 	ret
    144 EPILOGUE()
    145