Home | History | Annotate | Line # | Download | only in x86_64
      1 dnl  AMD64 mpn_sublsh1_n -- rp[] = up[] - (vp[] << 1)
      2 
      3 dnl  Copyright 2003, 2005-2007, 2011, 2012 Free Software Foundation, Inc.
      4 
      5 dnl  This file is part of the GNU MP Library.
      6 dnl
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8 dnl  it under the terms of either:
      9 dnl
     10 dnl    * the GNU Lesser General Public License as published by the Free
     11 dnl      Software Foundation; either version 3 of the License, or (at your
     12 dnl      option) any later version.
     13 dnl
     14 dnl  or
     15 dnl
     16 dnl    * the GNU General Public License as published by the Free Software
     17 dnl      Foundation; either version 2 of the License, or (at your option) any
     18 dnl      later version.
     19 dnl
     20 dnl  or both in parallel, as here.
     21 dnl
     22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25 dnl  for more details.
     26 dnl
     27 dnl  You should have received copies of the GNU General Public License and the
     28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29 dnl  see https://www.gnu.org/licenses/.
     30 
     31 include(`../config.m4')
     32 
     33 
     34 C	     cycles/limb
     35 C AMD K8,K9	 2.2
     36 C AMD K10	 2.2
     37 C Intel P4	12.75
     38 C Intel core2	 3.45
     39 C Intel corei	 ?
     40 C Intel atom	 ?
     41 C VIA nano	 3.25
     42 
     43 C Sometimes speed degenerates, supposedly because some operand alignments
     44 C cause cache conflicts.
     45 
     46 C The speed is limited by decoding/issue bandwidth.  There are 26 instructions
     47 C in the loop for 4 limbs; at 3 instructions/cycle that is 26/3/4 = 2.167 c/l.
     48 
     49 C INPUT PARAMETERS
     50 define(`rp',`%rdi')	C result limb pointer
     51 define(`up',`%rsi')	C limb pointer of the operand subtracted from
     52 define(`vp',`%rdx')	C limb pointer of the operand that is doubled and subtracted
     53 define(`n', `%rcx')	C limb count (used as an 8-byte-scaled index)
     54 
        C NOTE(review): ABI_SUPPORT is defined outside this file; presumably these
        C lines declare that the code is usable under both calling conventions.
     55 ABI_SUPPORT(DOS64)
     56 ABI_SUPPORT(STD64)
     57 
        C NOTE(review): ASM_START, TEXT and ALIGN come from the included m4
        C definitions; presumably they open the code section and align the entry
        C point that follows to 16 bytes.
     58 ASM_START()
     59 	TEXT
     60 	ALIGN(16)
     61 PROLOGUE(mpn_sublsh1_n)
        C Computes rp[i] = up[i] - (vp[i] << 1) over n limbs, lowest limb first.
        C
        C Two carry chains run in parallel and cannot both live in CF at once:
        C   scy  the bit shifted out of the top of each doubled vp limb
        C   acy  the borrow out of each subtraction
        C Between uses each one is parked in a register as 0 or -1 by
        C "sbb reg, reg" and brought back into CF by "add reg, reg" (adding -1
        C to itself carries out, adding 0 does not).  scy is parked in eax and
        C acy in ebp.
        C
        C In:   rp, up, vp, n as named by the defines at the top of the file.
        C Out:  eax = final scy + final acy, i.e. 0, 1 or 2.
        C Uses: rax, r8-r11 as scratch; rbx and rbp are saved and restored.
        C
        C n = 0 is not special-cased: (vp) is read unconditionally and the
        C n mod 4 == 0 path always processes four limbs, so callers presumably
        C guarantee n >= 1.
        C
        C NOTE(review): R32, FUNC_ENTRY and FUNC_EXIT are macros defined outside
        C this file.  R32(reg) presumably names the 32-bit part of reg, and
        C FUNC_ENTRY/FUNC_EXIT presumably adapt the 4 arguments for DOS64.
     62 	FUNC_ENTRY(4)
     63 	push	%rbx	C rbx and rbp are scratch below, restored at L(end)
     64 	push	%rbp
     65 
     66 	mov	(vp), %r8	C preload vp[0] before vp is advanced
     67 	mov	R32(n), R32(%rax)	C copy of n, reduced mod 4 below
     68 	lea	(rp,n,8), rp	C point rp, up and vp just past their last limb
     69 	lea	(up,n,8), up
     70 	lea	(vp,n,8), vp
     71 	neg	n	C index the arrays with n running from -n up to 0
     72 	xor	R32(%rbp), R32(%rbp)	C acy = 0, needed when entering at L(b00)
     73 	and	$3, R32(%rax)	C eax = n mod 4; also leaves CF = 0 for L(b00)
     74 	je	L(b00)	C n mod 4 == 0: enter the loop body directly
     75 	cmp	$2, R32(%rax)
     76 	jc	L(b01)	C n mod 4 == 1
     77 	je	L(b10)	C n mod 4 == 2
     78 
        C n mod 4 == 3: process three limbs, then join the 4-limb loop.
     79 L(b11):	add	%r8, %r8
     80 	mov	8(vp,n,8), %r9
     81 	adc	%r9, %r9
     82 	mov	16(vp,n,8), %r10
     83 	adc	%r10, %r10
     84 	sbb	R32(%rax), R32(%rax)	C save scy
     85 	mov	(up,n,8), %rbp
     86 	mov	8(up,n,8), %rbx
     87 	sub	%r8, %rbp
     88 	sbb	%r9, %rbx
     89 	mov	%rbp, (rp,n,8)
     90 	mov	%rbx, 8(rp,n,8)
     91 	mov	16(up,n,8), %rbp
     92 	sbb	%r10, %rbp
     93 	mov	%rbp, 16(rp,n,8)
     94 	sbb	R32(%rbp), R32(%rbp)	C save acy
     95 	add	$3, n
     96 	jmp	L(ent)
     97 
        C n mod 4 == 2: process two limbs, then join the 4-limb loop.
     98 L(b10):	add	%r8, %r8
     99 	mov	8(vp,n,8), %r9
    100 	adc	%r9, %r9
    101 	sbb	R32(%rax), R32(%rax)	C save scy
    102 	mov	(up,n,8), %rbp
    103 	mov	8(up,n,8), %rbx
    104 	sub	%r8, %rbp
    105 	sbb	%r9, %rbx
    106 	mov	%rbp, (rp,n,8)
    107 	mov	%rbx, 8(rp,n,8)
    108 	sbb	R32(%rbp), R32(%rbp)	C save acy
    109 	add	$2, n
    110 	jmp	L(ent)
    111 
        C n mod 4 == 1: process one limb, then fall into L(ent).
    112 L(b01):	add	%r8, %r8
    113 	sbb	R32(%rax), R32(%rax)	C save scy
    114 	mov	(up,n,8), %rbp
    115 	sub	%r8, %rbp
    116 	mov	%rbp, (rp,n,8)
    117 	sbb	R32(%rbp), R32(%rbp)	C save acy
    118 	inc	n
        C Flags here come from the add/inc of n above: a non-negative n means
        C the index has reached 0 and no 4-limb group remains.
    119 L(ent):	jns	L(end)
    120 
        C Main loop, four limbs per iteration.  First double four vp limbs on the
        C scy chain, then subtract them from four up limbs on the acy chain.
    121 	ALIGN(16)
    122 L(top):	add	R32(%rax), R32(%rax)	C restore scy
    123 
    124 	mov	(vp,n,8), %r8
        C L(b00) is also the entry for n mod 4 == 0: r8 already holds vp[0] and
        C CF is 0 from the "and" in the setup code, so the adc starts a fresh chain.
    125 L(b00):	adc	%r8, %r8
    126 	mov	8(vp,n,8), %r9
    127 	adc	%r9, %r9
    128 	mov	16(vp,n,8), %r10
    129 	adc	%r10, %r10
    130 	mov	24(vp,n,8), %r11
    131 	adc	%r11, %r11
    132 
    133 	sbb	R32(%rax), R32(%rax)	C save scy
    134 	add	R32(%rbp), R32(%rbp)	C restore acy
    135 
        C The mov instructions leave CF alone, so rbp can be reused as a data
        C register now that acy is back in CF.
    136 	mov	(up,n,8), %rbp
    137 	mov	8(up,n,8), %rbx
    138 	sbb	%r8, %rbp
    139 	sbb	%r9, %rbx
    140 	mov	%rbp, (rp,n,8)
    141 	mov	%rbx, 8(rp,n,8)
    142 	mov	16(up,n,8), %rbp
    143 	mov	24(up,n,8), %rbx
    144 	sbb	%r10, %rbp
    145 	sbb	%r11, %rbx
    146 	mov	%rbp, 16(rp,n,8)
    147 	mov	%rbx, 24(rp,n,8)
    148 
    149 	sbb	R32(%rbp), R32(%rbp)	C save acy
    150 	add	$4, n
    151 	js	L(top)	C loop while the index is still negative
    152 
        C eax and ebp each hold 0 or -1 here, so their negated sum is scy + acy.
    153 L(end):	add	R32(%rbp), R32(%rax)
    154 	neg	R32(%rax)
    155 
    156 	pop	%rbp
    157 	pop	%rbx
    158 	FUNC_EXIT()
    159 	ret
    160 EPILOGUE()
    161