Home | History | Annotate | Line # | Download | only in x86_64
      1      1.1  mrg dnl  AMD64 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
      2      1.1  mrg dnl  AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[]
      3      1.1  mrg 
      4  1.1.1.3  mrg dnl  Copyright 2003, 2005-2009, 2011, 2012 Free Software Foundation, Inc.
      5      1.1  mrg 
      6      1.1  mrg dnl  This file is part of the GNU MP Library.
      7  1.1.1.3  mrg dnl
      8      1.1  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      9  1.1.1.3  mrg dnl  it under the terms of either:
     10  1.1.1.3  mrg dnl
     11  1.1.1.3  mrg dnl    * the GNU Lesser General Public License as published by the Free
     12  1.1.1.3  mrg dnl      Software Foundation; either version 3 of the License, or (at your
     13  1.1.1.3  mrg dnl      option) any later version.
     14  1.1.1.3  mrg dnl
     15  1.1.1.3  mrg dnl  or
     16  1.1.1.3  mrg dnl
     17  1.1.1.3  mrg dnl    * the GNU General Public License as published by the Free Software
     18  1.1.1.3  mrg dnl      Foundation; either version 2 of the License, or (at your option) any
     19  1.1.1.3  mrg dnl      later version.
     20  1.1.1.3  mrg dnl
     21  1.1.1.3  mrg dnl  or both in parallel, as here.
     22  1.1.1.3  mrg dnl
     23      1.1  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     24      1.1  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     25  1.1.1.3  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     26  1.1.1.3  mrg dnl  for more details.
     27  1.1.1.3  mrg dnl
     28  1.1.1.3  mrg dnl  You should have received copies of the GNU General Public License and the
     29  1.1.1.3  mrg dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     30  1.1.1.3  mrg dnl  see https://www.gnu.org/licenses/.
     31      1.1  mrg 
     32      1.1  mrg include(`../config.m4')
     33      1.1  mrg 
     34      1.1  mrg 
     35      1.1  mrg C	     cycles/limb
     36  1.1.1.2  mrg C AMD K8,K9	 2
     37  1.1.1.2  mrg C AMD K10	 2
     38  1.1.1.3  mrg C AMD bd1	 ?
     39  1.1.1.3  mrg C AMD bobcat	 ?
     40  1.1.1.2  mrg C Intel P4	 13
     41  1.1.1.2  mrg C Intel core2	 3.45
     42  1.1.1.3  mrg C Intel NHM	 ?
     43  1.1.1.3  mrg C Intel SBR	 ?
     44  1.1.1.2  mrg C Intel atom	 ?
     45  1.1.1.2  mrg C VIA nano	 ?
     46      1.1  mrg 
     47      1.1  mrg 
     48      1.1  mrg C Sometimes speed degenerates, supposedly because some operand alignments
     49      1.1  mrg C cause cache conflicts.
     50      1.1  mrg 
     51      1.1  mrg C The speed is limited by decoding/issue bandwidth.  There are 22 instructions
     52      1.1  mrg C in the loop, which corresponds to ceil(22/3)/4 = 2 c/l.
     53      1.1  mrg 
     54      1.1  mrg C INPUT PARAMETERS -- m4 names for the registers the four arguments are read from
     55      1.1  mrg define(`rp',`%rdi')	C result limb pointer
     56      1.1  mrg define(`up',`%rsi')	C source operand that is added or subtracted unshifted
     57      1.1  mrg define(`vp',`%rdx')	C source operand that is doubled (shifted left one bit)
     58      1.1  mrg define(`n', `%rcx')	C limb count; used below as a scale-8 index register
     59      1.1  mrg C Pick the instruction pair and the exported name for the variant being built.
                      C ADDSUB starts the up[] carry chain and ADCSBB continues it: add/adc for
                      C mpn_addlsh1_n, sub/sbb for mpn_rsblsh1_n.
                      C NOTE(review): nothing in this file defines OPERATION_addlsh1_n or
                      C OPERATION_rsblsh1_n; presumably the build defines exactly one -- confirm.
     60      1.1  mrg ifdef(`OPERATION_addlsh1_n', `
     61  1.1.1.2  mrg   define(ADDSUB,	add)
     62  1.1.1.2  mrg   define(ADCSBB,	adc)
     63  1.1.1.2  mrg   define(func,		mpn_addlsh1_n)')
     64      1.1  mrg ifdef(`OPERATION_rsblsh1_n', `
     65  1.1.1.2  mrg   define(ADDSUB,	sub)
     66  1.1.1.2  mrg   define(ADCSBB,	sbb)
     67  1.1.1.2  mrg   define(func,		mpn_rsblsh1_n)')
     68      1.1  mrg C Names both functions this one source file can be built as.
                      C NOTE(review): MULFUNC_PROLOGUE, ABI_SUPPORT, ASM_START, TEXT and ALIGN are
                      C defined outside this file (via config.m4); the notes below are presumptions
                      C from their names and arguments -- confirm against their definitions.
     69      1.1  mrg MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_rsblsh1_n)
     70      1.1  mrg 
                      C Presumably declares that this code is usable under both the DOS64 and the
                      C STD64 calling conventions.
     71  1.1.1.2  mrg ABI_SUPPORT(DOS64)
     72  1.1.1.2  mrg ABI_SUPPORT(STD64)
     73  1.1.1.2  mrg 
                      C Presumably: begin assembler output, switch to the code section, and
                      C align the function entry to 16 bytes.
     74      1.1  mrg ASM_START()
     75      1.1  mrg 	TEXT
     76      1.1  mrg 	ALIGN(16)
     77      1.1  mrg PROLOGUE(func)
                      C func(rp, up, vp, n), operating on n 64-bit limbs.  Built either as
                      C   mpn_addlsh1_n:  rp[] = up[] + (vp[] << 1)
                      C   mpn_rsblsh1_n:  rp[] = (vp[] << 1) - up[]
                      C
                      C Two carries are tracked, and they take turns living in CF:
                      C   scy  bit shifted out of the vp[] doubling chain   (saved in %eax)
                      C   acy  carry/borrow of the ADDSUB/ADCSBB chain      (saved in %ebp)
                      C "sbb r,r" parks CF in r as 0 or -1; "add r,r" on that value puts it
                      C back into CF (0+0 gives CF=0, -1 + -1 gives CF=1).
                      C
                      C Return value in %rax:
                      C   addlsh1:  scy + acy, in 0..2
                      C   rsblsh1:  scy - acy, in -1..1 (sign-extended to 64 bits)
                      C
                      C Assumes n >= 1: vp[0] is loaded before n is examined.
                      C NOTE(review): FUNC_ENTRY, FUNC_EXIT, R32 and L are defined outside this
                      C file.  Presumably FUNC_ENTRY(4)/FUNC_EXIT adapt the DOS64 argument
                      C registers, R32(x) names the 32-bit view of x, and L() makes a local
                      C label -- confirm.
     78  1.1.1.2  mrg 	FUNC_ENTRY(4)
     79      1.1  mrg 	push	%rbp			C rbp holds acy below; restored before ret
     80      1.1  mrg 
                      C Load vp[0] early, then turn rp/up/vp into end-of-operand pointers and n
                      C into a negative index that counts up towards zero.
     81      1.1  mrg 	mov	(vp), %r8
     82      1.1  mrg 	mov	R32(n), R32(%rax)
     83      1.1  mrg 	lea	(rp,n,8), rp
     84      1.1  mrg 	lea	(up,n,8), up
     85      1.1  mrg 	lea	(vp,n,8), vp
     86      1.1  mrg 	neg	n
     87      1.1  mrg 	xor	R32(%rbp), R32(%rbp)	C acy = 0
                      C Dispatch on n mod 4.  The "and" also clears CF, so for n mod 4 == 0 the
                      C loop is entered at L(b00) with CF = 0, eax = 0 (scy) and ebp = 0 (acy).
     88      1.1  mrg 	and	$3, R32(%rax)
     89      1.1  mrg 	je	L(b00)
     90      1.1  mrg 	cmp	$2, R32(%rax)
     91      1.1  mrg 	jc	L(b01)			C n mod 4 == 1
     92      1.1  mrg 	je	L(b10)			C n mod 4 == 2
     93      1.1  mrg 
                      C n mod 4 == 3: handle the first three limbs, then join the loop.
     94      1.1  mrg L(b11):	add	%r8, %r8
     95      1.1  mrg 	mov	8(vp,n,8), %r9
     96      1.1  mrg 	adc	%r9, %r9
     97      1.1  mrg 	mov	16(vp,n,8), %r10
     98      1.1  mrg 	adc	%r10, %r10
     99      1.1  mrg 	sbb	R32(%rax), R32(%rax)	C save scy
    100      1.1  mrg 	ADDSUB	(up,n,8), %r8
    101      1.1  mrg 	ADCSBB	8(up,n,8), %r9
    102      1.1  mrg 	mov	%r8, (rp,n,8)
    103      1.1  mrg 	mov	%r9, 8(rp,n,8)
    104      1.1  mrg 	ADCSBB	16(up,n,8), %r10
    105      1.1  mrg 	mov	%r10, 16(rp,n,8)
    106      1.1  mrg 	sbb	R32(%rbp), R32(%rbp)	C save acy
    107      1.1  mrg 	add	$3, n
    108      1.1  mrg 	jmp	L(ent)
    109      1.1  mrg 
                      C n mod 4 == 2: handle the first two limbs, then join the loop.
    110      1.1  mrg L(b10):	add	%r8, %r8
    111      1.1  mrg 	mov	8(vp,n,8), %r9
    112      1.1  mrg 	adc	%r9, %r9
    113      1.1  mrg 	sbb	R32(%rax), R32(%rax)	C save scy
    114      1.1  mrg 	ADDSUB	(up,n,8), %r8
    115      1.1  mrg 	ADCSBB	8(up,n,8), %r9
    116      1.1  mrg 	mov	%r8, (rp,n,8)
    117      1.1  mrg 	mov	%r9, 8(rp,n,8)
    118      1.1  mrg 	sbb	R32(%rbp), R32(%rbp)	C save acy
    119      1.1  mrg 	add	$2, n
    120      1.1  mrg 	jmp	L(ent)
    121      1.1  mrg 
                      C n mod 4 == 1: handle the first limb, then fall into L(ent).
    122      1.1  mrg L(b01):	add	%r8, %r8
    123      1.1  mrg 	sbb	R32(%rax), R32(%rax)	C save scy
    124      1.1  mrg 	ADDSUB	(up,n,8), %r8
    125      1.1  mrg 	mov	%r8, (rp,n,8)
    126      1.1  mrg 	sbb	R32(%rbp), R32(%rbp)	C save acy
    127      1.1  mrg 	inc	n
                      C The sign flag here comes from the index update just done (add or inc);
                      C a non-negative index means no limbs remain, so skip the loop.
    128      1.1  mrg L(ent):	jns	L(end)
    129      1.1  mrg 
                      C Main loop, four limbs per iteration: first double four vp limbs
                      C continuing scy, then run the ADCSBB chain against up[] continuing acy.
    130      1.1  mrg 	ALIGN(16)
    131      1.1  mrg L(top):	add	R32(%rax), R32(%rax)	C restore scy
    132      1.1  mrg 
    133      1.1  mrg 	mov	(vp,n,8), %r8
    134      1.1  mrg L(b00):	adc	%r8, %r8
    135      1.1  mrg 	mov	8(vp,n,8), %r9
    136      1.1  mrg 	adc	%r9, %r9
    137      1.1  mrg 	mov	16(vp,n,8), %r10
    138      1.1  mrg 	adc	%r10, %r10
    139      1.1  mrg 	mov	24(vp,n,8), %r11
    140      1.1  mrg 	adc	%r11, %r11
    141      1.1  mrg 
    142      1.1  mrg 	sbb	R32(%rax), R32(%rax)	C save scy
    143      1.1  mrg 	add	R32(%rbp), R32(%rbp)	C restore acy
    144      1.1  mrg 
    145      1.1  mrg 	ADCSBB	(up,n,8), %r8
    146      1.1  mrg 	nop				C Hammer speedup!
    147      1.1  mrg 	ADCSBB	8(up,n,8), %r9
    148      1.1  mrg 	mov	%r8, (rp,n,8)
    149      1.1  mrg 	mov	%r9, 8(rp,n,8)
    150      1.1  mrg 	ADCSBB	16(up,n,8), %r10
    151      1.1  mrg 	ADCSBB	24(up,n,8), %r11
    152      1.1  mrg 	mov	%r10, 16(rp,n,8)
    153      1.1  mrg 	mov	%r11, 24(rp,n,8)
    154      1.1  mrg 
    155      1.1  mrg 	sbb	R32(%rbp), R32(%rbp)	C save acy
    156      1.1  mrg 	add	$4, n
    157      1.1  mrg 	js	L(top)			C index still negative: more limbs remain
    158      1.1  mrg 
                      C Combine the saved carries into the return value.  eax = -scy and
                      C ebp = -acy, each 0 or -1.
                      C   addlsh1:  eax = -(eax + ebp) = scy + acy; the 32-bit write leaves the
                      C             upper half of rax zero.
                      C   rsblsh1:  ebp - eax = scy - acy, which may be -1, so it is
                      C             sign-extended into rax.
    159      1.1  mrg L(end):
    160      1.1  mrg ifdef(`OPERATION_addlsh1_n',`
    161      1.1  mrg 	add	R32(%rbp), R32(%rax)
    162      1.1  mrg 	neg	R32(%rax)')
    163      1.1  mrg ifdef(`OPERATION_rsblsh1_n',`
    164      1.1  mrg 	sub	R32(%rax), R32(%rbp)
    165      1.1  mrg 	movslq	R32(%rbp), %rax')
    166      1.1  mrg 
    167      1.1  mrg 	pop	%rbp
    168  1.1.1.2  mrg 	FUNC_EXIT()
    169      1.1  mrg 	ret
    170      1.1  mrg EPILOGUE()
    171