Home | History | Annotate | Line # | Download | only in x86_64
lshift.asm revision 1.1.1.1
      1 dnl  AMD64 mpn_lshift -- mpn left shift.
      2 
      3 dnl  Copyright 2003, 2005, 2007, 2009 Free Software Foundation, Inc.
      4 dnl
      5 dnl  This file is part of the GNU MP Library.
      6 dnl
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or
      8 dnl  modify it under the terms of the GNU Lesser General Public License as
      9 dnl  published by the Free Software Foundation; either version 3 of the
     10 dnl  License, or (at your option) any later version.
     11 dnl
     12 dnl  The GNU MP Library is distributed in the hope that it will be useful,
     13 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
     14 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     15 dnl  Lesser General Public License for more details.
     16 dnl
     17 dnl  You should have received a copy of the GNU Lesser General Public License
     18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
     19 
     20 include(`../config.m4')
     21 
     22 
     23 C	     cycles/limb   cycles/limb cnt=1
     24 C K8,K9:	 2.375		 1.375
     25 C K10:		 2.375		 1.375
     26 C P4:		 8		10.5
     27 C P6-15 (Core2): 2.11		 4.28
     28 C P6-28 (Atom):	 5.75		 3.5
     29 
     30 
     31 C INPUT PARAMETERS
        C Symbolic names for the argument registers used by the routine below
        C (the first four integer-argument registers of the System V AMD64
        C calling convention).
     32 define(`rp',	`%rdi')		C destination limb pointer (results are stored through it)
     33 define(`up',	`%rsi')		C source limb pointer (operand limbs are loaded through it)
     34 define(`n',	`%rdx')		C limb count; modified while the routine runs
     35 define(`cnt',	`%rcx')		C shift count in bits; the code reads %cl and negates %ecx
     36 
     37 ASM_START()
     38 	TEXT
     39 	ALIGN(32)
        C mpn_lshift -- shift an n-limb operand left by cnt bits (64-bit limbs).
        C
        C In:   rp (%rdi) = destination limbs, up (%rsi) = source limbs,
        C       n (%rdx) = limb count, cnt (%rcx) = shift count (only %cl is read).
        C Out:  %rax = bits shifted out of the most significant limb
        C       (up[n-1] >> (64-cnt) on the general path, the final carry on the
        C       cnt=1 path).
        C Uses: %rax, %rcx, %rdx, %rsi, %rdi, %r8-%r11 and the flags; no stack.
        C NOTE(review): inputs are not validated; the code appears to assume
        C n >= 1 and 1 <= cnt <= 63 -- confirm against the callers.
        C
        C Two code paths:
        C   cnt = 1:  add-with-carry chain (x + x + cy) walking from the lowest
        C             limb upwards, taken only when that direction cannot
        C             overwrite source limbs that are still unread.
        C   general:  shl/shr/or pairs walking from the highest limb downwards.
        C NOTE(review): the general path never checks overlap, so presumably the
        C caller must not place rp below an overlapping up -- confirm.
     40 PROLOGUE(mpn_lshift)
     41 	cmp	$1, R8(%rcx)
     42 	jne	L(gen)			C cnt != 1: use the general shifter
     43 
     44 C For cnt=1 we want to work from lowest limb towards higher limbs.
     45 C Check for bad overlap (up=rp is OK!) rp=up+1..up+n-1 is bad.
     46 C FIXME: this could surely be done more cleverly.
     47 
     48 	mov    rp, %rax
     49 	sub    up, %rax			C rax = rp - up, in bytes
     50 	je     L(fwd)			C rp = up
     51 	shr    $3, %rax			C byte distance -> limb distance (unsigned)
     52 	cmp    n, %rax
     53 	jb     L(gen)			C 0 < rp-up < n limbs: must go high-to-low
     54 
     55 L(fwd):	mov	R32(n), R32(%rax)
     56 	shr	$2, n			C n = number of 4-limb loop iterations
     57 	je	L(e1)			C fewer than 4 limbs: tail only (eax = n)
     58 	and	$3, R32(%rax)		C eax = leftover limbs (n mod 4); also clears cy
     59 
     60 	ALIGN(8)
     61 	nop
     62 	nop
     63 L(t1):	mov	(up), %r8		C load four source limbs
     64 	mov	8(up), %r9
     65 	mov	16(up), %r10
     66 	mov	24(up), %r11
     67 	lea	32(up), up		C lea leaves the flags (cy) alone
     68 	adc	%r8, %r8		C limb = 2*limb + cy; new cy = old top bit
     69 	mov	%r8, (rp)
     70 	adc	%r9, %r9
     71 	mov	%r9, 8(rp)
     72 	adc	%r10, %r10
     73 	mov	%r10, 16(rp)
     74 	adc	%r11, %r11
     75 	mov	%r11, 24(rp)
     76 	lea	32(rp), rp
     77 	dec	n			C dec does not write cy, so the chain survives
     78 	jne	L(t1)
     79 
     80 	inc	R32(%rax)
     81 	dec	R32(%rax)		C inc+dec: set ZF from eax, keep cy and eax
     82 	jne	L(n00)			C leftover limbs still to do
     83 	adc	R32(%rax), R32(%rax)	C eax is 0 here: return cy
     84 	ret
     85 L(e1):	test	R32(%rax), R32(%rax)	C clear cy
     86 L(n00):	mov	(up), %r8		C tail: eax = limbs still to do
     87 	dec	R32(%rax)
     88 	jne	L(n01)
     89 	adc	%r8, %r8		C exactly one limb left
     90 	mov	%r8, (rp)
     91 L(ret):	adc	R32(%rax), R32(%rax)	C eax is 0 here: return cy
     92 	ret
     93 L(n01):	dec	R32(%rax)
     94 	mov	8(up), %r9		C mov keeps the ZF just set by dec
     95 	jne	L(n10)
     96 	adc	%r8, %r8		C exactly two limbs left
     97 	adc	%r9, %r9
     98 	mov	%r8, (rp)
     99 	mov	%r9, 8(rp)
    100 	adc	R32(%rax), R32(%rax)	C eax is 0 here: return cy
    101 	ret
    102 L(n10):	mov	16(up), %r10		C three limbs left; eax is 1 here
    103 	adc	%r8, %r8
    104 	adc	%r9, %r9
    105 	adc	%r10, %r10
    106 	mov	%r8, (rp)
    107 	mov	%r9, 8(rp)
    108 	mov	%r10, 16(rp)
    109 	adc	$-1, R32(%rax)		C 1 + (-1) + cy = cy: return cy
    110 	ret
    111 
        C General case.  cl is toggled between cnt and -cnt with neg; a 64-bit
        C shift uses only the low 6 bits of cl, so -cnt shifts by 64-cnt.
        C Each result limb is  up[i] << cnt  |  up[i-1] >> (64-cnt).
    112 L(gen):	neg	R32(%rcx)		C put rsh count in cl
    113 	mov	-8(up,n,8), %rax
    114 	shr	R8(%rcx), %rax		C function return value
    115 
        C Peel limbs off the top until n = 3 (mod 4), the entry condition of
        C L(rlx); n = 1 is sent straight to L(ast).
    116 	neg	R32(%rcx)		C put lsh count in cl
    117 	lea	1(n), R32(%r8)
    118 	and	$3, R32(%r8)		C r8d = (n+1) mod 4
    119 	je	L(rlx)			C jump for n = 3, 7, 11, ...
    120 
    121 	dec	R32(%r8)
    122 	jne	L(1)
    123 C	n = 4, 8, 12, ...
    124 	mov	-8(up,n,8), %r10	C peel one limb: produce rp[n-1]
    125 	shl	R8(%rcx), %r10
    126 	neg	R32(%rcx)		C put rsh count in cl
    127 	mov	-16(up,n,8), %r8
    128 	shr	R8(%rcx), %r8
    129 	or	%r8, %r10
    130 	mov	%r10, -8(rp,n,8)
    131 	dec	n
    132 	jmp	L(rll)
    133 
    134 L(1):	dec	R32(%r8)
    135 	je	L(1x)			C jump for n = 1, 5, 9, 13, ...
    136 C	n = 2, 6, 10, 14, ...
    137 	mov	-8(up,n,8), %r10	C peel one limb: produce rp[n-1]
    138 	shl	R8(%rcx), %r10
    139 	neg	R32(%rcx)		C put rsh count in cl
    140 	mov	-16(up,n,8), %r8
    141 	shr	R8(%rcx), %r8
    142 	or	%r8, %r10
    143 	mov	%r10, -8(rp,n,8)
    144 	dec	n
    145 	neg	R32(%rcx)		C put lsh count in cl
    146 L(1x):
    147 	cmp	$1, n
    148 	je	L(ast)			C only the lowest limb is left
    149 	mov	-8(up,n,8), %r10	C peel two limbs: produce rp[n-1], rp[n-2]
    150 	shl	R8(%rcx), %r10
    151 	mov	-16(up,n,8), %r11
    152 	shl	R8(%rcx), %r11
    153 	neg	R32(%rcx)		C put rsh count in cl
    154 	mov	-16(up,n,8), %r8
    155 	mov	-24(up,n,8), %r9
    156 	shr	R8(%rcx), %r8
    157 	or	%r8, %r10
    158 	shr	R8(%rcx), %r9
    159 	or	%r9, %r11
    160 	mov	%r10, -8(rp,n,8)
    161 	mov	%r11, -16(rp,n,8)
    162 	sub	$2, n
    163 
    164 L(rll):	neg	R32(%rcx)		C put lsh count in cl
    165 L(rlx):	mov	-8(up,n,8), %r10	C start the lsh halves of rp[n-1], rp[n-2]
    166 	shl	R8(%rcx), %r10
    167 	mov	-16(up,n,8), %r11
    168 	shl	R8(%rcx), %r11
    169 
    170 	sub	$4, n			C				      4
    171 	jb	L(end)			C				      2
        C Main loop.  n has already been lowered by 4; each pass stores the four
        C limbs rp[n+3]..rp[n] and leaves in r10/r11 the lsh halves of the next
        C two limbs for the following pass or for L(end).
    172 	ALIGN(16)
    173 L(top):
    174 	C finish stuff from lsh block
    175 	neg	R32(%rcx)		C put rsh count in cl
    176 	mov	16(up,n,8), %r8
    177 	mov	8(up,n,8), %r9
    178 	shr	R8(%rcx), %r8
    179 	or	%r8, %r10
    180 	shr	R8(%rcx), %r9
    181 	or	%r9, %r11
    182 	mov	%r10, 24(rp,n,8)
    183 	mov	%r11, 16(rp,n,8)
    184 	C start two new rsh
    185 	mov	0(up,n,8), %r8
    186 	mov	-8(up,n,8), %r9
    187 	shr	R8(%rcx), %r8
    188 	shr	R8(%rcx), %r9
    189 
    190 	C finish stuff from rsh block
    191 	neg	R32(%rcx)		C put lsh count in cl
    192 	mov	8(up,n,8), %r10
    193 	mov	0(up,n,8), %r11
    194 	shl	R8(%rcx), %r10
    195 	or	%r10, %r8
    196 	shl	R8(%rcx), %r11
    197 	or	%r11, %r9
    198 	mov	%r8, 8(rp,n,8)
    199 	mov	%r9, 0(rp,n,8)
    200 	C start two new lsh
    201 	mov	-8(up,n,8), %r10
    202 	mov	-16(up,n,8), %r11
    203 	shl	R8(%rcx), %r10
    204 	shl	R8(%rcx), %r11
    205 
    206 	sub	$4, n
    207 	jae	L(top)			C				      2
        C Three limbs remain: rp[2] and rp[1] are completed here from the pending
        C lsh halves in r10/r11, then L(ast) produces rp[0].
    208 L(end):
    209 	neg	R32(%rcx)		C put rsh count in cl
    210 	mov	8(up), %r8
    211 	shr	R8(%rcx), %r8
    212 	or	%r8, %r10
    213 	mov	(up), %r9
    214 	shr	R8(%rcx), %r9
    215 	or	%r9, %r11
    216 	mov	%r10, 16(rp)
    217 	mov	%r11, 8(rp)
    218 
    219 	neg	R32(%rcx)		C put lsh count in cl
    220 L(ast):	mov	(up), %r10		C lowest limb: zeros shift in from below
    221 	shl	R8(%rcx), %r10
    222 	mov	%r10, (rp)
    223 	ret				C rax still holds the value set at L(gen)
    224 EPILOGUE()
    225