Home | History | Annotate | Line # | Download | only in pentium4
lshift.asm revision 1.1.1.1.2.1
      1 dnl  x86-64 mpn_lshift optimized for Pentium 4.
      2 
      3 dnl  Copyright 2003, 2005, 2007, 2008, 2012 Free Software Foundation, Inc.
      4 
      5 dnl  This file is part of the GNU MP Library.
      6 
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8 dnl  it under the terms of the GNU Lesser General Public License as published
      9 dnl  by the Free Software Foundation; either version 3 of the License, or (at
     10 dnl  your option) any later version.
     11 
     12 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     13 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     14 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
     15 dnl  License for more details.
     16 
     17 dnl  You should have received a copy of the GNU Lesser General Public License
     18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
     19 
     20 include(`../config.m4')
     21 
     22 
     23 C	     cycles/limb
     24 C AMD K8,K9	 2.5
     25 C AMD K10	 ?
     26 C Intel P4	 3.29
     27 C Intel core2	 2.1 (fluctuates, presumably cache related)
     28 C Intel corei	 ?
     29 C Intel atom	14.3
     30 C VIA nano	 ?
     31 
     32 C INPUT PARAMETERS
     33 define(`rp',`%rdi')		C base pointer for every store (result limbs)
     34 define(`up',`%rsi')		C base pointer for every load (source limbs)
     35 define(`n',`%rdx')		C limb count; used as an index scaled by 8 bytes
     36 define(`cnt',`%cl')		C shift count; the function body names this register as R32(%rcx) / R8(%rcx)
     37 
     38 ABI_SUPPORT(DOS64)		C macro from the included m4 framework; presumably declares DOS64 ABI support - confirm
     39 ABI_SUPPORT(STD64)		C likewise, presumably for the STD64 ABI - confirm
     40 
     41 ASM_START()
     42 	TEXT
     43 	ALIGN(32)		C presumably aligns the function entry that follows to 32 bytes - confirm
     44 PROLOGUE(mpn_lshift)
        C mpn_lshift: shift the n-limb operand at up left by cnt bits and store
        C the n result limbs at rp.  Limbs are 64 bits and are processed from
        C the most significant (index n-1) down to limb 0:
        C   rp[i] = (up[i] << cnt) | (up[i-1] >> rsh)  for i = n-1 .. 1
        C   rp[0] =  up[0] << cnt
        C The bits shifted out of the top limb, up[n-1] >> rsh, are returned
        C in %rax.
        C
        C Register roles:
        C   %mm4      left shift count (cnt)
        C   %mm5      right shift count rsh = (-cnt) & 63
        C   %mm0-%mm3 limb temporaries
        C   %r8       (n + 1) & 3, consumed by the entry dispatch
        C
        C The main loop handles 4 limbs per pass and is entered with
        C n mod 4 = 3; the dispatch code peels 1, 2 or 3 limbs first to reach
        C that state.
        C
        C NOTE(review): with cnt = 0 the right shift count is also 0, so the
        C OR would merge an unshifted lower limb; cnt is presumably required
        C to be in 1..63 - confirm against the caller contract.
        C NOTE(review): n is presumably >= 1, since the first load reads
        C up[n-1] - confirm.
     45 	FUNC_ENTRY(4)		C macro defined outside this file; presumably ABI entry glue - confirm
     46 	mov	-8(up,n,8), %rax		C rax = up[n-1], the most significant limb
     47 	movd	R32(%rcx), %mm4		C mm4 = cnt, the left shift count
     48 	neg	R32(%rcx)		C put rsh count in cl
     49 	and	$63, R32(%rcx)		C rsh = (-cnt) & 63, which is 64 - cnt for cnt in 1..63
     50 	movd	R32(%rcx), %mm5		C mm5 = rsh, the right shift count
     51 
     52 	lea	1(n), R32(%r8)		C r8 = low 32 bits of n + 1, for the dispatch below
     53 
     54 	shr	R8(%rcx), %rax		C function return value
     55 
     56 	and	$3, R32(%r8)		C r8 = (n + 1) & 3; zero exactly when n mod 4 = 3
     57 	je	L(rol)			C jump for n = 3, 7, 11, ...
     58 
     59 	dec	R32(%r8)		C now zero exactly when n mod 4 = 0
     60 	jne	L(1)
     61 C	n = 4, 8, 12, ...
     62 	movq	-8(up,n,8), %mm2		C peel one limb: mm2 = up[n-1]
     63 	psllq	%mm4, %mm2		C up[n-1] << cnt
     64 	movq	-16(up,n,8), %mm0		C mm0 = up[n-2]
     65 	psrlq	%mm5, %mm0		C up[n-2] >> rsh
     66 	por	%mm0, %mm2		C combine into the result limb
     67 	movq	%mm2, -8(rp,n,8)		C rp[n-1] done
     68 	dec	n			C remaining count is now 3 mod 4
     69 	jmp	L(rol)
     70 
     71 L(1):	dec	R32(%r8)		C now zero exactly when n mod 4 = 1
     72 	je	L(1x)			C jump for n = 1, 5, 9, 13, ...
     73 C	n = 2, 6, 10, 14, ...
     74 	movq	-8(up,n,8), %mm2		C peel one limb: mm2 = up[n-1]
     75 	psllq	%mm4, %mm2		C up[n-1] << cnt
     76 	movq	-16(up,n,8), %mm0		C mm0 = up[n-2]
     77 	psrlq	%mm5, %mm0		C up[n-2] >> rsh
     78 	por	%mm0, %mm2		C combine into the result limb
     79 	movq	%mm2, -8(rp,n,8)		C rp[n-1] done
     80 	dec	n			C remaining count is now 1 mod 4
     81 L(1x):
     82 	cmp	$1, n			C a single remaining limb is limb 0, which has no lower neighbour
     83 	je	L(ast)
     84 	movq	-8(up,n,8), %mm2		C peel two limbs: mm2 = up[n-1]
     85 	psllq	%mm4, %mm2		C up[n-1] << cnt
     86 	movq	-16(up,n,8), %mm3		C mm3 = up[n-2]
     87 	psllq	%mm4, %mm3		C up[n-2] << cnt
     88 	movq	-16(up,n,8), %mm0		C mm0 = up[n-2] again, for the right shift
     89 	movq	-24(up,n,8), %mm1		C mm1 = up[n-3]
     90 	psrlq	%mm5, %mm0		C up[n-2] >> rsh
     91 	por	%mm0, %mm2		C mm2 = result limb n-1
     92 	psrlq	%mm5, %mm1		C up[n-3] >> rsh
     93 	por	%mm1, %mm3		C mm3 = result limb n-2
     94 	movq	%mm2, -8(rp,n,8)		C rp[n-1] done
     95 	movq	%mm3, -16(rp,n,8)		C rp[n-2] done
     96 	sub	$2, n			C remaining count is now 3 mod 4
     97 
     98 L(rol):	movq	-8(up,n,8), %mm2		C here n mod 4 = 3; mm2 = up[n-1]
     99 	psllq	%mm4, %mm2		C pending high part: up[n-1] << cnt
    100 	movq	-16(up,n,8), %mm3		C mm3 = up[n-2]
    101 	psllq	%mm4, %mm3		C pending high part: up[n-2] << cnt
    102 
    103 	sub	$4, n			C				      4
    104 	jb	L(end)			C				      2
    105 	ALIGN(32)
    106 L(top):
        C Loop invariant on entry to each pass: n is 3 mod 4 and n >= 3,
        C mm2 = up[n+3] << cnt and mm3 = up[n+2] << cnt are still waiting for
        C their low parts.  Each pass stores rp[n+3] .. rp[n].
    107 	C finish stuff from lsh block
    108 	movq	16(up,n,8), %mm0		C mm0 = up[n+2]
    109 	movq	8(up,n,8), %mm1		C mm1 = up[n+1]
    110 	psrlq	%mm5, %mm0		C up[n+2] >> rsh
    111 	por	%mm0, %mm2		C mm2 = result limb n+3
    112 	psrlq	%mm5, %mm1		C up[n+1] >> rsh
    113 	movq	(up,n,8), %mm0		C mm0 = up[n]
    114 	por	%mm1, %mm3		C mm3 = result limb n+2
    115 	movq	-8(up,n,8), %mm1		C mm1 = up[n-1]
    116 	movq	%mm2, 24(rp,n,8)		C rp[n+3] done
    117 	movq	%mm3, 16(rp,n,8)		C rp[n+2] done
    118 	C start two new rsh
    119 	psrlq	%mm5, %mm0		C low part for limb n+1: up[n] >> rsh
    120 	psrlq	%mm5, %mm1		C low part for limb n:   up[n-1] >> rsh
    121 
    122 	C finish stuff from rsh block
    123 	movq	8(up,n,8), %mm2		C mm2 = up[n+1]
    124 	movq	(up,n,8), %mm3		C mm3 = up[n]
    125 	psllq	%mm4, %mm2		C up[n+1] << cnt
    126 	por	%mm2, %mm0		C mm0 = result limb n+1
    127 	psllq	%mm4, %mm3		C up[n] << cnt
    128 	movq	-8(up,n,8), %mm2		C mm2 = up[n-1], top limb of the next pass
    129 	por	%mm3, %mm1		C mm1 = result limb n
    130 	movq	-16(up,n,8), %mm3		C mm3 = up[n-2], second limb of the next pass
    131 	movq	%mm0, 8(rp,n,8)		C rp[n+1] done
    132 	movq	%mm1, (rp,n,8)		C rp[n] done
    133 	C start two new lsh
    134 	sub	$4, n			C sets the borrow tested by jae below; the two psllq in between do not modify flags
    135 	psllq	%mm4, %mm2		C pending high part for the next pass or the tail
    136 	psllq	%mm4, %mm3		C pending high part for the next pass or the tail
    137 
    138 	jae	L(top)			C				      2
    139 L(end):
        C Three limbs remain.  mm2 = up[2] << cnt and mm3 = up[1] << cnt are
        C pending; n has wrapped below zero, so absolute offsets are used.
    140 	movq	8(up), %mm0		C mm0 = up[1]
    141 	psrlq	%mm5, %mm0		C up[1] >> rsh
    142 	por	%mm0, %mm2		C mm2 = result limb 2
    143 	movq	(up), %mm1		C mm1 = up[0]
    144 	psrlq	%mm5, %mm1		C up[0] >> rsh
    145 	por	%mm1, %mm3		C mm3 = result limb 1
    146 	movq	%mm2, 16(rp)		C rp[2] done
    147 	movq	%mm3, 8(rp)		C rp[1] done
    148 
    149 L(ast):	movq	(up), %mm2		C limb 0: nothing below it, so only the left shift applies
    150 	psllq	%mm4, %mm2		C up[0] << cnt, low cnt bits become zero
    151 	movq	%mm2, (rp)		C rp[0] done
    152 	emms				C reset the MMX state before returning
    153 	FUNC_EXIT()			C macro defined outside this file; presumably undoes FUNC_ENTRY - confirm
    154 	ret				C result is in %rax, computed near the top of the function
    155 EPILOGUE()
    156