dnl  AMD64 mpn_lshift optimised for CPUs with fast SSE including fast movdqu.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2010, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb     cycles/limb     cycles/limb    good
C              aligned	      unaligned	      best seen	   for cpu?
C AMD K8,K9	 3		 3		 2.35	  no, use shl/shr
C AMD K10	 1.5-1.8	 1.5-1.8	 1.33	  yes
C AMD bd1	 1.7-1.9	 1.7-1.9	 1.33	  yes
C AMD bobcat	 3.17		 3.17			  yes, bad for n < 20
C Intel P4	 4.67		 4.67		 2.7	  no, slow movdqu
C Intel core2	 2.15		 2.15		 1.25	  no, use shld/shrd
C Intel NHM	 1.66		 1.66		 1.25	  no, use shld/shrd
C Intel SBR	 1.3		 1.3		 1.25	  yes, bad for n = 4-6
C Intel atom	11.7		11.7		 4.5	  no
C VIA nano	 5.7		 5.95		 2.0	  no, slow movdqu
C We try to do as many aligned 16-byte operations as possible.  The top-most
C and bottom-most writes might need 8-byte operations.
C
C This variant relies on movdqu loads being fast, and uses movdqu even for
C aligned operands, in order to avoid the need for two separate loops.
C
C TODO
C  * Could the 2-limb wind-down code be simplified?
C  * Improve the basecase code, using shld/shrd for SBR and discrete integer
C    shifts for the other affected CPUs.
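
C In C terms, assuming 64-bit limbs and 1 <= cnt < 64, the job done here is
C roughly the following (a sketch of the semantics only; i and retval are
C illustrative names, not part of the interface):
C
C	retval = ap[n-1] >> (64 - cnt);
C	for (i = n - 1; i > 0; i--)
C	  rp[i] = (ap[i] << cnt) | (ap[i-1] >> (64 - cnt));
C	rp[0] = ap[0] << cnt;
C	return retval;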

C INPUT PARAMETERS
define(`rp',  `%rdi')
define(`ap',  `%rsi')
define(`n',   `%rdx')
define(`cnt', `%rcx')
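
C These are the System V AMD64 argument registers; on Win64 builds,
C FUNC_ENTRY(4) below is expected to remap the four Windows-ABI argument
C registers onto them.  The C prototype is
C   mp_limb_t mpn_lshift (mp_ptr rp, mp_srcptr ap, mp_size_t n, unsigned int cnt)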

ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_lshift)
	FUNC_ENTRY(4)
	movd	R32(%rcx), %xmm4
	mov	$64, R32(%rax)
	sub	R32(%rcx), R32(%rax)
	movd	R32(%rax), %xmm5

	neg	R32(%rcx)
	mov	-8(ap,n,8), %rax
	shr	R8(%rcx), %rax
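C Now xmm4 = cnt and xmm5 = 64-cnt, the two counts needed to combine
C adjacent limbs.  rax = ap[n-1] >> (64-cnt) is the return value, the bits
C shifted out at the top; shr masks its count mod 64, so the negated cnt
C in cl works directly.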

	cmp	$3, n
	jle	L(bc)

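C The main loop stores with aligned movdqa and works downwards from the
C top, so first make rp+8n 16-byte aligned; bit 3 of that address tells
C whether a single 8-byte fixup store is needed.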
	lea	(rp,n,8), R32(%rcx)
	bt	$3, R32(%rcx)
	jnc	L(rp_aligned)

C Do one initial limb in order to make rp aligned
	movq	-8(ap,n,8), %xmm0
	movq	-16(ap,n,8), %xmm1
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movq	%xmm0, -8(rp,n,8)
	dec	n

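C Jump into the 8-way unrolled loop at a point determined by n mod 8, so
C that the loop proper always runs over whole 8-limb blocks; any odd
C final limb is handled after the loop.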
L(rp_aligned):
	lea	1(n), %r8d

	and	$6, R32(%r8)
	jz	L(ba0)
	cmp	$4, R32(%r8)
	jz	L(ba4)
	jc	L(ba2)
L(ba6):	add	$-4, n
	jmp	L(i56)
L(ba0):	add	$-6, n
	jmp	L(i70)
L(ba4):	add	$-2, n
	jmp	L(i34)
L(ba2):	add	$-8, n
	jle	L(end)

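C Main loop: 8 limbs per iteration, as four 2-limb SSE steps.  Each step
C shifts a misaligned 2-limb load left by cnt, ORs in the neighbouring
C lower limbs shifted right by 64-cnt, and stores 16 aligned bytes.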
	ALIGN(16)
L(top):	movdqu	40(ap,n,8), %xmm1
	movdqu	48(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, 48(rp,n,8)
L(i70):
	movdqu	24(ap,n,8), %xmm1
	movdqu	32(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, 32(rp,n,8)
L(i56):
	movdqu	8(ap,n,8), %xmm1
	movdqu	16(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, 16(rp,n,8)
L(i34):
	movdqu	-8(ap,n,8), %xmm1
	movdqu	(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, (rp,n,8)
	sub	$8, n
	jg	L(top)

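C Wind down after the main loop: if the remaining count is odd a single
C low limb is left (handled at L(end8)); if it is even, combine the
C bottom two limbs, shifting in zeros from below the low limb.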
L(end):	bt	$0, R32(n)
	jc	L(end8)

	movdqu	(ap), %xmm1
	pxor	%xmm0, %xmm0
	punpcklqdq  %xmm1, %xmm0
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	movdqa	%xmm0, (rp)
	FUNC_EXIT()
	ret

C Basecase
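C Handles n <= 3 with 8-byte movq operations, producing one limb per step
C from an adjacent input pair, working from the top down.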
	ALIGN(16)
L(bc):	dec	R32(n)
	jz	L(end8)

	movq	(ap,n,8), %xmm1
	movq	-8(ap,n,8), %xmm0
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	movq	%xmm0, (rp,n,8)
	sub	$2, R32(n)
	jl	L(end8)
	movq	8(ap), %xmm1
	movq	(ap), %xmm0
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	movq	%xmm0, 8(rp)

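C Bottom limb: it has no lower neighbour, so a plain left shift suffices.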
L(end8):movq	(ap), %xmm0
	psllq	%xmm4, %xmm0
	movq	%xmm0, (rp)
	FUNC_EXIT()
	ret
EPILOGUE()