dnl  AMD64 mpn_lshift optimised for CPUs with fast SSE including fast movdqu.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2010-2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb     cycles/limb     cycles/limb    good
C              aligned	      unaligned	      best seen	   for cpu?
C AMD K8,K9	 3		 3		 2.35	  no, use shl/shr
C AMD K10	 1.5-1.8	 1.5-1.8	 1.33	  yes
C AMD bd1	 1.7-1.9	 1.7-1.9	 1.33	  yes
C AMD bobcat	 3.17		 3.17			  yes, bad for n < 20
C Intel P4	 4.67		 4.67		 2.7	  no, slow movdqu
C Intel core2	 2.15		 2.15		 1.25	  no, use shld/shrd
C Intel NHM	 1.66		 1.66		 1.25	  no, use shld/shrd
C Intel SBR	 1.3		 1.3		 1.25	  yes, bad for n = 4-6
C Intel atom	11.7		11.7		 4.5	  no
C VIA nano	 5.7		 5.95		 2.0	  no, slow movdqu

C We try to do as many aligned 16-byte operations as possible.  The top-most
C and bottom-most writes might need 8-byte operations.
C
C This variant relies on fast movdqu loads, and uses them even for aligned
C operands, in order to avoid the need for two separate loops.
C
C TODO
C  * Could the 2-limb wind-down code be simplified?
C  * Improve the basecase code, using shld/shrd for SBR and discrete integer
C    shifts for the other affected CPUs.
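C
C In outline (a sketch of the per-limb operation, for 1 <= cnt <= 63):
C
C	retval = ap[n-1] >> (64-cnt)
C	rp[i]  = (ap[i] << cnt) | (ap[i-1] >> (64-cnt))	  for i = n-1 .. 1
C	rp[0]  = ap[0] << cnt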

C INPUT PARAMETERS
define(`rp',  `%rdi')
define(`ap',  `%rsi')
define(`n',   `%rdx')
define(`cnt', `%rcx')

ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_lshift)
	FUNC_ENTRY(4)
	movd	R32(%rcx), %xmm4	C xmm4 = cnt
	mov	$64, R32(%rax)
	sub	R32(%rcx), R32(%rax)
	movd	R32(%rax), %xmm5	C xmm5 = 64-cnt

	neg	R32(%rcx)
	mov	-8(ap,n,8), %rax	C most significant limb
	shr	R8(%rcx), %rax		C function result: bits shifted out at the top

	cmp	$3, n
	jle	L(bc)			C basecase for n <= 3

	lea	(rp,n,8), R32(%rcx)
	test	$8, R8(%rcx)		C is rp[n] 16-byte aligned?
	jz	L(rp_aligned)

C Do one initial limb in order to make rp aligned
	movq	-8(ap,n,8), %xmm0
	movq	-16(ap,n,8), %xmm1
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movq	%xmm0, -8(rp,n,8)
	dec	n

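C Enter the 8-limb/iteration loop below at the point matching n mod 8 (the
C dispatch keys on (n+1) & 6); the bottom-most one or two limbs are handled
C after the loop, at L(end) and L(end8).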
L(rp_aligned):
	lea	1(n), %r8d

	and	$6, R32(%r8)
	jz	L(ba0)
	cmp	$4, R32(%r8)
	jz	L(ba4)
	jc	L(ba2)
L(ba6):	add	$-4, n
	jmp	L(i56)
L(ba0):	add	$-6, n
	jmp	L(i70)
L(ba4):	add	$-2, n
	jmp	L(i34)
L(ba2):	add	$-8, n
	jle	L(end)

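C Main loop: eight limbs per iteration.  Loads use movdqu (ap may be
C unaligned), stores use movdqa to the now 16-byte aligned rp.  Each 16-byte
C chunk combines two source limbs shifted left by cnt with the two limbs one
C position lower shifted right by 64-cnt.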
	ALIGN(16)
L(top):	movdqu	40(ap,n,8), %xmm1
	movdqu	48(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, 48(rp,n,8)
L(i70):
	movdqu	24(ap,n,8), %xmm1
	movdqu	32(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, 32(rp,n,8)
L(i56):
	movdqu	8(ap,n,8), %xmm1
	movdqu	16(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, 16(rp,n,8)
L(i34):
	movdqu	-8(ap,n,8), %xmm1
	movdqu	(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, (rp,n,8)
	sub	$8, n
	jg	L(top)

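C Wind-down: if n is odd a single low limb remains (handled at L(end8));
C otherwise two limbs remain and are written with one aligned 16-byte store,
C the bottom result limb getting zeros shifted in.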
L(end):	test	$1, R8(n)
	jnz	L(end8)

	movdqu	(ap), %xmm1		C xmm1 = {ap[0], ap[1]} (low, high)
	pxor	%xmm0, %xmm0
	punpcklqdq  %xmm1, %xmm0	C xmm0 = {0, ap[0]} (low, high)
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	movdqa	%xmm0, (rp)
	FUNC_EXIT()
	ret

C Basecase, for n <= 3
	ALIGN(16)
L(bc):	dec	R32(n)
	jz	L(end8)

	movq	(ap,n,8), %xmm1
	movq	-8(ap,n,8), %xmm0
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	movq	%xmm0, (rp,n,8)
	sub	$2, R32(n)
	jl	L(end8)
	movq	8(ap), %xmm1
	movq	(ap), %xmm0
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	movq	%xmm0, 8(rp)

L(end8):movq	(ap), %xmm0		C rp[0] = ap[0] << cnt
	psllq	%xmm4, %xmm0
	movq	%xmm0, (rp)
	FUNC_EXIT()
	ret
EPILOGUE()