dnl  AMD64 mpn_lshift optimised for CPUs with fast SSE including fast movdqu.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2010-2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C                  cycles/limb     cycles/limb     cycles/limb    good
C                  aligned         unaligned       best seen      for cpu?
C AMD K8,K9         3               3               2.35          no, use shl/shr
C AMD K10           1.5-1.8         1.5-1.8         1.33          yes
C AMD bd1           1.7-1.9         1.7-1.9         1.33          yes
C AMD bobcat        3.17            3.17                          yes, bad for n < 20
C Intel P4          4.67            4.67            2.7           no, slow movdqu
C Intel core2       2.15            2.15            1.25          no, use shld/shrd
C Intel NHM         1.66            1.66            1.25          no, use shld/shrd
C Intel SBR         1.3             1.3             1.25          yes, bad for n = 4-6
C Intel atom       11.7            11.7             4.5           no
C VIA nano          5.7             5.95            2.0           no, slow movdqu

C We try to do as many aligned 16-byte operations as possible.  The top-most
C and bottom-most writes might need 8-byte operations.
C
C This variant relies on fast movdqu loads, and uses movdqu even for aligned
C operands, in order to avoid the need for two separate loops.
C
C TODO
C  * Could the 2-limb wind-down code be simplified?
C  * Improve basecase code, using shld/shrd for SBR, and discrete integer
C    shifts for the other affected CPUs.
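
C For reference, a minimal C sketch of the semantics this routine implements
C (an illustration only, assuming 64-bit limbs and 1 <= cnt <= 63; the name
C ref_lshift is hypothetical, and this sketch is not part of the build):
C
C   #include <gmp.h>
C
C   mp_limb_t
C   ref_lshift (mp_ptr rp, mp_srcptr ap, mp_size_t n, unsigned int cnt)
C   {
C     mp_limb_t retval = ap[n - 1] >> (64 - cnt);  /* bits shifted out */
C     mp_size_t i;
C     for (i = n - 1; i > 0; i--)
C       rp[i] = (ap[i] << cnt) | (ap[i - 1] >> (64 - cnt));
C     rp[0] = ap[0] << cnt;
C     return retval;
C   }
C
C Working from the top limb downwards lets the shift run in place (rp == ap),
C an order the assembly below preserves.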

C INPUT PARAMETERS
define(`rp',  `%rdi')
define(`ap',  `%rsi')
define(`n',   `%rdx')
define(`cnt', `%rcx')

ASM_START()
        TEXT
        ALIGN(64)
PROLOGUE(mpn_lshift)
        FUNC_ENTRY(4)
        movd    R32(%rcx), %xmm4        C xmm4 = cnt
        mov     $64, R32(%rax)
        sub     R32(%rcx), R32(%rax)
        movd    R32(%rax), %xmm5        C xmm5 = 64-cnt

C Compute the return value: the bits shifted out at the top
        neg     R32(%rcx)
        mov     -8(ap,n,8), %rax
        shr     R8(%rcx), %rax

        cmp     $3, n
        jle     L(bc)

        lea     (rp,n,8), R32(%rcx)
        test    $8, R8(%rcx)
        jz      L(rp_aligned)

C Do one initial limb in order to make rp aligned
        movq    -8(ap,n,8), %xmm0
        movq    -16(ap,n,8), %xmm1
        psllq   %xmm4, %xmm0
        psrlq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        movq    %xmm0, -8(rp,n,8)
        dec     n

L(rp_aligned):
C Dispatch on n mod 8 into the four entry points of the unrolled loop
        lea     1(n), %r8d

        and     $6, R32(%r8)
        jz      L(ba0)
        cmp     $4, R32(%r8)
        jz      L(ba4)
        jc      L(ba2)
L(ba6): add     $-4, n
        jmp     L(i56)
L(ba0): add     $-6, n
        jmp     L(i70)
L(ba4): add     $-2, n
        jmp     L(i34)
L(ba2): add     $-8, n
        jle     L(end)

        ALIGN(16)
L(top): movdqu  40(ap,n,8), %xmm1
        movdqu  48(ap,n,8), %xmm0
        psllq   %xmm4, %xmm0
        psrlq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        movdqa  %xmm0, 48(rp,n,8)
L(i70):
        movdqu  24(ap,n,8), %xmm1
        movdqu  32(ap,n,8), %xmm0
        psllq   %xmm4, %xmm0
        psrlq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        movdqa  %xmm0, 32(rp,n,8)
L(i56):
        movdqu  8(ap,n,8), %xmm1
        movdqu  16(ap,n,8), %xmm0
        psllq   %xmm4, %xmm0
        psrlq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        movdqa  %xmm0, 16(rp,n,8)
L(i34):
        movdqu  -8(ap,n,8), %xmm1
        movdqu  (ap,n,8), %xmm0
        psllq   %xmm4, %xmm0
        psrlq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        movdqa  %xmm0, (rp,n,8)
        sub     $8, n
        jg      L(top)

C Wind down: one or two limbs remain at the bottom
L(end): test    $1, R8(n)
        jnz     L(end8)

C Bottom two limbs; punpcklqdq shifts a zero limb in below ap[0]
        movdqu  (ap), %xmm1
        pxor    %xmm0, %xmm0
        punpcklqdq %xmm1, %xmm0
        psllq   %xmm4, %xmm1
        psrlq   %xmm5, %xmm0
        por     %xmm1, %xmm0
        movdqa  %xmm0, (rp)
        FUNC_EXIT()
        ret

C Basecase, n <= 3
        ALIGN(16)
L(bc):  dec     R32(n)
        jz      L(end8)

        movq    (ap,n,8), %xmm1
        movq    -8(ap,n,8), %xmm0
        psllq   %xmm4, %xmm1
        psrlq   %xmm5, %xmm0
        por     %xmm1, %xmm0
        movq    %xmm0, (rp,n,8)
        sub     $2, R32(n)
        jl      L(end8)
        movq    8(ap), %xmm1
        movq    (ap), %xmm0
        psllq   %xmm4, %xmm1
        psrlq   %xmm5, %xmm0
        por     %xmm1, %xmm0
        movq    %xmm0, 8(rp)

C Bottom limb: a plain shift, no bits come in from below
L(end8):movq    (ap), %xmm0
        psllq   %xmm4, %xmm0
        movq    %xmm0, (rp)
        FUNC_EXIT()
        ret
EPILOGUE()
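
C The 2-limb step used throughout the loop above corresponds to the following
C SSE2 intrinsics sketch (an illustration only; the helper name lshift2 and
C its setup are hypothetical, not GMP API):
C
C   #include <emmintrin.h>
C   #include <stdint.h>
C
C   /* Produce rp[i] and rp[i+1], where rp[i] = ap[i]<<cnt | ap[i-1]>>(64-cnt).
C      lsh = _mm_cvtsi32_si128 (cnt); rsh = _mm_cvtsi32_si128 (64 - cnt);
C      rp must be 16-byte aligned, matching the movdqa stores.  */
C   static inline void
C   lshift2 (uint64_t *rp, const uint64_t *ap, __m128i lsh, __m128i rsh)
C   {
C     __m128i hi = _mm_loadu_si128 ((const __m128i *) ap);        /* ap[i], ap[i+1] */
C     __m128i lo = _mm_loadu_si128 ((const __m128i *) (ap - 1));  /* ap[i-1], ap[i] */
C     hi = _mm_sll_epi64 (hi, lsh);                               /* psllq %xmm4 */
C     lo = _mm_srl_epi64 (lo, rsh);                               /* psrlq %xmm5 */
C     _mm_store_si128 ((__m128i *) rp, _mm_or_si128 (hi, lo));    /* por; movdqa */
C   }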