dnl  AMD K7 mpn_lshift -- mpn left shift.

dnl  Copyright 1999-2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C      K7: 1.21 cycles/limb (at 16 limbs/loop).



dnl  K7: UNROLL_COUNT cycles/limb
dnl           4           1.51
dnl           8           1.26
dnl          16           1.21
dnl          32           1.2
dnl  Maximum possible with the current code is 64.
deflit(UNROLL_COUNT, 16)


C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       unsigned shift);
C
C Shift src,size left by shift many bits and store the result in dst,size.
C Zeros are shifted in at the right.  The bits shifted out at the left are
C the return value.
C
C The comments in mpn_rshift apply here too.

ifdef(`PIC',`
deflit(UNROLL_THRESHOLD, 10)
',`
deflit(UNROLL_THRESHOLD, 10)
')

defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,  8)
defframe(PARAM_DST,  4)

defframe(SAVE_EDI, -4)
defframe(SAVE_ESI, -8)
defframe(SAVE_EBX, -12)
deflit(SAVE_SIZE, 12)

	TEXT
	ALIGN(32)

PROLOGUE(mpn_lshift)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %eax
	movl	PARAM_SRC, %edx
	subl	$SAVE_SIZE, %esp
deflit(`FRAME',SAVE_SIZE)

	movl	PARAM_SHIFT, %ecx
	movl	%edi, SAVE_EDI

	movl	PARAM_DST, %edi
	decl	%eax
	jnz	L(more_than_one_limb)

	movl	(%edx), %edx

	shldl(	%cl, %edx, %eax)	C eax was decremented to zero

	shll	%cl, %edx

	movl	%edx, (%edi)
	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp

	ret


C -----------------------------------------------------------------------------
L(more_than_one_limb):
	C eax	size-1
	C ebx
	C ecx	shift
	C edx	src
	C esi
	C edi	dst
	C ebp

	movd	PARAM_SHIFT, %mm6
	movd	(%edx,%eax,4), %mm5	C src high limb
	cmp	$UNROLL_THRESHOLD-1, %eax

	jae	L(unroll)
	negl	%ecx
	movd	(%edx), %mm4		C src low limb

	addl	$32, %ecx

	movd	%ecx, %mm7

L(simple_top):
	C eax	loop counter, limbs
	C ebx
	C ecx
	C edx	src
	C esi
	C edi	dst
	C ebp
	C
	C mm0	scratch
	C mm4	src low limb
	C mm5	src high limb
	C mm6	shift
	C mm7	32-shift

	movq	-4(%edx,%eax,4), %mm0
	decl	%eax

	psrlq	%mm7, %mm0

	movd	%mm0, 4(%edi,%eax,4)
	jnz	L(simple_top)


	psllq	%mm6, %mm5
	psllq	%mm6, %mm4

	psrlq	$32, %mm5
	movd	%mm4, (%edi)		C dst low limb

	movd	%mm5, %eax		C return value

	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp
	emms

	ret


C -----------------------------------------------------------------------------
	ALIGN(16)
L(unroll):
	C eax	size-1
	C ebx	(saved)
	C ecx	shift
	C edx	src
	C esi
	C edi	dst
	C ebp
	C
	C mm5	src high limb, for return value
	C mm6	lshift

	movl	%esi, SAVE_ESI
	movl	%ebx, SAVE_EBX
	leal	-4(%edx,%eax,4), %edx   C &src[size-2]

	testb	$4, %dl
	movq	(%edx), %mm1		C src high qword

	jz	L(start_src_aligned)


	C src isn't aligned, process high limb (marked xxx) separately to
	C make it so
	C
	C  source     -4(edx,%eax,4)
	C                   |
	C  +-------+-------+-------+--
	C  |  xxx          |
	C  +-------+-------+-------+--
	C        0mod8   4mod8   0mod8
	C
	C  dest       -4(edi,%eax,4)
	C                   |
	C  +-------+-------+--
	C  |  xxx  |       |
	C  +-------+-------+--

	psllq	%mm6, %mm1
	subl	$4, %edx
	movl	%eax, PARAM_SIZE	C size-1

	psrlq	$32, %mm1
	decl	%eax			C size-2 is new size-1

	movd	%mm1, 4(%edi,%eax,4)
	movq	(%edx), %mm1		C new src high qword
L(start_src_aligned):


	leal	-4(%edi,%eax,4), %edi   C &dst[size-2]
	psllq	%mm6, %mm5

	testl	$4, %edi
	psrlq	$32, %mm5		C return value

	jz	L(start_dst_aligned)


	C dst isn't aligned, subtract 4 bytes to make it so, and pretend the
	C shift is 32 bits extra.  High limb of dst (marked xxx) handled
	C here separately.
	C
	C  source        %edx
	C  +-------+-------+--
	C  |      mm1      |
	C  +-------+-------+--
	C        0mod8   4mod8
	C
	C  dest          %edi
	C  +-------+-------+-------+--
	C  |  xxx  |       |
	C  +-------+-------+-------+--
	C        0mod8   4mod8   0mod8

	movq	%mm1, %mm0
	psllq	%mm6, %mm1
	addl	$32, %ecx		C shift+32

	psrlq	$32, %mm1

	movd	%mm1, 4(%edi)
	movq	%mm0, %mm1
	subl	$4, %edi

	movd	%ecx, %mm6		C new lshift
L(start_dst_aligned):

	decl	%eax			C size-2, two last limbs handled at end
	movq	%mm1, %mm2		C copy of src high qword
	negl	%ecx

	andl	$-2, %eax		C round size down to even
	addl	$64, %ecx

	movl	%eax, %ebx
	negl	%eax

	andl	$UNROLL_MASK, %eax
	decl	%ebx

	shll	%eax

	movd	%ecx, %mm7		C rshift = 64-lshift

ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	leal	L(entry) (%eax,%eax,4), %esi
')
	shrl	$UNROLL_LOG2, %ebx	C loop counter

	leal	ifelse(UNROLL_BYTES,256,128) -8(%edx,%eax,2), %edx
	leal	ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
	movl	PARAM_SIZE, %eax	C for use at end
	jmp	*%esi


ifdef(`PIC',`
L(pic_calc):
	C See mpn/x86/README about old gas bugs
	leal	(%eax,%eax,4), %esi
	addl	$L(entry)-L(here), %esi
	addl	(%esp), %esi

	ret_internal
')


C -----------------------------------------------------------------------------
	ALIGN(32)
L(top):
	C eax	size (for use at end)
	C ebx	loop counter
	C ecx	rshift
	C edx	src
	C esi	computed jump
	C edi	dst
	C ebp
	C
	C mm0	scratch
	C mm1	\ carry (alternating, mm2 first)
	C mm2	/
	C mm6	lshift
	C mm7	rshift
	C
	C 10 code bytes/limb
	C
	C The two chunks differ in whether mm1 or mm2 hold the carry.
	C The computed jump puts the initial carry in both mm1 and mm2.

L(entry):
deflit(CHUNK_COUNT, 4)
forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
	deflit(`disp0', eval(-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
	deflit(`disp1', eval(disp0 - 8))

Zdisp(	movq,	disp0,(%edx), %mm0)
	psllq	%mm6, %mm2

	movq	%mm0, %mm1
	psrlq	%mm7, %mm0

	por	%mm2, %mm0
Zdisp(	movq,	%mm0, disp0,(%edi))


Zdisp(	movq,	disp1,(%edx), %mm0)
	psllq	%mm6, %mm1

	movq	%mm0, %mm2
	psrlq	%mm7, %mm0

	por	%mm1, %mm0
Zdisp(	movq,	%mm0, disp1,(%edi))
')

	subl	$UNROLL_BYTES, %edx
	subl	$UNROLL_BYTES, %edi
	decl	%ebx

	jns	L(top)



define(`disp', `m4_empty_if_zero(eval($1 ifelse(UNROLL_BYTES,256,-128)))')

L(end):
	testb	$1, %al
	movl	SAVE_EBX, %ebx
	psllq	%mm6, %mm2	C wanted left shifted in all cases below

	movd	%mm5, %eax

	movl	SAVE_ESI, %esi
	jz	L(end_even)


L(end_odd):

	C Size odd, destination was aligned.
	C
	C                 source        edx+8   edx+4
	C                 --+---------------+-------+
	C                   |      mm2      |       |
	C                 --+---------------+-------+
	C
	C dest            edi
	C --+---------------+---------------+-------+
	C   |   written     |               |       |
	C --+---------------+---------------+-------+
	C
	C mm6 = shift
	C mm7 = ecx = 64-shift


	C Size odd, destination was unaligned.
	C
	C                 source        edx+8   edx+4
	C                 --+---------------+-------+
	C                   |      mm2      |       |
	C                 --+---------------+-------+
	C
	C dest            edi
	C --+---------------+---------------+
	C   |   written     |               |
	C --+---------------+---------------+
	C
	C mm6 = shift+32
	C mm7 = ecx = 64-(shift+32)


	C In both cases there's one extra limb of src to fetch and combine
	C with mm2 to make a qword at (%edi), and in the aligned case
	C there's an extra limb of dst to be formed from that extra src limb
	C left shifted.

	movd	disp(4) (%edx), %mm0
	testb	$32, %cl

	movq	%mm0, %mm1
	psllq	$32, %mm0

	psrlq	%mm7, %mm0
	psllq	%mm6, %mm1

	por	%mm2, %mm0

	movq	%mm0, disp(0) (%edi)
	jz	L(end_odd_unaligned)
	movd	%mm1, disp(-4) (%edi)
L(end_odd_unaligned):

	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp
	emms

	ret


L(end_even):

	C Size even, destination was aligned.
	C
	C source          edx+8
	C --+---------------+
	C   |      mm2      |
	C --+---------------+
	C
	C dest            edi
	C --+---------------+---------------+
	C   |   written     |               |
	C --+---------------+---------------+
	C
	C mm6 = shift
	C mm7 = ecx = 64-shift


	C Size even, destination was unaligned.
	C
	C source          edx+8
	C --+---------------+
	C   |      mm2      |
	C --+---------------+
	C
	C dest            edi+4
	C --+---------------+-------+
	C   |    written    |       |
	C --+---------------+-------+
	C
	C mm6 = shift+32
	C mm7 = ecx = 64-(shift+32)


	C The movq for the aligned case overwrites the movd for the
	C unaligned case.

	movq	%mm2, %mm0
	psrlq	$32, %mm2

	testb	$32, %cl
	movd	%mm2, disp(4) (%edi)

	jz	L(end_even_unaligned)
	movq	%mm0, disp(0) (%edi)
L(end_even_unaligned):

	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp
	emms

	ret

EPILOGUE()