Home | History | Annotate | Line # | Download | only in mmx
lshift.asm revision 1.1.1.2
      1 dnl  AMD K6 mpn_lshift -- mpn left shift.
      2 
      3 dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
      4 
      5 dnl  This file is part of the GNU MP Library.
      6 dnl
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8 dnl  it under the terms of either:
      9 dnl
     10 dnl    * the GNU Lesser General Public License as published by the Free
     11 dnl      Software Foundation; either version 3 of the License, or (at your
     12 dnl      option) any later version.
     13 dnl
     14 dnl  or
     15 dnl
     16 dnl    * the GNU General Public License as published by the Free Software
     17 dnl      Foundation; either version 2 of the License, or (at your option) any
     18 dnl      later version.
     19 dnl
     20 dnl  or both in parallel, as here.
     21 dnl
     22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25 dnl  for more details.
     26 dnl
     27 dnl  You should have received copies of the GNU General Public License and the
     28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29 dnl  see https://www.gnu.org/licenses/.
     30 
     31 include(`../config.m4')
     32 
     33 
     34 C K6: 3.0 cycles/limb
     35 
     36 
     37 C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
     38 C                       unsigned shift);
     39 C
     40 C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx
     41 C instructions.  This is despite every second fetch being unaligned.
     42 
     43 
     44 defframe(PARAM_SHIFT,16)	C shift, 4th argument in the prototype above
     45 defframe(PARAM_SIZE, 12)	C size, 3rd argument
     46 defframe(PARAM_SRC,  8)	C src, 2nd argument
     47 defframe(PARAM_DST,  4)	C dst, 1st argument
     48 	C NOTE(review): defframe comes from the m4 includes; the numbers are presumably byte offsets from the entry stack pointer -- confirm.
     49 	TEXT
     50 	ALIGN(32)
     51 
     52 PROLOGUE(mpn_lshift)
     53 deflit(`FRAME',0)
     54 	C Shifts the size limbs at src left by shift bits, stores them at dst and
     55 	C returns in eax the bits shifted out of the top limb.  A single limb is
     56 	C handled inline below, larger sizes go to two_or_more.  The 1 limb case
     57 	C can be done without the push %ebx, but it's then still the same speed.
     58 	C The push is left as a free helping hand for the two_or_more code.
     59 	movl	PARAM_SIZE, %eax		C eax = size
     60 	pushl	%ebx			FRAME_pushl()	C save ebx; popped before each ret
     61 
     62 	movl	PARAM_SRC, %ebx		C ebx = src
     63 	decl	%eax			C eax = size-1, ZF set when size is 1
     64 
     65 	movl	PARAM_SHIFT, %ecx	C ecx = shift; movl keeps the decl flags
     66 	jnz	L(two_or_more)		C taken when size-1 is non-zero
     67 	C One limb.  eax is 0 here (size-1), so it can receive the return value.
     68 	movl	(%ebx), %edx		C src limb
     69 	movl	PARAM_DST, %ebx		C ebx = dst, src pointer no longer needed
     70 	C NOTE(review): shldl() comes from the m4 includes; presumably emits shld -- confirm.
     71 	shldl(	%cl, %edx, %eax)	C return value: bits shifted out of edx
     72 
     73 	shll	%cl, %edx		C edx = limb << shift
     74 
     75 	movl	%edx, (%ebx)		C dst limb
     76 	popl	%ebx			C restore ebx
     77 
     78 	ret
     79 
     80 	C Two or more limbs: MMX code, working from the top limb down to limb 0.
     81 	ALIGN(16)	C avoid offset 0x1f
     82 	nop		C avoid bad cache line crossing
     83 L(two_or_more):
     84 	C eax	size-1
     85 	C ebx	src
     86 	C ecx	shift
     87 	C edx	not yet written on this path
     88 	C retval = high limb >> (32-shift), i.e. the bits shifted out at the top.
     89 	movl	(%ebx,%eax,4), %edx	C src high limb
     90 	negl	%ecx			C -shift
     91 
     92 	movd	PARAM_SHIFT, %mm6	C mm6 = shift, used by the final psllq
     93 	addl	$32, %ecx		C 32-shift
     94 	C NOTE(review): presumably shift is 1..31; a shrl count of 32 would act as 0.
     95 	shrl	%cl, %edx		C edx = retval
     96 
     97 	movd	%ecx, %mm7		C mm7 = 32-shift, the psrlq count in the loop
     98 	movl	PARAM_DST, %ecx		C ecx = dst, the shift count now lives in mm6/mm7
     99 	C Each pass forms dst[i] from the limb pair src[i-1], src[i], where i is eax at the loop top.
    100 L(top):
    101 	C eax	counter, size-1 to 1
    102 	C ebx	src
    103 	C ecx	dst
    104 	C edx	retval
    105 	C
    106 	C mm0	scratch
    107 	C mm6	shift
    108 	C mm7	32-shift
    109 
    110 	movq	-4(%ebx,%eax,4), %mm0	C mm0 = src[i] in the high dword, src[i-1] in the low
    111 	decl	%eax			C sets ZF for the jnz; the MMX ops below leave flags alone
    112 
    113 	psrlq	%mm7, %mm0		C low dword = src[i]<<shift | src[i-1]>>(32-shift)
    114 
    115 	movd	%mm0, 4(%ecx,%eax,4)	C dst[eax+1], which is dst[i] since eax was decremented
    116 	jnz	L(top)			C loop until eax reaches 0
    117 	C Limb 0 has no lower neighbour: shift it alone so zeros enter at the bottom.
    118 
    119 	movd	(%ebx), %mm0		C mm0 = src[0], zero extended to 64 bits
    120 	popl	%ebx			C restore ebx
    121 
    122 	psllq	%mm6, %mm0		C mm0 <<= shift
    123 	movl	%edx, %eax		C return value
    124 
    125 	movd	%mm0, (%ecx)		C dst[0] = low 32 bits of mm0
    126 
    127 	emms				C empty the MMX state before returning
    128 	ret
    129 
    130 EPILOGUE()
    131