Home | History | Annotate | Line # | Download | only in mmx
      1 dnl  AMD K6 mpn_rshift -- mpn right shift.
      2 
      3 dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
      4 
      5 dnl  This file is part of the GNU MP Library.
      6 dnl
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8 dnl  it under the terms of either:
      9 dnl
     10 dnl    * the GNU Lesser General Public License as published by the Free
     11 dnl      Software Foundation; either version 3 of the License, or (at your
     12 dnl      option) any later version.
     13 dnl
     14 dnl  or
     15 dnl
     16 dnl    * the GNU General Public License as published by the Free Software
     17 dnl      Foundation; either version 2 of the License, or (at your option) any
     18 dnl      later version.
     19 dnl
     20 dnl  or both in parallel, as here.
     21 dnl
     22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25 dnl  for more details.
     26 dnl
     27 dnl  You should have received copies of the GNU General Public License and the
     28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29 dnl  see https://www.gnu.org/licenses/.
     30 
     31 include(`../config.m4')
     32 
     33 
     34 C K6: 3.0 cycles/limb
     35 
     36 
     37 C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
     38 C                       unsigned shift);
     39 C
     40 C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx
     41 C instructions.  This is despite every second fetch being unaligned.
     42 
     43 
     44 defframe(PARAM_SHIFT,16)	C shift: 4th argument of the prototype above
     45 defframe(PARAM_SIZE, 12)	C size:  3rd argument
     46 defframe(PARAM_SRC,  8)	C src:   2nd argument
     47 defframe(PARAM_DST,  4)	C dst:   1st argument
     48 deflit(`FRAME',0)
     49 
     50 	TEXT
     51 	ALIGN(32)
     52 
     53 PROLOGUE(mpn_rshift)
     54 deflit(`FRAME',0)
     55 	C Right shift size limbs from src into dst; returns src[0] << (32-shift).
     56 	C The 1 limb case can be done without the push %ebx, but it's then
     57 	C still the same speed.  The push is left as a free helping hand for
     58 	C the two_or_more code.
     59 
     60 	movl	PARAM_SIZE, %eax	C eax = size
     61 	pushl	%ebx			FRAME_pushl()	C ebx is restored by the popl before each ret
     62 
     63 	movl	PARAM_SRC, %ebx		C ebx = src
     64 	decl	%eax			C eax = size-1, ZF set only when size == 1
     65 
     66 	movl	PARAM_SHIFT, %ecx	C movl leaves the flags from decl intact
     67 	jnz	L(two_or_more)		C taken unless size == 1 (NOTE(review): size >= 1 presumably guaranteed by callers)
     68 
     69 	movl	(%ebx), %edx		C src limb
     70 	movl	PARAM_DST, %ebx		C ebx = dst, src pointer no longer needed
     71 
     72 	shrdl(	%cl, %edx, %eax)	C return value: eax is 0 here, so eax = edx << (32-shift)
     73 
     74 	shrl	%cl, %edx		C edx = src limb >> shift
     75 
     76 	movl	%edx, (%ebx)		C dst limb
     77 	popl	%ebx
     78 
     79 	ret
     80 
     81 
     82 	ALIGN(16)	C avoid offset 0x1f
     83 L(two_or_more):
     84 	C eax	size-1
     85 	C ebx	src
     86 	C ecx	shift
     87 	C edx	(not yet loaded)
     88 
     89 	movl	(%ebx), %edx	C src low limb
     90 	negl	%ecx		C -shift
     91 
     92 	addl	$32, %ecx	C 32-shift
     93 	movd	PARAM_SHIFT, %mm6	C mm6 = shift count for psrlq
     94 
     95 	shll	%cl, %edx	C retval = src[0] << (32-shift)
     96 	movl	PARAM_DST, %ecx	C ecx = dst, count in cl no longer needed
     97 
     98 	leal	(%ebx,%eax,4), %ebx	C ebx = &src[size-1]
     99 
    100 	leal	-4(%ecx,%eax,4), %ecx	C ecx = &dst[size-2]
    101 	negl	%eax			C eax = -(size-1), counts up to 0
    102 
    103 
    104 L(simple):
    105 	C eax	counter (negative)
    106 	C ebx	&src[size-1]
    107 	C ecx	&dst[size-2]
    108 	C edx	retval
    109 	C
    110 	C mm0	scratch
    111 	C mm6	shift
    112 
    113 Zdisp(	movq,	0,(%ebx,%eax,4), %mm0)	C mm0 = limb pair src[i+1]:src[i], i = size-1+eax
    114 	incl	%eax			C sets ZF for the jnz; psrlq and movd do not touch flags
    115 
    116 	psrlq	%mm6, %mm0		C low dword = dst[i], high dword = src[i+1] >> shift
    117 
    118 Zdisp(	movd,	%mm0, 0,(%ecx,%eax,4))	C store low dword to dst[i]
    119 	jnz	L(simple)
    120 
    121 
    122 	movq	%mm0, (%ecx)		C last pair: rewrites dst[size-2] and stores dst[size-1]
    123 	movl	%edx, %eax		C return value
    124 
    125 	popl	%ebx
    126 
    127 	emms				C clear MMX state before returning
    128 	ret
    129 
    130 EPILOGUE()
    131