Home | History | Annotate | Line # | Download | only in pentium4
      1 dnl  Pentium-4 mpn_copyi -- copy limb vector, incrementing.
      2 
      3 dnl  Copyright 1999-2001 Free Software Foundation, Inc.
      4 
      5 dnl  This file is part of the GNU MP Library.
      6 dnl
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8 dnl  it under the terms of either:
      9 dnl
     10 dnl    * the GNU Lesser General Public License as published by the Free
     11 dnl      Software Foundation; either version 3 of the License, or (at your
     12 dnl      option) any later version.
     13 dnl
     14 dnl  or
     15 dnl
     16 dnl    * the GNU General Public License as published by the Free Software
     17 dnl      Foundation; either version 2 of the License, or (at your option) any
     18 dnl      later version.
     19 dnl
     20 dnl  or both in parallel, as here.
     21 dnl
     22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25 dnl  for more details.
     26 dnl
     27 dnl  You should have received copies of the GNU General Public License and the
     28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29 dnl  see https://www.gnu.org/licenses/.
     30 
     31 
     32 dnl  The rep/movsl is very slow for small blocks on Pentium 4.  Its startup
     33 dnl  time seems to be about 110 cycles.  It then copies at a rate of one
     34 dnl  limb per cycle.  We therefore fall back to an open-coded 2 c/l copying
     35 dnl  loop for sizes of 150 limbs or fewer.
     36 
     37 dnl  Ultimately, we may want to use 64-bit movd or 128-bit movdqu in some
     38 dnl  nifty unrolled arrangement.  Clearly, that could reach much higher
     39 dnl  speeds, at least for large blocks.
     40 
     41 include(`../config.m4')
     42 
     43 
     44 defframe(PARAM_SIZE, 12)	C stack slot read as the limb count
     45 defframe(PARAM_SRC, 8)	C stack slot read as the source pointer
     46 defframe(PARAM_DST,  4)	C stack slot read as the destination pointer; NOTE(review): offsets presumably relative to %esp at entry -- confirm in the macro definitions
     47 
     48 	TEXT	C NOTE(review): presumably selects the code section -- macro comes from config.m4
     49 	ALIGN(8)	C NOTE(review): presumably aligns the entry point to 8 bytes -- confirm
     50 
     51 PROLOGUE(mpn_copyi)
        C  mpn_copyi -- copy PARAM_SIZE limbs of 4 bytes each from PARAM_SRC to
        C  PARAM_DST, working from the lowest address upwards.
        C
        C  Sizes of 150 limbs or fewer use the open-coded loop below.  Larger
        C  sizes jump to L(replmovs) and use rep movsl.
        C
        C  Registers: %eax, %ecx, %edx and the flags are clobbered.  %ebx, %esi
        C  and %edi are saved and restored on the paths that use them.  No stack
        C  space is allocated: %ebx is parked in the PARAM_SIZE slot, and %esi
        C  and %edi are parked in %eax and %edx.
     52 deflit(`FRAME',0)	C NOTE(review): presumably the bytes pushed since entry, as used by the PARAM_ names -- confirm
     53 
     54 	movl	PARAM_SIZE, %ecx	C ecx = number of limbs to copy
     55 	cmpl	$150, %ecx
     56 	jg	L(replmovs)	C more than 150 limbs, signed compare: use rep movsl
     57 
     58 	movl	PARAM_SRC, %eax	C eax = read pointer
     59 	movl	PARAM_DST, %edx	C edx = write pointer
     60 	movl	%ebx, PARAM_SIZE	C save ebx in the size slot; the size is already in ecx
     61 	testl	%ecx, %ecx
     62 	jz	L(end)	C size is zero: nothing to copy, just restore ebx
     63 
     64 L(loop):	C one limb per iteration, ecx counts down to zero
     65 	movl	(%eax), %ebx	C load limb from source
     66 	leal	4(%eax), %eax	C advance source pointer
     67 	addl	$-1, %ecx	C decrement count; sets ZF for the jnz below
     68 	movl	%ebx, (%edx)	C store limb to destination
     69 	leal	4(%edx), %edx	C advance destination pointer; leal leaves the flags alone
     70 
     71 	jnz	L(loop)	C flags still come from the addl above
     72 
     73 L(end):
     74 	movl	PARAM_SIZE, %ebx	C restore ebx from the size slot
     75 	ret
     76 
     77 L(replmovs):
     78 	cld	C better safe than sorry, see mpn/x86/README
     79 
     80 	movl	%esi, %eax	C keep esi in eax, which is otherwise unused on this path
     81 	movl	PARAM_SRC, %esi	C esi = source pointer for movsl
     82 	movl	%edi, %edx	C keep edi in edx
     83 	movl	PARAM_DST, %edi	C edi = destination pointer for movsl
     84 
     85 	rep	C repeat the following movsl ecx times
     86 	movsl	C copy 4 bytes from esi to edi, advancing both pointers
     87 
     88 	movl	%eax, %esi	C restore esi
     89 	movl	%edx, %edi	C restore edi
     90 
     91 	ret
     92 
     93 EPILOGUE()
     94