Home | History | Annotate | Line # | Download | only in pentium4
      1      1.1  mrg dnl  Pentium-4 mpn_copyi -- copy limb vector, incrementing.
      2      1.1  mrg 
      3  1.1.1.2  mrg dnl  Copyright 1999-2001 Free Software Foundation, Inc.
      4  1.1.1.2  mrg 
      5      1.1  mrg dnl  This file is part of the GNU MP Library.
      6      1.1  mrg dnl
      7  1.1.1.2  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8  1.1.1.2  mrg dnl  it under the terms of either:
      9  1.1.1.2  mrg dnl
     10  1.1.1.2  mrg dnl    * the GNU Lesser General Public License as published by the Free
     11  1.1.1.2  mrg dnl      Software Foundation; either version 3 of the License, or (at your
     12  1.1.1.2  mrg dnl      option) any later version.
     13  1.1.1.2  mrg dnl
     14  1.1.1.2  mrg dnl  or
     15  1.1.1.2  mrg dnl
     16  1.1.1.2  mrg dnl    * the GNU General Public License as published by the Free Software
     17  1.1.1.2  mrg dnl      Foundation; either version 2 of the License, or (at your option) any
     18  1.1.1.2  mrg dnl      later version.
     19  1.1.1.2  mrg dnl
     20  1.1.1.2  mrg dnl  or both in parallel, as here.
     21  1.1.1.2  mrg dnl
     22  1.1.1.2  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23  1.1.1.2  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24  1.1.1.2  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25  1.1.1.2  mrg dnl  for more details.
     26      1.1  mrg dnl
     27  1.1.1.2  mrg dnl  You should have received copies of the GNU General Public License and the
     28  1.1.1.2  mrg dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29  1.1.1.2  mrg dnl  see https://www.gnu.org/licenses/.
     30      1.1  mrg 
     31      1.1  mrg 
     32      1.1  mrg dnl  The rep/movsl is very slow for small blocks on pentium4.  Its startup
     33      1.1  mrg dnl  time seems to be about 110 cycles.  It then copies at a rate of one
     34      1.1  mrg dnl  limb per cycle.  We therefore fall back to an open-coded 2 c/l copying
     35      1.1  mrg dnl  loop for smaller sizes.
     36      1.1  mrg 
     37      1.1  mrg dnl  Ultimately, we may want to use 64-bit movq or 128-bit movdqu in some
     38      1.1  mrg dnl  nifty unrolled arrangement.  Clearly, that could reach much higher
     39      1.1  mrg dnl  speeds, at least for large blocks.
     40      1.1  mrg 
     41      1.1  mrg include(`../config.m4')
     42      1.1  mrg 
     43      1.1  mrg 
     44      1.1  mrg defframe(PARAM_SIZE, 12)
     45      1.1  mrg defframe(PARAM_SRC, 8)
     46      1.1  mrg defframe(PARAM_DST,  4)
     47      1.1  mrg C Argument slots used below as memory operands: dst at 4, src at 8, size at 12.  Presumably byte offsets from %esp at entry with the return address at 0 -- TODO confirm against the defframe definition.
     48      1.1  mrg 	TEXT
     49      1.1  mrg 	ALIGN(8)
     50      1.1  mrg 
     51      1.1  mrg PROLOGUE(mpn_copyi)
     52      1.1  mrg deflit(`FRAME',0)
     53      1.1  mrg C Copy PARAM_SIZE limbs (4 bytes each) from PARAM_SRC to PARAM_DST, lowest address first.  Sizes above 150 take the rep movsl path, all others use the open-coded loop.
     54      1.1  mrg 	movl	PARAM_SIZE, %ecx	C ecx = limb count
     55      1.1  mrg 	cmpl	$150, %ecx	C threshold between the open-coded loop and rep movsl
     56      1.1  mrg 	jg	L(replmovs)	C signed compare: size > 150 goes to rep movsl
     57      1.1  mrg 
     58      1.1  mrg 	movl	PARAM_SRC, %eax	C eax = source pointer
     59      1.1  mrg 	movl	PARAM_DST, %edx	C edx = destination pointer
     60      1.1  mrg 	movl	%ebx, PARAM_SIZE	C park ebx in the size slot, whose value is already in ecx
     61      1.1  mrg 	testl	%ecx, %ecx
     62      1.1  mrg 	jz	L(end)	C size 0: skip the loop.  NOTE(review): only zero is filtered here, a negative size would enter the loop -- presumably callers guarantee size >= 0.
     63      1.1  mrg C Loop: ecx = limbs remaining (nonzero on entry), eax and edx advance 4 bytes per iteration, ebx is the transfer register.
     64      1.1  mrg L(loop):
     65      1.1  mrg 	movl	(%eax), %ebx	C load one limb
     66      1.1  mrg 	leal	4(%eax), %eax	C advance src; lea leaves the flags alone
     67      1.1  mrg 	addl	$-1, %ecx	C decrement the count; this sets ZF for the jnz below
     68      1.1  mrg 	movl	%ebx, (%edx)	C store the limb; mov does not alter the flags
     69      1.1  mrg 	leal	4(%edx), %edx	C advance dst, again without touching the flags
     70      1.1  mrg 
     71      1.1  mrg 	jnz	L(loop)	C repeat until the count decremented by addl reaches zero
     72      1.1  mrg 
     73      1.1  mrg L(end):
     74      1.1  mrg 	movl	PARAM_SIZE, %ebx	C restore the ebx value parked in the size slot
     75      1.1  mrg 	ret
     76      1.1  mrg 
     77      1.1  mrg L(replmovs):
     78      1.1  mrg 	cld	C better safe than sorry, see mpn/x86/README
     79      1.1  mrg C movsl needs esi and edi; their incoming values are kept in eax and edx rather than on the stack.
     80      1.1  mrg 	movl	%esi, %eax	C save esi
     81      1.1  mrg 	movl	PARAM_SRC, %esi	C esi = source pointer
     82      1.1  mrg 	movl	%edi, %edx	C save edi
     83      1.1  mrg 	movl	PARAM_DST, %edi	C edi = destination pointer
     84      1.1  mrg 
     85      1.1  mrg 	rep	C repeat count is ecx, still holding the size loaded at entry
     86      1.1  mrg 	movsl	C copy 4 bytes from (%esi) to (%edi), stepping both upwards since the direction flag was cleared
     87      1.1  mrg 
     88      1.1  mrg 	movl	%eax, %esi	C restore esi
     89      1.1  mrg 	movl	%edx, %edi	C restore edi
     90      1.1  mrg 
     91      1.1  mrg 	ret
     92      1.1  mrg 
     93      1.1  mrg EPILOGUE()
     94