1 1.1 mrg dnl Pentium-4 mpn_copyi -- copy limb vector, incrementing. 2 1.1 mrg 3 1.1.1.2 mrg dnl Copyright 1999-2001 Free Software Foundation, Inc. 4 1.1.1.2 mrg 5 1.1 mrg dnl This file is part of the GNU MP Library. 6 1.1 mrg dnl 7 1.1.1.2 mrg dnl The GNU MP Library is free software; you can redistribute it and/or modify 8 1.1.1.2 mrg dnl it under the terms of either: 9 1.1.1.2 mrg dnl 10 1.1.1.2 mrg dnl * the GNU Lesser General Public License as published by the Free 11 1.1.1.2 mrg dnl Software Foundation; either version 3 of the License, or (at your 12 1.1.1.2 mrg dnl option) any later version. 13 1.1.1.2 mrg dnl 14 1.1.1.2 mrg dnl or 15 1.1.1.2 mrg dnl 16 1.1.1.2 mrg dnl * the GNU General Public License as published by the Free Software 17 1.1.1.2 mrg dnl Foundation; either version 2 of the License, or (at your option) any 18 1.1.1.2 mrg dnl later version. 19 1.1.1.2 mrg dnl 20 1.1.1.2 mrg dnl or both in parallel, as here. 21 1.1.1.2 mrg dnl 22 1.1.1.2 mrg dnl The GNU MP Library is distributed in the hope that it will be useful, but 23 1.1.1.2 mrg dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 1.1.1.2 mrg dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 1.1.1.2 mrg dnl for more details. 26 1.1 mrg dnl 27 1.1.1.2 mrg dnl You should have received copies of the GNU General Public License and the 28 1.1.1.2 mrg dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29 1.1.1.2 mrg dnl see https://www.gnu.org/licenses/. 30 1.1 mrg 31 1.1 mrg 32 1.1 mrg dnl The rep/movsl is very slow for small blocks on pentium4. Its startup 33 1.1 mrg dnl time seems to be about 110 cycles. It then copies at a rate of one 34 1.1 mrg dnl limb per cycle. We therefore fall back to an open-coded 2 c/l copying 35 1.1 mrg dnl loop for smaller sizes. 36 1.1 mrg 37 1.1 mrg dnl Ultimately, we may want to use 64-bit movd or 128-bit movdqu in some 38 1.1 mrg dnl nifty unrolled arrangement. 
dnl  Clearly, that could reach much higher speeds, at least for large blocks.

include(`../config.m4')


dnl  Offsets of the three stack arguments.
defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

dnl  Limb counts strictly greater than this go through rep movsl; counts at
dnl  or below it use the open-coded loop.
deflit(`REP_MOVSL_THRESHOLD', 150)

	TEXT
	ALIGN(8)

C  mpn_copyi -- copy PARAM_SIZE limbs from PARAM_SRC to PARAM_DST, walking
C  both pointers upwards.
C
C  Open-coded path register roles:
C    eax  source pointer
C    edx  destination pointer
C    ecx  limbs still to copy
C    ebx  limb in flight, incoming value is parked in the PARAM_SIZE slot
C
C  rep movsl path register roles:
C    esi, edi  source and destination pointers
C    ecx       limb count
C    eax, edx  hold the incoming esi and edi until the copy is finished
C
C  ebx, esi and edi are put back to their entry values before returning.

PROLOGUE(mpn_copyi)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx		C ecx = size
	cmpl	$REP_MOVSL_THRESHOLD, %ecx
	jg	L(rep_copy)			C signed compare, big blocks only

	movl	PARAM_SRC, %eax
	movl	PARAM_DST, %edx
	movl	%ebx, PARAM_SIZE		C size is in ecx now, reuse its slot
	testl	%ecx, %ecx
	jz	L(done)				C nothing to copy when size is zero

L(top):
	C One limb per pass.  The pointers are stepped with leal, and the
	C store is a plain movl, so the zero flag produced by addl is still
	C intact when jnz tests it.
	movl	(%eax), %ebx
	leal	4(%eax), %eax
	addl	$-1, %ecx
	movl	%ebx, (%edx)
	leal	4(%edx), %edx

	jnz	L(top)

L(done):
	movl	PARAM_SIZE, %ebx		C restore the parked ebx
	ret

L(rep_copy):
	cld	C better safe than sorry, see mpn/x86/README

	movl	%esi, %eax			C stash esi
	movl	PARAM_SRC, %esi
	movl	%edi, %edx			C stash edi
	movl	PARAM_DST, %edi

	rep
	movsl					C copy ecx limbs, esi to edi

	movl	%eax, %esi			C restore esi
	movl	%edx, %edi			C restore edi

	ret

EPILOGUE()