Home | History | Annotate | Line # | Download | only in string
bcopy.S revision 1.4
      1 /*-
      2  * Copyright (c) 1990 The Regents of the University of California.
      3  * All rights reserved.
      4  *
      5  * This code is derived from locore.s.
      6  *
      7  * Redistribution and use in source and binary forms, with or without
      8  * modification, are permitted provided that the following conditions
      9  * are met:
     10  * 1. Redistributions of source code must retain the above copyright
     11  *    notice, this list of conditions and the following disclaimer.
     12  * 2. Redistributions in binary form must reproduce the above copyright
     13  *    notice, this list of conditions and the following disclaimer in the
     14  *    documentation and/or other materials provided with the distribution.
     15  * 3. Neither the name of the University nor the names of its contributors
     16  *    may be used to endorse or promote products derived from this software
     17  *    without specific prior written permission.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     29  * SUCH DAMAGE.
     30  */
     31 
     32 #include <machine/asm.h>
     33 
     34 #if defined(LIBC_SCCS)
     35 	RCSID("$NetBSD: bcopy.S,v 1.4 2009/11/22 17:25:47 dsl Exp $")
     36 #endif
     37 
     38 	/*
     39 	 * (ov)bcopy (src,dst,cnt)
     40 	 *  ws (at) tools.de     (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
     41 	 *
     42 	 * Hacked about by dsl (at) netbsd.org
        	 *
        	 * One body, three entry points selected by the preprocessor:
        	 *   MEMCOPY defined -> memcpy  (also defines NO_OVERLAP, so the
        	 *                      overlap test and the backward copies are
        	 *                      compiled out)
        	 *   MEMMOVE defined -> memmove
        	 *   neither         -> bcopy
        	 *
        	 * On entry the byte count is in %rdx.  memcpy/memmove take
        	 * %rdi = destination, %rsi = source and return the destination in
        	 * %rax; bcopy receives the two pointers the other way round and
        	 * swaps them, so that below here %rdi is always the destination
        	 * and %rsi the source.
        	 *
        	 * Registers that may be written: %rax (memcpy/memmove only),
        	 * %rcx, %rdx, %rsi, %rdi, %r8 - %r11.
     43 	 */
     44 
     45 #ifdef MEMCOPY
     46 ENTRY(memcpy)
     47 #define NO_OVERLAP
     48 #else
     49 #ifdef MEMMOVE
     50 ENTRY(memmove)
     51 #else
     52 ENTRY(bcopy)
     53 #endif
     54 #endif
     55 	movq	%rdx,%rcx	/* %rcx = byte count; becomes a word count below */
     56 #if defined(MEMCOPY) || defined(MEMMOVE)
     57 	movq	%rdi,%rax	/* must return destination address */
     58 	mov	%rdi,%r11	/* for misaligned check */
     59 #else
     60 	mov	%rsi,%r11	/* for misaligned check (bcopy: dst is 2nd arg) */
     61 	xchgq	%rdi,%rsi	/* bcopy() has arg order reversed */
     62 #endif
     63 
     64 #if !defined(NO_OVERLAP)
     65 	movq	%rdi,%r8	/* %r8 = dst - src; compared (unsigned) */
     66 	subq	%rsi,%r8	/* with cnt to pick the copy direction */
     67 #endif
     68 
     69 	shrq	$3,%rcx		/* count for copy by words */
     70 	jz	8f		/* j if less than 8 bytes (ZF from the shrq) */
     71 
     72 	lea	-8(%rdi,%rdx),%r9	/* target address of last 8 */
     73 	mov	-8(%rsi,%rdx),%r10	/* get last word (before any store) */
     74 #if !defined(NO_OVERLAP)
     75 	cmpq	%rdx,%r8	/* overlapping? (dst - src < cnt, unsigned) */
     76 	jb	10f
     77 #endif
     78 
     79 /*
     80  * Non-overlapping, copy forwards.
         * (Reached whenever dst - src, taken as unsigned, is at least cnt;
         * the memcpy build makes no test and always comes here.)
         * The last 8 source bytes were loaded into %r10 above and are
         * stored at %r9 (dst + cnt - 8) after the word copy; that store
         * takes care of the cnt % 8 trailing bytes.
     81  * Newer Intel cpus (Nehalem) will do 16byte read/write transfers
     82  * if %ecx is more than 76.
     83  * AMD might do something similar some day.
     84  */
     85 	and	$7,%r11		/* destination misaligned ? */
     86 	jnz	2f
     87 	rep
     88 	movsq
     89 	mov	%r10,(%r9)	/* write last word */
     90 	ret
     91 
     92 /*
     93  * Destination misaligned
     94  * AMD say it is better to align the destination (not the source).
     95  * (This will also re-align copies if the source and dest are both
     96  * misaligned by the same amount.)
     97  * (I think Nehalem will use its accelerated copy if the source
     98  * and destination have the same alignment.)
         *
         * With m = dst & 7 (1..7, held in %r11): the first 8 source bytes
         * are loaded up front, both pointers are advanced by 8 - m so that
         * the destination is 8-byte aligned, and (cnt + m - 9) / 8 words
         * are copied from there.  The saved first word and last word are
         * stored afterwards and cover the bytes at either end that the
         * word copy did not write.
     99  */
    100 2:
    101 	lea	-9(%r11,%rdx),%rcx	/* post re-alignment count (bytes - 1) */
    102 	neg	%r11			/* now -1 .. -7 */
    103 	mov	(%rsi),%rdx		/* get first word (cnt is dead now) */
    104 	mov	%rdi,%r8		/* target for first word */
    105 	lea	8(%rsi,%r11),%rsi	/* src += 8 - m */
    106 	lea	8(%rdi,%r11),%rdi	/* dst += 8 - m, now 8-byte aligned */
    107 	shr	$3,%rcx			/* bytes -> words */
    108 	rep
    109 	movsq
    110 	mov	%rdx,(%r8)		/* write first word */
    111 	mov	%r10,(%r9)		/* write last word */
    112 	ret
    113 
    114 #if !defined(NO_OVERLAP)
    115 /* Must copy backwards.
    116  * Reverse copy is probably easy to code faster than 'rep movsq'
    117  * since that requires (IIRC) an extra clock every 3 iterations (AMD).
    118  * However I don't suppose anything cares that much!
    119  * The big cost is the std/cld pair - reputedly 50+ cycles on Netburst P4.
    120  * The copy is aligned with the buffer start (more likely to
    121  * be a multiple of 8 than the end).
         * %rcx still holds cnt / 8 here; the cnt % 8 bytes at the end are
         * covered by the final store of the last word saved in %r10.
    122  */
    123 10:
    124 	lea	-8(%rsi,%rcx,8),%rsi	/* last whole word, counted from src */
    125 	lea	-8(%rdi,%rcx,8),%rdi	/* matching word in dst */
    126 	std				/* DF=1: movsq steps downwards */
    127 	rep
    128 	movsq
    129 	cld				/* back to DF=0 before returning */
    130 	mov	%r10,(%r9)	/* write last bytes */
    131 	ret
    132 #endif
    133 
    134 /* Less than 8 bytes to copy, copy by bytes */
    135 /* Intel Nehalem optimise 'rep movsb' for <= 7 bytes (9-15 clocks).
    136  * For longer transfers it is 50+ !
    137  */
    138 8:	mov	%rdx,%rcx	/* byte count (0..7) for rep movsb */
    139 
    140 #if !defined(NO_OVERLAP)
    141 	cmpq	%rdx,%r8	/* overlapping? (dst - src < cnt, unsigned) */
    142 	jb	81f
    143 #endif
    144 
    145 	/* nope, copy forwards. */
    146 	rep
    147 	movsb
    148 	ret
    149 
    150 #if !defined(NO_OVERLAP)
    151 /* Must copy backwards */
    152 81:
    153 	lea	-1(%rsi,%rcx),%rsi	/* last source byte */
    154 	lea	-1(%rdi,%rcx),%rdi	/* last destination byte */
    155 	std				/* DF=1: movsb steps downwards */
    156 	rep
    157 	movsb
    158 	cld				/* back to DF=0 before returning */
    159 	ret
    160 #endif
    161