Home | History | Annotate | Line # | Download | only in hppa
bcopy.S revision 1.4.4.1
      1  1.4.4.1      yamt /*	$NetBSD: bcopy.S,v 1.4.4.1 2007/09/03 14:41:27 yamt Exp $	*/
      2      1.1  fredette 
      3      1.1  fredette /*
      4      1.1  fredette  * Copyright (c) 2002 The NetBSD Foundation, Inc.
      5      1.1  fredette  * All rights reserved.
      6      1.1  fredette  *
      7      1.1  fredette  * This code is derived from software contributed to The NetBSD Foundation
      8      1.1  fredette  * by Matthew Fredette.
      9      1.1  fredette  *
     10      1.1  fredette  * Redistribution and use in source and binary forms, with or without
     11      1.1  fredette  * modification, are permitted provided that the following conditions
     12      1.1  fredette  * are met:
     13      1.1  fredette  * 1. Redistributions of source code must retain the above copyright
     14      1.1  fredette  *    notice, this list of conditions and the following disclaimer.
     15      1.1  fredette  * 2. Redistributions in binary form must reproduce the above copyright
     16      1.1  fredette  *    notice, this list of conditions and the following disclaimer in the
     17      1.1  fredette  *    documentation and/or other materials provided with the distribution.
     18      1.1  fredette  * 3. All advertising materials mentioning features or use of this software
     19      1.1  fredette  *    must display the following acknowledgement:
     20      1.1  fredette  *        This product includes software developed by the NetBSD
     21      1.1  fredette  *        Foundation, Inc. and its contributors.
     22      1.1  fredette  * 4. Neither the name of The NetBSD Foundation nor the names of its
     23      1.1  fredette  *    contributors may be used to endorse or promote products derived
     24      1.1  fredette  *    from this software without specific prior written permission.
     25      1.1  fredette  *
     26      1.1  fredette  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     27      1.1  fredette  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     28      1.1  fredette  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     29      1.1  fredette  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     30      1.1  fredette  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     31      1.1  fredette  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     32      1.1  fredette  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     33      1.1  fredette  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     34      1.1  fredette  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     35      1.1  fredette  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     36      1.1  fredette  * POSSIBILITY OF SUCH DAMAGE.
     37      1.1  fredette  */
     38      1.1  fredette 
     39      1.1  fredette /*
     40      1.1  fredette  * Copy routines for NetBSD/hppa.
     41      1.1  fredette  */
     42      1.1  fredette 
     43      1.1  fredette #undef _LOCORE
     44      1.1  fredette #define _LOCORE	/* XXX fredette - unfortunate */
     45      1.1  fredette #include <machine/asm.h>
     46      1.1  fredette #include <machine/frame.h>
     47      1.4     perry 
     48      1.4     perry #if defined(LIBC_SCCS) && !defined(lint)
     49  1.4.4.1      yamt RCSID("$NetBSD: bcopy.S,v 1.4.4.1 2007/09/03 14:41:27 yamt Exp $")
     50      1.1  fredette #endif /* LIBC_SCCS and not lint */
     51      1.1  fredette 
     52      1.1  fredette /*
     53      1.4     perry  * The stbys instruction is a little asymmetric.  When (%r2 & 3)
     54      1.1  fredette  * is zero, stbys,b,m %r1, 4(%r2) works like stws,ma.  You
     55      1.4     perry  * might then wish that when (%r2 & 3) == 0, stbys,e,m %r1, -4(%r2)
     56      1.1  fredette  * worked like stws,mb.  But it doesn't.
     57      1.1  fredette  *
     58      1.1  fredette  * This macro works around this problem.  It requires that %t2
     59      1.1  fredette  * hold the number of bytes that will be written by this store
     60      1.1  fredette  * (meaning that it ranges from one to four).
     61      1.1  fredette  *
     62      1.4     perry  * Watch the delay-slot trickery here.  The comib is used to set
     63      1.4     perry  * up which instruction, either the stws or the stbys, is run
     64      1.1  fredette  * in the delay slot of the b instruction.
     65      1.1  fredette  */
     66      1.1  fredette #define _STBYS_E_M(r, dst_spc, dst_off)				  \
     67      1.3       chs 	comib,<>	4, %t2, 4	/* picks the store below */	! \
     68      1.1  fredette 	b		4		/* store is in its delay slot */ ! \
     69      1.1  fredette 	stws,mb		r, -4(dst_spc, dst_off)	/* %t2 == 4 case */	! \
     70      1.1  fredette 	stbys,e,m	r, 0(dst_spc, dst_off)	/* %t2 != 4 case */
     71      1.1  fredette 
     72      1.1  fredette /*
     73      1.4     perry  * This macro does a bulk copy with no shifting.  cmplt and m are
     74      1.4     perry  * the completer and displacement multiplier, respectively, for
     75      1.1  fredette  * the load and store instructions.
     76      1.1  fredette  */
     77      1.1  fredette #define _COPY(src_spc, src_off, dst_spc, dst_off, count, cmplt, m) \
     78      1.1  fredette 								! \
     79      1.1  fredette 	/*							! \
     80      1.1  fredette 	 * Loop storing 16 bytes at a time.  Since count 	! \
     81      1.1  fredette 	 * may be > INT_MAX, we have to be careful and		! \
     82      1.1  fredette 	 * avoid comparisons that treat it as a signed 		! \
     83      1.1  fredette 	 * quantity, until after this loop, when count		! \
     84      1.1  fredette 	 * is guaranteed to be less than 16.			! \
     85      1.1  fredette 	 */							! \
     86      1.1  fredette 	comib,>>=,n	15, count, _LABEL(_skip16)		! \
     87      1.1  fredette .label _LABEL(_loop16)						! \
     88      1.1  fredette 	addi		-16, count, count			! \
     89      1.3       chs 	ldws,cmplt	m*4(src_spc, src_off), %t1		! \
     90      1.3       chs 	ldws,cmplt	m*4(src_spc, src_off), %t2		! \
     91      1.3       chs 	ldws,cmplt	m*4(src_spc, src_off), %t3		! \
     92      1.3       chs 	ldws,cmplt	m*4(src_spc, src_off), %t4		! \
     93      1.3       chs 	stws,cmplt	%t1, m*4(dst_spc, dst_off)		! \
     94      1.3       chs 	stws,cmplt	%t2, m*4(dst_spc, dst_off)		! \
     95      1.3       chs 	stws,cmplt	%t3, m*4(dst_spc, dst_off)		! \
     96      1.1  fredette 	comib,<<	15, count, _LABEL(_loop16)		! \
     97      1.3       chs 	stws,cmplt	%t4, m*4(dst_spc, dst_off)		! \
     98      1.1  fredette .label _LABEL(_skip16)						! \
     99      1.1  fredette 								! \
    100      1.1  fredette 	/* Loop storing 4 bytes at a time. */			! \
    101      1.1  fredette 	addib,<,n	-4, count, _LABEL(_skip4)		! \
    102      1.1  fredette .label _LABEL(_loop4)						! \
    103      1.3       chs 	ldws,cmplt	m*4(src_spc, src_off), %t1		! \
    104      1.1  fredette 	addib,>=	-4, count, _LABEL(_loop4)		! \
    105      1.3       chs 	stws,cmplt	%t1, m*4(dst_spc, dst_off)		! \
    106      1.1  fredette .label _LABEL(_skip4)						! \
    107      1.1  fredette 	/* Restore the correct count (now 0..3). */		! \
    108      1.1  fredette 	addi		4, count, count				! \
    109      1.1  fredette 	/* _do1 is also branched to for byte-only copies. */	! \
    110      1.1  fredette .label _LABEL(_do1)						! \
    111      1.1  fredette 								! \
    112      1.1  fredette 	/* Loop storing 1 byte at a time. */			! \
    113      1.1  fredette 	addib,<,n	-1, count, _LABEL(_skip1)		! \
    114      1.1  fredette .label _LABEL(_loop1)						! \
    115      1.3       chs 	ldbs,cmplt	m*1(src_spc, src_off), %t1		! \
    116      1.1  fredette 	addib,>=	-1, count, _LABEL(_loop1)		! \
    117      1.3       chs 	stbs,cmplt	%t1, m*1(dst_spc, dst_off)		! \
    118      1.1  fredette .label _LABEL(_skip1)						! \
    119      1.1  fredette 	/* Exit via _done; delay slot restores count. */	! \
    120      1.1  fredette 	b		_LABEL(_done)				! \
    121      1.1  fredette 	addi		1, count, count
    122      1.1  fredette 
    123      1.1  fredette /*
    124      1.1  fredette  * This macro is definitely strange.  It exists purely to
    125      1.4     perry  * allow the _COPYS macro to be reused, but because it
    126      1.1  fredette  * requires this long attempt to explain it, I'm starting
    127      1.1  fredette  * to doubt the value of that.
    128      1.1  fredette  *
    129      1.1  fredette  * Part of the expansion of the _COPYS macro below are loops
    130      1.1  fredette  * that copy four words or one word at a time, performing shifts
    131      1.1  fredette  * to get data to line up correctly in the destination buffer.
    132      1.1  fredette  *
    133      1.1  fredette  * The _COPYS macro is used when copying backwards, as well
    134      1.3       chs  * as forwards.  The 4-word loop always loads into %t1, %t2, %t3,
    135      1.3       chs  * and %t4 in that order.  This means that when copying forward,
    136      1.3       chs  * %t1 will have the word from the lowest address, and %t4 will
    137      1.4     perry  * have the word from the highest address.  When copying
    138      1.1  fredette  * backwards, the opposite is true.
    139      1.1  fredette  *
    140      1.1  fredette  * The shift instructions need pairs of registers with adjacent
    141      1.4     perry  * words, with the register containing the word from the lowest
    142      1.4     perry  * address *always* coming first.  It is this asymmetry that
    143      1.1  fredette  * gives rise to this macro - depending on which direction
    144      1.1  fredette  * we're copying in, these ordered pairs are different.
    145      1.1  fredette  *
    146      1.4     perry  * Fortunately, we can compute those register numbers at compile
    147      1.4     perry  * time, and assemble them manually into a shift instruction.
    148      1.1  fredette  * That's what this macro does.
    149      1.1  fredette  *
    150      1.1  fredette  * This macro takes three arguments.  n ranges from 0 to 3 and is
    151      1.1  fredette  * the "shift number", i.e., n = 0 means we're doing the shift for
    152      1.1  fredette  * what will be the first store.  t is the target register number.
    153      1.1  fredette  *
    154      1.1  fredette  * m is the displacement multiplier from the _COPYS macro call.
    155      1.1  fredette  * This is 1 for a forward copy and -1 for a backwards copy.
    156      1.1  fredette  * So, the ((m + 1) / 2) term yields 0 for a backwards copy and
    157      1.4     perry  * 1 for a forward copy, and the ((m - 1) / 2) term yields
    158      1.1  fredette  * 0 for a forward copy, and -1 for a backwards copy.
    159      1.1  fredette  * These terms are used to discriminate the register computations
    160      1.1  fredette  * below.
    161      1.1  fredette  *
    162      1.1  fredette  * When copying forward, then, the first register used with
    163      1.3       chs  * the first vshd will be 19 + (3 - ((0 - 1) & 3)), or %t4,
    164      1.1  fredette  * which matches _COPYS' requirement that the word last loaded
    165      1.4     perry  * be in %t4.  The first register used for the second vshd
    166      1.3       chs  * will then "wrap" around to 19 + (3 - ((1 - 1) & 3)), or %t1.
    167      1.3       chs  * And so on to %t2 and %t3.
    168      1.1  fredette  *
    169      1.4     perry  * When copying forward, the second register used with the first
    170      1.4     perry  * vshd will be 19 + (3 - ((0 + 0) & 3)), or %t1.  It will
    171      1.3       chs  * continue to be %t2, then %t3, and finally %t4.
    172      1.1  fredette  *
    173      1.4     perry  * When copying backwards, the values for the first and second
    174      1.4     perry  * register for each vshd are reversed from the forwards case.
    175      1.4     perry  * (Symmetry reclaimed!)  Proving this is "left as an exercise
    176      1.1  fredette  * for the reader" (remember the different discriminating values!)
    177      1.1  fredette  */
    178      1.1  fredette #define _VSHD(n, m, t)						  \
    179      1.1  fredette 	.word (0xd0000000	/* shift insn base word */	| \
    180      1.1  fredette 	((19 + (3 - ((n - 1 * ((m + 1) / 2)) & 3))) << 16) /* 1st reg */ | \
    181      1.1  fredette 	((19 + (3 - ((n + 1 * ((m - 1) / 2)) & 3))) << 21) /* 2nd reg */ | \
    182      1.1  fredette 	(t))	/* target register number */
    183      1.1  fredette 
    184      1.1  fredette /*
    185      1.4     perry  * This macro does a bulk copy with shifting.  cmplt and m are
    186      1.4     perry  * the completer and displacement multiplier, respectively, for
    187      1.1  fredette  * the load and store instructions.  It is assumed that the
    188      1.3       chs  * word last loaded is already in %t4.
    189      1.1  fredette  */
    190      1.1  fredette #define _COPYS(src_spc, src_off, dst_spc, dst_off, count, cmplt, m) \
    191      1.1  fredette 								! \
    192      1.1  fredette 	/*							! \
    193      1.1  fredette 	 * Loop storing 16 bytes at a time.  Since count 	! \
    194      1.1  fredette 	 * may be > INT_MAX, we have to be careful and		! \
    195      1.1  fredette 	 * avoid comparisons that treat it as a signed 		! \
    196      1.1  fredette 	 * quantity, until after this loop, when count		! \
    197      1.1  fredette 	 * is guaranteed to be less than 16.			! \
    198      1.1  fredette 	 */							! \
    199      1.1  fredette 	comib,>>=,n	15, count, _LABEL(S_skip16)		! \
    200      1.1  fredette .label _LABEL(S_loop16)						! \
    201      1.1  fredette 	addi		-16, count, count			! \
    202      1.3       chs 	ldws,cmplt	m*4(src_spc, src_off), %t1		! \
    203      1.3       chs 	ldws,cmplt	m*4(src_spc, src_off), %t2		! \
    204      1.3       chs 	ldws,cmplt	m*4(src_spc, src_off), %t3		! \
    205      1.3       chs 	_VSHD(0, m, 1)	/* vshd %t4, %t1, %r1 */		! \
    206      1.3       chs 	ldws,cmplt	m*4(src_spc, src_off), %t4		! \
    207      1.3       chs 	_VSHD(1, m, 22)	/* vshd %t1, %t2, %t1 */		! \
    208      1.3       chs 	_VSHD(2, m, 21)	/* vshd %t2, %t3, %t2 */		! \
    209      1.3       chs 	_VSHD(3, m, 20)	/* vshd %t3, %t4, %t3 */		! \
    210      1.1  fredette 	stws,cmplt	%r1, m*4(dst_spc, dst_off)		! \
    211      1.3       chs 	stws,cmplt	%t1, m*4(dst_spc, dst_off)		! \
    212      1.3       chs 	stws,cmplt	%t2, m*4(dst_spc, dst_off)		! \
    213      1.1  fredette 	comib,<<	15, count, _LABEL(S_loop16)		! \
    214      1.3       chs 	stws,cmplt	%t3, m*4(dst_spc, dst_off)		! \
    215      1.1  fredette .label _LABEL(S_skip16)						! \
    216      1.1  fredette 								! \
    217      1.1  fredette 	/* Loop storing 4 bytes at a time. */			! \
    218      1.1  fredette 	addib,<,n	-4, count, _LABEL(S_skip4)		! \
    219      1.1  fredette .label _LABEL(S_loop4)						! \
    220      1.3       chs 	ldws,cmplt	m*4(src_spc, src_off), %t1		! \
    221      1.3       chs 	_VSHD(0, m, 1)	/* result into %r1 (t = 1) */		! \
    222      1.3       chs 	copy		%t1, %t4				! \
    223      1.1  fredette 	addib,>=	-4, count, _LABEL(S_loop4)		! \
    224      1.1  fredette 	stws,cmplt	%r1, m*4(dst_spc, dst_off)		! \
    225      1.1  fredette .label _LABEL(S_skip4)						! \
    226      1.1  fredette 								! \
    227      1.1  fredette 	/*							! \
    228      1.1  fredette  	 * We now need to "back up" src_off by the		! \
    229      1.1  fredette 	 * number of bytes remaining in the FIFO		! \
    230      1.3       chs 	 * (i.e., the number of bytes remaining in %t4),	! \
    231      1.1  fredette 	 * because (the correct) count still includes		! \
    232      1.1  fredette 	 * these bytes, and we intend to keep it that		! \
    233      1.1  fredette 	 * way, and finish with the single-byte copier.		! \
    234      1.1  fredette 	 *							! \
    235      1.1  fredette 	 * The number of bytes remaining in the FIFO is		! \
    236      1.1  fredette 	 * related to the shift count, so recover it,		! \
    237      1.1  fredette 	 * restoring the correct count at the same time.	! \
    238      1.1  fredette 	 */							! \
    239      1.3       chs 	mfctl	%cr11, %t1					! \
    240      1.1  fredette 	addi	4, count, count					! \
    241      1.3       chs 	shd	%r0, %t1, 3, %t1				! \
    242      1.1  fredette 								! \
    243      1.1  fredette 	/*							! \
    244      1.1  fredette 	 * If we're copying forward, the shift count		! \
    245      1.1  fredette 	 * is the number of bytes remaining in the		! \
    246      1.1  fredette 	 * FIFO, and we want to subtract it from src_off.	! \
    247      1.1  fredette 	 * If we're copying backwards, (4 - shift count)	! \
    248      1.1  fredette 	 * is the number of bytes remaining in the FIFO,	! \
    249      1.1  fredette 	 * and we want to add it to src_off.			! \
    250      1.1  fredette 	 *							! \
    251      1.1  fredette 	 * We observe that x + (4 - y) = x - (y - 4),		! \
    252      1.1  fredette 	 * and introduce this instruction to add -4 when	! \
    253      1.1  fredette 	 * m is -1, although this does mean one extra		! \
    254      1.1  fredette 	 * instruction in the forward case.			! \
    255      1.1  fredette 	 */							! \
    256      1.3       chs 	addi	4*((m - 1) / 2), %t1, %t1			! \
    257      1.1  fredette 								! \
    258      1.1  fredette 	/* Branch to the byte loop; delay slot fixes src_off. */ ! \
    259      1.1  fredette 	b	_LABEL(_do1)					! \
    260      1.3       chs 	sub	src_off, %t1, src_off
    261      1.1  fredette 
    262      1.1  fredette /*
    263      1.1  fredette  * This macro copies a region in the forward direction.
    264      1.1  fredette  */
    265      1.1  fredette #define _COPY_FORWARD(src_spc, src_off, dst_spc, dst_off, count)  \
    266      1.1  fredette 								! \
    267      1.1  fredette 	/*							! \
    268      1.1  fredette 	 * Since in the shifting-left case we will		! \
    269      1.1  fredette 	 * load 8 bytes before checking count, to		! \
    270      1.1  fredette 	 * keep things simple, branch to the byte 		! \
    271      1.1  fredette 	 * copier unless we're copying at least 8.		! \
    272      1.1  fredette 	 */							! \
    273      1.1  fredette 	comib,>>,n	8, count, _LABEL(_do1)			! \
    274      1.1  fredette 								! \
    275      1.1  fredette 	/*							! \
    276      1.1  fredette 	 * Once we 4-byte align the source offset, 		! \
    277      1.1  fredette 	 * figure out how many bytes from the region		! \
    278      1.1  fredette 	 * will be in the first 4-byte word we read.		! \
    279      1.1  fredette 	 * Ditto for writing the destination offset.		! \
    280      1.1  fredette 	 */							! \
    281      1.3       chs 	extru		src_off, 31, 2, %t1			! \
    282      1.3       chs 	extru		dst_off, 31, 2, %t2			! \
    283      1.3       chs 	subi		4, %t1, %t1				! \
    284      1.3       chs 	subi		4, %t2, %t2				! \
    285      1.1  fredette 								! \
    286      1.1  fredette 	/*							! \
    287      1.1  fredette 	 * Calculate the byte shift required.  A 		! \
    288      1.1  fredette 	 * positive value means a source 4-byte word 		! \
    289      1.1  fredette 	 * has to be shifted to the right to line up 		! \
    290      1.1  fredette 	 * as a destination 4-byte word.			! \
    291      1.1  fredette 	 */							! \
    292      1.3       chs 	sub		%t1, %t2, %t1				! \
    293      1.1  fredette 								! \
    294      1.1  fredette 	/* 4-byte align src_off. */				! \
    295      1.1  fredette 	depi		0, 31, 2, src_off			! \
    296      1.1  fredette 								! \
    297      1.1  fredette 	/*							! \
    298      1.1  fredette 	 * It's somewhat important to note that this		! \
    299      1.1  fredette 	 * code thinks of count as "the number of bytes		! \
    300      1.1  fredette 	 * that haven't been stored yet", as opposed to		! \
    301      1.1  fredette 	 * "the number of bytes that haven't been copied	! \
    302      1.1  fredette 	 * yet".  The distinction is subtle, but becomes	! \
    303      1.1  fredette 	 * apparent at the end of the shifting code, where	! \
    304      1.1  fredette 	 * we "back up" src_off to correspond to count,		! \
    305      1.1  fredette 	 * as opposed to flushing the FIFO.			! \
    306      1.1  fredette 	 *							! \
    307      1.1  fredette 	 * We calculated above how many bytes our first		! \
    308      1.1  fredette 	 * store will store, so update count now.		! \
    309      1.1  fredette 	 *							! \
    310      1.1  fredette 	 * If the shift is zero, strictly as an optimization	! \
    311      1.1  fredette 	 * we use a copy loop that does no shifting.		! \
    312      1.1  fredette 	 */							! \
    313      1.3       chs 	comb,<>		%r0, %t1, _LABEL(_shifting)		! \
    314      1.3       chs 	sub		count, %t2, count			! \
    315      1.1  fredette 								! \
    316      1.1  fredette 	/* Load and store the first word. */			! \
    317      1.3       chs 	ldws,ma		4(src_spc, src_off), %t4		! \
    318      1.3       chs 	stbys,b,m	%t4, 4(dst_spc, dst_off)		! \
    319      1.1  fredette 								! \
    320      1.1  fredette 	/* Do the rest; _COPY ends with a branch to _done. */	! \
    321      1.1  fredette 	_COPY(src_spc,src_off,dst_spc,dst_off,count,ma,1)	! \
    322      1.1  fredette 								! \
    323      1.1  fredette .label _LABEL(_shifting)					! \
    324      1.1  fredette 								! \
    325      1.1  fredette 	/*							! \
    326      1.1  fredette 	 * If shift < 0, we need to shift words to the		! \
    327      1.1  fredette 	 * left.  Since we can't do this directly, we		! \
    328      1.1  fredette 	 * adjust the shift so it's a shift to the right	! \
    329      1.1  fredette 	 * and load the first word into the high word of	! \
    330      1.1  fredette 	 * the FIFO.  Otherwise, we load a zero into the	! \
    331      1.1  fredette 	 * high word of the FIFO.				! \
    332      1.1  fredette 	 */							! \
    333      1.3       chs 	comb,<=		%r0, %t1, _LABEL(_shiftingrt)		! \
    334      1.3       chs 	copy		%r0, %t3				! \
    335      1.3       chs 	addi		4, %t1, %t1				! \
    336      1.3       chs 	ldws,ma		4(src_spc, src_off), %t3		! \
    337      1.1  fredette .label _LABEL(_shiftingrt)					! \
    338      1.1  fredette 								! \
    339      1.1  fredette 	/*							! \
    340      1.1  fredette 	 * Turn the shift byte count into a bit count,		! \
    341      1.1  fredette 	 * load the next word, set the Shift Amount 		! \
    342      1.1  fredette 	 * Register, and form and store the first word.		! \
    343      1.1  fredette 	 */							! \
    344      1.3       chs 	sh3add		%t1, %r0, %t1				! \
    345      1.3       chs 	ldws,ma		4(src_spc, src_off), %t4		! \
    346      1.3       chs 	mtctl		%t1, %cr11				! \
    347      1.3       chs 	vshd		%t3, %t4, %r1				! \
    348      1.1  fredette 	stbys,b,m	%r1, 4(dst_spc, dst_off)		! \
    349      1.1  fredette 								! \
    350      1.1  fredette 	/* Do the rest; _COPYS ends with a branch to _do1. */	! \
    351      1.1  fredette 	_COPYS(src_spc,src_off,dst_spc,dst_off,count,ma,1)
    352      1.1  fredette 
    353      1.1  fredette /* This macro copies a region in the reverse direction. */
    354      1.1  fredette #define _COPY_REVERSE(src_spc, src_off, dst_spc, dst_off, count)  \
    355      1.1  fredette 								! \
    356      1.1  fredette 	/* Immediately add count to both offsets. */		! \
    357      1.1  fredette 	add	src_off, count, src_off				! \
    358      1.1  fredette 	add	dst_off, count, dst_off				! \
    359      1.1  fredette 								! \
    360      1.1  fredette 	/*							! \
    361      1.1  fredette 	 * Since in the shifting-right case we 			! \
    362      1.1  fredette 	 * will load 8 bytes before checking 			! \
    363      1.1  fredette 	 * count, to keep things simple, branch 		! \
    364      1.1  fredette 	 * to the byte copier unless we're 			! \
    365      1.1  fredette 	 * copying at least 8 bytes.				! \
    366      1.1  fredette 	 */							! \
    367      1.1  fredette 	comib,>>,n	8, count, _LABEL(_do1)			! \
    368      1.1  fredette 								! \
    369      1.1  fredette 	/*							! \
    370      1.1  fredette 	 * Once we 4-byte align the source offset, 		! \
    371      1.1  fredette 	 * figure out how many bytes from the region		! \
    372      1.1  fredette 	 * will be in the first 4-byte word we read.		! \
    373      1.1  fredette 	 * Ditto for writing the destination offset.		! \
    374      1.1  fredette 	 */							! \
    375      1.3       chs 	extru,<>	src_off, 31, 2, %t1			! \
    376      1.3       chs 	ldi		4, %t1					! \
    377      1.3       chs 	extru,<>	dst_off, 31, 2, %t2			! \
    378      1.3       chs 	ldi		4, %t2					! \
    379      1.1  fredette 								! \
    380      1.1  fredette 	/*							! \
    381      1.1  fredette 	 * Calculate the byte shift required.  A 		! \
    382      1.1  fredette 	 * positive value means a source 4-byte 		! \
    383      1.1  fredette 	 * word has to be shifted to the right to 		! \
    384      1.1  fredette 	 * line up as a destination 4-byte word.		! \
    385      1.1  fredette 	 */							! \
    386      1.3       chs 	sub		%t2, %t1, %t1				! \
    387      1.1  fredette 								! \
    388      1.1  fredette 	/*							! \
    389      1.1  fredette 	 * 4-byte align src_off, leaving it pointing 		! \
    390      1.1  fredette 	 * to the 4-byte word *after* the next word 		! \
    391      1.1  fredette 	 * we intend to load.					! \
    392      1.1  fredette 	 *							! \
    393      1.1  fredette 	 * It's somewhat important to note that this		! \
    394      1.1  fredette 	 * code thinks of count as "the number of bytes		! \
    395      1.1  fredette 	 * that haven't been stored yet", as opposed to		! \
    396      1.1  fredette 	 * "the number of bytes that haven't been copied	! \
    397      1.1  fredette 	 * yet".  The distinction is subtle, but becomes	! \
    398      1.1  fredette 	 * apparent at the end of the shifting code, where	! \
    399      1.1  fredette 	 * we "back up" src_off to correspond to count,		! \
    400      1.1  fredette 	 * as opposed to flushing the FIFO.			! \
    401      1.1  fredette 	 *							! \
    402      1.1  fredette 	 * We calculated above how many bytes our first		! \
    403      1.1  fredette 	 * store will store, so update count now.		! \
    404      1.1  fredette 	 *							! \
    405      1.1  fredette 	 * If the shift is zero, we use a copy loop that	! \
    406      1.1  fredette 	 * does no shifting.  NB: unlike the forward case,	! \
    407      1.1  fredette 	 * this is NOT strictly an optimization.  If the	! \
    408      1.1  fredette 	 * SAR is zero the vshds do NOT do the right thing.	! \
    409      1.1  fredette 	 * This is another asymmetry more or less the "fault"	! \
    410      1.1  fredette 	 * of vshd.						! \
    411      1.1  fredette 	 */							! \
    412      1.1  fredette 	addi		3, src_off, src_off			! \
    413      1.3       chs 	sub		count, %t2, count			! \
    414      1.3       chs 	comb,<>		%r0, %t1, _LABEL(_shifting)		! \
    415      1.1  fredette 	depi		0, 31, 2, src_off			! \
    416      1.1  fredette 								! \
    417      1.1  fredette 	/* Load and store the first word. */			! \
    418      1.3       chs 	ldws,mb		-4(src_spc, src_off), %t4		! \
    419      1.3       chs 	_STBYS_E_M(%t4, dst_spc, dst_off)			! \
    420      1.1  fredette 								! \
    421      1.1  fredette 	/* Do the rest; _COPY ends with a branch to _done. */	! \
    422      1.1  fredette 	_COPY(src_spc,src_off,dst_spc,dst_off,count,mb,-1)	! \
    423      1.1  fredette 								! \
    424      1.1  fredette .label _LABEL(_shifting)					! \
    425      1.1  fredette 								! \
    426      1.1  fredette 	/*							! \
    427      1.1  fredette 	 * If shift < 0, we need to shift words to the		! \
    428      1.1  fredette 	 * left.  Since we can't do this directly, we		! \
    429      1.1  fredette 	 * adjust the shift so it's a shift to the right	! \
    430      1.1  fredette 	 * and load a zero in to the low word of the FIFO.	! \
    431      1.1  fredette 	 * Otherwise, we load the first word into the		! \
    432      1.1  fredette 	 * low word of the FIFO.				! \
    433      1.1  fredette 	 *							! \
    434      1.1  fredette 	 * Note the nullification trickery here.  We 		! \
    435      1.1  fredette 	 * assume that we're shifting to the left, and		! \
    436      1.1  fredette 	 * load zero into the low word of the FIFO.  Then	! \
    437      1.1  fredette 	 * we nullify the addi if we're shifting to the		! \
    438      1.1  fredette 	 * right.  If the addi is not nullified, we are		! \
    439      1.1  fredette  	 * shifting to the left, so we nullify the load.	! \
    440      1.1  fredette 	 * (comb,n nullifies the addi; addi,tr the load.)	! \
    441      1.1  fredette 	 */							! \
    442      1.3       chs 	copy		%r0, %t3				! \
    443      1.3       chs 	comb,<=,n	%r0, %t1, 0				! \
    444      1.3       chs 	addi,tr		4, %t1, %t1				! \
    445      1.3       chs 	ldws,mb		-4(src_spc, src_off), %t3		! \
    446      1.1  fredette 								! \
    447      1.1  fredette 	/*							! \
    448      1.1  fredette 	 * Turn the shift byte count into a bit count,		! \
    449      1.1  fredette 	 * load the next word, set the Shift Amount 		! \
    450      1.1  fredette 	 * Register, and form and store the first word.		! \
    451      1.1  fredette 	 */							! \
    452      1.3       chs 	sh3add		%t1, %r0, %t1				! \
    453      1.3       chs 	ldws,mb		-4(src_spc, src_off), %t4		! \
    454      1.3       chs 	mtctl		%t1, %cr11				! \
    455      1.3       chs 	vshd		%t4, %t3, %r1				! \
    456      1.1  fredette 	_STBYS_E_M(%r1, dst_spc, dst_off)			! \
    457      1.1  fredette 								! \
    458      1.1  fredette 	/* Do the rest; _COPYS ends with a branch to _do1. */	! \
    459      1.1  fredette 	_COPYS(src_spc,src_off,dst_spc,dst_off,count,mb,-1)
    460      1.1  fredette 
    461      1.1  fredette /*
    462      1.1  fredette  * For paranoia, when things aren't going well, enable this
    463      1.1  fredette  * code to assemble byte-at-a-time-only copying.
    464      1.1  fredette  */
    465      1.1  fredette #if 1	/* currently enabled: replaces both macros defined above */
    466      1.1  fredette #undef _COPY_FORWARD
    467      1.1  fredette #define _COPY_FORWARD(src_spc, src_off, dst_spc, dst_off, count)  \
    468      1.1  fredette 	comb,=,n	%r0, count, _LABEL(_done)  /* nothing to copy */ ! \
    469      1.1  fredette 	ldbs,ma		1(src_spc, src_off), %r1		! \
    470      1.1  fredette 	addib,<>	-1, count, -12	/* -12: back to the ldbs */	! \
    471      1.1  fredette 	stbs,ma		%r1, 1(dst_spc, dst_off)		! \
    472      1.1  fredette 	b,n		_LABEL(_done)
    473      1.1  fredette #undef _COPY_REVERSE
    474      1.1  fredette #define _COPY_REVERSE(src_spc, src_off, dst_spc, dst_off, count)  \
    475      1.1  fredette 	comb,=		%r0, count, _LABEL(_done)  /* nothing to copy */ ! \
    476      1.1  fredette 	add		src_off, count, src_off	/* start past the end */ ! \
    477      1.1  fredette 	add		dst_off, count, dst_off			! \
    478      1.1  fredette 	ldbs,mb		-1(src_spc, src_off), %r1		! \
    479      1.1  fredette 	addib,<>	-1, count, -12	/* -12: back to the ldbs */	! \
    480      1.1  fredette 	stbs,mb		%r1, -1(dst_spc, dst_off)		! \
    481      1.1  fredette 	b,n		_LABEL(_done)
    482      1.1  fredette #endif	/* byte-at-a-time overrides */
    483      1.1  fredette 
    484      1.1  fredette /*
    485      1.1  fredette  * If none of the following are defined, define BCOPY.
    486      1.1  fredette  */
    487      1.1  fredette #if !(defined(SPCOPY) || defined(MEMCPY) || defined(MEMMOVE))
    488      1.1  fredette #define BCOPY	/* default when no other variant was requested */
    489      1.1  fredette #endif
    490      1.1  fredette 
    491      1.1  fredette #if defined(SPCOPY) && !defined(_STANDALONE)
    492      1.1  fredette #include <sys/errno.h>
    493      1.1  fredette #include "assym.h"
    494      1.1  fredette 
    495      1.1  fredette /*
    496      1.1  fredette  * int spcopy(pa_space_t ssp, const void *src, pa_space_t dsp, void *dst,
    497      1.1  fredette  * 	size_t len)
    498      1.1  fredette  *
    499      1.1  fredette  * We assume that the regions do not overlap.
                            *
                            * Copies len bytes from offset src in space ssp to offset dst in
                            * space dsp, always in the forward direction.
                            *
                            * Register usage, as set up by the code below:
                            *   %arg0  ssp on entry; reloaded with len (the fifth argument,
                            *          fetched from the frame) once ssp has gone to %sr1
                            *   %arg1  src offset (modified by the copy)
                            *   %arg2  dsp, moved to %sr2
                            *   %arg3  dst offset (modified by the copy)
                            *   %r31   curlwp, then the pointer loaded from L_ADDR(curlwp);
                            *          kept live to the end so the onfault slot can be cleared
                            *   %ret1  caller's %sr2, restored in the delay slot of the return
                            *   %r1    scratch
                            *
                            * Returns 0 in %ret0 on success.  %ret0 is preloaded with EFAULT
                            * and only cleared on the normal path; entering at spcopy_fault
                            * (the address stored in the onfault slot) skips the clear, so
                            * EFAULT is returned.  Presumably the trap handler resumes at the
                            * onfault address after a faulting access -- that code is not in
                            * this file.
                            *
                            * %sr1 is overwritten and not restored.
    500      1.1  fredette  */
    501      1.1  fredette LEAF_ENTRY(spcopy)
    502      1.1  fredette 
    503      1.1  fredette         /*
    504      1.1  fredette 	 * Setup the fault handler, and load %ret0
    505      1.1  fredette 	 * with EFAULT, assuming the copy will fail.
    506      1.1  fredette 	 */
    507      1.2       chs 	.import	curlwp, data
    508      1.2       chs 	ldil    L%curlwp, %r31
    509      1.2       chs 	ldw     R%curlwp(%r31), %r31
    510      1.1  fredette #ifdef	DIAGNOSTIC
                           	/*
                           	 * If curlwp is NULL, branch (without link) to panic() with
                           	 * the message below in %arg0.  The message string and its
                           	 * alignment padding sit inline after the branch.
                           	 */
    511      1.2       chs 	comb,<>,n %r0, %r31, Lspcopy_curlwp_ok
    512      1.4     perry 	ldil	L%panic, %r1
    513      1.2       chs 	ldil	L%Lspcopy_curlwp_bad, %arg0
    514      1.1  fredette 	ldo	R%panic(%r1), %r1
    515      1.2       chs 	ldo	R%Lspcopy_curlwp_bad(%arg0), %arg0
    516      1.1  fredette 	.call
    517      1.1  fredette 	bv,n    %r0(%r1)
    518      1.1  fredette 	nop
    519      1.3       chs Lspcopy_curlwp_bad:
    520      1.2       chs 	.asciz	"spcopy: curlwp == NULL\n"
    521      1.1  fredette 	.align	8
    522      1.3       chs Lspcopy_curlwp_ok:
    523      1.1  fredette #endif /* DIAGNOSTIC */
                           	/* %r1 = &spcopy_fault; %r31 = L_ADDR(curlwp); arm onfault. */
    524      1.2       chs 	ldil    L%spcopy_fault, %r1
    525      1.2       chs 	ldw     L_ADDR(%r31), %r31
    526      1.2       chs 	ldo     R%spcopy_fault(%r1), %r1
    527      1.1  fredette 	ldi	EFAULT, %ret0
    528      1.2       chs 	stw     %r1, U_PCB+PCB_ONFAULT(%r31)
    529      1.1  fredette 
    530      1.1  fredette 	/* Setup the space registers. */
                           	/* Save the caller's %sr2 in %ret1 before overwriting it. */
    531      1.3       chs 	mfsp	%sr2, %ret1
    532      1.3       chs 	mtsp	%arg0, %sr1
    533      1.3       chs 	mtsp	%arg2, %sr2
    534      1.1  fredette 
    535      1.1  fredette 	/* Get the len argument and do the copy. */
                           	/* %arg0 is free for len now that ssp lives in %sr1. */
    536      1.3       chs 	ldw	HPPA_FRAME_ARG(4)(%sp), %arg0
    537      1.1  fredette #define	_LABEL(l) __CONCAT(spcopy,l)
    538      1.3       chs 	_COPY_FORWARD(%sr1,%arg1,%sr2,%arg3,%arg0)
    539      1.3       chs _LABEL(_done):
    540      1.1  fredette 
    541      1.1  fredette 	/* Return. */
                           	/* Success: replace the preloaded EFAULT with 0. */
    542      1.1  fredette 	copy	%r0, %ret0
                           	/*
                           	 * Shared exit, also entered directly as spcopy_fault with
                           	 * %ret0 still holding EFAULT: clear the onfault slot, then
                           	 * return, restoring the caller's %sr2 in the delay slot.
                           	 */
    543      1.1  fredette ALTENTRY(spcopy_fault)
    544      1.2       chs 	stw     %r0, U_PCB+PCB_ONFAULT(%r31)
    545      1.1  fredette 	bv	%r0(%rp)
    546      1.3       chs 	mtsp	%ret1, %sr2
    547      1.1  fredette EXIT(spcopy)
    548      1.1  fredette #endif /* SPCOPY && !_STANDALONE */
    549      1.1  fredette 
    550      1.1  fredette #ifdef MEMCPY
    551      1.1  fredette /*
    552  1.4.4.1      yamt  * void *memcpy(void *restrict dst, const void *restrict src, size_t len);
    553      1.1  fredette  *
    554      1.1  fredette  * memcpy is specifically restricted to working on
    555      1.1  fredette  * non-overlapping regions, so we can just copy forward.
                            *
                            * On entry %arg0 = dst, %arg1 = src, %arg2 = len.  dst is copied
                            * to %ret0 before the copy starts, because the copy macro
                            * modifies %arg0, %arg1 and %arg2; the original dst is the return
                            * value.  Both source and destination are accessed through %sr0.
    556      1.1  fredette  */
    557      1.1  fredette LEAF_ENTRY(memcpy)
    558      1.1  fredette 	copy	%arg0, %ret0
    559      1.1  fredette #define	_LABEL(l) __CONCAT(memcpy,l)
    560      1.3       chs 	_COPY_FORWARD(%sr0,%arg1,%sr0,%arg0,%arg2)
    561      1.3       chs _LABEL(_done):
    562      1.1  fredette 	bv,n	%r0(%rp)
    563      1.1  fredette 	nop
    564      1.1  fredette EXIT(memcpy)
    565      1.1  fredette #endif /* MEMCPY */
    566      1.1  fredette 
    567      1.1  fredette #ifdef BCOPY
    568      1.1  fredette /*
    569      1.1  fredette  * void bcopy(const void *src, void *dst, size_t len);
                            *
                            * bcopy takes (src, dst), whereas the body shared with memmove
                            * below expects dst in %arg0 and src in %arg1, so the two are
                            * swapped through %r1 before falling through.
    570      1.1  fredette  */
    571      1.1  fredette LEAF_ENTRY(bcopy)
    572      1.1  fredette 	copy	%arg0, %r1
    573      1.1  fredette 	copy	%arg1, %arg0
    574      1.1  fredette 	copy	%r1, %arg1
    575      1.1  fredette 	/* FALLTHROUGH */
    576      1.1  fredette #define _LABEL_F(l) __CONCAT(bcopy_F,l)
    577      1.1  fredette #define _LABEL_R(l) __CONCAT(bcopy_R,l)
    578      1.1  fredette #endif
    579      1.1  fredette 
    580      1.1  fredette #ifdef MEMMOVE
    581      1.1  fredette /*
    582      1.1  fredette  * void *memmove(void *dst, const void *src, size_t len);
                            *
                            * dst is saved in %ret0 up front as the return value, since the
                            * copy macros modify %arg0.
    583      1.1  fredette  */
    584      1.1  fredette LEAF_ENTRY(memmove)
    585      1.1  fredette #define _LABEL_F(l) __CONCAT(memmove_F,l)
    586      1.1  fredette #define _LABEL_R(l) __CONCAT(memmove_R,l)
    587      1.1  fredette 	copy	%arg0, %ret0
    588      1.1  fredette #endif /* MEMMOVE */
    589      1.1  fredette 
    590      1.1  fredette #if defined(BCOPY) || defined(MEMMOVE)
    591      1.1  fredette 
    592      1.1  fredette 	/*
    593      1.1  fredette 	 * If src >= dst or src + len <= dst, we copy
    594      1.1  fredette 	 * forward, else we copy in reverse.
                           	 *
                           	 * Here %arg0 = dst, %arg1 = src, %arg2 = len, and %r1 is
                           	 * set to src + len.  >>= and >> are unsigned comparisons.
                           	 *
                           	 * The first comb uses a literal target of 0 together with
                           	 * ,n: when src >= dst it is meant to nullify the second
                           	 * comb and continue with the forward copy (confirm the
                           	 * displacement convention against the assembler in use).
                           	 * Otherwise the second comb branches to the reverse copy
                           	 * when src + len > dst, and falls into the forward copy
                           	 * when it is not taken.
    595      1.1  fredette 	 */
    596      1.1  fredette 	add		%arg1, %arg2, %r1
    597      1.1  fredette 	comb,>>=,n	%arg1, %arg0, 0
    598      1.1  fredette 	comb,>>,n	%r1, %arg0, _LABEL_R(_go)
    599      1.1  fredette 
                           	/*
                           	 * Forward copy.  The byte-at-a-time _COPY_FORWARD ends with
                           	 * a branch to _LABEL(_done), so it does not run on into the
                           	 * reverse copy that follows.
                           	 */
    600      1.1  fredette #define _LABEL _LABEL_F
    601      1.3       chs 	_COPY_FORWARD(%sr0,%arg1,%sr0,%arg0,%arg2)
    602      1.1  fredette #undef _LABEL
    603      1.1  fredette 
                           	/* Reverse copy, used only for the overlapping src < dst case. */
    604      1.3       chs _LABEL_R(_go):
    605      1.1  fredette #define _LABEL _LABEL_R
    606      1.3       chs 	_COPY_REVERSE(%sr0,%arg1,%sr0,%arg0,%arg2)
    607      1.1  fredette #undef _LABEL
    608      1.4     perry 
                           	/* Both directions finish at the same return sequence. */
    609      1.3       chs _LABEL_F(_done):
    610      1.3       chs _LABEL_R(_done):
    611      1.1  fredette 	bv,n	%r0(%rp)
    612      1.1  fredette 	nop
    613      1.1  fredette #ifdef BCOPY
    614      1.1  fredette EXIT(bcopy)
    615      1.1  fredette #else
    616      1.1  fredette EXIT(memmove)
    617      1.1  fredette #endif
    618      1.1  fredette #endif /* BCOPY || MEMMOVE */
    619