bcopy.S revision 1.1.4.2 1 1.1.4.2 jdolecek /* $NetBSD: bcopy.S,v 1.1.4.2 2002/06/23 17:49:46 jdolecek Exp $ */
2 1.1.4.2 jdolecek
3 1.1.4.2 jdolecek /*
4 1.1.4.2 jdolecek * Copyright (c) 2002 The NetBSD Foundation, Inc.
5 1.1.4.2 jdolecek * All rights reserved.
6 1.1.4.2 jdolecek *
7 1.1.4.2 jdolecek * This code is derived from software contributed to The NetBSD Foundation
8 1.1.4.2 jdolecek * by Matthew Fredette.
9 1.1.4.2 jdolecek *
10 1.1.4.2 jdolecek * Redistribution and use in source and binary forms, with or without
11 1.1.4.2 jdolecek * modification, are permitted provided that the following conditions
12 1.1.4.2 jdolecek * are met:
13 1.1.4.2 jdolecek * 1. Redistributions of source code must retain the above copyright
14 1.1.4.2 jdolecek * notice, this list of conditions and the following disclaimer.
15 1.1.4.2 jdolecek * 2. Redistributions in binary form must reproduce the above copyright
16 1.1.4.2 jdolecek * notice, this list of conditions and the following disclaimer in the
17 1.1.4.2 jdolecek * documentation and/or other materials provided with the distribution.
18 1.1.4.2 jdolecek * 3. All advertising materials mentioning features or use of this software
19 1.1.4.2 jdolecek * must display the following acknowledgement:
20 1.1.4.2 jdolecek * This product includes software developed by the NetBSD
21 1.1.4.2 jdolecek * Foundation, Inc. and its contributors.
22 1.1.4.2 jdolecek * 4. Neither the name of The NetBSD Foundation nor the names of its
23 1.1.4.2 jdolecek * contributors may be used to endorse or promote products derived
24 1.1.4.2 jdolecek * from this software without specific prior written permission.
25 1.1.4.2 jdolecek *
26 1.1.4.2 jdolecek * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
27 1.1.4.2 jdolecek * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28 1.1.4.2 jdolecek * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 1.1.4.2 jdolecek * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
30 1.1.4.2 jdolecek * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 1.1.4.2 jdolecek * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 1.1.4.2 jdolecek * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 1.1.4.2 jdolecek * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 1.1.4.2 jdolecek * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 1.1.4.2 jdolecek * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 1.1.4.2 jdolecek * POSSIBILITY OF SUCH DAMAGE.
37 1.1.4.2 jdolecek */
38 1.1.4.2 jdolecek
39 1.1.4.2 jdolecek /*
40 1.1.4.2 jdolecek * Copy routines for NetBSD/hppa.
41 1.1.4.2 jdolecek */
42 1.1.4.2 jdolecek
43 1.1.4.2 jdolecek #undef _LOCORE
44 1.1.4.2 jdolecek #define _LOCORE /* XXX fredette - unfortunate */
45 1.1.4.2 jdolecek #include <machine/asm.h>
46 1.1.4.2 jdolecek #include <machine/frame.h>
47 1.1.4.2 jdolecek
48 1.1.4.2 jdolecek #if defined(LIBC_SCCS) && !defined(lint)
49 1.1.4.2 jdolecek RCSID("$NetBSD: bcopy.S,v 1.1.4.2 2002/06/23 17:49:46 jdolecek Exp $")
50 1.1.4.2 jdolecek #endif /* LIBC_SCCS and not lint */
51 1.1.4.2 jdolecek
52 1.1.4.2 jdolecek /*
53 1.1.4.2 jdolecek * The stbys instruction is a little asymmetric. When (%r2 & 3)
54 1.1.4.2 jdolecek * is zero, stbys,b,m %r1, 4(%r2) works like stws,ma. You
55 1.1.4.2 jdolecek * might then wish that when (%r2 & 3) == 0, stbys,e,m %r1, -4(%r2)
56 1.1.4.2 jdolecek * worked like stws,mb. But it doesn't.
57 1.1.4.2 jdolecek *
58 1.1.4.2 jdolecek * This macro works around this problem. It requires that %t2
59 1.1.4.2 jdolecek * hold the number of bytes that will be written by this store
60 1.1.4.2 jdolecek * (meaning that it ranges from one to four).
61 1.1.4.2 jdolecek *
62 1.1.4.2 jdolecek * Watch the delay-slot trickery here. The comib is used to set
63 1.1.4.2 jdolecek * up which instruction, either the stws or the stbys, is run
64 1.1.4.2 jdolecek * in the delay slot of the b instruction.
 *
 * Macro arguments: r is the register holding the data to store, and
 * dst_spc / dst_off are the destination space register and offset
 * register. When t2 is 4 the stws,mb form is the store that runs;
 * for any other value the stbys,e,m form runs instead.
 *
 * The branch targets written as a bare 4 are numeric displacements
 * rather than labels, so expanding this macro defines no symbols.
 * NOTE(review): the displacements are presumably relative to the
 * address of the branch plus 8 - confirm against the PA-RISC manual
 * before inserting or removing instructions here.
65 1.1.4.2 jdolecek */
66 1.1.4.2 jdolecek #define _STBYS_E_M(r, dst_spc, dst_off) \
67 1.1.4.2 jdolecek comib,<> 4, t2, 4 ! \
68 1.1.4.2 jdolecek b 4 ! \
69 1.1.4.2 jdolecek stws,mb r, -4(dst_spc, dst_off) ! \
70 1.1.4.2 jdolecek stbys,e,m r, 0(dst_spc, dst_off)
71 1.1.4.2 jdolecek
72 1.1.4.2 jdolecek /*
73 1.1.4.2 jdolecek * This macro does a bulk copy with no shifting. cmplt and m are
74 1.1.4.2 jdolecek * the completer and displacement multiplier, respectively, for
75 1.1.4.2 jdolecek * the load and store instructions.
 *
 * src_spc/src_off and dst_spc/dst_off are the space and offset
 * registers of the source and destination, and count is the register
 * holding the byte count. The callers in this file pass (ma, 1) to
 * copy forward and (mb, -1) to copy backwards.
 *
 * t1 through t4 are used as scratch registers. The expansion defines
 * the labels _LABEL(_skip16), _LABEL(_loop16), _LABEL(_skip4),
 * _LABEL(_loop4), _LABEL(_do1), _LABEL(_loop1) and _LABEL(_skip1),
 * and finishes by branching to _LABEL(_done), which the code that
 * uses this macro has to define. _LABEL(_do1) is also used as an
 * entry point by code outside this macro that only wants the
 * byte-at-a-time loop.
76 1.1.4.2 jdolecek */
77 1.1.4.2 jdolecek #define _COPY(src_spc, src_off, dst_spc, dst_off, count, cmplt, m) \
78 1.1.4.2 jdolecek ! \
79 1.1.4.2 jdolecek /* ! \
80 1.1.4.2 jdolecek * Loop storing 16 bytes at a time. Since count ! \
81 1.1.4.2 jdolecek * may be > INT_MAX, we have to be careful and ! \
82 1.1.4.2 jdolecek * avoid comparisons that treat it as a signed ! \
83 1.1.4.2 jdolecek * quantity, until after this loop, when count ! \
84 1.1.4.2 jdolecek * is guaranteed to be less than 16. ! \
85 1.1.4.2 jdolecek */ ! \
86 1.1.4.2 jdolecek comib,>>=,n 15, count, _LABEL(_skip16) ! \
87 1.1.4.2 jdolecek .label _LABEL(_loop16) ! \
88 1.1.4.2 jdolecek addi -16, count, count ! \
89 1.1.4.2 jdolecek ldws,cmplt m*4(src_spc, src_off), t1 ! \
90 1.1.4.2 jdolecek ldws,cmplt m*4(src_spc, src_off), t2 ! \
91 1.1.4.2 jdolecek ldws,cmplt m*4(src_spc, src_off), t3 ! \
92 1.1.4.2 jdolecek ldws,cmplt m*4(src_spc, src_off), t4 ! \
93 1.1.4.2 jdolecek stws,cmplt t1, m*4(dst_spc, dst_off) ! \
94 1.1.4.2 jdolecek stws,cmplt t2, m*4(dst_spc, dst_off) ! \
95 1.1.4.2 jdolecek stws,cmplt t3, m*4(dst_spc, dst_off) ! \
96 1.1.4.2 jdolecek comib,<< 15, count, _LABEL(_loop16) ! \
97 1.1.4.2 jdolecek stws,cmplt t4, m*4(dst_spc, dst_off) ! \
98 1.1.4.2 jdolecek .label _LABEL(_skip16) ! \
99 1.1.4.2 jdolecek ! \
100 1.1.4.2 jdolecek /* Loop storing 4 bytes at a time. */ ! \
101 1.1.4.2 jdolecek addib,<,n -4, count, _LABEL(_skip4) ! \
102 1.1.4.2 jdolecek .label _LABEL(_loop4) ! \
103 1.1.4.2 jdolecek ldws,cmplt m*4(src_spc, src_off), t1 ! \
104 1.1.4.2 jdolecek addib,>= -4, count, _LABEL(_loop4) ! \
105 1.1.4.2 jdolecek stws,cmplt t1, m*4(dst_spc, dst_off) ! \
106 1.1.4.2 jdolecek .label _LABEL(_skip4) ! \
107 1.1.4.2 jdolecek /* Restore the correct count. */ ! \
108 1.1.4.2 jdolecek addi 4, count, count ! \
109 1.1.4.2 jdolecek ! \
110 1.1.4.2 jdolecek .label _LABEL(_do1) ! \
111 1.1.4.2 jdolecek ! \
112 1.1.4.2 jdolecek /* Loop storing 1 byte at a time. */ ! \
113 1.1.4.2 jdolecek addib,<,n -1, count, _LABEL(_skip1) ! \
114 1.1.4.2 jdolecek .label _LABEL(_loop1) ! \
115 1.1.4.2 jdolecek ldbs,cmplt m*1(src_spc, src_off), t1 ! \
116 1.1.4.2 jdolecek addib,>= -1, count, _LABEL(_loop1) ! \
117 1.1.4.2 jdolecek stbs,cmplt t1, m*1(dst_spc, dst_off) ! \
118 1.1.4.2 jdolecek .label _LABEL(_skip1) ! \
119 1.1.4.2 jdolecek /* Restore the correct count. */ ! \
120 1.1.4.2 jdolecek b _LABEL(_done) ! \
121 1.1.4.2 jdolecek addi 1, count, count
122 1.1.4.2 jdolecek
123 1.1.4.2 jdolecek /*
124 1.1.4.2 jdolecek * This macro is definitely strange. It exists purely to
125 1.1.4.2 jdolecek * allow the _COPYS macro to be reused, but because it
126 1.1.4.2 jdolecek * requires this long attempt to explain it, I'm starting
127 1.1.4.2 jdolecek * to doubt the value of that.
128 1.1.4.2 jdolecek *
129 1.1.4.2 jdolecek * Part of the expansion of the _COPYS macro below are loops
130 1.1.4.2 jdolecek * that copy four words or one word at a time, performing shifts
131 1.1.4.2 jdolecek * to get data to line up correctly in the destination buffer.
132 1.1.4.2 jdolecek *
133 1.1.4.2 jdolecek * The _COPYS macro is used when copying backwards, as well
134 1.1.4.2 jdolecek * as forwards. The 4-word loop always loads into t1, t2, t3,
135 1.1.4.2 jdolecek * and t4 in that order. This means that when copying forward,
136 1.1.4.2 jdolecek * t1 will have the word from the lowest address, and t4 will
137 1.1.4.2 jdolecek * have the word from the highest address. When copying
138 1.1.4.2 jdolecek * backwards, the opposite is true.
139 1.1.4.2 jdolecek *
140 1.1.4.2 jdolecek * The shift instructions need pairs of registers with adjacent
141 1.1.4.2 jdolecek * words, with the register containing the word from the lowest
142 1.1.4.2 jdolecek * address *always* coming first. It is this asymmetry that
143 1.1.4.2 jdolecek * gives rise to this macro - depending on which direction
144 1.1.4.2 jdolecek * we're copying in, these ordered pairs are different.
145 1.1.4.2 jdolecek *
146 1.1.4.2 jdolecek * Fortunately, we can compute those register numbers at compile
147 1.1.4.2 jdolecek * time, and assemble them manually into a shift instruction.
148 1.1.4.2 jdolecek * That's what this macro does.
149 1.1.4.2 jdolecek *
150 1.1.4.2 jdolecek * This macro takes two arguments. n ranges from 0 to 3 and
151 1.1.4.2 jdolecek * is the "shift number", i.e., n = 0 means we're doing the
152 1.1.4.2 jdolecek * shift for what will be the first store.
153 1.1.4.2 jdolecek *
154 1.1.4.2 jdolecek * m is the displacement multiplier from the _COPYS macro call.
155 1.1.4.2 jdolecek * This is 1 for a forward copy and -1 for a backwards copy.
156 1.1.4.2 jdolecek * So, the ((m + 1) / 2) term yields 0 for a backwards copy and
157 1.1.4.2 jdolecek * 1 for a forward copy, and the ((m - 1) / 2) term yields
158 1.1.4.2 jdolecek * 0 for a forward copy, and -1 for a backwards copy.
159 1.1.4.2 jdolecek * These terms are used to discriminate the register computations
160 1.1.4.2 jdolecek * below.
161 1.1.4.2 jdolecek *
 * (Besides n and m there is a third argument, t: the number of the
 * general register that receives the result. It is ORed into the
 * low bits of the instruction word. The expansions in _COPYS pass
 * 1 for %r1, and 22, 21 and 20 for t1, t2 and t3.)
 *
 * None of the arguments is parenthesized in the expansion, so they
 * should only ever be plain constants.
 *
162 1.1.4.2 jdolecek * When copying forward, then, the first register used with
163 1.1.4.2 jdolecek * the first vshd will be 19 + (3 - ((0 - 1) & 3)), or t4,
164 1.1.4.2 jdolecek * which matches _COPYS' requirement that the word last loaded
165 1.1.4.2 jdolecek * be in t4. The first register used for the second vshd
166 1.1.4.2 jdolecek * will then "wrap" around to 19 + (3 - ((1 - 1) & 3)), or t1.
167 1.1.4.2 jdolecek * And so on to t2 and t3.
168 1.1.4.2 jdolecek *
169 1.1.4.2 jdolecek * When copying forward, the second register used with the first
170 1.1.4.2 jdolecek * vshd will be 19 + (3 - ((0 + 0) & 3)), or t1. It will
171 1.1.4.2 jdolecek * continue to be t2, then t3, and finally t4.
172 1.1.4.2 jdolecek *
173 1.1.4.2 jdolecek * When copying backwards, the values for the first and second
174 1.1.4.2 jdolecek * register for each vshd are reversed from the forwards case.
175 1.1.4.2 jdolecek * (Symmetry reclaimed!) Proving this is "left as an exercise
176 1.1.4.2 jdolecek * for the reader" (remember the different discriminating values!)
177 1.1.4.2 jdolecek */
178 1.1.4.2 jdolecek #define _VSHD(n, m, t) \
179 1.1.4.2 jdolecek .word (0xd0000000 | \
180 1.1.4.2 jdolecek ((19 + (3 - ((n - 1 * ((m + 1) / 2)) & 3))) << 16) | \
181 1.1.4.2 jdolecek ((19 + (3 - ((n + 1 * ((m - 1) / 2)) & 3))) << 21) | \
182 1.1.4.2 jdolecek (t))
183 1.1.4.2 jdolecek
184 1.1.4.2 jdolecek /*
185 1.1.4.2 jdolecek * This macro does a bulk copy with shifting. cmplt and m are
186 1.1.4.2 jdolecek * the completer and displacement multiplier, respectively, for
187 1.1.4.2 jdolecek * the load and store instructions. It is assumed that the
188 1.1.4.2 jdolecek * word last loaded is already in t4.
 *
 * src_spc/src_off and dst_spc/dst_off are the space and offset
 * registers of the source and destination, and count is the register
 * holding the byte count. The shift amount is taken from %cr11,
 * which the code expanding this macro must have loaded (with mtctl)
 * beforehand; it is read back with mfctl near the end of the macro.
 *
 * %r1 and t1 through t4 are used as scratch registers. The expansion
 * defines _LABEL(S_skip16), _LABEL(S_loop16), _LABEL(S_skip4) and
 * _LABEL(S_loop4), and ends by branching to _LABEL(_do1). That label
 * comes from an expansion of _COPY, so both macros have to be
 * expanded under the same _LABEL definition.
189 1.1.4.2 jdolecek */
190 1.1.4.2 jdolecek #define _COPYS(src_spc, src_off, dst_spc, dst_off, count, cmplt, m) \
191 1.1.4.2 jdolecek ! \
192 1.1.4.2 jdolecek /* ! \
193 1.1.4.2 jdolecek * Loop storing 16 bytes at a time. Since count ! \
194 1.1.4.2 jdolecek * may be > INT_MAX, we have to be careful and ! \
195 1.1.4.2 jdolecek * avoid comparisons that treat it as a signed ! \
196 1.1.4.2 jdolecek * quantity, until after this loop, when count ! \
197 1.1.4.2 jdolecek * is guaranteed to be less than 16. ! \
198 1.1.4.2 jdolecek */ ! \
199 1.1.4.2 jdolecek comib,>>=,n 15, count, _LABEL(S_skip16) ! \
200 1.1.4.2 jdolecek .label _LABEL(S_loop16) ! \
201 1.1.4.2 jdolecek addi -16, count, count ! \
202 1.1.4.2 jdolecek ldws,cmplt m*4(src_spc, src_off), t1 ! \
203 1.1.4.2 jdolecek ldws,cmplt m*4(src_spc, src_off), t2 ! \
204 1.1.4.2 jdolecek ldws,cmplt m*4(src_spc, src_off), t3 ! \
205 1.1.4.2 jdolecek _VSHD(0, m, 1) /* vshd t4, t1, %r1 */ ! \
206 1.1.4.2 jdolecek ldws,cmplt m*4(src_spc, src_off), t4 ! \
207 1.1.4.2 jdolecek _VSHD(1, m, 22) /* vshd t1, t2, t1 */ ! \
208 1.1.4.2 jdolecek _VSHD(2, m, 21) /* vshd t2, t3, t2 */ ! \
209 1.1.4.2 jdolecek _VSHD(3, m, 20) /* vshd t3, t4, t3 */ ! \
210 1.1.4.2 jdolecek stws,cmplt %r1, m*4(dst_spc, dst_off) ! \
211 1.1.4.2 jdolecek stws,cmplt t1, m*4(dst_spc, dst_off) ! \
212 1.1.4.2 jdolecek stws,cmplt t2, m*4(dst_spc, dst_off) ! \
213 1.1.4.2 jdolecek comib,<< 15, count, _LABEL(S_loop16) ! \
214 1.1.4.2 jdolecek stws,cmplt t3, m*4(dst_spc, dst_off) ! \
215 1.1.4.2 jdolecek .label _LABEL(S_skip16) ! \
216 1.1.4.2 jdolecek ! \
217 1.1.4.2 jdolecek /* Loop storing 4 bytes at a time. */ ! \
218 1.1.4.2 jdolecek addib,<,n -4, count, _LABEL(S_skip4) ! \
219 1.1.4.2 jdolecek .label _LABEL(S_loop4) ! \
220 1.1.4.2 jdolecek ldws,cmplt m*4(src_spc, src_off), t1 ! \
221 1.1.4.2 jdolecek _VSHD(0, m, 1) /* into r1 (1) */ ! \
222 1.1.4.2 jdolecek copy t1, t4 ! \
223 1.1.4.2 jdolecek addib,>= -4, count, _LABEL(S_loop4) ! \
224 1.1.4.2 jdolecek stws,cmplt %r1, m*4(dst_spc, dst_off) ! \
225 1.1.4.2 jdolecek .label _LABEL(S_skip4) ! \
226 1.1.4.2 jdolecek ! \
227 1.1.4.2 jdolecek /* ! \
228 1.1.4.2 jdolecek * We now need to "back up" src_off by the ! \
229 1.1.4.2 jdolecek * number of bytes remaining in the FIFO ! \
230 1.1.4.2 jdolecek * (i.e., the number of bytes remaining in t4), ! \
231 1.1.4.2 jdolecek * because (the correct) count still includes ! \
232 1.1.4.2 jdolecek * these bytes, and we intend to keep it that ! \
233 1.1.4.2 jdolecek * way, and finish with the single-byte copier. ! \
234 1.1.4.2 jdolecek * ! \
235 1.1.4.2 jdolecek * The number of bytes remaining in the FIFO is ! \
236 1.1.4.2 jdolecek * related to the shift count, so recover it, ! \
237 1.1.4.2 jdolecek * restoring the correct count at the same time. ! \
238 1.1.4.2 jdolecek */ ! \
239 1.1.4.2 jdolecek mfctl %cr11, t1 ! \
240 1.1.4.2 jdolecek addi 4, count, count ! \
241 1.1.4.2 jdolecek shd %r0, t1, 3, t1 ! \
242 1.1.4.2 jdolecek ! \
243 1.1.4.2 jdolecek /* ! \
244 1.1.4.2 jdolecek * If we're copying forward, the shift count ! \
245 1.1.4.2 jdolecek * is the number of bytes remaining in the ! \
246 1.1.4.2 jdolecek * FIFO, and we want to subtract it from src_off. ! \
247 1.1.4.2 jdolecek * If we're copying backwards, (4 - shift count) ! \
248 1.1.4.2 jdolecek * is the number of bytes remaining in the FIFO, ! \
249 1.1.4.2 jdolecek * and we want to add it to src_off. ! \
250 1.1.4.2 jdolecek * ! \
251 1.1.4.2 jdolecek * We observe that x + (4 - y) = x - (y - 4), ! \
252 1.1.4.2 jdolecek * and introduce this instruction to add -4 when ! \
253 1.1.4.2 jdolecek * m is -1, although this does mean one extra ! \
254 1.1.4.2 jdolecek * instruction in the forward case. ! \
255 1.1.4.2 jdolecek */ ! \
256 1.1.4.2 jdolecek addi 4*((m - 1) / 2), t1, t1 ! \
257 1.1.4.2 jdolecek ! \
258 1.1.4.2 jdolecek /* Now branch to the byte-at-a-time loop. */ ! \
259 1.1.4.2 jdolecek b _LABEL(_do1) ! \
260 1.1.4.2 jdolecek sub src_off, t1, src_off
261 1.1.4.2 jdolecek
262 1.1.4.2 jdolecek /*
263 1.1.4.2 jdolecek * This macro copies a region in the forward direction.
 *
 * src_spc/src_off and dst_spc/dst_off are the space and offset
 * registers of the source and destination; count is the register
 * holding the number of bytes to copy. src_off, dst_off and count
 * are all modified, as are %r1 and t1 through t4; the shifting path
 * also loads %cr11. Every path ends up in the expansion of _COPY,
 * which finishes by branching to _LABEL(_done).
 *
 * NB: the "paranoia" block further down in this file redefines this
 * macro whenever its #if is enabled.
264 1.1.4.2 jdolecek */
265 1.1.4.2 jdolecek #define _COPY_FORWARD(src_spc, src_off, dst_spc, dst_off, count) \
266 1.1.4.2 jdolecek ! \
267 1.1.4.2 jdolecek /* ! \
268 1.1.4.2 jdolecek * Since in the shifting-left case we will ! \
269 1.1.4.2 jdolecek * load 8 bytes before checking count, to ! \
270 1.1.4.2 jdolecek * keep things simple, branch to the byte ! \
271 1.1.4.2 jdolecek * copier unless we're copying at least 8. ! \
272 1.1.4.2 jdolecek */ ! \
273 1.1.4.2 jdolecek comib,>>,n 8, count, _LABEL(_do1) ! \
274 1.1.4.2 jdolecek ! \
275 1.1.4.2 jdolecek /* ! \
276 1.1.4.2 jdolecek * Once we 4-byte align the source offset, ! \
277 1.1.4.2 jdolecek * figure out how many bytes from the region ! \
278 1.1.4.2 jdolecek * will be in the first 4-byte word we read. ! \
279 1.1.4.2 jdolecek * Ditto for writing the destination offset. ! \
280 1.1.4.2 jdolecek */ ! \
281 1.1.4.2 jdolecek extru src_off, 31, 2, t1 ! \
282 1.1.4.2 jdolecek extru dst_off, 31, 2, t2 ! \
283 1.1.4.2 jdolecek subi 4, t1, t1 ! \
284 1.1.4.2 jdolecek subi 4, t2, t2 ! \
285 1.1.4.2 jdolecek ! \
286 1.1.4.2 jdolecek /* ! \
287 1.1.4.2 jdolecek * Calculate the byte shift required. A ! \
288 1.1.4.2 jdolecek * positive value means a source 4-byte word ! \
289 1.1.4.2 jdolecek * has to be shifted to the right to line up ! \
290 1.1.4.2 jdolecek * as a destination 4-byte word. ! \
291 1.1.4.2 jdolecek */ ! \
292 1.1.4.2 jdolecek sub t1, t2, t1 ! \
293 1.1.4.2 jdolecek ! \
294 1.1.4.2 jdolecek /* 4-byte align src_off. */ ! \
295 1.1.4.2 jdolecek depi 0, 31, 2, src_off ! \
296 1.1.4.2 jdolecek ! \
297 1.1.4.2 jdolecek /* ! \
298 1.1.4.2 jdolecek * It's somewhat important to note that this ! \
299 1.1.4.2 jdolecek * code thinks of count as "the number of bytes ! \
300 1.1.4.2 jdolecek * that haven't been stored yet", as opposed to ! \
301 1.1.4.2 jdolecek * "the number of bytes that haven't been copied ! \
302 1.1.4.2 jdolecek * yet". The distinction is subtle, but becomes ! \
303 1.1.4.2 jdolecek * apparent at the end of the shifting code, where ! \
304 1.1.4.2 jdolecek * we "back up" src_off to correspond to count, ! \
305 1.1.4.2 jdolecek * as opposed to flushing the FIFO. ! \
306 1.1.4.2 jdolecek * ! \
307 1.1.4.2 jdolecek * We calculated above how many bytes our first ! \
308 1.1.4.2 jdolecek * store will store, so update count now. ! \
309 1.1.4.2 jdolecek * ! \
310 1.1.4.2 jdolecek * If the shift is zero, strictly as an optimization ! \
311 1.1.4.2 jdolecek * we use a copy loop that does no shifting. ! \
312 1.1.4.2 jdolecek */ ! \
313 1.1.4.2 jdolecek comb,<> %r0, t1, _LABEL(_shifting) ! \
314 1.1.4.2 jdolecek sub count, t2, count ! \
315 1.1.4.2 jdolecek ! \
316 1.1.4.2 jdolecek /* Load and store the first word. */ ! \
317 1.1.4.2 jdolecek ldws,ma 4(src_spc, src_off), t4 ! \
318 1.1.4.2 jdolecek stbys,b,m t4, 4(dst_spc, dst_off) ! \
319 1.1.4.2 jdolecek ! \
320 1.1.4.2 jdolecek /* Do the rest of the copy. */ ! \
321 1.1.4.2 jdolecek _COPY(src_spc,src_off,dst_spc,dst_off,count,ma,1) ! \
322 1.1.4.2 jdolecek ! \
323 1.1.4.2 jdolecek .label _LABEL(_shifting) ! \
324 1.1.4.2 jdolecek ! \
325 1.1.4.2 jdolecek /* ! \
326 1.1.4.2 jdolecek * If shift < 0, we need to shift words to the ! \
327 1.1.4.2 jdolecek * left. Since we can't do this directly, we ! \
328 1.1.4.2 jdolecek * adjust the shift so it's a shift to the right ! \
329 1.1.4.2 jdolecek * and load the first word into the high word of ! \
330 1.1.4.2 jdolecek * the FIFO. Otherwise, we load a zero into the ! \
331 1.1.4.2 jdolecek * high word of the FIFO. ! \
332 1.1.4.2 jdolecek */ ! \
333 1.1.4.2 jdolecek comb,<= %r0, t1, _LABEL(_shiftingrt) ! \
334 1.1.4.2 jdolecek copy %r0, t3 ! \
335 1.1.4.2 jdolecek addi 4, t1, t1 ! \
336 1.1.4.2 jdolecek ldws,ma 4(src_spc, src_off), t3 ! \
337 1.1.4.2 jdolecek .label _LABEL(_shiftingrt) ! \
338 1.1.4.2 jdolecek ! \
339 1.1.4.2 jdolecek /* ! \
340 1.1.4.2 jdolecek * Turn the shift byte count into a bit count, ! \
341 1.1.4.2 jdolecek * load the next word, set the Shift Amount ! \
342 1.1.4.2 jdolecek * Register, and form and store the first word. ! \
343 1.1.4.2 jdolecek */ ! \
344 1.1.4.2 jdolecek sh3add t1, %r0, t1 ! \
345 1.1.4.2 jdolecek ldws,ma 4(src_spc, src_off), t4 ! \
346 1.1.4.2 jdolecek mtctl t1, %cr11 ! \
347 1.1.4.2 jdolecek vshd t3, t4, %r1 ! \
348 1.1.4.2 jdolecek stbys,b,m %r1, 4(dst_spc, dst_off) ! \
349 1.1.4.2 jdolecek ! \
350 1.1.4.2 jdolecek /* Do the rest of the copy. */ ! \
351 1.1.4.2 jdolecek _COPYS(src_spc,src_off,dst_spc,dst_off,count,ma,1)
352 1.1.4.2 jdolecek
353 1.1.4.2 jdolecek /* This macro copies a region in the reverse direction. */
/*
 * src_spc/src_off and dst_spc/dst_off are the space and offset
 * registers of the source and destination; count is the register
 * holding the number of bytes to copy. On entry the offsets point
 * at the start of each region; the macro itself advances them to
 * the end before copying downwards. src_off, dst_off and count are
 * all modified, as are %r1 and t1 through t4; the shifting path also
 * loads %cr11. Every path ends up in the expansion of _COPY, which
 * finishes by branching to _LABEL(_done).
 *
 * NB: the "paranoia" block further down in this file redefines this
 * macro whenever its #if is enabled.
 */
354 1.1.4.2 jdolecek #define _COPY_REVERSE(src_spc, src_off, dst_spc, dst_off, count) \
355 1.1.4.2 jdolecek ! \
356 1.1.4.2 jdolecek /* Immediately add count to both offsets. */ ! \
357 1.1.4.2 jdolecek add src_off, count, src_off ! \
358 1.1.4.2 jdolecek add dst_off, count, dst_off ! \
359 1.1.4.2 jdolecek ! \
360 1.1.4.2 jdolecek /* ! \
361 1.1.4.2 jdolecek * Since in the shifting-right case we ! \
362 1.1.4.2 jdolecek * will load 8 bytes before checking ! \
363 1.1.4.2 jdolecek * count, to keep things simple, branch ! \
364 1.1.4.2 jdolecek * to the byte copier unless we're ! \
365 1.1.4.2 jdolecek * copying at least 8 bytes. ! \
366 1.1.4.2 jdolecek */ ! \
367 1.1.4.2 jdolecek comib,>>,n 8, count, _LABEL(_do1) ! \
368 1.1.4.2 jdolecek ! \
369 1.1.4.2 jdolecek /* ! \
370 1.1.4.2 jdolecek * Once we 4-byte align the source offset, ! \
371 1.1.4.2 jdolecek * figure out how many bytes from the region ! \
372 1.1.4.2 jdolecek * will be in the first 4-byte word we read. ! \
373 1.1.4.2 jdolecek * Ditto for writing the destination offset. ! \
374 1.1.4.2 jdolecek */ ! \
375 1.1.4.2 jdolecek extru,<> src_off, 31, 2, t1 ! \
376 1.1.4.2 jdolecek ldi 4, t1 ! \
377 1.1.4.2 jdolecek extru,<> dst_off, 31, 2, t2 ! \
378 1.1.4.2 jdolecek ldi 4, t2 ! \
379 1.1.4.2 jdolecek ! \
380 1.1.4.2 jdolecek /* ! \
381 1.1.4.2 jdolecek * Calculate the byte shift required. A ! \
382 1.1.4.2 jdolecek * positive value means a source 4-byte ! \
383 1.1.4.2 jdolecek * word has to be shifted to the right to ! \
384 1.1.4.2 jdolecek * line up as a destination 4-byte word. ! \
385 1.1.4.2 jdolecek */ ! \
386 1.1.4.2 jdolecek sub t2, t1, t1 ! \
387 1.1.4.2 jdolecek ! \
388 1.1.4.2 jdolecek /* ! \
389 1.1.4.2 jdolecek * 4-byte align src_off, leaving it pointing ! \
390 1.1.4.2 jdolecek * to the 4-byte word *after* the next word ! \
391 1.1.4.2 jdolecek * we intend to load. ! \
392 1.1.4.2 jdolecek * ! \
393 1.1.4.2 jdolecek * It's somewhat important to note that this ! \
394 1.1.4.2 jdolecek * code thinks of count as "the number of bytes ! \
395 1.1.4.2 jdolecek * that haven't been stored yet", as opposed to ! \
396 1.1.4.2 jdolecek * "the number of bytes that haven't been copied ! \
397 1.1.4.2 jdolecek * yet". The distinction is subtle, but becomes ! \
398 1.1.4.2 jdolecek * apparent at the end of the shifting code, where ! \
399 1.1.4.2 jdolecek * we "back up" src_off to correspond to count, ! \
400 1.1.4.2 jdolecek * as opposed to flushing the FIFO. ! \
401 1.1.4.2 jdolecek * ! \
402 1.1.4.2 jdolecek * We calculated above how many bytes our first ! \
403 1.1.4.2 jdolecek * store will store, so update count now. ! \
404 1.1.4.2 jdolecek * ! \
405 1.1.4.2 jdolecek * If the shift is zero, we use a copy loop that ! \
406 1.1.4.2 jdolecek * does no shifting. NB: unlike the forward case, ! \
407 1.1.4.2 jdolecek * this is NOT strictly an optimization. If the ! \
408 1.1.4.2 jdolecek * SAR is zero the vshds do NOT do the right thing. ! \
409 1.1.4.2 jdolecek * This is another asymmetry more or less the "fault" ! \
410 1.1.4.2 jdolecek * of vshd. ! \
411 1.1.4.2 jdolecek */ ! \
412 1.1.4.2 jdolecek addi 3, src_off, src_off ! \
413 1.1.4.2 jdolecek sub count, t2, count ! \
414 1.1.4.2 jdolecek comb,<> %r0, t1, _LABEL(_shifting) ! \
415 1.1.4.2 jdolecek depi 0, 31, 2, src_off ! \
416 1.1.4.2 jdolecek ! \
417 1.1.4.2 jdolecek /* Load and store the first word. */ ! \
418 1.1.4.2 jdolecek ldws,mb -4(src_spc, src_off), t4 ! \
419 1.1.4.2 jdolecek _STBYS_E_M(t4, dst_spc, dst_off) ! \
420 1.1.4.2 jdolecek ! \
421 1.1.4.2 jdolecek /* Do the rest of the copy. */ ! \
422 1.1.4.2 jdolecek _COPY(src_spc,src_off,dst_spc,dst_off,count,mb,-1) ! \
423 1.1.4.2 jdolecek ! \
424 1.1.4.2 jdolecek .label _LABEL(_shifting) ! \
425 1.1.4.2 jdolecek ! \
426 1.1.4.2 jdolecek /* ! \
427 1.1.4.2 jdolecek * If shift < 0, we need to shift words to the ! \
428 1.1.4.2 jdolecek * left. Since we can't do this directly, we ! \
429 1.1.4.2 jdolecek * adjust the shift so it's a shift to the right ! \
430 1.1.4.2 jdolecek * and load a zero in to the low word of the FIFO. ! \
431 1.1.4.2 jdolecek * Otherwise, we load the first word into the ! \
432 1.1.4.2 jdolecek * low word of the FIFO. ! \
433 1.1.4.2 jdolecek * ! \
434 1.1.4.2 jdolecek * Note the nullification trickery here. We ! \
435 1.1.4.2 jdolecek * assume that we're shifting to the left, and ! \
436 1.1.4.2 jdolecek * load zero into the low word of the FIFO. Then ! \
437 1.1.4.2 jdolecek * we nullify the addi if we're shifting to the ! \
438 1.1.4.2 jdolecek * right. If the addi is not nullified, we are ! \
439 1.1.4.2 jdolecek * shifting to the left, so we nullify the load. ! \
440 1.1.4.2 jdolecek * (The comb target of 0 is a numeric displacement, not a label.) ! \
441 1.1.4.2 jdolecek */ ! \
442 1.1.4.2 jdolecek copy %r0, t3 ! \
443 1.1.4.2 jdolecek comb,<=,n %r0, t1, 0 ! \
444 1.1.4.2 jdolecek addi,tr 4, t1, t1 ! \
445 1.1.4.2 jdolecek ldws,mb -4(src_spc, src_off), t3 ! \
446 1.1.4.2 jdolecek ! \
447 1.1.4.2 jdolecek /* ! \
448 1.1.4.2 jdolecek * Turn the shift byte count into a bit count, ! \
449 1.1.4.2 jdolecek * load the next word, set the Shift Amount ! \
450 1.1.4.2 jdolecek * Register, and form and store the first word. ! \
451 1.1.4.2 jdolecek */ ! \
452 1.1.4.2 jdolecek sh3add t1, %r0, t1 ! \
453 1.1.4.2 jdolecek ldws,mb -4(src_spc, src_off), t4 ! \
454 1.1.4.2 jdolecek mtctl t1, %cr11 ! \
455 1.1.4.2 jdolecek vshd t4, t3, %r1 ! \
456 1.1.4.2 jdolecek _STBYS_E_M(%r1, dst_spc, dst_off) ! \
457 1.1.4.2 jdolecek ! \
458 1.1.4.2 jdolecek /* Do the rest of the copy. */ ! \
459 1.1.4.2 jdolecek _COPYS(src_spc,src_off,dst_spc,dst_off,count,mb,-1)
460 1.1.4.2 jdolecek
461 1.1.4.2 jdolecek /*
462 1.1.4.2 jdolecek * For paranoia, when things aren't going well, enable this
463 1.1.4.2 jdolecek * code to assemble byte-at-a-time-only copying.
 *
 * The condition below is currently 1, so these definitions are the
 * ones in effect: the #undef/#define pairs replace the word-at-a-time
 * _COPY_FORWARD and _COPY_REVERSE defined earlier in this file.
 *
 * Both replacements take the same arguments as the macros they
 * replace, use %r1 as the only scratch register, branch straight to
 * _LABEL(_done) when count is zero, and end at _LABEL(_done).
 *
 * The -12 in each addib is a numeric displacement rather than a
 * label. NOTE(review): it presumably targets the ldbs just before
 * the addib, with the following stbs running in the delay slot -
 * confirm before adding or removing instructions inside the loops.
464 1.1.4.2 jdolecek */
465 1.1.4.2 jdolecek #if 1
466 1.1.4.2 jdolecek #undef _COPY_FORWARD
467 1.1.4.2 jdolecek #define _COPY_FORWARD(src_spc, src_off, dst_spc, dst_off, count) \
468 1.1.4.2 jdolecek comb,=,n %r0, count, _LABEL(_done) ! \
469 1.1.4.2 jdolecek ldbs,ma 1(src_spc, src_off), %r1 ! \
470 1.1.4.2 jdolecek addib,<> -1, count, -12 ! \
471 1.1.4.2 jdolecek stbs,ma %r1, 1(dst_spc, dst_off) ! \
472 1.1.4.2 jdolecek b,n _LABEL(_done)
473 1.1.4.2 jdolecek #undef _COPY_REVERSE
474 1.1.4.2 jdolecek #define _COPY_REVERSE(src_spc, src_off, dst_spc, dst_off, count) \
475 1.1.4.2 jdolecek comb,= %r0, count, _LABEL(_done) ! \
476 1.1.4.2 jdolecek add src_off, count, src_off ! \
477 1.1.4.2 jdolecek add dst_off, count, dst_off ! \
478 1.1.4.2 jdolecek ldbs,mb -1(src_spc, src_off), %r1 ! \
479 1.1.4.2 jdolecek addib,<> -1, count, -12 ! \
480 1.1.4.2 jdolecek stbs,mb %r1, -1(dst_spc, dst_off) ! \
481 1.1.4.2 jdolecek b,n _LABEL(_done)
482 1.1.4.2 jdolecek #endif
483 1.1.4.2 jdolecek
484 1.1.4.2 jdolecek /*
485 1.1.4.2 jdolecek * If none of the following are defined, define BCOPY.
 *
 * Each of SPCOPY, MEMCPY, MEMMOVE and BCOPY enables one entry point
 * further down in this file (spcopy, memcpy, memmove and bcopy
 * respectively), so by default this file assembles bcopy.
486 1.1.4.2 jdolecek */
487 1.1.4.2 jdolecek #if !(defined(SPCOPY) || defined(MEMCPY) || defined(MEMMOVE))
488 1.1.4.2 jdolecek #define BCOPY
489 1.1.4.2 jdolecek #endif
490 1.1.4.2 jdolecek
491 1.1.4.2 jdolecek #if defined(SPCOPY) && !defined(_STANDALONE)
492 1.1.4.2 jdolecek #include <sys/errno.h>
493 1.1.4.2 jdolecek #include "assym.h"
494 1.1.4.2 jdolecek
495 1.1.4.2 jdolecek /*
496 1.1.4.2 jdolecek * int spcopy(pa_space_t ssp, const void *src, pa_space_t dsp, void *dst,
497 1.1.4.2 jdolecek * size_t len)
498 1.1.4.2 jdolecek *
499 1.1.4.2 jdolecek * We assume that the regions do not overlap.
 *
 * On entry ssp, src, dsp and dst are in %arg0 through %arg3, and len
 * is fetched from the stack frame. %ret0 is 0 when the copy runs to
 * completion; it is preloaded with EFAULT, which is what is returned
 * when the routine is left through spcopy_fault instead.
500 1.1.4.2 jdolecek */
501 1.1.4.2 jdolecek LEAF_ENTRY(spcopy)
502 1.1.4.2 jdolecek
503 1.1.4.2 jdolecek /*
504 1.1.4.2 jdolecek * Setup the fault handler, and load %ret0
505 1.1.4.2 jdolecek * with EFAULT, assuming the copy will fail.
 *
 * %r31 ends up holding the value loaded from P_ADDR(curproc), and
 * is used again at spcopy_fault to clear the handler, so it must
 * survive the copy.
506 1.1.4.2 jdolecek */
507 1.1.4.2 jdolecek .import curproc, data
508 1.1.4.2 jdolecek ldil L%curproc, %r31
509 1.1.4.2 jdolecek ldw R%curproc(%r31), %r31
510 1.1.4.2 jdolecek #ifdef DIAGNOSTIC
/* Call panic() with the message below if curproc is NULL. */
511 1.1.4.2 jdolecek comb,<>,n %r0, %r31, Lspcopy_curproc_ok
512 1.1.4.2 jdolecek ldil L%panic, %r1
513 1.1.4.2 jdolecek ldil L%Lspcopy_curproc_bad, %arg0
514 1.1.4.2 jdolecek ldo R%panic(%r1), %r1
515 1.1.4.2 jdolecek ldo R%Lspcopy_curproc_bad(%arg0), %arg0
516 1.1.4.2 jdolecek .call
517 1.1.4.2 jdolecek bv,n %r0(%r1)
518 1.1.4.2 jdolecek nop
519 1.1.4.2 jdolecek Lspcopy_curproc_bad
520 1.1.4.2 jdolecek .asciz "spcopy: curproc == NULL\n"
521 1.1.4.2 jdolecek .align 8
522 1.1.4.2 jdolecek Lspcopy_curproc_ok
523 1.1.4.2 jdolecek #endif /* DIAGNOSTIC */
524 1.1.4.2 jdolecek ldil L%spcopy_fault, %r1
525 1.1.4.2 jdolecek ldw P_ADDR(%r31), %r31
526 1.1.4.2 jdolecek ldo R%spcopy_fault(%r1), %r1
527 1.1.4.2 jdolecek ldi EFAULT, %ret0
528 1.1.4.2 jdolecek stw %r1, U_PCB+PCB_ONFAULT(%r31)
529 1.1.4.2 jdolecek
530 1.1.4.2 jdolecek /*
 * Setup the space registers. The current sr2 is saved in %ret1
 * first (it is put back on the way out); then sr1 gets ssp and
 * sr2 gets dsp.
 */
531 1.1.4.2 jdolecek mfsp sr2, %ret1
532 1.1.4.2 jdolecek mtsp %arg0, sr1
533 1.1.4.2 jdolecek mtsp %arg2, sr2
534 1.1.4.2 jdolecek
535 1.1.4.2 jdolecek /*
 * Get the len argument and do the copy. len comes from the
 * stack frame slot HPPA_FRAME_ARG(4); %arg0 is reused to hold it
 * now that ssp has been moved into sr1.
 */
536 1.1.4.2 jdolecek ldw HPPA_FRAME_ARG(4)(sp), %arg0
537 1.1.4.2 jdolecek #define _LABEL(l) __CONCAT(spcopy,l)
538 1.1.4.2 jdolecek _COPY_FORWARD(sr1,%arg1,sr2,%arg3,%arg0)
539 1.1.4.2 jdolecek _LABEL(_done)
540 1.1.4.2 jdolecek
541 1.1.4.2 jdolecek /*
 * Return. A completed copy arrives at the label above and sets
 * %ret0 to zero. Entering at spcopy_fault skips that instruction,
 * leaving the EFAULT loaded earlier (presumably this is where
 * execution resumes after a fault, via the onfault address stored
 * above - confirm against the trap code). Both paths clear the
 * onfault handler, and sr2 is restored from %ret1 in the
 * instruction following the bv.
 */
542 1.1.4.2 jdolecek copy %r0, %ret0
543 1.1.4.2 jdolecek ALTENTRY(spcopy_fault)
544 1.1.4.2 jdolecek stw %r0, U_PCB+PCB_ONFAULT(%r31)
545 1.1.4.2 jdolecek bv %r0(%rp)
546 1.1.4.2 jdolecek mtsp %ret1, sr2
547 1.1.4.2 jdolecek EXIT(spcopy)
548 1.1.4.2 jdolecek #endif /* SPCOPY && !_STANDALONE */
549 1.1.4.2 jdolecek
550 1.1.4.2 jdolecek #ifdef MEMCPY
551 1.1.4.2 jdolecek /*
552 1.1.4.2 jdolecek * void *memcpy(void * restrict dst, const void * restrict src, size_t len);
553 1.1.4.2 jdolecek *
554 1.1.4.2 jdolecek * memcpy is specifically restricted to working on
555 1.1.4.2 jdolecek * non-overlapping regions, so we can just copy forward.
 *
 * dst, src and len arrive in %arg0, %arg1 and %arg2. dst is copied
 * into %ret0 before the copy starts, because %arg0 is handed to the
 * copy macro as the destination offset register and is modified by
 * it. Both source and destination use space register sr0.
556 1.1.4.2 jdolecek */
557 1.1.4.2 jdolecek LEAF_ENTRY(memcpy)
558 1.1.4.2 jdolecek copy %arg0, %ret0
559 1.1.4.2 jdolecek #define _LABEL(l) __CONCAT(memcpy,l)
560 1.1.4.2 jdolecek _COPY_FORWARD(sr0,%arg1,sr0,%arg0,%arg2)
561 1.1.4.2 jdolecek _LABEL(_done)
562 1.1.4.2 jdolecek bv,n %r0(%rp)
563 1.1.4.2 jdolecek nop
564 1.1.4.2 jdolecek EXIT(memcpy)
565 1.1.4.2 jdolecek #endif /* MEMCPY */
566 1.1.4.2 jdolecek
567 1.1.4.2 jdolecek #ifdef BCOPY
568 1.1.4.2 jdolecek /*
569 1.1.4.2 jdolecek * void bcopy(const void *src, void *dst, size_t len);
 *
 * bcopy takes its pointers in the opposite order from memmove, so
 * %arg0 and %arg1 are swapped (through %r1) before falling into the
 * code shared with memmove, which expects dst in %arg0 and src in
 * %arg1.
570 1.1.4.2 jdolecek */
571 1.1.4.2 jdolecek LEAF_ENTRY(bcopy)
572 1.1.4.2 jdolecek copy %arg0, %r1
573 1.1.4.2 jdolecek copy %arg1, %arg0
574 1.1.4.2 jdolecek copy %r1, %arg1
575 1.1.4.2 jdolecek /* FALLTHROUGH */
576 1.1.4.2 jdolecek #define _LABEL_F(l) __CONCAT(bcopy_F,l)
577 1.1.4.2 jdolecek #define _LABEL_R(l) __CONCAT(bcopy_R,l)
578 1.1.4.2 jdolecek #endif
579 1.1.4.2 jdolecek
580 1.1.4.2 jdolecek #ifdef MEMMOVE
581 1.1.4.2 jdolecek /*
582 1.1.4.2 jdolecek * void *memmove(void *dst, const void *src, size_t len);
 *
 * dst is copied into %ret0 up front, before the shared copy code
 * below modifies %arg0.
583 1.1.4.2 jdolecek */
584 1.1.4.2 jdolecek LEAF_ENTRY(memmove)
585 1.1.4.2 jdolecek #define _LABEL_F(l) __CONCAT(memmove_F,l)
586 1.1.4.2 jdolecek #define _LABEL_R(l) __CONCAT(memmove_R,l)
587 1.1.4.2 jdolecek copy %arg0, %ret0
588 1.1.4.2 jdolecek #endif /* MEMMOVE */
589 1.1.4.2 jdolecek
590 1.1.4.2 jdolecek #if defined(BCOPY) || defined(MEMMOVE)
591 1.1.4.2 jdolecek
592 1.1.4.2 jdolecek /*
593 1.1.4.2 jdolecek * If src >= dst or src + len <= dst, we copy
594 1.1.4.2 jdolecek * forward, else we copy in reverse.
 *
 * Here dst is %arg0, src is %arg1 and len is %arg2; %r1 is loaded
 * with src + len for the second comparison. The first comb uses a
 * numeric displacement of 0 rather than a label; per the comment
 * above, its effect when src >= dst is to bypass the second comb
 * and land in the forward copy.
595 1.1.4.2 jdolecek */
596 1.1.4.2 jdolecek add %arg1, %arg2, %r1
597 1.1.4.2 jdolecek comb,>>=,n %arg1, %arg0, 0
598 1.1.4.2 jdolecek comb,>>,n %r1, %arg0, _LABEL_R(_go)
599 1.1.4.2 jdolecek
/* Forward copy; its labels are generated through _LABEL_F. */
600 1.1.4.2 jdolecek #define _LABEL _LABEL_F
601 1.1.4.2 jdolecek _COPY_FORWARD(sr0,%arg1,sr0,%arg0,%arg2)
602 1.1.4.2 jdolecek #undef _LABEL
603 1.1.4.2 jdolecek
/* Reverse copy; its labels are generated through _LABEL_R. */
604 1.1.4.2 jdolecek _LABEL_R(_go)
605 1.1.4.2 jdolecek #define _LABEL _LABEL_R
606 1.1.4.2 jdolecek _COPY_REVERSE(sr0,%arg1,sr0,%arg0,%arg2)
607 1.1.4.2 jdolecek #undef _LABEL
608 1.1.4.2 jdolecek
/* Both copy directions finish here and return to the caller. */
609 1.1.4.2 jdolecek _LABEL_F(_done)
610 1.1.4.2 jdolecek _LABEL_R(_done)
611 1.1.4.2 jdolecek bv,n %r0(%rp)
612 1.1.4.2 jdolecek nop
613 1.1.4.2 jdolecek #ifdef BCOPY
614 1.1.4.2 jdolecek EXIT(bcopy)
615 1.1.4.2 jdolecek #else
616 1.1.4.2 jdolecek EXIT(memmove)
617 1.1.4.2 jdolecek #endif
618 1.1.4.2 jdolecek #endif /* BCOPY || MEMMOVE */
619