/*	$NetBSD: memset.S,v 1.2 2017/08/29 15:00:23 ryo Exp $	*/

/*-
 * Copyright (c) 2014 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Matt Thomas of 3am Software Foundry.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

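/*
 * void *memset(void *b, int c, size_t len)
 *
 * AAPCS64 arguments: x0 = b, w1 = c (only the low 8 bits are used),
 * x2 = len.  x0 is left untouched as the return value; x15 is used as
 * the working data pointer.
 */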
ENTRY(memset)
	cbz	x2, .Lret
	mov	x15, x0			/* working data pointer */
	cbz	x1, .Lzerofill
	/*
	 * Non zero fill, replicate to all 64 bits of x1.
	 */
	and	x1, x1, #0xff
	orr	x1, x1, x1, lsl #8
	orr	x1, x1, x1, lsl #16
	orr	x1, x1, x1, lsl #32
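	/* x1 now holds the fill byte in every byte lane, e.g. 0x41 -> 0x4141414141414141 */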
.Lfilled:
	cmp	x2, #15			/* if it's small, ignore alignment */
	b.ls	.Llast_subqword

	mov	x6, x1
	tst	x15, #15
	b.eq	.Lqword_loop

	/*
	 * We have at least 15 to copy which means we can get qword alignment
	 * without having to check the amount left.
	 */
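	/* each tbz below tests one low bit of x15 and peels off 1, 2, 4, then 8 bytes */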
	tbz	x15, #0, .Lhword_aligned
	strb	w1, [x15], #1
.Lhword_aligned:
	tbz	x15, #1, .Lword_aligned
	strh	w1, [x15], #2
.Lword_aligned:
	tbz	x15, #2, .Ldword_aligned
	str	w1, [x15], #4
.Ldword_aligned:
	tbz	x15, #3, .Lqword_aligned
	str	x1, [x15], #8
	/*
	 * Now we are qword aligned.  Figure out how much we had to write
	 * to get here, then subtract that from the length.  If we get 0,
	 * we're done.
	 */
.Lqword_aligned:
	sub	x5, x15, x0
	subs	x2, x2, x5
	b.eq	.Lret

	/*
	 * Write 16 bytes at a time.  If we don't have 16 bytes to write, bail.
	 * Keep looping while there's data to set.
	 */
.Lqword_loop:
	subs	x2, x2, #16
	b.mi	.Llast_subqword
	stp	x1, x6, [x15], #16
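	/* stp does not alter the flags, so b.ne still tests the subs above */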
	b.ne	.Lqword_loop
	ret

	/*
	 * We have less than a qword to write.  We hope we are aligned but since
	 * unaligned access works, we don't have to be aligned.
	 */
.Llast_subqword:
	tbz	x2, #3, .Llast_subdword
	str	x1, [x15], #8
.Llast_subdword:
	tbz	x2, #2, .Llast_subword
	str	w1, [x15], #4
.Llast_subword:
	tbz	x2, #1, .Llast_subhword
	strh	w1, [x15], #2
.Llast_subhword:
	tbz	x2, #0, .Lret
	strb	w1, [x15]
.Lret:	ret

	/*
	 * If we are filling with zeros then let's see if we can use the
	 *	dc zva, <Xt>
	 * instruction to speed things up.
	 */
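/*
 * DCZID_EL0: bit 4 (DZP) set means "dc zva" is prohibited; bits 3:0 (BS)
 * hold log2 of the zeroing block size in 4-byte words.
 */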
.Lzerofill:
	mrs	x9, dczid_el0
	/*
	 * Make sure the instruction isn't prohibited.
	 */
	tbnz	x9, #4, .Lfilled
	/*
	 * Now find out the block size.
	 */
	ubfx	x9, x9, #0, #4		/* extract low 4 bits */
	add	x9, x9, #2		/* add log2(word) */
	mov	x10, #1			/* the value is log2(words) */
	lsl	x10, x10, x9		/* shift to get the block size */
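	/* x9 = log2(block size in bytes), x10 = block size in bytes */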
	cmp	x2, x10			/* are we even copying a block? */
	b.lt	.Lfilled		/* no, do it 16 bytes at a time */
	/*
	 * Now we figure out how many aligned blocks we have
	 */
	sub	x11, x10, #1		/* make block size a mask */
	add	x12, x15, x11		/* round start to a block boundary */
	asr	x12, x12, x9		/* "starting" block number */
	add	x13, x15, x2		/* get ending address */
	asr	x13, x13, x9		/* "ending" block number */
	cmp	x13, x12		/* how many blocks? */
	b.ls	.Lfilled		/* none, do it 16 bytes at a time */
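	/*
	 * e.g. with 64-byte blocks, x15 = 0x1008 and x2 = 0x100:
	 * x12 = 0x1047 >> 6 = 0x41 and x13 = 0x1108 >> 6 = 0x44, so there
	 * are whole blocks to zero and we fall through.
	 */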

	/*
	 * Now we have one or more blocks to deal with.  First we need
	 * to get block aligned.
	 */
	and	x7, x15, x11		/* are we already aligned on a block boundary? */
	cbz	x7, .Lblock_aligned

	sub	x7, x10, x7		/* subtract offset from block length */
	sub	x2, x2, x7		/* subtract that from length */
	asr	x7, x7, #4		/* length -> N*16 */
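	/*
	 * the low 4 bits of x7 are covered by the byte/hword/word/dword stores
	 * below; x7 is now the number of 16-byte stp's still needed to reach
	 * the block boundary
	 */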

	tbz	x15, #0, .Lzero_hword_aligned
	strb	wzr, [x15], #1
.Lzero_hword_aligned:
	tbz	x15, #1, .Lzero_word_aligned
	strh	wzr, [x15], #2
.Lzero_word_aligned:
	tbz	x15, #2, .Lzero_dword_aligned
	str	wzr, [x15], #4
.Lzero_dword_aligned:
	tbz	x15, #3, .Lzero_qword_aligned
	str	xzr, [x15], #8
.Lzero_qword_aligned:
	cbz	x7, .Lblock_aligned	/* less than 16 bytes? just branch */
	adr	x6, .Lunrolled_end
	sub	x6, x6, x7, lsl #2	/* backup to write the last N insn */
	br	x6			/* and do it */
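	/*
	 * each stp below is a single 4-byte instruction, so backing up
	 * x7 * 4 bytes from .Lunrolled_end executes exactly x7 of them
	 */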

	/*
	 * The maximum size of DCZID_EL0:BS supported is 2048 bytes.
	 */
	.rept	(2048 / 16) - 1
	stp	xzr, xzr, [x15], #16
	.endr
.Lunrolled_end:

	/*
	 * Now we are block aligned.
	 */
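	/* each "dc zva" zeroes one whole x10-byte block; loop while at least x10 bytes remain */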
.Lblock_aligned:
	subs	x2, x2, x10
	b.mi	.Lblock_done
	dc	zva, x15
	add	x15, x15, x10
	b.ne	.Lblock_aligned
	ret

.Lblock_done:
	and	x2, x2, x11		/* make positive again */
	mov	x6, xzr			/* fill 2nd xword */
	b	.Lqword_loop		/* and finish filling */

END(memset)