/* $NetBSD: memset.S,v 1.3 2020/04/11 05:12:52 ryo Exp $ */

/*-
 * Copyright (c) 2014 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Matt Thomas of 3am Software Foundry.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

ENTRY(memset)
	cbz	x2, .Lret
	mov	x15, x0			/* working data pointer */
	cbz	x1, .Lzerofill
	cbz	x1, .Lfilled
	/*
	 * Non-zero fill: replicate the fill byte to all 64 bits of x1.
	 */
	and	x1, x1, #0xff
	orr	x1, x1, x1, lsl #8
	orr	x1, x1, x1, lsl #16
	orr	x1, x1, x1, lsl #32
.Lfilled:
	cmp	x2, #15			/* if it's small, ignore alignment */
	b.ls	.Llast_subqword

	mov	x6, x1
	tst	x15, #15
	b.eq	.Lqword_loop

	/*
	 * We have at least 15 bytes to write, which means we can get qword
	 * alignment without having to check the amount left.
	 */
	tbz	x15, #0, .Lhword_aligned
	strb	w1, [x15], #1
.Lhword_aligned:
	tbz	x15, #1, .Lword_aligned
	strh	w1, [x15], #2
.Lword_aligned:
	tbz	x15, #2, .Ldword_aligned
	str	w1, [x15], #4
.Ldword_aligned:
	tbz	x15, #3, .Lqword_aligned
	str	x1, [x15], #8
	/*
	 * Now we are qword aligned.  Figure out how much we had to write to
	 * get here, then subtract that from the length.  If we get 0, we're
	 * done.
	 */
.Lqword_aligned:
	sub	x5, x15, x0
	subs	x2, x2, x5
	b.eq	.Lret

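	/*
	 * Illustrative sketch: for a fill byte c = 0xab, the replication
	 * above amounts to, in rough C,
	 *
	 *	uint64_t v = c & 0xff;	// 0x00000000000000ab
	 *	v |= v << 8;		// 0x000000000000abab
	 *	v |= v << 16;		// 0x00000000abababab
	 *	v |= v << 32;		// 0xabababababababab
	 *
	 * Both x1 and x6 hold this pattern, so each stp in the loop below
	 * stores 16 copies of the fill byte at once.
	 */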
	/*
	 * Write 16 bytes at a time.  If we don't have 16 bytes to write,
	 * bail.  Keep looping while there's data to set.
	 */
.Lqword_loop:
	subs	x2, x2, #16
	b.mi	.Llast_subqword
	stp	x1, x6, [x15], #16
	b.ne	.Lqword_loop
	ret

	/*
	 * We have less than a qword to write.  We hope we are aligned, but
	 * since unaligned access works, we don't have to be aligned.
	 */
.Llast_subqword:
	tbz	x2, #3, .Llast_subdword
	str	x1, [x15], #8
.Llast_subdword:
	tbz	x2, #2, .Llast_subword
	str	w1, [x15], #4
.Llast_subword:
	tbz	x2, #1, .Llast_subhword
	strh	w1, [x15], #2
.Llast_subhword:
	tbz	x2, #0, .Lret
	strb	w1, [x15]
.Lret:	ret

	/*
	 * If we are filling with zeros, then let's see if we can use the
	 *	dc zva, <Xt>
	 * instruction to speed things up.
	 */
.Lzerofill:
	mrs	x9, dczid_el0
	/*
	 * Make sure the instruction isn't prohibited.
	 */
	tbnz	x9, #4, .Lfilled
	/*
	 * Now find out the block size.
	 */
	ubfx	x9, x9, #0, #4		/* extract low 4 bits */
	add	x9, x9, #2		/* add log2(word) */
	mov	x10, #1			/* the value is log2(words) */
	lsl	x10, x10, x9		/* shift to get the block size */
	cmp	x2, x10			/* are we even copying a block? */
	b.lt	.Lfilled		/* no, do it 16 bytes at a time */
	/*
	 * Now we figure out how many aligned blocks we have.
	 */
	sub	x11, x10, #1		/* make block size a mask */
	add	x12, x15, x11		/* round start to a block boundary */
	asr	x12, x12, x9		/* "starting" block number */
	add	x13, x15, x2		/* get ending address */
	asr	x13, x13, x9		/* "ending" block number */
	cmp	x13, x12		/* how many blocks? */
	b.ls	.Lfilled		/* none, do it 16 bytes at a time */

	/*
	 * Now we have one or more blocks to deal with.  First we need to
	 * get block aligned.
	 */
	and	x7, x15, x11		/* already aligned on a block boundary? */
	cbz	x7, .Lblock_aligned

	sub	x7, x10, x7		/* subtract offset from block length */
	sub	x2, x2, x7		/* subtract that from length */
	asr	x7, x7, #4		/* length -> N*16 */

	tbz	x15, #0, .Lzero_hword_aligned
	strb	wzr, [x15], #1
.Lzero_hword_aligned:
	tbz	x15, #1, .Lzero_word_aligned
	strh	wzr, [x15], #2
.Lzero_word_aligned:
	tbz	x15, #2, .Lzero_dword_aligned
	str	wzr, [x15], #4
.Lzero_dword_aligned:
	tbz	x15, #3, .Lzero_qword_aligned
	str	xzr, [x15], #8
.Lzero_qword_aligned:
	cbz	x7, .Lblock_aligned	/* aligned? just branch */

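	/*
	 * Illustrative note: at this point x7 (call it n16) holds the number
	 * of 16-byte units still needed to reach the next DCZID_EL0:BS block
	 * boundary.  The fills below peel those off as one 16-byte chunk,
	 * one 32-byte chunk, then 64-byte chunks; roughly, in C, with
	 * store16/store32/store64 as hypothetical "store that many zero
	 * bytes" helpers:
	 *
	 *	if (n16 & 1) { store16(p); p += 16; }
	 *	if (n16 & 2) { store32(p); p += 32; }
	 *	for (uint64_t n64 = n16 >> 2; n64 != 0; n64--) {
	 *		store64(p); p += 64;
	 *	}
	 */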
	/* align to DCZID_EL0:BS boundary */
	tbz	x7, #0, 0f		/* fill 16 bytes? */
	stp	xzr, xzr, [x15], #16
0:
	tbz	x7, #1, 1f		/* fill 32 bytes? */
	stp	xzr, xzr, [x15], #16
	stp	xzr, xzr, [x15], #16
1:
	lsr	x7, x7, #2
	cbz	x7, 9f
.L64bytes_fill:
	sub	x7, x7, #1
	stp	xzr, xzr, [x15], #16
	stp	xzr, xzr, [x15], #16
	stp	xzr, xzr, [x15], #16
	stp	xzr, xzr, [x15], #16
	cbnz	x7, .L64bytes_fill
9:

	/*
	 * Now we are block aligned.
	 */
.Lblock_aligned:
	subs	x2, x2, x10
	b.mi	.Lblock_done
	dc	zva, x15
	add	x15, x15, x10
	b.ne	.Lblock_aligned
	ret

.Lblock_done:
	and	x2, x2, x11		/* make positive again */
	mov	x6, xzr			/* fill 2nd xword */
	b	.Lqword_loop		/* and finish filling */

END(memset)
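
/*
 * Rough C sketch of the zero-fill strategy above.  DCZID_EL0 bits [3:0]
 * hold log2 of the "dc zva" block size in 4-byte words, and bit 4 (DZP)
 * set means the instruction is prohibited; read_dczid_el0() and dc_zva()
 * below are hypothetical stand-ins for the mrs and dc instructions.
 *
 *	uint64_t dczid = read_dczid_el0();
 *	size_t blksize = (size_t)4 << (dczid & 0xf);
 *	if ((dczid & (1 << 4)) == 0 && len >= blksize) {
 *		// align p up to a blksize boundary with ordinary stores,
 *		// then zero whole blocks:
 *		while (len >= blksize) {
 *			dc_zva(p);	// zeroes one whole block
 *			p += blksize;
 *			len -= blksize;
 *		}
 *	}
 *	// everything else (and any len % blksize tail) is done with
 *	// 16-byte stp stores via .Lfilled / .Lqword_loop
 */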