/*	$NetBSD: cpu_in_cksum_asm_neon.S,v 1.5 2025/06/19 22:00:54 andvar Exp $	*/

/*-
 * Copyright (c) 2012 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Matt Thomas of 3am Software Foundry.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

RCSID("$NetBSD: cpu_in_cksum_asm_neon.S,v 1.5 2025/06/19 22:00:54 andvar Exp $")

/*
 * uint32_t
 * cpu_in_cksum_neon(const void *dptr, size_t dlen)
 *
 *	r0 = dptr
 *	r1 = dlen
 */
ENTRY(cpu_in_cksum_neon)
	mov		ip, r0		/* leave r0 as temp */
	add		r3, r1, ip	/* get end pointer */
	and		r1, ip, #7	/* get start offset (leading bytes) */
	and		r2, r3, #7	/* get end offset (trailing bytes) */
	bic		ip, ip, #7	/* start on a dword boundary */
	add		r3, r3, #7	/* round up to a dword boundary */
	bic		r3, r3, #7	/* end on a dword boundary */
	veor		q2, q2, q2	/* clear accumulator */
	vmvn.u64	q1, q2		/* create leading/trailing masks */
	/*
	 * Normally the lower-addressed dword would be in d6, but here we
	 * reverse that: we might only have a single dword, and the final
	 * fold will want the dword to trim in d7, so keep the first dword
	 * in d7 until we know we are going to read more than one.
	 */
	veor		d6, d6, d6	/* clear second dword */
	vld1.64		{d7}, [ip:64]!	/* load first dword */
	orrs		r0, r1, r2	/* do we have any offsets? */
	beq		.Lpre_main_loop	/* no, proceed to main loop. */
	mov		r1, r1, lsl #3	/* leading bytes -> bits */
	movs		r2, r2, lsl #3	/* trailing bytes -> bits */
#ifdef __ARMEL__
	subne		r2, r2, #64	/* trim trailing MSBs */
#else
	rsb		r1, r1, #0	/* trim leading MSBs */
	rsbne		r2, r2, #64	/* trim trailing LSBs */
#endif
	vmov		d0, r1, r2	/* move shifts */
	vmovl.u32	q0, d0		/* 2 U32 -> 2 U64 */
	vshl.u64	q1, q1, q0	/* apply shifts to masks */
	vand.u32	d7, d7, d2	/* apply leading mask to 1st dword */
	tst		r1, #8		/* was the starting address odd? */
	beq		.Lpre_main_loop	/* no, go to pre_main_loop */
	veor		d2, d2, d2	/* clear d2 (indicate odd addr) */

.Lpre_main_loop:
	cmp		ip, r3		/* do we just have a single dword? */
	beq		.Lfinish_up	/* yes, let's finish up! */
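	/*
	 * More than one dword remains, so move the first dword into the
	 * low-address register (d6) and load the next dword into d7,
	 * restoring the normal layout for the main loop.
	 */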
	vmov		d6, d7		/* move 1st dword to loaddr reg */
	vld1.64		{d7}, [ip:64]!	/* read rest of initial qword */

.Lmain_loop:
	subs		r1, r3, ip	/* how much is left to do? */
	beq		.Lfinish_up	/* = 0? we are done. */

	bics		r0, r1, #31	/* we deal with octawords only */
	beq		.Lloop_end	/* no octawords? exit loop */
	rsbs		r0, r0, #128	/* subtract from 128 */
	ble		.Lloop128	/* <= 0? do 128 at a time. */
	add		r0, r0, r0, lsr #2 /* multiply by 1.25 (40 bytes of code per 32 bytes of data) */
	add		pc, pc, r0	/* and jump! */
	nop

.Lloop128:
	vld1.64		{d8-d9}, [ip:64]!	/* 128 left */
	vmovl.u16	q0, d6		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vmovl.u16	q0, d7		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vld1.64		{d6-d7}, [ip:64]!
	vmovl.u16	q0, d8		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vmovl.u16	q0, d9		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */

	vld1.64		{d8-d9}, [ip:64]!	/* 96 left */
	vmovl.u16	q0, d6		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vmovl.u16	q0, d7		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vld1.64		{d6-d7}, [ip:64]!
	vmovl.u16	q0, d8		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vmovl.u16	q0, d9		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */

	vld1.64		{d8-d9}, [ip:64]!	/* 64 left */
	vmovl.u16	q0, d6		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vmovl.u16	q0, d7		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vld1.64		{d6-d7}, [ip:64]!
	vmovl.u16	q0, d8		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vmovl.u16	q0, d9		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */

	vld1.64		{d8-d9}, [ip:64]!	/* 32 left */
	vmovl.u16	q0, d6		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vmovl.u16	q0, d7		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vld1.64		{d6-d7}, [ip:64]!
	vmovl.u16	q0, d8		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vmovl.u16	q0, d9		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */

	b		.Lmain_loop

.Lloop_end:
	/*
	 * We have one to three more dwords to process.
	 */
	rsb		r0, r1, #24
	add		r0, r0, r0, lsr #1 /* multiply by 1.5 (12 bytes of code per dword) */
	add		pc, pc, r0
	nop
	vmovl.u16	q0, d6		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vld1.64		{d6}, [ip:64]!
	vmovl.u16	q0, d6		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vld1.64		{d6}, [ip:64]!
	vmovl.u16	q0, d7		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vld1.64		{d7}, [ip:64]!

.Lfinish_up:
	/*
	 * Apply remaining data in d6 and d7.
	 */
	vmovl.u16	q0, d6		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vand		d7, d7, d3	/* apply trailing mask */
	vmovl.u16	q0, d7		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */

	/*
	 * We now have 4 32-bit sums in q2 (each is 20 bits or less).
	 * Reduce them to a single 32-bit sum.
	 */
	vadd.u32	d4, d4, d5	/* 4 I32 -> 2 I32 */
	vmov		r2, s4		/* get flag for odd start */
	teq		r2, #0		/* was start addr even? */
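	/*
	 * d2 (whose low word is s4) was cleared above only when the buffer
	 * started at an odd address, so Z set here means the start was odd
	 * and the 16-bit sums must be byte swapped before the final fold.
	 */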
	vmov		r0, r1, d4	/* extract two I32 */
	rev16eq		r0, r0		/* byte swap if start was odd */
	rev16eq		r1, r1		/* byte swap if start was odd */
	adds		ip, r0, r1	/* add them, producing carry */
#include "arm/arm/cpu_in_cksum_fold.S"
END(cpu_in_cksum_neon)