/*	$NetBSD: cpu_in_cksum_asm_neon.S,v 1.5 2025/06/19 22:00:54 andvar Exp $	*/

/*-
 * Copyright (c) 2012 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Matt Thomas of 3am Software Foundry.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

RCSID("$NetBSD: cpu_in_cksum_asm_neon.S,v 1.5 2025/06/19 22:00:54 andvar Exp $")

/*
 * uint32_t
 * cpu_in_cksum_neon(const void *dptr, size_t dlen)
 *
 *	r0 = dptr
 *	r1 = dlen
 */
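/*
 * For reference, a rough C sketch of what this routine computes on a
 * little-endian kernel: a 32-bit partial sum of the buffer taken as
 * 16-bit words, with the carry folding done by the shared tail included
 * from cpu_in_cksum_fold.S.  This is an illustration only; the helper
 * name below is made up, and the odd-start-address byte swap done by
 * the rev16 instructions at the end is omitted:
 *
 *	static uint32_t
 *	reference_partial_sum(const uint8_t *p, size_t len)
 *	{
 *		uint32_t sum = 0;
 *		size_t i;
 *
 *		for (i = 0; i + 1 < len; i += 2)
 *			sum += (uint32_t)p[i] | ((uint32_t)p[i + 1] << 8);
 *		if (len & 1)
 *			sum += p[len - 1];	// trailing odd byte
 *		return sum;			// carries folded afterwards
 *	}
 */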
ENTRY(cpu_in_cksum_neon)
	mov		ip, r0		/* leave r0 as temp */
	add		r3, r1, ip	/* get end pointer */
	and		r1, ip, #7	/* get start offset (leading bytes) */
	and		r2, r3, #7	/* get end offset (trailing bytes) */
	bic		ip, ip, #7	/* start on a dword boundary */
	add		r3, r3, #7	/* round up to a dword boundary */
	bic		r3, r3, #7	/* end on a dword boundary */
	veor		q2, q2, q2	/* clear accumulator */
	vmvn.u64	q1, q2		/* create leading/trailing masks */
	/*
	 * Normally the lower-addressed dword is in d6, but in this case we
	 * want to reverse that: we might only have a single dword, and the
	 * final fold wants the dword to be trimmed in d7, so put the first
	 * dword in d7 until we know we are going to read more than one.
	 */
	veor		d6, d6, d6	/* clear second dword */
	vld1.64		{d7}, [ip:64]!	/* load first dword */
	orrs		r0, r1, r2	/* do we have any offsets? */
	beq		.Lpre_main_loop	/*   no, proceed to main loop. */
	mov		r1, r1, lsl #3	/* leading bytes -> bits */
	movs		r2, r2, lsl #3	/* trailing bytes -> bits */
#ifdef __ARMEL__
	subne		r2, r2, #64	/* trim trailing MSBs */
#else
	rsb		r1, r1, #0	/* trim leading MSBs */
	rsbne		r2, r2, #64	/* trim trailing LSBs */
#endif
	vmov		d0, r1, r2	/* move shifts */
	vmovl.u32	q0, d0		/* 2 U32 -> 2 U64 */
	vshl.u64	q1, q1, q0	/* apply shifts to masks */
	vand.u32	d7, d7, d2	/* apply leading mask to 1st dword */
	tst		r1, #8		/* was the starting address odd? */
	beq		.Lpre_main_loop	/*   no, go to pre_main_loop */
	veor		d2, d2, d2	/* clear d2 (indicate odd addr) */
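	/*
	 * On a little-endian (__ARMEL__) build the mask setup above is
	 * roughly the following C (a sketch for illustration only), where
	 * lead and trail are the byte offsets of dptr and dptr + dlen
	 * within their dwords:
	 *
	 *	uint64_t lead_mask = ~(uint64_t)0 << (8 * lead);
	 *	uint64_t trail_mask = (trail != 0)
	 *	    ? ~(uint64_t)0 >> (64 - 8 * trail)
	 *	    : ~(uint64_t)0;
	 *
	 * lead_mask (d2) zeroes the bytes before dptr in the first dword;
	 * trail_mask (d3) keeps only the bytes below the end pointer in
	 * the last dword.  d2 also doubles as a flag: it is cleared above
	 * when dptr is odd so the final fold knows to byte swap the
	 * partial sums.
	 */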

.Lpre_main_loop:
	cmp		ip, r3		/* do we just have a single dword? */
	beq		.Lfinish_up	/*   yes, let's finish up! */
	vmov		d6, d7		/* move 1st dword to loaddr reg */
	vld1.64		{d7}, [ip:64]!	/* read rest of initial qword */

.Lmain_loop:
	subs		r1, r3, ip	/* how much left to do? */
	beq		.Lfinish_up	/*   = 0? we are done. */

	bics		r0, r1, #31	/* we deal with octawords only */
	beq		.Lloop_end	/*   no octawords? exit loop */
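	/*
	 * Compute a jump into the unrolled loop below.  Each 32-byte
	 * chunk of .Lloop128 is 10 instructions (40 bytes of code), so
	 * the number of data bytes to skip is scaled by 40/32 = 1.25 to
	 * turn it into a code offset.  Reading pc yields the address of
	 * the current instruction plus 8; the nop pads things so that an
	 * offset of 0 would land at .Lloop128.
	 */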
	rsbs		r0, r0, #128	/* subtract from 128 */
	ble		.Lloop128	/*   <= 0? do 128 at a time. */
	add		r0, r0, r0, lsr #2 /* multiply by 1.25 */
	add		pc, pc, r0	/* and jump! */
	nop

.Lloop128:
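	/*
	 * Each vmovl.u16/vadd.u32 pair below widens one dword's four u16
	 * lanes and adds them into the four u32 lanes of q2; in rough C
	 * (an illustration only, p and acc are made-up names):
	 *
	 *	const uint16_t *w = (const uint16_t *)p;
	 *	for (int i = 0; i < 4; i++)
	 *		acc[i] += w[i];		// four u32 lanes of q2
	 *	p += 8;
	 *
	 * The four lane sums are only combined at .Lfinish_up.
	 */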
	vld1.64		{d8-d9}, [ip:64]!	/* 128 left */
	vmovl.u16	q0, d6		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vmovl.u16	q0, d7		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vld1.64		{d6-d7}, [ip:64]!
	vmovl.u16	q0, d8		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vmovl.u16	q0, d9		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */

	vld1.64		{d8-d9}, [ip:64]!	/* 96 left */
	vmovl.u16	q0, d6		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vmovl.u16	q0, d7		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vld1.64		{d6-d7}, [ip:64]!
	vmovl.u16	q0, d8		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vmovl.u16	q0, d9		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */

	vld1.64		{d8-d9}, [ip:64]!	/* 64 left */
	vmovl.u16	q0, d6		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vmovl.u16	q0, d7		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vld1.64		{d6-d7}, [ip:64]!
	vmovl.u16	q0, d8		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vmovl.u16	q0, d9		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */

	vld1.64		{d8-d9}, [ip:64]!	/* 32 left */
	vmovl.u16	q0, d6		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vmovl.u16	q0, d7		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vld1.64		{d6-d7}, [ip:64]!
	vmovl.u16	q0, d8		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vmovl.u16	q0, d9		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */

	b		.Lmain_loop

.Lloop_end:
	/*
	 * We have one to three more dwords to process.
	 */
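	/*
	 * Computed jump again: each dword still unread gets one more
	 * 3-instruction (12-byte) group below, so the data bytes to skip
	 * are scaled by 12/8 = 1.5 to form the code offset, with the nop
	 * covering the pc-read bias as before.
	 */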
	rsb		r0, r1, #24
	add		r0, r0, r0, lsr #1
	add		pc, pc, r0
	nop
	vmovl.u16	q0, d6		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vld1.64		{d6}, [ip:64]!
	vmovl.u16	q0, d6		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vld1.64		{d6}, [ip:64]!
	vmovl.u16	q0, d7		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vld1.64		{d7}, [ip:64]!

.Lfinish_up:
	/*
	 * Apply remaining data in d6 and d7
	 */
	vmovl.u16	q0, d6		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */
	vand		d7, d7, d3	/* apply trailing mask */
	vmovl.u16	q0, d7		/* 4 U16 -> 4 U32 */
	vadd.u32	q2, q2, q0	/* add 4 U32 to accumulator */

	/*
	 * We now have four 32-bit sums in q2 (each 20 bits or less).
	 * Reduce them to a single 32-bit sum.
	 */
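	/*
	 * In rough C terms (an illustration only; the names below are
	 * made up, and fold() stands for the shared tail included from
	 * cpu_in_cksum_fold.S):
	 *
	 *	sum0 = lane[0] + lane[2];	// vadd.u32 d4, d4, d5
	 *	sum1 = lane[1] + lane[3];
	 *	if (start_was_odd) {		// d2 was cleared earlier
	 *		sum0 = rev16(sum0);	// swap bytes in each half
	 *		sum1 = rev16(sum1);
	 *	}
	 *	return fold((uint64_t)sum0 + sum1);
	 *
	 * The byte swap works because the ones-complement sum of
	 * byte-swapped 16-bit words is the byte swap of the sum.
	 */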
	vadd.u32	d4, d4, d5	/* 4 I32 -> 2 I32 */
	vmov		r2, s4		/* get flag for odd start */
	teq		r2, #0		/* was start addr even? */
	vmov		r0, r1, d4	/* extract two I32 */
	rev16eq		r0, r0		/* byte swap if start was odd */
	rev16eq		r1, r1		/* byte swap if start was odd */
	adds		ip, r0, r1	/* add them producing carry */
#include "arm/arm/cpu_in_cksum_fold.S"
END(cpu_in_cksum_neon)