/*	$NetBSD: cpu_in_cksum.c,v 1.1.102.1 2018/09/06 06:56:44 pgoyette Exp $	*/
/*-
 * Copyright (c) 2008 Joerg Sonnenberger <joerg (at) NetBSD.org>.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: cpu_in_cksum.c,v 1.1.102.1 2018/09/06 06:56:44 pgoyette Exp $");

#include <sys/param.h>
#include <sys/endian.h>
#include <sys/mbuf.h>
#ifdef _KERNEL
#include <sys/systm.h>
#else
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define KASSERT(x) assert(x)
#endif

#include <machine/limits.h>

#include <netinet/in.h>

#ifndef _KERNEL
int	cpu_in_cksum(struct mbuf*, int, int, uint32_t);
#endif
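
/*
 * Note (editorial, not in the original file): when built outside the
 * kernel, e.g. for a standalone userland test harness, no system header
 * declares cpu_in_cksum(), hence the prototype above.
 */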

/*
 * Checksum routine for Internet Protocol family headers (Portable Version).
 *
 * This routine is very heavily used in the network code and should be
 * modified for each CPU to be as fast as possible.
 *
 * A discussion of different implementation techniques can be found in
 * RFC 1071.
 *
 * The default implementation for 32bit architectures uses a 32bit
 * accumulator and operates on 16bit operands.
 *
 * The default implementation for 64bit architectures uses a 64bit
 * accumulator and operates on 32bit operands.
 *
 * Both versions are unrolled to handle 32 byte / 64 byte fragments as the
 * core of the inner loop. After each iteration of the inner loop, a
 * partial reduction is done to avoid carry overflow in long packets.
 */
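
/*
 * Illustrative sketch (editorial, not part of the original file): the
 * reduction trick both versions rely on. A 32bit accumulator of 16bit
 * words is folded down to the final 16bit ones' complement sum roughly
 * like so:
 *
 *	uint32_t sum = ...;			// sum of 16bit words
 *	sum = (sum >> 16) + (sum & 0xffff);	// fold carries back in
 *	sum += (sum >> 16);			// fold the carry of the fold
 *	return ~sum & 0xffff;			// complement for the wire
 *
 * The 64bit version applies the same idea, folding 32bit halves first.
 */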

#if ULONG_MAX == 0xffffffffUL
/* 32bit version */
int
cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum)
{
	int mlen;
	uint32_t sum, partial;
	unsigned int final_acc;
	uint8_t *data;
	bool needs_swap, started_on_odd;

	KASSERT(len >= 0);
	KASSERT(off >= 0);

	needs_swap = false;
	started_on_odd = false;
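	/*
	 * Pre-fold the 32bit initial sum into its 16bit halves so it
	 * leaves headroom for the 16bit-word accumulation below.
	 */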
	sum = (initial_sum >> 16) + (initial_sum & 0xffff);

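	/* Skip mbufs at the head of the chain until off is consumed. */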
	for (;;) {
		if (__predict_false(m == NULL)) {
			printf("in_cksum: out of data\n");
			return -1;
		}
		mlen = m->m_len;
		if (mlen > off) {
			mlen -= off;
			data = mtod(m, uint8_t *) + off;
			goto post_initial_offset;
		}
		off -= mlen;
		if (len == 0)
			break;
		m = m->m_next;
	}

	for (; len > 0; m = m->m_next) {
		if (__predict_false(m == NULL)) {
			printf("in_cksum: out of data\n");
			return -1;
		}
		mlen = m->m_len;
		data = mtod(m, uint8_t *);
 post_initial_offset:
		if (mlen == 0)
			continue;
		if (mlen > len)
			mlen = len;
		len -= mlen;

		partial = 0;
		if ((uintptr_t)data & 1) {
			/* Align on word boundary */
			started_on_odd = !started_on_odd;
#if _BYTE_ORDER == _LITTLE_ENDIAN
			partial = *data << 8;
#else
			partial = *data;
#endif
			++data;
			--mlen;
		}
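		/*
		 * A fragment that starts on an odd byte leaves every
		 * following 16bit word shifted by one byte; remember
		 * this so the partial sum can be rotated back into
		 * place before it is folded into sum.
		 */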
		needs_swap = started_on_odd;
		while (mlen >= 32) {
			__builtin_prefetch(data + 32);
			partial += *(uint16_t *)data;
			partial += *(uint16_t *)(data + 2);
			partial += *(uint16_t *)(data + 4);
			partial += *(uint16_t *)(data + 6);
			partial += *(uint16_t *)(data + 8);
			partial += *(uint16_t *)(data + 10);
			partial += *(uint16_t *)(data + 12);
			partial += *(uint16_t *)(data + 14);
			partial += *(uint16_t *)(data + 16);
			partial += *(uint16_t *)(data + 18);
			partial += *(uint16_t *)(data + 20);
			partial += *(uint16_t *)(data + 22);
			partial += *(uint16_t *)(data + 24);
			partial += *(uint16_t *)(data + 26);
			partial += *(uint16_t *)(data + 28);
			partial += *(uint16_t *)(data + 30);
			data += 32;
			mlen -= 32;
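			/*
			 * If the two top bits are set, another unrolled
			 * pass could overflow the 32bit accumulator, so
			 * fold partial into sum early.
			 */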
			if (__predict_false(partial & 0xc0000000)) {
				if (needs_swap)
					partial = (partial << 8) + (partial >> 24);
				sum += (partial >> 16);
				sum += (partial & 0xffff);
				partial = 0;
			}
		}
		/*
		 * mlen is not updated below because the remaining tests
		 * use bit masks, which are unaffected by the advancing
		 * data pointer.
		 */
		if (mlen & 16) {
			partial += *(uint16_t *)data;
			partial += *(uint16_t *)(data + 2);
			partial += *(uint16_t *)(data + 4);
			partial += *(uint16_t *)(data + 6);
			partial += *(uint16_t *)(data + 8);
			partial += *(uint16_t *)(data + 10);
			partial += *(uint16_t *)(data + 12);
			partial += *(uint16_t *)(data + 14);
			data += 16;
		}
		if (mlen & 8) {
			partial += *(uint16_t *)data;
			partial += *(uint16_t *)(data + 2);
			partial += *(uint16_t *)(data + 4);
			partial += *(uint16_t *)(data + 6);
			data += 8;
		}
		if (mlen & 4) {
			partial += *(uint16_t *)data;
			partial += *(uint16_t *)(data + 2);
			data += 4;
		}
		if (mlen & 2) {
			partial += *(uint16_t *)data;
			data += 2;
		}
		if (mlen & 1) {
#if _BYTE_ORDER == _LITTLE_ENDIAN
			partial += *data;
#else
			partial += *data << 8;
#endif
			started_on_odd = !started_on_odd;
		}

		if (needs_swap)
			partial = (partial << 8) + (partial >> 24);
		sum += (partial >> 16) + (partial & 0xffff);
		/*
		 * Reduce sum to allow potential byte swap
		 * in the next iteration without carry.
		 */
		sum = (sum >> 16) + (sum & 0xffff);
	}
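	/*
	 * Final fold: sum is at most 17 bits wide here, so two folds
	 * leave a pure 16bit value, which is then complemented.
	 */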
	final_acc = ((sum >> 16) & 0xffff) + (sum & 0xffff);
	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	return ~final_acc & 0xffff;
}

#else
/* 64bit version */
int
cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum)
{
	int mlen;
	uint64_t sum, partial;
	unsigned int final_acc;
	uint8_t *data;
	bool needs_swap, started_on_odd;

	KASSERT(len >= 0);
	KASSERT(off >= 0);

	needs_swap = false;
	started_on_odd = false;
	sum = initial_sum;

	for (;;) {
		if (__predict_false(m == NULL)) {
			printf("in_cksum: out of data\n");
			return -1;
		}
		mlen = m->m_len;
		if (mlen > off) {
			mlen -= off;
			data = mtod(m, uint8_t *) + off;
			goto post_initial_offset;
		}
		off -= mlen;
		if (len == 0)
			break;
		m = m->m_next;
	}

	for (; len > 0; m = m->m_next) {
		if (__predict_false(m == NULL)) {
			printf("in_cksum: out of data\n");
			return -1;
		}
		mlen = m->m_len;
		data = mtod(m, uint8_t *);
 post_initial_offset:
		if (mlen == 0)
			continue;
		if (mlen > len)
			mlen = len;
		len -= mlen;

		partial = 0;
		if ((uintptr_t)data & 1) {
			/* Align on word boundary */
			started_on_odd = !started_on_odd;
#if _BYTE_ORDER == _LITTLE_ENDIAN
			partial = *data << 8;
#else
			partial = *data;
#endif
			++data;
			--mlen;
		}
		needs_swap = started_on_odd;
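		/*
		 * Consume a leading 16bit word if needed so that the
		 * unrolled loop below can use aligned 32bit loads.
		 */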
		if ((uintptr_t)data & 2) {
			if (mlen < 2)
				goto trailing_bytes;
			partial += *(uint16_t *)data;
			data += 2;
			mlen -= 2;
		}
		while (mlen >= 64) {
			__builtin_prefetch(data + 32);
			__builtin_prefetch(data + 64);
			partial += *(uint32_t *)data;
			partial += *(uint32_t *)(data + 4);
			partial += *(uint32_t *)(data + 8);
			partial += *(uint32_t *)(data + 12);
			partial += *(uint32_t *)(data + 16);
			partial += *(uint32_t *)(data + 20);
			partial += *(uint32_t *)(data + 24);
			partial += *(uint32_t *)(data + 28);
			partial += *(uint32_t *)(data + 32);
			partial += *(uint32_t *)(data + 36);
			partial += *(uint32_t *)(data + 40);
			partial += *(uint32_t *)(data + 44);
			partial += *(uint32_t *)(data + 48);
			partial += *(uint32_t *)(data + 52);
			partial += *(uint32_t *)(data + 56);
			partial += *(uint32_t *)(data + 60);
			data += 64;
			mlen -= 64;
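			/*
			 * If the two top bits are set, another unrolled
			 * pass could overflow the 64bit accumulator, so
			 * fold partial into sum early.
			 */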
			if (__predict_false(partial & (3ULL << 62))) {
				if (needs_swap)
					partial = (partial << 8) + (partial >> 56);
				sum += (partial >> 32);
				sum += (partial & 0xffffffff);
				partial = 0;
			}
		}
		/*
		 * mlen is not updated below because the remaining tests
		 * use bit masks, which are unaffected by the advancing
		 * data pointer.
		 */
		if (mlen & 32) {
			partial += *(uint32_t *)data;
			partial += *(uint32_t *)(data + 4);
			partial += *(uint32_t *)(data + 8);
			partial += *(uint32_t *)(data + 12);
			partial += *(uint32_t *)(data + 16);
			partial += *(uint32_t *)(data + 20);
			partial += *(uint32_t *)(data + 24);
			partial += *(uint32_t *)(data + 28);
			data += 32;
		}
		if (mlen & 16) {
			partial += *(uint32_t *)data;
			partial += *(uint32_t *)(data + 4);
			partial += *(uint32_t *)(data + 8);
			partial += *(uint32_t *)(data + 12);
			data += 16;
		}
		if (mlen & 8) {
			partial += *(uint32_t *)data;
			partial += *(uint32_t *)(data + 4);
			data += 8;
		}
		if (mlen & 4) {
			partial += *(uint32_t *)data;
			data += 4;
		}
		if (mlen & 2) {
			partial += *(uint16_t *)data;
			data += 2;
		}
 trailing_bytes:
		if (mlen & 1) {
#if _BYTE_ORDER == _LITTLE_ENDIAN
			partial += *data;
#else
			partial += *data << 8;
#endif
			started_on_odd = !started_on_odd;
		}

		if (needs_swap)
			partial = (partial << 8) + (partial >> 56);
		sum += (partial >> 32) + (partial & 0xffffffff);
		/*
		 * Reduce sum to allow potential byte swap
		 * in the next iteration without carry.
		 */
		sum = (sum >> 32) + (sum & 0xffffffff);
	}
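	/*
	 * Final fold: sum is at most 33 bits wide here, so add its
	 * four 16bit chunks together, then fold the carries twice
	 * before complementing.
	 */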
	final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) +
	    ((sum >> 16) & 0xffff) + (sum & 0xffff);
	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	return ~final_acc & 0xffff;
}
#endif
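
/*
 * Usage sketch (editorial, not part of the original file; names are
 * illustrative): the return value is the complemented ones' complement
 * sum of len bytes of the mbuf chain starting at offset off, with
 * initial_sum typically carrying a pseudo-header sum. Verifying a
 * received payload might look roughly like:
 *
 *	uint32_t psum = ...;		// pseudo-header partial sum
 *	if (cpu_in_cksum(m, payload_len, payload_off, psum) != 0)
 *		// checksum failed; drop the packet
 *
 * A correct packet sums to 0xffff, whose complement is 0.
 */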