/*	$NetBSD: cpu_in_cksum.c,v 1.1.12.2 2008/03/23 02:05:06 matt Exp $	*/
/*-
 * Copyright (c) 2008 Joerg Sonnenberger <joerg (at) NetBSD.org>.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: cpu_in_cksum.c,v 1.1.12.2 2008/03/23 02:05:06 matt Exp $");

#include <sys/param.h>
#include <sys/endian.h>
#include <sys/mbuf.h>
#ifdef _KERNEL
#include <sys/systm.h>
#else
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define KASSERT(x) assert(x)
#endif

#include <machine/limits.h>

#include <netinet/in.h>

#ifndef _KERNEL
int	cpu_in_cksum(struct mbuf*, int, int, uint32_t);
#endif

/*
 * Checksum routine for Internet Protocol family headers (Portable Version).
 *
 * This routine is very heavily used in the network
 * code and should be modified for each CPU to be as fast as possible.
 *
 * A discussion of different implementation techniques can be found in
 * RFC 1071.
 *
 * The default implementation for 32bit architectures uses a 32bit
 * accumulator and operates on 16bit operands.
 *
 * The default implementation for 64bit architectures uses a 64bit
 * accumulator and operates on 32bit operands.
 *
 * Both versions are unrolled to handle 32-byte / 64-byte fragments as the
 * core of the inner loop. After each iteration of the inner loop, a partial
 * reduction is done to prevent the accumulator from overflowing on long
 * packets.
 */

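/*
 * For reference, a minimal non-unrolled rendition of the same technique
 * (an illustrative sketch only, kept out of the build): sum 16bit words
 * into a wider accumulator, fold the carries back into the low 16 bits,
 * and return the one's complement.
 */
#if 0
static uint16_t
in_cksum_ref(const uint8_t *data, size_t len, uint32_t sum)
{
	while (len >= 2) {
		sum += *(const uint16_t *)data;
		data += 2;
		len -= 2;
	}
	if (len == 1) {
#if _BYTE_ORDER == _LITTLE_ENDIAN
		sum += *data;		/* trailing byte is padded with zero */
#else
		sum += *data << 8;
#endif
	}
	/* Fold carries until the sum fits in 16 bits. */
	while (sum > 0xffff)
		sum = (sum >> 16) + (sum & 0xffff);
	return ~sum & 0xffff;
}
#endif
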
#if ULONG_MAX == 0xffffffffUL
/* 32bit version */
int
cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum)
{
	int mlen;
	uint32_t sum, partial;
	unsigned int final_acc;
	uint8_t *data;
	bool needs_swap, started_on_odd;

	KASSERT(len >= 0);
	KASSERT(off >= 0);

	needs_swap = false;
	started_on_odd = false;
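	/* Fold the 32bit seed once so sum starts out at most 17 bits wide. */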
	sum = (initial_sum >> 16) + (initial_sum & 0xffff);

	for (;;) {
		if (__predict_false(m == NULL)) {
			printf("in_cksum: out of data\n");
			return -1;
		}
		mlen = m->m_len;
		if (mlen > off) {
			mlen -= off;
			data = mtod(m, uint8_t *) + off;
			goto post_initial_offset;
		}
		off -= mlen;
		if (len == 0)
			break;
		m = m->m_next;
	}

	for (; len > 0; m = m->m_next) {
		if (__predict_false(m == NULL)) {
			printf("in_cksum: out of data\n");
			return -1;
		}
		mlen = m->m_len;
		data = mtod(m, uint8_t *);
 post_initial_offset:
		if (mlen == 0)
			continue;
		if (mlen > len)
			mlen = len;
		len -= mlen;

		partial = 0;
		if ((uintptr_t)data & 1) {
			/* Align on word boundary */
			started_on_odd = !started_on_odd;
#if _BYTE_ORDER == _LITTLE_ENDIAN
			partial = *data << 8;
#else
			partial = *data;
#endif
			++data;
			--mlen;
		}
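		/*
		 * One's complement summation is endian-independent, but if
		 * this chunk started on an odd byte of the overall data
		 * stream, every 16bit word read below is shifted by one
		 * byte relative to the checksum's word boundaries.  The
		 * partial sum is byte-rotated before folding to compensate.
		 */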
		needs_swap = started_on_odd;
		while (mlen >= 32) {
			__builtin_prefetch(data + 32);
			partial += *(uint16_t *)data;
			partial += *(uint16_t *)(data + 2);
			partial += *(uint16_t *)(data + 4);
			partial += *(uint16_t *)(data + 6);
			partial += *(uint16_t *)(data + 8);
			partial += *(uint16_t *)(data + 10);
			partial += *(uint16_t *)(data + 12);
			partial += *(uint16_t *)(data + 14);
			partial += *(uint16_t *)(data + 16);
			partial += *(uint16_t *)(data + 18);
			partial += *(uint16_t *)(data + 20);
			partial += *(uint16_t *)(data + 22);
			partial += *(uint16_t *)(data + 24);
			partial += *(uint16_t *)(data + 26);
			partial += *(uint16_t *)(data + 28);
			partial += *(uint16_t *)(data + 30);
			data += 32;
			mlen -= 32;
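			/*
			 * Each unrolled pass adds at most 16 * 0xffff
			 * (under 2^20) to partial, so folding into sum
			 * whenever the top two bits are set guarantees
			 * that partial can never overflow 32 bits.
			 */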
			if (__predict_false(partial & 0xc0000000)) {
				if (needs_swap)
					partial = (partial << 8) + (partial >> 24);
				sum += (partial >> 16);
				sum += (partial & 0xffff);
				partial = 0;
			}
		}
		if (mlen & 16) {
			partial += *(uint16_t *)data;
			partial += *(uint16_t *)(data + 2);
			partial += *(uint16_t *)(data + 4);
			partial += *(uint16_t *)(data + 6);
			partial += *(uint16_t *)(data + 8);
			partial += *(uint16_t *)(data + 10);
			partial += *(uint16_t *)(data + 12);
			partial += *(uint16_t *)(data + 14);
			data += 16;
			mlen -= 16;
		}
		/*
		 * mlen is not updated below as the remaining tests
		 * use bit masks, which are not affected.
		 */
		if (mlen & 8) {
			partial += *(uint16_t *)data;
			partial += *(uint16_t *)(data + 2);
			partial += *(uint16_t *)(data + 4);
			partial += *(uint16_t *)(data + 6);
			data += 8;
		}
		if (mlen & 4) {
			partial += *(uint16_t *)data;
			partial += *(uint16_t *)(data + 2);
			data += 4;
		}
		if (mlen & 2) {
			partial += *(uint16_t *)data;
			data += 2;
		}
		if (mlen & 1) {
#if _BYTE_ORDER == _LITTLE_ENDIAN
			partial += *data;
#else
			partial += *data << 8;
#endif
			started_on_odd = !started_on_odd;
		}

		if (needs_swap)
			partial = (partial << 8) + (partial >> 24);
		sum += (partial >> 16) + (partial & 0xffff);
		/*
		 * Reduce sum to allow potential byte swap
		 * in the next iteration without carry.
		 */
		sum = (sum >> 16) + (sum & 0xffff);
	}
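	/*
	 * sum was reduced after each chunk, so it is at most 17 bits
	 * here; two folds bring it down to 16 bits, as the first fold
	 * can itself carry into bit 16.
	 */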
	final_acc = ((sum >> 16) & 0xffff) + (sum & 0xffff);
	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	return ~final_acc & 0xffff;
}

#else
/* 64bit version */
int
cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum)
{
	int mlen;
	uint64_t sum, partial;
	unsigned int final_acc;
	uint8_t *data;
	bool needs_swap, started_on_odd;

	KASSERT(len >= 0);
	KASSERT(off >= 0);

	needs_swap = false;
	started_on_odd = false;
	sum = initial_sum;

	for (;;) {
		if (__predict_false(m == NULL)) {
			printf("in_cksum: out of data\n");
			return -1;
		}
		mlen = m->m_len;
		if (mlen > off) {
			mlen -= off;
			data = mtod(m, uint8_t *) + off;
			goto post_initial_offset;
		}
		off -= mlen;
		if (len == 0)
			break;
		m = m->m_next;
	}

	for (; len > 0; m = m->m_next) {
		if (__predict_false(m == NULL)) {
			printf("in_cksum: out of data\n");
			return -1;
		}
		mlen = m->m_len;
		data = mtod(m, uint8_t *);
 post_initial_offset:
		if (mlen == 0)
			continue;
		if (mlen > len)
			mlen = len;
		len -= mlen;

		partial = 0;
		if ((uintptr_t)data & 1) {
			/* Align on word boundary */
			started_on_odd = !started_on_odd;
#if _BYTE_ORDER == _LITTLE_ENDIAN
			partial = *data << 8;
#else
			partial = *data;
#endif
			++data;
			--mlen;
		}
		needs_swap = started_on_odd;
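		/*
		 * Align to a 32bit boundary for the word loads below; if
		 * fewer than two bytes remain, skip ahead to the final
		 * odd-byte handling.
		 */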
		if ((uintptr_t)data & 2) {
			if (mlen < 2)
				goto trailing_bytes;
			partial += *(uint16_t *)data;
			data += 2;
			mlen -= 2;
		}
		while (mlen >= 64) {
			__builtin_prefetch(data + 32);
			__builtin_prefetch(data + 64);
			partial += *(uint32_t *)data;
			partial += *(uint32_t *)(data + 4);
			partial += *(uint32_t *)(data + 8);
			partial += *(uint32_t *)(data + 12);
			partial += *(uint32_t *)(data + 16);
			partial += *(uint32_t *)(data + 20);
			partial += *(uint32_t *)(data + 24);
			partial += *(uint32_t *)(data + 28);
			partial += *(uint32_t *)(data + 32);
			partial += *(uint32_t *)(data + 36);
			partial += *(uint32_t *)(data + 40);
			partial += *(uint32_t *)(data + 44);
			partial += *(uint32_t *)(data + 48);
			partial += *(uint32_t *)(data + 52);
			partial += *(uint32_t *)(data + 56);
			partial += *(uint32_t *)(data + 60);
			data += 64;
			mlen -= 64;
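			/*
			 * As in the 32bit version: each pass adds at most
			 * 16 * 0xffffffff (under 2^36) to partial, so
			 * folding when the top two bits are set keeps
			 * partial from ever overflowing 64 bits.
			 */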
			if (__predict_false(partial & (3ULL << 62))) {
				if (needs_swap)
					partial = (partial << 8) + (partial >> 56);
				sum += (partial >> 32);
				sum += (partial & 0xffffffff);
				partial = 0;
			}
		}
		/*
		 * mlen is not updated below as the remaining tests
		 * use bit masks, which are not affected.
		 */
		if (mlen & 32) {
			partial += *(uint32_t *)data;
			partial += *(uint32_t *)(data + 4);
			partial += *(uint32_t *)(data + 8);
			partial += *(uint32_t *)(data + 12);
			partial += *(uint32_t *)(data + 16);
			partial += *(uint32_t *)(data + 20);
			partial += *(uint32_t *)(data + 24);
			partial += *(uint32_t *)(data + 28);
			data += 32;
		}
		if (mlen & 16) {
			partial += *(uint32_t *)data;
			partial += *(uint32_t *)(data + 4);
			partial += *(uint32_t *)(data + 8);
			partial += *(uint32_t *)(data + 12);
			data += 16;
		}
		if (mlen & 8) {
			partial += *(uint32_t *)data;
			partial += *(uint32_t *)(data + 4);
			data += 8;
		}
		if (mlen & 4) {
			partial += *(uint32_t *)data;
			data += 4;
		}
		if (mlen & 2) {
			partial += *(uint16_t *)data;
			data += 2;
		}
 trailing_bytes:
		if (mlen & 1) {
#if _BYTE_ORDER == _LITTLE_ENDIAN
			partial += *data;
#else
			partial += *data << 8;
#endif
			started_on_odd = !started_on_odd;
		}

		if (needs_swap)
			partial = (partial << 8) + (partial >> 56);
		sum += (partial >> 32) + (partial & 0xffffffff);
		/*
		 * Reduce sum to allow potential byte swap
		 * in the next iteration without carry.
		 */
		sum = (sum >> 32) + (sum & 0xffffffff);
	}
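	/*
	 * Fold the (already partially reduced) 64bit sum down to 16 bits:
	 * add the four 16bit halfwords, then fold twice more since those
	 * additions can carry past bit 16.
	 */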
	final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) +
	    ((sum >> 16) & 0xffff) + (sum & 0xffff);
	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	return ~final_acc & 0xffff;
}
#endif
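
/*
 * Usage sketch (hypothetical helper, for illustration only; not part of
 * the build): checksumming a flat buffer by wrapping it in a single,
 * stack-allocated mbuf.  cpu_in_cksum() returns -1 if the chain runs
 * out of data before len bytes are consumed.
 */
#if 0
int
cksum_flat(void *buf, int len)
{
	struct mbuf m;	/* hypothetical single-entry chain */

	memset(&m, 0, sizeof(m));
	m.m_next = NULL;
	m.m_data = buf;
	m.m_len = len;
	return cpu_in_cksum(&m, len, 0, 0);
}
#endif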