Home | History | Annotate | Line # | Download | only in netinet
cpu_in_cksum.c revision 1.1.104.1
      1  1.1.104.1  christos /*	$NetBSD: cpu_in_cksum.c,v 1.1.104.1 2019/06/10 22:09:47 christos Exp $	*/
      2        1.1     joerg /*-
      3        1.1     joerg  * Copyright (c) 2008 Joerg Sonnenberger <joerg (at) NetBSD.org>.
      4        1.1     joerg  * All rights reserved.
      5        1.1     joerg  *
      6        1.1     joerg  * Redistribution and use in source and binary forms, with or without
      7        1.1     joerg  * modification, are permitted provided that the following conditions
      8        1.1     joerg  * are met:
      9        1.1     joerg  *
     10        1.1     joerg  * 1. Redistributions of source code must retain the above copyright
     11        1.1     joerg  *    notice, this list of conditions and the following disclaimer.
     12        1.1     joerg  * 2. Redistributions in binary form must reproduce the above copyright
     13        1.1     joerg  *    notice, this list of conditions and the following disclaimer in
     14        1.1     joerg  *    the documentation and/or other materials provided with the
     15        1.1     joerg  *    distribution.
     16        1.1     joerg  *
     17        1.1     joerg  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     18        1.1     joerg  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     19        1.1     joerg  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
     20        1.1     joerg  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
     21        1.1     joerg  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
     22        1.1     joerg  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
     23        1.1     joerg  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     24        1.1     joerg  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
     25        1.1     joerg  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
     26        1.1     joerg  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
     27        1.1     joerg  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     28        1.1     joerg  * SUCH DAMAGE.
     29        1.1     joerg  */
     30        1.1     joerg 
     31        1.1     joerg #include <sys/cdefs.h>
     32  1.1.104.1  christos __KERNEL_RCSID(0, "$NetBSD: cpu_in_cksum.c,v 1.1.104.1 2019/06/10 22:09:47 christos Exp $");
     33        1.1     joerg 
     34        1.1     joerg #include <sys/param.h>
     35        1.1     joerg #include <sys/endian.h>
     36        1.1     joerg #include <sys/mbuf.h>
     37        1.1     joerg #ifdef _KERNEL
     38        1.1     joerg #include <sys/systm.h>
     39        1.1     joerg #else
     40        1.1     joerg #include <assert.h>
     41        1.1     joerg #include <stdbool.h>
     42        1.1     joerg #include <stdio.h>
     43        1.1     joerg 
     44        1.1     joerg #define KASSERT(x) assert(x)
     45        1.1     joerg #endif
     46        1.1     joerg 
     47        1.1     joerg #include <machine/limits.h>
     48        1.1     joerg 
     49        1.1     joerg #include <netinet/in.h>
     50        1.1     joerg 
     51        1.1     joerg #ifndef _KERNEL
     52        1.1     joerg int	cpu_in_cksum(struct mbuf*, int, int, uint32_t);
     53        1.1     joerg #endif
     54        1.1     joerg 
     55        1.1     joerg /*
     56        1.1     joerg  * Checksum routine for Internet Protocol family headers (Portable Version).
     57        1.1     joerg  *
     58        1.1     joerg  * This routine is very heavily used in the network
     59        1.1     joerg  * code and should be modified for each CPU to be as fast as possible.
     60        1.1     joerg  *
     61        1.1     joerg  * A discussion of different implementation techniques can be found in
     62        1.1     joerg  * RFC 1071.
     63        1.1     joerg  *
     64        1.1     joerg  * The default implementation for 32bit architectures is using
     65        1.1     joerg  * a 32bit accumulator and operating on 16bit operands.
     66        1.1     joerg  *
     67        1.1     joerg  * The default implementation for 64bit architectures is using
     68        1.1     joerg  * a 64bit accumulator and operating on 32bit operands.
     69        1.1     joerg  *
     70        1.1     joerg  * Both versions are unrolled to handle 32 Byte / 64 Byte fragments as core
     71        1.1     joerg  * of the inner loop. After each iteration of the inner loop, a partial
     72        1.1     joerg  * reduction is done to avoid carry in long packets.
     73        1.1     joerg  */
     74        1.1     joerg 
     75        1.1     joerg #if ULONG_MAX == 0xffffffffUL
     76        1.1     joerg /* 32bit version */
     77        1.1     joerg int
     78        1.1     joerg cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum)
     79        1.1     joerg {
     80        1.1     joerg 	int mlen;
     81        1.1     joerg 	uint32_t sum, partial;
     82        1.1     joerg 	unsigned int final_acc;
     83        1.1     joerg 	uint8_t *data;
     84        1.1     joerg 	bool needs_swap, started_on_odd;
     85        1.1     joerg 
     86        1.1     joerg 	KASSERT(len >= 0);
     87        1.1     joerg 	KASSERT(off >= 0);
     88        1.1     joerg 
     89        1.1     joerg 	needs_swap = false;
     90        1.1     joerg 	started_on_odd = false;
     91        1.1     joerg 	sum = (initial_sum >> 16) + (initial_sum & 0xffff);
     92        1.1     joerg 
     93        1.1     joerg 	for (;;) {
     94        1.1     joerg 		if (__predict_false(m == NULL)) {
     95        1.1     joerg 			printf("in_cksum: out of data\n");
     96        1.1     joerg 			return -1;
     97        1.1     joerg 		}
     98        1.1     joerg 		mlen = m->m_len;
     99        1.1     joerg 		if (mlen > off) {
    100        1.1     joerg 			mlen -= off;
    101        1.1     joerg 			data = mtod(m, uint8_t *) + off;
    102        1.1     joerg 			goto post_initial_offset;
    103        1.1     joerg 		}
    104        1.1     joerg 		off -= mlen;
    105        1.1     joerg 		if (len == 0)
    106        1.1     joerg 			break;
    107        1.1     joerg 		m = m->m_next;
    108        1.1     joerg 	}
    109        1.1     joerg 
    110        1.1     joerg 	for (; len > 0; m = m->m_next) {
    111        1.1     joerg 		if (__predict_false(m == NULL)) {
    112        1.1     joerg 			printf("in_cksum: out of data\n");
    113        1.1     joerg 			return -1;
    114        1.1     joerg 		}
    115        1.1     joerg 		mlen = m->m_len;
    116        1.1     joerg 		data = mtod(m, uint8_t *);
    117        1.1     joerg  post_initial_offset:
    118        1.1     joerg 		if (mlen == 0)
    119        1.1     joerg 			continue;
    120        1.1     joerg 		if (mlen > len)
    121        1.1     joerg 			mlen = len;
    122        1.1     joerg 		len -= mlen;
    123        1.1     joerg 
    124        1.1     joerg 		partial = 0;
    125        1.1     joerg 		if ((uintptr_t)data & 1) {
    126        1.1     joerg 			/* Align on word boundary */
    127        1.1     joerg 			started_on_odd = !started_on_odd;
    128        1.1     joerg #if _BYTE_ORDER == _LITTLE_ENDIAN
    129        1.1     joerg 			partial = *data << 8;
    130        1.1     joerg #else
    131        1.1     joerg 			partial = *data;
    132        1.1     joerg #endif
    133        1.1     joerg 			++data;
    134        1.1     joerg 			--mlen;
    135        1.1     joerg 		}
    136        1.1     joerg 		needs_swap = started_on_odd;
    137        1.1     joerg 		while (mlen >= 32) {
    138        1.1     joerg 			__builtin_prefetch(data + 32);
    139        1.1     joerg 			partial += *(uint16_t *)data;
    140        1.1     joerg 			partial += *(uint16_t *)(data + 2);
    141        1.1     joerg 			partial += *(uint16_t *)(data + 4);
    142        1.1     joerg 			partial += *(uint16_t *)(data + 6);
    143        1.1     joerg 			partial += *(uint16_t *)(data + 8);
    144        1.1     joerg 			partial += *(uint16_t *)(data + 10);
    145        1.1     joerg 			partial += *(uint16_t *)(data + 12);
    146        1.1     joerg 			partial += *(uint16_t *)(data + 14);
    147        1.1     joerg 			partial += *(uint16_t *)(data + 16);
    148        1.1     joerg 			partial += *(uint16_t *)(data + 18);
    149        1.1     joerg 			partial += *(uint16_t *)(data + 20);
    150        1.1     joerg 			partial += *(uint16_t *)(data + 22);
    151        1.1     joerg 			partial += *(uint16_t *)(data + 24);
    152        1.1     joerg 			partial += *(uint16_t *)(data + 26);
    153        1.1     joerg 			partial += *(uint16_t *)(data + 28);
    154        1.1     joerg 			partial += *(uint16_t *)(data + 30);
    155        1.1     joerg 			data += 32;
    156        1.1     joerg 			mlen -= 32;
    157        1.1     joerg 			if (__predict_false(partial & 0xc0000000)) {
    158        1.1     joerg 				if (needs_swap)
    159        1.1     joerg 					partial = (partial << 8) + (partial >> 24);
    160        1.1     joerg 				sum += (partial >> 16);
    161        1.1     joerg 				sum += (partial & 0xffff);
    162        1.1     joerg 				partial = 0;
    163        1.1     joerg 			}
    164        1.1     joerg 		}
    165  1.1.104.1  christos 		/*
    166  1.1.104.1  christos 		 * mlen is not updated below as the remaining tests
    167  1.1.104.1  christos 		 * are using bit masks, which are not affected.
    168  1.1.104.1  christos 		 */
    169        1.1     joerg 		if (mlen & 16) {
    170        1.1     joerg 			partial += *(uint16_t *)data;
    171        1.1     joerg 			partial += *(uint16_t *)(data + 2);
    172        1.1     joerg 			partial += *(uint16_t *)(data + 4);
    173        1.1     joerg 			partial += *(uint16_t *)(data + 6);
    174        1.1     joerg 			partial += *(uint16_t *)(data + 8);
    175        1.1     joerg 			partial += *(uint16_t *)(data + 10);
    176        1.1     joerg 			partial += *(uint16_t *)(data + 12);
    177        1.1     joerg 			partial += *(uint16_t *)(data + 14);
    178        1.1     joerg 			data += 16;
    179        1.1     joerg 		}
    180        1.1     joerg 		if (mlen & 8) {
    181        1.1     joerg 			partial += *(uint16_t *)data;
    182        1.1     joerg 			partial += *(uint16_t *)(data + 2);
    183        1.1     joerg 			partial += *(uint16_t *)(data + 4);
    184        1.1     joerg 			partial += *(uint16_t *)(data + 6);
    185        1.1     joerg 			data += 8;
    186        1.1     joerg 		}
    187        1.1     joerg 		if (mlen & 4) {
    188        1.1     joerg 			partial += *(uint16_t *)data;
    189        1.1     joerg 			partial += *(uint16_t *)(data + 2);
    190        1.1     joerg 			data += 4;
    191        1.1     joerg 		}
    192        1.1     joerg 		if (mlen & 2) {
    193        1.1     joerg 			partial += *(uint16_t *)data;
    194        1.1     joerg 			data += 2;
    195        1.1     joerg 		}
    196        1.1     joerg 		if (mlen & 1) {
    197        1.1     joerg #if _BYTE_ORDER == _LITTLE_ENDIAN
    198        1.1     joerg 			partial += *data;
    199        1.1     joerg #else
    200        1.1     joerg 			partial += *data << 8;
    201        1.1     joerg #endif
    202        1.1     joerg 			started_on_odd = !started_on_odd;
    203        1.1     joerg 		}
    204        1.1     joerg 
    205        1.1     joerg 		if (needs_swap)
    206        1.1     joerg 			partial = (partial << 8) + (partial >> 24);
    207        1.1     joerg 		sum += (partial >> 16) + (partial & 0xffff);
    208        1.1     joerg 		/*
    209        1.1     joerg 		 * Reduce sum to allow potential byte swap
    210        1.1     joerg 		 * in the next iteration without carry.
    211        1.1     joerg 		 */
    212        1.1     joerg 		sum = (sum >> 16) + (sum & 0xffff);
    213        1.1     joerg 	}
    214        1.1     joerg 	final_acc = ((sum >> 16) & 0xffff) + (sum & 0xffff);
    215        1.1     joerg 	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
    216        1.1     joerg 	return ~final_acc & 0xffff;
    217        1.1     joerg }
    218        1.1     joerg 
    219        1.1     joerg #else
    220        1.1     joerg /* 64bit version */
    221        1.1     joerg int
    222        1.1     joerg cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum)
    223        1.1     joerg {
    224        1.1     joerg 	int mlen;
    225        1.1     joerg 	uint64_t sum, partial;
    226        1.1     joerg 	unsigned int final_acc;
    227        1.1     joerg 	uint8_t *data;
    228        1.1     joerg 	bool needs_swap, started_on_odd;
    229        1.1     joerg 
    230        1.1     joerg 	KASSERT(len >= 0);
    231        1.1     joerg 	KASSERT(off >= 0);
    232        1.1     joerg 
    233        1.1     joerg 	needs_swap = false;
    234        1.1     joerg 	started_on_odd = false;
    235        1.1     joerg 	sum = initial_sum;
    236        1.1     joerg 
    237        1.1     joerg 	for (;;) {
    238        1.1     joerg 		if (__predict_false(m == NULL)) {
    239        1.1     joerg 			printf("in_cksum: out of data\n");
    240        1.1     joerg 			return -1;
    241        1.1     joerg 		}
    242        1.1     joerg 		mlen = m->m_len;
    243        1.1     joerg 		if (mlen > off) {
    244        1.1     joerg 			mlen -= off;
    245        1.1     joerg 			data = mtod(m, uint8_t *) + off;
    246        1.1     joerg 			goto post_initial_offset;
    247        1.1     joerg 		}
    248        1.1     joerg 		off -= mlen;
    249        1.1     joerg 		if (len == 0)
    250        1.1     joerg 			break;
    251        1.1     joerg 		m = m->m_next;
    252        1.1     joerg 	}
    253        1.1     joerg 
    254        1.1     joerg 	for (; len > 0; m = m->m_next) {
    255        1.1     joerg 		if (__predict_false(m == NULL)) {
    256        1.1     joerg 			printf("in_cksum: out of data\n");
    257        1.1     joerg 			return -1;
    258        1.1     joerg 		}
    259        1.1     joerg 		mlen = m->m_len;
    260        1.1     joerg 		data = mtod(m, uint8_t *);
    261        1.1     joerg  post_initial_offset:
    262        1.1     joerg 		if (mlen == 0)
    263        1.1     joerg 			continue;
    264        1.1     joerg 		if (mlen > len)
    265        1.1     joerg 			mlen = len;
    266        1.1     joerg 		len -= mlen;
    267        1.1     joerg 
    268        1.1     joerg 		partial = 0;
    269        1.1     joerg 		if ((uintptr_t)data & 1) {
    270        1.1     joerg 			/* Align on word boundary */
    271        1.1     joerg 			started_on_odd = !started_on_odd;
    272        1.1     joerg #if _BYTE_ORDER == _LITTLE_ENDIAN
    273        1.1     joerg 			partial = *data << 8;
    274        1.1     joerg #else
    275        1.1     joerg 			partial = *data;
    276        1.1     joerg #endif
    277        1.1     joerg 			++data;
    278        1.1     joerg 			--mlen;
    279        1.1     joerg 		}
    280        1.1     joerg 		needs_swap = started_on_odd;
    281        1.1     joerg 		if ((uintptr_t)data & 2) {
    282        1.1     joerg 			if (mlen < 2)
    283        1.1     joerg 				goto trailing_bytes;
    284        1.1     joerg 			partial += *(uint16_t *)data;
    285        1.1     joerg 			data += 2;
    286        1.1     joerg 			mlen -= 2;
    287        1.1     joerg 		}
    288        1.1     joerg 		while (mlen >= 64) {
    289        1.1     joerg 			__builtin_prefetch(data + 32);
    290        1.1     joerg 			__builtin_prefetch(data + 64);
    291        1.1     joerg 			partial += *(uint32_t *)data;
    292        1.1     joerg 			partial += *(uint32_t *)(data + 4);
    293        1.1     joerg 			partial += *(uint32_t *)(data + 8);
    294        1.1     joerg 			partial += *(uint32_t *)(data + 12);
    295        1.1     joerg 			partial += *(uint32_t *)(data + 16);
    296        1.1     joerg 			partial += *(uint32_t *)(data + 20);
    297        1.1     joerg 			partial += *(uint32_t *)(data + 24);
    298        1.1     joerg 			partial += *(uint32_t *)(data + 28);
    299        1.1     joerg 			partial += *(uint32_t *)(data + 32);
    300        1.1     joerg 			partial += *(uint32_t *)(data + 36);
    301        1.1     joerg 			partial += *(uint32_t *)(data + 40);
    302        1.1     joerg 			partial += *(uint32_t *)(data + 44);
    303        1.1     joerg 			partial += *(uint32_t *)(data + 48);
    304        1.1     joerg 			partial += *(uint32_t *)(data + 52);
    305        1.1     joerg 			partial += *(uint32_t *)(data + 56);
    306        1.1     joerg 			partial += *(uint32_t *)(data + 60);
    307        1.1     joerg 			data += 64;
    308        1.1     joerg 			mlen -= 64;
    309        1.1     joerg 			if (__predict_false(partial & (3ULL << 62))) {
    310        1.1     joerg 				if (needs_swap)
    311        1.1     joerg 					partial = (partial << 8) + (partial >> 56);
    312        1.1     joerg 				sum += (partial >> 32);
    313        1.1     joerg 				sum += (partial & 0xffffffff);
    314        1.1     joerg 				partial = 0;
    315        1.1     joerg 			}
    316        1.1     joerg 		}
    317        1.1     joerg 		/*
    318        1.1     joerg 		 * mlen is not updated below as the remaining tests
    319        1.1     joerg 		 * are using bit masks, which are not affected.
    320        1.1     joerg 		 */
    321        1.1     joerg 		if (mlen & 32) {
    322        1.1     joerg 			partial += *(uint32_t *)data;
    323        1.1     joerg 			partial += *(uint32_t *)(data + 4);
    324        1.1     joerg 			partial += *(uint32_t *)(data + 8);
    325        1.1     joerg 			partial += *(uint32_t *)(data + 12);
    326        1.1     joerg 			partial += *(uint32_t *)(data + 16);
    327        1.1     joerg 			partial += *(uint32_t *)(data + 20);
    328        1.1     joerg 			partial += *(uint32_t *)(data + 24);
    329        1.1     joerg 			partial += *(uint32_t *)(data + 28);
    330        1.1     joerg 			data += 32;
    331        1.1     joerg 		}
    332        1.1     joerg 		if (mlen & 16) {
    333        1.1     joerg 			partial += *(uint32_t *)data;
    334        1.1     joerg 			partial += *(uint32_t *)(data + 4);
    335        1.1     joerg 			partial += *(uint32_t *)(data + 8);
    336        1.1     joerg 			partial += *(uint32_t *)(data + 12);
    337        1.1     joerg 			data += 16;
    338        1.1     joerg 		}
    339        1.1     joerg 		if (mlen & 8) {
    340        1.1     joerg 			partial += *(uint32_t *)data;
    341        1.1     joerg 			partial += *(uint32_t *)(data + 4);
    342        1.1     joerg 			data += 8;
    343        1.1     joerg 		}
    344        1.1     joerg 		if (mlen & 4) {
    345        1.1     joerg 			partial += *(uint32_t *)data;
    346        1.1     joerg 			data += 4;
    347        1.1     joerg 		}
    348        1.1     joerg 		if (mlen & 2) {
    349        1.1     joerg 			partial += *(uint16_t *)data;
    350        1.1     joerg 			data += 2;
    351        1.1     joerg 		}
    352        1.1     joerg  trailing_bytes:
    353        1.1     joerg 		if (mlen & 1) {
    354        1.1     joerg #if _BYTE_ORDER == _LITTLE_ENDIAN
    355        1.1     joerg 			partial += *data;
    356        1.1     joerg #else
    357        1.1     joerg 			partial += *data << 8;
    358        1.1     joerg #endif
    359        1.1     joerg 			started_on_odd = !started_on_odd;
    360        1.1     joerg 		}
    361        1.1     joerg 
    362        1.1     joerg 		if (needs_swap)
    363        1.1     joerg 			partial = (partial << 8) + (partial >> 56);
    364        1.1     joerg 		sum += (partial >> 32) + (partial & 0xffffffff);
    365        1.1     joerg 		/*
    366        1.1     joerg 		 * Reduce sum to allow potential byte swap
    367        1.1     joerg 		 * in the next iteration without carry.
    368        1.1     joerg 		 */
    369        1.1     joerg 		sum = (sum >> 32) + (sum & 0xffffffff);
    370        1.1     joerg 	}
    371        1.1     joerg 	final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) +
    372        1.1     joerg 	    ((sum >> 16) & 0xffff) + (sum & 0xffff);
    373        1.1     joerg 	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
    374        1.1     joerg 	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
    375        1.1     joerg 	return ~final_acc & 0xffff;
    376        1.1     joerg }
    377        1.1     joerg #endif
    378