Home | History | Annotate | Line # | Download | only in m68k
      1 |	$NetBSD: oc_cksum.s,v 1.11 2023/09/26 14:33:55 tsutsui Exp $
      2 
      3 | Copyright (c) 1988 Regents of the University of California.
      4 | All rights reserved.
      5 |
      6 | Redistribution and use in source and binary forms, with or without
      7 | modification, are permitted provided that the following conditions
      8 | are met:
      9 | 1. Redistributions of source code must retain the above copyright
     10 |    notice, this list of conditions and the following disclaimer.
     11 | 2. Redistributions in binary form must reproduce the above copyright
     12 |    notice, this list of conditions and the following disclaimer in the
     13 |    documentation and/or other materials provided with the distribution.
     14 | 3. Neither the name of the University nor the names of its contributors
     15 |    may be used to endorse or promote products derived from this software
     16 |    without specific prior written permission.
     17 |
     18 | THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     21 | ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     24 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     25 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     26 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     27 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     28 | SUCH DAMAGE.
     29 |
     30 |	@(#)oc_cksum.s	7.2 (Berkeley) 11/3/90
     31 |
     32 |
     33 | oc_cksum: one's complement 16 bit checksum for MC68020.
     34 |
     35 | oc_cksum (buffer, count, strtval)
     36 |
     37 | Do a 16 bit one's complement sum of 'count' bytes from 'buffer'.
     38 | 'strtval' is the starting value of the sum (usually zero).
     39 |
     40 | It simplifies life in in_cksum if strtval can be >= 2^16.
     41 | This routine will work as long as strtval is < 2^31.
     42 |
     43 | Performance
     44 | -----------
     45 | This routine is intended for MC 68020s but should also work
     46 | for 68030s.  It (deliberately) doesn't worry about the alignment
     47 | of the buffer so will only work on a 68010 if the buffer is
     48 | aligned on an even address.  (Also, a routine written to use
     49 | 68010 "loop mode" would almost certainly be faster than this
     50 | code on a 68010).
     51 |
     52 | We don't worry about alignment because this routine is frequently
     53 | called with small counts: 20 bytes for IP header checksums and 40
     54 | bytes for TCP ack checksums.  For these small counts, testing for
     55 | bad alignment adds ~10% to the per-call cost.  Since, by the nature
     56 | of the kernel's allocator, the data we're called with is almost
     57 | always longword aligned, there is no benefit to this added cost
     58 | and we're better off letting the loop take a big performance hit
     59 | in the rare cases where we're handed an unaligned buffer.
     60 |
     61 | Loop unrolling constants of 2, 4, 8, 16, 32 and 64 times were
     62 | tested on random data on four different types of processors (see
     63 | list below -- 64 was the largest unrolling because anything more
     64 | overflows the 68020 Icache).  On all the processors, the
     65 | throughput asymptote was located between 8 and 16 (closer to 8).
     66 | However, 16 was substantially better than 8 for small counts.
     67 | (It's clear why this happens for a count of 40: unroll-8 pays a
     68 | loop branch cost and unroll-16 doesn't.  But the tests also showed
     69 | that 16 was better than 8 for a count of 20.  It's not obvious to
     70 | me why.)  So, since 16 was good for both large and small counts,
     71 | the loop below is unrolled 16 times.
     72 |
     73 | The processors tested and their average time to checksum 1024 bytes
     74 | of random data were:
     75 |	Sun 3/50 (15MHz)	190 us/KB
     76 |	Sun 3/180 (16.6MHz)	175 us/KB
     77 |	Sun 3/60 (20MHz)	134 us/KB
     78 |	Sun 3/280 (25MHz)	 95 us/KB
     79 |
     80 | The cost of calling this routine was typically 10% of the per-
     81 | kilobyte cost.  E.g., checksumming zero bytes on a 3/60 cost 9us
     82 | and each additional byte cost 125ns.  With the high fixed cost,
     83 | it would clearly be a gain to "inline" this routine -- the
     84 | subroutine call adds 400% overhead to an IP header checksum.
     85 | However, in absolute terms, inlining would only gain 10us per
     86 | packet -- a 1% effect for a 1ms ethernet packet.  This is not
     87 | enough gain to be worth the effort.
     88 
     89 #include <m68k/asm.h>
     90 
     91 	.text
     92 
     93 ENTRY(oc_cksum)
     94 	movl	4(%sp),%a0	| get buffer ptr
     95 	movl	8(%sp),%d1	| get byte count
     96 	movl	12(%sp),%d0	| get starting value
     97 	movl	%d2,-(%sp)	| free a reg
     98 
     99 	| test for possible 1, 2 or 3 bytes of excess at end
    100 	| of buffer.  The usual case is no excess (the usual
    101 	| case is header checksums) so we give that the faster
    102 	| 'not taken' leg of the compare.  (We do the excess
    103 	| first because we're about the trash the low order
    104 	| bits of the count in d1.)
    105 
    106 	btst	#0,%d1
    107 	jne	.L5		| if one or three bytes excess
    108 	btst	#1,%d1
    109 	jne	.L7		| if two bytes excess
    110 .L1:
    111 #ifdef __mcoldfire__
    112 	movq	#-4,%d2		| mask to clear bottom two bits
    113 	andl	%d2,%d1		| longword truncate length
    114 	movl	%d1,%d2		| move length to d2
    115 	movl	%d1,%a1		| move length to a1
    116 	addl	%a0,%a1		| add start so a1 now points to end
    117 	movq	#0x3c,%d1	| then find fractions of a chunk
    118 	andl	%d1,%d2
    119 	negl	%d2
    120 	subl	%d1,%d1		| this can never carry so X is cleared
    121 #else
    122 	movl	%d1,%d2		| move to d2
    123 	lsrl	#6,%d1		| make cnt into # of 64 byte chunks
    124 	andl	#0x3c,%d2	| then find fractions of a chunk
    125 	negl	%d2
    126 	andb	#0xf,%cc	| clear X
    127 #endif
    128 	jmp	(.L3-.-2:b,%pc,%d2)
    129 .L2:
    130 	movl	(%a0)+,%d2
    131 	addxl	%d2,%d0
    132 	movl	(%a0)+,%d2
    133 	addxl	%d2,%d0
    134 	movl	(%a0)+,%d2
    135 	addxl	%d2,%d0
    136 	movl	(%a0)+,%d2
    137 	addxl	%d2,%d0
    138 	movl	(%a0)+,%d2
    139 	addxl	%d2,%d0
    140 	movl	(%a0)+,%d2
    141 	addxl	%d2,%d0
    142 	movl	(%a0)+,%d2
    143 	addxl	%d2,%d0
    144 	movl	(%a0)+,%d2
    145 	addxl	%d2,%d0
    146 	movl	(%a0)+,%d2
    147 	addxl	%d2,%d0
    148 	movl	(%a0)+,%d2
    149 	addxl	%d2,%d0
    150 	movl	(%a0)+,%d2
    151 	addxl	%d2,%d0
    152 	movl	(%a0)+,%d2
    153 	addxl	%d2,%d0
    154 	movl	(%a0)+,%d2
    155 	addxl	%d2,%d0
    156 	movl	(%a0)+,%d2
    157 	addxl	%d2,%d0
    158 	movl	(%a0)+,%d2
    159 	addxl	%d2,%d0
    160 	movl	(%a0)+,%d2
    161 	addxl	%d2,%d0
    162 .L3:
    163 #ifdef __mcoldfire__
    164 	cmpal	%a0,%a1		| cmpa doesn't affect X
    165 	bne	.L2		| loop until reached
    166 #else
    167 	dbra	%d1,.L2		| (NB- dbra doesn't affect X)
    168 #endif
    169 
    170 	movl	%d0,%d1		| fold 32 bit sum to 16 bits
    171 	swap	%d1		| (NB- swap doesn't affect X)
    172 #ifdef __mcoldfire__
    173 	mvzw	%d1,%d1		| zero extend %d1 (doesn't affect X)
    174 	mvzw	%d0,%d0		| zero extend %d0 (doesn't affect X)
    175 	addxl	%d1,%d0		|
    176 	jcc	.L4
    177 	addql	#1,%d0
    178 #else
    179 	addxw	%d1,%d0
    180 	jcc	.L4
    181 	addw	#1,%d0
    182 #endif
    183 .L4:
    184 #ifdef __mcoldfire__
    185 	mvzw	%d0,%d0
    186 #else
    187 	andl	#0xffff,%d0
    188 #endif
    189 	movl	(%sp)+,%d2
    190 	rts
    191 
    192 .L5:	| deal with 1 or 3 excess bytes at the end of the buffer.
    193 	btst	#1,%d1
    194 	jeq	.L6		| if 1 excess
    195 
    196 	| 3 bytes excess
    197 #ifdef __mcoldfire__
    198 	mvzw	(-3,%a0,%d1:l),%d2	| add in last full word then drop
    199 #else
    200 	clrl	%d2
    201 	movw	(-3,%a0,%d1:l),%d2	| add in last full word then drop
    202 #endif
    203 	addl	%d2,%d0		|  through to pick up last byte
    204 
    205 .L6:	| 1 byte excess
    206 #ifdef __mcoldfire__
    207 	mvzb	(-1,%a0,%d1:l),%d2
    208 #else
    209 	clrl	%d2
    210 	movb	(-1,%a0,%d1:l),%d2
    211 #endif
    212 	lsll	#8,%d2
    213 	addl	%d2,%d0
    214 	jra	.L1
    215 
    216 .L7:	| 2 bytes excess
    217 #ifdef __mcoldfire__
    218 	mvzw	(-2,%a0,%d1:l),%d2
    219 #else
    220 	clrl	%d2
    221 	movw	(-2,%a0,%d1:l),%d2
    222 #endif
    223 	addl	%d2,%d0
    224 	jra	.L1
    225