crc32.c revision 1.6 1 /* $NetBSD: crc32.c,v 1.6 2022/10/15 19:49:32 christos Exp $ */
2
3 /* crc32.c -- compute the CRC-32 of a data stream
4 * Copyright (C) 1995-2022 Mark Adler
5 * For conditions of distribution and use, see copyright notice in zlib.h
6 *
7 * This interleaved implementation of a CRC makes use of pipelined multiple
8 * arithmetic-logic units, commonly found in modern CPU cores. It is due to
9 * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
10 */
11
12 /* @(#) Id */
13
14 /*
15 Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore
16 protection on the static variables used to control the first-use generation
17 of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should
18 first call get_crc_table() to initialize the tables before allowing more than
19 one thread to use crc32().
20
21 MAKECRCH can be #defined to write out crc32.h. A main() routine is also
22 produced, so that this one source file can be compiled to an executable.
23 */
24
25 #ifdef MAKECRCH
26 # include <stdio.h>
27 # ifndef DYNAMIC_CRC_TABLE
28 # define DYNAMIC_CRC_TABLE
29 # endif /* !DYNAMIC_CRC_TABLE */
30 #endif /* MAKECRCH */
31
32 #include "zutil.h" /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */
33
34 /*
35 A CRC of a message is computed on N braids of words in the message, where
36 each word consists of W bytes (4 or 8). If N is 3, for example, then three
37 running sparse CRCs are calculated respectively on each braid, at these
38 indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
39 This is done starting at a word boundary, and continues until as many blocks
40 of N * W bytes as are available have been processed. The results are combined
41 into a single CRC at the end. For this code, N must be in the range 1..6 and
42 W must be 4 or 8. The upper limit on N can be increased if desired by adding
43 more #if blocks, extending the patterns apparent in the code. In addition,
44 crc32.h would need to be regenerated, if the maximum N value is increased.
45
46 N and W are chosen empirically by benchmarking the execution time on a given
47 processor. The choices for N and W below were based on testing on Intel Kaby
48 Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
49 Octeon II processors. The Intel, AMD, and ARM processors were all fastest
50 with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
51 They were all tested with either gcc or clang, all using the -O3 optimization
52 level. Your mileage may vary.
53 */
54
55 /* Define N */
56 #ifdef Z_TESTN
57 # define N Z_TESTN
58 #else
59 # define N 5
60 #endif
61 #if N < 1 || N > 6
62 # error N must be in 1..6
63 #endif
64
65 /*
66 z_crc_t must be at least 32 bits. z_word_t must be at least as long as
67 z_crc_t. It is assumed here that z_word_t is either 32 bits or 64 bits, and
68 that bytes are eight bits.
69 */
70
71 /*
72 Define W and the associated z_word_t type. If W is not defined, then a
73 braided calculation is not used, and the associated tables and code are not
74 compiled.
75 */
76 #ifdef Z_TESTW
77 # if Z_TESTW-1 != -1
78 # define W Z_TESTW
79 # endif
80 #else
81 # ifdef MAKECRCH
82 # define W 8 /* required for MAKECRCH */
83 # else
84 # if defined(__x86_64__) || defined(__aarch64__)
85 # define W 8
86 # else
87 # define W 4
88 # endif
89 # endif
90 #endif
91 #ifdef W
92 # if W == 8 && defined(Z_U8)
93 typedef Z_U8 z_word_t;
94 # elif defined(Z_U4)
95 # undef W
96 # define W 4
97 typedef Z_U4 z_word_t;
98 # else
99 # undef W
100 # endif
101 #endif
102
103 /* If available, use the ARM processor CRC32 instruction. */
104 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) && W == 8
105 # define ARMCRC32
106 #endif
107
108 /* Local functions. */
109 local z_crc_t multmodp OF((z_crc_t a, z_crc_t b));
110 local z_crc_t x2nmodp OF((z_off64_t n, unsigned k));
111
112 #if defined(W) && (!defined(ARMCRC32) || defined(DYNAMIC_CRC_TABLE))
113 local z_word_t byte_swap OF((z_word_t word));
114 #endif
115
116 #if defined(W) && !defined(ARMCRC32)
117 local z_crc_t crc_word OF((z_word_t data));
118 local z_word_t crc_word_big OF((z_word_t data));
119 #endif
120
121 #if defined(W) && (!defined(ARMCRC32) || defined(DYNAMIC_CRC_TABLE))
122 /*
123 Swap the bytes in a z_word_t to convert between little and big endian. Any
124 self-respecting compiler will optimize this to a single machine byte-swap
125 instruction, if one is available. This assumes that word_t is either 32 bits
126 or 64 bits.
127 */
128 local z_word_t byte_swap(word)
129 z_word_t word;
130 {
131 # if W == 8
132 return
133 (word & 0xff00000000000000) >> 56 |
134 (word & 0xff000000000000) >> 40 |
135 (word & 0xff0000000000) >> 24 |
136 (word & 0xff00000000) >> 8 |
137 (word & 0xff000000) << 8 |
138 (word & 0xff0000) << 24 |
139 (word & 0xff00) << 40 |
140 (word & 0xff) << 56;
141 # else /* W == 4 */
142 return
143 (word & 0xff000000) >> 24 |
144 (word & 0xff0000) >> 8 |
145 (word & 0xff00) << 8 |
146 (word & 0xff) << 24;
147 # endif
148 }
149 #endif
150
151 /* CRC polynomial. */
152 #define POLY 0xedb88320 /* p(x) reflected, with x^32 implied */
153
154 #ifdef DYNAMIC_CRC_TABLE
155
156 local z_crc_t FAR crc_table[256];
157 local z_crc_t FAR x2n_table[32];
158 local void make_crc_table OF((void));
159 #ifdef W
160 local z_word_t FAR crc_big_table[256];
161 local z_crc_t FAR crc_braid_table[W][256];
162 local z_word_t FAR crc_braid_big_table[W][256];
163 local void braid OF((z_crc_t [][256], z_word_t [][256], int, int));
164 #endif
165 #ifdef MAKECRCH
166 local void write_table OF((FILE *, const z_crc_t FAR *, int));
167 local void write_table32hi OF((FILE *, const z_word_t FAR *, int));
168 local void write_table64 OF((FILE *, const z_word_t FAR *, int));
169 #endif /* MAKECRCH */
170
171 /*
172 Define a once() function depending on the availability of atomics. If this is
173 compiled with DYNAMIC_CRC_TABLE defined, and if CRCs will be computed in
174 multiple threads, and if atomics are not available, then get_crc_table() must
175 be called to initialize the tables and must return before any threads are
176 allowed to compute or combine CRCs.
177 */
178
179 /* Definition of once functionality. */
180 typedef struct once_s once_t;
181 local void once OF((once_t *, void (*)(void)));
182
183 /* Check for the availability of atomics. */
184 #if defined(__STDC__) && __STDC_VERSION__ >= 201112L && \
185 !defined(__STDC_NO_ATOMICS__)
186
187 #include <stdatomic.h>
188
189 /* Structure for once(), which must be initialized with ONCE_INIT. */
190 struct once_s {
191 atomic_flag begun;
192 atomic_int done;
193 };
194 #define ONCE_INIT {ATOMIC_FLAG_INIT, 0}
195
196 /*
197 Run the provided init() function exactly once, even if multiple threads
198 invoke once() at the same time. The state must be a once_t initialized with
199 ONCE_INIT.
200 */
201 local void once(state, init)
202 once_t *state;
203 void (*init)(void);
204 {
205 if (!atomic_load(&state->done)) {
206 if (atomic_flag_test_and_set(&state->begun))
207 while (!atomic_load(&state->done))
208 ;
209 else {
210 init();
211 atomic_store(&state->done, 1);
212 }
213 }
214 }
215
216 #else /* no atomics */
217
218 /* Structure for once(), which must be initialized with ONCE_INIT. */
219 struct once_s {
220 volatile int begun;
221 volatile int done;
222 };
223 #define ONCE_INIT {0, 0}
224
225 /* Test and set. Alas, not atomic, but tries to minimize the period of
226 vulnerability. */
227 local int test_and_set OF((int volatile *));
228 local int test_and_set(flag)
229 int volatile *flag;
230 {
231 int was;
232
233 was = *flag;
234 *flag = 1;
235 return was;
236 }
237
238 /* Run the provided init() function once. This is not thread-safe. */
239 local void once(state, init)
240 once_t *state;
241 void (*init)(void);
242 {
243 if (!state->done) {
244 if (test_and_set(&state->begun))
245 while (!state->done)
246 ;
247 else {
248 init();
249 state->done = 1;
250 }
251 }
252 }
253
254 #endif
255
256 /* State for once(). */
257 local once_t made = ONCE_INIT;
258
259 /*
260 Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
261 x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.
262
263 Polynomials over GF(2) are represented in binary, one bit per coefficient,
264 with the lowest powers in the most significant bit. Then adding polynomials
265 is just exclusive-or, and multiplying a polynomial by x is a right shift by
266 one. If we call the above polynomial p, and represent a byte as the
267 polynomial q, also with the lowest power in the most significant bit (so the
268 byte 0xb1 is the polynomial x^7+x^3+x^2+1), then the CRC is (q*x^32) mod p,
269 where a mod b means the remainder after dividing a by b.
270
271 This calculation is done using the shift-register method of multiplying and
272 taking the remainder. The register is initialized to zero, and for each
273 incoming bit, x^32 is added mod p to the register if the bit is a one (where
274 x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by x
275 (which is shifting right by one and adding x^32 mod p if the bit shifted out
276 is a one). We start with the highest power (least significant bit) of q and
277 repeat for all eight bits of q.
278
279 The table is simply the CRC of all possible eight bit values. This is all the
280 information needed to generate CRCs on data a byte at a time for all
281 combinations of CRC register values and incoming bytes.
282 */
283
284 local void make_crc_table()
285 {
286 unsigned i, j, n;
287 z_crc_t p;
288
289 /* initialize the CRC of bytes tables */
290 for (i = 0; i < 256; i++) {
291 p = i;
292 for (j = 0; j < 8; j++)
293 p = p & 1 ? (p >> 1) ^ POLY : p >> 1;
294 crc_table[i] = p;
295 #ifdef W
296 crc_big_table[i] = byte_swap(p);
297 #endif
298 }
299
300 /* initialize the x^2^n mod p(x) table */
301 p = (z_crc_t)1 << 30; /* x^1 */
302 x2n_table[0] = p;
303 for (n = 1; n < 32; n++)
304 x2n_table[n] = p = multmodp(p, p);
305
306 #ifdef W
307 /* initialize the braiding tables -- needs x2n_table[] */
308 braid(crc_braid_table, crc_braid_big_table, N, W);
309 #endif
310
311 #ifdef MAKECRCH
312 {
313 /*
314 The crc32.h header file contains tables for both 32-bit and 64-bit
315 z_word_t's, and so requires a 64-bit type be available. In that case,
316 z_word_t must be defined to be 64-bits. This code then also generates
317 and writes out the tables for the case that z_word_t is 32 bits.
318 */
319 #if !defined(W) || W != 8
320 # error Need a 64-bit integer type in order to generate crc32.h.
321 #endif
322 FILE *out;
323 int k, n;
324 z_crc_t ltl[8][256];
325 z_word_t big[8][256];
326
327 out = fopen("crc32.h", "w");
328 if (out == NULL) return;
329
330 /* write out little-endian CRC table to crc32.h */
331 fprintf(out,
332 "/* crc32.h -- tables for rapid CRC calculation\n"
333 " * Generated automatically by crc32.c\n */\n"
334 "\n"
335 "local const z_crc_t FAR crc_table[] = {\n"
336 " ");
337 write_table(out, crc_table, 256);
338 fprintf(out,
339 "};\n");
340
341 /* write out big-endian CRC table for 64-bit z_word_t to crc32.h */
342 fprintf(out,
343 "\n"
344 "#ifdef W\n"
345 "\n"
346 "#if W == 8\n"
347 "\n"
348 "local const z_word_t FAR crc_big_table[] = {\n"
349 " ");
350 write_table64(out, crc_big_table, 256);
351 fprintf(out,
352 "};\n");
353
354 /* write out big-endian CRC table for 32-bit z_word_t to crc32.h */
355 fprintf(out,
356 "\n"
357 "#else /* W == 4 */\n"
358 "\n"
359 "local const z_word_t FAR crc_big_table[] = {\n"
360 " ");
361 write_table32hi(out, crc_big_table, 256);
362 fprintf(out,
363 "};\n"
364 "\n"
365 "#endif\n");
366
367 /* write out braid tables for each value of N */
368 for (n = 1; n <= 6; n++) {
369 fprintf(out,
370 "\n"
371 "#if N == %d\n", n);
372
373 /* compute braid tables for this N and 64-bit word_t */
374 braid(ltl, big, n, 8);
375
376 /* write out braid tables for 64-bit z_word_t to crc32.h */
377 fprintf(out,
378 "\n"
379 "#if W == 8\n"
380 "\n"
381 "local const z_crc_t FAR crc_braid_table[][256] = {\n");
382 for (k = 0; k < 8; k++) {
383 fprintf(out, " {");
384 write_table(out, ltl[k], 256);
385 fprintf(out, "}%s", k < 7 ? ",\n" : "");
386 }
387 fprintf(out,
388 "};\n"
389 "\n"
390 "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
391 for (k = 0; k < 8; k++) {
392 fprintf(out, " {");
393 write_table64(out, big[k], 256);
394 fprintf(out, "}%s", k < 7 ? ",\n" : "");
395 }
396 fprintf(out,
397 "};\n");
398
399 /* compute braid tables for this N and 32-bit word_t */
400 braid(ltl, big, n, 4);
401
402 /* write out braid tables for 32-bit z_word_t to crc32.h */
403 fprintf(out,
404 "\n"
405 "#else /* W == 4 */\n"
406 "\n"
407 "local const z_crc_t FAR crc_braid_table[][256] = {\n");
408 for (k = 0; k < 4; k++) {
409 fprintf(out, " {");
410 write_table(out, ltl[k], 256);
411 fprintf(out, "}%s", k < 3 ? ",\n" : "");
412 }
413 fprintf(out,
414 "};\n"
415 "\n"
416 "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
417 for (k = 0; k < 4; k++) {
418 fprintf(out, " {");
419 write_table32hi(out, big[k], 256);
420 fprintf(out, "}%s", k < 3 ? ",\n" : "");
421 }
422 fprintf(out,
423 "};\n"
424 "\n"
425 "#endif\n"
426 "\n"
427 "#endif\n");
428 }
429 fprintf(out,
430 "\n"
431 "#endif\n");
432
433 /* write out zeros operator table to crc32.h */
434 fprintf(out,
435 "\n"
436 "local const z_crc_t FAR x2n_table[] = {\n"
437 " ");
438 write_table(out, x2n_table, 32);
439 fprintf(out,
440 "};\n");
441 fclose(out);
442 }
443 #endif /* MAKECRCH */
444 }
445
446 #ifdef MAKECRCH
447
448 /*
449 Write the 32-bit values in table[0..k-1] to out, five per line in
450 hexadecimal separated by commas.
451 */
452 local void write_table(out, table, k)
453 FILE *out;
454 const z_crc_t FAR *table;
455 int k;
456 {
457 int n;
458
459 for (n = 0; n < k; n++)
460 fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : " ",
461 (unsigned long)(table[n]),
462 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
463 }
464
465 /*
466 Write the high 32-bits of each value in table[0..k-1] to out, five per line
467 in hexadecimal separated by commas.
468 */
469 local void write_table32hi(out, table, k)
470 FILE *out;
471 const z_word_t FAR *table;
472 int k;
473 {
474 int n;
475
476 for (n = 0; n < k; n++)
477 fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : " ",
478 (unsigned long)(table[n] >> 32),
479 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
480 }
481
482 /*
483 Write the 64-bit values in table[0..k-1] to out, three per line in
484 hexadecimal separated by commas. This assumes that if there is a 64-bit
485 type, then there is also a long long integer type, and it is at least 64
486 bits. If not, then the type cast and format string can be adjusted
487 accordingly.
488 */
489 local void write_table64(out, table, k)
490 FILE *out;
491 const z_word_t FAR *table;
492 int k;
493 {
494 int n;
495
496 for (n = 0; n < k; n++)
497 fprintf(out, "%s0x%016llx%s", n == 0 || n % 3 ? "" : " ",
498 (unsigned long long)(table[n]),
499 n == k - 1 ? "" : (n % 3 == 2 ? ",\n" : ", "));
500 }
501
502 /* Actually do the deed. */
503 int main()
504 {
505 make_crc_table();
506 return 0;
507 }
508
509 #endif /* MAKECRCH */
510
511 #ifdef W
512 /*
513 Generate the little and big-endian braid tables for the given n and z_word_t
514 size w. Each array must have room for w blocks of 256 elements.
515 */
516 local void braid(ltl, big, n, w)
517 z_crc_t ltl[][256];
518 z_word_t big[][256];
519 int n;
520 int w;
521 {
522 int k;
523 z_crc_t i, p, q;
524 for (k = 0; k < w; k++) {
525 p = x2nmodp((n * w + 3 - k) << 3, 0);
526 ltl[k][0] = 0;
527 big[w - 1 - k][0] = 0;
528 for (i = 1; i < 256; i++) {
529 ltl[k][i] = q = multmodp(i << 24, p);
530 big[w - 1 - k][i] = byte_swap(q);
531 }
532 }
533 }
534 #endif
535
536 #else /* !DYNAMIC_CRC_TABLE */
537 /* ========================================================================
538 * Tables for byte-wise and braided CRC-32 calculations, and a table of powers
539 * of x for combining CRC-32s, all made by make_crc_table().
540 */
541 #include "crc32.h"
542 #endif /* DYNAMIC_CRC_TABLE */
543
544 /* ========================================================================
545 * Routines used for CRC calculation. Some are also required for the table
546 * generation above.
547 */
548
549 /*
550 Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC polynomial,
551 reflected. For speed, this requires that a not be zero.
552 */
553 local z_crc_t multmodp(a, b)
554 z_crc_t a;
555 z_crc_t b;
556 {
557 z_crc_t m, p;
558
559 m = (z_crc_t)1 << 31;
560 p = 0;
561 for (;;) {
562 if (a & m) {
563 p ^= b;
564 if ((a & (m - 1)) == 0)
565 break;
566 }
567 m >>= 1;
568 b = b & 1 ? (b >> 1) ^ POLY : b >> 1;
569 }
570 return p;
571 }
572
573 /*
574 Return x^(n * 2^k) modulo p(x). Requires that x2n_table[] has been
575 initialized.
576 */
577 local z_crc_t x2nmodp(n, k)
578 z_off64_t n;
579 unsigned k;
580 {
581 z_crc_t p;
582
583 p = (z_crc_t)1 << 31; /* x^0 == 1 */
584 while (n) {
585 if (n & 1)
586 p = multmodp(x2n_table[k & 31], p);
587 n >>= 1;
588 k++;
589 }
590 return p;
591 }
592
593 /* =========================================================================
594 * This function can be used by asm versions of crc32(), and to force the
595 * generation of the CRC tables in a threaded application.
596 */
597 const z_crc_t FAR * ZEXPORT get_crc_table()
598 {
599 #ifdef DYNAMIC_CRC_TABLE
600 once(&made, make_crc_table);
601 #endif /* DYNAMIC_CRC_TABLE */
602 return (const z_crc_t FAR *)crc_table;
603 }
604
605 /* =========================================================================
606 * Use ARM machine instructions if available. This will compute the CRC about
607 * ten times faster than the braided calculation. This code does not check for
608 * the presence of the CRC instruction at run time. __ARM_FEATURE_CRC32 will
609 * only be defined if the compilation specifies an ARM processor architecture
610 * that has the instructions. For example, compiling with -march=armv8.1-a or
611 * -march=armv8-a+crc, or -march=native if the compile machine has the crc32
612 * instructions.
613 */
614 #ifdef ARMCRC32
615
616 /*
617 Constants empirically determined to maximize speed. These values are from
618 measurements on a Cortex-A57. Your mileage may vary.
619 */
620 #define Z_BATCH 3990 /* number of words in a batch */
621 #define Z_BATCH_ZEROS 0xa10d3d0c /* computed from Z_BATCH = 3990 */
622 #define Z_BATCH_MIN 800 /* fewest words in a final batch */
623
624 unsigned long ZEXPORT crc32_z(crc, buf, len)
625 unsigned long crc;
626 const unsigned char FAR *buf;
627 z_size_t len;
628 {
629 z_crc_t val;
630 z_word_t crc1, crc2;
631 const z_word_t *word;
632 z_word_t val0, val1, val2;
633 z_size_t last, last2, i;
634 z_size_t num;
635
636 /* Return initial CRC, if requested. */
637 if (buf == Z_NULL) return 0;
638
639 #ifdef DYNAMIC_CRC_TABLE
640 once(&made, make_crc_table);
641 #endif /* DYNAMIC_CRC_TABLE */
642
643 /* Pre-condition the CRC */
644 crc = (~crc) & 0xffffffff;
645
646 /* Compute the CRC up to a word boundary. */
647 while (len && ((z_size_t)buf & 7) != 0) {
648 len--;
649 val = *buf++;
650 __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
651 }
652
653 /* Prepare to compute the CRC on full 64-bit words word[0..num-1]. */
654 word = (z_word_t const *)buf;
655 num = len >> 3;
656 len &= 7;
657
658 /* Do three interleaved CRCs to realize the throughput of one crc32x
659 instruction per cycle. Each CRC is calculated on Z_BATCH words. The
660 three CRCs are combined into a single CRC after each set of batches. */
661 while (num >= 3 * Z_BATCH) {
662 crc1 = 0;
663 crc2 = 0;
664 for (i = 0; i < Z_BATCH; i++) {
665 val0 = word[i];
666 val1 = word[i + Z_BATCH];
667 val2 = word[i + 2 * Z_BATCH];
668 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
669 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
670 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
671 }
672 word += 3 * Z_BATCH;
673 num -= 3 * Z_BATCH;
674 crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc1;
675 crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc2;
676 }
677
678 /* Do one last smaller batch with the remaining words, if there are enough
679 to pay for the combination of CRCs. */
680 last = num / 3;
681 if (last >= Z_BATCH_MIN) {
682 last2 = last << 1;
683 crc1 = 0;
684 crc2 = 0;
685 for (i = 0; i < last; i++) {
686 val0 = word[i];
687 val1 = word[i + last];
688 val2 = word[i + last2];
689 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
690 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
691 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
692 }
693 word += 3 * last;
694 num -= 3 * last;
695 val = x2nmodp(last, 6);
696 crc = multmodp(val, crc) ^ crc1;
697 crc = multmodp(val, crc) ^ crc2;
698 }
699
700 /* Compute the CRC on any remaining words. */
701 for (i = 0; i < num; i++) {
702 val0 = word[i];
703 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
704 }
705 word += num;
706
707 /* Complete the CRC on any remaining bytes. */
708 buf = (const unsigned char FAR *)word;
709 while (len) {
710 len--;
711 val = *buf++;
712 __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
713 }
714
715 /* Return the CRC, post-conditioned. */
716 return crc ^ 0xffffffff;
717 }
718
719 #else
720
721 #ifdef W
722
723 /*
724 Return the CRC of the W bytes in the word_t data, taking the
725 least-significant byte of the word as the first byte of data, without any pre
726 or post conditioning. This is used to combine the CRCs of each braid.
727 */
728 local z_crc_t crc_word(data)
729 z_word_t data;
730 {
731 int k;
732 for (k = 0; k < W; k++)
733 data = (data >> 8) ^ crc_table[data & 0xff];
734 return (z_crc_t)data;
735 }
736
737 local z_word_t crc_word_big(data)
738 z_word_t data;
739 {
740 int k;
741 for (k = 0; k < W; k++)
742 data = (data << 8) ^
743 crc_big_table[(data >> ((W - 1) << 3)) & 0xff];
744 return data;
745 }
746
747 #endif
748
749 /* ========================================================================= */
750 unsigned long ZEXPORT crc32_z(crc, buf, len)
751 unsigned long crc;
752 const unsigned char FAR *buf;
753 z_size_t len;
754 {
755 /* Return initial CRC, if requested. */
756 if (buf == Z_NULL) return 0;
757
758 #ifdef DYNAMIC_CRC_TABLE
759 once(&made, make_crc_table);
760 #endif /* DYNAMIC_CRC_TABLE */
761
762 /* Pre-condition the CRC */
763 crc = (~crc) & 0xffffffff;
764
765 #ifdef W
766
767 /* If provided enough bytes, do a braided CRC calculation. */
768 if (len >= N * W + W - 1) {
769 z_size_t blks;
770 z_word_t const *words;
771 unsigned endian;
772 int k;
773
774 /* Compute the CRC up to a z_word_t boundary. */
775 while (len && ((z_size_t)buf & (W - 1)) != 0) {
776 len--;
777 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
778 }
779
780 /* Compute the CRC on as many N z_word_t blocks as are available. */
781 blks = len / (N * W);
782 len -= blks * N * W;
783 words = (z_word_t const *)buf;
784
785 /* Do endian check at execution time instead of compile time, since ARM
786 processors can change the endianess at execution time. If the
787 compiler knows what the endianess will be, it can optimize out the
788 check and the unused branch. */
789 endian = 1;
790 if (*(unsigned char *)&endian) {
791 /* Little endian. */
792
793 z_crc_t crc0;
794 z_word_t word0;
795 #if N > 1
796 z_crc_t crc1;
797 z_word_t word1;
798 #if N > 2
799 z_crc_t crc2;
800 z_word_t word2;
801 #if N > 3
802 z_crc_t crc3;
803 z_word_t word3;
804 #if N > 4
805 z_crc_t crc4;
806 z_word_t word4;
807 #if N > 5
808 z_crc_t crc5;
809 z_word_t word5;
810 #endif
811 #endif
812 #endif
813 #endif
814 #endif
815
816 /* Initialize the CRC for each braid. */
817 crc0 = crc;
818 #if N > 1
819 crc1 = 0;
820 #if N > 2
821 crc2 = 0;
822 #if N > 3
823 crc3 = 0;
824 #if N > 4
825 crc4 = 0;
826 #if N > 5
827 crc5 = 0;
828 #endif
829 #endif
830 #endif
831 #endif
832 #endif
833
834 /*
835 Process the first blks-1 blocks, computing the CRCs on each braid
836 independently.
837 */
838 while (--blks) {
839 /* Load the word for each braid into registers. */
840 word0 = crc0 ^ words[0];
841 #if N > 1
842 word1 = crc1 ^ words[1];
843 #if N > 2
844 word2 = crc2 ^ words[2];
845 #if N > 3
846 word3 = crc3 ^ words[3];
847 #if N > 4
848 word4 = crc4 ^ words[4];
849 #if N > 5
850 word5 = crc5 ^ words[5];
851 #endif
852 #endif
853 #endif
854 #endif
855 #endif
856 words += N;
857
858 /* Compute and update the CRC for each word. The loop should
859 get unrolled. */
860 crc0 = crc_braid_table[0][word0 & 0xff];
861 #if N > 1
862 crc1 = crc_braid_table[0][word1 & 0xff];
863 #if N > 2
864 crc2 = crc_braid_table[0][word2 & 0xff];
865 #if N > 3
866 crc3 = crc_braid_table[0][word3 & 0xff];
867 #if N > 4
868 crc4 = crc_braid_table[0][word4 & 0xff];
869 #if N > 5
870 crc5 = crc_braid_table[0][word5 & 0xff];
871 #endif
872 #endif
873 #endif
874 #endif
875 #endif
876 for (k = 1; k < W; k++) {
877 crc0 ^= crc_braid_table[k][(word0 >> (k << 3)) & 0xff];
878 #if N > 1
879 crc1 ^= crc_braid_table[k][(word1 >> (k << 3)) & 0xff];
880 #if N > 2
881 crc2 ^= crc_braid_table[k][(word2 >> (k << 3)) & 0xff];
882 #if N > 3
883 crc3 ^= crc_braid_table[k][(word3 >> (k << 3)) & 0xff];
884 #if N > 4
885 crc4 ^= crc_braid_table[k][(word4 >> (k << 3)) & 0xff];
886 #if N > 5
887 crc5 ^= crc_braid_table[k][(word5 >> (k << 3)) & 0xff];
888 #endif
889 #endif
890 #endif
891 #endif
892 #endif
893 }
894 }
895
896 /*
897 Process the last block, combining the CRCs of the N braids at the
898 same time.
899 */
900 crc = crc_word(crc0 ^ words[0]);
901 #if N > 1
902 crc = crc_word(crc1 ^ words[1] ^ crc);
903 #if N > 2
904 crc = crc_word(crc2 ^ words[2] ^ crc);
905 #if N > 3
906 crc = crc_word(crc3 ^ words[3] ^ crc);
907 #if N > 4
908 crc = crc_word(crc4 ^ words[4] ^ crc);
909 #if N > 5
910 crc = crc_word(crc5 ^ words[5] ^ crc);
911 #endif
912 #endif
913 #endif
914 #endif
915 #endif
916 words += N;
917 }
918 else {
919 /* Big endian. */
920
921 z_word_t crc0, word0, comb;
922 #if N > 1
923 z_word_t crc1, word1;
924 #if N > 2
925 z_word_t crc2, word2;
926 #if N > 3
927 z_word_t crc3, word3;
928 #if N > 4
929 z_word_t crc4, word4;
930 #if N > 5
931 z_word_t crc5, word5;
932 #endif
933 #endif
934 #endif
935 #endif
936 #endif
937
938 /* Initialize the CRC for each braid. */
939 crc0 = byte_swap(crc);
940 #if N > 1
941 crc1 = 0;
942 #if N > 2
943 crc2 = 0;
944 #if N > 3
945 crc3 = 0;
946 #if N > 4
947 crc4 = 0;
948 #if N > 5
949 crc5 = 0;
950 #endif
951 #endif
952 #endif
953 #endif
954 #endif
955
956 /*
957 Process the first blks-1 blocks, computing the CRCs on each braid
958 independently.
959 */
960 while (--blks) {
961 /* Load the word for each braid into registers. */
962 word0 = crc0 ^ words[0];
963 #if N > 1
964 word1 = crc1 ^ words[1];
965 #if N > 2
966 word2 = crc2 ^ words[2];
967 #if N > 3
968 word3 = crc3 ^ words[3];
969 #if N > 4
970 word4 = crc4 ^ words[4];
971 #if N > 5
972 word5 = crc5 ^ words[5];
973 #endif
974 #endif
975 #endif
976 #endif
977 #endif
978 words += N;
979
980 /* Compute and update the CRC for each word. The loop should
981 get unrolled. */
982 crc0 = crc_braid_big_table[0][word0 & 0xff];
983 #if N > 1
984 crc1 = crc_braid_big_table[0][word1 & 0xff];
985 #if N > 2
986 crc2 = crc_braid_big_table[0][word2 & 0xff];
987 #if N > 3
988 crc3 = crc_braid_big_table[0][word3 & 0xff];
989 #if N > 4
990 crc4 = crc_braid_big_table[0][word4 & 0xff];
991 #if N > 5
992 crc5 = crc_braid_big_table[0][word5 & 0xff];
993 #endif
994 #endif
995 #endif
996 #endif
997 #endif
998 for (k = 1; k < W; k++) {
999 crc0 ^= crc_braid_big_table[k][(word0 >> (k << 3)) & 0xff];
1000 #if N > 1
1001 crc1 ^= crc_braid_big_table[k][(word1 >> (k << 3)) & 0xff];
1002 #if N > 2
1003 crc2 ^= crc_braid_big_table[k][(word2 >> (k << 3)) & 0xff];
1004 #if N > 3
1005 crc3 ^= crc_braid_big_table[k][(word3 >> (k << 3)) & 0xff];
1006 #if N > 4
1007 crc4 ^= crc_braid_big_table[k][(word4 >> (k << 3)) & 0xff];
1008 #if N > 5
1009 crc5 ^= crc_braid_big_table[k][(word5 >> (k << 3)) & 0xff];
1010 #endif
1011 #endif
1012 #endif
1013 #endif
1014 #endif
1015 }
1016 }
1017
1018 /*
1019 Process the last block, combining the CRCs of the N braids at the
1020 same time.
1021 */
1022 comb = crc_word_big(crc0 ^ words[0]);
1023 #if N > 1
1024 comb = crc_word_big(crc1 ^ words[1] ^ comb);
1025 #if N > 2
1026 comb = crc_word_big(crc2 ^ words[2] ^ comb);
1027 #if N > 3
1028 comb = crc_word_big(crc3 ^ words[3] ^ comb);
1029 #if N > 4
1030 comb = crc_word_big(crc4 ^ words[4] ^ comb);
1031 #if N > 5
1032 comb = crc_word_big(crc5 ^ words[5] ^ comb);
1033 #endif
1034 #endif
1035 #endif
1036 #endif
1037 #endif
1038 words += N;
1039 crc = byte_swap(comb);
1040 }
1041
1042 /*
1043 Update the pointer to the remaining bytes to process.
1044 */
1045 buf = (unsigned char const *)words;
1046 }
1047
1048 #endif /* W */
1049
1050 /* Complete the computation of the CRC on any remaining bytes. */
1051 while (len >= 8) {
1052 len -= 8;
1053 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1054 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1055 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1056 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1057 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1058 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1059 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1060 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1061 }
1062 while (len) {
1063 len--;
1064 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1065 }
1066
1067 /* Return the CRC, post-conditioned. */
1068 return crc ^ 0xffffffff;
1069 }
1070
1071 #endif
1072
1073 /* ========================================================================= */
1074 unsigned long ZEXPORT crc32(crc, buf, len)
1075 unsigned long crc;
1076 const unsigned char FAR *buf;
1077 uInt len;
1078 {
1079 return crc32_z(crc, buf, len);
1080 }
1081
1082 /* ========================================================================= */
1083 uLong ZEXPORT crc32_combine64(crc1, crc2, len2)
1084 uLong crc1;
1085 uLong crc2;
1086 z_off64_t len2;
1087 {
1088 #ifdef DYNAMIC_CRC_TABLE
1089 once(&made, make_crc_table);
1090 #endif /* DYNAMIC_CRC_TABLE */
1091 return multmodp(x2nmodp(len2, 3), crc1) ^ (crc2 & 0xffffffff);
1092 }
1093
1094 /* ========================================================================= */
1095 uLong ZEXPORT crc32_combine(crc1, crc2, len2)
1096 uLong crc1;
1097 uLong crc2;
1098 z_off_t len2;
1099 {
1100 return crc32_combine64(crc1, crc2, (z_off64_t)len2);
1101 }
1102
1103 /* ========================================================================= */
1104 uLong ZEXPORT crc32_combine_gen64(len2)
1105 z_off64_t len2;
1106 {
1107 #ifdef DYNAMIC_CRC_TABLE
1108 once(&made, make_crc_table);
1109 #endif /* DYNAMIC_CRC_TABLE */
1110 return x2nmodp(len2, 3);
1111 }
1112
1113 /* ========================================================================= */
1114 uLong ZEXPORT crc32_combine_gen(len2)
1115 z_off_t len2;
1116 {
1117 return crc32_combine_gen64((z_off64_t)len2);
1118 }
1119
1120 /* ========================================================================= */
1121 uLong ZEXPORT crc32_combine_op(crc1, crc2, op)
1122 uLong crc1;
1123 uLong crc2;
1124 uLong op;
1125 {
1126 return multmodp(op, crc1) ^ (crc2 & 0xffffffff);
1127 }
1128