Home | History | Annotate | Line # | Download | only in zlib
      1 /* crc32.c -- compute the CRC-32 of a data stream
      2  * Copyright (C) 1995-2022 Mark Adler
      3  * For conditions of distribution and use, see copyright notice in zlib.h
      4  *
      5  * This interleaved implementation of a CRC makes use of pipelined multiple
      6  * arithmetic-logic units, commonly found in modern CPU cores. It is due to
      7  * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
      8  */
      9 
     10 /* @(#) Id */
     11 
     12 /*
     13   Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore
     14   protection on the static variables used to control the first-use generation
     15   of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should
     16   first call get_crc_table() to initialize the tables before allowing more than
     17   one thread to use crc32().
     18 
     19   MAKECRCH can be #defined to write out crc32.h. A main() routine is also
     20   produced, so that this one source file can be compiled to an executable.
     21  */
     22 
     23 #ifdef MAKECRCH
     24 #  include <stdio.h>
     25 #  ifndef DYNAMIC_CRC_TABLE
     26 #    define DYNAMIC_CRC_TABLE
     27 #  endif /* !DYNAMIC_CRC_TABLE */
     28 #endif /* MAKECRCH */
     29 
     30 #include "zutil.h"      /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */
     31 
     32  /*
     33   A CRC of a message is computed on N braids of words in the message, where
     34   each word consists of W bytes (4 or 8). If N is 3, for example, then three
     35   running sparse CRCs are calculated respectively on each braid, at these
     36   indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
     37   This is done starting at a word boundary, and continues until as many blocks
     38   of N * W bytes as are available have been processed. The results are combined
     39   into a single CRC at the end. For this code, N must be in the range 1..6 and
     40   W must be 4 or 8. The upper limit on N can be increased if desired by adding
     41   more #if blocks, extending the patterns apparent in the code. In addition,
     42   crc32.h would need to be regenerated, if the maximum N value is increased.
     43 
     44   N and W are chosen empirically by benchmarking the execution time on a given
     45   processor. The choices for N and W below were based on testing on Intel Kaby
     46   Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
     47   Octeon II processors. The Intel, AMD, and ARM processors were all fastest
     48   with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
     49   They were all tested with either gcc or clang, all using the -O3 optimization
     50   level. Your mileage may vary.
     51  */
     52 
     53 /* Define N */
     54 #ifdef Z_TESTN
     55 #  define N Z_TESTN
     56 #else
     57 #  define N 5
     58 #endif
     59 #if N < 1 || N > 6
     60 #  error N must be in 1..6
     61 #endif
     62 
     63 /*
     64   z_crc_t must be at least 32 bits. z_word_t must be at least as long as
     65   z_crc_t. It is assumed here that z_word_t is either 32 bits or 64 bits, and
     66   that bytes are eight bits.
     67  */
     68 
     69 /*
     70   Define W and the associated z_word_t type. If W is not defined, then a
     71   braided calculation is not used, and the associated tables and code are not
     72   compiled.
     73  */
     74 #ifdef Z_TESTW
     75 #  if Z_TESTW-1 != -1
     76 #    define W Z_TESTW
     77 #  endif
     78 #else
     79 #  ifdef MAKECRCH
     80 #    define W 8         /* required for MAKECRCH */
     81 #  else
     82 #    if defined(__x86_64__) || defined(__aarch64__)
     83 #      define W 8
     84 #    else
     85 #      define W 4
     86 #    endif
     87 #  endif
     88 #endif
     89 #ifdef W
     90 #  if W == 8 && defined(Z_U8)
     91      typedef Z_U8 z_word_t;
     92 #  elif defined(Z_U4)
     93 #    undef W
     94 #    define W 4
     95      typedef Z_U4 z_word_t;
     96 #  else
     97 #    undef W
     98 #  endif
     99 #endif
    100 
    101 /* Local functions. */
    102 local z_crc_t multmodp OF((z_crc_t a, z_crc_t b));
    103 local z_crc_t x2nmodp OF((z_off64_t n, unsigned k));
    104 
    105 /* If available, use the ARM processor CRC32 instruction. */
    106 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) && W == 8
    107 #  define ARMCRC32
    108 #endif
    109 
    110 #if defined(W) && (!defined(ARMCRC32) || defined(DYNAMIC_CRC_TABLE))
    111 /*
    112   Swap the bytes in a z_word_t to convert between little and big endian. Any
    113   self-respecting compiler will optimize this to a single machine byte-swap
    114   instruction, if one is available. This assumes that word_t is either 32 bits
    115   or 64 bits.
    116  */
    117 local z_word_t byte_swap(word)
    118     z_word_t word;
    119 {
    120 #  if W == 8
    121     return
    122         (word & 0xff00000000000000) >> 56 |
    123         (word & 0xff000000000000) >> 40 |
    124         (word & 0xff0000000000) >> 24 |
    125         (word & 0xff00000000) >> 8 |
    126         (word & 0xff000000) << 8 |
    127         (word & 0xff0000) << 24 |
    128         (word & 0xff00) << 40 |
    129         (word & 0xff) << 56;
    130 #  else   /* W == 4 */
    131     return
    132         (word & 0xff000000) >> 24 |
    133         (word & 0xff0000) >> 8 |
    134         (word & 0xff00) << 8 |
    135         (word & 0xff) << 24;
    136 #  endif
    137 }
    138 #endif
    139 
    140 /* CRC polynomial. */
    141 #define POLY 0xedb88320         /* p(x) reflected, with x^32 implied */
    142 
    143 #ifdef DYNAMIC_CRC_TABLE
    144 
    145 local z_crc_t FAR crc_table[256];
    146 local z_crc_t FAR x2n_table[32];
    147 local void make_crc_table OF((void));
    148 #ifdef W
    149    local z_word_t FAR crc_big_table[256];
    150    local z_crc_t FAR crc_braid_table[W][256];
    151    local z_word_t FAR crc_braid_big_table[W][256];
    152    local void braid OF((z_crc_t [][256], z_word_t [][256], int, int));
    153 #endif
    154 #ifdef MAKECRCH
    155    local void write_table OF((FILE *, const z_crc_t FAR *, int));
    156    local void write_table32hi OF((FILE *, const z_word_t FAR *, int));
    157    local void write_table64 OF((FILE *, const z_word_t FAR *, int));
    158 #endif /* MAKECRCH */
    159 
    160 /*
    161   Define a once() function depending on the availability of atomics. If this is
    162   compiled with DYNAMIC_CRC_TABLE defined, and if CRCs will be computed in
    163   multiple threads, and if atomics are not available, then get_crc_table() must
    164   be called to initialize the tables and must return before any threads are
    165   allowed to compute or combine CRCs.
    166  */
    167 
    168 /* Definition of once functionality. */
    169 typedef struct once_s once_t;
    170 local void once OF((once_t *, void (*)(void)));
    171 
    172 /* Check for the availability of atomics. */
    173 #if defined(__STDC__) && __STDC_VERSION__ >= 201112L && \
    174     !defined(__STDC_NO_ATOMICS__)
    175 
    176 #include <stdatomic.h>
    177 
    178 /* Structure for once(), which must be initialized with ONCE_INIT. */
    179 struct once_s {
    180     atomic_flag begun;
    181     atomic_int done;
    182 };
    183 #define ONCE_INIT {ATOMIC_FLAG_INIT, 0}
    184 
    185 /*
    186   Run the provided init() function exactly once, even if multiple threads
    187   invoke once() at the same time. The state must be a once_t initialized with
    188   ONCE_INIT.
    189  */
    190 local void once(state, init)
    191     once_t *state;
    192     void (*init)(void);
    193 {
    194     if (!atomic_load(&state->done)) {
    195         if (atomic_flag_test_and_set(&state->begun))
    196             while (!atomic_load(&state->done))
    197                 ;
    198         else {
    199             init();
    200             atomic_store(&state->done, 1);
    201         }
    202     }
    203 }
    204 
    205 #else   /* no atomics */
    206 
    207 /* Structure for once(), which must be initialized with ONCE_INIT. */
    208 struct once_s {
    209     volatile int begun;
    210     volatile int done;
    211 };
    212 #define ONCE_INIT {0, 0}
    213 
    214 /* Test and set. Alas, not atomic, but tries to minimize the period of
    215    vulnerability. */
    216 local int test_and_set OF((int volatile *));
    217 local int test_and_set(flag)
    218     int volatile *flag;
    219 {
    220     int was;
    221 
    222     was = *flag;
    223     *flag = 1;
    224     return was;
    225 }
    226 
    227 /* Run the provided init() function once. This is not thread-safe. */
    228 local void once(state, init)
    229     once_t *state;
    230     void (*init)(void);
    231 {
    232     if (!state->done) {
    233         if (test_and_set(&state->begun))
    234             while (!state->done)
    235                 ;
    236         else {
    237             init();
    238             state->done = 1;
    239         }
    240     }
    241 }
    242 
    243 #endif
    244 
    245 /* State for once(). */
    246 local once_t made = ONCE_INIT;
    247 
    248 /*
    249   Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
    250   x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.
    251 
    252   Polynomials over GF(2) are represented in binary, one bit per coefficient,
    253   with the lowest powers in the most significant bit. Then adding polynomials
    254   is just exclusive-or, and multiplying a polynomial by x is a right shift by
    255   one. If we call the above polynomial p, and represent a byte as the
    256   polynomial q, also with the lowest power in the most significant bit (so the
    257   byte 0xb1 is the polynomial x^7+x^3+x^2+1), then the CRC is (q*x^32) mod p,
    258   where a mod b means the remainder after dividing a by b.
    259 
    260   This calculation is done using the shift-register method of multiplying and
    261   taking the remainder. The register is initialized to zero, and for each
    262   incoming bit, x^32 is added mod p to the register if the bit is a one (where
    263   x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by x
    264   (which is shifting right by one and adding x^32 mod p if the bit shifted out
    265   is a one). We start with the highest power (least significant bit) of q and
    266   repeat for all eight bits of q.
    267 
    268   The table is simply the CRC of all possible eight bit values. This is all the
    269   information needed to generate CRCs on data a byte at a time for all
    270   combinations of CRC register values and incoming bytes.
    271  */
    272 
    273 local void make_crc_table()
    274 {
    275     unsigned i, j, n;
    276     z_crc_t p;
    277 
    278     /* initialize the CRC of bytes tables */
    279     for (i = 0; i < 256; i++) {
    280         p = i;
    281         for (j = 0; j < 8; j++)
    282             p = p & 1 ? (p >> 1) ^ POLY : p >> 1;
    283         crc_table[i] = p;
    284 #ifdef W
    285         crc_big_table[i] = byte_swap(p);
    286 #endif
    287     }
    288 
    289     /* initialize the x^2^n mod p(x) table */
    290     p = (z_crc_t)1 << 30;         /* x^1 */
    291     x2n_table[0] = p;
    292     for (n = 1; n < 32; n++)
    293         x2n_table[n] = p = multmodp(p, p);
    294 
    295 #ifdef W
    296     /* initialize the braiding tables -- needs x2n_table[] */
    297     braid(crc_braid_table, crc_braid_big_table, N, W);
    298 #endif
    299 
    300 #ifdef MAKECRCH
    301     {
    302         /*
    303           The crc32.h header file contains tables for both 32-bit and 64-bit
    304           z_word_t's, and so requires a 64-bit type be available. In that case,
    305           z_word_t must be defined to be 64-bits. This code then also generates
    306           and writes out the tables for the case that z_word_t is 32 bits.
    307          */
    308 #if !defined(W) || W != 8
    309 #  error Need a 64-bit integer type in order to generate crc32.h.
    310 #endif
    311         FILE *out;
    312         int k, n;
    313         z_crc_t ltl[8][256];
    314         z_word_t big[8][256];
    315 
    316         out = fopen("crc32.h", "w");
    317         if (out == NULL) return;
    318 
    319         /* write out little-endian CRC table to crc32.h */
    320         fprintf(out,
    321             "/* crc32.h -- tables for rapid CRC calculation\n"
    322             " * Generated automatically by crc32.c\n */\n"
    323             "\n"
    324             "local const z_crc_t FAR crc_table[] = {\n"
    325             "    ");
    326         write_table(out, crc_table, 256);
    327         fprintf(out,
    328             "};\n");
    329 
    330         /* write out big-endian CRC table for 64-bit z_word_t to crc32.h */
    331         fprintf(out,
    332             "\n"
    333             "#ifdef W\n"
    334             "\n"
    335             "#if W == 8\n"
    336             "\n"
    337             "local const z_word_t FAR crc_big_table[] = {\n"
    338             "    ");
    339         write_table64(out, crc_big_table, 256);
    340         fprintf(out,
    341             "};\n");
    342 
    343         /* write out big-endian CRC table for 32-bit z_word_t to crc32.h */
    344         fprintf(out,
    345             "\n"
    346             "#else /* W == 4 */\n"
    347             "\n"
    348             "local const z_word_t FAR crc_big_table[] = {\n"
    349             "    ");
    350         write_table32hi(out, crc_big_table, 256);
    351         fprintf(out,
    352             "};\n"
    353             "\n"
    354             "#endif\n");
    355 
    356         /* write out braid tables for each value of N */
    357         for (n = 1; n <= 6; n++) {
    358             fprintf(out,
    359             "\n"
    360             "#if N == %d\n", n);
    361 
    362             /* compute braid tables for this N and 64-bit word_t */
    363             braid(ltl, big, n, 8);
    364 
    365             /* write out braid tables for 64-bit z_word_t to crc32.h */
    366             fprintf(out,
    367             "\n"
    368             "#if W == 8\n"
    369             "\n"
    370             "local const z_crc_t FAR crc_braid_table[][256] = {\n");
    371             for (k = 0; k < 8; k++) {
    372                 fprintf(out, "   {");
    373                 write_table(out, ltl[k], 256);
    374                 fprintf(out, "}%s", k < 7 ? ",\n" : "");
    375             }
    376             fprintf(out,
    377             "};\n"
    378             "\n"
    379             "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
    380             for (k = 0; k < 8; k++) {
    381                 fprintf(out, "   {");
    382                 write_table64(out, big[k], 256);
    383                 fprintf(out, "}%s", k < 7 ? ",\n" : "");
    384             }
    385             fprintf(out,
    386             "};\n");
    387 
    388             /* compute braid tables for this N and 32-bit word_t */
    389             braid(ltl, big, n, 4);
    390 
    391             /* write out braid tables for 32-bit z_word_t to crc32.h */
    392             fprintf(out,
    393             "\n"
    394             "#else /* W == 4 */\n"
    395             "\n"
    396             "local const z_crc_t FAR crc_braid_table[][256] = {\n");
    397             for (k = 0; k < 4; k++) {
    398                 fprintf(out, "   {");
    399                 write_table(out, ltl[k], 256);
    400                 fprintf(out, "}%s", k < 3 ? ",\n" : "");
    401             }
    402             fprintf(out,
    403             "};\n"
    404             "\n"
    405             "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
    406             for (k = 0; k < 4; k++) {
    407                 fprintf(out, "   {");
    408                 write_table32hi(out, big[k], 256);
    409                 fprintf(out, "}%s", k < 3 ? ",\n" : "");
    410             }
    411             fprintf(out,
    412             "};\n"
    413             "\n"
    414             "#endif\n"
    415             "\n"
    416             "#endif\n");
    417         }
    418         fprintf(out,
    419             "\n"
    420             "#endif\n");
    421 
    422         /* write out zeros operator table to crc32.h */
    423         fprintf(out,
    424             "\n"
    425             "local const z_crc_t FAR x2n_table[] = {\n"
    426             "    ");
    427         write_table(out, x2n_table, 32);
    428         fprintf(out,
    429             "};\n");
    430         fclose(out);
    431     }
    432 #endif /* MAKECRCH */
    433 }
    434 
    435 #ifdef MAKECRCH
    436 
    437 /*
    438    Write the 32-bit values in table[0..k-1] to out, five per line in
    439    hexadecimal separated by commas.
    440  */
    441 local void write_table(out, table, k)
    442     FILE *out;
    443     const z_crc_t FAR *table;
    444     int k;
    445 {
    446     int n;
    447 
    448     for (n = 0; n < k; n++)
    449         fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
    450                 (unsigned long)(table[n]),
    451                 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
    452 }
    453 
    454 /*
    455    Write the high 32-bits of each value in table[0..k-1] to out, five per line
    456    in hexadecimal separated by commas.
    457  */
    458 local void write_table32hi(out, table, k)
    459 FILE *out;
    460 const z_word_t FAR *table;
    461 int k;
    462 {
    463     int n;
    464 
    465     for (n = 0; n < k; n++)
    466         fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
    467                 (unsigned long)(table[n] >> 32),
    468                 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
    469 }
    470 
    471 /*
    472   Write the 64-bit values in table[0..k-1] to out, three per line in
    473   hexadecimal separated by commas. This assumes that if there is a 64-bit
    474   type, then there is also a long long integer type, and it is at least 64
    475   bits. If not, then the type cast and format string can be adjusted
    476   accordingly.
    477  */
    478 local void write_table64(out, table, k)
    479     FILE *out;
    480     const z_word_t FAR *table;
    481     int k;
    482 {
    483     int n;
    484 
    485     for (n = 0; n < k; n++)
    486         fprintf(out, "%s0x%016llx%s", n == 0 || n % 3 ? "" : "    ",
    487                 (unsigned long long)(table[n]),
    488                 n == k - 1 ? "" : (n % 3 == 2 ? ",\n" : ", "));
    489 }
    490 
    491 /* Actually do the deed. */
    492 int main()
    493 {
    494     make_crc_table();
    495     return 0;
    496 }
    497 
    498 #endif /* MAKECRCH */
    499 
    500 #ifdef W
    501 /*
    502   Generate the little and big-endian braid tables for the given n and z_word_t
    503   size w. Each array must have room for w blocks of 256 elements.
    504  */
    505 local void braid(ltl, big, n, w)
    506     z_crc_t ltl[][256];
    507     z_word_t big[][256];
    508     int n;
    509     int w;
    510 {
    511     int k;
    512     z_crc_t i, p, q;
    513     for (k = 0; k < w; k++) {
    514         p = x2nmodp((n * w + 3 - k) << 3, 0);
    515         ltl[k][0] = 0;
    516         big[w - 1 - k][0] = 0;
    517         for (i = 1; i < 256; i++) {
    518             ltl[k][i] = q = multmodp(i << 24, p);
    519             big[w - 1 - k][i] = byte_swap(q);
    520         }
    521     }
    522 }
    523 #endif
    524 
    525 #else /* !DYNAMIC_CRC_TABLE */
    526 /* ========================================================================
    527  * Tables for byte-wise and braided CRC-32 calculations, and a table of powers
    528  * of x for combining CRC-32s, all made by make_crc_table().
    529  */
    530 #include "crc32.h"
    531 #endif /* DYNAMIC_CRC_TABLE */
    532 
    533 /* ========================================================================
    534  * Routines used for CRC calculation. Some are also required for the table
    535  * generation above.
    536  */
    537 
    538 /*
    539   Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC polynomial,
    540   reflected. For speed, this requires that a not be zero.
    541  */
    542 local z_crc_t multmodp(a, b)
    543     z_crc_t a;
    544     z_crc_t b;
    545 {
    546     z_crc_t m, p;
    547 
    548     m = (z_crc_t)1 << 31;
    549     p = 0;
    550     for (;;) {
    551         if (a & m) {
    552             p ^= b;
    553             if ((a & (m - 1)) == 0)
    554                 break;
    555         }
    556         m >>= 1;
    557         b = b & 1 ? (b >> 1) ^ POLY : b >> 1;
    558     }
    559     return p;
    560 }
    561 
    562 /*
    563   Return x^(n * 2^k) modulo p(x). Requires that x2n_table[] has been
    564   initialized.
    565  */
    566 local z_crc_t x2nmodp(n, k)
    567     z_off64_t n;
    568     unsigned k;
    569 {
    570     z_crc_t p;
    571 
    572     p = (z_crc_t)1 << 31;           /* x^0 == 1 */
    573     while (n) {
    574         if (n & 1)
    575             p = multmodp(x2n_table[k & 31], p);
    576         n >>= 1;
    577         k++;
    578     }
    579     return p;
    580 }
    581 
    582 /* =========================================================================
    583  * This function can be used by asm versions of crc32(), and to force the
    584  * generation of the CRC tables in a threaded application.
    585  */
    586 const z_crc_t FAR * ZEXPORT get_crc_table()
    587 {
    588 #ifdef DYNAMIC_CRC_TABLE
    589     once(&made, make_crc_table);
    590 #endif /* DYNAMIC_CRC_TABLE */
    591     return (const z_crc_t FAR *)crc_table;
    592 }
    593 
    594 /* =========================================================================
    595  * Use ARM machine instructions if available. This will compute the CRC about
    596  * ten times faster than the braided calculation. This code does not check for
    597  * the presence of the CRC instruction at run time. __ARM_FEATURE_CRC32 will
    598  * only be defined if the compilation specifies an ARM processor architecture
    599  * that has the instructions. For example, compiling with -march=armv8.1-a or
    600  * -march=armv8-a+crc, or -march=native if the compile machine has the crc32
    601  * instructions.
    602  */
    603 #ifdef ARMCRC32
    604 
    605 /*
    606    Constants empirically determined to maximize speed. These values are from
    607    measurements on a Cortex-A57. Your mileage may vary.
    608  */
    609 #define Z_BATCH 3990                /* number of words in a batch */
    610 #define Z_BATCH_ZEROS 0xa10d3d0c    /* computed from Z_BATCH = 3990 */
    611 #define Z_BATCH_MIN 800             /* fewest words in a final batch */
    612 
    613 unsigned long ZEXPORT crc32_z(crc, buf, len)
    614     unsigned long crc;
    615     const unsigned char FAR *buf;
    616     z_size_t len;
    617 {
    618     z_crc_t val;
    619     z_word_t crc1, crc2;
    620     const z_word_t *word;
    621     z_word_t val0, val1, val2;
    622     z_size_t last, last2, i;
    623     z_size_t num;
    624 
    625     /* Return initial CRC, if requested. */
    626     if (buf == Z_NULL) return 0;
    627 
    628 #ifdef DYNAMIC_CRC_TABLE
    629     once(&made, make_crc_table);
    630 #endif /* DYNAMIC_CRC_TABLE */
    631 
    632     /* Pre-condition the CRC */
    633     crc ^= 0xffffffff;
    634 
    635     /* Compute the CRC up to a word boundary. */
    636     while (len && ((z_size_t)buf & 7) != 0) {
    637         len--;
    638         val = *buf++;
    639         __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
    640     }
    641 
    642     /* Prepare to compute the CRC on full 64-bit words word[0..num-1]. */
    643     word = (z_word_t const *)buf;
    644     num = len >> 3;
    645     len &= 7;
    646 
    647     /* Do three interleaved CRCs to realize the throughput of one crc32x
    648        instruction per cycle. Each CRC is calcuated on Z_BATCH words. The three
    649        CRCs are combined into a single CRC after each set of batches. */
    650     while (num >= 3 * Z_BATCH) {
    651         crc1 = 0;
    652         crc2 = 0;
    653         for (i = 0; i < Z_BATCH; i++) {
    654             val0 = word[i];
    655             val1 = word[i + Z_BATCH];
    656             val2 = word[i + 2 * Z_BATCH];
    657             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
    658             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
    659             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
    660         }
    661         word += 3 * Z_BATCH;
    662         num -= 3 * Z_BATCH;
    663         crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc1;
    664         crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc2;
    665     }
    666 
    667     /* Do one last smaller batch with the remaining words, if there are enough
    668        to pay for the combination of CRCs. */
    669     last = num / 3;
    670     if (last >= Z_BATCH_MIN) {
    671         last2 = last << 1;
    672         crc1 = 0;
    673         crc2 = 0;
    674         for (i = 0; i < last; i++) {
    675             val0 = word[i];
    676             val1 = word[i + last];
    677             val2 = word[i + last2];
    678             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
    679             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
    680             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
    681         }
    682         word += 3 * last;
    683         num -= 3 * last;
    684         val = x2nmodp(last, 6);
    685         crc = multmodp(val, crc) ^ crc1;
    686         crc = multmodp(val, crc) ^ crc2;
    687     }
    688 
    689     /* Compute the CRC on any remaining words. */
    690     for (i = 0; i < num; i++) {
    691         val0 = word[i];
    692         __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
    693     }
    694     word += num;
    695 
    696     /* Complete the CRC on any remaining bytes. */
    697     buf = (const unsigned char FAR *)word;
    698     while (len) {
    699         len--;
    700         val = *buf++;
    701         __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
    702     }
    703 
    704     /* Return the CRC, post-conditioned. */
    705     return crc ^ 0xffffffff;
    706 }
    707 
    708 #else
    709 
    710 #ifdef W
    711 
    712 /*
    713   Return the CRC of the W bytes in the word_t data, taking the
    714   least-significant byte of the word as the first byte of data, without any pre
    715   or post conditioning. This is used to combine the CRCs of each braid.
    716  */
    717 local z_crc_t crc_word(data)
    718     z_word_t data;
    719 {
    720     int k;
    721     for (k = 0; k < W; k++)
    722         data = (data >> 8) ^ crc_table[data & 0xff];
    723     return (z_crc_t)data;
    724 }
    725 
    726 local z_word_t crc_word_big(data)
    727     z_word_t data;
    728 {
    729     int k;
    730     for (k = 0; k < W; k++)
    731         data = (data << 8) ^
    732             crc_big_table[(data >> ((W - 1) << 3)) & 0xff];
    733     return data;
    734 }
    735 
    736 #endif
    737 
    738 /* ========================================================================= */
    739 unsigned long ZEXPORT crc32_z(crc, buf, len)
    740     unsigned long crc;
    741     const unsigned char FAR *buf;
    742     z_size_t len;
    743 {
    744     /* Return initial CRC, if requested. */
    745     if (buf == Z_NULL) return 0;
    746 
    747 #ifdef DYNAMIC_CRC_TABLE
    748     once(&made, make_crc_table);
    749 #endif /* DYNAMIC_CRC_TABLE */
    750 
    751     /* Pre-condition the CRC */
    752     crc ^= 0xffffffff;
    753 
    754 #ifdef W
    755 
    756     /* If provided enough bytes, do a braided CRC calculation. */
    757     if (len >= N * W + W - 1) {
    758         z_size_t blks;
    759         z_word_t const *words;
    760         unsigned endian;
    761         int k;
    762 
    763         /* Compute the CRC up to a z_word_t boundary. */
    764         while (len && ((z_size_t)buf & (W - 1)) != 0) {
    765             len--;
    766             crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
    767         }
    768 
    769         /* Compute the CRC on as many N z_word_t blocks as are available. */
    770         blks = len / (N * W);
    771         len -= blks * N * W;
    772         words = (z_word_t const *)buf;
    773 
    774         /* Do endian check at execution time instead of compile time, since ARM
    775            processors can change the endianess at execution time. If the
    776            compiler knows what the endianess will be, it can optimize out the
    777            check and the unused branch. */
    778         endian = 1;
    779         if (*(unsigned char *)&endian) {
    780             /* Little endian. */
    781 
    782             z_crc_t crc0;
    783             z_word_t word0;
    784 #if N > 1
    785             z_crc_t crc1;
    786             z_word_t word1;
    787 #if N > 2
    788             z_crc_t crc2;
    789             z_word_t word2;
    790 #if N > 3
    791             z_crc_t crc3;
    792             z_word_t word3;
    793 #if N > 4
    794             z_crc_t crc4;
    795             z_word_t word4;
    796 #if N > 5
    797             z_crc_t crc5;
    798             z_word_t word5;
    799 #endif
    800 #endif
    801 #endif
    802 #endif
    803 #endif
    804 
    805             /* Initialize the CRC for each braid. */
    806             crc0 = crc;
    807 #if N > 1
    808             crc1 = 0;
    809 #if N > 2
    810             crc2 = 0;
    811 #if N > 3
    812             crc3 = 0;
    813 #if N > 4
    814             crc4 = 0;
    815 #if N > 5
    816             crc5 = 0;
    817 #endif
    818 #endif
    819 #endif
    820 #endif
    821 #endif
    822 
    823             /*
    824               Process the first blks-1 blocks, computing the CRCs on each braid
    825               independently.
    826              */
    827             while (--blks) {
    828                 /* Load the word for each braid into registers. */
    829                 word0 = crc0 ^ words[0];
    830 #if N > 1
    831                 word1 = crc1 ^ words[1];
    832 #if N > 2
    833                 word2 = crc2 ^ words[2];
    834 #if N > 3
    835                 word3 = crc3 ^ words[3];
    836 #if N > 4
    837                 word4 = crc4 ^ words[4];
    838 #if N > 5
    839                 word5 = crc5 ^ words[5];
    840 #endif
    841 #endif
    842 #endif
    843 #endif
    844 #endif
    845                 words += N;
    846 
    847                 /* Compute and update the CRC for each word. The loop should
    848                    get unrolled. */
    849                 crc0 = crc_braid_table[0][word0 & 0xff];
    850 #if N > 1
    851                 crc1 = crc_braid_table[0][word1 & 0xff];
    852 #if N > 2
    853                 crc2 = crc_braid_table[0][word2 & 0xff];
    854 #if N > 3
    855                 crc3 = crc_braid_table[0][word3 & 0xff];
    856 #if N > 4
    857                 crc4 = crc_braid_table[0][word4 & 0xff];
    858 #if N > 5
    859                 crc5 = crc_braid_table[0][word5 & 0xff];
    860 #endif
    861 #endif
    862 #endif
    863 #endif
    864 #endif
    865                 for (k = 1; k < W; k++) {
    866                     crc0 ^= crc_braid_table[k][(word0 >> (k << 3)) & 0xff];
    867 #if N > 1
    868                     crc1 ^= crc_braid_table[k][(word1 >> (k << 3)) & 0xff];
    869 #if N > 2
    870                     crc2 ^= crc_braid_table[k][(word2 >> (k << 3)) & 0xff];
    871 #if N > 3
    872                     crc3 ^= crc_braid_table[k][(word3 >> (k << 3)) & 0xff];
    873 #if N > 4
    874                     crc4 ^= crc_braid_table[k][(word4 >> (k << 3)) & 0xff];
    875 #if N > 5
    876                     crc5 ^= crc_braid_table[k][(word5 >> (k << 3)) & 0xff];
    877 #endif
    878 #endif
    879 #endif
    880 #endif
    881 #endif
    882                 }
    883             }
    884 
    885             /*
    886               Process the last block, combining the CRCs of the N braids at the
    887               same time.
    888              */
    889             crc = crc_word(crc0 ^ words[0]);
    890 #if N > 1
    891             crc = crc_word(crc1 ^ words[1] ^ crc);
    892 #if N > 2
    893             crc = crc_word(crc2 ^ words[2] ^ crc);
    894 #if N > 3
    895             crc = crc_word(crc3 ^ words[3] ^ crc);
    896 #if N > 4
    897             crc = crc_word(crc4 ^ words[4] ^ crc);
    898 #if N > 5
    899             crc = crc_word(crc5 ^ words[5] ^ crc);
    900 #endif
    901 #endif
    902 #endif
    903 #endif
    904 #endif
    905             words += N;
    906         }
    907         else {
    908             /* Big endian. */
    909 
    910             z_word_t crc0, word0, comb;
    911 #if N > 1
    912             z_word_t crc1, word1;
    913 #if N > 2
    914             z_word_t crc2, word2;
    915 #if N > 3
    916             z_word_t crc3, word3;
    917 #if N > 4
    918             z_word_t crc4, word4;
    919 #if N > 5
    920             z_word_t crc5, word5;
    921 #endif
    922 #endif
    923 #endif
    924 #endif
    925 #endif
    926 
    927             /* Initialize the CRC for each braid. */
    928             crc0 = byte_swap(crc);
    929 #if N > 1
    930             crc1 = 0;
    931 #if N > 2
    932             crc2 = 0;
    933 #if N > 3
    934             crc3 = 0;
    935 #if N > 4
    936             crc4 = 0;
    937 #if N > 5
    938             crc5 = 0;
    939 #endif
    940 #endif
    941 #endif
    942 #endif
    943 #endif
    944 
    945             /*
    946               Process the first blks-1 blocks, computing the CRCs on each braid
    947               independently.
    948              */
    949             while (--blks) {
    950                 /* Load the word for each braid into registers. */
    951                 word0 = crc0 ^ words[0];
    952 #if N > 1
    953                 word1 = crc1 ^ words[1];
    954 #if N > 2
    955                 word2 = crc2 ^ words[2];
    956 #if N > 3
    957                 word3 = crc3 ^ words[3];
    958 #if N > 4
    959                 word4 = crc4 ^ words[4];
    960 #if N > 5
    961                 word5 = crc5 ^ words[5];
    962 #endif
    963 #endif
    964 #endif
    965 #endif
    966 #endif
    967                 words += N;
    968 
    969                 /* Compute and update the CRC for each word. The loop should
    970                    get unrolled. */
    971                 crc0 = crc_braid_big_table[0][word0 & 0xff];
    972 #if N > 1
    973                 crc1 = crc_braid_big_table[0][word1 & 0xff];
    974 #if N > 2
    975                 crc2 = crc_braid_big_table[0][word2 & 0xff];
    976 #if N > 3
    977                 crc3 = crc_braid_big_table[0][word3 & 0xff];
    978 #if N > 4
    979                 crc4 = crc_braid_big_table[0][word4 & 0xff];
    980 #if N > 5
    981                 crc5 = crc_braid_big_table[0][word5 & 0xff];
    982 #endif
    983 #endif
    984 #endif
    985 #endif
    986 #endif
    987                 for (k = 1; k < W; k++) {
    988                     crc0 ^= crc_braid_big_table[k][(word0 >> (k << 3)) & 0xff];
    989 #if N > 1
    990                     crc1 ^= crc_braid_big_table[k][(word1 >> (k << 3)) & 0xff];
    991 #if N > 2
    992                     crc2 ^= crc_braid_big_table[k][(word2 >> (k << 3)) & 0xff];
    993 #if N > 3
    994                     crc3 ^= crc_braid_big_table[k][(word3 >> (k << 3)) & 0xff];
    995 #if N > 4
    996                     crc4 ^= crc_braid_big_table[k][(word4 >> (k << 3)) & 0xff];
    997 #if N > 5
    998                     crc5 ^= crc_braid_big_table[k][(word5 >> (k << 3)) & 0xff];
    999 #endif
   1000 #endif
   1001 #endif
   1002 #endif
   1003 #endif
   1004                 }
   1005             }
   1006 
   1007             /*
   1008               Process the last block, combining the CRCs of the N braids at the
   1009               same time.
   1010              */
   1011             comb = crc_word_big(crc0 ^ words[0]);
   1012 #if N > 1
   1013             comb = crc_word_big(crc1 ^ words[1] ^ comb);
   1014 #if N > 2
   1015             comb = crc_word_big(crc2 ^ words[2] ^ comb);
   1016 #if N > 3
   1017             comb = crc_word_big(crc3 ^ words[3] ^ comb);
   1018 #if N > 4
   1019             comb = crc_word_big(crc4 ^ words[4] ^ comb);
   1020 #if N > 5
   1021             comb = crc_word_big(crc5 ^ words[5] ^ comb);
   1022 #endif
   1023 #endif
   1024 #endif
   1025 #endif
   1026 #endif
   1027             words += N;
   1028             crc = byte_swap(comb);
   1029         }
   1030 
   1031         /*
   1032           Update the pointer to the remaining bytes to process.
   1033          */
   1034         buf = (unsigned char const *)words;
   1035     }
   1036 
   1037 #endif /* W */
   1038 
   1039     /* Complete the computation of the CRC on any remaining bytes. */
   1040     while (len >= 8) {
   1041         len -= 8;
   1042         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
   1043         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
   1044         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
   1045         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
   1046         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
   1047         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
   1048         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
   1049         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
   1050     }
   1051     while (len) {
   1052         len--;
   1053         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
   1054     }
   1055 
   1056     /* Return the CRC, post-conditioned. */
   1057     return crc ^ 0xffffffff;
   1058 }
   1059 
   1060 #endif
   1061 
   1062 /* ========================================================================= */
   1063 unsigned long ZEXPORT crc32(crc, buf, len)
   1064     unsigned long crc;
   1065     const unsigned char FAR *buf;
   1066     uInt len;
   1067 {
   1068     return crc32_z(crc, buf, len);
   1069 }
   1070 
   1071 /* ========================================================================= */
   1072 uLong ZEXPORT crc32_combine64(crc1, crc2, len2)
   1073     uLong crc1;
   1074     uLong crc2;
   1075     z_off64_t len2;
   1076 {
   1077 #ifdef DYNAMIC_CRC_TABLE
   1078     once(&made, make_crc_table);
   1079 #endif /* DYNAMIC_CRC_TABLE */
   1080     return multmodp(x2nmodp(len2, 3), crc1) ^ crc2;
   1081 }
   1082 
   1083 /* ========================================================================= */
   1084 uLong ZEXPORT crc32_combine(crc1, crc2, len2)
   1085     uLong crc1;
   1086     uLong crc2;
   1087     z_off_t len2;
   1088 {
   1089     return crc32_combine64(crc1, crc2, len2);
   1090 }
   1091 
   1092 /* ========================================================================= */
   1093 uLong ZEXPORT crc32_combine_gen64(len2)
   1094     z_off64_t len2;
   1095 {
   1096 #ifdef DYNAMIC_CRC_TABLE
   1097     once(&made, make_crc_table);
   1098 #endif /* DYNAMIC_CRC_TABLE */
   1099     return x2nmodp(len2, 3);
   1100 }
   1101 
   1102 /* ========================================================================= */
   1103 uLong ZEXPORT crc32_combine_gen(len2)
   1104     z_off_t len2;
   1105 {
   1106     return crc32_combine_gen64(len2);
   1107 }
   1108 
   1109 /* ========================================================================= */
   1110 uLong crc32_combine_op(crc1, crc2, op)
   1111     uLong crc1;
   1112     uLong crc2;
   1113     uLong op;
   1114 {
   1115     return multmodp(op, crc1) ^ crc2;
   1116 }
   1117