Home | History | Annotate | Line # | Download | only in zlib
crc32.c revision 1.1.1.4
      1 /* crc32.c -- compute the CRC-32 of a data stream
      2  * Copyright (C) 1995-2022 Mark Adler
      3  * For conditions of distribution and use, see copyright notice in zlib.h
      4  *
      5  * This interleaved implementation of a CRC makes use of pipelined multiple
      6  * arithmetic-logic units, commonly found in modern CPU cores. It is due to
      7  * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
      8  */
      9 
     10 /* @(#) Id */
     11 
     12 /*
     13   Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore
     14   protection on the static variables used to control the first-use generation
     15   of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should
     16   first call get_crc_table() to initialize the tables before allowing more than
     17   one thread to use crc32().
     18 
     19   MAKECRCH can be #defined to write out crc32.h. A main() routine is also
     20   produced, so that this one source file can be compiled to an executable.
     21  */
     22 
     23 #ifdef MAKECRCH
     24 #  include <stdio.h>
     25 #  ifndef DYNAMIC_CRC_TABLE
     26 #    define DYNAMIC_CRC_TABLE
     27 #  endif /* !DYNAMIC_CRC_TABLE */
     28 #endif /* MAKECRCH */
     29 
     30 #include "zutil.h"      /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */
     31 
     32  /*
     33   A CRC of a message is computed on N braids of words in the message, where
     34   each word consists of W bytes (4 or 8). If N is 3, for example, then three
     35   running sparse CRCs are calculated respectively on each braid, at these
     36   indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
     37   This is done starting at a word boundary, and continues until as many blocks
     38   of N * W bytes as are available have been processed. The results are combined
     39   into a single CRC at the end. For this code, N must be in the range 1..6 and
     40   W must be 4 or 8. The upper limit on N can be increased if desired by adding
     41   more #if blocks, extending the patterns apparent in the code. In addition,
     42   crc32.h would need to be regenerated, if the maximum N value is increased.
     43 
     44   N and W are chosen empirically by benchmarking the execution time on a given
     45   processor. The choices for N and W below were based on testing on Intel Kaby
     46   Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
     47   Octeon II processors. The Intel, AMD, and ARM processors were all fastest
     48   with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
     49   They were all tested with either gcc or clang, all using the -O3 optimization
     50   level. Your mileage may vary.
     51  */
     52 
     53 /* Define N */
     54 #ifdef Z_TESTN
     55 #  define N Z_TESTN
     56 #else
     57 #  define N 5
     58 #endif
     59 #if N < 1 || N > 6
     60 #  error N must be in 1..6
     61 #endif
     62 
     63 /*
     64   z_crc_t must be at least 32 bits. z_word_t must be at least as long as
     65   z_crc_t. It is assumed here that z_word_t is either 32 bits or 64 bits, and
     66   that bytes are eight bits.
     67  */
     68 
     69 /*
     70   Define W and the associated z_word_t type. If W is not defined, then a
     71   braided calculation is not used, and the associated tables and code are not
     72   compiled.
     73  */
     74 #ifdef Z_TESTW
     75 #  if Z_TESTW-1 != -1
     76 #    define W Z_TESTW
     77 #  endif
     78 #else
     79 #  ifdef MAKECRCH
     80 #    define W 8         /* required for MAKECRCH */
     81 #  else
     82 #    if defined(__x86_64__) || defined(__aarch64__)
     83 #      define W 8
     84 #    else
     85 #      define W 4
     86 #    endif
     87 #  endif
     88 #endif
     89 #ifdef W
     90 #  if W == 8 && defined(Z_U8)
     91      typedef Z_U8 z_word_t;
     92 #  elif defined(Z_U4)
     93 #    undef W
     94 #    define W 4
     95      typedef Z_U4 z_word_t;
     96 #  else
     97 #    undef W
     98 #  endif
     99 #endif
    100 
    101 /* If available, use the ARM processor CRC32 instruction. */
    102 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) && W == 8
    103 #  define ARMCRC32
    104 #endif
    105 
    106 /* Local functions. */
    107 local z_crc_t multmodp OF((z_crc_t a, z_crc_t b));
    108 local z_crc_t x2nmodp OF((z_off64_t n, unsigned k));
    109 
    110 #if defined(W) && (!defined(ARMCRC32) || defined(DYNAMIC_CRC_TABLE))
    111     local z_word_t byte_swap OF((z_word_t word));
    112 #endif
    113 
    114 #if defined(W) && !defined(ARMCRC32)
    115     local z_crc_t crc_word OF((z_word_t data));
    116     local z_word_t crc_word_big OF((z_word_t data));
    117 #endif
    118 
    119 #if defined(W) && (!defined(ARMCRC32) || defined(DYNAMIC_CRC_TABLE))
    120 /*
    121   Swap the bytes in a z_word_t to convert between little and big endian. Any
    122   self-respecting compiler will optimize this to a single machine byte-swap
    123   instruction, if one is available. This assumes that word_t is either 32 bits
    124   or 64 bits.
    125  */
    126 local z_word_t byte_swap(word)
    127     z_word_t word;
    128 {
    129 #  if W == 8
    130     return
    131         (word & 0xff00000000000000) >> 56 |
    132         (word & 0xff000000000000) >> 40 |
    133         (word & 0xff0000000000) >> 24 |
    134         (word & 0xff00000000) >> 8 |
    135         (word & 0xff000000) << 8 |
    136         (word & 0xff0000) << 24 |
    137         (word & 0xff00) << 40 |
    138         (word & 0xff) << 56;
    139 #  else   /* W == 4 */
    140     return
    141         (word & 0xff000000) >> 24 |
    142         (word & 0xff0000) >> 8 |
    143         (word & 0xff00) << 8 |
    144         (word & 0xff) << 24;
    145 #  endif
    146 }
    147 #endif
    148 
    149 /* CRC polynomial. */
    150 #define POLY 0xedb88320         /* p(x) reflected, with x^32 implied */
    151 
    152 #ifdef DYNAMIC_CRC_TABLE
    153 
    154 local z_crc_t FAR crc_table[256];
    155 local z_crc_t FAR x2n_table[32];
    156 local void make_crc_table OF((void));
    157 #ifdef W
    158    local z_word_t FAR crc_big_table[256];
    159    local z_crc_t FAR crc_braid_table[W][256];
    160    local z_word_t FAR crc_braid_big_table[W][256];
    161    local void braid OF((z_crc_t [][256], z_word_t [][256], int, int));
    162 #endif
    163 #ifdef MAKECRCH
    164    local void write_table OF((FILE *, const z_crc_t FAR *, int));
    165    local void write_table32hi OF((FILE *, const z_word_t FAR *, int));
    166    local void write_table64 OF((FILE *, const z_word_t FAR *, int));
    167 #endif /* MAKECRCH */
    168 
    169 /*
    170   Define a once() function depending on the availability of atomics. If this is
    171   compiled with DYNAMIC_CRC_TABLE defined, and if CRCs will be computed in
    172   multiple threads, and if atomics are not available, then get_crc_table() must
    173   be called to initialize the tables and must return before any threads are
    174   allowed to compute or combine CRCs.
    175  */
    176 
    177 /* Definition of once functionality. */
    178 typedef struct once_s once_t;
    179 local void once OF((once_t *, void (*)(void)));
    180 
    181 /* Check for the availability of atomics. */
    182 #if defined(__STDC__) && __STDC_VERSION__ >= 201112L && \
    183     !defined(__STDC_NO_ATOMICS__)
    184 
    185 #include <stdatomic.h>
    186 
    187 /* Structure for once(), which must be initialized with ONCE_INIT. */
    188 struct once_s {
    189     atomic_flag begun;
    190     atomic_int done;
    191 };
    192 #define ONCE_INIT {ATOMIC_FLAG_INIT, 0}
    193 
    194 /*
    195   Run the provided init() function exactly once, even if multiple threads
    196   invoke once() at the same time. The state must be a once_t initialized with
    197   ONCE_INIT.
    198  */
    199 local void once(state, init)
    200     once_t *state;
    201     void (*init)(void);
    202 {
    203     if (!atomic_load(&state->done)) {
    204         if (atomic_flag_test_and_set(&state->begun))
    205             while (!atomic_load(&state->done))
    206                 ;
    207         else {
    208             init();
    209             atomic_store(&state->done, 1);
    210         }
    211     }
    212 }
    213 
    214 #else   /* no atomics */
    215 
    216 /* Structure for once(), which must be initialized with ONCE_INIT. */
    217 struct once_s {
    218     volatile int begun;
    219     volatile int done;
    220 };
    221 #define ONCE_INIT {0, 0}
    222 
    223 /* Test and set. Alas, not atomic, but tries to minimize the period of
    224    vulnerability. */
    225 local int test_and_set OF((int volatile *));
    226 local int test_and_set(flag)
    227     int volatile *flag;
    228 {
    229     int was;
    230 
    231     was = *flag;
    232     *flag = 1;
    233     return was;
    234 }
    235 
    236 /* Run the provided init() function once. This is not thread-safe. */
    237 local void once(state, init)
    238     once_t *state;
    239     void (*init)(void);
    240 {
    241     if (!state->done) {
    242         if (test_and_set(&state->begun))
    243             while (!state->done)
    244                 ;
    245         else {
    246             init();
    247             state->done = 1;
    248         }
    249     }
    250 }
    251 
    252 #endif
    253 
    254 /* State for once(). */
    255 local once_t made = ONCE_INIT;
    256 
    257 /*
    258   Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
    259   x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.
    260 
    261   Polynomials over GF(2) are represented in binary, one bit per coefficient,
    262   with the lowest powers in the most significant bit. Then adding polynomials
    263   is just exclusive-or, and multiplying a polynomial by x is a right shift by
    264   one. If we call the above polynomial p, and represent a byte as the
    265   polynomial q, also with the lowest power in the most significant bit (so the
    266   byte 0xb1 is the polynomial x^7+x^3+x^2+1), then the CRC is (q*x^32) mod p,
    267   where a mod b means the remainder after dividing a by b.
    268 
    269   This calculation is done using the shift-register method of multiplying and
    270   taking the remainder. The register is initialized to zero, and for each
    271   incoming bit, x^32 is added mod p to the register if the bit is a one (where
    272   x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by x
    273   (which is shifting right by one and adding x^32 mod p if the bit shifted out
    274   is a one). We start with the highest power (least significant bit) of q and
    275   repeat for all eight bits of q.
    276 
    277   The table is simply the CRC of all possible eight bit values. This is all the
    278   information needed to generate CRCs on data a byte at a time for all
    279   combinations of CRC register values and incoming bytes.
    280  */
    281 
    282 local void make_crc_table()
    283 {
    284     unsigned i, j, n;
    285     z_crc_t p;
    286 
    287     /* initialize the CRC of bytes tables */
    288     for (i = 0; i < 256; i++) {
    289         p = i;
    290         for (j = 0; j < 8; j++)
    291             p = p & 1 ? (p >> 1) ^ POLY : p >> 1;
    292         crc_table[i] = p;
    293 #ifdef W
    294         crc_big_table[i] = byte_swap(p);
    295 #endif
    296     }
    297 
    298     /* initialize the x^2^n mod p(x) table */
    299     p = (z_crc_t)1 << 30;         /* x^1 */
    300     x2n_table[0] = p;
    301     for (n = 1; n < 32; n++)
    302         x2n_table[n] = p = multmodp(p, p);
    303 
    304 #ifdef W
    305     /* initialize the braiding tables -- needs x2n_table[] */
    306     braid(crc_braid_table, crc_braid_big_table, N, W);
    307 #endif
    308 
    309 #ifdef MAKECRCH
    310     {
    311         /*
    312           The crc32.h header file contains tables for both 32-bit and 64-bit
    313           z_word_t's, and so requires a 64-bit type be available. In that case,
    314           z_word_t must be defined to be 64-bits. This code then also generates
    315           and writes out the tables for the case that z_word_t is 32 bits.
    316          */
    317 #if !defined(W) || W != 8
    318 #  error Need a 64-bit integer type in order to generate crc32.h.
    319 #endif
    320         FILE *out;
    321         int k, n;
    322         z_crc_t ltl[8][256];
    323         z_word_t big[8][256];
    324 
    325         out = fopen("crc32.h", "w");
    326         if (out == NULL) return;
    327 
    328         /* write out little-endian CRC table to crc32.h */
    329         fprintf(out,
    330             "/* crc32.h -- tables for rapid CRC calculation\n"
    331             " * Generated automatically by crc32.c\n */\n"
    332             "\n"
    333             "local const z_crc_t FAR crc_table[] = {\n"
    334             "    ");
    335         write_table(out, crc_table, 256);
    336         fprintf(out,
    337             "};\n");
    338 
    339         /* write out big-endian CRC table for 64-bit z_word_t to crc32.h */
    340         fprintf(out,
    341             "\n"
    342             "#ifdef W\n"
    343             "\n"
    344             "#if W == 8\n"
    345             "\n"
    346             "local const z_word_t FAR crc_big_table[] = {\n"
    347             "    ");
    348         write_table64(out, crc_big_table, 256);
    349         fprintf(out,
    350             "};\n");
    351 
    352         /* write out big-endian CRC table for 32-bit z_word_t to crc32.h */
    353         fprintf(out,
    354             "\n"
    355             "#else /* W == 4 */\n"
    356             "\n"
    357             "local const z_word_t FAR crc_big_table[] = {\n"
    358             "    ");
    359         write_table32hi(out, crc_big_table, 256);
    360         fprintf(out,
    361             "};\n"
    362             "\n"
    363             "#endif\n");
    364 
    365         /* write out braid tables for each value of N */
    366         for (n = 1; n <= 6; n++) {
    367             fprintf(out,
    368             "\n"
    369             "#if N == %d\n", n);
    370 
    371             /* compute braid tables for this N and 64-bit word_t */
    372             braid(ltl, big, n, 8);
    373 
    374             /* write out braid tables for 64-bit z_word_t to crc32.h */
    375             fprintf(out,
    376             "\n"
    377             "#if W == 8\n"
    378             "\n"
    379             "local const z_crc_t FAR crc_braid_table[][256] = {\n");
    380             for (k = 0; k < 8; k++) {
    381                 fprintf(out, "   {");
    382                 write_table(out, ltl[k], 256);
    383                 fprintf(out, "}%s", k < 7 ? ",\n" : "");
    384             }
    385             fprintf(out,
    386             "};\n"
    387             "\n"
    388             "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
    389             for (k = 0; k < 8; k++) {
    390                 fprintf(out, "   {");
    391                 write_table64(out, big[k], 256);
    392                 fprintf(out, "}%s", k < 7 ? ",\n" : "");
    393             }
    394             fprintf(out,
    395             "};\n");
    396 
    397             /* compute braid tables for this N and 32-bit word_t */
    398             braid(ltl, big, n, 4);
    399 
    400             /* write out braid tables for 32-bit z_word_t to crc32.h */
    401             fprintf(out,
    402             "\n"
    403             "#else /* W == 4 */\n"
    404             "\n"
    405             "local const z_crc_t FAR crc_braid_table[][256] = {\n");
    406             for (k = 0; k < 4; k++) {
    407                 fprintf(out, "   {");
    408                 write_table(out, ltl[k], 256);
    409                 fprintf(out, "}%s", k < 3 ? ",\n" : "");
    410             }
    411             fprintf(out,
    412             "};\n"
    413             "\n"
    414             "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
    415             for (k = 0; k < 4; k++) {
    416                 fprintf(out, "   {");
    417                 write_table32hi(out, big[k], 256);
    418                 fprintf(out, "}%s", k < 3 ? ",\n" : "");
    419             }
    420             fprintf(out,
    421             "};\n"
    422             "\n"
    423             "#endif\n"
    424             "\n"
    425             "#endif\n");
    426         }
    427         fprintf(out,
    428             "\n"
    429             "#endif\n");
    430 
    431         /* write out zeros operator table to crc32.h */
    432         fprintf(out,
    433             "\n"
    434             "local const z_crc_t FAR x2n_table[] = {\n"
    435             "    ");
    436         write_table(out, x2n_table, 32);
    437         fprintf(out,
    438             "};\n");
    439         fclose(out);
    440     }
    441 #endif /* MAKECRCH */
    442 }
    443 
    444 #ifdef MAKECRCH
    445 
    446 /*
    447    Write the 32-bit values in table[0..k-1] to out, five per line in
    448    hexadecimal separated by commas.
    449  */
    450 local void write_table(out, table, k)
    451     FILE *out;
    452     const z_crc_t FAR *table;
    453     int k;
    454 {
    455     int n;
    456 
    457     for (n = 0; n < k; n++)
    458         fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
    459                 (unsigned long)(table[n]),
    460                 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
    461 }
    462 
    463 /*
    464    Write the high 32-bits of each value in table[0..k-1] to out, five per line
    465    in hexadecimal separated by commas.
    466  */
    467 local void write_table32hi(out, table, k)
    468 FILE *out;
    469 const z_word_t FAR *table;
    470 int k;
    471 {
    472     int n;
    473 
    474     for (n = 0; n < k; n++)
    475         fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
    476                 (unsigned long)(table[n] >> 32),
    477                 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
    478 }
    479 
    480 /*
    481   Write the 64-bit values in table[0..k-1] to out, three per line in
    482   hexadecimal separated by commas. This assumes that if there is a 64-bit
    483   type, then there is also a long long integer type, and it is at least 64
    484   bits. If not, then the type cast and format string can be adjusted
    485   accordingly.
    486  */
    487 local void write_table64(out, table, k)
    488     FILE *out;
    489     const z_word_t FAR *table;
    490     int k;
    491 {
    492     int n;
    493 
    494     for (n = 0; n < k; n++)
    495         fprintf(out, "%s0x%016llx%s", n == 0 || n % 3 ? "" : "    ",
    496                 (unsigned long long)(table[n]),
    497                 n == k - 1 ? "" : (n % 3 == 2 ? ",\n" : ", "));
    498 }
    499 
    500 /* Actually do the deed. */
    501 int main()
    502 {
    503     make_crc_table();
    504     return 0;
    505 }
    506 
    507 #endif /* MAKECRCH */
    508 
    509 #ifdef W
    510 /*
    511   Generate the little and big-endian braid tables for the given n and z_word_t
    512   size w. Each array must have room for w blocks of 256 elements.
    513  */
    514 local void braid(ltl, big, n, w)
    515     z_crc_t ltl[][256];
    516     z_word_t big[][256];
    517     int n;
    518     int w;
    519 {
    520     int k;
    521     z_crc_t i, p, q;
    522     for (k = 0; k < w; k++) {
    523         p = x2nmodp((n * w + 3 - k) << 3, 0);
    524         ltl[k][0] = 0;
    525         big[w - 1 - k][0] = 0;
    526         for (i = 1; i < 256; i++) {
    527             ltl[k][i] = q = multmodp(i << 24, p);
    528             big[w - 1 - k][i] = byte_swap(q);
    529         }
    530     }
    531 }
    532 #endif
    533 
    534 #else /* !DYNAMIC_CRC_TABLE */
    535 /* ========================================================================
    536  * Tables for byte-wise and braided CRC-32 calculations, and a table of powers
    537  * of x for combining CRC-32s, all made by make_crc_table().
    538  */
    539 #include "crc32.h"
    540 #endif /* DYNAMIC_CRC_TABLE */
    541 
    542 /* ========================================================================
    543  * Routines used for CRC calculation. Some are also required for the table
    544  * generation above.
    545  */
    546 
    547 /*
    548   Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC polynomial,
    549   reflected. For speed, this requires that a not be zero.
    550  */
    551 local z_crc_t multmodp(a, b)
    552     z_crc_t a;
    553     z_crc_t b;
    554 {
    555     z_crc_t m, p;
    556 
    557     m = (z_crc_t)1 << 31;
    558     p = 0;
    559     for (;;) {
    560         if (a & m) {
    561             p ^= b;
    562             if ((a & (m - 1)) == 0)
    563                 break;
    564         }
    565         m >>= 1;
    566         b = b & 1 ? (b >> 1) ^ POLY : b >> 1;
    567     }
    568     return p;
    569 }
    570 
    571 /*
    572   Return x^(n * 2^k) modulo p(x). Requires that x2n_table[] has been
    573   initialized.
    574  */
    575 local z_crc_t x2nmodp(n, k)
    576     z_off64_t n;
    577     unsigned k;
    578 {
    579     z_crc_t p;
    580 
    581     p = (z_crc_t)1 << 31;           /* x^0 == 1 */
    582     while (n) {
    583         if (n & 1)
    584             p = multmodp(x2n_table[k & 31], p);
    585         n >>= 1;
    586         k++;
    587     }
    588     return p;
    589 }
    590 
    591 /* =========================================================================
    592  * This function can be used by asm versions of crc32(), and to force the
    593  * generation of the CRC tables in a threaded application.
    594  */
    595 const z_crc_t FAR * ZEXPORT get_crc_table()
    596 {
    597 #ifdef DYNAMIC_CRC_TABLE
    598     once(&made, make_crc_table);
    599 #endif /* DYNAMIC_CRC_TABLE */
    600     return (const z_crc_t FAR *)crc_table;
    601 }
    602 
    603 /* =========================================================================
    604  * Use ARM machine instructions if available. This will compute the CRC about
    605  * ten times faster than the braided calculation. This code does not check for
    606  * the presence of the CRC instruction at run time. __ARM_FEATURE_CRC32 will
    607  * only be defined if the compilation specifies an ARM processor architecture
    608  * that has the instructions. For example, compiling with -march=armv8.1-a or
    609  * -march=armv8-a+crc, or -march=native if the compile machine has the crc32
    610  * instructions.
    611  */
    612 #ifdef ARMCRC32
    613 
    614 /*
    615    Constants empirically determined to maximize speed. These values are from
    616    measurements on a Cortex-A57. Your mileage may vary.
    617  */
    618 #define Z_BATCH 3990                /* number of words in a batch */
    619 #define Z_BATCH_ZEROS 0xa10d3d0c    /* computed from Z_BATCH = 3990 */
    620 #define Z_BATCH_MIN 800             /* fewest words in a final batch */
    621 
    622 unsigned long ZEXPORT crc32_z(crc, buf, len)
    623     unsigned long crc;
    624     const unsigned char FAR *buf;
    625     z_size_t len;
    626 {
    627     z_crc_t val;
    628     z_word_t crc1, crc2;
    629     const z_word_t *word;
    630     z_word_t val0, val1, val2;
    631     z_size_t last, last2, i;
    632     z_size_t num;
    633 
    634     /* Return initial CRC, if requested. */
    635     if (buf == Z_NULL) return 0;
    636 
    637 #ifdef DYNAMIC_CRC_TABLE
    638     once(&made, make_crc_table);
    639 #endif /* DYNAMIC_CRC_TABLE */
    640 
    641     /* Pre-condition the CRC */
    642     crc = (~crc) & 0xffffffff;
    643 
    644     /* Compute the CRC up to a word boundary. */
    645     while (len && ((z_size_t)buf & 7) != 0) {
    646         len--;
    647         val = *buf++;
    648         __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
    649     }
    650 
    651     /* Prepare to compute the CRC on full 64-bit words word[0..num-1]. */
    652     word = (z_word_t const *)buf;
    653     num = len >> 3;
    654     len &= 7;
    655 
    656     /* Do three interleaved CRCs to realize the throughput of one crc32x
    657        instruction per cycle. Each CRC is calculated on Z_BATCH words. The
    658        three CRCs are combined into a single CRC after each set of batches. */
    659     while (num >= 3 * Z_BATCH) {
    660         crc1 = 0;
    661         crc2 = 0;
    662         for (i = 0; i < Z_BATCH; i++) {
    663             val0 = word[i];
    664             val1 = word[i + Z_BATCH];
    665             val2 = word[i + 2 * Z_BATCH];
    666             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
    667             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
    668             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
    669         }
    670         word += 3 * Z_BATCH;
    671         num -= 3 * Z_BATCH;
    672         crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc1;
    673         crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc2;
    674     }
    675 
    676     /* Do one last smaller batch with the remaining words, if there are enough
    677        to pay for the combination of CRCs. */
    678     last = num / 3;
    679     if (last >= Z_BATCH_MIN) {
    680         last2 = last << 1;
    681         crc1 = 0;
    682         crc2 = 0;
    683         for (i = 0; i < last; i++) {
    684             val0 = word[i];
    685             val1 = word[i + last];
    686             val2 = word[i + last2];
    687             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
    688             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
    689             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
    690         }
    691         word += 3 * last;
    692         num -= 3 * last;
    693         val = x2nmodp(last, 6);
    694         crc = multmodp(val, crc) ^ crc1;
    695         crc = multmodp(val, crc) ^ crc2;
    696     }
    697 
    698     /* Compute the CRC on any remaining words. */
    699     for (i = 0; i < num; i++) {
    700         val0 = word[i];
    701         __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
    702     }
    703     word += num;
    704 
    705     /* Complete the CRC on any remaining bytes. */
    706     buf = (const unsigned char FAR *)word;
    707     while (len) {
    708         len--;
    709         val = *buf++;
    710         __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
    711     }
    712 
    713     /* Return the CRC, post-conditioned. */
    714     return crc ^ 0xffffffff;
    715 }
    716 
    717 #else
    718 
    719 #ifdef W
    720 
    721 /*
    722   Return the CRC of the W bytes in the word_t data, taking the
    723   least-significant byte of the word as the first byte of data, without any pre
    724   or post conditioning. This is used to combine the CRCs of each braid.
    725  */
    726 local z_crc_t crc_word(data)
    727     z_word_t data;
    728 {
    729     int k;
    730     for (k = 0; k < W; k++)
    731         data = (data >> 8) ^ crc_table[data & 0xff];
    732     return (z_crc_t)data;
    733 }
    734 
    735 local z_word_t crc_word_big(data)
    736     z_word_t data;
    737 {
    738     int k;
    739     for (k = 0; k < W; k++)
    740         data = (data << 8) ^
    741             crc_big_table[(data >> ((W - 1) << 3)) & 0xff];
    742     return data;
    743 }
    744 
    745 #endif
    746 
    747 /* ========================================================================= */
    748 unsigned long ZEXPORT crc32_z(crc, buf, len)
    749     unsigned long crc;
    750     const unsigned char FAR *buf;
    751     z_size_t len;
    752 {
    753     /* Return initial CRC, if requested. */
    754     if (buf == Z_NULL) return 0;
    755 
    756 #ifdef DYNAMIC_CRC_TABLE
    757     once(&made, make_crc_table);
    758 #endif /* DYNAMIC_CRC_TABLE */
    759 
    760     /* Pre-condition the CRC */
    761     crc = (~crc) & 0xffffffff;
    762 
    763 #ifdef W
    764 
    765     /* If provided enough bytes, do a braided CRC calculation. */
    766     if (len >= N * W + W - 1) {
    767         z_size_t blks;
    768         z_word_t const *words;
    769         unsigned endian;
    770         int k;
    771 
    772         /* Compute the CRC up to a z_word_t boundary. */
    773         while (len && ((z_size_t)buf & (W - 1)) != 0) {
    774             len--;
    775             crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
    776         }
    777 
    778         /* Compute the CRC on as many N z_word_t blocks as are available. */
    779         blks = len / (N * W);
    780         len -= blks * N * W;
    781         words = (z_word_t const *)buf;
    782 
    783         /* Do endian check at execution time instead of compile time, since ARM
    784            processors can change the endianess at execution time. If the
    785            compiler knows what the endianess will be, it can optimize out the
    786            check and the unused branch. */
    787         endian = 1;
    788         if (*(unsigned char *)&endian) {
    789             /* Little endian. */
    790 
    791             z_crc_t crc0;
    792             z_word_t word0;
    793 #if N > 1
    794             z_crc_t crc1;
    795             z_word_t word1;
    796 #if N > 2
    797             z_crc_t crc2;
    798             z_word_t word2;
    799 #if N > 3
    800             z_crc_t crc3;
    801             z_word_t word3;
    802 #if N > 4
    803             z_crc_t crc4;
    804             z_word_t word4;
    805 #if N > 5
    806             z_crc_t crc5;
    807             z_word_t word5;
    808 #endif
    809 #endif
    810 #endif
    811 #endif
    812 #endif
    813 
    814             /* Initialize the CRC for each braid. */
    815             crc0 = crc;
    816 #if N > 1
    817             crc1 = 0;
    818 #if N > 2
    819             crc2 = 0;
    820 #if N > 3
    821             crc3 = 0;
    822 #if N > 4
    823             crc4 = 0;
    824 #if N > 5
    825             crc5 = 0;
    826 #endif
    827 #endif
    828 #endif
    829 #endif
    830 #endif
    831 
    832             /*
    833               Process the first blks-1 blocks, computing the CRCs on each braid
    834               independently.
    835              */
    836             while (--blks) {
    837                 /* Load the word for each braid into registers. */
    838                 word0 = crc0 ^ words[0];
    839 #if N > 1
    840                 word1 = crc1 ^ words[1];
    841 #if N > 2
    842                 word2 = crc2 ^ words[2];
    843 #if N > 3
    844                 word3 = crc3 ^ words[3];
    845 #if N > 4
    846                 word4 = crc4 ^ words[4];
    847 #if N > 5
    848                 word5 = crc5 ^ words[5];
    849 #endif
    850 #endif
    851 #endif
    852 #endif
    853 #endif
    854                 words += N;
    855 
    856                 /* Compute and update the CRC for each word. The loop should
    857                    get unrolled. */
    858                 crc0 = crc_braid_table[0][word0 & 0xff];
    859 #if N > 1
    860                 crc1 = crc_braid_table[0][word1 & 0xff];
    861 #if N > 2
    862                 crc2 = crc_braid_table[0][word2 & 0xff];
    863 #if N > 3
    864                 crc3 = crc_braid_table[0][word3 & 0xff];
    865 #if N > 4
    866                 crc4 = crc_braid_table[0][word4 & 0xff];
    867 #if N > 5
    868                 crc5 = crc_braid_table[0][word5 & 0xff];
    869 #endif
    870 #endif
    871 #endif
    872 #endif
    873 #endif
    874                 for (k = 1; k < W; k++) {
    875                     crc0 ^= crc_braid_table[k][(word0 >> (k << 3)) & 0xff];
    876 #if N > 1
    877                     crc1 ^= crc_braid_table[k][(word1 >> (k << 3)) & 0xff];
    878 #if N > 2
    879                     crc2 ^= crc_braid_table[k][(word2 >> (k << 3)) & 0xff];
    880 #if N > 3
    881                     crc3 ^= crc_braid_table[k][(word3 >> (k << 3)) & 0xff];
    882 #if N > 4
    883                     crc4 ^= crc_braid_table[k][(word4 >> (k << 3)) & 0xff];
    884 #if N > 5
    885                     crc5 ^= crc_braid_table[k][(word5 >> (k << 3)) & 0xff];
    886 #endif
    887 #endif
    888 #endif
    889 #endif
    890 #endif
    891                 }
    892             }
    893 
    894             /*
    895               Process the last block, combining the CRCs of the N braids at the
    896               same time.
    897              */
    898             crc = crc_word(crc0 ^ words[0]);
    899 #if N > 1
    900             crc = crc_word(crc1 ^ words[1] ^ crc);
    901 #if N > 2
    902             crc = crc_word(crc2 ^ words[2] ^ crc);
    903 #if N > 3
    904             crc = crc_word(crc3 ^ words[3] ^ crc);
    905 #if N > 4
    906             crc = crc_word(crc4 ^ words[4] ^ crc);
    907 #if N > 5
    908             crc = crc_word(crc5 ^ words[5] ^ crc);
    909 #endif
    910 #endif
    911 #endif
    912 #endif
    913 #endif
    914             words += N;
    915         }
    916         else {
    917             /* Big endian. */
    918 
    919             z_word_t crc0, word0, comb;
    920 #if N > 1
    921             z_word_t crc1, word1;
    922 #if N > 2
    923             z_word_t crc2, word2;
    924 #if N > 3
    925             z_word_t crc3, word3;
    926 #if N > 4
    927             z_word_t crc4, word4;
    928 #if N > 5
    929             z_word_t crc5, word5;
    930 #endif
    931 #endif
    932 #endif
    933 #endif
    934 #endif
    935 
    936             /* Initialize the CRC for each braid. */
    937             crc0 = byte_swap(crc);
    938 #if N > 1
    939             crc1 = 0;
    940 #if N > 2
    941             crc2 = 0;
    942 #if N > 3
    943             crc3 = 0;
    944 #if N > 4
    945             crc4 = 0;
    946 #if N > 5
    947             crc5 = 0;
    948 #endif
    949 #endif
    950 #endif
    951 #endif
    952 #endif
    953 
    954             /*
    955               Process the first blks-1 blocks, computing the CRCs on each braid
    956               independently.
    957              */
    958             while (--blks) {
    959                 /* Load the word for each braid into registers. */
    960                 word0 = crc0 ^ words[0];
    961 #if N > 1
    962                 word1 = crc1 ^ words[1];
    963 #if N > 2
    964                 word2 = crc2 ^ words[2];
    965 #if N > 3
    966                 word3 = crc3 ^ words[3];
    967 #if N > 4
    968                 word4 = crc4 ^ words[4];
    969 #if N > 5
    970                 word5 = crc5 ^ words[5];
    971 #endif
    972 #endif
    973 #endif
    974 #endif
    975 #endif
    976                 words += N;
    977 
    978                 /* Compute and update the CRC for each word. The loop should
    979                    get unrolled. */
    980                 crc0 = crc_braid_big_table[0][word0 & 0xff];
    981 #if N > 1
    982                 crc1 = crc_braid_big_table[0][word1 & 0xff];
    983 #if N > 2
    984                 crc2 = crc_braid_big_table[0][word2 & 0xff];
    985 #if N > 3
    986                 crc3 = crc_braid_big_table[0][word3 & 0xff];
    987 #if N > 4
    988                 crc4 = crc_braid_big_table[0][word4 & 0xff];
    989 #if N > 5
    990                 crc5 = crc_braid_big_table[0][word5 & 0xff];
    991 #endif
    992 #endif
    993 #endif
    994 #endif
    995 #endif
    996                 for (k = 1; k < W; k++) {
    997                     crc0 ^= crc_braid_big_table[k][(word0 >> (k << 3)) & 0xff];
    998 #if N > 1
    999                     crc1 ^= crc_braid_big_table[k][(word1 >> (k << 3)) & 0xff];
   1000 #if N > 2
   1001                     crc2 ^= crc_braid_big_table[k][(word2 >> (k << 3)) & 0xff];
   1002 #if N > 3
   1003                     crc3 ^= crc_braid_big_table[k][(word3 >> (k << 3)) & 0xff];
   1004 #if N > 4
   1005                     crc4 ^= crc_braid_big_table[k][(word4 >> (k << 3)) & 0xff];
   1006 #if N > 5
   1007                     crc5 ^= crc_braid_big_table[k][(word5 >> (k << 3)) & 0xff];
   1008 #endif
   1009 #endif
   1010 #endif
   1011 #endif
   1012 #endif
   1013                 }
   1014             }
   1015 
   1016             /*
   1017               Process the last block, combining the CRCs of the N braids at the
   1018               same time.
   1019              */
   1020             comb = crc_word_big(crc0 ^ words[0]);
   1021 #if N > 1
   1022             comb = crc_word_big(crc1 ^ words[1] ^ comb);
   1023 #if N > 2
   1024             comb = crc_word_big(crc2 ^ words[2] ^ comb);
   1025 #if N > 3
   1026             comb = crc_word_big(crc3 ^ words[3] ^ comb);
   1027 #if N > 4
   1028             comb = crc_word_big(crc4 ^ words[4] ^ comb);
   1029 #if N > 5
   1030             comb = crc_word_big(crc5 ^ words[5] ^ comb);
   1031 #endif
   1032 #endif
   1033 #endif
   1034 #endif
   1035 #endif
   1036             words += N;
   1037             crc = byte_swap(comb);
   1038         }
   1039 
   1040         /*
   1041           Update the pointer to the remaining bytes to process.
   1042          */
   1043         buf = (unsigned char const *)words;
   1044     }
   1045 
   1046 #endif /* W */
   1047 
   1048     /* Complete the computation of the CRC on any remaining bytes. */
   1049     while (len >= 8) {
   1050         len -= 8;
   1051         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
   1052         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
   1053         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
   1054         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
   1055         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
   1056         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
   1057         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
   1058         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
   1059     }
   1060     while (len) {
   1061         len--;
   1062         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
   1063     }
   1064 
   1065     /* Return the CRC, post-conditioned. */
   1066     return crc ^ 0xffffffff;
   1067 }
   1068 
   1069 #endif
   1070 
   1071 /* ========================================================================= */
   1072 unsigned long ZEXPORT crc32(crc, buf, len)
   1073     unsigned long crc;
   1074     const unsigned char FAR *buf;
   1075     uInt len;
   1076 {
   1077     return crc32_z(crc, buf, len);
   1078 }
   1079 
   1080 /* ========================================================================= */
   1081 uLong ZEXPORT crc32_combine64(crc1, crc2, len2)
   1082     uLong crc1;
   1083     uLong crc2;
   1084     z_off64_t len2;
   1085 {
   1086 #ifdef DYNAMIC_CRC_TABLE
   1087     once(&made, make_crc_table);
   1088 #endif /* DYNAMIC_CRC_TABLE */
   1089     return multmodp(x2nmodp(len2, 3), crc1) ^ (crc2 & 0xffffffff);
   1090 }
   1091 
   1092 /* ========================================================================= */
   1093 uLong ZEXPORT crc32_combine(crc1, crc2, len2)
   1094     uLong crc1;
   1095     uLong crc2;
   1096     z_off_t len2;
   1097 {
   1098     return crc32_combine64(crc1, crc2, (z_off64_t)len2);
   1099 }
   1100 
   1101 /* ========================================================================= */
   1102 uLong ZEXPORT crc32_combine_gen64(len2)
   1103     z_off64_t len2;
   1104 {
   1105 #ifdef DYNAMIC_CRC_TABLE
   1106     once(&made, make_crc_table);
   1107 #endif /* DYNAMIC_CRC_TABLE */
   1108     return x2nmodp(len2, 3);
   1109 }
   1110 
   1111 /* ========================================================================= */
   1112 uLong ZEXPORT crc32_combine_gen(len2)
   1113     z_off_t len2;
   1114 {
   1115     return crc32_combine_gen64((z_off64_t)len2);
   1116 }
   1117 
   1118 /* ========================================================================= */
   1119 uLong ZEXPORT crc32_combine_op(crc1, crc2, op)
   1120     uLong crc1;
   1121     uLong crc2;
   1122     uLong op;
   1123 {
   1124     return multmodp(op, crc1) ^ (crc2 & 0xffffffff);
   1125 }
   1126