Home | History | Annotate | Line # | Download | only in zlib
crc32.c revision 1.6
      1 /*	$NetBSD: crc32.c,v 1.6 2022/10/15 19:49:32 christos Exp $	*/
      2 
      3 /* crc32.c -- compute the CRC-32 of a data stream
      4  * Copyright (C) 1995-2022 Mark Adler
      5  * For conditions of distribution and use, see copyright notice in zlib.h
      6  *
      7  * This interleaved implementation of a CRC makes use of pipelined multiple
      8  * arithmetic-logic units, commonly found in modern CPU cores. It is due to
      9  * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
     10  */
     11 
     12 /* @(#) Id */
     13 
     14 /*
     15   Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore
     16   protection on the static variables used to control the first-use generation
     17   of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should
     18   first call get_crc_table() to initialize the tables before allowing more than
     19   one thread to use crc32().
     20 
     21   MAKECRCH can be #defined to write out crc32.h. A main() routine is also
     22   produced, so that this one source file can be compiled to an executable.
     23  */
     24 
     25 #ifdef MAKECRCH
     26 #  include <stdio.h>
     27 #  ifndef DYNAMIC_CRC_TABLE
     28 #    define DYNAMIC_CRC_TABLE
     29 #  endif /* !DYNAMIC_CRC_TABLE */
     30 #endif /* MAKECRCH */
     31 
     32 #include "zutil.h"      /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */
     33 
     34  /*
     35   A CRC of a message is computed on N braids of words in the message, where
     36   each word consists of W bytes (4 or 8). If N is 3, for example, then three
     37   running sparse CRCs are calculated respectively on each braid, at these
     38   indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
     39   This is done starting at a word boundary, and continues until as many blocks
     40   of N * W bytes as are available have been processed. The results are combined
     41   into a single CRC at the end. For this code, N must be in the range 1..6 and
     42   W must be 4 or 8. The upper limit on N can be increased if desired by adding
     43   more #if blocks, extending the patterns apparent in the code. In addition,
     44   crc32.h would need to be regenerated, if the maximum N value is increased.
     45 
     46   N and W are chosen empirically by benchmarking the execution time on a given
     47   processor. The choices for N and W below were based on testing on Intel Kaby
     48   Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
     49   Octeon II processors. The Intel, AMD, and ARM processors were all fastest
     50   with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
     51   They were all tested with either gcc or clang, all using the -O3 optimization
     52   level. Your mileage may vary.
     53  */
     54 
     55 /* Define N */
     56 #ifdef Z_TESTN
     57 #  define N Z_TESTN
     58 #else
     59 #  define N 5
     60 #endif
     61 #if N < 1 || N > 6
     62 #  error N must be in 1..6
     63 #endif
     64 
     65 /*
     66   z_crc_t must be at least 32 bits. z_word_t must be at least as long as
     67   z_crc_t. It is assumed here that z_word_t is either 32 bits or 64 bits, and
     68   that bytes are eight bits.
     69  */
     70 
     71 /*
     72   Define W and the associated z_word_t type. If W is not defined, then a
     73   braided calculation is not used, and the associated tables and code are not
     74   compiled.
     75  */
     76 #ifdef Z_TESTW
     77 #  if Z_TESTW-1 != -1
     78 #    define W Z_TESTW
     79 #  endif
     80 #else
     81 #  ifdef MAKECRCH
     82 #    define W 8         /* required for MAKECRCH */
     83 #  else
     84 #    if defined(__x86_64__) || defined(__aarch64__)
     85 #      define W 8
     86 #    else
     87 #      define W 4
     88 #    endif
     89 #  endif
     90 #endif
     91 #ifdef W
     92 #  if W == 8 && defined(Z_U8)
     93      typedef Z_U8 z_word_t;
     94 #  elif defined(Z_U4)
     95 #    undef W
     96 #    define W 4
     97      typedef Z_U4 z_word_t;
     98 #  else
     99 #    undef W
    100 #  endif
    101 #endif
    102 
    103 /* If available, use the ARM processor CRC32 instruction. */
    104 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) && W == 8
    105 #  define ARMCRC32
    106 #endif
    107 
    108 /* Local functions. */
    109 local z_crc_t multmodp OF((z_crc_t a, z_crc_t b));
    110 local z_crc_t x2nmodp OF((z_off64_t n, unsigned k));
    111 
    112 #if defined(W) && (!defined(ARMCRC32) || defined(DYNAMIC_CRC_TABLE))
    113     local z_word_t byte_swap OF((z_word_t word));
    114 #endif
    115 
    116 #if defined(W) && !defined(ARMCRC32)
    117     local z_crc_t crc_word OF((z_word_t data));
    118     local z_word_t crc_word_big OF((z_word_t data));
    119 #endif
    120 
    121 #if defined(W) && (!defined(ARMCRC32) || defined(DYNAMIC_CRC_TABLE))
    122 /*
    123   Swap the bytes in a z_word_t to convert between little and big endian. Any
    124   self-respecting compiler will optimize this to a single machine byte-swap
    125   instruction, if one is available. This assumes that word_t is either 32 bits
    126   or 64 bits.
    127  */
    128 local z_word_t byte_swap(word)
    129     z_word_t word;
    130 {
    131 #  if W == 8
    132     return
    133         (word & 0xff00000000000000) >> 56 |
    134         (word & 0xff000000000000) >> 40 |
    135         (word & 0xff0000000000) >> 24 |
    136         (word & 0xff00000000) >> 8 |
    137         (word & 0xff000000) << 8 |
    138         (word & 0xff0000) << 24 |
    139         (word & 0xff00) << 40 |
    140         (word & 0xff) << 56;
    141 #  else   /* W == 4 */
    142     return
    143         (word & 0xff000000) >> 24 |
    144         (word & 0xff0000) >> 8 |
    145         (word & 0xff00) << 8 |
    146         (word & 0xff) << 24;
    147 #  endif
    148 }
    149 #endif
    150 
    151 /* CRC polynomial. */
    152 #define POLY 0xedb88320         /* p(x) reflected, with x^32 implied */
    153 
    154 #ifdef DYNAMIC_CRC_TABLE
    155 
    156 local z_crc_t FAR crc_table[256];
    157 local z_crc_t FAR x2n_table[32];
    158 local void make_crc_table OF((void));
    159 #ifdef W
    160    local z_word_t FAR crc_big_table[256];
    161    local z_crc_t FAR crc_braid_table[W][256];
    162    local z_word_t FAR crc_braid_big_table[W][256];
    163    local void braid OF((z_crc_t [][256], z_word_t [][256], int, int));
    164 #endif
    165 #ifdef MAKECRCH
    166    local void write_table OF((FILE *, const z_crc_t FAR *, int));
    167    local void write_table32hi OF((FILE *, const z_word_t FAR *, int));
    168    local void write_table64 OF((FILE *, const z_word_t FAR *, int));
    169 #endif /* MAKECRCH */
    170 
    171 /*
    172   Define a once() function depending on the availability of atomics. If this is
    173   compiled with DYNAMIC_CRC_TABLE defined, and if CRCs will be computed in
    174   multiple threads, and if atomics are not available, then get_crc_table() must
    175   be called to initialize the tables and must return before any threads are
    176   allowed to compute or combine CRCs.
    177  */
    178 
    179 /* Definition of once functionality. */
    180 typedef struct once_s once_t;
    181 local void once OF((once_t *, void (*)(void)));
    182 
    183 /* Check for the availability of atomics. */
    184 #if defined(__STDC__) && __STDC_VERSION__ >= 201112L && \
    185     !defined(__STDC_NO_ATOMICS__)
    186 
    187 #include <stdatomic.h>
    188 
    189 /* Structure for once(), which must be initialized with ONCE_INIT. */
    190 struct once_s {
    191     atomic_flag begun;
    192     atomic_int done;
    193 };
    194 #define ONCE_INIT {ATOMIC_FLAG_INIT, 0}
    195 
    196 /*
    197   Run the provided init() function exactly once, even if multiple threads
    198   invoke once() at the same time. The state must be a once_t initialized with
    199   ONCE_INIT.
    200  */
    201 local void once(state, init)
    202     once_t *state;
    203     void (*init)(void);
    204 {
    205     if (!atomic_load(&state->done)) {
    206         if (atomic_flag_test_and_set(&state->begun))
    207             while (!atomic_load(&state->done))
    208                 ;
    209         else {
    210             init();
    211             atomic_store(&state->done, 1);
    212         }
    213     }
    214 }
    215 
    216 #else   /* no atomics */
    217 
    218 /* Structure for once(), which must be initialized with ONCE_INIT. */
    219 struct once_s {
    220     volatile int begun;
    221     volatile int done;
    222 };
    223 #define ONCE_INIT {0, 0}
    224 
    225 /* Test and set. Alas, not atomic, but tries to minimize the period of
    226    vulnerability. */
    227 local int test_and_set OF((int volatile *));
    228 local int test_and_set(flag)
    229     int volatile *flag;
    230 {
    231     int was;
    232 
    233     was = *flag;
    234     *flag = 1;
    235     return was;
    236 }
    237 
    238 /* Run the provided init() function once. This is not thread-safe. */
    239 local void once(state, init)
    240     once_t *state;
    241     void (*init)(void);
    242 {
    243     if (!state->done) {
    244         if (test_and_set(&state->begun))
    245             while (!state->done)
    246                 ;
    247         else {
    248             init();
    249             state->done = 1;
    250         }
    251     }
    252 }
    253 
    254 #endif
    255 
    256 /* State for once(). */
    257 local once_t made = ONCE_INIT;
    258 
    259 /*
    260   Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
    261   x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.
    262 
    263   Polynomials over GF(2) are represented in binary, one bit per coefficient,
    264   with the lowest powers in the most significant bit. Then adding polynomials
    265   is just exclusive-or, and multiplying a polynomial by x is a right shift by
    266   one. If we call the above polynomial p, and represent a byte as the
    267   polynomial q, also with the lowest power in the most significant bit (so the
    268   byte 0xb1 is the polynomial x^7+x^3+x^2+1), then the CRC is (q*x^32) mod p,
    269   where a mod b means the remainder after dividing a by b.
    270 
    271   This calculation is done using the shift-register method of multiplying and
    272   taking the remainder. The register is initialized to zero, and for each
    273   incoming bit, x^32 is added mod p to the register if the bit is a one (where
    274   x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by x
    275   (which is shifting right by one and adding x^32 mod p if the bit shifted out
    276   is a one). We start with the highest power (least significant bit) of q and
    277   repeat for all eight bits of q.
    278 
    279   The table is simply the CRC of all possible eight bit values. This is all the
    280   information needed to generate CRCs on data a byte at a time for all
    281   combinations of CRC register values and incoming bytes.
    282  */
    283 
    284 local void make_crc_table()
    285 {
    286     unsigned i, j, n;
    287     z_crc_t p;
    288 
    289     /* initialize the CRC of bytes tables */
    290     for (i = 0; i < 256; i++) {
    291         p = i;
    292         for (j = 0; j < 8; j++)
    293             p = p & 1 ? (p >> 1) ^ POLY : p >> 1;
    294         crc_table[i] = p;
    295 #ifdef W
    296         crc_big_table[i] = byte_swap(p);
    297 #endif
    298     }
    299 
    300     /* initialize the x^2^n mod p(x) table */
    301     p = (z_crc_t)1 << 30;         /* x^1 */
    302     x2n_table[0] = p;
    303     for (n = 1; n < 32; n++)
    304         x2n_table[n] = p = multmodp(p, p);
    305 
    306 #ifdef W
    307     /* initialize the braiding tables -- needs x2n_table[] */
    308     braid(crc_braid_table, crc_braid_big_table, N, W);
    309 #endif
    310 
    311 #ifdef MAKECRCH
    312     {
    313         /*
    314           The crc32.h header file contains tables for both 32-bit and 64-bit
    315           z_word_t's, and so requires a 64-bit type be available. In that case,
    316           z_word_t must be defined to be 64-bits. This code then also generates
    317           and writes out the tables for the case that z_word_t is 32 bits.
    318          */
    319 #if !defined(W) || W != 8
    320 #  error Need a 64-bit integer type in order to generate crc32.h.
    321 #endif
    322         FILE *out;
    323         int k, n;
    324         z_crc_t ltl[8][256];
    325         z_word_t big[8][256];
    326 
    327         out = fopen("crc32.h", "w");
    328         if (out == NULL) return;
    329 
    330         /* write out little-endian CRC table to crc32.h */
    331         fprintf(out,
    332             "/* crc32.h -- tables for rapid CRC calculation\n"
    333             " * Generated automatically by crc32.c\n */\n"
    334             "\n"
    335             "local const z_crc_t FAR crc_table[] = {\n"
    336             "    ");
    337         write_table(out, crc_table, 256);
    338         fprintf(out,
    339             "};\n");
    340 
    341         /* write out big-endian CRC table for 64-bit z_word_t to crc32.h */
    342         fprintf(out,
    343             "\n"
    344             "#ifdef W\n"
    345             "\n"
    346             "#if W == 8\n"
    347             "\n"
    348             "local const z_word_t FAR crc_big_table[] = {\n"
    349             "    ");
    350         write_table64(out, crc_big_table, 256);
    351         fprintf(out,
    352             "};\n");
    353 
    354         /* write out big-endian CRC table for 32-bit z_word_t to crc32.h */
    355         fprintf(out,
    356             "\n"
    357             "#else /* W == 4 */\n"
    358             "\n"
    359             "local const z_word_t FAR crc_big_table[] = {\n"
    360             "    ");
    361         write_table32hi(out, crc_big_table, 256);
    362         fprintf(out,
    363             "};\n"
    364             "\n"
    365             "#endif\n");
    366 
    367         /* write out braid tables for each value of N */
    368         for (n = 1; n <= 6; n++) {
    369             fprintf(out,
    370             "\n"
    371             "#if N == %d\n", n);
    372 
    373             /* compute braid tables for this N and 64-bit word_t */
    374             braid(ltl, big, n, 8);
    375 
    376             /* write out braid tables for 64-bit z_word_t to crc32.h */
    377             fprintf(out,
    378             "\n"
    379             "#if W == 8\n"
    380             "\n"
    381             "local const z_crc_t FAR crc_braid_table[][256] = {\n");
    382             for (k = 0; k < 8; k++) {
    383                 fprintf(out, "   {");
    384                 write_table(out, ltl[k], 256);
    385                 fprintf(out, "}%s", k < 7 ? ",\n" : "");
    386             }
    387             fprintf(out,
    388             "};\n"
    389             "\n"
    390             "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
    391             for (k = 0; k < 8; k++) {
    392                 fprintf(out, "   {");
    393                 write_table64(out, big[k], 256);
    394                 fprintf(out, "}%s", k < 7 ? ",\n" : "");
    395             }
    396             fprintf(out,
    397             "};\n");
    398 
    399             /* compute braid tables for this N and 32-bit word_t */
    400             braid(ltl, big, n, 4);
    401 
    402             /* write out braid tables for 32-bit z_word_t to crc32.h */
    403             fprintf(out,
    404             "\n"
    405             "#else /* W == 4 */\n"
    406             "\n"
    407             "local const z_crc_t FAR crc_braid_table[][256] = {\n");
    408             for (k = 0; k < 4; k++) {
    409                 fprintf(out, "   {");
    410                 write_table(out, ltl[k], 256);
    411                 fprintf(out, "}%s", k < 3 ? ",\n" : "");
    412             }
    413             fprintf(out,
    414             "};\n"
    415             "\n"
    416             "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
    417             for (k = 0; k < 4; k++) {
    418                 fprintf(out, "   {");
    419                 write_table32hi(out, big[k], 256);
    420                 fprintf(out, "}%s", k < 3 ? ",\n" : "");
    421             }
    422             fprintf(out,
    423             "};\n"
    424             "\n"
    425             "#endif\n"
    426             "\n"
    427             "#endif\n");
    428         }
    429         fprintf(out,
    430             "\n"
    431             "#endif\n");
    432 
    433         /* write out zeros operator table to crc32.h */
    434         fprintf(out,
    435             "\n"
    436             "local const z_crc_t FAR x2n_table[] = {\n"
    437             "    ");
    438         write_table(out, x2n_table, 32);
    439         fprintf(out,
    440             "};\n");
    441         fclose(out);
    442     }
    443 #endif /* MAKECRCH */
    444 }
    445 
    446 #ifdef MAKECRCH
    447 
    448 /*
    449    Write the 32-bit values in table[0..k-1] to out, five per line in
    450    hexadecimal separated by commas.
    451  */
    452 local void write_table(out, table, k)
    453     FILE *out;
    454     const z_crc_t FAR *table;
    455     int k;
    456 {
    457     int n;
    458 
    459     for (n = 0; n < k; n++)
    460         fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
    461                 (unsigned long)(table[n]),
    462                 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
    463 }
    464 
    465 /*
    466    Write the high 32-bits of each value in table[0..k-1] to out, five per line
    467    in hexadecimal separated by commas.
    468  */
    469 local void write_table32hi(out, table, k)
    470 FILE *out;
    471 const z_word_t FAR *table;
    472 int k;
    473 {
    474     int n;
    475 
    476     for (n = 0; n < k; n++)
    477         fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
    478                 (unsigned long)(table[n] >> 32),
    479                 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
    480 }
    481 
    482 /*
    483   Write the 64-bit values in table[0..k-1] to out, three per line in
    484   hexadecimal separated by commas. This assumes that if there is a 64-bit
    485   type, then there is also a long long integer type, and it is at least 64
    486   bits. If not, then the type cast and format string can be adjusted
    487   accordingly.
    488  */
    489 local void write_table64(out, table, k)
    490     FILE *out;
    491     const z_word_t FAR *table;
    492     int k;
    493 {
    494     int n;
    495 
    496     for (n = 0; n < k; n++)
    497         fprintf(out, "%s0x%016llx%s", n == 0 || n % 3 ? "" : "    ",
    498                 (unsigned long long)(table[n]),
    499                 n == k - 1 ? "" : (n % 3 == 2 ? ",\n" : ", "));
    500 }
    501 
    502 /* Actually do the deed. */
    503 int main()
    504 {
    505     make_crc_table();
    506     return 0;
    507 }
    508 
    509 #endif /* MAKECRCH */
    510 
    511 #ifdef W
    512 /*
    513   Generate the little and big-endian braid tables for the given n and z_word_t
    514   size w. Each array must have room for w blocks of 256 elements.
    515  */
    516 local void braid(ltl, big, n, w)
    517     z_crc_t ltl[][256];
    518     z_word_t big[][256];
    519     int n;
    520     int w;
    521 {
    522     int k;
    523     z_crc_t i, p, q;
    524     for (k = 0; k < w; k++) {
    525         p = x2nmodp((n * w + 3 - k) << 3, 0);
    526         ltl[k][0] = 0;
    527         big[w - 1 - k][0] = 0;
    528         for (i = 1; i < 256; i++) {
    529             ltl[k][i] = q = multmodp(i << 24, p);
    530             big[w - 1 - k][i] = byte_swap(q);
    531         }
    532     }
    533 }
    534 #endif
    535 
    536 #else /* !DYNAMIC_CRC_TABLE */
    537 /* ========================================================================
    538  * Tables for byte-wise and braided CRC-32 calculations, and a table of powers
    539  * of x for combining CRC-32s, all made by make_crc_table().
    540  */
    541 #include "crc32.h"
    542 #endif /* DYNAMIC_CRC_TABLE */
    543 
    544 /* ========================================================================
    545  * Routines used for CRC calculation. Some are also required for the table
    546  * generation above.
    547  */
    548 
    549 /*
    550   Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC polynomial,
    551   reflected. For speed, this requires that a not be zero.
    552  */
    553 local z_crc_t multmodp(a, b)
    554     z_crc_t a;
    555     z_crc_t b;
    556 {
    557     z_crc_t m, p;
    558 
    559     m = (z_crc_t)1 << 31;
    560     p = 0;
    561     for (;;) {
    562         if (a & m) {
    563             p ^= b;
    564             if ((a & (m - 1)) == 0)
    565                 break;
    566         }
    567         m >>= 1;
    568         b = b & 1 ? (b >> 1) ^ POLY : b >> 1;
    569     }
    570     return p;
    571 }
    572 
    573 /*
    574   Return x^(n * 2^k) modulo p(x). Requires that x2n_table[] has been
    575   initialized.
    576  */
    577 local z_crc_t x2nmodp(n, k)
    578     z_off64_t n;
    579     unsigned k;
    580 {
    581     z_crc_t p;
    582 
    583     p = (z_crc_t)1 << 31;           /* x^0 == 1 */
    584     while (n) {
    585         if (n & 1)
    586             p = multmodp(x2n_table[k & 31], p);
    587         n >>= 1;
    588         k++;
    589     }
    590     return p;
    591 }
    592 
    593 /* =========================================================================
    594  * This function can be used by asm versions of crc32(), and to force the
    595  * generation of the CRC tables in a threaded application.
    596  */
    597 const z_crc_t FAR * ZEXPORT get_crc_table()
    598 {
    599 #ifdef DYNAMIC_CRC_TABLE
    600     once(&made, make_crc_table);
    601 #endif /* DYNAMIC_CRC_TABLE */
    602     return (const z_crc_t FAR *)crc_table;
    603 }
    604 
    605 /* =========================================================================
    606  * Use ARM machine instructions if available. This will compute the CRC about
    607  * ten times faster than the braided calculation. This code does not check for
    608  * the presence of the CRC instruction at run time. __ARM_FEATURE_CRC32 will
    609  * only be defined if the compilation specifies an ARM processor architecture
    610  * that has the instructions. For example, compiling with -march=armv8.1-a or
    611  * -march=armv8-a+crc, or -march=native if the compile machine has the crc32
    612  * instructions.
    613  */
    614 #ifdef ARMCRC32
    615 
    616 /*
    617    Constants empirically determined to maximize speed. These values are from
    618    measurements on a Cortex-A57. Your mileage may vary.
    619  */
    620 #define Z_BATCH 3990                /* number of words in a batch */
    621 #define Z_BATCH_ZEROS 0xa10d3d0c    /* computed from Z_BATCH = 3990 */
    622 #define Z_BATCH_MIN 800             /* fewest words in a final batch */
    623 
    624 unsigned long ZEXPORT crc32_z(crc, buf, len)
    625     unsigned long crc;
    626     const unsigned char FAR *buf;
    627     z_size_t len;
    628 {
    629     z_crc_t val;
    630     z_word_t crc1, crc2;
    631     const z_word_t *word;
    632     z_word_t val0, val1, val2;
    633     z_size_t last, last2, i;
    634     z_size_t num;
    635 
    636     /* Return initial CRC, if requested. */
    637     if (buf == Z_NULL) return 0;
    638 
    639 #ifdef DYNAMIC_CRC_TABLE
    640     once(&made, make_crc_table);
    641 #endif /* DYNAMIC_CRC_TABLE */
    642 
    643     /* Pre-condition the CRC */
    644     crc = (~crc) & 0xffffffff;
    645 
    646     /* Compute the CRC up to a word boundary. */
    647     while (len && ((z_size_t)buf & 7) != 0) {
    648         len--;
    649         val = *buf++;
    650         __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
    651     }
    652 
    653     /* Prepare to compute the CRC on full 64-bit words word[0..num-1]. */
    654     word = (z_word_t const *)buf;
    655     num = len >> 3;
    656     len &= 7;
    657 
    658     /* Do three interleaved CRCs to realize the throughput of one crc32x
    659        instruction per cycle. Each CRC is calculated on Z_BATCH words. The
    660        three CRCs are combined into a single CRC after each set of batches. */
    661     while (num >= 3 * Z_BATCH) {
    662         crc1 = 0;
    663         crc2 = 0;
    664         for (i = 0; i < Z_BATCH; i++) {
    665             val0 = word[i];
    666             val1 = word[i + Z_BATCH];
    667             val2 = word[i + 2 * Z_BATCH];
    668             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
    669             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
    670             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
    671         }
    672         word += 3 * Z_BATCH;
    673         num -= 3 * Z_BATCH;
    674         crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc1;
    675         crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc2;
    676     }
    677 
    678     /* Do one last smaller batch with the remaining words, if there are enough
    679        to pay for the combination of CRCs. */
    680     last = num / 3;
    681     if (last >= Z_BATCH_MIN) {
    682         last2 = last << 1;
    683         crc1 = 0;
    684         crc2 = 0;
    685         for (i = 0; i < last; i++) {
    686             val0 = word[i];
    687             val1 = word[i + last];
    688             val2 = word[i + last2];
    689             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
    690             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
    691             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
    692         }
    693         word += 3 * last;
    694         num -= 3 * last;
    695         val = x2nmodp(last, 6);
    696         crc = multmodp(val, crc) ^ crc1;
    697         crc = multmodp(val, crc) ^ crc2;
    698     }
    699 
    700     /* Compute the CRC on any remaining words. */
    701     for (i = 0; i < num; i++) {
    702         val0 = word[i];
    703         __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
    704     }
    705     word += num;
    706 
    707     /* Complete the CRC on any remaining bytes. */
    708     buf = (const unsigned char FAR *)word;
    709     while (len) {
    710         len--;
    711         val = *buf++;
    712         __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
    713     }
    714 
    715     /* Return the CRC, post-conditioned. */
    716     return crc ^ 0xffffffff;
    717 }
    718 
    719 #else
    720 
    721 #ifdef W
    722 
    723 /*
    724   Return the CRC of the W bytes in the word_t data, taking the
    725   least-significant byte of the word as the first byte of data, without any pre
    726   or post conditioning. This is used to combine the CRCs of each braid.
    727  */
    728 local z_crc_t crc_word(data)
    729     z_word_t data;
    730 {
    731     int k;
    732     for (k = 0; k < W; k++)
    733         data = (data >> 8) ^ crc_table[data & 0xff];
    734     return (z_crc_t)data;
    735 }
    736 
    737 local z_word_t crc_word_big(data)
    738     z_word_t data;
    739 {
    740     int k;
    741     for (k = 0; k < W; k++)
    742         data = (data << 8) ^
    743             crc_big_table[(data >> ((W - 1) << 3)) & 0xff];
    744     return data;
    745 }
    746 
    747 #endif
    748 
    749 /* ========================================================================= */
    750 unsigned long ZEXPORT crc32_z(crc, buf, len)
    751     unsigned long crc;
    752     const unsigned char FAR *buf;
    753     z_size_t len;
    754 {
    755     /* Return initial CRC, if requested. */
    756     if (buf == Z_NULL) return 0;
    757 
    758 #ifdef DYNAMIC_CRC_TABLE
    759     once(&made, make_crc_table);
    760 #endif /* DYNAMIC_CRC_TABLE */
    761 
    762     /* Pre-condition the CRC */
    763     crc = (~crc) & 0xffffffff;
    764 
    765 #ifdef W
    766 
    767     /* If provided enough bytes, do a braided CRC calculation. */
    768     if (len >= N * W + W - 1) {
    769         z_size_t blks;
    770         z_word_t const *words;
    771         unsigned endian;
    772         int k;
    773 
    774         /* Compute the CRC up to a z_word_t boundary. */
    775         while (len && ((z_size_t)buf & (W - 1)) != 0) {
    776             len--;
    777             crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
    778         }
    779 
    780         /* Compute the CRC on as many N z_word_t blocks as are available. */
    781         blks = len / (N * W);
    782         len -= blks * N * W;
    783         words = (z_word_t const *)buf;
    784 
    785         /* Do endian check at execution time instead of compile time, since ARM
    786            processors can change the endianess at execution time. If the
    787            compiler knows what the endianess will be, it can optimize out the
    788            check and the unused branch. */
    789         endian = 1;
    790         if (*(unsigned char *)&endian) {
    791             /* Little endian. */
    792 
    793             z_crc_t crc0;
    794             z_word_t word0;
    795 #if N > 1
    796             z_crc_t crc1;
    797             z_word_t word1;
    798 #if N > 2
    799             z_crc_t crc2;
    800             z_word_t word2;
    801 #if N > 3
    802             z_crc_t crc3;
    803             z_word_t word3;
    804 #if N > 4
    805             z_crc_t crc4;
    806             z_word_t word4;
    807 #if N > 5
    808             z_crc_t crc5;
    809             z_word_t word5;
    810 #endif
    811 #endif
    812 #endif
    813 #endif
    814 #endif
    815 
    816             /* Initialize the CRC for each braid. */
    817             crc0 = crc;
    818 #if N > 1
    819             crc1 = 0;
    820 #if N > 2
    821             crc2 = 0;
    822 #if N > 3
    823             crc3 = 0;
    824 #if N > 4
    825             crc4 = 0;
    826 #if N > 5
    827             crc5 = 0;
    828 #endif
    829 #endif
    830 #endif
    831 #endif
    832 #endif
    833 
    834             /*
    835               Process the first blks-1 blocks, computing the CRCs on each braid
    836               independently.
    837              */
    838             while (--blks) {
    839                 /* Load the word for each braid into registers. */
    840                 word0 = crc0 ^ words[0];
    841 #if N > 1
    842                 word1 = crc1 ^ words[1];
    843 #if N > 2
    844                 word2 = crc2 ^ words[2];
    845 #if N > 3
    846                 word3 = crc3 ^ words[3];
    847 #if N > 4
    848                 word4 = crc4 ^ words[4];
    849 #if N > 5
    850                 word5 = crc5 ^ words[5];
    851 #endif
    852 #endif
    853 #endif
    854 #endif
    855 #endif
    856                 words += N;
    857 
    858                 /* Compute and update the CRC for each word. The loop should
    859                    get unrolled. */
    860                 crc0 = crc_braid_table[0][word0 & 0xff];
    861 #if N > 1
    862                 crc1 = crc_braid_table[0][word1 & 0xff];
    863 #if N > 2
    864                 crc2 = crc_braid_table[0][word2 & 0xff];
    865 #if N > 3
    866                 crc3 = crc_braid_table[0][word3 & 0xff];
    867 #if N > 4
    868                 crc4 = crc_braid_table[0][word4 & 0xff];
    869 #if N > 5
    870                 crc5 = crc_braid_table[0][word5 & 0xff];
    871 #endif
    872 #endif
    873 #endif
    874 #endif
    875 #endif
    876                 for (k = 1; k < W; k++) {
    877                     crc0 ^= crc_braid_table[k][(word0 >> (k << 3)) & 0xff];
    878 #if N > 1
    879                     crc1 ^= crc_braid_table[k][(word1 >> (k << 3)) & 0xff];
    880 #if N > 2
    881                     crc2 ^= crc_braid_table[k][(word2 >> (k << 3)) & 0xff];
    882 #if N > 3
    883                     crc3 ^= crc_braid_table[k][(word3 >> (k << 3)) & 0xff];
    884 #if N > 4
    885                     crc4 ^= crc_braid_table[k][(word4 >> (k << 3)) & 0xff];
    886 #if N > 5
    887                     crc5 ^= crc_braid_table[k][(word5 >> (k << 3)) & 0xff];
    888 #endif
    889 #endif
    890 #endif
    891 #endif
    892 #endif
    893                 }
    894             }
    895 
    896             /*
    897               Process the last block, combining the CRCs of the N braids at the
    898               same time.
    899              */
    900             crc = crc_word(crc0 ^ words[0]);
    901 #if N > 1
    902             crc = crc_word(crc1 ^ words[1] ^ crc);
    903 #if N > 2
    904             crc = crc_word(crc2 ^ words[2] ^ crc);
    905 #if N > 3
    906             crc = crc_word(crc3 ^ words[3] ^ crc);
    907 #if N > 4
    908             crc = crc_word(crc4 ^ words[4] ^ crc);
    909 #if N > 5
    910             crc = crc_word(crc5 ^ words[5] ^ crc);
    911 #endif
    912 #endif
    913 #endif
    914 #endif
    915 #endif
    916             words += N;
    917         }
    918         else {
    919             /* Big endian. */
    920 
    921             z_word_t crc0, word0, comb;
    922 #if N > 1
    923             z_word_t crc1, word1;
    924 #if N > 2
    925             z_word_t crc2, word2;
    926 #if N > 3
    927             z_word_t crc3, word3;
    928 #if N > 4
    929             z_word_t crc4, word4;
    930 #if N > 5
    931             z_word_t crc5, word5;
    932 #endif
    933 #endif
    934 #endif
    935 #endif
    936 #endif
    937 
    938             /* Initialize the CRC for each braid. */
    939             crc0 = byte_swap(crc);
    940 #if N > 1
    941             crc1 = 0;
    942 #if N > 2
    943             crc2 = 0;
    944 #if N > 3
    945             crc3 = 0;
    946 #if N > 4
    947             crc4 = 0;
    948 #if N > 5
    949             crc5 = 0;
    950 #endif
    951 #endif
    952 #endif
    953 #endif
    954 #endif
    955 
    956             /*
    957               Process the first blks-1 blocks, computing the CRCs on each braid
    958               independently.
    959              */
    960             while (--blks) {
    961                 /* Load the word for each braid into registers. */
    962                 word0 = crc0 ^ words[0];
    963 #if N > 1
    964                 word1 = crc1 ^ words[1];
    965 #if N > 2
    966                 word2 = crc2 ^ words[2];
    967 #if N > 3
    968                 word3 = crc3 ^ words[3];
    969 #if N > 4
    970                 word4 = crc4 ^ words[4];
    971 #if N > 5
    972                 word5 = crc5 ^ words[5];
    973 #endif
    974 #endif
    975 #endif
    976 #endif
    977 #endif
    978                 words += N;
    979 
    980                 /* Compute and update the CRC for each word. The loop should
    981                    get unrolled. */
    982                 crc0 = crc_braid_big_table[0][word0 & 0xff];
    983 #if N > 1
    984                 crc1 = crc_braid_big_table[0][word1 & 0xff];
    985 #if N > 2
    986                 crc2 = crc_braid_big_table[0][word2 & 0xff];
    987 #if N > 3
    988                 crc3 = crc_braid_big_table[0][word3 & 0xff];
    989 #if N > 4
    990                 crc4 = crc_braid_big_table[0][word4 & 0xff];
    991 #if N > 5
    992                 crc5 = crc_braid_big_table[0][word5 & 0xff];
    993 #endif
    994 #endif
    995 #endif
    996 #endif
    997 #endif
    998                 for (k = 1; k < W; k++) {
    999                     crc0 ^= crc_braid_big_table[k][(word0 >> (k << 3)) & 0xff];
   1000 #if N > 1
   1001                     crc1 ^= crc_braid_big_table[k][(word1 >> (k << 3)) & 0xff];
   1002 #if N > 2
   1003                     crc2 ^= crc_braid_big_table[k][(word2 >> (k << 3)) & 0xff];
   1004 #if N > 3
   1005                     crc3 ^= crc_braid_big_table[k][(word3 >> (k << 3)) & 0xff];
   1006 #if N > 4
   1007                     crc4 ^= crc_braid_big_table[k][(word4 >> (k << 3)) & 0xff];
   1008 #if N > 5
   1009                     crc5 ^= crc_braid_big_table[k][(word5 >> (k << 3)) & 0xff];
   1010 #endif
   1011 #endif
   1012 #endif
   1013 #endif
   1014 #endif
   1015                 }
   1016             }
   1017 
   1018             /*
   1019               Process the last block, combining the CRCs of the N braids at the
   1020               same time.
   1021              */
   1022             comb = crc_word_big(crc0 ^ words[0]);
   1023 #if N > 1
   1024             comb = crc_word_big(crc1 ^ words[1] ^ comb);
   1025 #if N > 2
   1026             comb = crc_word_big(crc2 ^ words[2] ^ comb);
   1027 #if N > 3
   1028             comb = crc_word_big(crc3 ^ words[3] ^ comb);
   1029 #if N > 4
   1030             comb = crc_word_big(crc4 ^ words[4] ^ comb);
   1031 #if N > 5
   1032             comb = crc_word_big(crc5 ^ words[5] ^ comb);
   1033 #endif
   1034 #endif
   1035 #endif
   1036 #endif
   1037 #endif
   1038             words += N;
   1039             crc = byte_swap(comb);
   1040         }
   1041 
   1042         /*
   1043           Update the pointer to the remaining bytes to process.
   1044          */
   1045         buf = (unsigned char const *)words;
   1046     }
   1047 
   1048 #endif /* W */
   1049 
   1050     /* Complete the computation of the CRC on any remaining bytes. */
   1051     while (len >= 8) {
   1052         len -= 8;
   1053         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
   1054         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
   1055         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
   1056         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
   1057         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
   1058         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
   1059         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
   1060         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
   1061     }
   1062     while (len) {
   1063         len--;
   1064         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
   1065     }
   1066 
   1067     /* Return the CRC, post-conditioned. */
   1068     return crc ^ 0xffffffff;
   1069 }
   1070 
   1071 #endif
   1072 
   1073 /* ========================================================================= */
   1074 unsigned long ZEXPORT crc32(crc, buf, len)
   1075     unsigned long crc;
   1076     const unsigned char FAR *buf;
   1077     uInt len;
   1078 {
   1079     return crc32_z(crc, buf, len);
   1080 }
   1081 
   1082 /* ========================================================================= */
   1083 uLong ZEXPORT crc32_combine64(crc1, crc2, len2)
   1084     uLong crc1;
   1085     uLong crc2;
   1086     z_off64_t len2;
   1087 {
   1088 #ifdef DYNAMIC_CRC_TABLE
   1089     once(&made, make_crc_table);
   1090 #endif /* DYNAMIC_CRC_TABLE */
   1091     return multmodp(x2nmodp(len2, 3), crc1) ^ (crc2 & 0xffffffff);
   1092 }
   1093 
   1094 /* ========================================================================= */
   1095 uLong ZEXPORT crc32_combine(crc1, crc2, len2)
   1096     uLong crc1;
   1097     uLong crc2;
   1098     z_off_t len2;
   1099 {
   1100     return crc32_combine64(crc1, crc2, (z_off64_t)len2);
   1101 }
   1102 
   1103 /* ========================================================================= */
   1104 uLong ZEXPORT crc32_combine_gen64(len2)
   1105     z_off64_t len2;
   1106 {
   1107 #ifdef DYNAMIC_CRC_TABLE
   1108     once(&made, make_crc_table);
   1109 #endif /* DYNAMIC_CRC_TABLE */
   1110     return x2nmodp(len2, 3);
   1111 }
   1112 
   1113 /* ========================================================================= */
   1114 uLong ZEXPORT crc32_combine_gen(len2)
   1115     z_off_t len2;
   1116 {
   1117     return crc32_combine_gen64((z_off64_t)len2);
   1118 }
   1119 
   1120 /* ========================================================================= */
   1121 uLong ZEXPORT crc32_combine_op(crc1, crc2, op)
   1122     uLong crc1;
   1123     uLong crc2;
   1124     uLong op;
   1125 {
   1126     return multmodp(op, crc1) ^ (crc2 & 0xffffffff);
   1127 }
   1128