tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

crc32.c (31605B)


      1 /* crc32.c -- compute the CRC-32 of a data stream
      2 * Copyright (C) 1995-2022 Mark Adler
      3 * For conditions of distribution and use, see copyright notice in zlib.h
      4 *
      5 * This interleaved implementation of a CRC makes use of pipelined multiple
      6 * arithmetic-logic units, commonly found in modern CPU cores. It is due to
      7 * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
      8 */
      9 
     10 /* @(#) $Id$ */
     11 
     12 /*
     13  Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore
     14  protection on the static variables used to control the first-use generation
     15  of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should
     16  first call get_crc_table() to initialize the tables before allowing more than
     17  one thread to use crc32().
     18 
     19  MAKECRCH can be #defined to write out crc32.h. A main() routine is also
     20  produced, so that this one source file can be compiled to an executable.
     21 */
     22 
     23 #ifdef MAKECRCH
     24 #  include <stdio.h>
     25 #  ifndef DYNAMIC_CRC_TABLE
     26 #    define DYNAMIC_CRC_TABLE
     27 #  endif /* !DYNAMIC_CRC_TABLE */
     28 #endif /* MAKECRCH */
     29 
     30 #include "zutil.h"      /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */
     31 
     32 /*
     33  A CRC of a message is computed on N braids of words in the message, where
     34  each word consists of W bytes (4 or 8). If N is 3, for example, then three
     35  running sparse CRCs are calculated respectively on each braid, at these
     36  indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
     37  This is done starting at a word boundary, and continues until as many blocks
     38  of N * W bytes as are available have been processed. The results are combined
     39  into a single CRC at the end. For this code, N must be in the range 1..6 and
     40  W must be 4 or 8. The upper limit on N can be increased if desired by adding
     41  more #if blocks, extending the patterns apparent in the code. In addition,
     42  crc32.h would need to be regenerated, if the maximum N value is increased.
     43 
     44  N and W are chosen empirically by benchmarking the execution time on a given
     45  processor. The choices for N and W below were based on testing on Intel Kaby
     46  Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
     47  Octeon II processors. The Intel, AMD, and ARM processors were all fastest
     48  with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
     49  They were all tested with either gcc or clang, all using the -O3 optimization
     50  level. Your mileage may vary.
     51 */
     52 
     53 /* Define N */
     54 #ifdef Z_TESTN
     55 #  define N Z_TESTN
     56 #else
     57 #  define N 5
     58 #endif
     59 #if N < 1 || N > 6
     60 #  error N must be in 1..6
     61 #endif
     62 
     63 /*
     64  z_crc_t must be at least 32 bits. z_word_t must be at least as long as
     65  z_crc_t. It is assumed here that z_word_t is either 32 bits or 64 bits, and
     66  that bytes are eight bits.
     67 */
     68 
     69 /*
     70  Define W and the associated z_word_t type. If W is not defined, then a
     71  braided calculation is not used, and the associated tables and code are not
     72  compiled.
     73 */
     74 #ifdef Z_TESTW
     75 #  if Z_TESTW-1 != -1
     76 #    define W Z_TESTW
     77 #  endif
     78 #else
     79 #  ifdef MAKECRCH
     80 #    define W 8         /* required for MAKECRCH */
     81 #  else
     82 #    if defined(__x86_64__) || defined(__aarch64__)
     83 #      define W 8
     84 #    else
     85 #      define W 4
     86 #    endif
     87 #  endif
     88 #endif
     89 #ifdef W
     90 #  if W == 8 && defined(Z_U8)
     91     typedef Z_U8 z_word_t;
     92 #  elif defined(Z_U4)
     93 #    undef W
     94 #    define W 4
     95     typedef Z_U4 z_word_t;
     96 #  else
     97 #    undef W
     98 #  endif
     99 #endif
    100 
    101 /* If available, use the ARM processor CRC32 instruction. */
    102 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) && W == 8
    103 #  define ARMCRC32
    104 #endif
    105 
    106 #if defined(W) && (!defined(ARMCRC32) || defined(DYNAMIC_CRC_TABLE))
    107 /*
    108  Swap the bytes in a z_word_t to convert between little and big endian. Any
    109  self-respecting compiler will optimize this to a single machine byte-swap
    110  instruction, if one is available. This assumes that word_t is either 32 bits
    111  or 64 bits.
    112 */
    113 local z_word_t byte_swap(z_word_t word) {
    114 #  if W == 8
    115    return
    116        (word & 0xff00000000000000) >> 56 |
    117        (word & 0xff000000000000) >> 40 |
    118        (word & 0xff0000000000) >> 24 |
    119        (word & 0xff00000000) >> 8 |
    120        (word & 0xff000000) << 8 |
    121        (word & 0xff0000) << 24 |
    122        (word & 0xff00) << 40 |
    123        (word & 0xff) << 56;
    124 #  else   /* W == 4 */
    125    return
    126        (word & 0xff000000) >> 24 |
    127        (word & 0xff0000) >> 8 |
    128        (word & 0xff00) << 8 |
    129        (word & 0xff) << 24;
    130 #  endif
    131 }
    132 #endif
    133 
    134 #ifdef DYNAMIC_CRC_TABLE
    135 /* =========================================================================
    136 * Table of powers of x for combining CRC-32s, filled in by make_crc_table()
    137 * below.
    138 */
    139   local z_crc_t FAR x2n_table[32];
    140 #else
    141 /* =========================================================================
    142 * Tables for byte-wise and braided CRC-32 calculations, and a table of powers
    143 * of x for combining CRC-32s, all made by make_crc_table().
    144 */
    145 #  include "crc32.h"
    146 #endif
    147 
    148 /* CRC polynomial. */
    149 #define POLY 0xedb88320         /* p(x) reflected, with x^32 implied */
    150 
    151 /*
    152  Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC polynomial,
    153  reflected. For speed, this requires that a not be zero.
    154 */
    155 local z_crc_t multmodp(z_crc_t a, z_crc_t b) {
    156    z_crc_t m, p;
    157 
    158    m = (z_crc_t)1 << 31;
    159    p = 0;
    160    for (;;) {
    161        if (a & m) {
    162            p ^= b;
    163            if ((a & (m - 1)) == 0)
    164                break;
    165        }
    166        m >>= 1;
    167        b = b & 1 ? (b >> 1) ^ POLY : b >> 1;
    168    }
    169    return p;
    170 }
    171 
    172 /*
    173  Return x^(n * 2^k) modulo p(x). Requires that x2n_table[] has been
    174  initialized.
    175 */
    176 local z_crc_t x2nmodp(z_off64_t n, unsigned k) {
    177    z_crc_t p;
    178 
    179    p = (z_crc_t)1 << 31;           /* x^0 == 1 */
    180    while (n) {
    181        if (n & 1)
    182            p = multmodp(x2n_table[k & 31], p);
    183        n >>= 1;
    184        k++;
    185    }
    186    return p;
    187 }
    188 
    189 #ifdef DYNAMIC_CRC_TABLE
    190 /* =========================================================================
    191 * Build the tables for byte-wise and braided CRC-32 calculations, and a table
    192 * of powers of x for combining CRC-32s.
    193 */
    194 local z_crc_t FAR crc_table[256];
    195 #ifdef W
    196   local z_word_t FAR crc_big_table[256];
    197   local z_crc_t FAR crc_braid_table[W][256];
    198   local z_word_t FAR crc_braid_big_table[W][256];
    199   local void braid(z_crc_t [][256], z_word_t [][256], int, int);
    200 #endif
    201 #ifdef MAKECRCH
    202   local void write_table(FILE *, const z_crc_t FAR *, int);
    203   local void write_table32hi(FILE *, const z_word_t FAR *, int);
    204   local void write_table64(FILE *, const z_word_t FAR *, int);
    205 #endif /* MAKECRCH */
    206 
    207 /*
    208  Define a once() function depending on the availability of atomics. If this is
    209  compiled with DYNAMIC_CRC_TABLE defined, and if CRCs will be computed in
    210  multiple threads, and if atomics are not available, then get_crc_table() must
    211  be called to initialize the tables and must return before any threads are
    212  allowed to compute or combine CRCs.
    213 */
    214 
    215 /* Definition of once functionality. */
    216 typedef struct once_s once_t;
    217 
    218 /* Check for the availability of atomics. */
    219 #if defined(__STDC__) && __STDC_VERSION__ >= 201112L && \
    220    !defined(__STDC_NO_ATOMICS__)
    221 
    222 #include <stdatomic.h>
    223 
    224 /* Structure for once(), which must be initialized with ONCE_INIT. */
    225 struct once_s {
    226    atomic_flag begun;
    227    atomic_int done;
    228 };
    229 #define ONCE_INIT {ATOMIC_FLAG_INIT, 0}
    230 
    231 /*
    232  Run the provided init() function exactly once, even if multiple threads
    233  invoke once() at the same time. The state must be a once_t initialized with
    234  ONCE_INIT.
    235 */
    236 local void once(once_t *state, void (*init)(void)) {
    237    if (!atomic_load(&state->done)) {
    238        if (atomic_flag_test_and_set(&state->begun))
    239            while (!atomic_load(&state->done))
    240                ;
    241        else {
    242            init();
    243            atomic_store(&state->done, 1);
    244        }
    245    }
    246 }
    247 
    248 #else   /* no atomics */
    249 
    250 /* Structure for once(), which must be initialized with ONCE_INIT. */
    251 struct once_s {
    252    volatile int begun;
    253    volatile int done;
    254 };
    255 #define ONCE_INIT {0, 0}
    256 
    257 /* Test and set. Alas, not atomic, but tries to minimize the period of
    258   vulnerability. */
    259 local int test_and_set(int volatile *flag) {
    260    int was;
    261 
    262    was = *flag;
    263    *flag = 1;
    264    return was;
    265 }
    266 
    267 /* Run the provided init() function once. This is not thread-safe. */
    268 local void once(once_t *state, void (*init)(void)) {
    269    if (!state->done) {
    270        if (test_and_set(&state->begun))
    271            while (!state->done)
    272                ;
    273        else {
    274            init();
    275            state->done = 1;
    276        }
    277    }
    278 }
    279 
    280 #endif
    281 
    282 /* State for once(). */
    283 local once_t made = ONCE_INIT;
    284 
    285 /*
    286  Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
    287  x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.
    288 
    289  Polynomials over GF(2) are represented in binary, one bit per coefficient,
    290  with the lowest powers in the most significant bit. Then adding polynomials
    291  is just exclusive-or, and multiplying a polynomial by x is a right shift by
    292  one. If we call the above polynomial p, and represent a byte as the
    293  polynomial q, also with the lowest power in the most significant bit (so the
    294  byte 0xb1 is the polynomial x^7+x^3+x^2+1), then the CRC is (q*x^32) mod p,
    295  where a mod b means the remainder after dividing a by b.
    296 
    297  This calculation is done using the shift-register method of multiplying and
    298  taking the remainder. The register is initialized to zero, and for each
    299  incoming bit, x^32 is added mod p to the register if the bit is a one (where
    300  x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by x
    301  (which is shifting right by one and adding x^32 mod p if the bit shifted out
    302  is a one). We start with the highest power (least significant bit) of q and
    303  repeat for all eight bits of q.
    304 
    305  The table is simply the CRC of all possible eight bit values. This is all the
    306  information needed to generate CRCs on data a byte at a time for all
    307  combinations of CRC register values and incoming bytes.
    308 */
    309 
    310 local void make_crc_table(void) {
    311    unsigned i, j, n;
    312    z_crc_t p;
    313 
    314    /* initialize the CRC of bytes tables */
    315    for (i = 0; i < 256; i++) {
    316        p = i;
    317        for (j = 0; j < 8; j++)
    318            p = p & 1 ? (p >> 1) ^ POLY : p >> 1;
    319        crc_table[i] = p;
    320 #ifdef W
    321        crc_big_table[i] = byte_swap(p);
    322 #endif
    323    }
    324 
    325    /* initialize the x^2^n mod p(x) table */
    326    p = (z_crc_t)1 << 30;         /* x^1 */
    327    x2n_table[0] = p;
    328    for (n = 1; n < 32; n++)
    329        x2n_table[n] = p = multmodp(p, p);
    330 
    331 #ifdef W
    332    /* initialize the braiding tables -- needs x2n_table[] */
    333    braid(crc_braid_table, crc_braid_big_table, N, W);
    334 #endif
    335 
    336 #ifdef MAKECRCH
    337    {
    338        /*
    339          The crc32.h header file contains tables for both 32-bit and 64-bit
    340          z_word_t's, and so requires a 64-bit type be available. In that case,
    341          z_word_t must be defined to be 64-bits. This code then also generates
    342          and writes out the tables for the case that z_word_t is 32 bits.
    343         */
    344 #if !defined(W) || W != 8
    345 #  error Need a 64-bit integer type in order to generate crc32.h.
    346 #endif
    347        FILE *out;
    348        int k, n;
    349        z_crc_t ltl[8][256];
    350        z_word_t big[8][256];
    351 
    352        out = fopen("crc32.h", "w");
    353        if (out == NULL) return;
    354 
    355        /* write out little-endian CRC table to crc32.h */
    356        fprintf(out,
    357            "/* crc32.h -- tables for rapid CRC calculation\n"
    358            " * Generated automatically by crc32.c\n */\n"
    359            "\n"
    360            "local const z_crc_t FAR crc_table[] = {\n"
    361            "    ");
    362        write_table(out, crc_table, 256);
    363        fprintf(out,
    364            "};\n");
    365 
    366        /* write out big-endian CRC table for 64-bit z_word_t to crc32.h */
    367        fprintf(out,
    368            "\n"
    369            "#ifdef W\n"
    370            "\n"
    371            "#if W == 8\n"
    372            "\n"
    373            "local const z_word_t FAR crc_big_table[] = {\n"
    374            "    ");
    375        write_table64(out, crc_big_table, 256);
    376        fprintf(out,
    377            "};\n");
    378 
    379        /* write out big-endian CRC table for 32-bit z_word_t to crc32.h */
    380        fprintf(out,
    381            "\n"
    382            "#else /* W == 4 */\n"
    383            "\n"
    384            "local const z_word_t FAR crc_big_table[] = {\n"
    385            "    ");
    386        write_table32hi(out, crc_big_table, 256);
    387        fprintf(out,
    388            "};\n"
    389            "\n"
    390            "#endif\n");
    391 
    392        /* write out braid tables for each value of N */
    393        for (n = 1; n <= 6; n++) {
    394            fprintf(out,
    395            "\n"
    396            "#if N == %d\n", n);
    397 
    398            /* compute braid tables for this N and 64-bit word_t */
    399            braid(ltl, big, n, 8);
    400 
    401            /* write out braid tables for 64-bit z_word_t to crc32.h */
    402            fprintf(out,
    403            "\n"
    404            "#if W == 8\n"
    405            "\n"
    406            "local const z_crc_t FAR crc_braid_table[][256] = {\n");
    407            for (k = 0; k < 8; k++) {
    408                fprintf(out, "   {");
    409                write_table(out, ltl[k], 256);
    410                fprintf(out, "}%s", k < 7 ? ",\n" : "");
    411            }
    412            fprintf(out,
    413            "};\n"
    414            "\n"
    415            "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
    416            for (k = 0; k < 8; k++) {
    417                fprintf(out, "   {");
    418                write_table64(out, big[k], 256);
    419                fprintf(out, "}%s", k < 7 ? ",\n" : "");
    420            }
    421            fprintf(out,
    422            "};\n");
    423 
    424            /* compute braid tables for this N and 32-bit word_t */
    425            braid(ltl, big, n, 4);
    426 
    427            /* write out braid tables for 32-bit z_word_t to crc32.h */
    428            fprintf(out,
    429            "\n"
    430            "#else /* W == 4 */\n"
    431            "\n"
    432            "local const z_crc_t FAR crc_braid_table[][256] = {\n");
    433            for (k = 0; k < 4; k++) {
    434                fprintf(out, "   {");
    435                write_table(out, ltl[k], 256);
    436                fprintf(out, "}%s", k < 3 ? ",\n" : "");
    437            }
    438            fprintf(out,
    439            "};\n"
    440            "\n"
    441            "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
    442            for (k = 0; k < 4; k++) {
    443                fprintf(out, "   {");
    444                write_table32hi(out, big[k], 256);
    445                fprintf(out, "}%s", k < 3 ? ",\n" : "");
    446            }
    447            fprintf(out,
    448            "};\n"
    449            "\n"
    450            "#endif\n"
    451            "\n"
    452            "#endif\n");
    453        }
    454        fprintf(out,
    455            "\n"
    456            "#endif\n");
    457 
    458        /* write out zeros operator table to crc32.h */
    459        fprintf(out,
    460            "\n"
    461            "local const z_crc_t FAR x2n_table[] = {\n"
    462            "    ");
    463        write_table(out, x2n_table, 32);
    464        fprintf(out,
    465            "};\n");
    466        fclose(out);
    467    }
    468 #endif /* MAKECRCH */
    469 }
    470 
    471 #ifdef MAKECRCH
    472 
    473 /*
    474   Write the 32-bit values in table[0..k-1] to out, five per line in
    475   hexadecimal separated by commas.
    476 */
    477 local void write_table(FILE *out, const z_crc_t FAR *table, int k) {
    478    int n;
    479 
    480    for (n = 0; n < k; n++)
    481        fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
    482                (unsigned long)(table[n]),
    483                n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
    484 }
    485 
    486 /*
    487   Write the high 32-bits of each value in table[0..k-1] to out, five per line
    488   in hexadecimal separated by commas.
    489 */
    490 local void write_table32hi(FILE *out, const z_word_t FAR *table, int k) {
    491    int n;
    492 
    493    for (n = 0; n < k; n++)
    494        fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
    495                (unsigned long)(table[n] >> 32),
    496                n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
    497 }
    498 
    499 /*
    500  Write the 64-bit values in table[0..k-1] to out, three per line in
    501  hexadecimal separated by commas. This assumes that if there is a 64-bit
    502  type, then there is also a long long integer type, and it is at least 64
    503  bits. If not, then the type cast and format string can be adjusted
    504  accordingly.
    505 */
    506 local void write_table64(FILE *out, const z_word_t FAR *table, int k) {
    507    int n;
    508 
    509    for (n = 0; n < k; n++)
    510        fprintf(out, "%s0x%016llx%s", n == 0 || n % 3 ? "" : "    ",
    511                (unsigned long long)(table[n]),
    512                n == k - 1 ? "" : (n % 3 == 2 ? ",\n" : ", "));
    513 }
    514 
    515 /* Actually do the deed. */
    516 int main(void) {
    517    make_crc_table();
    518    return 0;
    519 }
    520 
    521 #endif /* MAKECRCH */
    522 
    523 #ifdef W
    524 /*
    525  Generate the little and big-endian braid tables for the given n and z_word_t
    526  size w. Each array must have room for w blocks of 256 elements.
    527 */
    528 local void braid(z_crc_t ltl[][256], z_word_t big[][256], int n, int w) {
    529    int k;
    530    z_crc_t i, p, q;
    531    for (k = 0; k < w; k++) {
    532        p = x2nmodp((n * w + 3 - k) << 3, 0);
    533        ltl[k][0] = 0;
    534        big[w - 1 - k][0] = 0;
    535        for (i = 1; i < 256; i++) {
    536            ltl[k][i] = q = multmodp(i << 24, p);
    537            big[w - 1 - k][i] = byte_swap(q);
    538        }
    539    }
    540 }
    541 #endif
    542 
    543 #endif /* DYNAMIC_CRC_TABLE */
    544 
    545 /* =========================================================================
    546 * This function can be used by asm versions of crc32(), and to force the
    547 * generation of the CRC tables in a threaded application.
    548 */
    549 const z_crc_t FAR * ZEXPORT get_crc_table(void) {
    550 #ifdef DYNAMIC_CRC_TABLE
    551    once(&made, make_crc_table);
    552 #endif /* DYNAMIC_CRC_TABLE */
    553    return (const z_crc_t FAR *)crc_table;
    554 }
    555 
    556 /* =========================================================================
    557 * Use ARM machine instructions if available. This will compute the CRC about
    558 * ten times faster than the braided calculation. This code does not check for
    559 * the presence of the CRC instruction at run time. __ARM_FEATURE_CRC32 will
    560 * only be defined if the compilation specifies an ARM processor architecture
    561 * that has the instructions. For example, compiling with -march=armv8.1-a or
    562 * -march=armv8-a+crc, or -march=native if the compile machine has the crc32
    563 * instructions.
    564 */
    565 #ifdef ARMCRC32
    566 
    567 /*
    568   Constants empirically determined to maximize speed. These values are from
    569   measurements on a Cortex-A57. Your mileage may vary.
    570 */
    571 #define Z_BATCH 3990                /* number of words in a batch */
    572 #define Z_BATCH_ZEROS 0xa10d3d0c    /* computed from Z_BATCH = 3990 */
    573 #define Z_BATCH_MIN 800             /* fewest words in a final batch */
    574 
    575 unsigned long ZEXPORT crc32_z(unsigned long crc, const unsigned char FAR *buf,
    576                              z_size_t len) {
    577    z_crc_t val;
    578    z_word_t crc1, crc2;
    579    const z_word_t *word;
    580    z_word_t val0, val1, val2;
    581    z_size_t last, last2, i;
    582    z_size_t num;
    583 
    584    /* Return initial CRC, if requested. */
    585    if (buf == Z_NULL) return 0;
    586 
    587 #ifdef DYNAMIC_CRC_TABLE
    588    once(&made, make_crc_table);
    589 #endif /* DYNAMIC_CRC_TABLE */
    590 
    591    /* Pre-condition the CRC */
    592    crc = (~crc) & 0xffffffff;
    593 
    594    /* Compute the CRC up to a word boundary. */
    595    while (len && ((z_size_t)buf & 7) != 0) {
    596        len--;
    597        val = *buf++;
    598        __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
    599    }
    600 
    601    /* Prepare to compute the CRC on full 64-bit words word[0..num-1]. */
    602    word = (z_word_t const *)buf;
    603    num = len >> 3;
    604    len &= 7;
    605 
    606    /* Do three interleaved CRCs to realize the throughput of one crc32x
    607       instruction per cycle. Each CRC is calculated on Z_BATCH words. The
    608       three CRCs are combined into a single CRC after each set of batches. */
    609    while (num >= 3 * Z_BATCH) {
    610        crc1 = 0;
    611        crc2 = 0;
    612        for (i = 0; i < Z_BATCH; i++) {
    613            val0 = word[i];
    614            val1 = word[i + Z_BATCH];
    615            val2 = word[i + 2 * Z_BATCH];
    616            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
    617            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
    618            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
    619        }
    620        word += 3 * Z_BATCH;
    621        num -= 3 * Z_BATCH;
    622        crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc1;
    623        crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc2;
    624    }
    625 
    626    /* Do one last smaller batch with the remaining words, if there are enough
    627       to pay for the combination of CRCs. */
    628    last = num / 3;
    629    if (last >= Z_BATCH_MIN) {
    630        last2 = last << 1;
    631        crc1 = 0;
    632        crc2 = 0;
    633        for (i = 0; i < last; i++) {
    634            val0 = word[i];
    635            val1 = word[i + last];
    636            val2 = word[i + last2];
    637            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
    638            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
    639            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
    640        }
    641        word += 3 * last;
    642        num -= 3 * last;
    643        val = x2nmodp(last, 6);
    644        crc = multmodp(val, crc) ^ crc1;
    645        crc = multmodp(val, crc) ^ crc2;
    646    }
    647 
    648    /* Compute the CRC on any remaining words. */
    649    for (i = 0; i < num; i++) {
    650        val0 = word[i];
    651        __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
    652    }
    653    word += num;
    654 
    655    /* Complete the CRC on any remaining bytes. */
    656    buf = (const unsigned char FAR *)word;
    657    while (len) {
    658        len--;
    659        val = *buf++;
    660        __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
    661    }
    662 
    663    /* Return the CRC, post-conditioned. */
    664    return crc ^ 0xffffffff;
    665 }
    666 
    667 #else
    668 
    669 #ifdef W
    670 
    671 /*
    672  Return the CRC of the W bytes in the word_t data, taking the
    673  least-significant byte of the word as the first byte of data, without any pre
    674  or post conditioning. This is used to combine the CRCs of each braid.
    675 */
    676 local z_crc_t crc_word(z_word_t data) {
    677    int k;
    678    for (k = 0; k < W; k++)
    679        data = (data >> 8) ^ crc_table[data & 0xff];
    680    return (z_crc_t)data;
    681 }
    682 
    683 local z_word_t crc_word_big(z_word_t data) {
    684    int k;
    685    for (k = 0; k < W; k++)
    686        data = (data << 8) ^
    687            crc_big_table[(data >> ((W - 1) << 3)) & 0xff];
    688    return data;
    689 }
    690 
    691 #endif
    692 
    693 /* ========================================================================= */
    694 unsigned long ZEXPORT crc32_z(unsigned long crc, const unsigned char FAR *buf,
    695                              z_size_t len) {
    696    /* Return initial CRC, if requested. */
    697    if (buf == Z_NULL) return 0;
    698 
    699 #ifdef DYNAMIC_CRC_TABLE
    700    once(&made, make_crc_table);
    701 #endif /* DYNAMIC_CRC_TABLE */
    702 
    703    /* Pre-condition the CRC */
    704    crc = (~crc) & 0xffffffff;
    705 
    706 #ifdef W
    707 
    708    /* If provided enough bytes, do a braided CRC calculation. */
    709    if (len >= N * W + W - 1) {
    710        z_size_t blks;
    711        z_word_t const *words;
    712        unsigned endian;
    713        int k;
    714 
    715        /* Compute the CRC up to a z_word_t boundary. */
    716        while (len && ((z_size_t)buf & (W - 1)) != 0) {
    717            len--;
    718            crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
    719        }
    720 
    721        /* Compute the CRC on as many N z_word_t blocks as are available. */
    722        blks = len / (N * W);
    723        len -= blks * N * W;
    724        words = (z_word_t const *)buf;
    725 
    726        /* Do endian check at execution time instead of compile time, since ARM
    727           processors can change the endianness at execution time. If the
    728           compiler knows what the endianness will be, it can optimize out the
    729           check and the unused branch. */
    730        endian = 1;
    731        if (*(unsigned char *)&endian) {
    732            /* Little endian. */
    733 
    734            z_crc_t crc0;
    735            z_word_t word0;
    736 #if N > 1
    737            z_crc_t crc1;
    738            z_word_t word1;
    739 #if N > 2
    740            z_crc_t crc2;
    741            z_word_t word2;
    742 #if N > 3
    743            z_crc_t crc3;
    744            z_word_t word3;
    745 #if N > 4
    746            z_crc_t crc4;
    747            z_word_t word4;
    748 #if N > 5
    749            z_crc_t crc5;
    750            z_word_t word5;
    751 #endif
    752 #endif
    753 #endif
    754 #endif
    755 #endif
    756 
    757            /* Initialize the CRC for each braid. */
    758            crc0 = crc;
    759 #if N > 1
    760            crc1 = 0;
    761 #if N > 2
    762            crc2 = 0;
    763 #if N > 3
    764            crc3 = 0;
    765 #if N > 4
    766            crc4 = 0;
    767 #if N > 5
    768            crc5 = 0;
    769 #endif
    770 #endif
    771 #endif
    772 #endif
    773 #endif
    774 
    775            /*
    776              Process the first blks-1 blocks, computing the CRCs on each braid
    777              independently.
    778             */
    779            while (--blks) {
    780                /* Load the word for each braid into registers. */
    781                word0 = crc0 ^ words[0];
    782 #if N > 1
    783                word1 = crc1 ^ words[1];
    784 #if N > 2
    785                word2 = crc2 ^ words[2];
    786 #if N > 3
    787                word3 = crc3 ^ words[3];
    788 #if N > 4
    789                word4 = crc4 ^ words[4];
    790 #if N > 5
    791                word5 = crc5 ^ words[5];
    792 #endif
    793 #endif
    794 #endif
    795 #endif
    796 #endif
    797                words += N;
    798 
    799                /* Compute and update the CRC for each word. The loop should
    800                   get unrolled. */
    801                crc0 = crc_braid_table[0][word0 & 0xff];
    802 #if N > 1
    803                crc1 = crc_braid_table[0][word1 & 0xff];
    804 #if N > 2
    805                crc2 = crc_braid_table[0][word2 & 0xff];
    806 #if N > 3
    807                crc3 = crc_braid_table[0][word3 & 0xff];
    808 #if N > 4
    809                crc4 = crc_braid_table[0][word4 & 0xff];
    810 #if N > 5
    811                crc5 = crc_braid_table[0][word5 & 0xff];
    812 #endif
    813 #endif
    814 #endif
    815 #endif
    816 #endif
    817                for (k = 1; k < W; k++) {
    818                    crc0 ^= crc_braid_table[k][(word0 >> (k << 3)) & 0xff];
    819 #if N > 1
    820                    crc1 ^= crc_braid_table[k][(word1 >> (k << 3)) & 0xff];
    821 #if N > 2
    822                    crc2 ^= crc_braid_table[k][(word2 >> (k << 3)) & 0xff];
    823 #if N > 3
    824                    crc3 ^= crc_braid_table[k][(word3 >> (k << 3)) & 0xff];
    825 #if N > 4
    826                    crc4 ^= crc_braid_table[k][(word4 >> (k << 3)) & 0xff];
    827 #if N > 5
    828                    crc5 ^= crc_braid_table[k][(word5 >> (k << 3)) & 0xff];
    829 #endif
    830 #endif
    831 #endif
    832 #endif
    833 #endif
    834                }
    835            }
    836 
    837            /*
    838              Process the last block, combining the CRCs of the N braids at the
    839              same time.
    840             */
    841            crc = crc_word(crc0 ^ words[0]);
    842 #if N > 1
    843            crc = crc_word(crc1 ^ words[1] ^ crc);
    844 #if N > 2
    845            crc = crc_word(crc2 ^ words[2] ^ crc);
    846 #if N > 3
    847            crc = crc_word(crc3 ^ words[3] ^ crc);
    848 #if N > 4
    849            crc = crc_word(crc4 ^ words[4] ^ crc);
    850 #if N > 5
    851            crc = crc_word(crc5 ^ words[5] ^ crc);
    852 #endif
    853 #endif
    854 #endif
    855 #endif
    856 #endif
    857            words += N;
    858        }
    859        else {
    860            /* Big endian. */
    861 
    862            z_word_t crc0, word0, comb;
    863 #if N > 1
    864            z_word_t crc1, word1;
    865 #if N > 2
    866            z_word_t crc2, word2;
    867 #if N > 3
    868            z_word_t crc3, word3;
    869 #if N > 4
    870            z_word_t crc4, word4;
    871 #if N > 5
    872            z_word_t crc5, word5;
    873 #endif
    874 #endif
    875 #endif
    876 #endif
    877 #endif
    878 
    879            /* Initialize the CRC for each braid. */
    880            crc0 = byte_swap(crc);
    881 #if N > 1
    882            crc1 = 0;
    883 #if N > 2
    884            crc2 = 0;
    885 #if N > 3
    886            crc3 = 0;
    887 #if N > 4
    888            crc4 = 0;
    889 #if N > 5
    890            crc5 = 0;
    891 #endif
    892 #endif
    893 #endif
    894 #endif
    895 #endif
    896 
    897            /*
    898              Process the first blks-1 blocks, computing the CRCs on each braid
    899              independently.
    900             */
    901            while (--blks) {
    902                /* Load the word for each braid into registers. */
    903                word0 = crc0 ^ words[0];
    904 #if N > 1
    905                word1 = crc1 ^ words[1];
    906 #if N > 2
    907                word2 = crc2 ^ words[2];
    908 #if N > 3
    909                word3 = crc3 ^ words[3];
    910 #if N > 4
    911                word4 = crc4 ^ words[4];
    912 #if N > 5
    913                word5 = crc5 ^ words[5];
    914 #endif
    915 #endif
    916 #endif
    917 #endif
    918 #endif
    919                words += N;
    920 
    921                /* Compute and update the CRC for each word. The loop should
    922                   get unrolled. */
    923                crc0 = crc_braid_big_table[0][word0 & 0xff];
    924 #if N > 1
    925                crc1 = crc_braid_big_table[0][word1 & 0xff];
    926 #if N > 2
    927                crc2 = crc_braid_big_table[0][word2 & 0xff];
    928 #if N > 3
    929                crc3 = crc_braid_big_table[0][word3 & 0xff];
    930 #if N > 4
    931                crc4 = crc_braid_big_table[0][word4 & 0xff];
    932 #if N > 5
    933                crc5 = crc_braid_big_table[0][word5 & 0xff];
    934 #endif
    935 #endif
    936 #endif
    937 #endif
    938 #endif
    939                for (k = 1; k < W; k++) {
    940                    crc0 ^= crc_braid_big_table[k][(word0 >> (k << 3)) & 0xff];
    941 #if N > 1
    942                    crc1 ^= crc_braid_big_table[k][(word1 >> (k << 3)) & 0xff];
    943 #if N > 2
    944                    crc2 ^= crc_braid_big_table[k][(word2 >> (k << 3)) & 0xff];
    945 #if N > 3
    946                    crc3 ^= crc_braid_big_table[k][(word3 >> (k << 3)) & 0xff];
    947 #if N > 4
    948                    crc4 ^= crc_braid_big_table[k][(word4 >> (k << 3)) & 0xff];
    949 #if N > 5
    950                    crc5 ^= crc_braid_big_table[k][(word5 >> (k << 3)) & 0xff];
    951 #endif
    952 #endif
    953 #endif
    954 #endif
    955 #endif
    956                }
    957            }
    958 
    959            /*
    960              Process the last block, combining the CRCs of the N braids at the
    961              same time.
    962             */
    963            comb = crc_word_big(crc0 ^ words[0]);
    964 #if N > 1
    965            comb = crc_word_big(crc1 ^ words[1] ^ comb);
    966 #if N > 2
    967            comb = crc_word_big(crc2 ^ words[2] ^ comb);
    968 #if N > 3
    969            comb = crc_word_big(crc3 ^ words[3] ^ comb);
    970 #if N > 4
    971            comb = crc_word_big(crc4 ^ words[4] ^ comb);
    972 #if N > 5
    973            comb = crc_word_big(crc5 ^ words[5] ^ comb);
    974 #endif
    975 #endif
    976 #endif
    977 #endif
    978 #endif
    979            words += N;
    980            crc = byte_swap(comb);
    981        }
    982 
    983        /*
    984          Update the pointer to the remaining bytes to process.
    985         */
    986        buf = (unsigned char const *)words;
    987    }
    988 
    989 #endif /* W */
    990 
    991    /* Complete the computation of the CRC on any remaining bytes. */
    992    while (len >= 8) {
    993        len -= 8;
    994        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
    995        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
    996        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
    997        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
    998        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
    999        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
   1000        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
   1001        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
   1002    }
   1003    while (len) {
   1004        len--;
   1005        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
   1006    }
   1007 
   1008    /* Return the CRC, post-conditioned. */
   1009    return crc ^ 0xffffffff;
   1010 }
   1011 
   1012 #endif
   1013 
   1014 /* ========================================================================= */
   1015 unsigned long ZEXPORT crc32(unsigned long crc, const unsigned char FAR *buf,
   1016                            uInt len) {
   1017    return crc32_z(crc, buf, len);
   1018 }
   1019 
   1020 /* ========================================================================= */
   1021 uLong ZEXPORT crc32_combine64(uLong crc1, uLong crc2, z_off64_t len2) {
   1022 #ifdef DYNAMIC_CRC_TABLE
   1023    once(&made, make_crc_table);
   1024 #endif /* DYNAMIC_CRC_TABLE */
   1025    return multmodp(x2nmodp(len2, 3), crc1) ^ (crc2 & 0xffffffff);
   1026 }
   1027 
   1028 /* ========================================================================= */
   1029 uLong ZEXPORT crc32_combine(uLong crc1, uLong crc2, z_off_t len2) {
   1030    return crc32_combine64(crc1, crc2, (z_off64_t)len2);
   1031 }
   1032 
   1033 /* ========================================================================= */
   1034 uLong ZEXPORT crc32_combine_gen64(z_off64_t len2) {
   1035 #ifdef DYNAMIC_CRC_TABLE
   1036    once(&made, make_crc_table);
   1037 #endif /* DYNAMIC_CRC_TABLE */
   1038    return x2nmodp(len2, 3);
   1039 }
   1040 
   1041 /* ========================================================================= */
   1042 uLong ZEXPORT crc32_combine_gen(z_off_t len2) {
   1043    return crc32_combine_gen64((z_off64_t)len2);
   1044 }
   1045 
   1046 /* ========================================================================= */
   1047 uLong ZEXPORT crc32_combine_op(uLong crc1, uLong crc2, uLong op) {
   1048    return multmodp(op, crc1) ^ (crc2 & 0xffffffff);
   1049 }