Home | History | Annotate | Line # | Download | only in examples
      1 /* gznorm.c -- normalize a gzip stream
      2  * Copyright (C) 2018 Mark Adler
      3  * For conditions of distribution and use, see copyright notice in zlib.h
      4  * Version 1.0  7 Oct 2018  Mark Adler */
      5 
      6 // gznorm takes a gzip stream, potentially containing multiple members, and
      7 // converts it to a gzip stream with a single member. In addition the gzip
      8 // header is normalized, removing the file name and time stamp, and setting the
      9 // other header contents (XFL, OS) to fixed values. gznorm does not recompress
     10 // the data, so it is fast, but no advantage is gained from the history that
     11 // could be available across member boundaries.
     12 
     13 #include <stdio.h>      // fread, fwrite, putc, fflush, ferror, fprintf,
     14                         // vsnprintf, stdout, stderr, NULL, FILE
     15 #include <stdlib.h>     // malloc, free
     16 #include <string.h>     // strerror
     17 #include <errno.h>      // errno
     18 #include <stdarg.h>     // va_list, va_start, va_end
     19 #include "zlib.h"       // inflateInit2, inflate, inflateReset, inflateEnd,
     20                         // z_stream, z_off_t, crc32_combine, Z_NULL, Z_BLOCK,
     21                         // Z_OK, Z_STREAM_END, Z_BUF_ERROR, Z_DATA_ERROR,
     22                         // Z_MEM_ERROR
     23 
     24 #if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
     25 #  include <fcntl.h>
     26 #  include <io.h>
     27 #  define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY)
     28 #else
     29 #  define SET_BINARY_MODE(file)
     30 #endif
     31 
     32 #define local static
     33 
     34 // printf to an allocated string. Return the string, or NULL if the printf or
     35 // allocation fails.
     36 local char *aprintf(char *fmt, ...) {
     37     // Get the length of the result of the printf.
     38     va_list args;
     39     va_start(args, fmt);
     40     int len = vsnprintf(NULL, 0, fmt, args);
     41     va_end(args);
     42     if (len < 0)
     43         return NULL;
     44 
     45     // Allocate the required space and printf to it.
     46     char *str = malloc(len + 1);
     47     if (str == NULL)
     48         return NULL;
     49     va_start(args, fmt);
     50     vsnprintf(str, len + 1, fmt, args);
     51     va_end(args);
     52     return str;
     53 }
     54 
     55 // Return with an error, putting an allocated error message in *err. Doing an
     56 // inflateEnd() on an already ended state, or one with state set to Z_NULL, is
     57 // permitted.
     58 #define BYE(...) \
     59     do { \
     60         inflateEnd(&strm); \
     61         *err = aprintf(__VA_ARGS__); \
     62         return 1; \
     63     } while (0)
     64 
     65 // Chunk size for buffered reads and for decompression. Twice this many bytes
     66 // will be allocated on the stack by gzip_normalize(). Must fit in an unsigned.
     67 #define CHUNK 16384
     68 
     69 // Read a gzip stream from in and write an equivalent normalized gzip stream to
     70 // out. If given no input, an empty gzip stream will be written. If successful,
     71 // 0 is returned, and *err is set to NULL. On error, 1 is returned, where the
     72 // details of the error are returned in *err, a pointer to an allocated string.
     73 //
     74 // The input may be a stream with multiple gzip members, which is converted to
     75 // a single gzip member on the output. Each gzip member is decompressed at the
     76 // level of deflate blocks. This enables clearing the last-block bit, shifting
     77 // the compressed data to concatenate to the previous member's compressed data,
     78 // which can end at an arbitrary bit boundary, and identifying stored blocks in
     79 // order to resynchronize those to byte boundaries. The deflate compressed data
     80 // is terminated with a 10-bit empty fixed block. If any members on the input
     81 // end with a 10-bit empty fixed block, then that block is excised from the
     82 // stream. This avoids appending empty fixed blocks for every normalization,
     83 // and assures that gzip_normalize applied a second time will not change the
     84 // input. The pad bits after stored block headers and after the final deflate
     85 // block are all forced to zeros.
     86 local int gzip_normalize(FILE *in, FILE *out, char **err) {
     87     // initialize the inflate engine to process a gzip member
     88     z_stream strm;
     89     strm.zalloc = Z_NULL;
     90     strm.zfree = Z_NULL;
     91     strm.opaque = Z_NULL;
     92     strm.avail_in = 0;
     93     strm.next_in = Z_NULL;
     94     if (inflateInit2(&strm, 15 + 16) != Z_OK)
     95         BYE("out of memory");
     96 
     97     // State while processing the input gzip stream.
     98     enum {              // BETWEEN -> HEAD -> BLOCK -> TAIL -> BETWEEN -> ...
     99         BETWEEN,        // between gzip members (must end in this state)
    100         HEAD,           // reading a gzip header
    101         BLOCK,          // reading deflate blocks
    102         TAIL            // reading a gzip trailer
    103     } state = BETWEEN;              // current component being processed
    104     unsigned long crc = 0;          // accumulated CRC of uncompressed data
    105     unsigned long len = 0;          // accumulated length of uncompressed data
    106     unsigned long buf = 0;          // deflate stream bit buffer of num bits
    107     int num = 0;                    // number of bits in buf (at bottom)
    108 
    109     // Write a canonical gzip header (no mod time, file name, comment, extra
    110     // block, or extra flags, and OS is marked as unknown).
    111     fwrite("\x1f\x8b\x08\0\0\0\0\0\0\xff", 1, 10, out);
    112 
    113     // Process the gzip stream from in until reaching the end of the input,
    114     // encountering invalid input, or experiencing an i/o error.
    115     int more;                       // true if not at the end of the input
    116     do {
    117         // State inside this loop.
    118         unsigned char *put;         // next input buffer location to process
    119         int prev;                   // number of bits from previous block in
    120                                     // the bit buffer, or -1 if not at the
    121                                     // start of a block
    122         unsigned long long memb;    // uncompressed length of member
    123         size_t tail;                // number of trailer bytes read (0..8)
    124         unsigned long part;         // accumulated trailer component
    125 
    126         // Get the next chunk of input from in.
    127         unsigned char dat[CHUNK];
    128         strm.avail_in = fread(dat, 1, CHUNK, in);
    129         if (strm.avail_in == 0)
    130             break;
    131         more = strm.avail_in == CHUNK;
    132         strm.next_in = put = dat;
    133 
    134         // Run that chunk of input through the inflate engine to exhaustion.
    135         do {
    136             // At this point it is assured that strm.avail_in > 0.
    137 
    138             // Inflate until the end of a gzip component (header, deflate
    139             // block, trailer) is reached, or until all of the chunk is
    140             // consumed. The resulting decompressed data is discarded, though
    141             // the total size of the decompressed data in each member is
    142             // tracked, for the calculation of the total CRC.
    143             do {
    144                 // inflate and handle any errors
    145                 unsigned char scrap[CHUNK];
    146                 strm.avail_out = CHUNK;
    147                 strm.next_out = scrap;
    148                 int ret = inflate(&strm, Z_BLOCK);
    149                 if (ret == Z_MEM_ERROR)
    150                     BYE("out of memory");
    151                 if (ret == Z_DATA_ERROR)
    152                     BYE("input invalid: %s", strm.msg);
    153                 if (ret != Z_OK && ret != Z_BUF_ERROR && ret != Z_STREAM_END)
    154                     BYE("internal error");
    155 
    156                 // Update the number of uncompressed bytes generated in this
    157                 // member. The actual count (not modulo 2^32) is required to
    158                 // correctly compute the total CRC.
    159                 unsigned got = CHUNK - strm.avail_out;
    160                 memb += got;
    161                 if (memb < got)
    162                     BYE("overflow error");
    163 
    164                 // Continue to process this chunk until it is consumed, or
    165                 // until the end of a component (header, deflate block, or
    166                 // trailer) is reached.
    167             } while (strm.avail_out == 0 && (strm.data_type & 0x80) == 0);
    168 
    169             // Since strm.avail_in was > 0 for the inflate call, some input was
    170             // just consumed. It is therefore assured that put < strm.next_in.
    171 
    172             // Disposition the consumed component or part of a component.
    173             switch (state) {
    174                 case BETWEEN:
    175                     state = HEAD;
    176                     // Fall through to HEAD when some or all of the header is
    177                     // processed.
    178 
    179                 case HEAD:
    180                     // Discard the header.
    181                     if (strm.data_type & 0x80) {
    182                         // End of header reached -- deflate blocks follow.
    183                         put = strm.next_in;
    184                         prev = num;
    185                         memb = 0;
    186                         state = BLOCK;
    187                     }
    188                     break;
    189 
    190                 case BLOCK:
    191                     // Copy the deflate stream to the output, but with the
    192                     // last-block-bit cleared. Re-synchronize stored block
    193                     // headers to the output byte boundaries. The bytes at
    194                     // put..strm.next_in-1 is the compressed data that has been
    195                     // processed and is ready to be copied to the output.
    196 
    197                     // At this point, it is assured that new compressed data is
    198                     // available, i.e., put < strm.next_in. If prev is -1, then
    199                     // that compressed data starts in the middle of a deflate
    200                     // block. If prev is not -1, then the bits in the bit
    201                     // buffer, possibly combined with the bits in *put, contain
    202                     // the three-bit header of the new deflate block. In that
    203                     // case, prev is the number of bits from the previous block
    204                     // that remain in the bit buffer. Since num is the number
    205                     // of bits in the bit buffer, we have that num - prev is
    206                     // the number of bits from the new block currently in the
    207                     // bit buffer.
    208 
    209                     // If strm.data_type & 0xc0 is 0x80, then the last byte of
    210                     // the available compressed data includes the last bits of
    211                     // the end of a deflate block. In that case, that last byte
    212                     // also has strm.data_type & 0x1f bits of the next deflate
    213                     // block, in the range 0..7. If strm.data_type & 0xc0 is
    214                     // 0xc0, then the last byte of the compressed data is the
    215                     // end of the deflate stream, followed by strm.data_type &
    216                     // 0x1f pad bits, also in the range 0..7.
    217 
    218                     // Set bits to the number of bits not yet consumed from the
    219                     // last byte. If we are at the end of the block, bits is
    220                     // either the number of bits in the last byte belonging to
    221                     // the next block, or the number of pad bits after the
    222                     // final block. In either of those cases, bits is in the
    223                     // range 0..7.
    224                     ;                   // (required due to C syntax oddity)
    225                     int bits = strm.data_type & 0x1f;
    226 
    227                     if (prev != -1) {
    228                         // We are at the start of a new block. Clear the last
    229                         // block bit, and check for special cases. If it is a
    230                         // stored block, then emit the header and pad to the
    231                         // next byte boundary. If it is a final, empty fixed
    232                         // block, then excise it.
    233 
    234                         // Some or all of the three header bits for this block
    235                         // may already be in the bit buffer. Load any remaining
    236                         // header bits into the bit buffer.
    237                         if (num - prev < 3) {
    238                             buf += (unsigned long)*put++ << num;
    239                             num += 8;
    240                         }
    241 
    242                         // Set last to have a 1 in the position of the last
    243                         // block bit in the bit buffer.
    244                         unsigned long last = (unsigned long)1 << prev;
    245 
    246                         if (((buf >> prev) & 7) == 3) {
    247                             // This is a final fixed block. Load at least ten
    248                             // bits from this block, including the header, into
    249                             // the bit buffer. We already have at least three,
    250                             // so at most one more byte needs to be loaded.
    251                             if (num - prev < 10) {
    252                                 if (put == strm.next_in)
    253                                     // Need to go get and process more input.
    254                                     // We'll end up back here to finish this.
    255                                     break;
    256                                 buf += (unsigned long)*put++ << num;
    257                                 num += 8;
    258                             }
    259                             if (((buf >> prev) & 0x3ff) == 3) {
    260                                 // That final fixed block is empty. Delete it
    261                                 // to avoid adding an empty block every time a
    262                                 // gzip stream is normalized.
    263                                 num = prev;
    264                                 buf &= last - 1;    // zero the pad bits
    265                             }
    266                         }
    267                         else if (((buf >> prev) & 6) == 0) {
    268                             // This is a stored block. Flush to the next
    269                             // byte boundary after the three-bit header.
    270                             num = (prev + 10) & ~7;
    271                             buf &= last - 1;        // zero the pad bits
    272                         }
    273 
    274                         // Clear the last block bit.
    275                         buf &= ~last;
    276 
    277                         // Write out complete bytes in the bit buffer.
    278                         while (num >= 8) {
    279                             putc(buf, out);
    280                             buf >>= 8;
    281                             num -= 8;
    282                         }
    283 
    284                         // If no more bytes left to process, then we have
    285                         // consumed the byte that had bits from the next block.
    286                         if (put == strm.next_in)
    287                             bits = 0;
    288                     }
    289 
    290                     // We are done handling the deflate block header. Now copy
    291                     // all or almost all of the remaining compressed data that
    292                     // has been processed so far. Don't copy one byte at the
    293                     // end if it contains bits from the next deflate block or
    294                     // pad bits at the end of a deflate block.
    295 
    296                     // mix is 1 if we are at the end of a deflate block, and if
    297                     // some of the bits in the last byte follow this block. mix
    298                     // is 0 if we are in the middle of a deflate block, if the
    299                     // deflate block ended on a byte boundary, or if all of the
    300                     // compressed data processed so far has been consumed.
    301                     int mix = (strm.data_type & 0x80) && bits;
    302 
    303                     // Copy all of the processed compressed data to the output,
    304                     // except for the last byte if it contains bits from the
    305                     // next deflate block or pad bits at the end of the deflate
    306                     // stream. Copy the data after shifting in num bits from
    307                     // buf in front of it, leaving num bits from the end of the
    308                     // compressed data in buf when done.
    309                     unsigned char *end = strm.next_in - mix;
    310                     if (put < end) {
    311                         if (num)
    312                             // Insert num bits from buf before the data being
    313                             // copied.
    314                             do {
    315                                 buf += (unsigned)(*put++) << num;
    316                                 putc(buf, out);
    317                                 buf >>= 8;
    318                             } while (put < end);
    319                         else {
    320                             // No shifting needed -- write directly.
    321                             fwrite(put, 1, end - put, out);
    322                             put = end;
    323                         }
    324                     }
    325 
    326                     // Process the last processed byte if it wasn't written.
    327                     if (mix) {
    328                         // Load the last byte into the bit buffer.
    329                         buf += (unsigned)(*put++) << num;
    330                         num += 8;
    331 
    332                         if (strm.data_type & 0x40) {
    333                             // We are at the end of the deflate stream and
    334                             // there are bits pad bits. Discard the pad bits
    335                             // and write a byte to the output, if available.
    336                             // Leave the num bits left over in buf to prepend
    337                             // to the next deflate stream.
    338                             num -= bits;
    339                             if (num >= 8) {
    340                                 putc(buf, out);
    341                                 num -= 8;
    342                                 buf >>= 8;
    343                             }
    344 
    345                             // Force the pad bits in the bit buffer to zeros.
    346                             buf &= ((unsigned long)1 << num) - 1;
    347 
    348                             // Don't need to set prev here since going to TAIL.
    349                         }
    350                         else
    351                             // At the end of an internal deflate block. Leave
    352                             // the last byte in the bit buffer to examine on
    353                             // the next entry to BLOCK, when more bits from the
    354                             // next block will be available.
    355                             prev = num - bits;      // number of bits in buffer
    356                                                     // from current block
    357                     }
    358 
    359                     // Don't have a byte left over, so we are in the middle of
    360                     // a deflate block, or the deflate block ended on a byte
    361                     // boundary. Set prev appropriately for the next entry into
    362                     // BLOCK.
    363                     else if (strm.data_type & 0x80)
    364                         // The block ended on a byte boundary, so no header
    365                         // bits are in the bit buffer.
    366                         prev = num;
    367                     else
    368                         // In the middle of a deflate block, so no header here.
    369                         prev = -1;
    370 
    371                     // Check for the end of the deflate stream.
    372                     if ((strm.data_type & 0xc0) == 0xc0) {
    373                         // That ends the deflate stream on the input side, the
    374                         // pad bits were discarded, and any remaining bits from
    375                         // the last block in the stream are saved in the bit
    376                         // buffer to prepend to the next stream. Process the
    377                         // gzip trailer next.
    378                         tail = 0;
    379                         part = 0;
    380                         state = TAIL;
    381                     }
    382                     break;
    383 
    384                 case TAIL:
    385                     // Accumulate available trailer bytes to update the total
    386                     // CRC and the total uncompressed length.
    387                     do {
    388                         part = (part >> 8) + ((unsigned long)(*put++) << 24);
    389                         tail++;
    390                         if (tail == 4) {
    391                             // Update the total CRC.
    392                             z_off_t len2 = memb;
    393                             if (len2 < 0 || (unsigned long long)len2 != memb)
    394                                 BYE("overflow error");
    395                             crc = crc ? crc32_combine(crc, part, len2) : part;
    396                             part = 0;
    397                         }
    398                         else if (tail == 8) {
    399                             // Update the total uncompressed length. (It's ok
    400                             // if this sum is done modulo 2^32.)
    401                             len += part;
    402 
    403                             // At the end of a member. Set up to inflate an
    404                             // immediately following gzip member. (If we made
    405                             // it this far, then the trailer was valid.)
    406                             if (inflateReset(&strm) != Z_OK)
    407                                 BYE("internal error");
    408                             state = BETWEEN;
    409                             break;
    410                         }
    411                     } while (put < strm.next_in);
    412                     break;
    413             }
    414 
    415             // Process the input buffer until completely consumed.
    416         } while (strm.avail_in > 0);
    417 
    418         // Process input until end of file, invalid input, or i/o error.
    419     } while (more);
    420 
    421     // Done with the inflate engine.
    422     inflateEnd(&strm);
    423 
    424     // Verify the validity of the input.
    425     if (state != BETWEEN)
    426         BYE("input invalid: incomplete gzip stream");
    427 
    428     // Write the remaining deflate stream bits, followed by a terminating
    429     // deflate fixed block.
    430     buf += (unsigned long)3 << num;
    431     putc(buf, out);
    432     putc(buf >> 8, out);
    433     if (num > 6)
    434         putc(0, out);
    435 
    436     // Write the gzip trailer, which is the CRC and the uncompressed length
    437     // modulo 2^32, both in little-endian order.
    438     putc(crc, out);
    439     putc(crc >> 8, out);
    440     putc(crc >> 16, out);
    441     putc(crc >> 24, out);
    442     putc(len, out);
    443     putc(len >> 8, out);
    444     putc(len >> 16, out);
    445     putc(len >> 24, out);
    446     fflush(out);
    447 
    448     // Check for any i/o errors.
    449     if (ferror(in) || ferror(out))
    450         BYE("i/o error: %s", strerror(errno));
    451 
    452     // All good!
    453     *err = NULL;
    454     return 0;
    455 }
    456 
    457 // Normalize the gzip stream on stdin, writing the result to stdout.
    458 int main(void) {
    459     // Avoid end-of-line conversions on evil operating systems.
    460     SET_BINARY_MODE(stdin);
    461     SET_BINARY_MODE(stdout);
    462 
    463     // Normalize from stdin to stdout, returning 1 on error, 0 if ok.
    464     char *err;
    465     int ret = gzip_normalize(stdin, stdout, &err);
    466     if (ret)
    467         fprintf(stderr, "gznorm error: %s\n", err);
    468     free(err);
    469     return ret;
    470 }
    471