/* gznorm.c -- normalize a gzip stream
 * Copyright (C) 2018 Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 * Version 1.0  7 Oct 2018  Mark Adler */
5 1.1 christos
// gznorm takes a gzip stream, potentially containing multiple members, and
// converts it to a gzip stream with a single member. In addition the gzip
// header is normalized, removing the file name and time stamp, and setting the
// other header contents (XFL, OS) to fixed values. gznorm does not recompress
// the data, so it is fast, but no advantage is gained from the history that
// could be available across member boundaries.
12 1.1 christos
13 1.1 christos #include <stdio.h> // fread, fwrite, putc, fflush, ferror, fprintf,
14 1.1 christos // vsnprintf, stdout, stderr, NULL, FILE
15 1.1 christos #include <stdlib.h> // malloc, free
16 1.1 christos #include <string.h> // strerror
17 1.1 christos #include <errno.h> // errno
18 1.1 christos #include <stdarg.h> // va_list, va_start, va_end
19 1.1 christos #include "zlib.h" // inflateInit2, inflate, inflateReset, inflateEnd,
20 1.1 christos // z_stream, z_off_t, crc32_combine, Z_NULL, Z_BLOCK,
21 1.1 christos // Z_OK, Z_STREAM_END, Z_BUF_ERROR, Z_DATA_ERROR,
22 1.1 christos // Z_MEM_ERROR
23 1.1 christos
// SET_BINARY_MODE(file) switches a stdio stream to binary mode on platforms
// that distinguish text from binary streams. Everywhere else it is a no-op
// that still expands to a valid expression, so that "SET_BINARY_MODE(f);" is
// an ordinary expression statement on every platform.
#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
#  include <fcntl.h>
#  include <io.h>
#  define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY)
#else
#  define SET_BINARY_MODE(file) ((void)0)
#endif
31 1.1 christos
// File-private linkage marker used on the function definitions in this file.
#define local static
33 1.1 christos
// printf to an allocated string. Return the string, or NULL if the printf or
// allocation fails. The returned string comes from malloc(), so the caller
// releases it with free().
//
//   fmt  printf-style format string (not modified)
//   ...  arguments consumed by fmt
static char *aprintf(const char *fmt, ...) {
    // First pass: get the length of the result of the printf, writing
    // nothing.
    va_list args;
    va_start(args, fmt);
    int len = vsnprintf(NULL, 0, fmt, args);
    va_end(args);
    if (len < 0)
        return NULL;

    // Allocate the required space, including the terminating nul. The size is
    // computed in size_t so that len == INT_MAX cannot overflow an int.
    size_t size = (size_t)len + 1;
    char *str = malloc(size);
    if (str == NULL)
        return NULL;

    // Second pass: printf into the allocated space. The argument list has to
    // be restarted, since the first pass consumed it. If this pass fails or
    // disagrees with the measured length, the buffer contents cannot be
    // trusted, so give up rather than return a bogus string.
    va_start(args, fmt);
    int wrote = vsnprintf(str, size, fmt, args);
    va_end(args);
    if (wrote < 0 || (size_t)wrote >= size) {
        free(str);
        return NULL;
    }
    return str;
}
54 1.1 christos
// Return with an error, putting an allocated error message in *err. This
// relies on the enclosing function having a z_stream named strm and a char **
// named err in scope, and it makes that function return 1. Doing an
// inflateEnd() on an already ended state, or one with state set to Z_NULL, is
// permitted.
//
// The message is formatted before inflateEnd() is called, so that arguments
// such as strm.msg or strerror(errno) are evaluated while the stream is still
// intact and before the memory release inside inflateEnd() has any chance to
// disturb errno.
#define BYE(...) \
    do { \
        char *bye_msg_ = aprintf(__VA_ARGS__); \
        inflateEnd(&strm); \
        *err = bye_msg_; \
        return 1; \
    } while (0)
64 1.1 christos
// Chunk size for buffered reads and for decompression. Twice this many bytes
// will be allocated on the stack by gzip_normalize(). Must fit in an unsigned.
#define CHUNK 16384
68 1.1 christos
69 1.1 christos // Read a gzip stream from in and write an equivalent normalized gzip stream to
70 1.1 christos // out. If given no input, an empty gzip stream will be written. If successful,
71 1.1 christos // 0 is returned, and *err is set to NULL. On error, 1 is returned, where the
72 1.1 christos // details of the error are returned in *err, a pointer to an allocated string.
73 1.1 christos //
74 1.1 christos // The input may be a stream with multiple gzip members, which is converted to
75 1.1 christos // a single gzip member on the output. Each gzip member is decompressed at the
76 1.1 christos // level of deflate blocks. This enables clearing the last-block bit, shifting
77 1.1 christos // the compressed data to concatenate to the previous member's compressed data,
78 1.1 christos // which can end at an arbitrary bit boundary, and identifying stored blocks in
79 1.1 christos // order to resynchronize those to byte boundaries. The deflate compressed data
80 1.1 christos // is terminated with a 10-bit empty fixed block. If any members on the input
81 1.1 christos // end with a 10-bit empty fixed block, then that block is excised from the
82 1.1 christos // stream. This avoids appending empty fixed blocks for every normalization,
83 1.1 christos // and assures that gzip_normalize applied a second time will not change the
84 1.1 christos // input. The pad bits after stored block headers and after the final deflate
85 1.1 christos // block are all forced to zeros.
86 1.1 christos local int gzip_normalize(FILE *in, FILE *out, char **err) {
87 1.1 christos // initialize the inflate engine to process a gzip member
88 1.1 christos z_stream strm;
89 1.1 christos strm.zalloc = Z_NULL;
90 1.1 christos strm.zfree = Z_NULL;
91 1.1 christos strm.opaque = Z_NULL;
92 1.1 christos strm.avail_in = 0;
93 1.1 christos strm.next_in = Z_NULL;
94 1.1 christos if (inflateInit2(&strm, 15 + 16) != Z_OK)
95 1.1 christos BYE("out of memory");
96 1.1 christos
97 1.1 christos // State while processing the input gzip stream.
98 1.1 christos enum { // BETWEEN -> HEAD -> BLOCK -> TAIL -> BETWEEN -> ...
99 1.1 christos BETWEEN, // between gzip members (must end in this state)
100 1.1 christos HEAD, // reading a gzip header
101 1.1 christos BLOCK, // reading deflate blocks
102 1.1 christos TAIL // reading a gzip trailer
103 1.1 christos } state = BETWEEN; // current component being processed
104 1.1 christos unsigned long crc = 0; // accumulated CRC of uncompressed data
105 1.1 christos unsigned long len = 0; // accumulated length of uncompressed data
106 1.1 christos unsigned long buf = 0; // deflate stream bit buffer of num bits
107 1.1 christos int num = 0; // number of bits in buf (at bottom)
108 1.1 christos
109 1.1 christos // Write a canonical gzip header (no mod time, file name, comment, extra
110 1.1 christos // block, or extra flags, and OS is marked as unknown).
111 1.1 christos fwrite("\x1f\x8b\x08\0\0\0\0\0\0\xff", 1, 10, out);
112 1.1 christos
113 1.1 christos // Process the gzip stream from in until reaching the end of the input,
114 1.1 christos // encountering invalid input, or experiencing an i/o error.
115 1.1 christos int more; // true if not at the end of the input
116 1.1 christos do {
117 1.1 christos // State inside this loop.
118 1.1 christos unsigned char *put; // next input buffer location to process
119 1.1 christos int prev; // number of bits from previous block in
120 1.1 christos // the bit buffer, or -1 if not at the
121 1.1 christos // start of a block
122 1.1 christos unsigned long long memb; // uncompressed length of member
123 1.1 christos size_t tail; // number of trailer bytes read (0..8)
124 1.1 christos unsigned long part; // accumulated trailer component
125 1.1 christos
126 1.1 christos // Get the next chunk of input from in.
127 1.1 christos unsigned char dat[CHUNK];
128 1.1 christos strm.avail_in = fread(dat, 1, CHUNK, in);
129 1.1 christos if (strm.avail_in == 0)
130 1.1 christos break;
131 1.1 christos more = strm.avail_in == CHUNK;
132 1.1 christos strm.next_in = put = dat;
133 1.1 christos
134 1.1 christos // Run that chunk of input through the inflate engine to exhaustion.
135 1.1 christos do {
136 1.1 christos // At this point it is assured that strm.avail_in > 0.
137 1.1 christos
138 1.1 christos // Inflate until the end of a gzip component (header, deflate
139 1.1 christos // block, trailer) is reached, or until all of the chunk is
140 1.1 christos // consumed. The resulting decompressed data is discarded, though
141 1.1 christos // the total size of the decompressed data in each member is
142 1.1 christos // tracked, for the calculation of the total CRC.
143 1.1 christos do {
144 1.1 christos // inflate and handle any errors
145 1.1 christos unsigned char scrap[CHUNK];
146 1.1 christos strm.avail_out = CHUNK;
147 1.1 christos strm.next_out = scrap;
148 1.1 christos int ret = inflate(&strm, Z_BLOCK);
149 1.1 christos if (ret == Z_MEM_ERROR)
150 1.1 christos BYE("out of memory");
151 1.1 christos if (ret == Z_DATA_ERROR)
152 1.1 christos BYE("input invalid: %s", strm.msg);
153 1.1 christos if (ret != Z_OK && ret != Z_BUF_ERROR && ret != Z_STREAM_END)
154 1.1 christos BYE("internal error");
155 1.1 christos
156 1.1 christos // Update the number of uncompressed bytes generated in this
157 1.1 christos // member. The actual count (not modulo 2^32) is required to
158 1.1 christos // correctly compute the total CRC.
159 1.1 christos unsigned got = CHUNK - strm.avail_out;
160 1.1 christos memb += got;
161 1.1 christos if (memb < got)
162 1.1 christos BYE("overflow error");
163 1.1 christos
164 1.1 christos // Continue to process this chunk until it is consumed, or
165 1.1 christos // until the end of a component (header, deflate block, or
166 1.1 christos // trailer) is reached.
167 1.1 christos } while (strm.avail_out == 0 && (strm.data_type & 0x80) == 0);
168 1.1 christos
169 1.1 christos // Since strm.avail_in was > 0 for the inflate call, some input was
170 1.1 christos // just consumed. It is therefore assured that put < strm.next_in.
171 1.1 christos
172 1.1 christos // Disposition the consumed component or part of a component.
173 1.1 christos switch (state) {
174 1.1 christos case BETWEEN:
175 1.1 christos state = HEAD;
176 1.1 christos // Fall through to HEAD when some or all of the header is
177 1.1 christos // processed.
178 1.1 christos
179 1.1 christos case HEAD:
180 1.1 christos // Discard the header.
181 1.1 christos if (strm.data_type & 0x80) {
182 1.1 christos // End of header reached -- deflate blocks follow.
183 1.1 christos put = strm.next_in;
184 1.1 christos prev = num;
185 1.1 christos memb = 0;
186 1.1 christos state = BLOCK;
187 1.1 christos }
188 1.1 christos break;
189 1.1 christos
190 1.1 christos case BLOCK:
191 1.1 christos // Copy the deflate stream to the output, but with the
192 1.1 christos // last-block-bit cleared. Re-synchronize stored block
193 1.1 christos // headers to the output byte boundaries. The bytes at
194 1.1 christos // put..strm.next_in-1 is the compressed data that has been
195 1.1 christos // processed and is ready to be copied to the output.
196 1.1 christos
197 1.1 christos // At this point, it is assured that new compressed data is
198 1.1 christos // available, i.e., put < strm.next_in. If prev is -1, then
199 1.1 christos // that compressed data starts in the middle of a deflate
200 1.1 christos // block. If prev is not -1, then the bits in the bit
201 1.1 christos // buffer, possibly combined with the bits in *put, contain
202 1.1 christos // the three-bit header of the new deflate block. In that
203 1.1 christos // case, prev is the number of bits from the previous block
204 1.1 christos // that remain in the bit buffer. Since num is the number
205 1.1 christos // of bits in the bit buffer, we have that num - prev is
206 1.1 christos // the number of bits from the new block currently in the
207 1.1 christos // bit buffer.
208 1.1 christos
209 1.1 christos // If strm.data_type & 0xc0 is 0x80, then the last byte of
210 1.1 christos // the available compressed data includes the last bits of
211 1.1 christos // the end of a deflate block. In that case, that last byte
212 1.1 christos // also has strm.data_type & 0x1f bits of the next deflate
213 1.1 christos // block, in the range 0..7. If strm.data_type & 0xc0 is
214 1.1 christos // 0xc0, then the last byte of the compressed data is the
215 1.1 christos // end of the deflate stream, followed by strm.data_type &
216 1.1 christos // 0x1f pad bits, also in the range 0..7.
217 1.1 christos
218 1.1 christos // Set bits to the number of bits not yet consumed from the
219 1.1 christos // last byte. If we are at the end of the block, bits is
220 1.1 christos // either the number of bits in the last byte belonging to
221 1.1 christos // the next block, or the number of pad bits after the
222 1.1 christos // final block. In either of those cases, bits is in the
223 1.1 christos // range 0..7.
224 1.1 christos ; // (required due to C syntax oddity)
225 1.1 christos int bits = strm.data_type & 0x1f;
226 1.1 christos
227 1.1 christos if (prev != -1) {
228 1.1 christos // We are at the start of a new block. Clear the last
229 1.1 christos // block bit, and check for special cases. If it is a
230 1.1 christos // stored block, then emit the header and pad to the
231 1.1 christos // next byte boundary. If it is a final, empty fixed
232 1.1 christos // block, then excise it.
233 1.1 christos
234 1.1 christos // Some or all of the three header bits for this block
235 1.1 christos // may already be in the bit buffer. Load any remaining
236 1.1 christos // header bits into the bit buffer.
237 1.1 christos if (num - prev < 3) {
238 1.1 christos buf += (unsigned long)*put++ << num;
239 1.1 christos num += 8;
240 1.1 christos }
241 1.1 christos
242 1.1 christos // Set last to have a 1 in the position of the last
243 1.1 christos // block bit in the bit buffer.
244 1.1 christos unsigned long last = (unsigned long)1 << prev;
245 1.1 christos
246 1.1 christos if (((buf >> prev) & 7) == 3) {
247 1.1 christos // This is a final fixed block. Load at least ten
248 1.1 christos // bits from this block, including the header, into
249 1.1 christos // the bit buffer. We already have at least three,
250 1.1 christos // so at most one more byte needs to be loaded.
251 1.1 christos if (num - prev < 10) {
252 1.1 christos if (put == strm.next_in)
253 1.1 christos // Need to go get and process more input.
254 1.1 christos // We'll end up back here to finish this.
255 1.1 christos break;
256 1.1 christos buf += (unsigned long)*put++ << num;
257 1.1 christos num += 8;
258 1.1 christos }
259 1.1 christos if (((buf >> prev) & 0x3ff) == 3) {
260 1.1 christos // That final fixed block is empty. Delete it
261 1.1 christos // to avoid adding an empty block every time a
262 1.1 christos // gzip stream is normalized.
263 1.1 christos num = prev;
264 1.1 christos buf &= last - 1; // zero the pad bits
265 1.1 christos }
266 1.1 christos }
267 1.1 christos else if (((buf >> prev) & 6) == 0) {
268 1.1 christos // This is a stored block. Flush to the next
269 1.1 christos // byte boundary after the three-bit header.
270 1.1 christos num = (prev + 10) & ~7;
271 1.1 christos buf &= last - 1; // zero the pad bits
272 1.1 christos }
273 1.1 christos
274 1.1 christos // Clear the last block bit.
275 1.1 christos buf &= ~last;
276 1.1 christos
277 1.1 christos // Write out complete bytes in the bit buffer.
278 1.1 christos while (num >= 8) {
279 1.1 christos putc(buf, out);
280 1.1 christos buf >>= 8;
281 1.1 christos num -= 8;
282 1.1 christos }
283 1.1 christos
284 1.1 christos // If no more bytes left to process, then we have
285 1.1 christos // consumed the byte that had bits from the next block.
286 1.1 christos if (put == strm.next_in)
287 1.1 christos bits = 0;
288 1.1 christos }
289 1.1 christos
290 1.1 christos // We are done handling the deflate block header. Now copy
291 1.1 christos // all or almost all of the remaining compressed data that
292 1.1 christos // has been processed so far. Don't copy one byte at the
293 1.1 christos // end if it contains bits from the next deflate block or
294 1.1 christos // pad bits at the end of a deflate block.
295 1.1 christos
296 1.1 christos // mix is 1 if we are at the end of a deflate block, and if
297 1.1 christos // some of the bits in the last byte follow this block. mix
298 1.1 christos // is 0 if we are in the middle of a deflate block, if the
299 1.1 christos // deflate block ended on a byte boundary, or if all of the
300 1.1 christos // compressed data processed so far has been consumed.
301 1.1 christos int mix = (strm.data_type & 0x80) && bits;
302 1.1 christos
303 1.1 christos // Copy all of the processed compressed data to the output,
304 1.1 christos // except for the last byte if it contains bits from the
305 1.1 christos // next deflate block or pad bits at the end of the deflate
306 1.1 christos // stream. Copy the data after shifting in num bits from
307 1.1 christos // buf in front of it, leaving num bits from the end of the
308 1.1 christos // compressed data in buf when done.
309 1.1 christos unsigned char *end = strm.next_in - mix;
310 1.1 christos if (put < end) {
311 1.1 christos if (num)
312 1.1 christos // Insert num bits from buf before the data being
313 1.1 christos // copied.
314 1.1 christos do {
315 1.1 christos buf += (unsigned)(*put++) << num;
316 1.1 christos putc(buf, out);
317 1.1 christos buf >>= 8;
318 1.1 christos } while (put < end);
319 1.1 christos else {
320 1.1 christos // No shifting needed -- write directly.
321 1.1 christos fwrite(put, 1, end - put, out);
322 1.1 christos put = end;
323 1.1 christos }
324 1.1 christos }
325 1.1 christos
326 1.1 christos // Process the last processed byte if it wasn't written.
327 1.1 christos if (mix) {
328 1.1 christos // Load the last byte into the bit buffer.
329 1.1 christos buf += (unsigned)(*put++) << num;
330 1.1 christos num += 8;
331 1.1 christos
332 1.1 christos if (strm.data_type & 0x40) {
333 1.1 christos // We are at the end of the deflate stream and
334 1.1 christos // there are bits pad bits. Discard the pad bits
335 1.1 christos // and write a byte to the output, if available.
336 1.1 christos // Leave the num bits left over in buf to prepend
337 1.1 christos // to the next deflate stream.
338 1.1 christos num -= bits;
339 1.1 christos if (num >= 8) {
340 1.1 christos putc(buf, out);
341 1.1 christos num -= 8;
342 1.1 christos buf >>= 8;
343 1.1 christos }
344 1.1 christos
345 1.1 christos // Force the pad bits in the bit buffer to zeros.
346 1.1 christos buf &= ((unsigned long)1 << num) - 1;
347 1.1 christos
348 1.1 christos // Don't need to set prev here since going to TAIL.
349 1.1 christos }
350 1.1 christos else
351 1.1 christos // At the end of an internal deflate block. Leave
352 1.1 christos // the last byte in the bit buffer to examine on
353 1.1 christos // the next entry to BLOCK, when more bits from the
354 1.1 christos // next block will be available.
355 1.1 christos prev = num - bits; // number of bits in buffer
356 1.1 christos // from current block
357 1.1 christos }
358 1.1 christos
359 1.1 christos // Don't have a byte left over, so we are in the middle of
360 1.1 christos // a deflate block, or the deflate block ended on a byte
361 1.1 christos // boundary. Set prev appropriately for the next entry into
362 1.1 christos // BLOCK.
363 1.1 christos else if (strm.data_type & 0x80)
364 1.1 christos // The block ended on a byte boundary, so no header
365 1.1 christos // bits are in the bit buffer.
366 1.1 christos prev = num;
367 1.1 christos else
368 1.1 christos // In the middle of a deflate block, so no header here.
369 1.1 christos prev = -1;
370 1.1 christos
371 1.1 christos // Check for the end of the deflate stream.
372 1.1 christos if ((strm.data_type & 0xc0) == 0xc0) {
373 1.1 christos // That ends the deflate stream on the input side, the
374 1.1 christos // pad bits were discarded, and any remaining bits from
375 1.1 christos // the last block in the stream are saved in the bit
376 1.1 christos // buffer to prepend to the next stream. Process the
377 1.1 christos // gzip trailer next.
378 1.1 christos tail = 0;
379 1.1 christos part = 0;
380 1.1 christos state = TAIL;
381 1.1 christos }
382 1.1 christos break;
383 1.1 christos
384 1.1 christos case TAIL:
385 1.1 christos // Accumulate available trailer bytes to update the total
386 1.1 christos // CRC and the total uncompressed length.
387 1.1 christos do {
388 1.1 christos part = (part >> 8) + ((unsigned long)(*put++) << 24);
389 1.1 christos tail++;
390 1.1 christos if (tail == 4) {
391 1.1 christos // Update the total CRC.
392 1.1 christos z_off_t len2 = memb;
393 1.1 christos if (len2 < 0 || (unsigned long long)len2 != memb)
394 1.1 christos BYE("overflow error");
395 1.1 christos crc = crc ? crc32_combine(crc, part, len2) : part;
396 1.1 christos part = 0;
397 1.1 christos }
398 1.1 christos else if (tail == 8) {
399 1.1 christos // Update the total uncompressed length. (It's ok
400 1.1 christos // if this sum is done modulo 2^32.)
401 1.1 christos len += part;
402 1.1 christos
403 1.1 christos // At the end of a member. Set up to inflate an
404 1.1 christos // immediately following gzip member. (If we made
405 1.1 christos // it this far, then the trailer was valid.)
406 1.1 christos if (inflateReset(&strm) != Z_OK)
407 1.1 christos BYE("internal error");
408 1.1 christos state = BETWEEN;
409 1.1 christos break;
410 1.1 christos }
411 1.1 christos } while (put < strm.next_in);
412 1.1 christos break;
413 1.1 christos }
414 1.1 christos
415 1.1 christos // Process the input buffer until completely consumed.
416 1.1 christos } while (strm.avail_in > 0);
417 1.1 christos
418 1.1 christos // Process input until end of file, invalid input, or i/o error.
419 1.1 christos } while (more);
420 1.1 christos
421 1.1 christos // Done with the inflate engine.
422 1.1 christos inflateEnd(&strm);
423 1.1 christos
424 1.1 christos // Verify the validity of the input.
425 1.1 christos if (state != BETWEEN)
426 1.1 christos BYE("input invalid: incomplete gzip stream");
427 1.1 christos
428 1.1 christos // Write the remaining deflate stream bits, followed by a terminating
429 1.1 christos // deflate fixed block.
430 1.1 christos buf += (unsigned long)3 << num;
431 1.1 christos putc(buf, out);
432 1.1 christos putc(buf >> 8, out);
433 1.1 christos if (num > 6)
434 1.1 christos putc(0, out);
435 1.1 christos
436 1.1 christos // Write the gzip trailer, which is the CRC and the uncompressed length
437 1.1 christos // modulo 2^32, both in little-endian order.
438 1.1 christos putc(crc, out);
439 1.1 christos putc(crc >> 8, out);
440 1.1 christos putc(crc >> 16, out);
441 1.1 christos putc(crc >> 24, out);
442 1.1 christos putc(len, out);
443 1.1 christos putc(len >> 8, out);
444 1.1 christos putc(len >> 16, out);
445 1.1 christos putc(len >> 24, out);
446 1.1 christos fflush(out);
447 1.1 christos
448 1.1 christos // Check for any i/o errors.
449 1.1 christos if (ferror(in) || ferror(out))
450 1.1 christos BYE("i/o error: %s", strerror(errno));
451 1.1 christos
452 1.1 christos // All good!
453 1.1 christos *err = NULL;
454 1.1 christos return 0;
455 1.1 christos }
456 1.1 christos
// Normalize the gzip stream on stdin, writing the result to stdout. The exit
// status is 0 on success, or 1 on error, in which case a message is written to
// stderr.
int main(void) {
    // Avoid end-of-line conversions on evil operating systems.
    SET_BINARY_MODE(stdin);
    SET_BINARY_MODE(stdout);

    // Normalize from stdin to stdout, returning 1 on error, 0 if ok.
    char *err = NULL;
    int ret = gzip_normalize(stdin, stdout, &err);
    if (ret)
        // err is NULL if the error message itself could not be allocated or
        // formatted. Passing NULL for %s is undefined, so use a fixed
        // fallback in that case.
        fprintf(stderr, "gznorm error: %s\n",
                err != NULL ? err : "unknown error (could not build message)");
    free(err);
    return ret;
}
471