gzjoin.c revision 1.1.1.2 1 1.1 christos /* gzjoin -- command to join gzip files into one gzip file
2 1.1 christos
3 1.1.1.2 christos Copyright (C) 2004, 2005, 2012 Mark Adler, all rights reserved
4 1.1.1.2 christos version 1.2, 14 Aug 2012
5 1.1 christos
6 1.1 christos This software is provided 'as-is', without any express or implied
7 1.1 christos warranty. In no event will the author be held liable for any damages
8 1.1 christos arising from the use of this software.
9 1.1 christos
10 1.1 christos Permission is granted to anyone to use this software for any purpose,
11 1.1 christos including commercial applications, and to alter it and redistribute it
12 1.1 christos freely, subject to the following restrictions:
13 1.1 christos
14 1.1 christos 1. The origin of this software must not be misrepresented; you must not
15 1.1 christos claim that you wrote the original software. If you use this software
16 1.1 christos in a product, an acknowledgment in the product documentation would be
17 1.1 christos appreciated but is not required.
18 1.1 christos 2. Altered source versions must be plainly marked as such, and must not be
19 1.1 christos misrepresented as being the original software.
20 1.1 christos 3. This notice may not be removed or altered from any source distribution.
21 1.1 christos
22 1.1 christos Mark Adler madler (at) alumni.caltech.edu
23 1.1 christos */
24 1.1 christos
25 1.1 christos /*
26 1.1 christos * Change history:
27 1.1 christos *
28 1.1 christos * 1.0 11 Dec 2004 - First version
29 1.1 christos * 1.1 12 Jun 2005 - Changed ssize_t to long for portability
30 1.1.1.2 christos * 1.2 14 Aug 2012 - Clean up for z_const usage
31 1.1 christos */
32 1.1 christos
33 1.1 christos /*
34 1.1 christos gzjoin takes one or more gzip files on the command line and writes out a
35 1.1 christos single gzip file that will uncompress to the concatenation of the
36 1.1 christos uncompressed data from the individual gzip files. gzjoin does this without
37 1.1 christos having to recompress any of the data and without having to calculate a new
38 1.1 christos crc32 for the concatenated uncompressed data. gzjoin does however have to
39 1.1 christos decompress all of the input data in order to find the bits in the compressed
40 1.1 christos data that need to be modified to concatenate the streams.
41 1.1 christos
42 1.1 christos gzjoin does not do an integrity check on the input gzip files other than
43 1.1 christos checking the gzip header and decompressing the compressed data. They are
44 1.1 christos otherwise assumed to be complete and correct.
45 1.1 christos
46 1.1 christos Each joint between gzip files removes at least 18 bytes of previous trailer
47 1.1 christos and subsequent header, and inserts an average of about three bytes to the
48 1.1 christos compressed data in order to connect the streams. The output gzip file
49 1.1 christos has a minimal ten-byte gzip header with no file name or modification time.
50 1.1 christos
51 1.1 christos This program was written to illustrate the use of the Z_BLOCK option of
52 1.1 christos inflate() and the crc32_combine() function. gzjoin will not compile with
53 1.1 christos versions of zlib earlier than 1.2.3.
54 1.1 christos */
55 1.1 christos
56 1.1 christos #include <stdio.h> /* fputs(), fprintf(), fwrite(), putc() */
57 1.1 christos #include <stdlib.h> /* exit(), malloc(), free() */
58 1.1 christos #include <fcntl.h> /* open() */
59 1.1 christos #include <unistd.h> /* close(), read(), lseek() */
60 1.1 christos #include "zlib.h"
61 1.1 christos /* crc32(), crc32_combine(), inflateInit2(), inflate(), inflateEnd() */
62 1.1 christos
63 1.1 christos #define local static
64 1.1 christos
/* Report a fatal error and terminate the program.

   The message printed on stderr is "gzjoin error: " followed by why1 and why2
   concatenated, followed by ", output incomplete".  The process then exits
   with status 1, so this function never actually returns.

   It is declared to return int (and ends with an unreachable return) only so
   that a call can appear as an operand inside an expression, as the bget()
   macro does.  Neither string is modified, hence the const qualifiers; string
   literals can be passed directly. */
local int bail(const char *why1, const char *why2)
{
    fprintf(stderr, "gzjoin error: %s%s, output incomplete\n", why1, why2);
    exit(1);
    return 0;       /* not reached -- satisfies the int return type */
}
72 1.1 christos
73 1.1 christos /* -- simple buffered file input with access to the buffer -- */
74 1.1 christos
75 1.1 christos #define CHUNK 32768 /* must be a power of two and fit in unsigned */
76 1.1 christos
/* bin -- state of one buffered input file.  The unread bytes are the `left`
   bytes starting at `next`, which points into the CHUNK-byte buffer `buf`. */
typedef struct {
    char *name;             /* file name, kept only for error messages */
    int fd;                 /* descriptor returned by open(), -1 if none */
    unsigned left;          /* count of unread bytes available at next */
    unsigned char *next;    /* position of the next unread byte */
    unsigned char *buf;     /* malloc'ed buffer holding CHUNK bytes */
} bin;
85 1.1 christos
86 1.1 christos /* close a buffered file and free allocated memory */
87 1.1 christos local void bclose(bin *in)
88 1.1 christos {
89 1.1 christos if (in != NULL) {
90 1.1 christos if (in->fd != -1)
91 1.1 christos close(in->fd);
92 1.1 christos if (in->buf != NULL)
93 1.1 christos free(in->buf);
94 1.1 christos free(in);
95 1.1 christos }
96 1.1 christos }
97 1.1 christos
98 1.1 christos /* open a buffered file for input, return a pointer to type bin, or NULL on
99 1.1 christos failure */
100 1.1 christos local bin *bopen(char *name)
101 1.1 christos {
102 1.1 christos bin *in;
103 1.1 christos
104 1.1 christos in = malloc(sizeof(bin));
105 1.1 christos if (in == NULL)
106 1.1 christos return NULL;
107 1.1 christos in->buf = malloc(CHUNK);
108 1.1 christos in->fd = open(name, O_RDONLY, 0);
109 1.1 christos if (in->buf == NULL || in->fd == -1) {
110 1.1 christos bclose(in);
111 1.1 christos return NULL;
112 1.1 christos }
113 1.1 christos in->left = 0;
114 1.1 christos in->next = in->buf;
115 1.1 christos in->name = name;
116 1.1 christos return in;
117 1.1 christos }
118 1.1 christos
119 1.1 christos /* load buffer from file, return -1 on read error, 0 or 1 on success, with
120 1.1 christos 1 indicating that end-of-file was reached */
121 1.1 christos local int bload(bin *in)
122 1.1 christos {
123 1.1 christos long len;
124 1.1 christos
125 1.1 christos if (in == NULL)
126 1.1 christos return -1;
127 1.1 christos if (in->left != 0)
128 1.1 christos return 0;
129 1.1 christos in->next = in->buf;
130 1.1 christos do {
131 1.1 christos len = (long)read(in->fd, in->buf + in->left, CHUNK - in->left);
132 1.1 christos if (len < 0)
133 1.1 christos return -1;
134 1.1 christos in->left += (unsigned)len;
135 1.1 christos } while (len != 0 && in->left < CHUNK);
136 1.1 christos return len == 0 ? 1 : 0;
137 1.1 christos }
138 1.1 christos
/* Get the next byte from the buffered file in, as an int.  If the buffer is
   empty it is reloaded with bload() first; if it is still empty after that,
   bail() is called and the program exits.  bail() is declared to return int
   precisely so that it can sit in the conditional expression here.

   Note that the argument is evaluated several times, so it must not have
   side effects.  Every use of it is parenthesized so that any pointer
   expression may be passed. */
#define bget(in) ((in)->left ? 0 : bload(in), \
                  (in)->left ? ((in)->left--, *((in)->next)++) : \
                      bail("unexpected end of file on ", (in)->name))
143 1.1 christos
144 1.1 christos /* get a four-byte little-endian unsigned integer from file */
145 1.1 christos local unsigned long bget4(bin *in)
146 1.1 christos {
147 1.1 christos unsigned long val;
148 1.1 christos
149 1.1 christos val = bget(in);
150 1.1 christos val += (unsigned long)(bget(in)) << 8;
151 1.1 christos val += (unsigned long)(bget(in)) << 16;
152 1.1 christos val += (unsigned long)(bget(in)) << 24;
153 1.1 christos return val;
154 1.1 christos }
155 1.1 christos
156 1.1 christos /* skip bytes in file */
157 1.1 christos local void bskip(bin *in, unsigned skip)
158 1.1 christos {
159 1.1 christos /* check pointer */
160 1.1 christos if (in == NULL)
161 1.1 christos return;
162 1.1 christos
163 1.1 christos /* easy case -- skip bytes in buffer */
164 1.1 christos if (skip <= in->left) {
165 1.1 christos in->left -= skip;
166 1.1 christos in->next += skip;
167 1.1 christos return;
168 1.1 christos }
169 1.1 christos
170 1.1 christos /* skip what's in buffer, discard buffer contents */
171 1.1 christos skip -= in->left;
172 1.1 christos in->left = 0;
173 1.1 christos
174 1.1 christos /* seek past multiples of CHUNK bytes */
175 1.1 christos if (skip > CHUNK) {
176 1.1 christos unsigned left;
177 1.1 christos
178 1.1 christos left = skip & (CHUNK - 1);
179 1.1 christos if (left == 0) {
180 1.1 christos /* exact number of chunks: seek all the way minus one byte to check
181 1.1 christos for end-of-file with a read */
182 1.1 christos lseek(in->fd, skip - 1, SEEK_CUR);
183 1.1 christos if (read(in->fd, in->buf, 1) != 1)
184 1.1 christos bail("unexpected end of file on ", in->name);
185 1.1 christos return;
186 1.1 christos }
187 1.1 christos
188 1.1 christos /* skip the integral chunks, update skip with remainder */
189 1.1 christos lseek(in->fd, skip - left, SEEK_CUR);
190 1.1 christos skip = left;
191 1.1 christos }
192 1.1 christos
193 1.1 christos /* read more input and skip remainder */
194 1.1 christos bload(in);
195 1.1 christos if (skip > in->left)
196 1.1 christos bail("unexpected end of file on ", in->name);
197 1.1 christos in->left -= skip;
198 1.1 christos in->next += skip;
199 1.1 christos }
200 1.1 christos
201 1.1 christos /* -- end of buffered input functions -- */
202 1.1 christos
203 1.1 christos /* skip the gzip header from file in */
204 1.1 christos local void gzhead(bin *in)
205 1.1 christos {
206 1.1 christos int flags;
207 1.1 christos
208 1.1 christos /* verify gzip magic header and compression method */
209 1.1 christos if (bget(in) != 0x1f || bget(in) != 0x8b || bget(in) != 8)
210 1.1 christos bail(in->name, " is not a valid gzip file");
211 1.1 christos
212 1.1 christos /* get and verify flags */
213 1.1 christos flags = bget(in);
214 1.1 christos if ((flags & 0xe0) != 0)
215 1.1 christos bail("unknown reserved bits set in ", in->name);
216 1.1 christos
217 1.1 christos /* skip modification time, extra flags, and os */
218 1.1 christos bskip(in, 6);
219 1.1 christos
220 1.1 christos /* skip extra field if present */
221 1.1 christos if (flags & 4) {
222 1.1 christos unsigned len;
223 1.1 christos
224 1.1 christos len = bget(in);
225 1.1 christos len += (unsigned)(bget(in)) << 8;
226 1.1 christos bskip(in, len);
227 1.1 christos }
228 1.1 christos
229 1.1 christos /* skip file name if present */
230 1.1 christos if (flags & 8)
231 1.1 christos while (bget(in) != 0)
232 1.1 christos ;
233 1.1 christos
234 1.1 christos /* skip comment if present */
235 1.1 christos if (flags & 16)
236 1.1 christos while (bget(in) != 0)
237 1.1 christos ;
238 1.1 christos
239 1.1 christos /* skip header crc if present */
240 1.1 christos if (flags & 2)
241 1.1 christos bskip(in, 2);
242 1.1 christos }
243 1.1 christos
/* Emit the low 32 bits of val to out as four bytes, least significant byte
   first. */
local void put4(unsigned long val, FILE *out)
{
    int n;

    for (n = 0; n < 4; n++) {
        putc((int)(val & 0xff), out);
        val >>= 8;
    }
}
252 1.1 christos
253 1.1 christos /* Load up zlib stream from buffered input, bail if end of file */
254 1.1 christos local void zpull(z_streamp strm, bin *in)
255 1.1 christos {
256 1.1 christos if (in->left == 0)
257 1.1 christos bload(in);
258 1.1 christos if (in->left == 0)
259 1.1 christos bail("unexpected end of file on ", in->name);
260 1.1 christos strm->avail_in = in->left;
261 1.1 christos strm->next_in = in->next;
262 1.1 christos }
263 1.1 christos
264 1.1 christos /* Write header for gzip file to out and initialize trailer. */
265 1.1 christos local void gzinit(unsigned long *crc, unsigned long *tot, FILE *out)
266 1.1 christos {
267 1.1 christos fwrite("\x1f\x8b\x08\0\0\0\0\0\0\xff", 1, 10, out);
268 1.1 christos *crc = crc32(0L, Z_NULL, 0);
269 1.1 christos *tot = 0;
270 1.1 christos }
271 1.1 christos
272 1.1 christos /* Copy the compressed data from name, zeroing the last block bit of the last
273 1.1 christos block if clr is true, and adding empty blocks as needed to get to a byte
274 1.1 christos boundary. If clr is false, then the last block becomes the last block of
275 1.1 christos the output, and the gzip trailer is written. crc and tot maintains the
276 1.1 christos crc and length (modulo 2^32) of the output for the trailer. The resulting
277 1.1 christos gzip file is written to out. gzinit() must be called before the first call
278 1.1 christos of gzcopy() to write the gzip header and to initialize crc and tot. */
279 1.1 christos local void gzcopy(char *name, int clr, unsigned long *crc, unsigned long *tot,
280 1.1 christos FILE *out)
281 1.1 christos {
282 1.1 christos int ret; /* return value from zlib functions */
283 1.1 christos int pos; /* where the "last block" bit is in byte */
284 1.1 christos int last; /* true if processing the last block */
285 1.1 christos bin *in; /* buffered input file */
286 1.1 christos unsigned char *start; /* start of compressed data in buffer */
287 1.1 christos unsigned char *junk; /* buffer for uncompressed data -- discarded */
288 1.1 christos z_off_t len; /* length of uncompressed data (support > 4 GB) */
289 1.1 christos z_stream strm; /* zlib inflate stream */
290 1.1 christos
291 1.1 christos /* open gzip file and skip header */
292 1.1 christos in = bopen(name);
293 1.1 christos if (in == NULL)
294 1.1 christos bail("could not open ", name);
295 1.1 christos gzhead(in);
296 1.1 christos
297 1.1 christos /* allocate buffer for uncompressed data and initialize raw inflate
298 1.1 christos stream */
299 1.1 christos junk = malloc(CHUNK);
300 1.1 christos strm.zalloc = Z_NULL;
301 1.1 christos strm.zfree = Z_NULL;
302 1.1 christos strm.opaque = Z_NULL;
303 1.1 christos strm.avail_in = 0;
304 1.1 christos strm.next_in = Z_NULL;
305 1.1 christos ret = inflateInit2(&strm, -15);
306 1.1 christos if (junk == NULL || ret != Z_OK)
307 1.1 christos bail("out of memory", "");
308 1.1 christos
309 1.1 christos /* inflate and copy compressed data, clear last-block bit if requested */
310 1.1 christos len = 0;
311 1.1 christos zpull(&strm, in);
312 1.1.1.2 christos start = in->next;
313 1.1 christos last = start[0] & 1;
314 1.1 christos if (last && clr)
315 1.1 christos start[0] &= ~1;
316 1.1 christos strm.avail_out = 0;
317 1.1 christos for (;;) {
318 1.1 christos /* if input used and output done, write used input and get more */
319 1.1 christos if (strm.avail_in == 0 && strm.avail_out != 0) {
320 1.1 christos fwrite(start, 1, strm.next_in - start, out);
321 1.1 christos start = in->buf;
322 1.1 christos in->left = 0;
323 1.1 christos zpull(&strm, in);
324 1.1 christos }
325 1.1 christos
326 1.1 christos /* decompress -- return early when end-of-block reached */
327 1.1 christos strm.avail_out = CHUNK;
328 1.1 christos strm.next_out = junk;
329 1.1 christos ret = inflate(&strm, Z_BLOCK);
330 1.1 christos switch (ret) {
331 1.1 christos case Z_MEM_ERROR:
332 1.1 christos bail("out of memory", "");
333 1.1 christos case Z_DATA_ERROR:
334 1.1 christos bail("invalid compressed data in ", in->name);
335 1.1 christos }
336 1.1 christos
337 1.1 christos /* update length of uncompressed data */
338 1.1 christos len += CHUNK - strm.avail_out;
339 1.1 christos
340 1.1 christos /* check for block boundary (only get this when block copied out) */
341 1.1 christos if (strm.data_type & 128) {
342 1.1 christos /* if that was the last block, then done */
343 1.1 christos if (last)
344 1.1 christos break;
345 1.1 christos
346 1.1 christos /* number of unused bits in last byte */
347 1.1 christos pos = strm.data_type & 7;
348 1.1 christos
349 1.1 christos /* find the next last-block bit */
350 1.1 christos if (pos != 0) {
351 1.1 christos /* next last-block bit is in last used byte */
352 1.1 christos pos = 0x100 >> pos;
353 1.1 christos last = strm.next_in[-1] & pos;
354 1.1 christos if (last && clr)
355 1.1.1.2 christos in->buf[strm.next_in - in->buf - 1] &= ~pos;
356 1.1 christos }
357 1.1 christos else {
358 1.1 christos /* next last-block bit is in next unused byte */
359 1.1 christos if (strm.avail_in == 0) {
360 1.1 christos /* don't have that byte yet -- get it */
361 1.1 christos fwrite(start, 1, strm.next_in - start, out);
362 1.1 christos start = in->buf;
363 1.1 christos in->left = 0;
364 1.1 christos zpull(&strm, in);
365 1.1 christos }
366 1.1 christos last = strm.next_in[0] & 1;
367 1.1 christos if (last && clr)
368 1.1.1.2 christos in->buf[strm.next_in - in->buf] &= ~1;
369 1.1 christos }
370 1.1 christos }
371 1.1 christos }
372 1.1 christos
373 1.1 christos /* update buffer with unused input */
374 1.1 christos in->left = strm.avail_in;
375 1.1.1.2 christos in->next = in->buf + (strm.next_in - in->buf);
376 1.1 christos
377 1.1 christos /* copy used input, write empty blocks to get to byte boundary */
378 1.1 christos pos = strm.data_type & 7;
379 1.1 christos fwrite(start, 1, in->next - start - 1, out);
380 1.1 christos last = in->next[-1];
381 1.1 christos if (pos == 0 || !clr)
382 1.1 christos /* already at byte boundary, or last file: write last byte */
383 1.1 christos putc(last, out);
384 1.1 christos else {
385 1.1 christos /* append empty blocks to last byte */
386 1.1 christos last &= ((0x100 >> pos) - 1); /* assure unused bits are zero */
387 1.1 christos if (pos & 1) {
388 1.1 christos /* odd -- append an empty stored block */
389 1.1 christos putc(last, out);
390 1.1 christos if (pos == 1)
391 1.1 christos putc(0, out); /* two more bits in block header */
392 1.1 christos fwrite("\0\0\xff\xff", 1, 4, out);
393 1.1 christos }
394 1.1 christos else {
395 1.1 christos /* even -- append 1, 2, or 3 empty fixed blocks */
396 1.1 christos switch (pos) {
397 1.1 christos case 6:
398 1.1 christos putc(last | 8, out);
399 1.1 christos last = 0;
400 1.1 christos case 4:
401 1.1 christos putc(last | 0x20, out);
402 1.1 christos last = 0;
403 1.1 christos case 2:
404 1.1 christos putc(last | 0x80, out);
405 1.1 christos putc(0, out);
406 1.1 christos }
407 1.1 christos }
408 1.1 christos }
409 1.1 christos
410 1.1 christos /* update crc and tot */
411 1.1 christos *crc = crc32_combine(*crc, bget4(in), len);
412 1.1 christos *tot += (unsigned long)len;
413 1.1 christos
414 1.1 christos /* clean up */
415 1.1 christos inflateEnd(&strm);
416 1.1 christos free(junk);
417 1.1 christos bclose(in);
418 1.1 christos
419 1.1 christos /* write trailer if this is the last gzip file */
420 1.1 christos if (!clr) {
421 1.1 christos put4(*crc, out);
422 1.1 christos put4(*tot, out);
423 1.1 christos }
424 1.1 christos }
425 1.1 christos
/* Join the gzip files named on the command line and write the resulting
   single gzip file to stdout.  With no file arguments a usage message is
   printed on stderr and the exit status is 0.  Errors while joining
   terminate the program from within the called functions; a failure to
   write or flush stdout is reported here, so that incomplete output is
   never accompanied by a success status. */
int main(int argc, char **argv)
{
    unsigned long crc, tot;     /* running crc and total uncompressed length */

    /* skip command name */
    argc--;
    argv++;

    /* show usage if no arguments (argc can go negative here if the program
       was started with an empty argument vector) */
    if (argc <= 0) {
        fputs("gzjoin usage: gzjoin f1.gz [f2.gz [f3.gz ...]] > fjoin.gz\n",
              stderr);
        return 0;
    }

    /* join gzip files on command line and write to stdout; argc has already
       been decremented when it is passed, so it is the number of files still
       to come and is zero (clr false) only for the last one */
    gzinit(&crc, &tot, stdout);
    while (argc--)
        gzcopy(*argv++, argc, &crc, &tot, stdout);

    /* stdout is buffered: force the remaining data out and make sure that
       every write succeeded */
    if (fflush(stdout) != 0 || ferror(stdout))
        bail("write error on ", "standard output");

    /* done */
    return 0;
}
450