Home | History | Annotate | Line # | Download | only in gzip
unxz.c revision 1.7.2.1
      1 /*	$NetBSD: unxz.c,v 1.7.2.1 2018/10/20 06:58:47 pgoyette Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Christos Zoulas.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 #include <sys/cdefs.h>
     32 __RCSID("$NetBSD: unxz.c,v 1.7.2.1 2018/10/20 06:58:47 pgoyette Exp $");
     33 
     34 #include <stdarg.h>
     35 #include <errno.h>
     36 #include <stdio.h>
     37 #include <unistd.h>
     38 #include <lzma.h>
     39 
     40 static off_t
     41 unxz(int i, int o, char *pre, size_t prelen, off_t *bytes_in)
     42 {
     43 	lzma_stream strm = LZMA_STREAM_INIT;
     44 	static const int flags = LZMA_TELL_UNSUPPORTED_CHECK|LZMA_CONCATENATED;
     45 	lzma_ret ret;
     46 	lzma_action action = LZMA_RUN;
     47 	off_t bytes_out, bp;
     48 	uint8_t ibuf[BUFSIZ];
     49 	uint8_t obuf[BUFSIZ];
     50 
     51 	if (bytes_in == NULL)
     52 		bytes_in = &bp;
     53 
     54 	strm.next_in = ibuf;
     55 	memcpy(ibuf, pre, prelen);
     56 	strm.avail_in = read(i, ibuf + prelen, sizeof(ibuf) - prelen);
     57 	if (strm.avail_in == (size_t)-1)
     58 		maybe_err("read failed");
     59 	infile_newdata(strm.avail_in);
     60 	strm.avail_in += prelen;
     61 	*bytes_in = strm.avail_in;
     62 
     63 	if ((ret = lzma_stream_decoder(&strm, UINT64_MAX, flags)) != LZMA_OK)
     64 		maybe_errx("Can't initialize decoder (%d)", ret);
     65 
     66 	strm.next_out = NULL;
     67 	strm.avail_out = 0;
     68 	if ((ret = lzma_code(&strm, LZMA_RUN)) != LZMA_OK)
     69 		maybe_errx("Can't read headers (%d)", ret);
     70 
     71 	bytes_out = 0;
     72 	strm.next_out = obuf;
     73 	strm.avail_out = sizeof(obuf);
     74 
     75 	for (;;) {
     76 		check_siginfo();
     77 		if (strm.avail_in == 0) {
     78 			strm.next_in = ibuf;
     79 			strm.avail_in = read(i, ibuf, sizeof(ibuf));
     80 			switch (strm.avail_in) {
     81 			case (size_t)-1:
     82 				maybe_err("read failed");
     83 				/*NOTREACHED*/
     84 			case 0:
     85 				action = LZMA_FINISH;
     86 				break;
     87 			default:
     88 				infile_newdata(strm.avail_in);
     89 				*bytes_in += strm.avail_in;
     90 				break;
     91 			}
     92 		}
     93 
     94 		ret = lzma_code(&strm, action);
     95 
     96 		// Write and check write error before checking decoder error.
     97 		// This way as much data as possible gets written to output
     98 		// even if decoder detected an error.
     99 		if (strm.avail_out == 0 || ret != LZMA_OK) {
    100 			const size_t write_size = sizeof(obuf) - strm.avail_out;
    101 
    102 			if (write(o, obuf, write_size) != (ssize_t)write_size)
    103 				maybe_err("write failed");
    104 
    105 			strm.next_out = obuf;
    106 			strm.avail_out = sizeof(obuf);
    107 			bytes_out += write_size;
    108 		}
    109 
    110 		if (ret != LZMA_OK) {
    111 			if (ret == LZMA_STREAM_END) {
    112 				// Check that there's no trailing garbage.
    113 				if (strm.avail_in != 0 || read(i, ibuf, 1))
    114 					ret = LZMA_DATA_ERROR;
    115 				else {
    116 					lzma_end(&strm);
    117 					return bytes_out;
    118 				}
    119 			}
    120 
    121 			const char *msg;
    122 			switch (ret) {
    123 			case LZMA_MEM_ERROR:
    124 				msg = strerror(ENOMEM);
    125 				break;
    126 
    127 			case LZMA_FORMAT_ERROR:
    128 				msg = "File format not recognized";
    129 				break;
    130 
    131 			case LZMA_OPTIONS_ERROR:
    132 				// FIXME: Better message?
    133 				msg = "Unsupported compression options";
    134 				break;
    135 
    136 			case LZMA_DATA_ERROR:
    137 				msg = "File is corrupt";
    138 				break;
    139 
    140 			case LZMA_BUF_ERROR:
    141 				msg = "Unexpected end of input";
    142 				break;
    143 
    144 			case LZMA_MEMLIMIT_ERROR:
    145 				msg = "Reached memory limit";
    146 				break;
    147 
    148 			default:
    149 				maybe_errx("Unknown error (%d)", ret);
    150 				break;
    151 			}
    152 			maybe_errx("%s", msg);
    153 
    154 		}
    155 	}
    156 }
    157 
    158 #include <stdbool.h>
    159 
    160 /*
    161  * Copied various bits and pieces from xz support code or brute force
    162  * replacements.
    163  */
    164 
/* Smaller of A and B.  An argument may be evaluated more than once. */
#define	my_min(A, B)	(((B) > (A)) ? (A) : (B))

// Size of the I/O buffer. It must be a multiple of 8
// (sizeof(uint64_t)), so a usable BUFSIZ is rounded down to one;
// a BUFSIZ of 1024 or less is considered too small and replaced
// by a fixed, larger value.
#if BUFSIZ > 1024
#       define IO_BUFFER_SIZE (BUFSIZ & ~7U)
#else
#       define IO_BUFFER_SIZE 8192
#endif

/// Buffer used for file reads. The same storage can be viewed as bytes
/// or as 32/64-bit words; declaring it as a union guarantees that it is
/// suitably aligned for the wider views.
typedef union {
        uint8_t u8[IO_BUFFER_SIZE];
        uint32_t u32[IO_BUFFER_SIZE / sizeof(uint32_t)];
        uint64_t u64[IO_BUFFER_SIZE / sizeof(uint64_t)];
} io_buf;
    182 
    183 
    184 static bool
    185 io_pread(int fd, io_buf *buf, size_t size, off_t pos)
    186 {
    187 	// Using lseek() and read() is more portable than pread() and
    188 	// for us it is as good as real pread().
    189 	if (lseek(fd, pos, SEEK_SET) != pos) {
    190 		return true;
    191 	}
    192 
    193 	const size_t amount = read(fd, buf, size);
    194 	if (amount == SIZE_MAX)
    195 		return true;
    196 
    197 	if (amount != size) {
    198 		return true;
    199 	}
    200 
    201 	return false;
    202 }
    203 
    204 /*
    205  * Most of the following is copied (mostly verbatim) from the xz
    206  * distribution, from file src/xz/list.c
    207  */
    208 
    209 ///////////////////////////////////////////////////////////////////////////////
    210 //
    211 /// \file       list.c
    212 /// \brief      Listing information about .xz files
    213 //
    214 //  Author:     Lasse Collin
    215 //
    216 //  This file has been put into the public domain.
    217 //  You can do whatever you want with this file.
    218 //
    219 ///////////////////////////////////////////////////////////////////////////////
    220 
    221 
    222 /// Information about a .xz file
    223 typedef struct {
    224 	/// Combined Index of all Streams in the file
    225 	lzma_index *idx;
    226 
    227 	/// Total amount of Stream Padding
    228 	uint64_t stream_padding;
    229 
    230 	/// Highest memory usage so far
    231 	uint64_t memusage_max;
    232 
    233 	/// True if all Blocks so far have Compressed Size and
    234 	/// Uncompressed Size fields
    235 	bool all_have_sizes;
    236 
    237 	/// Oldest XZ Utils version that will decompress the file
    238 	uint32_t min_version;
    239 
    240 } xz_file_info;
    241 
    242 #define XZ_FILE_INFO_INIT { NULL, 0, 0, true, 50000002 }
    243 
    244 
    245 /// \brief      Parse the Index(es) from the given .xz file
    246 ///
    247 /// \param      xfi     Pointer to structure where the decoded information
    248 ///                     is stored.
    249 /// \param      pair    Input file
    250 ///
    251 /// \return     On success, false is returned. On error, true is returned.
    252 ///
    253 // TODO: This function is pretty big. liblzma should have a function that
    254 // takes a callback function to parse the Index(es) from a .xz file to make
    255 // it easy for applications.
    256 static bool
    257 parse_indexes(xz_file_info *xfi, int src_fd)
    258 {
    259 	struct stat st;
    260 
    261 	fstat(src_fd, &st);
    262 	if (st.st_size <= 0) {
    263 		return true;
    264 	}
    265 
    266 	if (st.st_size < 2 * LZMA_STREAM_HEADER_SIZE) {
    267 		return true;
    268 	}
    269 
    270 	io_buf buf;
    271 	lzma_stream_flags header_flags;
    272 	lzma_stream_flags footer_flags;
    273 	lzma_ret ret;
    274 
    275 	// lzma_stream for the Index decoder
    276 	lzma_stream strm = LZMA_STREAM_INIT;
    277 
    278 	// All Indexes decoded so far
    279 	lzma_index *combined_index = NULL;
    280 
    281 	// The Index currently being decoded
    282 	lzma_index *this_index = NULL;
    283 
    284 	// Current position in the file. We parse the file backwards so
    285 	// initialize it to point to the end of the file.
    286 	off_t pos = st.st_size;
    287 
    288 	// Each loop iteration decodes one Index.
    289 	do {
    290 		// Check that there is enough data left to contain at least
    291 		// the Stream Header and Stream Footer. This check cannot
    292 		// fail in the first pass of this loop.
    293 		if (pos < 2 * LZMA_STREAM_HEADER_SIZE) {
    294 			goto error;
    295 		}
    296 
    297 		pos -= LZMA_STREAM_HEADER_SIZE;
    298 		lzma_vli stream_padding = 0;
    299 
    300 		// Locate the Stream Footer. There may be Stream Padding which
    301 		// we must skip when reading backwards.
    302 		while (true) {
    303 			if (pos < LZMA_STREAM_HEADER_SIZE) {
    304 				goto error;
    305 			}
    306 
    307 			if (io_pread(src_fd, &buf,
    308 					LZMA_STREAM_HEADER_SIZE, pos))
    309 				goto error;
    310 
    311 			// Stream Padding is always a multiple of four bytes.
    312 			int i = 2;
    313 			if (buf.u32[i] != 0)
    314 				break;
    315 
    316 			// To avoid calling io_pread() for every four bytes
    317 			// of Stream Padding, take advantage that we read
    318 			// 12 bytes (LZMA_STREAM_HEADER_SIZE) already and
    319 			// check them too before calling io_pread() again.
    320 			do {
    321 				stream_padding += 4;
    322 				pos -= 4;
    323 				--i;
    324 			} while (i >= 0 && buf.u32[i] == 0);
    325 		}
    326 
    327 		// Decode the Stream Footer.
    328 		ret = lzma_stream_footer_decode(&footer_flags, buf.u8);
    329 		if (ret != LZMA_OK) {
    330 			goto error;
    331 		}
    332 
    333 		// Check that the Stream Footer doesn't specify something
    334 		// that we don't support. This can only happen if the xz
    335 		// version is older than liblzma and liblzma supports
    336 		// something new.
    337 		//
    338 		// It is enough to check Stream Footer. Stream Header must
    339 		// match when it is compared against Stream Footer with
    340 		// lzma_stream_flags_compare().
    341 		if (footer_flags.version != 0) {
    342 			goto error;
    343 		}
    344 
    345 		// Check that the size of the Index field looks sane.
    346 		lzma_vli index_size = footer_flags.backward_size;
    347 		if ((lzma_vli)(pos) < index_size + LZMA_STREAM_HEADER_SIZE) {
    348 			goto error;
    349 		}
    350 
    351 		// Set pos to the beginning of the Index.
    352 		pos -= index_size;
    353 
    354 		// Decode the Index.
    355 		ret = lzma_index_decoder(&strm, &this_index, UINT64_MAX);
    356 		if (ret != LZMA_OK) {
    357 			goto error;
    358 		}
    359 
    360 		do {
    361 			// Don't give the decoder more input than the
    362 			// Index size.
    363 			strm.avail_in = my_min(IO_BUFFER_SIZE, index_size);
    364 			if (io_pread(src_fd, &buf, strm.avail_in, pos))
    365 				goto error;
    366 
    367 			pos += strm.avail_in;
    368 			index_size -= strm.avail_in;
    369 
    370 			strm.next_in = buf.u8;
    371 			ret = lzma_code(&strm, LZMA_RUN);
    372 
    373 		} while (ret == LZMA_OK);
    374 
    375 		// If the decoding seems to be successful, check also that
    376 		// the Index decoder consumed as much input as indicated
    377 		// by the Backward Size field.
    378 		if (ret == LZMA_STREAM_END)
    379 			if (index_size != 0 || strm.avail_in != 0)
    380 				ret = LZMA_DATA_ERROR;
    381 
    382 		if (ret != LZMA_STREAM_END) {
    383 			// LZMA_BUFFER_ERROR means that the Index decoder
    384 			// would have liked more input than what the Index
    385 			// size should be according to Stream Footer.
    386 			// The message for LZMA_DATA_ERROR makes more
    387 			// sense in that case.
    388 			if (ret == LZMA_BUF_ERROR)
    389 				ret = LZMA_DATA_ERROR;
    390 
    391 			goto error;
    392 		}
    393 
    394 		// Decode the Stream Header and check that its Stream Flags
    395 		// match the Stream Footer.
    396 		pos -= footer_flags.backward_size + LZMA_STREAM_HEADER_SIZE;
    397 		if ((lzma_vli)(pos) < lzma_index_total_size(this_index)) {
    398 			goto error;
    399 		}
    400 
    401 		pos -= lzma_index_total_size(this_index);
    402 		if (io_pread(src_fd, &buf, LZMA_STREAM_HEADER_SIZE, pos))
    403 			goto error;
    404 
    405 		ret = lzma_stream_header_decode(&header_flags, buf.u8);
    406 		if (ret != LZMA_OK) {
    407 			goto error;
    408 		}
    409 
    410 		ret = lzma_stream_flags_compare(&header_flags, &footer_flags);
    411 		if (ret != LZMA_OK) {
    412 			goto error;
    413 		}
    414 
    415 		// Store the decoded Stream Flags into this_index. This is
    416 		// needed so that we can print which Check is used in each
    417 		// Stream.
    418 		ret = lzma_index_stream_flags(this_index, &footer_flags);
    419 		if (ret != LZMA_OK)
    420 			goto error;
    421 
    422 		// Store also the size of the Stream Padding field. It is
    423 		// needed to show the offsets of the Streams correctly.
    424 		ret = lzma_index_stream_padding(this_index, stream_padding);
    425 		if (ret != LZMA_OK)
    426 			goto error;
    427 
    428 		if (combined_index != NULL) {
    429 			// Append the earlier decoded Indexes
    430 			// after this_index.
    431 			ret = lzma_index_cat(
    432 					this_index, combined_index, NULL);
    433 			if (ret != LZMA_OK) {
    434 				goto error;
    435 			}
    436 		}
    437 
    438 		combined_index = this_index;
    439 		this_index = NULL;
    440 
    441 		xfi->stream_padding += stream_padding;
    442 
    443 	} while (pos > 0);
    444 
    445 	lzma_end(&strm);
    446 
    447 	// All OK. Make combined_index available to the caller.
    448 	xfi->idx = combined_index;
    449 	return false;
    450 
    451 error:
    452 	// Something went wrong, free the allocated memory.
    453 	lzma_end(&strm);
    454 	lzma_index_end(combined_index, NULL);
    455 	lzma_index_end(this_index, NULL);
    456 	return true;
    457 }
    458 
    459 /***************** end of copy from list.c *************************/
    460 
    461 /*
    462  * Small wrapper to extract total length of a file
    463  */
    464 off_t
    465 unxz_len(int fd)
    466 {
    467 	xz_file_info xfi = XZ_FILE_INFO_INIT;
    468 	if (!parse_indexes(&xfi, fd)) {
    469 		off_t res = lzma_index_uncompressed_size(xfi.idx);
    470 		lzma_index_end(xfi.idx, NULL);
    471 		return res;
    472 	}
    473 	return 0;
    474 }
    475 
    476