usr.bin/gzip/unxz.c

1.9  christos /*	$NetBSD: unxz.c,v 1.9 2024/05/04 13:17:03 christos Exp $	*/
1.5  christos
1.5  christos /*-
1.5  christos  * Copyright (c) 2011 The NetBSD Foundation, Inc.
1.5  christos  * All rights reserved.
1.5  christos  *
1.5  christos  * This code is derived from software contributed to The NetBSD Foundation
1.5  christos  * by Christos Zoulas.
1.5  christos  *
1.5  christos  * Redistribution and use in source and binary forms, with or without
1.5  christos  * modification, are permitted provided that the following conditions
1.5  christos  * are met:
1.5  christos  * 1. Redistributions of source code must retain the above copyright
1.5  christos  *    notice, this list of conditions and the following disclaimer.
1.5  christos  * 2. Redistributions in binary form must reproduce the above copyright
1.5  christos  *    notice, this list of conditions and the following disclaimer in the
1.5  christos  *    documentation and/or other materials provided with the distribution.
1.5  christos  *
1.5  christos  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
1.5  christos  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
1.5  christos  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
1.5  christos  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
1.5  christos  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
1.5  christos  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
1.5  christos  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
1.5  christos  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
1.5  christos  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
1.5  christos  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
1.5  christos  * POSSIBILITY OF SUCH DAMAGE.
1.5  christos  */
1.5  christos #include <sys/cdefs.h>
1.9  christos __RCSID("$NetBSD: unxz.c,v 1.9 2024/05/04 13:17:03 christos Exp $");
1.1  christos
1.1  christos #include <stdarg.h>
1.1  christos #include <errno.h>
1.1  christos #include <stdio.h>
1.1  christos #include <unistd.h>
1.1  christos #include <lzma.h>
1.1  christos
1.1  christos static off_t
1.1  christos unxz(int i, int o, char *pre, size_t prelen, off_t *bytes_in)
1.1  christos {
1.1  christos 	lzma_stream strm = LZMA_STREAM_INIT;
1.2  christos 	static const int flags = LZMA_TELL_UNSUPPORTED_CHECK|LZMA_CONCATENATED;
1.1  christos 	lzma_ret ret;
1.3  christos 	lzma_action action = LZMA_RUN;
1.3  christos 	off_t bytes_out, bp;
1.1  christos 	uint8_t ibuf[BUFSIZ];
1.1  christos 	uint8_t obuf[BUFSIZ];
1.1  christos
1.3  christos 	if (bytes_in == NULL)
1.3  christos 		bytes_in = &bp;
1.3  christos
1.1  christos 	strm.next_in = ibuf;
1.2  christos 	memcpy(ibuf, pre, prelen);
1.1  christos 	strm.avail_in = read(i, ibuf + prelen, sizeof(ibuf) - prelen);
1.1  christos 	if (strm.avail_in == (size_t)-1)
1.3  christos 		maybe_err("read failed");
1.7       mrg 	infile_newdata(strm.avail_in);
1.3  christos 	strm.avail_in += prelen;
1.3  christos 	*bytes_in = strm.avail_in;
1.1  christos
1.2  christos 	if ((ret = lzma_stream_decoder(&strm, UINT64_MAX, flags)) != LZMA_OK)
1.2  christos 		maybe_errx("Can't initialize decoder (%d)", ret);
1.2  christos
1.2  christos 	strm.next_out = NULL;
1.2  christos 	strm.avail_out = 0;
1.2  christos 	if ((ret = lzma_code(&strm, LZMA_RUN)) != LZMA_OK)
1.2  christos 		maybe_errx("Can't read headers (%d)", ret);
1.1  christos
1.3  christos 	bytes_out = 0;
1.1  christos 	strm.next_out = obuf;
1.1  christos 	strm.avail_out = sizeof(obuf);
1.1  christos
1.1  christos 	for (;;) {
1.7       mrg 		check_siginfo();
1.1  christos 		if (strm.avail_in == 0) {
1.1  christos 			strm.next_in = ibuf;
1.1  christos 			strm.avail_in = read(i, ibuf, sizeof(ibuf));
1.3  christos 			switch (strm.avail_in) {
1.3  christos 			case (size_t)-1:
1.3  christos 				maybe_err("read failed");
1.3  christos 				/*NOTREACHED*/
1.3  christos 			case 0:
1.3  christos 				action = LZMA_FINISH;
1.3  christos 				break;
1.3  christos 			default:
1.7       mrg 				infile_newdata(strm.avail_in);
1.3  christos 				*bytes_in += strm.avail_in;
1.3  christos 				break;
1.3  christos 			}
1.1  christos 		}
1.1  christos
1.3  christos 		ret = lzma_code(&strm, action);
1.1  christos
1.1  christos 		// Write and check write error before checking decoder error.
1.1  christos 		// This way as much data as possible gets written to output
1.1  christos 		// even if decoder detected an error.
1.1  christos 		if (strm.avail_out == 0 || ret != LZMA_OK) {
1.1  christos 			const size_t write_size = sizeof(obuf) - strm.avail_out;
1.1  christos
1.9  christos 			if (!tflag &&
1.9  christos 			    write(o, obuf, write_size) != (ssize_t)write_size)
1.1  christos 				maybe_err("write failed");
1.1  christos
1.1  christos 			strm.next_out = obuf;
1.1  christos 			strm.avail_out = sizeof(obuf);
1.3  christos 			bytes_out += write_size;
1.1  christos 		}
1.1  christos
1.1  christos 		if (ret != LZMA_OK) {
1.1  christos 			if (ret == LZMA_STREAM_END) {
1.1  christos 				// Check that there's no trailing garbage.
1.1  christos 				if (strm.avail_in != 0 || read(i, ibuf, 1))
1.1  christos 					ret = LZMA_DATA_ERROR;
1.1  christos 				else {
1.1  christos 					lzma_end(&strm);
1.3  christos 					return bytes_out;
1.1  christos 				}
1.1  christos 			}
1.1  christos
1.1  christos 			const char *msg;
1.1  christos 			switch (ret) {
1.1  christos 			case LZMA_MEM_ERROR:
1.1  christos 				msg = strerror(ENOMEM);
1.1  christos 				break;
1.1  christos
1.1  christos 			case LZMA_FORMAT_ERROR:
1.1  christos 				msg = "File format not recognized";
1.1  christos 				break;
1.1  christos
1.1  christos 			case LZMA_OPTIONS_ERROR:
1.1  christos 				// FIXME: Better message?
1.1  christos 				msg = "Unsupported compression options";
1.1  christos 				break;
1.1  christos
1.1  christos 			case LZMA_DATA_ERROR:
1.1  christos 				msg = "File is corrupt";
1.1  christos 				break;
1.1  christos
1.1  christos 			case LZMA_BUF_ERROR:
1.1  christos 				msg = "Unexpected end of input";
1.1  christos 				break;
1.1  christos
1.1  christos 			case LZMA_MEMLIMIT_ERROR:
1.1  christos 				msg = "Reached memory limit";
1.1  christos 				break;
1.1  christos
1.1  christos 			default:
1.4  christos 				maybe_errx("Unknown error (%d)", ret);
1.1  christos 				break;
1.1  christos 			}
1.4  christos 			maybe_errx("%s", msg);
1.1  christos
1.1  christos 		}
1.1  christos 	}
1.1  christos }
1.8    martin
1.8    martin #include <stdbool.h>
1.8    martin
1.8    martin /*
1.8    martin  * Copied various bits and pieces from xz support code or brute force
1.8    martin  * replacements.
1.8    martin  */
1.8    martin
// Yields the smaller of A and B. This is a plain macro, so whichever
// argument is selected gets evaluated twice; pass side-effect free
// expressions only.
#define	my_min(A,B)	((B)>(A)?(A):(B))

// Size of the I/O buffer. When the platform's BUFSIZ is larger than 1024
// it is used after rounding down to a multiple of 8 (sizeof(uint64_t));
// otherwise a fixed 8192 is used instead.
#if BUFSIZ > 1024
#       define IO_BUFFER_SIZE (BUFSIZ & ~7U)
#else
#       define IO_BUFFER_SIZE 8192
#endif

/// I/O buffer that can be viewed as bytes, 32-bit words or 64-bit words.
/// Declaring it as a union makes the compiler align the storage for the
/// widest member, so the word views can be used directly.
typedef union {
        uint8_t u8[IO_BUFFER_SIZE];
        uint32_t u32[IO_BUFFER_SIZE / sizeof(uint32_t)];
        uint64_t u64[IO_BUFFER_SIZE / sizeof(uint64_t)];
} io_buf;
1.8    martin
1.8    martin
1.8    martin static bool
1.8    martin io_pread(int fd, io_buf *buf, size_t size, off_t pos)
1.8    martin {
1.8    martin 	// Using lseek() and read() is more portable than pread() and
1.8    martin 	// for us it is as good as real pread().
1.8    martin 	if (lseek(fd, pos, SEEK_SET) != pos) {
1.8    martin 		return true;
1.8    martin 	}
1.8    martin
1.8    martin 	const size_t amount = read(fd, buf, size);
1.8    martin 	if (amount == SIZE_MAX)
1.8    martin 		return true;
1.8    martin
1.8    martin 	if (amount != size) {
1.8    martin 		return true;
1.8    martin 	}
1.8    martin
1.8    martin 	return false;
1.8    martin }
1.8    martin
1.8    martin /*
1.8    martin  * Most of the following is copied (mostly verbatim) from the xz
1.8    martin  * distribution, from file src/xz/list.c
1.8    martin  */
1.8    martin
1.8    martin ///////////////////////////////////////////////////////////////////////////////
1.8    martin //
1.8    martin /// \file       list.c
1.8    martin /// \brief      Listing information about .xz files
1.8    martin //
1.8    martin //  Author:     Lasse Collin
1.8    martin //
1.8    martin //  This file has been put into the public domain.
1.8    martin //  You can do whatever you want with this file.
1.8    martin //
1.8    martin ///////////////////////////////////////////////////////////////////////////////
1.8    martin
1.8    martin
1.8    martin /// Information about a .xz file
1.8    martin typedef struct {
1.8    martin 	/// Combined Index of all Streams in the file
1.8    martin 	lzma_index *idx;
1.8    martin
1.8    martin 	/// Total amount of Stream Padding
1.8    martin 	uint64_t stream_padding;
1.8    martin
1.8    martin 	/// Highest memory usage so far
1.8    martin 	uint64_t memusage_max;
1.8    martin
1.8    martin 	/// True if all Blocks so far have Compressed Size and
1.8    martin 	/// Uncompressed Size fields
1.8    martin 	bool all_have_sizes;
1.8    martin
1.8    martin 	/// Oldest XZ Utils version that will decompress the file
1.8    martin 	uint32_t min_version;
1.8    martin
1.8    martin } xz_file_info;
1.8    martin
1.8    martin #define XZ_FILE_INFO_INIT { NULL, 0, 0, true, 50000002 }
1.8    martin
1.8    martin
1.8    martin /// \brief      Parse the Index(es) from the given .xz file
1.8    martin ///
1.8    martin /// \param      xfi     Pointer to structure where the decoded information
1.8    martin ///                     is stored.
1.8    martin /// \param      pair    Input file
1.8    martin ///
1.8    martin /// \return     On success, false is returned. On error, true is returned.
1.8    martin ///
1.8    martin // TODO: This function is pretty big. liblzma should have a function that
1.8    martin // takes a callback function to parse the Index(es) from a .xz file to make
1.8    martin // it easy for applications.
1.8    martin static bool
1.8    martin parse_indexes(xz_file_info *xfi, int src_fd)
1.8    martin {
1.8    martin 	struct stat st;
1.8    martin
1.8    martin 	fstat(src_fd, &st);
1.8    martin 	if (st.st_size <= 0) {
1.8    martin 		return true;
1.8    martin 	}
1.8    martin
1.8    martin 	if (st.st_size < 2 * LZMA_STREAM_HEADER_SIZE) {
1.8    martin 		return true;
1.8    martin 	}
1.8    martin
1.8    martin 	io_buf buf;
1.8    martin 	lzma_stream_flags header_flags;
1.8    martin 	lzma_stream_flags footer_flags;
1.8    martin 	lzma_ret ret;
1.8    martin
1.8    martin 	// lzma_stream for the Index decoder
1.8    martin 	lzma_stream strm = LZMA_STREAM_INIT;
1.8    martin
1.8    martin 	// All Indexes decoded so far
1.8    martin 	lzma_index *combined_index = NULL;
1.8    martin
1.8    martin 	// The Index currently being decoded
1.8    martin 	lzma_index *this_index = NULL;
1.8    martin
1.8    martin 	// Current position in the file. We parse the file backwards so
1.8    martin 	// initialize it to point to the end of the file.
1.8    martin 	off_t pos = st.st_size;
1.8    martin
1.8    martin 	// Each loop iteration decodes one Index.
1.8    martin 	do {
1.8    martin 		// Check that there is enough data left to contain at least
1.8    martin 		// the Stream Header and Stream Footer. This check cannot
1.8    martin 		// fail in the first pass of this loop.
1.8    martin 		if (pos < 2 * LZMA_STREAM_HEADER_SIZE) {
1.8    martin 			goto error;
1.8    martin 		}
1.8    martin
1.8    martin 		pos -= LZMA_STREAM_HEADER_SIZE;
1.8    martin 		lzma_vli stream_padding = 0;
1.8    martin
1.8    martin 		// Locate the Stream Footer. There may be Stream Padding which
1.8    martin 		// we must skip when reading backwards.
1.8    martin 		while (true) {
1.8    martin 			if (pos < LZMA_STREAM_HEADER_SIZE) {
1.8    martin 				goto error;
1.8    martin 			}
1.8    martin
1.8    martin 			if (io_pread(src_fd, &buf,
1.8    martin 					LZMA_STREAM_HEADER_SIZE, pos))
1.8    martin 				goto error;
1.8    martin
1.8    martin 			// Stream Padding is always a multiple of four bytes.
1.8    martin 			int i = 2;
1.8    martin 			if (buf.u32[i] != 0)
1.8    martin 				break;
1.8    martin
1.8    martin 			// To avoid calling io_pread() for every four bytes
1.8    martin 			// of Stream Padding, take advantage that we read
1.8    martin 			// 12 bytes (LZMA_STREAM_HEADER_SIZE) already and
1.8    martin 			// check them too before calling io_pread() again.
1.8    martin 			do {
1.8    martin 				stream_padding += 4;
1.8    martin 				pos -= 4;
1.8    martin 				--i;
1.8    martin 			} while (i >= 0 && buf.u32[i] == 0);
1.8    martin 		}
1.8    martin
1.8    martin 		// Decode the Stream Footer.
1.8    martin 		ret = lzma_stream_footer_decode(&footer_flags, buf.u8);
1.8    martin 		if (ret != LZMA_OK) {
1.8    martin 			goto error;
1.8    martin 		}
1.8    martin
1.8    martin 		// Check that the Stream Footer doesn't specify something
1.8    martin 		// that we don't support. This can only happen if the xz
1.8    martin 		// version is older than liblzma and liblzma supports
1.8    martin 		// something new.
1.8    martin 		//
1.8    martin 		// It is enough to check Stream Footer. Stream Header must
1.8    martin 		// match when it is compared against Stream Footer with
1.8    martin 		// lzma_stream_flags_compare().
1.8    martin 		if (footer_flags.version != 0) {
1.8    martin 			goto error;
1.8    martin 		}
1.8    martin
1.8    martin 		// Check that the size of the Index field looks sane.
1.8    martin 		lzma_vli index_size = footer_flags.backward_size;
1.8    martin 		if ((lzma_vli)(pos) < index_size + LZMA_STREAM_HEADER_SIZE) {
1.8    martin 			goto error;
1.8    martin 		}
1.8    martin
1.8    martin 		// Set pos to the beginning of the Index.
1.8    martin 		pos -= index_size;
1.8    martin
1.8    martin 		// Decode the Index.
1.8    martin 		ret = lzma_index_decoder(&strm, &this_index, UINT64_MAX);
1.8    martin 		if (ret != LZMA_OK) {
1.8    martin 			goto error;
1.8    martin 		}
1.8    martin
1.8    martin 		do {
1.8    martin 			// Don't give the decoder more input than the
1.8    martin 			// Index size.
1.8    martin 			strm.avail_in = my_min(IO_BUFFER_SIZE, index_size);
1.8    martin 			if (io_pread(src_fd, &buf, strm.avail_in, pos))
1.8    martin 				goto error;
1.8    martin
1.8    martin 			pos += strm.avail_in;
1.8    martin 			index_size -= strm.avail_in;
1.8    martin
1.8    martin 			strm.next_in = buf.u8;
1.8    martin 			ret = lzma_code(&strm, LZMA_RUN);
1.8    martin
1.8    martin 		} while (ret == LZMA_OK);
1.8    martin
1.8    martin 		// If the decoding seems to be successful, check also that
1.8    martin 		// the Index decoder consumed as much input as indicated
1.8    martin 		// by the Backward Size field.
1.8    martin 		if (ret == LZMA_STREAM_END)
1.8    martin 			if (index_size != 0 || strm.avail_in != 0)
1.8    martin 				ret = LZMA_DATA_ERROR;
1.8    martin
1.8    martin 		if (ret != LZMA_STREAM_END) {
1.8    martin 			// LZMA_BUFFER_ERROR means that the Index decoder
1.8    martin 			// would have liked more input than what the Index
1.8    martin 			// size should be according to Stream Footer.
1.8    martin 			// The message for LZMA_DATA_ERROR makes more
1.8    martin 			// sense in that case.
1.8    martin 			if (ret == LZMA_BUF_ERROR)
1.8    martin 				ret = LZMA_DATA_ERROR;
1.8    martin
1.8    martin 			goto error;
1.8    martin 		}
1.8    martin
1.8    martin 		// Decode the Stream Header and check that its Stream Flags
1.8    martin 		// match the Stream Footer.
1.8    martin 		pos -= footer_flags.backward_size + LZMA_STREAM_HEADER_SIZE;
1.8    martin 		if ((lzma_vli)(pos) < lzma_index_total_size(this_index)) {
1.8    martin 			goto error;
1.8    martin 		}
1.8    martin
1.8    martin 		pos -= lzma_index_total_size(this_index);
1.8    martin 		if (io_pread(src_fd, &buf, LZMA_STREAM_HEADER_SIZE, pos))
1.8    martin 			goto error;
1.8    martin
1.8    martin 		ret = lzma_stream_header_decode(&header_flags, buf.u8);
1.8    martin 		if (ret != LZMA_OK) {
1.8    martin 			goto error;
1.8    martin 		}
1.8    martin
1.8    martin 		ret = lzma_stream_flags_compare(&header_flags, &footer_flags);
1.8    martin 		if (ret != LZMA_OK) {
1.8    martin 			goto error;
1.8    martin 		}
1.8    martin
1.8    martin 		// Store the decoded Stream Flags into this_index. This is
1.8    martin 		// needed so that we can print which Check is used in each
1.8    martin 		// Stream.
1.8    martin 		ret = lzma_index_stream_flags(this_index, &footer_flags);
1.8    martin 		if (ret != LZMA_OK)
1.8    martin 			goto error;
1.8    martin
1.8    martin 		// Store also the size of the Stream Padding field. It is
1.8    martin 		// needed to show the offsets of the Streams correctly.
1.8    martin 		ret = lzma_index_stream_padding(this_index, stream_padding);
1.8    martin 		if (ret != LZMA_OK)
1.8    martin 			goto error;
1.8    martin
1.8    martin 		if (combined_index != NULL) {
1.8    martin 			// Append the earlier decoded Indexes
1.8    martin 			// after this_index.
1.8    martin 			ret = lzma_index_cat(
1.8    martin 					this_index, combined_index, NULL);
1.8    martin 			if (ret != LZMA_OK) {
1.8    martin 				goto error;
1.8    martin 			}
1.8    martin 		}
1.8    martin
1.8    martin 		combined_index = this_index;
1.8    martin 		this_index = NULL;
1.8    martin
1.8    martin 		xfi->stream_padding += stream_padding;
1.8    martin
1.8    martin 	} while (pos > 0);
1.8    martin
1.8    martin 	lzma_end(&strm);
1.8    martin
1.8    martin 	// All OK. Make combined_index available to the caller.
1.8    martin 	xfi->idx = combined_index;
1.8    martin 	return false;
1.8    martin
1.8    martin error:
1.8    martin 	// Something went wrong, free the allocated memory.
1.8    martin 	lzma_end(&strm);
1.8    martin 	lzma_index_end(combined_index, NULL);
1.8    martin 	lzma_index_end(this_index, NULL);
1.8    martin 	return true;
1.8    martin }
1.8    martin
1.8    martin /***************** end of copy from list.c *************************/
1.8    martin
1.8    martin /*
1.8    martin  * Small wrapper to extract total length of a file
1.8    martin  */
1.8    martin off_t
1.8    martin unxz_len(int fd)
1.8    martin {
1.8    martin 	xz_file_info xfi = XZ_FILE_INFO_INIT;
1.8    martin 	if (!parse_indexes(&xfi, fd)) {
1.8    martin 		off_t res = lzma_index_uncompressed_size(xfi.idx);
1.8    martin 		lzma_index_end(xfi.idx, NULL);
1.8    martin 		return res;
1.8    martin 	}
1.8    martin 	return 0;
1.8    martin }
1.8    martin