/* Home | History | Annotate | Line # | Download | only in vndcompress */
      1 /*	$NetBSD: vndcompress.c,v 1.29 2017/07/29 21:04:07 riastradh Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2013 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Taylor R. Campbell.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 #include <sys/cdefs.h>
     33 __RCSID("$NetBSD: vndcompress.c,v 1.29 2017/07/29 21:04:07 riastradh Exp $");
     34 
     35 #include <sys/endian.h>
     36 #include <sys/stat.h>
     37 
     38 #include <assert.h>
     39 #include <err.h>
     40 #include <errno.h>
     41 #include <fcntl.h>
     42 #include <inttypes.h>
     43 #include <limits.h>
     44 #include <signal.h>
     45 #include <stdbool.h>
     46 #include <stdint.h>
     47 #include <stdio.h>
     48 #include <stdlib.h>
     49 #include <string.h>
     50 #include <unistd.h>
     51 #include <zlib.h>
     52 
     53 #include "common.h"
     54 #include "offtab.h"
     55 #include "utils.h"
     56 
     57 /*
     58  * XXX Switch to control bug-for-bug byte-for-byte compatibility with
     59  * NetBSD's vndcompress.
     60  */
     61 #define	VNDCOMPRESS_COMPAT	0
     62 
     63 __CTASSERT(sizeof(struct cloop2_header) == CLOOP2_OFFSET_TABLE_OFFSET);
     64 
/*
 * All state for one compression run.  The fields above `initialized'
 * are set up by compress_init()/compress_restart(); `initialized'
 * itself is the flag the signal handlers test before touching any of
 * the rest, which is why it is a volatile sig_atomic_t.
 */
struct compress_state {
	uint64_t	size;		/* uncompressed size */
	uint64_t	offset;		/* output byte offset */
	uint32_t	blocksize;	/* bytes per block */
	uint32_t	blkno;		/* input block number */
	uint32_t	n_full_blocks;	/* floor(size/blocksize) */
	uint32_t	n_blocks;	/* ceiling(size/blocksize) */
	uint32_t	n_offsets;	/* n_blocks + 1 */
	uint32_t	end_block;	/* last block to transfer */
	uint32_t	checkpoint_blocks;	/* blocks before checkpoint */
	int		image_fd;	/* input image file descriptor */
	int		cloop2_fd;	/* output cloop2 file descriptor */
	struct offtab	offtab;		/* table of block start offsets */
	uint32_t	n_checkpointed_blocks;	/* blocks committed at the
						 * last checkpoint */
	volatile sig_atomic_t
			initialized;	/* everything above initialized?  */
};
     82 
/* Global compression state for SIGINFO handler.  */
static struct compress_state	global_state;

/* Pairing of a signal number with its printable name, for diagnostics.  */
struct sigdesc {
	int sd_signo;
	const char *sd_name;
};

/* Signals that trigger a progress report (info_signal_handler).  */
static const struct sigdesc info_signals[] = {
	{ SIGINFO, "SIGINFO" },
	{ SIGUSR1, "SIGUSR1" },
};

/* Signals that force a checkpoint (checkpoint_signal_handler).  */
static const struct sigdesc checkpoint_signals[] = {
	{ SIGUSR2, "SIGUSR2" },
};
     99 
/* Forward declarations for the compression machinery below.  */
static void	init_signals(void);
static void	init_signal_handler(int, const struct sigdesc *, size_t,
		    void (*)(int));
static void	info_signal_handler(int);
static void	checkpoint_signal_handler(int);
static void	compress_progress(struct compress_state *);
static void	compress_init(int, char **, const struct options *,
		    struct compress_state *);
static bool	compress_restart(struct compress_state *);
static uint32_t	compress_block(int, int, uint32_t, uint32_t, uint32_t, void *,
		    void *);
static void	compress_maybe_checkpoint(struct compress_state *);
static void	compress_checkpoint(struct compress_state *);
static void	compress_exit(struct compress_state *);
    114 
/*
 * Compression entry point.
 *
 * Drives the whole transfer: initialize state from the arguments,
 * allocate the two transfer buffers, compress the image block by
 * block while maintaining the offset table, pad the output to a
 * DEV_BSIZE boundary, and finish with one last checkpoint.  Always
 * returns 0; fatal errors exit via err/errx.
 */
int
vndcompress(int argc, char **argv, const struct options *O)
{
	struct compress_state *const S = &global_state;

	/* Paranoia.  The other fields either have no sentinel or use zero.  */
	S->image_fd = -1;
	S->cloop2_fd = -1;

	/* Set up signal handlers so we can handle SIGINFO ASAP.  */
	init_signals();

	/*
	 * Parse the arguments to initialize our state.
	 */
	compress_init(argc, argv, O, S);
	assert(MIN_BLOCKSIZE <= S->blocksize);
	assert(S->blocksize <= MAX_BLOCKSIZE);

	/*
	 * Allocate compression buffers.
	 *
	 * Compression may actually expand.  From an overabundance of
	 * caution, assume it can expand by at most double.
	 *
	 * XXX Check and consider tightening this assumption.
	 */
	__CTASSERT(MAX_BLOCKSIZE <= SIZE_MAX);
	void *const uncompbuf = malloc(S->blocksize);
	if (uncompbuf == NULL)
		err(1, "malloc uncompressed buffer");

	/* XXX compression ratio bound */
	__CTASSERT(MUL_OK(size_t, 2, MAX_BLOCKSIZE));
	void *const compbuf = malloc(2 * (size_t)S->blocksize);
	if (compbuf == NULL)
		err(1, "malloc compressed buffer");

	/*
	 * Compress the blocks.  S->blkno specifies the input block
	 * we're about to transfer.  S->offset is the current output
	 * offset.
	 */
	while (S->blkno < S->n_blocks) {
		/* Report any progress.  */
		compress_progress(S);

		/* Stop if we've done the requested partial transfer.  */
		if ((0 < S->end_block) && (S->end_block <= S->blkno))
			goto out;

		/* Checkpoint if appropriate.  */
		compress_maybe_checkpoint(S);
		offtab_prepare_put(&S->offtab, (S->blkno + 1));

		/* Choose read size: partial if last block, full if not.  */
		const uint32_t readsize = (S->blkno == S->n_full_blocks?
		    (S->size % S->blocksize) : S->blocksize);
		assert(readsize > 0);
		assert(readsize <= S->blocksize);

		/* Fail noisily if we might be about to overflow.  */
		/* XXX compression ratio bound */
		__CTASSERT(MUL_OK(uint64_t, 2, MAX_BLOCKSIZE));
		__CTASSERT(MUL_OK(off_t, 2, MAX_BLOCKSIZE));
		assert(S->offset <= MIN(UINT64_MAX, OFF_MAX));
		if (!ADD_OK(uint64_t, S->offset, 2*(uintmax_t)readsize) ||
		    !ADD_OK(off_t, S->offset, 2*(uintmax_t)readsize))
			errx(1, "blkno %"PRIu32" may overflow: %ju + 2*%ju",
			    S->blkno, (uintmax_t)S->offset,
			    (uintmax_t)readsize);

		/* Process the block.  */
		const uint32_t complen =
		    compress_block(S->image_fd, S->cloop2_fd, S->blkno,
			S->blocksize, readsize, uncompbuf, compbuf);

		/*
		 * Signal-atomically update the state to reflect
		 * (a) what block number we are now at,
		 * (b) how far we are now in the output file, and
		 * (c) where the last block ended.
		 *
		 * Signals stay blocked across all three stores so a
		 * checkpoint handler never sees a half-updated state.
		 */
		assert(ADD_OK(uint32_t, S->blkno, 1));
		assert(ADD_OK(uint64_t, S->offset, complen));
		assert(ADD_OK(off_t, (off_t)S->offset, (off_t)complen));
		assert((S->blkno + 1) < S->n_offsets);
	    {
		sigset_t old_sigmask;
		block_signals(&old_sigmask);
		S->blkno += 1;					/* (a) */
		S->offset += complen;				/* (b) */
		offtab_put(&S->offtab, S->blkno, S->offset);	/* (c) */
		restore_sigmask(&old_sigmask);
	    }
	}

	/* Make sure we're all done. */
	assert(S->blkno == S->n_blocks);
	assert((S->blkno + 1) == S->n_offsets);

	/* Pad to the disk block size.  */
	const uint32_t n_extra = (S->offset % DEV_BSIZE);
	if (n_extra != 0) {
		const uint32_t n_padding = (DEV_BSIZE - n_extra);
		/* Reuse compbuf -- guaranteed to be large enough.  */
		(void)memset(compbuf, 0, n_padding);
		const ssize_t n_written = write(S->cloop2_fd, compbuf,
		    n_padding);
		if (n_written == -1)
			err(1, "write final padding failed");
		assert(n_written >= 0);
		if ((size_t)n_written != n_padding)
			errx(1, "partial write of final padding bytes"
			    ": %zu != %"PRIu32,
			    (size_t)n_written, n_padding);

		/* Account for the extra bytes in the output file.  */
		assert(ADD_OK(uint64_t, S->offset, n_padding));
		assert(ADD_OK(off_t, (off_t)S->offset, (off_t)n_padding));
	    {
		sigset_t old_sigmask;
		block_signals(&old_sigmask);
		S->offset += n_padding;
		restore_sigmask(&old_sigmask);
	    }
	}

out:
	/* One last checkpoint to commit the offset table.  */
	assert(S->offset <= OFF_MAX);
	assert((off_t)S->offset == lseek(S->cloop2_fd, 0, SEEK_CUR));
	compress_checkpoint(S);

	/*
	 * Free the compression buffers and finalize the compression.
	 */
	free(compbuf);
	free(uncompbuf);
	compress_exit(S);

	return 0;
}
    261 
    262 /*
    263  * Signal cruft.
    264  */
    265 
    266 static void
    267 init_signals(void)
    268 {
    269 
    270 	init_signal_handler(SA_RESTART, info_signals,
    271 	    __arraycount(info_signals), &info_signal_handler);
    272 	init_signal_handler(SA_RESTART, checkpoint_signals,
    273 	    __arraycount(checkpoint_signals), &checkpoint_signal_handler);
    274 }
    275 
    276 static void
    277 init_signal_handler(int flags, const struct sigdesc *signals, size_t n,
    278     void (*handler)(int))
    279 {
    280 	static const struct sigaction zero_sa;
    281 	struct sigaction sa = zero_sa;
    282 	size_t i;
    283 
    284 	(void)sigemptyset(&sa.sa_mask);
    285 	for (i = 0; i < n; i++)
    286 		(void)sigaddset(&sa.sa_mask, signals[i].sd_signo);
    287 	sa.sa_flags = flags;
    288 	sa.sa_handler = handler;
    289 	for (i = 0; i < n; i++)
    290 		if (sigaction(signals[i].sd_signo, &sa, NULL) == -1)
    291 			err(1, "sigaction(%s)", signals[i].sd_name);
    292 }
    293 
    294 static void
    295 info_signal_handler(int signo __unused)
    296 {
    297 	/* Save errno.  */
    298 	const int error = errno;
    299 	struct compress_state *const S = &global_state;
    300 	char buf[128];
    301 
    302 	/* Bail if the state is not yet initialized.  */
    303 	if (!S->initialized) {
    304 		warnx_ss("initializing");
    305 		goto out;
    306 	}
    307 
    308 	/* Carefully calculate our I/O position.  */
    309 	assert(S->blocksize > 0);
    310 	__CTASSERT(MUL_OK(uint64_t, MAX_N_BLOCKS, MAX_BLOCKSIZE));
    311 	const uint64_t nread = ((uint64_t)S->blkno * (uint64_t)S->blocksize);
    312 
    313 	assert(S->n_blocks > 0);
    314 	__CTASSERT(MUL_OK(uint64_t, MAX_N_BLOCKS, sizeof(uint64_t)));
    315 	__CTASSERT(ADD_OK(uint64_t, CLOOP2_OFFSET_TABLE_OFFSET,
    316 		MAX_N_BLOCKS*sizeof(uint64_t)));
    317 	const uint64_t nwritten = (S->offset <= (CLOOP2_OFFSET_TABLE_OFFSET +
    318 		((uint64_t)S->n_blocks * sizeof(uint64_t)))?
    319 	    0 : S->offset);
    320 
    321 	/* snprintf_ss can't do floating-point, so do fixed-point instead.  */
    322 	const uint64_t ratio_percent =
    323 	    (nread > 0?
    324 		((nwritten >= (UINT64_MAX / 100)) ?
    325 		    ((nwritten / nread) * 100) : ((nwritten * 100) / nread))
    326 		: 0);
    327 
    328 	/* Format the status.  */
    329 	assert(S->n_checkpointed_blocks <= MAX_N_BLOCKS);
    330 	assert(S->blocksize <= MAX_BLOCKSIZE);
    331 	__CTASSERT(MUL_OK(uint64_t, MAX_N_BLOCKS, MAX_BLOCKSIZE));
    332 	const int n = snprintf_ss(buf, sizeof(buf),
    333 	    "vndcompress: read %"PRIu64" bytes, wrote %"PRIu64" bytes, "
    334 	    "compression ratio %"PRIu64"%% (checkpointed %"PRIu64" bytes)\n",
    335 	    nread, nwritten, ratio_percent,
    336 	    ((uint64_t)S->n_checkpointed_blocks * (uint64_t)S->blocksize));
    337 	if (n < 0) {
    338 		const char msg[] = "vndcompress: can't format info\n";
    339 		(void)write(STDERR_FILENO, msg, __arraycount(msg));
    340 	} else {
    341 		__CTASSERT(INT_MAX <= SIZE_MAX);
    342 		(void)write(STDERR_FILENO, buf, (size_t)n);
    343 	}
    344 
    345 out:
    346 	/* Restore errno.  */
    347 	errno = error;
    348 }
    349 
    350 static void
    351 checkpoint_signal_handler(int signo __unused)
    352 {
    353 	/* Save errno.  */
    354 	const int error = errno;
    355 	struct compress_state *const S = &global_state;
    356 
    357 	/* Bail if the state is not yet initialized.  */
    358 	if (!S->initialized) {
    359 		warnx_ss("nothing to checkpoint yet");
    360 		goto out;
    361 	}
    362 
    363 	assert(S->image_fd >= 0);
    364 	assert(S->cloop2_fd >= 0);
    365 
    366 	/* Take a checkpoint.  */
    367 	assert(S->blkno <= MAX_N_BLOCKS);
    368 	assert(S->blocksize <= MAX_BLOCKSIZE);
    369 	__CTASSERT(MUL_OK(uint64_t, MAX_N_BLOCKS, MAX_BLOCKSIZE));
    370 	warnx_ss("checkpointing %"PRIu64" bytes",
    371 	    ((uint64_t)S->blkno * (uint64_t)S->blocksize));
    372 	compress_checkpoint(S);
    373 
    374 out:
    375 	/* Restore errno.  */
    376 	errno = error;
    377 }
    378 
/*
 * Report progress.
 *
 * Deliberately empty for now: progress is reported on demand via
 * SIGINFO/SIGUSR1 (see info_signal_handler) rather than continuously.
 *
 * XXX Should do a progress bar here.
 */
static void
compress_progress(struct compress_state *S __unused)
{
}
    388 
/*
 * Parse arguments, open the files, and initialize the state.
 *
 * argv[0] is the input image pathname and argv[1] the output cloop2
 * pathname; an optional argv[2] gives the block size (mutually
 * exclusive with -b).  Exits on any error.  On return, S is fully
 * initialized and S->initialized is set so the signal handlers may
 * touch it.
 */
static void
compress_init(int argc, char **argv, const struct options *O,
    struct compress_state *S)
{

	if (!((argc == 2) || (argc == 3)))
		usage();

	const char *const image_pathname = argv[0];
	const char *const cloop2_pathname = argv[1];

	/* Grab the block size either from `-b' or from the last argument.  */
	__CTASSERT(0 < DEV_BSIZE);
	__CTASSERT((MIN_BLOCKSIZE % DEV_BSIZE) == 0);
	__CTASSERT(MIN_BLOCKSIZE <= DEF_BLOCKSIZE);
	__CTASSERT((DEF_BLOCKSIZE % DEV_BSIZE) == 0);
	__CTASSERT(DEF_BLOCKSIZE <= MAX_BLOCKSIZE);
	__CTASSERT((MAX_BLOCKSIZE % DEV_BSIZE) == 0);
	if (ISSET(O->flags, FLAG_b)) {
		if (argc == 3) {
			warnx("use -b or the extra argument, not both");
			usage();
		}
		S->blocksize = O->blocksize;
	} else {
		S->blocksize = (argc == 2? DEF_BLOCKSIZE :
		    strsuftoll("block size", argv[2], MIN_BLOCKSIZE,
			MAX_BLOCKSIZE));
	}

	/* Sanity-check the blocksize.  (strsuftoll guarantees bounds.)  */
	__CTASSERT(DEV_BSIZE <= UINT32_MAX);
	if ((S->blocksize % DEV_BSIZE) != 0)
		errx(1, "bad blocksize: %"PRIu32
		    " (not a multiple of %"PRIu32")",
		    S->blocksize, (uint32_t)DEV_BSIZE);
	assert(MIN_BLOCKSIZE <= S->blocksize);
	assert((S->blocksize % DEV_BSIZE) == 0);
	assert(S->blocksize <= MAX_BLOCKSIZE);

	/* Grab the end block number if we have one.  */
	S->end_block = (ISSET(O->flags, FLAG_p)? O->end_block : 0);

	/* Grab the checkpoint block count, if we have one.  */
	S->checkpoint_blocks =
	    (ISSET(O->flags, FLAG_k)? O->checkpoint_blocks : 0);

	/* Open the input image file and the output cloop2 file.  */
	S->image_fd = open(image_pathname, O_RDONLY);
	if (S->image_fd == -1)
		err(1, "open(%s)", image_pathname);

	/*
	 * Choose output open flags: truncate and start fresh unless a
	 * restart (-r) was requested; -R additionally forbids creating
	 * a new file, since there must be something to restart from.
	 */
	int oflags;
	if (!ISSET(O->flags, FLAG_r))
		oflags = (O_WRONLY | O_TRUNC | O_CREAT);
	else if (!ISSET(O->flags, FLAG_R))
		oflags = (O_RDWR | O_CREAT);
	else
		oflags = O_RDWR;
	S->cloop2_fd = open(cloop2_pathname, oflags, 0777);
	if (S->cloop2_fd == -1)
		err(1, "open(%s)", cloop2_pathname);

	/* Find the size of the input image.  */
	if (ISSET(O->flags, FLAG_l)) {
		S->size = O->length;
	} else {
		static const struct stat zero_st;
		struct stat st = zero_st;
		if (fstat(S->image_fd, &st) == -1)
			err(1, "stat(%s)", image_pathname);
		if (st.st_size <= 0)
			errx(1, "unknown image size");
		assert(st.st_size >= 0);
		__CTASSERT(OFF_MAX <= UINT64_MAX);
		assert(__type_fit(uint64_t, st.st_size));
		S->size = st.st_size;
	}
	assert(S->size <= OFF_MAX);

	/* Find number of full blocks and whether there's a partial block.  */
	__CTASSERT(0 < MIN_BLOCKSIZE);
	assert(0 < S->blocksize);
	if (TOOMANY(off_t, (off_t)S->size, (off_t)S->blocksize,
		(off_t)MAX_N_BLOCKS))
		errx(1, "image too large for block size %"PRIu32": %"PRIu64,
		    S->blocksize, S->size);
	__CTASSERT(MAX_N_BLOCKS <= UINT32_MAX);
	S->n_full_blocks = S->size/S->blocksize;
	S->n_blocks = HOWMANY(S->size, S->blocksize);
	assert(S->n_full_blocks <= S->n_blocks);
	assert(S->n_blocks <= MAX_N_BLOCKS);

	/* Choose a window size.  */
	const uint32_t window_size = (ISSET(O->flags, FLAG_w)? O->window_size :
	    DEF_WINDOW_SIZE);

	/* Create an offset table for the blocks; one extra for the end.  */
	__CTASSERT(ADD_OK(uint32_t, MAX_N_BLOCKS, 1));
	S->n_offsets = (S->n_blocks + 1);
	__CTASSERT(MAX_N_OFFSETS == (MAX_N_BLOCKS + 1));
	__CTASSERT(MUL_OK(size_t, MAX_N_OFFSETS, sizeof(uint64_t)));
	__CTASSERT(CLOOP2_OFFSET_TABLE_OFFSET <= OFFTAB_MAX_FDPOS);
	offtab_init(&S->offtab, S->n_offsets, window_size, S->cloop2_fd,
	    CLOOP2_OFFSET_TABLE_OFFSET);

	/* Attempt to restart a partial transfer if requested.  */
	if (ISSET(O->flags, FLAG_r)) {
		if (compress_restart(S)) {
			/*
			 * Restart succeeded.  Truncate the output
			 * here, in case any garbage got appended.  We
			 * are committed to making progress at this
			 * point.  If the ftruncate fails, we don't
			 * lose anything valuable -- this is the last
			 * point at which we can restart anyway.
			 */
			if (ftruncate(S->cloop2_fd, S->offset) == -1)
				err(1, "ftruncate failed");

			/* All set!  No more initialization to do.  */
			return;
		} else {
			/* Restart failed.  Barf now if requested.  */
			if (ISSET(O->flags, FLAG_R))
				errx(1, "restart failed, aborting");

			/* Otherwise, truncate and start at the top.  */
			if (ftruncate(S->cloop2_fd, 0) == -1)
				err(1, "truncate failed");
			if (lseek(S->cloop2_fd, 0, SEEK_SET) == -1)
				err(1, "lseek to cloop2 beginning failed");

			/* If we seeked in the input, rewind.  */
			if (S->blkno != 0) {
				if (lseek(S->image_fd, 0, SEEK_SET) == -1)
					err(1,
					    "lseek to image beginning failed");
			}
		}
	}

	/* Write a bogus (zero) header for now, until we checkpoint.  */
	static const struct cloop2_header zero_header;
	const ssize_t h_written = write(S->cloop2_fd, &zero_header,
	    sizeof(zero_header));
	if (h_written == -1)
		err(1, "write header");
	assert(h_written >= 0);
	if ((size_t)h_written != sizeof(zero_header))
		errx(1, "partial write of header: %zu != %zu",
		    (size_t)h_written, sizeof(zero_header));

	/* Reset the offset table to be empty and write it.  */
	offtab_reset_write(&S->offtab);

	/* Start at the beginning of the image.  */
	S->blkno = 0;
	S->offset = (sizeof(struct cloop2_header) +
	    ((uint64_t)S->n_offsets * sizeof(uint64_t)));
	S->n_checkpointed_blocks = 0;

	/* Good to go and ready for interruption by a signal.  */
	S->initialized = 1;
}
    557 
/*
 * Try to recover state from an existing output file.
 *
 * On success, fill the offset table with what's in the file, set
 * S->blkno and S->offset to reflect our position, and seek to the
 * respective positions in the input and output files.
 *
 * On failure, return false.  May clobber the offset table, S->blkno,
 * S->offset, and the file pointers.
 */
static bool
compress_restart(struct compress_state *S)
{

	/* Read in the header.  */
	static const struct cloop2_header zero_header;
	struct cloop2_header header = zero_header;

	const ssize_t h_read = read_block(S->cloop2_fd, &header,
	    sizeof(header));
	if (h_read == -1) {
		warn("failed to read header");
		return false;
	}
	assert(h_read >= 0);
	if ((size_t)h_read != sizeof(header)) {
		warnx("partial read of header");
		return false;
	}

	/* Check that the header looks like a header.  */
	__CTASSERT(sizeof(cloop2_magic) <= sizeof(header.cl2h_magic));
	if (memcmp(header.cl2h_magic, cloop2_magic, sizeof(cloop2_magic))
	    != 0) {
		warnx("bad cloop2 shell script magic");
		return false;
	}

	/* Check the header parameters against what we were asked for.  */
	if (be32toh(header.cl2h_blocksize) != S->blocksize) {
		warnx("mismatched block size: %"PRIu32
		    " (expected %"PRIu32")",
		    be32toh(header.cl2h_blocksize), S->blocksize);
		return false;
	}
	if (be32toh(header.cl2h_n_blocks) != S->n_blocks) {
		warnx("mismatched number of blocks: %"PRIu32
		    " (expected %"PRIu32")",
		    be32toh(header.cl2h_n_blocks), S->n_blocks);
		return false;
	}

	/*
	 * Read in the partial offset table.  Entry 0 must point just
	 * past the header and the (fixed-size) offset table itself.
	 */
	if (!offtab_reset_read(&S->offtab, &warn, &warnx))
		return false;
	if (!offtab_prepare_get(&S->offtab, 0))
		return false;
	const uint64_t first_offset = offtab_get(&S->offtab, 0);
	__CTASSERT(MUL_OK(uint64_t, MAX_N_OFFSETS, sizeof(uint64_t)));
	__CTASSERT(ADD_OK(uint64_t, sizeof(struct cloop2_header),
		MAX_N_OFFSETS*sizeof(uint64_t)));
	const uint64_t expected = sizeof(struct cloop2_header) +
	    ((uint64_t)S->n_offsets * sizeof(uint64_t));
	if (first_offset != expected) {
		warnx("first offset is not 0x%"PRIx64": 0x%"PRIx64,
		    expected, first_offset);
		return false;
	}

	/*
	 * Find where we left off: scan forward until the first
	 * all-ones (unwritten) entry, validating that offsets
	 * strictly increase and no block exceeds the 2x expansion
	 * bound on the way.
	 */
	__CTASSERT(MAX_N_OFFSETS <= UINT32_MAX);
	uint32_t blkno = 0;
	uint64_t last_offset = first_offset;
	for (blkno = 0; blkno < S->n_blocks; blkno++) {
		if (!offtab_prepare_get(&S->offtab, blkno))
			return false;
		const uint64_t offset = offtab_get(&S->offtab, blkno);
		if (offset == ~(uint64_t)0)
			break;

		if (0 < blkno) {
			const uint64_t start = last_offset;
			const uint64_t end = offset;
			if (end <= start) {
				warnx("bad offset table: 0x%"PRIx64
				    ", 0x%"PRIx64, start, end);
				return false;
			}
			/* XXX compression ratio bound */
			__CTASSERT(MUL_OK(size_t, 2, MAX_BLOCKSIZE));
			if ((2 * (size_t)S->blocksize) <= (end - start)) {
				warnx("block %"PRIu32" too large:"
				    " %"PRIu64" bytes"
				    " from 0x%"PRIx64" to 0x%"PRIx64,
				    blkno, (end - start), start, end);
				return false;
			}
		}

		last_offset = offset;
	}

	if (blkno == 0) {
		warnx("no blocks were written; nothing to restart");
		return false;
	}

	/* Make sure the rest of the offset table is all ones.  */
	if (blkno < S->n_blocks) {
		uint32_t nblkno;

		for (nblkno = blkno; nblkno < S->n_blocks; nblkno++) {
			if (!offtab_prepare_get(&S->offtab, nblkno))
				return false;
			const uint64_t offset = offtab_get(&S->offtab, nblkno);
			if (offset != ~(uint64_t)0) {
				warnx("bad partial offset table entry"
				    " at %"PRIu32": 0x%"PRIx64,
				    nblkno, offset);
				return false;
			}
		}
	}

	/*
	 * XXX Consider decompressing some number of blocks to make
	 * sure they match.
	 */

	/*
	 * Back up by one.  The last recorded entry gives only where
	 * the previous block starts -- its end was never recorded --
	 * so that block must be compressed again.
	 */
	assert(1 <= blkno);
	blkno -= 1;

	/* Seek to the output position.  */
	assert(last_offset <= OFF_MAX);
	if (lseek(S->cloop2_fd, last_offset, SEEK_SET) == -1) {
		warn("lseek output cloop2 to %"PRIx64" failed", last_offset);
		return false;
	}

	/* Switch from reading to writing the offset table.  */
	if (!offtab_transmogrify_read_to_write(&S->offtab, blkno))
		return false;

	/*
	 * Seek to the input position last, after all other possible
	 * failures, because if the input is a pipe, we can't change
	 * our mind, rewind, and start at the beginning instead of
	 * restarting.
	 */
	assert(S->size <= OFF_MAX);
	assert(blkno <= (S->size / S->blocksize));
	const off_t restart_position = ((off_t)blkno * (off_t)S->blocksize);
	assert(0 <= restart_position);
	assert(restart_position <= (off_t)S->size);
	if (lseek(S->image_fd, restart_position, SEEK_SET) == -1) {
		if (errno != ESPIPE) {
			warn("lseek input image failed");
			return false;
		}

		/* Try read instead of lseek for a pipe/socket/fifo.  */
		void *const buffer = malloc(0x10000);
		if (buffer == NULL)
			err(1, "malloc temporary buffer");
		off_t left = restart_position;
		while (left > 0) {
			const size_t size = MIN(0x10000, left);
			const ssize_t n_read = read_block(S->image_fd, buffer,
			    size);
			if (n_read == -1) {
				free(buffer);
				warn("read of input image failed");
				return false;
			}
			assert(n_read >= 0);
			if ((size_t)n_read != size) {
				free(buffer);
				warnx("partial read of input image");
				return false;
			}
			assert((off_t)size <= left);
			left -= size;
		}
		free(buffer);
	}

	/* Start where we left off.  */
	S->blkno = blkno;
	S->offset = last_offset;
	S->n_checkpointed_blocks = blkno;

	/* Good to go and ready for interruption by a signal.  */
	S->initialized = 1;

	/* Success!  */
	return true;
}
    756 
/*
 * Read a single block, compress it, and write the compressed block.
 * Return the size of the compressed block.
 *
 * in_fd/out_fd	input image / output cloop2 file descriptors
 * blkno	input block number, used for diagnostics only
 * blocksize	nominal block size in bytes
 * readsize	bytes to read: blocksize, or less for the final block
 * uncompbuf	caller-supplied buffer of at least blocksize bytes
 * compbuf	caller-supplied buffer of at least 2*blocksize bytes
 *
 * Exits on any read, compression, or write failure.
 */
static uint32_t
compress_block(int in_fd, int out_fd, uint32_t blkno, uint32_t blocksize,
    uint32_t readsize, void *uncompbuf, void *compbuf)
{

	assert(readsize <= blocksize);
	assert(blocksize <= MAX_BLOCKSIZE);

	/* Read the uncompressed block.  */
	const ssize_t n_read = read_block(in_fd, uncompbuf, readsize);
	if (n_read == -1)
		err(1, "read block %"PRIu32, blkno);
	assert(n_read >= 0);
	if ((size_t)n_read != readsize)
		errx(1, "partial read of block %"PRIu32": %zu != %"PRIu32,
		    blkno, (size_t)n_read, readsize);

	/* Compress the block.  */
	/* XXX compression ratio bound */
	__CTASSERT(MUL_OK(unsigned long, 2, MAX_BLOCKSIZE));
	const unsigned long uncomplen =
	    (VNDCOMPRESS_COMPAT? blocksize : readsize); /* XXX */
	/* complen is in/out: buffer capacity in, compressed size out.  */
	unsigned long complen = (uncomplen * 2);
	const int zerror = compress2(compbuf, &complen, uncompbuf, uncomplen,
	    Z_BEST_COMPRESSION);
	if (zerror != Z_OK)
		errx(1, "compression failed at block %"PRIu32" (%d): %s",
		    blkno, zerror, zError(zerror));
	assert(complen <= (uncomplen * 2));

	/* Write the compressed block.  */
	const ssize_t n_written = write(out_fd, compbuf, complen);
	if (n_written == -1)
		err(1, "write block %"PRIu32, blkno);
	assert(n_written >= 0);
	if ((size_t)n_written != complen)
		errx(1, "partial write of block %"PRIu32": %zu != %lu",
		    blkno, (size_t)n_written, complen);

	return (size_t)n_written;
}
    802 
    803 /*
    804  * Checkpoint if appropriate.
    805  */
    806 static void
    807 compress_maybe_checkpoint(struct compress_state *S)
    808 {
    809 
    810 	if ((0 < S->checkpoint_blocks) && (0 < S->blkno) &&
    811 	    ((S->blkno % S->checkpoint_blocks) == 0)) {
    812 		assert(S->offset <= OFF_MAX);
    813 		assert((off_t)S->offset == lseek(S->cloop2_fd, 0, SEEK_CUR));
    814 		compress_checkpoint(S);
    815 	}
    816 }
    817 
    818 /*
    819  * Write the prefix of the offset table that we have filled so far.
    820  *
    821  * We fsync the data blocks we have written, and then write the offset
    822  * table, and then fsync the offset table and file metadata.  This
    823  * should help to avoid offset tables that point at garbage data.
    824  *
    825  * This may be called from a signal handler, so it must not use stdio,
    826  * malloc, &c. -- it may only (a) handle signal-safe state in S, and
    827  * (b) do file descriptor I/O / fsync.
    828  *
    829  * XXX This requires further thought and heavy testing to be sure.
    830  *
    831  * XXX Should have an option to suppress fsync.
    832  *
    833  * XXX Should have an option to fail on fsync failures.
    834  *
    835  * XXX Would be nice if we could just do a barrier rather than an
    836  * fsync.
    837  *
    838  * XXX How might we automatically test the fsyncs?
    839  */
static void
compress_checkpoint(struct compress_state *S)
{

	/* The checkpoint covers offset entries 0..S->blkno inclusive.  */
	assert(S->blkno < S->n_offsets);
	const uint32_t n_offsets = (S->blkno + 1);
	assert(n_offsets <= S->n_offsets);

	assert(S->offset <= OFF_MAX);
	assert((off_t)S->offset <= lseek(S->cloop2_fd, 0, SEEK_CUR));

	/* Make sure the data hits the disk before we say it's ready.  */
	if (fsync_range(S->cloop2_fd, (FFILESYNC | FDISKSYNC), 0, S->offset)
	    == -1)
		warn_ss("fsync of output failed");

	/* Say the data blocks are ready.  */
	offtab_checkpoint(&S->offtab, n_offsets,
	    (S->n_checkpointed_blocks == 0? OFFTAB_CHECKPOINT_SYNC : 0));

	/*
	 * If this is the first checkpoint, initialize the header.
	 * Signal handler can race with main code here, but it is
	 * harmless -- just an extra fsync and write of the header,
	 * which are both idempotent.
	 *
	 * Once we have synchronously checkpointed the offset table,
	 * subsequent writes will preserve a valid state.
	 */
	if (S->n_checkpointed_blocks == 0) {
		static const struct cloop2_header zero_header;
		struct cloop2_header header = zero_header;

		/* Format the header.  */
		__CTASSERT(sizeof(cloop2_magic) <= sizeof(header.cl2h_magic));
		(void)memcpy(header.cl2h_magic, cloop2_magic,
		    sizeof(cloop2_magic));
		header.cl2h_blocksize = htobe32(S->blocksize);
		header.cl2h_n_blocks = htobe32(S->n_blocks);

		/* Write the header.  pwrite avoids moving the file
		 * pointer, which the main loop relies on.  */
		const ssize_t h_written = pwrite(S->cloop2_fd, &header,
		    sizeof(header), 0);
		if (h_written == -1)
			err_ss(1, "write header");
		assert(h_written >= 0);
		if ((size_t)h_written != sizeof(header))
			errx_ss(1, "partial write of header: %zu != %zu",
			    (size_t)h_written, sizeof(header));
	}

	/* Record how many blocks we've checkpointed.  */
    {
	sigset_t old_sigmask;
	block_signals(&old_sigmask);
	S->n_checkpointed_blocks = S->blkno;
	restore_sigmask(&old_sigmask);
    }
}
    899 
    900 /*
    901  * Release everything we allocated in compress_init.
    902  */
    903 static void
    904 compress_exit(struct compress_state *S)
    905 {
    906 
    907 	/* Done with the offset table.  Destroy it.  */
    908 	offtab_destroy(&S->offtab);
    909 
    910 	/* Done with the files.  Close them.  */
    911 	if (close(S->cloop2_fd) == -1)
    912 		warn("close(cloop2 fd)");
    913 	if (close(S->image_fd) == -1)
    914 		warn("close(image fd)");
    915 }
    916