Home | History | Annotate | Line # | Download | only in vndcompress
      1 /*	$NetBSD: offtab.c,v 1.15 2017/07/29 21:04:07 riastradh Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2014 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Taylor R. Campbell.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 #include <sys/cdefs.h>
     33 __RCSID("$NetBSD: offtab.c,v 1.15 2017/07/29 21:04:07 riastradh Exp $");
     34 
     35 #include <sys/types.h>
     36 #include <sys/endian.h>
     37 
     38 #include <assert.h>
     39 #include <err.h>
     40 #include <errno.h>
     41 #include <inttypes.h>
     42 #include <limits.h>
     43 #include <stdbool.h>
     44 #include <stdlib.h>
     45 #include <unistd.h>
     46 
     47 #include "common.h"
     48 #include "utils.h"
     49 
     50 #include "offtab.h"
     51 
     52 static void __printflike(1,2) __dead
     53 offtab_bug(const char *fmt, ...)
     54 {
     55 
     56 	errx(1, "bug in offtab, please report");
     57 }
     58 
     59 static void __printflike(1,2) __dead
     60 offtab_bugx(const char *fmt, ...)
     61 {
     62 
     63 	errx(1, "bug in offtab, please report");
     64 }
     65 
     66 static uint32_t
     67 offtab_compute_window_size(struct offtab *offtab, uint32_t start)
     68 {
     69 
     70 	assert(start < offtab->ot_n_offsets);
     71 	return MIN(offtab->ot_window_size, (offtab->ot_n_offsets - start));
     72 }
     73 
     74 static uint32_t
     75 offtab_current_window_size(struct offtab *offtab)
     76 {
     77 
     78 	return offtab_compute_window_size(offtab, offtab->ot_window_start);
     79 }
     80 
     81 static uint32_t
     82 offtab_current_window_end(struct offtab *offtab)
     83 {
     84 
     85 	assert(offtab->ot_window_start < offtab->ot_n_offsets);
     86 	assert(offtab_current_window_size(offtab) <=
     87 	    (offtab->ot_n_offsets - offtab->ot_window_start));
     88 	return (offtab->ot_window_start + offtab_current_window_size(offtab));
     89 }
     90 
     91 static void
     92 offtab_compute_window_position(struct offtab *offtab, uint32_t window_start,
     93     size_t *bytes, off_t *pos)
     94 {
     95 	const uint32_t window_size = offtab_compute_window_size(offtab,
     96 	    window_start);
     97 
     98 	__CTASSERT(MUL_OK(size_t, MAX_WINDOW_SIZE, sizeof(uint64_t)));
     99 	*bytes = (window_size * sizeof(uint64_t));
    100 
    101 	assert(window_start <= offtab->ot_n_offsets);
    102 	__CTASSERT(MUL_OK(off_t, MAX_N_OFFSETS, sizeof(uint64_t)));
    103 	const off_t window_offset = ((off_t)window_start *
    104 	    (off_t)sizeof(uint64_t));
    105 
    106 	assert(offtab->ot_fdpos <= OFFTAB_MAX_FDPOS);
    107 	__CTASSERT(ADD_OK(off_t, OFFTAB_MAX_FDPOS,
    108 		(off_t)MAX_N_OFFSETS*sizeof(uint64_t)));
    109 	assert(ADD_OK(off_t, offtab->ot_fdpos, window_offset));
    110 	*pos = (offtab->ot_fdpos + window_offset);
    111 }
    112 
    113 #define	OFFTAB_READ_SEEK	0x01
    114 #define	OFFTAB_READ_NOSEEK	0x00
    115 
    116 static bool
    117 offtab_read_window(struct offtab *offtab, uint32_t blkno, int read_flags)
    118 {
    119 	const uint32_t window_start = rounddown(blkno, offtab->ot_window_size);
    120 	size_t window_bytes;
    121 	off_t window_pos;
    122 
    123 	assert(offtab->ot_mode == OFFTAB_MODE_READ);
    124 	assert(ISSET(read_flags, OFFTAB_READ_SEEK) ||
    125 	    (lseek(offtab->ot_fd, 0, SEEK_CUR) == offtab->ot_fdpos) ||
    126 	    ((lseek(offtab->ot_fd, 0, SEEK_CUR) == -1) && (errno == ESPIPE)));
    127 
    128 	offtab_compute_window_position(offtab, window_start,
    129 	    &window_bytes, &window_pos);
    130 	const ssize_t n_read = (ISSET(read_flags, OFFTAB_READ_SEEK)
    131 	    ? pread_block(offtab->ot_fd, offtab->ot_window, window_bytes,
    132 		window_pos)
    133 	    : read_block(offtab->ot_fd, offtab->ot_window, window_bytes));
    134 	if (n_read == -1) {
    135 		(*offtab->ot_report)("read offset table at %"PRIuMAX,
    136 		    (uintmax_t)window_pos);
    137 		return false;
    138 	}
    139 	assert(n_read >= 0);
    140 	if ((size_t)n_read != window_bytes) {
    141 		(*offtab->ot_reportx)("partial read of offset table"
    142 		    " at %"PRIuMAX": %zu != %zu",
    143 		    (uintmax_t)window_pos, (size_t)n_read, window_bytes);
    144 		return false;
    145 	}
    146 
    147 	offtab->ot_window_start = window_start;
    148 
    149 	return true;
    150 }
    151 
    152 static bool
    153 offtab_maybe_read_window(struct offtab *offtab, uint32_t blkno, int read_flags)
    154 {
    155 
    156 	/* Don't bother if blkno is already in the window.  */
    157 	if ((offtab->ot_window_start <= blkno) &&
    158 	    (blkno < offtab_current_window_end(offtab)))
    159 		return true;
    160 
    161 	if (!offtab_read_window(offtab, blkno, read_flags))
    162 		return false;
    163 
    164 	return true;
    165 }
    166 
    167 static void
    168 offtab_write_window(struct offtab *offtab)
    169 {
    170 	size_t window_bytes;
    171 	off_t window_pos;
    172 
    173 	assert(offtab->ot_mode == OFFTAB_MODE_WRITE);
    174 
    175 	offtab_compute_window_position(offtab, offtab->ot_window_start,
    176 	    &window_bytes, &window_pos);
    177 	const ssize_t n_written = pwrite(offtab->ot_fd, offtab->ot_window,
    178 	    window_bytes, window_pos);
    179 	if (n_written == -1)
    180 		err_ss(1, "write initial offset table");
    181 	assert(n_written >= 0);
    182 	if ((size_t)n_written != window_bytes)
    183 		errx_ss(1, "partial write of initial offset bytes: %zu <= %zu",
    184 		    (size_t)n_written,
    185 		    window_bytes);
    186 }
    187 
    188 static void
    189 offtab_maybe_write_window(struct offtab *offtab, uint32_t start, uint32_t end)
    190 {
    191 
    192 	/* Don't bother if [start, end) does not cover our window.  */
    193 	if (end <= offtab->ot_window_start)
    194 		return;
    195 	if (offtab_current_window_end(offtab) < start)
    196 		return;
    197 
    198 	offtab_write_window(offtab);
    199 }
    200 
    201 /*
    203  * Initialize an offtab to support the specified number of offsets read
    204  * to or written from fd at byte position fdpos.
    205  */
    206 void
    207 offtab_init(struct offtab *offtab, uint32_t n_offsets, uint32_t window_size,
    208     int fd, off_t fdpos)
    209 {
    210 
    211 	assert(offtab != NULL);
    212 	assert(0 < n_offsets);
    213 	assert(0 <= fd);
    214 	assert(0 <= fdpos);
    215 	assert(fdpos <= OFFTAB_MAX_FDPOS);
    216 
    217 	offtab->ot_n_offsets = n_offsets;
    218 	if ((window_size == 0) || (n_offsets < window_size))
    219 		offtab->ot_window_size = n_offsets;
    220 	else
    221 		offtab->ot_window_size = window_size;
    222 	assert(offtab->ot_window_size <= offtab->ot_n_offsets);
    223 	offtab->ot_window_start = (uint32_t)-1;
    224 	__CTASSERT(MUL_OK(size_t, MAX_WINDOW_SIZE, sizeof(uint64_t)));
    225 	offtab->ot_window = malloc(offtab->ot_window_size * sizeof(uint64_t));
    226 	if (offtab->ot_window == NULL)
    227 		err(1, "malloc offset table");
    228 	offtab->ot_blkno = (uint32_t)-1;
    229 	offtab->ot_fd = fd;
    230 	offtab->ot_fdpos = fdpos;
    231 	offtab->ot_report = &offtab_bug;
    232 	offtab->ot_reportx = &offtab_bugx;
    233 	offtab->ot_mode = OFFTAB_MODE_NONE;
    234 }
    235 
    236 /*
    237  * Destroy an offtab.
    238  */
    239 void
    240 offtab_destroy(struct offtab *offtab)
    241 {
    242 
    243 	free(offtab->ot_window);
    244 }
    245 
    246 /*
    247  * For an offtab that has been used to read data from disk, convert it
    248  * to an offtab that can be used to write subsequent data to disk.
    249  * blkno is the last valid blkno read from disk.
    250  */
    251 bool
    252 offtab_transmogrify_read_to_write(struct offtab *offtab, uint32_t blkno)
    253 {
    254 
    255 	assert(offtab->ot_mode == OFFTAB_MODE_READ);
    256 	assert(0 < blkno);
    257 
    258 	if (!offtab_maybe_read_window(offtab, blkno, OFFTAB_READ_SEEK))
    259 		return false;
    260 
    261 	offtab->ot_mode = OFFTAB_MODE_WRITE;
    262 	offtab->ot_blkno = blkno;
    263 
    264 	return true;
    265 }
    266 
    267 /*
    269  * Reset an offtab for reading an offset table from the beginning.
    270  * Initializes in-memory state and may read data from offtab->ot_fd,
    271  * which must currently be at byte position offtab->ot_fdpos.  Failure
    272  * will be reported by the report/reportx routines, which are called
    273  * like warn/warnx.  May fail; returns true on success, false on
    274  * failure.
    275  *
    276  * This almost has copypasta of offtab_prepare_get, but this uses read,
    277  * rather than pread, so that it will work on nonseekable input if the
    278  * window is the whole offset table.
    279  */
    280 bool
    281 offtab_reset_read(struct offtab *offtab,
    282     void (*report)(const char *, ...) __printflike(1,2),
    283     void (*reportx)(const char *, ...) __printflike(1,2))
    284 {
    285 
    286 	assert((lseek(offtab->ot_fd, 0, SEEK_CUR) == offtab->ot_fdpos) ||
    287 	    ((lseek(offtab->ot_fd, 0, SEEK_CUR) == -1) && (errno == ESPIPE)));
    288 
    289 	offtab->ot_report = report;
    290 	offtab->ot_reportx = reportx;
    291 	offtab->ot_mode = OFFTAB_MODE_READ;
    292 	offtab->ot_blkno = (uint32_t)-1;
    293 
    294 	if (!offtab_read_window(offtab, 0, OFFTAB_READ_NOSEEK))
    295 		return false;
    296 
    297 	if (offtab->ot_window_size < offtab->ot_n_offsets) {
    298 		__CTASSERT(MUL_OK(off_t, MAX_N_OFFSETS, sizeof(uint64_t)));
    299 		const off_t offtab_bytes = ((off_t)offtab->ot_n_offsets *
    300 		    (off_t)sizeof(uint64_t));
    301 		assert(offtab->ot_fdpos <= OFFTAB_MAX_FDPOS);
    302 		__CTASSERT(ADD_OK(off_t, OFFTAB_MAX_FDPOS,
    303 			(off_t)MAX_N_OFFSETS*sizeof(uint64_t)));
    304 		assert(ADD_OK(off_t, offtab->ot_fdpos, offtab_bytes));
    305 		const off_t first_offset = (offtab->ot_fdpos + offtab_bytes);
    306 		if (lseek(offtab->ot_fd, first_offset, SEEK_SET) == -1) {
    307 			(*offtab->ot_report)("lseek to first offset 0x%"PRIx64,
    308 			    first_offset);
    309 			return false;
    310 		}
    311 	}
    312 
    313 	return true;
    314 }
    315 
    316 /*
    317  * Do any I/O or bookkeeping necessary to fetch the offset for blkno in
    318  * preparation for a call to offtab_get.  May fail; returns true on
    319  * success, false on failure.
    320  */
    321 bool
    322 offtab_prepare_get(struct offtab *offtab, uint32_t blkno)
    323 {
    324 
    325 	assert(offtab->ot_mode == OFFTAB_MODE_READ);
    326 	assert(blkno < offtab->ot_n_offsets);
    327 
    328 	if (!offtab_maybe_read_window(offtab, blkno, OFFTAB_READ_SEEK))
    329 		return false;
    330 
    331 	assert(offtab->ot_window_start <= blkno);
    332 	assert(blkno < offtab_current_window_end(offtab));
    333 
    334 	offtab->ot_blkno = blkno;
    335 	return true;
    336 }
    337 
    338 /*
    339  * Return the offset for blkno.  Caller must have called
    340  * offtab_prepare_get beforehand.
    341  */
    342 uint64_t
    343 offtab_get(struct offtab *offtab, uint32_t blkno)
    344 {
    345 
    346 	assert(offtab->ot_mode == OFFTAB_MODE_READ);
    347 	assert(blkno == offtab->ot_blkno);
    348 	assert(offtab->ot_window_start <= blkno);
    349 	assert(blkno < offtab_current_window_end(offtab));
    350 
    351 	return be64toh(offtab->ot_window[blkno - offtab->ot_window_start]);
    352 }
    353 
    354 /*
    356  * Reset offtab for writing a fresh offset table.  Initializes
    357  * in-memory state and writes an empty offset table to offtab->ot_fd,
    358  * which must currently be at byte position offtab->ot_fdpos.  May
    359  * fail; returns on success, aborts with err(3) on failure.
    360  */
    361 void
    362 offtab_reset_write(struct offtab *offtab)
    363 {
    364 	uint32_t i;
    365 
    366 	assert(lseek(offtab->ot_fd, 0, SEEK_CUR) == offtab->ot_fdpos);
    367 
    368 	offtab->ot_mode = OFFTAB_MODE_WRITE;
    369 	offtab->ot_blkno = (uint32_t)-1;
    370 
    371 	/*
    372 	 * Initialize the offset table to all ones (except for the
    373 	 * fixed first offset) so that we can easily detect where we
    374 	 * were interrupted if we want to restart.
    375 	 */
    376 	__CTASSERT(MAX_N_OFFSETS <= UINT32_MAX);
    377 	assert(offtab->ot_n_offsets > 0);
    378 
    379 	/* Initialize window of all ones.  */
    380 	for (i = 0; i < offtab->ot_window_size; i++)
    381 		offtab->ot_window[i] = ~(uint64_t)0;
    382 
    383 	/* Write the window to every position in the table.  */
    384 	const uint32_t n_windows =
    385 	    howmany(offtab->ot_n_offsets, offtab->ot_window_size);
    386 	for (i = 1; i < n_windows; i++) {
    387 		/* Change the start but reuse the all-ones buffer.  */
    388 		offtab->ot_window_start = (i * offtab->ot_window_size);
    389 		offtab_write_window(offtab);
    390 	}
    391 
    392 	/* Compute the number of bytes in the offset table.  */
    393 	__CTASSERT(MUL_OK(off_t, MAX_N_OFFSETS, sizeof(uint64_t)));
    394 	const off_t offtab_bytes = ((off_t)offtab->ot_n_offsets *
    395 	    sizeof(uint64_t));
    396 
    397 	/* Compute the offset of the first block.  */
    398 	assert(offtab->ot_fdpos <= OFFTAB_MAX_FDPOS);
    399 	__CTASSERT(ADD_OK(off_t, OFFTAB_MAX_FDPOS,
    400 		MAX_N_OFFSETS*sizeof(uint64_t)));
    401 	assert(ADD_OK(off_t, offtab->ot_fdpos, offtab_bytes));
    402 	const off_t first_offset = (offtab->ot_fdpos + offtab_bytes);
    403 
    404 	/* Assert that it fits in 64 bits.  */
    405 	__CTASSERT(MUL_OK(uint64_t, MAX_N_OFFSETS, sizeof(uint64_t)));
    406 	__CTASSERT(ADD_OK(uint64_t, OFFTAB_MAX_FDPOS,
    407 		(uint64_t)MAX_N_OFFSETS*sizeof(uint64_t)));
    408 
    409 	/* Write out the first window with the first offset.  */
    410 	offtab->ot_window_start = 0;
    411 	offtab->ot_window[0] = htobe64((uint64_t)first_offset);
    412 	offtab_write_window(offtab);
    413 
    414 	if (lseek(offtab->ot_fd, first_offset, SEEK_SET) == -1)
    415 		err(1, "lseek to first offset failed");
    416 }
    417 
    418 /*
    419  * Guarantee that the disk reflects block offsets [0, n_offsets).  If
    420  * OFFTAB_CHECKPOINT_SYNC is set in flags, will also fsync the entire
    421  * offset table.  May fail; returns on success, aborts with err(3) on
    422  * failure.  Fsync failure is considered success but is reported with a
    423  * warning.
    424  *
    425  * This routine does not write state in memory, and does not read state
    426  * that is not signal-safe.  The only state read is offtab->ot_window,
    427  * offtab->ot_window_start, and quantities that are static for the
    428  * signal-interruptable existence of the offset table.
    429  */
    430 void
    431 offtab_checkpoint(struct offtab *offtab, uint32_t n_offsets, int flags)
    432 {
    433 
    434 	assert(offtab->ot_mode == OFFTAB_MODE_WRITE);
    435 	assert(n_offsets <= offtab->ot_n_offsets);
    436 
    437 	/*
    438 	 * Write the window unless we just did that and were
    439 	 * interrupted before we could move the window.
    440 	 */
    441 	if (offtab->ot_window != NULL)
    442 		offtab_maybe_write_window(offtab, 0, n_offsets);
    443 
    444 	if (ISSET(flags, OFFTAB_CHECKPOINT_SYNC)) {
    445 		__CTASSERT(MUL_OK(off_t, MAX_N_OFFSETS, sizeof(uint64_t)));
    446 		const off_t sync_bytes = ((off_t)n_offsets *
    447 		    (off_t)sizeof(uint64_t));
    448 		__CTASSERT(ADD_OK(off_t, OFFTAB_MAX_FDPOS,
    449 			MAX_N_OFFSETS*sizeof(uint64_t)));
    450 		assert(ADD_OK(off_t, offtab->ot_fdpos, sync_bytes));
    451 		if (fsync_range(offtab->ot_fd, (FFILESYNC | FDISKSYNC),
    452 			offtab->ot_fdpos, (offtab->ot_fdpos + sync_bytes))
    453 		    == -1)
    454 			warn_ss("fsync of offset table failed");
    455 	}
    456 }
    457 
    458 /*
    459  * Do any I/O or bookkeeping necessary to set an offset for blkno.  May
    460  * fail; returns on success, aborts with err(3) on failure.
    461  */
    462 void
    463 offtab_prepare_put(struct offtab *offtab, uint32_t blkno)
    464 {
    465 	uint32_t i;
    466 
    467 	assert(offtab->ot_mode == OFFTAB_MODE_WRITE);
    468 	assert(blkno < offtab->ot_n_offsets);
    469 
    470 	/*
    471 	 * Assume, for convenience, that we write blocks in order.
    472 	 * Thus we need not do another read -- we can just clear the
    473 	 * window.
    474 	 */
    475 	assert((offtab->ot_blkno == (uint32_t)-1) ||
    476 	    ((offtab->ot_blkno + 1) == blkno));
    477 
    478 	/* If it's already in our window, we're good to go.  */
    479 	if ((offtab->ot_window_start <= blkno) &&
    480 	    (blkno < offtab_current_window_end(offtab)))
    481 		goto win;
    482 
    483 	/* Otherwise, write out the current window and choose a new one.  */
    484 	offtab_write_window(offtab);
    485 
    486 	assert(offtab->ot_window_size <= blkno);
    487 	assert(offtab->ot_window_start == (blkno - offtab->ot_window_size));
    488 	assert((offtab->ot_window_start + offtab->ot_window_size) ==
    489 	    rounddown(blkno, offtab->ot_window_size));
    490 
    491     {
    492 	uint64_t *window;
    493 	sigset_t sigmask;
    494 
    495 	/*
    496 	 * Mark the window as being updated so nobody tries to write it
    497 	 * (since we just wrote it) while we fill it with ones.
    498 	 */
    499 	block_signals(&sigmask);
    500 	window = offtab->ot_window;
    501 	offtab->ot_window = NULL;
    502 	restore_sigmask(&sigmask);
    503 
    504 	/* Fill the window with ones.  */
    505 	for (i = 0; i < offtab_current_window_size(offtab); i++)
    506 		window[i] = ~(uint64_t)0;
    507 
    508 	/* Restore the window as ready again.  */
    509 	block_signals(&sigmask);
    510 	offtab->ot_window = window;
    511 	offtab->ot_window_start = rounddown(blkno, offtab->ot_window_size);
    512 	restore_sigmask(&sigmask);
    513     }
    514 
    515 win:	assert(offtab->ot_window_start <= blkno);
    516 	assert(blkno < offtab_current_window_end(offtab));
    517 
    518 	offtab->ot_blkno = blkno;
    519 }
    520 
    521 /*
    522  * Actually set the offset for blkno.
    523  */
    524 void
    525 offtab_put(struct offtab *offtab, uint32_t blkno, uint64_t offset)
    526 {
    527 
    528 	assert(offtab->ot_mode == OFFTAB_MODE_WRITE);
    529 	assert(blkno == offtab->ot_blkno);
    530 	assert(offtab->ot_window_start <= blkno);
    531 	assert(blkno < offtab_current_window_end(offtab));
    532 
    533 	offtab->ot_window[blkno - offtab->ot_window_start] = htobe64(offset);
    534 }
    535