Home | History | Annotate | Line # | Download | only in vndcompress
offtab.c revision 1.13.4.2
      1 /*	$NetBSD: offtab.c,v 1.13.4.2 2014/05/22 11:42:51 yamt Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2014 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Taylor R. Campbell.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 #include <sys/cdefs.h>
     33 __RCSID("$NetBSD: offtab.c,v 1.13.4.2 2014/05/22 11:42:51 yamt Exp $");
     34 
     35 #include <sys/types.h>
     36 #include <sys/endian.h>
     37 
     38 #include <assert.h>
     39 #include <err.h>
     40 #include <errno.h>
     41 #include <inttypes.h>
     42 #include <limits.h>
     43 #include <stdbool.h>
     44 #include <stdlib.h>
     45 #include <unistd.h>
     46 
     47 #include "common.h"
     48 #include "utils.h"
     49 
     50 #include "offtab.h"
     51 
     52 static void __printflike(1,2) __dead
     53 offtab_bug(const char *fmt, ...)
     54 {
     55 
     56 	errx(1, "bug in offtab, please report");
     57 }
     58 
     59 static void __printflike(1,2) __dead
     60 offtab_bugx(const char *fmt, ...)
     61 {
     62 
     63 	errx(1, "bug in offtab, please report");
     64 }
     65 
     66 static uint32_t
     67 offtab_compute_window_size(struct offtab *offtab, uint32_t start)
     68 {
     69 
     70 	assert(start < offtab->ot_n_offsets);
     71 	return MIN(offtab->ot_window_size, (offtab->ot_n_offsets - start));
     72 }
     73 
     74 static uint32_t
     75 offtab_current_window_size(struct offtab *offtab)
     76 {
     77 
     78 	return offtab_compute_window_size(offtab, offtab->ot_window_start);
     79 }
     80 
     81 static uint32_t
     82 offtab_current_window_end(struct offtab *offtab)
     83 {
     84 
     85 	assert(offtab->ot_window_start < offtab->ot_n_offsets);
     86 	assert(offtab_current_window_size(offtab) <=
     87 	    (offtab->ot_n_offsets - offtab->ot_window_start));
     88 	return (offtab->ot_window_start + offtab_current_window_size(offtab));
     89 }
     90 
     91 static void
     92 offtab_compute_window_position(struct offtab *offtab, uint32_t window_start,
     93     size_t *bytes, off_t *pos)
     94 {
     95 	const uint32_t window_size = offtab_compute_window_size(offtab,
     96 	    window_start);
     97 
     98 	__CTASSERT(MAX_WINDOW_SIZE <= (SIZE_MAX / sizeof(uint64_t)));
     99 	*bytes = (window_size * sizeof(uint64_t));
    100 
    101 	assert(window_start <= offtab->ot_n_offsets);
    102 	__CTASSERT(MAX_N_OFFSETS <= (OFF_MAX / sizeof(uint64_t)));
    103 	const off_t window_offset = ((off_t)window_start *
    104 	    (off_t)sizeof(uint64_t));
    105 
    106 	/* XXX This assertion is not justified.  */
    107 	assert(offtab->ot_fdpos <= (OFF_MAX - window_offset));
    108 	*pos = (offtab->ot_fdpos + window_offset);
    109 }
    110 
    111 #define	OFFTAB_READ_SEEK	0x01
    112 #define	OFFTAB_READ_NOSEEK	0x00
    113 
    114 static bool
    115 offtab_read_window(struct offtab *offtab, uint32_t blkno, int read_flags)
    116 {
    117 	const uint32_t window_start = rounddown(blkno, offtab->ot_window_size);
    118 	size_t window_bytes;
    119 	off_t window_pos;
    120 
    121 	assert(offtab->ot_mode == OFFTAB_MODE_READ);
    122 	assert(ISSET(read_flags, OFFTAB_READ_SEEK) ||
    123 	    (lseek(offtab->ot_fd, 0, SEEK_CUR) == offtab->ot_fdpos) ||
    124 	    ((lseek(offtab->ot_fd, 0, SEEK_CUR) == -1) && (errno == ESPIPE)));
    125 
    126 	offtab_compute_window_position(offtab, window_start,
    127 	    &window_bytes, &window_pos);
    128 	const ssize_t n_read = (ISSET(read_flags, OFFTAB_READ_SEEK)
    129 	    ? pread_block(offtab->ot_fd, offtab->ot_window, window_bytes,
    130 		window_pos)
    131 	    : read_block(offtab->ot_fd, offtab->ot_window, window_bytes));
    132 	if (n_read == -1) {
    133 		(*offtab->ot_report)("read offset table at %"PRIuMAX,
    134 		    (uintmax_t)window_pos);
    135 		return false;
    136 	}
    137 	assert(n_read >= 0);
    138 	if ((size_t)n_read != window_bytes) {
    139 		(*offtab->ot_reportx)("partial read of offset table"
    140 		    " at %"PRIuMAX": %zu != %zu",
    141 		    (uintmax_t)window_pos, (size_t)n_read, window_bytes);
    142 		return false;
    143 	}
    144 
    145 	offtab->ot_window_start = window_start;
    146 
    147 	return true;
    148 }
    149 
    150 static bool
    151 offtab_maybe_read_window(struct offtab *offtab, uint32_t blkno, int read_flags)
    152 {
    153 
    154 	/* Don't bother if blkno is already in the window.  */
    155 	if ((offtab->ot_window_start <= blkno) &&
    156 	    (blkno < offtab_current_window_end(offtab)))
    157 		return true;
    158 
    159 	if (!offtab_read_window(offtab, blkno, read_flags))
    160 		return false;
    161 
    162 	return true;
    163 }
    164 
    165 static void
    166 offtab_write_window(struct offtab *offtab)
    167 {
    168 	size_t window_bytes;
    169 	off_t window_pos;
    170 
    171 	assert(offtab->ot_mode == OFFTAB_MODE_WRITE);
    172 
    173 	offtab_compute_window_position(offtab, offtab->ot_window_start,
    174 	    &window_bytes, &window_pos);
    175 	const ssize_t n_written = pwrite(offtab->ot_fd, offtab->ot_window,
    176 	    window_bytes, window_pos);
    177 	if (n_written == -1)
    178 		err_ss(1, "write initial offset table");
    179 	assert(n_written >= 0);
    180 	if ((size_t)n_written != window_bytes)
    181 		errx_ss(1, "partial write of initial offset bytes: %zu <= %zu",
    182 		    (size_t)n_written,
    183 		    window_bytes);
    184 }
    185 
    186 static void
    187 offtab_maybe_write_window(struct offtab *offtab, uint32_t start, uint32_t end)
    188 {
    189 
    190 	/* Don't bother if [start, end) does not cover our window.  */
    191 	if (end <= offtab->ot_window_start)
    192 		return;
    193 	if (offtab_current_window_end(offtab) < start)
    194 		return;
    195 
    196 	offtab_write_window(offtab);
    197 }
    198 
    199 /*
    201  * Initialize an offtab to support the specified number of offsets read
    202  * to or written from fd at byte position fdpos.
    203  */
    204 void
    205 offtab_init(struct offtab *offtab, uint32_t n_offsets, uint32_t window_size,
    206     int fd, off_t fdpos)
    207 {
    208 
    209 	assert(offtab != NULL);
    210 	assert(0 < n_offsets);
    211 	assert(0 <= fd);
    212 	assert(0 <= fdpos);
    213 
    214 	offtab->ot_n_offsets = n_offsets;
    215 	if ((window_size == 0) || (n_offsets < window_size))
    216 		offtab->ot_window_size = n_offsets;
    217 	else
    218 		offtab->ot_window_size = window_size;
    219 	assert(offtab->ot_window_size <= offtab->ot_n_offsets);
    220 	offtab->ot_window_start = (uint32_t)-1;
    221 	__CTASSERT(MAX_WINDOW_SIZE <= (SIZE_MAX / sizeof(uint64_t)));
    222 	offtab->ot_window = malloc(offtab->ot_window_size * sizeof(uint64_t));
    223 	if (offtab->ot_window == NULL)
    224 		err(1, "malloc offset table");
    225 	offtab->ot_blkno = (uint32_t)-1;
    226 	offtab->ot_fd = fd;
    227 	offtab->ot_fdpos = fdpos;
    228 	offtab->ot_report = &offtab_bug;
    229 	offtab->ot_reportx = &offtab_bugx;
    230 	offtab->ot_mode = OFFTAB_MODE_NONE;
    231 }
    232 
    233 /*
    234  * Destroy an offtab.
    235  */
    236 void
    237 offtab_destroy(struct offtab *offtab)
    238 {
    239 
    240 	free(offtab->ot_window);
    241 }
    242 
    243 /*
    244  * For an offtab that has been used to read data from disk, convert it
    245  * to an offtab that can be used to write subsequent data to disk.
    246  * blkno is the last valid blkno read from disk.
    247  */
    248 bool
    249 offtab_transmogrify_read_to_write(struct offtab *offtab, uint32_t blkno)
    250 {
    251 
    252 	assert(offtab->ot_mode == OFFTAB_MODE_READ);
    253 	assert(0 < blkno);
    254 
    255 	if (!offtab_maybe_read_window(offtab, blkno, OFFTAB_READ_SEEK))
    256 		return false;
    257 
    258 	offtab->ot_mode = OFFTAB_MODE_WRITE;
    259 	offtab->ot_blkno = blkno;
    260 
    261 	return true;
    262 }
    263 
    264 /*
    266  * Reset an offtab for reading an offset table from the beginning.
    267  * Initializes in-memory state and may read data from offtab->ot_fd,
    268  * which must currently be at byte position offtab->ot_fdpos.  Failure
    269  * will be reported by the report/reportx routines, which are called
    270  * like warn/warnx.  May fail; returns true on success, false on
    271  * failure.
    272  *
    273  * This almost has copypasta of offtab_prepare_get, but this uses read,
    274  * rather than pread, so that it will work on nonseekable input if the
    275  * window is the whole offset table.
    276  */
    277 bool
    278 offtab_reset_read(struct offtab *offtab,
    279     void (*report)(const char *, ...) __printflike(1,2),
    280     void (*reportx)(const char *, ...) __printflike(1,2))
    281 {
    282 
    283 	assert((lseek(offtab->ot_fd, 0, SEEK_CUR) == offtab->ot_fdpos) ||
    284 	    ((lseek(offtab->ot_fd, 0, SEEK_CUR) == -1) && (errno == ESPIPE)));
    285 
    286 	offtab->ot_report = report;
    287 	offtab->ot_reportx = reportx;
    288 	offtab->ot_mode = OFFTAB_MODE_READ;
    289 	offtab->ot_blkno = (uint32_t)-1;
    290 
    291 	if (!offtab_read_window(offtab, 0, OFFTAB_READ_NOSEEK))
    292 		return false;
    293 
    294 	if (offtab->ot_window_size < offtab->ot_n_offsets) {
    295 		__CTASSERT(MAX_N_OFFSETS <= (OFF_MAX / sizeof(uint64_t)));
    296 		const off_t offtab_bytes = ((off_t)offtab->ot_n_offsets *
    297 		    (off_t)sizeof(uint64_t));
    298 		assert(offtab->ot_fdpos <= (OFF_MAX - offtab_bytes));
    299 		const off_t first_offset = (offtab->ot_fdpos + offtab_bytes);
    300 		if (lseek(offtab->ot_fd, first_offset, SEEK_SET) == -1) {
    301 			(*offtab->ot_report)("lseek to first offset 0x%"PRIx64,
    302 			    first_offset);
    303 			return false;
    304 		}
    305 	}
    306 
    307 	return true;
    308 }
    309 
    310 /*
    311  * Do any I/O or bookkeeping necessary to fetch the offset for blkno in
    312  * preparation for a call to offtab_get.  May fail; returns true on
    313  * success, false on failure.
    314  */
    315 bool
    316 offtab_prepare_get(struct offtab *offtab, uint32_t blkno)
    317 {
    318 
    319 	assert(offtab->ot_mode == OFFTAB_MODE_READ);
    320 	assert(blkno < offtab->ot_n_offsets);
    321 
    322 	if (!offtab_maybe_read_window(offtab, blkno, OFFTAB_READ_SEEK))
    323 		return false;
    324 
    325 	assert(offtab->ot_window_start <= blkno);
    326 	assert(blkno < offtab_current_window_end(offtab));
    327 
    328 	offtab->ot_blkno = blkno;
    329 	return true;
    330 }
    331 
    332 /*
    333  * Return the offset for blkno.  Caller must have called
    334  * offtab_prepare_get beforehand.
    335  */
    336 uint64_t
    337 offtab_get(struct offtab *offtab, uint32_t blkno)
    338 {
    339 
    340 	assert(offtab->ot_mode == OFFTAB_MODE_READ);
    341 	assert(blkno == offtab->ot_blkno);
    342 	assert(offtab->ot_window_start <= blkno);
    343 	assert(blkno < offtab_current_window_end(offtab));
    344 
    345 	return be64toh(offtab->ot_window[blkno - offtab->ot_window_start]);
    346 }
    347 
    348 /*
    350  * Reset offtab for writing a fresh offset table.  Initializes
    351  * in-memory state and writes an empty offset table to offtab->ot_fd,
    352  * which must currently be at byte position offtab->ot_fdpos.  May
    353  * fail; returns on success, aborts with err(3) on failure.
    354  */
    355 void
    356 offtab_reset_write(struct offtab *offtab)
    357 {
    358 	uint32_t i;
    359 
    360 	assert(lseek(offtab->ot_fd, 0, SEEK_CUR) == offtab->ot_fdpos);
    361 
    362 	offtab->ot_mode = OFFTAB_MODE_WRITE;
    363 	offtab->ot_blkno = (uint32_t)-1;
    364 
    365 	/*
    366 	 * Initialize the offset table to all ones (except for the
    367 	 * fixed first offset) so that we can easily detect where we
    368 	 * were interrupted if we want to restart.
    369 	 */
    370 	__CTASSERT(MAX_N_OFFSETS <= UINT32_MAX);
    371 	assert(offtab->ot_n_offsets > 0);
    372 
    373 	for (i = 0; i < offtab->ot_window_size; i++)
    374 		offtab->ot_window[i] = ~(uint64_t)0;
    375 
    376 	const uint32_t n_windows =
    377 	    howmany(offtab->ot_n_offsets, offtab->ot_window_size);
    378 	for (i = 1; i < n_windows; i++) {
    379 		/* Change the start but reuse the all-ones buffer.  */
    380 		offtab->ot_window_start = (i * offtab->ot_window_size);
    381 		offtab_write_window(offtab);
    382 	}
    383 
    384 	offtab->ot_window_start = 0;
    385 	__CTASSERT(MAX_N_OFFSETS <=
    386 	    (MIN(OFF_MAX, UINT64_MAX) / sizeof(uint64_t)));
    387 	const off_t offtab_bytes = ((off_t)offtab->ot_n_offsets *
    388 	    sizeof(uint64_t));
    389 	assert(offtab->ot_fdpos <=
    390 	    ((off_t)MIN(OFF_MAX, UINT64_MAX) - offtab_bytes));
    391 	const off_t first_offset = (offtab->ot_fdpos + offtab_bytes);
    392 	assert(first_offset <= (off_t)MIN(OFF_MAX, UINT64_MAX));
    393 	offtab->ot_window[0] = htobe64((uint64_t)first_offset);
    394 	offtab_write_window(offtab);
    395 
    396 	if (lseek(offtab->ot_fd, first_offset, SEEK_SET) == -1)
    397 		err(1, "lseek to first offset failed");
    398 }
    399 
    400 /*
    401  * Guarantee that the disk reflects block offsets [0, n_offsets).  If
    402  * OFFTAB_CHECKPOINT_SYNC is set in flags, will also fsync the entire
    403  * offset table.  May fail; returns on success, aborts with err(3) on
    404  * failure.  Fsync failure is considered success but is reported with a
    405  * warning.
    406  *
    407  * This routine does not write state in memory, and does not read state
    408  * that is not signal-safe.  The only state read is offtab->ot_window,
    409  * offtab->ot_window_start, and quantities that are static for the
    410  * signal-interruptable existence of the offset table.
    411  */
    412 void
    413 offtab_checkpoint(struct offtab *offtab, uint32_t n_offsets, int flags)
    414 {
    415 
    416 	assert(offtab->ot_mode == OFFTAB_MODE_WRITE);
    417 	assert(n_offsets <= offtab->ot_n_offsets);
    418 
    419 	/*
    420 	 * Write the window unless we just did that and were
    421 	 * interrupted before we could move the window.
    422 	 */
    423 	if (offtab->ot_window != NULL)
    424 		offtab_maybe_write_window(offtab, 0, n_offsets);
    425 
    426 	if (ISSET(flags, OFFTAB_CHECKPOINT_SYNC)) {
    427 		__CTASSERT(MAX_N_OFFSETS <= (OFF_MAX / sizeof(uint64_t)));
    428 		const off_t sync_bytes = ((off_t)n_offsets *
    429 		    (off_t)sizeof(uint64_t));
    430 		assert(offtab->ot_fdpos <= (OFF_MAX - sync_bytes));
    431 		if (fsync_range(offtab->ot_fd, (FFILESYNC | FDISKSYNC),
    432 			offtab->ot_fdpos, (offtab->ot_fdpos + sync_bytes))
    433 		    == -1)
    434 			warn_ss("fsync of offset table failed");
    435 	}
    436 }
    437 
    438 /*
    439  * Do any I/O or bookkeeping necessary to set an offset for blkno.  May
    440  * fail; returns on success, aborts with err(3) on failure.
    441  */
    442 void
    443 offtab_prepare_put(struct offtab *offtab, uint32_t blkno)
    444 {
    445 	uint32_t i;
    446 
    447 	assert(offtab->ot_mode == OFFTAB_MODE_WRITE);
    448 	assert(blkno < offtab->ot_n_offsets);
    449 
    450 	/*
    451 	 * Assume, for convenience, that we write blocks in order.
    452 	 * Thus we need not do another read -- we can just clear the
    453 	 * window.
    454 	 */
    455 	assert((offtab->ot_blkno == (uint32_t)-1) ||
    456 	    ((offtab->ot_blkno + 1) == blkno));
    457 
    458 	/* If it's already in our window, we're good to go.  */
    459 	if ((offtab->ot_window_start <= blkno) &&
    460 	    (blkno < offtab_current_window_end(offtab)))
    461 		goto win;
    462 
    463 	/* Otherwise, write out the current window and choose a new one.  */
    464 	offtab_write_window(offtab);
    465 
    466 	assert(offtab->ot_window_size <= blkno);
    467 	assert(offtab->ot_window_start == (blkno - offtab->ot_window_size));
    468 	assert((offtab->ot_window_start + offtab->ot_window_size) ==
    469 	    rounddown(blkno, offtab->ot_window_size));
    470 
    471     {
    472 	uint64_t *window;
    473 	sigset_t sigmask;
    474 
    475 	/*
    476 	 * Mark the window as being updated so nobody tries to write it
    477 	 * (since we just wrote it) while we fill it with ones.
    478 	 */
    479 	block_signals(&sigmask);
    480 	window = offtab->ot_window;
    481 	offtab->ot_window = NULL;
    482 	restore_sigmask(&sigmask);
    483 
    484 	/* Fill the window with ones.  */
    485 	for (i = 0; i < offtab_current_window_size(offtab); i++)
    486 		window[i] = ~(uint64_t)0;
    487 
    488 	/* Restore the window as ready again.  */
    489 	block_signals(&sigmask);
    490 	offtab->ot_window = window;
    491 	offtab->ot_window_start = rounddown(blkno, offtab->ot_window_size);
    492 	restore_sigmask(&sigmask);
    493     }
    494 
    495 win:	assert(offtab->ot_window_start <= blkno);
    496 	assert(blkno < offtab_current_window_end(offtab));
    497 
    498 	offtab->ot_blkno = blkno;
    499 }
    500 
    501 /*
    502  * Actually set the offset for blkno.
    503  */
    504 void
    505 offtab_put(struct offtab *offtab, uint32_t blkno, uint64_t offset)
    506 {
    507 
    508 	assert(offtab->ot_mode == OFFTAB_MODE_WRITE);
    509 	assert(blkno == offtab->ot_blkno);
    510 	assert(offtab->ot_window_start <= blkno);
    511 	assert(blkno < offtab_current_window_end(offtab));
    512 
    513 	offtab->ot_window[blkno - offtab->ot_window_start] = htobe64(offset);
    514 }
    515