1 /* $NetBSD: offtab.c,v 1.15 2017/07/29 21:04:07 riastradh Exp $ */ 2 3 /*- 4 * Copyright (c) 2014 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Taylor R. Campbell. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 #include <sys/cdefs.h> 33 __RCSID("$NetBSD: offtab.c,v 1.15 2017/07/29 21:04:07 riastradh Exp $"); 34 35 #include <sys/types.h> 36 #include <sys/endian.h> 37 38 #include <assert.h> 39 #include <err.h> 40 #include <errno.h> 41 #include <inttypes.h> 42 #include <limits.h> 43 #include <stdbool.h> 44 #include <stdlib.h> 45 #include <unistd.h> 46 47 #include "common.h" 48 #include "utils.h" 49 50 #include "offtab.h" 51 52 static void __printflike(1,2) __dead 53 offtab_bug(const char *fmt, ...) 54 { 55 56 errx(1, "bug in offtab, please report"); 57 } 58 59 static void __printflike(1,2) __dead 60 offtab_bugx(const char *fmt, ...) 61 { 62 63 errx(1, "bug in offtab, please report"); 64 } 65 66 static uint32_t 67 offtab_compute_window_size(struct offtab *offtab, uint32_t start) 68 { 69 70 assert(start < offtab->ot_n_offsets); 71 return MIN(offtab->ot_window_size, (offtab->ot_n_offsets - start)); 72 } 73 74 static uint32_t 75 offtab_current_window_size(struct offtab *offtab) 76 { 77 78 return offtab_compute_window_size(offtab, offtab->ot_window_start); 79 } 80 81 static uint32_t 82 offtab_current_window_end(struct offtab *offtab) 83 { 84 85 assert(offtab->ot_window_start < offtab->ot_n_offsets); 86 assert(offtab_current_window_size(offtab) <= 87 (offtab->ot_n_offsets - offtab->ot_window_start)); 88 return (offtab->ot_window_start + offtab_current_window_size(offtab)); 89 } 90 91 static void 92 offtab_compute_window_position(struct offtab *offtab, uint32_t window_start, 93 size_t *bytes, off_t *pos) 94 { 95 const uint32_t window_size = offtab_compute_window_size(offtab, 96 window_start); 97 98 __CTASSERT(MUL_OK(size_t, MAX_WINDOW_SIZE, sizeof(uint64_t))); 99 *bytes = (window_size * sizeof(uint64_t)); 100 101 assert(window_start <= offtab->ot_n_offsets); 102 __CTASSERT(MUL_OK(off_t, MAX_N_OFFSETS, sizeof(uint64_t))); 103 const off_t window_offset = ((off_t)window_start * 104 (off_t)sizeof(uint64_t)); 105 106 assert(offtab->ot_fdpos <= OFFTAB_MAX_FDPOS); 107 __CTASSERT(ADD_OK(off_t, OFFTAB_MAX_FDPOS, 108 (off_t)MAX_N_OFFSETS*sizeof(uint64_t))); 109 assert(ADD_OK(off_t, offtab->ot_fdpos, window_offset)); 110 *pos = (offtab->ot_fdpos + window_offset); 111 } 112 113 #define OFFTAB_READ_SEEK 0x01 114 #define OFFTAB_READ_NOSEEK 0x00 115 116 static bool 117 offtab_read_window(struct offtab *offtab, uint32_t blkno, int read_flags) 118 { 119 const uint32_t window_start = rounddown(blkno, offtab->ot_window_size); 120 size_t window_bytes; 121 off_t window_pos; 122 123 assert(offtab->ot_mode == OFFTAB_MODE_READ); 124 assert(ISSET(read_flags, OFFTAB_READ_SEEK) || 125 (lseek(offtab->ot_fd, 0, SEEK_CUR) == offtab->ot_fdpos) || 126 ((lseek(offtab->ot_fd, 0, SEEK_CUR) == -1) && (errno == ESPIPE))); 127 128 offtab_compute_window_position(offtab, window_start, 129 &window_bytes, &window_pos); 130 const ssize_t n_read = (ISSET(read_flags, OFFTAB_READ_SEEK) 131 ? pread_block(offtab->ot_fd, offtab->ot_window, window_bytes, 132 window_pos) 133 : read_block(offtab->ot_fd, offtab->ot_window, window_bytes)); 134 if (n_read == -1) { 135 (*offtab->ot_report)("read offset table at %"PRIuMAX, 136 (uintmax_t)window_pos); 137 return false; 138 } 139 assert(n_read >= 0); 140 if ((size_t)n_read != window_bytes) { 141 (*offtab->ot_reportx)("partial read of offset table" 142 " at %"PRIuMAX": %zu != %zu", 143 (uintmax_t)window_pos, (size_t)n_read, window_bytes); 144 return false; 145 } 146 147 offtab->ot_window_start = window_start; 148 149 return true; 150 } 151 152 static bool 153 offtab_maybe_read_window(struct offtab *offtab, uint32_t blkno, int read_flags) 154 { 155 156 /* Don't bother if blkno is already in the window. */ 157 if ((offtab->ot_window_start <= blkno) && 158 (blkno < offtab_current_window_end(offtab))) 159 return true; 160 161 if (!offtab_read_window(offtab, blkno, read_flags)) 162 return false; 163 164 return true; 165 } 166 167 static void 168 offtab_write_window(struct offtab *offtab) 169 { 170 size_t window_bytes; 171 off_t window_pos; 172 173 assert(offtab->ot_mode == OFFTAB_MODE_WRITE); 174 175 offtab_compute_window_position(offtab, offtab->ot_window_start, 176 &window_bytes, &window_pos); 177 const ssize_t n_written = pwrite(offtab->ot_fd, offtab->ot_window, 178 window_bytes, window_pos); 179 if (n_written == -1) 180 err_ss(1, "write initial offset table"); 181 assert(n_written >= 0); 182 if ((size_t)n_written != window_bytes) 183 errx_ss(1, "partial write of initial offset bytes: %zu <= %zu", 184 (size_t)n_written, 185 window_bytes); 186 } 187 188 static void 189 offtab_maybe_write_window(struct offtab *offtab, uint32_t start, uint32_t end) 190 { 191 192 /* Don't bother if [start, end) does not cover our window. */ 193 if (end <= offtab->ot_window_start) 194 return; 195 if (offtab_current_window_end(offtab) < start) 196 return; 197 198 offtab_write_window(offtab); 199 } 200 201 /* 203 * Initialize an offtab to support the specified number of offsets read 204 * to or written from fd at byte position fdpos. 205 */ 206 void 207 offtab_init(struct offtab *offtab, uint32_t n_offsets, uint32_t window_size, 208 int fd, off_t fdpos) 209 { 210 211 assert(offtab != NULL); 212 assert(0 < n_offsets); 213 assert(0 <= fd); 214 assert(0 <= fdpos); 215 assert(fdpos <= OFFTAB_MAX_FDPOS); 216 217 offtab->ot_n_offsets = n_offsets; 218 if ((window_size == 0) || (n_offsets < window_size)) 219 offtab->ot_window_size = n_offsets; 220 else 221 offtab->ot_window_size = window_size; 222 assert(offtab->ot_window_size <= offtab->ot_n_offsets); 223 offtab->ot_window_start = (uint32_t)-1; 224 __CTASSERT(MUL_OK(size_t, MAX_WINDOW_SIZE, sizeof(uint64_t))); 225 offtab->ot_window = malloc(offtab->ot_window_size * sizeof(uint64_t)); 226 if (offtab->ot_window == NULL) 227 err(1, "malloc offset table"); 228 offtab->ot_blkno = (uint32_t)-1; 229 offtab->ot_fd = fd; 230 offtab->ot_fdpos = fdpos; 231 offtab->ot_report = &offtab_bug; 232 offtab->ot_reportx = &offtab_bugx; 233 offtab->ot_mode = OFFTAB_MODE_NONE; 234 } 235 236 /* 237 * Destroy an offtab. 238 */ 239 void 240 offtab_destroy(struct offtab *offtab) 241 { 242 243 free(offtab->ot_window); 244 } 245 246 /* 247 * For an offtab that has been used to read data from disk, convert it 248 * to an offtab that can be used to write subsequent data to disk. 249 * blkno is the last valid blkno read from disk. 250 */ 251 bool 252 offtab_transmogrify_read_to_write(struct offtab *offtab, uint32_t blkno) 253 { 254 255 assert(offtab->ot_mode == OFFTAB_MODE_READ); 256 assert(0 < blkno); 257 258 if (!offtab_maybe_read_window(offtab, blkno, OFFTAB_READ_SEEK)) 259 return false; 260 261 offtab->ot_mode = OFFTAB_MODE_WRITE; 262 offtab->ot_blkno = blkno; 263 264 return true; 265 } 266 267 /* 269 * Reset an offtab for reading an offset table from the beginning. 270 * Initializes in-memory state and may read data from offtab->ot_fd, 271 * which must currently be at byte position offtab->ot_fdpos. Failure 272 * will be reported by the report/reportx routines, which are called 273 * like warn/warnx. May fail; returns true on success, false on 274 * failure. 275 * 276 * This almost has copypasta of offtab_prepare_get, but this uses read, 277 * rather than pread, so that it will work on nonseekable input if the 278 * window is the whole offset table. 279 */ 280 bool 281 offtab_reset_read(struct offtab *offtab, 282 void (*report)(const char *, ...) __printflike(1,2), 283 void (*reportx)(const char *, ...) __printflike(1,2)) 284 { 285 286 assert((lseek(offtab->ot_fd, 0, SEEK_CUR) == offtab->ot_fdpos) || 287 ((lseek(offtab->ot_fd, 0, SEEK_CUR) == -1) && (errno == ESPIPE))); 288 289 offtab->ot_report = report; 290 offtab->ot_reportx = reportx; 291 offtab->ot_mode = OFFTAB_MODE_READ; 292 offtab->ot_blkno = (uint32_t)-1; 293 294 if (!offtab_read_window(offtab, 0, OFFTAB_READ_NOSEEK)) 295 return false; 296 297 if (offtab->ot_window_size < offtab->ot_n_offsets) { 298 __CTASSERT(MUL_OK(off_t, MAX_N_OFFSETS, sizeof(uint64_t))); 299 const off_t offtab_bytes = ((off_t)offtab->ot_n_offsets * 300 (off_t)sizeof(uint64_t)); 301 assert(offtab->ot_fdpos <= OFFTAB_MAX_FDPOS); 302 __CTASSERT(ADD_OK(off_t, OFFTAB_MAX_FDPOS, 303 (off_t)MAX_N_OFFSETS*sizeof(uint64_t))); 304 assert(ADD_OK(off_t, offtab->ot_fdpos, offtab_bytes)); 305 const off_t first_offset = (offtab->ot_fdpos + offtab_bytes); 306 if (lseek(offtab->ot_fd, first_offset, SEEK_SET) == -1) { 307 (*offtab->ot_report)("lseek to first offset 0x%"PRIx64, 308 first_offset); 309 return false; 310 } 311 } 312 313 return true; 314 } 315 316 /* 317 * Do any I/O or bookkeeping necessary to fetch the offset for blkno in 318 * preparation for a call to offtab_get. May fail; returns true on 319 * success, false on failure. 320 */ 321 bool 322 offtab_prepare_get(struct offtab *offtab, uint32_t blkno) 323 { 324 325 assert(offtab->ot_mode == OFFTAB_MODE_READ); 326 assert(blkno < offtab->ot_n_offsets); 327 328 if (!offtab_maybe_read_window(offtab, blkno, OFFTAB_READ_SEEK)) 329 return false; 330 331 assert(offtab->ot_window_start <= blkno); 332 assert(blkno < offtab_current_window_end(offtab)); 333 334 offtab->ot_blkno = blkno; 335 return true; 336 } 337 338 /* 339 * Return the offset for blkno. Caller must have called 340 * offtab_prepare_get beforehand. 341 */ 342 uint64_t 343 offtab_get(struct offtab *offtab, uint32_t blkno) 344 { 345 346 assert(offtab->ot_mode == OFFTAB_MODE_READ); 347 assert(blkno == offtab->ot_blkno); 348 assert(offtab->ot_window_start <= blkno); 349 assert(blkno < offtab_current_window_end(offtab)); 350 351 return be64toh(offtab->ot_window[blkno - offtab->ot_window_start]); 352 } 353 354 /* 356 * Reset offtab for writing a fresh offset table. Initializes 357 * in-memory state and writes an empty offset table to offtab->ot_fd, 358 * which must currently be at byte position offtab->ot_fdpos. May 359 * fail; returns on success, aborts with err(3) on failure. 360 */ 361 void 362 offtab_reset_write(struct offtab *offtab) 363 { 364 uint32_t i; 365 366 assert(lseek(offtab->ot_fd, 0, SEEK_CUR) == offtab->ot_fdpos); 367 368 offtab->ot_mode = OFFTAB_MODE_WRITE; 369 offtab->ot_blkno = (uint32_t)-1; 370 371 /* 372 * Initialize the offset table to all ones (except for the 373 * fixed first offset) so that we can easily detect where we 374 * were interrupted if we want to restart. 375 */ 376 __CTASSERT(MAX_N_OFFSETS <= UINT32_MAX); 377 assert(offtab->ot_n_offsets > 0); 378 379 /* Initialize window of all ones. */ 380 for (i = 0; i < offtab->ot_window_size; i++) 381 offtab->ot_window[i] = ~(uint64_t)0; 382 383 /* Write the window to every position in the table. */ 384 const uint32_t n_windows = 385 howmany(offtab->ot_n_offsets, offtab->ot_window_size); 386 for (i = 1; i < n_windows; i++) { 387 /* Change the start but reuse the all-ones buffer. */ 388 offtab->ot_window_start = (i * offtab->ot_window_size); 389 offtab_write_window(offtab); 390 } 391 392 /* Compute the number of bytes in the offset table. */ 393 __CTASSERT(MUL_OK(off_t, MAX_N_OFFSETS, sizeof(uint64_t))); 394 const off_t offtab_bytes = ((off_t)offtab->ot_n_offsets * 395 sizeof(uint64_t)); 396 397 /* Compute the offset of the first block. */ 398 assert(offtab->ot_fdpos <= OFFTAB_MAX_FDPOS); 399 __CTASSERT(ADD_OK(off_t, OFFTAB_MAX_FDPOS, 400 MAX_N_OFFSETS*sizeof(uint64_t))); 401 assert(ADD_OK(off_t, offtab->ot_fdpos, offtab_bytes)); 402 const off_t first_offset = (offtab->ot_fdpos + offtab_bytes); 403 404 /* Assert that it fits in 64 bits. */ 405 __CTASSERT(MUL_OK(uint64_t, MAX_N_OFFSETS, sizeof(uint64_t))); 406 __CTASSERT(ADD_OK(uint64_t, OFFTAB_MAX_FDPOS, 407 (uint64_t)MAX_N_OFFSETS*sizeof(uint64_t))); 408 409 /* Write out the first window with the first offset. */ 410 offtab->ot_window_start = 0; 411 offtab->ot_window[0] = htobe64((uint64_t)first_offset); 412 offtab_write_window(offtab); 413 414 if (lseek(offtab->ot_fd, first_offset, SEEK_SET) == -1) 415 err(1, "lseek to first offset failed"); 416 } 417 418 /* 419 * Guarantee that the disk reflects block offsets [0, n_offsets). If 420 * OFFTAB_CHECKPOINT_SYNC is set in flags, will also fsync the entire 421 * offset table. May fail; returns on success, aborts with err(3) on 422 * failure. Fsync failure is considered success but is reported with a 423 * warning. 424 * 425 * This routine does not write state in memory, and does not read state 426 * that is not signal-safe. The only state read is offtab->ot_window, 427 * offtab->ot_window_start, and quantities that are static for the 428 * signal-interruptable existence of the offset table. 429 */ 430 void 431 offtab_checkpoint(struct offtab *offtab, uint32_t n_offsets, int flags) 432 { 433 434 assert(offtab->ot_mode == OFFTAB_MODE_WRITE); 435 assert(n_offsets <= offtab->ot_n_offsets); 436 437 /* 438 * Write the window unless we just did that and were 439 * interrupted before we could move the window. 440 */ 441 if (offtab->ot_window != NULL) 442 offtab_maybe_write_window(offtab, 0, n_offsets); 443 444 if (ISSET(flags, OFFTAB_CHECKPOINT_SYNC)) { 445 __CTASSERT(MUL_OK(off_t, MAX_N_OFFSETS, sizeof(uint64_t))); 446 const off_t sync_bytes = ((off_t)n_offsets * 447 (off_t)sizeof(uint64_t)); 448 __CTASSERT(ADD_OK(off_t, OFFTAB_MAX_FDPOS, 449 MAX_N_OFFSETS*sizeof(uint64_t))); 450 assert(ADD_OK(off_t, offtab->ot_fdpos, sync_bytes)); 451 if (fsync_range(offtab->ot_fd, (FFILESYNC | FDISKSYNC), 452 offtab->ot_fdpos, (offtab->ot_fdpos + sync_bytes)) 453 == -1) 454 warn_ss("fsync of offset table failed"); 455 } 456 } 457 458 /* 459 * Do any I/O or bookkeeping necessary to set an offset for blkno. May 460 * fail; returns on success, aborts with err(3) on failure. 461 */ 462 void 463 offtab_prepare_put(struct offtab *offtab, uint32_t blkno) 464 { 465 uint32_t i; 466 467 assert(offtab->ot_mode == OFFTAB_MODE_WRITE); 468 assert(blkno < offtab->ot_n_offsets); 469 470 /* 471 * Assume, for convenience, that we write blocks in order. 472 * Thus we need not do another read -- we can just clear the 473 * window. 474 */ 475 assert((offtab->ot_blkno == (uint32_t)-1) || 476 ((offtab->ot_blkno + 1) == blkno)); 477 478 /* If it's already in our window, we're good to go. */ 479 if ((offtab->ot_window_start <= blkno) && 480 (blkno < offtab_current_window_end(offtab))) 481 goto win; 482 483 /* Otherwise, write out the current window and choose a new one. */ 484 offtab_write_window(offtab); 485 486 assert(offtab->ot_window_size <= blkno); 487 assert(offtab->ot_window_start == (blkno - offtab->ot_window_size)); 488 assert((offtab->ot_window_start + offtab->ot_window_size) == 489 rounddown(blkno, offtab->ot_window_size)); 490 491 { 492 uint64_t *window; 493 sigset_t sigmask; 494 495 /* 496 * Mark the window as being updated so nobody tries to write it 497 * (since we just wrote it) while we fill it with ones. 498 */ 499 block_signals(&sigmask); 500 window = offtab->ot_window; 501 offtab->ot_window = NULL; 502 restore_sigmask(&sigmask); 503 504 /* Fill the window with ones. */ 505 for (i = 0; i < offtab_current_window_size(offtab); i++) 506 window[i] = ~(uint64_t)0; 507 508 /* Restore the window as ready again. */ 509 block_signals(&sigmask); 510 offtab->ot_window = window; 511 offtab->ot_window_start = rounddown(blkno, offtab->ot_window_size); 512 restore_sigmask(&sigmask); 513 } 514 515 win: assert(offtab->ot_window_start <= blkno); 516 assert(blkno < offtab_current_window_end(offtab)); 517 518 offtab->ot_blkno = blkno; 519 } 520 521 /* 522 * Actually set the offset for blkno. 523 */ 524 void 525 offtab_put(struct offtab *offtab, uint32_t blkno, uint64_t offset) 526 { 527 528 assert(offtab->ot_mode == OFFTAB_MODE_WRITE); 529 assert(blkno == offtab->ot_blkno); 530 assert(offtab->ot_window_start <= blkno); 531 assert(blkno < offtab_current_window_end(offtab)); 532 533 offtab->ot_window[blkno - offtab->ot_window_start] = htobe64(offset); 534 } 535