offtab.c revision 1.13.4.2 1 /* $NetBSD: offtab.c,v 1.13.4.2 2014/05/22 11:42:51 yamt Exp $ */
2
3 /*-
4 * Copyright (c) 2014 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Taylor R. Campbell.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 #include <sys/cdefs.h>
33 __RCSID("$NetBSD: offtab.c,v 1.13.4.2 2014/05/22 11:42:51 yamt Exp $");
34
35 #include <sys/types.h>
36 #include <sys/endian.h>
37
38 #include <assert.h>
39 #include <err.h>
40 #include <errno.h>
41 #include <inttypes.h>
42 #include <limits.h>
43 #include <stdbool.h>
44 #include <stdlib.h>
45 #include <unistd.h>
46
47 #include "common.h"
48 #include "utils.h"
49
50 #include "offtab.h"
51
52 static void __printflike(1,2) __dead
53 offtab_bug(const char *fmt, ...)
54 {
55
56 errx(1, "bug in offtab, please report");
57 }
58
59 static void __printflike(1,2) __dead
60 offtab_bugx(const char *fmt, ...)
61 {
62
63 errx(1, "bug in offtab, please report");
64 }
65
66 static uint32_t
67 offtab_compute_window_size(struct offtab *offtab, uint32_t start)
68 {
69
70 assert(start < offtab->ot_n_offsets);
71 return MIN(offtab->ot_window_size, (offtab->ot_n_offsets - start));
72 }
73
74 static uint32_t
75 offtab_current_window_size(struct offtab *offtab)
76 {
77
78 return offtab_compute_window_size(offtab, offtab->ot_window_start);
79 }
80
81 static uint32_t
82 offtab_current_window_end(struct offtab *offtab)
83 {
84
85 assert(offtab->ot_window_start < offtab->ot_n_offsets);
86 assert(offtab_current_window_size(offtab) <=
87 (offtab->ot_n_offsets - offtab->ot_window_start));
88 return (offtab->ot_window_start + offtab_current_window_size(offtab));
89 }
90
91 static void
92 offtab_compute_window_position(struct offtab *offtab, uint32_t window_start,
93 size_t *bytes, off_t *pos)
94 {
95 const uint32_t window_size = offtab_compute_window_size(offtab,
96 window_start);
97
98 __CTASSERT(MAX_WINDOW_SIZE <= (SIZE_MAX / sizeof(uint64_t)));
99 *bytes = (window_size * sizeof(uint64_t));
100
101 assert(window_start <= offtab->ot_n_offsets);
102 __CTASSERT(MAX_N_OFFSETS <= (OFF_MAX / sizeof(uint64_t)));
103 const off_t window_offset = ((off_t)window_start *
104 (off_t)sizeof(uint64_t));
105
106 /* XXX This assertion is not justified. */
107 assert(offtab->ot_fdpos <= (OFF_MAX - window_offset));
108 *pos = (offtab->ot_fdpos + window_offset);
109 }
110
/*
 * Flags for offtab_read_window and offtab_maybe_read_window.
 *
 * OFFTAB_READ_SEEK: fetch the window with pread_block at the window's
 * computed file position.
 * OFFTAB_READ_NOSEEK: fetch the window with read_block from wherever
 * the file descriptor currently is.
 */
#define OFFTAB_READ_SEEK 0x01
#define OFFTAB_READ_NOSEEK 0x00
113
114 static bool
115 offtab_read_window(struct offtab *offtab, uint32_t blkno, int read_flags)
116 {
117 const uint32_t window_start = rounddown(blkno, offtab->ot_window_size);
118 size_t window_bytes;
119 off_t window_pos;
120
121 assert(offtab->ot_mode == OFFTAB_MODE_READ);
122 assert(ISSET(read_flags, OFFTAB_READ_SEEK) ||
123 (lseek(offtab->ot_fd, 0, SEEK_CUR) == offtab->ot_fdpos) ||
124 ((lseek(offtab->ot_fd, 0, SEEK_CUR) == -1) && (errno == ESPIPE)));
125
126 offtab_compute_window_position(offtab, window_start,
127 &window_bytes, &window_pos);
128 const ssize_t n_read = (ISSET(read_flags, OFFTAB_READ_SEEK)
129 ? pread_block(offtab->ot_fd, offtab->ot_window, window_bytes,
130 window_pos)
131 : read_block(offtab->ot_fd, offtab->ot_window, window_bytes));
132 if (n_read == -1) {
133 (*offtab->ot_report)("read offset table at %"PRIuMAX,
134 (uintmax_t)window_pos);
135 return false;
136 }
137 assert(n_read >= 0);
138 if ((size_t)n_read != window_bytes) {
139 (*offtab->ot_reportx)("partial read of offset table"
140 " at %"PRIuMAX": %zu != %zu",
141 (uintmax_t)window_pos, (size_t)n_read, window_bytes);
142 return false;
143 }
144
145 offtab->ot_window_start = window_start;
146
147 return true;
148 }
149
150 static bool
151 offtab_maybe_read_window(struct offtab *offtab, uint32_t blkno, int read_flags)
152 {
153
154 /* Don't bother if blkno is already in the window. */
155 if ((offtab->ot_window_start <= blkno) &&
156 (blkno < offtab_current_window_end(offtab)))
157 return true;
158
159 if (!offtab_read_window(offtab, blkno, read_flags))
160 return false;
161
162 return true;
163 }
164
165 static void
166 offtab_write_window(struct offtab *offtab)
167 {
168 size_t window_bytes;
169 off_t window_pos;
170
171 assert(offtab->ot_mode == OFFTAB_MODE_WRITE);
172
173 offtab_compute_window_position(offtab, offtab->ot_window_start,
174 &window_bytes, &window_pos);
175 const ssize_t n_written = pwrite(offtab->ot_fd, offtab->ot_window,
176 window_bytes, window_pos);
177 if (n_written == -1)
178 err_ss(1, "write initial offset table");
179 assert(n_written >= 0);
180 if ((size_t)n_written != window_bytes)
181 errx_ss(1, "partial write of initial offset bytes: %zu <= %zu",
182 (size_t)n_written,
183 window_bytes);
184 }
185
186 static void
187 offtab_maybe_write_window(struct offtab *offtab, uint32_t start, uint32_t end)
188 {
189
190 /* Don't bother if [start, end) does not cover our window. */
191 if (end <= offtab->ot_window_start)
192 return;
193 if (offtab_current_window_end(offtab) < start)
194 return;
195
196 offtab_write_window(offtab);
197 }
198
199 /*
201 * Initialize an offtab to support the specified number of offsets read
202 * to or written from fd at byte position fdpos.
203 */
204 void
205 offtab_init(struct offtab *offtab, uint32_t n_offsets, uint32_t window_size,
206 int fd, off_t fdpos)
207 {
208
209 assert(offtab != NULL);
210 assert(0 < n_offsets);
211 assert(0 <= fd);
212 assert(0 <= fdpos);
213
214 offtab->ot_n_offsets = n_offsets;
215 if ((window_size == 0) || (n_offsets < window_size))
216 offtab->ot_window_size = n_offsets;
217 else
218 offtab->ot_window_size = window_size;
219 assert(offtab->ot_window_size <= offtab->ot_n_offsets);
220 offtab->ot_window_start = (uint32_t)-1;
221 __CTASSERT(MAX_WINDOW_SIZE <= (SIZE_MAX / sizeof(uint64_t)));
222 offtab->ot_window = malloc(offtab->ot_window_size * sizeof(uint64_t));
223 if (offtab->ot_window == NULL)
224 err(1, "malloc offset table");
225 offtab->ot_blkno = (uint32_t)-1;
226 offtab->ot_fd = fd;
227 offtab->ot_fdpos = fdpos;
228 offtab->ot_report = &offtab_bug;
229 offtab->ot_reportx = &offtab_bugx;
230 offtab->ot_mode = OFFTAB_MODE_NONE;
231 }
232
233 /*
234 * Destroy an offtab.
235 */
236 void
237 offtab_destroy(struct offtab *offtab)
238 {
239
240 free(offtab->ot_window);
241 }
242
243 /*
244 * For an offtab that has been used to read data from disk, convert it
245 * to an offtab that can be used to write subsequent data to disk.
246 * blkno is the last valid blkno read from disk.
247 */
248 bool
249 offtab_transmogrify_read_to_write(struct offtab *offtab, uint32_t blkno)
250 {
251
252 assert(offtab->ot_mode == OFFTAB_MODE_READ);
253 assert(0 < blkno);
254
255 if (!offtab_maybe_read_window(offtab, blkno, OFFTAB_READ_SEEK))
256 return false;
257
258 offtab->ot_mode = OFFTAB_MODE_WRITE;
259 offtab->ot_blkno = blkno;
260
261 return true;
262 }
263
264 /*
266 * Reset an offtab for reading an offset table from the beginning.
267 * Initializes in-memory state and may read data from offtab->ot_fd,
268 * which must currently be at byte position offtab->ot_fdpos. Failure
269 * will be reported by the report/reportx routines, which are called
270 * like warn/warnx. May fail; returns true on success, false on
271 * failure.
272 *
273 * This almost has copypasta of offtab_prepare_get, but this uses read,
274 * rather than pread, so that it will work on nonseekable input if the
275 * window is the whole offset table.
276 */
277 bool
278 offtab_reset_read(struct offtab *offtab,
279 void (*report)(const char *, ...) __printflike(1,2),
280 void (*reportx)(const char *, ...) __printflike(1,2))
281 {
282
283 assert((lseek(offtab->ot_fd, 0, SEEK_CUR) == offtab->ot_fdpos) ||
284 ((lseek(offtab->ot_fd, 0, SEEK_CUR) == -1) && (errno == ESPIPE)));
285
286 offtab->ot_report = report;
287 offtab->ot_reportx = reportx;
288 offtab->ot_mode = OFFTAB_MODE_READ;
289 offtab->ot_blkno = (uint32_t)-1;
290
291 if (!offtab_read_window(offtab, 0, OFFTAB_READ_NOSEEK))
292 return false;
293
294 if (offtab->ot_window_size < offtab->ot_n_offsets) {
295 __CTASSERT(MAX_N_OFFSETS <= (OFF_MAX / sizeof(uint64_t)));
296 const off_t offtab_bytes = ((off_t)offtab->ot_n_offsets *
297 (off_t)sizeof(uint64_t));
298 assert(offtab->ot_fdpos <= (OFF_MAX - offtab_bytes));
299 const off_t first_offset = (offtab->ot_fdpos + offtab_bytes);
300 if (lseek(offtab->ot_fd, first_offset, SEEK_SET) == -1) {
301 (*offtab->ot_report)("lseek to first offset 0x%"PRIx64,
302 first_offset);
303 return false;
304 }
305 }
306
307 return true;
308 }
309
310 /*
311 * Do any I/O or bookkeeping necessary to fetch the offset for blkno in
312 * preparation for a call to offtab_get. May fail; returns true on
313 * success, false on failure.
314 */
315 bool
316 offtab_prepare_get(struct offtab *offtab, uint32_t blkno)
317 {
318
319 assert(offtab->ot_mode == OFFTAB_MODE_READ);
320 assert(blkno < offtab->ot_n_offsets);
321
322 if (!offtab_maybe_read_window(offtab, blkno, OFFTAB_READ_SEEK))
323 return false;
324
325 assert(offtab->ot_window_start <= blkno);
326 assert(blkno < offtab_current_window_end(offtab));
327
328 offtab->ot_blkno = blkno;
329 return true;
330 }
331
332 /*
333 * Return the offset for blkno. Caller must have called
334 * offtab_prepare_get beforehand.
335 */
336 uint64_t
337 offtab_get(struct offtab *offtab, uint32_t blkno)
338 {
339
340 assert(offtab->ot_mode == OFFTAB_MODE_READ);
341 assert(blkno == offtab->ot_blkno);
342 assert(offtab->ot_window_start <= blkno);
343 assert(blkno < offtab_current_window_end(offtab));
344
345 return be64toh(offtab->ot_window[blkno - offtab->ot_window_start]);
346 }
347
348 /*
350 * Reset offtab for writing a fresh offset table. Initializes
351 * in-memory state and writes an empty offset table to offtab->ot_fd,
352 * which must currently be at byte position offtab->ot_fdpos. May
353 * fail; returns on success, aborts with err(3) on failure.
354 */
355 void
356 offtab_reset_write(struct offtab *offtab)
357 {
358 uint32_t i;
359
360 assert(lseek(offtab->ot_fd, 0, SEEK_CUR) == offtab->ot_fdpos);
361
362 offtab->ot_mode = OFFTAB_MODE_WRITE;
363 offtab->ot_blkno = (uint32_t)-1;
364
365 /*
366 * Initialize the offset table to all ones (except for the
367 * fixed first offset) so that we can easily detect where we
368 * were interrupted if we want to restart.
369 */
370 __CTASSERT(MAX_N_OFFSETS <= UINT32_MAX);
371 assert(offtab->ot_n_offsets > 0);
372
373 for (i = 0; i < offtab->ot_window_size; i++)
374 offtab->ot_window[i] = ~(uint64_t)0;
375
376 const uint32_t n_windows =
377 howmany(offtab->ot_n_offsets, offtab->ot_window_size);
378 for (i = 1; i < n_windows; i++) {
379 /* Change the start but reuse the all-ones buffer. */
380 offtab->ot_window_start = (i * offtab->ot_window_size);
381 offtab_write_window(offtab);
382 }
383
384 offtab->ot_window_start = 0;
385 __CTASSERT(MAX_N_OFFSETS <=
386 (MIN(OFF_MAX, UINT64_MAX) / sizeof(uint64_t)));
387 const off_t offtab_bytes = ((off_t)offtab->ot_n_offsets *
388 sizeof(uint64_t));
389 assert(offtab->ot_fdpos <=
390 ((off_t)MIN(OFF_MAX, UINT64_MAX) - offtab_bytes));
391 const off_t first_offset = (offtab->ot_fdpos + offtab_bytes);
392 assert(first_offset <= (off_t)MIN(OFF_MAX, UINT64_MAX));
393 offtab->ot_window[0] = htobe64((uint64_t)first_offset);
394 offtab_write_window(offtab);
395
396 if (lseek(offtab->ot_fd, first_offset, SEEK_SET) == -1)
397 err(1, "lseek to first offset failed");
398 }
399
400 /*
401 * Guarantee that the disk reflects block offsets [0, n_offsets). If
402 * OFFTAB_CHECKPOINT_SYNC is set in flags, will also fsync the entire
403 * offset table. May fail; returns on success, aborts with err(3) on
404 * failure. Fsync failure is considered success but is reported with a
405 * warning.
406 *
407 * This routine does not write state in memory, and does not read state
408 * that is not signal-safe. The only state read is offtab->ot_window,
409 * offtab->ot_window_start, and quantities that are static for the
410 * signal-interruptable existence of the offset table.
411 */
412 void
413 offtab_checkpoint(struct offtab *offtab, uint32_t n_offsets, int flags)
414 {
415
416 assert(offtab->ot_mode == OFFTAB_MODE_WRITE);
417 assert(n_offsets <= offtab->ot_n_offsets);
418
419 /*
420 * Write the window unless we just did that and were
421 * interrupted before we could move the window.
422 */
423 if (offtab->ot_window != NULL)
424 offtab_maybe_write_window(offtab, 0, n_offsets);
425
426 if (ISSET(flags, OFFTAB_CHECKPOINT_SYNC)) {
427 __CTASSERT(MAX_N_OFFSETS <= (OFF_MAX / sizeof(uint64_t)));
428 const off_t sync_bytes = ((off_t)n_offsets *
429 (off_t)sizeof(uint64_t));
430 assert(offtab->ot_fdpos <= (OFF_MAX - sync_bytes));
431 if (fsync_range(offtab->ot_fd, (FFILESYNC | FDISKSYNC),
432 offtab->ot_fdpos, (offtab->ot_fdpos + sync_bytes))
433 == -1)
434 warn_ss("fsync of offset table failed");
435 }
436 }
437
438 /*
439 * Do any I/O or bookkeeping necessary to set an offset for blkno. May
440 * fail; returns on success, aborts with err(3) on failure.
441 */
442 void
443 offtab_prepare_put(struct offtab *offtab, uint32_t blkno)
444 {
445 uint32_t i;
446
447 assert(offtab->ot_mode == OFFTAB_MODE_WRITE);
448 assert(blkno < offtab->ot_n_offsets);
449
450 /*
451 * Assume, for convenience, that we write blocks in order.
452 * Thus we need not do another read -- we can just clear the
453 * window.
454 */
455 assert((offtab->ot_blkno == (uint32_t)-1) ||
456 ((offtab->ot_blkno + 1) == blkno));
457
458 /* If it's already in our window, we're good to go. */
459 if ((offtab->ot_window_start <= blkno) &&
460 (blkno < offtab_current_window_end(offtab)))
461 goto win;
462
463 /* Otherwise, write out the current window and choose a new one. */
464 offtab_write_window(offtab);
465
466 assert(offtab->ot_window_size <= blkno);
467 assert(offtab->ot_window_start == (blkno - offtab->ot_window_size));
468 assert((offtab->ot_window_start + offtab->ot_window_size) ==
469 rounddown(blkno, offtab->ot_window_size));
470
471 {
472 uint64_t *window;
473 sigset_t sigmask;
474
475 /*
476 * Mark the window as being updated so nobody tries to write it
477 * (since we just wrote it) while we fill it with ones.
478 */
479 block_signals(&sigmask);
480 window = offtab->ot_window;
481 offtab->ot_window = NULL;
482 restore_sigmask(&sigmask);
483
484 /* Fill the window with ones. */
485 for (i = 0; i < offtab_current_window_size(offtab); i++)
486 window[i] = ~(uint64_t)0;
487
488 /* Restore the window as ready again. */
489 block_signals(&sigmask);
490 offtab->ot_window = window;
491 offtab->ot_window_start = rounddown(blkno, offtab->ot_window_size);
492 restore_sigmask(&sigmask);
493 }
494
495 win: assert(offtab->ot_window_start <= blkno);
496 assert(blkno < offtab_current_window_end(offtab));
497
498 offtab->ot_blkno = blkno;
499 }
500
501 /*
502 * Actually set the offset for blkno.
503 */
504 void
505 offtab_put(struct offtab *offtab, uint32_t blkno, uint64_t offset)
506 {
507
508 assert(offtab->ot_mode == OFFTAB_MODE_WRITE);
509 assert(blkno == offtab->ot_blkno);
510 assert(offtab->ot_window_start <= blkno);
511 assert(blkno < offtab_current_window_end(offtab));
512
513 offtab->ot_window[blkno - offtab->ot_window_start] = htobe64(offset);
514 }
515