/*	$NetBSD: vfs_wapbl.c,v 1.85 2016/10/28 20:38:12 jdolecek Exp $	*/

/*-
 * Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * This implements file system independent write-ahead logging.
 */

#define WAPBL_INTERNAL

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.85 2016/10/28 20:38:12 jdolecek Exp $");

#include <sys/param.h>
#include <sys/bitops.h>
#include <sys/time.h>
#include <sys/wapbl.h>
#include <sys/wapbl_replay.h>

#ifdef _KERNEL

#include <sys/atomic.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/vnode.h>

#include <miscfs/specfs/specdev.h>

#define	wapbl_alloc(s) kmem_alloc((s), KM_SLEEP)
#define	wapbl_free(a, s) kmem_free((a), (s))
#define	wapbl_calloc(n, s) kmem_zalloc((n)*(s), KM_SLEEP)

static struct sysctllog *wapbl_sysctl;
static int wapbl_flush_disk_cache = 1;
static int wapbl_verbose_commit = 0;

static inline size_t wapbl_space_free(size_t, off_t, off_t);

#else /* !_KERNEL */

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define	KDASSERT(x) assert(x)
#define	KASSERT(x) assert(x)
#define	wapbl_alloc(s) malloc(s)
#define	wapbl_free(a, s) free(a)
#define	wapbl_calloc(n, s) calloc((n), (s))

#endif /* !_KERNEL */

/*
 * INTERNAL DATA STRUCTURES
 */

/*
 * This structure holds per-mount log information.
 *
 * Legend:	a = atomic access only
 *		r = read-only after init
 *		l = rwlock held
 *		m = mutex held
 *		lm = rwlock held writing or mutex held
 *		u = unlocked access ok
 *		b = bufcache_lock held
 */
LIST_HEAD(wapbl_ino_head, wapbl_ino);
struct wapbl {
	struct vnode *wl_logvp;	/* r:	log here */
	struct vnode *wl_devvp;	/* r:	log on this device */
	struct mount *wl_mount;	/* r:	mountpoint wl is associated with */
	daddr_t wl_logpbn;	/* r:	Physical block number of start of log */
	int wl_log_dev_bshift;	/* r:	logarithm of device block size of log
					   device */
	int wl_fs_dev_bshift;	/* r:	logarithm of device block size of
					   filesystem device */

	unsigned wl_lock_count;	/* m:	Count of transactions in progress */

	size_t wl_circ_size;	/* r:	Number of bytes in buffer of log */
	size_t wl_circ_off;	/* r:	Number of bytes reserved at start */

	size_t wl_bufcount_max;	/* r:	Number of buffers reserved for log */
	size_t wl_bufbytes_max;	/* r:	Number of buf bytes reserved for log */

	off_t wl_head;		/* l:	Byte offset of log head */
	off_t wl_tail;		/* l:	Byte offset of log tail */
	/*
	 * WAPBL log layout, stored on wl_devvp at wl_logpbn:
	 *
	 *  ___________________ wl_circ_size __________________
	 * /                                                   \
	 * +---------+---------+-------+--------------+--------+
	 * [ commit0 | commit1 | CCWCW | EEEEEEEEEEEE | CCCWCW ]
	 * +---------+---------+-------+--------------+--------+
	 *       wl_circ_off --^       ^-- wl_head    ^-- wl_tail
	 *
	 * commit0 and commit1 are commit headers.  A commit header has
	 * a generation number, indicating which of the two headers is
	 * more recent, and an assignment of head and tail pointers.
	 * The rest is a circular queue of log records, starting at
	 * the byte offset wl_circ_off.
	 *
	 * E marks empty space for records.
	 * W marks records for block writes issued but waiting.
	 * C marks completed records.
	 *
	 * wapbl_flush writes new records to empty `E' spaces after
	 * wl_head from the current transaction in memory.
	 *
	 * wapbl_truncate advances wl_tail past any completed `C'
	 * records, freeing them up for use.
	 *
	 * head == tail == 0 means log is empty.
	 * head == tail != 0 means log is full.
	 *
	 * See assertions in wapbl_advance() for other boundary
	 * conditions.
	 *
	 * Only wapbl_flush moves the head, except when wapbl_truncate
	 * sets it to 0 to indicate that the log is empty.
	 *
	 * Only wapbl_truncate moves the tail, except when wapbl_flush
	 * sets it to wl_circ_off to indicate that the log is full.
	 */

	struct wapbl_wc_header *wl_wc_header;	/* l	*/
	void *wl_wc_scratch;	/* l:	scratch space (XXX: why?!?) */

	kmutex_t wl_mtx;	/* u:	short-term lock */
	krwlock_t wl_rwlock;	/* u:	File system transaction lock */

	/*
	 * Must be held while accessing
	 * wl_count or wl_bufs or head or tail
	 */

	/*
	 * Callback called from within the flush routine to flush any extra
	 * bits.  Note that flush may be skipped without calling this if
	 * there are no outstanding buffers in the transaction.
	 */
#ifdef _KERNEL
	wapbl_flush_fn_t wl_flush;	/* r	*/
	wapbl_flush_fn_t wl_flush_abort;/* r	*/
#endif

	size_t wl_bufbytes;	/* m:	Byte count of pages in wl_bufs */
	size_t wl_bufcount;	/* m:	Count of buffers in wl_bufs */
	size_t wl_bcount;	/* m:	Total bcount of wl_bufs */

	LIST_HEAD(, buf) wl_bufs; /* m:	Buffers in current transaction */

	kcondvar_t wl_reclaimable_cv;	/* m (obviously) */
	size_t wl_reclaimable_bytes; /* m:	Amount of space available for
						reclamation by truncate */
	int wl_error_count;	/* m:	# of wl_entries with errors */
	size_t wl_reserved_bytes; /* never truncate log smaller than this */

#ifdef WAPBL_DEBUG_BUFBYTES
	size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */
#endif

#ifdef _KERNEL
	int wl_brperjblock;	/* r	Block records per journal block */
#endif

	SIMPLEQ_HEAD(, wapbl_dealloc) wl_dealloclist;	/* lm:	list head */
	int wl_dealloccnt;			/* lm:	total count */
	int wl_dealloclim;			/* r:	max count */

	/* hashtable of inode numbers for allocated but unlinked inodes */
	/* synch ??? */
	struct wapbl_ino_head *wl_inohash;
	u_long wl_inohashmask;
	int wl_inohashcnt;

	SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction
						   accounting */

	u_char *wl_buffer;	/* l:	buffer for wapbl_buffered_write() */
	daddr_t wl_buffer_dblk;	/* l:	buffer disk block address */
	size_t wl_buffer_used;	/* l:	buffer current use */
};
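
/*
 * Worked example of the log layout described in struct wapbl above
 * (illustrative numbers only): with 512-byte log device blocks
 * (wl_log_dev_bshift == 9), wapbl_start() reserves
 * wl_circ_off = 2 << 9 = 1024 bytes for the two commit headers, and
 * head/tail are byte offsets in [wl_circ_off, wl_circ_off +
 * wl_circ_size) or 0.  head == tail == 0 is the empty log; after the
 * first flush writes, say, 4096 bytes of records,
 * head = wl_circ_off + 4096 and tail = wl_circ_off.
 */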

#ifdef WAPBL_DEBUG_PRINT
int wapbl_debug_print = WAPBL_DEBUG_PRINT;
#endif

/****************************************************************/
#ifdef _KERNEL

#ifdef WAPBL_DEBUG
struct wapbl *wapbl_debug_wl;
#endif

static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail);
static int wapbl_write_blocks(struct wapbl *wl, off_t *offp);
static int wapbl_write_revocations(struct wapbl *wl, off_t *offp);
static int wapbl_write_inodes(struct wapbl *wl, off_t *offp);
#endif /* _KERNEL */

static int wapbl_replay_process(struct wapbl_replay *wr, off_t, off_t);

static inline size_t wapbl_space_used(size_t avail, off_t head,
	off_t tail);

#ifdef _KERNEL

static struct pool wapbl_entry_pool;
static struct pool wapbl_dealloc_pool;

#define	WAPBL_INODETRK_SIZE 83
static int wapbl_ino_pool_refcount;
static struct pool wapbl_ino_pool;
struct wapbl_ino {
	LIST_ENTRY(wapbl_ino) wi_hash;
	ino_t wi_ino;
	mode_t wi_mode;
};

static void wapbl_inodetrk_init(struct wapbl *wl, u_int size);
static void wapbl_inodetrk_free(struct wapbl *wl);
static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino);

static size_t wapbl_transaction_len(struct wapbl *wl);
static inline size_t wapbl_transaction_inodes_len(struct wapbl *wl);

#if 0
int wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
#endif

static int wapbl_replay_isopen1(struct wapbl_replay *);

struct wapbl_ops wapbl_ops = {
	.wo_wapbl_discard	= wapbl_discard,
	.wo_wapbl_replay_isopen	= wapbl_replay_isopen1,
	.wo_wapbl_replay_can_read = wapbl_replay_can_read,
	.wo_wapbl_replay_read	= wapbl_replay_read,
	.wo_wapbl_add_buf	= wapbl_add_buf,
	.wo_wapbl_remove_buf	= wapbl_remove_buf,
	.wo_wapbl_resize_buf	= wapbl_resize_buf,
	.wo_wapbl_begin		= wapbl_begin,
	.wo_wapbl_end		= wapbl_end,
	.wo_wapbl_junlock_assert= wapbl_junlock_assert,

	/* XXX: the following is only used to say "this is a wapbl buf" */
	.wo_wapbl_biodone	= wapbl_biodone,
};

static int
wapbl_sysctl_init(void)
{
	int rv;
	const struct sysctlnode *rnode, *cnode;

	wapbl_sysctl = NULL;

	rv = sysctl_createv(&wapbl_sysctl, 0, NULL, &rnode,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "wapbl",
		       SYSCTL_DESCR("WAPBL journaling options"),
		       NULL, 0, NULL, 0,
		       CTL_VFS, CTL_CREATE, CTL_EOL);
	if (rv)
		return rv;

	rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "flush_disk_cache",
		       SYSCTL_DESCR("flush disk cache"),
		       NULL, 0, &wapbl_flush_disk_cache, 0,
		       CTL_CREATE, CTL_EOL);
	if (rv)
		return rv;

	rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "verbose_commit",
		       SYSCTL_DESCR("show time and size of wapbl log commits"),
		       NULL, 0, &wapbl_verbose_commit, 0,
		       CTL_CREATE, CTL_EOL);
	return rv;
}

static void
wapbl_init(void)
{

	pool_init(&wapbl_entry_pool, sizeof(struct wapbl_entry), 0, 0, 0,
	    "wapblentrypl", &pool_allocator_kmem, IPL_VM);
	pool_init(&wapbl_dealloc_pool, sizeof(struct wapbl_dealloc), 0, 0, 0,
	    "wapbldealloc", &pool_allocator_nointr, IPL_NONE);

	wapbl_sysctl_init();
}

static int
wapbl_fini(void)
{

	if (wapbl_sysctl != NULL)
		sysctl_teardown(&wapbl_sysctl);

	pool_destroy(&wapbl_dealloc_pool);
	pool_destroy(&wapbl_entry_pool);

	return 0;
}

static int
wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr)
{
	int error, i;

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_start: reusing log with %d inodes\n", wr->wr_inodescnt));

	/*
	 * It's only valid to reuse the replay log if it's
	 * the same as the new log we just opened.
	 */
	KDASSERT(!wapbl_replay_isopen(wr));
	KASSERT(wl->wl_devvp->v_type == VBLK);
	KASSERT(wr->wr_devvp->v_type == VBLK);
	KASSERT(wl->wl_devvp->v_rdev == wr->wr_devvp->v_rdev);
	KASSERT(wl->wl_logpbn == wr->wr_logpbn);
	KASSERT(wl->wl_circ_size == wr->wr_circ_size);
	KASSERT(wl->wl_circ_off == wr->wr_circ_off);
	KASSERT(wl->wl_log_dev_bshift == wr->wr_log_dev_bshift);
	KASSERT(wl->wl_fs_dev_bshift == wr->wr_fs_dev_bshift);

	wl->wl_wc_header->wc_generation = wr->wr_generation + 1;

	for (i = 0; i < wr->wr_inodescnt; i++)
		wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber,
		    wr->wr_inodes[i].wr_imode);

	/* Make sure new transaction won't overwrite old inodes list */
	KDASSERT(wapbl_transaction_len(wl) <=
	    wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead,
	    wr->wr_inodestail));

	wl->wl_head = wl->wl_tail = wr->wr_inodeshead;
	wl->wl_reclaimable_bytes = wl->wl_reserved_bytes =
	    wapbl_transaction_len(wl);

	error = wapbl_write_inodes(wl, &wl->wl_head);
	if (error)
		return error;

	KASSERT(wl->wl_head != wl->wl_tail);
	KASSERT(wl->wl_head != 0);

	return 0;
}

int
wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp,
	daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr,
	wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn)
{
	struct wapbl *wl;
	struct vnode *devvp;
	daddr_t logpbn;
	int error;
	int log_dev_bshift = ilog2(blksize);
	int fs_dev_bshift = log_dev_bshift;
	int run;

	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64
	    " count=%zu blksize=%zu\n", vp, off, count, blksize));

	if (log_dev_bshift > fs_dev_bshift) {
		WAPBL_PRINTF(WAPBL_PRINT_OPEN,
			("wapbl: log device's block size cannot be larger "
			 "than filesystem's\n"));
		/*
		 * Not currently implemented, although it could be if
		 * needed someday.
		 */
		return ENOSYS;
	}

	if (off < 0)
		return EINVAL;

	if (blksize < DEV_BSIZE)
		return EINVAL;
	if (blksize % DEV_BSIZE)
		return EINVAL;

	/* XXXTODO: verify that the full load is writable */

	/*
	 * XXX check for minimum log size
	 * minimum is governed by minimum amount of space
	 * to complete a transaction. (probably truncate)
	 */
	/* XXX for now pick something minimal */
	if ((count * blksize) < MAXPHYS) {
		return ENOSPC;
	}

	if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) {
		return error;
	}

	wl = wapbl_calloc(1, sizeof(*wl));
	rw_init(&wl->wl_rwlock);
	mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&wl->wl_reclaimable_cv, "wapblrec");
	LIST_INIT(&wl->wl_bufs);
	SIMPLEQ_INIT(&wl->wl_entries);

	wl->wl_logvp = vp;
	wl->wl_devvp = devvp;
	wl->wl_mount = mp;
	wl->wl_logpbn = logpbn;
	wl->wl_log_dev_bshift = log_dev_bshift;
	wl->wl_fs_dev_bshift = fs_dev_bshift;

	wl->wl_flush = flushfn;
	wl->wl_flush_abort = flushabortfn;

	/* Reserve two log device blocks for the commit headers */
	wl->wl_circ_off = 2<<wl->wl_log_dev_bshift;
	wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off);
	/* truncate the log usage to a multiple of log_dev_bshift */
	wl->wl_circ_size >>= wl->wl_log_dev_bshift;
	wl->wl_circ_size <<= wl->wl_log_dev_bshift;

	/*
	 * wl_bufbytes_max limits the size of the in memory transaction space.
	 * - Since buffers are allocated and accounted for in units of
	 *   PAGE_SIZE it is required to be a multiple of PAGE_SIZE
	 *   (i.e. 1<<PAGE_SHIFT)
	 * - Since the log device has to be written in units of
	 *   1<<wl_log_dev_bshift it is required to be a multiple of
	 *   1<<wl_log_dev_bshift.
	 * - Since filesystem will provide data in units of 1<<wl_fs_dev_bshift,
	 *   it is convenient to be a multiple of 1<<wl_fs_dev_bshift.
	 * Therefore it must be multiple of the least common multiple of those
	 * three quantities.  Fortunately, all of those quantities are
	 * guaranteed to be a power of two, and the least common multiple of
	 * a set of numbers which are all powers of two is simply the maximum
	 * of those numbers.  Finally, the maximum logarithm of a power of two
	 * is the same as the log of the maximum power of two.  So we can do
	 * the following operations to size wl_bufbytes_max:
	 */

	/* XXX fix actual number of pages reserved per filesystem. */
	wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2);

	/* Round wl_bufbytes_max to the largest power of two constraint */
	wl->wl_bufbytes_max >>= PAGE_SHIFT;
	wl->wl_bufbytes_max <<= PAGE_SHIFT;
	wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift;
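
	/*
	 * Worked example of the rounding above (illustrative values,
	 * not taken from any particular port): with PAGE_SHIFT = 12
	 * and both bshifts equal to 9, the binding constraint is
	 * PAGE_SIZE, so a candidate wl_bufbytes_max of 1048579 bytes
	 * becomes (1048579 >> 12) << 12 = 1048576 and is then left
	 * unchanged by the two smaller (9-bit) shift pairs.
	 */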

	/* XXX maybe use filesystem fragment size instead of 1024 */
	/* XXX fix actual number of buffers reserved per filesystem. */
	wl->wl_bufcount_max = (nbuf / 2) * 1024;

	wl->wl_brperjblock = ((1<<wl->wl_log_dev_bshift)
	    - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
	    sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
	KASSERT(wl->wl_brperjblock > 0);

	/* XXX tie this into resource estimation */
	wl->wl_dealloclim = wl->wl_bufbytes_max / mp->mnt_stat.f_bsize / 2;
	SIMPLEQ_INIT(&wl->wl_dealloclist);

	wl->wl_buffer = wapbl_alloc(MAXPHYS);
	wl->wl_buffer_used = 0;

	wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);

	/* Initialize the commit header */
	{
		struct wapbl_wc_header *wc;
		size_t len = 1 << wl->wl_log_dev_bshift;
		wc = wapbl_calloc(1, len);
		wc->wc_type = WAPBL_WC_HEADER;
		wc->wc_len = len;
		wc->wc_circ_off = wl->wl_circ_off;
		wc->wc_circ_size = wl->wl_circ_size;
		/* XXX wc->wc_fsid */
		wc->wc_log_dev_bshift = wl->wl_log_dev_bshift;
		wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift;
		wl->wl_wc_header = wc;
		wl->wl_wc_scratch = wapbl_alloc(len);
	}

	/*
	 * if there was an existing set of unlinked but
	 * allocated inodes, preserve it in the new
	 * log.
	 */
	if (wr && wr->wr_inodescnt) {
		error = wapbl_start_flush_inodes(wl, wr);
		if (error)
			goto errout;
	}

	error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail);
	if (error) {
		goto errout;
	}

	*wlp = wl;
#if defined(WAPBL_DEBUG)
	wapbl_debug_wl = wl;
#endif

	return 0;
 errout:
	wapbl_discard(wl);
	wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_buffer, MAXPHYS);
	wapbl_inodetrk_free(wl);
	wapbl_free(wl, sizeof(*wl));

	return error;
}

/*
 * Like wapbl_flush, only discards the transaction
 * completely
 */

void
wapbl_discard(struct wapbl *wl)
{
	struct wapbl_entry *we;
	struct wapbl_dealloc *wd;
	struct buf *bp;
	int i;

	/*
	 * XXX we may consider using upgrade here
	 * if we want to call flush from inside a transaction
	 */
	rw_enter(&wl->wl_rwlock, RW_WRITER);
	wl->wl_flush(wl->wl_mount, SIMPLEQ_FIRST(&wl->wl_dealloclist));

#ifdef WAPBL_DEBUG_PRINT
	{
		pid_t pid = -1;
		lwpid_t lid = -1;
		if (curproc)
			pid = curproc->p_pid;
		if (curlwp)
			lid = curlwp->l_lid;
#ifdef WAPBL_DEBUG_BUFBYTES
		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
		    ("wapbl_discard: thread %d.%d discarding "
		    "transaction\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %u, reclaimable=%zu reserved=%zu "
		    "unsynced=%zu\n",
		    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
		    wl->wl_bcount, wl->wl_dealloccnt,
		    wl->wl_inohashcnt, wl->wl_error_count,
		    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
		    wl->wl_unsynced_bufbytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d, unsynced = %zu\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error, we->we_unsynced_bufbytes));
		}
#else /* !WAPBL_DEBUG_BUFBYTES */
		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
		    ("wapbl_discard: thread %d.%d discarding transaction\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %u, reclaimable=%zu reserved=%zu\n",
		    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
		    wl->wl_bcount, wl->wl_dealloccnt,
		    wl->wl_inohashcnt, wl->wl_error_count,
		    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error));
		}
#endif /* !WAPBL_DEBUG_BUFBYTES */
	}
#endif /* WAPBL_DEBUG_PRINT */

	for (i = 0; i <= wl->wl_inohashmask; i++) {
		struct wapbl_ino_head *wih;
		struct wapbl_ino *wi;

		wih = &wl->wl_inohash[i];
		while ((wi = LIST_FIRST(wih)) != NULL) {
			LIST_REMOVE(wi, wi_hash);
			pool_put(&wapbl_ino_pool, wi);
			KASSERT(wl->wl_inohashcnt > 0);
			wl->wl_inohashcnt--;
		}
	}

	/*
	 * clean buffer list
	 */
	mutex_enter(&bufcache_lock);
	mutex_enter(&wl->wl_mtx);
	while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
		if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) {
			/*
			 * The buffer will be unlocked and
			 * removed from the transaction in brelse
			 */
			mutex_exit(&wl->wl_mtx);
			brelsel(bp, 0);
			mutex_enter(&wl->wl_mtx);
		}
	}
	mutex_exit(&wl->wl_mtx);
	mutex_exit(&bufcache_lock);

	/*
	 * Remove references to this wl from wl_entries, free any which
	 * no longer have buffers, others will be freed in wapbl_biodone
	 * when they no longer have any buffers.
	 */
	while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) {
		SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
		/* XXX should we be accumulating wl_error_count
		 * and increasing reclaimable bytes ? */
		we->we_wapbl = NULL;
		if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
			KASSERT(we->we_unsynced_bufbytes == 0);
#endif
			pool_put(&wapbl_entry_pool, we);
		}
	}

	/* Discard list of deallocs */
	while ((wd = SIMPLEQ_FIRST(&wl->wl_dealloclist)) != NULL) {
		SIMPLEQ_REMOVE_HEAD(&wl->wl_dealloclist, wd_entries);
		pool_put(&wapbl_dealloc_pool, wd);
		wl->wl_dealloccnt--;
	}

	/* XXX should we clear wl_reserved_bytes? */

	KASSERT(wl->wl_bufbytes == 0);
	KASSERT(wl->wl_bcount == 0);
	KASSERT(wl->wl_bufcount == 0);
	KASSERT(LIST_EMPTY(&wl->wl_bufs));
	KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
	KASSERT(wl->wl_inohashcnt == 0);
	KASSERT(SIMPLEQ_EMPTY(&wl->wl_dealloclist));
	KASSERT(wl->wl_dealloccnt == 0);

	rw_exit(&wl->wl_rwlock);
}

int
wapbl_stop(struct wapbl *wl, int force)
{
	int error;

	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n"));
	error = wapbl_flush(wl, 1);
	if (error) {
		if (force)
			wapbl_discard(wl);
		else
			return error;
	}

	/* Unlinked inodes persist after a flush */
	if (wl->wl_inohashcnt) {
		if (force) {
			wapbl_discard(wl);
		} else {
			return EBUSY;
		}
	}

	KASSERT(wl->wl_bufbytes == 0);
	KASSERT(wl->wl_bcount == 0);
	KASSERT(wl->wl_bufcount == 0);
	KASSERT(LIST_EMPTY(&wl->wl_bufs));
	KASSERT(wl->wl_dealloccnt == 0);
	KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
	KASSERT(wl->wl_inohashcnt == 0);
	KASSERT(SIMPLEQ_EMPTY(&wl->wl_dealloclist));
	KASSERT(wl->wl_dealloccnt == 0);

	wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_buffer, MAXPHYS);
	wapbl_inodetrk_free(wl);

	cv_destroy(&wl->wl_reclaimable_cv);
	mutex_destroy(&wl->wl_mtx);
	rw_destroy(&wl->wl_rwlock);
	wapbl_free(wl, sizeof(*wl));

	return 0;
}

/****************************************************************/
/*
 * Unbuffered disk I/O
 */

static int
wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
{
	struct pstats *pstats = curlwp->l_proc->p_stats;
	struct buf *bp;
	int error;

	KASSERT((flags & ~(B_WRITE | B_READ)) == 0);
	KASSERT(devvp->v_type == VBLK);

	if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
		mutex_enter(devvp->v_interlock);
		devvp->v_numoutput++;
		mutex_exit(devvp->v_interlock);
		pstats->p_ru.ru_oublock++;
	} else {
		pstats->p_ru.ru_inblock++;
	}

	bp = getiobuf(devvp, true);
	bp->b_flags = flags;
	bp->b_cflags = BC_BUSY;	/* silly & dubious */
	bp->b_dev = devvp->v_rdev;
	bp->b_data = data;
	bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
	bp->b_blkno = pbn;
	BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);

	WAPBL_PRINTF(WAPBL_PRINT_IO,
	    ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%"PRIx64"\n",
	    BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount,
	    bp->b_blkno, bp->b_dev));

	VOP_STRATEGY(devvp, bp);

	error = biowait(bp);
	putiobuf(bp);

	if (error) {
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_doio: %s %zu bytes at block %" PRId64
		    " on dev 0x%"PRIx64" failed with error %d\n",
		    (((flags & (B_WRITE | B_READ)) == B_WRITE) ?
		     "write" : "read"),
		    len, pbn, devvp->v_rdev, error));
	}

	return error;
}

/*
 * wapbl_write(data, len, devvp, pbn)
 *
 *	Synchronously write len bytes from data to physical block pbn
 *	on devvp.
 */
int
wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

	return wapbl_doio(data, len, devvp, pbn, B_WRITE);
}

/*
 * wapbl_read(data, len, devvp, pbn)
 *
 *	Synchronously read len bytes into data from physical block pbn
 *	on devvp.
 */
int
wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

	return wapbl_doio(data, len, devvp, pbn, B_READ);
}

/****************************************************************/
/*
 * Buffered disk writes -- try to coalesce writes and emit
 * MAXPHYS-aligned blocks.
 */

/*
 * wapbl_buffered_flush(wl)
 *
 *	Flush any buffered writes from wapbl_buffered_write.
 */
static int
wapbl_buffered_flush(struct wapbl *wl)
{
	int error;

	if (wl->wl_buffer_used == 0)
		return 0;

	error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used,
	    wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE);
	wl->wl_buffer_used = 0;

	return error;
}

/*
 * wapbl_buffered_write(data, len, wl, pbn)
 *
 *	Write len bytes from data to physical block pbn on
 *	wl->wl_devvp.  The write may not complete until
 *	wapbl_buffered_flush.
 */
static int
wapbl_buffered_write(void *data, size_t len, struct wapbl *wl, daddr_t pbn)
{
	int error;
	size_t resid;

	/*
	 * If the write is not adjacent to the buffered data, flush
	 * first.  The disk block address is always valid for a
	 * non-empty buffer.
	 */
	if (wl->wl_buffer_used > 0 &&
	    pbn != wl->wl_buffer_dblk + btodb(wl->wl_buffer_used)) {
		error = wapbl_buffered_flush(wl);
		if (error)
			return error;
	}
	/*
	 * If this write goes to an empty buffer we have to
	 * save the disk block address first.
	 */
	if (wl->wl_buffer_used == 0)
		wl->wl_buffer_dblk = pbn;
	/*
	 * Compute the remaining space so that this buffer ends on a
	 * MAXPHYS boundary.
	 *
	 * This cannot become less than or equal to zero, since the
	 * buffer would already have been flushed by the previous call.
	 */
	resid = MAXPHYS - dbtob(wl->wl_buffer_dblk % btodb(MAXPHYS)) -
	    wl->wl_buffer_used;
	KASSERT(resid > 0);
	KASSERT(dbtob(btodb(resid)) == resid);
	if (len >= resid) {
		memcpy(wl->wl_buffer + wl->wl_buffer_used, data, resid);
		wl->wl_buffer_used += resid;
		error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used,
		    wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE);
		data = (uint8_t *)data + resid;
		len -= resid;
		wl->wl_buffer_dblk = pbn + btodb(resid);
		wl->wl_buffer_used = 0;
		if (error)
			return error;
	}
	KASSERT(len < MAXPHYS);
	if (len > 0) {
		memcpy(wl->wl_buffer + wl->wl_buffer_used, data, len);
		wl->wl_buffer_used += len;
	}

	return 0;
}
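
/*
 * Worked example of the boundary arithmetic above, assuming MAXPHYS
 * of 64 KB and DEV_BSIZE of 512 (typical but machine-dependent
 * values): btodb(MAXPHYS) == 128, so with wl_buffer_dblk = 100 and
 * wl_buffer_used = 0, resid = 65536 - dbtob(100 % 128) - 0 =
 * 65536 - 51200 = 14336.  A 16384-byte write is therefore split:
 * the first 14336 bytes fill the buffer up to the MAXPHYS boundary
 * and are written out, and the remaining 2048 bytes stay buffered.
 */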

/*
 * wapbl_circ_write(wl, data, len, offp)
 *
 *	Write len bytes from data to the circular queue of wl, starting
 *	at linear byte offset *offp, and returning the new linear byte
 *	offset in *offp.
 *
 *	If the starting linear byte offset precedes wl->wl_circ_off,
 *	the write instead begins at wl->wl_circ_off.  XXX WTF?  This
 *	should be a KASSERT, not a conditional.
 *
 *	The write is buffered in wl and must be flushed with
 *	wapbl_buffered_flush before it will be submitted to the disk.
 */
static int
wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp)
{
	size_t slen;
	off_t off = *offp;
	int error;
	daddr_t pbn;

	KDASSERT(((len >> wl->wl_log_dev_bshift) <<
	    wl->wl_log_dev_bshift) == len);

	if (off < wl->wl_circ_off)
		off = wl->wl_circ_off;
	slen = wl->wl_circ_off + wl->wl_circ_size - off;
	if (slen < len) {
		pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
#ifdef _KERNEL
		pbn = btodb(pbn << wl->wl_log_dev_bshift);
#endif
		error = wapbl_buffered_write(data, slen, wl, pbn);
		if (error)
			return error;
		data = (uint8_t *)data + slen;
		len -= slen;
		off = wl->wl_circ_off;
	}
	pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
#ifdef _KERNEL
	pbn = btodb(pbn << wl->wl_log_dev_bshift);
#endif
	error = wapbl_buffered_write(data, len, wl, pbn);
	if (error)
		return error;
	off += len;
	if (off >= wl->wl_circ_off + wl->wl_circ_size)
		off = wl->wl_circ_off;
	*offp = off;
	return 0;
}

/****************************************************************/
/*
 * WAPBL transactions: entering, adding/removing bufs, and exiting
 */

int
wapbl_begin(struct wapbl *wl, const char *file, int line)
{
	int doflush;
	unsigned lockcount;

	KDASSERT(wl);

	/*
	 * XXX this needs to be made much more sophisticated.
	 * perhaps each wapbl_begin could reserve a specified
	 * number of buffers and bytes.
	 */
	mutex_enter(&wl->wl_mtx);
	lockcount = wl->wl_lock_count;
	doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) >
		   wl->wl_bufbytes_max / 2) ||
		  ((wl->wl_bufcount + (lockcount * 10)) >
		   wl->wl_bufcount_max / 2) ||
		  (wapbl_transaction_len(wl) > wl->wl_circ_size / 2) ||
		  (wl->wl_dealloccnt >= (wl->wl_dealloclim / 2));
	mutex_exit(&wl->wl_mtx);

	if (doflush) {
		WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		    ("force flush lockcnt=%d bufbytes=%zu "
		    "(max=%zu) bufcount=%zu (max=%zu) "
		    "dealloccnt %d (lim=%d)\n",
		    lockcount, wl->wl_bufbytes,
		    wl->wl_bufbytes_max, wl->wl_bufcount,
		    wl->wl_bufcount_max,
		    wl->wl_dealloccnt, wl->wl_dealloclim));
	}

	if (doflush) {
		int error = wapbl_flush(wl, 0);
		if (error)
			return error;
	}

	rw_enter(&wl->wl_rwlock, RW_READER);
	mutex_enter(&wl->wl_mtx);
	wl->wl_lock_count++;
	mutex_exit(&wl->wl_mtx);

#if defined(WAPBL_DEBUG_PRINT)
	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
	    ("wapbl_begin thread %d.%d with bufcount=%zu "
	    "bufbytes=%zu bcount=%zu at %s:%d\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount, file, line));
#endif

	return 0;
}

void
wapbl_end(struct wapbl *wl)
{

#if defined(WAPBL_DEBUG_PRINT)
	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
	    ("wapbl_end thread %d.%d with bufcount=%zu "
	    "bufbytes=%zu bcount=%zu\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount));
#endif

	/*
	 * XXX this could be handled more gracefully, perhaps place
	 * only a partial transaction in the log and allow the
	 * remaining to flush without the protection of the journal.
	 */
	KASSERTMSG((wapbl_transaction_len(wl) <=
		(wl->wl_circ_size - wl->wl_reserved_bytes)),
	    "wapbl_end: current transaction too big to flush");

	mutex_enter(&wl->wl_mtx);
	KASSERT(wl->wl_lock_count > 0);
	wl->wl_lock_count--;
	mutex_exit(&wl->wl_mtx);

	rw_exit(&wl->wl_rwlock);
}

void
wapbl_add_buf(struct wapbl *wl, struct buf * bp)
{

	KASSERT(bp->b_cflags & BC_BUSY);
	KASSERT(bp->b_vp);

	wapbl_jlock_assert(wl);

#if 0
	/*
	 * XXX this might be an issue for swapfiles.
	 * see uvm_swap.c:1702
	 *
	 * XXX2 why require it then?  leap of semantics?
	 */
	KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
#endif

	mutex_enter(&wl->wl_mtx);
	if (bp->b_flags & B_LOCKED) {
		LIST_REMOVE(bp, b_wapbllist);
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER2,
		   ("wapbl_add_buf thread %d.%d re-adding buf %p "
		    "with %d bytes %d bcount\n",
		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
		    bp->b_bcount));
	} else {
		/* unlocked by dirty buffers shouldn't exist */
		KASSERT(!(bp->b_oflags & BO_DELWRI));
		wl->wl_bufbytes += bp->b_bufsize;
		wl->wl_bcount += bp->b_bcount;
		wl->wl_bufcount++;
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
		   ("wapbl_add_buf thread %d.%d adding buf %p "
		    "with %d bytes %d bcount\n",
		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
		    bp->b_bcount));
	}
	LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist);
	mutex_exit(&wl->wl_mtx);

	bp->b_flags |= B_LOCKED;
}

static void
wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp)
{

	KASSERT(mutex_owned(&wl->wl_mtx));
	KASSERT(bp->b_cflags & BC_BUSY);
	wapbl_jlock_assert(wl);

#if 0
	/*
	 * XXX this might be an issue for swapfiles.
	 * see uvm_swap.c:1725
	 *
	 * XXXdeux: see above
	 */
	KASSERT((bp->b_flags & BC_NOCACHE) == 0);
#endif
	KASSERT(bp->b_flags & B_LOCKED);

	WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
	   ("wapbl_remove_buf thread %d.%d removing buf %p with "
	    "%d bytes %d bcount\n",
	    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount));

	KASSERT(wl->wl_bufbytes >= bp->b_bufsize);
	wl->wl_bufbytes -= bp->b_bufsize;
	KASSERT(wl->wl_bcount >= bp->b_bcount);
	wl->wl_bcount -= bp->b_bcount;
	KASSERT(wl->wl_bufcount > 0);
	wl->wl_bufcount--;
	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
	LIST_REMOVE(bp, b_wapbllist);

	bp->b_flags &= ~B_LOCKED;
}

/* called from brelsel() in vfs_bio among other places */
void
wapbl_remove_buf(struct wapbl * wl, struct buf *bp)
{

	mutex_enter(&wl->wl_mtx);
	wapbl_remove_buf_locked(wl, bp);
	mutex_exit(&wl->wl_mtx);
}

void
wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt)
{

	KASSERT(bp->b_cflags & BC_BUSY);

	/*
	 * XXX: why does this depend on B_LOCKED?  otherwise the buf
	 * is not for a transaction?  if so, why is this called in the
	 * first place?
	 */
	if (bp->b_flags & B_LOCKED) {
		mutex_enter(&wl->wl_mtx);
		wl->wl_bufbytes += bp->b_bufsize - oldsz;
		wl->wl_bcount += bp->b_bcount - oldcnt;
		mutex_exit(&wl->wl_mtx);
	}
}

#endif /* _KERNEL */

/****************************************************************/
/* Some utility inlines */

/*
 * wapbl_space_used(avail, head, tail)
 *
 *	Number of bytes used in a circular queue of avail total bytes,
 *	from tail to head.
 */
static inline size_t
wapbl_space_used(size_t avail, off_t head, off_t tail)
{

	if (tail == 0) {
		KASSERT(head == 0);
		return 0;
	}
	return ((head + (avail - 1) - tail) % avail) + 1;
}
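
/*
 * Worked example (illustrative numbers only): for a queue of
 * avail = 1000 bytes starting at offset 100, with tail = 900 and
 * head = 400, the used region wraps around: 200 bytes from 900 to
 * the end plus 300 bytes from 100 to 400, and indeed
 * wapbl_space_used(1000, 400, 900) = ((400 + 999 - 900) % 1000) + 1
 * = 500.  When head == tail != 0 the formula yields avail (a full
 * queue), matching the conventions described in struct wapbl, and
 * wapbl_space_free() below is simply avail minus this value.
 */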

#ifdef _KERNEL
/*
 * wapbl_advance(size, off, oldoff, delta)
 *
 *	Given a byte offset oldoff into a circular queue of size bytes
 *	starting at off, return a new byte offset oldoff + delta into
 *	the circular queue.
 */
static inline off_t
wapbl_advance(size_t size, size_t off, off_t oldoff, size_t delta)
{
	off_t newoff;

	/* Define acceptable ranges for inputs. */
	KASSERT(delta <= (size_t)size);
	KASSERT((oldoff == 0) || ((size_t)oldoff >= off));
	KASSERT(oldoff < (off_t)(size + off));

	if ((oldoff == 0) && (delta != 0))
		newoff = off + delta;
	else if ((oldoff + delta) < (size + off))
		newoff = oldoff + delta;
	else
		newoff = (oldoff + delta) - size;

	/* Note some interesting axioms */
	KASSERT((delta != 0) || (newoff == oldoff));
	KASSERT((delta == 0) || (newoff != 0));
	KASSERT((delta != (size)) || (newoff == oldoff));

	/* Define acceptable ranges for output. */
	KASSERT((newoff == 0) || ((size_t)newoff >= off));
	KASSERT((size_t)newoff < (size + off));
	return newoff;
}
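
/*
 * Worked example (illustrative numbers only, never compiled): for a
 * queue of size 1000 starting at off 100, valid offsets are 0 or
 * [100, 1100).
 */
#if 0
	KASSERT(wapbl_advance(1000, 100,    0,   50) == 150); /* leave empty */
	KASSERT(wapbl_advance(1000, 100, 1050,  100) == 150); /* wrap around */
	KASSERT(wapbl_advance(1000, 100,  400, 1000) == 400); /* full cycle */
#endif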

/*
 * wapbl_space_free(avail, head, tail)
 *
 *	Number of bytes free in a circular queue of avail total bytes,
 *	in which everything from tail to head is used.
 */
static inline size_t
wapbl_space_free(size_t avail, off_t head, off_t tail)
{

	return avail - wapbl_space_used(avail, head, tail);
}

/*
 * wapbl_advance_head(size, off, delta, headp, tailp)
 *
 *	In a circular queue of size bytes starting at off, given the
 *	old head and tail offsets *headp and *tailp, store the new head
 *	and tail offsets in *headp and *tailp resulting from adding
 *	delta bytes of data to the head.
 */
static inline void
wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp,
		   off_t *tailp)
{
	off_t head = *headp;
	off_t tail = *tailp;

	KASSERT(delta <= wapbl_space_free(size, head, tail));
	head = wapbl_advance(size, off, head, delta);
	if ((tail == 0) && (head != 0))
		tail = off;
	*headp = head;
	*tailp = tail;
}

/*
 * wapbl_advance_tail(size, off, delta, headp, tailp)
 *
 *	In a circular queue of size bytes starting at off, given the
 *	old head and tail offsets *headp and *tailp, store the new head
 *	and tail offsets in *headp and *tailp resulting from removing
 *	delta bytes of data from the tail.
 */
static inline void
wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp,
		   off_t *tailp)
{
	off_t head = *headp;
	off_t tail = *tailp;

	KASSERT(delta <= wapbl_space_used(size, head, tail));
	tail = wapbl_advance(size, off, tail, delta);
	if (head == tail) {
		head = tail = 0;
	}
	*headp = head;
	*tailp = tail;
}
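
/*
 * Worked example of the pair above (illustrative numbers only):
 * starting from the empty state head = tail = 0 in a queue of
 * size 1000 at off 100, wapbl_advance_head(..., 300, ...) moves
 * head to 400 and, because the queue is no longer empty, sets
 * tail = off = 100, so 300 bytes are used.  A subsequent
 * wapbl_advance_tail(..., 300, ...) moves tail to 400 == head and
 * resets both to 0, returning the queue to the empty state.
 */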


/****************************************************************/

/*
 * wapbl_truncate(wl, minfree)
 *
 *	Wait until at least minfree bytes are available in the log.
 *
 *	If it was necessary to wait for writes to complete,
 *	advance the circular queue tail to reflect the new write
 *	completions and issue a write commit to the log.
 *
 *	=> Caller must hold wl->wl_rwlock writer lock.
 */
static int
wapbl_truncate(struct wapbl *wl, size_t minfree)
{
	size_t delta;
	size_t avail;
	off_t head;
	off_t tail;
	int error = 0;

	KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes));
	KASSERT(rw_write_held(&wl->wl_rwlock));

	mutex_enter(&wl->wl_mtx);

	/*
	 * First check to see if we have to do a commit
	 * at all.
	 */
	avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail);
	if (minfree < avail) {
		mutex_exit(&wl->wl_mtx);
		return 0;
	}
	minfree -= avail;
	while ((wl->wl_error_count == 0) &&
	    (wl->wl_reclaimable_bytes < minfree)) {
		WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
		    ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd "
		    "minfree=%zd\n",
		    &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes,
		    minfree));

		cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx);
	}
	if (wl->wl_reclaimable_bytes < minfree) {
		KASSERT(wl->wl_error_count);
		/* XXX maybe get actual error from buffer instead someday? */
		error = EIO;
	}
	head = wl->wl_head;
	tail = wl->wl_tail;
	delta = wl->wl_reclaimable_bytes;

	/* If all of the entries are flushed, then be sure to keep
	 * the reserved bytes reserved.  Watch out for discarded transactions,
	 * which could leave more bytes reserved than are reclaimable.
	 */
	if (SIMPLEQ_EMPTY(&wl->wl_entries) &&
	    (delta >= wl->wl_reserved_bytes)) {
		delta -= wl->wl_reserved_bytes;
	}
	wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head,
	    &tail);
	KDASSERT(wl->wl_reserved_bytes <=
		wapbl_space_used(wl->wl_circ_size, head, tail));
	mutex_exit(&wl->wl_mtx);

	if (error)
		return error;

	/*
	 * This is where head, tail and delta are unprotected
	 * from races against itself or flush.  This is ok since
	 * we only call this routine from inside flush itself.
	 *
	 * XXX: how can it race against itself when accessed only
	 * from behind the write-locked rwlock?
	 */
	error = wapbl_write_commit(wl, head, tail);
	if (error)
		return error;

	wl->wl_head = head;
	wl->wl_tail = tail;

	mutex_enter(&wl->wl_mtx);
	KASSERT(wl->wl_reclaimable_bytes >= delta);
	wl->wl_reclaimable_bytes -= delta;
	mutex_exit(&wl->wl_mtx);
	WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
	    ("wapbl_truncate thread %d.%d truncating %zu bytes\n",
	    curproc->p_pid, curlwp->l_lid, delta));

	return 0;
}

/****************************************************************/

void
wapbl_biodone(struct buf *bp)
{
	struct wapbl_entry *we = bp->b_private;
	struct wapbl *wl = we->we_wapbl;
#ifdef WAPBL_DEBUG_BUFBYTES
	const int bufsize = bp->b_bufsize;
#endif

	/*
	 * Handle possible flushing of buffers after log has been
	 * decommissioned.
	 */
	if (!wl) {
		KASSERT(we->we_bufcount > 0);
		we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
		KASSERT(we->we_unsynced_bufbytes >= bufsize);
		we->we_unsynced_bufbytes -= bufsize;
#endif

		if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
			KASSERT(we->we_unsynced_bufbytes == 0);
#endif
			pool_put(&wapbl_entry_pool, we);
		}

		brelse(bp, 0);
		return;
	}

#ifdef ohbother
	KDASSERT(bp->b_oflags & BO_DONE);
	KDASSERT(!(bp->b_oflags & BO_DELWRI));
	KDASSERT(bp->b_flags & B_ASYNC);
	KDASSERT(bp->b_cflags & BC_BUSY);
	KDASSERT(!(bp->b_flags & B_LOCKED));
	KDASSERT(!(bp->b_flags & B_READ));
	KDASSERT(!(bp->b_cflags & BC_INVAL));
	KDASSERT(!(bp->b_cflags & BC_NOCACHE));
#endif

	if (bp->b_error) {
		/*
		 * If an error occurs, it would be nice to leave the buffer
		 * as a delayed write on the LRU queue so that we can retry
		 * it later.  But buffercache(9) can't handle dirty buffer
		 * reuse, so just mark the log permanently errored out.
		 */
		mutex_enter(&wl->wl_mtx);
		if (wl->wl_error_count == 0) {
			wl->wl_error_count++;
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
		mutex_exit(&wl->wl_mtx);
	}

	/*
	 * Release the buffer here.  wapbl_flush() may wait for the
	 * log to become empty and we better unbusy the buffer before
	 * wapbl_flush() returns.
	 */
	brelse(bp, 0);

	mutex_enter(&wl->wl_mtx);

	KASSERT(we->we_bufcount > 0);
	we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
	KASSERT(we->we_unsynced_bufbytes >= bufsize);
	we->we_unsynced_bufbytes -= bufsize;
	KASSERT(wl->wl_unsynced_bufbytes >= bufsize);
	wl->wl_unsynced_bufbytes -= bufsize;
#endif

	/*
	 * If the current transaction can be reclaimed, start
	 * at the beginning and reclaim any consecutive reclaimable
	 * transactions.  If we successfully reclaim anything,
	 * then wakeup anyone waiting for the reclaim.
	 */
	if (we->we_bufcount == 0) {
		size_t delta = 0;
		int errcnt = 0;
#ifdef WAPBL_DEBUG_BUFBYTES
		KDASSERT(we->we_unsynced_bufbytes == 0);
#endif
		/*
		 * clear any posted error, since the buffer it came from
		 * has successfully flushed by now
		 */
		while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) &&
		       (we->we_bufcount == 0)) {
			delta += we->we_reclaimable_bytes;
			if (we->we_error)
				errcnt++;
			SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
			pool_put(&wapbl_entry_pool, we);
		}

		if (delta) {
			wl->wl_reclaimable_bytes += delta;
			KASSERT(wl->wl_error_count >= errcnt);
			wl->wl_error_count -= errcnt;
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
	}

	mutex_exit(&wl->wl_mtx);
}

/*
 * wapbl_flush(wl, wait)
 *
 *	Flush pending block writes, deallocations, and inodes from
 *	the current transaction in memory to the log on disk:
 *
 *	1. Call the file system's wl_flush callback to flush any
 *	   per-file-system pending updates.
 *	2. Wait for enough space in the log for the current transaction.
 *	3. Synchronously write the new log records, advancing the
 *	   circular queue head.
 *	4. Issue the pending block writes asynchronously, now that they
 *	   are recorded in the log and can be replayed after crash.
 *	5. If wait is true, wait for all writes to complete and for the
 *	   log to become empty.
 *
 *	On failure, call the file system's wl_flush_abort callback.
 */
int
wapbl_flush(struct wapbl *wl, int waitfor)
{
	struct buf *bp;
	struct wapbl_entry *we;
	off_t off;
	off_t head;
	off_t tail;
	size_t delta = 0;
	size_t flushsize;
	size_t reserved;
	int error = 0;

	/*
	 * Do a quick check to see if a full flush can be skipped
	 * This assumes that the flush callback does not need to be called
	 * unless there are other outstanding bufs.
	 */
	if (!waitfor) {
		size_t nbufs;
		mutex_enter(&wl->wl_mtx);	/* XXX need mutex here to
						   protect the KASSERTS */
		nbufs = wl->wl_bufcount;
		KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
		KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
		mutex_exit(&wl->wl_mtx);
		if (nbufs == 0)
			return 0;
	}

	/*
	 * XXX we may consider using LK_UPGRADE here
	 * if we want to call flush from inside a transaction
	 */
	rw_enter(&wl->wl_rwlock, RW_WRITER);
	wl->wl_flush(wl->wl_mount, SIMPLEQ_FIRST(&wl->wl_dealloclist));

	/*
	 * Now that we are exclusively locked and the file system has
	 * issued any deferred block writes for this transaction, check
	 * whether there are any blocks to write to the log.  If not,
	 * skip waiting for space or writing any log entries.
	 *
	 * XXX Shouldn't this also check wl_dealloccnt and
	 * wl_inohashcnt?  Perhaps wl_dealloccnt doesn't matter if the
	 * file system didn't produce any blocks as a consequence of
	 * it, but the same does not seem to be so of wl_inohashcnt.
	 */
	if (wl->wl_bufcount == 0) {
		goto wait_out;
	}

#if 0
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush thread %d.%d flushing entries with "
	     "bufcount=%zu bufbytes=%zu\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes));
#endif

	/* Calculate amount of space needed to flush */
	flushsize = wapbl_transaction_len(wl);
	if (wapbl_verbose_commit) {
		struct timespec ts;
		getnanotime(&ts);
		printf("%s: %lld.%09ld this transaction = %zu bytes\n",
		    __func__, (long long)ts.tv_sec,
		    (long)ts.tv_nsec, flushsize);
	}

	if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
		/*
		 * XXX this could be handled more gracefully, perhaps place
		 * only a partial transaction in the log and allow the
		 * remaining to flush without the protection of the journal.
		 */
		panic("wapbl_flush: current transaction too big to flush");
	}

	error = wapbl_truncate(wl, flushsize);
	if (error)
		goto out;

	off = wl->wl_head;
	KASSERT((off == 0) || (off >= wl->wl_circ_off));
	KASSERT((off == 0) || (off < wl->wl_circ_off + wl->wl_circ_size));
	error = wapbl_write_blocks(wl, &off);
	if (error)
		goto out;
	error = wapbl_write_revocations(wl, &off);
	if (error)
		goto out;
	error = wapbl_write_inodes(wl, &off);
	if (error)
		goto out;

	reserved = 0;
	if (wl->wl_inohashcnt)
		reserved = wapbl_transaction_inodes_len(wl);

	head = wl->wl_head;
	tail = wl->wl_tail;

	wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize,
	    &head, &tail);

	KASSERTMSG(head == off,
	    "lost head! head=%"PRIdMAX" tail=%" PRIdMAX
	    " off=%"PRIdMAX" flush=%zu",
	    (intmax_t)head, (intmax_t)tail, (intmax_t)off,
	    flushsize);

	/* Opportunistically move the tail forward if we can */
	mutex_enter(&wl->wl_mtx);
	delta = wl->wl_reclaimable_bytes;
	mutex_exit(&wl->wl_mtx);
	wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta,
	    &head, &tail);

	error = wapbl_write_commit(wl, head, tail);
	if (error)
		goto out;

	we = pool_get(&wapbl_entry_pool, PR_WAITOK);

#ifdef WAPBL_DEBUG_BUFBYTES
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
		 " unsynced=%zu"
		 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
		 "inodes=%d\n",
		 curproc->p_pid, curlwp->l_lid, flushsize, delta,
		 wapbl_space_used(wl->wl_circ_size, head, tail),
		 wl->wl_unsynced_bufbytes, wl->wl_bufcount,
		 wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt,
		 wl->wl_inohashcnt));
#else
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
		 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
		 "inodes=%d\n",
		 curproc->p_pid, curlwp->l_lid, flushsize, delta,
		 wapbl_space_used(wl->wl_circ_size, head, tail),
		 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
		 wl->wl_dealloccnt, wl->wl_inohashcnt));
#endif


	mutex_enter(&bufcache_lock);
	mutex_enter(&wl->wl_mtx);

	wl->wl_reserved_bytes = reserved;
	wl->wl_head = head;
	wl->wl_tail = tail;
	KASSERT(wl->wl_reclaimable_bytes >= delta);
	wl->wl_reclaimable_bytes -= delta;
	KDASSERT(wl->wl_dealloccnt == 0);
#ifdef WAPBL_DEBUG_BUFBYTES
	wl->wl_unsynced_bufbytes += wl->wl_bufbytes;
#endif

	we->we_wapbl = wl;
	we->we_bufcount = wl->wl_bufcount;
#ifdef WAPBL_DEBUG_BUFBYTES
	we->we_unsynced_bufbytes = wl->wl_bufbytes;
#endif
	we->we_reclaimable_bytes = flushsize;
	we->we_error = 0;
	SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries);

	/*
	 * This flushes bufs in the reverse of the order in which they
	 * were queued.  It shouldn't matter, but if we care we could
	 * use a TAILQ instead.
1702 * XXX Note they will get put on the lru queue when they flush
1703 * so we might actually want to change this to preserve order.
1704 */
1705 while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
1706 if (bbusy(bp, 0, 0, &wl->wl_mtx)) {
1707 continue;
1708 }
1709 bp->b_iodone = wapbl_biodone;
1710 bp->b_private = we;
1711 bremfree(bp);
1712 wapbl_remove_buf_locked(wl, bp);
1713 mutex_exit(&wl->wl_mtx);
1714 mutex_exit(&bufcache_lock);
1715 bawrite(bp);
1716 mutex_enter(&bufcache_lock);
1717 mutex_enter(&wl->wl_mtx);
1718 }
1719 mutex_exit(&wl->wl_mtx);
1720 mutex_exit(&bufcache_lock);
1721
1722 #if 0
1723 WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1724 ("wapbl_flush thread %d.%d done flushing entries...\n",
1725 curproc->p_pid, curlwp->l_lid));
1726 #endif
1727
1728 wait_out:
1729
1730 /*
1731 * If the waitfor flag is set, don't return until everything is
1732 * fully flushed and the on disk log is empty.
1733 */
1734 if (waitfor) {
1735 error = wapbl_truncate(wl, wl->wl_circ_size -
1736 wl->wl_reserved_bytes);
1737 }
1738
1739 out:
1740 if (error) {
1741 wl->wl_flush_abort(wl->wl_mount,
1742 SIMPLEQ_FIRST(&wl->wl_dealloclist));
1743 }
1744
1745 #ifdef WAPBL_DEBUG_PRINT
1746 if (error) {
1747 pid_t pid = -1;
1748 lwpid_t lid = -1;
1749 if (curproc)
1750 pid = curproc->p_pid;
1751 if (curlwp)
1752 lid = curlwp->l_lid;
1753 mutex_enter(&wl->wl_mtx);
1754 #ifdef WAPBL_DEBUG_BUFBYTES
1755 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1756 ("wapbl_flush: thread %d.%d aborted flush: "
1757 "error = %d\n"
1758 "\tbufcount=%zu bufbytes=%zu bcount=%zu "
1759 "deallocs=%d inodes=%d\n"
1760 "\terrcnt = %d, reclaimable=%zu reserved=%zu "
1761 "unsynced=%zu\n",
1762 pid, lid, error, wl->wl_bufcount,
1763 wl->wl_bufbytes, wl->wl_bcount,
1764 wl->wl_dealloccnt, wl->wl_inohashcnt,
1765 wl->wl_error_count, wl->wl_reclaimable_bytes,
1766 wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes));
1767 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
1768 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1769 ("\tentry: bufcount = %zu, reclaimable = %zu, "
1770 "error = %d, unsynced = %zu\n",
1771 we->we_bufcount, we->we_reclaimable_bytes,
1772 we->we_error, we->we_unsynced_bufbytes));
1773 }
1774 #else
1775 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1776 ("wapbl_flush: thread %d.%d aborted flush: "
1777 "error = %d\n"
1778 "\tbufcount=%zu bufbytes=%zu bcount=%zu "
1779 "deallocs=%d inodes=%d\n"
1780 "\terrcnt = %d, reclaimable=%zu reserved=%zu\n",
1781 pid, lid, error, wl->wl_bufcount,
1782 wl->wl_bufbytes, wl->wl_bcount,
1783 wl->wl_dealloccnt, wl->wl_inohashcnt,
1784 wl->wl_error_count, wl->wl_reclaimable_bytes,
1785 wl->wl_reserved_bytes));
1786 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
1787 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1788 ("\tentry: bufcount = %zu, reclaimable = %zu, "
1789 "error = %d\n", we->we_bufcount,
1790 we->we_reclaimable_bytes, we->we_error));
1791 }
1792 #endif
1793 mutex_exit(&wl->wl_mtx);
1794 }
1795 #endif
1796
1797 rw_exit(&wl->wl_rwlock);
1798 return error;
1799 }
1800
1801 /****************************************************************/
1802
1803 void
1804 wapbl_jlock_assert(struct wapbl *wl)
1805 {
1806
1807 KASSERT(rw_lock_held(&wl->wl_rwlock));
1808 }
1809
1810 void
1811 wapbl_junlock_assert(struct wapbl *wl)
1812 {
1813
1814 KASSERT(!rw_write_held(&wl->wl_rwlock));
1815 }
1816
1817 /****************************************************************/
1818
1819 /* locks missing */
1820 void
1821 wapbl_print(struct wapbl *wl,
1822 int full,
1823 void (*pr)(const char *, ...))
1824 {
1825 struct buf *bp;
1826 struct wapbl_entry *we;
1827 (*pr)("wapbl %p", wl);
1828 (*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n",
1829 wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn);
1830 (*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n",
1831 wl->wl_circ_size, wl->wl_circ_off,
1832 (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail);
1833 (*pr)("fs_dev_bshift = %d, log_dev_bshift = %d\n",
1834 wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift);
1835 #ifdef WAPBL_DEBUG_BUFBYTES
1836 (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
1837 "reserved = %zu errcnt = %d unsynced = %zu\n",
1838 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
1839 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
1840 wl->wl_error_count, wl->wl_unsynced_bufbytes);
1841 #else
1842 (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
1843 "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes,
1844 wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
1845 wl->wl_error_count);
1846 #endif
1847 (*pr)("\tdealloccnt = %d, dealloclim = %d\n",
1848 wl->wl_dealloccnt, wl->wl_dealloclim);
1849 (*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n",
1850 wl->wl_inohashcnt, wl->wl_inohashmask);
1851 (*pr)("entries:\n");
1852 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
1853 #ifdef WAPBL_DEBUG_BUFBYTES
1854 (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, "
1855 "unsynced = %zu\n",
1856 we->we_bufcount, we->we_reclaimable_bytes,
1857 we->we_error, we->we_unsynced_bufbytes);
1858 #else
1859 (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n",
1860 we->we_bufcount, we->we_reclaimable_bytes, we->we_error);
1861 #endif
1862 }
1863 if (full) {
1864 int cnt = 0;
1865 (*pr)("bufs =");
1866 LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) {
1867 if (!LIST_NEXT(bp, b_wapbllist)) {
1868 (*pr)(" %p", bp);
1869 } else if ((++cnt % 6) == 0) {
1870 (*pr)(" %p,\n\t", bp);
1871 } else {
1872 (*pr)(" %p,", bp);
1873 }
1874 }
1875 (*pr)("\n");
1876
1877 (*pr)("dealloced blks = ");
1878 {
1879 struct wapbl_dealloc *wd;
1880 cnt = 0;
1881 SIMPLEQ_FOREACH(wd, &wl->wl_dealloclist, wd_entries) {
1882 (*pr)(" %"PRId64":%d,",
1883 wd->wd_blkno,
1884 wd->wd_len);
1885 if ((++cnt % 4) == 0) {
1886 (*pr)("\n\t");
1887 }
1888 }
1889 }
1890 (*pr)("\n");
1891
1892 (*pr)("registered inodes = ");
1893 {
1894 int i;
1895 cnt = 0;
1896 for (i = 0; i <= wl->wl_inohashmask; i++) {
1897 struct wapbl_ino_head *wih;
1898 struct wapbl_ino *wi;
1899
1900 wih = &wl->wl_inohash[i];
1901 LIST_FOREACH(wi, wih, wi_hash) {
1902 if (wi->wi_ino == 0)
1903 continue;
1904 (*pr)(" %"PRIu64"/0%06"PRIo32",",
1905 wi->wi_ino, wi->wi_mode);
1906 if ((++cnt % 4) == 0) {
1907 (*pr)("\n\t");
1908 }
1909 }
1910 }
1911 (*pr)("\n");
1912 }
1913 }
1914 }
1915
1916 #if defined(WAPBL_DEBUG) || defined(DDB)
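/*
 * wapbl_dump(wl)
 *
 *	Print the full state of wl from the kernel debugger; under
 *	WAPBL_DEBUG, a null wl falls back to wapbl_debug_wl.
 */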
1917 void
1918 wapbl_dump(struct wapbl *wl)
1919 {
1920 #if defined(WAPBL_DEBUG)
1921 if (!wl)
1922 wl = wapbl_debug_wl;
1923 #endif
1924 if (!wl)
1925 return;
1926 wapbl_print(wl, 1, printf);
1927 }
1928 #endif
1929
1930 /****************************************************************/
1931
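/*
 * wapbl_register_deallocation(wl, blk, len, force)
 *
 *	Record that the len bytes at blk are being freed in this
 *	transaction, so that journalled copies of those blocks are
 *	revoked and not replayed.  Returns EAGAIN if the dealloc limit
 *	has been reached and force is false; a forced registration
 *	never fails but may overcommit the transaction (see below).
 */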
1932 int
1933 wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len, bool force)
1934 {
1935 struct wapbl_dealloc *wd;
1936 int error = 0;
1937
1938 wapbl_jlock_assert(wl);
1939
1940 mutex_enter(&wl->wl_mtx);
1941
1942 if (__predict_false(wl->wl_dealloccnt >= wl->wl_dealloclim)) {
1943 if (!force) {
1944 error = EAGAIN;
1945 goto out;
1946 }
1947
		/*
		 * Forced registration can only be used when:
		 * 1) the caller can't cope with failure
		 * 2) the path can be triggered only a small, bounded
		 *    number of times per transaction
		 * If these conditions are not fulfilled, and the path
		 * would be triggered many times, this could overflow
		 * the maximum transaction size and panic later.
		 */
1957 printf("%s: forced dealloc registration over limit: %d >= %d\n",
1958 wl->wl_mount->mnt_stat.f_mntonname,
1959 wl->wl_dealloccnt, wl->wl_dealloclim);
1960 }
1961
1962 wl->wl_dealloccnt++;
1963 mutex_exit(&wl->wl_mtx);
1964
1965 wd = pool_get(&wapbl_dealloc_pool, PR_WAITOK);
1966 wd->wd_blkno = blk;
1967 wd->wd_len = len;
1968
1969 mutex_enter(&wl->wl_mtx);
1970 SIMPLEQ_INSERT_TAIL(&wl->wl_dealloclist, wd, wd_entries);
1971
1972 out:
1973 mutex_exit(&wl->wl_mtx);
1974
1975 WAPBL_PRINTF(WAPBL_PRINT_ALLOC,
1976 ("wapbl_register_deallocation: blk=%"PRId64" len=%d error=%d\n",
1977 blk, len, error));
1978
1979 return error;
1980 }
1981
1982 /****************************************************************/
1983
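/*
 * wapbl_inodetrk_init(wl, size)
 *
 *	Initialize the hash table, with roughly size buckets, used to
 *	track inodes registered in wl; create the shared wapbl_ino
 *	pool on first use.
 */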
1984 static void
1985 wapbl_inodetrk_init(struct wapbl *wl, u_int size)
1986 {
1987
1988 wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask);
1989 if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) {
1990 pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0,
1991 "wapblinopl", &pool_allocator_nointr, IPL_NONE);
1992 }
1993 }
1994
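/*
 * wapbl_inodetrk_free(wl)
 *
 *	Free the inode tracking hash table of wl, which must be empty;
 *	destroy the shared wapbl_ino pool on last use.
 */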
1995 static void
1996 wapbl_inodetrk_free(struct wapbl *wl)
1997 {
1998
1999 /* XXX this KASSERT needs locking/mutex analysis */
2000 KASSERT(wl->wl_inohashcnt == 0);
2001 hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask);
2002 if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) {
2003 pool_destroy(&wapbl_ino_pool);
2004 }
2005 }
2006
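/*
 * wapbl_inodetrk_get(wl, ino)
 *
 *	Return the tracking entry for inode number ino in wl, or NULL
 *	if it is not registered.  Caller must hold wl->wl_mtx.
 */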
2007 static struct wapbl_ino *
2008 wapbl_inodetrk_get(struct wapbl *wl, ino_t ino)
2009 {
2010 struct wapbl_ino_head *wih;
2011 struct wapbl_ino *wi;
2012
2013 KASSERT(mutex_owned(&wl->wl_mtx));
2014
2015 wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
2016 LIST_FOREACH(wi, wih, wi_hash) {
2017 if (ino == wi->wi_ino)
2018 return wi;
2019 }
	return NULL;
2021 }
2022
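/*
 * wapbl_register_inode(wl, ino, mode)
 *
 *	Record inode number ino, with mode, in the current
 *	transaction's inode list, which is written to the journal so
 *	the file system can reconcile the inode's state at replay;
 *	duplicate registrations are ignored.
 */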
2023 void
2024 wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode)
2025 {
2026 struct wapbl_ino_head *wih;
2027 struct wapbl_ino *wi;
2028
2029 wi = pool_get(&wapbl_ino_pool, PR_WAITOK);
2030
2031 mutex_enter(&wl->wl_mtx);
2032 if (wapbl_inodetrk_get(wl, ino) == NULL) {
2033 wi->wi_ino = ino;
2034 wi->wi_mode = mode;
2035 wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
2036 LIST_INSERT_HEAD(wih, wi, wi_hash);
2037 wl->wl_inohashcnt++;
2038 WAPBL_PRINTF(WAPBL_PRINT_INODE,
2039 ("wapbl_register_inode: ino=%"PRId64"\n", ino));
2040 mutex_exit(&wl->wl_mtx);
2041 } else {
2042 mutex_exit(&wl->wl_mtx);
2043 pool_put(&wapbl_ino_pool, wi);
2044 }
2045 }
2046
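/*
 * wapbl_unregister_inode(wl, ino, mode)
 *
 *	Remove inode number ino from the current transaction's inode
 *	list, if it is registered.
 */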
2047 void
2048 wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode)
2049 {
2050 struct wapbl_ino *wi;
2051
2052 mutex_enter(&wl->wl_mtx);
2053 wi = wapbl_inodetrk_get(wl, ino);
2054 if (wi) {
2055 WAPBL_PRINTF(WAPBL_PRINT_INODE,
2056 ("wapbl_unregister_inode: ino=%"PRId64"\n", ino));
2057 KASSERT(wl->wl_inohashcnt > 0);
2058 wl->wl_inohashcnt--;
2059 LIST_REMOVE(wi, wi_hash);
2060 mutex_exit(&wl->wl_mtx);
2061
2062 pool_put(&wapbl_ino_pool, wi);
2063 } else {
2064 mutex_exit(&wl->wl_mtx);
2065 }
2066 }
2067
2068 /****************************************************************/
2069
2070 /*
2071 * wapbl_transaction_inodes_len(wl)
2072 *
2073 * Calculate the number of bytes required for inode registration
2074 * log records in wl.
2075 */
2076 static inline size_t
2077 wapbl_transaction_inodes_len(struct wapbl *wl)
2078 {
2079 int blocklen = 1<<wl->wl_log_dev_bshift;
2080 int iph;
2081
	/* Calculate number of inodes described in an inodelist header */
2083 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
2084 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
2085
2086 KASSERT(iph > 0);
2087
2088 return MAX(1, howmany(wl->wl_inohashcnt, iph)) * blocklen;
2089 }
2090
2091
2092 /*
2093 * wapbl_transaction_len(wl)
2094 *
2095 * Calculate number of bytes required for all log records in wl.
2096 */
2097 static size_t
2098 wapbl_transaction_len(struct wapbl *wl)
2099 {
2100 int blocklen = 1<<wl->wl_log_dev_bshift;
2101 size_t len;
2102
	/* Data bytes, plus blocklist, dealloc, and inode list headers */
2104 len = wl->wl_bcount;
2105 len += howmany(wl->wl_bufcount, wl->wl_brperjblock) * blocklen;
2106 len += howmany(wl->wl_dealloccnt, wl->wl_brperjblock) * blocklen;
2107 len += wapbl_transaction_inodes_len(wl);
2108
2109 return len;
2110 }
2111
2112 /*
2113 * wapbl_cache_sync(wl, msg)
2114 *
2115 * Issue DIOCCACHESYNC to wl->wl_devvp.
2116 *
2117 * If sysctl(vfs.wapbl.verbose_commit) >= 2, print a message
2118 * including msg about the duration of the cache sync.
2119 */
2120 static int
2121 wapbl_cache_sync(struct wapbl *wl, const char *msg)
2122 {
2123 const bool verbose = wapbl_verbose_commit >= 2;
2124 struct bintime start_time;
2125 int force = 1;
2126 int error;
2127
2128 if (!wapbl_flush_disk_cache) {
2129 return 0;
2130 }
2131 if (verbose) {
2132 bintime(&start_time);
2133 }
2134 error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force,
2135 FWRITE, FSCRED);
2136 if (error) {
2137 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
2138 ("wapbl_cache_sync: DIOCCACHESYNC on dev 0x%jx "
2139 "returned %d\n", (uintmax_t)wl->wl_devvp->v_rdev, error));
2140 }
2141 if (verbose) {
2142 struct bintime d;
2143 struct timespec ts;
2144
2145 bintime(&d);
2146 bintime_sub(&d, &start_time);
2147 bintime2timespec(&d, &ts);
2148 printf("wapbl_cache_sync: %s: dev 0x%jx %ju.%09lu\n",
2149 msg, (uintmax_t)wl->wl_devvp->v_rdev,
2150 (uintmax_t)ts.tv_sec, ts.tv_nsec);
2151 }
2152 return error;
2153 }
2154
2155 /*
2156 * wapbl_write_commit(wl, head, tail)
2157 *
2158 * Issue a disk cache sync to wait for all pending writes to the
2159 * log to complete, and then synchronously commit the current
2160 * circular queue head and tail to the log, in the next of two
2161 * locations for commit headers on disk.
2162 *
2163 * Increment the generation number. If the generation number
2164 * rolls over to zero, then a subsequent commit would appear to
2165 * have an older generation than this one -- in that case, issue a
2166 * duplicate commit to avoid this.
2167 *
2168 * => Caller must have exclusive access to wl, either by holding
2169 * wl->wl_rwlock for writer or by being wapbl_start before anyone
2170 * else has seen wl.
2171 */
2172 static int
2173 wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail)
2174 {
2175 struct wapbl_wc_header *wc = wl->wl_wc_header;
2176 struct timespec ts;
2177 int error;
2178 daddr_t pbn;
2179
2180 error = wapbl_buffered_flush(wl);
2181 if (error)
2182 return error;
	/*
	 * Flush disk cache to ensure that the blocks we've written are
	 * actually on stable storage before the commit header.
	 *
	 * XXX Ideally we would compute a checksum over the data here
	 * instead; the cache flush has to do for now.
	 */
2189 wapbl_cache_sync(wl, "1");
2190
2191 wc->wc_head = head;
2192 wc->wc_tail = tail;
2193 wc->wc_checksum = 0;
2194 wc->wc_version = 1;
2195 getnanotime(&ts);
2196 wc->wc_time = ts.tv_sec;
2197 wc->wc_timensec = ts.tv_nsec;
2198
2199 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
	    ("wapbl_write_commit: head = %"PRIdMAX" tail = %"PRIdMAX"\n",
2201 (intmax_t)head, (intmax_t)tail));
2202
2203 /*
2204 * write the commit header.
2205 *
2206 * XXX if generation will rollover, then first zero
2207 * over second commit header before trying to write both headers.
2208 */
2209
2210 pbn = wl->wl_logpbn + (wc->wc_generation % 2);
2211 #ifdef _KERNEL
2212 pbn = btodb(pbn << wc->wc_log_dev_bshift);
2213 #endif
2214 error = wapbl_buffered_write(wc, wc->wc_len, wl, pbn);
2215 if (error)
2216 return error;
2217 error = wapbl_buffered_flush(wl);
2218 if (error)
2219 return error;
2220
2221 /*
2222 * flush disk cache to ensure that the commit header is actually
2223 * written before meta data blocks.
2224 */
2225 wapbl_cache_sync(wl, "2");
2226
2227 /*
2228 * If the generation number was zero, write it out a second time.
	 * This handles initialization and generation number rollover.
2230 */
2231 if (wc->wc_generation++ == 0) {
2232 error = wapbl_write_commit(wl, head, tail);
2233 /*
2234 * This panic should be able to be removed if we do the
2235 * zero'ing mentioned above, and we are certain to roll
2236 * back generation number on failure.
2237 */
2238 if (error)
2239 panic("wapbl_write_commit: error writing duplicate "
2240 "log header: %d", error);
2241 }
2242 return 0;
2243 }
2244
2245 /*
2246 * wapbl_write_blocks(wl, offp)
2247 *
2248 * Write all pending physical blocks in the current transaction
2249 * from wapbl_add_buf to the log on disk, adding to the circular
2250 * queue head at byte offset *offp, and returning the new head's
2251 * byte offset in *offp.
2252 */
2253 static int
2254 wapbl_write_blocks(struct wapbl *wl, off_t *offp)
2255 {
2256 struct wapbl_wc_blocklist *wc =
2257 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
2258 int blocklen = 1<<wl->wl_log_dev_bshift;
2259 struct buf *bp;
2260 off_t off = *offp;
2261 int error;
2262 size_t padding;
2263
2264 KASSERT(rw_write_held(&wl->wl_rwlock));
2265
2266 bp = LIST_FIRST(&wl->wl_bufs);
2267
2268 while (bp) {
2269 int cnt;
2270 struct buf *obp = bp;
2271
2272 KASSERT(bp->b_flags & B_LOCKED);
2273
2274 wc->wc_type = WAPBL_WC_BLOCKS;
2275 wc->wc_len = blocklen;
2276 wc->wc_blkcount = 0;
2277 while (bp && (wc->wc_blkcount < wl->wl_brperjblock)) {
2278 /*
2279 * Make sure all the physical block numbers are up to
2280 * date. If this is not always true on a given
2281 * filesystem, then VOP_BMAP must be called. We
2282 * could call VOP_BMAP here, or else in the filesystem
2283 * specific flush callback, although neither of those
2284 * solutions allow us to take the vnode lock. If a
2285 * filesystem requires that we must take the vnode lock
2286 * to call VOP_BMAP, then we can probably do it in
2287 * bwrite when the vnode lock should already be held
2288 * by the invoking code.
2289 */
2290 KASSERT((bp->b_vp->v_type == VBLK) ||
2291 (bp->b_blkno != bp->b_lblkno));
2292 KASSERT(bp->b_blkno > 0);
2293
2294 wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno;
2295 wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount;
2296 wc->wc_len += bp->b_bcount;
2297 wc->wc_blkcount++;
2298 bp = LIST_NEXT(bp, b_wapbllist);
2299 }
2300 if (wc->wc_len % blocklen != 0) {
2301 padding = blocklen - wc->wc_len % blocklen;
2302 wc->wc_len += padding;
2303 } else {
2304 padding = 0;
2305 }
2306
2307 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2308 ("wapbl_write_blocks: len = %u (padding %zu) off = %"PRIdMAX"\n",
2309 wc->wc_len, padding, (intmax_t)off));
2310
2311 error = wapbl_circ_write(wl, wc, blocklen, &off);
2312 if (error)
2313 return error;
2314 bp = obp;
2315 cnt = 0;
2316 while (bp && (cnt++ < wl->wl_brperjblock)) {
2317 error = wapbl_circ_write(wl, bp->b_data,
2318 bp->b_bcount, &off);
2319 if (error)
2320 return error;
2321 bp = LIST_NEXT(bp, b_wapbllist);
2322 }
2323 if (padding) {
2324 void *zero;
2325
2326 zero = wapbl_alloc(padding);
2327 memset(zero, 0, padding);
2328 error = wapbl_circ_write(wl, zero, padding, &off);
2329 wapbl_free(zero, padding);
2330 if (error)
2331 return error;
2332 }
2333 }
2334 *offp = off;
2335 return 0;
2336 }
2337
2338 /*
2339 * wapbl_write_revocations(wl, offp)
2340 *
2341 * Write all pending deallocations in the current transaction from
2342 * wapbl_register_deallocation to the log on disk, adding to the
2343 * circular queue's head at byte offset *offp, and returning the
2344 * new head's byte offset in *offp.
2345 */
2346 static int
2347 wapbl_write_revocations(struct wapbl *wl, off_t *offp)
2348 {
2349 struct wapbl_wc_blocklist *wc =
2350 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
2351 struct wapbl_dealloc *wd, *lwd;
2352 int blocklen = 1<<wl->wl_log_dev_bshift;
2353 off_t off = *offp;
2354 int error;
2355
2356 if (wl->wl_dealloccnt == 0)
2357 return 0;
2358
2359 while ((wd = SIMPLEQ_FIRST(&wl->wl_dealloclist)) != NULL) {
2360 wc->wc_type = WAPBL_WC_REVOCATIONS;
2361 wc->wc_len = blocklen;
2362 wc->wc_blkcount = 0;
2363 while (wd && (wc->wc_blkcount < wl->wl_brperjblock)) {
2364 wc->wc_blocks[wc->wc_blkcount].wc_daddr =
2365 wd->wd_blkno;
2366 wc->wc_blocks[wc->wc_blkcount].wc_dlen =
2367 wd->wd_len;
2368 wc->wc_blkcount++;
2369
2370 wd = SIMPLEQ_NEXT(wd, wd_entries);
2371 }
2372 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2373 ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n",
2374 wc->wc_len, (intmax_t)off));
2375 error = wapbl_circ_write(wl, wc, blocklen, &off);
2376 if (error)
2377 return error;
2378
2379 /* free all successfully written deallocs */
2380 lwd = wd;
2381 while ((wd = SIMPLEQ_FIRST(&wl->wl_dealloclist)) != NULL) {
2382 if (wd == lwd)
2383 break;
2384 SIMPLEQ_REMOVE_HEAD(&wl->wl_dealloclist, wd_entries);
2385 pool_put(&wapbl_dealloc_pool, wd);
2386 wl->wl_dealloccnt--;
2387 }
2388 }
2389 *offp = off;
2390 return 0;
2391 }
2392
2393 /*
2394 * wapbl_write_inodes(wl, offp)
2395 *
2396 * Write all pending inode allocations in the current transaction
2397 * from wapbl_register_inode to the log on disk, adding to the
2398 * circular queue's head at byte offset *offp and returning the
2399 * new head's byte offset in *offp.
2400 */
2401 static int
2402 wapbl_write_inodes(struct wapbl *wl, off_t *offp)
2403 {
2404 struct wapbl_wc_inodelist *wc =
2405 (struct wapbl_wc_inodelist *)wl->wl_wc_scratch;
2406 int i;
2407 int blocklen = 1 << wl->wl_log_dev_bshift;
2408 off_t off = *offp;
2409 int error;
2410
2411 struct wapbl_ino_head *wih;
2412 struct wapbl_ino *wi;
2413 int iph;
2414
2415 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
2416 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
2417
2418 i = 0;
2419 wih = &wl->wl_inohash[0];
	wi = NULL;
2421 do {
2422 wc->wc_type = WAPBL_WC_INODES;
2423 wc->wc_len = blocklen;
2424 wc->wc_inocnt = 0;
2425 wc->wc_clear = (i == 0);
2426 while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) {
2427 while (!wi) {
2428 KASSERT((wih - &wl->wl_inohash[0])
2429 <= wl->wl_inohashmask);
2430 wi = LIST_FIRST(wih++);
2431 }
2432 wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino;
2433 wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode;
2434 wc->wc_inocnt++;
2435 i++;
2436 wi = LIST_NEXT(wi, wi_hash);
2437 }
2438 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2439 ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n",
2440 wc->wc_len, (intmax_t)off));
2441 error = wapbl_circ_write(wl, wc, blocklen, &off);
2442 if (error)
2443 return error;
2444 } while (i < wl->wl_inohashcnt);
2445
2446 *offp = off;
2447 return 0;
2448 }
2449
2450 #endif /* _KERNEL */
2451
2452 /****************************************************************/
2453
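/*
 * wapbl_blk: an entry in the replay block hash, mapping a disk block
 * number to the offset in the log of its most recent journalled copy.
 */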
2454 struct wapbl_blk {
2455 LIST_ENTRY(wapbl_blk) wb_hash;
2456 daddr_t wb_blk;
2457 off_t wb_off; /* Offset of this block in the log */
2458 };
2459 #define WAPBL_BLKPOOL_MIN 83
2460
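/*
 * wapbl_blkhash_init(wr, size)
 *
 *	Initialize the replay block hash of wr with roughly size
 *	buckets.
 */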
2461 static void
2462 wapbl_blkhash_init(struct wapbl_replay *wr, u_int size)
2463 {
2464 if (size < WAPBL_BLKPOOL_MIN)
2465 size = WAPBL_BLKPOOL_MIN;
	KASSERT(wr->wr_blkhash == NULL);
2467 #ifdef _KERNEL
2468 wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask);
2469 #else /* ! _KERNEL */
2470 /* Manually implement hashinit */
2471 {
2472 unsigned long i, hashsize;
2473 for (hashsize = 1; hashsize < size; hashsize <<= 1)
2474 continue;
2475 wr->wr_blkhash = wapbl_alloc(hashsize * sizeof(*wr->wr_blkhash));
2476 for (i = 0; i < hashsize; i++)
2477 LIST_INIT(&wr->wr_blkhash[i]);
2478 wr->wr_blkhashmask = hashsize - 1;
2479 }
2480 #endif /* ! _KERNEL */
2481 }
2482
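/*
 * wapbl_blkhash_free(wr)
 *
 *	Free the replay block hash of wr, which must be empty.
 */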
2483 static void
2484 wapbl_blkhash_free(struct wapbl_replay *wr)
2485 {
2486 KASSERT(wr->wr_blkhashcnt == 0);
2487 #ifdef _KERNEL
2488 hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask);
2489 #else /* ! _KERNEL */
2490 wapbl_free(wr->wr_blkhash,
2491 (wr->wr_blkhashmask + 1) * sizeof(*wr->wr_blkhash));
2492 #endif /* ! _KERNEL */
2493 }
2494
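/*
 * wapbl_blkhash_get(wr, blk)
 *
 *	Return the block hash entry for disk block blk, or NULL if
 *	there is none.
 */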
2495 static struct wapbl_blk *
2496 wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk)
2497 {
2498 struct wapbl_blk_head *wbh;
2499 struct wapbl_blk *wb;
2500 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2501 LIST_FOREACH(wb, wbh, wb_hash) {
2502 if (blk == wb->wb_blk)
2503 return wb;
2504 }
	return NULL;
2506 }
2507
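/*
 * wapbl_blkhash_ins(wr, blk, off)
 *
 *	Record that the journal copy of disk block blk lives at log
 *	offset off, replacing any earlier offset recorded for blk.
 */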
2508 static void
2509 wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off)
2510 {
2511 struct wapbl_blk_head *wbh;
2512 struct wapbl_blk *wb;
2513 wb = wapbl_blkhash_get(wr, blk);
2514 if (wb) {
2515 KASSERT(wb->wb_blk == blk);
2516 wb->wb_off = off;
2517 } else {
2518 wb = wapbl_alloc(sizeof(*wb));
2519 wb->wb_blk = blk;
2520 wb->wb_off = off;
2521 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2522 LIST_INSERT_HEAD(wbh, wb, wb_hash);
2523 wr->wr_blkhashcnt++;
2524 }
2525 }
2526
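/*
 * wapbl_blkhash_rem(wr, blk)
 *
 *	Forget the journal copy of disk block blk, if any.
 */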
2527 static void
2528 wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk)
2529 {
2530 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2531 if (wb) {
2532 KASSERT(wr->wr_blkhashcnt > 0);
2533 wr->wr_blkhashcnt--;
2534 LIST_REMOVE(wb, wb_hash);
2535 wapbl_free(wb, sizeof(*wb));
2536 }
2537 }
2538
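/*
 * wapbl_blkhash_clear(wr)
 *
 *	Empty the replay block hash, freeing all entries.
 */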
2539 static void
2540 wapbl_blkhash_clear(struct wapbl_replay *wr)
2541 {
2542 unsigned long i;
2543 for (i = 0; i <= wr->wr_blkhashmask; i++) {
2544 struct wapbl_blk *wb;
2545
2546 while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) {
2547 KASSERT(wr->wr_blkhashcnt > 0);
2548 wr->wr_blkhashcnt--;
2549 LIST_REMOVE(wb, wb_hash);
2550 wapbl_free(wb, sizeof(*wb));
2551 }
2552 }
2553 KASSERT(wr->wr_blkhashcnt == 0);
2554 }
2555
2556 /****************************************************************/
2557
2558 /*
2559 * wapbl_circ_read(wr, data, len, offp)
2560 *
2561 * Read len bytes into data from the circular queue of wr,
2562 * starting at the linear byte offset *offp, and returning the new
2563 * linear byte offset in *offp.
2564 *
2565 * If the starting linear byte offset precedes wr->wr_circ_off,
2566 * the read instead begins at wr->wr_circ_off. XXX WTF? This
2567 * should be a KASSERT, not a conditional.
2568 */
2569 static int
2570 wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp)
2571 {
2572 size_t slen;
2573 off_t off = *offp;
2574 int error;
2575 daddr_t pbn;
2576
2577 KASSERT(((len >> wr->wr_log_dev_bshift) <<
2578 wr->wr_log_dev_bshift) == len);
2579
2580 if (off < wr->wr_circ_off)
2581 off = wr->wr_circ_off;
2582 slen = wr->wr_circ_off + wr->wr_circ_size - off;
2583 if (slen < len) {
2584 pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
2585 #ifdef _KERNEL
2586 pbn = btodb(pbn << wr->wr_log_dev_bshift);
2587 #endif
2588 error = wapbl_read(data, slen, wr->wr_devvp, pbn);
2589 if (error)
2590 return error;
2591 data = (uint8_t *)data + slen;
2592 len -= slen;
2593 off = wr->wr_circ_off;
2594 }
2595 pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
2596 #ifdef _KERNEL
2597 pbn = btodb(pbn << wr->wr_log_dev_bshift);
2598 #endif
2599 error = wapbl_read(data, len, wr->wr_devvp, pbn);
2600 if (error)
2601 return error;
2602 off += len;
2603 if (off >= wr->wr_circ_off + wr->wr_circ_size)
2604 off = wr->wr_circ_off;
2605 *offp = off;
2606 return 0;
2607 }
2608
2609 /*
2610 * wapbl_circ_advance(wr, len, offp)
2611 *
2612 * Compute the linear byte offset of the circular queue of wr that
2613 * is len bytes past *offp, and store it in *offp.
2614 *
2615 * This is as if wapbl_circ_read, but without actually reading
2616 * anything.
2617 *
2618 * If the starting linear byte offset precedes wr->wr_circ_off, it
2619 * is taken to be wr->wr_circ_off instead. XXX WTF? This should
2620 * be a KASSERT, not a conditional.
2621 */
2622 static void
2623 wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp)
2624 {
2625 size_t slen;
2626 off_t off = *offp;
2627
2628 KASSERT(((len >> wr->wr_log_dev_bshift) <<
2629 wr->wr_log_dev_bshift) == len);
2630
2631 if (off < wr->wr_circ_off)
2632 off = wr->wr_circ_off;
2633 slen = wr->wr_circ_off + wr->wr_circ_size - off;
2634 if (slen < len) {
2635 len -= slen;
2636 off = wr->wr_circ_off;
2637 }
2638 off += len;
2639 if (off >= wr->wr_circ_off + wr->wr_circ_size)
2640 off = wr->wr_circ_off;
2641 *offp = off;
2642 }
2643
2644 /****************************************************************/
2645
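/*
 * wapbl_replay_start(wrp, vp, off, count, blksize)
 *
 *	Read the journal of vp, starting at the more recent of its two
 *	commit headers, and gather the locations of all journalled
 *	blocks into a replay state returned in *wrp.
 *
 *	Typical use, as a sketch:
 *
 *		error = wapbl_replay_start(&wr, vp, off, count, blksize);
 *		if (error)
 *			goto fail;
 *		error = wapbl_replay_write(wr, fsdevvp);
 *		wapbl_replay_stop(wr);
 *		wapbl_replay_free(wr);
 */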
2646 int
2647 wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp,
2648 daddr_t off, size_t count, size_t blksize)
2649 {
2650 struct wapbl_replay *wr;
2651 int error;
2652 struct vnode *devvp;
2653 daddr_t logpbn;
2654 uint8_t *scratch;
2655 struct wapbl_wc_header *wch;
2656 struct wapbl_wc_header *wch2;
2657 /* Use this until we read the actual log header */
2658 int log_dev_bshift = ilog2(blksize);
2659 size_t used;
2660 daddr_t pbn;
2661
2662 WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
2663 ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n",
2664 vp, off, count, blksize));
2665
2666 if (off < 0)
2667 return EINVAL;
2668
2669 if (blksize < DEV_BSIZE)
2670 return EINVAL;
2671 if (blksize % DEV_BSIZE)
2672 return EINVAL;
2673
2674 #ifdef _KERNEL
2675 #if 0
	/*
	 * XXX vp->v_size isn't reliably set for VBLK devices,
	 * especially root.  However, we might still want to verify
	 * that the full load is readable.
	 */
2679 if ((off + count) * blksize > vp->v_size)
2680 return EINVAL;
2681 #endif
2682 if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) {
2683 return error;
2684 }
2685 #else /* ! _KERNEL */
2686 devvp = vp;
2687 logpbn = off;
2688 #endif /* ! _KERNEL */
2689
2690 scratch = wapbl_alloc(MAXBSIZE);
2691
2692 pbn = logpbn;
2693 #ifdef _KERNEL
2694 pbn = btodb(pbn << log_dev_bshift);
2695 #endif
2696 error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, pbn);
2697 if (error)
2698 goto errout;
2699
2700 wch = (struct wapbl_wc_header *)scratch;
2701 wch2 =
2702 (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift));
2703 /* XXX verify checksums and magic numbers */
2704 if (wch->wc_type != WAPBL_WC_HEADER) {
2705 printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type);
2706 error = EFTYPE;
2707 goto errout;
2708 }
2709
2710 if (wch2->wc_generation > wch->wc_generation)
2711 wch = wch2;
2712
2713 wr = wapbl_calloc(1, sizeof(*wr));
2714
2715 wr->wr_logvp = vp;
2716 wr->wr_devvp = devvp;
2717 wr->wr_logpbn = logpbn;
2718
2719 wr->wr_scratch = scratch;
2720
2721 wr->wr_log_dev_bshift = wch->wc_log_dev_bshift;
2722 wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift;
2723 wr->wr_circ_off = wch->wc_circ_off;
2724 wr->wr_circ_size = wch->wc_circ_size;
2725 wr->wr_generation = wch->wc_generation;
2726
2727 used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail);
2728
2729 WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
2730 ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64
2731 " len=%"PRId64" used=%zu\n",
2732 wch->wc_head, wch->wc_tail, wch->wc_circ_off,
2733 wch->wc_circ_size, used));
2734
2735 wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift));
2736
2737 error = wapbl_replay_process(wr, wch->wc_head, wch->wc_tail);
2738 if (error) {
2739 wapbl_replay_stop(wr);
2740 wapbl_replay_free(wr);
2741 return error;
2742 }
2743
2744 *wrp = wr;
2745 return 0;
2746
2747 errout:
2748 wapbl_free(scratch, MAXBSIZE);
2749 return error;
2750 }
2751
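/*
 * wapbl_replay_stop(wr)
 *
 *	Release the scratch space and block hash of wr; afterwards the
 *	replay state can only be freed.
 */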
2752 void
2753 wapbl_replay_stop(struct wapbl_replay *wr)
2754 {
2755
2756 if (!wapbl_replay_isopen(wr))
2757 return;
2758
2759 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n"));
2760
2761 wapbl_free(wr->wr_scratch, MAXBSIZE);
2762 wr->wr_scratch = NULL;
2763
2764 wr->wr_logvp = NULL;
2765
2766 wapbl_blkhash_clear(wr);
2767 wapbl_blkhash_free(wr);
2768 }
2769
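/*
 * wapbl_replay_free(wr)
 *
 *	Free a stopped replay state, including any gathered inode
 *	records.
 */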
2770 void
2771 wapbl_replay_free(struct wapbl_replay *wr)
2772 {
2773
2774 KDASSERT(!wapbl_replay_isopen(wr));
2775
2776 if (wr->wr_inodes)
2777 wapbl_free(wr->wr_inodes,
2778 wr->wr_inodescnt * sizeof(wr->wr_inodes[0]));
2779 wapbl_free(wr, sizeof(*wr));
2780 }
2781
2782 #ifdef _KERNEL
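/*
 * wapbl_replay_isopen1(wr)
 *
 *	Out-of-line wrapper around wapbl_replay_isopen().
 */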
2783 int
2784 wapbl_replay_isopen1(struct wapbl_replay *wr)
2785 {
2786
2787 return wapbl_replay_isopen(wr);
2788 }
2789 #endif
2790
/*
 * Calculate the disk address for the i'th block in the wc_blocks list,
 * offset by j blocks of size blen.
 *
 * wc_daddr is always a kernel disk address in DEV_BSIZE units that
 * was written to the journal.
 *
 * The kernel needs that address plus the offset in DEV_BSIZE units.
 *
 * Userland needs that address plus the offset in blen units.
 */
2803 static daddr_t
2804 wapbl_block_daddr(struct wapbl_wc_blocklist *wc, int i, int j, int blen)
2805 {
2806 daddr_t pbn;
2807
2808 #ifdef _KERNEL
2809 pbn = wc->wc_blocks[i].wc_daddr + btodb(j * blen);
2810 #else
2811 pbn = dbtob(wc->wc_blocks[i].wc_daddr) / blen + j;
2812 #endif
2813
2814 return pbn;
2815 }
2816
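/*
 * wapbl_replay_process_blocks(wr, offp)
 *
 *	Record the log offset of every physical block described by the
 *	blocklist in the scratch buffer, advancing *offp past the data.
 */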
2817 static void
2818 wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp)
2819 {
2820 struct wapbl_wc_blocklist *wc =
2821 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2822 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2823 int i, j, n;
2824
2825 for (i = 0; i < wc->wc_blkcount; i++) {
2826 /*
2827 * Enter each physical block into the hashtable independently.
2828 */
2829 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
2830 for (j = 0; j < n; j++) {
2831 wapbl_blkhash_ins(wr, wapbl_block_daddr(wc, i, j, fsblklen),
2832 *offp);
2833 wapbl_circ_advance(wr, fsblklen, offp);
2834 }
2835 }
2836 }
2837
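/*
 * wapbl_replay_process_revocations(wr)
 *
 *	Drop every block revoked by the blocklist in the scratch
 *	buffer from the replay block hash.
 */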
2838 static void
2839 wapbl_replay_process_revocations(struct wapbl_replay *wr)
2840 {
2841 struct wapbl_wc_blocklist *wc =
2842 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2843 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2844 int i, j, n;
2845
2846 for (i = 0; i < wc->wc_blkcount; i++) {
2847 /*
2848 * Remove any blocks found from the hashtable.
2849 */
2850 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
2851 for (j = 0; j < n; j++)
2852 wapbl_blkhash_rem(wr, wapbl_block_daddr(wc, i, j, fsblklen));
2853 }
2854 }
2855
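/*
 * wapbl_replay_process_inodes(wr, oldoff, newoff)
 *
 *	Append the inode records in the scratch buffer to the list in
 *	wr, first clearing the list if the record's wc_clear flag is
 *	set, and note where in the log the records were found.
 */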
2856 static void
2857 wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff, off_t newoff)
2858 {
2859 struct wapbl_wc_inodelist *wc =
2860 (struct wapbl_wc_inodelist *)wr->wr_scratch;
2861 void *new_inodes;
2862 const size_t oldsize = wr->wr_inodescnt * sizeof(wr->wr_inodes[0]);
2863
2864 KASSERT(sizeof(wr->wr_inodes[0]) == sizeof(wc->wc_inodes[0]));
2865
	/*
	 * Keep track of where we found this so the location won't be
	 * overwritten.
	 */
2870 if (wc->wc_clear) {
2871 wr->wr_inodestail = oldoff;
2872 wr->wr_inodescnt = 0;
2873 if (wr->wr_inodes != NULL) {
2874 wapbl_free(wr->wr_inodes, oldsize);
2875 wr->wr_inodes = NULL;
2876 }
2877 }
2878 wr->wr_inodeshead = newoff;
2879 if (wc->wc_inocnt == 0)
2880 return;
2881
2882 new_inodes = wapbl_alloc((wr->wr_inodescnt + wc->wc_inocnt) *
2883 sizeof(wr->wr_inodes[0]));
2884 if (wr->wr_inodes != NULL) {
2885 memcpy(new_inodes, wr->wr_inodes, oldsize);
2886 wapbl_free(wr->wr_inodes, oldsize);
2887 }
2888 wr->wr_inodes = new_inodes;
2889 memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes,
2890 wc->wc_inocnt * sizeof(wr->wr_inodes[0]));
2891 wr->wr_inodescnt += wc->wc_inocnt;
2892 }
2893
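/*
 * wapbl_replay_process(wr, head, tail)
 *
 *	Walk the log from tail to head, dispatching on each record's
 *	type to rebuild the block hash and inode list; fails with
 *	EFTYPE on unrecognized or inconsistent records.
 */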
2894 static int
2895 wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail)
2896 {
2897 off_t off;
2898 int error;
2899
2900 int logblklen = 1 << wr->wr_log_dev_bshift;
2901
2902 wapbl_blkhash_clear(wr);
2903
2904 off = tail;
2905 while (off != head) {
2906 struct wapbl_wc_null *wcn;
2907 off_t saveoff = off;
2908 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
2909 if (error)
2910 goto errout;
2911 wcn = (struct wapbl_wc_null *)wr->wr_scratch;
2912 switch (wcn->wc_type) {
2913 case WAPBL_WC_BLOCKS:
2914 wapbl_replay_process_blocks(wr, &off);
2915 break;
2916
2917 case WAPBL_WC_REVOCATIONS:
2918 wapbl_replay_process_revocations(wr);
2919 break;
2920
2921 case WAPBL_WC_INODES:
2922 wapbl_replay_process_inodes(wr, saveoff, off);
2923 break;
2924
2925 default:
2926 printf("Unrecognized wapbl type: 0x%08x\n",
2927 wcn->wc_type);
2928 error = EFTYPE;
2929 goto errout;
2930 }
2931 wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
2932 if (off != saveoff) {
2933 printf("wapbl_replay: corrupted records\n");
2934 error = EFTYPE;
2935 goto errout;
2936 }
2937 }
2938 return 0;
2939
2940 errout:
2941 wapbl_blkhash_clear(wr);
2942 return error;
2943 }
2944
2945 #if 0
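/*
 * XXX This disabled verification code still refers to the on-disk
 * commit header as wch and will not compile as-is.
 */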
2946 int
2947 wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp)
2948 {
2949 off_t off;
2950 int mismatchcnt = 0;
2951 int logblklen = 1 << wr->wr_log_dev_bshift;
2952 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2953 void *scratch1 = wapbl_alloc(MAXBSIZE);
2954 void *scratch2 = wapbl_alloc(MAXBSIZE);
2955 int error = 0;
2956
2957 KDASSERT(wapbl_replay_isopen(wr));
2958
2959 off = wch->wc_tail;
2960 while (off != wch->wc_head) {
2961 struct wapbl_wc_null *wcn;
2962 #ifdef DEBUG
2963 off_t saveoff = off;
2964 #endif
2965 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
2966 if (error)
2967 goto out;
2968 wcn = (struct wapbl_wc_null *)wr->wr_scratch;
2969 switch (wcn->wc_type) {
2970 case WAPBL_WC_BLOCKS:
2971 {
2972 struct wapbl_wc_blocklist *wc =
2973 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2974 int i;
2975 for (i = 0; i < wc->wc_blkcount; i++) {
2976 int foundcnt = 0;
2977 int dirtycnt = 0;
2978 int j, n;
2979 /*
2980 * Check each physical block into the
2981 * hashtable independently
2982 */
2983 n = wc->wc_blocks[i].wc_dlen >>
2984 wch->wc_fs_dev_bshift;
2985 for (j = 0; j < n; j++) {
2986 struct wapbl_blk *wb =
2987 wapbl_blkhash_get(wr,
2988 wapbl_block_daddr(wc, i, j, fsblklen));
2989 if (wb && (wb->wb_off == off)) {
2990 foundcnt++;
2991 error =
2992 wapbl_circ_read(wr,
2993 scratch1, fsblklen,
2994 &off);
2995 if (error)
2996 goto out;
2997 error =
2998 wapbl_read(scratch2,
2999 fsblklen, fsdevvp,
3000 wb->wb_blk);
3001 if (error)
3002 goto out;
3003 if (memcmp(scratch1,
3004 scratch2,
3005 fsblklen)) {
3006 printf(
3007 "wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n",
3008 wb->wb_blk, (intmax_t)off);
3009 dirtycnt++;
3010 mismatchcnt++;
3011 }
3012 } else {
3013 wapbl_circ_advance(wr,
3014 fsblklen, &off);
3015 }
3016 }
3017 #if 0
3018 /*
3019 * If all of the blocks in an entry
3020 * are clean, then remove all of its
3021 * blocks from the hashtable since they
3022 * never will need replay.
3023 */
3024 if ((foundcnt != 0) &&
3025 (dirtycnt == 0)) {
3026 off = saveoff;
3027 wapbl_circ_advance(wr,
3028 logblklen, &off);
3029 for (j = 0; j < n; j++) {
3030 struct wapbl_blk *wb =
3031 wapbl_blkhash_get(wr,
3032 wapbl_block_daddr(wc, i, j, fsblklen));
3033 if (wb &&
3034 (wb->wb_off == off)) {
3035 wapbl_blkhash_rem(wr, wb->wb_blk);
3036 }
3037 wapbl_circ_advance(wr,
3038 fsblklen, &off);
3039 }
3040 }
3041 #endif
3042 }
3043 }
3044 break;
3045 case WAPBL_WC_REVOCATIONS:
3046 case WAPBL_WC_INODES:
3047 break;
3048 default:
3049 KASSERT(0);
3050 }
3051 #ifdef DEBUG
3052 wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
3053 KASSERT(off == saveoff);
3054 #endif
3055 }
3056 out:
3057 wapbl_free(scratch1, MAXBSIZE);
3058 wapbl_free(scratch2, MAXBSIZE);
3059 if (!error && mismatchcnt)
3060 error = EFTYPE;
3061 return error;
3062 }
3063 #endif
3064
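/*
 * wapbl_replay_write(wr, fsdevvp)
 *
 *	Write every block recorded in the replay block hash from the
 *	log to its home location on fsdevvp, completing the replay.
 */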
3065 int
3066 wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp)
3067 {
3068 struct wapbl_blk *wb;
3069 size_t i;
3070 off_t off;
3071 void *scratch;
3072 int error = 0;
3073 int fsblklen = 1 << wr->wr_fs_dev_bshift;
3074
3075 KDASSERT(wapbl_replay_isopen(wr));
3076
3077 scratch = wapbl_alloc(MAXBSIZE);
3078
3079 for (i = 0; i <= wr->wr_blkhashmask; ++i) {
3080 LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) {
3081 off = wb->wb_off;
3082 error = wapbl_circ_read(wr, scratch, fsblklen, &off);
3083 if (error)
3084 break;
3085 error = wapbl_write(scratch, fsblklen, fsdevvp,
3086 wb->wb_blk);
3087 if (error)
3088 break;
3089 }
3090 }
3091
3092 wapbl_free(scratch, MAXBSIZE);
3093 return error;
3094 }
3095
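/*
 * wapbl_replay_can_read(wr, blk, len)
 *
 *	Return nonzero if any of the len bytes of blocks starting at
 *	blk have a journalled copy that replay would supply.
 */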
3096 int
3097 wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len)
3098 {
3099 int fsblklen = 1 << wr->wr_fs_dev_bshift;
3100
3101 KDASSERT(wapbl_replay_isopen(wr));
3102 KASSERT((len % fsblklen) == 0);
3103
	while (len != 0) {
		struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
		if (wb)
			return 1;
		len -= fsblklen;
		blk++;
	}
3110 return 0;
3111 }
3112
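/*
 * wapbl_replay_read(wr, data, blk, len)
 *
 *	Read len bytes at blk into data, taking each block from its
 *	journalled copy where one exists; blocks without a journalled
 *	copy are left untouched in data.
 */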
3113 int
3114 wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len)
3115 {
3116 int fsblklen = 1 << wr->wr_fs_dev_bshift;
3117
3118 KDASSERT(wapbl_replay_isopen(wr));
3119
3120 KASSERT((len % fsblklen) == 0);
3121
3122 while (len != 0) {
3123 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
3124 if (wb) {
3125 off_t off = wb->wb_off;
3126 int error;
3127 error = wapbl_circ_read(wr, data, fsblklen, &off);
3128 if (error)
3129 return error;
3130 }
3131 data = (uint8_t *)data + fsblklen;
3132 len -= fsblklen;
3133 blk++;
3134 }
3135 return 0;
3136 }
3137
3138 #ifdef _KERNEL
3139
3140 MODULE(MODULE_CLASS_VFS, wapbl, NULL);
3141
3142 static int
3143 wapbl_modcmd(modcmd_t cmd, void *arg)
3144 {
3145
3146 switch (cmd) {
3147 case MODULE_CMD_INIT:
3148 wapbl_init();
3149 return 0;
3150 case MODULE_CMD_FINI:
3151 return wapbl_fini();
3152 default:
3153 return ENOTTY;
3154 }
3155 }
3156 #endif /* _KERNEL */
3157