/*	$NetBSD: vfs_wapbl.c,v 1.48.2.1 2012/02/18 07:35:35 mrg Exp $	*/

/*-
 * Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * This implements file system independent write ahead logging (WAPBL).
 */

#define	WAPBL_INTERNAL

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.48.2.1 2012/02/18 07:35:35 mrg Exp $");

#include <sys/param.h>
#include <sys/bitops.h>

#ifdef _KERNEL
#include <sys/param.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/module.h>
#include <sys/resourcevar.h>
#include <sys/conf.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/kauth.h>
#include <sys/mutex.h>
#include <sys/atomic.h>
#include <sys/kmem.h>	/* kmem_alloc()/kmem_free() for the wapbl_alloc() macros below */
#include <sys/wapbl.h>
#include <sys/wapbl_replay.h>

#include <miscfs/specfs/specdev.h>

#define	wapbl_alloc(s) kmem_alloc((s), KM_SLEEP)
#define	wapbl_free(a, s) kmem_free((a), (s))
#define	wapbl_calloc(n, s) kmem_zalloc((n)*(s), KM_SLEEP)

static struct sysctllog *wapbl_sysctl;
static int wapbl_flush_disk_cache = 1;
static int wapbl_verbose_commit = 0;

#else /* !_KERNEL */
#include <assert.h>
#include <errno.h>
#include <stdio.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

#include <sys/time.h>
#include <sys/wapbl.h>
#include <sys/wapbl_replay.h>

#define	KDASSERT(x) assert(x)
#define	KASSERT(x) assert(x)
#define	wapbl_alloc(s) malloc(s)
#define	wapbl_free(a, s) free(a)
#define	wapbl_calloc(n, s) calloc((n), (s))

#endif /* !_KERNEL */

/*
 * INTERNAL DATA STRUCTURES
 */

/*
 * This structure holds per-mount log information.
 *
 * Legend:	a = atomic access only
 *		r = read-only after init
 *		l = rwlock held
 *		m = mutex held
 *		lm = rwlock held writing or mutex held
 *		u = unlocked access ok
 *		b = bufcache_lock held
 */
struct wapbl {
	struct vnode *wl_logvp;	/* r:	log here */
	struct vnode *wl_devvp;	/* r:	log on this device */
	struct mount *wl_mount;	/* r:	mountpoint wl is associated with */
	daddr_t wl_logpbn;	/* r:	Physical block number of start of log */
	int wl_log_dev_bshift;	/* r:	logarithm of device block size of log
					device */
	int wl_fs_dev_bshift;	/* r:	logarithm of device block size of
					filesystem device */

	unsigned wl_lock_count;	/* m:	Count of transactions in progress */

	size_t wl_circ_size;	/* r:	Number of bytes in buffer of log */
	size_t wl_circ_off;	/* r:	Number of bytes reserved at start */

	size_t wl_bufcount_max;	/* r:	Number of buffers reserved for log */
	size_t wl_bufbytes_max;	/* r:	Number of buf bytes reserved for log */

	off_t wl_head;		/* l:	Byte offset of log head */
	off_t wl_tail;		/* l:	Byte offset of log tail */
	/*
	 * head == tail == 0 means log is empty.
	 * head == tail != 0 means log is full.
	 * See the assertions in wapbl_advance() for other boundary
	 * conditions.  Only truncate moves the tail, except when flush
	 * sets it to wl_header_size.  Only flush moves the head, except
	 * when truncate sets it to 0.
	 */
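	/*
	 * Illustrative example (hypothetical numbers): with
	 * wl_circ_off = 1024 and wl_circ_size = 8192, an empty log has
	 * head == tail == 0.  Flushing a 2048-byte transaction moves the
	 * head to 1024 + 2048 = 3072 and pulls the tail onto the data
	 * area at 1024; once that transaction's buffers hit the disk,
	 * truncate advances the tail to meet the head and both reset to 0.
	 */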

	struct wapbl_wc_header *wl_wc_header;	/* l	*/
	void *wl_wc_scratch;	/* l:	scratch space (XXX: why is this needed?) */

	kmutex_t wl_mtx;	/* u:	short-term lock */
	krwlock_t wl_rwlock;	/* u:	File system transaction lock */

	/*
	 * Must be held while accessing
	 * wl_bufcount, wl_bufs, or the head and tail offsets.
	 */

	/*
	 * Callback called from within the flush routine to flush any extra
	 * bits.  Note that flush may be skipped without calling this if
	 * there are no outstanding buffers in the transaction.
	 */
#ifdef _KERNEL
	wapbl_flush_fn_t wl_flush;	/* r	*/
	wapbl_flush_fn_t wl_flush_abort;/* r	*/
#endif

	size_t wl_bufbytes;	/* m:	Byte count of pages in wl_bufs */
	size_t wl_bufcount;	/* m:	Count of buffers in wl_bufs */
	size_t wl_bcount;	/* m:	Total bcount of wl_bufs */

	LIST_HEAD(, buf) wl_bufs; /* m:	Buffers in current transaction */

	kcondvar_t wl_reclaimable_cv;	/* m (obviously) */
	size_t wl_reclaimable_bytes; /* m:	Amount of space available for
						reclamation by truncate */
	int wl_error_count;	/* m:	# of wl_entries with errors */
	size_t wl_reserved_bytes; /* never truncate log smaller than this */

#ifdef WAPBL_DEBUG_BUFBYTES
	size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */
#endif

	daddr_t *wl_deallocblks;/* lm:	address of block */
	int *wl_dealloclens;	/* lm:	size of block */
	int wl_dealloccnt;	/* lm:	total count */
	int wl_dealloclim;	/* l:	max count */

	/* hashtable of inode numbers for allocated but unlinked inodes */
	/* synch ??? */
	LIST_HEAD(wapbl_ino_head, wapbl_ino) *wl_inohash;
	u_long wl_inohashmask;
	int wl_inohashcnt;

	SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction
						   accounting */
};

#ifdef WAPBL_DEBUG_PRINT
int wapbl_debug_print = WAPBL_DEBUG_PRINT;
#endif

/****************************************************************/
#ifdef _KERNEL

#ifdef WAPBL_DEBUG
struct wapbl *wapbl_debug_wl;
#endif

static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail);
static int wapbl_write_blocks(struct wapbl *wl, off_t *offp);
static int wapbl_write_revocations(struct wapbl *wl, off_t *offp);
static int wapbl_write_inodes(struct wapbl *wl, off_t *offp);
#endif /* _KERNEL */

static int wapbl_replay_process(struct wapbl_replay *wr, off_t, off_t);

static inline size_t wapbl_space_free(size_t avail, off_t head,
	off_t tail);
static inline size_t wapbl_space_used(size_t avail, off_t head,
	off_t tail);

#ifdef _KERNEL

static struct pool wapbl_entry_pool;

#define	WAPBL_INODETRK_SIZE 83
static int wapbl_ino_pool_refcount;
static struct pool wapbl_ino_pool;
struct wapbl_ino {
	LIST_ENTRY(wapbl_ino) wi_hash;
	ino_t wi_ino;
	mode_t wi_mode;
};

static void wapbl_inodetrk_init(struct wapbl *wl, u_int size);
static void wapbl_inodetrk_free(struct wapbl *wl);
static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino);

static size_t wapbl_transaction_len(struct wapbl *wl);
static inline size_t wapbl_transaction_inodes_len(struct wapbl *wl);

#if 0
int wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
#endif

static int wapbl_replay_isopen1(struct wapbl_replay *);

/*
 * This is useful for debugging.  If set, the log will
 * only be truncated when necessary.
 */
int wapbl_lazy_truncate = 0;

struct wapbl_ops wapbl_ops = {
	.wo_wapbl_discard	= wapbl_discard,
	.wo_wapbl_replay_isopen	= wapbl_replay_isopen1,
	.wo_wapbl_replay_can_read = wapbl_replay_can_read,
	.wo_wapbl_replay_read	= wapbl_replay_read,
	.wo_wapbl_add_buf	= wapbl_add_buf,
	.wo_wapbl_remove_buf	= wapbl_remove_buf,
	.wo_wapbl_resize_buf	= wapbl_resize_buf,
	.wo_wapbl_begin		= wapbl_begin,
	.wo_wapbl_end		= wapbl_end,
	.wo_wapbl_junlock_assert= wapbl_junlock_assert,

	/* XXX: the following is only used to say "this is a wapbl buf" */
	.wo_wapbl_biodone	= wapbl_biodone,
};

static int
wapbl_sysctl_init(void)
{
	int rv;
	const struct sysctlnode *rnode, *cnode;

	wapbl_sysctl = NULL;

	rv = sysctl_createv(&wapbl_sysctl, 0, NULL, &rnode,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "vfs", NULL,
		       NULL, 0, NULL, 0,
		       CTL_VFS, CTL_EOL);
	if (rv)
		return rv;

	rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &rnode,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "wapbl",
		       SYSCTL_DESCR("WAPBL journaling options"),
		       NULL, 0, NULL, 0,
		       CTL_CREATE, CTL_EOL);
	if (rv)
		return rv;

	rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "flush_disk_cache",
		       SYSCTL_DESCR("flush disk cache"),
		       NULL, 0, &wapbl_flush_disk_cache, 0,
		       CTL_CREATE, CTL_EOL);
	if (rv)
		return rv;

	rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "verbose_commit",
		       SYSCTL_DESCR("show time and size of wapbl log commits"),
		       NULL, 0, &wapbl_verbose_commit, 0,
		       CTL_CREATE, CTL_EOL);
	return rv;
}

static void
wapbl_init(void)
{

	pool_init(&wapbl_entry_pool, sizeof(struct wapbl_entry), 0, 0, 0,
	    "wapblentrypl", &pool_allocator_kmem, IPL_VM);

	wapbl_sysctl_init();
}

#ifdef notyet
static int
wapbl_fini(bool interface)
{

	if (wapbl_sysctl != NULL)
		sysctl_teardown(&wapbl_sysctl);

	pool_destroy(&wapbl_entry_pool);

	return 0;
}
#endif

static int
wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr)
{
	int error, i;

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_start: reusing log with %d inodes\n", wr->wr_inodescnt));

	/*
	 * It's only valid to reuse the replay log if it's
	 * the same as the new log we just opened.
	 */
	KDASSERT(!wapbl_replay_isopen(wr));
	KASSERT(wl->wl_devvp->v_type == VBLK);
	KASSERT(wr->wr_devvp->v_type == VBLK);
	KASSERT(wl->wl_devvp->v_rdev == wr->wr_devvp->v_rdev);
	KASSERT(wl->wl_logpbn == wr->wr_logpbn);
	KASSERT(wl->wl_circ_size == wr->wr_circ_size);
	KASSERT(wl->wl_circ_off == wr->wr_circ_off);
	KASSERT(wl->wl_log_dev_bshift == wr->wr_log_dev_bshift);
	KASSERT(wl->wl_fs_dev_bshift == wr->wr_fs_dev_bshift);

	wl->wl_wc_header->wc_generation = wr->wr_generation + 1;

	for (i = 0; i < wr->wr_inodescnt; i++)
		wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber,
		    wr->wr_inodes[i].wr_imode);

	/* Make sure new transaction won't overwrite old inodes list */
	KDASSERT(wapbl_transaction_len(wl) <=
	    wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead,
	    wr->wr_inodestail));

	wl->wl_head = wl->wl_tail = wr->wr_inodeshead;
	wl->wl_reclaimable_bytes = wl->wl_reserved_bytes =
	    wapbl_transaction_len(wl);

	error = wapbl_write_inodes(wl, &wl->wl_head);
	if (error)
		return error;

	KASSERT(wl->wl_head != wl->wl_tail);
	KASSERT(wl->wl_head != 0);

	return 0;
}

int
wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp,
	daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr,
	wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn)
{
	struct wapbl *wl;
	struct vnode *devvp;
	daddr_t logpbn;
	int error;
	int log_dev_bshift = ilog2(blksize);
	int fs_dev_bshift = log_dev_bshift;
	int run;

	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64
	    " count=%zu blksize=%zu\n", vp, off, count, blksize));

	if (log_dev_bshift > fs_dev_bshift) {
		WAPBL_PRINTF(WAPBL_PRINT_OPEN,
			("wapbl: log device's block size cannot be larger "
			 "than filesystem's\n"));
		/*
		 * Not currently implemented, although it could be if
		 * needed someday.
		 */
		return ENOSYS;
	}

	if (off < 0)
		return EINVAL;

	if (blksize < DEV_BSIZE)
		return EINVAL;
	if (blksize % DEV_BSIZE)
		return EINVAL;

	/* XXXTODO: verify that the full load is writable */

	/*
	 * XXX check for minimum log size
	 * minimum is governed by minimum amount of space
	 * to complete a transaction. (probably truncate)
	 */
	/* XXX for now pick something minimal */
	if ((count * blksize) < MAXPHYS) {
		return ENOSPC;
	}

	if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) {
		return error;
	}

	wl = wapbl_calloc(1, sizeof(*wl));
	rw_init(&wl->wl_rwlock);
	mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&wl->wl_reclaimable_cv, "wapblrec");
	LIST_INIT(&wl->wl_bufs);
	SIMPLEQ_INIT(&wl->wl_entries);

	wl->wl_logvp = vp;
	wl->wl_devvp = devvp;
	wl->wl_mount = mp;
	wl->wl_logpbn = logpbn;
	wl->wl_log_dev_bshift = log_dev_bshift;
	wl->wl_fs_dev_bshift = fs_dev_bshift;

	wl->wl_flush = flushfn;
	wl->wl_flush_abort = flushabortfn;

	/* Reserve two log device blocks for the commit headers */
	wl->wl_circ_off = 2<<wl->wl_log_dev_bshift;
	wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off);
	/* truncate the log usage to a multiple of the log device block size */
	wl->wl_circ_size >>= wl->wl_log_dev_bshift;
	wl->wl_circ_size <<= wl->wl_log_dev_bshift;
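	/*
	 * Example with hypothetical sizes: for 512-byte log device blocks
	 * (wl_log_dev_bshift == 9), wl_circ_off = 2 << 9 = 1024 bytes,
	 * i.e. room for the two alternating commit headers written by
	 * wapbl_write_commit().
	 */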

	/*
	 * wl_bufbytes_max limits the size of the in memory transaction space.
	 * - Since buffers are allocated and accounted for in units of
	 *   PAGE_SIZE it is required to be a multiple of PAGE_SIZE
	 *   (i.e. 1<<PAGE_SHIFT)
	 * - Since the log device has to be written in units of
	 *   1<<wl_log_dev_bshift it is required to be a multiple of
	 *   1<<wl_log_dev_bshift.
	 * - Since filesystem will provide data in units of 1<<wl_fs_dev_bshift,
	 *   it is convenient to be a multiple of 1<<wl_fs_dev_bshift.
	 * Therefore it must be a multiple of the least common multiple of those
	 * three quantities.  Fortunately, all of those quantities are
	 * guaranteed to be a power of two, and the least common multiple of
	 * a set of numbers which are all powers of two is simply the maximum
	 * of those numbers.  Finally, the maximum logarithm of a power of two
	 * is the same as the log of the maximum power of two.  So we can do
	 * the following operations to size wl_bufbytes_max:
	 */
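	/*
	 * Worked example (hypothetical sizes): with PAGE_SIZE = 4096
	 * (PAGE_SHIFT = 12) and 512-byte device blocks (both bshifts 9),
	 * the least common multiple is max(4096, 512, 512) = 4096, so the
	 * shift pairs below round wl_bufbytes_max down to a multiple of
	 * 4096.
	 */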

	/* XXX fix actual number of pages reserved per filesystem. */
	wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2);

	/* Round wl_bufbytes_max down to satisfy the largest power-of-two constraint */
	wl->wl_bufbytes_max >>= PAGE_SHIFT;
	wl->wl_bufbytes_max <<= PAGE_SHIFT;
	wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift;

	/* XXX maybe use filesystem fragment size instead of 1024 */
	/* XXX fix actual number of buffers reserved per filesystem. */
	wl->wl_bufcount_max = (nbuf / 2) * 1024;

	/* XXX tie this into resource estimation */
	wl->wl_dealloclim = wl->wl_bufbytes_max / mp->mnt_stat.f_bsize / 2;

	wl->wl_deallocblks = wapbl_alloc(sizeof(*wl->wl_deallocblks) *
	    wl->wl_dealloclim);
	wl->wl_dealloclens = wapbl_alloc(sizeof(*wl->wl_dealloclens) *
	    wl->wl_dealloclim);

	wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);

	/* Initialize the commit header */
	{
		struct wapbl_wc_header *wc;
		size_t len = 1 << wl->wl_log_dev_bshift;
		wc = wapbl_calloc(1, len);
		wc->wc_type = WAPBL_WC_HEADER;
		wc->wc_len = len;
		wc->wc_circ_off = wl->wl_circ_off;
		wc->wc_circ_size = wl->wl_circ_size;
		/* XXX wc->wc_fsid */
		wc->wc_log_dev_bshift = wl->wl_log_dev_bshift;
		wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift;
		wl->wl_wc_header = wc;
		wl->wl_wc_scratch = wapbl_alloc(len);
	}

	/*
	 * if there was an existing set of unlinked but
	 * allocated inodes, preserve it in the new
	 * log.
	 */
	if (wr && wr->wr_inodescnt) {
		error = wapbl_start_flush_inodes(wl, wr);
		if (error)
			goto errout;
	}

	error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail);
	if (error) {
		goto errout;
	}

	*wlp = wl;
#if defined(WAPBL_DEBUG)
	wapbl_debug_wl = wl;
#endif

	return 0;
 errout:
	wapbl_discard(wl);
	wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_deallocblks,
	    sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim);
	wapbl_free(wl->wl_dealloclens,
	    sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim);
	wapbl_inodetrk_free(wl);
	wapbl_free(wl, sizeof(*wl));

	return error;
}

/*
 * Like wapbl_flush, only discards the transaction
 * completely
 */

void
wapbl_discard(struct wapbl *wl)
{
	struct wapbl_entry *we;
	struct buf *bp;
	int i;

	/*
	 * XXX we may consider using upgrade here
	 * if we want to call flush from inside a transaction
	 */
	rw_enter(&wl->wl_rwlock, RW_WRITER);
	wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
	    wl->wl_dealloccnt);

#ifdef WAPBL_DEBUG_PRINT
	{
		pid_t pid = -1;
		lwpid_t lid = -1;
		if (curproc)
			pid = curproc->p_pid;
		if (curlwp)
			lid = curlwp->l_lid;
#ifdef WAPBL_DEBUG_BUFBYTES
		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
		    ("wapbl_discard: thread %d.%d discarding "
		    "transaction\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %u, reclaimable=%zu reserved=%zu "
		    "unsynced=%zu\n",
		    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
		    wl->wl_bcount, wl->wl_dealloccnt,
		    wl->wl_inohashcnt, wl->wl_error_count,
		    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
		    wl->wl_unsynced_bufbytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d, unsynced = %zu\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error, we->we_unsynced_bufbytes));
		}
#else /* !WAPBL_DEBUG_BUFBYTES */
		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
		    ("wapbl_discard: thread %d.%d discarding transaction\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %u, reclaimable=%zu reserved=%zu\n",
		    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
		    wl->wl_bcount, wl->wl_dealloccnt,
		    wl->wl_inohashcnt, wl->wl_error_count,
		    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error));
		}
#endif /* !WAPBL_DEBUG_BUFBYTES */
	}
#endif /* WAPBL_DEBUG_PRINT */

	for (i = 0; i <= wl->wl_inohashmask; i++) {
		struct wapbl_ino_head *wih;
		struct wapbl_ino *wi;

		wih = &wl->wl_inohash[i];
		while ((wi = LIST_FIRST(wih)) != NULL) {
			LIST_REMOVE(wi, wi_hash);
			pool_put(&wapbl_ino_pool, wi);
			KASSERT(wl->wl_inohashcnt > 0);
			wl->wl_inohashcnt--;
		}
	}

	/*
	 * clean buffer list
	 */
	mutex_enter(&bufcache_lock);
	mutex_enter(&wl->wl_mtx);
	while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
		if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) {
			/*
			 * The buffer will be unlocked and
			 * removed from the transaction in brelse
			 */
			mutex_exit(&wl->wl_mtx);
			brelsel(bp, 0);
			mutex_enter(&wl->wl_mtx);
		}
	}
	mutex_exit(&wl->wl_mtx);
	mutex_exit(&bufcache_lock);

	/*
	 * Remove references to this wl from wl_entries; free any which
	 * no longer have buffers.  The others will be freed in
	 * wapbl_biodone() when they no longer have any buffers.
	 */
	while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) {
		SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
		/* XXX should we be accumulating wl_error_count
		 * and increasing reclaimable bytes ? */
		we->we_wapbl = NULL;
		if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
			KASSERT(we->we_unsynced_bufbytes == 0);
#endif
			pool_put(&wapbl_entry_pool, we);
		}
	}

	/* Discard list of deallocs */
	wl->wl_dealloccnt = 0;
	/* XXX should we clear wl_reserved_bytes? */

	KASSERT(wl->wl_bufbytes == 0);
	KASSERT(wl->wl_bcount == 0);
	KASSERT(wl->wl_bufcount == 0);
	KASSERT(LIST_EMPTY(&wl->wl_bufs));
	KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
	KASSERT(wl->wl_inohashcnt == 0);

	rw_exit(&wl->wl_rwlock);
}

int
wapbl_stop(struct wapbl *wl, int force)
{
	struct vnode *vp;
	int error;

	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n"));
	error = wapbl_flush(wl, 1);
	if (error) {
		if (force)
			wapbl_discard(wl);
		else
			return error;
	}

	/* Unlinked inodes persist after a flush */
	if (wl->wl_inohashcnt) {
		if (force) {
			wapbl_discard(wl);
		} else {
			return EBUSY;
		}
	}

	KASSERT(wl->wl_bufbytes == 0);
	KASSERT(wl->wl_bcount == 0);
	KASSERT(wl->wl_bufcount == 0);
	KASSERT(LIST_EMPTY(&wl->wl_bufs));
	KASSERT(wl->wl_dealloccnt == 0);
	KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
	KASSERT(wl->wl_inohashcnt == 0);

	vp = wl->wl_logvp;

	wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_deallocblks,
	    sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim);
	wapbl_free(wl->wl_dealloclens,
	    sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim);
	wapbl_inodetrk_free(wl);

	cv_destroy(&wl->wl_reclaimable_cv);
	mutex_destroy(&wl->wl_mtx);
	rw_destroy(&wl->wl_rwlock);
	wapbl_free(wl, sizeof(*wl));

	return 0;
}

static int
wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
{
	struct pstats *pstats = curlwp->l_proc->p_stats;
	struct buf *bp;
	int error;

	KASSERT((flags & ~(B_WRITE | B_READ)) == 0);
	KASSERT(devvp->v_type == VBLK);

	if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
		mutex_enter(devvp->v_interlock);
		devvp->v_numoutput++;
		mutex_exit(devvp->v_interlock);
		pstats->p_ru.ru_oublock++;
	} else {
		pstats->p_ru.ru_inblock++;
	}

	bp = getiobuf(devvp, true);
	bp->b_flags = flags;
	bp->b_cflags = BC_BUSY; /* silly & dubious */
	bp->b_dev = devvp->v_rdev;
	bp->b_data = data;
	bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
	bp->b_blkno = pbn;

	WAPBL_PRINTF(WAPBL_PRINT_IO,
	    ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%"PRIx64"\n",
	    BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount,
	    bp->b_blkno, bp->b_dev));

	VOP_STRATEGY(devvp, bp);

	error = biowait(bp);
	putiobuf(bp);

	if (error) {
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_doio: %s %zu bytes at block %" PRId64
		    " on dev 0x%"PRIx64" failed with error %d\n",
		    (((flags & (B_WRITE | B_READ)) == B_WRITE) ?
		     "write" : "read"),
		    len, pbn, devvp->v_rdev, error));
	}

	return error;
}

int
wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

	return wapbl_doio(data, len, devvp, pbn, B_WRITE);
}

int
wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

	return wapbl_doio(data, len, devvp, pbn, B_READ);
}

/*
 * Off is a byte offset; returns the new offset for the next write.
 * Handles log wraparound.
 */
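/*
 * For instance (hypothetical numbers): with wl_circ_off = 1024,
 * wl_circ_size = 8192 and *offp = 8704, a 1024-byte write is split into
 * slen = 1024 + 8192 - 8704 = 512 bytes at offset 8704 followed by the
 * remaining 512 bytes at offset 1024, leaving *offp = 1536.
 */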
static int
wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp)
{
	size_t slen;
	off_t off = *offp;
	int error;
	daddr_t pbn;

	KDASSERT(((len >> wl->wl_log_dev_bshift) <<
	    wl->wl_log_dev_bshift) == len);

	if (off < wl->wl_circ_off)
		off = wl->wl_circ_off;
	slen = wl->wl_circ_off + wl->wl_circ_size - off;
	if (slen < len) {
		pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
#ifdef _KERNEL
		pbn = btodb(pbn << wl->wl_log_dev_bshift);
#endif
		error = wapbl_write(data, slen, wl->wl_devvp, pbn);
		if (error)
			return error;
		data = (uint8_t *)data + slen;
		len -= slen;
		off = wl->wl_circ_off;
	}
	pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
#ifdef _KERNEL
	pbn = btodb(pbn << wl->wl_log_dev_bshift);
#endif
	error = wapbl_write(data, len, wl->wl_devvp, pbn);
	if (error)
		return error;
	off += len;
	if (off >= wl->wl_circ_off + wl->wl_circ_size)
		off = wl->wl_circ_off;
	*offp = off;
	return 0;
}

/****************************************************************/

int
wapbl_begin(struct wapbl *wl, const char *file, int line)
{
	int doflush;
	unsigned lockcount;

	KDASSERT(wl);

	/*
	 * XXX this needs to be made much more sophisticated.
	 * perhaps each wapbl_begin could reserve a specified
	 * number of buffers and bytes.
	 */
	mutex_enter(&wl->wl_mtx);
	lockcount = wl->wl_lock_count;
	doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) >
		   wl->wl_bufbytes_max / 2) ||
		  ((wl->wl_bufcount + (lockcount * 10)) >
		   wl->wl_bufcount_max / 2) ||
		  (wapbl_transaction_len(wl) > wl->wl_circ_size / 2) ||
		  (wl->wl_dealloccnt >= (wl->wl_dealloclim / 2));
	mutex_exit(&wl->wl_mtx);

	if (doflush) {
		WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		    ("force flush lockcnt=%d bufbytes=%zu "
		    "(max=%zu) bufcount=%zu (max=%zu) "
		    "dealloccnt %d (lim=%d)\n",
		    lockcount, wl->wl_bufbytes,
		    wl->wl_bufbytes_max, wl->wl_bufcount,
		    wl->wl_bufcount_max,
		    wl->wl_dealloccnt, wl->wl_dealloclim));
	}

	if (doflush) {
		int error = wapbl_flush(wl, 0);
		if (error)
			return error;
	}

	rw_enter(&wl->wl_rwlock, RW_READER);
	mutex_enter(&wl->wl_mtx);
	wl->wl_lock_count++;
	mutex_exit(&wl->wl_mtx);

#if defined(WAPBL_DEBUG_PRINT)
	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
	    ("wapbl_begin thread %d.%d with bufcount=%zu "
	    "bufbytes=%zu bcount=%zu at %s:%d\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount, file, line));
#endif

	return 0;
}

void
wapbl_end(struct wapbl *wl)
{

#if defined(WAPBL_DEBUG_PRINT)
	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
	    ("wapbl_end thread %d.%d with bufcount=%zu "
	    "bufbytes=%zu bcount=%zu\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount));
#endif

#ifdef DIAGNOSTIC
	size_t flushsize = wapbl_transaction_len(wl);
	if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
		/*
		 * XXX this could be handled more gracefully, perhaps place
		 * only a partial transaction in the log and allow the
		 * remaining to flush without the protection of the journal.
		 */
		panic("wapbl_end: current transaction too big to flush\n");
	}
#endif

	mutex_enter(&wl->wl_mtx);
	KASSERT(wl->wl_lock_count > 0);
	wl->wl_lock_count--;
	mutex_exit(&wl->wl_mtx);

	rw_exit(&wl->wl_rwlock);
}

void
wapbl_add_buf(struct wapbl *wl, struct buf * bp)
{

	KASSERT(bp->b_cflags & BC_BUSY);
	KASSERT(bp->b_vp);

	wapbl_jlock_assert(wl);

#if 0
	/*
	 * XXX this might be an issue for swapfiles.
	 * see uvm_swap.c:1702
	 *
	 * XXX2 why require it then? leap of semantics?
	 */
	KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
#endif

	mutex_enter(&wl->wl_mtx);
	if (bp->b_flags & B_LOCKED) {
		LIST_REMOVE(bp, b_wapbllist);
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER2,
		   ("wapbl_add_buf thread %d.%d re-adding buf %p "
		    "with %d bytes %d bcount\n",
		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
		    bp->b_bcount));
	} else {
		/* unlocked but dirty buffers shouldn't exist */
		KASSERT(!(bp->b_oflags & BO_DELWRI));
		wl->wl_bufbytes += bp->b_bufsize;
		wl->wl_bcount += bp->b_bcount;
		wl->wl_bufcount++;
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
		   ("wapbl_add_buf thread %d.%d adding buf %p "
		    "with %d bytes %d bcount\n",
		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
		    bp->b_bcount));
	}
	LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist);
	mutex_exit(&wl->wl_mtx);

	bp->b_flags |= B_LOCKED;
}

static void
wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp)
{

	KASSERT(mutex_owned(&wl->wl_mtx));
	KASSERT(bp->b_cflags & BC_BUSY);
	wapbl_jlock_assert(wl);

#if 0
	/*
	 * XXX this might be an issue for swapfiles.
	 * see uvm_swap.c:1725
	 *
	 * XXXdeux: see above
	 */
	KASSERT((bp->b_flags & BC_NOCACHE) == 0);
#endif
	KASSERT(bp->b_flags & B_LOCKED);

	WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
	   ("wapbl_remove_buf thread %d.%d removing buf %p with "
	    "%d bytes %d bcount\n",
	    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount));

	KASSERT(wl->wl_bufbytes >= bp->b_bufsize);
	wl->wl_bufbytes -= bp->b_bufsize;
	KASSERT(wl->wl_bcount >= bp->b_bcount);
	wl->wl_bcount -= bp->b_bcount;
	KASSERT(wl->wl_bufcount > 0);
	wl->wl_bufcount--;
	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
	LIST_REMOVE(bp, b_wapbllist);

	bp->b_flags &= ~B_LOCKED;
}

/* called from brelsel() in vfs_bio among other places */
void
wapbl_remove_buf(struct wapbl * wl, struct buf *bp)
{

	mutex_enter(&wl->wl_mtx);
	wapbl_remove_buf_locked(wl, bp);
	mutex_exit(&wl->wl_mtx);
}

void
wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt)
{

	KASSERT(bp->b_cflags & BC_BUSY);

	/*
	 * XXX: why does this depend on B_LOCKED?  otherwise the buf
	 * is not for a transaction?  if so, why is this called in the
	 * first place?
	 */
	if (bp->b_flags & B_LOCKED) {
		mutex_enter(&wl->wl_mtx);
		wl->wl_bufbytes += bp->b_bufsize - oldsz;
		wl->wl_bcount += bp->b_bcount - oldcnt;
		mutex_exit(&wl->wl_mtx);
	}
}

#endif /* _KERNEL */

/****************************************************************/
/* Some utility inlines */

/* This is used to advance the pointer at old to the new value at old + delta */
static inline off_t
wapbl_advance(size_t size, size_t off, off_t old, size_t delta)
{
	off_t new;

	/* Define acceptable ranges for inputs. */
	KASSERT(delta <= (size_t)size);
	KASSERT((old == 0) || ((size_t)old >= off));
	KASSERT(old < (off_t)(size + off));

	if ((old == 0) && (delta != 0))
		new = off + delta;
	else if ((old + delta) < (size + off))
		new = old + delta;
	else
		new = (old + delta) - size;

	/* Note some interesting axioms */
	KASSERT((delta != 0) || (new == old));
	KASSERT((delta == 0) || (new != 0));
	KASSERT((delta != (size)) || (new == old));

	/* Define acceptable ranges for output. */
	KASSERT((new == 0) || ((size_t)new >= off));
	KASSERT((size_t)new < (size + off));
	return new;
}
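/*
 * Example (hypothetical numbers): with size = 8192 and off = 1024,
 * wapbl_advance(8192, 1024, 0, 100) moves off the "empty" sentinel 0
 * and returns 1024 + 100 = 1124, while wapbl_advance(8192, 1024, 9000,
 * 300) wraps around to (9000 + 300) - 8192 = 1108.
 */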

static inline size_t
wapbl_space_used(size_t avail, off_t head, off_t tail)
{

	if (tail == 0) {
		KASSERT(head == 0);
		return 0;
	}
	return ((head + (avail - 1) - tail) % avail) + 1;
}

static inline size_t
wapbl_space_free(size_t avail, off_t head, off_t tail)
{

	return avail - wapbl_space_used(avail, head, tail);
}
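/*
 * Worked example (hypothetical numbers): with avail = 8192, head = 2048
 * and tail = 7168, wapbl_space_used() is
 * ((2048 + 8191 - 7168) % 8192) + 1 = 3072 bytes (the wrapped region
 * from tail to head), and wapbl_space_free() is the remaining
 * 8192 - 3072 = 5120 bytes.
 */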

static inline void
wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp,
		   off_t *tailp)
{
	off_t head = *headp;
	off_t tail = *tailp;

	KASSERT(delta <= wapbl_space_free(size, head, tail));
	head = wapbl_advance(size, off, head, delta);
	if ((tail == 0) && (head != 0))
		tail = off;
	*headp = head;
	*tailp = tail;
}

static inline void
wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp,
		   off_t *tailp)
{
	off_t head = *headp;
	off_t tail = *tailp;

	KASSERT(delta <= wapbl_space_used(size, head, tail));
	tail = wapbl_advance(size, off, tail, delta);
	if (head == tail) {
		head = tail = 0;
	}
	*headp = head;
	*tailp = tail;
}
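/*
 * Example (hypothetical numbers): starting from an empty log
 * (head = tail = 0) with size = 8192 and off = 1024, advancing the head
 * by 2048 yields head = 3072 and pulls the tail onto the data area at
 * 1024; subsequently advancing the tail by the full 2048 bytes in use
 * makes head == tail, which resets both to 0 (empty again).
 */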

#ifdef _KERNEL

/****************************************************************/

/*
 * Remove transactions whose buffers are completely flushed to disk.
 * Will block until at least minfree space is available.
 * Only intended to be called from inside wapbl_flush and therefore
 * does not protect against commit races with itself or with flush.
 */
static int
wapbl_truncate(struct wapbl *wl, size_t minfree, int waitonly)
{
	size_t delta;
	size_t avail;
	off_t head;
	off_t tail;
	int error = 0;

	KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes));
	KASSERT(rw_write_held(&wl->wl_rwlock));

	mutex_enter(&wl->wl_mtx);

	/*
	 * First check to see if we have to do a commit
	 * at all.
	 */
	avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail);
	if (minfree < avail) {
		mutex_exit(&wl->wl_mtx);
		return 0;
	}
	minfree -= avail;
	while ((wl->wl_error_count == 0) &&
	    (wl->wl_reclaimable_bytes < minfree)) {
		WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
		    ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd "
		    "minfree=%zd\n",
		    &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes,
		    minfree));

		cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx);
	}
	if (wl->wl_reclaimable_bytes < minfree) {
		KASSERT(wl->wl_error_count);
		/* XXX maybe get actual error from buffer instead someday? */
		error = EIO;
	}
	head = wl->wl_head;
	tail = wl->wl_tail;
	delta = wl->wl_reclaimable_bytes;

	/* If all of the entries are flushed, then be sure to keep
	 * the reserved bytes reserved.  Watch out for discarded transactions,
	 * which could leave more bytes reserved than are reclaimable.
	 */
	if (SIMPLEQ_EMPTY(&wl->wl_entries) &&
	    (delta >= wl->wl_reserved_bytes)) {
		delta -= wl->wl_reserved_bytes;
	}
	wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head,
	    &tail);
	KDASSERT(wl->wl_reserved_bytes <=
		wapbl_space_used(wl->wl_circ_size, head, tail));
	mutex_exit(&wl->wl_mtx);

	if (error)
		return error;

	if (waitonly)
		return 0;

	/*
	 * This is where head, tail and delta are unprotected
	 * from races against itself or flush.  This is ok since
	 * we only call this routine from inside flush itself.
	 *
	 * XXX: how can it race against itself when accessed only
	 * from behind the write-locked rwlock?
	 */
	error = wapbl_write_commit(wl, head, tail);
	if (error)
		return error;

	wl->wl_head = head;
	wl->wl_tail = tail;

	mutex_enter(&wl->wl_mtx);
	KASSERT(wl->wl_reclaimable_bytes >= delta);
	wl->wl_reclaimable_bytes -= delta;
	mutex_exit(&wl->wl_mtx);
	WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
	    ("wapbl_truncate thread %d.%d truncating %zu bytes\n",
	    curproc->p_pid, curlwp->l_lid, delta));

	return 0;
}

/****************************************************************/

void
wapbl_biodone(struct buf *bp)
{
	struct wapbl_entry *we = bp->b_private;
	struct wapbl *wl = we->we_wapbl;

	/*
	 * Handle possible flushing of buffers after log has been
	 * decommissioned.
	 */
	if (!wl) {
		KASSERT(we->we_bufcount > 0);
		we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
		KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize);
		we->we_unsynced_bufbytes -= bp->b_bufsize;
#endif

		if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
			KASSERT(we->we_unsynced_bufbytes == 0);
#endif
			pool_put(&wapbl_entry_pool, we);
		}

		brelse(bp, 0);
		return;
	}

#ifdef ohbother
	KDASSERT(bp->b_oflags & BO_DONE);
	KDASSERT(!(bp->b_oflags & BO_DELWRI));
	KDASSERT(bp->b_flags & B_ASYNC);
	KDASSERT(bp->b_cflags & BC_BUSY);
	KDASSERT(!(bp->b_flags & B_LOCKED));
	KDASSERT(!(bp->b_flags & B_READ));
	KDASSERT(!(bp->b_cflags & BC_INVAL));
	KDASSERT(!(bp->b_cflags & BC_NOCACHE));
#endif

	if (bp->b_error) {
#ifdef notyet /* Can't currently handle possible dirty buffer reuse */
		/*
		 * XXXpooka: interfaces not fully updated
		 * Note: this was not enabled in the original patch
		 * against netbsd4 either.  I don't know if comment
		 * above is true or not.
		 */

		/*
		 * If an error occurs, report the error and leave the
		 * buffer as a delayed write on the LRU queue.
		 * restarting the write would likely result in
		 * an error spinloop, so let it be done harmlessly
		 * by the syncer.
		 */
		bp->b_flags &= ~(B_DONE);
		simple_unlock(&bp->b_interlock);

		if (we->we_error == 0) {
			mutex_enter(&wl->wl_mtx);
			wl->wl_error_count++;
			mutex_exit(&wl->wl_mtx);
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
		we->we_error = bp->b_error;
		bp->b_error = 0;
		brelse(bp);
		return;
#else
		/* For now, just mark the log permanently errored out */

		mutex_enter(&wl->wl_mtx);
		if (wl->wl_error_count == 0) {
			wl->wl_error_count++;
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
		mutex_exit(&wl->wl_mtx);
#endif
	}

	mutex_enter(&wl->wl_mtx);

	KASSERT(we->we_bufcount > 0);
	we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
	KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize);
	we->we_unsynced_bufbytes -= bp->b_bufsize;
	KASSERT(wl->wl_unsynced_bufbytes >= bp->b_bufsize);
	wl->wl_unsynced_bufbytes -= bp->b_bufsize;
#endif

	/*
	 * If the current transaction can be reclaimed, start
	 * at the beginning and reclaim any consecutive reclaimable
	 * transactions.  If we successfully reclaim anything,
	 * then wakeup anyone waiting for the reclaim.
	 */
	if (we->we_bufcount == 0) {
		size_t delta = 0;
		int errcnt = 0;
#ifdef WAPBL_DEBUG_BUFBYTES
		KDASSERT(we->we_unsynced_bufbytes == 0);
#endif
		/*
		 * clear any posted error, since the buffer it came from
		 * has successfully been flushed by now
		 */
		while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) &&
		       (we->we_bufcount == 0)) {
			delta += we->we_reclaimable_bytes;
			if (we->we_error)
				errcnt++;
			SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
			pool_put(&wapbl_entry_pool, we);
		}

		if (delta) {
			wl->wl_reclaimable_bytes += delta;
			KASSERT(wl->wl_error_count >= errcnt);
			wl->wl_error_count -= errcnt;
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
	}

	mutex_exit(&wl->wl_mtx);
	brelse(bp, 0);
}

/*
 * Write transactions to disk + start I/O for contents
 */
int
wapbl_flush(struct wapbl *wl, int waitfor)
{
	struct buf *bp;
	struct wapbl_entry *we;
	off_t off;
	off_t head;
	off_t tail;
	size_t delta = 0;
	size_t flushsize;
	size_t reserved;
	int error = 0;

	/*
	 * Do a quick check to see if a full flush can be skipped
	 * This assumes that the flush callback does not need to be called
	 * unless there are other outstanding bufs.
	 */
	if (!waitfor) {
		size_t nbufs;
		mutex_enter(&wl->wl_mtx);	/* XXX need mutex here to
						   protect the KASSERTS */
		nbufs = wl->wl_bufcount;
		KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
		KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
		mutex_exit(&wl->wl_mtx);
		if (nbufs == 0)
			return 0;
	}

	/*
	 * XXX we may consider using LK_UPGRADE here
	 * if we want to call flush from inside a transaction
	 */
	rw_enter(&wl->wl_rwlock, RW_WRITER);
	wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
	    wl->wl_dealloccnt);

	/*
	 * Now that we are fully locked and flushed,
	 * do another check for nothing to do.
	 */
	if (wl->wl_bufcount == 0) {
		goto out;
	}

#if 0
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush thread %d.%d flushing entries with "
	     "bufcount=%zu bufbytes=%zu\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes));
#endif

	/* Calculate amount of space needed to flush */
	flushsize = wapbl_transaction_len(wl);
	if (wapbl_verbose_commit) {
		struct timespec ts;
		getnanotime(&ts);
		printf("%s: %lld.%09ld this transaction = %zu bytes\n",
		    __func__, (long long)ts.tv_sec,
		    (long)ts.tv_nsec, flushsize);
	}

	if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
		/*
		 * XXX this could be handled more gracefully, perhaps place
		 * only a partial transaction in the log and allow the
		 * remaining to flush without the protection of the journal.
		 */
		panic("wapbl_flush: current transaction too big to flush\n");
	}

	error = wapbl_truncate(wl, flushsize, 0);
	if (error)
		goto out2;

	off = wl->wl_head;
	KASSERT((off == 0) || ((off >= wl->wl_circ_off) &&
	    (off < wl->wl_circ_off + wl->wl_circ_size)));
	error = wapbl_write_blocks(wl, &off);
	if (error)
		goto out2;
	error = wapbl_write_revocations(wl, &off);
	if (error)
		goto out2;
	error = wapbl_write_inodes(wl, &off);
	if (error)
		goto out2;

	reserved = 0;
	if (wl->wl_inohashcnt)
		reserved = wapbl_transaction_inodes_len(wl);

	head = wl->wl_head;
	tail = wl->wl_tail;

	wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize,
	    &head, &tail);
#ifdef WAPBL_DEBUG
	if (head != off) {
		panic("lost head! head=%"PRIdMAX" tail=%" PRIdMAX
		      " off=%"PRIdMAX" flush=%zu\n",
		      (intmax_t)head, (intmax_t)tail, (intmax_t)off,
		      flushsize);
	}
#else
	KASSERT(head == off);
#endif

	/* Opportunistically move the tail forward if we can */
	if (!wapbl_lazy_truncate) {
		mutex_enter(&wl->wl_mtx);
		delta = wl->wl_reclaimable_bytes;
		mutex_exit(&wl->wl_mtx);
		wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta,
		    &head, &tail);
	}

	error = wapbl_write_commit(wl, head, tail);
	if (error)
		goto out2;

	we = pool_get(&wapbl_entry_pool, PR_WAITOK);

#ifdef WAPBL_DEBUG_BUFBYTES
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
		 " unsynced=%zu"
		 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
		 "inodes=%d\n",
		 curproc->p_pid, curlwp->l_lid, flushsize, delta,
		 wapbl_space_used(wl->wl_circ_size, head, tail),
		 wl->wl_unsynced_bufbytes, wl->wl_bufcount,
		 wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt,
		 wl->wl_inohashcnt));
#else
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
		 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
		 "inodes=%d\n",
		 curproc->p_pid, curlwp->l_lid, flushsize, delta,
		 wapbl_space_used(wl->wl_circ_size, head, tail),
		 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
		 wl->wl_dealloccnt, wl->wl_inohashcnt));
#endif


	mutex_enter(&bufcache_lock);
	mutex_enter(&wl->wl_mtx);

	wl->wl_reserved_bytes = reserved;
	wl->wl_head = head;
	wl->wl_tail = tail;
	KASSERT(wl->wl_reclaimable_bytes >= delta);
	wl->wl_reclaimable_bytes -= delta;
	wl->wl_dealloccnt = 0;
#ifdef WAPBL_DEBUG_BUFBYTES
	wl->wl_unsynced_bufbytes += wl->wl_bufbytes;
#endif

	we->we_wapbl = wl;
	we->we_bufcount = wl->wl_bufcount;
#ifdef WAPBL_DEBUG_BUFBYTES
	we->we_unsynced_bufbytes = wl->wl_bufbytes;
#endif
	we->we_reclaimable_bytes = flushsize;
	we->we_error = 0;
	SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries);

	/*
	 * This flushes bufs in reverse order from the order in which they
	 * were queued.  It shouldn't matter, but if we cared we could use a
	 * TAILQ instead.  XXX Note they will get put on the LRU queue when
	 * they flush, so we might actually want to change this to preserve
	 * ordering.
	 */
	while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
		if (bbusy(bp, 0, 0, &wl->wl_mtx)) {
			continue;
		}
		bp->b_iodone = wapbl_biodone;
		bp->b_private = we;
		bremfree(bp);
		wapbl_remove_buf_locked(wl, bp);
		mutex_exit(&wl->wl_mtx);
		mutex_exit(&bufcache_lock);
		bawrite(bp);
		mutex_enter(&bufcache_lock);
		mutex_enter(&wl->wl_mtx);
	}
	mutex_exit(&wl->wl_mtx);
	mutex_exit(&bufcache_lock);

#if 0
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		("wapbl_flush thread %d.%d done flushing entries...\n",
		curproc->p_pid, curlwp->l_lid));
#endif

 out:

	/*
	 * If the waitfor flag is set, don't return until everything is
	 * fully flushed and the on disk log is empty.
	 */
	if (waitfor) {
		error = wapbl_truncate(wl, wl->wl_circ_size -
			wl->wl_reserved_bytes, wapbl_lazy_truncate);
	}

 out2:
	if (error) {
		wl->wl_flush_abort(wl->wl_mount, wl->wl_deallocblks,
		    wl->wl_dealloclens, wl->wl_dealloccnt);
	}

#ifdef WAPBL_DEBUG_PRINT
	if (error) {
		pid_t pid = -1;
		lwpid_t lid = -1;
		if (curproc)
			pid = curproc->p_pid;
		if (curlwp)
			lid = curlwp->l_lid;
		mutex_enter(&wl->wl_mtx);
#ifdef WAPBL_DEBUG_BUFBYTES
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_flush: thread %d.%d aborted flush: "
		    "error = %d\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %d, reclaimable=%zu reserved=%zu "
		    "unsynced=%zu\n",
		    pid, lid, error, wl->wl_bufcount,
		    wl->wl_bufbytes, wl->wl_bcount,
		    wl->wl_dealloccnt, wl->wl_inohashcnt,
		    wl->wl_error_count, wl->wl_reclaimable_bytes,
		    wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_ERROR,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d, unsynced = %zu\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error, we->we_unsynced_bufbytes));
		}
#else
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_flush: thread %d.%d aborted flush: "
		    "error = %d\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %d, reclaimable=%zu reserved=%zu\n",
		    pid, lid, error, wl->wl_bufcount,
		    wl->wl_bufbytes, wl->wl_bcount,
		    wl->wl_dealloccnt, wl->wl_inohashcnt,
		    wl->wl_error_count, wl->wl_reclaimable_bytes,
		    wl->wl_reserved_bytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_ERROR,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d\n", we->we_bufcount,
			    we->we_reclaimable_bytes, we->we_error));
		}
#endif
		mutex_exit(&wl->wl_mtx);
	}
#endif

	rw_exit(&wl->wl_rwlock);
	return error;
}

/****************************************************************/

void
wapbl_jlock_assert(struct wapbl *wl)
{

	KASSERT(rw_lock_held(&wl->wl_rwlock));
}

void
wapbl_junlock_assert(struct wapbl *wl)
{

	KASSERT(!rw_write_held(&wl->wl_rwlock));
}

/****************************************************************/

/* locks missing */
void
wapbl_print(struct wapbl *wl,
		int full,
		void (*pr)(const char *, ...))
{
	struct buf *bp;
	struct wapbl_entry *we;
	(*pr)("wapbl %p", wl);
	(*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n",
	      wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn);
	(*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n",
	      wl->wl_circ_size, wl->wl_circ_off,
	      (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail);
	(*pr)("fs_dev_bshift = %d, log_dev_bshift = %d\n",
	      wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift);
#ifdef WAPBL_DEBUG_BUFBYTES
	(*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
	      "reserved = %zu errcnt = %d unsynced = %zu\n",
	      wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
	      wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
	      wl->wl_error_count, wl->wl_unsynced_bufbytes);
#else
	(*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
	      "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes,
	      wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
	      wl->wl_error_count);
#endif
	(*pr)("\tdealloccnt = %d, dealloclim = %d\n",
	      wl->wl_dealloccnt, wl->wl_dealloclim);
	(*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n",
	      wl->wl_inohashcnt, wl->wl_inohashmask);
	(*pr)("entries:\n");
	SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
#ifdef WAPBL_DEBUG_BUFBYTES
		(*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, "
		      "unsynced = %zu\n",
		      we->we_bufcount, we->we_reclaimable_bytes,
		      we->we_error, we->we_unsynced_bufbytes);
#else
		(*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n",
		      we->we_bufcount, we->we_reclaimable_bytes, we->we_error);
#endif
	}
	if (full) {
		int cnt = 0;
		(*pr)("bufs =");
		LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) {
			if (!LIST_NEXT(bp, b_wapbllist)) {
				(*pr)(" %p", bp);
			} else if ((++cnt % 6) == 0) {
				(*pr)(" %p,\n\t", bp);
			} else {
				(*pr)(" %p,", bp);
			}
		}
		(*pr)("\n");

		(*pr)("dealloced blks = ");
		{
			int i;
			cnt = 0;
			for (i = 0; i < wl->wl_dealloccnt; i++) {
				(*pr)(" %"PRId64":%d,",
				      wl->wl_deallocblks[i],
				      wl->wl_dealloclens[i]);
				if ((++cnt % 4) == 0) {
					(*pr)("\n\t");
				}
			}
		}
		(*pr)("\n");

		(*pr)("registered inodes = ");
		{
			int i;
			cnt = 0;
			for (i = 0; i <= wl->wl_inohashmask; i++) {
				struct wapbl_ino_head *wih;
				struct wapbl_ino *wi;

				wih = &wl->wl_inohash[i];
				LIST_FOREACH(wi, wih, wi_hash) {
					if (wi->wi_ino == 0)
						continue;
					(*pr)(" %"PRId32"/0%06"PRIo32",",
					    wi->wi_ino, wi->wi_mode);
					if ((++cnt % 4) == 0) {
						(*pr)("\n\t");
					}
				}
			}
			(*pr)("\n");
		}
	}
}

#if defined(WAPBL_DEBUG) || defined(DDB)
void
wapbl_dump(struct wapbl *wl)
{
#if defined(WAPBL_DEBUG)
	if (!wl)
		wl = wapbl_debug_wl;
#endif
	if (!wl)
		return;
	wapbl_print(wl, 1, printf);
}
#endif

/****************************************************************/

void
wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len)
{

	wapbl_jlock_assert(wl);

	mutex_enter(&wl->wl_mtx);
	/* XXX should eventually instead tie this into resource estimation */
	/*
	 * XXX this panic needs locking/mutex analysis and the
	 * ability to cope with the failure.
	 */
	/* XXX this XXX doesn't have enough XXX */
	if (__predict_false(wl->wl_dealloccnt >= wl->wl_dealloclim))
		panic("wapbl_register_deallocation: out of resources");

	wl->wl_deallocblks[wl->wl_dealloccnt] = blk;
	wl->wl_dealloclens[wl->wl_dealloccnt] = len;
	wl->wl_dealloccnt++;
	WAPBL_PRINTF(WAPBL_PRINT_ALLOC,
	    ("wapbl_register_deallocation: blk=%"PRId64" len=%d\n", blk, len));
	mutex_exit(&wl->wl_mtx);
}

/****************************************************************/

static void
wapbl_inodetrk_init(struct wapbl *wl, u_int size)
{

	wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask);
	if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) {
		pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0,
		    "wapblinopl", &pool_allocator_nointr, IPL_NONE);
	}
}

static void
wapbl_inodetrk_free(struct wapbl *wl)
{

	/* XXX this KASSERT needs locking/mutex analysis */
	KASSERT(wl->wl_inohashcnt == 0);
	hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask);
	if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) {
		pool_destroy(&wapbl_ino_pool);
	}
}

static struct wapbl_ino *
wapbl_inodetrk_get(struct wapbl *wl, ino_t ino)
{
	struct wapbl_ino_head *wih;
	struct wapbl_ino *wi;

	KASSERT(mutex_owned(&wl->wl_mtx));

	wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
	LIST_FOREACH(wi, wih, wi_hash) {
		if (ino == wi->wi_ino)
			return wi;
	}
	return 0;
}

void
wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode)
{
	struct wapbl_ino_head *wih;
	struct wapbl_ino *wi;

	wi = pool_get(&wapbl_ino_pool, PR_WAITOK);

	mutex_enter(&wl->wl_mtx);
	if (wapbl_inodetrk_get(wl, ino) == NULL) {
		wi->wi_ino = ino;
		wi->wi_mode = mode;
		wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
		LIST_INSERT_HEAD(wih, wi, wi_hash);
		wl->wl_inohashcnt++;
		WAPBL_PRINTF(WAPBL_PRINT_INODE,
		    ("wapbl_register_inode: ino=%"PRId64"\n", ino));
		mutex_exit(&wl->wl_mtx);
	} else {
		mutex_exit(&wl->wl_mtx);
		pool_put(&wapbl_ino_pool, wi);
	}
}

void
wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode)
{
	struct wapbl_ino *wi;

	mutex_enter(&wl->wl_mtx);
	wi = wapbl_inodetrk_get(wl, ino);
	if (wi) {
		WAPBL_PRINTF(WAPBL_PRINT_INODE,
		    ("wapbl_unregister_inode: ino=%"PRId64"\n", ino));
		KASSERT(wl->wl_inohashcnt > 0);
		wl->wl_inohashcnt--;
		LIST_REMOVE(wi, wi_hash);
		mutex_exit(&wl->wl_mtx);

		pool_put(&wapbl_ino_pool, wi);
	} else {
		mutex_exit(&wl->wl_mtx);
	}
}

/****************************************************************/

static inline size_t
wapbl_transaction_inodes_len(struct wapbl *wl)
{
	int blocklen = 1<<wl->wl_log_dev_bshift;
	int iph;

	/* Calculate number of inodes described in an inodelist header */
	iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
	    sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);

	KASSERT(iph > 0);

	return MAX(1, howmany(wl->wl_inohashcnt, iph)) * blocklen;
}
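/*
 * Example with hypothetical sizes: for blocklen = 512 and an 8-byte
 * header before wc_inodes, 16-byte entries give
 * iph = (512 - 8) / 16 = 31 inodes per block, so 100 registered inodes
 * need howmany(100, 31) = 4 blocks, i.e. 2048 bytes of log space.
 */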


/* Calculate amount of space a transaction will take on disk */
static size_t
wapbl_transaction_len(struct wapbl *wl)
{
	int blocklen = 1<<wl->wl_log_dev_bshift;
	size_t len;
	int bph;

	/* Calculate number of blocks described in a blocklist header */
	bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
	    sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);

	KASSERT(bph > 0);

	len = wl->wl_bcount;
	len += howmany(wl->wl_bufcount, bph) * blocklen;
	len += howmany(wl->wl_dealloccnt, bph) * blocklen;
	len += wapbl_transaction_inodes_len(wl);

	return len;
}
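/*
 * Example (hypothetical numbers): with blocklen = 512, bph = 62,
 * 100 buffers totalling wl_bcount = 409600 bytes and 10 pending
 * deallocations, len = 409600 + howmany(100, 62) * 512 +
 * howmany(10, 62) * 512 + wapbl_transaction_inodes_len(wl)
 *     = 409600 + 1024 + 512 + the inode list size.
 */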

/*
 * wapbl_cache_sync: issue DIOCCACHESYNC
 */
static int
wapbl_cache_sync(struct wapbl *wl, const char *msg)
{
	const bool verbose = wapbl_verbose_commit >= 2;
	struct bintime start_time;
	int force = 1;
	int error;

	if (!wapbl_flush_disk_cache) {
		return 0;
	}
	if (verbose) {
		bintime(&start_time);
	}
	error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force,
	    FWRITE, FSCRED);
	if (error) {
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_cache_sync: DIOCCACHESYNC on dev 0x%x "
		    "returned %d\n", wl->wl_devvp->v_rdev, error));
	}
	if (verbose) {
		struct bintime d;
		struct timespec ts;

		bintime(&d);
		bintime_sub(&d, &start_time);
		bintime2timespec(&d, &ts);
		printf("wapbl_cache_sync: %s: dev 0x%jx %ju.%09lu\n",
		    msg, (uintmax_t)wl->wl_devvp->v_rdev,
		    (uintmax_t)ts.tv_sec, ts.tv_nsec);
	}
	return error;
}
/*
 * Perform the commit operation.
 *
 * Note that the generation number increment needs to be protected
 * against racing with other invocations of wapbl_write_commit().
 * This is ok since this routine is only invoked from wapbl_flush.
 */
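/*
 * A sketch of the write ordering enforced below, assuming the device
 * honors DIOCCACHESYNC:
 *
 *	1. cache sync	- the journalled data blocks are stable
 *	2. write	- the commit header, into the slot selected
 *			  by wc_generation % 2
 *	3. cache sync	- the header is stable before the in-place
 *			  metadata writes issued after commit
 */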
1952 static int
1953 wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail)
1954 {
1955 struct wapbl_wc_header *wc = wl->wl_wc_header;
1956 struct timespec ts;
1957 int error;
1958 daddr_t pbn;
1959
	/*
	 * Flush the disk cache to ensure that the blocks we've written
	 * are actually on stable storage before the commit header.
	 *
	 * XXX Ideally we would calculate a checksum here; for now the
	 * cache flush has to do.
	 */
1966 wapbl_cache_sync(wl, "1");
1967
1968 wc->wc_head = head;
1969 wc->wc_tail = tail;
1970 wc->wc_checksum = 0;
1971 wc->wc_version = 1;
1972 getnanotime(&ts);
1973 wc->wc_time = ts.tv_sec;
1974 wc->wc_timensec = ts.tv_nsec;
1975
1976 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
1977 ("wapbl_write_commit: head = %"PRIdMAX "tail = %"PRIdMAX"\n",
1978 (intmax_t)head, (intmax_t)tail));
1979
	/*
	 * Write the commit header.
	 *
	 * XXX If the generation number is about to roll over, first zero
	 * the second commit header before trying to write both headers.
	 */
1986
1987 pbn = wl->wl_logpbn + (wc->wc_generation % 2);
1988 #ifdef _KERNEL
1989 pbn = btodb(pbn << wc->wc_log_dev_bshift);
1990 #endif
1991 error = wapbl_write(wc, wc->wc_len, wl->wl_devvp, pbn);
1992 if (error)
1993 return error;
1994
1995 /*
1996 * flush disk cache to ensure that the commit header is actually
1997 * written before meta data blocks.
1998 */
1999 wapbl_cache_sync(wl, "2");
2000
	/*
	 * If the generation number was zero, write it out a second time.
	 * This handles initialization and generation number rollover.
	 */
2005 if (wc->wc_generation++ == 0) {
2006 error = wapbl_write_commit(wl, head, tail);
		/*
		 * This panic could be removed if we did the zeroing
		 * mentioned above and were certain to roll the
		 * generation number back on failure.
		 */
2012 if (error)
2013 panic("wapbl_write_commit: error writing duplicate "
2014 "log header: %d\n", error);
2015 }
2016 return 0;
2017 }
2018
/*
 * Write the transaction's buffers to the log: for each group of up to
 * bph buffers, a blocklist record describing them followed by their
 * data, padded out to the log block size.  Returns the new log offset
 * via *offp.
 */
2020 static int
2021 wapbl_write_blocks(struct wapbl *wl, off_t *offp)
2022 {
2023 struct wapbl_wc_blocklist *wc =
2024 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
2025 int blocklen = 1<<wl->wl_log_dev_bshift;
2026 int bph;
2027 struct buf *bp;
2028 off_t off = *offp;
2029 int error;
2030 size_t padding;
2031
2032 KASSERT(rw_write_held(&wl->wl_rwlock));
2033
2034 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
2035 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
2036
2037 bp = LIST_FIRST(&wl->wl_bufs);
2038
2039 while (bp) {
2040 int cnt;
2041 struct buf *obp = bp;
2042
2043 KASSERT(bp->b_flags & B_LOCKED);
2044
2045 wc->wc_type = WAPBL_WC_BLOCKS;
2046 wc->wc_len = blocklen;
2047 wc->wc_blkcount = 0;
2048 while (bp && (wc->wc_blkcount < bph)) {
2049 /*
2050 * Make sure all the physical block numbers are up to
2051 * date. If this is not always true on a given
2052 * filesystem, then VOP_BMAP must be called. We
2053 * could call VOP_BMAP here, or else in the filesystem
			 * specific flush callback, although neither of those
			 * solutions allows us to take the vnode lock.  If a
2056 * filesystem requires that we must take the vnode lock
2057 * to call VOP_BMAP, then we can probably do it in
2058 * bwrite when the vnode lock should already be held
2059 * by the invoking code.
2060 */
2061 KASSERT((bp->b_vp->v_type == VBLK) ||
2062 (bp->b_blkno != bp->b_lblkno));
2063 KASSERT(bp->b_blkno > 0);
2064
2065 wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno;
2066 wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount;
2067 wc->wc_len += bp->b_bcount;
2068 wc->wc_blkcount++;
2069 bp = LIST_NEXT(bp, b_wapbllist);
2070 }
2071 if (wc->wc_len % blocklen != 0) {
2072 padding = blocklen - wc->wc_len % blocklen;
2073 wc->wc_len += padding;
2074 } else {
2075 padding = 0;
2076 }
2077
2078 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2079 ("wapbl_write_blocks: len = %u (padding %zu) off = %"PRIdMAX"\n",
2080 wc->wc_len, padding, (intmax_t)off));
2081
2082 error = wapbl_circ_write(wl, wc, blocklen, &off);
2083 if (error)
2084 return error;
2085 bp = obp;
2086 cnt = 0;
2087 while (bp && (cnt++ < bph)) {
2088 error = wapbl_circ_write(wl, bp->b_data,
2089 bp->b_bcount, &off);
2090 if (error)
2091 return error;
2092 bp = LIST_NEXT(bp, b_wapbllist);
2093 }
2094 if (padding) {
2095 void *zero;
2096
2097 zero = wapbl_alloc(padding);
2098 memset(zero, 0, padding);
2099 error = wapbl_circ_write(wl, zero, padding, &off);
2100 wapbl_free(zero, padding);
2101 if (error)
2102 return error;
2103 }
2104 }
2105 *offp = off;
2106 return 0;
2107 }
2108
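/*
 * Write the pending deallocations (revocations) to the log.  During
 * replay these cancel any earlier journalled writes to the same
 * blocks, so that freed and reused blocks are not clobbered with
 * stale contents.
 */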
2109 static int
2110 wapbl_write_revocations(struct wapbl *wl, off_t *offp)
2111 {
2112 struct wapbl_wc_blocklist *wc =
2113 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
2114 int i;
2115 int blocklen = 1<<wl->wl_log_dev_bshift;
2116 int bph;
2117 off_t off = *offp;
2118 int error;
2119
2120 if (wl->wl_dealloccnt == 0)
2121 return 0;
2122
2123 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
2124 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
2125
2126 i = 0;
2127 while (i < wl->wl_dealloccnt) {
2128 wc->wc_type = WAPBL_WC_REVOCATIONS;
2129 wc->wc_len = blocklen;
2130 wc->wc_blkcount = 0;
2131 while ((i < wl->wl_dealloccnt) && (wc->wc_blkcount < bph)) {
2132 wc->wc_blocks[wc->wc_blkcount].wc_daddr =
2133 wl->wl_deallocblks[i];
2134 wc->wc_blocks[wc->wc_blkcount].wc_dlen =
2135 wl->wl_dealloclens[i];
2136 wc->wc_blkcount++;
2137 i++;
2138 }
2139 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2140 ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n",
2141 wc->wc_len, (intmax_t)off));
2142 error = wapbl_circ_write(wl, wc, blocklen, &off);
2143 if (error)
2144 return error;
2145 }
2146 *offp = off;
2147 return 0;
2148 }
2149
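/*
 * Write the full set of tracked inodes to the log by walking the
 * inode hash.  The first record in each transaction has wc_clear set,
 * which tells replay to discard the inode list accumulated from
 * earlier commits.
 */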
2150 static int
2151 wapbl_write_inodes(struct wapbl *wl, off_t *offp)
2152 {
2153 struct wapbl_wc_inodelist *wc =
2154 (struct wapbl_wc_inodelist *)wl->wl_wc_scratch;
2155 int i;
2156 int blocklen = 1 << wl->wl_log_dev_bshift;
2157 off_t off = *offp;
2158 int error;
2159
2160 struct wapbl_ino_head *wih;
2161 struct wapbl_ino *wi;
2162 int iph;
2163
2164 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
2165 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
2166
2167 i = 0;
2168 wih = &wl->wl_inohash[0];
	wi = NULL;
2170 do {
2171 wc->wc_type = WAPBL_WC_INODES;
2172 wc->wc_len = blocklen;
2173 wc->wc_inocnt = 0;
2174 wc->wc_clear = (i == 0);
2175 while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) {
2176 while (!wi) {
2177 KASSERT((wih - &wl->wl_inohash[0])
2178 <= wl->wl_inohashmask);
2179 wi = LIST_FIRST(wih++);
2180 }
2181 wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino;
2182 wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode;
2183 wc->wc_inocnt++;
2184 i++;
2185 wi = LIST_NEXT(wi, wi_hash);
2186 }
2187 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2188 ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n",
2189 wc->wc_len, (intmax_t)off));
2190 error = wapbl_circ_write(wl, wc, blocklen, &off);
2191 if (error)
2192 return error;
2193 } while (i < wl->wl_inohashcnt);
2194
2195 *offp = off;
2196 return 0;
2197 }
2198
2199 #endif /* _KERNEL */
2200
2201 /****************************************************************/
2202
2203 struct wapbl_blk {
2204 LIST_ENTRY(wapbl_blk) wb_hash;
2205 daddr_t wb_blk;
2206 off_t wb_off; /* Offset of this block in the log */
2207 };
2208 #define WAPBL_BLKPOOL_MIN 83
2209
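/*
 * The replay block hash maps a physical block number to the offset of
 * its most recent image in the log; wapbl_blkhash_ins() overwrites
 * wb_off when the block is already present, so later journal entries
 * supersede earlier ones.
 */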
2210 static void
2211 wapbl_blkhash_init(struct wapbl_replay *wr, u_int size)
2212 {
2213 if (size < WAPBL_BLKPOOL_MIN)
2214 size = WAPBL_BLKPOOL_MIN;
	KASSERT(wr->wr_blkhash == NULL);
2216 #ifdef _KERNEL
2217 wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask);
2218 #else /* ! _KERNEL */
2219 /* Manually implement hashinit */
2220 {
2221 unsigned long i, hashsize;
2222 for (hashsize = 1; hashsize < size; hashsize <<= 1)
2223 continue;
2224 wr->wr_blkhash = wapbl_alloc(hashsize * sizeof(*wr->wr_blkhash));
2225 for (i = 0; i < hashsize; i++)
2226 LIST_INIT(&wr->wr_blkhash[i]);
2227 wr->wr_blkhashmask = hashsize - 1;
2228 }
2229 #endif /* ! _KERNEL */
2230 }
2231
2232 static void
2233 wapbl_blkhash_free(struct wapbl_replay *wr)
2234 {
2235 KASSERT(wr->wr_blkhashcnt == 0);
2236 #ifdef _KERNEL
2237 hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask);
2238 #else /* ! _KERNEL */
2239 wapbl_free(wr->wr_blkhash,
2240 (wr->wr_blkhashmask + 1) * sizeof(*wr->wr_blkhash));
2241 #endif /* ! _KERNEL */
2242 }
2243
2244 static struct wapbl_blk *
2245 wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk)
2246 {
2247 struct wapbl_blk_head *wbh;
2248 struct wapbl_blk *wb;
2249 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2250 LIST_FOREACH(wb, wbh, wb_hash) {
2251 if (blk == wb->wb_blk)
2252 return wb;
2253 }
	return NULL;
2255 }
2256
2257 static void
2258 wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off)
2259 {
2260 struct wapbl_blk_head *wbh;
2261 struct wapbl_blk *wb;
2262 wb = wapbl_blkhash_get(wr, blk);
2263 if (wb) {
2264 KASSERT(wb->wb_blk == blk);
2265 wb->wb_off = off;
2266 } else {
2267 wb = wapbl_alloc(sizeof(*wb));
2268 wb->wb_blk = blk;
2269 wb->wb_off = off;
2270 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2271 LIST_INSERT_HEAD(wbh, wb, wb_hash);
2272 wr->wr_blkhashcnt++;
2273 }
2274 }
2275
2276 static void
2277 wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk)
2278 {
2279 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2280 if (wb) {
2281 KASSERT(wr->wr_blkhashcnt > 0);
2282 wr->wr_blkhashcnt--;
2283 LIST_REMOVE(wb, wb_hash);
2284 wapbl_free(wb, sizeof(*wb));
2285 }
2286 }
2287
2288 static void
2289 wapbl_blkhash_clear(struct wapbl_replay *wr)
2290 {
2291 unsigned long i;
2292 for (i = 0; i <= wr->wr_blkhashmask; i++) {
2293 struct wapbl_blk *wb;
2294
2295 while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) {
2296 KASSERT(wr->wr_blkhashcnt > 0);
2297 wr->wr_blkhashcnt--;
2298 LIST_REMOVE(wb, wb_hash);
2299 wapbl_free(wb, sizeof(*wb));
2300 }
2301 }
2302 KASSERT(wr->wr_blkhashcnt == 0);
2303 }
2304
2305 /****************************************************************/
2306
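/*
 * Read len bytes of the circular log into data, starting at *offp and
 * wrapping from the end of the log back to wr_circ_off as necessary.
 * Advances *offp past the bytes read.
 */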
2307 static int
2308 wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp)
2309 {
2310 size_t slen;
2311 off_t off = *offp;
2312 int error;
2313 daddr_t pbn;
2314
2315 KASSERT(((len >> wr->wr_log_dev_bshift) <<
2316 wr->wr_log_dev_bshift) == len);
2317
2318 if (off < wr->wr_circ_off)
2319 off = wr->wr_circ_off;
2320 slen = wr->wr_circ_off + wr->wr_circ_size - off;
2321 if (slen < len) {
2322 pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
2323 #ifdef _KERNEL
2324 pbn = btodb(pbn << wr->wr_log_dev_bshift);
2325 #endif
2326 error = wapbl_read(data, slen, wr->wr_devvp, pbn);
2327 if (error)
2328 return error;
2329 data = (uint8_t *)data + slen;
2330 len -= slen;
2331 off = wr->wr_circ_off;
2332 }
2333 pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
2334 #ifdef _KERNEL
2335 pbn = btodb(pbn << wr->wr_log_dev_bshift);
2336 #endif
2337 error = wapbl_read(data, len, wr->wr_devvp, pbn);
2338 if (error)
2339 return error;
2340 off += len;
2341 if (off >= wr->wr_circ_off + wr->wr_circ_size)
2342 off = wr->wr_circ_off;
2343 *offp = off;
2344 return 0;
2345 }
2346
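/* Advance *offp by len bytes, with the same wraparound as above. */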
2347 static void
2348 wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp)
2349 {
2350 size_t slen;
2351 off_t off = *offp;
2352
2353 KASSERT(((len >> wr->wr_log_dev_bshift) <<
2354 wr->wr_log_dev_bshift) == len);
2355
2356 if (off < wr->wr_circ_off)
2357 off = wr->wr_circ_off;
2358 slen = wr->wr_circ_off + wr->wr_circ_size - off;
2359 if (slen < len) {
2360 len -= slen;
2361 off = wr->wr_circ_off;
2362 }
2363 off += len;
2364 if (off >= wr->wr_circ_off + wr->wr_circ_size)
2365 off = wr->wr_circ_off;
2366 *offp = off;
2367 }
2368
2369 /****************************************************************/
2370
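/*
 * A rough sketch of how a file system drives replay at mount time
 * (cf. ffs_wapbl_replay_start()); error handling elided:
 *
 *	wapbl_replay_start(&wr, devvp, off, count, blksize);
 *	wapbl_replay_write(wr, fsdevvp);     write saved block images home
 *	wapbl_replay_stop(wr);               release scratch and block hash
 *	wapbl_replay_free(wr);
 */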
2371 int
2372 wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp,
2373 daddr_t off, size_t count, size_t blksize)
2374 {
2375 struct wapbl_replay *wr;
2376 int error;
2377 struct vnode *devvp;
2378 daddr_t logpbn;
2379 uint8_t *scratch;
2380 struct wapbl_wc_header *wch;
2381 struct wapbl_wc_header *wch2;
2382 /* Use this until we read the actual log header */
2383 int log_dev_bshift = ilog2(blksize);
2384 size_t used;
2385 daddr_t pbn;
2386
2387 WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
2388 ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n",
2389 vp, off, count, blksize));
2390
2391 if (off < 0)
2392 return EINVAL;
2393
2394 if (blksize < DEV_BSIZE)
2395 return EINVAL;
2396 if (blksize % DEV_BSIZE)
2397 return EINVAL;
2398
2399 #ifdef _KERNEL
2400 #if 0
2401 /* XXX vp->v_size isn't reliably set for VBLK devices,
2402 * especially root. However, we might still want to verify
2403 * that the full load is readable */
2404 if ((off + count) * blksize > vp->v_size)
2405 return EINVAL;
2406 #endif
2407 if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) {
2408 return error;
2409 }
2410 #else /* ! _KERNEL */
2411 devvp = vp;
2412 logpbn = off;
2413 #endif /* ! _KERNEL */
2414
2415 scratch = wapbl_alloc(MAXBSIZE);
2416
2417 pbn = logpbn;
2418 #ifdef _KERNEL
2419 pbn = btodb(pbn << log_dev_bshift);
2420 #endif
2421 error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, pbn);
2422 if (error)
2423 goto errout;
2424
2425 wch = (struct wapbl_wc_header *)scratch;
2426 wch2 =
2427 (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift));
2428 /* XXX verify checksums and magic numbers */
2429 if (wch->wc_type != WAPBL_WC_HEADER) {
2430 printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type);
2431 error = EFTYPE;
2432 goto errout;
2433 }
2434
2435 if (wch2->wc_generation > wch->wc_generation)
2436 wch = wch2;
2437
2438 wr = wapbl_calloc(1, sizeof(*wr));
2439
2440 wr->wr_logvp = vp;
2441 wr->wr_devvp = devvp;
2442 wr->wr_logpbn = logpbn;
2443
2444 wr->wr_scratch = scratch;
2445
2446 wr->wr_log_dev_bshift = wch->wc_log_dev_bshift;
2447 wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift;
2448 wr->wr_circ_off = wch->wc_circ_off;
2449 wr->wr_circ_size = wch->wc_circ_size;
2450 wr->wr_generation = wch->wc_generation;
2451
2452 used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail);
2453
2454 WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
2455 ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64
2456 " len=%"PRId64" used=%zu\n",
2457 wch->wc_head, wch->wc_tail, wch->wc_circ_off,
2458 wch->wc_circ_size, used));
2459
2460 wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift));
2461
2462 error = wapbl_replay_process(wr, wch->wc_head, wch->wc_tail);
2463 if (error) {
2464 wapbl_replay_stop(wr);
2465 wapbl_replay_free(wr);
2466 return error;
2467 }
2468
2469 *wrp = wr;
2470 return 0;
2471
2472 errout:
2473 wapbl_free(scratch, MAXBSIZE);
2474 return error;
2475 }
2476
2477 void
2478 wapbl_replay_stop(struct wapbl_replay *wr)
2479 {
2480
2481 if (!wapbl_replay_isopen(wr))
2482 return;
2483
2484 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n"));
2485
2486 wapbl_free(wr->wr_scratch, MAXBSIZE);
2487 wr->wr_scratch = NULL;
2488
2489 wr->wr_logvp = NULL;
2490
2491 wapbl_blkhash_clear(wr);
2492 wapbl_blkhash_free(wr);
2493 }
2494
2495 void
2496 wapbl_replay_free(struct wapbl_replay *wr)
2497 {
2498
2499 KDASSERT(!wapbl_replay_isopen(wr));
2500
2501 if (wr->wr_inodes)
2502 wapbl_free(wr->wr_inodes,
2503 wr->wr_inodescnt * sizeof(wr->wr_inodes[0]));
2504 wapbl_free(wr, sizeof(*wr));
2505 }
2506
2507 #ifdef _KERNEL
2508 int
2509 wapbl_replay_isopen1(struct wapbl_replay *wr)
2510 {
2511
2512 return wapbl_replay_isopen(wr);
2513 }
2514 #endif
2515
2516 static void
2517 wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp)
2518 {
2519 struct wapbl_wc_blocklist *wc =
2520 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2521 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2522 int i, j, n;
2523
2524 for (i = 0; i < wc->wc_blkcount; i++) {
2525 /*
2526 * Enter each physical block into the hashtable independently.
2527 */
2528 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
2529 for (j = 0; j < n; j++) {
2530 wapbl_blkhash_ins(wr, wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen),
2531 *offp);
2532 wapbl_circ_advance(wr, fsblklen, offp);
2533 }
2534 }
2535 }
2536
2537 static void
2538 wapbl_replay_process_revocations(struct wapbl_replay *wr)
2539 {
2540 struct wapbl_wc_blocklist *wc =
2541 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2542 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2543 int i, j, n;
2544
2545 for (i = 0; i < wc->wc_blkcount; i++) {
2546 /*
2547 * Remove any blocks found from the hashtable.
2548 */
2549 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
2550 for (j = 0; j < n; j++)
2551 wapbl_blkhash_rem(wr, wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
2552 }
2553 }
2554
2555 static void
2556 wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff, off_t newoff)
2557 {
2558 struct wapbl_wc_inodelist *wc =
2559 (struct wapbl_wc_inodelist *)wr->wr_scratch;
2560 void *new_inodes;
2561 const size_t oldsize = wr->wr_inodescnt * sizeof(wr->wr_inodes[0]);
2562
2563 KASSERT(sizeof(wr->wr_inodes[0]) == sizeof(wc->wc_inodes[0]));
2564
	/*
	 * Keep track of where we found this so the location won't be
	 * overwritten.
	 */
2569 if (wc->wc_clear) {
2570 wr->wr_inodestail = oldoff;
2571 wr->wr_inodescnt = 0;
2572 if (wr->wr_inodes != NULL) {
2573 wapbl_free(wr->wr_inodes, oldsize);
2574 wr->wr_inodes = NULL;
2575 }
2576 }
2577 wr->wr_inodeshead = newoff;
2578 if (wc->wc_inocnt == 0)
2579 return;
2580
2581 new_inodes = wapbl_alloc((wr->wr_inodescnt + wc->wc_inocnt) *
2582 sizeof(wr->wr_inodes[0]));
2583 if (wr->wr_inodes != NULL) {
2584 memcpy(new_inodes, wr->wr_inodes, oldsize);
2585 wapbl_free(wr->wr_inodes, oldsize);
2586 }
2587 wr->wr_inodes = new_inodes;
2588 memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes,
2589 wc->wc_inocnt * sizeof(wr->wr_inodes[0]));
2590 wr->wr_inodescnt += wc->wc_inocnt;
2591 }
2592
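/*
 * Scan the log from tail to head one record at a time: block records
 * populate the block hash, revocations prune it, and inode records
 * accumulate into wr_inodes.  Each record's wc_len must land exactly
 * on the start of the following record, otherwise the log is declared
 * corrupt.
 */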
2593 static int
2594 wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail)
2595 {
2596 off_t off;
2597 int error;
2598
2599 int logblklen = 1 << wr->wr_log_dev_bshift;
2600
2601 wapbl_blkhash_clear(wr);
2602
2603 off = tail;
2604 while (off != head) {
2605 struct wapbl_wc_null *wcn;
2606 off_t saveoff = off;
2607 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
2608 if (error)
2609 goto errout;
2610 wcn = (struct wapbl_wc_null *)wr->wr_scratch;
2611 switch (wcn->wc_type) {
2612 case WAPBL_WC_BLOCKS:
2613 wapbl_replay_process_blocks(wr, &off);
2614 break;
2615
2616 case WAPBL_WC_REVOCATIONS:
2617 wapbl_replay_process_revocations(wr);
2618 break;
2619
2620 case WAPBL_WC_INODES:
2621 wapbl_replay_process_inodes(wr, saveoff, off);
2622 break;
2623
2624 default:
2625 printf("Unrecognized wapbl type: 0x%08x\n",
2626 wcn->wc_type);
2627 error = EFTYPE;
2628 goto errout;
2629 }
2630 wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
2631 if (off != saveoff) {
2632 printf("wapbl_replay: corrupted records\n");
2633 error = EFTYPE;
2634 goto errout;
2635 }
2636 }
2637 return 0;
2638
2639 errout:
2640 wapbl_blkhash_clear(wr);
2641 return error;
2642 }
2643
2644 #if 0
2645 int
2646 wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp)
2647 {
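	/*
	 * XXX "wch" below refers to the on-disk commit header, which is
	 * no longer kept in scope here; this disabled code needs to be
	 * converted to the wr_* fields before it can compile again.
	 */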
2648 off_t off;
2649 int mismatchcnt = 0;
2650 int logblklen = 1 << wr->wr_log_dev_bshift;
2651 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2652 void *scratch1 = wapbl_alloc(MAXBSIZE);
2653 void *scratch2 = wapbl_alloc(MAXBSIZE);
2654 int error = 0;
2655
2656 KDASSERT(wapbl_replay_isopen(wr));
2657
2658 off = wch->wc_tail;
2659 while (off != wch->wc_head) {
2660 struct wapbl_wc_null *wcn;
2661 #ifdef DEBUG
2662 off_t saveoff = off;
2663 #endif
2664 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
2665 if (error)
2666 goto out;
2667 wcn = (struct wapbl_wc_null *)wr->wr_scratch;
2668 switch (wcn->wc_type) {
2669 case WAPBL_WC_BLOCKS:
2670 {
2671 struct wapbl_wc_blocklist *wc =
2672 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2673 int i;
2674 for (i = 0; i < wc->wc_blkcount; i++) {
2675 int foundcnt = 0;
2676 int dirtycnt = 0;
2677 int j, n;
				/*
				 * Check each physical block against the
				 * hashtable independently.
				 */
2682 n = wc->wc_blocks[i].wc_dlen >>
2683 wch->wc_fs_dev_bshift;
2684 for (j = 0; j < n; j++) {
2685 struct wapbl_blk *wb =
2686 wapbl_blkhash_get(wr,
2687 wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
2688 if (wb && (wb->wb_off == off)) {
2689 foundcnt++;
2690 error =
2691 wapbl_circ_read(wr,
2692 scratch1, fsblklen,
2693 &off);
2694 if (error)
2695 goto out;
2696 error =
2697 wapbl_read(scratch2,
2698 fsblklen, fsdevvp,
2699 wb->wb_blk);
2700 if (error)
2701 goto out;
2702 if (memcmp(scratch1,
2703 scratch2,
2704 fsblklen)) {
2705 printf(
2706 "wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n",
2707 wb->wb_blk, (intmax_t)off);
2708 dirtycnt++;
2709 mismatchcnt++;
2710 }
2711 } else {
2712 wapbl_circ_advance(wr,
2713 fsblklen, &off);
2714 }
2715 }
2716 #if 0
			/*
			 * If all of the blocks in an entry
			 * are clean, then remove all of its
			 * blocks from the hashtable since they
			 * will never need to be replayed.
			 */
2723 if ((foundcnt != 0) &&
2724 (dirtycnt == 0)) {
2725 off = saveoff;
2726 wapbl_circ_advance(wr,
2727 logblklen, &off);
2728 for (j = 0; j < n; j++) {
2729 struct wapbl_blk *wb =
2730 wapbl_blkhash_get(wr,
2731 wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
2732 if (wb &&
2733 (wb->wb_off == off)) {
2734 wapbl_blkhash_rem(wr, wb->wb_blk);
2735 }
2736 wapbl_circ_advance(wr,
2737 fsblklen, &off);
2738 }
2739 }
2740 #endif
2741 }
2742 }
2743 break;
2744 case WAPBL_WC_REVOCATIONS:
2745 case WAPBL_WC_INODES:
2746 break;
2747 default:
2748 KASSERT(0);
2749 }
2750 #ifdef DEBUG
2751 wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
2752 KASSERT(off == saveoff);
2753 #endif
2754 }
2755 out:
2756 wapbl_free(scratch1, MAXBSIZE);
2757 wapbl_free(scratch2, MAXBSIZE);
2758 if (!error && mismatchcnt)
2759 error = EFTYPE;
2760 return error;
2761 }
2762 #endif
2763
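/*
 * Write every block image still registered in the replay hash out to
 * its home location on the file system device, completing the replay.
 */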
2764 int
2765 wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp)
2766 {
2767 struct wapbl_blk *wb;
2768 size_t i;
2769 off_t off;
2770 void *scratch;
2771 int error = 0;
2772 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2773
2774 KDASSERT(wapbl_replay_isopen(wr));
2775
2776 scratch = wapbl_alloc(MAXBSIZE);
2777
2778 for (i = 0; i <= wr->wr_blkhashmask; ++i) {
2779 LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) {
2780 off = wb->wb_off;
2781 error = wapbl_circ_read(wr, scratch, fsblklen, &off);
2782 if (error)
2783 break;
2784 error = wapbl_write(scratch, fsblklen, fsdevvp,
2785 wb->wb_blk);
2786 if (error)
2787 break;
2788 }
2789 }
2790
2791 wapbl_free(scratch, MAXBSIZE);
2792 return error;
2793 }
2794
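/*
 * Return nonzero if any block of the range [blk, blk + len) has an
 * image in the log and so must be read via wapbl_replay_read().
 */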
2795 int
2796 wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len)
2797 {
2798 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2799
2800 KDASSERT(wapbl_replay_isopen(wr));
2801 KASSERT((len % fsblklen) == 0);
2802
	while (len != 0) {
		struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
		if (wb)
			return 1;
		len -= fsblklen;
		blk++;
	}
2809 return 0;
2810 }
2811
2812 int
2813 wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len)
2814 {
2815 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2816
2817 KDASSERT(wapbl_replay_isopen(wr));
2818
2819 KASSERT((len % fsblklen) == 0);
2820
2821 while (len != 0) {
2822 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2823 if (wb) {
2824 off_t off = wb->wb_off;
2825 int error;
2826 error = wapbl_circ_read(wr, data, fsblklen, &off);
2827 if (error)
2828 return error;
2829 }
2830 data = (uint8_t *)data + fsblklen;
2831 len -= fsblklen;
2832 blk++;
2833 }
2834 return 0;
2835 }
2836
2837 #ifdef _KERNEL
2838 /*
 * This is not really a module now, but maybe on its way to
2840 * being one some day.
2841 */
2842 MODULE(MODULE_CLASS_VFS, wapbl, NULL);
2843
2844 static int
2845 wapbl_modcmd(modcmd_t cmd, void *arg)
2846 {
2847
2848 switch (cmd) {
2849 case MODULE_CMD_INIT:
2850 wapbl_init();
2851 return 0;
2852 case MODULE_CMD_FINI:
2853 #ifdef notyet
2854 return wapbl_fini(true);
2855 #endif
2856 return EOPNOTSUPP;
2857 default:
2858 return ENOTTY;
2859 }
2860 }
2861 #endif /* _KERNEL */
2862