/*	$NetBSD: vfs_wapbl.c,v 1.48.2.2 2012/06/02 11:09:35 mrg Exp $	*/

/*-
 * Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * This implements file system independent write ahead filesystem logging.
 */

#define WAPBL_INTERNAL

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.48.2.2 2012/06/02 11:09:35 mrg Exp $");

#include <sys/param.h>
#include <sys/bitops.h>

#ifdef _KERNEL
#include <sys/param.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/module.h>
#include <sys/resourcevar.h>
#include <sys/conf.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/kauth.h>
#include <sys/mutex.h>
#include <sys/atomic.h>
#include <sys/wapbl.h>
#include <sys/wapbl_replay.h>

#include <miscfs/specfs/specdev.h>

#define wapbl_alloc(s) kmem_alloc((s), KM_SLEEP)
#define wapbl_free(a, s) kmem_free((a), (s))
#define wapbl_calloc(n, s) kmem_zalloc((n)*(s), KM_SLEEP)

static struct sysctllog *wapbl_sysctl;
static int wapbl_flush_disk_cache = 1;
static int wapbl_verbose_commit = 0;

#else /* !_KERNEL */
#include <assert.h>
#include <errno.h>
#include <stdio.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

#include <sys/time.h>
#include <sys/wapbl.h>
#include <sys/wapbl_replay.h>

#define KDASSERT(x) assert(x)
#define KASSERT(x) assert(x)
#define wapbl_alloc(s) malloc(s)
#define wapbl_free(a, s) free(a)
#define wapbl_calloc(n, s) calloc((n), (s))

#endif /* !_KERNEL */

/*
 * INTERNAL DATA STRUCTURES
 */

/*
 * This structure holds per-mount log information.
 *
 * Legend:	a = atomic access only
 *		r = read-only after init
 *		l = rwlock held
 *		m = mutex held
 *		lm = rwlock held writing or mutex held
 *		u = unlocked access ok
 *		b = bufcache_lock held
 */
struct wapbl {
        struct vnode *wl_logvp; /* r: log here */
        struct vnode *wl_devvp; /* r: log on this device */
        struct mount *wl_mount; /* r: mountpoint wl is associated with */
        daddr_t wl_logpbn;      /* r: Physical block number of start of log */
        int wl_log_dev_bshift;  /* r: logarithm of device block size of log
                                   device */
        int wl_fs_dev_bshift;   /* r: logarithm of device block size of
                                   filesystem device */

        unsigned wl_lock_count; /* m: Count of transactions in progress */

        size_t wl_circ_size;    /* r: Number of bytes in buffer of log */
        size_t wl_circ_off;     /* r: Number of bytes reserved at start */

        size_t wl_bufcount_max; /* r: Number of buffers reserved for log */
        size_t wl_bufbytes_max; /* r: Number of buf bytes reserved for log */

        off_t wl_head;          /* l: Byte offset of log head */
        off_t wl_tail;          /* l: Byte offset of log tail */
        /*
         * head == tail == 0 means log is empty
         * head == tail != 0 means log is full
         * see assertions in wapbl_advance() for other boundary conditions.
         * only truncate moves the tail, except when flush sets it to
         * wl_header_size.  only flush moves the head, except when truncate
         * sets it to 0.
         */

        struct wapbl_wc_header *wl_wc_header;   /* l */
        void *wl_wc_scratch;    /* l: scratch space (XXX: why?!?) */

        kmutex_t wl_mtx;        /* u: short-term lock */
        krwlock_t wl_rwlock;    /* u: File system transaction lock */

        /*
         * Must be held while accessing
         * wl_count or wl_bufs or head or tail
         */

        /*
         * Callback called from within the flush routine to flush any extra
         * bits.  Note that flush may be skipped without calling this if
         * there are no outstanding buffers in the transaction.
         */
#ifdef _KERNEL
        wapbl_flush_fn_t wl_flush;      /* r */
        wapbl_flush_fn_t wl_flush_abort;/* r */
#endif

        size_t wl_bufbytes;     /* m: Byte count of pages in wl_bufs */
        size_t wl_bufcount;     /* m: Count of buffers in wl_bufs */
        size_t wl_bcount;       /* m: Total bcount of wl_bufs */

        LIST_HEAD(, buf) wl_bufs; /* m: Buffers in current transaction */

        kcondvar_t wl_reclaimable_cv;   /* m (obviously) */
        size_t wl_reclaimable_bytes;    /* m: Amount of space available for
                                           reclamation by truncate */
        int wl_error_count;     /* m: # of wl_entries with errors */
        size_t wl_reserved_bytes; /* never truncate log smaller than this */

#ifdef WAPBL_DEBUG_BUFBYTES
        size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */
#endif

        daddr_t *wl_deallocblks;/* lm: address of block */
        int *wl_dealloclens;    /* lm: size of block */
        int wl_dealloccnt;      /* lm: total count */
        int wl_dealloclim;      /* l: max count */

        /* hashtable of inode numbers for allocated but unlinked inodes */
        /* synch ??? */
        LIST_HEAD(wapbl_ino_head, wapbl_ino) *wl_inohash;
        u_long wl_inohashmask;
        int wl_inohashcnt;

        SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction
                                                   accounting */
};

#ifdef WAPBL_DEBUG_PRINT
int wapbl_debug_print = WAPBL_DEBUG_PRINT;
#endif

/****************************************************************/
#ifdef _KERNEL

#ifdef WAPBL_DEBUG
struct wapbl *wapbl_debug_wl;
#endif

static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail);
static int wapbl_write_blocks(struct wapbl *wl, off_t *offp);
static int wapbl_write_revocations(struct wapbl *wl, off_t *offp);
static int wapbl_write_inodes(struct wapbl *wl, off_t *offp);
#endif /* _KERNEL */

static int wapbl_replay_process(struct wapbl_replay *wr, off_t, off_t);

static inline size_t wapbl_space_free(size_t avail, off_t head,
    off_t tail);
static inline size_t wapbl_space_used(size_t avail, off_t head,
    off_t tail);

#ifdef _KERNEL

static struct pool wapbl_entry_pool;

#define WAPBL_INODETRK_SIZE 83
static int wapbl_ino_pool_refcount;
static struct pool wapbl_ino_pool;
struct wapbl_ino {
        LIST_ENTRY(wapbl_ino) wi_hash;
        ino_t wi_ino;
        mode_t wi_mode;
};

static void wapbl_inodetrk_init(struct wapbl *wl, u_int size);
static void wapbl_inodetrk_free(struct wapbl *wl);
static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino);

static size_t wapbl_transaction_len(struct wapbl *wl);
static inline size_t wapbl_transaction_inodes_len(struct wapbl *wl);

#if 0
int wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
#endif

static int wapbl_replay_isopen1(struct wapbl_replay *);

/*
 * This is useful for debugging.  If set, the log will
 * only be truncated when necessary.
 */
int wapbl_lazy_truncate = 0;

struct wapbl_ops wapbl_ops = {
        .wo_wapbl_discard = wapbl_discard,
        .wo_wapbl_replay_isopen = wapbl_replay_isopen1,
        .wo_wapbl_replay_can_read = wapbl_replay_can_read,
        .wo_wapbl_replay_read = wapbl_replay_read,
        .wo_wapbl_add_buf = wapbl_add_buf,
        .wo_wapbl_remove_buf = wapbl_remove_buf,
        .wo_wapbl_resize_buf = wapbl_resize_buf,
        .wo_wapbl_begin = wapbl_begin,
        .wo_wapbl_end = wapbl_end,
        .wo_wapbl_junlock_assert = wapbl_junlock_assert,

        /* XXX: the following is only used to say "this is a wapbl buf" */
        .wo_wapbl_biodone = wapbl_biodone,
};
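
/*
 * A minimal usage sketch (illustrative only, error handling elided):
 * a file system brackets each metadata update in a transaction, and
 * the buffers it dirties join the log via wapbl_add_buf():
 *
 *	if ((error = wapbl_begin(mp->mnt_wapbl, __FILE__, __LINE__)) != 0)
 *		return error;
 *	... modify metadata; dirty bufs are added with wapbl_add_buf() ...
 *	wapbl_end(mp->mnt_wapbl);
 *
 * (mnt_wapbl as the per-mount log handle is assumed here.)
 */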

static int
wapbl_sysctl_init(void)
{
        int rv;
        const struct sysctlnode *rnode, *cnode;

        wapbl_sysctl = NULL;

        rv = sysctl_createv(&wapbl_sysctl, 0, NULL, &rnode,
            CTLFLAG_PERMANENT,
            CTLTYPE_NODE, "vfs", NULL,
            NULL, 0, NULL, 0,
            CTL_VFS, CTL_EOL);
        if (rv)
                return rv;

        rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &rnode,
            CTLFLAG_PERMANENT,
            CTLTYPE_NODE, "wapbl",
            SYSCTL_DESCR("WAPBL journaling options"),
            NULL, 0, NULL, 0,
            CTL_CREATE, CTL_EOL);
        if (rv)
                return rv;

        rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
            CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
            CTLTYPE_INT, "flush_disk_cache",
            SYSCTL_DESCR("flush disk cache"),
            NULL, 0, &wapbl_flush_disk_cache, 0,
            CTL_CREATE, CTL_EOL);
        if (rv)
                return rv;

        rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
            CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
            CTLTYPE_INT, "verbose_commit",
            SYSCTL_DESCR("show time and size of wapbl log commits"),
            NULL, 0, &wapbl_verbose_commit, 0,
            CTL_CREATE, CTL_EOL);
        return rv;
}
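
/*
 * The nodes created above surface as vfs.wapbl.* sysctl knobs,
 * e.g. (illustrative):
 *
 *	sysctl -w vfs.wapbl.flush_disk_cache=0
 *	sysctl -w vfs.wapbl.verbose_commit=2
 */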

static void
wapbl_init(void)
{

        pool_init(&wapbl_entry_pool, sizeof(struct wapbl_entry), 0, 0, 0,
            "wapblentrypl", &pool_allocator_kmem, IPL_VM);

        wapbl_sysctl_init();
}

#ifdef notyet
static int
wapbl_fini(bool interface)
{

        if (wapbl_sysctl != NULL)
                sysctl_teardown(&wapbl_sysctl);

        pool_destroy(&wapbl_entry_pool);

        return 0;
}
#endif

static int
wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr)
{
        int error, i;

        WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
            ("wapbl_start: reusing log with %d inodes\n", wr->wr_inodescnt));

        /*
         * It's only valid to reuse the replay log if it's
         * the same as the new log we just opened.
         */
        KDASSERT(!wapbl_replay_isopen(wr));
        KASSERT(wl->wl_devvp->v_type == VBLK);
        KASSERT(wr->wr_devvp->v_type == VBLK);
        KASSERT(wl->wl_devvp->v_rdev == wr->wr_devvp->v_rdev);
        KASSERT(wl->wl_logpbn == wr->wr_logpbn);
        KASSERT(wl->wl_circ_size == wr->wr_circ_size);
        KASSERT(wl->wl_circ_off == wr->wr_circ_off);
        KASSERT(wl->wl_log_dev_bshift == wr->wr_log_dev_bshift);
        KASSERT(wl->wl_fs_dev_bshift == wr->wr_fs_dev_bshift);

        wl->wl_wc_header->wc_generation = wr->wr_generation + 1;

        for (i = 0; i < wr->wr_inodescnt; i++)
                wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber,
                    wr->wr_inodes[i].wr_imode);

        /* Make sure new transaction won't overwrite old inodes list */
        KDASSERT(wapbl_transaction_len(wl) <=
            wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead,
            wr->wr_inodestail));

        wl->wl_head = wl->wl_tail = wr->wr_inodeshead;
        wl->wl_reclaimable_bytes = wl->wl_reserved_bytes =
            wapbl_transaction_len(wl);

        error = wapbl_write_inodes(wl, &wl->wl_head);
        if (error)
                return error;

        KASSERT(wl->wl_head != wl->wl_tail);
        KASSERT(wl->wl_head != 0);

        return 0;
}

int
wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp,
    daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr,
    wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn)
{
        struct wapbl *wl;
        struct vnode *devvp;
        daddr_t logpbn;
        int error;
        int log_dev_bshift = ilog2(blksize);
        int fs_dev_bshift = log_dev_bshift;
        int run;

        WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64
            " count=%zu blksize=%zu\n", vp, off, count, blksize));

        if (log_dev_bshift > fs_dev_bshift) {
                WAPBL_PRINTF(WAPBL_PRINT_OPEN,
                    ("wapbl: log device's block size cannot be larger "
                     "than filesystem's\n"));
                /*
                 * Not currently implemented, although it could be if
                 * needed someday.
                 */
                return ENOSYS;
        }

        if (off < 0)
                return EINVAL;

        if (blksize < DEV_BSIZE)
                return EINVAL;
        if (blksize % DEV_BSIZE)
                return EINVAL;

        /* XXXTODO: verify that the full load is writable */

        /*
         * XXX check for minimum log size
         * minimum is governed by minimum amount of space
         * to complete a transaction. (probably truncate)
         */
        /* XXX for now pick something minimal */
        if ((count * blksize) < MAXPHYS) {
                return ENOSPC;
        }

        if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) {
                return error;
        }

        wl = wapbl_calloc(1, sizeof(*wl));
        rw_init(&wl->wl_rwlock);
        mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE);
        cv_init(&wl->wl_reclaimable_cv, "wapblrec");
        LIST_INIT(&wl->wl_bufs);
        SIMPLEQ_INIT(&wl->wl_entries);

        wl->wl_logvp = vp;
        wl->wl_devvp = devvp;
        wl->wl_mount = mp;
        wl->wl_logpbn = logpbn;
        wl->wl_log_dev_bshift = log_dev_bshift;
        wl->wl_fs_dev_bshift = fs_dev_bshift;

        wl->wl_flush = flushfn;
        wl->wl_flush_abort = flushabortfn;

        /* Reserve two log device blocks for the commit headers */
        wl->wl_circ_off = 2<<wl->wl_log_dev_bshift;
        wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off);
        /* truncate the log usage to a multiple of the log device block size */
        wl->wl_circ_size >>= wl->wl_log_dev_bshift;
        wl->wl_circ_size <<= wl->wl_log_dev_bshift;
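        /*
         * Worked example (illustrative): count = 1024 blocks of
         * blksize = 2048 (log_dev_bshift = 11) gives
         * wl_circ_off = 2 << 11 = 4096 bytes and
         * wl_circ_size = 2MB - 4096, already a multiple of 1 << 11.
         */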

        /*
         * wl_bufbytes_max limits the size of the in memory transaction space.
         * - Since buffers are allocated and accounted for in units of
         *   PAGE_SIZE it is required to be a multiple of PAGE_SIZE
         *   (i.e. 1<<PAGE_SHIFT)
         * - Since the log device has to be written in units of
         *   1<<wl_log_dev_bshift it is required to be a multiple of
         *   1<<wl_log_dev_bshift.
         * - Since filesystem will provide data in units of 1<<wl_fs_dev_bshift,
         *   it is convenient to be a multiple of 1<<wl_fs_dev_bshift.
         * Therefore it must be a multiple of the least common multiple of those
         * three quantities.  Fortunately, all of those quantities are
         * guaranteed to be a power of two, and the least common multiple of
         * a set of numbers which are all powers of two is simply the maximum
         * of those numbers.  Finally, the maximum logarithm of a power of two
         * is the same as the log of the maximum power of two.  So we can do
         * the following operations to size wl_bufbytes_max:
         */

        /* XXX fix actual number of pages reserved per filesystem. */
        wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2);

        /* Round wl_bufbytes_max down to the largest power-of-two constraint */
        wl->wl_bufbytes_max >>= PAGE_SHIFT;
        wl->wl_bufbytes_max <<= PAGE_SHIFT;
        wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift;
        wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift;
        wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift;
        wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift;
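        /*
         * Worked example (illustrative): with PAGE_SHIFT = 12 and both
         * bshifts = 9, the shift pairs above round wl_bufbytes_max down
         * to a multiple of max(4096, 512, 512) = 4096,
         * e.g. 1048575 -> 1044480.
         */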

        /* XXX maybe use filesystem fragment size instead of 1024 */
        /* XXX fix actual number of buffers reserved per filesystem. */
        wl->wl_bufcount_max = (nbuf / 2) * 1024;

        /* XXX tie this into resource estimation */
        wl->wl_dealloclim = wl->wl_bufbytes_max / mp->mnt_stat.f_bsize / 2;

        wl->wl_deallocblks = wapbl_alloc(sizeof(*wl->wl_deallocblks) *
            wl->wl_dealloclim);
        wl->wl_dealloclens = wapbl_alloc(sizeof(*wl->wl_dealloclens) *
            wl->wl_dealloclim);

        wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);

        /* Initialize the commit header */
        {
                struct wapbl_wc_header *wc;
                size_t len = 1 << wl->wl_log_dev_bshift;
                wc = wapbl_calloc(1, len);
                wc->wc_type = WAPBL_WC_HEADER;
                wc->wc_len = len;
                wc->wc_circ_off = wl->wl_circ_off;
                wc->wc_circ_size = wl->wl_circ_size;
                /* XXX wc->wc_fsid */
                wc->wc_log_dev_bshift = wl->wl_log_dev_bshift;
                wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift;
                wl->wl_wc_header = wc;
                wl->wl_wc_scratch = wapbl_alloc(len);
        }

        /*
         * if there was an existing set of unlinked but
         * allocated inodes, preserve it in the new
         * log.
         */
        if (wr && wr->wr_inodescnt) {
                error = wapbl_start_flush_inodes(wl, wr);
                if (error)
                        goto errout;
        }

        error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail);
        if (error) {
                goto errout;
        }

        *wlp = wl;
#if defined(WAPBL_DEBUG)
        wapbl_debug_wl = wl;
#endif

        return 0;
 errout:
        wapbl_discard(wl);
        wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
        wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
        wapbl_free(wl->wl_deallocblks,
            sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim);
        wapbl_free(wl->wl_dealloclens,
            sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim);
        wapbl_inodetrk_free(wl);
        wapbl_free(wl, sizeof(*wl));

        return error;
}

/*
 * Like wapbl_flush, only discards the transaction
 * completely
 */

void
wapbl_discard(struct wapbl *wl)
{
        struct wapbl_entry *we;
        struct buf *bp;
        int i;

        /*
         * XXX we may consider using upgrade here
         * if we want to call flush from inside a transaction
         */
        rw_enter(&wl->wl_rwlock, RW_WRITER);
        wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
            wl->wl_dealloccnt);

#ifdef WAPBL_DEBUG_PRINT
        {
                pid_t pid = -1;
                lwpid_t lid = -1;
                if (curproc)
                        pid = curproc->p_pid;
                if (curlwp)
                        lid = curlwp->l_lid;
#ifdef WAPBL_DEBUG_BUFBYTES
                WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
                    ("wapbl_discard: thread %d.%d discarding "
                    "transaction\n"
                    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
                    "deallocs=%d inodes=%d\n"
                    "\terrcnt = %u, reclaimable=%zu reserved=%zu "
                    "unsynced=%zu\n",
                    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
                    wl->wl_bcount, wl->wl_dealloccnt,
                    wl->wl_inohashcnt, wl->wl_error_count,
                    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
                    wl->wl_unsynced_bufbytes));
                SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
                        WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
                            ("\tentry: bufcount = %zu, reclaimable = %zu, "
                            "error = %d, unsynced = %zu\n",
                            we->we_bufcount, we->we_reclaimable_bytes,
                            we->we_error, we->we_unsynced_bufbytes));
                }
#else /* !WAPBL_DEBUG_BUFBYTES */
                WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
                    ("wapbl_discard: thread %d.%d discarding transaction\n"
                    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
                    "deallocs=%d inodes=%d\n"
                    "\terrcnt = %u, reclaimable=%zu reserved=%zu\n",
                    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
                    wl->wl_bcount, wl->wl_dealloccnt,
                    wl->wl_inohashcnt, wl->wl_error_count,
                    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes));
                SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
                        WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
                            ("\tentry: bufcount = %zu, reclaimable = %zu, "
                            "error = %d\n",
                            we->we_bufcount, we->we_reclaimable_bytes,
                            we->we_error));
                }
#endif /* !WAPBL_DEBUG_BUFBYTES */
        }
#endif /* WAPBL_DEBUG_PRINT */

        for (i = 0; i <= wl->wl_inohashmask; i++) {
                struct wapbl_ino_head *wih;
                struct wapbl_ino *wi;

                wih = &wl->wl_inohash[i];
                while ((wi = LIST_FIRST(wih)) != NULL) {
                        LIST_REMOVE(wi, wi_hash);
                        pool_put(&wapbl_ino_pool, wi);
                        KASSERT(wl->wl_inohashcnt > 0);
                        wl->wl_inohashcnt--;
                }
        }

        /*
         * clean buffer list
         */
        mutex_enter(&bufcache_lock);
        mutex_enter(&wl->wl_mtx);
        while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
                if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) {
                        /*
                         * The buffer will be unlocked and
                         * removed from the transaction in brelse
                         */
                        mutex_exit(&wl->wl_mtx);
                        brelsel(bp, 0);
                        mutex_enter(&wl->wl_mtx);
                }
        }
        mutex_exit(&wl->wl_mtx);
        mutex_exit(&bufcache_lock);

        /*
         * Remove references to this wl from wl_entries, free any which
         * no longer have buffers, others will be freed in wapbl_biodone
         * when they no longer have any buffers.
         */
        while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) {
                SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
                /* XXX should we be accumulating wl_error_count
                 * and increasing reclaimable bytes ? */
                we->we_wapbl = NULL;
                if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
                        KASSERT(we->we_unsynced_bufbytes == 0);
#endif
                        pool_put(&wapbl_entry_pool, we);
                }
        }

        /* Discard list of deallocs */
        wl->wl_dealloccnt = 0;
        /* XXX should we clear wl_reserved_bytes? */

        KASSERT(wl->wl_bufbytes == 0);
        KASSERT(wl->wl_bcount == 0);
        KASSERT(wl->wl_bufcount == 0);
        KASSERT(LIST_EMPTY(&wl->wl_bufs));
        KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
        KASSERT(wl->wl_inohashcnt == 0);

        rw_exit(&wl->wl_rwlock);
}

int
wapbl_stop(struct wapbl *wl, int force)
{
        struct vnode *vp;
        int error;

        WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n"));
        error = wapbl_flush(wl, 1);
        if (error) {
                if (force)
                        wapbl_discard(wl);
                else
                        return error;
        }

        /* Unlinked inodes persist after a flush */
        if (wl->wl_inohashcnt) {
                if (force) {
                        wapbl_discard(wl);
                } else {
                        return EBUSY;
                }
        }

        KASSERT(wl->wl_bufbytes == 0);
        KASSERT(wl->wl_bcount == 0);
        KASSERT(wl->wl_bufcount == 0);
        KASSERT(LIST_EMPTY(&wl->wl_bufs));
        KASSERT(wl->wl_dealloccnt == 0);
        KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
        KASSERT(wl->wl_inohashcnt == 0);

        vp = wl->wl_logvp;

        wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
        wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
        wapbl_free(wl->wl_deallocblks,
            sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim);
        wapbl_free(wl->wl_dealloclens,
            sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim);
        wapbl_inodetrk_free(wl);

        cv_destroy(&wl->wl_reclaimable_cv);
        mutex_destroy(&wl->wl_mtx);
        rw_destroy(&wl->wl_rwlock);
        wapbl_free(wl, sizeof(*wl));

        return 0;
}

static int
wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
{
        struct pstats *pstats = curlwp->l_proc->p_stats;
        struct buf *bp;
        int error;

        KASSERT((flags & ~(B_WRITE | B_READ)) == 0);
        KASSERT(devvp->v_type == VBLK);

        if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
                mutex_enter(devvp->v_interlock);
                devvp->v_numoutput++;
                mutex_exit(devvp->v_interlock);
                pstats->p_ru.ru_oublock++;
        } else {
                pstats->p_ru.ru_inblock++;
        }

        bp = getiobuf(devvp, true);
        bp->b_flags = flags;
        bp->b_cflags = BC_BUSY; /* silly & dubious */
        bp->b_dev = devvp->v_rdev;
        bp->b_data = data;
        bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
        bp->b_blkno = pbn;
        BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);

        WAPBL_PRINTF(WAPBL_PRINT_IO,
            ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%"PRIx64"\n",
            BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount,
            bp->b_blkno, bp->b_dev));

        VOP_STRATEGY(devvp, bp);

        error = biowait(bp);
        putiobuf(bp);

        if (error) {
                WAPBL_PRINTF(WAPBL_PRINT_ERROR,
                    ("wapbl_doio: %s %zu bytes at block %" PRId64
                    " on dev 0x%"PRIx64" failed with error %d\n",
                    (((flags & (B_WRITE | B_READ)) == B_WRITE) ?
                     "write" : "read"),
                    len, pbn, devvp->v_rdev, error));
        }

        return error;
}

int
wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

        return wapbl_doio(data, len, devvp, pbn, B_WRITE);
}

int
wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

        return wapbl_doio(data, len, devvp, pbn, B_READ);
}

/*
 * Off is the byte offset in the log; returns the new offset for the
 * next write.  Handles log wraparound.
 */
static int
wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp)
{
        size_t slen;
        off_t off = *offp;
        int error;
        daddr_t pbn;

        KDASSERT(((len >> wl->wl_log_dev_bshift) <<
            wl->wl_log_dev_bshift) == len);

        if (off < wl->wl_circ_off)
                off = wl->wl_circ_off;
        slen = wl->wl_circ_off + wl->wl_circ_size - off;
        if (slen < len) {
                pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
#ifdef _KERNEL
                pbn = btodb(pbn << wl->wl_log_dev_bshift);
#endif
                error = wapbl_write(data, slen, wl->wl_devvp, pbn);
                if (error)
                        return error;
                data = (uint8_t *)data + slen;
                len -= slen;
                off = wl->wl_circ_off;
        }
        pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
#ifdef _KERNEL
        pbn = btodb(pbn << wl->wl_log_dev_bshift);
#endif
        error = wapbl_write(data, len, wl->wl_devvp, pbn);
        if (error)
                return error;
        off += len;
        if (off >= wl->wl_circ_off + wl->wl_circ_size)
                off = wl->wl_circ_off;
        *offp = off;
        return 0;
}
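
/*
 * Wraparound example for the above (illustrative): with
 * wl_circ_off = 1024 and wl_circ_size = 8192, a 1024 byte write at
 * off = 8704 splits into slen = 1024 + 8192 - 8704 = 512 bytes at the
 * end of the circular area plus the remaining 512 bytes at
 * wl_circ_off, leaving *offp = 1536.
 */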

/****************************************************************/

int
wapbl_begin(struct wapbl *wl, const char *file, int line)
{
        int doflush;
        unsigned lockcount;

        KDASSERT(wl);

        /*
         * XXX this needs to be made much more sophisticated.
         * perhaps each wapbl_begin could reserve a specified
         * number of buffers and bytes.
         */
        mutex_enter(&wl->wl_mtx);
        lockcount = wl->wl_lock_count;
        doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) >
            wl->wl_bufbytes_max / 2) ||
            ((wl->wl_bufcount + (lockcount * 10)) >
            wl->wl_bufcount_max / 2) ||
            (wapbl_transaction_len(wl) > wl->wl_circ_size / 2) ||
            (wl->wl_dealloccnt >= (wl->wl_dealloclim / 2));
        mutex_exit(&wl->wl_mtx);

        if (doflush) {
                WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
                    ("force flush lockcnt=%d bufbytes=%zu "
                    "(max=%zu) bufcount=%zu (max=%zu) "
                    "dealloccnt %d (lim=%d)\n",
                    lockcount, wl->wl_bufbytes,
                    wl->wl_bufbytes_max, wl->wl_bufcount,
                    wl->wl_bufcount_max,
                    wl->wl_dealloccnt, wl->wl_dealloclim));
        }

        if (doflush) {
                int error = wapbl_flush(wl, 0);
                if (error)
                        return error;
        }

        rw_enter(&wl->wl_rwlock, RW_READER);
        mutex_enter(&wl->wl_mtx);
        wl->wl_lock_count++;
        mutex_exit(&wl->wl_mtx);

#if defined(WAPBL_DEBUG_PRINT)
        WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
            ("wapbl_begin thread %d.%d with bufcount=%zu "
            "bufbytes=%zu bcount=%zu at %s:%d\n",
            curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
            wl->wl_bufbytes, wl->wl_bcount, file, line));
#endif

        return 0;
}

void
wapbl_end(struct wapbl *wl)
{

#if defined(WAPBL_DEBUG_PRINT)
        WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
            ("wapbl_end thread %d.%d with bufcount=%zu "
            "bufbytes=%zu bcount=%zu\n",
            curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
            wl->wl_bufbytes, wl->wl_bcount));
#endif

#ifdef DIAGNOSTIC
        size_t flushsize = wapbl_transaction_len(wl);
        if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
                /*
                 * XXX this could be handled more gracefully, perhaps place
                 * only a partial transaction in the log and allow the
                 * remaining to flush without the protection of the journal.
                 */
                panic("wapbl_end: current transaction too big to flush\n");
        }
#endif

        mutex_enter(&wl->wl_mtx);
        KASSERT(wl->wl_lock_count > 0);
        wl->wl_lock_count--;
        mutex_exit(&wl->wl_mtx);

        rw_exit(&wl->wl_rwlock);
}

void
wapbl_add_buf(struct wapbl *wl, struct buf * bp)
{

        KASSERT(bp->b_cflags & BC_BUSY);
        KASSERT(bp->b_vp);

        wapbl_jlock_assert(wl);

#if 0
        /*
         * XXX this might be an issue for swapfiles.
         * see uvm_swap.c:1702
         *
         * XXX2 why require it then? leap of semantics?
         */
        KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
#endif

        mutex_enter(&wl->wl_mtx);
        if (bp->b_flags & B_LOCKED) {
                LIST_REMOVE(bp, b_wapbllist);
                WAPBL_PRINTF(WAPBL_PRINT_BUFFER2,
                    ("wapbl_add_buf thread %d.%d re-adding buf %p "
                    "with %d bytes %d bcount\n",
                    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
                    bp->b_bcount));
        } else {
                /* unlocked but dirty buffers shouldn't exist */
                KASSERT(!(bp->b_oflags & BO_DELWRI));
                wl->wl_bufbytes += bp->b_bufsize;
                wl->wl_bcount += bp->b_bcount;
                wl->wl_bufcount++;
                WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
                    ("wapbl_add_buf thread %d.%d adding buf %p "
                    "with %d bytes %d bcount\n",
                    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
                    bp->b_bcount));
        }
        LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist);
        mutex_exit(&wl->wl_mtx);

        bp->b_flags |= B_LOCKED;
}

static void
wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp)
{

        KASSERT(mutex_owned(&wl->wl_mtx));
        KASSERT(bp->b_cflags & BC_BUSY);
        wapbl_jlock_assert(wl);

#if 0
        /*
         * XXX this might be an issue for swapfiles.
         * see uvm_swap.c:1725
         *
         * XXXdeux: see above
         */
        KASSERT((bp->b_flags & BC_NOCACHE) == 0);
#endif
        KASSERT(bp->b_flags & B_LOCKED);

        WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
            ("wapbl_remove_buf thread %d.%d removing buf %p with "
            "%d bytes %d bcount\n",
            curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount));

        KASSERT(wl->wl_bufbytes >= bp->b_bufsize);
        wl->wl_bufbytes -= bp->b_bufsize;
        KASSERT(wl->wl_bcount >= bp->b_bcount);
        wl->wl_bcount -= bp->b_bcount;
        KASSERT(wl->wl_bufcount > 0);
        wl->wl_bufcount--;
        KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
        KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
        LIST_REMOVE(bp, b_wapbllist);

        bp->b_flags &= ~B_LOCKED;
}

/* called from brelsel() in vfs_bio among other places */
void
wapbl_remove_buf(struct wapbl * wl, struct buf *bp)
{

        mutex_enter(&wl->wl_mtx);
        wapbl_remove_buf_locked(wl, bp);
        mutex_exit(&wl->wl_mtx);
}

void
wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt)
{

        KASSERT(bp->b_cflags & BC_BUSY);

        /*
         * XXX: why does this depend on B_LOCKED?  otherwise the buf
         * is not for a transaction?  if so, why is this called in the
         * first place?
         */
        if (bp->b_flags & B_LOCKED) {
                mutex_enter(&wl->wl_mtx);
                wl->wl_bufbytes += bp->b_bufsize - oldsz;
                wl->wl_bcount += bp->b_bcount - oldcnt;
                mutex_exit(&wl->wl_mtx);
        }
}

#endif /* _KERNEL */

/****************************************************************/
/* Some utility inlines */

/*
 * Advance the circular log pointer 'old' by 'delta', wrapping within
 * the region [off, off + size).
 */
static inline off_t
wapbl_advance(size_t size, size_t off, off_t old, size_t delta)
{
        off_t new;

        /* Define acceptable ranges for inputs. */
        KASSERT(delta <= (size_t)size);
        KASSERT((old == 0) || ((size_t)old >= off));
        KASSERT(old < (off_t)(size + off));

        if ((old == 0) && (delta != 0))
                new = off + delta;
        else if ((old + delta) < (size + off))
                new = old + delta;
        else
                new = (old + delta) - size;

        /* Note some interesting axioms */
        KASSERT((delta != 0) || (new == old));
        KASSERT((delta == 0) || (new != 0));
        KASSERT((delta != (size)) || (new == old));

        /* Define acceptable ranges for output. */
        KASSERT((new == 0) || ((size_t)new >= off));
        KASSERT((size_t)new < (size + off));
        return new;
}
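
/*
 * Examples (illustrative), with size = 8192 and off = 1024:
 *	wapbl_advance(8192, 1024, 0, 512)     == 1536  (leaves empty state)
 *	wapbl_advance(8192, 1024, 8704, 1024) == 1536  (wraps past the end)
 */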

static inline size_t
wapbl_space_used(size_t avail, off_t head, off_t tail)
{

        if (tail == 0) {
                KASSERT(head == 0);
                return 0;
        }
        return ((head + (avail - 1) - tail) % avail) + 1;
}

static inline size_t
wapbl_space_free(size_t avail, off_t head, off_t tail)
{

        return avail - wapbl_space_used(avail, head, tail);
}
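
/*
 * Example (illustrative): avail = 8192, head = 2048, tail = 1024 gives
 * wapbl_space_used() == ((2048 + 8191 - 1024) % 8192) + 1 == 1024 and
 * wapbl_space_free() == 8192 - 1024 == 7168.  head == tail == 0 is the
 * empty log, so used == 0 and free == avail.
 */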

static inline void
wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp,
    off_t *tailp)
{
        off_t head = *headp;
        off_t tail = *tailp;

        KASSERT(delta <= wapbl_space_free(size, head, tail));
        head = wapbl_advance(size, off, head, delta);
        if ((tail == 0) && (head != 0))
                tail = off;
        *headp = head;
        *tailp = tail;
}

static inline void
wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp,
    off_t *tailp)
{
        off_t head = *headp;
        off_t tail = *tailp;

        KASSERT(delta <= wapbl_space_used(size, head, tail));
        tail = wapbl_advance(size, off, tail, delta);
        if (head == tail) {
                head = tail = 0;
        }
        *headp = head;
        *tailp = tail;
}
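
/*
 * Together these maintain the empty/full convention documented in
 * struct wapbl (illustrative): advancing the head of an empty log
 * (head == tail == 0) pins the tail to 'off', and advancing the tail
 * until it meets the head resets both to 0, i.e. back to empty.
 */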

#ifdef _KERNEL

/****************************************************************/

/*
 * Remove transactions whose buffers are completely flushed to disk.
 * Will block until at least minfree space is available.
 * Only intended to be called from inside wapbl_flush and therefore
 * does not protect against commit races with itself or with flush.
 */
static int
wapbl_truncate(struct wapbl *wl, size_t minfree, int waitonly)
{
        size_t delta;
        size_t avail;
        off_t head;
        off_t tail;
        int error = 0;

        KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes));
        KASSERT(rw_write_held(&wl->wl_rwlock));

        mutex_enter(&wl->wl_mtx);

        /*
         * First check to see if we have to do a commit
         * at all.
         */
        avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail);
        if (minfree < avail) {
                mutex_exit(&wl->wl_mtx);
                return 0;
        }
        minfree -= avail;
        while ((wl->wl_error_count == 0) &&
            (wl->wl_reclaimable_bytes < minfree)) {
                WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
                    ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd "
                    "minfree=%zd\n",
                    &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes,
                    minfree));

                cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx);
        }
        if (wl->wl_reclaimable_bytes < minfree) {
                KASSERT(wl->wl_error_count);
                /* XXX maybe get actual error from buffer instead someday? */
                error = EIO;
        }
        head = wl->wl_head;
        tail = wl->wl_tail;
        delta = wl->wl_reclaimable_bytes;

        /* If all of the entries are flushed, then be sure to keep
         * the reserved bytes reserved.  Watch out for discarded transactions,
         * which could leave more bytes reserved than are reclaimable.
         */
        if (SIMPLEQ_EMPTY(&wl->wl_entries) &&
            (delta >= wl->wl_reserved_bytes)) {
                delta -= wl->wl_reserved_bytes;
        }
        wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head,
            &tail);
        KDASSERT(wl->wl_reserved_bytes <=
            wapbl_space_used(wl->wl_circ_size, head, tail));
        mutex_exit(&wl->wl_mtx);

        if (error)
                return error;

        if (waitonly)
                return 0;

        /*
         * This is where head, tail and delta are unprotected
         * from races against itself or flush.  This is ok since
         * we only call this routine from inside flush itself.
         *
         * XXX: how can it race against itself when accessed only
         * from behind the write-locked rwlock?
         */
        error = wapbl_write_commit(wl, head, tail);
        if (error)
                return error;

        wl->wl_head = head;
        wl->wl_tail = tail;

        mutex_enter(&wl->wl_mtx);
        KASSERT(wl->wl_reclaimable_bytes >= delta);
        wl->wl_reclaimable_bytes -= delta;
        mutex_exit(&wl->wl_mtx);
        WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
            ("wapbl_truncate thread %d.%d truncating %zu bytes\n",
            curproc->p_pid, curlwp->l_lid, delta));

        return 0;
}

/****************************************************************/

void
wapbl_biodone(struct buf *bp)
{
        struct wapbl_entry *we = bp->b_private;
        struct wapbl *wl = we->we_wapbl;

        /*
         * Handle possible flushing of buffers after log has been
         * decommissioned.
         */
        if (!wl) {
                KASSERT(we->we_bufcount > 0);
                we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
                KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize);
                we->we_unsynced_bufbytes -= bp->b_bufsize;
#endif

                if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
                        KASSERT(we->we_unsynced_bufbytes == 0);
#endif
                        pool_put(&wapbl_entry_pool, we);
                }

                brelse(bp, 0);
                return;
        }

#ifdef ohbother
        KDASSERT(bp->b_oflags & BO_DONE);
        KDASSERT(!(bp->b_oflags & BO_DELWRI));
        KDASSERT(bp->b_flags & B_ASYNC);
        KDASSERT(bp->b_cflags & BC_BUSY);
        KDASSERT(!(bp->b_flags & B_LOCKED));
        KDASSERT(!(bp->b_flags & B_READ));
        KDASSERT(!(bp->b_cflags & BC_INVAL));
        KDASSERT(!(bp->b_cflags & BC_NOCACHE));
#endif

        if (bp->b_error) {
#ifdef notyet /* Can't currently handle possible dirty buffer reuse */
                /*
                 * XXXpooka: interfaces not fully updated
                 * Note: this was not enabled in the original patch
                 * against netbsd4 either.  I don't know if comment
                 * above is true or not.
                 */

                /*
                 * If an error occurs, report the error and leave the
                 * buffer as a delayed write on the LRU queue.
                 * restarting the write would likely result in
                 * an error spinloop, so let it be done harmlessly
                 * by the syncer.
                 */
                bp->b_flags &= ~(B_DONE);
                simple_unlock(&bp->b_interlock);

                if (we->we_error == 0) {
                        mutex_enter(&wl->wl_mtx);
                        wl->wl_error_count++;
                        mutex_exit(&wl->wl_mtx);
                        cv_broadcast(&wl->wl_reclaimable_cv);
                }
                we->we_error = bp->b_error;
                bp->b_error = 0;
                brelse(bp);
                return;
#else
                /* For now, just mark the log permanently errored out */

                mutex_enter(&wl->wl_mtx);
                if (wl->wl_error_count == 0) {
                        wl->wl_error_count++;
                        cv_broadcast(&wl->wl_reclaimable_cv);
                }
                mutex_exit(&wl->wl_mtx);
#endif
        }

        mutex_enter(&wl->wl_mtx);

        KASSERT(we->we_bufcount > 0);
        we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
        KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize);
        we->we_unsynced_bufbytes -= bp->b_bufsize;
        KASSERT(wl->wl_unsynced_bufbytes >= bp->b_bufsize);
        wl->wl_unsynced_bufbytes -= bp->b_bufsize;
#endif

        /*
         * If the current transaction can be reclaimed, start
         * at the beginning and reclaim any consecutive reclaimable
         * transactions.  If we successfully reclaim anything,
         * then wakeup anyone waiting for the reclaim.
         */
        if (we->we_bufcount == 0) {
                size_t delta = 0;
                int errcnt = 0;
#ifdef WAPBL_DEBUG_BUFBYTES
                KDASSERT(we->we_unsynced_bufbytes == 0);
#endif
                /*
                 * clear any posted error, since the buffer it came from
                 * has successfully flushed by now
                 */
                while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) &&
                    (we->we_bufcount == 0)) {
                        delta += we->we_reclaimable_bytes;
                        if (we->we_error)
                                errcnt++;
                        SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
                        pool_put(&wapbl_entry_pool, we);
                }

                if (delta) {
                        wl->wl_reclaimable_bytes += delta;
                        KASSERT(wl->wl_error_count >= errcnt);
                        wl->wl_error_count -= errcnt;
                        cv_broadcast(&wl->wl_reclaimable_cv);
                }
        }

        mutex_exit(&wl->wl_mtx);
        brelse(bp, 0);
}

/*
 * Write transactions to disk + start I/O for contents
 */
int
wapbl_flush(struct wapbl *wl, int waitfor)
{
        struct buf *bp;
        struct wapbl_entry *we;
        off_t off;
        off_t head;
        off_t tail;
        size_t delta = 0;
        size_t flushsize;
        size_t reserved;
        int error = 0;

        /*
         * Do a quick check to see if a full flush can be skipped
         * This assumes that the flush callback does not need to be called
         * unless there are other outstanding bufs.
         */
        if (!waitfor) {
                size_t nbufs;
                mutex_enter(&wl->wl_mtx);       /* XXX need mutex here to
                                                   protect the KASSERTS */
                nbufs = wl->wl_bufcount;
                KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
                KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
                mutex_exit(&wl->wl_mtx);
                if (nbufs == 0)
                        return 0;
        }

        /*
         * XXX we may consider using LK_UPGRADE here
         * if we want to call flush from inside a transaction
         */
        rw_enter(&wl->wl_rwlock, RW_WRITER);
        wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
            wl->wl_dealloccnt);

        /*
         * Now that we are fully locked and flushed,
         * do another check for nothing to do.
         */
        if (wl->wl_bufcount == 0) {
                goto out;
        }

#if 0
        WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
            ("wapbl_flush thread %d.%d flushing entries with "
            "bufcount=%zu bufbytes=%zu\n",
            curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
            wl->wl_bufbytes));
#endif

        /* Calculate amount of space needed to flush */
        flushsize = wapbl_transaction_len(wl);
        if (wapbl_verbose_commit) {
                struct timespec ts;
                getnanotime(&ts);
                printf("%s: %lld.%09ld this transaction = %zu bytes\n",
                    __func__, (long long)ts.tv_sec,
                    (long)ts.tv_nsec, flushsize);
        }

        if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
                /*
                 * XXX this could be handled more gracefully, perhaps place
                 * only a partial transaction in the log and allow the
                 * remaining to flush without the protection of the journal.
                 */
                panic("wapbl_flush: current transaction too big to flush\n");
        }

        error = wapbl_truncate(wl, flushsize, 0);
        if (error)
                goto out2;

        off = wl->wl_head;
        KASSERT((off == 0) || ((off >= wl->wl_circ_off) &&
            (off < wl->wl_circ_off + wl->wl_circ_size)));
        error = wapbl_write_blocks(wl, &off);
        if (error)
                goto out2;
        error = wapbl_write_revocations(wl, &off);
        if (error)
                goto out2;
        error = wapbl_write_inodes(wl, &off);
        if (error)
                goto out2;

        reserved = 0;
        if (wl->wl_inohashcnt)
                reserved = wapbl_transaction_inodes_len(wl);

        head = wl->wl_head;
        tail = wl->wl_tail;

        wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize,
            &head, &tail);
#ifdef WAPBL_DEBUG
        if (head != off) {
                panic("lost head! head=%"PRIdMAX" tail=%" PRIdMAX
                      " off=%"PRIdMAX" flush=%zu\n",
                      (intmax_t)head, (intmax_t)tail, (intmax_t)off,
                      flushsize);
        }
#else
        KASSERT(head == off);
#endif

        /* Opportunistically move the tail forward if we can */
        if (!wapbl_lazy_truncate) {
                mutex_enter(&wl->wl_mtx);
                delta = wl->wl_reclaimable_bytes;
                mutex_exit(&wl->wl_mtx);
                wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta,
                    &head, &tail);
        }

        error = wapbl_write_commit(wl, head, tail);
        if (error)
                goto out2;

        we = pool_get(&wapbl_entry_pool, PR_WAITOK);

#ifdef WAPBL_DEBUG_BUFBYTES
        WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
            ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
            " unsynced=%zu"
            "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
            "inodes=%d\n",
            curproc->p_pid, curlwp->l_lid, flushsize, delta,
            wapbl_space_used(wl->wl_circ_size, head, tail),
            wl->wl_unsynced_bufbytes, wl->wl_bufcount,
            wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt,
            wl->wl_inohashcnt));
#else
        WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
            ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
            "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
            "inodes=%d\n",
            curproc->p_pid, curlwp->l_lid, flushsize, delta,
            wapbl_space_used(wl->wl_circ_size, head, tail),
            wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
            wl->wl_dealloccnt, wl->wl_inohashcnt));
#endif


        mutex_enter(&bufcache_lock);
        mutex_enter(&wl->wl_mtx);

        wl->wl_reserved_bytes = reserved;
        wl->wl_head = head;
        wl->wl_tail = tail;
        KASSERT(wl->wl_reclaimable_bytes >= delta);
        wl->wl_reclaimable_bytes -= delta;
        wl->wl_dealloccnt = 0;
#ifdef WAPBL_DEBUG_BUFBYTES
        wl->wl_unsynced_bufbytes += wl->wl_bufbytes;
#endif

        we->we_wapbl = wl;
        we->we_bufcount = wl->wl_bufcount;
#ifdef WAPBL_DEBUG_BUFBYTES
        we->we_unsynced_bufbytes = wl->wl_bufbytes;
#endif
        we->we_reclaimable_bytes = flushsize;
        we->we_error = 0;
        SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries);

        /*
         * This flushes bufs in the reverse order from how they were queued.
         * It shouldn't matter, but if we care we could use TAILQ instead.
         * XXX Note they will get put on the lru queue when they flush
         * so we might actually want to change this to preserve order.
         */
        while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
                if (bbusy(bp, 0, 0, &wl->wl_mtx)) {
                        continue;
                }
                bp->b_iodone = wapbl_biodone;
                bp->b_private = we;
                bremfree(bp);
                wapbl_remove_buf_locked(wl, bp);
                mutex_exit(&wl->wl_mtx);
                mutex_exit(&bufcache_lock);
                bawrite(bp);
                mutex_enter(&bufcache_lock);
                mutex_enter(&wl->wl_mtx);
        }
        mutex_exit(&wl->wl_mtx);
        mutex_exit(&bufcache_lock);

#if 0
        WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
            ("wapbl_flush thread %d.%d done flushing entries...\n",
            curproc->p_pid, curlwp->l_lid));
#endif

 out:

        /*
         * If the waitfor flag is set, don't return until everything is
         * fully flushed and the on disk log is empty.
         */
        if (waitfor) {
                error = wapbl_truncate(wl, wl->wl_circ_size -
                    wl->wl_reserved_bytes, wapbl_lazy_truncate);
        }

 out2:
        if (error) {
                wl->wl_flush_abort(wl->wl_mount, wl->wl_deallocblks,
                    wl->wl_dealloclens, wl->wl_dealloccnt);
        }

#ifdef WAPBL_DEBUG_PRINT
        if (error) {
                pid_t pid = -1;
                lwpid_t lid = -1;
                if (curproc)
                        pid = curproc->p_pid;
                if (curlwp)
                        lid = curlwp->l_lid;
                mutex_enter(&wl->wl_mtx);
#ifdef WAPBL_DEBUG_BUFBYTES
                WAPBL_PRINTF(WAPBL_PRINT_ERROR,
                    ("wapbl_flush: thread %d.%d aborted flush: "
                    "error = %d\n"
                    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
                    "deallocs=%d inodes=%d\n"
                    "\terrcnt = %d, reclaimable=%zu reserved=%zu "
                    "unsynced=%zu\n",
                    pid, lid, error, wl->wl_bufcount,
                    wl->wl_bufbytes, wl->wl_bcount,
                    wl->wl_dealloccnt, wl->wl_inohashcnt,
                    wl->wl_error_count, wl->wl_reclaimable_bytes,
                    wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes));
                SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
                        WAPBL_PRINTF(WAPBL_PRINT_ERROR,
                            ("\tentry: bufcount = %zu, reclaimable = %zu, "
                            "error = %d, unsynced = %zu\n",
                            we->we_bufcount, we->we_reclaimable_bytes,
                            we->we_error, we->we_unsynced_bufbytes));
                }
#else
                WAPBL_PRINTF(WAPBL_PRINT_ERROR,
                    ("wapbl_flush: thread %d.%d aborted flush: "
                    "error = %d\n"
                    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
                    "deallocs=%d inodes=%d\n"
                    "\terrcnt = %d, reclaimable=%zu reserved=%zu\n",
                    pid, lid, error, wl->wl_bufcount,
                    wl->wl_bufbytes, wl->wl_bcount,
                    wl->wl_dealloccnt, wl->wl_inohashcnt,
                    wl->wl_error_count, wl->wl_reclaimable_bytes,
                    wl->wl_reserved_bytes));
                SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
                        WAPBL_PRINTF(WAPBL_PRINT_ERROR,
                            ("\tentry: bufcount = %zu, reclaimable = %zu, "
                            "error = %d\n", we->we_bufcount,
                            we->we_reclaimable_bytes, we->we_error));
                }
#endif
                mutex_exit(&wl->wl_mtx);
        }
#endif

        rw_exit(&wl->wl_rwlock);
        return error;
}

/****************************************************************/

void
wapbl_jlock_assert(struct wapbl *wl)
{

        KASSERT(rw_lock_held(&wl->wl_rwlock));
}

void
wapbl_junlock_assert(struct wapbl *wl)
{

        KASSERT(!rw_write_held(&wl->wl_rwlock));
}

/****************************************************************/

/* locks missing */
void
wapbl_print(struct wapbl *wl,
    int full,
    void (*pr)(const char *, ...))
{
        struct buf *bp;
        struct wapbl_entry *we;
        (*pr)("wapbl %p", wl);
        (*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n",
            wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn);
        (*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n",
            wl->wl_circ_size, wl->wl_circ_off,
            (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail);
        (*pr)("fs_dev_bshift = %d, log_dev_bshift = %d\n",
            wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift);
#ifdef WAPBL_DEBUG_BUFBYTES
        (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
            "reserved = %zu errcnt = %d unsynced = %zu\n",
            wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
            wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
            wl->wl_error_count, wl->wl_unsynced_bufbytes);
#else
        (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
            "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes,
            wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
            wl->wl_error_count);
#endif
        (*pr)("\tdealloccnt = %d, dealloclim = %d\n",
            wl->wl_dealloccnt, wl->wl_dealloclim);
        (*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n",
            wl->wl_inohashcnt, wl->wl_inohashmask);
        (*pr)("entries:\n");
        SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
#ifdef WAPBL_DEBUG_BUFBYTES
                (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, "
                    "unsynced = %zu\n",
                    we->we_bufcount, we->we_reclaimable_bytes,
                    we->we_error, we->we_unsynced_bufbytes);
#else
                (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n",
                    we->we_bufcount, we->we_reclaimable_bytes, we->we_error);
#endif
        }
        if (full) {
                int cnt = 0;
                (*pr)("bufs =");
                LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) {
                        if (!LIST_NEXT(bp, b_wapbllist)) {
                                (*pr)(" %p", bp);
                        } else if ((++cnt % 6) == 0) {
                                (*pr)(" %p,\n\t", bp);
                        } else {
                                (*pr)(" %p,", bp);
                        }
                }
                (*pr)("\n");

                (*pr)("dealloced blks = ");
                {
                        int i;
                        cnt = 0;
                        for (i = 0; i < wl->wl_dealloccnt; i++) {
                                (*pr)(" %"PRId64":%d,",
                                    wl->wl_deallocblks[i],
                                    wl->wl_dealloclens[i]);
                                if ((++cnt % 4) == 0) {
                                        (*pr)("\n\t");
                                }
                        }
                }
                (*pr)("\n");

                (*pr)("registered inodes = ");
                {
                        int i;
                        cnt = 0;
                        for (i = 0; i <= wl->wl_inohashmask; i++) {
                                struct wapbl_ino_head *wih;
                                struct wapbl_ino *wi;

                                wih = &wl->wl_inohash[i];
                                LIST_FOREACH(wi, wih, wi_hash) {
                                        if (wi->wi_ino == 0)
                                                continue;
                                        (*pr)(" %"PRId32"/0%06"PRIo32",",
                                            wi->wi_ino, wi->wi_mode);
                                        if ((++cnt % 4) == 0) {
                                                (*pr)("\n\t");
                                        }
                                }
                        }
                        (*pr)("\n");
                }
        }
}

#if defined(WAPBL_DEBUG) || defined(DDB)
void
wapbl_dump(struct wapbl *wl)
{
#if defined(WAPBL_DEBUG)
        if (!wl)
                wl = wapbl_debug_wl;
#endif
        if (!wl)
                return;
        wapbl_print(wl, 1, printf);
}
#endif

/****************************************************************/

void
wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len)
{

        wapbl_jlock_assert(wl);

        mutex_enter(&wl->wl_mtx);
        /* XXX should eventually instead tie this into resource estimation */
        /*
         * XXX this panic needs locking/mutex analysis and the
         * ability to cope with the failure.
         */
        /* XXX this XXX doesn't have enough XXX */
        if (__predict_false(wl->wl_dealloccnt >= wl->wl_dealloclim))
                panic("wapbl_register_deallocation: out of resources");

        wl->wl_deallocblks[wl->wl_dealloccnt] = blk;
        wl->wl_dealloclens[wl->wl_dealloccnt] = len;
        wl->wl_dealloccnt++;
        WAPBL_PRINTF(WAPBL_PRINT_ALLOC,
            ("wapbl_register_deallocation: blk=%"PRId64" len=%d\n", blk, len));
        mutex_exit(&wl->wl_mtx);
}

/****************************************************************/

static void
wapbl_inodetrk_init(struct wapbl *wl, u_int size)
{

        wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask);
        if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) {
                pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0,
                    "wapblinopl", &pool_allocator_nointr, IPL_NONE);
        }
}

static void
wapbl_inodetrk_free(struct wapbl *wl)
{

        /* XXX this KASSERT needs locking/mutex analysis */
        KASSERT(wl->wl_inohashcnt == 0);
        hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask);
        if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) {
                pool_destroy(&wapbl_ino_pool);
        }
}

static struct wapbl_ino *
wapbl_inodetrk_get(struct wapbl *wl, ino_t ino)
{
        struct wapbl_ino_head *wih;
        struct wapbl_ino *wi;

        KASSERT(mutex_owned(&wl->wl_mtx));

        wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
        LIST_FOREACH(wi, wih, wi_hash) {
                if (ino == wi->wi_ino)
                        return wi;
        }
        return 0;
}

void
wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode)
{
        struct wapbl_ino_head *wih;
        struct wapbl_ino *wi;

        wi = pool_get(&wapbl_ino_pool, PR_WAITOK);

        mutex_enter(&wl->wl_mtx);
        if (wapbl_inodetrk_get(wl, ino) == NULL) {
                wi->wi_ino = ino;
                wi->wi_mode = mode;
                wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
                LIST_INSERT_HEAD(wih, wi, wi_hash);
                wl->wl_inohashcnt++;
                WAPBL_PRINTF(WAPBL_PRINT_INODE,
                    ("wapbl_register_inode: ino=%"PRId64"\n", ino));
                mutex_exit(&wl->wl_mtx);
        } else {
                mutex_exit(&wl->wl_mtx);
                pool_put(&wapbl_ino_pool, wi);
        }
}

void
wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode)
{
        struct wapbl_ino *wi;

        mutex_enter(&wl->wl_mtx);
        wi = wapbl_inodetrk_get(wl, ino);
        if (wi) {
                WAPBL_PRINTF(WAPBL_PRINT_INODE,
                    ("wapbl_unregister_inode: ino=%"PRId64"\n", ino));
                KASSERT(wl->wl_inohashcnt > 0);
                wl->wl_inohashcnt--;
                LIST_REMOVE(wi, wi_hash);
                mutex_exit(&wl->wl_mtx);

                pool_put(&wapbl_ino_pool, wi);
        } else {
                mutex_exit(&wl->wl_mtx);
        }
}

/****************************************************************/

static inline size_t
wapbl_transaction_inodes_len(struct wapbl *wl)
{
        int blocklen = 1<<wl->wl_log_dev_bshift;
        int iph;

        /* Calculate number of inodes described in an inodelist header */
        iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
            sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);

        KASSERT(iph > 0);

        return MAX(1, howmany(wl->wl_inohashcnt, iph)) * blocklen;
}


/* Calculate amount of space a transaction will take on disk */
static size_t
wapbl_transaction_len(struct wapbl *wl)
{
        int blocklen = 1<<wl->wl_log_dev_bshift;
        size_t len;
        int bph;

        /* Calculate number of blocks described in a blocklist header */
        bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
            sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);

        KASSERT(bph > 0);

        len = wl->wl_bcount;
        len += howmany(wl->wl_bufcount, bph) * blocklen;
        len += howmany(wl->wl_dealloccnt, bph) * blocklen;
        len += wapbl_transaction_inodes_len(wl);

        return len;
}
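
/*
 * Worked example (illustrative; bph = 60 is an assumed value): 10
 * buffers totalling wl_bcount = 40960 bytes plus 3 deallocations with
 * blocklen = 512 give
 *	len = 40960 + howmany(10, 60)*512 + howmany(3, 60)*512 + 512
 *	    = 42496
 * where the final 512 is one inodelist block from
 * wapbl_transaction_inodes_len() even when wl_inohashcnt == 0.
 */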
1906
1907 /*
1908 * wapbl_cache_sync: issue DIOCCACHESYNC
1909 */
1910 static int
1911 wapbl_cache_sync(struct wapbl *wl, const char *msg)
1912 {
1913 const bool verbose = wapbl_verbose_commit >= 2;
1914 struct bintime start_time;
1915 int force = 1;
1916 int error;
1917
1918 if (!wapbl_flush_disk_cache) {
1919 return 0;
1920 }
1921 if (verbose) {
1922 bintime(&start_time);
1923 }
1924 error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force,
1925 FWRITE, FSCRED);
1926 if (error) {
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_cache_sync: DIOCCACHESYNC on dev 0x%jx "
		    "returned %d\n", (uintmax_t)wl->wl_devvp->v_rdev, error));
1930 }
1931 if (verbose) {
1932 struct bintime d;
1933 struct timespec ts;
1934
1935 bintime(&d);
1936 bintime_sub(&d, &start_time);
1937 bintime2timespec(&d, &ts);
1938 printf("wapbl_cache_sync: %s: dev 0x%jx %ju.%09lu\n",
1939 msg, (uintmax_t)wl->wl_devvp->v_rdev,
1940 (uintmax_t)ts.tv_sec, ts.tv_nsec);
1941 }
1942 return error;
1943 }
1944
/*
 * Perform the commit operation.
 *
 * Note that incrementing the generation number needs to be
 * protected against racing with other invocations of
 * wapbl_write_commit.  This is ok since this routine is only
 * invoked from wapbl_flush.
 */
1953 static int
1954 wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail)
1955 {
1956 struct wapbl_wc_header *wc = wl->wl_wc_header;
1957 struct timespec ts;
1958 int error;
1959 daddr_t pbn;
1960
	/*
	 * Flush the disk cache to ensure that the blocks we've written
	 * have actually reached stable storage before the commit header
	 * does.
	 *
	 * XXX A checksum should be calculated here instead; for now
	 * the cache flush has to stand in for it.
	 */
1967 wapbl_cache_sync(wl, "1");
1968
1969 wc->wc_head = head;
1970 wc->wc_tail = tail;
1971 wc->wc_checksum = 0;
1972 wc->wc_version = 1;
1973 getnanotime(&ts);
1974 wc->wc_time = ts.tv_sec;
1975 wc->wc_timensec = ts.tv_nsec;
1976
1977 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
1978 ("wapbl_write_commit: head = %"PRIdMAX "tail = %"PRIdMAX"\n",
1979 (intmax_t)head, (intmax_t)tail));
1980
	/*
	 * Write the commit header.
	 *
	 * XXX If the generation is about to roll over, first zero the
	 * second commit header before trying to write both headers.
	 */
1987
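	/*
	 * The commit header ping-pongs between the first two blocks of
	 * the log: the low bit of the generation number selects which
	 * copy is overwritten, and replay later trusts whichever copy
	 * carries the larger generation (see wapbl_replay_start).
	 */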
1988 pbn = wl->wl_logpbn + (wc->wc_generation % 2);
1989 #ifdef _KERNEL
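	/*
	 * Convert the log block number to a byte offset and then to
	 * the DEV_BSIZE units that the device I/O routines expect.
	 */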
1990 pbn = btodb(pbn << wc->wc_log_dev_bshift);
1991 #endif
1992 error = wapbl_write(wc, wc->wc_len, wl->wl_devvp, pbn);
1993 if (error)
1994 return error;
1995
1996 /*
1997 * flush disk cache to ensure that the commit header is actually
1998 * written before meta data blocks.
1999 */
2000 wapbl_cache_sync(wl, "2");
2001
	/*
	 * If the generation number was zero, write it out a second time.
	 * This handles initialization and generation number rollover.
	 */
2006 if (wc->wc_generation++ == 0) {
2007 error = wapbl_write_commit(wl, head, tail);
		/*
		 * This panic can be removed if we do the zeroing
		 * mentioned above and are certain to roll the
		 * generation number back on failure.
		 */
2013 if (error)
2014 panic("wapbl_write_commit: error writing duplicate "
2015 "log header: %d\n", error);
2016 }
2017 return 0;
2018 }
2019
/*
 * Write the transaction's buffer blocks to the log; the new log
 * offset is returned through offp.
 */
2021 static int
2022 wapbl_write_blocks(struct wapbl *wl, off_t *offp)
2023 {
2024 struct wapbl_wc_blocklist *wc =
2025 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
2026 int blocklen = 1<<wl->wl_log_dev_bshift;
2027 int bph;
2028 struct buf *bp;
2029 off_t off = *offp;
2030 int error;
2031 size_t padding;
2032
2033 KASSERT(rw_write_held(&wl->wl_rwlock));
2034
2035 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
2036 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
2037
2038 bp = LIST_FIRST(&wl->wl_bufs);
2039
2040 while (bp) {
2041 int cnt;
2042 struct buf *obp = bp;
2043
2044 KASSERT(bp->b_flags & B_LOCKED);
2045
2046 wc->wc_type = WAPBL_WC_BLOCKS;
2047 wc->wc_len = blocklen;
2048 wc->wc_blkcount = 0;
2049 while (bp && (wc->wc_blkcount < bph)) {
			/*
			 * Make sure all the physical block numbers are up to
			 * date.  If this is not always true on a given
			 * filesystem, then VOP_BMAP must be called.  We
			 * could call VOP_BMAP here, or else in the filesystem
			 * specific flush callback, although neither of those
			 * solutions allows us to take the vnode lock.  If a
			 * filesystem requires that we take the vnode lock to
			 * call VOP_BMAP, then we can probably do it in bwrite,
			 * where the vnode lock should already be held by the
			 * invoking code.
			 */
2062 KASSERT((bp->b_vp->v_type == VBLK) ||
2063 (bp->b_blkno != bp->b_lblkno));
2064 KASSERT(bp->b_blkno > 0);
2065
2066 wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno;
2067 wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount;
2068 wc->wc_len += bp->b_bcount;
2069 wc->wc_blkcount++;
2070 bp = LIST_NEXT(bp, b_wapbllist);
2071 }
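		/*
		 * The data following this header must occupy a whole
		 * number of log blocks, so round wc_len up and remember
		 * how much zero padding to append after the buffer
		 * contents.
		 */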
2072 if (wc->wc_len % blocklen != 0) {
2073 padding = blocklen - wc->wc_len % blocklen;
2074 wc->wc_len += padding;
2075 } else {
2076 padding = 0;
2077 }
2078
2079 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2080 ("wapbl_write_blocks: len = %u (padding %zu) off = %"PRIdMAX"\n",
2081 wc->wc_len, padding, (intmax_t)off));
2082
2083 error = wapbl_circ_write(wl, wc, blocklen, &off);
2084 if (error)
2085 return error;
2086 bp = obp;
2087 cnt = 0;
2088 while (bp && (cnt++ < bph)) {
2089 error = wapbl_circ_write(wl, bp->b_data,
2090 bp->b_bcount, &off);
2091 if (error)
2092 return error;
2093 bp = LIST_NEXT(bp, b_wapbllist);
2094 }
2095 if (padding) {
2096 void *zero;
2097
2098 zero = wapbl_alloc(padding);
2099 memset(zero, 0, padding);
2100 error = wapbl_circ_write(wl, zero, padding, &off);
2101 wapbl_free(zero, padding);
2102 if (error)
2103 return error;
2104 }
2105 }
2106 *offp = off;
2107 return 0;
2108 }
2109
2110 static int
2111 wapbl_write_revocations(struct wapbl *wl, off_t *offp)
2112 {
2113 struct wapbl_wc_blocklist *wc =
2114 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
2115 int i;
2116 int blocklen = 1<<wl->wl_log_dev_bshift;
2117 int bph;
2118 off_t off = *offp;
2119 int error;
2120
2121 if (wl->wl_dealloccnt == 0)
2122 return 0;
2123
2124 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
2125 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
2126
2127 i = 0;
2128 while (i < wl->wl_dealloccnt) {
2129 wc->wc_type = WAPBL_WC_REVOCATIONS;
2130 wc->wc_len = blocklen;
2131 wc->wc_blkcount = 0;
2132 while ((i < wl->wl_dealloccnt) && (wc->wc_blkcount < bph)) {
2133 wc->wc_blocks[wc->wc_blkcount].wc_daddr =
2134 wl->wl_deallocblks[i];
2135 wc->wc_blocks[wc->wc_blkcount].wc_dlen =
2136 wl->wl_dealloclens[i];
2137 wc->wc_blkcount++;
2138 i++;
2139 }
2140 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2141 ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n",
2142 wc->wc_len, (intmax_t)off));
2143 error = wapbl_circ_write(wl, wc, blocklen, &off);
2144 if (error)
2145 return error;
2146 }
2147 *offp = off;
2148 return 0;
2149 }
2150
2151 static int
2152 wapbl_write_inodes(struct wapbl *wl, off_t *offp)
2153 {
2154 struct wapbl_wc_inodelist *wc =
2155 (struct wapbl_wc_inodelist *)wl->wl_wc_scratch;
2156 int i;
2157 int blocklen = 1 << wl->wl_log_dev_bshift;
2158 off_t off = *offp;
2159 int error;
2160
2161 struct wapbl_ino_head *wih;
2162 struct wapbl_ino *wi;
2163 int iph;
2164
2165 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
2166 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
2167
2168 i = 0;
2169 wih = &wl->wl_inohash[0];
	wi = NULL;
2171 do {
2172 wc->wc_type = WAPBL_WC_INODES;
2173 wc->wc_len = blocklen;
2174 wc->wc_inocnt = 0;
2175 wc->wc_clear = (i == 0);
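		/*
		 * The first record in this sequence has wc_clear set,
		 * telling replay to discard any inode list accumulated
		 * from earlier records.
		 */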
2176 while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) {
2177 while (!wi) {
2178 KASSERT((wih - &wl->wl_inohash[0])
2179 <= wl->wl_inohashmask);
2180 wi = LIST_FIRST(wih++);
2181 }
2182 wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino;
2183 wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode;
2184 wc->wc_inocnt++;
2185 i++;
2186 wi = LIST_NEXT(wi, wi_hash);
2187 }
2188 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2189 ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n",
2190 wc->wc_len, (intmax_t)off));
2191 error = wapbl_circ_write(wl, wc, blocklen, &off);
2192 if (error)
2193 return error;
2194 } while (i < wl->wl_inohashcnt);
2195
2196 *offp = off;
2197 return 0;
2198 }
2199
2200 #endif /* _KERNEL */
2201
2202 /****************************************************************/
2203
2204 struct wapbl_blk {
2205 LIST_ENTRY(wapbl_blk) wb_hash;
2206 daddr_t wb_blk;
2207 off_t wb_off; /* Offset of this block in the log */
2208 };
2209 #define WAPBL_BLKPOOL_MIN 83
2210
2211 static void
2212 wapbl_blkhash_init(struct wapbl_replay *wr, u_int size)
2213 {
2214 if (size < WAPBL_BLKPOOL_MIN)
2215 size = WAPBL_BLKPOOL_MIN;
	KASSERT(wr->wr_blkhash == NULL);
2217 #ifdef _KERNEL
2218 wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask);
2219 #else /* ! _KERNEL */
2220 /* Manually implement hashinit */
2221 {
2222 unsigned long i, hashsize;
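		/*
		 * Round the size up to a power of two so that a chain
		 * can be selected by simple masking, e.g. a requested
		 * size of 83 yields hashsize 128 and mask 0x7f.
		 */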
2223 for (hashsize = 1; hashsize < size; hashsize <<= 1)
2224 continue;
2225 wr->wr_blkhash = wapbl_alloc(hashsize * sizeof(*wr->wr_blkhash));
2226 for (i = 0; i < hashsize; i++)
2227 LIST_INIT(&wr->wr_blkhash[i]);
2228 wr->wr_blkhashmask = hashsize - 1;
2229 }
2230 #endif /* ! _KERNEL */
2231 }
2232
2233 static void
2234 wapbl_blkhash_free(struct wapbl_replay *wr)
2235 {
2236 KASSERT(wr->wr_blkhashcnt == 0);
2237 #ifdef _KERNEL
2238 hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask);
2239 #else /* ! _KERNEL */
2240 wapbl_free(wr->wr_blkhash,
2241 (wr->wr_blkhashmask + 1) * sizeof(*wr->wr_blkhash));
2242 #endif /* ! _KERNEL */
2243 }
2244
2245 static struct wapbl_blk *
2246 wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk)
2247 {
2248 struct wapbl_blk_head *wbh;
2249 struct wapbl_blk *wb;
2250 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2251 LIST_FOREACH(wb, wbh, wb_hash) {
2252 if (blk == wb->wb_blk)
2253 return wb;
2254 }
	return NULL;
2256 }
2257
2258 static void
2259 wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off)
2260 {
2261 struct wapbl_blk_head *wbh;
2262 struct wapbl_blk *wb;
2263 wb = wapbl_blkhash_get(wr, blk);
2264 if (wb) {
2265 KASSERT(wb->wb_blk == blk);
2266 wb->wb_off = off;
2267 } else {
2268 wb = wapbl_alloc(sizeof(*wb));
2269 wb->wb_blk = blk;
2270 wb->wb_off = off;
2271 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2272 LIST_INSERT_HEAD(wbh, wb, wb_hash);
2273 wr->wr_blkhashcnt++;
2274 }
2275 }
2276
2277 static void
2278 wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk)
2279 {
2280 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2281 if (wb) {
2282 KASSERT(wr->wr_blkhashcnt > 0);
2283 wr->wr_blkhashcnt--;
2284 LIST_REMOVE(wb, wb_hash);
2285 wapbl_free(wb, sizeof(*wb));
2286 }
2287 }
2288
2289 static void
2290 wapbl_blkhash_clear(struct wapbl_replay *wr)
2291 {
2292 unsigned long i;
2293 for (i = 0; i <= wr->wr_blkhashmask; i++) {
2294 struct wapbl_blk *wb;
2295
2296 while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) {
2297 KASSERT(wr->wr_blkhashcnt > 0);
2298 wr->wr_blkhashcnt--;
2299 LIST_REMOVE(wb, wb_hash);
2300 wapbl_free(wb, sizeof(*wb));
2301 }
2302 }
2303 KASSERT(wr->wr_blkhashcnt == 0);
2304 }
2305
2306 /****************************************************************/
2307
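/*
 * Read len bytes at *offp from the circular log area, wrapping back
 * to wr_circ_off when the read would run past the end of the log,
 * and return the advanced offset through offp.
 */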
2308 static int
2309 wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp)
2310 {
2311 size_t slen;
2312 off_t off = *offp;
2313 int error;
2314 daddr_t pbn;
2315
2316 KASSERT(((len >> wr->wr_log_dev_bshift) <<
2317 wr->wr_log_dev_bshift) == len);
2318
2319 if (off < wr->wr_circ_off)
2320 off = wr->wr_circ_off;
2321 slen = wr->wr_circ_off + wr->wr_circ_size - off;
2322 if (slen < len) {
2323 pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
2324 #ifdef _KERNEL
2325 pbn = btodb(pbn << wr->wr_log_dev_bshift);
2326 #endif
2327 error = wapbl_read(data, slen, wr->wr_devvp, pbn);
2328 if (error)
2329 return error;
2330 data = (uint8_t *)data + slen;
2331 len -= slen;
2332 off = wr->wr_circ_off;
2333 }
2334 pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
2335 #ifdef _KERNEL
2336 pbn = btodb(pbn << wr->wr_log_dev_bshift);
2337 #endif
2338 error = wapbl_read(data, len, wr->wr_devvp, pbn);
2339 if (error)
2340 return error;
2341 off += len;
2342 if (off >= wr->wr_circ_off + wr->wr_circ_size)
2343 off = wr->wr_circ_off;
2344 *offp = off;
2345 return 0;
2346 }
2347
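/*
 * Advance *offp by len bytes within the circular log, wrapping in
 * the same way as wapbl_circ_read but without doing any I/O.
 */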
2348 static void
2349 wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp)
2350 {
2351 size_t slen;
2352 off_t off = *offp;
2353
2354 KASSERT(((len >> wr->wr_log_dev_bshift) <<
2355 wr->wr_log_dev_bshift) == len);
2356
2357 if (off < wr->wr_circ_off)
2358 off = wr->wr_circ_off;
2359 slen = wr->wr_circ_off + wr->wr_circ_size - off;
2360 if (slen < len) {
2361 len -= slen;
2362 off = wr->wr_circ_off;
2363 }
2364 off += len;
2365 if (off >= wr->wr_circ_off + wr->wr_circ_size)
2366 off = wr->wr_circ_off;
2367 *offp = off;
2368 }
2369
2370 /****************************************************************/
2371
2372 int
2373 wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp,
2374 daddr_t off, size_t count, size_t blksize)
2375 {
2376 struct wapbl_replay *wr;
2377 int error;
2378 struct vnode *devvp;
2379 daddr_t logpbn;
2380 uint8_t *scratch;
2381 struct wapbl_wc_header *wch;
2382 struct wapbl_wc_header *wch2;
2383 /* Use this until we read the actual log header */
2384 int log_dev_bshift = ilog2(blksize);
2385 size_t used;
2386 daddr_t pbn;
2387
2388 WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
2389 ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n",
2390 vp, off, count, blksize));
2391
2392 if (off < 0)
2393 return EINVAL;
2394
2395 if (blksize < DEV_BSIZE)
2396 return EINVAL;
2397 if (blksize % DEV_BSIZE)
2398 return EINVAL;
2399
2400 #ifdef _KERNEL
2401 #if 0
	/*
	 * XXX vp->v_size isn't reliably set for VBLK devices,
	 * especially root.  However, we might still want to verify
	 * that the full load is readable.
	 */
2405 if ((off + count) * blksize > vp->v_size)
2406 return EINVAL;
2407 #endif
2408 if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) {
2409 return error;
2410 }
2411 #else /* ! _KERNEL */
2412 devvp = vp;
2413 logpbn = off;
2414 #endif /* ! _KERNEL */
2415
2416 scratch = wapbl_alloc(MAXBSIZE);
2417
2418 pbn = logpbn;
2419 #ifdef _KERNEL
2420 pbn = btodb(pbn << log_dev_bshift);
2421 #endif
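	/*
	 * Read both copies of the commit header in one go; whichever
	 * valid copy carries the larger generation number is used
	 * below.
	 */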
2422 error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, pbn);
2423 if (error)
2424 goto errout;
2425
2426 wch = (struct wapbl_wc_header *)scratch;
2427 wch2 =
2428 (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift));
2429 /* XXX verify checksums and magic numbers */
2430 if (wch->wc_type != WAPBL_WC_HEADER) {
2431 printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type);
2432 error = EFTYPE;
2433 goto errout;
2434 }
2435
2436 if (wch2->wc_generation > wch->wc_generation)
2437 wch = wch2;
2438
2439 wr = wapbl_calloc(1, sizeof(*wr));
2440
2441 wr->wr_logvp = vp;
2442 wr->wr_devvp = devvp;
2443 wr->wr_logpbn = logpbn;
2444
2445 wr->wr_scratch = scratch;
2446
2447 wr->wr_log_dev_bshift = wch->wc_log_dev_bshift;
2448 wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift;
2449 wr->wr_circ_off = wch->wc_circ_off;
2450 wr->wr_circ_size = wch->wc_circ_size;
2451 wr->wr_generation = wch->wc_generation;
2452
2453 used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail);
2454
2455 WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
2456 ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64
2457 " len=%"PRId64" used=%zu\n",
2458 wch->wc_head, wch->wc_tail, wch->wc_circ_off,
2459 wch->wc_circ_size, used));
2460
2461 wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift));
2462
2463 error = wapbl_replay_process(wr, wch->wc_head, wch->wc_tail);
2464 if (error) {
2465 wapbl_replay_stop(wr);
2466 wapbl_replay_free(wr);
2467 return error;
2468 }
2469
2470 *wrp = wr;
2471 return 0;
2472
2473 errout:
2474 wapbl_free(scratch, MAXBSIZE);
2475 return error;
2476 }
2477
2478 void
2479 wapbl_replay_stop(struct wapbl_replay *wr)
2480 {
2481
2482 if (!wapbl_replay_isopen(wr))
2483 return;
2484
2485 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n"));
2486
2487 wapbl_free(wr->wr_scratch, MAXBSIZE);
2488 wr->wr_scratch = NULL;
2489
2490 wr->wr_logvp = NULL;
2491
2492 wapbl_blkhash_clear(wr);
2493 wapbl_blkhash_free(wr);
2494 }
2495
2496 void
2497 wapbl_replay_free(struct wapbl_replay *wr)
2498 {
2499
2500 KDASSERT(!wapbl_replay_isopen(wr));
2501
2502 if (wr->wr_inodes)
2503 wapbl_free(wr->wr_inodes,
2504 wr->wr_inodescnt * sizeof(wr->wr_inodes[0]));
2505 wapbl_free(wr, sizeof(*wr));
2506 }
2507
2508 #ifdef _KERNEL
2509 int
2510 wapbl_replay_isopen1(struct wapbl_replay *wr)
2511 {
2512
2513 return wapbl_replay_isopen(wr);
2514 }
2515 #endif
2516
2517 static void
2518 wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp)
2519 {
2520 struct wapbl_wc_blocklist *wc =
2521 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2522 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2523 int i, j, n;
2524
2525 for (i = 0; i < wc->wc_blkcount; i++) {
2526 /*
2527 * Enter each physical block into the hashtable independently.
2528 */
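		/*
		 * wapbl_blkhash_ins() overwrites the stored log offset
		 * when a block reappears, so replay always ends up
		 * using the newest logged copy of each block.
		 */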
2529 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
2530 for (j = 0; j < n; j++) {
			wapbl_blkhash_ins(wr,
			    wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen),
			    *offp);
2533 wapbl_circ_advance(wr, fsblklen, offp);
2534 }
2535 }
2536 }
2537
2538 static void
2539 wapbl_replay_process_revocations(struct wapbl_replay *wr)
2540 {
2541 struct wapbl_wc_blocklist *wc =
2542 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2543 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2544 int i, j, n;
2545
2546 for (i = 0; i < wc->wc_blkcount; i++) {
2547 /*
2548 * Remove any blocks found from the hashtable.
2549 */
2550 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
2551 for (j = 0; j < n; j++)
			wapbl_blkhash_rem(wr,
			    wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
2553 }
2554 }
2555
2556 static void
2557 wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff, off_t newoff)
2558 {
2559 struct wapbl_wc_inodelist *wc =
2560 (struct wapbl_wc_inodelist *)wr->wr_scratch;
2561 void *new_inodes;
2562 const size_t oldsize = wr->wr_inodescnt * sizeof(wr->wr_inodes[0]);
2563
2564 KASSERT(sizeof(wr->wr_inodes[0]) == sizeof(wc->wc_inodes[0]));
2565
	/*
	 * Keep track of where we found this so that the location won't
	 * be overwritten.
	 */
2570 if (wc->wc_clear) {
2571 wr->wr_inodestail = oldoff;
2572 wr->wr_inodescnt = 0;
2573 if (wr->wr_inodes != NULL) {
2574 wapbl_free(wr->wr_inodes, oldsize);
2575 wr->wr_inodes = NULL;
2576 }
2577 }
2578 wr->wr_inodeshead = newoff;
2579 if (wc->wc_inocnt == 0)
2580 return;
2581
2582 new_inodes = wapbl_alloc((wr->wr_inodescnt + wc->wc_inocnt) *
2583 sizeof(wr->wr_inodes[0]));
2584 if (wr->wr_inodes != NULL) {
2585 memcpy(new_inodes, wr->wr_inodes, oldsize);
2586 wapbl_free(wr->wr_inodes, oldsize);
2587 }
2588 wr->wr_inodes = new_inodes;
2589 memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes,
2590 wc->wc_inocnt * sizeof(wr->wr_inodes[0]));
2591 wr->wr_inodescnt += wc->wc_inocnt;
2592 }
2593
2594 static int
2595 wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail)
2596 {
2597 off_t off;
2598 int error;
2599
2600 int logblklen = 1 << wr->wr_log_dev_bshift;
2601
2602 wapbl_blkhash_clear(wr);
2603
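	/*
	 * Scan the log from tail to head.  After each record has been
	 * processed, advancing the saved offset by the record's wc_len
	 * must land exactly where processing left off; any disagreement
	 * means the log is corrupt.
	 */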
2604 off = tail;
2605 while (off != head) {
2606 struct wapbl_wc_null *wcn;
2607 off_t saveoff = off;
2608 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
2609 if (error)
2610 goto errout;
2611 wcn = (struct wapbl_wc_null *)wr->wr_scratch;
2612 switch (wcn->wc_type) {
2613 case WAPBL_WC_BLOCKS:
2614 wapbl_replay_process_blocks(wr, &off);
2615 break;
2616
2617 case WAPBL_WC_REVOCATIONS:
2618 wapbl_replay_process_revocations(wr);
2619 break;
2620
2621 case WAPBL_WC_INODES:
2622 wapbl_replay_process_inodes(wr, saveoff, off);
2623 break;
2624
2625 default:
2626 printf("Unrecognized wapbl type: 0x%08x\n",
2627 wcn->wc_type);
2628 error = EFTYPE;
2629 goto errout;
2630 }
2631 wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
2632 if (off != saveoff) {
2633 printf("wapbl_replay: corrupted records\n");
2634 error = EFTYPE;
2635 goto errout;
2636 }
2637 }
2638 return 0;
2639
2640 errout:
2641 wapbl_blkhash_clear(wr);
2642 return error;
2643 }
2644
2645 #if 0
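/*
 * XXX This disabled code still refers to `wch', the on-disk commit
 * header, which is not in scope here; the equivalent values now live
 * in the wr_* fields (see wapbl_replay_start).  It would need updating
 * before it could be re-enabled.
 */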
2646 int
2647 wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp)
2648 {
2649 off_t off;
2650 int mismatchcnt = 0;
2651 int logblklen = 1 << wr->wr_log_dev_bshift;
2652 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2653 void *scratch1 = wapbl_alloc(MAXBSIZE);
2654 void *scratch2 = wapbl_alloc(MAXBSIZE);
2655 int error = 0;
2656
2657 KDASSERT(wapbl_replay_isopen(wr));
2658
2659 off = wch->wc_tail;
2660 while (off != wch->wc_head) {
2661 struct wapbl_wc_null *wcn;
2662 #ifdef DEBUG
2663 off_t saveoff = off;
2664 #endif
2665 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
2666 if (error)
2667 goto out;
2668 wcn = (struct wapbl_wc_null *)wr->wr_scratch;
2669 switch (wcn->wc_type) {
2670 case WAPBL_WC_BLOCKS:
2671 {
2672 struct wapbl_wc_blocklist *wc =
2673 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2674 int i;
2675 for (i = 0; i < wc->wc_blkcount; i++) {
2676 int foundcnt = 0;
2677 int dirtycnt = 0;
2678 int j, n;
				/*
				 * Check each physical block against the
				 * hashtable independently.
				 */
2683 n = wc->wc_blocks[i].wc_dlen >>
2684 wch->wc_fs_dev_bshift;
2685 for (j = 0; j < n; j++) {
2686 struct wapbl_blk *wb =
2687 wapbl_blkhash_get(wr,
2688 wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
2689 if (wb && (wb->wb_off == off)) {
2690 foundcnt++;
2691 error =
2692 wapbl_circ_read(wr,
2693 scratch1, fsblklen,
2694 &off);
2695 if (error)
2696 goto out;
2697 error =
2698 wapbl_read(scratch2,
2699 fsblklen, fsdevvp,
2700 wb->wb_blk);
2701 if (error)
2702 goto out;
2703 if (memcmp(scratch1,
2704 scratch2,
2705 fsblklen)) {
2706 printf(
2707 "wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n",
2708 wb->wb_blk, (intmax_t)off);
2709 dirtycnt++;
2710 mismatchcnt++;
2711 }
2712 } else {
2713 wapbl_circ_advance(wr,
2714 fsblklen, &off);
2715 }
2716 }
2717 #if 0
2718 /*
2719 * If all of the blocks in an entry
2720 * are clean, then remove all of its
2721 * blocks from the hashtable since they
2722 * never will need replay.
2723 */
2724 if ((foundcnt != 0) &&
2725 (dirtycnt == 0)) {
2726 off = saveoff;
2727 wapbl_circ_advance(wr,
2728 logblklen, &off);
2729 for (j = 0; j < n; j++) {
2730 struct wapbl_blk *wb =
2731 wapbl_blkhash_get(wr,
2732 wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
2733 if (wb &&
2734 (wb->wb_off == off)) {
2735 wapbl_blkhash_rem(wr, wb->wb_blk);
2736 }
2737 wapbl_circ_advance(wr,
2738 fsblklen, &off);
2739 }
2740 }
2741 #endif
2742 }
2743 }
2744 break;
2745 case WAPBL_WC_REVOCATIONS:
2746 case WAPBL_WC_INODES:
2747 break;
2748 default:
2749 KASSERT(0);
2750 }
2751 #ifdef DEBUG
2752 wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
2753 KASSERT(off == saveoff);
2754 #endif
2755 }
2756 out:
2757 wapbl_free(scratch1, MAXBSIZE);
2758 wapbl_free(scratch2, MAXBSIZE);
2759 if (!error && mismatchcnt)
2760 error = EFTYPE;
2761 return error;
2762 }
2763 #endif
2764
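/*
 * Write the most recently logged copy of every block in the replay
 * hash table out to its home location on the file system device.
 */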
2765 int
2766 wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp)
2767 {
2768 struct wapbl_blk *wb;
2769 size_t i;
2770 off_t off;
2771 void *scratch;
2772 int error = 0;
2773 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2774
2775 KDASSERT(wapbl_replay_isopen(wr));
2776
2777 scratch = wapbl_alloc(MAXBSIZE);
2778
2779 for (i = 0; i <= wr->wr_blkhashmask; ++i) {
2780 LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) {
2781 off = wb->wb_off;
2782 error = wapbl_circ_read(wr, scratch, fsblklen, &off);
2783 if (error)
2784 break;
2785 error = wapbl_write(scratch, fsblklen, fsdevvp,
2786 wb->wb_blk);
2787 if (error)
2788 break;
2789 }
2790 }
2791
2792 wapbl_free(scratch, MAXBSIZE);
2793 return error;
2794 }
2795
2796 int
2797 wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len)
2798 {
2799 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2800
2801 KDASSERT(wapbl_replay_isopen(wr));
2802 KASSERT((len % fsblklen) == 0);
2803
	while (len != 0) {
		struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
		if (wb)
			return 1;
		len -= fsblklen;
		blk++;
	}
2810 return 0;
2811 }
2812
2813 int
2814 wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len)
2815 {
2816 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2817
2818 KDASSERT(wapbl_replay_isopen(wr));
2819
2820 KASSERT((len % fsblklen) == 0);
2821
2822 while (len != 0) {
2823 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2824 if (wb) {
2825 off_t off = wb->wb_off;
2826 int error;
2827 error = wapbl_circ_read(wr, data, fsblklen, &off);
2828 if (error)
2829 return error;
2830 }
2831 data = (uint8_t *)data + fsblklen;
2832 len -= fsblklen;
2833 blk++;
2834 }
2835 return 0;
2836 }
2837
2838 #ifdef _KERNEL
/*
 * This is not really a module now, but maybe on its way to
 * being one some day.
 */
2843 MODULE(MODULE_CLASS_VFS, wapbl, NULL);
2844
2845 static int
2846 wapbl_modcmd(modcmd_t cmd, void *arg)
2847 {
2848
2849 switch (cmd) {
2850 case MODULE_CMD_INIT:
2851 wapbl_init();
2852 return 0;
2853 case MODULE_CMD_FINI:
2854 #ifdef notyet
2855 return wapbl_fini(true);
2856 #endif
2857 return EOPNOTSUPP;
2858 default:
2859 return ENOTTY;
2860 }
2861 }
2862 #endif /* _KERNEL */
2863