vfs_wapbl.c revision 1.52.2.1 1 /* $NetBSD: vfs_wapbl.c,v 1.52.2.1 2012/09/12 06:15:34 tls Exp $ */
2
3 /*-
4 * Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Wasabi Systems, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * This implements file system independent write ahead filesystem logging.
34 */
35
36 #define WAPBL_INTERNAL
37
38 #include <sys/cdefs.h>
39 __KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.52.2.1 2012/09/12 06:15:34 tls Exp $");
40
41 #include <sys/param.h>
42 #include <sys/bitops.h>
43
44 #ifdef _KERNEL
45 #include <sys/param.h>
46 #include <sys/namei.h>
47 #include <sys/proc.h>
48 #include <sys/sysctl.h>
49 #include <sys/uio.h>
50 #include <sys/vnode.h>
51 #include <sys/file.h>
52 #include <sys/module.h>
53 #include <sys/resourcevar.h>
54 #include <sys/conf.h>
55 #include <sys/mount.h>
56 #include <sys/kernel.h>
57 #include <sys/kauth.h>
58 #include <sys/mutex.h>
59 #include <sys/atomic.h>
60 #include <sys/wapbl.h>
61 #include <sys/wapbl_replay.h>
62
63 #include <miscfs/specfs/specdev.h>
64
65 #define wapbl_alloc(s) kmem_alloc((s), KM_SLEEP)
66 #define wapbl_free(a, s) kmem_free((a), (s))
67 #define wapbl_calloc(n, s) kmem_zalloc((n)*(s), KM_SLEEP)
68
69 static struct sysctllog *wapbl_sysctl;
70 static int wapbl_flush_disk_cache = 1;
71 static int wapbl_verbose_commit = 0;
72
73 #else /* !_KERNEL */
74 #include <assert.h>
75 #include <errno.h>
76 #include <stdio.h>
77 #include <stdbool.h>
78 #include <stdlib.h>
79 #include <string.h>
80
81 #include <sys/time.h>
82 #include <sys/wapbl.h>
83 #include <sys/wapbl_replay.h>
84
85 #define KDASSERT(x) assert(x)
86 #define KASSERT(x) assert(x)
87 #define wapbl_alloc(s) malloc(s)
88 #define wapbl_free(a, s) free(a)
89 #define wapbl_calloc(n, s) calloc((n), (s))
90
91 #endif /* !_KERNEL */
92
93 /*
94 * INTERNAL DATA STRUCTURES
95 */
96
97 /*
98 * This structure holds per-mount log information.
99 *
100 * Legend: a = atomic access only
101 * r = read-only after init
102 * l = rwlock held
103 * m = mutex held
104 * lm = rwlock held writing or mutex held
105 * u = unlocked access ok
106 * b = bufcache_lock held
107 */
108 struct wapbl {
109 struct vnode *wl_logvp; /* r: log here */
110 struct vnode *wl_devvp; /* r: log on this device */
111 struct mount *wl_mount; /* r: mountpoint wl is associated with */
112 daddr_t wl_logpbn; /* r: Physical block number of start of log */
113 int wl_log_dev_bshift; /* r: logarithm of device block size of log
114 device */
115 int wl_fs_dev_bshift; /* r: logarithm of device block size of
116 filesystem device */
117
118 unsigned wl_lock_count; /* m: Count of transactions in progress */
119
120 size_t wl_circ_size; /* r: Number of bytes in buffer of log */
121 size_t wl_circ_off; /* r: Number of bytes reserved at start */
122
123 size_t wl_bufcount_max; /* r: Number of buffers reserved for log */
124 size_t wl_bufbytes_max; /* r: Number of buf bytes reserved for log */
125
126 off_t wl_head; /* l: Byte offset of log head */
127 off_t wl_tail; /* l: Byte offset of log tail */
128 /*
129 * head == tail == 0 means log is empty
130 * head == tail != 0 means log is full
131 * see assertions in wapbl_advance() for other boundary conditions.
132 * only truncate moves the tail, except when flush sets it to
133 * wl_header_size only flush moves the head, except when truncate
134 * sets it to 0.
135 */
136
137 struct wapbl_wc_header *wl_wc_header; /* l */
138 void *wl_wc_scratch; /* l: scratch space (XXX: por que?!?) */
139
140 kmutex_t wl_mtx; /* u: short-term lock */
141 krwlock_t wl_rwlock; /* u: File system transaction lock */
142
143 /*
144 * Must be held while accessing
145 * wl_count or wl_bufs or head or tail
146 */
147
148 /*
149 * Callback called from within the flush routine to flush any extra
150 * bits. Note that flush may be skipped without calling this if
151 * there are no outstanding buffers in the transaction.
152 */
153 #if _KERNEL
154 wapbl_flush_fn_t wl_flush; /* r */
155 wapbl_flush_fn_t wl_flush_abort;/* r */
156 #endif
157
158 size_t wl_bufbytes; /* m: Byte count of pages in wl_bufs */
159 size_t wl_bufcount; /* m: Count of buffers in wl_bufs */
160 size_t wl_bcount; /* m: Total bcount of wl_bufs */
161
162 LIST_HEAD(, buf) wl_bufs; /* m: Buffers in current transaction */
163
164 kcondvar_t wl_reclaimable_cv; /* m (obviously) */
165 size_t wl_reclaimable_bytes; /* m: Amount of space available for
166 reclamation by truncate */
167 int wl_error_count; /* m: # of wl_entries with errors */
168 size_t wl_reserved_bytes; /* never truncate log smaller than this */
169
170 #ifdef WAPBL_DEBUG_BUFBYTES
171 size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */
172 #endif
173
174 daddr_t *wl_deallocblks;/* lm: address of block */
175 int *wl_dealloclens; /* lm: size of block */
176 int wl_dealloccnt; /* lm: total count */
177 int wl_dealloclim; /* l: max count */
178
179 /* hashtable of inode numbers for allocated but unlinked inodes */
180 /* synch ??? */
181 LIST_HEAD(wapbl_ino_head, wapbl_ino) *wl_inohash;
182 u_long wl_inohashmask;
183 int wl_inohashcnt;
184
185 SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction
186 accounting */
187 };
188
189 #ifdef WAPBL_DEBUG_PRINT
190 int wapbl_debug_print = WAPBL_DEBUG_PRINT;
191 #endif
192
193 /****************************************************************/
194 #ifdef _KERNEL
195
196 #ifdef WAPBL_DEBUG
197 struct wapbl *wapbl_debug_wl;
198 #endif
199
200 static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail);
201 static int wapbl_write_blocks(struct wapbl *wl, off_t *offp);
202 static int wapbl_write_revocations(struct wapbl *wl, off_t *offp);
203 static int wapbl_write_inodes(struct wapbl *wl, off_t *offp);
204 #endif /* _KERNEL */
205
206 static int wapbl_replay_process(struct wapbl_replay *wr, off_t, off_t);
207
208 static inline size_t wapbl_space_free(size_t avail, off_t head,
209 off_t tail);
210 static inline size_t wapbl_space_used(size_t avail, off_t head,
211 off_t tail);
212
213 #ifdef _KERNEL
214
215 static struct pool wapbl_entry_pool;
216
217 #define WAPBL_INODETRK_SIZE 83
218 static int wapbl_ino_pool_refcount;
219 static struct pool wapbl_ino_pool;
/*
 * One entry in the hash of allocated-but-unlinked inodes (wl_inohash);
 * the inode number and mode are all that replay needs to clean these up.
 */
struct wapbl_ino {
	LIST_ENTRY(wapbl_ino) wi_hash;	/* hash-chain linkage */
	ino_t wi_ino;			/* inode number */
	mode_t wi_mode;			/* mode at registration time */
};
225
226 static void wapbl_inodetrk_init(struct wapbl *wl, u_int size);
227 static void wapbl_inodetrk_free(struct wapbl *wl);
228 static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino);
229
230 static size_t wapbl_transaction_len(struct wapbl *wl);
231 static inline size_t wapbl_transaction_inodes_len(struct wapbl *wl);
232
233 #if 0
234 int wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
235 #endif
236
237 static int wapbl_replay_isopen1(struct wapbl_replay *);
238
/*
 * This is useful for debugging.  If set, the log will
 * only be truncated when necessary.
 */
int wapbl_lazy_truncate = 0;

/*
 * Operations vector handed to the rest of the kernel (see sys/wapbl.h);
 * this is how file systems and vfs_bio reach wapbl without a hard
 * dependency on this file being compiled in.
 */
struct wapbl_ops wapbl_ops = {
	.wo_wapbl_discard	= wapbl_discard,
	.wo_wapbl_replay_isopen	= wapbl_replay_isopen1,
	.wo_wapbl_replay_can_read = wapbl_replay_can_read,
	.wo_wapbl_replay_read	= wapbl_replay_read,
	.wo_wapbl_add_buf	= wapbl_add_buf,
	.wo_wapbl_remove_buf	= wapbl_remove_buf,
	.wo_wapbl_resize_buf	= wapbl_resize_buf,
	.wo_wapbl_begin		= wapbl_begin,
	.wo_wapbl_end		= wapbl_end,
	.wo_wapbl_junlock_assert= wapbl_junlock_assert,

	/* XXX: the following is only used to say "this is a wapbl buf" */
	.wo_wapbl_biodone	= wapbl_biodone,
};
260
/*
 * Create the vfs.wapbl sysctl subtree (flush_disk_cache, verbose_commit).
 * Returns 0 on success or the sysctl_createv() error; on partial failure
 * any nodes already created remain registered under wapbl_sysctl.
 */
static int
wapbl_sysctl_init(void)
{
	int rv;
	const struct sysctlnode *rnode, *cnode;

	wapbl_sysctl = NULL;

	/* Attach to (or look up) the existing top-level "vfs" node. */
	rv = sysctl_createv(&wapbl_sysctl, 0, NULL, &rnode,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "vfs", NULL,
		       NULL, 0, NULL, 0,
		       CTL_VFS, CTL_EOL);
	if (rv)
		return rv;

	/* vfs.wapbl container node */
	rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &rnode,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "wapbl",
		       SYSCTL_DESCR("WAPBL journaling options"),
		       NULL, 0, NULL, 0,
		       CTL_CREATE, CTL_EOL);
	if (rv)
		return rv;

	/* vfs.wapbl.flush_disk_cache: issue DIOCCACHESYNC on commit */
	rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "flush_disk_cache",
		       SYSCTL_DESCR("flush disk cache"),
		       NULL, 0, &wapbl_flush_disk_cache, 0,
		       CTL_CREATE, CTL_EOL);
	if (rv)
		return rv;

	/* vfs.wapbl.verbose_commit: log commit timing/size to console */
	rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "verbose_commit",
		       SYSCTL_DESCR("show time and size of wapbl log commits"),
		       NULL, 0, &wapbl_verbose_commit, 0,
		       CTL_CREATE, CTL_EOL);
	return rv;
}
303
/*
 * Module-global initialization: create the transaction-entry pool and
 * register the sysctl knobs.  Called once before any log is started.
 */
static void
wapbl_init(void)
{

	pool_init(&wapbl_entry_pool, sizeof(struct wapbl_entry), 0, 0, 0,
	    "wapblentrypl", &pool_allocator_kmem, IPL_VM);

	/* NOTE(review): return value ignored; sysctl nodes are best-effort */
	wapbl_sysctl_init();
}
313
#ifdef notyet
/*
 * Module-global teardown: undo wapbl_init().  Still disabled ("notyet");
 * fixed here to tear down this module's own sysctl log rather than
 * "aio_sysctl", a copy/paste leftover from sys_aio.c that would not
 * compile if this block were ever enabled.
 */
static int
wapbl_fini(bool interface)
{

	if (wapbl_sysctl != NULL)
		sysctl_teardown(&wapbl_sysctl);

	pool_destroy(&wapbl_entry_pool);

	return 0;
}
#endif
327
/*
 * Seed a freshly-opened log with the allocated-but-unlinked inode list
 * recovered by replay (wr), so those inodes are not lost when the new
 * log starts.  Only valid when wl refers to the very same on-disk log
 * that wr replayed (enforced by the assertions below).
 * Returns 0 on success or an error from wapbl_write_inodes().
 */
static int
wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr)
{
	int error, i;

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_start: reusing log with %d inodes\n", wr->wr_inodescnt));

	/*
	 * Its only valid to reuse the replay log if its
	 * the same as the new log we just opened.
	 */
	KDASSERT(!wapbl_replay_isopen(wr));
	KASSERT(wl->wl_devvp->v_type == VBLK);
	KASSERT(wr->wr_devvp->v_type == VBLK);
	KASSERT(wl->wl_devvp->v_rdev == wr->wr_devvp->v_rdev);
	KASSERT(wl->wl_logpbn == wr->wr_logpbn);
	KASSERT(wl->wl_circ_size == wr->wr_circ_size);
	KASSERT(wl->wl_circ_off == wr->wr_circ_off);
	KASSERT(wl->wl_log_dev_bshift == wr->wr_log_dev_bshift);
	KASSERT(wl->wl_fs_dev_bshift == wr->wr_fs_dev_bshift);

	/* Bump the generation so our commits supersede the replayed ones. */
	wl->wl_wc_header->wc_generation = wr->wr_generation + 1;

	for (i = 0; i < wr->wr_inodescnt; i++)
		wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber,
		    wr->wr_inodes[i].wr_imode);

	/* Make sure new transaction won't overwrite old inodes list */
	KDASSERT(wapbl_transaction_len(wl) <=
	    wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead,
	    wr->wr_inodestail));

	wl->wl_head = wl->wl_tail = wr->wr_inodeshead;
	wl->wl_reclaimable_bytes = wl->wl_reserved_bytes =
	    wapbl_transaction_len(wl);

	/* Re-write the preserved inode list at the head of the new log. */
	error = wapbl_write_inodes(wl, &wl->wl_head);
	if (error)
		return error;

	/* Log now contains the inode records, so it cannot look empty. */
	KASSERT(wl->wl_head != wl->wl_tail);
	KASSERT(wl->wl_head != 0);

	return 0;
}
374
/*
 * Start journaling on the log region of vp: validate geometry, allocate
 * and initialize the struct wapbl, carry over any replayed unlinked-inode
 * list from wr, and write an initial (empty) commit record.
 *
 * vp/off/count/blksize describe the log: "count" blocks of "blksize"
 * bytes starting at logical block "off" of vp.  flushfn/flushabortfn are
 * the file system's dealloc flush/abort callbacks.  On success *wlp is
 * set and 0 returned; on failure an errno is returned and nothing is
 * allocated.  The first two log-device blocks are reserved for the two
 * alternating commit headers.
 */
int
wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp,
	daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr,
	wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn)
{
	struct wapbl *wl;
	struct vnode *devvp;
	daddr_t logpbn;
	int error;
	int log_dev_bshift = ilog2(blksize);
	int fs_dev_bshift = log_dev_bshift;
	int run;

	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64
	    " count=%zu blksize=%zu\n", vp, off, count, blksize));

	/* Currently always false since fs_dev_bshift == log_dev_bshift. */
	if (log_dev_bshift > fs_dev_bshift) {
		WAPBL_PRINTF(WAPBL_PRINT_OPEN,
			("wapbl: log device's block size cannot be larger "
			 "than filesystem's\n"));
		/*
		 * Not currently implemented, although it could be if
		 * needed someday.
		 */
		return ENOSYS;
	}

	if (off < 0)
		return EINVAL;

	if (blksize < DEV_BSIZE)
		return EINVAL;
	if (blksize % DEV_BSIZE)
		return EINVAL;

	/* XXXTODO: verify that the full load is writable */

	/*
	 * XXX check for minimum log size
	 * minimum is governed by minimum amount of space
	 * to complete a transaction. (probably truncate)
	 */
	/* XXX for now pick something minimal */
	if ((count * blksize) < mp->mnt_maxphys) {
		return ENOSPC;
	}

	/* Translate the log's start to a device vnode + physical block. */
	if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) {
		return error;
	}

	wl = wapbl_calloc(1, sizeof(*wl));
	rw_init(&wl->wl_rwlock);
	mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&wl->wl_reclaimable_cv, "wapblrec");
	LIST_INIT(&wl->wl_bufs);
	SIMPLEQ_INIT(&wl->wl_entries);

	wl->wl_logvp = vp;
	wl->wl_devvp = devvp;
	wl->wl_mount = mp;
	wl->wl_logpbn = logpbn;
	wl->wl_log_dev_bshift = log_dev_bshift;
	wl->wl_fs_dev_bshift = fs_dev_bshift;

	wl->wl_flush = flushfn;
	wl->wl_flush_abort = flushabortfn;

	/* Reserve two log device blocks for the commit headers */
	wl->wl_circ_off = 2<<wl->wl_log_dev_bshift;
	wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off);
	/* truncate the log usage to a multiple of log_dev_bshift */
	wl->wl_circ_size >>= wl->wl_log_dev_bshift;
	wl->wl_circ_size <<= wl->wl_log_dev_bshift;

	/*
	 * wl_bufbytes_max limits the size of the in memory transaction space.
	 * - Since buffers are allocated and accounted for in units of
	 *   PAGE_SIZE it is required to be a multiple of PAGE_SIZE
	 *   (i.e. 1<<PAGE_SHIFT)
	 * - Since the log device has to be written in units of
	 *   1<<wl_log_dev_bshift it is required to be a mulitple of
	 *   1<<wl_log_dev_bshift.
	 * - Since filesystem will provide data in units of 1<<wl_fs_dev_bshift,
	 *   it is convenient to be a multiple of 1<<wl_fs_dev_bshift.
	 * Therefore it must be multiple of the least common multiple of those
	 * three quantities.  Fortunately, all of those quantities are
	 * guaranteed to be a power of two, and the least common multiple of
	 * a set of numbers which are all powers of two is simply the maximum
	 * of those numbers.  Finally, the maximum logarithm of a power of
	 * two is the same as the log of the maximum power of two.  So we can
	 * do the following operations to size wl_bufbytes_max:
	 */

	/* XXX fix actual number of pages reserved per filesystem. */
	wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2);

	/* Round wl_bufbytes_max to the largest power of two constraint */
	wl->wl_bufbytes_max >>= PAGE_SHIFT;
	wl->wl_bufbytes_max <<= PAGE_SHIFT;
	wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift;

	/* XXX maybe use filesystem fragment size instead of 1024 */
	/* XXX fix actual number of buffers reserved per filesystem. */
	wl->wl_bufcount_max = (nbuf / 2) * 1024;

	/* XXX tie this into resource estimation */
	wl->wl_dealloclim = wl->wl_bufbytes_max / mp->mnt_stat.f_bsize / 2;

	wl->wl_deallocblks = wapbl_alloc(sizeof(*wl->wl_deallocblks) *
	    wl->wl_dealloclim);
	wl->wl_dealloclens = wapbl_alloc(sizeof(*wl->wl_dealloclens) *
	    wl->wl_dealloclim);

	wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);

	/* Initialize the commit header */
	{
		struct wapbl_wc_header *wc;
		size_t len = 1 << wl->wl_log_dev_bshift;
		wc = wapbl_calloc(1, len);
		wc->wc_type = WAPBL_WC_HEADER;
		wc->wc_len = len;
		wc->wc_circ_off = wl->wl_circ_off;
		wc->wc_circ_size = wl->wl_circ_size;
		/* XXX wc->wc_fsid */
		wc->wc_log_dev_bshift = wl->wl_log_dev_bshift;
		wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift;
		wl->wl_wc_header = wc;
		wl->wl_wc_scratch = wapbl_alloc(len);
	}

	/*
	 * if there was an existing set of unlinked but
	 * allocated inodes, preserve it in the new
	 * log.
	 */
	if (wr && wr->wr_inodescnt) {
		error = wapbl_start_flush_inodes(wl, wr);
		if (error)
			goto errout;
	}

	/* First commit record; makes the (possibly seeded) log valid. */
	error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail);
	if (error) {
		goto errout;
	}

	*wlp = wl;
#if defined(WAPBL_DEBUG)
	wapbl_debug_wl = wl;
#endif

	return 0;
 errout:
	/* Drop anything queued so far, then free everything we allocated. */
	wapbl_discard(wl);
	wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_deallocblks,
	    sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim);
	wapbl_free(wl->wl_dealloclens,
	    sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim);
	wapbl_inodetrk_free(wl);
	wapbl_free(wl, sizeof(*wl));

	return error;
}
545
/*
 * Like wapbl_flush, only discards the transaction
 * completely
 */

/*
 * Throw away the current in-memory transaction: run the fs abort-side
 * flush over the pending deallocations, empty the unlinked-inode hash,
 * release all buffers held in the transaction, and detach any on-disk
 * transaction entries (entries still owning buffers are freed later by
 * wapbl_biodone()).  Takes wl_rwlock as writer for the duration.
 */
void
wapbl_discard(struct wapbl *wl)
{
	struct wapbl_entry *we;
	struct buf *bp;
	int i;

	/*
	 * XXX we may consider using upgrade here
	 * if we want to call flush from inside a transaction
	 */
	rw_enter(&wl->wl_rwlock, RW_WRITER);
	wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
	    wl->wl_dealloccnt);

#ifdef WAPBL_DEBUG_PRINT
	{
		pid_t pid = -1;
		lwpid_t lid = -1;
		if (curproc)
			pid = curproc->p_pid;
		if (curlwp)
			lid = curlwp->l_lid;
#ifdef WAPBL_DEBUG_BUFBYTES
		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
		    ("wapbl_discard: thread %d.%d discarding "
		    "transaction\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %u, reclaimable=%zu reserved=%zu "
		    "unsynced=%zu\n",
		    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
		    wl->wl_bcount, wl->wl_dealloccnt,
		    wl->wl_inohashcnt, wl->wl_error_count,
		    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
		    wl->wl_unsynced_bufbytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			     "error = %d, unsynced = %zu\n",
			     we->we_bufcount, we->we_reclaimable_bytes,
			     we->we_error, we->we_unsynced_bufbytes));
		}
#else /* !WAPBL_DEBUG_BUFBYTES */
		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
		    ("wapbl_discard: thread %d.%d discarding transaction\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %u, reclaimable=%zu reserved=%zu\n",
		    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
		    wl->wl_bcount, wl->wl_dealloccnt,
		    wl->wl_inohashcnt, wl->wl_error_count,
		    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			     "error = %d\n",
			     we->we_bufcount, we->we_reclaimable_bytes,
			     we->we_error));
		}
#endif /* !WAPBL_DEBUG_BUFBYTES */
	}
#endif /* WAPBL_DEBUG_PRINT */

	/* Empty every chain of the unlinked-inode hash table. */
	for (i = 0; i <= wl->wl_inohashmask; i++) {
		struct wapbl_ino_head *wih;
		struct wapbl_ino *wi;

		wih = &wl->wl_inohash[i];
		while ((wi = LIST_FIRST(wih)) != NULL) {
			LIST_REMOVE(wi, wi_hash);
			pool_put(&wapbl_ino_pool, wi);
			KASSERT(wl->wl_inohashcnt > 0);
			wl->wl_inohashcnt--;
		}
	}

	/*
	 * clean buffer list
	 */
	mutex_enter(&bufcache_lock);
	mutex_enter(&wl->wl_mtx);
	while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
		/* bbusy() may drop/retake wl_mtx; retry from list head. */
		if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) {
			/*
			 * The buffer will be unlocked and
			 * removed from the transaction in brelse
			 */
			mutex_exit(&wl->wl_mtx);
			brelsel(bp, 0);
			mutex_enter(&wl->wl_mtx);
		}
	}
	mutex_exit(&wl->wl_mtx);
	mutex_exit(&bufcache_lock);

	/*
	 * Remove references to this wl from wl_entries, free any which
	 * no longer have buffers, others will be freed in wapbl_biodone
	 * when they no longer have any buffers.
	 */
	while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) {
		SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
		/* XXX should we be accumulating wl_error_count
		 * and increasing reclaimable bytes ? */
		we->we_wapbl = NULL;
		if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
			KASSERT(we->we_unsynced_bufbytes == 0);
#endif
			pool_put(&wapbl_entry_pool, we);
		}
	}

	/* Discard list of deallocs */
	wl->wl_dealloccnt = 0;
	/* XXX should we clear wl_reserved_bytes? */

	/* The transaction must now be completely empty. */
	KASSERT(wl->wl_bufbytes == 0);
	KASSERT(wl->wl_bcount == 0);
	KASSERT(wl->wl_bufcount == 0);
	KASSERT(LIST_EMPTY(&wl->wl_bufs));
	KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
	KASSERT(wl->wl_inohashcnt == 0);

	rw_exit(&wl->wl_rwlock);
}
678
679 int
680 wapbl_stop(struct wapbl *wl, int force)
681 {
682 struct vnode *vp;
683 int error;
684
685 WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n"));
686 error = wapbl_flush(wl, 1);
687 if (error) {
688 if (force)
689 wapbl_discard(wl);
690 else
691 return error;
692 }
693
694 /* Unlinked inodes persist after a flush */
695 if (wl->wl_inohashcnt) {
696 if (force) {
697 wapbl_discard(wl);
698 } else {
699 return EBUSY;
700 }
701 }
702
703 KASSERT(wl->wl_bufbytes == 0);
704 KASSERT(wl->wl_bcount == 0);
705 KASSERT(wl->wl_bufcount == 0);
706 KASSERT(LIST_EMPTY(&wl->wl_bufs));
707 KASSERT(wl->wl_dealloccnt == 0);
708 KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
709 KASSERT(wl->wl_inohashcnt == 0);
710
711 vp = wl->wl_logvp;
712
713 wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
714 wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
715 wapbl_free(wl->wl_deallocblks,
716 sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim);
717 wapbl_free(wl->wl_dealloclens,
718 sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim);
719 wapbl_inodetrk_free(wl);
720
721 cv_destroy(&wl->wl_reclaimable_cv);
722 mutex_destroy(&wl->wl_mtx);
723 rw_destroy(&wl->wl_rwlock);
724 wapbl_free(wl, sizeof(*wl));
725
726 return 0;
727 }
728
/*
 * Perform one synchronous raw I/O of len bytes at physical block pbn of
 * devvp, using a private iobuf (bypassing the buffer cache).  flags is
 * exactly B_READ or B_WRITE.  Returns 0 or the error from biowait().
 */
static int
wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
{
	struct pstats *pstats = curlwp->l_proc->p_stats;
	struct buf *bp;
	int error;

	KASSERT((flags & ~(B_WRITE | B_READ)) == 0);
	KASSERT(devvp->v_type == VBLK);

	/* Account the I/O and, for writes, the vnode's output counter. */
	if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
		mutex_enter(devvp->v_interlock);
		devvp->v_numoutput++;
		mutex_exit(devvp->v_interlock);
		pstats->p_ru.ru_oublock++;
	} else {
		pstats->p_ru.ru_inblock++;
	}

	bp = getiobuf(devvp, true);
	bp->b_flags = flags;
	bp->b_cflags = BC_BUSY; /* silly & dubious */
	bp->b_dev = devvp->v_rdev;
	bp->b_data = data;
	bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
	bp->b_blkno = pbn;
	/* Journal I/O stalls the whole file system: highest priority. */
	BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);

	WAPBL_PRINTF(WAPBL_PRINT_IO,
	    ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%"PRIx64"\n",
	    BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount,
	    bp->b_blkno, bp->b_dev));

	VOP_STRATEGY(devvp, bp);

	error = biowait(bp);
	putiobuf(bp);

	if (error) {
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_doio: %s %zu bytes at block %" PRId64
		    " on dev 0x%"PRIx64" failed with error %d\n",
		    (((flags & (B_WRITE | B_READ)) == B_WRITE) ?
		     "write" : "read"),
		    len, pbn, devvp->v_rdev, error));
	}

	return error;
}
778
/*
 * Synchronously write len bytes from data to physical block pbn of
 * devvp.  Thin wrapper over wapbl_doio(); returns 0 or an errno.
 */
int
wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

	return wapbl_doio(data, len, devvp, pbn, B_WRITE);
}
785
/*
 * Synchronously read len bytes into data from physical block pbn of
 * devvp.  Thin wrapper over wapbl_doio(); returns 0 or an errno.
 */
int
wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

	return wapbl_doio(data, len, devvp, pbn, B_READ);
}
792
/*
 * Off is byte offset returns new offset for next write
 * handles log wraparound
 */

/*
 * Write len bytes into the circular log region starting at byte offset
 * *offp, splitting the write in two when it wraps past the end of the
 * region.  On success *offp is advanced to where the next write should
 * start.  len must be a whole number of log-device blocks.
 */
static int
wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp)
{
	size_t slen;
	off_t off = *offp;
	int error;
	daddr_t pbn;

	/* Callers must supply whole log-device blocks. */
	KDASSERT(((len >> wl->wl_log_dev_bshift) <<
	    wl->wl_log_dev_bshift) == len);

	/* Offset 0 ("empty log") starts just past the reserved headers. */
	if (off < wl->wl_circ_off)
		off = wl->wl_circ_off;
	slen = wl->wl_circ_off + wl->wl_circ_size - off;
	if (slen < len) {
		/* Write wraps: emit the piece up to the end first. */
		pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
#ifdef _KERNEL
		/* kernel block numbers are in DEV_BSIZE units */
		pbn = btodb(pbn << wl->wl_log_dev_bshift);
#endif
		error = wapbl_write(data, slen, wl->wl_devvp, pbn);
		if (error)
			return error;
		data = (uint8_t *)data + slen;
		len -= slen;
		off = wl->wl_circ_off;
	}
	pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
#ifdef _KERNEL
	pbn = btodb(pbn << wl->wl_log_dev_bshift);
#endif
	error = wapbl_write(data, len, wl->wl_devvp, pbn);
	if (error)
		return error;
	off += len;
	if (off >= wl->wl_circ_off + wl->wl_circ_size)
		off = wl->wl_circ_off;
	*offp = off;
	return 0;
}
836
837 /****************************************************************/
838
/*
 * Begin a journal transaction: possibly force a flush first if the
 * in-memory transaction is getting too large, then take wl_rwlock as
 * reader (many concurrent transactions allowed; flush excludes them by
 * taking it as writer) and bump wl_lock_count.  Each successful call
 * must be paired with wapbl_end().  file/line identify the caller for
 * debug output.  Returns 0 or an error from the forced flush.
 */
int
wapbl_begin(struct wapbl *wl, const char *file, int line)
{
	int doflush;
	unsigned lockcount;
	uint32_t maxphys;

	KDASSERT(wl);

	/*
	 * XXX this needs to be made much more sophisticated.
	 * perhaps each wapbl_begin could reserve a specified
	 * number of buffers and bytes.
	 */
	mutex_enter(&wl->wl_mtx);
	lockcount = wl->wl_lock_count;
	maxphys = wl->wl_mount->mnt_maxphys;
	/*
	 * Heuristic: flush when projected usage (current plus a per-
	 * transaction worst-case estimate) exceeds half of any limit.
	 */
	doflush = ((wl->wl_bufbytes + (lockcount * maxphys)) >
		   wl->wl_bufbytes_max / 2) ||
		  ((wl->wl_bufcount + (lockcount * 10)) >
		   wl->wl_bufcount_max / 2) ||
		  (wapbl_transaction_len(wl) > wl->wl_circ_size / 2) ||
		  (wl->wl_dealloccnt >= (wl->wl_dealloclim / 2));
	mutex_exit(&wl->wl_mtx);

	if (doflush) {
		WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		    ("force flush lockcnt=%d bufbytes=%zu "
		    "(max=%zu) bufcount=%zu (max=%zu) "
		    "dealloccnt %d (lim=%d)\n",
		    lockcount, wl->wl_bufbytes,
		    wl->wl_bufbytes_max, wl->wl_bufcount,
		    wl->wl_bufcount_max,
		    wl->wl_dealloccnt, wl->wl_dealloclim));
	}

	if (doflush) {
		int error = wapbl_flush(wl, 0);
		if (error)
			return error;
	}

	rw_enter(&wl->wl_rwlock, RW_READER);
	mutex_enter(&wl->wl_mtx);
	wl->wl_lock_count++;
	mutex_exit(&wl->wl_mtx);

#if defined(WAPBL_DEBUG_PRINT)
	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
	    ("wapbl_begin thread %d.%d with bufcount=%zu "
	    "bufbytes=%zu bcount=%zu at %s:%d\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount, file, line));
#endif

	return 0;
}
896
/*
 * End a journal transaction started with wapbl_begin(): drop our count
 * and release the reader hold on wl_rwlock.  Under DIAGNOSTIC, verify
 * that the accumulated transaction can still fit in the on-disk log.
 */
void
wapbl_end(struct wapbl *wl)
{

#if defined(WAPBL_DEBUG_PRINT)
	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
	    ("wapbl_end thread %d.%d with bufcount=%zu "
	     "bufbytes=%zu bcount=%zu\n",
	     curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	     wl->wl_bufbytes, wl->wl_bcount));
#endif

#ifdef DIAGNOSTIC
	size_t flushsize = wapbl_transaction_len(wl);
	if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
		/*
		 * XXX this could be handled more gracefully, perhaps place
		 * only a partial transaction in the log and allow the
		 * remaining to flush without the protection of the journal.
		 */
		panic("wapbl_end: current transaction too big to flush\n");
	}
#endif

	mutex_enter(&wl->wl_mtx);
	KASSERT(wl->wl_lock_count > 0);
	wl->wl_lock_count--;
	mutex_exit(&wl->wl_mtx);

	rw_exit(&wl->wl_rwlock);
}
928
/*
 * Add (or re-add) a busy buffer to the current transaction.  A buffer
 * already in the transaction (B_LOCKED set) is moved to the head of
 * wl_bufs without re-accounting; a new buffer has its bytes/count added
 * to the transaction totals.  Marks the buffer B_LOCKED so the buffer
 * cache will not write it out from under the journal.
 */
void
wapbl_add_buf(struct wapbl *wl, struct buf * bp)
{

	KASSERT(bp->b_cflags & BC_BUSY);
	KASSERT(bp->b_vp);

	wapbl_jlock_assert(wl);

#if 0
	/*
	 * XXX this might be an issue for swapfiles.
	 * see uvm_swap.c:1702
	 *
	 * XXX2 why require it then? leap of semantics?
	 */
	KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
#endif

	mutex_enter(&wl->wl_mtx);
	if (bp->b_flags & B_LOCKED) {
		/* Already ours: just move it to the head of the list. */
		LIST_REMOVE(bp, b_wapbllist);
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER2,
		   ("wapbl_add_buf thread %d.%d re-adding buf %p "
		    "with %d bytes %d bcount\n",
		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
		    bp->b_bcount));
	} else {
		/* unlocked by dirty buffers shouldn't exist */
		KASSERT(!(bp->b_oflags & BO_DELWRI));
		wl->wl_bufbytes += bp->b_bufsize;
		wl->wl_bcount += bp->b_bcount;
		wl->wl_bufcount++;
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
		   ("wapbl_add_buf thread %d.%d adding buf %p "
		    "with %d bytes %d bcount\n",
		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
		    bp->b_bcount));
	}
	LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist);
	mutex_exit(&wl->wl_mtx);

	bp->b_flags |= B_LOCKED;
}
973
/*
 * Remove a buffer from the current transaction and undo its accounting.
 * Caller holds wl_mtx; the buffer must be busy and B_LOCKED (i.e.
 * actually in this transaction).  Clears B_LOCKED on the way out.
 */
static void
wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp)
{

	KASSERT(mutex_owned(&wl->wl_mtx));
	KASSERT(bp->b_cflags & BC_BUSY);
	wapbl_jlock_assert(wl);

#if 0
	/*
	 * XXX this might be an issue for swapfiles.
	 * see uvm_swap.c:1725
	 *
	 * XXXdeux: see above
	 */
	KASSERT((bp->b_flags & BC_NOCACHE) == 0);
#endif
	KASSERT(bp->b_flags & B_LOCKED);

	WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
	   ("wapbl_remove_buf thread %d.%d removing buf %p with "
	    "%d bytes %d bcount\n",
	    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount));

	/* Reverse the accounting done in wapbl_add_buf(). */
	KASSERT(wl->wl_bufbytes >= bp->b_bufsize);
	wl->wl_bufbytes -= bp->b_bufsize;
	KASSERT(wl->wl_bcount >= bp->b_bcount);
	wl->wl_bcount -= bp->b_bcount;
	KASSERT(wl->wl_bufcount > 0);
	wl->wl_bufcount--;
	/* Counters must hit zero together. */
	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
	LIST_REMOVE(bp, b_wapbllist);

	bp->b_flags &= ~B_LOCKED;
}
1010
/* called from brelsel() in vfs_bio among other places */
/*
 * Locked wrapper: take wl_mtx around wapbl_remove_buf_locked().
 */
void
wapbl_remove_buf(struct wapbl * wl, struct buf *bp)
{

	mutex_enter(&wl->wl_mtx);
	wapbl_remove_buf_locked(wl, bp);
	mutex_exit(&wl->wl_mtx);
}
1020
/*
 * Adjust transaction accounting after a buffer already in the
 * transaction changed size: oldsz/oldcnt are the previous b_bufsize and
 * b_bcount.  Buffers not in the transaction (no B_LOCKED) are ignored.
 */
void
wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt)
{

	KASSERT(bp->b_cflags & BC_BUSY);

	/*
	 * XXX: why does this depend on B_LOCKED?  otherwise the buf
	 * is not for a transaction?  if so, why is this called in the
	 * first place?
	 */
	if (bp->b_flags & B_LOCKED) {
		mutex_enter(&wl->wl_mtx);
		wl->wl_bufbytes += bp->b_bufsize - oldsz;
		wl->wl_bcount += bp->b_bcount - oldcnt;
		mutex_exit(&wl->wl_mtx);
	}
}
1039
1040 #endif /* _KERNEL */
1041
1042 /****************************************************************/
1043 /* Some utility inlines */
1044
1045 /* This is used to advance the pointer at old to new value at old+delta */
1046 static inline off_t
1047 wapbl_advance(size_t size, size_t off, off_t old, size_t delta)
1048 {
1049 off_t new;
1050
1051 /* Define acceptable ranges for inputs. */
1052 KASSERT(delta <= (size_t)size);
1053 KASSERT((old == 0) || ((size_t)old >= off));
1054 KASSERT(old < (off_t)(size + off));
1055
1056 if ((old == 0) && (delta != 0))
1057 new = off + delta;
1058 else if ((old + delta) < (size + off))
1059 new = old + delta;
1060 else
1061 new = (old + delta) - size;
1062
1063 /* Note some interesting axioms */
1064 KASSERT((delta != 0) || (new == old));
1065 KASSERT((delta == 0) || (new != 0));
1066 KASSERT((delta != (size)) || (new == old));
1067
1068 /* Define acceptable ranges for output. */
1069 KASSERT((new == 0) || ((size_t)new >= off));
1070 KASSERT((size_t)new < (size + off));
1071 return new;
1072 }
1073
1074 static inline size_t
1075 wapbl_space_used(size_t avail, off_t head, off_t tail)
1076 {
1077
1078 if (tail == 0) {
1079 KASSERT(head == 0);
1080 return 0;
1081 }
1082 return ((head + (avail - 1) - tail) % avail) + 1;
1083 }
1084
/* Bytes still available in the circular log: complement of space used. */
static inline size_t
wapbl_space_free(size_t avail, off_t head, off_t tail)
{

	return avail - wapbl_space_used(avail, head, tail);
}
1091
1092 static inline void
1093 wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp,
1094 off_t *tailp)
1095 {
1096 off_t head = *headp;
1097 off_t tail = *tailp;
1098
1099 KASSERT(delta <= wapbl_space_free(size, head, tail));
1100 head = wapbl_advance(size, off, head, delta);
1101 if ((tail == 0) && (head != 0))
1102 tail = off;
1103 *headp = head;
1104 *tailp = tail;
1105 }
1106
1107 static inline void
1108 wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp,
1109 off_t *tailp)
1110 {
1111 off_t head = *headp;
1112 off_t tail = *tailp;
1113
1114 KASSERT(delta <= wapbl_space_used(size, head, tail));
1115 tail = wapbl_advance(size, off, tail, delta);
1116 if (head == tail) {
1117 head = tail = 0;
1118 }
1119 *headp = head;
1120 *tailp = tail;
1121 }
1122
1123 #ifdef _KERNEL
1124
1125 /****************************************************************/
1126
1127 /*
1128 * Remove transactions whose buffers are completely flushed to disk.
1129 * Will block until at least minfree space is available.
1130 * only intended to be called from inside wapbl_flush and therefore
1131 * does not protect against commit races with itself or with flush.
1132 */
1133 static int
1134 wapbl_truncate(struct wapbl *wl, size_t minfree, int waitonly)
1135 {
1136 size_t delta;
1137 size_t avail;
1138 off_t head;
1139 off_t tail;
1140 int error = 0;
1141
1142 KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes));
1143 KASSERT(rw_write_held(&wl->wl_rwlock));
1144
1145 mutex_enter(&wl->wl_mtx);
1146
1147 /*
1148 * First check to see if we have to do a commit
1149 * at all.
1150 */
1151 avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail);
1152 if (minfree < avail) {
1153 mutex_exit(&wl->wl_mtx);
1154 return 0;
1155 }
1156 minfree -= avail;
1157 while ((wl->wl_error_count == 0) &&
1158 (wl->wl_reclaimable_bytes < minfree)) {
1159 WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
1160 ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd "
1161 "minfree=%zd\n",
1162 &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes,
1163 minfree));
1164
1165 cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx);
1166 }
1167 if (wl->wl_reclaimable_bytes < minfree) {
1168 KASSERT(wl->wl_error_count);
1169 /* XXX maybe get actual error from buffer instead someday? */
1170 error = EIO;
1171 }
1172 head = wl->wl_head;
1173 tail = wl->wl_tail;
1174 delta = wl->wl_reclaimable_bytes;
1175
1176 /* If all of of the entries are flushed, then be sure to keep
1177 * the reserved bytes reserved. Watch out for discarded transactions,
1178 * which could leave more bytes reserved than are reclaimable.
1179 */
1180 if (SIMPLEQ_EMPTY(&wl->wl_entries) &&
1181 (delta >= wl->wl_reserved_bytes)) {
1182 delta -= wl->wl_reserved_bytes;
1183 }
1184 wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head,
1185 &tail);
1186 KDASSERT(wl->wl_reserved_bytes <=
1187 wapbl_space_used(wl->wl_circ_size, head, tail));
1188 mutex_exit(&wl->wl_mtx);
1189
1190 if (error)
1191 return error;
1192
1193 if (waitonly)
1194 return 0;
1195
1196 /*
1197 * This is where head, tail and delta are unprotected
1198 * from races against itself or flush. This is ok since
1199 * we only call this routine from inside flush itself.
1200 *
1201 * XXX: how can it race against itself when accessed only
1202 * from behind the write-locked rwlock?
1203 */
1204 error = wapbl_write_commit(wl, head, tail);
1205 if (error)
1206 return error;
1207
1208 wl->wl_head = head;
1209 wl->wl_tail = tail;
1210
1211 mutex_enter(&wl->wl_mtx);
1212 KASSERT(wl->wl_reclaimable_bytes >= delta);
1213 wl->wl_reclaimable_bytes -= delta;
1214 mutex_exit(&wl->wl_mtx);
1215 WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
1216 ("wapbl_truncate thread %d.%d truncating %zu bytes\n",
1217 curproc->p_pid, curlwp->l_lid, delta));
1218
1219 return 0;
1220 }
1221
1222 /****************************************************************/
1223
/*
 * I/O completion callback for buffers written through the log (installed
 * as bp->b_iodone by wapbl_flush()).  Retires the buffer from its
 * wapbl_entry, and once leading entries are fully synced, reclaims their
 * log space and wakes wapbl_truncate() waiters.
 */
void
wapbl_biodone(struct buf *bp)
{
	struct wapbl_entry *we = bp->b_private;
	struct wapbl *wl = we->we_wapbl;

	/*
	 * Handle possible flushing of buffers after log has been
	 * decomissioned.
	 */
	if (!wl) {
		KASSERT(we->we_bufcount > 0);
		we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
		KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize);
		we->we_unsynced_bufbytes -= bp->b_bufsize;
#endif

		if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
			KASSERT(we->we_unsynced_bufbytes == 0);
#endif
			/* Last buffer of the orphaned entry: free it. */
			pool_put(&wapbl_entry_pool, we);
		}

		brelse(bp, 0);
		return;
	}

#ifdef ohbother
	KDASSERT(bp->b_oflags & BO_DONE);
	KDASSERT(!(bp->b_oflags & BO_DELWRI));
	KDASSERT(bp->b_flags & B_ASYNC);
	KDASSERT(bp->b_cflags & BC_BUSY);
	KDASSERT(!(bp->b_flags & B_LOCKED));
	KDASSERT(!(bp->b_flags & B_READ));
	KDASSERT(!(bp->b_cflags & BC_INVAL));
	KDASSERT(!(bp->b_cflags & BC_NOCACHE));
#endif

	if (bp->b_error) {
#ifdef notyet /* Can't currently handle possible dirty buffer reuse */
		/*
		 * XXXpooka: interfaces not fully updated
		 * Note: this was not enabled in the original patch
		 * against netbsd4 either.  I don't know if comment
		 * above is true or not.
		 */

		/*
		 * If an error occurs, report the error and leave the
		 * buffer as a delayed write on the LRU queue.
		 * restarting the write would likely result in
		 * an error spinloop, so let it be done harmlessly
		 * by the syncer.
		 */
		bp->b_flags &= ~(B_DONE);
		simple_unlock(&bp->b_interlock);

		if (we->we_error == 0) {
			mutex_enter(&wl->wl_mtx);
			wl->wl_error_count++;
			mutex_exit(&wl->wl_mtx);
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
		we->we_error = bp->b_error;
		bp->b_error = 0;
		brelse(bp);
		return;
#else
		/* For now, just mark the log permanently errored out */

		mutex_enter(&wl->wl_mtx);
		if (wl->wl_error_count == 0) {
			wl->wl_error_count++;
			/* Wake waiters in wapbl_truncate() to see the error. */
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
		mutex_exit(&wl->wl_mtx);
#endif
	}

	mutex_enter(&wl->wl_mtx);

	KASSERT(we->we_bufcount > 0);
	we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
	KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize);
	we->we_unsynced_bufbytes -= bp->b_bufsize;
	KASSERT(wl->wl_unsynced_bufbytes >= bp->b_bufsize);
	wl->wl_unsynced_bufbytes -= bp->b_bufsize;
#endif

	/*
	 * If the current transaction can be reclaimed, start
	 * at the beginning and reclaim any consecutive reclaimable
	 * transactions.  If we successfully reclaim anything,
	 * then wakeup anyone waiting for the reclaim.
	 */
	if (we->we_bufcount == 0) {
		size_t delta = 0;
		int errcnt = 0;
#ifdef WAPBL_DEBUG_BUFBYTES
		KDASSERT(we->we_unsynced_bufbytes == 0);
#endif
		/*
		 * clear any posted error, since the buffer it came from
		 * has successfully flushed by now
		 */
		while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) &&
		       (we->we_bufcount == 0)) {
			delta += we->we_reclaimable_bytes;
			if (we->we_error)
				errcnt++;
			SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
			pool_put(&wapbl_entry_pool, we);
		}

		if (delta) {
			wl->wl_reclaimable_bytes += delta;
			KASSERT(wl->wl_error_count >= errcnt);
			wl->wl_error_count -= errcnt;
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
	}

	mutex_exit(&wl->wl_mtx);
	brelse(bp, 0);
}
1352
1353 /*
1354 * Write transactions to disk + start I/O for contents
1355 */
1356 int
1357 wapbl_flush(struct wapbl *wl, int waitfor)
1358 {
1359 struct buf *bp;
1360 struct wapbl_entry *we;
1361 off_t off;
1362 off_t head;
1363 off_t tail;
1364 size_t delta = 0;
1365 size_t flushsize;
1366 size_t reserved;
1367 int error = 0;
1368
1369 /*
1370 * Do a quick check to see if a full flush can be skipped
1371 * This assumes that the flush callback does not need to be called
1372 * unless there are other outstanding bufs.
1373 */
1374 if (!waitfor) {
1375 size_t nbufs;
1376 mutex_enter(&wl->wl_mtx); /* XXX need mutex here to
1377 protect the KASSERTS */
1378 nbufs = wl->wl_bufcount;
1379 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
1380 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
1381 mutex_exit(&wl->wl_mtx);
1382 if (nbufs == 0)
1383 return 0;
1384 }
1385
1386 /*
1387 * XXX we may consider using LK_UPGRADE here
1388 * if we want to call flush from inside a transaction
1389 */
1390 rw_enter(&wl->wl_rwlock, RW_WRITER);
1391 wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
1392 wl->wl_dealloccnt);
1393
1394 /*
1395 * Now that we are fully locked and flushed,
1396 * do another check for nothing to do.
1397 */
1398 if (wl->wl_bufcount == 0) {
1399 goto out;
1400 }
1401
1402 #if 0
1403 WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1404 ("wapbl_flush thread %d.%d flushing entries with "
1405 "bufcount=%zu bufbytes=%zu\n",
1406 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
1407 wl->wl_bufbytes));
1408 #endif
1409
1410 /* Calculate amount of space needed to flush */
1411 flushsize = wapbl_transaction_len(wl);
1412 if (wapbl_verbose_commit) {
1413 struct timespec ts;
1414 getnanotime(&ts);
1415 printf("%s: %lld.%09ld this transaction = %zu bytes\n",
1416 __func__, (long long)ts.tv_sec,
1417 (long)ts.tv_nsec, flushsize);
1418 }
1419
1420 if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
1421 /*
1422 * XXX this could be handled more gracefully, perhaps place
1423 * only a partial transaction in the log and allow the
1424 * remaining to flush without the protection of the journal.
1425 */
1426 panic("wapbl_flush: current transaction too big to flush\n");
1427 }
1428
1429 error = wapbl_truncate(wl, flushsize, 0);
1430 if (error)
1431 goto out2;
1432
1433 off = wl->wl_head;
1434 KASSERT((off == 0) || ((off >= wl->wl_circ_off) &&
1435 (off < wl->wl_circ_off + wl->wl_circ_size)));
1436 error = wapbl_write_blocks(wl, &off);
1437 if (error)
1438 goto out2;
1439 error = wapbl_write_revocations(wl, &off);
1440 if (error)
1441 goto out2;
1442 error = wapbl_write_inodes(wl, &off);
1443 if (error)
1444 goto out2;
1445
1446 reserved = 0;
1447 if (wl->wl_inohashcnt)
1448 reserved = wapbl_transaction_inodes_len(wl);
1449
1450 head = wl->wl_head;
1451 tail = wl->wl_tail;
1452
1453 wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize,
1454 &head, &tail);
1455 #ifdef WAPBL_DEBUG
1456 if (head != off) {
1457 panic("lost head! head=%"PRIdMAX" tail=%" PRIdMAX
1458 " off=%"PRIdMAX" flush=%zu\n",
1459 (intmax_t)head, (intmax_t)tail, (intmax_t)off,
1460 flushsize);
1461 }
1462 #else
1463 KASSERT(head == off);
1464 #endif
1465
1466 /* Opportunistically move the tail forward if we can */
1467 if (!wapbl_lazy_truncate) {
1468 mutex_enter(&wl->wl_mtx);
1469 delta = wl->wl_reclaimable_bytes;
1470 mutex_exit(&wl->wl_mtx);
1471 wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta,
1472 &head, &tail);
1473 }
1474
1475 error = wapbl_write_commit(wl, head, tail);
1476 if (error)
1477 goto out2;
1478
1479 we = pool_get(&wapbl_entry_pool, PR_WAITOK);
1480
1481 #ifdef WAPBL_DEBUG_BUFBYTES
1482 WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1483 ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
1484 " unsynced=%zu"
1485 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
1486 "inodes=%d\n",
1487 curproc->p_pid, curlwp->l_lid, flushsize, delta,
1488 wapbl_space_used(wl->wl_circ_size, head, tail),
1489 wl->wl_unsynced_bufbytes, wl->wl_bufcount,
1490 wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt,
1491 wl->wl_inohashcnt));
1492 #else
1493 WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1494 ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
1495 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
1496 "inodes=%d\n",
1497 curproc->p_pid, curlwp->l_lid, flushsize, delta,
1498 wapbl_space_used(wl->wl_circ_size, head, tail),
1499 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
1500 wl->wl_dealloccnt, wl->wl_inohashcnt));
1501 #endif
1502
1503
1504 mutex_enter(&bufcache_lock);
1505 mutex_enter(&wl->wl_mtx);
1506
1507 wl->wl_reserved_bytes = reserved;
1508 wl->wl_head = head;
1509 wl->wl_tail = tail;
1510 KASSERT(wl->wl_reclaimable_bytes >= delta);
1511 wl->wl_reclaimable_bytes -= delta;
1512 wl->wl_dealloccnt = 0;
1513 #ifdef WAPBL_DEBUG_BUFBYTES
1514 wl->wl_unsynced_bufbytes += wl->wl_bufbytes;
1515 #endif
1516
1517 we->we_wapbl = wl;
1518 we->we_bufcount = wl->wl_bufcount;
1519 #ifdef WAPBL_DEBUG_BUFBYTES
1520 we->we_unsynced_bufbytes = wl->wl_bufbytes;
1521 #endif
1522 we->we_reclaimable_bytes = flushsize;
1523 we->we_error = 0;
1524 SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries);
1525
1526 /*
1527 * this flushes bufs in reverse order than they were queued
1528 * it shouldn't matter, but if we care we could use TAILQ instead.
1529 * XXX Note they will get put on the lru queue when they flush
1530 * so we might actually want to change this to preserve order.
1531 */
1532 while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
1533 if (bbusy(bp, 0, 0, &wl->wl_mtx)) {
1534 continue;
1535 }
1536 bp->b_iodone = wapbl_biodone;
1537 bp->b_private = we;
1538 bremfree(bp);
1539 wapbl_remove_buf_locked(wl, bp);
1540 mutex_exit(&wl->wl_mtx);
1541 mutex_exit(&bufcache_lock);
1542 bawrite(bp);
1543 mutex_enter(&bufcache_lock);
1544 mutex_enter(&wl->wl_mtx);
1545 }
1546 mutex_exit(&wl->wl_mtx);
1547 mutex_exit(&bufcache_lock);
1548
1549 #if 0
1550 WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1551 ("wapbl_flush thread %d.%d done flushing entries...\n",
1552 curproc->p_pid, curlwp->l_lid));
1553 #endif
1554
1555 out:
1556
1557 /*
1558 * If the waitfor flag is set, don't return until everything is
1559 * fully flushed and the on disk log is empty.
1560 */
1561 if (waitfor) {
1562 error = wapbl_truncate(wl, wl->wl_circ_size -
1563 wl->wl_reserved_bytes, wapbl_lazy_truncate);
1564 }
1565
1566 out2:
1567 if (error) {
1568 wl->wl_flush_abort(wl->wl_mount, wl->wl_deallocblks,
1569 wl->wl_dealloclens, wl->wl_dealloccnt);
1570 }
1571
1572 #ifdef WAPBL_DEBUG_PRINT
1573 if (error) {
1574 pid_t pid = -1;
1575 lwpid_t lid = -1;
1576 if (curproc)
1577 pid = curproc->p_pid;
1578 if (curlwp)
1579 lid = curlwp->l_lid;
1580 mutex_enter(&wl->wl_mtx);
1581 #ifdef WAPBL_DEBUG_BUFBYTES
1582 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1583 ("wapbl_flush: thread %d.%d aborted flush: "
1584 "error = %d\n"
1585 "\tbufcount=%zu bufbytes=%zu bcount=%zu "
1586 "deallocs=%d inodes=%d\n"
1587 "\terrcnt = %d, reclaimable=%zu reserved=%zu "
1588 "unsynced=%zu\n",
1589 pid, lid, error, wl->wl_bufcount,
1590 wl->wl_bufbytes, wl->wl_bcount,
1591 wl->wl_dealloccnt, wl->wl_inohashcnt,
1592 wl->wl_error_count, wl->wl_reclaimable_bytes,
1593 wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes));
1594 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
1595 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1596 ("\tentry: bufcount = %zu, reclaimable = %zu, "
1597 "error = %d, unsynced = %zu\n",
1598 we->we_bufcount, we->we_reclaimable_bytes,
1599 we->we_error, we->we_unsynced_bufbytes));
1600 }
1601 #else
1602 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1603 ("wapbl_flush: thread %d.%d aborted flush: "
1604 "error = %d\n"
1605 "\tbufcount=%zu bufbytes=%zu bcount=%zu "
1606 "deallocs=%d inodes=%d\n"
1607 "\terrcnt = %d, reclaimable=%zu reserved=%zu\n",
1608 pid, lid, error, wl->wl_bufcount,
1609 wl->wl_bufbytes, wl->wl_bcount,
1610 wl->wl_dealloccnt, wl->wl_inohashcnt,
1611 wl->wl_error_count, wl->wl_reclaimable_bytes,
1612 wl->wl_reserved_bytes));
1613 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
1614 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1615 ("\tentry: bufcount = %zu, reclaimable = %zu, "
1616 "error = %d\n", we->we_bufcount,
1617 we->we_reclaimable_bytes, we->we_error));
1618 }
1619 #endif
1620 mutex_exit(&wl->wl_mtx);
1621 }
1622 #endif
1623
1624 rw_exit(&wl->wl_rwlock);
1625 return error;
1626 }
1627
1628 /****************************************************************/
1629
/* Assert that the caller holds the journal lock (read or write). */
void
wapbl_jlock_assert(struct wapbl *wl)
{

	KASSERT(rw_lock_held(&wl->wl_rwlock));
}
1636
/* Assert that the caller does not hold the journal lock as writer. */
void
wapbl_junlock_assert(struct wapbl *wl)
{

	KASSERT(!rw_write_held(&wl->wl_rwlock));
}
1643
1644 /****************************************************************/
1645
/* locks missing */
/*
 * Dump the state of a wapbl log via (*pr) (typically the ddb printf).
 * With "full" set, also enumerate queued bufs, pending deallocations,
 * and registered inodes.
 */
void
wapbl_print(struct wapbl *wl,
		int full,
		void (*pr)(const char *, ...))
{
	struct buf *bp;
	struct wapbl_entry *we;
	(*pr)("wapbl %p", wl);
	(*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n",
	      wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn);
	(*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n",
	      wl->wl_circ_size, wl->wl_circ_off,
	      (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail);
	(*pr)("fs_dev_bshift = %d, log_dev_bshift = %d\n",
	      wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift);
#ifdef WAPBL_DEBUG_BUFBYTES
	(*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
	      "reserved = %zu errcnt = %d unsynced = %zu\n",
	      wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
	      wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
	      wl->wl_error_count, wl->wl_unsynced_bufbytes);
#else
	(*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
	      "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes,
	      wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
	      wl->wl_error_count);
#endif
	(*pr)("\tdealloccnt = %d, dealloclim = %d\n",
	      wl->wl_dealloccnt, wl->wl_dealloclim);
	(*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n",
	      wl->wl_inohashcnt, wl->wl_inohashmask);
	(*pr)("entries:\n");
	SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
#ifdef WAPBL_DEBUG_BUFBYTES
		(*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, "
		      "unsynced = %zu\n",
		      we->we_bufcount, we->we_reclaimable_bytes,
		      we->we_error, we->we_unsynced_bufbytes);
#else
		(*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n",
		      we->we_bufcount, we->we_reclaimable_bytes, we->we_error);
#endif
	}
	if (full) {
		int cnt = 0;
		(*pr)("bufs =");
		LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) {
			if (!LIST_NEXT(bp, b_wapbllist)) {
				(*pr)(" %p", bp);
			} else if ((++cnt % 6) == 0) {
				(*pr)(" %p,\n\t", bp);
			} else {
				(*pr)(" %p,", bp);
			}
		}
		(*pr)("\n");

		(*pr)("dealloced blks = ");
		{
			int i;
			cnt = 0;
			for (i = 0; i < wl->wl_dealloccnt; i++) {
				(*pr)(" %"PRId64":%d,",
				      wl->wl_deallocblks[i],
				      wl->wl_dealloclens[i]);
				if ((++cnt % 4) == 0) {
					(*pr)("\n\t");
				}
			}
		}
		(*pr)("\n");

		(*pr)("registered inodes = ");
		{
			int i;
			cnt = 0;
			for (i = 0; i <= wl->wl_inohashmask; i++) {
				struct wapbl_ino_head *wih;
				struct wapbl_ino *wi;

				wih = &wl->wl_inohash[i];
				LIST_FOREACH(wi, wih, wi_hash) {
					if (wi->wi_ino == 0)
						continue;
					(*pr)(" %"PRId32"/0%06"PRIo32",",
					    wi->wi_ino, wi->wi_mode);
					if ((++cnt % 4) == 0) {
						(*pr)("\n\t");
					}
				}
			}
			(*pr)("\n");
		}
	}
}
1742
#if defined(WAPBL_DEBUG) || defined(DDB)
/*
 * Debugger entry point: full wapbl_print() of the given log, or of
 * wapbl_debug_wl when called with NULL under WAPBL_DEBUG.
 */
void
wapbl_dump(struct wapbl *wl)
{
#if defined(WAPBL_DEBUG)
	if (!wl)
		wl = wapbl_debug_wl;
#endif
	if (!wl)
		return;
	wapbl_print(wl, 1, printf);
}
#endif
1756
1757 /****************************************************************/
1758
/*
 * Record a block deallocation (blk, len) so it can be written to the
 * log as a revocation record.  Caller must hold the journal lock.
 */
void
wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len)
{

	wapbl_jlock_assert(wl);

	mutex_enter(&wl->wl_mtx);
	/* XXX should eventually instead tie this into resource estimation */
	/*
	 * XXX this panic needs locking/mutex analysis and the
	 * ability to cope with the failure.
	 */
	/* XXX this XXX doesn't have enough XXX */
	if (__predict_false(wl->wl_dealloccnt >= wl->wl_dealloclim))
		panic("wapbl_register_deallocation: out of resources");

	wl->wl_deallocblks[wl->wl_dealloccnt] = blk;
	wl->wl_dealloclens[wl->wl_dealloccnt] = len;
	wl->wl_dealloccnt++;
	WAPBL_PRINTF(WAPBL_PRINT_ALLOC,
	    ("wapbl_register_deallocation: blk=%"PRId64" len=%d\n", blk, len));
	mutex_exit(&wl->wl_mtx);
}
1782
1783 /****************************************************************/
1784
/*
 * Set up this log's inode-tracking hash table and, for the first log
 * in the system (refcount 0 -> 1), the shared wapbl_ino pool.
 */
static void
wapbl_inodetrk_init(struct wapbl *wl, u_int size)
{

	wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask);
	if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) {
		pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0,
		    "wapblinopl", &pool_allocator_nointr, IPL_NONE);
	}
}
1795
/*
 * Tear down the inode-tracking hash; destroy the shared pool when the
 * last log releases it (refcount 1 -> 0).  Hash must already be empty.
 */
static void
wapbl_inodetrk_free(struct wapbl *wl)
{

	/* XXX this KASSERT needs locking/mutex analysis */
	KASSERT(wl->wl_inohashcnt == 0);
	hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask);
	if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) {
		pool_destroy(&wapbl_ino_pool);
	}
}
1807
1808 static struct wapbl_ino *
1809 wapbl_inodetrk_get(struct wapbl *wl, ino_t ino)
1810 {
1811 struct wapbl_ino_head *wih;
1812 struct wapbl_ino *wi;
1813
1814 KASSERT(mutex_owned(&wl->wl_mtx));
1815
1816 wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
1817 LIST_FOREACH(wi, wih, wi_hash) {
1818 if (ino == wi->wi_ino)
1819 return wi;
1820 }
1821 return 0;
1822 }
1823
/*
 * Register an inode (number + mode) with the log so it is written to the
 * journal's inode list.  The pool entry is allocated before taking
 * wl_mtx (pool_get may sleep); if the inode turns out to already be
 * registered, the entry is returned to the pool.
 */
void
wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode)
{
	struct wapbl_ino_head *wih;
	struct wapbl_ino *wi;

	wi = pool_get(&wapbl_ino_pool, PR_WAITOK);

	mutex_enter(&wl->wl_mtx);
	if (wapbl_inodetrk_get(wl, ino) == NULL) {
		wi->wi_ino = ino;
		wi->wi_mode = mode;
		wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
		LIST_INSERT_HEAD(wih, wi, wi_hash);
		wl->wl_inohashcnt++;
		WAPBL_PRINTF(WAPBL_PRINT_INODE,
		    ("wapbl_register_inode: ino=%"PRId64"\n", ino));
		mutex_exit(&wl->wl_mtx);
	} else {
		/* Already tracked: discard the pre-allocated entry. */
		mutex_exit(&wl->wl_mtx);
		pool_put(&wapbl_ino_pool, wi);
	}
}
1847
1848 void
1849 wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode)
1850 {
1851 struct wapbl_ino *wi;
1852
1853 mutex_enter(&wl->wl_mtx);
1854 wi = wapbl_inodetrk_get(wl, ino);
1855 if (wi) {
1856 WAPBL_PRINTF(WAPBL_PRINT_INODE,
1857 ("wapbl_unregister_inode: ino=%"PRId64"\n", ino));
1858 KASSERT(wl->wl_inohashcnt > 0);
1859 wl->wl_inohashcnt--;
1860 LIST_REMOVE(wi, wi_hash);
1861 mutex_exit(&wl->wl_mtx);
1862
1863 pool_put(&wapbl_ino_pool, wi);
1864 } else {
1865 mutex_exit(&wl->wl_mtx);
1866 }
1867 }
1868
1869 /****************************************************************/
1870
1871 static inline size_t
1872 wapbl_transaction_inodes_len(struct wapbl *wl)
1873 {
1874 int blocklen = 1<<wl->wl_log_dev_bshift;
1875 int iph;
1876
1877 /* Calculate number of inodes described in a inodelist header */
1878 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
1879 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
1880
1881 KASSERT(iph > 0);
1882
1883 return MAX(1, howmany(wl->wl_inohashcnt, iph)) * blocklen;
1884 }
1885
1886
1887 /* Calculate amount of space a transaction will take on disk */
1888 static size_t
1889 wapbl_transaction_len(struct wapbl *wl)
1890 {
1891 int blocklen = 1<<wl->wl_log_dev_bshift;
1892 size_t len;
1893 int bph;
1894
1895 /* Calculate number of blocks described in a blocklist header */
1896 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
1897 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
1898
1899 KASSERT(bph > 0);
1900
1901 len = wl->wl_bcount;
1902 len += howmany(wl->wl_bufcount, bph) * blocklen;
1903 len += howmany(wl->wl_dealloccnt, bph) * blocklen;
1904 len += wapbl_transaction_inodes_len(wl);
1905
1906 return len;
1907 }
1908
1909 /*
1910 * wapbl_cache_sync: issue DIOCCACHESYNC
1911 */
1912 static int
1913 wapbl_cache_sync(struct wapbl *wl, const char *msg)
1914 {
1915 const bool verbose = wapbl_verbose_commit >= 2;
1916 struct bintime start_time;
1917 int force = 1;
1918 int error;
1919
1920 if (!wapbl_flush_disk_cache) {
1921 return 0;
1922 }
1923 if (verbose) {
1924 bintime(&start_time);
1925 }
1926 error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force,
1927 FWRITE, FSCRED);
1928 if (error) {
1929 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1930 ("wapbl_cache_sync: DIOCCACHESYNC on dev 0x%x "
1931 "returned %d\n", wl->wl_devvp->v_rdev, error));
1932 }
1933 if (verbose) {
1934 struct bintime d;
1935 struct timespec ts;
1936
1937 bintime(&d);
1938 bintime_sub(&d, &start_time);
1939 bintime2timespec(&d, &ts);
1940 printf("wapbl_cache_sync: %s: dev 0x%jx %ju.%09lu\n",
1941 msg, (uintmax_t)wl->wl_devvp->v_rdev,
1942 (uintmax_t)ts.tv_sec, ts.tv_nsec);
1943 }
1944 return error;
1945 }
1946
1947 /*
1948 * Perform commit operation
1949 *
1950 * Note that generation number incrementation needs to
1951 * be protected against racing with other invocations
1952 * of wapbl_write_commit. This is ok since this routine
1953 * is only invoked from wapbl_flush
1954 */
1955 static int
1956 wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail)
1957 {
1958 struct wapbl_wc_header *wc = wl->wl_wc_header;
1959 struct timespec ts;
1960 int error;
1961 daddr_t pbn;
1962
1963 /*
1964 * flush disk cache to ensure that blocks we've written are actually
1965 * written to the stable storage before the commit header.
1966 *
1967 * XXX Calc checksum here, instead we do this for now
1968 */
1969 wapbl_cache_sync(wl, "1");
1970
1971 wc->wc_head = head;
1972 wc->wc_tail = tail;
1973 wc->wc_checksum = 0;
1974 wc->wc_version = 1;
1975 getnanotime(&ts);
1976 wc->wc_time = ts.tv_sec;
1977 wc->wc_timensec = ts.tv_nsec;
1978
1979 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
1980 ("wapbl_write_commit: head = %"PRIdMAX "tail = %"PRIdMAX"\n",
1981 (intmax_t)head, (intmax_t)tail));
1982
1983 /*
1984 * write the commit header.
1985 *
1986 * XXX if generation will rollover, then first zero
1987 * over second commit header before trying to write both headers.
1988 */
1989
1990 pbn = wl->wl_logpbn + (wc->wc_generation % 2);
1991 #ifdef _KERNEL
1992 pbn = btodb(pbn << wc->wc_log_dev_bshift);
1993 #endif
1994 error = wapbl_write(wc, wc->wc_len, wl->wl_devvp, pbn);
1995 if (error)
1996 return error;
1997
1998 /*
1999 * flush disk cache to ensure that the commit header is actually
2000 * written before meta data blocks.
2001 */
2002 wapbl_cache_sync(wl, "2");
2003
2004 /*
2005 * If the generation number was zero, write it out a second time.
2006 * This handles initialization and generation number rollover
2007 */
2008 if (wc->wc_generation++ == 0) {
2009 error = wapbl_write_commit(wl, head, tail);
2010 /*
2011 * This panic should be able to be removed if we do the
2012 * zero'ing mentioned above, and we are certain to roll
2013 * back generation number on failure.
2014 */
2015 if (error)
2016 panic("wapbl_write_commit: error writing duplicate "
2017 "log header: %d\n", error);
2018 }
2019 return 0;
2020 }
2021
/* Returns new offset value */
/*
 * Write the queued metadata buffers into the circular log at *offp:
 * for each group of up to bph buffers, a blocklist header followed by
 * the buffers' data, padded out to a log-device block boundary.  The
 * buffer list is walked twice per group: once to build the header,
 * once (from "obp") to emit the data.
 */
static int
wapbl_write_blocks(struct wapbl *wl, off_t *offp)
{
	struct wapbl_wc_blocklist *wc =
	    (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
	int blocklen = 1<<wl->wl_log_dev_bshift;
	int bph;
	struct buf *bp;
	off_t off = *offp;
	int error;
	size_t padding;

	KASSERT(rw_write_held(&wl->wl_rwlock));

	bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
	    sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);

	bp = LIST_FIRST(&wl->wl_bufs);

	while (bp) {
		int cnt;
		struct buf *obp = bp;	/* remember group start for pass 2 */

		KASSERT(bp->b_flags & B_LOCKED);

		wc->wc_type = WAPBL_WC_BLOCKS;
		wc->wc_len = blocklen;
		wc->wc_blkcount = 0;
		while (bp && (wc->wc_blkcount < bph)) {
			/*
			 * Make sure all the physical block numbers are up to
			 * date.  If this is not always true on a given
			 * filesystem, then VOP_BMAP must be called.  We
			 * could call VOP_BMAP here, or else in the filesystem
			 * specific flush callback, although neither of those
			 * solutions allow us to take the vnode lock.  If a
			 * filesystem requires that we must take the vnode lock
			 * to call VOP_BMAP, then we can probably do it in
			 * bwrite when the vnode lock should already be held
			 * by the invoking code.
			 */
			KASSERT((bp->b_vp->v_type == VBLK) ||
				 (bp->b_blkno != bp->b_lblkno));
			KASSERT(bp->b_blkno > 0);

			wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno;
			wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount;
			wc->wc_len += bp->b_bcount;
			wc->wc_blkcount++;
			bp = LIST_NEXT(bp, b_wapbllist);
		}
		/* Round the record up to a whole log-device block. */
		if (wc->wc_len % blocklen != 0) {
			padding = blocklen - wc->wc_len % blocklen;
			wc->wc_len += padding;
		} else {
			padding = 0;
		}

		WAPBL_PRINTF(WAPBL_PRINT_WRITE,
		    ("wapbl_write_blocks: len = %u (padding %zu) off = %"PRIdMAX"\n",
		    wc->wc_len, padding, (intmax_t)off));

		error = wapbl_circ_write(wl, wc, blocklen, &off);
		if (error)
			return error;
		/* Second pass: write the data for the same group of bufs. */
		bp = obp;
		cnt = 0;
		while (bp && (cnt++ < bph)) {
			error = wapbl_circ_write(wl, bp->b_data,
			    bp->b_bcount, &off);
			if (error)
				return error;
			bp = LIST_NEXT(bp, b_wapbllist);
		}
		if (padding) {
			void *zero;

			zero = wapbl_alloc(padding);
			memset(zero, 0, padding);
			error = wapbl_circ_write(wl, zero, padding, &off);
			wapbl_free(zero, padding);
			if (error)
				return error;
		}
	}
	*offp = off;
	return 0;
}
2111
/*
 * Write the pending deallocations (blk/len pairs) into the circular log
 * at *offp as WAPBL_WC_REVOCATIONS records, up to bph per header block.
 * No-op when no deallocations are pending.
 */
static int
wapbl_write_revocations(struct wapbl *wl, off_t *offp)
{
	struct wapbl_wc_blocklist *wc =
	    (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
	int i;
	int blocklen = 1<<wl->wl_log_dev_bshift;
	int bph;
	off_t off = *offp;
	int error;

	if (wl->wl_dealloccnt == 0)
		return 0;

	bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
	    sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);

	i = 0;
	while (i < wl->wl_dealloccnt) {
		wc->wc_type = WAPBL_WC_REVOCATIONS;
		wc->wc_len = blocklen;
		wc->wc_blkcount = 0;
		while ((i < wl->wl_dealloccnt) && (wc->wc_blkcount < bph)) {
			wc->wc_blocks[wc->wc_blkcount].wc_daddr =
			    wl->wl_deallocblks[i];
			wc->wc_blocks[wc->wc_blkcount].wc_dlen =
			    wl->wl_dealloclens[i];
			wc->wc_blkcount++;
			i++;
		}
		WAPBL_PRINTF(WAPBL_PRINT_WRITE,
		    ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n",
		    wc->wc_len, (intmax_t)off));
		error = wapbl_circ_write(wl, wc, blocklen, &off);
		if (error)
			return error;
	}
	*offp = off;
	return 0;
}
2152
/*
 * Write the registered inodes into the circular log at *offp as
 * WAPBL_WC_INODES records, walking the inode hash chains.  The first
 * record carries wc_clear so replay discards any older inode list.
 * At least one (possibly empty) record is always written (do/while).
 */
static int
wapbl_write_inodes(struct wapbl *wl, off_t *offp)
{
	struct wapbl_wc_inodelist *wc =
	    (struct wapbl_wc_inodelist *)wl->wl_wc_scratch;
	int i;
	int blocklen = 1 << wl->wl_log_dev_bshift;
	off_t off = *offp;
	int error;

	struct wapbl_ino_head *wih;
	struct wapbl_ino *wi;
	int iph;

	iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
	    sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);

	i = 0;
	wih = &wl->wl_inohash[0];
	wi = 0;
	do {
		wc->wc_type = WAPBL_WC_INODES;
		wc->wc_len = blocklen;
		wc->wc_inocnt = 0;
		wc->wc_clear = (i == 0);
		while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) {
			/* Skip empty hash buckets to the next chain head. */
			while (!wi) {
				KASSERT((wih - &wl->wl_inohash[0])
				    <= wl->wl_inohashmask);
				wi = LIST_FIRST(wih++);
			}
			wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino;
			wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode;
			wc->wc_inocnt++;
			i++;
			wi = LIST_NEXT(wi, wi_hash);
		}
		WAPBL_PRINTF(WAPBL_PRINT_WRITE,
		    ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n",
		    wc->wc_len, (intmax_t)off));
		error = wapbl_circ_write(wl, wc, blocklen, &off);
		if (error)
			return error;
	} while (i < wl->wl_inohashcnt);

	*offp = off;
	return 0;
}
2201
2202 #endif /* _KERNEL */
2203
2204 /****************************************************************/
2205
/*
 * Replay hash table entry: maps a device block number to the offset in
 * the circular log that holds its most recently journalled contents.
 */
struct wapbl_blk {
	LIST_ENTRY(wapbl_blk) wb_hash;	/* hash bucket linkage */
	daddr_t wb_blk;			/* device block address */
	off_t wb_off; /* Offset of this block in the log */
};
#define	WAPBL_BLKPOOL_MIN 83	/* minimum hash table size (buckets) */
2212
2213 static void
2214 wapbl_blkhash_init(struct wapbl_replay *wr, u_int size)
2215 {
2216 if (size < WAPBL_BLKPOOL_MIN)
2217 size = WAPBL_BLKPOOL_MIN;
2218 KASSERT(wr->wr_blkhash == 0);
2219 #ifdef _KERNEL
2220 wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask);
2221 #else /* ! _KERNEL */
2222 /* Manually implement hashinit */
2223 {
2224 unsigned long i, hashsize;
2225 for (hashsize = 1; hashsize < size; hashsize <<= 1)
2226 continue;
2227 wr->wr_blkhash = wapbl_alloc(hashsize * sizeof(*wr->wr_blkhash));
2228 for (i = 0; i < hashsize; i++)
2229 LIST_INIT(&wr->wr_blkhash[i]);
2230 wr->wr_blkhashmask = hashsize - 1;
2231 }
2232 #endif /* ! _KERNEL */
2233 }
2234
/*
 * Tear down the replay block hash table.  All entries must already have
 * been removed (wr_blkhashcnt == 0); this frees only the bucket array.
 */
static void
wapbl_blkhash_free(struct wapbl_replay *wr)
{
	KASSERT(wr->wr_blkhashcnt == 0);
#ifdef _KERNEL
	hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask);
#else /* ! _KERNEL */
	wapbl_free(wr->wr_blkhash,
	    (wr->wr_blkhashmask + 1) * sizeof(*wr->wr_blkhash));
#endif /* ! _KERNEL */
}
2246
2247 static struct wapbl_blk *
2248 wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk)
2249 {
2250 struct wapbl_blk_head *wbh;
2251 struct wapbl_blk *wb;
2252 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2253 LIST_FOREACH(wb, wbh, wb_hash) {
2254 if (blk == wb->wb_blk)
2255 return wb;
2256 }
2257 return 0;
2258 }
2259
2260 static void
2261 wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off)
2262 {
2263 struct wapbl_blk_head *wbh;
2264 struct wapbl_blk *wb;
2265 wb = wapbl_blkhash_get(wr, blk);
2266 if (wb) {
2267 KASSERT(wb->wb_blk == blk);
2268 wb->wb_off = off;
2269 } else {
2270 wb = wapbl_alloc(sizeof(*wb));
2271 wb->wb_blk = blk;
2272 wb->wb_off = off;
2273 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2274 LIST_INSERT_HEAD(wbh, wb, wb_hash);
2275 wr->wr_blkhashcnt++;
2276 }
2277 }
2278
2279 static void
2280 wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk)
2281 {
2282 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2283 if (wb) {
2284 KASSERT(wr->wr_blkhashcnt > 0);
2285 wr->wr_blkhashcnt--;
2286 LIST_REMOVE(wb, wb_hash);
2287 wapbl_free(wb, sizeof(*wb));
2288 }
2289 }
2290
2291 static void
2292 wapbl_blkhash_clear(struct wapbl_replay *wr)
2293 {
2294 unsigned long i;
2295 for (i = 0; i <= wr->wr_blkhashmask; i++) {
2296 struct wapbl_blk *wb;
2297
2298 while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) {
2299 KASSERT(wr->wr_blkhashcnt > 0);
2300 wr->wr_blkhashcnt--;
2301 LIST_REMOVE(wb, wb_hash);
2302 wapbl_free(wb, sizeof(*wb));
2303 }
2304 }
2305 KASSERT(wr->wr_blkhashcnt == 0);
2306 }
2307
2308 /****************************************************************/
2309
/*
 * Read "len" bytes from the circular log area into "data", starting at
 * *offp.  A read that would run past the end of the log area is split
 * into two device reads, the second restarting at wr_circ_off.  On
 * success *offp is advanced (and wrapped) past the bytes read.
 * "len" must be a multiple of the log device block size.
 */
static int
wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp)
{
	size_t slen;
	off_t off = *offp;
	int error;
	daddr_t pbn;

	KASSERT(((len >> wr->wr_log_dev_bshift) <<
	    wr->wr_log_dev_bshift) == len);

	if (off < wr->wr_circ_off)
		off = wr->wr_circ_off;
	/* Bytes remaining before the end of the circular area. */
	slen = wr->wr_circ_off + wr->wr_circ_size - off;
	if (slen < len) {
		/* Wrap: read the tail piece first, then continue at start. */
		pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
#ifdef _KERNEL
		pbn = btodb(pbn << wr->wr_log_dev_bshift);
#endif
		error = wapbl_read(data, slen, wr->wr_devvp, pbn);
		if (error)
			return error;
		data = (uint8_t *)data + slen;
		len -= slen;
		off = wr->wr_circ_off;
	}
	pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
#ifdef _KERNEL
	pbn = btodb(pbn << wr->wr_log_dev_bshift);
#endif
	error = wapbl_read(data, len, wr->wr_devvp, pbn);
	if (error)
		return error;
	off += len;
	/* Wrap the cursor if it landed exactly on (or past) the end. */
	if (off >= wr->wr_circ_off + wr->wr_circ_size)
		off = wr->wr_circ_off;
	*offp = off;
	return 0;
}
2349
2350 static void
2351 wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp)
2352 {
2353 size_t slen;
2354 off_t off = *offp;
2355
2356 KASSERT(((len >> wr->wr_log_dev_bshift) <<
2357 wr->wr_log_dev_bshift) == len);
2358
2359 if (off < wr->wr_circ_off)
2360 off = wr->wr_circ_off;
2361 slen = wr->wr_circ_off + wr->wr_circ_size - off;
2362 if (slen < len) {
2363 len -= slen;
2364 off = wr->wr_circ_off;
2365 }
2366 off += len;
2367 if (off >= wr->wr_circ_off + wr->wr_circ_size)
2368 off = wr->wr_circ_off;
2369 *offp = off;
2370 }
2371
2372 /****************************************************************/
2373
/*
 * Open a log for replay: locate the log on "vp" at block "off", read
 * and validate the (double-buffered) log header, pick the newer of the
 * two header copies by generation, then scan the journalled records
 * into the replay state.  On success *wrp holds the replay handle;
 * on failure all allocations are released.
 * "blksize" is the log device block size used until the real header
 * is read; it must be a multiple of DEV_BSIZE.
 */
int
wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp,
	daddr_t off, size_t count, size_t blksize)
{
	struct wapbl_replay *wr;
	int error;
	struct vnode *devvp;
	daddr_t logpbn;
	uint8_t *scratch;
	struct wapbl_wc_header *wch;
	struct wapbl_wc_header *wch2;
	/* Use this until we read the actual log header */
	int log_dev_bshift = ilog2(blksize);
	size_t used;
	daddr_t pbn;

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n",
	    vp, off, count, blksize));

	if (off < 0)
		return EINVAL;

	if (blksize < DEV_BSIZE)
		return EINVAL;
	if (blksize % DEV_BSIZE)
		return EINVAL;

#ifdef _KERNEL
#if 0
	/* XXX vp->v_size isn't reliably set for VBLK devices,
	 * especially root.  However, we might still want to verify
	 * that the full load is readable */
	if ((off + count) * blksize > vp->v_size)
		return EINVAL;
#endif
	/* Translate the log's file block to the underlying device block. */
	if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) {
		return error;
	}
#else /* ! _KERNEL */
	devvp = vp;
	logpbn = off;
#endif /* ! _KERNEL */

	scratch = wapbl_alloc(MAXBSIZE);

	pbn = logpbn;
#ifdef _KERNEL
	pbn = btodb(pbn << log_dev_bshift);
#endif
	/* Read both copies of the commit header in one go. */
	error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, pbn);
	if (error)
		goto errout;

	wch = (struct wapbl_wc_header *)scratch;
	wch2 =
	    (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift));
	/* XXX verify checksums and magic numbers */
	if (wch->wc_type != WAPBL_WC_HEADER) {
		printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type);
		error = EFTYPE;
		goto errout;
	}

	/* Use whichever header copy committed more recently. */
	if (wch2->wc_generation > wch->wc_generation)
		wch = wch2;

	wr = wapbl_calloc(1, sizeof(*wr));

	wr->wr_logvp = vp;
	wr->wr_devvp = devvp;
	wr->wr_logpbn = logpbn;

	/* The scratch buffer is kept for the life of the replay handle. */
	wr->wr_scratch = scratch;

	wr->wr_log_dev_bshift = wch->wc_log_dev_bshift;
	wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift;
	wr->wr_circ_off = wch->wc_circ_off;
	wr->wr_circ_size = wch->wc_circ_size;
	wr->wr_generation = wch->wc_generation;

	used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail);

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64
	    " len=%"PRId64" used=%zu\n",
	    wch->wc_head, wch->wc_tail, wch->wc_circ_off,
	    wch->wc_circ_size, used));

	/* Size the hash by an estimate of how many fs blocks are logged. */
	wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift));

	error = wapbl_replay_process(wr, wch->wc_head, wch->wc_tail);
	if (error) {
		/* wapbl_replay_stop() also frees the scratch buffer. */
		wapbl_replay_stop(wr);
		wapbl_replay_free(wr);
		return error;
	}

	*wrp = wr;
	return 0;

 errout:
	wapbl_free(scratch, MAXBSIZE);
	return error;
}
2479
2480 void
2481 wapbl_replay_stop(struct wapbl_replay *wr)
2482 {
2483
2484 if (!wapbl_replay_isopen(wr))
2485 return;
2486
2487 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n"));
2488
2489 wapbl_free(wr->wr_scratch, MAXBSIZE);
2490 wr->wr_scratch = NULL;
2491
2492 wr->wr_logvp = NULL;
2493
2494 wapbl_blkhash_clear(wr);
2495 wapbl_blkhash_free(wr);
2496 }
2497
2498 void
2499 wapbl_replay_free(struct wapbl_replay *wr)
2500 {
2501
2502 KDASSERT(!wapbl_replay_isopen(wr));
2503
2504 if (wr->wr_inodes)
2505 wapbl_free(wr->wr_inodes,
2506 wr->wr_inodescnt * sizeof(wr->wr_inodes[0]));
2507 wapbl_free(wr, sizeof(*wr));
2508 }
2509
2510 #ifdef _KERNEL
/*
 * Out-of-line wrapper for the wapbl_replay_isopen() macro/inline, for
 * callers that need a real function (e.g. other compilation units).
 */
int
wapbl_replay_isopen1(struct wapbl_replay *wr)
{

	return wapbl_replay_isopen(wr);
}
2517 #endif
2518
2519 static void
2520 wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp)
2521 {
2522 struct wapbl_wc_blocklist *wc =
2523 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2524 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2525 int i, j, n;
2526
2527 for (i = 0; i < wc->wc_blkcount; i++) {
2528 /*
2529 * Enter each physical block into the hashtable independently.
2530 */
2531 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
2532 for (j = 0; j < n; j++) {
2533 wapbl_blkhash_ins(wr, wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen),
2534 *offp);
2535 wapbl_circ_advance(wr, fsblklen, offp);
2536 }
2537 }
2538 }
2539
2540 static void
2541 wapbl_replay_process_revocations(struct wapbl_replay *wr)
2542 {
2543 struct wapbl_wc_blocklist *wc =
2544 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2545 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2546 int i, j, n;
2547
2548 for (i = 0; i < wc->wc_blkcount; i++) {
2549 /*
2550 * Remove any blocks found from the hashtable.
2551 */
2552 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
2553 for (j = 0; j < n; j++)
2554 wapbl_blkhash_rem(wr, wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
2555 }
2556 }
2557
/*
 * Process a WAPBL_WC_INODES record sitting in wr_scratch.  A record
 * with wc_clear set discards the accumulated inode list (the writer
 * emits the full list starting with such a record); subsequent records
 * append.  "oldoff"/"newoff" are the log offsets just before and after
 * the record, remembered so this region of the log is not reclaimed.
 */
static void
wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff, off_t newoff)
{
	struct wapbl_wc_inodelist *wc =
	    (struct wapbl_wc_inodelist *)wr->wr_scratch;
	void *new_inodes;
	const size_t oldsize = wr->wr_inodescnt * sizeof(wr->wr_inodes[0]);

	KASSERT(sizeof(wr->wr_inodes[0]) == sizeof(wc->wc_inodes[0]));

	/*
	 * Keep track of where we found this so location won't be
	 * overwritten.
	 */
	if (wc->wc_clear) {
		wr->wr_inodestail = oldoff;
		wr->wr_inodescnt = 0;
		if (wr->wr_inodes != NULL) {
			wapbl_free(wr->wr_inodes, oldsize);
			wr->wr_inodes = NULL;
		}
	}
	wr->wr_inodeshead = newoff;
	if (wc->wc_inocnt == 0)
		return;

	/* Grow the array (no realloc here): copy old entries, append new. */
	new_inodes = wapbl_alloc((wr->wr_inodescnt + wc->wc_inocnt) *
	    sizeof(wr->wr_inodes[0]));
	if (wr->wr_inodes != NULL) {
		memcpy(new_inodes, wr->wr_inodes, oldsize);
		wapbl_free(wr->wr_inodes, oldsize);
	}
	wr->wr_inodes = new_inodes;
	memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes,
	    wc->wc_inocnt * sizeof(wr->wr_inodes[0]));
	wr->wr_inodescnt += wc->wc_inocnt;
}
2595
/*
 * Walk the log records from "tail" to "head", dispatching each record
 * type to its handler and rebuilding the replay block hash.  After each
 * record, cross-check that the handler consumed exactly wc_len bytes
 * (via an independently advanced cursor); a mismatch means the log is
 * corrupt.  On any error the partially built hash is discarded.
 */
static int
wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail)
{
	off_t off;
	int error;

	int logblklen = 1 << wr->wr_log_dev_bshift;

	wapbl_blkhash_clear(wr);

	off = tail;
	while (off != head) {
		struct wapbl_wc_null *wcn;
		off_t saveoff = off;	/* start of this record */
		error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
		if (error)
			goto errout;
		wcn = (struct wapbl_wc_null *)wr->wr_scratch;
		switch (wcn->wc_type) {
		case WAPBL_WC_BLOCKS:
			/* Advances off past the record's trailing data. */
			wapbl_replay_process_blocks(wr, &off);
			break;

		case WAPBL_WC_REVOCATIONS:
			wapbl_replay_process_revocations(wr);
			break;

		case WAPBL_WC_INODES:
			wapbl_replay_process_inodes(wr, saveoff, off);
			break;

		default:
			printf("Unrecognized wapbl type: 0x%08x\n",
			       wcn->wc_type);
			error = EFTYPE;
			goto errout;
		}
		/* Independently skip wc_len from the record start ... */
		wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
		/* ... and verify the handler ended up in the same place. */
		if (off != saveoff) {
			printf("wapbl_replay: corrupted records\n");
			error = EFTYPE;
			goto errout;
		}
	}
	return 0;

 errout:
	wapbl_blkhash_clear(wr);
	return error;
}
2646
#if 0
/*
 * Disabled debugging aid: re-walk the log and compare each journalled
 * block against the on-disk filesystem contents, counting mismatches.
 * NOTE(review): this #if 0 code references "wch", which is not declared
 * in this scope — it would not compile if re-enabled as-is; presumably
 * the head/tail/bshift values should come from "wr" instead.
 */
int
wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp)
{
	off_t off;
	int mismatchcnt = 0;
	int logblklen = 1 << wr->wr_log_dev_bshift;
	int fsblklen = 1 << wr->wr_fs_dev_bshift;
	void *scratch1 = wapbl_alloc(MAXBSIZE);
	void *scratch2 = wapbl_alloc(MAXBSIZE);
	int error = 0;

	KDASSERT(wapbl_replay_isopen(wr));

	off = wch->wc_tail;
	while (off != wch->wc_head) {
		struct wapbl_wc_null *wcn;
#ifdef DEBUG
		off_t saveoff = off;
#endif
		error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
		if (error)
			goto out;
		wcn = (struct wapbl_wc_null *)wr->wr_scratch;
		switch (wcn->wc_type) {
		case WAPBL_WC_BLOCKS:
			{
				struct wapbl_wc_blocklist *wc =
				    (struct wapbl_wc_blocklist *)wr->wr_scratch;
				int i;
				for (i = 0; i < wc->wc_blkcount; i++) {
					int foundcnt = 0;
					int dirtycnt = 0;
					int j, n;
					/*
					 * Check each physical block into the
					 * hashtable independently
					 */
					n = wc->wc_blocks[i].wc_dlen >>
					    wch->wc_fs_dev_bshift;
					for (j = 0; j < n; j++) {
						struct wapbl_blk *wb =
						   wapbl_blkhash_get(wr,
						   wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
						if (wb && (wb->wb_off == off)) {
							foundcnt++;
							error =
							    wapbl_circ_read(wr,
							    scratch1, fsblklen,
							    &off);
							if (error)
								goto out;
							error =
							    wapbl_read(scratch2,
							    fsblklen, fsdevvp,
							    wb->wb_blk);
							if (error)
								goto out;
							if (memcmp(scratch1,
							    scratch2,
							    fsblklen)) {
								printf(
		"wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n",
		wb->wb_blk, (intmax_t)off);
								dirtycnt++;
								mismatchcnt++;
							}
						} else {
							wapbl_circ_advance(wr,
							    fsblklen, &off);
						}
					}
#if 0
					/*
					 * If all of the blocks in an entry
					 * are clean, then remove all of its
					 * blocks from the hashtable since they
					 * never will need replay.
					 */
					if ((foundcnt != 0) &&
					    (dirtycnt == 0)) {
						off = saveoff;
						wapbl_circ_advance(wr,
						    logblklen, &off);
						for (j = 0; j < n; j++) {
							struct wapbl_blk *wb =
							   wapbl_blkhash_get(wr,
							   wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
							if (wb &&
							  (wb->wb_off == off)) {
								wapbl_blkhash_rem(wr, wb->wb_blk);
							}
							wapbl_circ_advance(wr,
							    fsblklen, &off);
						}
					}
#endif
				}
			}
			break;
		case WAPBL_WC_REVOCATIONS:
		case WAPBL_WC_INODES:
			break;
		default:
			KASSERT(0);
		}
#ifdef DEBUG
		wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
		KASSERT(off == saveoff);
#endif
	}
 out:
	wapbl_free(scratch1, MAXBSIZE);
	wapbl_free(scratch2, MAXBSIZE);
	if (!error && mismatchcnt)
		error = EFTYPE;
	return error;
}
#endif
2766
2767 int
2768 wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp)
2769 {
2770 struct wapbl_blk *wb;
2771 size_t i;
2772 off_t off;
2773 void *scratch;
2774 int error = 0;
2775 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2776
2777 KDASSERT(wapbl_replay_isopen(wr));
2778
2779 scratch = wapbl_alloc(MAXBSIZE);
2780
2781 for (i = 0; i <= wr->wr_blkhashmask; ++i) {
2782 LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) {
2783 off = wb->wb_off;
2784 error = wapbl_circ_read(wr, scratch, fsblklen, &off);
2785 if (error)
2786 break;
2787 error = wapbl_write(scratch, fsblklen, fsdevvp,
2788 wb->wb_blk);
2789 if (error)
2790 break;
2791 }
2792 }
2793
2794 wapbl_free(scratch, MAXBSIZE);
2795 return error;
2796 }
2797
2798 int
2799 wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len)
2800 {
2801 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2802
2803 KDASSERT(wapbl_replay_isopen(wr));
2804 KASSERT((len % fsblklen) == 0);
2805
2806 while (len != 0) {
2807 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2808 if (wb)
2809 return 1;
2810 len -= fsblklen;
2811 }
2812 return 0;
2813 }
2814
/*
 * Read the range [blk, blk + len) into "data", substituting the
 * journalled copy from the log for any block present in the replay
 * hash; blocks not in the hash leave the corresponding part of "data"
 * untouched (caller supplies the on-disk contents).
 * "len" must be a multiple of the fs device block size.
 * NOTE(review): "blk" is stepped by 1 per fs block while the hash keys
 * are inserted at btodb(fsblklen) spacing — these agree only when the
 * fs block size equals DEV_BSIZE; confirm intended units with callers.
 */
int
wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len)
{
	int fsblklen = 1 << wr->wr_fs_dev_bshift;

	KDASSERT(wapbl_replay_isopen(wr));

	KASSERT((len % fsblklen) == 0);

	while (len != 0) {
		struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
		if (wb) {
			/* Overwrite this block with its logged copy. */
			off_t off = wb->wb_off;
			int error;
			error = wapbl_circ_read(wr, data, fsblklen, &off);
			if (error)
				return error;
		}
		data = (uint8_t *)data + fsblklen;
		len -= fsblklen;
		blk++;
	}
	return 0;
}
2839
2840 #ifdef _KERNEL
2841 /*
2842 * This is not really a module now, but maybe on it's way to
2843 * being one some day.
2844 */
2845 MODULE(MODULE_CLASS_VFS, wapbl, NULL);
2846
2847 static int
2848 wapbl_modcmd(modcmd_t cmd, void *arg)
2849 {
2850
2851 switch (cmd) {
2852 case MODULE_CMD_INIT:
2853 wapbl_init();
2854 return 0;
2855 case MODULE_CMD_FINI:
2856 #ifdef notyet
2857 return wapbl_fini(true);
2858 #endif
2859 return EOPNOTSUPP;
2860 default:
2861 return ENOTTY;
2862 }
2863 }
2864 #endif /* _KERNEL */
2865