/*	$NetBSD: vfs_wapbl.c,v 1.46 2011/08/14 12:37:09 christos Exp $	*/

/*-
 * Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * This implements file system independent write-ahead logging.
 */

#define WAPBL_INTERNAL

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.46 2011/08/14 12:37:09 christos Exp $");

#include <sys/param.h>
#include <sys/bitops.h>

#ifdef _KERNEL
#include <sys/param.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/resourcevar.h>
#include <sys/conf.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/kauth.h>
#include <sys/mutex.h>
#include <sys/atomic.h>
#include <sys/wapbl.h>
#include <sys/wapbl_replay.h>

#include <miscfs/specfs/specdev.h>

#if 0 /* notyet */
#define	wapbl_malloc(s) kmem_alloc((s), KM_SLEEP)
#define	wapbl_free(a, s) kmem_free((a), (s))
#define	wapbl_calloc(n, s) kmem_zalloc((n)*(s), KM_SLEEP)
#else
MALLOC_JUSTDEFINE(M_WAPBL, "wapbl", "write-ahead physical block logging");
#define	wapbl_malloc(s) malloc((s), M_WAPBL, M_WAITOK)
#define	wapbl_free(a, s) free((a), M_WAPBL)
#define	wapbl_calloc(n, s) malloc((n)*(s), M_WAPBL, M_WAITOK | M_ZERO)
#endif

static struct sysctllog *wapbl_sysctl;
static int wapbl_flush_disk_cache = 1;
static int wapbl_verbose_commit = 0;

#else /* !_KERNEL */
#include <assert.h>
#include <errno.h>
#include <stdio.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

#include <sys/time.h>
#include <sys/wapbl.h>
#include <sys/wapbl_replay.h>

#define	KDASSERT(x) assert(x)
#define	KASSERT(x) assert(x)
#define	wapbl_malloc(s) malloc(s)
#define	wapbl_free(a, s) free(a)
#define	wapbl_calloc(n, s) calloc((n), (s))

#endif /* !_KERNEL */

/*
 * INTERNAL DATA STRUCTURES
 */

/*
 * This structure holds per-mount log information.
 *
 * Legend:	a = atomic access only
 *		r = read-only after init
 *		l = rwlock held
 *		m = mutex held
 *		lm = rwlock held writing or mutex held
 *		u = unlocked access ok
 *		b = bufcache_lock held
 */
struct wapbl {
	struct vnode *wl_logvp;	/* r:	log here */
	struct vnode *wl_devvp;	/* r:	log on this device */
	struct mount *wl_mount;	/* r:	mountpoint wl is associated with */
	daddr_t wl_logpbn;	/* r:	Physical block number of start of log */
	int wl_log_dev_bshift;	/* r:	logarithm of device block size of log
					device */
	int wl_fs_dev_bshift;	/* r:	logarithm of device block size of
					filesystem device */

	unsigned wl_lock_count;	/* m:	Count of transactions in progress */

	size_t wl_circ_size;	/* r:	Number of bytes in buffer of log */
	size_t wl_circ_off;	/* r:	Number of bytes reserved at start */

	size_t wl_bufcount_max;	/* r:	Number of buffers reserved for log */
	size_t wl_bufbytes_max;	/* r:	Number of buf bytes reserved for log */

	off_t wl_head;		/* l:	Byte offset of log head */
	off_t wl_tail;		/* l:	Byte offset of log tail */
	/*
	 * head == tail == 0 means log is empty
	 * head == tail != 0 means log is full
	 * See the assertions in wapbl_advance() for other boundary conditions.
	 *
	 * Only truncate moves the tail, except when flush sets it to
	 * wl_header_size.  Only flush moves the head, except when truncate
	 * sets it to 0.
	 */

	struct wapbl_wc_header *wl_wc_header;	/* l */
	void *wl_wc_scratch;	/* l:	scratch space (XXX: why?!?) */

	kmutex_t wl_mtx;	/* u:	short-term lock */
	krwlock_t wl_rwlock;	/* u:	File system transaction lock */

	/*
	 * Must be held while accessing
	 * wl_count or wl_bufs or head or tail
	 */

	/*
	 * Callback called from within the flush routine to flush any extra
	 * bits.  Note that flush may be skipped without calling this if
	 * there are no outstanding buffers in the transaction.
	 */
#ifdef _KERNEL
	wapbl_flush_fn_t wl_flush;	/* r */
	wapbl_flush_fn_t wl_flush_abort;/* r */
#endif

	size_t wl_bufbytes;	/* m:	Byte count of pages in wl_bufs */
	size_t wl_bufcount;	/* m:	Count of buffers in wl_bufs */
	size_t wl_bcount;	/* m:	Total bcount of wl_bufs */

	LIST_HEAD(, buf) wl_bufs; /* m:	Buffers in current transaction */

	kcondvar_t wl_reclaimable_cv;	/* m (obviously) */
	size_t wl_reclaimable_bytes;	/* m:	Amount of space available for
						reclamation by truncate */
	int wl_error_count;	/* m:	# of wl_entries with errors */
	size_t wl_reserved_bytes; /* never truncate log smaller than this */

#ifdef WAPBL_DEBUG_BUFBYTES
	size_t wl_unsynced_bufbytes;	/* Byte count of unsynced buffers */
#endif

	daddr_t *wl_deallocblks;/* lm:	address of block */
	int *wl_dealloclens;	/* lm:	size of block */
	int wl_dealloccnt;	/* lm:	total count */
	int wl_dealloclim;	/* l:	max count */

	/* hashtable of inode numbers for allocated but unlinked inodes */
	/* synch ??? */
	LIST_HEAD(wapbl_ino_head, wapbl_ino) *wl_inohash;
	u_long wl_inohashmask;
	int wl_inohashcnt;

	SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction
						   accounting */
};

#ifdef WAPBL_DEBUG_PRINT
int wapbl_debug_print = WAPBL_DEBUG_PRINT;
#endif

/****************************************************************/
#ifdef _KERNEL

#ifdef WAPBL_DEBUG
struct wapbl *wapbl_debug_wl;
#endif

static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail);
static int wapbl_write_blocks(struct wapbl *wl, off_t *offp);
static int wapbl_write_revocations(struct wapbl *wl, off_t *offp);
static int wapbl_write_inodes(struct wapbl *wl, off_t *offp);
#endif /* _KERNEL */

static int wapbl_replay_process(struct wapbl_replay *wr, off_t, off_t);

static inline size_t wapbl_space_free(size_t avail, off_t head,
	off_t tail);
static inline size_t wapbl_space_used(size_t avail, off_t head,
	off_t tail);

#ifdef _KERNEL

#define	WAPBL_INODETRK_SIZE 83
static int wapbl_ino_pool_refcount;
static struct pool wapbl_ino_pool;
struct wapbl_ino {
	LIST_ENTRY(wapbl_ino) wi_hash;
	ino_t wi_ino;
	mode_t wi_mode;
};

static void wapbl_inodetrk_init(struct wapbl *wl, u_int size);
static void wapbl_inodetrk_free(struct wapbl *wl);
static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino);

static size_t wapbl_transaction_len(struct wapbl *wl);
static inline size_t wapbl_transaction_inodes_len(struct wapbl *wl);

#if 0
int wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
#endif

static int wapbl_replay_isopen1(struct wapbl_replay *);

/*
 * This is useful for debugging.  If set, the log will
 * only be truncated when necessary.
 */
int wapbl_lazy_truncate = 0;

struct wapbl_ops wapbl_ops = {
	.wo_wapbl_discard	= wapbl_discard,
	.wo_wapbl_replay_isopen	= wapbl_replay_isopen1,
	.wo_wapbl_replay_can_read = wapbl_replay_can_read,
	.wo_wapbl_replay_read	= wapbl_replay_read,
	.wo_wapbl_add_buf	= wapbl_add_buf,
	.wo_wapbl_remove_buf	= wapbl_remove_buf,
	.wo_wapbl_resize_buf	= wapbl_resize_buf,
	.wo_wapbl_begin		= wapbl_begin,
	.wo_wapbl_end		= wapbl_end,
	.wo_wapbl_junlock_assert= wapbl_junlock_assert,

	/* XXX: the following is only used to say "this is a wapbl buf" */
	.wo_wapbl_biodone	= wapbl_biodone,
};

static int
wapbl_sysctl_init(void)
{
	int rv;
	const struct sysctlnode *rnode, *cnode;

	wapbl_sysctl = NULL;

	rv = sysctl_createv(&wapbl_sysctl, 0, NULL, &rnode,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "vfs", NULL,
	    NULL, 0, NULL, 0,
	    CTL_VFS, CTL_EOL);
	if (rv)
		return rv;

	rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &rnode,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "wapbl",
	    SYSCTL_DESCR("WAPBL journaling options"),
	    NULL, 0, NULL, 0,
	    CTL_CREATE, CTL_EOL);
	if (rv)
		return rv;

	rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_INT, "flush_disk_cache",
	    SYSCTL_DESCR("flush disk cache"),
	    NULL, 0, &wapbl_flush_disk_cache, 0,
	    CTL_CREATE, CTL_EOL);
	if (rv)
		return rv;

	rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_INT, "verbose_commit",
	    SYSCTL_DESCR("show time and size of wapbl log commits"),
	    NULL, 0, &wapbl_verbose_commit, 0,
	    CTL_CREATE, CTL_EOL);
	return rv;
}
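
/*
 * Usage note (added illustration, not from the original source): the
 * knobs created above appear as vfs.wapbl.* integers, so from userland
 * something like
 *
 *	sysctl -w vfs.wapbl.flush_disk_cache=0
 *	sysctl -w vfs.wapbl.verbose_commit=1
 *
 * should disable the DIOCCACHESYNC calls issued in wapbl_write_commit()
 * and make each commit print its timestamp and size, respectively.
 */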

static void
wapbl_init(void)
{
	malloc_type_attach(M_WAPBL);
	wapbl_sysctl_init();
}

#ifdef notyet
static int
wapbl_fini(bool interface)
{
	if (wapbl_sysctl != NULL)
		sysctl_teardown(&wapbl_sysctl);
	return 0;
}
#endif

static int
wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr)
{
	int error, i;

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_start: reusing log with %d inodes\n", wr->wr_inodescnt));

	/*
	 * It's only valid to reuse the replay log if it's
	 * the same as the new log we just opened.
	 */
	KDASSERT(!wapbl_replay_isopen(wr));
	KASSERT(wl->wl_devvp->v_rdev == wr->wr_devvp->v_rdev);
	KASSERT(wl->wl_logpbn == wr->wr_logpbn);
	KASSERT(wl->wl_circ_size == wr->wr_circ_size);
	KASSERT(wl->wl_circ_off == wr->wr_circ_off);
	KASSERT(wl->wl_log_dev_bshift == wr->wr_log_dev_bshift);
	KASSERT(wl->wl_fs_dev_bshift == wr->wr_fs_dev_bshift);

	wl->wl_wc_header->wc_generation = wr->wr_generation + 1;

	for (i = 0; i < wr->wr_inodescnt; i++)
		wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber,
		    wr->wr_inodes[i].wr_imode);

	/* Make sure new transaction won't overwrite old inodes list */
	KDASSERT(wapbl_transaction_len(wl) <=
	    wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead,
	    wr->wr_inodestail));

	wl->wl_head = wl->wl_tail = wr->wr_inodeshead;
	wl->wl_reclaimable_bytes = wl->wl_reserved_bytes =
	    wapbl_transaction_len(wl);

	error = wapbl_write_inodes(wl, &wl->wl_head);
	if (error)
		return error;

	KASSERT(wl->wl_head != wl->wl_tail);
	KASSERT(wl->wl_head != 0);

	return 0;
}

int
wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp,
	daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr,
	wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn)
{
	struct wapbl *wl;
	struct vnode *devvp;
	daddr_t logpbn;
	int error;
	int log_dev_bshift = ilog2(blksize);
	int fs_dev_bshift = log_dev_bshift;
	int run;

	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64
	    " count=%zu blksize=%zu\n", vp, off, count, blksize));

	if (log_dev_bshift > fs_dev_bshift) {
		WAPBL_PRINTF(WAPBL_PRINT_OPEN,
		    ("wapbl: log device's block size cannot be larger "
		     "than filesystem's\n"));
		/*
		 * Not currently implemented, although it could be if
		 * needed someday.
		 */
		return ENOSYS;
	}

	if (off < 0)
		return EINVAL;

	if (blksize < DEV_BSIZE)
		return EINVAL;
	if (blksize % DEV_BSIZE)
		return EINVAL;

	/* XXXTODO: verify that the full load is writable */

	/*
	 * XXX check for minimum log size
	 * minimum is governed by minimum amount of space
	 * to complete a transaction. (probably truncate)
	 */
	/* XXX for now pick something minimal */
	if ((count * blksize) < MAXPHYS) {
		return ENOSPC;
	}

	if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) {
		return error;
	}

	wl = wapbl_calloc(1, sizeof(*wl));
	rw_init(&wl->wl_rwlock);
	mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&wl->wl_reclaimable_cv, "wapblrec");
	LIST_INIT(&wl->wl_bufs);
	SIMPLEQ_INIT(&wl->wl_entries);

	wl->wl_logvp = vp;
	wl->wl_devvp = devvp;
	wl->wl_mount = mp;
	wl->wl_logpbn = logpbn;
	wl->wl_log_dev_bshift = log_dev_bshift;
	wl->wl_fs_dev_bshift = fs_dev_bshift;

	wl->wl_flush = flushfn;
	wl->wl_flush_abort = flushabortfn;

	/* Reserve two log device blocks for the commit headers */
	wl->wl_circ_off = 2<<wl->wl_log_dev_bshift;
	wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off);
	/* truncate the log usage to a multiple of log_dev_bshift */
	wl->wl_circ_size >>= wl->wl_log_dev_bshift;
	wl->wl_circ_size <<= wl->wl_log_dev_bshift;
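
	/*
	 * Resulting on-disk layout, sketched here for clarity (added
	 * illustration, not from the original source):
	 *
	 *	+----------+----------+--------------------------------+
	 *	| commit   | commit   | circular buffer of transaction |
	 *	| header 0 | header 1 | blocks, revocations and inodes |
	 *	+----------+----------+--------------------------------+
	 *	0          1<<bshift  wl_circ_off ..... wl_circ_off +
	 *	                                        wl_circ_size
	 *
	 * where bshift is wl_log_dev_bshift; wapbl_write_commit()
	 * alternates between the two header slots.
	 */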

	/*
	 * wl_bufbytes_max limits the size of the in-memory transaction space.
	 * - Since buffers are allocated and accounted for in units of
	 *   PAGE_SIZE it is required to be a multiple of PAGE_SIZE
	 *   (i.e. 1<<PAGE_SHIFT)
	 * - Since the log device has to be written in units of
	 *   1<<wl_log_dev_bshift it is required to be a multiple of
	 *   1<<wl_log_dev_bshift.
	 * - Since filesystem will provide data in units of 1<<wl_fs_dev_bshift,
	 *   it is convenient to be a multiple of 1<<wl_fs_dev_bshift.
	 * Therefore it must be a multiple of the least common multiple of
	 * those three quantities.  Fortunately, all of those quantities are
	 * guaranteed to be a power of two, and the least common multiple of
	 * a set of numbers which are all powers of two is simply the maximum
	 * of those numbers.  Finally, the maximum logarithm of a power of two
	 * is the same as the log of the maximum power of two.  So we can do
	 * the following operations to size wl_bufbytes_max:
	 */
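
	/*
	 * Worked example (illustrative numbers, not from the original
	 * source): with PAGE_SHIFT = 12 and both bshifts = 9, the largest
	 * of the three alignments is 1<<12 = 4096, and the shift pairs
	 * below round wl_bufbytes_max down to a multiple of it, e.g.
	 * 70000 -> 69632 (17 * 4096).
	 */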

	/* XXX fix actual number of pages reserved per filesystem. */
	wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2);

	/* Round wl_bufbytes_max to the largest power of two constraint */
	wl->wl_bufbytes_max >>= PAGE_SHIFT;
	wl->wl_bufbytes_max <<= PAGE_SHIFT;
	wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift;

	/* XXX maybe use filesystem fragment size instead of 1024 */
	/* XXX fix actual number of buffers reserved per filesystem. */
	wl->wl_bufcount_max = (nbuf / 2) * 1024;

	/* XXX tie this into resource estimation */
	wl->wl_dealloclim = wl->wl_bufbytes_max / mp->mnt_stat.f_bsize / 2;

	wl->wl_deallocblks = wapbl_malloc(sizeof(*wl->wl_deallocblks) *
	    wl->wl_dealloclim);
	wl->wl_dealloclens = wapbl_malloc(sizeof(*wl->wl_dealloclens) *
	    wl->wl_dealloclim);

	wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);

	/* Initialize the commit header */
	{
		struct wapbl_wc_header *wc;
		size_t len = 1 << wl->wl_log_dev_bshift;
		wc = wapbl_calloc(1, len);
		wc->wc_type = WAPBL_WC_HEADER;
		wc->wc_len = len;
		wc->wc_circ_off = wl->wl_circ_off;
		wc->wc_circ_size = wl->wl_circ_size;
		/* XXX wc->wc_fsid */
		wc->wc_log_dev_bshift = wl->wl_log_dev_bshift;
		wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift;
		wl->wl_wc_header = wc;
		wl->wl_wc_scratch = wapbl_malloc(len);
	}

	/*
	 * if there was an existing set of unlinked but
	 * allocated inodes, preserve it in the new
	 * log.
	 */
	if (wr && wr->wr_inodescnt) {
		error = wapbl_start_flush_inodes(wl, wr);
		if (error)
			goto errout;
	}

	error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail);
	if (error) {
		goto errout;
	}

	*wlp = wl;
#if defined(WAPBL_DEBUG)
	wapbl_debug_wl = wl;
#endif

	return 0;
errout:
	wapbl_discard(wl);
	wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_deallocblks,
	    sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim);
	wapbl_free(wl->wl_dealloclens,
	    sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim);
	wapbl_inodetrk_free(wl);
	wapbl_free(wl, sizeof(*wl));

	return error;
}

/*
 * Like wapbl_flush, only discards the transaction
 * completely
 */

void
wapbl_discard(struct wapbl *wl)
{
	struct wapbl_entry *we;
	struct buf *bp;
	int i;

	/*
	 * XXX we may consider using upgrade here
	 * if we want to call flush from inside a transaction
	 */
	rw_enter(&wl->wl_rwlock, RW_WRITER);
	wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
	    wl->wl_dealloccnt);

#ifdef WAPBL_DEBUG_PRINT
	{
		pid_t pid = -1;
		lwpid_t lid = -1;
		if (curproc)
			pid = curproc->p_pid;
		if (curlwp)
			lid = curlwp->l_lid;
#ifdef WAPBL_DEBUG_BUFBYTES
		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
		    ("wapbl_discard: thread %d.%d discarding "
		    "transaction\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %u, reclaimable=%zu reserved=%zu "
		    "unsynced=%zu\n",
		    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
		    wl->wl_bcount, wl->wl_dealloccnt,
		    wl->wl_inohashcnt, wl->wl_error_count,
		    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
		    wl->wl_unsynced_bufbytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d, unsynced = %zu\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error, we->we_unsynced_bufbytes));
		}
#else /* !WAPBL_DEBUG_BUFBYTES */
		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
		    ("wapbl_discard: thread %d.%d discarding transaction\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %u, reclaimable=%zu reserved=%zu\n",
		    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
		    wl->wl_bcount, wl->wl_dealloccnt,
		    wl->wl_inohashcnt, wl->wl_error_count,
		    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error));
		}
#endif /* !WAPBL_DEBUG_BUFBYTES */
	}
#endif /* WAPBL_DEBUG_PRINT */

	for (i = 0; i <= wl->wl_inohashmask; i++) {
		struct wapbl_ino_head *wih;
		struct wapbl_ino *wi;

		wih = &wl->wl_inohash[i];
		while ((wi = LIST_FIRST(wih)) != NULL) {
			LIST_REMOVE(wi, wi_hash);
			pool_put(&wapbl_ino_pool, wi);
			KASSERT(wl->wl_inohashcnt > 0);
			wl->wl_inohashcnt--;
		}
	}

	/*
	 * clean buffer list
	 */
	mutex_enter(&bufcache_lock);
	mutex_enter(&wl->wl_mtx);
	while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
		if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) {
			/*
			 * The buffer will be unlocked and
			 * removed from the transaction in brelse
			 */
			mutex_exit(&wl->wl_mtx);
			brelsel(bp, 0);
			mutex_enter(&wl->wl_mtx);
		}
	}
	mutex_exit(&wl->wl_mtx);
	mutex_exit(&bufcache_lock);

	/*
	 * Remove references to this wl from wl_entries, free any which
	 * no longer have buffers, others will be freed in wapbl_biodone
	 * when they no longer have any buffers.
	 */
	while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) {
		SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
		/* XXX should we be accumulating wl_error_count
		 * and increasing reclaimable bytes ? */
		we->we_wapbl = NULL;
		if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
			KASSERT(we->we_unsynced_bufbytes == 0);
#endif
			wapbl_free(we, sizeof(*we));
		}
	}

	/* Discard list of deallocs */
	wl->wl_dealloccnt = 0;
	/* XXX should we clear wl_reserved_bytes? */

	KASSERT(wl->wl_bufbytes == 0);
	KASSERT(wl->wl_bcount == 0);
	KASSERT(wl->wl_bufcount == 0);
	KASSERT(LIST_EMPTY(&wl->wl_bufs));
	KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
	KASSERT(wl->wl_inohashcnt == 0);

	rw_exit(&wl->wl_rwlock);
}

int
wapbl_stop(struct wapbl *wl, int force)
{
	struct vnode *vp;
	int error;

	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n"));
	error = wapbl_flush(wl, 1);
	if (error) {
		if (force)
			wapbl_discard(wl);
		else
			return error;
	}

	/* Unlinked inodes persist after a flush */
	if (wl->wl_inohashcnt) {
		if (force) {
			wapbl_discard(wl);
		} else {
			return EBUSY;
		}
	}

	KASSERT(wl->wl_bufbytes == 0);
	KASSERT(wl->wl_bcount == 0);
	KASSERT(wl->wl_bufcount == 0);
	KASSERT(LIST_EMPTY(&wl->wl_bufs));
	KASSERT(wl->wl_dealloccnt == 0);
	KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
	KASSERT(wl->wl_inohashcnt == 0);

	vp = wl->wl_logvp;

	wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_deallocblks,
	    sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim);
	wapbl_free(wl->wl_dealloclens,
	    sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim);
	wapbl_inodetrk_free(wl);

	cv_destroy(&wl->wl_reclaimable_cv);
	mutex_destroy(&wl->wl_mtx);
	rw_destroy(&wl->wl_rwlock);
	wapbl_free(wl, sizeof(*wl));

	return 0;
}

static int
wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
{
	struct pstats *pstats = curlwp->l_proc->p_stats;
	struct buf *bp;
	int error;

	KASSERT((flags & ~(B_WRITE | B_READ)) == 0);
	KASSERT(devvp->v_type == VBLK);

	if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
		mutex_enter(devvp->v_interlock);
		devvp->v_numoutput++;
		mutex_exit(devvp->v_interlock);
		pstats->p_ru.ru_oublock++;
	} else {
		pstats->p_ru.ru_inblock++;
	}

	bp = getiobuf(devvp, true);
	bp->b_flags = flags;
	bp->b_cflags = BC_BUSY; /* silly & dubious */
	bp->b_dev = devvp->v_rdev;
	bp->b_data = data;
	bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
	bp->b_blkno = pbn;

	WAPBL_PRINTF(WAPBL_PRINT_IO,
	    ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%"PRIx64"\n",
	    BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount,
	    bp->b_blkno, bp->b_dev));

	VOP_STRATEGY(devvp, bp);

	error = biowait(bp);
	putiobuf(bp);

	if (error) {
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_doio: %s %zu bytes at block %" PRId64
		    " on dev 0x%"PRIx64" failed with error %d\n",
		    (((flags & (B_WRITE | B_READ)) == B_WRITE) ?
		     "write" : "read"),
		    len, pbn, devvp->v_rdev, error));
	}

	return error;
}

int
wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

	return wapbl_doio(data, len, devvp, pbn, B_WRITE);
}

int
wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

	return wapbl_doio(data, len, devvp, pbn, B_READ);
}

/*
 * off is a byte offset; the new offset for the next write is returned
 * in *offp.  Handles log wraparound.
 */
static int
wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp)
{
	size_t slen;
	off_t off = *offp;
	int error;
	daddr_t pbn;

	KDASSERT(((len >> wl->wl_log_dev_bshift) <<
	    wl->wl_log_dev_bshift) == len);

	if (off < wl->wl_circ_off)
		off = wl->wl_circ_off;
	slen = wl->wl_circ_off + wl->wl_circ_size - off;
	if (slen < len) {
		pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
#ifdef _KERNEL
		pbn = btodb(pbn << wl->wl_log_dev_bshift);
#endif
		error = wapbl_write(data, slen, wl->wl_devvp, pbn);
		if (error)
			return error;
		data = (uint8_t *)data + slen;
		len -= slen;
		off = wl->wl_circ_off;
	}
	pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
#ifdef _KERNEL
	pbn = btodb(pbn << wl->wl_log_dev_bshift);
#endif
	error = wapbl_write(data, len, wl->wl_devvp, pbn);
	if (error)
		return error;
	off += len;
	if (off >= wl->wl_circ_off + wl->wl_circ_size)
		off = wl->wl_circ_off;
	*offp = off;
	return 0;
}

/****************************************************************/

int
wapbl_begin(struct wapbl *wl, const char *file, int line)
{
	int doflush;
	unsigned lockcount;

	KDASSERT(wl);

	/*
	 * XXX this needs to be made much more sophisticated.
	 * perhaps each wapbl_begin could reserve a specified
	 * number of buffers and bytes.
	 */
	mutex_enter(&wl->wl_mtx);
	lockcount = wl->wl_lock_count;
	doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) >
		   wl->wl_bufbytes_max / 2) ||
		  ((wl->wl_bufcount + (lockcount * 10)) >
		   wl->wl_bufcount_max / 2) ||
		  (wapbl_transaction_len(wl) > wl->wl_circ_size / 2) ||
		  (wl->wl_dealloccnt >= (wl->wl_dealloclim / 2));
	mutex_exit(&wl->wl_mtx);

	if (doflush) {
		WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		    ("force flush lockcnt=%d bufbytes=%zu "
		    "(max=%zu) bufcount=%zu (max=%zu) "
		    "dealloccnt %d (lim=%d)\n",
		    lockcount, wl->wl_bufbytes,
		    wl->wl_bufbytes_max, wl->wl_bufcount,
		    wl->wl_bufcount_max,
		    wl->wl_dealloccnt, wl->wl_dealloclim));
	}

	if (doflush) {
		int error = wapbl_flush(wl, 0);
		if (error)
			return error;
	}

	rw_enter(&wl->wl_rwlock, RW_READER);
	mutex_enter(&wl->wl_mtx);
	wl->wl_lock_count++;
	mutex_exit(&wl->wl_mtx);

#if defined(WAPBL_DEBUG_PRINT)
	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
	    ("wapbl_begin thread %d.%d with bufcount=%zu "
	    "bufbytes=%zu bcount=%zu at %s:%d\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount, file, line));
#endif

	return 0;
}

void
wapbl_end(struct wapbl *wl)
{

#if defined(WAPBL_DEBUG_PRINT)
	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
	    ("wapbl_end thread %d.%d with bufcount=%zu "
	    "bufbytes=%zu bcount=%zu\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount));
#endif

#ifdef DIAGNOSTIC
	size_t flushsize = wapbl_transaction_len(wl);
	if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
		/*
		 * XXX this could be handled more gracefully, perhaps place
		 * only a partial transaction in the log and allow the
		 * remaining to flush without the protection of the journal.
		 */
		panic("wapbl_end: current transaction too big to flush\n");
	}
#endif

	mutex_enter(&wl->wl_mtx);
	KASSERT(wl->wl_lock_count > 0);
	wl->wl_lock_count--;
	mutex_exit(&wl->wl_mtx);

	rw_exit(&wl->wl_rwlock);
}

void
wapbl_add_buf(struct wapbl *wl, struct buf * bp)
{

	KASSERT(bp->b_cflags & BC_BUSY);
	KASSERT(bp->b_vp);

	wapbl_jlock_assert(wl);

#if 0
	/*
	 * XXX this might be an issue for swapfiles.
	 * see uvm_swap.c:1702
	 *
	 * XXX2 why require it then? leap of semantics?
	 */
	KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
#endif

	mutex_enter(&wl->wl_mtx);
	if (bp->b_flags & B_LOCKED) {
		LIST_REMOVE(bp, b_wapbllist);
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER2,
		    ("wapbl_add_buf thread %d.%d re-adding buf %p "
		    "with %d bytes %d bcount\n",
		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
		    bp->b_bcount));
	} else {
		/* unlocked but dirty buffers shouldn't exist */
		KASSERT(!(bp->b_oflags & BO_DELWRI));
		wl->wl_bufbytes += bp->b_bufsize;
		wl->wl_bcount += bp->b_bcount;
		wl->wl_bufcount++;
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
		    ("wapbl_add_buf thread %d.%d adding buf %p "
		    "with %d bytes %d bcount\n",
		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
		    bp->b_bcount));
	}
	LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist);
	mutex_exit(&wl->wl_mtx);

	bp->b_flags |= B_LOCKED;
}

static void
wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp)
{

	KASSERT(mutex_owned(&wl->wl_mtx));
	KASSERT(bp->b_cflags & BC_BUSY);
	wapbl_jlock_assert(wl);

#if 0
	/*
	 * XXX this might be an issue for swapfiles.
	 * see uvm_swap.c:1725
	 *
	 * XXXdeux: see above
	 */
	KASSERT((bp->b_flags & BC_NOCACHE) == 0);
#endif
	KASSERT(bp->b_flags & B_LOCKED);

	WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
	    ("wapbl_remove_buf thread %d.%d removing buf %p with "
	    "%d bytes %d bcount\n",
	    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount));

	KASSERT(wl->wl_bufbytes >= bp->b_bufsize);
	wl->wl_bufbytes -= bp->b_bufsize;
	KASSERT(wl->wl_bcount >= bp->b_bcount);
	wl->wl_bcount -= bp->b_bcount;
	KASSERT(wl->wl_bufcount > 0);
	wl->wl_bufcount--;
	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
	LIST_REMOVE(bp, b_wapbllist);

	bp->b_flags &= ~B_LOCKED;
}

/* called from brelsel() in vfs_bio among other places */
void
wapbl_remove_buf(struct wapbl * wl, struct buf *bp)
{

	mutex_enter(&wl->wl_mtx);
	wapbl_remove_buf_locked(wl, bp);
	mutex_exit(&wl->wl_mtx);
}

void
wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt)
{

	KASSERT(bp->b_cflags & BC_BUSY);

	/*
	 * XXX: why does this depend on B_LOCKED?  otherwise the buf
	 * is not for a transaction?  if so, why is this called in the
	 * first place?
	 */
	if (bp->b_flags & B_LOCKED) {
		mutex_enter(&wl->wl_mtx);
		wl->wl_bufbytes += bp->b_bufsize - oldsz;
		wl->wl_bcount += bp->b_bcount - oldcnt;
		mutex_exit(&wl->wl_mtx);
	}
}

#endif /* _KERNEL */

/****************************************************************/
/* Some utility inlines */

/* Advance the circular pointer old by delta, wrapping within [off, off+size) */
static inline off_t
wapbl_advance(size_t size, size_t off, off_t old, size_t delta)
{
	off_t new;

	/* Define acceptable ranges for inputs. */
	KASSERT(delta <= (size_t)size);
	KASSERT((old == 0) || ((size_t)old >= off));
	KASSERT(old < (off_t)(size + off));

	if ((old == 0) && (delta != 0))
		new = off + delta;
	else if ((old + delta) < (size + off))
		new = old + delta;
	else
		new = (old + delta) - size;

	/* Note some interesting axioms */
	KASSERT((delta != 0) || (new == old));
	KASSERT((delta == 0) || (new != 0));
	KASSERT((delta != (size)) || (new == old));

	/* Define acceptable ranges for output. */
	KASSERT((new == 0) || ((size_t)new >= off));
	KASSERT((size_t)new < (size + off));
	return new;
}

static inline size_t
wapbl_space_used(size_t avail, off_t head, off_t tail)
{

	if (tail == 0) {
		KASSERT(head == 0);
		return 0;
	}
	return ((head + (avail - 1) - tail) % avail) + 1;
}

static inline size_t
wapbl_space_free(size_t avail, off_t head, off_t tail)
{

	return avail - wapbl_space_used(avail, head, tail);
}

static inline void
wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp,
		   off_t *tailp)
{
	off_t head = *headp;
	off_t tail = *tailp;

	KASSERT(delta <= wapbl_space_free(size, head, tail));
	head = wapbl_advance(size, off, head, delta);
	if ((tail == 0) && (head != 0))
		tail = off;
	*headp = head;
	*tailp = tail;
}

static inline void
wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp,
		   off_t *tailp)
{
	off_t head = *headp;
	off_t tail = *tailp;

	KASSERT(delta <= wapbl_space_used(size, head, tail));
	tail = wapbl_advance(size, off, tail, delta);
	if (head == tail) {
		head = tail = 0;
	}
	*headp = head;
	*tailp = tail;
}

#ifdef _KERNEL

/****************************************************************/

/*
 * Remove transactions whose buffers are completely flushed to disk.
 * Will block until at least minfree space is available.
 * Only intended to be called from inside wapbl_flush and therefore
 * does not protect against commit races with itself or with flush.
 */
static int
wapbl_truncate(struct wapbl *wl, size_t minfree, int waitonly)
{
	size_t delta;
	size_t avail;
	off_t head;
	off_t tail;
	int error = 0;

	KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes));
	KASSERT(rw_write_held(&wl->wl_rwlock));

	mutex_enter(&wl->wl_mtx);

	/*
	 * First check to see if we have to do a commit
	 * at all.
	 */
	avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail);
	if (minfree < avail) {
		mutex_exit(&wl->wl_mtx);
		return 0;
	}
	minfree -= avail;
	while ((wl->wl_error_count == 0) &&
	    (wl->wl_reclaimable_bytes < minfree)) {
		WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
		    ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd "
		    "minfree=%zd\n",
		    &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes,
		    minfree));

		cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx);
	}
	if (wl->wl_reclaimable_bytes < minfree) {
		KASSERT(wl->wl_error_count);
		/* XXX maybe get actual error from buffer instead someday? */
		error = EIO;
	}
	head = wl->wl_head;
	tail = wl->wl_tail;
	delta = wl->wl_reclaimable_bytes;

	/* If all of the entries are flushed, then be sure to keep
	 * the reserved bytes reserved.  Watch out for discarded transactions,
	 * which could leave more bytes reserved than are reclaimable.
	 */
	if (SIMPLEQ_EMPTY(&wl->wl_entries) &&
	    (delta >= wl->wl_reserved_bytes)) {
		delta -= wl->wl_reserved_bytes;
	}
	wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head,
	    &tail);
	KDASSERT(wl->wl_reserved_bytes <=
	    wapbl_space_used(wl->wl_circ_size, head, tail));
	mutex_exit(&wl->wl_mtx);

	if (error)
		return error;

	if (waitonly)
		return 0;

	/*
	 * This is where head, tail and delta are unprotected
	 * from races against itself or flush.  This is ok since
	 * we only call this routine from inside flush itself.
	 *
	 * XXX: how can it race against itself when accessed only
	 * from behind the write-locked rwlock?
	 */
	error = wapbl_write_commit(wl, head, tail);
	if (error)
		return error;

	wl->wl_head = head;
	wl->wl_tail = tail;

	mutex_enter(&wl->wl_mtx);
	KASSERT(wl->wl_reclaimable_bytes >= delta);
	wl->wl_reclaimable_bytes -= delta;
	mutex_exit(&wl->wl_mtx);
	WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
	    ("wapbl_truncate thread %d.%d truncating %zu bytes\n",
	    curproc->p_pid, curlwp->l_lid, delta));

	return 0;
}

/****************************************************************/

void
wapbl_biodone(struct buf *bp)
{
	struct wapbl_entry *we = bp->b_private;
	struct wapbl *wl = we->we_wapbl;

	/*
	 * Handle possible flushing of buffers after log has been
	 * decommissioned.
	 */
	if (!wl) {
		KASSERT(we->we_bufcount > 0);
		we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
		KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize);
		we->we_unsynced_bufbytes -= bp->b_bufsize;
#endif

		if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
			KASSERT(we->we_unsynced_bufbytes == 0);
#endif
			wapbl_free(we, sizeof(*we));
		}

		brelse(bp, 0);
		return;
	}

#ifdef ohbother
	KDASSERT(bp->b_oflags & BO_DONE);
	KDASSERT(!(bp->b_oflags & BO_DELWRI));
	KDASSERT(bp->b_flags & B_ASYNC);
	KDASSERT(bp->b_cflags & BC_BUSY);
	KDASSERT(!(bp->b_flags & B_LOCKED));
	KDASSERT(!(bp->b_flags & B_READ));
	KDASSERT(!(bp->b_cflags & BC_INVAL));
	KDASSERT(!(bp->b_cflags & BC_NOCACHE));
#endif

	if (bp->b_error) {
#ifdef notyet /* Can't currently handle possible dirty buffer reuse */
		/*
		 * XXXpooka: interfaces not fully updated
		 * Note: this was not enabled in the original patch
		 * against netbsd4 either.  I don't know if comment
		 * above is true or not.
		 */

		/*
		 * If an error occurs, report the error and leave the
		 * buffer as a delayed write on the LRU queue.
		 * restarting the write would likely result in
		 * an error spinloop, so let it be done harmlessly
		 * by the syncer.
		 */
		bp->b_flags &= ~(B_DONE);
		simple_unlock(&bp->b_interlock);

		if (we->we_error == 0) {
			mutex_enter(&wl->wl_mtx);
			wl->wl_error_count++;
			mutex_exit(&wl->wl_mtx);
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
		we->we_error = bp->b_error;
		bp->b_error = 0;
		brelse(bp);
		return;
#else
		/* For now, just mark the log permanently errored out */

		mutex_enter(&wl->wl_mtx);
		if (wl->wl_error_count == 0) {
			wl->wl_error_count++;
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
		mutex_exit(&wl->wl_mtx);
#endif
	}

	mutex_enter(&wl->wl_mtx);

	KASSERT(we->we_bufcount > 0);
	we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
	KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize);
	we->we_unsynced_bufbytes -= bp->b_bufsize;
	KASSERT(wl->wl_unsynced_bufbytes >= bp->b_bufsize);
	wl->wl_unsynced_bufbytes -= bp->b_bufsize;
#endif

	/*
	 * If the current transaction can be reclaimed, start
	 * at the beginning and reclaim any consecutive reclaimable
	 * transactions.  If we successfully reclaim anything,
	 * then wakeup anyone waiting for the reclaim.
	 */
	if (we->we_bufcount == 0) {
		size_t delta = 0;
		int errcnt = 0;
#ifdef WAPBL_DEBUG_BUFBYTES
		KDASSERT(we->we_unsynced_bufbytes == 0);
#endif
		/*
		 * clear any posted error, since the buffer it came from
		 * has successfully flushed by now
		 */
		while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) &&
		       (we->we_bufcount == 0)) {
			delta += we->we_reclaimable_bytes;
			if (we->we_error)
				errcnt++;
			SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
			wapbl_free(we, sizeof(*we));
		}

		if (delta) {
			wl->wl_reclaimable_bytes += delta;
			KASSERT(wl->wl_error_count >= errcnt);
			wl->wl_error_count -= errcnt;
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
	}

	mutex_exit(&wl->wl_mtx);
	brelse(bp, 0);
}

/*
 * Write transactions to disk + start I/O for contents
 */
int
wapbl_flush(struct wapbl *wl, int waitfor)
{
	struct buf *bp;
	struct wapbl_entry *we;
	off_t off;
	off_t head;
	off_t tail;
	size_t delta = 0;
	size_t flushsize;
	size_t reserved;
	int error = 0;

	/*
	 * Do a quick check to see if a full flush can be skipped.
	 * This assumes that the flush callback does not need to be called
	 * unless there are other outstanding bufs.
	 */
	if (!waitfor) {
		size_t nbufs;
		mutex_enter(&wl->wl_mtx);	/* XXX need mutex here to
						   protect the KASSERTS */
		nbufs = wl->wl_bufcount;
		KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
		KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
		mutex_exit(&wl->wl_mtx);
		if (nbufs == 0)
			return 0;
	}

	/*
	 * XXX we may consider using LK_UPGRADE here
	 * if we want to call flush from inside a transaction
	 */
	rw_enter(&wl->wl_rwlock, RW_WRITER);
	wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
	    wl->wl_dealloccnt);

	/*
	 * Now that we are fully locked and flushed,
	 * do another check for nothing to do.
	 */
	if (wl->wl_bufcount == 0) {
		goto out;
	}

#if 0
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush thread %d.%d flushing entries with "
	    "bufcount=%zu bufbytes=%zu\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes));
#endif

	/* Calculate amount of space needed to flush */
	flushsize = wapbl_transaction_len(wl);
	if (wapbl_verbose_commit) {
		struct timespec ts;
		getnanotime(&ts);
		printf("%s: %lld.%09ld this transaction = %zu bytes\n",
		    __func__, (long long)ts.tv_sec,
		    (long)ts.tv_nsec, flushsize);
	}

	if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
		/*
		 * XXX this could be handled more gracefully, perhaps place
		 * only a partial transaction in the log and allow the
		 * remaining to flush without the protection of the journal.
		 */
		panic("wapbl_flush: current transaction too big to flush\n");
	}

	error = wapbl_truncate(wl, flushsize, 0);
	if (error)
		goto out2;

	off = wl->wl_head;
	KASSERT((off == 0) || ((off >= wl->wl_circ_off) &&
	    (off < wl->wl_circ_off + wl->wl_circ_size)));
	error = wapbl_write_blocks(wl, &off);
	if (error)
		goto out2;
	error = wapbl_write_revocations(wl, &off);
	if (error)
		goto out2;
	error = wapbl_write_inodes(wl, &off);
	if (error)
		goto out2;

	reserved = 0;
	if (wl->wl_inohashcnt)
		reserved = wapbl_transaction_inodes_len(wl);

	head = wl->wl_head;
	tail = wl->wl_tail;

	wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize,
	    &head, &tail);
#ifdef WAPBL_DEBUG
	if (head != off) {
		panic("lost head! head=%"PRIdMAX" tail=%" PRIdMAX
		      " off=%"PRIdMAX" flush=%zu\n",
		      (intmax_t)head, (intmax_t)tail, (intmax_t)off,
		      flushsize);
	}
#else
	KASSERT(head == off);
#endif

	/* Opportunistically move the tail forward if we can */
	if (!wapbl_lazy_truncate) {
		mutex_enter(&wl->wl_mtx);
		delta = wl->wl_reclaimable_bytes;
		mutex_exit(&wl->wl_mtx);
		wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta,
		    &head, &tail);
	}

	error = wapbl_write_commit(wl, head, tail);
	if (error)
		goto out2;

	we = wapbl_calloc(1, sizeof(*we));

#ifdef WAPBL_DEBUG_BUFBYTES
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
	    " unsynced=%zu"
	    "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
	    "inodes=%d\n",
	    curproc->p_pid, curlwp->l_lid, flushsize, delta,
	    wapbl_space_used(wl->wl_circ_size, head, tail),
	    wl->wl_unsynced_bufbytes, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt,
	    wl->wl_inohashcnt));
#else
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
	    "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
	    "inodes=%d\n",
	    curproc->p_pid, curlwp->l_lid, flushsize, delta,
	    wapbl_space_used(wl->wl_circ_size, head, tail),
	    wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
	    wl->wl_dealloccnt, wl->wl_inohashcnt));
#endif


	mutex_enter(&bufcache_lock);
	mutex_enter(&wl->wl_mtx);

	wl->wl_reserved_bytes = reserved;
	wl->wl_head = head;
	wl->wl_tail = tail;
	KASSERT(wl->wl_reclaimable_bytes >= delta);
	wl->wl_reclaimable_bytes -= delta;
	wl->wl_dealloccnt = 0;
#ifdef WAPBL_DEBUG_BUFBYTES
	wl->wl_unsynced_bufbytes += wl->wl_bufbytes;
#endif

	we->we_wapbl = wl;
	we->we_bufcount = wl->wl_bufcount;
#ifdef WAPBL_DEBUG_BUFBYTES
	we->we_unsynced_bufbytes = wl->wl_bufbytes;
#endif
	we->we_reclaimable_bytes = flushsize;
	we->we_error = 0;
	SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries);

	/*
	 * This flushes bufs in reverse order from the way they were queued.
	 * It shouldn't matter, but if we care we could use TAILQ instead.
	 * XXX Note they will get put on the lru queue when they flush
	 * so we might actually want to change this to preserve order.
	 */
	while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
		if (bbusy(bp, 0, 0, &wl->wl_mtx)) {
			continue;
		}
		bp->b_iodone = wapbl_biodone;
		bp->b_private = we;
		bremfree(bp);
		wapbl_remove_buf_locked(wl, bp);
		mutex_exit(&wl->wl_mtx);
		mutex_exit(&bufcache_lock);
		bawrite(bp);
		mutex_enter(&bufcache_lock);
		mutex_enter(&wl->wl_mtx);
	}
	mutex_exit(&wl->wl_mtx);
	mutex_exit(&bufcache_lock);

#if 0
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush thread %d.%d done flushing entries...\n",
	    curproc->p_pid, curlwp->l_lid));
#endif

out:

	/*
	 * If the waitfor flag is set, don't return until everything is
	 * fully flushed and the on disk log is empty.
	 */
	if (waitfor) {
		error = wapbl_truncate(wl, wl->wl_circ_size -
		    wl->wl_reserved_bytes, wapbl_lazy_truncate);
	}

out2:
	if (error) {
		wl->wl_flush_abort(wl->wl_mount, wl->wl_deallocblks,
		    wl->wl_dealloclens, wl->wl_dealloccnt);
	}

#ifdef WAPBL_DEBUG_PRINT
	if (error) {
		pid_t pid = -1;
		lwpid_t lid = -1;
		if (curproc)
			pid = curproc->p_pid;
		if (curlwp)
			lid = curlwp->l_lid;
		mutex_enter(&wl->wl_mtx);
#ifdef WAPBL_DEBUG_BUFBYTES
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_flush: thread %d.%d aborted flush: "
		    "error = %d\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %d, reclaimable=%zu reserved=%zu "
		    "unsynced=%zu\n",
		    pid, lid, error, wl->wl_bufcount,
		    wl->wl_bufbytes, wl->wl_bcount,
		    wl->wl_dealloccnt, wl->wl_inohashcnt,
		    wl->wl_error_count, wl->wl_reclaimable_bytes,
		    wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_ERROR,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d, unsynced = %zu\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error, we->we_unsynced_bufbytes));
		}
#else
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_flush: thread %d.%d aborted flush: "
		    "error = %d\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %d, reclaimable=%zu reserved=%zu\n",
		    pid, lid, error, wl->wl_bufcount,
		    wl->wl_bufbytes, wl->wl_bcount,
		    wl->wl_dealloccnt, wl->wl_inohashcnt,
		    wl->wl_error_count, wl->wl_reclaimable_bytes,
		    wl->wl_reserved_bytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_ERROR,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d\n", we->we_bufcount,
			    we->we_reclaimable_bytes, we->we_error));
		}
#endif
		mutex_exit(&wl->wl_mtx);
	}
#endif

	rw_exit(&wl->wl_rwlock);
	return error;
}

/****************************************************************/

void
wapbl_jlock_assert(struct wapbl *wl)
{

	KASSERT(rw_lock_held(&wl->wl_rwlock));
}

void
wapbl_junlock_assert(struct wapbl *wl)
{

	KASSERT(!rw_write_held(&wl->wl_rwlock));
}

/****************************************************************/

/* locks missing */
void
wapbl_print(struct wapbl *wl,
		int full,
		void (*pr)(const char *, ...))
{
	struct buf *bp;
	struct wapbl_entry *we;
	(*pr)("wapbl %p", wl);
	(*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n",
	      wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn);
	(*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n",
	      wl->wl_circ_size, wl->wl_circ_off,
	      (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail);
1654 (*pr)("fs_dev_bshift = %d, log_dev_bshift = %d\n",
1655 wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift);
#ifdef WAPBL_DEBUG_BUFBYTES
	(*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
	      "reserved = %zu errcnt = %d unsynced = %zu\n",
	      wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
	      wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
	      wl->wl_error_count, wl->wl_unsynced_bufbytes);
#else
	(*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
	      "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes,
	      wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
	      wl->wl_error_count);
#endif
	(*pr)("\tdealloccnt = %d, dealloclim = %d\n",
	      wl->wl_dealloccnt, wl->wl_dealloclim);
1670 (*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n",
1671 wl->wl_inohashcnt, wl->wl_inohashmask);
1672 (*pr)("entries:\n");
1673 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
1674 #ifdef WAPBL_DEBUG_BUFBYTES
1675 (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, "
1676 "unsynced = %zu\n",
1677 we->we_bufcount, we->we_reclaimable_bytes,
1678 we->we_error, we->we_unsynced_bufbytes);
1679 #else
1680 (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n",
1681 we->we_bufcount, we->we_reclaimable_bytes, we->we_error);
1682 #endif
1683 }
1684 if (full) {
1685 int cnt = 0;
1686 (*pr)("bufs =");
1687 LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) {
1688 if (!LIST_NEXT(bp, b_wapbllist)) {
1689 (*pr)(" %p", bp);
1690 } else if ((++cnt % 6) == 0) {
1691 (*pr)(" %p,\n\t", bp);
1692 } else {
1693 (*pr)(" %p,", bp);
1694 }
1695 }
1696 (*pr)("\n");
1697
1698 (*pr)("dealloced blks = ");
1699 {
1700 int i;
1701 cnt = 0;
1702 for (i = 0; i < wl->wl_dealloccnt; i++) {
1703 (*pr)(" %"PRId64":%d,",
1704 wl->wl_deallocblks[i],
1705 wl->wl_dealloclens[i]);
1706 if ((++cnt % 4) == 0) {
1707 (*pr)("\n\t");
1708 }
1709 }
1710 }
1711 (*pr)("\n");
1712
1713 (*pr)("registered inodes = ");
1714 {
1715 int i;
1716 cnt = 0;
1717 for (i = 0; i <= wl->wl_inohashmask; i++) {
1718 struct wapbl_ino_head *wih;
1719 struct wapbl_ino *wi;
1720
1721 wih = &wl->wl_inohash[i];
1722 LIST_FOREACH(wi, wih, wi_hash) {
1723 if (wi->wi_ino == 0)
1724 continue;
1725 (*pr)(" %"PRId32"/0%06"PRIo32",",
1726 wi->wi_ino, wi->wi_mode);
1727 if ((++cnt % 4) == 0) {
1728 (*pr)("\n\t");
1729 }
1730 }
1731 }
1732 (*pr)("\n");
1733 }
1734 }
1735 }
1736
1737 #if defined(WAPBL_DEBUG) || defined(DDB)
1738 void
1739 wapbl_dump(struct wapbl *wl)
1740 {
1741 #if defined(WAPBL_DEBUG)
1742 if (!wl)
1743 wl = wapbl_debug_wl;
1744 #endif
1745 if (!wl)
1746 return;
1747 wapbl_print(wl, 1, printf);
1748 }
1749 #endif
1750
/****************************************************************/

void
wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len)
{

	wapbl_jlock_assert(wl);

	mutex_enter(&wl->wl_mtx);
	/* XXX should eventually instead tie this into resource estimation */
	/*
	 * XXX this panic needs locking/mutex analysis and the
	 * ability to cope with the failure.
	 */
	/* XXX this XXX doesn't have enough XXX */
	if (__predict_false(wl->wl_dealloccnt >= wl->wl_dealloclim))
		panic("wapbl_register_deallocation: out of resources");

	wl->wl_deallocblks[wl->wl_dealloccnt] = blk;
	wl->wl_dealloclens[wl->wl_dealloccnt] = len;
	wl->wl_dealloccnt++;
	WAPBL_PRINTF(WAPBL_PRINT_ALLOC,
	    ("wapbl_register_deallocation: blk=%"PRId64" len=%d\n", blk, len));
	mutex_exit(&wl->wl_mtx);
}

/****************************************************************/

static void
wapbl_inodetrk_init(struct wapbl *wl, u_int size)
{

	wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask);
	if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) {
		pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0,
		    "wapblinopl", &pool_allocator_nointr, IPL_NONE);
	}
}

static void
wapbl_inodetrk_free(struct wapbl *wl)
{

	/* XXX this KASSERT needs locking/mutex analysis */
	KASSERT(wl->wl_inohashcnt == 0);
	hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask);
	if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) {
		pool_destroy(&wapbl_ino_pool);
	}
}

static struct wapbl_ino *
wapbl_inodetrk_get(struct wapbl *wl, ino_t ino)
{
	struct wapbl_ino_head *wih;
	struct wapbl_ino *wi;

	KASSERT(mutex_owned(&wl->wl_mtx));

	wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
	LIST_FOREACH(wi, wih, wi_hash) {
		if (ino == wi->wi_ino)
			return wi;
	}
	return 0;
}

void
wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode)
{
	struct wapbl_ino_head *wih;
	struct wapbl_ino *wi;

	wi = pool_get(&wapbl_ino_pool, PR_WAITOK);

	mutex_enter(&wl->wl_mtx);
	if (wapbl_inodetrk_get(wl, ino) == NULL) {
		wi->wi_ino = ino;
		wi->wi_mode = mode;
		wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
		LIST_INSERT_HEAD(wih, wi, wi_hash);
		wl->wl_inohashcnt++;
		WAPBL_PRINTF(WAPBL_PRINT_INODE,
		    ("wapbl_register_inode: ino=%"PRId64"\n", ino));
		mutex_exit(&wl->wl_mtx);
	} else {
		mutex_exit(&wl->wl_mtx);
		pool_put(&wapbl_ino_pool, wi);
	}
}

void
wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode)
{
	struct wapbl_ino *wi;

	mutex_enter(&wl->wl_mtx);
	wi = wapbl_inodetrk_get(wl, ino);
	if (wi) {
		WAPBL_PRINTF(WAPBL_PRINT_INODE,
		    ("wapbl_unregister_inode: ino=%"PRId64"\n", ino));
		KASSERT(wl->wl_inohashcnt > 0);
		wl->wl_inohashcnt--;
		LIST_REMOVE(wi, wi_hash);
		mutex_exit(&wl->wl_mtx);

		pool_put(&wapbl_ino_pool, wi);
	} else {
		mutex_exit(&wl->wl_mtx);
	}
}

/****************************************************************/

static inline size_t
wapbl_transaction_inodes_len(struct wapbl *wl)
{
	int blocklen = 1<<wl->wl_log_dev_bshift;
	int iph;

	/* Calculate number of inodes described in an inodelist header */
	iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
	    sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);

	KASSERT(iph > 0);

	return MAX(1, howmany(wl->wl_inohashcnt, iph)) * blocklen;
}
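
/*
 * Example (illustrative numbers, not from the original source): with a
 * 512-byte log block, a 16-byte wapbl_wc_inodelist header and 8-byte
 * inode records, iph = (512 - 16) / 8 = 62, so 100 registered inodes
 * would need howmany(100, 62) = 2 blocks, i.e. 1024 bytes.
 */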

/* Calculate amount of space a transaction will take on disk */
static size_t
wapbl_transaction_len(struct wapbl *wl)
{
	int blocklen = 1<<wl->wl_log_dev_bshift;
	size_t len;
	int bph;

	/* Calculate number of blocks described in a blocklist header */
	bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
	    sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);

	KASSERT(bph > 0);

	len = wl->wl_bcount;
	len += howmany(wl->wl_bufcount, bph) * blocklen;
	len += howmany(wl->wl_dealloccnt, bph) * blocklen;
	len += wapbl_transaction_inodes_len(wl);

	return len;
}
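
/*
 * Example (illustrative numbers, not from the original source): with
 * blocklen = 512 and bph = 31, a transaction of 10 buffers totalling
 * bcount = 40960 bytes, with 3 pending deallocations and no registered
 * inodes, comes to 40960 + 1*512 + 1*512 + 512 = 42496 bytes of log
 * space: the data itself plus one blocklist header, one revocation
 * blocklist header, and one (empty) inodelist block.
 */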

/*
 * Perform commit operation
 *
 * Note that generation number incrementation needs to
 * be protected against racing with other invocations
 * of wapbl_write_commit.  This is ok since this routine
 * is only invoked from wapbl_flush
 */
static int
wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail)
{
	struct wapbl_wc_header *wc = wl->wl_wc_header;
	struct timespec ts;
	int error;
	int force = 1;
	daddr_t pbn;

	if (wapbl_flush_disk_cache) {
		/* XXX Calc checksum here, instead we do this for now */
		error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force,
		    FWRITE, FSCRED);
		if (error) {
			WAPBL_PRINTF(WAPBL_PRINT_ERROR,
			    ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%x "
			    "returned %d\n", wl->wl_devvp->v_rdev, error));
		}
	}

	wc->wc_head = head;
	wc->wc_tail = tail;
	wc->wc_checksum = 0;
	wc->wc_version = 1;
	getnanotime(&ts);
	wc->wc_time = ts.tv_sec;
	wc->wc_timensec = ts.tv_nsec;

1939 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
1940 ("wapbl_write_commit: head = %"PRIdMAX "tail = %"PRIdMAX"\n",
1941 (intmax_t)head, (intmax_t)tail));
1942
1943 /*
1944 * XXX if generation will rollover, then first zero
1945 * over second commit header before trying to write both headers.
1946 */
1947
1948 pbn = wl->wl_logpbn + (wc->wc_generation % 2);
1949 #ifdef _KERNEL
1950 pbn = btodb(pbn << wc->wc_log_dev_bshift);
1951 #endif
1952 error = wapbl_write(wc, wc->wc_len, wl->wl_devvp, pbn);
1953 if (error)
1954 return error;
1955
1956 if (wapbl_flush_disk_cache) {
1957 error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force,
1958 FWRITE, FSCRED);
1959 if (error) {
1960 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1961 ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%x "
1962 "returned %d\n", wl->wl_devvp->v_rdev, error));
1963 }
1964 }
1965
1966 /*
1967 * If the generation number was zero, write it out a second time.
1968 * This handles initialization and generation number rollover
1969 */
1970 if (wc->wc_generation++ == 0) {
1971 error = wapbl_write_commit(wl, head, tail);
1972 /*
1973 * This panic should be able to be removed if we do the
1974 * zero'ing mentioned above, and we are certain to roll
1975 * back generation number on failure.
1976 */
1977 if (error)
1978 panic("wapbl_write_commit: error writing duplicate "
1979 "log header: %d\n", error);
1980 }
1981 return 0;
1982 }
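
/*
 * Editorial sketch (not in the original source): the two commit header
 * slots alternate by generation, so a reader recovers the most recent
 * commit by loading both slots and keeping the one with the larger
 * generation number, exactly as wapbl_replay_start() does below.
 */
#if 0 /* example only */
static const struct wapbl_wc_header *
example_pick_newest_commit(const struct wapbl_wc_header *slot0,
    const struct wapbl_wc_header *slot1)
{
	/* The header for generation g is written at wl_logpbn + (g % 2). */
	return (slot1->wc_generation > slot0->wc_generation) ? slot1 : slot0;
}
#endif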

/* Write the buffer list to the log; *offp is updated to the new offset */
static int
wapbl_write_blocks(struct wapbl *wl, off_t *offp)
{
	struct wapbl_wc_blocklist *wc =
	    (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
	int blocklen = 1<<wl->wl_log_dev_bshift;
	int bph;
	struct buf *bp;
	off_t off = *offp;
	int error;
	size_t padding;

	KASSERT(rw_write_held(&wl->wl_rwlock));

	bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
	    sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);

	bp = LIST_FIRST(&wl->wl_bufs);

	while (bp) {
		int cnt;
		struct buf *obp = bp;

		KASSERT(bp->b_flags & B_LOCKED);

		wc->wc_type = WAPBL_WC_BLOCKS;
		wc->wc_len = blocklen;
		wc->wc_blkcount = 0;
		while (bp && (wc->wc_blkcount < bph)) {
			/*
			 * Make sure all the physical block numbers are up to
			 * date.  If this is not always true on a given
			 * filesystem, then VOP_BMAP must be called.  We
			 * could call VOP_BMAP here, or else in the filesystem
			 * specific flush callback, although neither of those
			 * solutions allow us to take the vnode lock.  If a
			 * filesystem requires that we must take the vnode lock
			 * to call VOP_BMAP, then we can probably do it in
			 * bwrite when the vnode lock should already be held
			 * by the invoking code.
			 */
			KASSERT((bp->b_vp->v_type == VBLK) ||
			    (bp->b_blkno != bp->b_lblkno));
			KASSERT(bp->b_blkno > 0);

			wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno;
			wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount;
			wc->wc_len += bp->b_bcount;
			wc->wc_blkcount++;
			bp = LIST_NEXT(bp, b_wapbllist);
		}
		if (wc->wc_len % blocklen != 0) {
			padding = blocklen - wc->wc_len % blocklen;
			wc->wc_len += padding;
		} else {
			padding = 0;
		}

		WAPBL_PRINTF(WAPBL_PRINT_WRITE,
		    ("wapbl_write_blocks: len = %u (padding %zu) off = %"PRIdMAX"\n",
		    wc->wc_len, padding, (intmax_t)off));

		error = wapbl_circ_write(wl, wc, blocklen, &off);
		if (error)
			return error;
		bp = obp;
		cnt = 0;
		while (bp && (cnt++ < bph)) {
			error = wapbl_circ_write(wl, bp->b_data,
			    bp->b_bcount, &off);
			if (error)
				return error;
			bp = LIST_NEXT(bp, b_wapbllist);
		}
		if (padding) {
			void *zero;

			zero = wapbl_malloc(padding);
			memset(zero, 0, padding);
			error = wapbl_circ_write(wl, zero, padding, &off);
			wapbl_free(zero, padding);
			if (error)
				return error;
		}
	}
	*offp = off;
	return 0;
}
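
/*
 * Resulting on-disk record layout (editorial illustration): each
 * iteration emits one blocklist header block followed by the raw
 * contents of the buffers it describes, zero-padded to a multiple
 * of the log device block size:
 *
 *	[header][data data ... pad][header][data ...] ...
 */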

static int
wapbl_write_revocations(struct wapbl *wl, off_t *offp)
{
	struct wapbl_wc_blocklist *wc =
	    (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
	int i;
	int blocklen = 1<<wl->wl_log_dev_bshift;
	int bph;
	off_t off = *offp;
	int error;

	if (wl->wl_dealloccnt == 0)
		return 0;

	bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
	    sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);

	i = 0;
	while (i < wl->wl_dealloccnt) {
		wc->wc_type = WAPBL_WC_REVOCATIONS;
		wc->wc_len = blocklen;
		wc->wc_blkcount = 0;
		while ((i < wl->wl_dealloccnt) && (wc->wc_blkcount < bph)) {
			wc->wc_blocks[wc->wc_blkcount].wc_daddr =
			    wl->wl_deallocblks[i];
			wc->wc_blocks[wc->wc_blkcount].wc_dlen =
			    wl->wl_dealloclens[i];
			wc->wc_blkcount++;
			i++;
		}
		WAPBL_PRINTF(WAPBL_PRINT_WRITE,
		    ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n",
		    wc->wc_len, (intmax_t)off));
		error = wapbl_circ_write(wl, wc, blocklen, &off);
		if (error)
			return error;
	}
	*offp = off;
	return 0;
}

static int
wapbl_write_inodes(struct wapbl *wl, off_t *offp)
{
	struct wapbl_wc_inodelist *wc =
	    (struct wapbl_wc_inodelist *)wl->wl_wc_scratch;
	int i;
	int blocklen = 1 << wl->wl_log_dev_bshift;
	off_t off = *offp;
	int error;

	struct wapbl_ino_head *wih;
	struct wapbl_ino *wi;
	int iph;

	iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
	    sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);

	i = 0;
	wih = &wl->wl_inohash[0];
	wi = NULL;
	do {
		wc->wc_type = WAPBL_WC_INODES;
		wc->wc_len = blocklen;
		wc->wc_inocnt = 0;
		wc->wc_clear = (i == 0);
		while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) {
			while (!wi) {
				KASSERT((wih - &wl->wl_inohash[0])
				    <= wl->wl_inohashmask);
				wi = LIST_FIRST(wih++);
			}
			wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino;
			wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode;
			wc->wc_inocnt++;
			i++;
			wi = LIST_NEXT(wi, wi_hash);
		}
		WAPBL_PRINTF(WAPBL_PRINT_WRITE,
		    ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n",
		    wc->wc_len, (intmax_t)off));
		error = wapbl_circ_write(wl, wc, blocklen, &off);
		if (error)
			return error;
	} while (i < wl->wl_inohashcnt);

	*offp = off;
	return 0;
}

#endif /* _KERNEL */

/****************************************************************/

struct wapbl_blk {
	LIST_ENTRY(wapbl_blk) wb_hash;
	daddr_t wb_blk;
	off_t wb_off; /* Offset of this block in the log */
};
#define WAPBL_BLKPOOL_MIN 83

static void
wapbl_blkhash_init(struct wapbl_replay *wr, u_int size)
{
	if (size < WAPBL_BLKPOOL_MIN)
		size = WAPBL_BLKPOOL_MIN;
	KASSERT(wr->wr_blkhash == NULL);
#ifdef _KERNEL
	wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask);
#else /* ! _KERNEL */
	/* Manually implement hashinit */
	{
		unsigned long i, hashsize;
		for (hashsize = 1; hashsize < size; hashsize <<= 1)
			continue;
		wr->wr_blkhash = wapbl_malloc(hashsize * sizeof(*wr->wr_blkhash));
		for (i = 0; i < hashsize; i++)
			LIST_INIT(&wr->wr_blkhash[i]);
		wr->wr_blkhashmask = hashsize - 1;
	}
#endif /* ! _KERNEL */
}

static void
wapbl_blkhash_free(struct wapbl_replay *wr)
{
	KASSERT(wr->wr_blkhashcnt == 0);
#ifdef _KERNEL
	hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask);
#else /* ! _KERNEL */
	wapbl_free(wr->wr_blkhash,
	    (wr->wr_blkhashmask + 1) * sizeof(*wr->wr_blkhash));
#endif /* ! _KERNEL */
}

static struct wapbl_blk *
wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk)
{
	struct wapbl_blk_head *wbh;
	struct wapbl_blk *wb;
	wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
	LIST_FOREACH(wb, wbh, wb_hash) {
		if (blk == wb->wb_blk)
			return wb;
	}
	return NULL;
}

static void
wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off)
{
	struct wapbl_blk_head *wbh;
	struct wapbl_blk *wb;
	wb = wapbl_blkhash_get(wr, blk);
	if (wb) {
		KASSERT(wb->wb_blk == blk);
		wb->wb_off = off;
	} else {
		wb = wapbl_malloc(sizeof(*wb));
		wb->wb_blk = blk;
		wb->wb_off = off;
		wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
		LIST_INSERT_HEAD(wbh, wb, wb_hash);
		wr->wr_blkhashcnt++;
	}
}

static void
wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk)
{
	struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
	if (wb) {
		KASSERT(wr->wr_blkhashcnt > 0);
		wr->wr_blkhashcnt--;
		LIST_REMOVE(wb, wb_hash);
		wapbl_free(wb, sizeof(*wb));
	}
}

static void
wapbl_blkhash_clear(struct wapbl_replay *wr)
{
	unsigned long i;
	for (i = 0; i <= wr->wr_blkhashmask; i++) {
		struct wapbl_blk *wb;

		while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) {
			KASSERT(wr->wr_blkhashcnt > 0);
			wr->wr_blkhashcnt--;
			LIST_REMOVE(wb, wb_hash);
			wapbl_free(wb, sizeof(*wb));
		}
	}
	KASSERT(wr->wr_blkhashcnt == 0);
}
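
/*
 * Editorial sketch (not in the original source): the hash keeps only
 * the newest log offset per physical block, so re-inserting a block
 * supersedes the earlier copy and a revocation drops it entirely.
 */
#if 0 /* example only */
static void
example_blkhash_usage(struct wapbl_replay *wr)
{
	wapbl_blkhash_ins(wr, 100, 4096);	/* first copy of block 100 */
	wapbl_blkhash_ins(wr, 100, 8192);	/* newer copy wins */
	KASSERT(wapbl_blkhash_get(wr, 100)->wb_off == 8192);
	wapbl_blkhash_rem(wr, 100);		/* e.g. the block was revoked */
	KASSERT(wapbl_blkhash_get(wr, 100) == NULL);
}
#endif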

/****************************************************************/

static int
wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp)
{
	size_t slen;
	off_t off = *offp;
	int error;
	daddr_t pbn;

	KASSERT(((len >> wr->wr_log_dev_bshift) <<
	    wr->wr_log_dev_bshift) == len);

	if (off < wr->wr_circ_off)
		off = wr->wr_circ_off;
	slen = wr->wr_circ_off + wr->wr_circ_size - off;
	if (slen < len) {
		pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
#ifdef _KERNEL
		pbn = btodb(pbn << wr->wr_log_dev_bshift);
#endif
		error = wapbl_read(data, slen, wr->wr_devvp, pbn);
		if (error)
			return error;
		data = (uint8_t *)data + slen;
		len -= slen;
		off = wr->wr_circ_off;
	}
	pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
#ifdef _KERNEL
	pbn = btodb(pbn << wr->wr_log_dev_bshift);
#endif
	error = wapbl_read(data, len, wr->wr_devvp, pbn);
	if (error)
		return error;
	off += len;
	if (off >= wr->wr_circ_off + wr->wr_circ_size)
		off = wr->wr_circ_off;
	*offp = off;
	return 0;
}

static void
wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp)
{
	size_t slen;
	off_t off = *offp;

	KASSERT(((len >> wr->wr_log_dev_bshift) <<
	    wr->wr_log_dev_bshift) == len);

	if (off < wr->wr_circ_off)
		off = wr->wr_circ_off;
	slen = wr->wr_circ_off + wr->wr_circ_size - off;
	if (slen < len) {
		len -= slen;
		off = wr->wr_circ_off;
	}
	off += len;
	if (off >= wr->wr_circ_off + wr->wr_circ_size)
		off = wr->wr_circ_off;
	*offp = off;
}
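
/*
 * Worked example (editorial, figures assumed): with wr_circ_off = 1024
 * and wr_circ_size = 8192, the usable window is [1024, 9216).  Reading
 * 2048 bytes starting at off = 8704 splits into slen = 512 bytes up to
 * the end of the window, then wraps to 1024 for the remaining 1536,
 * leaving the cursor at 2560.
 */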

/****************************************************************/

int
wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp,
	daddr_t off, size_t count, size_t blksize)
{
	struct wapbl_replay *wr;
	int error;
	struct vnode *devvp;
	daddr_t logpbn;
	uint8_t *scratch;
	struct wapbl_wc_header *wch;
	struct wapbl_wc_header *wch2;
	/* Use this until we read the actual log header */
	int log_dev_bshift = ilog2(blksize);
	size_t used;
	daddr_t pbn;

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n",
	    vp, off, count, blksize));

	if (off < 0)
		return EINVAL;

	if (blksize < DEV_BSIZE)
		return EINVAL;
	if (blksize % DEV_BSIZE)
		return EINVAL;

#ifdef _KERNEL
#if 0
	/*
	 * XXX vp->v_size isn't reliably set for VBLK devices,
	 * especially root.  However, we might still want to verify
	 * that the full load is readable.
	 */
	if ((off + count) * blksize > vp->v_size)
		return EINVAL;
#endif
	if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) {
		return error;
	}
#else /* ! _KERNEL */
	devvp = vp;
	logpbn = off;
#endif /* ! _KERNEL */

	scratch = wapbl_malloc(MAXBSIZE);

	pbn = logpbn;
#ifdef _KERNEL
	pbn = btodb(pbn << log_dev_bshift);
#endif
	error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, pbn);
	if (error)
		goto errout;

	wch = (struct wapbl_wc_header *)scratch;
	wch2 =
	    (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift));
	/* XXX verify checksums and magic numbers */
	if (wch->wc_type != WAPBL_WC_HEADER) {
		printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type);
		error = EFTYPE;
		goto errout;
	}

	if (wch2->wc_generation > wch->wc_generation)
		wch = wch2;

	wr = wapbl_calloc(1, sizeof(*wr));

	wr->wr_logvp = vp;
	wr->wr_devvp = devvp;
	wr->wr_logpbn = logpbn;

	wr->wr_scratch = scratch;

	wr->wr_log_dev_bshift = wch->wc_log_dev_bshift;
	wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift;
	wr->wr_circ_off = wch->wc_circ_off;
	wr->wr_circ_size = wch->wc_circ_size;
	wr->wr_generation = wch->wc_generation;

	used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail);

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64
	    " len=%"PRId64" used=%zu\n",
	    wch->wc_head, wch->wc_tail, wch->wc_circ_off,
	    wch->wc_circ_size, used));

	wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift));

	error = wapbl_replay_process(wr, wch->wc_head, wch->wc_tail);
	if (error) {
		wapbl_replay_stop(wr);
		wapbl_replay_free(wr);
		return error;
	}

	*wrp = wr;
	return 0;

errout:
	wapbl_free(scratch, MAXBSIZE);
	return error;
}

void
wapbl_replay_stop(struct wapbl_replay *wr)
{

	if (!wapbl_replay_isopen(wr))
		return;

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n"));

	wapbl_free(wr->wr_scratch, MAXBSIZE);
	wr->wr_scratch = NULL;

	wr->wr_logvp = NULL;

	wapbl_blkhash_clear(wr);
	wapbl_blkhash_free(wr);
}

void
wapbl_replay_free(struct wapbl_replay *wr)
{

	KDASSERT(!wapbl_replay_isopen(wr));

	if (wr->wr_inodes)
		wapbl_free(wr->wr_inodes,
		    wr->wr_inodescnt * sizeof(wr->wr_inodes[0]));
	wapbl_free(wr, sizeof(*wr));
}
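
/*
 * Editorial sketch (not in the original source) of how a file system
 * drives the replay API at mount time; the vnode and log geometry
 * arguments here are placeholders.
 */
#if 0 /* example only */
static int
example_mount_replay(struct vnode *devvp, daddr_t logstart,
    size_t logcount, size_t blksize)
{
	struct wapbl_replay *wr;
	int error;

	/* Scan the log and build the block hash. */
	error = wapbl_replay_start(&wr, devvp, logstart, logcount, blksize);
	if (error)
		return error;
	/* Write the committed blocks back to the file system device. */
	error = wapbl_replay_write(wr, devvp);
	wapbl_replay_stop(wr);
	wapbl_replay_free(wr);
	return error;
}
#endif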

#ifdef _KERNEL
int
wapbl_replay_isopen1(struct wapbl_replay *wr)
{

	return wapbl_replay_isopen(wr);
}
#endif

static void
wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp)
{
	struct wapbl_wc_blocklist *wc =
	    (struct wapbl_wc_blocklist *)wr->wr_scratch;
	int fsblklen = 1 << wr->wr_fs_dev_bshift;
	int i, j, n;

	for (i = 0; i < wc->wc_blkcount; i++) {
		/*
		 * Enter each physical block into the hashtable independently.
		 */
		n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
		for (j = 0; j < n; j++) {
			wapbl_blkhash_ins(wr,
			    wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen),
			    *offp);
			wapbl_circ_advance(wr, fsblklen, offp);
		}
	}
}

static void
wapbl_replay_process_revocations(struct wapbl_replay *wr)
{
	struct wapbl_wc_blocklist *wc =
	    (struct wapbl_wc_blocklist *)wr->wr_scratch;
	int fsblklen = 1 << wr->wr_fs_dev_bshift;
	int i, j, n;

	for (i = 0; i < wc->wc_blkcount; i++) {
		/*
		 * Remove any blocks found from the hashtable.
		 */
		n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
		for (j = 0; j < n; j++)
			wapbl_blkhash_rem(wr,
			    wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
	}
}

static void
wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff, off_t newoff)
{
	struct wapbl_wc_inodelist *wc =
	    (struct wapbl_wc_inodelist *)wr->wr_scratch;
	void *new_inodes;
	const size_t oldsize = wr->wr_inodescnt * sizeof(wr->wr_inodes[0]);

	KASSERT(sizeof(wr->wr_inodes[0]) == sizeof(wc->wc_inodes[0]));

	/*
	 * Keep track of where we found this, so this location won't be
	 * overwritten.
	 */
	if (wc->wc_clear) {
		wr->wr_inodestail = oldoff;
		wr->wr_inodescnt = 0;
		if (wr->wr_inodes != NULL) {
			wapbl_free(wr->wr_inodes, oldsize);
			wr->wr_inodes = NULL;
		}
	}
	wr->wr_inodeshead = newoff;
	if (wc->wc_inocnt == 0)
		return;

	new_inodes = wapbl_malloc((wr->wr_inodescnt + wc->wc_inocnt) *
	    sizeof(wr->wr_inodes[0]));
	if (wr->wr_inodes != NULL) {
		memcpy(new_inodes, wr->wr_inodes, oldsize);
		wapbl_free(wr->wr_inodes, oldsize);
	}
	wr->wr_inodes = new_inodes;
	memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes,
	    wc->wc_inocnt * sizeof(wr->wr_inodes[0]));
	wr->wr_inodescnt += wc->wc_inocnt;
}

static int
wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail)
{
	off_t off;
	int error;

	int logblklen = 1 << wr->wr_log_dev_bshift;

	wapbl_blkhash_clear(wr);

	off = tail;
	while (off != head) {
		struct wapbl_wc_null *wcn;
		off_t saveoff = off;
		error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
		if (error)
			goto errout;
		wcn = (struct wapbl_wc_null *)wr->wr_scratch;
		switch (wcn->wc_type) {
		case WAPBL_WC_BLOCKS:
			wapbl_replay_process_blocks(wr, &off);
			break;

		case WAPBL_WC_REVOCATIONS:
			wapbl_replay_process_revocations(wr);
			break;

		case WAPBL_WC_INODES:
			wapbl_replay_process_inodes(wr, saveoff, off);
			break;

		default:
			printf("Unrecognized wapbl type: 0x%08x\n",
			    wcn->wc_type);
			error = EFTYPE;
			goto errout;
		}
		wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
		if (off != saveoff) {
			printf("wapbl_replay: corrupted records\n");
			error = EFTYPE;
			goto errout;
		}
	}
	return 0;

errout:
	wapbl_blkhash_clear(wr);
	return error;
}

#if 0
int
wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp)
{
	off_t off;
	int mismatchcnt = 0;
	int logblklen = 1 << wr->wr_log_dev_bshift;
	int fsblklen = 1 << wr->wr_fs_dev_bshift;
	void *scratch1 = wapbl_malloc(MAXBSIZE);
	void *scratch2 = wapbl_malloc(MAXBSIZE);
	int error = 0;

	KDASSERT(wapbl_replay_isopen(wr));

	off = wch->wc_tail;
	while (off != wch->wc_head) {
		struct wapbl_wc_null *wcn;
#ifdef DEBUG
		off_t saveoff = off;
#endif
		error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
		if (error)
			goto out;
		wcn = (struct wapbl_wc_null *)wr->wr_scratch;
		switch (wcn->wc_type) {
		case WAPBL_WC_BLOCKS:
		{
			struct wapbl_wc_blocklist *wc =
			    (struct wapbl_wc_blocklist *)wr->wr_scratch;
			int i;
			for (i = 0; i < wc->wc_blkcount; i++) {
				int foundcnt = 0;
				int dirtycnt = 0;
				int j, n;
				/*
				 * Check each physical block against the
				 * hashtable independently.
				 */
				n = wc->wc_blocks[i].wc_dlen >>
				    wch->wc_fs_dev_bshift;
				for (j = 0; j < n; j++) {
					struct wapbl_blk *wb =
					    wapbl_blkhash_get(wr,
					    wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
					if (wb && (wb->wb_off == off)) {
						foundcnt++;
						error =
						    wapbl_circ_read(wr,
						    scratch1, fsblklen,
						    &off);
						if (error)
							goto out;
						error =
						    wapbl_read(scratch2,
						    fsblklen, fsdevvp,
						    wb->wb_blk);
						if (error)
							goto out;
						if (memcmp(scratch1,
						    scratch2,
						    fsblklen)) {
							printf(
		"wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n",
							    wb->wb_blk, (intmax_t)off);
							dirtycnt++;
							mismatchcnt++;
						}
					} else {
						wapbl_circ_advance(wr,
						    fsblklen, &off);
					}
				}
#if 0
				/*
				 * If all of the blocks in an entry
				 * are clean, then remove all of its
				 * blocks from the hashtable since they
				 * never will need replay.
				 */
				if ((foundcnt != 0) &&
				    (dirtycnt == 0)) {
					off = saveoff;
					wapbl_circ_advance(wr,
					    logblklen, &off);
					for (j = 0; j < n; j++) {
						struct wapbl_blk *wb =
						    wapbl_blkhash_get(wr,
						    wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
						if (wb &&
						    (wb->wb_off == off)) {
							wapbl_blkhash_rem(wr, wb->wb_blk);
						}
						wapbl_circ_advance(wr,
						    fsblklen, &off);
					}
				}
#endif
			}
		}
			break;
		case WAPBL_WC_REVOCATIONS:
		case WAPBL_WC_INODES:
			break;
		default:
			KASSERT(0);
		}
#ifdef DEBUG
		wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
		KASSERT(off == saveoff);
#endif
	}
out:
	wapbl_free(scratch1, MAXBSIZE);
	wapbl_free(scratch2, MAXBSIZE);
	if (!error && mismatchcnt)
		error = EFTYPE;
	return error;
}
#endif

int
wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp)
{
	struct wapbl_blk *wb;
	size_t i;
	off_t off;
	void *scratch;
	int error = 0;
	int fsblklen = 1 << wr->wr_fs_dev_bshift;

	KDASSERT(wapbl_replay_isopen(wr));

	scratch = wapbl_malloc(MAXBSIZE);

	for (i = 0; i <= wr->wr_blkhashmask; ++i) {
		LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) {
			off = wb->wb_off;
			error = wapbl_circ_read(wr, scratch, fsblklen, &off);
			if (error)
				break;
			error = wapbl_write(scratch, fsblklen, fsdevvp,
			    wb->wb_blk);
			if (error)
				break;
		}
	}

	wapbl_free(scratch, MAXBSIZE);
	return error;
}

int
wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len)
{
	int fsblklen = 1 << wr->wr_fs_dev_bshift;

	KDASSERT(wapbl_replay_isopen(wr));
	KASSERT((len % fsblklen) == 0);

	while (len != 0) {
		struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
		if (wb)
			return 1;
		len -= fsblklen;
		blk++;	/* advance to the next block, as in wapbl_replay_read */
	}
	return 0;
}

int
wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len)
{
	int fsblklen = 1 << wr->wr_fs_dev_bshift;

	KDASSERT(wapbl_replay_isopen(wr));

	KASSERT((len % fsblklen) == 0);

	while (len != 0) {
		struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
		if (wb) {
			off_t off = wb->wb_off;
			int error;
			error = wapbl_circ_read(wr, data, fsblklen, &off);
			if (error)
				return error;
		}
		data = (uint8_t *)data + fsblklen;
		len -= fsblklen;
		blk++;
	}
	return 0;
}
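
/*
 * Editorial sketch (not in the original source): before the log has
 * been replayed onto the file system device, a mount-time reader can
 * overlay any still-journalled blocks on top of data read from disk.
 * The buffer and device arguments here are placeholders.
 */
#if 0 /* example only */
static int
example_read_through_log(struct wapbl_replay *wr, struct vnode *fsdevvp,
    void *buf, daddr_t blk, long len)
{
	int error;

	/* Read the on-disk version first... */
	error = wapbl_read(buf, len, fsdevvp, blk);
	if (error)
		return error;
	/* ...then patch in newer copies that still live in the log. */
	if (wapbl_replay_can_read(wr, blk, len))
		error = wapbl_replay_read(wr, buf, blk, len);
	return error;
}
#endif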

#ifdef _KERNEL
/*
 * This is not really a module now, but maybe on its way to
 * being one some day.
 */
MODULE(MODULE_CLASS_VFS, wapbl, NULL);

static int
wapbl_modcmd(modcmd_t cmd, void *arg)
{

	switch (cmd) {
	case MODULE_CMD_INIT:
		wapbl_init();
		return 0;
	case MODULE_CMD_FINI:
#ifdef notyet
		return wapbl_fini(true);
#endif
		return EOPNOTSUPP;
	default:
		return ENOTTY;
	}
}
#endif /* _KERNEL */