/*	$NetBSD: vfs_wapbl.c,v 1.41 2011/02/16 19:43:05 hannken Exp $	*/

/*-
 * Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * This implements file system independent write-ahead logging.
 */

#define WAPBL_INTERNAL

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.41 2011/02/16 19:43:05 hannken Exp $");

#include <sys/param.h>
#include <sys/bitops.h>

#ifdef _KERNEL
#include <sys/param.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/resourcevar.h>
#include <sys/conf.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/kauth.h>
#include <sys/mutex.h>
#include <sys/atomic.h>
#include <sys/wapbl.h>
#include <sys/wapbl_replay.h>

#include <miscfs/specfs/specdev.h>

#if 0 /* notyet */
#define	wapbl_malloc(s) kmem_alloc((s), KM_SLEEP)
#define	wapbl_free(a, s) kmem_free((a), (s))
#define	wapbl_calloc(n, s) kmem_zalloc((n)*(s), KM_SLEEP)
#else
MALLOC_JUSTDEFINE(M_WAPBL, "wapbl", "write-ahead physical block logging");
#define	wapbl_malloc(s) malloc((s), M_WAPBL, M_WAITOK)
#define	wapbl_free(a, s) free((a), M_WAPBL)
#define	wapbl_calloc(n, s) malloc((n)*(s), M_WAPBL, M_WAITOK | M_ZERO)
#endif

static struct sysctllog *wapbl_sysctl;
static int wapbl_flush_disk_cache = 1;
static int wapbl_verbose_commit = 0;

#else /* !_KERNEL */
#include <assert.h>
#include <errno.h>
#include <stdio.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

#include <sys/time.h>
#include <sys/wapbl.h>
#include <sys/wapbl_replay.h>

#define	KDASSERT(x) assert(x)
#define	KASSERT(x) assert(x)
#define	wapbl_malloc(s) malloc(s)
#define	wapbl_free(a, s) free(a)
#define	wapbl_calloc(n, s) calloc((n), (s))

#endif /* !_KERNEL */

/*
 * INTERNAL DATA STRUCTURES
 */

/*
 * This structure holds per-mount log information.
 *
 * Legend:	a = atomic access only
 *		r = read-only after init
 *		l = rwlock held
 *		m = mutex held
 *		lm = rwlock held writing or mutex held
 *		u = unlocked access ok
 *		b = bufcache_lock held
 */
struct wapbl {
	struct vnode *wl_logvp;	/* r:	log here */
	struct vnode *wl_devvp;	/* r:	log on this device */
	struct mount *wl_mount;	/* r:	mountpoint wl is associated with */
	daddr_t wl_logpbn;	/* r:	Physical block number of start of log */
	int wl_log_dev_bshift;	/* r:	logarithm of device block size of log
					device */
	int wl_fs_dev_bshift;	/* r:	logarithm of device block size of
					filesystem device */

	unsigned wl_lock_count;	/* m:	Count of transactions in progress */

	size_t wl_circ_size;	/* r:	Number of bytes in buffer of log */
	size_t wl_circ_off;	/* r:	Number of bytes reserved at start */

	size_t wl_bufcount_max;	/* r:	Number of buffers reserved for log */
	size_t wl_bufbytes_max;	/* r:	Number of buf bytes reserved for log */

	off_t wl_head;		/* l:	Byte offset of log head */
	off_t wl_tail;		/* l:	Byte offset of log tail */
	/*
	 * head == tail == 0 means log is empty
	 * head == tail != 0 means log is full
	 * see assertions in wapbl_advance() for other boundary conditions.
	 * only truncate moves the tail, except when flush sets it to
	 * wl_header_size.  only flush moves the head, except when truncate
	 * sets it to 0.
	 */
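	/*
	 * Illustrative example (values invented for this comment): with
	 * wl_circ_off = 1024, an empty log starts as head == tail == 0.
	 * The first flush pulls the tail up to wl_circ_off and advances
	 * the head past it; if a later truncate reclaims everything, the
	 * tail catches the head and both are reset to 0 (empty again).
	 */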

	struct wapbl_wc_header *wl_wc_header;	/* l	*/
	void *wl_wc_scratch;	/* l:	scratch space (XXX: why?!?) */

	kmutex_t wl_mtx;	/* u:	short-term lock */
	krwlock_t wl_rwlock;	/* u:	File system transaction lock */

	/*
	 * Must be held while accessing
	 * wl_count or wl_bufs or head or tail
	 */

	/*
	 * Callback called from within the flush routine to flush any extra
	 * bits.  Note that flush may be skipped without calling this if
	 * there are no outstanding buffers in the transaction.
	 */
#if _KERNEL
	wapbl_flush_fn_t wl_flush;	/* r	*/
	wapbl_flush_fn_t wl_flush_abort;/* r	*/
#endif

	size_t wl_bufbytes;	/* m:	Byte count of pages in wl_bufs */
	size_t wl_bufcount;	/* m:	Count of buffers in wl_bufs */
	size_t wl_bcount;	/* m:	Total bcount of wl_bufs */

	LIST_HEAD(, buf) wl_bufs; /* m:	Buffers in current transaction */

	kcondvar_t wl_reclaimable_cv;	/* m (obviously) */
	size_t wl_reclaimable_bytes;	/* m:	Amount of space available for
						reclamation by truncate */
	int wl_error_count;	/* m:	# of wl_entries with errors */
	size_t wl_reserved_bytes; /* never truncate log smaller than this */

#ifdef WAPBL_DEBUG_BUFBYTES
	size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */
#endif

	daddr_t *wl_deallocblks;/* lm:	address of block */
	int *wl_dealloclens;	/* lm:	size of block */
	int wl_dealloccnt;	/* lm:	total count */
	int wl_dealloclim;	/* l:	max count */

	/* hashtable of inode numbers for allocated but unlinked inodes */
	/* synch ??? */
	LIST_HEAD(wapbl_ino_head, wapbl_ino) *wl_inohash;
	u_long wl_inohashmask;
	int wl_inohashcnt;

	SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction
						   accounting */
};

#ifdef WAPBL_DEBUG_PRINT
int wapbl_debug_print = WAPBL_DEBUG_PRINT;
#endif

/****************************************************************/
#ifdef _KERNEL

#ifdef WAPBL_DEBUG
struct wapbl *wapbl_debug_wl;
#endif

static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail);
static int wapbl_write_blocks(struct wapbl *wl, off_t *offp);
static int wapbl_write_revocations(struct wapbl *wl, off_t *offp);
static int wapbl_write_inodes(struct wapbl *wl, off_t *offp);
#endif /* _KERNEL */

static int wapbl_replay_process(struct wapbl_replay *wr, off_t, off_t);

static inline size_t wapbl_space_free(size_t avail, off_t head,
	off_t tail);
static inline size_t wapbl_space_used(size_t avail, off_t head,
	off_t tail);

#ifdef _KERNEL

#define	WAPBL_INODETRK_SIZE 83
static int wapbl_ino_pool_refcount;
static struct pool wapbl_ino_pool;
struct wapbl_ino {
	LIST_ENTRY(wapbl_ino) wi_hash;
	ino_t wi_ino;
	mode_t wi_mode;
};

static void wapbl_inodetrk_init(struct wapbl *wl, u_int size);
static void wapbl_inodetrk_free(struct wapbl *wl);
static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino);

static size_t wapbl_transaction_len(struct wapbl *wl);
static inline size_t wapbl_transaction_inodes_len(struct wapbl *wl);

#if 0
int wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
#endif

static int wapbl_replay_isopen1(struct wapbl_replay *);

/*
 * This is useful for debugging.  If set, the log will
 * only be truncated when necessary.
 */
int wapbl_lazy_truncate = 0;

struct wapbl_ops wapbl_ops = {
	.wo_wapbl_discard	= wapbl_discard,
	.wo_wapbl_replay_isopen	= wapbl_replay_isopen1,
	.wo_wapbl_replay_can_read = wapbl_replay_can_read,
	.wo_wapbl_replay_read	= wapbl_replay_read,
	.wo_wapbl_add_buf	= wapbl_add_buf,
	.wo_wapbl_remove_buf	= wapbl_remove_buf,
	.wo_wapbl_resize_buf	= wapbl_resize_buf,
	.wo_wapbl_begin		= wapbl_begin,
	.wo_wapbl_end		= wapbl_end,
	.wo_wapbl_junlock_assert= wapbl_junlock_assert,

	/* XXX: the following is only used to say "this is a wapbl buf" */
	.wo_wapbl_biodone	= wapbl_biodone,
};

static int
wapbl_sysctl_init(void)
{
	int rv;
	const struct sysctlnode *rnode, *cnode;

	wapbl_sysctl = NULL;

	rv = sysctl_createv(&wapbl_sysctl, 0, NULL, &rnode,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "vfs", NULL,
	    NULL, 0, NULL, 0,
	    CTL_VFS, CTL_EOL);
	if (rv)
		return rv;

	rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &rnode,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "wapbl",
	    SYSCTL_DESCR("WAPBL journaling options"),
	    NULL, 0, NULL, 0,
	    CTL_CREATE, CTL_EOL);
	if (rv)
		return rv;

	rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_INT, "flush_disk_cache",
	    SYSCTL_DESCR("flush disk cache"),
	    NULL, 0, &wapbl_flush_disk_cache, 0,
	    CTL_CREATE, CTL_EOL);
	if (rv)
		return rv;

	rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_INT, "verbose_commit",
	    SYSCTL_DESCR("show time and size of wapbl log commits"),
	    NULL, 0, &wapbl_verbose_commit, 0,
	    CTL_CREATE, CTL_EOL);
	return rv;
}

static void
wapbl_init(void)
{
	malloc_type_attach(M_WAPBL);
	wapbl_sysctl_init();
}

#ifdef notyet
static int
wapbl_fini(bool interface)
{
	if (wapbl_sysctl != NULL)
		sysctl_teardown(&wapbl_sysctl);
	return 0;
}
#endif

static int
wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr)
{
	int error, i;

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_start: reusing log with %d inodes\n", wr->wr_inodescnt));

	/*
	 * It's only valid to reuse the replay log if it's
	 * the same as the new log we just opened.
	 */
	KDASSERT(!wapbl_replay_isopen(wr));
	KASSERT(wl->wl_devvp->v_rdev == wr->wr_devvp->v_rdev);
	KASSERT(wl->wl_logpbn == wr->wr_logpbn);
	KASSERT(wl->wl_circ_size == wr->wr_circ_size);
	KASSERT(wl->wl_circ_off == wr->wr_circ_off);
	KASSERT(wl->wl_log_dev_bshift == wr->wr_log_dev_bshift);
	KASSERT(wl->wl_fs_dev_bshift == wr->wr_fs_dev_bshift);

	wl->wl_wc_header->wc_generation = wr->wr_generation + 1;

	for (i = 0; i < wr->wr_inodescnt; i++)
		wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber,
		    wr->wr_inodes[i].wr_imode);

	/* Make sure new transaction won't overwrite old inodes list */
	KDASSERT(wapbl_transaction_len(wl) <=
	    wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead,
	    wr->wr_inodestail));

	wl->wl_head = wl->wl_tail = wr->wr_inodeshead;
	wl->wl_reclaimable_bytes = wl->wl_reserved_bytes =
	    wapbl_transaction_len(wl);

	error = wapbl_write_inodes(wl, &wl->wl_head);
	if (error)
		return error;

	KASSERT(wl->wl_head != wl->wl_tail);
	KASSERT(wl->wl_head != 0);

	return 0;
}

int
wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp,
	daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr,
	wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn)
{
	struct wapbl *wl;
	struct vnode *devvp;
	daddr_t logpbn;
	int error;
	int log_dev_bshift = ilog2(blksize);
	int fs_dev_bshift = log_dev_bshift;
	int run;

	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64
	    " count=%zu blksize=%zu\n", vp, off, count, blksize));

	if (log_dev_bshift > fs_dev_bshift) {
		WAPBL_PRINTF(WAPBL_PRINT_OPEN,
		    ("wapbl: log device's block size cannot be larger "
		    "than filesystem's\n"));
		/*
		 * Not currently implemented, although it could be if
		 * needed someday.
		 */
		return ENOSYS;
	}

	if (off < 0)
		return EINVAL;

	if (blksize < DEV_BSIZE)
		return EINVAL;
	if (blksize % DEV_BSIZE)
		return EINVAL;

	/* XXXTODO: verify that the full load is writable */

	/*
	 * XXX check for minimum log size
	 * minimum is governed by minimum amount of space
	 * to complete a transaction. (probably truncate)
	 */
	/* XXX for now pick something minimal */
	if ((count * blksize) < MAXPHYS) {
		return ENOSPC;
	}

	if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) {
		return error;
	}

	wl = wapbl_calloc(1, sizeof(*wl));
	rw_init(&wl->wl_rwlock);
	mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&wl->wl_reclaimable_cv, "wapblrec");
	LIST_INIT(&wl->wl_bufs);
	SIMPLEQ_INIT(&wl->wl_entries);

	wl->wl_logvp = vp;
	wl->wl_devvp = devvp;
	wl->wl_mount = mp;
	wl->wl_logpbn = logpbn;
	wl->wl_log_dev_bshift = log_dev_bshift;
	wl->wl_fs_dev_bshift = fs_dev_bshift;

	wl->wl_flush = flushfn;
	wl->wl_flush_abort = flushabortfn;

	/* Reserve two log device blocks for the commit headers */
	wl->wl_circ_off = 2<<wl->wl_log_dev_bshift;
	wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off);
	/* truncate the log usage to a multiple of log_dev_bshift */
	wl->wl_circ_size >>= wl->wl_log_dev_bshift;
	wl->wl_circ_size <<= wl->wl_log_dev_bshift;

	/*
	 * wl_bufbytes_max limits the size of the in memory transaction space.
	 * - Since buffers are allocated and accounted for in units of
	 *   PAGE_SIZE it is required to be a multiple of PAGE_SIZE
	 *   (i.e. 1<<PAGE_SHIFT)
	 * - Since the log device has to be written in units of
	 *   1<<wl_log_dev_bshift it is required to be a multiple of
	 *   1<<wl_log_dev_bshift.
	 * - Since filesystem will provide data in units of 1<<wl_fs_dev_bshift,
	 *   it is convenient to be a multiple of 1<<wl_fs_dev_bshift.
	 * Therefore it must be a multiple of the least common multiple of those
	 * three quantities.  Fortunately, all of those quantities are
	 * guaranteed to be a power of two, and the least common multiple of
	 * a set of numbers which are all powers of two is simply the maximum
	 * of those numbers.  Finally, the maximum logarithm of a power of two
	 * is the same as the log of the maximum power of two.  So we can do
	 * the following operations to size wl_bufbytes_max:
	 */
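	/*
	 * Concretely (example values, not taken from this code): with
	 * PAGE_SHIFT = 12 and wl_log_dev_bshift = wl_fs_dev_bshift = 9,
	 * the least common multiple of the three alignments is 1<<12,
	 * so the shift-down/shift-up pairs below simply round
	 * wl_bufbytes_max down to a multiple of the largest alignment.
	 */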

	/* XXX fix actual number of pages reserved per filesystem. */
	wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2);

	/* Round wl_bufbytes_max to the largest power of two constraint */
	wl->wl_bufbytes_max >>= PAGE_SHIFT;
	wl->wl_bufbytes_max <<= PAGE_SHIFT;
	wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift;

	/* XXX maybe use filesystem fragment size instead of 1024 */
	/* XXX fix actual number of buffers reserved per filesystem. */
	wl->wl_bufcount_max = (nbuf / 2) * 1024;

	/* XXX tie this into resource estimation */
	wl->wl_dealloclim = wl->wl_bufbytes_max / mp->mnt_stat.f_bsize / 2;

	wl->wl_deallocblks = wapbl_malloc(sizeof(*wl->wl_deallocblks) *
	    wl->wl_dealloclim);
	wl->wl_dealloclens = wapbl_malloc(sizeof(*wl->wl_dealloclens) *
	    wl->wl_dealloclim);

	wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);

	/* Initialize the commit header */
	{
		struct wapbl_wc_header *wc;
		size_t len = 1 << wl->wl_log_dev_bshift;
		wc = wapbl_calloc(1, len);
		wc->wc_type = WAPBL_WC_HEADER;
		wc->wc_len = len;
		wc->wc_circ_off = wl->wl_circ_off;
		wc->wc_circ_size = wl->wl_circ_size;
		/* XXX wc->wc_fsid */
		wc->wc_log_dev_bshift = wl->wl_log_dev_bshift;
		wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift;
		wl->wl_wc_header = wc;
		wl->wl_wc_scratch = wapbl_malloc(len);
	}

	/*
	 * if there was an existing set of unlinked but
	 * allocated inodes, preserve it in the new
	 * log.
	 */
	if (wr && wr->wr_inodescnt) {
		error = wapbl_start_flush_inodes(wl, wr);
		if (error)
			goto errout;
	}

	error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail);
	if (error) {
		goto errout;
	}

	*wlp = wl;
#if defined(WAPBL_DEBUG)
	wapbl_debug_wl = wl;
#endif

	return 0;
errout:
	wapbl_discard(wl);
	wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_deallocblks,
	    sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim);
	wapbl_free(wl->wl_dealloclens,
	    sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim);
	wapbl_inodetrk_free(wl);
	wapbl_free(wl, sizeof(*wl));

	return error;
}

/*
 * Like wapbl_flush, only discards the transaction
 * completely
 */

void
wapbl_discard(struct wapbl *wl)
{
	struct wapbl_entry *we;
	struct buf *bp;
	int i;

	/*
	 * XXX we may consider using upgrade here
	 * if we want to call flush from inside a transaction
	 */
	rw_enter(&wl->wl_rwlock, RW_WRITER);
	wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
	    wl->wl_dealloccnt);

#ifdef WAPBL_DEBUG_PRINT
	{
		pid_t pid = -1;
		lwpid_t lid = -1;
		if (curproc)
			pid = curproc->p_pid;
		if (curlwp)
			lid = curlwp->l_lid;
#ifdef WAPBL_DEBUG_BUFBYTES
		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
		    ("wapbl_discard: thread %d.%d discarding "
		    "transaction\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %u, reclaimable=%zu reserved=%zu "
		    "unsynced=%zu\n",
		    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
		    wl->wl_bcount, wl->wl_dealloccnt,
		    wl->wl_inohashcnt, wl->wl_error_count,
		    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
		    wl->wl_unsynced_bufbytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d, unsynced = %zu\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error, we->we_unsynced_bufbytes));
		}
#else /* !WAPBL_DEBUG_BUFBYTES */
		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
		    ("wapbl_discard: thread %d.%d discarding transaction\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %u, reclaimable=%zu reserved=%zu\n",
		    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
		    wl->wl_bcount, wl->wl_dealloccnt,
		    wl->wl_inohashcnt, wl->wl_error_count,
		    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error));
		}
#endif /* !WAPBL_DEBUG_BUFBYTES */
	}
#endif /* WAPBL_DEBUG_PRINT */

	for (i = 0; i <= wl->wl_inohashmask; i++) {
		struct wapbl_ino_head *wih;
		struct wapbl_ino *wi;

		wih = &wl->wl_inohash[i];
		while ((wi = LIST_FIRST(wih)) != NULL) {
			LIST_REMOVE(wi, wi_hash);
			pool_put(&wapbl_ino_pool, wi);
			KASSERT(wl->wl_inohashcnt > 0);
			wl->wl_inohashcnt--;
		}
	}

	/*
	 * clean buffer list
	 */
	mutex_enter(&bufcache_lock);
	mutex_enter(&wl->wl_mtx);
	while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
		if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) {
			/*
			 * The buffer will be unlocked and
			 * removed from the transaction in brelse
			 */
			mutex_exit(&wl->wl_mtx);
			brelsel(bp, 0);
			mutex_enter(&wl->wl_mtx);
		}
	}
	mutex_exit(&wl->wl_mtx);
	mutex_exit(&bufcache_lock);

	/*
	 * Remove references to this wl from wl_entries, free any which
	 * no longer have buffers, others will be freed in wapbl_biodone
	 * when they no longer have any buffers.
	 */
	while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) {
		SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
		/* XXX should we be accumulating wl_error_count
		 * and increasing reclaimable bytes ? */
		we->we_wapbl = NULL;
		if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
			KASSERT(we->we_unsynced_bufbytes == 0);
#endif
			wapbl_free(we, sizeof(*we));
		}
	}

	/* Discard list of deallocs */
	wl->wl_dealloccnt = 0;
	/* XXX should we clear wl_reserved_bytes? */

	KASSERT(wl->wl_bufbytes == 0);
	KASSERT(wl->wl_bcount == 0);
	KASSERT(wl->wl_bufcount == 0);
	KASSERT(LIST_EMPTY(&wl->wl_bufs));
	KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
	KASSERT(wl->wl_inohashcnt == 0);

	rw_exit(&wl->wl_rwlock);
}

int
wapbl_stop(struct wapbl *wl, int force)
{
	struct vnode *vp;
	int error;

	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n"));
	error = wapbl_flush(wl, 1);
	if (error) {
		if (force)
			wapbl_discard(wl);
		else
			return error;
	}

	/* Unlinked inodes persist after a flush */
	if (wl->wl_inohashcnt) {
		if (force) {
			wapbl_discard(wl);
		} else {
			return EBUSY;
		}
	}

	KASSERT(wl->wl_bufbytes == 0);
	KASSERT(wl->wl_bcount == 0);
	KASSERT(wl->wl_bufcount == 0);
	KASSERT(LIST_EMPTY(&wl->wl_bufs));
	KASSERT(wl->wl_dealloccnt == 0);
	KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
	KASSERT(wl->wl_inohashcnt == 0);

	vp = wl->wl_logvp;

	wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_deallocblks,
	    sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim);
	wapbl_free(wl->wl_dealloclens,
	    sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim);
	wapbl_inodetrk_free(wl);

	cv_destroy(&wl->wl_reclaimable_cv);
	mutex_destroy(&wl->wl_mtx);
	rw_destroy(&wl->wl_rwlock);
	wapbl_free(wl, sizeof(*wl));

	return 0;
}

static int
wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
{
	struct pstats *pstats = curlwp->l_proc->p_stats;
	struct buf *bp;
	int error;

	KASSERT((flags & ~(B_WRITE | B_READ)) == 0);
	KASSERT(devvp->v_type == VBLK);

	if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
		mutex_enter(&devvp->v_interlock);
		devvp->v_numoutput++;
		mutex_exit(&devvp->v_interlock);
		pstats->p_ru.ru_oublock++;
	} else {
		pstats->p_ru.ru_inblock++;
	}

	bp = getiobuf(devvp, true);
	bp->b_flags = flags;
	bp->b_cflags = BC_BUSY; /* silly & dubious */
	bp->b_dev = devvp->v_rdev;
	bp->b_data = data;
	bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
	bp->b_blkno = pbn;

	WAPBL_PRINTF(WAPBL_PRINT_IO,
	    ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%"PRIx64"\n",
	    BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount,
	    bp->b_blkno, bp->b_dev));

	VOP_STRATEGY(devvp, bp);

	error = biowait(bp);
	putiobuf(bp);

	if (error) {
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_doio: %s %zu bytes at block %" PRId64
		    " on dev 0x%"PRIx64" failed with error %d\n",
		    (((flags & (B_WRITE | B_READ)) == B_WRITE) ?
		     "write" : "read"),
		    len, pbn, devvp->v_rdev, error));
	}

	return error;
}

int
wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

	return wapbl_doio(data, len, devvp, pbn, B_WRITE);
}

int
wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

	return wapbl_doio(data, len, devvp, pbn, B_READ);
}

/*
 * off is a byte offset; returns the new offset for the next write.
 * Handles log wraparound.
 */
static int
wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp)
{
	size_t slen;
	off_t off = *offp;
	int error;
	daddr_t pbn;

	KDASSERT(((len >> wl->wl_log_dev_bshift) <<
	    wl->wl_log_dev_bshift) == len);

	if (off < wl->wl_circ_off)
		off = wl->wl_circ_off;
	slen = wl->wl_circ_off + wl->wl_circ_size - off;
	if (slen < len) {
		pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
#ifdef _KERNEL
		pbn = btodb(pbn << wl->wl_log_dev_bshift);
#endif
		error = wapbl_write(data, slen, wl->wl_devvp, pbn);
		if (error)
			return error;
		data = (uint8_t *)data + slen;
		len -= slen;
		off = wl->wl_circ_off;
	}
	pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
#ifdef _KERNEL
	pbn = btodb(pbn << wl->wl_log_dev_bshift);
#endif
	error = wapbl_write(data, len, wl->wl_devvp, pbn);
	if (error)
		return error;
	off += len;
	if (off >= wl->wl_circ_off + wl->wl_circ_size)
		off = wl->wl_circ_off;
	*offp = off;
	return 0;
}
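
/*
 * Illustrative wraparound case (numbers invented for this comment):
 * with wl_circ_off = 1024 and wl_circ_size = 8192, a 2048-byte write
 * starting at off = 8192 has only slen = 1024 + 8192 - 8192 = 1024
 * bytes left before the end of the circular area, so it is issued as
 * two device writes: 1024 bytes at offset 8192, then the remaining
 * 1024 bytes back at wl_circ_off, leaving *offp = 2048.
 */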

/****************************************************************/

int
wapbl_begin(struct wapbl *wl, const char *file, int line)
{
	int doflush;
	unsigned lockcount;

	KDASSERT(wl);

	/*
	 * XXX this needs to be made much more sophisticated.
	 * perhaps each wapbl_begin could reserve a specified
	 * number of buffers and bytes.
	 */
	mutex_enter(&wl->wl_mtx);
	lockcount = wl->wl_lock_count;
	doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) >
		   wl->wl_bufbytes_max / 2) ||
		  ((wl->wl_bufcount + (lockcount * 10)) >
		   wl->wl_bufcount_max / 2) ||
		  (wapbl_transaction_len(wl) > wl->wl_circ_size / 2) ||
		  (wl->wl_dealloccnt >=
		   (wl->wl_dealloclim - (wl->wl_dealloclim >> 8)));
	mutex_exit(&wl->wl_mtx);

	if (doflush) {
		WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		    ("force flush lockcnt=%d bufbytes=%zu "
		    "(max=%zu) bufcount=%zu (max=%zu) "
		    "dealloccnt %d (lim=%d)\n",
		    lockcount, wl->wl_bufbytes,
		    wl->wl_bufbytes_max, wl->wl_bufcount,
		    wl->wl_bufcount_max,
		    wl->wl_dealloccnt, wl->wl_dealloclim));
	}

	if (doflush) {
		int error = wapbl_flush(wl, 0);
		if (error)
			return error;
	}

	rw_enter(&wl->wl_rwlock, RW_READER);
	mutex_enter(&wl->wl_mtx);
	wl->wl_lock_count++;
	mutex_exit(&wl->wl_mtx);

#if defined(WAPBL_DEBUG_PRINT)
	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
	    ("wapbl_begin thread %d.%d with bufcount=%zu "
	    "bufbytes=%zu bcount=%zu at %s:%d\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount, file, line));
#endif

	return 0;
}

void
wapbl_end(struct wapbl *wl)
{

#if defined(WAPBL_DEBUG_PRINT)
	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
	    ("wapbl_end thread %d.%d with bufcount=%zu "
	    "bufbytes=%zu bcount=%zu\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount));
#endif

#ifdef DIAGNOSTIC
	size_t flushsize = wapbl_transaction_len(wl);
	if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
		/*
		 * XXX this could be handled more gracefully, perhaps place
		 * only a partial transaction in the log and allow the
		 * remaining to flush without the protection of the journal.
		 */
		panic("wapbl_end: current transaction too big to flush\n");
	}
#endif

	mutex_enter(&wl->wl_mtx);
	KASSERT(wl->wl_lock_count > 0);
	wl->wl_lock_count--;
	mutex_exit(&wl->wl_mtx);

	rw_exit(&wl->wl_rwlock);
}

void
wapbl_add_buf(struct wapbl *wl, struct buf * bp)
{

	KASSERT(bp->b_cflags & BC_BUSY);
	KASSERT(bp->b_vp);

	wapbl_jlock_assert(wl);

#if 0
	/*
	 * XXX this might be an issue for swapfiles.
	 * see uvm_swap.c:1702
	 *
	 * XXX2 why require it then? leap of semantics?
	 */
	KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
#endif

	mutex_enter(&wl->wl_mtx);
	if (bp->b_flags & B_LOCKED) {
		LIST_REMOVE(bp, b_wapbllist);
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER2,
		    ("wapbl_add_buf thread %d.%d re-adding buf %p "
		    "with %d bytes %d bcount\n",
		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
		    bp->b_bcount));
	} else {
		/* unlocked but dirty buffers shouldn't exist */
		KASSERT(!(bp->b_oflags & BO_DELWRI));
		wl->wl_bufbytes += bp->b_bufsize;
		wl->wl_bcount += bp->b_bcount;
		wl->wl_bufcount++;
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
		    ("wapbl_add_buf thread %d.%d adding buf %p "
		    "with %d bytes %d bcount\n",
		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
		    bp->b_bcount));
	}
	LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist);
	mutex_exit(&wl->wl_mtx);

	bp->b_flags |= B_LOCKED;
}

static void
wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp)
{

	KASSERT(mutex_owned(&wl->wl_mtx));
	KASSERT(bp->b_cflags & BC_BUSY);
	wapbl_jlock_assert(wl);

#if 0
	/*
	 * XXX this might be an issue for swapfiles.
	 * see uvm_swap.c:1725
	 *
	 * XXXdeux: see above
	 */
	KASSERT((bp->b_flags & BC_NOCACHE) == 0);
#endif
	KASSERT(bp->b_flags & B_LOCKED);

	WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
	    ("wapbl_remove_buf thread %d.%d removing buf %p with "
	    "%d bytes %d bcount\n",
	    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount));

	KASSERT(wl->wl_bufbytes >= bp->b_bufsize);
	wl->wl_bufbytes -= bp->b_bufsize;
	KASSERT(wl->wl_bcount >= bp->b_bcount);
	wl->wl_bcount -= bp->b_bcount;
	KASSERT(wl->wl_bufcount > 0);
	wl->wl_bufcount--;
	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
	LIST_REMOVE(bp, b_wapbllist);

	bp->b_flags &= ~B_LOCKED;
}

/* called from brelsel() in vfs_bio among other places */
void
wapbl_remove_buf(struct wapbl * wl, struct buf *bp)
{

	mutex_enter(&wl->wl_mtx);
	wapbl_remove_buf_locked(wl, bp);
	mutex_exit(&wl->wl_mtx);
}

void
wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt)
{

	KASSERT(bp->b_cflags & BC_BUSY);

	/*
	 * XXX: why does this depend on B_LOCKED?  otherwise the buf
	 * is not for a transaction?  if so, why is this called in the
	 * first place?
	 */
	if (bp->b_flags & B_LOCKED) {
		mutex_enter(&wl->wl_mtx);
		wl->wl_bufbytes += bp->b_bufsize - oldsz;
		wl->wl_bcount += bp->b_bcount - oldcnt;
		mutex_exit(&wl->wl_mtx);
	}
}

#endif /* _KERNEL */

/****************************************************************/
/* Some utility inlines */

/* Advance the pointer old to its new value old+delta, wrapping around the circular log */
static inline off_t
wapbl_advance(size_t size, size_t off, off_t old, size_t delta)
{
	off_t new;

	/* Define acceptable ranges for inputs. */
	KASSERT(delta <= size);
	KASSERT((old == 0) || (old >= off));
	KASSERT(old < (size + off));

	if ((old == 0) && (delta != 0))
		new = off + delta;
	else if ((old + delta) < (size + off))
		new = old + delta;
	else
		new = (old + delta) - size;

	/* Note some interesting axioms */
	KASSERT((delta != 0) || (new == old));
	KASSERT((delta == 0) || (new != 0));
	KASSERT((delta != (size)) || (new == old));

	/* Define acceptable ranges for output. */
	KASSERT((new == 0) || (new >= off));
	KASSERT(new < (size + off));
	return new;
}
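
/*
 * Worked example (values invented for this comment): with size = 8192
 * and off = 1024, valid nonzero offsets lie in [1024, 9216).  Advancing
 * old = 8704 by delta = 1024 gives 9728, which is >= size + off, so the
 * result wraps to 9728 - 8192 = 1536.  Advancing from the empty state
 * (old == 0) by a nonzero delta starts counting at off.
 */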

static inline size_t
wapbl_space_used(size_t avail, off_t head, off_t tail)
{

	if (tail == 0) {
		KASSERT(head == 0);
		return 0;
	}
	return ((head + (avail - 1) - tail) % avail) + 1;
}

static inline size_t
wapbl_space_free(size_t avail, off_t head, off_t tail)
{

	return avail - wapbl_space_used(avail, head, tail);
}
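
/*
 * Example (same invented numbers as above): avail = 8192, head = 1536,
 * tail = 8704 gives ((1536 + 8191 - 8704) % 8192) + 1 = 1024 bytes
 * used, i.e. the head has wrapped around while the tail has not, and
 * wapbl_space_free() reports the remaining 7168 bytes.
 */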

static inline void
wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp,
		   off_t *tailp)
{
	off_t head = *headp;
	off_t tail = *tailp;

	KASSERT(delta <= wapbl_space_free(size, head, tail));
	head = wapbl_advance(size, off, head, delta);
	if ((tail == 0) && (head != 0))
		tail = off;
	*headp = head;
	*tailp = tail;
}

static inline void
wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp,
		   off_t *tailp)
{
	off_t head = *headp;
	off_t tail = *tailp;

	KASSERT(delta <= wapbl_space_used(size, head, tail));
	tail = wapbl_advance(size, off, tail, delta);
	if (head == tail) {
		head = tail = 0;
	}
	*headp = head;
	*tailp = tail;
}

#ifdef _KERNEL

/****************************************************************/

/*
 * Remove transactions whose buffers are completely flushed to disk.
 * Will block until at least minfree space is available.
 * Only intended to be called from inside wapbl_flush and therefore
 * does not protect against commit races with itself or with flush.
 */
static int
wapbl_truncate(struct wapbl *wl, size_t minfree, int waitonly)
{
	size_t delta;
	size_t avail;
	off_t head;
	off_t tail;
	int error = 0;

	KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes));
	KASSERT(rw_write_held(&wl->wl_rwlock));

	mutex_enter(&wl->wl_mtx);

	/*
	 * First check to see if we have to do a commit
	 * at all.
	 */
	avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail);
	if (minfree < avail) {
		mutex_exit(&wl->wl_mtx);
		return 0;
	}
	minfree -= avail;
	while ((wl->wl_error_count == 0) &&
	    (wl->wl_reclaimable_bytes < minfree)) {
		WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
		    ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd "
		    "minfree=%zd\n",
		    &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes,
		    minfree));

		cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx);
	}
	if (wl->wl_reclaimable_bytes < minfree) {
		KASSERT(wl->wl_error_count);
		/* XXX maybe get actual error from buffer instead someday? */
		error = EIO;
	}
	head = wl->wl_head;
	tail = wl->wl_tail;
	delta = wl->wl_reclaimable_bytes;

	/* If all of the entries are flushed, then be sure to keep
	 * the reserved bytes reserved.  Watch out for discarded transactions,
	 * which could leave more bytes reserved than are reclaimable.
	 */
	if (SIMPLEQ_EMPTY(&wl->wl_entries) &&
	    (delta >= wl->wl_reserved_bytes)) {
		delta -= wl->wl_reserved_bytes;
	}
	wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head,
	    &tail);
	KDASSERT(wl->wl_reserved_bytes <=
		wapbl_space_used(wl->wl_circ_size, head, tail));
	mutex_exit(&wl->wl_mtx);

	if (error)
		return error;

	if (waitonly)
		return 0;

	/*
	 * This is where head, tail and delta are unprotected
	 * from races against itself or flush.  This is ok since
	 * we only call this routine from inside flush itself.
	 *
	 * XXX: how can it race against itself when accessed only
	 * from behind the write-locked rwlock?
	 */
	error = wapbl_write_commit(wl, head, tail);
	if (error)
		return error;

	wl->wl_head = head;
	wl->wl_tail = tail;

	mutex_enter(&wl->wl_mtx);
	KASSERT(wl->wl_reclaimable_bytes >= delta);
	wl->wl_reclaimable_bytes -= delta;
	mutex_exit(&wl->wl_mtx);
	WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
	    ("wapbl_truncate thread %d.%d truncating %zu bytes\n",
	    curproc->p_pid, curlwp->l_lid, delta));

	return 0;
}

/****************************************************************/

void
wapbl_biodone(struct buf *bp)
{
	struct wapbl_entry *we = bp->b_private;
	struct wapbl *wl = we->we_wapbl;

	/*
	 * Handle possible flushing of buffers after log has been
	 * decommissioned.
	 */
	if (!wl) {
		KASSERT(we->we_bufcount > 0);
		we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
		KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize);
		we->we_unsynced_bufbytes -= bp->b_bufsize;
#endif

		if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
			KASSERT(we->we_unsynced_bufbytes == 0);
#endif
			wapbl_free(we, sizeof(*we));
		}

		brelse(bp, 0);
		return;
	}

#ifdef ohbother
	KDASSERT(bp->b_flags & B_DONE);
	KDASSERT(!(bp->b_flags & B_DELWRI));
	KDASSERT(bp->b_flags & B_ASYNC);
	KDASSERT(bp->b_flags & B_BUSY);
	KDASSERT(!(bp->b_flags & B_LOCKED));
	KDASSERT(!(bp->b_flags & B_READ));
	KDASSERT(!(bp->b_flags & B_INVAL));
	KDASSERT(!(bp->b_flags & B_NOCACHE));
#endif

	if (bp->b_error) {
#ifdef notyet /* Can't currently handle possible dirty buffer reuse */
		/*
		 * XXXpooka: interfaces not fully updated
		 * Note: this was not enabled in the original patch
		 * against netbsd4 either.  I don't know if comment
		 * above is true or not.
		 */

		/*
		 * If an error occurs, report the error and leave the
		 * buffer as a delayed write on the LRU queue.
		 * restarting the write would likely result in
		 * an error spinloop, so let it be done harmlessly
		 * by the syncer.
		 */
		bp->b_flags &= ~(B_DONE);
		simple_unlock(&bp->b_interlock);

		if (we->we_error == 0) {
			mutex_enter(&wl->wl_mtx);
			wl->wl_error_count++;
			mutex_exit(&wl->wl_mtx);
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
		we->we_error = bp->b_error;
		bp->b_error = 0;
		brelse(bp);
		return;
#else
		/* For now, just mark the log permanently errored out */

		mutex_enter(&wl->wl_mtx);
		if (wl->wl_error_count == 0) {
			wl->wl_error_count++;
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
		mutex_exit(&wl->wl_mtx);
#endif
	}

	mutex_enter(&wl->wl_mtx);

	KASSERT(we->we_bufcount > 0);
	we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
	KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize);
	we->we_unsynced_bufbytes -= bp->b_bufsize;
	KASSERT(wl->wl_unsynced_bufbytes >= bp->b_bufsize);
	wl->wl_unsynced_bufbytes -= bp->b_bufsize;
#endif

	/*
	 * If the current transaction can be reclaimed, start
	 * at the beginning and reclaim any consecutive reclaimable
	 * transactions.  If we successfully reclaim anything,
	 * then wakeup anyone waiting for the reclaim.
	 */
	if (we->we_bufcount == 0) {
		size_t delta = 0;
		int errcnt = 0;
#ifdef WAPBL_DEBUG_BUFBYTES
		KDASSERT(we->we_unsynced_bufbytes == 0);
#endif
		/*
		 * clear any posted error, since the buffer it came from
		 * has been successfully flushed by now
		 */
		while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) &&
		       (we->we_bufcount == 0)) {
			delta += we->we_reclaimable_bytes;
			if (we->we_error)
				errcnt++;
			SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
			wapbl_free(we, sizeof(*we));
		}

		if (delta) {
			wl->wl_reclaimable_bytes += delta;
			KASSERT(wl->wl_error_count >= errcnt);
			wl->wl_error_count -= errcnt;
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
	}

	mutex_exit(&wl->wl_mtx);
	brelse(bp, 0);
}

/*
 * Write transactions to disk + start I/O for contents
 */
int
wapbl_flush(struct wapbl *wl, int waitfor)
{
	struct buf *bp;
	struct wapbl_entry *we;
	off_t off;
	off_t head;
	off_t tail;
	size_t delta = 0;
	size_t flushsize;
	size_t reserved;
	int error = 0;

	/*
	 * Do a quick check to see if a full flush can be skipped
	 * This assumes that the flush callback does not need to be called
	 * unless there are other outstanding bufs.
	 */
	if (!waitfor) {
		size_t nbufs;
		mutex_enter(&wl->wl_mtx);	/* XXX need mutex here to
						   protect the KASSERTS */
		nbufs = wl->wl_bufcount;
		KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
		KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
		mutex_exit(&wl->wl_mtx);
		if (nbufs == 0)
			return 0;
	}

	/*
	 * XXX we may consider using LK_UPGRADE here
	 * if we want to call flush from inside a transaction
	 */
	rw_enter(&wl->wl_rwlock, RW_WRITER);
	wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
	    wl->wl_dealloccnt);

	/*
	 * Now that we are fully locked and flushed,
	 * do another check for nothing to do.
	 */
	if (wl->wl_bufcount == 0) {
		goto out;
	}

#if 0
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush thread %d.%d flushing entries with "
	    "bufcount=%zu bufbytes=%zu\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes));
#endif

	/* Calculate amount of space needed to flush */
	flushsize = wapbl_transaction_len(wl);
	if (wapbl_verbose_commit) {
		struct timespec ts;
		getnanotime(&ts);
		printf("%s: %lld.%06ld this transaction = %zu bytes\n",
		    __func__, (long long)ts.tv_sec,
		    (long)ts.tv_nsec, flushsize);
	}

	if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
		/*
		 * XXX this could be handled more gracefully, perhaps place
		 * only a partial transaction in the log and allow the
		 * remaining to flush without the protection of the journal.
		 */
		panic("wapbl_flush: current transaction too big to flush\n");
	}

	error = wapbl_truncate(wl, flushsize, 0);
	if (error)
		goto out2;

	off = wl->wl_head;
	KASSERT((off == 0) || ((off >= wl->wl_circ_off) &&
	    (off < wl->wl_circ_off + wl->wl_circ_size)));
	error = wapbl_write_blocks(wl, &off);
	if (error)
		goto out2;
	error = wapbl_write_revocations(wl, &off);
	if (error)
		goto out2;
	error = wapbl_write_inodes(wl, &off);
	if (error)
		goto out2;

	reserved = 0;
	if (wl->wl_inohashcnt)
		reserved = wapbl_transaction_inodes_len(wl);

	head = wl->wl_head;
	tail = wl->wl_tail;

	wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize,
	    &head, &tail);
#ifdef WAPBL_DEBUG
	if (head != off) {
		panic("lost head! head=%"PRIdMAX" tail=%" PRIdMAX
		      " off=%"PRIdMAX" flush=%zu\n",
		      (intmax_t)head, (intmax_t)tail, (intmax_t)off,
		      flushsize);
	}
#else
	KASSERT(head == off);
#endif

	/* Opportunistically move the tail forward if we can */
	if (!wapbl_lazy_truncate) {
		mutex_enter(&wl->wl_mtx);
		delta = wl->wl_reclaimable_bytes;
		mutex_exit(&wl->wl_mtx);
		wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta,
		    &head, &tail);
	}

	error = wapbl_write_commit(wl, head, tail);
	if (error)
		goto out2;

	we = wapbl_calloc(1, sizeof(*we));

#ifdef WAPBL_DEBUG_BUFBYTES
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
	    " unsynced=%zu"
	    "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
	    "inodes=%d\n",
	    curproc->p_pid, curlwp->l_lid, flushsize, delta,
	    wapbl_space_used(wl->wl_circ_size, head, tail),
	    wl->wl_unsynced_bufbytes, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt,
	    wl->wl_inohashcnt));
#else
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
	    "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
	    "inodes=%d\n",
	    curproc->p_pid, curlwp->l_lid, flushsize, delta,
	    wapbl_space_used(wl->wl_circ_size, head, tail),
	    wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
	    wl->wl_dealloccnt, wl->wl_inohashcnt));
#endif


	mutex_enter(&bufcache_lock);
	mutex_enter(&wl->wl_mtx);

	wl->wl_reserved_bytes = reserved;
	wl->wl_head = head;
	wl->wl_tail = tail;
	KASSERT(wl->wl_reclaimable_bytes >= delta);
	wl->wl_reclaimable_bytes -= delta;
	wl->wl_dealloccnt = 0;
#ifdef WAPBL_DEBUG_BUFBYTES
	wl->wl_unsynced_bufbytes += wl->wl_bufbytes;
#endif

	we->we_wapbl = wl;
	we->we_bufcount = wl->wl_bufcount;
#ifdef WAPBL_DEBUG_BUFBYTES
	we->we_unsynced_bufbytes = wl->wl_bufbytes;
#endif
	we->we_reclaimable_bytes = flushsize;
	we->we_error = 0;
	SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries);

	/*
	 * This flushes bufs in the reverse order from which they were queued.
	 * It shouldn't matter, but if we care we could use TAILQ instead.
	 * XXX Note they will get put on the lru queue when they flush
	 * so we might actually want to change this to preserve order.
	 */
	while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
		if (bbusy(bp, 0, 0, &wl->wl_mtx)) {
			continue;
		}
		bp->b_iodone = wapbl_biodone;
		bp->b_private = we;
		bremfree(bp);
		wapbl_remove_buf_locked(wl, bp);
		mutex_exit(&wl->wl_mtx);
		mutex_exit(&bufcache_lock);
		bawrite(bp);
		mutex_enter(&bufcache_lock);
		mutex_enter(&wl->wl_mtx);
	}
	mutex_exit(&wl->wl_mtx);
	mutex_exit(&bufcache_lock);

#if 0
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush thread %d.%d done flushing entries...\n",
	    curproc->p_pid, curlwp->l_lid));
#endif

out:

	/*
	 * If the waitfor flag is set, don't return until everything is
	 * fully flushed and the on disk log is empty.
	 */
	if (waitfor) {
		error = wapbl_truncate(wl, wl->wl_circ_size -
		    wl->wl_reserved_bytes, wapbl_lazy_truncate);
	}

out2:
	if (error) {
		wl->wl_flush_abort(wl->wl_mount, wl->wl_deallocblks,
		    wl->wl_dealloclens, wl->wl_dealloccnt);
	}

#ifdef WAPBL_DEBUG_PRINT
	if (error) {
		pid_t pid = -1;
		lwpid_t lid = -1;
		if (curproc)
			pid = curproc->p_pid;
		if (curlwp)
			lid = curlwp->l_lid;
		mutex_enter(&wl->wl_mtx);
#ifdef WAPBL_DEBUG_BUFBYTES
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_flush: thread %d.%d aborted flush: "
		    "error = %d\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %d, reclaimable=%zu reserved=%zu "
		    "unsynced=%zu\n",
		    pid, lid, error, wl->wl_bufcount,
		    wl->wl_bufbytes, wl->wl_bcount,
		    wl->wl_dealloccnt, wl->wl_inohashcnt,
		    wl->wl_error_count, wl->wl_reclaimable_bytes,
		    wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_ERROR,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d, unsynced = %zu\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error, we->we_unsynced_bufbytes));
		}
#else
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_flush: thread %d.%d aborted flush: "
		    "error = %d\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %d, reclaimable=%zu reserved=%zu\n",
		    pid, lid, error, wl->wl_bufcount,
		    wl->wl_bufbytes, wl->wl_bcount,
		    wl->wl_dealloccnt, wl->wl_inohashcnt,
		    wl->wl_error_count, wl->wl_reclaimable_bytes,
		    wl->wl_reserved_bytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_ERROR,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d\n", we->we_bufcount,
			    we->we_reclaimable_bytes, we->we_error));
		}
#endif
		mutex_exit(&wl->wl_mtx);
	}
#endif

	rw_exit(&wl->wl_rwlock);
	return error;
}

/****************************************************************/

void
wapbl_jlock_assert(struct wapbl *wl)
{

	KASSERT(rw_lock_held(&wl->wl_rwlock));
}

void
wapbl_junlock_assert(struct wapbl *wl)
{

	KASSERT(!rw_write_held(&wl->wl_rwlock));
}

/****************************************************************/

/* locks missing */
void
wapbl_print(struct wapbl *wl,
		int full,
		void (*pr)(const char *, ...))
{
	struct buf *bp;
	struct wapbl_entry *we;
	(*pr)("wapbl %p", wl);
	(*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n",
	    wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn);
	(*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n",
	    wl->wl_circ_size, wl->wl_circ_off,
	    (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail);
1655 (*pr)("fs_dev_bshift = %d, log_dev_bshift = %d\n",
1656 wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift);
#ifdef WAPBL_DEBUG_BUFBYTES
	(*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
	    "reserved = %zu errcnt = %d unsynced = %zu\n",
	    wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
	    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
	    wl->wl_error_count, wl->wl_unsynced_bufbytes);
#else
	(*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
	    "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes,
	    wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
	    wl->wl_error_count);
#endif
	(*pr)("\tdealloccnt = %d, dealloclim = %d\n",
	    wl->wl_dealloccnt, wl->wl_dealloclim);
	(*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n",
	    wl->wl_inohashcnt, wl->wl_inohashmask);
	(*pr)("entries:\n");
	SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
#ifdef WAPBL_DEBUG_BUFBYTES
		(*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, "
		    "unsynced = %zu\n",
		    we->we_bufcount, we->we_reclaimable_bytes,
		    we->we_error, we->we_unsynced_bufbytes);
#else
		(*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n",
		    we->we_bufcount, we->we_reclaimable_bytes, we->we_error);
#endif
	}
	if (full) {
		int cnt = 0;
		(*pr)("bufs =");
		LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) {
			if (!LIST_NEXT(bp, b_wapbllist)) {
				(*pr)(" %p", bp);
			} else if ((++cnt % 6) == 0) {
				(*pr)(" %p,\n\t", bp);
			} else {
				(*pr)(" %p,", bp);
			}
		}
		(*pr)("\n");

		(*pr)("dealloced blks = ");
		{
			int i;
			cnt = 0;
			for (i = 0; i < wl->wl_dealloccnt; i++) {
				(*pr)(" %"PRId64":%d,",
				    wl->wl_deallocblks[i],
				    wl->wl_dealloclens[i]);
				if ((++cnt % 4) == 0) {
					(*pr)("\n\t");
				}
			}
		}
		(*pr)("\n");

		(*pr)("registered inodes = ");
		{
			int i;
			cnt = 0;
			for (i = 0; i <= wl->wl_inohashmask; i++) {
				struct wapbl_ino_head *wih;
				struct wapbl_ino *wi;

				wih = &wl->wl_inohash[i];
				LIST_FOREACH(wi, wih, wi_hash) {
					if (wi->wi_ino == 0)
						continue;
					(*pr)(" %"PRId32"/0%06"PRIo32",",
					    wi->wi_ino, wi->wi_mode);
					if ((++cnt % 4) == 0) {
						(*pr)("\n\t");
					}
				}
			}
			(*pr)("\n");
		}
	}
}

#if defined(WAPBL_DEBUG) || defined(DDB)
void
wapbl_dump(struct wapbl *wl)
{
#if defined(WAPBL_DEBUG)
	if (!wl)
		wl = wapbl_debug_wl;
#endif
	if (!wl)
		return;
	wapbl_print(wl, 1, printf);
}
#endif

/****************************************************************/

void
wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len)
{

	wapbl_jlock_assert(wl);

	mutex_enter(&wl->wl_mtx);
	/* XXX should eventually instead tie this into resource estimation */
	/*
	 * XXX this panic needs locking/mutex analysis and the
	 * ability to cope with the failure.
	 */
	/* XXX this XXX doesn't have enough XXX */
	if (__predict_false(wl->wl_dealloccnt >= wl->wl_dealloclim))
		panic("wapbl_register_deallocation: out of resources");

	wl->wl_deallocblks[wl->wl_dealloccnt] = blk;
	wl->wl_dealloclens[wl->wl_dealloccnt] = len;
	wl->wl_dealloccnt++;
	WAPBL_PRINTF(WAPBL_PRINT_ALLOC,
	    ("wapbl_register_deallocation: blk=%"PRId64" len=%d\n", blk, len));
	mutex_exit(&wl->wl_mtx);
}

/****************************************************************/

static void
wapbl_inodetrk_init(struct wapbl *wl, u_int size)
{

	wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask);
	if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) {
		pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0,
		    "wapblinopl", &pool_allocator_nointr, IPL_NONE);
	}
}

static void
wapbl_inodetrk_free(struct wapbl *wl)
{

	/* XXX this KASSERT needs locking/mutex analysis */
	KASSERT(wl->wl_inohashcnt == 0);
	hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask);
	if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) {
		pool_destroy(&wapbl_ino_pool);
	}
}

static struct wapbl_ino *
wapbl_inodetrk_get(struct wapbl *wl, ino_t ino)
{
	struct wapbl_ino_head *wih;
	struct wapbl_ino *wi;

	KASSERT(mutex_owned(&wl->wl_mtx));

	wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
	LIST_FOREACH(wi, wih, wi_hash) {
		if (ino == wi->wi_ino)
			return wi;
	}
	return 0;
}

void
wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode)
{
	struct wapbl_ino_head *wih;
	struct wapbl_ino *wi;

	wi = pool_get(&wapbl_ino_pool, PR_WAITOK);

	mutex_enter(&wl->wl_mtx);
	if (wapbl_inodetrk_get(wl, ino) == NULL) {
		wi->wi_ino = ino;
		wi->wi_mode = mode;
		wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
		LIST_INSERT_HEAD(wih, wi, wi_hash);
		wl->wl_inohashcnt++;
		WAPBL_PRINTF(WAPBL_PRINT_INODE,
		    ("wapbl_register_inode: ino=%"PRId64"\n", ino));
		mutex_exit(&wl->wl_mtx);
	} else {
		mutex_exit(&wl->wl_mtx);
		pool_put(&wapbl_ino_pool, wi);
	}
}

void
wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode)
{
	struct wapbl_ino *wi;

	mutex_enter(&wl->wl_mtx);
	wi = wapbl_inodetrk_get(wl, ino);
	if (wi) {
		WAPBL_PRINTF(WAPBL_PRINT_INODE,
		    ("wapbl_unregister_inode: ino=%"PRId64"\n", ino));
		KASSERT(wl->wl_inohashcnt > 0);
		wl->wl_inohashcnt--;
		LIST_REMOVE(wi, wi_hash);
		mutex_exit(&wl->wl_mtx);

		pool_put(&wapbl_ino_pool, wi);
	} else {
		mutex_exit(&wl->wl_mtx);
	}
}

/****************************************************************/

static inline size_t
wapbl_transaction_inodes_len(struct wapbl *wl)
{
	int blocklen = 1<<wl->wl_log_dev_bshift;
	int iph;

	/* Calculate number of inodes described in an inodelist header */
	iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
	    sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);

	KASSERT(iph > 0);

	return MAX(1, howmany(wl->wl_inohashcnt, iph)) * blocklen;
}


/* Calculate amount of space a transaction will take on disk */
static size_t
wapbl_transaction_len(struct wapbl *wl)
{
	int blocklen = 1<<wl->wl_log_dev_bshift;
	size_t len;
	int bph;

	/* Calculate number of blocks described in a blocklist header */
	bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
	    sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);

	KASSERT(bph > 0);

	len = wl->wl_bcount;
	len += howmany(wl->wl_bufcount, bph) * blocklen;
	len += howmany(wl->wl_dealloccnt, bph) * blocklen;
	len += wapbl_transaction_inodes_len(wl);

	return len;
}
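
/*
 * Rough worked example (all numbers invented for this comment): with
 * blocklen = 512 and a blocklist header that can describe, say, bph = 58
 * blocks, a transaction of 100 buffers totalling wl_bcount = 51200 bytes
 * with 10 pending deallocations needs 51200 bytes of data, plus
 * howmany(100, 58) = 2 blocklist headers and howmany(10, 58) = 1
 * revocation header (512 bytes each), plus the inodelist blocks.
 */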

/*
 * Perform commit operation
 *
 * Note that incrementing the generation number needs to
 * be protected against racing with other invocations
 * of wapbl_write_commit.  This is ok since this routine
 * is only invoked from wapbl_flush
 */
1912 static int
1913 wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail)
1914 {
1915 struct wapbl_wc_header *wc = wl->wl_wc_header;
1916 struct timespec ts;
1917 int error;
1918 int force = 1;
1919 daddr_t pbn;
1920
1921 if (wapbl_flush_disk_cache) {
1922 		/* XXX Calculate a checksum here instead; for now we flush the disk cache. */
1923 error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force,
1924 FWRITE, FSCRED);
1925 if (error) {
1926 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1927 ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%x "
1928 "returned %d\n", wl->wl_devvp->v_rdev, error));
1929 }
1930 }
1931
1932 wc->wc_head = head;
1933 wc->wc_tail = tail;
1934 wc->wc_checksum = 0;
1935 wc->wc_version = 1;
1936 getnanotime(&ts);
1937 wc->wc_time = ts.tv_sec;
1938 wc->wc_timensec = ts.tv_nsec;
1939
1940 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
1941 	    ("wapbl_write_commit: head = %"PRIdMAX" tail = %"PRIdMAX"\n",
1942 (intmax_t)head, (intmax_t)tail));
1943
1944 	/*
1945 	 * XXX If the generation number is about to roll over, first zero
1946 	 * the second commit header before trying to write both headers.
1947 	 */
1948
1949 pbn = wl->wl_logpbn + (wc->wc_generation % 2);
1950 #ifdef _KERNEL
1951 pbn = btodb(pbn << wc->wc_log_dev_bshift);
1952 #endif
1953 error = wapbl_write(wc, wc->wc_len, wl->wl_devvp, pbn);
1954 if (error)
1955 return error;
1956
1957 if (wapbl_flush_disk_cache) {
1958 error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force,
1959 FWRITE, FSCRED);
1960 if (error) {
1961 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1962 ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%x "
1963 "returned %d\n", wl->wl_devvp->v_rdev, error));
1964 }
1965 }
1966
1967 	/*
1968 	 * If the generation number was zero, write it out a second time.
1969 	 * This handles both initialization and generation number rollover.
1970 	 */
1971 if (wc->wc_generation++ == 0) {
1972 error = wapbl_write_commit(wl, head, tail);
1973 		/*
1974 		 * This panic should become removable once we do the
1975 		 * zeroing mentioned above and are certain to roll the
1976 		 * generation number back on failure.
1977 		 */
1978 if (error)
1979 panic("wapbl_write_commit: error writing duplicate "
1980 "log header: %d\n", error);
1981 }
1982 return 0;
1983 }
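
/*
 * Commit header ping-pong, by example: the two header slots live at
 * wl_logpbn + 0 and wl_logpbn + 1 and are selected by
 * wc_generation % 2, so generation 6 lands in slot 0, generation 7 in
 * slot 1, and so on.  Replay uses whichever slot holds the higher
 * generation, so a torn write of one header still leaves the other,
 * older commit intact.
 */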
1984
1985 /* Write the transaction's buffers to the log; the new offset is returned via offp */
1986 static int
1987 wapbl_write_blocks(struct wapbl *wl, off_t *offp)
1988 {
1989 struct wapbl_wc_blocklist *wc =
1990 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
1991 int blocklen = 1<<wl->wl_log_dev_bshift;
1992 int bph;
1993 struct buf *bp;
1994 off_t off = *offp;
1995 int error;
1996 size_t padding;
1997
1998 KASSERT(rw_write_held(&wl->wl_rwlock));
1999
2000 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
2001 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
2002
2003 bp = LIST_FIRST(&wl->wl_bufs);
2004
2005 while (bp) {
2006 int cnt;
2007 struct buf *obp = bp;
2008
2009 KASSERT(bp->b_flags & B_LOCKED);
2010
2011 wc->wc_type = WAPBL_WC_BLOCKS;
2012 wc->wc_len = blocklen;
2013 wc->wc_blkcount = 0;
2014 while (bp && (wc->wc_blkcount < bph)) {
2015 			/*
2016 			 * Make sure all the physical block numbers are up to
2017 			 * date.  If this is not always true on a given
2018 			 * filesystem, then VOP_BMAP must be called.  We
2019 			 * could call VOP_BMAP here, or else in the filesystem
2020 			 * specific flush callback, although neither of those
2021 			 * solutions allows us to take the vnode lock.  If a
2022 			 * filesystem requires that the vnode lock be taken
2023 			 * before VOP_BMAP is called, then we can probably do
2024 			 * so in bwrite, where the vnode lock should already
2025 			 * be held by the invoking code.
2026 			 */
2027 KASSERT((bp->b_vp->v_type == VBLK) ||
2028 (bp->b_blkno != bp->b_lblkno));
2029 KASSERT(bp->b_blkno > 0);
2030
2031 wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno;
2032 wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount;
2033 wc->wc_len += bp->b_bcount;
2034 wc->wc_blkcount++;
2035 bp = LIST_NEXT(bp, b_wapbllist);
2036 }
2037 if (wc->wc_len % blocklen != 0) {
2038 padding = blocklen - wc->wc_len % blocklen;
2039 wc->wc_len += padding;
2040 } else {
2041 padding = 0;
2042 }
2043
2044 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2045 ("wapbl_write_blocks: len = %u (padding %zu) off = %"PRIdMAX"\n",
2046 wc->wc_len, padding, (intmax_t)off));
2047
2048 error = wapbl_circ_write(wl, wc, blocklen, &off);
2049 if (error)
2050 return error;
2051 bp = obp;
2052 cnt = 0;
2053 while (bp && (cnt++ < bph)) {
2054 error = wapbl_circ_write(wl, bp->b_data,
2055 bp->b_bcount, &off);
2056 if (error)
2057 return error;
2058 bp = LIST_NEXT(bp, b_wapbllist);
2059 }
2060 if (padding) {
2061 void *zero;
2062
2063 zero = wapbl_malloc(padding);
2064 memset(zero, 0, padding);
2065 error = wapbl_circ_write(wl, zero, padding, &off);
2066 wapbl_free(zero, padding);
2067 if (error)
2068 return error;
2069 }
2070 }
2071 *offp = off;
2072 return 0;
2073 }
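
/*
 * Padding, by example: with blocklen = 512, a record consisting of one
 * header block plus 1300 bytes of buffer data has wc_len =
 * 512 + 1300 = 1812; 1812 % 512 = 276, so 236 zero bytes are appended
 * and wc_len rounds up to 2048, keeping every record a whole number of
 * log device blocks.
 */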
2074
2075 static int
2076 wapbl_write_revocations(struct wapbl *wl, off_t *offp)
2077 {
2078 struct wapbl_wc_blocklist *wc =
2079 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
2080 int i;
2081 int blocklen = 1<<wl->wl_log_dev_bshift;
2082 int bph;
2083 off_t off = *offp;
2084 int error;
2085
2086 if (wl->wl_dealloccnt == 0)
2087 return 0;
2088
2089 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
2090 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
2091
2092 i = 0;
2093 while (i < wl->wl_dealloccnt) {
2094 wc->wc_type = WAPBL_WC_REVOCATIONS;
2095 wc->wc_len = blocklen;
2096 wc->wc_blkcount = 0;
2097 while ((i < wl->wl_dealloccnt) && (wc->wc_blkcount < bph)) {
2098 wc->wc_blocks[wc->wc_blkcount].wc_daddr =
2099 wl->wl_deallocblks[i];
2100 wc->wc_blocks[wc->wc_blkcount].wc_dlen =
2101 wl->wl_dealloclens[i];
2102 wc->wc_blkcount++;
2103 i++;
2104 }
2105 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2106 ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n",
2107 wc->wc_len, (intmax_t)off));
2108 error = wapbl_circ_write(wl, wc, blocklen, &off);
2109 if (error)
2110 return error;
2111 }
2112 *offp = off;
2113 return 0;
2114 }
2115
2116 static int
2117 wapbl_write_inodes(struct wapbl *wl, off_t *offp)
2118 {
2119 struct wapbl_wc_inodelist *wc =
2120 (struct wapbl_wc_inodelist *)wl->wl_wc_scratch;
2121 int i;
2122 int blocklen = 1 << wl->wl_log_dev_bshift;
2123 off_t off = *offp;
2124 int error;
2125
2126 struct wapbl_ino_head *wih;
2127 struct wapbl_ino *wi;
2128 int iph;
2129
2130 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
2131 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
2132
2133 i = 0;
2134 wih = &wl->wl_inohash[0];
2135 	wi = NULL;
2136 do {
2137 wc->wc_type = WAPBL_WC_INODES;
2138 wc->wc_len = blocklen;
2139 wc->wc_inocnt = 0;
2140 wc->wc_clear = (i == 0);
2141 while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) {
2142 while (!wi) {
2143 KASSERT((wih - &wl->wl_inohash[0])
2144 <= wl->wl_inohashmask);
2145 wi = LIST_FIRST(wih++);
2146 }
2147 wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino;
2148 wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode;
2149 wc->wc_inocnt++;
2150 i++;
2151 wi = LIST_NEXT(wi, wi_hash);
2152 }
2153 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2154 ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n",
2155 wc->wc_len, (intmax_t)off));
2156 error = wapbl_circ_write(wl, wc, blocklen, &off);
2157 if (error)
2158 return error;
2159 } while (i < wl->wl_inohashcnt);
2160
2161 *offp = off;
2162 return 0;
2163 }
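
/*
 * The first inode list record of each transaction is written with
 * wc_clear set; on replay that discards the inodes accumulated from
 * earlier transactions, so only the most recently journalled
 * registration set survives (see wapbl_replay_process_inodes()).
 */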
2164
2165 #endif /* _KERNEL */
2166
2167 /****************************************************************/
2168
2169 struct wapbl_blk {
2170 LIST_ENTRY(wapbl_blk) wb_hash;
2171 daddr_t wb_blk;
2172 off_t wb_off; /* Offset of this block in the log */
2173 };
2174 #define WAPBL_BLKPOOL_MIN 83
2175
2176 static void
2177 wapbl_blkhash_init(struct wapbl_replay *wr, u_int size)
2178 {
2179 if (size < WAPBL_BLKPOOL_MIN)
2180 size = WAPBL_BLKPOOL_MIN;
2181 	KASSERT(wr->wr_blkhash == NULL);
2182 #ifdef _KERNEL
2183 wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask);
2184 #else /* ! _KERNEL */
2185 /* Manually implement hashinit */
2186 {
2187 unsigned long i, hashsize;
2188 for (hashsize = 1; hashsize < size; hashsize <<= 1)
2189 continue;
2190 wr->wr_blkhash = wapbl_malloc(hashsize * sizeof(*wr->wr_blkhash));
2191 for (i = 0; i < hashsize; i++)
2192 LIST_INIT(&wr->wr_blkhash[i]);
2193 wr->wr_blkhashmask = hashsize - 1;
2194 }
2195 #endif /* ! _KERNEL */
2196 }
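
/*
 * Example: a request for WAPBL_BLKPOOL_MIN (83) buckets rounds up to
 * the next power of two, hashsize = 128, so wr_blkhashmask = 127 and
 * "blk & wr_blkhashmask" picks a bucket without a modulo operation.
 */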
2197
2198 static void
2199 wapbl_blkhash_free(struct wapbl_replay *wr)
2200 {
2201 KASSERT(wr->wr_blkhashcnt == 0);
2202 #ifdef _KERNEL
2203 hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask);
2204 #else /* ! _KERNEL */
2205 wapbl_free(wr->wr_blkhash,
2206 (wr->wr_blkhashmask + 1) * sizeof(*wr->wr_blkhash));
2207 #endif /* ! _KERNEL */
2208 }
2209
2210 static struct wapbl_blk *
2211 wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk)
2212 {
2213 struct wapbl_blk_head *wbh;
2214 struct wapbl_blk *wb;
2215 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2216 LIST_FOREACH(wb, wbh, wb_hash) {
2217 if (blk == wb->wb_blk)
2218 return wb;
2219 }
2220 	return NULL;
2221 }
2222
2223 static void
2224 wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off)
2225 {
2226 struct wapbl_blk_head *wbh;
2227 struct wapbl_blk *wb;
2228 wb = wapbl_blkhash_get(wr, blk);
2229 if (wb) {
2230 KASSERT(wb->wb_blk == blk);
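		/*
		 * The block is already in the hash: just update its
		 * offset, so replay uses the most recently journalled
		 * copy of the block.
		 */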
2231 wb->wb_off = off;
2232 } else {
2233 wb = wapbl_malloc(sizeof(*wb));
2234 wb->wb_blk = blk;
2235 wb->wb_off = off;
2236 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2237 LIST_INSERT_HEAD(wbh, wb, wb_hash);
2238 wr->wr_blkhashcnt++;
2239 }
2240 }
2241
2242 static void
2243 wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk)
2244 {
2245 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2246 if (wb) {
2247 KASSERT(wr->wr_blkhashcnt > 0);
2248 wr->wr_blkhashcnt--;
2249 LIST_REMOVE(wb, wb_hash);
2250 wapbl_free(wb, sizeof(*wb));
2251 }
2252 }
2253
2254 static void
2255 wapbl_blkhash_clear(struct wapbl_replay *wr)
2256 {
2257 unsigned long i;
2258 for (i = 0; i <= wr->wr_blkhashmask; i++) {
2259 struct wapbl_blk *wb;
2260
2261 while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) {
2262 KASSERT(wr->wr_blkhashcnt > 0);
2263 wr->wr_blkhashcnt--;
2264 LIST_REMOVE(wb, wb_hash);
2265 wapbl_free(wb, sizeof(*wb));
2266 }
2267 }
2268 KASSERT(wr->wr_blkhashcnt == 0);
2269 }
2270
2271 /****************************************************************/
2272
2273 static int
2274 wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp)
2275 {
2276 size_t slen;
2277 off_t off = *offp;
2278 int error;
2279 daddr_t pbn;
2280
2281 KASSERT(((len >> wr->wr_log_dev_bshift) <<
2282 wr->wr_log_dev_bshift) == len);
2283
2284 if (off < wr->wr_circ_off)
2285 off = wr->wr_circ_off;
2286 slen = wr->wr_circ_off + wr->wr_circ_size - off;
2287 if (slen < len) {
2288 pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
2289 #ifdef _KERNEL
2290 pbn = btodb(pbn << wr->wr_log_dev_bshift);
2291 #endif
2292 error = wapbl_read(data, slen, wr->wr_devvp, pbn);
2293 if (error)
2294 return error;
2295 data = (uint8_t *)data + slen;
2296 len -= slen;
2297 off = wr->wr_circ_off;
2298 }
2299 pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
2300 #ifdef _KERNEL
2301 pbn = btodb(pbn << wr->wr_log_dev_bshift);
2302 #endif
2303 error = wapbl_read(data, len, wr->wr_devvp, pbn);
2304 if (error)
2305 return error;
2306 off += len;
2307 if (off >= wr->wr_circ_off + wr->wr_circ_size)
2308 off = wr->wr_circ_off;
2309 *offp = off;
2310 return 0;
2311 }
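
/*
 * Wraparound, by example (numbers illustrative only): with
 * wr_circ_off = 1024 and wr_circ_size = 8192 the window is
 * [1024, 9216).  A 2048 byte read starting at off = 8704 first reads
 * slen = 512 bytes up to the end of the window, then the remaining
 * 1536 bytes from offset 1024, leaving *offp at 2560.
 */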
2312
2313 static void
2314 wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp)
2315 {
2316 size_t slen;
2317 off_t off = *offp;
2318
2319 KASSERT(((len >> wr->wr_log_dev_bshift) <<
2320 wr->wr_log_dev_bshift) == len);
2321
2322 if (off < wr->wr_circ_off)
2323 off = wr->wr_circ_off;
2324 slen = wr->wr_circ_off + wr->wr_circ_size - off;
2325 if (slen < len) {
2326 len -= slen;
2327 off = wr->wr_circ_off;
2328 }
2329 off += len;
2330 if (off >= wr->wr_circ_off + wr->wr_circ_size)
2331 off = wr->wr_circ_off;
2332 *offp = off;
2333 }
2334
2335 /****************************************************************/
2336
2337 int
2338 wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp,
2339 daddr_t off, size_t count, size_t blksize)
2340 {
2341 struct wapbl_replay *wr;
2342 int error;
2343 struct vnode *devvp;
2344 daddr_t logpbn;
2345 uint8_t *scratch;
2346 struct wapbl_wc_header *wch;
2347 struct wapbl_wc_header *wch2;
2348 /* Use this until we read the actual log header */
2349 int log_dev_bshift = ilog2(blksize);
2350 size_t used;
2351 daddr_t pbn;
2352
2353 WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
2354 ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n",
2355 vp, off, count, blksize));
2356
2357 if (off < 0)
2358 return EINVAL;
2359
2360 if (blksize < DEV_BSIZE)
2361 return EINVAL;
2362 if (blksize % DEV_BSIZE)
2363 return EINVAL;
2364
2365 #ifdef _KERNEL
2366 #if 0
2367 	/* XXX vp->v_size isn't reliably set for VBLK devices,
2368 	 * especially root.  However, we might still want to verify
2369 	 * that the full load is readable. */
2370 if ((off + count) * blksize > vp->v_size)
2371 return EINVAL;
2372 #endif
2373 if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) {
2374 return error;
2375 }
2376 #else /* ! _KERNEL */
2377 devvp = vp;
2378 logpbn = off;
2379 #endif /* ! _KERNEL */
2380
2381 scratch = wapbl_malloc(MAXBSIZE);
2382
2383 pbn = logpbn;
2384 #ifdef _KERNEL
2385 pbn = btodb(pbn << log_dev_bshift);
2386 #endif
2387 error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, pbn);
2388 if (error)
2389 goto errout;
2390
2391 wch = (struct wapbl_wc_header *)scratch;
2392 wch2 =
2393 (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift));
2394 /* XXX verify checksums and magic numbers */
2395 if (wch->wc_type != WAPBL_WC_HEADER) {
2396 printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type);
2397 error = EFTYPE;
2398 goto errout;
2399 }
2400
2401 if (wch2->wc_generation > wch->wc_generation)
2402 wch = wch2;
2403
2404 wr = wapbl_calloc(1, sizeof(*wr));
2405
2406 wr->wr_logvp = vp;
2407 wr->wr_devvp = devvp;
2408 wr->wr_logpbn = logpbn;
2409
2410 wr->wr_scratch = scratch;
2411
2412 wr->wr_log_dev_bshift = wch->wc_log_dev_bshift;
2413 wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift;
2414 wr->wr_circ_off = wch->wc_circ_off;
2415 wr->wr_circ_size = wch->wc_circ_size;
2416 wr->wr_generation = wch->wc_generation;
2417
2418 used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail);
2419
2420 WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
2421 ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64
2422 " len=%"PRId64" used=%zu\n",
2423 wch->wc_head, wch->wc_tail, wch->wc_circ_off,
2424 wch->wc_circ_size, used));
2425
2426 wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift));
2427
2428 error = wapbl_replay_process(wr, wch->wc_head, wch->wc_tail);
2429 if (error) {
2430 wapbl_replay_stop(wr);
2431 wapbl_replay_free(wr);
2432 return error;
2433 }
2434
2435 *wrp = wr;
2436 return 0;
2437
2438 errout:
2439 wapbl_free(scratch, MAXBSIZE);
2440 return error;
2441 }
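
/*
 * Illustrative sketch (not compiled): how a file system might drive
 * the replay interface at mount time.  "devvp", "logstart", "logcnt"
 * and "secsize" are hypothetical placeholders for values taken from
 * the superblock.
 */
#if 0
	struct wapbl_replay *wr;
	int error;

	error = wapbl_replay_start(&wr, devvp, logstart, logcnt, secsize);
	if (error)
		return error;
	error = wapbl_replay_write(wr, devvp);	/* copy blocks back in place */
	wapbl_replay_stop(wr);			/* release scratch and hash */
	wapbl_replay_free(wr);
	return error;
#endif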
2442
2443 void
2444 wapbl_replay_stop(struct wapbl_replay *wr)
2445 {
2446
2447 if (!wapbl_replay_isopen(wr))
2448 return;
2449
2450 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n"));
2451
2452 wapbl_free(wr->wr_scratch, MAXBSIZE);
2453 wr->wr_scratch = NULL;
2454
2455 wr->wr_logvp = NULL;
2456
2457 wapbl_blkhash_clear(wr);
2458 wapbl_blkhash_free(wr);
2459 }
2460
2461 void
2462 wapbl_replay_free(struct wapbl_replay *wr)
2463 {
2464
2465 KDASSERT(!wapbl_replay_isopen(wr));
2466
2467 if (wr->wr_inodes)
2468 wapbl_free(wr->wr_inodes,
2469 wr->wr_inodescnt * sizeof(wr->wr_inodes[0]));
2470 wapbl_free(wr, sizeof(*wr));
2471 }
2472
2473 #ifdef _KERNEL
2474 int
2475 wapbl_replay_isopen1(struct wapbl_replay *wr)
2476 {
2477
2478 return wapbl_replay_isopen(wr);
2479 }
2480 #endif
2481
2482 static void
2483 wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp)
2484 {
2485 struct wapbl_wc_blocklist *wc =
2486 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2487 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2488 int i, j, n;
2489
2490 for (i = 0; i < wc->wc_blkcount; i++) {
2491 /*
2492 * Enter each physical block into the hashtable independently.
2493 */
2494 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
2495 for (j = 0; j < n; j++) {
2496 wapbl_blkhash_ins(wr, wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen),
2497 *offp);
2498 wapbl_circ_advance(wr, fsblklen, offp);
2499 }
2500 }
2501 }
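
/*
 * Example: with a 2048 byte fs block (wr_fs_dev_bshift = 11) and
 * DEV_BSIZE = 512, the blocks of a single entry hash at daddrs
 * wc_daddr + 0, wc_daddr + btodb(2048) = wc_daddr + 4, wc_daddr + 8,
 * and so on, while the log offset advances by fsblklen each time.
 */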
2502
2503 static void
2504 wapbl_replay_process_revocations(struct wapbl_replay *wr)
2505 {
2506 struct wapbl_wc_blocklist *wc =
2507 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2508 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2509 int i, j, n;
2510
2511 for (i = 0; i < wc->wc_blkcount; i++) {
2512 /*
2513 * Remove any blocks found from the hashtable.
2514 */
2515 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
2516 for (j = 0; j < n; j++)
2517 wapbl_blkhash_rem(wr, wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
2518 }
2519 }
2520
2521 static void
2522 wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff, off_t newoff)
2523 {
2524 struct wapbl_wc_inodelist *wc =
2525 (struct wapbl_wc_inodelist *)wr->wr_scratch;
2526 void *new_inodes;
2527 const size_t oldsize = wr->wr_inodescnt * sizeof(wr->wr_inodes[0]);
2528
2529 KASSERT(sizeof(wr->wr_inodes[0]) == sizeof(wc->wc_inodes[0]));
2530
2531 	/*
2532 	 * Keep track of where we found this, so that the location
2533 	 * won't be overwritten.
2534 	 */
2535 if (wc->wc_clear) {
2536 wr->wr_inodestail = oldoff;
2537 wr->wr_inodescnt = 0;
2538 if (wr->wr_inodes != NULL) {
2539 wapbl_free(wr->wr_inodes, oldsize);
2540 wr->wr_inodes = NULL;
2541 }
2542 }
2543 wr->wr_inodeshead = newoff;
2544 if (wc->wc_inocnt == 0)
2545 return;
2546
2547 new_inodes = wapbl_malloc((wr->wr_inodescnt + wc->wc_inocnt) *
2548 sizeof(wr->wr_inodes[0]));
2549 if (wr->wr_inodes != NULL) {
2550 memcpy(new_inodes, wr->wr_inodes, oldsize);
2551 wapbl_free(wr->wr_inodes, oldsize);
2552 }
2553 wr->wr_inodes = new_inodes;
2554 memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes,
2555 wc->wc_inocnt * sizeof(wr->wr_inodes[0]));
2556 wr->wr_inodescnt += wc->wc_inocnt;
2557 }
2558
2559 static int
2560 wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail)
2561 {
2562 off_t off;
2563 int error;
2564
2565 int logblklen = 1 << wr->wr_log_dev_bshift;
2566
2567 wapbl_blkhash_clear(wr);
2568
2569 off = tail;
2570 while (off != head) {
2571 struct wapbl_wc_null *wcn;
2572 off_t saveoff = off;
2573 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
2574 if (error)
2575 goto errout;
2576 wcn = (struct wapbl_wc_null *)wr->wr_scratch;
2577 switch (wcn->wc_type) {
2578 case WAPBL_WC_BLOCKS:
2579 wapbl_replay_process_blocks(wr, &off);
2580 break;
2581
2582 case WAPBL_WC_REVOCATIONS:
2583 wapbl_replay_process_revocations(wr);
2584 break;
2585
2586 case WAPBL_WC_INODES:
2587 wapbl_replay_process_inodes(wr, saveoff, off);
2588 break;
2589
2590 default:
2591 printf("Unrecognized wapbl type: 0x%08x\n",
2592 wcn->wc_type);
2593 error = EFTYPE;
2594 goto errout;
2595 }
2596 wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
2597 if (off != saveoff) {
2598 printf("wapbl_replay: corrupted records\n");
2599 error = EFTYPE;
2600 goto errout;
2601 }
2602 }
2603 return 0;
2604
2605 errout:
2606 wapbl_blkhash_clear(wr);
2607 return error;
2608 }
2609
2610 #if 0
2611 int
2612 wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp)
2613 {
2614 off_t off;
2615 int mismatchcnt = 0;
2616 int logblklen = 1 << wr->wr_log_dev_bshift;
2617 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2618 void *scratch1 = wapbl_malloc(MAXBSIZE);
2619 void *scratch2 = wapbl_malloc(MAXBSIZE);
2620 int error = 0;
2621
2622 KDASSERT(wapbl_replay_isopen(wr));
2623
2624 off = wch->wc_tail;
2625 while (off != wch->wc_head) {
2626 struct wapbl_wc_null *wcn;
2627 #ifdef DEBUG
2628 off_t saveoff = off;
2629 #endif
2630 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
2631 if (error)
2632 goto out;
2633 wcn = (struct wapbl_wc_null *)wr->wr_scratch;
2634 switch (wcn->wc_type) {
2635 case WAPBL_WC_BLOCKS:
2636 {
2637 struct wapbl_wc_blocklist *wc =
2638 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2639 int i;
2640 for (i = 0; i < wc->wc_blkcount; i++) {
2641 int foundcnt = 0;
2642 int dirtycnt = 0;
2643 int j, n;
2644 				/*
2645 				 * Check each physical block against
2646 				 * the hashtable independently.
2647 				 */
2648 n = wc->wc_blocks[i].wc_dlen >>
2649 wch->wc_fs_dev_bshift;
2650 for (j = 0; j < n; j++) {
2651 struct wapbl_blk *wb =
2652 wapbl_blkhash_get(wr,
2653 wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
2654 if (wb && (wb->wb_off == off)) {
2655 foundcnt++;
2656 error =
2657 wapbl_circ_read(wr,
2658 scratch1, fsblklen,
2659 &off);
2660 if (error)
2661 goto out;
2662 error =
2663 wapbl_read(scratch2,
2664 fsblklen, fsdevvp,
2665 wb->wb_blk);
2666 if (error)
2667 goto out;
2668 if (memcmp(scratch1,
2669 scratch2,
2670 fsblklen)) {
2671 printf(
2672 "wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n",
2673 wb->wb_blk, (intmax_t)off);
2674 dirtycnt++;
2675 mismatchcnt++;
2676 }
2677 } else {
2678 wapbl_circ_advance(wr,
2679 fsblklen, &off);
2680 }
2681 }
2682 #if 0
2683 			/*
2684 			 * If all of the blocks in an entry
2685 			 * are clean, then remove all of its
2686 			 * blocks from the hashtable since they
2687 			 * will never need to be replayed.
2688 			 */
2689 if ((foundcnt != 0) &&
2690 (dirtycnt == 0)) {
2691 off = saveoff;
2692 wapbl_circ_advance(wr,
2693 logblklen, &off);
2694 for (j = 0; j < n; j++) {
2695 struct wapbl_blk *wb =
2696 wapbl_blkhash_get(wr,
2697 wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
2698 if (wb &&
2699 (wb->wb_off == off)) {
2700 wapbl_blkhash_rem(wr, wb->wb_blk);
2701 }
2702 wapbl_circ_advance(wr,
2703 fsblklen, &off);
2704 }
2705 }
2706 #endif
2707 }
2708 }
2709 break;
2710 case WAPBL_WC_REVOCATIONS:
2711 case WAPBL_WC_INODES:
2712 break;
2713 default:
2714 KASSERT(0);
2715 }
2716 #ifdef DEBUG
2717 wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
2718 KASSERT(off == saveoff);
2719 #endif
2720 }
2721 out:
2722 wapbl_free(scratch1, MAXBSIZE);
2723 wapbl_free(scratch2, MAXBSIZE);
2724 if (!error && mismatchcnt)
2725 error = EFTYPE;
2726 return error;
2727 }
2728 #endif
2729
2730 int
2731 wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp)
2732 {
2733 struct wapbl_blk *wb;
2734 size_t i;
2735 off_t off;
2736 void *scratch;
2737 int error = 0;
2738 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2739
2740 KDASSERT(wapbl_replay_isopen(wr));
2741
2742 scratch = wapbl_malloc(MAXBSIZE);
2743
2744 for (i = 0; i <= wr->wr_blkhashmask; ++i) {
2745 LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) {
2746 off = wb->wb_off;
2747 error = wapbl_circ_read(wr, scratch, fsblklen, &off);
2748 if (error)
2749 break;
2750 error = wapbl_write(scratch, fsblklen, fsdevvp,
2751 wb->wb_blk);
2752 if (error)
2753 break;
2754 }
2755 }
2756
2757 wapbl_free(scratch, MAXBSIZE);
2758 return error;
2759 }
2760
2761 int
2762 wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len)
2763 {
2764 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2765
2766 KDASSERT(wapbl_replay_isopen(wr));
2767 KASSERT((len % fsblklen) == 0);
2768
2769 while (len != 0) {
2770 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2771 if (wb)
2772 return 1;
2773 		len -= fsblklen;
		blk++;	/* advance to the next block, as wapbl_replay_read() does */
2774 }
2775 return 0;
2776 }
2777
2778 int
2779 wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len)
2780 {
2781 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2782
2783 KDASSERT(wapbl_replay_isopen(wr));
2784
2785 KASSERT((len % fsblklen) == 0);
2786
2787 while (len != 0) {
2788 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2789 if (wb) {
2790 off_t off = wb->wb_off;
2791 int error;
2792 error = wapbl_circ_read(wr, data, fsblklen, &off);
2793 if (error)
2794 return error;
2795 }
2796 data = (uint8_t *)data + fsblklen;
2797 len -= fsblklen;
2798 blk++;
2799 }
2800 return 0;
2801 }
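
/*
 * Illustrative sketch (not compiled): a caller that prefers the
 * journal's pending copy of a block over the on-disk copy while a
 * replay is open.  "wr", "devvp", "data", "blk" and "len" are
 * hypothetical locals.
 */
#if 0
	if (wr != NULL && wapbl_replay_can_read(wr, blk, len))
		error = wapbl_replay_read(wr, data, blk, len);
	else
		error = wapbl_read(data, len, devvp, blk);
#endif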
2802
2803 #ifdef _KERNEL
2804 /*
2805  * This is not really a module now, but maybe on its way to
2806  * becoming one some day.
2807  */
2808 MODULE(MODULE_CLASS_VFS, wapbl, NULL);
2809
2810 static int
2811 wapbl_modcmd(modcmd_t cmd, void *arg)
2812 {
2813
2814 switch (cmd) {
2815 case MODULE_CMD_INIT:
2816 wapbl_init();
2817 return 0;
2818 case MODULE_CMD_FINI:
2819 #ifdef notyet
2820 return wapbl_fini(true);
2821 #endif
2822 return EOPNOTSUPP;
2823 default:
2824 return ENOTTY;
2825 }
2826 }
2827 #endif /* _KERNEL */
2828