/*	$NetBSD: vfs_wapbl.c,v 1.52.2.2 2012/11/20 03:02:45 tls Exp $	*/

/*-
 * Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * This implements file system independent write-ahead logging.
 */

#define	WAPBL_INTERNAL

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.52.2.2 2012/11/20 03:02:45 tls Exp $");

#include <sys/param.h>
#include <sys/bitops.h>

#ifdef _KERNEL
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/module.h>
#include <sys/resourcevar.h>
#include <sys/conf.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/kauth.h>
#include <sys/mutex.h>
#include <sys/atomic.h>
#include <sys/wapbl.h>
#include <sys/wapbl_replay.h>

#include <miscfs/specfs/specdev.h>

#define	wapbl_alloc(s) kmem_alloc((s), KM_SLEEP)
#define	wapbl_free(a, s) kmem_free((a), (s))
#define	wapbl_calloc(n, s) kmem_zalloc((n)*(s), KM_SLEEP)

static struct sysctllog *wapbl_sysctl;
static int wapbl_flush_disk_cache = 1;
static int wapbl_verbose_commit = 0;

#else /* !_KERNEL */
#include <assert.h>
#include <errno.h>
#include <stdio.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

#include <sys/time.h>
#include <sys/wapbl.h>
#include <sys/wapbl_replay.h>

#define	KDASSERT(x) assert(x)
#define	KASSERT(x) assert(x)
#define	wapbl_alloc(s) malloc(s)
#define	wapbl_free(a, s) free(a)
#define	wapbl_calloc(n, s) calloc((n), (s))

#endif /* !_KERNEL */

/*
 * INTERNAL DATA STRUCTURES
 */

/*
 * This structure holds per-mount log information.
 *
 * Legend:	a = atomic access only
 *		r = read-only after init
 *		l = rwlock held
 *		m = mutex held
 *		lm = rwlock held writing or mutex held
 *		u = unlocked access ok
 *		b = bufcache_lock held
 */
struct wapbl {
	struct vnode *wl_logvp;	/* r:	log here */
	struct vnode *wl_devvp;	/* r:	log on this device */
	struct mount *wl_mount;	/* r:	mountpoint wl is associated with */
	daddr_t wl_logpbn;	/* r:	Physical block number of start of log */
	int wl_log_dev_bshift;	/* r:	logarithm of device block size of log
					   device */
	int wl_fs_dev_bshift;	/* r:	logarithm of device block size of
					   filesystem device */

	unsigned wl_lock_count;	/* m:	Count of transactions in progress */

	size_t wl_circ_size;	/* r:	Number of bytes in buffer of log */
	size_t wl_circ_off;	/* r:	Number of bytes reserved at start */

	size_t wl_bufcount_max;	/* r:	Number of buffers reserved for log */
	size_t wl_bufbytes_max;	/* r:	Number of buf bytes reserved for log */

	off_t wl_head;		/* l:	Byte offset of log head */
	off_t wl_tail;		/* l:	Byte offset of log tail */
	/*
	 * head == tail == 0 means log is empty
	 * head == tail != 0 means log is full
	 * See assertions in wapbl_advance() for other boundary conditions.
	 * Only truncate moves the tail, except when flush sets it to
	 * wl_circ_off.  Only flush moves the head, except when truncate
	 * sets it to 0.
	 */
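
	/*
	 * Illustrative example (values assumed, not taken from any real
	 * configuration): with wl_circ_off = 1024 and wl_circ_size = 8192
	 * the usable log region is [1024, 9216).  head = 5120 and
	 * tail = 2048 then mean that bytes [2048, 5120) hold transactions
	 * that are committed on disk but not yet reclaimed.
	 */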

	struct wapbl_wc_header *wl_wc_header;	/* l	*/
	void *wl_wc_scratch;	/* l:	scratch space (XXX: why?!?) */

	kmutex_t wl_mtx;	/* u:	short-term lock */
	krwlock_t wl_rwlock;	/* u:	File system transaction lock */

	/*
	 * Must be held while accessing
	 * wl_bufcount, wl_bufs, wl_head or wl_tail
	 */

	/*
	 * Callback called from within the flush routine to flush any extra
	 * bits.  Note that flush may be skipped without calling this if
	 * there are no outstanding buffers in the transaction.
	 */
#ifdef _KERNEL
	wapbl_flush_fn_t wl_flush;	/* r	*/
	wapbl_flush_fn_t wl_flush_abort;/* r	*/
#endif

	size_t wl_bufbytes;	/* m:	Byte count of pages in wl_bufs */
	size_t wl_bufcount;	/* m:	Count of buffers in wl_bufs */
	size_t wl_bcount;	/* m:	Total bcount of wl_bufs */

	LIST_HEAD(, buf) wl_bufs; /* m:	Buffers in current transaction */

	kcondvar_t wl_reclaimable_cv;	/* m (obviously) */
	size_t wl_reclaimable_bytes;	/* m:	Amount of space available for
						   reclamation by truncate */
	int wl_error_count;	/* m:	# of wl_entries with errors */
	size_t wl_reserved_bytes; /* never truncate log smaller than this */

#ifdef WAPBL_DEBUG_BUFBYTES
	size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */
#endif

	daddr_t *wl_deallocblks;/* lm:	address of block */
	int *wl_dealloclens;	/* lm:	size of block */
	int wl_dealloccnt;	/* lm:	total count */
	int wl_dealloclim;	/* l:	max count */

	/* hashtable of inode numbers for allocated but unlinked inodes */
	/* synch ??? */
	LIST_HEAD(wapbl_ino_head, wapbl_ino) *wl_inohash;
	u_long wl_inohashmask;
	int wl_inohashcnt;

	SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction
						   accounting */
};

#ifdef WAPBL_DEBUG_PRINT
int wapbl_debug_print = WAPBL_DEBUG_PRINT;
#endif

/****************************************************************/
#ifdef _KERNEL

#ifdef WAPBL_DEBUG
struct wapbl *wapbl_debug_wl;
#endif

static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail);
static int wapbl_write_blocks(struct wapbl *wl, off_t *offp);
static int wapbl_write_revocations(struct wapbl *wl, off_t *offp);
static int wapbl_write_inodes(struct wapbl *wl, off_t *offp);
#endif /* _KERNEL */

static int wapbl_replay_process(struct wapbl_replay *wr, off_t, off_t);

static inline size_t wapbl_space_free(size_t avail, off_t head,
	off_t tail);
static inline size_t wapbl_space_used(size_t avail, off_t head,
	off_t tail);

#ifdef _KERNEL

static struct pool wapbl_entry_pool;

#define	WAPBL_INODETRK_SIZE 83
static int wapbl_ino_pool_refcount;
static struct pool wapbl_ino_pool;
struct wapbl_ino {
	LIST_ENTRY(wapbl_ino) wi_hash;
	ino_t wi_ino;
	mode_t wi_mode;
};

static void wapbl_inodetrk_init(struct wapbl *wl, u_int size);
static void wapbl_inodetrk_free(struct wapbl *wl);
static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino);

static size_t wapbl_transaction_len(struct wapbl *wl);
static inline size_t wapbl_transaction_inodes_len(struct wapbl *wl);

#if 0
int wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
#endif

static int wapbl_replay_isopen1(struct wapbl_replay *);

/*
 * This is useful for debugging.  If set, the log will
 * only be truncated when necessary.
 */
int wapbl_lazy_truncate = 0;

struct wapbl_ops wapbl_ops = {
	.wo_wapbl_discard	= wapbl_discard,
	.wo_wapbl_replay_isopen	= wapbl_replay_isopen1,
	.wo_wapbl_replay_can_read = wapbl_replay_can_read,
	.wo_wapbl_replay_read	= wapbl_replay_read,
	.wo_wapbl_add_buf	= wapbl_add_buf,
	.wo_wapbl_remove_buf	= wapbl_remove_buf,
	.wo_wapbl_resize_buf	= wapbl_resize_buf,
	.wo_wapbl_begin		= wapbl_begin,
	.wo_wapbl_end		= wapbl_end,
	.wo_wapbl_junlock_assert= wapbl_junlock_assert,

	/* XXX: the following is only used to say "this is a wapbl buf" */
	.wo_wapbl_biodone	= wapbl_biodone,
};

static int
wapbl_sysctl_init(void)
{
	int rv;
	const struct sysctlnode *rnode, *cnode;

	wapbl_sysctl = NULL;

	rv = sysctl_createv(&wapbl_sysctl, 0, NULL, &rnode,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "vfs", NULL,
		       NULL, 0, NULL, 0,
		       CTL_VFS, CTL_EOL);
	if (rv)
		return rv;

	rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &rnode,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "wapbl",
		       SYSCTL_DESCR("WAPBL journaling options"),
		       NULL, 0, NULL, 0,
		       CTL_CREATE, CTL_EOL);
	if (rv)
		return rv;

	rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "flush_disk_cache",
		       SYSCTL_DESCR("flush disk cache"),
		       NULL, 0, &wapbl_flush_disk_cache, 0,
		       CTL_CREATE, CTL_EOL);
	if (rv)
		return rv;

	rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "verbose_commit",
		       SYSCTL_DESCR("show time and size of wapbl log commits"),
		       NULL, 0, &wapbl_verbose_commit, 0,
		       CTL_CREATE, CTL_EOL);
	return rv;
}

static void
wapbl_init(void)
{

	pool_init(&wapbl_entry_pool, sizeof(struct wapbl_entry), 0, 0, 0,
	    "wapblentrypl", &pool_allocator_kmem, IPL_VM);

	wapbl_sysctl_init();
}

#ifdef notyet
static int
wapbl_fini(bool interface)
{

	if (wapbl_sysctl != NULL)
		sysctl_teardown(&wapbl_sysctl);

	pool_destroy(&wapbl_entry_pool);

	return 0;
}
#endif

static int
wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr)
{
	int error, i;

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_start: reusing log with %d inodes\n", wr->wr_inodescnt));

	/*
	 * It's only valid to reuse the replay log if it's
	 * the same as the new log we just opened.
	 */
	KDASSERT(!wapbl_replay_isopen(wr));
	KASSERT(wl->wl_devvp->v_type == VBLK);
	KASSERT(wr->wr_devvp->v_type == VBLK);
	KASSERT(wl->wl_devvp->v_rdev == wr->wr_devvp->v_rdev);
	KASSERT(wl->wl_logpbn == wr->wr_logpbn);
	KASSERT(wl->wl_circ_size == wr->wr_circ_size);
	KASSERT(wl->wl_circ_off == wr->wr_circ_off);
	KASSERT(wl->wl_log_dev_bshift == wr->wr_log_dev_bshift);
	KASSERT(wl->wl_fs_dev_bshift == wr->wr_fs_dev_bshift);

	wl->wl_wc_header->wc_generation = wr->wr_generation + 1;

	for (i = 0; i < wr->wr_inodescnt; i++)
		wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber,
		    wr->wr_inodes[i].wr_imode);

	/* Make sure new transaction won't overwrite old inodes list */
	KDASSERT(wapbl_transaction_len(wl) <=
	    wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead,
	    wr->wr_inodestail));

	wl->wl_head = wl->wl_tail = wr->wr_inodeshead;
	wl->wl_reclaimable_bytes = wl->wl_reserved_bytes =
	    wapbl_transaction_len(wl);

	error = wapbl_write_inodes(wl, &wl->wl_head);
	if (error)
		return error;

	KASSERT(wl->wl_head != wl->wl_tail);
	KASSERT(wl->wl_head != 0);

	return 0;
}

int
wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp,
	daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr,
	wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn)
{
	struct wapbl *wl;
	struct vnode *devvp;
	daddr_t logpbn;
	int error;
	int log_dev_bshift = ilog2(blksize);
	int fs_dev_bshift = log_dev_bshift;
	int run;

	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64
	    " count=%zu blksize=%zu\n", vp, off, count, blksize));

	if (log_dev_bshift > fs_dev_bshift) {
		WAPBL_PRINTF(WAPBL_PRINT_OPEN,
			("wapbl: log device's block size cannot be larger "
			 "than filesystem's\n"));
		/*
		 * Not currently implemented, although it could be if
		 * needed someday.
		 */
		return ENOSYS;
	}

	if (off < 0)
		return EINVAL;

	if (blksize < DEV_BSIZE)
		return EINVAL;
	if (blksize % DEV_BSIZE)
		return EINVAL;

	/* XXXTODO: verify that the full load is writable */

	/*
	 * XXX check for minimum log size
	 * minimum is governed by minimum amount of space
	 * to complete a transaction. (probably truncate)
	 */
	/* XXX for now pick something minimal */
	if ((count * blksize) < mp->mnt_maxphys) {
		return ENOSPC;
	}

	if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) {
		return error;
	}

	wl = wapbl_calloc(1, sizeof(*wl));
	rw_init(&wl->wl_rwlock);
	mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&wl->wl_reclaimable_cv, "wapblrec");
	LIST_INIT(&wl->wl_bufs);
	SIMPLEQ_INIT(&wl->wl_entries);

	wl->wl_logvp = vp;
	wl->wl_devvp = devvp;
	wl->wl_mount = mp;
	wl->wl_logpbn = logpbn;
	wl->wl_log_dev_bshift = log_dev_bshift;
	wl->wl_fs_dev_bshift = fs_dev_bshift;

	wl->wl_flush = flushfn;
	wl->wl_flush_abort = flushabortfn;

	/* Reserve two log device blocks for the commit headers */
	wl->wl_circ_off = 2<<wl->wl_log_dev_bshift;
	wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off);
	/* truncate the log usage to a multiple of log_dev_bshift */
	wl->wl_circ_size >>= wl->wl_log_dev_bshift;
	wl->wl_circ_size <<= wl->wl_log_dev_bshift;
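
	/*
	 * Worked example (assumed numbers, for illustration only): with a
	 * 512-byte log device block (wl_log_dev_bshift == 9), wl_circ_off
	 * becomes 1024, and a raw log area of 65836 bytes would yield
	 * 64812 bytes, which the shift pair above truncates to 64512,
	 * a multiple of 512.
	 */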

	/*
	 * wl_bufbytes_max limits the size of the in-memory transaction space.
	 * - Since buffers are allocated and accounted for in units of
	 *   PAGE_SIZE it is required to be a multiple of PAGE_SIZE
	 *   (i.e. 1<<PAGE_SHIFT)
	 * - Since the log device has to be written in units of
	 *   1<<wl_log_dev_bshift it is required to be a multiple of
	 *   1<<wl_log_dev_bshift.
	 * - Since the filesystem will provide data in units of
	 *   1<<wl_fs_dev_bshift, it is convenient to be a multiple of
	 *   1<<wl_fs_dev_bshift.
	 * Therefore it must be a multiple of the least common multiple of
	 * those three quantities.  Fortunately, all of those quantities are
	 * guaranteed to be a power of two, and the least common multiple of
	 * a set of numbers which are all powers of two is simply the maximum
	 * of those numbers.  So truncating wl_bufbytes_max to a multiple of
	 * each of the three alignments in turn, as done below, leaves it a
	 * multiple of their maximum:
	 */

	/* XXX fix actual number of pages reserved per filesystem. */
	wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2);

	/* Round wl_bufbytes_max to the largest power of two constraint */
	wl->wl_bufbytes_max >>= PAGE_SHIFT;
	wl->wl_bufbytes_max <<= PAGE_SHIFT;
	wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift;
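
	/*
	 * Worked example (assumed values): with PAGE_SHIFT = 12 and both
	 * bshifts equal to 9, a wl_bufbytes_max of 1050000 is truncated to
	 * 1048576 by the PAGE_SHIFT round-down/round-up; the two later
	 * shift pairs then leave it unchanged, since it is already a
	 * multiple of 512.
	 */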

	/* XXX maybe use filesystem fragment size instead of 1024 */
	/* XXX fix actual number of buffers reserved per filesystem. */
	wl->wl_bufcount_max = (nbuf / 2) * 1024;

	/* XXX tie this into resource estimation */
	wl->wl_dealloclim = wl->wl_bufbytes_max / mp->mnt_stat.f_bsize / 2;

	wl->wl_deallocblks = wapbl_alloc(sizeof(*wl->wl_deallocblks) *
	    wl->wl_dealloclim);
	wl->wl_dealloclens = wapbl_alloc(sizeof(*wl->wl_dealloclens) *
	    wl->wl_dealloclim);

	wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);

	/* Initialize the commit header */
	{
		struct wapbl_wc_header *wc;
		size_t len = 1 << wl->wl_log_dev_bshift;
		wc = wapbl_calloc(1, len);
		wc->wc_type = WAPBL_WC_HEADER;
		wc->wc_len = len;
		wc->wc_circ_off = wl->wl_circ_off;
		wc->wc_circ_size = wl->wl_circ_size;
		/* XXX wc->wc_fsid */
		wc->wc_log_dev_bshift = wl->wl_log_dev_bshift;
		wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift;
		wl->wl_wc_header = wc;
		wl->wl_wc_scratch = wapbl_alloc(len);
	}

	/*
	 * if there was an existing set of unlinked but
	 * allocated inodes, preserve it in the new
	 * log.
	 */
	if (wr && wr->wr_inodescnt) {
		error = wapbl_start_flush_inodes(wl, wr);
		if (error)
			goto errout;
	}

	error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail);
	if (error) {
		goto errout;
	}

	*wlp = wl;
#if defined(WAPBL_DEBUG)
	wapbl_debug_wl = wl;
#endif

	return 0;
 errout:
	wapbl_discard(wl);
	wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_deallocblks,
	    sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim);
	wapbl_free(wl->wl_dealloclens,
	    sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim);
	wapbl_inodetrk_free(wl);
	wapbl_free(wl, sizeof(*wl));

	return error;
}

/*
 * Like wapbl_flush, only discards the transaction
 * completely
 */

void
wapbl_discard(struct wapbl *wl)
{
	struct wapbl_entry *we;
	struct buf *bp;
	int i;

	/*
	 * XXX we may consider using upgrade here
	 * if we want to call flush from inside a transaction
	 */
	rw_enter(&wl->wl_rwlock, RW_WRITER);
	wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
	    wl->wl_dealloccnt);

#ifdef WAPBL_DEBUG_PRINT
	{
		pid_t pid = -1;
		lwpid_t lid = -1;
		if (curproc)
			pid = curproc->p_pid;
		if (curlwp)
			lid = curlwp->l_lid;
#ifdef WAPBL_DEBUG_BUFBYTES
		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
		    ("wapbl_discard: thread %d.%d discarding "
		    "transaction\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %u, reclaimable=%zu reserved=%zu "
		    "unsynced=%zu\n",
		    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
		    wl->wl_bcount, wl->wl_dealloccnt,
		    wl->wl_inohashcnt, wl->wl_error_count,
		    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
		    wl->wl_unsynced_bufbytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d, unsynced = %zu\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error, we->we_unsynced_bufbytes));
		}
#else /* !WAPBL_DEBUG_BUFBYTES */
		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
		    ("wapbl_discard: thread %d.%d discarding transaction\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %u, reclaimable=%zu reserved=%zu\n",
		    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
		    wl->wl_bcount, wl->wl_dealloccnt,
		    wl->wl_inohashcnt, wl->wl_error_count,
		    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error));
		}
#endif /* !WAPBL_DEBUG_BUFBYTES */
	}
#endif /* WAPBL_DEBUG_PRINT */

	for (i = 0; i <= wl->wl_inohashmask; i++) {
		struct wapbl_ino_head *wih;
		struct wapbl_ino *wi;

		wih = &wl->wl_inohash[i];
		while ((wi = LIST_FIRST(wih)) != NULL) {
			LIST_REMOVE(wi, wi_hash);
			pool_put(&wapbl_ino_pool, wi);
			KASSERT(wl->wl_inohashcnt > 0);
			wl->wl_inohashcnt--;
		}
	}

	/*
	 * clean buffer list
	 */
	mutex_enter(&bufcache_lock);
	mutex_enter(&wl->wl_mtx);
	while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
		if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) {
			/*
			 * The buffer will be unlocked and
			 * removed from the transaction in brelse
			 */
			mutex_exit(&wl->wl_mtx);
			brelsel(bp, 0);
			mutex_enter(&wl->wl_mtx);
		}
	}
	mutex_exit(&wl->wl_mtx);
	mutex_exit(&bufcache_lock);

	/*
	 * Remove references to this wl from wl_entries, free any which
	 * no longer have buffers, others will be freed in wapbl_biodone
	 * when they no longer have any buffers.
	 */
	while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) {
		SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
		/* XXX should we be accumulating wl_error_count
		 * and increasing reclaimable bytes ? */
		we->we_wapbl = NULL;
		if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
			KASSERT(we->we_unsynced_bufbytes == 0);
#endif
			pool_put(&wapbl_entry_pool, we);
		}
	}

	/* Discard list of deallocs */
	wl->wl_dealloccnt = 0;
	/* XXX should we clear wl_reserved_bytes? */

	KASSERT(wl->wl_bufbytes == 0);
	KASSERT(wl->wl_bcount == 0);
	KASSERT(wl->wl_bufcount == 0);
	KASSERT(LIST_EMPTY(&wl->wl_bufs));
	KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
	KASSERT(wl->wl_inohashcnt == 0);

	rw_exit(&wl->wl_rwlock);
}

int
wapbl_stop(struct wapbl *wl, int force)
{
	struct vnode *vp;
	int error;

	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n"));
	error = wapbl_flush(wl, 1);
	if (error) {
		if (force)
			wapbl_discard(wl);
		else
			return error;
	}

	/* Unlinked inodes persist after a flush */
	if (wl->wl_inohashcnt) {
		if (force) {
			wapbl_discard(wl);
		} else {
			return EBUSY;
		}
	}

	KASSERT(wl->wl_bufbytes == 0);
	KASSERT(wl->wl_bcount == 0);
	KASSERT(wl->wl_bufcount == 0);
	KASSERT(LIST_EMPTY(&wl->wl_bufs));
	KASSERT(wl->wl_dealloccnt == 0);
	KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
	KASSERT(wl->wl_inohashcnt == 0);

	vp = wl->wl_logvp;

	wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_deallocblks,
	    sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim);
	wapbl_free(wl->wl_dealloclens,
	    sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim);
	wapbl_inodetrk_free(wl);

	cv_destroy(&wl->wl_reclaimable_cv);
	mutex_destroy(&wl->wl_mtx);
	rw_destroy(&wl->wl_rwlock);
	wapbl_free(wl, sizeof(*wl));

	return 0;
}

static int
wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
{
	struct pstats *pstats = curlwp->l_proc->p_stats;
	struct buf *bp;
	int error;

	KASSERT((flags & ~(B_WRITE | B_READ)) == 0);
	KASSERT(devvp->v_type == VBLK);

	if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
		mutex_enter(devvp->v_interlock);
		devvp->v_numoutput++;
		mutex_exit(devvp->v_interlock);
		pstats->p_ru.ru_oublock++;
	} else {
		pstats->p_ru.ru_inblock++;
	}

	bp = getiobuf(devvp, true);
	bp->b_flags = flags;
	bp->b_cflags = BC_BUSY; /* silly & dubious */
	bp->b_dev = devvp->v_rdev;
	bp->b_data = data;
	bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
	bp->b_blkno = pbn;
	BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);

	WAPBL_PRINTF(WAPBL_PRINT_IO,
	    ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%"PRIx64"\n",
	    BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount,
	    bp->b_blkno, bp->b_dev));

	VOP_STRATEGY(devvp, bp);

	error = biowait(bp);
	putiobuf(bp);

	if (error) {
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_doio: %s %zu bytes at block %" PRId64
		    " on dev 0x%"PRIx64" failed with error %d\n",
		    (((flags & (B_WRITE | B_READ)) == B_WRITE) ?
		     "write" : "read"),
		    len, pbn, devvp->v_rdev, error));
	}

	return error;
}

int
wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

	return wapbl_doio(data, len, devvp, pbn, B_WRITE);
}

int
wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

	return wapbl_doio(data, len, devvp, pbn, B_READ);
}

/*
 * off is a byte offset into the log; returns the new offset for the
 * next write and handles log wraparound.
 */
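/*
 * A sketch of the wraparound case handled below (illustrative numbers
 * only): with wl_circ_off = 1024 and wl_circ_size = 8192 the region is
 * [1024, 9216); a 2048-byte write starting at off = 8704 is split into
 * 512 bytes at 8704 followed by 1536 bytes at 1024, leaving *offp = 2560.
 */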
static int
wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp)
{
	size_t slen;
	off_t off = *offp;
	int error;
	daddr_t pbn;

	KDASSERT(((len >> wl->wl_log_dev_bshift) <<
	    wl->wl_log_dev_bshift) == len);

	if (off < wl->wl_circ_off)
		off = wl->wl_circ_off;
	slen = wl->wl_circ_off + wl->wl_circ_size - off;
	if (slen < len) {
		pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
#ifdef _KERNEL
		pbn = btodb(pbn << wl->wl_log_dev_bshift);
#endif
		error = wapbl_write(data, slen, wl->wl_devvp, pbn);
		if (error)
			return error;
		data = (uint8_t *)data + slen;
		len -= slen;
		off = wl->wl_circ_off;
	}
	pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
#ifdef _KERNEL
	pbn = btodb(pbn << wl->wl_log_dev_bshift);
#endif
	error = wapbl_write(data, len, wl->wl_devvp, pbn);
	if (error)
		return error;
	off += len;
	if (off >= wl->wl_circ_off + wl->wl_circ_size)
		off = wl->wl_circ_off;
	*offp = off;
	return 0;
}

/****************************************************************/

int
wapbl_begin(struct wapbl *wl, const char *file, int line)
{
	int doflush;
	unsigned lockcount;
	uint32_t maxphys;

	KDASSERT(wl);

	/*
	 * XXX this needs to be made much more sophisticated.
	 * perhaps each wapbl_begin could reserve a specified
	 * number of buffers and bytes.
	 */
	mutex_enter(&wl->wl_mtx);
	lockcount = wl->wl_lock_count;
	maxphys = wl->wl_mount->mnt_maxphys;
	doflush = ((wl->wl_bufbytes + (lockcount * maxphys)) >
		   wl->wl_bufbytes_max / 2) ||
		  ((wl->wl_bufcount + (lockcount * 10)) >
		   wl->wl_bufcount_max / 2) ||
		  (wapbl_transaction_len(wl) > wl->wl_circ_size / 2) ||
		  (wl->wl_dealloccnt >= (wl->wl_dealloclim / 2));
	mutex_exit(&wl->wl_mtx);

	if (doflush) {
		WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		    ("force flush lockcnt=%d bufbytes=%zu "
		    "(max=%zu) bufcount=%zu (max=%zu) "
		    "dealloccnt %d (lim=%d)\n",
		    lockcount, wl->wl_bufbytes,
		    wl->wl_bufbytes_max, wl->wl_bufcount,
		    wl->wl_bufcount_max,
		    wl->wl_dealloccnt, wl->wl_dealloclim));
	}

	if (doflush) {
		int error = wapbl_flush(wl, 0);
		if (error)
			return error;
	}

	rw_enter(&wl->wl_rwlock, RW_READER);
	mutex_enter(&wl->wl_mtx);
	wl->wl_lock_count++;
	mutex_exit(&wl->wl_mtx);

#if defined(WAPBL_DEBUG_PRINT)
	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
	    ("wapbl_begin thread %d.%d with bufcount=%zu "
	    "bufbytes=%zu bcount=%zu at %s:%d\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount, file, line));
#endif

	return 0;
}

void
wapbl_end(struct wapbl *wl)
{

#if defined(WAPBL_DEBUG_PRINT)
	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
	    ("wapbl_end thread %d.%d with bufcount=%zu "
	    "bufbytes=%zu bcount=%zu\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount));
#endif

#ifdef DIAGNOSTIC
	size_t flushsize = wapbl_transaction_len(wl);
	if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
		/*
		 * XXX this could be handled more gracefully, perhaps place
		 * only a partial transaction in the log and allow the
		 * remaining to flush without the protection of the journal.
		 */
		panic("wapbl_end: current transaction too big to flush\n");
	}
#endif

	mutex_enter(&wl->wl_mtx);
	KASSERT(wl->wl_lock_count > 0);
	wl->wl_lock_count--;
	mutex_exit(&wl->wl_mtx);

	rw_exit(&wl->wl_rwlock);
}

void
wapbl_add_buf(struct wapbl *wl, struct buf * bp)
{

	KASSERT(bp->b_cflags & BC_BUSY);
	KASSERT(bp->b_vp);

	wapbl_jlock_assert(wl);

#if 0
	/*
	 * XXX this might be an issue for swapfiles.
	 * see uvm_swap.c:1702
	 *
	 * XXX2 why require it then? leap of semantics?
	 */
	KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
#endif

	mutex_enter(&wl->wl_mtx);
	if (bp->b_flags & B_LOCKED) {
		LIST_REMOVE(bp, b_wapbllist);
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER2,
		    ("wapbl_add_buf thread %d.%d re-adding buf %p "
		    "with %d bytes %d bcount\n",
		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
		    bp->b_bcount));
	} else {
		/* unlocked but dirty buffers shouldn't exist */
		KASSERT(!(bp->b_oflags & BO_DELWRI));
		wl->wl_bufbytes += bp->b_bufsize;
		wl->wl_bcount += bp->b_bcount;
		wl->wl_bufcount++;
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
		    ("wapbl_add_buf thread %d.%d adding buf %p "
		    "with %d bytes %d bcount\n",
		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
		    bp->b_bcount));
	}
	LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist);
	mutex_exit(&wl->wl_mtx);

	bp->b_flags |= B_LOCKED;
}

static void
wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp)
{

	KASSERT(mutex_owned(&wl->wl_mtx));
	KASSERT(bp->b_cflags & BC_BUSY);
	wapbl_jlock_assert(wl);

#if 0
	/*
	 * XXX this might be an issue for swapfiles.
	 * see uvm_swap.c:1725
	 *
	 * XXXdeux: see above
	 */
	KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
#endif
	KASSERT(bp->b_flags & B_LOCKED);

	WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
	    ("wapbl_remove_buf thread %d.%d removing buf %p with "
	    "%d bytes %d bcount\n",
	    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount));

	KASSERT(wl->wl_bufbytes >= bp->b_bufsize);
	wl->wl_bufbytes -= bp->b_bufsize;
	KASSERT(wl->wl_bcount >= bp->b_bcount);
	wl->wl_bcount -= bp->b_bcount;
	KASSERT(wl->wl_bufcount > 0);
	wl->wl_bufcount--;
	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
	LIST_REMOVE(bp, b_wapbllist);

	bp->b_flags &= ~B_LOCKED;
}

/* called from brelsel() in vfs_bio among other places */
void
wapbl_remove_buf(struct wapbl * wl, struct buf *bp)
{

	mutex_enter(&wl->wl_mtx);
	wapbl_remove_buf_locked(wl, bp);
	mutex_exit(&wl->wl_mtx);
}

void
wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt)
{

	KASSERT(bp->b_cflags & BC_BUSY);

	/*
	 * XXX: why does this depend on B_LOCKED?  otherwise the buf
	 * is not for a transaction?  if so, why is this called in the
	 * first place?
	 */
	if (bp->b_flags & B_LOCKED) {
		mutex_enter(&wl->wl_mtx);
		wl->wl_bufbytes += bp->b_bufsize - oldsz;
		wl->wl_bcount += bp->b_bcount - oldcnt;
		mutex_exit(&wl->wl_mtx);
	}
}

#endif /* _KERNEL */

/****************************************************************/
/* Some utility inlines */

/*
 * Advance the circular-log pointer 'old' by 'delta' bytes to its new
 * value, wrapping within the usable region [off, size + off).
 */
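/*
 * For example (illustrative values): size = 1000, off = 100, so the
 * usable range is [100, 1100).  Advancing old = 1050 by delta = 100
 * wraps around to 150; advancing old = 0 (empty log) by delta = 50
 * starts from off and also yields 150.
 */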
static inline off_t
wapbl_advance(size_t size, size_t off, off_t old, size_t delta)
{
	off_t new;

	/* Define acceptable ranges for inputs. */
	KASSERT(delta <= (size_t)size);
	KASSERT((old == 0) || ((size_t)old >= off));
	KASSERT(old < (off_t)(size + off));

	if ((old == 0) && (delta != 0))
		new = off + delta;
	else if ((old + delta) < (size + off))
		new = old + delta;
	else
		new = (old + delta) - size;

	/* Note some interesting axioms */
	KASSERT((delta != 0) || (new == old));
	KASSERT((delta == 0) || (new != 0));
	KASSERT((delta != (size)) || (new == old));

	/* Define acceptable ranges for output. */
	KASSERT((new == 0) || ((size_t)new >= off));
	KASSERT((size_t)new < (size + off));
	return new;
}

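/*
 * The modular expression below counts the bytes in [tail, head) on a
 * circle of circumference avail; writing it as
 * ((head + (avail - 1) - tail) % avail) + 1 makes the full case
 * (head == tail != 0) come out to avail rather than 0.  For example,
 * avail = 1000, tail = 100, head = 300 gives 200 bytes used.
 */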
static inline size_t
wapbl_space_used(size_t avail, off_t head, off_t tail)
{

	if (tail == 0) {
		KASSERT(head == 0);
		return 0;
	}
	return ((head + (avail - 1) - tail) % avail) + 1;
}

static inline size_t
wapbl_space_free(size_t avail, off_t head, off_t tail)
{

	return avail - wapbl_space_used(avail, head, tail);
}

static inline void
wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp,
		   off_t *tailp)
{
	off_t head = *headp;
	off_t tail = *tailp;

	KASSERT(delta <= wapbl_space_free(size, head, tail));
	head = wapbl_advance(size, off, head, delta);
	if ((tail == 0) && (head != 0))
		tail = off;
	*headp = head;
	*tailp = tail;
}

static inline void
wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp,
		   off_t *tailp)
{
	off_t head = *headp;
	off_t tail = *tailp;

	KASSERT(delta <= wapbl_space_used(size, head, tail));
	tail = wapbl_advance(size, off, tail, delta);
	if (head == tail) {
		head = tail = 0;
	}
	*headp = head;
	*tailp = tail;
}
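
/*
 * Note the symmetry of the two helpers above: advancing the head over
 * an empty log (tail == 0) pins the tail to the start of the region,
 * while advancing the tail all the way to the head resets both to the
 * empty state (0, 0).
 */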

#ifdef _KERNEL

/****************************************************************/

/*
 * Remove transactions whose buffers are completely flushed to disk.
 * Will block until at least minfree space is available.
 * Only intended to be called from inside wapbl_flush and therefore
 * does not protect against commit races with itself or with flush.
 */
static int
wapbl_truncate(struct wapbl *wl, size_t minfree, int waitonly)
{
	size_t delta;
	size_t avail;
	off_t head;
	off_t tail;
	int error = 0;

	KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes));
	KASSERT(rw_write_held(&wl->wl_rwlock));

	mutex_enter(&wl->wl_mtx);

	/*
	 * First check to see if we have to do a commit
	 * at all.
	 */
	avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail);
	if (minfree < avail) {
		mutex_exit(&wl->wl_mtx);
		return 0;
	}
	minfree -= avail;
	while ((wl->wl_error_count == 0) &&
	    (wl->wl_reclaimable_bytes < minfree)) {
		WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
		    ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd "
		    "minfree=%zd\n",
		    &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes,
		    minfree));

		cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx);
	}
	if (wl->wl_reclaimable_bytes < minfree) {
		KASSERT(wl->wl_error_count);
		/* XXX maybe get actual error from buffer instead someday? */
		error = EIO;
	}
	head = wl->wl_head;
	tail = wl->wl_tail;
	delta = wl->wl_reclaimable_bytes;

	/* If all of the entries are flushed, then be sure to keep
	 * the reserved bytes reserved.  Watch out for discarded transactions,
	 * which could leave more bytes reserved than are reclaimable.
	 */
	if (SIMPLEQ_EMPTY(&wl->wl_entries) &&
	    (delta >= wl->wl_reserved_bytes)) {
		delta -= wl->wl_reserved_bytes;
	}
	wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head,
	    &tail);
	KDASSERT(wl->wl_reserved_bytes <=
		wapbl_space_used(wl->wl_circ_size, head, tail));
	mutex_exit(&wl->wl_mtx);

	if (error)
		return error;

	if (waitonly)
		return 0;

	/*
	 * This is where head, tail and delta are unprotected
	 * from races against itself or flush.  This is ok since
	 * we only call this routine from inside flush itself.
	 *
	 * XXX: how can it race against itself when accessed only
	 * from behind the write-locked rwlock?
	 */
	error = wapbl_write_commit(wl, head, tail);
	if (error)
		return error;

	wl->wl_head = head;
	wl->wl_tail = tail;

	mutex_enter(&wl->wl_mtx);
	KASSERT(wl->wl_reclaimable_bytes >= delta);
	wl->wl_reclaimable_bytes -= delta;
	mutex_exit(&wl->wl_mtx);
	WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
	    ("wapbl_truncate thread %d.%d truncating %zu bytes\n",
	    curproc->p_pid, curlwp->l_lid, delta));

	return 0;
}

/****************************************************************/

void
wapbl_biodone(struct buf *bp)
{
	struct wapbl_entry *we = bp->b_private;
	struct wapbl *wl = we->we_wapbl;
#ifdef WAPBL_DEBUG_BUFBYTES
	const int bufsize = bp->b_bufsize;
#endif

	/*
	 * Handle possible flushing of buffers after log has been
	 * decommissioned.
	 */
	if (!wl) {
		KASSERT(we->we_bufcount > 0);
		we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
		KASSERT(we->we_unsynced_bufbytes >= bufsize);
		we->we_unsynced_bufbytes -= bufsize;
#endif

		if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
			KASSERT(we->we_unsynced_bufbytes == 0);
#endif
			pool_put(&wapbl_entry_pool, we);
		}

		brelse(bp, 0);
		return;
	}

#ifdef ohbother
	KDASSERT(bp->b_oflags & BO_DONE);
	KDASSERT(!(bp->b_oflags & BO_DELWRI));
	KDASSERT(bp->b_flags & B_ASYNC);
	KDASSERT(bp->b_cflags & BC_BUSY);
	KDASSERT(!(bp->b_flags & B_LOCKED));
	KDASSERT(!(bp->b_flags & B_READ));
	KDASSERT(!(bp->b_cflags & BC_INVAL));
	KDASSERT(!(bp->b_cflags & BC_NOCACHE));
#endif

	if (bp->b_error) {
#ifdef notyet /* Can't currently handle possible dirty buffer reuse */
		/*
		 * XXXpooka: interfaces not fully updated
		 * Note: this was not enabled in the original patch
		 * against netbsd4 either.  I don't know if comment
		 * above is true or not.
		 */

		/*
		 * If an error occurs, report the error and leave the
		 * buffer as a delayed write on the LRU queue.
		 * restarting the write would likely result in
		 * an error spinloop, so let it be done harmlessly
		 * by the syncer.
		 */
		bp->b_flags &= ~(B_DONE);
		simple_unlock(&bp->b_interlock);

		if (we->we_error == 0) {
			mutex_enter(&wl->wl_mtx);
			wl->wl_error_count++;
			mutex_exit(&wl->wl_mtx);
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
		we->we_error = bp->b_error;
		bp->b_error = 0;
		brelse(bp);
		return;
#else
		/* For now, just mark the log permanently errored out */

		mutex_enter(&wl->wl_mtx);
		if (wl->wl_error_count == 0) {
			wl->wl_error_count++;
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
		mutex_exit(&wl->wl_mtx);
#endif
	}

	/*
	 * Release the buffer here. wapbl_flush() may wait for the
	 * log to become empty and we better unbusy the buffer before
	 * wapbl_flush() returns.
	 */
	brelse(bp, 0);

	mutex_enter(&wl->wl_mtx);

	KASSERT(we->we_bufcount > 0);
	we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
	KASSERT(we->we_unsynced_bufbytes >= bufsize);
	we->we_unsynced_bufbytes -= bufsize;
	KASSERT(wl->wl_unsynced_bufbytes >= bufsize);
	wl->wl_unsynced_bufbytes -= bufsize;
#endif

	/*
	 * If the current transaction can be reclaimed, start
	 * at the beginning and reclaim any consecutive reclaimable
	 * transactions.  If we successfully reclaim anything,
	 * then wakeup anyone waiting for the reclaim.
	 */
	if (we->we_bufcount == 0) {
		size_t delta = 0;
		int errcnt = 0;
#ifdef WAPBL_DEBUG_BUFBYTES
		KDASSERT(we->we_unsynced_bufbytes == 0);
#endif
		/*
		 * clear any posted error, since the buffer it came from
		 * has been successfully flushed by now
		 */
		while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) &&
		    (we->we_bufcount == 0)) {
			delta += we->we_reclaimable_bytes;
			if (we->we_error)
				errcnt++;
			SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
			pool_put(&wapbl_entry_pool, we);
		}

		if (delta) {
			wl->wl_reclaimable_bytes += delta;
			KASSERT(wl->wl_error_count >= errcnt);
			wl->wl_error_count -= errcnt;
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
	}

	mutex_exit(&wl->wl_mtx);
}

/*
 * Write transactions to disk + start I/O for contents
 */
int
wapbl_flush(struct wapbl *wl, int waitfor)
{
	struct buf *bp;
	struct wapbl_entry *we;
	off_t off;
	off_t head;
	off_t tail;
	size_t delta = 0;
	size_t flushsize;
	size_t reserved;
	int error = 0;

	/*
	 * Do a quick check to see if a full flush can be skipped
	 * This assumes that the flush callback does not need to be called
	 * unless there are other outstanding bufs.
	 */
	if (!waitfor) {
		size_t nbufs;
		mutex_enter(&wl->wl_mtx);	/* XXX need mutex here to
						   protect the KASSERTS */
		nbufs = wl->wl_bufcount;
		KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
		KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
		mutex_exit(&wl->wl_mtx);
		if (nbufs == 0)
			return 0;
	}

	/*
	 * XXX we may consider using LK_UPGRADE here
	 * if we want to call flush from inside a transaction
	 */
	rw_enter(&wl->wl_rwlock, RW_WRITER);
	wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
	    wl->wl_dealloccnt);

	/*
	 * Now that we are fully locked and flushed,
	 * do another check for nothing to do.
	 */
	if (wl->wl_bufcount == 0) {
		goto out;
	}

#if 0
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush thread %d.%d flushing entries with "
	    "bufcount=%zu bufbytes=%zu\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes));
#endif

	/* Calculate amount of space needed to flush */
	flushsize = wapbl_transaction_len(wl);
	if (wapbl_verbose_commit) {
		struct timespec ts;
		getnanotime(&ts);
		printf("%s: %lld.%09ld this transaction = %zu bytes\n",
		    __func__, (long long)ts.tv_sec,
		    (long)ts.tv_nsec, flushsize);
	}

	if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
		/*
		 * XXX this could be handled more gracefully, perhaps place
		 * only a partial transaction in the log and allow the
		 * remaining to flush without the protection of the journal.
		 */
		panic("wapbl_flush: current transaction too big to flush\n");
	}

	error = wapbl_truncate(wl, flushsize, 0);
	if (error)
		goto out2;

	off = wl->wl_head;
	KASSERT((off == 0) || ((off >= wl->wl_circ_off) &&
	    (off < wl->wl_circ_off + wl->wl_circ_size)));
	error = wapbl_write_blocks(wl, &off);
	if (error)
		goto out2;
	error = wapbl_write_revocations(wl, &off);
	if (error)
		goto out2;
	error = wapbl_write_inodes(wl, &off);
	if (error)
		goto out2;

	reserved = 0;
	if (wl->wl_inohashcnt)
		reserved = wapbl_transaction_inodes_len(wl);

	head = wl->wl_head;
	tail = wl->wl_tail;

	wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize,
	    &head, &tail);
#ifdef WAPBL_DEBUG
	if (head != off) {
		panic("lost head! head=%"PRIdMAX" tail=%" PRIdMAX
		      " off=%"PRIdMAX" flush=%zu\n",
		      (intmax_t)head, (intmax_t)tail, (intmax_t)off,
		      flushsize);
	}
#else
	KASSERT(head == off);
#endif

	/* Opportunistically move the tail forward if we can */
	if (!wapbl_lazy_truncate) {
		mutex_enter(&wl->wl_mtx);
		delta = wl->wl_reclaimable_bytes;
		mutex_exit(&wl->wl_mtx);
		wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta,
		    &head, &tail);
	}

	error = wapbl_write_commit(wl, head, tail);
	if (error)
		goto out2;

	we = pool_get(&wapbl_entry_pool, PR_WAITOK);

#ifdef WAPBL_DEBUG_BUFBYTES
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
		 " unsynced=%zu"
		 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
		 "inodes=%d\n",
		 curproc->p_pid, curlwp->l_lid, flushsize, delta,
		 wapbl_space_used(wl->wl_circ_size, head, tail),
		 wl->wl_unsynced_bufbytes, wl->wl_bufcount,
		 wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt,
		 wl->wl_inohashcnt));
#else
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
		 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
		 "inodes=%d\n",
		 curproc->p_pid, curlwp->l_lid, flushsize, delta,
		 wapbl_space_used(wl->wl_circ_size, head, tail),
		 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
		 wl->wl_dealloccnt, wl->wl_inohashcnt));
#endif


	mutex_enter(&bufcache_lock);
	mutex_enter(&wl->wl_mtx);

	wl->wl_reserved_bytes = reserved;
	wl->wl_head = head;
	wl->wl_tail = tail;
	KASSERT(wl->wl_reclaimable_bytes >= delta);
	wl->wl_reclaimable_bytes -= delta;
	wl->wl_dealloccnt = 0;
#ifdef WAPBL_DEBUG_BUFBYTES
	wl->wl_unsynced_bufbytes += wl->wl_bufbytes;
#endif

	we->we_wapbl = wl;
	we->we_bufcount = wl->wl_bufcount;
#ifdef WAPBL_DEBUG_BUFBYTES
	we->we_unsynced_bufbytes = wl->wl_bufbytes;
#endif
	we->we_reclaimable_bytes = flushsize;
	we->we_error = 0;
	SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries);

	/*
	 * This flushes bufs in an order reversed from that in which they
	 * were queued.  It shouldn't matter, but if we care we could use
	 * a TAILQ instead.  XXX Note they will get put on the lru queue
	 * when they flush so we might actually want to change this to
	 * preserve order.
	 */
	while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
		if (bbusy(bp, 0, 0, &wl->wl_mtx)) {
			continue;
		}
		bp->b_iodone = wapbl_biodone;
		bp->b_private = we;
		bremfree(bp);
		wapbl_remove_buf_locked(wl, bp);
		mutex_exit(&wl->wl_mtx);
		mutex_exit(&bufcache_lock);
		bawrite(bp);
		mutex_enter(&bufcache_lock);
		mutex_enter(&wl->wl_mtx);
	}
	mutex_exit(&wl->wl_mtx);
	mutex_exit(&bufcache_lock);

#if 0
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush thread %d.%d done flushing entries...\n",
	    curproc->p_pid, curlwp->l_lid));
#endif

 out:

	/*
	 * If the waitfor flag is set, don't return until everything is
	 * fully flushed and the on disk log is empty.
	 */
	if (waitfor) {
		error = wapbl_truncate(wl, wl->wl_circ_size -
		    wl->wl_reserved_bytes, wapbl_lazy_truncate);
	}

 out2:
	if (error) {
		wl->wl_flush_abort(wl->wl_mount, wl->wl_deallocblks,
		    wl->wl_dealloclens, wl->wl_dealloccnt);
	}

#ifdef WAPBL_DEBUG_PRINT
	if (error) {
		pid_t pid = -1;
		lwpid_t lid = -1;
		if (curproc)
			pid = curproc->p_pid;
		if (curlwp)
			lid = curlwp->l_lid;
		mutex_enter(&wl->wl_mtx);
#ifdef WAPBL_DEBUG_BUFBYTES
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_flush: thread %d.%d aborted flush: "
		    "error = %d\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %d, reclaimable=%zu reserved=%zu "
		    "unsynced=%zu\n",
		    pid, lid, error, wl->wl_bufcount,
		    wl->wl_bufbytes, wl->wl_bcount,
		    wl->wl_dealloccnt, wl->wl_inohashcnt,
		    wl->wl_error_count, wl->wl_reclaimable_bytes,
		    wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_ERROR,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d, unsynced = %zu\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error, we->we_unsynced_bufbytes));
		}
#else
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_flush: thread %d.%d aborted flush: "
		    "error = %d\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %d, reclaimable=%zu reserved=%zu\n",
		    pid, lid, error, wl->wl_bufcount,
		    wl->wl_bufbytes, wl->wl_bcount,
		    wl->wl_dealloccnt, wl->wl_inohashcnt,
		    wl->wl_error_count, wl->wl_reclaimable_bytes,
		    wl->wl_reserved_bytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_ERROR,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d\n", we->we_bufcount,
			    we->we_reclaimable_bytes, we->we_error));
		}
#endif
		mutex_exit(&wl->wl_mtx);
	}
#endif

	rw_exit(&wl->wl_rwlock);
	return error;
}

/****************************************************************/

void
wapbl_jlock_assert(struct wapbl *wl)
{

	KASSERT(rw_lock_held(&wl->wl_rwlock));
}

void
wapbl_junlock_assert(struct wapbl *wl)
{

	KASSERT(!rw_write_held(&wl->wl_rwlock));
}

/****************************************************************/

/* locks missing */
void
wapbl_print(struct wapbl *wl,
		int full,
		void (*pr)(const char *, ...))
{
	struct buf *bp;
	struct wapbl_entry *we;
	(*pr)("wapbl %p", wl);
	(*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n",
	      wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn);
	(*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n",
	      wl->wl_circ_size, wl->wl_circ_off,
	      (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail);
	(*pr)("fs_dev_bshift = %d, log_dev_bshift = %d\n",
	      wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift);
#ifdef WAPBL_DEBUG_BUFBYTES
	(*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
	      "reserved = %zu errcnt = %d unsynced = %zu\n",
	      wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
	      wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
	      wl->wl_error_count, wl->wl_unsynced_bufbytes);
#else
	(*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
	      "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes,
	      wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
	      wl->wl_error_count);
#endif
	(*pr)("\tdealloccnt = %d, dealloclim = %d\n",
	      wl->wl_dealloccnt, wl->wl_dealloclim);
	(*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n",
	      wl->wl_inohashcnt, wl->wl_inohashmask);
	(*pr)("entries:\n");
	SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
#ifdef WAPBL_DEBUG_BUFBYTES
		(*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, "
		      "unsynced = %zu\n",
		      we->we_bufcount, we->we_reclaimable_bytes,
		      we->we_error, we->we_unsynced_bufbytes);
#else
		(*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n",
		      we->we_bufcount, we->we_reclaimable_bytes, we->we_error);
#endif
	}
	if (full) {
		int cnt = 0;
		(*pr)("bufs =");
		LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) {
			if (!LIST_NEXT(bp, b_wapbllist)) {
				(*pr)(" %p", bp);
			} else if ((++cnt % 6) == 0) {
				(*pr)(" %p,\n\t", bp);
			} else {
				(*pr)(" %p,", bp);
			}
		}
		(*pr)("\n");

		(*pr)("dealloced blks = ");
		{
			int i;
			cnt = 0;
			for (i = 0; i < wl->wl_dealloccnt; i++) {
				(*pr)(" %"PRId64":%d,",
				      wl->wl_deallocblks[i],
				      wl->wl_dealloclens[i]);
				if ((++cnt % 4) == 0) {
					(*pr)("\n\t");
				}
			}
		}
		(*pr)("\n");

		(*pr)("registered inodes = ");
		{
			int i;
			cnt = 0;
			for (i = 0; i <= wl->wl_inohashmask; i++) {
				struct wapbl_ino_head *wih;
				struct wapbl_ino *wi;

				wih = &wl->wl_inohash[i];
				LIST_FOREACH(wi, wih, wi_hash) {
					if (wi->wi_ino == 0)
						continue;
					(*pr)(" %"PRId32"/0%06"PRIo32",",
					    wi->wi_ino, wi->wi_mode);
					if ((++cnt % 4) == 0) {
						(*pr)("\n\t");
					}
				}
			}
			(*pr)("\n");
		}
	}
}

#if defined(WAPBL_DEBUG) || defined(DDB)
void
wapbl_dump(struct wapbl *wl)
{
#if defined(WAPBL_DEBUG)
	if (!wl)
		wl = wapbl_debug_wl;
#endif
	if (!wl)
		return;
	wapbl_print(wl, 1, printf);
}
#endif

/****************************************************************/

void
wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len)
{

	wapbl_jlock_assert(wl);

	mutex_enter(&wl->wl_mtx);
	/* XXX should eventually instead tie this into resource estimation */
	/*
	 * XXX this panic needs locking/mutex analysis and the
	 * ability to cope with the failure.
	 */
	/* XXX this XXX doesn't have enough XXX */
	if (__predict_false(wl->wl_dealloccnt >= wl->wl_dealloclim))
		panic("wapbl_register_deallocation: out of resources");

	wl->wl_deallocblks[wl->wl_dealloccnt] = blk;
	wl->wl_dealloclens[wl->wl_dealloccnt] = len;
	wl->wl_dealloccnt++;
	WAPBL_PRINTF(WAPBL_PRINT_ALLOC,
	    ("wapbl_register_deallocation: blk=%"PRId64" len=%d\n", blk, len));
	mutex_exit(&wl->wl_mtx);
}

/****************************************************************/

static void
wapbl_inodetrk_init(struct wapbl *wl, u_int size)
{

	wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask);
	if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) {
		pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0,
		    "wapblinopl", &pool_allocator_nointr, IPL_NONE);
	}
}

static void
wapbl_inodetrk_free(struct wapbl *wl)
{

	/* XXX this KASSERT needs locking/mutex analysis */
	KASSERT(wl->wl_inohashcnt == 0);
	hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask);
	if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) {
		pool_destroy(&wapbl_ino_pool);
	}
}

static struct wapbl_ino *
wapbl_inodetrk_get(struct wapbl *wl, ino_t ino)
{
	struct wapbl_ino_head *wih;
	struct wapbl_ino *wi;

	KASSERT(mutex_owned(&wl->wl_mtx));

	wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
	LIST_FOREACH(wi, wih, wi_hash) {
		if (ino == wi->wi_ino)
			return wi;
	}
	return 0;
}

void
wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode)
{
	struct wapbl_ino_head *wih;
	struct wapbl_ino *wi;

	wi = pool_get(&wapbl_ino_pool, PR_WAITOK);

	mutex_enter(&wl->wl_mtx);
	if (wapbl_inodetrk_get(wl, ino) == NULL) {
		wi->wi_ino = ino;
		wi->wi_mode = mode;
		wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
		LIST_INSERT_HEAD(wih, wi, wi_hash);
		wl->wl_inohashcnt++;
		WAPBL_PRINTF(WAPBL_PRINT_INODE,
		    ("wapbl_register_inode: ino=%"PRId64"\n", ino));
		mutex_exit(&wl->wl_mtx);
	} else {
		mutex_exit(&wl->wl_mtx);
		pool_put(&wapbl_ino_pool, wi);
	}
}

void
wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode)
{
	struct wapbl_ino *wi;

	mutex_enter(&wl->wl_mtx);
	wi = wapbl_inodetrk_get(wl, ino);
	if (wi) {
		WAPBL_PRINTF(WAPBL_PRINT_INODE,
		    ("wapbl_unregister_inode: ino=%"PRId64"\n", ino));
		KASSERT(wl->wl_inohashcnt > 0);
		wl->wl_inohashcnt--;
		LIST_REMOVE(wi, wi_hash);
		mutex_exit(&wl->wl_mtx);

		pool_put(&wapbl_ino_pool, wi);
	} else {
		mutex_exit(&wl->wl_mtx);
	}
}

/****************************************************************/

static inline size_t
wapbl_transaction_inodes_len(struct wapbl *wl)
{
	int blocklen = 1<<wl->wl_log_dev_bshift;
	int iph;

	/* Calculate number of inodes described in an inodelist header */
	iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
	    sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);

	KASSERT(iph > 0);

	return MAX(1, howmany(wl->wl_inohashcnt, iph)) * blocklen;
}
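
/*
 * Example with assumed sizes (the real ones come from the on-disk
 * structures in wapbl.h): with a 512-byte log block, a 16-byte
 * inodelist header and 8-byte entries, iph == 62, so 100 tracked
 * inodes need howmany(100, 62) == 2 blocks; MAX(1, ...) above
 * guarantees at least one block even when wl_inohashcnt == 0.
 */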

/* Calculate amount of space a transaction will take on disk */
static size_t
wapbl_transaction_len(struct wapbl *wl)
{
	int blocklen = 1<<wl->wl_log_dev_bshift;
	size_t len;
	int bph;

	/* Calculate number of blocks described in a blocklist header */
	bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
	    sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);

	KASSERT(bph > 0);

	len = wl->wl_bcount;
	len += howmany(wl->wl_bufcount, bph) * blocklen;
	len += howmany(wl->wl_dealloccnt, bph) * blocklen;
	len += wapbl_transaction_inodes_len(wl);

	return len;
}
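
/*
 * Rough worked example (all sizes assumed for illustration): if a
 * blocklist header describes 30 blocks (bph == 30) with a 512-byte log
 * block, a transaction carrying 100 buffers and 10 deallocations needs
 * the buffers' total bcount of data, plus howmany(100, 30) == 4
 * blocklist blocks, one more for the deallocations, plus the inodelist
 * blocks computed above.
 */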
1917
1918 /*
1919  * wapbl_cache_sync: flush the disk write cache via DIOCCACHESYNC;
1920  * a no-op when the wapbl_flush_disk_cache knob is cleared.
 */
1921 static int
1922 wapbl_cache_sync(struct wapbl *wl, const char *msg)
1923 {
1924 const bool verbose = wapbl_verbose_commit >= 2;
1925 struct bintime start_time;
1926 int force = 1;
1927 int error;
1928
1929 if (!wapbl_flush_disk_cache) {
1930 return 0;
1931 }
1932 if (verbose) {
1933 bintime(&start_time);
1934 }
1935 error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force,
1936 FWRITE, FSCRED);
1937 if (error) {
1938 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1939 ("wapbl_cache_sync: DIOCCACHESYNC on dev 0x%x "
1940 "returned %d\n", wl->wl_devvp->v_rdev, error));
1941 }
1942 if (verbose) {
1943 struct bintime d;
1944 struct timespec ts;
1945
1946 bintime(&d);
1947 bintime_sub(&d, &start_time);
1948 bintime2timespec(&d, &ts);
1949 printf("wapbl_cache_sync: %s: dev 0x%jx %ju.%09lu\n",
1950 msg, (uintmax_t)wl->wl_devvp->v_rdev,
1951 (uintmax_t)ts.tv_sec, ts.tv_nsec);
1952 }
1953 return error;
1954 }
1955
1956 /*
1957  * Perform the commit operation.
1958  *
1959  * Note that incrementing the generation number must be protected
1960  * against racing with other invocations of wapbl_write_commit.
1961  * This is ok since this routine is only ever invoked from
1962  * wapbl_flush.
1963  */
1964 static int
1965 wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail)
1966 {
1967 struct wapbl_wc_header *wc = wl->wl_wc_header;
1968 struct timespec ts;
1969 int error;
1970 daddr_t pbn;
1971
1972 	/*
1973 	 * Flush the disk cache so the blocks we've written reach stable
1974 	 * storage before the commit header does.
1975 	 *
1976 	 * XXX A checksum should be calculated here; this is a stopgap.
1977 	 */
1978 wapbl_cache_sync(wl, "1");
1979
1980 wc->wc_head = head;
1981 wc->wc_tail = tail;
1982 wc->wc_checksum = 0;
1983 wc->wc_version = 1;
1984 getnanotime(&ts);
1985 wc->wc_time = ts.tv_sec;
1986 wc->wc_timensec = ts.tv_nsec;
1987
1988 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
1989 	    ("wapbl_write_commit: head = %"PRIdMAX" tail = %"PRIdMAX"\n",
1990 (intmax_t)head, (intmax_t)tail));
1991
1992 	/*
1993 	 * Write the commit header.
1994 	 *
1995 	 * XXX If the generation number is about to roll over, the second
1996 	 * commit header should first be zeroed before writing both headers.
1997 	 */
1998
1999 pbn = wl->wl_logpbn + (wc->wc_generation % 2);
2000 #ifdef _KERNEL
2001 pbn = btodb(pbn << wc->wc_log_dev_bshift);
2002 #endif
2003 error = wapbl_write(wc, wc->wc_len, wl->wl_devvp, pbn);
2004 if (error)
2005 return error;
2006
2007 	/*
2008 	 * Flush the disk cache to ensure that the commit header actually
2009 	 * reaches stable storage before the metadata blocks do.
2010 	 */
2011 wapbl_cache_sync(wl, "2");
2012
2013 	/*
2014 	 * If the generation number was zero, write the commit out a second
2015 	 * time; this handles initialization and generation number rollover.
2016 	 */
2017 if (wc->wc_generation++ == 0) {
2018 error = wapbl_write_commit(wl, head, tail);
2019 		/*
2020 		 * This panic could be removed if we did the zeroing
2021 		 * mentioned above and were certain to roll back the
2022 		 * generation number on failure.
2023 		 */
2024 if (error)
2025 panic("wapbl_write_commit: error writing duplicate "
2026 "log header: %d\n", error);
2027 }
2028 return 0;
2029 }
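
/*
 * Note on the layout (summarizing the code above): the two commit
 * header slots live at wl_logpbn + 0 and wl_logpbn + 1 and alternate
 * by generation parity; replay selects whichever slot holds the
 * larger generation number (see wapbl_replay_start).
 */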
2030
2031 /* Write the transaction's buffers to the log; updates *offp to the new offset */
2032 static int
2033 wapbl_write_blocks(struct wapbl *wl, off_t *offp)
2034 {
2035 struct wapbl_wc_blocklist *wc =
2036 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
2037 int blocklen = 1<<wl->wl_log_dev_bshift;
2038 int bph;
2039 struct buf *bp;
2040 off_t off = *offp;
2041 int error;
2042 size_t padding;
2043
2044 KASSERT(rw_write_held(&wl->wl_rwlock));
2045
2046 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
2047 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
2048
2049 bp = LIST_FIRST(&wl->wl_bufs);
2050
2051 while (bp) {
2052 int cnt;
2053 struct buf *obp = bp;
2054
2055 KASSERT(bp->b_flags & B_LOCKED);
2056
2057 wc->wc_type = WAPBL_WC_BLOCKS;
2058 wc->wc_len = blocklen;
2059 wc->wc_blkcount = 0;
2060 while (bp && (wc->wc_blkcount < bph)) {
2061 /*
2062 * Make sure all the physical block numbers are up to
2063 * date. If this is not always true on a given
2064 * filesystem, then VOP_BMAP must be called. We
2065 			 * could call VOP_BMAP here, or else in the filesystem-
2066 			 * specific flush callback, although neither of those
2067 			 * solutions allows us to take the vnode lock.  If a
2068 * filesystem requires that we must take the vnode lock
2069 * to call VOP_BMAP, then we can probably do it in
2070 * bwrite when the vnode lock should already be held
2071 * by the invoking code.
2072 */
2073 KASSERT((bp->b_vp->v_type == VBLK) ||
2074 (bp->b_blkno != bp->b_lblkno));
2075 KASSERT(bp->b_blkno > 0);
2076
2077 wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno;
2078 wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount;
2079 wc->wc_len += bp->b_bcount;
2080 wc->wc_blkcount++;
2081 bp = LIST_NEXT(bp, b_wapbllist);
2082 }
2083 if (wc->wc_len % blocklen != 0) {
2084 padding = blocklen - wc->wc_len % blocklen;
2085 wc->wc_len += padding;
2086 } else {
2087 padding = 0;
2088 }
2089
2090 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2091 ("wapbl_write_blocks: len = %u (padding %zu) off = %"PRIdMAX"\n",
2092 wc->wc_len, padding, (intmax_t)off));
2093
2094 error = wapbl_circ_write(wl, wc, blocklen, &off);
2095 if (error)
2096 return error;
2097 bp = obp;
2098 cnt = 0;
2099 while (bp && (cnt++ < bph)) {
2100 error = wapbl_circ_write(wl, bp->b_data,
2101 bp->b_bcount, &off);
2102 if (error)
2103 return error;
2104 bp = LIST_NEXT(bp, b_wapbllist);
2105 }
2106 if (padding) {
2107 void *zero;
2108
2109 zero = wapbl_alloc(padding);
2110 memset(zero, 0, padding);
2111 error = wapbl_circ_write(wl, zero, padding, &off);
2112 wapbl_free(zero, padding);
2113 if (error)
2114 return error;
2115 }
2116 }
2117 *offp = off;
2118 return 0;
2119 }
2120
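/*
 * wapbl_write_revocations: write out the pending deallocations as
 * revocation records, so that replay will not rewrite blocks that
 * were freed later in the transaction.
 */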
2121 static int
2122 wapbl_write_revocations(struct wapbl *wl, off_t *offp)
2123 {
2124 struct wapbl_wc_blocklist *wc =
2125 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
2126 int i;
2127 int blocklen = 1<<wl->wl_log_dev_bshift;
2128 int bph;
2129 off_t off = *offp;
2130 int error;
2131
2132 if (wl->wl_dealloccnt == 0)
2133 return 0;
2134
2135 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
2136 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
2137
2138 i = 0;
2139 while (i < wl->wl_dealloccnt) {
2140 wc->wc_type = WAPBL_WC_REVOCATIONS;
2141 wc->wc_len = blocklen;
2142 wc->wc_blkcount = 0;
2143 while ((i < wl->wl_dealloccnt) && (wc->wc_blkcount < bph)) {
2144 wc->wc_blocks[wc->wc_blkcount].wc_daddr =
2145 wl->wl_deallocblks[i];
2146 wc->wc_blocks[wc->wc_blkcount].wc_dlen =
2147 wl->wl_dealloclens[i];
2148 wc->wc_blkcount++;
2149 i++;
2150 }
2151 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2152 ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n",
2153 wc->wc_len, (intmax_t)off));
2154 error = wapbl_circ_write(wl, wc, blocklen, &off);
2155 if (error)
2156 return error;
2157 }
2158 *offp = off;
2159 return 0;
2160 }
2161
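/*
 * wapbl_write_inodes: write the table of registered inodes.  At least
 * one record is always emitted, and the first record carries wc_clear
 * so that replay discards any inode list from earlier commits.
 */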
2162 static int
2163 wapbl_write_inodes(struct wapbl *wl, off_t *offp)
2164 {
2165 struct wapbl_wc_inodelist *wc =
2166 (struct wapbl_wc_inodelist *)wl->wl_wc_scratch;
2167 int i;
2168 int blocklen = 1 << wl->wl_log_dev_bshift;
2169 off_t off = *offp;
2170 int error;
2171
2172 struct wapbl_ino_head *wih;
2173 struct wapbl_ino *wi;
2174 int iph;
2175
2176 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
2177 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
2178
2179 i = 0;
2180 wih = &wl->wl_inohash[0];
2181 	wi = NULL;
2182 do {
2183 wc->wc_type = WAPBL_WC_INODES;
2184 wc->wc_len = blocklen;
2185 wc->wc_inocnt = 0;
2186 wc->wc_clear = (i == 0);
2187 while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) {
2188 while (!wi) {
2189 KASSERT((wih - &wl->wl_inohash[0])
2190 <= wl->wl_inohashmask);
2191 wi = LIST_FIRST(wih++);
2192 }
2193 wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino;
2194 wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode;
2195 wc->wc_inocnt++;
2196 i++;
2197 wi = LIST_NEXT(wi, wi_hash);
2198 }
2199 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2200 ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n",
2201 wc->wc_len, (intmax_t)off));
2202 error = wapbl_circ_write(wl, wc, blocklen, &off);
2203 if (error)
2204 return error;
2205 } while (i < wl->wl_inohashcnt);
2206
2207 *offp = off;
2208 return 0;
2209 }
2210
2211 #endif /* _KERNEL */
2212
2213 /****************************************************************/
2214
2215 struct wapbl_blk {
2216 LIST_ENTRY(wapbl_blk) wb_hash;
2217 daddr_t wb_blk;
2218 off_t wb_off; /* Offset of this block in the log */
2219 };
2220 #define WAPBL_BLKPOOL_MIN 83
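
/*
 * The replay block hash maps a physical disk block to the offset of
 * its most recent copy in the log; wapbl_blkhash_ins overwrites the
 * stored offset on duplicates, so only the newest copy is replayed.
 */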
2221
2222 static void
2223 wapbl_blkhash_init(struct wapbl_replay *wr, u_int size)
2224 {
2225 if (size < WAPBL_BLKPOOL_MIN)
2226 size = WAPBL_BLKPOOL_MIN;
2227 	KASSERT(wr->wr_blkhash == NULL);
2228 #ifdef _KERNEL
2229 wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask);
2230 #else /* ! _KERNEL */
2231 /* Manually implement hashinit */
2232 {
2233 unsigned long i, hashsize;
2234 for (hashsize = 1; hashsize < size; hashsize <<= 1)
2235 continue;
2236 wr->wr_blkhash = wapbl_alloc(hashsize * sizeof(*wr->wr_blkhash));
2237 for (i = 0; i < hashsize; i++)
2238 LIST_INIT(&wr->wr_blkhash[i]);
2239 wr->wr_blkhashmask = hashsize - 1;
2240 }
2241 #endif /* ! _KERNEL */
2242 }
2243
2244 static void
2245 wapbl_blkhash_free(struct wapbl_replay *wr)
2246 {
2247 KASSERT(wr->wr_blkhashcnt == 0);
2248 #ifdef _KERNEL
2249 hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask);
2250 #else /* ! _KERNEL */
2251 wapbl_free(wr->wr_blkhash,
2252 (wr->wr_blkhashmask + 1) * sizeof(*wr->wr_blkhash));
2253 #endif /* ! _KERNEL */
2254 }
2255
2256 static struct wapbl_blk *
2257 wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk)
2258 {
2259 struct wapbl_blk_head *wbh;
2260 struct wapbl_blk *wb;
2261 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2262 LIST_FOREACH(wb, wbh, wb_hash) {
2263 if (blk == wb->wb_blk)
2264 return wb;
2265 }
2266 	return NULL;
2267 }
2268
2269 static void
2270 wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off)
2271 {
2272 struct wapbl_blk_head *wbh;
2273 struct wapbl_blk *wb;
2274 wb = wapbl_blkhash_get(wr, blk);
2275 if (wb) {
2276 KASSERT(wb->wb_blk == blk);
2277 wb->wb_off = off;
2278 } else {
2279 wb = wapbl_alloc(sizeof(*wb));
2280 wb->wb_blk = blk;
2281 wb->wb_off = off;
2282 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2283 LIST_INSERT_HEAD(wbh, wb, wb_hash);
2284 wr->wr_blkhashcnt++;
2285 }
2286 }
2287
2288 static void
2289 wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk)
2290 {
2291 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2292 if (wb) {
2293 KASSERT(wr->wr_blkhashcnt > 0);
2294 wr->wr_blkhashcnt--;
2295 LIST_REMOVE(wb, wb_hash);
2296 wapbl_free(wb, sizeof(*wb));
2297 }
2298 }
2299
2300 static void
2301 wapbl_blkhash_clear(struct wapbl_replay *wr)
2302 {
2303 unsigned long i;
2304 for (i = 0; i <= wr->wr_blkhashmask; i++) {
2305 struct wapbl_blk *wb;
2306
2307 while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) {
2308 KASSERT(wr->wr_blkhashcnt > 0);
2309 wr->wr_blkhashcnt--;
2310 LIST_REMOVE(wb, wb_hash);
2311 wapbl_free(wb, sizeof(*wb));
2312 }
2313 }
2314 KASSERT(wr->wr_blkhashcnt == 0);
2315 }
2316
2317 /****************************************************************/
2318
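/*
 * wapbl_circ_read/wapbl_circ_advance walk the circular data area of
 * the log, which occupies [wr_circ_off, wr_circ_off + wr_circ_size);
 * a read that would run past the end wraps back to wr_circ_off.
 */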
2319 static int
2320 wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp)
2321 {
2322 size_t slen;
2323 off_t off = *offp;
2324 int error;
2325 daddr_t pbn;
2326
2327 KASSERT(((len >> wr->wr_log_dev_bshift) <<
2328 wr->wr_log_dev_bshift) == len);
2329
2330 if (off < wr->wr_circ_off)
2331 off = wr->wr_circ_off;
2332 slen = wr->wr_circ_off + wr->wr_circ_size - off;
2333 if (slen < len) {
2334 pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
2335 #ifdef _KERNEL
2336 pbn = btodb(pbn << wr->wr_log_dev_bshift);
2337 #endif
2338 error = wapbl_read(data, slen, wr->wr_devvp, pbn);
2339 if (error)
2340 return error;
2341 data = (uint8_t *)data + slen;
2342 len -= slen;
2343 off = wr->wr_circ_off;
2344 }
2345 pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
2346 #ifdef _KERNEL
2347 pbn = btodb(pbn << wr->wr_log_dev_bshift);
2348 #endif
2349 error = wapbl_read(data, len, wr->wr_devvp, pbn);
2350 if (error)
2351 return error;
2352 off += len;
2353 if (off >= wr->wr_circ_off + wr->wr_circ_size)
2354 off = wr->wr_circ_off;
2355 *offp = off;
2356 return 0;
2357 }
2358
2359 static void
2360 wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp)
2361 {
2362 size_t slen;
2363 off_t off = *offp;
2364
2365 KASSERT(((len >> wr->wr_log_dev_bshift) <<
2366 wr->wr_log_dev_bshift) == len);
2367
2368 if (off < wr->wr_circ_off)
2369 off = wr->wr_circ_off;
2370 slen = wr->wr_circ_off + wr->wr_circ_size - off;
2371 if (slen < len) {
2372 len -= slen;
2373 off = wr->wr_circ_off;
2374 }
2375 off += len;
2376 if (off >= wr->wr_circ_off + wr->wr_circ_size)
2377 off = wr->wr_circ_off;
2378 *offp = off;
2379 }
2380
2381 /****************************************************************/
2382
2383 int
2384 wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp,
2385 daddr_t off, size_t count, size_t blksize)
2386 {
2387 struct wapbl_replay *wr;
2388 int error;
2389 struct vnode *devvp;
2390 daddr_t logpbn;
2391 uint8_t *scratch;
2392 struct wapbl_wc_header *wch;
2393 struct wapbl_wc_header *wch2;
2394 /* Use this until we read the actual log header */
2395 int log_dev_bshift = ilog2(blksize);
2396 size_t used;
2397 daddr_t pbn;
2398
2399 WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
2400 ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n",
2401 vp, off, count, blksize));
2402
2403 if (off < 0)
2404 return EINVAL;
2405
2406 if (blksize < DEV_BSIZE)
2407 return EINVAL;
2408 if (blksize % DEV_BSIZE)
2409 return EINVAL;
2410
2411 #ifdef _KERNEL
2412 #if 0
2413 /* XXX vp->v_size isn't reliably set for VBLK devices,
2414 * especially root. However, we might still want to verify
2415 * that the full load is readable */
2416 if ((off + count) * blksize > vp->v_size)
2417 return EINVAL;
2418 #endif
2419 if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) {
2420 return error;
2421 }
2422 #else /* ! _KERNEL */
2423 devvp = vp;
2424 logpbn = off;
2425 #endif /* ! _KERNEL */
2426
2427 scratch = wapbl_alloc(MAXBSIZE);
2428
2429 pbn = logpbn;
2430 #ifdef _KERNEL
2431 pbn = btodb(pbn << log_dev_bshift);
2432 #endif
2433 error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, pbn);
2434 if (error)
2435 goto errout;
2436
2437 wch = (struct wapbl_wc_header *)scratch;
2438 wch2 =
2439 (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift));
2440 /* XXX verify checksums and magic numbers */
2441 if (wch->wc_type != WAPBL_WC_HEADER) {
2442 printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type);
2443 error = EFTYPE;
2444 goto errout;
2445 }
2446
2447 if (wch2->wc_generation > wch->wc_generation)
2448 wch = wch2;
2449
2450 wr = wapbl_calloc(1, sizeof(*wr));
2451
2452 wr->wr_logvp = vp;
2453 wr->wr_devvp = devvp;
2454 wr->wr_logpbn = logpbn;
2455
2456 wr->wr_scratch = scratch;
2457
2458 wr->wr_log_dev_bshift = wch->wc_log_dev_bshift;
2459 wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift;
2460 wr->wr_circ_off = wch->wc_circ_off;
2461 wr->wr_circ_size = wch->wc_circ_size;
2462 wr->wr_generation = wch->wc_generation;
2463
2464 used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail);
2465
2466 WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
2467 ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64
2468 " len=%"PRId64" used=%zu\n",
2469 wch->wc_head, wch->wc_tail, wch->wc_circ_off,
2470 wch->wc_circ_size, used));
2471
2472 wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift));
2473
2474 error = wapbl_replay_process(wr, wch->wc_head, wch->wc_tail);
2475 if (error) {
2476 wapbl_replay_stop(wr);
2477 wapbl_replay_free(wr);
2478 return error;
2479 }
2480
2481 *wrp = wr;
2482 return 0;
2483
2484 errout:
2485 wapbl_free(scratch, MAXBSIZE);
2486 return error;
2487 }
2488
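#if 0
/*
 * Editor's sketch (not part of the original source): a minimal,
 * hypothetical mount-time caller of the replay interface.  The
 * function name and arguments are invented for illustration; real
 * callers also keep the replay open to answer reads via
 * wapbl_replay_can_read/wapbl_replay_read before stopping it.
 */
static int
example_mount_replay(struct vnode *devvp, struct vnode *fsdevvp,
    daddr_t logstart, size_t logcount, size_t blksize)
{
	struct wapbl_replay *wr;
	int error;

	/* Read both commit headers and build the replay block hash. */
	error = wapbl_replay_start(&wr, devvp, logstart, logcount, blksize);
	if (error)
		return error;

	/* Copy the journalled blocks to their final locations. */
	error = wapbl_replay_write(wr, fsdevvp);

	/* Release the scratch buffer and block hash, then the handle. */
	wapbl_replay_stop(wr);
	wapbl_replay_free(wr);
	return error;
}
#endif
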
2489 void
2490 wapbl_replay_stop(struct wapbl_replay *wr)
2491 {
2492
2493 if (!wapbl_replay_isopen(wr))
2494 return;
2495
2496 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n"));
2497
2498 wapbl_free(wr->wr_scratch, MAXBSIZE);
2499 wr->wr_scratch = NULL;
2500
2501 wr->wr_logvp = NULL;
2502
2503 wapbl_blkhash_clear(wr);
2504 wapbl_blkhash_free(wr);
2505 }
2506
2507 void
2508 wapbl_replay_free(struct wapbl_replay *wr)
2509 {
2510
2511 KDASSERT(!wapbl_replay_isopen(wr));
2512
2513 if (wr->wr_inodes)
2514 wapbl_free(wr->wr_inodes,
2515 wr->wr_inodescnt * sizeof(wr->wr_inodes[0]));
2516 wapbl_free(wr, sizeof(*wr));
2517 }
2518
2519 #ifdef _KERNEL
2520 int
2521 wapbl_replay_isopen1(struct wapbl_replay *wr)
2522 {
2523
2524 return wapbl_replay_isopen(wr);
2525 }
2526 #endif
2527
2528 static void
2529 wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp)
2530 {
2531 struct wapbl_wc_blocklist *wc =
2532 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2533 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2534 int i, j, n;
2535
2536 for (i = 0; i < wc->wc_blkcount; i++) {
2537 /*
2538 * Enter each physical block into the hashtable independently.
2539 */
2540 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
2541 for (j = 0; j < n; j++) {
2542 wapbl_blkhash_ins(wr, wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen),
2543 *offp);
2544 wapbl_circ_advance(wr, fsblklen, offp);
2545 }
2546 }
2547 }
2548
2549 static void
2550 wapbl_replay_process_revocations(struct wapbl_replay *wr)
2551 {
2552 struct wapbl_wc_blocklist *wc =
2553 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2554 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2555 int i, j, n;
2556
2557 for (i = 0; i < wc->wc_blkcount; i++) {
2558 /*
2559 * Remove any blocks found from the hashtable.
2560 */
2561 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
2562 for (j = 0; j < n; j++)
2563 wapbl_blkhash_rem(wr, wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
2564 }
2565 }
2566
2567 static void
2568 wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff, off_t newoff)
2569 {
2570 struct wapbl_wc_inodelist *wc =
2571 (struct wapbl_wc_inodelist *)wr->wr_scratch;
2572 void *new_inodes;
2573 const size_t oldsize = wr->wr_inodescnt * sizeof(wr->wr_inodes[0]);
2574
2575 KASSERT(sizeof(wr->wr_inodes[0]) == sizeof(wc->wc_inodes[0]));
2576
2577 	/*
2578 	 * Keep track of where we found this so the location won't be
2579 	 * overwritten.
2580 	 */
2581 if (wc->wc_clear) {
2582 wr->wr_inodestail = oldoff;
2583 wr->wr_inodescnt = 0;
2584 if (wr->wr_inodes != NULL) {
2585 wapbl_free(wr->wr_inodes, oldsize);
2586 wr->wr_inodes = NULL;
2587 }
2588 }
2589 wr->wr_inodeshead = newoff;
2590 if (wc->wc_inocnt == 0)
2591 return;
2592
2593 new_inodes = wapbl_alloc((wr->wr_inodescnt + wc->wc_inocnt) *
2594 sizeof(wr->wr_inodes[0]));
2595 if (wr->wr_inodes != NULL) {
2596 memcpy(new_inodes, wr->wr_inodes, oldsize);
2597 wapbl_free(wr->wr_inodes, oldsize);
2598 }
2599 wr->wr_inodes = new_inodes;
2600 memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes,
2601 wc->wc_inocnt * sizeof(wr->wr_inodes[0]));
2602 wr->wr_inodescnt += wc->wc_inocnt;
2603 }
2604
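/*
 * wapbl_replay_process: scan the log from tail to head, rebuilding the
 * block hash and inode list by dispatching on record type, and check
 * that each record's wc_len advances the offset exactly as far as the
 * reads consumed.
 */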
2605 static int
2606 wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail)
2607 {
2608 off_t off;
2609 int error;
2610
2611 int logblklen = 1 << wr->wr_log_dev_bshift;
2612
2613 wapbl_blkhash_clear(wr);
2614
2615 off = tail;
2616 while (off != head) {
2617 struct wapbl_wc_null *wcn;
2618 off_t saveoff = off;
2619 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
2620 if (error)
2621 goto errout;
2622 wcn = (struct wapbl_wc_null *)wr->wr_scratch;
2623 switch (wcn->wc_type) {
2624 case WAPBL_WC_BLOCKS:
2625 wapbl_replay_process_blocks(wr, &off);
2626 break;
2627
2628 case WAPBL_WC_REVOCATIONS:
2629 wapbl_replay_process_revocations(wr);
2630 break;
2631
2632 case WAPBL_WC_INODES:
2633 wapbl_replay_process_inodes(wr, saveoff, off);
2634 break;
2635
2636 default:
2637 printf("Unrecognized wapbl type: 0x%08x\n",
2638 wcn->wc_type);
2639 error = EFTYPE;
2640 goto errout;
2641 }
2642 wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
2643 if (off != saveoff) {
2644 printf("wapbl_replay: corrupted records\n");
2645 error = EFTYPE;
2646 goto errout;
2647 }
2648 }
2649 return 0;
2650
2651 errout:
2652 wapbl_blkhash_clear(wr);
2653 return error;
2654 }
2655
2656 #if 0
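/*
 * XXX This disabled verification code still references 'wch', which
 * is not in scope in this function; it would need to use the values
 * cached in 'wr' before it could be re-enabled.
 */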
2657 int
2658 wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp)
2659 {
2660 off_t off;
2661 int mismatchcnt = 0;
2662 int logblklen = 1 << wr->wr_log_dev_bshift;
2663 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2664 void *scratch1 = wapbl_alloc(MAXBSIZE);
2665 void *scratch2 = wapbl_alloc(MAXBSIZE);
2666 int error = 0;
2667
2668 KDASSERT(wapbl_replay_isopen(wr));
2669
2670 off = wch->wc_tail;
2671 while (off != wch->wc_head) {
2672 struct wapbl_wc_null *wcn;
2673 #ifdef DEBUG
2674 off_t saveoff = off;
2675 #endif
2676 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
2677 if (error)
2678 goto out;
2679 wcn = (struct wapbl_wc_null *)wr->wr_scratch;
2680 switch (wcn->wc_type) {
2681 case WAPBL_WC_BLOCKS:
2682 {
2683 struct wapbl_wc_blocklist *wc =
2684 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2685 int i;
2686 for (i = 0; i < wc->wc_blkcount; i++) {
2687 int foundcnt = 0;
2688 int dirtycnt = 0;
2689 int j, n;
2690 				/*
2691 				 * Check each physical block against the
2692 				 * hashtable independently.
2693 				 */
2694 n = wc->wc_blocks[i].wc_dlen >>
2695 wch->wc_fs_dev_bshift;
2696 for (j = 0; j < n; j++) {
2697 struct wapbl_blk *wb =
2698 wapbl_blkhash_get(wr,
2699 wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
2700 if (wb && (wb->wb_off == off)) {
2701 foundcnt++;
2702 error =
2703 wapbl_circ_read(wr,
2704 scratch1, fsblklen,
2705 &off);
2706 if (error)
2707 goto out;
2708 error =
2709 wapbl_read(scratch2,
2710 fsblklen, fsdevvp,
2711 wb->wb_blk);
2712 if (error)
2713 goto out;
2714 if (memcmp(scratch1,
2715 scratch2,
2716 fsblklen)) {
2717 printf(
2718 "wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n",
2719 wb->wb_blk, (intmax_t)off);
2720 dirtycnt++;
2721 mismatchcnt++;
2722 }
2723 } else {
2724 wapbl_circ_advance(wr,
2725 fsblklen, &off);
2726 }
2727 }
2728 #if 0
2729 /*
2730 * If all of the blocks in an entry
2731 * are clean, then remove all of its
2732 * blocks from the hashtable since they
2733 * never will need replay.
2734 */
2735 if ((foundcnt != 0) &&
2736 (dirtycnt == 0)) {
2737 off = saveoff;
2738 wapbl_circ_advance(wr,
2739 logblklen, &off);
2740 for (j = 0; j < n; j++) {
2741 struct wapbl_blk *wb =
2742 wapbl_blkhash_get(wr,
2743 wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
2744 if (wb &&
2745 (wb->wb_off == off)) {
2746 wapbl_blkhash_rem(wr, wb->wb_blk);
2747 }
2748 wapbl_circ_advance(wr,
2749 fsblklen, &off);
2750 }
2751 }
2752 #endif
2753 }
2754 }
2755 break;
2756 case WAPBL_WC_REVOCATIONS:
2757 case WAPBL_WC_INODES:
2758 break;
2759 default:
2760 KASSERT(0);
2761 }
2762 #ifdef DEBUG
2763 wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
2764 KASSERT(off == saveoff);
2765 #endif
2766 }
2767 out:
2768 wapbl_free(scratch1, MAXBSIZE);
2769 wapbl_free(scratch2, MAXBSIZE);
2770 if (!error && mismatchcnt)
2771 error = EFTYPE;
2772 return error;
2773 }
2774 #endif
2775
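/*
 * wapbl_replay_write: copy every block recorded in the replay hash
 * from the log to its final location on the filesystem device.
 */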
2776 int
2777 wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp)
2778 {
2779 struct wapbl_blk *wb;
2780 size_t i;
2781 off_t off;
2782 void *scratch;
2783 int error = 0;
2784 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2785
2786 KDASSERT(wapbl_replay_isopen(wr));
2787
2788 scratch = wapbl_alloc(MAXBSIZE);
2789
2790 for (i = 0; i <= wr->wr_blkhashmask; ++i) {
2791 LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) {
2792 off = wb->wb_off;
2793 error = wapbl_circ_read(wr, scratch, fsblklen, &off);
2794 if (error)
2795 break;
2796 error = wapbl_write(scratch, fsblklen, fsdevvp,
2797 wb->wb_blk);
2798 if (error)
2799 break;
2800 }
2801 }
2802
2803 wapbl_free(scratch, MAXBSIZE);
2804 return error;
2805 }
2806
2807 int
2808 wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len)
2809 {
2810 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2811
2812 KDASSERT(wapbl_replay_isopen(wr));
2813 KASSERT((len % fsblklen) == 0);
2814
2815 while (len != 0) {
2816 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2817 if (wb)
2818 return 1;
2819 		len -= fsblklen;
		blk++;	/* advance to the next block, as wapbl_replay_read does */
2820 }
2821 return 0;
2822 }
2823
2824 int
2825 wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len)
2826 {
2827 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2828
2829 KDASSERT(wapbl_replay_isopen(wr));
2830
2831 KASSERT((len % fsblklen) == 0);
2832
2833 while (len != 0) {
2834 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2835 if (wb) {
2836 off_t off = wb->wb_off;
2837 int error;
2838 error = wapbl_circ_read(wr, data, fsblklen, &off);
2839 if (error)
2840 return error;
2841 }
2842 data = (uint8_t *)data + fsblklen;
2843 len -= fsblklen;
2844 blk++;
2845 }
2846 return 0;
2847 }
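
/*
 * Note: while a replay is open, filesystems can use
 * wapbl_replay_can_read/wapbl_replay_read above to satisfy reads of
 * metadata blocks whose newest copy still lives only in the log.
 */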
2848
2849 #ifdef _KERNEL
2850 /*
2851  * This is not really a module now, but maybe on its way to
2852  * being one some day.
2853  */
2854 MODULE(MODULE_CLASS_VFS, wapbl, NULL);
2855
2856 static int
2857 wapbl_modcmd(modcmd_t cmd, void *arg)
2858 {
2859
2860 switch (cmd) {
2861 case MODULE_CMD_INIT:
2862 wapbl_init();
2863 return 0;
2864 case MODULE_CMD_FINI:
2865 #ifdef notyet
2866 return wapbl_fini(true);
2867 #endif
2868 return EOPNOTSUPP;
2869 default:
2870 return ENOTTY;
2871 }
2872 }
2873 #endif /* _KERNEL */
2874