vfs_wapbl.c revision 1.62 1 /* $NetBSD: vfs_wapbl.c,v 1.62 2015/08/09 07:40:59 mlelstv Exp $ */
2
3 /*-
4 * Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Wasabi Systems, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * This implements file system independent write ahead filesystem logging.
34 */
35
36 #define WAPBL_INTERNAL
37
38 #include <sys/cdefs.h>
39 __KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.62 2015/08/09 07:40:59 mlelstv Exp $");
40
41 #include <sys/param.h>
42 #include <sys/bitops.h>
43
44 #ifdef _KERNEL
45 #include <sys/param.h>
46 #include <sys/namei.h>
47 #include <sys/proc.h>
48 #include <sys/sysctl.h>
49 #include <sys/uio.h>
50 #include <sys/vnode.h>
51 #include <sys/file.h>
52 #include <sys/module.h>
53 #include <sys/resourcevar.h>
54 #include <sys/conf.h>
55 #include <sys/mount.h>
56 #include <sys/kernel.h>
57 #include <sys/kauth.h>
58 #include <sys/mutex.h>
59 #include <sys/atomic.h>
60 #include <sys/wapbl.h>
61 #include <sys/wapbl_replay.h>
62
63 #include <miscfs/specfs/specdev.h>
64
65 #define wapbl_alloc(s) kmem_alloc((s), KM_SLEEP)
66 #define wapbl_free(a, s) kmem_free((a), (s))
67 #define wapbl_calloc(n, s) kmem_zalloc((n)*(s), KM_SLEEP)
68
69 static struct sysctllog *wapbl_sysctl;
70 static int wapbl_flush_disk_cache = 1;
71 static int wapbl_verbose_commit = 0;
72
73 static inline size_t wapbl_space_free(size_t, off_t, off_t);
74
75 #else /* !_KERNEL */
76 #include <assert.h>
77 #include <errno.h>
78 #include <stdio.h>
79 #include <stdbool.h>
80 #include <stdlib.h>
81 #include <string.h>
82
83 #include <sys/time.h>
84 #include <sys/wapbl.h>
85 #include <sys/wapbl_replay.h>
86
87 #define KDASSERT(x) assert(x)
88 #define KASSERT(x) assert(x)
89 #define wapbl_alloc(s) malloc(s)
90 #define wapbl_free(a, s) free(a)
91 #define wapbl_calloc(n, s) calloc((n), (s))
92
93 #endif /* !_KERNEL */
94
95 /*
96 * INTERNAL DATA STRUCTURES
97 */
98
99 /*
100 * This structure holds per-mount log information.
101 *
102 * Legend: a = atomic access only
103 * r = read-only after init
104 * l = rwlock held
105 * m = mutex held
106 * lm = rwlock held writing or mutex held
107 * u = unlocked access ok
108 * b = bufcache_lock held
109 */
110 LIST_HEAD(wapbl_ino_head, wapbl_ino);
111 struct wapbl {
112 struct vnode *wl_logvp; /* r: log here */
113 struct vnode *wl_devvp; /* r: log on this device */
114 struct mount *wl_mount; /* r: mountpoint wl is associated with */
115 daddr_t wl_logpbn; /* r: Physical block number of start of log */
116 int wl_log_dev_bshift; /* r: logarithm of device block size of log
117 device */
118 int wl_fs_dev_bshift; /* r: logarithm of device block size of
119 filesystem device */
120
121 unsigned wl_lock_count; /* m: Count of transactions in progress */
122
123 size_t wl_circ_size; /* r: Number of bytes in buffer of log */
124 size_t wl_circ_off; /* r: Number of bytes reserved at start */
125
126 size_t wl_bufcount_max; /* r: Number of buffers reserved for log */
127 size_t wl_bufbytes_max; /* r: Number of buf bytes reserved for log */
128
129 off_t wl_head; /* l: Byte offset of log head */
130 off_t wl_tail; /* l: Byte offset of log tail */
131 /*
132 * head == tail == 0 means log is empty
133 * head == tail != 0 means log is full
134 * see assertions in wapbl_advance() for other boundary conditions.
135 * only truncate moves the tail, except when flush sets it to
136 * wl_header_size only flush moves the head, except when truncate
137 * sets it to 0.
138 */
139
140 struct wapbl_wc_header *wl_wc_header; /* l */
141 void *wl_wc_scratch; /* l: scratch space (XXX: por que?!?) */
142
143 kmutex_t wl_mtx; /* u: short-term lock */
144 krwlock_t wl_rwlock; /* u: File system transaction lock */
145
146 /*
147 * Must be held while accessing
148 * wl_count or wl_bufs or head or tail
149 */
150
151 /*
152 * Callback called from within the flush routine to flush any extra
153 * bits. Note that flush may be skipped without calling this if
154 * there are no outstanding buffers in the transaction.
155 */
156 #if _KERNEL
157 wapbl_flush_fn_t wl_flush; /* r */
158 wapbl_flush_fn_t wl_flush_abort;/* r */
159 #endif
160
161 size_t wl_bufbytes; /* m: Byte count of pages in wl_bufs */
162 size_t wl_bufcount; /* m: Count of buffers in wl_bufs */
163 size_t wl_bcount; /* m: Total bcount of wl_bufs */
164
165 LIST_HEAD(, buf) wl_bufs; /* m: Buffers in current transaction */
166
167 kcondvar_t wl_reclaimable_cv; /* m (obviously) */
168 size_t wl_reclaimable_bytes; /* m: Amount of space available for
169 reclamation by truncate */
170 int wl_error_count; /* m: # of wl_entries with errors */
171 size_t wl_reserved_bytes; /* never truncate log smaller than this */
172
173 #ifdef WAPBL_DEBUG_BUFBYTES
174 size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */
175 #endif
176
177 daddr_t *wl_deallocblks;/* lm: address of block */
178 int *wl_dealloclens; /* lm: size of block */
179 int wl_dealloccnt; /* lm: total count */
180 int wl_dealloclim; /* l: max count */
181
182 /* hashtable of inode numbers for allocated but unlinked inodes */
183 /* synch ??? */
184 struct wapbl_ino_head *wl_inohash;
185 u_long wl_inohashmask;
186 int wl_inohashcnt;
187
188 SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction
189 accounting */
190
191 u_char *wl_buffer; /* l: buffer for wapbl_buffered_write() */
192 daddr_t wl_buffer_dblk; /* l: buffer disk block address */
193 size_t wl_buffer_used; /* l: buffer current use */
194 };
195
196 #ifdef WAPBL_DEBUG_PRINT
197 int wapbl_debug_print = WAPBL_DEBUG_PRINT;
198 #endif
199
200 /****************************************************************/
201 #ifdef _KERNEL
202
203 #ifdef WAPBL_DEBUG
204 struct wapbl *wapbl_debug_wl;
205 #endif
206
207 static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail);
208 static int wapbl_write_blocks(struct wapbl *wl, off_t *offp);
209 static int wapbl_write_revocations(struct wapbl *wl, off_t *offp);
210 static int wapbl_write_inodes(struct wapbl *wl, off_t *offp);
211 #endif /* _KERNEL */
212
213 static int wapbl_replay_process(struct wapbl_replay *wr, off_t, off_t);
214
215 static inline size_t wapbl_space_used(size_t avail, off_t head,
216 off_t tail);
217
218 #ifdef _KERNEL
219
220 static struct pool wapbl_entry_pool;
221
222 #define WAPBL_INODETRK_SIZE 83
223 static int wapbl_ino_pool_refcount;
224 static struct pool wapbl_ino_pool;
/*
 * One entry in the wl_inohash table: records an inode that is
 * allocated but unlinked during the transaction (see wl_inohash above).
 */
struct wapbl_ino {
	LIST_ENTRY(wapbl_ino) wi_hash;	/* hash-chain linkage */
	ino_t wi_ino;			/* inode number being tracked */
	mode_t wi_mode;			/* inode mode recorded with it */
};
230
231 static void wapbl_inodetrk_init(struct wapbl *wl, u_int size);
232 static void wapbl_inodetrk_free(struct wapbl *wl);
233 static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino);
234
235 static size_t wapbl_transaction_len(struct wapbl *wl);
236 static inline size_t wapbl_transaction_inodes_len(struct wapbl *wl);
237
238 #if 0
239 int wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
240 #endif
241
242 static int wapbl_replay_isopen1(struct wapbl_replay *);
243
244 /*
245 * This is useful for debugging. If set, the log will
246 * only be truncated when necessary.
247 */
248 int wapbl_lazy_truncate = 0;
249
/*
 * Operations vector through which the rest of the kernel reaches
 * the WAPBL implementation (filled with the public wapbl_* entry
 * points declared in <sys/wapbl.h>).
 */
struct wapbl_ops wapbl_ops = {
	.wo_wapbl_discard	= wapbl_discard,
	.wo_wapbl_replay_isopen	= wapbl_replay_isopen1,
	.wo_wapbl_replay_can_read = wapbl_replay_can_read,
	.wo_wapbl_replay_read	= wapbl_replay_read,
	.wo_wapbl_add_buf	= wapbl_add_buf,
	.wo_wapbl_remove_buf	= wapbl_remove_buf,
	.wo_wapbl_resize_buf	= wapbl_resize_buf,
	.wo_wapbl_begin		= wapbl_begin,
	.wo_wapbl_end		= wapbl_end,
	.wo_wapbl_junlock_assert= wapbl_junlock_assert,

	/* XXX: the following is only used to say "this is a wapbl buf" */
	.wo_wapbl_biodone	= wapbl_biodone,
};
265
266 static int
267 wapbl_sysctl_init(void)
268 {
269 int rv;
270 const struct sysctlnode *rnode, *cnode;
271
272 wapbl_sysctl = NULL;
273
274 rv = sysctl_createv(&wapbl_sysctl, 0, NULL, &rnode,
275 CTLFLAG_PERMANENT,
276 CTLTYPE_NODE, "wapbl",
277 SYSCTL_DESCR("WAPBL journaling options"),
278 NULL, 0, NULL, 0,
279 CTL_VFS, CTL_CREATE, CTL_EOL);
280 if (rv)
281 return rv;
282
283 rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
284 CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
285 CTLTYPE_INT, "flush_disk_cache",
286 SYSCTL_DESCR("flush disk cache"),
287 NULL, 0, &wapbl_flush_disk_cache, 0,
288 CTL_CREATE, CTL_EOL);
289 if (rv)
290 return rv;
291
292 rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
293 CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
294 CTLTYPE_INT, "verbose_commit",
295 SYSCTL_DESCR("show time and size of wapbl log commits"),
296 NULL, 0, &wapbl_verbose_commit, 0,
297 CTL_CREATE, CTL_EOL);
298 return rv;
299 }
300
301 static void
302 wapbl_init(void)
303 {
304
305 pool_init(&wapbl_entry_pool, sizeof(struct wapbl_entry), 0, 0, 0,
306 "wapblentrypl", &pool_allocator_kmem, IPL_VM);
307
308 wapbl_sysctl_init();
309 }
310
311 #ifdef notyet
312 static int
313 wapbl_fini(bool interface)
314 {
315
316 if (aio_sysctl != NULL)
317 sysctl_teardown(&aio_sysctl);
318
319 pool_destroy(&wapbl_entry_pool);
320
321 return 0;
322 }
323 #endif
324
/*
 * Carry the unlinked-but-allocated inode list from a replayed log (wr)
 * over into a freshly opened log (wl), and write it as the initial log
 * contents.  Only valid when the new log is the very same on-disk log
 * the replay state was built from — the KASSERTs below check that the
 * device, location, geometry and block shifts all match.
 *
 * On success the log head/tail are positioned past the inode records
 * and 0 is returned; otherwise the error from wapbl_write_inodes().
 */
static int
wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr)
{
	int error, i;

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_start: reusing log with %d inodes\n", wr->wr_inodescnt));

	/*
	 * Its only valid to reuse the replay log if its
	 * the same as the new log we just opened.
	 */
	KDASSERT(!wapbl_replay_isopen(wr));
	KASSERT(wl->wl_devvp->v_type == VBLK);
	KASSERT(wr->wr_devvp->v_type == VBLK);
	KASSERT(wl->wl_devvp->v_rdev == wr->wr_devvp->v_rdev);
	KASSERT(wl->wl_logpbn == wr->wr_logpbn);
	KASSERT(wl->wl_circ_size == wr->wr_circ_size);
	KASSERT(wl->wl_circ_off == wr->wr_circ_off);
	KASSERT(wl->wl_log_dev_bshift == wr->wr_log_dev_bshift);
	KASSERT(wl->wl_fs_dev_bshift == wr->wr_fs_dev_bshift);

	/* New log supersedes the replayed one. */
	wl->wl_wc_header->wc_generation = wr->wr_generation + 1;

	/* Re-register every preserved inode with the new log. */
	for (i = 0; i < wr->wr_inodescnt; i++)
		wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber,
		    wr->wr_inodes[i].wr_imode);

	/* Make sure new transaction won't overwrite old inodes list */
	KDASSERT(wapbl_transaction_len(wl) <=
	    wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead,
	    wr->wr_inodestail));

	/* Start the log empty, at the replayed inode-list head. */
	wl->wl_head = wl->wl_tail = wr->wr_inodeshead;
	wl->wl_reclaimable_bytes = wl->wl_reserved_bytes =
	    wapbl_transaction_len(wl);

	/* Write the inode records; this advances wl_head past them. */
	error = wapbl_write_inodes(wl, &wl->wl_head);
	if (error)
		return error;

	/* Log must now be non-empty (head != tail, head != 0). */
	KASSERT(wl->wl_head != wl->wl_tail);
	KASSERT(wl->wl_head != 0);

	return 0;
}
371
/*
 * wapbl_start: create and initialize the per-mount log state.
 *
 * => vp locates the journal; off/count/blksize describe its position
 *    and size (count blocks of blksize bytes) on the device resolved
 *    via VOP_BMAP().
 * => wr, if non-NULL and holding inodes, is replay state whose
 *    unlinked-but-allocated inode list is preserved into the new log.
 * => flushfn/flushabortfn are filesystem callbacks invoked during
 *    flush and flush-abort respectively.
 * => On success stores the new log in *wlp and returns 0; on failure
 *    returns an errno and frees everything allocated here.
 */
int
wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp,
	daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr,
	wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn)
{
	struct wapbl *wl;
	struct vnode *devvp;
	daddr_t logpbn;
	int error;
	int log_dev_bshift = ilog2(blksize);
	int fs_dev_bshift = log_dev_bshift;
	int run;

	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64
	    " count=%zu blksize=%zu\n", vp, off, count, blksize));

	/*
	 * NOTE(review): both shifts are initialized from ilog2(blksize)
	 * above, so this branch cannot fire as currently written; it
	 * guards a future case where the two differ.
	 */
	if (log_dev_bshift > fs_dev_bshift) {
		WAPBL_PRINTF(WAPBL_PRINT_OPEN,
			("wapbl: log device's block size cannot be larger "
			 "than filesystem's\n"));
		/*
		 * Not currently implemented, although it could be if
		 * needed someday.
		 */
		return ENOSYS;
	}

	if (off < 0)
		return EINVAL;

	/* Journal block size must be a positive multiple of DEV_BSIZE. */
	if (blksize < DEV_BSIZE)
		return EINVAL;
	if (blksize % DEV_BSIZE)
		return EINVAL;

	/* XXXTODO: verify that the full load is writable */

	/*
	 * XXX check for minimum log size
	 * minimum is governed by minimum amount of space
	 * to complete a transaction. (probably truncate)
	 */
	/* XXX for now pick something minimal */
	if ((count * blksize) < MAXPHYS) {
		return ENOSPC;
	}

	/* Translate the journal offset to a device + physical block. */
	if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) {
		return error;
	}

	wl = wapbl_calloc(1, sizeof(*wl));
	rw_init(&wl->wl_rwlock);
	mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&wl->wl_reclaimable_cv, "wapblrec");
	LIST_INIT(&wl->wl_bufs);
	SIMPLEQ_INIT(&wl->wl_entries);

	wl->wl_logvp = vp;
	wl->wl_devvp = devvp;
	wl->wl_mount = mp;
	wl->wl_logpbn = logpbn;
	wl->wl_log_dev_bshift = log_dev_bshift;
	wl->wl_fs_dev_bshift = fs_dev_bshift;

	wl->wl_flush = flushfn;
	wl->wl_flush_abort = flushabortfn;

	/* Reserve two log device blocks for the commit headers */
	wl->wl_circ_off = 2<<wl->wl_log_dev_bshift;
	wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off);
	/* truncate the log usage to a multiple of log_dev_bshift */
	wl->wl_circ_size >>= wl->wl_log_dev_bshift;
	wl->wl_circ_size <<= wl->wl_log_dev_bshift;

	/*
	 * wl_bufbytes_max limits the size of the in memory transaction space.
	 * - Since buffers are allocated and accounted for in units of
	 *   PAGE_SIZE it is required to be a multiple of PAGE_SIZE
	 *   (i.e. 1<<PAGE_SHIFT)
	 * - Since the log device has to be written in units of
	 *   1<<wl_log_dev_bshift it is required to be a mulitple of
	 *   1<<wl_log_dev_bshift.
	 * - Since filesystem will provide data in units of 1<<wl_fs_dev_bshift,
	 *   it is convenient to be a multiple of 1<<wl_fs_dev_bshift.
	 * Therefore it must be multiple of the least common multiple of those
	 * three quantities.  Fortunately, all of those quantities are
	 * guaranteed to be a power of two, and the least common multiple of
	 * a set of numbers which are all powers of two is simply the maximum
	 * of those numbers.  Finally, the maximum logarithm of a power of two
	 * is the same as the log of the maximum power of two.  So we can do
	 * the following operations to size wl_bufbytes_max:
	 */

	/* XXX fix actual number of pages reserved per filesystem. */
	wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2);

	/* Round wl_bufbytes_max to the largest power of two constraint */
	wl->wl_bufbytes_max >>= PAGE_SHIFT;
	wl->wl_bufbytes_max <<= PAGE_SHIFT;
	wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift;

	/* XXX maybe use filesystem fragment size instead of 1024 */
	/* XXX fix actual number of buffers reserved per filesystem. */
	wl->wl_bufcount_max = (nbuf / 2) * 1024;

	/* XXX tie this into resource estimation */
	wl->wl_dealloclim = wl->wl_bufbytes_max / mp->mnt_stat.f_bsize / 2;

	/* Parallel arrays recording pending block deallocations. */
	wl->wl_deallocblks = wapbl_alloc(sizeof(*wl->wl_deallocblks) *
	    wl->wl_dealloclim);
	wl->wl_dealloclens = wapbl_alloc(sizeof(*wl->wl_dealloclens) *
	    wl->wl_dealloclim);

	/* Coalescing buffer for wapbl_buffered_write(). */
	wl->wl_buffer = wapbl_alloc(MAXPHYS);
	wl->wl_buffer_used = 0;

	wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);

	/* Initialize the commit header */
	{
		struct wapbl_wc_header *wc;
		size_t len = 1 << wl->wl_log_dev_bshift;
		wc = wapbl_calloc(1, len);
		wc->wc_type = WAPBL_WC_HEADER;
		wc->wc_len = len;
		wc->wc_circ_off = wl->wl_circ_off;
		wc->wc_circ_size = wl->wl_circ_size;
		/* XXX wc->wc_fsid */
		wc->wc_log_dev_bshift = wl->wl_log_dev_bshift;
		wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift;
		wl->wl_wc_header = wc;
		wl->wl_wc_scratch = wapbl_alloc(len);
	}

	/*
	 * if there was an existing set of unlinked but
	 * allocated inodes, preserve it in the new
	 * log.
	 */
	if (wr && wr->wr_inodescnt) {
		error = wapbl_start_flush_inodes(wl, wr);
		if (error)
			goto errout;
	}

	/* Persist the (possibly empty) initial state of the log. */
	error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail);
	if (error) {
		goto errout;
	}

	*wlp = wl;
#if defined(WAPBL_DEBUG)
	wapbl_debug_wl = wl;
#endif

	return 0;
 errout:
	/* Tear down everything allocated above, in reverse-ish order. */
	wapbl_discard(wl);
	wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_deallocblks,
	    sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim);
	wapbl_free(wl->wl_dealloclens,
	    sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim);
	wapbl_free(wl->wl_buffer, MAXPHYS);
	wapbl_inodetrk_free(wl);
	wapbl_free(wl, sizeof(*wl));

	return error;
}
546
547 /*
548 * Like wapbl_flush, only discards the transaction
549 * completely
550 */
551
/*
 * Discard the current transaction instead of flushing it: notify the
 * filesystem of pending deallocations, drop the unlinked-inode table,
 * release all buffers in the transaction, and detach any on-disk
 * entries (entries that still have in-flight buffers are freed later
 * by wapbl_biodone()).  Takes wl_rwlock as writer for the duration.
 */
void
wapbl_discard(struct wapbl *wl)
{
	struct wapbl_entry *we;
	struct buf *bp;
	int i;

	/*
	 * XXX we may consider using upgrade here
	 * if we want to call flush from inside a transaction
	 */
	rw_enter(&wl->wl_rwlock, RW_WRITER);
	wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
	    wl->wl_dealloccnt);

#ifdef WAPBL_DEBUG_PRINT
	{
		pid_t pid = -1;
		lwpid_t lid = -1;
		if (curproc)
			pid = curproc->p_pid;
		if (curlwp)
			lid = curlwp->l_lid;
#ifdef WAPBL_DEBUG_BUFBYTES
		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
		    ("wapbl_discard: thread %d.%d discarding "
		    "transaction\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %u, reclaimable=%zu reserved=%zu "
		    "unsynced=%zu\n",
		    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
		    wl->wl_bcount, wl->wl_dealloccnt,
		    wl->wl_inohashcnt, wl->wl_error_count,
		    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
		    wl->wl_unsynced_bufbytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			     "error = %d, unsynced = %zu\n",
			     we->we_bufcount, we->we_reclaimable_bytes,
			     we->we_error, we->we_unsynced_bufbytes));
		}
#else /* !WAPBL_DEBUG_BUFBYTES */
		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
		    ("wapbl_discard: thread %d.%d discarding transaction\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %u, reclaimable=%zu reserved=%zu\n",
		    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
		    wl->wl_bcount, wl->wl_dealloccnt,
		    wl->wl_inohashcnt, wl->wl_error_count,
		    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			     "error = %d\n",
			     we->we_bufcount, we->we_reclaimable_bytes,
			     we->we_error));
		}
#endif /* !WAPBL_DEBUG_BUFBYTES */
	}
#endif /* WAPBL_DEBUG_PRINT */

	/* Empty every chain of the unlinked-inode hash table. */
	for (i = 0; i <= wl->wl_inohashmask; i++) {
		struct wapbl_ino_head *wih;
		struct wapbl_ino *wi;

		wih = &wl->wl_inohash[i];
		while ((wi = LIST_FIRST(wih)) != NULL) {
			LIST_REMOVE(wi, wi_hash);
			pool_put(&wapbl_ino_pool, wi);
			KASSERT(wl->wl_inohashcnt > 0);
			wl->wl_inohashcnt--;
		}
	}

	/*
	 * clean buffer list
	 */
	mutex_enter(&bufcache_lock);
	mutex_enter(&wl->wl_mtx);
	while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
		if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) {
			/*
			 * The buffer will be unlocked and
			 * removed from the transaction in brelse
			 */
			mutex_exit(&wl->wl_mtx);
			brelsel(bp, 0);
			mutex_enter(&wl->wl_mtx);
		}
	}
	mutex_exit(&wl->wl_mtx);
	mutex_exit(&bufcache_lock);

	/*
	 * Remove references to this wl from wl_entries, free any which
	 * no longer have buffers, others will be freed in wapbl_biodone
	 * when they no longer have any buffers.
	 */
	while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) {
		SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
		/* XXX should we be accumulating wl_error_count
		 * and increasing reclaimable bytes ? */
		we->we_wapbl = NULL;
		if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
			KASSERT(we->we_unsynced_bufbytes == 0);
#endif
			pool_put(&wapbl_entry_pool, we);
		}
	}

	/* Discard list of deallocs */
	wl->wl_dealloccnt = 0;
	/* XXX should we clear wl_reserved_bytes? */

	/* The transaction must now be completely empty. */
	KASSERT(wl->wl_bufbytes == 0);
	KASSERT(wl->wl_bcount == 0);
	KASSERT(wl->wl_bufcount == 0);
	KASSERT(LIST_EMPTY(&wl->wl_bufs));
	KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
	KASSERT(wl->wl_inohashcnt == 0);

	rw_exit(&wl->wl_rwlock);
}
679
/*
 * Shut down the log: flush (or, with force, discard) any pending
 * transaction, then free all log state.  Returns 0 on success; without
 * force, returns the flush error or EBUSY if unlinked inodes remain.
 */
int
wapbl_stop(struct wapbl *wl, int force)
{
	int error;

	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n"));
	error = wapbl_flush(wl, 1);
	if (error) {
		if (force)
			wapbl_discard(wl);
		else
			return error;
	}

	/* Unlinked inodes persist after a flush */
	if (wl->wl_inohashcnt) {
		if (force) {
			wapbl_discard(wl);
		} else {
			return EBUSY;
		}
	}

	/* Log must be empty before the state can be torn down. */
	KASSERT(wl->wl_bufbytes == 0);
	KASSERT(wl->wl_bcount == 0);
	KASSERT(wl->wl_bufcount == 0);
	KASSERT(LIST_EMPTY(&wl->wl_bufs));
	KASSERT(wl->wl_dealloccnt == 0);
	KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
	KASSERT(wl->wl_inohashcnt == 0);

	/* Release everything allocated in wapbl_start(). */
	wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_deallocblks,
	    sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim);
	wapbl_free(wl->wl_dealloclens,
	    sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim);
	wapbl_free(wl->wl_buffer, MAXPHYS);
	wapbl_inodetrk_free(wl);

	cv_destroy(&wl->wl_reclaimable_cv);
	mutex_destroy(&wl->wl_mtx);
	rw_destroy(&wl->wl_rwlock);
	wapbl_free(wl, sizeof(*wl));

	return 0;
}
727
/*
 * Perform one synchronous raw I/O (B_READ or B_WRITE, from flags) of
 * len bytes at physical block pbn on the block device devvp, using a
 * private iobuf.  Accounts the transfer against the current process
 * and returns 0 or the error from biowait().
 */
static int
wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
{
	struct pstats *pstats = curlwp->l_proc->p_stats;
	struct buf *bp;
	int error;

	KASSERT((flags & ~(B_WRITE | B_READ)) == 0);
	KASSERT(devvp->v_type == VBLK);

	/* Account the transfer and track pending output on the vnode. */
	if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
		mutex_enter(devvp->v_interlock);
		devvp->v_numoutput++;
		mutex_exit(devvp->v_interlock);
		pstats->p_ru.ru_oublock++;
	} else {
		pstats->p_ru.ru_inblock++;
	}

	/* Set up a dedicated iobuf for this transfer. */
	bp = getiobuf(devvp, true);
	bp->b_flags = flags;
	bp->b_cflags = BC_BUSY; /* silly & dubious */
	bp->b_dev = devvp->v_rdev;
	bp->b_data = data;
	bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
	bp->b_blkno = pbn;
	BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);

	WAPBL_PRINTF(WAPBL_PRINT_IO,
	    ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%"PRIx64"\n",
	    BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount,
	    bp->b_blkno, bp->b_dev));

	VOP_STRATEGY(devvp, bp);

	/* Wait for completion and release the iobuf. */
	error = biowait(bp);
	putiobuf(bp);

	if (error) {
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_doio: %s %zu bytes at block %" PRId64
		    " on dev 0x%"PRIx64" failed with error %d\n",
		    (((flags & (B_WRITE | B_READ)) == B_WRITE) ?
		     "write" : "read"),
		    len, pbn, devvp->v_rdev, error));
	}

	return error;
}
777
/*
 * Synchronously write len bytes from data to physical block pbn on
 * devvp.  Thin wrapper around wapbl_doio() with B_WRITE.
 */
int
wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

	return wapbl_doio(data, len, devvp, pbn, B_WRITE);
}
784
/*
 * Synchronously read len bytes into data from physical block pbn on
 * devvp.  Thin wrapper around wapbl_doio() with B_READ.
 */
int
wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

	return wapbl_doio(data, len, devvp, pbn, B_READ);
}
791
792 /*
793 * Flush buffered data if any.
794 */
795 static int
796 wapbl_buffered_flush(struct wapbl *wl)
797 {
798 int error;
799
800 if (wl->wl_buffer_used == 0)
801 return 0;
802
803 error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used,
804 wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE);
805 wl->wl_buffer_used = 0;
806
807 return error;
808 }
809
810 /*
811 * Write data to the log.
812 * Try to coalesce writes and emit MAXPHYS aligned blocks.
813 */
/*
 * Write data to the log.
 * Try to coalesce writes and emit MAXPHYS aligned blocks.
 *
 * Appends len bytes at disk block pbn into wl->wl_buffer, flushing
 * whenever the pending data is not contiguous with pbn or a MAXPHYS
 * boundary is reached.  Returns 0 or the first I/O error.
 */
static int
wapbl_buffered_write(void *data, size_t len, struct wapbl *wl, daddr_t pbn)
{
	int error;
	size_t resid;

	/*
	 * If not adjacent to buffered data flush first.  Disk block
	 * address is always valid for non-empty buffer.
	 */
	if (wl->wl_buffer_used > 0 &&
	    pbn != wl->wl_buffer_dblk + btodb(wl->wl_buffer_used)) {
		error = wapbl_buffered_flush(wl);
		if (error)
			return error;
	}
	/*
	 * If this write goes to an empty buffer we have to
	 * save the disk block address first.
	 */
	if (wl->wl_buffer_used == 0)
		wl->wl_buffer_dblk = pbn;
	/*
	 * Remaining space so this buffer ends on a MAXPHYS boundary.
	 *
	 * Cannot become less or equal zero as the buffer would have been
	 * flushed on the last call then.
	 */
	resid = MAXPHYS - dbtob(wl->wl_buffer_dblk % btodb(MAXPHYS)) -
	    wl->wl_buffer_used;
	KASSERT(resid > 0);
	KASSERT(dbtob(btodb(resid)) == resid);
	if (len >= resid) {
		/* Fill to the MAXPHYS boundary and write that chunk out. */
		memcpy(wl->wl_buffer + wl->wl_buffer_used, data, resid);
		wl->wl_buffer_used += resid;
		error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used,
		    wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE);
		data = (uint8_t *)data + resid;
		len -= resid;
		wl->wl_buffer_dblk = pbn + btodb(resid);
		wl->wl_buffer_used = 0;
		if (error)
			return error;
	}
	KASSERT(len < MAXPHYS);
	if (len > 0) {
		/* Stash the (sub-MAXPHYS) remainder for a later flush. */
		memcpy(wl->wl_buffer + wl->wl_buffer_used, data, len);
		wl->wl_buffer_used += len;
	}

	return 0;
}
866
867 /*
868 * Off is byte offset returns new offset for next write
869 * handles log wraparound
870 */
/*
 * Off is byte offset returns new offset for next write
 * handles log wraparound
 *
 * Writes len bytes (a multiple of the log device block size) into the
 * circular log area starting at *offp, splitting the write in two when
 * it would run past the end of the circular buffer.  On success *offp
 * is advanced (wrapping to wl_circ_off) and 0 is returned.
 */
static int
wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp)
{
	size_t slen;
	off_t off = *offp;
	int error;
	daddr_t pbn;

	KDASSERT(((len >> wl->wl_log_dev_bshift) <<
	    wl->wl_log_dev_bshift) == len);

	/* Clamp into the circular area (skip the reserved header space). */
	if (off < wl->wl_circ_off)
		off = wl->wl_circ_off;
	slen = wl->wl_circ_off + wl->wl_circ_size - off;
	if (slen < len) {
		/* First part: write up to the end of the circular buffer. */
		pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
#ifdef _KERNEL
		pbn = btodb(pbn << wl->wl_log_dev_bshift);
#endif
		error = wapbl_buffered_write(data, slen, wl, pbn);
		if (error)
			return error;
		data = (uint8_t *)data + slen;
		len -= slen;
		off = wl->wl_circ_off;
	}
	/* Second (or only) part at the current offset. */
	pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
#ifdef _KERNEL
	pbn = btodb(pbn << wl->wl_log_dev_bshift);
#endif
	error = wapbl_buffered_write(data, len, wl, pbn);
	if (error)
		return error;
	off += len;
	if (off >= wl->wl_circ_off + wl->wl_circ_size)
		off = wl->wl_circ_off;
	*offp = off;
	return 0;
}
910
911 /****************************************************************/
912
/*
 * Begin a transaction on the log: possibly force a flush first if the
 * in-memory transaction has grown past half of any of its limits, then
 * take wl_rwlock as reader and bump the transaction count.  file/line
 * identify the caller for debug output.  Returns 0 or a flush error.
 */
int
wapbl_begin(struct wapbl *wl, const char *file, int line)
{
	int doflush;
	unsigned lockcount;

	KDASSERT(wl);

	/*
	 * XXX this needs to be made much more sophisticated.
	 * perhaps each wapbl_begin could reserve a specified
	 * number of buffers and bytes.
	 */
	mutex_enter(&wl->wl_mtx);
	lockcount = wl->wl_lock_count;
	/*
	 * Heuristic: flush when buffered bytes/counts (plus a worst-case
	 * allowance per in-progress transaction) exceed half their
	 * limits, or the transaction exceeds half the log/dealloc space.
	 */
	doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) >
		    wl->wl_bufbytes_max / 2) ||
		  ((wl->wl_bufcount + (lockcount * 10)) >
		    wl->wl_bufcount_max / 2) ||
		  (wapbl_transaction_len(wl) > wl->wl_circ_size / 2) ||
		  (wl->wl_dealloccnt >= (wl->wl_dealloclim / 2));
	mutex_exit(&wl->wl_mtx);

	if (doflush) {
		WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		    ("force flush lockcnt=%d bufbytes=%zu "
		    "(max=%zu) bufcount=%zu (max=%zu) "
		    "dealloccnt %d (lim=%d)\n",
		    lockcount, wl->wl_bufbytes,
		    wl->wl_bufbytes_max, wl->wl_bufcount,
		    wl->wl_bufcount_max,
		    wl->wl_dealloccnt, wl->wl_dealloclim));
	}

	if (doflush) {
		int error = wapbl_flush(wl, 0);
		if (error)
			return error;
	}

	/* Multiple transactions may run concurrently (reader lock). */
	rw_enter(&wl->wl_rwlock, RW_READER);
	mutex_enter(&wl->wl_mtx);
	wl->wl_lock_count++;
	mutex_exit(&wl->wl_mtx);

#if defined(WAPBL_DEBUG_PRINT)
	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
	    ("wapbl_begin thread %d.%d with bufcount=%zu "
	    "bufbytes=%zu bcount=%zu at %s:%d\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount, file, line));
#endif

	return 0;
}
968
/*
 * End a transaction started by wapbl_begin(): drop the transaction
 * count and release the reader lock.  Under DIAGNOSTIC, panics if the
 * accumulated transaction can no longer fit in the log.
 */
void
wapbl_end(struct wapbl *wl)
{

#if defined(WAPBL_DEBUG_PRINT)
	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
	    ("wapbl_end thread %d.%d with bufcount=%zu "
	    "bufbytes=%zu bcount=%zu\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount));
#endif

#ifdef DIAGNOSTIC
	size_t flushsize = wapbl_transaction_len(wl);
	if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
		/*
		 * XXX this could be handled more gracefully, perhaps place
		 * only a partial transaction in the log and allow the
		 * remaining to flush without the protection of the journal.
		 */
		panic("wapbl_end: current transaction too big to flush\n");
	}
#endif

	mutex_enter(&wl->wl_mtx);
	KASSERT(wl->wl_lock_count > 0);
	wl->wl_lock_count--;
	mutex_exit(&wl->wl_mtx);

	rw_exit(&wl->wl_rwlock);
}
1000
/*
 * Add (or re-add) a busy buffer to the current transaction.  A buffer
 * already marked B_LOCKED is moved to the head of the transaction list
 * without re-accounting; a new buffer is accounted in wl_bufbytes /
 * wl_bcount / wl_bufcount and then marked B_LOCKED.
 */
void
wapbl_add_buf(struct wapbl *wl, struct buf * bp)
{

	KASSERT(bp->b_cflags & BC_BUSY);
	KASSERT(bp->b_vp);

	wapbl_jlock_assert(wl);

#if 0
	/*
	 * XXX this might be an issue for swapfiles.
	 * see uvm_swap.c:1702
	 *
	 * XXX2 why require it then?  leap of semantics?
	 */
	KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
#endif

	mutex_enter(&wl->wl_mtx);
	if (bp->b_flags & B_LOCKED) {
		/* Already in the transaction; just move to the list head. */
		LIST_REMOVE(bp, b_wapbllist);
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER2,
		   ("wapbl_add_buf thread %d.%d re-adding buf %p "
		    "with %d bytes %d bcount\n",
		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
		    bp->b_bcount));
	} else {
		/* unlocked by dirty buffers shouldn't exist */
		KASSERT(!(bp->b_oflags & BO_DELWRI));
		wl->wl_bufbytes += bp->b_bufsize;
		wl->wl_bcount += bp->b_bcount;
		wl->wl_bufcount++;
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
		   ("wapbl_add_buf thread %d.%d adding buf %p "
		    "with %d bytes %d bcount\n",
		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
		    bp->b_bcount));
	}
	LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist);
	mutex_exit(&wl->wl_mtx);

	bp->b_flags |= B_LOCKED;
}
1045
/*
 * Remove a buffer from the current transaction and undo its
 * accounting.  Caller holds wl_mtx; the buffer must be busy and
 * B_LOCKED (i.e. previously added with wapbl_add_buf()).  Clears
 * B_LOCKED on the way out.
 */
static void
wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp)
{

	KASSERT(mutex_owned(&wl->wl_mtx));
	KASSERT(bp->b_cflags & BC_BUSY);
	wapbl_jlock_assert(wl);

#if 0
	/*
	 * XXX this might be an issue for swapfiles.
	 * see uvm_swap.c:1725
	 *
	 * XXXdeux: see above
	 */
	KASSERT((bp->b_flags & BC_NOCACHE) == 0);
#endif
	KASSERT(bp->b_flags & B_LOCKED);

	WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
	   ("wapbl_remove_buf thread %d.%d removing buf %p with "
	    "%d bytes %d bcount\n",
	    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount));

	/* Reverse the accounting done in wapbl_add_buf(). */
	KASSERT(wl->wl_bufbytes >= bp->b_bufsize);
	wl->wl_bufbytes -= bp->b_bufsize;
	KASSERT(wl->wl_bcount >= bp->b_bcount);
	wl->wl_bcount -= bp->b_bcount;
	KASSERT(wl->wl_bufcount > 0);
	wl->wl_bufcount--;
	/* Counters reach zero together, or not at all. */
	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
	LIST_REMOVE(bp, b_wapbllist);

	bp->b_flags &= ~B_LOCKED;
}
1082
/*
 * wapbl_remove_buf: locked wrapper around wapbl_remove_buf_locked().
 * called from brelsel() in vfs_bio among other places
 */
void
wapbl_remove_buf(struct wapbl * wl, struct buf *bp)
{

	mutex_enter(&wl->wl_mtx);
	wapbl_remove_buf_locked(wl, bp);
	mutex_exit(&wl->wl_mtx);
}
1092
1093 void
1094 wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt)
1095 {
1096
1097 KASSERT(bp->b_cflags & BC_BUSY);
1098
1099 /*
1100 * XXX: why does this depend on B_LOCKED? otherwise the buf
1101 * is not for a transaction? if so, why is this called in the
1102 * first place?
1103 */
1104 if (bp->b_flags & B_LOCKED) {
1105 mutex_enter(&wl->wl_mtx);
1106 wl->wl_bufbytes += bp->b_bufsize - oldsz;
1107 wl->wl_bcount += bp->b_bcount - oldcnt;
1108 mutex_exit(&wl->wl_mtx);
1109 }
1110 }
1111
1112 #endif /* _KERNEL */
1113
1114 /****************************************************************/
1115 /* Some utility inlines */
1116
1117 static inline size_t
1118 wapbl_space_used(size_t avail, off_t head, off_t tail)
1119 {
1120
1121 if (tail == 0) {
1122 KASSERT(head == 0);
1123 return 0;
1124 }
1125 return ((head + (avail - 1) - tail) % avail) + 1;
1126 }
1127
1128 #ifdef _KERNEL
/*
 * wapbl_advance: advance a circular-log offset by delta bytes.
 *
 * The log occupies byte offsets [off, off + size); the value 0 is
 * reserved to mean "no offset" (empty log).  Advancing from 0 by a
 * non-zero delta starts at off + delta; otherwise the offset wraps
 * around modulo the log size, never landing back on the reserved 0.
 * This is used to advance the pointer at old to new value at old+delta.
 */
static inline off_t
wapbl_advance(size_t size, size_t off, off_t oldoff, size_t delta)
{
	off_t newoff;

	/* Define acceptable ranges for inputs. */
	KASSERT(delta <= (size_t)size);
	KASSERT((oldoff == 0) || ((size_t)oldoff >= off));
	KASSERT(oldoff < (off_t)(size + off));

	if ((oldoff == 0) && (delta != 0))
		newoff = off + delta;
	else if ((oldoff + delta) < (size + off))
		newoff = oldoff + delta;
	else
		newoff = (oldoff + delta) - size;

	/* Note some interesting axioms */
	KASSERT((delta != 0) || (newoff == oldoff));
	KASSERT((delta == 0) || (newoff != 0));
	KASSERT((delta != (size)) || (newoff == oldoff));

	/* Define acceptable ranges for output. */
	KASSERT((newoff == 0) || ((size_t)newoff >= off));
	KASSERT((size_t)newoff < (size + off));
	return newoff;
}
1157
/*
 * wapbl_space_free: number of bytes still available for new log data
 * in a circular log of avail bytes with the given head/tail.
 */
static inline size_t
wapbl_space_free(size_t avail, off_t head, off_t tail)
{

	return avail - wapbl_space_used(avail, head, tail);
}
1164
/*
 * wapbl_advance_head: move the log head forward by delta bytes after
 * data has been appended.  If the log was empty (tail == 0) and the
 * head moved, the tail is initialized to the start of the circular
 * area so the new data lies inside [tail, head).
 */
static inline void
wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp,
		   off_t *tailp)
{
	off_t head = *headp;
	off_t tail = *tailp;

	KASSERT(delta <= wapbl_space_free(size, head, tail));
	head = wapbl_advance(size, off, head, delta);
	if ((tail == 0) && (head != 0))
		tail = off;
	*headp = head;
	*tailp = tail;
}
1179
/*
 * wapbl_advance_tail: move the log tail forward by delta bytes once
 * that much log data no longer needs protection.  If the tail catches
 * up with the head, the log is empty and both offsets reset to the
 * reserved value 0.
 */
static inline void
wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp,
		   off_t *tailp)
{
	off_t head = *headp;
	off_t tail = *tailp;

	KASSERT(delta <= wapbl_space_used(size, head, tail));
	tail = wapbl_advance(size, off, tail, delta);
	if (head == tail) {
		head = tail = 0;
	}
	*headp = head;
	*tailp = tail;
}
1195
1196
1197 /****************************************************************/
1198
1199 /*
1200 * Remove transactions whose buffers are completely flushed to disk.
1201 * Will block until at least minfree space is available.
1202 * only intended to be called from inside wapbl_flush and therefore
1203 * does not protect against commit races with itself or with flush.
1204 */
1205 static int
1206 wapbl_truncate(struct wapbl *wl, size_t minfree, int waitonly)
1207 {
1208 size_t delta;
1209 size_t avail;
1210 off_t head;
1211 off_t tail;
1212 int error = 0;
1213
1214 KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes));
1215 KASSERT(rw_write_held(&wl->wl_rwlock));
1216
1217 mutex_enter(&wl->wl_mtx);
1218
1219 /*
1220 * First check to see if we have to do a commit
1221 * at all.
1222 */
1223 avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail);
1224 if (minfree < avail) {
1225 mutex_exit(&wl->wl_mtx);
1226 return 0;
1227 }
1228 minfree -= avail;
1229 while ((wl->wl_error_count == 0) &&
1230 (wl->wl_reclaimable_bytes < minfree)) {
1231 WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
1232 ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd "
1233 "minfree=%zd\n",
1234 &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes,
1235 minfree));
1236
1237 cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx);
1238 }
1239 if (wl->wl_reclaimable_bytes < minfree) {
1240 KASSERT(wl->wl_error_count);
1241 /* XXX maybe get actual error from buffer instead someday? */
1242 error = EIO;
1243 }
1244 head = wl->wl_head;
1245 tail = wl->wl_tail;
1246 delta = wl->wl_reclaimable_bytes;
1247
1248 /* If all of of the entries are flushed, then be sure to keep
1249 * the reserved bytes reserved. Watch out for discarded transactions,
1250 * which could leave more bytes reserved than are reclaimable.
1251 */
1252 if (SIMPLEQ_EMPTY(&wl->wl_entries) &&
1253 (delta >= wl->wl_reserved_bytes)) {
1254 delta -= wl->wl_reserved_bytes;
1255 }
1256 wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head,
1257 &tail);
1258 KDASSERT(wl->wl_reserved_bytes <=
1259 wapbl_space_used(wl->wl_circ_size, head, tail));
1260 mutex_exit(&wl->wl_mtx);
1261
1262 if (error)
1263 return error;
1264
1265 if (waitonly)
1266 return 0;
1267
1268 /*
1269 * This is where head, tail and delta are unprotected
1270 * from races against itself or flush. This is ok since
1271 * we only call this routine from inside flush itself.
1272 *
1273 * XXX: how can it race against itself when accessed only
1274 * from behind the write-locked rwlock?
1275 */
1276 error = wapbl_write_commit(wl, head, tail);
1277 if (error)
1278 return error;
1279
1280 wl->wl_head = head;
1281 wl->wl_tail = tail;
1282
1283 mutex_enter(&wl->wl_mtx);
1284 KASSERT(wl->wl_reclaimable_bytes >= delta);
1285 wl->wl_reclaimable_bytes -= delta;
1286 mutex_exit(&wl->wl_mtx);
1287 WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
1288 ("wapbl_truncate thread %d.%d truncating %zu bytes\n",
1289 curproc->p_pid, curlwp->l_lid, delta));
1290
1291 return 0;
1292 }
1293
1294 /****************************************************************/
1295
/*
 * wapbl_biodone: I/O completion callback for buffers that belong to a
 * committed transaction (installed as b_iodone by wapbl_flush).
 * Decrements the owning wapbl_entry's outstanding-buffer count, records
 * write errors into the log state, and, when the oldest entries become
 * fully synced, reclaims their log space and wakes waiters.
 */
void
wapbl_biodone(struct buf *bp)
{
	struct wapbl_entry *we = bp->b_private;
	struct wapbl *wl = we->we_wapbl;
#ifdef WAPBL_DEBUG_BUFBYTES
	const int bufsize = bp->b_bufsize;
#endif

	/*
	 * Handle possible flushing of buffers after log has been
	 * decomissioned.  (we->we_wapbl is NULL in that case; only
	 * the entry's own accounting remains to be unwound.)
	 */
	if (!wl) {
		KASSERT(we->we_bufcount > 0);
		we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
		KASSERT(we->we_unsynced_bufbytes >= bufsize);
		we->we_unsynced_bufbytes -= bufsize;
#endif

		if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
			KASSERT(we->we_unsynced_bufbytes == 0);
#endif
			pool_put(&wapbl_entry_pool, we);
		}

		brelse(bp, 0);
		return;
	}

#ifdef ohbother
	KDASSERT(bp->b_oflags & BO_DONE);
	KDASSERT(!(bp->b_oflags & BO_DELWRI));
	KDASSERT(bp->b_flags & B_ASYNC);
	KDASSERT(bp->b_cflags & BC_BUSY);
	KDASSERT(!(bp->b_flags & B_LOCKED));
	KDASSERT(!(bp->b_flags & B_READ));
	KDASSERT(!(bp->b_cflags & BC_INVAL));
	KDASSERT(!(bp->b_cflags & BC_NOCACHE));
#endif

	if (bp->b_error) {
#ifdef notyet /* Can't currently handle possible dirty buffer reuse */
		/*
		 * XXXpooka: interfaces not fully updated
		 * Note: this was not enabled in the original patch
		 * against netbsd4 either.  I don't know if comment
		 * above is true or not.
		 */

		/*
		 * If an error occurs, report the error and leave the
		 * buffer as a delayed write on the LRU queue.
		 * restarting the write would likely result in
		 * an error spinloop, so let it be done harmlessly
		 * by the syncer.
		 */
		bp->b_flags &= ~(B_DONE);
		simple_unlock(&bp->b_interlock);

		if (we->we_error == 0) {
			mutex_enter(&wl->wl_mtx);
			wl->wl_error_count++;
			mutex_exit(&wl->wl_mtx);
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
		we->we_error = bp->b_error;
		bp->b_error = 0;
		brelse(bp);
		return;
#else
		/* For now, just mark the log permanently errored out */

		mutex_enter(&wl->wl_mtx);
		if (wl->wl_error_count == 0) {
			wl->wl_error_count++;
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
		mutex_exit(&wl->wl_mtx);
#endif
	}

	/*
	 * Release the buffer here. wapbl_flush() may wait for the
	 * log to become empty and we better unbusy the buffer before
	 * wapbl_flush() returns.
	 */
	brelse(bp, 0);

	mutex_enter(&wl->wl_mtx);

	KASSERT(we->we_bufcount > 0);
	we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
	KASSERT(we->we_unsynced_bufbytes >= bufsize);
	we->we_unsynced_bufbytes -= bufsize;
	KASSERT(wl->wl_unsynced_bufbytes >= bufsize);
	wl->wl_unsynced_bufbytes -= bufsize;
#endif

	/*
	 * If the current transaction can be reclaimed, start
	 * at the beginning and reclaim any consecutive reclaimable
	 * transactions.  If we successfully reclaim anything,
	 * then wakeup anyone waiting for the reclaim.
	 */
	if (we->we_bufcount == 0) {
		size_t delta = 0;
		int errcnt = 0;
#ifdef WAPBL_DEBUG_BUFBYTES
		KDASSERT(we->we_unsynced_bufbytes == 0);
#endif
		/*
		 * clear any posted error, since the buffer it came from
		 * has successfully flushed by now
		 */
		while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) &&
		       (we->we_bufcount == 0)) {
			delta += we->we_reclaimable_bytes;
			if (we->we_error)
				errcnt++;
			SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
			pool_put(&wapbl_entry_pool, we);
		}

		if (delta) {
			wl->wl_reclaimable_bytes += delta;
			KASSERT(wl->wl_error_count >= errcnt);
			wl->wl_error_count -= errcnt;
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
	}

	mutex_exit(&wl->wl_mtx);
}
1433
1434 /*
1435 * Write transactions to disk + start I/O for contents
1436 */
1437 int
1438 wapbl_flush(struct wapbl *wl, int waitfor)
1439 {
1440 struct buf *bp;
1441 struct wapbl_entry *we;
1442 off_t off;
1443 off_t head;
1444 off_t tail;
1445 size_t delta = 0;
1446 size_t flushsize;
1447 size_t reserved;
1448 int error = 0;
1449
1450 /*
1451 * Do a quick check to see if a full flush can be skipped
1452 * This assumes that the flush callback does not need to be called
1453 * unless there are other outstanding bufs.
1454 */
1455 if (!waitfor) {
1456 size_t nbufs;
1457 mutex_enter(&wl->wl_mtx); /* XXX need mutex here to
1458 protect the KASSERTS */
1459 nbufs = wl->wl_bufcount;
1460 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
1461 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
1462 mutex_exit(&wl->wl_mtx);
1463 if (nbufs == 0)
1464 return 0;
1465 }
1466
1467 /*
1468 * XXX we may consider using LK_UPGRADE here
1469 * if we want to call flush from inside a transaction
1470 */
1471 rw_enter(&wl->wl_rwlock, RW_WRITER);
1472 wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
1473 wl->wl_dealloccnt);
1474
1475 /*
1476 * Now that we are fully locked and flushed,
1477 * do another check for nothing to do.
1478 */
1479 if (wl->wl_bufcount == 0) {
1480 goto out;
1481 }
1482
1483 #if 0
1484 WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1485 ("wapbl_flush thread %d.%d flushing entries with "
1486 "bufcount=%zu bufbytes=%zu\n",
1487 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
1488 wl->wl_bufbytes));
1489 #endif
1490
1491 /* Calculate amount of space needed to flush */
1492 flushsize = wapbl_transaction_len(wl);
1493 if (wapbl_verbose_commit) {
1494 struct timespec ts;
1495 getnanotime(&ts);
1496 printf("%s: %lld.%09ld this transaction = %zu bytes\n",
1497 __func__, (long long)ts.tv_sec,
1498 (long)ts.tv_nsec, flushsize);
1499 }
1500
1501 if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
1502 /*
1503 * XXX this could be handled more gracefully, perhaps place
1504 * only a partial transaction in the log and allow the
1505 * remaining to flush without the protection of the journal.
1506 */
1507 panic("wapbl_flush: current transaction too big to flush\n");
1508 }
1509
1510 error = wapbl_truncate(wl, flushsize, 0);
1511 if (error)
1512 goto out2;
1513
1514 off = wl->wl_head;
1515 KASSERT((off == 0) || ((off >= wl->wl_circ_off) &&
1516 (off < wl->wl_circ_off + wl->wl_circ_size)));
1517 error = wapbl_write_blocks(wl, &off);
1518 if (error)
1519 goto out2;
1520 error = wapbl_write_revocations(wl, &off);
1521 if (error)
1522 goto out2;
1523 error = wapbl_write_inodes(wl, &off);
1524 if (error)
1525 goto out2;
1526
1527 reserved = 0;
1528 if (wl->wl_inohashcnt)
1529 reserved = wapbl_transaction_inodes_len(wl);
1530
1531 head = wl->wl_head;
1532 tail = wl->wl_tail;
1533
1534 wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize,
1535 &head, &tail);
1536 #ifdef WAPBL_DEBUG
1537 if (head != off) {
1538 panic("lost head! head=%"PRIdMAX" tail=%" PRIdMAX
1539 " off=%"PRIdMAX" flush=%zu\n",
1540 (intmax_t)head, (intmax_t)tail, (intmax_t)off,
1541 flushsize);
1542 }
1543 #else
1544 KASSERT(head == off);
1545 #endif
1546
1547 /* Opportunistically move the tail forward if we can */
1548 if (!wapbl_lazy_truncate) {
1549 mutex_enter(&wl->wl_mtx);
1550 delta = wl->wl_reclaimable_bytes;
1551 mutex_exit(&wl->wl_mtx);
1552 wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta,
1553 &head, &tail);
1554 }
1555
1556 error = wapbl_write_commit(wl, head, tail);
1557 if (error)
1558 goto out2;
1559
1560 we = pool_get(&wapbl_entry_pool, PR_WAITOK);
1561
1562 #ifdef WAPBL_DEBUG_BUFBYTES
1563 WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1564 ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
1565 " unsynced=%zu"
1566 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
1567 "inodes=%d\n",
1568 curproc->p_pid, curlwp->l_lid, flushsize, delta,
1569 wapbl_space_used(wl->wl_circ_size, head, tail),
1570 wl->wl_unsynced_bufbytes, wl->wl_bufcount,
1571 wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt,
1572 wl->wl_inohashcnt));
1573 #else
1574 WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1575 ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
1576 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
1577 "inodes=%d\n",
1578 curproc->p_pid, curlwp->l_lid, flushsize, delta,
1579 wapbl_space_used(wl->wl_circ_size, head, tail),
1580 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
1581 wl->wl_dealloccnt, wl->wl_inohashcnt));
1582 #endif
1583
1584
1585 mutex_enter(&bufcache_lock);
1586 mutex_enter(&wl->wl_mtx);
1587
1588 wl->wl_reserved_bytes = reserved;
1589 wl->wl_head = head;
1590 wl->wl_tail = tail;
1591 KASSERT(wl->wl_reclaimable_bytes >= delta);
1592 wl->wl_reclaimable_bytes -= delta;
1593 wl->wl_dealloccnt = 0;
1594 #ifdef WAPBL_DEBUG_BUFBYTES
1595 wl->wl_unsynced_bufbytes += wl->wl_bufbytes;
1596 #endif
1597
1598 we->we_wapbl = wl;
1599 we->we_bufcount = wl->wl_bufcount;
1600 #ifdef WAPBL_DEBUG_BUFBYTES
1601 we->we_unsynced_bufbytes = wl->wl_bufbytes;
1602 #endif
1603 we->we_reclaimable_bytes = flushsize;
1604 we->we_error = 0;
1605 SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries);
1606
1607 /*
1608 * this flushes bufs in reverse order than they were queued
1609 * it shouldn't matter, but if we care we could use TAILQ instead.
1610 * XXX Note they will get put on the lru queue when they flush
1611 * so we might actually want to change this to preserve order.
1612 */
1613 while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
1614 if (bbusy(bp, 0, 0, &wl->wl_mtx)) {
1615 continue;
1616 }
1617 bp->b_iodone = wapbl_biodone;
1618 bp->b_private = we;
1619 bremfree(bp);
1620 wapbl_remove_buf_locked(wl, bp);
1621 mutex_exit(&wl->wl_mtx);
1622 mutex_exit(&bufcache_lock);
1623 bawrite(bp);
1624 mutex_enter(&bufcache_lock);
1625 mutex_enter(&wl->wl_mtx);
1626 }
1627 mutex_exit(&wl->wl_mtx);
1628 mutex_exit(&bufcache_lock);
1629
1630 #if 0
1631 WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1632 ("wapbl_flush thread %d.%d done flushing entries...\n",
1633 curproc->p_pid, curlwp->l_lid));
1634 #endif
1635
1636 out:
1637
1638 /*
1639 * If the waitfor flag is set, don't return until everything is
1640 * fully flushed and the on disk log is empty.
1641 */
1642 if (waitfor) {
1643 error = wapbl_truncate(wl, wl->wl_circ_size -
1644 wl->wl_reserved_bytes, wapbl_lazy_truncate);
1645 }
1646
1647 out2:
1648 if (error) {
1649 wl->wl_flush_abort(wl->wl_mount, wl->wl_deallocblks,
1650 wl->wl_dealloclens, wl->wl_dealloccnt);
1651 }
1652
1653 #ifdef WAPBL_DEBUG_PRINT
1654 if (error) {
1655 pid_t pid = -1;
1656 lwpid_t lid = -1;
1657 if (curproc)
1658 pid = curproc->p_pid;
1659 if (curlwp)
1660 lid = curlwp->l_lid;
1661 mutex_enter(&wl->wl_mtx);
1662 #ifdef WAPBL_DEBUG_BUFBYTES
1663 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1664 ("wapbl_flush: thread %d.%d aborted flush: "
1665 "error = %d\n"
1666 "\tbufcount=%zu bufbytes=%zu bcount=%zu "
1667 "deallocs=%d inodes=%d\n"
1668 "\terrcnt = %d, reclaimable=%zu reserved=%zu "
1669 "unsynced=%zu\n",
1670 pid, lid, error, wl->wl_bufcount,
1671 wl->wl_bufbytes, wl->wl_bcount,
1672 wl->wl_dealloccnt, wl->wl_inohashcnt,
1673 wl->wl_error_count, wl->wl_reclaimable_bytes,
1674 wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes));
1675 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
1676 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1677 ("\tentry: bufcount = %zu, reclaimable = %zu, "
1678 "error = %d, unsynced = %zu\n",
1679 we->we_bufcount, we->we_reclaimable_bytes,
1680 we->we_error, we->we_unsynced_bufbytes));
1681 }
1682 #else
1683 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1684 ("wapbl_flush: thread %d.%d aborted flush: "
1685 "error = %d\n"
1686 "\tbufcount=%zu bufbytes=%zu bcount=%zu "
1687 "deallocs=%d inodes=%d\n"
1688 "\terrcnt = %d, reclaimable=%zu reserved=%zu\n",
1689 pid, lid, error, wl->wl_bufcount,
1690 wl->wl_bufbytes, wl->wl_bcount,
1691 wl->wl_dealloccnt, wl->wl_inohashcnt,
1692 wl->wl_error_count, wl->wl_reclaimable_bytes,
1693 wl->wl_reserved_bytes));
1694 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
1695 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1696 ("\tentry: bufcount = %zu, reclaimable = %zu, "
1697 "error = %d\n", we->we_bufcount,
1698 we->we_reclaimable_bytes, we->we_error));
1699 }
1700 #endif
1701 mutex_exit(&wl->wl_mtx);
1702 }
1703 #endif
1704
1705 rw_exit(&wl->wl_rwlock);
1706 return error;
1707 }
1708
1709 /****************************************************************/
1710
/*
 * wapbl_jlock_assert: assert that the caller holds the journal lock
 * (in either read or write mode).
 */
void
wapbl_jlock_assert(struct wapbl *wl)
{

	KASSERT(rw_lock_held(&wl->wl_rwlock));
}
1717
/*
 * wapbl_junlock_assert: assert that the caller does not hold the
 * journal lock in write mode (a read hold is not detectable here).
 */
void
wapbl_junlock_assert(struct wapbl *wl)
{

	KASSERT(!rw_write_held(&wl->wl_rwlock));
}
1724
1725 /****************************************************************/
1726
/*
 * wapbl_print: dump the state of a log via the given printf-style
 * callback (used by wapbl_dump/DDB).  With full set, also lists the
 * tracked buffers, pending deallocations and registered inodes.
 *
 * locks missing — reads wl fields without taking wl_mtx; intended
 * for debugger use only.
 */
void
wapbl_print(struct wapbl *wl,
		int full,
		void (*pr)(const char *, ...))
{
	struct buf *bp;
	struct wapbl_entry *we;
	(*pr)("wapbl %p", wl);
	(*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n",
	      wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn);
	(*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n",
	      wl->wl_circ_size, wl->wl_circ_off,
	      (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail);
	(*pr)("fs_dev_bshift = %d, log_dev_bshift = %d\n",
	      wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift);
#ifdef WAPBL_DEBUG_BUFBYTES
	(*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
	      "reserved = %zu errcnt = %d unsynced = %zu\n",
	      wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
	      wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
	      wl->wl_error_count, wl->wl_unsynced_bufbytes);
#else
	(*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
	      "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes,
	      wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
	      wl->wl_error_count);
#endif
	(*pr)("\tdealloccnt = %d, dealloclim = %d\n",
	      wl->wl_dealloccnt, wl->wl_dealloclim);
	(*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n",
	      wl->wl_inohashcnt, wl->wl_inohashmask);
	(*pr)("entries:\n");
	SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
#ifdef WAPBL_DEBUG_BUFBYTES
		(*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, "
		      "unsynced = %zu\n",
		      we->we_bufcount, we->we_reclaimable_bytes,
		      we->we_error, we->we_unsynced_bufbytes);
#else
		(*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n",
		      we->we_bufcount, we->we_reclaimable_bytes, we->we_error);
#endif
	}
	if (full) {
		int cnt = 0;
		(*pr)("bufs =");
		LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) {
			if (!LIST_NEXT(bp, b_wapbllist)) {
				(*pr)(" %p", bp);
			} else if ((++cnt % 6) == 0) {
				(*pr)(" %p,\n\t", bp);
			} else {
				(*pr)(" %p,", bp);
			}
		}
		(*pr)("\n");

		(*pr)("dealloced blks = ");
		{
			int i;
			cnt = 0;
			for (i = 0; i < wl->wl_dealloccnt; i++) {
				(*pr)(" %"PRId64":%d,",
				      wl->wl_deallocblks[i],
				      wl->wl_dealloclens[i]);
				if ((++cnt % 4) == 0) {
					(*pr)("\n\t");
				}
			}
		}
		(*pr)("\n");

		(*pr)("registered inodes = ");
		{
			int i;
			cnt = 0;
			for (i = 0; i <= wl->wl_inohashmask; i++) {
				struct wapbl_ino_head *wih;
				struct wapbl_ino *wi;

				wih = &wl->wl_inohash[i];
				LIST_FOREACH(wi, wih, wi_hash) {
					if (wi->wi_ino == 0)
						continue;
					(*pr)(" %"PRIu64"/0%06"PRIo32",",
					    wi->wi_ino, wi->wi_mode);
					if ((++cnt % 4) == 0) {
						(*pr)("\n\t");
					}
				}
			}
			(*pr)("\n");
		}
	}
}
1823
#if defined(WAPBL_DEBUG) || defined(DDB)
/*
 * wapbl_dump: print the full state of a log to the console.  Passing
 * NULL selects the debug log (wapbl_debug_wl) under WAPBL_DEBUG; if
 * no log can be resolved, this is a no-op.
 */
void
wapbl_dump(struct wapbl *wl)
{
	struct wapbl *target = wl;

#if defined(WAPBL_DEBUG)
	if (target == NULL)
		target = wapbl_debug_wl;
#endif
	if (target == NULL)
		return;
	wapbl_print(target, 1, printf);
}
#endif
1837
1838 /****************************************************************/
1839
/*
 * wapbl_register_deallocation: record a pending block deallocation
 * (blk, len) so it can be written to the log as a revocation record.
 * Panics if the per-log deallocation table is full.
 *
 * Caller must hold the journal lock.
 */
void
wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len)
{

	wapbl_jlock_assert(wl);

	mutex_enter(&wl->wl_mtx);
	/* XXX should eventually instead tie this into resource estimation */
	/*
	 * XXX this panic needs locking/mutex analysis and the
	 * ability to cope with the failure.
	 */
	/* XXX this XXX doesn't have enough XXX */
	if (__predict_false(wl->wl_dealloccnt >= wl->wl_dealloclim))
		panic("wapbl_register_deallocation: out of resources");

	wl->wl_deallocblks[wl->wl_dealloccnt] = blk;
	wl->wl_dealloclens[wl->wl_dealloccnt] = len;
	wl->wl_dealloccnt++;
	WAPBL_PRINTF(WAPBL_PRINT_ALLOC,
	    ("wapbl_register_deallocation: blk=%"PRId64" len=%d\n", blk, len));
	mutex_exit(&wl->wl_mtx);
}
1863
1864 /****************************************************************/
1865
/*
 * wapbl_inodetrk_init: allocate the per-log hash table used to track
 * registered inodes.  The first caller system-wide also initializes
 * the shared wapbl_ino pool (reference-counted via
 * wapbl_ino_pool_refcount).
 */
static void
wapbl_inodetrk_init(struct wapbl *wl, u_int size)
{

	wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask);
	if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) {
		pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0,
		    "wapblinopl", &pool_allocator_nointr, IPL_NONE);
	}
}
1876
/*
 * wapbl_inodetrk_free: tear down the per-log inode hash table.  The
 * last log system-wide also destroys the shared wapbl_ino pool.
 * The table must already be empty (wl_inohashcnt == 0).
 */
static void
wapbl_inodetrk_free(struct wapbl *wl)
{

	/* XXX this KASSERT needs locking/mutex analysis */
	KASSERT(wl->wl_inohashcnt == 0);
	hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask);
	if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) {
		pool_destroy(&wapbl_ino_pool);
	}
}
1888
1889 static struct wapbl_ino *
1890 wapbl_inodetrk_get(struct wapbl *wl, ino_t ino)
1891 {
1892 struct wapbl_ino_head *wih;
1893 struct wapbl_ino *wi;
1894
1895 KASSERT(mutex_owned(&wl->wl_mtx));
1896
1897 wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
1898 LIST_FOREACH(wi, wih, wi_hash) {
1899 if (ino == wi->wi_ino)
1900 return wi;
1901 }
1902 return 0;
1903 }
1904
/*
 * wapbl_register_inode: record that inode ino (with mode) has log
 * protected changes pending, so it can be replayed after a crash.
 * Registering an already-registered inode is a no-op.  The pool entry
 * is allocated before taking wl_mtx because pool_get may sleep.
 */
void
wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode)
{
	struct wapbl_ino_head *wih;
	struct wapbl_ino *wi;

	wi = pool_get(&wapbl_ino_pool, PR_WAITOK);

	mutex_enter(&wl->wl_mtx);
	if (wapbl_inodetrk_get(wl, ino) == NULL) {
		wi->wi_ino = ino;
		wi->wi_mode = mode;
		wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
		LIST_INSERT_HEAD(wih, wi, wi_hash);
		wl->wl_inohashcnt++;
		WAPBL_PRINTF(WAPBL_PRINT_INODE,
		    ("wapbl_register_inode: ino=%"PRId64"\n", ino));
		mutex_exit(&wl->wl_mtx);
	} else {
		/* Already registered: return the unused pool entry. */
		mutex_exit(&wl->wl_mtx);
		pool_put(&wapbl_ino_pool, wi);
	}
}
1928
1929 void
1930 wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode)
1931 {
1932 struct wapbl_ino *wi;
1933
1934 mutex_enter(&wl->wl_mtx);
1935 wi = wapbl_inodetrk_get(wl, ino);
1936 if (wi) {
1937 WAPBL_PRINTF(WAPBL_PRINT_INODE,
1938 ("wapbl_unregister_inode: ino=%"PRId64"\n", ino));
1939 KASSERT(wl->wl_inohashcnt > 0);
1940 wl->wl_inohashcnt--;
1941 LIST_REMOVE(wi, wi_hash);
1942 mutex_exit(&wl->wl_mtx);
1943
1944 pool_put(&wapbl_ino_pool, wi);
1945 } else {
1946 mutex_exit(&wl->wl_mtx);
1947 }
1948 }
1949
1950 /****************************************************************/
1951
1952 static inline size_t
1953 wapbl_transaction_inodes_len(struct wapbl *wl)
1954 {
1955 int blocklen = 1<<wl->wl_log_dev_bshift;
1956 int iph;
1957
1958 /* Calculate number of inodes described in a inodelist header */
1959 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
1960 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
1961
1962 KASSERT(iph > 0);
1963
1964 return MAX(1, howmany(wl->wl_inohashcnt, iph)) * blocklen;
1965 }
1966
1967
1968 /* Calculate amount of space a transaction will take on disk */
1969 static size_t
1970 wapbl_transaction_len(struct wapbl *wl)
1971 {
1972 int blocklen = 1<<wl->wl_log_dev_bshift;
1973 size_t len;
1974 int bph;
1975
1976 /* Calculate number of blocks described in a blocklist header */
1977 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
1978 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
1979
1980 KASSERT(bph > 0);
1981
1982 len = wl->wl_bcount;
1983 len += howmany(wl->wl_bufcount, bph) * blocklen;
1984 len += howmany(wl->wl_dealloccnt, bph) * blocklen;
1985 len += wapbl_transaction_inodes_len(wl);
1986
1987 return len;
1988 }
1989
1990 /*
1991 * wapbl_cache_sync: issue DIOCCACHESYNC
1992 */
1993 static int
1994 wapbl_cache_sync(struct wapbl *wl, const char *msg)
1995 {
1996 const bool verbose = wapbl_verbose_commit >= 2;
1997 struct bintime start_time;
1998 int force = 1;
1999 int error;
2000
2001 if (!wapbl_flush_disk_cache) {
2002 return 0;
2003 }
2004 if (verbose) {
2005 bintime(&start_time);
2006 }
2007 error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force,
2008 FWRITE, FSCRED);
2009 if (error) {
2010 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
2011 ("wapbl_cache_sync: DIOCCACHESYNC on dev 0x%x "
2012 "returned %d\n", wl->wl_devvp->v_rdev, error));
2013 }
2014 if (verbose) {
2015 struct bintime d;
2016 struct timespec ts;
2017
2018 bintime(&d);
2019 bintime_sub(&d, &start_time);
2020 bintime2timespec(&d, &ts);
2021 printf("wapbl_cache_sync: %s: dev 0x%jx %ju.%09lu\n",
2022 msg, (uintmax_t)wl->wl_devvp->v_rdev,
2023 (uintmax_t)ts.tv_sec, ts.tv_nsec);
2024 }
2025 return error;
2026 }
2027
2028 /*
2029 * Perform commit operation
2030 *
2031 * Note that generation number incrementation needs to
2032 * be protected against racing with other invocations
2033 * of wapbl_write_commit. This is ok since this routine
2034 * is only invoked from wapbl_flush
2035 */
2036 static int
2037 wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail)
2038 {
2039 struct wapbl_wc_header *wc = wl->wl_wc_header;
2040 struct timespec ts;
2041 int error;
2042 daddr_t pbn;
2043
2044 error = wapbl_buffered_flush(wl);
2045 if (error)
2046 return error;
2047 /*
2048 * flush disk cache to ensure that blocks we've written are actually
2049 * written to the stable storage before the commit header.
2050 *
2051 * XXX Calc checksum here, instead we do this for now
2052 */
2053 wapbl_cache_sync(wl, "1");
2054
2055 wc->wc_head = head;
2056 wc->wc_tail = tail;
2057 wc->wc_checksum = 0;
2058 wc->wc_version = 1;
2059 getnanotime(&ts);
2060 wc->wc_time = ts.tv_sec;
2061 wc->wc_timensec = ts.tv_nsec;
2062
2063 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2064 ("wapbl_write_commit: head = %"PRIdMAX "tail = %"PRIdMAX"\n",
2065 (intmax_t)head, (intmax_t)tail));
2066
2067 /*
2068 * write the commit header.
2069 *
2070 * XXX if generation will rollover, then first zero
2071 * over second commit header before trying to write both headers.
2072 */
2073
2074 pbn = wl->wl_logpbn + (wc->wc_generation % 2);
2075 #ifdef _KERNEL
2076 pbn = btodb(pbn << wc->wc_log_dev_bshift);
2077 #endif
2078 error = wapbl_buffered_write(wc, wc->wc_len, wl, pbn);
2079 if (error)
2080 return error;
2081 error = wapbl_buffered_flush(wl);
2082 if (error)
2083 return error;
2084
2085 /*
2086 * flush disk cache to ensure that the commit header is actually
2087 * written before meta data blocks.
2088 */
2089 wapbl_cache_sync(wl, "2");
2090
2091 /*
2092 * If the generation number was zero, write it out a second time.
2093 * This handles initialization and generation number rollover
2094 */
2095 if (wc->wc_generation++ == 0) {
2096 error = wapbl_write_commit(wl, head, tail);
2097 /*
2098 * This panic should be able to be removed if we do the
2099 * zero'ing mentioned above, and we are certain to roll
2100 * back generation number on failure.
2101 */
2102 if (error)
2103 panic("wapbl_write_commit: error writing duplicate "
2104 "log header: %d\n", error);
2105 }
2106 return 0;
2107 }
2108
/*
 * wapbl_write_blocks: write the data of all tracked buffers into the
 * circular log at *offp, preceded by blocklist headers describing
 * their device block numbers and lengths.  Each header covers up to
 * bph buffers; the data run after a header is zero-padded up to a
 * log-device block boundary.  Returns new offset value via *offp.
 */
static int
wapbl_write_blocks(struct wapbl *wl, off_t *offp)
{
	struct wapbl_wc_blocklist *wc =
	    (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
	int blocklen = 1<<wl->wl_log_dev_bshift;
	int bph;
	struct buf *bp;
	off_t off = *offp;
	int error;
	size_t padding;

	KASSERT(rw_write_held(&wl->wl_rwlock));

	/* Number of block records that fit in one blocklist block. */
	bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
	    sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);

	bp = LIST_FIRST(&wl->wl_bufs);

	while (bp) {
		int cnt;
		struct buf *obp = bp;	/* remember header's first buffer */

		KASSERT(bp->b_flags & B_LOCKED);

		wc->wc_type = WAPBL_WC_BLOCKS;
		wc->wc_len = blocklen;
		wc->wc_blkcount = 0;
		/* Fill one header with up to bph buffer descriptions. */
		while (bp && (wc->wc_blkcount < bph)) {
			/*
			 * Make sure all the physical block numbers are up to
			 * date.  If this is not always true on a given
			 * filesystem, then VOP_BMAP must be called.  We
			 * could call VOP_BMAP here, or else in the filesystem
			 * specific flush callback, although neither of those
			 * solutions allow us to take the vnode lock.  If a
			 * filesystem requires that we must take the vnode lock
			 * to call VOP_BMAP, then we can probably do it in
			 * bwrite when the vnode lock should already be held
			 * by the invoking code.
			 */
			KASSERT((bp->b_vp->v_type == VBLK) ||
				 (bp->b_blkno != bp->b_lblkno));
			KASSERT(bp->b_blkno > 0);

			wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno;
			wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount;
			wc->wc_len += bp->b_bcount;
			wc->wc_blkcount++;
			bp = LIST_NEXT(bp, b_wapbllist);
		}
		/* Round the record length up to a log-device block. */
		if (wc->wc_len % blocklen != 0) {
			padding = blocklen - wc->wc_len % blocklen;
			wc->wc_len += padding;
		} else {
			padding = 0;
		}

		WAPBL_PRINTF(WAPBL_PRINT_WRITE,
		    ("wapbl_write_blocks: len = %u (padding %zu) off = %"PRIdMAX"\n",
		    wc->wc_len, padding, (intmax_t)off));

		error = wapbl_circ_write(wl, wc, blocklen, &off);
		if (error)
			return error;
		/* Now write the data of the buffers just described. */
		bp = obp;
		cnt = 0;
		while (bp && (cnt++ < bph)) {
			error = wapbl_circ_write(wl, bp->b_data,
			    bp->b_bcount, &off);
			if (error)
				return error;
			bp = LIST_NEXT(bp, b_wapbllist);
		}
		if (padding) {
			void *zero;

			zero = wapbl_alloc(padding);
			memset(zero, 0, padding);
			error = wapbl_circ_write(wl, zero, padding, &off);
			wapbl_free(zero, padding);
			if (error)
				return error;
		}
	}
	*offp = off;
	return 0;
}
2198
2199 static int
2200 wapbl_write_revocations(struct wapbl *wl, off_t *offp)
2201 {
2202 struct wapbl_wc_blocklist *wc =
2203 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
2204 int i;
2205 int blocklen = 1<<wl->wl_log_dev_bshift;
2206 int bph;
2207 off_t off = *offp;
2208 int error;
2209
2210 if (wl->wl_dealloccnt == 0)
2211 return 0;
2212
2213 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
2214 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
2215
2216 i = 0;
2217 while (i < wl->wl_dealloccnt) {
2218 wc->wc_type = WAPBL_WC_REVOCATIONS;
2219 wc->wc_len = blocklen;
2220 wc->wc_blkcount = 0;
2221 while ((i < wl->wl_dealloccnt) && (wc->wc_blkcount < bph)) {
2222 wc->wc_blocks[wc->wc_blkcount].wc_daddr =
2223 wl->wl_deallocblks[i];
2224 wc->wc_blocks[wc->wc_blkcount].wc_dlen =
2225 wl->wl_dealloclens[i];
2226 wc->wc_blkcount++;
2227 i++;
2228 }
2229 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2230 ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n",
2231 wc->wc_len, (intmax_t)off));
2232 error = wapbl_circ_write(wl, wc, blocklen, &off);
2233 if (error)
2234 return error;
2235 }
2236 *offp = off;
2237 return 0;
2238 }
2239
/*
 * wapbl_write_inodes: dump the registered inode table (wl_inohash)
 * to the log as one or more WAPBL_WC_INODES records.  The first
 * record carries wc_clear, telling replay to discard any previously
 * accumulated inode list.  *offp is advanced past the records.
 */
static int
wapbl_write_inodes(struct wapbl *wl, off_t *offp)
{
	struct wapbl_wc_inodelist *wc =
	    (struct wapbl_wc_inodelist *)wl->wl_wc_scratch;
	int i;			/* inodes emitted so far */
	int blocklen = 1 << wl->wl_log_dev_bshift;
	off_t off = *offp;
	int error;

	struct wapbl_ino_head *wih;	/* current hash bucket cursor */
	struct wapbl_ino *wi;		/* current entry within bucket */
	int iph;			/* inode entries per record */

	iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
	    sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);

	i = 0;
	wih = &wl->wl_inohash[0];
	wi = 0;
	/*
	 * Always emit at least one record (even if the table is empty)
	 * so wc_clear is written and stale replay state is discarded.
	 */
	do {
		wc->wc_type = WAPBL_WC_INODES;
		wc->wc_len = blocklen;
		wc->wc_inocnt = 0;
		wc->wc_clear = (i == 0);
		while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) {
			/* Skip empty buckets until we find the next entry. */
			while (!wi) {
				KASSERT((wih - &wl->wl_inohash[0])
				    <= wl->wl_inohashmask);
				wi = LIST_FIRST(wih++);
			}
			wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino;
			wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode;
			wc->wc_inocnt++;
			i++;
			wi = LIST_NEXT(wi, wi_hash);
		}
		WAPBL_PRINTF(WAPBL_PRINT_WRITE,
		    ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n",
		    wc->wc_len, (intmax_t)off));
		error = wapbl_circ_write(wl, wc, blocklen, &off);
		if (error)
			return error;
	} while (i < wl->wl_inohashcnt);

	*offp = off;
	return 0;
}
2288
2289 #endif /* _KERNEL */
2290
2291 /****************************************************************/
2292
/*
 * wapbl_blk: replay hash table entry mapping a device block number to
 * the offset of its newest journalled copy in the circular log.
 */
struct wapbl_blk {
	LIST_ENTRY(wapbl_blk) wb_hash;	/* bucket linkage */
	daddr_t wb_blk;			/* device block number (hash key) */
	off_t wb_off; /* Offset of this block in the log */
};
#define WAPBL_BLKPOOL_MIN 83	/* minimum hash size requested from hashinit */
2299
2300 static void
2301 wapbl_blkhash_init(struct wapbl_replay *wr, u_int size)
2302 {
2303 if (size < WAPBL_BLKPOOL_MIN)
2304 size = WAPBL_BLKPOOL_MIN;
2305 KASSERT(wr->wr_blkhash == 0);
2306 #ifdef _KERNEL
2307 wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask);
2308 #else /* ! _KERNEL */
2309 /* Manually implement hashinit */
2310 {
2311 unsigned long i, hashsize;
2312 for (hashsize = 1; hashsize < size; hashsize <<= 1)
2313 continue;
2314 wr->wr_blkhash = wapbl_alloc(hashsize * sizeof(*wr->wr_blkhash));
2315 for (i = 0; i < hashsize; i++)
2316 LIST_INIT(&wr->wr_blkhash[i]);
2317 wr->wr_blkhashmask = hashsize - 1;
2318 }
2319 #endif /* ! _KERNEL */
2320 }
2321
/*
 * Free the replay block hash table.  The table must already be empty
 * (see wapbl_blkhash_clear()).
 */
static void
wapbl_blkhash_free(struct wapbl_replay *wr)
{
	KASSERT(wr->wr_blkhashcnt == 0);
#ifdef _KERNEL
	hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask);
#else /* ! _KERNEL */
	wapbl_free(wr->wr_blkhash,
	    (wr->wr_blkhashmask + 1) * sizeof(*wr->wr_blkhash));
#endif /* ! _KERNEL */
}
2333
2334 static struct wapbl_blk *
2335 wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk)
2336 {
2337 struct wapbl_blk_head *wbh;
2338 struct wapbl_blk *wb;
2339 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2340 LIST_FOREACH(wb, wbh, wb_hash) {
2341 if (blk == wb->wb_blk)
2342 return wb;
2343 }
2344 return 0;
2345 }
2346
2347 static void
2348 wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off)
2349 {
2350 struct wapbl_blk_head *wbh;
2351 struct wapbl_blk *wb;
2352 wb = wapbl_blkhash_get(wr, blk);
2353 if (wb) {
2354 KASSERT(wb->wb_blk == blk);
2355 wb->wb_off = off;
2356 } else {
2357 wb = wapbl_alloc(sizeof(*wb));
2358 wb->wb_blk = blk;
2359 wb->wb_off = off;
2360 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2361 LIST_INSERT_HEAD(wbh, wb, wb_hash);
2362 wr->wr_blkhashcnt++;
2363 }
2364 }
2365
2366 static void
2367 wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk)
2368 {
2369 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2370 if (wb) {
2371 KASSERT(wr->wr_blkhashcnt > 0);
2372 wr->wr_blkhashcnt--;
2373 LIST_REMOVE(wb, wb_hash);
2374 wapbl_free(wb, sizeof(*wb));
2375 }
2376 }
2377
2378 static void
2379 wapbl_blkhash_clear(struct wapbl_replay *wr)
2380 {
2381 unsigned long i;
2382 for (i = 0; i <= wr->wr_blkhashmask; i++) {
2383 struct wapbl_blk *wb;
2384
2385 while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) {
2386 KASSERT(wr->wr_blkhashcnt > 0);
2387 wr->wr_blkhashcnt--;
2388 LIST_REMOVE(wb, wb_hash);
2389 wapbl_free(wb, sizeof(*wb));
2390 }
2391 }
2392 KASSERT(wr->wr_blkhashcnt == 0);
2393 }
2394
2395 /****************************************************************/
2396
/*
 * wapbl_circ_read: read "len" bytes of the circular log into "data",
 * starting at *offp and wrapping from the end of the log region back
 * to wr_circ_off when necessary.  On success *offp is advanced past
 * the bytes read (wrapped back to wr_circ_off if it hit the end).
 */
static int
wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp)
{
	size_t slen;
	off_t off = *offp;
	int error;
	daddr_t pbn;

	/* len must be a whole number of log-device blocks. */
	KASSERT(((len >> wr->wr_log_dev_bshift) <<
	    wr->wr_log_dev_bshift) == len);

	if (off < wr->wr_circ_off)
		off = wr->wr_circ_off;
	slen = wr->wr_circ_off + wr->wr_circ_size - off;	/* bytes to end */
	if (slen < len) {
		/* Wrap: read the tail of the region first... */
		pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
#ifdef _KERNEL
		/* convert to DEV_BSIZE units for wapbl_read in the kernel */
		pbn = btodb(pbn << wr->wr_log_dev_bshift);
#endif
		error = wapbl_read(data, slen, wr->wr_devvp, pbn);
		if (error)
			return error;
		data = (uint8_t *)data + slen;
		len -= slen;
		off = wr->wr_circ_off;
	}
	/* ...then the remainder from the start of the region. */
	pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
#ifdef _KERNEL
	pbn = btodb(pbn << wr->wr_log_dev_bshift);
#endif
	error = wapbl_read(data, len, wr->wr_devvp, pbn);
	if (error)
		return error;
	off += len;
	if (off >= wr->wr_circ_off + wr->wr_circ_size)
		off = wr->wr_circ_off;
	*offp = off;
	return 0;
}
2436
2437 static void
2438 wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp)
2439 {
2440 size_t slen;
2441 off_t off = *offp;
2442
2443 KASSERT(((len >> wr->wr_log_dev_bshift) <<
2444 wr->wr_log_dev_bshift) == len);
2445
2446 if (off < wr->wr_circ_off)
2447 off = wr->wr_circ_off;
2448 slen = wr->wr_circ_off + wr->wr_circ_size - off;
2449 if (slen < len) {
2450 len -= slen;
2451 off = wr->wr_circ_off;
2452 }
2453 off += len;
2454 if (off >= wr->wr_circ_off + wr->wr_circ_size)
2455 off = wr->wr_circ_off;
2456 *offp = off;
2457 }
2458
2459 /****************************************************************/
2460
/*
 * wapbl_replay_start: read and validate the log header found at block
 * "off" of vp, then scan the log and build the in-memory replay state
 * (block hash + saved inode list).  On success *wrp holds the new
 * replay handle; the caller eventually calls wapbl_replay_stop() and
 * wapbl_replay_free().
 *
 * => off/count describe the log extent in units of "blksize" bytes.
 * => returns 0, EINVAL (bad geometry), EFTYPE (bad magic/corrupt
 *    records), or an I/O error.
 */
int
wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp,
	daddr_t off, size_t count, size_t blksize)
{
	struct wapbl_replay *wr;
	int error;
	struct vnode *devvp;
	daddr_t logpbn;
	uint8_t *scratch;
	struct wapbl_wc_header *wch;
	struct wapbl_wc_header *wch2;
	/* Use this until we read the actual log header */
	int log_dev_bshift = ilog2(blksize);
	size_t used;
	daddr_t pbn;

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n",
	    vp, off, count, blksize));

	if (off < 0)
		return EINVAL;

	/* The log block size must be a positive multiple of DEV_BSIZE. */
	if (blksize < DEV_BSIZE)
		return EINVAL;
	if (blksize % DEV_BSIZE)
		return EINVAL;

#ifdef _KERNEL
#if 0
	/* XXX vp->v_size isn't reliably set for VBLK devices,
	 * especially root.  However, we might still want to verify
	 * that the full load is readable */
	if ((off + count) * blksize > vp->v_size)
		return EINVAL;
#endif
	/* Map the file-relative block to the underlying device + pbn. */
	if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) {
		return error;
	}
#else /* ! _KERNEL */
	devvp = vp;
	logpbn = off;
#endif /* ! _KERNEL */

	scratch = wapbl_alloc(MAXBSIZE);

	pbn = logpbn;
#ifdef _KERNEL
	pbn = btodb(pbn << log_dev_bshift);
#endif
	/* Read both copies of the commit header in one I/O. */
	error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, pbn);
	if (error)
		goto errout;

	wch = (struct wapbl_wc_header *)scratch;
	wch2 =
	    (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift));
	/* XXX verify checksums and magic numbers */
	if (wch->wc_type != WAPBL_WC_HEADER) {
		printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type);
		error = EFTYPE;
		goto errout;
	}

	/* Prefer whichever duplicate header was committed last. */
	if (wch2->wc_generation > wch->wc_generation)
		wch = wch2;

	wr = wapbl_calloc(1, sizeof(*wr));

	wr->wr_logvp = vp;
	wr->wr_devvp = devvp;
	wr->wr_logpbn = logpbn;

	wr->wr_scratch = scratch;

	wr->wr_log_dev_bshift = wch->wc_log_dev_bshift;
	wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift;
	wr->wr_circ_off = wch->wc_circ_off;
	wr->wr_circ_size = wch->wc_circ_size;
	wr->wr_generation = wch->wc_generation;

	used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail);

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64
	    " len=%"PRId64" used=%zu\n",
	    wch->wc_head, wch->wc_tail, wch->wc_circ_off,
	    wch->wc_circ_size, used));

	/* Size the hash for roughly one entry per journalled fs block. */
	wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift));

	error = wapbl_replay_process(wr, wch->wc_head, wch->wc_tail);
	if (error) {
		/* wapbl_replay_stop() releases scratch and the hash. */
		wapbl_replay_stop(wr);
		wapbl_replay_free(wr);
		return error;
	}

	*wrp = wr;
	return 0;

errout:
	wapbl_free(scratch, MAXBSIZE);
	return error;
}
2566
2567 void
2568 wapbl_replay_stop(struct wapbl_replay *wr)
2569 {
2570
2571 if (!wapbl_replay_isopen(wr))
2572 return;
2573
2574 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n"));
2575
2576 wapbl_free(wr->wr_scratch, MAXBSIZE);
2577 wr->wr_scratch = NULL;
2578
2579 wr->wr_logvp = NULL;
2580
2581 wapbl_blkhash_clear(wr);
2582 wapbl_blkhash_free(wr);
2583 }
2584
/*
 * Release the replay handle and the saved inode list.  The replay
 * must already have been stopped (not open).
 */
void
wapbl_replay_free(struct wapbl_replay *wr)
{

	KDASSERT(!wapbl_replay_isopen(wr));

	if (wr->wr_inodes)
		wapbl_free(wr->wr_inodes,
		    wr->wr_inodescnt * sizeof(wr->wr_inodes[0]));
	wapbl_free(wr, sizeof(*wr));
}
2596
#ifdef _KERNEL
/*
 * Out-of-line wrapper for wapbl_replay_isopen() (defined in the
 * header), for callers that need an actual function symbol.
 */
int
wapbl_replay_isopen1(struct wapbl_replay *wr)
{

	return wapbl_replay_isopen(wr);
}
#endif
2605
2606 /*
2607 * calculate the disk address for the i'th block in the wc_blockblist
2608 * offset by j blocks of size blen.
2609 *
2610 * wc_daddr is always a kernel disk address in DEV_BSIZE units that
2611 * was written to the journal.
2612 *
2613 * The kernel needs that address plus the offset in DEV_BSIZE units.
2614 *
2615 * Userland needs that address plus the offset in blen units.
2616 *
2617 */
static daddr_t
wapbl_block_daddr(struct wapbl_wc_blocklist *wc, int i, int j, int blen)
{
	daddr_t pbn;

#ifdef _KERNEL
	/* Kernel: stay in DEV_BSIZE units; add the byte offset converted. */
	pbn = wc->wc_blocks[i].wc_daddr + btodb(j * blen);
#else
	/* Userland: convert the base address to blen units, then add j. */
	pbn = dbtob(wc->wc_blocks[i].wc_daddr) / blen + j;
#endif

	return pbn;
}
2631
2632 static void
2633 wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp)
2634 {
2635 struct wapbl_wc_blocklist *wc =
2636 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2637 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2638 int i, j, n;
2639
2640 for (i = 0; i < wc->wc_blkcount; i++) {
2641 /*
2642 * Enter each physical block into the hashtable independently.
2643 */
2644 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
2645 for (j = 0; j < n; j++) {
2646 wapbl_blkhash_ins(wr, wapbl_block_daddr(wc, i, j, fsblklen),
2647 *offp);
2648 wapbl_circ_advance(wr, fsblklen, offp);
2649 }
2650 }
2651 }
2652
2653 static void
2654 wapbl_replay_process_revocations(struct wapbl_replay *wr)
2655 {
2656 struct wapbl_wc_blocklist *wc =
2657 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2658 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2659 int i, j, n;
2660
2661 for (i = 0; i < wc->wc_blkcount; i++) {
2662 /*
2663 * Remove any blocks found from the hashtable.
2664 */
2665 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
2666 for (j = 0; j < n; j++)
2667 wapbl_blkhash_rem(wr, wapbl_block_daddr(wc, i, j, fsblklen));
2668 }
2669 }
2670
/*
 * Process a WAPBL_WC_INODES record (in wr_scratch): append its inode
 * entries to wr->wr_inodes.  oldoff/newoff are the record's start/end
 * log offsets, remembered (wr_inodestail/wr_inodeshead) so this part
 * of the log is not reclaimed while the inode list is still needed.
 */
static void
wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff, off_t newoff)
{
	struct wapbl_wc_inodelist *wc =
	    (struct wapbl_wc_inodelist *)wr->wr_scratch;
	void *new_inodes;
	const size_t oldsize = wr->wr_inodescnt * sizeof(wr->wr_inodes[0]);

	KASSERT(sizeof(wr->wr_inodes[0]) == sizeof(wc->wc_inodes[0]));

	/*
	 * Keep track of where we found this so location won't be
	 * overwritten.
	 */
	if (wc->wc_clear) {
		/* This record restarts the list: drop what we had. */
		wr->wr_inodestail = oldoff;
		wr->wr_inodescnt = 0;
		if (wr->wr_inodes != NULL) {
			wapbl_free(wr->wr_inodes, oldsize);
			wr->wr_inodes = NULL;
		}
	}
	wr->wr_inodeshead = newoff;
	if (wc->wc_inocnt == 0)
		return;

	/* Grow the array by hand (alloc+copy+free) and append. */
	new_inodes = wapbl_alloc((wr->wr_inodescnt + wc->wc_inocnt) *
	    sizeof(wr->wr_inodes[0]));
	if (wr->wr_inodes != NULL) {
		memcpy(new_inodes, wr->wr_inodes, oldsize);
		wapbl_free(wr->wr_inodes, oldsize);
	}
	wr->wr_inodes = new_inodes;
	memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes,
	    wc->wc_inocnt * sizeof(wr->wr_inodes[0]));
	wr->wr_inodescnt += wc->wc_inocnt;
}
2708
/*
 * wapbl_replay_process: scan the log from tail to head, dispatching
 * each commit record to rebuild the replay block hash and the saved
 * inode list.  On any error the hash is cleared and EFTYPE (or the
 * read error) is returned.
 */
static int
wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail)
{
	off_t off;
	int error;

	int logblklen = 1 << wr->wr_log_dev_bshift;

	wapbl_blkhash_clear(wr);

	off = tail;
	while (off != head) {
		struct wapbl_wc_null *wcn;
		off_t saveoff = off;	/* start offset of this record */
		error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
		if (error)
			goto errout;
		wcn = (struct wapbl_wc_null *)wr->wr_scratch;
		switch (wcn->wc_type) {
		case WAPBL_WC_BLOCKS:
			/* Also advances off past the record's data blocks. */
			wapbl_replay_process_blocks(wr, &off);
			break;

		case WAPBL_WC_REVOCATIONS:
			wapbl_replay_process_revocations(wr);
			break;

		case WAPBL_WC_INODES:
			wapbl_replay_process_inodes(wr, saveoff, off);
			break;

		default:
			printf("Unrecognized wapbl type: 0x%08x\n",
			    wcn->wc_type);
			error = EFTYPE;
			goto errout;
		}
		/*
		 * Sanity check: advancing by the record's stated length
		 * must land exactly where processing left off.
		 */
		wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
		if (off != saveoff) {
			printf("wapbl_replay: corrupted records\n");
			error = EFTYPE;
			goto errout;
		}
	}
	return 0;

errout:
	wapbl_blkhash_clear(wr);
	return error;
}
2759
#if 0
/*
 * wapbl_replay_verify: compare every journalled block against the
 * current contents of the file system device and report mismatches.
 *
 * NOTE(review): this function is compiled out (#if 0) and has
 * bit-rotted — it references "wch", which is not declared in this
 * scope, so it would not compile if simply re-enabled.
 */
int
wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp)
{
	off_t off;
	int mismatchcnt = 0;
	int logblklen = 1 << wr->wr_log_dev_bshift;
	int fsblklen = 1 << wr->wr_fs_dev_bshift;
	void *scratch1 = wapbl_alloc(MAXBSIZE);
	void *scratch2 = wapbl_alloc(MAXBSIZE);
	int error = 0;

	KDASSERT(wapbl_replay_isopen(wr));

	off = wch->wc_tail;
	while (off != wch->wc_head) {
		struct wapbl_wc_null *wcn;
#ifdef DEBUG
		off_t saveoff = off;
#endif
		error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
		if (error)
			goto out;
		wcn = (struct wapbl_wc_null *)wr->wr_scratch;
		switch (wcn->wc_type) {
		case WAPBL_WC_BLOCKS:
			{
				struct wapbl_wc_blocklist *wc =
				    (struct wapbl_wc_blocklist *)wr->wr_scratch;
				int i;
				for (i = 0; i < wc->wc_blkcount; i++) {
					int foundcnt = 0;
					int dirtycnt = 0;
					int j, n;
					/*
					 * Check each physical block into the
					 * hashtable independently
					 */
					n = wc->wc_blocks[i].wc_dlen >>
					    wch->wc_fs_dev_bshift;
					for (j = 0; j < n; j++) {
						struct wapbl_blk *wb =
						   wapbl_blkhash_get(wr,
						   wapbl_block_daddr(wc, i, j, fsblklen));
						if (wb && (wb->wb_off == off)) {
							foundcnt++;
							error =
							    wapbl_circ_read(wr,
							    scratch1, fsblklen,
							    &off);
							if (error)
								goto out;
							error =
							    wapbl_read(scratch2,
							    fsblklen, fsdevvp,
							    wb->wb_blk);
							if (error)
								goto out;
							if (memcmp(scratch1,
								   scratch2,
								   fsblklen)) {
								printf(
		"wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n",
		wb->wb_blk, (intmax_t)off);
								dirtycnt++;
								mismatchcnt++;
							}
						} else {
							wapbl_circ_advance(wr,
							    fsblklen, &off);
						}
					}
#if 0
					/*
					 * If all of the blocks in an entry
					 * are clean, then remove all of its
					 * blocks from the hashtable since they
					 * never will need replay.
					 */
					if ((foundcnt != 0) &&
					    (dirtycnt == 0)) {
						off = saveoff;
						wapbl_circ_advance(wr,
						    logblklen, &off);
						for (j = 0; j < n; j++) {
							struct wapbl_blk *wb =
							   wapbl_blkhash_get(wr,
							   wapbl_block_daddr(wc, i, j, fsblklen));
							if (wb &&
							  (wb->wb_off == off)) {
								wapbl_blkhash_rem(wr, wb->wb_blk);
							}
							wapbl_circ_advance(wr,
							    fsblklen, &off);
						}
					}
#endif
				}
			}
			break;
		case WAPBL_WC_REVOCATIONS:
		case WAPBL_WC_INODES:
			break;
		default:
			KASSERT(0);
		}
#ifdef DEBUG
		wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
		KASSERT(off == saveoff);
#endif
	}
 out:
	wapbl_free(scratch1, MAXBSIZE);
	wapbl_free(scratch2, MAXBSIZE);
	if (!error && mismatchcnt)
		error = EFTYPE;
	return error;
}
#endif
2879
2880 int
2881 wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp)
2882 {
2883 struct wapbl_blk *wb;
2884 size_t i;
2885 off_t off;
2886 void *scratch;
2887 int error = 0;
2888 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2889
2890 KDASSERT(wapbl_replay_isopen(wr));
2891
2892 scratch = wapbl_alloc(MAXBSIZE);
2893
2894 for (i = 0; i <= wr->wr_blkhashmask; ++i) {
2895 LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) {
2896 off = wb->wb_off;
2897 error = wapbl_circ_read(wr, scratch, fsblklen, &off);
2898 if (error)
2899 break;
2900 error = wapbl_write(scratch, fsblklen, fsdevvp,
2901 wb->wb_blk);
2902 if (error)
2903 break;
2904 }
2905 }
2906
2907 wapbl_free(scratch, MAXBSIZE);
2908 return error;
2909 }
2910
2911 int
2912 wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len)
2913 {
2914 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2915
2916 KDASSERT(wapbl_replay_isopen(wr));
2917 KASSERT((len % fsblklen) == 0);
2918
2919 while (len != 0) {
2920 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2921 if (wb)
2922 return 1;
2923 len -= fsblklen;
2924 }
2925 return 0;
2926 }
2927
2928 int
2929 wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len)
2930 {
2931 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2932
2933 KDASSERT(wapbl_replay_isopen(wr));
2934
2935 KASSERT((len % fsblklen) == 0);
2936
2937 while (len != 0) {
2938 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2939 if (wb) {
2940 off_t off = wb->wb_off;
2941 int error;
2942 error = wapbl_circ_read(wr, data, fsblklen, &off);
2943 if (error)
2944 return error;
2945 }
2946 data = (uint8_t *)data + fsblklen;
2947 len -= fsblklen;
2948 blk++;
2949 }
2950 return 0;
2951 }
2952
#ifdef _KERNEL
/*
 * This is not really a module now, but maybe on its way to
 * being one some day.
 */
MODULE(MODULE_CLASS_VFS, wapbl, NULL);

/*
 * Module control entry point: initialize wapbl on load.  Unload is
 * not supported yet — wapbl_fini() is compiled out ("notyet").
 */
static int
wapbl_modcmd(modcmd_t cmd, void *arg)
{

	switch (cmd) {
	case MODULE_CMD_INIT:
		wapbl_init();
		return 0;
	case MODULE_CMD_FINI:
#ifdef notyet
		return wapbl_fini(true);
#endif
		return EOPNOTSUPP;
	default:
		return ENOTTY;
	}
}
#endif /* _KERNEL */
2978