/*	$NetBSD: vfs_wapbl.c,v 1.56 2013/09/14 13:19:50 joerg Exp $	*/

/*-
 * Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * This implements file system independent write ahead filesystem logging.
 */

#define WAPBL_INTERNAL

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.56 2013/09/14 13:19:50 joerg Exp $");

#include <sys/param.h>
#include <sys/bitops.h>

#ifdef _KERNEL
#include <sys/param.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/module.h>
#include <sys/resourcevar.h>
#include <sys/conf.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/kauth.h>
#include <sys/mutex.h>
#include <sys/atomic.h>
#include <sys/wapbl.h>
#include <sys/wapbl_replay.h>

#include <miscfs/specfs/specdev.h>

#define	wapbl_alloc(s) kmem_alloc((s), KM_SLEEP)
#define	wapbl_free(a, s) kmem_free((a), (s))
#define	wapbl_calloc(n, s) kmem_zalloc((n)*(s), KM_SLEEP)

static struct sysctllog *wapbl_sysctl;
static int wapbl_flush_disk_cache = 1;
static int wapbl_verbose_commit = 0;

#else /* !_KERNEL */
#include <assert.h>
#include <errno.h>
#include <stdio.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

#include <sys/time.h>
#include <sys/wapbl.h>
#include <sys/wapbl_replay.h>

#define	KDASSERT(x) assert(x)
#define	KASSERT(x) assert(x)
#define	wapbl_alloc(s) malloc(s)
#define	wapbl_free(a, s) free(a)
#define	wapbl_calloc(n, s) calloc((n), (s))

#endif /* !_KERNEL */

/*
 * INTERNAL DATA STRUCTURES
 */

/*
 * This structure holds per-mount log information.
 *
 * Legend:	a = atomic access only
 *		r = read-only after init
 *		l = rwlock held
 *		m = mutex held
 *		lm = rwlock held writing or mutex held
 *		u = unlocked access ok
 *		b = bufcache_lock held
 */
struct wapbl {
	struct vnode *wl_logvp;	/* r:	log here */
	struct vnode *wl_devvp;	/* r:	log on this device */
	struct mount *wl_mount;	/* r:	mountpoint wl is associated with */
	daddr_t wl_logpbn;	/* r:	Physical block number of start of log */
	int wl_log_dev_bshift;	/* r:	logarithm of device block size of log
				   device */
	int wl_fs_dev_bshift;	/* r:	logarithm of device block size of
				   filesystem device */

	unsigned wl_lock_count;	/* m:	Count of transactions in progress */

	size_t wl_circ_size;	/* r:	Number of bytes in buffer of log */
	size_t wl_circ_off;	/* r:	Number of bytes reserved at start */

	size_t wl_bufcount_max;	/* r:	Number of buffers reserved for log */
	size_t wl_bufbytes_max;	/* r:	Number of buf bytes reserved for log */

	off_t wl_head;		/* l:	Byte offset of log head */
	off_t wl_tail;		/* l:	Byte offset of log tail */
	/*
	 * head == tail == 0 means log is empty
	 * head == tail != 0 means log is full
	 * see assertions in wapbl_advance() for other boundary conditions.
	 * Only truncate moves the tail, except when flush sets it to
	 * wl_header_size.  Only flush moves the head, except when truncate
	 * sets it to 0.
	 */
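	/*
	 * Illustrative layout only (values invented for this comment):
	 * with wl_circ_off = 1024 and wl_circ_size = 8192 the circular
	 * log occupies byte offsets [1024, 9216), and the live region
	 * runs from tail to head, wrapping at the end:
	 *
	 *	1024 ....... tail ===used=== head ........ 9216
	 *
	 * See wapbl_space_used() and wapbl_advance() below for the
	 * precise arithmetic.
	 */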

	struct wapbl_wc_header *wl_wc_header;	/* l	*/
	void *wl_wc_scratch;	/* l:	scratch space (XXX: why?!?) */

	kmutex_t wl_mtx;	/* u:	short-term lock */
	krwlock_t wl_rwlock;	/* u:	File system transaction lock */

	/*
	 * Must be held while accessing
	 * wl_count or wl_bufs or head or tail
	 */

	/*
	 * Callback called from within the flush routine to flush any extra
	 * bits.  Note that flush may be skipped without calling this if
	 * there are no outstanding buffers in the transaction.
	 */
#ifdef _KERNEL
	wapbl_flush_fn_t wl_flush;	/* r	*/
	wapbl_flush_fn_t wl_flush_abort;/* r	*/
#endif

	size_t wl_bufbytes;	/* m:	Byte count of pages in wl_bufs */
	size_t wl_bufcount;	/* m:	Count of buffers in wl_bufs */
	size_t wl_bcount;	/* m:	Total bcount of wl_bufs */

	LIST_HEAD(, buf) wl_bufs; /* m:	Buffers in current transaction */

	kcondvar_t wl_reclaimable_cv;	/* m (obviously) */
	size_t wl_reclaimable_bytes; /* m:	Amount of space available for
					reclamation by truncate */
	int wl_error_count;	/* m:	# of wl_entries with errors */
	size_t wl_reserved_bytes; /* never truncate log smaller than this */

#ifdef WAPBL_DEBUG_BUFBYTES
	size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */
#endif

	daddr_t *wl_deallocblks;/* lm:	address of block */
	int *wl_dealloclens;	/* lm:	size of block */
	int wl_dealloccnt;	/* lm:	total count */
	int wl_dealloclim;	/* l:	max count */

	/* hashtable of inode numbers for allocated but unlinked inodes */
	/* synch ??? */
	LIST_HEAD(wapbl_ino_head, wapbl_ino) *wl_inohash;
	u_long wl_inohashmask;
	int wl_inohashcnt;

	SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction
						   accounting */

	u_char *wl_buffer;	/* l:	buffer for wapbl_buffered_write() */
	daddr_t wl_buffer_dblk;	/* l:	buffer disk block address */
	size_t wl_buffer_used;	/* l:	buffer current use */
};

#ifdef WAPBL_DEBUG_PRINT
int wapbl_debug_print = WAPBL_DEBUG_PRINT;
#endif

/****************************************************************/
#ifdef _KERNEL

#ifdef WAPBL_DEBUG
struct wapbl *wapbl_debug_wl;
#endif

static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail);
static int wapbl_write_blocks(struct wapbl *wl, off_t *offp);
static int wapbl_write_revocations(struct wapbl *wl, off_t *offp);
static int wapbl_write_inodes(struct wapbl *wl, off_t *offp);
#endif /* _KERNEL */

static int wapbl_replay_process(struct wapbl_replay *wr, off_t, off_t);

static inline size_t wapbl_space_used(size_t avail, off_t head,
	off_t tail);

#ifdef _KERNEL

static struct pool wapbl_entry_pool;

#define	WAPBL_INODETRK_SIZE 83
static int wapbl_ino_pool_refcount;
static struct pool wapbl_ino_pool;
struct wapbl_ino {
	LIST_ENTRY(wapbl_ino) wi_hash;
	ino_t wi_ino;
	mode_t wi_mode;
};

static void wapbl_inodetrk_init(struct wapbl *wl, u_int size);
static void wapbl_inodetrk_free(struct wapbl *wl);
static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino);

static size_t wapbl_transaction_len(struct wapbl *wl);
static inline size_t wapbl_transaction_inodes_len(struct wapbl *wl);

#if 0
int wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
#endif

static int wapbl_replay_isopen1(struct wapbl_replay *);

/*
 * This is useful for debugging.  If set, the log will
 * only be truncated when necessary.
 */
int wapbl_lazy_truncate = 0;

struct wapbl_ops wapbl_ops = {
	.wo_wapbl_discard	= wapbl_discard,
	.wo_wapbl_replay_isopen	= wapbl_replay_isopen1,
	.wo_wapbl_replay_can_read = wapbl_replay_can_read,
	.wo_wapbl_replay_read	= wapbl_replay_read,
	.wo_wapbl_add_buf	= wapbl_add_buf,
	.wo_wapbl_remove_buf	= wapbl_remove_buf,
	.wo_wapbl_resize_buf	= wapbl_resize_buf,
	.wo_wapbl_begin		= wapbl_begin,
	.wo_wapbl_end		= wapbl_end,
	.wo_wapbl_junlock_assert= wapbl_junlock_assert,

	/* XXX: the following is only used to say "this is a wapbl buf" */
	.wo_wapbl_biodone	= wapbl_biodone,
};

static int
wapbl_sysctl_init(void)
{
	int rv;
	const struct sysctlnode *rnode, *cnode;

	wapbl_sysctl = NULL;

	rv = sysctl_createv(&wapbl_sysctl, 0, NULL, &rnode,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "vfs", NULL,
		       NULL, 0, NULL, 0,
		       CTL_VFS, CTL_EOL);
	if (rv)
		return rv;

	rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &rnode,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "wapbl",
		       SYSCTL_DESCR("WAPBL journaling options"),
		       NULL, 0, NULL, 0,
		       CTL_CREATE, CTL_EOL);
	if (rv)
		return rv;

	rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "flush_disk_cache",
		       SYSCTL_DESCR("flush disk cache"),
		       NULL, 0, &wapbl_flush_disk_cache, 0,
		       CTL_CREATE, CTL_EOL);
	if (rv)
		return rv;

	rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "verbose_commit",
		       SYSCTL_DESCR("show time and size of wapbl log commits"),
		       NULL, 0, &wapbl_verbose_commit, 0,
		       CTL_CREATE, CTL_EOL);
	return rv;
}

static void
wapbl_init(void)
{

	pool_init(&wapbl_entry_pool, sizeof(struct wapbl_entry), 0, 0, 0,
	    "wapblentrypl", &pool_allocator_kmem, IPL_VM);

	wapbl_sysctl_init();
}

#ifdef notyet
static int
wapbl_fini(bool interface)
{

	if (wapbl_sysctl != NULL)
		sysctl_teardown(&wapbl_sysctl);

	pool_destroy(&wapbl_entry_pool);

	return 0;
}
#endif

static int
wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr)
{
	int error, i;

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_start: reusing log with %d inodes\n", wr->wr_inodescnt));

	/*
	 * It's only valid to reuse the replay log if it's
	 * the same as the new log we just opened.
	 */
	KDASSERT(!wapbl_replay_isopen(wr));
	KASSERT(wl->wl_devvp->v_type == VBLK);
	KASSERT(wr->wr_devvp->v_type == VBLK);
	KASSERT(wl->wl_devvp->v_rdev == wr->wr_devvp->v_rdev);
	KASSERT(wl->wl_logpbn == wr->wr_logpbn);
	KASSERT(wl->wl_circ_size == wr->wr_circ_size);
	KASSERT(wl->wl_circ_off == wr->wr_circ_off);
	KASSERT(wl->wl_log_dev_bshift == wr->wr_log_dev_bshift);
	KASSERT(wl->wl_fs_dev_bshift == wr->wr_fs_dev_bshift);

	wl->wl_wc_header->wc_generation = wr->wr_generation + 1;

	for (i = 0; i < wr->wr_inodescnt; i++)
		wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber,
		    wr->wr_inodes[i].wr_imode);

	/* Make sure new transaction won't overwrite old inodes list */
	KDASSERT(wapbl_transaction_len(wl) <=
	    wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead,
	    wr->wr_inodestail));

	wl->wl_head = wl->wl_tail = wr->wr_inodeshead;
	wl->wl_reclaimable_bytes = wl->wl_reserved_bytes =
	    wapbl_transaction_len(wl);

	error = wapbl_write_inodes(wl, &wl->wl_head);
	if (error)
		return error;

	KASSERT(wl->wl_head != wl->wl_tail);
	KASSERT(wl->wl_head != 0);

	return 0;
}

int
wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp,
	daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr,
	wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn)
{
	struct wapbl *wl;
	struct vnode *devvp;
	daddr_t logpbn;
	int error;
	int log_dev_bshift = ilog2(blksize);
	int fs_dev_bshift = log_dev_bshift;
	int run;

	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64
	    " count=%zu blksize=%zu\n", vp, off, count, blksize));

	if (log_dev_bshift > fs_dev_bshift) {
		WAPBL_PRINTF(WAPBL_PRINT_OPEN,
			("wapbl: log device's block size cannot be larger "
			 "than filesystem's\n"));
		/*
		 * Not currently implemented, although it could be if
		 * needed someday.
		 */
		return ENOSYS;
	}

	if (off < 0)
		return EINVAL;

	if (blksize < DEV_BSIZE)
		return EINVAL;
	if (blksize % DEV_BSIZE)
		return EINVAL;

	/* XXXTODO: verify that the full load is writable */

	/*
	 * XXX check for minimum log size
	 * minimum is governed by minimum amount of space
	 * to complete a transaction. (probably truncate)
	 */
	/* XXX for now pick something minimal */
	if ((count * blksize) < MAXPHYS) {
		return ENOSPC;
	}

	if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) {
		return error;
	}

	wl = wapbl_calloc(1, sizeof(*wl));
	rw_init(&wl->wl_rwlock);
	mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&wl->wl_reclaimable_cv, "wapblrec");
	LIST_INIT(&wl->wl_bufs);
	SIMPLEQ_INIT(&wl->wl_entries);

	wl->wl_logvp = vp;
	wl->wl_devvp = devvp;
	wl->wl_mount = mp;
	wl->wl_logpbn = logpbn;
	wl->wl_log_dev_bshift = log_dev_bshift;
	wl->wl_fs_dev_bshift = fs_dev_bshift;

	wl->wl_flush = flushfn;
	wl->wl_flush_abort = flushabortfn;

	/* Reserve two log device blocks for the commit headers */
	wl->wl_circ_off = 2<<wl->wl_log_dev_bshift;
	wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off);
	/* truncate the log usage to a multiple of 1 << log_dev_bshift */
	wl->wl_circ_size >>= wl->wl_log_dev_bshift;
	wl->wl_circ_size <<= wl->wl_log_dev_bshift;

	/*
	 * wl_bufbytes_max limits the size of the in memory transaction space.
	 * - Since buffers are allocated and accounted for in units of
	 *   PAGE_SIZE it is required to be a multiple of PAGE_SIZE
	 *   (i.e. 1<<PAGE_SHIFT)
	 * - Since the log device has to be written in units of
	 *   1<<wl_log_dev_bshift it is required to be a multiple of
	 *   1<<wl_log_dev_bshift.
	 * - Since the filesystem will provide data in units of
	 *   1<<wl_fs_dev_bshift, it is convenient to be a multiple of
	 *   1<<wl_fs_dev_bshift.
	 * Therefore it must be a multiple of the least common multiple of
	 * those three quantities.  Fortunately, all of those quantities are
	 * guaranteed to be a power of two, and the least common multiple of
	 * a set of numbers which are all powers of two is simply the maximum
	 * of those numbers.  Finally, the maximum logarithm of a power of two
	 * is the same as the log of the maximum power of two.  So we can do
	 * the following operations to size wl_bufbytes_max:
	 */
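	/*
	 * Worked example (values assumed for illustration only): with
	 * PAGE_SHIFT = 12, wl_log_dev_bshift = 9 and wl_fs_dev_bshift = 9,
	 * the least common multiple is max(4096, 512, 512) = 4096, so the
	 * shift pairs below just round wl_bufbytes_max down to a 4096-byte
	 * multiple, e.g. 1050000 -> 1048576.
	 */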

	/* XXX fix actual number of pages reserved per filesystem. */
	wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2);

	/* Round wl_bufbytes_max to the largest power of two constraint */
	wl->wl_bufbytes_max >>= PAGE_SHIFT;
	wl->wl_bufbytes_max <<= PAGE_SHIFT;
	wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift;

	/* XXX maybe use filesystem fragment size instead of 1024 */
	/* XXX fix actual number of buffers reserved per filesystem. */
	wl->wl_bufcount_max = (nbuf / 2) * 1024;

	/* XXX tie this into resource estimation */
	wl->wl_dealloclim = wl->wl_bufbytes_max / mp->mnt_stat.f_bsize / 2;

	wl->wl_deallocblks = wapbl_alloc(sizeof(*wl->wl_deallocblks) *
	    wl->wl_dealloclim);
	wl->wl_dealloclens = wapbl_alloc(sizeof(*wl->wl_dealloclens) *
	    wl->wl_dealloclim);

	wl->wl_buffer = wapbl_alloc(MAXPHYS);
	wl->wl_buffer_used = 0;

	wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);

	/* Initialize the commit header */
	{
		struct wapbl_wc_header *wc;
		size_t len = 1 << wl->wl_log_dev_bshift;
		wc = wapbl_calloc(1, len);
		wc->wc_type = WAPBL_WC_HEADER;
		wc->wc_len = len;
		wc->wc_circ_off = wl->wl_circ_off;
		wc->wc_circ_size = wl->wl_circ_size;
		/* XXX wc->wc_fsid */
		wc->wc_log_dev_bshift = wl->wl_log_dev_bshift;
		wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift;
		wl->wl_wc_header = wc;
		wl->wl_wc_scratch = wapbl_alloc(len);
	}

	/*
	 * if there was an existing set of unlinked but
	 * allocated inodes, preserve it in the new
	 * log.
	 */
	if (wr && wr->wr_inodescnt) {
		error = wapbl_start_flush_inodes(wl, wr);
		if (error)
			goto errout;
	}

	error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail);
	if (error) {
		goto errout;
	}

	*wlp = wl;
#if defined(WAPBL_DEBUG)
	wapbl_debug_wl = wl;
#endif

	return 0;
 errout:
	wapbl_discard(wl);
	wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_deallocblks,
	    sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim);
	wapbl_free(wl->wl_dealloclens,
	    sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim);
	wapbl_free(wl->wl_buffer, MAXPHYS);
	wapbl_inodetrk_free(wl);
	wapbl_free(wl, sizeof(*wl));

	return error;
}

/*
 * Like wapbl_flush, but discards the transaction
 * completely.
 */

void
wapbl_discard(struct wapbl *wl)
{
	struct wapbl_entry *we;
	struct buf *bp;
	int i;

	/*
	 * XXX we may consider using upgrade here
	 * if we want to call flush from inside a transaction
	 */
	rw_enter(&wl->wl_rwlock, RW_WRITER);
	wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
	    wl->wl_dealloccnt);

#ifdef WAPBL_DEBUG_PRINT
	{
		pid_t pid = -1;
		lwpid_t lid = -1;
		if (curproc)
			pid = curproc->p_pid;
		if (curlwp)
			lid = curlwp->l_lid;
#ifdef WAPBL_DEBUG_BUFBYTES
		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
		    ("wapbl_discard: thread %d.%d discarding "
		    "transaction\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %u, reclaimable=%zu reserved=%zu "
		    "unsynced=%zu\n",
		    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
		    wl->wl_bcount, wl->wl_dealloccnt,
		    wl->wl_inohashcnt, wl->wl_error_count,
		    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
		    wl->wl_unsynced_bufbytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d, unsynced = %zu\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error, we->we_unsynced_bufbytes));
		}
#else /* !WAPBL_DEBUG_BUFBYTES */
		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
		    ("wapbl_discard: thread %d.%d discarding transaction\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %u, reclaimable=%zu reserved=%zu\n",
		    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
		    wl->wl_bcount, wl->wl_dealloccnt,
		    wl->wl_inohashcnt, wl->wl_error_count,
		    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error));
		}
#endif /* !WAPBL_DEBUG_BUFBYTES */
	}
#endif /* WAPBL_DEBUG_PRINT */

	for (i = 0; i <= wl->wl_inohashmask; i++) {
		struct wapbl_ino_head *wih;
		struct wapbl_ino *wi;

		wih = &wl->wl_inohash[i];
		while ((wi = LIST_FIRST(wih)) != NULL) {
			LIST_REMOVE(wi, wi_hash);
			pool_put(&wapbl_ino_pool, wi);
			KASSERT(wl->wl_inohashcnt > 0);
			wl->wl_inohashcnt--;
		}
	}

	/*
	 * clean buffer list
	 */
	mutex_enter(&bufcache_lock);
	mutex_enter(&wl->wl_mtx);
	while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
		if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) {
			/*
			 * The buffer will be unlocked and
			 * removed from the transaction in brelse
			 */
			mutex_exit(&wl->wl_mtx);
			brelsel(bp, 0);
			mutex_enter(&wl->wl_mtx);
		}
	}
	mutex_exit(&wl->wl_mtx);
	mutex_exit(&bufcache_lock);

	/*
	 * Remove references to this wl from wl_entries, free any which
	 * no longer have buffers, others will be freed in wapbl_biodone
	 * when they no longer have any buffers.
	 */
	while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) {
		SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
		/* XXX should we be accumulating wl_error_count
		 * and increasing reclaimable bytes ? */
		we->we_wapbl = NULL;
		if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
			KASSERT(we->we_unsynced_bufbytes == 0);
#endif
			pool_put(&wapbl_entry_pool, we);
		}
	}

	/* Discard list of deallocs */
	wl->wl_dealloccnt = 0;
	/* XXX should we clear wl_reserved_bytes? */

	KASSERT(wl->wl_bufbytes == 0);
	KASSERT(wl->wl_bcount == 0);
	KASSERT(wl->wl_bufcount == 0);
	KASSERT(LIST_EMPTY(&wl->wl_bufs));
	KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
	KASSERT(wl->wl_inohashcnt == 0);

	rw_exit(&wl->wl_rwlock);
}

int
wapbl_stop(struct wapbl *wl, int force)
{
	struct vnode *vp;
	int error;

	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n"));
	error = wapbl_flush(wl, 1);
	if (error) {
		if (force)
			wapbl_discard(wl);
		else
			return error;
	}

	/* Unlinked inodes persist after a flush */
	if (wl->wl_inohashcnt) {
		if (force) {
			wapbl_discard(wl);
		} else {
			return EBUSY;
		}
	}

	KASSERT(wl->wl_bufbytes == 0);
	KASSERT(wl->wl_bcount == 0);
	KASSERT(wl->wl_bufcount == 0);
	KASSERT(LIST_EMPTY(&wl->wl_bufs));
	KASSERT(wl->wl_dealloccnt == 0);
	KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
	KASSERT(wl->wl_inohashcnt == 0);

	vp = wl->wl_logvp;

	wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_deallocblks,
	    sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim);
	wapbl_free(wl->wl_dealloclens,
	    sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim);
	wapbl_free(wl->wl_buffer, MAXPHYS);
	wapbl_inodetrk_free(wl);

	cv_destroy(&wl->wl_reclaimable_cv);
	mutex_destroy(&wl->wl_mtx);
	rw_destroy(&wl->wl_rwlock);
	wapbl_free(wl, sizeof(*wl));

	return 0;
}

static int
wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
{
	struct pstats *pstats = curlwp->l_proc->p_stats;
	struct buf *bp;
	int error;

	KASSERT((flags & ~(B_WRITE | B_READ)) == 0);
	KASSERT(devvp->v_type == VBLK);

	if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
		mutex_enter(devvp->v_interlock);
		devvp->v_numoutput++;
		mutex_exit(devvp->v_interlock);
		pstats->p_ru.ru_oublock++;
	} else {
		pstats->p_ru.ru_inblock++;
	}

	bp = getiobuf(devvp, true);
	bp->b_flags = flags;
	bp->b_cflags = BC_BUSY; /* silly & dubious */
	bp->b_dev = devvp->v_rdev;
	bp->b_data = data;
	bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
	bp->b_blkno = pbn;
	BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);

	WAPBL_PRINTF(WAPBL_PRINT_IO,
	    ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%"PRIx64"\n",
	    BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount,
	    bp->b_blkno, bp->b_dev));

	VOP_STRATEGY(devvp, bp);

	error = biowait(bp);
	putiobuf(bp);

	if (error) {
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_doio: %s %zu bytes at block %" PRId64
		    " on dev 0x%"PRIx64" failed with error %d\n",
		    (((flags & (B_WRITE | B_READ)) == B_WRITE) ?
		     "write" : "read"),
		    len, pbn, devvp->v_rdev, error));
	}

	return error;
}

int
wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

	return wapbl_doio(data, len, devvp, pbn, B_WRITE);
}

int
wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

	return wapbl_doio(data, len, devvp, pbn, B_READ);
}

/*
 * Flush buffered data if any.
 */
static int
wapbl_buffered_flush(struct wapbl *wl)
{
	int error;

	if (wl->wl_buffer_used == 0)
		return 0;

	error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used,
	    wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE);
	wl->wl_buffer_used = 0;

	return error;
}

/*
 * Write data to the log.
 * Try to coalesce writes and emit MAXPHYS aligned blocks.
 */
static int
wapbl_buffered_write(void *data, size_t len, struct wapbl *wl, daddr_t pbn)
{
	int error;
	size_t resid;

	/*
	 * If not adjacent to buffered data, flush first.  The disk block
	 * address is always valid for a non-empty buffer.
	 */
	if (wl->wl_buffer_used > 0 &&
	    pbn != wl->wl_buffer_dblk + btodb(wl->wl_buffer_used)) {
		error = wapbl_buffered_flush(wl);
		if (error)
			return error;
	}
	/*
	 * If this write goes to an empty buffer we have to
	 * save the disk block address first.
	 */
	if (wl->wl_buffer_used == 0)
		wl->wl_buffer_dblk = pbn;
	/*
	 * Compute the remaining space so that this buffer ends on a
	 * MAXPHYS boundary.
	 *
	 * This cannot become less than or equal to zero: the buffer would
	 * have been flushed by the previous call in that case.
	 */
	resid = MAXPHYS - dbtob(wl->wl_buffer_dblk % btodb(MAXPHYS)) -
	    wl->wl_buffer_used;
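	/*
	 * Illustrative computation (MAXPHYS = 64 KiB and DEV_BSIZE = 512
	 * assumed): for a buffer that started at disk block 130 and has
	 * 1024 bytes queued, dbtob(130 % btodb(65536)) = dbtob(2) = 1024,
	 * so resid = 65536 - 1024 - 1024 = 63488 bytes can still be
	 * appended before a write must be emitted.
	 */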
	KASSERT(resid > 0);
	KASSERT(dbtob(btodb(resid)) == resid);
	if (len >= resid) {
		memcpy(wl->wl_buffer + wl->wl_buffer_used, data, resid);
		wl->wl_buffer_used += resid;
		error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used,
		    wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE);
		data = (uint8_t *)data + resid;
		len -= resid;
		wl->wl_buffer_dblk = pbn + btodb(resid);
		wl->wl_buffer_used = 0;
		if (error)
			return error;
	}
	KASSERT(len < MAXPHYS);
	if (len > 0) {
		memcpy(wl->wl_buffer + wl->wl_buffer_used, data, len);
		wl->wl_buffer_used += len;
	}

	return 0;
}

/*
 * Off is a byte offset; returns the new offset for the next write.
 * Handles log wraparound.
 */
static int
wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp)
{
	size_t slen;
	off_t off = *offp;
	int error;
	daddr_t pbn;

	KDASSERT(((len >> wl->wl_log_dev_bshift) <<
	    wl->wl_log_dev_bshift) == len);

	if (off < wl->wl_circ_off)
		off = wl->wl_circ_off;
	slen = wl->wl_circ_off + wl->wl_circ_size - off;
	if (slen < len) {
		pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
#ifdef _KERNEL
		pbn = btodb(pbn << wl->wl_log_dev_bshift);
#endif
		error = wapbl_buffered_write(data, slen, wl, pbn);
		if (error)
			return error;
		data = (uint8_t *)data + slen;
		len -= slen;
		off = wl->wl_circ_off;
	}
	pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
#ifdef _KERNEL
	pbn = btodb(pbn << wl->wl_log_dev_bshift);
#endif
	error = wapbl_buffered_write(data, len, wl, pbn);
	if (error)
		return error;
	off += len;
	if (off >= wl->wl_circ_off + wl->wl_circ_size)
		off = wl->wl_circ_off;
	*offp = off;
	return 0;
}
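/*
 * Wraparound sketch (numbers invented for illustration): with
 * wl_circ_off = 1024 and wl_circ_size = 8192 the log occupies
 * [1024, 9216).  A 2048-byte write at *offp = 8704 is split by the
 * code above into slen = 1024 + 8192 - 8704 = 512 bytes at 8704,
 * followed by the remaining 1536 bytes at 1024, leaving *offp = 2560.
 */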

/****************************************************************/

int
wapbl_begin(struct wapbl *wl, const char *file, int line)
{
	int doflush;
	unsigned lockcount;

	KDASSERT(wl);

	/*
	 * XXX this needs to be made much more sophisticated.
	 * perhaps each wapbl_begin could reserve a specified
	 * number of buffers and bytes.
	 */
	mutex_enter(&wl->wl_mtx);
	lockcount = wl->wl_lock_count;
	doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) >
		   wl->wl_bufbytes_max / 2) ||
		  ((wl->wl_bufcount + (lockcount * 10)) >
		   wl->wl_bufcount_max / 2) ||
		  (wapbl_transaction_len(wl) > wl->wl_circ_size / 2) ||
		  (wl->wl_dealloccnt >= (wl->wl_dealloclim / 2));
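	/*
	 * Illustrative reading of the heuristic above (figures assumed,
	 * not taken from the source): with wl_bufbytes_max = 4 MiB,
	 * MAXPHYS = 64 KiB and three transactions in progress
	 * (lockcount = 3), a flush is forced once wl_bufbytes plus
	 * 3 * 65536 bytes of headroom exceeds 2 MiB, i.e. well before
	 * the in-memory transaction space is actually exhausted.
	 */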
	mutex_exit(&wl->wl_mtx);

	if (doflush) {
		WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		    ("force flush lockcnt=%d bufbytes=%zu "
		    "(max=%zu) bufcount=%zu (max=%zu) "
		    "dealloccnt %d (lim=%d)\n",
		    lockcount, wl->wl_bufbytes,
		    wl->wl_bufbytes_max, wl->wl_bufcount,
		    wl->wl_bufcount_max,
		    wl->wl_dealloccnt, wl->wl_dealloclim));
	}

	if (doflush) {
		int error = wapbl_flush(wl, 0);
		if (error)
			return error;
	}

	rw_enter(&wl->wl_rwlock, RW_READER);
	mutex_enter(&wl->wl_mtx);
	wl->wl_lock_count++;
	mutex_exit(&wl->wl_mtx);

#if defined(WAPBL_DEBUG_PRINT)
	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
	    ("wapbl_begin thread %d.%d with bufcount=%zu "
	    "bufbytes=%zu bcount=%zu at %s:%d\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount, file, line));
#endif

	return 0;
}

void
wapbl_end(struct wapbl *wl)
{

#if defined(WAPBL_DEBUG_PRINT)
	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
	    ("wapbl_end thread %d.%d with bufcount=%zu "
	    "bufbytes=%zu bcount=%zu\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount));
#endif

#ifdef DIAGNOSTIC
	size_t flushsize = wapbl_transaction_len(wl);
	if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
		/*
		 * XXX this could be handled more gracefully, perhaps place
		 * only a partial transaction in the log and allow the
		 * remaining to flush without the protection of the journal.
		 */
		panic("wapbl_end: current transaction too big to flush\n");
	}
#endif

	mutex_enter(&wl->wl_mtx);
	KASSERT(wl->wl_lock_count > 0);
	wl->wl_lock_count--;
	mutex_exit(&wl->wl_mtx);

	rw_exit(&wl->wl_rwlock);
}

void
wapbl_add_buf(struct wapbl *wl, struct buf * bp)
{

	KASSERT(bp->b_cflags & BC_BUSY);
	KASSERT(bp->b_vp);

	wapbl_jlock_assert(wl);

#if 0
	/*
	 * XXX this might be an issue for swapfiles.
	 * see uvm_swap.c:1702
	 *
	 * XXX2 why require it then?  leap of semantics?
	 */
	KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
#endif

	mutex_enter(&wl->wl_mtx);
	if (bp->b_flags & B_LOCKED) {
		LIST_REMOVE(bp, b_wapbllist);
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER2,
		    ("wapbl_add_buf thread %d.%d re-adding buf %p "
		    "with %d bytes %d bcount\n",
		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
		    bp->b_bcount));
	} else {
		/* unlocked but dirty buffers shouldn't exist */
		KASSERT(!(bp->b_oflags & BO_DELWRI));
		wl->wl_bufbytes += bp->b_bufsize;
		wl->wl_bcount += bp->b_bcount;
		wl->wl_bufcount++;
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
		    ("wapbl_add_buf thread %d.%d adding buf %p "
		    "with %d bytes %d bcount\n",
		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
		    bp->b_bcount));
	}
	LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist);
	mutex_exit(&wl->wl_mtx);

	bp->b_flags |= B_LOCKED;
}

static void
wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp)
{

	KASSERT(mutex_owned(&wl->wl_mtx));
	KASSERT(bp->b_cflags & BC_BUSY);
	wapbl_jlock_assert(wl);

#if 0
	/*
	 * XXX this might be an issue for swapfiles.
	 * see uvm_swap.c:1725
	 *
	 * XXXdeux: see above
	 */
	KASSERT((bp->b_flags & BC_NOCACHE) == 0);
#endif
	KASSERT(bp->b_flags & B_LOCKED);

	WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
	    ("wapbl_remove_buf thread %d.%d removing buf %p with "
	    "%d bytes %d bcount\n",
	    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount));

	KASSERT(wl->wl_bufbytes >= bp->b_bufsize);
	wl->wl_bufbytes -= bp->b_bufsize;
	KASSERT(wl->wl_bcount >= bp->b_bcount);
	wl->wl_bcount -= bp->b_bcount;
	KASSERT(wl->wl_bufcount > 0);
	wl->wl_bufcount--;
	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
	LIST_REMOVE(bp, b_wapbllist);

	bp->b_flags &= ~B_LOCKED;
}

/* called from brelsel() in vfs_bio among other places */
void
wapbl_remove_buf(struct wapbl * wl, struct buf *bp)
{

	mutex_enter(&wl->wl_mtx);
	wapbl_remove_buf_locked(wl, bp);
	mutex_exit(&wl->wl_mtx);
}

void
wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt)
{

	KASSERT(bp->b_cflags & BC_BUSY);

	/*
	 * XXX: why does this depend on B_LOCKED?  otherwise the buf
	 * is not for a transaction?  if so, why is this called in the
	 * first place?
	 */
	if (bp->b_flags & B_LOCKED) {
		mutex_enter(&wl->wl_mtx);
		wl->wl_bufbytes += bp->b_bufsize - oldsz;
		wl->wl_bcount += bp->b_bcount - oldcnt;
		mutex_exit(&wl->wl_mtx);
	}
}

#endif /* _KERNEL */

/****************************************************************/
/* Some utility inlines */

static inline size_t
wapbl_space_used(size_t avail, off_t head, off_t tail)
{

	if (tail == 0) {
		KASSERT(head == 0);
		return 0;
	}
	return ((head + (avail - 1) - tail) % avail) + 1;
}
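/*
 * Example (illustrative values): with avail = 100, head = 30 and
 * tail = 10 this returns ((30 + 99 - 10) % 100) + 1 = 20 bytes in use;
 * with head == tail == 50 (a full log) it returns 100.
 */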

#ifdef _KERNEL
/* This is used to advance the pointer at old to new value at old+delta */
static inline off_t
wapbl_advance(size_t size, size_t off, off_t old, size_t delta)
{
	off_t new;

	/* Define acceptable ranges for inputs. */
	KASSERT(delta <= (size_t)size);
	KASSERT((old == 0) || ((size_t)old >= off));
	KASSERT(old < (off_t)(size + off));

	if ((old == 0) && (delta != 0))
		new = off + delta;
	else if ((old + delta) < (size + off))
		new = old + delta;
	else
		new = (old + delta) - size;

	/* Note some interesting axioms */
	KASSERT((delta != 0) || (new == old));
	KASSERT((delta == 0) || (new != 0));
	KASSERT((delta != (size)) || (new == old));

	/* Define acceptable ranges for output. */
	KASSERT((new == 0) || ((size_t)new >= off));
	KASSERT((size_t)new < (size + off));
	return new;
}
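/*
 * Example (illustrative values): for a log of size = 100 occupying
 * offsets [20, 120), advancing old = 110 by delta = 30 wraps around to
 * 110 + 30 - 100 = 40, while advancing old = 0 (empty log) by delta = 5
 * yields off + delta = 25.
 */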

static inline size_t
wapbl_space_free(size_t avail, off_t head, off_t tail)
{

	return avail - wapbl_space_used(avail, head, tail);
}

static inline void
wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp,
		   off_t *tailp)
{
	off_t head = *headp;
	off_t tail = *tailp;

	KASSERT(delta <= wapbl_space_free(size, head, tail));
	head = wapbl_advance(size, off, head, delta);
	if ((tail == 0) && (head != 0))
		tail = off;
	*headp = head;
	*tailp = tail;
}

static inline void
wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp,
		   off_t *tailp)
{
	off_t head = *headp;
	off_t tail = *tailp;

	KASSERT(delta <= wapbl_space_used(size, head, tail));
	tail = wapbl_advance(size, off, tail, delta);
	if (head == tail) {
		head = tail = 0;
	}
	*headp = head;
	*tailp = tail;
}
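/*
 * Sketch of how the two advances pair up (numbers invented): starting
 * empty (head = tail = 0) in a [20, 120) log, committing 30 bytes via
 * wapbl_advance_head() leaves head = 50, tail = 20; later reclaiming
 * those 30 bytes via wapbl_advance_tail() moves tail to 50 == head,
 * so both reset to 0 and the log is empty again.
 */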


/****************************************************************/

/*
 * Remove transactions whose buffers are completely flushed to disk.
 * Will block until at least minfree space is available.
 * Only intended to be called from inside wapbl_flush and therefore
 * does not protect against commit races with itself or with flush.
 */
static int
wapbl_truncate(struct wapbl *wl, size_t minfree, int waitonly)
{
	size_t delta;
	size_t avail;
	off_t head;
	off_t tail;
	int error = 0;

	KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes));
	KASSERT(rw_write_held(&wl->wl_rwlock));

	mutex_enter(&wl->wl_mtx);

	/*
	 * First check to see if we have to do a commit
	 * at all.
	 */
	avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail);
	if (minfree < avail) {
		mutex_exit(&wl->wl_mtx);
		return 0;
	}
	minfree -= avail;
	while ((wl->wl_error_count == 0) &&
	    (wl->wl_reclaimable_bytes < minfree)) {
		WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
		    ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd "
		    "minfree=%zd\n",
		    &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes,
		    minfree));

		cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx);
	}
	if (wl->wl_reclaimable_bytes < minfree) {
		KASSERT(wl->wl_error_count);
		/* XXX maybe get actual error from buffer instead someday? */
		error = EIO;
	}
	head = wl->wl_head;
	tail = wl->wl_tail;
	delta = wl->wl_reclaimable_bytes;

	/* If all of the entries are flushed, then be sure to keep
	 * the reserved bytes reserved.  Watch out for discarded transactions,
	 * which could leave more bytes reserved than are reclaimable.
	 */
	if (SIMPLEQ_EMPTY(&wl->wl_entries) &&
	    (delta >= wl->wl_reserved_bytes)) {
		delta -= wl->wl_reserved_bytes;
	}
	wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head,
	    &tail);
	KDASSERT(wl->wl_reserved_bytes <=
		wapbl_space_used(wl->wl_circ_size, head, tail));
	mutex_exit(&wl->wl_mtx);

	if (error)
		return error;

	if (waitonly)
		return 0;

	/*
	 * This is where head, tail and delta are unprotected
	 * from races against itself or flush.  This is ok since
	 * we only call this routine from inside flush itself.
	 *
	 * XXX: how can it race against itself when accessed only
	 * from behind the write-locked rwlock?
	 */
	error = wapbl_write_commit(wl, head, tail);
	if (error)
		return error;

	wl->wl_head = head;
	wl->wl_tail = tail;

	mutex_enter(&wl->wl_mtx);
	KASSERT(wl->wl_reclaimable_bytes >= delta);
	wl->wl_reclaimable_bytes -= delta;
	mutex_exit(&wl->wl_mtx);
	WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
	    ("wapbl_truncate thread %d.%d truncating %zu bytes\n",
	    curproc->p_pid, curlwp->l_lid, delta));

	return 0;
}

/****************************************************************/

void
wapbl_biodone(struct buf *bp)
{
	struct wapbl_entry *we = bp->b_private;
	struct wapbl *wl = we->we_wapbl;
#ifdef WAPBL_DEBUG_BUFBYTES
	const int bufsize = bp->b_bufsize;
#endif

	/*
	 * Handle possible flushing of buffers after log has been
	 * decommissioned.
	 */
	if (!wl) {
		KASSERT(we->we_bufcount > 0);
		we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
		KASSERT(we->we_unsynced_bufbytes >= bufsize);
		we->we_unsynced_bufbytes -= bufsize;
#endif

		if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
			KASSERT(we->we_unsynced_bufbytes == 0);
#endif
			pool_put(&wapbl_entry_pool, we);
		}

		brelse(bp, 0);
		return;
	}

#ifdef ohbother
	KDASSERT(bp->b_oflags & BO_DONE);
	KDASSERT(!(bp->b_oflags & BO_DELWRI));
	KDASSERT(bp->b_flags & B_ASYNC);
	KDASSERT(bp->b_cflags & BC_BUSY);
	KDASSERT(!(bp->b_flags & B_LOCKED));
	KDASSERT(!(bp->b_flags & B_READ));
	KDASSERT(!(bp->b_cflags & BC_INVAL));
	KDASSERT(!(bp->b_cflags & BC_NOCACHE));
#endif

	if (bp->b_error) {
#ifdef notyet /* Can't currently handle possible dirty buffer reuse */
		/*
		 * XXXpooka: interfaces not fully updated
		 * Note: this was not enabled in the original patch
		 * against netbsd4 either.  I don't know if comment
		 * above is true or not.
		 */

		/*
		 * If an error occurs, report the error and leave the
		 * buffer as a delayed write on the LRU queue.
		 * restarting the write would likely result in
		 * an error spinloop, so let it be done harmlessly
		 * by the syncer.
		 */
		bp->b_flags &= ~(B_DONE);
		simple_unlock(&bp->b_interlock);

		if (we->we_error == 0) {
			mutex_enter(&wl->wl_mtx);
			wl->wl_error_count++;
			mutex_exit(&wl->wl_mtx);
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
		we->we_error = bp->b_error;
		bp->b_error = 0;
		brelse(bp);
		return;
#else
		/* For now, just mark the log permanently errored out */

		mutex_enter(&wl->wl_mtx);
		if (wl->wl_error_count == 0) {
			wl->wl_error_count++;
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
		mutex_exit(&wl->wl_mtx);
#endif
	}

	/*
	 * Release the buffer here. wapbl_flush() may wait for the
	 * log to become empty and we better unbusy the buffer before
	 * wapbl_flush() returns.
	 */
	brelse(bp, 0);

	mutex_enter(&wl->wl_mtx);

	KASSERT(we->we_bufcount > 0);
	we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
	KASSERT(we->we_unsynced_bufbytes >= bufsize);
	we->we_unsynced_bufbytes -= bufsize;
	KASSERT(wl->wl_unsynced_bufbytes >= bufsize);
	wl->wl_unsynced_bufbytes -= bufsize;
#endif

	/*
	 * If the current transaction can be reclaimed, start
	 * at the beginning and reclaim any consecutive reclaimable
	 * transactions.  If we successfully reclaim anything,
	 * then wakeup anyone waiting for the reclaim.
	 */
	if (we->we_bufcount == 0) {
		size_t delta = 0;
		int errcnt = 0;
#ifdef WAPBL_DEBUG_BUFBYTES
		KDASSERT(we->we_unsynced_bufbytes == 0);
#endif
		/*
		 * Clear any posted error, since the buffer it came from
		 * has been successfully flushed by now.
		 */
		while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) &&
		       (we->we_bufcount == 0)) {
			delta += we->we_reclaimable_bytes;
			if (we->we_error)
				errcnt++;
			SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
			pool_put(&wapbl_entry_pool, we);
		}

		if (delta) {
			wl->wl_reclaimable_bytes += delta;
			KASSERT(wl->wl_error_count >= errcnt);
			wl->wl_error_count -= errcnt;
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
	}

	mutex_exit(&wl->wl_mtx);
}

/*
 * Write transactions to disk + start I/O for contents
 */
int
wapbl_flush(struct wapbl *wl, int waitfor)
{
	struct buf *bp;
	struct wapbl_entry *we;
	off_t off;
	off_t head;
	off_t tail;
	size_t delta = 0;
	size_t flushsize;
	size_t reserved;
	int error = 0;

	/*
	 * Do a quick check to see if a full flush can be skipped.
	 * This assumes that the flush callback does not need to be called
	 * unless there are other outstanding bufs.
	 */
	if (!waitfor) {
		size_t nbufs;
		mutex_enter(&wl->wl_mtx);	/* XXX need mutex here to
						   protect the KASSERTS */
		nbufs = wl->wl_bufcount;
		KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
		KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
		mutex_exit(&wl->wl_mtx);
		if (nbufs == 0)
			return 0;
	}

	/*
	 * XXX we may consider using LK_UPGRADE here
	 * if we want to call flush from inside a transaction
	 */
	rw_enter(&wl->wl_rwlock, RW_WRITER);
	wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
	    wl->wl_dealloccnt);

	/*
	 * Now that we are fully locked and flushed,
	 * do another check for nothing to do.
	 */
	if (wl->wl_bufcount == 0) {
		goto out;
	}

#if 0
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush thread %d.%d flushing entries with "
	    "bufcount=%zu bufbytes=%zu\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes));
#endif

	/* Calculate amount of space needed to flush */
	flushsize = wapbl_transaction_len(wl);
	if (wapbl_verbose_commit) {
		struct timespec ts;
		getnanotime(&ts);
		printf("%s: %lld.%09ld this transaction = %zu bytes\n",
		    __func__, (long long)ts.tv_sec,
		    (long)ts.tv_nsec, flushsize);
	}

	if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
		/*
		 * XXX this could be handled more gracefully, perhaps place
		 * only a partial transaction in the log and allow the
		 * remaining to flush without the protection of the journal.
		 */
		panic("wapbl_flush: current transaction too big to flush\n");
	}

	error = wapbl_truncate(wl, flushsize, 0);
	if (error)
		goto out2;

	off = wl->wl_head;
	KASSERT((off == 0) || ((off >= wl->wl_circ_off) &&
	                      (off < wl->wl_circ_off + wl->wl_circ_size)));
	error = wapbl_write_blocks(wl, &off);
	if (error)
		goto out2;
	error = wapbl_write_revocations(wl, &off);
	if (error)
		goto out2;
	error = wapbl_write_inodes(wl, &off);
	if (error)
		goto out2;

	reserved = 0;
	if (wl->wl_inohashcnt)
		reserved = wapbl_transaction_inodes_len(wl);

	head = wl->wl_head;
	tail = wl->wl_tail;

	wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize,
	    &head, &tail);
#ifdef WAPBL_DEBUG
	if (head != off) {
		panic("lost head! head=%"PRIdMAX" tail=%" PRIdMAX
		      " off=%"PRIdMAX" flush=%zu\n",
		      (intmax_t)head, (intmax_t)tail, (intmax_t)off,
		      flushsize);
	}
#else
	KASSERT(head == off);
#endif

	/* Opportunistically move the tail forward if we can */
	if (!wapbl_lazy_truncate) {
		mutex_enter(&wl->wl_mtx);
		delta = wl->wl_reclaimable_bytes;
		mutex_exit(&wl->wl_mtx);
		wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta,
		    &head, &tail);
	}

	error = wapbl_write_commit(wl, head, tail);
	if (error)
		goto out2;

	we = pool_get(&wapbl_entry_pool, PR_WAITOK);

#ifdef WAPBL_DEBUG_BUFBYTES
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
		 " unsynced=%zu"
		 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
		 "inodes=%d\n",
		 curproc->p_pid, curlwp->l_lid, flushsize, delta,
		 wapbl_space_used(wl->wl_circ_size, head, tail),
		 wl->wl_unsynced_bufbytes, wl->wl_bufcount,
		 wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt,
		 wl->wl_inohashcnt));
#else
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
		 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
		 "inodes=%d\n",
		 curproc->p_pid, curlwp->l_lid, flushsize, delta,
		 wapbl_space_used(wl->wl_circ_size, head, tail),
		 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
		 wl->wl_dealloccnt, wl->wl_inohashcnt));
#endif


	mutex_enter(&bufcache_lock);
	mutex_enter(&wl->wl_mtx);

	wl->wl_reserved_bytes = reserved;
	wl->wl_head = head;
	wl->wl_tail = tail;
	KASSERT(wl->wl_reclaimable_bytes >= delta);
	wl->wl_reclaimable_bytes -= delta;
	wl->wl_dealloccnt = 0;
#ifdef WAPBL_DEBUG_BUFBYTES
	wl->wl_unsynced_bufbytes += wl->wl_bufbytes;
#endif

	we->we_wapbl = wl;
	we->we_bufcount = wl->wl_bufcount;
#ifdef WAPBL_DEBUG_BUFBYTES
	we->we_unsynced_bufbytes = wl->wl_bufbytes;
#endif
	we->we_reclaimable_bytes = flushsize;
	we->we_error = 0;
	SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries);

	/*
	 * This flushes bufs in the reverse of the order in which they were
	 * queued.  It shouldn't matter, but if we care we could use TAILQ
	 * instead.  XXX Note they will get put on the lru queue when they
	 * flush so we might actually want to change this to preserve order.
	 */
	while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
		if (bbusy(bp, 0, 0, &wl->wl_mtx)) {
			continue;
		}
		bp->b_iodone = wapbl_biodone;
		bp->b_private = we;
		bremfree(bp);
		wapbl_remove_buf_locked(wl, bp);
		mutex_exit(&wl->wl_mtx);
		mutex_exit(&bufcache_lock);
		bawrite(bp);
		mutex_enter(&bufcache_lock);
		mutex_enter(&wl->wl_mtx);
	}
	mutex_exit(&wl->wl_mtx);
	mutex_exit(&bufcache_lock);

#if 0
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush thread %d.%d done flushing entries...\n",
	    curproc->p_pid, curlwp->l_lid));
#endif

 out:

	/*
	 * If the waitfor flag is set, don't return until everything is
	 * fully flushed and the on disk log is empty.
	 */
	if (waitfor) {
		error = wapbl_truncate(wl, wl->wl_circ_size -
			wl->wl_reserved_bytes, wapbl_lazy_truncate);
	}

 out2:
	if (error) {
		wl->wl_flush_abort(wl->wl_mount, wl->wl_deallocblks,
		    wl->wl_dealloclens, wl->wl_dealloccnt);
	}

#ifdef WAPBL_DEBUG_PRINT
	if (error) {
		pid_t pid = -1;
		lwpid_t lid = -1;
		if (curproc)
			pid = curproc->p_pid;
		if (curlwp)
			lid = curlwp->l_lid;
		mutex_enter(&wl->wl_mtx);
#ifdef WAPBL_DEBUG_BUFBYTES
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_flush: thread %d.%d aborted flush: "
		    "error = %d\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %d, reclaimable=%zu reserved=%zu "
		    "unsynced=%zu\n",
		    pid, lid, error, wl->wl_bufcount,
		    wl->wl_bufbytes, wl->wl_bcount,
		    wl->wl_dealloccnt, wl->wl_inohashcnt,
		    wl->wl_error_count, wl->wl_reclaimable_bytes,
		    wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_ERROR,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d, unsynced = %zu\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error, we->we_unsynced_bufbytes));
		}
#else
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_flush: thread %d.%d aborted flush: "
		    "error = %d\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %d, reclaimable=%zu reserved=%zu\n",
		    pid, lid, error, wl->wl_bufcount,
		    wl->wl_bufbytes, wl->wl_bcount,
		    wl->wl_dealloccnt, wl->wl_inohashcnt,
		    wl->wl_error_count, wl->wl_reclaimable_bytes,
		    wl->wl_reserved_bytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_ERROR,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d\n", we->we_bufcount,
			    we->we_reclaimable_bytes, we->we_error));
		}
#endif
		mutex_exit(&wl->wl_mtx);
	}
#endif

	rw_exit(&wl->wl_rwlock);
	return error;
}

/****************************************************************/

void
wapbl_jlock_assert(struct wapbl *wl)
{

	KASSERT(rw_lock_held(&wl->wl_rwlock));
}

void
wapbl_junlock_assert(struct wapbl *wl)
{

	KASSERT(!rw_write_held(&wl->wl_rwlock));
}

/****************************************************************/

/* locks missing */
void
wapbl_print(struct wapbl *wl,
	int full,
	void (*pr)(const char *, ...))
{
	struct buf *bp;
	struct wapbl_entry *we;
	(*pr)("wapbl %p", wl);
	(*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n",
	      wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn);
	(*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n",
	      wl->wl_circ_size, wl->wl_circ_off,
	      (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail);
1749 (*pr)("fs_dev_bshift = %d, log_dev_bshift = %d\n",
1750 wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift);
1751 #ifdef WAPBL_DEBUG_BUFBYTES
1752 (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
1753 "reserved = %zu errcnt = %d unsynced = %zu\n",
1754 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
1755 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
1756 wl->wl_error_count, wl->wl_unsynced_bufbytes);
1757 #else
1758 (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
1759 "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes,
1760 wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
1761 wl->wl_error_count);
1762 #endif
1763 (*pr)("\tdealloccnt = %d, dealloclim = %d\n",
1764 wl->wl_dealloccnt, wl->wl_dealloclim);
1765 (*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n",
1766 wl->wl_inohashcnt, wl->wl_inohashmask);
1767 (*pr)("entries:\n");
1768 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
1769 #ifdef WAPBL_DEBUG_BUFBYTES
1770 (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, "
1771 "unsynced = %zu\n",
1772 we->we_bufcount, we->we_reclaimable_bytes,
1773 we->we_error, we->we_unsynced_bufbytes);
1774 #else
1775 (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n",
1776 we->we_bufcount, we->we_reclaimable_bytes, we->we_error);
1777 #endif
1778 }
1779 if (full) {
1780 int cnt = 0;
1781 (*pr)("bufs =");
1782 LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) {
1783 if (!LIST_NEXT(bp, b_wapbllist)) {
1784 (*pr)(" %p", bp);
1785 } else if ((++cnt % 6) == 0) {
1786 (*pr)(" %p,\n\t", bp);
1787 } else {
1788 (*pr)(" %p,", bp);
1789 }
1790 }
1791 (*pr)("\n");
1792
1793 (*pr)("dealloced blks = ");
1794 {
1795 int i;
1796 cnt = 0;
1797 for (i = 0; i < wl->wl_dealloccnt; i++) {
1798 (*pr)(" %"PRId64":%d,",
1799 wl->wl_deallocblks[i],
1800 wl->wl_dealloclens[i]);
1801 if ((++cnt % 4) == 0) {
1802 (*pr)("\n\t");
1803 }
1804 }
1805 }
1806 (*pr)("\n");
1807
1808 (*pr)("registered inodes = ");
1809 {
1810 int i;
1811 cnt = 0;
1812 for (i = 0; i <= wl->wl_inohashmask; i++) {
1813 struct wapbl_ino_head *wih;
1814 struct wapbl_ino *wi;
1815
1816 wih = &wl->wl_inohash[i];
1817 LIST_FOREACH(wi, wih, wi_hash) {
1818 if (wi->wi_ino == 0)
1819 continue;
1820 (*pr)(" %"PRIu64"/0%06"PRIo32",",
1821 wi->wi_ino, wi->wi_mode);
1822 if ((++cnt % 4) == 0) {
1823 (*pr)("\n\t");
1824 }
1825 }
1826 }
1827 (*pr)("\n");
1828 }
1829 }
1830 }
1831
1832 #if defined(WAPBL_DEBUG) || defined(DDB)
1833 void
1834 wapbl_dump(struct wapbl *wl)
1835 {
1836 #if defined(WAPBL_DEBUG)
1837 if (!wl)
1838 wl = wapbl_debug_wl;
1839 #endif
1840 if (!wl)
1841 return;
1842 wapbl_print(wl, 1, printf);
1843 }
1844 #endif
1845
1846 /****************************************************************/
1847
1848 void
1849 wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len)
1850 {
1851
1852 wapbl_jlock_assert(wl);
1853
1854 mutex_enter(&wl->wl_mtx);
1855 /* XXX should eventually instead tie this into resource estimation */
1856 /*
1857 * XXX this panic needs locking/mutex analysis and the
1858 * ability to cope with the failure.
1859 */
1860 /* XXX this XXX doesn't have enough XXX */
1861 if (__predict_false(wl->wl_dealloccnt >= wl->wl_dealloclim))
1862 panic("wapbl_register_deallocation: out of resources");
1863
1864 wl->wl_deallocblks[wl->wl_dealloccnt] = blk;
1865 wl->wl_dealloclens[wl->wl_dealloccnt] = len;
1866 wl->wl_dealloccnt++;
1867 WAPBL_PRINTF(WAPBL_PRINT_ALLOC,
1868 ("wapbl_register_deallocation: blk=%"PRId64" len=%d\n", blk, len));
1869 mutex_exit(&wl->wl_mtx);
1870 }
1871
1872 /****************************************************************/
1873
1874 static void
1875 wapbl_inodetrk_init(struct wapbl *wl, u_int size)
1876 {
1877
1878 wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask);
1879 if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) {
1880 pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0,
1881 "wapblinopl", &pool_allocator_nointr, IPL_NONE);
1882 }
1883 }
1884
1885 static void
1886 wapbl_inodetrk_free(struct wapbl *wl)
1887 {
1888
1889 /* XXX this KASSERT needs locking/mutex analysis */
1890 KASSERT(wl->wl_inohashcnt == 0);
1891 hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask);
1892 if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) {
1893 pool_destroy(&wapbl_ino_pool);
1894 }
1895 }
1896
1897 static struct wapbl_ino *
1898 wapbl_inodetrk_get(struct wapbl *wl, ino_t ino)
1899 {
1900 struct wapbl_ino_head *wih;
1901 struct wapbl_ino *wi;
1902
1903 KASSERT(mutex_owned(&wl->wl_mtx));
1904
1905 wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
1906 LIST_FOREACH(wi, wih, wi_hash) {
1907 if (ino == wi->wi_ino)
1908 return wi;
1909 }
1910 return 0;
1911 }
1912
1913 void
1914 wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode)
1915 {
1916 struct wapbl_ino_head *wih;
1917 struct wapbl_ino *wi;
1918
1919 wi = pool_get(&wapbl_ino_pool, PR_WAITOK);
1920
1921 mutex_enter(&wl->wl_mtx);
1922 if (wapbl_inodetrk_get(wl, ino) == NULL) {
1923 wi->wi_ino = ino;
1924 wi->wi_mode = mode;
1925 wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
1926 LIST_INSERT_HEAD(wih, wi, wi_hash);
1927 wl->wl_inohashcnt++;
1928 WAPBL_PRINTF(WAPBL_PRINT_INODE,
1929 		    ("wapbl_register_inode: ino=%"PRIu64"\n", ino));
1930 mutex_exit(&wl->wl_mtx);
1931 } else {
1932 mutex_exit(&wl->wl_mtx);
1933 pool_put(&wapbl_ino_pool, wi);
1934 }
1935 }
1936
1937 void
1938 wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode)
1939 {
1940 struct wapbl_ino *wi;
1941
1942 mutex_enter(&wl->wl_mtx);
1943 wi = wapbl_inodetrk_get(wl, ino);
1944 if (wi) {
1945 WAPBL_PRINTF(WAPBL_PRINT_INODE,
1946 		    ("wapbl_unregister_inode: ino=%"PRIu64"\n", ino));
1947 KASSERT(wl->wl_inohashcnt > 0);
1948 wl->wl_inohashcnt--;
1949 LIST_REMOVE(wi, wi_hash);
1950 mutex_exit(&wl->wl_mtx);
1951
1952 pool_put(&wapbl_ino_pool, wi);
1953 } else {
1954 mutex_exit(&wl->wl_mtx);
1955 }
1956 }
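
/*
 * Illustration only: the register/unregister pair brackets the
 * "unlinked but still open" window, so replay can restore the inode's
 * mode if the system dies while the file is still referenced.  "wl"
 * and "ip" below are stand-ins for the caller's journal and inode.
 */
#if 0
	/* link count reached zero while the vnode is still in use: */
	wapbl_register_inode(wl, ip->i_number, ip->i_mode);

	/*
	 * ... later, when the last reference is dropped and the
	 * inode is actually freed:
	 */
	wapbl_unregister_inode(wl, ip->i_number, ip->i_mode);
#endif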
1957
1958 /****************************************************************/
1959
1960 static inline size_t
1961 wapbl_transaction_inodes_len(struct wapbl *wl)
1962 {
1963 int blocklen = 1<<wl->wl_log_dev_bshift;
1964 int iph;
1965
1966 	/* Calculate number of inodes described in an inodelist header */
1967 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
1968 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
1969
1970 KASSERT(iph > 0);
1971
1972 return MAX(1, howmany(wl->wl_inohashcnt, iph)) * blocklen;
1973 }
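
/*
 * Worked example, with illustrative sizes only: for 512-byte log
 * blocks (wl_log_dev_bshift == 9), a 32-byte header and 16 bytes per
 * wc_inodes[] entry would give iph = (512 - 32) / 16 = 30, so 100
 * registered inodes need howmany(100, 30) = 4 log blocks, i.e. 2048
 * bytes.  The real sizes come from struct wapbl_wc_inodelist.
 */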
1974
1975
1976 /* Calculate amount of space a transaction will take on disk */
1977 static size_t
1978 wapbl_transaction_len(struct wapbl *wl)
1979 {
1980 int blocklen = 1<<wl->wl_log_dev_bshift;
1981 size_t len;
1982 int bph;
1983
1984 /* Calculate number of blocks described in a blocklist header */
1985 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
1986 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
1987
1988 KASSERT(bph > 0);
1989
1990 len = wl->wl_bcount;
1991 len += howmany(wl->wl_bufcount, bph) * blocklen;
1992 len += howmany(wl->wl_dealloccnt, bph) * blocklen;
1993 len += wapbl_transaction_inodes_len(wl);
1994
1995 return len;
1996 }
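
/*
 * Worked example, continuing the illustrative sizes above (bph = 30
 * under the same assumptions): a transaction holding 16 bufs that
 * total 64 KiB, with 5 pending deallocations, reserves
 * 65536 + howmany(16, 30)*512 + howmany(5, 30)*512 = 66560 bytes,
 * plus the inode-list estimate.  Only wl_bcount is payload; the
 * other terms are whole header blocks.
 */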
1997
1998 /*
1999 * wapbl_cache_sync: issue DIOCCACHESYNC
2000 */
2001 static int
2002 wapbl_cache_sync(struct wapbl *wl, const char *msg)
2003 {
2004 const bool verbose = wapbl_verbose_commit >= 2;
2005 struct bintime start_time;
2006 int force = 1;
2007 int error;
2008
2009 if (!wapbl_flush_disk_cache) {
2010 return 0;
2011 }
2012 if (verbose) {
2013 bintime(&start_time);
2014 }
2015 error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force,
2016 FWRITE, FSCRED);
2017 if (error) {
2018 		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
2019 		    ("wapbl_cache_sync: DIOCCACHESYNC on dev 0x%jx "
2020 		    "returned %d\n", (uintmax_t)wl->wl_devvp->v_rdev, error));
2021 }
2022 if (verbose) {
2023 struct bintime d;
2024 struct timespec ts;
2025
2026 bintime(&d);
2027 bintime_sub(&d, &start_time);
2028 bintime2timespec(&d, &ts);
2029 printf("wapbl_cache_sync: %s: dev 0x%jx %ju.%09lu\n",
2030 msg, (uintmax_t)wl->wl_devvp->v_rdev,
2031 (uintmax_t)ts.tv_sec, ts.tv_nsec);
2032 }
2033 return error;
2034 }
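
/*
 * Both knobs consulted above (wapbl_flush_disk_cache and
 * wapbl_verbose_commit) are the globals declared near the top of this
 * file, intended to be tuned via sysctl.  Clearing the former skips
 * DIOCCACHESYNC entirely, trading crash safety on devices with
 * volatile write caches for speed; raising the latter to 2 or more
 * prints the duration of every sync as above.
 */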
2035
2036 /*
2037  * Perform the commit operation.
2038  *
2039  * Note that the increment of the generation number must be
2040  * protected against racing with other invocations of
2041  * wapbl_write_commit().  That is safe because this routine is
2042  * only invoked from wapbl_flush().
2043 */
2044 static int
2045 wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail)
2046 {
2047 struct wapbl_wc_header *wc = wl->wl_wc_header;
2048 struct timespec ts;
2049 int error;
2050 daddr_t pbn;
2051
2052 error = wapbl_buffered_flush(wl);
2053 if (error)
2054 return error;
2055 /*
2056 	 * Flush the disk cache to ensure that the blocks we have written
2057 	 * reach stable storage before the commit header does.
2058 	 *
2059 	 * XXX Should compute a checksum here; instead we do this for now.
2060 	 */
2061 wapbl_cache_sync(wl, "1");
2062
2063 wc->wc_head = head;
2064 wc->wc_tail = tail;
2065 wc->wc_checksum = 0;
2066 wc->wc_version = 1;
2067 getnanotime(&ts);
2068 wc->wc_time = ts.tv_sec;
2069 wc->wc_timensec = ts.tv_nsec;
2070
2071 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2072 	    ("wapbl_write_commit: head = %"PRIdMAX" tail = %"PRIdMAX"\n",
2073 (intmax_t)head, (intmax_t)tail));
2074
2075 /*
2076 	 * Write the commit header.
2077 	 *
2078 	 * XXX If the generation number is about to roll over, first zero
2079 	 * out the second commit header before trying to write both headers.
2080 */
2081
2082 pbn = wl->wl_logpbn + (wc->wc_generation % 2);
2083 #ifdef _KERNEL
2084 pbn = btodb(pbn << wc->wc_log_dev_bshift);
2085 #endif
2086 error = wapbl_buffered_write(wc, wc->wc_len, wl, pbn);
2087 if (error)
2088 return error;
2089 error = wapbl_buffered_flush(wl);
2090 if (error)
2091 return error;
2092
2093 /*
2094 * flush disk cache to ensure that the commit header is actually
2095 * written before meta data blocks.
2096 */
2097 wapbl_cache_sync(wl, "2");
2098
2099 /*
2100 * If the generation number was zero, write it out a second time.
2101 	 * This handles initialization and generation number rollover.
2102 */
2103 if (wc->wc_generation++ == 0) {
2104 error = wapbl_write_commit(wl, head, tail);
2105 /*
2106 		 * This panic can be removed once we do the zeroing
2107 		 * mentioned above and are certain to roll back the
2108 		 * generation number on failure.
2109 */
2110 if (error)
2111 panic("wapbl_write_commit: error writing duplicate "
2112 "log header: %d\n", error);
2113 }
2114 return 0;
2115 }
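
/*
 * Ordering recap for the commit path (restating the code above):
 *
 *	payload writes -> wapbl_buffered_flush() -> cache sync "1"
 *	-> commit header write (alternating sector, wc_generation % 2)
 *	-> wapbl_buffered_flush() -> cache sync "2"
 *
 * Sync "1" keeps the header from reaching the platter before its
 * payload; sync "2" keeps the in-place metadata writeback from
 * passing the header.
 */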
2116
2117 /* Write the locked bufs to the log; the new offset is returned via offp */
2118 static int
2119 wapbl_write_blocks(struct wapbl *wl, off_t *offp)
2120 {
2121 struct wapbl_wc_blocklist *wc =
2122 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
2123 int blocklen = 1<<wl->wl_log_dev_bshift;
2124 int bph;
2125 struct buf *bp;
2126 off_t off = *offp;
2127 int error;
2128 size_t padding;
2129
2130 KASSERT(rw_write_held(&wl->wl_rwlock));
2131
2132 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
2133 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
2134
2135 bp = LIST_FIRST(&wl->wl_bufs);
2136
2137 while (bp) {
2138 int cnt;
2139 struct buf *obp = bp;
2140
2141 KASSERT(bp->b_flags & B_LOCKED);
2142
2143 wc->wc_type = WAPBL_WC_BLOCKS;
2144 wc->wc_len = blocklen;
2145 wc->wc_blkcount = 0;
2146 while (bp && (wc->wc_blkcount < bph)) {
2147 /*
2148 * Make sure all the physical block numbers are up to
2149 * date. If this is not always true on a given
2150 * filesystem, then VOP_BMAP must be called. We
2151 * could call VOP_BMAP here, or else in the filesystem
2152 * specific flush callback, although neither of those
2153 * solutions allow us to take the vnode lock. If a
2154 			 * filesystem requires taking the vnode lock
2155 * to call VOP_BMAP, then we can probably do it in
2156 * bwrite when the vnode lock should already be held
2157 * by the invoking code.
2158 */
2159 KASSERT((bp->b_vp->v_type == VBLK) ||
2160 (bp->b_blkno != bp->b_lblkno));
2161 KASSERT(bp->b_blkno > 0);
2162
2163 wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno;
2164 wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount;
2165 wc->wc_len += bp->b_bcount;
2166 wc->wc_blkcount++;
2167 bp = LIST_NEXT(bp, b_wapbllist);
2168 }
2169 if (wc->wc_len % blocklen != 0) {
2170 padding = blocklen - wc->wc_len % blocklen;
2171 wc->wc_len += padding;
2172 } else {
2173 padding = 0;
2174 }
2175
2176 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2177 ("wapbl_write_blocks: len = %u (padding %zu) off = %"PRIdMAX"\n",
2178 wc->wc_len, padding, (intmax_t)off));
2179
2180 error = wapbl_circ_write(wl, wc, blocklen, &off);
2181 if (error)
2182 return error;
2183 bp = obp;
2184 cnt = 0;
2185 while (bp && (cnt++ < bph)) {
2186 error = wapbl_circ_write(wl, bp->b_data,
2187 bp->b_bcount, &off);
2188 if (error)
2189 return error;
2190 bp = LIST_NEXT(bp, b_wapbllist);
2191 }
2192 if (padding) {
2193 void *zero;
2194
2195 zero = wapbl_alloc(padding);
2196 memset(zero, 0, padding);
2197 error = wapbl_circ_write(wl, zero, padding, &off);
2198 wapbl_free(zero, padding);
2199 if (error)
2200 return error;
2201 }
2202 }
2203 *offp = off;
2204 return 0;
2205 }
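
/*
 * Per-iteration log layout produced above (illustration):
 *
 *	[ WAPBL_WC_BLOCKS header: one wc_blocks[] entry per buf ]
 *	[ buf 0 data ][ buf 1 data ] ... [ zero padding to blocklen ]
 *
 * wc_len covers the header block plus all payload and padding, so
 * replay can step over the whole record with wapbl_circ_advance().
 */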
2206
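/*
 * Write the pending deallocation records (revocations) to the log;
 * the new log offset is returned via offp.
 */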
2207 static int
2208 wapbl_write_revocations(struct wapbl *wl, off_t *offp)
2209 {
2210 struct wapbl_wc_blocklist *wc =
2211 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
2212 int i;
2213 int blocklen = 1<<wl->wl_log_dev_bshift;
2214 int bph;
2215 off_t off = *offp;
2216 int error;
2217
2218 if (wl->wl_dealloccnt == 0)
2219 return 0;
2220
2221 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
2222 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
2223
2224 i = 0;
2225 while (i < wl->wl_dealloccnt) {
2226 wc->wc_type = WAPBL_WC_REVOCATIONS;
2227 wc->wc_len = blocklen;
2228 wc->wc_blkcount = 0;
2229 while ((i < wl->wl_dealloccnt) && (wc->wc_blkcount < bph)) {
2230 wc->wc_blocks[wc->wc_blkcount].wc_daddr =
2231 wl->wl_deallocblks[i];
2232 wc->wc_blocks[wc->wc_blkcount].wc_dlen =
2233 wl->wl_dealloclens[i];
2234 wc->wc_blkcount++;
2235 i++;
2236 }
2237 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2238 ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n",
2239 wc->wc_len, (intmax_t)off));
2240 error = wapbl_circ_write(wl, wc, blocklen, &off);
2241 if (error)
2242 return error;
2243 }
2244 *offp = off;
2245 return 0;
2246 }
2247
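/*
 * Write the registered inode list to the log; the new log offset is
 * returned via offp.
 */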
2248 static int
2249 wapbl_write_inodes(struct wapbl *wl, off_t *offp)
2250 {
2251 struct wapbl_wc_inodelist *wc =
2252 (struct wapbl_wc_inodelist *)wl->wl_wc_scratch;
2253 int i;
2254 int blocklen = 1 << wl->wl_log_dev_bshift;
2255 off_t off = *offp;
2256 int error;
2257
2258 struct wapbl_ino_head *wih;
2259 struct wapbl_ino *wi;
2260 int iph;
2261
2262 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
2263 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
2264
2265 i = 0;
2266 wih = &wl->wl_inohash[0];
2267 	wi = NULL;
2268 do {
2269 wc->wc_type = WAPBL_WC_INODES;
2270 wc->wc_len = blocklen;
2271 wc->wc_inocnt = 0;
2272 wc->wc_clear = (i == 0);
2273 while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) {
2274 while (!wi) {
2275 KASSERT((wih - &wl->wl_inohash[0])
2276 <= wl->wl_inohashmask);
2277 wi = LIST_FIRST(wih++);
2278 }
2279 wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino;
2280 wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode;
2281 wc->wc_inocnt++;
2282 i++;
2283 wi = LIST_NEXT(wi, wi_hash);
2284 }
2285 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2286 ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n",
2287 wc->wc_len, (intmax_t)off));
2288 error = wapbl_circ_write(wl, wc, blocklen, &off);
2289 if (error)
2290 return error;
2291 } while (i < wl->wl_inohashcnt);
2292
2293 *offp = off;
2294 return 0;
2295 }
2296
2297 #endif /* _KERNEL */
2298
2299 /****************************************************************/
2300
2301 struct wapbl_blk {
2302 LIST_ENTRY(wapbl_blk) wb_hash;
2303 daddr_t wb_blk;
2304 off_t wb_off; /* Offset of this block in the log */
2305 };
2306 #define WAPBL_BLKPOOL_MIN 83
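
/*
 * The replay hashtable maps a device block number to the log offset
 * of its most recent copy: wapbl_blkhash_ins() overwrites wb_off on a
 * duplicate insert, so later journal entries win, and
 * wapbl_blkhash_rem() implements revocation by forgetting the block.
 */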
2307
2308 static void
2309 wapbl_blkhash_init(struct wapbl_replay *wr, u_int size)
2310 {
2311 if (size < WAPBL_BLKPOOL_MIN)
2312 size = WAPBL_BLKPOOL_MIN;
2313 	KASSERT(wr->wr_blkhash == NULL);
2314 #ifdef _KERNEL
2315 wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask);
2316 #else /* ! _KERNEL */
2317 /* Manually implement hashinit */
2318 {
2319 unsigned long i, hashsize;
2320 for (hashsize = 1; hashsize < size; hashsize <<= 1)
2321 continue;
2322 wr->wr_blkhash = wapbl_alloc(hashsize * sizeof(*wr->wr_blkhash));
2323 for (i = 0; i < hashsize; i++)
2324 LIST_INIT(&wr->wr_blkhash[i]);
2325 wr->wr_blkhashmask = hashsize - 1;
2326 }
2327 #endif /* ! _KERNEL */
2328 }
2329
2330 static void
2331 wapbl_blkhash_free(struct wapbl_replay *wr)
2332 {
2333 KASSERT(wr->wr_blkhashcnt == 0);
2334 #ifdef _KERNEL
2335 hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask);
2336 #else /* ! _KERNEL */
2337 wapbl_free(wr->wr_blkhash,
2338 (wr->wr_blkhashmask + 1) * sizeof(*wr->wr_blkhash));
2339 #endif /* ! _KERNEL */
2340 }
2341
2342 static struct wapbl_blk *
2343 wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk)
2344 {
2345 struct wapbl_blk_head *wbh;
2346 struct wapbl_blk *wb;
2347 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2348 LIST_FOREACH(wb, wbh, wb_hash) {
2349 if (blk == wb->wb_blk)
2350 return wb;
2351 }
2352 	return NULL;
2353 }
2354
2355 static void
2356 wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off)
2357 {
2358 struct wapbl_blk_head *wbh;
2359 struct wapbl_blk *wb;
2360 wb = wapbl_blkhash_get(wr, blk);
2361 if (wb) {
2362 KASSERT(wb->wb_blk == blk);
2363 wb->wb_off = off;
2364 } else {
2365 wb = wapbl_alloc(sizeof(*wb));
2366 wb->wb_blk = blk;
2367 wb->wb_off = off;
2368 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2369 LIST_INSERT_HEAD(wbh, wb, wb_hash);
2370 wr->wr_blkhashcnt++;
2371 }
2372 }
2373
2374 static void
2375 wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk)
2376 {
2377 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2378 if (wb) {
2379 KASSERT(wr->wr_blkhashcnt > 0);
2380 wr->wr_blkhashcnt--;
2381 LIST_REMOVE(wb, wb_hash);
2382 wapbl_free(wb, sizeof(*wb));
2383 }
2384 }
2385
2386 static void
2387 wapbl_blkhash_clear(struct wapbl_replay *wr)
2388 {
2389 unsigned long i;
2390 for (i = 0; i <= wr->wr_blkhashmask; i++) {
2391 struct wapbl_blk *wb;
2392
2393 while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) {
2394 KASSERT(wr->wr_blkhashcnt > 0);
2395 wr->wr_blkhashcnt--;
2396 LIST_REMOVE(wb, wb_hash);
2397 wapbl_free(wb, sizeof(*wb));
2398 }
2399 }
2400 KASSERT(wr->wr_blkhashcnt == 0);
2401 }
2402
2403 /****************************************************************/
2404
2405 static int
2406 wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp)
2407 {
2408 size_t slen;
2409 off_t off = *offp;
2410 int error;
2411 daddr_t pbn;
2412
2413 KASSERT(((len >> wr->wr_log_dev_bshift) <<
2414 wr->wr_log_dev_bshift) == len);
2415
2416 if (off < wr->wr_circ_off)
2417 off = wr->wr_circ_off;
2418 slen = wr->wr_circ_off + wr->wr_circ_size - off;
2419 if (slen < len) {
2420 pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
2421 #ifdef _KERNEL
2422 pbn = btodb(pbn << wr->wr_log_dev_bshift);
2423 #endif
2424 error = wapbl_read(data, slen, wr->wr_devvp, pbn);
2425 if (error)
2426 return error;
2427 data = (uint8_t *)data + slen;
2428 len -= slen;
2429 off = wr->wr_circ_off;
2430 }
2431 pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
2432 #ifdef _KERNEL
2433 pbn = btodb(pbn << wr->wr_log_dev_bshift);
2434 #endif
2435 error = wapbl_read(data, len, wr->wr_devvp, pbn);
2436 if (error)
2437 return error;
2438 off += len;
2439 if (off >= wr->wr_circ_off + wr->wr_circ_size)
2440 off = wr->wr_circ_off;
2441 *offp = off;
2442 return 0;
2443 }
2444
2445 static void
2446 wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp)
2447 {
2448 size_t slen;
2449 off_t off = *offp;
2450
2451 KASSERT(((len >> wr->wr_log_dev_bshift) <<
2452 wr->wr_log_dev_bshift) == len);
2453
2454 if (off < wr->wr_circ_off)
2455 off = wr->wr_circ_off;
2456 slen = wr->wr_circ_off + wr->wr_circ_size - off;
2457 if (slen < len) {
2458 len -= slen;
2459 off = wr->wr_circ_off;
2460 }
2461 off += len;
2462 if (off >= wr->wr_circ_off + wr->wr_circ_size)
2463 off = wr->wr_circ_off;
2464 *offp = off;
2465 }
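
/*
 * Worked example with illustrative numbers: with wr_circ_off = 1024
 * and wr_circ_size = 8192 the usable window is [1024, 9216).
 * Advancing len = 2048 from off = 8704 first consumes the slen = 512
 * bytes up to the end of the window, wraps to 1024, and lands at
 * 1024 + 1536 = 2560.
 */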
2466
2467 /****************************************************************/
2468
2469 int
2470 wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp,
2471 daddr_t off, size_t count, size_t blksize)
2472 {
2473 struct wapbl_replay *wr;
2474 int error;
2475 struct vnode *devvp;
2476 daddr_t logpbn;
2477 uint8_t *scratch;
2478 struct wapbl_wc_header *wch;
2479 struct wapbl_wc_header *wch2;
2480 /* Use this until we read the actual log header */
2481 int log_dev_bshift = ilog2(blksize);
2482 size_t used;
2483 daddr_t pbn;
2484
2485 WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
2486 ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n",
2487 vp, off, count, blksize));
2488
2489 if (off < 0)
2490 return EINVAL;
2491
2492 if (blksize < DEV_BSIZE)
2493 return EINVAL;
2494 if (blksize % DEV_BSIZE)
2495 return EINVAL;
2496
2497 #ifdef _KERNEL
2498 #if 0
2499 /* XXX vp->v_size isn't reliably set for VBLK devices,
2500 * especially root. However, we might still want to verify
2501 * that the full load is readable */
2502 if ((off + count) * blksize > vp->v_size)
2503 return EINVAL;
2504 #endif
2505 if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) {
2506 return error;
2507 }
2508 #else /* ! _KERNEL */
2509 devvp = vp;
2510 logpbn = off;
2511 #endif /* ! _KERNEL */
2512
2513 scratch = wapbl_alloc(MAXBSIZE);
2514
2515 pbn = logpbn;
2516 #ifdef _KERNEL
2517 pbn = btodb(pbn << log_dev_bshift);
2518 #endif
2519 error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, pbn);
2520 if (error)
2521 goto errout;
2522
2523 wch = (struct wapbl_wc_header *)scratch;
2524 wch2 =
2525 (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift));
2526 /* XXX verify checksums and magic numbers */
2527 if (wch->wc_type != WAPBL_WC_HEADER) {
2528 printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type);
2529 error = EFTYPE;
2530 goto errout;
2531 }
2532
2533 if (wch2->wc_generation > wch->wc_generation)
2534 wch = wch2;
2535
2536 wr = wapbl_calloc(1, sizeof(*wr));
2537
2538 wr->wr_logvp = vp;
2539 wr->wr_devvp = devvp;
2540 wr->wr_logpbn = logpbn;
2541
2542 wr->wr_scratch = scratch;
2543
2544 wr->wr_log_dev_bshift = wch->wc_log_dev_bshift;
2545 wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift;
2546 wr->wr_circ_off = wch->wc_circ_off;
2547 wr->wr_circ_size = wch->wc_circ_size;
2548 wr->wr_generation = wch->wc_generation;
2549
2550 used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail);
2551
2552 WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
2553 ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64
2554 " len=%"PRId64" used=%zu\n",
2555 wch->wc_head, wch->wc_tail, wch->wc_circ_off,
2556 wch->wc_circ_size, used));
2557
2558 wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift));
2559
2560 error = wapbl_replay_process(wr, wch->wc_head, wch->wc_tail);
2561 if (error) {
2562 wapbl_replay_stop(wr);
2563 wapbl_replay_free(wr);
2564 return error;
2565 }
2566
2567 *wrp = wr;
2568 return 0;
2569
2570 errout:
2571 wapbl_free(scratch, MAXBSIZE);
2572 return error;
2573 }
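
/*
 * Illustration only: a typical mount-time sequence for a read/write
 * mount.  Error handling is elided, and devvp, fsdevvp, logstart,
 * logcount and blksize are stand-ins for the caller's state; the
 * real file system glue lives outside this file.
 */
#if 0
	struct wapbl_replay *wr;
	int error;

	error = wapbl_replay_start(&wr, devvp, logstart, logcount, blksize);
	if (error == 0) {
		error = wapbl_replay_write(wr, fsdevvp);  /* roll forward */
		wapbl_replay_stop(wr);
		wapbl_replay_free(wr);
	}
#endif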
2574
2575 void
2576 wapbl_replay_stop(struct wapbl_replay *wr)
2577 {
2578
2579 if (!wapbl_replay_isopen(wr))
2580 return;
2581
2582 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n"));
2583
2584 wapbl_free(wr->wr_scratch, MAXBSIZE);
2585 wr->wr_scratch = NULL;
2586
2587 wr->wr_logvp = NULL;
2588
2589 wapbl_blkhash_clear(wr);
2590 wapbl_blkhash_free(wr);
2591 }
2592
2593 void
2594 wapbl_replay_free(struct wapbl_replay *wr)
2595 {
2596
2597 KDASSERT(!wapbl_replay_isopen(wr));
2598
2599 if (wr->wr_inodes)
2600 wapbl_free(wr->wr_inodes,
2601 wr->wr_inodescnt * sizeof(wr->wr_inodes[0]));
2602 wapbl_free(wr, sizeof(*wr));
2603 }
2604
2605 #ifdef _KERNEL
2606 int
2607 wapbl_replay_isopen1(struct wapbl_replay *wr)
2608 {
2609
2610 return wapbl_replay_isopen(wr);
2611 }
2612 #endif
2613
2614 static void
2615 wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp)
2616 {
2617 struct wapbl_wc_blocklist *wc =
2618 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2619 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2620 int i, j, n;
2621
2622 for (i = 0; i < wc->wc_blkcount; i++) {
2623 /*
2624 * Enter each physical block into the hashtable independently.
2625 */
2626 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
2627 for (j = 0; j < n; j++) {
2628 wapbl_blkhash_ins(wr, wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen),
2629 *offp);
2630 wapbl_circ_advance(wr, fsblklen, offp);
2631 }
2632 }
2633 }
2634
2635 static void
2636 wapbl_replay_process_revocations(struct wapbl_replay *wr)
2637 {
2638 struct wapbl_wc_blocklist *wc =
2639 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2640 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2641 int i, j, n;
2642
2643 for (i = 0; i < wc->wc_blkcount; i++) {
2644 /*
2645 * Remove any blocks found from the hashtable.
2646 */
2647 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
2648 for (j = 0; j < n; j++)
2649 wapbl_blkhash_rem(wr, wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
2650 }
2651 }
2652
2653 static void
2654 wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff, off_t newoff)
2655 {
2656 struct wapbl_wc_inodelist *wc =
2657 (struct wapbl_wc_inodelist *)wr->wr_scratch;
2658 void *new_inodes;
2659 const size_t oldsize = wr->wr_inodescnt * sizeof(wr->wr_inodes[0]);
2660
2661 KASSERT(sizeof(wr->wr_inodes[0]) == sizeof(wc->wc_inodes[0]));
2662
2663 /*
2664 	 * Keep track of where we found this so the location won't be
2665 * overwritten.
2666 */
2667 if (wc->wc_clear) {
2668 wr->wr_inodestail = oldoff;
2669 wr->wr_inodescnt = 0;
2670 if (wr->wr_inodes != NULL) {
2671 wapbl_free(wr->wr_inodes, oldsize);
2672 wr->wr_inodes = NULL;
2673 }
2674 }
2675 wr->wr_inodeshead = newoff;
2676 if (wc->wc_inocnt == 0)
2677 return;
2678
2679 new_inodes = wapbl_alloc((wr->wr_inodescnt + wc->wc_inocnt) *
2680 sizeof(wr->wr_inodes[0]));
2681 if (wr->wr_inodes != NULL) {
2682 memcpy(new_inodes, wr->wr_inodes, oldsize);
2683 wapbl_free(wr->wr_inodes, oldsize);
2684 }
2685 wr->wr_inodes = new_inodes;
2686 memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes,
2687 wc->wc_inocnt * sizeof(wr->wr_inodes[0]));
2688 wr->wr_inodescnt += wc->wc_inocnt;
2689 }
2690
2691 static int
2692 wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail)
2693 {
2694 off_t off;
2695 int error;
2696
2697 int logblklen = 1 << wr->wr_log_dev_bshift;
2698
2699 wapbl_blkhash_clear(wr);
2700
2701 off = tail;
2702 while (off != head) {
2703 struct wapbl_wc_null *wcn;
2704 off_t saveoff = off;
2705 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
2706 if (error)
2707 goto errout;
2708 wcn = (struct wapbl_wc_null *)wr->wr_scratch;
2709 switch (wcn->wc_type) {
2710 case WAPBL_WC_BLOCKS:
2711 wapbl_replay_process_blocks(wr, &off);
2712 break;
2713
2714 case WAPBL_WC_REVOCATIONS:
2715 wapbl_replay_process_revocations(wr);
2716 break;
2717
2718 case WAPBL_WC_INODES:
2719 wapbl_replay_process_inodes(wr, saveoff, off);
2720 break;
2721
2722 default:
2723 printf("Unrecognized wapbl type: 0x%08x\n",
2724 wcn->wc_type);
2725 error = EFTYPE;
2726 goto errout;
2727 }
2728 wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
2729 if (off != saveoff) {
2730 printf("wapbl_replay: corrupted records\n");
2731 error = EFTYPE;
2732 goto errout;
2733 }
2734 }
2735 return 0;
2736
2737 errout:
2738 wapbl_blkhash_clear(wr);
2739 return error;
2740 }
2741
2742 #if 0
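/*
 * XXX This disabled verifier is stale: it still dereferences `wch',
 * which is not defined in this scope (the header fields now live in
 * struct wapbl_replay as wr_*).
 */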
2743 int
2744 wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp)
2745 {
2746 off_t off;
2747 int mismatchcnt = 0;
2748 int logblklen = 1 << wr->wr_log_dev_bshift;
2749 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2750 void *scratch1 = wapbl_alloc(MAXBSIZE);
2751 void *scratch2 = wapbl_alloc(MAXBSIZE);
2752 int error = 0;
2753
2754 KDASSERT(wapbl_replay_isopen(wr));
2755
2756 off = wch->wc_tail;
2757 while (off != wch->wc_head) {
2758 struct wapbl_wc_null *wcn;
2759 #ifdef DEBUG
2760 off_t saveoff = off;
2761 #endif
2762 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
2763 if (error)
2764 goto out;
2765 wcn = (struct wapbl_wc_null *)wr->wr_scratch;
2766 switch (wcn->wc_type) {
2767 case WAPBL_WC_BLOCKS:
2768 {
2769 struct wapbl_wc_blocklist *wc =
2770 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2771 int i;
2772 for (i = 0; i < wc->wc_blkcount; i++) {
2773 int foundcnt = 0;
2774 int dirtycnt = 0;
2775 int j, n;
2776 /*
2777 				 * Check each physical block against the
2778 				 * hashtable independently.
2779 */
2780 n = wc->wc_blocks[i].wc_dlen >>
2781 wch->wc_fs_dev_bshift;
2782 for (j = 0; j < n; j++) {
2783 struct wapbl_blk *wb =
2784 wapbl_blkhash_get(wr,
2785 wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
2786 if (wb && (wb->wb_off == off)) {
2787 foundcnt++;
2788 error =
2789 wapbl_circ_read(wr,
2790 scratch1, fsblklen,
2791 &off);
2792 if (error)
2793 goto out;
2794 error =
2795 wapbl_read(scratch2,
2796 fsblklen, fsdevvp,
2797 wb->wb_blk);
2798 if (error)
2799 goto out;
2800 if (memcmp(scratch1,
2801 scratch2,
2802 fsblklen)) {
2803 printf(
2804 "wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n",
2805 wb->wb_blk, (intmax_t)off);
2806 dirtycnt++;
2807 mismatchcnt++;
2808 }
2809 } else {
2810 wapbl_circ_advance(wr,
2811 fsblklen, &off);
2812 }
2813 }
2814 #if 0
2815 /*
2816 * If all of the blocks in an entry
2817 * are clean, then remove all of its
2818 * blocks from the hashtable since they
2819 * never will need replay.
2820 */
2821 if ((foundcnt != 0) &&
2822 (dirtycnt == 0)) {
2823 off = saveoff;
2824 wapbl_circ_advance(wr,
2825 logblklen, &off);
2826 for (j = 0; j < n; j++) {
2827 struct wapbl_blk *wb =
2828 wapbl_blkhash_get(wr,
2829 wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
2830 if (wb &&
2831 (wb->wb_off == off)) {
2832 wapbl_blkhash_rem(wr, wb->wb_blk);
2833 }
2834 wapbl_circ_advance(wr,
2835 fsblklen, &off);
2836 }
2837 }
2838 #endif
2839 }
2840 }
2841 break;
2842 case WAPBL_WC_REVOCATIONS:
2843 case WAPBL_WC_INODES:
2844 break;
2845 default:
2846 KASSERT(0);
2847 }
2848 #ifdef DEBUG
2849 wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
2850 KASSERT(off == saveoff);
2851 #endif
2852 }
2853 out:
2854 wapbl_free(scratch1, MAXBSIZE);
2855 wapbl_free(scratch2, MAXBSIZE);
2856 if (!error && mismatchcnt)
2857 error = EFTYPE;
2858 return error;
2859 }
2860 #endif
2861
2862 int
2863 wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp)
2864 {
2865 struct wapbl_blk *wb;
2866 size_t i;
2867 off_t off;
2868 void *scratch;
2869 int error = 0;
2870 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2871
2872 KDASSERT(wapbl_replay_isopen(wr));
2873
2874 scratch = wapbl_alloc(MAXBSIZE);
2875
2876 for (i = 0; i <= wr->wr_blkhashmask; ++i) {
2877 LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) {
2878 off = wb->wb_off;
2879 error = wapbl_circ_read(wr, scratch, fsblklen, &off);
2880 if (error)
2881 break;
2882 error = wapbl_write(scratch, fsblklen, fsdevvp,
2883 wb->wb_blk);
2884 if (error)
2885 break;
2886 }
2887 }
2888
2889 wapbl_free(scratch, MAXBSIZE);
2890 return error;
2891 }
2892
2893 int
2894 wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len)
2895 {
2896 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2897
2898 KDASSERT(wapbl_replay_isopen(wr));
2899 KASSERT((len % fsblklen) == 0);
2900
2901 	for (; len != 0; len -= fsblklen, blk++) {
2902 		struct wapbl_blk *wb =
2903 		    wapbl_blkhash_get(wr, blk);
2904 		if (wb)
2905 			return 1;
2906 	}
2907 return 0;
2908 }
2909
2910 int
2911 wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len)
2912 {
2913 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2914
2915 KDASSERT(wapbl_replay_isopen(wr));
2916
2917 KASSERT((len % fsblklen) == 0);
2918
2919 while (len != 0) {
2920 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2921 if (wb) {
2922 off_t off = wb->wb_off;
2923 int error;
2924 error = wapbl_circ_read(wr, data, fsblklen, &off);
2925 if (error)
2926 return error;
2927 }
2928 data = (uint8_t *)data + fsblklen;
2929 len -= fsblklen;
2930 blk++;
2931 }
2932 return 0;
2933 }
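
/*
 * Illustration only: a not-yet-replayed mount can overlay journalled
 * blocks on top of what it reads from the file system device.  The
 * names wr, fsdevvp, data, blk and len are stand-ins, and the
 * daddr_t unit conversion for the device read (pbn) is elided.
 */
#if 0
	error = wapbl_read(data, len, fsdevvp, pbn);
	if (error == 0 && wapbl_replay_can_read(wr, blk, len))
		error = wapbl_replay_read(wr, data, blk, len);
#endif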
2934
2935 #ifdef _KERNEL
2936 /*
2937  * This is not really a module now, but maybe on its way to
2938  * becoming one some day.
2939 */
2940 MODULE(MODULE_CLASS_VFS, wapbl, NULL);
2941
2942 static int
2943 wapbl_modcmd(modcmd_t cmd, void *arg)
2944 {
2945
2946 switch (cmd) {
2947 case MODULE_CMD_INIT:
2948 wapbl_init();
2949 return 0;
2950 case MODULE_CMD_FINI:
2951 #ifdef notyet
2952 return wapbl_fini(true);
2953 #endif
2954 return EOPNOTSUPP;
2955 default:
2956 return ENOTTY;
2957 }
2958 }
2959 #endif /* _KERNEL */
2960