/*	$NetBSD: vfs_wapbl.c,v 1.106 2020/03/16 21:20:10 pgoyette Exp $	*/

/*-
 * Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * This implements file-system-independent write-ahead filesystem logging.
 */

#define WAPBL_INTERNAL

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.106 2020/03/16 21:20:10 pgoyette Exp $");

#include <sys/param.h>
#include <sys/bitops.h>
#include <sys/time.h>
#include <sys/wapbl.h>
#include <sys/wapbl_replay.h>

#ifdef _KERNEL

#include <sys/atomic.h>
#include <sys/conf.h>
#include <sys/evcnt.h>
#include <sys/file.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/vnode.h>

#include <miscfs/specfs/specdev.h>

#define	wapbl_alloc(s)		kmem_alloc((s), KM_SLEEP)
#define	wapbl_free(a, s)	kmem_free((a), (s))
#define	wapbl_calloc(n, s)	kmem_zalloc((n)*(s), KM_SLEEP)

static int wapbl_flush_disk_cache = 1;
static int wapbl_verbose_commit = 0;
static int wapbl_allow_dpofua = 0;	/* switched off by default for now */
static int wapbl_journal_iobufs = 4;

static inline size_t wapbl_space_free(size_t, off_t, off_t);

#else /* !_KERNEL */

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define	KDASSERT(x)		assert(x)
#define	KASSERT(x)		assert(x)
#define	wapbl_alloc(s)		malloc(s)
#define	wapbl_free(a, s)	free(a)
#define	wapbl_calloc(n, s)	calloc((n), (s))

#endif /* !_KERNEL */

/*
 * INTERNAL DATA STRUCTURES
 */

/*
 * This structure holds per-mount log information.
 *
 * Legend:	a = atomic access only
 *		r = read-only after init
 *		l = rwlock held
 *		m = mutex held
 *		lm = rwlock held writing or mutex held
 *		u = unlocked access ok
 *		b = bufcache_lock held
 */
LIST_HEAD(wapbl_ino_head, wapbl_ino);
struct wapbl {
	struct vnode *wl_logvp;	/* r:	log here */
	struct vnode *wl_devvp;	/* r:	log on this device */
	struct mount *wl_mount;	/* r:	mountpoint wl is associated with */
	daddr_t wl_logpbn;	/* r:	Physical block number of start of log */
	int wl_log_dev_bshift;	/* r:	logarithm of device block size of log
				   device */
	int wl_fs_dev_bshift;	/* r:	logarithm of device block size of
				   filesystem device */

	unsigned wl_lock_count;	/* m:	Count of transactions in progress */

	size_t wl_circ_size;	/* r:	Number of bytes in buffer of log */
	size_t wl_circ_off;	/* r:	Number of bytes reserved at start */

	size_t wl_bufcount_max;	/* r:	Number of buffers reserved for log */
	size_t wl_bufbytes_max;	/* r:	Number of buf bytes reserved for log */

	off_t wl_head;		/* l:	Byte offset of log head */
	off_t wl_tail;		/* l:	Byte offset of log tail */
	/*
	 * WAPBL log layout, stored on wl_devvp at wl_logpbn:
	 *
	 *  ___________________ wl_circ_size __________________
	 * /                                                   \
	 * +---------+---------+-------+--------------+--------+
	 * [ commit0 | commit1 | CCWCW | EEEEEEEEEEEE | CCCWCW ]
	 * +---------+---------+-------+--------------+--------+
	 *       wl_circ_off --^       ^-- wl_head    ^-- wl_tail
	 *
	 * commit0 and commit1 are commit headers.  A commit header has
	 * a generation number, indicating which of the two headers is
	 * more recent, and an assignment of head and tail pointers.
	 * The rest is a circular queue of log records, starting at
	 * the byte offset wl_circ_off.
	 *
	 * E marks empty space for records.
	 * W marks records for block writes issued but waiting.
	 * C marks completed records.
	 *
	 * wapbl_flush writes new records to empty `E' spaces after
	 * wl_head from the current transaction in memory.
	 *
	 * wapbl_truncate advances wl_tail past any completed `C'
	 * records, freeing them up for use.
	 *
	 * head == tail == 0 means log is empty.
	 * head == tail != 0 means log is full.
	 *
	 * See assertions in wapbl_advance() for other boundary
	 * conditions.
	 *
	 * Only wapbl_flush moves the head, except when wapbl_truncate
	 * sets it to 0 to indicate that the log is empty.
	 *
	 * Only wapbl_truncate moves the tail, except when wapbl_flush
	 * sets it to wl_circ_off to indicate that the log is full.
	 */
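
	/*
	 * Worked example (illustrative numbers only, not from the
	 * layout above): with wl_log_dev_bshift = 9 the two commit
	 * headers occupy wl_circ_off = 2 << 9 = 1024 bytes, so record
	 * offsets live in [wl_circ_off, wl_circ_off + wl_circ_size).
	 * With wl_circ_size = 8192, head = 5120 and tail = 3072
	 * describe 2048 bytes of records in flight; head = tail = 0
	 * would instead mean an empty log.
	 */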

	struct wapbl_wc_header *wl_wc_header;	/* l	*/
	void *wl_wc_scratch;	/* l:	scratch space (XXX: why?!?) */

	kmutex_t wl_mtx;	/* u:	short-term lock */
	krwlock_t wl_rwlock;	/* u:	File system transaction lock */

	/*
	 * Must be held while accessing
	 * wl_count or wl_bufs or head or tail
	 */

#if _KERNEL
	/*
	 * Callback called from within the flush routine to flush any extra
	 * bits.  Note that flush may be skipped without calling this if
	 * there are no outstanding buffers in the transaction.
	 */
	wapbl_flush_fn_t wl_flush;	/* r	*/
	wapbl_flush_fn_t wl_flush_abort;/* r	*/

	/* Event counters */
	char wl_ev_group[EVCNT_STRING_MAX];	/* r	*/
	struct evcnt wl_ev_commit;		/* l	*/
	struct evcnt wl_ev_journalwrite;	/* l	*/
	struct evcnt wl_ev_jbufs_bio_nowait;	/* l	*/
	struct evcnt wl_ev_metawrite;		/* lm	*/
	struct evcnt wl_ev_cacheflush;		/* l	*/
#endif

	size_t wl_bufbytes;	/* m:	Byte count of pages in wl_bufs */
	size_t wl_bufcount;	/* m:	Count of buffers in wl_bufs */
	size_t wl_bcount;	/* m:	Total bcount of wl_bufs */

	TAILQ_HEAD(, buf) wl_bufs; /* m:	Buffers in current transaction */

	kcondvar_t wl_reclaimable_cv;	/* m (obviously) */
	size_t wl_reclaimable_bytes;	/* m:	Amount of space available for
					   reclamation by truncate */
	int wl_error_count;	/* m:	# of wl_entries with errors */
	size_t wl_reserved_bytes; /* never truncate log smaller than this */

#ifdef WAPBL_DEBUG_BUFBYTES
	size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */
#endif

#if _KERNEL
	int wl_brperjblock;	/* r	Block records per journal block */
#endif

	TAILQ_HEAD(, wapbl_dealloc) wl_dealloclist;	/* lm:	list head */
	int wl_dealloccnt;				/* lm:	total count */
	int wl_dealloclim;				/* r:	max count */

	/* hashtable of inode numbers for allocated but unlinked inodes */
	/* synch ??? */
	struct wapbl_ino_head *wl_inohash;
	u_long wl_inohashmask;
	int wl_inohashcnt;

	SIMPLEQ_HEAD(, wapbl_entry) wl_entries;	/* On disk transaction
						   accounting */

	/* buffers for wapbl_buffered_write() */
	TAILQ_HEAD(, buf) wl_iobufs;		/* l: Free or filling bufs */
	TAILQ_HEAD(, buf) wl_iobufs_busy;	/* l: In-transit bufs */

	int wl_dkcache;		/* r:	disk cache flags */
#define WAPBL_USE_FUA(wl)	\
		(wapbl_allow_dpofua && ISSET((wl)->wl_dkcache, DKCACHE_FUA))
#define WAPBL_JFLAGS(wl)	\
		(WAPBL_USE_FUA(wl) ? (wl)->wl_jwrite_flags : 0)
#define WAPBL_JDATA_FLAGS(wl)	\
		(WAPBL_JFLAGS(wl) & B_MEDIA_DPO)	/* only DPO */
	int wl_jwrite_flags;	/* r:	journal write flags */
};

#ifdef WAPBL_DEBUG_PRINT
int wapbl_debug_print = WAPBL_DEBUG_PRINT;
#endif

/****************************************************************/
#ifdef _KERNEL

#ifdef WAPBL_DEBUG
struct wapbl *wapbl_debug_wl;
#endif

static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail);
static int wapbl_write_blocks(struct wapbl *wl, off_t *offp);
static int wapbl_write_revocations(struct wapbl *wl, off_t *offp);
static int wapbl_write_inodes(struct wapbl *wl, off_t *offp);
#endif /* _KERNEL */

static int wapbl_replay_process(struct wapbl_replay *wr, off_t, off_t);

static inline size_t wapbl_space_used(size_t avail, off_t head,
	off_t tail);

#ifdef _KERNEL

static struct pool wapbl_entry_pool;
static struct pool wapbl_dealloc_pool;

#define	WAPBL_INODETRK_SIZE 83
static int wapbl_ino_pool_refcount;
static struct pool wapbl_ino_pool;
struct wapbl_ino {
	LIST_ENTRY(wapbl_ino) wi_hash;
	ino_t wi_ino;
	mode_t wi_mode;
};

static void wapbl_inodetrk_init(struct wapbl *wl, u_int size);
static void wapbl_inodetrk_free(struct wapbl *wl);
static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino);

static size_t wapbl_transaction_len(struct wapbl *wl);
static inline size_t wapbl_transaction_inodes_len(struct wapbl *wl);

static void wapbl_deallocation_free(struct wapbl *, struct wapbl_dealloc *,
	bool);

static void wapbl_evcnt_init(struct wapbl *);
static void wapbl_evcnt_free(struct wapbl *);

static void wapbl_dkcache_init(struct wapbl *);

#if 0
int wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
#endif

static int wapbl_replay_isopen1(struct wapbl_replay *);

const struct wapbl_ops wapbl_ops = {
	.wo_wapbl_discard	= wapbl_discard,
	.wo_wapbl_replay_isopen	= wapbl_replay_isopen1,
	.wo_wapbl_replay_can_read = wapbl_replay_can_read,
	.wo_wapbl_replay_read	= wapbl_replay_read,
	.wo_wapbl_add_buf	= wapbl_add_buf,
	.wo_wapbl_remove_buf	= wapbl_remove_buf,
	.wo_wapbl_resize_buf	= wapbl_resize_buf,
	.wo_wapbl_begin		= wapbl_begin,
	.wo_wapbl_end		= wapbl_end,
	.wo_wapbl_junlock_assert= wapbl_junlock_assert,
	.wo_wapbl_jlock_assert	= wapbl_jlock_assert,

	/* XXX: the following is only used to say "this is a wapbl buf" */
	.wo_wapbl_biodone	= wapbl_biodone,
};

SYSCTL_SETUP(wapbl_sysctl_init, "wapbl sysctl")
{
	int rv;
	const struct sysctlnode *rnode, *cnode;

	rv = sysctl_createv(clog, 0, NULL, &rnode,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "wapbl",
	    SYSCTL_DESCR("WAPBL journaling options"),
	    NULL, 0, NULL, 0,
	    CTL_VFS, CTL_CREATE, CTL_EOL);
	if (rv)
		return;

	rv = sysctl_createv(clog, 0, &rnode, &cnode,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_INT, "flush_disk_cache",
	    SYSCTL_DESCR("flush disk cache"),
	    NULL, 0, &wapbl_flush_disk_cache, 0,
	    CTL_CREATE, CTL_EOL);
	if (rv)
		return;

	rv = sysctl_createv(clog, 0, &rnode, &cnode,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_INT, "verbose_commit",
	    SYSCTL_DESCR("show time and size of wapbl log commits"),
	    NULL, 0, &wapbl_verbose_commit, 0,
	    CTL_CREATE, CTL_EOL);
	if (rv)
		return;

	rv = sysctl_createv(clog, 0, &rnode, &cnode,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_INT, "allow_dpofua",
	    SYSCTL_DESCR("allow use of FUA/DPO instead of cache flush if available"),
	    NULL, 0, &wapbl_allow_dpofua, 0,
	    CTL_CREATE, CTL_EOL);
	if (rv)
		return;

	rv = sysctl_createv(clog, 0, &rnode, &cnode,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_INT, "journal_iobufs",
	    SYSCTL_DESCR("count of bufs used for journal I/O (max async count)"),
	    NULL, 0, &wapbl_journal_iobufs, 0,
	    CTL_CREATE, CTL_EOL);
	if (rv)
		return;

	return;
}

static void
wapbl_init(void)
{

	pool_init(&wapbl_entry_pool, sizeof(struct wapbl_entry), 0, 0, 0,
	    "wapblentrypl", &pool_allocator_kmem, IPL_VM);
	pool_init(&wapbl_dealloc_pool, sizeof(struct wapbl_dealloc), 0, 0, 0,
	    "wapbldealloc", &pool_allocator_nointr, IPL_NONE);
}

static int
wapbl_fini(void)
{

	pool_destroy(&wapbl_dealloc_pool);
	pool_destroy(&wapbl_entry_pool);

	return 0;
}

static void
wapbl_evcnt_init(struct wapbl *wl)
{
	snprintf(wl->wl_ev_group, sizeof(wl->wl_ev_group),
	    "wapbl fsid 0x%x/0x%x",
	    wl->wl_mount->mnt_stat.f_fsidx.__fsid_val[0],
	    wl->wl_mount->mnt_stat.f_fsidx.__fsid_val[1]
	);

	evcnt_attach_dynamic(&wl->wl_ev_commit, EVCNT_TYPE_MISC,
	    NULL, wl->wl_ev_group, "commit");
	evcnt_attach_dynamic(&wl->wl_ev_journalwrite, EVCNT_TYPE_MISC,
	    NULL, wl->wl_ev_group, "journal write total");
	evcnt_attach_dynamic(&wl->wl_ev_jbufs_bio_nowait, EVCNT_TYPE_MISC,
	    NULL, wl->wl_ev_group, "journal write finished async");
	evcnt_attach_dynamic(&wl->wl_ev_metawrite, EVCNT_TYPE_MISC,
	    NULL, wl->wl_ev_group, "metadata async write");
	evcnt_attach_dynamic(&wl->wl_ev_cacheflush, EVCNT_TYPE_MISC,
	    NULL, wl->wl_ev_group, "cache flush");
}

static void
wapbl_evcnt_free(struct wapbl *wl)
{
	evcnt_detach(&wl->wl_ev_commit);
	evcnt_detach(&wl->wl_ev_journalwrite);
	evcnt_detach(&wl->wl_ev_jbufs_bio_nowait);
	evcnt_detach(&wl->wl_ev_metawrite);
	evcnt_detach(&wl->wl_ev_cacheflush);
}

static void
wapbl_dkcache_init(struct wapbl *wl)
{
	int error;

	/* Get disk cache flags */
	error = VOP_IOCTL(wl->wl_devvp, DIOCGCACHE, &wl->wl_dkcache,
	    FWRITE, FSCRED);
	if (error) {
		/* behave as if there was a write cache */
		wl->wl_dkcache = DKCACHE_WRITE;
	}

	/* Use FUA instead of cache flush if available */
	if (ISSET(wl->wl_dkcache, DKCACHE_FUA))
		wl->wl_jwrite_flags |= B_MEDIA_FUA;

	/* Use DPO for journal writes if available */
	if (ISSET(wl->wl_dkcache, DKCACHE_DPO))
		wl->wl_jwrite_flags |= B_MEDIA_DPO;
}

static int
wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr)
{
	int error, i;

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_start: reusing log with %d inodes\n", wr->wr_inodescnt));

	/*
	 * It's only valid to reuse the replay log if it's
	 * the same as the new log we just opened.
	 */
	KDASSERT(!wapbl_replay_isopen(wr));
	KASSERT(wl->wl_devvp->v_type == VBLK);
	KASSERT(wr->wr_devvp->v_type == VBLK);
	KASSERT(wl->wl_devvp->v_rdev == wr->wr_devvp->v_rdev);
	KASSERT(wl->wl_logpbn == wr->wr_logpbn);
	KASSERT(wl->wl_circ_size == wr->wr_circ_size);
	KASSERT(wl->wl_circ_off == wr->wr_circ_off);
	KASSERT(wl->wl_log_dev_bshift == wr->wr_log_dev_bshift);
	KASSERT(wl->wl_fs_dev_bshift == wr->wr_fs_dev_bshift);

	wl->wl_wc_header->wc_generation = wr->wr_generation + 1;

	for (i = 0; i < wr->wr_inodescnt; i++)
		wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber,
		    wr->wr_inodes[i].wr_imode);

	/* Make sure new transaction won't overwrite old inodes list */
	KDASSERT(wapbl_transaction_len(wl) <=
	    wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead,
	    wr->wr_inodestail));

	wl->wl_head = wl->wl_tail = wr->wr_inodeshead;
	wl->wl_reclaimable_bytes = wl->wl_reserved_bytes =
	    wapbl_transaction_len(wl);

	error = wapbl_write_inodes(wl, &wl->wl_head);
	if (error)
		return error;

	KASSERT(wl->wl_head != wl->wl_tail);
	KASSERT(wl->wl_head != 0);

	return 0;
}

int
wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp,
	daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr,
	wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn)
{
	struct wapbl *wl;
	struct vnode *devvp;
	daddr_t logpbn;
	int error;
	int log_dev_bshift = ilog2(blksize);
	int fs_dev_bshift = log_dev_bshift;
	int run;

	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64
	    " count=%zu blksize=%zu\n", vp, off, count, blksize));

	if (log_dev_bshift > fs_dev_bshift) {
		WAPBL_PRINTF(WAPBL_PRINT_OPEN,
		    ("wapbl: log device's block size cannot be larger "
		    "than filesystem's\n"));
		/*
		 * Not currently implemented, although it could be if
		 * needed someday.
		 */
		return ENOSYS;
	}

	if (off < 0)
		return EINVAL;

	if (blksize < DEV_BSIZE)
		return EINVAL;
	if (blksize % DEV_BSIZE)
		return EINVAL;

	/* XXXTODO: verify that the full load is writable */

	/*
	 * XXX check for minimum log size
	 * minimum is governed by minimum amount of space
	 * to complete a transaction. (probably truncate)
	 */
	/* XXX for now pick something minimal */
	if ((count * blksize) < MAXPHYS) {
		return ENOSPC;
	}

	if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) {
		return error;
	}

	wl = wapbl_calloc(1, sizeof(*wl));
	rw_init(&wl->wl_rwlock);
	mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&wl->wl_reclaimable_cv, "wapblrec");
	TAILQ_INIT(&wl->wl_bufs);
	SIMPLEQ_INIT(&wl->wl_entries);

	wl->wl_logvp = vp;
	wl->wl_devvp = devvp;
	wl->wl_mount = mp;
	wl->wl_logpbn = logpbn;
	wl->wl_log_dev_bshift = log_dev_bshift;
	wl->wl_fs_dev_bshift = fs_dev_bshift;

	wl->wl_flush = flushfn;
	wl->wl_flush_abort = flushabortfn;

	/* Reserve two log device blocks for the commit headers */
	wl->wl_circ_off = 2<<wl->wl_log_dev_bshift;
	wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off);
	/* truncate the log usage to a multiple of log_dev_bshift */
	wl->wl_circ_size >>= wl->wl_log_dev_bshift;
	wl->wl_circ_size <<= wl->wl_log_dev_bshift;

	/*
	 * wl_bufbytes_max limits the size of the in memory transaction space.
	 * - Since buffers are allocated and accounted for in units of
	 *   PAGE_SIZE it is required to be a multiple of PAGE_SIZE
	 *   (i.e. 1<<PAGE_SHIFT)
	 * - Since the log device has to be written in units of
	 *   1<<wl_log_dev_bshift it is required to be a multiple of
	 *   1<<wl_log_dev_bshift.
	 * - Since filesystem will provide data in units of 1<<wl_fs_dev_bshift,
	 *   it is convenient to be a multiple of 1<<wl_fs_dev_bshift.
	 * Therefore it must be multiple of the least common multiple of those
	 * three quantities.  Fortunately, all of those quantities are
	 * guaranteed to be a power of two, and the least common multiple of
	 * a set of numbers which are all powers of two is simply the maximum
	 * of those numbers.  Finally, the maximum logarithm of a power of two
	 * is the same as the log of the maximum power of two.  So we can do
	 * the following operations to size wl_bufbytes_max:
	 */

	/* XXX fix actual number of pages reserved per filesystem. */
	wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2);

	/* Round wl_bufbytes_max to the largest power of two constraint */
	wl->wl_bufbytes_max >>= PAGE_SHIFT;
	wl->wl_bufbytes_max <<= PAGE_SHIFT;
	wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift;
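
	/*
	 * For illustration (hypothetical values): with PAGE_SHIFT = 12
	 * and wl_log_dev_bshift = wl_fs_dev_bshift = 9, the dominant
	 * constraint is PAGE_SIZE, so the shift pairs above round e.g.
	 * 1048575 down to 1044480 (255 pages), which is a multiple of
	 * all three unit sizes.
	 */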

	/* XXX maybe use filesystem fragment size instead of 1024 */
	/* XXX fix actual number of buffers reserved per filesystem. */
	wl->wl_bufcount_max = (buf_nbuf() / 2) * 1024;

	wl->wl_brperjblock = ((1<<wl->wl_log_dev_bshift)
	    - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
	    sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
	KASSERT(wl->wl_brperjblock > 0);

	/* XXX tie this into resource estimation */
	wl->wl_dealloclim = wl->wl_bufbytes_max / mp->mnt_stat.f_bsize / 2;
	TAILQ_INIT(&wl->wl_dealloclist);

	wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);

	wapbl_evcnt_init(wl);

	wapbl_dkcache_init(wl);

	/* Initialize the commit header */
	{
		struct wapbl_wc_header *wc;
		size_t len = 1 << wl->wl_log_dev_bshift;
		wc = wapbl_calloc(1, len);
		wc->wc_type = WAPBL_WC_HEADER;
		wc->wc_len = len;
		wc->wc_circ_off = wl->wl_circ_off;
		wc->wc_circ_size = wl->wl_circ_size;
		/* XXX wc->wc_fsid */
		wc->wc_log_dev_bshift = wl->wl_log_dev_bshift;
		wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift;
		wl->wl_wc_header = wc;
		wl->wl_wc_scratch = wapbl_alloc(len);
	}

	TAILQ_INIT(&wl->wl_iobufs);
	TAILQ_INIT(&wl->wl_iobufs_busy);
	for (int i = 0; i < wapbl_journal_iobufs; i++) {
		struct buf *bp;

		if ((bp = geteblk(MAXPHYS)) == NULL)
			goto errout;

		mutex_enter(&bufcache_lock);
		mutex_enter(devvp->v_interlock);
		bgetvp(devvp, bp);
		mutex_exit(devvp->v_interlock);
		mutex_exit(&bufcache_lock);

		bp->b_dev = devvp->v_rdev;

		TAILQ_INSERT_TAIL(&wl->wl_iobufs, bp, b_wapbllist);
	}

	/*
	 * if there was an existing set of unlinked but
	 * allocated inodes, preserve it in the new
	 * log.
	 */
	if (wr && wr->wr_inodescnt) {
		error = wapbl_start_flush_inodes(wl, wr);
		if (error)
			goto errout;
	}

	error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail);
	if (error) {
		goto errout;
	}

	*wlp = wl;
#if defined(WAPBL_DEBUG)
	wapbl_debug_wl = wl;
#endif

	return 0;
errout:
	wapbl_discard(wl);
	wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
	while (!TAILQ_EMPTY(&wl->wl_iobufs)) {
		struct buf *bp;

		bp = TAILQ_FIRST(&wl->wl_iobufs);
		TAILQ_REMOVE(&wl->wl_iobufs, bp, b_wapbllist);
		brelse(bp, BC_INVAL);
	}
	wapbl_inodetrk_free(wl);
	wapbl_free(wl, sizeof(*wl));

	return error;
}

/*
 * Like wapbl_flush, except it discards the transaction
 * completely.
 */

void
wapbl_discard(struct wapbl *wl)
{
	struct wapbl_entry *we;
	struct wapbl_dealloc *wd;
	struct buf *bp;
	int i;

	/*
	 * XXX we may consider using upgrade here
	 * if we want to call flush from inside a transaction
	 */
	rw_enter(&wl->wl_rwlock, RW_WRITER);
	wl->wl_flush(wl->wl_mount, TAILQ_FIRST(&wl->wl_dealloclist));

#ifdef WAPBL_DEBUG_PRINT
	{
		pid_t pid = -1;
		lwpid_t lid = -1;
		if (curproc)
			pid = curproc->p_pid;
		if (curlwp)
			lid = curlwp->l_lid;
#ifdef WAPBL_DEBUG_BUFBYTES
		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
		    ("wapbl_discard: thread %d.%d discarding "
		    "transaction\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %u, reclaimable=%zu reserved=%zu "
		    "unsynced=%zu\n",
		    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
		    wl->wl_bcount, wl->wl_dealloccnt,
		    wl->wl_inohashcnt, wl->wl_error_count,
		    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
		    wl->wl_unsynced_bufbytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d, unsynced = %zu\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error, we->we_unsynced_bufbytes));
		}
#else /* !WAPBL_DEBUG_BUFBYTES */
		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
		    ("wapbl_discard: thread %d.%d discarding transaction\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %u, reclaimable=%zu reserved=%zu\n",
		    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
		    wl->wl_bcount, wl->wl_dealloccnt,
		    wl->wl_inohashcnt, wl->wl_error_count,
		    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error));
		}
#endif /* !WAPBL_DEBUG_BUFBYTES */
	}
#endif /* WAPBL_DEBUG_PRINT */

	for (i = 0; i <= wl->wl_inohashmask; i++) {
		struct wapbl_ino_head *wih;
		struct wapbl_ino *wi;

		wih = &wl->wl_inohash[i];
		while ((wi = LIST_FIRST(wih)) != NULL) {
			LIST_REMOVE(wi, wi_hash);
			pool_put(&wapbl_ino_pool, wi);
			KASSERT(wl->wl_inohashcnt > 0);
			wl->wl_inohashcnt--;
		}
	}

	/*
	 * clean buffer list
	 */
	mutex_enter(&bufcache_lock);
	mutex_enter(&wl->wl_mtx);
	while ((bp = TAILQ_FIRST(&wl->wl_bufs)) != NULL) {
		if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) {
			/*
			 * The buffer will be unlocked and
			 * removed from the transaction in brelse
			 */
			mutex_exit(&wl->wl_mtx);
			brelsel(bp, 0);
			mutex_enter(&wl->wl_mtx);
		}
	}
	mutex_exit(&wl->wl_mtx);
	mutex_exit(&bufcache_lock);

	/*
	 * Remove references to this wl from wl_entries, free any which
	 * no longer have buffers, others will be freed in wapbl_biodone
	 * when they no longer have any buffers.
	 */
	while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) {
		SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
		/* XXX should we be accumulating wl_error_count
		 * and increasing reclaimable bytes ? */
		we->we_wapbl = NULL;
		if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
			KASSERT(we->we_unsynced_bufbytes == 0);
#endif
			pool_put(&wapbl_entry_pool, we);
		}
	}

	/* Discard list of deallocs */
	while ((wd = TAILQ_FIRST(&wl->wl_dealloclist)) != NULL)
		wapbl_deallocation_free(wl, wd, true);

	/* XXX should we clear wl_reserved_bytes? */

	KASSERT(wl->wl_bufbytes == 0);
	KASSERT(wl->wl_bcount == 0);
	KASSERT(wl->wl_bufcount == 0);
	KASSERT(TAILQ_EMPTY(&wl->wl_bufs));
	KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
	KASSERT(wl->wl_inohashcnt == 0);
	KASSERT(TAILQ_EMPTY(&wl->wl_dealloclist));
	KASSERT(wl->wl_dealloccnt == 0);

	rw_exit(&wl->wl_rwlock);
}

int
wapbl_stop(struct wapbl *wl, int force)
{
	int error;

	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n"));
	error = wapbl_flush(wl, 1);
	if (error) {
		if (force)
			wapbl_discard(wl);
		else
			return error;
	}

	/* Unlinked inodes persist after a flush */
	if (wl->wl_inohashcnt) {
		if (force) {
			wapbl_discard(wl);
		} else {
			return EBUSY;
		}
	}

	KASSERT(wl->wl_bufbytes == 0);
	KASSERT(wl->wl_bcount == 0);
	KASSERT(wl->wl_bufcount == 0);
	KASSERT(TAILQ_EMPTY(&wl->wl_bufs));
	KASSERT(wl->wl_dealloccnt == 0);
	KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
	KASSERT(wl->wl_inohashcnt == 0);
	KASSERT(TAILQ_EMPTY(&wl->wl_dealloclist));
	KASSERT(wl->wl_dealloccnt == 0);
	KASSERT(TAILQ_EMPTY(&wl->wl_iobufs_busy));

	wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
	while (!TAILQ_EMPTY(&wl->wl_iobufs)) {
		struct buf *bp;

		bp = TAILQ_FIRST(&wl->wl_iobufs);
		TAILQ_REMOVE(&wl->wl_iobufs, bp, b_wapbllist);
		brelse(bp, BC_INVAL);
	}
	wapbl_inodetrk_free(wl);

	wapbl_evcnt_free(wl);

	cv_destroy(&wl->wl_reclaimable_cv);
	mutex_destroy(&wl->wl_mtx);
	rw_destroy(&wl->wl_rwlock);
	wapbl_free(wl, sizeof(*wl));

	return 0;
}

/****************************************************************/
/*
 * Unbuffered disk I/O
 */

static void
wapbl_doio_accounting(struct vnode *devvp, int flags)
{
	struct pstats *pstats = curlwp->l_proc->p_stats;

	if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
		mutex_enter(devvp->v_interlock);
		devvp->v_numoutput++;
		mutex_exit(devvp->v_interlock);
		pstats->p_ru.ru_oublock++;
	} else {
		pstats->p_ru.ru_inblock++;
	}

}

static int
wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
{
	struct buf *bp;
	int error;

	KASSERT(devvp->v_type == VBLK);

	wapbl_doio_accounting(devvp, flags);

	bp = getiobuf(devvp, true);
	bp->b_flags = flags;
	bp->b_cflags |= BC_BUSY;	/* mandatory, asserted by biowait() */
	bp->b_dev = devvp->v_rdev;
	bp->b_data = data;
	bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
	bp->b_blkno = pbn;
	BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);

	WAPBL_PRINTF(WAPBL_PRINT_IO,
	    ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%"PRIx64"\n",
	    BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount,
	    bp->b_blkno, bp->b_dev));

	VOP_STRATEGY(devvp, bp);

	error = biowait(bp);
	putiobuf(bp);

	if (error) {
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_doio: %s %zu bytes at block %" PRId64
		    " on dev 0x%"PRIx64" failed with error %d\n",
		    (((flags & (B_WRITE | B_READ)) == B_WRITE) ?
		     "write" : "read"),
		    len, pbn, devvp->v_rdev, error));
	}

	return error;
}

/*
 * wapbl_write(data, len, devvp, pbn)
 *
 *	Synchronously write len bytes from data to physical block pbn
 *	on devvp.
 */
int
wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

	return wapbl_doio(data, len, devvp, pbn, B_WRITE);
}

/*
 * wapbl_read(data, len, devvp, pbn)
 *
 *	Synchronously read len bytes into data from physical block pbn
 *	on devvp.
 */
int
wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

	return wapbl_doio(data, len, devvp, pbn, B_READ);
}

/****************************************************************/
/*
 * Buffered disk writes -- try to coalesce writes and emit
 * MAXPHYS-aligned blocks.
 */

/*
 * wapbl_buffered_write_async(wl, bp)
 *
 *	Send buffer for asynchronous write.
 */
static void
wapbl_buffered_write_async(struct wapbl *wl, struct buf *bp)
{
	wapbl_doio_accounting(wl->wl_devvp, bp->b_flags);

	KASSERT(TAILQ_FIRST(&wl->wl_iobufs) == bp);
	TAILQ_REMOVE(&wl->wl_iobufs, bp, b_wapbllist);

	bp->b_flags |= B_WRITE;
	bp->b_cflags |= BC_BUSY;	/* mandatory, asserted by biowait() */
	bp->b_oflags = 0;
	bp->b_bcount = bp->b_resid;
	BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);

	VOP_STRATEGY(wl->wl_devvp, bp);

	wl->wl_ev_journalwrite.ev_count++;

	TAILQ_INSERT_TAIL(&wl->wl_iobufs_busy, bp, b_wapbllist);
}

/*
 * wapbl_buffered_flush(wl)
 *
 *	Flush any buffered writes from wapbl_buffered_write.
 */
static int
wapbl_buffered_flush(struct wapbl *wl, bool full)
{
	int error = 0;
	struct buf *bp, *bnext;
	bool only_done = true, found = false;

	/* if there is an outstanding buffered write, send it now */
	if ((bp = TAILQ_FIRST(&wl->wl_iobufs)) && bp->b_resid > 0)
		wapbl_buffered_write_async(wl, bp);

	/* wait for I/O to complete */
again:
	TAILQ_FOREACH_SAFE(bp, &wl->wl_iobufs_busy, b_wapbllist, bnext) {
		if (!full && only_done) {
			/* skip unfinished */
			if (!ISSET(bp->b_oflags, BO_DONE))
				continue;
		}

		if (ISSET(bp->b_oflags, BO_DONE))
			wl->wl_ev_jbufs_bio_nowait.ev_count++;

		TAILQ_REMOVE(&wl->wl_iobufs_busy, bp, b_wapbllist);
		error = biowait(bp);

		/* reset for reuse */
		bp->b_blkno = bp->b_resid = bp->b_flags = 0;
		TAILQ_INSERT_TAIL(&wl->wl_iobufs, bp, b_wapbllist);
		found = true;

		if (!full)
			break;
	}

	if (!found && only_done && !TAILQ_EMPTY(&wl->wl_iobufs_busy)) {
		only_done = false;
		goto again;
	}

	return error;
}

/*
 * wapbl_buffered_write(data, len, wl, pbn)
 *
 *	Write len bytes from data to physical block pbn on
 *	wl->wl_devvp.  The write may not complete until
 *	wapbl_buffered_flush.
 */
static int
wapbl_buffered_write(void *data, size_t len, struct wapbl *wl, daddr_t pbn,
    int bflags)
{
	size_t resid;
	struct buf *bp;

again:
	bp = TAILQ_FIRST(&wl->wl_iobufs);

	if (bp == NULL) {
		/* No more buffers, wait for any previous I/O to finish. */
		wapbl_buffered_flush(wl, false);

		bp = TAILQ_FIRST(&wl->wl_iobufs);
		KASSERT(bp != NULL);
	}

	/*
	 * If not adjacent to buffered data, flush first.  The disk block
	 * address is always valid for a non-empty buffer.
	 */
	if ((bp->b_resid > 0 && pbn != bp->b_blkno + btodb(bp->b_resid))) {
		wapbl_buffered_write_async(wl, bp);
		goto again;
	}

	/*
	 * If this write goes to an empty buffer we have to
	 * save the disk block address first.
	 */
	if (bp->b_blkno == 0) {
		bp->b_blkno = pbn;
		bp->b_flags |= bflags;
	}

	/*
	 * Remaining space so this buffer ends on a buffer size boundary.
	 *
	 * Cannot become less than or equal to zero, as the buffer
	 * would have been flushed by the previous call in that case.
	 */
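	/*
	 * For illustration (assuming MAXPHYS = 64 KiB buffers and
	 * DEV_BSIZE = 512): a buffer starting at pbn 96 with
	 * b_resid = 0 can take 65536 - dbtob(96 % 128) = 16384 more
	 * bytes, so the write ends at block 128, a MAXPHYS-aligned
	 * boundary.
	 */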
	resid = bp->b_bufsize - dbtob(bp->b_blkno % btodb(bp->b_bufsize)) -
	    bp->b_resid;
	KASSERT(resid > 0);
	KASSERT(dbtob(btodb(resid)) == resid);

	if (len < resid)
		resid = len;

	memcpy((uint8_t *)bp->b_data + bp->b_resid, data, resid);
	bp->b_resid += resid;

	if (len >= resid) {
		/* Just filled the buf, or data did not fit */
		wapbl_buffered_write_async(wl, bp);

		data = (uint8_t *)data + resid;
		len -= resid;
		pbn += btodb(resid);

		if (len > 0)
			goto again;
	}

	return 0;
}

/*
 * wapbl_circ_write(wl, data, len, offp)
 *
 *	Write len bytes from data to the circular queue of wl, starting
 *	at linear byte offset *offp, and returning the new linear byte
 *	offset in *offp.
 *
 *	If the starting linear byte offset precedes wl->wl_circ_off,
 *	the write instead begins at wl->wl_circ_off.  XXX WTF?  This
 *	should be a KASSERT, not a conditional.
 *
 *	The write is buffered in wl and must be flushed with
 *	wapbl_buffered_flush before it will be submitted to the disk.
 */
static int
wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp)
{
	size_t slen;
	off_t off = *offp;
	int error;
	daddr_t pbn;

	KDASSERT(((len >> wl->wl_log_dev_bshift) <<
	    wl->wl_log_dev_bshift) == len);

	if (off < wl->wl_circ_off)
		off = wl->wl_circ_off;
	slen = wl->wl_circ_off + wl->wl_circ_size - off;
	if (slen < len) {
		pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
#ifdef _KERNEL
		pbn = btodb(pbn << wl->wl_log_dev_bshift);
#endif
		error = wapbl_buffered_write(data, slen, wl, pbn,
		    WAPBL_JDATA_FLAGS(wl));
		if (error)
			return error;
		data = (uint8_t *)data + slen;
		len -= slen;
		off = wl->wl_circ_off;
	}
	pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
#ifdef _KERNEL
	pbn = btodb(pbn << wl->wl_log_dev_bshift);
#endif
	error = wapbl_buffered_write(data, len, wl, pbn,
	    WAPBL_JDATA_FLAGS(wl));
	if (error)
		return error;
	off += len;
	if (off >= wl->wl_circ_off + wl->wl_circ_size)
		off = wl->wl_circ_off;
	*offp = off;
	return 0;
}
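
/*
 * Wraparound sketch (hypothetical numbers): with wl_circ_off = 1024 and
 * wl_circ_size = 8192, valid offsets are [1024, 9216).  A 2048-byte
 * write starting at *offp = 8704 is split: slen = 9216 - 8704 = 512
 * bytes go at 8704, the remaining 1536 bytes go at 1024, and *offp
 * returns as 2560.
 */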

/****************************************************************/
/*
 * WAPBL transactions: entering, adding/removing bufs, and exiting
 */

int
wapbl_begin(struct wapbl *wl, const char *file, int line)
{
	int doflush;
	unsigned lockcount;

	KDASSERT(wl);

	/*
	 * XXX this needs to be made much more sophisticated.
	 * perhaps each wapbl_begin could reserve a specified
	 * number of buffers and bytes.
	 */
	mutex_enter(&wl->wl_mtx);
	lockcount = wl->wl_lock_count;
	doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) >
		   wl->wl_bufbytes_max / 2) ||
		  ((wl->wl_bufcount + (lockcount * 10)) >
		   wl->wl_bufcount_max / 2) ||
		  (wapbl_transaction_len(wl) > wl->wl_circ_size / 2) ||
		  (wl->wl_dealloccnt >= (wl->wl_dealloclim / 2));
	mutex_exit(&wl->wl_mtx);

	if (doflush) {
		WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		    ("force flush lockcnt=%d bufbytes=%zu "
		    "(max=%zu) bufcount=%zu (max=%zu) "
		    "dealloccnt %d (lim=%d)\n",
		    lockcount, wl->wl_bufbytes,
		    wl->wl_bufbytes_max, wl->wl_bufcount,
		    wl->wl_bufcount_max,
		    wl->wl_dealloccnt, wl->wl_dealloclim));
	}

	if (doflush) {
		int error = wapbl_flush(wl, 0);
		if (error)
			return error;
	}

	rw_enter(&wl->wl_rwlock, RW_READER);
	mutex_enter(&wl->wl_mtx);
	wl->wl_lock_count++;
	mutex_exit(&wl->wl_mtx);

#if defined(WAPBL_DEBUG_PRINT)
	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
	    ("wapbl_begin thread %d.%d with bufcount=%zu "
	    "bufbytes=%zu bcount=%zu at %s:%d\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount, file, line));
#endif

	return 0;
}

void
wapbl_end(struct wapbl *wl)
{

#if defined(WAPBL_DEBUG_PRINT)
	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
	    ("wapbl_end thread %d.%d with bufcount=%zu "
	    "bufbytes=%zu bcount=%zu\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount));
#endif

	/*
	 * XXX this could be handled more gracefully, perhaps place
	 * only a partial transaction in the log and allow the
	 * remaining to flush without the protection of the journal.
	 */
	KASSERTMSG((wapbl_transaction_len(wl) <=
		(wl->wl_circ_size - wl->wl_reserved_bytes)),
	    "wapbl_end: current transaction too big to flush");

	mutex_enter(&wl->wl_mtx);
	KASSERT(wl->wl_lock_count > 0);
	wl->wl_lock_count--;
	mutex_exit(&wl->wl_mtx);

	rw_exit(&wl->wl_rwlock);
}

void
wapbl_add_buf(struct wapbl *wl, struct buf * bp)
{

	KASSERT(bp->b_cflags & BC_BUSY);
	KASSERT(bp->b_vp);

	wapbl_jlock_assert(wl);

#if 0
	/*
	 * XXX this might be an issue for swapfiles.
	 * see uvm_swap.c:1702
	 *
	 * XXX2 why require it then? leap of semantics?
	 */
	KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
#endif

	mutex_enter(&wl->wl_mtx);
	if (bp->b_flags & B_LOCKED) {
		TAILQ_REMOVE(&wl->wl_bufs, bp, b_wapbllist);
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER2,
		    ("wapbl_add_buf thread %d.%d re-adding buf %p "
		    "with %d bytes %d bcount\n",
		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
		    bp->b_bcount));
	} else {
		/* unlocked by dirty buffers shouldn't exist */
		KASSERT(!(bp->b_oflags & BO_DELWRI));
		wl->wl_bufbytes += bp->b_bufsize;
		wl->wl_bcount += bp->b_bcount;
		wl->wl_bufcount++;
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
		    ("wapbl_add_buf thread %d.%d adding buf %p "
		    "with %d bytes %d bcount\n",
		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
		    bp->b_bcount));
	}
	TAILQ_INSERT_TAIL(&wl->wl_bufs, bp, b_wapbllist);
	mutex_exit(&wl->wl_mtx);

	bp->b_flags |= B_LOCKED;
}

static void
wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp)
{

	KASSERT(mutex_owned(&wl->wl_mtx));
	KASSERT(bp->b_cflags & BC_BUSY);
	wapbl_jlock_assert(wl);

#if 0
	/*
	 * XXX this might be an issue for swapfiles.
	 * see uvm_swap.c:1725
	 *
	 * XXXdeux: see above
	 */
	KASSERT((bp->b_flags & BC_NOCACHE) == 0);
#endif
	KASSERT(bp->b_flags & B_LOCKED);

	WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
	    ("wapbl_remove_buf thread %d.%d removing buf %p with "
	    "%d bytes %d bcount\n",
	    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount));

	KASSERT(wl->wl_bufbytes >= bp->b_bufsize);
	wl->wl_bufbytes -= bp->b_bufsize;
	KASSERT(wl->wl_bcount >= bp->b_bcount);
	wl->wl_bcount -= bp->b_bcount;
	KASSERT(wl->wl_bufcount > 0);
	wl->wl_bufcount--;
	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
	TAILQ_REMOVE(&wl->wl_bufs, bp, b_wapbllist);

	bp->b_flags &= ~B_LOCKED;
}

/* called from brelsel() in vfs_bio among other places */
void
wapbl_remove_buf(struct wapbl * wl, struct buf *bp)
{

	mutex_enter(&wl->wl_mtx);
	wapbl_remove_buf_locked(wl, bp);
	mutex_exit(&wl->wl_mtx);
}

void
wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt)
{

	KASSERT(bp->b_cflags & BC_BUSY);

	/*
	 * XXX: why does this depend on B_LOCKED?  otherwise the buf
	 * is not for a transaction?  if so, why is this called in the
	 * first place?
	 */
	if (bp->b_flags & B_LOCKED) {
		mutex_enter(&wl->wl_mtx);
		wl->wl_bufbytes += bp->b_bufsize - oldsz;
		wl->wl_bcount += bp->b_bcount - oldcnt;
		mutex_exit(&wl->wl_mtx);
	}
}

#endif /* _KERNEL */

/****************************************************************/
/* Some utility inlines */

/*
 * wapbl_space_used(avail, head, tail)
 *
 *	Number of bytes used in a circular queue of avail total bytes,
 *	from tail to head.
 */
static inline size_t
wapbl_space_used(size_t avail, off_t head, off_t tail)
{

	if (tail == 0) {
		KASSERT(head == 0);
		return 0;
	}
	return ((head + (avail - 1) - tail) % avail) + 1;
}
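
/*
 * Example (illustrative): avail = 100, tail = 80, head = 20 gives
 * ((20 + 99 - 80) % 100) + 1 = 40 bytes used -- 20 before the wrap
 * plus 20 after it.  tail == 0 (with head == 0) is the empty case.
 */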

#ifdef _KERNEL
/*
 * wapbl_advance(size, off, oldoff, delta)
 *
 *	Given a byte offset oldoff into a circular queue of size bytes
 *	starting at off, return a new byte offset oldoff + delta into
 *	the circular queue.
 */
static inline off_t
wapbl_advance(size_t size, size_t off, off_t oldoff, size_t delta)
{
	off_t newoff;

	/* Define acceptable ranges for inputs. */
	KASSERT(delta <= (size_t)size);
	KASSERT((oldoff == 0) || ((size_t)oldoff >= off));
	KASSERT(oldoff < (off_t)(size + off));

	if ((oldoff == 0) && (delta != 0))
		newoff = off + delta;
	else if ((oldoff + delta) < (size + off))
		newoff = oldoff + delta;
	else
		newoff = (oldoff + delta) - size;

	/* Note some interesting axioms */
	KASSERT((delta != 0) || (newoff == oldoff));
	KASSERT((delta == 0) || (newoff != 0));
	KASSERT((delta != (size)) || (newoff == oldoff));

	/* Define acceptable ranges for output. */
	KASSERT((newoff == 0) || ((size_t)newoff >= off));
	KASSERT((size_t)newoff < (size + off));
	return newoff;
}
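
/*
 * Example (illustrative numbers): size = 8192, off = 1024.  Advancing
 * oldoff = 0 (empty) by 512 gives 1024 + 512 = 1536; advancing
 * oldoff = 9000 by 512 would pass size + off = 9216, so it wraps to
 * 9000 + 512 - 8192 = 1320.
 */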

/*
 * wapbl_space_free(avail, head, tail)
 *
 *	Number of bytes free in a circular queue of avail total bytes,
 *	in which everything from tail to head is used.
 */
static inline size_t
wapbl_space_free(size_t avail, off_t head, off_t tail)
{

	return avail - wapbl_space_used(avail, head, tail);
}

/*
 * wapbl_advance_head(size, off, delta, headp, tailp)
 *
 *	In a circular queue of size bytes starting at off, given the
 *	old head and tail offsets *headp and *tailp, store the new head
 *	and tail offsets in *headp and *tailp resulting from adding
 *	delta bytes of data to the head.
 */
static inline void
wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp,
    off_t *tailp)
{
	off_t head = *headp;
	off_t tail = *tailp;

	KASSERT(delta <= wapbl_space_free(size, head, tail));
	head = wapbl_advance(size, off, head, delta);
	if ((tail == 0) && (head != 0))
		tail = off;
	*headp = head;
	*tailp = tail;
}

/*
 * wapbl_advance_tail(size, off, delta, headp, tailp)
 *
 *	In a circular queue of size bytes starting at off, given the
 *	old head and tail offsets *headp and *tailp, store the new head
 *	and tail offsets in *headp and *tailp resulting from removing
 *	delta bytes of data from the tail.
 */
static inline void
wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp,
    off_t *tailp)
{
	off_t head = *headp;
	off_t tail = *tailp;

	KASSERT(delta <= wapbl_space_used(size, head, tail));
	tail = wapbl_advance(size, off, tail, delta);
	if (head == tail) {
		head = tail = 0;
	}
	*headp = head;
	*tailp = tail;
}
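
/*
 * Example (illustrative): with size = 8192 and off = 1024, starting
 * from head = tail = 0 (empty), wapbl_advance_head(..., 512, ...)
 * moves head to 1536 and pulls tail up to off = 1024; a later
 * wapbl_advance_tail(..., 512, ...) lets tail catch up to head,
 * which resets both to 0 (empty again).
 */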


/****************************************************************/

/*
 * wapbl_truncate(wl, minfree)
 *
 *	Wait until at least minfree bytes are available in the log.
 *
 *	If it was necessary to wait for writes to complete,
 *	advance the circular queue tail to reflect the new write
 *	completions and issue a write commit to the log.
 *
 *	=> Caller must hold wl->wl_rwlock writer lock.
 */
static int
wapbl_truncate(struct wapbl *wl, size_t minfree)
{
	size_t delta;
	size_t avail;
	off_t head;
	off_t tail;
	int error = 0;

	KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes));
	KASSERT(rw_write_held(&wl->wl_rwlock));

	mutex_enter(&wl->wl_mtx);

	/*
	 * First check to see if we have to do a commit
	 * at all.
	 */
	avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail);
	if (minfree < avail) {
		mutex_exit(&wl->wl_mtx);
		return 0;
	}
	minfree -= avail;
	while ((wl->wl_error_count == 0) &&
	    (wl->wl_reclaimable_bytes < minfree)) {
		WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
		    ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd "
		    "minfree=%zd\n",
		    &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes,
		    minfree));

		cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx);
	}
	if (wl->wl_reclaimable_bytes < minfree) {
		KASSERT(wl->wl_error_count);
		/* XXX maybe get actual error from buffer instead someday? */
		error = EIO;
	}
	head = wl->wl_head;
	tail = wl->wl_tail;
	delta = wl->wl_reclaimable_bytes;

	/* If all of the entries are flushed, then be sure to keep
	 * the reserved bytes reserved.  Watch out for discarded transactions,
	 * which could leave more bytes reserved than are reclaimable.
	 */
	if (SIMPLEQ_EMPTY(&wl->wl_entries) &&
	    (delta >= wl->wl_reserved_bytes)) {
		delta -= wl->wl_reserved_bytes;
	}
	wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head,
	    &tail);
	KDASSERT(wl->wl_reserved_bytes <=
		wapbl_space_used(wl->wl_circ_size, head, tail));
	mutex_exit(&wl->wl_mtx);

	if (error)
		return error;

	/*
	 * This is where head, tail and delta are unprotected
	 * from races against itself or flush.  This is ok since
	 * we only call this routine from inside flush itself.
	 *
	 * XXX: how can it race against itself when accessed only
	 * from behind the write-locked rwlock?
	 */
	error = wapbl_write_commit(wl, head, tail);
	if (error)
		return error;

	wl->wl_head = head;
	wl->wl_tail = tail;

	mutex_enter(&wl->wl_mtx);
	KASSERT(wl->wl_reclaimable_bytes >= delta);
	wl->wl_reclaimable_bytes -= delta;
	mutex_exit(&wl->wl_mtx);
	WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
	    ("wapbl_truncate thread %d.%d truncating %zu bytes\n",
	    curproc->p_pid, curlwp->l_lid, delta));

	return 0;
}

/****************************************************************/

void
wapbl_biodone(struct buf *bp)
{
	struct wapbl_entry *we = bp->b_private;
	struct wapbl *wl = we->we_wapbl;
#ifdef WAPBL_DEBUG_BUFBYTES
	const int bufsize = bp->b_bufsize;
#endif

	/*
	 * Handle possible flushing of buffers after log has been
	 * decommissioned.
	 */
	if (!wl) {
		KASSERT(we->we_bufcount > 0);
		we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
		KASSERT(we->we_unsynced_bufbytes >= bufsize);
		we->we_unsynced_bufbytes -= bufsize;
#endif

		if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
			KASSERT(we->we_unsynced_bufbytes == 0);
#endif
			pool_put(&wapbl_entry_pool, we);
		}

		brelse(bp, 0);
		return;
	}

#ifdef ohbother
	KDASSERT(bp->b_oflags & BO_DONE);
	KDASSERT(!(bp->b_oflags & BO_DELWRI));
	KDASSERT(bp->b_flags & B_ASYNC);
	KDASSERT(bp->b_cflags & BC_BUSY);
	KDASSERT(!(bp->b_flags & B_LOCKED));
	KDASSERT(!(bp->b_flags & B_READ));
	KDASSERT(!(bp->b_cflags & BC_INVAL));
	KDASSERT(!(bp->b_cflags & BC_NOCACHE));
#endif

	if (bp->b_error) {
		/*
		 * If an error occurs, it would be nice to leave the buffer
		 * as a delayed write on the LRU queue so that we can retry
		 * it later.  But buffercache(9) can't handle dirty buffer
		 * reuse, so just mark the log permanently errored out.
		 */
		mutex_enter(&wl->wl_mtx);
		if (wl->wl_error_count == 0) {
			wl->wl_error_count++;
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
		mutex_exit(&wl->wl_mtx);
	}

	/*
	 * Make sure that the buf doesn't retain the media flags, so that
	 * e.g. wapbl_allow_dpofua has immediate effect on any following I/O.
	 * The flags will be set again if needed by another I/O.
	 */
	bp->b_flags &= ~B_MEDIA_FLAGS;

	/*
	 * Release the buffer here. wapbl_flush() may wait for the
	 * log to become empty and we better unbusy the buffer before
	 * wapbl_flush() returns.
	 */
	brelse(bp, 0);

	mutex_enter(&wl->wl_mtx);

	KASSERT(we->we_bufcount > 0);
	we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
	KASSERT(we->we_unsynced_bufbytes >= bufsize);
	we->we_unsynced_bufbytes -= bufsize;
	KASSERT(wl->wl_unsynced_bufbytes >= bufsize);
	wl->wl_unsynced_bufbytes -= bufsize;
#endif
	wl->wl_ev_metawrite.ev_count++;

	/*
	 * If the current transaction can be reclaimed, start
	 * at the beginning and reclaim any consecutive reclaimable
	 * transactions.  If we successfully reclaim anything,
	 * then wakeup anyone waiting for the reclaim.
	 */
	if (we->we_bufcount == 0) {
		size_t delta = 0;
		int errcnt = 0;
#ifdef WAPBL_DEBUG_BUFBYTES
		KDASSERT(we->we_unsynced_bufbytes == 0);
#endif
		/*
		 * clear any posted error, since the buffer it came from
		 * has successfully flushed by now
		 */
		while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) &&
		    (we->we_bufcount == 0)) {
			delta += we->we_reclaimable_bytes;
			if (we->we_error)
				errcnt++;
			SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
			pool_put(&wapbl_entry_pool, we);
		}

		if (delta) {
			wl->wl_reclaimable_bytes += delta;
			KASSERT(wl->wl_error_count >= errcnt);
			wl->wl_error_count -= errcnt;
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
	}

	mutex_exit(&wl->wl_mtx);
}

/*
 * wapbl_flush(wl, wait)
 *
 *	Flush pending block writes, deallocations, and inodes from
 *	the current transaction in memory to the log on disk:
 *
 *	1. Call the file system's wl_flush callback to flush any
 *	   per-file-system pending updates.
 *	2. Wait for enough space in the log for the current transaction.
 *	3. Synchronously write the new log records, advancing the
 *	   circular queue head.
 *	4. Issue the pending block writes asynchronously, now that they
 *	   are recorded in the log and can be replayed after crash.
 *	5. If wait is true, wait for all writes to complete and for the
 *	   log to become empty.
 *
 *	On failure, call the file system's wl_flush_abort callback.
 */
int
wapbl_flush(struct wapbl *wl, int waitfor)
{
	struct buf *bp;
	struct wapbl_entry *we;
	off_t off;
	off_t head;
	off_t tail;
	size_t delta = 0;
	size_t flushsize;
	size_t reserved;
	int error = 0;

	/*
	 * Do a quick check to see if a full flush can be skipped.
1756 * This assumes that the flush callback does not need to be called
1757 * unless there are other outstanding bufs.
1758 */
1759 if (!waitfor) {
1760 size_t nbufs;
1761 mutex_enter(&wl->wl_mtx); /* XXX need mutex here to
1762 protect the KASSERTS */
1763 nbufs = wl->wl_bufcount;
1764 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
1765 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
1766 mutex_exit(&wl->wl_mtx);
1767 if (nbufs == 0)
1768 return 0;
1769 }
1770
1771 /*
1772 * XXX we may consider using LK_UPGRADE here
1773 * if we want to call flush from inside a transaction
1774 */
1775 rw_enter(&wl->wl_rwlock, RW_WRITER);
1776 wl->wl_flush(wl->wl_mount, TAILQ_FIRST(&wl->wl_dealloclist));
1777
1778 /*
1779 * Now that we are exclusively locked and the file system has
1780 * issued any deferred block writes for this transaction, check
1781 * whether there are any blocks to write to the log. If not,
1782 * skip waiting for space or writing any log entries.
1783 *
1784 * XXX Shouldn't this also check wl_dealloccnt and
1785 * wl_inohashcnt? Perhaps wl_dealloccnt doesn't matter if the
1786 * file system didn't produce any blocks as a consequence of
1787 * it, but the same does not seem to be so of wl_inohashcnt.
1788 */
1789 if (wl->wl_bufcount == 0) {
1790 goto wait_out;
1791 }
1792
1793 #if 0
1794 WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1795 ("wapbl_flush thread %d.%d flushing entries with "
1796 "bufcount=%zu bufbytes=%zu\n",
1797 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
1798 wl->wl_bufbytes));
1799 #endif
1800
1801 /* Calculate amount of space needed to flush */
1802 flushsize = wapbl_transaction_len(wl);
1803 if (wapbl_verbose_commit) {
1804 struct timespec ts;
1805 getnanotime(&ts);
1806 printf("%s: %lld.%09ld this transaction = %zu bytes\n",
1807 __func__, (long long)ts.tv_sec,
1808 (long)ts.tv_nsec, flushsize);
1809 }
1810
1811 if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
1812 /*
1813 * XXX this could be handled more gracefully, perhaps place
1814 * only a partial transaction in the log and allow the
1815 * remaining to flush without the protection of the journal.
1816 */
1817 panic("wapbl_flush: current transaction too big to flush");
1818 }
1819
1820 error = wapbl_truncate(wl, flushsize);
1821 if (error)
1822 goto out;
1823
1824 off = wl->wl_head;
1825 KASSERT((off == 0) || (off >= wl->wl_circ_off));
1826 KASSERT((off == 0) || (off < wl->wl_circ_off + wl->wl_circ_size));
1827 error = wapbl_write_blocks(wl, &off);
1828 if (error)
1829 goto out;
1830 error = wapbl_write_revocations(wl, &off);
1831 if (error)
1832 goto out;
1833 error = wapbl_write_inodes(wl, &off);
1834 if (error)
1835 goto out;
1836
1837 reserved = 0;
1838 if (wl->wl_inohashcnt)
1839 reserved = wapbl_transaction_inodes_len(wl);
1840
1841 head = wl->wl_head;
1842 tail = wl->wl_tail;
1843
1844 wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize,
1845 &head, &tail);
1846
1847 KASSERTMSG(head == off,
1848 "lost head! head=%"PRIdMAX" tail=%" PRIdMAX
1849 " off=%"PRIdMAX" flush=%zu",
1850 (intmax_t)head, (intmax_t)tail, (intmax_t)off,
1851 flushsize);
1852
1853 /* Opportunistically move the tail forward if we can */
1854 mutex_enter(&wl->wl_mtx);
1855 delta = wl->wl_reclaimable_bytes;
1856 mutex_exit(&wl->wl_mtx);
1857 wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta,
1858 &head, &tail);
1859
1860 error = wapbl_write_commit(wl, head, tail);
1861 if (error)
1862 goto out;
1863
1864 we = pool_get(&wapbl_entry_pool, PR_WAITOK);
1865
1866 #ifdef WAPBL_DEBUG_BUFBYTES
1867 WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1868 ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
1869 " unsynced=%zu"
1870 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
1871 "inodes=%d\n",
1872 curproc->p_pid, curlwp->l_lid, flushsize, delta,
1873 wapbl_space_used(wl->wl_circ_size, head, tail),
1874 wl->wl_unsynced_bufbytes, wl->wl_bufcount,
1875 wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt,
1876 wl->wl_inohashcnt));
1877 #else
1878 WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1879 ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
1880 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
1881 "inodes=%d\n",
1882 curproc->p_pid, curlwp->l_lid, flushsize, delta,
1883 wapbl_space_used(wl->wl_circ_size, head, tail),
1884 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
1885 wl->wl_dealloccnt, wl->wl_inohashcnt));
1886 #endif
1887 
1889 mutex_enter(&bufcache_lock);
1890 mutex_enter(&wl->wl_mtx);
1891
1892 wl->wl_reserved_bytes = reserved;
1893 wl->wl_head = head;
1894 wl->wl_tail = tail;
1895 KASSERT(wl->wl_reclaimable_bytes >= delta);
1896 wl->wl_reclaimable_bytes -= delta;
1897 KDASSERT(wl->wl_dealloccnt == 0);
1898 #ifdef WAPBL_DEBUG_BUFBYTES
1899 wl->wl_unsynced_bufbytes += wl->wl_bufbytes;
1900 #endif
1901
1902 we->we_wapbl = wl;
1903 we->we_bufcount = wl->wl_bufcount;
1904 #ifdef WAPBL_DEBUG_BUFBYTES
1905 we->we_unsynced_bufbytes = wl->wl_bufbytes;
1906 #endif
1907 we->we_reclaimable_bytes = flushsize;
1908 we->we_error = 0;
1909 SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries);
1910
1911 	/*
1912 	 * This flushes bufs in the order they were queued, so the LRU
1913 	 * order is preserved.
1914 	 */
1915 while ((bp = TAILQ_FIRST(&wl->wl_bufs)) != NULL) {
1916 if (bbusy(bp, 0, 0, &wl->wl_mtx)) {
1917 continue;
1918 }
1919 bp->b_iodone = wapbl_biodone;
1920 bp->b_private = we;
1921
1922 bremfree(bp);
1923 wapbl_remove_buf_locked(wl, bp);
1924 mutex_exit(&wl->wl_mtx);
1925 mutex_exit(&bufcache_lock);
1926 bawrite(bp);
1927 mutex_enter(&bufcache_lock);
1928 mutex_enter(&wl->wl_mtx);
1929 }
1930 mutex_exit(&wl->wl_mtx);
1931 mutex_exit(&bufcache_lock);
1932
1933 #if 0
1934 WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1935 ("wapbl_flush thread %d.%d done flushing entries...\n",
1936 curproc->p_pid, curlwp->l_lid));
1937 #endif
1938
1939 wait_out:
1940
1941 /*
1942 * If the waitfor flag is set, don't return until everything is
1943 * fully flushed and the on disk log is empty.
1944 */
1945 if (waitfor) {
1946 error = wapbl_truncate(wl, wl->wl_circ_size -
1947 wl->wl_reserved_bytes);
1948 }
1949
1950 out:
1951 if (error) {
1952 wl->wl_flush_abort(wl->wl_mount,
1953 TAILQ_FIRST(&wl->wl_dealloclist));
1954 }
1955
1956 #ifdef WAPBL_DEBUG_PRINT
1957 if (error) {
1958 pid_t pid = -1;
1959 lwpid_t lid = -1;
1960 if (curproc)
1961 pid = curproc->p_pid;
1962 if (curlwp)
1963 lid = curlwp->l_lid;
1964 mutex_enter(&wl->wl_mtx);
1965 #ifdef WAPBL_DEBUG_BUFBYTES
1966 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1967 ("wapbl_flush: thread %d.%d aborted flush: "
1968 "error = %d\n"
1969 "\tbufcount=%zu bufbytes=%zu bcount=%zu "
1970 "deallocs=%d inodes=%d\n"
1971 "\terrcnt = %d, reclaimable=%zu reserved=%zu "
1972 "unsynced=%zu\n",
1973 pid, lid, error, wl->wl_bufcount,
1974 wl->wl_bufbytes, wl->wl_bcount,
1975 wl->wl_dealloccnt, wl->wl_inohashcnt,
1976 wl->wl_error_count, wl->wl_reclaimable_bytes,
1977 wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes));
1978 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
1979 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1980 ("\tentry: bufcount = %zu, reclaimable = %zu, "
1981 "error = %d, unsynced = %zu\n",
1982 we->we_bufcount, we->we_reclaimable_bytes,
1983 we->we_error, we->we_unsynced_bufbytes));
1984 }
1985 #else
1986 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1987 ("wapbl_flush: thread %d.%d aborted flush: "
1988 "error = %d\n"
1989 "\tbufcount=%zu bufbytes=%zu bcount=%zu "
1990 "deallocs=%d inodes=%d\n"
1991 "\terrcnt = %d, reclaimable=%zu reserved=%zu\n",
1992 pid, lid, error, wl->wl_bufcount,
1993 wl->wl_bufbytes, wl->wl_bcount,
1994 wl->wl_dealloccnt, wl->wl_inohashcnt,
1995 wl->wl_error_count, wl->wl_reclaimable_bytes,
1996 wl->wl_reserved_bytes));
1997 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
1998 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1999 ("\tentry: bufcount = %zu, reclaimable = %zu, "
2000 "error = %d\n", we->we_bufcount,
2001 we->we_reclaimable_bytes, we->we_error));
2002 }
2003 #endif
2004 mutex_exit(&wl->wl_mtx);
2005 }
2006 #endif
2007
2008 rw_exit(&wl->wl_rwlock);
2009 return error;
2010 }
2011
2012 /****************************************************************/
2013
2014 void
2015 wapbl_jlock_assert(struct wapbl *wl)
2016 {
2017
2018 KASSERT(rw_lock_held(&wl->wl_rwlock));
2019 }
2020
2021 void
2022 wapbl_junlock_assert(struct wapbl *wl)
2023 {
2024
2025 KASSERT(!rw_write_held(&wl->wl_rwlock));
2026 }
2027
2028 /****************************************************************/
2029
2030 /* locks missing */
2031 void
2032 wapbl_print(struct wapbl *wl,
2033 int full,
2034 void (*pr)(const char *, ...))
2035 {
2036 struct buf *bp;
2037 struct wapbl_entry *we;
2038 (*pr)("wapbl %p", wl);
2039 (*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n",
2040 wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn);
2041 (*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n",
2042 wl->wl_circ_size, wl->wl_circ_off,
2043 (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail);
2044 	(*pr)("log_dev_bshift = %d, fs_dev_bshift = %d\n",
2045 	    wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift);
2046 #ifdef WAPBL_DEBUG_BUFBYTES
2047 (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
2048 "reserved = %zu errcnt = %d unsynced = %zu\n",
2049 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
2050 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
2051 wl->wl_error_count, wl->wl_unsynced_bufbytes);
2052 #else
2053 (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
2054 "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes,
2055 wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
2056 wl->wl_error_count);
2057 #endif
2058 (*pr)("\tdealloccnt = %d, dealloclim = %d\n",
2059 wl->wl_dealloccnt, wl->wl_dealloclim);
2060 (*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n",
2061 wl->wl_inohashcnt, wl->wl_inohashmask);
2062 (*pr)("entries:\n");
2063 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
2064 #ifdef WAPBL_DEBUG_BUFBYTES
2065 (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, "
2066 "unsynced = %zu\n",
2067 we->we_bufcount, we->we_reclaimable_bytes,
2068 we->we_error, we->we_unsynced_bufbytes);
2069 #else
2070 (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n",
2071 we->we_bufcount, we->we_reclaimable_bytes, we->we_error);
2072 #endif
2073 }
2074 if (full) {
2075 int cnt = 0;
2076 (*pr)("bufs =");
2077 TAILQ_FOREACH(bp, &wl->wl_bufs, b_wapbllist) {
2078 if (!TAILQ_NEXT(bp, b_wapbllist)) {
2079 (*pr)(" %p", bp);
2080 } else if ((++cnt % 6) == 0) {
2081 (*pr)(" %p,\n\t", bp);
2082 } else {
2083 (*pr)(" %p,", bp);
2084 }
2085 }
2086 (*pr)("\n");
2087
2088 (*pr)("dealloced blks = ");
2089 {
2090 struct wapbl_dealloc *wd;
2091 cnt = 0;
2092 TAILQ_FOREACH(wd, &wl->wl_dealloclist, wd_entries) {
2093 (*pr)(" %"PRId64":%d,",
2094 wd->wd_blkno,
2095 wd->wd_len);
2096 if ((++cnt % 4) == 0) {
2097 (*pr)("\n\t");
2098 }
2099 }
2100 }
2101 (*pr)("\n");
2102
2103 (*pr)("registered inodes = ");
2104 {
2105 int i;
2106 cnt = 0;
2107 for (i = 0; i <= wl->wl_inohashmask; i++) {
2108 struct wapbl_ino_head *wih;
2109 struct wapbl_ino *wi;
2110
2111 wih = &wl->wl_inohash[i];
2112 LIST_FOREACH(wi, wih, wi_hash) {
2113 if (wi->wi_ino == 0)
2114 continue;
2115 (*pr)(" %"PRIu64"/0%06"PRIo32",",
2116 wi->wi_ino, wi->wi_mode);
2117 if ((++cnt % 4) == 0) {
2118 (*pr)("\n\t");
2119 }
2120 }
2121 }
2122 (*pr)("\n");
2123 }
2124
 		cnt = 0;
2125 		(*pr)("iobufs free =");
2126 TAILQ_FOREACH(bp, &wl->wl_iobufs, b_wapbllist) {
2127 if (!TAILQ_NEXT(bp, b_wapbllist)) {
2128 (*pr)(" %p", bp);
2129 } else if ((++cnt % 6) == 0) {
2130 (*pr)(" %p,\n\t", bp);
2131 } else {
2132 (*pr)(" %p,", bp);
2133 }
2134 }
2135 (*pr)("\n");
2136
 		cnt = 0;
2137 		(*pr)("iobufs busy =");
2138 TAILQ_FOREACH(bp, &wl->wl_iobufs_busy, b_wapbllist) {
2139 if (!TAILQ_NEXT(bp, b_wapbllist)) {
2140 (*pr)(" %p", bp);
2141 } else if ((++cnt % 6) == 0) {
2142 (*pr)(" %p,\n\t", bp);
2143 } else {
2144 (*pr)(" %p,", bp);
2145 }
2146 }
2147 (*pr)("\n");
2148 }
2149 }
2150
2151 #if defined(WAPBL_DEBUG) || defined(DDB)
2152 void
2153 wapbl_dump(struct wapbl *wl)
2154 {
2155 #if defined(WAPBL_DEBUG)
2156 if (!wl)
2157 wl = wapbl_debug_wl;
2158 #endif
2159 if (!wl)
2160 return;
2161 wapbl_print(wl, 1, printf);
2162 }
2163 #endif
2164
2165 /****************************************************************/
2166
2167 int
2168 wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len, bool force,
2169 void **cookiep)
2170 {
2171 struct wapbl_dealloc *wd;
2172 int error = 0;
2173
2174 wapbl_jlock_assert(wl);
2175
2176 mutex_enter(&wl->wl_mtx);
2177
2178 if (__predict_false(wl->wl_dealloccnt >= wl->wl_dealloclim)) {
2179 if (!force) {
2180 error = EAGAIN;
2181 goto out;
2182 }
2183
2184 		/*
2185 		 * Forced registration can only be used when:
2186 		 * 1) the caller can't cope with failure
2187 		 * 2) the path can be triggered only a bounded, small
2188 		 *    number of times per transaction
2189 		 * If this is not fulfilled, and the path were triggered
2190 		 * many times, it could overflow the maximum transaction
2191 		 * size and panic later.
2192 		 */
2193 printf("%s: forced dealloc registration over limit: %d >= %d\n",
2194 wl->wl_mount->mnt_stat.f_mntonname,
2195 wl->wl_dealloccnt, wl->wl_dealloclim);
2196 }
2197
2198 wl->wl_dealloccnt++;
2199 mutex_exit(&wl->wl_mtx);
2200
2201 wd = pool_get(&wapbl_dealloc_pool, PR_WAITOK);
2202 wd->wd_blkno = blk;
2203 wd->wd_len = len;
2204
2205 mutex_enter(&wl->wl_mtx);
2206 TAILQ_INSERT_TAIL(&wl->wl_dealloclist, wd, wd_entries);
2207
2208 if (cookiep)
2209 *cookiep = wd;
2210
2211 out:
2212 mutex_exit(&wl->wl_mtx);
2213
2214 WAPBL_PRINTF(WAPBL_PRINT_ALLOC,
2215 ("wapbl_register_deallocation: blk=%"PRId64" len=%d error=%d\n",
2216 blk, len, error));
2217
2218 return error;
2219 }
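
/*
 * Illustrative usage sketch (not part of this file; fs_free_blocks()
 * and its arguments are hypothetical placeholders):
 *
 *	void *cookie;
 *	int error;
 *
 *	error = wapbl_register_deallocation(wl, blkno, size, false,
 *	    &cookie);
 *	if (error == EAGAIN) {
 *		...	flush the current transaction and retry, or
 *			fall back to force=true only on a path that
 *			runs a small, bounded number of times ...
 *	} else if (error == 0 && fs_free_blocks(blkno, size) != 0) {
 *		wapbl_unregister_deallocation(wl, cookie);
 *	}
 */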
2220
2221 static void
2222 wapbl_deallocation_free(struct wapbl *wl, struct wapbl_dealloc *wd,
2223 bool locked)
2224 {
2225 KASSERT(!locked
2226 || rw_lock_held(&wl->wl_rwlock) || mutex_owned(&wl->wl_mtx));
2227
2228 if (!locked)
2229 mutex_enter(&wl->wl_mtx);
2230
2231 TAILQ_REMOVE(&wl->wl_dealloclist, wd, wd_entries);
2232 wl->wl_dealloccnt--;
2233
2234 if (!locked)
2235 mutex_exit(&wl->wl_mtx);
2236
2237 pool_put(&wapbl_dealloc_pool, wd);
2238 }
2239
2240 void
2241 wapbl_unregister_deallocation(struct wapbl *wl, void *cookie)
2242 {
2243 KASSERT(cookie != NULL);
2244 wapbl_deallocation_free(wl, cookie, false);
2245 }
2246
2247 /****************************************************************/
2248
2249 static void
2250 wapbl_inodetrk_init(struct wapbl *wl, u_int size)
2251 {
2252
2253 wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask);
2254 if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) {
2255 pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0,
2256 "wapblinopl", &pool_allocator_nointr, IPL_NONE);
2257 }
2258 }
2259
2260 static void
2261 wapbl_inodetrk_free(struct wapbl *wl)
2262 {
2263
2264 /* XXX this KASSERT needs locking/mutex analysis */
2265 KASSERT(wl->wl_inohashcnt == 0);
2266 hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask);
2267 if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) {
2268 pool_destroy(&wapbl_ino_pool);
2269 }
2270 }
2271
2272 static struct wapbl_ino *
2273 wapbl_inodetrk_get(struct wapbl *wl, ino_t ino)
2274 {
2275 struct wapbl_ino_head *wih;
2276 struct wapbl_ino *wi;
2277
2278 KASSERT(mutex_owned(&wl->wl_mtx));
2279
2280 wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
2281 LIST_FOREACH(wi, wih, wi_hash) {
2282 if (ino == wi->wi_ino)
2283 return wi;
2284 }
2285 return 0;
2286 }
2287
2288 void
2289 wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode)
2290 {
2291 struct wapbl_ino_head *wih;
2292 struct wapbl_ino *wi;
2293
2294 wi = pool_get(&wapbl_ino_pool, PR_WAITOK);
2295
2296 mutex_enter(&wl->wl_mtx);
2297 if (wapbl_inodetrk_get(wl, ino) == NULL) {
2298 wi->wi_ino = ino;
2299 wi->wi_mode = mode;
2300 wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
2301 LIST_INSERT_HEAD(wih, wi, wi_hash);
2302 wl->wl_inohashcnt++;
2303 WAPBL_PRINTF(WAPBL_PRINT_INODE,
2304 ("wapbl_register_inode: ino=%"PRId64"\n", ino));
2305 mutex_exit(&wl->wl_mtx);
2306 } else {
2307 mutex_exit(&wl->wl_mtx);
2308 pool_put(&wapbl_ino_pool, wi);
2309 }
2310 }
2311
2312 void
2313 wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode)
2314 {
2315 struct wapbl_ino *wi;
2316
2317 mutex_enter(&wl->wl_mtx);
2318 wi = wapbl_inodetrk_get(wl, ino);
2319 if (wi) {
2320 WAPBL_PRINTF(WAPBL_PRINT_INODE,
2321 ("wapbl_unregister_inode: ino=%"PRId64"\n", ino));
2322 KASSERT(wl->wl_inohashcnt > 0);
2323 wl->wl_inohashcnt--;
2324 LIST_REMOVE(wi, wi_hash);
2325 mutex_exit(&wl->wl_mtx);
2326
2327 pool_put(&wapbl_ino_pool, wi);
2328 } else {
2329 mutex_exit(&wl->wl_mtx);
2330 }
2331 }
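
/*
 * Illustrative pairing (hypothetical caller; ip->i_number/i_mode are
 * placeholder names): a file system registers an inode that has been
 * unlinked while still open, and unregisters it once it is finally
 * reclaimed, so that replay can finish freeing it after a crash:
 *
 *	wapbl_register_inode(wl, ip->i_number, ip->i_mode);
 *	...
 *	wapbl_unregister_inode(wl, ip->i_number, ip->i_mode);
 */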
2332
2333 /****************************************************************/
2334
2335 /*
2336 * wapbl_transaction_inodes_len(wl)
2337 *
2338 * Calculate the number of bytes required for inode registration
2339 * log records in wl.
2340 */
2341 static inline size_t
2342 wapbl_transaction_inodes_len(struct wapbl *wl)
2343 {
2344 int blocklen = 1<<wl->wl_log_dev_bshift;
2345 int iph;
2346
2347 	/* Calculate the number of inodes described in an inodelist header */
2348 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
2349 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
2350
2351 KASSERT(iph > 0);
2352
2353 return MAX(1, howmany(wl->wl_inohashcnt, iph)) * blocklen;
2354 }
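
/*
 * Worked example (illustrative sizes): with a 512-byte log block, a
 * 16-byte inodelist header, and 16-byte wc_inodes[] entries,
 * iph = (512 - 16) / 16 = 31, so 100 registered inodes need
 * howmany(100, 31) = 4 log blocks = 2048 bytes; zero inodes still
 * cost one block for the (empty) inode list record.
 */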
2355
2356
2357 /*
2358 * wapbl_transaction_len(wl)
2359 *
2360 * Calculate number of bytes required for all log records in wl.
2361 */
2362 static size_t
2363 wapbl_transaction_len(struct wapbl *wl)
2364 {
2365 int blocklen = 1<<wl->wl_log_dev_bshift;
2366 size_t len;
2367
2368 	/* Data bytes plus blocklist, revocation, and inode record blocks */
2369 len = wl->wl_bcount;
2370 len += howmany(wl->wl_bufcount, wl->wl_brperjblock) * blocklen;
2371 len += howmany(wl->wl_dealloccnt, wl->wl_brperjblock) * blocklen;
2372 len += wapbl_transaction_inodes_len(wl);
2373
2374 return len;
2375 }
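
/*
 * Worked example (illustrative numbers): with blocklen = 512,
 * wl_brperjblock = 31, 100 buffers totalling wl_bcount = 819200
 * bytes, and no deallocations or registered inodes:
 *
 *	len = 819200			data blocks
 *	    + howmany(100, 31) * 512	blocklist headers  =  2048
 *	    + 0				revocation records
 *	    + 512			empty inode list record
 *	    = 821760 bytes
 *
 * i.e. the transaction is dominated by wl_bcount; headers cost one
 * log block per wl_brperjblock buffers.
 */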
2376
2377 /*
2378 * wapbl_cache_sync(wl, msg)
2379 *
2380 * Issue DIOCCACHESYNC to wl->wl_devvp.
2381 *
2382 * If sysctl(vfs.wapbl.verbose_commit) >= 2, print a message
2383 * including msg about the duration of the cache sync.
2384 */
2385 static int
2386 wapbl_cache_sync(struct wapbl *wl, const char *msg)
2387 {
2388 const bool verbose = wapbl_verbose_commit >= 2;
2389 struct bintime start_time;
2390 int force = 1;
2391 int error;
2392
2393 /* Skip full cache sync if disabled */
2394 if (!wapbl_flush_disk_cache) {
2395 return 0;
2396 }
2397 if (verbose) {
2398 bintime(&start_time);
2399 }
2400 error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force,
2401 FWRITE, FSCRED);
2402 if (error) {
2403 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
2404 ("wapbl_cache_sync: DIOCCACHESYNC on dev 0x%jx "
2405 "returned %d\n", (uintmax_t)wl->wl_devvp->v_rdev, error));
2406 }
2407 if (verbose) {
2408 struct bintime d;
2409 struct timespec ts;
2410
2411 bintime(&d);
2412 bintime_sub(&d, &start_time);
2413 bintime2timespec(&d, &ts);
2414 printf("wapbl_cache_sync: %s: dev 0x%jx %ju.%09lu\n",
2415 msg, (uintmax_t)wl->wl_devvp->v_rdev,
2416 (uintmax_t)ts.tv_sec, ts.tv_nsec);
2417 }
2418
2419 wl->wl_ev_cacheflush.ev_count++;
2420
2421 return error;
2422 }
2423
2424 /*
2425 * wapbl_write_commit(wl, head, tail)
2426 *
2427 * Issue a disk cache sync to wait for all pending writes to the
2428 * log to complete, and then synchronously commit the current
2429 * circular queue head and tail to the log, in the next of two
2430 * locations for commit headers on disk.
2431 *
2432 * Increment the generation number. If the generation number
2433 * rolls over to zero, then a subsequent commit would appear to
2434 * have an older generation than this one -- in that case, issue a
2435 * duplicate commit to avoid this.
2436 *
2437 * => Caller must have exclusive access to wl, either by holding
2438 * wl->wl_rwlock for writer or by being wapbl_start before anyone
2439 * else has seen wl.
2440 */
2441 static int
2442 wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail)
2443 {
2444 struct wapbl_wc_header *wc = wl->wl_wc_header;
2445 struct timespec ts;
2446 int error;
2447 daddr_t pbn;
2448
2449 error = wapbl_buffered_flush(wl, true);
2450 if (error)
2451 return error;
2452 	/*
2453 	 * Flush the disk cache to ensure that the blocks we've written
2454 	 * actually reach stable storage before the commit header.
2455 	 * This flushes to disk not only the journal blocks, but also all
2456 	 * metadata blocks written asynchronously since the previous commit.
2457 	 *
2458 	 * XXX Calculate the checksum here; instead we do this for now.
2459 	 */
2460 wapbl_cache_sync(wl, "1");
2461
2462 wc->wc_head = head;
2463 wc->wc_tail = tail;
2464 wc->wc_checksum = 0;
2465 wc->wc_version = 1;
2466 getnanotime(&ts);
2467 wc->wc_time = ts.tv_sec;
2468 wc->wc_timensec = ts.tv_nsec;
2469
2470 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2471 ("wapbl_write_commit: head = %"PRIdMAX "tail = %"PRIdMAX"\n",
2472 (intmax_t)head, (intmax_t)tail));
2473
2474 	/*
2475 	 * Write the commit header.
2476 	 *
2477 	 * XXX If the generation number is about to roll over, first zero
2478 	 * the second commit header before trying to write both headers.
2479 	 */
2480
2481 pbn = wl->wl_logpbn + (wc->wc_generation % 2);
2482 #ifdef _KERNEL
2483 pbn = btodb(pbn << wc->wc_log_dev_bshift);
2484 #endif
2485 error = wapbl_buffered_write(wc, wc->wc_len, wl, pbn, WAPBL_JFLAGS(wl));
2486 if (error)
2487 return error;
2488 error = wapbl_buffered_flush(wl, true);
2489 if (error)
2490 return error;
2491
2492 /*
2493 * Flush disk cache to ensure that the commit header is actually
2494 * written before meta data blocks. Commit block is written using
2495 * FUA when enabled, in that case this flush is not needed.
2496 */
2497 if (!WAPBL_USE_FUA(wl))
2498 wapbl_cache_sync(wl, "2");
2499
2500 	/*
2501 	 * If the generation number was zero, write it out a second time.
2502 	 * This handles initialization and generation number rollover.
2503 	 */
2504 if (wc->wc_generation++ == 0) {
2505 error = wapbl_write_commit(wl, head, tail);
2506 		/*
2507 		 * This panic could be removed if we did the zeroing
2508 		 * mentioned above and were certain to roll back the
2509 		 * generation number on failure.
2510 		 */
2511 if (error)
2512 panic("wapbl_write_commit: error writing duplicate "
2513 "log header: %d", error);
2514 }
2515
2516 wl->wl_ev_commit.ev_count++;
2517
2518 return 0;
2519 }
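
/*
 * Note on the two commit slots: since pbn alternates between
 * wl_logpbn + 0 and wl_logpbn + 1 with wc_generation % 2, a torn
 * write of one slot always leaves the previous commit header
 * intact.  wapbl_replay_start() reads both slots and trusts the
 * one with the larger generation number.
 */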
2520
2521 /*
2522 * wapbl_write_blocks(wl, offp)
2523 *
2524 * Write all pending physical blocks in the current transaction
2525 * from wapbl_add_buf to the log on disk, adding to the circular
2526 * queue head at byte offset *offp, and returning the new head's
2527 * byte offset in *offp.
2528 */
2529 static int
2530 wapbl_write_blocks(struct wapbl *wl, off_t *offp)
2531 {
2532 struct wapbl_wc_blocklist *wc =
2533 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
2534 int blocklen = 1<<wl->wl_log_dev_bshift;
2535 struct buf *bp;
2536 off_t off = *offp;
2537 int error;
2538 size_t padding;
2539
2540 KASSERT(rw_write_held(&wl->wl_rwlock));
2541
2542 bp = TAILQ_FIRST(&wl->wl_bufs);
2543
2544 while (bp) {
2545 int cnt;
2546 struct buf *obp = bp;
2547
2548 KASSERT(bp->b_flags & B_LOCKED);
2549
2550 wc->wc_type = WAPBL_WC_BLOCKS;
2551 wc->wc_len = blocklen;
2552 wc->wc_blkcount = 0;
2553 while (bp && (wc->wc_blkcount < wl->wl_brperjblock)) {
2554 /*
2555 * Make sure all the physical block numbers are up to
2556 * date. If this is not always true on a given
2557 * filesystem, then VOP_BMAP must be called. We
2558 * could call VOP_BMAP here, or else in the filesystem
2559 * specific flush callback, although neither of those
2560 * solutions allow us to take the vnode lock. If a
2561 * filesystem requires that we must take the vnode lock
2562 * to call VOP_BMAP, then we can probably do it in
2563 * bwrite when the vnode lock should already be held
2564 * by the invoking code.
2565 */
2566 KASSERT((bp->b_vp->v_type == VBLK) ||
2567 (bp->b_blkno != bp->b_lblkno));
2568 KASSERT(bp->b_blkno > 0);
2569
2570 wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno;
2571 wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount;
2572 wc->wc_len += bp->b_bcount;
2573 wc->wc_blkcount++;
2574 bp = TAILQ_NEXT(bp, b_wapbllist);
2575 }
2576 if (wc->wc_len % blocklen != 0) {
2577 padding = blocklen - wc->wc_len % blocklen;
2578 wc->wc_len += padding;
2579 } else {
2580 padding = 0;
2581 }
2582
2583 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2584 ("wapbl_write_blocks: len = %u (padding %zu) off = %"PRIdMAX"\n",
2585 wc->wc_len, padding, (intmax_t)off));
2586
2587 error = wapbl_circ_write(wl, wc, blocklen, &off);
2588 if (error)
2589 return error;
2590 bp = obp;
2591 cnt = 0;
2592 while (bp && (cnt++ < wl->wl_brperjblock)) {
2593 error = wapbl_circ_write(wl, bp->b_data,
2594 bp->b_bcount, &off);
2595 if (error)
2596 return error;
2597 bp = TAILQ_NEXT(bp, b_wapbllist);
2598 }
2599 if (padding) {
2600 void *zero;
2601
2602 zero = wapbl_alloc(padding);
2603 memset(zero, 0, padding);
2604 error = wapbl_circ_write(wl, zero, padding, &off);
2605 wapbl_free(zero, padding);
2606 if (error)
2607 return error;
2608 }
2609 }
2610 *offp = off;
2611 return 0;
2612 }
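
/*
 * The resulting on-disk record for each iteration above looks like
 * (one blocklist header block, then the raw buffer data, padded out
 * to a log block boundary):
 *
 *	[ header ][ buf0 data | buf1 data | ... ][ zero padding ]
 *	<-blocklen-><----------- wc_len - blocklen ------------->
 */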
2613
2614 /*
2615 * wapbl_write_revocations(wl, offp)
2616 *
2617 * Write all pending deallocations in the current transaction from
2618 * wapbl_register_deallocation to the log on disk, adding to the
2619 * circular queue's head at byte offset *offp, and returning the
2620 * new head's byte offset in *offp.
2621 */
2622 static int
2623 wapbl_write_revocations(struct wapbl *wl, off_t *offp)
2624 {
2625 struct wapbl_wc_blocklist *wc =
2626 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
2627 struct wapbl_dealloc *wd, *lwd;
2628 int blocklen = 1<<wl->wl_log_dev_bshift;
2629 off_t off = *offp;
2630 int error;
2631
2632 KASSERT(rw_write_held(&wl->wl_rwlock));
2633
2634 if (wl->wl_dealloccnt == 0)
2635 return 0;
2636
2637 while ((wd = TAILQ_FIRST(&wl->wl_dealloclist)) != NULL) {
2638 wc->wc_type = WAPBL_WC_REVOCATIONS;
2639 wc->wc_len = blocklen;
2640 wc->wc_blkcount = 0;
2641 while (wd && (wc->wc_blkcount < wl->wl_brperjblock)) {
2642 wc->wc_blocks[wc->wc_blkcount].wc_daddr =
2643 wd->wd_blkno;
2644 wc->wc_blocks[wc->wc_blkcount].wc_dlen =
2645 wd->wd_len;
2646 wc->wc_blkcount++;
2647
2648 wd = TAILQ_NEXT(wd, wd_entries);
2649 }
2650 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2651 ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n",
2652 wc->wc_len, (intmax_t)off));
2653 error = wapbl_circ_write(wl, wc, blocklen, &off);
2654 if (error)
2655 return error;
2656
2657 /* free all successfully written deallocs */
2658 lwd = wd;
2659 while ((wd = TAILQ_FIRST(&wl->wl_dealloclist)) != NULL) {
2660 if (wd == lwd)
2661 break;
2662 wapbl_deallocation_free(wl, wd, true);
2663 }
2664 }
2665 *offp = off;
2666 return 0;
2667 }
2668
2669 /*
2670 * wapbl_write_inodes(wl, offp)
2671 *
2672 * Write all pending inode allocations in the current transaction
2673 * from wapbl_register_inode to the log on disk, adding to the
2674 * circular queue's head at byte offset *offp and returning the
2675 * new head's byte offset in *offp.
2676 */
2677 static int
2678 wapbl_write_inodes(struct wapbl *wl, off_t *offp)
2679 {
2680 struct wapbl_wc_inodelist *wc =
2681 (struct wapbl_wc_inodelist *)wl->wl_wc_scratch;
2682 int i;
2683 int blocklen = 1 << wl->wl_log_dev_bshift;
2684 off_t off = *offp;
2685 int error;
2686
2687 struct wapbl_ino_head *wih;
2688 struct wapbl_ino *wi;
2689 int iph;
2690
2691 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
2692 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
2693
2694 i = 0;
2695 wih = &wl->wl_inohash[0];
2696 wi = 0;
2697 do {
2698 wc->wc_type = WAPBL_WC_INODES;
2699 wc->wc_len = blocklen;
2700 wc->wc_inocnt = 0;
2701 wc->wc_clear = (i == 0);
2702 while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) {
2703 while (!wi) {
2704 KASSERT((wih - &wl->wl_inohash[0])
2705 <= wl->wl_inohashmask);
2706 wi = LIST_FIRST(wih++);
2707 }
2708 wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino;
2709 wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode;
2710 wc->wc_inocnt++;
2711 i++;
2712 wi = LIST_NEXT(wi, wi_hash);
2713 }
2714 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2715 ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n",
2716 wc->wc_len, (intmax_t)off));
2717 error = wapbl_circ_write(wl, wc, blocklen, &off);
2718 if (error)
2719 return error;
2720 } while (i < wl->wl_inohashcnt);
2721
2722 *offp = off;
2723 return 0;
2724 }
2725
2726 #endif /* _KERNEL */
2727
2728 /****************************************************************/
2729
2730 struct wapbl_blk {
2731 LIST_ENTRY(wapbl_blk) wb_hash;
2732 daddr_t wb_blk;
2733 off_t wb_off; /* Offset of this block in the log */
2734 };
2735 #define WAPBL_BLKPOOL_MIN 83
2736
2737 static void
2738 wapbl_blkhash_init(struct wapbl_replay *wr, u_int size)
2739 {
2740 if (size < WAPBL_BLKPOOL_MIN)
2741 size = WAPBL_BLKPOOL_MIN;
2742 KASSERT(wr->wr_blkhash == 0);
2743 #ifdef _KERNEL
2744 wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask);
2745 #else /* ! _KERNEL */
2746 /* Manually implement hashinit */
2747 {
2748 unsigned long i, hashsize;
2749 for (hashsize = 1; hashsize < size; hashsize <<= 1)
2750 continue;
2751 wr->wr_blkhash = wapbl_alloc(hashsize * sizeof(*wr->wr_blkhash));
2752 for (i = 0; i < hashsize; i++)
2753 LIST_INIT(&wr->wr_blkhash[i]);
2754 wr->wr_blkhashmask = hashsize - 1;
2755 }
2756 #endif /* ! _KERNEL */
2757 }
2758
2759 static void
2760 wapbl_blkhash_free(struct wapbl_replay *wr)
2761 {
2762 KASSERT(wr->wr_blkhashcnt == 0);
2763 #ifdef _KERNEL
2764 hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask);
2765 #else /* ! _KERNEL */
2766 wapbl_free(wr->wr_blkhash,
2767 (wr->wr_blkhashmask + 1) * sizeof(*wr->wr_blkhash));
2768 #endif /* ! _KERNEL */
2769 }
2770
2771 static struct wapbl_blk *
2772 wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk)
2773 {
2774 struct wapbl_blk_head *wbh;
2775 struct wapbl_blk *wb;
2776 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2777 LIST_FOREACH(wb, wbh, wb_hash) {
2778 if (blk == wb->wb_blk)
2779 return wb;
2780 }
2781 return 0;
2782 }
2783
2784 static void
2785 wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off)
2786 {
2787 struct wapbl_blk_head *wbh;
2788 struct wapbl_blk *wb;
2789 wb = wapbl_blkhash_get(wr, blk);
2790 if (wb) {
2791 KASSERT(wb->wb_blk == blk);
2792 wb->wb_off = off;
2793 } else {
2794 wb = wapbl_alloc(sizeof(*wb));
2795 wb->wb_blk = blk;
2796 wb->wb_off = off;
2797 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2798 LIST_INSERT_HEAD(wbh, wb, wb_hash);
2799 wr->wr_blkhashcnt++;
2800 }
2801 }
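
/*
 * Note that an existing entry is updated in place rather than
 * duplicated, so when the same block is logged by several
 * transactions, replay sees only the most recently written copy.
 */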
2802
2803 static void
2804 wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk)
2805 {
2806 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2807 if (wb) {
2808 KASSERT(wr->wr_blkhashcnt > 0);
2809 wr->wr_blkhashcnt--;
2810 LIST_REMOVE(wb, wb_hash);
2811 wapbl_free(wb, sizeof(*wb));
2812 }
2813 }
2814
2815 static void
2816 wapbl_blkhash_clear(struct wapbl_replay *wr)
2817 {
2818 unsigned long i;
2819 for (i = 0; i <= wr->wr_blkhashmask; i++) {
2820 struct wapbl_blk *wb;
2821
2822 while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) {
2823 KASSERT(wr->wr_blkhashcnt > 0);
2824 wr->wr_blkhashcnt--;
2825 LIST_REMOVE(wb, wb_hash);
2826 wapbl_free(wb, sizeof(*wb));
2827 }
2828 }
2829 KASSERT(wr->wr_blkhashcnt == 0);
2830 }
2831
2832 /****************************************************************/
2833
2834 /*
2835 * wapbl_circ_read(wr, data, len, offp)
2836 *
2837 * Read len bytes into data from the circular queue of wr,
2838 * starting at the linear byte offset *offp, and returning the new
2839 * linear byte offset in *offp.
2840 *
2841 * If the starting linear byte offset precedes wr->wr_circ_off,
2842 * the read instead begins at wr->wr_circ_off. XXX WTF? This
2843 * should be a KASSERT, not a conditional.
2844 */
2845 static int
2846 wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp)
2847 {
2848 size_t slen;
2849 off_t off = *offp;
2850 int error;
2851 daddr_t pbn;
2852
2853 KASSERT(((len >> wr->wr_log_dev_bshift) <<
2854 wr->wr_log_dev_bshift) == len);
2855
2856 if (off < wr->wr_circ_off)
2857 off = wr->wr_circ_off;
2858 slen = wr->wr_circ_off + wr->wr_circ_size - off;
2859 if (slen < len) {
2860 pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
2861 #ifdef _KERNEL
2862 pbn = btodb(pbn << wr->wr_log_dev_bshift);
2863 #endif
2864 error = wapbl_read(data, slen, wr->wr_devvp, pbn);
2865 if (error)
2866 return error;
2867 data = (uint8_t *)data + slen;
2868 len -= slen;
2869 off = wr->wr_circ_off;
2870 }
2871 pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
2872 #ifdef _KERNEL
2873 pbn = btodb(pbn << wr->wr_log_dev_bshift);
2874 #endif
2875 error = wapbl_read(data, len, wr->wr_devvp, pbn);
2876 if (error)
2877 return error;
2878 off += len;
2879 if (off >= wr->wr_circ_off + wr->wr_circ_size)
2880 off = wr->wr_circ_off;
2881 *offp = off;
2882 return 0;
2883 }
2884
2885 /*
2886 * wapbl_circ_advance(wr, len, offp)
2887 *
2888 * Compute the linear byte offset of the circular queue of wr that
2889 * is len bytes past *offp, and store it in *offp.
2890 *
2891 	 * This is like wapbl_circ_read, but without actually reading
2892 	 * anything.
2893 *
2894 * If the starting linear byte offset precedes wr->wr_circ_off, it
2895 * is taken to be wr->wr_circ_off instead. XXX WTF? This should
2896 * be a KASSERT, not a conditional.
2897 */
2898 static void
2899 wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp)
2900 {
2901 size_t slen;
2902 off_t off = *offp;
2903
2904 KASSERT(((len >> wr->wr_log_dev_bshift) <<
2905 wr->wr_log_dev_bshift) == len);
2906
2907 if (off < wr->wr_circ_off)
2908 off = wr->wr_circ_off;
2909 slen = wr->wr_circ_off + wr->wr_circ_size - off;
2910 if (slen < len) {
2911 len -= slen;
2912 off = wr->wr_circ_off;
2913 }
2914 off += len;
2915 if (off >= wr->wr_circ_off + wr->wr_circ_size)
2916 off = wr->wr_circ_off;
2917 *offp = off;
2918 }
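
/*
 * Worked example of the wraparound: with wr_circ_off = 1024 and
 * wr_circ_size = 8192 (valid offsets 1024..9215), advancing
 * *offp = 8704 by len = 1024 gives slen = 9216 - 8704 = 512 < len,
 * so the offset wraps to 1024 and then advances by the remaining
 * 512 bytes, ending at 1536.
 */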
2919
2920 /****************************************************************/
2921
2922 int
2923 wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp,
2924 daddr_t off, size_t count, size_t blksize)
2925 {
2926 struct wapbl_replay *wr;
2927 int error;
2928 struct vnode *devvp;
2929 daddr_t logpbn;
2930 uint8_t *scratch;
2931 struct wapbl_wc_header *wch;
2932 struct wapbl_wc_header *wch2;
2933 /* Use this until we read the actual log header */
2934 int log_dev_bshift = ilog2(blksize);
2935 size_t used;
2936 daddr_t pbn;
2937
2938 WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
2939 ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n",
2940 vp, off, count, blksize));
2941
2942 if (off < 0)
2943 return EINVAL;
2944
2945 if (blksize < DEV_BSIZE)
2946 return EINVAL;
2947 if (blksize % DEV_BSIZE)
2948 return EINVAL;
2949
2950 #ifdef _KERNEL
2951 #if 0
2952 /* XXX vp->v_size isn't reliably set for VBLK devices,
2953 * especially root. However, we might still want to verify
2954 * that the full load is readable */
2955 if ((off + count) * blksize > vp->v_size)
2956 return EINVAL;
2957 #endif
2958 if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) {
2959 return error;
2960 }
2961 #else /* ! _KERNEL */
2962 devvp = vp;
2963 logpbn = off;
2964 #endif /* ! _KERNEL */
2965
2966 scratch = wapbl_alloc(MAXBSIZE);
2967
2968 pbn = logpbn;
2969 #ifdef _KERNEL
2970 pbn = btodb(pbn << log_dev_bshift);
2971 #endif
2972 error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, pbn);
2973 if (error)
2974 goto errout;
2975
2976 wch = (struct wapbl_wc_header *)scratch;
2977 wch2 =
2978 (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift));
2979 /* XXX verify checksums and magic numbers */
2980 if (wch->wc_type != WAPBL_WC_HEADER) {
2981 printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type);
2982 error = EFTYPE;
2983 goto errout;
2984 }
2985
2986 if (wch2->wc_generation > wch->wc_generation)
2987 wch = wch2;
2988
2989 wr = wapbl_calloc(1, sizeof(*wr));
2990
2991 wr->wr_logvp = vp;
2992 wr->wr_devvp = devvp;
2993 wr->wr_logpbn = logpbn;
2994
2995 wr->wr_scratch = scratch;
2996
2997 wr->wr_log_dev_bshift = wch->wc_log_dev_bshift;
2998 wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift;
2999 wr->wr_circ_off = wch->wc_circ_off;
3000 wr->wr_circ_size = wch->wc_circ_size;
3001 wr->wr_generation = wch->wc_generation;
3002
3003 used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail);
3004
3005 WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
3006 ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64
3007 " len=%"PRId64" used=%zu\n",
3008 wch->wc_head, wch->wc_tail, wch->wc_circ_off,
3009 wch->wc_circ_size, used));
3010
3011 wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift));
3012
3013 error = wapbl_replay_process(wr, wch->wc_head, wch->wc_tail);
3014 if (error) {
3015 wapbl_replay_stop(wr);
3016 wapbl_replay_free(wr);
3017 return error;
3018 }
3019
3020 *wrp = wr;
3021 return 0;
3022
3023 errout:
3024 wapbl_free(scratch, MAXBSIZE);
3025 return error;
3026 }
3027
3028 void
3029 wapbl_replay_stop(struct wapbl_replay *wr)
3030 {
3031
3032 if (!wapbl_replay_isopen(wr))
3033 return;
3034
3035 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n"));
3036
3037 wapbl_free(wr->wr_scratch, MAXBSIZE);
3038 wr->wr_scratch = NULL;
3039
3040 wr->wr_logvp = NULL;
3041
3042 wapbl_blkhash_clear(wr);
3043 wapbl_blkhash_free(wr);
3044 }
3045
3046 void
3047 wapbl_replay_free(struct wapbl_replay *wr)
3048 {
3049
3050 KDASSERT(!wapbl_replay_isopen(wr));
3051
3052 if (wr->wr_inodes)
3053 wapbl_free(wr->wr_inodes,
3054 wr->wr_inodescnt * sizeof(wr->wr_inodes[0]));
3055 wapbl_free(wr, sizeof(*wr));
3056 }
3057
3058 #ifdef _KERNEL
3059 int
3060 wapbl_replay_isopen1(struct wapbl_replay *wr)
3061 {
3062
3063 return wapbl_replay_isopen(wr);
3064 }
3065 #endif
3066
3067 /*
3068  * Calculate the disk address for the i'th block in the wc_blocks
3069  * list, offset by j blocks of size blen.
3070  *
3071  * wc_daddr is always a kernel disk address in DEV_BSIZE units that
3072  * was written to the journal.
3073  *
3074  * The kernel needs that address plus the offset in DEV_BSIZE units.
3075  *
3076  * Userland needs that address plus the offset in blen units.
3077  */
3079 static daddr_t
3080 wapbl_block_daddr(struct wapbl_wc_blocklist *wc, int i, int j, int blen)
3081 {
3082 daddr_t pbn;
3083
3084 #ifdef _KERNEL
3085 pbn = wc->wc_blocks[i].wc_daddr + btodb(j * blen);
3086 #else
3087 pbn = dbtob(wc->wc_blocks[i].wc_daddr) / blen + j;
3088 #endif
3089
3090 return pbn;
3091 }
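
/*
 * Worked example (DEV_BSIZE = 512, blen = 2048): for an entry with
 * wc_daddr = 96, the j = 1 block is at
 *
 *	kernel:   96 + btodb(1 * 2048) = 96 + 4 = 100	DEV_BSIZE units
 *	userland: dbtob(96) / 2048 + 1 = 24 + 1 = 25	blen units
 */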
3092
3093 static void
3094 wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp)
3095 {
3096 struct wapbl_wc_blocklist *wc =
3097 (struct wapbl_wc_blocklist *)wr->wr_scratch;
3098 int fsblklen = 1 << wr->wr_fs_dev_bshift;
3099 int i, j, n;
3100
3101 for (i = 0; i < wc->wc_blkcount; i++) {
3102 /*
3103 * Enter each physical block into the hashtable independently.
3104 */
3105 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
3106 for (j = 0; j < n; j++) {
3107 wapbl_blkhash_ins(wr, wapbl_block_daddr(wc, i, j, fsblklen),
3108 *offp);
3109 wapbl_circ_advance(wr, fsblklen, offp);
3110 }
3111 }
3112 }
3113
3114 static void
3115 wapbl_replay_process_revocations(struct wapbl_replay *wr)
3116 {
3117 struct wapbl_wc_blocklist *wc =
3118 (struct wapbl_wc_blocklist *)wr->wr_scratch;
3119 int fsblklen = 1 << wr->wr_fs_dev_bshift;
3120 int i, j, n;
3121
3122 for (i = 0; i < wc->wc_blkcount; i++) {
3123 /*
3124 * Remove any blocks found from the hashtable.
3125 */
3126 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
3127 for (j = 0; j < n; j++)
3128 wapbl_blkhash_rem(wr, wapbl_block_daddr(wc, i, j, fsblklen));
3129 }
3130 }
3131
3132 static void
3133 wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff, off_t newoff)
3134 {
3135 struct wapbl_wc_inodelist *wc =
3136 (struct wapbl_wc_inodelist *)wr->wr_scratch;
3137 void *new_inodes;
3138 const size_t oldsize = wr->wr_inodescnt * sizeof(wr->wr_inodes[0]);
3139
3140 KASSERT(sizeof(wr->wr_inodes[0]) == sizeof(wc->wc_inodes[0]));
3141
3142 	/*
3143 	 * Keep track of where we found this so the location won't be
3144 	 * overwritten.
3145 	 */
3146 if (wc->wc_clear) {
3147 wr->wr_inodestail = oldoff;
3148 wr->wr_inodescnt = 0;
3149 if (wr->wr_inodes != NULL) {
3150 wapbl_free(wr->wr_inodes, oldsize);
3151 wr->wr_inodes = NULL;
3152 }
3153 }
3154 wr->wr_inodeshead = newoff;
3155 if (wc->wc_inocnt == 0)
3156 return;
3157
3158 new_inodes = wapbl_alloc((wr->wr_inodescnt + wc->wc_inocnt) *
3159 sizeof(wr->wr_inodes[0]));
3160 if (wr->wr_inodes != NULL) {
3161 memcpy(new_inodes, wr->wr_inodes, oldsize);
3162 wapbl_free(wr->wr_inodes, oldsize);
3163 }
3164 wr->wr_inodes = new_inodes;
3165 memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes,
3166 wc->wc_inocnt * sizeof(wr->wr_inodes[0]));
3167 wr->wr_inodescnt += wc->wc_inocnt;
3168 }
3169
3170 static int
3171 wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail)
3172 {
3173 off_t off;
3174 int error;
3175
3176 int logblklen = 1 << wr->wr_log_dev_bshift;
3177
3178 wapbl_blkhash_clear(wr);
3179
3180 off = tail;
3181 while (off != head) {
3182 struct wapbl_wc_null *wcn;
3183 off_t saveoff = off;
3184 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
3185 if (error)
3186 goto errout;
3187 wcn = (struct wapbl_wc_null *)wr->wr_scratch;
3188 switch (wcn->wc_type) {
3189 case WAPBL_WC_BLOCKS:
3190 wapbl_replay_process_blocks(wr, &off);
3191 break;
3192
3193 case WAPBL_WC_REVOCATIONS:
3194 wapbl_replay_process_revocations(wr);
3195 break;
3196
3197 case WAPBL_WC_INODES:
3198 wapbl_replay_process_inodes(wr, saveoff, off);
3199 break;
3200
3201 default:
3202 printf("Unrecognized wapbl type: 0x%08x\n",
3203 wcn->wc_type);
3204 error = EFTYPE;
3205 goto errout;
3206 }
3207 wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
3208 if (off != saveoff) {
3209 printf("wapbl_replay: corrupted records\n");
3210 error = EFTYPE;
3211 goto errout;
3212 }
3213 }
3214 return 0;
3215
3216 errout:
3217 wapbl_blkhash_clear(wr);
3218 return error;
3219 }
3220
3221 #if 0
3222 int
3223 wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp)
3224 {
3225 off_t off;
3226 int mismatchcnt = 0;
3227 int logblklen = 1 << wr->wr_log_dev_bshift;
3228 int fsblklen = 1 << wr->wr_fs_dev_bshift;
3229 void *scratch1 = wapbl_alloc(MAXBSIZE);
3230 void *scratch2 = wapbl_alloc(MAXBSIZE);
3231 int error = 0;
3232
3233 KDASSERT(wapbl_replay_isopen(wr));
3234
3235 off = wch->wc_tail;
3236 while (off != wch->wc_head) {
3237 struct wapbl_wc_null *wcn;
3238 #ifdef DEBUG
3239 off_t saveoff = off;
3240 #endif
3241 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
3242 if (error)
3243 goto out;
3244 wcn = (struct wapbl_wc_null *)wr->wr_scratch;
3245 switch (wcn->wc_type) {
3246 case WAPBL_WC_BLOCKS:
3247 {
3248 struct wapbl_wc_blocklist *wc =
3249 (struct wapbl_wc_blocklist *)wr->wr_scratch;
3250 int i;
3251 for (i = 0; i < wc->wc_blkcount; i++) {
3252 int foundcnt = 0;
3253 int dirtycnt = 0;
3254 int j, n;
3255 				/*
3256 				 * Check each physical block against the
3257 				 * hashtable independently.
3258 				 */
3259 n = wc->wc_blocks[i].wc_dlen >>
3260 wch->wc_fs_dev_bshift;
3261 for (j = 0; j < n; j++) {
3262 struct wapbl_blk *wb =
3263 wapbl_blkhash_get(wr,
3264 wapbl_block_daddr(wc, i, j, fsblklen));
3265 if (wb && (wb->wb_off == off)) {
3266 foundcnt++;
3267 error =
3268 wapbl_circ_read(wr,
3269 scratch1, fsblklen,
3270 &off);
3271 if (error)
3272 goto out;
3273 error =
3274 wapbl_read(scratch2,
3275 fsblklen, fsdevvp,
3276 wb->wb_blk);
3277 if (error)
3278 goto out;
3279 if (memcmp(scratch1,
3280 scratch2,
3281 fsblklen)) {
3282 printf(
3283 "wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n",
3284 wb->wb_blk, (intmax_t)off);
3285 dirtycnt++;
3286 mismatchcnt++;
3287 }
3288 } else {
3289 wapbl_circ_advance(wr,
3290 fsblklen, &off);
3291 }
3292 }
3293 #if 0
3294 /*
3295 * If all of the blocks in an entry
3296 * are clean, then remove all of its
3297 * blocks from the hashtable since they
3298 * never will need replay.
3299 */
3300 if ((foundcnt != 0) &&
3301 (dirtycnt == 0)) {
3302 off = saveoff;
3303 wapbl_circ_advance(wr,
3304 logblklen, &off);
3305 for (j = 0; j < n; j++) {
3306 struct wapbl_blk *wb =
3307 wapbl_blkhash_get(wr,
3308 wapbl_block_daddr(wc, i, j, fsblklen));
3309 if (wb &&
3310 (wb->wb_off == off)) {
3311 wapbl_blkhash_rem(wr, wb->wb_blk);
3312 }
3313 wapbl_circ_advance(wr,
3314 fsblklen, &off);
3315 }
3316 }
3317 #endif
3318 }
3319 }
3320 break;
3321 case WAPBL_WC_REVOCATIONS:
3322 case WAPBL_WC_INODES:
3323 break;
3324 default:
3325 KASSERT(0);
3326 }
3327 #ifdef DEBUG
3328 wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
3329 KASSERT(off == saveoff);
3330 #endif
3331 }
3332 out:
3333 wapbl_free(scratch1, MAXBSIZE);
3334 wapbl_free(scratch2, MAXBSIZE);
3335 if (!error && mismatchcnt)
3336 error = EFTYPE;
3337 return error;
3338 }
3339 #endif
3340
3341 int
3342 wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp)
3343 {
3344 struct wapbl_blk *wb;
3345 size_t i;
3346 off_t off;
3347 void *scratch;
3348 int error = 0;
3349 int fsblklen = 1 << wr->wr_fs_dev_bshift;
3350
3351 KDASSERT(wapbl_replay_isopen(wr));
3352
3353 scratch = wapbl_alloc(MAXBSIZE);
3354
3355 for (i = 0; i <= wr->wr_blkhashmask; ++i) {
3356 LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) {
3357 off = wb->wb_off;
3358 error = wapbl_circ_read(wr, scratch, fsblklen, &off);
3359 if (error)
3360 break;
3361 error = wapbl_write(scratch, fsblklen, fsdevvp,
3362 wb->wb_blk);
3363 if (error)
3364 break;
3365 }
3366 }
3367
3368 wapbl_free(scratch, MAXBSIZE);
3369 return error;
3370 }
3371
3372 int
3373 wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len)
3374 {
3375 int fsblklen = 1 << wr->wr_fs_dev_bshift;
3376
3377 KDASSERT(wapbl_replay_isopen(wr));
3378 KASSERT((len % fsblklen) == 0);
3379
3380 	while (len != 0) {
3381 		struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
3382 		if (wb)
3383 			return 1;
3384 		len -= fsblklen;
 		blk++;		/* advance to the next fs block, as in wapbl_replay_read */
3385 	}
3386 return 0;
3387 }
3388
3389 int
3390 wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len)
3391 {
3392 int fsblklen = 1 << wr->wr_fs_dev_bshift;
3393
3394 KDASSERT(wapbl_replay_isopen(wr));
3395
3396 KASSERT((len % fsblklen) == 0);
3397
3398 while (len != 0) {
3399 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
3400 if (wb) {
3401 off_t off = wb->wb_off;
3402 int error;
3403 error = wapbl_circ_read(wr, data, fsblklen, &off);
3404 if (error)
3405 return error;
3406 }
3407 data = (uint8_t *)data + fsblklen;
3408 len -= fsblklen;
3409 blk++;
3410 }
3411 return 0;
3412 }
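
/*
 * Illustrative replay sequence (hypothetical mount-time caller;
 * logvp, logstart, logsize, blksize, and fsdevvp are placeholders,
 * and error handling is elided).  All functions shown are the real
 * entry points from this file:
 *
 *	struct wapbl_replay *wr;
 *	int error;
 *
 *	error = wapbl_replay_start(&wr, logvp, logstart, logsize,
 *	    blksize);
 *	if (error == 0) {
 *		error = wapbl_replay_write(wr, fsdevvp);
 *		wapbl_replay_stop(wr);
 *		wapbl_replay_free(wr);
 *	}
 */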
3413
3414 #ifdef _KERNEL
3415
3416 MODULE(MODULE_CLASS_VFS, wapbl, NULL);
3417
3418 static int
3419 wapbl_modcmd(modcmd_t cmd, void *arg)
3420 {
3421
3422 switch (cmd) {
3423 case MODULE_CMD_INIT:
3424 wapbl_init();
3425 return 0;
3426 case MODULE_CMD_FINI:
3427 return wapbl_fini();
3428 default:
3429 return ENOTTY;
3430 }
3431 }
3432 #endif /* _KERNEL */
3433