1 /*	$NetBSD: vfs_wapbl.c,v 1.98 2017/10/23 19:03:40 jdolecek Exp $	*/
2
3 /*-
4 * Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Wasabi Systems, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * This implements file-system-independent write-ahead logging (WAPBL).
34 */
35
36 #define WAPBL_INTERNAL
37
38 #include <sys/cdefs.h>
39 __KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.98 2017/10/23 19:03:40 jdolecek Exp $");
40
41 #include <sys/param.h>
42 #include <sys/bitops.h>
43 #include <sys/time.h>
44 #include <sys/wapbl.h>
45 #include <sys/wapbl_replay.h>
46
47 #ifdef _KERNEL
48
49 #include <sys/atomic.h>
50 #include <sys/conf.h>
51 #include <sys/evcnt.h>
52 #include <sys/file.h>
53 #include <sys/kauth.h>
54 #include <sys/kernel.h>
55 #include <sys/module.h>
56 #include <sys/mount.h>
57 #include <sys/mutex.h>
58 #include <sys/namei.h>
59 #include <sys/proc.h>
60 #include <sys/resourcevar.h>
61 #include <sys/sysctl.h>
62 #include <sys/uio.h>
63 #include <sys/vnode.h>
64
65 #include <miscfs/specfs/specdev.h>
66
67 #define wapbl_alloc(s) kmem_alloc((s), KM_SLEEP)
68 #define wapbl_free(a, s) kmem_free((a), (s))
69 #define wapbl_calloc(n, s) kmem_zalloc((n)*(s), KM_SLEEP)
70
71 static struct sysctllog *wapbl_sysctl;
72 static int wapbl_flush_disk_cache = 1;
73 static int wapbl_verbose_commit = 0;
74 static int wapbl_allow_dpofua = 0; /* switched off by default for now */
75 static int wapbl_journal_iobufs = 4;
76
77 static inline size_t wapbl_space_free(size_t, off_t, off_t);
78
79 #else /* !_KERNEL */
80
81 #include <assert.h>
82 #include <errno.h>
83 #include <stdbool.h>
84 #include <stdio.h>
85 #include <stdlib.h>
86 #include <string.h>
87
88 #define KDASSERT(x) assert(x)
89 #define KASSERT(x) assert(x)
90 #define wapbl_alloc(s) malloc(s)
91 #define wapbl_free(a, s) free(a)
92 #define wapbl_calloc(n, s) calloc((n), (s))
93
94 #endif /* !_KERNEL */
95
96 /*
97 * INTERNAL DATA STRUCTURES
98 */
99
100 /*
101 * This structure holds per-mount log information.
102 *
103 * Legend: a = atomic access only
104 * r = read-only after init
105 * l = rwlock held
106 * m = mutex held
107 * lm = rwlock held writing or mutex held
108 * u = unlocked access ok
109 * b = bufcache_lock held
110 */
111 LIST_HEAD(wapbl_ino_head, wapbl_ino);
112 struct wapbl {
113 struct vnode *wl_logvp; /* r: log here */
114 struct vnode *wl_devvp; /* r: log on this device */
115 struct mount *wl_mount; /* r: mountpoint wl is associated with */
116 daddr_t wl_logpbn; /* r: Physical block number of start of log */
117 int wl_log_dev_bshift; /* r: logarithm of device block size of log
118 device */
119 int wl_fs_dev_bshift; /* r: logarithm of device block size of
120 filesystem device */
121
122 unsigned wl_lock_count; /* m: Count of transactions in progress */
123
124 size_t wl_circ_size; /* r: Number of bytes in buffer of log */
125 size_t wl_circ_off; /* r: Number of bytes reserved at start */
126
127 size_t wl_bufcount_max; /* r: Number of buffers reserved for log */
128 size_t wl_bufbytes_max; /* r: Number of buf bytes reserved for log */
129
130 off_t wl_head; /* l: Byte offset of log head */
131 off_t wl_tail; /* l: Byte offset of log tail */
132 /*
133 * WAPBL log layout, stored on wl_devvp at wl_logpbn:
134 *
135 * ___________________ wl_circ_size __________________
136 * / \
137 * +---------+---------+-------+--------------+--------+
138 * [ commit0 | commit1 | CCWCW | EEEEEEEEEEEE | CCCWCW ]
139 * +---------+---------+-------+--------------+--------+
140 * wl_circ_off --^ ^-- wl_head ^-- wl_tail
141 *
142 * commit0 and commit1 are commit headers. A commit header has
143 * a generation number, indicating which of the two headers is
144 * more recent, and an assignment of head and tail pointers.
145 * The rest is a circular queue of log records, starting at
146 * the byte offset wl_circ_off.
147 *
148 * E marks empty space for records.
149 * W marks records for block writes issued but waiting.
150 * C marks completed records.
151 *
152 * wapbl_flush writes new records to empty `E' spaces after
153 * wl_head from the current transaction in memory.
154 *
155 * wapbl_truncate advances wl_tail past any completed `C'
156 * records, freeing them up for use.
157 *
158 * head == tail == 0 means log is empty.
159 * head == tail != 0 means log is full.
160 *
161 * See assertions in wapbl_advance() for other boundary
162 * conditions.
163 *
164 * Only wapbl_flush moves the head, except when wapbl_truncate
165 * sets it to 0 to indicate that the log is empty.
166 *
167 * Only wapbl_truncate moves the tail, except when wapbl_flush
168 * sets it to wl_circ_off to indicate that the log is full.
169 */
170
171 struct wapbl_wc_header *wl_wc_header; /* l */
172 void *wl_wc_scratch; /* l: scratch space (XXX: why?!?) */
173
174 kmutex_t wl_mtx; /* u: short-term lock */
175 krwlock_t wl_rwlock; /* u: File system transaction lock */
176
177 /*
178 * One of the locks above must be held while accessing
179 * the buffer counts, wl_bufs, or the head and tail offsets.
180 */
181
182 #if _KERNEL
183 /*
184 * Callback called from within the flush routine to flush any extra
185 * bits. Note that flush may be skipped without calling this if
186 * there are no outstanding buffers in the transaction.
187 */
188 wapbl_flush_fn_t wl_flush; /* r */
189 wapbl_flush_fn_t wl_flush_abort;/* r */
190
191 /* Event counters */
192 char wl_ev_group[EVCNT_STRING_MAX]; /* r */
193 struct evcnt wl_ev_commit; /* l */
194 struct evcnt wl_ev_journalwrite; /* l */
195 struct evcnt wl_ev_jbufs_bio_nowait; /* l */
196 struct evcnt wl_ev_metawrite; /* lm */
197 struct evcnt wl_ev_cacheflush; /* l */
198 #endif
199
200 size_t wl_bufbytes; /* m: Byte count of pages in wl_bufs */
201 size_t wl_bufcount; /* m: Count of buffers in wl_bufs */
202 size_t wl_bcount; /* m: Total bcount of wl_bufs */
203
204 TAILQ_HEAD(, buf) wl_bufs; /* m: Buffers in current transaction */
205
206 kcondvar_t wl_reclaimable_cv; /* m (obviously) */
207 size_t wl_reclaimable_bytes; /* m: Amount of space available for
208 reclamation by truncate */
209 int wl_error_count; /* m: # of wl_entries with errors */
210 size_t wl_reserved_bytes; /* never truncate log smaller than this */
211
212 #ifdef WAPBL_DEBUG_BUFBYTES
213 size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */
214 #endif
215
216 #if _KERNEL
217 int wl_brperjblock; /* r Block records per journal block */
218 #endif
219
220 TAILQ_HEAD(, wapbl_dealloc) wl_dealloclist; /* lm: list head */
221 int wl_dealloccnt; /* lm: total count */
222 int wl_dealloclim; /* r: max count */
223
224 /* hashtable of inode numbers for allocated but unlinked inodes */
225 /* XXX: how is access synchronized? */
226 struct wapbl_ino_head *wl_inohash;
227 u_long wl_inohashmask;
228 int wl_inohashcnt;
229
230 SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction
231 accounting */
232
233 /* buffers for wapbl_buffered_write() */
234 TAILQ_HEAD(, buf) wl_iobufs; /* l: Free or filling bufs */
235 TAILQ_HEAD(, buf) wl_iobufs_busy; /* l: In-transit bufs */
236
237 int wl_dkcache; /* r: disk cache flags */
238 #define WAPBL_USE_FUA(wl) \
239 (wapbl_allow_dpofua && ISSET((wl)->wl_dkcache, DKCACHE_FUA))
240 #define WAPBL_JFLAGS(wl) \
241 (WAPBL_USE_FUA(wl) ? (wl)->wl_jwrite_flags : 0)
242 #define WAPBL_MFLAGS(wl) \
243 (WAPBL_USE_FUA(wl) ? (wl)->wl_mwrite_flags : 0)
244 int wl_jwrite_flags; /* r: journal write flags */
245 int wl_mwrite_flags; /* r: metadata write flags */
246 };
247
248 #ifdef WAPBL_DEBUG_PRINT
249 int wapbl_debug_print = WAPBL_DEBUG_PRINT;
250 #endif
251
252 /****************************************************************/
253 #ifdef _KERNEL
254
255 #ifdef WAPBL_DEBUG
256 struct wapbl *wapbl_debug_wl;
257 #endif
258
259 static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail);
260 static int wapbl_write_blocks(struct wapbl *wl, off_t *offp);
261 static int wapbl_write_revocations(struct wapbl *wl, off_t *offp);
262 static int wapbl_write_inodes(struct wapbl *wl, off_t *offp);
263 #endif /* _KERNEL */
264
265 static int wapbl_replay_process(struct wapbl_replay *wr, off_t, off_t);
266
267 static inline size_t wapbl_space_used(size_t avail, off_t head,
268 off_t tail);
269
270 #ifdef _KERNEL
271
272 static struct pool wapbl_entry_pool;
273 static struct pool wapbl_dealloc_pool;
274
275 #define WAPBL_INODETRK_SIZE 83
276 static int wapbl_ino_pool_refcount;
277 static struct pool wapbl_ino_pool;
278 struct wapbl_ino {
279 LIST_ENTRY(wapbl_ino) wi_hash;
280 ino_t wi_ino;
281 mode_t wi_mode;
282 };
283
284 static void wapbl_inodetrk_init(struct wapbl *wl, u_int size);
285 static void wapbl_inodetrk_free(struct wapbl *wl);
286 static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino);
287
288 static size_t wapbl_transaction_len(struct wapbl *wl);
289 static inline size_t wapbl_transaction_inodes_len(struct wapbl *wl);
290
291 static void wapbl_deallocation_free(struct wapbl *, struct wapbl_dealloc *,
292 bool);
293
294 static void wapbl_evcnt_init(struct wapbl *);
295 static void wapbl_evcnt_free(struct wapbl *);
296
297 static void wapbl_dkcache_init(struct wapbl *);
298
299 #if 0
300 int wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
301 #endif
302
303 static int wapbl_replay_isopen1(struct wapbl_replay *);
304
305 struct wapbl_ops wapbl_ops = {
306 .wo_wapbl_discard = wapbl_discard,
307 .wo_wapbl_replay_isopen = wapbl_replay_isopen1,
308 .wo_wapbl_replay_can_read = wapbl_replay_can_read,
309 .wo_wapbl_replay_read = wapbl_replay_read,
310 .wo_wapbl_add_buf = wapbl_add_buf,
311 .wo_wapbl_remove_buf = wapbl_remove_buf,
312 .wo_wapbl_resize_buf = wapbl_resize_buf,
313 .wo_wapbl_begin = wapbl_begin,
314 .wo_wapbl_end = wapbl_end,
315 .wo_wapbl_junlock_assert= wapbl_junlock_assert,
316
317 /* XXX: the following is only used to say "this is a wapbl buf" */
318 .wo_wapbl_biodone = wapbl_biodone,
319 };
320
321 static int
322 wapbl_sysctl_init(void)
323 {
324 int rv;
325 const struct sysctlnode *rnode, *cnode;
326
327 wapbl_sysctl = NULL;
328
329 rv = sysctl_createv(&wapbl_sysctl, 0, NULL, &rnode,
330 CTLFLAG_PERMANENT,
331 CTLTYPE_NODE, "wapbl",
332 SYSCTL_DESCR("WAPBL journaling options"),
333 NULL, 0, NULL, 0,
334 CTL_VFS, CTL_CREATE, CTL_EOL);
335 if (rv)
336 return rv;
337
338 rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
339 CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
340 CTLTYPE_INT, "flush_disk_cache",
341 SYSCTL_DESCR("flush disk cache"),
342 NULL, 0, &wapbl_flush_disk_cache, 0,
343 CTL_CREATE, CTL_EOL);
344 if (rv)
345 return rv;
346
347 rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
348 CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
349 CTLTYPE_INT, "verbose_commit",
350 SYSCTL_DESCR("show time and size of wapbl log commits"),
351 NULL, 0, &wapbl_verbose_commit, 0,
352 CTL_CREATE, CTL_EOL);
353 if (rv)
354 return rv;
355
356 rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
357 CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
358 CTLTYPE_INT, "allow_dpofua",
359 SYSCTL_DESCR("allow use of FUA/DPO instead of cache flush if available"),
360 NULL, 0, &wapbl_allow_dpofua, 0,
361 CTL_CREATE, CTL_EOL);
362 if (rv)
363 return rv;
364
365 rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
366 CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
367 CTLTYPE_INT, "journal_iobufs",
368 SYSCTL_DESCR("count of bufs used for journal I/O (max async count)"),
369 NULL, 0, &wapbl_journal_iobufs, 0,
370 CTL_CREATE, CTL_EOL);
371 if (rv)
372 return rv;
373
374 return rv;
375 }
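/*
 * Example usage (a sketch, not part of this file): the nodes above are
 * created under CTL_VFS with the name "wapbl", so with the usual
 * sysctl(8) name mapping they should be tunable from userland as e.g.:
 *
 *	sysctl -w vfs.wapbl.verbose_commit=1	# log commit time and size
 *	sysctl -w vfs.wapbl.allow_dpofua=1	# prefer FUA/DPO to cache flush
 */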
376
377 static void
378 wapbl_init(void)
379 {
380
381 pool_init(&wapbl_entry_pool, sizeof(struct wapbl_entry), 0, 0, 0,
382 "wapblentrypl", &pool_allocator_kmem, IPL_VM);
383 pool_init(&wapbl_dealloc_pool, sizeof(struct wapbl_dealloc), 0, 0, 0,
384 "wapbldealloc", &pool_allocator_nointr, IPL_NONE);
385
386 wapbl_sysctl_init();
387 }
388
389 static int
390 wapbl_fini(void)
391 {
392
393 if (wapbl_sysctl != NULL)
394 sysctl_teardown(&wapbl_sysctl);
395
396 pool_destroy(&wapbl_dealloc_pool);
397 pool_destroy(&wapbl_entry_pool);
398
399 return 0;
400 }
401
402 static void
403 wapbl_evcnt_init(struct wapbl *wl)
404 {
405 snprintf(wl->wl_ev_group, sizeof(wl->wl_ev_group),
406 "wapbl fsid 0x%x/0x%x",
407 wl->wl_mount->mnt_stat.f_fsidx.__fsid_val[0],
408 wl->wl_mount->mnt_stat.f_fsidx.__fsid_val[1]
409 );
410
411 evcnt_attach_dynamic(&wl->wl_ev_commit, EVCNT_TYPE_MISC,
412 NULL, wl->wl_ev_group, "commit");
413 evcnt_attach_dynamic(&wl->wl_ev_journalwrite, EVCNT_TYPE_MISC,
414 NULL, wl->wl_ev_group, "journal write total");
415 evcnt_attach_dynamic(&wl->wl_ev_jbufs_bio_nowait, EVCNT_TYPE_MISC,
416 NULL, wl->wl_ev_group, "journal write finished async");
417 evcnt_attach_dynamic(&wl->wl_ev_metawrite, EVCNT_TYPE_MISC,
418 NULL, wl->wl_ev_group, "metadata async write");
419 evcnt_attach_dynamic(&wl->wl_ev_cacheflush, EVCNT_TYPE_MISC,
420 NULL, wl->wl_ev_group, "cache flush");
421 }
422
423 static void
424 wapbl_evcnt_free(struct wapbl *wl)
425 {
426 evcnt_detach(&wl->wl_ev_commit);
427 evcnt_detach(&wl->wl_ev_journalwrite);
428 evcnt_detach(&wl->wl_ev_jbufs_bio_nowait);
429 evcnt_detach(&wl->wl_ev_metawrite);
430 evcnt_detach(&wl->wl_ev_cacheflush);
431 }
432
433 static void
434 wapbl_dkcache_init(struct wapbl *wl)
435 {
436 int error;
437
438 /* Get disk cache flags */
439 error = VOP_IOCTL(wl->wl_devvp, DIOCGCACHE, &wl->wl_dkcache,
440 FWRITE, FSCRED);
441 if (error) {
442 /* behave as if there was a write cache */
443 wl->wl_dkcache = DKCACHE_WRITE;
444 }
445
446 /* Use FUA instead of cache flush if available */
447 if (ISSET(wl->wl_dkcache, DKCACHE_FUA)) {
448 wl->wl_jwrite_flags |= B_MEDIA_FUA;
449 wl->wl_mwrite_flags |= B_MEDIA_FUA;
450 }
451
452 /* Use DPO for journal writes if available */
453 if (ISSET(wl->wl_dkcache, DKCACHE_DPO))
454 wl->wl_jwrite_flags |= B_MEDIA_DPO;
455 }
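/*
 * Worked example (illustrative, under stated assumptions): on a disk
 * whose DIOCGCACHE reports DKCACHE_FUA|DKCACHE_DPO, the code above
 * leaves wl_jwrite_flags == B_MEDIA_FUA|B_MEDIA_DPO and
 * wl_mwrite_flags == B_MEDIA_FUA.  Once the wapbl_allow_dpofua sysctl
 * is enabled, WAPBL_JFLAGS(wl) tags every journal write with those
 * media flags, allowing the commit path to rely on FUA ordering
 * instead of issuing a separate full cache flush.
 */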
456
457 static int
458 wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr)
459 {
460 int error, i;
461
462 WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
463 ("wapbl_start: reusing log with %d inodes\n", wr->wr_inodescnt));
464
465 /*
466 * It's only valid to reuse the replay log if it's
467 * the same as the new log we just opened.
468 */
469 KDASSERT(!wapbl_replay_isopen(wr));
470 KASSERT(wl->wl_devvp->v_type == VBLK);
471 KASSERT(wr->wr_devvp->v_type == VBLK);
472 KASSERT(wl->wl_devvp->v_rdev == wr->wr_devvp->v_rdev);
473 KASSERT(wl->wl_logpbn == wr->wr_logpbn);
474 KASSERT(wl->wl_circ_size == wr->wr_circ_size);
475 KASSERT(wl->wl_circ_off == wr->wr_circ_off);
476 KASSERT(wl->wl_log_dev_bshift == wr->wr_log_dev_bshift);
477 KASSERT(wl->wl_fs_dev_bshift == wr->wr_fs_dev_bshift);
478
479 wl->wl_wc_header->wc_generation = wr->wr_generation + 1;
480
481 for (i = 0; i < wr->wr_inodescnt; i++)
482 wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber,
483 wr->wr_inodes[i].wr_imode);
484
485 /* Make sure new transaction won't overwrite old inodes list */
486 KDASSERT(wapbl_transaction_len(wl) <=
487 wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead,
488 wr->wr_inodestail));
489
490 wl->wl_head = wl->wl_tail = wr->wr_inodeshead;
491 wl->wl_reclaimable_bytes = wl->wl_reserved_bytes =
492 wapbl_transaction_len(wl);
493
494 error = wapbl_write_inodes(wl, &wl->wl_head);
495 if (error)
496 return error;
497
498 KASSERT(wl->wl_head != wl->wl_tail);
499 KASSERT(wl->wl_head != 0);
500
501 return 0;
502 }
503
504 int
505 wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp,
506 daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr,
507 wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn)
508 {
509 struct wapbl *wl;
510 struct vnode *devvp;
511 daddr_t logpbn;
512 int error;
513 int log_dev_bshift = ilog2(blksize);
514 int fs_dev_bshift = log_dev_bshift;
515 int run;
516
517 WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64
518 " count=%zu blksize=%zu\n", vp, off, count, blksize));
519
520 if (log_dev_bshift > fs_dev_bshift) {
521 WAPBL_PRINTF(WAPBL_PRINT_OPEN,
522 ("wapbl: log device's block size cannot be larger "
523 "than filesystem's\n"));
524 /*
525 * Not currently implemented, although it could be if
526 * needed someday.
527 */
528 return ENOSYS;
529 }
530
531 if (off < 0)
532 return EINVAL;
533
534 if (blksize < DEV_BSIZE)
535 return EINVAL;
536 if (blksize % DEV_BSIZE)
537 return EINVAL;
538
539 /* XXXTODO: verify that the full load is writable */
540
541 /*
542 * XXX check for minimum log size
543 * minimum is governed by minimum amount of space
544 * to complete a transaction. (probably truncate)
545 */
546 /* XXX for now pick something minimal */
547 if ((count * blksize) < MAXPHYS) {
548 return ENOSPC;
549 }
550
551 if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) {
552 return error;
553 }
554
555 wl = wapbl_calloc(1, sizeof(*wl));
556 rw_init(&wl->wl_rwlock);
557 mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE);
558 cv_init(&wl->wl_reclaimable_cv, "wapblrec");
559 TAILQ_INIT(&wl->wl_bufs);
560 SIMPLEQ_INIT(&wl->wl_entries);
561
562 wl->wl_logvp = vp;
563 wl->wl_devvp = devvp;
564 wl->wl_mount = mp;
565 wl->wl_logpbn = logpbn;
566 wl->wl_log_dev_bshift = log_dev_bshift;
567 wl->wl_fs_dev_bshift = fs_dev_bshift;
568
569 wl->wl_flush = flushfn;
570 wl->wl_flush_abort = flushabortfn;
571
572 /* Reserve two log device blocks for the commit headers */
573 wl->wl_circ_off = 2<<wl->wl_log_dev_bshift;
574 wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off);
575 /* truncate the log usage to a multiple of log_dev_bshift */
576 wl->wl_circ_size >>= wl->wl_log_dev_bshift;
577 wl->wl_circ_size <<= wl->wl_log_dev_bshift;
578
579 /*
580 * wl_bufbytes_max limits the size of the in memory transaction space.
581 * - Since buffers are allocated and accounted for in units of
582 * PAGE_SIZE it is required to be a multiple of PAGE_SIZE
583 * (i.e. 1<<PAGE_SHIFT)
584 * - Since the log device has to be written in units of
585 * 1<<wl_log_dev_bshift it is required to be a multiple of
586 * 1<<wl_log_dev_bshift.
587 * - Since the file system will provide data in units of 1<<wl_fs_dev_bshift,
588 * it is convenient to be a multiple of 1<<wl_fs_dev_bshift.
589 * Therefore it must be multiple of the least common multiple of those
590 * three quantities. Fortunately, all of those quantities are
591 * guaranteed to be a power of two, and the least common multiple of
592 * a set of numbers which are all powers of two is simply the maximum
593 * of those numbers. Finally, the maximum logarithm of a power of two
594 * is the same as the log of the maximum power of two. So we can do
595 * the following operations to size wl_bufbytes_max:
596 */
597
598 /* XXX fix actual number of pages reserved per filesystem. */
599 wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2);
600
601 /* Round wl_bufbytes_max to the largest power of two constraint */
602 wl->wl_bufbytes_max >>= PAGE_SHIFT;
603 wl->wl_bufbytes_max <<= PAGE_SHIFT;
604 wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift;
605 wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift;
606 wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift;
607 wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift;
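/*
 * Worked example of the rounding above (illustrative numbers): with
 * PAGE_SHIFT = 12 and both bshift values 9, a starting value of
 * 173555 (0x2A5F3) becomes 172032 (0x2A000) after the first shift
 * pair, and the later pairs leave it unchanged because a multiple of
 * 4096 is already a multiple of 512; the net effect is rounding down
 * to the largest of the three power-of-two alignments.
 */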
608
609 /* XXX maybe use filesystem fragment size instead of 1024 */
610 /* XXX fix actual number of buffers reserved per filesystem. */
611 wl->wl_bufcount_max = (buf_nbuf() / 2) * 1024;
612
613 wl->wl_brperjblock = ((1<<wl->wl_log_dev_bshift)
614 - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
615 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
616 KASSERT(wl->wl_brperjblock > 0);
617
618 /* XXX tie this into resource estimation */
619 wl->wl_dealloclim = wl->wl_bufbytes_max / mp->mnt_stat.f_bsize / 2;
620 TAILQ_INIT(&wl->wl_dealloclist);
621
622 wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);
623
624 wapbl_evcnt_init(wl);
625
626 wapbl_dkcache_init(wl);
627
628 /* Initialize the commit header */
629 {
630 struct wapbl_wc_header *wc;
631 size_t len = 1 << wl->wl_log_dev_bshift;
632 wc = wapbl_calloc(1, len);
633 wc->wc_type = WAPBL_WC_HEADER;
634 wc->wc_len = len;
635 wc->wc_circ_off = wl->wl_circ_off;
636 wc->wc_circ_size = wl->wl_circ_size;
637 /* XXX wc->wc_fsid */
638 wc->wc_log_dev_bshift = wl->wl_log_dev_bshift;
639 wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift;
640 wl->wl_wc_header = wc;
641 wl->wl_wc_scratch = wapbl_alloc(len);
642 }
643
644 TAILQ_INIT(&wl->wl_iobufs);
645 TAILQ_INIT(&wl->wl_iobufs_busy);
646 for (int i = 0; i < wapbl_journal_iobufs; i++) {
647 struct buf *bp;
648
649 if ((bp = geteblk(MAXPHYS)) == NULL)
650 goto errout;
651
652 mutex_enter(&bufcache_lock);
653 mutex_enter(devvp->v_interlock);
654 bgetvp(devvp, bp);
655 mutex_exit(devvp->v_interlock);
656 mutex_exit(&bufcache_lock);
657
658 bp->b_dev = devvp->v_rdev;
659
660 TAILQ_INSERT_TAIL(&wl->wl_iobufs, bp, b_wapbllist);
661 }
662
663 /*
664 * if there was an existing set of unlinked but
665 * allocated inodes, preserve it in the new
666 * log.
667 */
668 if (wr && wr->wr_inodescnt) {
669 error = wapbl_start_flush_inodes(wl, wr);
670 if (error)
671 goto errout;
672 }
673
674 error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail);
675 if (error) {
676 goto errout;
677 }
678
679 *wlp = wl;
680 #if defined(WAPBL_DEBUG)
681 wapbl_debug_wl = wl;
682 #endif
683
684 return 0;
685 errout:
686 wapbl_discard(wl);
687 wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
688 wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
689 while (!TAILQ_EMPTY(&wl->wl_iobufs)) {
690 struct buf *bp;
691
692 bp = TAILQ_FIRST(&wl->wl_iobufs);
693 TAILQ_REMOVE(&wl->wl_iobufs, bp, b_wapbllist);
694 brelse(bp, BC_INVAL);
695 }
696 wapbl_inodetrk_free(wl);
697 wapbl_free(wl, sizeof(*wl));
698
699 return error;
700 }
701
702 /*
703 * Like wapbl_flush, except that it discards the current
704 * transaction completely.
705 */
706
707 void
708 wapbl_discard(struct wapbl *wl)
709 {
710 struct wapbl_entry *we;
711 struct wapbl_dealloc *wd;
712 struct buf *bp;
713 int i;
714
715 /*
716 * XXX we may consider using upgrade here
717 * if we want to call flush from inside a transaction
718 */
719 rw_enter(&wl->wl_rwlock, RW_WRITER);
720 wl->wl_flush(wl->wl_mount, TAILQ_FIRST(&wl->wl_dealloclist));
721
722 #ifdef WAPBL_DEBUG_PRINT
723 {
724 pid_t pid = -1;
725 lwpid_t lid = -1;
726 if (curproc)
727 pid = curproc->p_pid;
728 if (curlwp)
729 lid = curlwp->l_lid;
730 #ifdef WAPBL_DEBUG_BUFBYTES
731 WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
732 ("wapbl_discard: thread %d.%d discarding "
733 "transaction\n"
734 "\tbufcount=%zu bufbytes=%zu bcount=%zu "
735 "deallocs=%d inodes=%d\n"
736 "\terrcnt = %u, reclaimable=%zu reserved=%zu "
737 "unsynced=%zu\n",
738 pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
739 wl->wl_bcount, wl->wl_dealloccnt,
740 wl->wl_inohashcnt, wl->wl_error_count,
741 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
742 wl->wl_unsynced_bufbytes));
743 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
744 WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
745 ("\tentry: bufcount = %zu, reclaimable = %zu, "
746 "error = %d, unsynced = %zu\n",
747 we->we_bufcount, we->we_reclaimable_bytes,
748 we->we_error, we->we_unsynced_bufbytes));
749 }
750 #else /* !WAPBL_DEBUG_BUFBYTES */
751 WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
752 ("wapbl_discard: thread %d.%d discarding transaction\n"
753 "\tbufcount=%zu bufbytes=%zu bcount=%zu "
754 "deallocs=%d inodes=%d\n"
755 "\terrcnt = %u, reclaimable=%zu reserved=%zu\n",
756 pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
757 wl->wl_bcount, wl->wl_dealloccnt,
758 wl->wl_inohashcnt, wl->wl_error_count,
759 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes));
760 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
761 WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
762 ("\tentry: bufcount = %zu, reclaimable = %zu, "
763 "error = %d\n",
764 we->we_bufcount, we->we_reclaimable_bytes,
765 we->we_error));
766 }
767 #endif /* !WAPBL_DEBUG_BUFBYTES */
768 }
769 #endif /* WAPBL_DEBUG_PRINT */
770
771 for (i = 0; i <= wl->wl_inohashmask; i++) {
772 struct wapbl_ino_head *wih;
773 struct wapbl_ino *wi;
774
775 wih = &wl->wl_inohash[i];
776 while ((wi = LIST_FIRST(wih)) != NULL) {
777 LIST_REMOVE(wi, wi_hash);
778 pool_put(&wapbl_ino_pool, wi);
779 KASSERT(wl->wl_inohashcnt > 0);
780 wl->wl_inohashcnt--;
781 }
782 }
783
784 /*
785 * clean buffer list
786 */
787 mutex_enter(&bufcache_lock);
788 mutex_enter(&wl->wl_mtx);
789 while ((bp = TAILQ_FIRST(&wl->wl_bufs)) != NULL) {
790 if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) {
791 /*
792 * The buffer will be unlocked and
793 * removed from the transaction in brelse
794 */
795 mutex_exit(&wl->wl_mtx);
796 brelsel(bp, 0);
797 mutex_enter(&wl->wl_mtx);
798 }
799 }
800 mutex_exit(&wl->wl_mtx);
801 mutex_exit(&bufcache_lock);
802
803 /*
804 * Remove references to this wl from wl_entries, free any which
805 * no longer have buffers, others will be freed in wapbl_biodone
806 * when they no longer have any buffers.
807 */
808 while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) {
809 SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
810 /* XXX should we be accumulating wl_error_count
811 * and increasing reclaimable bytes ? */
812 we->we_wapbl = NULL;
813 if (we->we_bufcount == 0) {
814 #ifdef WAPBL_DEBUG_BUFBYTES
815 KASSERT(we->we_unsynced_bufbytes == 0);
816 #endif
817 pool_put(&wapbl_entry_pool, we);
818 }
819 }
820
821 /* Discard list of deallocs */
822 while ((wd = TAILQ_FIRST(&wl->wl_dealloclist)) != NULL)
823 wapbl_deallocation_free(wl, wd, true);
824
825 /* XXX should we clear wl_reserved_bytes? */
826
827 KASSERT(wl->wl_bufbytes == 0);
828 KASSERT(wl->wl_bcount == 0);
829 KASSERT(wl->wl_bufcount == 0);
830 KASSERT(TAILQ_EMPTY(&wl->wl_bufs));
831 KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
832 KASSERT(wl->wl_inohashcnt == 0);
833 KASSERT(TAILQ_EMPTY(&wl->wl_dealloclist));
834 KASSERT(wl->wl_dealloccnt == 0);
835
836 rw_exit(&wl->wl_rwlock);
837 }
838
839 int
840 wapbl_stop(struct wapbl *wl, int force)
841 {
842 int error;
843
844 WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n"));
845 error = wapbl_flush(wl, 1);
846 if (error) {
847 if (force)
848 wapbl_discard(wl);
849 else
850 return error;
851 }
852
853 /* Unlinked inodes persist after a flush */
854 if (wl->wl_inohashcnt) {
855 if (force) {
856 wapbl_discard(wl);
857 } else {
858 return EBUSY;
859 }
860 }
861
862 KASSERT(wl->wl_bufbytes == 0);
863 KASSERT(wl->wl_bcount == 0);
864 KASSERT(wl->wl_bufcount == 0);
865 KASSERT(TAILQ_EMPTY(&wl->wl_bufs));
866 KASSERT(wl->wl_dealloccnt == 0);
867 KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
868 KASSERT(wl->wl_inohashcnt == 0);
869 KASSERT(TAILQ_EMPTY(&wl->wl_dealloclist));
870 KASSERT(wl->wl_dealloccnt == 0);
871 KASSERT(TAILQ_EMPTY(&wl->wl_iobufs_busy));
872
873 wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
874 wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
875 while (!TAILQ_EMPTY(&wl->wl_iobufs)) {
876 struct buf *bp;
877
878 bp = TAILQ_FIRST(&wl->wl_iobufs);
879 TAILQ_REMOVE(&wl->wl_iobufs, bp, b_wapbllist);
880 brelse(bp, BC_INVAL);
881 }
882 wapbl_inodetrk_free(wl);
883
884 wapbl_evcnt_free(wl);
885
886 cv_destroy(&wl->wl_reclaimable_cv);
887 mutex_destroy(&wl->wl_mtx);
888 rw_destroy(&wl->wl_rwlock);
889 wapbl_free(wl, sizeof(*wl));
890
891 return 0;
892 }
893
894 /****************************************************************/
895 /*
896 * Unbuffered disk I/O
897 */
898
899 static void
900 wapbl_doio_accounting(struct vnode *devvp, int flags)
901 {
902 struct pstats *pstats = curlwp->l_proc->p_stats;
903
904 if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
905 mutex_enter(devvp->v_interlock);
906 devvp->v_numoutput++;
907 mutex_exit(devvp->v_interlock);
908 pstats->p_ru.ru_oublock++;
909 } else {
910 pstats->p_ru.ru_inblock++;
911 }
912
913 }
914
915 static int
916 wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
917 {
918 struct buf *bp;
919 int error;
920
921 KASSERT(devvp->v_type == VBLK);
922
923 wapbl_doio_accounting(devvp, flags);
924
925 bp = getiobuf(devvp, true);
926 bp->b_flags = flags;
927 bp->b_cflags = BC_BUSY; /* mandatory, asserted by biowait() */
928 bp->b_dev = devvp->v_rdev;
929 bp->b_data = data;
930 bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
931 bp->b_blkno = pbn;
932 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
933
934 WAPBL_PRINTF(WAPBL_PRINT_IO,
935 ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%"PRIx64"\n",
936 BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount,
937 bp->b_blkno, bp->b_dev));
938
939 VOP_STRATEGY(devvp, bp);
940
941 error = biowait(bp);
942 putiobuf(bp);
943
944 if (error) {
945 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
946 ("wapbl_doio: %s %zu bytes at block %" PRId64
947 " on dev 0x%"PRIx64" failed with error %d\n",
948 (((flags & (B_WRITE | B_READ)) == B_WRITE) ?
949 "write" : "read"),
950 len, pbn, devvp->v_rdev, error));
951 }
952
953 return error;
954 }
955
956 /*
957 * wapbl_write(data, len, devvp, pbn)
958 *
959 * Synchronously write len bytes from data to physical block pbn
960 * on devvp.
961 */
962 int
963 wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
964 {
965
966 return wapbl_doio(data, len, devvp, pbn, B_WRITE);
967 }
968
969 /*
970 * wapbl_read(data, len, devvp, pbn)
971 *
972 * Synchronously read len bytes into data from physical block pbn
973 * on devvp.
974 */
975 int
976 wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
977 {
978
979 return wapbl_doio(data, len, devvp, pbn, B_READ);
980 }
981
982 /****************************************************************/
983 /*
984 * Buffered disk writes -- try to coalesce writes and emit
985 * MAXPHYS-aligned blocks.
986 */
987
988 /*
989 * wapbl_buffered_write_async(wl, bp)
990 *
991 * Send buffer for asynchronous write.
992 */
993 static void
994 wapbl_buffered_write_async(struct wapbl *wl, struct buf *bp)
995 {
996 wapbl_doio_accounting(wl->wl_devvp, bp->b_flags);
997
998 KASSERT(TAILQ_FIRST(&wl->wl_iobufs) == bp);
999 TAILQ_REMOVE(&wl->wl_iobufs, bp, b_wapbllist);
1000
1001 bp->b_flags = B_WRITE | WAPBL_JFLAGS(wl);
1002 bp->b_cflags = BC_BUSY; /* mandatory, asserted by biowait() */
1003 bp->b_oflags = 0;
1004 bp->b_bcount = bp->b_resid;
1005 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
1006
1007 VOP_STRATEGY(wl->wl_devvp, bp);
1008
1009 wl->wl_ev_journalwrite.ev_count++;
1010
1011 TAILQ_INSERT_TAIL(&wl->wl_iobufs_busy, bp, b_wapbllist);
1012 }
1013
1014 /*
1015 * wapbl_buffered_flush(wl)
1016 *
1017 * Flush any buffered writes from wapbl_buffered_write.
1018 */
1019 static int
1020 wapbl_buffered_flush(struct wapbl *wl, bool full)
1021 {
1022 int error = 0;
1023 struct buf *bp, *bnext;
1024 bool only_done = true, found = false;
1025
1026 /* if there is outstanding buffered write, send it now */
1027 if ((bp = TAILQ_FIRST(&wl->wl_iobufs)) && bp->b_resid > 0)
1028 wapbl_buffered_write_async(wl, bp);
1029
1030 /* wait for I/O to complete */
1031 again:
1032 TAILQ_FOREACH_SAFE(bp, &wl->wl_iobufs_busy, b_wapbllist, bnext) {
1033 if (!full && only_done) {
1034 /* skip unfinished */
1035 if (!ISSET(bp->b_oflags, BO_DONE))
1036 continue;
1037 }
1038
1039 if (ISSET(bp->b_oflags, BO_DONE))
1040 wl->wl_ev_jbufs_bio_nowait.ev_count++;
1041
1042 TAILQ_REMOVE(&wl->wl_iobufs_busy, bp, b_wapbllist);
1043 error = biowait(bp);
1044
1045 /* reset for reuse */
1046 bp->b_blkno = bp->b_resid = 0;
1047 TAILQ_INSERT_TAIL(&wl->wl_iobufs, bp, b_wapbllist);
1048 found = true;
1049
1050 if (!full)
1051 break;
1052 }
1053
1054 if (!found && only_done && !TAILQ_EMPTY(&wl->wl_iobufs_busy)) {
1055 only_done = false;
1056 goto again;
1057 }
1058
1059 return error;
1060 }
1061
1062 /*
1063 * wapbl_buffered_write(data, len, wl, pbn)
1064 *
1065 * Write len bytes from data to physical block pbn on
1066 * wl->wl_devvp. The write may not complete until
1067 * wapbl_buffered_flush.
1068 */
1069 static int
1070 wapbl_buffered_write(void *data, size_t len, struct wapbl *wl, daddr_t pbn)
1071 {
1072 size_t resid;
1073 struct buf *bp;
1074
1075 again:
1076 bp = TAILQ_FIRST(&wl->wl_iobufs);
1077
1078 if (bp == NULL) {
1079 /* No more buffers, wait for any previous I/O to finish. */
1080 wapbl_buffered_flush(wl, false);
1081
1082 bp = TAILQ_FIRST(&wl->wl_iobufs);
1083 KASSERT(bp != NULL);
1084 }
1085
1086 /*
1087 * If the write is not adjacent to the buffered data, flush first.
1088 * The disk block address is always valid for a non-empty buffer.
1089 */
1090 if ((bp->b_resid > 0 && pbn != bp->b_blkno + btodb(bp->b_resid))) {
1091 wapbl_buffered_write_async(wl, bp);
1092 goto again;
1093 }
1094
1095 /*
1096 * If this write goes to an empty buffer we have to
1097 * save the disk block address first.
1098 */
1099 if (bp->b_blkno == 0)
1100 bp->b_blkno = pbn;
1101
1102 /*
1103 * Remaining space so this buffer ends on a buffer size boundary.
1104 *
1105 * This cannot become less than or equal to zero, since the buffer
1106 * would already have been flushed by the previous call in that case.
1107 */
1108 resid = bp->b_bufsize - dbtob(bp->b_blkno % btodb(bp->b_bufsize)) -
1109 bp->b_resid;
1110 KASSERT(resid > 0);
1111 KASSERT(dbtob(btodb(resid)) == resid);
1112
1113 if (len < resid)
1114 resid = len;
1115
1116 memcpy((uint8_t *)bp->b_data + bp->b_resid, data, resid);
1117 bp->b_resid += resid;
1118
1119 if (len >= resid) {
1120 /* Just filled the buf, or data did not fit */
1121 wapbl_buffered_write_async(wl, bp);
1122
1123 data = (uint8_t *)data + resid;
1124 len -= resid;
1125 pbn += btodb(resid);
1126
1127 if (len > 0)
1128 goto again;
1129 }
1130
1131 return 0;
1132 }
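/*
 * Worked example (illustrative numbers): two back-to-back calls with
 * 512-byte payloads at pbn 100 and pbn 101 coalesce into one iobuf:
 * the first sets b_blkno = 100 and b_resid = 512, and the second
 * matches pbn == b_blkno + btodb(b_resid), so it is appended.  A third
 * call at pbn 200 fails the adjacency test and pushes the accumulated
 * 1024 bytes out via wapbl_buffered_write_async() before starting a
 * fresh buffer.
 */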
1133
1134 /*
1135 * wapbl_circ_write(wl, data, len, offp)
1136 *
1137 * Write len bytes from data to the circular queue of wl, starting
1138 * at linear byte offset *offp, and returning the new linear byte
1139 * offset in *offp.
1140 *
1141 * If the starting linear byte offset precedes wl->wl_circ_off,
1142 * the write instead begins at wl->wl_circ_off. XXX WTF? This
1143 * should be a KASSERT, not a conditional.
1144 *
1145 * The write is buffered in wl and must be flushed with
1146 * wapbl_buffered_flush before it will be submitted to the disk.
1147 */
1148 static int
1149 wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp)
1150 {
1151 size_t slen;
1152 off_t off = *offp;
1153 int error;
1154 daddr_t pbn;
1155
1156 KDASSERT(((len >> wl->wl_log_dev_bshift) <<
1157 wl->wl_log_dev_bshift) == len);
1158
1159 if (off < wl->wl_circ_off)
1160 off = wl->wl_circ_off;
1161 slen = wl->wl_circ_off + wl->wl_circ_size - off;
1162 if (slen < len) {
1163 pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
1164 #ifdef _KERNEL
1165 pbn = btodb(pbn << wl->wl_log_dev_bshift);
1166 #endif
1167 error = wapbl_buffered_write(data, slen, wl, pbn);
1168 if (error)
1169 return error;
1170 data = (uint8_t *)data + slen;
1171 len -= slen;
1172 off = wl->wl_circ_off;
1173 }
1174 pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
1175 #ifdef _KERNEL
1176 pbn = btodb(pbn << wl->wl_log_dev_bshift);
1177 #endif
1178 error = wapbl_buffered_write(data, len, wl, pbn);
1179 if (error)
1180 return error;
1181 off += len;
1182 if (off >= wl->wl_circ_off + wl->wl_circ_size)
1183 off = wl->wl_circ_off;
1184 *offp = off;
1185 return 0;
1186 }
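/*
 * Worked example of the wraparound above (illustrative numbers): with
 * wl_circ_off = 1024 and wl_circ_size = 8192, the record area spans
 * byte offsets [1024, 9216).  A 2048-byte write starting at
 * *offp = 8704 computes slen = 1024 + 8192 - 8704 = 512, writes those
 * 512 bytes at the end of the area, then writes the remaining 1536
 * bytes starting back at offset 1024, leaving *offp = 2560.
 */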
1187
1188 /****************************************************************/
1189 /*
1190 * WAPBL transactions: entering, adding/removing bufs, and exiting
1191 */
1192
1193 int
1194 wapbl_begin(struct wapbl *wl, const char *file, int line)
1195 {
1196 int doflush;
1197 unsigned lockcount;
1198
1199 KDASSERT(wl);
1200
1201 /*
1202 * XXX this needs to be made much more sophisticated.
1203 * perhaps each wapbl_begin could reserve a specified
1204 * number of buffers and bytes.
1205 */
1206 mutex_enter(&wl->wl_mtx);
1207 lockcount = wl->wl_lock_count;
1208 doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) >
1209 wl->wl_bufbytes_max / 2) ||
1210 ((wl->wl_bufcount + (lockcount * 10)) >
1211 wl->wl_bufcount_max / 2) ||
1212 (wapbl_transaction_len(wl) > wl->wl_circ_size / 2) ||
1213 (wl->wl_dealloccnt >= (wl->wl_dealloclim / 2));
1214 mutex_exit(&wl->wl_mtx);
1215
1216 if (doflush) {
1217 WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1218 ("force flush lockcnt=%d bufbytes=%zu "
1219 "(max=%zu) bufcount=%zu (max=%zu) "
1220 "dealloccnt %d (lim=%d)\n",
1221 lockcount, wl->wl_bufbytes,
1222 wl->wl_bufbytes_max, wl->wl_bufcount,
1223 wl->wl_bufcount_max,
1224 wl->wl_dealloccnt, wl->wl_dealloclim));
1225 }
1226
1227 if (doflush) {
1228 int error = wapbl_flush(wl, 0);
1229 if (error)
1230 return error;
1231 }
1232
1233 rw_enter(&wl->wl_rwlock, RW_READER);
1234 mutex_enter(&wl->wl_mtx);
1235 wl->wl_lock_count++;
1236 mutex_exit(&wl->wl_mtx);
1237
1238 #if defined(WAPBL_DEBUG_PRINT)
1239 WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
1240 ("wapbl_begin thread %d.%d with bufcount=%zu "
1241 "bufbytes=%zu bcount=%zu at %s:%d\n",
1242 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
1243 wl->wl_bufbytes, wl->wl_bcount, file, line));
1244 #endif
1245
1246 return 0;
1247 }
1248
1249 void
1250 wapbl_end(struct wapbl *wl)
1251 {
1252
1253 #if defined(WAPBL_DEBUG_PRINT)
1254 WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
1255 ("wapbl_end thread %d.%d with bufcount=%zu "
1256 "bufbytes=%zu bcount=%zu\n",
1257 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
1258 wl->wl_bufbytes, wl->wl_bcount));
1259 #endif
1260
1261 /*
1262 * XXX this could be handled more gracefully, perhaps place
1263 * only a partial transaction in the log and allow the
1264 * remaining to flush without the protection of the journal.
1265 */
1266 KASSERTMSG((wapbl_transaction_len(wl) <=
1267 (wl->wl_circ_size - wl->wl_reserved_bytes)),
1268 "wapbl_end: current transaction too big to flush");
1269
1270 mutex_enter(&wl->wl_mtx);
1271 KASSERT(wl->wl_lock_count > 0);
1272 wl->wl_lock_count--;
1273 mutex_exit(&wl->wl_mtx);
1274
1275 rw_exit(&wl->wl_rwlock);
1276 }
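/*
 * Example caller (a minimal sketch, not part of this file): a metadata
 * operation bracketed by a journal transaction.  The mnt_wapbl member
 * and the surrounding usage are assumptions about how a file system
 * typically stores and drives its struct wapbl pointer.
 */
#if 0
static int
example_journaled_op(struct mount *mp)
{
	struct wapbl *wl = mp->mnt_wapbl;
	int error;

	error = wapbl_begin(wl, __FILE__, __LINE__);
	if (error)
		return error;
	/*
	 * ... dirty metadata buffers here; each such buffer joins the
	 * in-memory transaction via wapbl_add_buf() ...
	 */
	wapbl_end(wl);
	return 0;
}
#endif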
1277
1278 void
1279 wapbl_add_buf(struct wapbl *wl, struct buf * bp)
1280 {
1281
1282 KASSERT(bp->b_cflags & BC_BUSY);
1283 KASSERT(bp->b_vp);
1284
1285 wapbl_jlock_assert(wl);
1286
1287 #if 0
1288 /*
1289 * XXX this might be an issue for swapfiles.
1290 * see uvm_swap.c:1702
1291 *
1292 * XXX2 why require it then? leap of semantics?
1293 */
1294 KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
1295 #endif
1296
1297 mutex_enter(&wl->wl_mtx);
1298 if (bp->b_flags & B_LOCKED) {
1299 TAILQ_REMOVE(&wl->wl_bufs, bp, b_wapbllist);
1300 WAPBL_PRINTF(WAPBL_PRINT_BUFFER2,
1301 ("wapbl_add_buf thread %d.%d re-adding buf %p "
1302 "with %d bytes %d bcount\n",
1303 curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
1304 bp->b_bcount));
1305 } else {
1306 /* unlocked by dirty buffers shouldn't exist */
1307 KASSERT(!(bp->b_oflags & BO_DELWRI));
1308 wl->wl_bufbytes += bp->b_bufsize;
1309 wl->wl_bcount += bp->b_bcount;
1310 wl->wl_bufcount++;
1311 WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
1312 ("wapbl_add_buf thread %d.%d adding buf %p "
1313 "with %d bytes %d bcount\n",
1314 curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
1315 bp->b_bcount));
1316 }
1317 TAILQ_INSERT_TAIL(&wl->wl_bufs, bp, b_wapbllist);
1318 mutex_exit(&wl->wl_mtx);
1319
1320 bp->b_flags |= B_LOCKED;
1321 }
1322
1323 static void
1324 wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp)
1325 {
1326
1327 KASSERT(mutex_owned(&wl->wl_mtx));
1328 KASSERT(bp->b_cflags & BC_BUSY);
1329 wapbl_jlock_assert(wl);
1330
1331 #if 0
1332 /*
1333 * XXX this might be an issue for swapfiles.
1334 * see uvm_swap.c:1725
1335 *
1336 * XXXdeux: see above
1337 */
1338 KASSERT((bp->b_flags & BC_NOCACHE) == 0);
1339 #endif
1340 KASSERT(bp->b_flags & B_LOCKED);
1341
1342 WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
1343 ("wapbl_remove_buf thread %d.%d removing buf %p with "
1344 "%d bytes %d bcount\n",
1345 curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount));
1346
1347 KASSERT(wl->wl_bufbytes >= bp->b_bufsize);
1348 wl->wl_bufbytes -= bp->b_bufsize;
1349 KASSERT(wl->wl_bcount >= bp->b_bcount);
1350 wl->wl_bcount -= bp->b_bcount;
1351 KASSERT(wl->wl_bufcount > 0);
1352 wl->wl_bufcount--;
1353 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
1354 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
1355 TAILQ_REMOVE(&wl->wl_bufs, bp, b_wapbllist);
1356
1357 bp->b_flags &= ~B_LOCKED;
1358 }
1359
1360 /* called from brelsel() in vfs_bio among other places */
1361 void
1362 wapbl_remove_buf(struct wapbl * wl, struct buf *bp)
1363 {
1364
1365 mutex_enter(&wl->wl_mtx);
1366 wapbl_remove_buf_locked(wl, bp);
1367 mutex_exit(&wl->wl_mtx);
1368 }
1369
1370 void
1371 wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt)
1372 {
1373
1374 KASSERT(bp->b_cflags & BC_BUSY);
1375
1376 /*
1377 * XXX: why does this depend on B_LOCKED? otherwise the buf
1378 * is not for a transaction? if so, why is this called in the
1379 * first place?
1380 */
1381 if (bp->b_flags & B_LOCKED) {
1382 mutex_enter(&wl->wl_mtx);
1383 wl->wl_bufbytes += bp->b_bufsize - oldsz;
1384 wl->wl_bcount += bp->b_bcount - oldcnt;
1385 mutex_exit(&wl->wl_mtx);
1386 }
1387 }
1388
1389 #endif /* _KERNEL */
1390
1391 /****************************************************************/
1392 /* Some utility inlines */
1393
1394 /*
1395 * wapbl_space_used(avail, head, tail)
1396 *
1397 * Number of bytes used in a circular queue of avail total bytes,
1398 * from tail to head.
1399 */
1400 static inline size_t
1401 wapbl_space_used(size_t avail, off_t head, off_t tail)
1402 {
1403
1404 if (tail == 0) {
1405 KASSERT(head == 0);
1406 return 0;
1407 }
1408 return ((head + (avail - 1) - tail) % avail) + 1;
1409 }
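/*
 * Worked example (illustrative numbers): with avail = 8192 and offsets
 * wrapping within [1024, 9216) (i.e. wl_circ_off = 1024), tail = 8704
 * and head = 1536 give ((1536 + 8191 - 8704) % 8192) + 1 = 1024 bytes
 * used: 512 before the wrap point and 512 after it.  The tail == 0
 * case is the empty-log encoding, while head == tail != 0 yields
 * exactly avail, the full-log encoding.
 */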
1410
1411 #ifdef _KERNEL
1412 /*
1413 * wapbl_advance(size, off, oldoff, delta)
1414 *
1415 * Given a byte offset oldoff into a circular queue of size bytes
1416 * starting at off, return a new byte offset oldoff + delta into
1417 * the circular queue.
1418 */
1419 static inline off_t
1420 wapbl_advance(size_t size, size_t off, off_t oldoff, size_t delta)
1421 {
1422 off_t newoff;
1423
1424 /* Define acceptable ranges for inputs. */
1425 KASSERT(delta <= (size_t)size);
1426 KASSERT((oldoff == 0) || ((size_t)oldoff >= off));
1427 KASSERT(oldoff < (off_t)(size + off));
1428
1429 if ((oldoff == 0) && (delta != 0))
1430 newoff = off + delta;
1431 else if ((oldoff + delta) < (size + off))
1432 newoff = oldoff + delta;
1433 else
1434 newoff = (oldoff + delta) - size;
1435
1436 /* Note some interesting axioms */
1437 KASSERT((delta != 0) || (newoff == oldoff));
1438 KASSERT((delta == 0) || (newoff != 0));
1439 KASSERT((delta != (size)) || (newoff == oldoff));
1440
1441 /* Define acceptable ranges for output. */
1442 KASSERT((newoff == 0) || ((size_t)newoff >= off));
1443 KASSERT((size_t)newoff < (size + off));
1444 return newoff;
1445 }
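/*
 * Worked examples (same illustrative geometry, size = 8192 and
 * off = 1024): advancing oldoff = 8704 by delta = 1024 would overrun
 * the region end (9728 >= 9216) and so wraps to 9728 - 8192 = 1536;
 * advancing the empty encoding oldoff = 0 by a nonzero delta starts
 * counting at off, yielding 1024 + delta.
 */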
1446
1447 /*
1448 * wapbl_space_free(avail, head, tail)
1449 *
1450 * Number of bytes free in a circular queue of avail total bytes,
1451 * in which everything from tail to head is used.
1452 */
1453 static inline size_t
1454 wapbl_space_free(size_t avail, off_t head, off_t tail)
1455 {
1456
1457 return avail - wapbl_space_used(avail, head, tail);
1458 }
1459
1460 /*
1461 * wapbl_advance_head(size, off, delta, headp, tailp)
1462 *
1463 * In a circular queue of size bytes starting at off, given the
1464 * old head and tail offsets *headp and *tailp, store the new head
1465 * and tail offsets in *headp and *tailp resulting from adding
1466 * delta bytes of data to the head.
1467 */
1468 static inline void
1469 wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp,
1470 off_t *tailp)
1471 {
1472 off_t head = *headp;
1473 off_t tail = *tailp;
1474
1475 KASSERT(delta <= wapbl_space_free(size, head, tail));
1476 head = wapbl_advance(size, off, head, delta);
1477 if ((tail == 0) && (head != 0))
1478 tail = off;
1479 *headp = head;
1480 *tailp = tail;
1481 }
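/*
 * Worked example (illustrative): starting from the empty encoding
 * head = tail = 0 with size = 8192 and off = 1024, adding delta = 512
 * moves head to 1536 and, because the queue was empty, pins tail to
 * off = 1024, so exactly 512 bytes are now accounted as used.
 */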
1482
1483 /*
1484 * wapbl_advance_tail(size, off, delta, headp, tailp)
1485 *
1486 * In a circular queue of size bytes starting at off, given the
1487 * old head and tail offsets *headp and *tailp, store the new head
1488 * and tail offsets in *headp and *tailp resulting from removing
1489 * delta bytes of data from the tail.
1490 */
1491 static inline void
1492 wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp,
1493 off_t *tailp)
1494 {
1495 off_t head = *headp;
1496 off_t tail = *tailp;
1497
1498 KASSERT(delta <= wapbl_space_used(size, head, tail));
1499 tail = wapbl_advance(size, off, tail, delta);
1500 if (head == tail) {
1501 head = tail = 0;
1502 }
1503 *headp = head;
1504 *tailp = tail;
1505 }
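/*
 * Worked example (illustrative): with head = 1536 and tail = 1024 in
 * the same geometry, removing delta = 512 advances tail to 1536; head
 * and tail now coincide, so both are reset to 0 (the empty encoding)
 * rather than being left equal, which would read as a full log.
 */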
1506
1507
1508 /****************************************************************/
1509
1510 /*
1511 * wapbl_truncate(wl, minfree)
1512 *
1513 * Wait until at least minfree bytes are available in the log.
1514 *
1515 * If it was necessary to wait for writes to complete,
1516 * advance the circular queue tail to reflect the new write
1517 * completions and issue a write commit to the log.
1518 *
1519 * => Caller must hold wl->wl_rwlock writer lock.
1520 */
1521 static int
1522 wapbl_truncate(struct wapbl *wl, size_t minfree)
1523 {
1524 size_t delta;
1525 size_t avail;
1526 off_t head;
1527 off_t tail;
1528 int error = 0;
1529
1530 KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes));
1531 KASSERT(rw_write_held(&wl->wl_rwlock));
1532
1533 mutex_enter(&wl->wl_mtx);
1534
1535 /*
1536 * First check to see if we have to do a commit
1537 * at all.
1538 */
1539 avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail);
1540 if (minfree < avail) {
1541 mutex_exit(&wl->wl_mtx);
1542 return 0;
1543 }
1544 minfree -= avail;
1545 while ((wl->wl_error_count == 0) &&
1546 (wl->wl_reclaimable_bytes < minfree)) {
1547 WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
1548 ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd "
1549 "minfree=%zd\n",
1550 &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes,
1551 minfree));
1552
1553 cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx);
1554 }
1555 if (wl->wl_reclaimable_bytes < minfree) {
1556 KASSERT(wl->wl_error_count);
1557 /* XXX maybe get actual error from buffer instead someday? */
1558 error = EIO;
1559 }
1560 head = wl->wl_head;
1561 tail = wl->wl_tail;
1562 delta = wl->wl_reclaimable_bytes;
1563
1564 /* If all of the entries are flushed, then be sure to keep
1565 * the reserved bytes reserved. Watch out for discarded transactions,
1566 * which could leave more bytes reserved than are reclaimable.
1567 */
1568 if (SIMPLEQ_EMPTY(&wl->wl_entries) &&
1569 (delta >= wl->wl_reserved_bytes)) {
1570 delta -= wl->wl_reserved_bytes;
1571 }
1572 wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head,
1573 &tail);
1574 KDASSERT(wl->wl_reserved_bytes <=
1575 wapbl_space_used(wl->wl_circ_size, head, tail));
1576 mutex_exit(&wl->wl_mtx);
1577
1578 if (error)
1579 return error;
1580
1581 /*
1582 * This is where head, tail and delta are unprotected
1583 * from races against itself or flush. This is ok since
1584 * we only call this routine from inside flush itself.
1585 *
1586 * XXX: how can it race against itself when accessed only
1587 * from behind the write-locked rwlock?
1588 */
1589 error = wapbl_write_commit(wl, head, tail);
1590 if (error)
1591 return error;
1592
1593 wl->wl_head = head;
1594 wl->wl_tail = tail;
1595
1596 mutex_enter(&wl->wl_mtx);
1597 KASSERT(wl->wl_reclaimable_bytes >= delta);
1598 wl->wl_reclaimable_bytes -= delta;
1599 mutex_exit(&wl->wl_mtx);
1600 WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
1601 ("wapbl_truncate thread %d.%d truncating %zu bytes\n",
1602 curproc->p_pid, curlwp->l_lid, delta));
1603
1604 return 0;
1605 }
1606
1607 /****************************************************************/
1608
1609 void
1610 wapbl_biodone(struct buf *bp)
1611 {
1612 struct wapbl_entry *we = bp->b_private;
1613 struct wapbl *wl = we->we_wapbl;
1614 #ifdef WAPBL_DEBUG_BUFBYTES
1615 const int bufsize = bp->b_bufsize;
1616 #endif
1617
1618 /*
1619 * Handle possible flushing of buffers after the log has been
1620 * decommissioned.
1621 */
1622 if (!wl) {
1623 KASSERT(we->we_bufcount > 0);
1624 we->we_bufcount--;
1625 #ifdef WAPBL_DEBUG_BUFBYTES
1626 KASSERT(we->we_unsynced_bufbytes >= bufsize);
1627 we->we_unsynced_bufbytes -= bufsize;
1628 #endif
1629
1630 if (we->we_bufcount == 0) {
1631 #ifdef WAPBL_DEBUG_BUFBYTES
1632 KASSERT(we->we_unsynced_bufbytes == 0);
1633 #endif
1634 pool_put(&wapbl_entry_pool, we);
1635 }
1636
1637 brelse(bp, 0);
1638 return;
1639 }
1640
1641 #ifdef ohbother
1642 KDASSERT(bp->b_oflags & BO_DONE);
1643 KDASSERT(!(bp->b_oflags & BO_DELWRI));
1644 KDASSERT(bp->b_flags & B_ASYNC);
1645 KDASSERT(bp->b_cflags & BC_BUSY);
1646 KDASSERT(!(bp->b_flags & B_LOCKED));
1647 KDASSERT(!(bp->b_flags & B_READ));
1648 KDASSERT(!(bp->b_cflags & BC_INVAL));
1649 KDASSERT(!(bp->b_cflags & BC_NOCACHE));
1650 #endif
1651
1652 if (bp->b_error) {
1653 /*
1654 * If an error occurs, it would be nice to leave the buffer
1655 * as a delayed write on the LRU queue so that we can retry
1656 * it later. But buffercache(9) can't handle dirty buffer
1657 * reuse, so just mark the log permanently errored out.
1658 */
1659 mutex_enter(&wl->wl_mtx);
1660 if (wl->wl_error_count == 0) {
1661 wl->wl_error_count++;
1662 cv_broadcast(&wl->wl_reclaimable_cv);
1663 }
1664 mutex_exit(&wl->wl_mtx);
1665 }
1666
1667 /*
1668 * Make sure that the buf doesn't retain the media flags, so that
1669 * e.g. wapbl_allow_dpofua has immediate effect on any following I/O.
1670 * The flags will be set again if needed by another I/O.
1671 */
1672 bp->b_flags &= ~B_MEDIA_FLAGS;
1673
1674 /*
1675 * Release the buffer here. wapbl_flush() may wait for the
1676 * log to become empty and we better unbusy the buffer before
1677 * wapbl_flush() returns.
1678 */
1679 brelse(bp, 0);
1680
1681 mutex_enter(&wl->wl_mtx);
1682
1683 KASSERT(we->we_bufcount > 0);
1684 we->we_bufcount--;
1685 #ifdef WAPBL_DEBUG_BUFBYTES
1686 KASSERT(we->we_unsynced_bufbytes >= bufsize);
1687 we->we_unsynced_bufbytes -= bufsize;
1688 KASSERT(wl->wl_unsynced_bufbytes >= bufsize);
1689 wl->wl_unsynced_bufbytes -= bufsize;
1690 #endif
1691 wl->wl_ev_metawrite.ev_count++;
1692
1693 /*
1694 * If the current transaction can be reclaimed, start
1695 * at the beginning and reclaim any consecutive reclaimable
1696 * transactions. If we successfully reclaim anything,
1697 * then wakeup anyone waiting for the reclaim.
1698 */
1699 if (we->we_bufcount == 0) {
1700 size_t delta = 0;
1701 int errcnt = 0;
1702 #ifdef WAPBL_DEBUG_BUFBYTES
1703 KDASSERT(we->we_unsynced_bufbytes == 0);
1704 #endif
1705 /*
1706 * clear any posted error, since the buffer it came from
1707 * has successfully flushed by now
1708 */
1709 while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) &&
1710 (we->we_bufcount == 0)) {
1711 delta += we->we_reclaimable_bytes;
1712 if (we->we_error)
1713 errcnt++;
1714 SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
1715 pool_put(&wapbl_entry_pool, we);
1716 }
1717
1718 if (delta) {
1719 wl->wl_reclaimable_bytes += delta;
1720 KASSERT(wl->wl_error_count >= errcnt);
1721 wl->wl_error_count -= errcnt;
1722 cv_broadcast(&wl->wl_reclaimable_cv);
1723 }
1724 }
1725
1726 mutex_exit(&wl->wl_mtx);
1727 }
1728
1729 /*
1730 * wapbl_flush(wl, wait)
1731 *
1732 * Flush pending block writes, deallocations, and inodes from
1733 * the current transaction in memory to the log on disk:
1734 *
1735 * 1. Call the file system's wl_flush callback to flush any
1736 * per-file-system pending updates.
1737 * 2. Wait for enough space in the log for the current transaction.
1738 * 3. Synchronously write the new log records, advancing the
1739 * circular queue head.
1740 * 4. Issue the pending block writes asynchronously, now that they
1741 * are recorded in the log and can be replayed after crash.
1742 * 5. If wait is true, wait for all writes to complete and for the
1743 * log to become empty.
1744 *
1745 * On failure, call the file system's wl_flush_abort callback.
1746 */
1747 int
1748 wapbl_flush(struct wapbl *wl, int waitfor)
1749 {
1750 struct buf *bp;
1751 struct wapbl_entry *we;
1752 off_t off;
1753 off_t head;
1754 off_t tail;
1755 size_t delta = 0;
1756 size_t flushsize;
1757 size_t reserved;
1758 int error = 0;
1759
1760 /*
1761 * Do a quick check to see if a full flush can be skipped.
1762 * This assumes that the flush callback does not need to be called
1763 * unless there are other outstanding bufs.
1764 */
1765 if (!waitfor) {
1766 size_t nbufs;
1767 mutex_enter(&wl->wl_mtx); /* XXX need mutex here to
1768 protect the KASSERTS */
1769 nbufs = wl->wl_bufcount;
1770 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
1771 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
1772 mutex_exit(&wl->wl_mtx);
1773 if (nbufs == 0)
1774 return 0;
1775 }
1776
1777 /*
1778 * XXX we may consider using LK_UPGRADE here
1779 * if we want to call flush from inside a transaction
1780 */
1781 rw_enter(&wl->wl_rwlock, RW_WRITER);
1782 wl->wl_flush(wl->wl_mount, TAILQ_FIRST(&wl->wl_dealloclist));
1783
1784 /*
1785 * Now that we are exclusively locked and the file system has
1786 * issued any deferred block writes for this transaction, check
1787 * whether there are any blocks to write to the log. If not,
1788 * skip waiting for space or writing any log entries.
1789 *
1790 * XXX Shouldn't this also check wl_dealloccnt and
1791 * wl_inohashcnt? Perhaps wl_dealloccnt doesn't matter if the
1792 * file system didn't produce any blocks as a consequence of
1793 * it, but the same does not seem to be so of wl_inohashcnt.
1794 */
1795 if (wl->wl_bufcount == 0) {
1796 goto wait_out;
1797 }
1798
1799 #if 0
1800 WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1801 ("wapbl_flush thread %d.%d flushing entries with "
1802 "bufcount=%zu bufbytes=%zu\n",
1803 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
1804 wl->wl_bufbytes));
1805 #endif
1806
1807 /* Calculate amount of space needed to flush */
1808 flushsize = wapbl_transaction_len(wl);
1809 if (wapbl_verbose_commit) {
1810 struct timespec ts;
1811 getnanotime(&ts);
1812 printf("%s: %lld.%09ld this transaction = %zu bytes\n",
1813 __func__, (long long)ts.tv_sec,
1814 (long)ts.tv_nsec, flushsize);
1815 }
1816
1817 if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
1818 /*
1819 * XXX this could be handled more gracefully, perhaps place
1820 * only a partial transaction in the log and allow the
1821 * remaining to flush without the protection of the journal.
1822 */
1823 panic("wapbl_flush: current transaction too big to flush");
1824 }
1825
1826 error = wapbl_truncate(wl, flushsize);
1827 if (error)
1828 goto out;
1829
1830 off = wl->wl_head;
1831 KASSERT((off == 0) || (off >= wl->wl_circ_off));
1832 KASSERT((off == 0) || (off < wl->wl_circ_off + wl->wl_circ_size));
1833 error = wapbl_write_blocks(wl, &off);
1834 if (error)
1835 goto out;
1836 error = wapbl_write_revocations(wl, &off);
1837 if (error)
1838 goto out;
1839 error = wapbl_write_inodes(wl, &off);
1840 if (error)
1841 goto out;
1842
1843 reserved = 0;
1844 if (wl->wl_inohashcnt)
1845 reserved = wapbl_transaction_inodes_len(wl);
1846
1847 head = wl->wl_head;
1848 tail = wl->wl_tail;
1849
1850 wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize,
1851 &head, &tail);
1852
1853 KASSERTMSG(head == off,
1854 "lost head! head=%"PRIdMAX" tail=%" PRIdMAX
1855 " off=%"PRIdMAX" flush=%zu",
1856 (intmax_t)head, (intmax_t)tail, (intmax_t)off,
1857 flushsize);
1858
1859 /* Opportunistically move the tail forward if we can */
1860 mutex_enter(&wl->wl_mtx);
1861 delta = wl->wl_reclaimable_bytes;
1862 mutex_exit(&wl->wl_mtx);
1863 wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta,
1864 &head, &tail);
1865
1866 error = wapbl_write_commit(wl, head, tail);
1867 if (error)
1868 goto out;
1869
1870 we = pool_get(&wapbl_entry_pool, PR_WAITOK);
1871
1872 #ifdef WAPBL_DEBUG_BUFBYTES
1873 WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1874 ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
1875 " unsynced=%zu"
1876 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
1877 "inodes=%d\n",
1878 curproc->p_pid, curlwp->l_lid, flushsize, delta,
1879 wapbl_space_used(wl->wl_circ_size, head, tail),
1880 wl->wl_unsynced_bufbytes, wl->wl_bufcount,
1881 wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt,
1882 wl->wl_inohashcnt));
1883 #else
1884 WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1885 ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
1886 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
1887 "inodes=%d\n",
1888 curproc->p_pid, curlwp->l_lid, flushsize, delta,
1889 wapbl_space_used(wl->wl_circ_size, head, tail),
1890 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
1891 wl->wl_dealloccnt, wl->wl_inohashcnt));
1892 #endif
1893
1894
1895 mutex_enter(&bufcache_lock);
1896 mutex_enter(&wl->wl_mtx);
1897
1898 wl->wl_reserved_bytes = reserved;
1899 wl->wl_head = head;
1900 wl->wl_tail = tail;
1901 KASSERT(wl->wl_reclaimable_bytes >= delta);
1902 wl->wl_reclaimable_bytes -= delta;
1903 KDASSERT(wl->wl_dealloccnt == 0);
1904 #ifdef WAPBL_DEBUG_BUFBYTES
1905 wl->wl_unsynced_bufbytes += wl->wl_bufbytes;
1906 #endif
1907
1908 we->we_wapbl = wl;
1909 we->we_bufcount = wl->wl_bufcount;
1910 #ifdef WAPBL_DEBUG_BUFBYTES
1911 we->we_unsynced_bufbytes = wl->wl_bufbytes;
1912 #endif
1913 we->we_reclaimable_bytes = flushsize;
1914 we->we_error = 0;
1915 SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries);
1916
	/*
	 * This flushes bufs in the order in which they were queued, so
	 * the LRU order is preserved.
	 */
1921 while ((bp = TAILQ_FIRST(&wl->wl_bufs)) != NULL) {
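		/*
		 * bbusy may release the locks while waiting for the
		 * buffer; if it fails, the queue may have changed, so
		 * retry from the front.
		 */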
1922 if (bbusy(bp, 0, 0, &wl->wl_mtx)) {
1923 continue;
1924 }
1925 bp->b_iodone = wapbl_biodone;
1926 bp->b_private = we;
1927
		/* Make sure the block is written synchronously when FUA is in use. */
1929 bp->b_flags |= WAPBL_MFLAGS(wl);
1930
1931 bremfree(bp);
1932 wapbl_remove_buf_locked(wl, bp);
1933 mutex_exit(&wl->wl_mtx);
1934 mutex_exit(&bufcache_lock);
1935 bawrite(bp);
1936 mutex_enter(&bufcache_lock);
1937 mutex_enter(&wl->wl_mtx);
1938 }
1939 mutex_exit(&wl->wl_mtx);
1940 mutex_exit(&bufcache_lock);
1941
1942 #if 0
1943 WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1944 ("wapbl_flush thread %d.%d done flushing entries...\n",
1945 curproc->p_pid, curlwp->l_lid));
1946 #endif
1947
1948 wait_out:
1949
1950 /*
1951 * If the waitfor flag is set, don't return until everything is
1952 * fully flushed and the on disk log is empty.
1953 */
1954 if (waitfor) {
1955 error = wapbl_truncate(wl, wl->wl_circ_size -
1956 wl->wl_reserved_bytes);
1957 }
1958
1959 out:
1960 if (error) {
1961 wl->wl_flush_abort(wl->wl_mount,
1962 TAILQ_FIRST(&wl->wl_dealloclist));
1963 }
1964
1965 #ifdef WAPBL_DEBUG_PRINT
1966 if (error) {
1967 pid_t pid = -1;
1968 lwpid_t lid = -1;
1969 if (curproc)
1970 pid = curproc->p_pid;
1971 if (curlwp)
1972 lid = curlwp->l_lid;
1973 mutex_enter(&wl->wl_mtx);
1974 #ifdef WAPBL_DEBUG_BUFBYTES
1975 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1976 ("wapbl_flush: thread %d.%d aborted flush: "
1977 "error = %d\n"
1978 "\tbufcount=%zu bufbytes=%zu bcount=%zu "
1979 "deallocs=%d inodes=%d\n"
1980 "\terrcnt = %d, reclaimable=%zu reserved=%zu "
1981 "unsynced=%zu\n",
1982 pid, lid, error, wl->wl_bufcount,
1983 wl->wl_bufbytes, wl->wl_bcount,
1984 wl->wl_dealloccnt, wl->wl_inohashcnt,
1985 wl->wl_error_count, wl->wl_reclaimable_bytes,
1986 wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes));
1987 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
1988 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1989 ("\tentry: bufcount = %zu, reclaimable = %zu, "
1990 "error = %d, unsynced = %zu\n",
1991 we->we_bufcount, we->we_reclaimable_bytes,
1992 we->we_error, we->we_unsynced_bufbytes));
1993 }
1994 #else
1995 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1996 ("wapbl_flush: thread %d.%d aborted flush: "
1997 "error = %d\n"
1998 "\tbufcount=%zu bufbytes=%zu bcount=%zu "
1999 "deallocs=%d inodes=%d\n"
2000 "\terrcnt = %d, reclaimable=%zu reserved=%zu\n",
2001 pid, lid, error, wl->wl_bufcount,
2002 wl->wl_bufbytes, wl->wl_bcount,
2003 wl->wl_dealloccnt, wl->wl_inohashcnt,
2004 wl->wl_error_count, wl->wl_reclaimable_bytes,
2005 wl->wl_reserved_bytes));
2006 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
2007 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
2008 ("\tentry: bufcount = %zu, reclaimable = %zu, "
2009 "error = %d\n", we->we_bufcount,
2010 we->we_reclaimable_bytes, we->we_error));
2011 }
2012 #endif
2013 mutex_exit(&wl->wl_mtx);
2014 }
2015 #endif
2016
2017 rw_exit(&wl->wl_rwlock);
2018 return error;
2019 }
2020
2021 /****************************************************************/
2022
2023 void
2024 wapbl_jlock_assert(struct wapbl *wl)
2025 {
2026
2027 KASSERT(rw_lock_held(&wl->wl_rwlock));
2028 }
2029
2030 void
2031 wapbl_junlock_assert(struct wapbl *wl)
2032 {
2033
2034 KASSERT(!rw_write_held(&wl->wl_rwlock));
2035 }
2036
2037 /****************************************************************/
2038
2039 /* locks missing */
2040 void
2041 wapbl_print(struct wapbl *wl,
2042 int full,
2043 void (*pr)(const char *, ...))
2044 {
2045 struct buf *bp;
2046 struct wapbl_entry *we;
2047 (*pr)("wapbl %p", wl);
2048 (*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n",
2049 wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn);
2050 (*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n",
2051 wl->wl_circ_size, wl->wl_circ_off,
2052 (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail);
	(*pr)("fs_dev_bshift = %d, log_dev_bshift = %d\n",
	    wl->wl_fs_dev_bshift, wl->wl_log_dev_bshift);
2055 #ifdef WAPBL_DEBUG_BUFBYTES
2056 (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
2057 "reserved = %zu errcnt = %d unsynced = %zu\n",
2058 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
2059 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
2060 wl->wl_error_count, wl->wl_unsynced_bufbytes);
2061 #else
2062 (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
2063 "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes,
2064 wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
2065 wl->wl_error_count);
2066 #endif
2067 (*pr)("\tdealloccnt = %d, dealloclim = %d\n",
2068 wl->wl_dealloccnt, wl->wl_dealloclim);
2069 (*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n",
2070 wl->wl_inohashcnt, wl->wl_inohashmask);
2071 (*pr)("entries:\n");
2072 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
2073 #ifdef WAPBL_DEBUG_BUFBYTES
2074 (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, "
2075 "unsynced = %zu\n",
2076 we->we_bufcount, we->we_reclaimable_bytes,
2077 we->we_error, we->we_unsynced_bufbytes);
2078 #else
2079 (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n",
2080 we->we_bufcount, we->we_reclaimable_bytes, we->we_error);
2081 #endif
2082 }
2083 if (full) {
2084 int cnt = 0;
2085 (*pr)("bufs =");
2086 TAILQ_FOREACH(bp, &wl->wl_bufs, b_wapbllist) {
2087 if (!TAILQ_NEXT(bp, b_wapbllist)) {
2088 (*pr)(" %p", bp);
2089 } else if ((++cnt % 6) == 0) {
2090 (*pr)(" %p,\n\t", bp);
2091 } else {
2092 (*pr)(" %p,", bp);
2093 }
2094 }
2095 (*pr)("\n");
2096
2097 (*pr)("dealloced blks = ");
2098 {
2099 struct wapbl_dealloc *wd;
2100 cnt = 0;
2101 TAILQ_FOREACH(wd, &wl->wl_dealloclist, wd_entries) {
2102 (*pr)(" %"PRId64":%d,",
2103 wd->wd_blkno,
2104 wd->wd_len);
2105 if ((++cnt % 4) == 0) {
2106 (*pr)("\n\t");
2107 }
2108 }
2109 }
2110 (*pr)("\n");
2111
2112 (*pr)("registered inodes = ");
2113 {
2114 int i;
2115 cnt = 0;
2116 for (i = 0; i <= wl->wl_inohashmask; i++) {
2117 struct wapbl_ino_head *wih;
2118 struct wapbl_ino *wi;
2119
2120 wih = &wl->wl_inohash[i];
2121 LIST_FOREACH(wi, wih, wi_hash) {
2122 if (wi->wi_ino == 0)
2123 continue;
2124 (*pr)(" %"PRIu64"/0%06"PRIo32",",
2125 wi->wi_ino, wi->wi_mode);
2126 if ((++cnt % 4) == 0) {
2127 (*pr)("\n\t");
2128 }
2129 }
2130 }
2131 (*pr)("\n");
2132 }
2133
2134 (*pr)("iobufs free =");
2135 TAILQ_FOREACH(bp, &wl->wl_iobufs, b_wapbllist) {
2136 if (!TAILQ_NEXT(bp, b_wapbllist)) {
2137 (*pr)(" %p", bp);
2138 } else if ((++cnt % 6) == 0) {
2139 (*pr)(" %p,\n\t", bp);
2140 } else {
2141 (*pr)(" %p,", bp);
2142 }
2143 }
2144 (*pr)("\n");
2145
2146 (*pr)("iobufs busy =");
2147 TAILQ_FOREACH(bp, &wl->wl_iobufs_busy, b_wapbllist) {
2148 if (!TAILQ_NEXT(bp, b_wapbllist)) {
2149 (*pr)(" %p", bp);
2150 } else if ((++cnt % 6) == 0) {
2151 (*pr)(" %p,\n\t", bp);
2152 } else {
2153 (*pr)(" %p,", bp);
2154 }
2155 }
2156 (*pr)("\n");
2157 }
2158 }
2159
2160 #if defined(WAPBL_DEBUG) || defined(DDB)
2161 void
2162 wapbl_dump(struct wapbl *wl)
2163 {
2164 #if defined(WAPBL_DEBUG)
2165 if (!wl)
2166 wl = wapbl_debug_wl;
2167 #endif
2168 if (!wl)
2169 return;
2170 wapbl_print(wl, 1, printf);
2171 }
2172 #endif
2173
2174 /****************************************************************/
2175
2176 int
2177 wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len, bool force,
2178 void **cookiep)
2179 {
2180 struct wapbl_dealloc *wd;
2181 int error = 0;
2182
2183 wapbl_jlock_assert(wl);
2184
2185 mutex_enter(&wl->wl_mtx);
2186
2187 if (__predict_false(wl->wl_dealloccnt >= wl->wl_dealloclim)) {
2188 if (!force) {
2189 error = EAGAIN;
2190 goto out;
2191 }
2192
		/*
		 * Forced registration can only be used when:
		 * 1) the caller can't cope with failure
		 * 2) the path can be triggered only a bounded, small
		 *    number of times per transaction
		 * If this does not hold and the path were triggered
		 * many times, the maximum transaction size could be
		 * overflowed, causing a panic later.
		 */
2202 printf("%s: forced dealloc registration over limit: %d >= %d\n",
2203 wl->wl_mount->mnt_stat.f_mntonname,
2204 wl->wl_dealloccnt, wl->wl_dealloclim);
2205 }
2206
2207 wl->wl_dealloccnt++;
2208 mutex_exit(&wl->wl_mtx);
2209
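	/* pool_get(PR_WAITOK) may sleep, so it is done without wl_mtx held. */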
2210 wd = pool_get(&wapbl_dealloc_pool, PR_WAITOK);
2211 wd->wd_blkno = blk;
2212 wd->wd_len = len;
2213
2214 mutex_enter(&wl->wl_mtx);
2215 TAILQ_INSERT_TAIL(&wl->wl_dealloclist, wd, wd_entries);
2216
2217 if (cookiep)
2218 *cookiep = wd;
2219
2220 out:
2221 mutex_exit(&wl->wl_mtx);
2222
2223 WAPBL_PRINTF(WAPBL_PRINT_ALLOC,
2224 ("wapbl_register_deallocation: blk=%"PRId64" len=%d error=%d\n",
2225 blk, len, error));
2226
2227 return error;
2228 }
2229
2230 static void
2231 wapbl_deallocation_free(struct wapbl *wl, struct wapbl_dealloc *wd,
2232 bool locked)
2233 {
2234 KASSERT(!locked
2235 || rw_lock_held(&wl->wl_rwlock) || mutex_owned(&wl->wl_mtx));
2236
2237 if (!locked)
2238 mutex_enter(&wl->wl_mtx);
2239
2240 TAILQ_REMOVE(&wl->wl_dealloclist, wd, wd_entries);
2241 wl->wl_dealloccnt--;
2242
2243 if (!locked)
2244 mutex_exit(&wl->wl_mtx);
2245
2246 pool_put(&wapbl_dealloc_pool, wd);
2247 }
2248
2249 void
2250 wapbl_unregister_deallocation(struct wapbl *wl, void *cookie)
2251 {
2252 KASSERT(cookie != NULL);
2253 wapbl_deallocation_free(wl, cookie, false);
2254 }
2255
2256 /****************************************************************/
2257
2258 static void
2259 wapbl_inodetrk_init(struct wapbl *wl, u_int size)
2260 {
2261
2262 wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask);
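	/* The first inode tracker to come along initializes the shared pool. */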
2263 if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) {
2264 pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0,
2265 "wapblinopl", &pool_allocator_nointr, IPL_NONE);
2266 }
2267 }
2268
2269 static void
2270 wapbl_inodetrk_free(struct wapbl *wl)
2271 {
2272
2273 /* XXX this KASSERT needs locking/mutex analysis */
2274 KASSERT(wl->wl_inohashcnt == 0);
2275 hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask);
2276 if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) {
2277 pool_destroy(&wapbl_ino_pool);
2278 }
2279 }
2280
2281 static struct wapbl_ino *
2282 wapbl_inodetrk_get(struct wapbl *wl, ino_t ino)
2283 {
2284 struct wapbl_ino_head *wih;
2285 struct wapbl_ino *wi;
2286
2287 KASSERT(mutex_owned(&wl->wl_mtx));
2288
2289 wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
2290 LIST_FOREACH(wi, wih, wi_hash) {
2291 if (ino == wi->wi_ino)
2292 return wi;
2293 }
2294 return 0;
2295 }
2296
2297 void
2298 wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode)
2299 {
2300 struct wapbl_ino_head *wih;
2301 struct wapbl_ino *wi;
2302
2303 wi = pool_get(&wapbl_ino_pool, PR_WAITOK);
2304
2305 mutex_enter(&wl->wl_mtx);
2306 if (wapbl_inodetrk_get(wl, ino) == NULL) {
2307 wi->wi_ino = ino;
2308 wi->wi_mode = mode;
2309 wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
2310 LIST_INSERT_HEAD(wih, wi, wi_hash);
2311 wl->wl_inohashcnt++;
2312 WAPBL_PRINTF(WAPBL_PRINT_INODE,
2313 ("wapbl_register_inode: ino=%"PRId64"\n", ino));
2314 mutex_exit(&wl->wl_mtx);
2315 } else {
2316 mutex_exit(&wl->wl_mtx);
2317 pool_put(&wapbl_ino_pool, wi);
2318 }
2319 }
2320
2321 void
2322 wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode)
2323 {
2324 struct wapbl_ino *wi;
2325
2326 mutex_enter(&wl->wl_mtx);
2327 wi = wapbl_inodetrk_get(wl, ino);
2328 if (wi) {
2329 WAPBL_PRINTF(WAPBL_PRINT_INODE,
2330 ("wapbl_unregister_inode: ino=%"PRId64"\n", ino));
2331 KASSERT(wl->wl_inohashcnt > 0);
2332 wl->wl_inohashcnt--;
2333 LIST_REMOVE(wi, wi_hash);
2334 mutex_exit(&wl->wl_mtx);
2335
2336 pool_put(&wapbl_ino_pool, wi);
2337 } else {
2338 mutex_exit(&wl->wl_mtx);
2339 }
2340 }
2341
2342 /****************************************************************/
2343
2344 /*
2345 * wapbl_transaction_inodes_len(wl)
2346 *
2347 * Calculate the number of bytes required for inode registration
2348 * log records in wl.
2349 */
2350 static inline size_t
2351 wapbl_transaction_inodes_len(struct wapbl *wl)
2352 {
2353 int blocklen = 1<<wl->wl_log_dev_bshift;
2354 int iph;
2355
	/* Calculate the number of inodes described in an inodelist header */
2357 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
2358 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
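	/*
	 * For example (the sizes here are illustrative, not taken from
	 * struct wapbl_wc_inodelist): with blocklen = 2048, a 24-byte
	 * header, and 16-byte entries, iph = (2048 - 24) / 16 = 126,
	 * so 200 registered inodes need howmany(200, 126) = 2 blocks.
	 */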
2359
2360 KASSERT(iph > 0);
2361
2362 return MAX(1, howmany(wl->wl_inohashcnt, iph)) * blocklen;
2363 }
2364
2365
2366 /*
2367 * wapbl_transaction_len(wl)
2368 *
2369 * Calculate number of bytes required for all log records in wl.
2370 */
2371 static size_t
2372 wapbl_transaction_len(struct wapbl *wl)
2373 {
2374 int blocklen = 1<<wl->wl_log_dev_bshift;
2375 size_t len;
2376
	/*
	 * Space for the buffer data itself, plus the blocklist headers
	 * needed to describe the buffers and the deallocations, plus
	 * the inode registration records.
	 */
2378 len = wl->wl_bcount;
2379 len += howmany(wl->wl_bufcount, wl->wl_brperjblock) * blocklen;
2380 len += howmany(wl->wl_dealloccnt, wl->wl_brperjblock) * blocklen;
2381 len += wapbl_transaction_inodes_len(wl);
2382
2383 return len;
2384 }
2385
2386 /*
2387 * wapbl_cache_sync(wl, msg)
2388 *
2389 * Issue DIOCCACHESYNC to wl->wl_devvp.
2390 *
2391 * If sysctl(vfs.wapbl.verbose_commit) >= 2, print a message
2392 * including msg about the duration of the cache sync.
2393 */
2394 static int
2395 wapbl_cache_sync(struct wapbl *wl, const char *msg)
2396 {
2397 const bool verbose = wapbl_verbose_commit >= 2;
2398 struct bintime start_time;
2399 int force = 1;
2400 int error;
2401
2402 /* Skip full cache sync if disabled, or when using FUA */
2403 if (!wapbl_flush_disk_cache || WAPBL_USE_FUA(wl)) {
2404 return 0;
2405 }
2406 if (verbose) {
2407 bintime(&start_time);
2408 }
2409 error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force,
2410 FWRITE, FSCRED);
2411 if (error) {
2412 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
2413 ("wapbl_cache_sync: DIOCCACHESYNC on dev 0x%jx "
2414 "returned %d\n", (uintmax_t)wl->wl_devvp->v_rdev, error));
2415 }
2416 if (verbose) {
2417 struct bintime d;
2418 struct timespec ts;
2419
2420 bintime(&d);
2421 bintime_sub(&d, &start_time);
2422 bintime2timespec(&d, &ts);
2423 printf("wapbl_cache_sync: %s: dev 0x%jx %ju.%09lu\n",
2424 msg, (uintmax_t)wl->wl_devvp->v_rdev,
2425 (uintmax_t)ts.tv_sec, ts.tv_nsec);
2426 }
2427
2428 wl->wl_ev_cacheflush.ev_count++;
2429
2430 return error;
2431 }
2432
2433 /*
2434 * wapbl_write_commit(wl, head, tail)
2435 *
2436 * Issue a disk cache sync to wait for all pending writes to the
2437 * log to complete, and then synchronously commit the current
2438 * circular queue head and tail to the log, in the next of two
2439 * locations for commit headers on disk.
2440 *
2441 * Increment the generation number. If the generation number
2442 * rolls over to zero, then a subsequent commit would appear to
2443 * have an older generation than this one -- in that case, issue a
2444 * duplicate commit to avoid this.
2445 *
2446 * => Caller must have exclusive access to wl, either by holding
2447 * wl->wl_rwlock for writer or by being wapbl_start before anyone
2448 * else has seen wl.
2449 */
2450 static int
2451 wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail)
2452 {
2453 struct wapbl_wc_header *wc = wl->wl_wc_header;
2454 struct timespec ts;
2455 int error;
2456 daddr_t pbn;
2457
2458 error = wapbl_buffered_flush(wl, true);
2459 if (error)
2460 return error;
	/*
	 * Flush the disk cache to ensure that the blocks we've written
	 * have actually reached stable storage before the commit header.
	 *
	 * XXX Ideally we would calculate a checksum here instead; we
	 * rely on the cache sync for now.
	 */
2467 wapbl_cache_sync(wl, "1");
2468
2469 wc->wc_head = head;
2470 wc->wc_tail = tail;
2471 wc->wc_checksum = 0;
2472 wc->wc_version = 1;
2473 getnanotime(&ts);
2474 wc->wc_time = ts.tv_sec;
2475 wc->wc_timensec = ts.tv_nsec;
2476
2477 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2478 ("wapbl_write_commit: head = %"PRIdMAX "tail = %"PRIdMAX"\n",
2479 (intmax_t)head, (intmax_t)tail));
2480
	/*
	 * Write the commit header.
	 *
	 * XXX If the generation number is about to roll over, first
	 * zero out the second commit header before trying to write
	 * both headers.
	 */
2487
2488 pbn = wl->wl_logpbn + (wc->wc_generation % 2);
2489 #ifdef _KERNEL
2490 pbn = btodb(pbn << wc->wc_log_dev_bshift);
2491 #endif
2492 error = wapbl_buffered_write(wc, wc->wc_len, wl, pbn);
2493 if (error)
2494 return error;
2495 error = wapbl_buffered_flush(wl, true);
2496 if (error)
2497 return error;
2498
2499 /*
2500 * flush disk cache to ensure that the commit header is actually
2501 * written before meta data blocks.
2502 */
2503 wapbl_cache_sync(wl, "2");
2504
	/*
	 * If the generation number was zero, write it out a second time.
	 * This handles initialization and generation number rollover.
	 */
2509 if (wc->wc_generation++ == 0) {
2510 error = wapbl_write_commit(wl, head, tail);
		/*
		 * This panic could be removed if we did the zeroing
		 * mentioned above and were certain to roll back the
		 * generation number on failure.
		 */
2516 if (error)
2517 panic("wapbl_write_commit: error writing duplicate "
2518 "log header: %d", error);
2519 }
2520
2521 wl->wl_ev_commit.ev_count++;
2522
2523 return 0;
2524 }
2525
2526 /*
2527 * wapbl_write_blocks(wl, offp)
2528 *
2529 * Write all pending physical blocks in the current transaction
2530 * from wapbl_add_buf to the log on disk, adding to the circular
2531 * queue head at byte offset *offp, and returning the new head's
2532 * byte offset in *offp.
2533 */
2534 static int
2535 wapbl_write_blocks(struct wapbl *wl, off_t *offp)
2536 {
2537 struct wapbl_wc_blocklist *wc =
2538 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
2539 int blocklen = 1<<wl->wl_log_dev_bshift;
2540 struct buf *bp;
2541 off_t off = *offp;
2542 int error;
2543 size_t padding;
2544
2545 KASSERT(rw_write_held(&wl->wl_rwlock));
2546
2547 bp = TAILQ_FIRST(&wl->wl_bufs);
2548
2549 while (bp) {
2550 int cnt;
2551 struct buf *obp = bp;
2552
2553 KASSERT(bp->b_flags & B_LOCKED);
2554
2555 wc->wc_type = WAPBL_WC_BLOCKS;
2556 wc->wc_len = blocklen;
2557 wc->wc_blkcount = 0;
2558 while (bp && (wc->wc_blkcount < wl->wl_brperjblock)) {
2559 /*
2560 * Make sure all the physical block numbers are up to
2561 * date. If this is not always true on a given
2562 * filesystem, then VOP_BMAP must be called. We
2563 * could call VOP_BMAP here, or else in the filesystem
2564 * specific flush callback, although neither of those
2565 * solutions allow us to take the vnode lock. If a
2566 * filesystem requires that we must take the vnode lock
2567 * to call VOP_BMAP, then we can probably do it in
2568 * bwrite when the vnode lock should already be held
2569 * by the invoking code.
2570 */
2571 KASSERT((bp->b_vp->v_type == VBLK) ||
2572 (bp->b_blkno != bp->b_lblkno));
2573 KASSERT(bp->b_blkno > 0);
2574
2575 wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno;
2576 wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount;
2577 wc->wc_len += bp->b_bcount;
2578 wc->wc_blkcount++;
2579 bp = TAILQ_NEXT(bp, b_wapbllist);
2580 }
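		/*
		 * Pad the record to a multiple of the log device block
		 * size; the padding itself is zero-filled further below.
		 */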
2581 if (wc->wc_len % blocklen != 0) {
2582 padding = blocklen - wc->wc_len % blocklen;
2583 wc->wc_len += padding;
2584 } else {
2585 padding = 0;
2586 }
2587
2588 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2589 ("wapbl_write_blocks: len = %u (padding %zu) off = %"PRIdMAX"\n",
2590 wc->wc_len, padding, (intmax_t)off));
2591
2592 error = wapbl_circ_write(wl, wc, blocklen, &off);
2593 if (error)
2594 return error;
2595 bp = obp;
2596 cnt = 0;
2597 while (bp && (cnt++ < wl->wl_brperjblock)) {
2598 error = wapbl_circ_write(wl, bp->b_data,
2599 bp->b_bcount, &off);
2600 if (error)
2601 return error;
2602 bp = TAILQ_NEXT(bp, b_wapbllist);
2603 }
2604 if (padding) {
2605 void *zero;
2606
2607 zero = wapbl_alloc(padding);
2608 memset(zero, 0, padding);
2609 error = wapbl_circ_write(wl, zero, padding, &off);
2610 wapbl_free(zero, padding);
2611 if (error)
2612 return error;
2613 }
2614 }
2615 *offp = off;
2616 return 0;
2617 }
2618
2619 /*
2620 * wapbl_write_revocations(wl, offp)
2621 *
2622 * Write all pending deallocations in the current transaction from
2623 * wapbl_register_deallocation to the log on disk, adding to the
2624 * circular queue's head at byte offset *offp, and returning the
2625 * new head's byte offset in *offp.
2626 */
2627 static int
2628 wapbl_write_revocations(struct wapbl *wl, off_t *offp)
2629 {
2630 struct wapbl_wc_blocklist *wc =
2631 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
2632 struct wapbl_dealloc *wd, *lwd;
2633 int blocklen = 1<<wl->wl_log_dev_bshift;
2634 off_t off = *offp;
2635 int error;
2636
2637 KASSERT(rw_write_held(&wl->wl_rwlock));
2638
2639 if (wl->wl_dealloccnt == 0)
2640 return 0;
2641
2642 while ((wd = TAILQ_FIRST(&wl->wl_dealloclist)) != NULL) {
2643 wc->wc_type = WAPBL_WC_REVOCATIONS;
2644 wc->wc_len = blocklen;
2645 wc->wc_blkcount = 0;
2646 while (wd && (wc->wc_blkcount < wl->wl_brperjblock)) {
2647 wc->wc_blocks[wc->wc_blkcount].wc_daddr =
2648 wd->wd_blkno;
2649 wc->wc_blocks[wc->wc_blkcount].wc_dlen =
2650 wd->wd_len;
2651 wc->wc_blkcount++;
2652
2653 wd = TAILQ_NEXT(wd, wd_entries);
2654 }
2655 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2656 ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n",
2657 wc->wc_len, (intmax_t)off));
2658 error = wapbl_circ_write(wl, wc, blocklen, &off);
2659 if (error)
2660 return error;
2661
2662 /* free all successfully written deallocs */
2663 lwd = wd;
2664 while ((wd = TAILQ_FIRST(&wl->wl_dealloclist)) != NULL) {
2665 if (wd == lwd)
2666 break;
2667 wapbl_deallocation_free(wl, wd, true);
2668 }
2669 }
2670 *offp = off;
2671 return 0;
2672 }
2673
2674 /*
2675 * wapbl_write_inodes(wl, offp)
2676 *
2677 * Write all pending inode allocations in the current transaction
2678 * from wapbl_register_inode to the log on disk, adding to the
2679 * circular queue's head at byte offset *offp and returning the
2680 * new head's byte offset in *offp.
2681 */
2682 static int
2683 wapbl_write_inodes(struct wapbl *wl, off_t *offp)
2684 {
2685 struct wapbl_wc_inodelist *wc =
2686 (struct wapbl_wc_inodelist *)wl->wl_wc_scratch;
2687 int i;
2688 int blocklen = 1 << wl->wl_log_dev_bshift;
2689 off_t off = *offp;
2690 int error;
2691
2692 struct wapbl_ino_head *wih;
2693 struct wapbl_ino *wi;
2694 int iph;
2695
2696 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
2697 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
2698
2699 i = 0;
2700 wih = &wl->wl_inohash[0];
2701 wi = 0;
2702 do {
2703 wc->wc_type = WAPBL_WC_INODES;
2704 wc->wc_len = blocklen;
2705 wc->wc_inocnt = 0;
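		/*
		 * The first record tells replay to discard any inode
		 * list accumulated from earlier records before adding
		 * these entries.
		 */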
2706 wc->wc_clear = (i == 0);
2707 while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) {
2708 while (!wi) {
2709 KASSERT((wih - &wl->wl_inohash[0])
2710 <= wl->wl_inohashmask);
2711 wi = LIST_FIRST(wih++);
2712 }
2713 wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino;
2714 wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode;
2715 wc->wc_inocnt++;
2716 i++;
2717 wi = LIST_NEXT(wi, wi_hash);
2718 }
2719 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2720 ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n",
2721 wc->wc_len, (intmax_t)off));
2722 error = wapbl_circ_write(wl, wc, blocklen, &off);
2723 if (error)
2724 return error;
2725 } while (i < wl->wl_inohashcnt);
2726
2727 *offp = off;
2728 return 0;
2729 }
2730
2731 #endif /* _KERNEL */
2732
2733 /****************************************************************/
2734
2735 struct wapbl_blk {
2736 LIST_ENTRY(wapbl_blk) wb_hash;
2737 daddr_t wb_blk;
2738 off_t wb_off; /* Offset of this block in the log */
2739 };
2740 #define WAPBL_BLKPOOL_MIN 83
2741
2742 static void
2743 wapbl_blkhash_init(struct wapbl_replay *wr, u_int size)
2744 {
2745 if (size < WAPBL_BLKPOOL_MIN)
2746 size = WAPBL_BLKPOOL_MIN;
2747 KASSERT(wr->wr_blkhash == 0);
2748 #ifdef _KERNEL
2749 wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask);
2750 #else /* ! _KERNEL */
2751 /* Manually implement hashinit */
2752 {
2753 unsigned long i, hashsize;
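		/* Round up to a power of two so it can serve as a mask. */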
2754 for (hashsize = 1; hashsize < size; hashsize <<= 1)
2755 continue;
2756 wr->wr_blkhash = wapbl_alloc(hashsize * sizeof(*wr->wr_blkhash));
2757 for (i = 0; i < hashsize; i++)
2758 LIST_INIT(&wr->wr_blkhash[i]);
2759 wr->wr_blkhashmask = hashsize - 1;
2760 }
2761 #endif /* ! _KERNEL */
2762 }
2763
2764 static void
2765 wapbl_blkhash_free(struct wapbl_replay *wr)
2766 {
2767 KASSERT(wr->wr_blkhashcnt == 0);
2768 #ifdef _KERNEL
2769 hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask);
2770 #else /* ! _KERNEL */
2771 wapbl_free(wr->wr_blkhash,
2772 (wr->wr_blkhashmask + 1) * sizeof(*wr->wr_blkhash));
2773 #endif /* ! _KERNEL */
2774 }
2775
2776 static struct wapbl_blk *
2777 wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk)
2778 {
2779 struct wapbl_blk_head *wbh;
2780 struct wapbl_blk *wb;
2781 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2782 LIST_FOREACH(wb, wbh, wb_hash) {
2783 if (blk == wb->wb_blk)
2784 return wb;
2785 }
2786 return 0;
2787 }
2788
2789 static void
2790 wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off)
2791 {
2792 struct wapbl_blk_head *wbh;
2793 struct wapbl_blk *wb;
2794 wb = wapbl_blkhash_get(wr, blk);
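	/* If the block is already logged, just remember its newest offset. */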
2795 if (wb) {
2796 KASSERT(wb->wb_blk == blk);
2797 wb->wb_off = off;
2798 } else {
2799 wb = wapbl_alloc(sizeof(*wb));
2800 wb->wb_blk = blk;
2801 wb->wb_off = off;
2802 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2803 LIST_INSERT_HEAD(wbh, wb, wb_hash);
2804 wr->wr_blkhashcnt++;
2805 }
2806 }
2807
2808 static void
2809 wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk)
2810 {
2811 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2812 if (wb) {
2813 KASSERT(wr->wr_blkhashcnt > 0);
2814 wr->wr_blkhashcnt--;
2815 LIST_REMOVE(wb, wb_hash);
2816 wapbl_free(wb, sizeof(*wb));
2817 }
2818 }
2819
2820 static void
2821 wapbl_blkhash_clear(struct wapbl_replay *wr)
2822 {
2823 unsigned long i;
2824 for (i = 0; i <= wr->wr_blkhashmask; i++) {
2825 struct wapbl_blk *wb;
2826
2827 while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) {
2828 KASSERT(wr->wr_blkhashcnt > 0);
2829 wr->wr_blkhashcnt--;
2830 LIST_REMOVE(wb, wb_hash);
2831 wapbl_free(wb, sizeof(*wb));
2832 }
2833 }
2834 KASSERT(wr->wr_blkhashcnt == 0);
2835 }
2836
2837 /****************************************************************/
2838
2839 /*
2840 * wapbl_circ_read(wr, data, len, offp)
2841 *
2842 * Read len bytes into data from the circular queue of wr,
2843 * starting at the linear byte offset *offp, and returning the new
2844 * linear byte offset in *offp.
2845 *
2846 * If the starting linear byte offset precedes wr->wr_circ_off,
2847 * the read instead begins at wr->wr_circ_off. XXX WTF? This
2848 * should be a KASSERT, not a conditional.
2849 */
2850 static int
2851 wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp)
2852 {
2853 size_t slen;
2854 off_t off = *offp;
2855 int error;
2856 daddr_t pbn;
2857
2858 KASSERT(((len >> wr->wr_log_dev_bshift) <<
2859 wr->wr_log_dev_bshift) == len);
2860
2861 if (off < wr->wr_circ_off)
2862 off = wr->wr_circ_off;
2863 slen = wr->wr_circ_off + wr->wr_circ_size - off;
2864 if (slen < len) {
2865 pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
2866 #ifdef _KERNEL
2867 pbn = btodb(pbn << wr->wr_log_dev_bshift);
2868 #endif
2869 error = wapbl_read(data, slen, wr->wr_devvp, pbn);
2870 if (error)
2871 return error;
2872 data = (uint8_t *)data + slen;
2873 len -= slen;
2874 off = wr->wr_circ_off;
2875 }
2876 pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
2877 #ifdef _KERNEL
2878 pbn = btodb(pbn << wr->wr_log_dev_bshift);
2879 #endif
2880 error = wapbl_read(data, len, wr->wr_devvp, pbn);
2881 if (error)
2882 return error;
2883 off += len;
2884 if (off >= wr->wr_circ_off + wr->wr_circ_size)
2885 off = wr->wr_circ_off;
2886 *offp = off;
2887 return 0;
2888 }
2889
2890 /*
2891 * wapbl_circ_advance(wr, len, offp)
2892 *
2893 * Compute the linear byte offset of the circular queue of wr that
2894 * is len bytes past *offp, and store it in *offp.
2895 *
 * This behaves like wapbl_circ_read, but without actually reading
 * anything.
2898 *
2899 * If the starting linear byte offset precedes wr->wr_circ_off, it
2900 * is taken to be wr->wr_circ_off instead. XXX WTF? This should
2901 * be a KASSERT, not a conditional.
2902 */
2903 static void
2904 wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp)
2905 {
2906 size_t slen;
2907 off_t off = *offp;
2908
2909 KASSERT(((len >> wr->wr_log_dev_bshift) <<
2910 wr->wr_log_dev_bshift) == len);
2911
2912 if (off < wr->wr_circ_off)
2913 off = wr->wr_circ_off;
2914 slen = wr->wr_circ_off + wr->wr_circ_size - off;
2915 if (slen < len) {
2916 len -= slen;
2917 off = wr->wr_circ_off;
2918 }
2919 off += len;
2920 if (off >= wr->wr_circ_off + wr->wr_circ_size)
2921 off = wr->wr_circ_off;
2922 *offp = off;
2923 }
2924
2925 /****************************************************************/
2926
2927 int
2928 wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp,
2929 daddr_t off, size_t count, size_t blksize)
2930 {
2931 struct wapbl_replay *wr;
2932 int error;
2933 struct vnode *devvp;
2934 daddr_t logpbn;
2935 uint8_t *scratch;
2936 struct wapbl_wc_header *wch;
2937 struct wapbl_wc_header *wch2;
2938 /* Use this until we read the actual log header */
2939 int log_dev_bshift = ilog2(blksize);
2940 size_t used;
2941 daddr_t pbn;
2942
2943 WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
2944 ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n",
2945 vp, off, count, blksize));
2946
2947 if (off < 0)
2948 return EINVAL;
2949
2950 if (blksize < DEV_BSIZE)
2951 return EINVAL;
2952 if (blksize % DEV_BSIZE)
2953 return EINVAL;
2954
2955 #ifdef _KERNEL
2956 #if 0
2957 /* XXX vp->v_size isn't reliably set for VBLK devices,
2958 * especially root. However, we might still want to verify
2959 * that the full load is readable */
2960 if ((off + count) * blksize > vp->v_size)
2961 return EINVAL;
2962 #endif
2963 if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) {
2964 return error;
2965 }
2966 #else /* ! _KERNEL */
2967 devvp = vp;
2968 logpbn = off;
2969 #endif /* ! _KERNEL */
2970
2971 scratch = wapbl_alloc(MAXBSIZE);
2972
2973 pbn = logpbn;
2974 #ifdef _KERNEL
2975 pbn = btodb(pbn << log_dev_bshift);
2976 #endif
2977 error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, pbn);
2978 if (error)
2979 goto errout;
2980
2981 wch = (struct wapbl_wc_header *)scratch;
2982 wch2 =
2983 (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift));
2984 /* XXX verify checksums and magic numbers */
2985 if (wch->wc_type != WAPBL_WC_HEADER) {
2986 printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type);
2987 error = EFTYPE;
2988 goto errout;
2989 }
2990
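	/* Of the two commit headers, trust the one with the newer generation. */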
2991 if (wch2->wc_generation > wch->wc_generation)
2992 wch = wch2;
2993
2994 wr = wapbl_calloc(1, sizeof(*wr));
2995
2996 wr->wr_logvp = vp;
2997 wr->wr_devvp = devvp;
2998 wr->wr_logpbn = logpbn;
2999
3000 wr->wr_scratch = scratch;
3001
3002 wr->wr_log_dev_bshift = wch->wc_log_dev_bshift;
3003 wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift;
3004 wr->wr_circ_off = wch->wc_circ_off;
3005 wr->wr_circ_size = wch->wc_circ_size;
3006 wr->wr_generation = wch->wc_generation;
3007
3008 used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail);
3009
3010 WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
3011 ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64
3012 " len=%"PRId64" used=%zu\n",
3013 wch->wc_head, wch->wc_tail, wch->wc_circ_off,
3014 wch->wc_circ_size, used));
3015
3016 wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift));
3017
3018 error = wapbl_replay_process(wr, wch->wc_head, wch->wc_tail);
3019 if (error) {
3020 wapbl_replay_stop(wr);
3021 wapbl_replay_free(wr);
3022 return error;
3023 }
3024
3025 *wrp = wr;
3026 return 0;
3027
3028 errout:
3029 wapbl_free(scratch, MAXBSIZE);
3030 return error;
3031 }
3032
3033 void
3034 wapbl_replay_stop(struct wapbl_replay *wr)
3035 {
3036
3037 if (!wapbl_replay_isopen(wr))
3038 return;
3039
3040 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n"));
3041
3042 wapbl_free(wr->wr_scratch, MAXBSIZE);
3043 wr->wr_scratch = NULL;
3044
3045 wr->wr_logvp = NULL;
3046
3047 wapbl_blkhash_clear(wr);
3048 wapbl_blkhash_free(wr);
3049 }
3050
3051 void
3052 wapbl_replay_free(struct wapbl_replay *wr)
3053 {
3054
3055 KDASSERT(!wapbl_replay_isopen(wr));
3056
3057 if (wr->wr_inodes)
3058 wapbl_free(wr->wr_inodes,
3059 wr->wr_inodescnt * sizeof(wr->wr_inodes[0]));
3060 wapbl_free(wr, sizeof(*wr));
3061 }
3062
3063 #ifdef _KERNEL
3064 int
3065 wapbl_replay_isopen1(struct wapbl_replay *wr)
3066 {
3067
3068 return wapbl_replay_isopen(wr);
3069 }
3070 #endif
3071
/*
 * Calculate the disk address for the i'th block in the wc_blocks
 * list, offset by j blocks of size blen.
 *
 * wc_daddr is always a kernel disk address in DEV_BSIZE units that
 * was written to the journal.
 *
 * The kernel needs that address plus the offset in DEV_BSIZE units.
 *
 * Userland needs that address plus the offset in blen units.
 */
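/*
 * For example, assuming DEV_BSIZE is 512: with blen = 2048, a block
 * at wc_daddr = 64 and j = 3, the kernel computes 64 + btodb(3 * 2048)
 * = 64 + 12 = 76 (DEV_BSIZE units), while userland computes
 * dbtob(64) / 2048 + 3 = 16 + 3 = 19 (blen units).
 */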
3084 static daddr_t
3085 wapbl_block_daddr(struct wapbl_wc_blocklist *wc, int i, int j, int blen)
3086 {
3087 daddr_t pbn;
3088
3089 #ifdef _KERNEL
3090 pbn = wc->wc_blocks[i].wc_daddr + btodb(j * blen);
3091 #else
3092 pbn = dbtob(wc->wc_blocks[i].wc_daddr) / blen + j;
3093 #endif
3094
3095 return pbn;
3096 }
3097
3098 static void
3099 wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp)
3100 {
3101 struct wapbl_wc_blocklist *wc =
3102 (struct wapbl_wc_blocklist *)wr->wr_scratch;
3103 int fsblklen = 1 << wr->wr_fs_dev_bshift;
3104 int i, j, n;
3105
3106 for (i = 0; i < wc->wc_blkcount; i++) {
3107 /*
3108 * Enter each physical block into the hashtable independently.
3109 */
3110 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
3111 for (j = 0; j < n; j++) {
3112 wapbl_blkhash_ins(wr, wapbl_block_daddr(wc, i, j, fsblklen),
3113 *offp);
3114 wapbl_circ_advance(wr, fsblklen, offp);
3115 }
3116 }
3117 }
3118
3119 static void
3120 wapbl_replay_process_revocations(struct wapbl_replay *wr)
3121 {
3122 struct wapbl_wc_blocklist *wc =
3123 (struct wapbl_wc_blocklist *)wr->wr_scratch;
3124 int fsblklen = 1 << wr->wr_fs_dev_bshift;
3125 int i, j, n;
3126
3127 for (i = 0; i < wc->wc_blkcount; i++) {
3128 /*
3129 * Remove any blocks found from the hashtable.
3130 */
3131 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
3132 for (j = 0; j < n; j++)
3133 wapbl_blkhash_rem(wr, wapbl_block_daddr(wc, i, j, fsblklen));
3134 }
3135 }
3136
3137 static void
3138 wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff, off_t newoff)
3139 {
3140 struct wapbl_wc_inodelist *wc =
3141 (struct wapbl_wc_inodelist *)wr->wr_scratch;
3142 void *new_inodes;
3143 const size_t oldsize = wr->wr_inodescnt * sizeof(wr->wr_inodes[0]);
3144
3145 KASSERT(sizeof(wr->wr_inodes[0]) == sizeof(wc->wc_inodes[0]));
3146
	/*
	 * Keep track of where we found this so the location won't be
	 * overwritten.
	 */
3151 if (wc->wc_clear) {
3152 wr->wr_inodestail = oldoff;
3153 wr->wr_inodescnt = 0;
3154 if (wr->wr_inodes != NULL) {
3155 wapbl_free(wr->wr_inodes, oldsize);
3156 wr->wr_inodes = NULL;
3157 }
3158 }
3159 wr->wr_inodeshead = newoff;
3160 if (wc->wc_inocnt == 0)
3161 return;
3162
3163 new_inodes = wapbl_alloc((wr->wr_inodescnt + wc->wc_inocnt) *
3164 sizeof(wr->wr_inodes[0]));
3165 if (wr->wr_inodes != NULL) {
3166 memcpy(new_inodes, wr->wr_inodes, oldsize);
3167 wapbl_free(wr->wr_inodes, oldsize);
3168 }
3169 wr->wr_inodes = new_inodes;
3170 memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes,
3171 wc->wc_inocnt * sizeof(wr->wr_inodes[0]));
3172 wr->wr_inodescnt += wc->wc_inocnt;
3173 }
3174
3175 static int
3176 wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail)
3177 {
3178 off_t off;
3179 int error;
3180
3181 int logblklen = 1 << wr->wr_log_dev_bshift;
3182
3183 wapbl_blkhash_clear(wr);
3184
3185 off = tail;
3186 while (off != head) {
3187 struct wapbl_wc_null *wcn;
3188 off_t saveoff = off;
3189 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
3190 if (error)
3191 goto errout;
3192 wcn = (struct wapbl_wc_null *)wr->wr_scratch;
3193 switch (wcn->wc_type) {
3194 case WAPBL_WC_BLOCKS:
3195 wapbl_replay_process_blocks(wr, &off);
3196 break;
3197
3198 case WAPBL_WC_REVOCATIONS:
3199 wapbl_replay_process_revocations(wr);
3200 break;
3201
3202 case WAPBL_WC_INODES:
3203 wapbl_replay_process_inodes(wr, saveoff, off);
3204 break;
3205
3206 default:
3207 printf("Unrecognized wapbl type: 0x%08x\n",
3208 wcn->wc_type);
3209 error = EFTYPE;
3210 goto errout;
3211 }
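		/*
		 * Advancing saveoff by the record's wc_len must land
		 * exactly where processing left off; any mismatch means
		 * the log records are inconsistent.
		 */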
3212 wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
3213 if (off != saveoff) {
3214 printf("wapbl_replay: corrupted records\n");
3215 error = EFTYPE;
3216 goto errout;
3217 }
3218 }
3219 return 0;
3220
3221 errout:
3222 wapbl_blkhash_clear(wr);
3223 return error;
3224 }
3225
3226 #if 0
3227 int
3228 wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp)
3229 {
3230 off_t off;
3231 int mismatchcnt = 0;
3232 int logblklen = 1 << wr->wr_log_dev_bshift;
3233 int fsblklen = 1 << wr->wr_fs_dev_bshift;
3234 void *scratch1 = wapbl_alloc(MAXBSIZE);
3235 void *scratch2 = wapbl_alloc(MAXBSIZE);
3236 int error = 0;
3237
3238 KDASSERT(wapbl_replay_isopen(wr));
3239
3240 off = wch->wc_tail;
3241 while (off != wch->wc_head) {
3242 struct wapbl_wc_null *wcn;
3243 #ifdef DEBUG
3244 off_t saveoff = off;
3245 #endif
3246 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
3247 if (error)
3248 goto out;
3249 wcn = (struct wapbl_wc_null *)wr->wr_scratch;
3250 switch (wcn->wc_type) {
3251 case WAPBL_WC_BLOCKS:
3252 {
3253 struct wapbl_wc_blocklist *wc =
3254 (struct wapbl_wc_blocklist *)wr->wr_scratch;
3255 int i;
3256 for (i = 0; i < wc->wc_blkcount; i++) {
3257 int foundcnt = 0;
3258 int dirtycnt = 0;
3259 int j, n;
				/*
				 * Check each physical block against
				 * the hashtable independently.
				 */
3264 n = wc->wc_blocks[i].wc_dlen >>
3265 wch->wc_fs_dev_bshift;
3266 for (j = 0; j < n; j++) {
3267 struct wapbl_blk *wb =
3268 wapbl_blkhash_get(wr,
3269 wapbl_block_daddr(wc, i, j, fsblklen));
3270 if (wb && (wb->wb_off == off)) {
3271 foundcnt++;
3272 error =
3273 wapbl_circ_read(wr,
3274 scratch1, fsblklen,
3275 &off);
3276 if (error)
3277 goto out;
3278 error =
3279 wapbl_read(scratch2,
3280 fsblklen, fsdevvp,
3281 wb->wb_blk);
3282 if (error)
3283 goto out;
3284 if (memcmp(scratch1,
3285 scratch2,
3286 fsblklen)) {
3287 printf(
3288 "wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n",
3289 wb->wb_blk, (intmax_t)off);
3290 dirtycnt++;
3291 mismatchcnt++;
3292 }
3293 } else {
3294 wapbl_circ_advance(wr,
3295 fsblklen, &off);
3296 }
3297 }
3298 #if 0
3299 /*
3300 * If all of the blocks in an entry
3301 * are clean, then remove all of its
3302 * blocks from the hashtable since they
3303 * never will need replay.
3304 */
3305 if ((foundcnt != 0) &&
3306 (dirtycnt == 0)) {
3307 off = saveoff;
3308 wapbl_circ_advance(wr,
3309 logblklen, &off);
3310 for (j = 0; j < n; j++) {
3311 struct wapbl_blk *wb =
3312 wapbl_blkhash_get(wr,
3313 wapbl_block_daddr(wc, i, j, fsblklen));
3314 if (wb &&
3315 (wb->wb_off == off)) {
3316 wapbl_blkhash_rem(wr, wb->wb_blk);
3317 }
3318 wapbl_circ_advance(wr,
3319 fsblklen, &off);
3320 }
3321 }
3322 #endif
3323 }
3324 }
3325 break;
3326 case WAPBL_WC_REVOCATIONS:
3327 case WAPBL_WC_INODES:
3328 break;
3329 default:
3330 KASSERT(0);
3331 }
3332 #ifdef DEBUG
3333 wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
3334 KASSERT(off == saveoff);
3335 #endif
3336 }
3337 out:
3338 wapbl_free(scratch1, MAXBSIZE);
3339 wapbl_free(scratch2, MAXBSIZE);
3340 if (!error && mismatchcnt)
3341 error = EFTYPE;
3342 return error;
3343 }
3344 #endif
3345
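/*
 * wapbl_replay_write(wr, fsdevvp)
 *
 * Write the latest logged copy of every block in the replay hash
 * out to the file system device fsdevvp.
 */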
3346 int
3347 wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp)
3348 {
3349 struct wapbl_blk *wb;
3350 size_t i;
3351 off_t off;
3352 void *scratch;
3353 int error = 0;
3354 int fsblklen = 1 << wr->wr_fs_dev_bshift;
3355
3356 KDASSERT(wapbl_replay_isopen(wr));
3357
3358 scratch = wapbl_alloc(MAXBSIZE);
3359
3360 for (i = 0; i <= wr->wr_blkhashmask; ++i) {
3361 LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) {
3362 off = wb->wb_off;
3363 error = wapbl_circ_read(wr, scratch, fsblklen, &off);
3364 if (error)
3365 break;
3366 error = wapbl_write(scratch, fsblklen, fsdevvp,
3367 wb->wb_blk);
3368 if (error)
3369 break;
3370 }
3371 }
3372
3373 wapbl_free(scratch, MAXBSIZE);
3374 return error;
3375 }
3376
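/*
 * wapbl_replay_can_read(wr, blk, len)
 *
 * Return nonzero if any of the len bytes of data starting at disk
 * address blk are present in the replay log.
 */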
3377 int
3378 wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len)
3379 {
3380 int fsblklen = 1 << wr->wr_fs_dev_bshift;
3381
3382 KDASSERT(wapbl_replay_isopen(wr));
3383 KASSERT((len % fsblklen) == 0);
3384
	while (len != 0) {
		struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
		if (wb)
			return 1;
		len -= fsblklen;
		blk++;	/* otherwise the same block would be re-checked */
	}
3391 return 0;
3392 }
3393
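/*
 * wapbl_replay_read(wr, data, blk, len)
 *
 * For each file system block in the len bytes starting at disk
 * address blk, copy the latest logged contents into data; blocks
 * absent from the log are left untouched.
 */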
3394 int
3395 wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len)
3396 {
3397 int fsblklen = 1 << wr->wr_fs_dev_bshift;
3398
3399 KDASSERT(wapbl_replay_isopen(wr));
3400
3401 KASSERT((len % fsblklen) == 0);
3402
3403 while (len != 0) {
3404 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
3405 if (wb) {
3406 off_t off = wb->wb_off;
3407 int error;
3408 error = wapbl_circ_read(wr, data, fsblklen, &off);
3409 if (error)
3410 return error;
3411 }
3412 data = (uint8_t *)data + fsblklen;
3413 len -= fsblklen;
3414 blk++;
3415 }
3416 return 0;
3417 }
3418
3419 #ifdef _KERNEL
3420
3421 MODULE(MODULE_CLASS_VFS, wapbl, NULL);
3422
3423 static int
3424 wapbl_modcmd(modcmd_t cmd, void *arg)
3425 {
3426
3427 switch (cmd) {
3428 case MODULE_CMD_INIT:
3429 wapbl_init();
3430 return 0;
3431 case MODULE_CMD_FINI:
3432 return wapbl_fini();
3433 default:
3434 return ENOTTY;
3435 }
3436 }
3437 #endif /* _KERNEL */
3438