/*	$NetBSD: vfs_wapbl.c,v 1.116 2024/12/07 02:27:38 riastradh Exp $	*/

/*-
 * Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * This implements file system independent write-ahead logging.
 */

#define WAPBL_INTERNAL

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.116 2024/12/07 02:27:38 riastradh Exp $");

#include <sys/param.h>
#include <sys/types.h>

#include <sys/bitops.h>
#include <sys/time.h>
#include <sys/wapbl.h>
#include <sys/wapbl_replay.h>

#ifdef _KERNEL

#include <sys/atomic.h>
#include <sys/conf.h>
#include <sys/evcnt.h>
#include <sys/file.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sdt.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/vnode.h>

#include <miscfs/specfs/specdev.h>

#define	wapbl_alloc(s)		kmem_alloc((s), KM_SLEEP)
#define	wapbl_free(a, s)	kmem_free((a), (s))
#define	wapbl_calloc(n, s)	kmem_zalloc((n)*(s), KM_SLEEP)

static int wapbl_flush_disk_cache = 1;
static int wapbl_verbose_commit = 0;
static int wapbl_allow_dpofua = 0;	/* switched off by default for now */
static int wapbl_journal_iobufs = 4;

static inline size_t wapbl_space_free(size_t, off_t, off_t);

#else /* !_KERNEL */

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define	KDASSERT(x)		assert(x)
#define	KASSERT(x)		assert(x)
#define	wapbl_alloc(s)		malloc(s)
#define	wapbl_free(a, s)	free(a)
#define	wapbl_calloc(n, s)	calloc((n), (s))

#endif /* !_KERNEL */

/*
 * INTERNAL DATA STRUCTURES
 */

/*
 * This structure holds per-mount log information.
 *
 * Legend:	a = atomic access only
 *		r = read-only after init
 *		l = rwlock held
 *		m = mutex held
 *		lm = rwlock held writing or mutex held
 *		u = unlocked access ok
 *		b = bufcache_lock held
 */
LIST_HEAD(wapbl_ino_head, wapbl_ino);
struct wapbl {
	struct vnode *wl_logvp;	/* r:	log here */
	struct vnode *wl_devvp;	/* r:	log on this device */
	struct mount *wl_mount;	/* r:	mountpoint wl is associated with */
	daddr_t wl_logpbn;	/* r:	Physical block number of start of log */
	int wl_log_dev_bshift;	/* r:	logarithm of device block size of log
					   device */
	int wl_fs_dev_bshift;	/* r:	logarithm of device block size of
					   filesystem device */

	unsigned wl_lock_count;	/* m:	Count of transactions in progress */

	size_t wl_circ_size;	/* r:	Number of bytes in buffer of log */
	size_t wl_circ_off;	/* r:	Number of bytes reserved at start */

	size_t wl_bufcount_max;	/* r:	Number of buffers reserved for log */
	size_t wl_bufbytes_max;	/* r:	Number of buf bytes reserved for log */

	off_t wl_head;		/* l:	Byte offset of log head */
	off_t wl_tail;		/* l:	Byte offset of log tail */
	/*
	 * WAPBL log layout, stored on wl_devvp at wl_logpbn:
	 *
	 *  ___________________ wl_circ_size __________________
	 * /                                                   \
	 * +---------+---------+-------+--------------+--------+
	 * [ commit0 | commit1 | CCWCW | EEEEEEEEEEEE | CCCWCW ]
	 * +---------+---------+-------+--------------+--------+
	 *       wl_circ_off --^       ^-- wl_head    ^-- wl_tail
	 *
	 * commit0 and commit1 are commit headers.  A commit header has
	 * a generation number, indicating which of the two headers is
	 * more recent, and an assignment of head and tail pointers.
	 * The rest is a circular queue of log records, starting at
	 * the byte offset wl_circ_off.
	 *
	 * E marks empty space for records.
	 * W marks records for block writes issued but waiting.
	 * C marks completed records.
	 *
	 * wapbl_flush writes new records to empty `E' spaces after
	 * wl_head from the current transaction in memory.
	 *
	 * wapbl_truncate advances wl_tail past any completed `C'
	 * records, freeing them up for use.
	 *
	 * head == tail == 0 means log is empty.
	 * head == tail != 0 means log is full.
	 *
	 * See assertions in wapbl_advance() for other boundary
	 * conditions.
	 *
	 * Only wapbl_flush moves the head, except when wapbl_truncate
	 * sets it to 0 to indicate that the log is empty.
	 *
	 * Only wapbl_truncate moves the tail, except when wapbl_flush
	 * sets it to wl_circ_off to indicate that the log is full.
	 */
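	/*
	 * Worked example of the head/tail encoding (illustrative
	 * numbers only): with wl_circ_off == 1024, an empty log has
	 * head == tail == 0.  After the first flush writes 2048 bytes
	 * of records, head == 3072 and tail == 1024; once those
	 * records complete, wapbl_truncate can advance the tail up to
	 * the head, at which point both are reset to 0 again.
	 */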

	struct wapbl_wc_header *wl_wc_header;	/* l */
	void *wl_wc_scratch;	/* l:	scratch space (XXX: why?!?) */

	kmutex_t wl_mtx;	/* u:	short-term lock */
	krwlock_t wl_rwlock;	/* u:	File system transaction lock */

	/*
	 * Must be held while accessing
	 * wl_count or wl_bufs or head or tail
	 */

#if _KERNEL
	/*
	 * Callback called from within the flush routine to flush any extra
	 * bits.  Note that flush may be skipped without calling this if
	 * there are no outstanding buffers in the transaction.
	 */
	wapbl_flush_fn_t wl_flush;	/* r */
	wapbl_flush_fn_t wl_flush_abort;/* r */

	/* Event counters */
	char wl_ev_group[EVCNT_STRING_MAX];	/* r */
	struct evcnt wl_ev_commit;		/* l */
	struct evcnt wl_ev_journalwrite;	/* l */
	struct evcnt wl_ev_jbufs_bio_nowait;	/* l */
	struct evcnt wl_ev_metawrite;		/* lm */
	struct evcnt wl_ev_cacheflush;		/* l */
#endif

	size_t wl_bufbytes;	/* m:	Byte count of pages in wl_bufs */
	size_t wl_bufcount;	/* m:	Count of buffers in wl_bufs */
	size_t wl_bcount;	/* m:	Total bcount of wl_bufs */

	TAILQ_HEAD(, buf) wl_bufs;	/* m:	Buffers in current transaction */

	kcondvar_t wl_reclaimable_cv;	/* m (obviously) */
	size_t wl_reclaimable_bytes;	/* m:	Amount of space available for
					   reclamation by truncate */
	int wl_error_count;	/* m:	# of wl_entries with errors */
	size_t wl_reserved_bytes;	/* never truncate log smaller than this */

#ifdef WAPBL_DEBUG_BUFBYTES
	size_t wl_unsynced_bufbytes;	/* Byte count of unsynced buffers */
#endif

#if _KERNEL
	int wl_brperjblock;	/* r	Block records per journal block */
#endif

	TAILQ_HEAD(, wapbl_dealloc) wl_dealloclist;	/* lm:	list head */
	int wl_dealloccnt;				/* lm:	total count */
	int wl_dealloclim;				/* r:	max count */

	/* hashtable of inode numbers for allocated but unlinked inodes */
	/* synch ??? */
	struct wapbl_ino_head *wl_inohash;
	u_long wl_inohashmask;
	int wl_inohashcnt;

	SIMPLEQ_HEAD(, wapbl_entry) wl_entries;	/* m:	On disk transaction
						   accounting */

	/* buffers for wapbl_buffered_write() */
	TAILQ_HEAD(, buf) wl_iobufs;		/* l:	Free or filling bufs */
	TAILQ_HEAD(, buf) wl_iobufs_busy;	/* l:	In-transit bufs */

	int wl_dkcache;		/* r:	disk cache flags */
#define	WAPBL_USE_FUA(wl)	\
	(wapbl_allow_dpofua && ISSET((wl)->wl_dkcache, DKCACHE_FUA))
#define	WAPBL_JFLAGS(wl)	\
	(WAPBL_USE_FUA(wl) ? (wl)->wl_jwrite_flags : 0)
#define	WAPBL_JDATA_FLAGS(wl)	\
	(WAPBL_JFLAGS(wl) & B_MEDIA_DPO)	/* only DPO */
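	/*
	 * For illustration: on a disk advertising DKCACHE_FUA and
	 * DKCACHE_DPO, and with the sysctl vfs.wapbl.allow_dpofua set
	 * to 1, WAPBL_JFLAGS(wl) evaluates to B_MEDIA_FUA|B_MEDIA_DPO
	 * and is used for commit writes, while WAPBL_JDATA_FLAGS(wl)
	 * keeps only B_MEDIA_DPO for journal data writes (only the
	 * commit record is forced to stable media with FUA).
	 */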
	int wl_jwrite_flags;	/* r:	journal write flags */
};

#ifdef WAPBL_DEBUG_PRINT
int wapbl_debug_print = WAPBL_DEBUG_PRINT;
#endif

/****************************************************************/
#ifdef _KERNEL

#ifdef WAPBL_DEBUG
struct wapbl *wapbl_debug_wl;
#endif

static int wapbl_write_commit(struct wapbl *, off_t, off_t);
static int wapbl_write_blocks(struct wapbl *, off_t *);
static int wapbl_write_revocations(struct wapbl *, off_t *);
static int wapbl_write_inodes(struct wapbl *, off_t *);
#endif /* _KERNEL */

static int wapbl_replay_process(struct wapbl_replay *, off_t, off_t);

static inline size_t wapbl_space_used(size_t, off_t, off_t);

#ifdef _KERNEL

static struct pool wapbl_entry_pool;
static struct pool wapbl_dealloc_pool;

#define	WAPBL_INODETRK_SIZE 83
static int wapbl_ino_pool_refcount;
static struct pool wapbl_ino_pool;
struct wapbl_ino {
	LIST_ENTRY(wapbl_ino) wi_hash;
	ino_t wi_ino;
	mode_t wi_mode;
};

static void wapbl_inodetrk_init(struct wapbl *wl, u_int size);
static void wapbl_inodetrk_free(struct wapbl *wl);
static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino);

static size_t wapbl_transaction_len(struct wapbl *wl);
static inline size_t wapbl_transaction_inodes_len(struct wapbl *wl);

static void wapbl_deallocation_free(struct wapbl *, struct wapbl_dealloc *,
    bool);

static void wapbl_evcnt_init(struct wapbl *);
static void wapbl_evcnt_free(struct wapbl *);

static void wapbl_dkcache_init(struct wapbl *);

#if 0
int wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
#endif

static int wapbl_replay_isopen1(struct wapbl_replay *);

const struct wapbl_ops wapbl_ops = {
	.wo_wapbl_discard	= wapbl_discard,
	.wo_wapbl_replay_isopen	= wapbl_replay_isopen1,
	.wo_wapbl_replay_can_read = wapbl_replay_can_read,
	.wo_wapbl_replay_read	= wapbl_replay_read,
	.wo_wapbl_add_buf	= wapbl_add_buf,
	.wo_wapbl_remove_buf	= wapbl_remove_buf,
	.wo_wapbl_resize_buf	= wapbl_resize_buf,
	.wo_wapbl_begin		= wapbl_begin,
	.wo_wapbl_end		= wapbl_end,
	.wo_wapbl_junlock_assert= wapbl_junlock_assert,
	.wo_wapbl_jlock_assert	= wapbl_jlock_assert,

	/* XXX: the following is only used to say "this is a wapbl buf" */
	.wo_wapbl_biodone	= wapbl_biodone,
};

SYSCTL_SETUP(wapbl_sysctl_init, "wapbl sysctl")
{
	int rv;
	const struct sysctlnode *rnode, *cnode;

	rv = sysctl_createv(clog, 0, NULL, &rnode,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "wapbl",
	    SYSCTL_DESCR("WAPBL journaling options"),
	    NULL, 0, NULL, 0,
	    CTL_VFS, CTL_CREATE, CTL_EOL);
	if (rv)
		return;

	rv = sysctl_createv(clog, 0, &rnode, &cnode,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_INT, "flush_disk_cache",
	    SYSCTL_DESCR("flush disk cache"),
	    NULL, 0, &wapbl_flush_disk_cache, 0,
	    CTL_CREATE, CTL_EOL);
	if (rv)
		return;

	rv = sysctl_createv(clog, 0, &rnode, &cnode,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_INT, "verbose_commit",
	    SYSCTL_DESCR("show time and size of wapbl log commits"),
	    NULL, 0, &wapbl_verbose_commit, 0,
	    CTL_CREATE, CTL_EOL);
	if (rv)
		return;

	rv = sysctl_createv(clog, 0, &rnode, &cnode,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_INT, "allow_dpofua",
	    SYSCTL_DESCR("allow use of FUA/DPO instead of cache flush"
		" if available"),
	    NULL, 0, &wapbl_allow_dpofua, 0,
	    CTL_CREATE, CTL_EOL);
	if (rv)
		return;

	rv = sysctl_createv(clog, 0, &rnode, &cnode,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_INT, "journal_iobufs",
	    SYSCTL_DESCR("count of bufs used for journal I/O"
		" (max async count)"),
	    NULL, 0, &wapbl_journal_iobufs, 0,
	    CTL_CREATE, CTL_EOL);
	if (rv)
		return;

	return;
}

static void
wapbl_init(void)
{

	pool_init(&wapbl_entry_pool, sizeof(struct wapbl_entry), 0, 0, 0,
	    "wapblentrypl", &pool_allocator_kmem, IPL_VM);
	pool_init(&wapbl_dealloc_pool, sizeof(struct wapbl_dealloc), 0, 0, 0,
	    "wapbldealloc", &pool_allocator_nointr, IPL_NONE);
}

static int
wapbl_fini(void)
{

	pool_destroy(&wapbl_dealloc_pool);
	pool_destroy(&wapbl_entry_pool);

	return 0;
}

static void
wapbl_evcnt_init(struct wapbl *wl)
{

	snprintf(wl->wl_ev_group, sizeof(wl->wl_ev_group),
	    "wapbl fsid 0x%x/0x%x",
	    wl->wl_mount->mnt_stat.f_fsidx.__fsid_val[0],
	    wl->wl_mount->mnt_stat.f_fsidx.__fsid_val[1]);

	evcnt_attach_dynamic(&wl->wl_ev_commit, EVCNT_TYPE_MISC,
	    NULL, wl->wl_ev_group, "commit");
	evcnt_attach_dynamic(&wl->wl_ev_journalwrite, EVCNT_TYPE_MISC,
	    NULL, wl->wl_ev_group, "journal write total");
	evcnt_attach_dynamic(&wl->wl_ev_jbufs_bio_nowait, EVCNT_TYPE_MISC,
	    NULL, wl->wl_ev_group, "journal write finished async");
	evcnt_attach_dynamic(&wl->wl_ev_metawrite, EVCNT_TYPE_MISC,
	    NULL, wl->wl_ev_group, "metadata async write");
	evcnt_attach_dynamic(&wl->wl_ev_cacheflush, EVCNT_TYPE_MISC,
	    NULL, wl->wl_ev_group, "cache flush");
}

static void
wapbl_evcnt_free(struct wapbl *wl)
{

	evcnt_detach(&wl->wl_ev_commit);
	evcnt_detach(&wl->wl_ev_journalwrite);
	evcnt_detach(&wl->wl_ev_jbufs_bio_nowait);
	evcnt_detach(&wl->wl_ev_metawrite);
	evcnt_detach(&wl->wl_ev_cacheflush);
}

static void
wapbl_dkcache_init(struct wapbl *wl)
{
	int error;

	/* Get disk cache flags */
	error = VOP_IOCTL(wl->wl_devvp, DIOCGCACHE, &wl->wl_dkcache,
	    FWRITE, FSCRED);
	if (error) {
		/* behave as if there was a write cache */
		wl->wl_dkcache = DKCACHE_WRITE;
	}

	/* Use FUA instead of cache flush if available */
	if (ISSET(wl->wl_dkcache, DKCACHE_FUA))
		wl->wl_jwrite_flags |= B_MEDIA_FUA;

	/* Use DPO for journal writes if available */
	if (ISSET(wl->wl_dkcache, DKCACHE_DPO))
		wl->wl_jwrite_flags |= B_MEDIA_DPO;
}

static int
wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr)
{
	int error, i;

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_start: reusing log with %d inodes\n", wr->wr_inodescnt));

	/*
	 * It's only valid to reuse the replay log if it's
	 * the same as the new log we just opened.
	 */
	KDASSERT(!wapbl_replay_isopen(wr));
	KASSERT(wl->wl_devvp->v_type == VBLK);
	KASSERT(wr->wr_devvp->v_type == VBLK);
	KASSERT(wl->wl_devvp->v_rdev == wr->wr_devvp->v_rdev);
	KASSERT(wl->wl_logpbn == wr->wr_logpbn);
	KASSERT(wl->wl_circ_size == wr->wr_circ_size);
	KASSERT(wl->wl_circ_off == wr->wr_circ_off);
	KASSERT(wl->wl_log_dev_bshift == wr->wr_log_dev_bshift);
	KASSERT(wl->wl_fs_dev_bshift == wr->wr_fs_dev_bshift);

	wl->wl_wc_header->wc_generation = wr->wr_generation + 1;

	for (i = 0; i < wr->wr_inodescnt; i++)
		wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber,
		    wr->wr_inodes[i].wr_imode);

	/* Make sure new transaction won't overwrite old inodes list */
	KDASSERT(wapbl_transaction_len(wl) <=
	    wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead,
	    wr->wr_inodestail));

	wl->wl_head = wl->wl_tail = wr->wr_inodeshead;
	wl->wl_reclaimable_bytes = wl->wl_reserved_bytes =
	    wapbl_transaction_len(wl);

	error = wapbl_write_inodes(wl, &wl->wl_head);
	if (error)
		return error;

	KASSERT(wl->wl_head != wl->wl_tail);
	KASSERT(wl->wl_head != 0);

	return 0;
}

int
wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp,
    daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr,
    wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn)
{
	struct wapbl *wl;
	struct vnode *devvp;
	daddr_t logpbn;
	int error;
	int log_dev_bshift = ilog2(blksize);
	int fs_dev_bshift = log_dev_bshift;
	int run;

	WAPBL_PRINTF(WAPBL_PRINT_OPEN,
	    ("wapbl_start: vp=%p off=%"PRId64" count=%zu blksize=%zu\n",
	    vp, off, count, blksize));

	if (log_dev_bshift > fs_dev_bshift) {
		WAPBL_PRINTF(WAPBL_PRINT_OPEN,
		    ("wapbl: log device's block size cannot be larger "
		     "than filesystem's\n"));
		/*
		 * Not currently implemented, although it could be if
		 * needed someday.
		 */
		return SET_ERROR(ENOSYS);
	}

	if (off < 0)
		return SET_ERROR(EINVAL);

	if (blksize < DEV_BSIZE)
		return SET_ERROR(EINVAL);
	if (blksize % DEV_BSIZE)
		return SET_ERROR(EINVAL);

	/* XXXTODO: verify that the full load is writable */

	/*
	 * XXX check for minimum log size
	 * minimum is governed by minimum amount of space
	 * to complete a transaction. (probably truncate)
	 */
	/* XXX for now pick something minimal */
	if ((count * blksize) < MAXPHYS) {
		return SET_ERROR(ENOSPC);
	}

	if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) {
		return error;
	}

	wl = wapbl_calloc(1, sizeof(*wl));
	rw_init(&wl->wl_rwlock);
	mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&wl->wl_reclaimable_cv, "wapblrec");
	TAILQ_INIT(&wl->wl_bufs);
	SIMPLEQ_INIT(&wl->wl_entries);

	wl->wl_logvp = vp;
	wl->wl_devvp = devvp;
	wl->wl_mount = mp;
	wl->wl_logpbn = logpbn;
	wl->wl_log_dev_bshift = log_dev_bshift;
	wl->wl_fs_dev_bshift = fs_dev_bshift;

	wl->wl_flush = flushfn;
	wl->wl_flush_abort = flushabortfn;

	/* Reserve two log device blocks for the commit headers */
	wl->wl_circ_off = 2<<wl->wl_log_dev_bshift;
	wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off);
	/* truncate the log usage to a multiple of log_dev_bshift */
	wl->wl_circ_size >>= wl->wl_log_dev_bshift;
	wl->wl_circ_size <<= wl->wl_log_dev_bshift;

	/*
	 * wl_bufbytes_max limits the size of the in-memory transaction space.
	 * - Since buffers are allocated and accounted for in units of
	 *   PAGE_SIZE it is required to be a multiple of PAGE_SIZE
	 *   (i.e. 1<<PAGE_SHIFT)
	 * - Since the log device has to be written in units of
	 *   1<<wl_log_dev_bshift it is required to be a multiple of
	 *   1<<wl_log_dev_bshift.
	 * - Since the filesystem will provide data in units of
	 *   1<<wl_fs_dev_bshift, it is convenient to be a multiple of
	 *   1<<wl_fs_dev_bshift.
	 * Therefore it must be a multiple of the least common multiple of
	 * those three quantities.  Fortunately, all of those quantities are
	 * guaranteed to be a power of two, and the least common multiple of
	 * a set of numbers which are all powers of two is simply the maximum
	 * of those numbers.  Finally, the maximum logarithm of a power of two
	 * is the same as the log of the maximum power of two.  So we can do
	 * the following operations to size wl_bufbytes_max:
	 */

	/* XXX fix actual number of pages reserved per filesystem. */
	wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2);

	/* Round wl_bufbytes_max to the largest power of two constraint */
	wl->wl_bufbytes_max >>= PAGE_SHIFT;
	wl->wl_bufbytes_max <<= PAGE_SHIFT;
	wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift;
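	/*
	 * e.g. (illustrative numbers only): with PAGE_SHIFT == 12 and
	 * both bshifts == 9, a starting value of 1000000 becomes
	 * 1000000 >> 12 << 12 == 999424, a multiple of 4096; the two
	 * 512-byte roundings are then no-ops, since any multiple of
	 * 4096 is already a multiple of 512.  Each shift pair rounds
	 * down to a multiple of one power-of-two constraint, so the
	 * result satisfies all three.
	 */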

	/* XXX maybe use filesystem fragment size instead of 1024 */
	/* XXX fix actual number of buffers reserved per filesystem. */
	wl->wl_bufcount_max = (buf_nbuf() / 2) * 1024;

	wl->wl_brperjblock = ((1<<wl->wl_log_dev_bshift)
	    - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
	    sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
	KASSERT(wl->wl_brperjblock > 0);

	/* XXX tie this into resource estimation */
	wl->wl_dealloclim = wl->wl_bufbytes_max / mp->mnt_stat.f_bsize / 2;
	TAILQ_INIT(&wl->wl_dealloclist);

	wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);

	wapbl_evcnt_init(wl);

	wapbl_dkcache_init(wl);

	/* Initialize the commit header */
	{
		struct wapbl_wc_header *wc;
		size_t len = 1 << wl->wl_log_dev_bshift;
		wc = wapbl_calloc(1, len);
		wc->wc_type = WAPBL_WC_HEADER;
		wc->wc_len = len;
		wc->wc_circ_off = wl->wl_circ_off;
		wc->wc_circ_size = wl->wl_circ_size;
		/* XXX wc->wc_fsid */
		wc->wc_log_dev_bshift = wl->wl_log_dev_bshift;
		wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift;
		wl->wl_wc_header = wc;
		wl->wl_wc_scratch = wapbl_alloc(len);
	}

	TAILQ_INIT(&wl->wl_iobufs);
	TAILQ_INIT(&wl->wl_iobufs_busy);
	for (int i = 0; i < wapbl_journal_iobufs; i++) {
		struct buf *bp;

		if ((bp = geteblk(MAXPHYS)) == NULL)
			goto errout;

		mutex_enter(&bufcache_lock);
		mutex_enter(devvp->v_interlock);
		bgetvp(devvp, bp);
		mutex_exit(devvp->v_interlock);
		mutex_exit(&bufcache_lock);

		bp->b_dev = devvp->v_rdev;

		TAILQ_INSERT_TAIL(&wl->wl_iobufs, bp, b_wapbllist);
	}

	/*
	 * if there was an existing set of unlinked but
	 * allocated inodes, preserve it in the new
	 * log.
	 */
	if (wr && wr->wr_inodescnt) {
		error = wapbl_start_flush_inodes(wl, wr);
		if (error)
			goto errout;
	}

	error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail);
	if (error) {
		goto errout;
	}

	*wlp = wl;
#if defined(WAPBL_DEBUG)
	wapbl_debug_wl = wl;
#endif

	return 0;
errout:
	wapbl_discard(wl);
	wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
	while (!TAILQ_EMPTY(&wl->wl_iobufs)) {
		struct buf *bp;

		bp = TAILQ_FIRST(&wl->wl_iobufs);
		TAILQ_REMOVE(&wl->wl_iobufs, bp, b_wapbllist);
		brelse(bp, BC_INVAL);
	}
	wapbl_inodetrk_free(wl);
	wapbl_free(wl, sizeof(*wl));

	return error;
}

/*
 * Like wapbl_flush, only discards the transaction
 * completely
 */

void
wapbl_discard(struct wapbl *wl)
{
	struct wapbl_entry *we;
	struct wapbl_dealloc *wd;
	struct buf *bp;
	int i;

	/*
	 * XXX we may consider using upgrade here
	 * if we want to call flush from inside a transaction
	 */
	rw_enter(&wl->wl_rwlock, RW_WRITER);
	wl->wl_flush(wl->wl_mount, TAILQ_FIRST(&wl->wl_dealloclist));

#ifdef WAPBL_DEBUG_PRINT
	{
		pid_t pid = -1;
		lwpid_t lid = -1;
		if (curproc)
			pid = curproc->p_pid;
		if (curlwp)
			lid = curlwp->l_lid;
#ifdef WAPBL_DEBUG_BUFBYTES
		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
		    ("wapbl_discard: thread %d.%d discarding "
		    "transaction\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %u, reclaimable=%zu reserved=%zu "
		    "unsynced=%zu\n",
		    pid, lid,
		    wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
		    wl->wl_dealloccnt, wl->wl_inohashcnt,
		    wl->wl_error_count, wl->wl_reclaimable_bytes,
		    wl->wl_reserved_bytes,
		    wl->wl_unsynced_bufbytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d, unsynced = %zu\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error, we->we_unsynced_bufbytes));
		}
#else /* !WAPBL_DEBUG_BUFBYTES */
		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
		    ("wapbl_discard: thread %d.%d discarding transaction\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %u, reclaimable=%zu reserved=%zu\n",
		    pid, lid,
		    wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
		    wl->wl_dealloccnt, wl->wl_inohashcnt,
		    wl->wl_error_count, wl->wl_reclaimable_bytes,
		    wl->wl_reserved_bytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error));
		}
#endif /* !WAPBL_DEBUG_BUFBYTES */
	}
#endif /* WAPBL_DEBUG_PRINT */

	for (i = 0; i <= wl->wl_inohashmask; i++) {
		struct wapbl_ino_head *wih;
		struct wapbl_ino *wi;

		wih = &wl->wl_inohash[i];
		while ((wi = LIST_FIRST(wih)) != NULL) {
			LIST_REMOVE(wi, wi_hash);
			pool_put(&wapbl_ino_pool, wi);
			KASSERT(wl->wl_inohashcnt > 0);
			wl->wl_inohashcnt--;
		}
	}

	/*
	 * clean buffer list
	 */
	mutex_enter(&bufcache_lock);
	mutex_enter(&wl->wl_mtx);
	while ((bp = TAILQ_FIRST(&wl->wl_bufs)) != NULL) {
		if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) {
			KASSERT(bp->b_flags & B_LOCKED);
			KASSERT(bp->b_oflags & BO_DELWRI);
			/*
			 * Buffer is already on BQ_LOCKED queue.
			 * The buffer will be unlocked and
			 * removed from the transaction in brelsel()
			 */
			mutex_exit(&wl->wl_mtx);
			bremfree(bp);
			brelsel(bp, BC_INVAL);
			mutex_enter(&wl->wl_mtx);
		}
	}

	/*
	 * Remove references to this wl from wl_entries; free any entries
	 * which no longer have buffers, and let wapbl_biodone() free the
	 * rest once their remaining buffers complete.
	 */
	while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) {
		SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
		/* XXX should we be accumulating wl_error_count
		 * and increasing reclaimable bytes ? */
		we->we_wapbl = NULL;
		if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
			KASSERT(we->we_unsynced_bufbytes == 0);
#endif
			pool_put(&wapbl_entry_pool, we);
		}
	}

	mutex_exit(&wl->wl_mtx);
	mutex_exit(&bufcache_lock);

	/* Discard list of deallocs */
	while ((wd = TAILQ_FIRST(&wl->wl_dealloclist)) != NULL)
		wapbl_deallocation_free(wl, wd, true);

	/* XXX should we clear wl_reserved_bytes? */

	KASSERT(wl->wl_bufbytes == 0);
	KASSERT(wl->wl_bcount == 0);
	KASSERT(wl->wl_bufcount == 0);
	KASSERT(TAILQ_EMPTY(&wl->wl_bufs));
	KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
	KASSERT(wl->wl_inohashcnt == 0);
	KASSERT(TAILQ_EMPTY(&wl->wl_dealloclist));
	KASSERT(wl->wl_dealloccnt == 0);

	rw_exit(&wl->wl_rwlock);
}

int
wapbl_stop(struct wapbl *wl, int force)
{
	int error;

	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n"));
	error = wapbl_flush(wl, 1);
	if (error) {
		if (force)
			wapbl_discard(wl);
		else
			return error;
	}

	/* Unlinked inodes persist after a flush */
	if (wl->wl_inohashcnt) {
		if (force) {
			wapbl_discard(wl);
		} else {
			return SET_ERROR(EBUSY);
		}
	}

	KASSERT(wl->wl_bufbytes == 0);
	KASSERT(wl->wl_bcount == 0);
	KASSERT(wl->wl_bufcount == 0);
	KASSERT(TAILQ_EMPTY(&wl->wl_bufs));
	KASSERT(wl->wl_dealloccnt == 0);
	KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
	KASSERT(wl->wl_inohashcnt == 0);
	KASSERT(TAILQ_EMPTY(&wl->wl_dealloclist));
	KASSERT(wl->wl_dealloccnt == 0);
	KASSERT(TAILQ_EMPTY(&wl->wl_iobufs_busy));

	wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
	while (!TAILQ_EMPTY(&wl->wl_iobufs)) {
		struct buf *bp;

		bp = TAILQ_FIRST(&wl->wl_iobufs);
		TAILQ_REMOVE(&wl->wl_iobufs, bp, b_wapbllist);
		brelse(bp, BC_INVAL);
	}
	wapbl_inodetrk_free(wl);

	wapbl_evcnt_free(wl);

	cv_destroy(&wl->wl_reclaimable_cv);
	mutex_destroy(&wl->wl_mtx);
	rw_destroy(&wl->wl_rwlock);
	wapbl_free(wl, sizeof(*wl));

	return 0;
}

/****************************************************************/
/*
 * Unbuffered disk I/O
 */

static void
wapbl_doio_accounting(struct vnode *devvp, int flags)
{
	struct pstats *pstats = curlwp->l_proc->p_stats;

	if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
		mutex_enter(devvp->v_interlock);
		devvp->v_numoutput++;
		mutex_exit(devvp->v_interlock);
		pstats->p_ru.ru_oublock++;
	} else {
		pstats->p_ru.ru_inblock++;
	}

}

static int
wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
{
	struct buf *bp;
	int error;

	KASSERT(devvp->v_type == VBLK);

	wapbl_doio_accounting(devvp, flags);

	bp = getiobuf(devvp, true);
	bp->b_flags = flags;
	bp->b_cflags |= BC_BUSY;	/* mandatory, asserted by biowait() */
	bp->b_dev = devvp->v_rdev;
	bp->b_data = data;
	bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
	bp->b_blkno = pbn;
	BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);

	WAPBL_PRINTF(WAPBL_PRINT_IO,
	    ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%"PRIx64"\n",
	    BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount,
	    bp->b_blkno, bp->b_dev));

	VOP_STRATEGY(devvp, bp);

	error = biowait(bp);
	putiobuf(bp);

	if (error) {
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_doio: %s %zu bytes at block %" PRId64
		    " on dev 0x%"PRIx64" failed with error %d\n",
		    (((flags & (B_WRITE | B_READ)) == B_WRITE) ?
		     "write" : "read"),
		    len, pbn, devvp->v_rdev, error));
	}

	return error;
}

/*
 * wapbl_write(data, len, devvp, pbn)
 *
 *	Synchronously write len bytes from data to physical block pbn
 *	on devvp.
 */
int
wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

	return wapbl_doio(data, len, devvp, pbn, B_WRITE);
}

/*
 * wapbl_read(data, len, devvp, pbn)
 *
 *	Synchronously read len bytes into data from physical block pbn
 *	on devvp.
 */
int
wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

	return wapbl_doio(data, len, devvp, pbn, B_READ);
}

/****************************************************************/
/*
 * Buffered disk writes -- try to coalesce writes and emit
 * MAXPHYS-aligned blocks.
 */

/*
 * wapbl_buffered_write_async(wl, bp)
 *
 *	Send buffer for asynchronous write.
 */
static void
wapbl_buffered_write_async(struct wapbl *wl, struct buf *bp)
{

	wapbl_doio_accounting(wl->wl_devvp, bp->b_flags);

	KASSERT(TAILQ_FIRST(&wl->wl_iobufs) == bp);
	TAILQ_REMOVE(&wl->wl_iobufs, bp, b_wapbllist);

	bp->b_flags |= B_WRITE;
	bp->b_cflags |= BC_BUSY;	/* mandatory, asserted by biowait() */
	bp->b_oflags = 0;
	bp->b_bcount = bp->b_resid;
	BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);

	VOP_STRATEGY(wl->wl_devvp, bp);

	wl->wl_ev_journalwrite.ev_count++;

	TAILQ_INSERT_TAIL(&wl->wl_iobufs_busy, bp, b_wapbllist);
}

/*
 * wapbl_buffered_flush(wl)
 *
 *	Flush any buffered writes from wapbl_buffered_write.
 */
static int
wapbl_buffered_flush(struct wapbl *wl, bool full)
{
	int error = 0;
	struct buf *bp, *bnext;
	bool only_done = true, found = false;

	/* if there is outstanding buffered write, send it now */
	if ((bp = TAILQ_FIRST(&wl->wl_iobufs)) && bp->b_resid > 0)
		wapbl_buffered_write_async(wl, bp);

	/* wait for I/O to complete */
again:
	TAILQ_FOREACH_SAFE(bp, &wl->wl_iobufs_busy, b_wapbllist, bnext) {
		if (!full && only_done) {
			/* skip unfinished */
			if (!ISSET(bp->b_oflags, BO_DONE))
				continue;
		}

		if (ISSET(bp->b_oflags, BO_DONE))
			wl->wl_ev_jbufs_bio_nowait.ev_count++;

		TAILQ_REMOVE(&wl->wl_iobufs_busy, bp, b_wapbllist);
		error = biowait(bp);

		/* reset for reuse */
		bp->b_blkno = bp->b_resid = bp->b_flags = 0;
		TAILQ_INSERT_TAIL(&wl->wl_iobufs, bp, b_wapbllist);
		found = true;

		if (!full)
			break;
	}

	if (!found && only_done && !TAILQ_EMPTY(&wl->wl_iobufs_busy)) {
		only_done = false;
		goto again;
	}

	return error;
}

/*
 * wapbl_buffered_write(data, len, wl, pbn)
 *
 *	Write len bytes from data to physical block pbn on
 *	wl->wl_devvp.  The write may not complete until
 *	wapbl_buffered_flush.
 */
static int
wapbl_buffered_write(void *data, size_t len, struct wapbl *wl, daddr_t pbn,
    int bflags)
{
	size_t resid;
	struct buf *bp;

again:
	bp = TAILQ_FIRST(&wl->wl_iobufs);

	if (bp == NULL) {
		/* No more buffers, wait for any previous I/O to finish. */
		wapbl_buffered_flush(wl, false);

		bp = TAILQ_FIRST(&wl->wl_iobufs);
		KASSERT(bp != NULL);
	}

	/*
	 * If not adjacent to the buffered data, flush first.  The disk
	 * block address is always valid for a non-empty buffer.
	 */
	if ((bp->b_resid > 0 && pbn != bp->b_blkno + btodb(bp->b_resid))) {
		wapbl_buffered_write_async(wl, bp);
		goto again;
	}

	/*
	 * If this write goes to an empty buffer we have to
	 * save the disk block address first.
	 */
	if (bp->b_blkno == 0) {
		bp->b_blkno = pbn;
		bp->b_flags |= bflags;
	}

	/*
	 * Remaining space so this buffer ends on a buffer size boundary.
	 *
	 * Cannot become less than or equal to zero, since then the
	 * buffer would already have been flushed by the previous call.
	 */
	resid = bp->b_bufsize - dbtob(bp->b_blkno % btodb(bp->b_bufsize)) -
	    bp->b_resid;
	KASSERT(resid > 0);
	KASSERT(dbtob(btodb(resid)) == resid);

	if (len < resid)
		resid = len;

	memcpy((uint8_t *)bp->b_data + bp->b_resid, data, resid);
	bp->b_resid += resid;

	if (len >= resid) {
		/* Just filled the buf, or data did not fit */
		wapbl_buffered_write_async(wl, bp);

		data = (uint8_t *)data + resid;
		len -= resid;
		pbn += btodb(resid);

		if (len > 0)
			goto again;
	}

	return 0;
}
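
/*
 * Coalescing example (illustrative numbers only): with MAXPHYS == 64k
 * and 512-byte disk blocks, three consecutive calls writing 8k each at
 * pbn 100, 116, and 132 accumulate in the same iobuf, since each pbn
 * equals b_blkno + btodb(b_resid) at the time of the call.  A fourth
 * call at a non-adjacent pbn, or one that would cross the buffer size
 * boundary, pushes the accumulated 24k out via
 * wapbl_buffered_write_async() and starts a fresh buffer.
 */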

/*
 * wapbl_circ_write(wl, data, len, offp)
 *
 *	Write len bytes from data to the circular queue of wl, starting
 *	at linear byte offset *offp, and returning the new linear byte
 *	offset in *offp.
 *
 *	If the starting linear byte offset precedes wl->wl_circ_off,
 *	the write instead begins at wl->wl_circ_off.  XXX WTF?  This
 *	should be a KASSERT, not a conditional.
 *
 *	The write is buffered in wl and must be flushed with
 *	wapbl_buffered_flush before it will be submitted to the disk.
 */
static int
wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp)
{
	size_t slen;
	off_t off = *offp;
	int error;
	daddr_t pbn;

	KDASSERT(((len >> wl->wl_log_dev_bshift) << wl->wl_log_dev_bshift) ==
	    len);

	if (off < wl->wl_circ_off)
		off = wl->wl_circ_off;
	slen = wl->wl_circ_off + wl->wl_circ_size - off;
	if (slen < len) {
		pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
#ifdef _KERNEL
		pbn = btodb(pbn << wl->wl_log_dev_bshift);
#endif
		error = wapbl_buffered_write(data, slen, wl, pbn,
		    WAPBL_JDATA_FLAGS(wl));
		if (error)
			return error;
		data = (uint8_t *)data + slen;
		len -= slen;
		off = wl->wl_circ_off;
	}
	pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
#ifdef _KERNEL
	pbn = btodb(pbn << wl->wl_log_dev_bshift);
#endif
	error = wapbl_buffered_write(data, len, wl, pbn,
	    WAPBL_JDATA_FLAGS(wl));
	if (error)
		return error;
	off += len;
	if (off >= wl->wl_circ_off + wl->wl_circ_size)
		off = wl->wl_circ_off;
	*offp = off;
	return 0;
}
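
/*
 * Wraparound example (illustrative numbers only): with
 * wl_circ_off == 1024 and wl_circ_size == 8192, the queue occupies
 * byte offsets [1024, 9216).  Writing len == 2048 at *offp == 8704
 * splits into slen == 512 bytes at offset 8704 followed by the
 * remaining 1536 bytes at offset 1024, leaving *offp == 2560.
 */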

/****************************************************************/
/*
 * WAPBL transactions: entering, adding/removing bufs, and exiting
 */

int
wapbl_begin(struct wapbl *wl, const char *file, int line)
{
	int doflush;
	unsigned lockcount;

	KDASSERT(wl);

	/*
	 * XXX this needs to be made much more sophisticated.
	 * perhaps each wapbl_begin could reserve a specified
	 * number of buffers and bytes.
	 */
	mutex_enter(&wl->wl_mtx);
	lockcount = wl->wl_lock_count;
	doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) >
	    wl->wl_bufbytes_max / 2) ||
	    ((wl->wl_bufcount + (lockcount * 10)) >
	    wl->wl_bufcount_max / 2) ||
	    (wapbl_transaction_len(wl) > wl->wl_circ_size / 2) ||
	    (wl->wl_dealloccnt >= (wl->wl_dealloclim / 2));
	mutex_exit(&wl->wl_mtx);

	if (doflush) {
		WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		    ("force flush lockcnt=%d bufbytes=%zu "
		    "(max=%zu) bufcount=%zu (max=%zu) "
		    "dealloccnt %d (lim=%d)\n",
		    lockcount, wl->wl_bufbytes,
		    wl->wl_bufbytes_max, wl->wl_bufcount,
		    wl->wl_bufcount_max,
		    wl->wl_dealloccnt, wl->wl_dealloclim));
	}

	if (doflush) {
		int error = wapbl_flush(wl, 0);
		if (error)
			return error;
	}

	rw_enter(&wl->wl_rwlock, RW_READER);
	mutex_enter(&wl->wl_mtx);
	wl->wl_lock_count++;
	mutex_exit(&wl->wl_mtx);

#if defined(WAPBL_DEBUG_PRINT)
	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
	    ("wapbl_begin thread %d.%d with bufcount=%zu "
	    "bufbytes=%zu bcount=%zu at %s:%d\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount, file, line));
#endif

	return 0;
}

void
wapbl_end(struct wapbl *wl)
{

#if defined(WAPBL_DEBUG_PRINT)
	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
	    ("wapbl_end thread %d.%d with bufcount=%zu "
	    "bufbytes=%zu bcount=%zu\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount));
#endif

	/*
	 * XXX this could be handled more gracefully, perhaps place
	 * only a partial transaction in the log and allow the
	 * remaining to flush without the protection of the journal.
	 */
	KASSERTMSG((wapbl_transaction_len(wl) <=
	    (wl->wl_circ_size - wl->wl_reserved_bytes)),
	    "wapbl_end: current transaction too big to flush");

	mutex_enter(&wl->wl_mtx);
	KASSERT(wl->wl_lock_count > 0);
	wl->wl_lock_count--;
	mutex_exit(&wl->wl_mtx);

	rw_exit(&wl->wl_rwlock);
}

void
wapbl_add_buf(struct wapbl *wl, struct buf * bp)
{

	KASSERT(bp->b_cflags & BC_BUSY);
	KASSERT(bp->b_vp);

	wapbl_jlock_assert(wl);

#if 0
	/*
	 * XXX this might be an issue for swapfiles.
	 * see uvm_swap.c:1702
	 *
	 * XXX2 why require it then? leap of semantics?
	 */
	KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
#endif

	mutex_enter(&wl->wl_mtx);
	if (bp->b_flags & B_LOCKED) {
		TAILQ_REMOVE(&wl->wl_bufs, bp, b_wapbllist);
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER2,
		    ("wapbl_add_buf thread %d.%d re-adding buf %p "
		    "with %d bytes %d bcount\n",
		    curproc->p_pid, curlwp->l_lid, bp,
		    bp->b_bufsize, bp->b_bcount));
	} else {
		/* unlocked by dirty buffers shouldn't exist */
		KASSERT(!(bp->b_oflags & BO_DELWRI));
		wl->wl_bufbytes += bp->b_bufsize;
		wl->wl_bcount += bp->b_bcount;
		wl->wl_bufcount++;
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
		    ("wapbl_add_buf thread %d.%d adding buf %p "
		    "with %d bytes %d bcount\n",
		    curproc->p_pid, curlwp->l_lid, bp,
		    bp->b_bufsize, bp->b_bcount));
	}
	TAILQ_INSERT_TAIL(&wl->wl_bufs, bp, b_wapbllist);
	mutex_exit(&wl->wl_mtx);

	bp->b_flags |= B_LOCKED;
}

static void
wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp)
{

	KASSERT(mutex_owned(&wl->wl_mtx));
	KASSERT(bp->b_cflags & BC_BUSY);
	wapbl_jlock_assert(wl);

#if 0
	/*
	 * XXX this might be an issue for swapfiles.
	 * see uvm_swap.c:1725
	 *
	 * XXXdeux: see above
	 */
	KASSERT((bp->b_flags & BC_NOCACHE) == 0);
#endif
	KASSERT(bp->b_flags & B_LOCKED);

	WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
	    ("wapbl_remove_buf thread %d.%d removing buf %p with "
	    "%d bytes %d bcount\n",
	    curproc->p_pid, curlwp->l_lid, bp,
	    bp->b_bufsize, bp->b_bcount));

	KASSERT(wl->wl_bufbytes >= bp->b_bufsize);
	wl->wl_bufbytes -= bp->b_bufsize;
	KASSERT(wl->wl_bcount >= bp->b_bcount);
	wl->wl_bcount -= bp->b_bcount;
	KASSERT(wl->wl_bufcount > 0);
	wl->wl_bufcount--;
	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
	TAILQ_REMOVE(&wl->wl_bufs, bp, b_wapbllist);

	bp->b_flags &= ~B_LOCKED;
}

/* called from brelsel() in vfs_bio among other places */
void
wapbl_remove_buf(struct wapbl * wl, struct buf *bp)
{

	mutex_enter(&wl->wl_mtx);
	wapbl_remove_buf_locked(wl, bp);
	mutex_exit(&wl->wl_mtx);
}

void
wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt)
{

	KASSERT(bp->b_cflags & BC_BUSY);

	/*
	 * XXX: why does this depend on B_LOCKED?  otherwise the buf
	 * is not for a transaction?  if so, why is this called in the
	 * first place?
	 */
	if (bp->b_flags & B_LOCKED) {
		mutex_enter(&wl->wl_mtx);
		wl->wl_bufbytes += bp->b_bufsize - oldsz;
		wl->wl_bcount += bp->b_bcount - oldcnt;
		mutex_exit(&wl->wl_mtx);
	}
}

#endif /* _KERNEL */

/****************************************************************/
/* Some utility inlines */

/*
 * wapbl_space_used(avail, head, tail)
 *
 *	Number of bytes used in a circular queue of avail total bytes,
 *	from tail to head.
 */
static inline size_t
wapbl_space_used(size_t avail, off_t head, off_t tail)
{

	if (tail == 0) {
		KASSERT(head == 0);
		return 0;
	}
	return ((head + (avail - 1) - tail) % avail) + 1;
}
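
/*
 * e.g. (illustrative numbers only): avail = 100, head = 30, tail = 10
 * gives ((30 + 99 - 10) % 100) + 1 == 20 bytes used; head == tail != 0
 * gives (99 % 100) + 1 == 100, i.e. full, matching the head/tail
 * conventions described in struct wapbl.
 */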

#ifdef _KERNEL
/*
 * wapbl_advance(size, off, oldoff, delta)
 *
 *	Given a byte offset oldoff into a circular queue of size bytes
 *	starting at off, return a new byte offset oldoff + delta into
 *	the circular queue.
 */
static inline off_t
wapbl_advance(size_t size, size_t off, off_t oldoff, size_t delta)
{
	off_t newoff;

	/* Define acceptable ranges for inputs. */
	KASSERT(delta <= (size_t)size);
	KASSERT(oldoff == 0 || (size_t)oldoff >= off);
	KASSERT(oldoff < (off_t)(size + off));

	if (oldoff == 0 && delta != 0)
		newoff = off + delta;
	else if (oldoff + delta < size + off)
		newoff = oldoff + delta;
	else
		newoff = (oldoff + delta) - size;

	/* Note some interesting axioms */
	KASSERT(delta != 0 || newoff == oldoff);
	KASSERT(delta == 0 || newoff != 0);
	KASSERT(delta != size || newoff == oldoff);

	/* Define acceptable ranges for output. */
	KASSERT(newoff == 0 || (size_t)newoff >= off);
	KASSERT((size_t)newoff < size + off);
	return newoff;
}
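
/*
 * e.g. (illustrative numbers only): size = 100, off = 20, so valid
 * offsets lie in [20, 120).  wapbl_advance(100, 20, 110, 30) wraps:
 * 110 + 30 == 140 >= 120, so the result is 140 - 100 == 40.  From the
 * empty state, wapbl_advance(100, 20, 0, 5) starts at off and yields
 * 25.
 */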

/*
 * wapbl_space_free(avail, head, tail)
 *
 *	Number of bytes free in a circular queue of avail total bytes,
 *	in which everything from tail to head is used.
 */
static inline size_t
wapbl_space_free(size_t avail, off_t head, off_t tail)
{

	return avail - wapbl_space_used(avail, head, tail);
}

/*
 * wapbl_advance_head(size, off, delta, headp, tailp)
 *
 *	In a circular queue of size bytes starting at off, given the
 *	old head and tail offsets *headp and *tailp, store the new head
 *	and tail offsets in *headp and *tailp resulting from adding
 *	delta bytes of data to the head.
 */
static inline void
wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp,
    off_t *tailp)
{
	off_t head = *headp;
	off_t tail = *tailp;

	KASSERT(delta <= wapbl_space_free(size, head, tail));
	head = wapbl_advance(size, off, head, delta);
	if (tail == 0 && head != 0)
		tail = off;
	*headp = head;
	*tailp = tail;
}

/*
 * wapbl_advance_tail(size, off, delta, headp, tailp)
 *
 *	In a circular queue of size bytes starting at off, given the
 *	old head and tail offsets *headp and *tailp, store the new head
 *	and tail offsets in *headp and *tailp resulting from removing
 *	delta bytes of data from the tail.
 */
static inline void
wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp,
    off_t *tailp)
{
	off_t head = *headp;
	off_t tail = *tailp;

	KASSERT(delta <= wapbl_space_used(size, head, tail));
	tail = wapbl_advance(size, off, tail, delta);
	if (head == tail) {
		head = tail = 0;
	}
	*headp = head;
	*tailp = tail;
}


/****************************************************************/

/*
 * wapbl_truncate(wl, minfree)
 *
 *	Wait until at least minfree bytes are available in the log.
 *
 *	If it was necessary to wait for writes to complete,
 *	advance the circular queue tail to reflect the new write
 *	completions and issue a write commit to the log.
 *
 *	=> Caller must hold wl->wl_rwlock writer lock.
 */
static int
wapbl_truncate(struct wapbl *wl, size_t minfree)
{
	size_t delta;
	size_t avail;
	off_t head;
	off_t tail;
	int error = 0;

	KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes));
	KASSERT(rw_write_held(&wl->wl_rwlock));

	mutex_enter(&wl->wl_mtx);

	/*
	 * First check to see if we have to do a commit
	 * at all.
	 */
	avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail);
	if (minfree < avail) {
		mutex_exit(&wl->wl_mtx);
		return 0;
	}
	minfree -= avail;
	while (wl->wl_error_count == 0 &&
	    wl->wl_reclaimable_bytes < minfree) {
		WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
		    ("wapbl_truncate: sleeping on %p"
		    " wl=%p bytes=%zd minfree=%zd\n",
		    &wl->wl_reclaimable_bytes,
		    wl, wl->wl_reclaimable_bytes, minfree));
		cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx);
	}
	if (wl->wl_reclaimable_bytes < minfree) {
		KASSERT(wl->wl_error_count);
		/* XXX maybe get actual error from buffer instead someday? */
		error = SET_ERROR(EIO);
	}
	head = wl->wl_head;
	tail = wl->wl_tail;
	delta = wl->wl_reclaimable_bytes;

	/* If all of the entries are flushed, then be sure to keep
	 * the reserved bytes reserved.  Watch out for discarded transactions,
	 * which could leave more bytes reserved than are reclaimable.
	 */
	if (SIMPLEQ_EMPTY(&wl->wl_entries) && delta >= wl->wl_reserved_bytes) {
		delta -= wl->wl_reserved_bytes;
	}
	wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head,
	    &tail);
	KDASSERT(wl->wl_reserved_bytes <=
	    wapbl_space_used(wl->wl_circ_size, head, tail));
	mutex_exit(&wl->wl_mtx);

	if (error)
		return error;

	/*
	 * This is where head, tail and delta are unprotected
	 * from races against itself or flush.  This is ok since
	 * we only call this routine from inside flush itself.
	 *
	 * XXX: how can it race against itself when accessed only
	 * from behind the write-locked rwlock?
	 */
	error = wapbl_write_commit(wl, head, tail);
	if (error)
		return error;

	wl->wl_head = head;
	wl->wl_tail = tail;

	mutex_enter(&wl->wl_mtx);
	KASSERT(wl->wl_reclaimable_bytes >= delta);
	wl->wl_reclaimable_bytes -= delta;
	mutex_exit(&wl->wl_mtx);
	WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
	    ("wapbl_truncate thread %d.%d truncating %zu bytes\n",
	    curproc->p_pid, curlwp->l_lid, delta));

	return 0;
}

/****************************************************************/

void
wapbl_biodone(struct buf *bp)
{
	struct wapbl_entry *we = bp->b_private;
	struct wapbl *wl;
#ifdef WAPBL_DEBUG_BUFBYTES
	const int bufsize = bp->b_bufsize;
#endif

	mutex_enter(&bufcache_lock);
	wl = we->we_wapbl;
	mutex_exit(&bufcache_lock);

	/*
	 * Handle possible flushing of buffers after the log has been
	 * decommissioned.
	 */
	if (!wl) {
		KASSERT(we->we_bufcount > 0);
		we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
		KASSERT(we->we_unsynced_bufbytes >= bufsize);
		we->we_unsynced_bufbytes -= bufsize;
#endif

		if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
			KASSERT(we->we_unsynced_bufbytes == 0);
#endif
			pool_put(&wapbl_entry_pool, we);
		}

		brelse(bp, 0);
		return;
	}

#ifdef ohbother
	KDASSERT(bp->b_oflags & BO_DONE);
	KDASSERT(!(bp->b_oflags & BO_DELWRI));
	KDASSERT(bp->b_flags & B_ASYNC);
	KDASSERT(bp->b_cflags & BC_BUSY);
	KDASSERT(!(bp->b_flags & B_LOCKED));
	KDASSERT(!(bp->b_flags & B_READ));
	KDASSERT(!(bp->b_cflags & BC_INVAL));
	KDASSERT(!(bp->b_cflags & BC_NOCACHE));
#endif

	if (bp->b_error) {
		/*
		 * If an error occurs, it would be nice to leave the buffer
		 * as a delayed write on the LRU queue so that we can retry
		 * it later.  But buffercache(9) can't handle dirty buffer
		 * reuse, so just mark the log permanently errored out.
		 */
		mutex_enter(&wl->wl_mtx);
		if (wl->wl_error_count == 0) {
			wl->wl_error_count++;
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
		mutex_exit(&wl->wl_mtx);
	}

	/*
	 * Make sure that the buf doesn't retain the media flags, so that
	 * e.g. wapbl_allow_dpofua has immediate effect on any following I/O.
	 * The flags will be set again if needed by another I/O.
	 */
	bp->b_flags &= ~B_MEDIA_FLAGS;

	/*
	 * Release the buffer here.  wapbl_flush() may wait for the
	 * log to become empty and we better unbusy the buffer before
	 * wapbl_flush() returns.
	 */
	brelse(bp, 0);

	mutex_enter(&wl->wl_mtx);

	KASSERT(we->we_bufcount > 0);
	we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
	KASSERT(we->we_unsynced_bufbytes >= bufsize);
	we->we_unsynced_bufbytes -= bufsize;
	KASSERT(wl->wl_unsynced_bufbytes >= bufsize);
	wl->wl_unsynced_bufbytes -= bufsize;
#endif
	wl->wl_ev_metawrite.ev_count++;

	/*
	 * If the current transaction can be reclaimed, start
	 * at the beginning and reclaim any consecutive reclaimable
	 * transactions.  If we successfully reclaim anything,
	 * then wakeup anyone waiting for the reclaim.
	 */
	if (we->we_bufcount == 0) {
		size_t delta = 0;
		int errcnt = 0;
#ifdef WAPBL_DEBUG_BUFBYTES
		KDASSERT(we->we_unsynced_bufbytes == 0);
#endif
		/*
		 * Clear any posted error, since the buffer it came from
		 * has been successfully flushed by now.
		 */
		while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) &&
		    we->we_bufcount == 0) {
			delta += we->we_reclaimable_bytes;
			if (we->we_error)
				errcnt++;
			SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
			pool_put(&wapbl_entry_pool, we);
		}

		if (delta) {
			wl->wl_reclaimable_bytes += delta;
			KASSERT(wl->wl_error_count >= errcnt);
			wl->wl_error_count -= errcnt;
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
	}

	mutex_exit(&wl->wl_mtx);
}

/*
 * wapbl_flush(wl, wait)
 *
 *	Flush pending block writes, deallocations, and inodes from
 *	the current transaction in memory to the log on disk:
 *
 *	1. Call the file system's wl_flush callback to flush any
 *	   per-file-system pending updates.
 *	2. Wait for enough space in the log for the current transaction.
 *	3. Synchronously write the new log records, advancing the
 *	   circular queue head.
 *	4. Issue the pending block writes asynchronously, now that they
 *	   are recorded in the log and can be replayed after crash.
 *	5. If wait is true, wait for all writes to complete and for the
 *	   log to become empty.
 *
 *	On failure, call the file system's wl_flush_abort callback.
 */
int
wapbl_flush(struct wapbl *wl, int waitfor)
{
	struct buf *bp;
	struct wapbl_entry *we;
	off_t off;
	off_t head;
	off_t tail;
	size_t delta = 0;
	size_t flushsize;
	size_t reserved;
	int error = 0;

	/*
	 * Do a quick check to see if a full flush can be skipped.
	 * This assumes that the flush callback does not need to be called
	 * unless there are other outstanding bufs.
	 */
	if (!waitfor) {
		size_t nbufs;
		mutex_enter(&wl->wl_mtx);	/* XXX need mutex here to
						   protect the KASSERTS */
		nbufs = wl->wl_bufcount;
		KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
		KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
		mutex_exit(&wl->wl_mtx);
		if (nbufs == 0)
			return 0;
	}

	/*
	 * XXX we may consider using LK_UPGRADE here
	 * if we want to call flush from inside a transaction
	 */
	rw_enter(&wl->wl_rwlock, RW_WRITER);
	wl->wl_flush(wl->wl_mount, TAILQ_FIRST(&wl->wl_dealloclist));

	/*
	 * Now that we are exclusively locked and the file system has
	 * issued any deferred block writes for this transaction, check
	 * whether there are any blocks to write to the log.  If not,
	 * skip waiting for space or writing any log entries.
	 *
	 * XXX Shouldn't this also check wl_dealloccnt and
	 * wl_inohashcnt?  Perhaps wl_dealloccnt doesn't matter if the
	 * file system didn't produce any blocks as a consequence of
	 * it, but the same does not seem to be so of wl_inohashcnt.
	 */
	if (wl->wl_bufcount == 0) {
		goto wait_out;
	}

#if 0
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush thread %d.%d flushing entries with "
	    "bufcount=%zu bufbytes=%zu\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes));
#endif

	/* Calculate amount of space needed to flush */
	flushsize = wapbl_transaction_len(wl);
	if (wapbl_verbose_commit) {
		struct timespec ts;
		getnanotime(&ts);
		printf("%s: %lld.%09ld this transaction = %zu bytes\n",
		    __func__, (long long)ts.tv_sec,
		    (long)ts.tv_nsec, flushsize);
	}

	if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
		/*
		 * XXX this could be handled more gracefully, perhaps place
		 * only a partial transaction in the log and allow the
		 * remaining to flush without the protection of the journal.
		 */
		panic("wapbl_flush: current transaction too big to flush");
	}

	error = wapbl_truncate(wl, flushsize);
	if (error)
		goto out;

	off = wl->wl_head;
	KASSERT(off == 0 || off >= wl->wl_circ_off);
	KASSERT(off == 0 || off < wl->wl_circ_off + wl->wl_circ_size);
	error = wapbl_write_blocks(wl, &off);
	if (error)
		goto out;
	error = wapbl_write_revocations(wl, &off);
	if (error)
		goto out;
	error = wapbl_write_inodes(wl, &off);
	if (error)
		goto out;

	reserved = 0;
	if (wl->wl_inohashcnt)
		reserved = wapbl_transaction_inodes_len(wl);

	head = wl->wl_head;
	tail = wl->wl_tail;

	wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize,
	    &head, &tail);

	KASSERTMSG(head == off,
	    "lost head! head=%"PRIdMAX" tail=%" PRIdMAX
	    " off=%"PRIdMAX" flush=%zu",
	    (intmax_t)head, (intmax_t)tail, (intmax_t)off,
	    flushsize);

	/* Opportunistically move the tail forward if we can */
	mutex_enter(&wl->wl_mtx);
	delta = wl->wl_reclaimable_bytes;
	mutex_exit(&wl->wl_mtx);
	wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta,
	    &head, &tail);

	error = wapbl_write_commit(wl, head, tail);
	if (error)
		goto out;

	we = pool_get(&wapbl_entry_pool, PR_WAITOK);

#ifdef WAPBL_DEBUG_BUFBYTES
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
	    " unsynced=%zu"
	    "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
	    "inodes=%d\n",
	    curproc->p_pid, curlwp->l_lid, flushsize, delta,
	    wapbl_space_used(wl->wl_circ_size, head, tail),
	    wl->wl_unsynced_bufbytes, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt,
	    wl->wl_inohashcnt));
#else
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
	    "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
	    "inodes=%d\n",
	    curproc->p_pid, curlwp->l_lid, flushsize, delta,
	    wapbl_space_used(wl->wl_circ_size, head, tail),
	    wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
	    wl->wl_dealloccnt, wl->wl_inohashcnt));
#endif


	mutex_enter(&bufcache_lock);
	mutex_enter(&wl->wl_mtx);

	wl->wl_reserved_bytes = reserved;
	wl->wl_head = head;
	wl->wl_tail = tail;
	KASSERT(wl->wl_reclaimable_bytes >= delta);
	wl->wl_reclaimable_bytes -= delta;
	KDASSERT(wl->wl_dealloccnt == 0);
#ifdef WAPBL_DEBUG_BUFBYTES
	wl->wl_unsynced_bufbytes += wl->wl_bufbytes;
#endif

	we->we_wapbl = wl;
	we->we_bufcount = wl->wl_bufcount;
#ifdef WAPBL_DEBUG_BUFBYTES
	we->we_unsynced_bufbytes = wl->wl_bufbytes;
#endif
	we->we_reclaimable_bytes = flushsize;
	we->we_error = 0;
	SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries);

1928 /*
	 * This flushes bufs in the order they were queued, so the LRU
1930 * order is preserved.
1931 */
1932 while ((bp = TAILQ_FIRST(&wl->wl_bufs)) != NULL) {
1933 if (bbusy(bp, 0, 0, &wl->wl_mtx)) {
1934 continue;
1935 }
1936 bp->b_iodone = wapbl_biodone;
1937 bp->b_private = we;
1938
1939 bremfree(bp);
1940 wapbl_remove_buf_locked(wl, bp);
1941 mutex_exit(&wl->wl_mtx);
1942 mutex_exit(&bufcache_lock);
1943 bawrite(bp);
1944 mutex_enter(&bufcache_lock);
1945 mutex_enter(&wl->wl_mtx);
1946 }
1947 mutex_exit(&wl->wl_mtx);
1948 mutex_exit(&bufcache_lock);
1949
1950 #if 0
1951 WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1952 ("wapbl_flush thread %d.%d done flushing entries...\n",
1953 curproc->p_pid, curlwp->l_lid));
1954 #endif
1955
1956 wait_out:
1957
1958 /*
1959 * If the waitfor flag is set, don't return until everything is
1960 * fully flushed and the on disk log is empty.
1961 */
1962 if (waitfor) {
1963 error = wapbl_truncate(wl, wl->wl_circ_size -
1964 wl->wl_reserved_bytes);
1965 }
1966
1967 out:
1968 if (error) {
1969 wl->wl_flush_abort(wl->wl_mount,
1970 TAILQ_FIRST(&wl->wl_dealloclist));
1971 }
1972
1973 #ifdef WAPBL_DEBUG_PRINT
1974 if (error) {
1975 pid_t pid = -1;
1976 lwpid_t lid = -1;
1977 if (curproc)
1978 pid = curproc->p_pid;
1979 if (curlwp)
1980 lid = curlwp->l_lid;
1981 mutex_enter(&wl->wl_mtx);
1982 #ifdef WAPBL_DEBUG_BUFBYTES
1983 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1984 ("wapbl_flush: thread %d.%d aborted flush: "
1985 "error = %d\n"
1986 "\tbufcount=%zu bufbytes=%zu bcount=%zu "
1987 "deallocs=%d inodes=%d\n"
1988 "\terrcnt = %d, reclaimable=%zu reserved=%zu "
1989 "unsynced=%zu\n",
1990 pid, lid, error, wl->wl_bufcount,
1991 wl->wl_bufbytes, wl->wl_bcount,
1992 wl->wl_dealloccnt, wl->wl_inohashcnt,
1993 wl->wl_error_count, wl->wl_reclaimable_bytes,
1994 wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes));
1995 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
1996 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1997 ("\tentry: bufcount = %zu, reclaimable = %zu, "
1998 "error = %d, unsynced = %zu\n",
1999 we->we_bufcount, we->we_reclaimable_bytes,
2000 we->we_error, we->we_unsynced_bufbytes));
2001 }
2002 #else
2003 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
2004 ("wapbl_flush: thread %d.%d aborted flush: "
2005 "error = %d\n"
2006 "\tbufcount=%zu bufbytes=%zu bcount=%zu "
2007 "deallocs=%d inodes=%d\n"
2008 "\terrcnt = %d, reclaimable=%zu reserved=%zu\n",
2009 pid, lid, error, wl->wl_bufcount,
2010 wl->wl_bufbytes, wl->wl_bcount,
2011 wl->wl_dealloccnt, wl->wl_inohashcnt,
2012 wl->wl_error_count, wl->wl_reclaimable_bytes,
2013 wl->wl_reserved_bytes));
2014 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
2015 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
2016 ("\tentry: bufcount = %zu, reclaimable = %zu, "
2017 "error = %d\n", we->we_bufcount,
2018 we->we_reclaimable_bytes, we->we_error));
2019 }
2020 #endif
2021 mutex_exit(&wl->wl_mtx);
2022 }
2023 #endif
2024
2025 rw_exit(&wl->wl_rwlock);
2026 return error;
2027 }
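
/*
 * Usage note (illustrative, not from the original source): with
 * waitfor == 0 the call returns immediately when nothing is
 * buffered, which keeps periodic sync cheap; with waitfor != 0 it
 * does not return until everything is flushed and the on-disk log
 * is empty, so a caller such as an unmount path would use
 * wapbl_flush(wl, 1) to drain the journal completely.
 */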
2028
2029 /****************************************************************/
2030
2031 void
2032 wapbl_jlock_assert(struct wapbl *wl)
2033 {
2034
2035 KASSERT(rw_lock_held(&wl->wl_rwlock));
2036 }
2037
2038 void
2039 wapbl_junlock_assert(struct wapbl *wl)
2040 {
2041
2042 KASSERT(!rw_write_held(&wl->wl_rwlock));
2043 }
2044
2045 /****************************************************************/
2046
2047 /* locks missing */
2048 void
2049 wapbl_print(struct wapbl *wl, int full, void (*pr)(const char *, ...))
2050 {
2051 struct buf *bp;
2052 struct wapbl_entry *we;
2053 (*pr)("wapbl %p", wl);
2054 (*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n",
2055 wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn);
2056 (*pr)("circ = %zu, header = %zu,"
2057 " head = %"PRIdMAX" tail = %"PRIdMAX"\n",
2058 wl->wl_circ_size, wl->wl_circ_off,
2059 (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail);
	(*pr)("log_dev_bshift = %d, fs_dev_bshift = %d\n",
	    wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift);
2062 #ifdef WAPBL_DEBUG_BUFBYTES
2063 (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
2064 "reserved = %zu errcnt = %d unsynced = %zu\n",
2065 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
2066 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
2067 wl->wl_error_count, wl->wl_unsynced_bufbytes);
2068 #else
2069 (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
2070 "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes,
2071 wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
2072 wl->wl_error_count);
2073 #endif
2074 (*pr)("\tdealloccnt = %d, dealloclim = %d\n",
2075 wl->wl_dealloccnt, wl->wl_dealloclim);
2076 (*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n",
2077 wl->wl_inohashcnt, wl->wl_inohashmask);
2078 (*pr)("entries:\n");
2079 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
2080 #ifdef WAPBL_DEBUG_BUFBYTES
2081 (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, "
2082 "unsynced = %zu\n",
2083 we->we_bufcount, we->we_reclaimable_bytes,
2084 we->we_error, we->we_unsynced_bufbytes);
2085 #else
2086 (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n",
2087 we->we_bufcount, we->we_reclaimable_bytes, we->we_error);
2088 #endif
2089 }
2090 if (full) {
2091 int cnt = 0;
2092 (*pr)("bufs =");
2093 TAILQ_FOREACH(bp, &wl->wl_bufs, b_wapbllist) {
2094 if (!TAILQ_NEXT(bp, b_wapbllist)) {
2095 (*pr)(" %p", bp);
2096 } else if ((++cnt % 6) == 0) {
2097 (*pr)(" %p,\n\t", bp);
2098 } else {
2099 (*pr)(" %p,", bp);
2100 }
2101 }
2102 (*pr)("\n");
2103
2104 (*pr)("dealloced blks = ");
2105 {
2106 struct wapbl_dealloc *wd;
2107 cnt = 0;
2108 TAILQ_FOREACH(wd, &wl->wl_dealloclist, wd_entries) {
2109 (*pr)(" %"PRId64":%d,",
2110 wd->wd_blkno,
2111 wd->wd_len);
2112 if ((++cnt % 4) == 0) {
2113 (*pr)("\n\t");
2114 }
2115 }
2116 }
2117 (*pr)("\n");
2118
2119 (*pr)("registered inodes = ");
2120 {
2121 int i;
2122 cnt = 0;
2123 for (i = 0; i <= wl->wl_inohashmask; i++) {
2124 struct wapbl_ino_head *wih;
2125 struct wapbl_ino *wi;
2126
2127 wih = &wl->wl_inohash[i];
2128 LIST_FOREACH(wi, wih, wi_hash) {
2129 if (wi->wi_ino == 0)
2130 continue;
2131 (*pr)(" %"PRIu64"/0%06"PRIo32",",
2132 wi->wi_ino, wi->wi_mode);
2133 if ((++cnt % 4) == 0) {
2134 (*pr)("\n\t");
2135 }
2136 }
2137 }
2138 (*pr)("\n");
2139 }
2140
		cnt = 0;
		(*pr)("iobufs free =");
2142 TAILQ_FOREACH(bp, &wl->wl_iobufs, b_wapbllist) {
2143 if (!TAILQ_NEXT(bp, b_wapbllist)) {
2144 (*pr)(" %p", bp);
2145 } else if ((++cnt % 6) == 0) {
2146 (*pr)(" %p,\n\t", bp);
2147 } else {
2148 (*pr)(" %p,", bp);
2149 }
2150 }
2151 (*pr)("\n");
2152
		cnt = 0;
		(*pr)("iobufs busy =");
2154 TAILQ_FOREACH(bp, &wl->wl_iobufs_busy, b_wapbllist) {
2155 if (!TAILQ_NEXT(bp, b_wapbllist)) {
2156 (*pr)(" %p", bp);
2157 } else if ((++cnt % 6) == 0) {
2158 (*pr)(" %p,\n\t", bp);
2159 } else {
2160 (*pr)(" %p,", bp);
2161 }
2162 }
2163 (*pr)("\n");
2164 }
2165 }
2166
2167 #if defined(WAPBL_DEBUG) || defined(DDB)
2168 void
2169 wapbl_dump(struct wapbl *wl)
2170 {
2171 #if defined(WAPBL_DEBUG)
2172 if (!wl)
2173 wl = wapbl_debug_wl;
2174 #endif
2175 if (!wl)
2176 return;
2177 wapbl_print(wl, 1, printf);
2178 }
2179 #endif
2180
2181 /****************************************************************/
2182
2183 int
2184 wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len, bool force,
2185 void **cookiep)
2186 {
2187 struct wapbl_dealloc *wd;
2188 int error = 0;
2189
2190 wapbl_jlock_assert(wl);
2191
2192 mutex_enter(&wl->wl_mtx);
2193
2194 if (__predict_false(wl->wl_dealloccnt >= wl->wl_dealloclim)) {
2195 if (!force) {
2196 error = SET_ERROR(EAGAIN);
2197 goto out;
2198 }
2199
2200 /*
2201 * Forced registration can only be used when:
2202 * 1) the caller can't cope with failure
		 * 2) the path can be triggered only a bounded, small number
		 *    of times per transaction
		 * If this is not fulfilled, and the path would be triggered
		 * many times, this could overflow the maximum transaction size
2207 * and panic later.
2208 */
2209 printf("%s: forced dealloc registration over limit:"
2210 " %d >= %d\n",
2211 wl->wl_mount->mnt_stat.f_mntonname,
2212 wl->wl_dealloccnt, wl->wl_dealloclim);
2213 }
2214
2215 wl->wl_dealloccnt++;
2216 mutex_exit(&wl->wl_mtx);
2217
2218 wd = pool_get(&wapbl_dealloc_pool, PR_WAITOK);
2219 wd->wd_blkno = blk;
2220 wd->wd_len = len;
2221
2222 mutex_enter(&wl->wl_mtx);
2223 TAILQ_INSERT_TAIL(&wl->wl_dealloclist, wd, wd_entries);
2224
2225 if (cookiep)
2226 *cookiep = wd;
2227
2228 out:
2229 mutex_exit(&wl->wl_mtx);
2230
2231 WAPBL_PRINTF(WAPBL_PRINT_ALLOC,
2232 ("wapbl_register_deallocation: blk=%"PRId64" len=%d error=%d\n",
2233 blk, len, error));
2234
2235 return error;
2236 }
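
/*
 * Example (an illustrative sketch, not a real caller): a file
 * system freeing a block range registers the deallocation so the
 * revocation lands in the journal; the cookie lets the caller undo
 * the registration if the surrounding operation aborts.  The names
 * "blkno" and "len" are hypothetical caller-supplied values.
 */
#if 0
	void *cookie;
	int error;

	/* Journal lock must be held, per wapbl_jlock_assert() above. */
	error = wapbl_register_deallocation(wl, blkno, len, false, &cookie);
	if (error == EAGAIN) {
		/*
		 * Revocation list is full: finish and flush the
		 * current transaction before retrying, or pass
		 * force == true on bounded paths that cannot fail.
		 */
	}
	/* If the surrounding operation is aborted later: */
	wapbl_unregister_deallocation(wl, cookie);
#endif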
2237
2238 static void
2239 wapbl_deallocation_free(struct wapbl *wl, struct wapbl_dealloc *wd,
2240 bool locked)
2241 {
2242
2243 KASSERT(!locked
2244 || rw_lock_held(&wl->wl_rwlock) || mutex_owned(&wl->wl_mtx));
2245
2246 if (!locked)
2247 mutex_enter(&wl->wl_mtx);
2248
2249 TAILQ_REMOVE(&wl->wl_dealloclist, wd, wd_entries);
2250 wl->wl_dealloccnt--;
2251
2252 if (!locked)
2253 mutex_exit(&wl->wl_mtx);
2254
2255 pool_put(&wapbl_dealloc_pool, wd);
2256 }
2257
2258 void
2259 wapbl_unregister_deallocation(struct wapbl *wl, void *cookie)
2260 {
2261
2262 KASSERT(cookie != NULL);
2263 wapbl_deallocation_free(wl, cookie, false);
2264 }
2265
2266 /****************************************************************/
2267
2268 static void
2269 wapbl_inodetrk_init(struct wapbl *wl, u_int size)
2270 {
2271
2272 wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask);
2273 if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) {
2274 pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0,
2275 "wapblinopl", &pool_allocator_nointr, IPL_NONE);
2276 }
2277 }
2278
2279 static void
2280 wapbl_inodetrk_free(struct wapbl *wl)
2281 {
2282
2283 /* XXX this KASSERT needs locking/mutex analysis */
2284 KASSERT(wl->wl_inohashcnt == 0);
2285 hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask);
2286 membar_release();
2287 if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) {
2288 membar_acquire();
2289 pool_destroy(&wapbl_ino_pool);
2290 }
2291 }
2292
2293 static struct wapbl_ino *
2294 wapbl_inodetrk_get(struct wapbl *wl, ino_t ino)
2295 {
2296 struct wapbl_ino_head *wih;
2297 struct wapbl_ino *wi;
2298
2299 KASSERT(mutex_owned(&wl->wl_mtx));
2300
2301 wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
2302 LIST_FOREACH(wi, wih, wi_hash) {
2303 if (ino == wi->wi_ino)
2304 return wi;
2305 }
	return NULL;
2307 }
2308
2309 void
2310 wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode)
2311 {
2312 struct wapbl_ino_head *wih;
2313 struct wapbl_ino *wi;
2314
2315 wi = pool_get(&wapbl_ino_pool, PR_WAITOK);
2316
2317 mutex_enter(&wl->wl_mtx);
2318 if (wapbl_inodetrk_get(wl, ino) == NULL) {
2319 wi->wi_ino = ino;
2320 wi->wi_mode = mode;
2321 wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
2322 LIST_INSERT_HEAD(wih, wi, wi_hash);
2323 wl->wl_inohashcnt++;
2324 WAPBL_PRINTF(WAPBL_PRINT_INODE,
2325 ("wapbl_register_inode: ino=%"PRId64"\n", ino));
2326 mutex_exit(&wl->wl_mtx);
2327 } else {
2328 mutex_exit(&wl->wl_mtx);
2329 pool_put(&wapbl_ino_pool, wi);
2330 }
2331 }
2332
2333 void
2334 wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode)
2335 {
2336 struct wapbl_ino *wi;
2337
2338 mutex_enter(&wl->wl_mtx);
2339 wi = wapbl_inodetrk_get(wl, ino);
2340 if (wi) {
2341 WAPBL_PRINTF(WAPBL_PRINT_INODE,
2342 ("wapbl_unregister_inode: ino=%"PRId64"\n", ino));
2343 KASSERT(wl->wl_inohashcnt > 0);
2344 wl->wl_inohashcnt--;
2345 LIST_REMOVE(wi, wi_hash);
2346 mutex_exit(&wl->wl_mtx);
2347
2348 pool_put(&wapbl_ino_pool, wi);
2349 } else {
2350 mutex_exit(&wl->wl_mtx);
2351 }
2352 }
2353
2354 /****************************************************************/
2355
2356 /*
2357 * wapbl_transaction_inodes_len(wl)
2358 *
2359 * Calculate the number of bytes required for inode registration
2360 * log records in wl.
2361 */
2362 static inline size_t
2363 wapbl_transaction_inodes_len(struct wapbl *wl)
2364 {
2365 int blocklen = 1<<wl->wl_log_dev_bshift;
2366 int iph;
2367
	/* Calculate number of inodes described in an inodelist header */
2369 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
2370 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
2371
2372 KASSERT(iph > 0);
2373
2374 return MAX(1, howmany(wl->wl_inohashcnt, iph)) * blocklen;
2375 }
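
/*
 * Worked example (hypothetical sizes): with a 512-byte log block
 * and, say, an 8-byte header before wc_inodes and 8-byte inode
 * records, iph = (512 - 8) / 8 = 63.  Registering 100 inodes then
 * needs howmany(100, 63) = 2 list blocks, i.e. 1024 bytes; with no
 * inodes registered, the MAX(1, ...) still reserves one block for
 * the (empty) inode list record.
 */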
2376
2377
2378 /*
2379 * wapbl_transaction_len(wl)
2380 *
2381 * Calculate number of bytes required for all log records in wl.
2382 */
2383 static size_t
2384 wapbl_transaction_len(struct wapbl *wl)
2385 {
2386 int blocklen = 1<<wl->wl_log_dev_bshift;
2387 size_t len;
2388
	/*
	 * Sum the buffer data bytes, the blocklist and revocation
	 * header blocks, and the inode list records.
	 */
2390 len = wl->wl_bcount;
2391 len += howmany(wl->wl_bufcount, wl->wl_brperjblock) * blocklen;
2392 len += howmany(wl->wl_dealloccnt, wl->wl_brperjblock) * blocklen;
2393 len += wapbl_transaction_inodes_len(wl);
2394
2395 return len;
2396 }
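
/*
 * Worked example (hypothetical numbers): with blocklen = 512 and
 * wl_brperjblock = 62, a transaction of wl_bufcount = 100 bufs
 * carrying wl_bcount = 819200 bytes of data, wl_dealloccnt = 10,
 * and no registered inodes comes to
 *
 *	819200 bytes of buffer data
 *	+ howmany(100, 62) * 512 = 1024 bytes of blocklist headers
 *	+ howmany(10, 62) * 512 = 512 bytes of revocation records
 *	+ 512 bytes for the (empty) inode list
 *
 * = 821248 bytes in total, which must fit within wl_circ_size
 * minus wl_reserved_bytes or wapbl_flush() panics.
 */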
2397
2398 /*
2399 * wapbl_cache_sync(wl, msg)
2400 *
2401 * Issue DIOCCACHESYNC to wl->wl_devvp.
2402 *
2403 * If sysctl(vfs.wapbl.verbose_commit) >= 2, print a message
2404 * including msg about the duration of the cache sync.
2405 */
2406 static int
2407 wapbl_cache_sync(struct wapbl *wl, const char *msg)
2408 {
2409 const bool verbose = wapbl_verbose_commit >= 2;
2410 struct bintime start_time;
2411 int force = 1;
2412 int error;
2413
2414 /* Skip full cache sync if disabled */
2415 if (!wapbl_flush_disk_cache) {
2416 return 0;
2417 }
2418 if (verbose) {
2419 bintime(&start_time);
2420 }
2421 error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force,
2422 FWRITE, FSCRED);
2423 if (error) {
2424 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
2425 ("wapbl_cache_sync: DIOCCACHESYNC on dev 0x%jx "
2426 "returned %d\n", (uintmax_t)wl->wl_devvp->v_rdev,
2427 error));
2428 }
2429 if (verbose) {
2430 struct bintime d;
2431 struct timespec ts;
2432
2433 bintime(&d);
2434 bintime_sub(&d, &start_time);
2435 bintime2timespec(&d, &ts);
2436 printf("wapbl_cache_sync: %s: dev 0x%jx %ju.%09lu\n",
2437 msg, (uintmax_t)wl->wl_devvp->v_rdev,
2438 (uintmax_t)ts.tv_sec, ts.tv_nsec);
2439 }
2440
2441 wl->wl_ev_cacheflush.ev_count++;
2442
2443 return error;
2444 }
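
/*
 * Tuning note (illustrative): with wapbl_verbose_commit >= 2 every
 * cache sync prints its duration as above; clearing the
 * wapbl_flush_disk_cache knob skips the sync entirely, which is
 * only safe when the device has no volatile write cache.
 */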
2445
2446 /*
2447 * wapbl_write_commit(wl, head, tail)
2448 *
2449 * Issue a disk cache sync to wait for all pending writes to the
2450 * log to complete, and then synchronously commit the current
2451 * circular queue head and tail to the log, in the next of two
2452 * locations for commit headers on disk.
2453 *
2454 * Increment the generation number. If the generation number
2455 * rolls over to zero, then a subsequent commit would appear to
2456 * have an older generation than this one -- in that case, issue a
2457 * duplicate commit to avoid this.
2458 *
2459 * => Caller must have exclusive access to wl, either by holding
2460 * wl->wl_rwlock for writer or by being wapbl_start before anyone
2461 * else has seen wl.
2462 */
2463 static int
2464 wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail)
2465 {
2466 struct wapbl_wc_header *wc = wl->wl_wc_header;
2467 struct timespec ts;
2468 int error;
2469 daddr_t pbn;
2470
2471 error = wapbl_buffered_flush(wl, true);
2472 if (error)
2473 return error;
2474 /*
2475 * Flush disk cache to ensure that blocks we've written are actually
2476 * written to the stable storage before the commit header.
2477 * This flushes to disk not only journal blocks, but also all
2478 * metadata blocks, written asynchronously since previous commit.
2479 *
	 * XXX Ideally we would calculate a checksum here; for now we
	 * do this cache sync instead.
2481 */
2482 wapbl_cache_sync(wl, "1");
2483
2484 wc->wc_head = head;
2485 wc->wc_tail = tail;
2486 wc->wc_checksum = 0;
2487 wc->wc_version = 1;
2488 getnanotime(&ts);
2489 wc->wc_time = ts.tv_sec;
2490 wc->wc_timensec = ts.tv_nsec;
2491
2492 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
	    ("wapbl_write_commit: head = %"PRIdMAX" tail = %"PRIdMAX"\n",
2494 (intmax_t)head, (intmax_t)tail));
2495
2496 /*
	 * Write the commit header.
	 *
	 * XXX if the generation number is about to roll over, first zero
	 * out the second commit header before trying to write both headers.
2501 */
2502
2503 pbn = wl->wl_logpbn + (wc->wc_generation % 2);
2504 #ifdef _KERNEL
2505 pbn = btodb(pbn << wc->wc_log_dev_bshift);
2506 #endif
2507 error = wapbl_buffered_write(wc, wc->wc_len, wl, pbn,
2508 WAPBL_JFLAGS(wl));
2509 if (error)
2510 return error;
2511 error = wapbl_buffered_flush(wl, true);
2512 if (error)
2513 return error;
2514
2515 /*
	 * Flush disk cache to ensure that the commit header is actually
	 * written before the metadata blocks.  The commit block is written
	 * using FUA when enabled; in that case this flush is not needed.
2519 */
2520 if (!WAPBL_USE_FUA(wl))
2521 wapbl_cache_sync(wl, "2");
2522
2523 /*
2524 * If the generation number was zero, write it out a second time.
	 * This handles initialization and generation number rollover.
2526 */
2527 if (wc->wc_generation++ == 0) {
2528 error = wapbl_write_commit(wl, head, tail);
2529 /*
		 * This panic could be removed if we did the zeroing
		 * mentioned above and were certain to roll back the
		 * generation number on failure.
2533 */
2534 if (error) {
2535 panic("wapbl_write_commit: error writing duplicate "
2536 "log header: %d", error);
2537 }
2538 }
2539
2540 wl->wl_ev_commit.ev_count++;
2541
2542 return 0;
2543 }
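
/*
 * Illustration of the two-slot commit scheme used above:
 * generation 0 is written to physical block wl_logpbn + 0,
 * generation 1 to wl_logpbn + 1, generation 2 to slot 0 again, and
 * so on.  Replay picks the header with the larger generation, so a
 * commit torn by a crash simply leaves the previous slot's older
 * but consistent head and tail in effect.
 */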
2544
2545 /*
2546 * wapbl_write_blocks(wl, offp)
2547 *
2548 * Write all pending physical blocks in the current transaction
2549 * from wapbl_add_buf to the log on disk, adding to the circular
2550 * queue head at byte offset *offp, and returning the new head's
2551 * byte offset in *offp.
2552 */
2553 static int
2554 wapbl_write_blocks(struct wapbl *wl, off_t *offp)
2555 {
2556 struct wapbl_wc_blocklist *wc =
2557 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
2558 int blocklen = 1<<wl->wl_log_dev_bshift;
2559 struct buf *bp;
2560 off_t off = *offp;
2561 int error;
2562 size_t padding;
2563
2564 KASSERT(rw_write_held(&wl->wl_rwlock));
2565
2566 bp = TAILQ_FIRST(&wl->wl_bufs);
2567
2568 while (bp) {
2569 int cnt;
2570 struct buf *obp = bp;
2571
2572 KASSERT(bp->b_flags & B_LOCKED);
2573
2574 wc->wc_type = WAPBL_WC_BLOCKS;
2575 wc->wc_len = blocklen;
2576 wc->wc_blkcount = 0;
2577 wc->wc_unused = 0;
2578 while (bp && wc->wc_blkcount < wl->wl_brperjblock) {
2579 /*
2580 * Make sure all the physical block numbers are up to
2581 * date. If this is not always true on a given
2582 * filesystem, then VOP_BMAP must be called. We
2583 * could call VOP_BMAP here, or else in the filesystem
2584 * specific flush callback, although neither of those
2585 * solutions allow us to take the vnode lock. If a
2586 * filesystem requires that we must take the vnode lock
2587 * to call VOP_BMAP, then we can probably do it in
2588 * bwrite when the vnode lock should already be held
2589 * by the invoking code.
2590 */
2591 KASSERT(bp->b_vp->v_type == VBLK ||
2592 bp->b_blkno != bp->b_lblkno);
2593 KASSERT(bp->b_blkno > 0);
2594
2595 wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno;
2596 wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount;
2597 wc->wc_len += bp->b_bcount;
2598 wc->wc_blkcount++;
2599 bp = TAILQ_NEXT(bp, b_wapbllist);
2600 }
2601 if (wc->wc_len % blocklen != 0) {
2602 padding = blocklen - wc->wc_len % blocklen;
2603 wc->wc_len += padding;
2604 } else {
2605 padding = 0;
2606 }
2607
2608 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2609 ("wapbl_write_blocks:"
2610 " len = %u (padding %zu) off = %"PRIdMAX"\n",
2611 wc->wc_len, padding, (intmax_t)off));
2612
2613 error = wapbl_circ_write(wl, wc, blocklen, &off);
2614 if (error)
2615 return error;
2616 bp = obp;
2617 cnt = 0;
2618 while (bp && cnt++ < wl->wl_brperjblock) {
2619 error = wapbl_circ_write(wl, bp->b_data,
2620 bp->b_bcount, &off);
2621 if (error)
2622 return error;
2623 bp = TAILQ_NEXT(bp, b_wapbllist);
2624 }
2625 if (padding) {
2626 void *zero;
2627
2628 zero = wapbl_alloc(padding);
2629 memset(zero, 0, padding);
2630 error = wapbl_circ_write(wl, zero, padding, &off);
2631 wapbl_free(zero, padding);
2632 if (error)
2633 return error;
2634 }
2635 }
2636 *offp = off;
2637 return 0;
2638 }
2639
2640 /*
2641 * wapbl_write_revocations(wl, offp)
2642 *
2643 * Write all pending deallocations in the current transaction from
2644 * wapbl_register_deallocation to the log on disk, adding to the
2645 * circular queue's head at byte offset *offp, and returning the
2646 * new head's byte offset in *offp.
2647 */
2648 static int
2649 wapbl_write_revocations(struct wapbl *wl, off_t *offp)
2650 {
2651 struct wapbl_wc_blocklist *wc =
2652 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
2653 struct wapbl_dealloc *wd, *lwd;
2654 int blocklen = 1<<wl->wl_log_dev_bshift;
2655 off_t off = *offp;
2656 int error;
2657
2658 KASSERT(rw_write_held(&wl->wl_rwlock));
2659
2660 if (wl->wl_dealloccnt == 0)
2661 return 0;
2662
2663 while ((wd = TAILQ_FIRST(&wl->wl_dealloclist)) != NULL) {
2664 wc->wc_type = WAPBL_WC_REVOCATIONS;
2665 wc->wc_len = blocklen;
2666 wc->wc_blkcount = 0;
2667 wc->wc_unused = 0;
2668 while (wd && wc->wc_blkcount < wl->wl_brperjblock) {
2669 wc->wc_blocks[wc->wc_blkcount].wc_daddr =
2670 wd->wd_blkno;
2671 wc->wc_blocks[wc->wc_blkcount].wc_dlen =
2672 wd->wd_len;
2673 wc->wc_blkcount++;
2674
2675 wd = TAILQ_NEXT(wd, wd_entries);
2676 }
2677 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2678 ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n",
2679 wc->wc_len, (intmax_t)off));
2680 error = wapbl_circ_write(wl, wc, blocklen, &off);
2681 if (error)
2682 return error;
2683
2684 /* free all successfully written deallocs */
2685 lwd = wd;
2686 while ((wd = TAILQ_FIRST(&wl->wl_dealloclist)) != NULL) {
2687 if (wd == lwd)
2688 break;
2689 wapbl_deallocation_free(wl, wd, true);
2690 }
2691 }
2692 *offp = off;
2693 return 0;
2694 }
2695
2696 /*
2697 * wapbl_write_inodes(wl, offp)
2698 *
2699 * Write all pending inode allocations in the current transaction
2700 * from wapbl_register_inode to the log on disk, adding to the
2701 * circular queue's head at byte offset *offp and returning the
2702 * new head's byte offset in *offp.
2703 */
2704 static int
2705 wapbl_write_inodes(struct wapbl *wl, off_t *offp)
2706 {
2707 struct wapbl_wc_inodelist *wc =
2708 (struct wapbl_wc_inodelist *)wl->wl_wc_scratch;
2709 int i;
2710 int blocklen = 1 << wl->wl_log_dev_bshift;
2711 off_t off = *offp;
2712 int error;
2713
2714 struct wapbl_ino_head *wih;
2715 struct wapbl_ino *wi;
2716 int iph;
2717
2718 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
2719 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
2720
2721 i = 0;
2722 wih = &wl->wl_inohash[0];
	wi = NULL;
2724 do {
2725 wc->wc_type = WAPBL_WC_INODES;
2726 wc->wc_len = blocklen;
2727 wc->wc_inocnt = 0;
2728 wc->wc_clear = (i == 0);
2729 while (i < wl->wl_inohashcnt && wc->wc_inocnt < iph) {
2730 while (!wi) {
2731 KASSERT((wih - &wl->wl_inohash[0])
2732 <= wl->wl_inohashmask);
2733 wi = LIST_FIRST(wih++);
2734 }
2735 wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino;
2736 wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode;
2737 wc->wc_inocnt++;
2738 i++;
2739 wi = LIST_NEXT(wi, wi_hash);
2740 }
2741 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2742 ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n",
2743 wc->wc_len, (intmax_t)off));
2744 error = wapbl_circ_write(wl, wc, blocklen, &off);
2745 if (error)
2746 return error;
2747 } while (i < wl->wl_inohashcnt);
2748
2749 *offp = off;
2750 return 0;
2751 }
2752
2753 #endif /* _KERNEL */
2754
2755 /****************************************************************/
2756
2757 struct wapbl_blk {
2758 LIST_ENTRY(wapbl_blk) wb_hash;
2759 daddr_t wb_blk;
2760 off_t wb_off; /* Offset of this block in the log */
2761 };
2762 #define WAPBL_BLKPOOL_MIN 83
2763
2764 static void
2765 wapbl_blkhash_init(struct wapbl_replay *wr, u_int size)
2766 {
2767
2768 if (size < WAPBL_BLKPOOL_MIN)
2769 size = WAPBL_BLKPOOL_MIN;
2770 KASSERT(wr->wr_blkhash == 0);
2771 #ifdef _KERNEL
2772 wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask);
2773 #else /* ! _KERNEL */
2774 /* Manually implement hashinit */
2775 {
2776 unsigned long i, hashsize;
2777
2778 for (hashsize = 1; hashsize < size; hashsize <<= 1)
2779 continue;
2780 wr->wr_blkhash = wapbl_alloc(hashsize *
2781 sizeof(*wr->wr_blkhash));
2782 for (i = 0; i < hashsize; i++)
2783 LIST_INIT(&wr->wr_blkhash[i]);
2784 wr->wr_blkhashmask = hashsize - 1;
2785 }
2786 #endif /* ! _KERNEL */
2787 }
2788
2789 static void
2790 wapbl_blkhash_free(struct wapbl_replay *wr)
2791 {
2792
2793 KASSERT(wr->wr_blkhashcnt == 0);
2794 #ifdef _KERNEL
2795 hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask);
2796 #else /* ! _KERNEL */
2797 wapbl_free(wr->wr_blkhash,
2798 (wr->wr_blkhashmask + 1) * sizeof(*wr->wr_blkhash));
2799 #endif /* ! _KERNEL */
2800 }
2801
2802 static struct wapbl_blk *
2803 wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk)
2804 {
2805 struct wapbl_blk_head *wbh;
2806 struct wapbl_blk *wb;
2807
2808 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2809 LIST_FOREACH(wb, wbh, wb_hash) {
2810 if (blk == wb->wb_blk)
2811 return wb;
2812 }
	return NULL;
2814 }
2815
2816 static void
2817 wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off)
2818 {
2819 struct wapbl_blk_head *wbh;
2820 struct wapbl_blk *wb;
2821
2822 wb = wapbl_blkhash_get(wr, blk);
2823 if (wb) {
2824 KASSERT(wb->wb_blk == blk);
2825 wb->wb_off = off;
2826 } else {
2827 wb = wapbl_alloc(sizeof(*wb));
2828 wb->wb_blk = blk;
2829 wb->wb_off = off;
2830 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2831 LIST_INSERT_HEAD(wbh, wb, wb_hash);
2832 wr->wr_blkhashcnt++;
2833 }
2834 }
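
/*
 * Note that re-inserting an existing block above merely updates
 * wb_off, so the hash always refers to the newest copy of a block
 * in the log; replay then writes each distinct block once, with
 * its most recent contents.
 */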
2835
2836 static void
2837 wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk)
2838 {
2839 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2840
2841 if (wb) {
2842 KASSERT(wr->wr_blkhashcnt > 0);
2843 wr->wr_blkhashcnt--;
2844 LIST_REMOVE(wb, wb_hash);
2845 wapbl_free(wb, sizeof(*wb));
2846 }
2847 }
2848
2849 static void
2850 wapbl_blkhash_clear(struct wapbl_replay *wr)
2851 {
2852 unsigned long i;
2853
2854 for (i = 0; i <= wr->wr_blkhashmask; i++) {
2855 struct wapbl_blk *wb;
2856
2857 while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) {
2858 KASSERT(wr->wr_blkhashcnt > 0);
2859 wr->wr_blkhashcnt--;
2860 LIST_REMOVE(wb, wb_hash);
2861 wapbl_free(wb, sizeof(*wb));
2862 }
2863 }
2864 KASSERT(wr->wr_blkhashcnt == 0);
2865 }
2866
2867 /****************************************************************/
2868
2869 /*
2870 * wapbl_circ_read(wr, data, len, offp)
2871 *
2872 * Read len bytes into data from the circular queue of wr,
2873 * starting at the linear byte offset *offp, and returning the new
2874 * linear byte offset in *offp.
2875 *
2876 * If the starting linear byte offset precedes wr->wr_circ_off,
2877 * the read instead begins at wr->wr_circ_off. XXX WTF? This
2878 * should be a KASSERT, not a conditional.
2879 */
2880 static int
2881 wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp)
2882 {
2883 size_t slen;
2884 off_t off = *offp;
2885 int error;
2886 daddr_t pbn;
2887
2888 KASSERT(((len >> wr->wr_log_dev_bshift) << wr->wr_log_dev_bshift) ==
2889 len);
2890
2891 if (off < wr->wr_circ_off)
2892 off = wr->wr_circ_off;
2893 slen = wr->wr_circ_off + wr->wr_circ_size - off;
2894 if (slen < len) {
2895 pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
2896 #ifdef _KERNEL
2897 pbn = btodb(pbn << wr->wr_log_dev_bshift);
2898 #endif
2899 error = wapbl_read(data, slen, wr->wr_devvp, pbn);
2900 if (error)
2901 return error;
2902 data = (uint8_t *)data + slen;
2903 len -= slen;
2904 off = wr->wr_circ_off;
2905 }
2906 pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
2907 #ifdef _KERNEL
2908 pbn = btodb(pbn << wr->wr_log_dev_bshift);
2909 #endif
2910 error = wapbl_read(data, len, wr->wr_devvp, pbn);
2911 if (error)
2912 return error;
2913 off += len;
2914 if (off >= wr->wr_circ_off + wr->wr_circ_size)
2915 off = wr->wr_circ_off;
2916 *offp = off;
2917 return 0;
2918 }
2919
2920 /*
2921 * wapbl_circ_advance(wr, len, offp)
2922 *
2923 * Compute the linear byte offset of the circular queue of wr that
2924 * is len bytes past *offp, and store it in *offp.
2925 *
2926 * This is as if wapbl_circ_read, but without actually reading
2927 * anything.
2928 *
2929 * If the starting linear byte offset precedes wr->wr_circ_off, it
2930 * is taken to be wr->wr_circ_off instead. XXX WTF? This should
2931 * be a KASSERT, not a conditional.
2932 */
2933 static void
2934 wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp)
2935 {
2936 size_t slen;
2937 off_t off = *offp;
2938
2939 KASSERT(((len >> wr->wr_log_dev_bshift) << wr->wr_log_dev_bshift) ==
2940 len);
2941
2942 if (off < wr->wr_circ_off)
2943 off = wr->wr_circ_off;
2944 slen = wr->wr_circ_off + wr->wr_circ_size - off;
2945 if (slen < len) {
2946 len -= slen;
2947 off = wr->wr_circ_off;
2948 }
2949 off += len;
2950 if (off >= wr->wr_circ_off + wr->wr_circ_size)
2951 off = wr->wr_circ_off;
2952 *offp = off;
2953 }
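
/*
 * Worked example (hypothetical geometry): with wr_circ_off = 1024
 * and wr_circ_size = 8192 the valid region is [1024, 9216).
 * Advancing len = 1024 from *offp = 8704 first consumes the
 * slen = 512 bytes up to the end of the region, wraps to 1024,
 * then advances the remaining 512 bytes, yielding *offp = 1536.
 */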
2954
2955 /****************************************************************/
2956
2957 int
2958 wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp,
2959 daddr_t off, size_t count, size_t blksize)
2960 {
2961 struct wapbl_replay *wr;
2962 int error;
2963 struct vnode *devvp;
2964 daddr_t logpbn;
2965 uint8_t *scratch;
2966 struct wapbl_wc_header *wch;
2967 struct wapbl_wc_header *wch2;
2968 /* Use this until we read the actual log header */
2969 int log_dev_bshift = ilog2(blksize);
2970 size_t used;
2971 daddr_t pbn;
2972
2973 WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
2974 ("wapbl_replay_start: vp=%p off=%"PRId64" count=%zu blksize=%zu\n",
2975 vp, off, count, blksize));
2976
2977 if (off < 0)
2978 return SET_ERROR(EINVAL);
2979
2980 if (blksize < DEV_BSIZE)
2981 return SET_ERROR(EINVAL);
2982 if (blksize % DEV_BSIZE)
2983 return SET_ERROR(EINVAL);
2984
2985 #ifdef _KERNEL
2986 #if 0
2987 /* XXX vp->v_size isn't reliably set for VBLK devices,
2988 * especially root. However, we might still want to verify
2989 * that the full load is readable */
2990 if ((off + count) * blksize > vp->v_size)
2991 return SET_ERROR(EINVAL);
2992 #endif
2993 if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) {
2994 return error;
2995 }
2996 #else /* ! _KERNEL */
2997 devvp = vp;
2998 logpbn = off;
2999 #endif /* ! _KERNEL */
3000
3001 scratch = wapbl_alloc(MAXBSIZE);
3002
3003 pbn = logpbn;
3004 #ifdef _KERNEL
3005 pbn = btodb(pbn << log_dev_bshift);
3006 #endif
3007 error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, pbn);
3008 if (error)
3009 goto errout;
3010
3011 wch = (struct wapbl_wc_header *)scratch;
3012 wch2 =
3013 (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift));
3014 /* XXX verify checksums and magic numbers */
3015 if (wch->wc_type != WAPBL_WC_HEADER) {
3016 printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type);
3017 error = SET_ERROR(EFTYPE);
3018 goto errout;
3019 }
3020
3021 if (wch2->wc_generation > wch->wc_generation)
3022 wch = wch2;
3023
3024 wr = wapbl_calloc(1, sizeof(*wr));
3025
3026 wr->wr_logvp = vp;
3027 wr->wr_devvp = devvp;
3028 wr->wr_logpbn = logpbn;
3029
3030 wr->wr_scratch = scratch;
3031
3032 wr->wr_log_dev_bshift = wch->wc_log_dev_bshift;
3033 wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift;
3034 wr->wr_circ_off = wch->wc_circ_off;
3035 wr->wr_circ_size = wch->wc_circ_size;
3036 wr->wr_generation = wch->wc_generation;
3037
3038 used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail);
3039
3040 WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
3041 ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64
3042 " len=%"PRId64" used=%zu\n",
3043 wch->wc_head, wch->wc_tail, wch->wc_circ_off,
3044 wch->wc_circ_size, used));
3045
3046 wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift));
3047
3048 error = wapbl_replay_process(wr, wch->wc_head, wch->wc_tail);
3049 if (error) {
3050 wapbl_replay_stop(wr);
3051 wapbl_replay_free(wr);
3052 return error;
3053 }
3054
3055 *wrp = wr;
3056 return 0;
3057
3058 errout:
3059 wapbl_free(scratch, MAXBSIZE);
3060 return error;
3061 }
3062
3063 void
3064 wapbl_replay_stop(struct wapbl_replay *wr)
3065 {
3066
3067 if (!wapbl_replay_isopen(wr))
3068 return;
3069
3070 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n"));
3071
3072 wapbl_free(wr->wr_scratch, MAXBSIZE);
3073 wr->wr_scratch = NULL;
3074
3075 wr->wr_logvp = NULL;
3076
3077 wapbl_blkhash_clear(wr);
3078 wapbl_blkhash_free(wr);
3079 }
3080
3081 void
3082 wapbl_replay_free(struct wapbl_replay *wr)
3083 {
3084
3085 KDASSERT(!wapbl_replay_isopen(wr));
3086
3087 if (wr->wr_inodes) {
3088 wapbl_free(wr->wr_inodes,
3089 wr->wr_inodescnt * sizeof(wr->wr_inodes[0]));
3090 }
3091 wapbl_free(wr, sizeof(*wr));
3092 }
3093
3094 #ifdef _KERNEL
3095 int
3096 wapbl_replay_isopen1(struct wapbl_replay *wr)
3097 {
3098
3099 return wapbl_replay_isopen(wr);
3100 }
3101 #endif
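
/*
 * Typical replay sequence (a minimal sketch; a real mount path has
 * more error handling and conditions).  The names "logvp",
 * "logstart", "logcount", "blksize" and "fsdevvp" are hypothetical
 * caller-supplied values.
 */
#if 0
	struct wapbl_replay *wr;
	int error;

	error = wapbl_replay_start(&wr, logvp, logstart, logcount, blksize);
	if (error)
		return error;
	error = wapbl_replay_write(wr, fsdevvp);  /* roll the log forward */
	wapbl_replay_stop(wr);                    /* release scratch and hash */
	wapbl_replay_free(wr);
	return error;
#endif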
3102
3103 /*
 * Calculate the disk address for the i'th block in the wc_blocklist,
 * offset by j blocks of size blen.
3106 *
3107 * wc_daddr is always a kernel disk address in DEV_BSIZE units that
3108 * was written to the journal.
3109 *
3110 * The kernel needs that address plus the offset in DEV_BSIZE units.
3111 *
3112 * Userland needs that address plus the offset in blen units.
3113 *
3114 */
3115 static daddr_t
3116 wapbl_block_daddr(struct wapbl_wc_blocklist *wc, int i, int j, int blen)
3117 {
3118 daddr_t pbn;
3119
3120 #ifdef _KERNEL
3121 pbn = wc->wc_blocks[i].wc_daddr + btodb(j * blen);
3122 #else
3123 pbn = dbtob(wc->wc_blocks[i].wc_daddr) / blen + j;
3124 #endif
3125
3126 return pbn;
3127 }
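
/*
 * Worked example (hypothetical values): for blen = 2048 and j = 3,
 * the kernel adds btodb(3 * 2048) = 12 DEV_BSIZE sectors to the
 * journalled wc_daddr, while userland converts wc_daddr to bytes,
 * divides by blen, and then adds the 3 whole blocks.
 */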
3128
3129 static void
3130 wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp)
3131 {
3132 struct wapbl_wc_blocklist *wc =
3133 (struct wapbl_wc_blocklist *)wr->wr_scratch;
3134 int fsblklen = 1 << wr->wr_fs_dev_bshift;
3135 int i, j, n;
3136
3137 for (i = 0; i < wc->wc_blkcount; i++) {
3138 /*
3139 * Enter each physical block into the hashtable independently.
3140 */
3141 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
3142 for (j = 0; j < n; j++) {
3143 wapbl_blkhash_ins(wr,
3144 wapbl_block_daddr(wc, i, j, fsblklen),
3145 *offp);
3146 wapbl_circ_advance(wr, fsblklen, offp);
3147 }
3148 }
3149 }
3150
3151 static void
3152 wapbl_replay_process_revocations(struct wapbl_replay *wr)
3153 {
3154 struct wapbl_wc_blocklist *wc =
3155 (struct wapbl_wc_blocklist *)wr->wr_scratch;
3156 int fsblklen = 1 << wr->wr_fs_dev_bshift;
3157 int i, j, n;
3158
3159 for (i = 0; i < wc->wc_blkcount; i++) {
3160 /*
3161 * Remove any blocks found from the hashtable.
3162 */
3163 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
3164 for (j = 0; j < n; j++) {
3165 wapbl_blkhash_rem(wr, wapbl_block_daddr(wc, i, j,
3166 fsblklen));
3167 }
3168 }
3169 }
3170
3171 static void
3172 wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff,
3173 off_t newoff)
3174 {
3175 struct wapbl_wc_inodelist *wc =
3176 (struct wapbl_wc_inodelist *)wr->wr_scratch;
3177 void *new_inodes;
3178 const size_t oldsize = wr->wr_inodescnt * sizeof(wr->wr_inodes[0]);
3179
3180 KASSERT(sizeof(wr->wr_inodes[0]) == sizeof(wc->wc_inodes[0]));
3181
3182 /*
	 * Keep track of where we found this so the location won't be
	 * overwritten.
3185 */
3186 if (wc->wc_clear) {
3187 wr->wr_inodestail = oldoff;
3188 wr->wr_inodescnt = 0;
3189 if (wr->wr_inodes != NULL) {
3190 wapbl_free(wr->wr_inodes, oldsize);
3191 wr->wr_inodes = NULL;
3192 }
3193 }
3194 wr->wr_inodeshead = newoff;
3195 if (wc->wc_inocnt == 0)
3196 return;
3197
3198 new_inodes = wapbl_alloc((wr->wr_inodescnt + wc->wc_inocnt) *
3199 sizeof(wr->wr_inodes[0]));
3200 if (wr->wr_inodes != NULL) {
3201 memcpy(new_inodes, wr->wr_inodes, oldsize);
3202 wapbl_free(wr->wr_inodes, oldsize);
3203 }
3204 wr->wr_inodes = new_inodes;
3205 memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes,
3206 wc->wc_inocnt * sizeof(wr->wr_inodes[0]));
3207 wr->wr_inodescnt += wc->wc_inocnt;
3208 }
3209
3210 static int
3211 wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail)
3212 {
3213 off_t off;
3214 int error;
3215
3216 int logblklen = 1 << wr->wr_log_dev_bshift;
3217
3218 wapbl_blkhash_clear(wr);
3219
3220 off = tail;
3221 while (off != head) {
3222 struct wapbl_wc_null *wcn;
3223 off_t saveoff = off;
3224 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
3225 if (error)
3226 goto errout;
3227 wcn = (struct wapbl_wc_null *)wr->wr_scratch;
3228 switch (wcn->wc_type) {
3229 case WAPBL_WC_BLOCKS:
3230 wapbl_replay_process_blocks(wr, &off);
3231 break;
3232
3233 case WAPBL_WC_REVOCATIONS:
3234 wapbl_replay_process_revocations(wr);
3235 break;
3236
3237 case WAPBL_WC_INODES:
3238 wapbl_replay_process_inodes(wr, saveoff, off);
3239 break;
3240
3241 default:
3242 printf("Unrecognized wapbl type: 0x%08x\n",
3243 wcn->wc_type);
3244 error = SET_ERROR(EFTYPE);
3245 goto errout;
3246 }
3247 wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
3248 if (off != saveoff) {
3249 printf("wapbl_replay: corrupted records\n");
3250 error = SET_ERROR(EFTYPE);
3251 goto errout;
3252 }
3253 }
3254 return 0;
3255
3256 errout:
3257 wapbl_blkhash_clear(wr);
3258 return error;
3259 }
3260
3261 #if 0
3262 int
3263 wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp)
3264 {
3265 off_t off;
3266 int mismatchcnt = 0;
3267 int logblklen = 1 << wr->wr_log_dev_bshift;
3268 int fsblklen = 1 << wr->wr_fs_dev_bshift;
3269 void *scratch1 = wapbl_alloc(MAXBSIZE);
3270 void *scratch2 = wapbl_alloc(MAXBSIZE);
3271 int error = 0;
3272
3273 KDASSERT(wapbl_replay_isopen(wr));
3274
3275 off = wch->wc_tail;
3276 while (off != wch->wc_head) {
3277 struct wapbl_wc_null *wcn;
3278 #ifdef DEBUG
3279 off_t saveoff = off;
3280 #endif
3281 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
3282 if (error)
3283 goto out;
3284 wcn = (struct wapbl_wc_null *)wr->wr_scratch;
3285 switch (wcn->wc_type) {
3286 case WAPBL_WC_BLOCKS: {
3287 struct wapbl_wc_blocklist *wc =
3288 (struct wapbl_wc_blocklist *)wr->wr_scratch;
3289 int i;
3290 for (i = 0; i < wc->wc_blkcount; i++) {
3291 int foundcnt = 0;
3292 int dirtycnt = 0;
3293 int j, n;
				/*
				 * Check each physical block against the
				 * hashtable independently.
				 */
3298 n = wc->wc_blocks[i].wc_dlen >>
3299 wch->wc_fs_dev_bshift;
3300 for (j = 0; j < n; j++) {
3301 struct wapbl_blk *wb =
3302 wapbl_blkhash_get(wr,
3303 wapbl_block_daddr(wc, i, j,
3304 fsblklen));
3305 if (wb && wb->wb_off == off) {
3306 foundcnt++;
3307 error =
3308 wapbl_circ_read(wr,
3309 scratch1, fsblklen,
3310 &off);
3311 if (error)
3312 goto out;
3313 error =
3314 wapbl_read(scratch2,
3315 fsblklen, fsdevvp,
3316 wb->wb_blk);
3317 if (error)
3318 goto out;
3319 if (memcmp(scratch1,
3320 scratch2,
3321 fsblklen)) {
3322 printf("wapbl_verify:"
3323 " mismatch block"
3324 " %"PRId64
3325 " at off"
3326 " %"PRIdMAX"\n",
3327 wb->wb_blk,
3328 (intmax_t)off);
3329 dirtycnt++;
3330 mismatchcnt++;
3331 }
3332 } else {
3333 wapbl_circ_advance(wr,
3334 fsblklen, &off);
3335 }
3336 }
3337 #if 0
3338 /*
3339 * If all of the blocks in an entry
3340 * are clean, then remove all of its
3341 * blocks from the hashtable since they
3342 * never will need replay.
3343 */
3344 if (foundcnt != 0 && dirtycnt == 0) {
3345 off = saveoff;
3346 wapbl_circ_advance(wr, logblklen,
3347 &off);
3348 for (j = 0; j < n; j++) {
3349 struct wapbl_blk *wb =
3350 wapbl_blkhash_get(wr,
3351 wapbl_block_daddr(wc,
3352 i, j, fsblklen));
3353 if (wb &&
3354 (wb->wb_off == off)) {
3355 wapbl_blkhash_rem(wr,
3356 wb->wb_blk);
3357 }
3358 wapbl_circ_advance(wr,
3359 fsblklen, &off);
3360 }
3361 }
3362 #endif
3363 }
3364 }
3365 break;
3366 case WAPBL_WC_REVOCATIONS:
3367 case WAPBL_WC_INODES:
3368 break;
3369 default:
3370 KASSERT(0);
3371 }
3372 #ifdef DEBUG
3373 wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
3374 KASSERT(off == saveoff);
3375 #endif
3376 }
3377 out:
3378 wapbl_free(scratch1, MAXBSIZE);
3379 wapbl_free(scratch2, MAXBSIZE);
3380 if (!error && mismatchcnt)
3381 error = SET_ERROR(EFTYPE);
3382 return error;
3383 }
3384 #endif
3385
3386 int
3387 wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp)
3388 {
3389 struct wapbl_blk *wb;
3390 size_t i;
3391 off_t off;
3392 void *scratch;
3393 int error = 0;
3394 int fsblklen = 1 << wr->wr_fs_dev_bshift;
3395
3396 KDASSERT(wapbl_replay_isopen(wr));
3397
3398 scratch = wapbl_alloc(MAXBSIZE);
3399
3400 for (i = 0; i <= wr->wr_blkhashmask; ++i) {
3401 LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) {
3402 off = wb->wb_off;
3403 error = wapbl_circ_read(wr, scratch, fsblklen, &off);
3404 if (error)
3405 break;
3406 error = wapbl_write(scratch, fsblklen, fsdevvp,
3407 wb->wb_blk);
3408 if (error)
3409 break;
3410 }
3411 }
3412
3413 wapbl_free(scratch, MAXBSIZE);
3414 return error;
3415 }
3416
3417 int
3418 wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len)
3419 {
3420 int fsblklen = 1 << wr->wr_fs_dev_bshift;
3421
3422 KDASSERT(wapbl_replay_isopen(wr));
3423 KASSERT((len % fsblklen) == 0);
3424
	while (len != 0) {
		struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
		if (wb)
			return 1;
		len -= fsblklen;
		blk++;
	}
3431 return 0;
3432 }
3433
3434 int
3435 wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len)
3436 {
3437 int fsblklen = 1 << wr->wr_fs_dev_bshift;
3438
3439 KDASSERT(wapbl_replay_isopen(wr));
3440
3441 KASSERT((len % fsblklen) == 0);
3442
3443 while (len != 0) {
3444 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
3445 if (wb) {
3446 off_t off = wb->wb_off;
3447 int error;
3448 error = wapbl_circ_read(wr, data, fsblklen, &off);
3449 if (error)
3450 return error;
3451 }
3452 data = (uint8_t *)data + fsblklen;
3453 len -= fsblklen;
3454 blk++;
3455 }
3456 return 0;
3457 }
3458
3459 #ifdef _KERNEL
3460
3461 MODULE(MODULE_CLASS_VFS, wapbl, NULL);
3462
3463 static int
3464 wapbl_modcmd(modcmd_t cmd, void *arg)
3465 {
3466
3467 switch (cmd) {
3468 case MODULE_CMD_INIT:
3469 wapbl_init();
3470 return 0;
3471 case MODULE_CMD_FINI:
3472 return wapbl_fini();
3473 default:
3474 return SET_ERROR(ENOTTY);
3475 }
3476 }
3477
3478 #endif /* _KERNEL */
3479