/*	$NetBSD: vfs_wapbl.c,v 1.97 2017/06/08 01:23:01 chs Exp $	*/

/*-
 * Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * This implements file system independent write-ahead logging.
 */

#define WAPBL_INTERNAL

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.97 2017/06/08 01:23:01 chs Exp $");

#include <sys/param.h>
#include <sys/bitops.h>
#include <sys/time.h>
#include <sys/wapbl.h>
#include <sys/wapbl_replay.h>

#ifdef _KERNEL

#include <sys/atomic.h>
#include <sys/conf.h>
#include <sys/evcnt.h>
#include <sys/file.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/vnode.h>

#include <miscfs/specfs/specdev.h>

#define	wapbl_alloc(s)		kmem_alloc((s), KM_SLEEP)
#define	wapbl_free(a, s)	kmem_free((a), (s))
#define	wapbl_calloc(n, s)	kmem_zalloc((n)*(s), KM_SLEEP)

static struct sysctllog *wapbl_sysctl;
static int wapbl_flush_disk_cache = 1;
static int wapbl_verbose_commit = 0;
static int wapbl_allow_dpofua = 0;	/* switched off by default for now */
static int wapbl_journal_iobufs = 4;

static inline size_t wapbl_space_free(size_t, off_t, off_t);

#else /* !_KERNEL */

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define	KDASSERT(x)		assert(x)
#define	KASSERT(x)		assert(x)
#define	wapbl_alloc(s)		malloc(s)
#define	wapbl_free(a, s)	free(a)
#define	wapbl_calloc(n, s)	calloc((n), (s))

#endif /* !_KERNEL */

/*
 * INTERNAL DATA STRUCTURES
 */

/*
 * This structure holds per-mount log information.
 *
 * Legend:	a = atomic access only
 *		r = read-only after init
 *		l = rwlock held
 *		m = mutex held
 *		lm = rwlock held writing or mutex held
 *		u = unlocked access ok
 *		b = bufcache_lock held
 */
LIST_HEAD(wapbl_ino_head, wapbl_ino);
struct wapbl {
	struct vnode *wl_logvp;	/* r:	log here */
	struct vnode *wl_devvp;	/* r:	log on this device */
	struct mount *wl_mount;	/* r:	mountpoint wl is associated with */
	daddr_t wl_logpbn;	/* r:	Physical block number of start of log */
	int wl_log_dev_bshift;	/* r:	logarithm of device block size of log
					device */
	int wl_fs_dev_bshift;	/* r:	logarithm of device block size of
					filesystem device */

	unsigned wl_lock_count;	/* m:	Count of transactions in progress */

	size_t wl_circ_size;	/* r:	Number of bytes in buffer of log */
	size_t wl_circ_off;	/* r:	Number of bytes reserved at start */

	size_t wl_bufcount_max;	/* r:	Number of buffers reserved for log */
	size_t wl_bufbytes_max;	/* r:	Number of buf bytes reserved for log */

	off_t wl_head;		/* l:	Byte offset of log head */
	off_t wl_tail;		/* l:	Byte offset of log tail */
	/*
	 * WAPBL log layout, stored on wl_devvp at wl_logpbn:
	 *
	 *  ___________________ wl_circ_size __________________
	 * /                                                   \
	 * +---------+---------+-------+--------------+--------+
	 * [ commit0 | commit1 | CCWCW | EEEEEEEEEEEE | CCCWCW ]
	 * +---------+---------+-------+--------------+--------+
	 *       wl_circ_off --^       ^-- wl_head    ^-- wl_tail
	 *
	 * commit0 and commit1 are commit headers.  A commit header has
	 * a generation number, indicating which of the two headers is
	 * more recent, and an assignment of head and tail pointers.
	 * The rest is a circular queue of log records, starting at
	 * the byte offset wl_circ_off.
	 *
	 * E marks empty space for records.
	 * W marks records for block writes issued but waiting.
	 * C marks completed records.
	 *
	 * wapbl_flush writes new records to empty `E' spaces after
	 * wl_head from the current transaction in memory.
	 *
	 * wapbl_truncate advances wl_tail past any completed `C'
	 * records, freeing them up for use.
	 *
	 * head == tail == 0 means log is empty.
	 * head == tail != 0 means log is full.
	 *
	 * See assertions in wapbl_advance() for other boundary
	 * conditions.
	 *
	 * Only wapbl_flush moves the head, except when wapbl_truncate
	 * sets it to 0 to indicate that the log is empty.
	 *
	 * Only wapbl_truncate moves the tail, except when wapbl_flush
	 * sets it to wl_circ_off to indicate that the log is full.
	 */
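
	/*
	 * Illustrative numbers (editor's note, not from the original
	 * source): with wl_circ_off = 1024 and wl_circ_size = 7168 the
	 * record area covers byte offsets [1024, 8192).  Flushing 2048
	 * bytes into an empty log yields head = 3072, tail = 1024; once
	 * wapbl_truncate reclaims those records, head = tail = 0 again.
	 * Filling all 7168 bytes instead wraps the head back around to
	 * 1024, leaving head == tail == 1024, i.e. "full", not "empty",
	 * because both are nonzero.
	 */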

	struct wapbl_wc_header *wl_wc_header;	/* l	*/
	void *wl_wc_scratch;	/* l:	scratch space (XXX: why?!?) */

	kmutex_t wl_mtx;	/* u:	short-term lock */
	krwlock_t wl_rwlock;	/* u:	File system transaction lock */

	/*
	 * Must be held while accessing
	 * wl_count or wl_bufs or head or tail
	 */

#ifdef _KERNEL
	/*
	 * Callback called from within the flush routine to flush any extra
	 * bits.  Note that flush may be skipped without calling this if
	 * there are no outstanding buffers in the transaction.
	 */
	wapbl_flush_fn_t wl_flush;	/* r	*/
	wapbl_flush_fn_t wl_flush_abort;/* r	*/

	/* Event counters */
	char wl_ev_group[EVCNT_STRING_MAX];	/* r	*/
	struct evcnt wl_ev_commit;		/* l	*/
	struct evcnt wl_ev_journalwrite;	/* l	*/
	struct evcnt wl_ev_jbufs_bio_nowait;	/* l	*/
	struct evcnt wl_ev_jbufs_bio_wait;	/* l	*/
	struct evcnt wl_ev_metawrite;		/* lm	*/
	struct evcnt wl_ev_cacheflush;		/* l	*/
#endif

	size_t wl_bufbytes;	/* m:	Byte count of pages in wl_bufs */
	size_t wl_bufcount;	/* m:	Count of buffers in wl_bufs */
	size_t wl_bcount;	/* m:	Total bcount of wl_bufs */

	TAILQ_HEAD(, buf) wl_bufs;	/* m:	Buffers in current transaction */

	kcondvar_t wl_reclaimable_cv;	/* m (obviously) */
	size_t wl_reclaimable_bytes;	/* m:	Amount of space available for
						reclamation by truncate */
	int wl_error_count;	/* m:	# of wl_entries with errors */
	size_t wl_reserved_bytes;	/* never truncate log smaller than this */

#ifdef WAPBL_DEBUG_BUFBYTES
	size_t wl_unsynced_bufbytes;	/* Byte count of unsynced buffers */
#endif

#ifdef _KERNEL
	int wl_brperjblock;	/* r	Block records per journal block */
#endif

	TAILQ_HEAD(, wapbl_dealloc) wl_dealloclist;	/* lm:	list head */
	int wl_dealloccnt;				/* lm:	total count */
	int wl_dealloclim;				/* r:	max count */

	/* hashtable of inode numbers for allocated but unlinked inodes */
	/* synch ??? */
	struct wapbl_ino_head *wl_inohash;
	u_long wl_inohashmask;
	int wl_inohashcnt;

	SIMPLEQ_HEAD(, wapbl_entry) wl_entries;	/* On disk transaction
						   accounting */

	/* buffers for wapbl_buffered_write() */
	TAILQ_HEAD(, buf) wl_iobufs;		/* l: Free or filling bufs */
	TAILQ_HEAD(, buf) wl_iobufs_busy;	/* l: In-transit bufs */

	int wl_dkcache;		/* r:	disk cache flags */
#define WAPBL_USE_FUA(wl)	\
		(wapbl_allow_dpofua && ISSET((wl)->wl_dkcache, DKCACHE_FUA))
#define WAPBL_JFLAGS(wl)	\
		(WAPBL_USE_FUA(wl) ? (wl)->wl_jwrite_flags : 0)
#define WAPBL_MFLAGS(wl)	\
		(WAPBL_USE_FUA(wl) ? (wl)->wl_mwrite_flags : 0)
	int wl_jwrite_flags;	/* r:	journal write flags */
	int wl_mwrite_flags;	/* r:	metadata write flags */
};

#ifdef WAPBL_DEBUG_PRINT
int wapbl_debug_print = WAPBL_DEBUG_PRINT;
#endif

/****************************************************************/
#ifdef _KERNEL

#ifdef WAPBL_DEBUG
struct wapbl *wapbl_debug_wl;
#endif

static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail);
static int wapbl_write_blocks(struct wapbl *wl, off_t *offp);
static int wapbl_write_revocations(struct wapbl *wl, off_t *offp);
static int wapbl_write_inodes(struct wapbl *wl, off_t *offp);
#endif /* _KERNEL */

static int wapbl_replay_process(struct wapbl_replay *wr, off_t, off_t);

static inline size_t wapbl_space_used(size_t avail, off_t head,
	off_t tail);

#ifdef _KERNEL

static struct pool wapbl_entry_pool;
static struct pool wapbl_dealloc_pool;

#define	WAPBL_INODETRK_SIZE 83
static int wapbl_ino_pool_refcount;
static struct pool wapbl_ino_pool;
struct wapbl_ino {
	LIST_ENTRY(wapbl_ino) wi_hash;
	ino_t wi_ino;
	mode_t wi_mode;
};

static void wapbl_inodetrk_init(struct wapbl *wl, u_int size);
static void wapbl_inodetrk_free(struct wapbl *wl);
static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino);

static size_t wapbl_transaction_len(struct wapbl *wl);
static inline size_t wapbl_transaction_inodes_len(struct wapbl *wl);

static void wapbl_deallocation_free(struct wapbl *, struct wapbl_dealloc *,
	bool);

static void wapbl_evcnt_init(struct wapbl *);
static void wapbl_evcnt_free(struct wapbl *);

static void wapbl_dkcache_init(struct wapbl *);

#if 0
int wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
#endif

static int wapbl_replay_isopen1(struct wapbl_replay *);

struct wapbl_ops wapbl_ops = {
	.wo_wapbl_discard	= wapbl_discard,
	.wo_wapbl_replay_isopen	= wapbl_replay_isopen1,
	.wo_wapbl_replay_can_read = wapbl_replay_can_read,
	.wo_wapbl_replay_read	= wapbl_replay_read,
	.wo_wapbl_add_buf	= wapbl_add_buf,
	.wo_wapbl_remove_buf	= wapbl_remove_buf,
	.wo_wapbl_resize_buf	= wapbl_resize_buf,
	.wo_wapbl_begin		= wapbl_begin,
	.wo_wapbl_end		= wapbl_end,
	.wo_wapbl_junlock_assert= wapbl_junlock_assert,

	/* XXX: the following is only used to say "this is a wapbl buf" */
	.wo_wapbl_biodone	= wapbl_biodone,
};

static int
wapbl_sysctl_init(void)
{
	int rv;
	const struct sysctlnode *rnode, *cnode;

	wapbl_sysctl = NULL;

	rv = sysctl_createv(&wapbl_sysctl, 0, NULL, &rnode,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "wapbl",
	    SYSCTL_DESCR("WAPBL journaling options"),
	    NULL, 0, NULL, 0,
	    CTL_VFS, CTL_CREATE, CTL_EOL);
	if (rv)
		return rv;

	rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_INT, "flush_disk_cache",
	    SYSCTL_DESCR("flush disk cache"),
	    NULL, 0, &wapbl_flush_disk_cache, 0,
	    CTL_CREATE, CTL_EOL);
	if (rv)
		return rv;

	rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_INT, "verbose_commit",
	    SYSCTL_DESCR("show time and size of wapbl log commits"),
	    NULL, 0, &wapbl_verbose_commit, 0,
	    CTL_CREATE, CTL_EOL);
	if (rv)
		return rv;

	rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_INT, "allow_dpofua",
	    SYSCTL_DESCR("allow use of FUA/DPO instead of cache flush if available"),
	    NULL, 0, &wapbl_allow_dpofua, 0,
	    CTL_CREATE, CTL_EOL);
	if (rv)
		return rv;

	rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_INT, "journal_iobufs",
	    SYSCTL_DESCR("count of bufs used for journal I/O (max async count)"),
	    NULL, 0, &wapbl_journal_iobufs, 0,
	    CTL_CREATE, CTL_EOL);
	if (rv)
		return rv;

	return rv;
}

static void
wapbl_init(void)
{

	pool_init(&wapbl_entry_pool, sizeof(struct wapbl_entry), 0, 0, 0,
	    "wapblentrypl", &pool_allocator_kmem, IPL_VM);
	pool_init(&wapbl_dealloc_pool, sizeof(struct wapbl_dealloc), 0, 0, 0,
	    "wapbldealloc", &pool_allocator_nointr, IPL_NONE);

	wapbl_sysctl_init();
}

static int
wapbl_fini(void)
{

	if (wapbl_sysctl != NULL)
		sysctl_teardown(&wapbl_sysctl);

	pool_destroy(&wapbl_dealloc_pool);
	pool_destroy(&wapbl_entry_pool);

	return 0;
}

static void
wapbl_evcnt_init(struct wapbl *wl)
{
	snprintf(wl->wl_ev_group, sizeof(wl->wl_ev_group),
	    "wapbl fsid 0x%x/0x%x",
	    wl->wl_mount->mnt_stat.f_fsidx.__fsid_val[0],
	    wl->wl_mount->mnt_stat.f_fsidx.__fsid_val[1]
	);

	evcnt_attach_dynamic(&wl->wl_ev_commit, EVCNT_TYPE_MISC,
	    NULL, wl->wl_ev_group, "commit");
	evcnt_attach_dynamic(&wl->wl_ev_journalwrite, EVCNT_TYPE_MISC,
	    NULL, wl->wl_ev_group, "journal sync block write");
	evcnt_attach_dynamic(&wl->wl_ev_jbufs_bio_nowait, EVCNT_TYPE_MISC,
	    NULL, wl->wl_ev_group, "journal I/O bufs no wait");
	evcnt_attach_dynamic(&wl->wl_ev_jbufs_bio_wait, EVCNT_TYPE_MISC,
	    NULL, wl->wl_ev_group, "journal I/O bufs biowait");
	evcnt_attach_dynamic(&wl->wl_ev_metawrite, EVCNT_TYPE_MISC,
	    NULL, wl->wl_ev_group, "metadata finished block write");
	evcnt_attach_dynamic(&wl->wl_ev_cacheflush, EVCNT_TYPE_MISC,
	    NULL, wl->wl_ev_group, "cache flush");
}

static void
wapbl_evcnt_free(struct wapbl *wl)
{
	evcnt_detach(&wl->wl_ev_commit);
	evcnt_detach(&wl->wl_ev_journalwrite);
	evcnt_detach(&wl->wl_ev_jbufs_bio_nowait);
	evcnt_detach(&wl->wl_ev_jbufs_bio_wait);
	evcnt_detach(&wl->wl_ev_metawrite);
	evcnt_detach(&wl->wl_ev_cacheflush);
}

static void
wapbl_dkcache_init(struct wapbl *wl)
{
	int error;

	/* Get disk cache flags */
	error = VOP_IOCTL(wl->wl_devvp, DIOCGCACHE, &wl->wl_dkcache,
	    FWRITE, FSCRED);
	if (error) {
		/* behave as if there was a write cache */
		wl->wl_dkcache = DKCACHE_WRITE;
	}

	/* Use FUA instead of cache flush if available */
	if (ISSET(wl->wl_dkcache, DKCACHE_FUA)) {
		wl->wl_jwrite_flags |= B_MEDIA_FUA;
		wl->wl_mwrite_flags |= B_MEDIA_FUA;
	}

	/* Use DPO for journal writes if available */
	if (ISSET(wl->wl_dkcache, DKCACHE_DPO))
		wl->wl_jwrite_flags |= B_MEDIA_DPO;
}

static int
wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr)
{
	int error, i;

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_start: reusing log with %d inodes\n", wr->wr_inodescnt));

	/*
	 * It's only valid to reuse the replay log if it's
	 * the same as the new log we just opened.
	 */
	KDASSERT(!wapbl_replay_isopen(wr));
	KASSERT(wl->wl_devvp->v_type == VBLK);
	KASSERT(wr->wr_devvp->v_type == VBLK);
	KASSERT(wl->wl_devvp->v_rdev == wr->wr_devvp->v_rdev);
	KASSERT(wl->wl_logpbn == wr->wr_logpbn);
	KASSERT(wl->wl_circ_size == wr->wr_circ_size);
	KASSERT(wl->wl_circ_off == wr->wr_circ_off);
	KASSERT(wl->wl_log_dev_bshift == wr->wr_log_dev_bshift);
	KASSERT(wl->wl_fs_dev_bshift == wr->wr_fs_dev_bshift);

	wl->wl_wc_header->wc_generation = wr->wr_generation + 1;

	for (i = 0; i < wr->wr_inodescnt; i++)
		wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber,
		    wr->wr_inodes[i].wr_imode);

	/* Make sure new transaction won't overwrite old inodes list */
	KDASSERT(wapbl_transaction_len(wl) <=
	    wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead,
	    wr->wr_inodestail));

	wl->wl_head = wl->wl_tail = wr->wr_inodeshead;
	wl->wl_reclaimable_bytes = wl->wl_reserved_bytes =
	    wapbl_transaction_len(wl);

	error = wapbl_write_inodes(wl, &wl->wl_head);
	if (error)
		return error;

	KASSERT(wl->wl_head != wl->wl_tail);
	KASSERT(wl->wl_head != 0);

	return 0;
}

int
wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp,
	daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr,
	wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn)
{
	struct wapbl *wl;
	struct vnode *devvp;
	daddr_t logpbn;
	int error;
	int log_dev_bshift = ilog2(blksize);
	int fs_dev_bshift = log_dev_bshift;
	int run;

	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64
	    " count=%zu blksize=%zu\n", vp, off, count, blksize));

	if (log_dev_bshift > fs_dev_bshift) {
		WAPBL_PRINTF(WAPBL_PRINT_OPEN,
			("wapbl: log device's block size cannot be larger "
			 "than filesystem's\n"));
		/*
		 * Not currently implemented, although it could be if
		 * needed someday.
		 */
		return ENOSYS;
	}

	if (off < 0)
		return EINVAL;

	if (blksize < DEV_BSIZE)
		return EINVAL;
	if (blksize % DEV_BSIZE)
		return EINVAL;

	/* XXXTODO: verify that the full load is writable */

	/*
	 * XXX check for minimum log size
	 * minimum is governed by minimum amount of space
	 * to complete a transaction. (probably truncate)
	 */
	/* XXX for now pick something minimal */
	if ((count * blksize) < MAXPHYS) {
		return ENOSPC;
	}

	if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) {
		return error;
	}

	wl = wapbl_calloc(1, sizeof(*wl));
	rw_init(&wl->wl_rwlock);
	mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&wl->wl_reclaimable_cv, "wapblrec");
	TAILQ_INIT(&wl->wl_bufs);
	SIMPLEQ_INIT(&wl->wl_entries);

	wl->wl_logvp = vp;
	wl->wl_devvp = devvp;
	wl->wl_mount = mp;
	wl->wl_logpbn = logpbn;
	wl->wl_log_dev_bshift = log_dev_bshift;
	wl->wl_fs_dev_bshift = fs_dev_bshift;

	wl->wl_flush = flushfn;
	wl->wl_flush_abort = flushabortfn;

	/* Reserve two log device blocks for the commit headers */
	wl->wl_circ_off = 2<<wl->wl_log_dev_bshift;
	wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off);
	/* truncate the log usage to a multiple of log_dev_bshift */
	wl->wl_circ_size >>= wl->wl_log_dev_bshift;
	wl->wl_circ_size <<= wl->wl_log_dev_bshift;

	/*
	 * wl_bufbytes_max limits the size of the in memory transaction space.
	 * - Since buffers are allocated and accounted for in units of
	 *   PAGE_SIZE it is required to be a multiple of PAGE_SIZE
	 *   (i.e. 1<<PAGE_SHIFT)
	 * - Since the log device has to be written in units of
	 *   1<<wl_log_dev_bshift it is required to be a multiple of
	 *   1<<wl_log_dev_bshift.
	 * - Since filesystem will provide data in units of 1<<wl_fs_dev_bshift,
	 *   it is convenient to be a multiple of 1<<wl_fs_dev_bshift.
	 * Therefore it must be a multiple of the least common multiple of
	 * those three quantities.  Fortunately, all of those quantities are
	 * guaranteed to be a power of two, and the least common multiple of
	 * a set of numbers which are all powers of two is simply the maximum
	 * of those numbers.  Finally, the maximum logarithm of a power of two
	 * is the same as the log of the maximum power of two.  So we can do
	 * the following operations to size wl_bufbytes_max:
	 */

	/* XXX fix actual number of pages reserved per filesystem. */
	wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2);

	/* Round wl_bufbytes_max to the largest power of two constraint */
	wl->wl_bufbytes_max >>= PAGE_SHIFT;
	wl->wl_bufbytes_max <<= PAGE_SHIFT;
	wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift;
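
	/*
	 * Worked example for the shift pairs above (editor's note,
	 * assumed values): with PAGE_SHIFT = 12 and both bshifts = 9,
	 * a starting value of 1050000 becomes
	 * (1050000 >> 12) << 12 = 1048576, which is already a multiple
	 * of 512, so the two later shift pairs leave it unchanged.
	 * Each >>/<< pair simply rounds down to a multiple of that
	 * power of two.
	 */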

	/* XXX maybe use filesystem fragment size instead of 1024 */
	/* XXX fix actual number of buffers reserved per filesystem. */
	wl->wl_bufcount_max = (buf_nbuf() / 2) * 1024;

	wl->wl_brperjblock = ((1<<wl->wl_log_dev_bshift)
	    - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
	    sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
	KASSERT(wl->wl_brperjblock > 0);

	/* XXX tie this into resource estimation */
	wl->wl_dealloclim = wl->wl_bufbytes_max / mp->mnt_stat.f_bsize / 2;
	TAILQ_INIT(&wl->wl_dealloclist);

	wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);

	wapbl_evcnt_init(wl);

	wapbl_dkcache_init(wl);

	/* Initialize the commit header */
	{
		struct wapbl_wc_header *wc;
		size_t len = 1 << wl->wl_log_dev_bshift;
		wc = wapbl_calloc(1, len);
		wc->wc_type = WAPBL_WC_HEADER;
		wc->wc_len = len;
		wc->wc_circ_off = wl->wl_circ_off;
		wc->wc_circ_size = wl->wl_circ_size;
		/* XXX wc->wc_fsid */
		wc->wc_log_dev_bshift = wl->wl_log_dev_bshift;
		wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift;
		wl->wl_wc_header = wc;
		wl->wl_wc_scratch = wapbl_alloc(len);
	}

	TAILQ_INIT(&wl->wl_iobufs);
	TAILQ_INIT(&wl->wl_iobufs_busy);
	for (int i = 0; i < wapbl_journal_iobufs; i++) {
		struct buf *bp;

		if ((bp = geteblk(MAXPHYS)) == NULL)
			goto errout;

		mutex_enter(&bufcache_lock);
		mutex_enter(devvp->v_interlock);
		bgetvp(devvp, bp);
		mutex_exit(devvp->v_interlock);
		mutex_exit(&bufcache_lock);

		bp->b_dev = devvp->v_rdev;

		TAILQ_INSERT_TAIL(&wl->wl_iobufs, bp, b_wapbllist);
	}

	/*
	 * if there was an existing set of unlinked but
	 * allocated inodes, preserve it in the new
	 * log.
	 */
	if (wr && wr->wr_inodescnt) {
		error = wapbl_start_flush_inodes(wl, wr);
		if (error)
			goto errout;
	}

	error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail);
	if (error) {
		goto errout;
	}

	*wlp = wl;
#if defined(WAPBL_DEBUG)
	wapbl_debug_wl = wl;
#endif

	return 0;
 errout:
	wapbl_discard(wl);
	wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
	while (!TAILQ_EMPTY(&wl->wl_iobufs)) {
		struct buf *bp;

		bp = TAILQ_FIRST(&wl->wl_iobufs);
		TAILQ_REMOVE(&wl->wl_iobufs, bp, b_wapbllist);
		brelse(bp, BC_INVAL);
	}
	wapbl_inodetrk_free(wl);
	wapbl_free(wl, sizeof(*wl));

	return error;
}

/*
 * Like wapbl_flush, only discards the transaction
 * completely
 */

void
wapbl_discard(struct wapbl *wl)
{
	struct wapbl_entry *we;
	struct wapbl_dealloc *wd;
	struct buf *bp;
	int i;

	/*
	 * XXX we may consider using upgrade here
	 * if we want to call flush from inside a transaction
	 */
	rw_enter(&wl->wl_rwlock, RW_WRITER);
	wl->wl_flush(wl->wl_mount, TAILQ_FIRST(&wl->wl_dealloclist));

#ifdef WAPBL_DEBUG_PRINT
	{
		pid_t pid = -1;
		lwpid_t lid = -1;
		if (curproc)
			pid = curproc->p_pid;
		if (curlwp)
			lid = curlwp->l_lid;
#ifdef WAPBL_DEBUG_BUFBYTES
		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
		    ("wapbl_discard: thread %d.%d discarding "
		    "transaction\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %u, reclaimable=%zu reserved=%zu "
		    "unsynced=%zu\n",
		    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
		    wl->wl_bcount, wl->wl_dealloccnt,
		    wl->wl_inohashcnt, wl->wl_error_count,
		    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
		    wl->wl_unsynced_bufbytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d, unsynced = %zu\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error, we->we_unsynced_bufbytes));
		}
#else /* !WAPBL_DEBUG_BUFBYTES */
		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
		    ("wapbl_discard: thread %d.%d discarding transaction\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %u, reclaimable=%zu reserved=%zu\n",
		    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
		    wl->wl_bcount, wl->wl_dealloccnt,
		    wl->wl_inohashcnt, wl->wl_error_count,
		    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error));
		}
#endif /* !WAPBL_DEBUG_BUFBYTES */
	}
#endif /* WAPBL_DEBUG_PRINT */

	for (i = 0; i <= wl->wl_inohashmask; i++) {
		struct wapbl_ino_head *wih;
		struct wapbl_ino *wi;

		wih = &wl->wl_inohash[i];
		while ((wi = LIST_FIRST(wih)) != NULL) {
			LIST_REMOVE(wi, wi_hash);
			pool_put(&wapbl_ino_pool, wi);
			KASSERT(wl->wl_inohashcnt > 0);
			wl->wl_inohashcnt--;
		}
	}

	/*
	 * clean buffer list
	 */
	mutex_enter(&bufcache_lock);
	mutex_enter(&wl->wl_mtx);
	while ((bp = TAILQ_FIRST(&wl->wl_bufs)) != NULL) {
		if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) {
			/*
			 * The buffer will be unlocked and
			 * removed from the transaction in brelse
			 */
			mutex_exit(&wl->wl_mtx);
			brelsel(bp, 0);
			mutex_enter(&wl->wl_mtx);
		}
	}
	mutex_exit(&wl->wl_mtx);
	mutex_exit(&bufcache_lock);

	/*
	 * Remove references to this wl from wl_entries, free any which
	 * no longer have buffers, others will be freed in wapbl_biodone
	 * when they no longer have any buffers.
	 */
	while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) {
		SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
		/* XXX should we be accumulating wl_error_count
		 * and increasing reclaimable bytes ? */
		we->we_wapbl = NULL;
		if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
			KASSERT(we->we_unsynced_bufbytes == 0);
#endif
			pool_put(&wapbl_entry_pool, we);
		}
	}

	/* Discard list of deallocs */
	while ((wd = TAILQ_FIRST(&wl->wl_dealloclist)) != NULL)
		wapbl_deallocation_free(wl, wd, true);

	/* XXX should we clear wl_reserved_bytes? */

	KASSERT(wl->wl_bufbytes == 0);
	KASSERT(wl->wl_bcount == 0);
	KASSERT(wl->wl_bufcount == 0);
	KASSERT(TAILQ_EMPTY(&wl->wl_bufs));
	KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
	KASSERT(wl->wl_inohashcnt == 0);
	KASSERT(TAILQ_EMPTY(&wl->wl_dealloclist));
	KASSERT(wl->wl_dealloccnt == 0);

	rw_exit(&wl->wl_rwlock);
}

int
wapbl_stop(struct wapbl *wl, int force)
{
	int error;

	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n"));
	error = wapbl_flush(wl, 1);
	if (error) {
		if (force)
			wapbl_discard(wl);
		else
			return error;
	}

	/* Unlinked inodes persist after a flush */
	if (wl->wl_inohashcnt) {
		if (force) {
			wapbl_discard(wl);
		} else {
			return EBUSY;
		}
	}

	KASSERT(wl->wl_bufbytes == 0);
	KASSERT(wl->wl_bcount == 0);
	KASSERT(wl->wl_bufcount == 0);
	KASSERT(TAILQ_EMPTY(&wl->wl_bufs));
	KASSERT(wl->wl_dealloccnt == 0);
	KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
	KASSERT(wl->wl_inohashcnt == 0);
	KASSERT(TAILQ_EMPTY(&wl->wl_dealloclist));
	KASSERT(wl->wl_dealloccnt == 0);
	KASSERT(TAILQ_EMPTY(&wl->wl_iobufs_busy));

	wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
	while (!TAILQ_EMPTY(&wl->wl_iobufs)) {
		struct buf *bp;

		bp = TAILQ_FIRST(&wl->wl_iobufs);
		TAILQ_REMOVE(&wl->wl_iobufs, bp, b_wapbllist);
		brelse(bp, BC_INVAL);
	}
	wapbl_inodetrk_free(wl);

	wapbl_evcnt_free(wl);

	cv_destroy(&wl->wl_reclaimable_cv);
	mutex_destroy(&wl->wl_mtx);
	rw_destroy(&wl->wl_rwlock);
	wapbl_free(wl, sizeof(*wl));

	return 0;
}

/****************************************************************/
/*
 * Unbuffered disk I/O
 */

static void
wapbl_doio_accounting(struct vnode *devvp, int flags)
{
	struct pstats *pstats = curlwp->l_proc->p_stats;

	if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
		mutex_enter(devvp->v_interlock);
		devvp->v_numoutput++;
		mutex_exit(devvp->v_interlock);
		pstats->p_ru.ru_oublock++;
	} else {
		pstats->p_ru.ru_inblock++;
	}
}

static int
wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
{
	struct buf *bp;
	int error;

	KASSERT(devvp->v_type == VBLK);

	wapbl_doio_accounting(devvp, flags);

	bp = getiobuf(devvp, true);
	bp->b_flags = flags;
	bp->b_cflags = BC_BUSY;	/* mandatory, asserted by biowait() */
	bp->b_dev = devvp->v_rdev;
	bp->b_data = data;
	bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
	bp->b_blkno = pbn;
	BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);

	WAPBL_PRINTF(WAPBL_PRINT_IO,
	    ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%"PRIx64"\n",
	    BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount,
	    bp->b_blkno, bp->b_dev));

	VOP_STRATEGY(devvp, bp);

	error = biowait(bp);
	putiobuf(bp);

	if (error) {
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_doio: %s %zu bytes at block %" PRId64
		    " on dev 0x%"PRIx64" failed with error %d\n",
		    (((flags & (B_WRITE | B_READ)) == B_WRITE) ?
		     "write" : "read"),
		    len, pbn, devvp->v_rdev, error));
	}

	return error;
}

/*
 * wapbl_write(data, len, devvp, pbn)
 *
 *	Synchronously write len bytes from data to physical block pbn
 *	on devvp.
 */
int
wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

	return wapbl_doio(data, len, devvp, pbn, B_WRITE);
}

/*
 * wapbl_read(data, len, devvp, pbn)
 *
 *	Synchronously read len bytes into data from physical block pbn
 *	on devvp.
 */
int
wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

	return wapbl_doio(data, len, devvp, pbn, B_READ);
}

/****************************************************************/
/*
 * Buffered disk writes -- try to coalesce writes and emit
 * MAXPHYS-aligned blocks.
 */

/*
 * wapbl_buffered_write_async(wl, bp)
 *
 *	Send buffer for asynchronous write.
 */
static void
wapbl_buffered_write_async(struct wapbl *wl, struct buf *bp)
{
	wapbl_doio_accounting(wl->wl_devvp, bp->b_flags);

	KASSERT(TAILQ_FIRST(&wl->wl_iobufs) == bp);
	TAILQ_REMOVE(&wl->wl_iobufs, bp, b_wapbllist);

	bp->b_flags = B_WRITE | WAPBL_JFLAGS(wl);
	bp->b_cflags = BC_BUSY;	/* mandatory, asserted by biowait() */
	bp->b_oflags = 0;
	bp->b_bcount = bp->b_resid;
	BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);

	VOP_STRATEGY(wl->wl_devvp, bp);

	wl->wl_ev_journalwrite.ev_count++;

	TAILQ_INSERT_TAIL(&wl->wl_iobufs_busy, bp, b_wapbllist);
}

/*
 * wapbl_buffered_flush(wl)
 *
 *	Flush any buffered writes from wapbl_buffered_write.
 */
static int
wapbl_buffered_flush(struct wapbl *wl, bool full)
{
	int error = 0;
	struct buf *bp, *bnext;
	bool only_done = true, found = false;

	/* if there is outstanding buffered write, send it now */
	if ((bp = TAILQ_FIRST(&wl->wl_iobufs)) && bp->b_resid > 0)
		wapbl_buffered_write_async(wl, bp);

	/* wait for I/O to complete */
 again:
	TAILQ_FOREACH_SAFE(bp, &wl->wl_iobufs_busy, b_wapbllist, bnext) {
		if (!full && only_done) {
			/* skip unfinished */
			if (!ISSET(bp->b_oflags, BO_DONE))
				continue;
		}

		if (ISSET(bp->b_oflags, BO_DONE))
			wl->wl_ev_jbufs_bio_nowait.ev_count++;
		else
			wl->wl_ev_jbufs_bio_wait.ev_count++;

		TAILQ_REMOVE(&wl->wl_iobufs_busy, bp, b_wapbllist);
		error = biowait(bp);

		/* reset for reuse */
		bp->b_blkno = bp->b_resid = 0;
		TAILQ_INSERT_TAIL(&wl->wl_iobufs, bp, b_wapbllist);
		found = true;

		if (!full)
			break;
	}

	if (!found && only_done && !TAILQ_EMPTY(&wl->wl_iobufs_busy)) {
		only_done = false;
		goto again;
	}

	return error;
}

/*
 * wapbl_buffered_write(data, len, wl, pbn)
 *
 *	Write len bytes from data to physical block pbn on
 *	wl->wl_devvp.  The write may not complete until
 *	wapbl_buffered_flush.
 */
static int
wapbl_buffered_write(void *data, size_t len, struct wapbl *wl, daddr_t pbn)
{
	size_t resid;
	struct buf *bp;

 again:
	bp = TAILQ_FIRST(&wl->wl_iobufs);

	if (bp == NULL) {
		/* No more buffers, wait for any previous I/O to finish. */
		wapbl_buffered_flush(wl, false);

		bp = TAILQ_FIRST(&wl->wl_iobufs);
		KASSERT(bp != NULL);
	}

	/*
	 * If not adjacent to buffered data flush first.  Disk block
	 * address is always valid for non-empty buffer.
	 */
	if (bp->b_resid > 0 && pbn != bp->b_blkno + btodb(bp->b_resid)) {
		wapbl_buffered_write_async(wl, bp);
		goto again;
	}

	/*
	 * If this write goes to an empty buffer we have to
	 * save the disk block address first.
	 */
	if (bp->b_blkno == 0)
		bp->b_blkno = pbn;

	/*
	 * Remaining space so this buffer ends on a buffer size boundary.
	 *
	 * Cannot become less than or equal to zero, as the buffer would
	 * have been flushed on the previous call in that case.
	 */
	resid = bp->b_bufsize - dbtob(bp->b_blkno % btodb(bp->b_bufsize)) -
	    bp->b_resid;
	KASSERT(resid > 0);
	KASSERT(dbtob(btodb(resid)) == resid);

	if (len < resid)
		resid = len;

	memcpy((uint8_t *)bp->b_data + bp->b_resid, data, resid);
	bp->b_resid += resid;

	if (len >= resid) {
		/* Just filled the buf, or data did not fit */
		wapbl_buffered_write_async(wl, bp);

		data = (uint8_t *)data + resid;
		len -= resid;
		pbn += btodb(resid);

		if (len > 0)
			goto again;
	}

	return 0;
}
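
/*
 * Worked example for the arithmetic above (editor's note, assumed
 * values, not from the original source): with bp->b_bufsize =
 * MAXPHYS = 64 KiB and DEV_BSIZE = 512, btodb(b_bufsize) = 128.  A
 * first write to pbn = 100 on an empty buffer sets b_blkno = 100 and
 * leaves resid = 65536 - dbtob(100 % 128) - 0 = 65536 - 51200 =
 * 14336 bytes before the buffer reaches a MAXPHYS-aligned device
 * offset and is pushed out via wapbl_buffered_write_async().  A
 * later write coalesces only if its pbn equals
 * b_blkno + btodb(b_resid), i.e. it continues exactly where the
 * buffered data ends.
 */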

/*
 * wapbl_circ_write(wl, data, len, offp)
 *
 *	Write len bytes from data to the circular queue of wl, starting
 *	at linear byte offset *offp, and returning the new linear byte
 *	offset in *offp.
 *
 *	If the starting linear byte offset precedes wl->wl_circ_off,
 *	the write instead begins at wl->wl_circ_off.  XXX WTF?  This
 *	should be a KASSERT, not a conditional.
 *
 *	The write is buffered in wl and must be flushed with
 *	wapbl_buffered_flush before it will be submitted to the disk.
 */
static int
wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp)
{
	size_t slen;
	off_t off = *offp;
	int error;
	daddr_t pbn;

	KDASSERT(((len >> wl->wl_log_dev_bshift) <<
	    wl->wl_log_dev_bshift) == len);

	if (off < wl->wl_circ_off)
		off = wl->wl_circ_off;
	slen = wl->wl_circ_off + wl->wl_circ_size - off;
	if (slen < len) {
		pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
#ifdef _KERNEL
		pbn = btodb(pbn << wl->wl_log_dev_bshift);
#endif
		error = wapbl_buffered_write(data, slen, wl, pbn);
		if (error)
			return error;
		data = (uint8_t *)data + slen;
		len -= slen;
		off = wl->wl_circ_off;
	}
	pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
#ifdef _KERNEL
	pbn = btodb(pbn << wl->wl_log_dev_bshift);
#endif
	error = wapbl_buffered_write(data, len, wl, pbn);
	if (error)
		return error;
	off += len;
	if (off >= wl->wl_circ_off + wl->wl_circ_size)
		off = wl->wl_circ_off;
	*offp = off;
	return 0;
}
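
/*
 * Illustrative wraparound case (editor's note, assumed numbers, not
 * from the original source): with wl_circ_off = 1024 and
 * wl_circ_size = 7168 the record area is byte offsets [1024, 8192).
 * A 2048-byte write starting at *offp = 7168 has only
 * slen = 1024 + 7168 - 7168 = 1024 bytes left before the end, so the
 * first 1024 bytes land at offset 7168 and the remaining 1024 bytes
 * restart at wl_circ_off, leaving *offp = 2048.
 */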

/****************************************************************/
/*
 * WAPBL transactions: entering, adding/removing bufs, and exiting
 */

int
wapbl_begin(struct wapbl *wl, const char *file, int line)
{
	int doflush;
	unsigned lockcount;

	KDASSERT(wl);

	/*
	 * XXX this needs to be made much more sophisticated.
	 * perhaps each wapbl_begin could reserve a specified
	 * number of buffers and bytes.
	 */
	mutex_enter(&wl->wl_mtx);
	lockcount = wl->wl_lock_count;
	doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) >
		   wl->wl_bufbytes_max / 2) ||
		  ((wl->wl_bufcount + (lockcount * 10)) >
		   wl->wl_bufcount_max / 2) ||
		  (wapbl_transaction_len(wl) > wl->wl_circ_size / 2) ||
		  (wl->wl_dealloccnt >= (wl->wl_dealloclim / 2));
	mutex_exit(&wl->wl_mtx);

	if (doflush) {
		WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		    ("force flush lockcnt=%d bufbytes=%zu "
		    "(max=%zu) bufcount=%zu (max=%zu) "
		    "dealloccnt %d (lim=%d)\n",
		    lockcount, wl->wl_bufbytes,
		    wl->wl_bufbytes_max, wl->wl_bufcount,
		    wl->wl_bufcount_max,
		    wl->wl_dealloccnt, wl->wl_dealloclim));
	}

	if (doflush) {
		int error = wapbl_flush(wl, 0);
		if (error)
			return error;
	}

	rw_enter(&wl->wl_rwlock, RW_READER);
	mutex_enter(&wl->wl_mtx);
	wl->wl_lock_count++;
	mutex_exit(&wl->wl_mtx);

#if defined(WAPBL_DEBUG_PRINT)
	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
	    ("wapbl_begin thread %d.%d with bufcount=%zu "
	    "bufbytes=%zu bcount=%zu at %s:%d\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount, file, line));
#endif

	return 0;
}

void
wapbl_end(struct wapbl *wl)
{

#if defined(WAPBL_DEBUG_PRINT)
	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
	    ("wapbl_end thread %d.%d with bufcount=%zu "
	    "bufbytes=%zu bcount=%zu\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount));
#endif

	/*
	 * XXX this could be handled more gracefully, perhaps place
	 * only a partial transaction in the log and allow the
	 * remaining to flush without the protection of the journal.
	 */
	KASSERTMSG((wapbl_transaction_len(wl) <=
		(wl->wl_circ_size - wl->wl_reserved_bytes)),
	    "wapbl_end: current transaction too big to flush");

	mutex_enter(&wl->wl_mtx);
	KASSERT(wl->wl_lock_count > 0);
	wl->wl_lock_count--;
	mutex_exit(&wl->wl_mtx);

	rw_exit(&wl->wl_rwlock);
}

void
wapbl_add_buf(struct wapbl *wl, struct buf * bp)
{

	KASSERT(bp->b_cflags & BC_BUSY);
	KASSERT(bp->b_vp);

	wapbl_jlock_assert(wl);

#if 0
	/*
	 * XXX this might be an issue for swapfiles.
	 * see uvm_swap.c:1702
	 *
	 * XXX2 why require it then?  leap of semantics?
	 */
	KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
#endif

	mutex_enter(&wl->wl_mtx);
	if (bp->b_flags & B_LOCKED) {
		TAILQ_REMOVE(&wl->wl_bufs, bp, b_wapbllist);
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER2,
		    ("wapbl_add_buf thread %d.%d re-adding buf %p "
		    "with %d bytes %d bcount\n",
		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
		    bp->b_bcount));
	} else {
		/* unlocked dirty buffers shouldn't exist */
		KASSERT(!(bp->b_oflags & BO_DELWRI));
		wl->wl_bufbytes += bp->b_bufsize;
		wl->wl_bcount += bp->b_bcount;
		wl->wl_bufcount++;
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
		    ("wapbl_add_buf thread %d.%d adding buf %p "
		    "with %d bytes %d bcount\n",
		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
		    bp->b_bcount));
	}
	TAILQ_INSERT_TAIL(&wl->wl_bufs, bp, b_wapbllist);
	mutex_exit(&wl->wl_mtx);

	bp->b_flags |= B_LOCKED;
}

static void
wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp)
{

	KASSERT(mutex_owned(&wl->wl_mtx));
	KASSERT(bp->b_cflags & BC_BUSY);
	wapbl_jlock_assert(wl);

#if 0
	/*
	 * XXX this might be an issue for swapfiles.
	 * see uvm_swap.c:1725
	 *
	 * XXXdeux: see above
	 */
	KASSERT((bp->b_flags & BC_NOCACHE) == 0);
#endif
	KASSERT(bp->b_flags & B_LOCKED);

	WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
	   ("wapbl_remove_buf thread %d.%d removing buf %p with "
	    "%d bytes %d bcount\n",
	    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount));

	KASSERT(wl->wl_bufbytes >= bp->b_bufsize);
	wl->wl_bufbytes -= bp->b_bufsize;
	KASSERT(wl->wl_bcount >= bp->b_bcount);
	wl->wl_bcount -= bp->b_bcount;
	KASSERT(wl->wl_bufcount > 0);
	wl->wl_bufcount--;
	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
	TAILQ_REMOVE(&wl->wl_bufs, bp, b_wapbllist);

	bp->b_flags &= ~B_LOCKED;
}

/* called from brelsel() in vfs_bio among other places */
void
wapbl_remove_buf(struct wapbl * wl, struct buf *bp)
{

	mutex_enter(&wl->wl_mtx);
	wapbl_remove_buf_locked(wl, bp);
	mutex_exit(&wl->wl_mtx);
}

void
wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt)
{

	KASSERT(bp->b_cflags & BC_BUSY);

	/*
	 * XXX: why does this depend on B_LOCKED?  otherwise the buf
	 * is not for a transaction?  if so, why is this called in the
	 * first place?
	 */
	if (bp->b_flags & B_LOCKED) {
		mutex_enter(&wl->wl_mtx);
		wl->wl_bufbytes += bp->b_bufsize - oldsz;
		wl->wl_bcount += bp->b_bcount - oldcnt;
		mutex_exit(&wl->wl_mtx);
	}
}

#endif /* _KERNEL */

/****************************************************************/
/* Some utility inlines */

/*
 * wapbl_space_used(avail, head, tail)
 *
 *	Number of bytes used in a circular queue of avail total bytes,
 *	from tail to head.
 */
static inline size_t
wapbl_space_used(size_t avail, off_t head, off_t tail)
{

	if (tail == 0) {
		KASSERT(head == 0);
		return 0;
	}
	return ((head + (avail - 1) - tail) % avail) + 1;
}
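
/*
 * Worked examples (editor's note, assumed numbers, not from the
 * original source), with avail = 100:
 *
 *	tail = 20, head = 50:	((50 + 99 - 20) % 100) + 1 = 30 used
 *	tail = 80, head = 10:	((10 + 99 - 80) % 100) + 1 = 30 used
 *				(the used region wraps past the end)
 *	tail = head = 20:	((20 + 99 - 20) % 100) + 1 = 100 used
 *				(head == tail != 0 means full)
 */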

#ifdef _KERNEL
/*
 * wapbl_advance(size, off, oldoff, delta)
 *
 *	Given a byte offset oldoff into a circular queue of size bytes
 *	starting at off, return a new byte offset oldoff + delta into
 *	the circular queue.
 */
static inline off_t
wapbl_advance(size_t size, size_t off, off_t oldoff, size_t delta)
{
	off_t newoff;

	/* Define acceptable ranges for inputs. */
	KASSERT(delta <= (size_t)size);
	KASSERT((oldoff == 0) || ((size_t)oldoff >= off));
	KASSERT(oldoff < (off_t)(size + off));

	if ((oldoff == 0) && (delta != 0))
		newoff = off + delta;
	else if ((oldoff + delta) < (size + off))
		newoff = oldoff + delta;
	else
		newoff = (oldoff + delta) - size;

	/* Note some interesting axioms */
	KASSERT((delta != 0) || (newoff == oldoff));
	KASSERT((delta == 0) || (newoff != 0));
	KASSERT((delta != (size)) || (newoff == oldoff));

	/* Define acceptable ranges for output. */
	KASSERT((newoff == 0) || ((size_t)newoff >= off));
	KASSERT((size_t)newoff < (size + off));
	return newoff;
}
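
/*
 * Worked examples (editor's note, assumed numbers, not from the
 * original source), with size = 100 and off = 10, so valid nonzero
 * offsets lie in [10, 110):
 *
 *	wapbl_advance(100, 10, 0, 25)   == 10 + 25   = 35
 *					   (0 means empty; start at off)
 *	wapbl_advance(100, 10, 95, 10)  == 95 + 10   = 105
 *	wapbl_advance(100, 10, 105, 10) == 115 - 100 = 15   (wrapped)
 *	wapbl_advance(100, 10, 35, 100) == 135 - 100 = 35   (full lap)
 */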

/*
 * wapbl_space_free(avail, head, tail)
 *
 *	Number of bytes free in a circular queue of avail total bytes,
 *	in which everything from tail to head is used.
 */
static inline size_t
wapbl_space_free(size_t avail, off_t head, off_t tail)
{

	return avail - wapbl_space_used(avail, head, tail);
}

/*
 * wapbl_advance_head(size, off, delta, headp, tailp)
 *
 *	In a circular queue of size bytes starting at off, given the
 *	old head and tail offsets *headp and *tailp, store the new head
 *	and tail offsets in *headp and *tailp resulting from adding
 *	delta bytes of data to the head.
 */
static inline void
wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp,
		   off_t *tailp)
{
	off_t head = *headp;
	off_t tail = *tailp;

	KASSERT(delta <= wapbl_space_free(size, head, tail));
	head = wapbl_advance(size, off, head, delta);
	if ((tail == 0) && (head != 0))
		tail = off;
	*headp = head;
	*tailp = tail;
}

/*
 * wapbl_advance_tail(size, off, delta, headp, tailp)
 *
 *	In a circular queue of size bytes starting at off, given the
 *	old head and tail offsets *headp and *tailp, store the new head
 *	and tail offsets in *headp and *tailp resulting from removing
 *	delta bytes of data from the tail.
 */
static inline void
wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp,
		   off_t *tailp)
{
	off_t head = *headp;
	off_t tail = *tailp;

	KASSERT(delta <= wapbl_space_used(size, head, tail));
	tail = wapbl_advance(size, off, tail, delta);
	if (head == tail) {
		head = tail = 0;
	}
	*headp = head;
	*tailp = tail;
}
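
#if 0
/*
 * Editor's sketch (not part of the original source, compiled out): a
 * full produce/consume cycle over a hypothetical queue of 100 bytes
 * starting at offset 10, showing how the two helpers pair up.
 */
static void
wapbl_advance_example(void)
{
	off_t head = 0, tail = 0;	/* empty log */

	wapbl_advance_head(100, 10, 30, &head, &tail);
	/* head = 40, tail = 10: 30 bytes now in the log */
	wapbl_advance_tail(100, 10, 30, &head, &tail);
	/* head = tail = 0: records reclaimed, log empty again */
	KASSERT(head == 0 && tail == 0);
}
#endif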

/****************************************************************/

/*
 * wapbl_truncate(wl, minfree)
 *
 *	Wait until at least minfree bytes are available in the log.
 *
 *	If it was necessary to wait for writes to complete,
 *	advance the circular queue tail to reflect the new write
 *	completions and issue a write commit to the log.
 *
 *	=> Caller must hold wl->wl_rwlock writer lock.
 */
static int
wapbl_truncate(struct wapbl *wl, size_t minfree)
{
	size_t delta;
	size_t avail;
	off_t head;
	off_t tail;
	int error = 0;

	KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes));
	KASSERT(rw_write_held(&wl->wl_rwlock));

	mutex_enter(&wl->wl_mtx);

	/*
	 * First check to see if we have to do a commit
	 * at all.
	 */
	avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail);
	if (minfree < avail) {
		mutex_exit(&wl->wl_mtx);
		return 0;
	}
	minfree -= avail;
	while ((wl->wl_error_count == 0) &&
	    (wl->wl_reclaimable_bytes < minfree)) {
		WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
		    ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd "
		    "minfree=%zd\n",
		    &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes,
		    minfree));

		cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx);
	}
	if (wl->wl_reclaimable_bytes < minfree) {
		KASSERT(wl->wl_error_count);
		/* XXX maybe get actual error from buffer instead someday? */
		error = EIO;
	}
	head = wl->wl_head;
	tail = wl->wl_tail;
	delta = wl->wl_reclaimable_bytes;

	/* If all of the entries are flushed, then be sure to keep
	 * the reserved bytes reserved.  Watch out for discarded transactions,
	 * which could leave more bytes reserved than are reclaimable.
	 */
	if (SIMPLEQ_EMPTY(&wl->wl_entries) &&
	    (delta >= wl->wl_reserved_bytes)) {
		delta -= wl->wl_reserved_bytes;
	}
	wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head,
	    &tail);
	KDASSERT(wl->wl_reserved_bytes <=
		wapbl_space_used(wl->wl_circ_size, head, tail));
	mutex_exit(&wl->wl_mtx);

	if (error)
		return error;

	/*
	 * This is where head, tail and delta are unprotected
	 * from races against itself or flush.  This is ok since
	 * we only call this routine from inside flush itself.
	 *
	 * XXX: how can it race against itself when accessed only
	 * from behind the write-locked rwlock?
	 */
	error = wapbl_write_commit(wl, head, tail);
	if (error)
		return error;

	wl->wl_head = head;
	wl->wl_tail = tail;

	mutex_enter(&wl->wl_mtx);
	KASSERT(wl->wl_reclaimable_bytes >= delta);
	wl->wl_reclaimable_bytes -= delta;
	mutex_exit(&wl->wl_mtx);
	WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
	    ("wapbl_truncate thread %d.%d truncating %zu bytes\n",
	    curproc->p_pid, curlwp->l_lid, delta));

	return 0;
}
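
/*
 * Numeric illustration of the reserved-bytes handling above (editor's
 * note, assumed values): with wl_reserved_bytes = 512, all entries
 * flushed (wl_entries empty) and wl_reclaimable_bytes = 4096, the
 * tail advances by only delta = 4096 - 512 = 3584 bytes, so the 512
 * reserved bytes (e.g. the preserved unlinked-inode records) always
 * remain in the log.
 */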
1612
1613 /****************************************************************/
1614
1615 void
1616 wapbl_biodone(struct buf *bp)
1617 {
1618 struct wapbl_entry *we = bp->b_private;
1619 struct wapbl *wl = we->we_wapbl;
1620 #ifdef WAPBL_DEBUG_BUFBYTES
1621 const int bufsize = bp->b_bufsize;
1622 #endif
1623
1624 /*
1625 * Handle possible flushing of buffers after log has been
1626 * decomissioned.
1627 */
1628 if (!wl) {
1629 KASSERT(we->we_bufcount > 0);
1630 we->we_bufcount--;
1631 #ifdef WAPBL_DEBUG_BUFBYTES
1632 KASSERT(we->we_unsynced_bufbytes >= bufsize);
1633 we->we_unsynced_bufbytes -= bufsize;
1634 #endif
1635
1636 if (we->we_bufcount == 0) {
1637 #ifdef WAPBL_DEBUG_BUFBYTES
1638 KASSERT(we->we_unsynced_bufbytes == 0);
1639 #endif
1640 pool_put(&wapbl_entry_pool, we);
1641 }
1642
1643 brelse(bp, 0);
1644 return;
1645 }
1646
1647 #ifdef ohbother
1648 KDASSERT(bp->b_oflags & BO_DONE);
1649 KDASSERT(!(bp->b_oflags & BO_DELWRI));
1650 KDASSERT(bp->b_flags & B_ASYNC);
1651 KDASSERT(bp->b_cflags & BC_BUSY);
1652 KDASSERT(!(bp->b_flags & B_LOCKED));
1653 KDASSERT(!(bp->b_flags & B_READ));
1654 KDASSERT(!(bp->b_cflags & BC_INVAL));
1655 KDASSERT(!(bp->b_cflags & BC_NOCACHE));
1656 #endif
1657
1658 if (bp->b_error) {
1659 /*
1660 * If an error occurs, it would be nice to leave the buffer
1661 * as a delayed write on the LRU queue so that we can retry
1662 * it later. But buffercache(9) can't handle dirty buffer
1663 * reuse, so just mark the log permanently errored out.
1664 */
1665 mutex_enter(&wl->wl_mtx);
1666 if (wl->wl_error_count == 0) {
1667 wl->wl_error_count++;
1668 cv_broadcast(&wl->wl_reclaimable_cv);
1669 }
1670 mutex_exit(&wl->wl_mtx);
1671 }
1672
1673 /*
1674 * Make sure that the buf doesn't retain the media flags, so that
1675 * e.g. wapbl_allow_fuadpo has immediate effect on any following I/O.
1676 * The flags will be set again if needed by another I/O.
1677 */
1678 bp->b_flags &= ~B_MEDIA_FLAGS;
1679
1680 /*
1681 * Release the buffer here. wapbl_flush() may wait for the
1682 * log to become empty and we better unbusy the buffer before
1683 * wapbl_flush() returns.
1684 */
1685 brelse(bp, 0);
1686
1687 mutex_enter(&wl->wl_mtx);
1688
1689 KASSERT(we->we_bufcount > 0);
1690 we->we_bufcount--;
1691 #ifdef WAPBL_DEBUG_BUFBYTES
1692 KASSERT(we->we_unsynced_bufbytes >= bufsize);
1693 we->we_unsynced_bufbytes -= bufsize;
1694 KASSERT(wl->wl_unsynced_bufbytes >= bufsize);
1695 wl->wl_unsynced_bufbytes -= bufsize;
1696 #endif
1697 wl->wl_ev_metawrite.ev_count++;
1698
1699 /*
1700 * If the current transaction can be reclaimed, start
1701 * at the beginning and reclaim any consecutive reclaimable
1702 * transactions. If we successfully reclaim anything,
1703 * then wakeup anyone waiting for the reclaim.
1704 */
1705 if (we->we_bufcount == 0) {
1706 size_t delta = 0;
1707 int errcnt = 0;
1708 #ifdef WAPBL_DEBUG_BUFBYTES
1709 KDASSERT(we->we_unsynced_bufbytes == 0);
1710 #endif
1711 /*
1712 * clear any posted error, since the buffer it came from
1713 * has successfully flushed by now
1714 */
1715 while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) &&
1716 (we->we_bufcount == 0)) {
1717 delta += we->we_reclaimable_bytes;
1718 if (we->we_error)
1719 errcnt++;
1720 SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
1721 pool_put(&wapbl_entry_pool, we);
1722 }
1723
1724 if (delta) {
1725 wl->wl_reclaimable_bytes += delta;
1726 KASSERT(wl->wl_error_count >= errcnt);
1727 wl->wl_error_count -= errcnt;
1728 cv_broadcast(&wl->wl_reclaimable_cv);
1729 }
1730 }
1731
1732 mutex_exit(&wl->wl_mtx);
1733 }
1734
1735 /*
1736 * wapbl_flush(wl, wait)
1737 *
1738 * Flush pending block writes, deallocations, and inodes from
1739 * the current transaction in memory to the log on disk:
1740 *
1741 * 1. Call the file system's wl_flush callback to flush any
1742 * per-file-system pending updates.
1743 * 2. Wait for enough space in the log for the current transaction.
1744 * 3. Synchronously write the new log records, advancing the
1745 * circular queue head.
1746 * 4. Issue the pending block writes asynchronously, now that they
1747 * are recorded in the log and can be replayed after crash.
1748 * 5. If wait is true, wait for all writes to complete and for the
1749 * log to become empty.
1750 *
1751 * On failure, call the file system's wl_flush_abort callback.
1752 */
1753 int
1754 wapbl_flush(struct wapbl *wl, int waitfor)
1755 {
1756 struct buf *bp;
1757 struct wapbl_entry *we;
1758 off_t off;
1759 off_t head;
1760 off_t tail;
1761 size_t delta = 0;
1762 size_t flushsize;
1763 size_t reserved;
1764 int error = 0;
1765
1766 /*
1767 * Do a quick check to see if a full flush can be skipped
1768 * This assumes that the flush callback does not need to be called
1769 * unless there are other outstanding bufs.
1770 */
1771 if (!waitfor) {
1772 size_t nbufs;
1773 mutex_enter(&wl->wl_mtx); /* XXX need mutex here to
1774 protect the KASSERTS */
1775 nbufs = wl->wl_bufcount;
1776 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
1777 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
1778 mutex_exit(&wl->wl_mtx);
1779 if (nbufs == 0)
1780 return 0;
1781 }
1782
1783 /*
1784 * XXX we may consider using LK_UPGRADE here
1785 * if we want to call flush from inside a transaction
1786 */
1787 rw_enter(&wl->wl_rwlock, RW_WRITER);
1788 wl->wl_flush(wl->wl_mount, TAILQ_FIRST(&wl->wl_dealloclist));
1789
1790 /*
1791 * Now that we are exclusively locked and the file system has
1792 * issued any deferred block writes for this transaction, check
1793 * whether there are any blocks to write to the log. If not,
1794 * skip waiting for space or writing any log entries.
1795 *
1796 * XXX Shouldn't this also check wl_dealloccnt and
1797 * wl_inohashcnt? Perhaps wl_dealloccnt doesn't matter if the
1798 * file system didn't produce any blocks as a consequence of
1799 * it, but the same does not seem to be so of wl_inohashcnt.
1800 */
1801 if (wl->wl_bufcount == 0) {
1802 goto wait_out;
1803 }
1804
1805 #if 0
1806 WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1807 ("wapbl_flush thread %d.%d flushing entries with "
1808 "bufcount=%zu bufbytes=%zu\n",
1809 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
1810 wl->wl_bufbytes));
1811 #endif
1812
1813 /* Calculate amount of space needed to flush */
1814 flushsize = wapbl_transaction_len(wl);
1815 if (wapbl_verbose_commit) {
1816 struct timespec ts;
1817 getnanotime(&ts);
1818 printf("%s: %lld.%09ld this transaction = %zu bytes\n",
1819 __func__, (long long)ts.tv_sec,
1820 (long)ts.tv_nsec, flushsize);
1821 }
1822
1823 if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
1824 /*
1825 * XXX this could be handled more gracefully, perhaps place
1826 * only a partial transaction in the log and allow the
1827 * remaining to flush without the protection of the journal.
1828 */
1829 panic("wapbl_flush: current transaction too big to flush");
1830 }
1831
1832 error = wapbl_truncate(wl, flushsize);
1833 if (error)
1834 goto out;
1835
1836 off = wl->wl_head;
1837 KASSERT((off == 0) || (off >= wl->wl_circ_off));
1838 KASSERT((off == 0) || (off < wl->wl_circ_off + wl->wl_circ_size));
1839 error = wapbl_write_blocks(wl, &off);
1840 if (error)
1841 goto out;
1842 error = wapbl_write_revocations(wl, &off);
1843 if (error)
1844 goto out;
1845 error = wapbl_write_inodes(wl, &off);
1846 if (error)
1847 goto out;
1848
1849 reserved = 0;
1850 if (wl->wl_inohashcnt)
1851 reserved = wapbl_transaction_inodes_len(wl);
1852
1853 head = wl->wl_head;
1854 tail = wl->wl_tail;
1855
1856 wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize,
1857 &head, &tail);
1858
1859 KASSERTMSG(head == off,
1860 "lost head! head=%"PRIdMAX" tail=%" PRIdMAX
1861 " off=%"PRIdMAX" flush=%zu",
1862 (intmax_t)head, (intmax_t)tail, (intmax_t)off,
1863 flushsize);
1864
1865 /* Opportunistically move the tail forward if we can */
1866 mutex_enter(&wl->wl_mtx);
1867 delta = wl->wl_reclaimable_bytes;
1868 mutex_exit(&wl->wl_mtx);
1869 wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta,
1870 &head, &tail);
1871
1872 error = wapbl_write_commit(wl, head, tail);
1873 if (error)
1874 goto out;
1875
1876 we = pool_get(&wapbl_entry_pool, PR_WAITOK);
1877
1878 #ifdef WAPBL_DEBUG_BUFBYTES
1879 WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1880 ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
1881 " unsynced=%zu"
1882 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
1883 "inodes=%d\n",
1884 curproc->p_pid, curlwp->l_lid, flushsize, delta,
1885 wapbl_space_used(wl->wl_circ_size, head, tail),
1886 wl->wl_unsynced_bufbytes, wl->wl_bufcount,
1887 wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt,
1888 wl->wl_inohashcnt));
1889 #else
1890 WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1891 ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
1892 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
1893 "inodes=%d\n",
1894 curproc->p_pid, curlwp->l_lid, flushsize, delta,
1895 wapbl_space_used(wl->wl_circ_size, head, tail),
1896 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
1897 wl->wl_dealloccnt, wl->wl_inohashcnt));
1898 #endif
1899
1901 mutex_enter(&bufcache_lock);
1902 mutex_enter(&wl->wl_mtx);
1903
1904 wl->wl_reserved_bytes = reserved;
1905 wl->wl_head = head;
1906 wl->wl_tail = tail;
1907 KASSERT(wl->wl_reclaimable_bytes >= delta);
1908 wl->wl_reclaimable_bytes -= delta;
1909 KDASSERT(wl->wl_dealloccnt == 0);
1910 #ifdef WAPBL_DEBUG_BUFBYTES
1911 wl->wl_unsynced_bufbytes += wl->wl_bufbytes;
1912 #endif
1913
1914 we->we_wapbl = wl;
1915 we->we_bufcount = wl->wl_bufcount;
1916 #ifdef WAPBL_DEBUG_BUFBYTES
1917 we->we_unsynced_bufbytes = wl->wl_bufbytes;
1918 #endif
1919 we->we_reclaimable_bytes = flushsize;
1920 we->we_error = 0;
1921 SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries);
1922
1923 /*
	 * This flushes bufs in the order they were queued, so the LRU
1925 * order is preserved.
1926 */
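	/*
	 * Note that bbusy() may drop wl_mtx while it sleeps, so the
	 * queue can change underneath us; if it fails, simply retry
	 * with whatever buf is then first on the queue.
	 */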
1927 while ((bp = TAILQ_FIRST(&wl->wl_bufs)) != NULL) {
1928 if (bbusy(bp, 0, 0, &wl->wl_mtx)) {
1929 continue;
1930 }
1931 bp->b_iodone = wapbl_biodone;
1932 bp->b_private = we;
1933
		/* make sure the block is written synchronously when FUA is in use */
1935 bp->b_flags |= WAPBL_MFLAGS(wl);
1936
1937 bremfree(bp);
1938 wapbl_remove_buf_locked(wl, bp);
1939 mutex_exit(&wl->wl_mtx);
1940 mutex_exit(&bufcache_lock);
1941 bawrite(bp);
1942 mutex_enter(&bufcache_lock);
1943 mutex_enter(&wl->wl_mtx);
1944 }
1945 mutex_exit(&wl->wl_mtx);
1946 mutex_exit(&bufcache_lock);
1947
1948 #if 0
1949 WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1950 ("wapbl_flush thread %d.%d done flushing entries...\n",
1951 curproc->p_pid, curlwp->l_lid));
1952 #endif
1953
1954 wait_out:
1955
1956 /*
1957 * If the waitfor flag is set, don't return until everything is
1958 * fully flushed and the on disk log is empty.
1959 */
1960 if (waitfor) {
1961 error = wapbl_truncate(wl, wl->wl_circ_size -
1962 wl->wl_reserved_bytes);
1963 }
1964
1965 out:
1966 if (error) {
1967 wl->wl_flush_abort(wl->wl_mount,
1968 TAILQ_FIRST(&wl->wl_dealloclist));
1969 }
1970
1971 #ifdef WAPBL_DEBUG_PRINT
1972 if (error) {
1973 pid_t pid = -1;
1974 lwpid_t lid = -1;
1975 if (curproc)
1976 pid = curproc->p_pid;
1977 if (curlwp)
1978 lid = curlwp->l_lid;
1979 mutex_enter(&wl->wl_mtx);
1980 #ifdef WAPBL_DEBUG_BUFBYTES
1981 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1982 ("wapbl_flush: thread %d.%d aborted flush: "
1983 "error = %d\n"
1984 "\tbufcount=%zu bufbytes=%zu bcount=%zu "
1985 "deallocs=%d inodes=%d\n"
1986 "\terrcnt = %d, reclaimable=%zu reserved=%zu "
1987 "unsynced=%zu\n",
1988 pid, lid, error, wl->wl_bufcount,
1989 wl->wl_bufbytes, wl->wl_bcount,
1990 wl->wl_dealloccnt, wl->wl_inohashcnt,
1991 wl->wl_error_count, wl->wl_reclaimable_bytes,
1992 wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes));
1993 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
1994 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1995 ("\tentry: bufcount = %zu, reclaimable = %zu, "
1996 "error = %d, unsynced = %zu\n",
1997 we->we_bufcount, we->we_reclaimable_bytes,
1998 we->we_error, we->we_unsynced_bufbytes));
1999 }
2000 #else
2001 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
2002 ("wapbl_flush: thread %d.%d aborted flush: "
2003 "error = %d\n"
2004 "\tbufcount=%zu bufbytes=%zu bcount=%zu "
2005 "deallocs=%d inodes=%d\n"
2006 "\terrcnt = %d, reclaimable=%zu reserved=%zu\n",
2007 pid, lid, error, wl->wl_bufcount,
2008 wl->wl_bufbytes, wl->wl_bcount,
2009 wl->wl_dealloccnt, wl->wl_inohashcnt,
2010 wl->wl_error_count, wl->wl_reclaimable_bytes,
2011 wl->wl_reserved_bytes));
2012 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
2013 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
2014 ("\tentry: bufcount = %zu, reclaimable = %zu, "
2015 "error = %d\n", we->we_bufcount,
2016 we->we_reclaimable_bytes, we->we_error));
2017 }
2018 #endif
2019 mutex_exit(&wl->wl_mtx);
2020 }
2021 #endif
2022
2023 rw_exit(&wl->wl_rwlock);
2024 return error;
2025 }
2026
2027 /****************************************************************/
2028
2029 void
2030 wapbl_jlock_assert(struct wapbl *wl)
2031 {
2032
2033 KASSERT(rw_lock_held(&wl->wl_rwlock));
2034 }
2035
2036 void
2037 wapbl_junlock_assert(struct wapbl *wl)
2038 {
2039
2040 KASSERT(!rw_write_held(&wl->wl_rwlock));
2041 }
2042
2043 /****************************************************************/
2044
2045 /* locks missing */
2046 void
2047 wapbl_print(struct wapbl *wl,
2048 int full,
2049 void (*pr)(const char *, ...))
2050 {
2051 struct buf *bp;
2052 struct wapbl_entry *we;
2053 (*pr)("wapbl %p", wl);
2054 (*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n",
2055 wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn);
2056 (*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n",
2057 wl->wl_circ_size, wl->wl_circ_off,
2058 (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail);
	(*pr)("log_dev_bshift = %d, fs_dev_bshift = %d\n",
	    wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift);
2061 #ifdef WAPBL_DEBUG_BUFBYTES
2062 (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
2063 "reserved = %zu errcnt = %d unsynced = %zu\n",
2064 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
2065 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
2066 wl->wl_error_count, wl->wl_unsynced_bufbytes);
2067 #else
2068 (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
2069 "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes,
2070 wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
2071 wl->wl_error_count);
2072 #endif
2073 (*pr)("\tdealloccnt = %d, dealloclim = %d\n",
2074 wl->wl_dealloccnt, wl->wl_dealloclim);
2075 (*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n",
2076 wl->wl_inohashcnt, wl->wl_inohashmask);
2077 (*pr)("entries:\n");
2078 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
2079 #ifdef WAPBL_DEBUG_BUFBYTES
2080 (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, "
2081 "unsynced = %zu\n",
2082 we->we_bufcount, we->we_reclaimable_bytes,
2083 we->we_error, we->we_unsynced_bufbytes);
2084 #else
2085 (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n",
2086 we->we_bufcount, we->we_reclaimable_bytes, we->we_error);
2087 #endif
2088 }
2089 if (full) {
2090 int cnt = 0;
2091 (*pr)("bufs =");
2092 TAILQ_FOREACH(bp, &wl->wl_bufs, b_wapbllist) {
2093 if (!TAILQ_NEXT(bp, b_wapbllist)) {
2094 (*pr)(" %p", bp);
2095 } else if ((++cnt % 6) == 0) {
2096 (*pr)(" %p,\n\t", bp);
2097 } else {
2098 (*pr)(" %p,", bp);
2099 }
2100 }
2101 (*pr)("\n");
2102
2103 (*pr)("dealloced blks = ");
2104 {
2105 struct wapbl_dealloc *wd;
2106 cnt = 0;
2107 TAILQ_FOREACH(wd, &wl->wl_dealloclist, wd_entries) {
2108 (*pr)(" %"PRId64":%d,",
2109 wd->wd_blkno,
2110 wd->wd_len);
2111 if ((++cnt % 4) == 0) {
2112 (*pr)("\n\t");
2113 }
2114 }
2115 }
2116 (*pr)("\n");
2117
2118 (*pr)("registered inodes = ");
2119 {
2120 int i;
2121 cnt = 0;
2122 for (i = 0; i <= wl->wl_inohashmask; i++) {
2123 struct wapbl_ino_head *wih;
2124 struct wapbl_ino *wi;
2125
2126 wih = &wl->wl_inohash[i];
2127 LIST_FOREACH(wi, wih, wi_hash) {
2128 if (wi->wi_ino == 0)
2129 continue;
2130 (*pr)(" %"PRIu64"/0%06"PRIo32",",
2131 wi->wi_ino, wi->wi_mode);
2132 if ((++cnt % 4) == 0) {
2133 (*pr)("\n\t");
2134 }
2135 }
2136 }
2137 (*pr)("\n");
2138 }
2139
		cnt = 0;
		(*pr)("iobufs free =");
2141 TAILQ_FOREACH(bp, &wl->wl_iobufs, b_wapbllist) {
2142 if (!TAILQ_NEXT(bp, b_wapbllist)) {
2143 (*pr)(" %p", bp);
2144 } else if ((++cnt % 6) == 0) {
2145 (*pr)(" %p,\n\t", bp);
2146 } else {
2147 (*pr)(" %p,", bp);
2148 }
2149 }
2150 (*pr)("\n");
2151
		cnt = 0;
		(*pr)("iobufs busy =");
2153 TAILQ_FOREACH(bp, &wl->wl_iobufs_busy, b_wapbllist) {
2154 if (!TAILQ_NEXT(bp, b_wapbllist)) {
2155 (*pr)(" %p", bp);
2156 } else if ((++cnt % 6) == 0) {
2157 (*pr)(" %p,\n\t", bp);
2158 } else {
2159 (*pr)(" %p,", bp);
2160 }
2161 }
2162 (*pr)("\n");
2163 }
2164 }
2165
2166 #if defined(WAPBL_DEBUG) || defined(DDB)
2167 void
2168 wapbl_dump(struct wapbl *wl)
2169 {
2170 #if defined(WAPBL_DEBUG)
2171 if (!wl)
2172 wl = wapbl_debug_wl;
2173 #endif
2174 if (!wl)
2175 return;
2176 wapbl_print(wl, 1, printf);
2177 }
2178 #endif
2179
2180 /****************************************************************/
2181
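/*
 * wapbl_register_deallocation(wl, blk, len, force, cookiep)
 *
 * Record that the blocks [blk, blk+len) are being deallocated, so
 * that a revocation record is written with the next commit.  Fails
 * with EAGAIN when the per-transaction limit has been reached, unless
 * force is set.  If cookiep is not NULL, a cookie is returned that
 * can later be passed to wapbl_unregister_deallocation().
 */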
2182 int
2183 wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len, bool force,
2184 void **cookiep)
2185 {
2186 struct wapbl_dealloc *wd;
2187 int error = 0;
2188
2189 wapbl_jlock_assert(wl);
2190
2191 mutex_enter(&wl->wl_mtx);
2192
2193 if (__predict_false(wl->wl_dealloccnt >= wl->wl_dealloclim)) {
2194 if (!force) {
2195 error = EAGAIN;
2196 goto out;
2197 }
2198
		/*
		 * Forced registration can only be used when:
		 * 1) the caller can't cope with failure
		 * 2) the path can be triggered only a bounded, small
		 *    number of times per transaction
		 * If these conditions are not fulfilled and the path is
		 * triggered many times, this could overflow the maximum
		 * transaction size and panic later.
		 */
2208 printf("%s: forced dealloc registration over limit: %d >= %d\n",
2209 wl->wl_mount->mnt_stat.f_mntonname,
2210 wl->wl_dealloccnt, wl->wl_dealloclim);
2211 }
2212
2213 wl->wl_dealloccnt++;
2214 mutex_exit(&wl->wl_mtx);
2215
2216 wd = pool_get(&wapbl_dealloc_pool, PR_WAITOK);
2217 wd->wd_blkno = blk;
2218 wd->wd_len = len;
2219
2220 mutex_enter(&wl->wl_mtx);
2221 TAILQ_INSERT_TAIL(&wl->wl_dealloclist, wd, wd_entries);
2222
2223 if (cookiep)
2224 *cookiep = wd;
2225
2226 out:
2227 mutex_exit(&wl->wl_mtx);
2228
2229 WAPBL_PRINTF(WAPBL_PRINT_ALLOC,
2230 ("wapbl_register_deallocation: blk=%"PRId64" len=%d error=%d\n",
2231 blk, len, error));
2232
2233 return error;
2234 }
2235
2236 static void
2237 wapbl_deallocation_free(struct wapbl *wl, struct wapbl_dealloc *wd,
2238 bool locked)
2239 {
2240 KASSERT(!locked
2241 || rw_lock_held(&wl->wl_rwlock) || mutex_owned(&wl->wl_mtx));
2242
2243 if (!locked)
2244 mutex_enter(&wl->wl_mtx);
2245
2246 TAILQ_REMOVE(&wl->wl_dealloclist, wd, wd_entries);
2247 wl->wl_dealloccnt--;
2248
2249 if (!locked)
2250 mutex_exit(&wl->wl_mtx);
2251
2252 pool_put(&wapbl_dealloc_pool, wd);
2253 }
2254
2255 void
2256 wapbl_unregister_deallocation(struct wapbl *wl, void *cookie)
2257 {
2258 KASSERT(cookie != NULL);
2259 wapbl_deallocation_free(wl, cookie, false);
2260 }
2261
2262 /****************************************************************/
2263
2264 static void
2265 wapbl_inodetrk_init(struct wapbl *wl, u_int size)
2266 {
2267
2268 wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask);
2269 if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) {
2270 pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0,
2271 "wapblinopl", &pool_allocator_nointr, IPL_NONE);
2272 }
2273 }
2274
2275 static void
2276 wapbl_inodetrk_free(struct wapbl *wl)
2277 {
2278
2279 /* XXX this KASSERT needs locking/mutex analysis */
2280 KASSERT(wl->wl_inohashcnt == 0);
2281 hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask);
2282 if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) {
2283 pool_destroy(&wapbl_ino_pool);
2284 }
2285 }
2286
2287 static struct wapbl_ino *
2288 wapbl_inodetrk_get(struct wapbl *wl, ino_t ino)
2289 {
2290 struct wapbl_ino_head *wih;
2291 struct wapbl_ino *wi;
2292
2293 KASSERT(mutex_owned(&wl->wl_mtx));
2294
2295 wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
2296 LIST_FOREACH(wi, wih, wi_hash) {
2297 if (ino == wi->wi_ino)
2298 return wi;
2299 }
2300 return 0;
2301 }
2302
2303 void
2304 wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode)
2305 {
2306 struct wapbl_ino_head *wih;
2307 struct wapbl_ino *wi;
2308
2309 wi = pool_get(&wapbl_ino_pool, PR_WAITOK);
2310
2311 mutex_enter(&wl->wl_mtx);
2312 if (wapbl_inodetrk_get(wl, ino) == NULL) {
2313 wi->wi_ino = ino;
2314 wi->wi_mode = mode;
2315 wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
2316 LIST_INSERT_HEAD(wih, wi, wi_hash);
2317 wl->wl_inohashcnt++;
2318 WAPBL_PRINTF(WAPBL_PRINT_INODE,
2319 ("wapbl_register_inode: ino=%"PRId64"\n", ino));
2320 mutex_exit(&wl->wl_mtx);
2321 } else {
2322 mutex_exit(&wl->wl_mtx);
2323 pool_put(&wapbl_ino_pool, wi);
2324 }
2325 }
2326
2327 void
2328 wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode)
2329 {
2330 struct wapbl_ino *wi;
2331
2332 mutex_enter(&wl->wl_mtx);
2333 wi = wapbl_inodetrk_get(wl, ino);
2334 if (wi) {
2335 WAPBL_PRINTF(WAPBL_PRINT_INODE,
2336 ("wapbl_unregister_inode: ino=%"PRId64"\n", ino));
2337 KASSERT(wl->wl_inohashcnt > 0);
2338 wl->wl_inohashcnt--;
2339 LIST_REMOVE(wi, wi_hash);
2340 mutex_exit(&wl->wl_mtx);
2341
2342 pool_put(&wapbl_ino_pool, wi);
2343 } else {
2344 mutex_exit(&wl->wl_mtx);
2345 }
2346 }
2347
2348 /****************************************************************/
2349
2350 /*
2351 * wapbl_transaction_inodes_len(wl)
2352 *
2353 * Calculate the number of bytes required for inode registration
2354 * log records in wl.
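 *
 * For example, with a 512-byte log block and assuming, purely for
 * illustration, a 16-byte wapbl_wc_inodelist header followed by
 * 12-byte wc_inodes entries, one block describes
 * iph = (512 - 16) / 12 = 41 inodes, so 100 registered inodes need
 * howmany(100, 41) = 3 blocks, i.e. 1536 bytes.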
2355 */
2356 static inline size_t
2357 wapbl_transaction_inodes_len(struct wapbl *wl)
2358 {
2359 int blocklen = 1<<wl->wl_log_dev_bshift;
2360 int iph;
2361
	/* Calculate number of inodes described in an inodelist header */
2363 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
2364 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
2365
2366 KASSERT(iph > 0);
2367
2368 return MAX(1, howmany(wl->wl_inohashcnt, iph)) * blocklen;
2369 }
2370
2371
2372 /*
2373 * wapbl_transaction_len(wl)
2374 *
2375 * Calculate number of bytes required for all log records in wl.
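 *
 * This is the sum of:
 * - the raw data bytes of all queued bufs (wl_bcount),
 * - one blocklist header block per wl_brperjblock bufs,
 * - one revocation block per wl_brperjblock deallocations, and
 * - the inode registration records.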
2376 */
2377 static size_t
2378 wapbl_transaction_len(struct wapbl *wl)
2379 {
2380 int blocklen = 1<<wl->wl_log_dev_bshift;
2381 size_t len;
2382
	/* Sum the data bytes plus the blocklist, revocation and inode records */
2384 len = wl->wl_bcount;
2385 len += howmany(wl->wl_bufcount, wl->wl_brperjblock) * blocklen;
2386 len += howmany(wl->wl_dealloccnt, wl->wl_brperjblock) * blocklen;
2387 len += wapbl_transaction_inodes_len(wl);
2388
2389 return len;
2390 }
2391
2392 /*
2393 * wapbl_cache_sync(wl, msg)
2394 *
2395 * Issue DIOCCACHESYNC to wl->wl_devvp.
2396 *
2397 * If sysctl(vfs.wapbl.verbose_commit) >= 2, print a message
2398 * including msg about the duration of the cache sync.
2399 */
2400 static int
2401 wapbl_cache_sync(struct wapbl *wl, const char *msg)
2402 {
2403 const bool verbose = wapbl_verbose_commit >= 2;
2404 struct bintime start_time;
2405 int force = 1;
2406 int error;
2407
2408 /* Skip full cache sync if disabled, or when using FUA */
2409 if (!wapbl_flush_disk_cache || WAPBL_USE_FUA(wl)) {
2410 return 0;
2411 }
2412 if (verbose) {
2413 bintime(&start_time);
2414 }
2415 error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force,
2416 FWRITE, FSCRED);
2417 if (error) {
2418 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
2419 ("wapbl_cache_sync: DIOCCACHESYNC on dev 0x%jx "
2420 "returned %d\n", (uintmax_t)wl->wl_devvp->v_rdev, error));
2421 }
2422 if (verbose) {
2423 struct bintime d;
2424 struct timespec ts;
2425
2426 bintime(&d);
2427 bintime_sub(&d, &start_time);
2428 bintime2timespec(&d, &ts);
		printf("wapbl_cache_sync: %s: dev 0x%jx %ju.%09ld\n",
		    msg, (uintmax_t)wl->wl_devvp->v_rdev,
		    (uintmax_t)ts.tv_sec, ts.tv_nsec);
2432 }
2433
2434 wl->wl_ev_cacheflush.ev_count++;
2435
2436 return error;
2437 }
2438
2439 /*
2440 * wapbl_write_commit(wl, head, tail)
2441 *
2442 * Issue a disk cache sync to wait for all pending writes to the
2443 * log to complete, and then synchronously commit the current
2444 * circular queue head and tail to the log, in the next of two
2445 * locations for commit headers on disk.
2446 *
2447 * Increment the generation number. If the generation number
2448 * rolls over to zero, then a subsequent commit would appear to
2449 * have an older generation than this one -- in that case, issue a
2450 * duplicate commit to avoid this.
2451 *
2452 * => Caller must have exclusive access to wl, either by holding
2453 * wl->wl_rwlock for writer or by being wapbl_start before anyone
2454 * else has seen wl.
2455 */
2456 static int
2457 wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail)
2458 {
2459 struct wapbl_wc_header *wc = wl->wl_wc_header;
2460 struct timespec ts;
2461 int error;
2462 daddr_t pbn;
2463
2464 error = wapbl_buffered_flush(wl, true);
2465 if (error)
2466 return error;
2467 /*
2468 * flush disk cache to ensure that blocks we've written are actually
2469 * written to the stable storage before the commit header.
2470 *
	 * XXX A checksum should be calculated here; instead we rely
	 * on the cache flush ordering for now.
2472 */
2473 wapbl_cache_sync(wl, "1");
2474
2475 wc->wc_head = head;
2476 wc->wc_tail = tail;
2477 wc->wc_checksum = 0;
2478 wc->wc_version = 1;
2479 getnanotime(&ts);
2480 wc->wc_time = ts.tv_sec;
2481 wc->wc_timensec = ts.tv_nsec;
2482
2483 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
	    ("wapbl_write_commit: head = %"PRIdMAX" tail = %"PRIdMAX"\n",
2485 (intmax_t)head, (intmax_t)tail));
2486
2487 /*
2488 * write the commit header.
2489 *
	 * XXX if the generation number is about to roll over, first zero
	 * out the second commit header before writing both headers.
2492 */
2493
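	/*
	 * The two on-disk commit header slots alternate by generation:
	 * even generations go to the first slot, odd to the second.
	 * Replay reads both and uses whichever carries the larger
	 * generation, so an interrupted commit always leaves one
	 * intact header behind.
	 */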
2494 pbn = wl->wl_logpbn + (wc->wc_generation % 2);
2495 #ifdef _KERNEL
2496 pbn = btodb(pbn << wc->wc_log_dev_bshift);
2497 #endif
2498 error = wapbl_buffered_write(wc, wc->wc_len, wl, pbn);
2499 if (error)
2500 return error;
2501 error = wapbl_buffered_flush(wl, true);
2502 if (error)
2503 return error;
2504
2505 /*
2506 * flush disk cache to ensure that the commit header is actually
2507 * written before meta data blocks.
2508 */
2509 wapbl_cache_sync(wl, "2");
2510
2511 /*
2512 * If the generation number was zero, write it out a second time.
	 * This handles initialization and generation number rollover.
2514 */
2515 if (wc->wc_generation++ == 0) {
2516 error = wapbl_write_commit(wl, head, tail);
2517 /*
	 * This panic could be removed if we did the zeroing
	 * mentioned above and were certain to roll back the
	 * generation number on failure.
2521 */
2522 if (error)
2523 panic("wapbl_write_commit: error writing duplicate "
2524 "log header: %d", error);
2525 }
2526
2527 wl->wl_ev_commit.ev_count++;
2528
2529 return 0;
2530 }
2531
2532 /*
2533 * wapbl_write_blocks(wl, offp)
2534 *
2535 * Write all pending physical blocks in the current transaction
2536 * from wapbl_add_buf to the log on disk, adding to the circular
2537 * queue head at byte offset *offp, and returning the new head's
2538 * byte offset in *offp.
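 *
 * Each log record is a blocklist header block describing up to
 * wl_brperjblock bufs, followed by those bufs' data, zero-padded to
 * a log device block boundary.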
2539 */
2540 static int
2541 wapbl_write_blocks(struct wapbl *wl, off_t *offp)
2542 {
2543 struct wapbl_wc_blocklist *wc =
2544 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
2545 int blocklen = 1<<wl->wl_log_dev_bshift;
2546 struct buf *bp;
2547 off_t off = *offp;
2548 int error;
2549 size_t padding;
2550
2551 KASSERT(rw_write_held(&wl->wl_rwlock));
2552
2553 bp = TAILQ_FIRST(&wl->wl_bufs);
2554
2555 while (bp) {
2556 int cnt;
2557 struct buf *obp = bp;
2558
2559 KASSERT(bp->b_flags & B_LOCKED);
2560
2561 wc->wc_type = WAPBL_WC_BLOCKS;
2562 wc->wc_len = blocklen;
2563 wc->wc_blkcount = 0;
2564 while (bp && (wc->wc_blkcount < wl->wl_brperjblock)) {
2565 /*
2566 * Make sure all the physical block numbers are up to
2567 * date. If this is not always true on a given
2568 * filesystem, then VOP_BMAP must be called. We
2569 * could call VOP_BMAP here, or else in the filesystem
2570 * specific flush callback, although neither of those
2571 * solutions allow us to take the vnode lock. If a
2572 * filesystem requires that we must take the vnode lock
2573 * to call VOP_BMAP, then we can probably do it in
2574 * bwrite when the vnode lock should already be held
2575 * by the invoking code.
2576 */
2577 KASSERT((bp->b_vp->v_type == VBLK) ||
2578 (bp->b_blkno != bp->b_lblkno));
2579 KASSERT(bp->b_blkno > 0);
2580
2581 wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno;
2582 wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount;
2583 wc->wc_len += bp->b_bcount;
2584 wc->wc_blkcount++;
2585 bp = TAILQ_NEXT(bp, b_wapbllist);
2586 }
2587 if (wc->wc_len % blocklen != 0) {
2588 padding = blocklen - wc->wc_len % blocklen;
2589 wc->wc_len += padding;
2590 } else {
2591 padding = 0;
2592 }
2593
2594 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2595 ("wapbl_write_blocks: len = %u (padding %zu) off = %"PRIdMAX"\n",
2596 wc->wc_len, padding, (intmax_t)off));
2597
2598 error = wapbl_circ_write(wl, wc, blocklen, &off);
2599 if (error)
2600 return error;
2601 bp = obp;
2602 cnt = 0;
2603 while (bp && (cnt++ < wl->wl_brperjblock)) {
2604 error = wapbl_circ_write(wl, bp->b_data,
2605 bp->b_bcount, &off);
2606 if (error)
2607 return error;
2608 bp = TAILQ_NEXT(bp, b_wapbllist);
2609 }
2610 if (padding) {
2611 void *zero;
2612
2613 zero = wapbl_alloc(padding);
2614 memset(zero, 0, padding);
2615 error = wapbl_circ_write(wl, zero, padding, &off);
2616 wapbl_free(zero, padding);
2617 if (error)
2618 return error;
2619 }
2620 }
2621 *offp = off;
2622 return 0;
2623 }
2624
2625 /*
2626 * wapbl_write_revocations(wl, offp)
2627 *
2628 * Write all pending deallocations in the current transaction from
2629 * wapbl_register_deallocation to the log on disk, adding to the
2630 * circular queue's head at byte offset *offp, and returning the
2631 * new head's byte offset in *offp.
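 *
 * As with the block records, revocations are written in batches of
 * up to wl_brperjblock per header block; each deallocation record is
 * freed as soon as it has been written to the log.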
2632 */
2633 static int
2634 wapbl_write_revocations(struct wapbl *wl, off_t *offp)
2635 {
2636 struct wapbl_wc_blocklist *wc =
2637 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
2638 struct wapbl_dealloc *wd, *lwd;
2639 int blocklen = 1<<wl->wl_log_dev_bshift;
2640 off_t off = *offp;
2641 int error;
2642
2643 KASSERT(rw_write_held(&wl->wl_rwlock));
2644
2645 if (wl->wl_dealloccnt == 0)
2646 return 0;
2647
2648 while ((wd = TAILQ_FIRST(&wl->wl_dealloclist)) != NULL) {
2649 wc->wc_type = WAPBL_WC_REVOCATIONS;
2650 wc->wc_len = blocklen;
2651 wc->wc_blkcount = 0;
2652 while (wd && (wc->wc_blkcount < wl->wl_brperjblock)) {
2653 wc->wc_blocks[wc->wc_blkcount].wc_daddr =
2654 wd->wd_blkno;
2655 wc->wc_blocks[wc->wc_blkcount].wc_dlen =
2656 wd->wd_len;
2657 wc->wc_blkcount++;
2658
2659 wd = TAILQ_NEXT(wd, wd_entries);
2660 }
2661 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2662 ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n",
2663 wc->wc_len, (intmax_t)off));
2664 error = wapbl_circ_write(wl, wc, blocklen, &off);
2665 if (error)
2666 return error;
2667
2668 /* free all successfully written deallocs */
2669 lwd = wd;
2670 while ((wd = TAILQ_FIRST(&wl->wl_dealloclist)) != NULL) {
2671 if (wd == lwd)
2672 break;
2673 wapbl_deallocation_free(wl, wd, true);
2674 }
2675 }
2676 *offp = off;
2677 return 0;
2678 }
2679
2680 /*
2681 * wapbl_write_inodes(wl, offp)
2682 *
2683 * Write all pending inode allocations in the current transaction
2684 * from wapbl_register_inode to the log on disk, adding to the
2685 * circular queue's head at byte offset *offp and returning the
2686 * new head's byte offset in *offp.
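 *
 * The first record written in each flush has wc_clear set, which
 * tells replay to discard any previously accumulated inode list
 * before applying this one.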
2687 */
2688 static int
2689 wapbl_write_inodes(struct wapbl *wl, off_t *offp)
2690 {
2691 struct wapbl_wc_inodelist *wc =
2692 (struct wapbl_wc_inodelist *)wl->wl_wc_scratch;
2693 int i;
2694 int blocklen = 1 << wl->wl_log_dev_bshift;
2695 off_t off = *offp;
2696 int error;
2697
2698 struct wapbl_ino_head *wih;
2699 struct wapbl_ino *wi;
2700 int iph;
2701
2702 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
2703 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
2704
2705 i = 0;
2706 wih = &wl->wl_inohash[0];
2707 wi = 0;
2708 do {
2709 wc->wc_type = WAPBL_WC_INODES;
2710 wc->wc_len = blocklen;
2711 wc->wc_inocnt = 0;
2712 wc->wc_clear = (i == 0);
2713 while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) {
2714 while (!wi) {
2715 KASSERT((wih - &wl->wl_inohash[0])
2716 <= wl->wl_inohashmask);
2717 wi = LIST_FIRST(wih++);
2718 }
2719 wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino;
2720 wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode;
2721 wc->wc_inocnt++;
2722 i++;
2723 wi = LIST_NEXT(wi, wi_hash);
2724 }
2725 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2726 ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n",
2727 wc->wc_len, (intmax_t)off));
2728 error = wapbl_circ_write(wl, wc, blocklen, &off);
2729 if (error)
2730 return error;
2731 } while (i < wl->wl_inohashcnt);
2732
2733 *offp = off;
2734 return 0;
2735 }
2736
2737 #endif /* _KERNEL */
2738
2739 /****************************************************************/
2740
2741 struct wapbl_blk {
2742 LIST_ENTRY(wapbl_blk) wb_hash;
2743 daddr_t wb_blk;
2744 off_t wb_off; /* Offset of this block in the log */
2745 };
2746 #define WAPBL_BLKPOOL_MIN 83
2747
2748 static void
2749 wapbl_blkhash_init(struct wapbl_replay *wr, u_int size)
2750 {
2751 if (size < WAPBL_BLKPOOL_MIN)
2752 size = WAPBL_BLKPOOL_MIN;
2753 KASSERT(wr->wr_blkhash == 0);
2754 #ifdef _KERNEL
2755 wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask);
2756 #else /* ! _KERNEL */
2757 /* Manually implement hashinit */
2758 {
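		/*
		 * As in the kernel's hashinit(), round the size up to a
		 * power of two so a bucket can be selected by masking
		 * with (hashsize - 1).
		 */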
2759 unsigned long i, hashsize;
2760 for (hashsize = 1; hashsize < size; hashsize <<= 1)
2761 continue;
2762 wr->wr_blkhash = wapbl_alloc(hashsize * sizeof(*wr->wr_blkhash));
2763 for (i = 0; i < hashsize; i++)
2764 LIST_INIT(&wr->wr_blkhash[i]);
2765 wr->wr_blkhashmask = hashsize - 1;
2766 }
2767 #endif /* ! _KERNEL */
2768 }
2769
2770 static void
2771 wapbl_blkhash_free(struct wapbl_replay *wr)
2772 {
2773 KASSERT(wr->wr_blkhashcnt == 0);
2774 #ifdef _KERNEL
2775 hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask);
2776 #else /* ! _KERNEL */
2777 wapbl_free(wr->wr_blkhash,
2778 (wr->wr_blkhashmask + 1) * sizeof(*wr->wr_blkhash));
2779 #endif /* ! _KERNEL */
2780 }
2781
2782 static struct wapbl_blk *
2783 wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk)
2784 {
2785 struct wapbl_blk_head *wbh;
2786 struct wapbl_blk *wb;
2787 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2788 LIST_FOREACH(wb, wbh, wb_hash) {
2789 if (blk == wb->wb_blk)
2790 return wb;
2791 }
2792 return 0;
2793 }
2794
2795 static void
2796 wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off)
2797 {
2798 struct wapbl_blk_head *wbh;
2799 struct wapbl_blk *wb;
2800 wb = wapbl_blkhash_get(wr, blk);
2801 if (wb) {
2802 KASSERT(wb->wb_blk == blk);
2803 wb->wb_off = off;
2804 } else {
2805 wb = wapbl_alloc(sizeof(*wb));
2806 wb->wb_blk = blk;
2807 wb->wb_off = off;
2808 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2809 LIST_INSERT_HEAD(wbh, wb, wb_hash);
2810 wr->wr_blkhashcnt++;
2811 }
2812 }
2813
2814 static void
2815 wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk)
2816 {
2817 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2818 if (wb) {
2819 KASSERT(wr->wr_blkhashcnt > 0);
2820 wr->wr_blkhashcnt--;
2821 LIST_REMOVE(wb, wb_hash);
2822 wapbl_free(wb, sizeof(*wb));
2823 }
2824 }
2825
2826 static void
2827 wapbl_blkhash_clear(struct wapbl_replay *wr)
2828 {
2829 unsigned long i;
2830 for (i = 0; i <= wr->wr_blkhashmask; i++) {
2831 struct wapbl_blk *wb;
2832
2833 while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) {
2834 KASSERT(wr->wr_blkhashcnt > 0);
2835 wr->wr_blkhashcnt--;
2836 LIST_REMOVE(wb, wb_hash);
2837 wapbl_free(wb, sizeof(*wb));
2838 }
2839 }
2840 KASSERT(wr->wr_blkhashcnt == 0);
2841 }
2842
2843 /****************************************************************/
2844
2845 /*
2846 * wapbl_circ_read(wr, data, len, offp)
2847 *
2848 * Read len bytes into data from the circular queue of wr,
2849 * starting at the linear byte offset *offp, and returning the new
2850 * linear byte offset in *offp.
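 *
 * For example (illustrative numbers only): with wr_circ_off = 1024
 * and wr_circ_size = 8192 the log data region is [1024, 9216).  A
 * 2048-byte read starting at *offp = 8704 takes 512 bytes from
 * [8704, 9216), wraps to 1024 for the remaining 1536 bytes, and
 * leaves *offp at 2560.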
2851 *
2852 * If the starting linear byte offset precedes wr->wr_circ_off,
2853 * the read instead begins at wr->wr_circ_off. XXX WTF? This
2854 * should be a KASSERT, not a conditional.
2855 */
2856 static int
2857 wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp)
2858 {
2859 size_t slen;
2860 off_t off = *offp;
2861 int error;
2862 daddr_t pbn;
2863
2864 KASSERT(((len >> wr->wr_log_dev_bshift) <<
2865 wr->wr_log_dev_bshift) == len);
2866
2867 if (off < wr->wr_circ_off)
2868 off = wr->wr_circ_off;
2869 slen = wr->wr_circ_off + wr->wr_circ_size - off;
2870 if (slen < len) {
2871 pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
2872 #ifdef _KERNEL
2873 pbn = btodb(pbn << wr->wr_log_dev_bshift);
2874 #endif
2875 error = wapbl_read(data, slen, wr->wr_devvp, pbn);
2876 if (error)
2877 return error;
2878 data = (uint8_t *)data + slen;
2879 len -= slen;
2880 off = wr->wr_circ_off;
2881 }
2882 pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
2883 #ifdef _KERNEL
2884 pbn = btodb(pbn << wr->wr_log_dev_bshift);
2885 #endif
2886 error = wapbl_read(data, len, wr->wr_devvp, pbn);
2887 if (error)
2888 return error;
2889 off += len;
2890 if (off >= wr->wr_circ_off + wr->wr_circ_size)
2891 off = wr->wr_circ_off;
2892 *offp = off;
2893 return 0;
2894 }
2895
2896 /*
2897 * wapbl_circ_advance(wr, len, offp)
2898 *
2899 * Compute the linear byte offset of the circular queue of wr that
2900 * is len bytes past *offp, and store it in *offp.
2901 *
 * This is like wapbl_circ_read, but without actually reading
2903 * anything.
2904 *
2905 * If the starting linear byte offset precedes wr->wr_circ_off, it
2906 * is taken to be wr->wr_circ_off instead. XXX WTF? This should
2907 * be a KASSERT, not a conditional.
2908 */
2909 static void
2910 wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp)
2911 {
2912 size_t slen;
2913 off_t off = *offp;
2914
2915 KASSERT(((len >> wr->wr_log_dev_bshift) <<
2916 wr->wr_log_dev_bshift) == len);
2917
2918 if (off < wr->wr_circ_off)
2919 off = wr->wr_circ_off;
2920 slen = wr->wr_circ_off + wr->wr_circ_size - off;
2921 if (slen < len) {
2922 len -= slen;
2923 off = wr->wr_circ_off;
2924 }
2925 off += len;
2926 if (off >= wr->wr_circ_off + wr->wr_circ_size)
2927 off = wr->wr_circ_off;
2928 *offp = off;
2929 }
2930
2931 /****************************************************************/
2932
2933 int
2934 wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp,
2935 daddr_t off, size_t count, size_t blksize)
2936 {
2937 struct wapbl_replay *wr;
2938 int error;
2939 struct vnode *devvp;
2940 daddr_t logpbn;
2941 uint8_t *scratch;
2942 struct wapbl_wc_header *wch;
2943 struct wapbl_wc_header *wch2;
2944 /* Use this until we read the actual log header */
2945 int log_dev_bshift = ilog2(blksize);
2946 size_t used;
2947 daddr_t pbn;
2948
2949 WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
2950 ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n",
2951 vp, off, count, blksize));
2952
2953 if (off < 0)
2954 return EINVAL;
2955
2956 if (blksize < DEV_BSIZE)
2957 return EINVAL;
2958 if (blksize % DEV_BSIZE)
2959 return EINVAL;
2960
2961 #ifdef _KERNEL
2962 #if 0
2963 /* XXX vp->v_size isn't reliably set for VBLK devices,
2964 * especially root. However, we might still want to verify
2965 * that the full load is readable */
2966 if ((off + count) * blksize > vp->v_size)
2967 return EINVAL;
2968 #endif
2969 if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) {
2970 return error;
2971 }
2972 #else /* ! _KERNEL */
2973 devvp = vp;
2974 logpbn = off;
2975 #endif /* ! _KERNEL */
2976
2977 scratch = wapbl_alloc(MAXBSIZE);
2978
2979 pbn = logpbn;
2980 #ifdef _KERNEL
2981 pbn = btodb(pbn << log_dev_bshift);
2982 #endif
2983 error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, pbn);
2984 if (error)
2985 goto errout;
2986
2987 wch = (struct wapbl_wc_header *)scratch;
2988 wch2 =
2989 (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift));
2990 /* XXX verify checksums and magic numbers */
2991 if (wch->wc_type != WAPBL_WC_HEADER) {
2992 printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type);
2993 error = EFTYPE;
2994 goto errout;
2995 }
2996
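	/* The header with the larger generation is the newer commit. */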
2997 if (wch2->wc_generation > wch->wc_generation)
2998 wch = wch2;
2999
3000 wr = wapbl_calloc(1, sizeof(*wr));
3001
3002 wr->wr_logvp = vp;
3003 wr->wr_devvp = devvp;
3004 wr->wr_logpbn = logpbn;
3005
3006 wr->wr_scratch = scratch;
3007
3008 wr->wr_log_dev_bshift = wch->wc_log_dev_bshift;
3009 wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift;
3010 wr->wr_circ_off = wch->wc_circ_off;
3011 wr->wr_circ_size = wch->wc_circ_size;
3012 wr->wr_generation = wch->wc_generation;
3013
3014 used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail);
3015
3016 WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
3017 ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64
3018 " len=%"PRId64" used=%zu\n",
3019 wch->wc_head, wch->wc_tail, wch->wc_circ_off,
3020 wch->wc_circ_size, used));
3021
3022 wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift));
3023
3024 error = wapbl_replay_process(wr, wch->wc_head, wch->wc_tail);
3025 if (error) {
3026 wapbl_replay_stop(wr);
3027 wapbl_replay_free(wr);
3028 return error;
3029 }
3030
3031 *wrp = wr;
3032 return 0;
3033
3034 errout:
3035 wapbl_free(scratch, MAXBSIZE);
3036 return error;
3037 }
3038
3039 void
3040 wapbl_replay_stop(struct wapbl_replay *wr)
3041 {
3042
3043 if (!wapbl_replay_isopen(wr))
3044 return;
3045
3046 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n"));
3047
3048 wapbl_free(wr->wr_scratch, MAXBSIZE);
3049 wr->wr_scratch = NULL;
3050
3051 wr->wr_logvp = NULL;
3052
3053 wapbl_blkhash_clear(wr);
3054 wapbl_blkhash_free(wr);
3055 }
3056
3057 void
3058 wapbl_replay_free(struct wapbl_replay *wr)
3059 {
3060
3061 KDASSERT(!wapbl_replay_isopen(wr));
3062
3063 if (wr->wr_inodes)
3064 wapbl_free(wr->wr_inodes,
3065 wr->wr_inodescnt * sizeof(wr->wr_inodes[0]));
3066 wapbl_free(wr, sizeof(*wr));
3067 }
3068
3069 #ifdef _KERNEL
3070 int
3071 wapbl_replay_isopen1(struct wapbl_replay *wr)
3072 {
3073
3074 return wapbl_replay_isopen(wr);
3075 }
3076 #endif
3077
3078 /*
 * Calculate the disk address for the i'th block in the wc_blocklist
3080 * offset by j blocks of size blen.
3081 *
3082 * wc_daddr is always a kernel disk address in DEV_BSIZE units that
3083 * was written to the journal.
3084 *
3085 * The kernel needs that address plus the offset in DEV_BSIZE units.
3086 *
3087 * Userland needs that address plus the offset in blen units.
3088 *
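 * For example (illustrative numbers): with DEV_BSIZE = 512,
 * blen = 2048 and wc_blocks[i].wc_daddr = 1000, the j'th block lives
 * at kernel pbn 1000 + j*4, or at userland block 250 + j.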
3089 */
3090 static daddr_t
3091 wapbl_block_daddr(struct wapbl_wc_blocklist *wc, int i, int j, int blen)
3092 {
3093 daddr_t pbn;
3094
3095 #ifdef _KERNEL
3096 pbn = wc->wc_blocks[i].wc_daddr + btodb(j * blen);
3097 #else
3098 pbn = dbtob(wc->wc_blocks[i].wc_daddr) / blen + j;
3099 #endif
3100
3101 return pbn;
3102 }
3103
3104 static void
3105 wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp)
3106 {
3107 struct wapbl_wc_blocklist *wc =
3108 (struct wapbl_wc_blocklist *)wr->wr_scratch;
3109 int fsblklen = 1 << wr->wr_fs_dev_bshift;
3110 int i, j, n;
3111
3112 for (i = 0; i < wc->wc_blkcount; i++) {
3113 /*
3114 * Enter each physical block into the hashtable independently.
3115 */
3116 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
3117 for (j = 0; j < n; j++) {
3118 wapbl_blkhash_ins(wr, wapbl_block_daddr(wc, i, j, fsblklen),
3119 *offp);
3120 wapbl_circ_advance(wr, fsblklen, offp);
3121 }
3122 }
3123 }
3124
3125 static void
3126 wapbl_replay_process_revocations(struct wapbl_replay *wr)
3127 {
3128 struct wapbl_wc_blocklist *wc =
3129 (struct wapbl_wc_blocklist *)wr->wr_scratch;
3130 int fsblklen = 1 << wr->wr_fs_dev_bshift;
3131 int i, j, n;
3132
3133 for (i = 0; i < wc->wc_blkcount; i++) {
3134 /*
3135 * Remove any blocks found from the hashtable.
3136 */
3137 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
3138 for (j = 0; j < n; j++)
3139 wapbl_blkhash_rem(wr, wapbl_block_daddr(wc, i, j, fsblklen));
3140 }
3141 }
3142
3143 static void
3144 wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff, off_t newoff)
3145 {
3146 struct wapbl_wc_inodelist *wc =
3147 (struct wapbl_wc_inodelist *)wr->wr_scratch;
3148 void *new_inodes;
3149 const size_t oldsize = wr->wr_inodescnt * sizeof(wr->wr_inodes[0]);
3150
3151 KASSERT(sizeof(wr->wr_inodes[0]) == sizeof(wc->wc_inodes[0]));
3152
3153 /*
	 * Keep track of where we found this so that the location won't be
3155 * overwritten.
3156 */
3157 if (wc->wc_clear) {
3158 wr->wr_inodestail = oldoff;
3159 wr->wr_inodescnt = 0;
3160 if (wr->wr_inodes != NULL) {
3161 wapbl_free(wr->wr_inodes, oldsize);
3162 wr->wr_inodes = NULL;
3163 }
3164 }
3165 wr->wr_inodeshead = newoff;
3166 if (wc->wc_inocnt == 0)
3167 return;
3168
3169 new_inodes = wapbl_alloc((wr->wr_inodescnt + wc->wc_inocnt) *
3170 sizeof(wr->wr_inodes[0]));
3171 if (wr->wr_inodes != NULL) {
3172 memcpy(new_inodes, wr->wr_inodes, oldsize);
3173 wapbl_free(wr->wr_inodes, oldsize);
3174 }
3175 wr->wr_inodes = new_inodes;
3176 memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes,
3177 wc->wc_inocnt * sizeof(wr->wr_inodes[0]));
3178 wr->wr_inodescnt += wc->wc_inocnt;
3179 }
3180
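/*
 * wapbl_replay_process(wr, head, tail)
 *
 * Scan the log from tail to head, entering each journalled data
 * block into the block hash, removing revoked blocks, and
 * accumulating the registered inode list.  On error the partially
 * built block hash is discarded.
 */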
3181 static int
3182 wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail)
3183 {
3184 off_t off;
3185 int error;
3186
3187 int logblklen = 1 << wr->wr_log_dev_bshift;
3188
3189 wapbl_blkhash_clear(wr);
3190
3191 off = tail;
3192 while (off != head) {
3193 struct wapbl_wc_null *wcn;
3194 off_t saveoff = off;
3195 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
3196 if (error)
3197 goto errout;
3198 wcn = (struct wapbl_wc_null *)wr->wr_scratch;
3199 switch (wcn->wc_type) {
3200 case WAPBL_WC_BLOCKS:
3201 wapbl_replay_process_blocks(wr, &off);
3202 break;
3203
3204 case WAPBL_WC_REVOCATIONS:
3205 wapbl_replay_process_revocations(wr);
3206 break;
3207
3208 case WAPBL_WC_INODES:
3209 wapbl_replay_process_inodes(wr, saveoff, off);
3210 break;
3211
3212 default:
3213 printf("Unrecognized wapbl type: 0x%08x\n",
3214 wcn->wc_type);
3215 error = EFTYPE;
3216 goto errout;
3217 }
3218 wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
3219 if (off != saveoff) {
3220 printf("wapbl_replay: corrupted records\n");
3221 error = EFTYPE;
3222 goto errout;
3223 }
3224 }
3225 return 0;
3226
3227 errout:
3228 wapbl_blkhash_clear(wr);
3229 return error;
3230 }
3231
3232 #if 0
3233 int
3234 wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp)
3235 {
3236 off_t off;
3237 int mismatchcnt = 0;
3238 int logblklen = 1 << wr->wr_log_dev_bshift;
3239 int fsblklen = 1 << wr->wr_fs_dev_bshift;
3240 void *scratch1 = wapbl_alloc(MAXBSIZE);
3241 void *scratch2 = wapbl_alloc(MAXBSIZE);
3242 int error = 0;
3243
3244 KDASSERT(wapbl_replay_isopen(wr));
3245
3246 off = wch->wc_tail;
3247 while (off != wch->wc_head) {
3248 struct wapbl_wc_null *wcn;
3249 #ifdef DEBUG
3250 off_t saveoff = off;
3251 #endif
3252 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
3253 if (error)
3254 goto out;
3255 wcn = (struct wapbl_wc_null *)wr->wr_scratch;
3256 switch (wcn->wc_type) {
3257 case WAPBL_WC_BLOCKS:
3258 {
3259 struct wapbl_wc_blocklist *wc =
3260 (struct wapbl_wc_blocklist *)wr->wr_scratch;
3261 int i;
3262 for (i = 0; i < wc->wc_blkcount; i++) {
3263 int foundcnt = 0;
3264 int dirtycnt = 0;
3265 int j, n;
3266 /*
				 * Check each physical block against the
				 * hashtable independently.
3269 */
3270 n = wc->wc_blocks[i].wc_dlen >>
3271 wch->wc_fs_dev_bshift;
3272 for (j = 0; j < n; j++) {
3273 struct wapbl_blk *wb =
3274 wapbl_blkhash_get(wr,
3275 wapbl_block_daddr(wc, i, j, fsblklen));
3276 if (wb && (wb->wb_off == off)) {
3277 foundcnt++;
3278 error =
3279 wapbl_circ_read(wr,
3280 scratch1, fsblklen,
3281 &off);
3282 if (error)
3283 goto out;
3284 error =
3285 wapbl_read(scratch2,
3286 fsblklen, fsdevvp,
3287 wb->wb_blk);
3288 if (error)
3289 goto out;
3290 if (memcmp(scratch1,
3291 scratch2,
3292 fsblklen)) {
3293 printf(
3294 "wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n",
3295 wb->wb_blk, (intmax_t)off);
3296 dirtycnt++;
3297 mismatchcnt++;
3298 }
3299 } else {
3300 wapbl_circ_advance(wr,
3301 fsblklen, &off);
3302 }
3303 }
3304 #if 0
3305 /*
3306 * If all of the blocks in an entry
3307 * are clean, then remove all of its
3308 * blocks from the hashtable since they
3309 * never will need replay.
3310 */
3311 if ((foundcnt != 0) &&
3312 (dirtycnt == 0)) {
3313 off = saveoff;
3314 wapbl_circ_advance(wr,
3315 logblklen, &off);
3316 for (j = 0; j < n; j++) {
3317 struct wapbl_blk *wb =
3318 wapbl_blkhash_get(wr,
3319 wapbl_block_daddr(wc, i, j, fsblklen));
3320 if (wb &&
3321 (wb->wb_off == off)) {
3322 wapbl_blkhash_rem(wr, wb->wb_blk);
3323 }
3324 wapbl_circ_advance(wr,
3325 fsblklen, &off);
3326 }
3327 }
3328 #endif
3329 }
3330 }
3331 break;
3332 case WAPBL_WC_REVOCATIONS:
3333 case WAPBL_WC_INODES:
3334 break;
3335 default:
3336 KASSERT(0);
3337 }
3338 #ifdef DEBUG
3339 wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
3340 KASSERT(off == saveoff);
3341 #endif
3342 }
3343 out:
3344 wapbl_free(scratch1, MAXBSIZE);
3345 wapbl_free(scratch2, MAXBSIZE);
3346 if (!error && mismatchcnt)
3347 error = EFTYPE;
3348 return error;
3349 }
3350 #endif
3351
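/*
 * wapbl_replay_write(wr, fsdevvp)
 *
 * Write every block recorded in the replay hash from the log to its
 * home location on fsdevvp, completing the replay.
 */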
3352 int
3353 wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp)
3354 {
3355 struct wapbl_blk *wb;
3356 size_t i;
3357 off_t off;
3358 void *scratch;
3359 int error = 0;
3360 int fsblklen = 1 << wr->wr_fs_dev_bshift;
3361
3362 KDASSERT(wapbl_replay_isopen(wr));
3363
3364 scratch = wapbl_alloc(MAXBSIZE);
3365
3366 for (i = 0; i <= wr->wr_blkhashmask; ++i) {
3367 LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) {
3368 off = wb->wb_off;
3369 error = wapbl_circ_read(wr, scratch, fsblklen, &off);
3370 if (error)
3371 break;
3372 error = wapbl_write(scratch, fsblklen, fsdevvp,
3373 wb->wb_blk);
3374 if (error)
3375 break;
3376 }
3377 }
3378
3379 wapbl_free(scratch, MAXBSIZE);
3380 return error;
3381 }
3382
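/*
 * wapbl_replay_can_read(wr, blk, len)
 *
 * Return nonzero if any of the fs blocks in the range
 * [blk, blk + len/fsblklen) is recorded in the log.
 */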
3383 int
3384 wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len)
3385 {
3386 int fsblklen = 1 << wr->wr_fs_dev_bshift;
3387
3388 KDASSERT(wapbl_replay_isopen(wr));
3389 KASSERT((len % fsblklen) == 0);
3390
	while (len != 0) {
		struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
		if (wb)
			return 1;
		len -= fsblklen;
		blk++;
	}
3397 return 0;
3398 }
3399
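/*
 * wapbl_replay_read(wr, data, blk, len)
 *
 * Copy each fs block in the range [blk, blk + len/fsblklen) that is
 * recorded in the log into data; blocks not found in the log leave
 * the corresponding part of the buffer untouched.
 */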
3400 int
3401 wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len)
3402 {
3403 int fsblklen = 1 << wr->wr_fs_dev_bshift;
3404
3405 KDASSERT(wapbl_replay_isopen(wr));
3406
3407 KASSERT((len % fsblklen) == 0);
3408
3409 while (len != 0) {
3410 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
3411 if (wb) {
3412 off_t off = wb->wb_off;
3413 int error;
3414 error = wapbl_circ_read(wr, data, fsblklen, &off);
3415 if (error)
3416 return error;
3417 }
3418 data = (uint8_t *)data + fsblklen;
3419 len -= fsblklen;
3420 blk++;
3421 }
3422 return 0;
3423 }
3424
3425 #ifdef _KERNEL
3426
3427 MODULE(MODULE_CLASS_VFS, wapbl, NULL);
3428
3429 static int
3430 wapbl_modcmd(modcmd_t cmd, void *arg)
3431 {
3432
3433 switch (cmd) {
3434 case MODULE_CMD_INIT:
3435 wapbl_init();
3436 return 0;
3437 case MODULE_CMD_FINI:
3438 return wapbl_fini();
3439 default:
3440 return ENOTTY;
3441 }
3442 }
3443 #endif /* _KERNEL */
3444