1 /* $NetBSD: vfs_wapbl.c,v 1.93 2017/04/05 20:38:53 jdolecek Exp $ */
2
3 /*-
4 * Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Wasabi Systems, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * This implements file system independent write-ahead filesystem logging.
34 */
35
36 #define WAPBL_INTERNAL
37
38 #include <sys/cdefs.h>
39 __KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.93 2017/04/05 20:38:53 jdolecek Exp $");
40
41 #include <sys/param.h>
42 #include <sys/bitops.h>
43 #include <sys/time.h>
44 #include <sys/wapbl.h>
45 #include <sys/wapbl_replay.h>
46
47 #ifdef _KERNEL
48
49 #include <sys/atomic.h>
50 #include <sys/conf.h>
51 #include <sys/evcnt.h>
52 #include <sys/file.h>
53 #include <sys/kauth.h>
54 #include <sys/kernel.h>
55 #include <sys/module.h>
56 #include <sys/mount.h>
57 #include <sys/mutex.h>
58 #include <sys/namei.h>
59 #include <sys/proc.h>
60 #include <sys/resourcevar.h>
61 #include <sys/sysctl.h>
62 #include <sys/uio.h>
63 #include <sys/vnode.h>
64
65 #include <miscfs/specfs/specdev.h>
66
67 #define wapbl_alloc(s) kmem_alloc((s), KM_SLEEP)
68 #define wapbl_free(a, s) kmem_free((a), (s))
69 #define wapbl_calloc(n, s) kmem_zalloc((n)*(s), KM_SLEEP)
70
71 static struct sysctllog *wapbl_sysctl;
72 static int wapbl_flush_disk_cache = 1;
73 static int wapbl_verbose_commit = 0;
74 static int wapbl_allow_fuadpo = 0; /* switched off by default for now */
75
76 static inline size_t wapbl_space_free(size_t, off_t, off_t);
77
78 #else /* !_KERNEL */
79
80 #include <assert.h>
81 #include <errno.h>
82 #include <stdbool.h>
83 #include <stdio.h>
84 #include <stdlib.h>
85 #include <string.h>
86
87 #define KDASSERT(x) assert(x)
88 #define KASSERT(x) assert(x)
89 #define wapbl_alloc(s) malloc(s)
90 #define wapbl_free(a, s) free(a)
91 #define wapbl_calloc(n, s) calloc((n), (s))
92
93 #endif /* !_KERNEL */
94
95 /*
96 * INTERNAL DATA STRUCTURES
97 */
98
99 /*
100 * This structure holds per-mount log information.
101 *
102 * Legend: a = atomic access only
103 * r = read-only after init
104 * l = rwlock held
105 * m = mutex held
106 * lm = rwlock held writing or mutex held
107 * u = unlocked access ok
108 * b = bufcache_lock held
109 */
110 LIST_HEAD(wapbl_ino_head, wapbl_ino);
111 struct wapbl {
112 struct vnode *wl_logvp; /* r: log here */
113 struct vnode *wl_devvp; /* r: log on this device */
114 struct mount *wl_mount; /* r: mountpoint wl is associated with */
115 daddr_t wl_logpbn; /* r: Physical block number of start of log */
116 int wl_log_dev_bshift; /* r: logarithm of device block size of log
117 device */
118 int wl_fs_dev_bshift; /* r: logarithm of device block size of
119 filesystem device */
120
121 unsigned wl_lock_count; /* m: Count of transactions in progress */
122
123 size_t wl_circ_size; /* r: Number of bytes in buffer of log */
124 size_t wl_circ_off; /* r: Number of bytes reserved at start */
125
126 size_t wl_bufcount_max; /* r: Number of buffers reserved for log */
127 size_t wl_bufbytes_max; /* r: Number of buf bytes reserved for log */
128
129 off_t wl_head; /* l: Byte offset of log head */
130 off_t wl_tail; /* l: Byte offset of log tail */
131 /*
132 * WAPBL log layout, stored on wl_devvp at wl_logpbn:
133 *
134 * ___________________ wl_circ_size __________________
135 * / \
136 * +---------+---------+-------+--------------+--------+
137 * [ commit0 | commit1 | CCWCW | EEEEEEEEEEEE | CCCWCW ]
138 * +---------+---------+-------+--------------+--------+
139 * wl_circ_off --^ ^-- wl_head ^-- wl_tail
140 *
141 * commit0 and commit1 are commit headers. A commit header has
142 * a generation number, indicating which of the two headers is
143 * more recent, and an assignment of head and tail pointers.
144 * The rest is a circular queue of log records, starting at
145 * the byte offset wl_circ_off.
146 *
147 * E marks empty space for records.
148 * W marks records for block writes issued but waiting.
149 * C marks completed records.
150 *
151 * wapbl_flush writes new records to empty `E' spaces after
152 * wl_head from the current transaction in memory.
153 *
154 * wapbl_truncate advances wl_tail past any completed `C'
155 * records, freeing them up for use.
156 *
157 * head == tail == 0 means log is empty.
158 * head == tail != 0 means log is full.
159 *
160 * See assertions in wapbl_advance() for other boundary
161 * conditions.
162 *
163 * Only wapbl_flush moves the head, except when wapbl_truncate
164 * sets it to 0 to indicate that the log is empty.
165 *
166 * Only wapbl_truncate moves the tail, except when wapbl_flush
167 * sets it to wl_circ_off to indicate that the log is nonempty.
168 */
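/* (a worked numeric example of this layout follows the structure) */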
169
170 struct wapbl_wc_header *wl_wc_header; /* l */
171 void *wl_wc_scratch; /* l: scratch space (XXX: why?!?) */
172
173 kmutex_t wl_mtx; /* u: short-term lock */
174 krwlock_t wl_rwlock; /* u: File system transaction lock */
175
176 /*
177 * wl_mtx must be held while accessing
178 * wl_bufcount, wl_bufs, and the head and tail offsets.
179 */
180
181 #ifdef _KERNEL
182 /*
183 * Callback called from within the flush routine to flush any extra
184 * bits. Note that flush may be skipped without calling this if
185 * there are no outstanding buffers in the transaction.
186 */
187 wapbl_flush_fn_t wl_flush; /* r */
188 wapbl_flush_fn_t wl_flush_abort;/* r */
189
190 /* Event counters */
191 char wl_ev_group[EVCNT_STRING_MAX]; /* r */
192 struct evcnt wl_ev_commit; /* l */
193 struct evcnt wl_ev_journalwrite; /* l */
194 struct evcnt wl_ev_metawrite; /* lm */
195 struct evcnt wl_ev_cacheflush; /* l */
196 #endif
197
198 size_t wl_bufbytes; /* m: Byte count of pages in wl_bufs */
199 size_t wl_bufcount; /* m: Count of buffers in wl_bufs */
200 size_t wl_bcount; /* m: Total bcount of wl_bufs */
201
202 LIST_HEAD(, buf) wl_bufs; /* m: Buffers in current transaction */
203
204 kcondvar_t wl_reclaimable_cv; /* m (obviously) */
205 size_t wl_reclaimable_bytes; /* m: Amount of space available for
206 reclamation by truncate */
207 int wl_error_count; /* m: # of wl_entries with errors */
208 size_t wl_reserved_bytes; /* never truncate log smaller than this */
209
210 #ifdef WAPBL_DEBUG_BUFBYTES
211 size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */
212 #endif
213
214 #ifdef _KERNEL
215 int wl_brperjblock; /* r: block records per journal block */
216 #endif
217
218 TAILQ_HEAD(, wapbl_dealloc) wl_dealloclist; /* lm: list head */
219 int wl_dealloccnt; /* lm: total count */
220 int wl_dealloclim; /* r: max count */
221
222 /* hashtable of inode numbers for allocated but unlinked inodes */
223 /* synch ??? */
224 struct wapbl_ino_head *wl_inohash;
225 u_long wl_inohashmask;
226 int wl_inohashcnt;
227
228 SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction
229 accounting */
230
231 u_char *wl_buffer; /* l: buffer for wapbl_buffered_write() */
232 daddr_t wl_buffer_dblk; /* l: buffer disk block address */
233 size_t wl_buffer_used; /* l: buffer current use */
234
235 int wl_dkcache; /* r: disk cache flags */
236 #define WAPBL_USE_FUA(wl) \
237 (wapbl_allow_fuadpo && ISSET((wl)->wl_dkcache, DKCACHE_FUA))
238 #define WAPBL_JFLAGS(wl) \
239 (WAPBL_USE_FUA(wl) ? (wl)->wl_jwrite_flags : 0)
240 #define WAPBL_MFLAGS(wl) \
241 (WAPBL_USE_FUA(wl) ? (wl)->wl_mwrite_flags : 0)
242 int wl_jwrite_flags; /* r: journal write flags */
243 int wl_mwrite_flags; /* r: metadata write flags */
244 };
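/*
 * Worked example of the geometry above (illustrative, arbitrary
 * numbers): with two 512-byte commit headers, wl_circ_off = 1024; if
 * the log occupies 9216 bytes on the device, wl_circ_size = 8192.
 * An empty log has head == tail == 0.  Flushing a 2048-byte
 * transaction moves the head to 3072 and pins the tail at
 * wl_circ_off = 1024, after which wapbl_space_used(8192, 3072, 1024)
 * is 2048 and wapbl_space_free() is 6144.
 */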
245
246 #ifdef WAPBL_DEBUG_PRINT
247 int wapbl_debug_print = WAPBL_DEBUG_PRINT;
248 #endif
249
250 /****************************************************************/
251 #ifdef _KERNEL
252
253 #ifdef WAPBL_DEBUG
254 struct wapbl *wapbl_debug_wl;
255 #endif
256
257 static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail);
258 static int wapbl_write_blocks(struct wapbl *wl, off_t *offp);
259 static int wapbl_write_revocations(struct wapbl *wl, off_t *offp);
260 static int wapbl_write_inodes(struct wapbl *wl, off_t *offp);
261 #endif /* _KERNEL */
262
263 static int wapbl_replay_process(struct wapbl_replay *wr, off_t, off_t);
264
265 static inline size_t wapbl_space_used(size_t avail, off_t head,
266 off_t tail);
267
268 #ifdef _KERNEL
269
270 static struct pool wapbl_entry_pool;
271 static struct pool wapbl_dealloc_pool;
272
273 #define WAPBL_INODETRK_SIZE 83
274 static int wapbl_ino_pool_refcount;
275 static struct pool wapbl_ino_pool;
276 struct wapbl_ino {
277 LIST_ENTRY(wapbl_ino) wi_hash;
278 ino_t wi_ino;
279 mode_t wi_mode;
280 };
281
282 static void wapbl_inodetrk_init(struct wapbl *wl, u_int size);
283 static void wapbl_inodetrk_free(struct wapbl *wl);
284 static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino);
285
286 static size_t wapbl_transaction_len(struct wapbl *wl);
287 static inline size_t wapbl_transaction_inodes_len(struct wapbl *wl);
288
289 static void wapbl_deallocation_free(struct wapbl *, struct wapbl_dealloc *,
290 bool);
291
292 static void wapbl_evcnt_init(struct wapbl *);
293 static void wapbl_evcnt_free(struct wapbl *);
294
295 static void wapbl_dkcache_init(struct wapbl *);
296
297 #if 0
298 int wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
299 #endif
300
301 static int wapbl_replay_isopen1(struct wapbl_replay *);
302
303 struct wapbl_ops wapbl_ops = {
304 .wo_wapbl_discard = wapbl_discard,
305 .wo_wapbl_replay_isopen = wapbl_replay_isopen1,
306 .wo_wapbl_replay_can_read = wapbl_replay_can_read,
307 .wo_wapbl_replay_read = wapbl_replay_read,
308 .wo_wapbl_add_buf = wapbl_add_buf,
309 .wo_wapbl_remove_buf = wapbl_remove_buf,
310 .wo_wapbl_resize_buf = wapbl_resize_buf,
311 .wo_wapbl_begin = wapbl_begin,
312 .wo_wapbl_end = wapbl_end,
313 .wo_wapbl_junlock_assert= wapbl_junlock_assert,
314
315 /* XXX: the following is only used to say "this is a wapbl buf" */
316 .wo_wapbl_biodone = wapbl_biodone,
317 };
318
319 static int
320 wapbl_sysctl_init(void)
321 {
322 int rv;
323 const struct sysctlnode *rnode, *cnode;
324
325 wapbl_sysctl = NULL;
326
327 rv = sysctl_createv(&wapbl_sysctl, 0, NULL, &rnode,
328 CTLFLAG_PERMANENT,
329 CTLTYPE_NODE, "wapbl",
330 SYSCTL_DESCR("WAPBL journaling options"),
331 NULL, 0, NULL, 0,
332 CTL_VFS, CTL_CREATE, CTL_EOL);
333 if (rv)
334 return rv;
335
336 rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
337 CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
338 CTLTYPE_INT, "flush_disk_cache",
339 SYSCTL_DESCR("flush disk cache"),
340 NULL, 0, &wapbl_flush_disk_cache, 0,
341 CTL_CREATE, CTL_EOL);
342 if (rv)
343 return rv;
344
345 rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
346 CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
347 CTLTYPE_INT, "verbose_commit",
348 SYSCTL_DESCR("show time and size of wapbl log commits"),
349 NULL, 0, &wapbl_verbose_commit, 0,
350 CTL_CREATE, CTL_EOL);
351 if (rv)
352 return rv;
353
354 rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
355 CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
356 CTLTYPE_INT, "allow_fuadpo",
357 SYSCTL_DESCR("allow use of FUA/DPO instead of cache flush if available"),
358 NULL, 0, &wapbl_allow_fuadpo, 0,
359 CTL_CREATE, CTL_EOL);
360 if (rv)
361 return rv;
362
363 return rv;
364 }
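/*
 * The tunables registered above appear under the vfs.wapbl sysctl
 * node; illustrative usage from userland:
 *
 *	sysctl -w vfs.wapbl.flush_disk_cache=1
 *	sysctl -w vfs.wapbl.verbose_commit=1
 *	sysctl -w vfs.wapbl.allow_fuadpo=0
 */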
365
366 static void
367 wapbl_init(void)
368 {
369
370 pool_init(&wapbl_entry_pool, sizeof(struct wapbl_entry), 0, 0, 0,
371 "wapblentrypl", &pool_allocator_kmem, IPL_VM);
372 pool_init(&wapbl_dealloc_pool, sizeof(struct wapbl_dealloc), 0, 0, 0,
373 "wapbldealloc", &pool_allocator_nointr, IPL_NONE);
374
375 wapbl_sysctl_init();
376 }
377
378 static int
379 wapbl_fini(void)
380 {
381
382 if (wapbl_sysctl != NULL)
383 sysctl_teardown(&wapbl_sysctl);
384
385 pool_destroy(&wapbl_dealloc_pool);
386 pool_destroy(&wapbl_entry_pool);
387
388 return 0;
389 }
390
391 static void
392 wapbl_evcnt_init(struct wapbl *wl)
393 {
394 snprintf(wl->wl_ev_group, sizeof(wl->wl_ev_group),
395 "wapbl fsid 0x%x/0x%x",
396 wl->wl_mount->mnt_stat.f_fsidx.__fsid_val[0],
397 wl->wl_mount->mnt_stat.f_fsidx.__fsid_val[1]
398 );
399
400 evcnt_attach_dynamic(&wl->wl_ev_commit, EVCNT_TYPE_MISC,
401 NULL, wl->wl_ev_group, "commit");
402 evcnt_attach_dynamic(&wl->wl_ev_journalwrite, EVCNT_TYPE_MISC,
403 NULL, wl->wl_ev_group, "journal sync block write");
404 evcnt_attach_dynamic(&wl->wl_ev_metawrite, EVCNT_TYPE_MISC,
405 NULL, wl->wl_ev_group, "metadata finished block write");
406 evcnt_attach_dynamic(&wl->wl_ev_cacheflush, EVCNT_TYPE_MISC,
407 NULL, wl->wl_ev_group, "cache flush");
408 }
409
410 static void
411 wapbl_evcnt_free(struct wapbl *wl)
412 {
413 evcnt_detach(&wl->wl_ev_commit);
414 evcnt_detach(&wl->wl_ev_journalwrite);
415 evcnt_detach(&wl->wl_ev_metawrite);
416 evcnt_detach(&wl->wl_ev_cacheflush);
417 }
418
419 static void
420 wapbl_dkcache_init(struct wapbl *wl)
421 {
422 int error;
423
424 /* Get disk cache flags */
425 error = VOP_IOCTL(wl->wl_devvp, DIOCGCACHE, &wl->wl_dkcache,
426 FWRITE, FSCRED);
427 if (error) {
428 /* behave as if there was a write cache */
429 wl->wl_dkcache = DKCACHE_WRITE;
430 }
431
432 /* Use FUA instead of cache flush if available */
433 if (ISSET(wl->wl_dkcache, DKCACHE_FUA)) {
434 wl->wl_jwrite_flags |= B_MEDIA_FUA;
435 wl->wl_mwrite_flags |= B_MEDIA_FUA;
436 }
437
438 /* Use DPO for journal writes if available */
439 if (ISSET(wl->wl_dkcache, DKCACHE_DPO))
440 wl->wl_jwrite_flags |= B_MEDIA_DPO;
441 }
442
443 static int
444 wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr)
445 {
446 int error, i;
447
448 WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
449 ("wapbl_start: reusing log with %d inodes\n", wr->wr_inodescnt));
450
451 /*
452 * It's only valid to reuse the replay log if it's
453 * the same as the new log we just opened.
454 */
455 KDASSERT(!wapbl_replay_isopen(wr));
456 KASSERT(wl->wl_devvp->v_type == VBLK);
457 KASSERT(wr->wr_devvp->v_type == VBLK);
458 KASSERT(wl->wl_devvp->v_rdev == wr->wr_devvp->v_rdev);
459 KASSERT(wl->wl_logpbn == wr->wr_logpbn);
460 KASSERT(wl->wl_circ_size == wr->wr_circ_size);
461 KASSERT(wl->wl_circ_off == wr->wr_circ_off);
462 KASSERT(wl->wl_log_dev_bshift == wr->wr_log_dev_bshift);
463 KASSERT(wl->wl_fs_dev_bshift == wr->wr_fs_dev_bshift);
464
465 wl->wl_wc_header->wc_generation = wr->wr_generation + 1;
466
467 for (i = 0; i < wr->wr_inodescnt; i++)
468 wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber,
469 wr->wr_inodes[i].wr_imode);
470
471 /* Make sure new transaction won't overwrite old inodes list */
472 KDASSERT(wapbl_transaction_len(wl) <=
473 wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead,
474 wr->wr_inodestail));
475
476 wl->wl_head = wl->wl_tail = wr->wr_inodeshead;
477 wl->wl_reclaimable_bytes = wl->wl_reserved_bytes =
478 wapbl_transaction_len(wl);
479
480 error = wapbl_write_inodes(wl, &wl->wl_head);
481 if (error)
482 return error;
483
484 KASSERT(wl->wl_head != wl->wl_tail);
485 KASSERT(wl->wl_head != 0);
486
487 return 0;
488 }
489
490 int
491 wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp,
492 daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr,
493 wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn)
494 {
495 struct wapbl *wl;
496 struct vnode *devvp;
497 daddr_t logpbn;
498 int error;
499 int log_dev_bshift = ilog2(blksize);
500 int fs_dev_bshift = log_dev_bshift;
501 int run;
502
503 WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64
504 " count=%zu blksize=%zu\n", vp, off, count, blksize));
505
506 if (log_dev_bshift > fs_dev_bshift) {
507 WAPBL_PRINTF(WAPBL_PRINT_OPEN,
508 ("wapbl: log device's block size cannot be larger "
509 "than filesystem's\n"));
510 /*
511 * Not currently implemented, although it could be if
512 * needed someday.
513 */
514 return ENOSYS;
515 }
516
517 if (off < 0)
518 return EINVAL;
519
520 if (blksize < DEV_BSIZE)
521 return EINVAL;
522 if (blksize % DEV_BSIZE)
523 return EINVAL;
524
525 /* XXXTODO: verify that the full load is writable */
526
527 /*
528 * XXX check for minimum log size
529 * minimum is governed by minimum amount of space
530 * to complete a transaction. (probably truncate)
531 */
532 /* XXX for now pick something minimal */
533 if ((count * blksize) < MAXPHYS) {
534 return ENOSPC;
535 }
536
537 if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) {
538 return error;
539 }
540
541 wl = wapbl_calloc(1, sizeof(*wl));
542 rw_init(&wl->wl_rwlock);
543 mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE);
544 cv_init(&wl->wl_reclaimable_cv, "wapblrec");
545 LIST_INIT(&wl->wl_bufs);
546 SIMPLEQ_INIT(&wl->wl_entries);
547
548 wl->wl_logvp = vp;
549 wl->wl_devvp = devvp;
550 wl->wl_mount = mp;
551 wl->wl_logpbn = logpbn;
552 wl->wl_log_dev_bshift = log_dev_bshift;
553 wl->wl_fs_dev_bshift = fs_dev_bshift;
554
555 wl->wl_flush = flushfn;
556 wl->wl_flush_abort = flushabortfn;
557
558 /* Reserve two log device blocks for the commit headers */
559 wl->wl_circ_off = 2<<wl->wl_log_dev_bshift;
560 wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off);
561 /* truncate the log usage to a multiple of 1 << log_dev_bshift */
562 wl->wl_circ_size >>= wl->wl_log_dev_bshift;
563 wl->wl_circ_size <<= wl->wl_log_dev_bshift;
564
565 /*
566 * wl_bufbytes_max limits the size of the in memory transaction space.
567 * - Since buffers are allocated and accounted for in units of
568 * PAGE_SIZE it is required to be a multiple of PAGE_SIZE
569 * (i.e. 1<<PAGE_SHIFT)
570 * - Since the log device has to be written in units of
571 * 1<<wl_log_dev_bshift it is required to be a multiple of
572 * 1<<wl_log_dev_bshift.
573 * - Since the filesystem will provide data in units of 1<<wl_fs_dev_bshift,
574 * it is convenient for it to be a multiple of 1<<wl_fs_dev_bshift.
575 * Therefore it must be a multiple of the least common multiple of those
576 * three quantities. Fortunately, all of those quantities are
577 * guaranteed to be a power of two, and the least common multiple of
578 * a set of numbers which are all powers of two is simply the maximum
579 * of those numbers. Finally, the maximum logarithm of a power of two
580 * is the same as the log of the maximum power of two. So we can do
581 * the following operations to size wl_bufbytes_max:
582 */
583
584 /* XXX fix actual number of pages reserved per filesystem. */
585 wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2);
586
587 /* Round wl_bufbytes_max down to satisfy the largest power-of-two constraint */
588 wl->wl_bufbytes_max >>= PAGE_SHIFT;
589 wl->wl_bufbytes_max <<= PAGE_SHIFT;
590 wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift;
591 wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift;
592 wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift;
593 wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift;
594
595 /* XXX maybe use filesystem fragment size instead of 1024 */
596 /* XXX fix actual number of buffers reserved per filesystem. */
597 wl->wl_bufcount_max = (nbuf / 2) * 1024;
598
599 wl->wl_brperjblock = ((1<<wl->wl_log_dev_bshift)
600 - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
601 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
602 KASSERT(wl->wl_brperjblock > 0);
603
604 /* XXX tie this into resource estimation */
605 wl->wl_dealloclim = wl->wl_bufbytes_max / mp->mnt_stat.f_bsize / 2;
606 TAILQ_INIT(&wl->wl_dealloclist);
607
608 wl->wl_buffer = wapbl_alloc(MAXPHYS);
609 wl->wl_buffer_used = 0;
610
611 wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);
612
613 wapbl_evcnt_init(wl);
614
615 wapbl_dkcache_init(wl);
616
617 /* Initialize the commit header */
618 {
619 struct wapbl_wc_header *wc;
620 size_t len = 1 << wl->wl_log_dev_bshift;
621 wc = wapbl_calloc(1, len);
622 wc->wc_type = WAPBL_WC_HEADER;
623 wc->wc_len = len;
624 wc->wc_circ_off = wl->wl_circ_off;
625 wc->wc_circ_size = wl->wl_circ_size;
626 /* XXX wc->wc_fsid */
627 wc->wc_log_dev_bshift = wl->wl_log_dev_bshift;
628 wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift;
629 wl->wl_wc_header = wc;
630 wl->wl_wc_scratch = wapbl_alloc(len);
631 }
632
633 /*
634 * if there was an existing set of unlinked but
635 * allocated inodes, preserve it in the new
636 * log.
637 */
638 if (wr && wr->wr_inodescnt) {
639 error = wapbl_start_flush_inodes(wl, wr);
640 if (error)
641 goto errout;
642 }
643
644 error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail);
645 if (error) {
646 goto errout;
647 }
648
649 *wlp = wl;
650 #if defined(WAPBL_DEBUG)
651 wapbl_debug_wl = wl;
652 #endif
653
654 return 0;
655 errout:
656 wapbl_discard(wl);
657 wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
658 wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
659 wapbl_free(wl->wl_buffer, MAXPHYS);
660 wapbl_inodetrk_free(wl);
661 wapbl_free(wl, sizeof(*wl));
662
663 return error;
664 }
665
666 /*
667 * Like wapbl_flush, but discards the transaction
668 * completely.
669 */
670
671 void
672 wapbl_discard(struct wapbl *wl)
673 {
674 struct wapbl_entry *we;
675 struct wapbl_dealloc *wd;
676 struct buf *bp;
677 int i;
678
679 /*
680 * XXX we may consider using upgrade here
681 * if we want to call flush from inside a transaction
682 */
683 rw_enter(&wl->wl_rwlock, RW_WRITER);
684 wl->wl_flush(wl->wl_mount, TAILQ_FIRST(&wl->wl_dealloclist));
685
686 #ifdef WAPBL_DEBUG_PRINT
687 {
688 pid_t pid = -1;
689 lwpid_t lid = -1;
690 if (curproc)
691 pid = curproc->p_pid;
692 if (curlwp)
693 lid = curlwp->l_lid;
694 #ifdef WAPBL_DEBUG_BUFBYTES
695 WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
696 ("wapbl_discard: thread %d.%d discarding "
697 "transaction\n"
698 "\tbufcount=%zu bufbytes=%zu bcount=%zu "
699 "deallocs=%d inodes=%d\n"
700 "\terrcnt = %u, reclaimable=%zu reserved=%zu "
701 "unsynced=%zu\n",
702 pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
703 wl->wl_bcount, wl->wl_dealloccnt,
704 wl->wl_inohashcnt, wl->wl_error_count,
705 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
706 wl->wl_unsynced_bufbytes));
707 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
708 WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
709 ("\tentry: bufcount = %zu, reclaimable = %zu, "
710 "error = %d, unsynced = %zu\n",
711 we->we_bufcount, we->we_reclaimable_bytes,
712 we->we_error, we->we_unsynced_bufbytes));
713 }
714 #else /* !WAPBL_DEBUG_BUFBYTES */
715 WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
716 ("wapbl_discard: thread %d.%d discarding transaction\n"
717 "\tbufcount=%zu bufbytes=%zu bcount=%zu "
718 "deallocs=%d inodes=%d\n"
719 "\terrcnt = %u, reclaimable=%zu reserved=%zu\n",
720 pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
721 wl->wl_bcount, wl->wl_dealloccnt,
722 wl->wl_inohashcnt, wl->wl_error_count,
723 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes));
724 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
725 WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
726 ("\tentry: bufcount = %zu, reclaimable = %zu, "
727 "error = %d\n",
728 we->we_bufcount, we->we_reclaimable_bytes,
729 we->we_error));
730 }
731 #endif /* !WAPBL_DEBUG_BUFBYTES */
732 }
733 #endif /* WAPBL_DEBUG_PRINT */
734
735 for (i = 0; i <= wl->wl_inohashmask; i++) {
736 struct wapbl_ino_head *wih;
737 struct wapbl_ino *wi;
738
739 wih = &wl->wl_inohash[i];
740 while ((wi = LIST_FIRST(wih)) != NULL) {
741 LIST_REMOVE(wi, wi_hash);
742 pool_put(&wapbl_ino_pool, wi);
743 KASSERT(wl->wl_inohashcnt > 0);
744 wl->wl_inohashcnt--;
745 }
746 }
747
748 /*
749 * clean buffer list
750 */
751 mutex_enter(&bufcache_lock);
752 mutex_enter(&wl->wl_mtx);
753 while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
754 if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) {
755 /*
756 * The buffer will be unlocked and
757 * removed from the transaction in brelse
758 */
759 mutex_exit(&wl->wl_mtx);
760 brelsel(bp, 0);
761 mutex_enter(&wl->wl_mtx);
762 }
763 }
764 mutex_exit(&wl->wl_mtx);
765 mutex_exit(&bufcache_lock);
766
767 /*
768 * Remove references to this wl from wl_entries, free any which
769 * no longer have buffers, others will be freed in wapbl_biodone
770 * when they no longer have any buffers.
771 */
772 while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) {
773 SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
774 /* XXX should we be accumulating wl_error_count
775 * and increasing reclaimable bytes? */
776 we->we_wapbl = NULL;
777 if (we->we_bufcount == 0) {
778 #ifdef WAPBL_DEBUG_BUFBYTES
779 KASSERT(we->we_unsynced_bufbytes == 0);
780 #endif
781 pool_put(&wapbl_entry_pool, we);
782 }
783 }
784
785 /* Discard list of deallocs */
786 while ((wd = TAILQ_FIRST(&wl->wl_dealloclist)) != NULL)
787 wapbl_deallocation_free(wl, wd, true);
788
789 /* XXX should we clear wl_reserved_bytes? */
790
791 KASSERT(wl->wl_bufbytes == 0);
792 KASSERT(wl->wl_bcount == 0);
793 KASSERT(wl->wl_bufcount == 0);
794 KASSERT(LIST_EMPTY(&wl->wl_bufs));
795 KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
796 KASSERT(wl->wl_inohashcnt == 0);
797 KASSERT(TAILQ_EMPTY(&wl->wl_dealloclist));
798 KASSERT(wl->wl_dealloccnt == 0);
799
800 rw_exit(&wl->wl_rwlock);
801 }
802
803 int
804 wapbl_stop(struct wapbl *wl, int force)
805 {
806 int error;
807
808 WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n"));
809 error = wapbl_flush(wl, 1);
810 if (error) {
811 if (force)
812 wapbl_discard(wl);
813 else
814 return error;
815 }
816
817 /* Unlinked inodes persist after a flush */
818 if (wl->wl_inohashcnt) {
819 if (force) {
820 wapbl_discard(wl);
821 } else {
822 return EBUSY;
823 }
824 }
825
826 KASSERT(wl->wl_bufbytes == 0);
827 KASSERT(wl->wl_bcount == 0);
828 KASSERT(wl->wl_bufcount == 0);
829 KASSERT(LIST_EMPTY(&wl->wl_bufs));
830 KASSERT(wl->wl_dealloccnt == 0);
831 KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
832 KASSERT(wl->wl_inohashcnt == 0);
833 KASSERT(TAILQ_EMPTY(&wl->wl_dealloclist));
834 KASSERT(wl->wl_dealloccnt == 0);
835
836 wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
837 wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
838 wapbl_free(wl->wl_buffer, MAXPHYS);
839 wapbl_inodetrk_free(wl);
840
841 wapbl_evcnt_free(wl);
842
843 cv_destroy(&wl->wl_reclaimable_cv);
844 mutex_destroy(&wl->wl_mtx);
845 rw_destroy(&wl->wl_rwlock);
846 wapbl_free(wl, sizeof(*wl));
847
848 return 0;
849 }
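/*
 * Illustrative lifecycle sketch (hypothetical, not compiled): a file
 * system brings the log up at mount time and tears it down at
 * unmount.  The callbacks my_flush_fn/my_flush_abort_fn and the log
 * placement arguments are made up for the example.
 */
#if 0
static int
wapbl_lifecycle_example(struct mount *mp, struct vnode *logvp,
    daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr)
{
	struct wapbl *wl;
	int error;

	error = wapbl_start(&wl, mp, logvp, off, count, blksize, wr,
	    my_flush_fn, my_flush_abort_fn);
	if (error)
		return error;

	/* ... journaled operation of the file system ... */

	return wapbl_stop(wl, 0);	/* 0: fail rather than discard */
}
#endif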
850
851 /****************************************************************/
852 /*
853 * Unbuffered disk I/O
854 */
855
856 static int
857 wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
858 {
859 struct pstats *pstats = curlwp->l_proc->p_stats;
860 struct buf *bp;
861 int error;
862
863 KASSERT(devvp->v_type == VBLK);
864
865 if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
866 mutex_enter(devvp->v_interlock);
867 devvp->v_numoutput++;
868 mutex_exit(devvp->v_interlock);
869 pstats->p_ru.ru_oublock++;
870 } else {
871 pstats->p_ru.ru_inblock++;
872 }
873
874 bp = getiobuf(devvp, true);
875 bp->b_flags = flags;
876 bp->b_cflags = BC_BUSY; /* mandatory, asserted by biowait() */
877 bp->b_dev = devvp->v_rdev;
878 bp->b_data = data;
879 bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
880 bp->b_blkno = pbn;
881 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
882
883 WAPBL_PRINTF(WAPBL_PRINT_IO,
884 ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%"PRIx64"\n",
885 BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount,
886 bp->b_blkno, bp->b_dev));
887
888 VOP_STRATEGY(devvp, bp);
889
890 error = biowait(bp);
891 putiobuf(bp);
892
893 if (error) {
894 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
895 ("wapbl_doio: %s %zu bytes at block %" PRId64
896 " on dev 0x%"PRIx64" failed with error %d\n",
897 (((flags & (B_WRITE | B_READ)) == B_WRITE) ?
898 "write" : "read"),
899 len, pbn, devvp->v_rdev, error));
900 }
901
902 return error;
903 }
904
905 /*
906 * wapbl_write(data, len, devvp, pbn)
907 *
908 * Synchronously write len bytes from data to physical block pbn
909 * on devvp.
910 */
911 int
912 wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
913 {
914
915 return wapbl_doio(data, len, devvp, pbn, B_WRITE);
916 }
917
918 /*
919 * wapbl_read(data, len, devvp, pbn)
920 *
921 * Synchronously read len bytes into data from physical block pbn
922 * on devvp.
923 */
924 int
925 wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
926 {
927
928 return wapbl_doio(data, len, devvp, pbn, B_READ);
929 }
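/*
 * Illustrative sketch (hypothetical, not compiled): synchronously
 * read one log device block, e.g. the first commit header, into the
 * scratch buffer.  The unit conversion mirrors wapbl_circ_write()
 * below.
 */
#if 0
static int
wapbl_read_commit0_example(struct wapbl *wl)
{
	size_t len = 1 << wl->wl_log_dev_bshift;
	/* convert log device blocks to DEV_BSIZE units */
	daddr_t pbn = btodb((daddr_t)wl->wl_logpbn << wl->wl_log_dev_bshift);

	/* wl_wc_scratch is sized to hold exactly one commit header */
	return wapbl_read(wl->wl_wc_scratch, len, wl->wl_devvp, pbn);
}
#endif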
930
931 /****************************************************************/
932 /*
933 * Buffered disk writes -- try to coalesce writes and emit
934 * MAXPHYS-aligned blocks.
935 */
936
937 /*
938 * wapbl_buffered_flush(wl)
939 *
940 * Flush any buffered writes from wapbl_buffered_write.
941 */
942 static int
943 wapbl_buffered_flush(struct wapbl *wl)
944 {
945 int error;
946
947 if (wl->wl_buffer_used == 0)
948 return 0;
949
950 error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used,
951 wl->wl_devvp, wl->wl_buffer_dblk,
952 B_WRITE | WAPBL_JFLAGS(wl));
953 wl->wl_buffer_used = 0;
954
955 wl->wl_ev_journalwrite.ev_count++;
956
957 return error;
958 }
959
960 /*
961 * wapbl_buffered_write(data, len, wl, pbn)
962 *
963 * Write len bytes from data to physical block pbn on
964 * wl->wl_devvp. The write may not complete until
965 * wapbl_buffered_flush.
966 */
967 static int
968 wapbl_buffered_write(void *data, size_t len, struct wapbl *wl, daddr_t pbn)
969 {
970 int error;
971 size_t resid;
972
973 /*
974 * If the write is not adjacent to the buffered data, flush first.
975 * The disk block address is always valid for a non-empty buffer.
976 */
977 if (wl->wl_buffer_used > 0 &&
978 pbn != wl->wl_buffer_dblk + btodb(wl->wl_buffer_used)) {
979 error = wapbl_buffered_flush(wl);
980 if (error)
981 return error;
982 }
983 /*
984 * If this write goes to an empty buffer we have to
985 * save the disk block address first.
986 */
987 if (wl->wl_buffer_used == 0)
988 wl->wl_buffer_dblk = pbn;
989 /*
990 * Remaining space so that this buffer ends on a MAXPHYS boundary.
991 *
992 * It cannot become less than or equal to zero, since the buffer
993 * would then have been flushed by the previous call.
994 */
995 resid = MAXPHYS - dbtob(wl->wl_buffer_dblk % btodb(MAXPHYS)) -
996 wl->wl_buffer_used;
997 KASSERT(resid > 0);
998 KASSERT(dbtob(btodb(resid)) == resid);
999 if (len >= resid) {
1000 memcpy(wl->wl_buffer + wl->wl_buffer_used, data, resid);
1001 wl->wl_buffer_used += resid;
1002 error = wapbl_buffered_flush(wl);
1003 data = (uint8_t *)data + resid;
1004 len -= resid;
1005 wl->wl_buffer_dblk = pbn + btodb(resid);
1006 if (error)
1007 return error;
1008 }
1009 KASSERT(len < MAXPHYS);
1010 if (len > 0) {
1011 memcpy(wl->wl_buffer + wl->wl_buffer_used, data, len);
1012 wl->wl_buffer_used += len;
1013 }
1014
1015 return 0;
1016 }
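/*
 * Illustrative sketch (hypothetical caller, not compiled): two
 * physically adjacent 512-byte writes are coalesced in wl_buffer and
 * reach the disk as a single I/O on the explicit flush.
 */
#if 0
static int
wapbl_buffered_write_example(struct wapbl *wl, void *a, void *b)
{
	int error;

	/* first write: buffer is empty, wl_buffer_dblk becomes 100 */
	error = wapbl_buffered_write(a, 512, wl, 100);
	if (error)
		return error;

	/* second write: 101 == 100 + btodb(512), so it is appended */
	error = wapbl_buffered_write(b, 512, wl, 101);
	if (error)
		return error;

	/* both writes are submitted together here */
	return wapbl_buffered_flush(wl);
}
#endif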
1017
1018 /*
1019 * wapbl_circ_write(wl, data, len, offp)
1020 *
1021 * Write len bytes from data to the circular queue of wl, starting
1022 * at linear byte offset *offp, and returning the new linear byte
1023 * offset in *offp.
1024 *
1025 * If the starting linear byte offset precedes wl->wl_circ_off,
1026 * the write instead begins at wl->wl_circ_off. XXX WTF? This
1027 * should be a KASSERT, not a conditional.
1028 *
1029 * The write is buffered in wl and must be flushed with
1030 * wapbl_buffered_flush before it will be submitted to the disk.
1031 */
1032 static int
1033 wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp)
1034 {
1035 size_t slen;
1036 off_t off = *offp;
1037 int error;
1038 daddr_t pbn;
1039
1040 KDASSERT(((len >> wl->wl_log_dev_bshift) <<
1041 wl->wl_log_dev_bshift) == len);
1042
1043 if (off < wl->wl_circ_off)
1044 off = wl->wl_circ_off;
1045 slen = wl->wl_circ_off + wl->wl_circ_size - off;
1046 if (slen < len) {
1047 pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
1048 #ifdef _KERNEL
1049 pbn = btodb(pbn << wl->wl_log_dev_bshift);
1050 #endif
1051 error = wapbl_buffered_write(data, slen, wl, pbn);
1052 if (error)
1053 return error;
1054 data = (uint8_t *)data + slen;
1055 len -= slen;
1056 off = wl->wl_circ_off;
1057 }
1058 pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
1059 #ifdef _KERNEL
1060 pbn = btodb(pbn << wl->wl_log_dev_bshift);
1061 #endif
1062 error = wapbl_buffered_write(data, len, wl, pbn);
1063 if (error)
1064 return error;
1065 off += len;
1066 if (off >= wl->wl_circ_off + wl->wl_circ_size)
1067 off = wl->wl_circ_off;
1068 *offp = off;
1069 return 0;
1070 }
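/*
 * Worked example of the wraparound case (illustrative, arbitrary
 * numbers): with wl_circ_off = 1024 and wl_circ_size = 8192, writing
 * len = 1024 bytes at *offp = 8704 leaves only slen = 1024 + 8192 -
 * 8704 = 512 bytes before the end of the region.  The record is
 * therefore split: 512 bytes are written at offset 8704, the
 * remaining 512 at wl_circ_off = 1024, and *offp returns as 1536.
 */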
1071
1072 /****************************************************************/
1073 /*
1074 * WAPBL transactions: entering, adding/removing bufs, and exiting
1075 */
1076
1077 int
1078 wapbl_begin(struct wapbl *wl, const char *file, int line)
1079 {
1080 int doflush;
1081 unsigned lockcount;
1082
1083 KDASSERT(wl);
1084
1085 /*
1086 * XXX this needs to be made much more sophisticated.
1087 * perhaps each wapbl_begin could reserve a specified
1088 * number of buffers and bytes.
1089 */
1090 mutex_enter(&wl->wl_mtx);
1091 lockcount = wl->wl_lock_count;
1092 doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) >
1093 wl->wl_bufbytes_max / 2) ||
1094 ((wl->wl_bufcount + (lockcount * 10)) >
1095 wl->wl_bufcount_max / 2) ||
1096 (wapbl_transaction_len(wl) > wl->wl_circ_size / 2) ||
1097 (wl->wl_dealloccnt >= (wl->wl_dealloclim / 2));
1098 mutex_exit(&wl->wl_mtx);
1099
1100 if (doflush) {
1101 WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1102 ("force flush lockcnt=%d bufbytes=%zu "
1103 "(max=%zu) bufcount=%zu (max=%zu) "
1104 "dealloccnt %d (lim=%d)\n",
1105 lockcount, wl->wl_bufbytes,
1106 wl->wl_bufbytes_max, wl->wl_bufcount,
1107 wl->wl_bufcount_max,
1108 wl->wl_dealloccnt, wl->wl_dealloclim));
1109 }
1110
1111 if (doflush) {
1112 int error = wapbl_flush(wl, 0);
1113 if (error)
1114 return error;
1115 }
1116
1117 rw_enter(&wl->wl_rwlock, RW_READER);
1118 mutex_enter(&wl->wl_mtx);
1119 wl->wl_lock_count++;
1120 mutex_exit(&wl->wl_mtx);
1121
1122 #if defined(WAPBL_DEBUG_PRINT)
1123 WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
1124 ("wapbl_begin thread %d.%d with bufcount=%zu "
1125 "bufbytes=%zu bcount=%zu at %s:%d\n",
1126 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
1127 wl->wl_bufbytes, wl->wl_bcount, file, line));
1128 #endif
1129
1130 return 0;
1131 }
1132
1133 void
1134 wapbl_end(struct wapbl *wl)
1135 {
1136
1137 #if defined(WAPBL_DEBUG_PRINT)
1138 WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
1139 ("wapbl_end thread %d.%d with bufcount=%zu "
1140 "bufbytes=%zu bcount=%zu\n",
1141 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
1142 wl->wl_bufbytes, wl->wl_bcount));
1143 #endif
1144
1145 /*
1146 * XXX this could be handled more gracefully, perhaps place
1147 * only a partial transaction in the log and allow the
1148 * remaining to flush without the protection of the journal.
1149 */
1150 KASSERTMSG((wapbl_transaction_len(wl) <=
1151 (wl->wl_circ_size - wl->wl_reserved_bytes)),
1152 "wapbl_end: current transaction too big to flush");
1153
1154 mutex_enter(&wl->wl_mtx);
1155 KASSERT(wl->wl_lock_count > 0);
1156 wl->wl_lock_count--;
1157 mutex_exit(&wl->wl_mtx);
1158
1159 rw_exit(&wl->wl_rwlock);
1160 }
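/*
 * Illustrative caller sketch (hypothetical, not compiled): a
 * journaled metadata update brackets its buffer manipulation with
 * wapbl_begin()/wapbl_end().  File system code normally reaches
 * these through the WAPBL_BEGIN()/WAPBL_END() wrappers rather than
 * calling them directly.
 */
#if 0
static int
wapbl_update_example(struct wapbl *wl, struct buf *bp)
{
	int error;

	error = wapbl_begin(wl, __FILE__, __LINE__);
	if (error)
		return error;

	/* ... modify the (busy) metadata buffer bp here ... */
	wapbl_add_buf(wl, bp);	/* bp joins the current transaction */

	wapbl_end(wl);
	return 0;
}
#endif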
1161
1162 void
1163 wapbl_add_buf(struct wapbl *wl, struct buf * bp)
1164 {
1165
1166 KASSERT(bp->b_cflags & BC_BUSY);
1167 KASSERT(bp->b_vp);
1168
1169 wapbl_jlock_assert(wl);
1170
1171 #if 0
1172 /*
1173 * XXX this might be an issue for swapfiles.
1174 * see uvm_swap.c:1702
1175 *
1176 * XXX2 why require it then? leap of semantics?
1177 */
1178 KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
1179 #endif
1180
1181 mutex_enter(&wl->wl_mtx);
1182 if (bp->b_flags & B_LOCKED) {
1183 LIST_REMOVE(bp, b_wapbllist);
1184 WAPBL_PRINTF(WAPBL_PRINT_BUFFER2,
1185 ("wapbl_add_buf thread %d.%d re-adding buf %p "
1186 "with %d bytes %d bcount\n",
1187 curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
1188 bp->b_bcount));
1189 } else {
1190 /* unlocked but dirty buffers shouldn't exist */
1191 KASSERT(!(bp->b_oflags & BO_DELWRI));
1192 wl->wl_bufbytes += bp->b_bufsize;
1193 wl->wl_bcount += bp->b_bcount;
1194 wl->wl_bufcount++;
1195 WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
1196 ("wapbl_add_buf thread %d.%d adding buf %p "
1197 "with %d bytes %d bcount\n",
1198 curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
1199 bp->b_bcount));
1200 }
1201 LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist);
1202 mutex_exit(&wl->wl_mtx);
1203
1204 bp->b_flags |= B_LOCKED;
1205 }
1206
1207 static void
1208 wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp)
1209 {
1210
1211 KASSERT(mutex_owned(&wl->wl_mtx));
1212 KASSERT(bp->b_cflags & BC_BUSY);
1213 wapbl_jlock_assert(wl);
1214
1215 #if 0
1216 /*
1217 * XXX this might be an issue for swapfiles.
1218 * see uvm_swap.c:1725
1219 *
1220 * XXXdeux: see above
1221 */
1222 KASSERT((bp->b_flags & BC_NOCACHE) == 0);
1223 #endif
1224 KASSERT(bp->b_flags & B_LOCKED);
1225
1226 WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
1227 ("wapbl_remove_buf thread %d.%d removing buf %p with "
1228 "%d bytes %d bcount\n",
1229 curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount));
1230
1231 KASSERT(wl->wl_bufbytes >= bp->b_bufsize);
1232 wl->wl_bufbytes -= bp->b_bufsize;
1233 KASSERT(wl->wl_bcount >= bp->b_bcount);
1234 wl->wl_bcount -= bp->b_bcount;
1235 KASSERT(wl->wl_bufcount > 0);
1236 wl->wl_bufcount--;
1237 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
1238 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
1239 LIST_REMOVE(bp, b_wapbllist);
1240
1241 bp->b_flags &= ~B_LOCKED;
1242 }
1243
1244 /* called from brelsel() in vfs_bio among other places */
1245 void
1246 wapbl_remove_buf(struct wapbl * wl, struct buf *bp)
1247 {
1248
1249 mutex_enter(&wl->wl_mtx);
1250 wapbl_remove_buf_locked(wl, bp);
1251 mutex_exit(&wl->wl_mtx);
1252 }
1253
1254 void
1255 wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt)
1256 {
1257
1258 KASSERT(bp->b_cflags & BC_BUSY);
1259
1260 /*
1261 * XXX: why does this depend on B_LOCKED? otherwise the buf
1262 * is not for a transaction? if so, why is this called in the
1263 * first place?
1264 */
1265 if (bp->b_flags & B_LOCKED) {
1266 mutex_enter(&wl->wl_mtx);
1267 wl->wl_bufbytes += bp->b_bufsize - oldsz;
1268 wl->wl_bcount += bp->b_bcount - oldcnt;
1269 mutex_exit(&wl->wl_mtx);
1270 }
1271 }
1272
1273 #endif /* _KERNEL */
1274
1275 /****************************************************************/
1276 /* Some utility inlines */
1277
1278 /*
1279 * wapbl_space_used(avail, head, tail)
1280 *
1281 * Number of bytes used in a circular queue of avail total bytes,
1282 * from tail to head.
1283 */
1284 static inline size_t
1285 wapbl_space_used(size_t avail, off_t head, off_t tail)
1286 {
1287
1288 if (tail == 0) {
1289 KASSERT(head == 0);
1290 return 0;
1291 }
1292 return ((head + (avail - 1) - tail) % avail) + 1;
1293 }
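/*
 * Worked example (arbitrary numbers): with avail = 1000, tail = 200
 * and head = 500, the result is ((500 + 999 - 200) % 1000) + 1 = 300.
 * The wrapped case head = 100, tail = 800 likewise gives
 * ((100 + 999 - 800) % 1000) + 1 = 300.  A full queue (head == tail,
 * both nonzero) yields avail, and head == tail == 0 yields 0.
 */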
1294
1295 #ifdef _KERNEL
1296 /*
1297 * wapbl_advance(size, off, oldoff, delta)
1298 *
1299 * Given a byte offset oldoff into a circular queue of size bytes
1300 * starting at off, return a new byte offset oldoff + delta into
1301 * the circular queue.
1302 */
1303 static inline off_t
1304 wapbl_advance(size_t size, size_t off, off_t oldoff, size_t delta)
1305 {
1306 off_t newoff;
1307
1308 /* Define acceptable ranges for inputs. */
1309 KASSERT(delta <= (size_t)size);
1310 KASSERT((oldoff == 0) || ((size_t)oldoff >= off));
1311 KASSERT(oldoff < (off_t)(size + off));
1312
1313 if ((oldoff == 0) && (delta != 0))
1314 newoff = off + delta;
1315 else if ((oldoff + delta) < (size + off))
1316 newoff = oldoff + delta;
1317 else
1318 newoff = (oldoff + delta) - size;
1319
1320 /* Note some interesting axioms */
1321 KASSERT((delta != 0) || (newoff == oldoff));
1322 KASSERT((delta == 0) || (newoff != 0));
1323 KASSERT((delta != (size)) || (newoff == oldoff));
1324
1325 /* Define acceptable ranges for output. */
1326 KASSERT((newoff == 0) || ((size_t)newoff >= off));
1327 KASSERT((size_t)newoff < (size + off));
1328 return newoff;
1329 }
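/*
 * Worked example (arbitrary numbers): with size = 1000 and off = 24,
 * advancing oldoff = 0 (empty) by delta = 100 gives off + delta = 124.
 * Advancing oldoff = 1000 by delta = 100 wraps, since 1100 >= size +
 * off = 1024, and gives 1100 - 1000 = 100.  A delta of 0 always
 * returns oldoff unchanged.
 */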
1330
1331 /*
1332 * wapbl_space_free(avail, head, tail)
1333 *
1334 * Number of bytes free in a circular queue of avail total bytes,
1335 * in which everything from tail to head is used.
1336 */
1337 static inline size_t
1338 wapbl_space_free(size_t avail, off_t head, off_t tail)
1339 {
1340
1341 return avail - wapbl_space_used(avail, head, tail);
1342 }
1343
1344 /*
1345 * wapbl_advance_head(size, off, delta, headp, tailp)
1346 *
1347 * In a circular queue of size bytes starting at off, given the
1348 * old head and tail offsets *headp and *tailp, store the new head
1349 * and tail offsets in *headp and *tailp resulting from adding
1350 * delta bytes of data to the head.
1351 */
1352 static inline void
1353 wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp,
1354 off_t *tailp)
1355 {
1356 off_t head = *headp;
1357 off_t tail = *tailp;
1358
1359 KASSERT(delta <= wapbl_space_free(size, head, tail));
1360 head = wapbl_advance(size, off, head, delta);
1361 if ((tail == 0) && (head != 0))
1362 tail = off;
1363 *headp = head;
1364 *tailp = tail;
1365 }
1366
1367 /*
1368 * wapbl_advance_tail(size, off, delta, headp, tailp)
1369 *
1370 * In a circular queue of size bytes starting at off, given the
1371 * old head and tail offsets *headp and *tailp, store the new head
1372 * and tail offsets in *headp and *tailp resulting from removing
1373 * delta bytes of data from the tail.
1374 */
1375 static inline void
1376 wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp,
1377 off_t *tailp)
1378 {
1379 off_t head = *headp;
1380 off_t tail = *tailp;
1381
1382 KASSERT(delta <= wapbl_space_used(size, head, tail));
1383 tail = wapbl_advance(size, off, tail, delta);
1384 if (head == tail) {
1385 head = tail = 0;
1386 }
1387 *headp = head;
1388 *tailp = tail;
1389 }
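/*
 * Worked example (arbitrary numbers): with size = 1000 and off = 24,
 * growing an empty queue (head = tail = 0) by 100 via
 * wapbl_advance_head() gives head = 124 and pins tail = off = 24.
 * Consuming those 100 bytes via wapbl_advance_tail() advances tail to
 * 124 == head, which is collapsed back to head = tail = 0 (empty);
 * head == tail != 0 remains reserved for the full case.
 */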
1390
1391
1392 /****************************************************************/
1393
1394 /*
1395 * wapbl_truncate(wl, minfree)
1396 *
1397 * Wait until at least minfree bytes are available in the log.
1398 *
1399 * If it was necessary to wait for writes to complete,
1400 * advance the circular queue tail to reflect the new write
1401 * completions and issue a write commit to the log.
1402 *
1403 * => Caller must hold wl->wl_rwlock writer lock.
1404 */
1405 static int
1406 wapbl_truncate(struct wapbl *wl, size_t minfree)
1407 {
1408 size_t delta;
1409 size_t avail;
1410 off_t head;
1411 off_t tail;
1412 int error = 0;
1413
1414 KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes));
1415 KASSERT(rw_write_held(&wl->wl_rwlock));
1416
1417 mutex_enter(&wl->wl_mtx);
1418
1419 /*
1420 * First check to see if we have to do a commit
1421 * at all.
1422 */
1423 avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail);
1424 if (minfree < avail) {
1425 mutex_exit(&wl->wl_mtx);
1426 return 0;
1427 }
1428 minfree -= avail;
1429 while ((wl->wl_error_count == 0) &&
1430 (wl->wl_reclaimable_bytes < minfree)) {
1431 WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
1432 ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd "
1433 "minfree=%zd\n",
1434 &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes,
1435 minfree));
1436
1437 cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx);
1438 }
1439 if (wl->wl_reclaimable_bytes < minfree) {
1440 KASSERT(wl->wl_error_count);
1441 /* XXX maybe get actual error from buffer instead someday? */
1442 error = EIO;
1443 }
1444 head = wl->wl_head;
1445 tail = wl->wl_tail;
1446 delta = wl->wl_reclaimable_bytes;
1447
1448 /* If all of the entries are flushed, then be sure to keep
1449 * the reserved bytes reserved. Watch out for discarded transactions,
1450 * which could leave more bytes reserved than are reclaimable.
1451 */
1452 if (SIMPLEQ_EMPTY(&wl->wl_entries) &&
1453 (delta >= wl->wl_reserved_bytes)) {
1454 delta -= wl->wl_reserved_bytes;
1455 }
1456 wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head,
1457 &tail);
1458 KDASSERT(wl->wl_reserved_bytes <=
1459 wapbl_space_used(wl->wl_circ_size, head, tail));
1460 mutex_exit(&wl->wl_mtx);
1461
1462 if (error)
1463 return error;
1464
1465 /*
1466 * This is where head, tail and delta are unprotected
1467 * from races against truncate itself or flush. This is ok since
1468 * we only call this routine from inside flush itself.
1469 *
1470 * XXX: how can it race against itself when accessed only
1471 * from behind the write-locked rwlock?
1472 */
1473 error = wapbl_write_commit(wl, head, tail);
1474 if (error)
1475 return error;
1476
1477 wl->wl_head = head;
1478 wl->wl_tail = tail;
1479
1480 mutex_enter(&wl->wl_mtx);
1481 KASSERT(wl->wl_reclaimable_bytes >= delta);
1482 wl->wl_reclaimable_bytes -= delta;
1483 mutex_exit(&wl->wl_mtx);
1484 WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
1485 ("wapbl_truncate thread %d.%d truncating %zu bytes\n",
1486 curproc->p_pid, curlwp->l_lid, delta));
1487
1488 return 0;
1489 }
1490
1491 /****************************************************************/
1492
1493 void
1494 wapbl_biodone(struct buf *bp)
1495 {
1496 struct wapbl_entry *we = bp->b_private;
1497 struct wapbl *wl = we->we_wapbl;
1498 #ifdef WAPBL_DEBUG_BUFBYTES
1499 const int bufsize = bp->b_bufsize;
1500 #endif
1501
1502 /*
1503 * Handle possible flushing of buffers after the log has been
1504 * decommissioned.
1505 */
1506 if (!wl) {
1507 KASSERT(we->we_bufcount > 0);
1508 we->we_bufcount--;
1509 #ifdef WAPBL_DEBUG_BUFBYTES
1510 KASSERT(we->we_unsynced_bufbytes >= bufsize);
1511 we->we_unsynced_bufbytes -= bufsize;
1512 #endif
1513
1514 if (we->we_bufcount == 0) {
1515 #ifdef WAPBL_DEBUG_BUFBYTES
1516 KASSERT(we->we_unsynced_bufbytes == 0);
1517 #endif
1518 pool_put(&wapbl_entry_pool, we);
1519 }
1520
1521 brelse(bp, 0);
1522 return;
1523 }
1524
1525 #ifdef ohbother
1526 KDASSERT(bp->b_oflags & BO_DONE);
1527 KDASSERT(!(bp->b_oflags & BO_DELWRI));
1528 KDASSERT(bp->b_flags & B_ASYNC);
1529 KDASSERT(bp->b_cflags & BC_BUSY);
1530 KDASSERT(!(bp->b_flags & B_LOCKED));
1531 KDASSERT(!(bp->b_flags & B_READ));
1532 KDASSERT(!(bp->b_cflags & BC_INVAL));
1533 KDASSERT(!(bp->b_cflags & BC_NOCACHE));
1534 #endif
1535
1536 if (bp->b_error) {
1537 /*
1538 * If an error occurs, it would be nice to leave the buffer
1539 * as a delayed write on the LRU queue so that we can retry
1540 * it later. But buffercache(9) can't handle dirty buffer
1541 * reuse, so just mark the log permanently errored out.
1542 */
1543 mutex_enter(&wl->wl_mtx);
1544 if (wl->wl_error_count == 0) {
1545 wl->wl_error_count++;
1546 cv_broadcast(&wl->wl_reclaimable_cv);
1547 }
1548 mutex_exit(&wl->wl_mtx);
1549 }
1550
1551 /*
1552 * Make sure that the buf doesn't retain the media flags, so that
1553 * e.g. wapbl_allow_fuadpo has immediate effect on any following I/O.
1554 * The flags will be set again if needed by another I/O.
1555 */
1556 bp->b_flags &= ~B_MEDIA_FLAGS;
1557
1558 /*
1559 * Release the buffer here. wapbl_flush() may wait for the
1560 * log to become empty and we better unbusy the buffer before
1561 * wapbl_flush() returns.
1562 */
1563 brelse(bp, 0);
1564
1565 mutex_enter(&wl->wl_mtx);
1566
1567 KASSERT(we->we_bufcount > 0);
1568 we->we_bufcount--;
1569 #ifdef WAPBL_DEBUG_BUFBYTES
1570 KASSERT(we->we_unsynced_bufbytes >= bufsize);
1571 we->we_unsynced_bufbytes -= bufsize;
1572 KASSERT(wl->wl_unsynced_bufbytes >= bufsize);
1573 wl->wl_unsynced_bufbytes -= bufsize;
1574 #endif
1575 wl->wl_ev_metawrite.ev_count++;
1576
1577 /*
1578 * If the current transaction can be reclaimed, start
1579 * at the beginning and reclaim any consecutive reclaimable
1580 * transactions. If we successfully reclaim anything,
1581 * then wakeup anyone waiting for the reclaim.
1582 */
1583 if (we->we_bufcount == 0) {
1584 size_t delta = 0;
1585 int errcnt = 0;
1586 #ifdef WAPBL_DEBUG_BUFBYTES
1587 KDASSERT(we->we_unsynced_bufbytes == 0);
1588 #endif
1589 /*
1590 * clear any posted error, since the buffer it came from
1591 * has been successfully flushed by now
1592 */
1593 while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) &&
1594 (we->we_bufcount == 0)) {
1595 delta += we->we_reclaimable_bytes;
1596 if (we->we_error)
1597 errcnt++;
1598 SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
1599 pool_put(&wapbl_entry_pool, we);
1600 }
1601
1602 if (delta) {
1603 wl->wl_reclaimable_bytes += delta;
1604 KASSERT(wl->wl_error_count >= errcnt);
1605 wl->wl_error_count -= errcnt;
1606 cv_broadcast(&wl->wl_reclaimable_cv);
1607 }
1608 }
1609
1610 mutex_exit(&wl->wl_mtx);
1611 }
1612
1613 /*
1614 * wapbl_flush(wl, wait)
1615 *
1616 * Flush pending block writes, deallocations, and inodes from
1617 * the current transaction in memory to the log on disk:
1618 *
1619 * 1. Call the file system's wl_flush callback to flush any
1620 * per-file-system pending updates.
1621 * 2. Wait for enough space in the log for the current transaction.
1622 * 3. Synchronously write the new log records, advancing the
1623 * circular queue head.
1624 * 4. Issue the pending block writes asynchronously, now that they
1625 * are recorded in the log and can be replayed after crash.
1626 * 5. If wait is true, wait for all writes to complete and for the
1627 * log to become empty.
1628 *
1629 * On failure, call the file system's wl_flush_abort callback.
1630 */
1631 int
1632 wapbl_flush(struct wapbl *wl, int waitfor)
1633 {
1634 struct buf *bp;
1635 struct wapbl_entry *we;
1636 off_t off;
1637 off_t head;
1638 off_t tail;
1639 size_t delta = 0;
1640 size_t flushsize;
1641 size_t reserved;
1642 int error = 0;
1643
1644 /*
1645 * Do a quick check to see if a full flush can be skipped
1646 * This assumes that the flush callback does not need to be called
1647 * unless there are other outstanding bufs.
1648 */
1649 if (!waitfor) {
1650 size_t nbufs;
1651 mutex_enter(&wl->wl_mtx); /* XXX need mutex here to
1652 protect the KASSERTS */
1653 nbufs = wl->wl_bufcount;
1654 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
1655 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
1656 mutex_exit(&wl->wl_mtx);
1657 if (nbufs == 0)
1658 return 0;
1659 }
1660
1661 /*
1662 * XXX we may consider using LK_UPGRADE here
1663 * if we want to call flush from inside a transaction
1664 */
1665 rw_enter(&wl->wl_rwlock, RW_WRITER);
1666 wl->wl_flush(wl->wl_mount, TAILQ_FIRST(&wl->wl_dealloclist));
1667
1668 /*
1669 * Now that we are exclusively locked and the file system has
1670 * issued any deferred block writes for this transaction, check
1671 * whether there are any blocks to write to the log. If not,
1672 * skip waiting for space or writing any log entries.
1673 *
1674 * XXX Shouldn't this also check wl_dealloccnt and
1675 * wl_inohashcnt? Perhaps wl_dealloccnt doesn't matter if the
1676 * file system didn't produce any blocks as a consequence of
1677 * it, but the same does not seem to be so of wl_inohashcnt.
1678 */
1679 if (wl->wl_bufcount == 0) {
1680 goto wait_out;
1681 }
1682
1683 #if 0
1684 WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1685 ("wapbl_flush thread %d.%d flushing entries with "
1686 "bufcount=%zu bufbytes=%zu\n",
1687 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
1688 wl->wl_bufbytes));
1689 #endif
1690
1691 /* Calculate amount of space needed to flush */
1692 flushsize = wapbl_transaction_len(wl);
1693 if (wapbl_verbose_commit) {
1694 struct timespec ts;
1695 getnanotime(&ts);
1696 printf("%s: %lld.%09ld this transaction = %zu bytes\n",
1697 __func__, (long long)ts.tv_sec,
1698 (long)ts.tv_nsec, flushsize);
1699 }
1700
1701 if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
1702 /*
1703 * XXX this could be handled more gracefully, perhaps place
1704 * only a partial transaction in the log and allow the
1705 * remaining to flush without the protection of the journal.
1706 */
1707 panic("wapbl_flush: current transaction too big to flush");
1708 }
1709
1710 error = wapbl_truncate(wl, flushsize);
1711 if (error)
1712 goto out;
1713
1714 off = wl->wl_head;
1715 KASSERT((off == 0) || (off >= wl->wl_circ_off));
1716 KASSERT((off == 0) || (off < wl->wl_circ_off + wl->wl_circ_size));
1717 error = wapbl_write_blocks(wl, &off);
1718 if (error)
1719 goto out;
1720 error = wapbl_write_revocations(wl, &off);
1721 if (error)
1722 goto out;
1723 error = wapbl_write_inodes(wl, &off);
1724 if (error)
1725 goto out;
1726
1727 reserved = 0;
1728 if (wl->wl_inohashcnt)
1729 reserved = wapbl_transaction_inodes_len(wl);
1730
1731 head = wl->wl_head;
1732 tail = wl->wl_tail;
1733
1734 wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize,
1735 &head, &tail);
1736
1737 KASSERTMSG(head == off,
1738 "lost head! head=%"PRIdMAX" tail=%" PRIdMAX
1739 " off=%"PRIdMAX" flush=%zu",
1740 (intmax_t)head, (intmax_t)tail, (intmax_t)off,
1741 flushsize);
1742
1743 /* Opportunistically move the tail forward if we can */
1744 mutex_enter(&wl->wl_mtx);
1745 delta = wl->wl_reclaimable_bytes;
1746 mutex_exit(&wl->wl_mtx);
1747 wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta,
1748 &head, &tail);
1749
1750 error = wapbl_write_commit(wl, head, tail);
1751 if (error)
1752 goto out;
1753
1754 we = pool_get(&wapbl_entry_pool, PR_WAITOK);
1755
1756 #ifdef WAPBL_DEBUG_BUFBYTES
1757 WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1758 ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
1759 " unsynced=%zu"
1760 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
1761 "inodes=%d\n",
1762 curproc->p_pid, curlwp->l_lid, flushsize, delta,
1763 wapbl_space_used(wl->wl_circ_size, head, tail),
1764 wl->wl_unsynced_bufbytes, wl->wl_bufcount,
1765 wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt,
1766 wl->wl_inohashcnt));
1767 #else
1768 WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1769 ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
1770 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
1771 "inodes=%d\n",
1772 curproc->p_pid, curlwp->l_lid, flushsize, delta,
1773 wapbl_space_used(wl->wl_circ_size, head, tail),
1774 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
1775 wl->wl_dealloccnt, wl->wl_inohashcnt));
1776 #endif
1777
1778
1779 mutex_enter(&bufcache_lock);
1780 mutex_enter(&wl->wl_mtx);
1781
1782 wl->wl_reserved_bytes = reserved;
1783 wl->wl_head = head;
1784 wl->wl_tail = tail;
1785 KASSERT(wl->wl_reclaimable_bytes >= delta);
1786 wl->wl_reclaimable_bytes -= delta;
1787 KDASSERT(wl->wl_dealloccnt == 0);
1788 #ifdef WAPBL_DEBUG_BUFBYTES
1789 wl->wl_unsynced_bufbytes += wl->wl_bufbytes;
1790 #endif
1791
1792 we->we_wapbl = wl;
1793 we->we_bufcount = wl->wl_bufcount;
1794 #ifdef WAPBL_DEBUG_BUFBYTES
1795 we->we_unsynced_bufbytes = wl->wl_bufbytes;
1796 #endif
1797 we->we_reclaimable_bytes = flushsize;
1798 we->we_error = 0;
1799 SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries);
1800
1801 /*
1802 * This flushes bufs in the reverse order from that in which they were
1803 * queued; it shouldn't matter, but if we care we could use a TAILQ instead.
1804 * XXX Note they will get put on the LRU queue when they flush,
1805 * so we might actually want to change this to preserve order.
1806 */
1807 while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
1808 if (bbusy(bp, 0, 0, &wl->wl_mtx)) {
1809 continue;
1810 }
1811 bp->b_iodone = wapbl_biodone;
1812 bp->b_private = we;
1813
1814 /* make sure the block is written synchronously when FUA is in use */
1815 bp->b_flags |= WAPBL_MFLAGS(wl);
1816
1817 bremfree(bp);
1818 wapbl_remove_buf_locked(wl, bp);
1819 mutex_exit(&wl->wl_mtx);
1820 mutex_exit(&bufcache_lock);
1821 bawrite(bp);
1822 mutex_enter(&bufcache_lock);
1823 mutex_enter(&wl->wl_mtx);
1824 }
1825 mutex_exit(&wl->wl_mtx);
1826 mutex_exit(&bufcache_lock);
1827
1828 #if 0
1829 WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1830 ("wapbl_flush thread %d.%d done flushing entries...\n",
1831 curproc->p_pid, curlwp->l_lid));
1832 #endif
1833
1834 wait_out:
1835
1836 /*
1837 * If the waitfor flag is set, don't return until everything is
1838 * fully flushed and the on-disk log is empty.
1839 */
1840 if (waitfor) {
1841 error = wapbl_truncate(wl, wl->wl_circ_size -
1842 wl->wl_reserved_bytes);
1843 }
1844
1845 out:
1846 if (error) {
1847 wl->wl_flush_abort(wl->wl_mount,
1848 TAILQ_FIRST(&wl->wl_dealloclist));
1849 }
1850
1851 #ifdef WAPBL_DEBUG_PRINT
1852 if (error) {
1853 pid_t pid = -1;
1854 lwpid_t lid = -1;
1855 if (curproc)
1856 pid = curproc->p_pid;
1857 if (curlwp)
1858 lid = curlwp->l_lid;
1859 mutex_enter(&wl->wl_mtx);
1860 #ifdef WAPBL_DEBUG_BUFBYTES
1861 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1862 ("wapbl_flush: thread %d.%d aborted flush: "
1863 "error = %d\n"
1864 "\tbufcount=%zu bufbytes=%zu bcount=%zu "
1865 "deallocs=%d inodes=%d\n"
1866 "\terrcnt = %d, reclaimable=%zu reserved=%zu "
1867 "unsynced=%zu\n",
1868 pid, lid, error, wl->wl_bufcount,
1869 wl->wl_bufbytes, wl->wl_bcount,
1870 wl->wl_dealloccnt, wl->wl_inohashcnt,
1871 wl->wl_error_count, wl->wl_reclaimable_bytes,
1872 wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes));
1873 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
1874 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1875 ("\tentry: bufcount = %zu, reclaimable = %zu, "
1876 "error = %d, unsynced = %zu\n",
1877 we->we_bufcount, we->we_reclaimable_bytes,
1878 we->we_error, we->we_unsynced_bufbytes));
1879 }
1880 #else
1881 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1882 ("wapbl_flush: thread %d.%d aborted flush: "
1883 "error = %d\n"
1884 "\tbufcount=%zu bufbytes=%zu bcount=%zu "
1885 "deallocs=%d inodes=%d\n"
1886 "\terrcnt = %d, reclaimable=%zu reserved=%zu\n",
1887 pid, lid, error, wl->wl_bufcount,
1888 wl->wl_bufbytes, wl->wl_bcount,
1889 wl->wl_dealloccnt, wl->wl_inohashcnt,
1890 wl->wl_error_count, wl->wl_reclaimable_bytes,
1891 wl->wl_reserved_bytes));
1892 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
1893 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1894 ("\tentry: bufcount = %zu, reclaimable = %zu, "
1895 "error = %d\n", we->we_bufcount,
1896 we->we_reclaimable_bytes, we->we_error));
1897 }
1898 #endif
1899 mutex_exit(&wl->wl_mtx);
1900 }
1901 #endif
1902
1903 rw_exit(&wl->wl_rwlock);
1904 return error;
1905 }
1906
1907 /****************************************************************/
1908
1909 void
1910 wapbl_jlock_assert(struct wapbl *wl)
1911 {
1912
1913 KASSERT(rw_lock_held(&wl->wl_rwlock));
1914 }
1915
1916 void
1917 wapbl_junlock_assert(struct wapbl *wl)
1918 {
1919
1920 KASSERT(!rw_write_held(&wl->wl_rwlock));
1921 }
1922
1923 /****************************************************************/
1924
1925 /* locks missing */
1926 void
1927 wapbl_print(struct wapbl *wl,
1928 int full,
1929 void (*pr)(const char *, ...))
1930 {
1931 struct buf *bp;
1932 struct wapbl_entry *we;
1933 (*pr)("wapbl %p", wl);
1934 (*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n",
1935 wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn);
1936 (*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n",
1937 wl->wl_circ_size, wl->wl_circ_off,
1938 (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail);
1939 (*pr)("fs_dev_bshift = %d, log_dev_bshift = %d\n",
1940 wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift);
1941 #ifdef WAPBL_DEBUG_BUFBYTES
1942 (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
1943 "reserved = %zu errcnt = %d unsynced = %zu\n",
1944 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
1945 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
1946 wl->wl_error_count, wl->wl_unsynced_bufbytes);
1947 #else
1948 (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
1949 "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes,
1950 wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
1951 wl->wl_error_count);
1952 #endif
1953 (*pr)("\tdealloccnt = %d, dealloclim = %d\n",
1954 wl->wl_dealloccnt, wl->wl_dealloclim);
1955 (*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n",
1956 wl->wl_inohashcnt, wl->wl_inohashmask);
1957 (*pr)("entries:\n");
1958 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
1959 #ifdef WAPBL_DEBUG_BUFBYTES
1960 (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, "
1961 "unsynced = %zu\n",
1962 we->we_bufcount, we->we_reclaimable_bytes,
1963 we->we_error, we->we_unsynced_bufbytes);
1964 #else
1965 (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n",
1966 we->we_bufcount, we->we_reclaimable_bytes, we->we_error);
1967 #endif
1968 }
1969 if (full) {
1970 int cnt = 0;
1971 (*pr)("bufs =");
1972 LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) {
1973 if (!LIST_NEXT(bp, b_wapbllist)) {
1974 (*pr)(" %p", bp);
1975 } else if ((++cnt % 6) == 0) {
1976 (*pr)(" %p,\n\t", bp);
1977 } else {
1978 (*pr)(" %p,", bp);
1979 }
1980 }
1981 (*pr)("\n");
1982
1983 (*pr)("dealloced blks = ");
1984 {
1985 struct wapbl_dealloc *wd;
1986 cnt = 0;
1987 TAILQ_FOREACH(wd, &wl->wl_dealloclist, wd_entries) {
1988 (*pr)(" %"PRId64":%d,",
1989 wd->wd_blkno,
1990 wd->wd_len);
1991 if ((++cnt % 4) == 0) {
1992 (*pr)("\n\t");
1993 }
1994 }
1995 }
1996 (*pr)("\n");
1997
1998 (*pr)("registered inodes = ");
1999 {
2000 int i;
2001 cnt = 0;
2002 for (i = 0; i <= wl->wl_inohashmask; i++) {
2003 struct wapbl_ino_head *wih;
2004 struct wapbl_ino *wi;
2005
2006 wih = &wl->wl_inohash[i];
2007 LIST_FOREACH(wi, wih, wi_hash) {
2008 if (wi->wi_ino == 0)
2009 continue;
2010 (*pr)(" %"PRIu64"/0%06"PRIo32",",
2011 wi->wi_ino, wi->wi_mode);
2012 if ((++cnt % 4) == 0) {
2013 (*pr)("\n\t");
2014 }
2015 }
2016 }
2017 (*pr)("\n");
2018 }
2019 }
2020 }
2021
2022 #if defined(WAPBL_DEBUG) || defined(DDB)
2023 void
2024 wapbl_dump(struct wapbl *wl)
2025 {
2026 #if defined(WAPBL_DEBUG)
2027 if (!wl)
2028 wl = wapbl_debug_wl;
2029 #endif
2030 if (!wl)
2031 return;
2032 wapbl_print(wl, 1, printf);
2033 }
2034 #endif
2035
2036 /****************************************************************/
2037
2038 int
2039 wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len, bool force,
2040 void **cookiep)
2041 {
2042 struct wapbl_dealloc *wd;
2043 int error = 0;
2044
2045 wapbl_jlock_assert(wl);
2046
2047 mutex_enter(&wl->wl_mtx);
2048
2049 if (__predict_false(wl->wl_dealloccnt >= wl->wl_dealloclim)) {
2050 if (!force) {
2051 error = EAGAIN;
2052 goto out;
2053 }
2054
2055 		/*
2056 		 * Forced registration can only be used when:
2057 		 * 1) the caller can't cope with failure, and
2058 		 * 2) the path can be triggered only a small, bounded
2059 		 *    number of times per transaction.
2060 		 * If this is not fulfilled and the path is triggered
2061 		 * many times, it could overflow the maximum transaction
2062 		 * size and panic later.
2063 		 */
2064 printf("%s: forced dealloc registration over limit: %d >= %d\n",
2065 wl->wl_mount->mnt_stat.f_mntonname,
2066 wl->wl_dealloccnt, wl->wl_dealloclim);
2067 }
2068
2069 wl->wl_dealloccnt++;
2070 mutex_exit(&wl->wl_mtx);
2071
2072 wd = pool_get(&wapbl_dealloc_pool, PR_WAITOK);
2073 wd->wd_blkno = blk;
2074 wd->wd_len = len;
2075
2076 mutex_enter(&wl->wl_mtx);
2077 TAILQ_INSERT_TAIL(&wl->wl_dealloclist, wd, wd_entries);
2078
2079 if (cookiep)
2080 *cookiep = wd;
2081
2082 out:
2083 mutex_exit(&wl->wl_mtx);
2084
2085 WAPBL_PRINTF(WAPBL_PRINT_ALLOC,
2086 ("wapbl_register_deallocation: blk=%"PRId64" len=%d error=%d\n",
2087 blk, len, error));
2088
2089 return error;
2090 }
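/*
 * Illustrative (non-compiled) sketch of the intended calling pattern;
 * "wl", "blk", "len" and the "aborted" flag are assumed to come from
 * the calling file system and are not defined in this file:
 */
#if 0
	void *cookie;
	int error;

	error = wapbl_register_deallocation(wl, blk, len, false, &cookie);
	if (error == EAGAIN) {
		/*
		 * The dealloc limit for the open transaction was hit;
		 * flush the log to start a fresh transaction and retry.
		 * Only callers that cannot cope with failure should
		 * pass force=true instead.
		 */
		error = wapbl_flush(wl, 0);
		if (error == 0)
			error = wapbl_register_deallocation(wl, blk, len,
			    false, &cookie);
	}
	/* If the operation is later aborted, withdraw the registration. */
	if (error == 0 && aborted)
		wapbl_unregister_deallocation(wl, cookie);
#endif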
2091
2092 static void
2093 wapbl_deallocation_free(struct wapbl *wl, struct wapbl_dealloc *wd,
2094 bool locked)
2095 {
2096 KASSERT(!locked
2097 || rw_lock_held(&wl->wl_rwlock) || mutex_owned(&wl->wl_mtx));
2098
2099 if (!locked)
2100 mutex_enter(&wl->wl_mtx);
2101
2102 TAILQ_REMOVE(&wl->wl_dealloclist, wd, wd_entries);
2103 wl->wl_dealloccnt--;
2104
2105 if (!locked)
2106 mutex_exit(&wl->wl_mtx);
2107
2108 pool_put(&wapbl_dealloc_pool, wd);
2109 }
2110
2111 void
2112 wapbl_unregister_deallocation(struct wapbl *wl, void *cookie)
2113 {
2114 KASSERT(cookie != NULL);
2115 wapbl_deallocation_free(wl, cookie, false);
2116 }
2117
2118 /****************************************************************/
2119
2120 static void
2121 wapbl_inodetrk_init(struct wapbl *wl, u_int size)
2122 {
2123
2124 wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask);
2125 if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) {
2126 pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0,
2127 "wapblinopl", &pool_allocator_nointr, IPL_NONE);
2128 }
2129 }
2130
2131 static void
2132 wapbl_inodetrk_free(struct wapbl *wl)
2133 {
2134
2135 /* XXX this KASSERT needs locking/mutex analysis */
2136 KASSERT(wl->wl_inohashcnt == 0);
2137 hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask);
2138 if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) {
2139 pool_destroy(&wapbl_ino_pool);
2140 }
2141 }
2142
2143 static struct wapbl_ino *
2144 wapbl_inodetrk_get(struct wapbl *wl, ino_t ino)
2145 {
2146 struct wapbl_ino_head *wih;
2147 struct wapbl_ino *wi;
2148
2149 KASSERT(mutex_owned(&wl->wl_mtx));
2150
2151 wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
2152 LIST_FOREACH(wi, wih, wi_hash) {
2153 if (ino == wi->wi_ino)
2154 return wi;
2155 }
2156 	return NULL;
2157 }
2158
2159 void
2160 wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode)
2161 {
2162 struct wapbl_ino_head *wih;
2163 struct wapbl_ino *wi;
2164
2165 wi = pool_get(&wapbl_ino_pool, PR_WAITOK);
2166
2167 mutex_enter(&wl->wl_mtx);
2168 if (wapbl_inodetrk_get(wl, ino) == NULL) {
2169 wi->wi_ino = ino;
2170 wi->wi_mode = mode;
2171 wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
2172 LIST_INSERT_HEAD(wih, wi, wi_hash);
2173 wl->wl_inohashcnt++;
2174 WAPBL_PRINTF(WAPBL_PRINT_INODE,
2175 ("wapbl_register_inode: ino=%"PRId64"\n", ino));
2176 mutex_exit(&wl->wl_mtx);
2177 } else {
2178 mutex_exit(&wl->wl_mtx);
2179 pool_put(&wapbl_ino_pool, wi);
2180 }
2181 }
2182
2183 void
2184 wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode)
2185 {
2186 struct wapbl_ino *wi;
2187
2188 mutex_enter(&wl->wl_mtx);
2189 wi = wapbl_inodetrk_get(wl, ino);
2190 if (wi) {
2191 WAPBL_PRINTF(WAPBL_PRINT_INODE,
2192 ("wapbl_unregister_inode: ino=%"PRId64"\n", ino));
2193 KASSERT(wl->wl_inohashcnt > 0);
2194 wl->wl_inohashcnt--;
2195 LIST_REMOVE(wi, wi_hash);
2196 mutex_exit(&wl->wl_mtx);
2197
2198 pool_put(&wapbl_ino_pool, wi);
2199 } else {
2200 mutex_exit(&wl->wl_mtx);
2201 }
2202 }
2203
2204 /****************************************************************/
2205
2206 /*
2207 * wapbl_transaction_inodes_len(wl)
2208 *
2209 * Calculate the number of bytes required for inode registration
2210 * log records in wl.
2211 */
2212 static inline size_t
2213 wapbl_transaction_inodes_len(struct wapbl *wl)
2214 {
2215 int blocklen = 1<<wl->wl_log_dev_bshift;
2216 int iph;
2217
2218 	/* Calculate number of inodes described in an inodelist header */
2219 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
2220 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
2221
2222 KASSERT(iph > 0);
2223
2224 return MAX(1, howmany(wl->wl_inohashcnt, iph)) * blocklen;
2225 }
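/*
 * Worked example (structure sizes assumed for illustration only):
 * with a 512-byte log block, a 16-byte wapbl_wc_inodelist header and
 * 12-byte inode records, iph = (512 - 16) / 12 = 41, so registering
 * wl_inohashcnt = 100 inodes costs MAX(1, howmany(100, 41)) = 3
 * blocks, i.e. 1536 bytes of log space.
 */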
2226
2227
2228 /*
2229 * wapbl_transaction_len(wl)
2230 *
2231 * Calculate number of bytes required for all log records in wl.
2232 */
2233 static size_t
2234 wapbl_transaction_len(struct wapbl *wl)
2235 {
2236 int blocklen = 1<<wl->wl_log_dev_bshift;
2237 size_t len;
2238
2239 	/* Data bytes plus blocklist/revocation headers plus inode records */
2240 len = wl->wl_bcount;
2241 len += howmany(wl->wl_bufcount, wl->wl_brperjblock) * blocklen;
2242 len += howmany(wl->wl_dealloccnt, wl->wl_brperjblock) * blocklen;
2243 len += wapbl_transaction_inodes_len(wl);
2244
2245 return len;
2246 }
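/*
 * Worked example (numbers assumed for illustration only): with
 * blocklen = 512, wl_brperjblock = 62, wl_bcount = 16384 data bytes,
 * wl_bufcount = 20 and wl_dealloccnt = 5, the total is
 * 16384 + howmany(20, 62)*512 + howmany(5, 62)*512 = 17408 bytes,
 * plus whatever wapbl_transaction_inodes_len() contributes.
 */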
2247
2248 /*
2249 * wapbl_cache_sync(wl, msg)
2250 *
2251 * Issue DIOCCACHESYNC to wl->wl_devvp.
2252 *
2253 * If sysctl(vfs.wapbl.verbose_commit) >= 2, print a message
2254 * including msg about the duration of the cache sync.
2255 */
2256 static int
2257 wapbl_cache_sync(struct wapbl *wl, const char *msg)
2258 {
2259 const bool verbose = wapbl_verbose_commit >= 2;
2260 struct bintime start_time;
2261 int force = 1;
2262 int error;
2263
2264 /* Skip full cache sync if disabled, or when using FUA */
2265 if (!wapbl_flush_disk_cache || WAPBL_USE_FUA(wl)) {
2266 return 0;
2267 }
2268 if (verbose) {
2269 bintime(&start_time);
2270 }
2271 error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force,
2272 FWRITE, FSCRED);
2273 if (error) {
2274 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
2275 ("wapbl_cache_sync: DIOCCACHESYNC on dev 0x%jx "
2276 "returned %d\n", (uintmax_t)wl->wl_devvp->v_rdev, error));
2277 }
2278 if (verbose) {
2279 struct bintime d;
2280 struct timespec ts;
2281
2282 bintime(&d);
2283 bintime_sub(&d, &start_time);
2284 bintime2timespec(&d, &ts);
2285 printf("wapbl_cache_sync: %s: dev 0x%jx %ju.%09lu\n",
2286 msg, (uintmax_t)wl->wl_devvp->v_rdev,
2287 (uintmax_t)ts.tv_sec, ts.tv_nsec);
2288 }
2289
2290 wl->wl_ev_cacheflush.ev_count++;
2291
2292 return error;
2293 }
2294
2295 /*
2296 * wapbl_write_commit(wl, head, tail)
2297 *
2298 * Issue a disk cache sync to wait for all pending writes to the
2299 * log to complete, and then synchronously commit the current
2300 * circular queue head and tail to the log, in the next of two
2301 * locations for commit headers on disk.
2302 *
2303 * Increment the generation number. If the generation number
2304 * rolls over to zero, then a subsequent commit would appear to
2305 * have an older generation than this one -- in that case, issue a
2306 * duplicate commit to avoid this.
2307 *
2308 * => Caller must have exclusive access to wl, either by holding
2309 * wl->wl_rwlock for writer or by being wapbl_start before anyone
2310 * else has seen wl.
2311 */
2312 static int
2313 wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail)
2314 {
2315 struct wapbl_wc_header *wc = wl->wl_wc_header;
2316 struct timespec ts;
2317 int error;
2318 daddr_t pbn;
2319
2320 error = wapbl_buffered_flush(wl);
2321 if (error)
2322 return error;
2323 	/*
2324 	 * Flush the disk cache to ensure that the blocks we've written
2325 	 * actually reach stable storage before the commit header.
2326 	 *
2327 	 * XXX We should calculate a checksum here; we do this instead for now.
2328 	 */
2329 wapbl_cache_sync(wl, "1");
2330
2331 wc->wc_head = head;
2332 wc->wc_tail = tail;
2333 wc->wc_checksum = 0;
2334 wc->wc_version = 1;
2335 getnanotime(&ts);
2336 wc->wc_time = ts.tv_sec;
2337 wc->wc_timensec = ts.tv_nsec;
2338
2339 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2340 ("wapbl_write_commit: head = %"PRIdMAX "tail = %"PRIdMAX"\n",
2341 (intmax_t)head, (intmax_t)tail));
2342
2343 /*
2344 * write the commit header.
2345 *
2346 * XXX if generation will rollover, then first zero
2347 * over second commit header before trying to write both headers.
2348 */
2349
2350 pbn = wl->wl_logpbn + (wc->wc_generation % 2);
2351 #ifdef _KERNEL
2352 pbn = btodb(pbn << wc->wc_log_dev_bshift);
2353 #endif
2354 error = wapbl_buffered_write(wc, wc->wc_len, wl, pbn);
2355 if (error)
2356 return error;
2357 error = wapbl_buffered_flush(wl);
2358 if (error)
2359 return error;
2360
2361 /*
2362 	 * Flush the disk cache to ensure that the commit header is actually
2363 	 * written before the metadata blocks.
2364 */
2365 wapbl_cache_sync(wl, "2");
2366
2367 /*
2368 * If the generation number was zero, write it out a second time.
2369 	 * This handles initialization and generation number rollover.
2370 */
2371 if (wc->wc_generation++ == 0) {
2372 error = wapbl_write_commit(wl, head, tail);
2373 		/*
2374 		 * It should be possible to remove this panic if we do
2375 		 * the zeroing mentioned above and are certain to roll
2376 		 * back the generation number on failure.
2377 		 */
2378 if (error)
2379 panic("wapbl_write_commit: error writing duplicate "
2380 "log header: %d", error);
2381 }
2382
2383 wl->wl_ev_commit.ev_count++;
2384
2385 return 0;
2386 }
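/*
 * The two-slot commit scheme in a nutshell (illustrative excerpt of
 * code that appears elsewhere in this file): the slot alternates with
 * the generation number, so the previous commit header survives a
 * torn write of the current one, and replay simply trusts the slot
 * carrying the larger generation.
 */
#if 0
	/* writer side, above: */
	pbn = wl->wl_logpbn + (wc->wc_generation % 2);

	/* reader side, in wapbl_replay_start(): */
	if (wch2->wc_generation > wch->wc_generation)
		wch = wch2;
#endif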
2387
2388 /*
2389 * wapbl_write_blocks(wl, offp)
2390 *
2391 * Write all pending physical blocks in the current transaction
2392 * from wapbl_add_buf to the log on disk, adding to the circular
2393 * queue head at byte offset *offp, and returning the new head's
2394 * byte offset in *offp.
2395 */
2396 static int
2397 wapbl_write_blocks(struct wapbl *wl, off_t *offp)
2398 {
2399 struct wapbl_wc_blocklist *wc =
2400 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
2401 int blocklen = 1<<wl->wl_log_dev_bshift;
2402 struct buf *bp;
2403 off_t off = *offp;
2404 int error;
2405 size_t padding;
2406
2407 KASSERT(rw_write_held(&wl->wl_rwlock));
2408
2409 bp = LIST_FIRST(&wl->wl_bufs);
2410
2411 while (bp) {
2412 int cnt;
2413 struct buf *obp = bp;
2414
2415 KASSERT(bp->b_flags & B_LOCKED);
2416
2417 wc->wc_type = WAPBL_WC_BLOCKS;
2418 wc->wc_len = blocklen;
2419 wc->wc_blkcount = 0;
2420 while (bp && (wc->wc_blkcount < wl->wl_brperjblock)) {
2421 /*
2422 * Make sure all the physical block numbers are up to
2423 * date. If this is not always true on a given
2424 * filesystem, then VOP_BMAP must be called. We
2425 * could call VOP_BMAP here, or else in the filesystem
2426 * specific flush callback, although neither of those
2427 * solutions allow us to take the vnode lock. If a
2428 * filesystem requires that we must take the vnode lock
2429 * to call VOP_BMAP, then we can probably do it in
2430 * bwrite when the vnode lock should already be held
2431 * by the invoking code.
2432 */
2433 KASSERT((bp->b_vp->v_type == VBLK) ||
2434 (bp->b_blkno != bp->b_lblkno));
2435 KASSERT(bp->b_blkno > 0);
2436
2437 wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno;
2438 wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount;
2439 wc->wc_len += bp->b_bcount;
2440 wc->wc_blkcount++;
2441 bp = LIST_NEXT(bp, b_wapbllist);
2442 }
2443 if (wc->wc_len % blocklen != 0) {
2444 padding = blocklen - wc->wc_len % blocklen;
2445 wc->wc_len += padding;
2446 } else {
2447 padding = 0;
2448 }
2449
2450 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2451 ("wapbl_write_blocks: len = %u (padding %zu) off = %"PRIdMAX"\n",
2452 wc->wc_len, padding, (intmax_t)off));
2453
2454 error = wapbl_circ_write(wl, wc, blocklen, &off);
2455 if (error)
2456 return error;
2457 bp = obp;
2458 cnt = 0;
2459 while (bp && (cnt++ < wl->wl_brperjblock)) {
2460 error = wapbl_circ_write(wl, bp->b_data,
2461 bp->b_bcount, &off);
2462 if (error)
2463 return error;
2464 bp = LIST_NEXT(bp, b_wapbllist);
2465 }
2466 if (padding) {
2467 void *zero;
2468
2469 zero = wapbl_alloc(padding);
2470 memset(zero, 0, padding);
2471 error = wapbl_circ_write(wl, zero, padding, &off);
2472 wapbl_free(zero, padding);
2473 if (error)
2474 return error;
2475 }
2476 }
2477 *offp = off;
2478 return 0;
2479 }
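/*
 * Padding example (numbers assumed for illustration only): with
 * blocklen = 512 and two queued buffers of 1024 and 300 bytes,
 * wc_len = 512 + 1024 + 300 = 1836; 1836 % 512 = 300, so padding =
 * 212 and the record occupies exactly 2048 bytes = 4 log blocks.
 */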
2480
2481 /*
2482 * wapbl_write_revocations(wl, offp)
2483 *
2484 * Write all pending deallocations in the current transaction from
2485 * wapbl_register_deallocation to the log on disk, adding to the
2486 * circular queue's head at byte offset *offp, and returning the
2487 * new head's byte offset in *offp.
2488 */
2489 static int
2490 wapbl_write_revocations(struct wapbl *wl, off_t *offp)
2491 {
2492 struct wapbl_wc_blocklist *wc =
2493 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
2494 struct wapbl_dealloc *wd, *lwd;
2495 int blocklen = 1<<wl->wl_log_dev_bshift;
2496 off_t off = *offp;
2497 int error;
2498
2499 KASSERT(rw_write_held(&wl->wl_rwlock));
2500
2501 if (wl->wl_dealloccnt == 0)
2502 return 0;
2503
2504 while ((wd = TAILQ_FIRST(&wl->wl_dealloclist)) != NULL) {
2505 wc->wc_type = WAPBL_WC_REVOCATIONS;
2506 wc->wc_len = blocklen;
2507 wc->wc_blkcount = 0;
2508 while (wd && (wc->wc_blkcount < wl->wl_brperjblock)) {
2509 wc->wc_blocks[wc->wc_blkcount].wc_daddr =
2510 wd->wd_blkno;
2511 wc->wc_blocks[wc->wc_blkcount].wc_dlen =
2512 wd->wd_len;
2513 wc->wc_blkcount++;
2514
2515 wd = TAILQ_NEXT(wd, wd_entries);
2516 }
2517 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2518 ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n",
2519 wc->wc_len, (intmax_t)off));
2520 error = wapbl_circ_write(wl, wc, blocklen, &off);
2521 if (error)
2522 return error;
2523
2524 /* free all successfully written deallocs */
2525 lwd = wd;
2526 while ((wd = TAILQ_FIRST(&wl->wl_dealloclist)) != NULL) {
2527 if (wd == lwd)
2528 break;
2529 wapbl_deallocation_free(wl, wd, true);
2530 }
2531 }
2532 *offp = off;
2533 return 0;
2534 }
2535
2536 /*
2537 * wapbl_write_inodes(wl, offp)
2538 *
2539 * Write all pending inode allocations in the current transaction
2540 * from wapbl_register_inode to the log on disk, adding to the
2541 * circular queue's head at byte offset *offp and returning the
2542 * new head's byte offset in *offp.
2543 */
2544 static int
2545 wapbl_write_inodes(struct wapbl *wl, off_t *offp)
2546 {
2547 struct wapbl_wc_inodelist *wc =
2548 (struct wapbl_wc_inodelist *)wl->wl_wc_scratch;
2549 int i;
2550 int blocklen = 1 << wl->wl_log_dev_bshift;
2551 off_t off = *offp;
2552 int error;
2553
2554 struct wapbl_ino_head *wih;
2555 struct wapbl_ino *wi;
2556 int iph;
2557
2558 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
2559 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
2560
2561 i = 0;
2562 wih = &wl->wl_inohash[0];
2563 	wi = NULL;
2564 do {
2565 wc->wc_type = WAPBL_WC_INODES;
2566 wc->wc_len = blocklen;
2567 wc->wc_inocnt = 0;
2568 wc->wc_clear = (i == 0);
2569 while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) {
2570 while (!wi) {
2571 KASSERT((wih - &wl->wl_inohash[0])
2572 <= wl->wl_inohashmask);
2573 wi = LIST_FIRST(wih++);
2574 }
2575 wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino;
2576 wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode;
2577 wc->wc_inocnt++;
2578 i++;
2579 wi = LIST_NEXT(wi, wi_hash);
2580 }
2581 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2582 ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n",
2583 wc->wc_len, (intmax_t)off));
2584 error = wapbl_circ_write(wl, wc, blocklen, &off);
2585 if (error)
2586 return error;
2587 } while (i < wl->wl_inohashcnt);
2588
2589 *offp = off;
2590 return 0;
2591 }
2592
2593 #endif /* _KERNEL */
2594
2595 /****************************************************************/
2596
2597 struct wapbl_blk {
2598 LIST_ENTRY(wapbl_blk) wb_hash;
2599 daddr_t wb_blk;
2600 off_t wb_off; /* Offset of this block in the log */
2601 };
2602 #define WAPBL_BLKPOOL_MIN 83
2603
2604 static void
2605 wapbl_blkhash_init(struct wapbl_replay *wr, u_int size)
2606 {
2607 if (size < WAPBL_BLKPOOL_MIN)
2608 size = WAPBL_BLKPOOL_MIN;
2609 KASSERT(wr->wr_blkhash == 0);
2610 #ifdef _KERNEL
2611 wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask);
2612 #else /* ! _KERNEL */
2613 /* Manually implement hashinit */
2614 {
2615 unsigned long i, hashsize;
2616 for (hashsize = 1; hashsize < size; hashsize <<= 1)
2617 continue;
2618 wr->wr_blkhash = wapbl_alloc(hashsize * sizeof(*wr->wr_blkhash));
2619 for (i = 0; i < hashsize; i++)
2620 LIST_INIT(&wr->wr_blkhash[i]);
2621 wr->wr_blkhashmask = hashsize - 1;
2622 }
2623 #endif /* ! _KERNEL */
2624 }
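/*
 * The userland fallback rounds the table up to a power of two, e.g.
 * size = 83 (WAPBL_BLKPOOL_MIN) yields hashsize = 128 and
 * wr_blkhashmask = 127, matching what hashinit() produces in the
 * kernel case.
 */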
2625
2626 static void
2627 wapbl_blkhash_free(struct wapbl_replay *wr)
2628 {
2629 KASSERT(wr->wr_blkhashcnt == 0);
2630 #ifdef _KERNEL
2631 hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask);
2632 #else /* ! _KERNEL */
2633 wapbl_free(wr->wr_blkhash,
2634 (wr->wr_blkhashmask + 1) * sizeof(*wr->wr_blkhash));
2635 #endif /* ! _KERNEL */
2636 }
2637
2638 static struct wapbl_blk *
2639 wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk)
2640 {
2641 struct wapbl_blk_head *wbh;
2642 struct wapbl_blk *wb;
2643 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2644 LIST_FOREACH(wb, wbh, wb_hash) {
2645 if (blk == wb->wb_blk)
2646 return wb;
2647 }
2648 	return NULL;
2649 }
2650
2651 static void
2652 wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off)
2653 {
2654 struct wapbl_blk_head *wbh;
2655 struct wapbl_blk *wb;
2656 wb = wapbl_blkhash_get(wr, blk);
2657 if (wb) {
2658 KASSERT(wb->wb_blk == blk);
2659 wb->wb_off = off;
2660 } else {
2661 wb = wapbl_alloc(sizeof(*wb));
2662 wb->wb_blk = blk;
2663 wb->wb_off = off;
2664 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2665 LIST_INSERT_HEAD(wbh, wb, wb_hash);
2666 wr->wr_blkhashcnt++;
2667 }
2668 }
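/*
 * Note that re-inserting a block that is already present only
 * updates wb_off.  Since replay scans the log from tail to head,
 * the hash ends up pointing at the newest copy of each block --
 * exactly the one wapbl_replay_write() should write back.
 */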
2669
2670 static void
2671 wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk)
2672 {
2673 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2674 if (wb) {
2675 KASSERT(wr->wr_blkhashcnt > 0);
2676 wr->wr_blkhashcnt--;
2677 LIST_REMOVE(wb, wb_hash);
2678 wapbl_free(wb, sizeof(*wb));
2679 }
2680 }
2681
2682 static void
2683 wapbl_blkhash_clear(struct wapbl_replay *wr)
2684 {
2685 unsigned long i;
2686 for (i = 0; i <= wr->wr_blkhashmask; i++) {
2687 struct wapbl_blk *wb;
2688
2689 while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) {
2690 KASSERT(wr->wr_blkhashcnt > 0);
2691 wr->wr_blkhashcnt--;
2692 LIST_REMOVE(wb, wb_hash);
2693 wapbl_free(wb, sizeof(*wb));
2694 }
2695 }
2696 KASSERT(wr->wr_blkhashcnt == 0);
2697 }
2698
2699 /****************************************************************/
2700
2701 /*
2702 * wapbl_circ_read(wr, data, len, offp)
2703 *
2704 * Read len bytes into data from the circular queue of wr,
2705 * starting at the linear byte offset *offp, and returning the new
2706 * linear byte offset in *offp.
2707 *
2708 * If the starting linear byte offset precedes wr->wr_circ_off,
2709 * the read instead begins at wr->wr_circ_off. XXX WTF? This
2710 * should be a KASSERT, not a conditional.
2711 */
2712 static int
2713 wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp)
2714 {
2715 size_t slen;
2716 off_t off = *offp;
2717 int error;
2718 daddr_t pbn;
2719
2720 KASSERT(((len >> wr->wr_log_dev_bshift) <<
2721 wr->wr_log_dev_bshift) == len);
2722
2723 if (off < wr->wr_circ_off)
2724 off = wr->wr_circ_off;
2725 slen = wr->wr_circ_off + wr->wr_circ_size - off;
2726 if (slen < len) {
2727 pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
2728 #ifdef _KERNEL
2729 pbn = btodb(pbn << wr->wr_log_dev_bshift);
2730 #endif
2731 error = wapbl_read(data, slen, wr->wr_devvp, pbn);
2732 if (error)
2733 return error;
2734 data = (uint8_t *)data + slen;
2735 len -= slen;
2736 off = wr->wr_circ_off;
2737 }
2738 pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
2739 #ifdef _KERNEL
2740 pbn = btodb(pbn << wr->wr_log_dev_bshift);
2741 #endif
2742 error = wapbl_read(data, len, wr->wr_devvp, pbn);
2743 if (error)
2744 return error;
2745 off += len;
2746 if (off >= wr->wr_circ_off + wr->wr_circ_size)
2747 off = wr->wr_circ_off;
2748 *offp = off;
2749 return 0;
2750 }
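/*
 * Wraparound example (offsets assumed for illustration only): with
 * wr_circ_off = 1024 and wr_circ_size = 8192 the log occupies
 * [1024, 9216).  Reading len = 1024 at *offp = 8704 first reads
 * slen = 1024 + 8192 - 8704 = 512 bytes at 8704, then the remaining
 * 512 bytes at offset 1024, and returns *offp = 1536.
 */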
2751
2752 /*
2753 * wapbl_circ_advance(wr, len, offp)
2754 *
2755 * Compute the linear byte offset of the circular queue of wr that
2756 * is len bytes past *offp, and store it in *offp.
2757 *
2758  * This is like wapbl_circ_read, but without actually reading
2759 * anything.
2760 *
2761 * If the starting linear byte offset precedes wr->wr_circ_off, it
2762 * is taken to be wr->wr_circ_off instead. XXX WTF? This should
2763 * be a KASSERT, not a conditional.
2764 */
2765 static void
2766 wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp)
2767 {
2768 size_t slen;
2769 off_t off = *offp;
2770
2771 KASSERT(((len >> wr->wr_log_dev_bshift) <<
2772 wr->wr_log_dev_bshift) == len);
2773
2774 if (off < wr->wr_circ_off)
2775 off = wr->wr_circ_off;
2776 slen = wr->wr_circ_off + wr->wr_circ_size - off;
2777 if (slen < len) {
2778 len -= slen;
2779 off = wr->wr_circ_off;
2780 }
2781 off += len;
2782 if (off >= wr->wr_circ_off + wr->wr_circ_size)
2783 off = wr->wr_circ_off;
2784 *offp = off;
2785 }
2786
2787 /****************************************************************/
2788
2789 int
2790 wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp,
2791 daddr_t off, size_t count, size_t blksize)
2792 {
2793 struct wapbl_replay *wr;
2794 int error;
2795 struct vnode *devvp;
2796 daddr_t logpbn;
2797 uint8_t *scratch;
2798 struct wapbl_wc_header *wch;
2799 struct wapbl_wc_header *wch2;
2800 /* Use this until we read the actual log header */
2801 int log_dev_bshift = ilog2(blksize);
2802 size_t used;
2803 daddr_t pbn;
2804
2805 WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
2806 ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n",
2807 vp, off, count, blksize));
2808
2809 if (off < 0)
2810 return EINVAL;
2811
2812 if (blksize < DEV_BSIZE)
2813 return EINVAL;
2814 if (blksize % DEV_BSIZE)
2815 return EINVAL;
2816
2817 #ifdef _KERNEL
2818 #if 0
2819 /* XXX vp->v_size isn't reliably set for VBLK devices,
2820 * especially root. However, we might still want to verify
2821 * that the full load is readable */
2822 if ((off + count) * blksize > vp->v_size)
2823 return EINVAL;
2824 #endif
2825 if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) {
2826 return error;
2827 }
2828 #else /* ! _KERNEL */
2829 devvp = vp;
2830 logpbn = off;
2831 #endif /* ! _KERNEL */
2832
2833 scratch = wapbl_alloc(MAXBSIZE);
2834
2835 pbn = logpbn;
2836 #ifdef _KERNEL
2837 pbn = btodb(pbn << log_dev_bshift);
2838 #endif
2839 error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, pbn);
2840 if (error)
2841 goto errout;
2842
2843 wch = (struct wapbl_wc_header *)scratch;
2844 wch2 =
2845 (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift));
2846 /* XXX verify checksums and magic numbers */
2847 if (wch->wc_type != WAPBL_WC_HEADER) {
2848 printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type);
2849 error = EFTYPE;
2850 goto errout;
2851 }
2852
2853 if (wch2->wc_generation > wch->wc_generation)
2854 wch = wch2;
2855
2856 wr = wapbl_calloc(1, sizeof(*wr));
2857
2858 wr->wr_logvp = vp;
2859 wr->wr_devvp = devvp;
2860 wr->wr_logpbn = logpbn;
2861
2862 wr->wr_scratch = scratch;
2863
2864 wr->wr_log_dev_bshift = wch->wc_log_dev_bshift;
2865 wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift;
2866 wr->wr_circ_off = wch->wc_circ_off;
2867 wr->wr_circ_size = wch->wc_circ_size;
2868 wr->wr_generation = wch->wc_generation;
2869
2870 used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail);
2871
2872 WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
2873 ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64
2874 " len=%"PRId64" used=%zu\n",
2875 wch->wc_head, wch->wc_tail, wch->wc_circ_off,
2876 wch->wc_circ_size, used));
2877
2878 wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift));
2879
2880 error = wapbl_replay_process(wr, wch->wc_head, wch->wc_tail);
2881 if (error) {
2882 wapbl_replay_stop(wr);
2883 wapbl_replay_free(wr);
2884 return error;
2885 }
2886
2887 *wrp = wr;
2888 return 0;
2889
2890 errout:
2891 wapbl_free(scratch, MAXBSIZE);
2892 return error;
2893 }
2894
2895 void
2896 wapbl_replay_stop(struct wapbl_replay *wr)
2897 {
2898
2899 if (!wapbl_replay_isopen(wr))
2900 return;
2901
2902 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n"));
2903
2904 wapbl_free(wr->wr_scratch, MAXBSIZE);
2905 wr->wr_scratch = NULL;
2906
2907 wr->wr_logvp = NULL;
2908
2909 wapbl_blkhash_clear(wr);
2910 wapbl_blkhash_free(wr);
2911 }
2912
2913 void
2914 wapbl_replay_free(struct wapbl_replay *wr)
2915 {
2916
2917 KDASSERT(!wapbl_replay_isopen(wr));
2918
2919 if (wr->wr_inodes)
2920 wapbl_free(wr->wr_inodes,
2921 wr->wr_inodescnt * sizeof(wr->wr_inodes[0]));
2922 wapbl_free(wr, sizeof(*wr));
2923 }
2924
2925 #ifdef _KERNEL
2926 int
2927 wapbl_replay_isopen1(struct wapbl_replay *wr)
2928 {
2929
2930 return wapbl_replay_isopen(wr);
2931 }
2932 #endif
2933
2934 /*
2935  * wapbl_block_daddr(wc, i, j, blen)
2936  *
2937  *	Calculate the disk address for the i'th block in the
2938  *	wc_blocklist, offset by j blocks of size blen.
2939  *
2940  *	wc_daddr is always a kernel disk address in DEV_BSIZE units
2941  *	that was written to the journal.
2942  *
2943  *	The kernel needs that address plus the offset in DEV_BSIZE
2944  *	units; userland needs that address plus the offset in blen units.
2945  */
2946 static daddr_t
2947 wapbl_block_daddr(struct wapbl_wc_blocklist *wc, int i, int j, int blen)
2948 {
2949 daddr_t pbn;
2950
2951 #ifdef _KERNEL
2952 pbn = wc->wc_blocks[i].wc_daddr + btodb(j * blen);
2953 #else
2954 pbn = dbtob(wc->wc_blocks[i].wc_daddr) / blen + j;
2955 #endif
2956
2957 return pbn;
2958 }
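/*
 * Unit-conversion example (assuming DEV_BSIZE = 512): for blen = 2048
 * and j = 1, the kernel adds btodb(1 * 2048) = 4 DEV_BSIZE sectors to
 * wc_daddr, while userland converts wc_daddr to bytes, divides by
 * blen and adds one whole 2048-byte block -- the same physical
 * location expressed in each side's preferred unit.
 */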
2959
2960 static void
2961 wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp)
2962 {
2963 struct wapbl_wc_blocklist *wc =
2964 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2965 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2966 int i, j, n;
2967
2968 for (i = 0; i < wc->wc_blkcount; i++) {
2969 /*
2970 * Enter each physical block into the hashtable independently.
2971 */
2972 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
2973 for (j = 0; j < n; j++) {
2974 wapbl_blkhash_ins(wr, wapbl_block_daddr(wc, i, j, fsblklen),
2975 *offp);
2976 wapbl_circ_advance(wr, fsblklen, offp);
2977 }
2978 }
2979 }
2980
2981 static void
2982 wapbl_replay_process_revocations(struct wapbl_replay *wr)
2983 {
2984 struct wapbl_wc_blocklist *wc =
2985 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2986 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2987 int i, j, n;
2988
2989 for (i = 0; i < wc->wc_blkcount; i++) {
2990 /*
2991 * Remove any blocks found from the hashtable.
2992 */
2993 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
2994 for (j = 0; j < n; j++)
2995 wapbl_blkhash_rem(wr, wapbl_block_daddr(wc, i, j, fsblklen));
2996 }
2997 }
2998
2999 static void
3000 wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff, off_t newoff)
3001 {
3002 struct wapbl_wc_inodelist *wc =
3003 (struct wapbl_wc_inodelist *)wr->wr_scratch;
3004 void *new_inodes;
3005 const size_t oldsize = wr->wr_inodescnt * sizeof(wr->wr_inodes[0]);
3006
3007 KASSERT(sizeof(wr->wr_inodes[0]) == sizeof(wc->wc_inodes[0]));
3008
3009 /*
3010 	 * Keep track of where we found this so that the location
3011 	 * won't be overwritten.
3012 */
3013 if (wc->wc_clear) {
3014 wr->wr_inodestail = oldoff;
3015 wr->wr_inodescnt = 0;
3016 if (wr->wr_inodes != NULL) {
3017 wapbl_free(wr->wr_inodes, oldsize);
3018 wr->wr_inodes = NULL;
3019 }
3020 }
3021 wr->wr_inodeshead = newoff;
3022 if (wc->wc_inocnt == 0)
3023 return;
3024
3025 new_inodes = wapbl_alloc((wr->wr_inodescnt + wc->wc_inocnt) *
3026 sizeof(wr->wr_inodes[0]));
3027 if (wr->wr_inodes != NULL) {
3028 memcpy(new_inodes, wr->wr_inodes, oldsize);
3029 wapbl_free(wr->wr_inodes, oldsize);
3030 }
3031 wr->wr_inodes = new_inodes;
3032 memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes,
3033 wc->wc_inocnt * sizeof(wr->wr_inodes[0]));
3034 wr->wr_inodescnt += wc->wc_inocnt;
3035 }
3036
3037 static int
3038 wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail)
3039 {
3040 off_t off;
3041 int error;
3042
3043 int logblklen = 1 << wr->wr_log_dev_bshift;
3044
3045 wapbl_blkhash_clear(wr);
3046
3047 off = tail;
3048 while (off != head) {
3049 struct wapbl_wc_null *wcn;
3050 off_t saveoff = off;
3051 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
3052 if (error)
3053 goto errout;
3054 wcn = (struct wapbl_wc_null *)wr->wr_scratch;
3055 switch (wcn->wc_type) {
3056 case WAPBL_WC_BLOCKS:
3057 wapbl_replay_process_blocks(wr, &off);
3058 break;
3059
3060 case WAPBL_WC_REVOCATIONS:
3061 wapbl_replay_process_revocations(wr);
3062 break;
3063
3064 case WAPBL_WC_INODES:
3065 wapbl_replay_process_inodes(wr, saveoff, off);
3066 break;
3067
3068 default:
3069 printf("Unrecognized wapbl type: 0x%08x\n",
3070 wcn->wc_type);
3071 error = EFTYPE;
3072 goto errout;
3073 }
3074 wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
3075 if (off != saveoff) {
3076 printf("wapbl_replay: corrupted records\n");
3077 error = EFTYPE;
3078 goto errout;
3079 }
3080 }
3081 return 0;
3082
3083 errout:
3084 wapbl_blkhash_clear(wr);
3085 return error;
3086 }
3087
3088 #if 0
3089 int
3090 wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp)
3091 {
3092 off_t off;
3093 int mismatchcnt = 0;
3094 int logblklen = 1 << wr->wr_log_dev_bshift;
3095 int fsblklen = 1 << wr->wr_fs_dev_bshift;
3096 void *scratch1 = wapbl_alloc(MAXBSIZE);
3097 void *scratch2 = wapbl_alloc(MAXBSIZE);
3098 int error = 0;
3099
3100 KDASSERT(wapbl_replay_isopen(wr));
3101
3102 off = wch->wc_tail;
3103 while (off != wch->wc_head) {
3104 struct wapbl_wc_null *wcn;
3105 #ifdef DEBUG
3106 off_t saveoff = off;
3107 #endif
3108 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
3109 if (error)
3110 goto out;
3111 wcn = (struct wapbl_wc_null *)wr->wr_scratch;
3112 switch (wcn->wc_type) {
3113 case WAPBL_WC_BLOCKS:
3114 {
3115 struct wapbl_wc_blocklist *wc =
3116 (struct wapbl_wc_blocklist *)wr->wr_scratch;
3117 int i;
3118 for (i = 0; i < wc->wc_blkcount; i++) {
3119 int foundcnt = 0;
3120 int dirtycnt = 0;
3121 int j, n;
3122 				/*
3123 				 * Check each physical block against
3124 				 * the hashtable independently.
3125 				 */
3126 n = wc->wc_blocks[i].wc_dlen >>
3127 wch->wc_fs_dev_bshift;
3128 for (j = 0; j < n; j++) {
3129 struct wapbl_blk *wb =
3130 wapbl_blkhash_get(wr,
3131 wapbl_block_daddr(wc, i, j, fsblklen));
3132 if (wb && (wb->wb_off == off)) {
3133 foundcnt++;
3134 error =
3135 wapbl_circ_read(wr,
3136 scratch1, fsblklen,
3137 &off);
3138 if (error)
3139 goto out;
3140 error =
3141 wapbl_read(scratch2,
3142 fsblklen, fsdevvp,
3143 wb->wb_blk);
3144 if (error)
3145 goto out;
3146 if (memcmp(scratch1,
3147 scratch2,
3148 fsblklen)) {
3149 printf(
3150 "wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n",
3151 wb->wb_blk, (intmax_t)off);
3152 dirtycnt++;
3153 mismatchcnt++;
3154 }
3155 } else {
3156 wapbl_circ_advance(wr,
3157 fsblklen, &off);
3158 }
3159 }
3160 #if 0
3161 /*
3162 * If all of the blocks in an entry
3163 * are clean, then remove all of its
3164 * blocks from the hashtable since they
3165 * never will need replay.
3166 */
3167 if ((foundcnt != 0) &&
3168 (dirtycnt == 0)) {
3169 off = saveoff;
3170 wapbl_circ_advance(wr,
3171 logblklen, &off);
3172 for (j = 0; j < n; j++) {
3173 struct wapbl_blk *wb =
3174 wapbl_blkhash_get(wr,
3175 wapbl_block_daddr(wc, i, j, fsblklen));
3176 if (wb &&
3177 (wb->wb_off == off)) {
3178 wapbl_blkhash_rem(wr, wb->wb_blk);
3179 }
3180 wapbl_circ_advance(wr,
3181 fsblklen, &off);
3182 }
3183 }
3184 #endif
3185 }
3186 }
3187 break;
3188 case WAPBL_WC_REVOCATIONS:
3189 case WAPBL_WC_INODES:
3190 break;
3191 default:
3192 KASSERT(0);
3193 }
3194 #ifdef DEBUG
3195 wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
3196 KASSERT(off == saveoff);
3197 #endif
3198 }
3199 out:
3200 wapbl_free(scratch1, MAXBSIZE);
3201 wapbl_free(scratch2, MAXBSIZE);
3202 if (!error && mismatchcnt)
3203 error = EFTYPE;
3204 return error;
3205 }
3206 #endif
3207
3208 int
3209 wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp)
3210 {
3211 struct wapbl_blk *wb;
3212 size_t i;
3213 off_t off;
3214 void *scratch;
3215 int error = 0;
3216 int fsblklen = 1 << wr->wr_fs_dev_bshift;
3217
3218 KDASSERT(wapbl_replay_isopen(wr));
3219
3220 scratch = wapbl_alloc(MAXBSIZE);
3221
3222 for (i = 0; i <= wr->wr_blkhashmask; ++i) {
3223 LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) {
3224 off = wb->wb_off;
3225 error = wapbl_circ_read(wr, scratch, fsblklen, &off);
3226 if (error)
3227 break;
3228 error = wapbl_write(scratch, fsblklen, fsdevvp,
3229 wb->wb_blk);
3230 if (error)
3231 break;
3232 }
3233 }
3234
3235 wapbl_free(scratch, MAXBSIZE);
3236 return error;
3237 }
3238
3239 int
3240 wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len)
3241 {
3242 int fsblklen = 1 << wr->wr_fs_dev_bshift;
3243
3244 KDASSERT(wapbl_replay_isopen(wr));
3245 KASSERT((len % fsblklen) == 0);
3246
3247 	while (len != 0) {
3248 		struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
3249 		if (wb)
3250 			return 1;
3251 		len -= fsblklen; blk++;	/* XXX advance blk as wapbl_replay_read() does */
3252 	}
3253 return 0;
3254 }
3255
3256 int
3257 wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len)
3258 {
3259 int fsblklen = 1 << wr->wr_fs_dev_bshift;
3260
3261 KDASSERT(wapbl_replay_isopen(wr));
3262
3263 KASSERT((len % fsblklen) == 0);
3264
3265 while (len != 0) {
3266 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
3267 if (wb) {
3268 off_t off = wb->wb_off;
3269 int error;
3270 error = wapbl_circ_read(wr, data, fsblklen, &off);
3271 if (error)
3272 return error;
3273 }
3274 data = (uint8_t *)data + fsblklen;
3275 len -= fsblklen;
3276 blk++;
3277 }
3278 return 0;
3279 }
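/*
 * Illustrative (non-compiled) sketch of how a file system consults
 * the replay state while the journal is still dirty; "mp", "data",
 * "blk", "len" and ordinary_device_read() are assumed stand-ins for
 * the caller's own context, not definitions from this file:
 */
#if 0
	struct wapbl_replay *wr = mp->mnt_wapbl_replay;
	int error;

	if (wapbl_replay_isopen(wr) &&
	    wapbl_replay_can_read(wr, blk, len)) {
		/* a newer copy lives in the log; read it from there */
		error = wapbl_replay_read(wr, data, blk, len);
	} else {
		/* otherwise fall back to the on-disk copy */
		error = ordinary_device_read(data, blk, len);
	}
#endif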
3280
3281 #ifdef _KERNEL
3282
3283 MODULE(MODULE_CLASS_VFS, wapbl, NULL);
3284
3285 static int
3286 wapbl_modcmd(modcmd_t cmd, void *arg)
3287 {
3288
3289 switch (cmd) {
3290 case MODULE_CMD_INIT:
3291 wapbl_init();
3292 return 0;
3293 case MODULE_CMD_FINI:
3294 return wapbl_fini();
3295 default:
3296 return ENOTTY;
3297 }
3298 }
3299 #endif /* _KERNEL */
3300