/*	$NetBSD: vfs_wapbl.c,v 1.52.2.4 2014/08/20 00:04:29 tls Exp $	*/

/*-
 * Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * This implements file-system-independent write-ahead logging (WAPBL).
 */

#define WAPBL_INTERNAL

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.52.2.4 2014/08/20 00:04:29 tls Exp $");

#include <sys/param.h>
#include <sys/bitops.h>

#ifdef _KERNEL
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/module.h>
#include <sys/resourcevar.h>
#include <sys/conf.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/kauth.h>
#include <sys/mutex.h>
#include <sys/atomic.h>
#include <sys/wapbl.h>
#include <sys/wapbl_replay.h>

#include <miscfs/specfs/specdev.h>

#define	wapbl_alloc(s)		kmem_alloc((s), KM_SLEEP)
#define	wapbl_free(a, s)	kmem_free((a), (s))
#define	wapbl_calloc(n, s)	kmem_zalloc((n)*(s), KM_SLEEP)

static struct sysctllog *wapbl_sysctl;
static int wapbl_flush_disk_cache = 1;
static int wapbl_verbose_commit = 0;

static inline size_t wapbl_space_free(size_t, off_t, off_t);

#else /* !_KERNEL */

#include <assert.h>
#include <errno.h>
#include <stdio.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

#include <sys/time.h>
#include <sys/wapbl.h>
#include <sys/wapbl_replay.h>

#define	KDASSERT(x)		assert(x)
#define	KASSERT(x)		assert(x)
#define	wapbl_alloc(s)		malloc(s)
#define	wapbl_free(a, s)	free(a)
#define	wapbl_calloc(n, s)	calloc((n), (s))

#endif /* !_KERNEL */

/*
 * INTERNAL DATA STRUCTURES
 */

/*
 * This structure holds per-mount log information.
 *
 * Legend:	a = atomic access only
 *		r = read-only after init
 *		l = rwlock held
 *		m = mutex held
 *		lm = rwlock held writing or mutex held
 *		u = unlocked access ok
 *		b = bufcache_lock held
 */
struct wapbl {
	struct vnode *wl_logvp;	/* r: log here */
	struct vnode *wl_devvp;	/* r: log on this device */
	struct mount *wl_mount;	/* r: mountpoint wl is associated with */
	daddr_t wl_logpbn;	/* r: Physical block number of start of log */
	int wl_log_dev_bshift;	/* r: logarithm of device block size of log
				      device */
	int wl_fs_dev_bshift;	/* r: logarithm of device block size of
				      filesystem device */

	unsigned wl_lock_count;	/* m: Count of transactions in progress */

	size_t wl_circ_size;	/* r: Number of bytes in buffer of log */
	size_t wl_circ_off;	/* r: Number of bytes reserved at start */

	size_t wl_bufcount_max;	/* r: Number of buffers reserved for log */
	size_t wl_bufbytes_max;	/* r: Number of buf bytes reserved for log */

	off_t wl_head;		/* l: Byte offset of log head */
	off_t wl_tail;		/* l: Byte offset of log tail */
	/*
	 * head == tail == 0 means log is empty.
	 * head == tail != 0 means log is full.
	 * See the assertions in wapbl_advance() for other boundary
	 * conditions.
	 * Only truncate moves the tail, except when flush sets it to
	 * wl_circ_off.  Only flush moves the head, except when truncate
	 * sets it to 0.
	 */
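	/*
	 * Illustration (hypothetical values): with wl_circ_off = 1024 and
	 * wl_circ_size = 8192, a fresh log has head == tail == 0 (empty).
	 * Flushing a 2048-byte transaction sets head = 1024 + 2048 = 3072
	 * and tail = 1024; once those buffers reach disk, truncate moves
	 * the tail toward the head, and both reset to 0 when the log
	 * drains completely.
	 */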

	struct wapbl_wc_header *wl_wc_header;	/* l */
	void *wl_wc_scratch;	/* l: scratch space (XXX: why?!?) */

	kmutex_t wl_mtx;	/* u: short-term lock */
	krwlock_t wl_rwlock;	/* u: File system transaction lock */

	/*
	 * Must be held while accessing
	 * wl_count or wl_bufs or head or tail
	 */

	/*
	 * Callback called from within the flush routine to flush any extra
	 * bits.  Note that flush may be skipped without calling this if
	 * there are no outstanding buffers in the transaction.
	 */
#if _KERNEL
	wapbl_flush_fn_t wl_flush;	/* r */
	wapbl_flush_fn_t wl_flush_abort;/* r */
#endif

	size_t wl_bufbytes;	/* m: Byte count of pages in wl_bufs */
	size_t wl_bufcount;	/* m: Count of buffers in wl_bufs */
	size_t wl_bcount;	/* m: Total bcount of wl_bufs */

	LIST_HEAD(, buf) wl_bufs; /* m: Buffers in current transaction */

	kcondvar_t wl_reclaimable_cv;	/* m (obviously) */
	size_t wl_reclaimable_bytes;	/* m: Amount of space available for
					      reclamation by truncate */
	int wl_error_count;	/* m: # of wl_entries with errors */
	size_t wl_reserved_bytes; /* never truncate log smaller than this */

#ifdef WAPBL_DEBUG_BUFBYTES
	size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */
#endif

	daddr_t *wl_deallocblks;/* lm: address of block */
	int *wl_dealloclens;	/* lm: size of block */
	int wl_dealloccnt;	/* lm: total count */
	int wl_dealloclim;	/* l: max count */

	/* hashtable of inode numbers for allocated but unlinked inodes */
	/* synch ??? */
	LIST_HEAD(wapbl_ino_head, wapbl_ino) *wl_inohash;
	u_long wl_inohashmask;
	int wl_inohashcnt;

	SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction
						   accounting */

	u_char *wl_buffer;	/* l: buffer for wapbl_buffered_write() */
	daddr_t wl_buffer_dblk;	/* l: buffer disk block address */
	size_t wl_buffer_used;	/* l: buffer current use */
};
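
/*
 * On-disk layout, for illustration: the first two log-device blocks of
 * the journal hold commit headers (wapbl_write_commit() alternates
 * between them by generation number), and the wl_circ_size bytes
 * starting at wl_circ_off form the circular area into which blocks,
 * revocations and inode lists are written.  The tail points at the
 * oldest committed data still needed for replay; the head is where the
 * next transaction will be written.
 */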

#ifdef WAPBL_DEBUG_PRINT
int wapbl_debug_print = WAPBL_DEBUG_PRINT;
#endif

/****************************************************************/
#ifdef _KERNEL

#ifdef WAPBL_DEBUG
struct wapbl *wapbl_debug_wl;
#endif

static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail);
static int wapbl_write_blocks(struct wapbl *wl, off_t *offp);
static int wapbl_write_revocations(struct wapbl *wl, off_t *offp);
static int wapbl_write_inodes(struct wapbl *wl, off_t *offp);
#endif /* _KERNEL */

static int wapbl_replay_process(struct wapbl_replay *wr, off_t, off_t);

static inline size_t wapbl_space_used(size_t avail, off_t head,
	off_t tail);

#ifdef _KERNEL

static struct pool wapbl_entry_pool;

#define	WAPBL_INODETRK_SIZE 83
static int wapbl_ino_pool_refcount;
static struct pool wapbl_ino_pool;
struct wapbl_ino {
	LIST_ENTRY(wapbl_ino) wi_hash;
	ino_t wi_ino;
	mode_t wi_mode;
};

static void wapbl_inodetrk_init(struct wapbl *wl, u_int size);
static void wapbl_inodetrk_free(struct wapbl *wl);
static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino);

static size_t wapbl_transaction_len(struct wapbl *wl);
static inline size_t wapbl_transaction_inodes_len(struct wapbl *wl);

#if 0
int wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
#endif

static int wapbl_replay_isopen1(struct wapbl_replay *);

/*
 * This is useful for debugging.  If set, the log will
 * only be truncated when necessary.
 */
int wapbl_lazy_truncate = 0;

struct wapbl_ops wapbl_ops = {
	.wo_wapbl_discard	= wapbl_discard,
	.wo_wapbl_replay_isopen	= wapbl_replay_isopen1,
	.wo_wapbl_replay_can_read = wapbl_replay_can_read,
	.wo_wapbl_replay_read	= wapbl_replay_read,
	.wo_wapbl_add_buf	= wapbl_add_buf,
	.wo_wapbl_remove_buf	= wapbl_remove_buf,
	.wo_wapbl_resize_buf	= wapbl_resize_buf,
	.wo_wapbl_begin		= wapbl_begin,
	.wo_wapbl_end		= wapbl_end,
	.wo_wapbl_junlock_assert= wapbl_junlock_assert,

	/* XXX: the following is only used to say "this is a wapbl buf" */
	.wo_wapbl_biodone	= wapbl_biodone,
};

static int
wapbl_sysctl_init(void)
{
	int rv;
	const struct sysctlnode *rnode, *cnode;

	wapbl_sysctl = NULL;

	rv = sysctl_createv(&wapbl_sysctl, 0, NULL, &rnode,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "wapbl",
	    SYSCTL_DESCR("WAPBL journaling options"),
	    NULL, 0, NULL, 0,
	    CTL_VFS, CTL_CREATE, CTL_EOL);
	if (rv)
		return rv;

	rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_INT, "flush_disk_cache",
	    SYSCTL_DESCR("flush disk cache"),
	    NULL, 0, &wapbl_flush_disk_cache, 0,
	    CTL_CREATE, CTL_EOL);
	if (rv)
		return rv;

	rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_INT, "verbose_commit",
	    SYSCTL_DESCR("show time and size of wapbl log commits"),
	    NULL, 0, &wapbl_verbose_commit, 0,
	    CTL_CREATE, CTL_EOL);
	return rv;
}

static void
wapbl_init(void)
{

	pool_init(&wapbl_entry_pool, sizeof(struct wapbl_entry), 0, 0, 0,
	    "wapblentrypl", &pool_allocator_kmem, IPL_VM);

	wapbl_sysctl_init();
}

#ifdef notyet
static int
wapbl_fini(bool interface)
{

	if (wapbl_sysctl != NULL)
		sysctl_teardown(&wapbl_sysctl);

	pool_destroy(&wapbl_entry_pool);

	return 0;
}
#endif

static int
wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr)
{
	int error, i;

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_start: reusing log with %d inodes\n", wr->wr_inodescnt));

	/*
	 * It's only valid to reuse the replay log if it's
	 * the same as the new log we just opened.
	 */
	KDASSERT(!wapbl_replay_isopen(wr));
	KASSERT(wl->wl_devvp->v_type == VBLK);
	KASSERT(wr->wr_devvp->v_type == VBLK);
	KASSERT(wl->wl_devvp->v_rdev == wr->wr_devvp->v_rdev);
	KASSERT(wl->wl_logpbn == wr->wr_logpbn);
	KASSERT(wl->wl_circ_size == wr->wr_circ_size);
	KASSERT(wl->wl_circ_off == wr->wr_circ_off);
	KASSERT(wl->wl_log_dev_bshift == wr->wr_log_dev_bshift);
	KASSERT(wl->wl_fs_dev_bshift == wr->wr_fs_dev_bshift);

	wl->wl_wc_header->wc_generation = wr->wr_generation + 1;

	for (i = 0; i < wr->wr_inodescnt; i++)
		wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber,
		    wr->wr_inodes[i].wr_imode);

	/* Make sure new transaction won't overwrite old inodes list */
	KDASSERT(wapbl_transaction_len(wl) <=
	    wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead,
	    wr->wr_inodestail));

	wl->wl_head = wl->wl_tail = wr->wr_inodeshead;
	wl->wl_reclaimable_bytes = wl->wl_reserved_bytes =
	    wapbl_transaction_len(wl);

	error = wapbl_write_inodes(wl, &wl->wl_head);
	if (error)
		return error;

	KASSERT(wl->wl_head != wl->wl_tail);
	KASSERT(wl->wl_head != 0);

	return 0;
}

int
wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp,
	daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr,
	wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn)
{
	struct wapbl *wl;
	struct vnode *devvp;
	daddr_t logpbn;
	int error;
	int log_dev_bshift = ilog2(blksize);
	int fs_dev_bshift = log_dev_bshift;
	int run;

	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64
	    " count=%zu blksize=%zu\n", vp, off, count, blksize));

	if (log_dev_bshift > fs_dev_bshift) {
		WAPBL_PRINTF(WAPBL_PRINT_OPEN,
		    ("wapbl: log device's block size cannot be larger "
		     "than filesystem's\n"));
		/*
		 * Not currently implemented, although it could be if
		 * needed someday.
		 */
		return ENOSYS;
	}

	if (off < 0)
		return EINVAL;

	if (blksize < DEV_BSIZE)
		return EINVAL;
	if (blksize % DEV_BSIZE)
		return EINVAL;

	/* XXXTODO: verify that the full load is writable */

	/*
	 * XXX check for minimum log size
	 * minimum is governed by minimum amount of space
	 * to complete a transaction. (probably truncate)
	 */
	/* XXX for now pick something minimal */
	if ((count * blksize) < mp->mnt_maxphys) {
		return ENOSPC;
	}

	if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) {
		return error;
	}

	wl = wapbl_calloc(1, sizeof(*wl));
	rw_init(&wl->wl_rwlock);
	mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&wl->wl_reclaimable_cv, "wapblrec");
	LIST_INIT(&wl->wl_bufs);
	SIMPLEQ_INIT(&wl->wl_entries);

	wl->wl_logvp = vp;
	wl->wl_devvp = devvp;
	wl->wl_mount = mp;
	wl->wl_logpbn = logpbn;
	wl->wl_log_dev_bshift = log_dev_bshift;
	wl->wl_fs_dev_bshift = fs_dev_bshift;

	wl->wl_flush = flushfn;
	wl->wl_flush_abort = flushabortfn;

	/* Reserve two log device blocks for the commit headers */
	wl->wl_circ_off = 2<<wl->wl_log_dev_bshift;
	wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off);
	/* truncate the log usage to a multiple of log_dev_bshift */
	wl->wl_circ_size >>= wl->wl_log_dev_bshift;
	wl->wl_circ_size <<= wl->wl_log_dev_bshift;

	/*
	 * wl_bufbytes_max limits the size of the in-memory transaction space.
	 * - Since buffers are allocated and accounted for in units of
	 *   PAGE_SIZE it is required to be a multiple of PAGE_SIZE
	 *   (i.e. 1<<PAGE_SHIFT)
	 * - Since the log device has to be written in units of
	 *   1<<wl_log_dev_bshift it is required to be a multiple of
	 *   1<<wl_log_dev_bshift.
	 * - Since filesystem will provide data in units of 1<<wl_fs_dev_bshift,
	 *   it is convenient to be a multiple of 1<<wl_fs_dev_bshift.
	 * Therefore it must be a multiple of the least common multiple of
	 * those three quantities.  Fortunately, all of those quantities are
	 * guaranteed to be a power of two, and the least common multiple of
	 * a set of numbers which are all powers of two is simply the maximum
	 * of those numbers.  Finally, the maximum logarithm of a power of two
	 * is the same as the log of the maximum power of two.  So we can do
	 * the following operations to size wl_bufbytes_max:
	 */
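	/*
	 * Worked example (hypothetical values): with PAGE_SHIFT = 12
	 * (4 KiB pages) and wl_log_dev_bshift = wl_fs_dev_bshift = 9
	 * (512-byte blocks), the LCM is 1<<12, so the shift pairs below
	 * just round wl_bufbytes_max down to a multiple of 4096; a
	 * candidate value of 10000 bytes would come out as 8192.
	 */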

	/* XXX fix actual number of pages reserved per filesystem. */
	wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2);

	/* Round wl_bufbytes_max to the largest power of two constraint */
	wl->wl_bufbytes_max >>= PAGE_SHIFT;
	wl->wl_bufbytes_max <<= PAGE_SHIFT;
	wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift;

	/* XXX maybe use filesystem fragment size instead of 1024 */
	/* XXX fix actual number of buffers reserved per filesystem. */
	wl->wl_bufcount_max = (nbuf / 2) * 1024;

	/* XXX tie this into resource estimation */
	wl->wl_dealloclim = wl->wl_bufbytes_max / mp->mnt_stat.f_bsize / 2;

	wl->wl_deallocblks = wapbl_alloc(sizeof(*wl->wl_deallocblks) *
	    wl->wl_dealloclim);
	wl->wl_dealloclens = wapbl_alloc(sizeof(*wl->wl_dealloclens) *
	    wl->wl_dealloclim);

	wl->wl_buffer = wapbl_alloc(MAXPHYS);
	wl->wl_buffer_used = 0;

	wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);

	/* Initialize the commit header */
	{
		struct wapbl_wc_header *wc;
		size_t len = 1 << wl->wl_log_dev_bshift;
		wc = wapbl_calloc(1, len);
		wc->wc_type = WAPBL_WC_HEADER;
		wc->wc_len = len;
		wc->wc_circ_off = wl->wl_circ_off;
		wc->wc_circ_size = wl->wl_circ_size;
		/* XXX wc->wc_fsid */
		wc->wc_log_dev_bshift = wl->wl_log_dev_bshift;
		wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift;
		wl->wl_wc_header = wc;
		wl->wl_wc_scratch = wapbl_alloc(len);
	}

	/*
	 * if there was an existing set of unlinked but
	 * allocated inodes, preserve it in the new
	 * log.
	 */
	if (wr && wr->wr_inodescnt) {
		error = wapbl_start_flush_inodes(wl, wr);
		if (error)
			goto errout;
	}

	error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail);
	if (error) {
		goto errout;
	}

	*wlp = wl;
#if defined(WAPBL_DEBUG)
	wapbl_debug_wl = wl;
#endif

	return 0;
 errout:
	wapbl_discard(wl);
	wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_deallocblks,
	    sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim);
	wapbl_free(wl->wl_dealloclens,
	    sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim);
	wapbl_free(wl->wl_buffer, MAXPHYS);
	wapbl_inodetrk_free(wl);
	wapbl_free(wl, sizeof(*wl));

	return error;
}

/*
 * Like wapbl_flush, only discards the transaction
 * completely
 */

void
wapbl_discard(struct wapbl *wl)
{
	struct wapbl_entry *we;
	struct buf *bp;
	int i;

	/*
	 * XXX we may consider using upgrade here
	 * if we want to call flush from inside a transaction
	 */
	rw_enter(&wl->wl_rwlock, RW_WRITER);
	wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
	    wl->wl_dealloccnt);

#ifdef WAPBL_DEBUG_PRINT
	{
		pid_t pid = -1;
		lwpid_t lid = -1;
		if (curproc)
			pid = curproc->p_pid;
		if (curlwp)
			lid = curlwp->l_lid;
#ifdef WAPBL_DEBUG_BUFBYTES
		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
		    ("wapbl_discard: thread %d.%d discarding "
		    "transaction\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %u, reclaimable=%zu reserved=%zu "
		    "unsynced=%zu\n",
		    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
		    wl->wl_bcount, wl->wl_dealloccnt,
		    wl->wl_inohashcnt, wl->wl_error_count,
		    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
		    wl->wl_unsynced_bufbytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d, unsynced = %zu\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error, we->we_unsynced_bufbytes));
		}
#else /* !WAPBL_DEBUG_BUFBYTES */
		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
		    ("wapbl_discard: thread %d.%d discarding transaction\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %u, reclaimable=%zu reserved=%zu\n",
		    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
		    wl->wl_bcount, wl->wl_dealloccnt,
		    wl->wl_inohashcnt, wl->wl_error_count,
		    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error));
		}
#endif /* !WAPBL_DEBUG_BUFBYTES */
	}
#endif /* WAPBL_DEBUG_PRINT */

	for (i = 0; i <= wl->wl_inohashmask; i++) {
		struct wapbl_ino_head *wih;
		struct wapbl_ino *wi;

		wih = &wl->wl_inohash[i];
		while ((wi = LIST_FIRST(wih)) != NULL) {
			LIST_REMOVE(wi, wi_hash);
			pool_put(&wapbl_ino_pool, wi);
			KASSERT(wl->wl_inohashcnt > 0);
			wl->wl_inohashcnt--;
		}
	}

	/*
	 * clean buffer list
	 */
	mutex_enter(&bufcache_lock);
	mutex_enter(&wl->wl_mtx);
	while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
		if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) {
			/*
			 * The buffer will be unlocked and
			 * removed from the transaction in brelse
			 */
			mutex_exit(&wl->wl_mtx);
			brelsel(bp, 0);
			mutex_enter(&wl->wl_mtx);
		}
	}
	mutex_exit(&wl->wl_mtx);
	mutex_exit(&bufcache_lock);

	/*
	 * Remove references to this wl from wl_entries, free any which
	 * no longer have buffers, others will be freed in wapbl_biodone
	 * when they no longer have any buffers.
	 */
	while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) {
		SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
		/* XXX should we be accumulating wl_error_count
		 * and increasing reclaimable bytes ? */
		we->we_wapbl = NULL;
		if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
			KASSERT(we->we_unsynced_bufbytes == 0);
#endif
			pool_put(&wapbl_entry_pool, we);
		}
	}

	/* Discard list of deallocs */
	wl->wl_dealloccnt = 0;
	/* XXX should we clear wl_reserved_bytes? */

	KASSERT(wl->wl_bufbytes == 0);
	KASSERT(wl->wl_bcount == 0);
	KASSERT(wl->wl_bufcount == 0);
	KASSERT(LIST_EMPTY(&wl->wl_bufs));
	KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
	KASSERT(wl->wl_inohashcnt == 0);

	rw_exit(&wl->wl_rwlock);
}

int
wapbl_stop(struct wapbl *wl, int force)
{
	int error;

	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n"));
	error = wapbl_flush(wl, 1);
	if (error) {
		if (force)
			wapbl_discard(wl);
		else
			return error;
	}

	/* Unlinked inodes persist after a flush */
	if (wl->wl_inohashcnt) {
		if (force) {
			wapbl_discard(wl);
		} else {
			return EBUSY;
		}
	}

	KASSERT(wl->wl_bufbytes == 0);
	KASSERT(wl->wl_bcount == 0);
	KASSERT(wl->wl_bufcount == 0);
	KASSERT(LIST_EMPTY(&wl->wl_bufs));
	KASSERT(wl->wl_dealloccnt == 0);
	KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
	KASSERT(wl->wl_inohashcnt == 0);

	wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_deallocblks,
	    sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim);
	wapbl_free(wl->wl_dealloclens,
	    sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim);
	wapbl_free(wl->wl_buffer, MAXPHYS);
	wapbl_inodetrk_free(wl);

	cv_destroy(&wl->wl_reclaimable_cv);
	mutex_destroy(&wl->wl_mtx);
	rw_destroy(&wl->wl_rwlock);
	wapbl_free(wl, sizeof(*wl));

	return 0;
}

static int
wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
{
	struct pstats *pstats = curlwp->l_proc->p_stats;
	struct buf *bp;
	int error;

	KASSERT((flags & ~(B_WRITE | B_READ)) == 0);
	KASSERT(devvp->v_type == VBLK);

	if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
		mutex_enter(devvp->v_interlock);
		devvp->v_numoutput++;
		mutex_exit(devvp->v_interlock);
		pstats->p_ru.ru_oublock++;
	} else {
		pstats->p_ru.ru_inblock++;
	}

	bp = getiobuf(devvp, true);
	bp->b_flags = flags;
	bp->b_cflags = BC_BUSY; /* silly & dubious */
	bp->b_dev = devvp->v_rdev;
	bp->b_data = data;
	bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
	bp->b_blkno = pbn;
	BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);

	WAPBL_PRINTF(WAPBL_PRINT_IO,
	    ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%"PRIx64"\n",
	    BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount,
	    bp->b_blkno, bp->b_dev));

	VOP_STRATEGY(devvp, bp);

	error = biowait(bp);
	putiobuf(bp);

	if (error) {
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_doio: %s %zu bytes at block %" PRId64
		    " on dev 0x%"PRIx64" failed with error %d\n",
		    (((flags & (B_WRITE | B_READ)) == B_WRITE) ?
		     "write" : "read"),
		    len, pbn, devvp->v_rdev, error));
	}

	return error;
}

int
wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

	return wapbl_doio(data, len, devvp, pbn, B_WRITE);
}

int
wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

	return wapbl_doio(data, len, devvp, pbn, B_READ);
}

/*
 * Flush buffered data if any.
 */
static int
wapbl_buffered_flush(struct wapbl *wl)
{
	int error;

	if (wl->wl_buffer_used == 0)
		return 0;

	error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used,
	    wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE);
	wl->wl_buffer_used = 0;

	return error;
}

/*
 * Write data to the log.
 * Try to coalesce writes and emit MAXPHYS-aligned blocks.
 */
static int
wapbl_buffered_write(void *data, size_t len, struct wapbl *wl, daddr_t pbn)
{
	int error;
	size_t resid;

	/*
	 * If not adjacent to the buffered data, flush first.  The disk
	 * block address is always valid for a non-empty buffer.
	 */
	if (wl->wl_buffer_used > 0 &&
	    pbn != wl->wl_buffer_dblk + btodb(wl->wl_buffer_used)) {
		error = wapbl_buffered_flush(wl);
		if (error)
			return error;
	}
	/*
	 * If this write goes to an empty buffer we have to
	 * save the disk block address first.
	 */
	if (wl->wl_buffer_used == 0)
		wl->wl_buffer_dblk = pbn;
	/*
	 * Remaining space so this buffer ends on a MAXPHYS boundary.
	 *
	 * Cannot become less than or equal to zero: if it could, the
	 * buffer would already have been flushed by the previous call.
	 */
	resid = MAXPHYS - dbtob(wl->wl_buffer_dblk % btodb(MAXPHYS)) -
	    wl->wl_buffer_used;
	KASSERT(resid > 0);
	KASSERT(dbtob(btodb(resid)) == resid);
	if (len >= resid) {
		memcpy(wl->wl_buffer + wl->wl_buffer_used, data, resid);
		wl->wl_buffer_used += resid;
		error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used,
		    wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE);
		data = (uint8_t *)data + resid;
		len -= resid;
		wl->wl_buffer_dblk = pbn + btodb(resid);
		wl->wl_buffer_used = 0;
		if (error)
			return error;
	}
	KASSERT(len < MAXPHYS);
	if (len > 0) {
		memcpy(wl->wl_buffer + wl->wl_buffer_used, data, len);
		wl->wl_buffer_used += len;
	}

	return 0;
}
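
/*
 * Coalescing example (hypothetical numbers, MAXPHYS = 64 KiB): four
 * consecutive 2 KiB wapbl_buffered_write() calls starting at a
 * MAXPHYS-aligned pbn just accumulate 8 KiB in wl_buffer; nothing hits
 * the disk until a non-adjacent pbn arrives, the 64 KiB boundary is
 * crossed, or wapbl_buffered_flush() is called explicitly.
 */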

/*
 * Off is the current byte offset into the log; the new offset for the
 * next write is returned in *offp.  Handles log wraparound.
 */
static int
wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp)
{
	size_t slen;
	off_t off = *offp;
	int error;
	daddr_t pbn;

	KDASSERT(((len >> wl->wl_log_dev_bshift) <<
	    wl->wl_log_dev_bshift) == len);

	if (off < wl->wl_circ_off)
		off = wl->wl_circ_off;
	slen = wl->wl_circ_off + wl->wl_circ_size - off;
	if (slen < len) {
		pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
#ifdef _KERNEL
		pbn = btodb(pbn << wl->wl_log_dev_bshift);
#endif
		error = wapbl_buffered_write(data, slen, wl, pbn);
		if (error)
			return error;
		data = (uint8_t *)data + slen;
		len -= slen;
		off = wl->wl_circ_off;
	}
	pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
#ifdef _KERNEL
	pbn = btodb(pbn << wl->wl_log_dev_bshift);
#endif
	error = wapbl_buffered_write(data, len, wl, pbn);
	if (error)
		return error;
	off += len;
	if (off >= wl->wl_circ_off + wl->wl_circ_size)
		off = wl->wl_circ_off;
	*offp = off;
	return 0;
}
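
/*
 * Wraparound example (hypothetical values): with wl_circ_off = 1024
 * and wl_circ_size = 8192 the data area is [1024, 9216).  A 1024-byte
 * write at off = 8704 has only slen = 9216 - 8704 = 512 bytes left
 * before the end, so it is split: 512 bytes at offset 8704, then the
 * remaining 512 bytes at wl_circ_off, and *offp comes back as 1536.
 */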

/****************************************************************/

int
wapbl_begin(struct wapbl *wl, const char *file, int line)
{
	int doflush;
	unsigned lockcount;
	uint32_t maxphys;

	KDASSERT(wl);

	/*
	 * XXX this needs to be made much more sophisticated.
	 * perhaps each wapbl_begin could reserve a specified
	 * number of buffers and bytes.
	 */
	mutex_enter(&wl->wl_mtx);
	lockcount = wl->wl_lock_count;
	maxphys = wl->wl_mount->mnt_maxphys;
	doflush = ((wl->wl_bufbytes + (lockcount * maxphys)) >
		   wl->wl_bufbytes_max / 2) ||
		  ((wl->wl_bufcount + (lockcount * 10)) >
		   wl->wl_bufcount_max / 2) ||
		  (wapbl_transaction_len(wl) > wl->wl_circ_size / 2) ||
		  (wl->wl_dealloccnt >= (wl->wl_dealloclim / 2));
	mutex_exit(&wl->wl_mtx);

	if (doflush) {
		WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		    ("force flush lockcnt=%d bufbytes=%zu "
		    "(max=%zu) bufcount=%zu (max=%zu) "
		    "dealloccnt %d (lim=%d)\n",
		    lockcount, wl->wl_bufbytes,
		    wl->wl_bufbytes_max, wl->wl_bufcount,
		    wl->wl_bufcount_max,
		    wl->wl_dealloccnt, wl->wl_dealloclim));
	}

	if (doflush) {
		int error = wapbl_flush(wl, 0);
		if (error)
			return error;
	}

	rw_enter(&wl->wl_rwlock, RW_READER);
	mutex_enter(&wl->wl_mtx);
	wl->wl_lock_count++;
	mutex_exit(&wl->wl_mtx);

#if defined(WAPBL_DEBUG_PRINT)
	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
	    ("wapbl_begin thread %d.%d with bufcount=%zu "
	    "bufbytes=%zu bcount=%zu at %s:%d\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount, file, line));
#endif

	return 0;
}
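
/*
 * Typical calling pattern, for illustration (file systems normally go
 * through a wrapper such as UFS_WAPBL_BEGIN()/UFS_WAPBL_END() rather
 * than calling these directly):
 *
 *	error = wapbl_begin(wl, __FILE__, __LINE__);
 *	if (error)
 *		return error;
 *	... dirty metadata buffers; each bdwrite() ends up in
 *	... wapbl_add_buf() and joins the current transaction
 *	wapbl_end(wl);
 */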

void
wapbl_end(struct wapbl *wl)
{

#if defined(WAPBL_DEBUG_PRINT)
	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
	    ("wapbl_end thread %d.%d with bufcount=%zu "
	    "bufbytes=%zu bcount=%zu\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount));
#endif

#ifdef DIAGNOSTIC
	size_t flushsize = wapbl_transaction_len(wl);
	if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
		/*
		 * XXX this could be handled more gracefully, perhaps place
		 * only a partial transaction in the log and allow the
		 * remaining to flush without the protection of the journal.
		 */
		panic("wapbl_end: current transaction too big to flush\n");
	}
#endif

	mutex_enter(&wl->wl_mtx);
	KASSERT(wl->wl_lock_count > 0);
	wl->wl_lock_count--;
	mutex_exit(&wl->wl_mtx);

	rw_exit(&wl->wl_rwlock);
}

void
wapbl_add_buf(struct wapbl *wl, struct buf * bp)
{

	KASSERT(bp->b_cflags & BC_BUSY);
	KASSERT(bp->b_vp);

	wapbl_jlock_assert(wl);

#if 0
	/*
	 * XXX this might be an issue for swapfiles.
	 * see uvm_swap.c:1702
	 *
	 * XXX2 why require it then? leap of semantics?
	 */
	KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
#endif

	mutex_enter(&wl->wl_mtx);
	if (bp->b_flags & B_LOCKED) {
		LIST_REMOVE(bp, b_wapbllist);
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER2,
		    ("wapbl_add_buf thread %d.%d re-adding buf %p "
		    "with %d bytes %d bcount\n",
		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
		    bp->b_bcount));
	} else {
		/* unlocked but dirty buffers shouldn't exist */
		KASSERT(!(bp->b_oflags & BO_DELWRI));
		wl->wl_bufbytes += bp->b_bufsize;
		wl->wl_bcount += bp->b_bcount;
		wl->wl_bufcount++;
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
		    ("wapbl_add_buf thread %d.%d adding buf %p "
		    "with %d bytes %d bcount\n",
		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
		    bp->b_bcount));
	}
	LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist);
	mutex_exit(&wl->wl_mtx);

	bp->b_flags |= B_LOCKED;
}

static void
wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp)
{

	KASSERT(mutex_owned(&wl->wl_mtx));
	KASSERT(bp->b_cflags & BC_BUSY);
	wapbl_jlock_assert(wl);

#if 0
	/*
	 * XXX this might be an issue for swapfiles.
	 * see uvm_swap.c:1725
	 *
	 * XXXdeux: see above
	 */
	KASSERT((bp->b_flags & BC_NOCACHE) == 0);
#endif
	KASSERT(bp->b_flags & B_LOCKED);

	WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
	    ("wapbl_remove_buf thread %d.%d removing buf %p with "
	    "%d bytes %d bcount\n",
	    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount));

	KASSERT(wl->wl_bufbytes >= bp->b_bufsize);
	wl->wl_bufbytes -= bp->b_bufsize;
	KASSERT(wl->wl_bcount >= bp->b_bcount);
	wl->wl_bcount -= bp->b_bcount;
	KASSERT(wl->wl_bufcount > 0);
	wl->wl_bufcount--;
	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
	LIST_REMOVE(bp, b_wapbllist);

	bp->b_flags &= ~B_LOCKED;
}

/* called from brelsel() in vfs_bio among other places */
void
wapbl_remove_buf(struct wapbl * wl, struct buf *bp)
{

	mutex_enter(&wl->wl_mtx);
	wapbl_remove_buf_locked(wl, bp);
	mutex_exit(&wl->wl_mtx);
}

void
wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt)
{

	KASSERT(bp->b_cflags & BC_BUSY);

	/*
	 * XXX: why does this depend on B_LOCKED?  otherwise the buf
	 * is not for a transaction?  if so, why is this called in the
	 * first place?
	 */
	if (bp->b_flags & B_LOCKED) {
		mutex_enter(&wl->wl_mtx);
		wl->wl_bufbytes += bp->b_bufsize - oldsz;
		wl->wl_bcount += bp->b_bcount - oldcnt;
		mutex_exit(&wl->wl_mtx);
	}
}

#endif /* _KERNEL */

/****************************************************************/
/* Some utility inlines */

static inline size_t
wapbl_space_used(size_t avail, off_t head, off_t tail)
{

	if (tail == 0) {
		KASSERT(head == 0);
		return 0;
	}
	return ((head + (avail - 1) - tail) % avail) + 1;
}
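
/*
 * For example (hypothetical values, avail = 100): head = 30, tail = 10
 * gives ((30 + 99 - 10) % 100) + 1 = 20 bytes used; the wrapped case
 * head = 10, tail = 30 gives ((10 + 99 - 30) % 100) + 1 = 80; and
 * head == tail != 0 gives the full 100, matching the "log is full"
 * convention described with struct wapbl above.
 */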

#ifdef _KERNEL
/* This is used to advance the pointer at old to new value at old+delta */
static inline off_t
wapbl_advance(size_t size, size_t off, off_t old, size_t delta)
{
	off_t new;

	/* Define acceptable ranges for inputs. */
	KASSERT(delta <= (size_t)size);
	KASSERT((old == 0) || ((size_t)old >= off));
	KASSERT(old < (off_t)(size + off));

	if ((old == 0) && (delta != 0))
		new = off + delta;
	else if ((old + delta) < (size + off))
		new = old + delta;
	else
		new = (old + delta) - size;

	/* Note some interesting axioms */
	KASSERT((delta != 0) || (new == old));
	KASSERT((delta == 0) || (new != 0));
	KASSERT((delta != (size)) || (new == old));

	/* Define acceptable ranges for output. */
	KASSERT((new == 0) || ((size_t)new >= off));
	KASSERT((size_t)new < (size + off));
	return new;
}
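
/*
 * Example (hypothetical values, size = 100, off = 10): advancing
 * old = 0 by delta = 5 yields 15 (offset 0 is the "empty" marker, so
 * counting starts at off); advancing old = 105 by delta = 10 yields
 * 115 - 100 = 15, wrapping within the valid range [10, 110).
 */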

static inline size_t
wapbl_space_free(size_t avail, off_t head, off_t tail)
{

	return avail - wapbl_space_used(avail, head, tail);
}

static inline void
wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp,
		   off_t *tailp)
{
	off_t head = *headp;
	off_t tail = *tailp;

	KASSERT(delta <= wapbl_space_free(size, head, tail));
	head = wapbl_advance(size, off, head, delta);
	if ((tail == 0) && (head != 0))
		tail = off;
	*headp = head;
	*tailp = tail;
}

static inline void
wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp,
		   off_t *tailp)
{
	off_t head = *headp;
	off_t tail = *tailp;

	KASSERT(delta <= wapbl_space_used(size, head, tail));
	tail = wapbl_advance(size, off, tail, delta);
	if (head == tail) {
		head = tail = 0;
	}
	*headp = head;
	*tailp = tail;
}

/****************************************************************/

/*
 * Remove transactions whose buffers are completely flushed to disk.
 * Will block until at least minfree space is available.
 * Only intended to be called from inside wapbl_flush and therefore
 * does not protect against commit races with itself or with flush.
 */
static int
wapbl_truncate(struct wapbl *wl, size_t minfree, int waitonly)
{
	size_t delta;
	size_t avail;
	off_t head;
	off_t tail;
	int error = 0;

	KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes));
	KASSERT(rw_write_held(&wl->wl_rwlock));

	mutex_enter(&wl->wl_mtx);

	/*
	 * First check to see if we have to do a commit
	 * at all.
	 */
	avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail);
	if (minfree < avail) {
		mutex_exit(&wl->wl_mtx);
		return 0;
	}
	minfree -= avail;
	while ((wl->wl_error_count == 0) &&
	    (wl->wl_reclaimable_bytes < minfree)) {
		WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
		    ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd "
		    "minfree=%zd\n",
		    &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes,
		    minfree));

		cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx);
	}
	if (wl->wl_reclaimable_bytes < minfree) {
		KASSERT(wl->wl_error_count);
		/* XXX maybe get actual error from buffer instead someday? */
		error = EIO;
	}
	head = wl->wl_head;
	tail = wl->wl_tail;
	delta = wl->wl_reclaimable_bytes;

	/* If all of the entries are flushed, then be sure to keep
	 * the reserved bytes reserved.  Watch out for discarded transactions,
	 * which could leave more bytes reserved than are reclaimable.
	 */
	if (SIMPLEQ_EMPTY(&wl->wl_entries) &&
	    (delta >= wl->wl_reserved_bytes)) {
		delta -= wl->wl_reserved_bytes;
	}
	wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head,
	    &tail);
	KDASSERT(wl->wl_reserved_bytes <=
	    wapbl_space_used(wl->wl_circ_size, head, tail));
	mutex_exit(&wl->wl_mtx);

	if (error)
		return error;

	if (waitonly)
		return 0;

	/*
	 * This is where head, tail and delta are unprotected
	 * from races against itself or flush.  This is ok since
	 * we only call this routine from inside flush itself.
	 *
	 * XXX: how can it race against itself when accessed only
	 * from behind the write-locked rwlock?
	 */
	error = wapbl_write_commit(wl, head, tail);
	if (error)
		return error;

	wl->wl_head = head;
	wl->wl_tail = tail;

	mutex_enter(&wl->wl_mtx);
	KASSERT(wl->wl_reclaimable_bytes >= delta);
	wl->wl_reclaimable_bytes -= delta;
	mutex_exit(&wl->wl_mtx);
	WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
	    ("wapbl_truncate thread %d.%d truncating %zu bytes\n",
	    curproc->p_pid, curlwp->l_lid, delta));

	return 0;
}

/****************************************************************/

void
wapbl_biodone(struct buf *bp)
{
	struct wapbl_entry *we = bp->b_private;
	struct wapbl *wl = we->we_wapbl;
#ifdef WAPBL_DEBUG_BUFBYTES
	const int bufsize = bp->b_bufsize;
#endif

	/*
	 * Handle possible flushing of buffers after log has been
	 * decommissioned.
	 */
	if (!wl) {
		KASSERT(we->we_bufcount > 0);
		we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
		KASSERT(we->we_unsynced_bufbytes >= bufsize);
		we->we_unsynced_bufbytes -= bufsize;
#endif

		if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
			KASSERT(we->we_unsynced_bufbytes == 0);
#endif
			pool_put(&wapbl_entry_pool, we);
		}

		brelse(bp, 0);
		return;
	}

#ifdef ohbother
	KDASSERT(bp->b_oflags & BO_DONE);
	KDASSERT(!(bp->b_oflags & BO_DELWRI));
	KDASSERT(bp->b_flags & B_ASYNC);
	KDASSERT(bp->b_cflags & BC_BUSY);
	KDASSERT(!(bp->b_flags & B_LOCKED));
	KDASSERT(!(bp->b_flags & B_READ));
	KDASSERT(!(bp->b_cflags & BC_INVAL));
	KDASSERT(!(bp->b_cflags & BC_NOCACHE));
#endif

	if (bp->b_error) {
#ifdef notyet /* Can't currently handle possible dirty buffer reuse */
		/*
		 * XXXpooka: interfaces not fully updated
		 * Note: this was not enabled in the original patch
		 * against netbsd4 either.  I don't know if comment
		 * above is true or not.
		 */

		/*
		 * If an error occurs, report the error and leave the
		 * buffer as a delayed write on the LRU queue.
		 * Restarting the write would likely result in
		 * an error spinloop, so let it be done harmlessly
		 * by the syncer.
		 */
		bp->b_flags &= ~(B_DONE);
		simple_unlock(&bp->b_interlock);

		if (we->we_error == 0) {
			mutex_enter(&wl->wl_mtx);
			wl->wl_error_count++;
			mutex_exit(&wl->wl_mtx);
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
		we->we_error = bp->b_error;
		bp->b_error = 0;
		brelse(bp);
		return;
#else
		/* For now, just mark the log permanently errored out */

		mutex_enter(&wl->wl_mtx);
		if (wl->wl_error_count == 0) {
			wl->wl_error_count++;
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
		mutex_exit(&wl->wl_mtx);
#endif
	}

	/*
	 * Release the buffer here.  wapbl_flush() may wait for the
	 * log to become empty and we better unbusy the buffer before
	 * wapbl_flush() returns.
	 */
	brelse(bp, 0);

	mutex_enter(&wl->wl_mtx);

	KASSERT(we->we_bufcount > 0);
	we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
	KASSERT(we->we_unsynced_bufbytes >= bufsize);
	we->we_unsynced_bufbytes -= bufsize;
	KASSERT(wl->wl_unsynced_bufbytes >= bufsize);
	wl->wl_unsynced_bufbytes -= bufsize;
#endif

	/*
	 * If the current transaction can be reclaimed, start
	 * at the beginning and reclaim any consecutive reclaimable
	 * transactions.  If we successfully reclaim anything,
	 * then wakeup anyone waiting for the reclaim.
	 */
	if (we->we_bufcount == 0) {
		size_t delta = 0;
		int errcnt = 0;
#ifdef WAPBL_DEBUG_BUFBYTES
		KDASSERT(we->we_unsynced_bufbytes == 0);
#endif
		/*
		 * clear any posted error, since the buffer it came from
		 * has been successfully flushed by now
		 */
		while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) &&
		    (we->we_bufcount == 0)) {
			delta += we->we_reclaimable_bytes;
			if (we->we_error)
				errcnt++;
			SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
			pool_put(&wapbl_entry_pool, we);
		}

		if (delta) {
			wl->wl_reclaimable_bytes += delta;
			KASSERT(wl->wl_error_count >= errcnt);
			wl->wl_error_count -= errcnt;
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
	}

	mutex_exit(&wl->wl_mtx);
}


/*
 * Write transactions to disk + start I/O for contents
 */
int
wapbl_flush(struct wapbl *wl, int waitfor)
{
	struct buf *bp;
	struct wapbl_entry *we;
	off_t off;
	off_t head;
	off_t tail;
	size_t delta = 0;
	size_t flushsize;
	size_t reserved;
	int error = 0;

	/*
	 * Do a quick check to see if a full flush can be skipped.
	 * This assumes that the flush callback does not need to be called
	 * unless there are other outstanding bufs.
	 */
	if (!waitfor) {
		size_t nbufs;
		mutex_enter(&wl->wl_mtx);	/* XXX need mutex here to
						   protect the KASSERTS */
		nbufs = wl->wl_bufcount;
		KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
		KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
		mutex_exit(&wl->wl_mtx);
		if (nbufs == 0)
			return 0;
	}

	/*
	 * XXX we may consider using LK_UPGRADE here
	 * if we want to call flush from inside a transaction
	 */
	rw_enter(&wl->wl_rwlock, RW_WRITER);
	wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
	    wl->wl_dealloccnt);

	/*
	 * Now that we are fully locked and flushed,
	 * do another check for nothing to do.
	 */
	if (wl->wl_bufcount == 0) {
		goto out;
	}

#if 0
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush thread %d.%d flushing entries with "
	    "bufcount=%zu bufbytes=%zu\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes));
#endif

	/* Calculate amount of space needed to flush */
	flushsize = wapbl_transaction_len(wl);
	if (wapbl_verbose_commit) {
		struct timespec ts;
		getnanotime(&ts);
		printf("%s: %lld.%09ld this transaction = %zu bytes\n",
		    __func__, (long long)ts.tv_sec,
		    (long)ts.tv_nsec, flushsize);
	}

	if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
		/*
		 * XXX this could be handled more gracefully, perhaps place
		 * only a partial transaction in the log and allow the
		 * remaining to flush without the protection of the journal.
		 */
		panic("wapbl_flush: current transaction too big to flush\n");
	}

	error = wapbl_truncate(wl, flushsize, 0);
	if (error)
		goto out2;

	off = wl->wl_head;
	KASSERT((off == 0) || ((off >= wl->wl_circ_off) &&
	    (off < wl->wl_circ_off + wl->wl_circ_size)));
	error = wapbl_write_blocks(wl, &off);
	if (error)
		goto out2;
	error = wapbl_write_revocations(wl, &off);
	if (error)
		goto out2;
	error = wapbl_write_inodes(wl, &off);
	if (error)
		goto out2;

	reserved = 0;
	if (wl->wl_inohashcnt)
		reserved = wapbl_transaction_inodes_len(wl);

	head = wl->wl_head;
	tail = wl->wl_tail;

	wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize,
	    &head, &tail);
#ifdef WAPBL_DEBUG
	if (head != off) {
		panic("lost head! head=%"PRIdMAX" tail=%" PRIdMAX
		      " off=%"PRIdMAX" flush=%zu\n",
		      (intmax_t)head, (intmax_t)tail, (intmax_t)off,
		      flushsize);
	}
#else
	KASSERT(head == off);
#endif

	/* Opportunistically move the tail forward if we can */
	if (!wapbl_lazy_truncate) {
		mutex_enter(&wl->wl_mtx);
		delta = wl->wl_reclaimable_bytes;
		mutex_exit(&wl->wl_mtx);
		wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta,
		    &head, &tail);
	}

	error = wapbl_write_commit(wl, head, tail);
	if (error)
		goto out2;

	we = pool_get(&wapbl_entry_pool, PR_WAITOK);

#ifdef WAPBL_DEBUG_BUFBYTES
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
	     " unsynced=%zu"
	     "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
	     "inodes=%d\n",
	     curproc->p_pid, curlwp->l_lid, flushsize, delta,
	     wapbl_space_used(wl->wl_circ_size, head, tail),
	     wl->wl_unsynced_bufbytes, wl->wl_bufcount,
	     wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt,
	     wl->wl_inohashcnt));
#else
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
	     "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
	     "inodes=%d\n",
	     curproc->p_pid, curlwp->l_lid, flushsize, delta,
	     wapbl_space_used(wl->wl_circ_size, head, tail),
	     wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
	     wl->wl_dealloccnt, wl->wl_inohashcnt));
#endif


	mutex_enter(&bufcache_lock);
	mutex_enter(&wl->wl_mtx);

	wl->wl_reserved_bytes = reserved;
	wl->wl_head = head;
	wl->wl_tail = tail;
	KASSERT(wl->wl_reclaimable_bytes >= delta);
	wl->wl_reclaimable_bytes -= delta;
	wl->wl_dealloccnt = 0;
#ifdef WAPBL_DEBUG_BUFBYTES
	wl->wl_unsynced_bufbytes += wl->wl_bufbytes;
#endif

	we->we_wapbl = wl;
	we->we_bufcount = wl->wl_bufcount;
#ifdef WAPBL_DEBUG_BUFBYTES
	we->we_unsynced_bufbytes = wl->wl_bufbytes;
#endif
	we->we_reclaimable_bytes = flushsize;
	we->we_error = 0;
	SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries);

	/*
	 * This flushes bufs in the reverse order from which they were
	 * queued.  It shouldn't matter, but if we care we could use a
	 * TAILQ instead.  XXX Note they will get put on the lru queue
	 * when they flush, so we might actually want to change this to
	 * preserve order.
	 */
	while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
		if (bbusy(bp, 0, 0, &wl->wl_mtx)) {
			continue;
		}
		bp->b_iodone = wapbl_biodone;
		bp->b_private = we;
		bremfree(bp);
		wapbl_remove_buf_locked(wl, bp);
		mutex_exit(&wl->wl_mtx);
		mutex_exit(&bufcache_lock);
		bawrite(bp);
		mutex_enter(&bufcache_lock);
		mutex_enter(&wl->wl_mtx);
	}
	mutex_exit(&wl->wl_mtx);
	mutex_exit(&bufcache_lock);

#if 0
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush thread %d.%d done flushing entries...\n",
	    curproc->p_pid, curlwp->l_lid));
#endif

 out:

	/*
	 * If the waitfor flag is set, don't return until everything is
	 * fully flushed and the on disk log is empty.
	 */
	if (waitfor) {
		error = wapbl_truncate(wl, wl->wl_circ_size -
		    wl->wl_reserved_bytes, wapbl_lazy_truncate);
	}

 out2:
	if (error) {
		wl->wl_flush_abort(wl->wl_mount, wl->wl_deallocblks,
		    wl->wl_dealloclens, wl->wl_dealloccnt);
	}

#ifdef WAPBL_DEBUG_PRINT
	if (error) {
		pid_t pid = -1;
		lwpid_t lid = -1;
		if (curproc)
			pid = curproc->p_pid;
		if (curlwp)
			lid = curlwp->l_lid;
		mutex_enter(&wl->wl_mtx);
#ifdef WAPBL_DEBUG_BUFBYTES
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_flush: thread %d.%d aborted flush: "
		    "error = %d\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %d, reclaimable=%zu reserved=%zu "
		    "unsynced=%zu\n",
		    pid, lid, error, wl->wl_bufcount,
		    wl->wl_bufbytes, wl->wl_bcount,
		    wl->wl_dealloccnt, wl->wl_inohashcnt,
		    wl->wl_error_count, wl->wl_reclaimable_bytes,
		    wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_ERROR,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d, unsynced = %zu\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error, we->we_unsynced_bufbytes));
		}
#else
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_flush: thread %d.%d aborted flush: "
		    "error = %d\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %d, reclaimable=%zu reserved=%zu\n",
		    pid, lid, error, wl->wl_bufcount,
		    wl->wl_bufbytes, wl->wl_bcount,
		    wl->wl_dealloccnt, wl->wl_inohashcnt,
		    wl->wl_error_count, wl->wl_reclaimable_bytes,
		    wl->wl_reserved_bytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_ERROR,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d\n", we->we_bufcount,
			    we->we_reclaimable_bytes, we->we_error));
		}
#endif
		mutex_exit(&wl->wl_mtx);
	}
#endif

	rw_exit(&wl->wl_rwlock);
	return error;
}

/****************************************************************/

void
wapbl_jlock_assert(struct wapbl *wl)
{

	KASSERT(rw_lock_held(&wl->wl_rwlock));
}

void
wapbl_junlock_assert(struct wapbl *wl)
{

	KASSERT(!rw_write_held(&wl->wl_rwlock));
}

/****************************************************************/

/* locks missing */
void
wapbl_print(struct wapbl *wl,
	int full,
	void (*pr)(const char *, ...))
{
	struct buf *bp;
	struct wapbl_entry *we;
	(*pr)("wapbl %p", wl);
	(*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n",
	      wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn);
	(*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n",
	      wl->wl_circ_size, wl->wl_circ_off,
	      (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail);
	(*pr)("log_dev_bshift = %d, fs_dev_bshift = %d\n",
	      wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift);
#ifdef WAPBL_DEBUG_BUFBYTES
	(*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
	      "reserved = %zu errcnt = %d unsynced = %zu\n",
	      wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
	      wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
	      wl->wl_error_count, wl->wl_unsynced_bufbytes);
#else
	(*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
	      "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes,
	      wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
	      wl->wl_error_count);
#endif
	(*pr)("\tdealloccnt = %d, dealloclim = %d\n",
	      wl->wl_dealloccnt, wl->wl_dealloclim);
	(*pr)("\tinohashcnt = %d, inohashmask = 0x%08lx\n",
	      wl->wl_inohashcnt, wl->wl_inohashmask);
	(*pr)("entries:\n");
	SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
#ifdef WAPBL_DEBUG_BUFBYTES
		(*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, "
		      "unsynced = %zu\n",
		      we->we_bufcount, we->we_reclaimable_bytes,
		      we->we_error, we->we_unsynced_bufbytes);
#else
		(*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n",
		      we->we_bufcount, we->we_reclaimable_bytes, we->we_error);
#endif
	}
	if (full) {
		int cnt = 0;
		(*pr)("bufs =");
		LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) {
			if (!LIST_NEXT(bp, b_wapbllist)) {
				(*pr)(" %p", bp);
			} else if ((++cnt % 6) == 0) {
				(*pr)(" %p,\n\t", bp);
			} else {
				(*pr)(" %p,", bp);
			}
		}
		(*pr)("\n");

		(*pr)("dealloced blks = ");
		{
			int i;
			cnt = 0;
			for (i = 0; i < wl->wl_dealloccnt; i++) {
				(*pr)(" %"PRId64":%d,",
				      wl->wl_deallocblks[i],
				      wl->wl_dealloclens[i]);
				if ((++cnt % 4) == 0) {
					(*pr)("\n\t");
				}
			}
		}
		(*pr)("\n");

		(*pr)("registered inodes = ");
		{
			int i;
			cnt = 0;
			for (i = 0; i <= wl->wl_inohashmask; i++) {
				struct wapbl_ino_head *wih;
				struct wapbl_ino *wi;

				wih = &wl->wl_inohash[i];
				LIST_FOREACH(wi, wih, wi_hash) {
					if (wi->wi_ino == 0)
						continue;
					(*pr)(" %"PRIu64"/0%06"PRIo32",",
					    wi->wi_ino, wi->wi_mode);
					if ((++cnt % 4) == 0) {
						(*pr)("\n\t");
					}
				}
			}
			(*pr)("\n");
		}
	}
}

#if defined(WAPBL_DEBUG) || defined(DDB)
void
wapbl_dump(struct wapbl *wl)
{
#if defined(WAPBL_DEBUG)
	if (!wl)
		wl = wapbl_debug_wl;
#endif
	if (!wl)
		return;
	wapbl_print(wl, 1, printf);
}
#endif

/****************************************************************/

void
wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len)
{

	wapbl_jlock_assert(wl);

	mutex_enter(&wl->wl_mtx);
	/* XXX should eventually instead tie this into resource estimation */
	/*
	 * XXX this panic needs locking/mutex analysis and the
	 * ability to cope with the failure.
	 */
	/* XXX this XXX doesn't have enough XXX */
	if (__predict_false(wl->wl_dealloccnt >= wl->wl_dealloclim))
		panic("wapbl_register_deallocation: out of resources");

	wl->wl_deallocblks[wl->wl_dealloccnt] = blk;
	wl->wl_dealloclens[wl->wl_dealloccnt] = len;
	wl->wl_dealloccnt++;
	WAPBL_PRINTF(WAPBL_PRINT_ALLOC,
	    ("wapbl_register_deallocation: blk=%"PRId64" len=%d\n", blk, len));
	mutex_exit(&wl->wl_mtx);
}

1865 /****************************************************************/
1866
1867 static void
1868 wapbl_inodetrk_init(struct wapbl *wl, u_int size)
1869 {
1870
1871 wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask);
1872 if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) {
1873 pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0,
1874 "wapblinopl", &pool_allocator_nointr, IPL_NONE);
1875 }
1876 }
1877
1878 static void
1879 wapbl_inodetrk_free(struct wapbl *wl)
1880 {
1881
1882 /* XXX this KASSERT needs locking/mutex analysis */
1883 KASSERT(wl->wl_inohashcnt == 0);
1884 hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask);
1885 if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) {
1886 pool_destroy(&wapbl_ino_pool);
1887 }
1888 }
1889
1890 static struct wapbl_ino *
1891 wapbl_inodetrk_get(struct wapbl *wl, ino_t ino)
1892 {
1893 struct wapbl_ino_head *wih;
1894 struct wapbl_ino *wi;
1895
1896 KASSERT(mutex_owned(&wl->wl_mtx));
1897
1898 wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
1899 LIST_FOREACH(wi, wih, wi_hash) {
1900 if (ino == wi->wi_ino)
1901 return wi;
1902 }
1903 return 0;
1904 }
1905
1906 void
1907 wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode)
1908 {
1909 struct wapbl_ino_head *wih;
1910 struct wapbl_ino *wi;
1911
1912 wi = pool_get(&wapbl_ino_pool, PR_WAITOK);
1913
1914 mutex_enter(&wl->wl_mtx);
1915 if (wapbl_inodetrk_get(wl, ino) == NULL) {
1916 wi->wi_ino = ino;
1917 wi->wi_mode = mode;
1918 wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
1919 LIST_INSERT_HEAD(wih, wi, wi_hash);
1920 wl->wl_inohashcnt++;
1921 WAPBL_PRINTF(WAPBL_PRINT_INODE,
1922 ("wapbl_register_inode: ino=%"PRId64"\n", ino));
1923 mutex_exit(&wl->wl_mtx);
1924 } else {
1925 mutex_exit(&wl->wl_mtx);
1926 pool_put(&wapbl_ino_pool, wi);
1927 }
1928 }
1929
1930 void
1931 wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode)
1932 {
1933 struct wapbl_ino *wi;
1934
1935 mutex_enter(&wl->wl_mtx);
1936 wi = wapbl_inodetrk_get(wl, ino);
1937 if (wi) {
1938 WAPBL_PRINTF(WAPBL_PRINT_INODE,
1939 ("wapbl_unregister_inode: ino=%"PRId64"\n", ino));
1940 KASSERT(wl->wl_inohashcnt > 0);
1941 wl->wl_inohashcnt--;
1942 LIST_REMOVE(wi, wi_hash);
1943 mutex_exit(&wl->wl_mtx);
1944
1945 pool_put(&wapbl_ino_pool, wi);
1946 } else {
1947 mutex_exit(&wl->wl_mtx);
1948 }
1949 }
1950
1951 /****************************************************************/
1952
1953 static inline size_t
1954 wapbl_transaction_inodes_len(struct wapbl *wl)
1955 {
1956 int blocklen = 1<<wl->wl_log_dev_bshift;
1957 int iph;
1958
1959 /* Calculate the number of inodes described in an inodelist header */
1960 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
1961 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
1962
1963 KASSERT(iph > 0);
1964
1965 return MAX(1, howmany(wl->wl_inohashcnt, iph)) * blocklen;
1966 }
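/*
 * Worked example with illustrative figures: for a 512-byte log device
 * block and, say, a 16-byte inodelist header followed by 16-byte
 * entries, iph = (512 - 16) / 16 = 31, so 40 registered inodes cost
 * howmany(40, 31) = 2 log blocks = 1024 bytes.
 */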
1967
1968
1969 /* Calculate amount of space a transaction will take on disk */
1970 static size_t
1971 wapbl_transaction_len(struct wapbl *wl)
1972 {
1973 int blocklen = 1<<wl->wl_log_dev_bshift;
1974 size_t len;
1975 int bph;
1976
1977 /* Calculate number of blocks described in a blocklist header */
1978 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
1979 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
1980
1981 KASSERT(bph > 0);
1982
1983 len = wl->wl_bcount;
1984 len += howmany(wl->wl_bufcount, bph) * blocklen;
1985 len += howmany(wl->wl_dealloccnt, bph) * blocklen;
1986 len += wapbl_transaction_inodes_len(wl);
1987
1988 return len;
1989 }
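/*
 * Worked example with illustrative figures: with blocklen = 512 and,
 * say, bph = 31, a transaction carrying 10 buffers totalling
 * wl_bcount = 40960 bytes plus 3 deallocations needs
 * 40960 (buffer data)
 * + howmany(10, 31) * 512 = 512 (blocklist headers)
 * + howmany(3, 31) * 512 = 512 (revocation headers)
 * + the inodelist length computed above.
 */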
1990
1991 /*
1992 * wapbl_cache_sync: issue DIOCCACHESYNC
1993 */
1994 static int
1995 wapbl_cache_sync(struct wapbl *wl, const char *msg)
1996 {
1997 const bool verbose = wapbl_verbose_commit >= 2;
1998 struct bintime start_time;
1999 int force = 1;
2000 int error;
2001
2002 if (!wapbl_flush_disk_cache) {
2003 return 0;
2004 }
2005 if (verbose) {
2006 bintime(&start_time);
2007 }
2008 error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force,
2009 FWRITE, FSCRED);
2010 if (error) {
2011 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
2012 ("wapbl_cache_sync: DIOCCACHESYNC on dev 0x%x "
2013 "returned %d\n", wl->wl_devvp->v_rdev, error));
2014 }
2015 if (verbose) {
2016 struct bintime d;
2017 struct timespec ts;
2018
2019 bintime(&d);
2020 bintime_sub(&d, &start_time);
2021 bintime2timespec(&d, &ts);
2022 printf("wapbl_cache_sync: %s: dev 0x%jx %ju.%09lu\n",
2023 msg, (uintmax_t)wl->wl_devvp->v_rdev,
2024 (uintmax_t)ts.tv_sec, ts.tv_nsec);
2025 }
2026 return error;
2027 }
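/*
 * Both knobs consulted above are intended as runtime tunables;
 * assuming the usual sysctl names set up elsewhere in this file,
 * disabling the flush and timing commits would look like:
 *
 *	sysctl -w vfs.wapbl.flush_disk_cache=0
 *	sysctl -w vfs.wapbl.verbose_commit=2
 */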
2028
2029 /*
2030 * Perform commit operation
2031 *
2032 * Note that incrementing the generation number must be
2033 * protected against racing with other invocations of
2034 * wapbl_write_commit. That is safe here because this
2035 * routine is only invoked from wapbl_flush.
2036 */
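/*
 * On-disk layout sketch: two commit header slots at the start of the
 * log alternate by generation parity, so a torn write of one header
 * always leaves the other, older header intact:
 *
 *	log block 0:		commit header, even generations
 *	log block 1:		commit header, odd generations
 *	wc_circ_off onward:	circular record area
 */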
2037 static int
2038 wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail)
2039 {
2040 struct wapbl_wc_header *wc = wl->wl_wc_header;
2041 struct timespec ts;
2042 int error;
2043 daddr_t pbn;
2044
2045 error = wapbl_buffered_flush(wl);
2046 if (error)
2047 return error;
2048 /*
2049 * flush disk cache to ensure that blocks we've written are actually
2050 * written to the stable storage before the commit header.
2051 *
2052 * XXX A checksum should be calculated here; for now the cache flush below stands in for it.
2053 */
2054 wapbl_cache_sync(wl, "1");
2055
2056 wc->wc_head = head;
2057 wc->wc_tail = tail;
2058 wc->wc_checksum = 0;
2059 wc->wc_version = 1;
2060 getnanotime(&ts);
2061 wc->wc_time = ts.tv_sec;
2062 wc->wc_timensec = ts.tv_nsec;
2063
2064 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2065 ("wapbl_write_commit: head = %"PRIdMAX "tail = %"PRIdMAX"\n",
2066 (intmax_t)head, (intmax_t)tail));
2067
2068 /*
2069 * write the commit header.
2070 *
2071 * XXX if the generation number is about to roll over, first zero
2072 * out the second commit header before trying to write both headers.
2073 */
2074
2075 pbn = wl->wl_logpbn + (wc->wc_generation % 2);
2076 #ifdef _KERNEL
2077 pbn = btodb(pbn << wc->wc_log_dev_bshift);
2078 #endif
2079 error = wapbl_buffered_write(wc, wc->wc_len, wl, pbn);
2080 if (error)
2081 return error;
2082 error = wapbl_buffered_flush(wl);
2083 if (error)
2084 return error;
2085
2086 /*
2087 * flush disk cache to ensure that the commit header is actually
2088 * written before meta data blocks.
2089 */
2090 wapbl_cache_sync(wl, "2");
2091
2092 /*
2093 * If the generation number was zero, write it out a second time.
2094 * This handles initialization and generation number rollover.
2095 */
2096 if (wc->wc_generation++ == 0) {
2097 error = wapbl_write_commit(wl, head, tail);
2098 /*
2099 * This panic could be removed if we did the zeroing
2100 * mentioned above and were certain to roll back the
2101 * generation number on failure.
2102 */
2103 if (error)
2104 panic("wapbl_write_commit: error writing duplicate "
2105 "log header: %d\n", error);
2106 }
2107 return 0;
2108 }
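/*
 * Note on the pbn arithmetic above: wl_logpbn is in log-device blocks,
 * so it is scaled to bytes with << wc_log_dev_bshift and then to
 * DEV_BSIZE units with btodb(). For example, with a 2048-byte log
 * block (bshift 11) and 512-byte sectors, header slot 1 lands at
 * btodb((logpbn + 1) << 11) = (logpbn + 1) * 4 sectors.
 */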
2109
2110 /* Write the transaction's buffers to the log; the new log offset is returned via offp */
2111 static int
2112 wapbl_write_blocks(struct wapbl *wl, off_t *offp)
2113 {
2114 struct wapbl_wc_blocklist *wc =
2115 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
2116 int blocklen = 1<<wl->wl_log_dev_bshift;
2117 int bph;
2118 struct buf *bp;
2119 off_t off = *offp;
2120 int error;
2121 size_t padding;
2122
2123 KASSERT(rw_write_held(&wl->wl_rwlock));
2124
2125 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
2126 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
2127
2128 bp = LIST_FIRST(&wl->wl_bufs);
2129
2130 while (bp) {
2131 int cnt;
2132 struct buf *obp = bp;
2133
2134 KASSERT(bp->b_flags & B_LOCKED);
2135
2136 wc->wc_type = WAPBL_WC_BLOCKS;
2137 wc->wc_len = blocklen;
2138 wc->wc_blkcount = 0;
2139 while (bp && (wc->wc_blkcount < bph)) {
2140 /*
2141 * Make sure all the physical block numbers are up to
2142 * date. If this is not always true on a given
2143 * filesystem, then VOP_BMAP must be called. We
2144 * could call VOP_BMAP here, or else in the filesystem
2145 * specific flush callback, although neither of those
2146 * solutions allow us to take the vnode lock. If a
2147 * filesystem requires that we must take the vnode lock
2148 * to call VOP_BMAP, then we can probably do it in
2149 * bwrite when the vnode lock should already be held
2150 * by the invoking code.
2151 */
2152 KASSERT((bp->b_vp->v_type == VBLK) ||
2153 (bp->b_blkno != bp->b_lblkno));
2154 KASSERT(bp->b_blkno > 0);
2155
2156 wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno;
2157 wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount;
2158 wc->wc_len += bp->b_bcount;
2159 wc->wc_blkcount++;
2160 bp = LIST_NEXT(bp, b_wapbllist);
2161 }
2162 if (wc->wc_len % blocklen != 0) {
2163 padding = blocklen - wc->wc_len % blocklen;
2164 wc->wc_len += padding;
2165 } else {
2166 padding = 0;
2167 }
2168
2169 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2170 ("wapbl_write_blocks: len = %u (padding %zu) off = %"PRIdMAX"\n",
2171 wc->wc_len, padding, (intmax_t)off));
2172
2173 error = wapbl_circ_write(wl, wc, blocklen, &off);
2174 if (error)
2175 return error;
2176 bp = obp;
2177 cnt = 0;
2178 while (bp && (cnt++ < bph)) {
2179 error = wapbl_circ_write(wl, bp->b_data,
2180 bp->b_bcount, &off);
2181 if (error)
2182 return error;
2183 bp = LIST_NEXT(bp, b_wapbllist);
2184 }
2185 if (padding) {
2186 void *zero;
2187
2188 zero = wapbl_alloc(padding);
2189 memset(zero, 0, padding);
2190 error = wapbl_circ_write(wl, zero, padding, &off);
2191 wapbl_free(zero, padding);
2192 if (error)
2193 return error;
2194 }
2195 }
2196 *offp = off;
2197 return 0;
2198 }
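/*
 * Resulting log layout per loop iteration (sketch): one blocklist
 * header naming up to bph buffers, followed by those buffers' data,
 * zero-padded to a multiple of the log block size:
 *
 *	[blocklist header][buf0 data][buf1 data]...[zero padding]
 */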
2199
2200 static int
2201 wapbl_write_revocations(struct wapbl *wl, off_t *offp)
2202 {
2203 struct wapbl_wc_blocklist *wc =
2204 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
2205 int i;
2206 int blocklen = 1<<wl->wl_log_dev_bshift;
2207 int bph;
2208 off_t off = *offp;
2209 int error;
2210
2211 if (wl->wl_dealloccnt == 0)
2212 return 0;
2213
2214 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
2215 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
2216
2217 i = 0;
2218 while (i < wl->wl_dealloccnt) {
2219 wc->wc_type = WAPBL_WC_REVOCATIONS;
2220 wc->wc_len = blocklen;
2221 wc->wc_blkcount = 0;
2222 while ((i < wl->wl_dealloccnt) && (wc->wc_blkcount < bph)) {
2223 wc->wc_blocks[wc->wc_blkcount].wc_daddr =
2224 wl->wl_deallocblks[i];
2225 wc->wc_blocks[wc->wc_blkcount].wc_dlen =
2226 wl->wl_dealloclens[i];
2227 wc->wc_blkcount++;
2228 i++;
2229 }
2230 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2231 ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n",
2232 wc->wc_len, (intmax_t)off));
2233 error = wapbl_circ_write(wl, wc, blocklen, &off);
2234 if (error)
2235 return error;
2236 }
2237 *offp = off;
2238 return 0;
2239 }
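/*
 * Why revocations matter (example): if a metadata block is journalled,
 * then freed and reallocated as file data before the log is truncated,
 * replaying the stale journalled copy would corrupt the file. The
 * records written above let replay drop such blocks; see
 * wapbl_replay_process_revocations() below.
 */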
2240
2241 static int
2242 wapbl_write_inodes(struct wapbl *wl, off_t *offp)
2243 {
2244 struct wapbl_wc_inodelist *wc =
2245 (struct wapbl_wc_inodelist *)wl->wl_wc_scratch;
2246 int i;
2247 int blocklen = 1 << wl->wl_log_dev_bshift;
2248 off_t off = *offp;
2249 int error;
2250
2251 struct wapbl_ino_head *wih;
2252 struct wapbl_ino *wi;
2253 int iph;
2254
2255 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
2256 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
2257
2258 i = 0;
2259 wih = &wl->wl_inohash[0];
2260 wi = NULL;
2261 do {
2262 wc->wc_type = WAPBL_WC_INODES;
2263 wc->wc_len = blocklen;
2264 wc->wc_inocnt = 0;
2265 wc->wc_clear = (i == 0);
2266 while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) {
2267 while (!wi) {
2268 KASSERT((wih - &wl->wl_inohash[0])
2269 <= wl->wl_inohashmask);
2270 wi = LIST_FIRST(wih++);
2271 }
2272 wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino;
2273 wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode;
2274 wc->wc_inocnt++;
2275 i++;
2276 wi = LIST_NEXT(wi, wi_hash);
2277 }
2278 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2279 ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n",
2280 wc->wc_len, (intmax_t)off));
2281 error = wapbl_circ_write(wl, wc, blocklen, &off);
2282 if (error)
2283 return error;
2284 } while (i < wl->wl_inohashcnt);
2285
2286 *offp = off;
2287 return 0;
2288 }
2289
2290 #endif /* _KERNEL */
2291
2292 /****************************************************************/
2293
2294 struct wapbl_blk {
2295 LIST_ENTRY(wapbl_blk) wb_hash;
2296 daddr_t wb_blk;
2297 off_t wb_off; /* Offset of this block in the log */
2298 };
2299 #define WAPBL_BLKPOOL_MIN 83
2300
2301 static void
2302 wapbl_blkhash_init(struct wapbl_replay *wr, u_int size)
2303 {
2304 if (size < WAPBL_BLKPOOL_MIN)
2305 size = WAPBL_BLKPOOL_MIN;
2306 KASSERT(wr->wr_blkhash == NULL);
2307 #ifdef _KERNEL
2308 wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask);
2309 #else /* ! _KERNEL */
2310 /* Manually implement hashinit */
2311 {
2312 unsigned long i, hashsize;
2313 for (hashsize = 1; hashsize < size; hashsize <<= 1)
2314 continue;
2315 wr->wr_blkhash = wapbl_alloc(hashsize * sizeof(*wr->wr_blkhash));
2316 for (i = 0; i < hashsize; i++)
2317 LIST_INIT(&wr->wr_blkhash[i]);
2318 wr->wr_blkhashmask = hashsize - 1;
2319 }
2320 #endif /* ! _KERNEL */
2321 }
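/*
 * The userland fallback above rounds the requested size up to a power
 * of two; e.g. size 83 yields hashsize 128 and wr_blkhashmask 0x7f,
 * so "blk & wr_blkhashmask" is always a valid bucket index.
 */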
2322
2323 static void
2324 wapbl_blkhash_free(struct wapbl_replay *wr)
2325 {
2326 KASSERT(wr->wr_blkhashcnt == 0);
2327 #ifdef _KERNEL
2328 hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask);
2329 #else /* ! _KERNEL */
2330 wapbl_free(wr->wr_blkhash,
2331 (wr->wr_blkhashmask + 1) * sizeof(*wr->wr_blkhash));
2332 #endif /* ! _KERNEL */
2333 }
2334
2335 static struct wapbl_blk *
2336 wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk)
2337 {
2338 struct wapbl_blk_head *wbh;
2339 struct wapbl_blk *wb;
2340 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2341 LIST_FOREACH(wb, wbh, wb_hash) {
2342 if (blk == wb->wb_blk)
2343 return wb;
2344 }
2345 return NULL;
2346 }
2347
2348 static void
2349 wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off)
2350 {
2351 struct wapbl_blk_head *wbh;
2352 struct wapbl_blk *wb;
2353 wb = wapbl_blkhash_get(wr, blk);
2354 if (wb) {
2355 KASSERT(wb->wb_blk == blk);
2356 wb->wb_off = off;
2357 } else {
2358 wb = wapbl_alloc(sizeof(*wb));
2359 wb->wb_blk = blk;
2360 wb->wb_off = off;
2361 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2362 LIST_INSERT_HEAD(wbh, wb, wb_hash);
2363 wr->wr_blkhashcnt++;
2364 }
2365 }
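/*
 * Note that the insert above is last-write-wins: a block journalled
 * more than once keeps only its newest log offset, so replay writes
 * each block's final contents exactly once.
 */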
2366
2367 static void
2368 wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk)
2369 {
2370 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2371 if (wb) {
2372 KASSERT(wr->wr_blkhashcnt > 0);
2373 wr->wr_blkhashcnt--;
2374 LIST_REMOVE(wb, wb_hash);
2375 wapbl_free(wb, sizeof(*wb));
2376 }
2377 }
2378
2379 static void
2380 wapbl_blkhash_clear(struct wapbl_replay *wr)
2381 {
2382 unsigned long i;
2383 for (i = 0; i <= wr->wr_blkhashmask; i++) {
2384 struct wapbl_blk *wb;
2385
2386 while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) {
2387 KASSERT(wr->wr_blkhashcnt > 0);
2388 wr->wr_blkhashcnt--;
2389 LIST_REMOVE(wb, wb_hash);
2390 wapbl_free(wb, sizeof(*wb));
2391 }
2392 }
2393 KASSERT(wr->wr_blkhashcnt == 0);
2394 }
2395
2396 /****************************************************************/
2397
2398 static int
2399 wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp)
2400 {
2401 size_t slen;
2402 off_t off = *offp;
2403 int error;
2404 daddr_t pbn;
2405
2406 KASSERT(((len >> wr->wr_log_dev_bshift) <<
2407 wr->wr_log_dev_bshift) == len);
2408
2409 if (off < wr->wr_circ_off)
2410 off = wr->wr_circ_off;
2411 slen = wr->wr_circ_off + wr->wr_circ_size - off;
2412 if (slen < len) {
2413 pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
2414 #ifdef _KERNEL
2415 pbn = btodb(pbn << wr->wr_log_dev_bshift);
2416 #endif
2417 error = wapbl_read(data, slen, wr->wr_devvp, pbn);
2418 if (error)
2419 return error;
2420 data = (uint8_t *)data + slen;
2421 len -= slen;
2422 off = wr->wr_circ_off;
2423 }
2424 pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
2425 #ifdef _KERNEL
2426 pbn = btodb(pbn << wr->wr_log_dev_bshift);
2427 #endif
2428 error = wapbl_read(data, len, wr->wr_devvp, pbn);
2429 if (error)
2430 return error;
2431 off += len;
2432 if (off >= wr->wr_circ_off + wr->wr_circ_size)
2433 off = wr->wr_circ_off;
2434 *offp = off;
2435 return 0;
2436 }
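/*
 * Wraparound example with illustrative numbers: with wr_circ_off =
 * 1024, wr_circ_size = 8192, off = 8704 and len = 1024, slen becomes
 * 1024 + 8192 - 8704 = 512 < len, so 512 bytes are read at the end of
 * the circle and the remaining 512 at offset 1024; *offp comes back
 * as 1536.
 */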
2437
2438 static void
2439 wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp)
2440 {
2441 size_t slen;
2442 off_t off = *offp;
2443
2444 KASSERT(((len >> wr->wr_log_dev_bshift) <<
2445 wr->wr_log_dev_bshift) == len);
2446
2447 if (off < wr->wr_circ_off)
2448 off = wr->wr_circ_off;
2449 slen = wr->wr_circ_off + wr->wr_circ_size - off;
2450 if (slen < len) {
2451 len -= slen;
2452 off = wr->wr_circ_off;
2453 }
2454 off += len;
2455 if (off >= wr->wr_circ_off + wr->wr_circ_size)
2456 off = wr->wr_circ_off;
2457 *offp = off;
2458 }
2459
2460 /****************************************************************/
2461
2462 int
2463 wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp,
2464 daddr_t off, size_t count, size_t blksize)
2465 {
2466 struct wapbl_replay *wr;
2467 int error;
2468 struct vnode *devvp;
2469 daddr_t logpbn;
2470 uint8_t *scratch;
2471 struct wapbl_wc_header *wch;
2472 struct wapbl_wc_header *wch2;
2473 /* Use this until we read the actual log header */
2474 int log_dev_bshift = ilog2(blksize);
2475 size_t used;
2476 daddr_t pbn;
2477
2478 WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
2479 ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n",
2480 vp, off, count, blksize));
2481
2482 if (off < 0)
2483 return EINVAL;
2484
2485 if (blksize < DEV_BSIZE)
2486 return EINVAL;
2487 if (blksize % DEV_BSIZE)
2488 return EINVAL;
2489
2490 #ifdef _KERNEL
2491 #if 0
2492 /* XXX vp->v_size isn't reliably set for VBLK devices,
2493 * especially root. However, we might still want to verify
2494 * that the full load is readable */
2495 if ((off + count) * blksize > vp->v_size)
2496 return EINVAL;
2497 #endif
2498 if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) {
2499 return error;
2500 }
2501 #else /* ! _KERNEL */
2502 devvp = vp;
2503 logpbn = off;
2504 #endif /* ! _KERNEL */
2505
2506 scratch = wapbl_alloc(MAXBSIZE);
2507
2508 pbn = logpbn;
2509 #ifdef _KERNEL
2510 pbn = btodb(pbn << log_dev_bshift);
2511 #endif
2512 error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, pbn);
2513 if (error)
2514 goto errout;
2515
2516 wch = (struct wapbl_wc_header *)scratch;
2517 wch2 =
2518 (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift));
2519 /* XXX verify checksums and magic numbers */
2520 if (wch->wc_type != WAPBL_WC_HEADER) {
2521 printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type);
2522 error = EFTYPE;
2523 goto errout;
2524 }
2525
2526 if (wch2->wc_generation > wch->wc_generation)
2527 wch = wch2;
2528
2529 wr = wapbl_calloc(1, sizeof(*wr));
2530
2531 wr->wr_logvp = vp;
2532 wr->wr_devvp = devvp;
2533 wr->wr_logpbn = logpbn;
2534
2535 wr->wr_scratch = scratch;
2536
2537 wr->wr_log_dev_bshift = wch->wc_log_dev_bshift;
2538 wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift;
2539 wr->wr_circ_off = wch->wc_circ_off;
2540 wr->wr_circ_size = wch->wc_circ_size;
2541 wr->wr_generation = wch->wc_generation;
2542
2543 used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail);
2544
2545 WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
2546 ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64
2547 " len=%"PRId64" used=%zu\n",
2548 wch->wc_head, wch->wc_tail, wch->wc_circ_off,
2549 wch->wc_circ_size, used));
2550
2551 wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift));
2552
2553 error = wapbl_replay_process(wr, wch->wc_head, wch->wc_tail);
2554 if (error) {
2555 wapbl_replay_stop(wr);
2556 wapbl_replay_free(wr);
2557 return error;
2558 }
2559
2560 *wrp = wr;
2561 return 0;
2562
2563 errout:
2564 wapbl_free(scratch, MAXBSIZE);
2565 return error;
2566 }
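/*
 * Typical mount-time use of the replay API (sketch, error handling
 * elided); "logstart", "logcount" and "logblksize" stand in for values
 * normally taken from the file system's superblock:
 *
 *	struct wapbl_replay *wr;
 *	wapbl_replay_start(&wr, devvp, logstart, logcount, logblksize);
 *	wapbl_replay_write(wr, fsdevvp);
 *	wapbl_replay_stop(wr);
 *	wapbl_replay_free(wr);
 */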
2567
2568 void
2569 wapbl_replay_stop(struct wapbl_replay *wr)
2570 {
2571
2572 if (!wapbl_replay_isopen(wr))
2573 return;
2574
2575 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n"));
2576
2577 wapbl_free(wr->wr_scratch, MAXBSIZE);
2578 wr->wr_scratch = NULL;
2579
2580 wr->wr_logvp = NULL;
2581
2582 wapbl_blkhash_clear(wr);
2583 wapbl_blkhash_free(wr);
2584 }
2585
2586 void
2587 wapbl_replay_free(struct wapbl_replay *wr)
2588 {
2589
2590 KDASSERT(!wapbl_replay_isopen(wr));
2591
2592 if (wr->wr_inodes)
2593 wapbl_free(wr->wr_inodes,
2594 wr->wr_inodescnt * sizeof(wr->wr_inodes[0]));
2595 wapbl_free(wr, sizeof(*wr));
2596 }
2597
2598 #ifdef _KERNEL
2599 int
2600 wapbl_replay_isopen1(struct wapbl_replay *wr)
2601 {
2602
2603 return wapbl_replay_isopen(wr);
2604 }
2605 #endif
2606
2607 static void
2608 wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp)
2609 {
2610 struct wapbl_wc_blocklist *wc =
2611 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2612 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2613 int i, j, n;
2614
2615 for (i = 0; i < wc->wc_blkcount; i++) {
2616 /*
2617 * Enter each physical block into the hashtable independently.
2618 */
2619 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
2620 for (j = 0; j < n; j++) {
2621 wapbl_blkhash_ins(wr, wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen),
2622 *offp);
2623 wapbl_circ_advance(wr, fsblklen, offp);
2624 }
2625 }
2626 }
2627
2628 static void
2629 wapbl_replay_process_revocations(struct wapbl_replay *wr)
2630 {
2631 struct wapbl_wc_blocklist *wc =
2632 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2633 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2634 int i, j, n;
2635
2636 for (i = 0; i < wc->wc_blkcount; i++) {
2637 /*
2638 * Remove any blocks found from the hashtable.
2639 */
2640 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
2641 for (j = 0; j < n; j++)
2642 wapbl_blkhash_rem(wr, wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
2643 }
2644 }
2645
2646 static void
2647 wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff, off_t newoff)
2648 {
2649 struct wapbl_wc_inodelist *wc =
2650 (struct wapbl_wc_inodelist *)wr->wr_scratch;
2651 void *new_inodes;
2652 const size_t oldsize = wr->wr_inodescnt * sizeof(wr->wr_inodes[0]);
2653
2654 KASSERT(sizeof(wr->wr_inodes[0]) == sizeof(wc->wc_inodes[0]));
2655
2656 /*
2657 * Keep track of where we found this, so that the location
2658 * won't be overwritten.
2659 */
2660 if (wc->wc_clear) {
2661 wr->wr_inodestail = oldoff;
2662 wr->wr_inodescnt = 0;
2663 if (wr->wr_inodes != NULL) {
2664 wapbl_free(wr->wr_inodes, oldsize);
2665 wr->wr_inodes = NULL;
2666 }
2667 }
2668 wr->wr_inodeshead = newoff;
2669 if (wc->wc_inocnt == 0)
2670 return;
2671
2672 new_inodes = wapbl_alloc((wr->wr_inodescnt + wc->wc_inocnt) *
2673 sizeof(wr->wr_inodes[0]));
2674 if (wr->wr_inodes != NULL) {
2675 memcpy(new_inodes, wr->wr_inodes, oldsize);
2676 wapbl_free(wr->wr_inodes, oldsize);
2677 }
2678 wr->wr_inodes = new_inodes;
2679 memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes,
2680 wc->wc_inocnt * sizeof(wr->wr_inodes[0]));
2681 wr->wr_inodescnt += wc->wc_inocnt;
2682 }
2683
2684 static int
2685 wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail)
2686 {
2687 off_t off;
2688 int error;
2689
2690 int logblklen = 1 << wr->wr_log_dev_bshift;
2691
2692 wapbl_blkhash_clear(wr);
2693
2694 off = tail;
2695 while (off != head) {
2696 struct wapbl_wc_null *wcn;
2697 off_t saveoff = off;
2698 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
2699 if (error)
2700 goto errout;
2701 wcn = (struct wapbl_wc_null *)wr->wr_scratch;
2702 switch (wcn->wc_type) {
2703 case WAPBL_WC_BLOCKS:
2704 wapbl_replay_process_blocks(wr, &off);
2705 break;
2706
2707 case WAPBL_WC_REVOCATIONS:
2708 wapbl_replay_process_revocations(wr);
2709 break;
2710
2711 case WAPBL_WC_INODES:
2712 wapbl_replay_process_inodes(wr, saveoff, off);
2713 break;
2714
2715 default:
2716 printf("Unrecognized wapbl type: 0x%08x\n",
2717 wcn->wc_type);
2718 error = EFTYPE;
2719 goto errout;
2720 }
2721 wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
2722 if (off != saveoff) {
2723 printf("wapbl_replay: corrupted records\n");
2724 error = EFTYPE;
2725 goto errout;
2726 }
2727 }
2728 return 0;
2729
2730 errout:
2731 wapbl_blkhash_clear(wr);
2732 return error;
2733 }
2734
2735 #if 0 /* XXX stale: references wch, which is not in scope in this function */
2736 int
2737 wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp)
2738 {
2739 off_t off;
2740 int mismatchcnt = 0;
2741 int logblklen = 1 << wr->wr_log_dev_bshift;
2742 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2743 void *scratch1 = wapbl_alloc(MAXBSIZE);
2744 void *scratch2 = wapbl_alloc(MAXBSIZE);
2745 int error = 0;
2746
2747 KDASSERT(wapbl_replay_isopen(wr));
2748
2749 off = wch->wc_tail;
2750 while (off != wch->wc_head) {
2751 struct wapbl_wc_null *wcn;
2752 #ifdef DEBUG
2753 off_t saveoff = off;
2754 #endif
2755 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
2756 if (error)
2757 goto out;
2758 wcn = (struct wapbl_wc_null *)wr->wr_scratch;
2759 switch (wcn->wc_type) {
2760 case WAPBL_WC_BLOCKS:
2761 {
2762 struct wapbl_wc_blocklist *wc =
2763 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2764 int i;
2765 for (i = 0; i < wc->wc_blkcount; i++) {
2766 int foundcnt = 0;
2767 int dirtycnt = 0;
2768 int j, n;
2769 /*
2770 * Check each physical block against the
2771 * hashtable independently.
2772 */
2773 n = wc->wc_blocks[i].wc_dlen >>
2774 wch->wc_fs_dev_bshift;
2775 for (j = 0; j < n; j++) {
2776 struct wapbl_blk *wb =
2777 wapbl_blkhash_get(wr,
2778 wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
2779 if (wb && (wb->wb_off == off)) {
2780 foundcnt++;
2781 error =
2782 wapbl_circ_read(wr,
2783 scratch1, fsblklen,
2784 &off);
2785 if (error)
2786 goto out;
2787 error =
2788 wapbl_read(scratch2,
2789 fsblklen, fsdevvp,
2790 wb->wb_blk);
2791 if (error)
2792 goto out;
2793 if (memcmp(scratch1,
2794 scratch2,
2795 fsblklen)) {
2796 printf(
2797 "wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n",
2798 wb->wb_blk, (intmax_t)off);
2799 dirtycnt++;
2800 mismatchcnt++;
2801 }
2802 } else {
2803 wapbl_circ_advance(wr,
2804 fsblklen, &off);
2805 }
2806 }
2807 #if 0
2808 /*
2809 * If all of the blocks in an entry
2810 * are clean, then remove all of its
2811 * blocks from the hashtable since they
2812 * never will need replay.
2813 */
2814 if ((foundcnt != 0) &&
2815 (dirtycnt == 0)) {
2816 off = saveoff;
2817 wapbl_circ_advance(wr,
2818 logblklen, &off);
2819 for (j = 0; j < n; j++) {
2820 struct wapbl_blk *wb =
2821 wapbl_blkhash_get(wr,
2822 wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
2823 if (wb &&
2824 (wb->wb_off == off)) {
2825 wapbl_blkhash_rem(wr, wb->wb_blk);
2826 }
2827 wapbl_circ_advance(wr,
2828 fsblklen, &off);
2829 }
2830 }
2831 #endif
2832 }
2833 }
2834 break;
2835 case WAPBL_WC_REVOCATIONS:
2836 case WAPBL_WC_INODES:
2837 break;
2838 default:
2839 KASSERT(0);
2840 }
2841 #ifdef DEBUG
2842 wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
2843 KASSERT(off == saveoff);
2844 #endif
2845 }
2846 out:
2847 wapbl_free(scratch1, MAXBSIZE);
2848 wapbl_free(scratch2, MAXBSIZE);
2849 if (!error && mismatchcnt)
2850 error = EFTYPE;
2851 return error;
2852 }
2853 #endif
2854
2855 int
2856 wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp)
2857 {
2858 struct wapbl_blk *wb;
2859 size_t i;
2860 off_t off;
2861 void *scratch;
2862 int error = 0;
2863 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2864
2865 KDASSERT(wapbl_replay_isopen(wr));
2866
2867 scratch = wapbl_alloc(MAXBSIZE);
2868
2869 for (i = 0; i <= wr->wr_blkhashmask; ++i) {
2870 LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) {
2871 off = wb->wb_off;
2872 error = wapbl_circ_read(wr, scratch, fsblklen, &off);
2873 if (error)
2874 break;
2875 error = wapbl_write(scratch, fsblklen, fsdevvp,
2876 wb->wb_blk);
2877 if (error)
2878 break;
2879 }
2880 }
2881
2882 wapbl_free(scratch, MAXBSIZE);
2883 return error;
2884 }
2885
2886 int
2887 wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len)
2888 {
2889 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2890
2891 KDASSERT(wapbl_replay_isopen(wr));
2892 KASSERT((len % fsblklen) == 0);
2893
2894 while (len != 0) {
2895 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2896 if (wb)
2897 return 1;
2898 len -= fsblklen;
blk++; /* advance to the next fs block, as wapbl_replay_read() does */
2899 }
2900 return 0;
2901 }
2902
2903 int
2904 wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len)
2905 {
2906 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2907
2908 KDASSERT(wapbl_replay_isopen(wr));
2909
2910 KASSERT((len % fsblklen) == 0);
2911
2912 while (len != 0) {
2913 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2914 if (wb) {
2915 off_t off = wb->wb_off;
2916 int error;
2917 error = wapbl_circ_read(wr, data, fsblklen, &off);
2918 if (error)
2919 return error;
2920 }
2921 data = (uint8_t *)data + fsblklen;
2922 len -= fsblklen;
2923 blk++;
2924 }
2925 return 0;
2926 }
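/*
 * Note that wapbl_replay_read() only overwrites the fs blocks it finds
 * in the log; blocks without a log entry leave the caller's buffer
 * untouched, so callers are expected to fill the buffer from the file
 * system device first and overlay the journalled contents here.
 */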
2927
2928 #ifdef _KERNEL
2929 /*
2930 * This is not really a module now, but maybe it is on its way to
2931 * being one some day.
2932 */
2933 MODULE(MODULE_CLASS_VFS, wapbl, NULL);
2934
2935 static int
2936 wapbl_modcmd(modcmd_t cmd, void *arg)
2937 {
2938
2939 switch (cmd) {
2940 case MODULE_CMD_INIT:
2941 wapbl_init();
2942 return 0;
2943 case MODULE_CMD_FINI:
2944 #ifdef notyet
2945 return wapbl_fini(true);
2946 #endif
2947 return EOPNOTSUPP;
2948 default:
2949 return ENOTTY;
2950 }
2951 }
2952 #endif /* _KERNEL */
2953