/*	$NetBSD: vfs_wapbl.c,v 1.37 2010/09/10 10:14:55 drochner Exp $	*/

/*-
 * Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * This implements file-system-independent write-ahead filesystem
 * logging (WAPBL).
 */

#define WAPBL_INTERNAL

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.37 2010/09/10 10:14:55 drochner Exp $");

#include <sys/param.h>
#include <sys/bitops.h>

#ifdef _KERNEL
#include <sys/param.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/uio.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/resourcevar.h>
#include <sys/conf.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/kauth.h>
#include <sys/mutex.h>
#include <sys/atomic.h>
#include <sys/wapbl.h>
#include <sys/wapbl_replay.h>

#include <miscfs/specfs/specdev.h>

#if 0 /* notyet */
#define	wapbl_malloc(s)		kmem_alloc((s), KM_SLEEP)
#define	wapbl_free(a, s)	kmem_free((a), (s))
#define	wapbl_calloc(n, s)	kmem_zalloc((n)*(s), KM_SLEEP)
#else
MALLOC_JUSTDEFINE(M_WAPBL, "wapbl", "write-ahead physical block logging");
#define	wapbl_malloc(s)		malloc((s), M_WAPBL, M_WAITOK)
#define	wapbl_free(a, s)	free((a), M_WAPBL)
#define	wapbl_calloc(n, s)	malloc((n)*(s), M_WAPBL, M_WAITOK | M_ZERO)
#endif

#else /* !_KERNEL */
#include <assert.h>
#include <errno.h>
#include <stdio.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

#include <sys/time.h>
#include <sys/wapbl.h>
#include <sys/wapbl_replay.h>

#define	KDASSERT(x)		assert(x)
#define	KASSERT(x)		assert(x)
#define	wapbl_malloc(s)		malloc(s)
#define	wapbl_free(a, s)	free(a)
#define	wapbl_calloc(n, s)	calloc((n), (s))

#endif /* !_KERNEL */

/*
 * INTERNAL DATA STRUCTURES
 */

/*
 * This structure holds per-mount log information.
 *
 * Legend:	a = atomic access only
 *		r = read-only after init
 *		l = rwlock held
 *		m = mutex held
 *		u = unlocked access ok
 *		b = bufcache_lock held
 */
struct wapbl {
	struct vnode *wl_logvp;	/* r:	log here */
	struct vnode *wl_devvp;	/* r:	log on this device */
	struct mount *wl_mount;	/* r:	mountpoint wl is associated with */
	daddr_t wl_logpbn;	/* r:	Physical block number of start of log */
	int wl_log_dev_bshift;	/* r:	logarithm of device block size of log
					device */
	int wl_fs_dev_bshift;	/* r:	logarithm of device block size of
					filesystem device */

	unsigned wl_lock_count;	/* m:	Count of transactions in progress */

	size_t wl_circ_size;	/* r:	Number of bytes in buffer of log */
	size_t wl_circ_off;	/* r:	Number of bytes reserved at start */

	size_t wl_bufcount_max;	/* r:	Number of buffers reserved for log */
	size_t wl_bufbytes_max;	/* r:	Number of buf bytes reserved for log */

	off_t wl_head;		/* l:	Byte offset of log head */
	off_t wl_tail;		/* l:	Byte offset of log tail */
	/*
	 * head == tail == 0 means log is empty
	 * head == tail != 0 means log is full
	 * See assertions in wapbl_advance() for other boundary conditions.
	 * Only truncate moves the tail, except when flush sets it to
	 * wl_header_size; only flush moves the head, except when truncate
	 * sets it to 0.
	 */

	struct wapbl_wc_header *wl_wc_header;	/* l */
	void *wl_wc_scratch;	/* l:	scratch space (XXX: why?!?) */

	kmutex_t wl_mtx;	/* u:	short-term lock */
	krwlock_t wl_rwlock;	/* u:	File system transaction lock */

	/*
	 * wl_mtx must be held while accessing wl_bufcount, wl_bufs,
	 * or the head and tail offsets.
	 */

	/*
	 * Callback called from within the flush routine to flush any extra
	 * bits.  Note that flush may be skipped without calling this if
	 * there are no outstanding buffers in the transaction.
	 */
#ifdef _KERNEL
	wapbl_flush_fn_t wl_flush;	/* r */
	wapbl_flush_fn_t wl_flush_abort;/* r */
#endif

	size_t wl_bufbytes;	/* m:	Byte count of pages in wl_bufs */
	size_t wl_bufcount;	/* m:	Count of buffers in wl_bufs */
	size_t wl_bcount;	/* m:	Total bcount of wl_bufs */

	LIST_HEAD(, buf) wl_bufs; /* m:	Buffers in current transaction */

	kcondvar_t wl_reclaimable_cv;	/* m (obviously) */
	size_t wl_reclaimable_bytes;	/* m:	Amount of space available for
					   reclamation by truncate */
	int wl_error_count;	/* m:	# of wl_entries with errors */
	size_t wl_reserved_bytes; /* never truncate log smaller than this */

#ifdef WAPBL_DEBUG_BUFBYTES
	size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */
#endif

	daddr_t *wl_deallocblks;/* l:	address of block */
	int *wl_dealloclens;	/* l:	size of block */
	int wl_dealloccnt;	/* l:	total count */
	int wl_dealloclim;	/* l:	max count */

	/* hashtable of inode numbers for allocated but unlinked inodes */
	/* synch ??? */
	LIST_HEAD(wapbl_ino_head, wapbl_ino) *wl_inohash;
	u_long wl_inohashmask;
	int wl_inohashcnt;

	SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction
						   accounting */
};

#ifdef WAPBL_DEBUG_PRINT
int wapbl_debug_print = WAPBL_DEBUG_PRINT;
#endif

/****************************************************************/
#ifdef _KERNEL

#ifdef WAPBL_DEBUG
struct wapbl *wapbl_debug_wl;
#endif

static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail);
static int wapbl_write_blocks(struct wapbl *wl, off_t *offp);
static int wapbl_write_revocations(struct wapbl *wl, off_t *offp);
static int wapbl_write_inodes(struct wapbl *wl, off_t *offp);
#endif /* _KERNEL */

static int wapbl_replay_process(struct wapbl_replay *wr, off_t, off_t);

static inline size_t wapbl_space_free(size_t avail, off_t head,
	off_t tail);
static inline size_t wapbl_space_used(size_t avail, off_t head,
	off_t tail);

#ifdef _KERNEL

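/*
 * Size of the hash table used to track allocated-but-unlinked inodes;
 * 83 is prime, presumably chosen so inode numbers spread evenly over
 * the buckets.
 */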
#define	WAPBL_INODETRK_SIZE 83
static int wapbl_ino_pool_refcount;
static struct pool wapbl_ino_pool;
struct wapbl_ino {
	LIST_ENTRY(wapbl_ino) wi_hash;
	ino_t wi_ino;
	mode_t wi_mode;
};

static void wapbl_inodetrk_init(struct wapbl *wl, u_int size);
static void wapbl_inodetrk_free(struct wapbl *wl);
static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino);

static size_t wapbl_transaction_len(struct wapbl *wl);
static inline size_t wapbl_transaction_inodes_len(struct wapbl *wl);

#if 0
int wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
#endif

static int wapbl_replay_isopen1(struct wapbl_replay *);

/*
 * This is useful for debugging.  If set, the log will
 * only be truncated when necessary.
 */
int wapbl_lazy_truncate = 0;

struct wapbl_ops wapbl_ops = {
	.wo_wapbl_discard	= wapbl_discard,
	.wo_wapbl_replay_isopen	= wapbl_replay_isopen1,
	.wo_wapbl_replay_can_read = wapbl_replay_can_read,
	.wo_wapbl_replay_read	= wapbl_replay_read,
	.wo_wapbl_add_buf	= wapbl_add_buf,
	.wo_wapbl_remove_buf	= wapbl_remove_buf,
	.wo_wapbl_resize_buf	= wapbl_resize_buf,
	.wo_wapbl_begin		= wapbl_begin,
	.wo_wapbl_end		= wapbl_end,
	.wo_wapbl_junlock_assert= wapbl_junlock_assert,

	/* XXX: the following is only used to say "this is a wapbl buf" */
	.wo_wapbl_biodone	= wapbl_biodone,
};

static int
wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr)
{
	int error, i;

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_start: reusing log with %d inodes\n", wr->wr_inodescnt));

	/*
	 * It's only valid to reuse the replay log if it's
	 * the same as the new log we just opened.
	 */
	KDASSERT(!wapbl_replay_isopen(wr));
	KASSERT(wl->wl_devvp->v_rdev == wr->wr_devvp->v_rdev);
	KASSERT(wl->wl_logpbn == wr->wr_logpbn);
	KASSERT(wl->wl_circ_size == wr->wr_circ_size);
	KASSERT(wl->wl_circ_off == wr->wr_circ_off);
	KASSERT(wl->wl_log_dev_bshift == wr->wr_log_dev_bshift);
	KASSERT(wl->wl_fs_dev_bshift == wr->wr_fs_dev_bshift);

	wl->wl_wc_header->wc_generation = wr->wr_generation + 1;

	for (i = 0; i < wr->wr_inodescnt; i++)
		wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber,
		    wr->wr_inodes[i].wr_imode);

	/* Make sure new transaction won't overwrite old inodes list */
	KDASSERT(wapbl_transaction_len(wl) <=
	    wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead,
	    wr->wr_inodestail));

	wl->wl_head = wl->wl_tail = wr->wr_inodeshead;
	wl->wl_reclaimable_bytes = wl->wl_reserved_bytes =
	    wapbl_transaction_len(wl);

	error = wapbl_write_inodes(wl, &wl->wl_head);
	if (error)
		return error;

	KASSERT(wl->wl_head != wl->wl_tail);
	KASSERT(wl->wl_head != 0);

	return 0;
}

int
wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp,
	daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr,
	wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn)
{
	struct wapbl *wl;
	struct vnode *devvp;
	daddr_t logpbn;
	int error;
	int log_dev_bshift = ilog2(blksize);
	int fs_dev_bshift = log_dev_bshift;
	int run;

	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64
	    " count=%zu blksize=%zu\n", vp, off, count, blksize));

	if (log_dev_bshift > fs_dev_bshift) {
		WAPBL_PRINTF(WAPBL_PRINT_OPEN,
		    ("wapbl: log device's block size cannot be larger "
		     "than filesystem's\n"));
		/*
		 * Not currently implemented, although it could be if
		 * needed someday.
		 */
		return ENOSYS;
	}

	if (off < 0)
		return EINVAL;

	if (blksize < DEV_BSIZE)
		return EINVAL;
	if (blksize % DEV_BSIZE)
		return EINVAL;

	/* XXXTODO: verify that the full load is writable */

	/*
	 * XXX check for minimum log size:
	 * the minimum is governed by the minimum amount of space
	 * needed to complete a transaction (probably a truncate).
	 */
	/* XXX for now pick something minimal */
	if ((count * blksize) < MAXPHYS) {
		return ENOSPC;
	}

	if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) {
		return error;
	}

	wl = wapbl_calloc(1, sizeof(*wl));
	rw_init(&wl->wl_rwlock);
	mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&wl->wl_reclaimable_cv, "wapblrec");
	LIST_INIT(&wl->wl_bufs);
	SIMPLEQ_INIT(&wl->wl_entries);

	wl->wl_logvp = vp;
	wl->wl_devvp = devvp;
	wl->wl_mount = mp;
	wl->wl_logpbn = logpbn;
	wl->wl_log_dev_bshift = log_dev_bshift;
	wl->wl_fs_dev_bshift = fs_dev_bshift;

	wl->wl_flush = flushfn;
	wl->wl_flush_abort = flushabortfn;

	/* Reserve two log device blocks for the commit headers */
	wl->wl_circ_off = 2<<wl->wl_log_dev_bshift;
	wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off);
	/* truncate the log usage to a multiple of log_dev_bshift */
	wl->wl_circ_size >>= wl->wl_log_dev_bshift;
	wl->wl_circ_size <<= wl->wl_log_dev_bshift;

	/*
	 * wl_bufbytes_max limits the size of the in-memory transaction space.
	 * - Since buffers are allocated and accounted for in units of
	 *   PAGE_SIZE it is required to be a multiple of PAGE_SIZE
	 *   (i.e. 1<<PAGE_SHIFT)
	 * - Since the log device has to be written in units of
	 *   1<<wl_log_dev_bshift it is required to be a multiple of
	 *   1<<wl_log_dev_bshift.
	 * - Since the filesystem will provide data in units of
	 *   1<<wl_fs_dev_bshift, it is convenient for it to be a multiple
	 *   of 1<<wl_fs_dev_bshift.
	 * Therefore it must be a multiple of the least common multiple of
	 * those three quantities.  Fortunately, all of those quantities are
	 * guaranteed to be a power of two, and the least common multiple of
	 * a set of numbers which are all powers of two is simply the maximum
	 * of those numbers.  Finally, the maximum logarithm of a power of two
	 * is the same as the log of the maximum power of two.  So we can do
	 * the following operations to size wl_bufbytes_max:
	 */
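
	/*
	 * For example (hypothetical numbers, not tied to any particular
	 * port): with PAGE_SHIFT = 12 and wl_log_dev_bshift =
	 * wl_fs_dev_bshift = 9, each >>/<< pair below clears the low-order
	 * bits for one alignment constraint, so the net effect is rounding
	 * wl_bufbytes_max down to a multiple of max(4096, 512, 512) = 4096.
	 */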

	/* XXX fix actual number of pages reserved per filesystem. */
	wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2);

	/* Round wl_bufbytes_max to the largest power of two constraint */
	wl->wl_bufbytes_max >>= PAGE_SHIFT;
	wl->wl_bufbytes_max <<= PAGE_SHIFT;
	wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift;

	/* XXX maybe use filesystem fragment size instead of 1024 */
	/* XXX fix actual number of buffers reserved per filesystem. */
	wl->wl_bufcount_max = (nbuf / 2) * 1024;

	/* XXX tie this into resource estimation */
	wl->wl_dealloclim = 2 * btodb(wl->wl_bufbytes_max);

	wl->wl_deallocblks = wapbl_malloc(sizeof(*wl->wl_deallocblks) *
	    wl->wl_dealloclim);
	wl->wl_dealloclens = wapbl_malloc(sizeof(*wl->wl_dealloclens) *
	    wl->wl_dealloclim);

	wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);

	/* Initialize the commit header */
	{
		struct wapbl_wc_header *wc;
		size_t len = 1 << wl->wl_log_dev_bshift;
		wc = wapbl_calloc(1, len);
		wc->wc_type = WAPBL_WC_HEADER;
		wc->wc_len = len;
		wc->wc_circ_off = wl->wl_circ_off;
		wc->wc_circ_size = wl->wl_circ_size;
		/* XXX wc->wc_fsid */
		wc->wc_log_dev_bshift = wl->wl_log_dev_bshift;
		wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift;
		wl->wl_wc_header = wc;
		wl->wl_wc_scratch = wapbl_malloc(len);
	}

	/*
	 * if there was an existing set of unlinked but
	 * allocated inodes, preserve it in the new
	 * log.
	 */
	if (wr && wr->wr_inodescnt) {
		error = wapbl_start_flush_inodes(wl, wr);
		if (error)
			goto errout;
	}

	error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail);
	if (error) {
		goto errout;
	}

	*wlp = wl;
#if defined(WAPBL_DEBUG)
	wapbl_debug_wl = wl;
#endif

	return 0;
errout:
	wapbl_discard(wl);
	wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_deallocblks,
	    sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim);
	wapbl_free(wl->wl_dealloclens,
	    sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim);
	wapbl_inodetrk_free(wl);
	wapbl_free(wl, sizeof(*wl));

	return error;
}

/*
 * Like wapbl_flush, but discards the transaction
 * completely.
 */

void
wapbl_discard(struct wapbl *wl)
{
	struct wapbl_entry *we;
	struct buf *bp;
	int i;

	/*
	 * XXX we may consider using upgrade here
	 * if we want to call flush from inside a transaction
	 */
	rw_enter(&wl->wl_rwlock, RW_WRITER);
	wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
	    wl->wl_dealloccnt);

#ifdef WAPBL_DEBUG_PRINT
	{
		pid_t pid = -1;
		lwpid_t lid = -1;
		if (curproc)
			pid = curproc->p_pid;
		if (curlwp)
			lid = curlwp->l_lid;
#ifdef WAPBL_DEBUG_BUFBYTES
		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
		    ("wapbl_discard: thread %d.%d discarding "
		    "transaction\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %u, reclaimable=%zu reserved=%zu "
		    "unsynced=%zu\n",
		    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
		    wl->wl_bcount, wl->wl_dealloccnt,
		    wl->wl_inohashcnt, wl->wl_error_count,
		    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
		    wl->wl_unsynced_bufbytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d, unsynced = %zu\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error, we->we_unsynced_bufbytes));
		}
#else /* !WAPBL_DEBUG_BUFBYTES */
		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
		    ("wapbl_discard: thread %d.%d discarding transaction\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %u, reclaimable=%zu reserved=%zu\n",
		    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
		    wl->wl_bcount, wl->wl_dealloccnt,
		    wl->wl_inohashcnt, wl->wl_error_count,
		    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error));
		}
#endif /* !WAPBL_DEBUG_BUFBYTES */
	}
#endif /* WAPBL_DEBUG_PRINT */

	for (i = 0; i <= wl->wl_inohashmask; i++) {
		struct wapbl_ino_head *wih;
		struct wapbl_ino *wi;

		wih = &wl->wl_inohash[i];
		while ((wi = LIST_FIRST(wih)) != NULL) {
			LIST_REMOVE(wi, wi_hash);
			pool_put(&wapbl_ino_pool, wi);
			KASSERT(wl->wl_inohashcnt > 0);
			wl->wl_inohashcnt--;
		}
	}

	/*
	 * clean buffer list
	 */
	mutex_enter(&bufcache_lock);
	mutex_enter(&wl->wl_mtx);
	while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
		if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) {
			/*
			 * The buffer will be unlocked and
			 * removed from the transaction in brelse
			 */
			mutex_exit(&wl->wl_mtx);
			brelsel(bp, 0);
			mutex_enter(&wl->wl_mtx);
		}
	}
	mutex_exit(&wl->wl_mtx);
	mutex_exit(&bufcache_lock);

	/*
	 * Remove references to this wl from wl_entries, free any which
	 * no longer have buffers, others will be freed in wapbl_biodone
	 * when they no longer have any buffers.
	 */
	while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) {
		SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
		/* XXX should we be accumulating wl_error_count
		 * and increasing reclaimable bytes ? */
		we->we_wapbl = NULL;
		if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
			KASSERT(we->we_unsynced_bufbytes == 0);
#endif
			wapbl_free(we, sizeof(*we));
		}
	}

	/* Discard list of deallocs */
	wl->wl_dealloccnt = 0;
	/* XXX should we clear wl_reserved_bytes? */

	KASSERT(wl->wl_bufbytes == 0);
	KASSERT(wl->wl_bcount == 0);
	KASSERT(wl->wl_bufcount == 0);
	KASSERT(LIST_EMPTY(&wl->wl_bufs));
	KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
	KASSERT(wl->wl_inohashcnt == 0);

	rw_exit(&wl->wl_rwlock);
}

int
wapbl_stop(struct wapbl *wl, int force)
{
	struct vnode *vp;
	int error;

	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n"));
	error = wapbl_flush(wl, 1);
	if (error) {
		if (force)
			wapbl_discard(wl);
		else
			return error;
	}

	/* Unlinked inodes persist after a flush */
	if (wl->wl_inohashcnt) {
		if (force) {
			wapbl_discard(wl);
		} else {
			return EBUSY;
		}
	}

	KASSERT(wl->wl_bufbytes == 0);
	KASSERT(wl->wl_bcount == 0);
	KASSERT(wl->wl_bufcount == 0);
	KASSERT(LIST_EMPTY(&wl->wl_bufs));
	KASSERT(wl->wl_dealloccnt == 0);
	KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
	KASSERT(wl->wl_inohashcnt == 0);

	vp = wl->wl_logvp;

	wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_deallocblks,
	    sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim);
	wapbl_free(wl->wl_dealloclens,
	    sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim);
	wapbl_inodetrk_free(wl);

	cv_destroy(&wl->wl_reclaimable_cv);
	mutex_destroy(&wl->wl_mtx);
	rw_destroy(&wl->wl_rwlock);
	wapbl_free(wl, sizeof(*wl));

	return 0;
}

static int
wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
{
	struct pstats *pstats = curlwp->l_proc->p_stats;
	struct buf *bp;
	int error;

	KASSERT((flags & ~(B_WRITE | B_READ)) == 0);
	KASSERT(devvp->v_type == VBLK);

	if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
		mutex_enter(&devvp->v_interlock);
		devvp->v_numoutput++;
		mutex_exit(&devvp->v_interlock);
		pstats->p_ru.ru_oublock++;
	} else {
		pstats->p_ru.ru_inblock++;
	}

	bp = getiobuf(devvp, true);
	bp->b_flags = flags;
	bp->b_cflags = BC_BUSY; /* silly & dubious */
	bp->b_dev = devvp->v_rdev;
	bp->b_data = data;
	bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
	bp->b_blkno = pbn;

	WAPBL_PRINTF(WAPBL_PRINT_IO,
	    ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%"PRIx64"\n",
	    BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount,
	    bp->b_blkno, bp->b_dev));

	VOP_STRATEGY(devvp, bp);

	error = biowait(bp);
	putiobuf(bp);

	if (error) {
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_doio: %s %zu bytes at block %" PRId64
		    " on dev 0x%"PRIx64" failed with error %d\n",
		    (((flags & (B_WRITE | B_READ)) == B_WRITE) ?
		     "write" : "read"),
		    len, pbn, devvp->v_rdev, error));
	}

	return error;
}

int
wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

	return wapbl_doio(data, len, devvp, pbn, B_WRITE);
}

int
wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

	return wapbl_doio(data, len, devvp, pbn, B_READ);
}

/*
 * Write len bytes of data into the circular log at byte offset *offp,
 * handling log wraparound; on success, *offp is updated to the offset
 * for the next write.
 */
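/*
 * For example (hypothetical geometry): with wl_circ_off = 1024 and
 * wl_circ_size = 8192, the valid log region is [1024, 9216).  Writing
 * 2048 bytes starting at *offp = 8704 is split into a 512-byte write
 * at 8704 (slen = 1024 + 8192 - 8704) followed by a 1536-byte write
 * at 1024, leaving *offp = 2560.
 */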
static int
wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp)
{
	size_t slen;
	off_t off = *offp;
	int error;
	daddr_t pbn;

	KDASSERT(((len >> wl->wl_log_dev_bshift) <<
	    wl->wl_log_dev_bshift) == len);

	if (off < wl->wl_circ_off)
		off = wl->wl_circ_off;
	slen = wl->wl_circ_off + wl->wl_circ_size - off;
	if (slen < len) {
		pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
#ifdef _KERNEL
		pbn = btodb(pbn << wl->wl_log_dev_bshift);
#endif
		error = wapbl_write(data, slen, wl->wl_devvp, pbn);
		if (error)
			return error;
		data = (uint8_t *)data + slen;
		len -= slen;
		off = wl->wl_circ_off;
	}
	pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
#ifdef _KERNEL
	pbn = btodb(pbn << wl->wl_log_dev_bshift);
#endif
	error = wapbl_write(data, len, wl->wl_devvp, pbn);
	if (error)
		return error;
	off += len;
	if (off >= wl->wl_circ_off + wl->wl_circ_size)
		off = wl->wl_circ_off;
	*offp = off;
	return 0;
}

/****************************************************************/

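/*
 * wapbl_begin()/wapbl_end() bracket a filesystem transaction.  They
 * take wl_rwlock as a reader, so many transactions may proceed
 * concurrently, while wapbl_flush() and wapbl_discard(), which take
 * it as a writer, exclude them all.
 */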
int
wapbl_begin(struct wapbl *wl, const char *file, int line)
{
	int doflush;
	unsigned lockcount;

	KDASSERT(wl);

	/*
	 * XXX this needs to be made much more sophisticated.
	 * perhaps each wapbl_begin could reserve a specified
	 * number of buffers and bytes.
	 */
	mutex_enter(&wl->wl_mtx);
	lockcount = wl->wl_lock_count;
	doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) >
		   wl->wl_bufbytes_max / 2) ||
		  ((wl->wl_bufcount + (lockcount * 10)) >
		   wl->wl_bufcount_max / 2) ||
		  (wapbl_transaction_len(wl) > wl->wl_circ_size / 2) ||
		  (wl->wl_dealloccnt >=
		   (wl->wl_dealloclim - (wl->wl_dealloclim >> 8)));
	mutex_exit(&wl->wl_mtx);

	if (doflush) {
		WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		    ("force flush lockcnt=%d bufbytes=%zu "
		    "(max=%zu) bufcount=%zu (max=%zu) "
		    "dealloccnt %d (lim=%d)\n",
		    lockcount, wl->wl_bufbytes,
		    wl->wl_bufbytes_max, wl->wl_bufcount,
		    wl->wl_bufcount_max,
		    wl->wl_dealloccnt, wl->wl_dealloclim));
	}

	if (doflush) {
		int error = wapbl_flush(wl, 0);
		if (error)
			return error;
	}

	rw_enter(&wl->wl_rwlock, RW_READER);
	mutex_enter(&wl->wl_mtx);
	wl->wl_lock_count++;
	mutex_exit(&wl->wl_mtx);

#if defined(WAPBL_DEBUG_PRINT)
	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
	    ("wapbl_begin thread %d.%d with bufcount=%zu "
	    "bufbytes=%zu bcount=%zu at %s:%d\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount, file, line));
#endif

	return 0;
}

void
wapbl_end(struct wapbl *wl)
{

#if defined(WAPBL_DEBUG_PRINT)
	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
	    ("wapbl_end thread %d.%d with bufcount=%zu "
	    "bufbytes=%zu bcount=%zu\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount));
#endif

	mutex_enter(&wl->wl_mtx);
	KASSERT(wl->wl_lock_count > 0);
	wl->wl_lock_count--;
	mutex_exit(&wl->wl_mtx);

	rw_exit(&wl->wl_rwlock);
}

void
wapbl_add_buf(struct wapbl *wl, struct buf * bp)
{

	KASSERT(bp->b_cflags & BC_BUSY);
	KASSERT(bp->b_vp);

	wapbl_jlock_assert(wl);

#if 0
	/*
	 * XXX this might be an issue for swapfiles.
	 * see uvm_swap.c:1702
	 *
	 * XXX2 why require it then? leap of semantics?
	 */
	KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
#endif

	mutex_enter(&wl->wl_mtx);
	if (bp->b_flags & B_LOCKED) {
		LIST_REMOVE(bp, b_wapbllist);
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER2,
		    ("wapbl_add_buf thread %d.%d re-adding buf %p "
		    "with %d bytes %d bcount\n",
		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
		    bp->b_bcount));
	} else {
		/* unlocked but dirty buffers shouldn't exist */
		KASSERT(!(bp->b_oflags & BO_DELWRI));
		wl->wl_bufbytes += bp->b_bufsize;
		wl->wl_bcount += bp->b_bcount;
		wl->wl_bufcount++;
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
		    ("wapbl_add_buf thread %d.%d adding buf %p "
		    "with %d bytes %d bcount\n",
		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
		    bp->b_bcount));
	}
	LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist);
	mutex_exit(&wl->wl_mtx);

	bp->b_flags |= B_LOCKED;
}

static void
wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp)
{

	KASSERT(mutex_owned(&wl->wl_mtx));
	KASSERT(bp->b_cflags & BC_BUSY);
	wapbl_jlock_assert(wl);

#if 0
	/*
	 * XXX this might be an issue for swapfiles.
	 * see uvm_swap.c:1725
	 *
	 * XXXdeux: see above
	 */
	KASSERT((bp->b_flags & BC_NOCACHE) == 0);
#endif
	KASSERT(bp->b_flags & B_LOCKED);

	WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
	    ("wapbl_remove_buf thread %d.%d removing buf %p with "
	    "%d bytes %d bcount\n",
	    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount));

	KASSERT(wl->wl_bufbytes >= bp->b_bufsize);
	wl->wl_bufbytes -= bp->b_bufsize;
	KASSERT(wl->wl_bcount >= bp->b_bcount);
	wl->wl_bcount -= bp->b_bcount;
	KASSERT(wl->wl_bufcount > 0);
	wl->wl_bufcount--;
	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
	LIST_REMOVE(bp, b_wapbllist);

	bp->b_flags &= ~B_LOCKED;
}

/* called from brelsel() in vfs_bio among other places */
void
wapbl_remove_buf(struct wapbl * wl, struct buf *bp)
{

	mutex_enter(&wl->wl_mtx);
	wapbl_remove_buf_locked(wl, bp);
	mutex_exit(&wl->wl_mtx);
}

void
wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt)
{

	KASSERT(bp->b_cflags & BC_BUSY);

	/*
	 * XXX: why does this depend on B_LOCKED?  otherwise the buf
	 * is not for a transaction?  if so, why is this called in the
	 * first place?
	 */
	if (bp->b_flags & B_LOCKED) {
		mutex_enter(&wl->wl_mtx);
		wl->wl_bufbytes += bp->b_bufsize - oldsz;
		wl->wl_bcount += bp->b_bcount - oldcnt;
		mutex_exit(&wl->wl_mtx);
	}
}

#endif /* _KERNEL */

/****************************************************************/
/* Some utility inlines */

/*
 * Advance the circular-log pointer old by delta, wrapping within the
 * region [off, off + size).
 */
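/*
 * For example: with size = 100 and off = 10 (valid offsets live in
 * [10, 110)), advancing old = 105 by delta = 10 wraps to
 * (105 + 10) - 100 = 15.  Advancing old = 0 (the empty log) by a
 * nonzero delta starts at off + delta.
 */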
static inline off_t
wapbl_advance(size_t size, size_t off, off_t old, size_t delta)
{
	off_t new;

	/* Define acceptable ranges for inputs. */
	KASSERT(delta <= size);
	KASSERT((old == 0) || (old >= off));
	KASSERT(old < (size + off));

	if ((old == 0) && (delta != 0))
		new = off + delta;
	else if ((old + delta) < (size + off))
		new = old + delta;
	else
		new = (old + delta) - size;

	/* Note some interesting axioms */
	KASSERT((delta != 0) || (new == old));
	KASSERT((delta == 0) || (new != 0));
	KASSERT((delta != (size)) || (new == old));

	/* Define acceptable ranges for output. */
	KASSERT((new == 0) || (new >= off));
	KASSERT(new < (size + off));
	return new;
}

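/*
 * Example of the used-space arithmetic below, with avail = 100,
 * tail = 80, head = 30 (the wl_circ_off bias on head and tail cancels
 * in the subtraction): ((30 + 99 - 80) % 100) + 1 = 50 bytes in use,
 * i.e. 20 bytes from 80 up to 100 plus 30 bytes from 0 up to 30.
 * A tail of 0 denotes the empty log, in which case head must be 0 too.
 */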
static inline size_t
wapbl_space_used(size_t avail, off_t head, off_t tail)
{

	if (tail == 0) {
		KASSERT(head == 0);
		return 0;
	}
	return ((head + (avail - 1) - tail) % avail) + 1;
}

static inline size_t
wapbl_space_free(size_t avail, off_t head, off_t tail)
{

	return avail - wapbl_space_used(avail, head, tail);
}

static inline void
wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp,
		   off_t *tailp)
{
	off_t head = *headp;
	off_t tail = *tailp;

	KASSERT(delta <= wapbl_space_free(size, head, tail));
	head = wapbl_advance(size, off, head, delta);
	if ((tail == 0) && (head != 0))
		tail = off;
	*headp = head;
	*tailp = tail;
}

static inline void
wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp,
		   off_t *tailp)
{
	off_t head = *headp;
	off_t tail = *tailp;

	KASSERT(delta <= wapbl_space_used(size, head, tail));
	tail = wapbl_advance(size, off, tail, delta);
	if (head == tail) {
		head = tail = 0;
	}
	*headp = head;
	*tailp = tail;
}

#ifdef _KERNEL

/****************************************************************/

/*
 * Remove transactions whose buffers are completely flushed to disk.
 * Will block until at least minfree space is available.
 * Only intended to be called from inside wapbl_flush and therefore
 * does not protect against commit races with itself or with flush.
 */
static int
wapbl_truncate(struct wapbl *wl, size_t minfree, int waitonly)
{
	size_t delta;
	size_t avail;
	off_t head;
	off_t tail;
	int error = 0;

	KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes));
	KASSERT(rw_write_held(&wl->wl_rwlock));

	mutex_enter(&wl->wl_mtx);

	/*
	 * First check to see if we have to do a commit
	 * at all.
	 */
	avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail);
	if (minfree < avail) {
		mutex_exit(&wl->wl_mtx);
		return 0;
	}
	minfree -= avail;
	while ((wl->wl_error_count == 0) &&
	    (wl->wl_reclaimable_bytes < minfree)) {
		WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
		    ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd "
		    "minfree=%zd\n",
		    &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes,
		    minfree));

		cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx);
	}
	if (wl->wl_reclaimable_bytes < minfree) {
		KASSERT(wl->wl_error_count);
		/* XXX maybe get actual error from buffer instead someday? */
		error = EIO;
	}
	head = wl->wl_head;
	tail = wl->wl_tail;
	delta = wl->wl_reclaimable_bytes;

	/*
	 * If all of the entries are flushed, then be sure to keep
	 * the reserved bytes reserved.  Watch out for discarded
	 * transactions, which could leave more bytes reserved than are
	 * reclaimable.
	 */
	if (SIMPLEQ_EMPTY(&wl->wl_entries) &&
	    (delta >= wl->wl_reserved_bytes)) {
		delta -= wl->wl_reserved_bytes;
	}
	wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head,
	    &tail);
	KDASSERT(wl->wl_reserved_bytes <=
		wapbl_space_used(wl->wl_circ_size, head, tail));
	mutex_exit(&wl->wl_mtx);

	if (error)
		return error;

	if (waitonly)
		return 0;

	/*
	 * This is where head, tail and delta are unprotected
	 * from races against itself or flush.  This is ok since
	 * we only call this routine from inside flush itself.
	 *
	 * XXX: how can it race against itself when accessed only
	 * from behind the write-locked rwlock?
	 */
	error = wapbl_write_commit(wl, head, tail);
	if (error)
		return error;

	wl->wl_head = head;
	wl->wl_tail = tail;

	mutex_enter(&wl->wl_mtx);
	KASSERT(wl->wl_reclaimable_bytes >= delta);
	wl->wl_reclaimable_bytes -= delta;
	mutex_exit(&wl->wl_mtx);
	WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
	    ("wapbl_truncate thread %d.%d truncating %zu bytes\n",
	    curproc->p_pid, curlwp->l_lid, delta));

	return 0;
}

/****************************************************************/

void
wapbl_biodone(struct buf *bp)
{
	struct wapbl_entry *we = bp->b_private;
	struct wapbl *wl = we->we_wapbl;

	/*
	 * Handle possible flushing of buffers after the log has been
	 * decommissioned.
	 */
	if (!wl) {
		KASSERT(we->we_bufcount > 0);
		we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
		KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize);
		we->we_unsynced_bufbytes -= bp->b_bufsize;
#endif

		if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
			KASSERT(we->we_unsynced_bufbytes == 0);
#endif
			wapbl_free(we, sizeof(*we));
		}

		brelse(bp, 0);
		return;
	}

#ifdef ohbother
	KDASSERT(bp->b_flags & B_DONE);
	KDASSERT(!(bp->b_flags & B_DELWRI));
	KDASSERT(bp->b_flags & B_ASYNC);
	KDASSERT(bp->b_flags & B_BUSY);
	KDASSERT(!(bp->b_flags & B_LOCKED));
	KDASSERT(!(bp->b_flags & B_READ));
	KDASSERT(!(bp->b_flags & B_INVAL));
	KDASSERT(!(bp->b_flags & B_NOCACHE));
#endif

	if (bp->b_error) {
#ifdef notyet /* Can't currently handle possible dirty buffer reuse */
		/*
		 * XXXpooka: interfaces not fully updated
		 * Note: this was not enabled in the original patch
		 * against netbsd4 either.  I don't know if comment
		 * above is true or not.
		 */

		/*
		 * If an error occurs, report the error and leave the
		 * buffer as a delayed write on the LRU queue.
		 * restarting the write would likely result in
		 * an error spinloop, so let it be done harmlessly
		 * by the syncer.
		 */
		bp->b_flags &= ~(B_DONE);
		simple_unlock(&bp->b_interlock);

		if (we->we_error == 0) {
			mutex_enter(&wl->wl_mtx);
			wl->wl_error_count++;
			mutex_exit(&wl->wl_mtx);
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
		we->we_error = bp->b_error;
		bp->b_error = 0;
		brelse(bp);
		return;
#else
		/* For now, just mark the log permanently errored out */

		mutex_enter(&wl->wl_mtx);
		if (wl->wl_error_count == 0) {
			wl->wl_error_count++;
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
		mutex_exit(&wl->wl_mtx);
#endif
	}

	mutex_enter(&wl->wl_mtx);

	KASSERT(we->we_bufcount > 0);
	we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
	KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize);
	we->we_unsynced_bufbytes -= bp->b_bufsize;
	KASSERT(wl->wl_unsynced_bufbytes >= bp->b_bufsize);
	wl->wl_unsynced_bufbytes -= bp->b_bufsize;
#endif

	/*
	 * If the current transaction can be reclaimed, start
	 * at the beginning and reclaim any consecutive reclaimable
	 * transactions.  If we successfully reclaim anything,
	 * then wakeup anyone waiting for the reclaim.
	 */
	if (we->we_bufcount == 0) {
		size_t delta = 0;
		int errcnt = 0;
#ifdef WAPBL_DEBUG_BUFBYTES
		KDASSERT(we->we_unsynced_bufbytes == 0);
#endif
		/*
		 * Clear any posted error, since the buffer it came from
		 * has been successfully flushed by now.
		 */
		while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) &&
		       (we->we_bufcount == 0)) {
			delta += we->we_reclaimable_bytes;
			if (we->we_error)
				errcnt++;
			SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
			wapbl_free(we, sizeof(*we));
		}

		if (delta) {
			wl->wl_reclaimable_bytes += delta;
			KASSERT(wl->wl_error_count >= errcnt);
			wl->wl_error_count -= errcnt;
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
	}

	mutex_exit(&wl->wl_mtx);
	brelse(bp, 0);
}

/*
 * Write transactions to disk + start I/O for contents
 */
int
wapbl_flush(struct wapbl *wl, int waitfor)
{
	struct buf *bp;
	struct wapbl_entry *we;
	off_t off;
	off_t head;
	off_t tail;
	size_t delta = 0;
	size_t flushsize;
	size_t reserved;
	int error = 0;

	/*
	 * Do a quick check to see if a full flush can be skipped.
	 * This assumes that the flush callback does not need to be called
	 * unless there are other outstanding bufs.
	 */
	if (!waitfor) {
		size_t nbufs;
		mutex_enter(&wl->wl_mtx);	/* XXX need mutex here to
						   protect the KASSERTS */
		nbufs = wl->wl_bufcount;
		KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
		KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
		mutex_exit(&wl->wl_mtx);
		if (nbufs == 0)
			return 0;
	}

	/*
	 * XXX we may consider using LK_UPGRADE here
	 * if we want to call flush from inside a transaction
	 */
	rw_enter(&wl->wl_rwlock, RW_WRITER);
	wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
	    wl->wl_dealloccnt);

	/*
	 * Now that we are fully locked and flushed,
	 * do another check for nothing to do.
	 */
	if (wl->wl_bufcount == 0) {
		goto out;
	}

#if 0
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush thread %d.%d flushing entries with "
	    "bufcount=%zu bufbytes=%zu\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes));
#endif

	/* Calculate amount of space needed to flush */
	flushsize = wapbl_transaction_len(wl);

	if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
		/*
		 * XXX this could be handled more gracefully, perhaps place
		 * only a partial transaction in the log and allow the
		 * remaining to flush without the protection of the journal.
		 */
		panic("wapbl_flush: current transaction too big to flush\n");
	}

	error = wapbl_truncate(wl, flushsize, 0);
	if (error)
		goto out2;

	off = wl->wl_head;
	KASSERT((off == 0) || ((off >= wl->wl_circ_off) &&
	        (off < wl->wl_circ_off + wl->wl_circ_size)));
	error = wapbl_write_blocks(wl, &off);
	if (error)
		goto out2;
	error = wapbl_write_revocations(wl, &off);
	if (error)
		goto out2;
	error = wapbl_write_inodes(wl, &off);
	if (error)
		goto out2;

	reserved = 0;
	if (wl->wl_inohashcnt)
		reserved = wapbl_transaction_inodes_len(wl);

	head = wl->wl_head;
	tail = wl->wl_tail;

	wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize,
	    &head, &tail);
#ifdef WAPBL_DEBUG
	if (head != off) {
		panic("lost head! head=%"PRIdMAX" tail=%" PRIdMAX
		      " off=%"PRIdMAX" flush=%zu\n",
		      (intmax_t)head, (intmax_t)tail, (intmax_t)off,
		      flushsize);
	}
#else
	KASSERT(head == off);
#endif

	/* Opportunistically move the tail forward if we can */
	if (!wapbl_lazy_truncate) {
		mutex_enter(&wl->wl_mtx);
		delta = wl->wl_reclaimable_bytes;
		mutex_exit(&wl->wl_mtx);
		wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta,
		    &head, &tail);
	}

	error = wapbl_write_commit(wl, head, tail);
	if (error)
		goto out2;

	we = wapbl_calloc(1, sizeof(*we));

#ifdef WAPBL_DEBUG_BUFBYTES
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
	    " unsynced=%zu"
	    "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
	    "inodes=%d\n",
	    curproc->p_pid, curlwp->l_lid, flushsize, delta,
	    wapbl_space_used(wl->wl_circ_size, head, tail),
	    wl->wl_unsynced_bufbytes, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt,
	    wl->wl_inohashcnt));
#else
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
	    "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
	    "inodes=%d\n",
	    curproc->p_pid, curlwp->l_lid, flushsize, delta,
	    wapbl_space_used(wl->wl_circ_size, head, tail),
	    wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
	    wl->wl_dealloccnt, wl->wl_inohashcnt));
#endif


	mutex_enter(&bufcache_lock);
	mutex_enter(&wl->wl_mtx);

	wl->wl_reserved_bytes = reserved;
	wl->wl_head = head;
	wl->wl_tail = tail;
	KASSERT(wl->wl_reclaimable_bytes >= delta);
	wl->wl_reclaimable_bytes -= delta;
	wl->wl_dealloccnt = 0;
#ifdef WAPBL_DEBUG_BUFBYTES
	wl->wl_unsynced_bufbytes += wl->wl_bufbytes;
#endif

	we->we_wapbl = wl;
	we->we_bufcount = wl->wl_bufcount;
#ifdef WAPBL_DEBUG_BUFBYTES
	we->we_unsynced_bufbytes = wl->wl_bufbytes;
#endif
	we->we_reclaimable_bytes = flushsize;
	we->we_error = 0;
	SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries);

	/*
	 * This flushes bufs in the reverse order from which they were
	 * queued.  It shouldn't matter, but if we care we could use a
	 * TAILQ instead.  XXX Note they will get put on the lru queue
	 * when they flush, so we might actually want to change this to
	 * preserve order.
	 */
	while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
		if (bbusy(bp, 0, 0, &wl->wl_mtx)) {
			continue;
		}
		bp->b_iodone = wapbl_biodone;
		bp->b_private = we;
		bremfree(bp);
		wapbl_remove_buf_locked(wl, bp);
		mutex_exit(&wl->wl_mtx);
		mutex_exit(&bufcache_lock);
		bawrite(bp);
		mutex_enter(&bufcache_lock);
		mutex_enter(&wl->wl_mtx);
	}
	mutex_exit(&wl->wl_mtx);
	mutex_exit(&bufcache_lock);

#if 0
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush thread %d.%d done flushing entries...\n",
	    curproc->p_pid, curlwp->l_lid));
#endif

out:

	/*
	 * If the waitfor flag is set, don't return until everything is
	 * fully flushed and the on disk log is empty.
	 */
	if (waitfor) {
		error = wapbl_truncate(wl, wl->wl_circ_size -
			wl->wl_reserved_bytes, wapbl_lazy_truncate);
	}

out2:
	if (error) {
		wl->wl_flush_abort(wl->wl_mount, wl->wl_deallocblks,
		    wl->wl_dealloclens, wl->wl_dealloccnt);
	}

#ifdef WAPBL_DEBUG_PRINT
	if (error) {
		pid_t pid = -1;
		lwpid_t lid = -1;
		if (curproc)
			pid = curproc->p_pid;
		if (curlwp)
			lid = curlwp->l_lid;
		mutex_enter(&wl->wl_mtx);
#ifdef WAPBL_DEBUG_BUFBYTES
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_flush: thread %d.%d aborted flush: "
		    "error = %d\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %d, reclaimable=%zu reserved=%zu "
		    "unsynced=%zu\n",
		    pid, lid, error, wl->wl_bufcount,
		    wl->wl_bufbytes, wl->wl_bcount,
		    wl->wl_dealloccnt, wl->wl_inohashcnt,
		    wl->wl_error_count, wl->wl_reclaimable_bytes,
		    wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_ERROR,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d, unsynced = %zu\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error, we->we_unsynced_bufbytes));
		}
#else
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_flush: thread %d.%d aborted flush: "
		    "error = %d\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %d, reclaimable=%zu reserved=%zu\n",
		    pid, lid, error, wl->wl_bufcount,
		    wl->wl_bufbytes, wl->wl_bcount,
		    wl->wl_dealloccnt, wl->wl_inohashcnt,
		    wl->wl_error_count, wl->wl_reclaimable_bytes,
		    wl->wl_reserved_bytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_ERROR,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d\n", we->we_bufcount,
			    we->we_reclaimable_bytes, we->we_error));
		}
#endif
		mutex_exit(&wl->wl_mtx);
	}
#endif

	rw_exit(&wl->wl_rwlock);
	return error;
}

/****************************************************************/

void
wapbl_jlock_assert(struct wapbl *wl)
{

	KASSERT(rw_lock_held(&wl->wl_rwlock));
}

void
wapbl_junlock_assert(struct wapbl *wl)
{

	KASSERT(!rw_write_held(&wl->wl_rwlock));
}

/****************************************************************/

/* locks missing */
void
wapbl_print(struct wapbl *wl,
	int full,
	void (*pr)(const char *, ...))
{
	struct buf *bp;
	struct wapbl_entry *we;
	(*pr)("wapbl %p", wl);
	(*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n",
	      wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn);
	(*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n",
	      wl->wl_circ_size, wl->wl_circ_off,
	      (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail);
	(*pr)("log_dev_bshift = %d, fs_dev_bshift = %d\n",
	      wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift);
#ifdef WAPBL_DEBUG_BUFBYTES
	(*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
	      "reserved = %zu errcnt = %d unsynced = %zu\n",
	      wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
	      wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
	      wl->wl_error_count, wl->wl_unsynced_bufbytes);
#else
	(*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
	      "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes,
	      wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
	      wl->wl_error_count);
#endif
	(*pr)("\tdealloccnt = %d, dealloclim = %d\n",
	      wl->wl_dealloccnt, wl->wl_dealloclim);
	(*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n",
	      wl->wl_inohashcnt, wl->wl_inohashmask);
	(*pr)("entries:\n");
	SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
#ifdef WAPBL_DEBUG_BUFBYTES
		(*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, "
		      "unsynced = %zu\n",
		      we->we_bufcount, we->we_reclaimable_bytes,
		      we->we_error, we->we_unsynced_bufbytes);
#else
		(*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n",
		      we->we_bufcount, we->we_reclaimable_bytes, we->we_error);
#endif
	}
	if (full) {
		int cnt = 0;
		(*pr)("bufs =");
		LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) {
			if (!LIST_NEXT(bp, b_wapbllist)) {
				(*pr)(" %p", bp);
			} else if ((++cnt % 6) == 0) {
				(*pr)(" %p,\n\t", bp);
			} else {
				(*pr)(" %p,", bp);
			}
		}
		(*pr)("\n");

		(*pr)("dealloced blks = ");
		{
			int i;
			cnt = 0;
			for (i = 0; i < wl->wl_dealloccnt; i++) {
				(*pr)(" %"PRId64":%d,",
				      wl->wl_deallocblks[i],
				      wl->wl_dealloclens[i]);
				if ((++cnt % 4) == 0) {
					(*pr)("\n\t");
				}
			}
		}
		(*pr)("\n");

		(*pr)("registered inodes = ");
		{
			int i;
			cnt = 0;
			for (i = 0; i <= wl->wl_inohashmask; i++) {
				struct wapbl_ino_head *wih;
				struct wapbl_ino *wi;

				wih = &wl->wl_inohash[i];
				LIST_FOREACH(wi, wih, wi_hash) {
					if (wi->wi_ino == 0)
						continue;
					(*pr)(" %"PRId32"/0%06"PRIo32",",
					      wi->wi_ino, wi->wi_mode);
					if ((++cnt % 4) == 0) {
						(*pr)("\n\t");
					}
				}
			}
			(*pr)("\n");
		}
	}
}

#if defined(WAPBL_DEBUG) || defined(DDB)
void
wapbl_dump(struct wapbl *wl)
{
#if defined(WAPBL_DEBUG)
	if (!wl)
		wl = wapbl_debug_wl;
#endif
	if (!wl)
		return;
	wapbl_print(wl, 1, printf);
}
#endif

/****************************************************************/

void
wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len)
{

	wapbl_jlock_assert(wl);

	/* XXX should eventually instead tie this into resource estimation */
	/*
	 * XXX this panic needs locking/mutex analysis and the
	 * ability to cope with the failure.
	 */
	/* XXX this XXX doesn't have enough XXX */
	if (__predict_false(wl->wl_dealloccnt >= wl->wl_dealloclim))
		panic("wapbl_register_deallocation: out of resources");

	wl->wl_deallocblks[wl->wl_dealloccnt] = blk;
	wl->wl_dealloclens[wl->wl_dealloccnt] = len;
	wl->wl_dealloccnt++;
	WAPBL_PRINTF(WAPBL_PRINT_ALLOC,
	    ("wapbl_register_deallocation: blk=%"PRId64" len=%d\n", blk, len));
}

/****************************************************************/

static void
wapbl_inodetrk_init(struct wapbl *wl, u_int size)
{

	wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask);
	if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) {
		pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0,
		    "wapblinopl", &pool_allocator_nointr, IPL_NONE);
	}
}

static void
wapbl_inodetrk_free(struct wapbl *wl)
{

	/* XXX this KASSERT needs locking/mutex analysis */
	KASSERT(wl->wl_inohashcnt == 0);
	hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask);
	if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) {
		pool_destroy(&wapbl_ino_pool);
	}
}

static struct wapbl_ino *
wapbl_inodetrk_get(struct wapbl *wl, ino_t ino)
{
	struct wapbl_ino_head *wih;
	struct wapbl_ino *wi;

	KASSERT(mutex_owned(&wl->wl_mtx));

	wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
	LIST_FOREACH(wi, wih, wi_hash) {
		if (ino == wi->wi_ino)
			return wi;
	}
	return 0;
}

void
wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode)
{
	struct wapbl_ino_head *wih;
	struct wapbl_ino *wi;

	wi = pool_get(&wapbl_ino_pool, PR_WAITOK);

	mutex_enter(&wl->wl_mtx);
	if (wapbl_inodetrk_get(wl, ino) == NULL) {
		wi->wi_ino = ino;
		wi->wi_mode = mode;
		wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
		LIST_INSERT_HEAD(wih, wi, wi_hash);
		wl->wl_inohashcnt++;
		WAPBL_PRINTF(WAPBL_PRINT_INODE,
		    ("wapbl_register_inode: ino=%"PRId64"\n", ino));
		mutex_exit(&wl->wl_mtx);
	} else {
		mutex_exit(&wl->wl_mtx);
		pool_put(&wapbl_ino_pool, wi);
	}
}

void
wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode)
{
	struct wapbl_ino *wi;

	mutex_enter(&wl->wl_mtx);
	wi = wapbl_inodetrk_get(wl, ino);
	if (wi) {
		WAPBL_PRINTF(WAPBL_PRINT_INODE,
		    ("wapbl_unregister_inode: ino=%"PRId64"\n", ino));
		KASSERT(wl->wl_inohashcnt > 0);
		wl->wl_inohashcnt--;
		LIST_REMOVE(wi, wi_hash);
		mutex_exit(&wl->wl_mtx);

		pool_put(&wapbl_ino_pool, wi);
	} else {
		mutex_exit(&wl->wl_mtx);
	}
}

/****************************************************************/

static inline size_t
wapbl_transaction_inodes_len(struct wapbl *wl)
{
	int blocklen = 1<<wl->wl_log_dev_bshift;
	int iph;

	/* Calculate number of inodes described in an inodelist header */
	iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
	    sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);

	KASSERT(iph > 0);

	return MAX(1, howmany(wl->wl_inohashcnt, iph))*blocklen;
}
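
/*
 * Worked example of the above (hypothetical structure sizes): with a
 * 512-byte log block, a 16-byte wapbl_wc_inodelist header, and 8-byte
 * wc_inodes[] entries, iph = (512 - 16) / 8 = 62, so tracking 100
 * inodes takes howmany(100, 62) = 2 blocks, i.e. 1024 bytes.
 */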


/* Calculate amount of space a transaction will take on disk */
static size_t
wapbl_transaction_len(struct wapbl *wl)
{
	int blocklen = 1<<wl->wl_log_dev_bshift;
	size_t len;
	int bph;

	/* Calculate number of blocks described in a blocklist header */
	bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
	    sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);

	KASSERT(bph > 0);

	len = wl->wl_bcount;
	len += howmany(wl->wl_bufcount, bph)*blocklen;
	len += howmany(wl->wl_dealloccnt, bph)*blocklen;
	len += wapbl_transaction_inodes_len(wl);

	return len;
}

/*
 * Perform the commit operation.
 *
 * Note that the generation number increment needs to
 * be protected against racing with other invocations
 * of wapbl_write_commit.  This is ok since this routine
 * is only invoked from wapbl_flush.
 */
static int
wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail)
{
	struct wapbl_wc_header *wc = wl->wl_wc_header;
	struct timespec ts;
	int error;
	int force = 1;
	daddr_t pbn;

	/* XXX Calc checksum here, instead we do this for now */
	error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, FWRITE, FSCRED);
	if (error) {
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%"PRIx64
		    " returned %d\n", wl->wl_devvp->v_rdev, error));
	}

	wc->wc_head = head;
	wc->wc_tail = tail;
	wc->wc_checksum = 0;
	wc->wc_version = 1;
	getnanotime(&ts);
	wc->wc_time = ts.tv_sec;
	wc->wc_timensec = ts.tv_nsec;

	WAPBL_PRINTF(WAPBL_PRINT_WRITE,
	    ("wapbl_write_commit: head = %"PRIdMAX" tail = %"PRIdMAX"\n",
	    (intmax_t)head, (intmax_t)tail));

	/*
	 * XXX if the generation is about to roll over, then first zero
	 * the second commit header before trying to write both headers.
	 */

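	/*
	 * The two log device blocks reserved at wl_circ_off hold the two
	 * commit header slots: even generations go to the first block and
	 * odd generations to the second, so a failed header write can
	 * clobber at most the older of the two headers.
	 */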
1859 pbn = wl->wl_logpbn + (wc->wc_generation % 2);
1860 #ifdef _KERNEL
1861 pbn = btodb(pbn << wc->wc_log_dev_bshift);
1862 #endif
1863 error = wapbl_write(wc, wc->wc_len, wl->wl_devvp, pbn);
1864 if (error)
1865 return error;
1866
1867 error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, FWRITE, FSCRED);
1868 if (error) {
1869 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1870 ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%"PRIx64
1871 " returned %d\n", wl->wl_devvp->v_rdev, error));
1872 }
1873
1874 /*
1875 * If the generation number was zero, write it out a second time.
1876 * This handles initialization and generation number rollover
1877 */
1878 if (wc->wc_generation++ == 0) {
1879 error = wapbl_write_commit(wl, head, tail);
1880 /*
1881 * This panic should be able to be removed if we do the
1882 * zero'ing mentioned above, and we are certain to roll
1883 * back generation number on failure.
1884 */
1885 if (error)
1886 panic("wapbl_write_commit: error writing duplicate "
1887 "log header: %d\n", error);
1888 }
1889 return 0;
1890 }
1891
1892 /* Returns new offset value */
1893 static int
1894 wapbl_write_blocks(struct wapbl *wl, off_t *offp)
1895 {
1896 struct wapbl_wc_blocklist *wc =
1897 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
1898 int blocklen = 1<<wl->wl_log_dev_bshift;
1899 int bph;
1900 struct buf *bp;
1901 off_t off = *offp;
1902 int error;
1903 size_t padding;
1904
1905 KASSERT(rw_write_held(&wl->wl_rwlock));
1906
1907 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
1908 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
1909
1910 bp = LIST_FIRST(&wl->wl_bufs);
1911
1912 while (bp) {
1913 int cnt;
1914 struct buf *obp = bp;
1915
1916 KASSERT(bp->b_flags & B_LOCKED);
1917
1918 wc->wc_type = WAPBL_WC_BLOCKS;
1919 wc->wc_len = blocklen;
1920 wc->wc_blkcount = 0;
1921 while (bp && (wc->wc_blkcount < bph)) {
			/*
			 * Make sure all the physical block numbers are up to
			 * date. If this is not always true on a given
			 * filesystem, then VOP_BMAP must be called. We
			 * could call VOP_BMAP here, or else in the
			 * filesystem-specific flush callback, although
			 * neither of those solutions allows us to take the
			 * vnode lock. If a filesystem requires the vnode
			 * lock to be held when calling VOP_BMAP, then we
			 * can probably do that in bwrite, where the vnode
			 * lock should already be held by the invoking code.
			 */
1934 KASSERT((bp->b_vp->v_type == VBLK) ||
1935 (bp->b_blkno != bp->b_lblkno));
1936 KASSERT(bp->b_blkno > 0);
1937
1938 wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno;
1939 wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount;
1940 wc->wc_len += bp->b_bcount;
1941 wc->wc_blkcount++;
1942 bp = LIST_NEXT(bp, b_wapbllist);
1943 }
1944 if (wc->wc_len % blocklen != 0) {
1945 padding = blocklen - wc->wc_len % blocklen;
1946 wc->wc_len += padding;
1947 } else {
1948 padding = 0;
1949 }
1950
1951 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
1952 ("wapbl_write_blocks: len = %u (padding %zu) off = %"PRIdMAX"\n",
1953 wc->wc_len, padding, (intmax_t)off));
1954
1955 error = wapbl_circ_write(wl, wc, blocklen, &off);
1956 if (error)
1957 return error;
1958 bp = obp;
1959 cnt = 0;
1960 while (bp && (cnt++ < bph)) {
1961 error = wapbl_circ_write(wl, bp->b_data,
1962 bp->b_bcount, &off);
1963 if (error)
1964 return error;
1965 bp = LIST_NEXT(bp, b_wapbllist);
1966 }
1967 if (padding) {
1968 void *zero;
1969
1970 zero = wapbl_malloc(padding);
1971 memset(zero, 0, padding);
1972 error = wapbl_circ_write(wl, zero, padding, &off);
1973 wapbl_free(zero, padding);
1974 if (error)
1975 return error;
1976 }
1977 }
1978 *offp = off;
1979 return 0;
1980 }
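
/*
 * On-disk record layout produced by the loop above (descriptive):
 * each iteration emits one blocklist header block followed by the raw
 * payloads of the buffers it describes, padded out to a blocklen
 * boundary:
 *
 *	[ WAPBL_WC_BLOCKS header | buf 0 data | ... | buf N data | pad ]
 *
 * wc_len counts the header block plus the padded payload, which is
 * what allows replay to step over an entire record with a single
 * wapbl_circ_advance(wr, wcn->wc_len, ...) in wapbl_replay_process().
 */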
1981
1982 static int
1983 wapbl_write_revocations(struct wapbl *wl, off_t *offp)
1984 {
1985 struct wapbl_wc_blocklist *wc =
1986 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
1987 int i;
1988 int blocklen = 1<<wl->wl_log_dev_bshift;
1989 int bph;
1990 off_t off = *offp;
1991 int error;
1992
1993 if (wl->wl_dealloccnt == 0)
1994 return 0;
1995
1996 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
1997 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
1998
1999 i = 0;
2000 while (i < wl->wl_dealloccnt) {
2001 wc->wc_type = WAPBL_WC_REVOCATIONS;
2002 wc->wc_len = blocklen;
2003 wc->wc_blkcount = 0;
2004 while ((i < wl->wl_dealloccnt) && (wc->wc_blkcount < bph)) {
2005 wc->wc_blocks[wc->wc_blkcount].wc_daddr =
2006 wl->wl_deallocblks[i];
2007 wc->wc_blocks[wc->wc_blkcount].wc_dlen =
2008 wl->wl_dealloclens[i];
2009 wc->wc_blkcount++;
2010 i++;
2011 }
2012 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2013 ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n",
2014 wc->wc_len, (intmax_t)off));
2015 error = wapbl_circ_write(wl, wc, blocklen, &off);
2016 if (error)
2017 return error;
2018 }
2019 *offp = off;
2020 return 0;
2021 }
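
/*
 * During replay these records cause the matching blocks to be removed
 * from the block hash (see wapbl_replay_process_revocations()), so a
 * logged data write to a block that was subsequently deallocated is
 * not replayed over the block's later contents.
 */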
2022
2023 static int
2024 wapbl_write_inodes(struct wapbl *wl, off_t *offp)
2025 {
2026 struct wapbl_wc_inodelist *wc =
2027 (struct wapbl_wc_inodelist *)wl->wl_wc_scratch;
2028 int i;
2029 int blocklen = 1 << wl->wl_log_dev_bshift;
2030 off_t off = *offp;
2031 int error;
2032
2033 struct wapbl_ino_head *wih;
2034 struct wapbl_ino *wi;
2035 int iph;
2036
2037 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
2038 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
2039
2040 i = 0;
2041 wih = &wl->wl_inohash[0];
	wi = NULL;
2043 do {
2044 wc->wc_type = WAPBL_WC_INODES;
2045 wc->wc_len = blocklen;
2046 wc->wc_inocnt = 0;
2047 wc->wc_clear = (i == 0);
2048 while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) {
2049 while (!wi) {
2050 KASSERT((wih - &wl->wl_inohash[0])
2051 <= wl->wl_inohashmask);
2052 wi = LIST_FIRST(wih++);
2053 }
2054 wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino;
2055 wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode;
2056 wc->wc_inocnt++;
2057 i++;
2058 wi = LIST_NEXT(wi, wi_hash);
2059 }
2060 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2061 ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n",
2062 wc->wc_len, (intmax_t)off));
2063 error = wapbl_circ_write(wl, wc, blocklen, &off);
2064 if (error)
2065 return error;
2066 } while (i < wl->wl_inohashcnt);
2067
2068 *offp = off;
2069 return 0;
2070 }
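
/*
 * Note that the first record written above sets wc_clear, and replay
 * (wapbl_replay_process_inodes()) discards any previously accumulated
 * inode list when it sees that flag, so each dump is a complete
 * snapshot of the in-progress inodes rather than an incremental delta.
 */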
2071
2072 #endif /* _KERNEL */
2073
2074 /****************************************************************/
2075
2076 struct wapbl_blk {
2077 LIST_ENTRY(wapbl_blk) wb_hash;
2078 daddr_t wb_blk;
2079 off_t wb_off; /* Offset of this block in the log */
2080 };
2081 #define WAPBL_BLKPOOL_MIN 83
2082
2083 static void
2084 wapbl_blkhash_init(struct wapbl_replay *wr, u_int size)
2085 {
2086 if (size < WAPBL_BLKPOOL_MIN)
2087 size = WAPBL_BLKPOOL_MIN;
	KASSERT(wr->wr_blkhash == NULL);
2089 #ifdef _KERNEL
2090 wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask);
2091 #else /* ! _KERNEL */
2092 /* Manually implement hashinit */
2093 {
2094 unsigned long i, hashsize;
2095 for (hashsize = 1; hashsize < size; hashsize <<= 1)
2096 continue;
2097 wr->wr_blkhash = wapbl_malloc(hashsize * sizeof(*wr->wr_blkhash));
2098 for (i = 0; i < hashsize; i++)
2099 LIST_INIT(&wr->wr_blkhash[i]);
2100 wr->wr_blkhashmask = hashsize - 1;
2101 }
2102 #endif /* ! _KERNEL */
2103 }
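
/*
 * Both the kernel and userland paths size the table to a power of two
 * so that the (blk & wr_blkhashmask) computation in wapbl_blkhash_get()
 * below yields a valid bucket index; e.g. a requested size of 100
 * results in 128 buckets and a mask of 127.
 */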
2104
2105 static void
2106 wapbl_blkhash_free(struct wapbl_replay *wr)
2107 {
2108 KASSERT(wr->wr_blkhashcnt == 0);
2109 #ifdef _KERNEL
2110 hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask);
2111 #else /* ! _KERNEL */
2112 wapbl_free(wr->wr_blkhash,
2113 (wr->wr_blkhashmask + 1) * sizeof(*wr->wr_blkhash));
2114 #endif /* ! _KERNEL */
2115 }
2116
2117 static struct wapbl_blk *
2118 wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk)
2119 {
2120 struct wapbl_blk_head *wbh;
2121 struct wapbl_blk *wb;
2122 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2123 LIST_FOREACH(wb, wbh, wb_hash) {
2124 if (blk == wb->wb_blk)
2125 return wb;
2126 }
	return NULL;
2128 }
2129
2130 static void
2131 wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off)
2132 {
2133 struct wapbl_blk_head *wbh;
2134 struct wapbl_blk *wb;
2135 wb = wapbl_blkhash_get(wr, blk);
2136 if (wb) {
2137 KASSERT(wb->wb_blk == blk);
2138 wb->wb_off = off;
2139 } else {
2140 wb = wapbl_malloc(sizeof(*wb));
2141 wb->wb_blk = blk;
2142 wb->wb_off = off;
2143 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2144 LIST_INSERT_HEAD(wbh, wb, wb_hash);
2145 wr->wr_blkhashcnt++;
2146 }
2147 }
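
/*
 * Insertion is "last write wins": re-inserting an existing block only
 * updates wb_off to the newer log offset, e.g. (sketch):
 *
 *	wapbl_blkhash_ins(wr, blk, off1);
 *	wapbl_blkhash_ins(wr, blk, off2);
 *	KASSERT(wapbl_blkhash_get(wr, blk)->wb_off == off2);
 *
 * so scanning the log from tail to head during replay naturally leaves
 * the hash pointing at the most recent logged copy of every block.
 */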
2148
2149 static void
2150 wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk)
2151 {
2152 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2153 if (wb) {
2154 KASSERT(wr->wr_blkhashcnt > 0);
2155 wr->wr_blkhashcnt--;
2156 LIST_REMOVE(wb, wb_hash);
2157 wapbl_free(wb, sizeof(*wb));
2158 }
2159 }
2160
2161 static void
2162 wapbl_blkhash_clear(struct wapbl_replay *wr)
2163 {
2164 unsigned long i;
2165 for (i = 0; i <= wr->wr_blkhashmask; i++) {
2166 struct wapbl_blk *wb;
2167
2168 while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) {
2169 KASSERT(wr->wr_blkhashcnt > 0);
2170 wr->wr_blkhashcnt--;
2171 LIST_REMOVE(wb, wb_hash);
2172 wapbl_free(wb, sizeof(*wb));
2173 }
2174 }
2175 KASSERT(wr->wr_blkhashcnt == 0);
2176 }
2177
2178 /****************************************************************/
2179
2180 static int
2181 wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp)
2182 {
2183 size_t slen;
2184 off_t off = *offp;
2185 int error;
2186 daddr_t pbn;
2187
2188 KASSERT(((len >> wr->wr_log_dev_bshift) <<
2189 wr->wr_log_dev_bshift) == len);
2190
2191 if (off < wr->wr_circ_off)
2192 off = wr->wr_circ_off;
2193 slen = wr->wr_circ_off + wr->wr_circ_size - off;
2194 if (slen < len) {
2195 pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
2196 #ifdef _KERNEL
2197 pbn = btodb(pbn << wr->wr_log_dev_bshift);
2198 #endif
2199 error = wapbl_read(data, slen, wr->wr_devvp, pbn);
2200 if (error)
2201 return error;
2202 data = (uint8_t *)data + slen;
2203 len -= slen;
2204 off = wr->wr_circ_off;
2205 }
2206 pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
2207 #ifdef _KERNEL
2208 pbn = btodb(pbn << wr->wr_log_dev_bshift);
2209 #endif
2210 error = wapbl_read(data, len, wr->wr_devvp, pbn);
2211 if (error)
2212 return error;
2213 off += len;
2214 if (off >= wr->wr_circ_off + wr->wr_circ_size)
2215 off = wr->wr_circ_off;
2216 *offp = off;
2217 return 0;
2218 }
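
/*
 * Wrap-around example with made-up numbers: given wr_circ_off = 1024
 * and wr_circ_size = 8192, a read of len = 1024 starting at off = 8704
 * finds only slen = 1024 + 8192 - 8704 = 512 bytes before the end of
 * the circular area, so 512 bytes are read there, the remaining 512
 * bytes are read starting at wr_circ_off, and *offp comes back as
 * 1024 + 512 = 1536.
 */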
2219
2220 static void
2221 wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp)
2222 {
2223 size_t slen;
2224 off_t off = *offp;
2225
2226 KASSERT(((len >> wr->wr_log_dev_bshift) <<
2227 wr->wr_log_dev_bshift) == len);
2228
2229 if (off < wr->wr_circ_off)
2230 off = wr->wr_circ_off;
2231 slen = wr->wr_circ_off + wr->wr_circ_size - off;
2232 if (slen < len) {
2233 len -= slen;
2234 off = wr->wr_circ_off;
2235 }
2236 off += len;
2237 if (off >= wr->wr_circ_off + wr->wr_circ_size)
2238 off = wr->wr_circ_off;
2239 *offp = off;
2240 }
2241
2242 /****************************************************************/
2243
2244 int
2245 wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp,
2246 daddr_t off, size_t count, size_t blksize)
2247 {
2248 struct wapbl_replay *wr;
2249 int error;
2250 struct vnode *devvp;
2251 daddr_t logpbn;
2252 uint8_t *scratch;
2253 struct wapbl_wc_header *wch;
2254 struct wapbl_wc_header *wch2;
2255 /* Use this until we read the actual log header */
2256 int log_dev_bshift = ilog2(blksize);
2257 size_t used;
2258 daddr_t pbn;
2259
2260 WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
2261 ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n",
2262 vp, off, count, blksize));
2263
2264 if (off < 0)
2265 return EINVAL;
2266
2267 if (blksize < DEV_BSIZE)
2268 return EINVAL;
2269 if (blksize % DEV_BSIZE)
2270 return EINVAL;
2271
2272 #ifdef _KERNEL
2273 #if 0
2274 /* XXX vp->v_size isn't reliably set for VBLK devices,
2275 * especially root. However, we might still want to verify
2276 * that the full load is readable */
2277 if ((off + count) * blksize > vp->v_size)
2278 return EINVAL;
2279 #endif
2280 if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) {
2281 return error;
2282 }
2283 #else /* ! _KERNEL */
2284 devvp = vp;
2285 logpbn = off;
2286 #endif /* ! _KERNEL */
2287
2288 scratch = wapbl_malloc(MAXBSIZE);
2289
2290 pbn = logpbn;
2291 #ifdef _KERNEL
2292 pbn = btodb(pbn << log_dev_bshift);
2293 #endif
2294 error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, pbn);
2295 if (error)
2296 goto errout;
2297
2298 wch = (struct wapbl_wc_header *)scratch;
2299 wch2 =
2300 (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift));
2301 /* XXX verify checksums and magic numbers */
2302 if (wch->wc_type != WAPBL_WC_HEADER) {
2303 printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type);
2304 error = EFTYPE;
2305 goto errout;
2306 }
2307
2308 if (wch2->wc_generation > wch->wc_generation)
2309 wch = wch2;
2310
2311 wr = wapbl_calloc(1, sizeof(*wr));
2312
2313 wr->wr_logvp = vp;
2314 wr->wr_devvp = devvp;
2315 wr->wr_logpbn = logpbn;
2316
2317 wr->wr_scratch = scratch;
2318
2319 wr->wr_log_dev_bshift = wch->wc_log_dev_bshift;
2320 wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift;
2321 wr->wr_circ_off = wch->wc_circ_off;
2322 wr->wr_circ_size = wch->wc_circ_size;
2323 wr->wr_generation = wch->wc_generation;
2324
2325 used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail);
2326
2327 WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
2328 ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64
2329 " len=%"PRId64" used=%zu\n",
2330 wch->wc_head, wch->wc_tail, wch->wc_circ_off,
2331 wch->wc_circ_size, used));
2332
2333 wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift));
2334
2335 error = wapbl_replay_process(wr, wch->wc_head, wch->wc_tail);
2336 if (error) {
2337 wapbl_replay_stop(wr);
2338 wapbl_replay_free(wr);
2339 return error;
2340 }
2341
2342 *wrp = wr;
2343 return 0;
2344
2345 errout:
2346 wapbl_free(scratch, MAXBSIZE);
2347 return error;
2348 }
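
#if 0
/*
 * Usage sketch (hypothetical caller, not part of this file): a
 * filesystem mount path might drive the replay interface roughly as
 * follows, with devvp, logstart, logcount and secsize taken from its
 * superblock. Error handling is abbreviated.
 */
static int
example_replay(struct vnode *devvp, daddr_t logstart, size_t logcount,
    size_t secsize)
{
	struct wapbl_replay *wr;
	int error;

	error = wapbl_replay_start(&wr, devvp, logstart, logcount, secsize);
	if (error)
		return error;
	/* Write every logged block back to its home location. */
	error = wapbl_replay_write(wr, devvp);
	wapbl_replay_stop(wr);
	wapbl_replay_free(wr);
	return error;
}
#endif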
2349
2350 void
2351 wapbl_replay_stop(struct wapbl_replay *wr)
2352 {
2353
2354 if (!wapbl_replay_isopen(wr))
2355 return;
2356
2357 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n"));
2358
2359 wapbl_free(wr->wr_scratch, MAXBSIZE);
2360 wr->wr_scratch = NULL;
2361
2362 wr->wr_logvp = NULL;
2363
2364 wapbl_blkhash_clear(wr);
2365 wapbl_blkhash_free(wr);
2366 }
2367
2368 void
2369 wapbl_replay_free(struct wapbl_replay *wr)
2370 {
2371
2372 KDASSERT(!wapbl_replay_isopen(wr));
2373
2374 if (wr->wr_inodes)
2375 wapbl_free(wr->wr_inodes,
2376 wr->wr_inodescnt * sizeof(wr->wr_inodes[0]));
2377 wapbl_free(wr, sizeof(*wr));
2378 }
2379
2380 #ifdef _KERNEL
2381 int
2382 wapbl_replay_isopen1(struct wapbl_replay *wr)
2383 {
2384
2385 return wapbl_replay_isopen(wr);
2386 }
2387 #endif
2388
2389 static void
2390 wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp)
2391 {
2392 struct wapbl_wc_blocklist *wc =
2393 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2394 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2395 int i, j, n;
2396
2397 for (i = 0; i < wc->wc_blkcount; i++) {
2398 /*
2399 * Enter each physical block into the hashtable independently.
2400 */
2401 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
2402 for (j = 0; j < n; j++) {
2403 wapbl_blkhash_ins(wr, wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen),
2404 *offp);
2405 wapbl_circ_advance(wr, fsblklen, offp);
2406 }
2407 }
2408 }
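
/*
 * The btodb() in the key reflects that wc_daddr is expressed in
 * DEV_BSIZE units: the j-th fsblklen-sized piece of an entry is
 * entered at wc_daddr + btodb(j * fsblklen). Lookups, e.g. in
 * wapbl_replay_process_revocations(), must compute keys with the
 * same granularity for hits to occur.
 */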
2409
2410 static void
2411 wapbl_replay_process_revocations(struct wapbl_replay *wr)
2412 {
2413 struct wapbl_wc_blocklist *wc =
2414 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2415 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2416 int i, j, n;
2417
2418 for (i = 0; i < wc->wc_blkcount; i++) {
2419 /*
2420 * Remove any blocks found from the hashtable.
2421 */
2422 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
2423 for (j = 0; j < n; j++)
2424 wapbl_blkhash_rem(wr, wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
2425 }
2426 }
2427
2428 static void
2429 wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff, off_t newoff)
2430 {
2431 struct wapbl_wc_inodelist *wc =
2432 (struct wapbl_wc_inodelist *)wr->wr_scratch;
2433 void *new_inodes;
2434 const size_t oldsize = wr->wr_inodescnt * sizeof(wr->wr_inodes[0]);
2435
2436 KASSERT(sizeof(wr->wr_inodes[0]) == sizeof(wc->wc_inodes[0]));
2437
	/*
	 * Keep track of where we found this so that the location won't
	 * be overwritten.
	 */
2442 if (wc->wc_clear) {
2443 wr->wr_inodestail = oldoff;
2444 wr->wr_inodescnt = 0;
2445 if (wr->wr_inodes != NULL) {
2446 wapbl_free(wr->wr_inodes, oldsize);
2447 wr->wr_inodes = NULL;
2448 }
2449 }
2450 wr->wr_inodeshead = newoff;
2451 if (wc->wc_inocnt == 0)
2452 return;
2453
2454 new_inodes = wapbl_malloc((wr->wr_inodescnt + wc->wc_inocnt) *
2455 sizeof(wr->wr_inodes[0]));
2456 if (wr->wr_inodes != NULL) {
2457 memcpy(new_inodes, wr->wr_inodes, oldsize);
2458 wapbl_free(wr->wr_inodes, oldsize);
2459 }
2460 wr->wr_inodes = new_inodes;
2461 memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes,
2462 wc->wc_inocnt * sizeof(wr->wr_inodes[0]));
2463 wr->wr_inodescnt += wc->wc_inocnt;
2464 }
2465
2466 static int
2467 wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail)
2468 {
2469 off_t off;
2470 int error;
2471
2472 int logblklen = 1 << wr->wr_log_dev_bshift;
2473
2474 wapbl_blkhash_clear(wr);
2475
2476 off = tail;
2477 while (off != head) {
2478 struct wapbl_wc_null *wcn;
2479 off_t saveoff = off;
2480 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
2481 if (error)
2482 goto errout;
2483 wcn = (struct wapbl_wc_null *)wr->wr_scratch;
2484 switch (wcn->wc_type) {
2485 case WAPBL_WC_BLOCKS:
2486 wapbl_replay_process_blocks(wr, &off);
2487 break;
2488
2489 case WAPBL_WC_REVOCATIONS:
2490 wapbl_replay_process_revocations(wr);
2491 break;
2492
2493 case WAPBL_WC_INODES:
2494 wapbl_replay_process_inodes(wr, saveoff, off);
2495 break;
2496
2497 default:
2498 printf("Unrecognized wapbl type: 0x%08x\n",
2499 wcn->wc_type);
2500 error = EFTYPE;
2501 goto errout;
2502 }
2503 wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
2504 if (off != saveoff) {
2505 printf("wapbl_replay: corrupted records\n");
2506 error = EFTYPE;
2507 goto errout;
2508 }
2509 }
2510 return 0;
2511
2512 errout:
2513 wapbl_blkhash_clear(wr);
2514 return error;
2515 }
2516
2517 #if 0
2518 int
2519 wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp)
2520 {
2521 off_t off;
2522 int mismatchcnt = 0;
2523 int logblklen = 1 << wr->wr_log_dev_bshift;
2524 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2525 void *scratch1 = wapbl_malloc(MAXBSIZE);
2526 void *scratch2 = wapbl_malloc(MAXBSIZE);
2527 int error = 0;
2528
2529 KDASSERT(wapbl_replay_isopen(wr));
2530
2531 off = wch->wc_tail;
2532 while (off != wch->wc_head) {
2533 struct wapbl_wc_null *wcn;
2534 #ifdef DEBUG
2535 off_t saveoff = off;
2536 #endif
2537 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
2538 if (error)
2539 goto out;
2540 wcn = (struct wapbl_wc_null *)wr->wr_scratch;
2541 switch (wcn->wc_type) {
2542 case WAPBL_WC_BLOCKS:
2543 {
2544 struct wapbl_wc_blocklist *wc =
2545 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2546 int i;
2547 for (i = 0; i < wc->wc_blkcount; i++) {
2548 int foundcnt = 0;
2549 int dirtycnt = 0;
2550 int j, n;
				/*
				 * Check each physical block against the
				 * hashtable independently.
				 */
2555 n = wc->wc_blocks[i].wc_dlen >>
2556 wch->wc_fs_dev_bshift;
2557 for (j = 0; j < n; j++) {
2558 struct wapbl_blk *wb =
2559 wapbl_blkhash_get(wr,
2560 wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
2561 if (wb && (wb->wb_off == off)) {
2562 foundcnt++;
2563 error =
2564 wapbl_circ_read(wr,
2565 scratch1, fsblklen,
2566 &off);
2567 if (error)
2568 goto out;
2569 error =
2570 wapbl_read(scratch2,
2571 fsblklen, fsdevvp,
2572 wb->wb_blk);
2573 if (error)
2574 goto out;
2575 if (memcmp(scratch1,
2576 scratch2,
2577 fsblklen)) {
2578 printf(
2579 "wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n",
2580 wb->wb_blk, (intmax_t)off);
2581 dirtycnt++;
2582 mismatchcnt++;
2583 }
2584 } else {
2585 wapbl_circ_advance(wr,
2586 fsblklen, &off);
2587 }
2588 }
2589 #if 0
2590 /*
2591 * If all of the blocks in an entry
2592 * are clean, then remove all of its
2593 * blocks from the hashtable since they
2594 * never will need replay.
2595 */
2596 if ((foundcnt != 0) &&
2597 (dirtycnt == 0)) {
2598 off = saveoff;
2599 wapbl_circ_advance(wr,
2600 logblklen, &off);
2601 for (j = 0; j < n; j++) {
2602 struct wapbl_blk *wb =
2603 wapbl_blkhash_get(wr,
2604 wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen));
2605 if (wb &&
2606 (wb->wb_off == off)) {
2607 wapbl_blkhash_rem(wr, wb->wb_blk);
2608 }
2609 wapbl_circ_advance(wr,
2610 fsblklen, &off);
2611 }
2612 }
2613 #endif
2614 }
2615 }
2616 break;
2617 case WAPBL_WC_REVOCATIONS:
2618 case WAPBL_WC_INODES:
2619 break;
2620 default:
2621 KASSERT(0);
2622 }
2623 #ifdef DEBUG
2624 wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
2625 KASSERT(off == saveoff);
2626 #endif
2627 }
2628 out:
2629 wapbl_free(scratch1, MAXBSIZE);
2630 wapbl_free(scratch2, MAXBSIZE);
2631 if (!error && mismatchcnt)
2632 error = EFTYPE;
2633 return error;
2634 }
2635 #endif
2636
2637 int
2638 wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp)
2639 {
2640 struct wapbl_blk *wb;
2641 size_t i;
2642 off_t off;
2643 void *scratch;
2644 int error = 0;
2645 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2646
2647 KDASSERT(wapbl_replay_isopen(wr));
2648
2649 scratch = wapbl_malloc(MAXBSIZE);
2650
2651 for (i = 0; i <= wr->wr_blkhashmask; ++i) {
2652 LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) {
2653 off = wb->wb_off;
2654 error = wapbl_circ_read(wr, scratch, fsblklen, &off);
2655 if (error)
2656 break;
2657 error = wapbl_write(scratch, fsblklen, fsdevvp,
2658 wb->wb_blk);
2659 if (error)
2660 break;
2661 }
2662 }
2663
2664 wapbl_free(scratch, MAXBSIZE);
2665 return error;
2666 }
2667
2668 int
2669 wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len)
2670 {
2671 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2672
2673 KDASSERT(wapbl_replay_isopen(wr));
2674 KASSERT((len % fsblklen) == 0);
2675
	while (len != 0) {
		struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
		if (wb)
			return 1;
		len -= fsblklen;
		blk++;	/* advance to the next block, as wapbl_replay_read() does */
	}
2682 return 0;
2683 }
2684
2685 int
2686 wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len)
2687 {
2688 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2689
2690 KDASSERT(wapbl_replay_isopen(wr));
2691
2692 KASSERT((len % fsblklen) == 0);
2693
2694 while (len != 0) {
2695 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2696 if (wb) {
2697 off_t off = wb->wb_off;
2698 int error;
2699 error = wapbl_circ_read(wr, data, fsblklen, &off);
2700 if (error)
2701 return error;
2702 }
2703 data = (uint8_t *)data + fsblklen;
2704 len -= fsblklen;
2705 blk++;
2706 }
2707 return 0;
2708 }
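
/*
 * Blocks that have no entry in the hash are left untouched in the
 * caller's buffer, so the expected pattern is to read the current
 * on-disk contents first and then overlay the logged versions
 * (sketch; read_fs_block() is a hypothetical helper):
 *
 *	error = read_fs_block(fsdevvp, data, blk, len);
 *	if (error == 0 && wapbl_replay_can_read(wr, blk, len))
 *		error = wapbl_replay_read(wr, data, blk, len);
 */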
2709
2710 #ifdef _KERNEL
/*
 * This is not really a module now, but maybe on its way to
 * being one some day.
 */
2715 MODULE(MODULE_CLASS_VFS, wapbl, NULL);
2716
2717 static int
2718 wapbl_modcmd(modcmd_t cmd, void *arg)
2719 {
2720
2721 switch (cmd) {
2722 case MODULE_CMD_INIT:
2723 malloc_type_attach(M_WAPBL);
2724 return 0;
2725 case MODULE_CMD_FINI:
2726 return EOPNOTSUPP;
2727 default:
2728 return ENOTTY;
2729 }
2730 }
2731 #endif /* _KERNEL */
2732