/*	$NetBSD: vfs_wapbl.c,v 1.3 2008/08/11 02:45:27 yamt Exp $	*/

/*-
 * Copyright (c) 2003,2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * This implements file system independent write ahead logging.
 */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.3 2008/08/11 02:45:27 yamt Exp $");

#include <sys/param.h>

#ifdef _KERNEL
#include <sys/param.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/uio.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <sys/conf.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/kauth.h>
#include <sys/mutex.h>
#include <sys/atomic.h>
#include <sys/wapbl.h>

#if WAPBL_UVM_ALLOC
#include <uvm/uvm.h>
#endif

#include <miscfs/specfs/specdev.h>

MALLOC_JUSTDEFINE(M_WAPBL, "wapbl", "write-ahead physical block logging");
#define wapbl_malloc(s) malloc((s), M_WAPBL, M_WAITOK)
#define wapbl_free(a) free((a), M_WAPBL)
#define wapbl_calloc(n, s) malloc((n)*(s), M_WAPBL, M_WAITOK | M_ZERO)

#else /* !_KERNEL */
#include <assert.h>
#include <errno.h>
#include <stdio.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

#include <sys/time.h>
#include <sys/wapbl.h>

#define KDASSERT(x) assert(x)
#define KASSERT(x) assert(x)
#define wapbl_malloc(s) malloc(s)
#define wapbl_free(a) free(a)
#define wapbl_calloc(n, s) calloc((n), (s))

#endif /* !_KERNEL */

/*
 * INTERNAL DATA STRUCTURES
 */

/*
 * This structure holds per-mount log information.
 *
 * Legend:	a = atomic access only
 *		r = read-only after init
 *		l = rwlock held
 *		m = mutex held
 *		u = unlocked access ok
 *		b = bufcache_lock held
 */
struct wapbl {
        struct vnode *wl_logvp;     /* r: log here */
        struct vnode *wl_devvp;     /* r: log on this device */
        struct mount *wl_mount;     /* r: mountpoint wl is associated with */
        daddr_t wl_logpbn;          /* r: Physical block number of start of log */
        int wl_log_dev_bshift;      /* r: logarithm of device block size of log
                                       device */
        int wl_fs_dev_bshift;       /* r: logarithm of device block size of
                                       filesystem device */

        unsigned wl_lock_count;     /* m: Count of transactions in progress */

        size_t wl_circ_size;        /* r: Number of bytes in buffer of log */
        size_t wl_circ_off;         /* r: Number of bytes reserved at start */

        size_t wl_bufcount_max;     /* r: Number of buffers reserved for log */
        size_t wl_bufbytes_max;     /* r: Number of buf bytes reserved for log */

        off_t wl_head;              /* l: Byte offset of log head */
        off_t wl_tail;              /* l: Byte offset of log tail */
        /*
         * head == tail == 0 means log is empty
         * head == tail != 0 means log is full
         * see assertions in wapbl_advance() for other boundary conditions.
         * Only truncate moves the tail, except when flush sets it to
         * wl_header_size; only flush moves the head, except when truncate
         * sets it to 0.
         */
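        /*
         * For example, with wl_circ_off = 1024 and wl_circ_size = 8192,
         * an empty log has head == tail == 0.  Flushing a 2048-byte
         * transaction moves the head to 1024 + 2048 = 3072 and sets the
         * tail to 1024; a later truncate advances the tail toward the
         * head, and when the two meet both are reset to 0 (empty again).
         */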

        struct wapbl_wc_header *wl_wc_header;   /* l */
        void *wl_wc_scratch;        /* l: scratch space (XXX: why?!?) */

        kmutex_t wl_mtx;            /* u: short-term lock */
        krwlock_t wl_rwlock;        /* u: File system transaction lock */

        /*
         * Must be held while accessing
         * wl_count or wl_bufs or head or tail
         */

        /*
         * Callback called from within the flush routine to flush any extra
         * bits.  Note that flush may be skipped without calling this if
         * there are no outstanding buffers in the transaction.
         */
        wapbl_flush_fn_t wl_flush;  /* r */
        wapbl_flush_fn_t wl_flush_abort; /* r */

        size_t wl_bufbytes;         /* m: Byte count of pages in wl_bufs */
        size_t wl_bufcount;         /* m: Count of buffers in wl_bufs */
        size_t wl_bcount;           /* m: Total bcount of wl_bufs */

        LIST_HEAD(, buf) wl_bufs;   /* m: Buffers in current transaction */

        kcondvar_t wl_reclaimable_cv;   /* m (obviously) */
        size_t wl_reclaimable_bytes;    /* m: Amount of space available for
                                           reclamation by truncate */
        int wl_error_count;         /* m: # of wl_entries with errors */
        size_t wl_reserved_bytes;   /* never truncate log smaller than this */

#ifdef WAPBL_DEBUG_BUFBYTES
        size_t wl_unsynced_bufbytes;    /* Byte count of unsynced buffers */
#endif

        daddr_t *wl_deallocblks;    /* l: address of block */
        int *wl_dealloclens;        /* l: size of block (fragments, remember) */
        int wl_dealloccnt;          /* l: total count */
        int wl_dealloclim;          /* l: max count */

        /* hashtable of inode numbers for allocated but unlinked inodes */
        /* synch ??? */
        LIST_HEAD(wapbl_ino_head, wapbl_ino) *wl_inohash;
        u_long wl_inohashmask;
        int wl_inohashcnt;

        SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction
                                                   accounting */
};

#ifdef WAPBL_DEBUG_PRINT
int wapbl_debug_print = WAPBL_DEBUG_PRINT;
#endif

/****************************************************************/
#ifdef _KERNEL

#ifdef WAPBL_DEBUG
struct wapbl *wapbl_debug_wl;
#endif

static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail);
static int wapbl_write_blocks(struct wapbl *wl, off_t *offp);
static int wapbl_write_revocations(struct wapbl *wl, off_t *offp);
static int wapbl_write_inodes(struct wapbl *wl, off_t *offp);
#endif /* _KERNEL */

static int wapbl_replay_prescan(struct wapbl_replay *wr);
static int wapbl_replay_get_inodes(struct wapbl_replay *wr);

static __inline size_t wapbl_space_free(size_t avail, off_t head,
        off_t tail);
static __inline size_t wapbl_space_used(size_t avail, off_t head,
        off_t tail);

#ifdef _KERNEL

#define WAPBL_INODETRK_SIZE 83
static int wapbl_ino_pool_refcount;
static struct pool wapbl_ino_pool;
struct wapbl_ino {
        LIST_ENTRY(wapbl_ino) wi_hash;
        ino_t wi_ino;
        mode_t wi_mode;
};

static void wapbl_inodetrk_init(struct wapbl *wl, u_int size);
static void wapbl_inodetrk_free(struct wapbl *wl);
static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino);

static size_t wapbl_transaction_len(struct wapbl *wl);
static __inline size_t wapbl_transaction_inodes_len(struct wapbl *wl);

/*
 * This is useful for debugging.  If set, the log will
 * only be truncated when necessary.
 */
int wapbl_lazy_truncate = 0;

struct wapbl_ops wapbl_ops = {
        .wo_wapbl_discard        = wapbl_discard,
        .wo_wapbl_replay_isopen  = wapbl_replay_isopen1,
        .wo_wapbl_replay_read    = wapbl_replay_read,
        .wo_wapbl_add_buf        = wapbl_add_buf,
        .wo_wapbl_remove_buf     = wapbl_remove_buf,
        .wo_wapbl_resize_buf     = wapbl_resize_buf,
        .wo_wapbl_begin          = wapbl_begin,
        .wo_wapbl_end            = wapbl_end,
        .wo_wapbl_junlock_assert = wapbl_junlock_assert,

        /* XXX: the following is only used to say "this is a wapbl buf" */
        .wo_wapbl_biodone        = wapbl_biodone,
};

void
wapbl_init(void)
{

        malloc_type_attach(M_WAPBL);
}

int
wapbl_start(struct wapbl **wlp, struct mount *mp, struct vnode *vp,
        daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr,
        wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn)
{
        struct wapbl *wl;
        struct vnode *devvp;
        daddr_t logpbn;
        int error;
        int log_dev_bshift = DEV_BSHIFT;
        int fs_dev_bshift = DEV_BSHIFT;
        int run;

        WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64
            " count=%zu blksize=%zu\n", vp, off, count, blksize));

        if (log_dev_bshift > fs_dev_bshift) {
                WAPBL_PRINTF(WAPBL_PRINT_OPEN,
                    ("wapbl: log device's block size cannot be larger "
                     "than filesystem's\n"));
                /*
                 * Not currently implemented, although it could be if
                 * needed someday.
                 */
                return ENOSYS;
        }

        if (off < 0)
                return EINVAL;

        if (blksize < DEV_BSIZE)
                return EINVAL;
        if (blksize % DEV_BSIZE)
                return EINVAL;

        /* XXXTODO: verify that the full load is writable */

        /*
         * XXX check for minimum log size
         * minimum is governed by minimum amount of space
         * to complete a transaction. (probably truncate)
         */
        /* XXX for now pick something minimal */
        if ((count * blksize) < MAXPHYS) {
                return ENOSPC;
        }

        if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) {
                return error;
        }

        wl = wapbl_calloc(1, sizeof(*wl));
        rw_init(&wl->wl_rwlock);
        mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE);
        cv_init(&wl->wl_reclaimable_cv, "wapblrec");
        LIST_INIT(&wl->wl_bufs);
        SIMPLEQ_INIT(&wl->wl_entries);

        wl->wl_logvp = vp;
        wl->wl_devvp = devvp;
        wl->wl_mount = mp;
        wl->wl_logpbn = logpbn;
        wl->wl_log_dev_bshift = log_dev_bshift;
        wl->wl_fs_dev_bshift = fs_dev_bshift;

        wl->wl_flush = flushfn;
        wl->wl_flush_abort = flushabortfn;

        /* Reserve two log device blocks for the commit headers */
        wl->wl_circ_off = 2<<wl->wl_log_dev_bshift;
        wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off);
        /* truncate the log usage to a multiple of log_dev_bshift */
        wl->wl_circ_size >>= wl->wl_log_dev_bshift;
        wl->wl_circ_size <<= wl->wl_log_dev_bshift;

        /*
         * wl_bufbytes_max limits the size of the in memory transaction space.
         * - Since buffers are allocated and accounted for in units of
         *   PAGE_SIZE it is required to be a multiple of PAGE_SIZE
         *   (i.e. 1<<PAGE_SHIFT)
         * - Since the log device has to be written in units of
         *   1<<wl_log_dev_bshift it is required to be a multiple of
         *   1<<wl_log_dev_bshift.
         * - Since filesystem will provide data in units of 1<<wl_fs_dev_bshift,
         *   it is convenient to be a multiple of 1<<wl_fs_dev_bshift.
         * Therefore it must be a multiple of the least common multiple of those
         * three quantities.  Fortunately, all of those quantities are
         * guaranteed to be a power of two, and the least common multiple of
         * a set of numbers which are all powers of two is simply the maximum
         * of those numbers.  Finally, the maximum logarithm of a power of two
         * is the same as the log of the maximum power of two.  So we can do
         * the following operations to size wl_bufbytes_max:
         */
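        /*
         * For example, with PAGE_SHIFT = 12 and both bshifts at 9, the
         * largest constraint is 1<<12, so the shift pairs below round
         * wl_bufbytes_max down to a multiple of 4096, which remains a
         * multiple of the two 512-byte constraints as well.
         */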

        /* XXX fix actual number of pages reserved per filesystem. */
        wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2);

        /* Round wl_bufbytes_max to the largest power of two constraint */
        wl->wl_bufbytes_max >>= PAGE_SHIFT;
        wl->wl_bufbytes_max <<= PAGE_SHIFT;
        wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift;
        wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift;
        wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift;
        wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift;

        /* XXX maybe use filesystem fragment size instead of 1024 */
        /* XXX fix actual number of buffers reserved per filesystem. */
        wl->wl_bufcount_max = (nbuf / 2) * 1024;

        /* XXX tie this into resource estimation */
        wl->wl_dealloclim = 2 * btodb(wl->wl_bufbytes_max);

#if WAPBL_UVM_ALLOC
        wl->wl_deallocblks = (void *) uvm_km_zalloc(kernel_map,
            round_page(sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim));
        KASSERT(wl->wl_deallocblks != NULL);
        wl->wl_dealloclens = (void *) uvm_km_zalloc(kernel_map,
            round_page(sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim));
        KASSERT(wl->wl_dealloclens != NULL);
#else
        wl->wl_deallocblks = wapbl_malloc(sizeof(*wl->wl_deallocblks) *
            wl->wl_dealloclim);
        wl->wl_dealloclens = wapbl_malloc(sizeof(*wl->wl_dealloclens) *
            wl->wl_dealloclim);
#endif

        wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);

        /* Initialize the commit header */
        {
                struct wapbl_wc_header *wc;
                size_t len = 1<<wl->wl_log_dev_bshift;
                wc = wapbl_calloc(1, len);
                wc->wc_type = WAPBL_WC_HEADER;
                wc->wc_len = len;
                wc->wc_circ_off = wl->wl_circ_off;
                wc->wc_circ_size = wl->wl_circ_size;
                /* XXX wc->wc_fsid */
                wc->wc_log_dev_bshift = wl->wl_log_dev_bshift;
                wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift;
                wl->wl_wc_header = wc;
                wl->wl_wc_scratch = wapbl_malloc(len);
        }

        /*
         * if there was an existing set of unlinked but
         * allocated inodes, preserve it in the new
         * log.
         */
        if (wr && wr->wr_inodescnt) {
                int i;

                WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
                    ("wapbl_start: reusing log with %d inodes\n",
                    wr->wr_inodescnt));

                /*
                 * It's only valid to reuse the replay log if it's
                 * the same as the new log we just opened.
                 */
                KDASSERT(!wapbl_replay_isopen(wr));
                KASSERT(devvp->v_rdev == wr->wr_devvp->v_rdev);
                KASSERT(logpbn == wr->wr_logpbn);
                KASSERT(wl->wl_circ_size == wr->wr_wc_header.wc_circ_size);
                KASSERT(wl->wl_circ_off == wr->wr_wc_header.wc_circ_off);
                KASSERT(wl->wl_log_dev_bshift ==
                    wr->wr_wc_header.wc_log_dev_bshift);
                KASSERT(wl->wl_fs_dev_bshift ==
                    wr->wr_wc_header.wc_fs_dev_bshift);

                wl->wl_wc_header->wc_generation =
                    wr->wr_wc_header.wc_generation + 1;

                for (i = 0; i < wr->wr_inodescnt; i++)
                        wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber,
                            wr->wr_inodes[i].wr_imode);

                /* Make sure new transaction won't overwrite old inodes list */
                KDASSERT(wapbl_transaction_len(wl) <=
                    wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead,
                    wr->wr_inodestail));

                wl->wl_head = wl->wl_tail = wr->wr_inodeshead;
                wl->wl_reclaimable_bytes = wl->wl_reserved_bytes =
                    wapbl_transaction_len(wl);

                error = wapbl_write_inodes(wl, &wl->wl_head);
                if (error)
                        goto errout;

                KASSERT(wl->wl_head != wl->wl_tail);
                KASSERT(wl->wl_head != 0);
        }

        error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail);
        if (error) {
                goto errout;
        }

        *wlp = wl;
#if defined(WAPBL_DEBUG)
        wapbl_debug_wl = wl;
#endif

        return 0;
errout:
        wapbl_discard(wl);
        wapbl_free(wl->wl_wc_scratch);
        wapbl_free(wl->wl_wc_header);
#if WAPBL_UVM_ALLOC
        uvm_km_free_wakeup(kernel_map, (vaddr_t) wl->wl_deallocblks,
            round_page(sizeof(*wl->wl_deallocblks) *
                wl->wl_dealloclim));
        uvm_km_free_wakeup(kernel_map, (vaddr_t) wl->wl_dealloclens,
            round_page(sizeof(*wl->wl_dealloclens) *
                wl->wl_dealloclim));
#else
        wapbl_free(wl->wl_deallocblks);
        wapbl_free(wl->wl_dealloclens);
#endif
        wapbl_inodetrk_free(wl);
        wapbl_free(wl);

        return error;
}

/*
 * Like wapbl_flush, only discards the transaction
 * completely
 */

void
wapbl_discard(struct wapbl *wl)
{
        struct wapbl_entry *we;
        struct buf *bp;
        int i;

        /*
         * XXX we may consider using upgrade here
         * if we want to call flush from inside a transaction
         */
        rw_enter(&wl->wl_rwlock, RW_WRITER);
        wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
            wl->wl_dealloccnt);

#ifdef WAPBL_DEBUG_PRINT
        {
                struct wapbl_entry *we;
                pid_t pid = -1;
                lwpid_t lid = -1;
                if (curproc)
                        pid = curproc->p_pid;
                if (curlwp)
                        lid = curlwp->l_lid;
#ifdef WAPBL_DEBUG_BUFBYTES
                WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
                    ("wapbl_discard: thread %d.%d discarding "
                    "transaction\n"
                    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
                    "deallocs=%d inodes=%d\n"
                    "\terrcnt = %u, reclaimable=%zu reserved=%zu "
                    "unsynced=%zu\n",
                    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
                    wl->wl_bcount, wl->wl_dealloccnt,
                    wl->wl_inohashcnt, wl->wl_error_count,
                    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
                    wl->wl_unsynced_bufbytes));
                SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
                        WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
                            ("\tentry: bufcount = %zu, reclaimable = %zu, "
                            "error = %d, unsynced = %zu\n",
                            we->we_bufcount, we->we_reclaimable_bytes,
                            we->we_error, we->we_unsynced_bufbytes));
                }
#else /* !WAPBL_DEBUG_BUFBYTES */
                WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
                    ("wapbl_discard: thread %d.%d discarding transaction\n"
                    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
                    "deallocs=%d inodes=%d\n"
                    "\terrcnt = %u, reclaimable=%zu reserved=%zu\n",
                    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
                    wl->wl_bcount, wl->wl_dealloccnt,
                    wl->wl_inohashcnt, wl->wl_error_count,
                    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes));
                SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
                        WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
                            ("\tentry: bufcount = %zu, reclaimable = %zu, "
                            "error = %d\n",
                            we->we_bufcount, we->we_reclaimable_bytes,
                            we->we_error));
                }
#endif /* !WAPBL_DEBUG_BUFBYTES */
        }
#endif /* WAPBL_DEBUG_PRINT */

        for (i = 0; i <= wl->wl_inohashmask; i++) {
                struct wapbl_ino_head *wih;
                struct wapbl_ino *wi;

                wih = &wl->wl_inohash[i];
                while ((wi = LIST_FIRST(wih)) != NULL) {
                        LIST_REMOVE(wi, wi_hash);
                        pool_put(&wapbl_ino_pool, wi);
                        KASSERT(wl->wl_inohashcnt > 0);
                        wl->wl_inohashcnt--;
                }
        }

        /*
         * clean buffer list
         */
        mutex_enter(&bufcache_lock);
        mutex_enter(&wl->wl_mtx);
        while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
                if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) {
                        /*
                         * The buffer will be unlocked and
                         * removed from the transaction in brelse
                         */
                        mutex_exit(&wl->wl_mtx);
                        brelsel(bp, 0);
                        mutex_enter(&wl->wl_mtx);
                }
        }
        mutex_exit(&wl->wl_mtx);
        mutex_exit(&bufcache_lock);

        /*
         * Remove references to this wl from wl_entries, free any which
         * no longer have buffers, others will be freed in wapbl_biodone
         * when they no longer have any buffers.
         */
        while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) {
                SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
                /* XXX should we be accumulating wl_error_count
                 * and increasing reclaimable bytes ? */
                we->we_wapbl = NULL;
                if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
                        KASSERT(we->we_unsynced_bufbytes == 0);
#endif
                        wapbl_free(we);
                }
        }

        /* Discard list of deallocs */
        wl->wl_dealloccnt = 0;
        /* XXX should we clear wl_reserved_bytes? */

        KASSERT(wl->wl_bufbytes == 0);
        KASSERT(wl->wl_bcount == 0);
        KASSERT(wl->wl_bufcount == 0);
        KASSERT(LIST_EMPTY(&wl->wl_bufs));
        KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
        KASSERT(wl->wl_inohashcnt == 0);

        rw_exit(&wl->wl_rwlock);
}

int
wapbl_stop(struct wapbl *wl, int force)
{
        struct vnode *vp;
        int error;

        WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n"));
        error = wapbl_flush(wl, 1);
        if (error) {
                if (force)
                        wapbl_discard(wl);
                else
                        return error;
        }

        /* Unlinked inodes persist after a flush */
        if (wl->wl_inohashcnt) {
                if (force) {
                        wapbl_discard(wl);
                } else {
                        return EBUSY;
                }
        }

        KASSERT(wl->wl_bufbytes == 0);
        KASSERT(wl->wl_bcount == 0);
        KASSERT(wl->wl_bufcount == 0);
        KASSERT(LIST_EMPTY(&wl->wl_bufs));
        KASSERT(wl->wl_dealloccnt == 0);
        KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
        KASSERT(wl->wl_inohashcnt == 0);

        vp = wl->wl_logvp;

        wapbl_free(wl->wl_wc_scratch);
        wapbl_free(wl->wl_wc_header);
#if WAPBL_UVM_ALLOC
        uvm_km_free_wakeup(kernel_map, (vaddr_t) wl->wl_deallocblks,
            round_page(sizeof(*wl->wl_deallocblks) *
                wl->wl_dealloclim));
        uvm_km_free_wakeup(kernel_map, (vaddr_t) wl->wl_dealloclens,
            round_page(sizeof(*wl->wl_dealloclens) *
                wl->wl_dealloclim));
#else
        wapbl_free(wl->wl_deallocblks);
        wapbl_free(wl->wl_dealloclens);
#endif
        wapbl_inodetrk_free(wl);

        cv_destroy(&wl->wl_reclaimable_cv);
        mutex_destroy(&wl->wl_mtx);
        rw_destroy(&wl->wl_rwlock);
        wapbl_free(wl);

        return 0;
}

static int
wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
{
        struct pstats *pstats = curlwp->l_proc->p_stats;
        struct buf *bp;
        int error;

        KASSERT((flags & ~(B_WRITE | B_READ)) == 0);
        KASSERT(devvp->v_type == VBLK);

        if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
                mutex_enter(&devvp->v_interlock);
                devvp->v_numoutput++;
                mutex_exit(&devvp->v_interlock);
                pstats->p_ru.ru_oublock++;
        } else {
                pstats->p_ru.ru_inblock++;
        }

        bp = getiobuf(devvp, true);
        bp->b_flags = flags;
        bp->b_cflags = BC_BUSY; /* silly & dubious */
        bp->b_dev = devvp->v_rdev;
        bp->b_data = data;
        bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
        bp->b_blkno = pbn;

        WAPBL_PRINTF(WAPBL_PRINT_IO,
            ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%x\n",
            BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount,
            bp->b_blkno, bp->b_dev));

        VOP_STRATEGY(devvp, bp);

        error = biowait(bp);
        putiobuf(bp);

        if (error) {
                WAPBL_PRINTF(WAPBL_PRINT_ERROR,
                    ("wapbl_doio: %s %zu bytes at block %" PRId64
                    " on dev 0x%x failed with error %d\n",
                    (((flags & (B_WRITE | B_READ)) == B_WRITE) ?
                     "write" : "read"),
                    len, pbn, devvp->v_rdev, error));
        }

        return error;
}

int
wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

        return wapbl_doio(data, len, devvp, pbn, B_WRITE);
}

int
wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

        return wapbl_doio(data, len, devvp, pbn, B_READ);
}

/*
 * off is the current byte offset in the log; returns the new offset
 * for the next write and handles log wraparound.
 */
static int
wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp)
{
        size_t slen;
        off_t off = *offp;
        int error;

        KDASSERT(((len >> wl->wl_log_dev_bshift) <<
            wl->wl_log_dev_bshift) == len);

        if (off < wl->wl_circ_off)
                off = wl->wl_circ_off;
        slen = wl->wl_circ_off + wl->wl_circ_size - off;
        if (slen < len) {
                error = wapbl_write(data, slen, wl->wl_devvp,
                    wl->wl_logpbn + (off >> wl->wl_log_dev_bshift));
                if (error)
                        return error;
                data = (uint8_t *)data + slen;
                len -= slen;
                off = wl->wl_circ_off;
        }
        error = wapbl_write(data, len, wl->wl_devvp,
            wl->wl_logpbn + (off >> wl->wl_log_dev_bshift));
        if (error)
                return error;
        off += len;
        if (off >= wl->wl_circ_off + wl->wl_circ_size)
                off = wl->wl_circ_off;
        *offp = off;
        return 0;
}
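
/*
 * A worked example of the wraparound case above, with illustrative
 * numbers: if wl_circ_off = 1024, wl_circ_size = 8192 and a 1024-byte
 * write starts at off = 8704, then slen = 1024 + 8192 - 8704 = 512,
 * so the first 512 bytes land at offset 8704 (ending exactly at 9216)
 * and the remaining 512 bytes continue at wl_circ_off = 1024, leaving
 * *offp = 1536.
 */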

/****************************************************************/

int
wapbl_begin(struct wapbl *wl, const char *file, int line)
{
        int doflush;
        unsigned lockcount;
        krw_t op;

        KDASSERT(wl);

        /*
         * XXX: The original code calls for the use of a RW_READER lock
         * here, but it turns out there are performance issues with high
         * metadata-rate workloads (e.g. multiple simultaneous tar
         * extractions).  For now, we force the lock to be RW_WRITER,
         * since that currently has the best performance characteristics
         * (even for a single tar-file extraction).
         */
#define WAPBL_DEBUG_SERIALIZE 1

#ifdef WAPBL_DEBUG_SERIALIZE
        op = RW_WRITER;
#else
        op = RW_READER;
#endif

        /*
         * XXX this needs to be made much more sophisticated.
         * perhaps each wapbl_begin could reserve a specified
         * number of buffers and bytes.
         */
        mutex_enter(&wl->wl_mtx);
        lockcount = wl->wl_lock_count;
        doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) >
                   wl->wl_bufbytes_max / 2) ||
                  ((wl->wl_bufcount + (lockcount * 10)) >
                   wl->wl_bufcount_max / 2) ||
                  (wapbl_transaction_len(wl) > wl->wl_circ_size / 2);
        mutex_exit(&wl->wl_mtx);

        if (doflush) {
                WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
                    ("force flush lockcnt=%d bufbytes=%zu "
                    "(max=%zu) bufcount=%zu (max=%zu)\n",
                    lockcount, wl->wl_bufbytes,
                    wl->wl_bufbytes_max, wl->wl_bufcount,
                    wl->wl_bufcount_max));
        }

        if (doflush) {
                int error = wapbl_flush(wl, 0);
                if (error)
                        return error;
        }

        rw_enter(&wl->wl_rwlock, op);
        mutex_enter(&wl->wl_mtx);
        wl->wl_lock_count++;
        mutex_exit(&wl->wl_mtx);

#if defined(WAPBL_DEBUG_PRINT) && defined(WAPBL_DEBUG_SERIALIZE)
        WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
            ("wapbl_begin thread %d.%d with bufcount=%zu "
            "bufbytes=%zu bcount=%zu at %s:%d\n",
            curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
            wl->wl_bufbytes, wl->wl_bcount, file, line));
#endif

        return 0;
}

void
wapbl_end(struct wapbl *wl)
{

#if defined(WAPBL_DEBUG_PRINT) && defined(WAPBL_DEBUG_SERIALIZE)
        WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
            ("wapbl_end thread %d.%d with bufcount=%zu "
            "bufbytes=%zu bcount=%zu\n",
            curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
            wl->wl_bufbytes, wl->wl_bcount));
#endif

        mutex_enter(&wl->wl_mtx);
        KASSERT(wl->wl_lock_count > 0);
        wl->wl_lock_count--;
        mutex_exit(&wl->wl_mtx);

        rw_exit(&wl->wl_rwlock);
}

void
wapbl_add_buf(struct wapbl *wl, struct buf *bp)
{

        KASSERT(bp->b_cflags & BC_BUSY);
        KASSERT(bp->b_vp);

        wapbl_jlock_assert(wl);

#if 0
        /*
         * XXX this might be an issue for swapfiles.
         * see uvm_swap.c:1702
         *
         * XXX2 why require it then? leap of semantics?
         */
        KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
#endif

        mutex_enter(&wl->wl_mtx);
        if (bp->b_flags & B_LOCKED) {
                LIST_REMOVE(bp, b_wapbllist);
                WAPBL_PRINTF(WAPBL_PRINT_BUFFER2,
                    ("wapbl_add_buf thread %d.%d re-adding buf %p "
                    "with %d bytes %d bcount\n",
                    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
                    bp->b_bcount));
        } else {
                /* unlocked dirty buffers shouldn't exist */
                KASSERT(!(bp->b_oflags & BO_DELWRI));
                wl->wl_bufbytes += bp->b_bufsize;
                wl->wl_bcount += bp->b_bcount;
                wl->wl_bufcount++;
                WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
                    ("wapbl_add_buf thread %d.%d adding buf %p "
                    "with %d bytes %d bcount\n",
                    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
                    bp->b_bcount));
        }
        LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist);
        mutex_exit(&wl->wl_mtx);

        bp->b_flags |= B_LOCKED;
}

static void
wapbl_remove_buf_locked(struct wapbl *wl, struct buf *bp)
{

        KASSERT(mutex_owned(&wl->wl_mtx));
        KASSERT(bp->b_cflags & BC_BUSY);
        wapbl_jlock_assert(wl);

#if 0
        /*
         * XXX this might be an issue for swapfiles.
         * see uvm_swap.c:1725
         *
         * XXXdeux: see above
         */
        KASSERT((bp->b_flags & BC_NOCACHE) == 0);
#endif
        KASSERT(bp->b_flags & B_LOCKED);

        WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
            ("wapbl_remove_buf thread %d.%d removing buf %p with "
            "%d bytes %d bcount\n",
            curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount));

        KASSERT(wl->wl_bufbytes >= bp->b_bufsize);
        wl->wl_bufbytes -= bp->b_bufsize;
        KASSERT(wl->wl_bcount >= bp->b_bcount);
        wl->wl_bcount -= bp->b_bcount;
        KASSERT(wl->wl_bufcount > 0);
        wl->wl_bufcount--;
        KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
        KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
        LIST_REMOVE(bp, b_wapbllist);

        bp->b_flags &= ~B_LOCKED;
}

/* called from brelsel() in vfs_bio among other places */
void
wapbl_remove_buf(struct wapbl *wl, struct buf *bp)
{

        mutex_enter(&wl->wl_mtx);
        wapbl_remove_buf_locked(wl, bp);
        mutex_exit(&wl->wl_mtx);
}

void
wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt)
{

        KASSERT(bp->b_cflags & BC_BUSY);

        /*
         * XXX: why does this depend on B_LOCKED?  otherwise the buf
         * is not for a transaction?  if so, why is this called in the
         * first place?
         */
        if (bp->b_flags & B_LOCKED) {
                mutex_enter(&wl->wl_mtx);
                wl->wl_bufbytes += bp->b_bufsize - oldsz;
                wl->wl_bcount += bp->b_bcount - oldcnt;
                mutex_exit(&wl->wl_mtx);
        }
}

#endif /* _KERNEL */

/****************************************************************/
/* Some utility inlines */

/* This is used to advance the pointer old to the new value old+delta,
   wrapping within the circular log region. */
static __inline off_t
wapbl_advance(size_t size, size_t off, off_t old, size_t delta)
{
        off_t new;

        /* Define acceptable ranges for inputs. */
        KASSERT(delta <= size);
        KASSERT((old == 0) || (old >= off));
        KASSERT(old < (size + off));

        if ((old == 0) && (delta != 0))
                new = off + delta;
        else if ((old + delta) < (size + off))
                new = old + delta;
        else
                new = (old + delta) - size;

        /* Note some interesting axioms */
        KASSERT((delta != 0) || (new == old));
        KASSERT((delta == 0) || (new != 0));
        KASSERT((delta != (size)) || (new == old));

        /* Define acceptable ranges for output. */
        KASSERT((new == 0) || (new >= off));
        KASSERT(new < (size + off));
        return new;
}
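
/*
 * A quick numeric check of the above: with size = 8192, off = 1024,
 * old = 8704 and delta = 1024, old + delta = 9728 is not below
 * size + off = 9216, so new = 9728 - 8192 = 1536, i.e. the pointer
 * wraps back into the valid range [off, size + off).
 */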

static __inline size_t
wapbl_space_used(size_t avail, off_t head, off_t tail)
{

        if (tail == 0) {
                KASSERT(head == 0);
                return 0;
        }
        return ((head + (avail - 1) - tail) % avail) + 1;
}
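
/*
 * Sanity check of the modular formula with the numbers used above:
 * avail = 8192, head = 1536, tail = 8704 gives
 * ((1536 + 8191 - 8704) % 8192) + 1 = 1024, matching the 1024 bytes
 * written across the wrap from tail to head.
 */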

static __inline size_t
wapbl_space_free(size_t avail, off_t head, off_t tail)
{

        return avail - wapbl_space_used(avail, head, tail);
}

static __inline void
wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp,
                   off_t *tailp)
{
        off_t head = *headp;
        off_t tail = *tailp;

        KASSERT(delta <= wapbl_space_free(size, head, tail));
        head = wapbl_advance(size, off, head, delta);
        if ((tail == 0) && (head != 0))
                tail = off;
        *headp = head;
        *tailp = tail;
}

static __inline void
wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp,
                   off_t *tailp)
{
        off_t head = *headp;
        off_t tail = *tailp;

        KASSERT(delta <= wapbl_space_used(size, head, tail));
        tail = wapbl_advance(size, off, tail, delta);
        if (head == tail) {
                head = tail = 0;
        }
        *headp = head;
        *tailp = tail;
}

#ifdef _KERNEL

/****************************************************************/

/*
 * Remove transactions whose buffers are completely flushed to disk.
 * Will block until at least minfree space is available.
 * Only intended to be called from inside wapbl_flush and therefore
 * does not protect against commit races with itself or with flush.
 */
static int
wapbl_truncate(struct wapbl *wl, size_t minfree, int waitonly)
{
        size_t delta;
        size_t avail;
        off_t head;
        off_t tail;
        int error = 0;

        KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes));
        KASSERT(rw_write_held(&wl->wl_rwlock));

        mutex_enter(&wl->wl_mtx);

        /*
         * First check to see if we have to do a commit
         * at all.
         */
        avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail);
        if (minfree < avail) {
                mutex_exit(&wl->wl_mtx);
                return 0;
        }
        minfree -= avail;
        while ((wl->wl_error_count == 0) &&
            (wl->wl_reclaimable_bytes < minfree)) {
                WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
                    ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd "
                    "minfree=%zd\n",
                    &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes,
                    minfree));

                cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx);
        }
        if (wl->wl_reclaimable_bytes < minfree) {
                KASSERT(wl->wl_error_count);
                /* XXX maybe get actual error from buffer instead someday? */
                error = EIO;
        }
        head = wl->wl_head;
        tail = wl->wl_tail;
        delta = wl->wl_reclaimable_bytes;

        /* If all of the entries are flushed, then be sure to keep
         * the reserved bytes reserved.  Watch out for discarded transactions,
         * which could leave more bytes reserved than are reclaimable.
         */
        if (SIMPLEQ_EMPTY(&wl->wl_entries) &&
            (delta >= wl->wl_reserved_bytes)) {
                delta -= wl->wl_reserved_bytes;
        }
        wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head,
            &tail);
        KDASSERT(wl->wl_reserved_bytes <=
            wapbl_space_used(wl->wl_circ_size, head, tail));
        mutex_exit(&wl->wl_mtx);

        if (error)
                return error;

        if (waitonly)
                return 0;

        /*
         * This is where head, tail and delta are unprotected
         * from races against itself or flush.  This is ok since
         * we only call this routine from inside flush itself.
         *
         * XXX: how can it race against itself when accessed only
         * from behind the write-locked rwlock?
         */
        error = wapbl_write_commit(wl, head, tail);
        if (error)
                return error;

        wl->wl_head = head;
        wl->wl_tail = tail;

        mutex_enter(&wl->wl_mtx);
        KASSERT(wl->wl_reclaimable_bytes >= delta);
        wl->wl_reclaimable_bytes -= delta;
        mutex_exit(&wl->wl_mtx);
        WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
            ("wapbl_truncate thread %d.%d truncating %zu bytes\n",
            curproc->p_pid, curlwp->l_lid, delta));

        return 0;
}

/****************************************************************/

void
wapbl_biodone(struct buf *bp)
{
        struct wapbl_entry *we = bp->b_private;
        struct wapbl *wl = we->we_wapbl;

        /*
         * Handle possible flushing of buffers after log has been
         * decommissioned.
         */
        if (!wl) {
                KASSERT(we->we_bufcount > 0);
                we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
                KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize);
                we->we_unsynced_bufbytes -= bp->b_bufsize;
#endif

                if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
                        KASSERT(we->we_unsynced_bufbytes == 0);
#endif
                        wapbl_free(we);
                }

                brelse(bp, 0);
                return;
        }

#ifdef ohbother
        KDASSERT(bp->b_flags & B_DONE);
        KDASSERT(!(bp->b_flags & B_DELWRI));
        KDASSERT(bp->b_flags & B_ASYNC);
        KDASSERT(bp->b_flags & B_BUSY);
        KDASSERT(!(bp->b_flags & B_LOCKED));
        KDASSERT(!(bp->b_flags & B_READ));
        KDASSERT(!(bp->b_flags & B_INVAL));
        KDASSERT(!(bp->b_flags & B_NOCACHE));
#endif

        if (bp->b_error) {
#ifdef notyet /* Can't currently handle possible dirty buffer reuse */
                XXXpooka: interfaces not fully updated
                Note: this was not enabled in the original patch
                against netbsd4 either.  I don't know if comment
                above is true or not.

                /*
                 * If an error occurs, report the error and leave the
                 * buffer as a delayed write on the LRU queue.
                 * restarting the write would likely result in
                 * an error spinloop, so let it be done harmlessly
                 * by the syncer.
                 */
                bp->b_flags &= ~(B_DONE);
                simple_unlock(&bp->b_interlock);

                if (we->we_error == 0) {
                        mutex_enter(&wl->wl_mtx);
                        wl->wl_error_count++;
                        mutex_exit(&wl->wl_mtx);
                        cv_broadcast(&wl->wl_reclaimable_cv);
                }
                we->we_error = bp->b_error;
                bp->b_error = 0;
                brelse(bp);
                return;
#else
                /* For now, just mark the log permanently errored out */

                mutex_enter(&wl->wl_mtx);
                if (wl->wl_error_count == 0) {
                        wl->wl_error_count++;
                        cv_broadcast(&wl->wl_reclaimable_cv);
                }
                mutex_exit(&wl->wl_mtx);
#endif
        }

        mutex_enter(&wl->wl_mtx);

        KASSERT(we->we_bufcount > 0);
        we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
        KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize);
        we->we_unsynced_bufbytes -= bp->b_bufsize;
        KASSERT(wl->wl_unsynced_bufbytes >= bp->b_bufsize);
        wl->wl_unsynced_bufbytes -= bp->b_bufsize;
#endif

        /*
         * If the current transaction can be reclaimed, start
         * at the beginning and reclaim any consecutive reclaimable
         * transactions.  If we successfully reclaim anything,
         * then wakeup anyone waiting for the reclaim.
         */
        if (we->we_bufcount == 0) {
                size_t delta = 0;
                int errcnt = 0;
#ifdef WAPBL_DEBUG_BUFBYTES
                KDASSERT(we->we_unsynced_bufbytes == 0);
#endif
                /*
                 * clear any posted error, since the buffer it came from
                 * has successfully flushed by now
                 */
                while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) &&
                    (we->we_bufcount == 0)) {
                        delta += we->we_reclaimable_bytes;
                        if (we->we_error)
                                errcnt++;
                        SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
                        wapbl_free(we);
                }

                if (delta) {
                        wl->wl_reclaimable_bytes += delta;
                        KASSERT(wl->wl_error_count >= errcnt);
                        wl->wl_error_count -= errcnt;
                        cv_broadcast(&wl->wl_reclaimable_cv);
                }
        }

        mutex_exit(&wl->wl_mtx);
        brelse(bp, 0);
}

/*
 * Write transactions to disk + start I/O for contents
 */
int
wapbl_flush(struct wapbl *wl, int waitfor)
{
        struct buf *bp;
        struct wapbl_entry *we;
        off_t off;
        off_t head;
        off_t tail;
        size_t delta = 0;
        size_t flushsize;
        size_t reserved;
        int error = 0;

        /*
         * Do a quick check to see if a full flush can be skipped.
         * This assumes that the flush callback does not need to be called
         * unless there are other outstanding bufs.
         */
        if (!waitfor) {
                size_t nbufs;
                mutex_enter(&wl->wl_mtx);       /* XXX need mutex here to
                                                   protect the KASSERTS */
                nbufs = wl->wl_bufcount;
                KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
                KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
                mutex_exit(&wl->wl_mtx);
                if (nbufs == 0)
                        return 0;
        }

        /*
         * XXX we may consider using LK_UPGRADE here
         * if we want to call flush from inside a transaction
         */
        rw_enter(&wl->wl_rwlock, RW_WRITER);
        wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
            wl->wl_dealloccnt);

        /*
         * Now that we are fully locked and flushed,
         * do another check for nothing to do.
         */
        if (wl->wl_bufcount == 0) {
                goto out;
        }

#if 0
        WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
            ("wapbl_flush thread %d.%d flushing entries with "
            "bufcount=%zu bufbytes=%zu\n",
            curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
            wl->wl_bufbytes));
#endif

        /* Calculate amount of space needed to flush */
        flushsize = wapbl_transaction_len(wl);

        if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
                /*
                 * XXX this could be handled more gracefully, perhaps place
                 * only a partial transaction in the log and allow the
                 * remaining to flush without the protection of the journal.
                 */
                panic("wapbl_flush: current transaction too big to flush\n");
        }

        error = wapbl_truncate(wl, flushsize, 0);
        if (error)
                goto out2;

        off = wl->wl_head;
        KASSERT((off == 0) || ((off >= wl->wl_circ_off) &&
            (off < wl->wl_circ_off + wl->wl_circ_size)));
        error = wapbl_write_blocks(wl, &off);
        if (error)
                goto out2;
        error = wapbl_write_revocations(wl, &off);
        if (error)
                goto out2;
        error = wapbl_write_inodes(wl, &off);
        if (error)
                goto out2;

        reserved = 0;
        if (wl->wl_inohashcnt)
                reserved = wapbl_transaction_inodes_len(wl);

        head = wl->wl_head;
        tail = wl->wl_tail;

        wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize,
            &head, &tail);
#ifdef WAPBL_DEBUG
        if (head != off) {
                panic("lost head! head=%"PRIdMAX" tail=%" PRIdMAX
                      " off=%"PRIdMAX" flush=%zu\n",
                      (intmax_t)head, (intmax_t)tail, (intmax_t)off,
                      flushsize);
        }
#else
        KASSERT(head == off);
#endif

        /* Opportunistically move the tail forward if we can */
        if (!wapbl_lazy_truncate) {
                mutex_enter(&wl->wl_mtx);
                delta = wl->wl_reclaimable_bytes;
                mutex_exit(&wl->wl_mtx);
                wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta,
                    &head, &tail);
        }

        error = wapbl_write_commit(wl, head, tail);
        if (error)
                goto out2;

        /* poolme? or kmemme? */
        we = wapbl_calloc(1, sizeof(*we));

#ifdef WAPBL_DEBUG_BUFBYTES
        WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
            ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
            " unsynced=%zu"
            "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
            "inodes=%d\n",
            curproc->p_pid, curlwp->l_lid, flushsize, delta,
            wapbl_space_used(wl->wl_circ_size, head, tail),
            wl->wl_unsynced_bufbytes, wl->wl_bufcount,
            wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt,
            wl->wl_inohashcnt));
#else
        WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
            ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
            "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
            "inodes=%d\n",
            curproc->p_pid, curlwp->l_lid, flushsize, delta,
            wapbl_space_used(wl->wl_circ_size, head, tail),
            wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
            wl->wl_dealloccnt, wl->wl_inohashcnt));
#endif


        mutex_enter(&bufcache_lock);
        mutex_enter(&wl->wl_mtx);

        wl->wl_reserved_bytes = reserved;
        wl->wl_head = head;
        wl->wl_tail = tail;
        KASSERT(wl->wl_reclaimable_bytes >= delta);
        wl->wl_reclaimable_bytes -= delta;
        wl->wl_dealloccnt = 0;
#ifdef WAPBL_DEBUG_BUFBYTES
        wl->wl_unsynced_bufbytes += wl->wl_bufbytes;
#endif

        we->we_wapbl = wl;
        we->we_bufcount = wl->wl_bufcount;
#ifdef WAPBL_DEBUG_BUFBYTES
        we->we_unsynced_bufbytes = wl->wl_bufbytes;
#endif
        we->we_reclaimable_bytes = flushsize;
        we->we_error = 0;
        SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries);

        /*
         * This flushes bufs in the reverse order from which they were queued.
         * It shouldn't matter, but if we care we could use TAILQ instead.
         * XXX Note they will get put on the lru queue when they flush
         * so we might actually want to change this to preserve order.
         */
        while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
                if (bbusy(bp, 0, 0, &wl->wl_mtx)) {
                        continue;
                }
                bp->b_iodone = wapbl_biodone;
                bp->b_private = we;
                bremfree(bp);
                wapbl_remove_buf_locked(wl, bp);
                mutex_exit(&wl->wl_mtx);
                mutex_exit(&bufcache_lock);
                bawrite(bp);
                mutex_enter(&bufcache_lock);
                mutex_enter(&wl->wl_mtx);
        }
        mutex_exit(&wl->wl_mtx);
        mutex_exit(&bufcache_lock);

#if 0
        WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
            ("wapbl_flush thread %d.%d done flushing entries...\n",
            curproc->p_pid, curlwp->l_lid));
#endif

 out:

        /*
         * If the waitfor flag is set, don't return until everything is
         * fully flushed and the on disk log is empty.
         */
        if (waitfor) {
                error = wapbl_truncate(wl, wl->wl_circ_size -
                    wl->wl_reserved_bytes, wapbl_lazy_truncate);
        }

 out2:
        if (error) {
                wl->wl_flush_abort(wl->wl_mount, wl->wl_deallocblks,
                    wl->wl_dealloclens, wl->wl_dealloccnt);
        }

#ifdef WAPBL_DEBUG_PRINT
        if (error) {
                pid_t pid = -1;
                lwpid_t lid = -1;
                if (curproc)
                        pid = curproc->p_pid;
                if (curlwp)
                        lid = curlwp->l_lid;
                mutex_enter(&wl->wl_mtx);
#ifdef WAPBL_DEBUG_BUFBYTES
                WAPBL_PRINTF(WAPBL_PRINT_ERROR,
                    ("wapbl_flush: thread %d.%d aborted flush: "
                    "error = %d\n"
                    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
                    "deallocs=%d inodes=%d\n"
                    "\terrcnt = %d, reclaimable=%zu reserved=%zu "
                    "unsynced=%zu\n",
                    pid, lid, error, wl->wl_bufcount,
                    wl->wl_bufbytes, wl->wl_bcount,
                    wl->wl_dealloccnt, wl->wl_inohashcnt,
                    wl->wl_error_count, wl->wl_reclaimable_bytes,
                    wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes));
                SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
                        WAPBL_PRINTF(WAPBL_PRINT_ERROR,
                            ("\tentry: bufcount = %zu, reclaimable = %zu, "
                            "error = %d, unsynced = %zu\n",
                            we->we_bufcount, we->we_reclaimable_bytes,
                            we->we_error, we->we_unsynced_bufbytes));
                }
#else
                WAPBL_PRINTF(WAPBL_PRINT_ERROR,
                    ("wapbl_flush: thread %d.%d aborted flush: "
                    "error = %d\n"
                    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
                    "deallocs=%d inodes=%d\n"
                    "\terrcnt = %d, reclaimable=%zu reserved=%zu\n",
                    pid, lid, error, wl->wl_bufcount,
                    wl->wl_bufbytes, wl->wl_bcount,
                    wl->wl_dealloccnt, wl->wl_inohashcnt,
                    wl->wl_error_count, wl->wl_reclaimable_bytes,
                    wl->wl_reserved_bytes));
                SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
                        WAPBL_PRINTF(WAPBL_PRINT_ERROR,
                            ("\tentry: bufcount = %zu, reclaimable = %zu, "
                            "error = %d\n", we->we_bufcount,
                            we->we_reclaimable_bytes, we->we_error));
                }
#endif
                mutex_exit(&wl->wl_mtx);
        }
#endif

        rw_exit(&wl->wl_rwlock);
        return error;
}

/****************************************************************/

void
wapbl_jlock_assert(struct wapbl *wl)
{

#ifdef WAPBL_DEBUG_SERIALIZE
        KASSERT(rw_write_held(&wl->wl_rwlock));
#else
        KASSERT(rw_read_held(&wl->wl_rwlock) || rw_write_held(&wl->wl_rwlock));
#endif
}

void
wapbl_junlock_assert(struct wapbl *wl)
{

#ifdef WAPBL_DEBUG_SERIALIZE
        KASSERT(!rw_write_held(&wl->wl_rwlock));
#endif
}

/****************************************************************/

/* locks missing */
void
wapbl_print(struct wapbl *wl,
        int full,
        void (*pr)(const char *, ...))
{
        struct buf *bp;
        struct wapbl_entry *we;
        (*pr)("wapbl %p", wl);
        (*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n",
            wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn);
        (*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n",
            wl->wl_circ_size, wl->wl_circ_off,
            (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail);
        (*pr)("log_dev_bshift = %d, fs_dev_bshift = %d\n",
            wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift);
#ifdef WAPBL_DEBUG_BUFBYTES
        (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
            "reserved = %zu errcnt = %d unsynced = %zu\n",
            wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
            wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
            wl->wl_error_count, wl->wl_unsynced_bufbytes);
#else
        (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
            "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes,
            wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
            wl->wl_error_count);
#endif
        (*pr)("\tdealloccnt = %d, dealloclim = %d\n",
            wl->wl_dealloccnt, wl->wl_dealloclim);
        (*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n",
            wl->wl_inohashcnt, wl->wl_inohashmask);
        (*pr)("entries:\n");
        SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
#ifdef WAPBL_DEBUG_BUFBYTES
                (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, "
                    "unsynced = %zu\n",
                    we->we_bufcount, we->we_reclaimable_bytes,
                    we->we_error, we->we_unsynced_bufbytes);
#else
                (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n",
                    we->we_bufcount, we->we_reclaimable_bytes, we->we_error);
#endif
        }
        if (full) {
                int cnt = 0;
                (*pr)("bufs =");
                LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) {
                        if (!LIST_NEXT(bp, b_wapbllist)) {
                                (*pr)(" %p", bp);
                        } else if ((++cnt % 6) == 0) {
                                (*pr)(" %p,\n\t", bp);
                        } else {
                                (*pr)(" %p,", bp);
                        }
                }
                (*pr)("\n");

                (*pr)("dealloced blks = ");
                {
                        int i;
                        cnt = 0;
                        for (i = 0; i < wl->wl_dealloccnt; i++) {
                                (*pr)(" %"PRId64":%d,",
                                    wl->wl_deallocblks[i],
                                    wl->wl_dealloclens[i]);
                                if ((++cnt % 4) == 0) {
                                        (*pr)("\n\t");
                                }
                        }
                }
                (*pr)("\n");

                (*pr)("registered inodes = ");
                {
                        int i;
                        cnt = 0;
                        for (i = 0; i <= wl->wl_inohashmask; i++) {
                                struct wapbl_ino_head *wih;
                                struct wapbl_ino *wi;

                                wih = &wl->wl_inohash[i];
                                LIST_FOREACH(wi, wih, wi_hash) {
                                        if (wi->wi_ino == 0)
                                                continue;
                                        (*pr)(" %"PRId32"/0%06"PRIo32",",
                                            wi->wi_ino, wi->wi_mode);
                                        if ((++cnt % 4) == 0) {
                                                (*pr)("\n\t");
                                        }
                                }
                        }
                        (*pr)("\n");
                }
        }
}

#if defined(WAPBL_DEBUG) || defined(DDB)
void
wapbl_dump(struct wapbl *wl)
{
#if defined(WAPBL_DEBUG)
        if (!wl)
                wl = wapbl_debug_wl;
#endif
        if (!wl)
                return;
        wapbl_print(wl, 1, printf);
}
#endif

/****************************************************************/

void
wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len)
{

        wapbl_jlock_assert(wl);

        /* XXX should eventually instead tie this into resource estimation */
        /* XXX this KASSERT needs locking/mutex analysis */
        KASSERT(wl->wl_dealloccnt < wl->wl_dealloclim);
        wl->wl_deallocblks[wl->wl_dealloccnt] = blk;
        wl->wl_dealloclens[wl->wl_dealloccnt] = len;
        wl->wl_dealloccnt++;
        WAPBL_PRINTF(WAPBL_PRINT_ALLOC,
            ("wapbl_register_deallocation: blk=%"PRId64" len=%d\n", blk, len));
}

/****************************************************************/

static void
wapbl_inodetrk_init(struct wapbl *wl, u_int size)
{

        wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask);
        if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) {
                pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0,
                    "wapblinopl", &pool_allocator_nointr, IPL_NONE);
        }
}

static void
wapbl_inodetrk_free(struct wapbl *wl)
{

        /* XXX this KASSERT needs locking/mutex analysis */
        KASSERT(wl->wl_inohashcnt == 0);
        hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask);
        if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) {
                pool_destroy(&wapbl_ino_pool);
        }
}

static struct wapbl_ino *
wapbl_inodetrk_get(struct wapbl *wl, ino_t ino)
{
        struct wapbl_ino_head *wih;
        struct wapbl_ino *wi;

        KASSERT(mutex_owned(&wl->wl_mtx));

        wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
        LIST_FOREACH(wi, wih, wi_hash) {
                if (ino == wi->wi_ino)
                        return wi;
        }
        return 0;
}

void
wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode)
{
        struct wapbl_ino_head *wih;
        struct wapbl_ino *wi;

        wi = pool_get(&wapbl_ino_pool, PR_WAITOK);

        mutex_enter(&wl->wl_mtx);
        if (wapbl_inodetrk_get(wl, ino) == NULL) {
                wi->wi_ino = ino;
                wi->wi_mode = mode;
                wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
                LIST_INSERT_HEAD(wih, wi, wi_hash);
                wl->wl_inohashcnt++;
                WAPBL_PRINTF(WAPBL_PRINT_INODE,
                    ("wapbl_register_inode: ino=%"PRId64"\n", ino));
                mutex_exit(&wl->wl_mtx);
        } else {
                mutex_exit(&wl->wl_mtx);
                pool_put(&wapbl_ino_pool, wi);
        }
}

void
wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode)
{
        struct wapbl_ino *wi;

        mutex_enter(&wl->wl_mtx);
        wi = wapbl_inodetrk_get(wl, ino);
        if (wi) {
                WAPBL_PRINTF(WAPBL_PRINT_INODE,
                    ("wapbl_unregister_inode: ino=%"PRId64"\n", ino));
                KASSERT(wl->wl_inohashcnt > 0);
                wl->wl_inohashcnt--;
                LIST_REMOVE(wi, wi_hash);
                mutex_exit(&wl->wl_mtx);

                pool_put(&wapbl_ino_pool, wi);
        } else {
                mutex_exit(&wl->wl_mtx);
        }
}

/****************************************************************/

static __inline size_t
wapbl_transaction_inodes_len(struct wapbl *wl)
{
        int blocklen = 1<<wl->wl_log_dev_bshift;
        int iph;

        /* Calculate number of inodes described in an inodelist header */
        iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
            sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);

        KASSERT(iph > 0);

        return MAX(1, howmany(wl->wl_inohashcnt, iph))*blocklen;
}
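
/*
 * For instance, assuming (purely for illustration) a 512-byte block
 * with a 16-byte inodelist header and 8-byte wc_inodes[] entries,
 * iph = (512 - 16) / 8 = 62, so 100 tracked inodes need
 * howmany(100, 62) = 2 blocks, i.e. 1024 bytes of log space.
 */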


/* Calculate amount of space a transaction will take on disk */
static size_t
wapbl_transaction_len(struct wapbl *wl)
{
        int blocklen = 1<<wl->wl_log_dev_bshift;
        size_t len;
        int bph;

        /* Calculate number of blocks described in a blocklist header */
        bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
            sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);

        KASSERT(bph > 0);

        len = wl->wl_bcount;
        len += howmany(wl->wl_bufcount, bph)*blocklen;
        len += howmany(wl->wl_dealloccnt, bph)*blocklen;
        len += wapbl_transaction_inodes_len(wl);

        return len;
}
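
/*
 * Rough worked example, assuming bph = 30 for a 512-byte block: a
 * transaction with wl_bufcount = 100, wl_bcount = 400KB of buffer
 * data and wl_dealloccnt = 10 needs the 400KB of data itself plus
 * howmany(100, 30) = 4 blocklist blocks, 1 revocation block and the
 * inodelist blocks, so a little over 400KB of journal space.
 */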

/*
 * Perform commit operation
 *
 * Note that generation number incrementation needs to
 * be protected against racing with other invocations
 * of wapbl_commit.  This is ok since this routine
 * is only invoked from wapbl_flush
 */
static int
wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail)
{
        struct wapbl_wc_header *wc = wl->wl_wc_header;
        struct timespec ts;
        int error;
        int force = 1;

        /* XXX Calc checksum here, instead we do this for now */
        error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, FWRITE, FSCRED);
        if (error) {
                WAPBL_PRINTF(WAPBL_PRINT_ERROR,
                    ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%x "
                    "returned %d\n", wl->wl_devvp->v_rdev, error));
        }

        wc->wc_head = head;
        wc->wc_tail = tail;
        wc->wc_checksum = 0;
        wc->wc_version = 1;
        getnanotime(&ts);
        wc->wc_time = ts.tv_sec;
        wc->wc_timensec = ts.tv_nsec;

        WAPBL_PRINTF(WAPBL_PRINT_WRITE,
            ("wapbl_write_commit: head = %"PRIdMAX" tail = %"PRIdMAX"\n",
            (intmax_t)head, (intmax_t)tail));

        /*
         * XXX if generation will rollover, then first zero
         * over second commit header before trying to write both headers.
         */

        error = wapbl_write(wc, wc->wc_len, wl->wl_devvp,
            wl->wl_logpbn + wc->wc_generation % 2);
        if (error)
                return error;

        error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, FWRITE, FSCRED);
        if (error) {
                WAPBL_PRINTF(WAPBL_PRINT_ERROR,
                    ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%x "
                    "returned %d\n", wl->wl_devvp->v_rdev, error));
        }

        /*
         * If the generation number was zero, write it out a second time.
         * This handles initialization and generation number rollover
         */
        if (wc->wc_generation++ == 0) {
                error = wapbl_write_commit(wl, head, tail);
                /*
                 * This panic should be able to be removed if we do the
                 * zero'ing mentioned above, and we are certain to roll
                 * back generation number on failure.
                 */
                if (error)
                        panic("wapbl_write_commit: error writing duplicate "
                            "log header: %d\n", error);
        }
        return 0;
}
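
/*
 * Note on the alternation above: since the header is written at
 * wl_logpbn + (wc_generation % 2), successive commits ping-pong
 * between the two header blocks reserved by wl_circ_off, so a crash
 * in mid-write still leaves the previous commit record intact.
 * (Presumably replay then prefers the header with the newer
 * generation; that code is not part of this section.)
 */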
1901
1902 /* Returns new offset value */
1903 static int
1904 wapbl_write_blocks(struct wapbl *wl, off_t *offp)
1905 {
1906 struct wapbl_wc_blocklist *wc =
1907 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
1908 int blocklen = 1<<wl->wl_log_dev_bshift;
1909 int bph;
1910 struct buf *bp;
1911 off_t off = *offp;
1912 int error;
1913
1914 KASSERT(rw_write_held(&wl->wl_rwlock));
1915
1916 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
1917 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
1918
1919 bp = LIST_FIRST(&wl->wl_bufs);
1920
1921 while (bp) {
1922 int cnt;
1923 struct buf *obp = bp;
1924
1925 KASSERT(bp->b_flags & B_LOCKED);
1926
1927 wc->wc_type = WAPBL_WC_BLOCKS;
1928 wc->wc_len = blocklen;
1929 wc->wc_blkcount = 0;
1930 while (bp && (wc->wc_blkcount < bph)) {
1931 /*
1932 * Make sure all the physical block numbers are up to
1933 * date. If this is not always true on a given
1934 * filesystem, then VOP_BMAP must be called. We
1935 * could call VOP_BMAP here, or else in the filesystem
1936 * specific flush callback, although neither of those
1937 			 * solutions allow us to take the vnode lock. If a
1938 			 * filesystem requires taking the vnode lock before
1939 			 * calling VOP_BMAP, then we can probably do it in
1940 			 * bwrite, where the vnode lock should already be held
1941 			 * by the invoking code.
1942 */
1943 KASSERT((bp->b_vp->v_type == VBLK) ||
1944 (bp->b_blkno != bp->b_lblkno));
1945 KASSERT(bp->b_blkno > 0);
1946
1947 wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno;
1948 wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount;
1949 wc->wc_len += bp->b_bcount;
1950 wc->wc_blkcount++;
1951 bp = LIST_NEXT(bp, b_wapbllist);
1952 }
1953 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
1954 ("wapbl_write_blocks: len = %u off = %"PRIdMAX"\n",
1955 wc->wc_len, (intmax_t)off));
1956
1957 error = wapbl_circ_write(wl, wc, blocklen, &off);
1958 if (error)
1959 return error;
1960 bp = obp;
1961 cnt = 0;
1962 while (bp && (cnt++ < bph)) {
1963 error = wapbl_circ_write(wl, bp->b_data,
1964 bp->b_bcount, &off);
1965 if (error)
1966 return error;
1967 bp = LIST_NEXT(bp, b_wapbllist);
1968 }
1969 }
1970 *offp = off;
1971 return 0;
1972 }
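
/*
 * On-disk layout produced by wapbl_write_blocks() (sketch): each pass
 * of the outer loop emits one WC_BLOCKS header describing up to bph
 * buffers, immediately followed by those buffers' data:
 *
 *	[WC_BLOCKS hdr | buf0 | buf1 | ... ]
 *	[WC_BLOCKS hdr | ... ]
 *
 * The second inner loop re-walks the same buffers (restarting from
 * obp) to write out the data the header just described.
 */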
1973
1974 static int
1975 wapbl_write_revocations(struct wapbl *wl, off_t *offp)
1976 {
1977 struct wapbl_wc_blocklist *wc =
1978 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
1979 int i;
1980 int blocklen = 1<<wl->wl_log_dev_bshift;
1981 int bph;
1982 off_t off = *offp;
1983 int error;
1984
1985 if (wl->wl_dealloccnt == 0)
1986 return 0;
1987
1988 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
1989 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
1990
1991 i = 0;
1992 while (i < wl->wl_dealloccnt) {
1993 wc->wc_type = WAPBL_WC_REVOCATIONS;
1994 wc->wc_len = blocklen;
1995 wc->wc_blkcount = 0;
1996 while ((i < wl->wl_dealloccnt) && (wc->wc_blkcount < bph)) {
1997 wc->wc_blocks[wc->wc_blkcount].wc_daddr =
1998 wl->wl_deallocblks[i];
1999 wc->wc_blocks[wc->wc_blkcount].wc_dlen =
2000 wl->wl_dealloclens[i];
2001 wc->wc_blkcount++;
2002 i++;
2003 }
2004 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2005 ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n",
2006 wc->wc_len, (intmax_t)off));
2007 error = wapbl_circ_write(wl, wc, blocklen, &off);
2008 if (error)
2009 return error;
2010 }
2011 *offp = off;
2012 return 0;
2013 }
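
/*
 * Why revocations are logged (sketch): if a journalled block is later
 * freed and reallocated for unrelated data, replaying its stale
 * journal copy would corrupt the new contents. Recording the freed
 * ranges lets wapbl_replay_prescan() drop those blocks from the
 * replay hashtable.
 */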
2014
2015 static int
2016 wapbl_write_inodes(struct wapbl *wl, off_t *offp)
2017 {
2018 struct wapbl_wc_inodelist *wc =
2019 (struct wapbl_wc_inodelist *)wl->wl_wc_scratch;
2020 int i;
2021 int blocklen = 1<<wl->wl_log_dev_bshift;
2022 off_t off = *offp;
2023 int error;
2024
2025 struct wapbl_ino_head *wih;
2026 struct wapbl_ino *wi;
2027 int iph;
2028
2029 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
2030 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
2031
2032 i = 0;
2033 wih = &wl->wl_inohash[0];
2034 wi = 0;
2035 do {
2036 wc->wc_type = WAPBL_WC_INODES;
2037 wc->wc_len = blocklen;
2038 wc->wc_inocnt = 0;
2039 wc->wc_clear = (i == 0);
2040 while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) {
2041 while (!wi) {
2042 KASSERT((wih - &wl->wl_inohash[0])
2043 <= wl->wl_inohashmask);
2044 wi = LIST_FIRST(wih++);
2045 }
2046 wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino;
2047 wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode;
2048 wc->wc_inocnt++;
2049 i++;
2050 wi = LIST_NEXT(wi, wi_hash);
2051 }
2052 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2053 ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n",
2054 wc->wc_len, (intmax_t)off));
2055 error = wapbl_circ_write(wl, wc, blocklen, &off);
2056 if (error)
2057 return error;
2058 } while (i < wl->wl_inohashcnt);
2059
2060 *offp = off;
2061 return 0;
2062 }
2063
2064 #endif /* _KERNEL */
2065
2066 /****************************************************************/
2067
2068 #ifdef _KERNEL
2069 static struct pool wapbl_blk_pool;
2070 static int wapbl_blk_pool_refcount;
2071 #endif
2072 struct wapbl_blk {
2073 LIST_ENTRY(wapbl_blk) wb_hash;
2074 daddr_t wb_blk;
2075 off_t wb_off; /* Offset of this block in the log */
2076 };
2077 #define WAPBL_BLKPOOL_MIN 83
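
/*
 * The replay code uses this hashtable to map each file system block
 * number to the log offset of its most recent copy in the journal:
 * a later entry for the same block overwrites wb_off, and revoked
 * blocks are removed again, so replay writes back only the newest
 * live journalled data.
 */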
2078
2079 static void
2080 wapbl_blkhash_init(struct wapbl_replay *wr, u_int size)
2081 {
2082 if (size < WAPBL_BLKPOOL_MIN)
2083 size = WAPBL_BLKPOOL_MIN;
2084 KASSERT(wr->wr_blkhash == 0);
2085 #ifdef _KERNEL
2086 wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask);
2087 if (atomic_inc_uint_nv(&wapbl_blk_pool_refcount) == 1) {
2088 pool_init(&wapbl_blk_pool, sizeof(struct wapbl_blk), 0, 0, 0,
2089 "wapblblkpl", &pool_allocator_nointr, IPL_NONE);
2090 }
2091 #else /* ! _KERNEL */
2092 /* Manually implement hashinit */
2093 {
2094 int i;
2095 unsigned long hashsize;
2096 for (hashsize = 1; hashsize < size; hashsize <<= 1)
2097 continue;
2098 		wr->wr_blkhash = wapbl_malloc(hashsize * sizeof(*wr->wr_blkhash));
2099 		wr->wr_blkhashmask = hashsize - 1;
2100 		for (i = 0; i < hashsize; i++)
2101 			LIST_INIT(&wr->wr_blkhash[i]);
2102 }
2103 #endif /* ! _KERNEL */
2104 }
2105
2106 static void
2107 wapbl_blkhash_free(struct wapbl_replay *wr)
2108 {
2109 KASSERT(wr->wr_blkhashcnt == 0);
2110 #ifdef _KERNEL
2111 hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask);
2112 if (atomic_dec_uint_nv(&wapbl_blk_pool_refcount) == 0) {
2113 pool_destroy(&wapbl_blk_pool);
2114 }
2115 #else /* ! _KERNEL */
2116 wapbl_free(wr->wr_blkhash);
2117 #endif /* ! _KERNEL */
2118 }
2119
2120 static struct wapbl_blk *
2121 wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk)
2122 {
2123 struct wapbl_blk_head *wbh;
2124 struct wapbl_blk *wb;
2125 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2126 LIST_FOREACH(wb, wbh, wb_hash) {
2127 if (blk == wb->wb_blk)
2128 return wb;
2129 }
2130 return 0;
2131 }
2132
2133 static void
2134 wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off)
2135 {
2136 struct wapbl_blk_head *wbh;
2137 struct wapbl_blk *wb;
2138 wb = wapbl_blkhash_get(wr, blk);
2139 if (wb) {
2140 KASSERT(wb->wb_blk == blk);
2141 wb->wb_off = off;
2142 } else {
2143 #ifdef _KERNEL
2144 wb = pool_get(&wapbl_blk_pool, PR_WAITOK);
2145 #else /* ! _KERNEL */
2146 wb = wapbl_malloc(sizeof(*wb));
2147 #endif /* ! _KERNEL */
2148 wb->wb_blk = blk;
2149 wb->wb_off = off;
2150 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2151 LIST_INSERT_HEAD(wbh, wb, wb_hash);
2152 wr->wr_blkhashcnt++;
2153 }
2154 }
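
/*
 * Usage sketch (hypothetical block number and offsets, guarded out):
 * inserting the same block twice simply advances its recorded log
 * offset, so only the newest journalled copy survives to replay.
 */
#if 0
	wapbl_blkhash_ins(wr, 1234, 0x2000);	/* older copy */
	wapbl_blkhash_ins(wr, 1234, 0x8000);	/* newer copy wins */
	KASSERT(wapbl_blkhash_get(wr, 1234)->wb_off == 0x8000);
#endif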
2155
2156 static void
2157 wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk)
2158 {
2159 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2160 if (wb) {
2161 KASSERT(wr->wr_blkhashcnt > 0);
2162 wr->wr_blkhashcnt--;
2163 LIST_REMOVE(wb, wb_hash);
2164 #ifdef _KERNEL
2165 pool_put(&wapbl_blk_pool, wb);
2166 #else /* ! _KERNEL */
2167 wapbl_free(wb);
2168 #endif /* ! _KERNEL */
2169 }
2170 }
2171
2172 static void
2173 wapbl_blkhash_clear(struct wapbl_replay *wr)
2174 {
2175 int i;
2176 for (i = 0; i <= wr->wr_blkhashmask; i++) {
2177 struct wapbl_blk *wb;
2178
2179 while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) {
2180 KASSERT(wr->wr_blkhashcnt > 0);
2181 wr->wr_blkhashcnt--;
2182 LIST_REMOVE(wb, wb_hash);
2183 #ifdef _KERNEL
2184 pool_put(&wapbl_blk_pool, wb);
2185 #else /* ! _KERNEL */
2186 wapbl_free(wb);
2187 #endif /* ! _KERNEL */
2188 }
2189 }
2190 KASSERT(wr->wr_blkhashcnt == 0);
2191 }
2192
2193 /****************************************************************/
2194
2195 static int
2196 wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp)
2197 {
2198 size_t slen;
2199 struct wapbl_wc_header *wc = &wr->wr_wc_header;
2200 off_t off = *offp;
2201 int error;
2202
2203 KASSERT(((len >> wc->wc_log_dev_bshift) <<
2204 wc->wc_log_dev_bshift) == len);
2205 if (off < wc->wc_circ_off)
2206 off = wc->wc_circ_off;
2207 slen = wc->wc_circ_off + wc->wc_circ_size - off;
2208 if (slen < len) {
2209 error = wapbl_read(data, slen, wr->wr_devvp,
2210 wr->wr_logpbn + (off >> wc->wc_log_dev_bshift));
2211 if (error)
2212 return error;
2213 data = (uint8_t *)data + slen;
2214 len -= slen;
2215 off = wc->wc_circ_off;
2216 }
2217 error = wapbl_read(data, len, wr->wr_devvp,
2218 wr->wr_logpbn + (off >> wc->wc_log_dev_bshift));
2219 if (error)
2220 return error;
2221 off += len;
2222 if (off >= wc->wc_circ_off + wc->wc_circ_size)
2223 off = wc->wc_circ_off;
2224 *offp = off;
2225 return 0;
2226 }
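
/*
 * Worked wraparound example (hypothetical geometry): with
 * wc_circ_off = 1024 and wc_circ_size = 8192 the circular area spans
 * [1024, 9216), so a 2048-byte read at off = 8704 has only
 * slen = 512 bytes left before the end and is split into a 512-byte
 * read at 8704 followed by a 1536-byte read restarting at 1024.
 */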
2227
2228 static void
2229 wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp)
2230 {
2231 size_t slen;
2232 struct wapbl_wc_header *wc = &wr->wr_wc_header;
2233 off_t off = *offp;
2234
2235 KASSERT(((len >> wc->wc_log_dev_bshift) <<
2236 wc->wc_log_dev_bshift) == len);
2237
2238 if (off < wc->wc_circ_off)
2239 off = wc->wc_circ_off;
2240 slen = wc->wc_circ_off + wc->wc_circ_size - off;
2241 if (slen < len) {
2242 len -= slen;
2243 off = wc->wc_circ_off;
2244 }
2245 off += len;
2246 if (off >= wc->wc_circ_off + wc->wc_circ_size)
2247 off = wc->wc_circ_off;
2248 *offp = off;
2249 }
2250
2251 /****************************************************************/
2252
2253 int
2254 wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp,
2255 daddr_t off, size_t count, size_t blksize)
2256 {
2257 struct wapbl_replay *wr;
2258 int error;
2259 struct vnode *devvp;
2260 daddr_t logpbn;
2261 uint8_t *scratch;
2262 struct wapbl_wc_header *wch;
2263 struct wapbl_wc_header *wch2;
2264 /* Use this until we read the actual log header */
2265 int log_dev_bshift = DEV_BSHIFT;
2266 size_t used;
2267
2268 WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
2269 ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n",
2270 vp, off, count, blksize));
2271
2272 if (off < 0)
2273 return EINVAL;
2274
2275 if (blksize < DEV_BSIZE)
2276 return EINVAL;
2277 if (blksize % DEV_BSIZE)
2278 return EINVAL;
2279
2280 #ifdef _KERNEL
2281 #if 0
2282 /* XXX vp->v_size isn't reliably set for VBLK devices,
2283 * especially root. However, we might still want to verify
2284 * that the full load is readable */
2285 if ((off + count) * blksize > vp->v_size)
2286 return EINVAL;
2287 #endif
2288
2289 if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) {
2290 return error;
2291 }
2292 #else /* ! _KERNEL */
2293 devvp = vp;
2294 logpbn = off;
2295 #endif /* ! _KERNEL */
2296
2297 scratch = wapbl_malloc(MAXBSIZE);
2298
2299 error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, logpbn);
2300 if (error)
2301 goto errout;
2302
2303 wch = (struct wapbl_wc_header *)scratch;
2304 wch2 =
2305 (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift));
2306 /* XXX verify checksums and magic numbers */
2307 if (wch->wc_type != WAPBL_WC_HEADER) {
2308 printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type);
2309 error = EFTYPE;
2310 goto errout;
2311 }
2312
2313 if (wch2->wc_generation > wch->wc_generation)
2314 wch = wch2;
2315
2316 wr = wapbl_calloc(1, sizeof(*wr));
2317
2318 wr->wr_logvp = vp;
2319 wr->wr_devvp = devvp;
2320 wr->wr_logpbn = logpbn;
2321
2322 wr->wr_scratch = scratch;
2323
2324 memcpy(&wr->wr_wc_header, wch, sizeof(wr->wr_wc_header));
2325
2326 used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail);
2327
2328 WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
2329 ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64
2330 " len=%"PRId64" used=%zu\n",
2331 wch->wc_head, wch->wc_tail, wch->wc_circ_off,
2332 wch->wc_circ_size, used));
2333
2334 wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift));
2335 error = wapbl_replay_prescan(wr);
2336 if (error) {
2337 wapbl_replay_stop(wr);
2338 wapbl_replay_free(wr);
2339 return error;
2340 }
2341
2342 error = wapbl_replay_get_inodes(wr);
2343 if (error) {
2344 wapbl_replay_stop(wr);
2345 wapbl_replay_free(wr);
2346 return error;
2347 }
2348
2349 *wrp = wr;
2350 return 0;
2351
2352 errout:
2353 wapbl_free(scratch);
2354 return error;
2355 }
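
/*
 * Typical mount-time sequence (sketch; error handling abbreviated,
 * and logvp/fsdevvp and the log geometry are placeholders supplied
 * by the file system):
 */
#if 0
	struct wapbl_replay *wr;
	int error;

	error = wapbl_replay_start(&wr, logvp, logstart, logcount, blksize);
	if (error == 0) {
		error = wapbl_replay_write(wr, fsdevvp);
		wapbl_replay_stop(wr);
		wapbl_replay_free(wr);
	}
#endif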
2356
2357 void
2358 wapbl_replay_stop(struct wapbl_replay *wr)
2359 {
2360
2361 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n"));
2362
2363 KDASSERT(wapbl_replay_isopen(wr));
2364
2365 wapbl_free(wr->wr_scratch);
2366 wr->wr_scratch = 0;
2367
2368 wr->wr_logvp = 0;
2369
2370 wapbl_blkhash_clear(wr);
2371 wapbl_blkhash_free(wr);
2372 }
2373
2374 void
2375 wapbl_replay_free(struct wapbl_replay *wr)
2376 {
2377
2378 KDASSERT(!wapbl_replay_isopen(wr));
2379
2380 if (wr->wr_inodes)
2381 wapbl_free(wr->wr_inodes);
2382 wapbl_free(wr);
2383 }
2384
2385 int
2386 wapbl_replay_isopen1(struct wapbl_replay *wr)
2387 {
2388
2389 return wapbl_replay_isopen(wr);
2390 }
2391
2392 static int
2393 wapbl_replay_prescan(struct wapbl_replay *wr)
2394 {
2395 off_t off;
2396 struct wapbl_wc_header *wch = &wr->wr_wc_header;
2397 int error;
2398
2399 int logblklen = 1<<wch->wc_log_dev_bshift;
2400 int fsblklen = 1<<wch->wc_fs_dev_bshift;
2401
2402 wapbl_blkhash_clear(wr);
2403
2404 off = wch->wc_tail;
2405 while (off != wch->wc_head) {
2406 struct wapbl_wc_null *wcn;
2407 off_t saveoff = off;
2408 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
2409 if (error)
2410 goto errout;
2411 wcn = (struct wapbl_wc_null *)wr->wr_scratch;
2412 switch (wcn->wc_type) {
2413 case WAPBL_WC_BLOCKS:
2414 {
2415 struct wapbl_wc_blocklist *wc =
2416 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2417 int i;
2418 for (i = 0; i < wc->wc_blkcount; i++) {
2419 int j, n;
2420 /*
2421 * Enter each physical block into the
2422 * hashtable independently
2423 */
2424 n = wc->wc_blocks[i].wc_dlen >>
2425 wch->wc_fs_dev_bshift;
2426 for (j = 0; j < n; j++) {
2427 wapbl_blkhash_ins(wr,
2428 wc->wc_blocks[i].wc_daddr + j,
2429 off);
2430 wapbl_circ_advance(wr,
2431 fsblklen, &off);
2432 }
2433 }
2434 }
2435 break;
2436
2437 case WAPBL_WC_REVOCATIONS:
2438 {
2439 struct wapbl_wc_blocklist *wc =
2440 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2441 int i;
2442 for (i = 0; i < wc->wc_blkcount; i++) {
2443 int j, n;
2444 /*
2445 * Remove any blocks found from the
2446 * hashtable
2447 */
2448 n = wc->wc_blocks[i].wc_dlen >>
2449 wch->wc_fs_dev_bshift;
2450 for (j = 0; j < n; j++) {
2451 wapbl_blkhash_rem(wr,
2452 wc->wc_blocks[i].wc_daddr + j);
2453 }
2454 }
2455 }
2456 break;
2457
2458 case WAPBL_WC_INODES:
2459 {
2460 struct wapbl_wc_inodelist *wc =
2461 (struct wapbl_wc_inodelist *)wr->wr_scratch;
2462 /*
2463 * Keep track of where we found this so we
2464 * can use it later
2465 */
2466 if (wc->wc_clear) {
2467 wr->wr_inodestail = saveoff;
2468 wr->wr_inodescnt = 0;
2469 }
2470 if (wr->wr_inodestail)
2471 wr->wr_inodeshead = off;
2472 wr->wr_inodescnt += wc->wc_inocnt;
2473 }
2474 break;
2475 default:
2476 printf("Unrecognized wapbl type: 0x%08x\n",
2477 wcn->wc_type);
2478 error = EFTYPE;
2479 goto errout;
2480 }
2481 wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
2482 if (off != saveoff) {
2483 printf("wapbl_replay: corrupted records\n");
2484 error = EFTYPE;
2485 goto errout;
2486 }
2487 }
2488 return 0;
2489
2490 errout:
2491 wapbl_blkhash_clear(wr);
2492 return error;
2493 }
2494
2495 static int
2496 wapbl_replay_get_inodes(struct wapbl_replay *wr)
2497 {
2498 off_t off;
2499 struct wapbl_wc_header *wch = &wr->wr_wc_header;
2500 int logblklen = 1<<wch->wc_log_dev_bshift;
2501 	int cnt = 0;
2502
2503 KDASSERT(wapbl_replay_isopen(wr));
2504
2505 if (wr->wr_inodescnt == 0)
2506 return 0;
2507
2508 KASSERT(!wr->wr_inodes);
2509
2510 wr->wr_inodes = wapbl_malloc(wr->wr_inodescnt*sizeof(wr->wr_inodes[0]));
2511
2512 off = wr->wr_inodestail;
2513
2514 while (off != wr->wr_inodeshead) {
2515 struct wapbl_wc_null *wcn;
2516 int error;
2517 off_t saveoff = off;
2518 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
2519 if (error) {
2520 wapbl_free(wr->wr_inodes);
2521 wr->wr_inodes = 0;
2522 return error;
2523 }
2524 wcn = (struct wapbl_wc_null *)wr->wr_scratch;
2525 switch (wcn->wc_type) {
2526 case WAPBL_WC_BLOCKS:
2527 case WAPBL_WC_REVOCATIONS:
2528 break;
2529 case WAPBL_WC_INODES:
2530 {
2531 struct wapbl_wc_inodelist *wc =
2532 (struct wapbl_wc_inodelist *)wr->wr_scratch;
2533 /*
2534 * Keep track of where we found this so we
2535 * can use it later
2536 */
2537 if (wc->wc_clear) {
2538 cnt = 0;
2539 }
2540 /* This memcpy assumes that wr_inodes is
2541 * laid out the same as wc_inodes. */
2542 memcpy(&wr->wr_inodes[cnt], wc->wc_inodes,
2543 wc->wc_inocnt*sizeof(wc->wc_inodes[0]));
2544 cnt += wc->wc_inocnt;
2545 }
2546 break;
2547 default:
2548 KASSERT(0);
2549 }
2550 off = saveoff;
2551 wapbl_circ_advance(wr, wcn->wc_len, &off);
2552 }
2553 KASSERT(cnt == wr->wr_inodescnt);
2554 return 0;
2555 }
2556
2557 #ifdef DEBUG
2558 int
2559 wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp)
2560 {
2561 off_t off;
2562 struct wapbl_wc_header *wch = &wr->wr_wc_header;
2563 int mismatchcnt = 0;
2564 int logblklen = 1<<wch->wc_log_dev_bshift;
2565 int fsblklen = 1<<wch->wc_fs_dev_bshift;
2566 void *scratch1 = wapbl_malloc(MAXBSIZE);
2567 void *scratch2 = wapbl_malloc(MAXBSIZE);
2568 int error = 0;
2569
2570 KDASSERT(wapbl_replay_isopen(wr));
2571
2572 off = wch->wc_tail;
2573 while (off != wch->wc_head) {
2574 struct wapbl_wc_null *wcn;
2575 #ifdef DEBUG
2576 off_t saveoff = off;
2577 #endif
2578 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
2579 if (error)
2580 goto out;
2581 wcn = (struct wapbl_wc_null *)wr->wr_scratch;
2582 switch (wcn->wc_type) {
2583 case WAPBL_WC_BLOCKS:
2584 {
2585 struct wapbl_wc_blocklist *wc =
2586 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2587 int i;
2588 for (i = 0; i < wc->wc_blkcount; i++) {
2589 int foundcnt = 0;
2590 int dirtycnt = 0;
2591 int j, n;
2592 /*
2593 				 * Check each physical block against the
2594 				 * hashtable independently
2595 */
2596 n = wc->wc_blocks[i].wc_dlen >>
2597 wch->wc_fs_dev_bshift;
2598 for (j = 0; j < n; j++) {
2599 struct wapbl_blk *wb =
2600 wapbl_blkhash_get(wr,
2601 wc->wc_blocks[i].wc_daddr + j);
2602 if (wb && (wb->wb_off == off)) {
2603 foundcnt++;
2604 error =
2605 wapbl_circ_read(wr,
2606 scratch1, fsblklen,
2607 &off);
2608 if (error)
2609 goto out;
2610 error =
2611 wapbl_read(scratch2,
2612 fsblklen, fsdevvp,
2613 wb->wb_blk);
2614 if (error)
2615 goto out;
2616 if (memcmp(scratch1,
2617 scratch2,
2618 fsblklen)) {
2619 printf(
2620 "wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n",
2621 wb->wb_blk, (intmax_t)off);
2622 dirtycnt++;
2623 mismatchcnt++;
2624 }
2625 } else {
2626 wapbl_circ_advance(wr,
2627 fsblklen, &off);
2628 }
2629 }
2630 #if 0
2631 /*
2632 * If all of the blocks in an entry
2633 * are clean, then remove all of its
2634 * blocks from the hashtable since they
2635 * never will need replay.
2636 */
2637 if ((foundcnt != 0) &&
2638 (dirtycnt == 0)) {
2639 off = saveoff;
2640 wapbl_circ_advance(wr,
2641 logblklen, &off);
2642 for (j = 0; j < n; j++) {
2643 struct wapbl_blk *wb =
2644 wapbl_blkhash_get(wr,
2645 wc->wc_blocks[i].wc_daddr + j);
2646 if (wb &&
2647 (wb->wb_off == off)) {
2648 wapbl_blkhash_rem(wr, wb->wb_blk);
2649 }
2650 wapbl_circ_advance(wr,
2651 fsblklen, &off);
2652 }
2653 }
2654 #endif
2655 }
2656 }
2657 break;
2658 case WAPBL_WC_REVOCATIONS:
2659 case WAPBL_WC_INODES:
2660 break;
2661 default:
2662 KASSERT(0);
2663 }
2664 #ifdef DEBUG
2665 wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
2666 KASSERT(off == saveoff);
2667 #endif
2668 }
2669 out:
2670 wapbl_free(scratch1);
2671 wapbl_free(scratch2);
2672 if (!error && mismatchcnt)
2673 error = EFTYPE;
2674 return error;
2675 }
2676 #endif
2677
2678 int
2679 wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp)
2680 {
2681 off_t off;
2682 struct wapbl_wc_header *wch = &wr->wr_wc_header;
2683 int logblklen = 1<<wch->wc_log_dev_bshift;
2684 int fsblklen = 1<<wch->wc_fs_dev_bshift;
2685 void *scratch1 = wapbl_malloc(MAXBSIZE);
2686 int error = 0;
2687
2688 KDASSERT(wapbl_replay_isopen(wr));
2689
2690 /*
2691 * This parses the journal for replay, although it could
2692 * just as easily walk the hashtable instead.
2693 */
2694
2695 off = wch->wc_tail;
2696 while (off != wch->wc_head) {
2697 struct wapbl_wc_null *wcn;
2698 #ifdef DEBUG
2699 off_t saveoff = off;
2700 #endif
2701 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
2702 if (error)
2703 goto out;
2704 wcn = (struct wapbl_wc_null *)wr->wr_scratch;
2705 switch (wcn->wc_type) {
2706 case WAPBL_WC_BLOCKS:
2707 {
2708 struct wapbl_wc_blocklist *wc =
2709 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2710 int i;
2711 for (i = 0; i < wc->wc_blkcount; i++) {
2712 int j, n;
2713 /*
2714 * Check each physical block against
2715 * the hashtable independently
2716 */
2717 n = wc->wc_blocks[i].wc_dlen >>
2718 wch->wc_fs_dev_bshift;
2719 for (j = 0; j < n; j++) {
2720 struct wapbl_blk *wb =
2721 wapbl_blkhash_get(wr,
2722 wc->wc_blocks[i].wc_daddr + j);
2723 if (wb && (wb->wb_off == off)) {
2724 error = wapbl_circ_read(
2725 wr, scratch1,
2726 fsblklen, &off);
2727 if (error)
2728 goto out;
2729 error =
2730 wapbl_write(scratch1,
2731 fsblklen, fsdevvp,
2732 wb->wb_blk);
2733 if (error)
2734 goto out;
2735 } else {
2736 wapbl_circ_advance(wr,
2737 fsblklen, &off);
2738 }
2739 }
2740 }
2741 }
2742 break;
2743 case WAPBL_WC_REVOCATIONS:
2744 case WAPBL_WC_INODES:
2745 break;
2746 default:
2747 KASSERT(0);
2748 }
2749 #ifdef DEBUG
2750 wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
2751 KASSERT(off == saveoff);
2752 #endif
2753 }
2754 out:
2755 wapbl_free(scratch1);
2756 return error;
2757 }
2758
2759 int
2760 wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len)
2761 {
2762 struct wapbl_wc_header *wch = &wr->wr_wc_header;
2763 int fsblklen = 1<<wch->wc_fs_dev_bshift;
2764
2765 KDASSERT(wapbl_replay_isopen(wr));
2766
2767 KASSERT((len % fsblklen) == 0);
2768
2769 while (len != 0) {
2770 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2771 if (wb) {
2772 off_t off = wb->wb_off;
2773 int error;
2774 error = wapbl_circ_read(wr, data, fsblklen, &off);
2775 if (error)
2776 return error;
2777 }
2778 data = (uint8_t *)data + fsblklen;
2779 len -= fsblklen;
2780 blk++;
2781 }
2782 return 0;
2783 }
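
/*
 * Usage sketch (hypothetical values, guarded out): after reading a
 * run of blocks from the file system device, overlay any newer
 * journalled copies before handing the data to the caller.
 */
#if 0
	error = wapbl_read(buf, 4 * fsblklen, fsdevvp, blk);
	if (error == 0)
		error = wapbl_replay_read(wr, buf, blk, 4 * fsblklen);
#endif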
2784