/*	$NetBSD: vfs_wapbl.c,v 1.1.2.2 2008/06/11 12:09:59 simonb Exp $	*/

/*-
 * Copyright (c) 2003,2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *        This product includes software developed by the NetBSD
 *        Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * This implements file-system-independent write-ahead logging (WAPBL).
 */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.1.2.2 2008/06/11 12:09:59 simonb Exp $");

#include <sys/param.h>

#ifdef _KERNEL
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/uio.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <sys/conf.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/kauth.h>
#include <sys/mutex.h>
#include <sys/wapbl.h>

#if WAPBL_UVM_ALLOC
#include <uvm/uvm.h>
#endif

#include <miscfs/specfs/specdev.h>

MALLOC_JUSTDEFINE(M_WAPBL, "wapbl", "write-ahead physical block logging");
#define	wapbl_malloc(s) malloc((s), M_WAPBL, M_WAITOK)
#define	wapbl_free(a) free((a), M_WAPBL)
#define	wapbl_calloc(n, s) malloc((n)*(s), M_WAPBL, M_WAITOK | M_ZERO)

#else /* !_KERNEL */
#include <assert.h>
#include <errno.h>
#include <stdio.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

#include <sys/time.h>
#include <sys/wapbl.h>

#define	KDASSERT(x) assert(x)
#define	KASSERT(x) assert(x)
#define	wapbl_malloc(s) malloc(s)
#define	wapbl_free(a) free(a)
#define	wapbl_calloc(n, s) calloc((n), (s))

#endif /* !_KERNEL */

/*
 * INTERNAL DATA STRUCTURES
 */

/*
 * This structure holds per-mount log information.
 *
 * Legend:	a = atomic access only
 *		r = read-only after init
 *		l = rwlock held
 *		m = mutex held
 *		u = unlocked access ok
 *		b = bufcache_lock held
 */
struct wapbl {
	struct vnode *wl_logvp;	/* r:	log here */
	struct vnode *wl_devvp;	/* r:	log on this device */
	struct mount *wl_mount;	/* r:	mountpoint wl is associated with */
	daddr_t wl_logpbn;	/* r:	Physical block number of start of log */
	int wl_log_dev_bshift;	/* r:	logarithm of device block size of log
					device */
	int wl_fs_dev_bshift;	/* r:	logarithm of device block size of
					filesystem device */

	unsigned wl_lock_count;	/* a:	Count of transactions in progress */

	size_t wl_circ_size;	/* r:	Number of bytes in buffer of log */
	size_t wl_circ_off;	/* r:	Number of bytes reserved at start */

	size_t wl_bufcount_max;	/* r:	Number of buffers reserved for log */
	size_t wl_bufbytes_max;	/* r:	Number of buf bytes reserved for log */

	off_t wl_head;		/* l:	Byte offset of log head */
	off_t wl_tail;		/* l:	Byte offset of log tail */
	/*
	 * head == tail == 0 means log is empty
	 * head == tail != 0 means log is full
	 * See assertions in wapbl_advance() for other boundary conditions.
	 * Only truncate moves the tail, except when flush sets it to
	 * wl_header_size.  Only flush moves the head, except when truncate
	 * sets it to 0.
	 */
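	/*
	 * Editorial sketch of the encoding above (illustrative numbers,
	 * not compiled): assuming wl_circ_off = 1024 and
	 * wl_circ_size = 8192,
	 *
	 *	head == 0    && tail == 0	-> log is empty
	 *	head == 3072 && tail == 1024	-> 2048 bytes committed
	 *	head == tail == 3072		-> log is full (wrapped)
	 *
	 * 0 is reserved as the "empty" sentinel; real offsets start at
	 * wl_circ_off, so no valid head/tail ever falls in (0, wl_circ_off).
	 */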

	struct wapbl_wc_header *wl_wc_header;	/* l */
	void *wl_wc_scratch;	/* l:	scratch space (XXX: why?!?) */

	kmutex_t wl_mtx;	/* u:	short-term lock */
	krwlock_t wl_rwlock;	/* u:	File system transaction lock */

	/*
	 * wl_mtx must be held while accessing
	 * wl_count or wl_bufs or head or tail
	 */

	/*
	 * Callback called from within the flush routine to flush any extra
	 * bits.  Note that flush may be skipped without calling this if
	 * there are no outstanding buffers in the transaction.
	 */
	wapbl_flush_fn_t wl_flush;	/* r */
	wapbl_flush_fn_t wl_flush_abort;/* r */

	size_t wl_bufbytes;	/* m:	Byte count of pages in wl_bufs */
	size_t wl_bufcount;	/* m:	Count of buffers in wl_bufs */
	size_t wl_bcount;	/* m:	Total bcount of wl_bufs */

	LIST_HEAD(, buf) wl_bufs;	/* m:	Buffers in current transaction */

	kcondvar_t wl_reclaimable_cv;	/* m (obviously) */
	size_t wl_reclaimable_bytes;	/* m:	Amount of space available for
						reclamation by truncate */
	int wl_error_count;	/* m:	# of wl_entries with errors */
	size_t wl_reserved_bytes;	/* never truncate log smaller than this */

#ifdef WAPBL_DEBUG_BUFBYTES
	size_t wl_unsynced_bufbytes;	/* Byte count of unsynced buffers */
#endif

	daddr_t *wl_deallocblks;/* l:	address of block */
	int *wl_dealloclens;	/* l:	size of block (fragments; remember!) */
	int wl_dealloccnt;	/* l:	total count */
	int wl_dealloclim;	/* l:	max count */

	/* hashtable of inode numbers for allocated but unlinked inodes */
	/* synch ??? */
	LIST_HEAD(wapbl_ino_head, wapbl_ino) *wl_inohash;
	u_long wl_inohashmask;
	int wl_inohashcnt;

	SIMPLEQ_HEAD(, wapbl_entry) wl_entries;	/* On disk transaction
						   accounting */
};

#ifdef WAPBL_DEBUG_PRINT
int wapbl_debug_print = WAPBL_DEBUG_PRINT;
#endif

/****************************************************************/
#ifdef _KERNEL

#ifdef WAPBL_DEBUG
struct wapbl *wapbl_debug_wl;
#endif

static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail);
static int wapbl_write_blocks(struct wapbl *wl, off_t *offp);
static int wapbl_write_revocations(struct wapbl *wl, off_t *offp);
static int wapbl_write_inodes(struct wapbl *wl, off_t *offp);
#endif /* _KERNEL */

static int wapbl_replay_prescan(struct wapbl_replay *wr);
static int wapbl_replay_get_inodes(struct wapbl_replay *wr);

static __inline size_t wapbl_space_free(size_t avail, off_t head,
	off_t tail);
static __inline size_t wapbl_space_used(size_t avail, off_t head,
	off_t tail);

#ifdef _KERNEL

#define	WAPBL_INODETRK_SIZE 83
static int wapbl_ino_pool_refcount;
static struct pool wapbl_ino_pool;
struct wapbl_ino {
	LIST_ENTRY(wapbl_ino) wi_hash;
	ino_t wi_ino;
	mode_t wi_mode;
};

static kmutex_t wapbl_global_mtx;

static void wapbl_inodetrk_init(struct wapbl *wl, u_int size);
static void wapbl_inodetrk_free(struct wapbl *wl);
static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino);

static size_t wapbl_transaction_len(struct wapbl *wl);
static __inline size_t wapbl_transaction_inodes_len(struct wapbl *wl);

/*
 * This is useful for debugging.  If set, the log will
 * only be truncated when necessary.
 */
int wapbl_lazy_truncate = 0;

struct wapbl_ops wapbl_ops = {
	.wo_wapbl_discard	= wapbl_discard,
	.wo_wapbl_replay_isopen	= wapbl_replay_isopen1,
	.wo_wapbl_replay_read	= wapbl_replay_read,
	.wo_wapbl_add_buf	= wapbl_add_buf,
	.wo_wapbl_remove_buf	= wapbl_remove_buf,
	.wo_wapbl_resize_buf	= wapbl_resize_buf,
	.wo_wapbl_begin		= wapbl_begin,
	.wo_wapbl_end		= wapbl_end,
	.wo_wapbl_junlock_assert= wapbl_junlock_assert,

	/* XXX: the following is only used to say "this is a wapbl buf" */
	.wo_wapbl_biodone	= wapbl_biodone,
};

void
wapbl_init(void)
{

	mutex_init(&wapbl_global_mtx, MUTEX_DEFAULT, IPL_NONE);
	malloc_type_attach(M_WAPBL);
}

int
wapbl_start(struct wapbl **wlp, struct mount *mp, struct vnode *vp,
	daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr,
	wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn)
{
	struct wapbl *wl;
	struct vnode *devvp;
	daddr_t logpbn;
	int error;
	int log_dev_bshift = DEV_BSHIFT;
	int fs_dev_bshift = DEV_BSHIFT;
	int run;

	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64
	    " count=%zu blksize=%zu\n", vp, off, count, blksize));

	if (log_dev_bshift > fs_dev_bshift) {
		WAPBL_PRINTF(WAPBL_PRINT_OPEN,
		    ("wapbl: log device's block size cannot be larger "
		     "than filesystem's\n"));
		/*
		 * Not currently implemented, although it could be if
		 * needed someday.
		 */
		return ENOSYS;
	}

	if (off < 0)
		return EINVAL;

	if (blksize < DEV_BSIZE)
		return EINVAL;
	if (blksize % DEV_BSIZE)
		return EINVAL;

	/* XXXTODO: verify that the full load is writable */

	/*
	 * XXX check for minimum log size
	 * minimum is governed by minimum amount of space
	 * to complete a transaction. (probably truncate)
	 */
	/* XXX for now pick something minimal */
	if ((count * blksize) < MAXPHYS) {
		return ENOSPC;
	}

	if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) {
		return error;
	}

	wl = wapbl_calloc(1, sizeof(*wl));
	rw_init(&wl->wl_rwlock);
	mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&wl->wl_reclaimable_cv, "wapblrec");
	LIST_INIT(&wl->wl_bufs);
	SIMPLEQ_INIT(&wl->wl_entries);

	wl->wl_logvp = vp;
	wl->wl_devvp = devvp;
	wl->wl_mount = mp;
	wl->wl_logpbn = logpbn;
	wl->wl_log_dev_bshift = log_dev_bshift;
	wl->wl_fs_dev_bshift = fs_dev_bshift;

	wl->wl_flush = flushfn;
	wl->wl_flush_abort = flushabortfn;

	/* Reserve two log device blocks for the commit headers */
	wl->wl_circ_off = 2<<wl->wl_log_dev_bshift;
	wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off);
	/* truncate the log usage to a multiple of log_dev_bshift */
	wl->wl_circ_size >>= wl->wl_log_dev_bshift;
	wl->wl_circ_size <<= wl->wl_log_dev_bshift;

	/*
	 * wl_bufbytes_max limits the size of the in memory transaction space.
	 * - Since buffers are allocated and accounted for in units of
	 *   PAGE_SIZE it is required to be a multiple of PAGE_SIZE
	 *   (i.e. 1<<PAGE_SHIFT)
	 * - Since the log device has to be written in units of
	 *   1<<wl_log_dev_bshift it is required to be a multiple of
	 *   1<<wl_log_dev_bshift.
	 * - Since the filesystem will provide data in units of
	 *   1<<wl_fs_dev_bshift, it is convenient to be a multiple of
	 *   1<<wl_fs_dev_bshift.
	 * Therefore it must be a multiple of the least common multiple of
	 * those three quantities.  Fortunately, all of those quantities are
	 * guaranteed to be a power of two, and the least common multiple of
	 * a set of numbers which are all powers of two is simply the maximum
	 * of those numbers.  Finally, the maximum logarithm of a power of
	 * two is the same as the log of the maximum power of two.  So we can
	 * do the following operations to size wl_bufbytes_max:
	 */
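	/*
	 * Illustrative example (assumed shift values, not compiled):
	 * with PAGE_SHIFT = 12, wl_log_dev_bshift = 9 and
	 * wl_fs_dev_bshift = 12, the shift pairs below round
	 * wl_bufbytes_max down to a multiple of 4096, then 512, then
	 * 4096 again; because all three sizes are powers of two the
	 * net effect is rounding down to the largest alignment, e.g.
	 * 1048575 becomes 1044480 (255 * 4096).
	 */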

	/* XXX fix actual number of pages reserved per filesystem. */
	wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2);

	/* Round wl_bufbytes_max to the largest power of two constraint */
	wl->wl_bufbytes_max >>= PAGE_SHIFT;
	wl->wl_bufbytes_max <<= PAGE_SHIFT;
	wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift;

	/* XXX maybe use filesystem fragment size instead of 1024 */
	/* XXX fix actual number of buffers reserved per filesystem. */
	wl->wl_bufcount_max = (nbuf / 2) * 1024;

	/* XXX tie this into resource estimation */
	wl->wl_dealloclim = 2 * btodb(wl->wl_bufbytes_max);

#if WAPBL_UVM_ALLOC
	wl->wl_deallocblks = (void *) uvm_km_zalloc(kernel_map,
	    round_page(sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim));
	KASSERT(wl->wl_deallocblks != NULL);
	wl->wl_dealloclens = (void *) uvm_km_zalloc(kernel_map,
	    round_page(sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim));
	KASSERT(wl->wl_dealloclens != NULL);
#else
	wl->wl_deallocblks = wapbl_malloc(sizeof(*wl->wl_deallocblks) *
	    wl->wl_dealloclim);
	wl->wl_dealloclens = wapbl_malloc(sizeof(*wl->wl_dealloclens) *
	    wl->wl_dealloclim);
#endif

	wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);

	/* Initialize the commit header */
	{
		struct wapbl_wc_header *wc;
		size_t len = 1<<wl->wl_log_dev_bshift;
		wc = wapbl_calloc(1, len);
		wc->wc_type = WAPBL_WC_HEADER;
		wc->wc_len = len;
		wc->wc_circ_off = wl->wl_circ_off;
		wc->wc_circ_size = wl->wl_circ_size;
		/* XXX wc->wc_fsid */
		wc->wc_log_dev_bshift = wl->wl_log_dev_bshift;
		wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift;
		wl->wl_wc_header = wc;
		wl->wl_wc_scratch = wapbl_malloc(len);
	}

	/*
	 * If there was an existing set of unlinked but
	 * allocated inodes, preserve it in the new
	 * log.
	 */
	if (wr && wr->wr_inodescnt) {
		int i;

		WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
		    ("wapbl_start: reusing log with %d inodes\n",
		     wr->wr_inodescnt));

		/*
		 * It's only valid to reuse the replay log if it's
		 * the same as the new log we just opened.
		 */
		KDASSERT(!wapbl_replay_isopen(wr));
		KASSERT(devvp->v_rdev == wr->wr_devvp->v_rdev);
		KASSERT(logpbn == wr->wr_logpbn);
		KASSERT(wl->wl_circ_size == wr->wr_wc_header.wc_circ_size);
		KASSERT(wl->wl_circ_off == wr->wr_wc_header.wc_circ_off);
		KASSERT(wl->wl_log_dev_bshift ==
		    wr->wr_wc_header.wc_log_dev_bshift);
		KASSERT(wl->wl_fs_dev_bshift ==
		    wr->wr_wc_header.wc_fs_dev_bshift);

		wl->wl_wc_header->wc_generation =
		    wr->wr_wc_header.wc_generation + 1;

		for (i = 0; i < wr->wr_inodescnt; i++)
			wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber,
			    wr->wr_inodes[i].wr_imode);

		/* Make sure new transaction won't overwrite old inodes list */
		KDASSERT(wapbl_transaction_len(wl) <=
		    wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead,
		    wr->wr_inodestail));

		wl->wl_head = wl->wl_tail = wr->wr_inodeshead;
		wl->wl_reclaimable_bytes = wl->wl_reserved_bytes =
		    wapbl_transaction_len(wl);

		error = wapbl_write_inodes(wl, &wl->wl_head);
		if (error)
			goto errout;

		KASSERT(wl->wl_head != wl->wl_tail);
		KASSERT(wl->wl_head != 0);
	}

	error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail);
	if (error) {
		goto errout;
	}

	*wlp = wl;
#if defined(WAPBL_DEBUG)
	wapbl_debug_wl = wl;
#endif

	return 0;
errout:
	wapbl_discard(wl);
	wapbl_free(wl->wl_wc_scratch);
	wapbl_free(wl->wl_wc_header);
#if WAPBL_UVM_ALLOC
	uvm_km_free_wakeup(kernel_map, (vaddr_t) wl->wl_deallocblks,
	    round_page(sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim));
	uvm_km_free_wakeup(kernel_map, (vaddr_t) wl->wl_dealloclens,
	    round_page(sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim));
#else
	wapbl_free(wl->wl_deallocblks);
	wapbl_free(wl->wl_dealloclens);
#endif
	wapbl_inodetrk_free(wl);
	wapbl_free(wl);

	return error;
}

/*
 * Like wapbl_flush, only discards the transaction
 * completely.
 */

void
wapbl_discard(struct wapbl *wl)
{
	struct wapbl_entry *we;
	struct buf *bp;
	int i;

	/*
	 * XXX we may consider using upgrade here
	 * if we want to call flush from inside a transaction
	 */
	rw_enter(&wl->wl_rwlock, RW_WRITER);
	wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
	    wl->wl_dealloccnt);

#ifdef WAPBL_DEBUG_PRINT
	{
		struct wapbl_entry *we;
		pid_t pid = -1;
		lwpid_t lid = -1;
		if (curproc)
			pid = curproc->p_pid;
		if (curlwp)
			lid = curlwp->l_lid;
#ifdef WAPBL_DEBUG_BUFBYTES
		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
		    ("wapbl_discard: thread %d.%d discarding "
		     "transaction\n"
		     "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		     "deallocs=%d inodes=%d\n"
		     "\terrcnt = %u, reclaimable=%zu reserved=%zu "
		     "unsynced=%zu\n",
		     pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
		     wl->wl_bcount, wl->wl_dealloccnt,
		     wl->wl_inohashcnt, wl->wl_error_count,
		     wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
		     wl->wl_unsynced_bufbytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			     "error = %d, unsynced = %zu\n",
			     we->we_bufcount, we->we_reclaimable_bytes,
			     we->we_error, we->we_unsynced_bufbytes));
		}
#else /* !WAPBL_DEBUG_BUFBYTES */
		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
		    ("wapbl_discard: thread %d.%d discarding transaction\n"
		     "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		     "deallocs=%d inodes=%d\n"
		     "\terrcnt = %u, reclaimable=%zu reserved=%zu\n",
		     pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
		     wl->wl_bcount, wl->wl_dealloccnt,
		     wl->wl_inohashcnt, wl->wl_error_count,
		     wl->wl_reclaimable_bytes, wl->wl_reserved_bytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			     "error = %d\n",
			     we->we_bufcount, we->we_reclaimable_bytes,
			     we->we_error));
		}
#endif /* !WAPBL_DEBUG_BUFBYTES */
	}
#endif /* WAPBL_DEBUG_PRINT */

	for (i = 0; i <= wl->wl_inohashmask; i++) {
		struct wapbl_ino_head *wih;
		struct wapbl_ino *wi;

		wih = &wl->wl_inohash[i];
		while ((wi = LIST_FIRST(wih)) != NULL) {
			LIST_REMOVE(wi, wi_hash);
			pool_put(&wapbl_ino_pool, wi);
			KASSERT(wl->wl_inohashcnt > 0);
			wl->wl_inohashcnt--;
		}
	}

	/*
	 * clean buffer list
	 */
	mutex_enter(&bufcache_lock);
	mutex_enter(&wl->wl_mtx);
	while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
		if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) {
			/*
			 * The buffer will be unlocked and
			 * removed from the transaction in brelse
			 */
			mutex_exit(&wl->wl_mtx);
			brelsel(bp, 0);
			mutex_enter(&wl->wl_mtx);
		}
	}
	mutex_exit(&wl->wl_mtx);
	mutex_exit(&bufcache_lock);

	/*
	 * Remove references to this wl from wl_entries, free any which
	 * no longer have buffers; others will be freed in wapbl_biodone
	 * when they no longer have any buffers.
	 */
	while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) {
		SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
		/* XXX should we be accumulating wl_error_count
		 * and increasing reclaimable bytes ? */
		we->we_wapbl = NULL;
		if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
			KASSERT(we->we_unsynced_bufbytes == 0);
#endif
			wapbl_free(we);
		}
	}

	/* Discard list of deallocs */
	wl->wl_dealloccnt = 0;
	/* XXX should we clear wl_reserved_bytes? */

	KASSERT(wl->wl_bufbytes == 0);
	KASSERT(wl->wl_bcount == 0);
	KASSERT(wl->wl_bufcount == 0);
	KASSERT(LIST_EMPTY(&wl->wl_bufs));
	KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
	KASSERT(wl->wl_inohashcnt == 0);

	rw_exit(&wl->wl_rwlock);
}

int
wapbl_stop(struct wapbl *wl, int force)
{
	struct vnode *vp;
	int error;

	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n"));
	error = wapbl_flush(wl, 1);
	if (error) {
		if (force)
			wapbl_discard(wl);
		else
			return error;
	}

	/* Unlinked inodes persist after a flush */
	if (wl->wl_inohashcnt) {
		if (force) {
			wapbl_discard(wl);
		} else {
			return EBUSY;
		}
	}

	KASSERT(wl->wl_bufbytes == 0);
	KASSERT(wl->wl_bcount == 0);
	KASSERT(wl->wl_bufcount == 0);
	KASSERT(LIST_EMPTY(&wl->wl_bufs));
	KASSERT(wl->wl_dealloccnt == 0);
	KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
	KASSERT(wl->wl_inohashcnt == 0);

	vp = wl->wl_logvp;

	wapbl_free(wl->wl_wc_scratch);
	wapbl_free(wl->wl_wc_header);
#if WAPBL_UVM_ALLOC
	uvm_km_free_wakeup(kernel_map, (vaddr_t) wl->wl_deallocblks,
	    round_page(sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim));
	uvm_km_free_wakeup(kernel_map, (vaddr_t) wl->wl_dealloclens,
	    round_page(sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim));
#else
	wapbl_free(wl->wl_deallocblks);
	wapbl_free(wl->wl_dealloclens);
#endif
	wapbl_inodetrk_free(wl);
	wapbl_free(wl);

	return 0;
}

static int
wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
{
	struct pstats *pstats = curlwp->l_proc->p_stats;
	struct buf *bp;
	int error;

	KASSERT((flags & ~(B_WRITE|B_READ)) == 0);
	KASSERT(devvp->v_type == VBLK);

	if ((flags & (B_WRITE|B_READ)) == B_WRITE) {
		devvp->v_numoutput++;
		pstats->p_ru.ru_oublock++;
	} else {
		pstats->p_ru.ru_inblock++;
	}

	bp = getiobuf(devvp, true);
	bp->b_flags = flags;
	bp->b_cflags = BC_BUSY;	/* silly & dubious */
	bp->b_dev = devvp->v_rdev;
	bp->b_data = data;
	bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
	bp->b_blkno = pbn;

	WAPBL_PRINTF(WAPBL_PRINT_IO,
	    ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%x\n",
	     BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount,
	     bp->b_blkno, bp->b_dev));

	VOP_STRATEGY(devvp, bp);

	error = biowait(bp);
	putiobuf(bp);

	if (error) {
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_doio: %s %zu bytes at block %" PRId64
		     " on dev 0x%x failed with error %d\n",
		     (((flags & (B_WRITE|B_READ)) == B_WRITE) ?
		      "write" : "read"),
		     len, pbn, devvp->v_rdev, error));
	}

	return error;
}

int
wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

	return wapbl_doio(data, len, devvp, pbn, B_WRITE);
}

int
wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

	return wapbl_doio(data, len, devvp, pbn, B_READ);
}

/*
 * off is the current byte offset into the circular log; returns the new
 * offset for the next write.  Handles log wraparound.
 */
static int
wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp)
{
	size_t slen;
	off_t off = *offp;
	int error;

	KDASSERT(((len >> wl->wl_log_dev_bshift) <<
	    wl->wl_log_dev_bshift) == len);

	if (off < wl->wl_circ_off)
		off = wl->wl_circ_off;
	slen = wl->wl_circ_off + wl->wl_circ_size - off;
	if (slen < len) {
		error = wapbl_write(data, slen, wl->wl_devvp,
		    wl->wl_logpbn + (off >> wl->wl_log_dev_bshift));
		if (error)
			return error;
		data = (uint8_t *)data + slen;
		len -= slen;
		off = wl->wl_circ_off;
	}
	error = wapbl_write(data, len, wl->wl_devvp,
	    wl->wl_logpbn + (off >> wl->wl_log_dev_bshift));
	if (error)
		return error;
	off += len;
	if (off >= wl->wl_circ_off + wl->wl_circ_size)
		off = wl->wl_circ_off;
	*offp = off;
	return 0;
}
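
/*
 * Worked example for wapbl_circ_write() (editorial sketch, assumed
 * numbers): with wl_circ_off = 1024 and wl_circ_size = 8192, valid
 * offsets are [1024, 9216).  A write of len = 1024 at off = 8704 is
 * split: slen = 1024 + 8192 - 8704 = 512 bytes are written at 8704,
 * the remaining 512 bytes wrap to offset 1024, and *offp returns as
 * 1536.
 */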

/****************************************************************/

int
wapbl_begin(struct wapbl *wl, const char *file, int line)
{
	int doflush;
	unsigned lockcount;
	krw_t op;

	KDASSERT(wl);

#ifdef WAPBL_DEBUG_SERIALIZE
	op = RW_WRITER;
#else
	op = RW_READER;
#endif

	/*
	 * XXX this needs to be made much more sophisticated.
	 * perhaps each wapbl_begin could reserve a specified
	 * number of buffers and bytes.
	 */
	mutex_enter(&wl->wl_mtx);
	lockcount = wl->wl_lock_count;
	doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) >
	    wl->wl_bufbytes_max / 2) ||
	    ((wl->wl_bufcount + (lockcount * 10)) >
	    wl->wl_bufcount_max / 2) ||
	    (wapbl_transaction_len(wl) > wl->wl_circ_size / 2);
	mutex_exit(&wl->wl_mtx);

	if (doflush) {
		WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		    ("force flush lockcnt=%d bufbytes=%zu "
		     "(max=%zu) bufcount=%zu (max=%zu)\n",
		     lockcount, wl->wl_bufbytes,
		     wl->wl_bufbytes_max, wl->wl_bufcount,
		     wl->wl_bufcount_max));
	}

	if (doflush) {
		int error = wapbl_flush(wl, 0);
		if (error)
			return error;
	}

	rw_enter(&wl->wl_rwlock, op);
	mutex_enter(&wl->wl_mtx);
	wl->wl_lock_count++;
	mutex_exit(&wl->wl_mtx);

#if defined(WAPBL_DEBUG_PRINT) && defined(WAPBL_DEBUG_SERIALIZE)
	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
	    ("wapbl_begin thread %d.%d with bufcount=%zu "
	     "bufbytes=%zu bcount=%zu at %s:%d\n",
	     curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	     wl->wl_bufbytes, wl->wl_bcount, file, line));
#endif

	return 0;
}

void
wapbl_end(struct wapbl *wl)
{

#if defined(WAPBL_DEBUG_PRINT) && defined(WAPBL_DEBUG_SERIALIZE)
	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
	    ("wapbl_end thread %d.%d with bufcount=%zu "
	     "bufbytes=%zu bcount=%zu\n",
	     curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	     wl->wl_bufbytes, wl->wl_bcount));
#endif

	mutex_enter(&wl->wl_mtx);
	KASSERT(wl->wl_lock_count > 0);
	wl->wl_lock_count--;
	mutex_exit(&wl->wl_mtx);

	rw_exit(&wl->wl_rwlock);
}

void
wapbl_add_buf(struct wapbl *wl, struct buf *bp)
{

	KASSERT(bp->b_cflags & BC_BUSY);
	KASSERT(bp->b_vp);

	wapbl_jlock_assert(wl);

#if 0
	/*
	 * XXX this might be an issue for swapfiles.
	 * see uvm_swap.c:1702
	 *
	 * XXX2 why require it then? leap of semantics?
	 */
	KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
#endif

	mutex_enter(&wl->wl_mtx);
	if (bp->b_flags & B_LOCKED) {
		LIST_REMOVE(bp, b_wapbllist);
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER2,
		    ("wapbl_add_buf thread %d.%d re-adding buf %p "
		     "with %d bytes %d bcount\n",
		     curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
		     bp->b_bcount));
	} else {
		/* unlocked but dirty buffers shouldn't exist */
		KASSERT(!(bp->b_oflags & BO_DELWRI));
		wl->wl_bufbytes += bp->b_bufsize;
		wl->wl_bcount += bp->b_bcount;
		wl->wl_bufcount++;
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
		    ("wapbl_add_buf thread %d.%d adding buf %p "
		     "with %d bytes %d bcount\n",
		     curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
		     bp->b_bcount));
	}
	LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist);
	mutex_exit(&wl->wl_mtx);

	bp->b_flags |= B_LOCKED;
}

static void
wapbl_remove_buf_locked(struct wapbl *wl, struct buf *bp)
{

	KASSERT(mutex_owned(&wl->wl_mtx));
	KASSERT(bp->b_cflags & BC_BUSY);
	wapbl_jlock_assert(wl);

#if 0
	/*
	 * XXX this might be an issue for swapfiles.
	 * see uvm_swap.c:1725
	 *
	 * XXXdeux: see above
	 */
	KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
#endif
	KASSERT(bp->b_flags & B_LOCKED);

	WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
	    ("wapbl_remove_buf thread %d.%d removing buf %p with "
	     "%d bytes %d bcount\n",
	     curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount));

	KASSERT(wl->wl_bufbytes >= bp->b_bufsize);
	wl->wl_bufbytes -= bp->b_bufsize;
	KASSERT(wl->wl_bcount >= bp->b_bcount);
	wl->wl_bcount -= bp->b_bcount;
	KASSERT(wl->wl_bufcount > 0);
	wl->wl_bufcount--;
	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
	LIST_REMOVE(bp, b_wapbllist);

	bp->b_flags &= ~B_LOCKED;
}

/* called from brelsel() in vfs_bio among other places */
void
wapbl_remove_buf(struct wapbl *wl, struct buf *bp)
{

	mutex_enter(&wl->wl_mtx);
	wapbl_remove_buf_locked(wl, bp);
	mutex_exit(&wl->wl_mtx);
}

void
wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt)
{

	KASSERT(bp->b_cflags & BC_BUSY);

	/*
	 * XXX: why does this depend on B_LOCKED?  otherwise the buf
	 * is not for a transaction?  if so, why is this called in the
	 * first place?
	 */
	if (bp->b_flags & B_LOCKED) {
		mutex_enter(&wl->wl_mtx);
		wl->wl_bufbytes += bp->b_bufsize - oldsz;
		wl->wl_bcount += bp->b_bcount - oldcnt;
		mutex_exit(&wl->wl_mtx);
	}
}

#endif /* _KERNEL */

/****************************************************************/
/* Some utility inlines */

/* This is used to advance the pointer at old to the new value at old+delta */
static __inline off_t
wapbl_advance(size_t size, size_t off, off_t old, size_t delta)
{
	off_t new;

	/* Define acceptable ranges for inputs. */
	KASSERT(delta <= size);
	KASSERT((old == 0) || (old >= off));
	KASSERT(old < (size + off));

	if ((old == 0) && (delta != 0))
		new = off + delta;
	else if ((old + delta) < (size + off))
		new = old + delta;
	else
		new = (old + delta) - size;

	/* Note some interesting axioms */
	KASSERT((delta != 0) || (new == old));
	KASSERT((delta == 0) || (new != 0));
	KASSERT((delta != (size)) || (new == old));

	/* Define acceptable ranges for output. */
	KASSERT((new == 0) || (new >= off));
	KASSERT(new < (size + off));
	return new;
}
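
/*
 * Worked example for wapbl_advance() (editorial sketch, assumed
 * numbers): with size = 8192, off = 1024, old = 8704 and delta = 1024,
 * old + delta = 9728 >= size + off = 9216, so the result wraps to
 * 9728 - 8192 = 1536.  Advancing from the "empty" sentinel (old == 0)
 * by a nonzero delta yields off + delta instead.
 */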

static __inline size_t
wapbl_space_used(size_t avail, off_t head, off_t tail)
{

	if (tail == 0) {
		KASSERT(head == 0);
		return 0;
	}
	return ((head + (avail - 1) - tail) % avail) + 1;
}

static __inline size_t
wapbl_space_free(size_t avail, off_t head, off_t tail)
{

	return avail - wapbl_space_used(avail, head, tail);
}
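
/*
 * Worked example (editorial sketch, assumed numbers): with
 * avail = 8192, head = 3072 and tail = 1024,
 *
 *	used = ((3072 + 8191 - 1024) % 8192) + 1 = 2048
 *	free = 8192 - 2048 = 6144
 *
 * The "+ (avail - 1) ... % avail ... + 1" arrangement makes the full
 * case (head == tail != 0) come out as used == avail instead of 0.
 */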

static __inline void
wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp,
    off_t *tailp)
{
	off_t head = *headp;
	off_t tail = *tailp;

	KASSERT(delta <= wapbl_space_free(size, head, tail));
	head = wapbl_advance(size, off, head, delta);
	if ((tail == 0) && (head != 0))
		tail = off;
	*headp = head;
	*tailp = tail;
}

static __inline void
wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp,
    off_t *tailp)
{
	off_t head = *headp;
	off_t tail = *tailp;

	KASSERT(delta <= wapbl_space_used(size, head, tail));
	tail = wapbl_advance(size, off, tail, delta);
	if (head == tail) {
		head = tail = 0;
	}
	*headp = head;
	*tailp = tail;
}

#ifdef _KERNEL

/****************************************************************/

/*
 * Remove transactions whose buffers are completely flushed to disk.
 * Will block until at least minfree space is available.
 * Only intended to be called from inside wapbl_flush and therefore
 * does not protect against commit races with itself or with flush.
 */
static int
wapbl_truncate(struct wapbl *wl, size_t minfree, int waitonly)
{
	size_t delta;
	size_t avail;
	off_t head;
	off_t tail;
	int error = 0;

	KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes));
	KASSERT(rw_write_held(&wl->wl_rwlock));

	mutex_enter(&wl->wl_mtx);

	/*
	 * First check to see if we have to do a commit
	 * at all.
	 */
	avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail);
	if (minfree < avail) {
		mutex_exit(&wl->wl_mtx);
		return 0;
	}
	minfree -= avail;
	while ((wl->wl_error_count == 0) &&
	    (wl->wl_reclaimable_bytes < minfree)) {
		WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
		    ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd "
		     "minfree=%zd\n",
		     &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes,
		     minfree));

		cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx);
	}
	if (wl->wl_reclaimable_bytes < minfree) {
		KASSERT(wl->wl_error_count);
		/* XXX maybe get actual error from buffer instead someday? */
		error = EIO;
	}
	head = wl->wl_head;
	tail = wl->wl_tail;
	delta = wl->wl_reclaimable_bytes;

	/* If all of the entries are flushed, then be sure to keep
	 * the reserved bytes reserved.  Watch out for discarded transactions,
	 * which could leave more bytes reserved than are reclaimable.
	 */
	if (SIMPLEQ_EMPTY(&wl->wl_entries) &&
	    (delta >= wl->wl_reserved_bytes)) {
		delta -= wl->wl_reserved_bytes;
	}
	wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head,
	    &tail);
	KDASSERT(wl->wl_reserved_bytes <=
	    wapbl_space_used(wl->wl_circ_size, head, tail));
	mutex_exit(&wl->wl_mtx);

	if (error)
		return error;

	if (waitonly)
		return 0;

	/*
	 * This is where head, tail and delta are unprotected
	 * from races against itself or flush.  This is ok since
	 * we only call this routine from inside flush itself.
	 *
	 * XXX: how can it race against itself when accessed only
	 * from behind the write-locked rwlock?
	 */
	error = wapbl_write_commit(wl, head, tail);
	if (error)
		return error;

	wl->wl_head = head;
	wl->wl_tail = tail;

	mutex_enter(&wl->wl_mtx);
	KASSERT(wl->wl_reclaimable_bytes >= delta);
	wl->wl_reclaimable_bytes -= delta;
	mutex_exit(&wl->wl_mtx);
	WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
	    ("wapbl_truncate thread %d.%d truncating %zu bytes\n",
	     curproc->p_pid, curlwp->l_lid, delta));

	return 0;
}

/****************************************************************/

void
wapbl_biodone(struct buf *bp)
{
	struct wapbl_entry *we = bp->b_private;
	struct wapbl *wl = we->we_wapbl;

	/*
	 * Handle possible flushing of buffers after log has been
	 * decommissioned.
	 */
	if (!wl) {
		KASSERT(we->we_bufcount > 0);
		we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
		KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize);
		we->we_unsynced_bufbytes -= bp->b_bufsize;
#endif

		if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
			KASSERT(we->we_unsynced_bufbytes == 0);
#endif
			wapbl_free(we);
		}

		brelse(bp, 0);
		return;
	}

#ifdef ohbother
	KDASSERT(bp->b_flags & B_DONE);
	KDASSERT(!(bp->b_flags & B_DELWRI));
	KDASSERT(bp->b_flags & B_ASYNC);
	KDASSERT(bp->b_flags & B_BUSY);
	KDASSERT(!(bp->b_flags & B_LOCKED));
	KDASSERT(!(bp->b_flags & B_READ));
	KDASSERT(!(bp->b_flags & B_INVAL));
	KDASSERT(!(bp->b_flags & B_NOCACHE));
#endif

	if (bp->b_error) {
#ifdef notyet /* Can't currently handle possible dirty buffer reuse */
		/*
		 * XXXpooka: interfaces not fully updated.
		 * Note: this was not enabled in the original patch
		 * against netbsd4 either.  I don't know if the comment
		 * above is true or not.
		 */

		/*
		 * If an error occurs, report the error and leave the
		 * buffer as a delayed write on the LRU queue.
		 * restarting the write would likely result in
		 * an error spinloop, so let it be done harmlessly
		 * by the syncer.
		 */
		bp->b_flags &= ~(B_DONE);
		simple_unlock(&bp->b_interlock);

		if (we->we_error == 0) {
			mutex_enter(&wl->wl_mtx);
			wl->wl_error_count++;
			mutex_exit(&wl->wl_mtx);
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
		we->we_error = bp->b_error;
		bp->b_error = 0;
		brelse(bp);
		return;
#else
		/* For now, just mark the log permanently errored out */

		mutex_enter(&wl->wl_mtx);
		if (wl->wl_error_count == 0) {
			wl->wl_error_count++;
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
		mutex_exit(&wl->wl_mtx);
#endif
	}

	mutex_enter(&wl->wl_mtx);

	KASSERT(we->we_bufcount > 0);
	we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
	KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize);
	we->we_unsynced_bufbytes -= bp->b_bufsize;
	KASSERT(wl->wl_unsynced_bufbytes >= bp->b_bufsize);
	wl->wl_unsynced_bufbytes -= bp->b_bufsize;
#endif

	/*
	 * If the current transaction can be reclaimed, start
	 * at the beginning and reclaim any consecutive reclaimable
	 * transactions.  If we successfully reclaim anything,
	 * then wakeup anyone waiting for the reclaim.
	 */
	if (we->we_bufcount == 0) {
		size_t delta = 0;
		int errcnt = 0;
#ifdef WAPBL_DEBUG_BUFBYTES
		KDASSERT(we->we_unsynced_bufbytes == 0);
#endif
		/*
		 * clear any posted error, since the buffer it came from
		 * has been successfully flushed by now
		 */
		while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) &&
		    (we->we_bufcount == 0)) {
			delta += we->we_reclaimable_bytes;
			if (we->we_error)
				errcnt++;
			SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
			wapbl_free(we);
		}

		if (delta) {
			wl->wl_reclaimable_bytes += delta;
			KASSERT(wl->wl_error_count >= errcnt);
			wl->wl_error_count -= errcnt;
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
	}

	mutex_exit(&wl->wl_mtx);
	brelse(bp, 0);
}

/*
 * Write transactions to disk + start I/O for contents
 */
int
wapbl_flush(struct wapbl *wl, int waitfor)
{
	struct buf *bp;
	struct wapbl_entry *we;
	off_t off;
	off_t head;
	off_t tail;
	size_t delta = 0;
	size_t flushsize;
	size_t reserved;
	int error = 0;

	/*
	 * Do a quick check to see if a full flush can be skipped.
	 * This assumes that the flush callback does not need to be called
	 * unless there are other outstanding bufs.
	 */
	if (!waitfor) {
		size_t nbufs;
		mutex_enter(&wl->wl_mtx);	/* XXX need mutex here to
						   protect the KASSERTS */
		nbufs = wl->wl_bufcount;
		KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
		KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
		mutex_exit(&wl->wl_mtx);
		if (nbufs == 0)
			return 0;
	}

	/*
	 * XXX we may consider using LK_UPGRADE here
	 * if we want to call flush from inside a transaction
	 */
	rw_enter(&wl->wl_rwlock, RW_WRITER);
	wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
	    wl->wl_dealloccnt);

	/*
	 * Now that we are fully locked and flushed,
	 * do another check for nothing to do.
	 */
	if (wl->wl_bufcount == 0) {
		goto out;
	}

#if 0
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush thread %d.%d flushing entries with "
	     "bufcount=%zu bufbytes=%zu\n",
	     curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	     wl->wl_bufbytes));
#endif

	/* Calculate amount of space needed to flush */
	flushsize = wapbl_transaction_len(wl);

	if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
		/*
		 * XXX this could be handled more gracefully, perhaps place
		 * only a partial transaction in the log and allow the
		 * remaining to flush without the protection of the journal.
		 */
		panic("wapbl_flush: current transaction too big to flush\n");
	}

	error = wapbl_truncate(wl, flushsize, 0);
	if (error)
		goto out2;

	off = wl->wl_head;
	KASSERT((off == 0) || ((off >= wl->wl_circ_off) &&
	    (off < wl->wl_circ_off + wl->wl_circ_size)));
	error = wapbl_write_blocks(wl, &off);
	if (error)
		goto out2;
	error = wapbl_write_revocations(wl, &off);
	if (error)
		goto out2;
	error = wapbl_write_inodes(wl, &off);
	if (error)
		goto out2;

	reserved = 0;
	if (wl->wl_inohashcnt)
		reserved = wapbl_transaction_inodes_len(wl);

	head = wl->wl_head;
	tail = wl->wl_tail;

	wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize,
	    &head, &tail);
#ifdef WAPBL_DEBUG
	if (head != off) {
		panic("lost head! head=%"PRIdMAX" tail=%" PRIdMAX
		      " off=%"PRIdMAX" flush=%zu\n",
		      (intmax_t)head, (intmax_t)tail, (intmax_t)off,
		      flushsize);
	}
#else
	KASSERT(head == off);
#endif

	/* Opportunistically move the tail forward if we can */
	if (!wapbl_lazy_truncate) {
		mutex_enter(&wl->wl_mtx);
		delta = wl->wl_reclaimable_bytes;
		mutex_exit(&wl->wl_mtx);
		wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta,
		    &head, &tail);
	}

	error = wapbl_write_commit(wl, head, tail);
	if (error)
		goto out2;

	/* poolme? or kmemme? */
	we = wapbl_calloc(1, sizeof(*we));

#ifdef WAPBL_DEBUG_BUFBYTES
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
	     " unsynced=%zu"
	     "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
	     "inodes=%d\n",
	     curproc->p_pid, curlwp->l_lid, flushsize, delta,
	     wapbl_space_used(wl->wl_circ_size, head, tail),
	     wl->wl_unsynced_bufbytes, wl->wl_bufcount,
	     wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt,
	     wl->wl_inohashcnt));
#else
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
	     "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
	     "inodes=%d\n",
	     curproc->p_pid, curlwp->l_lid, flushsize, delta,
	     wapbl_space_used(wl->wl_circ_size, head, tail),
	     wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
	     wl->wl_dealloccnt, wl->wl_inohashcnt));
#endif


	mutex_enter(&bufcache_lock);
	mutex_enter(&wl->wl_mtx);

	wl->wl_reserved_bytes = reserved;
	wl->wl_head = head;
	wl->wl_tail = tail;
	KASSERT(wl->wl_reclaimable_bytes >= delta);
	wl->wl_reclaimable_bytes -= delta;
	wl->wl_dealloccnt = 0;
#ifdef WAPBL_DEBUG_BUFBYTES
	wl->wl_unsynced_bufbytes += wl->wl_bufbytes;
#endif

	we->we_wapbl = wl;
	we->we_bufcount = wl->wl_bufcount;
#ifdef WAPBL_DEBUG_BUFBYTES
	we->we_unsynced_bufbytes = wl->wl_bufbytes;
#endif
	we->we_reclaimable_bytes = flushsize;
	we->we_error = 0;
	SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries);

	/*
	 * This flushes bufs in reverse order from the order in which they
	 * were queued; it shouldn't matter, but if we care we could use
	 * TAILQ instead.  XXX Note they will get put on the LRU queue when
	 * they flush, so we might actually want to change this to preserve
	 * order.
	 */
	while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
		if (bbusy(bp, 0, 0, &wl->wl_mtx)) {
			continue;
		}
		bp->b_iodone = wapbl_biodone;
		bp->b_private = we;
		bremfree(bp);
		wapbl_remove_buf_locked(wl, bp);
		mutex_exit(&wl->wl_mtx);
		mutex_exit(&bufcache_lock);
		bawrite(bp);
		mutex_enter(&bufcache_lock);
		mutex_enter(&wl->wl_mtx);
	}
	mutex_exit(&wl->wl_mtx);
	mutex_exit(&bufcache_lock);

#if 0
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush thread %d.%d done flushing entries...\n",
	     curproc->p_pid, curlwp->l_lid));
#endif

out:

	/*
	 * If the waitfor flag is set, don't return until everything is
	 * fully flushed and the on disk log is empty.
	 */
	if (waitfor) {
		error = wapbl_truncate(wl, wl->wl_circ_size -
		    wl->wl_reserved_bytes, wapbl_lazy_truncate);
	}

out2:
	if (error) {
		wl->wl_flush_abort(wl->wl_mount, wl->wl_deallocblks,
		    wl->wl_dealloclens, wl->wl_dealloccnt);
	}

#ifdef WAPBL_DEBUG_PRINT
	if (error) {
		pid_t pid = -1;
		lwpid_t lid = -1;
		if (curproc)
			pid = curproc->p_pid;
		if (curlwp)
			lid = curlwp->l_lid;
		mutex_enter(&wl->wl_mtx);
#ifdef WAPBL_DEBUG_BUFBYTES
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_flush: thread %d.%d aborted flush: "
		     "error = %d\n"
		     "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		     "deallocs=%d inodes=%d\n"
		     "\terrcnt = %d, reclaimable=%zu reserved=%zu "
		     "unsynced=%zu\n",
		     pid, lid, error, wl->wl_bufcount,
		     wl->wl_bufbytes, wl->wl_bcount,
		     wl->wl_dealloccnt, wl->wl_inohashcnt,
		     wl->wl_error_count, wl->wl_reclaimable_bytes,
		     wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_ERROR,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			     "error = %d, unsynced = %zu\n",
			     we->we_bufcount, we->we_reclaimable_bytes,
			     we->we_error, we->we_unsynced_bufbytes));
		}
#else
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_flush: thread %d.%d aborted flush: "
		     "error = %d\n"
		     "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		     "deallocs=%d inodes=%d\n"
		     "\terrcnt = %d, reclaimable=%zu reserved=%zu\n",
		     pid, lid, error, wl->wl_bufcount,
		     wl->wl_bufbytes, wl->wl_bcount,
		     wl->wl_dealloccnt, wl->wl_inohashcnt,
		     wl->wl_error_count, wl->wl_reclaimable_bytes,
		     wl->wl_reserved_bytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_ERROR,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			     "error = %d\n", we->we_bufcount,
			     we->we_reclaimable_bytes, we->we_error));
		}
#endif
		mutex_exit(&wl->wl_mtx);
	}
#endif

	rw_exit(&wl->wl_rwlock);
	return error;
}

/****************************************************************/

void
wapbl_jlock_assert(struct wapbl *wl)
{

#ifdef WAPBL_DEBUG_SERIALIZE
	KASSERT(rw_write_held(&wl->wl_rwlock));
#else
	KASSERT(rw_read_held(&wl->wl_rwlock) || rw_write_held(&wl->wl_rwlock));
#endif
}

void
wapbl_junlock_assert(struct wapbl *wl)
{

#ifdef WAPBL_DEBUG_SERIALIZE
	KASSERT(!rw_write_held(&wl->wl_rwlock));
#endif
}

/****************************************************************/

/* locks missing */
void
wapbl_print(struct wapbl *wl,
    int full,
    void (*pr)(const char *, ...))
{
	struct buf *bp;
	struct wapbl_entry *we;
	(*pr)("wapbl %p", wl);
	(*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n",
	    wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn);
	(*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n",
	    wl->wl_circ_size, wl->wl_circ_off,
	    (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail);
	(*pr)("fs_dev_bshift = %d, log_dev_bshift = %d\n",
	    wl->wl_fs_dev_bshift, wl->wl_log_dev_bshift);
#ifdef WAPBL_DEBUG_BUFBYTES
	(*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
	      "reserved = %zu errcnt = %d unsynced = %zu\n",
	      wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
	      wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
	      wl->wl_error_count, wl->wl_unsynced_bufbytes);
#else
	(*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
	      "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes,
	      wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
	      wl->wl_error_count);
#endif
	(*pr)("\tdealloccnt = %d, dealloclim = %d\n",
	    wl->wl_dealloccnt, wl->wl_dealloclim);
	(*pr)("\tinohashcnt = %d, inohashmask = 0x%08lx\n",
	    wl->wl_inohashcnt, wl->wl_inohashmask);
	(*pr)("entries:\n");
	SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
#ifdef WAPBL_DEBUG_BUFBYTES
		(*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, "
		      "unsynced = %zu\n",
		      we->we_bufcount, we->we_reclaimable_bytes,
		      we->we_error, we->we_unsynced_bufbytes);
#else
		(*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n",
		      we->we_bufcount, we->we_reclaimable_bytes, we->we_error);
#endif
	}
	if (full) {
		int cnt = 0;
		(*pr)("bufs =");
		LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) {
			if (!LIST_NEXT(bp, b_wapbllist)) {
				(*pr)(" %p", bp);
			} else if ((++cnt % 6) == 0) {
				(*pr)(" %p,\n\t", bp);
			} else {
				(*pr)(" %p,", bp);
			}
		}
		(*pr)("\n");

		(*pr)("dealloced blks = ");
		{
			int i;
			cnt = 0;
			for (i = 0; i < wl->wl_dealloccnt; i++) {
				(*pr)(" %"PRId64":%d,",
				    wl->wl_deallocblks[i],
				    wl->wl_dealloclens[i]);
				if ((++cnt % 4) == 0) {
					(*pr)("\n\t");
				}
			}
		}
		(*pr)("\n");

		(*pr)("registered inodes = ");
		{
			int i;
			cnt = 0;
			for (i = 0; i <= wl->wl_inohashmask; i++) {
				struct wapbl_ino_head *wih;
				struct wapbl_ino *wi;

				wih = &wl->wl_inohash[i];
				LIST_FOREACH(wi, wih, wi_hash) {
					if (wi->wi_ino == 0)
						continue;
					(*pr)(" %"PRId32"/0%06"PRIo32",",
					    wi->wi_ino, wi->wi_mode);
					if ((++cnt % 4) == 0) {
						(*pr)("\n\t");
					}
				}
			}
			(*pr)("\n");
		}
	}
}

#if defined(WAPBL_DEBUG) || defined(DDB)
void
wapbl_dump(struct wapbl *wl)
{
#if defined(WAPBL_DEBUG)
	if (!wl)
		wl = wapbl_debug_wl;
#endif
	if (!wl)
		return;
	wapbl_print(wl, 1, printf);
}
#endif

/****************************************************************/

void
wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len)
{

	wapbl_jlock_assert(wl);

	/* XXX should eventually instead tie this into resource estimation */
	/* XXX this KASSERT needs locking/mutex analysis */
	KASSERT(wl->wl_dealloccnt < wl->wl_dealloclim);
	wl->wl_deallocblks[wl->wl_dealloccnt] = blk;
	wl->wl_dealloclens[wl->wl_dealloccnt] = len;
	wl->wl_dealloccnt++;
	WAPBL_PRINTF(WAPBL_PRINT_ALLOC,
	    ("wapbl_register_deallocation: blk=%"PRId64" len=%d\n", blk, len));
}

/****************************************************************/

/*
 * Singleton pool init
 */
static void
wapbl_pool_init(int *refcnt, struct pool *pp, size_t size, const char *wchan)
{

	mutex_enter(&wapbl_global_mtx);
	if ((*refcnt)++ == 0)
		pool_init(pp, size, 0, 0, 0, wchan,
		    &pool_allocator_nointr, IPL_NONE);
	mutex_exit(&wapbl_global_mtx);
}

static void
wapbl_pool_done(volatile int *refcnt, struct pool *pp)
{

	mutex_enter(&wapbl_global_mtx);
	if (--(*refcnt) == 0)
		pool_destroy(pp);
	mutex_exit(&wapbl_global_mtx);
}

static void
wapbl_inodetrk_init(struct wapbl *wl, u_int size)
{

	wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask);
	wapbl_pool_init(&wapbl_ino_pool_refcount, &wapbl_ino_pool,
	    sizeof(struct wapbl_ino), "wapblinopl");
}

static void
wapbl_inodetrk_free(struct wapbl *wl)
{

	/* XXX this KASSERT needs locking/mutex analysis */
	KASSERT(wl->wl_inohashcnt == 0);
	hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask);
	wapbl_pool_done(&wapbl_ino_pool_refcount, &wapbl_ino_pool);
}

static struct wapbl_ino *
wapbl_inodetrk_get(struct wapbl *wl, ino_t ino)
{
	struct wapbl_ino_head *wih;
	struct wapbl_ino *wi;

	KASSERT(mutex_owned(&wl->wl_mtx));

	wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
	LIST_FOREACH(wi, wih, wi_hash) {
		if (ino == wi->wi_ino)
			return wi;
	}
	return 0;
}

void
wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode)
{
	struct wapbl_ino_head *wih;
	struct wapbl_ino *wi = pool_get(&wapbl_ino_pool, PR_WAITOK);

	mutex_enter(&wl->wl_mtx);
	if (wapbl_inodetrk_get(wl, ino)) {
		pool_put(&wapbl_ino_pool, wi);
	} else {
		wi->wi_ino = ino;
		wi->wi_mode = mode;
		wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
		LIST_INSERT_HEAD(wih, wi, wi_hash);
		wl->wl_inohashcnt++;
		WAPBL_PRINTF(WAPBL_PRINT_INODE,
		    ("wapbl_register_inode: ino=%"PRId64"\n", ino));
	}
	mutex_exit(&wl->wl_mtx);
}

void
wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode)
{
	struct wapbl_ino *wi;

	mutex_enter(&wl->wl_mtx);
	wi = wapbl_inodetrk_get(wl, ino);
	if (wi) {
		WAPBL_PRINTF(WAPBL_PRINT_INODE,
		    ("wapbl_unregister_inode: ino=%"PRId64"\n", ino));
		KASSERT(wl->wl_inohashcnt > 0);
		wl->wl_inohashcnt--;
		LIST_REMOVE(wi, wi_hash);
		mutex_exit(&wl->wl_mtx);

		pool_put(&wapbl_ino_pool, wi);
	} else {
		mutex_exit(&wl->wl_mtx);
	}
}

/****************************************************************/

static __inline size_t
wapbl_transaction_inodes_len(struct wapbl *wl)
{
	int blocklen = 1<<wl->wl_log_dev_bshift;
	int iph;

	/* Calculate number of inodes described in an inodelist header */
	iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
	    sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);

	KASSERT(iph > 0);

	return MAX(1, howmany(wl->wl_inohashcnt, iph))*blocklen;
}


/* Calculate amount of space a transaction will take on disk */
static size_t
wapbl_transaction_len(struct wapbl *wl)
{
	int blocklen = 1<<wl->wl_log_dev_bshift;
	size_t len;
	int bph;

	/* Calculate number of blocks described in a blocklist header */
	bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
	    sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);

	KASSERT(bph > 0);

	len = wl->wl_bcount;
	len += howmany(wl->wl_bufcount, bph)*blocklen;
	len += howmany(wl->wl_dealloccnt, bph)*blocklen;
	len += wapbl_transaction_inodes_len(wl);

	return len;
}
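
/*
 * Sizing sketch (editorial; the struct sizes below are assumptions for
 * illustration only, the code above computes the real ones with
 * offsetof/sizeof): for a 512-byte log block, a blocklist header of
 * 32 bytes and 16-byte wc_blocks[] entries give
 * bph = (512 - 32) / 16 = 30, so a transaction with 100 buffers needs
 * howmany(100, 30) = 4 blocklist header blocks, plus wl_bcount bytes
 * of buffer data, plus revocation and inodelist blocks accounted the
 * same way.
 */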

/*
 * Perform commit operation
 *
 * Note that incrementing the generation number needs to
 * be protected against racing with other invocations
 * of wapbl_write_commit.  This is ok since this routine
 * is only invoked from wapbl_flush
 */
static int
wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail)
{
	struct wapbl_wc_header *wc = wl->wl_wc_header;
	struct timespec ts;
	int error;
	int force = 1;

	/* XXX Calc checksum here, instead we do this for now */
	error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, FWRITE, FSCRED);
	if (error) {
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%x "
		     "returned %d\n", wl->wl_devvp->v_rdev, error));
	}

	wc->wc_head = head;
	wc->wc_tail = tail;
	wc->wc_checksum = 0;
	wc->wc_version = 1;
	getnanotime(&ts);	/* XXX need higher resolution time here? */
	wc->wc_time = ts.tv_sec;
	wc->wc_timensec = ts.tv_nsec;

	WAPBL_PRINTF(WAPBL_PRINT_WRITE,
	    ("wapbl_write_commit: head = %"PRIdMAX" tail = %"PRIdMAX"\n",
	     (intmax_t)head, (intmax_t)tail));

	/*
	 * XXX if generation will rollover, then first zero
	 * over second commit header before trying to write both headers.
	 */

	error = wapbl_write(wc, wc->wc_len, wl->wl_devvp,
	    wl->wl_logpbn + wc->wc_generation % 2);
	if (error)
		return error;

	error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, FWRITE, FSCRED);
	if (error) {
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%x "
		     "returned %d\n", wl->wl_devvp->v_rdev, error));
	}

	/*
	 * If the generation number was zero, write it out a second time.
	 * This handles initialization and generation number rollover
	 */
	if (wc->wc_generation++ == 0) {
		error = wapbl_write_commit(wl, head, tail);
		/*
		 * This panic should be able to be removed if we do the
		 * zero'ing mentioned above, and we are certain to roll
		 * back generation number on failure.
		 */
		if (error)
			panic("wapbl_write_commit: error writing duplicate "
			      "log header: %d\n", error);
	}
	return 0;
}
1909
1910 /* Write the locked buffers to the log; the new log offset is returned via offp */
1911 static int
1912 wapbl_write_blocks(struct wapbl *wl, off_t *offp)
1913 {
1914 struct wapbl_wc_blocklist *wc =
1915 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
1916 int blocklen = 1<<wl->wl_log_dev_bshift;
1917 int bph;
1918 struct buf *bp;
1919 off_t off = *offp;
1920 int error;
1921
1922 KASSERT(rw_write_held(&wl->wl_rwlock));
1923
1924 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
1925 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
1926
1927 bp = LIST_FIRST(&wl->wl_bufs);
1928
1929 while (bp) {
1930 int cnt;
1931 struct buf *obp = bp;
1932
1933 KASSERT(bp->b_flags & B_LOCKED);
1934
1935 wc->wc_type = WAPBL_WC_BLOCKS;
1936 wc->wc_len = blocklen;
1937 wc->wc_blkcount = 0;
1938 while (bp && (wc->wc_blkcount < bph)) {
1939 			/*
1940 			 * Make sure all the physical block numbers are up to
1941 			 * date. If this is not always true on a given
1942 			 * filesystem, then VOP_BMAP must be called. We
1943 			 * could call VOP_BMAP here, or else in the filesystem
1944 			 * specific flush callback, although neither of those
1945 			 * solutions allows us to take the vnode lock. If a
1946 			 * filesystem requires the vnode lock in order to
1947 			 * call VOP_BMAP, then we can probably do it in
1948 			 * bwrite, where the vnode lock should already be held
1949 			 * by the invoking code.
1950 			 */
1951 KASSERT((bp->b_vp->v_type == VBLK) ||
1952 (bp->b_blkno != bp->b_lblkno));
1953 KASSERT(bp->b_blkno > 0);
1954
1955 wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno;
1956 wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount;
1957 wc->wc_len += bp->b_bcount;
1958 wc->wc_blkcount++;
1959 bp = LIST_NEXT(bp, b_wapbllist);
1960 }
1961 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
1962 ("wapbl_write_blocks: len = %u off = %"PRIdMAX"\n",
1963 wc->wc_len, (intmax_t)off));
1964
1965 error = wapbl_circ_write(wl, wc, blocklen, &off);
1966 if (error)
1967 return error;
1968 bp = obp;
1969 cnt = 0;
1970 while (bp && (cnt++ < bph)) {
1971 error = wapbl_circ_write(wl, bp->b_data,
1972 bp->b_bcount, &off);
1973 if (error)
1974 return error;
1975 bp = LIST_NEXT(bp, b_wapbllist);
1976 }
1977 }
1978 *offp = off;
1979 return 0;
1980 }
1981
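/*
 * Write the pending block deallocations to the log as
 * WAPBL_WC_REVOCATIONS records. Only the blocklist headers are
 * written, since revoked blocks carry no data payload.
 */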
1982 static int
1983 wapbl_write_revocations(struct wapbl *wl, off_t *offp)
1984 {
1985 struct wapbl_wc_blocklist *wc =
1986 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
1987 int i;
1988 int blocklen = 1<<wl->wl_log_dev_bshift;
1989 int bph;
1990 off_t off = *offp;
1991 int error;
1992
1993 if (wl->wl_dealloccnt == 0)
1994 return 0;
1995
1996 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
1997 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
1998
1999 i = 0;
2000 while (i < wl->wl_dealloccnt) {
2001 wc->wc_type = WAPBL_WC_REVOCATIONS;
2002 wc->wc_len = blocklen;
2003 wc->wc_blkcount = 0;
2004 while ((i < wl->wl_dealloccnt) && (wc->wc_blkcount < bph)) {
2005 wc->wc_blocks[wc->wc_blkcount].wc_daddr =
2006 wl->wl_deallocblks[i];
2007 wc->wc_blocks[wc->wc_blkcount].wc_dlen =
2008 wl->wl_dealloclens[i];
2009 wc->wc_blkcount++;
2010 i++;
2011 }
2012 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2013 ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n",
2014 wc->wc_len, (intmax_t)off));
2015 error = wapbl_circ_write(wl, wc, blocklen, &off);
2016 if (error)
2017 return error;
2018 }
2019 *offp = off;
2020 return 0;
2021 }
2022
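/*
 * Write the tracked inode list to the log as one or more
 * WAPBL_WC_INODES records. The first record has wc_clear set so
 * that replay starts over from an empty inode list.
 */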
2023 static int
2024 wapbl_write_inodes(struct wapbl *wl, off_t *offp)
2025 {
2026 struct wapbl_wc_inodelist *wc =
2027 (struct wapbl_wc_inodelist *)wl->wl_wc_scratch;
2028 int i;
2029 int blocklen = 1<<wl->wl_log_dev_bshift;
2030 off_t off = *offp;
2031 int error;
2032
2033 struct wapbl_ino_head *wih;
2034 struct wapbl_ino *wi;
2035 int iph;
2036
2037 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
2038 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
2039
2040 i = 0;
2041 wih = &wl->wl_inohash[0];
2042 wi = 0;
2043 do {
2044 wc->wc_type = WAPBL_WC_INODES;
2045 wc->wc_len = blocklen;
2046 wc->wc_inocnt = 0;
2047 wc->wc_clear = (i == 0);
2048 while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) {
2049 while (!wi) {
2050 KASSERT((wih - &wl->wl_inohash[0])
2051 <= wl->wl_inohashmask);
2052 wi = LIST_FIRST(wih++);
2053 }
2054 wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino;
2055 wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode;
2056 wc->wc_inocnt++;
2057 i++;
2058 wi = LIST_NEXT(wi, wi_hash);
2059 }
2060 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2061 ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n",
2062 wc->wc_len, (intmax_t)off));
2063 error = wapbl_circ_write(wl, wc, blocklen, &off);
2064 if (error)
2065 return error;
2066 } while (i < wl->wl_inohashcnt);
2067
2068 *offp = off;
2069 return 0;
2070 }
2071
2072 #endif /* _KERNEL */
2073
2074 /****************************************************************/
2075
2076 #ifdef _KERNEL
2077 static struct pool wapbl_blk_pool;
2078 static int wapbl_blk_pool_refcount;
2079 #endif
2080 struct wapbl_blk {
2081 LIST_ENTRY(wapbl_blk) wb_hash;
2082 daddr_t wb_blk;
2083 off_t wb_off; /* Offset of this block in the log */
2084 };
2085 #define WAPBL_BLKPOOL_MIN 83
2086
2087 static void
2088 wapbl_blkhash_init(struct wapbl_replay *wr, u_int size)
2089 {
2090 if (size < WAPBL_BLKPOOL_MIN)
2091 size = WAPBL_BLKPOOL_MIN;
2092 KASSERT(wr->wr_blkhash == 0);
2093 #ifdef _KERNEL
2094 wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask);
2095 wapbl_pool_init(&wapbl_blk_pool_refcount, &wapbl_blk_pool,
2096 sizeof(struct wapbl_blk), "wapblblkpl");
2097 #else /* ! _KERNEL */
2098 /* Manually implement hashinit */
2099 {
2100 int i;
2101 unsigned long hashsize;
2102 for (hashsize = 1; hashsize < size; hashsize <<= 1)
2103 continue;
2104 wr->wr_blkhash = wapbl_malloc(hashsize * sizeof(*wr->wr_blkhash));
2105 		wr->wr_blkhashmask = hashsize - 1;	/* set before the init loop */
2106 		for (i = 0; i <= wr->wr_blkhashmask; i++)
2107 			LIST_INIT(&wr->wr_blkhash[i]);
2108 }
2109 #endif /* ! _KERNEL */
2110 }
2111
2112 static void
2113 wapbl_blkhash_free(struct wapbl_replay *wr)
2114 {
2115 KASSERT(wr->wr_blkhashcnt == 0);
2116 #ifdef _KERNEL
2117 hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask);
2118 wapbl_pool_done(&wapbl_blk_pool_refcount, &wapbl_blk_pool);
2119 #else /* ! _KERNEL */
2120 wapbl_free(wr->wr_blkhash);
2121 #endif /* ! _KERNEL */
2122 }
2123
2124 static struct wapbl_blk *
2125 wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk)
2126 {
2127 struct wapbl_blk_head *wbh;
2128 struct wapbl_blk *wb;
2129 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2130 LIST_FOREACH(wb, wbh, wb_hash) {
2131 if (blk == wb->wb_blk)
2132 return wb;
2133 }
2134 return 0;
2135 }
2136
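/*
 * Insert a block into the replay hashtable, or update its stored
 * log offset if it is already present; replay thus always uses the
 * newest copy of a block found in the log.
 */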
2137 static void
2138 wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off)
2139 {
2140 struct wapbl_blk_head *wbh;
2141 struct wapbl_blk *wb;
2142 wb = wapbl_blkhash_get(wr, blk);
2143 if (wb) {
2144 KASSERT(wb->wb_blk == blk);
2145 wb->wb_off = off;
2146 } else {
2147 #ifdef _KERNEL
2148 wb = pool_get(&wapbl_blk_pool, PR_WAITOK);
2149 #else /* ! _KERNEL */
2150 wb = wapbl_malloc(sizeof(*wb));
2151 #endif /* ! _KERNEL */
2152 wb->wb_blk = blk;
2153 wb->wb_off = off;
2154 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2155 LIST_INSERT_HEAD(wbh, wb, wb_hash);
2156 wr->wr_blkhashcnt++;
2157 }
2158 }
2159
2160 static void
2161 wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk)
2162 {
2163 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2164 if (wb) {
2165 KASSERT(wr->wr_blkhashcnt > 0);
2166 wr->wr_blkhashcnt--;
2167 LIST_REMOVE(wb, wb_hash);
2168 #ifdef _KERNEL
2169 pool_put(&wapbl_blk_pool, wb);
2170 #else /* ! _KERNEL */
2171 wapbl_free(wb);
2172 #endif /* ! _KERNEL */
2173 }
2174 }
2175
2176 static void
2177 wapbl_blkhash_clear(struct wapbl_replay *wr)
2178 {
2179 int i;
2180 for (i = 0; i <= wr->wr_blkhashmask; i++) {
2181 struct wapbl_blk *wb;
2182
2183 while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) {
2184 KASSERT(wr->wr_blkhashcnt > 0);
2185 wr->wr_blkhashcnt--;
2186 LIST_REMOVE(wb, wb_hash);
2187 #ifdef _KERNEL
2188 pool_put(&wapbl_blk_pool, wb);
2189 #else /* ! _KERNEL */
2190 wapbl_free(wb);
2191 #endif /* ! _KERNEL */
2192 }
2193 }
2194 KASSERT(wr->wr_blkhashcnt == 0);
2195 }
2196
2197 /****************************************************************/
2198
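/*
 * Read len bytes at *offp from the circular log, splitting the read
 * in two when the range wraps past the end of the circular area, and
 * return the advanced offset through *offp.
 */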
2199 static int
2200 wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp)
2201 {
2202 size_t slen;
2203 struct wapbl_wc_header *wc = &wr->wr_wc_header;
2204 off_t off = *offp;
2205 int error;
2206
2207 KASSERT(((len >> wc->wc_log_dev_bshift) <<
2208 wc->wc_log_dev_bshift) == len);
2209 if (off < wc->wc_circ_off)
2210 off = wc->wc_circ_off;
2211 slen = wc->wc_circ_off + wc->wc_circ_size - off;
2212 if (slen < len) {
2213 error = wapbl_read(data, slen, wr->wr_devvp,
2214 wr->wr_logpbn + (off >> wc->wc_log_dev_bshift));
2215 if (error)
2216 return error;
2217 data = (uint8_t *)data + slen;
2218 len -= slen;
2219 off = wc->wc_circ_off;
2220 }
2221 error = wapbl_read(data, len, wr->wr_devvp,
2222 wr->wr_logpbn + (off >> wc->wc_log_dev_bshift));
2223 if (error)
2224 return error;
2225 off += len;
2226 if (off >= wc->wc_circ_off + wc->wc_circ_size)
2227 off = wc->wc_circ_off;
2228 *offp = off;
2229 return 0;
2230 }
2231
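/*
 * Advance *offp by len bytes without reading, applying the same
 * wrap rules as wapbl_circ_read.
 */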
2232 static void
2233 wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp)
2234 {
2235 size_t slen;
2236 struct wapbl_wc_header *wc = &wr->wr_wc_header;
2237 off_t off = *offp;
2238
2239 KASSERT(((len >> wc->wc_log_dev_bshift) <<
2240 wc->wc_log_dev_bshift) == len);
2241
2242 if (off < wc->wc_circ_off)
2243 off = wc->wc_circ_off;
2244 slen = wc->wc_circ_off + wc->wc_circ_size - off;
2245 if (slen < len) {
2246 len -= slen;
2247 off = wc->wc_circ_off;
2248 }
2249 off += len;
2250 if (off >= wc->wc_circ_off + wc->wc_circ_size)
2251 off = wc->wc_circ_off;
2252 *offp = off;
2253 }
2254
2255 /****************************************************************/
2256
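/*
 * Open a log for replay: read both commit headers, pick the one with
 * the newer generation, then prescan the log to build the block
 * hashtable and collect the saved inode list.
 */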
2257 int
2258 wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp,
2259 daddr_t off, size_t count, size_t blksize)
2260 {
2261 struct wapbl_replay *wr;
2262 int error;
2263 struct vnode *devvp;
2264 daddr_t logpbn;
2265 uint8_t *scratch;
2266 struct wapbl_wc_header *wch;
2267 struct wapbl_wc_header *wch2;
2268 /* Use this until we read the actual log header */
2269 int log_dev_bshift = DEV_BSHIFT;
2270 size_t used;
2271
2272 WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
2273 ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n",
2274 vp, off, count, blksize));
2275
2276 if (off < 0)
2277 return EINVAL;
2278
2279 if (blksize < DEV_BSIZE)
2280 return EINVAL;
2281 if (blksize % DEV_BSIZE)
2282 return EINVAL;
2283
2284 #ifdef _KERNEL
2285 #if 0
2286 /* XXX vp->v_size isn't reliably set for VBLK devices,
2287 * especially root. However, we might still want to verify
2288 * that the full load is readable */
2289 if ((off + count) * blksize > vp->v_size)
2290 return EINVAL;
2291 #endif
2292
2293 if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) {
2294 return error;
2295 }
2296 #else /* ! _KERNEL */
2297 devvp = vp;
2298 logpbn = off;
2299 #endif /* ! _KERNEL */
2300
2301 scratch = wapbl_malloc(MAXBSIZE);
2302
2303 error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, logpbn);
2304 if (error)
2305 goto errout;
2306
2307 wch = (struct wapbl_wc_header *)scratch;
2308 wch2 =
2309 (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift));
2310 /* XXX verify checksums and magic numbers */
2311 if (wch->wc_type != WAPBL_WC_HEADER) {
2312 printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type);
2313 error = EFTYPE;
2314 goto errout;
2315 }
2316
2317 if (wch2->wc_generation > wch->wc_generation)
2318 wch = wch2;
2319
2320 wr = wapbl_calloc(1, sizeof(*wr));
2321
2322 wr->wr_logvp = vp;
2323 wr->wr_devvp = devvp;
2324 wr->wr_logpbn = logpbn;
2325
2326 wr->wr_scratch = scratch;
2327
2328 memcpy(&wr->wr_wc_header, wch, sizeof(wr->wr_wc_header));
2329
2330 used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail);
2331
2332 WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
2333 ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64
2334 " len=%"PRId64" used=%zu\n",
2335 wch->wc_head, wch->wc_tail, wch->wc_circ_off,
2336 wch->wc_circ_size, used));
2337
2338 wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift));
2339 error = wapbl_replay_prescan(wr);
2340 if (error) {
2341 wapbl_replay_stop(wr);
2342 wapbl_replay_free(wr);
2343 return error;
2344 }
2345
2346 error = wapbl_replay_get_inodes(wr);
2347 if (error) {
2348 wapbl_replay_stop(wr);
2349 wapbl_replay_free(wr);
2350 return error;
2351 }
2352
2353 *wrp = wr;
2354 return 0;
2355
2356 errout:
2357 wapbl_free(scratch);
2358 return error;
2359 }
2360
2361 void
2362 wapbl_replay_stop(struct wapbl_replay *wr)
2363 {
2364
2365 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n"));
2366
2367 KDASSERT(wapbl_replay_isopen(wr));
2368
2369 wapbl_free(wr->wr_scratch);
2370 wr->wr_scratch = 0;
2371
2372 wr->wr_logvp = 0;
2373
2374 wapbl_blkhash_clear(wr);
2375 wapbl_blkhash_free(wr);
2376 }
2377
2378 void
2379 wapbl_replay_free(struct wapbl_replay *wr)
2380 {
2381
2382 KDASSERT(!wapbl_replay_isopen(wr));
2383
2384 if (wr->wr_inodes)
2385 wapbl_free(wr->wr_inodes);
2386 wapbl_free(wr);
2387 }
2388
2389 int
2390 wapbl_replay_isopen1(struct wapbl_replay *wr)
2391 {
2392
2393 return wapbl_replay_isopen(wr);
2394 }
2395
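/*
 * Walk the log from tail to head, entering each logged data block
 * into the hashtable and removing revoked blocks, so that only
 * blocks still in need of replay remain afterwards.
 */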
2396 static int
2397 wapbl_replay_prescan(struct wapbl_replay *wr)
2398 {
2399 off_t off;
2400 struct wapbl_wc_header *wch = &wr->wr_wc_header;
2401 int error;
2402
2403 int logblklen = 1<<wch->wc_log_dev_bshift;
2404 int fsblklen = 1<<wch->wc_fs_dev_bshift;
2405
2406 wapbl_blkhash_clear(wr);
2407
2408 off = wch->wc_tail;
2409 while (off != wch->wc_head) {
2410 struct wapbl_wc_null *wcn;
2411 off_t saveoff = off;
2412 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
2413 if (error)
2414 goto errout;
2415 wcn = (struct wapbl_wc_null *)wr->wr_scratch;
2416 switch (wcn->wc_type) {
2417 case WAPBL_WC_BLOCKS:
2418 {
2419 struct wapbl_wc_blocklist *wc =
2420 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2421 int i;
2422 for (i = 0; i < wc->wc_blkcount; i++) {
2423 int j, n;
2424 /*
2425 * Enter each physical block into the
2426 * hashtable independently
2427 */
2428 n = wc->wc_blocks[i].wc_dlen >>
2429 wch->wc_fs_dev_bshift;
2430 for (j = 0; j < n; j++) {
2431 wapbl_blkhash_ins(wr,
2432 wc->wc_blocks[i].wc_daddr + j,
2433 off);
2434 wapbl_circ_advance(wr,
2435 fsblklen, &off);
2436 }
2437 }
2438 }
2439 break;
2440
2441 case WAPBL_WC_REVOCATIONS:
2442 {
2443 struct wapbl_wc_blocklist *wc =
2444 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2445 int i;
2446 for (i = 0; i < wc->wc_blkcount; i++) {
2447 int j, n;
2448 /*
2449 * Remove any blocks found from the
2450 * hashtable
2451 */
2452 n = wc->wc_blocks[i].wc_dlen >>
2453 wch->wc_fs_dev_bshift;
2454 for (j = 0; j < n; j++) {
2455 wapbl_blkhash_rem(wr,
2456 wc->wc_blocks[i].wc_daddr + j);
2457 }
2458 }
2459 }
2460 break;
2461
2462 case WAPBL_WC_INODES:
2463 {
2464 struct wapbl_wc_inodelist *wc =
2465 (struct wapbl_wc_inodelist *)wr->wr_scratch;
2466 /*
2467 * Keep track of where we found this so we
2468 * can use it later
2469 */
2470 if (wc->wc_clear) {
2471 wr->wr_inodestail = saveoff;
2472 wr->wr_inodescnt = 0;
2473 }
2474 if (wr->wr_inodestail)
2475 wr->wr_inodeshead = off;
2476 wr->wr_inodescnt += wc->wc_inocnt;
2477 }
2478 break;
2479 default:
2480 printf("Unrecognized wapbl type: 0x%08x\n",
2481 wcn->wc_type);
2482 error = EFTYPE;
2483 goto errout;
2484 }
2485 wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
2486 if (off != saveoff) {
2487 printf("wapbl_replay: corrupted records\n");
2488 error = EFTYPE;
2489 goto errout;
2490 }
2491 }
2492 return 0;
2493
2494 errout:
2495 wapbl_blkhash_clear(wr);
2496 return error;
2497 }
2498
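/*
 * Second pass over the log: copy the inode list records located
 * during prescan into the wr_inodes array.
 */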
2499 static int
2500 wapbl_replay_get_inodes(struct wapbl_replay *wr)
2501 {
2502 off_t off;
2503 struct wapbl_wc_header *wch = &wr->wr_wc_header;
2504 int logblklen = 1<<wch->wc_log_dev_bshift;
2505 	int cnt = 0;
2506
2507 KDASSERT(wapbl_replay_isopen(wr));
2508
2509 if (wr->wr_inodescnt == 0)
2510 return 0;
2511
2512 KASSERT(!wr->wr_inodes);
2513
2514 wr->wr_inodes = wapbl_malloc(wr->wr_inodescnt*sizeof(wr->wr_inodes[0]));
2515
2516 off = wr->wr_inodestail;
2517
2518 while (off != wr->wr_inodeshead) {
2519 struct wapbl_wc_null *wcn;
2520 int error;
2521 off_t saveoff = off;
2522 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
2523 if (error) {
2524 wapbl_free(wr->wr_inodes);
2525 wr->wr_inodes = 0;
2526 return error;
2527 }
2528 wcn = (struct wapbl_wc_null *)wr->wr_scratch;
2529 switch (wcn->wc_type) {
2530 case WAPBL_WC_BLOCKS:
2531 case WAPBL_WC_REVOCATIONS:
2532 break;
2533 case WAPBL_WC_INODES:
2534 {
2535 struct wapbl_wc_inodelist *wc =
2536 (struct wapbl_wc_inodelist *)wr->wr_scratch;
2537 /*
2538 * Keep track of where we found this so we
2539 * can use it later
2540 */
2541 if (wc->wc_clear) {
2542 cnt = 0;
2543 }
2544 /* This memcpy assumes that wr_inodes is
2545 * laid out the same as wc_inodes. */
2546 memcpy(&wr->wr_inodes[cnt], wc->wc_inodes,
2547 wc->wc_inocnt*sizeof(wc->wc_inodes[0]));
2548 cnt += wc->wc_inocnt;
2549 }
2550 break;
2551 default:
2552 KASSERT(0);
2553 }
2554 off = saveoff;
2555 wapbl_circ_advance(wr, wcn->wc_len, &off);
2556 }
2557 KASSERT(cnt == wr->wr_inodescnt);
2558 return 0;
2559 }
2560
2561 #ifdef DEBUG
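/*
 * Debug-only consistency check: compare each logged block that still
 * needs replay against the current on-disk contents of the
 * filesystem device and report any mismatches.
 */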
2562 int
2563 wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp)
2564 {
2565 off_t off;
2566 struct wapbl_wc_header *wch = &wr->wr_wc_header;
2567 int mismatchcnt = 0;
2568 int logblklen = 1<<wch->wc_log_dev_bshift;
2569 int fsblklen = 1<<wch->wc_fs_dev_bshift;
2570 void *scratch1 = wapbl_malloc(MAXBSIZE);
2571 void *scratch2 = wapbl_malloc(MAXBSIZE);
2572 int error = 0;
2573
2574 KDASSERT(wapbl_replay_isopen(wr));
2575
2576 off = wch->wc_tail;
2577 while (off != wch->wc_head) {
2578 struct wapbl_wc_null *wcn;
2579 #ifdef DEBUG
2580 off_t saveoff = off;
2581 #endif
2582 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
2583 if (error)
2584 goto out;
2585 wcn = (struct wapbl_wc_null *)wr->wr_scratch;
2586 switch (wcn->wc_type) {
2587 case WAPBL_WC_BLOCKS:
2588 {
2589 struct wapbl_wc_blocklist *wc =
2590 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2591 int i;
2592 for (i = 0; i < wc->wc_blkcount; i++) {
2593 int foundcnt = 0;
2594 int dirtycnt = 0;
2595 int j, n;
2596 /*
2597 				 * Check each physical block against the
2598 				 * hashtable independently
2599 */
2600 n = wc->wc_blocks[i].wc_dlen >>
2601 wch->wc_fs_dev_bshift;
2602 for (j = 0; j < n; j++) {
2603 struct wapbl_blk *wb =
2604 wapbl_blkhash_get(wr,
2605 wc->wc_blocks[i].wc_daddr + j);
2606 if (wb && (wb->wb_off == off)) {
2607 foundcnt++;
2608 error =
2609 wapbl_circ_read(wr,
2610 scratch1, fsblklen,
2611 &off);
2612 if (error)
2613 goto out;
2614 error =
2615 wapbl_read(scratch2,
2616 fsblklen, fsdevvp,
2617 wb->wb_blk);
2618 if (error)
2619 goto out;
2620 if (memcmp(scratch1,
2621 scratch2,
2622 fsblklen)) {
2623 printf(
2624 "wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n",
2625 wb->wb_blk, (intmax_t)off);
2626 dirtycnt++;
2627 mismatchcnt++;
2628 }
2629 } else {
2630 wapbl_circ_advance(wr,
2631 fsblklen, &off);
2632 }
2633 }
2634 #if 0
2635 /*
2636 * If all of the blocks in an entry
2637 * are clean, then remove all of its
2638 * blocks from the hashtable since they
2639 			 * will never need replay.
2640 */
2641 if ((foundcnt != 0) &&
2642 (dirtycnt == 0)) {
2643 off = saveoff;
2644 wapbl_circ_advance(wr,
2645 logblklen, &off);
2646 for (j = 0; j < n; j++) {
2647 struct wapbl_blk *wb =
2648 wapbl_blkhash_get(wr,
2649 wc->wc_blocks[i].wc_daddr + j);
2650 if (wb &&
2651 (wb->wb_off == off)) {
2652 wapbl_blkhash_rem(wr, wb->wb_blk);
2653 }
2654 wapbl_circ_advance(wr,
2655 fsblklen, &off);
2656 }
2657 }
2658 #endif
2659 }
2660 }
2661 break;
2662 case WAPBL_WC_REVOCATIONS:
2663 case WAPBL_WC_INODES:
2664 break;
2665 default:
2666 KASSERT(0);
2667 }
2668 #ifdef DEBUG
2669 wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
2670 KASSERT(off == saveoff);
2671 #endif
2672 }
2673 out:
2674 wapbl_free(scratch1);
2675 wapbl_free(scratch2);
2676 if (!error && mismatchcnt)
2677 error = EFTYPE;
2678 return error;
2679 }
2680 #endif
2681
2682 int
2683 wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp)
2684 {
2685 off_t off;
2686 struct wapbl_wc_header *wch = &wr->wr_wc_header;
2687 int logblklen = 1<<wch->wc_log_dev_bshift;
2688 int fsblklen = 1<<wch->wc_fs_dev_bshift;
2689 void *scratch1 = wapbl_malloc(MAXBSIZE);
2690 int error = 0;
2691
2692 KDASSERT(wapbl_replay_isopen(wr));
2693
2694 /*
2695 * This parses the journal for replay, although it could
2696 * just as easily walk the hashtable instead.
2697 */
2698
2699 off = wch->wc_tail;
2700 while (off != wch->wc_head) {
2701 struct wapbl_wc_null *wcn;
2702 #ifdef DEBUG
2703 off_t saveoff = off;
2704 #endif
2705 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
2706 if (error)
2707 goto out;
2708 wcn = (struct wapbl_wc_null *)wr->wr_scratch;
2709 switch (wcn->wc_type) {
2710 case WAPBL_WC_BLOCKS:
2711 {
2712 struct wapbl_wc_blocklist *wc =
2713 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2714 int i;
2715 for (i = 0; i < wc->wc_blkcount; i++) {
2716 int j, n;
2717 /*
2718 * Check each physical block against
2719 * the hashtable independently
2720 */
2721 n = wc->wc_blocks[i].wc_dlen >>
2722 wch->wc_fs_dev_bshift;
2723 for (j = 0; j < n; j++) {
2724 struct wapbl_blk *wb =
2725 wapbl_blkhash_get(wr,
2726 wc->wc_blocks[i].wc_daddr + j);
2727 if (wb && (wb->wb_off == off)) {
2728 error = wapbl_circ_read(
2729 wr, scratch1,
2730 fsblklen, &off);
2731 if (error)
2732 goto out;
2733 error =
2734 wapbl_write(scratch1,
2735 fsblklen, fsdevvp,
2736 wb->wb_blk);
2737 if (error)
2738 goto out;
2739 } else {
2740 wapbl_circ_advance(wr,
2741 fsblklen, &off);
2742 }
2743 }
2744 }
2745 }
2746 break;
2747 case WAPBL_WC_REVOCATIONS:
2748 case WAPBL_WC_INODES:
2749 break;
2750 default:
2751 KASSERT(0);
2752 }
2753 #ifdef DEBUG
2754 wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
2755 KASSERT(off == saveoff);
2756 #endif
2757 }
2758 out:
2759 wapbl_free(scratch1);
2760 return error;
2761 }
2762
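/*
 * Overlay logged versions of the requested blocks onto the caller's
 * buffer; blocks with no entry in the hashtable are left untouched,
 * i.e. as the caller read them from the filesystem.
 */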
2763 int
2764 wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len)
2765 {
2766 struct wapbl_wc_header *wch = &wr->wr_wc_header;
2767 int fsblklen = 1<<wch->wc_fs_dev_bshift;
2768
2769 KDASSERT(wapbl_replay_isopen(wr));
2770
2771 KASSERT((len % fsblklen) == 0);
2772
2773 while (len != 0) {
2774 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2775 if (wb) {
2776 off_t off = wb->wb_off;
2777 int error;
2778 error = wapbl_circ_read(wr, data, fsblklen, &off);
2779 if (error)
2780 return error;
2781 }
2782 data = (uint8_t *)data + fsblklen;
2783 len -= fsblklen;
2784 blk++;
2785 }
2786 return 0;
2787 }
2788
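#if 0
/*
 * Illustrative (uncompiled) usage sketch of the replay entry points
 * above; the function name is hypothetical, error handling is
 * minimal, and the source of the vnodes is left to the caller.
 */
static int
wapbl_replay_example(struct vnode *logvp, struct vnode *fsdevvp,
	daddr_t off, size_t count, size_t blksize)
{
	struct wapbl_replay *wr;
	int error;

	/* Read the commit headers and prescan the log */
	error = wapbl_replay_start(&wr, logvp, off, count, blksize);
	if (error)
		return error;

	/* Write every block still needing replay back to the fs device */
	error = wapbl_replay_write(wr, fsdevvp);

	wapbl_replay_stop(wr);
	wapbl_replay_free(wr);
	return error;
}
#endif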