/*	$NetBSD: vfs_wapbl.c,v 1.16 2008/11/24 16:05:21 joerg Exp $	*/
2
3 /*-
4 * Copyright (c) 2003,2008 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Wasabi Systems, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
/*
 * This implements filesystem-independent write-ahead logging (WAPBL).
 */
35
36 #define WAPBL_INTERNAL
37
38 #include <sys/cdefs.h>
39 __KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.16 2008/11/24 16:05:21 joerg Exp $");
40
41 #include <sys/param.h>
42
43 #ifdef _KERNEL
44 #include <sys/param.h>
45 #include <sys/namei.h>
46 #include <sys/proc.h>
47 #include <sys/uio.h>
48 #include <sys/vnode.h>
49 #include <sys/file.h>
50 #include <sys/malloc.h>
51 #include <sys/resourcevar.h>
52 #include <sys/conf.h>
53 #include <sys/mount.h>
54 #include <sys/kernel.h>
55 #include <sys/kauth.h>
56 #include <sys/mutex.h>
57 #include <sys/atomic.h>
58 #include <sys/wapbl.h>
59 #include <sys/wapbl_replay.h>
60
61 #if WAPBL_UVM_ALLOC
62 #include <uvm/uvm.h>
63 #endif
64
65 #include <miscfs/specfs/specdev.h>
66
67 MALLOC_JUSTDEFINE(M_WAPBL, "wapbl", "write-ahead physical block logging");
68 #define wapbl_malloc(s) malloc((s), M_WAPBL, M_WAITOK)
69 #define wapbl_free(a) free((a), M_WAPBL)
70 #define wapbl_calloc(n, s) malloc((n)*(s), M_WAPBL, M_WAITOK | M_ZERO)
71 #define wapbl_realloc(ptr, s) realloc((ptr), (s), M_WAPBL, M_WAITOK | M_ZERO)
72
73 #else /* !_KERNEL */
74 #include <assert.h>
75 #include <errno.h>
76 #include <stdio.h>
77 #include <stdbool.h>
78 #include <stdlib.h>
79 #include <string.h>
80
81 #include <sys/time.h>
82 #include <sys/wapbl.h>
83 #include <sys/wapbl_replay.h>
84
85 #define KDASSERT(x) assert(x)
86 #define KASSERT(x) assert(x)
87 #define wapbl_malloc(s) malloc(s)
88 #define wapbl_free(a) free(a)
89 #define wapbl_calloc(n, s) calloc((n), (s))
90 #define wapbl_realloc(ptr, s) realloc((ptr), (s))
91
92 #endif /* !_KERNEL */
93
94 /*
95 * INTERNAL DATA STRUCTURES
96 */
97
98 /*
99 * This structure holds per-mount log information.
100 *
101 * Legend: a = atomic access only
102 * r = read-only after init
103 * l = rwlock held
104 * m = mutex held
105 * u = unlocked access ok
106 * b = bufcache_lock held
107 */
108 struct wapbl {
109 struct vnode *wl_logvp; /* r: log here */
110 struct vnode *wl_devvp; /* r: log on this device */
111 struct mount *wl_mount; /* r: mountpoint wl is associated with */
112 daddr_t wl_logpbn; /* r: Physical block number of start of log */
113 int wl_log_dev_bshift; /* r: logarithm of device block size of log
114 device */
115 int wl_fs_dev_bshift; /* r: logarithm of device block size of
116 filesystem device */
117
118 unsigned wl_lock_count; /* m: Count of transactions in progress */
119
120 size_t wl_circ_size; /* r: Number of bytes in buffer of log */
121 size_t wl_circ_off; /* r: Number of bytes reserved at start */
122
123 size_t wl_bufcount_max; /* r: Number of buffers reserved for log */
124 size_t wl_bufbytes_max; /* r: Number of buf bytes reserved for log */
125
126 off_t wl_head; /* l: Byte offset of log head */
127 off_t wl_tail; /* l: Byte offset of log tail */
/*
 * head == tail == 0 means log is empty
 * head == tail != 0 means log is full
 * see assertions in wapbl_advance() for other boundary conditions.
 * Only truncate moves the tail, except when flush sets it to
 * wl_header_size.  Only flush moves the head, except when truncate
 * sets it to 0.
 */
136
137 struct wapbl_wc_header *wl_wc_header; /* l */
void *wl_wc_scratch;	/* l: scratch space (XXX: why?!?) */
139
140 kmutex_t wl_mtx; /* u: short-term lock */
141 krwlock_t wl_rwlock; /* u: File system transaction lock */
142
/*
 * wl_mtx must be held while accessing
 * wl_bufcount, wl_bufs, or the head and tail offsets.
 */
147
148 /*
149 * Callback called from within the flush routine to flush any extra
150 * bits. Note that flush may be skipped without calling this if
151 * there are no outstanding buffers in the transaction.
152 */
153 #if _KERNEL
154 wapbl_flush_fn_t wl_flush; /* r */
155 wapbl_flush_fn_t wl_flush_abort;/* r */
156 #endif
157
158 size_t wl_bufbytes; /* m: Byte count of pages in wl_bufs */
159 size_t wl_bufcount; /* m: Count of buffers in wl_bufs */
160 size_t wl_bcount; /* m: Total bcount of wl_bufs */
161
162 LIST_HEAD(, buf) wl_bufs; /* m: Buffers in current transaction */
163
164 kcondvar_t wl_reclaimable_cv; /* m (obviously) */
165 size_t wl_reclaimable_bytes; /* m: Amount of space available for
166 reclamation by truncate */
167 int wl_error_count; /* m: # of wl_entries with errors */
168 size_t wl_reserved_bytes; /* never truncate log smaller than this */
169
170 #ifdef WAPBL_DEBUG_BUFBYTES
171 size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */
172 #endif
173
174 daddr_t *wl_deallocblks;/* l: address of block */
int *wl_dealloclens;	/* l: size of block (in fragments, remember!) */
176 int wl_dealloccnt; /* l: total count */
177 int wl_dealloclim; /* l: max count */
178
179 /* hashtable of inode numbers for allocated but unlinked inodes */
180 /* synch ??? */
181 LIST_HEAD(wapbl_ino_head, wapbl_ino) *wl_inohash;
182 u_long wl_inohashmask;
183 int wl_inohashcnt;
184
185 SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction
186 accounting */
187 };
188
189 #ifdef WAPBL_DEBUG_PRINT
190 int wapbl_debug_print = WAPBL_DEBUG_PRINT;
191 #endif
192
193 /****************************************************************/
194 #ifdef _KERNEL
195
196 #ifdef WAPBL_DEBUG
197 struct wapbl *wapbl_debug_wl;
198 #endif
199
200 static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail);
201 static int wapbl_write_blocks(struct wapbl *wl, off_t *offp);
202 static int wapbl_write_revocations(struct wapbl *wl, off_t *offp);
203 static int wapbl_write_inodes(struct wapbl *wl, off_t *offp);
204 #endif /* _KERNEL */
205
206 static int wapbl_replay_process(struct wapbl_replay *wr, off_t, off_t);
207
208 static __inline size_t wapbl_space_free(size_t avail, off_t head,
209 off_t tail);
210 static __inline size_t wapbl_space_used(size_t avail, off_t head,
211 off_t tail);
212
213 #ifdef _KERNEL
214
215 #define WAPBL_INODETRK_SIZE 83
216 static int wapbl_ino_pool_refcount;
217 static struct pool wapbl_ino_pool;
218 struct wapbl_ino {
219 LIST_ENTRY(wapbl_ino) wi_hash;
220 ino_t wi_ino;
221 mode_t wi_mode;
222 };
223
224 static void wapbl_inodetrk_init(struct wapbl *wl, u_int size);
225 static void wapbl_inodetrk_free(struct wapbl *wl);
226 static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino);
227
228 static size_t wapbl_transaction_len(struct wapbl *wl);
229 static __inline size_t wapbl_transaction_inodes_len(struct wapbl *wl);
230
231 #if 0
232 int wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
233 #endif
234
235 static int wapbl_replay_isopen1(struct wapbl_replay *);
236
237 /*
238 * This is useful for debugging. If set, the log will
239 * only be truncated when necessary.
240 */
241 int wapbl_lazy_truncate = 0;
242
243 struct wapbl_ops wapbl_ops = {
244 .wo_wapbl_discard = wapbl_discard,
245 .wo_wapbl_replay_isopen = wapbl_replay_isopen1,
246 .wo_wapbl_replay_can_read = wapbl_replay_can_read,
247 .wo_wapbl_replay_read = wapbl_replay_read,
248 .wo_wapbl_add_buf = wapbl_add_buf,
249 .wo_wapbl_remove_buf = wapbl_remove_buf,
250 .wo_wapbl_resize_buf = wapbl_resize_buf,
251 .wo_wapbl_begin = wapbl_begin,
252 .wo_wapbl_end = wapbl_end,
253 .wo_wapbl_junlock_assert= wapbl_junlock_assert,
254
255 /* XXX: the following is only used to say "this is a wapbl buf" */
256 .wo_wapbl_biodone = wapbl_biodone,
257 };
258
259 void
wapbl_init(void)
{
262
263 malloc_type_attach(M_WAPBL);
264 }
265
266 static int
267 wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr)
268 {
269 int error, i;
270
271 WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
272 ("wapbl_start: reusing log with %d inodes\n", wr->wr_inodescnt));
273
/*
 * It's only valid to reuse the replay log if it's
 * the same as the new log we just opened.
 */
278 KDASSERT(!wapbl_replay_isopen(wr));
279 KASSERT(wl->wl_devvp->v_rdev == wr->wr_devvp->v_rdev);
280 KASSERT(wl->wl_logpbn == wr->wr_logpbn);
281 KASSERT(wl->wl_circ_size == wr->wr_circ_size);
282 KASSERT(wl->wl_circ_off == wr->wr_circ_off);
283 KASSERT(wl->wl_log_dev_bshift == wr->wr_log_dev_bshift);
284 KASSERT(wl->wl_fs_dev_bshift == wr->wr_fs_dev_bshift);
285
286 wl->wl_wc_header->wc_generation = wr->wr_generation + 1;
287
288 for (i = 0; i < wr->wr_inodescnt; i++)
289 wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber,
290 wr->wr_inodes[i].wr_imode);
291
292 /* Make sure new transaction won't overwrite old inodes list */
293 KDASSERT(wapbl_transaction_len(wl) <=
294 wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead,
295 wr->wr_inodestail));
296
297 wl->wl_head = wl->wl_tail = wr->wr_inodeshead;
298 wl->wl_reclaimable_bytes = wl->wl_reserved_bytes =
299 wapbl_transaction_len(wl);
300
301 error = wapbl_write_inodes(wl, &wl->wl_head);
302 if (error)
303 return error;
304
305 KASSERT(wl->wl_head != wl->wl_tail);
306 KASSERT(wl->wl_head != 0);
307
308 return 0;
309 }
310
311 int
312 wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp,
313 daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr,
314 wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn)
315 {
316 struct wapbl *wl;
317 struct vnode *devvp;
318 daddr_t logpbn;
319 int error;
320 int log_dev_bshift = DEV_BSHIFT;
321 int fs_dev_bshift = DEV_BSHIFT;
322 int run;
323
324 WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64
325 " count=%zu blksize=%zu\n", vp, off, count, blksize));
326
327 if (log_dev_bshift > fs_dev_bshift) {
328 WAPBL_PRINTF(WAPBL_PRINT_OPEN,
329 ("wapbl: log device's block size cannot be larger "
330 "than filesystem's\n"));
331 /*
332 * Not currently implemented, although it could be if
333 * needed someday.
334 */
335 return ENOSYS;
336 }
337
338 if (off < 0)
339 return EINVAL;
340
341 if (blksize < DEV_BSIZE)
342 return EINVAL;
343 if (blksize % DEV_BSIZE)
344 return EINVAL;
345
346 /* XXXTODO: verify that the full load is writable */
347
348 /*
349 * XXX check for minimum log size
350 * minimum is governed by minimum amount of space
351 * to complete a transaction. (probably truncate)
352 */
353 /* XXX for now pick something minimal */
354 if ((count * blksize) < MAXPHYS) {
355 return ENOSPC;
356 }
357
358 if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) {
359 return error;
360 }
361
362 wl = wapbl_calloc(1, sizeof(*wl));
363 rw_init(&wl->wl_rwlock);
364 mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE);
365 cv_init(&wl->wl_reclaimable_cv, "wapblrec");
366 LIST_INIT(&wl->wl_bufs);
367 SIMPLEQ_INIT(&wl->wl_entries);
368
369 wl->wl_logvp = vp;
370 wl->wl_devvp = devvp;
371 wl->wl_mount = mp;
372 wl->wl_logpbn = logpbn;
373 wl->wl_log_dev_bshift = log_dev_bshift;
374 wl->wl_fs_dev_bshift = fs_dev_bshift;
375
376 wl->wl_flush = flushfn;
377 wl->wl_flush_abort = flushabortfn;
378
379 /* Reserve two log device blocks for the commit headers */
380 wl->wl_circ_off = 2<<wl->wl_log_dev_bshift;
381 wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off);
382 /* truncate the log usage to a multiple of log_dev_bshift */
383 wl->wl_circ_size >>= wl->wl_log_dev_bshift;
384 wl->wl_circ_size <<= wl->wl_log_dev_bshift;
385
386 /*
387 * wl_bufbytes_max limits the size of the in memory transaction space.
388 * - Since buffers are allocated and accounted for in units of
389 * PAGE_SIZE it is required to be a multiple of PAGE_SIZE
390 * (i.e. 1<<PAGE_SHIFT)
391 * - Since the log device has to be written in units of
 *   1<<wl_log_dev_bshift it is required to be a multiple of
393 * 1<<wl_log_dev_bshift.
394 * - Since filesystem will provide data in units of 1<<wl_fs_dev_bshift,
395 * it is convenient to be a multiple of 1<<wl_fs_dev_bshift.
396 * Therefore it must be multiple of the least common multiple of those
397 * three quantities. Fortunately, all of those quantities are
398 * guaranteed to be a power of two, and the least common multiple of
399 * a set of numbers which are all powers of two is simply the maximum
400 * of those numbers. Finally, the maximum logarithm of a power of two
401 * is the same as the log of the maximum power of two. So we can do
402 * the following operations to size wl_bufbytes_max:
403 */
404
405 /* XXX fix actual number of pages reserved per filesystem. */
406 wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2);
407
/* Round wl_bufbytes_max down to the largest power-of-two constraint */
409 wl->wl_bufbytes_max >>= PAGE_SHIFT;
410 wl->wl_bufbytes_max <<= PAGE_SHIFT;
411 wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift;
412 wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift;
413 wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift;
414 wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift;
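/*
 * Worked example (illustrative numbers only): with PAGE_SHIFT = 12 and
 * both bshifts <= 12, the shifts above just round wl_bufbytes_max down
 * to a PAGE_SIZE boundary, e.g. 1234567 -> 1232896 (301 pages); the
 * smaller power-of-two constraints then leave that value unchanged.
 */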
415
416 /* XXX maybe use filesystem fragment size instead of 1024 */
417 /* XXX fix actual number of buffers reserved per filesystem. */
418 wl->wl_bufcount_max = (nbuf / 2) * 1024;
419
420 /* XXX tie this into resource estimation */
421 wl->wl_dealloclim = 2 * btodb(wl->wl_bufbytes_max);
422
423 #if WAPBL_UVM_ALLOC
424 wl->wl_deallocblks = (void *) uvm_km_zalloc(kernel_map,
425 round_page(sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim));
426 KASSERT(wl->wl_deallocblks != NULL);
427 wl->wl_dealloclens = (void *) uvm_km_zalloc(kernel_map,
428 round_page(sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim));
429 KASSERT(wl->wl_dealloclens != NULL);
430 #else
431 wl->wl_deallocblks = wapbl_malloc(sizeof(*wl->wl_deallocblks) *
432 wl->wl_dealloclim);
433 wl->wl_dealloclens = wapbl_malloc(sizeof(*wl->wl_dealloclens) *
434 wl->wl_dealloclim);
435 #endif
436
437 wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);
438
439 /* Initialize the commit header */
440 {
441 struct wapbl_wc_header *wc;
442 size_t len = 1 << wl->wl_log_dev_bshift;
443 wc = wapbl_calloc(1, len);
444 wc->wc_type = WAPBL_WC_HEADER;
445 wc->wc_len = len;
446 wc->wc_circ_off = wl->wl_circ_off;
447 wc->wc_circ_size = wl->wl_circ_size;
448 /* XXX wc->wc_fsid */
449 wc->wc_log_dev_bshift = wl->wl_log_dev_bshift;
450 wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift;
451 wl->wl_wc_header = wc;
452 wl->wl_wc_scratch = wapbl_malloc(len);
453 }
454
455 /*
456 * if there was an existing set of unlinked but
457 * allocated inodes, preserve it in the new
458 * log.
459 */
460 if (wr && wr->wr_inodescnt) {
461 error = wapbl_start_flush_inodes(wl, wr);
462 if (error)
463 goto errout;
464 }
465
466 error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail);
467 if (error) {
468 goto errout;
469 }
470
471 *wlp = wl;
472 #if defined(WAPBL_DEBUG)
473 wapbl_debug_wl = wl;
474 #endif
475
476 return 0;
477 errout:
478 wapbl_discard(wl);
479 wapbl_free(wl->wl_wc_scratch);
480 wapbl_free(wl->wl_wc_header);
481 #if WAPBL_UVM_ALLOC
uvm_km_free_wakeup(kernel_map, (vaddr_t) wl->wl_deallocblks,
    round_page(sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim));
uvm_km_free_wakeup(kernel_map, (vaddr_t) wl->wl_dealloclens,
    round_page(sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim));
488 #else
489 wapbl_free(wl->wl_deallocblks);
490 wapbl_free(wl->wl_dealloclens);
491 #endif
492 wapbl_inodetrk_free(wl);
493 wapbl_free(wl);
494
495 return error;
496 }
497
498 /*
499 * Like wapbl_flush, only discards the transaction
500 * completely
501 */
502
503 void
504 wapbl_discard(struct wapbl *wl)
505 {
506 struct wapbl_entry *we;
507 struct buf *bp;
508 int i;
509
510 /*
511 * XXX we may consider using upgrade here
512 * if we want to call flush from inside a transaction
513 */
514 rw_enter(&wl->wl_rwlock, RW_WRITER);
515 wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
516 wl->wl_dealloccnt);
517
518 #ifdef WAPBL_DEBUG_PRINT
519 {
520 struct wapbl_entry *we;
521 pid_t pid = -1;
522 lwpid_t lid = -1;
523 if (curproc)
524 pid = curproc->p_pid;
525 if (curlwp)
526 lid = curlwp->l_lid;
527 #ifdef WAPBL_DEBUG_BUFBYTES
528 WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
529 ("wapbl_discard: thread %d.%d discarding "
530 "transaction\n"
531 "\tbufcount=%zu bufbytes=%zu bcount=%zu "
532 "deallocs=%d inodes=%d\n"
533 "\terrcnt = %u, reclaimable=%zu reserved=%zu "
534 "unsynced=%zu\n",
535 pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
536 wl->wl_bcount, wl->wl_dealloccnt,
537 wl->wl_inohashcnt, wl->wl_error_count,
538 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
539 wl->wl_unsynced_bufbytes));
540 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
541 WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
542 ("\tentry: bufcount = %zu, reclaimable = %zu, "
543 "error = %d, unsynced = %zu\n",
544 we->we_bufcount, we->we_reclaimable_bytes,
545 we->we_error, we->we_unsynced_bufbytes));
546 }
547 #else /* !WAPBL_DEBUG_BUFBYTES */
548 WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
549 ("wapbl_discard: thread %d.%d discarding transaction\n"
550 "\tbufcount=%zu bufbytes=%zu bcount=%zu "
551 "deallocs=%d inodes=%d\n"
552 "\terrcnt = %u, reclaimable=%zu reserved=%zu\n",
553 pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
554 wl->wl_bcount, wl->wl_dealloccnt,
555 wl->wl_inohashcnt, wl->wl_error_count,
556 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes));
557 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
558 WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
559 ("\tentry: bufcount = %zu, reclaimable = %zu, "
560 "error = %d\n",
561 we->we_bufcount, we->we_reclaimable_bytes,
562 we->we_error));
563 }
564 #endif /* !WAPBL_DEBUG_BUFBYTES */
565 }
566 #endif /* WAPBL_DEBUG_PRINT */
567
568 for (i = 0; i <= wl->wl_inohashmask; i++) {
569 struct wapbl_ino_head *wih;
570 struct wapbl_ino *wi;
571
572 wih = &wl->wl_inohash[i];
573 while ((wi = LIST_FIRST(wih)) != NULL) {
574 LIST_REMOVE(wi, wi_hash);
575 pool_put(&wapbl_ino_pool, wi);
576 KASSERT(wl->wl_inohashcnt > 0);
577 wl->wl_inohashcnt--;
578 }
579 }
580
581 /*
582 * clean buffer list
583 */
584 mutex_enter(&bufcache_lock);
585 mutex_enter(&wl->wl_mtx);
586 while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
587 if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) {
588 /*
589 * The buffer will be unlocked and
590 * removed from the transaction in brelse
591 */
592 mutex_exit(&wl->wl_mtx);
593 brelsel(bp, 0);
594 mutex_enter(&wl->wl_mtx);
595 }
596 }
597 mutex_exit(&wl->wl_mtx);
598 mutex_exit(&bufcache_lock);
599
/*
 * Remove references to this wl from wl_entries; free any which
 * no longer have buffers, others will be freed in wapbl_biodone
 * when they no longer have any buffers.
 */
605 while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) {
606 SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
607 /* XXX should we be accumulating wl_error_count
608 * and increasing reclaimable bytes ? */
609 we->we_wapbl = NULL;
610 if (we->we_bufcount == 0) {
611 #ifdef WAPBL_DEBUG_BUFBYTES
612 KASSERT(we->we_unsynced_bufbytes == 0);
613 #endif
614 wapbl_free(we);
615 }
616 }
617
618 /* Discard list of deallocs */
619 wl->wl_dealloccnt = 0;
620 /* XXX should we clear wl_reserved_bytes? */
621
622 KASSERT(wl->wl_bufbytes == 0);
623 KASSERT(wl->wl_bcount == 0);
624 KASSERT(wl->wl_bufcount == 0);
625 KASSERT(LIST_EMPTY(&wl->wl_bufs));
626 KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
627 KASSERT(wl->wl_inohashcnt == 0);
628
629 rw_exit(&wl->wl_rwlock);
630 }
631
632 int
633 wapbl_stop(struct wapbl *wl, int force)
634 {
635 struct vnode *vp;
636 int error;
637
638 WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n"));
639 error = wapbl_flush(wl, 1);
640 if (error) {
641 if (force)
642 wapbl_discard(wl);
643 else
644 return error;
645 }
646
647 /* Unlinked inodes persist after a flush */
648 if (wl->wl_inohashcnt) {
649 if (force) {
650 wapbl_discard(wl);
651 } else {
652 return EBUSY;
653 }
654 }
655
656 KASSERT(wl->wl_bufbytes == 0);
657 KASSERT(wl->wl_bcount == 0);
658 KASSERT(wl->wl_bufcount == 0);
659 KASSERT(LIST_EMPTY(&wl->wl_bufs));
660 KASSERT(wl->wl_dealloccnt == 0);
661 KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
662 KASSERT(wl->wl_inohashcnt == 0);
663
664 vp = wl->wl_logvp;
665
666 wapbl_free(wl->wl_wc_scratch);
667 wapbl_free(wl->wl_wc_header);
668 #if WAPBL_UVM_ALLOC
uvm_km_free_wakeup(kernel_map, (vaddr_t) wl->wl_deallocblks,
    round_page(sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim));
uvm_km_free_wakeup(kernel_map, (vaddr_t) wl->wl_dealloclens,
    round_page(sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim));
675 #else
676 wapbl_free(wl->wl_deallocblks);
677 wapbl_free(wl->wl_dealloclens);
678 #endif
679 wapbl_inodetrk_free(wl);
680
681 cv_destroy(&wl->wl_reclaimable_cv);
682 mutex_destroy(&wl->wl_mtx);
683 rw_destroy(&wl->wl_rwlock);
684 wapbl_free(wl);
685
686 return 0;
687 }
688
689 static int
690 wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
691 {
692 struct pstats *pstats = curlwp->l_proc->p_stats;
693 struct buf *bp;
694 int error;
695
696 KASSERT((flags & ~(B_WRITE | B_READ)) == 0);
697 KASSERT(devvp->v_type == VBLK);
698
699 if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
700 mutex_enter(&devvp->v_interlock);
701 devvp->v_numoutput++;
702 mutex_exit(&devvp->v_interlock);
703 pstats->p_ru.ru_oublock++;
704 } else {
705 pstats->p_ru.ru_inblock++;
706 }
707
708 bp = getiobuf(devvp, true);
709 bp->b_flags = flags;
710 bp->b_cflags = BC_BUSY; /* silly & dubious */
711 bp->b_dev = devvp->v_rdev;
712 bp->b_data = data;
713 bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
714 bp->b_blkno = pbn;
715
716 WAPBL_PRINTF(WAPBL_PRINT_IO,
717 ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%x\n",
718 BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount,
719 bp->b_blkno, bp->b_dev));
720
721 VOP_STRATEGY(devvp, bp);
722
723 error = biowait(bp);
724 putiobuf(bp);
725
726 if (error) {
727 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
728 ("wapbl_doio: %s %zu bytes at block %" PRId64
729 " on dev 0x%x failed with error %d\n",
730 (((flags & (B_WRITE | B_READ)) == B_WRITE) ?
731 "write" : "read"),
732 len, pbn, devvp->v_rdev, error));
733 }
734
735 return error;
736 }
737
738 int
739 wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
740 {
741
742 return wapbl_doio(data, len, devvp, pbn, B_WRITE);
743 }
744
745 int
746 wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
747 {
748
749 return wapbl_doio(data, len, devvp, pbn, B_READ);
750 }
751
/*
 * Off is a byte offset; returns the new offset for the next write.
 * Handles log wraparound.
 */
756 static int
757 wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp)
758 {
759 size_t slen;
760 off_t off = *offp;
761 int error;
762
763 KDASSERT(((len >> wl->wl_log_dev_bshift) <<
764 wl->wl_log_dev_bshift) == len);
765
766 if (off < wl->wl_circ_off)
767 off = wl->wl_circ_off;
768 slen = wl->wl_circ_off + wl->wl_circ_size - off;
769 if (slen < len) {
770 error = wapbl_write(data, slen, wl->wl_devvp,
771 wl->wl_logpbn + (off >> wl->wl_log_dev_bshift));
772 if (error)
773 return error;
774 data = (uint8_t *)data + slen;
775 len -= slen;
776 off = wl->wl_circ_off;
777 }
778 error = wapbl_write(data, len, wl->wl_devvp,
779 wl->wl_logpbn + (off >> wl->wl_log_dev_bshift));
780 if (error)
781 return error;
782 off += len;
783 if (off >= wl->wl_circ_off + wl->wl_circ_size)
784 off = wl->wl_circ_off;
785 *offp = off;
786 return 0;
787 }
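
/*
 * Worked example (illustrative numbers only): with wl_circ_off = 1024,
 * wl_circ_size = 8192 and a 2048-byte write starting at off = 8704,
 * only slen = 1024 + 8192 - 8704 = 512 bytes fit before the end of the
 * circular area, so the write is split: 512 bytes at offset 8704, then
 * the remaining 1536 bytes at wl_circ_off, leaving *offp = 2560.
 */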
788
789 /****************************************************************/
790
791 int
792 wapbl_begin(struct wapbl *wl, const char *file, int line)
793 {
794 int doflush;
795 unsigned lockcount;
796 krw_t op;
797
798 KDASSERT(wl);
799
800 /*
801 * XXX: The original code calls for the use of a RW_READER lock
802 * here, but it turns out there are performance issues with high
803 * metadata-rate workloads (e.g. multiple simultaneous tar
804 * extractions). For now, we force the lock to be RW_WRITER,
805 * since that currently has the best performance characteristics
806 * (even for a single tar-file extraction).
807 *
808 */
809 #define WAPBL_DEBUG_SERIALIZE 1
810
811 #ifdef WAPBL_DEBUG_SERIALIZE
812 op = RW_WRITER;
813 #else
814 op = RW_READER;
815 #endif
816
817 /*
818 * XXX this needs to be made much more sophisticated.
819 * perhaps each wapbl_begin could reserve a specified
820 * number of buffers and bytes.
821 */
822 mutex_enter(&wl->wl_mtx);
823 lockcount = wl->wl_lock_count;
824 doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) >
825 wl->wl_bufbytes_max / 2) ||
826 ((wl->wl_bufcount + (lockcount * 10)) >
827 wl->wl_bufcount_max / 2) ||
828 (wapbl_transaction_len(wl) > wl->wl_circ_size / 2);
829 mutex_exit(&wl->wl_mtx);
830
831 if (doflush) {
832 WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
833 ("force flush lockcnt=%d bufbytes=%zu "
834 "(max=%zu) bufcount=%zu (max=%zu)\n",
835 lockcount, wl->wl_bufbytes,
836 wl->wl_bufbytes_max, wl->wl_bufcount,
837 wl->wl_bufcount_max));
838 }
839
840 if (doflush) {
841 int error = wapbl_flush(wl, 0);
842 if (error)
843 return error;
844 }
845
846 rw_enter(&wl->wl_rwlock, op);
847 mutex_enter(&wl->wl_mtx);
848 wl->wl_lock_count++;
849 mutex_exit(&wl->wl_mtx);
850
851 #if defined(WAPBL_DEBUG_PRINT) && defined(WAPBL_DEBUG_SERIALIZE)
852 WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
853 ("wapbl_begin thread %d.%d with bufcount=%zu "
854 "bufbytes=%zu bcount=%zu at %s:%d\n",
855 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
856 wl->wl_bufbytes, wl->wl_bcount, file, line));
857 #endif
858
859 return 0;
860 }
861
862 void
863 wapbl_end(struct wapbl *wl)
864 {
865
866 #if defined(WAPBL_DEBUG_PRINT) && defined(WAPBL_DEBUG_SERIALIZE)
867 WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
868 ("wapbl_end thread %d.%d with bufcount=%zu "
869 "bufbytes=%zu bcount=%zu\n",
870 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
871 wl->wl_bufbytes, wl->wl_bcount));
872 #endif
873
874 mutex_enter(&wl->wl_mtx);
875 KASSERT(wl->wl_lock_count > 0);
876 wl->wl_lock_count--;
877 mutex_exit(&wl->wl_mtx);
878
879 rw_exit(&wl->wl_rwlock);
880 }
881
882 void
883 wapbl_add_buf(struct wapbl *wl, struct buf * bp)
884 {
885
886 KASSERT(bp->b_cflags & BC_BUSY);
887 KASSERT(bp->b_vp);
888
889 wapbl_jlock_assert(wl);
890
891 #if 0
892 /*
893 * XXX this might be an issue for swapfiles.
894 * see uvm_swap.c:1702
895 *
896 * XXX2 why require it then? leap of semantics?
897 */
898 KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
899 #endif
900
901 mutex_enter(&wl->wl_mtx);
902 if (bp->b_flags & B_LOCKED) {
903 LIST_REMOVE(bp, b_wapbllist);
904 WAPBL_PRINTF(WAPBL_PRINT_BUFFER2,
905 ("wapbl_add_buf thread %d.%d re-adding buf %p "
906 "with %d bytes %d bcount\n",
907 curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
908 bp->b_bcount));
909 } else {
/* unlocked but dirty buffers shouldn't exist */
911 KASSERT(!(bp->b_oflags & BO_DELWRI));
912 wl->wl_bufbytes += bp->b_bufsize;
913 wl->wl_bcount += bp->b_bcount;
914 wl->wl_bufcount++;
915 WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
916 ("wapbl_add_buf thread %d.%d adding buf %p "
917 "with %d bytes %d bcount\n",
918 curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
919 bp->b_bcount));
920 }
921 LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist);
922 mutex_exit(&wl->wl_mtx);
923
924 bp->b_flags |= B_LOCKED;
925 }
926
927 static void
928 wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp)
929 {
930
931 KASSERT(mutex_owned(&wl->wl_mtx));
932 KASSERT(bp->b_cflags & BC_BUSY);
933 wapbl_jlock_assert(wl);
934
935 #if 0
936 /*
937 * XXX this might be an issue for swapfiles.
938 * see uvm_swap.c:1725
939 *
940 * XXXdeux: see above
941 */
942 KASSERT((bp->b_flags & BC_NOCACHE) == 0);
943 #endif
944 KASSERT(bp->b_flags & B_LOCKED);
945
946 WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
947 ("wapbl_remove_buf thread %d.%d removing buf %p with "
948 "%d bytes %d bcount\n",
949 curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount));
950
951 KASSERT(wl->wl_bufbytes >= bp->b_bufsize);
952 wl->wl_bufbytes -= bp->b_bufsize;
953 KASSERT(wl->wl_bcount >= bp->b_bcount);
954 wl->wl_bcount -= bp->b_bcount;
955 KASSERT(wl->wl_bufcount > 0);
956 wl->wl_bufcount--;
957 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
958 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
959 LIST_REMOVE(bp, b_wapbllist);
960
961 bp->b_flags &= ~B_LOCKED;
962 }
963
964 /* called from brelsel() in vfs_bio among other places */
965 void
966 wapbl_remove_buf(struct wapbl * wl, struct buf *bp)
967 {
968
969 mutex_enter(&wl->wl_mtx);
970 wapbl_remove_buf_locked(wl, bp);
971 mutex_exit(&wl->wl_mtx);
972 }
973
974 void
975 wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt)
976 {
977
978 KASSERT(bp->b_cflags & BC_BUSY);
979
980 /*
981 * XXX: why does this depend on B_LOCKED? otherwise the buf
982 * is not for a transaction? if so, why is this called in the
983 * first place?
984 */
985 if (bp->b_flags & B_LOCKED) {
986 mutex_enter(&wl->wl_mtx);
987 wl->wl_bufbytes += bp->b_bufsize - oldsz;
988 wl->wl_bcount += bp->b_bcount - oldcnt;
989 mutex_exit(&wl->wl_mtx);
990 }
991 }
992
993 #endif /* _KERNEL */
994
995 /****************************************************************/
996 /* Some utility inlines */
997
/* This is used to advance the pointer at old to the new value at old+delta */
999 static __inline off_t
1000 wapbl_advance(size_t size, size_t off, off_t old, size_t delta)
1001 {
1002 off_t new;
1003
1004 /* Define acceptable ranges for inputs. */
1005 KASSERT(delta <= size);
1006 KASSERT((old == 0) || (old >= off));
1007 KASSERT(old < (size + off));
1008
1009 if ((old == 0) && (delta != 0))
1010 new = off + delta;
1011 else if ((old + delta) < (size + off))
1012 new = old + delta;
1013 else
1014 new = (old + delta) - size;
1015
1016 /* Note some interesting axioms */
1017 KASSERT((delta != 0) || (new == old));
1018 KASSERT((delta == 0) || (new != 0));
1019 KASSERT((delta != (size)) || (new == old));
1020
1021 /* Define acceptable ranges for output. */
1022 KASSERT((new == 0) || (new >= off));
1023 KASSERT(new < (size + off));
1024 return new;
1025 }
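
/*
 * Worked example (illustrative numbers only): with size = 8192 and
 * off = 1024 the valid byte range is [1024, 9216).  Advancing from the
 * empty state, wapbl_advance(8192, 1024, 0, 512) returns 1024 + 512 =
 * 1536; advancing 512 from old = 9000 would run past the end
 * (9000 + 512 >= 9216), so it wraps to (9000 + 512) - 8192 = 1320.
 */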
1026
1027 static __inline size_t
1028 wapbl_space_used(size_t avail, off_t head, off_t tail)
1029 {
1030
1031 if (tail == 0) {
1032 KASSERT(head == 0);
1033 return 0;
1034 }
1035 return ((head + (avail - 1) - tail) % avail) + 1;
1036 }
1037
1038 static __inline size_t
1039 wapbl_space_free(size_t avail, off_t head, off_t tail)
1040 {
1041
1042 return avail - wapbl_space_used(avail, head, tail);
1043 }
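
/*
 * Worked example (illustrative numbers only): with avail = 8192,
 * head = 2048 and tail = 1024, wapbl_space_used() computes
 * ((2048 + 8191 - 1024) % 8192) + 1 = 1024, the bytes from tail up to
 * head, and wapbl_space_free() is then 8192 - 1024 = 7168.  The modulo
 * form also covers the wrapped case head < tail, and head == tail
 * (nonzero) reports all avail bytes used, matching the
 * "head == tail != 0 means log is full" convention.
 */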
1044
1045 static __inline void
1046 wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp,
1047 off_t *tailp)
1048 {
1049 off_t head = *headp;
1050 off_t tail = *tailp;
1051
1052 KASSERT(delta <= wapbl_space_free(size, head, tail));
1053 head = wapbl_advance(size, off, head, delta);
1054 if ((tail == 0) && (head != 0))
1055 tail = off;
1056 *headp = head;
1057 *tailp = tail;
1058 }
1059
1060 static __inline void
1061 wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp,
1062 off_t *tailp)
1063 {
1064 off_t head = *headp;
1065 off_t tail = *tailp;
1066
1067 KASSERT(delta <= wapbl_space_used(size, head, tail));
1068 tail = wapbl_advance(size, off, tail, delta);
1069 if (head == tail) {
1070 head = tail = 0;
1071 }
1072 *headp = head;
1073 *tailp = tail;
1074 }
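
/*
 * Note: advancing the tail all the way to the head (delta equal to
 * wapbl_space_used()) collapses the pair to the 0/0 "empty" encoding
 * above, while wapbl_advance_head() performs the inverse transition,
 * moving the tail from 0 to off on the first write into an empty log.
 */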
1075
1076 #ifdef _KERNEL
1077
1078 /****************************************************************/
1079
1080 /*
1081 * Remove transactions whose buffers are completely flushed to disk.
1082 * Will block until at least minfree space is available.
1083 * only intended to be called from inside wapbl_flush and therefore
1084 * does not protect against commit races with itself or with flush.
1085 */
1086 static int
1087 wapbl_truncate(struct wapbl *wl, size_t minfree, int waitonly)
1088 {
1089 size_t delta;
1090 size_t avail;
1091 off_t head;
1092 off_t tail;
1093 int error = 0;
1094
1095 KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes));
1096 KASSERT(rw_write_held(&wl->wl_rwlock));
1097
1098 mutex_enter(&wl->wl_mtx);
1099
1100 /*
1101 * First check to see if we have to do a commit
1102 * at all.
1103 */
1104 avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail);
1105 if (minfree < avail) {
1106 mutex_exit(&wl->wl_mtx);
1107 return 0;
1108 }
1109 minfree -= avail;
1110 while ((wl->wl_error_count == 0) &&
1111 (wl->wl_reclaimable_bytes < minfree)) {
1112 WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
1113 ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd "
1114 "minfree=%zd\n",
1115 &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes,
1116 minfree));
1117
1118 cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx);
1119 }
1120 if (wl->wl_reclaimable_bytes < minfree) {
1121 KASSERT(wl->wl_error_count);
1122 /* XXX maybe get actual error from buffer instead someday? */
1123 error = EIO;
1124 }
1125 head = wl->wl_head;
1126 tail = wl->wl_tail;
1127 delta = wl->wl_reclaimable_bytes;
1128
/* If all of the entries are flushed, then be sure to keep
 * the reserved bytes reserved.  Watch out for discarded transactions,
 * which could leave more bytes reserved than are reclaimable.
 */
1133 if (SIMPLEQ_EMPTY(&wl->wl_entries) &&
1134 (delta >= wl->wl_reserved_bytes)) {
1135 delta -= wl->wl_reserved_bytes;
1136 }
1137 wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head,
1138 &tail);
1139 KDASSERT(wl->wl_reserved_bytes <=
1140 wapbl_space_used(wl->wl_circ_size, head, tail));
1141 mutex_exit(&wl->wl_mtx);
1142
1143 if (error)
1144 return error;
1145
1146 if (waitonly)
1147 return 0;
1148
1149 /*
1150 * This is where head, tail and delta are unprotected
1151 * from races against itself or flush. This is ok since
1152 * we only call this routine from inside flush itself.
1153 *
1154 * XXX: how can it race against itself when accessed only
1155 * from behind the write-locked rwlock?
1156 */
1157 error = wapbl_write_commit(wl, head, tail);
1158 if (error)
1159 return error;
1160
1161 wl->wl_head = head;
1162 wl->wl_tail = tail;
1163
1164 mutex_enter(&wl->wl_mtx);
1165 KASSERT(wl->wl_reclaimable_bytes >= delta);
1166 wl->wl_reclaimable_bytes -= delta;
1167 mutex_exit(&wl->wl_mtx);
1168 WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
1169 ("wapbl_truncate thread %d.%d truncating %zu bytes\n",
1170 curproc->p_pid, curlwp->l_lid, delta));
1171
1172 return 0;
1173 }
1174
1175 /****************************************************************/
1176
1177 void
1178 wapbl_biodone(struct buf *bp)
1179 {
1180 struct wapbl_entry *we = bp->b_private;
1181 struct wapbl *wl = we->we_wapbl;
1182
/*
 * Handle possible flushing of buffers after the log has been
 * decommissioned.
 */
1187 if (!wl) {
1188 KASSERT(we->we_bufcount > 0);
1189 we->we_bufcount--;
1190 #ifdef WAPBL_DEBUG_BUFBYTES
1191 KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize);
1192 we->we_unsynced_bufbytes -= bp->b_bufsize;
1193 #endif
1194
1195 if (we->we_bufcount == 0) {
1196 #ifdef WAPBL_DEBUG_BUFBYTES
1197 KASSERT(we->we_unsynced_bufbytes == 0);
1198 #endif
1199 wapbl_free(we);
1200 }
1201
1202 brelse(bp, 0);
1203 return;
1204 }
1205
1206 #ifdef ohbother
1207 KDASSERT(bp->b_flags & B_DONE);
1208 KDASSERT(!(bp->b_flags & B_DELWRI));
1209 KDASSERT(bp->b_flags & B_ASYNC);
1210 KDASSERT(bp->b_flags & B_BUSY);
1211 KDASSERT(!(bp->b_flags & B_LOCKED));
1212 KDASSERT(!(bp->b_flags & B_READ));
1213 KDASSERT(!(bp->b_flags & B_INVAL));
1214 KDASSERT(!(bp->b_flags & B_NOCACHE));
1215 #endif
1216
1217 if (bp->b_error) {
1218 #ifdef notyet /* Can't currently handle possible dirty buffer reuse */
1219 XXXpooka: interfaces not fully updated
1220 Note: this was not enabled in the original patch
1221 against netbsd4 either. I don't know if comment
1222 above is true or not.
1223
1224 /*
1225 * If an error occurs, report the error and leave the
1226 * buffer as a delayed write on the LRU queue.
1227 * restarting the write would likely result in
1228 * an error spinloop, so let it be done harmlessly
1229 * by the syncer.
1230 */
1231 bp->b_flags &= ~(B_DONE);
1232 simple_unlock(&bp->b_interlock);
1233
1234 if (we->we_error == 0) {
1235 mutex_enter(&wl->wl_mtx);
1236 wl->wl_error_count++;
1237 mutex_exit(&wl->wl_mtx);
1238 cv_broadcast(&wl->wl_reclaimable_cv);
1239 }
1240 we->we_error = bp->b_error;
1241 bp->b_error = 0;
1242 brelse(bp);
1243 return;
1244 #else
1245 /* For now, just mark the log permanently errored out */
1246
1247 mutex_enter(&wl->wl_mtx);
1248 if (wl->wl_error_count == 0) {
1249 wl->wl_error_count++;
1250 cv_broadcast(&wl->wl_reclaimable_cv);
1251 }
1252 mutex_exit(&wl->wl_mtx);
1253 #endif
1254 }
1255
1256 mutex_enter(&wl->wl_mtx);
1257
1258 KASSERT(we->we_bufcount > 0);
1259 we->we_bufcount--;
1260 #ifdef WAPBL_DEBUG_BUFBYTES
1261 KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize);
1262 we->we_unsynced_bufbytes -= bp->b_bufsize;
1263 KASSERT(wl->wl_unsynced_bufbytes >= bp->b_bufsize);
1264 wl->wl_unsynced_bufbytes -= bp->b_bufsize;
1265 #endif
1266
1267 /*
1268 * If the current transaction can be reclaimed, start
1269 * at the beginning and reclaim any consecutive reclaimable
1270 * transactions. If we successfully reclaim anything,
1271 * then wakeup anyone waiting for the reclaim.
1272 */
1273 if (we->we_bufcount == 0) {
1274 size_t delta = 0;
1275 int errcnt = 0;
1276 #ifdef WAPBL_DEBUG_BUFBYTES
1277 KDASSERT(we->we_unsynced_bufbytes == 0);
1278 #endif
/*
 * clear any posted error, since the buffer it came from
 * has been successfully flushed by now
 */
1283 while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) &&
1284 (we->we_bufcount == 0)) {
1285 delta += we->we_reclaimable_bytes;
1286 if (we->we_error)
1287 errcnt++;
1288 SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
1289 wapbl_free(we);
1290 }
1291
1292 if (delta) {
1293 wl->wl_reclaimable_bytes += delta;
1294 KASSERT(wl->wl_error_count >= errcnt);
1295 wl->wl_error_count -= errcnt;
1296 cv_broadcast(&wl->wl_reclaimable_cv);
1297 }
1298 }
1299
1300 mutex_exit(&wl->wl_mtx);
1301 brelse(bp, 0);
1302 }
1303
1304 /*
1305 * Write transactions to disk + start I/O for contents
1306 */
1307 int
1308 wapbl_flush(struct wapbl *wl, int waitfor)
1309 {
1310 struct buf *bp;
1311 struct wapbl_entry *we;
1312 off_t off;
1313 off_t head;
1314 off_t tail;
1315 size_t delta = 0;
1316 size_t flushsize;
1317 size_t reserved;
1318 int error = 0;
1319
/*
 * Do a quick check to see if a full flush can be skipped.
 * This assumes that the flush callback does not need to be called
 * unless there are other outstanding bufs.
 */
1325 if (!waitfor) {
1326 size_t nbufs;
1327 mutex_enter(&wl->wl_mtx); /* XXX need mutex here to
1328 protect the KASSERTS */
1329 nbufs = wl->wl_bufcount;
1330 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
1331 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
1332 mutex_exit(&wl->wl_mtx);
1333 if (nbufs == 0)
1334 return 0;
1335 }
1336
1337 /*
1338 * XXX we may consider using LK_UPGRADE here
1339 * if we want to call flush from inside a transaction
1340 */
1341 rw_enter(&wl->wl_rwlock, RW_WRITER);
1342 wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
1343 wl->wl_dealloccnt);
1344
1345 /*
1346 * Now that we are fully locked and flushed,
1347 * do another check for nothing to do.
1348 */
1349 if (wl->wl_bufcount == 0) {
1350 goto out;
1351 }
1352
1353 #if 0
1354 WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1355 ("wapbl_flush thread %d.%d flushing entries with "
1356 "bufcount=%zu bufbytes=%zu\n",
1357 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
1358 wl->wl_bufbytes));
1359 #endif
1360
1361 /* Calculate amount of space needed to flush */
1362 flushsize = wapbl_transaction_len(wl);
1363
1364 if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
1365 /*
1366 * XXX this could be handled more gracefully, perhaps place
1367 * only a partial transaction in the log and allow the
1368 * remaining to flush without the protection of the journal.
1369 */
1370 panic("wapbl_flush: current transaction too big to flush\n");
1371 }
1372
1373 error = wapbl_truncate(wl, flushsize, 0);
1374 if (error)
1375 goto out2;
1376
1377 off = wl->wl_head;
1378 KASSERT((off == 0) || ((off >= wl->wl_circ_off) &&
1379 (off < wl->wl_circ_off + wl->wl_circ_size)));
1380 error = wapbl_write_blocks(wl, &off);
1381 if (error)
1382 goto out2;
1383 error = wapbl_write_revocations(wl, &off);
1384 if (error)
1385 goto out2;
1386 error = wapbl_write_inodes(wl, &off);
1387 if (error)
1388 goto out2;
1389
1390 reserved = 0;
1391 if (wl->wl_inohashcnt)
1392 reserved = wapbl_transaction_inodes_len(wl);
1393
1394 head = wl->wl_head;
1395 tail = wl->wl_tail;
1396
1397 wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize,
1398 &head, &tail);
1399 #ifdef WAPBL_DEBUG
1400 if (head != off) {
1401 panic("lost head! head=%"PRIdMAX" tail=%" PRIdMAX
1402 " off=%"PRIdMAX" flush=%zu\n",
1403 (intmax_t)head, (intmax_t)tail, (intmax_t)off,
1404 flushsize);
1405 }
1406 #else
1407 KASSERT(head == off);
1408 #endif
1409
1410 /* Opportunistically move the tail forward if we can */
1411 if (!wapbl_lazy_truncate) {
1412 mutex_enter(&wl->wl_mtx);
1413 delta = wl->wl_reclaimable_bytes;
1414 mutex_exit(&wl->wl_mtx);
1415 wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta,
1416 &head, &tail);
1417 }
1418
1419 error = wapbl_write_commit(wl, head, tail);
1420 if (error)
1421 goto out2;
1422
1423 /* poolme? or kmemme? */
1424 we = wapbl_calloc(1, sizeof(*we));
1425
1426 #ifdef WAPBL_DEBUG_BUFBYTES
1427 WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1428 ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
1429 " unsynced=%zu"
1430 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
1431 "inodes=%d\n",
1432 curproc->p_pid, curlwp->l_lid, flushsize, delta,
1433 wapbl_space_used(wl->wl_circ_size, head, tail),
1434 wl->wl_unsynced_bufbytes, wl->wl_bufcount,
1435 wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt,
1436 wl->wl_inohashcnt));
1437 #else
1438 WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1439 ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
1440 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
1441 "inodes=%d\n",
1442 curproc->p_pid, curlwp->l_lid, flushsize, delta,
1443 wapbl_space_used(wl->wl_circ_size, head, tail),
1444 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
1445 wl->wl_dealloccnt, wl->wl_inohashcnt));
1446 #endif
1447
1448
1449 mutex_enter(&bufcache_lock);
1450 mutex_enter(&wl->wl_mtx);
1451
1452 wl->wl_reserved_bytes = reserved;
1453 wl->wl_head = head;
1454 wl->wl_tail = tail;
1455 KASSERT(wl->wl_reclaimable_bytes >= delta);
1456 wl->wl_reclaimable_bytes -= delta;
1457 wl->wl_dealloccnt = 0;
1458 #ifdef WAPBL_DEBUG_BUFBYTES
1459 wl->wl_unsynced_bufbytes += wl->wl_bufbytes;
1460 #endif
1461
1462 we->we_wapbl = wl;
1463 we->we_bufcount = wl->wl_bufcount;
1464 #ifdef WAPBL_DEBUG_BUFBYTES
1465 we->we_unsynced_bufbytes = wl->wl_bufbytes;
1466 #endif
1467 we->we_reclaimable_bytes = flushsize;
1468 we->we_error = 0;
1469 SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries);
1470
/*
 * This flushes bufs in the reverse order from how they were queued;
 * it shouldn't matter, but if we care we could use a TAILQ instead.
 * XXX Note they will get put on the lru queue when they flush
 * so we might actually want to change this to preserve order.
 */
1477 while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
1478 if (bbusy(bp, 0, 0, &wl->wl_mtx)) {
1479 continue;
1480 }
1481 bp->b_iodone = wapbl_biodone;
1482 bp->b_private = we;
1483 bremfree(bp);
1484 wapbl_remove_buf_locked(wl, bp);
1485 mutex_exit(&wl->wl_mtx);
1486 mutex_exit(&bufcache_lock);
1487 bawrite(bp);
1488 mutex_enter(&bufcache_lock);
1489 mutex_enter(&wl->wl_mtx);
1490 }
1491 mutex_exit(&wl->wl_mtx);
1492 mutex_exit(&bufcache_lock);
1493
1494 #if 0
1495 WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1496 ("wapbl_flush thread %d.%d done flushing entries...\n",
1497 curproc->p_pid, curlwp->l_lid));
1498 #endif
1499
1500 out:
1501
1502 /*
1503 * If the waitfor flag is set, don't return until everything is
1504 * fully flushed and the on disk log is empty.
1505 */
1506 if (waitfor) {
1507 error = wapbl_truncate(wl, wl->wl_circ_size -
1508 wl->wl_reserved_bytes, wapbl_lazy_truncate);
1509 }
1510
1511 out2:
1512 if (error) {
1513 wl->wl_flush_abort(wl->wl_mount, wl->wl_deallocblks,
1514 wl->wl_dealloclens, wl->wl_dealloccnt);
1515 }
1516
1517 #ifdef WAPBL_DEBUG_PRINT
1518 if (error) {
1519 pid_t pid = -1;
1520 lwpid_t lid = -1;
1521 if (curproc)
1522 pid = curproc->p_pid;
1523 if (curlwp)
1524 lid = curlwp->l_lid;
1525 mutex_enter(&wl->wl_mtx);
1526 #ifdef WAPBL_DEBUG_BUFBYTES
1527 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1528 ("wapbl_flush: thread %d.%d aborted flush: "
1529 "error = %d\n"
1530 "\tbufcount=%zu bufbytes=%zu bcount=%zu "
1531 "deallocs=%d inodes=%d\n"
1532 "\terrcnt = %d, reclaimable=%zu reserved=%zu "
1533 "unsynced=%zu\n",
1534 pid, lid, error, wl->wl_bufcount,
1535 wl->wl_bufbytes, wl->wl_bcount,
1536 wl->wl_dealloccnt, wl->wl_inohashcnt,
1537 wl->wl_error_count, wl->wl_reclaimable_bytes,
1538 wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes));
1539 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
1540 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1541 ("\tentry: bufcount = %zu, reclaimable = %zu, "
1542 "error = %d, unsynced = %zu\n",
1543 we->we_bufcount, we->we_reclaimable_bytes,
1544 we->we_error, we->we_unsynced_bufbytes));
1545 }
1546 #else
1547 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1548 ("wapbl_flush: thread %d.%d aborted flush: "
1549 "error = %d\n"
1550 "\tbufcount=%zu bufbytes=%zu bcount=%zu "
1551 "deallocs=%d inodes=%d\n"
1552 "\terrcnt = %d, reclaimable=%zu reserved=%zu\n",
1553 pid, lid, error, wl->wl_bufcount,
1554 wl->wl_bufbytes, wl->wl_bcount,
1555 wl->wl_dealloccnt, wl->wl_inohashcnt,
1556 wl->wl_error_count, wl->wl_reclaimable_bytes,
1557 wl->wl_reserved_bytes));
1558 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
1559 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1560 ("\tentry: bufcount = %zu, reclaimable = %zu, "
1561 "error = %d\n", we->we_bufcount,
1562 we->we_reclaimable_bytes, we->we_error));
1563 }
1564 #endif
1565 mutex_exit(&wl->wl_mtx);
1566 }
1567 #endif
1568
1569 rw_exit(&wl->wl_rwlock);
1570 return error;
1571 }
1572
1573 /****************************************************************/
1574
1575 void
1576 wapbl_jlock_assert(struct wapbl *wl)
1577 {
1578
1579 #ifdef WAPBL_DEBUG_SERIALIZE
1580 KASSERT(rw_write_held(&wl->wl_rwlock));
1581 #else
1582 KASSERT(rw_read_held(&wl->wl_rwlock) || rw_write_held(&wl->wl_rwlock));
1583 #endif
1584 }
1585
1586 void
1587 wapbl_junlock_assert(struct wapbl *wl)
1588 {
1589
1590 #ifdef WAPBL_DEBUG_SERIALIZE
1591 KASSERT(!rw_write_held(&wl->wl_rwlock));
1592 #endif
1593 }
1594
1595 /****************************************************************/
1596
1597 /* locks missing */
1598 void
1599 wapbl_print(struct wapbl *wl,
1600 int full,
1601 void (*pr)(const char *, ...))
1602 {
1603 struct buf *bp;
1604 struct wapbl_entry *we;
1605 (*pr)("wapbl %p", wl);
1606 (*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n",
1607 wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn);
1608 (*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n",
1609 wl->wl_circ_size, wl->wl_circ_off,
1610 (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail);
1611 (*pr)("fs_dev_bshift = %d, log_dev_bshift = %d\n",
1612 wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift);
1613 #ifdef WAPBL_DEBUG_BUFBYTES
1614 (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
1615 "reserved = %zu errcnt = %d unsynced = %zu\n",
1616 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
1617 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
1618 wl->wl_error_count, wl->wl_unsynced_bufbytes);
1619 #else
1620 (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
1621 "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes,
1622 wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
1623 wl->wl_error_count);
1624 #endif
1625 (*pr)("\tdealloccnt = %d, dealloclim = %d\n",
1626 wl->wl_dealloccnt, wl->wl_dealloclim);
1627 (*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n",
1628 wl->wl_inohashcnt, wl->wl_inohashmask);
1629 (*pr)("entries:\n");
1630 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
1631 #ifdef WAPBL_DEBUG_BUFBYTES
1632 (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, "
1633 "unsynced = %zu\n",
1634 we->we_bufcount, we->we_reclaimable_bytes,
1635 we->we_error, we->we_unsynced_bufbytes);
1636 #else
1637 (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n",
1638 we->we_bufcount, we->we_reclaimable_bytes, we->we_error);
1639 #endif
1640 }
1641 if (full) {
1642 int cnt = 0;
1643 (*pr)("bufs =");
1644 LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) {
1645 if (!LIST_NEXT(bp, b_wapbllist)) {
1646 (*pr)(" %p", bp);
1647 } else if ((++cnt % 6) == 0) {
1648 (*pr)(" %p,\n\t", bp);
1649 } else {
1650 (*pr)(" %p,", bp);
1651 }
1652 }
1653 (*pr)("\n");
1654
1655 (*pr)("dealloced blks = ");
1656 {
1657 int i;
1658 cnt = 0;
1659 for (i = 0; i < wl->wl_dealloccnt; i++) {
1660 (*pr)(" %"PRId64":%d,",
1661 wl->wl_deallocblks[i],
1662 wl->wl_dealloclens[i]);
1663 if ((++cnt % 4) == 0) {
1664 (*pr)("\n\t");
1665 }
1666 }
1667 }
1668 (*pr)("\n");
1669
1670 (*pr)("registered inodes = ");
1671 {
1672 int i;
1673 cnt = 0;
1674 for (i = 0; i <= wl->wl_inohashmask; i++) {
1675 struct wapbl_ino_head *wih;
1676 struct wapbl_ino *wi;
1677
1678 wih = &wl->wl_inohash[i];
1679 LIST_FOREACH(wi, wih, wi_hash) {
1680 if (wi->wi_ino == 0)
1681 continue;
1682 (*pr)(" %"PRId32"/0%06"PRIo32",",
1683 wi->wi_ino, wi->wi_mode);
1684 if ((++cnt % 4) == 0) {
1685 (*pr)("\n\t");
1686 }
1687 }
1688 }
1689 (*pr)("\n");
1690 }
1691 }
1692 }
1693
1694 #if defined(WAPBL_DEBUG) || defined(DDB)
1695 void
1696 wapbl_dump(struct wapbl *wl)
1697 {
1698 #if defined(WAPBL_DEBUG)
1699 if (!wl)
1700 wl = wapbl_debug_wl;
1701 #endif
1702 if (!wl)
1703 return;
1704 wapbl_print(wl, 1, printf);
1705 }
1706 #endif
1707
1708 /****************************************************************/
1709
1710 void
1711 wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len)
1712 {
1713
1714 wapbl_jlock_assert(wl);
1715
1716 /* XXX should eventually instead tie this into resource estimation */
1717 /* XXX this KASSERT needs locking/mutex analysis */
1718 KASSERT(wl->wl_dealloccnt < wl->wl_dealloclim);
1719 wl->wl_deallocblks[wl->wl_dealloccnt] = blk;
1720 wl->wl_dealloclens[wl->wl_dealloccnt] = len;
1721 wl->wl_dealloccnt++;
1722 WAPBL_PRINTF(WAPBL_PRINT_ALLOC,
1723 ("wapbl_register_deallocation: blk=%"PRId64" len=%d\n", blk, len));
1724 }
1725
1726 /****************************************************************/
1727
1728 static void
1729 wapbl_inodetrk_init(struct wapbl *wl, u_int size)
1730 {
1731
1732 wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask);
1733 if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) {
1734 pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0,
1735 "wapblinopl", &pool_allocator_nointr, IPL_NONE);
1736 }
1737 }
1738
1739 static void
1740 wapbl_inodetrk_free(struct wapbl *wl)
1741 {
1742
1743 /* XXX this KASSERT needs locking/mutex analysis */
1744 KASSERT(wl->wl_inohashcnt == 0);
1745 hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask);
1746 if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) {
1747 pool_destroy(&wapbl_ino_pool);
1748 }
1749 }
1750
1751 static struct wapbl_ino *
1752 wapbl_inodetrk_get(struct wapbl *wl, ino_t ino)
1753 {
1754 struct wapbl_ino_head *wih;
1755 struct wapbl_ino *wi;
1756
1757 KASSERT(mutex_owned(&wl->wl_mtx));
1758
1759 wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
1760 LIST_FOREACH(wi, wih, wi_hash) {
1761 if (ino == wi->wi_ino)
1762 return wi;
1763 }
1764 return 0;
1765 }
1766
1767 void
1768 wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode)
1769 {
1770 struct wapbl_ino_head *wih;
1771 struct wapbl_ino *wi;
1772
1773 wi = pool_get(&wapbl_ino_pool, PR_WAITOK);
1774
1775 mutex_enter(&wl->wl_mtx);
1776 if (wapbl_inodetrk_get(wl, ino) == NULL) {
1777 wi->wi_ino = ino;
1778 wi->wi_mode = mode;
1779 wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
1780 LIST_INSERT_HEAD(wih, wi, wi_hash);
1781 wl->wl_inohashcnt++;
1782 WAPBL_PRINTF(WAPBL_PRINT_INODE,
1783 ("wapbl_register_inode: ino=%"PRId64"\n", ino));
1784 mutex_exit(&wl->wl_mtx);
1785 } else {
1786 mutex_exit(&wl->wl_mtx);
1787 pool_put(&wapbl_ino_pool, wi);
1788 }
1789 }
1790
1791 void
1792 wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode)
1793 {
1794 struct wapbl_ino *wi;
1795
1796 mutex_enter(&wl->wl_mtx);
1797 wi = wapbl_inodetrk_get(wl, ino);
1798 if (wi) {
1799 WAPBL_PRINTF(WAPBL_PRINT_INODE,
1800 ("wapbl_unregister_inode: ino=%"PRId64"\n", ino));
1801 KASSERT(wl->wl_inohashcnt > 0);
1802 wl->wl_inohashcnt--;
1803 LIST_REMOVE(wi, wi_hash);
1804 mutex_exit(&wl->wl_mtx);
1805
1806 pool_put(&wapbl_ino_pool, wi);
1807 } else {
1808 mutex_exit(&wl->wl_mtx);
1809 }
1810 }
1811
1812 /****************************************************************/
1813
1814 static __inline size_t
1815 wapbl_transaction_inodes_len(struct wapbl *wl)
1816 {
1817 int blocklen = 1<<wl->wl_log_dev_bshift;
1818 int iph;
1819
/* Calculate number of inodes described in an inodelist header */
1821 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
1822 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
1823
1824 KASSERT(iph > 0);
1825
1826 return MAX(1, howmany(wl->wl_inohashcnt, iph))*blocklen;
1827 }
1828
1829
1830 /* Calculate amount of space a transaction will take on disk */
1831 static size_t
1832 wapbl_transaction_len(struct wapbl *wl)
1833 {
1834 int blocklen = 1<<wl->wl_log_dev_bshift;
1835 size_t len;
1836 int bph;
1837
1838 /* Calculate number of blocks described in a blocklist header */
1839 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
1840 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
1841
1842 KASSERT(bph > 0);
1843
1844 len = wl->wl_bcount;
1845 len += howmany(wl->wl_bufcount, bph)*blocklen;
1846 len += howmany(wl->wl_dealloccnt, bph)*blocklen;
1847 len += wapbl_transaction_inodes_len(wl);
1848
1849 return len;
1850 }
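
/*
 * Worked example (illustrative; the struct sizes are assumed for the
 * arithmetic only): with a 2048-byte log device block, a 16-byte
 * blocklist header and 8-byte wc_blocks[] entries, bph =
 * (2048 - 16) / 8 = 254, so a transaction holding 1000 buffers needs
 * howmany(1000, 254) = 4 blocklist header blocks in addition to the
 * buffer data (wl_bcount), the revocation blocks and the inodelist
 * blocks counted above.
 */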
1851
/*
 * Perform the commit operation.
 *
 * Note that the generation number increment needs to
 * be protected against racing with other invocations
 * of wapbl_write_commit.  This is ok since this routine
 * is only invoked from wapbl_flush.
 */
1860 static int
1861 wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail)
1862 {
1863 struct wapbl_wc_header *wc = wl->wl_wc_header;
1864 struct timespec ts;
1865 int error;
1866 int force = 1;
1867
1868 /* XXX Calc checksum here, instead we do this for now */
1869 error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, FWRITE, FSCRED);
1870 if (error) {
1871 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1872 ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%x "
1873 "returned %d\n", wl->wl_devvp->v_rdev, error));
1874 }
1875
1876 wc->wc_head = head;
1877 wc->wc_tail = tail;
1878 wc->wc_checksum = 0;
1879 wc->wc_version = 1;
1880 getnanotime(&ts);
wc->wc_time = ts.tv_sec;
1882 wc->wc_timensec = ts.tv_nsec;
1883
WAPBL_PRINTF(WAPBL_PRINT_WRITE,
    ("wapbl_write_commit: head = %"PRIdMAX" tail = %"PRIdMAX"\n",
    (intmax_t)head, (intmax_t)tail));
1887
1888 /*
1889 * XXX if generation will rollover, then first zero
1890 * over second commit header before trying to write both headers.
1891 */
1892
1893 error = wapbl_write(wc, wc->wc_len, wl->wl_devvp,
1894 wl->wl_logpbn + wc->wc_generation % 2);
1895 if (error)
1896 return error;
1897
1898 error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, FWRITE, FSCRED);
1899 if (error) {
1900 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1901 ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%x "
1902 "returned %d\n", wl->wl_devvp->v_rdev, error));
1903 }
1904
1905 /*
1906 * If the generation number was zero, write it out a second time.
1907 * This handles initialization and generation number rollover
1908 */
1909 if (wc->wc_generation++ == 0) {
1910 error = wapbl_write_commit(wl, head, tail);
1911 /*
1912 * This panic should be able to be removed if we do the
1913 * zero'ing mentioned above, and we are certain to roll
1914 * back generation number on failure.
1915 */
1916 if (error)
1917 panic("wapbl_write_commit: error writing duplicate "
1918 "log header: %d\n", error);
1919 }
1920 return 0;
1921 }
1922
1923 /* Write the locked buffers to the log; on success the new log offset is stored in *offp. */
1924 static int
1925 wapbl_write_blocks(struct wapbl *wl, off_t *offp)
1926 {
1927 struct wapbl_wc_blocklist *wc =
1928 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
1929 int blocklen = 1<<wl->wl_log_dev_bshift;
1930 int bph;
1931 struct buf *bp;
1932 off_t off = *offp;
1933 int error;
1934 size_t padding;
1935
1936 KASSERT(rw_write_held(&wl->wl_rwlock));
1937
1938 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
1939 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
1940
1941 bp = LIST_FIRST(&wl->wl_bufs);
1942
1943 while (bp) {
1944 int cnt;
1945 struct buf *obp = bp;
1946
1947 KASSERT(bp->b_flags & B_LOCKED);
1948
1949 wc->wc_type = WAPBL_WC_BLOCKS;
1950 wc->wc_len = blocklen;
1951 wc->wc_blkcount = 0;
1952 while (bp && (wc->wc_blkcount < bph)) {
1953 /*
1954 * Make sure all the physical block numbers are up to
1955 * date. If this is not always true on a given
1956 * filesystem, then VOP_BMAP must be called. We
1957 * could call VOP_BMAP here, or else in the filesystem
1958 * specific flush callback, although neither of those
1959 * solutions allow us to take the vnode lock. If a
1960 * filesystem requires that we must take the vnode lock
1961 * to call VOP_BMAP, then we can probably do it in
1962 * bwrite when the vnode lock should already be held
1963 * by the invoking code.
1964 */
1965 KASSERT((bp->b_vp->v_type == VBLK) ||
1966 (bp->b_blkno != bp->b_lblkno));
1967 KASSERT(bp->b_blkno > 0);
1968
1969 wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno;
1970 wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount;
1971 wc->wc_len += bp->b_bcount;
1972 wc->wc_blkcount++;
1973 bp = LIST_NEXT(bp, b_wapbllist);
1974 }
1975 if (wc->wc_len % blocklen != 0) {
1976 padding = blocklen - wc->wc_len % blocklen;
1977 wc->wc_len += padding;
1978 } else {
1979 padding = 0;
1980 }
1981
1982 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
1983 ("wapbl_write_blocks: len = %u (padding %zu) off = %"PRIdMAX"\n",
1984 wc->wc_len, padding, (intmax_t)off));
1985
1986 error = wapbl_circ_write(wl, wc, blocklen, &off);
1987 if (error)
1988 return error;
1989 bp = obp;
1990 cnt = 0;
1991 while (bp && (cnt++ < bph)) {
1992 error = wapbl_circ_write(wl, bp->b_data,
1993 bp->b_bcount, &off);
1994 if (error)
1995 return error;
1996 bp = LIST_NEXT(bp, b_wapbllist);
1997 }
1998 if (padding) {
1999 void *zero;
2000
2001 zero = wapbl_malloc(padding);
2002 memset(zero, 0, padding);
2003 error = wapbl_circ_write(wl, zero, padding, &off);
2004 wapbl_free(zero);
2005 if (error)
2006 return error;
2007 }
2008 }
2009 *offp = off;
2010 return 0;
2011 }
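/*
 * Shape of each record emitted above (illustrative):
 *
 *	[WAPBL_WC_BLOCKS header][buf 0 data][buf 1 data]...[zero pad]
 *
 * with the padding chosen so that wc_len is a multiple of blocklen.
 */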
2012
2013 static int
2014 wapbl_write_revocations(struct wapbl *wl, off_t *offp)
2015 {
2016 struct wapbl_wc_blocklist *wc =
2017 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
2018 int i;
2019 int blocklen = 1<<wl->wl_log_dev_bshift;
2020 int bph;
2021 off_t off = *offp;
2022 int error;
2023
2024 if (wl->wl_dealloccnt == 0)
2025 return 0;
2026
2027 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
2028 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
2029
2030 i = 0;
2031 while (i < wl->wl_dealloccnt) {
2032 wc->wc_type = WAPBL_WC_REVOCATIONS;
2033 wc->wc_len = blocklen;
2034 wc->wc_blkcount = 0;
2035 while ((i < wl->wl_dealloccnt) && (wc->wc_blkcount < bph)) {
2036 wc->wc_blocks[wc->wc_blkcount].wc_daddr =
2037 wl->wl_deallocblks[i];
2038 wc->wc_blocks[wc->wc_blkcount].wc_dlen =
2039 wl->wl_dealloclens[i];
2040 wc->wc_blkcount++;
2041 i++;
2042 }
2043 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2044 ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n",
2045 wc->wc_len, (intmax_t)off));
2046 error = wapbl_circ_write(wl, wc, blocklen, &off);
2047 if (error)
2048 return error;
2049 }
2050 *offp = off;
2051 return 0;
2052 }
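/*
 * Note: revocations describe blocks deallocated during this
 * transaction; on replay, wapbl_replay_process_revocations() drops
 * them from the block hashtable so stale logged contents are never
 * written back over reused blocks.
 */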
2053
2054 static int
2055 wapbl_write_inodes(struct wapbl *wl, off_t *offp)
2056 {
2057 struct wapbl_wc_inodelist *wc =
2058 (struct wapbl_wc_inodelist *)wl->wl_wc_scratch;
2059 int i;
2060 int blocklen = 1 << wl->wl_log_dev_bshift;
2061 off_t off = *offp;
2062 int error;
2063
2064 struct wapbl_ino_head *wih;
2065 struct wapbl_ino *wi;
2066 int iph;
2067
2068 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
2069 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
2070
2071 i = 0;
2072 wih = &wl->wl_inohash[0];
2073 wi = 0;
2074 do {
2075 wc->wc_type = WAPBL_WC_INODES;
2076 wc->wc_len = blocklen;
2077 wc->wc_inocnt = 0;
2078 wc->wc_clear = (i == 0);
2079 while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) {
2080 while (!wi) {
2081 KASSERT((wih - &wl->wl_inohash[0])
2082 <= wl->wl_inohashmask);
2083 wi = LIST_FIRST(wih++);
2084 }
2085 wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino;
2086 wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode;
2087 wc->wc_inocnt++;
2088 i++;
2089 wi = LIST_NEXT(wi, wi_hash);
2090 }
2091 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2092 ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n",
2093 wc->wc_len, (intmax_t)off));
2094 error = wapbl_circ_write(wl, wc, blocklen, &off);
2095 if (error)
2096 return error;
2097 } while (i < wl->wl_inohashcnt);
2098
2099 *offp = off;
2100 return 0;
2101 }
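/*
 * Note: the first record of each inode dump has wc_clear set, which
 * tells replay (wapbl_replay_process_inodes()) to discard any inode
 * list accumulated from earlier records before appending these
 * entries.
 */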
2102
2103 #endif /* _KERNEL */
2104
2105 /****************************************************************/
2106
2107 #ifdef _KERNEL
2108 static struct pool wapbl_blk_pool;
2109 static int wapbl_blk_pool_refcount;
2110 #endif
2111 struct wapbl_blk {
2112 LIST_ENTRY(wapbl_blk) wb_hash;
2113 daddr_t wb_blk;
2114 off_t wb_off; /* Offset of this block in the log */
2115 };
2116 #define WAPBL_BLKPOOL_MIN 83
2117
2118 static void
2119 wapbl_blkhash_init(struct wapbl_replay *wr, u_int size)
2120 {
2121 if (size < WAPBL_BLKPOOL_MIN)
2122 size = WAPBL_BLKPOOL_MIN;
2123 KASSERT(wr->wr_blkhash == 0);
2124 #ifdef _KERNEL
2125 wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask);
2126 if (atomic_inc_uint_nv(&wapbl_blk_pool_refcount) == 1) {
2127 pool_init(&wapbl_blk_pool, sizeof(struct wapbl_blk), 0, 0, 0,
2128 "wapblblkpl", &pool_allocator_nointr, IPL_NONE);
2129 }
2130 #else /* ! _KERNEL */
2131 /* Manually implement hashinit */
2132 {
2133 int i;
2134 unsigned long hashsize;
2135 for (hashsize = 1; hashsize < size; hashsize <<= 1)
2136 continue;
2137 wr->wr_blkhash = wapbl_malloc(hashsize * sizeof(*wr->wr_blkhash));
2138 wr->wr_blkhashmask = hashsize - 1;
2139 for (i = 0; i < hashsize; i++)
2140 LIST_INIT(&wr->wr_blkhash[i]);
2141 }
2142 #endif /* ! _KERNEL */
2143 }
2144
2145 static void
2146 wapbl_blkhash_free(struct wapbl_replay *wr)
2147 {
2148 KASSERT(wr->wr_blkhashcnt == 0);
2149 #ifdef _KERNEL
2150 hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask);
2151 if (atomic_dec_uint_nv(&wapbl_blk_pool_refcount) == 0) {
2152 pool_destroy(&wapbl_blk_pool);
2153 }
2154 #else /* ! _KERNEL */
2155 wapbl_free(wr->wr_blkhash);
2156 #endif /* ! _KERNEL */
2157 }
2158
2159 static struct wapbl_blk *
2160 wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk)
2161 {
2162 struct wapbl_blk_head *wbh;
2163 struct wapbl_blk *wb;
2164 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2165 LIST_FOREACH(wb, wbh, wb_hash) {
2166 if (blk == wb->wb_blk)
2167 return wb;
2168 }
2169 return 0;
2170 }
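/*
 * The bucket index is simply (blk & wr_blkhashmask); since the table
 * size is a power of two, the mask keeps the low-order bits of the
 * block number, e.g. blk 0x1234 with mask 0x7f lands in bucket 0x34.
 */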
2171
2172 static void
2173 wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off)
2174 {
2175 struct wapbl_blk_head *wbh;
2176 struct wapbl_blk *wb;
2177 wb = wapbl_blkhash_get(wr, blk);
2178 if (wb) {
2179 KASSERT(wb->wb_blk == blk);
2180 wb->wb_off = off;
2181 } else {
2182 #ifdef _KERNEL
2183 wb = pool_get(&wapbl_blk_pool, PR_WAITOK);
2184 #else /* ! _KERNEL */
2185 wb = wapbl_malloc(sizeof(*wb));
2186 #endif /* ! _KERNEL */
2187 wb->wb_blk = blk;
2188 wb->wb_off = off;
2189 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2190 LIST_INSERT_HEAD(wbh, wb, wb_hash);
2191 wr->wr_blkhashcnt++;
2192 }
2193 }
2194
2195 static void
2196 wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk)
2197 {
2198 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2199 if (wb) {
2200 KASSERT(wr->wr_blkhashcnt > 0);
2201 wr->wr_blkhashcnt--;
2202 LIST_REMOVE(wb, wb_hash);
2203 #ifdef _KERNEL
2204 pool_put(&wapbl_blk_pool, wb);
2205 #else /* ! _KERNEL */
2206 wapbl_free(wb);
2207 #endif /* ! _KERNEL */
2208 }
2209 }
2210
2211 static void
2212 wapbl_blkhash_clear(struct wapbl_replay *wr)
2213 {
2214 int i;
2215 for (i = 0; i <= wr->wr_blkhashmask; i++) {
2216 struct wapbl_blk *wb;
2217
2218 while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) {
2219 KASSERT(wr->wr_blkhashcnt > 0);
2220 wr->wr_blkhashcnt--;
2221 LIST_REMOVE(wb, wb_hash);
2222 #ifdef _KERNEL
2223 pool_put(&wapbl_blk_pool, wb);
2224 #else /* ! _KERNEL */
2225 wapbl_free(wb);
2226 #endif /* ! _KERNEL */
2227 }
2228 }
2229 KASSERT(wr->wr_blkhashcnt == 0);
2230 }
2231
2232 /****************************************************************/
2233
2234 static int
2235 wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp)
2236 {
2237 size_t slen;
2238 off_t off = *offp;
2239 int error;
2240
2241 KASSERT(((len >> wr->wr_log_dev_bshift) <<
2242 wr->wr_log_dev_bshift) == len);
2243 if (off < wr->wr_circ_off)
2244 off = wr->wr_circ_off;
2245 slen = wr->wr_circ_off + wr->wr_circ_size - off;
2246 if (slen < len) {
2247 error = wapbl_read(data, slen, wr->wr_devvp,
2248 wr->wr_logpbn + (off >> wr->wr_log_dev_bshift));
2249 if (error)
2250 return error;
2251 data = (uint8_t *)data + slen;
2252 len -= slen;
2253 off = wr->wr_circ_off;
2254 }
2255 error = wapbl_read(data, len, wr->wr_devvp,
2256 wr->wr_logpbn + (off >> wr->wr_log_dev_bshift));
2257 if (error)
2258 return error;
2259 off += len;
2260 if (off >= wr->wr_circ_off + wr->wr_circ_size)
2261 off = wr->wr_circ_off;
2262 *offp = off;
2263 return 0;
2264 }
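/*
 * Wraparound example (illustrative): with wr_circ_off = 1024,
 * wr_circ_size = 8192 and a 1024-byte read at off = 8704, the
 * circular area ends at 9216, so slen = 512 bytes come from the tail
 * of the area and the remaining 512 from its start.
 */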
2265
2266 static void
2267 wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp)
2268 {
2269 size_t slen;
2270 off_t off = *offp;
2271
2272 KASSERT(((len >> wr->wr_log_dev_bshift) <<
2273 wr->wr_log_dev_bshift) == len);
2274
2275 if (off < wr->wr_circ_off)
2276 off = wr->wr_circ_off;
2277 slen = wr->wr_circ_off + wr->wr_circ_size - off;
2278 if (slen < len) {
2279 len -= slen;
2280 off = wr->wr_circ_off;
2281 }
2282 off += len;
2283 if (off >= wr->wr_circ_off + wr->wr_circ_size)
2284 off = wr->wr_circ_off;
2285 *offp = off;
2286 }
2287
2288 /****************************************************************/
2289
2290 int
2291 wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp,
2292 daddr_t off, size_t count, size_t blksize)
2293 {
2294 struct wapbl_replay *wr;
2295 int error;
2296 struct vnode *devvp;
2297 daddr_t logpbn;
2298 uint8_t *scratch;
2299 struct wapbl_wc_header *wch;
2300 struct wapbl_wc_header *wch2;
2301 /* Use this until we read the actual log header */
2302 int log_dev_bshift = DEV_BSHIFT;
2303 size_t used;
2304
2305 WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
2306 ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n",
2307 vp, off, count, blksize));
2308
2309 if (off < 0)
2310 return EINVAL;
2311
2312 if (blksize < DEV_BSIZE)
2313 return EINVAL;
2314 if (blksize % DEV_BSIZE)
2315 return EINVAL;
2316
2317 #ifdef _KERNEL
2318 #if 0
2319 /* XXX vp->v_size isn't reliably set for VBLK devices,
2320 * especially root. However, we might still want to verify
2321 * that the full load is readable */
2322 if ((off + count) * blksize > vp->v_size)
2323 return EINVAL;
2324 #endif
2325
2326 if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) {
2327 return error;
2328 }
2329 #else /* ! _KERNEL */
2330 devvp = vp;
2331 logpbn = off;
2332 #endif /* ! _KERNEL */
2333
2334 scratch = wapbl_malloc(MAXBSIZE);
2335
2336 error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, logpbn);
2337 if (error)
2338 goto errout;
2339
2340 wch = (struct wapbl_wc_header *)scratch;
2341 wch2 =
2342 (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift));
2343 /* XXX verify checksums and magic numbers */
2344 if (wch->wc_type != WAPBL_WC_HEADER) {
2345 printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type);
2346 error = EFTYPE;
2347 goto errout;
2348 }
2349
2350 if (wch2->wc_generation > wch->wc_generation)
2351 wch = wch2;
2352
2353 wr = wapbl_calloc(1, sizeof(*wr));
2354
2355 wr->wr_logvp = vp;
2356 wr->wr_devvp = devvp;
2357 wr->wr_logpbn = logpbn;
2358
2359 wr->wr_scratch = scratch;
2360
2361 wr->wr_log_dev_bshift = wch->wc_log_dev_bshift;
2362 wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift;
2363 wr->wr_circ_off = wch->wc_circ_off;
2364 wr->wr_circ_size = wch->wc_circ_size;
2365 wr->wr_generation = wch->wc_generation;
2366
2367 used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail);
2368
2369 WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
2370 ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64
2371 " len=%"PRId64" used=%zu\n",
2372 wch->wc_head, wch->wc_tail, wch->wc_circ_off,
2373 wch->wc_circ_size, used));
2374
2375 wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift));
2376
2377 error = wapbl_replay_process(wr, wch->wc_head, wch->wc_tail);
2378 if (error) {
2379 wapbl_replay_stop(wr);
2380 wapbl_replay_free(wr);
2381 return error;
2382 }
2383
2384 *wrp = wr;
2385 return 0;
2386
2387 errout:
2388 wapbl_free(scratch);
2389 return error;
2390 }
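/*
 * Usage sketch (illustrative only; real call sites are in the
 * filesystem mount path, and 'devvp', 'logstart', 'logcount' and
 * 'logblksz' are assumed to come from the superblock):
 */
#if 0
	struct wapbl_replay *wr;
	int error;

	error = wapbl_replay_start(&wr, devvp, logstart, logcount, logblksz);
	if (error == 0) {
		/* Write the logged blocks back to the filesystem device. */
		error = wapbl_replay_write(wr, devvp);
		wapbl_replay_stop(wr);
		wapbl_replay_free(wr);
	}
#endif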
2391
2392 void
2393 wapbl_replay_stop(struct wapbl_replay *wr)
2394 {
2395
2396 if (!wapbl_replay_isopen(wr))
2397 return;
2398
2399 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n"));
2400
2401 wapbl_free(wr->wr_scratch);
2402 wr->wr_scratch = 0;
2403
2404 wr->wr_logvp = 0;
2405
2406 wapbl_blkhash_clear(wr);
2407 wapbl_blkhash_free(wr);
2408 }
2409
2410 void
2411 wapbl_replay_free(struct wapbl_replay *wr)
2412 {
2413
2414 KDASSERT(!wapbl_replay_isopen(wr));
2415
2416 if (wr->wr_inodes)
2417 wapbl_free(wr->wr_inodes);
2418 wapbl_free(wr);
2419 }
2420
2421 #ifdef _KERNEL
2422 int
2423 wapbl_replay_isopen1(struct wapbl_replay *wr)
2424 {
2425
2426 return wapbl_replay_isopen(wr);
2427 }
2428 #endif
2429
2430 static void
2431 wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp)
2432 {
2433 struct wapbl_wc_blocklist *wc =
2434 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2435 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2436 int i, j, n;
2437
2438 for (i = 0; i < wc->wc_blkcount; i++) {
2439 /*
2440 * Enter each physical block into the hashtable independently.
2441 */
2442 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
2443 for (j = 0; j < n; j++) {
2444 wapbl_blkhash_ins(wr, wc->wc_blocks[i].wc_daddr + j,
2445 *offp);
2446 wapbl_circ_advance(wr, fsblklen, offp);
2447 }
2448 }
2449 }
2450
2451 static void
2452 wapbl_replay_process_revocations(struct wapbl_replay *wr)
2453 {
2454 struct wapbl_wc_blocklist *wc =
2455 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2456 int i, j, n;
2457
2458 for (i = 0; i < wc->wc_blkcount; i++) {
2459 /*
2460 * Remove any blocks found from the hashtable.
2461 */
2462 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
2463 for (j = 0; j < n; j++)
2464 wapbl_blkhash_rem(wr, wc->wc_blocks[i].wc_daddr + j);
2465 }
2466 }
2467
2468 static void
2469 wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff, off_t newoff)
2470 {
2471 struct wapbl_wc_inodelist *wc =
2472 (struct wapbl_wc_inodelist *)wr->wr_scratch;
2473 /*
2474 * Keep track of where we found this so the location won't
2475 * be overwritten.
2476 */
2477 if (wc->wc_clear) {
2478 wr->wr_inodestail = oldoff;
2479 wr->wr_inodescnt = 0;
2480 if (wr->wr_inodes != NULL) {
2481 wapbl_free(wr->wr_inodes);
2482 wr->wr_inodes = NULL;
2483 }
2484 }
2485 wr->wr_inodeshead = newoff;
2486 if (wc->wc_inocnt == 0)
2487 return;
2488
2489 wr->wr_inodes = wapbl_realloc(wr->wr_inodes,
2490 (wr->wr_inodescnt + wc->wc_inocnt) * sizeof(wc->wc_inodes[0]));
2491 memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes,
2492 wc->wc_inocnt * sizeof(wc->wc_inodes[0]));
2493 wr->wr_inodescnt += wc->wc_inocnt;
2494 }
2495
2496 static int
2497 wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail)
2498 {
2499 off_t off;
2500 int error;
2501
2502 int logblklen = 1 << wr->wr_log_dev_bshift;
2503
2504 wapbl_blkhash_clear(wr);
2505
2506 off = tail;
2507 while (off != head) {
2508 struct wapbl_wc_null *wcn;
2509 off_t saveoff = off;
2510 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
2511 if (error)
2512 goto errout;
2513 wcn = (struct wapbl_wc_null *)wr->wr_scratch;
2514 switch (wcn->wc_type) {
2515 case WAPBL_WC_BLOCKS:
2516 wapbl_replay_process_blocks(wr, &off);
2517 break;
2518
2519 case WAPBL_WC_REVOCATIONS:
2520 wapbl_replay_process_revocations(wr);
2521 break;
2522
2523 case WAPBL_WC_INODES:
2524 wapbl_replay_process_inodes(wr, saveoff, off);
2525 break;
2526
2527 default:
2528 printf("Unrecognized wapbl type: 0x%08x\n",
2529 wcn->wc_type);
2530 error = EFTYPE;
2531 goto errout;
2532 }
2533 wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
2534 if (off != saveoff) {
2535 printf("wapbl_replay: corrupted records\n");
2536 error = EFTYPE;
2537 goto errout;
2538 }
2539 }
2540 return 0;
2541
2542 errout:
2543 wapbl_blkhash_clear(wr);
2544 return error;
2545 }
2546
2547 #if 0
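/*
 * XXX This disabled block references 'wch', which is not defined in
 * this function; it will need updating before it can be re-enabled.
 */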
2548 int
2549 wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp)
2550 {
2551 off_t off;
2552 int mismatchcnt = 0;
2553 int logblklen = 1 << wr->wr_log_dev_bshift;
2554 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2555 void *scratch1 = wapbl_malloc(MAXBSIZE);
2556 void *scratch2 = wapbl_malloc(MAXBSIZE);
2557 int error = 0;
2558
2559 KDASSERT(wapbl_replay_isopen(wr));
2560
2561 off = wch->wc_tail;
2562 while (off != wch->wc_head) {
2563 struct wapbl_wc_null *wcn;
2564 #ifdef DEBUG
2565 off_t saveoff = off;
2566 #endif
2567 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
2568 if (error)
2569 goto out;
2570 wcn = (struct wapbl_wc_null *)wr->wr_scratch;
2571 switch (wcn->wc_type) {
2572 case WAPBL_WC_BLOCKS:
2573 {
2574 struct wapbl_wc_blocklist *wc =
2575 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2576 int i;
2577 for (i = 0; i < wc->wc_blkcount; i++) {
2578 int foundcnt = 0;
2579 int dirtycnt = 0;
2580 int j, n;
2581 /*
2582 * Check each physical block against the
2583 * hashtable independently.
2584 */
2585 n = wc->wc_blocks[i].wc_dlen >>
2586 wch->wc_fs_dev_bshift;
2587 for (j = 0; j < n; j++) {
2588 struct wapbl_blk *wb =
2589 wapbl_blkhash_get(wr,
2590 wc->wc_blocks[i].wc_daddr + j);
2591 if (wb && (wb->wb_off == off)) {
2592 foundcnt++;
2593 error =
2594 wapbl_circ_read(wr,
2595 scratch1, fsblklen,
2596 &off);
2597 if (error)
2598 goto out;
2599 error =
2600 wapbl_read(scratch2,
2601 fsblklen, fsdevvp,
2602 wb->wb_blk);
2603 if (error)
2604 goto out;
2605 if (memcmp(scratch1,
2606 scratch2,
2607 fsblklen)) {
2608 printf(
2609 "wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n",
2610 wb->wb_blk, (intmax_t)off);
2611 dirtycnt++;
2612 mismatchcnt++;
2613 }
2614 } else {
2615 wapbl_circ_advance(wr,
2616 fsblklen, &off);
2617 }
2618 }
2619 #if 0
2620 /*
2621 * If all of the blocks in an entry
2622 * are clean, then remove all of its
2623 * blocks from the hashtable since they
2624 * never will need replay.
2625 */
2626 if ((foundcnt != 0) &&
2627 (dirtycnt == 0)) {
2628 off = saveoff;
2629 wapbl_circ_advance(wr,
2630 logblklen, &off);
2631 for (j = 0; j < n; j++) {
2632 struct wapbl_blk *wb =
2633 wapbl_blkhash_get(wr,
2634 wc->wc_blocks[i].wc_daddr + j);
2635 if (wb &&
2636 (wb->wb_off == off)) {
2637 wapbl_blkhash_rem(wr, wb->wb_blk);
2638 }
2639 wapbl_circ_advance(wr,
2640 fsblklen, &off);
2641 }
2642 }
2643 #endif
2644 }
2645 }
2646 break;
2647 case WAPBL_WC_REVOCATIONS:
2648 case WAPBL_WC_INODES:
2649 break;
2650 default:
2651 KASSERT(0);
2652 }
2653 #ifdef DEBUG
2654 wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
2655 KASSERT(off == saveoff);
2656 #endif
2657 }
2658 out:
2659 wapbl_free(scratch1);
2660 wapbl_free(scratch2);
2661 if (!error && mismatchcnt)
2662 error = EFTYPE;
2663 return error;
2664 }
2665 #endif
2666
2667 int
2668 wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp)
2669 {
2670 struct wapbl_blk *wb;
2671 size_t i;
2672 off_t off;
2673 void *scratch;
2674 int error = 0;
2675 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2676
2677 KDASSERT(wapbl_replay_isopen(wr));
2678
2679 scratch = wapbl_malloc(MAXBSIZE);
2680
2681 for (i = 0; i < wr->wr_blkhashmask; ++i) {
2682 LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) {
2683 off = wb->wb_off;
2684 error = wapbl_circ_read(wr, scratch, fsblklen, &off);
2685 if (error)
2686 break;
2687 error = wapbl_write(scratch, fsblklen, fsdevvp,
2688 wb->wb_blk);
2689 if (error)
2690 break;
2691 }
2692 }
2693
2694 wapbl_free(scratch);
2695 return error;
2696 }
2697
2698 int
2699 wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len)
2700 {
2701 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2702
2703 KDASSERT(wapbl_replay_isopen(wr));
2704 KASSERT((len % fsblklen) == 0);
2705
2706 while (len != 0) {
2707 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2708 if (wb)
2709 return 1;
2710 len -= fsblklen;
blk++;
2711 }
2712 return 0;
2713 }
2714
2715 int
2716 wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len)
2717 {
2718 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2719
2720 KDASSERT(wapbl_replay_isopen(wr));
2721
2722 KASSERT((len % fsblklen) == 0);
2723
2724 while (len != 0) {
2725 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2726 if (wb) {
2727 off_t off = wb->wb_off;
2728 int error;
2729 error = wapbl_circ_read(wr, data, fsblklen, &off);
2730 if (error)
2731 return error;
2732 }
2733 data = (uint8_t *)data + fsblklen;
2734 len -= fsblklen;
2735 blk++;
2736 }
2737 return 0;
2738 }
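/*
 * Overlay sketch (illustrative only; 'buf', 'blk', 'len' and
 * 'fsdevvp' are assumed to come from the caller's read path): read
 * from the device first, then let the log override any stale blocks.
 */
#if 0
	error = wapbl_read(buf, len, fsdevvp, blk);
	if (error == 0 && wapbl_replay_can_read(wr, blk, len))
		error = wapbl_replay_read(wr, buf, blk, len);
#endif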
2739