vfs_wapbl.c revision 1.22 1 /* $NetBSD: vfs_wapbl.c,v 1.22 2009/02/18 13:22:10 yamt Exp $ */
2
3 /*-
4 * Copyright (c) 2003,2008 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Wasabi Systems, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * This implements file system independent write ahead filesystem logging.
34 */
35
36 #define WAPBL_INTERNAL
37
38 #include <sys/cdefs.h>
39 __KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.22 2009/02/18 13:22:10 yamt Exp $");
40
41 #include <sys/param.h>
42
43 #ifdef _KERNEL
44 #include <sys/param.h>
45 #include <sys/namei.h>
46 #include <sys/proc.h>
47 #include <sys/uio.h>
48 #include <sys/vnode.h>
49 #include <sys/file.h>
50 #include <sys/malloc.h>
51 #include <sys/resourcevar.h>
52 #include <sys/conf.h>
53 #include <sys/mount.h>
54 #include <sys/kernel.h>
55 #include <sys/kauth.h>
56 #include <sys/mutex.h>
57 #include <sys/atomic.h>
58 #include <sys/wapbl.h>
59 #include <sys/wapbl_replay.h>
60
61 #include <miscfs/specfs/specdev.h>
62
63 #if 0 /* notyet */
64 #define wapbl_malloc(s) kmem_alloc((s), KM_SLEEP)
65 #define wapbl_free(a, s) kmem_free((a), (s))
66 #define wapbl_calloc(n, s) kmem_zalloc((n)*(s), KM_SLEEP)
67 #else
68 MALLOC_JUSTDEFINE(M_WAPBL, "wapbl", "write-ahead physical block logging");
69 #define wapbl_malloc(s) malloc((s), M_WAPBL, M_WAITOK)
70 #define wapbl_free(a, s) free((a), M_WAPBL)
71 #define wapbl_calloc(n, s) malloc((n)*(s), M_WAPBL, M_WAITOK | M_ZERO)
72 #endif
73
74 #else /* !_KERNEL */
75 #include <assert.h>
76 #include <errno.h>
77 #include <stdio.h>
78 #include <stdbool.h>
79 #include <stdlib.h>
80 #include <string.h>
81
82 #include <sys/time.h>
83 #include <sys/wapbl.h>
84 #include <sys/wapbl_replay.h>
85
86 #define KDASSERT(x) assert(x)
87 #define KASSERT(x) assert(x)
88 #define wapbl_malloc(s) malloc(s)
89 #define wapbl_free(a, s) free(a)
90 #define wapbl_calloc(n, s) calloc((n), (s))
91
92 #endif /* !_KERNEL */
93
94 /*
95 * INTERNAL DATA STRUCTURES
96 */
97
98 /*
99 * This structure holds per-mount log information.
100 *
101 * Legend: a = atomic access only
102 * r = read-only after init
103 * l = rwlock held
104 * m = mutex held
105 * u = unlocked access ok
106 * b = bufcache_lock held
107 */
108 struct wapbl {
109 struct vnode *wl_logvp; /* r: log here */
110 struct vnode *wl_devvp; /* r: log on this device */
111 struct mount *wl_mount; /* r: mountpoint wl is associated with */
112 daddr_t wl_logpbn; /* r: Physical block number of start of log */
113 int wl_log_dev_bshift; /* r: logarithm of device block size of log
114 device */
115 int wl_fs_dev_bshift; /* r: logarithm of device block size of
116 filesystem device */
117
118 unsigned wl_lock_count; /* m: Count of transactions in progress */
119
120 size_t wl_circ_size; /* r: Number of bytes in buffer of log */
121 size_t wl_circ_off; /* r: Number of bytes reserved at start */
122
123 size_t wl_bufcount_max; /* r: Number of buffers reserved for log */
124 size_t wl_bufbytes_max; /* r: Number of buf bytes reserved for log */
125
126 off_t wl_head; /* l: Byte offset of log head */
127 off_t wl_tail; /* l: Byte offset of log tail */
128 /*
129 * head == tail == 0 means log is empty
130 * head == tail != 0 means log is full
131 * see assertions in wapbl_advance() for other boundary conditions.
132 * only truncate moves the tail, except when flush sets it to
133 * wl_header_size only flush moves the head, except when truncate
134 * sets it to 0.
135 */
136
137 struct wapbl_wc_header *wl_wc_header; /* l */
138 void *wl_wc_scratch; /* l: scratch space (XXX: por que?!?) */
139
140 kmutex_t wl_mtx; /* u: short-term lock */
141 krwlock_t wl_rwlock; /* u: File system transaction lock */
142
143 /*
144 * Must be held while accessing
145 * wl_count or wl_bufs or head or tail
146 */
147
148 /*
149 * Callback called from within the flush routine to flush any extra
150 * bits. Note that flush may be skipped without calling this if
151 * there are no outstanding buffers in the transaction.
152 */
153 #if _KERNEL
154 wapbl_flush_fn_t wl_flush; /* r */
155 wapbl_flush_fn_t wl_flush_abort;/* r */
156 #endif
157
158 size_t wl_bufbytes; /* m: Byte count of pages in wl_bufs */
159 size_t wl_bufcount; /* m: Count of buffers in wl_bufs */
160 size_t wl_bcount; /* m: Total bcount of wl_bufs */
161
162 LIST_HEAD(, buf) wl_bufs; /* m: Buffers in current transaction */
163
164 kcondvar_t wl_reclaimable_cv; /* m (obviously) */
165 size_t wl_reclaimable_bytes; /* m: Amount of space available for
166 reclamation by truncate */
167 int wl_error_count; /* m: # of wl_entries with errors */
168 size_t wl_reserved_bytes; /* never truncate log smaller than this */
169
170 #ifdef WAPBL_DEBUG_BUFBYTES
171 size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */
172 #endif
173
174 daddr_t *wl_deallocblks;/* l: address of block */
175 int *wl_dealloclens; /* l: size of block */
176 int wl_dealloccnt; /* l: total count */
177 int wl_dealloclim; /* l: max count */
178
179 /* hashtable of inode numbers for allocated but unlinked inodes */
180 /* synch ??? */
181 LIST_HEAD(wapbl_ino_head, wapbl_ino) *wl_inohash;
182 u_long wl_inohashmask;
183 int wl_inohashcnt;
184
185 SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction
186 accounting */
187 };
188
189 #ifdef WAPBL_DEBUG_PRINT
190 int wapbl_debug_print = WAPBL_DEBUG_PRINT;
191 #endif
192
193 /****************************************************************/
194 #ifdef _KERNEL
195
196 #ifdef WAPBL_DEBUG
197 struct wapbl *wapbl_debug_wl;
198 #endif
199
200 static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail);
201 static int wapbl_write_blocks(struct wapbl *wl, off_t *offp);
202 static int wapbl_write_revocations(struct wapbl *wl, off_t *offp);
203 static int wapbl_write_inodes(struct wapbl *wl, off_t *offp);
204 #endif /* _KERNEL */
205
206 static int wapbl_replay_process(struct wapbl_replay *wr, off_t, off_t);
207
208 static __inline size_t wapbl_space_free(size_t avail, off_t head,
209 off_t tail);
210 static __inline size_t wapbl_space_used(size_t avail, off_t head,
211 off_t tail);
212
213 #ifdef _KERNEL
214
/* Default bucket count for the unlinked-inode hash (see wapbl_start). */
#define WAPBL_INODETRK_SIZE 83
static int wapbl_ino_pool_refcount;	/* presumably init/fini refcount of
					   wapbl_ino_pool — confirm in
					   wapbl_inodetrk_init/free */
static struct pool wapbl_ino_pool;	/* backing store for struct wapbl_ino */
/* One allocated-but-unlinked inode tracked by the log (hashed in
 * wl_inohash; see wapbl_discard and wapbl_start_flush_inodes). */
struct wapbl_ino {
	LIST_ENTRY(wapbl_ino) wi_hash;	/* chain within a hash bucket */
	ino_t wi_ino;			/* inode number */
	mode_t wi_mode;			/* inode mode */
};
223
224 static void wapbl_inodetrk_init(struct wapbl *wl, u_int size);
225 static void wapbl_inodetrk_free(struct wapbl *wl);
226 static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino);
227
228 static size_t wapbl_transaction_len(struct wapbl *wl);
229 static __inline size_t wapbl_transaction_inodes_len(struct wapbl *wl);
230
231 #if 0
232 int wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
233 #endif
234
235 static int wapbl_replay_isopen1(struct wapbl_replay *);
236
/*
 * This is useful for debugging.  If set, the log will
 * only be truncated when necessary.
 */
int wapbl_lazy_truncate = 0;	/* 0 = truncate eagerly (the default) */
242
/*
 * Function-pointer table exporting this implementation; presumably
 * consumed by the VFS layer so file systems need not link against
 * wapbl directly — confirm against sys/wapbl.h.
 */
struct wapbl_ops wapbl_ops = {
	.wo_wapbl_discard	= wapbl_discard,
	.wo_wapbl_replay_isopen	= wapbl_replay_isopen1,
	.wo_wapbl_replay_can_read = wapbl_replay_can_read,
	.wo_wapbl_replay_read	= wapbl_replay_read,
	.wo_wapbl_add_buf	= wapbl_add_buf,
	.wo_wapbl_remove_buf	= wapbl_remove_buf,
	.wo_wapbl_resize_buf	= wapbl_resize_buf,
	.wo_wapbl_begin		= wapbl_begin,
	.wo_wapbl_end		= wapbl_end,
	.wo_wapbl_junlock_assert= wapbl_junlock_assert,

	/* XXX: the following is only used to say "this is a wapbl buf" */
	.wo_wapbl_biodone	= wapbl_biodone,
};
258
259 void
260 wapbl_init()
261 {
262
263 malloc_type_attach(M_WAPBL);
264 }
265
/*
 * Carry the allocated-but-unlinked inodes recorded in the replay state
 * 'wr' over into the newly opened log 'wl' and write them to disk, so
 * a crash before the first flush still knows about them.
 * Returns 0 or an error from wapbl_write_inodes().
 */
static int
wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr)
{
	int error, i;

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_start: reusing log with %d inodes\n", wr->wr_inodescnt));

	/*
	 * Its only valid to reuse the replay log if its
	 * the same as the new log we just opened.
	 */
	KDASSERT(!wapbl_replay_isopen(wr));
	KASSERT(wl->wl_devvp->v_rdev == wr->wr_devvp->v_rdev);
	KASSERT(wl->wl_logpbn == wr->wr_logpbn);
	KASSERT(wl->wl_circ_size == wr->wr_circ_size);
	KASSERT(wl->wl_circ_off == wr->wr_circ_off);
	KASSERT(wl->wl_log_dev_bshift == wr->wr_log_dev_bshift);
	KASSERT(wl->wl_fs_dev_bshift == wr->wr_fs_dev_bshift);

	/* Our commit headers supersede the replayed generation. */
	wl->wl_wc_header->wc_generation = wr->wr_generation + 1;

	/* Re-register every surviving unlinked inode with the new log. */
	for (i = 0; i < wr->wr_inodescnt; i++)
		wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber,
		    wr->wr_inodes[i].wr_imode);

	/* Make sure new transaction won't overwrite old inodes list */
	KDASSERT(wapbl_transaction_len(wl) <=
	    wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead,
	    wr->wr_inodestail));

	/* Start the new log where the replayed inode records begin. */
	wl->wl_head = wl->wl_tail = wr->wr_inodeshead;
	wl->wl_reclaimable_bytes = wl->wl_reserved_bytes =
	    wapbl_transaction_len(wl);

	error = wapbl_write_inodes(wl, &wl->wl_head);
	if (error)
		return error;

	/* Writing the inode records must have made the log non-empty. */
	KASSERT(wl->wl_head != wl->wl_tail);
	KASSERT(wl->wl_head != 0);

	return 0;
}
310
/*
 * Create and initialize a log occupying 'count' blocks of 'blksize'
 * bytes starting at logical block 'off' of vnode 'vp', returning it
 * through 'wlp'.  If replay state 'wr' holds pending unlinked inodes,
 * they are carried over into the new log.  'flushfn'/'flushabortfn'
 * are the file system callbacks invoked on flush and abort.
 * Returns 0 on success or an errno value.
 */
int
wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp,
	daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr,
	wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn)
{
	struct wapbl *wl;
	struct vnode *devvp;
	daddr_t logpbn;
	int error;
	int log_dev_bshift = DEV_BSHIFT;
	int fs_dev_bshift = DEV_BSHIFT;
	int run;

	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64
	    " count=%zu blksize=%zu\n", vp, off, count, blksize));

	if (log_dev_bshift > fs_dev_bshift) {
		WAPBL_PRINTF(WAPBL_PRINT_OPEN,
			("wapbl: log device's block size cannot be larger "
			 "than filesystem's\n"));
		/*
		 * Not currently implemented, although it could be if
		 * needed someday.
		 */
		return ENOSYS;
	}

	if (off < 0)
		return EINVAL;

	/* Log block size must be a positive multiple of DEV_BSIZE. */
	if (blksize < DEV_BSIZE)
		return EINVAL;
	if (blksize % DEV_BSIZE)
		return EINVAL;

	/* XXXTODO: verify that the full load is writable */

	/*
	 * XXX check for minimum log size
	 * minimum is governed by minimum amount of space
	 * to complete a transaction. (probably truncate)
	 */
	/* XXX for now pick something minimal */
	if ((count * blksize) < MAXPHYS) {
		return ENOSPC;
	}

	/* Translate the log's start into a physical block on the device. */
	if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) {
		return error;
	}

	wl = wapbl_calloc(1, sizeof(*wl));
	rw_init(&wl->wl_rwlock);
	mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&wl->wl_reclaimable_cv, "wapblrec");
	LIST_INIT(&wl->wl_bufs);
	SIMPLEQ_INIT(&wl->wl_entries);

	wl->wl_logvp = vp;
	wl->wl_devvp = devvp;
	wl->wl_mount = mp;
	wl->wl_logpbn = logpbn;
	wl->wl_log_dev_bshift = log_dev_bshift;
	wl->wl_fs_dev_bshift = fs_dev_bshift;

	wl->wl_flush = flushfn;
	wl->wl_flush_abort = flushabortfn;

	/* Reserve two log device blocks for the commit headers */
	wl->wl_circ_off = 2<<wl->wl_log_dev_bshift;
	wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off);
	/* truncate the log usage to a multiple of log_dev_bshift */
	wl->wl_circ_size >>= wl->wl_log_dev_bshift;
	wl->wl_circ_size <<= wl->wl_log_dev_bshift;

	/*
	 * wl_bufbytes_max limits the size of the in memory transaction space.
	 * - Since buffers are allocated and accounted for in units of
	 *   PAGE_SIZE it is required to be a multiple of PAGE_SIZE
	 *   (i.e. 1<<PAGE_SHIFT)
	 * - Since the log device has to be written in units of
	 *   1<<wl_log_dev_bshift it is required to be a mulitple of
	 *   1<<wl_log_dev_bshift.
	 * - Since filesystem will provide data in units of 1<<wl_fs_dev_bshift,
	 *   it is convenient to be a multiple of 1<<wl_fs_dev_bshift.
	 * Therefore it must be multiple of the least common multiple of those
	 * three quantities.  Fortunately, all of those quantities are
	 * guaranteed to be a power of two, and the least common multiple of
	 * a set of numbers which are all powers of two is simply the maximum
	 * of those numbers.  Finally, the maximum logarithm of a power of two
	 * is the same as the log of the maximum power of two.  So we can do
	 * the following operations to size wl_bufbytes_max:
	 */

	/* XXX fix actual number of pages reserved per filesystem. */
	wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2);

	/* Round wl_bufbytes_max to the largest power of two constraint */
	wl->wl_bufbytes_max >>= PAGE_SHIFT;
	wl->wl_bufbytes_max <<= PAGE_SHIFT;
	wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift;

	/* XXX maybe use filesystem fragment size instead of 1024 */
	/* XXX fix actual number of buffers reserved per filesystem. */
	wl->wl_bufcount_max = (nbuf / 2) * 1024;

	/* XXX tie this into resource estimation */
	wl->wl_dealloclim = 2 * btodb(wl->wl_bufbytes_max);

	wl->wl_deallocblks = wapbl_malloc(sizeof(*wl->wl_deallocblks) *
	    wl->wl_dealloclim);
	wl->wl_dealloclens = wapbl_malloc(sizeof(*wl->wl_dealloclens) *
	    wl->wl_dealloclim);

	wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);

	/* Initialize the commit header */
	{
		struct wapbl_wc_header *wc;
		size_t len = 1 << wl->wl_log_dev_bshift;
		wc = wapbl_calloc(1, len);
		wc->wc_type = WAPBL_WC_HEADER;
		wc->wc_len = len;
		wc->wc_circ_off = wl->wl_circ_off;
		wc->wc_circ_size = wl->wl_circ_size;
		/* XXX wc->wc_fsid */
		wc->wc_log_dev_bshift = wl->wl_log_dev_bshift;
		wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift;
		wl->wl_wc_header = wc;
		wl->wl_wc_scratch = wapbl_malloc(len);
	}

	/*
	 * if there was an existing set of unlinked but
	 * allocated inodes, preserve it in the new
	 * log.
	 */
	if (wr && wr->wr_inodescnt) {
		error = wapbl_start_flush_inodes(wl, wr);
		if (error)
			goto errout;
	}

	/* Commit the (possibly empty) initial state of the log. */
	error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail);
	if (error) {
		goto errout;
	}

	*wlp = wl;
#if defined(WAPBL_DEBUG)
	wapbl_debug_wl = wl;
#endif

	return 0;
errout:
	/* Tear down everything allocated above, in reverse order. */
	wapbl_discard(wl);
	wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_deallocblks,
	    sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim);
	wapbl_free(wl->wl_dealloclens,
	    sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim);
	wapbl_inodetrk_free(wl);
	wapbl_free(wl, sizeof(*wl));

	return error;
}
481
482 /*
483 * Like wapbl_flush, only discards the transaction
484 * completely
485 */
486
void
wapbl_discard(struct wapbl *wl)
{
	struct wapbl_entry *we;
	struct buf *bp;
	int i;

	/*
	 * XXX we may consider using upgrade here
	 * if we want to call flush from inside a transaction
	 */
	rw_enter(&wl->wl_rwlock, RW_WRITER);
	/* Let the file system abort its pending deallocations. */
	wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
	    wl->wl_dealloccnt);

#ifdef WAPBL_DEBUG_PRINT
	{
		struct wapbl_entry *we;
		pid_t pid = -1;
		lwpid_t lid = -1;
		if (curproc)
			pid = curproc->p_pid;
		if (curlwp)
			lid = curlwp->l_lid;
#ifdef WAPBL_DEBUG_BUFBYTES
		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
		    ("wapbl_discard: thread %d.%d discarding "
		    "transaction\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %u, reclaimable=%zu reserved=%zu "
		    "unsynced=%zu\n",
		    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
		    wl->wl_bcount, wl->wl_dealloccnt,
		    wl->wl_inohashcnt, wl->wl_error_count,
		    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
		    wl->wl_unsynced_bufbytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d, unsynced = %zu\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error, we->we_unsynced_bufbytes));
		}
#else /* !WAPBL_DEBUG_BUFBYTES */
		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
		    ("wapbl_discard: thread %d.%d discarding transaction\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %u, reclaimable=%zu reserved=%zu\n",
		    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
		    wl->wl_bcount, wl->wl_dealloccnt,
		    wl->wl_inohashcnt, wl->wl_error_count,
		    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error));
		}
#endif /* !WAPBL_DEBUG_BUFBYTES */
	}
#endif /* WAPBL_DEBUG_PRINT */

	/* Drop every tracked allocated-but-unlinked inode. */
	for (i = 0; i <= wl->wl_inohashmask; i++) {
		struct wapbl_ino_head *wih;
		struct wapbl_ino *wi;

		wih = &wl->wl_inohash[i];
		while ((wi = LIST_FIRST(wih)) != NULL) {
			LIST_REMOVE(wi, wi_hash);
			pool_put(&wapbl_ino_pool, wi);
			KASSERT(wl->wl_inohashcnt > 0);
			wl->wl_inohashcnt--;
		}
	}

	/*
	 * clean buffer list
	 * (lock order: bufcache_lock before wl_mtx, as taken here)
	 */
	mutex_enter(&bufcache_lock);
	mutex_enter(&wl->wl_mtx);
	while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
		if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) {
			/*
			 * The buffer will be unlocked and
			 * removed from the transaction in brelse
			 */
			mutex_exit(&wl->wl_mtx);
			brelsel(bp, 0);
			mutex_enter(&wl->wl_mtx);
		}
	}
	mutex_exit(&wl->wl_mtx);
	mutex_exit(&bufcache_lock);

	/*
	 * Remove references to this wl from wl_entries, free any which
	 * no longer have buffers, others will be freed in wapbl_biodone
	 * when they no longer have any buffers.
	 */
	while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) {
		SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
		/* XXX should we be accumulating wl_error_count
		 * and increasing reclaimable bytes ? */
		we->we_wapbl = NULL;
		if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
			KASSERT(we->we_unsynced_bufbytes == 0);
#endif
			wapbl_free(we, sizeof(*we));
		}
	}

	/* Discard list of deallocs */
	wl->wl_dealloccnt = 0;
	/* XXX should we clear wl_reserved_bytes? */

	/* The in-memory transaction must now be completely empty. */
	KASSERT(wl->wl_bufbytes == 0);
	KASSERT(wl->wl_bcount == 0);
	KASSERT(wl->wl_bufcount == 0);
	KASSERT(LIST_EMPTY(&wl->wl_bufs));
	KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
	KASSERT(wl->wl_inohashcnt == 0);

	rw_exit(&wl->wl_rwlock);
}
615
616 int
617 wapbl_stop(struct wapbl *wl, int force)
618 {
619 struct vnode *vp;
620 int error;
621
622 WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n"));
623 error = wapbl_flush(wl, 1);
624 if (error) {
625 if (force)
626 wapbl_discard(wl);
627 else
628 return error;
629 }
630
631 /* Unlinked inodes persist after a flush */
632 if (wl->wl_inohashcnt) {
633 if (force) {
634 wapbl_discard(wl);
635 } else {
636 return EBUSY;
637 }
638 }
639
640 KASSERT(wl->wl_bufbytes == 0);
641 KASSERT(wl->wl_bcount == 0);
642 KASSERT(wl->wl_bufcount == 0);
643 KASSERT(LIST_EMPTY(&wl->wl_bufs));
644 KASSERT(wl->wl_dealloccnt == 0);
645 KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
646 KASSERT(wl->wl_inohashcnt == 0);
647
648 vp = wl->wl_logvp;
649
650 wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
651 wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
652 wapbl_free(wl->wl_deallocblks,
653 sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim);
654 wapbl_free(wl->wl_dealloclens,
655 sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim);
656 wapbl_inodetrk_free(wl);
657
658 cv_destroy(&wl->wl_reclaimable_cv);
659 mutex_destroy(&wl->wl_mtx);
660 rw_destroy(&wl->wl_rwlock);
661 wapbl_free(wl, sizeof(*wl));
662
663 return 0;
664 }
665
/*
 * Perform one synchronous uncached transfer ('flags' is B_READ or
 * B_WRITE) of 'len' bytes between 'data' and physical block 'pbn' of
 * the block device 'devvp'.  Returns 0 or the error from biowait().
 */
static int
wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
{
	struct pstats *pstats = curlwp->l_proc->p_stats;
	struct buf *bp;
	int error;

	KASSERT((flags & ~(B_WRITE | B_READ)) == 0);
	KASSERT(devvp->v_type == VBLK);

	/* Account the I/O against the device and the calling process. */
	if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
		mutex_enter(&devvp->v_interlock);
		devvp->v_numoutput++;
		mutex_exit(&devvp->v_interlock);
		pstats->p_ru.ru_oublock++;
	} else {
		pstats->p_ru.ru_inblock++;
	}

	bp = getiobuf(devvp, true);
	bp->b_flags = flags;
	bp->b_cflags = BC_BUSY; /* silly & dubious */
	bp->b_dev = devvp->v_rdev;
	bp->b_data = data;
	bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
	bp->b_blkno = pbn;

	WAPBL_PRINTF(WAPBL_PRINT_IO,
	    ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%x\n",
	    BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount,
	    bp->b_blkno, bp->b_dev));

	VOP_STRATEGY(devvp, bp);

	/* Synchronous: wait for completion, then release the iobuf. */
	error = biowait(bp);
	putiobuf(bp);

	if (error) {
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_doio: %s %zu bytes at block %" PRId64
		    " on dev 0x%x failed with error %d\n",
		    (((flags & (B_WRITE | B_READ)) == B_WRITE) ?
		     "write" : "read"),
		    len, pbn, devvp->v_rdev, error));
	}

	return error;
}
714
/*
 * Synchronously write 'len' bytes from 'data' to physical block 'pbn'
 * of 'devvp'.  Thin wrapper around wapbl_doio().
 */
int
wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

	return wapbl_doio(data, len, devvp, pbn, B_WRITE);
}
721
/*
 * Synchronously read 'len' bytes into 'data' from physical block 'pbn'
 * of 'devvp'.  Thin wrapper around wapbl_doio().
 */
int
wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

	return wapbl_doio(data, len, devvp, pbn, B_READ);
}
728
729 /*
730 * Off is byte offset returns new offset for next write
731 * handles log wraparound
732 */
static int
wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp)
{
	size_t slen;
	off_t off = *offp;
	int error;

	/* Writes must cover whole log-device blocks. */
	KDASSERT(((len >> wl->wl_log_dev_bshift) <<
	    wl->wl_log_dev_bshift) == len);

	/* Offsets below wl_circ_off are reserved for the commit headers. */
	if (off < wl->wl_circ_off)
		off = wl->wl_circ_off;
	slen = wl->wl_circ_off + wl->wl_circ_size - off;
	if (slen < len) {
		/* Write what fits before the end of the circle, then wrap. */
		error = wapbl_write(data, slen, wl->wl_devvp,
		    wl->wl_logpbn + (off >> wl->wl_log_dev_bshift));
		if (error)
			return error;
		data = (uint8_t *)data + slen;
		len -= slen;
		off = wl->wl_circ_off;
	}
	error = wapbl_write(data, len, wl->wl_devvp,
	    wl->wl_logpbn + (off >> wl->wl_log_dev_bshift));
	if (error)
		return error;
	off += len;
	/* Landed exactly on the end of the circle: wrap to the start. */
	if (off >= wl->wl_circ_off + wl->wl_circ_size)
		off = wl->wl_circ_off;
	*offp = off;
	return 0;
}
765
766 /****************************************************************/
767
/*
 * Enter a file system transaction: force a flush first if a pessimistic
 * estimate says the in-memory transaction has crossed half of any of
 * its limits, then take the transaction lock and count ourselves in.
 * 'file' and 'line' identify the caller for debug output.
 * Returns 0 or an error from the forced flush.
 */
int
wapbl_begin(struct wapbl *wl, const char *file, int line)
{
	int doflush;
	unsigned lockcount;
	krw_t op;

	KDASSERT(wl);

	/*
	 * XXX: The original code calls for the use of a RW_READER lock
	 * here, but it turns out there are performance issues with high
	 * metadata-rate workloads (e.g. multiple simultaneous tar
	 * extractions).  For now, we force the lock to be RW_WRITER,
	 * since that currently has the best performance characteristics
	 * (even for a single tar-file extraction).
	 *
	 */
#define WAPBL_DEBUG_SERIALIZE 1

#ifdef WAPBL_DEBUG_SERIALIZE
	op = RW_WRITER;
#else
	op = RW_READER;
#endif

	/*
	 * XXX this needs to be made much more sophisticated.
	 * perhaps each wapbl_begin could reserve a specified
	 * number of buffers and bytes.
	 */
	mutex_enter(&wl->wl_mtx);
	lockcount = wl->wl_lock_count;
	/*
	 * Assume each in-progress transaction may still add MAXPHYS
	 * bytes / 10 buffers, and flush once half of any limit is used.
	 */
	doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) >
		   wl->wl_bufbytes_max / 2) ||
		  ((wl->wl_bufcount + (lockcount * 10)) >
		   wl->wl_bufcount_max / 2) ||
		  (wapbl_transaction_len(wl) > wl->wl_circ_size / 2);
	mutex_exit(&wl->wl_mtx);

	if (doflush) {
		WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		    ("force flush lockcnt=%d bufbytes=%zu "
		    "(max=%zu) bufcount=%zu (max=%zu)\n",
		    lockcount, wl->wl_bufbytes,
		    wl->wl_bufbytes_max, wl->wl_bufcount,
		    wl->wl_bufcount_max));
	}

	if (doflush) {
		int error = wapbl_flush(wl, 0);
		if (error)
			return error;
	}

	/* Join the transaction; the count is protected by wl_mtx. */
	rw_enter(&wl->wl_rwlock, op);
	mutex_enter(&wl->wl_mtx);
	wl->wl_lock_count++;
	mutex_exit(&wl->wl_mtx);

#if defined(WAPBL_DEBUG_PRINT) && defined(WAPBL_DEBUG_SERIALIZE)
	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
	    ("wapbl_begin thread %d.%d with bufcount=%zu "
	    "bufbytes=%zu bcount=%zu at %s:%d\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount, file, line));
#endif

	return 0;
}
838
/*
 * Leave a file system transaction: drop our contribution to the lock
 * count and release the transaction lock taken in wapbl_begin().
 */
void
wapbl_end(struct wapbl *wl)
{

#if defined(WAPBL_DEBUG_PRINT) && defined(WAPBL_DEBUG_SERIALIZE)
	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
	    ("wapbl_end thread %d.%d with bufcount=%zu "
	    "bufbytes=%zu bcount=%zu\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount));
#endif

	mutex_enter(&wl->wl_mtx);
	KASSERT(wl->wl_lock_count > 0);
	wl->wl_lock_count--;
	mutex_exit(&wl->wl_mtx);

	rw_exit(&wl->wl_rwlock);
}
858
/*
 * Add buffer 'bp' (must be busy and attached to a vnode) to the current
 * in-memory transaction, marking it B_LOCKED.  A buffer that is already
 * B_LOCKED is only moved to the head of the list; the accounting is not
 * repeated.
 */
void
wapbl_add_buf(struct wapbl *wl, struct buf * bp)
{

	KASSERT(bp->b_cflags & BC_BUSY);
	KASSERT(bp->b_vp);

	wapbl_jlock_assert(wl);

#if 0
	/*
	 * XXX this might be an issue for swapfiles.
	 * see uvm_swap.c:1702
	 *
	 * XXX2 why require it then? leap of semantics?
	 */
	KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
#endif

	mutex_enter(&wl->wl_mtx);
	if (bp->b_flags & B_LOCKED) {
		/* Already in this transaction: re-insert at the head. */
		LIST_REMOVE(bp, b_wapbllist);
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER2,
		    ("wapbl_add_buf thread %d.%d re-adding buf %p "
		    "with %d bytes %d bcount\n",
		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
		    bp->b_bcount));
	} else {
		/* unlocked by dirty buffers shouldn't exist */
		KASSERT(!(bp->b_oflags & BO_DELWRI));
		wl->wl_bufbytes += bp->b_bufsize;
		wl->wl_bcount += bp->b_bcount;
		wl->wl_bufcount++;
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
		    ("wapbl_add_buf thread %d.%d adding buf %p "
		    "with %d bytes %d bcount\n",
		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
		    bp->b_bcount));
	}
	LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist);
	mutex_exit(&wl->wl_mtx);

	bp->b_flags |= B_LOCKED;
}
903
/*
 * Remove 'bp' from the current transaction and undo its accounting.
 * Caller must hold wl_mtx; 'bp' must be busy and B_LOCKED by us.
 */
static void
wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp)
{

	KASSERT(mutex_owned(&wl->wl_mtx));
	KASSERT(bp->b_cflags & BC_BUSY);
	wapbl_jlock_assert(wl);

#if 0
	/*
	 * XXX this might be an issue for swapfiles.
	 * see uvm_swap.c:1725
	 *
	 * XXXdeux: see above
	 */
	KASSERT((bp->b_flags & BC_NOCACHE) == 0);
#endif
	KASSERT(bp->b_flags & B_LOCKED);

	WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
	   ("wapbl_remove_buf thread %d.%d removing buf %p with "
	    "%d bytes %d bcount\n",
	    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount));

	/* Undo the accounting performed in wapbl_add_buf(). */
	KASSERT(wl->wl_bufbytes >= bp->b_bufsize);
	wl->wl_bufbytes -= bp->b_bufsize;
	KASSERT(wl->wl_bcount >= bp->b_bcount);
	wl->wl_bcount -= bp->b_bcount;
	KASSERT(wl->wl_bufcount > 0);
	wl->wl_bufcount--;
	/* All three counters must hit zero together. */
	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
	LIST_REMOVE(bp, b_wapbllist);

	bp->b_flags &= ~B_LOCKED;
}
940
/* called from brelsel() in vfs_bio among other places */
/*
 * Mutex-taking wrapper around wapbl_remove_buf_locked().
 */
void
wapbl_remove_buf(struct wapbl * wl, struct buf *bp)
{

	mutex_enter(&wl->wl_mtx);
	wapbl_remove_buf_locked(wl, bp);
	mutex_exit(&wl->wl_mtx);
}
950
/*
 * Adjust the transaction accounting after a buffer in the transaction
 * changed size: 'oldsz'/'oldcnt' are the previous b_bufsize/b_bcount;
 * the deltas against the buffer's current values are applied.
 */
void
wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt)
{

	KASSERT(bp->b_cflags & BC_BUSY);

	/*
	 * XXX: why does this depend on B_LOCKED?  otherwise the buf
	 * is not for a transaction?  if so, why is this called in the
	 * first place?
	 */
	if (bp->b_flags & B_LOCKED) {
		mutex_enter(&wl->wl_mtx);
		wl->wl_bufbytes += bp->b_bufsize - oldsz;
		wl->wl_bcount += bp->b_bcount - oldcnt;
		mutex_exit(&wl->wl_mtx);
	}
}
969
970 #endif /* _KERNEL */
971
972 /****************************************************************/
973 /* Some utility inlines */
974
975 /* This is used to advance the pointer at old to new value at old+delta */
static __inline off_t
wapbl_advance(size_t size, size_t off, off_t old, size_t delta)
{
	off_t pos;

	/* Sanity-check inputs against the circular-log invariants. */
	KASSERT(delta <= size);
	KASSERT((old == 0) || (old >= off));
	KASSERT(old < (size + off));

	if (old == 0 && delta != 0) {
		/* Pointer was "empty": start counting at the data area. */
		pos = off + delta;
	} else if ((old + delta) < (size + off)) {
		pos = old + delta;
	} else {
		/* Wrapped past the end of the circle. */
		pos = (old + delta) - size;
	}

	/* Note some interesting axioms */
	KASSERT((delta != 0) || (pos == old));
	KASSERT((delta == 0) || (pos != 0));
	KASSERT((delta != (size)) || (pos == old));

	/* Sanity-check the result as well. */
	KASSERT((pos == 0) || (pos >= off));
	KASSERT(pos < (size + off));
	return pos;
}
1003
/*
 * Number of bytes in use in a circular log of 'avail' bytes whose head
 * and tail byte offsets are 'head' and 'tail'.
 */
static __inline size_t
wapbl_space_used(size_t avail, off_t head, off_t tail)
{
	size_t used;

	/* tail == 0 encodes "log empty"; head must agree. */
	if (tail == 0) {
		KASSERT(head == 0);
		return 0;
	}
	/*
	 * Modular distance from tail forward to head, biased so that
	 * head == tail (!= 0) yields 'avail', i.e. completely full.
	 */
	used = ((head + (avail - 1) - tail) % avail) + 1;
	return used;
}
1014
/*
 * Number of free bytes in a circular log of 'avail' bytes with the
 * given head/tail offsets: whatever is not used is free.
 */
static __inline size_t
wapbl_space_free(size_t avail, off_t head, off_t tail)
{

	return avail - wapbl_space_used(avail, head, tail);
}
1021
/*
 * Advance the log head by 'delta' bytes, updating *headp/*tailp in
 * place.  Growing an empty log moves the tail onto the data area.
 */
static __inline void
wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp,
		   off_t *tailp)
{
	off_t nhead, ntail;

	nhead = *headp;
	ntail = *tailp;

	/* Never grow past the space the log actually has left. */
	KASSERT(delta <= wapbl_space_free(size, nhead, ntail));

	nhead = wapbl_advance(size, off, nhead, delta);

	/* Empty -> non-empty: tail starts at the beginning of the data. */
	if (ntail == 0 && nhead != 0)
		ntail = off;

	*headp = nhead;
	*tailp = ntail;
}
1036
/*
 * Advance the log tail by 'delta' bytes (reclaiming space), updating
 * *headp/*tailp in place.  If the tail catches the head the log is
 * empty and both are reset to 0.
 */
static __inline void
wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp,
		   off_t *tailp)
{
	off_t nhead, ntail;

	nhead = *headp;
	ntail = *tailp;

	/* Cannot reclaim more than is currently in use. */
	KASSERT(delta <= wapbl_space_used(size, nhead, ntail));

	ntail = wapbl_advance(size, off, ntail, delta);

	/* Tail caught up with head: switch to the "empty" encoding. */
	if (nhead == ntail)
		nhead = ntail = 0;

	*headp = nhead;
	*tailp = ntail;
}
1052
1053 #ifdef _KERNEL
1054
1055 /****************************************************************/
1056
1057 /*
1058 * Remove transactions whose buffers are completely flushed to disk.
1059 * Will block until at least minfree space is available.
1060 * only intended to be called from inside wapbl_flush and therefore
1061 * does not protect against commit races with itself or with flush.
1062 */
static int
wapbl_truncate(struct wapbl *wl, size_t minfree, int waitonly)
{
	size_t delta;
	size_t avail;
	off_t head;
	off_t tail;
	int error = 0;

	KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes));
	KASSERT(rw_write_held(&wl->wl_rwlock));

	mutex_enter(&wl->wl_mtx);

	/*
	 * First check to see if we have to do a commit
	 * at all.
	 */
	avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail);
	if (minfree < avail) {
		mutex_exit(&wl->wl_mtx);
		return 0;
	}
	minfree -= avail;
	/*
	 * Wait for wapbl_biodone() to retire enough completed
	 * transactions (it broadcasts wl_reclaimable_cv), unless the
	 * log has gone into an error state, in which case no further
	 * progress is possible.
	 */
	while ((wl->wl_error_count == 0) &&
	    (wl->wl_reclaimable_bytes < minfree)) {
		WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
		    ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd "
		     "minfree=%zd\n",
		     &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes,
		     minfree));

		cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx);
	}
	if (wl->wl_reclaimable_bytes < minfree) {
		KASSERT(wl->wl_error_count);
		/* XXX maybe get actual error from buffer instead someday? */
		error = EIO;
	}
	head = wl->wl_head;
	tail = wl->wl_tail;
	delta = wl->wl_reclaimable_bytes;

	/* If all of of the entries are flushed, then be sure to keep
	 * the reserved bytes reserved.  Watch out for discarded transactions,
	 * which could leave more bytes reserved than are reclaimable.
	 */
	if (SIMPLEQ_EMPTY(&wl->wl_entries) &&
	    (delta >= wl->wl_reserved_bytes)) {
		delta -= wl->wl_reserved_bytes;
	}
	wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head,
	    &tail);
	KDASSERT(wl->wl_reserved_bytes <=
		wapbl_space_used(wl->wl_circ_size, head, tail));
	mutex_exit(&wl->wl_mtx);

	if (error)
		return error;

	if (waitonly)
		return 0;

	/*
	 * This is where head, tail and delta are unprotected
	 * from races against itself or flush.  This is ok since
	 * we only call this routine from inside flush itself.
	 *
	 * XXX: how can it race against itself when accessed only
	 * from behind the write-locked rwlock?
	 */
	error = wapbl_write_commit(wl, head, tail);
	if (error)
		return error;

	wl->wl_head = head;
	wl->wl_tail = tail;

	/* Only drop the reclaimable count after the commit made it out. */
	mutex_enter(&wl->wl_mtx);
	KASSERT(wl->wl_reclaimable_bytes >= delta);
	wl->wl_reclaimable_bytes -= delta;
	mutex_exit(&wl->wl_mtx);
	WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
	    ("wapbl_truncate thread %d.%d truncating %zu bytes\n",
	    curproc->p_pid, curlwp->l_lid, delta));

	return 0;
}
1151
1152 /****************************************************************/
1153
/*
 * I/O completion callback for buffers written out by wapbl_flush().
 * Drops the buffer's reference on its wapbl_entry and, once the
 * oldest transaction(s) have no outstanding buffers left, retires
 * them and credits their log space back as reclaimable.
 */
void
wapbl_biodone(struct buf *bp)
{
	struct wapbl_entry *we = bp->b_private;
	struct wapbl *wl = we->we_wapbl;

	/*
	 * Handle possible flushing of buffers after log has been
	 * decomissioned.
	 */
	if (!wl) {
		KASSERT(we->we_bufcount > 0);
		we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
		KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize);
		we->we_unsynced_bufbytes -= bp->b_bufsize;
#endif

		if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
			KASSERT(we->we_unsynced_bufbytes == 0);
#endif
			/* Last buffer of an orphaned entry: free it here. */
			wapbl_free(we, sizeof(*we));
		}

		brelse(bp, 0);
		return;
	}

#ifdef ohbother
	KDASSERT(bp->b_flags & B_DONE);
	KDASSERT(!(bp->b_flags & B_DELWRI));
	KDASSERT(bp->b_flags & B_ASYNC);
	KDASSERT(bp->b_flags & B_BUSY);
	KDASSERT(!(bp->b_flags & B_LOCKED));
	KDASSERT(!(bp->b_flags & B_READ));
	KDASSERT(!(bp->b_flags & B_INVAL));
	KDASSERT(!(bp->b_flags & B_NOCACHE));
#endif

	if (bp->b_error) {
#ifdef notyet /* Can't currently handle possible dirty buffer reuse */
		XXXpooka: interfaces not fully updated
		Note: this was not enabled in the original patch
		against netbsd4 either.  I don't know if comment
		above is true or not.

		/*
		 * If an error occurs, report the error and leave the
		 * buffer as a delayed write on the LRU queue.
		 * restarting the write would likely result in
		 * an error spinloop, so let it be done harmlessly
		 * by the syncer.
		 */
		bp->b_flags &= ~(B_DONE);
		simple_unlock(&bp->b_interlock);

		if (we->we_error == 0) {
			mutex_enter(&wl->wl_mtx);
			wl->wl_error_count++;
			mutex_exit(&wl->wl_mtx);
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
		we->we_error = bp->b_error;
		bp->b_error = 0;
		brelse(bp);
		return;
#else
		/* For now, just mark the log permanently errored out */

		mutex_enter(&wl->wl_mtx);
		if (wl->wl_error_count == 0) {
			wl->wl_error_count++;
			/* Wake wapbl_truncate() sleepers so they see it. */
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
		mutex_exit(&wl->wl_mtx);
#endif
	}

	mutex_enter(&wl->wl_mtx);

	KASSERT(we->we_bufcount > 0);
	we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
	KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize);
	we->we_unsynced_bufbytes -= bp->b_bufsize;
	KASSERT(wl->wl_unsynced_bufbytes >= bp->b_bufsize);
	wl->wl_unsynced_bufbytes -= bp->b_bufsize;
#endif

	/*
	 * If the current transaction can be reclaimed, start
	 * at the beginning and reclaim any consecutive reclaimable
	 * transactions.  If we successfully reclaim anything,
	 * then wakeup anyone waiting for the reclaim.
	 */
	if (we->we_bufcount == 0) {
		size_t delta = 0;
		int errcnt = 0;
#ifdef WAPBL_DEBUG_BUFBYTES
		KDASSERT(we->we_unsynced_bufbytes == 0);
#endif
		/*
		 * clear any posted error, since the buffer it came from
		 * has successfully flushed by now
		 */
		while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) &&
		       (we->we_bufcount == 0)) {
			delta += we->we_reclaimable_bytes;
			if (we->we_error)
				errcnt++;
			SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
			wapbl_free(we, sizeof(*we));
		}

		if (delta) {
			wl->wl_reclaimable_bytes += delta;
			KASSERT(wl->wl_error_count >= errcnt);
			wl->wl_error_count -= errcnt;
			/* Space became reclaimable: wake wapbl_truncate(). */
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
	}

	mutex_exit(&wl->wl_mtx);
	brelse(bp, 0);
}
1280
1281 /*
1282 * Write transactions to disk + start I/O for contents
1283 */
int
wapbl_flush(struct wapbl *wl, int waitfor)
{
	struct buf *bp;
	struct wapbl_entry *we;
	off_t off;
	off_t head;
	off_t tail;
	size_t delta = 0;
	size_t flushsize;
	size_t reserved;
	int error = 0;

	/*
	 * Do a quick check to see if a full flush can be skipped
	 * This assumes that the flush callback does not need to be called
	 * unless there are other outstanding bufs.
	 */
	if (!waitfor) {
		size_t nbufs;
		mutex_enter(&wl->wl_mtx);	/* XXX need mutex here to
						   protect the KASSERTS */
		nbufs = wl->wl_bufcount;
		KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
		KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
		mutex_exit(&wl->wl_mtx);
		if (nbufs == 0)
			return 0;
	}

	/*
	 * XXX we may consider using LK_UPGRADE here
	 * if we want to call flush from inside a transaction
	 */
	rw_enter(&wl->wl_rwlock, RW_WRITER);
	/* Give the file system a chance to act on pending deallocations. */
	wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
	    wl->wl_dealloccnt);

	/*
	 * Now that we are fully locked and flushed,
	 * do another check for nothing to do.
	 */
	if (wl->wl_bufcount == 0) {
		goto out;
	}

#if 0
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		     ("wapbl_flush thread %d.%d flushing entries with "
		      "bufcount=%zu bufbytes=%zu\n",
		      curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
		      wl->wl_bufbytes));
#endif

	/* Calculate amount of space needed to flush */
	flushsize = wapbl_transaction_len(wl);

	if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
		/*
		 * XXX this could be handled more gracefully, perhaps place
		 * only a partial transaction in the log and allow the
		 * remaining to flush without the protection of the journal.
		 */
		panic("wapbl_flush: current transaction too big to flush\n");
	}

	/* Make room in the log for the whole transaction. */
	error = wapbl_truncate(wl, flushsize, 0);
	if (error)
		goto out2;

	off = wl->wl_head;
	KASSERT((off == 0) || ((off >= wl->wl_circ_off) &&
	                      (off < wl->wl_circ_off + wl->wl_circ_size)));
	/* Write transaction records: data blocks, revocations, inodes. */
	error = wapbl_write_blocks(wl, &off);
	if (error)
		goto out2;
	error = wapbl_write_revocations(wl, &off);
	if (error)
		goto out2;
	error = wapbl_write_inodes(wl, &off);
	if (error)
		goto out2;

	reserved = 0;
	if (wl->wl_inohashcnt)
		reserved = wapbl_transaction_inodes_len(wl);

	head = wl->wl_head;
	tail = wl->wl_tail;

	wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize,
	    &head, &tail);
#ifdef WAPBL_DEBUG
	if (head != off) {
		panic("lost head! head=%"PRIdMAX" tail=%" PRIdMAX
		      " off=%"PRIdMAX" flush=%zu\n",
		      (intmax_t)head, (intmax_t)tail, (intmax_t)off,
		      flushsize);
	}
#else
	KASSERT(head == off);
#endif

	/* Opportunistically move the tail forward if we can */
	if (!wapbl_lazy_truncate) {
		mutex_enter(&wl->wl_mtx);
		delta = wl->wl_reclaimable_bytes;
		mutex_exit(&wl->wl_mtx);
		wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta,
		    &head, &tail);
	}

	/* The commit record is what makes the transaction durable. */
	error = wapbl_write_commit(wl, head, tail);
	if (error)
		goto out2;

	we = wapbl_calloc(1, sizeof(*we));

#ifdef WAPBL_DEBUG_BUFBYTES
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
		 " unsynced=%zu"
		 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
		 "inodes=%d\n",
		 curproc->p_pid, curlwp->l_lid, flushsize, delta,
		 wapbl_space_used(wl->wl_circ_size, head, tail),
		 wl->wl_unsynced_bufbytes, wl->wl_bufcount,
		 wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt,
		 wl->wl_inohashcnt));
#else
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
		 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
		 "inodes=%d\n",
		 curproc->p_pid, curlwp->l_lid, flushsize, delta,
		 wapbl_space_used(wl->wl_circ_size, head, tail),
		 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
		 wl->wl_dealloccnt, wl->wl_inohashcnt));
#endif


	mutex_enter(&bufcache_lock);
	mutex_enter(&wl->wl_mtx);

	/* Publish the new log pointers and account the new transaction. */
	wl->wl_reserved_bytes = reserved;
	wl->wl_head = head;
	wl->wl_tail = tail;
	KASSERT(wl->wl_reclaimable_bytes >= delta);
	wl->wl_reclaimable_bytes -= delta;
	wl->wl_dealloccnt = 0;
#ifdef WAPBL_DEBUG_BUFBYTES
	wl->wl_unsynced_bufbytes += wl->wl_bufbytes;
#endif

	/* The entry is retired piecemeal by wapbl_biodone(). */
	we->we_wapbl = wl;
	we->we_bufcount = wl->wl_bufcount;
#ifdef WAPBL_DEBUG_BUFBYTES
	we->we_unsynced_bufbytes = wl->wl_bufbytes;
#endif
	we->we_reclaimable_bytes = flushsize;
	we->we_error = 0;
	SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries);

	/*
	 * this flushes bufs in reverse order than they were queued
	 * it shouldn't matter, but if we care we could use TAILQ instead.
	 * XXX Note they will get put on the lru queue when they flush
	 * so we might actually want to change this to preserve order.
	 */
	while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
		if (bbusy(bp, 0, 0, &wl->wl_mtx)) {
			/* bbusy dropped wl_mtx while sleeping; retry. */
			continue;
		}
		bp->b_iodone = wapbl_biodone;
		bp->b_private = we;
		bremfree(bp);
		wapbl_remove_buf_locked(wl, bp);
		mutex_exit(&wl->wl_mtx);
		mutex_exit(&bufcache_lock);
		bawrite(bp);
		mutex_enter(&bufcache_lock);
		mutex_enter(&wl->wl_mtx);
	}
	mutex_exit(&wl->wl_mtx);
	mutex_exit(&bufcache_lock);

#if 0
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		     ("wapbl_flush thread %d.%d done flushing entries...\n",
		     curproc->p_pid, curlwp->l_lid));
#endif

 out:

	/*
	 * If the waitfor flag is set, don't return until everything is
	 * fully flushed and the on disk log is empty.
	 */
	if (waitfor) {
		error = wapbl_truncate(wl, wl->wl_circ_size -
		    wl->wl_reserved_bytes, wapbl_lazy_truncate);
	}

 out2:
	if (error) {
		/* Tell the file system the deallocations did not commit. */
		wl->wl_flush_abort(wl->wl_mount, wl->wl_deallocblks,
		    wl->wl_dealloclens, wl->wl_dealloccnt);
	}

#ifdef WAPBL_DEBUG_PRINT
	if (error) {
		pid_t pid = -1;
		lwpid_t lid = -1;
		if (curproc)
			pid = curproc->p_pid;
		if (curlwp)
			lid = curlwp->l_lid;
		mutex_enter(&wl->wl_mtx);
#ifdef WAPBL_DEBUG_BUFBYTES
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_flush: thread %d.%d aborted flush: "
		    "error = %d\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %d, reclaimable=%zu reserved=%zu "
		    "unsynced=%zu\n",
		    pid, lid, error, wl->wl_bufcount,
		    wl->wl_bufbytes, wl->wl_bcount,
		    wl->wl_dealloccnt, wl->wl_inohashcnt,
		    wl->wl_error_count, wl->wl_reclaimable_bytes,
		    wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_ERROR,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d, unsynced = %zu\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error, we->we_unsynced_bufbytes));
		}
#else
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_flush: thread %d.%d aborted flush: "
		     "error = %d\n"
		     "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		     "deallocs=%d inodes=%d\n"
		     "\terrcnt = %d, reclaimable=%zu reserved=%zu\n",
		     pid, lid, error, wl->wl_bufcount,
		     wl->wl_bufbytes, wl->wl_bcount,
		     wl->wl_dealloccnt, wl->wl_inohashcnt,
		     wl->wl_error_count, wl->wl_reclaimable_bytes,
		     wl->wl_reserved_bytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_ERROR,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			     "error = %d\n", we->we_bufcount,
			     we->we_reclaimable_bytes, we->we_error));
		}
#endif
		mutex_exit(&wl->wl_mtx);
	}
#endif

	rw_exit(&wl->wl_rwlock);
	return error;
}
1548
1549 /****************************************************************/
1550
/*
 * Assert that the journal lock is held: write-held when built with
 * WAPBL_DEBUG_SERIALIZE, read- or write-held otherwise.
 */
void
wapbl_jlock_assert(struct wapbl *wl)
{

#ifdef WAPBL_DEBUG_SERIALIZE
	KASSERT(rw_write_held(&wl->wl_rwlock));
#else
	KASSERT(rw_read_held(&wl->wl_rwlock) || rw_write_held(&wl->wl_rwlock));
#endif
}
1561
/*
 * Assert that the journal lock is not write-held.  Only checked when
 * built with WAPBL_DEBUG_SERIALIZE; a no-op otherwise.
 */
void
wapbl_junlock_assert(struct wapbl *wl)
{

#ifdef WAPBL_DEBUG_SERIALIZE
	KASSERT(!rw_write_held(&wl->wl_rwlock));
#endif
}
1570
1571 /****************************************************************/
1572
1573 /* locks missing */
1574 void
1575 wapbl_print(struct wapbl *wl,
1576 int full,
1577 void (*pr)(const char *, ...))
1578 {
1579 struct buf *bp;
1580 struct wapbl_entry *we;
1581 (*pr)("wapbl %p", wl);
1582 (*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n",
1583 wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn);
1584 (*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n",
1585 wl->wl_circ_size, wl->wl_circ_off,
1586 (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail);
1587 (*pr)("fs_dev_bshift = %d, log_dev_bshift = %d\n",
1588 wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift);
1589 #ifdef WAPBL_DEBUG_BUFBYTES
1590 (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
1591 "reserved = %zu errcnt = %d unsynced = %zu\n",
1592 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
1593 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
1594 wl->wl_error_count, wl->wl_unsynced_bufbytes);
1595 #else
1596 (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
1597 "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes,
1598 wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
1599 wl->wl_error_count);
1600 #endif
1601 (*pr)("\tdealloccnt = %d, dealloclim = %d\n",
1602 wl->wl_dealloccnt, wl->wl_dealloclim);
1603 (*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n",
1604 wl->wl_inohashcnt, wl->wl_inohashmask);
1605 (*pr)("entries:\n");
1606 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
1607 #ifdef WAPBL_DEBUG_BUFBYTES
1608 (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, "
1609 "unsynced = %zu\n",
1610 we->we_bufcount, we->we_reclaimable_bytes,
1611 we->we_error, we->we_unsynced_bufbytes);
1612 #else
1613 (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n",
1614 we->we_bufcount, we->we_reclaimable_bytes, we->we_error);
1615 #endif
1616 }
1617 if (full) {
1618 int cnt = 0;
1619 (*pr)("bufs =");
1620 LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) {
1621 if (!LIST_NEXT(bp, b_wapbllist)) {
1622 (*pr)(" %p", bp);
1623 } else if ((++cnt % 6) == 0) {
1624 (*pr)(" %p,\n\t", bp);
1625 } else {
1626 (*pr)(" %p,", bp);
1627 }
1628 }
1629 (*pr)("\n");
1630
1631 (*pr)("dealloced blks = ");
1632 {
1633 int i;
1634 cnt = 0;
1635 for (i = 0; i < wl->wl_dealloccnt; i++) {
1636 (*pr)(" %"PRId64":%d,",
1637 wl->wl_deallocblks[i],
1638 wl->wl_dealloclens[i]);
1639 if ((++cnt % 4) == 0) {
1640 (*pr)("\n\t");
1641 }
1642 }
1643 }
1644 (*pr)("\n");
1645
1646 (*pr)("registered inodes = ");
1647 {
1648 int i;
1649 cnt = 0;
1650 for (i = 0; i <= wl->wl_inohashmask; i++) {
1651 struct wapbl_ino_head *wih;
1652 struct wapbl_ino *wi;
1653
1654 wih = &wl->wl_inohash[i];
1655 LIST_FOREACH(wi, wih, wi_hash) {
1656 if (wi->wi_ino == 0)
1657 continue;
1658 (*pr)(" %"PRId32"/0%06"PRIo32",",
1659 wi->wi_ino, wi->wi_mode);
1660 if ((++cnt % 4) == 0) {
1661 (*pr)("\n\t");
1662 }
1663 }
1664 }
1665 (*pr)("\n");
1666 }
1667 }
1668 }
1669
#if defined(WAPBL_DEBUG) || defined(DDB)
/*
 * Debugger entry point: wapbl_print() the given log with full detail.
 * Under WAPBL_DEBUG, a NULL argument falls back to wapbl_debug_wl.
 */
void
wapbl_dump(struct wapbl *wl)
{
#if defined(WAPBL_DEBUG)
	if (!wl)
		wl = wapbl_debug_wl;
#endif
	if (!wl)
		return;
	wapbl_print(wl, 1, printf);
}
#endif
1683
1684 /****************************************************************/
1685
/*
 * Record a block deallocation (blk, len) so it can be written to the
 * log as a revocation record by wapbl_write_revocations().
 */
void
wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len)
{

	wapbl_jlock_assert(wl);

	/* XXX should eventually instead tie this into resource estimation */
	/* XXX this KASSERT needs locking/mutex analysis */
	/*
	 * NOTE(review): only this KASSERT guards against overrunning the
	 * fixed-size dealloc arrays; in a non-DIAGNOSTIC kernel an
	 * overflow would go undetected — confirm wl_dealloclim covers
	 * the worst case.
	 */
	KASSERT(wl->wl_dealloccnt < wl->wl_dealloclim);
	wl->wl_deallocblks[wl->wl_dealloccnt] = blk;
	wl->wl_dealloclens[wl->wl_dealloccnt] = len;
	wl->wl_dealloccnt++;
	WAPBL_PRINTF(WAPBL_PRINT_ALLOC,
	    ("wapbl_register_deallocation: blk=%"PRId64" len=%d\n", blk, len));
}
1701
1702 /****************************************************************/
1703
/*
 * Set up the per-log hash of registered inodes (ino -> mode).  The
 * wapbl_ino pool is shared by all logs; it is created on first use,
 * refcounted via wapbl_ino_pool_refcount.
 */
static void
wapbl_inodetrk_init(struct wapbl *wl, u_int size)
{

	wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask);
	if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) {
		/*
		 * NOTE(review): a concurrent caller could observe a
		 * refcount > 1 and use the pool before pool_init()
		 * completes — presumably callers are serialized; confirm.
		 */
		pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0,
		    "wapblinopl", &pool_allocator_nointr, IPL_NONE);
	}
}
1714
/*
 * Tear down the (empty) registered-inode hash.  The shared wapbl_ino
 * pool is destroyed when the last user goes away.
 */
static void
wapbl_inodetrk_free(struct wapbl *wl)
{

	/* XXX this KASSERT needs locking/mutex analysis */
	KASSERT(wl->wl_inohashcnt == 0);
	hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask);
	if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) {
		pool_destroy(&wapbl_ino_pool);
	}
}
1726
1727 static struct wapbl_ino *
1728 wapbl_inodetrk_get(struct wapbl *wl, ino_t ino)
1729 {
1730 struct wapbl_ino_head *wih;
1731 struct wapbl_ino *wi;
1732
1733 KASSERT(mutex_owned(&wl->wl_mtx));
1734
1735 wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
1736 LIST_FOREACH(wi, wih, wi_hash) {
1737 if (ino == wi->wi_ino)
1738 return wi;
1739 }
1740 return 0;
1741 }
1742
/*
 * Register inode "ino" with mode "mode" in the log's inode hash so it
 * is included in the transaction's inode list (wapbl_write_inodes()).
 * Duplicate registrations are ignored.
 */
void
wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode)
{
	struct wapbl_ino_head *wih;
	struct wapbl_ino *wi;

	/* Allocate before taking wl_mtx: pool_get(PR_WAITOK) may sleep. */
	wi = pool_get(&wapbl_ino_pool, PR_WAITOK);

	mutex_enter(&wl->wl_mtx);
	if (wapbl_inodetrk_get(wl, ino) == NULL) {
		wi->wi_ino = ino;
		wi->wi_mode = mode;
		wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
		LIST_INSERT_HEAD(wih, wi, wi_hash);
		wl->wl_inohashcnt++;
		WAPBL_PRINTF(WAPBL_PRINT_INODE,
		    ("wapbl_register_inode: ino=%"PRId64"\n", ino));
		mutex_exit(&wl->wl_mtx);
	} else {
		/* Already registered: give back the unused allocation. */
		mutex_exit(&wl->wl_mtx);
		pool_put(&wapbl_ino_pool, wi);
	}
}
1766
/*
 * Remove inode "ino" from the log's inode hash, if present.
 * The "mode" argument is unused.
 */
void
wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode)
{
	struct wapbl_ino *wi;

	mutex_enter(&wl->wl_mtx);
	wi = wapbl_inodetrk_get(wl, ino);
	if (wi) {
		WAPBL_PRINTF(WAPBL_PRINT_INODE,
		    ("wapbl_unregister_inode: ino=%"PRId64"\n", ino));
		KASSERT(wl->wl_inohashcnt > 0);
		wl->wl_inohashcnt--;
		LIST_REMOVE(wi, wi_hash);
		mutex_exit(&wl->wl_mtx);

		/* Return to the pool outside the mutex. */
		pool_put(&wapbl_ino_pool, wi);
	} else {
		mutex_exit(&wl->wl_mtx);
	}
}
1787
1788 /****************************************************************/
1789
1790 static __inline size_t
1791 wapbl_transaction_inodes_len(struct wapbl *wl)
1792 {
1793 int blocklen = 1<<wl->wl_log_dev_bshift;
1794 int iph;
1795
1796 /* Calculate number of inodes described in a inodelist header */
1797 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
1798 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
1799
1800 KASSERT(iph > 0);
1801
1802 return MAX(1, howmany(wl->wl_inohashcnt, iph))*blocklen;
1803 }
1804
1805
1806 /* Calculate amount of space a transaction will take on disk */
1807 static size_t
1808 wapbl_transaction_len(struct wapbl *wl)
1809 {
1810 int blocklen = 1<<wl->wl_log_dev_bshift;
1811 size_t len;
1812 int bph;
1813
1814 /* Calculate number of blocks described in a blocklist header */
1815 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
1816 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
1817
1818 KASSERT(bph > 0);
1819
1820 len = wl->wl_bcount;
1821 len += howmany(wl->wl_bufcount, bph)*blocklen;
1822 len += howmany(wl->wl_dealloccnt, bph)*blocklen;
1823 len += wapbl_transaction_inodes_len(wl);
1824
1825 return len;
1826 }
1827
1828 /*
1829 * Perform commit operation
1830 *
1831 * Note that generation number incrementation needs to
1832 * be protected against racing with other invocations
1833 * of wapbl_commit. This is ok since this routine
1834 * is only invoked from wapbl_flush
1835 */
static int
wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail)
{
	struct wapbl_wc_header *wc = wl->wl_wc_header;
	struct timespec ts;
	int error;
	int force = 1;

	/* XXX Calc checksum here, instead we do this for now */
	/*
	 * Flush the device cache first so that all previously written
	 * log data is stable before the commit record points at it.
	 */
	error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, FWRITE, FSCRED);
	if (error) {
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%x "
		    "returned %d\n", wl->wl_devvp->v_rdev, error));
	}

	wc->wc_head = head;
	wc->wc_tail = tail;
	wc->wc_checksum = 0;
	wc->wc_version = 1;
	getnanotime(&ts);
	wc->wc_time = ts.tv_sec;
	wc->wc_timensec = ts.tv_nsec;

	WAPBL_PRINTF(WAPBL_PRINT_WRITE,
	    ("wapbl_write_commit: head = %"PRIdMAX "tail = %"PRIdMAX"\n",
	    (intmax_t)head, (intmax_t)tail));

	/*
	 * XXX if generation will rollover, then first zero
	 * over second commit header before trying to write both headers.
	 */

	/* Alternate between the two commit-header slots by generation. */
	error = wapbl_write(wc, wc->wc_len, wl->wl_devvp,
	    wl->wl_logpbn + wc->wc_generation % 2);
	if (error)
		return error;

	/* And make the commit record itself stable. */
	error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, FWRITE, FSCRED);
	if (error) {
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%x "
		    "returned %d\n", wl->wl_devvp->v_rdev, error));
	}

	/*
	 * If the generation number was zero, write it out a second time.
	 * This handles initialization and generation number rollover
	 */
	if (wc->wc_generation++ == 0) {
		/* Recurses exactly once: generation is now non-zero. */
		error = wapbl_write_commit(wl, head, tail);
		/*
		 * This panic should be able to be removed if we do the
		 * zero'ing mentioned above, and we are certain to roll
		 * back generation number on failure.
		 */
		if (error)
			panic("wapbl_write_commit: error writing duplicate "
			      "log header: %d\n", error);
	}
	return 0;
}
1898
1899 /* Returns new offset value */
/*
 * Write the transaction's locked buffers to the log at *offp as
 * WAPBL_WC_BLOCKS records: for each chunk of up to bph buffers, a
 * header block is written first, then the buffer contents, padded to
 * a whole log device block.  On success *offp is advanced past the
 * data written.
 */
static int
wapbl_write_blocks(struct wapbl *wl, off_t *offp)
{
	struct wapbl_wc_blocklist *wc =
	    (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
	int blocklen = 1<<wl->wl_log_dev_bshift;
	int bph;
	struct buf *bp;
	off_t off = *offp;
	int error;
	size_t padding;

	KASSERT(rw_write_held(&wl->wl_rwlock));

	/* Number of block entries that fit in one blocklist header. */
	bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
	    sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);

	bp = LIST_FIRST(&wl->wl_bufs);

	while (bp) {
		int cnt;
		struct buf *obp = bp;	/* chunk start, for the data pass */

		KASSERT(bp->b_flags & B_LOCKED);

		/*
		 * First pass over this chunk: fill in the header with
		 * each buffer's target address and length.
		 */
		wc->wc_type = WAPBL_WC_BLOCKS;
		wc->wc_len = blocklen;
		wc->wc_blkcount = 0;
		while (bp && (wc->wc_blkcount < bph)) {
			/*
			 * Make sure all the physical block numbers are up to
			 * date.  If this is not always true on a given
			 * filesystem, then VOP_BMAP must be called.  We
			 * could call VOP_BMAP here, or else in the filesystem
			 * specific flush callback, although neither of those
			 * solutions allow us to take the vnode lock.  If a
			 * filesystem requires that we must take the vnode lock
			 * to call VOP_BMAP, then we can probably do it in
			 * bwrite when the vnode lock should already be held
			 * by the invoking code.
			 */
			KASSERT((bp->b_vp->v_type == VBLK) ||
			    (bp->b_blkno != bp->b_lblkno));
			KASSERT(bp->b_blkno > 0);

			wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno;
			wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount;
			wc->wc_len += bp->b_bcount;
			wc->wc_blkcount++;
			bp = LIST_NEXT(bp, b_wapbllist);
		}
		/* Pad the record out to a whole log device block. */
		if (wc->wc_len % blocklen != 0) {
			padding = blocklen - wc->wc_len % blocklen;
			wc->wc_len += padding;
		} else {
			padding = 0;
		}

		WAPBL_PRINTF(WAPBL_PRINT_WRITE,
		    ("wapbl_write_blocks: len = %u (padding %zu) off = %"PRIdMAX"\n",
		    wc->wc_len, padding, (intmax_t)off));

		/* Write the header, then the buffer data in a second pass. */
		error = wapbl_circ_write(wl, wc, blocklen, &off);
		if (error)
			return error;
		bp = obp;
		cnt = 0;
		while (bp && (cnt++ < bph)) {
			error = wapbl_circ_write(wl, bp->b_data,
			    bp->b_bcount, &off);
			if (error)
				return error;
			bp = LIST_NEXT(bp, b_wapbllist);
		}
		if (padding) {
			void *zero;

			zero = wapbl_malloc(padding);
			memset(zero, 0, padding);
			error = wapbl_circ_write(wl, zero, padding, &off);
			wapbl_free(zero, padding);
			if (error)
				return error;
		}
	}
	*offp = off;
	return 0;
}
1988
/*
 * Write the registered deallocations to the log at *offp as
 * WAPBL_WC_REVOCATIONS records, batching up to bph entries per
 * header block.  On success *offp is advanced past the data written.
 */
static int
wapbl_write_revocations(struct wapbl *wl, off_t *offp)
{
	struct wapbl_wc_blocklist *wc =
	    (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
	int i;
	int blocklen = 1<<wl->wl_log_dev_bshift;
	int bph;
	off_t off = *offp;
	int error;

	if (wl->wl_dealloccnt == 0)
		return 0;

	/* Number of entries that fit in one blocklist header. */
	bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
	    sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);

	i = 0;
	while (i < wl->wl_dealloccnt) {
		wc->wc_type = WAPBL_WC_REVOCATIONS;
		wc->wc_len = blocklen;
		wc->wc_blkcount = 0;
		while ((i < wl->wl_dealloccnt) && (wc->wc_blkcount < bph)) {
			wc->wc_blocks[wc->wc_blkcount].wc_daddr =
			    wl->wl_deallocblks[i];
			wc->wc_blocks[wc->wc_blkcount].wc_dlen =
			    wl->wl_dealloclens[i];
			wc->wc_blkcount++;
			i++;
		}
		WAPBL_PRINTF(WAPBL_PRINT_WRITE,
		    ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n",
		    wc->wc_len, (intmax_t)off));
		error = wapbl_circ_write(wl, wc, blocklen, &off);
		if (error)
			return error;
	}
	*offp = off;
	return 0;
}
2029
/*
 * Write the registered inode list (ino/mode pairs) to the log at
 * *offp as WAPBL_WC_INODES records, batching up to iph entries per
 * record.  At least one record is always written, even when the hash
 * is empty.  On success *offp is advanced past the data written.
 */
static int
wapbl_write_inodes(struct wapbl *wl, off_t *offp)
{
	struct wapbl_wc_inodelist *wc =
	    (struct wapbl_wc_inodelist *)wl->wl_wc_scratch;
	int i;
	int blocklen = 1 << wl->wl_log_dev_bshift;
	off_t off = *offp;
	int error;

	struct wapbl_ino_head *wih;
	struct wapbl_ino *wi;
	int iph;

	/* Number of inode entries that fit in one inodelist header. */
	iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
	    sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);

	i = 0;
	wih = &wl->wl_inohash[0];
	wi = 0;
	do {
		wc->wc_type = WAPBL_WC_INODES;
		wc->wc_len = blocklen;
		wc->wc_inocnt = 0;
		/*
		 * wc_clear is set on the first record only — presumably
		 * it tells replay to discard any previously logged inode
		 * list; confirm against the replay code.
		 */
		wc->wc_clear = (i == 0);
		while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) {
			/* Skip ahead to the next non-empty hash bucket. */
			while (!wi) {
				KASSERT((wih - &wl->wl_inohash[0])
				    <= wl->wl_inohashmask);
				wi = LIST_FIRST(wih++);
			}
			wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino;
			wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode;
			wc->wc_inocnt++;
			i++;
			wi = LIST_NEXT(wi, wi_hash);
		}
		WAPBL_PRINTF(WAPBL_PRINT_WRITE,
		    ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n",
		    wc->wc_len, (intmax_t)off));
		error = wapbl_circ_write(wl, wc, blocklen, &off);
		if (error)
			return error;
	} while (i < wl->wl_inohashcnt);

	*offp = off;
	return 0;
}
2078
2079 #endif /* _KERNEL */
2080
2081 /****************************************************************/
2082
/*
 * One tracked replay block: maps an on-disk block address to the
 * offset of its most recent copy inside the log.  Entries live in
 * the wr_blkhash table managed by the wapbl_blkhash_* routines.
 */
struct wapbl_blk {
	LIST_ENTRY(wapbl_blk) wb_hash;	/* hash bucket linkage */
	daddr_t wb_blk;			/* on-disk block address */
	off_t wb_off; /* Offset of this block in the log */
};
#define	WAPBL_BLKPOOL_MIN 83	/* minimum number of hash buckets */
2089
2090 static void
2091 wapbl_blkhash_init(struct wapbl_replay *wr, u_int size)
2092 {
2093 if (size < WAPBL_BLKPOOL_MIN)
2094 size = WAPBL_BLKPOOL_MIN;
2095 KASSERT(wr->wr_blkhash == 0);
2096 #ifdef _KERNEL
2097 wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask);
2098 #else /* ! _KERNEL */
2099 /* Manually implement hashinit */
2100 {
2101 int i;
2102 unsigned long hashsize;
2103 for (hashsize = 1; hashsize < size; hashsize <<= 1)
2104 continue;
2105 wr->wr_blkhash = wapbl_malloc(hashsize * sizeof(*wr->wr_blkhash));
2106 for (i = 0; i < wr->wr_blkhashmask; i++)
2107 LIST_INIT(&wr->wr_blkhash[i]);
2108 wr->wr_blkhashmask = hashsize - 1;
2109 }
2110 #endif /* ! _KERNEL */
2111 }
2112
/*
 * Release the replay block hash.  All entries must already have been
 * removed (see wapbl_blkhash_clear()).
 */
static void
wapbl_blkhash_free(struct wapbl_replay *wr)
{
	KASSERT(wr->wr_blkhashcnt == 0);
#ifdef _KERNEL
	hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask);
#else /* ! _KERNEL */
	wapbl_free(wr->wr_blkhash,
	    (wr->wr_blkhashmask + 1) * sizeof(*wr->wr_blkhash));
#endif /* ! _KERNEL */
}
2124
2125 static struct wapbl_blk *
2126 wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk)
2127 {
2128 struct wapbl_blk_head *wbh;
2129 struct wapbl_blk *wb;
2130 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2131 LIST_FOREACH(wb, wbh, wb_hash) {
2132 if (blk == wb->wb_blk)
2133 return wb;
2134 }
2135 return 0;
2136 }
2137
2138 static void
2139 wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off)
2140 {
2141 struct wapbl_blk_head *wbh;
2142 struct wapbl_blk *wb;
2143 wb = wapbl_blkhash_get(wr, blk);
2144 if (wb) {
2145 KASSERT(wb->wb_blk == blk);
2146 wb->wb_off = off;
2147 } else {
2148 wb = wapbl_malloc(sizeof(*wb));
2149 wb->wb_blk = blk;
2150 wb->wb_off = off;
2151 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2152 LIST_INSERT_HEAD(wbh, wb, wb_hash);
2153 wr->wr_blkhashcnt++;
2154 }
2155 }
2156
2157 static void
2158 wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk)
2159 {
2160 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2161 if (wb) {
2162 KASSERT(wr->wr_blkhashcnt > 0);
2163 wr->wr_blkhashcnt--;
2164 LIST_REMOVE(wb, wb_hash);
2165 wapbl_free(wb, sizeof(*wb));
2166 }
2167 }
2168
2169 static void
2170 wapbl_blkhash_clear(struct wapbl_replay *wr)
2171 {
2172 int i;
2173 for (i = 0; i <= wr->wr_blkhashmask; i++) {
2174 struct wapbl_blk *wb;
2175
2176 while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) {
2177 KASSERT(wr->wr_blkhashcnt > 0);
2178 wr->wr_blkhashcnt--;
2179 LIST_REMOVE(wb, wb_hash);
2180 wapbl_free(wb, sizeof(*wb));
2181 }
2182 }
2183 KASSERT(wr->wr_blkhashcnt == 0);
2184 }
2185
2186 /****************************************************************/
2187
/*
 * Read len bytes from the circular log region into data, starting at
 * *offp.  A read that would run past the end of the region is split
 * in two, wrapping back to wr_circ_off.  On success *offp is advanced
 * past the bytes read (wrapped to wr_circ_off when the end of the
 * region is reached exactly).  Returns 0 or an error from wapbl_read().
 */
static int
wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp)
{
	size_t slen;
	off_t off = *offp;
	int error;

	/* len must be a whole number of log device blocks */
	KASSERT(((len >> wr->wr_log_dev_bshift) <<
	    wr->wr_log_dev_bshift) == len);
	if (off < wr->wr_circ_off)
		off = wr->wr_circ_off;
	/* Bytes available before the end of the circular region. */
	slen = wr->wr_circ_off + wr->wr_circ_size - off;
	if (slen < len) {
		/* First piece: read up to the wrap point. */
		error = wapbl_read(data, slen, wr->wr_devvp,
		    wr->wr_logpbn + (off >> wr->wr_log_dev_bshift));
		if (error)
			return error;
		data = (uint8_t *)data + slen;
		len -= slen;
		off = wr->wr_circ_off;	/* continue from region start */
	}
	error = wapbl_read(data, len, wr->wr_devvp,
	    wr->wr_logpbn + (off >> wr->wr_log_dev_bshift));
	if (error)
		return error;
	off += len;
	if (off >= wr->wr_circ_off + wr->wr_circ_size)
		off = wr->wr_circ_off;
	*offp = off;
	return 0;
}
2219
2220 static void
2221 wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp)
2222 {
2223 size_t slen;
2224 off_t off = *offp;
2225
2226 KASSERT(((len >> wr->wr_log_dev_bshift) <<
2227 wr->wr_log_dev_bshift) == len);
2228
2229 if (off < wr->wr_circ_off)
2230 off = wr->wr_circ_off;
2231 slen = wr->wr_circ_off + wr->wr_circ_size - off;
2232 if (slen < len) {
2233 len -= slen;
2234 off = wr->wr_circ_off;
2235 }
2236 off += len;
2237 if (off >= wr->wr_circ_off + wr->wr_circ_size)
2238 off = wr->wr_circ_off;
2239 *offp = off;
2240 }
2241
2242 /****************************************************************/
2243
/*
 * Open a log for replay: read and validate both copies of the commit
 * header at the start of the log, allocate a struct wapbl_replay
 * seeded from the newer header, and scan the journalled records into
 * the block hash via wapbl_replay_process().
 *
 * => vp: the log vnode (in the userland build, the device itself)
 * => off: block offset of the log on vp
 * => count, blksize: size and block size of the log area
 * Returns 0 and stores the context in *wrp on success, else an errno.
 * On success the caller owns *wrp and must eventually call
 * wapbl_replay_stop() and wapbl_replay_free().
 */
int
wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp,
	daddr_t off, size_t count, size_t blksize)
{
	struct wapbl_replay *wr;
	int error;
	struct vnode *devvp;
	daddr_t logpbn;
	uint8_t *scratch;
	struct wapbl_wc_header *wch;
	struct wapbl_wc_header *wch2;
	/* Use this until we read the actual log header */
	int log_dev_bshift = DEV_BSHIFT;
	size_t used;

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n",
	    vp, off, count, blksize));

	if (off < 0)
		return EINVAL;

	/* The log must be made of whole DEV_BSIZE blocks. */
	if (blksize < DEV_BSIZE)
		return EINVAL;
	if (blksize % DEV_BSIZE)
		return EINVAL;

#ifdef _KERNEL
#if 0
	/* XXX vp->v_size isn't reliably set for VBLK devices,
	 * especially root.  However, we might still want to verify
	 * that the full load is readable */
	if ((off + count) * blksize > vp->v_size)
		return EINVAL;
#endif

	/* Map the logical log offset to a physical device block. */
	if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) {
		return error;
	}
#else /* ! _KERNEL */
	devvp = vp;
	logpbn = off;
#endif /* ! _KERNEL */

	scratch = wapbl_malloc(MAXBSIZE);

	/* Read both copies of the commit header (one log block each). */
	error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, logpbn);
	if (error)
		goto errout;

	wch = (struct wapbl_wc_header *)scratch;
	wch2 =
	    (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift));
	/* XXX verify checksums and magic numbers */
	if (wch->wc_type != WAPBL_WC_HEADER) {
		printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type);
		error = EFTYPE;
		goto errout;
	}

	/* Prefer whichever header copy committed more recently. */
	if (wch2->wc_generation > wch->wc_generation)
		wch = wch2;

	wr = wapbl_calloc(1, sizeof(*wr));

	wr->wr_logvp = vp;
	wr->wr_devvp = devvp;
	wr->wr_logpbn = logpbn;

	/* Ownership of scratch passes to wr; wapbl_replay_stop() frees it. */
	wr->wr_scratch = scratch;

	wr->wr_log_dev_bshift = wch->wc_log_dev_bshift;
	wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift;
	wr->wr_circ_off = wch->wc_circ_off;
	wr->wr_circ_size = wch->wc_circ_size;
	wr->wr_generation = wch->wc_generation;

	used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail);

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64
	    " len=%"PRId64" used=%zu\n",
	    wch->wc_head, wch->wc_tail, wch->wc_circ_off,
	    wch->wc_circ_size, used));

	/* Size the hash for roughly one entry per journalled fs block. */
	wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift));

	error = wapbl_replay_process(wr, wch->wc_head, wch->wc_tail);
	if (error) {
		/* stop() releases scratch and the hash; free() releases wr. */
		wapbl_replay_stop(wr);
		wapbl_replay_free(wr);
		return error;
	}

	*wrp = wr;
	return 0;

errout:
	wapbl_free(scratch, MAXBSIZE);
	return error;
}
2345
/*
 * Close a replay context opened by wapbl_replay_start(): release the
 * scratch buffer and the block hash.  The context itself (and any
 * recovered inode list) stays allocated until wapbl_replay_free().
 * A no-op if the context is not open.
 */
void
wapbl_replay_stop(struct wapbl_replay *wr)
{

	if (!wapbl_replay_isopen(wr))
		return;

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n"));

	wapbl_free(wr->wr_scratch, MAXBSIZE);
	wr->wr_scratch = NULL;

	/* presumably what wapbl_replay_isopen() tests — marks us closed */
	wr->wr_logvp = NULL;

	wapbl_blkhash_clear(wr);
	wapbl_blkhash_free(wr);
}
2363
2364 void
2365 wapbl_replay_free(struct wapbl_replay *wr)
2366 {
2367
2368 KDASSERT(!wapbl_replay_isopen(wr));
2369
2370 if (wr->wr_inodes)
2371 wapbl_free(wr->wr_inodes,
2372 wr->wr_inodescnt * sizeof(wr->wr_inodes[0]));
2373 wapbl_free(wr, sizeof(*wr));
2374 }
2375
#ifdef _KERNEL
/*
 * Out-of-line wrapper around wapbl_replay_isopen() (defined elsewhere)
 * for callers that need an actual function rather than its inline/macro
 * form — presumably for use from other modules; verify against callers.
 */
int
wapbl_replay_isopen1(struct wapbl_replay *wr)
{

	return wapbl_replay_isopen(wr);
}
#endif
2384
2385 static void
2386 wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp)
2387 {
2388 struct wapbl_wc_blocklist *wc =
2389 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2390 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2391 int i, j, n;
2392
2393 for (i = 0; i < wc->wc_blkcount; i++) {
2394 /*
2395 * Enter each physical block into the hashtable independently.
2396 */
2397 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
2398 for (j = 0; j < n; j++) {
2399 wapbl_blkhash_ins(wr, wc->wc_blocks[i].wc_daddr + j,
2400 *offp);
2401 wapbl_circ_advance(wr, fsblklen, offp);
2402 }
2403 }
2404 }
2405
2406 static void
2407 wapbl_replay_process_revocations(struct wapbl_replay *wr)
2408 {
2409 struct wapbl_wc_blocklist *wc =
2410 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2411 int i, j, n;
2412
2413 for (i = 0; i < wc->wc_blkcount; i++) {
2414 /*
2415 * Remove any blocks found from the hashtable.
2416 */
2417 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
2418 for (j = 0; j < n; j++)
2419 wapbl_blkhash_rem(wr, wc->wc_blocks[i].wc_daddr + j);
2420 }
2421 }
2422
/*
 * Handle a WAPBL_WC_INODES record (already read into wr_scratch).
 * A record with wc_clear set replaces the accumulated inode list;
 * otherwise its entries are appended.  oldoff/newoff (the record's
 * start/end positions in the log) are recorded in
 * wr_inodestail/wr_inodeshead so this region isn't overwritten.
 */
static void
wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff, off_t newoff)
{
	struct wapbl_wc_inodelist *wc =
	    (struct wapbl_wc_inodelist *)wr->wr_scratch;
	void *new_inodes;
	/* Byte size of the existing array, for the copy/free below. */
	const size_t oldsize = wr->wr_inodescnt * sizeof(wr->wr_inodes[0]);

	KASSERT(sizeof(wr->wr_inodes[0]) == sizeof(wc->wc_inodes[0]));

	/*
	 * Keep track of where we found this so location won't be
	 * overwritten.
	 */
	if (wc->wc_clear) {
		wr->wr_inodestail = oldoff;
		wr->wr_inodescnt = 0;
		if (wr->wr_inodes != NULL) {
			wapbl_free(wr->wr_inodes, oldsize);
			wr->wr_inodes = NULL;
		}
	}
	wr->wr_inodeshead = newoff;
	if (wc->wc_inocnt == 0)
		return;

	/* Hand-rolled realloc: grow, copy the old entries, free old. */
	new_inodes = wapbl_malloc((wr->wr_inodescnt + wc->wc_inocnt) *
	    sizeof(wr->wr_inodes[0]));
	if (wr->wr_inodes != NULL) {
		memcpy(new_inodes, wr->wr_inodes, oldsize);
		wapbl_free(wr->wr_inodes, oldsize);
	}
	wr->wr_inodes = new_inodes;
	/* Append this record's entries after the existing ones. */
	memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes,
	    wc->wc_inocnt * sizeof(wr->wr_inodes[0]));
	wr->wr_inodescnt += wc->wc_inocnt;
}
2460
/*
 * Scan the log from tail to head, reading each commit record's header
 * block and dispatching to the per-type handler, which rebuilds the
 * in-memory block hash and inode list.  After each record, verify
 * that the handler consumed exactly wc_len bytes — a mismatch means
 * the log is corrupted.  On any error the partially built block hash
 * is discarded.  Returns 0 or an errno.
 */
static int
wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail)
{
	off_t off;
	int error;

	int logblklen = 1 << wr->wr_log_dev_bshift;

	wapbl_blkhash_clear(wr);

	off = tail;
	while (off != head) {
		struct wapbl_wc_null *wcn;
		off_t saveoff = off;	/* remember the record's start */
		error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
		if (error)
			goto errout;
		wcn = (struct wapbl_wc_null *)wr->wr_scratch;
		switch (wcn->wc_type) {
		case WAPBL_WC_BLOCKS:
			wapbl_replay_process_blocks(wr, &off);
			break;

		case WAPBL_WC_REVOCATIONS:
			wapbl_replay_process_revocations(wr);
			break;

		case WAPBL_WC_INODES:
			wapbl_replay_process_inodes(wr, saveoff, off);
			break;

		default:
			printf("Unrecognized wapbl type: 0x%08x\n",
			    wcn->wc_type);
			error = EFTYPE;
			goto errout;
		}
		/* off must land exactly wc_len bytes past the record start. */
		wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
		if (off != saveoff) {
			printf("wapbl_replay: corrupted records\n");
			error = EFTYPE;
			goto errout;
		}
	}
	return 0;

errout:
	wapbl_blkhash_clear(wr);
	return error;
}
2511
#if 0
/*
 * NOTE(review): this verification pass is compiled out and, as
 * written, would not build: it references `wch', which is not
 * declared in this function (the header fields live in `wr' —
 * compare wapbl_replay_process() above).  Kept for reference only.
 *
 * Intended behavior: walk the log comparing each journalled block
 * against the on-disk copy on fsdevvp, counting mismatches; returns
 * EFTYPE if any block differs.
 */
int
wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp)
{
	off_t off;
	int mismatchcnt = 0;
	int logblklen = 1 << wr->wr_log_dev_bshift;
	int fsblklen = 1 << wr->wr_fs_dev_bshift;
	void *scratch1 = wapbl_malloc(MAXBSIZE);
	void *scratch2 = wapbl_malloc(MAXBSIZE);
	int error = 0;

	KDASSERT(wapbl_replay_isopen(wr));

	off = wch->wc_tail;
	while (off != wch->wc_head) {
		struct wapbl_wc_null *wcn;
#ifdef DEBUG
		off_t saveoff = off;
#endif
		error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
		if (error)
			goto out;
		wcn = (struct wapbl_wc_null *)wr->wr_scratch;
		switch (wcn->wc_type) {
		case WAPBL_WC_BLOCKS:
			{
				struct wapbl_wc_blocklist *wc =
				    (struct wapbl_wc_blocklist *)wr->wr_scratch;
				int i;
				for (i = 0; i < wc->wc_blkcount; i++) {
					int foundcnt = 0;
					int dirtycnt = 0;
					int j, n;
					/*
					 * Check each physical block into the
					 * hashtable independently
					 */
					n = wc->wc_blocks[i].wc_dlen >>
					    wch->wc_fs_dev_bshift;
					for (j = 0; j < n; j++) {
						struct wapbl_blk *wb =
						    wapbl_blkhash_get(wr,
						    wc->wc_blocks[i].wc_daddr + j);
						if (wb && (wb->wb_off == off)) {
							foundcnt++;
							error =
							    wapbl_circ_read(wr,
							    scratch1, fsblklen,
							    &off);
							if (error)
								goto out;
							error =
							    wapbl_read(scratch2,
							    fsblklen, fsdevvp,
							    wb->wb_blk);
							if (error)
								goto out;
							if (memcmp(scratch1,
							    scratch2,
							    fsblklen)) {
								printf(
		"wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n",
		wb->wb_blk, (intmax_t)off);
								dirtycnt++;
								mismatchcnt++;
							}
						} else {
							wapbl_circ_advance(wr,
							    fsblklen, &off);
						}
					}
#if 0
					/*
					 * If all of the blocks in an entry
					 * are clean, then remove all of its
					 * blocks from the hashtable since they
					 * never will need replay.
					 */
					if ((foundcnt != 0) &&
					    (dirtycnt == 0)) {
						off = saveoff;
						wapbl_circ_advance(wr,
						    logblklen, &off);
						for (j = 0; j < n; j++) {
							struct wapbl_blk *wb =
							    wapbl_blkhash_get(wr,
							    wc->wc_blocks[i].wc_daddr + j);
							if (wb &&
							    (wb->wb_off == off)) {
								wapbl_blkhash_rem(wr, wb->wb_blk);
							}
							wapbl_circ_advance(wr,
							    fsblklen, &off);
						}
					}
#endif
				}
			}
			break;
		case WAPBL_WC_REVOCATIONS:
		case WAPBL_WC_INODES:
			break;
		default:
			KASSERT(0);
		}
#ifdef DEBUG
		wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
		KASSERT(off == saveoff);
#endif
	}
out:
	wapbl_free(scratch1, MAXBSIZE);
	wapbl_free(scratch2, MAXBSIZE);
	if (!error && mismatchcnt)
		error = EFTYPE;
	return error;
}
#endif
2631
2632 int
2633 wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp)
2634 {
2635 struct wapbl_blk *wb;
2636 size_t i;
2637 off_t off;
2638 void *scratch;
2639 int error = 0;
2640 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2641
2642 KDASSERT(wapbl_replay_isopen(wr));
2643
2644 scratch = wapbl_malloc(MAXBSIZE);
2645
2646 for (i = 0; i < wr->wr_blkhashmask; ++i) {
2647 LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) {
2648 off = wb->wb_off;
2649 error = wapbl_circ_read(wr, scratch, fsblklen, &off);
2650 if (error)
2651 break;
2652 error = wapbl_write(scratch, fsblklen, fsdevvp,
2653 wb->wb_blk);
2654 if (error)
2655 break;
2656 }
2657 }
2658
2659 wapbl_free(scratch, MAXBSIZE);
2660 return error;
2661 }
2662
2663 int
2664 wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len)
2665 {
2666 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2667
2668 KDASSERT(wapbl_replay_isopen(wr));
2669 KASSERT((len % fsblklen) == 0);
2670
2671 while (len != 0) {
2672 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2673 if (wb)
2674 return 1;
2675 len -= fsblklen;
2676 }
2677 return 0;
2678 }
2679
2680 int
2681 wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len)
2682 {
2683 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2684
2685 KDASSERT(wapbl_replay_isopen(wr));
2686
2687 KASSERT((len % fsblklen) == 0);
2688
2689 while (len != 0) {
2690 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2691 if (wb) {
2692 off_t off = wb->wb_off;
2693 int error;
2694 error = wapbl_circ_read(wr, data, fsblklen, &off);
2695 if (error)
2696 return error;
2697 }
2698 data = (uint8_t *)data + fsblklen;
2699 len -= fsblklen;
2700 blk++;
2701 }
2702 return 0;
2703 }
2704