/*	$NetBSD: fss.c,v 1.114 2023/03/22 21:14:46 hannken Exp $	*/

/*-
 * Copyright (c) 2003 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Juergen Hannken-Illjes.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * File system snapshot disk driver.
 *
 * Block/character interface to the snapshot of a mounted file system.
 */
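
/*
 * Usage sketch (not part of the driver): a snapshot is normally
 * configured from userland with fssconfig(8), which issues the
 * FSSIOCSET ioctl handled below.  A minimal, hypothetical example;
 * the paths and the device node are assumptions:
 *
 *	struct fss_set fss = {
 *		.fss_mount = "/mnt",		   file system to snapshot
 *		.fss_bstore = "/var/fss-backing",  backing store path
 *		.fss_csize = 0,			   0 lets the driver choose
 *		.fss_flags = 0,
 *	};
 *	int fd = open("/dev/rfss0", O_RDWR);
 *	if (fd == -1 || ioctl(fd, FSSIOCSET, &fss) == -1)
 *		err(EXIT_FAILURE, "cannot configure snapshot");
 */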

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: fss.c,v 1.114 2023/03/22 21:14:46 hannken Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/buf.h>
#include <sys/ioctl.h>
#include <sys/disklabel.h>
#include <sys/device.h>
#include <sys/disk.h>
#include <sys/stat.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/uio.h>
#include <sys/conf.h>
#include <sys/kthread.h>
#include <sys/fstrans.h>
#include <sys/vfs_syscalls.h>		/* For do_sys_unlink(). */

#include <miscfs/specfs/specdev.h>

#include <dev/fssvar.h>

#include <uvm/uvm.h>

#include "ioconf.h"

dev_type_open(fss_open);
dev_type_close(fss_close);
dev_type_read(fss_read);
dev_type_write(fss_write);
dev_type_ioctl(fss_ioctl);
dev_type_strategy(fss_strategy);
dev_type_dump(fss_dump);
dev_type_size(fss_size);

static void fss_unmount_hook(struct mount *);
static int fss_copy_on_write(void *, struct buf *, bool);
static inline void fss_error(struct fss_softc *, const char *);
static int fss_create_files(struct fss_softc *, struct fss_set *,
    off_t *, struct lwp *);
static int fss_create_snapshot(struct fss_softc *, struct fss_set *,
    struct lwp *);
static int fss_delete_snapshot(struct fss_softc *, struct lwp *);
static int fss_softc_alloc(struct fss_softc *);
static void fss_softc_free(struct fss_softc *);
static int fss_read_cluster(struct fss_softc *, u_int32_t);
static void fss_bs_thread(void *);
static int fss_bs_io(struct fss_softc *, fss_io_type,
    u_int32_t, off_t, int, void *, size_t *);
static u_int32_t *fss_bs_indir(struct fss_softc *, u_int32_t);

static kmutex_t fss_device_lock;	/* Protect all units. */
static kcondvar_t fss_device_cv;	/* Serialize snapshot creation. */
static bool fss_creating = false;	/* Currently creating a snapshot. */
static int fss_num_attached = 0;	/* Number of attached devices. */
static struct vfs_hooks fss_vfs_hooks = {
	.vh_unmount = fss_unmount_hook
};

const struct bdevsw fss_bdevsw = {
	.d_open = fss_open,
	.d_close = fss_close,
	.d_strategy = fss_strategy,
	.d_ioctl = fss_ioctl,
	.d_dump = fss_dump,
	.d_psize = fss_size,
	.d_discard = nodiscard,
	.d_flag = D_DISK | D_MPSAFE
};

const struct cdevsw fss_cdevsw = {
	.d_open = fss_open,
	.d_close = fss_close,
	.d_read = fss_read,
	.d_write = fss_write,
	.d_ioctl = fss_ioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK | D_MPSAFE
};

static int fss_match(device_t, cfdata_t, void *);
static void fss_attach(device_t, device_t, void *);
static int fss_detach(device_t, int);

CFATTACH_DECL_NEW(fss, sizeof(struct fss_softc),
    fss_match, fss_attach, fss_detach, NULL);

void
fssattach(int num)
{

	mutex_init(&fss_device_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&fss_device_cv, "snapwait");
	if (config_cfattach_attach(fss_cd.cd_name, &fss_ca))
		aprint_error("%s: unable to register\n", fss_cd.cd_name);
}

static int
fss_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}

static void
fss_attach(device_t parent, device_t self, void *aux)
{
	struct fss_softc *sc = device_private(self);

	sc->sc_dev = self;
	sc->sc_bdev = NODEV;
	mutex_init(&sc->sc_slock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&sc->sc_work_cv, "fssbs");
	cv_init(&sc->sc_cache_cv, "cowwait");
	bufq_alloc(&sc->sc_bufq, "fcfs", 0);
	sc->sc_dkdev = kmem_zalloc(sizeof(*sc->sc_dkdev), KM_SLEEP);
	sc->sc_dkdev->dk_info = NULL;
	disk_init(sc->sc_dkdev, device_xname(self), NULL);
	if (!pmf_device_register(self, NULL, NULL))
		aprint_error_dev(self, "couldn't establish power handler\n");

	if (fss_num_attached++ == 0)
		vfs_hooks_attach(&fss_vfs_hooks);
}

static int
fss_detach(device_t self, int flags)
{
	struct fss_softc *sc = device_private(self);

	mutex_enter(&sc->sc_slock);
	if (sc->sc_state != FSS_IDLE) {
		mutex_exit(&sc->sc_slock);
		return EBUSY;
	}
	mutex_exit(&sc->sc_slock);

	if (--fss_num_attached == 0)
		vfs_hooks_detach(&fss_vfs_hooks);

	pmf_device_deregister(self);
	mutex_destroy(&sc->sc_slock);
	cv_destroy(&sc->sc_work_cv);
	cv_destroy(&sc->sc_cache_cv);
	bufq_drain(sc->sc_bufq);
	bufq_free(sc->sc_bufq);
	disk_destroy(sc->sc_dkdev);
	kmem_free(sc->sc_dkdev, sizeof(*sc->sc_dkdev));

	return 0;
}

int
fss_open(dev_t dev, int flags, int mode, struct lwp *l)
{
	int mflag;
	cfdata_t cf;
	struct fss_softc *sc;

	mflag = (mode == S_IFCHR ? FSS_CDEV_OPEN : FSS_BDEV_OPEN);

	mutex_enter(&fss_device_lock);

	sc = device_lookup_private(&fss_cd, minor(dev));
	if (sc == NULL) {
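		/*
		 * First open of this unit: create a cfdata record and
		 * attach the pseudo-device on the fly.
		 */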
		cf = kmem_zalloc(sizeof(*cf), KM_SLEEP);
		cf->cf_name = fss_cd.cd_name;
		cf->cf_atname = fss_cd.cd_name;
		cf->cf_unit = minor(dev);
		cf->cf_fstate = FSTATE_STAR;
		sc = device_private(config_attach_pseudo(cf));
		if (sc == NULL) {
			mutex_exit(&fss_device_lock);
			return ENOMEM;
		}
		sc->sc_state = FSS_IDLE;
	}

	mutex_enter(&sc->sc_slock);

	sc->sc_flags |= mflag;

	mutex_exit(&sc->sc_slock);
	mutex_exit(&fss_device_lock);

	return 0;
}

int
fss_close(dev_t dev, int flags, int mode, struct lwp *l)
{
	int mflag, error;
	cfdata_t cf;
	struct fss_softc *sc = device_lookup_private(&fss_cd, minor(dev));

	if (sc == NULL)
		return ENXIO;

	mflag = (mode == S_IFCHR ? FSS_CDEV_OPEN : FSS_BDEV_OPEN);
	error = 0;

	mutex_enter(&fss_device_lock);
restart:
	mutex_enter(&sc->sc_slock);
	if ((sc->sc_flags & (FSS_CDEV_OPEN|FSS_BDEV_OPEN)) != mflag) {
		sc->sc_flags &= ~mflag;
		mutex_exit(&sc->sc_slock);
		mutex_exit(&fss_device_lock);
		return 0;
	}
	if (sc->sc_state != FSS_IDLE &&
	    (sc->sc_uflags & FSS_UNCONFIG_ON_CLOSE) != 0) {
		sc->sc_uflags &= ~FSS_UNCONFIG_ON_CLOSE;
		mutex_exit(&sc->sc_slock);
		error = fss_ioctl(dev, FSSIOCCLR, NULL, FWRITE, l);
		goto restart;
	}
	if (sc->sc_state != FSS_IDLE) {
		mutex_exit(&sc->sc_slock);
		mutex_exit(&fss_device_lock);
		return error;
	}

	KASSERT(sc->sc_state == FSS_IDLE);
	KASSERT((sc->sc_flags & (FSS_CDEV_OPEN|FSS_BDEV_OPEN)) == mflag);
	mutex_exit(&sc->sc_slock);
	cf = device_cfdata(sc->sc_dev);
	error = config_detach(sc->sc_dev, DETACH_QUIET);
	if (! error)
		kmem_free(cf, sizeof(*cf));
	mutex_exit(&fss_device_lock);

	return error;
}

void
fss_strategy(struct buf *bp)
{
	const bool write = ((bp->b_flags & B_READ) != B_READ);
	struct fss_softc *sc = device_lookup_private(&fss_cd, minor(bp->b_dev));

	if (sc == NULL) {
		bp->b_error = ENXIO;
		goto done;
	}

	mutex_enter(&sc->sc_slock);

	if (write || sc->sc_state != FSS_ACTIVE) {
		bp->b_error = (write ? EROFS : ENXIO);
		goto done;
	}
	/* Check bounds for non-persistent snapshots. */
	if ((sc->sc_flags & FSS_PERSISTENT) == 0 &&
	    bounds_check_with_mediasize(bp, DEV_BSIZE,
	    btodb(FSS_CLTOB(sc, sc->sc_clcount - 1) + sc->sc_clresid)) <= 0)
		goto done;

	bp->b_rawblkno = bp->b_blkno;
	bufq_put(sc->sc_bufq, bp);
	cv_signal(&sc->sc_work_cv);

	mutex_exit(&sc->sc_slock);
	return;

done:
	if (sc != NULL)
		mutex_exit(&sc->sc_slock);
	bp->b_resid = bp->b_bcount;
	biodone(bp);
}

int
fss_read(dev_t dev, struct uio *uio, int flags)
{
	return physio(fss_strategy, NULL, dev, B_READ, minphys, uio);
}

int
fss_write(dev_t dev, struct uio *uio, int flags)
{
	return physio(fss_strategy, NULL, dev, B_WRITE, minphys, uio);
}

int
fss_ioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
	int error = 0;
	struct fss_softc *sc = device_lookup_private(&fss_cd, minor(dev));
	struct fss_set _fss;
	struct fss_set *fss = (struct fss_set *)data;
	struct fss_set50 *fss50 = (struct fss_set50 *)data;
	struct fss_get *fsg = (struct fss_get *)data;
#ifndef _LP64
	struct fss_get50 *fsg50 = (struct fss_get50 *)data;
#endif

	if (sc == NULL)
		return ENXIO;

	switch (cmd) {
	case FSSIOCSET50:
		fss = &_fss;
		fss->fss_mount = fss50->fss_mount;
		fss->fss_bstore = fss50->fss_bstore;
		fss->fss_csize = fss50->fss_csize;
		fss->fss_flags = 0;
		/* Fall through */
	case FSSIOCSET:
		mutex_enter(&sc->sc_slock);
		if ((flag & FWRITE) == 0) {
			error = EPERM;
		} else if (sc->sc_state != FSS_IDLE) {
			error = EBUSY;
		} else {
			sc->sc_state = FSS_CREATING;
			copyinstr(fss->fss_mount, sc->sc_mntname,
			    sizeof(sc->sc_mntname), NULL);
			memset(&sc->sc_time, 0, sizeof(sc->sc_time));
			sc->sc_clshift = 0;
		}
		mutex_exit(&sc->sc_slock);
		if (error)
			break;

		/*
		 * Serialize snapshot creation.
		 */
		mutex_enter(&fss_device_lock);
		while (fss_creating) {
			error = cv_wait_sig(&fss_device_cv, &fss_device_lock);
			if (error) {
				mutex_enter(&sc->sc_slock);
				KASSERT(sc->sc_state == FSS_CREATING);
				sc->sc_state = FSS_IDLE;
				mutex_exit(&sc->sc_slock);
				mutex_exit(&fss_device_lock);
				break;
			}
		}
		fss_creating = true;
		mutex_exit(&fss_device_lock);

		error = fss_create_snapshot(sc, fss, l);
		mutex_enter(&sc->sc_slock);
		if (error == 0) {
			KASSERT(sc->sc_state == FSS_ACTIVE);
			sc->sc_uflags = fss->fss_flags;
		} else {
			KASSERT(sc->sc_state == FSS_CREATING);
			sc->sc_state = FSS_IDLE;
		}
		mutex_exit(&sc->sc_slock);

		mutex_enter(&fss_device_lock);
		fss_creating = false;
		cv_broadcast(&fss_device_cv);
		mutex_exit(&fss_device_lock);

		break;

	case FSSIOCCLR:
		mutex_enter(&sc->sc_slock);
		if ((flag & FWRITE) == 0) {
			error = EPERM;
		} else if (sc->sc_state != FSS_ACTIVE) {
			error = EBUSY;
		} else {
			sc->sc_state = FSS_DESTROYING;
		}
		mutex_exit(&sc->sc_slock);
		if (error)
			break;

		error = fss_delete_snapshot(sc, l);
		mutex_enter(&sc->sc_slock);
		if (error)
			fss_error(sc, "Failed to delete snapshot");
		else
			KASSERT(sc->sc_state == FSS_IDLE);
		mutex_exit(&sc->sc_slock);
		break;

#ifndef _LP64
	case FSSIOCGET50:
		mutex_enter(&sc->sc_slock);
		if (sc->sc_state == FSS_IDLE) {
			error = ENXIO;
		} else if ((sc->sc_flags & FSS_PERSISTENT) == 0) {
			memcpy(fsg50->fsg_mount, sc->sc_mntname, MNAMELEN);
			fsg50->fsg_csize = FSS_CLSIZE(sc);
			timeval_to_timeval50(&sc->sc_time, &fsg50->fsg_time);
			fsg50->fsg_mount_size = sc->sc_clcount;
			fsg50->fsg_bs_size = sc->sc_clnext;
			error = 0;
		} else {
			memcpy(fsg50->fsg_mount, sc->sc_mntname, MNAMELEN);
			fsg50->fsg_csize = 0;
			timeval_to_timeval50(&sc->sc_time, &fsg50->fsg_time);
			fsg50->fsg_mount_size = 0;
			fsg50->fsg_bs_size = 0;
			error = 0;
		}
		mutex_exit(&sc->sc_slock);
		break;
#endif /* _LP64 */

	case FSSIOCGET:
		mutex_enter(&sc->sc_slock);
		if (sc->sc_state == FSS_IDLE) {
			error = ENXIO;
		} else if ((sc->sc_flags & FSS_PERSISTENT) == 0) {
			memcpy(fsg->fsg_mount, sc->sc_mntname, MNAMELEN);
			fsg->fsg_csize = FSS_CLSIZE(sc);
			fsg->fsg_time = sc->sc_time;
			fsg->fsg_mount_size = sc->sc_clcount;
			fsg->fsg_bs_size = sc->sc_clnext;
			error = 0;
		} else {
			memcpy(fsg->fsg_mount, sc->sc_mntname, MNAMELEN);
			fsg->fsg_csize = 0;
			fsg->fsg_time = sc->sc_time;
			fsg->fsg_mount_size = 0;
			fsg->fsg_bs_size = 0;
			error = 0;
		}
		mutex_exit(&sc->sc_slock);
		break;

	case FSSIOFSET:
		mutex_enter(&sc->sc_slock);
		sc->sc_uflags = *(int *)data;
		mutex_exit(&sc->sc_slock);
		error = 0;
		break;

	case FSSIOFGET:
		mutex_enter(&sc->sc_slock);
		*(int *)data = sc->sc_uflags;
		mutex_exit(&sc->sc_slock);
		error = 0;
		break;

	default:
		error = EINVAL;
		break;
	}

	return error;
}

int
fss_size(dev_t dev)
{
	return -1;
}

int
fss_dump(dev_t dev, daddr_t blkno, void *va,
    size_t size)
{
	return EROFS;
}

/*
 * An error occurred reading or writing the snapshot or backing store.
 * If it is the first error, log it to the console and disestablish the
 * copy-on-write handler.  The caller holds the mutex.
 */
static inline void
fss_error(struct fss_softc *sc, const char *msg)
{

	KASSERT(mutex_owned(&sc->sc_slock));

	if ((sc->sc_flags & FSS_ERROR))
		return;

	aprint_error_dev(sc->sc_dev, "snapshot invalid: %s\n", msg);
	if ((sc->sc_flags & FSS_PERSISTENT) == 0) {
		mutex_exit(&sc->sc_slock);
		fscow_disestablish(sc->sc_mount, fss_copy_on_write, sc);
		mutex_enter(&sc->sc_slock);
	}
	sc->sc_flags |= FSS_ERROR;
}

/*
 * Allocate the variable sized parts of the softc and
 * fork the kernel thread.
 *
 * The fields sc_clcount, sc_clshift, sc_cache_size and sc_indir_size
 * must be initialized.
 */
static int
fss_softc_alloc(struct fss_softc *sc)
{
	int i, error;

	if ((sc->sc_flags & FSS_PERSISTENT) == 0) {
		sc->sc_copied =
		    kmem_zalloc(howmany(sc->sc_clcount, NBBY), KM_SLEEP);
		sc->sc_cache = kmem_alloc(sc->sc_cache_size *
		    sizeof(struct fss_cache), KM_SLEEP);
		for (i = 0; i < sc->sc_cache_size; i++) {
			sc->sc_cache[i].fc_type = FSS_CACHE_FREE;
			sc->sc_cache[i].fc_data =
			    kmem_alloc(FSS_CLSIZE(sc), KM_SLEEP);
			cv_init(&sc->sc_cache[i].fc_state_cv, "cowwait1");
		}

		sc->sc_indir_valid =
		    kmem_zalloc(howmany(sc->sc_indir_size, NBBY), KM_SLEEP);
		sc->sc_indir_data = kmem_zalloc(FSS_CLSIZE(sc), KM_SLEEP);
	} else {
		sc->sc_copied = NULL;
		sc->sc_cache = NULL;
		sc->sc_indir_valid = NULL;
		sc->sc_indir_data = NULL;
	}

	sc->sc_flags |= FSS_BS_THREAD;
	if ((error = kthread_create(PRI_BIO, KTHREAD_MUSTJOIN, NULL,
	    fss_bs_thread, sc, &sc->sc_bs_lwp,
	    "%s", device_xname(sc->sc_dev))) != 0) {
		sc->sc_flags &= ~FSS_BS_THREAD;
		return error;
	}

	disk_attach(sc->sc_dkdev);

	return 0;
}

/*
 * Free the variable sized parts of the softc.
 */
static void
fss_softc_free(struct fss_softc *sc)
{
	int i;

	if ((sc->sc_flags & FSS_BS_THREAD) != 0) {
		mutex_enter(&sc->sc_slock);
		sc->sc_flags &= ~FSS_BS_THREAD;
		cv_signal(&sc->sc_work_cv);
		mutex_exit(&sc->sc_slock);
		kthread_join(sc->sc_bs_lwp);

		disk_detach(sc->sc_dkdev);
	}

	if (sc->sc_copied != NULL)
		kmem_free(sc->sc_copied, howmany(sc->sc_clcount, NBBY));
	sc->sc_copied = NULL;

	if (sc->sc_cache != NULL) {
		for (i = 0; i < sc->sc_cache_size; i++)
			if (sc->sc_cache[i].fc_data != NULL) {
				cv_destroy(&sc->sc_cache[i].fc_state_cv);
				kmem_free(sc->sc_cache[i].fc_data,
				    FSS_CLSIZE(sc));
			}
		kmem_free(sc->sc_cache,
		    sc->sc_cache_size*sizeof(struct fss_cache));
	}
	sc->sc_cache = NULL;

	if (sc->sc_indir_valid != NULL)
		kmem_free(sc->sc_indir_valid, howmany(sc->sc_indir_size, NBBY));
	sc->sc_indir_valid = NULL;

	if (sc->sc_indir_data != NULL)
		kmem_free(sc->sc_indir_data, FSS_CLSIZE(sc));
	sc->sc_indir_data = NULL;
}

/*
 * Set all active snapshots on this file system into ERROR state.
 */
static void
fss_unmount_hook(struct mount *mp)
{
	int i;
	struct fss_softc *sc;

	mutex_enter(&fss_device_lock);
	for (i = 0; i < fss_cd.cd_ndevs; i++) {
		if ((sc = device_lookup_private(&fss_cd, i)) == NULL)
			continue;
		mutex_enter(&sc->sc_slock);
		if (sc->sc_state != FSS_IDLE && sc->sc_mount == mp)
			fss_error(sc, "forced by unmount");
		mutex_exit(&sc->sc_slock);
	}
	mutex_exit(&fss_device_lock);
}

/*
 * A buffer is about to be written to the snapshotted block device.
 * Copy the affected clusters to the backing store if they have not
 * been copied yet.
 */
static int
fss_copy_on_write(void *v, struct buf *bp, bool data_valid)
{
	int error;
	u_int32_t cl, ch, c;
	struct fss_softc *sc = v;

	mutex_enter(&sc->sc_slock);
	if (sc->sc_state != FSS_ACTIVE) {
		mutex_exit(&sc->sc_slock);
		return 0;
	}

	cl = FSS_BTOCL(sc, dbtob(bp->b_blkno));
	ch = FSS_BTOCL(sc, dbtob(bp->b_blkno)+bp->b_bcount-1);
	error = 0;
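	/*
	 * The pagedaemon must not sleep here waiting for free cache
	 * slots or buffer memory; that could deadlock page reclaim.
	 * Fail with ENOMEM unless every cluster in the range has
	 * already been copied.
	 */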
	if (curlwp == uvm.pagedaemon_lwp) {
		for (c = cl; c <= ch; c++)
			if (isclr(sc->sc_copied, c)) {
				error = ENOMEM;
				break;
			}
	}
	mutex_exit(&sc->sc_slock);

	if (error == 0)
		for (c = cl; c <= ch; c++) {
			error = fss_read_cluster(sc, c);
			if (error)
				break;
		}

	return error;
}

/*
 * Lookup and open needed files.
 *
 * For a file system internal snapshot this initializes sc_mntname,
 * sc_mount, sc_bs_vp and sc_time.
 *
 * Otherwise it returns the dev and size of the underlying block
 * device and initializes sc_mntname, sc_mount, sc_bdev and sc_bs_vp.
 */
static int
fss_create_files(struct fss_softc *sc, struct fss_set *fss,
    off_t *bsize, struct lwp *l)
{
	int error, bits, fsbsize;
	uint64_t numsec;
	unsigned int secsize;
	struct timespec ts;
	/* distinguish lookup 1 from lookup 2 to reduce mistakes */
	struct pathbuf *pb2;
	struct vnode *vp, *vp2;

	/*
	 * Get the mounted file system.
	 */

	error = namei_simple_user(fss->fss_mount,
				NSM_FOLLOW_NOEMULROOT, &vp);
	if (error != 0)
		return error;

	if ((vp->v_vflag & VV_ROOT) != VV_ROOT) {
		vrele(vp);
		return EINVAL;
	}

	sc->sc_mount = vp->v_mount;
	memcpy(sc->sc_mntname, sc->sc_mount->mnt_stat.f_mntonname, MNAMELEN);

	vrele(vp);

	/*
	 * Check for file system internal snapshot.
	 */

	error = namei_simple_user(fss->fss_bstore,
				NSM_FOLLOW_NOEMULROOT, &vp);
	if (error != 0)
		return error;

	if (vp->v_type == VREG && vp->v_mount == sc->sc_mount) {
		sc->sc_flags |= FSS_PERSISTENT;
		sc->sc_bs_vp = vp;

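		/*
		 * Compute sc_bs_bshift as log2(fsbsize): the smallest
		 * shift for which FSS_FSBSIZE(sc) matches the file
		 * system block size.
		 */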
		fsbsize = sc->sc_bs_vp->v_mount->mnt_stat.f_iosize;
		bits = sizeof(sc->sc_bs_bshift)*NBBY;
		for (sc->sc_bs_bshift = 1; sc->sc_bs_bshift < bits;
		    sc->sc_bs_bshift++)
			if (FSS_FSBSIZE(sc) == fsbsize)
				break;
		if (sc->sc_bs_bshift >= bits)
			return EINVAL;

		sc->sc_bs_bmask = FSS_FSBSIZE(sc)-1;
		sc->sc_clshift = 0;

		if ((fss->fss_flags & FSS_UNLINK_ON_CREATE) != 0) {
			error = do_sys_unlink(fss->fss_bstore, UIO_USERSPACE);
			if (error)
				return error;
		}
		error = vn_lock(vp, LK_EXCLUSIVE);
		if (error != 0)
			return error;
		error = VFS_SNAPSHOT(sc->sc_mount, sc->sc_bs_vp, &ts);
		TIMESPEC_TO_TIMEVAL(&sc->sc_time, &ts);

		VOP_UNLOCK(sc->sc_bs_vp);

		return error;
	}
	vrele(vp);

	/*
	 * Get the block device it is mounted on and its size.
	 */

	error = spec_node_lookup_by_mount(sc->sc_mount, &vp);
	if (error)
		return error;
	sc->sc_bdev = vp->v_rdev;

	error = getdisksize(vp, &numsec, &secsize);
	vrele(vp);
	if (error)
		return error;

	*bsize = (off_t)numsec*secsize;

	/*
	 * Get the backing store.
	 */

	error = pathbuf_copyin(fss->fss_bstore, &pb2);
	if (error) {
		return error;
	}
	error = vn_open(NULL, pb2, 0, FREAD|FWRITE, 0, &vp2, NULL, NULL);
	if (error != 0) {
		pathbuf_destroy(pb2);
		return error;
	}
	VOP_UNLOCK(vp2);

	sc->sc_bs_vp = vp2;

	if (vp2->v_type != VREG && vp2->v_type != VCHR) {
		vrele(vp2);
		pathbuf_destroy(pb2);
		return EINVAL;
	}
	pathbuf_destroy(pb2);

	if ((fss->fss_flags & FSS_UNLINK_ON_CREATE) != 0) {
		error = do_sys_unlink(fss->fss_bstore, UIO_USERSPACE);
		if (error)
			return error;
	}
	if (sc->sc_bs_vp->v_type == VREG) {
		fsbsize = sc->sc_bs_vp->v_mount->mnt_stat.f_iosize;
		if (fsbsize & (fsbsize-1))	/* Not a power of two */
			return EINVAL;
		for (sc->sc_bs_bshift = 1; sc->sc_bs_bshift < 32;
		    sc->sc_bs_bshift++)
			if (FSS_FSBSIZE(sc) == fsbsize)
				break;
		if (sc->sc_bs_bshift >= 32)
			return EINVAL;
		sc->sc_bs_bmask = FSS_FSBSIZE(sc)-1;
	} else {
		sc->sc_bs_bshift = DEV_BSHIFT;
		sc->sc_bs_bmask = FSS_FSBSIZE(sc)-1;
	}

	return 0;
}

/*
 * Create a snapshot.
 */
static int
fss_create_snapshot(struct fss_softc *sc, struct fss_set *fss, struct lwp *l)
{
	int len, error;
	u_int32_t csize;
	off_t bsize;

	bsize = 0;	/* XXX gcc */

	/*
	 * Open needed files.
	 */
	if ((error = fss_create_files(sc, fss, &bsize, l)) != 0)
		goto bad;

	if (sc->sc_flags & FSS_PERSISTENT) {
		fss_softc_alloc(sc);
		mutex_enter(&sc->sc_slock);
		sc->sc_state = FSS_ACTIVE;
		mutex_exit(&sc->sc_slock);
		return 0;
	}

	/*
	 * Set cluster size. Must be a power of two and
	 * a multiple of backing store block size.
	 */
	if (fss->fss_csize <= 0)
		csize = MAXPHYS;
	else
		csize = fss->fss_csize;
	if (bsize/csize > FSS_CLUSTER_MAX)
		csize = bsize/FSS_CLUSTER_MAX+1;

	for (sc->sc_clshift = sc->sc_bs_bshift; sc->sc_clshift < 32;
	    sc->sc_clshift++)
		if (FSS_CLSIZE(sc) >= csize)
			break;
	if (sc->sc_clshift >= 32) {
		error = EINVAL;
		goto bad;
	}
	sc->sc_clmask = FSS_CLSIZE(sc)-1;

	/*
	 * Set number of cache slots.
	 */
	if (FSS_CLSIZE(sc) <= 8192)
		sc->sc_cache_size = 32;
	else if (FSS_CLSIZE(sc) <= 65536)
		sc->sc_cache_size = 8;
	else
		sc->sc_cache_size = 4;

	/*
	 * Set number of clusters and size of last cluster.
	 */
	sc->sc_clcount = FSS_BTOCL(sc, bsize-1)+1;
	sc->sc_clresid = FSS_CLOFF(sc, bsize-1)+1;

	/*
	 * Set size of indirect table.
	 */
	len = sc->sc_clcount*sizeof(u_int32_t);
	sc->sc_indir_size = FSS_BTOCL(sc, len)+1;
	sc->sc_clnext = sc->sc_indir_size;
	sc->sc_indir_cur = 0;
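
	/*
	 * Worked example: a 1 GiB device with 64 KiB clusters gives
	 * sc_clcount = 16384 and sc_clresid = 65536 (the last cluster
	 * is full).  The indirect table then needs 16384 * 4 = 64 KiB,
	 * so sc_indir_size = 2 and the first data cluster on the
	 * backing store is sc_clnext = 2.
	 */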

	if ((error = fss_softc_alloc(sc)) != 0)
		goto bad;

	/*
	 * Activate the snapshot.
	 */

	if ((error = vfs_suspend(sc->sc_mount, 0)) != 0)
		goto bad;

	microtime(&sc->sc_time);

	vrele_flush(sc->sc_mount);
	error = VFS_SYNC(sc->sc_mount, MNT_WAIT, curlwp->l_cred);
	if (error == 0)
		error = fscow_establish(sc->sc_mount, fss_copy_on_write, sc);
	if (error == 0) {
		mutex_enter(&sc->sc_slock);
		sc->sc_state = FSS_ACTIVE;
		mutex_exit(&sc->sc_slock);
	}

	vfs_resume(sc->sc_mount);

	if (error != 0)
		goto bad;

	aprint_debug_dev(sc->sc_dev, "%s snapshot active\n", sc->sc_mntname);
	aprint_debug_dev(sc->sc_dev,
	    "%u clusters of %u, %u cache slots, %u indir clusters\n",
	    sc->sc_clcount, FSS_CLSIZE(sc),
	    sc->sc_cache_size, sc->sc_indir_size);

	return 0;

bad:
	fss_softc_free(sc);
	if (sc->sc_bs_vp != NULL) {
		if (sc->sc_flags & FSS_PERSISTENT)
			vrele(sc->sc_bs_vp);
		else
			vn_close(sc->sc_bs_vp, FREAD|FWRITE, l->l_cred);
	}
	sc->sc_bs_vp = NULL;

	return error;
}

/*
 * Delete a snapshot.
 */
static int
fss_delete_snapshot(struct fss_softc *sc, struct lwp *l)
{

	mutex_enter(&sc->sc_slock);
	if ((sc->sc_flags & FSS_PERSISTENT) == 0 &&
	    (sc->sc_flags & FSS_ERROR) == 0) {
		mutex_exit(&sc->sc_slock);
		fscow_disestablish(sc->sc_mount, fss_copy_on_write, sc);
	} else {
		mutex_exit(&sc->sc_slock);
	}

	fss_softc_free(sc);
	if (sc->sc_flags & FSS_PERSISTENT)
		vrele(sc->sc_bs_vp);
	else
		vn_close(sc->sc_bs_vp, FREAD|FWRITE, l->l_cred);

	mutex_enter(&sc->sc_slock);
	sc->sc_state = FSS_IDLE;
	sc->sc_mount = NULL;
	sc->sc_bdev = NODEV;
	sc->sc_bs_vp = NULL;
	sc->sc_flags &= ~FSS_PERSISTENT;
	mutex_exit(&sc->sc_slock);

	return 0;
}

/*
 * Read a cluster from the snapshotted block device to the cache.
 */
static int
fss_read_cluster(struct fss_softc *sc, u_int32_t cl)
{
	int error, todo, offset, len;
	daddr_t dblk;
	struct buf *bp, *mbp;
	struct fss_cache *scp, *scl;

	/*
	 * Get a free cache slot.
	 */
	scl = sc->sc_cache+sc->sc_cache_size;

	mutex_enter(&sc->sc_slock);

restart:
	if (isset(sc->sc_copied, cl) || sc->sc_state != FSS_ACTIVE) {
		mutex_exit(&sc->sc_slock);
		return 0;
	}

	for (scp = sc->sc_cache; scp < scl; scp++) {
		if (scp->fc_type == FSS_CACHE_VALID) {
			if (scp->fc_cluster == cl) {
				mutex_exit(&sc->sc_slock);
				return 0;
			}
		} else if (scp->fc_type == FSS_CACHE_BUSY) {
			if (scp->fc_cluster == cl) {
				cv_wait(&scp->fc_state_cv, &sc->sc_slock);
				goto restart;
			}
		}
	}

	for (scp = sc->sc_cache; scp < scl; scp++)
		if (scp->fc_type == FSS_CACHE_FREE) {
			scp->fc_type = FSS_CACHE_BUSY;
			scp->fc_cluster = cl;
			break;
		}
	if (scp >= scl) {
		cv_wait(&sc->sc_cache_cv, &sc->sc_slock);
		goto restart;
	}

	mutex_exit(&sc->sc_slock);

	/*
	 * Start the read.
	 */
	dblk = btodb(FSS_CLTOB(sc, cl));
	if (cl == sc->sc_clcount-1) {
		todo = sc->sc_clresid;
		memset((char *)scp->fc_data + todo, 0, FSS_CLSIZE(sc) - todo);
	} else
		todo = FSS_CLSIZE(sc);
	offset = 0;
	mbp = getiobuf(NULL, true);
	mbp->b_bufsize = todo;
	mbp->b_data = scp->fc_data;
	mbp->b_resid = mbp->b_bcount = todo;
	mbp->b_flags = B_READ;
	mbp->b_cflags = BC_BUSY;
	mbp->b_dev = sc->sc_bdev;
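
	/*
	 * Issue the transfer in MAXPHYS-sized chunks.  A transfer that
	 * fits in one chunk uses the master buffer directly; otherwise
	 * each chunk gets a nested buffer that completes mbp through
	 * nestiobuf_setup().
	 */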
	while (todo > 0) {
		len = todo;
		if (len > MAXPHYS)
			len = MAXPHYS;
		if (btodb(FSS_CLTOB(sc, cl)) == dblk && len == todo)
			bp = mbp;
		else {
			bp = getiobuf(NULL, true);
			nestiobuf_setup(mbp, bp, offset, len);
		}
		bp->b_lblkno = 0;
		bp->b_blkno = dblk;
		bdev_strategy(bp);
		dblk += btodb(len);
		offset += len;
		todo -= len;
	}
	error = biowait(mbp);
	if (error == 0 && mbp->b_resid != 0)
		error = EIO;
	putiobuf(mbp);

	mutex_enter(&sc->sc_slock);
	scp->fc_type = (error ? FSS_CACHE_FREE : FSS_CACHE_VALID);
	cv_broadcast(&scp->fc_state_cv);
	if (error == 0) {
		setbit(sc->sc_copied, scp->fc_cluster);
		cv_signal(&sc->sc_work_cv);
	}
	mutex_exit(&sc->sc_slock);

	return error;
}

/*
 * Read/write clusters from/to the backing store.
 * For persistent snapshots this must be called with cl == 0; off is
 * the offset into the snapshot.
 */
static int
fss_bs_io(struct fss_softc *sc, fss_io_type rw,
    u_int32_t cl, off_t off, int len, void *data, size_t *resid)
{
	int error;

	off += FSS_CLTOB(sc, cl);

	vn_lock(sc->sc_bs_vp, LK_EXCLUSIVE|LK_RETRY);

	error = vn_rdwr((rw == FSS_READ ? UIO_READ : UIO_WRITE), sc->sc_bs_vp,
	    data, len, off, UIO_SYSSPACE,
	    IO_ADV_ENCODE(POSIX_FADV_NOREUSE) | IO_NODELOCKED,
	    sc->sc_bs_lwp->l_cred, resid, NULL);
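
	/*
	 * Flush and free the pages covered by this transfer right
	 * away so snapshot I/O does not leave backing store pages
	 * cached in memory.
	 */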
	if (error == 0) {
		rw_enter(sc->sc_bs_vp->v_uobj.vmobjlock, RW_WRITER);
		error = VOP_PUTPAGES(sc->sc_bs_vp, trunc_page(off),
		    round_page(off+len), PGO_CLEANIT | PGO_FREE | PGO_SYNCIO);
	}

	VOP_UNLOCK(sc->sc_bs_vp);

	return error;
}

/*
 * Get a pointer to the indirect slot for this cluster.
 */
static u_int32_t *
fss_bs_indir(struct fss_softc *sc, u_int32_t cl)
{
	u_int32_t icl;
	int ioff;

	icl = cl/(FSS_CLSIZE(sc)/sizeof(u_int32_t));
	ioff = cl%(FSS_CLSIZE(sc)/sizeof(u_int32_t));
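
	/*
	 * Example: with 64 KiB clusters one indirect cluster holds
	 * 65536 / 4 = 16384 slots, so cluster 20000 maps to slot 3616
	 * of indirect cluster 1.
	 */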

	if (sc->sc_indir_cur == icl)
		return &sc->sc_indir_data[ioff];

	if (sc->sc_indir_dirty) {
		if (fss_bs_io(sc, FSS_WRITE, sc->sc_indir_cur, 0,
		    FSS_CLSIZE(sc), (void *)sc->sc_indir_data, NULL) != 0)
			return NULL;
		setbit(sc->sc_indir_valid, sc->sc_indir_cur);
	}

	sc->sc_indir_dirty = 0;
	sc->sc_indir_cur = icl;

	if (isset(sc->sc_indir_valid, sc->sc_indir_cur)) {
		if (fss_bs_io(sc, FSS_READ, sc->sc_indir_cur, 0,
		    FSS_CLSIZE(sc), (void *)sc->sc_indir_data, NULL) != 0)
			return NULL;
	} else
		memset(sc->sc_indir_data, 0, FSS_CLSIZE(sc));

	return &sc->sc_indir_data[ioff];
}

/*
 * The kernel thread (one for every active snapshot).
 *
 * After wakeup it cleans the cache and runs the I/O requests.
 */
static void
fss_bs_thread(void *arg)
{
	bool thread_idle, is_valid;
	int error, i, todo, len, crotor, is_read;
	long off;
	char *addr;
	u_int32_t c, cl, ch, *indirp;
	size_t resid;
	struct buf *bp, *nbp;
	struct fss_softc *sc;
	struct fss_cache *scp, *scl;

	sc = arg;
	scl = sc->sc_cache+sc->sc_cache_size;
	crotor = 0;
	thread_idle = false;

	mutex_enter(&sc->sc_slock);

	for (;;) {
		if (thread_idle)
			cv_wait(&sc->sc_work_cv, &sc->sc_slock);
		thread_idle = true;
		if ((sc->sc_flags & FSS_BS_THREAD) == 0) {
			mutex_exit(&sc->sc_slock);
			kthread_exit(0);
		}

		/*
		 * Process I/O requests (persistent)
		 */

		if (sc->sc_flags & FSS_PERSISTENT) {
			if ((bp = bufq_get(sc->sc_bufq)) == NULL)
				continue;
			is_valid = (sc->sc_state == FSS_ACTIVE);
			is_read = (bp->b_flags & B_READ);
			thread_idle = false;
			mutex_exit(&sc->sc_slock);

			if (is_valid) {
				disk_busy(sc->sc_dkdev);
				error = fss_bs_io(sc, FSS_READ, 0,
				    dbtob(bp->b_blkno), bp->b_bcount,
				    bp->b_data, &resid);
				if (error)
					resid = bp->b_bcount;
				disk_unbusy(sc->sc_dkdev,
				    (error ? 0 : bp->b_bcount), is_read);
			} else {
				error = ENXIO;
				resid = bp->b_bcount;
			}

			bp->b_error = error;
			bp->b_resid = resid;
			biodone(bp);

			mutex_enter(&sc->sc_slock);
			continue;
		}

		/*
		 * Clean the cache: write at most one valid cluster per
		 * pass to the backing store, scanning round-robin from
		 * crotor, and record its location in the indirect table.
		 */
		for (i = 0; i < sc->sc_cache_size; i++) {
			crotor = (crotor + 1) % sc->sc_cache_size;
			scp = sc->sc_cache + crotor;
			if (scp->fc_type != FSS_CACHE_VALID)
				continue;
			mutex_exit(&sc->sc_slock);

			thread_idle = false;
			indirp = fss_bs_indir(sc, scp->fc_cluster);
			if (indirp != NULL) {
				error = fss_bs_io(sc, FSS_WRITE, sc->sc_clnext,
				    0, FSS_CLSIZE(sc), scp->fc_data, NULL);
			} else
				error = EIO;

			mutex_enter(&sc->sc_slock);
			if (error == 0) {
				*indirp = sc->sc_clnext++;
				sc->sc_indir_dirty = 1;
			} else
				fss_error(sc, "write error on backing store");

			scp->fc_type = FSS_CACHE_FREE;
			cv_broadcast(&sc->sc_cache_cv);
			break;
		}

		/*
		 * Process I/O requests
		 */
		if ((bp = bufq_get(sc->sc_bufq)) == NULL)
			continue;
		is_valid = (sc->sc_state == FSS_ACTIVE);
		is_read = (bp->b_flags & B_READ);
		thread_idle = false;

		if (!is_valid) {
			mutex_exit(&sc->sc_slock);

			bp->b_error = ENXIO;
			bp->b_resid = bp->b_bcount;
			biodone(bp);

			mutex_enter(&sc->sc_slock);
			continue;
		}

		disk_busy(sc->sc_dkdev);

		/*
		 * First read from the snapshotted block device unless
		 * this request is completely covered by backing store.
		 */

		cl = FSS_BTOCL(sc, dbtob(bp->b_blkno));
		off = FSS_CLOFF(sc, dbtob(bp->b_blkno));
		ch = FSS_BTOCL(sc, dbtob(bp->b_blkno)+bp->b_bcount-1);
		error = 0;
		bp->b_resid = 0;
		bp->b_error = 0;
		for (c = cl; c <= ch; c++) {
			if (isset(sc->sc_copied, c))
				continue;
			mutex_exit(&sc->sc_slock);

			/* Not on backing store, read from device. */
			nbp = getiobuf(NULL, true);
			nbp->b_flags = B_READ | (bp->b_flags & B_PHYS);
			nbp->b_resid = nbp->b_bcount = bp->b_bcount;
			nbp->b_bufsize = bp->b_bcount;
			nbp->b_data = bp->b_data;
			nbp->b_blkno = bp->b_blkno;
			nbp->b_lblkno = 0;
			nbp->b_dev = sc->sc_bdev;
			SET(nbp->b_cflags, BC_BUSY);	/* mark buffer busy */

			bdev_strategy(nbp);

			error = biowait(nbp);
			if (error == 0 && nbp->b_resid != 0)
				error = EIO;
			if (error != 0) {
				bp->b_resid = bp->b_bcount;
				bp->b_error = nbp->b_error;
				disk_unbusy(sc->sc_dkdev, 0, is_read);
				biodone(bp);
			}
			putiobuf(nbp);

			mutex_enter(&sc->sc_slock);
			break;
		}
		if (error)
			continue;

		/*
		 * Replace those parts that have been saved to backing store.
		 */

		addr = bp->b_data;
		todo = bp->b_bcount;
		for (c = cl; c <= ch; c++, off = 0, todo -= len, addr += len) {
			len = FSS_CLSIZE(sc)-off;
			if (len > todo)
				len = todo;
			if (isclr(sc->sc_copied, c))
				continue;
			mutex_exit(&sc->sc_slock);

			indirp = fss_bs_indir(sc, c);
			if (indirp == NULL || *indirp == 0) {
				/*
				 * Not on backing store. Either in cache
				 * or hole in the snapshotted block device.
				 */

				mutex_enter(&sc->sc_slock);
				for (scp = sc->sc_cache; scp < scl; scp++)
					if (scp->fc_type == FSS_CACHE_VALID &&
					    scp->fc_cluster == c)
						break;
				if (scp < scl)
					memcpy(addr, (char *)scp->fc_data+off,
					    len);
				else
					memset(addr, 0, len);
				continue;
			}

			/*
			 * Read from backing store.
			 */
			error = fss_bs_io(sc, FSS_READ,
			    *indirp, off, len, addr, NULL);

			mutex_enter(&sc->sc_slock);
			if (error) {
				bp->b_resid = bp->b_bcount;
				bp->b_error = error;
				break;
			}
		}
		mutex_exit(&sc->sc_slock);

		disk_unbusy(sc->sc_dkdev, (error ? 0 : bp->b_bcount), is_read);
		biodone(bp);

		mutex_enter(&sc->sc_slock);
	}
}

#ifdef _MODULE

#include <sys/module.h>

MODULE(MODULE_CLASS_DRIVER, fss, "bufq_fcfs");
CFDRIVER_DECL(fss, DV_DISK, NULL);

devmajor_t fss_bmajor = -1, fss_cmajor = -1;

static int
fss_modcmd(modcmd_t cmd, void *arg)
{
	int error = 0;

	switch (cmd) {
	case MODULE_CMD_INIT:
		mutex_init(&fss_device_lock, MUTEX_DEFAULT, IPL_NONE);
		cv_init(&fss_device_cv, "snapwait");

		error = devsw_attach(fss_cd.cd_name,
		    &fss_bdevsw, &fss_bmajor, &fss_cdevsw, &fss_cmajor);
		if (error) {
			cv_destroy(&fss_device_cv);
			mutex_destroy(&fss_device_lock);
			break;
		}

		error = config_cfdriver_attach(&fss_cd);
		if (error) {
			devsw_detach(&fss_bdevsw, &fss_cdevsw);
			cv_destroy(&fss_device_cv);
			mutex_destroy(&fss_device_lock);
			break;
		}

		error = config_cfattach_attach(fss_cd.cd_name, &fss_ca);
		if (error) {
			config_cfdriver_detach(&fss_cd);
			devsw_detach(&fss_bdevsw, &fss_cdevsw);
			cv_destroy(&fss_device_cv);
			mutex_destroy(&fss_device_lock);
			break;
		}

		break;

	case MODULE_CMD_FINI:
		error = config_cfattach_detach(fss_cd.cd_name, &fss_ca);
		if (error) {
			break;
		}
		error = config_cfdriver_detach(&fss_cd);
		if (error) {
			config_cfattach_attach(fss_cd.cd_name, &fss_ca);
			break;
		}
		devsw_detach(&fss_bdevsw, &fss_cdevsw);
		cv_destroy(&fss_device_cv);
		mutex_destroy(&fss_device_lock);
		break;

	default:
		error = ENOTTY;
		break;
	}

	return error;
}

#endif /* _MODULE */
   1439