Home | History | Annotate | Line # | Download | only in kern
vfs_trans.c revision 1.71
      1 /*	$NetBSD: vfs_trans.c,v 1.71 2024/12/07 02:11:42 riastradh Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2007, 2020 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Juergen Hannken-Illjes.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 #include <sys/cdefs.h>
     33 __KERNEL_RCSID(0, "$NetBSD: vfs_trans.c,v 1.71 2024/12/07 02:11:42 riastradh Exp $");
     34 
     35 /*
     36  * File system transaction operations.
     37  */
     38 
     39 #ifdef _KERNEL_OPT
     40 #include "opt_ddb.h"
     41 #endif
     42 
     43 #include <sys/param.h>
     44 #include <sys/types.h>
     45 
     46 #include <sys/atomic.h>
     47 #include <sys/buf.h>
     48 #include <sys/fstrans.h>
     49 #include <sys/hash.h>
     50 #include <sys/kmem.h>
     51 #include <sys/mount.h>
     52 #include <sys/pool.h>
     53 #include <sys/proc.h>
     54 #include <sys/pserialize.h>
     55 #include <sys/systm.h>
     56 #include <sys/vnode.h>
     57 
     58 #include <miscfs/deadfs/deadfs.h>
     59 #include <miscfs/specfs/specdev.h>
     60 
     61 #define FSTRANS_MOUNT_HASHSIZE	32	/* Size given to hashinit(). */
     62 
     63 enum fstrans_lock_type {
        	/* grant_lock() grants either type to the state owner. */
     64 	FSTRANS_LAZY,			/* Granted while not suspended */
     65 	FSTRANS_SHARED			/* Granted in normal state only */
     66 };
     67 
     68 struct fscow_handler {
     69 	LIST_ENTRY(fscow_handler) ch_list;	/* On fmi_cow_handler. */
     70 	int (*ch_func)(void *, struct buf *, bool); /* Run by fscow_run(). */
     71 	void *ch_arg;			/* First argument to ch_func. */
     72 };
     73 struct fstrans_lwp_info {
     74 	struct fstrans_lwp_info *fli_succ;	/* Next on lwp's l_fstrans. */
     75 	struct lwp *fli_self;		/* Owning lwp, NULL if unused. */
     76 	struct mount *fli_mount;	/* Mount, NULL if unused. */
     77 	struct fstrans_lwp_info *fli_alias;	/* Lower mount's entry. */
     78 	struct fstrans_mount_info *fli_mountinfo; /* Referenced mount info. */
     79 	int fli_trans_cnt;		/* Transaction nesting count. */
     80 	int fli_alias_cnt;		/* Entries aliasing this one. */
     81 	int fli_cow_cnt;		/* fscow_run() nesting count. */
     82 	enum fstrans_lock_type fli_lock_type;	/* Type of transaction. */
     83 	LIST_ENTRY(fstrans_lwp_info) fli_list;	/* On fstrans_fli_head. */
     84 };
     85 struct fstrans_mount_info {
     86 	enum fstrans_state fmi_state;	/* Set by fstrans_setstate(). */
     87 	unsigned int fmi_ref_cnt;	/* Info is freed at zero. */
     88 	bool fmi_gone;			/* Set by fstrans_unmount(). */
     89 	bool fmi_cow_change;		/* Cow handler list is changing. */
     90 	SLIST_ENTRY(fstrans_mount_info) fmi_hash; /* Hash chain linkage. */
     91 	LIST_HEAD(, fscow_handler) fmi_cow_handler; /* Cow handlers. */
     92 	struct mount *fmi_mount;	/* Mount this info belongs to. */
     93 	struct fstrans_mount_info *fmi_lower_info; /* Info of lower mount. */
     94 	struct lwp *fmi_owner;		/* Lwp that left normal state. */
     95 };
     96 SLIST_HEAD(fstrans_mount_hashhead, fstrans_mount_info);
     97 
     98 static kmutex_t vfs_suspend_lock	/* Serialize suspensions. */
     99     __cacheline_aligned;
    100 static kmutex_t fstrans_lock		/* Fstrans big lock. */
    101     __cacheline_aligned;
    102 static kcondvar_t fstrans_state_cv;	/* Fstrans or cow state changed. */
    103 static kcondvar_t fstrans_count_cv;	/* Fstrans or cow count changed. */
    104 static pserialize_t fstrans_psz;	/* Pserialize state. */
    105 static LIST_HEAD(fstrans_lwp_head, fstrans_lwp_info) fstrans_fli_head;
    106 					/* List of all fstrans_lwp_info. */
    107 static pool_cache_t fstrans_lwp_cache;	/* Cache of fstrans_lwp_info. */
    108 
    109 static u_long fstrans_mount_hashmask;	/* Mask set by hashinit(). */
    110 static struct fstrans_mount_hashhead *fstrans_mount_hashtab;
        					/* Hash table of fstrans_mount_info. */
    111 static int fstrans_gone_count;		/* Number of fstrans_mount_info gone. */
    112 
    113 static inline uint32_t fstrans_mount_hash(struct mount *);
    114 static inline struct fstrans_mount_info *fstrans_mount_get(struct mount *);
    115 static void fstrans_mount_dtor(struct fstrans_mount_info *);
    116 static void fstrans_clear_lwp_info(void);
    117 static inline struct fstrans_lwp_info *
    118     fstrans_get_lwp_info(struct mount *, bool);
    119 static struct fstrans_lwp_info *fstrans_alloc_lwp_info(struct mount *);
    120 static int fstrans_lwp_pcc(void *, void *, int);
    121 static void fstrans_lwp_pcd(void *, void *);
    122 static inline int _fstrans_start(struct mount *, enum fstrans_lock_type, int);
    123 static bool grant_lock(const struct fstrans_mount_info *,
    124     const enum fstrans_lock_type);
    125 static bool state_change_done(const struct fstrans_mount_info *);
    126 static bool cow_state_change_done(const struct fstrans_mount_info *);
    127 static void cow_change_enter(struct fstrans_mount_info *);
    128 static void cow_change_done(struct fstrans_mount_info *);
    129 
    130 /*
    131  * Initialize the file system transaction subsystem.
         * Sets up both mutexes, both condition variables, the pserialize
         * state, the global list and pool cache of fstrans_lwp_info and the
         * hash table of fstrans_mount_info.
    132  */
    133 void
    134 fstrans_init(void)
    135 {
    136 
    137 	mutex_init(&vfs_suspend_lock, MUTEX_DEFAULT, IPL_NONE);
    138 	mutex_init(&fstrans_lock, MUTEX_DEFAULT, IPL_NONE);
    139 	cv_init(&fstrans_state_cv, "fstchg");
    140 	cv_init(&fstrans_count_cv, "fstcnt");
    141 	fstrans_psz = pserialize_create();
    142 	LIST_INIT(&fstrans_fli_head);
        	/* The cache constructor and destructor maintain fstrans_fli_head. */
    143 	fstrans_lwp_cache = pool_cache_init(sizeof(struct fstrans_lwp_info),
    144 	    coherency_unit, 0, 0, "fstlwp", NULL, IPL_NONE,
    145 	    fstrans_lwp_pcc, fstrans_lwp_pcd, NULL);
    146 	KASSERT(fstrans_lwp_cache != NULL);
    147 	fstrans_mount_hashtab = hashinit(FSTRANS_MOUNT_HASHSIZE, HASH_SLIST,
    148 	    true, &fstrans_mount_hashmask);
    149 }
    150 
    151 /*
    152  * pool_cache constructor for fstrans_lwp_info.  Updating the global list
    153  * produces cache misses on MP.  Minimise by keeping free entries on list.
         *
         * The object is zeroed and linked onto fstrans_fli_head under
         * fstrans_lock.  The arg and flags parameters are not used.
         * Always returns 0.
    154  */
    155 int
    156 fstrans_lwp_pcc(void *arg, void *obj, int flags)
    157 {
    158 	struct fstrans_lwp_info *fli = obj;
    159 
        	/* Users of the cache assert that a fresh entry is all zero. */
    160 	memset(fli, 0, sizeof(*fli));
    161 
    162 	mutex_enter(&fstrans_lock);
    163 	LIST_INSERT_HEAD(&fstrans_fli_head, fli, fli_list);
    164 	mutex_exit(&fstrans_lock);
    165 
    166 	return 0;
    167 }
    168 
    169 /*
    170  * pool_cache destructor for fstrans_lwp_info.
         * Unlinks the object from fstrans_fli_head under fstrans_lock.
         * The arg parameter is not used.
    171  */
    172 void
    173 fstrans_lwp_pcd(void *arg, void *obj)
    174 {
    175 	struct fstrans_lwp_info *fli = obj;
    176 
    177 	mutex_enter(&fstrans_lock);
    178 	LIST_REMOVE(fli, fli_list);
    179 	mutex_exit(&fstrans_lock);
    180 }
    181 
    182 /*
    183  * Deallocate lwp state.
         * Every fstrans_lwp_info on the list of lwp l has its mount info
         * reference dropped and its fields reset under fstrans_lock; the
         * entries are then handed back to the pool cache and l->l_fstrans
         * is cleared.  No transaction or cow run may be active.
    184  */
    185 void
    186 fstrans_lwp_dtor(lwp_t *l)
    187 {
    188 	struct fstrans_lwp_info *fli, *fli_next;
    189 
    190 	if (l->l_fstrans == NULL)
    191 		return;
    192 
    193 	mutex_enter(&fstrans_lock);
    194 	for (fli = l->l_fstrans; fli; fli = fli_next) {
    195 		KASSERT(fli->fli_trans_cnt == 0);
    196 		KASSERT(fli->fli_cow_cnt == 0);
    197 		KASSERT(fli->fli_self == l);
    198 		if (fli->fli_mount != NULL)
    199 			fstrans_mount_dtor(fli->fli_mountinfo);
    200 		fli_next = fli->fli_succ;
        		/* Reset to the state expected of a free cache entry. */
    201 		fli->fli_alias_cnt = 0;
    202 		fli->fli_mount = NULL;
    203 		fli->fli_alias = NULL;
    204 		fli->fli_mountinfo = NULL;
    205 		fli->fli_self = NULL;
    206 	}
    207 	mutex_exit(&fstrans_lock);
    208 
        	/*
        	 * fli_succ was left intact above, so the list can be walked a
        	 * second time, now without holding fstrans_lock.
        	 */
    209 	for (fli = l->l_fstrans; fli; fli = fli_next) {
    210 		fli_next = fli->fli_succ;
    211 		pool_cache_put(fstrans_lwp_cache, fli);
    212 	}
    213 	l->l_fstrans = NULL;
    214 }
    215 
    216 /*
    217  * mount pointer to hash
    218  */
    219 static inline uint32_t
    220 fstrans_mount_hash(struct mount *mp)
    221 {
    222 
    223 	return hash32_buf(&mp, sizeof(mp), HASH32_BUF_INIT) &
    224 	    fstrans_mount_hashmask;
    225 }
    226 
    227 /*
    228  * retrieve fstrans_mount_info by mount or NULL
         *
         * Must be called with fstrans_lock held.  If the mount has a lower
         * mount and the lower mount info is not yet recorded, it is looked up
         * (following one more level of fmi_lower_info), referenced and stored.
         * Returns NULL if mp has no entry or the lower mount info is missing.
         * No reference is taken on the returned info itself.
    229  */
    230 static inline struct fstrans_mount_info *
    231 fstrans_mount_get(struct mount *mp)
    232 {
    233 	uint32_t indx;
    234 	struct fstrans_mount_info *fmi, *fmi_lower;
    235 
    236 	KASSERT(mutex_owned(&fstrans_lock));
    237 
    238 	indx = fstrans_mount_hash(mp);
    239 	SLIST_FOREACH(fmi, &fstrans_mount_hashtab[indx], fmi_hash) {
    240 		if (fmi->fmi_mount == mp) {
    241 			if (__predict_false(mp->mnt_lower != NULL &&
    242 			    fmi->fmi_lower_info == NULL)) {
    243 				/*
    244 				 * Intern the lower/lowest mount into
    245 				 * this mount info on first lookup.
    246 				 */
    247 				KASSERT(fmi->fmi_ref_cnt == 1);
    248 
    249 				fmi_lower = fstrans_mount_get(mp->mnt_lower);
        				/* Prefer the info the lower mount points at. */
    250 				if (fmi_lower && fmi_lower->fmi_lower_info)
    251 					fmi_lower = fmi_lower->fmi_lower_info;
    252 				if (fmi_lower == NULL)
    253 					return NULL;
    254 				fmi->fmi_lower_info = fmi_lower;
    255 				fmi->fmi_lower_info->fmi_ref_cnt += 1;
    256 			}
    257 			return fmi;
    258 		}
    259 	}
    260 
    261 	return NULL;
    262 }
    263 
    264 /*
    265  * Dereference mount state.
         *
         * Must be called with fstrans_lock held.  Drops one reference; when
         * the last one goes, the reference on the lower mount info (if any)
         * is dropped too, fstrans_gone_count is decremented, and both the
         * struct mount and the mount info are freed.
    266  */
    267 static void
    268 fstrans_mount_dtor(struct fstrans_mount_info *fmi)
    269 {
    270 
    271 	KASSERT(mutex_owned(&fstrans_lock));
    272 
    273 	KASSERT(fmi != NULL);
    274 	fmi->fmi_ref_cnt -= 1;
    275 	if (__predict_true(fmi->fmi_ref_cnt > 0)) {
    276 		return;
    277 	}
    278 
        	/* Last reference: the info must be idle and without handlers. */
    279 	KASSERT(fmi->fmi_state == FSTRANS_NORMAL);
    280 	KASSERT(LIST_FIRST(&fmi->fmi_cow_handler) == NULL);
    281 	KASSERT(fmi->fmi_owner == NULL);
    282 
    283 	if (fmi->fmi_lower_info)
    284 		fstrans_mount_dtor(fmi->fmi_lower_info);
    285 
        	/* fstrans_unmount() counted this info as gone. */
    286 	KASSERT(fstrans_gone_count > 0);
    287 	fstrans_gone_count -= 1;
    288 
    289 	KASSERT(fmi->fmi_mount->mnt_lower == NULL);
    290 
        	/* The struct mount itself is released here, with its info. */
    291 	kmem_free(fmi->fmi_mount, sizeof(*fmi->fmi_mount));
    292 	kmem_free(fmi, sizeof(*fmi));
    293 }
    294 
    295 /*
    296  * Allocate mount state.
    297  */
    298 int
    299 fstrans_mount(struct mount *mp)
    300 {
    301 	uint32_t indx;
    302 	struct fstrans_mount_info *newfmi;
    303 
    304 	indx = fstrans_mount_hash(mp);
    305 
    306 	newfmi = kmem_alloc(sizeof(*newfmi), KM_SLEEP);
    307 	newfmi->fmi_state = FSTRANS_NORMAL;
    308 	newfmi->fmi_ref_cnt = 1;
    309 	newfmi->fmi_gone = false;
    310 	LIST_INIT(&newfmi->fmi_cow_handler);
    311 	newfmi->fmi_cow_change = false;
    312 	newfmi->fmi_mount = mp;
    313 	newfmi->fmi_lower_info = NULL;
    314 	newfmi->fmi_owner = NULL;
    315 
    316 	mutex_enter(&fstrans_lock);
    317 	SLIST_INSERT_HEAD(&fstrans_mount_hashtab[indx], newfmi, fmi_hash);
    318 	mutex_exit(&fstrans_lock);
    319 
    320 	return 0;
    321 }
    322 
    323 /*
    324  * Deallocate mount state.
         * Marks the mount info gone, removes it from the hash table and drops
         * one reference.  The info stays allocated until the remaining
         * references are dropped, see fstrans_mount_dtor().
    325  */
    326 void
    327 fstrans_unmount(struct mount *mp)
    328 {
    329 	uint32_t indx;
    330 	struct fstrans_mount_info *fmi;
    331 
    332 	indx = fstrans_mount_hash(mp);
    333 
    334 	mutex_enter(&fstrans_lock);
    335 	fmi = fstrans_mount_get(mp);
    336 	KASSERT(fmi != NULL);
    337 	fmi->fmi_gone = true;
    338 	SLIST_REMOVE(&fstrans_mount_hashtab[indx],
    339 	    fmi, fstrans_mount_info, fmi_hash);
        	/* Must be counted before the dtor, which may decrement it. */
    340 	fstrans_gone_count += 1;
    341 	fstrans_mount_dtor(fmi);
    342 	mutex_exit(&fstrans_lock);
    343 }
    344 
    345 /*
    346  * Clear mount entries whose mount is gone.
         * Works on the list of the current lwp only.  An entry is removed if
         * its mount info is marked gone and it has no transaction, cow run or
         * alias user.  Removed entries go back to the pool cache after
         * fstrans_lock has been released.
    347  */
    348 static void
    349 fstrans_clear_lwp_info(void)
    350 {
    351 	struct fstrans_lwp_info **p, *fli, *tofree = NULL;
    352 
    353 	/*
    354 	 * Scan our list clearing entries whose mount is gone.
    355 	 */
    356 	mutex_enter(&fstrans_lock);
    357 	for (p = &curlwp->l_fstrans; *p; ) {
    358 		fli = *p;
    359 		if (fli->fli_mount != NULL &&
    360 		    fli->fli_mountinfo->fmi_gone &&
    361 		    fli->fli_trans_cnt == 0 &&
    362 		    fli->fli_cow_cnt == 0 &&
    363 		    fli->fli_alias_cnt == 0) {
        			/* Unlink, then drop the mount info reference. */
    364 			*p = (*p)->fli_succ;
    365 			fstrans_mount_dtor(fli->fli_mountinfo);
    366 			if (fli->fli_alias) {
    367 				KASSERT(fli->fli_alias->fli_alias_cnt > 0);
    368 				fli->fli_alias->fli_alias_cnt--;
    369 			}
    370 			fli->fli_mount = NULL;
    371 			fli->fli_alias = NULL;
    372 			fli->fli_mountinfo = NULL;
    373 			fli->fli_self = NULL;
        			/*
        			 * Restart from the head of the list; presumably
        			 * because the alias count change above may make
        			 * another entry removable.
        			 */
    374 			p = &curlwp->l_fstrans;
        			/* Chain onto the private to-free list via fli_succ. */
    375 			fli->fli_succ = tofree;
    376 			tofree = fli;
    377 		} else {
    378 			p = &(*p)->fli_succ;
    379 		}
    380 	}
    381 #ifdef DIAGNOSTIC
    382 	for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ)
    383 		if (fli->fli_alias != NULL)
    384 			KASSERT(fli->fli_alias->fli_self == curlwp);
    385 #endif /* DIAGNOSTIC */
    386 	mutex_exit(&fstrans_lock);
    387 
    388 	while (tofree != NULL) {
    389 		fli = tofree;
    390 		tofree = fli->fli_succ;
    391 		pool_cache_put(fstrans_lwp_cache, fli);
    392 	}
    393 }
    394 
    395 /*
    396  * Allocate and return per lwp info for this mount.
         *
         * An existing entry for mp on the current lwp's list is returned as
         * is.  Otherwise the mount info is looked up and referenced, the
         * entry for the lower mount (if any) is obtained recursively, and a
         * new entry is put at the head of the list.  When a lower entry
         * exists the new entry aliases it and the lower entry is returned.
         * Returns NULL if no mount info is found for mp or the lower mount.
    397  */
    398 static struct fstrans_lwp_info *
    399 fstrans_alloc_lwp_info(struct mount *mp)
    400 {
    401 	struct fstrans_lwp_info *fli, *fli_lower;
    402 	struct fstrans_mount_info *fmi;
    403 
    404 	for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ) {
    405 		if (fli->fli_mount == mp)
    406 			return fli;
    407 	}
    408 
    409 	/*
    410 	 * Lookup mount info and get lower mount per lwp info.
    411 	 */
    412 	mutex_enter(&fstrans_lock);
    413 	fmi = fstrans_mount_get(mp);
    414 	if (fmi == NULL) {
    415 		mutex_exit(&fstrans_lock);
    416 		return NULL;
    417 	}
        	/* This reference is owned by the entry created below. */
    418 	fmi->fmi_ref_cnt += 1;
    419 	mutex_exit(&fstrans_lock);
    420 
    421 	if (fmi->fmi_lower_info) {
    422 		fli_lower =
    423 		    fstrans_alloc_lwp_info(fmi->fmi_lower_info->fmi_mount);
    424 		if (fli_lower == NULL) {
        			/* Give back the reference taken above. */
    425 			mutex_enter(&fstrans_lock);
    426 			fstrans_mount_dtor(fmi);
    427 			mutex_exit(&fstrans_lock);
    428 
    429 			return NULL;
    430 		}
    431 	} else {
    432 		fli_lower = NULL;
    433 	}
    434 
    435 	/*
    436 	 * Allocate a new entry.
    437 	 */
    438 	fli = pool_cache_get(fstrans_lwp_cache, PR_WAITOK);
    439 	KASSERT(fli->fli_trans_cnt == 0);
    440 	KASSERT(fli->fli_cow_cnt == 0);
    441 	KASSERT(fli->fli_alias_cnt == 0);
    442 	KASSERT(fli->fli_mount == NULL);
    443 	KASSERT(fli->fli_alias == NULL);
    444 	KASSERT(fli->fli_mountinfo == NULL);
    445 	KASSERT(fli->fli_self == NULL);
    446 
    447 	/*
    448 	 * Attach the mount info and alias.
    449 	 */
    450 
    451 	fli->fli_self = curlwp;
    452 	fli->fli_mount = mp;
    453 	fli->fli_mountinfo = fmi;
    454 
    455 	fli->fli_succ = curlwp->l_fstrans;
    456 	curlwp->l_fstrans = fli;
    457 
    458 	if (fli_lower) {
    459 		fli->fli_alias = fli_lower;
    460 		fli->fli_alias->fli_alias_cnt++;
        		/* Callers work on the lower entry, which holds the counts. */
    461 		fli = fli->fli_alias;
    462 	}
    463 
    464 	return fli;
    465 }
    466 
    467 /*
    468  * Retrieve the per lwp info for this mount allocating if necessary.
         * If the matching entry has an alias, the alias is returned instead.
         * Returns NULL if there is no entry and either do_alloc is false or
         * the allocation finds no mount info.
    469  */
    470 static inline struct fstrans_lwp_info *
    471 fstrans_get_lwp_info(struct mount *mp, bool do_alloc)
    472 {
    473 	struct fstrans_lwp_info *fli;
    474 
    475 	/*
    476 	 * Scan our list for a match.
    477 	 */
    478 	for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ) {
    479 		if (fli->fli_mount == mp) {
        			/* A mount with a lower mount must have an alias. */
    480 			KASSERT(mp->mnt_lower == NULL ||
    481 			    fli->fli_alias != NULL);
    482 			if (fli->fli_alias != NULL)
    483 				fli = fli->fli_alias;
    484 			break;
    485 		}
    486 	}
    487 
    488 	if (do_alloc) {
    489 		if (__predict_false(fli == NULL))
    490 			fli = fstrans_alloc_lwp_info(mp);
    491 	}
    492 
    493 	return fli;
    494 }
    495 
    496 /*
    497  * Check if this lock type is granted at this state.
    498  */
    499 static bool
    500 grant_lock(const struct fstrans_mount_info *fmi,
    501     const enum fstrans_lock_type type)
    502 {
    503 
    504 	if (__predict_true(fmi->fmi_state == FSTRANS_NORMAL))
    505 		return true;
    506 	if (fmi->fmi_owner == curlwp)
    507 		return true;
    508 	if  (fmi->fmi_state == FSTRANS_SUSPENDING && type == FSTRANS_LAZY)
    509 		return true;
    510 
    511 	return false;
    512 }
    513 
    514 /*
    515  * Start a transaction.  If this thread already has a transaction on this
    516  * file system increment the reference counter.
         *
         * Returns 0 on success, including the case where no per lwp info can
         * be obtained for mp, and EBUSY if the lock is not granted at once and
         * wait is zero.
    517  */
    518 static inline int
    519 _fstrans_start(struct mount *mp, enum fstrans_lock_type lock_type, int wait)
    520 {
    521 	int s;
    522 	struct fstrans_lwp_info *fli;
    523 	struct fstrans_mount_info *fmi;
    524 
    525 	ASSERT_SLEEPABLE();
    526 
    527 	fli = fstrans_get_lwp_info(mp, true);
    528 	if (fli == NULL)
    529 		return 0;
    530 	fmi = fli->fli_mountinfo;
    531 
        	/* Nested transaction: only count it, keeping the first type. */
    532 	if (fli->fli_trans_cnt > 0) {
    533 		fli->fli_trans_cnt += 1;
    534 
    535 		return 0;
    536 	}
    537 
        	/* Fast path: check the state inside a pserialize read section. */
    538 	s = pserialize_read_enter();
    539 	if (__predict_true(grant_lock(fmi, lock_type))) {
    540 		fli->fli_trans_cnt = 1;
    541 		fli->fli_lock_type = lock_type;
    542 		pserialize_read_exit(s);
    543 
    544 		return 0;
    545 	}
    546 	pserialize_read_exit(s);
    547 
    548 	if (! wait)
    549 		return EBUSY;
    550 
        	/* Slow path: sleep until fstrans_setstate() signals a change. */
    551 	mutex_enter(&fstrans_lock);
    552 	while (! grant_lock(fmi, lock_type))
    553 		cv_wait(&fstrans_state_cv, &fstrans_lock);
    554 	fli->fli_trans_cnt = 1;
    555 	fli->fli_lock_type = lock_type;
    556 	mutex_exit(&fstrans_lock);
    557 
    558 	return 0;
    559 }
    560 
    561 void
    562 fstrans_start(struct mount *mp)
    563 {
    564 	int error __diagused;
    565 
    566 	error = _fstrans_start(mp, FSTRANS_SHARED, 1);
    567 	KASSERT(error == 0);
    568 }
    569 
    570 int
    571 fstrans_start_nowait(struct mount *mp)
    572 {
    573 	/* Shared transaction; _fstrans_start() may return EBUSY. */
    574 	return _fstrans_start(mp, FSTRANS_SHARED, 0);
    575 }
    576 
    577 void
    578 fstrans_start_lazy(struct mount *mp)
    579 {
    580 	int error __diagused;
    581 
    582 	error = _fstrans_start(mp, FSTRANS_LAZY, 1);
    583 	KASSERT(error == 0);
    584 }
    585 
    586 /*
    587  * Finish a transaction.
         * Does nothing if the current lwp has no per lwp info for mp.  A
         * nested transaction only decrements the count.  Ending the outermost
         * one first clears entries of gone mounts, if there are any, and wakes
         * a waiter on fstrans_count_cv unless the state is normal.
    588  */
    589 void
    590 fstrans_done(struct mount *mp)
    591 {
    592 	int s;
    593 	struct fstrans_lwp_info *fli;
    594 	struct fstrans_mount_info *fmi;
    595 
    596 	fli = fstrans_get_lwp_info(mp, false);
    597 	if (fli == NULL)
    598 		return;
    599 	fmi = fli->fli_mountinfo;
    600 	KASSERT(fli->fli_trans_cnt > 0);
    601 
    602 	if (fli->fli_trans_cnt > 1) {
    603 		fli->fli_trans_cnt -= 1;
    604 
    605 		return;
    606 	}
    607 
        	/* This entry still has fli_trans_cnt == 1 and is not cleared. */
    608 	if (__predict_false(fstrans_gone_count > 0))
    609 		fstrans_clear_lwp_info();
    610 
        	/* Fast path: in normal state there is nobody to wake up. */
    611 	s = pserialize_read_enter();
    612 	if (__predict_true(fmi->fmi_state == FSTRANS_NORMAL)) {
    613 		fli->fli_trans_cnt = 0;
    614 		pserialize_read_exit(s);
    615 
    616 		return;
    617 	}
    618 	pserialize_read_exit(s);
    619 
        	/* fstrans_setstate() may be waiting for this count to drop. */
    620 	mutex_enter(&fstrans_lock);
    621 	fli->fli_trans_cnt = 0;
    622 	cv_signal(&fstrans_count_cv);
    623 	mutex_exit(&fstrans_lock);
    624 }
    625 
    626 /*
    627  * Check if we hold an lock.
    628  */
    629 int
    630 fstrans_held(struct mount *mp)
    631 {
    632 	struct fstrans_lwp_info *fli;
    633 	struct fstrans_mount_info *fmi;
    634 
    635 	KASSERT(mp != dead_rootmount);
    636 
    637 	fli = fstrans_get_lwp_info(mp, false);
    638 	if (fli == NULL)
    639 		return 0;
    640 	fmi = fli->fli_mountinfo;
    641 
    642 	return (fli->fli_trans_cnt > 0 || fmi->fmi_owner == curlwp);
    643 }
    644 
    645 /*
    646  * Check if this thread has an exclusive lock.
    647  */
    648 int
    649 fstrans_is_owner(struct mount *mp)
    650 {
    651 	struct fstrans_lwp_info *fli;
    652 	struct fstrans_mount_info *fmi;
    653 
    654 	KASSERT(mp != dead_rootmount);
    655 
    656 	fli = fstrans_get_lwp_info(mp, false);
    657 	if (fli == NULL)
    658 		return 0;
    659 	fmi = fli->fli_mountinfo;
    660 
    661 	return (fmi->fmi_owner == curlwp);
    662 }
    663 
    664 /*
    665  * True, if no thread is in a transaction not granted at the current state.
    666  */
    667 static bool
    668 state_change_done(const struct fstrans_mount_info *fmi)
    669 {
    670 	struct fstrans_lwp_info *fli;
    671 
    672 	KASSERT(mutex_owned(&fstrans_lock));
    673 
    674 	LIST_FOREACH(fli, &fstrans_fli_head, fli_list) {
    675 		if (fli->fli_mountinfo != fmi)
    676 			continue;
    677 		if (fli->fli_trans_cnt == 0)
    678 			continue;
    679 		if (fli->fli_self == curlwp)
    680 			continue;
    681 		if (grant_lock(fmi, fli->fli_lock_type))
    682 			continue;
    683 
    684 		return false;
    685 	}
    686 
    687 	return true;
    688 }
    689 
    690 /*
    691  * Set new file system state.
         *
         * Publishes the new state, then waits until no other lwp holds a
         * transaction that is not granted at this state.  If the wait is
         * interrupted the state is set back to normal and the error from
         * cv_wait_sig() is returned.  Returns ENOENT if no per lwp info can
         * be obtained for mp, and 0 if the state is unchanged or the change
         * completed.
    692  */
    693 int
    694 fstrans_setstate(struct mount *mp, enum fstrans_state new_state)
    695 {
    696 	int error;
    697 	enum fstrans_state old_state;
    698 	struct fstrans_lwp_info *fli;
    699 	struct fstrans_mount_info *fmi;
    700 
    701 	KASSERT(mp != dead_rootmount);
    702 
    703 	fli = fstrans_get_lwp_info(mp, true);
    704 	if (fli == NULL)
    705 		return ENOENT;
    706 	fmi = fli->fli_mountinfo;
    707 	old_state = fmi->fmi_state;
    708 	if (old_state == new_state)
    709 		return 0;
    710 
    711 	mutex_enter(&fstrans_lock);
    712 	fmi->fmi_state = new_state;
    713 	pserialize_perform(fstrans_psz);
    714 
    715 	/*
    716 	 * All threads see the new state now.
    717 	 * Wait for transactions invalid at this state to leave.
    718 	 */
    719 	error = 0;
    720 	while (! state_change_done(fmi)) {
    721 		error = cv_wait_sig(&fstrans_count_cv, &fstrans_lock);
    722 		if (error) {
        			/* Interrupted: give up and go back to normal. */
    723 			new_state = fmi->fmi_state = FSTRANS_NORMAL;
    724 			break;
    725 		}
    726 	}
        	/* The lwp leaving normal state owns it until normal again. */
    727 	if (old_state != new_state) {
    728 		if (old_state == FSTRANS_NORMAL) {
    729 			KASSERT(fmi->fmi_owner == NULL);
    730 			fmi->fmi_owner = curlwp;
    731 		}
    732 		if (new_state == FSTRANS_NORMAL) {
    733 			KASSERT(fmi->fmi_owner == curlwp);
    734 			fmi->fmi_owner = NULL;
    735 		}
    736 	}
        	/* Wake lwps sleeping in _fstrans_start(). */
    737 	cv_broadcast(&fstrans_state_cv);
    738 	mutex_exit(&fstrans_lock);
    739 
    740 	return error;
    741 }
    742 
    743 /*
    744  * Get current file system state.
    745  */
    746 enum fstrans_state
    747 fstrans_getstate(struct mount *mp)
    748 {
    749 	struct fstrans_lwp_info *fli;
    750 	struct fstrans_mount_info *fmi;
    751 
    752 	KASSERT(mp != dead_rootmount);
    753 
    754 	fli = fstrans_get_lwp_info(mp, true);
    755 	KASSERT(fli != NULL);
    756 	fmi = fli->fli_mountinfo;
    757 
    758 	return fmi->fmi_state;
    759 }
    760 
    761 /*
    762  * Request a filesystem to suspend all operations.
         *
         * On success (return 0) vfs_suspend_lock stays held; vfs_resume()
         * releases it.  Returns EOPNOTSUPP for dead_rootmount, ENOENT if no
         * per lwp info can be obtained or the mount is marked IMNT_GONE,
         * EWOULDBLOCK if nowait is set and vfs_suspend_lock is busy, or the
         * error from VFS_SUSPENDCTL().
    763  */
    764 int
    765 vfs_suspend(struct mount *mp, int nowait)
    766 {
    767 	struct fstrans_lwp_info *fli;
    768 	int error;
    769 
    770 	if (mp == dead_rootmount)
    771 		return EOPNOTSUPP;
    772 
    773 	fli = fstrans_get_lwp_info(mp, true);
    774 	if (fli == NULL)
    775 		return ENOENT;
    776 
    777 	if (nowait) {
    778 		if (!mutex_tryenter(&vfs_suspend_lock))
    779 			return EWOULDBLOCK;
    780 	} else
    781 		mutex_enter(&vfs_suspend_lock);
    782 
        	/* fli may be the alias, so this targets the lower mount. */
    783 	if ((error = VFS_SUSPENDCTL(fli->fli_mount, SUSPEND_SUSPEND)) != 0) {
    784 		mutex_exit(&vfs_suspend_lock);
    785 		return error;
    786 	}
    787 
        	/* vfs_resume() undoes the suspension and drops the lock. */
    788 	if ((mp->mnt_iflag & IMNT_GONE) != 0) {
    789 		vfs_resume(mp);
    790 		return ENOENT;
    791 	}
    792 
    793 	return 0;
    794 }
    795 
    796 /*
    797  * Request a filesystem to resume all operations.
    798  */
    799 void
    800 vfs_resume(struct mount *mp)
    801 {
    802 	struct fstrans_lwp_info *fli;
    803 
    804 	KASSERT(mp != dead_rootmount);
    805 
    806 	fli = fstrans_get_lwp_info(mp, false);
    807 	mp = fli->fli_mount;
    808 
    809 	VFS_SUSPENDCTL(mp, SUSPEND_RESUME);
    810 	mutex_exit(&vfs_suspend_lock);
    811 }
    812 
    813 /*
    814  * True, if no thread is running a cow handler.
    815  */
    816 static bool
    817 cow_state_change_done(const struct fstrans_mount_info *fmi)
    818 {
    819 	struct fstrans_lwp_info *fli;
    820 
    821 	KASSERT(mutex_owned(&fstrans_lock));
    822 	KASSERT(fmi->fmi_cow_change);
    823 
    824 	LIST_FOREACH(fli, &fstrans_fli_head, fli_list) {
    825 		if (fli->fli_mount != fmi->fmi_mount)
    826 			continue;
    827 		if (fli->fli_cow_cnt == 0)
    828 			continue;
    829 
    830 		return false;
    831 	}
    832 
    833 	return true;
    834 }
    835 
    836 /*
    837  * Prepare for changing this mount's cow list.
    838  * Returns with fstrans_lock locked.
         * The caller must finish with cow_change_done(), which unlocks it.
    839  */
    840 static void
    841 cow_change_enter(struct fstrans_mount_info *fmi)
    842 {
    843 
    844 	mutex_enter(&fstrans_lock);
    845 
    846 	/*
    847 	 * Wait for other threads changing the list.
    848 	 */
    849 	while (fmi->fmi_cow_change)
    850 		cv_wait(&fstrans_state_cv, &fstrans_lock);
    851 
    852 	/*
    853 	 * Wait until all threads are aware of a state change.
    854 	 */
    855 	fmi->fmi_cow_change = true;
    856 	pserialize_perform(fstrans_psz);
    857 
        	/* Wait for lwps still inside fscow_run() on this mount. */
    858 	while (! cow_state_change_done(fmi))
    859 		cv_wait(&fstrans_count_cv, &fstrans_lock);
    860 }
    861 
    862 /*
    863  * Done changing this mount's cow list.
         * Called with fstrans_lock held, as left by cow_change_enter();
         * clears the change flag, wakes waiters on fstrans_state_cv and
         * releases the lock.
    864  */
    865 static void
    866 cow_change_done(struct fstrans_mount_info *fmi)
    867 {
    868 
    869 	KASSERT(mutex_owned(&fstrans_lock));
    870 
    871 	fmi->fmi_cow_change = false;
    872 	pserialize_perform(fstrans_psz);
    873 
    874 	cv_broadcast(&fstrans_state_cv);
    875 
    876 	mutex_exit(&fstrans_lock);
    877 }
    878 
    879 /*
    880  * Add a handler to this mount.
         * The handler is called by fscow_run() as func(arg, bp, data_valid).
         * Takes a reference on the mount info, which fscow_disestablish()
         * drops again.  Always returns 0.
    881  */
    882 int
    883 fscow_establish(struct mount *mp, int (*func)(void *, struct buf *, bool),
    884     void *arg)
    885 {
    886 	struct fstrans_mount_info *fmi;
    887 	struct fscow_handler *newch;
    888 
    889 	KASSERT(mp != dead_rootmount);
    890 
    891 	mutex_enter(&fstrans_lock);
    892 	fmi = fstrans_mount_get(mp);
    893 	KASSERT(fmi != NULL);
    894 	fmi->fmi_ref_cnt += 1;
    895 	mutex_exit(&fstrans_lock);
    896 
        	/* Allocate before cow_change_enter() takes fstrans_lock. */
    897 	newch = kmem_alloc(sizeof(*newch), KM_SLEEP);
    898 	newch->ch_func = func;
    899 	newch->ch_arg = arg;
    900 
    901 	cow_change_enter(fmi);
    902 	LIST_INSERT_HEAD(&fmi->fmi_cow_handler, newch, ch_list);
    903 	cow_change_done(fmi);
    904 
    905 	return 0;
    906 }
    907 
    908 /*
    909  * Remove a handler from this mount.
    910  */
    911 int
    912 fscow_disestablish(struct mount *mp, int (*func)(void *, struct buf *, bool),
    913     void *arg)
    914 {
    915 	struct fstrans_mount_info *fmi;
    916 	struct fscow_handler *hp = NULL;
    917 
    918 	KASSERT(mp != dead_rootmount);
    919 
    920 	mutex_enter(&fstrans_lock);
    921 	fmi = fstrans_mount_get(mp);
    922 	KASSERT(fmi != NULL);
    923 	mutex_exit(&fstrans_lock);
    924 
    925 	cow_change_enter(fmi);
    926 	LIST_FOREACH(hp, &fmi->fmi_cow_handler, ch_list)
    927 		if (hp->ch_func == func && hp->ch_arg == arg)
    928 			break;
    929 	if (hp != NULL) {
    930 		LIST_REMOVE(hp, ch_list);
    931 		kmem_free(hp, sizeof(*hp));
    932 	}
    933 	fstrans_mount_dtor(fmi);
    934 	cow_change_done(fmi);
    935 
    936 	return hp ? 0 : EINVAL;
    937 }
    938 
    939 /*
    940  * Check for need to copy block that is about to be written.
         *
         * Runs the cow handlers of the mount the buffer belongs to, stopping
         * at the first error, and returns that error or 0.  B_COWDONE is set
         * on the buffer on success and whenever there is nothing to do.
         * Recursion within one lwp is counted in fli_cow_cnt.
    941  */
    942 int
    943 fscow_run(struct buf *bp, bool data_valid)
    944 {
    945 	int error, s;
    946 	struct mount *mp;
    947 	struct fstrans_lwp_info *fli;
    948 	struct fstrans_mount_info *fmi;
    949 	struct fscow_handler *hp;
    950 
    951 	/*
    952 	 * First check if we need run the copy-on-write handler.
    953 	 */
    954 	if ((bp->b_flags & B_COWDONE))
    955 		return 0;
    956 	if (bp->b_vp == NULL) {
    957 		bp->b_flags |= B_COWDONE;
    958 		return 0;
    959 	}
        	/* For a block device take the mount from the spec node. */
    960 	if (bp->b_vp->v_type == VBLK)
    961 		mp = spec_node_getmountedfs(bp->b_vp);
    962 	else
    963 		mp = bp->b_vp->v_mount;
    964 	if (mp == NULL || mp == dead_rootmount) {
    965 		bp->b_flags |= B_COWDONE;
    966 		return 0;
    967 	}
    968 
    969 	fli = fstrans_get_lwp_info(mp, true);
    970 	KASSERT(fli != NULL);
    971 	fmi = fli->fli_mountinfo;
    972 
    973 	/*
    974 	 * On non-recursed run check if other threads
    975 	 * want to change the list.
    976 	 */
    977 	if (fli->fli_cow_cnt == 0) {
    978 		s = pserialize_read_enter();
    979 		if (__predict_false(fmi->fmi_cow_change)) {
        			/* A list change is pending: wait for it to end. */
    980 			pserialize_read_exit(s);
    981 			mutex_enter(&fstrans_lock);
    982 			while (fmi->fmi_cow_change)
    983 				cv_wait(&fstrans_state_cv, &fstrans_lock);
    984 			fli->fli_cow_cnt = 1;
    985 			mutex_exit(&fstrans_lock);
    986 		} else {
    987 			fli->fli_cow_cnt = 1;
    988 			pserialize_read_exit(s);
    989 		}
    990 	} else
    991 		fli->fli_cow_cnt += 1;
    992 
    993 	/*
    994 	 * Run all copy-on-write handlers, stop on error.
    995 	 */
    996 	error = 0;
    997 	LIST_FOREACH(hp, &fmi->fmi_cow_handler, ch_list)
    998 		if ((error = (*hp->ch_func)(hp->ch_arg, bp, data_valid)) != 0)
    999 			break;
   1000  	if (error == 0)
   1001  		bp->b_flags |= B_COWDONE;
   1002 
   1003 	/*
   1004 	 * Check if other threads want to change the list.
   1005 	 */
   1006 	if (fli->fli_cow_cnt > 1) {
   1007 		fli->fli_cow_cnt -= 1;
   1008 	} else {
   1009 		s = pserialize_read_enter();
   1010 		if (__predict_false(fmi->fmi_cow_change)) {
        			/* cow_change_enter() may be waiting for this count. */
   1011 			pserialize_read_exit(s);
   1012 			mutex_enter(&fstrans_lock);
   1013 			fli->fli_cow_cnt = 0;
   1014 			cv_signal(&fstrans_count_cv);
   1015 			mutex_exit(&fstrans_lock);
   1016 		} else {
   1017 			fli->fli_cow_cnt = 0;
   1018 			pserialize_read_exit(s);
   1019 		}
   1020 	}
   1021 
   1022 	return error;
   1023 }
   1024 
   1025 #if defined(DDB)
   1026 void fstrans_dump(int);
   1027 
   1028 static void
   1029 fstrans_print_lwp(struct proc *p, struct lwp *l, int verbose)
   1030 {
        	/*
        	 * DDB helper: print one line for each fstrans_lwp_info owned by
        	 * lwp l.  Entries with neither a transaction nor a cow run are
        	 * printed only if verbose is set.
        	 */
   1031 	char prefix[9];
   1032 	struct fstrans_lwp_info *fli;
   1033 
   1034 	snprintf(prefix, sizeof(prefix), "%d.%d", p->p_pid, l->l_lid);
   1035 	LIST_FOREACH(fli, &fstrans_fli_head, fli_list) {
   1036 		if (fli->fli_self != l)
   1037 			continue;
   1038 		if (fli->fli_trans_cnt == 0 && fli->fli_cow_cnt == 0) {
   1039 			if (! verbose)
   1040 				continue;
   1041 		}
   1042 		printf("%-8s", prefix);
   1043 		if (verbose)
   1044 			printf(" @%p", fli);
   1045 		if (fli->fli_mount == dead_rootmount)
   1046 			printf(" <dead>");
   1047 		else if (fli->fli_mount != NULL)
   1048 			printf(" (%s)", fli->fli_mount->mnt_stat.f_mntonname);
   1049 		else
   1050 			printf(" NULL");
   1051 		if (fli->fli_alias != NULL) {
   1052 			struct mount *amp = fli->fli_alias->fli_mount;
   1053 
   1054 			printf(" alias");
   1055 			if (verbose)
   1056 				printf(" @%p", fli->fli_alias);
   1057 			if (amp == NULL)
   1058 				printf(" NULL");
   1059 			else
   1060 				printf(" (%s)", amp->mnt_stat.f_mntonname);
   1061 		}
   1062 		if (fli->fli_mountinfo && fli->fli_mountinfo->fmi_gone)
   1063 			printf(" gone");
   1064 		if (fli->fli_trans_cnt == 0) {
   1065 			printf(" -");
   1066 		} else {
   1067 			switch (fli->fli_lock_type) {
   1068 			case FSTRANS_LAZY:
   1069 				printf(" lazy");
   1070 				break;
   1071 			case FSTRANS_SHARED:
   1072 				printf(" shared");
   1073 				break;
   1074 			default:
   1075 				printf(" %#x", fli->fli_lock_type);
   1076 				break;
   1077 			}
   1078 		}
   1079 		printf(" %d cow %d alias %d\n",
   1080 		    fli->fli_trans_cnt, fli->fli_cow_cnt, fli->fli_alias_cnt);
        		/* Print the pid.lid prefix on the first line only. */
   1081 		prefix[0] = '\0';
   1082 	}
   1083 }
   1084 
   1085 static void
   1086 fstrans_print_mount(struct mount *mp, int verbose)
   1087 {
        	/*
        	 * DDB helper: print owner and state of the mount info of mp.
        	 * Unless verbose is set, mounts without mount info or in normal
        	 * state are skipped.
        	 */
   1088 	uint32_t indx;
   1089 	struct fstrans_mount_info *fmi;
   1090 
        	/* Open coded lookup; fstrans_mount_get() asserts fstrans_lock. */
   1091 	indx = fstrans_mount_hash(mp);
   1092 	SLIST_FOREACH(fmi, &fstrans_mount_hashtab[indx], fmi_hash)
   1093 		if (fmi->fmi_mount == mp)
   1094 			break;
   1095 
   1096 	if (!verbose && (fmi == NULL || fmi->fmi_state == FSTRANS_NORMAL))
   1097 		return;
   1098 
   1099 	printf("%-16s ", mp->mnt_stat.f_mntonname);
   1100 	if (fmi == NULL) {
   1101 		printf("(null)\n");
   1102 		return;
   1103 	}
   1104 	printf("owner %p ", fmi->fmi_owner);
   1105 	switch (fmi->fmi_state) {
   1106 	case FSTRANS_NORMAL:
   1107 		printf("state normal\n");
   1108 		break;
   1109 	case FSTRANS_SUSPENDING:
   1110 		printf("state suspending\n");
   1111 		break;
   1112 	case FSTRANS_SUSPENDED:
   1113 		printf("state suspended\n");
   1114 		break;
   1115 	default:
   1116 		printf("state %#x\n", fmi->fmi_state);
   1117 		break;
   1118 	}
   1119 }
   1120 
   1121 void
   1122 fstrans_dump(int full)
   1123 {
        	/*
        	 * DDB entry point: print the fstrans locks of every lwp of every
        	 * process, then the fstrans state of every mount.  The verbose
        	 * form is used only if full is exactly 1.
        	 */
   1124 	const struct proclist_desc *pd;
   1125 	struct proc *p;
   1126 	struct lwp *l;
   1127 	struct mount *mp;
   1128 
   1129 	printf("Fstrans locks by lwp:\n");
   1130 	for (pd = proclists; pd->pd_list != NULL; pd++)
   1131 		PROCLIST_FOREACH(p, pd->pd_list)
   1132 			LIST_FOREACH(l, &p->p_lwps, l_sibling)
   1133 				fstrans_print_lwp(p, l, full == 1);
   1134 
   1135 	printf("Fstrans state by mount:\n");
   1136 	for (mp = _mountlist_next(NULL); mp; mp = _mountlist_next(mp))
   1137 		fstrans_print_mount(mp, full == 1);
   1138 }
   1139 #endif /* defined(DDB) */
   1140