Home | History | Annotate | Line # | Download | only in kern
vfs_trans.c revision 1.63
      1 /*	$NetBSD: vfs_trans.c,v 1.63 2020/05/17 19:34:07 ad Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2007, 2020 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Juergen Hannken-Illjes.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 #include <sys/cdefs.h>
     33 __KERNEL_RCSID(0, "$NetBSD: vfs_trans.c,v 1.63 2020/05/17 19:34:07 ad Exp $");
     34 
     35 /*
     36  * File system transaction operations.
     37  */
     38 
     39 #ifdef _KERNEL_OPT
     40 #include "opt_ddb.h"
     41 #endif
     42 
     43 #include <sys/param.h>
     44 #include <sys/systm.h>
     45 #include <sys/atomic.h>
     46 #include <sys/buf.h>
     47 #include <sys/kmem.h>
     48 #include <sys/mount.h>
     49 #include <sys/pserialize.h>
     50 #include <sys/vnode.h>
     51 #include <sys/fstrans.h>
     52 #include <sys/proc.h>
     53 #include <sys/pool.h>
     54 
     55 #include <miscfs/specfs/specdev.h>
     56 
/*
 * Kind of transaction lock a thread may request on a mount.
 */
enum fstrans_lock_type {
	FSTRANS_LAZY = 0,		/* Granted while not suspended */
	FSTRANS_SHARED = 1		/* Granted while not suspending */
};
     61 
     62 struct fscow_handler {
     63 	LIST_ENTRY(fscow_handler) ch_list;
     64 	int (*ch_func)(void *, struct buf *, bool);
     65 	void *ch_arg;
     66 };
     67 struct fstrans_lwp_info {
     68 	struct fstrans_lwp_info *fli_succ;
     69 	struct lwp *fli_self;
     70 	struct mount *fli_mount;
     71 	struct fstrans_lwp_info *fli_alias;
     72 	struct fstrans_mount_info *fli_mountinfo;
     73 	int fli_trans_cnt;
     74 	int fli_alias_cnt;
     75 	int fli_cow_cnt;
     76 	enum fstrans_lock_type fli_lock_type;
     77 	LIST_ENTRY(fstrans_lwp_info) fli_list;
     78 };
     79 struct fstrans_mount_info {
     80 	enum fstrans_state fmi_state;
     81 	unsigned int fmi_ref_cnt;
     82 	bool fmi_gone;
     83 	bool fmi_cow_change;
     84 	LIST_HEAD(, fscow_handler) fmi_cow_handler;
     85 	struct mount *fmi_mount;
     86 	struct lwp *fmi_owner;
     87 };
     88 
     89 static kmutex_t vfs_suspend_lock	/* Serialize suspensions. */
     90     __cacheline_aligned;
     91 static kmutex_t fstrans_lock		/* Fstrans big lock. */
     92     __cacheline_aligned;
     93 static kcondvar_t fstrans_state_cv;	/* Fstrans or cow state changed. */
     94 static kcondvar_t fstrans_count_cv;	/* Fstrans or cow count changed. */
     95 static pserialize_t fstrans_psz;	/* Pserialize state. */
     96 static LIST_HEAD(fstrans_lwp_head, fstrans_lwp_info) fstrans_fli_head;
     97 					/* List of all fstrans_lwp_info. */
     98 static pool_cache_t fstrans_lwp_cache;	/* Cache of fstrans_lwp_info. */
     99 
    100 static int fstrans_gone_count;		/* Number of fstrans_mount_info gone. */
    101 
    102 static void fstrans_mount_dtor(struct fstrans_mount_info *);
    103 static void fstrans_clear_lwp_info(void);
    104 static inline struct fstrans_lwp_info *
    105     fstrans_get_lwp_info(struct mount *, bool);
    106 static struct fstrans_lwp_info *fstrans_alloc_lwp_info(struct mount *);
    107 static int fstrans_lwp_pcc(void *, void *, int);
    108 static void fstrans_lwp_pcd(void *, void *);
    109 static inline int _fstrans_start(struct mount *, enum fstrans_lock_type, int);
    110 static bool grant_lock(const struct fstrans_mount_info *,
    111     const enum fstrans_lock_type);
    112 static bool state_change_done(const struct fstrans_mount_info *);
    113 static bool cow_state_change_done(const struct fstrans_mount_info *);
    114 static void cow_change_enter(struct fstrans_mount_info *);
    115 static void cow_change_done(struct fstrans_mount_info *);
    116 
    117 extern struct mount *dead_rootmount;
    118 
    119 #if defined(DIAGNOSTIC)
    120 
    121 struct fstrans_debug_mount {
    122 	struct mount *fdm_mount;
    123 	SLIST_ENTRY(fstrans_debug_mount) fdm_list;
    124 };
    125 
    126 static SLIST_HEAD(, fstrans_debug_mount) fstrans_debug_mount_head =
    127     SLIST_HEAD_INITIALIZER(fstrans_debug_mount_head);
    128 
    129 static void
    130 fstrans_debug_mount(struct mount *mp)
    131 {
    132 	struct fstrans_debug_mount *fdm, *new;
    133 
    134 	KASSERT(mutex_owned(&fstrans_lock));
    135 
    136 	mutex_exit(&fstrans_lock);
    137 	new = kmem_alloc(sizeof(*new), KM_SLEEP);
    138 	new->fdm_mount = mp;
    139 	mutex_enter(&fstrans_lock);
    140 
    141 	SLIST_FOREACH(fdm, &fstrans_debug_mount_head, fdm_list)
    142 		KASSERT(fdm->fdm_mount != mp);
    143 	SLIST_INSERT_HEAD(&fstrans_debug_mount_head, new, fdm_list);
    144 }
    145 
    146 static void
    147 fstrans_debug_unmount(struct mount *mp)
    148 {
    149 	struct fstrans_debug_mount *fdm;
    150 
    151 	KASSERT(mutex_owned(&fstrans_lock));
    152 
    153 	SLIST_FOREACH(fdm, &fstrans_debug_mount_head, fdm_list)
    154 		if (fdm->fdm_mount == mp)
    155 			break;
    156 	KASSERT(fdm != NULL);
    157 	SLIST_REMOVE(&fstrans_debug_mount_head, fdm,
    158 	    fstrans_debug_mount, fdm_list);
    159 	kmem_free(fdm, sizeof(*fdm));
    160 }
    161 
    162 static void
    163 fstrans_debug_validate_mount(struct mount *mp)
    164 {
    165 	struct fstrans_debug_mount *fdm;
    166 
    167 	KASSERT(mutex_owned(&fstrans_lock));
    168 
    169 	SLIST_FOREACH(fdm, &fstrans_debug_mount_head, fdm_list)
    170 		if (fdm->fdm_mount == mp)
    171 			break;
    172 	KASSERTMSG(fdm != NULL, "mount %p invalid", mp);
    173 }
    174 
    175 #else /* defined(DIAGNOSTIC) */
    176 
/* Without DIAGNOSTIC the mount bookkeeping expands to nothing. */
#define fstrans_debug_mount(mp)			/* nothing */
#define fstrans_debug_unmount(mp)		/* nothing */
#define fstrans_debug_validate_mount(mp)	/* nothing */
    180 
    181 #endif  /* defined(DIAGNOSTIC) */
    182 
    183 /*
    184  * Initialize.
    185  */
    186 void
    187 fstrans_init(void)
    188 {
    189 
    190 	mutex_init(&vfs_suspend_lock, MUTEX_DEFAULT, IPL_NONE);
    191 	mutex_init(&fstrans_lock, MUTEX_DEFAULT, IPL_NONE);
    192 	cv_init(&fstrans_state_cv, "fstchg");
    193 	cv_init(&fstrans_count_cv, "fstcnt");
    194 	fstrans_psz = pserialize_create();
    195 	LIST_INIT(&fstrans_fli_head);
    196 	fstrans_lwp_cache = pool_cache_init(sizeof(struct fstrans_lwp_info),
    197 	    coherency_unit, 0, 0, "fstlwp", NULL, IPL_NONE,
    198 	    fstrans_lwp_pcc, fstrans_lwp_pcd, NULL);
    199 	KASSERT(fstrans_lwp_cache != NULL);
    200 }
    201 
    202 /*
    203  * pool_cache constructor for fstrans_lwp_info.  Updating the global list
    204  * produces cache misses on MP.  Minimise by keeping free entries on list.
    205  */
    206 int
    207 fstrans_lwp_pcc(void *arg, void *obj, int flags)
    208 {
    209 	struct fstrans_lwp_info *fli = obj;
    210 
    211 	memset(fli, 0, sizeof(*fli));
    212 
    213 	mutex_enter(&fstrans_lock);
    214 	LIST_INSERT_HEAD(&fstrans_fli_head, fli, fli_list);
    215 	mutex_exit(&fstrans_lock);
    216 
    217 	return 0;
    218 }
    219 
    220 /*
    221  * pool_cache destructor
    222  */
    223 void
    224 fstrans_lwp_pcd(void *arg, void *obj)
    225 {
    226 	struct fstrans_lwp_info *fli = obj;
    227 
    228 	mutex_enter(&fstrans_lock);
    229 	LIST_REMOVE(fli, fli_list);
    230 	mutex_exit(&fstrans_lock);
    231 }
    232 
    233 /*
    234  * Deallocate lwp state.
    235  */
    236 void
    237 fstrans_lwp_dtor(lwp_t *l)
    238 {
    239 	struct fstrans_lwp_info *fli, *fli_next;
    240 
    241 	if (l->l_fstrans == NULL)
    242 		return;
    243 
    244 	mutex_enter(&fstrans_lock);
    245 	for (fli = l->l_fstrans; fli; fli = fli_next) {
    246 		KASSERT(fli->fli_trans_cnt == 0);
    247 		KASSERT(fli->fli_cow_cnt == 0);
    248 		KASSERT(fli->fli_self == l);
    249 		if (fli->fli_mount != NULL)
    250 			fstrans_mount_dtor(fli->fli_mountinfo);
    251 		fli_next = fli->fli_succ;
    252 		fli->fli_alias_cnt = 0;
    253 		fli->fli_mount = NULL;
    254 		fli->fli_alias = NULL;
    255 		fli->fli_mountinfo = NULL;
    256 		fli->fli_self = NULL;
    257 	}
    258 	mutex_exit(&fstrans_lock);
    259 
    260 	for (fli = l->l_fstrans; fli; fli = fli_next) {
    261 		fli_next = fli->fli_succ;
    262 		pool_cache_put(fstrans_lwp_cache, fli);
    263 	}
    264 	l->l_fstrans = NULL;
    265 }
    266 
    267 /*
    268  * Dereference mount state.
    269  */
    270 static void
    271 fstrans_mount_dtor(struct fstrans_mount_info *fmi)
    272 {
    273 
    274 	KASSERT(mutex_owned(&fstrans_lock));
    275 
    276 	KASSERT(fmi != NULL);
    277 	fmi->fmi_ref_cnt -= 1;
    278 	if (__predict_true(fmi->fmi_ref_cnt > 0)) {
    279 		return;
    280 	}
    281 
    282 	KASSERT(fmi->fmi_state == FSTRANS_NORMAL);
    283 	KASSERT(LIST_FIRST(&fmi->fmi_cow_handler) == NULL);
    284 	KASSERT(fmi->fmi_owner == NULL);
    285 
    286 	KASSERT(fstrans_gone_count > 0);
    287 	fstrans_gone_count -= 1;
    288 
    289 	kmem_free(fmi->fmi_mount, sizeof(*fmi->fmi_mount));
    290 	kmem_free(fmi, sizeof(*fmi));
    291 }
    292 
    293 /*
    294  * Allocate mount state.
    295  */
    296 int
    297 fstrans_mount(struct mount *mp)
    298 {
    299 	struct fstrans_mount_info *newfmi;
    300 
    301 	newfmi = kmem_alloc(sizeof(*newfmi), KM_SLEEP);
    302 	newfmi->fmi_state = FSTRANS_NORMAL;
    303 	newfmi->fmi_ref_cnt = 1;
    304 	newfmi->fmi_gone = false;
    305 	LIST_INIT(&newfmi->fmi_cow_handler);
    306 	newfmi->fmi_cow_change = false;
    307 	newfmi->fmi_mount = mp;
    308 	newfmi->fmi_owner = NULL;
    309 
    310 	mutex_enter(&fstrans_lock);
    311 	mp->mnt_transinfo = newfmi;
    312 	fstrans_debug_mount(mp);
    313 	mutex_exit(&fstrans_lock);
    314 
    315 	return 0;
    316 }
    317 
    318 /*
    319  * Deallocate mount state.
    320  */
    321 void
    322 fstrans_unmount(struct mount *mp)
    323 {
    324 	struct fstrans_mount_info *fmi = mp->mnt_transinfo;
    325 
    326 	KASSERT(fmi != NULL);
    327 
    328 	mutex_enter(&fstrans_lock);
    329 	fstrans_debug_unmount(mp);
    330 	fmi->fmi_gone = true;
    331 	mp->mnt_transinfo = NULL;
    332 	fstrans_gone_count += 1;
    333 	fstrans_mount_dtor(fmi);
    334 	mutex_exit(&fstrans_lock);
    335 }
    336 
    337 /*
    338  * Clear mount entries whose mount is gone.
    339  */
    340 static void
    341 fstrans_clear_lwp_info(void)
    342 {
    343 	struct fstrans_lwp_info **p, *fli, *tofree = NULL;
    344 
    345 	/*
    346 	 * Scan our list clearing entries whose mount is gone.
    347 	 */
    348 	mutex_enter(&fstrans_lock);
    349 	for (p = &curlwp->l_fstrans; *p; ) {
    350 		fli = *p;
    351 		if (fli->fli_mount != NULL &&
    352 		    fli->fli_mountinfo->fmi_gone &&
    353 		    fli->fli_trans_cnt == 0 &&
    354 		    fli->fli_cow_cnt == 0 &&
    355 		    fli->fli_alias_cnt == 0) {
    356 			*p = (*p)->fli_succ;
    357 			fstrans_mount_dtor(fli->fli_mountinfo);
    358 			if (fli->fli_alias) {
    359 				KASSERT(fli->fli_alias->fli_alias_cnt > 0);
    360 				fli->fli_alias->fli_alias_cnt--;
    361 			}
    362 			fli->fli_mount = NULL;
    363 			fli->fli_alias = NULL;
    364 			fli->fli_mountinfo = NULL;
    365 			fli->fli_self = NULL;
    366 			p = &curlwp->l_fstrans;
    367 			fli->fli_succ = tofree;
    368 			tofree = fli;
    369 		} else {
    370 			p = &(*p)->fli_succ;
    371 		}
    372 	}
    373 #ifdef DIAGNOSTIC
    374 	for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ)
    375 		if (fli->fli_alias != NULL)
    376 			KASSERT(fli->fli_alias->fli_self == curlwp);
    377 #endif /* DIAGNOSTIC */
    378 	mutex_exit(&fstrans_lock);
    379 
    380 	while (tofree != NULL) {
    381 		fli = tofree;
    382 		tofree = fli->fli_succ;
    383 		pool_cache_put(fstrans_lwp_cache, fli);
    384 	}
    385 }
    386 
    387 /*
    388  * Allocate and return per lwp info for this mount.
    389  */
    390 static struct fstrans_lwp_info *
    391 fstrans_alloc_lwp_info(struct mount *mp)
    392 {
    393 	struct fstrans_lwp_info *fli;
    394 	struct fstrans_mount_info *fmi;
    395 
    396 	for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ) {
    397 		if (fli->fli_mount == mp)
    398 			return fli;
    399 	}
    400 
    401 	/*
    402 	 * Allocate a new entry.
    403 	 */
    404 	fli = pool_cache_get(fstrans_lwp_cache, PR_WAITOK);
    405 	KASSERT(fli->fli_trans_cnt == 0);
    406 	KASSERT(fli->fli_cow_cnt == 0);
    407 	KASSERT(fli->fli_alias_cnt == 0);
    408 	KASSERT(fli->fli_mount == NULL);
    409 	KASSERT(fli->fli_alias == NULL);
    410 	KASSERT(fli->fli_mountinfo == NULL);
    411 	KASSERT(fli->fli_self == NULL);
    412 	fli->fli_succ = curlwp->l_fstrans;
    413 	curlwp->l_fstrans = fli;
    414 
    415 	/*
    416 	 * Attach the entry to the mount if its mnt_transinfo is valid.
    417 	 */
    418 
    419 	mutex_enter(&fstrans_lock);
    420 	fli->fli_self = curlwp;
    421 	fstrans_debug_validate_mount(mp);
    422 	fmi = mp->mnt_transinfo;
    423 	KASSERT(fmi != NULL);
    424 	fli->fli_mount = mp;
    425 	fli->fli_mountinfo = fmi;
    426 	fmi->fmi_ref_cnt += 1;
    427 	do {
    428 		mp = mp->mnt_lower;
    429 	} while (mp && mp->mnt_lower);
    430 	mutex_exit(&fstrans_lock);
    431 
    432 	if (mp) {
    433 		fli->fli_alias = fstrans_alloc_lwp_info(mp);
    434 		fli->fli_alias->fli_alias_cnt++;
    435 		fli = fli->fli_alias;
    436 	}
    437 
    438 	return fli;
    439 }
    440 
    441 /*
    442  * Retrieve the per lwp info for this mount allocating if necessary.
    443  */
    444 static inline struct fstrans_lwp_info *
    445 fstrans_get_lwp_info(struct mount *mp, bool do_alloc)
    446 {
    447 	struct fstrans_lwp_info *fli;
    448 
    449 	/*
    450 	 * Scan our list for a match.
    451 	 */
    452 	for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ) {
    453 		if (fli->fli_mount == mp) {
    454 			KASSERT((mp->mnt_lower == NULL) ==
    455 			    (fli->fli_alias == NULL));
    456 			if (fli->fli_alias != NULL)
    457 				fli = fli->fli_alias;
    458 			break;
    459 		}
    460 	}
    461 
    462 	if (do_alloc) {
    463 		if (__predict_false(fli == NULL))
    464 			fli = fstrans_alloc_lwp_info(mp);
    465 		KASSERT(fli != NULL && !fli->fli_mountinfo->fmi_gone);
    466 	} else {
    467 		KASSERT(fli != NULL);
    468 	}
    469 
    470 	return fli;
    471 }
    472 
    473 /*
    474  * Check if this lock type is granted at this state.
    475  */
    476 static bool
    477 grant_lock(const struct fstrans_mount_info *fmi,
    478     const enum fstrans_lock_type type)
    479 {
    480 
    481 	if (__predict_true(fmi->fmi_state == FSTRANS_NORMAL))
    482 		return true;
    483 	if (fmi->fmi_owner == curlwp)
    484 		return true;
    485 	if  (fmi->fmi_state == FSTRANS_SUSPENDING && type == FSTRANS_LAZY)
    486 		return true;
    487 
    488 	return false;
    489 }
    490 
    491 /*
    492  * Start a transaction.  If this thread already has a transaction on this
    493  * file system increment the reference counter.
    494  */
    495 static inline int
    496 _fstrans_start(struct mount *mp, enum fstrans_lock_type lock_type, int wait)
    497 {
    498 	int s;
    499 	struct fstrans_lwp_info *fli;
    500 	struct fstrans_mount_info *fmi;
    501 
    502 #ifndef FSTRANS_DEAD_ENABLED
    503 	if (mp == dead_rootmount)
    504 		return 0;
    505 #endif
    506 
    507 	ASSERT_SLEEPABLE();
    508 
    509 	fli = fstrans_get_lwp_info(mp, true);
    510 	fmi = fli->fli_mountinfo;
    511 
    512 	if (fli->fli_trans_cnt > 0) {
    513 		fli->fli_trans_cnt += 1;
    514 
    515 		return 0;
    516 	}
    517 
    518 	s = pserialize_read_enter();
    519 	if (__predict_true(grant_lock(fmi, lock_type))) {
    520 		fli->fli_trans_cnt = 1;
    521 		fli->fli_lock_type = lock_type;
    522 		pserialize_read_exit(s);
    523 
    524 		return 0;
    525 	}
    526 	pserialize_read_exit(s);
    527 
    528 	if (! wait)
    529 		return EBUSY;
    530 
    531 	mutex_enter(&fstrans_lock);
    532 	while (! grant_lock(fmi, lock_type))
    533 		cv_wait(&fstrans_state_cv, &fstrans_lock);
    534 	fli->fli_trans_cnt = 1;
    535 	fli->fli_lock_type = lock_type;
    536 	mutex_exit(&fstrans_lock);
    537 
    538 	return 0;
    539 }
    540 
    541 void
    542 fstrans_start(struct mount *mp)
    543 {
    544 	int error __diagused;
    545 
    546 	error = _fstrans_start(mp, FSTRANS_SHARED, 1);
    547 	KASSERT(error == 0);
    548 }
    549 
    550 int
    551 fstrans_start_nowait(struct mount *mp)
    552 {
    553 
    554 	return _fstrans_start(mp, FSTRANS_SHARED, 0);
    555 }
    556 
    557 void
    558 fstrans_start_lazy(struct mount *mp)
    559 {
    560 	int error __diagused;
    561 
    562 	error = _fstrans_start(mp, FSTRANS_LAZY, 1);
    563 	KASSERT(error == 0);
    564 }
    565 
    566 /*
    567  * Finish a transaction.
    568  */
    569 void
    570 fstrans_done(struct mount *mp)
    571 {
    572 	int s;
    573 	struct fstrans_lwp_info *fli;
    574 	struct fstrans_mount_info *fmi;
    575 
    576 #ifndef FSTRANS_DEAD_ENABLED
    577 	if (mp == dead_rootmount)
    578 		return;
    579 #endif
    580 
    581 	fli = fstrans_get_lwp_info(mp, false);
    582 	fmi = fli->fli_mountinfo;
    583 	KASSERT(fli->fli_trans_cnt > 0);
    584 
    585 	if (fli->fli_trans_cnt > 1) {
    586 		fli->fli_trans_cnt -= 1;
    587 
    588 		return;
    589 	}
    590 
    591 	if (__predict_false(fstrans_gone_count > 0))
    592 		fstrans_clear_lwp_info();
    593 
    594 	s = pserialize_read_enter();
    595 	if (__predict_true(fmi->fmi_state == FSTRANS_NORMAL)) {
    596 		fli->fli_trans_cnt = 0;
    597 		pserialize_read_exit(s);
    598 
    599 		return;
    600 	}
    601 	pserialize_read_exit(s);
    602 
    603 	mutex_enter(&fstrans_lock);
    604 	fli->fli_trans_cnt = 0;
    605 	cv_signal(&fstrans_count_cv);
    606 	mutex_exit(&fstrans_lock);
    607 }
    608 
    609 /*
    610  * Check if we hold an lock.
    611  */
    612 int
    613 fstrans_held(struct mount *mp)
    614 {
    615 	struct fstrans_lwp_info *fli;
    616 	struct fstrans_mount_info *fmi;
    617 
    618 	KASSERT(mp != dead_rootmount);
    619 
    620 	fli = fstrans_get_lwp_info(mp, true);
    621 	fmi = fli->fli_mountinfo;
    622 
    623 	return (fli->fli_trans_cnt > 0 || fmi->fmi_owner == curlwp);
    624 }
    625 
    626 /*
    627  * Check if this thread has an exclusive lock.
    628  */
    629 int
    630 fstrans_is_owner(struct mount *mp)
    631 {
    632 	struct fstrans_lwp_info *fli;
    633 	struct fstrans_mount_info *fmi;
    634 
    635 	KASSERT(mp != dead_rootmount);
    636 
    637 	fli = fstrans_get_lwp_info(mp, true);
    638 	fmi = fli->fli_mountinfo;
    639 
    640 	return (fmi->fmi_owner == curlwp);
    641 }
    642 
    643 /*
    644  * True, if no thread is in a transaction not granted at the current state.
    645  */
    646 static bool
    647 state_change_done(const struct fstrans_mount_info *fmi)
    648 {
    649 	struct fstrans_lwp_info *fli;
    650 
    651 	KASSERT(mutex_owned(&fstrans_lock));
    652 
    653 	LIST_FOREACH(fli, &fstrans_fli_head, fli_list) {
    654 		if (fli->fli_mountinfo != fmi)
    655 			continue;
    656 		if (fli->fli_trans_cnt == 0)
    657 			continue;
    658 		if (fli->fli_self == curlwp)
    659 			continue;
    660 		if (grant_lock(fmi, fli->fli_lock_type))
    661 			continue;
    662 
    663 		return false;
    664 	}
    665 
    666 	return true;
    667 }
    668 
    669 /*
    670  * Set new file system state.
    671  */
    672 int
    673 fstrans_setstate(struct mount *mp, enum fstrans_state new_state)
    674 {
    675 	int error;
    676 	enum fstrans_state old_state;
    677 	struct fstrans_lwp_info *fli;
    678 	struct fstrans_mount_info *fmi;
    679 
    680 	KASSERT(mp != dead_rootmount);
    681 
    682 	fli = fstrans_get_lwp_info(mp, true);
    683 	fmi = fli->fli_mountinfo;
    684 	old_state = fmi->fmi_state;
    685 	if (old_state == new_state)
    686 		return 0;
    687 
    688 	mutex_enter(&fstrans_lock);
    689 	fmi->fmi_state = new_state;
    690 	pserialize_perform(fstrans_psz);
    691 
    692 	/*
    693 	 * All threads see the new state now.
    694 	 * Wait for transactions invalid at this state to leave.
    695 	 */
    696 	error = 0;
    697 	while (! state_change_done(fmi)) {
    698 		error = cv_wait_sig(&fstrans_count_cv, &fstrans_lock);
    699 		if (error) {
    700 			new_state = fmi->fmi_state = FSTRANS_NORMAL;
    701 			break;
    702 		}
    703 	}
    704 	if (old_state != new_state) {
    705 		if (old_state == FSTRANS_NORMAL) {
    706 			KASSERT(fmi->fmi_owner == NULL);
    707 			fmi->fmi_owner = curlwp;
    708 		}
    709 		if (new_state == FSTRANS_NORMAL) {
    710 			KASSERT(fmi->fmi_owner == curlwp);
    711 			fmi->fmi_owner = NULL;
    712 		}
    713 	}
    714 	cv_broadcast(&fstrans_state_cv);
    715 	mutex_exit(&fstrans_lock);
    716 
    717 	return error;
    718 }
    719 
    720 /*
    721  * Get current file system state.
    722  */
    723 enum fstrans_state
    724 fstrans_getstate(struct mount *mp)
    725 {
    726 	struct fstrans_lwp_info *fli;
    727 	struct fstrans_mount_info *fmi;
    728 
    729 	KASSERT(mp != dead_rootmount);
    730 
    731 	fli = fstrans_get_lwp_info(mp, true);
    732 	fmi = fli->fli_mountinfo;
    733 
    734 	return fmi->fmi_state;
    735 }
    736 
    737 /*
    738  * Request a filesystem to suspend all operations.
    739  */
    740 int
    741 vfs_suspend(struct mount *mp, int nowait)
    742 {
    743 	struct fstrans_lwp_info *fli;
    744 	int error;
    745 
    746 	if (mp == dead_rootmount)
    747 		return EOPNOTSUPP;
    748 
    749 	fli = fstrans_get_lwp_info(mp, true);
    750 	mp = fli->fli_mount;
    751 
    752 	if (nowait) {
    753 		if (!mutex_tryenter(&vfs_suspend_lock))
    754 			return EWOULDBLOCK;
    755 	} else
    756 		mutex_enter(&vfs_suspend_lock);
    757 
    758 	if ((error = VFS_SUSPENDCTL(mp, SUSPEND_SUSPEND)) != 0)
    759 		mutex_exit(&vfs_suspend_lock);
    760 
    761 	return error;
    762 }
    763 
    764 /*
    765  * Request a filesystem to resume all operations.
    766  */
    767 void
    768 vfs_resume(struct mount *mp)
    769 {
    770 	struct fstrans_lwp_info *fli;
    771 
    772 	KASSERT(mp != dead_rootmount);
    773 
    774 	fli = fstrans_get_lwp_info(mp, false);
    775 	mp = fli->fli_mount;
    776 
    777 	VFS_SUSPENDCTL(mp, SUSPEND_RESUME);
    778 	mutex_exit(&vfs_suspend_lock);
    779 }
    780 
    781 
    782 /*
    783  * True, if no thread is running a cow handler.
    784  */
    785 static bool
    786 cow_state_change_done(const struct fstrans_mount_info *fmi)
    787 {
    788 	struct fstrans_lwp_info *fli;
    789 
    790 	KASSERT(mutex_owned(&fstrans_lock));
    791 	KASSERT(fmi->fmi_cow_change);
    792 
    793 	LIST_FOREACH(fli, &fstrans_fli_head, fli_list) {
    794 		if (fli->fli_mount != fmi->fmi_mount)
    795 			continue;
    796 		if (fli->fli_cow_cnt == 0)
    797 			continue;
    798 
    799 		return false;
    800 	}
    801 
    802 	return true;
    803 }
    804 
    805 /*
    806  * Prepare for changing this mounts cow list.
    807  * Returns with fstrans_lock locked.
    808  */
    809 static void
    810 cow_change_enter(struct fstrans_mount_info *fmi)
    811 {
    812 
    813 	mutex_enter(&fstrans_lock);
    814 
    815 	/*
    816 	 * Wait for other threads changing the list.
    817 	 */
    818 	while (fmi->fmi_cow_change)
    819 		cv_wait(&fstrans_state_cv, &fstrans_lock);
    820 
    821 	/*
    822 	 * Wait until all threads are aware of a state change.
    823 	 */
    824 	fmi->fmi_cow_change = true;
    825 	pserialize_perform(fstrans_psz);
    826 
    827 	while (! cow_state_change_done(fmi))
    828 		cv_wait(&fstrans_count_cv, &fstrans_lock);
    829 }
    830 
    831 /*
    832  * Done changing this mounts cow list.
    833  */
    834 static void
    835 cow_change_done(struct fstrans_mount_info *fmi)
    836 {
    837 
    838 	KASSERT(mutex_owned(&fstrans_lock));
    839 
    840 	fmi->fmi_cow_change = false;
    841 	pserialize_perform(fstrans_psz);
    842 
    843 	cv_broadcast(&fstrans_state_cv);
    844 
    845 	mutex_exit(&fstrans_lock);
    846 }
    847 
    848 /*
    849  * Add a handler to this mount.
    850  */
    851 int
    852 fscow_establish(struct mount *mp, int (*func)(void *, struct buf *, bool),
    853     void *arg)
    854 {
    855 	struct fstrans_mount_info *fmi;
    856 	struct fscow_handler *newch;
    857 
    858 	KASSERT(mp != dead_rootmount);
    859 
    860 	mutex_enter(&fstrans_lock);
    861 	fmi = mp->mnt_transinfo;
    862 	KASSERT(fmi != NULL);
    863 	fmi->fmi_ref_cnt += 1;
    864 	mutex_exit(&fstrans_lock);
    865 
    866 	newch = kmem_alloc(sizeof(*newch), KM_SLEEP);
    867 	newch->ch_func = func;
    868 	newch->ch_arg = arg;
    869 
    870 	cow_change_enter(fmi);
    871 	LIST_INSERT_HEAD(&fmi->fmi_cow_handler, newch, ch_list);
    872 	cow_change_done(fmi);
    873 
    874 	return 0;
    875 }
    876 
    877 /*
    878  * Remove a handler from this mount.
    879  */
    880 int
    881 fscow_disestablish(struct mount *mp, int (*func)(void *, struct buf *, bool),
    882     void *arg)
    883 {
    884 	struct fstrans_mount_info *fmi;
    885 	struct fscow_handler *hp = NULL;
    886 
    887 	KASSERT(mp != dead_rootmount);
    888 
    889 	fmi = mp->mnt_transinfo;
    890 	KASSERT(fmi != NULL);
    891 
    892 	cow_change_enter(fmi);
    893 	LIST_FOREACH(hp, &fmi->fmi_cow_handler, ch_list)
    894 		if (hp->ch_func == func && hp->ch_arg == arg)
    895 			break;
    896 	if (hp != NULL) {
    897 		LIST_REMOVE(hp, ch_list);
    898 		kmem_free(hp, sizeof(*hp));
    899 	}
    900 	fstrans_mount_dtor(fmi);
    901 	cow_change_done(fmi);
    902 
    903 	return hp ? 0 : EINVAL;
    904 }
    905 
    906 /*
    907  * Check for need to copy block that is about to be written.
    908  */
    909 int
    910 fscow_run(struct buf *bp, bool data_valid)
    911 {
    912 	int error, s;
    913 	struct mount *mp;
    914 	struct fstrans_lwp_info *fli;
    915 	struct fstrans_mount_info *fmi;
    916 	struct fscow_handler *hp;
    917 
    918 	/*
    919 	 * First check if we need run the copy-on-write handler.
    920 	 */
    921 	if ((bp->b_flags & B_COWDONE))
    922 		return 0;
    923 	if (bp->b_vp == NULL) {
    924 		bp->b_flags |= B_COWDONE;
    925 		return 0;
    926 	}
    927 	if (bp->b_vp->v_type == VBLK)
    928 		mp = spec_node_getmountedfs(bp->b_vp);
    929 	else
    930 		mp = bp->b_vp->v_mount;
    931 	if (mp == NULL || mp == dead_rootmount) {
    932 		bp->b_flags |= B_COWDONE;
    933 		return 0;
    934 	}
    935 
    936 	fli = fstrans_get_lwp_info(mp, true);
    937 	fmi = fli->fli_mountinfo;
    938 
    939 	/*
    940 	 * On non-recursed run check if other threads
    941 	 * want to change the list.
    942 	 */
    943 	if (fli->fli_cow_cnt == 0) {
    944 		s = pserialize_read_enter();
    945 		if (__predict_false(fmi->fmi_cow_change)) {
    946 			pserialize_read_exit(s);
    947 			mutex_enter(&fstrans_lock);
    948 			while (fmi->fmi_cow_change)
    949 				cv_wait(&fstrans_state_cv, &fstrans_lock);
    950 			fli->fli_cow_cnt = 1;
    951 			mutex_exit(&fstrans_lock);
    952 		} else {
    953 			fli->fli_cow_cnt = 1;
    954 			pserialize_read_exit(s);
    955 		}
    956 	} else
    957 		fli->fli_cow_cnt += 1;
    958 
    959 	/*
    960 	 * Run all copy-on-write handlers, stop on error.
    961 	 */
    962 	error = 0;
    963 	LIST_FOREACH(hp, &fmi->fmi_cow_handler, ch_list)
    964 		if ((error = (*hp->ch_func)(hp->ch_arg, bp, data_valid)) != 0)
    965 			break;
    966  	if (error == 0)
    967  		bp->b_flags |= B_COWDONE;
    968 
    969 	/*
    970 	 * Check if other threads want to change the list.
    971 	 */
    972 	if (fli->fli_cow_cnt > 1) {
    973 		fli->fli_cow_cnt -= 1;
    974 	} else {
    975 		s = pserialize_read_enter();
    976 		if (__predict_false(fmi->fmi_cow_change)) {
    977 			pserialize_read_exit(s);
    978 			mutex_enter(&fstrans_lock);
    979 			fli->fli_cow_cnt = 0;
    980 			cv_signal(&fstrans_count_cv);
    981 			mutex_exit(&fstrans_lock);
    982 		} else {
    983 			fli->fli_cow_cnt = 0;
    984 			pserialize_read_exit(s);
    985 		}
    986 	}
    987 
    988 	return error;
    989 }
    990 
    991 #if defined(DDB)
/* DDB only: print the fstrans locks by lwp and the state by mount. */
void fstrans_dump(int full);
    993 
    994 static void
    995 fstrans_print_lwp(struct proc *p, struct lwp *l, int verbose)
    996 {
    997 	char prefix[9];
    998 	struct fstrans_lwp_info *fli;
    999 
   1000 	snprintf(prefix, sizeof(prefix), "%d.%d", p->p_pid, l->l_lid);
   1001 	LIST_FOREACH(fli, &fstrans_fli_head, fli_list) {
   1002 		if (fli->fli_self != l)
   1003 			continue;
   1004 		if (fli->fli_trans_cnt == 0 && fli->fli_cow_cnt == 0) {
   1005 			if (! verbose)
   1006 				continue;
   1007 		}
   1008 		printf("%-8s", prefix);
   1009 		if (verbose)
   1010 			printf(" @%p", fli);
   1011 		if (fli->fli_mount == dead_rootmount)
   1012 			printf(" <dead>");
   1013 		else if (fli->fli_mount != NULL)
   1014 			printf(" (%s)", fli->fli_mount->mnt_stat.f_mntonname);
   1015 		else
   1016 			printf(" NULL");
   1017 		if (fli->fli_alias != NULL) {
   1018 			struct mount *amp = fli->fli_alias->fli_mount;
   1019 
   1020 			printf(" alias");
   1021 			if (verbose)
   1022 				printf(" @%p", fli->fli_alias);
   1023 			if (amp == NULL)
   1024 				printf(" NULL");
   1025 			else
   1026 				printf(" (%s)", amp->mnt_stat.f_mntonname);
   1027 		}
   1028 		if (fli->fli_mountinfo && fli->fli_mountinfo->fmi_gone)
   1029 			printf(" gone");
   1030 		if (fli->fli_trans_cnt == 0) {
   1031 			printf(" -");
   1032 		} else {
   1033 			switch (fli->fli_lock_type) {
   1034 			case FSTRANS_LAZY:
   1035 				printf(" lazy");
   1036 				break;
   1037 			case FSTRANS_SHARED:
   1038 				printf(" shared");
   1039 				break;
   1040 			default:
   1041 				printf(" %#x", fli->fli_lock_type);
   1042 				break;
   1043 			}
   1044 		}
   1045 		printf(" %d cow %d alias %d\n",
   1046 		    fli->fli_trans_cnt, fli->fli_cow_cnt, fli->fli_alias_cnt);
   1047 		prefix[0] = '\0';
   1048 	}
   1049 }
   1050 
   1051 static void
   1052 fstrans_print_mount(struct mount *mp, int verbose)
   1053 {
   1054 	struct fstrans_mount_info *fmi;
   1055 
   1056 	fmi = mp->mnt_transinfo;
   1057 	if (!verbose && (fmi == NULL || fmi->fmi_state == FSTRANS_NORMAL))
   1058 		return;
   1059 
   1060 	printf("%-16s ", mp->mnt_stat.f_mntonname);
   1061 	if (fmi == NULL) {
   1062 		printf("(null)\n");
   1063 		return;
   1064 	}
   1065 	printf("owner %p ", fmi->fmi_owner);
   1066 	switch (fmi->fmi_state) {
   1067 	case FSTRANS_NORMAL:
   1068 		printf("state normal\n");
   1069 		break;
   1070 	case FSTRANS_SUSPENDING:
   1071 		printf("state suspending\n");
   1072 		break;
   1073 	case FSTRANS_SUSPENDED:
   1074 		printf("state suspended\n");
   1075 		break;
   1076 	default:
   1077 		printf("state %#x\n", fmi->fmi_state);
   1078 		break;
   1079 	}
   1080 }
   1081 
   1082 void
   1083 fstrans_dump(int full)
   1084 {
   1085 	const struct proclist_desc *pd;
   1086 	struct proc *p;
   1087 	struct lwp *l;
   1088 	struct mount *mp;
   1089 
   1090 	printf("Fstrans locks by lwp:\n");
   1091 	for (pd = proclists; pd->pd_list != NULL; pd++)
   1092 		PROCLIST_FOREACH(p, pd->pd_list)
   1093 			LIST_FOREACH(l, &p->p_lwps, l_sibling)
   1094 				fstrans_print_lwp(p, l, full == 1);
   1095 
   1096 	printf("Fstrans state by mount:\n");
   1097 	for (mp = _mountlist_next(NULL); mp; mp = _mountlist_next(mp))
   1098 		fstrans_print_mount(mp, full == 1);
   1099 }
   1100 #endif /* defined(DDB) */
   1101