Home | History | Annotate | Line # | Download | only in kern
vfs_trans.c revision 1.64
      1 /*	$NetBSD: vfs_trans.c,v 1.64 2022/06/28 00:13:48 riastradh Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2007, 2020 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Juergen Hannken-Illjes.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 #include <sys/cdefs.h>
     33 __KERNEL_RCSID(0, "$NetBSD: vfs_trans.c,v 1.64 2022/06/28 00:13:48 riastradh Exp $");
     34 
     35 /*
     36  * File system transaction operations.
     37  */
     38 
     39 #ifdef _KERNEL_OPT
     40 #include "opt_ddb.h"
     41 #endif
     42 
     43 #include <sys/param.h>
     44 #include <sys/systm.h>
     45 #include <sys/atomic.h>
     46 #include <sys/buf.h>
     47 #include <sys/kmem.h>
     48 #include <sys/mount.h>
     49 #include <sys/pserialize.h>
     50 #include <sys/vnode.h>
     51 #include <sys/fstrans.h>
     52 #include <sys/proc.h>
     53 #include <sys/pool.h>
     54 
     55 #include <miscfs/specfs/specdev.h>
     56 
/*
 * Kinds of transaction lock a thread may request.  grant_lock() decides
 * which kinds are admitted in each file system state.
 */
     57 enum fstrans_lock_type {
     58 	FSTRANS_LAZY,			/* Granted while not suspended */
     59 	FSTRANS_SHARED			/* Granted while not suspending */
     60 };
     61 
/* One registered copy-on-write callback, linked on fmi_cow_handler. */
     62 struct fscow_handler {
     63 	LIST_ENTRY(fscow_handler) ch_list;	/* Linkage on the mount's handler list. */
     64 	int (*ch_func)(void *, struct buf *, bool);	/* Callback invoked by fscow_run(). */
     65 	void *ch_arg;	/* Passed as first argument to ch_func. */
     66 };
/*
 * Per lwp, per mount transaction state.  Entries are chained off
 * l_fstrans of the owning lwp and also sit on the global fstrans_fli_head.
 */
     67 struct fstrans_lwp_info {
     68 	struct fstrans_lwp_info *fli_succ;	/* Next entry of the same lwp. */
     69 	struct lwp *fli_self;	/* Owning lwp, NULL while the entry is unused. */
     70 	struct mount *fli_mount;	/* Mount this entry belongs to, or NULL. */
     71 	struct fstrans_lwp_info *fli_alias;	/* Entry of the lowest layer below fli_mount. */
     72 	struct fstrans_mount_info *fli_mountinfo;	/* Referenced state of fli_mount. */
     73 	int fli_trans_cnt;	/* Nesting depth of transactions. */
     74 	int fli_alias_cnt;	/* Number of entries whose fli_alias is this one. */
     75 	int fli_cow_cnt;	/* Nesting depth of fscow_run(). */
     76 	enum fstrans_lock_type fli_lock_type;	/* Type recorded when the transaction started. */
     77 	LIST_ENTRY(fstrans_lwp_info) fli_list;	/* Linkage on fstrans_fli_head. */
     78 };
/* Per mount transaction state, reference counted. */
     79 struct fstrans_mount_info {
     80 	enum fstrans_state fmi_state;	/* Current state, changed by fstrans_setstate(). */
     81 	unsigned int fmi_ref_cnt;	/* References; freed by fstrans_mount_dtor() at zero. */
     82 	bool fmi_gone;	/* Set by fstrans_unmount(). */
     83 	bool fmi_cow_change;	/* True while the handler list is being changed. */
     84 	LIST_HEAD(, fscow_handler) fmi_cow_handler;	/* Registered copy-on-write handlers. */
     85 	struct mount *fmi_mount;	/* Back pointer to the mount. */
     86 	struct lwp *fmi_owner;	/* Lwp that moved the state away from FSTRANS_NORMAL. */
     87 };
     88 
/*
 * File-scope state.  fstrans_fli_head and fstrans_gone_count are only
 * modified with fstrans_lock held.
 */
     89 static kmutex_t vfs_suspend_lock	/* Serialize suspensions. */
     90     __cacheline_aligned;
     91 static kmutex_t fstrans_lock		/* Fstrans big lock. */
     92     __cacheline_aligned;
     93 static kcondvar_t fstrans_state_cv;	/* Fstrans or cow state changed. */
     94 static kcondvar_t fstrans_count_cv;	/* Fstrans or cow count changed. */
     95 static pserialize_t fstrans_psz;	/* Pserialize state. */
     96 static LIST_HEAD(fstrans_lwp_head, fstrans_lwp_info) fstrans_fli_head;
     97 					/* List of all fstrans_lwp_info. */
     98 static pool_cache_t fstrans_lwp_cache;	/* Cache of fstrans_lwp_info. */
     99 
    100 static int fstrans_gone_count;		/* Number of fstrans_mount_info gone. */
    101 
/* Forward declarations of file-local helpers. */
    102 static void fstrans_mount_dtor(struct fstrans_mount_info *);
    103 static void fstrans_clear_lwp_info(void);
    104 static inline struct fstrans_lwp_info *
    105     fstrans_get_lwp_info(struct mount *, bool);
    106 static struct fstrans_lwp_info *fstrans_alloc_lwp_info(struct mount *);
    107 static int fstrans_lwp_pcc(void *, void *, int);
    108 static void fstrans_lwp_pcd(void *, void *);
    109 static inline int _fstrans_start(struct mount *, enum fstrans_lock_type, int);
    110 static bool grant_lock(const struct fstrans_mount_info *,
    111     const enum fstrans_lock_type);
    112 static bool state_change_done(const struct fstrans_mount_info *);
    113 static bool cow_state_change_done(const struct fstrans_mount_info *);
    114 static void cow_change_enter(struct fstrans_mount_info *);
    115 static void cow_change_done(struct fstrans_mount_info *);
    116 
/*
 * Defined elsewhere.  Unless FSTRANS_DEAD_ENABLED is defined, starting or
 * finishing a transaction on this mount is a no-op.
 */
    117 extern struct mount *dead_rootmount;
    118 
    119 #if defined(DIAGNOSTIC)
    120 
/* DIAGNOSTIC only: one record per mount currently known to fstrans. */
    121 struct fstrans_debug_mount {
    122 	struct mount *fdm_mount;	/* The recorded mount. */
    123 	SLIST_ENTRY(fstrans_debug_mount) fdm_list;	/* Linkage on the list below. */
    124 };
    125 
/* List of recorded mounts; accessed with fstrans_lock held. */
    126 static SLIST_HEAD(, fstrans_debug_mount) fstrans_debug_mount_head =
    127     SLIST_HEAD_INITIALIZER(fstrans_debug_mount_head);
    128 
    129 static void
    130 fstrans_debug_mount(struct mount *mp)
    131 {
    132 	struct fstrans_debug_mount *fdm, *new;
    133 
    134 	KASSERT(mutex_owned(&fstrans_lock));
    135 
    136 	mutex_exit(&fstrans_lock);
    137 	new = kmem_alloc(sizeof(*new), KM_SLEEP);
    138 	new->fdm_mount = mp;
    139 	mutex_enter(&fstrans_lock);
    140 
    141 	SLIST_FOREACH(fdm, &fstrans_debug_mount_head, fdm_list)
    142 		KASSERT(fdm->fdm_mount != mp);
    143 	SLIST_INSERT_HEAD(&fstrans_debug_mount_head, new, fdm_list);
    144 }
    145 
    146 static void
    147 fstrans_debug_unmount(struct mount *mp)
    148 {
    149 	struct fstrans_debug_mount *fdm;
    150 
    151 	KASSERT(mutex_owned(&fstrans_lock));
    152 
    153 	SLIST_FOREACH(fdm, &fstrans_debug_mount_head, fdm_list)
    154 		if (fdm->fdm_mount == mp)
    155 			break;
    156 	KASSERT(fdm != NULL);
    157 	SLIST_REMOVE(&fstrans_debug_mount_head, fdm,
    158 	    fstrans_debug_mount, fdm_list);
    159 	kmem_free(fdm, sizeof(*fdm));
    160 }
    161 
    162 static void
    163 fstrans_debug_validate_mount(struct mount *mp)
    164 {
    165 	struct fstrans_debug_mount *fdm;
    166 
    167 	KASSERT(mutex_owned(&fstrans_lock));
    168 
    169 	SLIST_FOREACH(fdm, &fstrans_debug_mount_head, fdm_list)
    170 		if (fdm->fdm_mount == mp)
    171 			break;
    172 	KASSERTMSG(fdm != NULL, "mount %p invalid", mp);
    173 }
    174 
    175 #else /* defined(DIAGNOSTIC) */
    176 
/* Without DIAGNOSTIC the mount bookkeeping hooks expand to nothing. */
    177 #define fstrans_debug_mount(mp)
    178 #define fstrans_debug_unmount(mp)
    179 #define fstrans_debug_validate_mount(mp)
    180 
    181 #endif  /* defined(DIAGNOSTIC) */
    182 
    183 /*
    184  * Initialize.
    185  */
    186 void
    187 fstrans_init(void)
    188 {
    189 
    190 	mutex_init(&vfs_suspend_lock, MUTEX_DEFAULT, IPL_NONE);
    191 	mutex_init(&fstrans_lock, MUTEX_DEFAULT, IPL_NONE);
    192 	cv_init(&fstrans_state_cv, "fstchg");
    193 	cv_init(&fstrans_count_cv, "fstcnt");
    194 	fstrans_psz = pserialize_create();
    195 	LIST_INIT(&fstrans_fli_head);
    196 	fstrans_lwp_cache = pool_cache_init(sizeof(struct fstrans_lwp_info),
    197 	    coherency_unit, 0, 0, "fstlwp", NULL, IPL_NONE,
    198 	    fstrans_lwp_pcc, fstrans_lwp_pcd, NULL);
    199 	KASSERT(fstrans_lwp_cache != NULL);
    200 }
    201 
    202 /*
    203  * pool_cache constructor for fstrans_lwp_info.  Updating the global list
    204  * produces cache misses on MP.  Minimise by keeping free entries on list.
    205  */
    206 int
    207 fstrans_lwp_pcc(void *arg, void *obj, int flags)
    208 {
    209 	struct fstrans_lwp_info *fli = obj;
    210 
    211 	memset(fli, 0, sizeof(*fli));
    212 
    213 	mutex_enter(&fstrans_lock);
    214 	LIST_INSERT_HEAD(&fstrans_fli_head, fli, fli_list);
    215 	mutex_exit(&fstrans_lock);
    216 
    217 	return 0;
    218 }
    219 
    220 /*
    221  * pool_cache destructor
    222  */
    223 void
    224 fstrans_lwp_pcd(void *arg, void *obj)
    225 {
    226 	struct fstrans_lwp_info *fli = obj;
    227 
    228 	mutex_enter(&fstrans_lock);
    229 	LIST_REMOVE(fli, fli_list);
    230 	mutex_exit(&fstrans_lock);
    231 }
    232 
    233 /*
    234  * Deallocate lwp state.
    235  */
    236 void
    237 fstrans_lwp_dtor(lwp_t *l)
    238 {
    239 	struct fstrans_lwp_info *fli, *fli_next;
    240 
    241 	if (l->l_fstrans == NULL)
    242 		return;
    243 
    244 	mutex_enter(&fstrans_lock);
    245 	for (fli = l->l_fstrans; fli; fli = fli_next) {
    246 		KASSERT(fli->fli_trans_cnt == 0);
    247 		KASSERT(fli->fli_cow_cnt == 0);
    248 		KASSERT(fli->fli_self == l);
    249 		if (fli->fli_mount != NULL)
    250 			fstrans_mount_dtor(fli->fli_mountinfo);
    251 		fli_next = fli->fli_succ;
    252 		fli->fli_alias_cnt = 0;
    253 		fli->fli_mount = NULL;
    254 		fli->fli_alias = NULL;
    255 		fli->fli_mountinfo = NULL;
    256 		fli->fli_self = NULL;
    257 	}
    258 	mutex_exit(&fstrans_lock);
    259 
    260 	for (fli = l->l_fstrans; fli; fli = fli_next) {
    261 		fli_next = fli->fli_succ;
    262 		pool_cache_put(fstrans_lwp_cache, fli);
    263 	}
    264 	l->l_fstrans = NULL;
    265 }
    266 
    267 /*
    268  * Dereference mount state.
    269  */
    270 static void
    271 fstrans_mount_dtor(struct fstrans_mount_info *fmi)
    272 {
    273 
    274 	KASSERT(mutex_owned(&fstrans_lock));
    275 
    276 	KASSERT(fmi != NULL);
    277 	fmi->fmi_ref_cnt -= 1;
    278 	if (__predict_true(fmi->fmi_ref_cnt > 0)) {
    279 		return;
    280 	}
    281 
    282 	KASSERT(fmi->fmi_state == FSTRANS_NORMAL);
    283 	KASSERT(LIST_FIRST(&fmi->fmi_cow_handler) == NULL);
    284 	KASSERT(fmi->fmi_owner == NULL);
    285 
    286 	KASSERT(fstrans_gone_count > 0);
    287 	fstrans_gone_count -= 1;
    288 
    289 	kmem_free(fmi->fmi_mount, sizeof(*fmi->fmi_mount));
    290 	kmem_free(fmi, sizeof(*fmi));
    291 }
    292 
    293 /*
    294  * Allocate mount state.
    295  */
    296 int
    297 fstrans_mount(struct mount *mp)
    298 {
    299 	struct fstrans_mount_info *newfmi;
    300 
    301 	newfmi = kmem_alloc(sizeof(*newfmi), KM_SLEEP);
    302 	newfmi->fmi_state = FSTRANS_NORMAL;
    303 	newfmi->fmi_ref_cnt = 1;
    304 	newfmi->fmi_gone = false;
    305 	LIST_INIT(&newfmi->fmi_cow_handler);
    306 	newfmi->fmi_cow_change = false;
    307 	newfmi->fmi_mount = mp;
    308 	newfmi->fmi_owner = NULL;
    309 
    310 	mutex_enter(&fstrans_lock);
    311 	mp->mnt_transinfo = newfmi;
    312 	fstrans_debug_mount(mp);
    313 	mutex_exit(&fstrans_lock);
    314 
    315 	return 0;
    316 }
    317 
    318 /*
    319  * Deallocate mount state.
    320  */
    321 void
    322 fstrans_unmount(struct mount *mp)
    323 {
    324 	struct fstrans_mount_info *fmi = mp->mnt_transinfo;
    325 
    326 	KASSERT(fmi != NULL);
    327 
    328 	mutex_enter(&fstrans_lock);
    329 	fstrans_debug_unmount(mp);
    330 	fmi->fmi_gone = true;
    331 	mp->mnt_transinfo = NULL;
    332 	fstrans_gone_count += 1;
    333 	fstrans_mount_dtor(fmi);
    334 	mutex_exit(&fstrans_lock);
    335 }
    336 
    337 /*
    338  * Clear mount entries whose mount is gone.
    339  */
    340 static void
    341 fstrans_clear_lwp_info(void)
    342 {
    343 	struct fstrans_lwp_info **p, *fli, *tofree = NULL;
    344 
    345 	/*
    346 	 * Scan our list clearing entries whose mount is gone.
    347 	 */
    348 	mutex_enter(&fstrans_lock);
    349 	for (p = &curlwp->l_fstrans; *p; ) {
    350 		fli = *p;
    351 		if (fli->fli_mount != NULL &&
    352 		    fli->fli_mountinfo->fmi_gone &&
    353 		    fli->fli_trans_cnt == 0 &&
    354 		    fli->fli_cow_cnt == 0 &&
    355 		    fli->fli_alias_cnt == 0) {
    356 			*p = (*p)->fli_succ;
    357 			fstrans_mount_dtor(fli->fli_mountinfo);
    358 			if (fli->fli_alias) {
    359 				KASSERT(fli->fli_alias->fli_alias_cnt > 0);
    360 				fli->fli_alias->fli_alias_cnt--;
    361 			}
    362 			fli->fli_mount = NULL;
    363 			fli->fli_alias = NULL;
    364 			fli->fli_mountinfo = NULL;
    365 			fli->fli_self = NULL;
    366 			p = &curlwp->l_fstrans;
    367 			fli->fli_succ = tofree;
    368 			tofree = fli;
    369 		} else {
    370 			p = &(*p)->fli_succ;
    371 		}
    372 	}
    373 #ifdef DIAGNOSTIC
    374 	for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ)
    375 		if (fli->fli_alias != NULL)
    376 			KASSERT(fli->fli_alias->fli_self == curlwp);
    377 #endif /* DIAGNOSTIC */
    378 	mutex_exit(&fstrans_lock);
    379 
    380 	while (tofree != NULL) {
    381 		fli = tofree;
    382 		tofree = fli->fli_succ;
    383 		pool_cache_put(fstrans_lwp_cache, fli);
    384 	}
    385 }
    386 
    387 /*
    388  * Allocate and return per lwp info for this mount.
    389  */
    390 static struct fstrans_lwp_info *
    391 fstrans_alloc_lwp_info(struct mount *mp)
    392 {
    393 	struct fstrans_lwp_info *fli;
    394 	struct fstrans_mount_info *fmi;
    395 
    396 	for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ) {
    397 		if (fli->fli_mount == mp)
    398 			return fli;
    399 	}
    400 
    401 	/*
    402 	 * Allocate a new entry.
    403 	 */
    404 	fli = pool_cache_get(fstrans_lwp_cache, PR_WAITOK);
    405 	KASSERT(fli->fli_trans_cnt == 0);
    406 	KASSERT(fli->fli_cow_cnt == 0);
    407 	KASSERT(fli->fli_alias_cnt == 0);
    408 	KASSERT(fli->fli_mount == NULL);
    409 	KASSERT(fli->fli_alias == NULL);
    410 	KASSERT(fli->fli_mountinfo == NULL);
    411 	KASSERT(fli->fli_self == NULL);
    412 	fli->fli_succ = curlwp->l_fstrans;
    413 	curlwp->l_fstrans = fli;
    414 
    415 	/*
    416 	 * Attach the entry to the mount if its mnt_transinfo is valid.
    417 	 */
    418 
    419 	mutex_enter(&fstrans_lock);
    420 	fli->fli_self = curlwp;
    421 	fstrans_debug_validate_mount(mp);
    422 	fmi = mp->mnt_transinfo;
    423 	KASSERT(fmi != NULL);
    424 	fli->fli_mount = mp;
    425 	fli->fli_mountinfo = fmi;
    426 	fmi->fmi_ref_cnt += 1;
    427 	do {
    428 		mp = mp->mnt_lower;
    429 	} while (mp && mp->mnt_lower);
    430 	mutex_exit(&fstrans_lock);
    431 
    432 	if (mp) {
    433 		fli->fli_alias = fstrans_alloc_lwp_info(mp);
    434 		fli->fli_alias->fli_alias_cnt++;
    435 		fli = fli->fli_alias;
    436 	}
    437 
    438 	return fli;
    439 }
    440 
    441 /*
    442  * Retrieve the per lwp info for this mount allocating if necessary.
    443  */
    444 static inline struct fstrans_lwp_info *
    445 fstrans_get_lwp_info(struct mount *mp, bool do_alloc)
    446 {
    447 	struct fstrans_lwp_info *fli;
    448 
    449 	/*
    450 	 * Scan our list for a match.
    451 	 */
    452 	for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ) {
    453 		if (fli->fli_mount == mp) {
    454 			KASSERT((mp->mnt_lower == NULL) ==
    455 			    (fli->fli_alias == NULL));
    456 			if (fli->fli_alias != NULL)
    457 				fli = fli->fli_alias;
    458 			break;
    459 		}
    460 	}
    461 
    462 	if (do_alloc) {
    463 		if (__predict_false(fli == NULL))
    464 			fli = fstrans_alloc_lwp_info(mp);
    465 		KASSERT(fli != NULL);
    466 		KASSERT(!fli->fli_mountinfo->fmi_gone);
    467 	} else {
    468 		KASSERT(fli != NULL);
    469 	}
    470 
    471 	return fli;
    472 }
    473 
    474 /*
    475  * Check if this lock type is granted at this state.
    476  */
    477 static bool
    478 grant_lock(const struct fstrans_mount_info *fmi,
    479     const enum fstrans_lock_type type)
    480 {
    481 
    482 	if (__predict_true(fmi->fmi_state == FSTRANS_NORMAL))
    483 		return true;
    484 	if (fmi->fmi_owner == curlwp)
    485 		return true;
    486 	if  (fmi->fmi_state == FSTRANS_SUSPENDING && type == FSTRANS_LAZY)
    487 		return true;
    488 
    489 	return false;
    490 }
    491 
    492 /*
    493  * Start a transaction.  If this thread already has a transaction on this
    494  * file system increment the reference counter.
    495  */
    496 static inline int
    497 _fstrans_start(struct mount *mp, enum fstrans_lock_type lock_type, int wait)
    498 {
    499 	int s;
    500 	struct fstrans_lwp_info *fli;
    501 	struct fstrans_mount_info *fmi;
    502 
    503 #ifndef FSTRANS_DEAD_ENABLED
    504 	if (mp == dead_rootmount)
    505 		return 0;
    506 #endif
    507 
    508 	ASSERT_SLEEPABLE();
    509 
    510 	fli = fstrans_get_lwp_info(mp, true);
    511 	fmi = fli->fli_mountinfo;
    512 
    513 	if (fli->fli_trans_cnt > 0) {
    514 		fli->fli_trans_cnt += 1;
    515 
    516 		return 0;
    517 	}
    518 
    519 	s = pserialize_read_enter();
    520 	if (__predict_true(grant_lock(fmi, lock_type))) {
    521 		fli->fli_trans_cnt = 1;
    522 		fli->fli_lock_type = lock_type;
    523 		pserialize_read_exit(s);
    524 
    525 		return 0;
    526 	}
    527 	pserialize_read_exit(s);
    528 
    529 	if (! wait)
    530 		return EBUSY;
    531 
    532 	mutex_enter(&fstrans_lock);
    533 	while (! grant_lock(fmi, lock_type))
    534 		cv_wait(&fstrans_state_cv, &fstrans_lock);
    535 	fli->fli_trans_cnt = 1;
    536 	fli->fli_lock_type = lock_type;
    537 	mutex_exit(&fstrans_lock);
    538 
    539 	return 0;
    540 }
    541 
    542 void
    543 fstrans_start(struct mount *mp)
    544 {
    545 	int error __diagused;
    546 
    547 	error = _fstrans_start(mp, FSTRANS_SHARED, 1);
    548 	KASSERT(error == 0);
    549 }
    550 
    551 int
    552 fstrans_start_nowait(struct mount *mp)
    553 {
    554 
    555 	return _fstrans_start(mp, FSTRANS_SHARED, 0);
    556 }
    557 
    558 void
    559 fstrans_start_lazy(struct mount *mp)
    560 {
    561 	int error __diagused;
    562 
    563 	error = _fstrans_start(mp, FSTRANS_LAZY, 1);
    564 	KASSERT(error == 0);
    565 }
    566 
    567 /*
    568  * Finish a transaction.
    569  */
    570 void
    571 fstrans_done(struct mount *mp)
    572 {
    573 	int s;
    574 	struct fstrans_lwp_info *fli;
    575 	struct fstrans_mount_info *fmi;
    576 
    577 #ifndef FSTRANS_DEAD_ENABLED
    578 	if (mp == dead_rootmount)
    579 		return;
    580 #endif
    581 
    582 	fli = fstrans_get_lwp_info(mp, false);
    583 	fmi = fli->fli_mountinfo;
    584 	KASSERT(fli->fli_trans_cnt > 0);
    585 
    586 	if (fli->fli_trans_cnt > 1) {
    587 		fli->fli_trans_cnt -= 1;
    588 
    589 		return;
    590 	}
    591 
    592 	if (__predict_false(fstrans_gone_count > 0))
    593 		fstrans_clear_lwp_info();
    594 
    595 	s = pserialize_read_enter();
    596 	if (__predict_true(fmi->fmi_state == FSTRANS_NORMAL)) {
    597 		fli->fli_trans_cnt = 0;
    598 		pserialize_read_exit(s);
    599 
    600 		return;
    601 	}
    602 	pserialize_read_exit(s);
    603 
    604 	mutex_enter(&fstrans_lock);
    605 	fli->fli_trans_cnt = 0;
    606 	cv_signal(&fstrans_count_cv);
    607 	mutex_exit(&fstrans_lock);
    608 }
    609 
    610 /*
    611  * Check if we hold an lock.
    612  */
    613 int
    614 fstrans_held(struct mount *mp)
    615 {
    616 	struct fstrans_lwp_info *fli;
    617 	struct fstrans_mount_info *fmi;
    618 
    619 	KASSERT(mp != dead_rootmount);
    620 
    621 	fli = fstrans_get_lwp_info(mp, true);
    622 	fmi = fli->fli_mountinfo;
    623 
    624 	return (fli->fli_trans_cnt > 0 || fmi->fmi_owner == curlwp);
    625 }
    626 
    627 /*
    628  * Check if this thread has an exclusive lock.
    629  */
    630 int
    631 fstrans_is_owner(struct mount *mp)
    632 {
    633 	struct fstrans_lwp_info *fli;
    634 	struct fstrans_mount_info *fmi;
    635 
    636 	KASSERT(mp != dead_rootmount);
    637 
    638 	fli = fstrans_get_lwp_info(mp, true);
    639 	fmi = fli->fli_mountinfo;
    640 
    641 	return (fmi->fmi_owner == curlwp);
    642 }
    643 
    644 /*
    645  * True, if no thread is in a transaction not granted at the current state.
    646  */
    647 static bool
    648 state_change_done(const struct fstrans_mount_info *fmi)
    649 {
    650 	struct fstrans_lwp_info *fli;
    651 
    652 	KASSERT(mutex_owned(&fstrans_lock));
    653 
    654 	LIST_FOREACH(fli, &fstrans_fli_head, fli_list) {
    655 		if (fli->fli_mountinfo != fmi)
    656 			continue;
    657 		if (fli->fli_trans_cnt == 0)
    658 			continue;
    659 		if (fli->fli_self == curlwp)
    660 			continue;
    661 		if (grant_lock(fmi, fli->fli_lock_type))
    662 			continue;
    663 
    664 		return false;
    665 	}
    666 
    667 	return true;
    668 }
    669 
    670 /*
    671  * Set new file system state.
    672  */
    673 int
    674 fstrans_setstate(struct mount *mp, enum fstrans_state new_state)
    675 {
    676 	int error;
    677 	enum fstrans_state old_state;
    678 	struct fstrans_lwp_info *fli;
    679 	struct fstrans_mount_info *fmi;
    680 
    681 	KASSERT(mp != dead_rootmount);
    682 
    683 	fli = fstrans_get_lwp_info(mp, true);
    684 	fmi = fli->fli_mountinfo;
    685 	old_state = fmi->fmi_state;
    686 	if (old_state == new_state)
    687 		return 0;
    688 
    689 	mutex_enter(&fstrans_lock);
    690 	fmi->fmi_state = new_state;
    691 	pserialize_perform(fstrans_psz);
    692 
    693 	/*
    694 	 * All threads see the new state now.
    695 	 * Wait for transactions invalid at this state to leave.
    696 	 */
    697 	error = 0;
    698 	while (! state_change_done(fmi)) {
    699 		error = cv_wait_sig(&fstrans_count_cv, &fstrans_lock);
    700 		if (error) {
    701 			new_state = fmi->fmi_state = FSTRANS_NORMAL;
    702 			break;
    703 		}
    704 	}
    705 	if (old_state != new_state) {
    706 		if (old_state == FSTRANS_NORMAL) {
    707 			KASSERT(fmi->fmi_owner == NULL);
    708 			fmi->fmi_owner = curlwp;
    709 		}
    710 		if (new_state == FSTRANS_NORMAL) {
    711 			KASSERT(fmi->fmi_owner == curlwp);
    712 			fmi->fmi_owner = NULL;
    713 		}
    714 	}
    715 	cv_broadcast(&fstrans_state_cv);
    716 	mutex_exit(&fstrans_lock);
    717 
    718 	return error;
    719 }
    720 
    721 /*
    722  * Get current file system state.
    723  */
    724 enum fstrans_state
    725 fstrans_getstate(struct mount *mp)
    726 {
    727 	struct fstrans_lwp_info *fli;
    728 	struct fstrans_mount_info *fmi;
    729 
    730 	KASSERT(mp != dead_rootmount);
    731 
    732 	fli = fstrans_get_lwp_info(mp, true);
    733 	fmi = fli->fli_mountinfo;
    734 
    735 	return fmi->fmi_state;
    736 }
    737 
    738 /*
    739  * Request a filesystem to suspend all operations.
    740  */
    741 int
    742 vfs_suspend(struct mount *mp, int nowait)
    743 {
    744 	struct fstrans_lwp_info *fli;
    745 	int error;
    746 
    747 	if (mp == dead_rootmount)
    748 		return EOPNOTSUPP;
    749 
    750 	fli = fstrans_get_lwp_info(mp, true);
    751 	mp = fli->fli_mount;
    752 
    753 	if (nowait) {
    754 		if (!mutex_tryenter(&vfs_suspend_lock))
    755 			return EWOULDBLOCK;
    756 	} else
    757 		mutex_enter(&vfs_suspend_lock);
    758 
    759 	if ((error = VFS_SUSPENDCTL(mp, SUSPEND_SUSPEND)) != 0)
    760 		mutex_exit(&vfs_suspend_lock);
    761 
    762 	return error;
    763 }
    764 
    765 /*
    766  * Request a filesystem to resume all operations.
    767  */
    768 void
    769 vfs_resume(struct mount *mp)
    770 {
    771 	struct fstrans_lwp_info *fli;
    772 
    773 	KASSERT(mp != dead_rootmount);
    774 
    775 	fli = fstrans_get_lwp_info(mp, false);
    776 	mp = fli->fli_mount;
    777 
    778 	VFS_SUSPENDCTL(mp, SUSPEND_RESUME);
    779 	mutex_exit(&vfs_suspend_lock);
    780 }
    781 
    782 
    783 /*
    784  * True, if no thread is running a cow handler.
    785  */
    786 static bool
    787 cow_state_change_done(const struct fstrans_mount_info *fmi)
    788 {
    789 	struct fstrans_lwp_info *fli;
    790 
    791 	KASSERT(mutex_owned(&fstrans_lock));
    792 	KASSERT(fmi->fmi_cow_change);
    793 
    794 	LIST_FOREACH(fli, &fstrans_fli_head, fli_list) {
    795 		if (fli->fli_mount != fmi->fmi_mount)
    796 			continue;
    797 		if (fli->fli_cow_cnt == 0)
    798 			continue;
    799 
    800 		return false;
    801 	}
    802 
    803 	return true;
    804 }
    805 
    806 /*
    807  * Prepare for changing this mounts cow list.
    808  * Returns with fstrans_lock locked.
    809  */
    810 static void
    811 cow_change_enter(struct fstrans_mount_info *fmi)
    812 {
    813 
    814 	mutex_enter(&fstrans_lock);
    815 
    816 	/*
    817 	 * Wait for other threads changing the list.
    818 	 */
    819 	while (fmi->fmi_cow_change)
    820 		cv_wait(&fstrans_state_cv, &fstrans_lock);
    821 
    822 	/*
    823 	 * Wait until all threads are aware of a state change.
    824 	 */
    825 	fmi->fmi_cow_change = true;
    826 	pserialize_perform(fstrans_psz);
    827 
    828 	while (! cow_state_change_done(fmi))
    829 		cv_wait(&fstrans_count_cv, &fstrans_lock);
    830 }
    831 
    832 /*
    833  * Done changing this mounts cow list.
    834  */
    835 static void
    836 cow_change_done(struct fstrans_mount_info *fmi)
    837 {
    838 
    839 	KASSERT(mutex_owned(&fstrans_lock));
    840 
    841 	fmi->fmi_cow_change = false;
    842 	pserialize_perform(fstrans_psz);
    843 
    844 	cv_broadcast(&fstrans_state_cv);
    845 
    846 	mutex_exit(&fstrans_lock);
    847 }
    848 
    849 /*
    850  * Add a handler to this mount.
    851  */
    852 int
    853 fscow_establish(struct mount *mp, int (*func)(void *, struct buf *, bool),
    854     void *arg)
    855 {
    856 	struct fstrans_mount_info *fmi;
    857 	struct fscow_handler *newch;
    858 
    859 	KASSERT(mp != dead_rootmount);
    860 
    861 	mutex_enter(&fstrans_lock);
    862 	fmi = mp->mnt_transinfo;
    863 	KASSERT(fmi != NULL);
    864 	fmi->fmi_ref_cnt += 1;
    865 	mutex_exit(&fstrans_lock);
    866 
    867 	newch = kmem_alloc(sizeof(*newch), KM_SLEEP);
    868 	newch->ch_func = func;
    869 	newch->ch_arg = arg;
    870 
    871 	cow_change_enter(fmi);
    872 	LIST_INSERT_HEAD(&fmi->fmi_cow_handler, newch, ch_list);
    873 	cow_change_done(fmi);
    874 
    875 	return 0;
    876 }
    877 
    878 /*
    879  * Remove a handler from this mount.
    880  */
    881 int
    882 fscow_disestablish(struct mount *mp, int (*func)(void *, struct buf *, bool),
    883     void *arg)
    884 {
    885 	struct fstrans_mount_info *fmi;
    886 	struct fscow_handler *hp = NULL;
    887 
    888 	KASSERT(mp != dead_rootmount);
    889 
    890 	fmi = mp->mnt_transinfo;
    891 	KASSERT(fmi != NULL);
    892 
    893 	cow_change_enter(fmi);
    894 	LIST_FOREACH(hp, &fmi->fmi_cow_handler, ch_list)
    895 		if (hp->ch_func == func && hp->ch_arg == arg)
    896 			break;
    897 	if (hp != NULL) {
    898 		LIST_REMOVE(hp, ch_list);
    899 		kmem_free(hp, sizeof(*hp));
    900 	}
    901 	fstrans_mount_dtor(fmi);
    902 	cow_change_done(fmi);
    903 
    904 	return hp ? 0 : EINVAL;
    905 }
    906 
    907 /*
    908  * Check for need to copy block that is about to be written.
    909  */
    910 int
    911 fscow_run(struct buf *bp, bool data_valid)
    912 {
    913 	int error, s;
    914 	struct mount *mp;
    915 	struct fstrans_lwp_info *fli;
    916 	struct fstrans_mount_info *fmi;
    917 	struct fscow_handler *hp;
    918 
    919 	/*
    920 	 * First check if we need run the copy-on-write handler.
    921 	 */
    922 	if ((bp->b_flags & B_COWDONE))
    923 		return 0;
    924 	if (bp->b_vp == NULL) {
    925 		bp->b_flags |= B_COWDONE;
    926 		return 0;
    927 	}
    928 	if (bp->b_vp->v_type == VBLK)
    929 		mp = spec_node_getmountedfs(bp->b_vp);
    930 	else
    931 		mp = bp->b_vp->v_mount;
    932 	if (mp == NULL || mp == dead_rootmount) {
    933 		bp->b_flags |= B_COWDONE;
    934 		return 0;
    935 	}
    936 
    937 	fli = fstrans_get_lwp_info(mp, true);
    938 	fmi = fli->fli_mountinfo;
    939 
    940 	/*
    941 	 * On non-recursed run check if other threads
    942 	 * want to change the list.
    943 	 */
    944 	if (fli->fli_cow_cnt == 0) {
    945 		s = pserialize_read_enter();
    946 		if (__predict_false(fmi->fmi_cow_change)) {
    947 			pserialize_read_exit(s);
    948 			mutex_enter(&fstrans_lock);
    949 			while (fmi->fmi_cow_change)
    950 				cv_wait(&fstrans_state_cv, &fstrans_lock);
    951 			fli->fli_cow_cnt = 1;
    952 			mutex_exit(&fstrans_lock);
    953 		} else {
    954 			fli->fli_cow_cnt = 1;
    955 			pserialize_read_exit(s);
    956 		}
    957 	} else
    958 		fli->fli_cow_cnt += 1;
    959 
    960 	/*
    961 	 * Run all copy-on-write handlers, stop on error.
    962 	 */
    963 	error = 0;
    964 	LIST_FOREACH(hp, &fmi->fmi_cow_handler, ch_list)
    965 		if ((error = (*hp->ch_func)(hp->ch_arg, bp, data_valid)) != 0)
    966 			break;
    967  	if (error == 0)
    968  		bp->b_flags |= B_COWDONE;
    969 
    970 	/*
    971 	 * Check if other threads want to change the list.
    972 	 */
    973 	if (fli->fli_cow_cnt > 1) {
    974 		fli->fli_cow_cnt -= 1;
    975 	} else {
    976 		s = pserialize_read_enter();
    977 		if (__predict_false(fmi->fmi_cow_change)) {
    978 			pserialize_read_exit(s);
    979 			mutex_enter(&fstrans_lock);
    980 			fli->fli_cow_cnt = 0;
    981 			cv_signal(&fstrans_count_cv);
    982 			mutex_exit(&fstrans_lock);
    983 		} else {
    984 			fli->fli_cow_cnt = 0;
    985 			pserialize_read_exit(s);
    986 		}
    987 	}
    988 
    989 	return error;
    990 }
    991 
    992 #if defined(DDB)
/* Prototype for the non-static dump routine defined below (DDB only). */
    993 void fstrans_dump(int);
    994 
    995 static void
    996 fstrans_print_lwp(struct proc *p, struct lwp *l, int verbose)
    997 {
    998 	char prefix[9];
    999 	struct fstrans_lwp_info *fli;
   1000 
   1001 	snprintf(prefix, sizeof(prefix), "%d.%d", p->p_pid, l->l_lid);
   1002 	LIST_FOREACH(fli, &fstrans_fli_head, fli_list) {
   1003 		if (fli->fli_self != l)
   1004 			continue;
   1005 		if (fli->fli_trans_cnt == 0 && fli->fli_cow_cnt == 0) {
   1006 			if (! verbose)
   1007 				continue;
   1008 		}
   1009 		printf("%-8s", prefix);
   1010 		if (verbose)
   1011 			printf(" @%p", fli);
   1012 		if (fli->fli_mount == dead_rootmount)
   1013 			printf(" <dead>");
   1014 		else if (fli->fli_mount != NULL)
   1015 			printf(" (%s)", fli->fli_mount->mnt_stat.f_mntonname);
   1016 		else
   1017 			printf(" NULL");
   1018 		if (fli->fli_alias != NULL) {
   1019 			struct mount *amp = fli->fli_alias->fli_mount;
   1020 
   1021 			printf(" alias");
   1022 			if (verbose)
   1023 				printf(" @%p", fli->fli_alias);
   1024 			if (amp == NULL)
   1025 				printf(" NULL");
   1026 			else
   1027 				printf(" (%s)", amp->mnt_stat.f_mntonname);
   1028 		}
   1029 		if (fli->fli_mountinfo && fli->fli_mountinfo->fmi_gone)
   1030 			printf(" gone");
   1031 		if (fli->fli_trans_cnt == 0) {
   1032 			printf(" -");
   1033 		} else {
   1034 			switch (fli->fli_lock_type) {
   1035 			case FSTRANS_LAZY:
   1036 				printf(" lazy");
   1037 				break;
   1038 			case FSTRANS_SHARED:
   1039 				printf(" shared");
   1040 				break;
   1041 			default:
   1042 				printf(" %#x", fli->fli_lock_type);
   1043 				break;
   1044 			}
   1045 		}
   1046 		printf(" %d cow %d alias %d\n",
   1047 		    fli->fli_trans_cnt, fli->fli_cow_cnt, fli->fli_alias_cnt);
   1048 		prefix[0] = '\0';
   1049 	}
   1050 }
   1051 
   1052 static void
   1053 fstrans_print_mount(struct mount *mp, int verbose)
   1054 {
   1055 	struct fstrans_mount_info *fmi;
   1056 
   1057 	fmi = mp->mnt_transinfo;
   1058 	if (!verbose && (fmi == NULL || fmi->fmi_state == FSTRANS_NORMAL))
   1059 		return;
   1060 
   1061 	printf("%-16s ", mp->mnt_stat.f_mntonname);
   1062 	if (fmi == NULL) {
   1063 		printf("(null)\n");
   1064 		return;
   1065 	}
   1066 	printf("owner %p ", fmi->fmi_owner);
   1067 	switch (fmi->fmi_state) {
   1068 	case FSTRANS_NORMAL:
   1069 		printf("state normal\n");
   1070 		break;
   1071 	case FSTRANS_SUSPENDING:
   1072 		printf("state suspending\n");
   1073 		break;
   1074 	case FSTRANS_SUSPENDED:
   1075 		printf("state suspended\n");
   1076 		break;
   1077 	default:
   1078 		printf("state %#x\n", fmi->fmi_state);
   1079 		break;
   1080 	}
   1081 }
   1082 
   1083 void
   1084 fstrans_dump(int full)
   1085 {
   1086 	const struct proclist_desc *pd;
   1087 	struct proc *p;
   1088 	struct lwp *l;
   1089 	struct mount *mp;
   1090 
   1091 	printf("Fstrans locks by lwp:\n");
   1092 	for (pd = proclists; pd->pd_list != NULL; pd++)
   1093 		PROCLIST_FOREACH(p, pd->pd_list)
   1094 			LIST_FOREACH(l, &p->p_lwps, l_sibling)
   1095 				fstrans_print_lwp(p, l, full == 1);
   1096 
   1097 	printf("Fstrans state by mount:\n");
   1098 	for (mp = _mountlist_next(NULL); mp; mp = _mountlist_next(mp))
   1099 		fstrans_print_mount(mp, full == 1);
   1100 }
   1101 #endif /* defined(DDB) */
   1102