Home | History | Annotate | Line # | Download | only in kern
      1 /*	$NetBSD: vfs_trans.c,v 1.73 2024/12/07 02:27:38 riastradh Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2007, 2020 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Juergen Hannken-Illjes.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 #include <sys/cdefs.h>
     33 __KERNEL_RCSID(0, "$NetBSD: vfs_trans.c,v 1.73 2024/12/07 02:27:38 riastradh Exp $");
     34 
     35 /*
     36  * File system transaction operations.
     37  */
     38 
     39 #ifdef _KERNEL_OPT
     40 #include "opt_ddb.h"
     41 #endif
     42 
     43 #include <sys/param.h>
     44 #include <sys/types.h>
     45 
     46 #include <sys/atomic.h>
     47 #include <sys/buf.h>
     48 #include <sys/fstrans.h>
     49 #include <sys/hash.h>
     50 #include <sys/kmem.h>
     51 #include <sys/mount.h>
     52 #include <sys/pool.h>
     53 #include <sys/proc.h>
     54 #include <sys/pserialize.h>
     55 #include <sys/sdt.h>
     56 #include <sys/systm.h>
     57 #include <sys/vnode.h>
     58 
     59 #include <miscfs/deadfs/deadfs.h>
     60 #include <miscfs/specfs/specdev.h>
     61 
/* Bucket count requested from hashinit() for the mount info hash table. */
#define FSTRANS_MOUNT_HASHSIZE	32

/*
 * Kind of transaction lock an lwp holds on a mount.  grant_lock()
 * grants every type while the mount is in state FSTRANS_NORMAL or when
 * the requesting lwp is the mount's fmi_owner; for all other lwps only
 * FSTRANS_LAZY is still granted in state FSTRANS_SUSPENDING.
 */
enum fstrans_lock_type {
	FSTRANS_LAZY,			/* Granted while not suspended */
	FSTRANS_SHARED			/* Granted while not suspending */
};
     68 
/*
 * One copy-on-write handler registered on a mount.  fscow_run() calls
 * ch_func(ch_arg, bp, data_valid) for every handler on the mount's list
 * and stops at the first handler returning non-zero.
 */
struct fscow_handler {
	/* Linkage on fstrans_mount_info::fmi_cow_handler. */
	LIST_ENTRY(fscow_handler) ch_list;
	/* Handler callback; non-zero return is treated as an error. */
	int (*ch_func)(void *, struct buf *, bool);
	/* Opaque first argument handed to ch_func. */
	void *ch_arg;
};
/*
 * Per lwp, per mount transaction state.  Entries are chained off
 * lwp::l_fstrans through fli_succ and, independent of that, every
 * constructed entry sits on the global fstrans_fli_head list.
 */
struct fstrans_lwp_info {
	/* Next entry of the same lwp (also chains entries queued to free). */
	struct fstrans_lwp_info *fli_succ;
	/* Owning lwp, NULL while the entry is unused. */
	struct lwp *fli_self;
	/* Mount this entry belongs to, NULL while the entry is unused. */
	struct mount *fli_mount;
	/* Entry of the lower mount that lookups are redirected to, or NULL. */
	struct fstrans_lwp_info *fli_alias;
	/* Mount info of fli_mount; the entry holds one fmi_ref_cnt on it. */
	struct fstrans_mount_info *fli_mountinfo;
	/* Nesting depth of transactions started by this lwp. */
	int fli_trans_cnt;
	/* Number of entries whose fli_alias points at this entry. */
	int fli_alias_cnt;
	/* Nesting depth of fscow_run() calls by this lwp. */
	int fli_cow_cnt;
	/* Lock type recorded when fli_trans_cnt went from 0 to 1. */
	enum fstrans_lock_type fli_lock_type;
	/* Linkage on fstrans_fli_head, managed by the pool cache ctor/dtor. */
	LIST_ENTRY(fstrans_lwp_info) fli_list;
};
/*
 * Per mount transaction state, found through the mount hash table
 * until fstrans_unmount() removes it and released by
 * fstrans_mount_dtor() once the last reference is gone.
 */
struct fstrans_mount_info {
	/* Current state as set by fstrans_setstate(). */
	enum fstrans_state fmi_state;
	/* Reference count; the info is freed when it drops to zero. */
	unsigned int fmi_ref_cnt;
	/* Set by fstrans_unmount(): the mount has been unmounted. */
	bool fmi_gone;
	/* True while a thread is changing fmi_cow_handler. */
	bool fmi_cow_change;
	/* Linkage on the hash chain in fstrans_mount_hashtab. */
	SLIST_ENTRY(fstrans_mount_info) fmi_hash;
	/* Registered copy-on-write handlers. */
	LIST_HEAD(, fscow_handler) fmi_cow_handler;
	/* The mount described by this info. */
	struct mount *fmi_mount;
	/* Referenced info of the lower mount, set on first lookup, or NULL. */
	struct fstrans_mount_info *fmi_lower_info;
	/* lwp that moved the state away from FSTRANS_NORMAL, or NULL. */
	struct lwp *fmi_owner;
};
/* Head type of one hash chain of fstrans_mount_info. */
SLIST_HEAD(fstrans_mount_hashhead, fstrans_mount_info);
     98 
/*
 * File-scope state.  vfs_suspend_lock is taken by vfs_suspend() and
 * released by vfs_resume(); fstrans_lock is held around all accesses to
 * the hash table, the global list and the reference counts below.
 */
static kmutex_t vfs_suspend_lock	/* Serialize suspensions. */
    __cacheline_aligned;
static kmutex_t fstrans_lock		/* Fstrans big lock. */
    __cacheline_aligned;
static kcondvar_t fstrans_state_cv;	/* Fstrans or cow state changed. */
static kcondvar_t fstrans_count_cv;	/* Fstrans or cow count changed. */
static pserialize_t fstrans_psz;	/* Pserialize state. */
static LIST_HEAD(fstrans_lwp_head, fstrans_lwp_info) fstrans_fli_head;
					/* List of all fstrans_lwp_info. */
static pool_cache_t fstrans_lwp_cache;	/* Cache of fstrans_lwp_info. */

/* Mask and bucket array of the mount info hash, set up by hashinit(). */
static u_long fstrans_mount_hashmask;
static struct fstrans_mount_hashhead *fstrans_mount_hashtab;
/*
 * Incremented by fstrans_unmount() and decremented when the mount info
 * is finally freed, i.e. counts infos that are unmounted but still
 * referenced.
 */
static int fstrans_gone_count;		/* Number of fstrans_mount_info gone. */

/* Forward declarations of the file-local helpers. */
static inline uint32_t fstrans_mount_hash(struct mount *);
static inline struct fstrans_mount_info *fstrans_mount_get(struct mount *);
static void fstrans_mount_dtor(struct fstrans_mount_info *);
static void fstrans_clear_lwp_info(void);
static inline struct fstrans_lwp_info *
    fstrans_get_lwp_info(struct mount *, bool);
static struct fstrans_lwp_info *fstrans_alloc_lwp_info(struct mount *);
static int fstrans_lwp_pcc(void *, void *, int);
static void fstrans_lwp_pcd(void *, void *);
static inline int _fstrans_start(struct mount *, enum fstrans_lock_type, int);
static bool grant_lock(const struct fstrans_mount_info *,
    const enum fstrans_lock_type);
static bool state_change_done(const struct fstrans_mount_info *);
static bool cow_state_change_done(const struct fstrans_mount_info *);
static void cow_change_enter(struct fstrans_mount_info *);
static void cow_change_done(struct fstrans_mount_info *);
    130 
    131 /*
    132  * Initialize.
    133  */
    134 void
    135 fstrans_init(void)
    136 {
    137 
    138 	mutex_init(&vfs_suspend_lock, MUTEX_DEFAULT, IPL_NONE);
    139 	mutex_init(&fstrans_lock, MUTEX_DEFAULT, IPL_NONE);
    140 	cv_init(&fstrans_state_cv, "fstchg");
    141 	cv_init(&fstrans_count_cv, "fstcnt");
    142 	fstrans_psz = pserialize_create();
    143 	LIST_INIT(&fstrans_fli_head);
    144 	fstrans_lwp_cache = pool_cache_init(sizeof(struct fstrans_lwp_info),
    145 	    coherency_unit, 0, 0, "fstlwp", NULL, IPL_NONE,
    146 	    fstrans_lwp_pcc, fstrans_lwp_pcd, NULL);
    147 	KASSERT(fstrans_lwp_cache != NULL);
    148 	fstrans_mount_hashtab = hashinit(FSTRANS_MOUNT_HASHSIZE, HASH_SLIST,
    149 	    true, &fstrans_mount_hashmask);
    150 }
    151 
    152 /*
    153  * pool_cache constructor for fstrans_lwp_info.  Updating the global list
    154  * produces cache misses on MP.  Minimise by keeping free entries on list.
    155  */
    156 int
    157 fstrans_lwp_pcc(void *arg, void *obj, int flags)
    158 {
    159 	struct fstrans_lwp_info *fli = obj;
    160 
    161 	memset(fli, 0, sizeof(*fli));
    162 
    163 	mutex_enter(&fstrans_lock);
    164 	LIST_INSERT_HEAD(&fstrans_fli_head, fli, fli_list);
    165 	mutex_exit(&fstrans_lock);
    166 
    167 	return 0;
    168 }
    169 
    170 /*
    171  * pool_cache destructor
    172  */
    173 void
    174 fstrans_lwp_pcd(void *arg, void *obj)
    175 {
    176 	struct fstrans_lwp_info *fli = obj;
    177 
    178 	mutex_enter(&fstrans_lock);
    179 	LIST_REMOVE(fli, fli_list);
    180 	mutex_exit(&fstrans_lock);
    181 }
    182 
    183 /*
    184  * Deallocate lwp state.
    185  */
    186 void
    187 fstrans_lwp_dtor(lwp_t *l)
    188 {
    189 	struct fstrans_lwp_info *fli, *fli_next;
    190 
    191 	if (l->l_fstrans == NULL)
    192 		return;
    193 
    194 	mutex_enter(&fstrans_lock);
    195 	for (fli = l->l_fstrans; fli; fli = fli_next) {
    196 		KASSERT(fli->fli_trans_cnt == 0);
    197 		KASSERT(fli->fli_cow_cnt == 0);
    198 		KASSERT(fli->fli_self == l);
    199 		if (fli->fli_mount != NULL)
    200 			fstrans_mount_dtor(fli->fli_mountinfo);
    201 		fli_next = fli->fli_succ;
    202 		fli->fli_alias_cnt = 0;
    203 		fli->fli_mount = NULL;
    204 		fli->fli_alias = NULL;
    205 		fli->fli_mountinfo = NULL;
    206 		fli->fli_self = NULL;
    207 	}
    208 	mutex_exit(&fstrans_lock);
    209 
    210 	for (fli = l->l_fstrans; fli; fli = fli_next) {
    211 		fli_next = fli->fli_succ;
    212 		pool_cache_put(fstrans_lwp_cache, fli);
    213 	}
    214 	l->l_fstrans = NULL;
    215 }
    216 
    217 /*
    218  * mount pointer to hash
    219  */
    220 static inline uint32_t
    221 fstrans_mount_hash(struct mount *mp)
    222 {
    223 
    224 	return hash32_buf(&mp, sizeof(mp), HASH32_BUF_INIT) &
    225 	    fstrans_mount_hashmask;
    226 }
    227 
    228 /*
    229  * retrieve fstrans_mount_info by mount or NULL
    230  */
    231 static inline struct fstrans_mount_info *
    232 fstrans_mount_get(struct mount *mp)
    233 {
    234 	uint32_t indx;
    235 	struct fstrans_mount_info *fmi, *fmi_lower;
    236 
    237 	KASSERT(mutex_owned(&fstrans_lock));
    238 
    239 	indx = fstrans_mount_hash(mp);
    240 	SLIST_FOREACH(fmi, &fstrans_mount_hashtab[indx], fmi_hash) {
    241 		if (fmi->fmi_mount == mp) {
    242 			if (__predict_false(mp->mnt_lower != NULL &&
    243 			    fmi->fmi_lower_info == NULL)) {
    244 				/*
    245 				 * Intern the lower/lowest mount into
    246 				 * this mount info on first lookup.
    247 				 */
    248 				KASSERT(fmi->fmi_ref_cnt == 1);
    249 
    250 				fmi_lower = fstrans_mount_get(mp->mnt_lower);
    251 				if (fmi_lower && fmi_lower->fmi_lower_info)
    252 					fmi_lower = fmi_lower->fmi_lower_info;
    253 				if (fmi_lower == NULL)
    254 					return NULL;
    255 				fmi->fmi_lower_info = fmi_lower;
    256 				fmi->fmi_lower_info->fmi_ref_cnt += 1;
    257 			}
    258 			return fmi;
    259 		}
    260 	}
    261 
    262 	return NULL;
    263 }
    264 
    265 /*
    266  * Dereference mount state.
    267  */
    268 static void
    269 fstrans_mount_dtor(struct fstrans_mount_info *fmi)
    270 {
    271 
    272 	KASSERT(mutex_owned(&fstrans_lock));
    273 
    274 	KASSERT(fmi != NULL);
    275 	fmi->fmi_ref_cnt -= 1;
    276 	if (__predict_true(fmi->fmi_ref_cnt > 0)) {
    277 		return;
    278 	}
    279 
    280 	KASSERT(fmi->fmi_state == FSTRANS_NORMAL);
    281 	KASSERT(LIST_FIRST(&fmi->fmi_cow_handler) == NULL);
    282 	KASSERT(fmi->fmi_owner == NULL);
    283 
    284 	if (fmi->fmi_lower_info)
    285 		fstrans_mount_dtor(fmi->fmi_lower_info);
    286 
    287 	KASSERT(fstrans_gone_count > 0);
    288 	fstrans_gone_count -= 1;
    289 
    290 	KASSERT(fmi->fmi_mount->mnt_lower == NULL);
    291 
    292 	kmem_free(fmi->fmi_mount, sizeof(*fmi->fmi_mount));
    293 	kmem_free(fmi, sizeof(*fmi));
    294 }
    295 
    296 /*
    297  * Allocate mount state.
    298  */
    299 int
    300 fstrans_mount(struct mount *mp)
    301 {
    302 	uint32_t indx;
    303 	struct fstrans_mount_info *newfmi;
    304 
    305 	indx = fstrans_mount_hash(mp);
    306 
    307 	newfmi = kmem_alloc(sizeof(*newfmi), KM_SLEEP);
    308 	newfmi->fmi_state = FSTRANS_NORMAL;
    309 	newfmi->fmi_ref_cnt = 1;
    310 	newfmi->fmi_gone = false;
    311 	LIST_INIT(&newfmi->fmi_cow_handler);
    312 	newfmi->fmi_cow_change = false;
    313 	newfmi->fmi_mount = mp;
    314 	newfmi->fmi_lower_info = NULL;
    315 	newfmi->fmi_owner = NULL;
    316 
    317 	mutex_enter(&fstrans_lock);
    318 	SLIST_INSERT_HEAD(&fstrans_mount_hashtab[indx], newfmi, fmi_hash);
    319 	mutex_exit(&fstrans_lock);
    320 
    321 	return 0;
    322 }
    323 
    324 /*
    325  * Deallocate mount state.
    326  */
    327 void
    328 fstrans_unmount(struct mount *mp)
    329 {
    330 	uint32_t indx;
    331 	struct fstrans_mount_info *fmi;
    332 
    333 	indx = fstrans_mount_hash(mp);
    334 
    335 	mutex_enter(&fstrans_lock);
    336 	fmi = fstrans_mount_get(mp);
    337 	KASSERT(fmi != NULL);
    338 	fmi->fmi_gone = true;
    339 	SLIST_REMOVE(&fstrans_mount_hashtab[indx],
    340 	    fmi, fstrans_mount_info, fmi_hash);
    341 	fstrans_gone_count += 1;
    342 	fstrans_mount_dtor(fmi);
    343 	mutex_exit(&fstrans_lock);
    344 }
    345 
    346 /*
    347  * Clear mount entries whose mount is gone.
    348  */
    349 static void
    350 fstrans_clear_lwp_info(void)
    351 {
    352 	struct fstrans_lwp_info **p, *fli, *tofree = NULL;
    353 
    354 	/*
    355 	 * Scan our list clearing entries whose mount is gone.
    356 	 */
    357 	mutex_enter(&fstrans_lock);
    358 	for (p = &curlwp->l_fstrans; *p; ) {
    359 		fli = *p;
    360 		if (fli->fli_mount != NULL &&
    361 		    fli->fli_mountinfo->fmi_gone &&
    362 		    fli->fli_trans_cnt == 0 &&
    363 		    fli->fli_cow_cnt == 0 &&
    364 		    fli->fli_alias_cnt == 0) {
    365 			*p = (*p)->fli_succ;
    366 			fstrans_mount_dtor(fli->fli_mountinfo);
    367 			if (fli->fli_alias) {
    368 				KASSERT(fli->fli_alias->fli_alias_cnt > 0);
    369 				fli->fli_alias->fli_alias_cnt--;
    370 			}
    371 			fli->fli_mount = NULL;
    372 			fli->fli_alias = NULL;
    373 			fli->fli_mountinfo = NULL;
    374 			fli->fli_self = NULL;
    375 			p = &curlwp->l_fstrans;
    376 			fli->fli_succ = tofree;
    377 			tofree = fli;
    378 		} else {
    379 			p = &(*p)->fli_succ;
    380 		}
    381 	}
    382 #ifdef DIAGNOSTIC
    383 	for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ)
    384 		if (fli->fli_alias != NULL)
    385 			KASSERT(fli->fli_alias->fli_self == curlwp);
    386 #endif /* DIAGNOSTIC */
    387 	mutex_exit(&fstrans_lock);
    388 
    389 	while (tofree != NULL) {
    390 		fli = tofree;
    391 		tofree = fli->fli_succ;
    392 		pool_cache_put(fstrans_lwp_cache, fli);
    393 	}
    394 }
    395 
    396 /*
    397  * Allocate and return per lwp info for this mount.
    398  */
    399 static struct fstrans_lwp_info *
    400 fstrans_alloc_lwp_info(struct mount *mp)
    401 {
    402 	struct fstrans_lwp_info *fli, *fli_lower;
    403 	struct fstrans_mount_info *fmi;
    404 
    405 	for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ) {
    406 		if (fli->fli_mount == mp)
    407 			return fli;
    408 	}
    409 
    410 	/*
    411 	 * Lookup mount info and get lower mount per lwp info.
    412 	 */
    413 	mutex_enter(&fstrans_lock);
    414 	fmi = fstrans_mount_get(mp);
    415 	if (fmi == NULL) {
    416 		mutex_exit(&fstrans_lock);
    417 		return NULL;
    418 	}
    419 	fmi->fmi_ref_cnt += 1;
    420 	mutex_exit(&fstrans_lock);
    421 
    422 	if (fmi->fmi_lower_info) {
    423 		fli_lower =
    424 		    fstrans_alloc_lwp_info(fmi->fmi_lower_info->fmi_mount);
    425 		if (fli_lower == NULL) {
    426 			mutex_enter(&fstrans_lock);
    427 			fstrans_mount_dtor(fmi);
    428 			mutex_exit(&fstrans_lock);
    429 
    430 			return NULL;
    431 		}
    432 	} else {
    433 		fli_lower = NULL;
    434 	}
    435 
    436 	/*
    437 	 * Allocate a new entry.
    438 	 */
    439 	fli = pool_cache_get(fstrans_lwp_cache, PR_WAITOK);
    440 	KASSERT(fli->fli_trans_cnt == 0);
    441 	KASSERT(fli->fli_cow_cnt == 0);
    442 	KASSERT(fli->fli_alias_cnt == 0);
    443 	KASSERT(fli->fli_mount == NULL);
    444 	KASSERT(fli->fli_alias == NULL);
    445 	KASSERT(fli->fli_mountinfo == NULL);
    446 	KASSERT(fli->fli_self == NULL);
    447 
    448 	/*
    449 	 * Attach the mount info and alias.
    450 	 */
    451 
    452 	fli->fli_self = curlwp;
    453 	fli->fli_mount = mp;
    454 	fli->fli_mountinfo = fmi;
    455 
    456 	fli->fli_succ = curlwp->l_fstrans;
    457 	curlwp->l_fstrans = fli;
    458 
    459 	if (fli_lower) {
    460 		fli->fli_alias = fli_lower;
    461 		fli->fli_alias->fli_alias_cnt++;
    462 		fli = fli->fli_alias;
    463 	}
    464 
    465 	return fli;
    466 }
    467 
    468 /*
    469  * Retrieve the per lwp info for this mount allocating if necessary.
    470  */
    471 static inline struct fstrans_lwp_info *
    472 fstrans_get_lwp_info(struct mount *mp, bool do_alloc)
    473 {
    474 	struct fstrans_lwp_info *fli;
    475 
    476 	/*
    477 	 * Scan our list for a match.
    478 	 */
    479 	for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ) {
    480 		if (fli->fli_mount == mp) {
    481 			KASSERT(mp->mnt_lower == NULL ||
    482 			    fli->fli_alias != NULL);
    483 			if (fli->fli_alias != NULL)
    484 				fli = fli->fli_alias;
    485 			break;
    486 		}
    487 	}
    488 
    489 	if (do_alloc) {
    490 		if (__predict_false(fli == NULL))
    491 			fli = fstrans_alloc_lwp_info(mp);
    492 	}
    493 
    494 	return fli;
    495 }
    496 
    497 /*
    498  * Check if this lock type is granted at this state.
    499  */
    500 static bool
    501 grant_lock(const struct fstrans_mount_info *fmi,
    502     const enum fstrans_lock_type type)
    503 {
    504 
    505 	if (__predict_true(fmi->fmi_state == FSTRANS_NORMAL))
    506 		return true;
    507 	if (fmi->fmi_owner == curlwp)
    508 		return true;
    509 	if  (fmi->fmi_state == FSTRANS_SUSPENDING && type == FSTRANS_LAZY)
    510 		return true;
    511 
    512 	return false;
    513 }
    514 
    515 /*
    516  * Start a transaction.  If this thread already has a transaction on this
    517  * file system increment the reference counter.
    518  */
    519 static inline int
    520 _fstrans_start(struct mount *mp, enum fstrans_lock_type lock_type, int wait)
    521 {
    522 	int s;
    523 	struct fstrans_lwp_info *fli;
    524 	struct fstrans_mount_info *fmi;
    525 
    526 	ASSERT_SLEEPABLE();
    527 
    528 	fli = fstrans_get_lwp_info(mp, true);
    529 	if (fli == NULL)
    530 		return 0;
    531 	fmi = fli->fli_mountinfo;
    532 
    533 	if (fli->fli_trans_cnt > 0) {
    534 		fli->fli_trans_cnt += 1;
    535 
    536 		return 0;
    537 	}
    538 
    539 	s = pserialize_read_enter();
    540 	if (__predict_true(grant_lock(fmi, lock_type))) {
    541 		fli->fli_trans_cnt = 1;
    542 		fli->fli_lock_type = lock_type;
    543 		pserialize_read_exit(s);
    544 
    545 		return 0;
    546 	}
    547 	pserialize_read_exit(s);
    548 
    549 	if (! wait)
    550 		return SET_ERROR(EBUSY);
    551 
    552 	mutex_enter(&fstrans_lock);
    553 	while (! grant_lock(fmi, lock_type))
    554 		cv_wait(&fstrans_state_cv, &fstrans_lock);
    555 	fli->fli_trans_cnt = 1;
    556 	fli->fli_lock_type = lock_type;
    557 	mutex_exit(&fstrans_lock);
    558 
    559 	return 0;
    560 }
    561 
    562 void
    563 fstrans_start(struct mount *mp)
    564 {
    565 	int error __diagused;
    566 
    567 	error = _fstrans_start(mp, FSTRANS_SHARED, 1);
    568 	KASSERT(error == 0);
    569 }
    570 
    571 int
    572 fstrans_start_nowait(struct mount *mp)
    573 {
    574 
    575 	return _fstrans_start(mp, FSTRANS_SHARED, 0);
    576 }
    577 
    578 void
    579 fstrans_start_lazy(struct mount *mp)
    580 {
    581 	int error __diagused;
    582 
    583 	error = _fstrans_start(mp, FSTRANS_LAZY, 1);
    584 	KASSERT(error == 0);
    585 }
    586 
    587 /*
    588  * Finish a transaction.
    589  */
    590 void
    591 fstrans_done(struct mount *mp)
    592 {
    593 	int s;
    594 	struct fstrans_lwp_info *fli;
    595 	struct fstrans_mount_info *fmi;
    596 
    597 	fli = fstrans_get_lwp_info(mp, false);
    598 	if (fli == NULL)
    599 		return;
    600 	fmi = fli->fli_mountinfo;
    601 	KASSERT(fli->fli_trans_cnt > 0);
    602 
    603 	if (fli->fli_trans_cnt > 1) {
    604 		fli->fli_trans_cnt -= 1;
    605 
    606 		return;
    607 	}
    608 
    609 	if (__predict_false(fstrans_gone_count > 0))
    610 		fstrans_clear_lwp_info();
    611 
    612 	s = pserialize_read_enter();
    613 	if (__predict_true(fmi->fmi_state == FSTRANS_NORMAL)) {
    614 		fli->fli_trans_cnt = 0;
    615 		pserialize_read_exit(s);
    616 
    617 		return;
    618 	}
    619 	pserialize_read_exit(s);
    620 
    621 	mutex_enter(&fstrans_lock);
    622 	fli->fli_trans_cnt = 0;
    623 	cv_signal(&fstrans_count_cv);
    624 	mutex_exit(&fstrans_lock);
    625 }
    626 
    627 /*
    628  * Check if we hold an lock.
    629  */
    630 int
    631 fstrans_held(struct mount *mp)
    632 {
    633 	struct fstrans_lwp_info *fli;
    634 	struct fstrans_mount_info *fmi;
    635 
    636 	KASSERT(mp != dead_rootmount);
    637 
    638 	fli = fstrans_get_lwp_info(mp, false);
    639 	if (fli == NULL)
    640 		return 0;
    641 	fmi = fli->fli_mountinfo;
    642 
    643 	return (fli->fli_trans_cnt > 0 || fmi->fmi_owner == curlwp);
    644 }
    645 
    646 /*
    647  * Check if this thread has an exclusive lock.
    648  */
    649 int
    650 fstrans_is_owner(struct mount *mp)
    651 {
    652 	struct fstrans_lwp_info *fli;
    653 	struct fstrans_mount_info *fmi;
    654 
    655 	KASSERT(mp != dead_rootmount);
    656 
    657 	fli = fstrans_get_lwp_info(mp, false);
    658 	if (fli == NULL)
    659 		return 0;
    660 	fmi = fli->fli_mountinfo;
    661 
    662 	return (fmi->fmi_owner == curlwp);
    663 }
    664 
    665 /*
    666  * True, if no thread is in a transaction not granted at the current state.
    667  */
    668 static bool
    669 state_change_done(const struct fstrans_mount_info *fmi)
    670 {
    671 	struct fstrans_lwp_info *fli;
    672 
    673 	KASSERT(mutex_owned(&fstrans_lock));
    674 
    675 	LIST_FOREACH(fli, &fstrans_fli_head, fli_list) {
    676 		if (fli->fli_mountinfo != fmi)
    677 			continue;
    678 		if (fli->fli_trans_cnt == 0)
    679 			continue;
    680 		if (fli->fli_self == curlwp)
    681 			continue;
    682 		if (grant_lock(fmi, fli->fli_lock_type))
    683 			continue;
    684 
    685 		return false;
    686 	}
    687 
    688 	return true;
    689 }
    690 
    691 /*
    692  * Set new file system state.
    693  */
    694 int
    695 fstrans_setstate(struct mount *mp, enum fstrans_state new_state)
    696 {
    697 	int error;
    698 	enum fstrans_state old_state;
    699 	struct fstrans_lwp_info *fli;
    700 	struct fstrans_mount_info *fmi;
    701 
    702 	KASSERT(mp != dead_rootmount);
    703 
    704 	fli = fstrans_get_lwp_info(mp, true);
    705 	if (fli == NULL)
    706 		return SET_ERROR(ENOENT);
    707 	fmi = fli->fli_mountinfo;
    708 	old_state = fmi->fmi_state;
    709 	if (old_state == new_state)
    710 		return 0;
    711 
    712 	mutex_enter(&fstrans_lock);
    713 	fmi->fmi_state = new_state;
    714 	pserialize_perform(fstrans_psz);
    715 
    716 	/*
    717 	 * All threads see the new state now.
    718 	 * Wait for transactions invalid at this state to leave.
    719 	 */
    720 	error = 0;
    721 	while (! state_change_done(fmi)) {
    722 		error = cv_wait_sig(&fstrans_count_cv, &fstrans_lock);
    723 		if (error) {
    724 			new_state = fmi->fmi_state = FSTRANS_NORMAL;
    725 			break;
    726 		}
    727 	}
    728 	if (old_state != new_state) {
    729 		if (old_state == FSTRANS_NORMAL) {
    730 			KASSERT(fmi->fmi_owner == NULL);
    731 			fmi->fmi_owner = curlwp;
    732 		}
    733 		if (new_state == FSTRANS_NORMAL) {
    734 			KASSERT(fmi->fmi_owner == curlwp);
    735 			fmi->fmi_owner = NULL;
    736 		}
    737 	}
    738 	cv_broadcast(&fstrans_state_cv);
    739 	mutex_exit(&fstrans_lock);
    740 
    741 	return error;
    742 }
    743 
    744 /*
    745  * Get current file system state.
    746  */
    747 enum fstrans_state
    748 fstrans_getstate(struct mount *mp)
    749 {
    750 	struct fstrans_lwp_info *fli;
    751 	struct fstrans_mount_info *fmi;
    752 
    753 	KASSERT(mp != dead_rootmount);
    754 
    755 	fli = fstrans_get_lwp_info(mp, true);
    756 	KASSERT(fli != NULL);
    757 	fmi = fli->fli_mountinfo;
    758 
    759 	return fmi->fmi_state;
    760 }
    761 
    762 /*
    763  * Request a filesystem to suspend all operations.
    764  */
    765 int
    766 vfs_suspend(struct mount *mp, int nowait)
    767 {
    768 	struct fstrans_lwp_info *fli;
    769 	int error;
    770 
    771 	if (mp == dead_rootmount)
    772 		return SET_ERROR(EOPNOTSUPP);
    773 
    774 	fli = fstrans_get_lwp_info(mp, true);
    775 	if (fli == NULL)
    776 		return SET_ERROR(ENOENT);
    777 
    778 	if (nowait) {
    779 		if (!mutex_tryenter(&vfs_suspend_lock))
    780 			return SET_ERROR(EWOULDBLOCK);
    781 	} else
    782 		mutex_enter(&vfs_suspend_lock);
    783 
    784 	if ((error = VFS_SUSPENDCTL(fli->fli_mount, SUSPEND_SUSPEND)) != 0) {
    785 		mutex_exit(&vfs_suspend_lock);
    786 		return error;
    787 	}
    788 
    789 	if ((mp->mnt_iflag & IMNT_GONE) != 0) {
    790 		vfs_resume(mp);
    791 		return SET_ERROR(ENOENT);
    792 	}
    793 
    794 	return 0;
    795 }
    796 
    797 /*
    798  * Request a filesystem to resume all operations.
    799  */
    800 void
    801 vfs_resume(struct mount *mp)
    802 {
    803 	struct fstrans_lwp_info *fli;
    804 
    805 	KASSERT(mp != dead_rootmount);
    806 
    807 	fli = fstrans_get_lwp_info(mp, false);
    808 	mp = fli->fli_mount;
    809 
    810 	VFS_SUSPENDCTL(mp, SUSPEND_RESUME);
    811 	mutex_exit(&vfs_suspend_lock);
    812 }
    813 
    814 /*
    815  * True, if no thread is running a cow handler.
    816  */
    817 static bool
    818 cow_state_change_done(const struct fstrans_mount_info *fmi)
    819 {
    820 	struct fstrans_lwp_info *fli;
    821 
    822 	KASSERT(mutex_owned(&fstrans_lock));
    823 	KASSERT(fmi->fmi_cow_change);
    824 
    825 	LIST_FOREACH(fli, &fstrans_fli_head, fli_list) {
    826 		if (fli->fli_mount != fmi->fmi_mount)
    827 			continue;
    828 		if (fli->fli_cow_cnt == 0)
    829 			continue;
    830 
    831 		return false;
    832 	}
    833 
    834 	return true;
    835 }
    836 
    837 /*
    838  * Prepare for changing this mounts cow list.
    839  * Returns with fstrans_lock locked.
    840  */
    841 static void
    842 cow_change_enter(struct fstrans_mount_info *fmi)
    843 {
    844 
    845 	mutex_enter(&fstrans_lock);
    846 
    847 	/*
    848 	 * Wait for other threads changing the list.
    849 	 */
    850 	while (fmi->fmi_cow_change)
    851 		cv_wait(&fstrans_state_cv, &fstrans_lock);
    852 
    853 	/*
    854 	 * Wait until all threads are aware of a state change.
    855 	 */
    856 	fmi->fmi_cow_change = true;
    857 	pserialize_perform(fstrans_psz);
    858 
    859 	while (! cow_state_change_done(fmi))
    860 		cv_wait(&fstrans_count_cv, &fstrans_lock);
    861 }
    862 
/*
 * Done changing this mount's cow list.
 * Must be entered with fstrans_lock held, as left by cow_change_enter(),
 * and releases it before returning.
 */
static void
cow_change_done(struct fstrans_mount_info *fmi)
{

	KASSERT(mutex_owned(&fstrans_lock));

	/* Clear the flag, then let the update pass through pserialize. */
	fmi->fmi_cow_change = false;
	pserialize_perform(fstrans_psz);

	/* Wake threads waiting for fmi_cow_change to clear. */
	cv_broadcast(&fstrans_state_cv);

	mutex_exit(&fstrans_lock);
}
    879 
    880 /*
    881  * Add a handler to this mount.
    882  */
    883 int
    884 fscow_establish(struct mount *mp, int (*func)(void *, struct buf *, bool),
    885     void *arg)
    886 {
    887 	struct fstrans_mount_info *fmi;
    888 	struct fscow_handler *newch;
    889 
    890 	KASSERT(mp != dead_rootmount);
    891 
    892 	mutex_enter(&fstrans_lock);
    893 	fmi = fstrans_mount_get(mp);
    894 	KASSERT(fmi != NULL);
    895 	fmi->fmi_ref_cnt += 1;
    896 	mutex_exit(&fstrans_lock);
    897 
    898 	newch = kmem_alloc(sizeof(*newch), KM_SLEEP);
    899 	newch->ch_func = func;
    900 	newch->ch_arg = arg;
    901 
    902 	cow_change_enter(fmi);
    903 	LIST_INSERT_HEAD(&fmi->fmi_cow_handler, newch, ch_list);
    904 	cow_change_done(fmi);
    905 
    906 	return 0;
    907 }
    908 
    909 /*
    910  * Remove a handler from this mount.
    911  */
    912 int
    913 fscow_disestablish(struct mount *mp, int (*func)(void *, struct buf *, bool),
    914     void *arg)
    915 {
    916 	struct fstrans_mount_info *fmi;
    917 	struct fscow_handler *hp = NULL;
    918 
    919 	KASSERT(mp != dead_rootmount);
    920 
    921 	mutex_enter(&fstrans_lock);
    922 	fmi = fstrans_mount_get(mp);
    923 	KASSERT(fmi != NULL);
    924 	mutex_exit(&fstrans_lock);
    925 
    926 	cow_change_enter(fmi);
    927 	LIST_FOREACH(hp, &fmi->fmi_cow_handler, ch_list)
    928 		if (hp->ch_func == func && hp->ch_arg == arg)
    929 			break;
    930 	if (hp != NULL) {
    931 		LIST_REMOVE(hp, ch_list);
    932 		kmem_free(hp, sizeof(*hp));
    933 	}
    934 	fstrans_mount_dtor(fmi);
    935 	cow_change_done(fmi);
    936 
    937 	return hp ? 0 : SET_ERROR(EINVAL);
    938 }
    939 
/*
 * Check for need to copy block that is about to be written.
 *
 * Runs the copy-on-write handlers registered on the mount the buffer
 * belongs to.  Returns 0 if nothing had to be done or all handlers
 * succeeded, in which case B_COWDONE is set on the buffer, else the
 * error of the first failing handler.
 */
int
fscow_run(struct buf *bp, bool data_valid)
{
	int error, s;
	struct mount *mp;
	struct fstrans_lwp_info *fli;
	struct fstrans_mount_info *fmi;
	struct fscow_handler *hp;

	/*
	 * First check if we need to run the copy-on-write handler.
	 * Buffers without vnode or mount, and buffers on the dead root
	 * mount, are marked done without running anything.
	 */
	if ((bp->b_flags & B_COWDONE))
		return 0;
	if (bp->b_vp == NULL) {
		bp->b_flags |= B_COWDONE;
		return 0;
	}
	/* For a block device use the file system mounted on it. */
	if (bp->b_vp->v_type == VBLK)
		mp = spec_node_getmountedfs(bp->b_vp);
	else
		mp = bp->b_vp->v_mount;
	if (mp == NULL || mp == dead_rootmount) {
		bp->b_flags |= B_COWDONE;
		return 0;
	}

	fli = fstrans_get_lwp_info(mp, true);
	KASSERT(fli != NULL);
	fmi = fli->fli_mountinfo;

	/*
	 * On non-recursed run check if other threads
	 * want to change the list.
	 */
	if (fli->fli_cow_cnt == 0) {
		s = pserialize_read_enter();
		if (__predict_false(fmi->fmi_cow_change)) {
			/*
			 * A change is in progress: leave the read
			 * section and sleep until it is finished.
			 */
			pserialize_read_exit(s);
			mutex_enter(&fstrans_lock);
			while (fmi->fmi_cow_change)
				cv_wait(&fstrans_state_cv, &fstrans_lock);
			fli->fli_cow_cnt = 1;
			mutex_exit(&fstrans_lock);
		} else {
			/* No change pending: count inside the read section. */
			fli->fli_cow_cnt = 1;
			pserialize_read_exit(s);
		}
	} else
		fli->fli_cow_cnt += 1;

	/*
	 * Run all copy-on-write handlers, stop on error.
	 */
	error = 0;
	LIST_FOREACH(hp, &fmi->fmi_cow_handler, ch_list)
		if ((error = (*hp->ch_func)(hp->ch_arg, bp, data_valid)) != 0)
			break;
	if (error == 0)
		bp->b_flags |= B_COWDONE;

	/*
	 * Check if other threads want to change the list.
	 */
	if (fli->fli_cow_cnt > 1) {
		/* Leaving a recursed run, the outer run is still active. */
		fli->fli_cow_cnt -= 1;
	} else {
		s = pserialize_read_enter();
		if (__predict_false(fmi->fmi_cow_change)) {
			/*
			 * A changer may be waiting for our count to
			 * drop: clear it under the lock and wake it.
			 */
			pserialize_read_exit(s);
			mutex_enter(&fstrans_lock);
			fli->fli_cow_cnt = 0;
			cv_signal(&fstrans_count_cv);
			mutex_exit(&fstrans_lock);
		} else {
			fli->fli_cow_cnt = 0;
			pserialize_read_exit(s);
		}
	}

	return error;
}
   1025 
   1026 #if defined(DDB)
   1027 void fstrans_dump(int);
   1028 
   1029 static void
   1030 fstrans_print_lwp(struct proc *p, struct lwp *l, int verbose)
   1031 {
   1032 	char prefix[9];
   1033 	struct fstrans_lwp_info *fli;
   1034 
   1035 	snprintf(prefix, sizeof(prefix), "%d.%d", p->p_pid, l->l_lid);
   1036 	LIST_FOREACH(fli, &fstrans_fli_head, fli_list) {
   1037 		if (fli->fli_self != l)
   1038 			continue;
   1039 		if (fli->fli_trans_cnt == 0 && fli->fli_cow_cnt == 0) {
   1040 			if (! verbose)
   1041 				continue;
   1042 		}
   1043 		printf("%-8s", prefix);
   1044 		if (verbose)
   1045 			printf(" @%p", fli);
   1046 		if (fli->fli_mount == dead_rootmount)
   1047 			printf(" <dead>");
   1048 		else if (fli->fli_mount != NULL)
   1049 			printf(" (%s)", fli->fli_mount->mnt_stat.f_mntonname);
   1050 		else
   1051 			printf(" NULL");
   1052 		if (fli->fli_alias != NULL) {
   1053 			struct mount *amp = fli->fli_alias->fli_mount;
   1054 
   1055 			printf(" alias");
   1056 			if (verbose)
   1057 				printf(" @%p", fli->fli_alias);
   1058 			if (amp == NULL)
   1059 				printf(" NULL");
   1060 			else
   1061 				printf(" (%s)", amp->mnt_stat.f_mntonname);
   1062 		}
   1063 		if (fli->fli_mountinfo && fli->fli_mountinfo->fmi_gone)
   1064 			printf(" gone");
   1065 		if (fli->fli_trans_cnt == 0) {
   1066 			printf(" -");
   1067 		} else {
   1068 			switch (fli->fli_lock_type) {
   1069 			case FSTRANS_LAZY:
   1070 				printf(" lazy");
   1071 				break;
   1072 			case FSTRANS_SHARED:
   1073 				printf(" shared");
   1074 				break;
   1075 			default:
   1076 				printf(" %#x", fli->fli_lock_type);
   1077 				break;
   1078 			}
   1079 		}
   1080 		printf(" %d cow %d alias %d\n",
   1081 		    fli->fli_trans_cnt, fli->fli_cow_cnt, fli->fli_alias_cnt);
   1082 		prefix[0] = '\0';
   1083 	}
   1084 }
   1085 
   1086 static void
   1087 fstrans_print_mount(struct mount *mp, int verbose)
   1088 {
   1089 	uint32_t indx;
   1090 	struct fstrans_mount_info *fmi;
   1091 
   1092 	indx = fstrans_mount_hash(mp);
   1093 	SLIST_FOREACH(fmi, &fstrans_mount_hashtab[indx], fmi_hash)
   1094 		if (fmi->fmi_mount == mp)
   1095 			break;
   1096 
   1097 	if (!verbose && (fmi == NULL || fmi->fmi_state == FSTRANS_NORMAL))
   1098 		return;
   1099 
   1100 	printf("%-16s ", mp->mnt_stat.f_mntonname);
   1101 	if (fmi == NULL) {
   1102 		printf("(null)\n");
   1103 		return;
   1104 	}
   1105 	printf("owner %p ", fmi->fmi_owner);
   1106 	switch (fmi->fmi_state) {
   1107 	case FSTRANS_NORMAL:
   1108 		printf("state normal\n");
   1109 		break;
   1110 	case FSTRANS_SUSPENDING:
   1111 		printf("state suspending\n");
   1112 		break;
   1113 	case FSTRANS_SUSPENDED:
   1114 		printf("state suspended\n");
   1115 		break;
   1116 	default:
   1117 		printf("state %#x\n", fmi->fmi_state);
   1118 		break;
   1119 	}
   1120 }
   1121 
   1122 void
   1123 fstrans_dump(int full)
   1124 {
   1125 	const struct proclist_desc *pd;
   1126 	struct proc *p;
   1127 	struct lwp *l;
   1128 	struct mount *mp;
   1129 
   1130 	printf("Fstrans locks by lwp:\n");
   1131 	for (pd = proclists; pd->pd_list != NULL; pd++)
   1132 		PROCLIST_FOREACH(p, pd->pd_list)
   1133 			LIST_FOREACH(l, &p->p_lwps, l_sibling)
   1134 				fstrans_print_lwp(p, l, full == 1);
   1135 
   1136 	printf("Fstrans state by mount:\n");
   1137 	for (mp = _mountlist_next(NULL); mp; mp = _mountlist_next(mp))
   1138 		fstrans_print_mount(mp, full == 1);
   1139 }
   1140 #endif /* defined(DDB) */
   1141