Home | History | Annotate | Line # | Download | only in zfs
      1  1.1  haad /*
      2  1.1  haad  * CDDL HEADER START
      3  1.1  haad  *
      4  1.1  haad  * The contents of this file are subject to the terms of the
      5  1.1  haad  * Common Development and Distribution License (the "License").
      6  1.1  haad  * You may not use this file except in compliance with the License.
      7  1.1  haad  *
      8  1.1  haad  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  1.1  haad  * or http://www.opensolaris.org/os/licensing.
     10  1.1  haad  * See the License for the specific language governing permissions
     11  1.1  haad  * and limitations under the License.
     12  1.1  haad  *
     13  1.1  haad  * When distributing Covered Code, include this CDDL HEADER in each
     14  1.1  haad  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  1.1  haad  * If applicable, add the following below this CDDL HEADER, with the
     16  1.1  haad  * fields enclosed by brackets "[]" replaced with your own identifying
     17  1.1  haad  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  1.1  haad  *
     19  1.1  haad  * CDDL HEADER END
     20  1.1  haad  */
     21  1.1  haad /*
     22  1.3  haad  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  1.1  haad  * Use is subject to license terms.
     24  1.1  haad  */
     25  1.4   chs /*
     26  1.4   chs  * Copyright (c) 2012 by Delphix. All rights reserved.
     27  1.4   chs  */
     28  1.1  haad 
     29  1.1  haad #include <sys/refcount.h>
     30  1.1  haad #include <sys/rrwlock.h>
     31  1.1  haad 
     32  1.1  haad /*
     33  1.1  haad  * This file contains the implementation of a re-entrant read
     34  1.1  haad  * reader/writer lock (aka "rrwlock").
     35  1.1  haad  *
     36  1.1  haad  * This is a normal reader/writer lock with the additional feature
     37  1.1  haad  * of allowing threads who have already obtained a read lock to
     38  1.1  haad  * re-enter another read lock (re-entrant read) - even if there are
     39  1.1  haad  * waiting writers.
     40  1.1  haad  *
     41  1.1  haad  * Callers who have not obtained a read lock give waiting writers priority.
     42  1.1  haad  *
     43  1.1  haad  * The rrwlock_t lock does not allow re-entrant writers, nor does it
     44  1.1  haad  * allow a re-entrant mix of reads and writes (that is, it does not
     45  1.1  haad  * allow a caller who has already obtained a read lock to be able to
     46  1.1  haad  * then grab a write lock without first dropping all read locks, and
     47  1.1  haad  * vice versa).
     48  1.1  haad  *
     49  1.1  haad  * The rrwlock_t uses tsd (thread specific data) to keep a list of
     50  1.1  haad  * nodes (rrw_node_t), where each node keeps track of which specific
     51  1.1  haad  * lock (rrw_node_t::rn_rrl) the thread has grabbed.  Since re-entering
     52  1.1  haad  * should be rare, a thread that grabs multiple reads on the same rrwlock_t
     53  1.1  haad  * will store multiple rrw_node_ts of the same 'rrn_rrl'. Nodes on the
     54  1.1  haad  * tsd list can represent a different rrwlock_t.  This allows a thread
     55  1.1  haad  * to enter multiple and unique rrwlock_ts for read locks at the same time.
     56  1.1  haad  *
     57  1.1  haad  * Since using tsd exposes some overhead, the rrwlock_t only needs to
     58  1.1  haad  * keep tsd data when writers are waiting.  If no writers are waiting, then
     59  1.1  haad  * a reader just bumps the anonymous read count (rr_anon_rcount) - no tsd
     60  1.1  haad  * is needed.  Once a writer attempts to grab the lock, readers then
     61  1.1  haad  * keep tsd data and bump the linked readers count (rr_linked_rcount).
     62  1.1  haad  *
     63  1.1  haad  * If there are waiting writers and there are anonymous readers, then a
     64  1.1  haad  * reader doesn't know if it is a re-entrant lock. But since it may be one,
     65  1.1  haad  * we allow the read to proceed (otherwise it could deadlock).  Since once
     66  1.1  haad  * waiting writers are active, readers no longer bump the anonymous count,
     67  1.1  haad  * the anonymous readers will eventually flush themselves out.  At this point,
     68  1.1  haad  * readers will be able to tell if they are a re-entrant lock (have a
     69  1.1  haad  * rrw_node_t entry for the lock) or not. If they are a re-entrant lock, then
 * we must let them proceed.  If they are not, then the reader blocks for the
     71  1.1  haad  * waiting writers.  Hence, we do not starve writers.
     72  1.1  haad  */
     73  1.1  haad 
     74  1.1  haad /* global key for TSD */
     75  1.1  haad uint_t rrw_tsd_key;
     76  1.1  haad 
/*
 * Per-hold tracking node kept on a thread's TSD list; one node per
 * tracked read hold (a thread re-entering the same lock stores
 * multiple nodes).
 */
typedef struct rrw_node {
	struct rrw_node *rn_next;	/* next node on this thread's TSD list */
	rrwlock_t *rn_rrl;		/* lock this read hold is on */
	void *rn_tag;			/* tag passed to rrw_enter_read() */
} rrw_node_t;
     82  1.1  haad 
     83  1.1  haad static rrw_node_t *
     84  1.1  haad rrn_find(rrwlock_t *rrl)
     85  1.1  haad {
     86  1.1  haad 	rrw_node_t *rn;
     87  1.1  haad 
     88  1.1  haad 	if (refcount_count(&rrl->rr_linked_rcount) == 0)
     89  1.1  haad 		return (NULL);
     90  1.1  haad 
     91  1.1  haad 	for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
     92  1.1  haad 		if (rn->rn_rrl == rrl)
     93  1.1  haad 			return (rn);
     94  1.1  haad 	}
     95  1.1  haad 	return (NULL);
     96  1.1  haad }
     97  1.1  haad 
     98  1.1  haad /*
     99  1.1  haad  * Add a node to the head of the singly linked list.
    100  1.1  haad  */
    101  1.1  haad static void
    102  1.4   chs rrn_add(rrwlock_t *rrl, void *tag)
    103  1.1  haad {
    104  1.1  haad 	rrw_node_t *rn;
    105  1.1  haad 
    106  1.1  haad 	rn = kmem_alloc(sizeof (*rn), KM_SLEEP);
    107  1.1  haad 	rn->rn_rrl = rrl;
    108  1.1  haad 	rn->rn_next = tsd_get(rrw_tsd_key);
    109  1.4   chs 	rn->rn_tag = tag;
    110  1.1  haad 	VERIFY(tsd_set(rrw_tsd_key, rn) == 0);
    111  1.1  haad }
    112  1.1  haad 
    113  1.1  haad /*
    114  1.1  haad  * If a node is found for 'rrl', then remove the node from this
    115  1.1  haad  * thread's list and return TRUE; otherwise return FALSE.
    116  1.1  haad  */
    117  1.1  haad static boolean_t
    118  1.4   chs rrn_find_and_remove(rrwlock_t *rrl, void *tag)
    119  1.1  haad {
    120  1.1  haad 	rrw_node_t *rn;
    121  1.1  haad 	rrw_node_t *prev = NULL;
    122  1.1  haad 
    123  1.1  haad 	if (refcount_count(&rrl->rr_linked_rcount) == 0)
    124  1.2  haad 		return (B_FALSE);
    125  1.1  haad 
    126  1.1  haad 	for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
    127  1.4   chs 		if (rn->rn_rrl == rrl && rn->rn_tag == tag) {
    128  1.1  haad 			if (prev)
    129  1.1  haad 				prev->rn_next = rn->rn_next;
    130  1.1  haad 			else
    131  1.1  haad 				VERIFY(tsd_set(rrw_tsd_key, rn->rn_next) == 0);
    132  1.1  haad 			kmem_free(rn, sizeof (*rn));
    133  1.1  haad 			return (B_TRUE);
    134  1.1  haad 		}
    135  1.1  haad 		prev = rn;
    136  1.1  haad 	}
    137  1.1  haad 	return (B_FALSE);
    138  1.1  haad }
    139  1.1  haad 
    140  1.1  haad void
    141  1.4   chs rrw_init(rrwlock_t *rrl, boolean_t track_all)
    142  1.1  haad {
    143  1.1  haad 	mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL);
    144  1.1  haad 	cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL);
    145  1.1  haad 	rrl->rr_writer = NULL;
    146  1.1  haad 	refcount_create(&rrl->rr_anon_rcount);
    147  1.1  haad 	refcount_create(&rrl->rr_linked_rcount);
    148  1.1  haad 	rrl->rr_writer_wanted = B_FALSE;
    149  1.4   chs 	rrl->rr_track_all = track_all;
    150  1.1  haad }
    151  1.1  haad 
    152  1.1  haad void
    153  1.1  haad rrw_destroy(rrwlock_t *rrl)
    154  1.1  haad {
    155  1.1  haad 	mutex_destroy(&rrl->rr_lock);
    156  1.1  haad 	cv_destroy(&rrl->rr_cv);
    157  1.1  haad 	ASSERT(rrl->rr_writer == NULL);
    158  1.1  haad 	refcount_destroy(&rrl->rr_anon_rcount);
    159  1.1  haad 	refcount_destroy(&rrl->rr_linked_rcount);
    160  1.1  haad }
    161  1.1  haad 
/*
 * Acquire 'rrl' for read.  If 'prio' is set, ignore waiting writers and
 * proceed anyway (see rrw_enter_read_prio()).  'tag' identifies the
 * hold for the refcount and TSD bookkeeping and must be passed again to
 * rrw_exit().
 */
static void
rrw_enter_read_impl(rrwlock_t *rrl, boolean_t prio, void *tag)
{
	mutex_enter(&rrl->rr_lock);
#if !defined(DEBUG) && defined(_KERNEL)
	/*
	 * Fast path (non-DEBUG kernel builds only): with no writer
	 * active or waiting and no per-thread tracking required, bump
	 * the anonymous reader count directly and skip the refcount/TSD
	 * machinery below.
	 */
	if (rrl->rr_writer == NULL && !rrl->rr_writer_wanted &&
	    !rrl->rr_track_all) {
		rrl->rr_anon_rcount.rc_count++;
		mutex_exit(&rrl->rr_lock);
		return;
	}
	DTRACE_PROBE(zfs__rrwfastpath__rdmiss);
#endif
	ASSERT(rrl->rr_writer != curthread);
	ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0);

	/*
	 * Block while a writer holds the lock, or while a writer is
	 * waiting and we may not jump ahead of it.  A reader may jump
	 * ahead if anonymous readers remain (it might be a re-entrant
	 * hold - see the file comment), if the caller asked for
	 * priority, or if this thread verifiably already holds a read
	 * lock (it has a TSD node for 'rrl').
	 */
	while (rrl->rr_writer != NULL || (rrl->rr_writer_wanted &&
	    refcount_is_zero(&rrl->rr_anon_rcount) && !prio &&
	    rrn_find(rrl) == NULL))
		cv_wait(&rrl->rr_cv, &rrl->rr_lock);

	if (rrl->rr_writer_wanted || rrl->rr_track_all) {
		/* may or may not be a re-entrant enter */
		rrn_add(rrl, tag);
		(void) refcount_add(&rrl->rr_linked_rcount, tag);
	} else {
		(void) refcount_add(&rrl->rr_anon_rcount, tag);
	}
	ASSERT(rrl->rr_writer == NULL);
	mutex_exit(&rrl->rr_lock);
}
    193  1.1  haad 
    194  1.4   chs void
    195  1.4   chs rrw_enter_read(rrwlock_t *rrl, void *tag)
    196  1.4   chs {
    197  1.4   chs 	rrw_enter_read_impl(rrl, B_FALSE, tag);
    198  1.4   chs }
    199  1.4   chs 
    200  1.4   chs /*
    201  1.4   chs  * take a read lock even if there are pending write lock requests. if we want
    202  1.4   chs  * to take a lock reentrantly, but from different threads (that have a
    203  1.4   chs  * relationship to each other), the normal detection mechanism to overrule
    204  1.4   chs  * the pending writer does not work, so we have to give an explicit hint here.
    205  1.4   chs  */
    206  1.4   chs void
    207  1.4   chs rrw_enter_read_prio(rrwlock_t *rrl, void *tag)
    208  1.4   chs {
    209  1.4   chs 	rrw_enter_read_impl(rrl, B_TRUE, tag);
    210  1.4   chs }
    211  1.4   chs 
    212  1.4   chs 
    213  1.4   chs void
    214  1.1  haad rrw_enter_write(rrwlock_t *rrl)
    215  1.1  haad {
    216  1.1  haad 	mutex_enter(&rrl->rr_lock);
    217  1.1  haad 	ASSERT(rrl->rr_writer != curthread);
    218  1.1  haad 
    219  1.1  haad 	while (refcount_count(&rrl->rr_anon_rcount) > 0 ||
    220  1.1  haad 	    refcount_count(&rrl->rr_linked_rcount) > 0 ||
    221  1.1  haad 	    rrl->rr_writer != NULL) {
    222  1.1  haad 		rrl->rr_writer_wanted = B_TRUE;
    223  1.1  haad 		cv_wait(&rrl->rr_cv, &rrl->rr_lock);
    224  1.1  haad 	}
    225  1.1  haad 	rrl->rr_writer_wanted = B_FALSE;
    226  1.1  haad 	rrl->rr_writer = curthread;
    227  1.1  haad 	mutex_exit(&rrl->rr_lock);
    228  1.1  haad }
    229  1.1  haad 
    230  1.1  haad void
    231  1.1  haad rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag)
    232  1.1  haad {
    233  1.1  haad 	if (rw == RW_READER)
    234  1.1  haad 		rrw_enter_read(rrl, tag);
    235  1.1  haad 	else
    236  1.1  haad 		rrw_enter_write(rrl);
    237  1.1  haad }
    238  1.1  haad 
/*
 * Release one read or write hold on 'rrl'.  'tag' must match the tag
 * used when the hold was acquired.
 */
void
rrw_exit(rrwlock_t *rrl, void *tag)
{
	mutex_enter(&rrl->rr_lock);
#if !defined(DEBUG) && defined(_KERNEL)
	/*
	 * Fast path (non-DEBUG kernel builds only): no writer and no
	 * linked readers means this must be an anonymous read hold;
	 * drop the count directly and wake waiters when it reaches
	 * zero.
	 */
	if (!rrl->rr_writer && rrl->rr_linked_rcount.rc_count == 0) {
		rrl->rr_anon_rcount.rc_count--;
		if (rrl->rr_anon_rcount.rc_count == 0)
			cv_broadcast(&rrl->rr_cv);
		mutex_exit(&rrl->rr_lock);
		return;
	}
	DTRACE_PROBE(zfs__rrwfastpath__exitmiss);
#endif
	ASSERT(!refcount_is_zero(&rrl->rr_anon_rcount) ||
	    !refcount_is_zero(&rrl->rr_linked_rcount) ||
	    rrl->rr_writer != NULL);

	if (rrl->rr_writer == NULL) {
		int64_t count;
		/*
		 * A TSD node for this lock/tag means this was a linked
		 * (tracked) read hold; otherwise it was anonymous,
		 * which is only legal when per-thread tracking is off.
		 */
		if (rrn_find_and_remove(rrl, tag)) {
			count = refcount_remove(&rrl->rr_linked_rcount, tag);
		} else {
			ASSERT(!rrl->rr_track_all);
			count = refcount_remove(&rrl->rr_anon_rcount, tag);
		}
		/* Last reader out wakes any waiting writer. */
		if (count == 0)
			cv_broadcast(&rrl->rr_cv);
	} else {
		/* Write hold: only the owning thread may release it. */
		ASSERT(rrl->rr_writer == curthread);
		ASSERT(refcount_is_zero(&rrl->rr_anon_rcount) &&
		    refcount_is_zero(&rrl->rr_linked_rcount));
		rrl->rr_writer = NULL;
		cv_broadcast(&rrl->rr_cv);
	}
	mutex_exit(&rrl->rr_lock);
}
    276  1.1  haad 
    277  1.4   chs /*
    278  1.4   chs  * If the lock was created with track_all, rrw_held(RW_READER) will return
    279  1.4   chs  * B_TRUE iff the current thread has the lock for reader.  Otherwise it may
    280  1.4   chs  * return B_TRUE if any thread has the lock for reader.
    281  1.4   chs  */
    282  1.1  haad boolean_t
    283  1.1  haad rrw_held(rrwlock_t *rrl, krw_t rw)
    284  1.1  haad {
    285  1.1  haad 	boolean_t held;
    286  1.1  haad 
    287  1.1  haad 	mutex_enter(&rrl->rr_lock);
    288  1.1  haad 	if (rw == RW_WRITER) {
    289  1.1  haad 		held = (rrl->rr_writer == curthread);
    290  1.1  haad 	} else {
    291  1.1  haad 		held = (!refcount_is_zero(&rrl->rr_anon_rcount) ||
    292  1.4   chs 		    rrn_find(rrl) != NULL);
    293  1.1  haad 	}
    294  1.1  haad 	mutex_exit(&rrl->rr_lock);
    295  1.1  haad 
    296  1.1  haad 	return (held);
    297  1.1  haad }
    298  1.4   chs 
    299  1.4   chs void
    300  1.4   chs rrw_tsd_destroy(void *arg)
    301  1.4   chs {
    302  1.4   chs 	rrw_node_t *rn = arg;
    303  1.4   chs 	if (rn != NULL) {
    304  1.4   chs 		panic("thread %p terminating with rrw lock %p held",
    305  1.4   chs 		    (void *)curthread, (void *)rn->rn_rrl);
    306  1.4   chs 	}
    307  1.4   chs }
    308  1.4   chs 
/*
 * A reader-mostly lock implementation, tuned above reader-writer locks
 * for highly parallel read acquisitions, while pessimizing writes.
 *
 * The idea is to split a single busy lock into an array of locks, so that
 * each reader can lock only one of them for read, depending on the result
 * of a simple hash function.  That proportionally reduces lock congestion.
 * The writer, at the same time, has to sequentially acquire write on all
 * the locks.  That makes write acquisition proportionally slower, but in
 * places where it is used (filesystem unmount) performance is not critical.
 *
 * All the functions below are direct wrappers around functions above.
 */
    322  1.4   chs void
    323  1.4   chs rrm_init(rrmlock_t *rrl, boolean_t track_all)
    324  1.4   chs {
    325  1.4   chs 	int i;
    326  1.4   chs 
    327  1.4   chs 	for (i = 0; i < RRM_NUM_LOCKS; i++)
    328  1.4   chs 		rrw_init(&rrl->locks[i], track_all);
    329  1.4   chs }
    330  1.4   chs 
    331  1.4   chs void
    332  1.4   chs rrm_destroy(rrmlock_t *rrl)
    333  1.4   chs {
    334  1.4   chs 	int i;
    335  1.4   chs 
    336  1.4   chs 	for (i = 0; i < RRM_NUM_LOCKS; i++)
    337  1.4   chs 		rrw_destroy(&rrl->locks[i]);
    338  1.4   chs }
    339  1.4   chs 
    340  1.4   chs void
    341  1.4   chs rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag)
    342  1.4   chs {
    343  1.4   chs 	if (rw == RW_READER)
    344  1.4   chs 		rrm_enter_read(rrl, tag);
    345  1.4   chs 	else
    346  1.4   chs 		rrm_enter_write(rrl);
    347  1.4   chs }
    348  1.4   chs 
/*
 * This maps the current thread to a specific lock.  Note that the lock
 * must be released by the same thread that acquired it.  We do this
 * mapping by taking the thread pointer mod a prime number.  We examine
 * only the low 32 bits of the thread pointer, because 32-bit division
 * is faster than 64-bit division, and the high 32 bits have little
 * entropy anyway.
 * (RRM_NUM_LOCKS is presumably declared prime in rrwlock.h to spread
 * thread pointers evenly - verify there if changing it.)
 */
#define	RRM_TD_LOCK()	(((uint32_t)(uintptr_t)(curthread)) % RRM_NUM_LOCKS)
    358  1.4   chs 
    359  1.4   chs void
    360  1.4   chs rrm_enter_read(rrmlock_t *rrl, void *tag)
    361  1.4   chs {
    362  1.4   chs 	rrw_enter_read(&rrl->locks[RRM_TD_LOCK()], tag);
    363  1.4   chs }
    364  1.4   chs 
    365  1.4   chs void
    366  1.4   chs rrm_enter_write(rrmlock_t *rrl)
    367  1.4   chs {
    368  1.4   chs 	int i;
    369  1.4   chs 
    370  1.4   chs 	for (i = 0; i < RRM_NUM_LOCKS; i++)
    371  1.4   chs 		rrw_enter_write(&rrl->locks[i]);
    372  1.4   chs }
    373  1.4   chs 
    374  1.4   chs void
    375  1.4   chs rrm_exit(rrmlock_t *rrl, void *tag)
    376  1.4   chs {
    377  1.4   chs 	int i;
    378  1.4   chs 
    379  1.4   chs 	if (rrl->locks[0].rr_writer == curthread) {
    380  1.4   chs 		for (i = 0; i < RRM_NUM_LOCKS; i++)
    381  1.4   chs 			rrw_exit(&rrl->locks[i], tag);
    382  1.4   chs 	} else {
    383  1.4   chs 		rrw_exit(&rrl->locks[RRM_TD_LOCK()], tag);
    384  1.4   chs 	}
    385  1.4   chs }
    386  1.4   chs 
    387  1.4   chs boolean_t
    388  1.4   chs rrm_held(rrmlock_t *rrl, krw_t rw)
    389  1.4   chs {
    390  1.4   chs 	if (rw == RW_WRITER) {
    391  1.4   chs 		return (rrw_held(&rrl->locks[0], rw));
    392  1.4   chs 	} else {
    393  1.4   chs 		return (rrw_held(&rrl->locks[RRM_TD_LOCK()], rw));
    394  1.4   chs 	}
    395  1.4   chs }
    396