/* rrwlock.c, revision 1.1.1.2 */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/refcount.h>
#include <sys/rrwlock.h>

/*
 * This file contains the implementation of a re-entrant read
 * reader/writer lock (aka "rrwlock").
 *
 * This is a normal reader/writer lock with the additional feature
 * of allowing threads that have already obtained a read lock to
 * re-enter another read lock (re-entrant read) - even if there are
 * waiting writers.
 *
 * Callers who have not obtained a read lock give waiting writers priority.
 *
 * The rrwlock_t lock does not allow re-entrant writers, nor does it
 * allow a re-entrant mix of reads and writes (that is, it does not
 * allow a caller who has already obtained a read lock to then grab
 * a write lock without first dropping all read locks, and vice versa).
 *
 * The rrwlock_t uses tsd (thread-specific data) to keep a list of
 * nodes (rrw_node_t), where each node keeps track of which specific
 * lock (rrw_node_t::rn_rrl) the thread has grabbed.  Since re-entering
 * should be rare, a thread that grabs multiple reads on the same rrwlock_t
 * will store multiple rrw_node_ts with the same 'rn_rrl'.  Nodes on the
 * tsd list can represent different rrwlock_ts, which allows a thread
 * to hold read locks on multiple distinct rrwlock_ts at the same time.
 *
 * Since using tsd exposes some overhead, the rrwlock_t only needs to
 * keep tsd data when writers are waiting.  If no writers are waiting,
 * a reader just bumps the anonymous read count (rr_anon_rcount) - no tsd
 * is needed.  Once a writer attempts to grab the lock, readers start
 * keeping tsd data and bump the linked readers count (rr_linked_rcount).
 *
 * If there are waiting writers and there are anonymous readers, then a
 * reader doesn't know whether its read is re-entrant.  But since it may
 * be, we allow the read to proceed (otherwise it could deadlock).  Since
 * readers no longer bump the anonymous count once writers are waiting,
 * the anonymous readers will eventually flush themselves out.  At that
 * point, a reader can tell whether its read is re-entrant (it has a
 * rrw_node_t entry for the lock) or not.  If it is re-entrant, we must
 * let it proceed.  If it is not, the reader blocks until the waiting
 * writers are done.  Hence, we do not starve writers.
 */
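
/*
 * A minimal usage sketch (illustrative only, not part of the original
 * file; FTAG is the usual ZFS convention for tagging refcounts with the
 * current function, and 'rrl' is a hypothetical lock):
 *
 *	rrwlock_t rrl;
 *
 *	rrw_init(&rrl);
 *	rrw_enter(&rrl, RW_READER, FTAG);
 *	rrw_enter(&rrl, RW_READER, FTAG);	<- re-entrant read, legal
 *	rrw_exit(&rrl, FTAG);
 *	rrw_exit(&rrl, FTAG);
 *	rrw_destroy(&rrl);
 *
 * Each rrw_exit() should pass the same tag as its matching rrw_enter(),
 * since debug builds track read holds by tag via refcount_t.
 */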

/* global key for TSD */
uint_t rrw_tsd_key;

typedef struct rrw_node {
	struct rrw_node	*rn_next;
	rrwlock_t	*rn_rrl;
} rrw_node_t;

static rrw_node_t *
rrn_find(rrwlock_t *rrl)
{
	rrw_node_t *rn;

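	/*
	 * If this lock has no tsd-tracked (linked) readers at all, the
	 * calling thread cannot hold it re-entrantly, so skip walking
	 * the per-thread list.
	 */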
	if (refcount_count(&rrl->rr_linked_rcount) == 0)
		return (NULL);

	for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
		if (rn->rn_rrl == rrl)
			return (rn);
	}
	return (NULL);
}

/*
 * Add a node to the head of the singly linked list.
 */
static void
rrn_add(rrwlock_t *rrl)
{
	rrw_node_t *rn;

	rn = kmem_alloc(sizeof (*rn), KM_SLEEP);
	rn->rn_rrl = rrl;
	rn->rn_next = tsd_get(rrw_tsd_key);
	VERIFY(tsd_set(rrw_tsd_key, rn) == 0);
}

/*
 * If a node is found for 'rrl', then remove the node from this
 * thread's list and return TRUE; otherwise return FALSE.
 */
static boolean_t
rrn_find_and_remove(rrwlock_t *rrl)
{
	rrw_node_t *rn;
	rrw_node_t *prev = NULL;

	if (refcount_count(&rrl->rr_linked_rcount) == 0)
		return (B_FALSE);

	for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
		if (rn->rn_rrl == rrl) {
			if (prev)
				prev->rn_next = rn->rn_next;
			else
				VERIFY(tsd_set(rrw_tsd_key, rn->rn_next) == 0);
			kmem_free(rn, sizeof (*rn));
			return (B_TRUE);
		}
		prev = rn;
	}
	return (B_FALSE);
}

void
rrw_init(rrwlock_t *rrl)
{
	mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL);
	rrl->rr_writer = NULL;
	refcount_create(&rrl->rr_anon_rcount);
	refcount_create(&rrl->rr_linked_rcount);
	rrl->rr_writer_wanted = B_FALSE;
}

void
rrw_destroy(rrwlock_t *rrl)
{
	mutex_destroy(&rrl->rr_lock);
	cv_destroy(&rrl->rr_cv);
	ASSERT(rrl->rr_writer == NULL);
	refcount_destroy(&rrl->rr_anon_rcount);
	refcount_destroy(&rrl->rr_linked_rcount);
}

static void
rrw_enter_read(rrwlock_t *rrl, void *tag)
{
	mutex_enter(&rrl->rr_lock);
#if !defined(DEBUG) && defined(_KERNEL)
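	/*
	 * Fast path for non-debug kernel builds: if no writer holds or
	 * wants the lock, bump the anonymous read count directly (safe
	 * under rr_lock) and skip the refcount_t tag bookkeeping below.
	 */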
	if (!rrl->rr_writer && !rrl->rr_writer_wanted) {
		rrl->rr_anon_rcount.rc_count++;
		mutex_exit(&rrl->rr_lock);
		return;
	}
	DTRACE_PROBE(zfs__rrwfastpath__rdmiss);
#endif
	ASSERT(rrl->rr_writer != curthread);
	ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0);

	while (rrl->rr_writer || (rrl->rr_writer_wanted &&
	    refcount_is_zero(&rrl->rr_anon_rcount) &&
	    rrn_find(rrl) == NULL))
		cv_wait(&rrl->rr_cv, &rrl->rr_lock);

	if (rrl->rr_writer_wanted) {
		/* may or may not be a re-entrant enter */
		rrn_add(rrl);
		(void) refcount_add(&rrl->rr_linked_rcount, tag);
	} else {
		(void) refcount_add(&rrl->rr_anon_rcount, tag);
	}
	ASSERT(rrl->rr_writer == NULL);
	mutex_exit(&rrl->rr_lock);
}

static void
rrw_enter_write(rrwlock_t *rrl)
{
	mutex_enter(&rrl->rr_lock);
	ASSERT(rrl->rr_writer != curthread);

	while (refcount_count(&rrl->rr_anon_rcount) > 0 ||
	    refcount_count(&rrl->rr_linked_rcount) > 0 ||
	    rrl->rr_writer != NULL) {
		rrl->rr_writer_wanted = B_TRUE;
		cv_wait(&rrl->rr_cv, &rrl->rr_lock);
	}
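	/*
	 * We now own the lock as writer; clear rr_writer_wanted so that
	 * future readers can use the anonymous fast path again once the
	 * write lock is dropped.
	 */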
	rrl->rr_writer_wanted = B_FALSE;
	rrl->rr_writer = curthread;
	mutex_exit(&rrl->rr_lock);
}

void
rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag)
{
	if (rw == RW_READER)
		rrw_enter_read(rrl, tag);
	else
		rrw_enter_write(rrl);
}

void
rrw_exit(rrwlock_t *rrl, void *tag)
{
	mutex_enter(&rrl->rr_lock);
#if !defined(DEBUG) && defined(_KERNEL)
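	/*
	 * Fast path for non-debug kernel builds: a plain anonymous read
	 * exit (no writer, no linked readers) can drop the count
	 * directly, mirroring the fast path in rrw_enter_read().
	 */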
	if (!rrl->rr_writer && rrl->rr_linked_rcount.rc_count == 0) {
		rrl->rr_anon_rcount.rc_count--;
		if (rrl->rr_anon_rcount.rc_count == 0)
			cv_broadcast(&rrl->rr_cv);
		mutex_exit(&rrl->rr_lock);
		return;
	}
	DTRACE_PROBE(zfs__rrwfastpath__exitmiss);
#endif
	ASSERT(!refcount_is_zero(&rrl->rr_anon_rcount) ||
	    !refcount_is_zero(&rrl->rr_linked_rcount) ||
	    rrl->rr_writer != NULL);

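	/*
	 * A reader tracked in tsd exits as a linked reader; otherwise it
	 * must have been anonymous.  Whichever count it came from, the
	 * last reader out wakes any waiting writers.
	 */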
	if (rrl->rr_writer == NULL) {
		int64_t count;
		if (rrn_find_and_remove(rrl))
			count = refcount_remove(&rrl->rr_linked_rcount, tag);
		else
			count = refcount_remove(&rrl->rr_anon_rcount, tag);
		if (count == 0)
			cv_broadcast(&rrl->rr_cv);
	} else {
		ASSERT(rrl->rr_writer == curthread);
		ASSERT(refcount_is_zero(&rrl->rr_anon_rcount) &&
		    refcount_is_zero(&rrl->rr_linked_rcount));
		rrl->rr_writer = NULL;
		cv_broadcast(&rrl->rr_cv);
	}
	mutex_exit(&rrl->rr_lock);
}

boolean_t
rrw_held(rrwlock_t *rrl, krw_t rw)
{
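	/*
	 * Note: for RW_READER this reports whether *any* thread holds a
	 * read lock, not specifically the caller, since anonymous read
	 * holds cannot be attributed to a particular thread.
	 */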
	boolean_t held;

	mutex_enter(&rrl->rr_lock);
	if (rw == RW_WRITER) {
		held = (rrl->rr_writer == curthread);
	} else {
		held = (!refcount_is_zero(&rrl->rr_anon_rcount) ||
		    !refcount_is_zero(&rrl->rr_linked_rcount));
	}
	mutex_exit(&rrl->rr_lock);

	return (held);
}
    265