rrwlock.c revision 1.1
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/refcount.h>
#include <sys/rrwlock.h>

/*
 * This file contains the implementation of a re-entrant read
 * reader/writer lock (aka "rrwlock").
 *
 * This is a normal reader/writer lock with the additional feature
 * of allowing threads that have already obtained a read lock to
 * re-enter another read lock (re-entrant read) - even if there are
 * waiting writers.
 *
 * Callers who have not obtained a read lock give waiting writers priority.
 *
 * The rrwlock_t lock does not allow re-entrant writers, nor does it
 * allow a re-entrant mix of reads and writes (that is, it does not
 * allow a caller who has already obtained a read lock to then grab
 * a write lock without first dropping all read locks, and vice versa).
 *
 * The rrwlock_t uses tsd (thread specific data) to keep a list of
 * nodes (rrw_node_t), where each node keeps track of which specific
 * lock (rrw_node_t::rn_rrl) the thread has grabbed.  Since re-entering
 * should be rare, a thread that grabs multiple reads on the same rrwlock_t
 * will store multiple rrw_node_ts with the same 'rn_rrl'.  Nodes on the
 * tsd list can represent different rrwlock_ts.  This allows a thread
 * to hold read locks on multiple distinct rrwlock_ts at the same time.
 *
 * Since using tsd incurs some overhead, the rrwlock_t only keeps tsd
 * data when writers are waiting.  If no writers are waiting, a reader
 * just bumps the anonymous read count (rr_anon_rcount) - no tsd data
 * is needed.  Once a writer attempts to grab the lock, readers start
 * keeping tsd data and bump the linked readers count (rr_linked_rcount).
 *
 * If there are waiting writers and there are anonymous readers, a reader
 * cannot tell whether its enter is re-entrant.  But since it may be, we
 * allow the read to proceed (otherwise it could deadlock).  Because readers
 * no longer bump the anonymous count once writers are waiting, the
 * anonymous readers will eventually drain out.  At that point, each reader
 * can tell whether its enter is re-entrant (it has an rrw_node_t entry for
 * the lock) or not.  If it is re-entrant, we must let it proceed; if not,
 * the reader blocks behind the waiting writers.  Hence, writers are never
 * starved.
 */
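
/*
 * Illustrative usage (a minimal sketch, not part of this file's build;
 * the 'lock' variable and the FTAG-style tag are assumptions made for
 * the example):
 *
 *	rrwlock_t lock;
 *	rrw_init(&lock);
 *
 *	rrw_enter(&lock, RW_READER, FTAG);	first read: anonymous if
 *						no writer is waiting
 *	rrw_enter(&lock, RW_READER, FTAG);	re-entrant read: permitted
 *						even if a writer now waits
 *	rrw_exit(&lock, FTAG);
 *	rrw_exit(&lock, FTAG);
 *
 *	rrw_enter(&lock, RW_WRITER, FTAG);	writers are exclusive; the
 *						caller must hold no reads
 *	rrw_exit(&lock, FTAG);
 *
 *	rrw_destroy(&lock);
 */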

/* global key for TSD (must be created with tsd_create() before use) */
uint_t rrw_tsd_key;

typedef struct rrw_node {
	struct rrw_node	*rn_next;
	rrwlock_t	*rn_rrl;
} rrw_node_t;

/*
 * Return this thread's tsd node for 'rrl', or NULL if the thread
 * holds no linked (re-entrant) read locks on 'rrl'.
 */
static rrw_node_t *
rrn_find(rrwlock_t *rrl)
{
	rrw_node_t *rn;

	if (refcount_count(&rrl->rr_linked_rcount) == 0)
		return (NULL);

	for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
		if (rn->rn_rrl == rrl)
			return (rn);
	}
	return (NULL);
}

/*
 * Add a node to the head of the singly linked list.
 */
static void
rrn_add(rrwlock_t *rrl)
{
	rrw_node_t *rn;

	rn = kmem_alloc(sizeof (*rn), KM_SLEEP);
	rn->rn_rrl = rrl;
	rn->rn_next = tsd_get(rrw_tsd_key);
	VERIFY(tsd_set(rrw_tsd_key, rn) == 0);
}
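
/*
 * For illustration (an assumed scenario, not code in this file): a thread
 * that, while writers were waiting, grabbed reads on lock A, then lock B,
 * then lock A again would have the following tsd list (newest first):
 *
 *	tsd_get(rrw_tsd_key) -> [A] -> [B] -> [A] -> NULL
 *
 * where each [X] is an rrw_node_t whose rn_rrl points at lock X.
 */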

/*
 * If a node is found for 'rrl', then remove the node from this
 * thread's list and return TRUE; otherwise return FALSE.
 */
static boolean_t
rrn_find_and_remove(rrwlock_t *rrl)
{
	rrw_node_t *rn;
	rrw_node_t *prev = NULL;

	if (refcount_count(&rrl->rr_linked_rcount) == 0)
		return (B_FALSE);

	for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
		if (rn->rn_rrl == rrl) {
			if (prev)
				prev->rn_next = rn->rn_next;
			else
				VERIFY(tsd_set(rrw_tsd_key, rn->rn_next) == 0);
			kmem_free(rn, sizeof (*rn));
			return (B_TRUE);
		}
		prev = rn;
	}
	return (B_FALSE);
}

void
rrw_init(rrwlock_t *rrl)
{
	mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL);
	rrl->rr_writer = NULL;
	refcount_create(&rrl->rr_anon_rcount);
	refcount_create(&rrl->rr_linked_rcount);
	rrl->rr_writer_wanted = B_FALSE;
}

void
rrw_destroy(rrwlock_t *rrl)
{
	mutex_destroy(&rrl->rr_lock);
	cv_destroy(&rrl->rr_cv);
	ASSERT(rrl->rr_writer == NULL);
	refcount_destroy(&rrl->rr_anon_rcount);
	refcount_destroy(&rrl->rr_linked_rcount);
}

static void
rrw_enter_read(rrwlock_t *rrl, void *tag)
{
	mutex_enter(&rrl->rr_lock);
	ASSERT(rrl->rr_writer != curthread);
	ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0);

	/*
	 * Block while a writer holds the lock, or while a writer is
	 * waiting and we cannot be a re-entrant reader (the anonymous
	 * readers have drained and we hold no tsd node for this lock).
	 */
	while (rrl->rr_writer || (rrl->rr_writer_wanted &&
	    refcount_is_zero(&rrl->rr_anon_rcount) &&
	    rrn_find(rrl) == NULL))
		cv_wait(&rrl->rr_cv, &rrl->rr_lock);

	if (rrl->rr_writer_wanted) {
		/* may or may not be a re-entrant enter */
		rrn_add(rrl);
		(void) refcount_add(&rrl->rr_linked_rcount, tag);
	} else {
		(void) refcount_add(&rrl->rr_anon_rcount, tag);
	}
	ASSERT(rrl->rr_writer == NULL);
	mutex_exit(&rrl->rr_lock);
}

static void
rrw_enter_write(rrwlock_t *rrl)
{
	mutex_enter(&rrl->rr_lock);
	ASSERT(rrl->rr_writer != curthread);

	while (refcount_count(&rrl->rr_anon_rcount) > 0 ||
	    refcount_count(&rrl->rr_linked_rcount) > 0 ||
	    rrl->rr_writer != NULL) {
		rrl->rr_writer_wanted = B_TRUE;
		cv_wait(&rrl->rr_cv, &rrl->rr_lock);
	}
	rrl->rr_writer_wanted = B_FALSE;
	rrl->rr_writer = curthread;
	mutex_exit(&rrl->rr_lock);
}

void
rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag)
{
	if (rw == RW_READER)
		rrw_enter_read(rrl, tag);
	else
		rrw_enter_write(rrl);
}

void
rrw_exit(rrwlock_t *rrl, void *tag)
{
	mutex_enter(&rrl->rr_lock);
	ASSERT(!refcount_is_zero(&rrl->rr_anon_rcount) ||
	    !refcount_is_zero(&rrl->rr_linked_rcount) ||
	    rrl->rr_writer != NULL);

	if (rrl->rr_writer == NULL) {
		if (rrn_find_and_remove(rrl)) {
			if (refcount_remove(&rrl->rr_linked_rcount, tag) == 0)
				cv_broadcast(&rrl->rr_cv);

		} else {
			if (refcount_remove(&rrl->rr_anon_rcount, tag) == 0)
				cv_broadcast(&rrl->rr_cv);
		}
	} else {
		ASSERT(rrl->rr_writer == curthread);
		ASSERT(refcount_is_zero(&rrl->rr_anon_rcount) &&
		    refcount_is_zero(&rrl->rr_linked_rcount));
		rrl->rr_writer = NULL;
		cv_broadcast(&rrl->rr_cv);
	}
	mutex_exit(&rrl->rr_lock);
}

/*
 * Note: for RW_READER this reports whether any thread holds a read
 * lock, not necessarily the caller.
 */
boolean_t
rrw_held(rrwlock_t *rrl, krw_t rw)
{
	boolean_t held;

	mutex_enter(&rrl->rr_lock);
	if (rw == RW_WRITER) {
		held = (rrl->rr_writer == curthread);
	} else {
		held = (!refcount_is_zero(&rrl->rr_anon_rcount) ||
		    !refcount_is_zero(&rrl->rr_linked_rcount));
	}
	mutex_exit(&rrl->rr_lock);

	return (held);
}