/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2012 by Delphix. All rights reserved.
 */

#include <sys/refcount.h>
#include <sys/rrwlock.h>

/*
 * This file contains the implementation of a re-entrant read
 * reader/writer lock (aka "rrwlock").
 *
 * This is a normal reader/writer lock with the additional feature
 * of allowing threads who have already obtained a read lock to
 * re-enter another read lock (re-entrant read) - even if there are
 * waiting writers.
 *
 * Callers who have not obtained a read lock give waiting writers priority.
 *
 * The rrwlock_t lock does not allow re-entrant writers, nor does it
 * allow a re-entrant mix of reads and writes (that is, it does not
 * allow a caller who has already obtained a read lock to be able to
 * then grab a write lock without first dropping all read locks, and
 * vice versa).
 *
 * The rrwlock_t uses tsd (thread specific data) to keep a list of
 * nodes (rrw_node_t), where each node keeps track of which specific
 * lock (rrw_node_t::rn_rrl) the thread has grabbed.  Since re-entering
 * should be rare, a thread that grabs multiple reads on the same rrwlock_t
 * will store multiple rrw_node_ts of the same 'rrn_rrl'. Nodes on the
 * tsd list can represent a different rrwlock_t.  This allows a thread
 * to enter multiple and unique rrwlock_ts for read locks at the same time.
 *
 * Since using tsd exposes some overhead, the rrwlock_t only needs to
 * keep tsd data when writers are waiting.  If no writers are waiting, then
 * a reader just bumps the anonymous read count (rr_anon_rcount) - no tsd
 * is needed.  Once a writer attempts to grab the lock, readers then
 * keep tsd data and bump the linked readers count (rr_linked_rcount).
 *
 * If there are waiting writers and there are anonymous readers, then a
 * reader doesn't know if it is a re-entrant lock. But since it may be one,
 * we allow the read to proceed (otherwise it could deadlock).  Since once
 * waiting writers are active, readers no longer bump the anonymous count,
 * the anonymous readers will eventually flush themselves out.  At this point,
 * readers will be able to tell if they are a re-entrant lock (have a
 * rrw_node_t entry for the lock) or not.
If they are a re-entrant lock, then 70 1.1 haad * we must let the proceed. If they are not, then the reader blocks for the 71 1.1 haad * waiting writers. Hence, we do not starve writers. 72 1.1 haad */ 73 1.1 haad 74 1.1 haad /* global key for TSD */ 75 1.1 haad uint_t rrw_tsd_key; 76 1.1 haad 77 1.1 haad typedef struct rrw_node { 78 1.4 chs struct rrw_node *rn_next; 79 1.4 chs rrwlock_t *rn_rrl; 80 1.4 chs void *rn_tag; 81 1.1 haad } rrw_node_t; 82 1.1 haad 83 1.1 haad static rrw_node_t * 84 1.1 haad rrn_find(rrwlock_t *rrl) 85 1.1 haad { 86 1.1 haad rrw_node_t *rn; 87 1.1 haad 88 1.1 haad if (refcount_count(&rrl->rr_linked_rcount) == 0) 89 1.1 haad return (NULL); 90 1.1 haad 91 1.1 haad for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) { 92 1.1 haad if (rn->rn_rrl == rrl) 93 1.1 haad return (rn); 94 1.1 haad } 95 1.1 haad return (NULL); 96 1.1 haad } 97 1.1 haad 98 1.1 haad /* 99 1.1 haad * Add a node to the head of the singly linked list. 100 1.1 haad */ 101 1.1 haad static void 102 1.4 chs rrn_add(rrwlock_t *rrl, void *tag) 103 1.1 haad { 104 1.1 haad rrw_node_t *rn; 105 1.1 haad 106 1.1 haad rn = kmem_alloc(sizeof (*rn), KM_SLEEP); 107 1.1 haad rn->rn_rrl = rrl; 108 1.1 haad rn->rn_next = tsd_get(rrw_tsd_key); 109 1.4 chs rn->rn_tag = tag; 110 1.1 haad VERIFY(tsd_set(rrw_tsd_key, rn) == 0); 111 1.1 haad } 112 1.1 haad 113 1.1 haad /* 114 1.1 haad * If a node is found for 'rrl', then remove the node from this 115 1.1 haad * thread's list and return TRUE; otherwise return FALSE. 
116 1.1 haad */ 117 1.1 haad static boolean_t 118 1.4 chs rrn_find_and_remove(rrwlock_t *rrl, void *tag) 119 1.1 haad { 120 1.1 haad rrw_node_t *rn; 121 1.1 haad rrw_node_t *prev = NULL; 122 1.1 haad 123 1.1 haad if (refcount_count(&rrl->rr_linked_rcount) == 0) 124 1.2 haad return (B_FALSE); 125 1.1 haad 126 1.1 haad for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) { 127 1.4 chs if (rn->rn_rrl == rrl && rn->rn_tag == tag) { 128 1.1 haad if (prev) 129 1.1 haad prev->rn_next = rn->rn_next; 130 1.1 haad else 131 1.1 haad VERIFY(tsd_set(rrw_tsd_key, rn->rn_next) == 0); 132 1.1 haad kmem_free(rn, sizeof (*rn)); 133 1.1 haad return (B_TRUE); 134 1.1 haad } 135 1.1 haad prev = rn; 136 1.1 haad } 137 1.1 haad return (B_FALSE); 138 1.1 haad } 139 1.1 haad 140 1.1 haad void 141 1.4 chs rrw_init(rrwlock_t *rrl, boolean_t track_all) 142 1.1 haad { 143 1.1 haad mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL); 144 1.1 haad cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL); 145 1.1 haad rrl->rr_writer = NULL; 146 1.1 haad refcount_create(&rrl->rr_anon_rcount); 147 1.1 haad refcount_create(&rrl->rr_linked_rcount); 148 1.1 haad rrl->rr_writer_wanted = B_FALSE; 149 1.4 chs rrl->rr_track_all = track_all; 150 1.1 haad } 151 1.1 haad 152 1.1 haad void 153 1.1 haad rrw_destroy(rrwlock_t *rrl) 154 1.1 haad { 155 1.1 haad mutex_destroy(&rrl->rr_lock); 156 1.1 haad cv_destroy(&rrl->rr_cv); 157 1.1 haad ASSERT(rrl->rr_writer == NULL); 158 1.1 haad refcount_destroy(&rrl->rr_anon_rcount); 159 1.1 haad refcount_destroy(&rrl->rr_linked_rcount); 160 1.1 haad } 161 1.1 haad 162 1.1 haad static void 163 1.4 chs rrw_enter_read_impl(rrwlock_t *rrl, boolean_t prio, void *tag) 164 1.1 haad { 165 1.1 haad mutex_enter(&rrl->rr_lock); 166 1.3 haad #if !defined(DEBUG) && defined(_KERNEL) 167 1.4 chs if (rrl->rr_writer == NULL && !rrl->rr_writer_wanted && 168 1.4 chs !rrl->rr_track_all) { 169 1.3 haad rrl->rr_anon_rcount.rc_count++; 170 1.3 haad mutex_exit(&rrl->rr_lock); 171 1.3 haad return; 
172 1.3 haad } 173 1.3 haad DTRACE_PROBE(zfs__rrwfastpath__rdmiss); 174 1.3 haad #endif 175 1.1 haad ASSERT(rrl->rr_writer != curthread); 176 1.1 haad ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0); 177 1.1 haad 178 1.4 chs while (rrl->rr_writer != NULL || (rrl->rr_writer_wanted && 179 1.4 chs refcount_is_zero(&rrl->rr_anon_rcount) && !prio && 180 1.1 haad rrn_find(rrl) == NULL)) 181 1.1 haad cv_wait(&rrl->rr_cv, &rrl->rr_lock); 182 1.1 haad 183 1.4 chs if (rrl->rr_writer_wanted || rrl->rr_track_all) { 184 1.1 haad /* may or may not be a re-entrant enter */ 185 1.4 chs rrn_add(rrl, tag); 186 1.1 haad (void) refcount_add(&rrl->rr_linked_rcount, tag); 187 1.1 haad } else { 188 1.1 haad (void) refcount_add(&rrl->rr_anon_rcount, tag); 189 1.1 haad } 190 1.1 haad ASSERT(rrl->rr_writer == NULL); 191 1.1 haad mutex_exit(&rrl->rr_lock); 192 1.1 haad } 193 1.1 haad 194 1.4 chs void 195 1.4 chs rrw_enter_read(rrwlock_t *rrl, void *tag) 196 1.4 chs { 197 1.4 chs rrw_enter_read_impl(rrl, B_FALSE, tag); 198 1.4 chs } 199 1.4 chs 200 1.4 chs /* 201 1.4 chs * take a read lock even if there are pending write lock requests. if we want 202 1.4 chs * to take a lock reentrantly, but from different threads (that have a 203 1.4 chs * relationship to each other), the normal detection mechanism to overrule 204 1.4 chs * the pending writer does not work, so we have to give an explicit hint here. 
205 1.4 chs */ 206 1.4 chs void 207 1.4 chs rrw_enter_read_prio(rrwlock_t *rrl, void *tag) 208 1.4 chs { 209 1.4 chs rrw_enter_read_impl(rrl, B_TRUE, tag); 210 1.4 chs } 211 1.4 chs 212 1.4 chs 213 1.4 chs void 214 1.1 haad rrw_enter_write(rrwlock_t *rrl) 215 1.1 haad { 216 1.1 haad mutex_enter(&rrl->rr_lock); 217 1.1 haad ASSERT(rrl->rr_writer != curthread); 218 1.1 haad 219 1.1 haad while (refcount_count(&rrl->rr_anon_rcount) > 0 || 220 1.1 haad refcount_count(&rrl->rr_linked_rcount) > 0 || 221 1.1 haad rrl->rr_writer != NULL) { 222 1.1 haad rrl->rr_writer_wanted = B_TRUE; 223 1.1 haad cv_wait(&rrl->rr_cv, &rrl->rr_lock); 224 1.1 haad } 225 1.1 haad rrl->rr_writer_wanted = B_FALSE; 226 1.1 haad rrl->rr_writer = curthread; 227 1.1 haad mutex_exit(&rrl->rr_lock); 228 1.1 haad } 229 1.1 haad 230 1.1 haad void 231 1.1 haad rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag) 232 1.1 haad { 233 1.1 haad if (rw == RW_READER) 234 1.1 haad rrw_enter_read(rrl, tag); 235 1.1 haad else 236 1.1 haad rrw_enter_write(rrl); 237 1.1 haad } 238 1.1 haad 239 1.1 haad void 240 1.1 haad rrw_exit(rrwlock_t *rrl, void *tag) 241 1.1 haad { 242 1.1 haad mutex_enter(&rrl->rr_lock); 243 1.3 haad #if !defined(DEBUG) && defined(_KERNEL) 244 1.3 haad if (!rrl->rr_writer && rrl->rr_linked_rcount.rc_count == 0) { 245 1.3 haad rrl->rr_anon_rcount.rc_count--; 246 1.3 haad if (rrl->rr_anon_rcount.rc_count == 0) 247 1.3 haad cv_broadcast(&rrl->rr_cv); 248 1.3 haad mutex_exit(&rrl->rr_lock); 249 1.3 haad return; 250 1.3 haad } 251 1.3 haad DTRACE_PROBE(zfs__rrwfastpath__exitmiss); 252 1.3 haad #endif 253 1.1 haad ASSERT(!refcount_is_zero(&rrl->rr_anon_rcount) || 254 1.1 haad !refcount_is_zero(&rrl->rr_linked_rcount) || 255 1.1 haad rrl->rr_writer != NULL); 256 1.1 haad 257 1.1 haad if (rrl->rr_writer == NULL) { 258 1.3 haad int64_t count; 259 1.4 chs if (rrn_find_and_remove(rrl, tag)) { 260 1.3 haad count = refcount_remove(&rrl->rr_linked_rcount, tag); 261 1.4 chs } else { 262 1.4 chs 
ASSERT(!rrl->rr_track_all); 263 1.3 haad count = refcount_remove(&rrl->rr_anon_rcount, tag); 264 1.4 chs } 265 1.3 haad if (count == 0) 266 1.3 haad cv_broadcast(&rrl->rr_cv); 267 1.1 haad } else { 268 1.1 haad ASSERT(rrl->rr_writer == curthread); 269 1.1 haad ASSERT(refcount_is_zero(&rrl->rr_anon_rcount) && 270 1.1 haad refcount_is_zero(&rrl->rr_linked_rcount)); 271 1.1 haad rrl->rr_writer = NULL; 272 1.1 haad cv_broadcast(&rrl->rr_cv); 273 1.1 haad } 274 1.1 haad mutex_exit(&rrl->rr_lock); 275 1.1 haad } 276 1.1 haad 277 1.4 chs /* 278 1.4 chs * If the lock was created with track_all, rrw_held(RW_READER) will return 279 1.4 chs * B_TRUE iff the current thread has the lock for reader. Otherwise it may 280 1.4 chs * return B_TRUE if any thread has the lock for reader. 281 1.4 chs */ 282 1.1 haad boolean_t 283 1.1 haad rrw_held(rrwlock_t *rrl, krw_t rw) 284 1.1 haad { 285 1.1 haad boolean_t held; 286 1.1 haad 287 1.1 haad mutex_enter(&rrl->rr_lock); 288 1.1 haad if (rw == RW_WRITER) { 289 1.1 haad held = (rrl->rr_writer == curthread); 290 1.1 haad } else { 291 1.1 haad held = (!refcount_is_zero(&rrl->rr_anon_rcount) || 292 1.4 chs rrn_find(rrl) != NULL); 293 1.1 haad } 294 1.1 haad mutex_exit(&rrl->rr_lock); 295 1.1 haad 296 1.1 haad return (held); 297 1.1 haad } 298 1.4 chs 299 1.4 chs void 300 1.4 chs rrw_tsd_destroy(void *arg) 301 1.4 chs { 302 1.4 chs rrw_node_t *rn = arg; 303 1.4 chs if (rn != NULL) { 304 1.4 chs panic("thread %p terminating with rrw lock %p held", 305 1.4 chs (void *)curthread, (void *)rn->rn_rrl); 306 1.4 chs } 307 1.4 chs } 308 1.4 chs 309 1.4 chs /* 310 1.4 chs * A reader-mostly lock implementation, tuning above reader-writer locks 311 1.4 chs * for hightly parallel read acquisitions, while pessimizing writes. 312 1.4 chs * 313 1.4 chs * The idea is to split single busy lock into array of locks, so that 314 1.4 chs * each reader can lock only one of them for read, depending on result 315 1.4 chs * of simple hash function. 
That proportionally reduces lock congestion. 316 1.4 chs * Writer same time has to sequentially aquire write on all the locks. 317 1.4 chs * That makes write aquisition proportionally slower, but in places where 318 1.4 chs * it is used (filesystem unmount) performance is not critical. 319 1.4 chs * 320 1.4 chs * All the functions below are direct wrappers around functions above. 321 1.4 chs */ 322 1.4 chs void 323 1.4 chs rrm_init(rrmlock_t *rrl, boolean_t track_all) 324 1.4 chs { 325 1.4 chs int i; 326 1.4 chs 327 1.4 chs for (i = 0; i < RRM_NUM_LOCKS; i++) 328 1.4 chs rrw_init(&rrl->locks[i], track_all); 329 1.4 chs } 330 1.4 chs 331 1.4 chs void 332 1.4 chs rrm_destroy(rrmlock_t *rrl) 333 1.4 chs { 334 1.4 chs int i; 335 1.4 chs 336 1.4 chs for (i = 0; i < RRM_NUM_LOCKS; i++) 337 1.4 chs rrw_destroy(&rrl->locks[i]); 338 1.4 chs } 339 1.4 chs 340 1.4 chs void 341 1.4 chs rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag) 342 1.4 chs { 343 1.4 chs if (rw == RW_READER) 344 1.4 chs rrm_enter_read(rrl, tag); 345 1.4 chs else 346 1.4 chs rrm_enter_write(rrl); 347 1.4 chs } 348 1.4 chs 349 1.4 chs /* 350 1.4 chs * This maps the current thread to a specific lock. Note that the lock 351 1.4 chs * must be released by the same thread that acquired it. We do this 352 1.4 chs * mapping by taking the thread pointer mod a prime number. We examine 353 1.4 chs * only the low 32 bits of the thread pointer, because 32-bit division 354 1.4 chs * is faster than 64-bit division, and the high 32 bits have little 355 1.4 chs * entropy anyway. 
356 1.4 chs */ 357 1.4 chs #define RRM_TD_LOCK() (((uint32_t)(uintptr_t)(curthread)) % RRM_NUM_LOCKS) 358 1.4 chs 359 1.4 chs void 360 1.4 chs rrm_enter_read(rrmlock_t *rrl, void *tag) 361 1.4 chs { 362 1.4 chs rrw_enter_read(&rrl->locks[RRM_TD_LOCK()], tag); 363 1.4 chs } 364 1.4 chs 365 1.4 chs void 366 1.4 chs rrm_enter_write(rrmlock_t *rrl) 367 1.4 chs { 368 1.4 chs int i; 369 1.4 chs 370 1.4 chs for (i = 0; i < RRM_NUM_LOCKS; i++) 371 1.4 chs rrw_enter_write(&rrl->locks[i]); 372 1.4 chs } 373 1.4 chs 374 1.4 chs void 375 1.4 chs rrm_exit(rrmlock_t *rrl, void *tag) 376 1.4 chs { 377 1.4 chs int i; 378 1.4 chs 379 1.4 chs if (rrl->locks[0].rr_writer == curthread) { 380 1.4 chs for (i = 0; i < RRM_NUM_LOCKS; i++) 381 1.4 chs rrw_exit(&rrl->locks[i], tag); 382 1.4 chs } else { 383 1.4 chs rrw_exit(&rrl->locks[RRM_TD_LOCK()], tag); 384 1.4 chs } 385 1.4 chs } 386 1.4 chs 387 1.4 chs boolean_t 388 1.4 chs rrm_held(rrmlock_t *rrl, krw_t rw) 389 1.4 chs { 390 1.4 chs if (rw == RW_WRITER) { 391 1.4 chs return (rrw_held(&rrl->locks[0], rw)); 392 1.4 chs } else { 393 1.4 chs return (rrw_held(&rrl->locks[RRM_TD_LOCK()], rw)); 394 1.4 chs } 395 1.4 chs } 396