Home | History | Annotate | Line # | Download | only in raidframe
rf_stripelocks.c revision 1.1
      1 /*	$NetBSD: rf_stripelocks.c,v 1.1 1998/11/13 04:20:34 oster Exp $	*/
      2 /*
      3  * Copyright (c) 1995 Carnegie-Mellon University.
      4  * All rights reserved.
      5  *
      6  * Authors: Mark Holland, Jim Zelenka
      7  *
      8  * Permission to use, copy, modify and distribute this software and
      9  * its documentation is hereby granted, provided that both the copyright
     10  * notice and this permission notice appear in all copies of the
     11  * software, derivative works or modified versions, and any portions
     12  * thereof, and that both notices appear in supporting documentation.
     13  *
     14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     17  *
     18  * Carnegie Mellon requests users of this software to return to
     19  *
     20  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     21  *  School of Computer Science
     22  *  Carnegie Mellon University
     23  *  Pittsburgh PA 15213-3890
     24  *
     25  * any improvements or extensions that they make and grant Carnegie the
     26  * rights to redistribute these changes.
     27  */
     28 
     29 /* :
     30  * Log: rf_stripelocks.c,v
     31  * Revision 1.35  1996/06/10 12:50:57  jimz
     32  * Add counters to freelists to track number of allocations, frees,
     33  * grows, max size, etc. Adjust a couple sets of PRIME params based
     34  * on the results.
     35  *
     36  * Revision 1.34  1996/06/10  11:55:47  jimz
     37  * Straightened out some per-array/not-per-array distinctions, fixed
     38  * a couple bugs related to confusion. Added shutdown lists. Removed
     39  * layout shutdown function (now subsumed by shutdown lists).
     40  *
     41  * Revision 1.33  1996/06/09  02:36:46  jimz
     42  * lots of little crufty cleanup- fixup whitespace
     43  * issues, comment #ifdefs, improve typing in some
     44  * places (esp size-related)
     45  *
     46  * Revision 1.32  1996/06/07  21:33:04  jimz
     47  * begin using consistent types for sector numbers,
     48  * stripe numbers, row+col numbers, recon unit numbers
     49  *
     50  * Revision 1.31  1996/06/05  18:06:02  jimz
     51  * Major code cleanup. The Great Renaming is now done.
     52  * Better modularity. Better typing. Fixed a bunch of
     53  * synchronization bugs. Made a lot of global stuff
     54  * per-desc or per-array. Removed dead code.
     55  *
     56  * Revision 1.30  1996/06/03  23:28:26  jimz
     57  * more bugfixes
     58  * check in tree to sync for IPDS runs with current bugfixes
     59  * there still may be a problem with threads in the script test
     60  * getting I/Os stuck- not trivially reproducible (runs ~50 times
     61  * in a row without getting stuck)
     62  *
     63  * Revision 1.29  1996/05/30  23:22:16  jimz
     64  * bugfixes of serialization, timing problems
     65  * more cleanup
     66  *
     67  * Revision 1.28  1996/05/30  11:29:41  jimz
     68  * Numerous bug fixes. Stripe lock release code disagreed with the taking code
     69  * about when stripes should be locked (I made it consistent: no parity, no lock)
     70  * There was a lot of extra serialization of I/Os which I've removed- a lot of
     71  * it was to calculate values for the cache code, which is no longer with us.
     72  * More types, function, macro cleanup. Added code to properly quiesce the array
     73  * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
     74  * before. Fixed memory allocation, freeing bugs.
     75  *
     76  * Revision 1.27  1996/05/27  18:56:37  jimz
     77  * more code cleanup
     78  * better typing
     79  * compiles in all 3 environments
     80  *
     81  * Revision 1.26  1996/05/24  22:17:04  jimz
     82  * continue code + namespace cleanup
     83  * typed a bunch of flags
     84  *
     85  * Revision 1.25  1996/05/23  00:33:23  jimz
     86  * code cleanup: move all debug decls to rf_options.c, all extern
     87  * debug decls to rf_options.h, all debug vars preceded by rf_
     88  *
     89  * Revision 1.24  1996/05/20  16:15:00  jimz
     90  * switch to rf_{mutex,cond}_{init,destroy}
     91  *
     92  * Revision 1.23  1996/05/18  19:51:34  jimz
     93  * major code cleanup- fix syntax, make some types consistent,
     94  * add prototypes, clean out dead code, et cetera
     95  *
     96  * Revision 1.22  1996/05/16  22:28:11  jimz
     97  * misc cleanup
     98  *
     99  * Revision 1.21  1996/05/15  23:39:52  jimz
    100  * remove #if 0 code
    101  *
    102  * Revision 1.20  1996/05/15  23:37:38  jimz
    103  * convert to using RF_FREELIST stuff for StripeLockDesc allocation
    104  *
    105  * Revision 1.19  1996/05/08  18:00:53  jimz
    106  * fix number of args to debug printf
    107  *
    108  * Revision 1.18  1996/05/06  22:33:07  jimz
    109  * added better debug info
    110  *
    111  * Revision 1.17  1996/05/06  22:09:01  wvcii
    112  * added copyright info and change log
    113  *
    114  */
    115 
    116 /*
    117  * stripelocks.c -- code to lock stripes for read and write access
    118  *
    119  * The code distinguishes between read locks and write locks. There can be
    120  * as many readers to given stripe as desired. When a write request comes
    121  * in, no further readers are allowed to enter, and all subsequent requests
    122  * are queued in FIFO order. When a the number of readers goes to zero, the
    123  * writer is given the lock. When a writer releases the lock, the list of
    124  * queued requests is scanned, and all readersq up to the next writer are
    125  * given the lock.
    126  *
    127  * The lock table size must be one less than a power of two, but HASH_STRIPEID
    128  * is the only function that requires this.
    129  *
    130  * The code now supports "range locks". When you ask to lock a stripe, you
    131  * specify a range of addresses in that stripe that you want to lock. When
    132  * you acquire the lock, you've locked only this range of addresses, and
    133  * other threads can concurrently read/write any non-overlapping portions
    134  * of the stripe. The "addresses" that you lock are abstract in that you
    135  * can pass in anything you like.  The expectation is that you'll pass in
    136  * the range of physical disk offsets of the parity bits you're planning
    137  * to update. The idea behind this, of course, is to allow sub-stripe
    138  * locking. The implementation is perhaps not the best imaginable; in the
    139  * worst case a lock release is O(n^2) in the total number of outstanding
    140  * requests to a given stripe.  Note that if you're striping with a
    141  * stripe unit size equal to an entire disk (i.e. not striping), there will
    142  * be only one stripe and you may spend some significant number of cycles
    143  * searching through stripe lock descriptors.
    144  */
    145 
    146 #ifdef _KERNEL
    147 #define KERNEL
    148 #endif
    149 
    150 #include "rf_types.h"
    151 #include "rf_raid.h"
    152 #include "rf_stripelocks.h"
    153 #include "rf_alloclist.h"
    154 #include "rf_threadid.h"
    155 #include "rf_general.h"
    156 #include "rf_freelist.h"
    157 #include "rf_debugprint.h"
    158 #include "rf_driver.h"
    159 #include "rf_shutdown.h"
    160 
    161 #define Dprintf1(s,a)         rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL)
    162 #define Dprintf2(s,a,b)       rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL)
    163 #define Dprintf3(s,a,b,c)     rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL)
    164 #define Dprintf4(s,a,b,c,d)   rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),NULL,NULL,NULL,NULL)
    165 #define Dprintf5(s,a,b,c,d,e) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),NULL,NULL,NULL)
    166 #define Dprintf6(s,a,b,c,d,e,f) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),NULL,NULL)
    167 #define Dprintf7(s,a,b,c,d,e,f,g) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),(void *)((unsigned long)g),NULL)
    168 #define Dprintf8(s,a,b,c,d,e,f,g,h) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),(void *)((unsigned long)g),(void *)((unsigned long)h))
    169 
    170 #ifndef KERNEL
    171 #define FLUSH fflush(stdout)
    172 #else /* !KERNEL */
    173 #define FLUSH
    174 #endif /* !KERNEL */
    175 
    176 #define HASH_STRIPEID(_sid_)  ( (_sid_) & (rf_lockTableSize-1) )
    177 #define MAX_FREELIST 100
    178 
    179 static void AddToWaitersQueue(RF_LockTableEntry_t *lockTable, RF_StripeLockDesc_t *lockDesc, RF_LockReqDesc_t *lockReqDesc);
    180 static RF_StripeLockDesc_t *AllocStripeLockDesc(RF_StripeNum_t stripeID);
    181 static void FreeStripeLockDesc(RF_StripeLockDesc_t *p);
    182 static void PrintLockedStripes(RF_LockTableEntry_t *lockTable);
    183 
    184 /* determines if two ranges overlap.  always yields false if either start value is negative  */
    185 #define SINGLE_RANGE_OVERLAP(_strt1, _stop1, _strt2, _stop2)                                     \
    186   ( (_strt1 >= 0) && (_strt2 >= 0) && (RF_MAX(_strt1, _strt2) <= RF_MIN(_stop1, _stop2)) )
    187 
    188 /* determines if any of the ranges specified in the two lock descriptors overlap each other */
    189 #define RANGE_OVERLAP(_cand, _pred)                                                  \
    190   ( SINGLE_RANGE_OVERLAP((_cand)->start,  (_cand)->stop,  (_pred)->start,  (_pred)->stop ) ||    \
    191     SINGLE_RANGE_OVERLAP((_cand)->start2, (_cand)->stop2, (_pred)->start,  (_pred)->stop ) ||    \
    192     SINGLE_RANGE_OVERLAP((_cand)->start,  (_cand)->stop,  (_pred)->start2, (_pred)->stop2) ||    \
    193     SINGLE_RANGE_OVERLAP((_cand)->start2, (_cand)->stop2, (_pred)->start2, (_pred)->stop2) )
    194 
    195 /* Determines if a candidate lock request conflicts with a predecessor lock req.
    196  * Note that the arguments are not interchangeable.
    197  * The rules are:
    198  *      a candidate read conflicts with a predecessor write if any ranges overlap
    199  *      a candidate write conflicts with a predecessor read if any ranges overlap
    200  *      a candidate write conflicts with a predecessor write if any ranges overlap
    201  */
    202 #define STRIPELOCK_CONFLICT(_cand, _pred)                                        \
    203   RANGE_OVERLAP((_cand), (_pred)) &&                                        \
    204   ( ( (((_cand)->type == RF_IO_TYPE_READ) && ((_pred)->type == RF_IO_TYPE_WRITE)) ||                      \
    205       (((_cand)->type == RF_IO_TYPE_WRITE) && ((_pred)->type == RF_IO_TYPE_READ)) ||                      \
    206       (((_cand)->type == RF_IO_TYPE_WRITE) && ((_pred)->type == RF_IO_TYPE_WRITE))                         \
    207     )                                                                            \
    208   )
    209 
    210 static RF_FreeList_t *rf_stripelock_freelist;
    211 #define RF_MAX_FREE_STRIPELOCK 128
    212 #define RF_STRIPELOCK_INC        8
    213 #define RF_STRIPELOCK_INITIAL   32
    214 
    215 static void rf_ShutdownStripeLockFreeList(void *);
    216 static void rf_RaidShutdownStripeLocks(void *);
    217 
    218 static void rf_ShutdownStripeLockFreeList(ignored)
    219   void  *ignored;
    220 {
    221 	RF_FREELIST_DESTROY(rf_stripelock_freelist,next,(RF_StripeLockDesc_t *));
    222 }
    223 
    224 int rf_ConfigureStripeLockFreeList(listp)
    225   RF_ShutdownList_t  **listp;
    226 {
    227 	unsigned mask;
    228 	int rc;
    229 
    230 	RF_FREELIST_CREATE(rf_stripelock_freelist, RF_MAX_FREE_STRIPELOCK,
    231 		RF_STRIPELOCK_INITIAL,sizeof(RF_StripeLockDesc_t));
    232 	rc = rf_ShutdownCreate(listp, rf_ShutdownStripeLockFreeList, NULL);
    233 	if (rc) {
    234 		RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n",
    235 			__FILE__, __LINE__, rc);
    236 		rf_ShutdownStripeLockFreeList(NULL);
    237 		return(rc);
    238 	}
    239 	RF_FREELIST_PRIME(rf_stripelock_freelist,RF_STRIPELOCK_INITIAL,next,
    240 		(RF_StripeLockDesc_t *));
    241 	for (mask=0x1; mask; mask<<=1)
    242 		if (rf_lockTableSize==mask)
    243 			break;
    244 	if (!mask) {
    245 		printf("[WARNING:  lock table size must be a power of two.  Setting to %d.]\n",RF_DEFAULT_LOCK_TABLE_SIZE);
    246 		rf_lockTableSize = RF_DEFAULT_LOCK_TABLE_SIZE;
    247 	}
    248 	return(0);
    249 }
    250 
    251 RF_LockTableEntry_t *rf_MakeLockTable()
    252 {
    253 	RF_LockTableEntry_t *lockTable;
    254 	int i, rc;
    255 
    256 	RF_Calloc(lockTable, ((int) rf_lockTableSize), sizeof(RF_LockTableEntry_t), (RF_LockTableEntry_t *));
    257 	if (lockTable == NULL)
    258 		return(NULL);
    259 	for (i=0; i<rf_lockTableSize; i++) {
    260 		rc = rf_mutex_init(&lockTable[i].mutex);
    261 		if (rc) {
    262 		    RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
    263 		      __LINE__, rc);
    264 			/* XXX clean up other mutexes */
    265 			return(NULL);
    266 		}
    267 	}
    268 	return(lockTable);
    269 }
    270 
    271 void rf_ShutdownStripeLocks(RF_LockTableEntry_t *lockTable)
    272 {
    273 	int i;
    274 
    275 	if (rf_stripeLockDebug) {
    276 		PrintLockedStripes(lockTable);
    277 	}
    278 	for (i=0; i<rf_lockTableSize; i++) {
    279 		rf_mutex_destroy(&lockTable[i].mutex);
    280 	}
    281 	RF_Free(lockTable, rf_lockTableSize*sizeof(RF_LockTableEntry_t));
    282 }
    283 
    284 static void rf_RaidShutdownStripeLocks(arg)
    285   void  *arg;
    286 {
    287 	RF_Raid_t *raidPtr = (RF_Raid_t *)arg;
    288 	rf_ShutdownStripeLocks(raidPtr->lockTable);
    289 }
    290 
    291 int rf_ConfigureStripeLocks(
    292   RF_ShutdownList_t  **listp,
    293   RF_Raid_t           *raidPtr,
    294   RF_Config_t         *cfgPtr)
    295 {
    296 	int rc;
    297 
    298 	raidPtr->lockTable = rf_MakeLockTable();
    299 	if (raidPtr->lockTable == NULL)
    300 		return(ENOMEM);
    301 	rc = rf_ShutdownCreate(listp, rf_RaidShutdownStripeLocks, raidPtr);
    302 	if (rc) {
    303 		RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n",
    304 			__FILE__, __LINE__, rc);
    305 		rf_ShutdownStripeLocks(raidPtr->lockTable);
    306 		return(rc);
    307 	}
    308 	return(0);
    309 }
    310 
    311 /* returns 0 if you've got the lock, and non-zero if you have to wait.
    312  * if and only if you have to wait, we'll cause cbFunc to get invoked
    313  * with cbArg when you are granted the lock.  We store a tag in *releaseTag
    314  * that you need to give back to us when you release the lock.
    315  */
    316 int rf_AcquireStripeLock(
    317   RF_LockTableEntry_t  *lockTable,
    318   RF_StripeNum_t        stripeID,
    319   RF_LockReqDesc_t     *lockReqDesc)
    320 {
    321   RF_StripeLockDesc_t *lockDesc;
    322   RF_LockReqDesc_t    *p;
    323   int tid=0, hashval = HASH_STRIPEID(stripeID);
    324   int retcode = 0;
    325 
    326   RF_ASSERT(RF_IO_IS_R_OR_W(lockReqDesc->type));
    327 
    328   if (rf_stripeLockDebug) {
    329     rf_get_threadid(tid);
    330     if (stripeID == -1) Dprintf1("[%d] Lock acquisition supressed (stripeID == -1)\n",tid);
    331     else {
    332       Dprintf8("[%d] Trying to acquire stripe lock table 0x%lx SID %ld type %c range %ld-%ld, range2 %ld-%ld hashval %d\n",
    333         tid, (unsigned long) lockTable, stripeID, lockReqDesc->type, lockReqDesc->start,
    334         lockReqDesc->stop, lockReqDesc->start2, lockReqDesc->stop2);
    335       Dprintf3("[%d] lock %ld hashval %d\n", tid, stripeID, hashval);
    336       FLUSH;
    337     }
    338   }
    339   if (stripeID == -1) return(0);
    340   lockReqDesc->next = NULL;                       /* just to be sure */
    341 
    342   RF_LOCK_MUTEX(lockTable[hashval].mutex);
    343   for (lockDesc = lockTable[hashval].descList; lockDesc; lockDesc=lockDesc->next) {
    344     if (lockDesc->stripeID == stripeID) break;
    345   }
    346 
    347   if (!lockDesc) {            /* no entry in table => no one reading or writing */
    348     lockDesc = AllocStripeLockDesc(stripeID);
    349     lockDesc->next = lockTable[hashval].descList;
    350     lockTable[hashval].descList = lockDesc;
    351     if (lockReqDesc->type == RF_IO_TYPE_WRITE) lockDesc->nWriters++;
    352     lockDesc->granted = lockReqDesc;
    353     if (rf_stripeLockDebug) {Dprintf7("[%d] no one waiting: lock %ld %c %ld-%ld %ld-%ld granted\n",
    354 				tid,stripeID,lockReqDesc->type,lockReqDesc->start,lockReqDesc->stop,lockReqDesc->start2,lockReqDesc->stop2); FLUSH;}
    355   } else {
    356 
    357     if (lockReqDesc->type == RF_IO_TYPE_WRITE) lockDesc->nWriters++;
    358 
    359     if (lockDesc->nWriters == 0) {                        /* no need to search any lists if there are no writers anywhere */
    360       lockReqDesc->next = lockDesc->granted;
    361       lockDesc->granted = lockReqDesc;
    362       if (rf_stripeLockDebug) {Dprintf7("[%d] no writers: lock %ld %c %ld-%ld %ld-%ld granted\n",
    363 				  tid,stripeID,lockReqDesc->type,lockReqDesc->start,lockReqDesc->stop,lockReqDesc->start2,lockReqDesc->stop2); FLUSH;}
    364     } else {
    365 
    366       /* search the granted & waiting lists for a conflict.  stop searching as soon as we find one */
    367       retcode = 0;
    368       for (p = lockDesc->granted; p; p=p->next) if (STRIPELOCK_CONFLICT(lockReqDesc, p)) {retcode = 1; break;}
    369       if (!retcode) for (p = lockDesc->waitersH; p; p=p->next) if (STRIPELOCK_CONFLICT(lockReqDesc, p)) {retcode = 2; break;}
    370 
    371       if (!retcode) {
    372 	lockReqDesc->next = lockDesc->granted;                                   /* no conflicts found => grant lock */
    373 	lockDesc->granted = lockReqDesc;
    374 	if (rf_stripeLockDebug) {
    375 		Dprintf7("[%d] no conflicts: lock %ld %c %ld-%ld %ld-%ld granted\n",
    376 			tid,stripeID,lockReqDesc->type,lockReqDesc->start,lockReqDesc->stop,
    377 			lockReqDesc->start2,lockReqDesc->stop2);
    378 		FLUSH;
    379 	}
    380       } else {
    381 	if (rf_stripeLockDebug) {
    382 		Dprintf6("[%d] conflict: lock %ld %c %ld-%ld hashval=%d not granted\n",
    383 				tid,stripeID,lockReqDesc->type,lockReqDesc->start,lockReqDesc->stop,
    384 				hashval);
    385 		Dprintf3("[%d] lock %ld retcode=%d\n", tid, stripeID, retcode);
    386 		FLUSH;
    387 	}
    388 	AddToWaitersQueue(lockTable, lockDesc, lockReqDesc);                     /* conflict => the current access must wait */
    389       }
    390     }
    391   }
    392 
    393   RF_UNLOCK_MUTEX(lockTable[hashval].mutex);
    394   return(retcode);
    395 }
    396 
    397 void rf_ReleaseStripeLock(
    398   RF_LockTableEntry_t  *lockTable,
    399   RF_StripeNum_t        stripeID,
    400   RF_LockReqDesc_t     *lockReqDesc)
    401 {
    402   RF_StripeLockDesc_t *lockDesc, *ld_t;
    403   RF_LockReqDesc_t    *lr, *lr_t, *callbacklist, *t;
    404   RF_IoType_t type = lockReqDesc->type;
    405   int tid=0, hashval = HASH_STRIPEID(stripeID);
    406   int release_it, consider_it;
    407   RF_LockReqDesc_t *candidate, *candidate_t, *predecessor;
    408 
    409   RF_ASSERT(RF_IO_IS_R_OR_W(type));
    410 
    411   if (rf_stripeLockDebug) {
    412     rf_get_threadid(tid);
    413     if (stripeID == -1) Dprintf1("[%d] Lock release supressed (stripeID == -1)\n",tid);
    414     else {Dprintf8("[%d] Releasing stripe lock on stripe ID %ld, type %c range %ld-%ld %ld-%ld table 0x%lx\n",
    415 		 tid,stripeID,lockReqDesc->type,lockReqDesc->start,lockReqDesc->stop,lockReqDesc->start2,lockReqDesc->stop2, lockTable); FLUSH;}
    416   }
    417 
    418   if (stripeID == -1) return;
    419 
    420   RF_LOCK_MUTEX(lockTable[hashval].mutex);
    421 
    422   /* find the stripe lock descriptor */
    423   for (ld_t = NULL, lockDesc = lockTable[hashval].descList; lockDesc; ld_t = lockDesc, lockDesc=lockDesc->next) {
    424     if (lockDesc->stripeID == stripeID) break;
    425   }
    426   RF_ASSERT(lockDesc);                                    /* major error to release a lock that doesn't exist */
    427 
    428   /* find the stripe lock request descriptor & delete it from the list */
    429   for (lr_t = NULL, lr = lockDesc->granted; lr; lr_t = lr, lr=lr->next) if (lr == lockReqDesc) break;
    430 
    431   RF_ASSERT(lr && (lr == lockReqDesc));                            /* major error to release a lock that hasn't been granted */
    432   if (lr_t) lr_t->next = lr->next; else {
    433     RF_ASSERT(lr == lockDesc->granted);
    434     lockDesc->granted = lr->next;
    435   }
    436   lr->next = NULL;
    437 
    438   if (lockReqDesc->type == RF_IO_TYPE_WRITE) lockDesc->nWriters--;
    439 
    440   /* search through the waiters list to see if anyone needs to be woken up.
    441    * for each such descriptor in the wait list, we check it against everything granted and against
    442    * everything _in front_ of it in the waiters queue.  If it conflicts with none of these, we release it.
    443    *
    444    * DON'T TOUCH THE TEMPLINK POINTER OF ANYTHING IN THE GRANTED LIST HERE.  This will roach the case where
    445    * the callback tries to acquire a new lock in the same stripe.  There are some asserts to try and detect this.
    446    *
    447    * We apply 2 performance optimizations:
    448    * (1) if releasing this lock results in no more writers to this stripe, we just release everybody waiting,
    449    * since we place no restrictions on the number of concurrent reads.
    450    * (2) we consider as candidates for wakeup only those waiters that have a range overlap with either
    451    * the descriptor being woken up or with something in the callbacklist (i.e. something we've just now woken up).
    452    * This allows us to avoid the long evaluation for some descriptors.
    453    */
    454 
    455   callbacklist = NULL;
    456   if (lockDesc->nWriters == 0) {                          /* performance tweak (1) */
    457     while (lockDesc->waitersH) {
    458 
    459       lr = lockDesc->waitersH;                            /* delete from waiters list */
    460       lockDesc->waitersH = lr->next;
    461 
    462       RF_ASSERT(lr->type == RF_IO_TYPE_READ);
    463 
    464       lr->next = lockDesc->granted;                       /* add to granted list */
    465       lockDesc->granted = lr;
    466 
    467       RF_ASSERT(!lr->templink);
    468       lr->templink = callbacklist;                        /* put on callback list so that we'll invoke callback below */
    469       callbacklist = lr;
    470       if (rf_stripeLockDebug) {Dprintf8("[%d] No writers: granting lock stripe ID %ld, type %c range %ld-%ld %ld-%ld table 0x%lx\n",
    471 				     tid,stripeID,lr->type,lr->start,lr->stop,lr->start2,lr->stop2,(unsigned long) lockTable); FLUSH;}
    472     }
    473     lockDesc->waitersT = NULL;                            /* we've purged the whole waiters list */
    474 
    475   } else for (candidate_t = NULL, candidate = lockDesc->waitersH; candidate; ) {
    476 
    477     /* performance tweak (2) */
    478     consider_it = 0;
    479     if (RANGE_OVERLAP(lockReqDesc, candidate)) consider_it = 1;
    480     else for (t = callbacklist; t; t=t->templink) if (RANGE_OVERLAP(t, candidate)) {
    481       consider_it = 1;
    482       break;
    483     }
    484     if (!consider_it) {
    485       if (rf_stripeLockDebug) {Dprintf8("[%d] No overlap: rejecting candidate stripeID %ld, type %c range %ld-%ld %ld-%ld table 0x%lx\n",
    486 				     tid, stripeID, candidate->type, candidate->start, candidate->stop, candidate->start2, candidate->stop2,
    487 				     (unsigned long) lockTable); FLUSH;}
    488       candidate_t = candidate; candidate = candidate->next;
    489       continue;
    490     }
    491 
    492 
    493     /* we have a candidate for release.  check to make sure it is not blocked by any granted locks */
    494     release_it = 1;
    495     for (predecessor = lockDesc->granted; predecessor; predecessor = predecessor->next) {
    496       if (STRIPELOCK_CONFLICT(candidate, predecessor)) {
    497 	if (rf_stripeLockDebug) {
    498 	  Dprintf8("[%d] Conflicts with granted lock: rejecting candidate stripeID %ld, type %c range %ld-%ld %ld-%ld table 0x%lx\n",
    499 		   tid, stripeID, candidate->type, candidate->start, candidate->stop, candidate->start2, candidate->stop2,
    500 		   (unsigned long) lockTable); FLUSH;
    501 	}
    502 	release_it = 0; break;
    503       }
    504     }
    505 
    506     /* now check to see if the candidate is blocked by any waiters that occur before it it the wait queue */
    507     if (release_it) for (predecessor = lockDesc->waitersH; predecessor != candidate; predecessor = predecessor->next) {
    508       if (STRIPELOCK_CONFLICT(candidate, predecessor)) {
    509 	if (rf_stripeLockDebug) {
    510 	  Dprintf8("[%d] Conflicts with waiting lock: rejecting candidate stripeID %ld, type %c range %ld-%ld %ld-%ld table 0x%lx\n",
    511 		   tid, stripeID, candidate->type, candidate->start, candidate->stop, candidate->start2, candidate->stop2,
    512 		   (unsigned long) lockTable); FLUSH;
    513 	}
    514 	release_it = 0; break;
    515       }
    516     }
    517 
    518     /* release it if indicated */
    519     if (release_it) {
    520       if (rf_stripeLockDebug) {Dprintf8("[%d] Granting lock to candidate stripeID %ld, type %c range %ld-%ld %ld-%ld table 0x%lx\n",
    521 				     tid, stripeID, candidate->type, candidate->start, candidate->stop, candidate->start2, candidate->stop2,
    522 				     (unsigned long) lockTable); FLUSH;}
    523       if (candidate_t) {
    524 	candidate_t->next = candidate->next;
    525 	if (lockDesc->waitersT == candidate) lockDesc->waitersT = candidate_t;       /* cannot be waitersH since candidate_t is not NULL */
    526       } else {
    527 	RF_ASSERT(candidate == lockDesc->waitersH);
    528 	lockDesc->waitersH = lockDesc->waitersH->next;
    529 	if (!lockDesc->waitersH) lockDesc->waitersT = NULL;
    530       }
    531       candidate->next = lockDesc->granted;                    /* move it to the granted list */
    532       lockDesc->granted = candidate;
    533 
    534       RF_ASSERT(!candidate->templink);
    535       candidate->templink = callbacklist;                     /* put it on the list of things to be called after we release the mutex */
    536       callbacklist = candidate;
    537 
    538       if (!candidate_t) candidate = lockDesc->waitersH; else candidate = candidate_t->next;     /* continue with the rest of the list */
    539     } else {
    540       candidate_t = candidate; candidate = candidate->next;                                     /* continue with the rest of the list */
    541     }
    542   }
    543 
    544   /* delete the descriptor if no one is waiting or active */
    545   if (!lockDesc->granted && !lockDesc->waitersH) {
    546     RF_ASSERT(lockDesc->nWriters == 0);
    547     if (rf_stripeLockDebug) {
    548       Dprintf3("[%d] Last lock released (table 0x%lx): deleting desc for stripeID %ld\n",tid,(unsigned long) lockTable, stripeID); FLUSH;
    549     }
    550     if (ld_t) ld_t->next = lockDesc->next; else {
    551       RF_ASSERT(lockDesc == lockTable[hashval].descList);
    552       lockTable[hashval].descList = lockDesc->next;
    553     }
    554     FreeStripeLockDesc(lockDesc);
    555     lockDesc = NULL;                                                               /* only for the ASSERT below */
    556   }
    557 
    558   RF_UNLOCK_MUTEX(lockTable[hashval].mutex);
    559 
    560   /* now that we've unlocked the mutex, invoke the callback on all the descriptors in the list */
    561   RF_ASSERT(!( (callbacklist) && (!lockDesc) ));                   /* if we deleted the descriptor, we should have no callbacks to do */
    562   for (candidate = callbacklist; candidate; ) {
    563     t = candidate;
    564     candidate = candidate->templink;
    565     t->templink = NULL;
    566     (t->cbFunc)(t->cbArg);
    567   }
    568 }
    569 
    570 /* must have the indicated lock table mutex upon entry */
    571 static void AddToWaitersQueue(
    572   RF_LockTableEntry_t  *lockTable,
    573   RF_StripeLockDesc_t  *lockDesc,
    574   RF_LockReqDesc_t     *lockReqDesc)
    575 {
    576   int tid;
    577 
    578   if (rf_stripeLockDebug) {
    579     rf_get_threadid(tid);
    580     Dprintf3("[%d] Waiting on lock for stripe %ld table 0x%lx\n", tid, lockDesc->stripeID, (unsigned long) lockTable); FLUSH;
    581   }
    582   if (!lockDesc->waitersH) {
    583     lockDesc->waitersH = lockDesc->waitersT = lockReqDesc;
    584   } else {
    585     lockDesc->waitersT->next = lockReqDesc;
    586     lockDesc->waitersT = lockReqDesc;
    587   }
    588 }
    589 
    590 static RF_StripeLockDesc_t *AllocStripeLockDesc(RF_StripeNum_t stripeID)
    591 {
    592 	RF_StripeLockDesc_t *p;
    593 
    594 	RF_FREELIST_GET(rf_stripelock_freelist,p,next,(RF_StripeLockDesc_t *));
    595 	if (p) {
    596 		p->stripeID = stripeID;
    597 	}
    598 	return(p);
    599 }
    600 
    601 static void FreeStripeLockDesc(RF_StripeLockDesc_t *p)
    602 {
    603 	RF_FREELIST_FREE(rf_stripelock_freelist,p,next);
    604 }
    605 
    606 static void PrintLockedStripes(lockTable)
    607   RF_LockTableEntry_t  *lockTable;
    608 {
    609   int i, j, foundone = 0, did;
    610   RF_StripeLockDesc_t *p;
    611   RF_LockReqDesc_t *q;
    612 
    613   RF_LOCK_MUTEX(rf_printf_mutex);
    614   printf("Locked stripes:\n");
    615   for (i=0; i<rf_lockTableSize; i++) if (lockTable[i].descList) {
    616     foundone = 1;
    617     for (p = lockTable[i].descList; p; p=p->next) {
    618       printf("Stripe ID 0x%lx (%d) nWriters %d\n",
    619 	     (long)p->stripeID, (int)p->stripeID, p->nWriters);
    620 
    621       if (! (p->granted) ) printf("Granted: (none)\n"); else printf("Granted:\n");
    622       for (did=1,j=0,q = p->granted; q; j++,q=q->next) {
    623 	printf("  %c(%ld-%ld",q->type,(long)q->start,(long)q->stop);
    624 	if (q->start2 != -1) printf(",%ld-%ld) ",(long)q->start2,
    625 				    (long)q->stop2); else printf(") ");
    626 	if (j && !(j%4)) {printf("\n"); did=1;} else did=0;
    627       }
    628       if (!did) printf("\n");
    629 
    630       if (! (p->waitersH) ) printf("Waiting: (none)\n"); else printf("Waiting:\n");
    631       for (did=1,j=0,q = p->waitersH; q; j++,q=q->next) {
    632 	printf("%c(%ld-%ld",q->type,(long)q->start,(long)q->stop);
    633 	if (q->start2 != -1) printf(",%ld-%ld) ",(long)q->start2,(long)q->stop2); else printf(") ");
    634 	if (j && !(j%4)) {printf("\n         "); did=1;} else did=0;
    635       }
    636       if (!did) printf("\n");
    637     }
    638   }
    639   if (!foundone) printf("(none)\n"); else printf("\n");
    640   RF_UNLOCK_MUTEX(rf_printf_mutex);
    641 }
    642