1 1.20 christos /* $NetBSD: rf_raid5.c,v 1.20 2019/02/09 03:34:00 christos Exp $ */ 2 1.1 oster /* 3 1.1 oster * Copyright (c) 1995 Carnegie-Mellon University. 4 1.1 oster * All rights reserved. 5 1.1 oster * 6 1.1 oster * Author: Mark Holland 7 1.1 oster * 8 1.1 oster * Permission to use, copy, modify and distribute this software and 9 1.1 oster * its documentation is hereby granted, provided that both the copyright 10 1.1 oster * notice and this permission notice appear in all copies of the 11 1.1 oster * software, derivative works or modified versions, and any portions 12 1.1 oster * thereof, and that both notices appear in supporting documentation. 13 1.1 oster * 14 1.1 oster * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 15 1.1 oster * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 16 1.1 oster * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 17 1.1 oster * 18 1.1 oster * Carnegie Mellon requests users of this software to return to 19 1.1 oster * 20 1.1 oster * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU 21 1.1 oster * School of Computer Science 22 1.1 oster * Carnegie Mellon University 23 1.1 oster * Pittsburgh PA 15213-3890 24 1.1 oster * 25 1.1 oster * any improvements or extensions that they make and grant Carnegie the 26 1.1 oster * rights to redistribute these changes. 27 1.1 oster */ 28 1.1 oster 29 1.1 oster /****************************************************************************** 30 1.1 oster * 31 1.1 oster * rf_raid5.c -- implements RAID Level 5 32 1.1 oster * 33 1.1 oster *****************************************************************************/ 34 1.6 lukem 35 1.6 lukem #include <sys/cdefs.h> 36 1.20 christos __KERNEL_RCSID(0, "$NetBSD: rf_raid5.c,v 1.20 2019/02/09 03:34:00 christos Exp $"); 37 1.1 oster 38 1.5 oster #include <dev/raidframe/raidframevar.h> 39 1.5 oster 40 1.1 oster #include "rf_raid.h" 41 1.1 oster #include "rf_raid5.h" 42 1.1 oster #include "rf_dag.h" 43 1.1 oster #include "rf_dagffrd.h" 44 1.1 oster #include "rf_dagffwr.h" 45 1.1 oster #include "rf_dagdegrd.h" 46 1.1 oster #include "rf_dagdegwr.h" 47 1.1 oster #include "rf_dagutils.h" 48 1.1 oster #include "rf_general.h" 49 1.1 oster #include "rf_map.h" 50 1.1 oster #include "rf_utils.h" 51 1.1 oster 52 1.1 oster typedef struct RF_Raid5ConfigInfo_s { 53 1.3 oster RF_RowCol_t **stripeIdentifier; /* filled in at config time and used 54 1.3 oster * by IdentifyStripe */ 55 1.3 oster } RF_Raid5ConfigInfo_t; 56 1.3 oster 57 1.15 perry int 58 1.19 christos rf_ConfigureRAID5(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr, 59 1.19 christos RF_Config_t *cfgPtr) 60 1.3 oster { 61 1.3 oster RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; 62 1.3 oster RF_Raid5ConfigInfo_t *info; 63 1.3 oster RF_RowCol_t i, j, startdisk; 64 1.3 oster 65 1.3 oster /* create a RAID level 5 configuration structure */ 66 1.20 christos info = RF_MallocAndAdd(sizeof(*info), raidPtr->cleanupList); 67 1.3 oster if (info == NULL) 68 1.3 oster return (ENOMEM); 69 1.3 oster layoutPtr->layoutSpecificInfo = (void *) info; 70 1.3 oster 71 1.3 oster /* the stripe identifier must identify the disks in each stripe, IN 72 1.3 oster * THE ORDER THAT THEY APPEAR IN THE STRIPE. */ 73 1.3 oster info->stripeIdentifier = rf_make_2d_array(raidPtr->numCol, raidPtr->numCol, raidPtr->cleanupList); 74 1.3 oster if (info->stripeIdentifier == NULL) 75 1.3 oster return (ENOMEM); 76 1.3 oster startdisk = 0; 77 1.3 oster for (i = 0; i < raidPtr->numCol; i++) { 78 1.3 oster for (j = 0; j < raidPtr->numCol; j++) { 79 1.3 oster info->stripeIdentifier[i][j] = (startdisk + j) % raidPtr->numCol; 80 1.3 oster } 81 1.3 oster if ((--startdisk) < 0) 82 1.3 oster startdisk = raidPtr->numCol - 1; 83 1.3 oster } 84 1.1 oster 85 1.3 oster /* fill in the remaining layout parameters */ 86 1.3 oster layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk; 87 1.3 oster layoutPtr->numDataCol = raidPtr->numCol - 1; 88 1.3 oster layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit; 89 1.3 oster layoutPtr->numParityCol = 1; 90 1.3 oster layoutPtr->dataStripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk; 91 1.1 oster 92 1.3 oster raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk * layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit; 93 1.1 oster 94 1.3 oster return (0); 95 1.1 oster } 96 1.1 oster 97 1.15 perry int 98 1.19 christos rf_GetDefaultNumFloatingReconBuffersRAID5(RF_Raid_t *raidPtr) 99 1.1 oster { 100 1.3 oster return (20); 101 1.1 oster } 102 1.1 oster 103 1.15 perry RF_HeadSepLimit_t 104 1.19 christos rf_GetDefaultHeadSepLimitRAID5(RF_Raid_t *raidPtr) 105 1.1 oster { 106 1.3 oster return (10); 107 1.1 oster } 108 1.1 oster #if !defined(__NetBSD__) && !defined(_KERNEL) 109 1.1 oster /* not currently used */ 110 1.15 perry int 111 1.10 oster rf_ShutdownRAID5(RF_Raid_t *raidPtr) 112 1.1 oster { 113 1.3 oster return (0); 114 1.1 oster } 115 1.1 oster #endif 116 1.1 oster 117 1.15 perry void 118 1.10 oster rf_MapSectorRAID5(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector, 119 1.18 christos RF_RowCol_t *col, RF_SectorNum_t *diskSector, 120 1.19 christos int remap) 121 1.1 oster { 122 1.3 oster RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit; 123 1.3 oster *col = (SUID % raidPtr->numCol); 124 1.3 oster *diskSector = (SUID / (raidPtr->Layout.numDataCol)) * raidPtr->Layout.sectorsPerStripeUnit + 125 1.3 oster (raidSector % raidPtr->Layout.sectorsPerStripeUnit); 126 1.1 oster } 127 1.1 oster 128 1.15 perry void 129 1.10 oster rf_MapParityRAID5(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector, 130 1.18 christos RF_RowCol_t *col, RF_SectorNum_t *diskSector, 131 1.19 christos int remap) 132 1.1 oster { 133 1.3 oster RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit; 134 1.3 oster 135 1.3 oster *col = raidPtr->Layout.numDataCol - (SUID / raidPtr->Layout.numDataCol) % raidPtr->numCol; 136 1.3 oster *diskSector = (SUID / (raidPtr->Layout.numDataCol)) * raidPtr->Layout.sectorsPerStripeUnit + 137 1.3 oster (raidSector % raidPtr->Layout.sectorsPerStripeUnit); 138 1.1 oster } 139 1.1 oster 140 1.15 perry void 141 1.10 oster rf_IdentifyStripeRAID5(RF_Raid_t *raidPtr, RF_RaidAddr_t addr, 142 1.10 oster RF_RowCol_t **diskids) 143 1.1 oster { 144 1.3 oster RF_StripeNum_t stripeID = rf_RaidAddressToStripeID(&raidPtr->Layout, addr); 145 1.3 oster RF_Raid5ConfigInfo_t *info = (RF_Raid5ConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo; 146 1.1 oster 147 1.3 oster *diskids = info->stripeIdentifier[stripeID % raidPtr->numCol]; 148 1.1 oster } 149 1.1 oster 150 1.15 perry void 151 1.19 christos rf_MapSIDToPSIDRAID5(RF_RaidLayout_t *layoutPtr, 152 1.18 christos RF_StripeNum_t stripeID, 153 1.10 oster RF_StripeNum_t *psID, RF_ReconUnitNum_t *which_ru) 154 1.1 oster { 155 1.3 oster *which_ru = 0; 156 1.3 oster *psID = stripeID; 157 1.1 oster } 158 1.1 oster /* select an algorithm for performing an access. Returns two pointers, 159 1.1 oster * one to a function that will return information about the DAG, and 160 1.1 oster * another to a function that will create the dag. 161 1.1 oster */ 162 1.15 perry void 163 1.10 oster rf_RaidFiveDagSelect(RF_Raid_t *raidPtr, RF_IoType_t type, 164 1.10 oster RF_AccessStripeMap_t *asmap, 165 1.10 oster RF_VoidFuncPtr *createFunc) 166 1.1 oster { 167 1.3 oster RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); 168 1.3 oster RF_PhysDiskAddr_t *failedPDA = NULL; 169 1.9 oster RF_RowCol_t fcol; 170 1.3 oster RF_RowStatus_t rstat; 171 1.3 oster int prior_recon; 172 1.3 oster 173 1.3 oster RF_ASSERT(RF_IO_IS_R_OR_W(type)); 174 1.3 oster 175 1.11 oster if ((asmap->numDataFailed + asmap->numParityFailed > 1) || 176 1.11 oster (raidPtr->numFailures > 1)){ 177 1.13 oster #if RF_DEBUG_DAG 178 1.15 perry if (rf_dagDebug) 179 1.11 oster RF_ERRORMSG("Multiple disks failed in a single group! Aborting I/O operation.\n"); 180 1.13 oster #endif 181 1.8 oster *createFunc = NULL; 182 1.3 oster return; 183 1.12 oster } 184 1.3 oster 185 1.12 oster if (asmap->numDataFailed + asmap->numParityFailed == 1) { 186 1.15 perry 187 1.12 oster /* if under recon & already reconstructed, redirect 188 1.12 oster * the access to the spare drive and eliminate the 189 1.12 oster * failure indication */ 190 1.12 oster failedPDA = asmap->failedPDAs[0]; 191 1.12 oster fcol = failedPDA->col; 192 1.12 oster rstat = raidPtr->status; 193 1.12 oster prior_recon = (rstat == rf_rs_reconfigured) || ( 194 1.3 oster (rstat == rf_rs_reconstructing) ? 195 1.9 oster rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, failedPDA->startSector) : 0 196 1.3 oster ); 197 1.12 oster if (prior_recon) { 198 1.13 oster #if RF_DEBUG_DAG > 0 || RF_DEBUG_MAP > 0 199 1.12 oster RF_RowCol_t oc = failedPDA->col; 200 1.12 oster RF_SectorNum_t oo = failedPDA->startSector; 201 1.15 perry #endif 202 1.14 oster #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0 203 1.12 oster if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) { /* redirect to dist 204 1.12 oster * spare space */ 205 1.15 perry 206 1.12 oster if (failedPDA == asmap->parityInfo) { 207 1.15 perry 208 1.12 oster /* parity has failed */ 209 1.12 oster (layoutPtr->map->MapParity) (raidPtr, failedPDA->raidAddress, 210 1.12 oster &failedPDA->col, &failedPDA->startSector, RF_REMAP); 211 1.15 perry 212 1.12 oster if (asmap->parityInfo->next) { /* redir 2nd component, 213 1.12 oster * if any */ 214 1.12 oster RF_PhysDiskAddr_t *p = asmap->parityInfo->next; 215 1.12 oster RF_SectorNum_t SUoffs = p->startSector % layoutPtr->sectorsPerStripeUnit; 216 1.12 oster p->col = failedPDA->col; 217 1.12 oster p->startSector = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, failedPDA->startSector) + 218 1.12 oster SUoffs; /* cheating: 219 1.12 oster * startSector is not 220 1.12 oster * really a RAID address */ 221 1.12 oster } 222 1.12 oster } else 223 1.12 oster if (asmap->parityInfo->next && failedPDA == asmap->parityInfo->next) { 224 1.12 oster RF_ASSERT(0); /* should not ever 225 1.12 oster * happen */ 226 1.12 oster } else { 227 1.15 perry 228 1.12 oster /* data has failed */ 229 1.12 oster (layoutPtr->map->MapSector) (raidPtr, failedPDA->raidAddress, 230 1.12 oster &failedPDA->col, &failedPDA->startSector, RF_REMAP); 231 1.15 perry 232 1.12 oster } 233 1.15 perry 234 1.14 oster } else { 235 1.15 perry #endif 236 1.14 oster /* redirect to dedicated spare space */ 237 1.15 perry 238 1.12 oster failedPDA->col = raidPtr->Disks[fcol].spareCol; 239 1.15 perry 240 1.12 oster /* the parity may have two distinct 241 1.12 oster * components, both of which may need 242 1.12 oster * to be redirected */ 243 1.12 oster if (asmap->parityInfo->next) { 244 1.3 oster if (failedPDA == asmap->parityInfo) { 245 1.12 oster failedPDA->next->col = failedPDA->col; 246 1.3 oster } else 247 1.12 oster if (failedPDA == asmap->parityInfo->next) { /* paranoid: should 248 1.12 oster * never occur */ 249 1.12 oster asmap->parityInfo->col = failedPDA->col; 250 1.3 oster } 251 1.3 oster } 252 1.14 oster #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0 253 1.3 oster } 254 1.14 oster #endif 255 1.12 oster RF_ASSERT(failedPDA->col != -1); 256 1.15 perry 257 1.13 oster #if RF_DEBUG_DAG > 0 || RF_DEBUG_MAP > 0 258 1.12 oster if (rf_dagDebug || rf_mapDebug) { 259 1.12 oster printf("raid%d: Redirected type '%c' c %d o %ld -> c %d o %ld\n", 260 1.15 perry raidPtr->raidid, type, oc, 261 1.12 oster (long) oo, failedPDA->col, 262 1.12 oster (long) failedPDA->startSector); 263 1.12 oster } 264 1.13 oster #endif 265 1.12 oster asmap->numDataFailed = asmap->numParityFailed = 0; 266 1.3 oster } 267 1.12 oster } 268 1.3 oster /* all dags begin/end with block/unblock node therefore, hdrSucc & 269 1.3 oster * termAnt counts should always be 1 also, these counts should not be 270 1.3 oster * visible outside dag creation routines - manipulating the counts 271 1.3 oster * here should be removed */ 272 1.3 oster if (type == RF_IO_TYPE_READ) { 273 1.3 oster if (asmap->numDataFailed == 0) 274 1.3 oster *createFunc = (RF_VoidFuncPtr) rf_CreateFaultFreeReadDAG; 275 1.3 oster else 276 1.3 oster *createFunc = (RF_VoidFuncPtr) rf_CreateRaidFiveDegradedReadDAG; 277 1.1 oster } else { 278 1.1 oster 279 1.1 oster 280 1.3 oster /* if mirroring, always use large writes. If the access 281 1.3 oster * requires two distinct parity updates, always do a small 282 1.3 oster * write. If the stripe contains a failure but the access 283 1.3 oster * does not, do a small write. The first conditional 284 1.3 oster * (numStripeUnitsAccessed <= numDataCol/2) uses a 285 1.3 oster * less-than-or-equal rather than just a less-than because 286 1.3 oster * when G is 3 or 4, numDataCol/2 is 1, and I want 287 1.3 oster * single-stripe-unit updates to use just one disk. */ 288 1.3 oster if ((asmap->numDataFailed + asmap->numParityFailed) == 0) { 289 1.3 oster if (rf_suppressLocksAndLargeWrites || 290 1.3 oster (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) && (layoutPtr->numDataCol != 1)) || 291 1.3 oster (asmap->parityInfo->next != NULL) || rf_CheckStripeForFailures(raidPtr, asmap))) { 292 1.3 oster *createFunc = (RF_VoidFuncPtr) rf_CreateSmallWriteDAG; 293 1.3 oster } else 294 1.3 oster *createFunc = (RF_VoidFuncPtr) rf_CreateLargeWriteDAG; 295 1.3 oster } else { 296 1.3 oster if (asmap->numParityFailed == 1) 297 1.3 oster *createFunc = (RF_VoidFuncPtr) rf_CreateNonRedundantWriteDAG; 298 1.3 oster else 299 1.17 christos if (asmap->numStripeUnitsAccessed != 1 && (failedPDA == NULL || failedPDA->numSector != layoutPtr->sectorsPerStripeUnit)) 300 1.3 oster *createFunc = NULL; 301 1.3 oster else 302 1.3 oster *createFunc = (RF_VoidFuncPtr) rf_CreateDegradedWriteDAG; 303 1.3 oster } 304 1.1 oster } 305 1.1 oster } 306