Home | History | Annotate | Line # | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
     24  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
     25  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
     26  * Copyright (c) 2014 Integros [integros.com]
     27  */
     28 
     29 #include <sys/zfs_context.h>
     30 #include <sys/spa.h>
     31 #include <sys/vdev_impl.h>
     32 #ifndef __FreeBSD__
     33 #include <sys/vdev_disk.h>
     34 #endif
     35 #include <sys/vdev_file.h>
     36 #include <sys/vdev_raidz.h>
     37 #include <sys/zio.h>
     38 #include <sys/zio_checksum.h>
     39 #include <sys/fs/zfs.h>
     40 #include <sys/fm/fs/zfs.h>
     41 #ifdef __FreeBSD__
     42 #include <sys/bio.h>
     43 #endif
     44 
     45 /*
     46  * Virtual device vector for RAID-Z.
     47  *
     48  * This vdev supports single, double, and triple parity. For single parity,
     49  * we use a simple XOR of all the data columns. For double or triple parity,
     50  * we use a special case of Reed-Solomon coding. This extends the
     51  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
     52  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
     53  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
     54  * former is also based. The latter is designed to provide higher performance
     55  * for writes.
     56  *
     57  * Note that the Plank paper claimed to support arbitrary N+M, but was then
     58  * amended six years later identifying a critical flaw that invalidates its
     59  * claims. Nevertheless, the technique can be adapted to work for up to
     60  * triple parity. For additional parity, the amendment "Note: Correction to
     61  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
     62  * is viable, but the additional complexity means that write performance will
     63  * suffer.
     64  *
     65  * All of the methods above operate on a Galois field, defined over the
     66  * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements
     67  * can be expressed with a single byte. Briefly, the operations on the
     68  * field are defined as follows:
     69  *
     70  *   o addition (+) is represented by a bitwise XOR
     71  *   o subtraction (-) is therefore identical to addition: A + B = A - B
     72  *   o multiplication of A by 2 is defined by the following bitwise expression:
     73  *
     74  *	(A * 2)_7 = A_6
     75  *	(A * 2)_6 = A_5
     76  *	(A * 2)_5 = A_4
     77  *	(A * 2)_4 = A_3 + A_7
     78  *	(A * 2)_3 = A_2 + A_7
     79  *	(A * 2)_2 = A_1 + A_7
     80  *	(A * 2)_1 = A_0
     81  *	(A * 2)_0 = A_7
     82  *
     83  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
     84  * As an aside, this multiplication is derived from the error correcting
     85  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
     86  *
     87  * Observe that any number in the field (except for 0) can be expressed as a
     88  * power of 2 -- a generator for the field. We store a table of the powers of
     89  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
     90  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
     91  * than field addition). The inverse of a field element A (A^-1) is therefore
     92  * A ^ (255 - 1) = A^254.
     93  *
     94  * The up-to-three parity columns, P, Q, R over several data columns,
     95  * D_0, ... D_n-1, can be expressed by field operations:
     96  *
     97  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
     98  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
     99  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
    100  *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
    101  *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
    102  *
    103  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival
    104  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
    105  * independent coefficients. (There are no additional coefficients that have
    106  * this property which is why the uncorrected Plank method breaks down.)
    107  *
    108  * See the reconstruction code below for how P, Q and R can used individually
    109  * or in concert to recover missing data columns.
    110  */
    111 
    112 typedef struct raidz_col {
    113 	uint64_t rc_devidx;		/* child device index for I/O */
    114 	uint64_t rc_offset;		/* device offset */
    115 	uint64_t rc_size;		/* I/O size */
    116 	void *rc_data;			/* I/O data */
    117 	void *rc_gdata;			/* used to store the "good" version */
    118 	int rc_error;			/* I/O error for this device */
    119 	uint8_t rc_tried;		/* Did we attempt this I/O column? */
    120 	uint8_t rc_skipped;		/* Did we skip this I/O column? */
    121 } raidz_col_t;
    122 
    123 typedef struct raidz_map {
    124 	uint64_t rm_cols;		/* Regular column count */
    125 	uint64_t rm_scols;		/* Count including skipped columns */
    126 	uint64_t rm_bigcols;		/* Number of oversized columns */
    127 	uint64_t rm_asize;		/* Actual total I/O size */
    128 	uint64_t rm_missingdata;	/* Count of missing data devices */
    129 	uint64_t rm_missingparity;	/* Count of missing parity devices */
    130 	uint64_t rm_firstdatacol;	/* First data column/parity count */
    131 	uint64_t rm_nskip;		/* Skipped sectors for padding */
    132 	uint64_t rm_skipstart;		/* Column index of padding start */
    133 	void *rm_datacopy;		/* rm_asize-buffer of copied data */
    134 	uintptr_t rm_reports;		/* # of referencing checksum reports */
    135 	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
    136 	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
    137 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
    138 } raidz_map_t;
    139 
    140 #define	VDEV_RAIDZ_P		0
    141 #define	VDEV_RAIDZ_Q		1
    142 #define	VDEV_RAIDZ_R		2
    143 
    144 #define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
    145 #define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
    146 
    147 /*
    148  * We provide a mechanism to perform the field multiplication operation on a
    149  * 64-bit value all at once rather than a byte at a time. This works by
    150  * creating a mask from the top bit in each byte and using that to
    151  * conditionally apply the XOR of 0x1d.
    152  */
    153 #define	VDEV_RAIDZ_64MUL_2(x, mask) \
    154 { \
    155 	(mask) = (x) & 0x8080808080808080ULL; \
    156 	(mask) = ((mask) << 1) - ((mask) >> 7); \
    157 	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
    158 	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
    159 }
    160 
    161 #define	VDEV_RAIDZ_64MUL_4(x, mask) \
    162 { \
    163 	VDEV_RAIDZ_64MUL_2((x), mask); \
    164 	VDEV_RAIDZ_64MUL_2((x), mask); \
    165 }
    166 
    167 #define	VDEV_LABEL_OFFSET(x)	(x + VDEV_LABEL_START_SIZE)
    168 
    169 /*
    170  * Force reconstruction to use the general purpose method.
    171  */
    172 int vdev_raidz_default_to_general;
    173 
    174 /* Powers of 2 in the Galois field defined above. */
    175 static const uint8_t vdev_raidz_pow2[256] = {
    176 	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
    177 	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
    178 	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
    179 	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
    180 	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
    181 	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
    182 	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
    183 	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
    184 	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
    185 	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
    186 	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
    187 	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
    188 	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
    189 	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
    190 	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
    191 	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
    192 	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
    193 	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
    194 	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
    195 	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
    196 	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
    197 	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
    198 	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
    199 	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
    200 	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
    201 	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
    202 	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
    203 	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
    204 	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
    205 	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
    206 	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
    207 	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
    208 };
    209 /* Logs of 2 in the Galois field defined above. */
    210 static const uint8_t vdev_raidz_log2[256] = {
    211 	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
    212 	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
    213 	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
    214 	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
    215 	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
    216 	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
    217 	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
    218 	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
    219 	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
    220 	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
    221 	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
    222 	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
    223 	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
    224 	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
    225 	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
    226 	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
    227 	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
    228 	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
    229 	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
    230 	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
    231 	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
    232 	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
    233 	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
    234 	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
    235 	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
    236 	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
    237 	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
    238 	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
    239 	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
    240 	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
    241 	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
    242 	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
    243 };
    244 
    245 static void vdev_raidz_generate_parity(raidz_map_t *rm);
    246 
    247 /*
    248  * Multiply a given number by 2 raised to the given power.
    249  */
    250 static uint8_t
    251 vdev_raidz_exp2(uint_t a, int exp)
    252 {
    253 	if (a == 0)
    254 		return (0);
    255 
    256 	ASSERT(exp >= 0);
    257 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
    258 
    259 	exp += vdev_raidz_log2[a];
    260 	if (exp > 255)
    261 		exp -= 255;
    262 
    263 	return (vdev_raidz_pow2[exp]);
    264 }
    265 
    266 static void
    267 vdev_raidz_map_free(raidz_map_t *rm)
    268 {
    269 	int c;
    270 	size_t size;
    271 
    272 	for (c = 0; c < rm->rm_firstdatacol; c++) {
    273 		if (rm->rm_col[c].rc_data != NULL)
    274 			zio_buf_free(rm->rm_col[c].rc_data,
    275 			    rm->rm_col[c].rc_size);
    276 
    277 		if (rm->rm_col[c].rc_gdata != NULL)
    278 			zio_buf_free(rm->rm_col[c].rc_gdata,
    279 			    rm->rm_col[c].rc_size);
    280 	}
    281 
    282 	size = 0;
    283 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
    284 		size += rm->rm_col[c].rc_size;
    285 
    286 	if (rm->rm_datacopy != NULL)
    287 		zio_buf_free(rm->rm_datacopy, size);
    288 
    289 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
    290 }
    291 
    292 static void
    293 vdev_raidz_map_free_vsd(zio_t *zio)
    294 {
    295 	raidz_map_t *rm = zio->io_vsd;
    296 
    297 	ASSERT0(rm->rm_freed);
    298 	rm->rm_freed = 1;
    299 
    300 	if (rm->rm_reports == 0)
    301 		vdev_raidz_map_free(rm);
    302 }
    303 
    304 /*ARGSUSED*/
    305 static void
    306 vdev_raidz_cksum_free(void *arg, size_t ignored)
    307 {
    308 	raidz_map_t *rm = arg;
    309 
    310 	ASSERT3U(rm->rm_reports, >, 0);
    311 
    312 	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
    313 		vdev_raidz_map_free(rm);
    314 }
    315 
    316 static void
    317 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
    318 {
    319 	raidz_map_t *rm = zcr->zcr_cbdata;
    320 	size_t c = zcr->zcr_cbinfo;
    321 	size_t x;
    322 
    323 	const char *good = NULL;
    324 	const char *bad = rm->rm_col[c].rc_data;
    325 
    326 	if (good_data == NULL) {
    327 		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
    328 		return;
    329 	}
    330 
    331 	if (c < rm->rm_firstdatacol) {
    332 		/*
    333 		 * The first time through, calculate the parity blocks for
    334 		 * the good data (this relies on the fact that the good
    335 		 * data never changes for a given logical ZIO)
    336 		 */
    337 		if (rm->rm_col[0].rc_gdata == NULL) {
    338 			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
    339 			char *buf;
    340 
    341 			/*
    342 			 * Set up the rm_col[]s to generate the parity for
    343 			 * good_data, first saving the parity bufs and
    344 			 * replacing them with buffers to hold the result.
    345 			 */
    346 			for (x = 0; x < rm->rm_firstdatacol; x++) {
    347 				bad_parity[x] = rm->rm_col[x].rc_data;
    348 				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
    349 				    zio_buf_alloc(rm->rm_col[x].rc_size);
    350 			}
    351 
    352 			/* fill in the data columns from good_data */
    353 			buf = (char *)good_data;
    354 			for (; x < rm->rm_cols; x++) {
    355 				rm->rm_col[x].rc_data = buf;
    356 				buf += rm->rm_col[x].rc_size;
    357 			}
    358 
    359 			/*
    360 			 * Construct the parity from the good data.
    361 			 */
    362 			vdev_raidz_generate_parity(rm);
    363 
    364 			/* restore everything back to its original state */
    365 			for (x = 0; x < rm->rm_firstdatacol; x++)
    366 				rm->rm_col[x].rc_data = bad_parity[x];
    367 
    368 			buf = rm->rm_datacopy;
    369 			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
    370 				rm->rm_col[x].rc_data = buf;
    371 				buf += rm->rm_col[x].rc_size;
    372 			}
    373 		}
    374 
    375 		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
    376 		good = rm->rm_col[c].rc_gdata;
    377 	} else {
    378 		/* adjust good_data to point at the start of our column */
    379 		good = good_data;
    380 
    381 		for (x = rm->rm_firstdatacol; x < c; x++)
    382 			good += rm->rm_col[x].rc_size;
    383 	}
    384 
    385 	/* we drop the ereport if it ends up that the data was good */
    386 	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
    387 }
    388 
    389 /*
    390  * Invoked indirectly by zfs_ereport_start_checksum(), called
    391  * below when our read operation fails completely.  The main point
    392  * is to keep a copy of everything we read from disk, so that at
    393  * vdev_raidz_cksum_finish() time we can compare it with the good data.
    394  */
    395 static void
    396 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
    397 {
    398 	size_t c = (size_t)(uintptr_t)arg;
    399 	caddr_t buf;
    400 
    401 	raidz_map_t *rm = zio->io_vsd;
    402 	size_t size;
    403 
    404 	/* set up the report and bump the refcount  */
    405 	zcr->zcr_cbdata = rm;
    406 	zcr->zcr_cbinfo = c;
    407 	zcr->zcr_finish = vdev_raidz_cksum_finish;
    408 	zcr->zcr_free = vdev_raidz_cksum_free;
    409 
    410 	rm->rm_reports++;
    411 	ASSERT3U(rm->rm_reports, >, 0);
    412 
    413 	if (rm->rm_datacopy != NULL)
    414 		return;
    415 
    416 	/*
    417 	 * It's the first time we're called for this raidz_map_t, so we need
    418 	 * to copy the data aside; there's no guarantee that our zio's buffer
    419 	 * won't be re-used for something else.
    420 	 *
    421 	 * Our parity data is already in separate buffers, so there's no need
    422 	 * to copy them.
    423 	 */
    424 
    425 	size = 0;
    426 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
    427 		size += rm->rm_col[c].rc_size;
    428 
    429 	buf = rm->rm_datacopy = zio_buf_alloc(size);
    430 
    431 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
    432 		raidz_col_t *col = &rm->rm_col[c];
    433 
    434 		bcopy(col->rc_data, buf, col->rc_size);
    435 		col->rc_data = buf;
    436 
    437 		buf += col->rc_size;
    438 	}
    439 	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
    440 }
    441 
    442 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
    443 	vdev_raidz_map_free_vsd,
    444 	vdev_raidz_cksum_report
    445 };
    446 
    447 /*
    448  * Divides the IO evenly across all child vdevs; usually, dcols is
    449  * the number of children in the target vdev.
    450  */
    451 static raidz_map_t *
    452 vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, boolean_t dofree,
    453     uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
    454 {
    455 	raidz_map_t *rm;
    456 	/* The starting RAIDZ (parent) vdev sector of the block. */
    457 	uint64_t b = offset >> unit_shift;
    458 	/* The zio's size in units of the vdev's minimum sector size. */
    459 	uint64_t s = size >> unit_shift;
    460 	/* The first column for this stripe. */
    461 	uint64_t f = b % dcols;
    462 	/* The starting byte offset on each child vdev. */
    463 	uint64_t o = (b / dcols) << unit_shift;
    464 	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
    465 
    466 	/*
    467 	 * "Quotient": The number of data sectors for this stripe on all but
    468 	 * the "big column" child vdevs that also contain "remainder" data.
    469 	 */
    470 	q = s / (dcols - nparity);
    471 
    472 	/*
    473 	 * "Remainder": The number of partial stripe data sectors in this I/O.
    474 	 * This will add a sector to some, but not all, child vdevs.
    475 	 */
    476 	r = s - q * (dcols - nparity);
    477 
    478 	/* The number of "big columns" - those which contain remainder data. */
    479 	bc = (r == 0 ? 0 : r + nparity);
    480 
    481 	/*
    482 	 * The total number of data and parity sectors associated with
    483 	 * this I/O.
    484 	 */
    485 	tot = s + nparity * (q + (r == 0 ? 0 : 1));
    486 
    487 	/* acols: The columns that will be accessed. */
    488 	/* scols: The columns that will be accessed or skipped. */
    489 	if (q == 0) {
    490 		/* Our I/O request doesn't span all child vdevs. */
    491 		acols = bc;
    492 		scols = MIN(dcols, roundup(bc, nparity + 1));
    493 	} else {
    494 		acols = dcols;
    495 		scols = dcols;
    496 	}
    497 
    498 	ASSERT3U(acols, <=, scols);
    499 
    500 	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
    501 
    502 	rm->rm_cols = acols;
    503 	rm->rm_scols = scols;
    504 	rm->rm_bigcols = bc;
    505 	rm->rm_skipstart = bc;
    506 	rm->rm_missingdata = 0;
    507 	rm->rm_missingparity = 0;
    508 	rm->rm_firstdatacol = nparity;
    509 	rm->rm_datacopy = NULL;
    510 	rm->rm_reports = 0;
    511 	rm->rm_freed = 0;
    512 	rm->rm_ecksuminjected = 0;
    513 
    514 	asize = 0;
    515 
    516 	for (c = 0; c < scols; c++) {
    517 		col = f + c;
    518 		coff = o;
    519 		if (col >= dcols) {
    520 			col -= dcols;
    521 			coff += 1ULL << unit_shift;
    522 		}
    523 		rm->rm_col[c].rc_devidx = col;
    524 		rm->rm_col[c].rc_offset = coff;
    525 		rm->rm_col[c].rc_data = NULL;
    526 		rm->rm_col[c].rc_gdata = NULL;
    527 		rm->rm_col[c].rc_error = 0;
    528 		rm->rm_col[c].rc_tried = 0;
    529 		rm->rm_col[c].rc_skipped = 0;
    530 
    531 		if (c >= acols)
    532 			rm->rm_col[c].rc_size = 0;
    533 		else if (c < bc)
    534 			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
    535 		else
    536 			rm->rm_col[c].rc_size = q << unit_shift;
    537 
    538 		asize += rm->rm_col[c].rc_size;
    539 	}
    540 
    541 	ASSERT3U(asize, ==, tot << unit_shift);
    542 	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
    543 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
    544 	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
    545 	ASSERT3U(rm->rm_nskip, <=, nparity);
    546 
    547 	if (!dofree) {
    548 		for (c = 0; c < rm->rm_firstdatacol; c++) {
    549 			rm->rm_col[c].rc_data =
    550 			    zio_buf_alloc(rm->rm_col[c].rc_size);
    551 		}
    552 
    553 		rm->rm_col[c].rc_data = data;
    554 
    555 		for (c = c + 1; c < acols; c++) {
    556 			rm->rm_col[c].rc_data =
    557 			    (char *)rm->rm_col[c - 1].rc_data +
    558 			    rm->rm_col[c - 1].rc_size;
    559 		}
    560 	}
    561 
    562 	/*
    563 	 * If all data stored spans all columns, there's a danger that parity
    564 	 * will always be on the same device and, since parity isn't read
    565 	 * during normal operation, that that device's I/O bandwidth won't be
    566 	 * used effectively. We therefore switch the parity every 1MB.
    567 	 *
    568 	 * ... at least that was, ostensibly, the theory. As a practical
    569 	 * matter unless we juggle the parity between all devices evenly, we
    570 	 * won't see any benefit. Further, occasional writes that aren't a
    571 	 * multiple of the LCM of the number of children and the minimum
    572 	 * stripe width are sufficient to avoid pessimal behavior.
    573 	 * Unfortunately, this decision created an implicit on-disk format
    574 	 * requirement that we need to support for all eternity, but only
    575 	 * for single-parity RAID-Z.
    576 	 *
    577 	 * If we intend to skip a sector in the zeroth column for padding
    578 	 * we must make sure to note this swap. We will never intend to
    579 	 * skip the first column since at least one data and one parity
    580 	 * column must appear in each row.
    581 	 */
    582 	ASSERT(rm->rm_cols >= 2);
    583 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
    584 
    585 	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
    586 		devidx = rm->rm_col[0].rc_devidx;
    587 		o = rm->rm_col[0].rc_offset;
    588 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
    589 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
    590 		rm->rm_col[1].rc_devidx = devidx;
    591 		rm->rm_col[1].rc_offset = o;
    592 
    593 		if (rm->rm_skipstart == 0)
    594 			rm->rm_skipstart = 1;
    595 	}
    596 
    597 	return (rm);
    598 }
    599 
    600 static void
    601 vdev_raidz_generate_parity_p(raidz_map_t *rm)
    602 {
    603 	uint64_t *p, *src, pcount, ccount, i;
    604 	int c;
    605 
    606 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
    607 
    608 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
    609 		src = rm->rm_col[c].rc_data;
    610 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
    611 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
    612 
    613 		if (c == rm->rm_firstdatacol) {
    614 			ASSERT(ccount == pcount);
    615 			for (i = 0; i < ccount; i++, src++, p++) {
    616 				*p = *src;
    617 			}
    618 		} else {
    619 			ASSERT(ccount <= pcount);
    620 			for (i = 0; i < ccount; i++, src++, p++) {
    621 				*p ^= *src;
    622 			}
    623 		}
    624 	}
    625 }
    626 
    627 static void
    628 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
    629 {
    630 	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
    631 	int c;
    632 
    633 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
    634 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
    635 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
    636 
    637 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
    638 		src = rm->rm_col[c].rc_data;
    639 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
    640 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
    641 
    642 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
    643 
    644 		if (c == rm->rm_firstdatacol) {
    645 			ASSERT(ccnt == pcnt || ccnt == 0);
    646 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
    647 				*p = *src;
    648 				*q = *src;
    649 			}
    650 			for (; i < pcnt; i++, src++, p++, q++) {
    651 				*p = 0;
    652 				*q = 0;
    653 			}
    654 		} else {
    655 			ASSERT(ccnt <= pcnt);
    656 
    657 			/*
    658 			 * Apply the algorithm described above by multiplying
    659 			 * the previous result and adding in the new value.
    660 			 */
    661 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
    662 				*p ^= *src;
    663 
    664 				VDEV_RAIDZ_64MUL_2(*q, mask);
    665 				*q ^= *src;
    666 			}
    667 
    668 			/*
    669 			 * Treat short columns as though they are full of 0s.
    670 			 * Note that there's therefore nothing needed for P.
    671 			 */
    672 			for (; i < pcnt; i++, q++) {
    673 				VDEV_RAIDZ_64MUL_2(*q, mask);
    674 			}
    675 		}
    676 	}
    677 }
    678 
    679 static void
    680 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
    681 {
    682 	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
    683 	int c;
    684 
    685 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
    686 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
    687 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
    688 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
    689 	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
    690 
    691 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
    692 		src = rm->rm_col[c].rc_data;
    693 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
    694 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
    695 		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
    696 
    697 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
    698 
    699 		if (c == rm->rm_firstdatacol) {
    700 			ASSERT(ccnt == pcnt || ccnt == 0);
    701 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
    702 				*p = *src;
    703 				*q = *src;
    704 				*r = *src;
    705 			}
    706 			for (; i < pcnt; i++, src++, p++, q++, r++) {
    707 				*p = 0;
    708 				*q = 0;
    709 				*r = 0;
    710 			}
    711 		} else {
    712 			ASSERT(ccnt <= pcnt);
    713 
    714 			/*
    715 			 * Apply the algorithm described above by multiplying
    716 			 * the previous result and adding in the new value.
    717 			 */
    718 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
    719 				*p ^= *src;
    720 
    721 				VDEV_RAIDZ_64MUL_2(*q, mask);
    722 				*q ^= *src;
    723 
    724 				VDEV_RAIDZ_64MUL_4(*r, mask);
    725 				*r ^= *src;
    726 			}
    727 
    728 			/*
    729 			 * Treat short columns as though they are full of 0s.
    730 			 * Note that there's therefore nothing needed for P.
    731 			 */
    732 			for (; i < pcnt; i++, q++, r++) {
    733 				VDEV_RAIDZ_64MUL_2(*q, mask);
    734 				VDEV_RAIDZ_64MUL_4(*r, mask);
    735 			}
    736 		}
    737 	}
    738 }
    739 
    740 /*
    741  * Generate RAID parity in the first virtual columns according to the number of
    742  * parity columns available.
    743  */
    744 static void
    745 vdev_raidz_generate_parity(raidz_map_t *rm)
    746 {
    747 	switch (rm->rm_firstdatacol) {
    748 	case 1:
    749 		vdev_raidz_generate_parity_p(rm);
    750 		break;
    751 	case 2:
    752 		vdev_raidz_generate_parity_pq(rm);
    753 		break;
    754 	case 3:
    755 		vdev_raidz_generate_parity_pqr(rm);
    756 		break;
    757 	default:
    758 		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
    759 	}
    760 }
    761 
    762 static int
    763 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
    764 {
    765 	uint64_t *dst, *src, xcount, ccount, count, i;
    766 	int x = tgts[0];
    767 	int c;
    768 
    769 	ASSERT(ntgts == 1);
    770 	ASSERT(x >= rm->rm_firstdatacol);
    771 	ASSERT(x < rm->rm_cols);
    772 
    773 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
    774 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
    775 	ASSERT(xcount > 0);
    776 
    777 	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
    778 	dst = rm->rm_col[x].rc_data;
    779 	for (i = 0; i < xcount; i++, dst++, src++) {
    780 		*dst = *src;
    781 	}
    782 
    783 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
    784 		src = rm->rm_col[c].rc_data;
    785 		dst = rm->rm_col[x].rc_data;
    786 
    787 		if (c == x)
    788 			continue;
    789 
    790 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
    791 		count = MIN(ccount, xcount);
    792 
    793 		for (i = 0; i < count; i++, dst++, src++) {
    794 			*dst ^= *src;
    795 		}
    796 	}
    797 
    798 	return (1 << VDEV_RAIDZ_P);
    799 }
    800 
    801 static int
    802 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
    803 {
    804 	uint64_t *dst, *src, xcount, ccount, count, mask, i;
    805 	uint8_t *b;
    806 	int x = tgts[0];
    807 	int c, j, exp;
    808 
    809 	ASSERT(ntgts == 1);
    810 
    811 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
    812 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
    813 
    814 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
    815 		src = rm->rm_col[c].rc_data;
    816 		dst = rm->rm_col[x].rc_data;
    817 
    818 		if (c == x)
    819 			ccount = 0;
    820 		else
    821 			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
    822 
    823 		count = MIN(ccount, xcount);
    824 
    825 		if (c == rm->rm_firstdatacol) {
    826 			for (i = 0; i < count; i++, dst++, src++) {
    827 				*dst = *src;
    828 			}
    829 			for (; i < xcount; i++, dst++) {
    830 				*dst = 0;
    831 			}
    832 
    833 		} else {
    834 			for (i = 0; i < count; i++, dst++, src++) {
    835 				VDEV_RAIDZ_64MUL_2(*dst, mask);
    836 				*dst ^= *src;
    837 			}
    838 
    839 			for (; i < xcount; i++, dst++) {
    840 				VDEV_RAIDZ_64MUL_2(*dst, mask);
    841 			}
    842 		}
    843 	}
    844 
    845 	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
    846 	dst = rm->rm_col[x].rc_data;
    847 	exp = 255 - (rm->rm_cols - 1 - x);
    848 
    849 	for (i = 0; i < xcount; i++, dst++, src++) {
    850 		*dst ^= *src;
    851 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
    852 			*b = vdev_raidz_exp2(*b, exp);
    853 		}
    854 	}
    855 
    856 	return (1 << VDEV_RAIDZ_Q);
    857 }
    858 
    859 static int
    860 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
    861 {
    862 	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
    863 	void *pdata, *qdata;
    864 	uint64_t xsize, ysize, i;
    865 	int x = tgts[0];
    866 	int y = tgts[1];
    867 
    868 	ASSERT(ntgts == 2);
    869 	ASSERT(x < y);
    870 	ASSERT(x >= rm->rm_firstdatacol);
    871 	ASSERT(y < rm->rm_cols);
    872 
    873 	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
    874 
    875 	/*
    876 	 * Move the parity data aside -- we're going to compute parity as
    877 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
    878 	 * reuse the parity generation mechanism without trashing the actual
    879 	 * parity so we make those columns appear to be full of zeros by
    880 	 * setting their lengths to zero.
    881 	 */
    882 	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
    883 	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
    884 	xsize = rm->rm_col[x].rc_size;
    885 	ysize = rm->rm_col[y].rc_size;
    886 
    887 	rm->rm_col[VDEV_RAIDZ_P].rc_data =
    888 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
    889 	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
    890 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
    891 	rm->rm_col[x].rc_size = 0;
    892 	rm->rm_col[y].rc_size = 0;
    893 
    894 	vdev_raidz_generate_parity_pq(rm);
    895 
    896 	rm->rm_col[x].rc_size = xsize;
    897 	rm->rm_col[y].rc_size = ysize;
    898 
    899 	p = pdata;
    900 	q = qdata;
    901 	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
    902 	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
    903 	xd = rm->rm_col[x].rc_data;
    904 	yd = rm->rm_col[y].rc_data;
    905 
    906 	/*
    907 	 * We now have:
    908 	 *	Pxy = P + D_x + D_y
    909 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
    910 	 *
    911 	 * We can then solve for D_x:
    912 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
    913 	 * where
    914 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
    915 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
    916 	 *
    917 	 * With D_x in hand, we can easily solve for D_y:
    918 	 *	D_y = P + Pxy + D_x
    919 	 */
    920 
    921 	a = vdev_raidz_pow2[255 + x - y];
    922 	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
    923 	tmp = 255 - vdev_raidz_log2[a ^ 1];
    924 
    925 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
    926 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
    927 
    928 	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
    929 		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
    930 		    vdev_raidz_exp2(*q ^ *qxy, bexp);
    931 
    932 		if (i < ysize)
    933 			*yd = *p ^ *pxy ^ *xd;
    934 	}
    935 
    936 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
    937 	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
    938 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
    939 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
    940 
    941 	/*
    942 	 * Restore the saved parity data.
    943 	 */
    944 	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
    945 	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
    946 
    947 	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
    948 }
    949 
    950 /* BEGIN CSTYLED */
    951 /*
    952  * In the general case of reconstruction, we must solve the system of linear
    953  * equations defined by the coeffecients used to generate parity as well as
    954  * the contents of the data and parity disks. This can be expressed with
    955  * vectors for the original data (D) and the actual data (d) and parity (p)
    956  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
    957  *
    958  *            __   __                     __     __
    959  *            |     |         __     __   |  p_0  |
    960  *            |  V  |         |  D_0  |   | p_m-1 |
    961  *            |     |    x    |   :   | = |  d_0  |
    962  *            |  I  |         | D_n-1 |   |   :   |
    963  *            |     |         ~~     ~~   | d_n-1 |
    964  *            ~~   ~~                     ~~     ~~
    965  *
    966  * I is simply a square identity matrix of size n, and V is a vandermonde
    967  * matrix defined by the coeffecients we chose for the various parity columns
    968  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
    969  * computation as well as linear separability.
    970  *
    971  *      __               __               __     __
    972  *      |   1   ..  1 1 1 |               |  p_0  |
    973  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
    974  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
    975  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
    976  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
    977  *      |   :       : : : |   |   :   |   |  d_2  |
    978  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
    979  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
    980  *      |   0   ..  0 0 1 |               | d_n-1 |
    981  *      ~~               ~~               ~~     ~~
    982  *
    983  * Note that I, V, d, and p are known. To compute D, we must invert the
    984  * matrix and use the known data and parity values to reconstruct the unknown
    985  * data values. We begin by removing the rows in V|I and d|p that correspond
    986  * to failed or missing columns; we then make V|I square (n x n) and d|p
    987  * sized n by removing rows corresponding to unused parity from the bottom up
    988  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
    989  * using Gauss-Jordan elimination. In the example below we use m=3 parity
    990  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
    991  *           __                               __
    992  *           |  1   1   1   1   1   1   1   1  |
    993  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
    994  *           |  19 205 116  29  64  16  4   1  |      / /
    995  *           |  1   0   0   0   0   0   0   0  |     / /
    996  *           |  0   1   0   0   0   0   0   0  | <--' /
    997  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
    998  *           |  0   0   0   1   0   0   0   0  |
    999  *           |  0   0   0   0   1   0   0   0  |
   1000  *           |  0   0   0   0   0   1   0   0  |
   1001  *           |  0   0   0   0   0   0   1   0  |
   1002  *           |  0   0   0   0   0   0   0   1  |
   1003  *           ~~                               ~~
   1004  *           __                               __
   1005  *           |  1   1   1   1   1   1   1   1  |
   1006  *           |  19 205 116  29  64  16  4   1  |
   1007  *           |  1   0   0   0   0   0   0   0  |
   1008  *  (V|I)' = |  0   0   0   1   0   0   0   0  |
   1009  *           |  0   0   0   0   1   0   0   0  |
   1010  *           |  0   0   0   0   0   1   0   0  |
   1011  *           |  0   0   0   0   0   0   1   0  |
   1012  *           |  0   0   0   0   0   0   0   1  |
   1013  *           ~~                               ~~
   1014  *
   1015  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
   1016  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
   1017  * matrix is not singular.
   1018  * __                                                                 __
   1019  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
   1020  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
   1021  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
   1022  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
   1023  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
   1024  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
   1025  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
   1026  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
   1027  * ~~                                                                 ~~
   1028  * __                                                                 __
   1029  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
   1030  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
   1031  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
   1032  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
   1033  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
   1034  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
   1035  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
   1036  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
   1037  * ~~                                                                 ~~
   1038  * __                                                                 __
   1039  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
   1040  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
   1041  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
   1042  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
   1043  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
   1044  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
   1045  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
   1046  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
   1047  * ~~                                                                 ~~
   1048  * __                                                                 __
   1049  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
   1050  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
   1051  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
   1052  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
   1053  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
   1054  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
   1055  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
   1056  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
   1057  * ~~                                                                 ~~
   1058  * __                                                                 __
   1059  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
   1060  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
   1061  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
   1062  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
   1063  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
   1064  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
   1065  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
   1066  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
   1067  * ~~                                                                 ~~
   1068  * __                                                                 __
   1069  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
   1070  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
   1071  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
   1072  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
   1073  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
   1074  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
   1075  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
   1076  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
   1077  * ~~                                                                 ~~
   1078  *                   __                               __
   1079  *                   |  0   0   1   0   0   0   0   0  |
   1080  *                   | 167 100  5   41 159 169 217 208 |
   1081  *                   | 166 100  4   40 158 168 216 209 |
   1082  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
   1083  *                   |  0   0   0   0   1   0   0   0  |
   1084  *                   |  0   0   0   0   0   1   0   0  |
   1085  *                   |  0   0   0   0   0   0   1   0  |
   1086  *                   |  0   0   0   0   0   0   0   1  |
   1087  *                   ~~                               ~~
   1088  *
   1089  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
   1090  * of the missing data.
   1091  *
   1092  * As is apparent from the example above, the only non-trivial rows in the
   1093  * inverse matrix correspond to the data disks that we're trying to
   1094  * reconstruct. Indeed, those are the only rows we need as the others would
   1095  * only be useful for reconstructing data known or assumed to be valid. For
   1096  * that reason, we only build the coefficients in the rows that correspond to
   1097  * targeted columns.
   1098  */
   1099 /* END CSTYLED */
   1100 
   1101 static void
   1102 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
   1103     uint8_t **rows)
   1104 {
   1105 	int i, j;
   1106 	int pow;
   1107 
   1108 	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
   1109 
   1110 	/*
   1111 	 * Fill in the missing rows of interest.
   1112 	 */
   1113 	for (i = 0; i < nmap; i++) {
   1114 		ASSERT3S(0, <=, map[i]);
   1115 		ASSERT3S(map[i], <=, 2);
   1116 
   1117 		pow = map[i] * n;
   1118 		if (pow > 255)
   1119 			pow -= 255;
   1120 		ASSERT(pow <= 255);
   1121 
   1122 		for (j = 0; j < n; j++) {
   1123 			pow -= map[i];
   1124 			if (pow < 0)
   1125 				pow += 255;
   1126 			rows[i][j] = vdev_raidz_pow2[pow];
   1127 		}
   1128 	}
   1129 }
   1130 
   1131 static void
   1132 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
   1133     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
   1134 {
   1135 	int i, j, ii, jj;
   1136 	uint8_t log;
   1137 
   1138 	/*
   1139 	 * Assert that the first nmissing entries from the array of used
   1140 	 * columns correspond to parity columns and that subsequent entries
   1141 	 * correspond to data columns.
   1142 	 */
   1143 	for (i = 0; i < nmissing; i++) {
   1144 		ASSERT3S(used[i], <, rm->rm_firstdatacol);
   1145 	}
   1146 	for (; i < n; i++) {
   1147 		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
   1148 	}
   1149 
   1150 	/*
   1151 	 * First initialize the storage where we'll compute the inverse rows.
   1152 	 */
   1153 	for (i = 0; i < nmissing; i++) {
   1154 		for (j = 0; j < n; j++) {
   1155 			invrows[i][j] = (i == j) ? 1 : 0;
   1156 		}
   1157 	}
   1158 
   1159 	/*
   1160 	 * Subtract all trivial rows from the rows of consequence.
   1161 	 */
   1162 	for (i = 0; i < nmissing; i++) {
   1163 		for (j = nmissing; j < n; j++) {
   1164 			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
   1165 			jj = used[j] - rm->rm_firstdatacol;
   1166 			ASSERT3S(jj, <, n);
   1167 			invrows[i][j] = rows[i][jj];
   1168 			rows[i][jj] = 0;
   1169 		}
   1170 	}
   1171 
   1172 	/*
   1173 	 * For each of the rows of interest, we must normalize it and subtract
   1174 	 * a multiple of it from the other rows.
   1175 	 */
   1176 	for (i = 0; i < nmissing; i++) {
   1177 		for (j = 0; j < missing[i]; j++) {
   1178 			ASSERT0(rows[i][j]);
   1179 		}
   1180 		ASSERT3U(rows[i][missing[i]], !=, 0);
   1181 
   1182 		/*
   1183 		 * Compute the inverse of the first element and multiply each
   1184 		 * element in the row by that value.
   1185 		 */
   1186 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
   1187 
   1188 		for (j = 0; j < n; j++) {
   1189 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
   1190 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
   1191 		}
   1192 
   1193 		for (ii = 0; ii < nmissing; ii++) {
   1194 			if (i == ii)
   1195 				continue;
   1196 
   1197 			ASSERT3U(rows[ii][missing[i]], !=, 0);
   1198 
   1199 			log = vdev_raidz_log2[rows[ii][missing[i]]];
   1200 
   1201 			for (j = 0; j < n; j++) {
   1202 				rows[ii][j] ^=
   1203 				    vdev_raidz_exp2(rows[i][j], log);
   1204 				invrows[ii][j] ^=
   1205 				    vdev_raidz_exp2(invrows[i][j], log);
   1206 			}
   1207 		}
   1208 	}
   1209 
   1210 	/*
   1211 	 * Verify that the data that is left in the rows are properly part of
   1212 	 * an identity matrix.
   1213 	 */
   1214 	for (i = 0; i < nmissing; i++) {
   1215 		for (j = 0; j < n; j++) {
   1216 			if (j == missing[i]) {
   1217 				ASSERT3U(rows[i][j], ==, 1);
   1218 			} else {
   1219 				ASSERT0(rows[i][j]);
   1220 			}
   1221 		}
   1222 	}
   1223 }
   1224 
   1225 static void
   1226 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
   1227     int *missing, uint8_t **invrows, const uint8_t *used)
   1228 {
   1229 	int i, j, x, cc, c;
   1230 	uint8_t *src;
   1231 	uint64_t ccount;
   1232 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
   1233 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
   1234 	uint8_t log = 0;
   1235 	uint8_t val;
   1236 	int ll;
   1237 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
   1238 	uint8_t *p, *pp;
   1239 	size_t psize;
   1240 
   1241 	psize = sizeof (invlog[0][0]) * n * nmissing;
   1242 	p = kmem_alloc(psize, KM_SLEEP);
   1243 
   1244 	for (pp = p, i = 0; i < nmissing; i++) {
   1245 		invlog[i] = pp;
   1246 		pp += n;
   1247 	}
   1248 
   1249 	for (i = 0; i < nmissing; i++) {
   1250 		for (j = 0; j < n; j++) {
   1251 			ASSERT3U(invrows[i][j], !=, 0);
   1252 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
   1253 		}
   1254 	}
   1255 
   1256 	for (i = 0; i < n; i++) {
   1257 		c = used[i];
   1258 		ASSERT3U(c, <, rm->rm_cols);
   1259 
   1260 		src = rm->rm_col[c].rc_data;
   1261 		ccount = rm->rm_col[c].rc_size;
   1262 		for (j = 0; j < nmissing; j++) {
   1263 			cc = missing[j] + rm->rm_firstdatacol;
   1264 			ASSERT3U(cc, >=, rm->rm_firstdatacol);
   1265 			ASSERT3U(cc, <, rm->rm_cols);
   1266 			ASSERT3U(cc, !=, c);
   1267 
   1268 			dst[j] = rm->rm_col[cc].rc_data;
   1269 			dcount[j] = rm->rm_col[cc].rc_size;
   1270 		}
   1271 
   1272 		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
   1273 
   1274 		for (x = 0; x < ccount; x++, src++) {
   1275 			if (*src != 0)
   1276 				log = vdev_raidz_log2[*src];
   1277 
   1278 			for (cc = 0; cc < nmissing; cc++) {
   1279 				if (x >= dcount[cc])
   1280 					continue;
   1281 
   1282 				if (*src == 0) {
   1283 					val = 0;
   1284 				} else {
   1285 					if ((ll = log + invlog[cc][i]) >= 255)
   1286 						ll -= 255;
   1287 					val = vdev_raidz_pow2[ll];
   1288 				}
   1289 
   1290 				if (i == 0)
   1291 					dst[cc][x] = val;
   1292 				else
   1293 					dst[cc][x] ^= val;
   1294 			}
   1295 		}
   1296 	}
   1297 
   1298 	kmem_free(p, psize);
   1299 }
   1300 
   1301 static int
   1302 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
   1303 {
   1304 	int n, i, c, t, tt;
   1305 	int nmissing_rows;
   1306 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
   1307 	int parity_map[VDEV_RAIDZ_MAXPARITY];
   1308 
   1309 	uint8_t *p, *pp;
   1310 	size_t psize;
   1311 
   1312 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
   1313 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
   1314 	uint8_t *used;
   1315 
   1316 	int code = 0;
   1317 
   1318 
   1319 	n = rm->rm_cols - rm->rm_firstdatacol;
   1320 
   1321 	/*
   1322 	 * Figure out which data columns are missing.
   1323 	 */
   1324 	nmissing_rows = 0;
   1325 	for (t = 0; t < ntgts; t++) {
   1326 		if (tgts[t] >= rm->rm_firstdatacol) {
   1327 			missing_rows[nmissing_rows++] =
   1328 			    tgts[t] - rm->rm_firstdatacol;
   1329 		}
   1330 	}
   1331 
   1332 	/*
   1333 	 * Figure out which parity columns to use to help generate the missing
   1334 	 * data columns.
   1335 	 */
   1336 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
   1337 		ASSERT(tt < ntgts);
   1338 		ASSERT(c < rm->rm_firstdatacol);
   1339 
   1340 		/*
   1341 		 * Skip any targeted parity columns.
   1342 		 */
   1343 		if (c == tgts[tt]) {
   1344 			tt++;
   1345 			continue;
   1346 		}
   1347 
   1348 		code |= 1 << c;
   1349 
   1350 		parity_map[i] = c;
   1351 		i++;
   1352 	}
   1353 
   1354 	ASSERT(code != 0);
   1355 	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
   1356 
   1357 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
   1358 	    nmissing_rows * n + sizeof (used[0]) * n;
   1359 	p = kmem_alloc(psize, KM_SLEEP);
   1360 
   1361 	for (pp = p, i = 0; i < nmissing_rows; i++) {
   1362 		rows[i] = pp;
   1363 		pp += n;
   1364 		invrows[i] = pp;
   1365 		pp += n;
   1366 	}
   1367 	used = pp;
   1368 
   1369 	for (i = 0; i < nmissing_rows; i++) {
   1370 		used[i] = parity_map[i];
   1371 	}
   1372 
   1373 	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
   1374 		if (tt < nmissing_rows &&
   1375 		    c == missing_rows[tt] + rm->rm_firstdatacol) {
   1376 			tt++;
   1377 			continue;
   1378 		}
   1379 
   1380 		ASSERT3S(i, <, n);
   1381 		used[i] = c;
   1382 		i++;
   1383 	}
   1384 
   1385 	/*
   1386 	 * Initialize the interesting rows of the matrix.
   1387 	 */
   1388 	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
   1389 
   1390 	/*
   1391 	 * Invert the matrix.
   1392 	 */
   1393 	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
   1394 	    invrows, used);
   1395 
   1396 	/*
   1397 	 * Reconstruct the missing data using the generated matrix.
   1398 	 */
   1399 	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
   1400 	    invrows, used);
   1401 
   1402 	kmem_free(p, psize);
   1403 
   1404 	return (code);
   1405 }
   1406 
   1407 static int
   1408 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
   1409 {
   1410 	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
   1411 	int ntgts;
   1412 	int i, c;
   1413 	int code;
   1414 	int nbadparity, nbaddata;
   1415 	int parity_valid[VDEV_RAIDZ_MAXPARITY] = {0};
   1416 
   1417 	/*
   1418 	 * The tgts list must already be sorted.
   1419 	 */
   1420 	for (i = 1; i < nt; i++) {
   1421 		ASSERT(t[i] > t[i - 1]);
   1422 	}
   1423 
   1424 	nbadparity = rm->rm_firstdatacol;
   1425 	nbaddata = rm->rm_cols - nbadparity;
   1426 	ntgts = 0;
   1427 	for (i = 0, c = 0; c < rm->rm_cols; c++) {
   1428 		if (c < rm->rm_firstdatacol)
   1429 			parity_valid[c] = B_FALSE;
   1430 
   1431 		if (i < nt && c == t[i]) {
   1432 			tgts[ntgts++] = c;
   1433 			i++;
   1434 		} else if (rm->rm_col[c].rc_error != 0) {
   1435 			tgts[ntgts++] = c;
   1436 		} else if (c >= rm->rm_firstdatacol) {
   1437 			nbaddata--;
   1438 		} else {
   1439 			parity_valid[c] = B_TRUE;
   1440 			nbadparity--;
   1441 		}
   1442 	}
   1443 
   1444 	ASSERT(ntgts >= nt);
   1445 	ASSERT(nbaddata >= 0);
   1446 	ASSERT(nbaddata + nbadparity == ntgts);
   1447 
   1448 	dt = &tgts[nbadparity];
   1449 
   1450 	/*
   1451 	 * See if we can use any of our optimized reconstruction routines.
   1452 	 */
   1453 	if (!vdev_raidz_default_to_general) {
   1454 		switch (nbaddata) {
   1455 		case 1:
   1456 			if (parity_valid[VDEV_RAIDZ_P])
   1457 				return (vdev_raidz_reconstruct_p(rm, dt, 1));
   1458 
   1459 			ASSERT(rm->rm_firstdatacol > 1);
   1460 
   1461 			if (parity_valid[VDEV_RAIDZ_Q])
   1462 				return (vdev_raidz_reconstruct_q(rm, dt, 1));
   1463 
   1464 			ASSERT(rm->rm_firstdatacol > 2);
   1465 			break;
   1466 
   1467 		case 2:
   1468 			ASSERT(rm->rm_firstdatacol > 1);
   1469 
   1470 			if (parity_valid[VDEV_RAIDZ_P] &&
   1471 			    parity_valid[VDEV_RAIDZ_Q])
   1472 				return (vdev_raidz_reconstruct_pq(rm, dt, 2));
   1473 
   1474 			ASSERT(rm->rm_firstdatacol > 2);
   1475 
   1476 			break;
   1477 		}
   1478 	}
   1479 
   1480 	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
   1481 	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
   1482 	ASSERT(code > 0);
   1483 	return (code);
   1484 }
   1485 
   1486 static int
   1487 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
   1488     uint64_t *logical_ashift, uint64_t *physical_ashift)
   1489 {
   1490 	vdev_t *cvd;
   1491 	uint64_t nparity = vd->vdev_nparity;
   1492 	int c;
   1493 	int lasterror = 0;
   1494 	int numerrors = 0;
   1495 
   1496 	ASSERT(nparity > 0);
   1497 
   1498 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
   1499 	    vd->vdev_children < nparity + 1) {
   1500 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
   1501 		return (SET_ERROR(EINVAL));
   1502 	}
   1503 
   1504 	vdev_open_children(vd);
   1505 
   1506 	for (c = 0; c < vd->vdev_children; c++) {
   1507 		cvd = vd->vdev_child[c];
   1508 
   1509 		if (cvd->vdev_open_error != 0) {
   1510 			lasterror = cvd->vdev_open_error;
   1511 			numerrors++;
   1512 			continue;
   1513 		}
   1514 
   1515 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
   1516 		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
   1517 		*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
   1518 		*physical_ashift = MAX(*physical_ashift,
   1519 		    cvd->vdev_physical_ashift);
   1520 	}
   1521 
   1522 	*asize *= vd->vdev_children;
   1523 	*max_asize *= vd->vdev_children;
   1524 
   1525 	if (numerrors > nparity) {
   1526 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
   1527 		return (lasterror);
   1528 	}
   1529 
   1530 	return (0);
   1531 }
   1532 
   1533 static void
   1534 vdev_raidz_close(vdev_t *vd)
   1535 {
   1536 	int c;
   1537 
   1538 	for (c = 0; c < vd->vdev_children; c++)
   1539 		vdev_close(vd->vdev_child[c]);
   1540 }
   1541 
   1542 #ifdef illumos
   1543 /*
   1544  * Handle a read or write I/O to a RAID-Z dump device.
   1545  *
   1546  * The dump device is in a unique situation compared to other ZFS datasets:
   1547  * writing to this device should be as simple and fast as possible.  In
   1548  * addition, durability matters much less since the dump will be extracted
   1549  * once the machine reboots.  For that reason, this function eschews parity for
   1550  * performance and simplicity.  The dump device uses the checksum setting
   1551  * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
   1552  * dataset.
   1553  *
   1554  * Blocks of size 128 KB have been preallocated for this volume.  I/Os less than
   1555  * 128 KB will not fill an entire block; in addition, they may not be properly
   1556  * aligned.  In that case, this function uses the preallocated 128 KB block and
   1557  * omits reading or writing any "empty" portions of that block, as opposed to
   1558  * allocating a fresh appropriately-sized block.
   1559  *
   1560  * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
   1561  *
   1562  *     vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
   1563  *
   1564  * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
   1565  * allocated which spans all five child vdevs.  8 KB of data would be written to
   1566  * each of four vdevs, with the fifth containing the parity bits.
   1567  *
   1568  *       parity    data     data     data     data
   1569  *     |   PP   |   XX   |   XX   |   XX   |   XX   |
   1570  *         ^        ^        ^        ^        ^
   1571  *         |        |        |        |        |
   1572  *   8 KB parity    ------8 KB data blocks------
   1573  *
   1574  * However, when writing to the dump device, the behavior is different:
   1575  *
   1576  *     vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
   1577  *
   1578  * Unlike the normal RAID-Z case in which the block is allocated based on the
   1579  * I/O size, reads and writes here always use a 128 KB logical I/O size.  If the
   1580  * I/O size is less than 128 KB, only the actual portions of data are written.
   1581  * In this example the data is written to the third data vdev since that vdev
   1582  * contains the offset [64 KB, 96 KB).
   1583  *
   1584  *       parity    data     data     data     data
   1585  *     |        |        |        |   XX   |        |
   1586  *                                    ^
   1587  *                                    |
   1588  *                             32 KB data block
   1589  *
   1590  * As a result, an individual I/O may not span all child vdevs; moreover, a
   1591  * small I/O may only operate on a single child vdev.
   1592  *
   1593  * Note that since there are no parity bits calculated or written, this format
   1594  * remains the same no matter how many parity bits are used in a normal RAID-Z
   1595  * stripe.  On a RAID-Z3 configuration with seven child vdevs, the example above
   1596  * would look like:
   1597  *
   1598  *       parity   parity   parity    data     data     data     data
   1599  *     |        |        |        |        |        |   XX   |        |
   1600  *                                                      ^
   1601  *                                                      |
   1602  *                                               32 KB data block
   1603  */
   1604 int
   1605 vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
   1606     uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
   1607 {
   1608 	vdev_t *tvd = vd->vdev_top;
   1609 	vdev_t *cvd;
   1610 	raidz_map_t *rm;
   1611 	raidz_col_t *rc;
   1612 	int c, err = 0;
   1613 
   1614 	uint64_t start, end, colstart, colend;
   1615 	uint64_t coloffset, colsize, colskip;
   1616 
   1617 	int flags = doread ? BIO_READ : BIO_WRITE;
   1618 
   1619 #ifdef	_KERNEL
   1620 
   1621 	/*
   1622 	 * Don't write past the end of the block
   1623 	 */
   1624 	VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);
   1625 
   1626 	start = offset;
   1627 	end = start + size;
   1628 
   1629 	/*
   1630 	 * Allocate a RAID-Z map for this block.  Note that this block starts
   1631 	 * from the "original" offset, this is, the offset of the extent which
   1632 	 * contains the requisite offset of the data being read or written.
   1633 	 *
   1634 	 * Even if this I/O operation doesn't span the full block size, let's
   1635 	 * treat the on-disk format as if the only blocks are the complete 128
   1636 	 * KB size.
   1637 	 */
   1638 	rm = vdev_raidz_map_alloc(data - (offset - origoffset),
   1639 	    SPA_OLD_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift,
   1640 	    vd->vdev_children, vd->vdev_nparity);
   1641 
   1642 	coloffset = origoffset;
   1643 
   1644 	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
   1645 	    c++, coloffset += rc->rc_size) {
   1646 		rc = &rm->rm_col[c];
   1647 		cvd = vd->vdev_child[rc->rc_devidx];
   1648 
   1649 		/*
   1650 		 * Find the start and end of this column in the RAID-Z map,
   1651 		 * keeping in mind that the stated size and offset of the
   1652 		 * operation may not fill the entire column for this vdev.
   1653 		 *
   1654 		 * If any portion of the data spans this column, issue the
   1655 		 * appropriate operation to the vdev.
   1656 		 */
   1657 		if (coloffset + rc->rc_size <= start)
   1658 			continue;
   1659 		if (coloffset >= end)
   1660 			continue;
   1661 
   1662 		colstart = MAX(coloffset, start);
   1663 		colend = MIN(end, coloffset + rc->rc_size);
   1664 		colsize = colend - colstart;
   1665 		colskip = colstart - coloffset;
   1666 
   1667 		VERIFY3U(colsize, <=, rc->rc_size);
   1668 		VERIFY3U(colskip, <=, rc->rc_size);
   1669 
   1670 		/*
   1671 		 * Note that the child vdev will have a vdev label at the start
   1672 		 * of its range of offsets, hence the need for
   1673 		 * VDEV_LABEL_OFFSET().  See zio_vdev_child_io() for another
   1674 		 * example of why this calculation is needed.
   1675 		 */
   1676 		if ((err = vdev_disk_physio(cvd,
   1677 		    ((char *)rc->rc_data) + colskip, colsize,
   1678 		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
   1679 		    flags, isdump)) != 0)
   1680 			break;
   1681 	}
   1682 
   1683 	vdev_raidz_map_free(rm);
   1684 #endif	/* KERNEL */
   1685 
   1686 	return (err);
   1687 }
   1688 #endif
   1689 
   1690 static uint64_t
   1691 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
   1692 {
   1693 	uint64_t asize;
   1694 	uint64_t ashift = vd->vdev_top->vdev_ashift;
   1695 	uint64_t cols = vd->vdev_children;
   1696 	uint64_t nparity = vd->vdev_nparity;
   1697 
   1698 	asize = ((psize - 1) >> ashift) + 1;
   1699 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
   1700 	asize = roundup(asize, nparity + 1) << ashift;
   1701 
   1702 	return (asize);
   1703 }
   1704 
   1705 static void
   1706 vdev_raidz_child_done(zio_t *zio)
   1707 {
   1708 	raidz_col_t *rc = zio->io_private;
   1709 
   1710 	rc->rc_error = zio->io_error;
   1711 	rc->rc_tried = 1;
   1712 	rc->rc_skipped = 0;
   1713 }
   1714 
   1715 /*
   1716  * Start an IO operation on a RAIDZ VDev
   1717  *
   1718  * Outline:
   1719  * - For write operations:
   1720  *   1. Generate the parity data
   1721  *   2. Create child zio write operations to each column's vdev, for both
   1722  *      data and parity.
   1723  *   3. If the column skips any sectors for padding, create optional dummy
   1724  *      write zio children for those areas to improve aggregation continuity.
   1725  * - For read operations:
   1726  *   1. Create child zio read operations to each data column's vdev to read
   1727  *      the range of data required for zio.
   1728  *   2. If this is a scrub or resilver operation, or if any of the data
   1729  *      vdevs have had errors, then create zio read operations to the parity
   1730  *      columns' VDevs as well.
   1731  */
   1732 static void
   1733 vdev_raidz_io_start(zio_t *zio)
   1734 {
   1735 	vdev_t *vd = zio->io_vd;
   1736 	vdev_t *tvd = vd->vdev_top;
   1737 	vdev_t *cvd;
   1738 	raidz_map_t *rm;
   1739 	raidz_col_t *rc;
   1740 	int c, i;
   1741 
   1742 	rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset,
   1743 	    zio->io_type == ZIO_TYPE_FREE,
   1744 	    tvd->vdev_ashift, vd->vdev_children,
   1745 	    vd->vdev_nparity);
   1746 
   1747 	zio->io_vsd = rm;
   1748 	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
   1749 
   1750 	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
   1751 
   1752 	if (zio->io_type == ZIO_TYPE_FREE) {
   1753 		for (c = 0; c < rm->rm_cols; c++) {
   1754 			rc = &rm->rm_col[c];
   1755 			cvd = vd->vdev_child[rc->rc_devidx];
   1756 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
   1757 			    rc->rc_offset, rc->rc_data, rc->rc_size,
   1758 			    zio->io_type, zio->io_priority, 0,
   1759 			    vdev_raidz_child_done, rc));
   1760 		}
   1761 
   1762 		zio_execute(zio);
   1763 		return;
   1764 	}
   1765 
   1766 	if (zio->io_type == ZIO_TYPE_WRITE) {
   1767 		vdev_raidz_generate_parity(rm);
   1768 
   1769 		for (c = 0; c < rm->rm_cols; c++) {
   1770 			rc = &rm->rm_col[c];
   1771 			cvd = vd->vdev_child[rc->rc_devidx];
   1772 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
   1773 			    rc->rc_offset, rc->rc_data, rc->rc_size,
   1774 			    zio->io_type, zio->io_priority, 0,
   1775 			    vdev_raidz_child_done, rc));
   1776 		}
   1777 
   1778 		/*
   1779 		 * Generate optional I/Os for any skipped sectors to improve
   1780 		 * aggregation contiguity.
   1781 		 */
   1782 		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
   1783 			ASSERT(c <= rm->rm_scols);
   1784 			if (c == rm->rm_scols)
   1785 				c = 0;
   1786 			rc = &rm->rm_col[c];
   1787 			cvd = vd->vdev_child[rc->rc_devidx];
   1788 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
   1789 			    rc->rc_offset + rc->rc_size, NULL,
   1790 			    1 << tvd->vdev_ashift,
   1791 			    zio->io_type, zio->io_priority,
   1792 			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
   1793 		}
   1794 
   1795 		zio_execute(zio);
   1796 		return;
   1797 	}
   1798 
   1799 	ASSERT(zio->io_type == ZIO_TYPE_READ);
   1800 
   1801 	/*
   1802 	 * Iterate over the columns in reverse order so that we hit the parity
   1803 	 * last -- any errors along the way will force us to read the parity.
   1804 	 */
   1805 	for (c = rm->rm_cols - 1; c >= 0; c--) {
   1806 		rc = &rm->rm_col[c];
   1807 		cvd = vd->vdev_child[rc->rc_devidx];
   1808 		if (!vdev_readable(cvd)) {
   1809 			if (c >= rm->rm_firstdatacol)
   1810 				rm->rm_missingdata++;
   1811 			else
   1812 				rm->rm_missingparity++;
   1813 			rc->rc_error = SET_ERROR(ENXIO);
   1814 			rc->rc_tried = 1;	/* don't even try */
   1815 			rc->rc_skipped = 1;
   1816 			continue;
   1817 		}
   1818 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
   1819 			if (c >= rm->rm_firstdatacol)
   1820 				rm->rm_missingdata++;
   1821 			else
   1822 				rm->rm_missingparity++;
   1823 			rc->rc_error = SET_ERROR(ESTALE);
   1824 			rc->rc_skipped = 1;
   1825 			continue;
   1826 		}
   1827 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
   1828 		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
   1829 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
   1830 			    rc->rc_offset, rc->rc_data, rc->rc_size,
   1831 			    zio->io_type, zio->io_priority, 0,
   1832 			    vdev_raidz_child_done, rc));
   1833 		}
   1834 	}
   1835 
   1836 	zio_execute(zio);
   1837 }
   1838 
   1839 
   1840 /*
   1841  * Report a checksum error for a child of a RAID-Z device.
   1842  */
   1843 static void
   1844 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
   1845 {
   1846 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
   1847 
   1848 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
   1849 		zio_bad_cksum_t zbc;
   1850 		raidz_map_t *rm = zio->io_vsd;
   1851 
   1852 		mutex_enter(&vd->vdev_stat_lock);
   1853 		vd->vdev_stat.vs_checksum_errors++;
   1854 		mutex_exit(&vd->vdev_stat_lock);
   1855 
   1856 		zbc.zbc_has_cksum = 0;
   1857 		zbc.zbc_injected = rm->rm_ecksuminjected;
   1858 
   1859 		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
   1860 		    rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
   1861 		    &zbc);
   1862 	}
   1863 }
   1864 
   1865 /*
   1866  * We keep track of whether or not there were any injected errors, so that
   1867  * any ereports we generate can note it.
   1868  */
   1869 static int
   1870 raidz_checksum_verify(zio_t *zio)
   1871 {
   1872 	zio_bad_cksum_t zbc;
   1873 	raidz_map_t *rm = zio->io_vsd;
   1874 
   1875 	int ret = zio_checksum_error(zio, &zbc);
   1876 	if (ret != 0 && zbc.zbc_injected != 0)
   1877 		rm->rm_ecksuminjected = 1;
   1878 
   1879 	return (ret);
   1880 }
   1881 
   1882 /*
   1883  * Generate the parity from the data columns. If we tried and were able to
   1884  * read the parity without error, verify that the generated parity matches the
   1885  * data we read. If it doesn't, we fire off a checksum error. Return the
   1886  * number such failures.
   1887  */
   1888 static int
   1889 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
   1890 {
   1891 	void *orig[VDEV_RAIDZ_MAXPARITY];
   1892 	int c, ret = 0;
   1893 	raidz_col_t *rc;
   1894 
   1895 	blkptr_t *bp = zio->io_bp;
   1896 	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
   1897 	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
   1898 
   1899 	if (checksum == ZIO_CHECKSUM_NOPARITY)
   1900 		return (ret);
   1901 
   1902 	for (c = 0; c < rm->rm_firstdatacol; c++) {
   1903 		rc = &rm->rm_col[c];
   1904 		if (!rc->rc_tried || rc->rc_error != 0)
   1905 			continue;
   1906 		orig[c] = zio_buf_alloc(rc->rc_size);
   1907 		bcopy(rc->rc_data, orig[c], rc->rc_size);
   1908 	}
   1909 
   1910 	vdev_raidz_generate_parity(rm);
   1911 
   1912 	for (c = 0; c < rm->rm_firstdatacol; c++) {
   1913 		rc = &rm->rm_col[c];
   1914 		if (!rc->rc_tried || rc->rc_error != 0)
   1915 			continue;
   1916 		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
   1917 			raidz_checksum_error(zio, rc, orig[c]);
   1918 			rc->rc_error = SET_ERROR(ECKSUM);
   1919 			ret++;
   1920 		}
   1921 		zio_buf_free(orig[c], rc->rc_size);
   1922 	}
   1923 
   1924 	return (ret);
   1925 }
   1926 
   1927 /*
   1928  * Keep statistics on all the ways that we used parity to correct data.
   1929  */
   1930 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
   1931 
   1932 static int
   1933 vdev_raidz_worst_error(raidz_map_t *rm)
   1934 {
   1935 	int error = 0;
   1936 
   1937 	for (int c = 0; c < rm->rm_cols; c++)
   1938 		error = zio_worst_error(error, rm->rm_col[c].rc_error);
   1939 
   1940 	return (error);
   1941 }
   1942 
   1943 /*
   1944  * Iterate over all combinations of bad data and attempt a reconstruction.
   1945  * Note that the algorithm below is non-optimal because it doesn't take into
   1946  * account how reconstruction is actually performed. For example, with
   1947  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
   1948  * is targeted as invalid as if columns 1 and 4 are targeted since in both
   1949  * cases we'd only use parity information in column 0.
   1950  */
   1951 static int
   1952 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
   1953 {
   1954 	raidz_map_t *rm = zio->io_vsd;
   1955 	raidz_col_t *rc;
   1956 	void *orig[VDEV_RAIDZ_MAXPARITY];
   1957 	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
   1958 	int *tgts = &tstore[1];
   1959 	int current, next, i, c, n;
   1960 	int code, ret = 0;
   1961 
   1962 	ASSERT(total_errors < rm->rm_firstdatacol);
   1963 
   1964 	/*
   1965 	 * This simplifies one edge condition.
   1966 	 */
   1967 	tgts[-1] = -1;
   1968 
   1969 	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
   1970 		/*
   1971 		 * Initialize the targets array by finding the first n columns
   1972 		 * that contain no error.
   1973 		 *
   1974 		 * If there were no data errors, we need to ensure that we're
   1975 		 * always explicitly attempting to reconstruct at least one
   1976 		 * data column. To do this, we simply push the highest target
   1977 		 * up into the data columns.
   1978 		 */
   1979 		for (c = 0, i = 0; i < n; i++) {
   1980 			if (i == n - 1 && data_errors == 0 &&
   1981 			    c < rm->rm_firstdatacol) {
   1982 				c = rm->rm_firstdatacol;
   1983 			}
   1984 
   1985 			while (rm->rm_col[c].rc_error != 0) {
   1986 				c++;
   1987 				ASSERT3S(c, <, rm->rm_cols);
   1988 			}
   1989 
   1990 			tgts[i] = c++;
   1991 		}
   1992 
   1993 		/*
   1994 		 * Setting tgts[n] simplifies the other edge condition.
   1995 		 */
   1996 		tgts[n] = rm->rm_cols;
   1997 
   1998 		/*
   1999 		 * These buffers were allocated in previous iterations.
   2000 		 */
   2001 		for (i = 0; i < n - 1; i++) {
   2002 			ASSERT(orig[i] != NULL);
   2003 		}
   2004 
   2005 		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
   2006 
   2007 		current = 0;
   2008 		next = tgts[current];
   2009 
   2010 		while (current != n) {
   2011 			tgts[current] = next;
   2012 			current = 0;
   2013 
   2014 			/*
   2015 			 * Save off the original data that we're going to
   2016 			 * attempt to reconstruct.
   2017 			 */
   2018 			for (i = 0; i < n; i++) {
   2019 				ASSERT(orig[i] != NULL);
   2020 				c = tgts[i];
   2021 				ASSERT3S(c, >=, 0);
   2022 				ASSERT3S(c, <, rm->rm_cols);
   2023 				rc = &rm->rm_col[c];
   2024 				bcopy(rc->rc_data, orig[i], rc->rc_size);
   2025 			}
   2026 
   2027 			/*
   2028 			 * Attempt a reconstruction and exit the outer loop on
   2029 			 * success.
   2030 			 */
   2031 			code = vdev_raidz_reconstruct(rm, tgts, n);
   2032 			if (raidz_checksum_verify(zio) == 0) {
   2033 				atomic_inc_64(&raidz_corrected[code]);
   2034 
   2035 				for (i = 0; i < n; i++) {
   2036 					c = tgts[i];
   2037 					rc = &rm->rm_col[c];
   2038 					ASSERT(rc->rc_error == 0);
   2039 					if (rc->rc_tried)
   2040 						raidz_checksum_error(zio, rc,
   2041 						    orig[i]);
   2042 					rc->rc_error = SET_ERROR(ECKSUM);
   2043 				}
   2044 
   2045 				ret = code;
   2046 				goto done;
   2047 			}
   2048 
   2049 			/*
   2050 			 * Restore the original data.
   2051 			 */
   2052 			for (i = 0; i < n; i++) {
   2053 				c = tgts[i];
   2054 				rc = &rm->rm_col[c];
   2055 				bcopy(orig[i], rc->rc_data, rc->rc_size);
   2056 			}
   2057 
   2058 			do {
   2059 				/*
   2060 				 * Find the next valid column after the current
   2061 				 * position..
   2062 				 */
   2063 				for (next = tgts[current] + 1;
   2064 				    next < rm->rm_cols &&
   2065 				    rm->rm_col[next].rc_error != 0; next++)
   2066 					continue;
   2067 
   2068 				ASSERT(next <= tgts[current + 1]);
   2069 
   2070 				/*
   2071 				 * If that spot is available, we're done here.
   2072 				 */
   2073 				if (next != tgts[current + 1])
   2074 					break;
   2075 
   2076 				/*
   2077 				 * Otherwise, find the next valid column after
   2078 				 * the previous position.
   2079 				 */
   2080 				for (c = tgts[current - 1] + 1;
   2081 				    rm->rm_col[c].rc_error != 0; c++)
   2082 					continue;
   2083 
   2084 				tgts[current] = c;
   2085 				current++;
   2086 
   2087 			} while (current != n);
   2088 		}
   2089 	}
   2090 	n--;
   2091 done:
   2092 	for (i = 0; i < n; i++) {
   2093 		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
   2094 	}
   2095 
   2096 	return (ret);
   2097 }
   2098 
   2099 /*
   2100  * Complete an IO operation on a RAIDZ VDev
   2101  *
   2102  * Outline:
   2103  * - For write operations:
   2104  *   1. Check for errors on the child IOs.
   2105  *   2. Return, setting an error code if too few child VDevs were written
   2106  *      to reconstruct the data later.  Note that partial writes are
   2107  *      considered successful if they can be reconstructed at all.
   2108  * - For read operations:
   2109  *   1. Check for errors on the child IOs.
   2110  *   2. If data errors occurred:
   2111  *      a. Try to reassemble the data from the parity available.
   2112  *      b. If we haven't yet read the parity drives, read them now.
   2113  *      c. If all parity drives have been read but the data still doesn't
   2114  *         reassemble with a correct checksum, then try combinatorial
   2115  *         reconstruction.
   2116  *      d. If that doesn't work, return an error.
   2117  *   3. If there were unexpected errors or this is a resilver operation,
   2118  *      rewrite the vdevs that had errors.
   2119  */
   2120 static void
   2121 vdev_raidz_io_done(zio_t *zio)
   2122 {
   2123 	vdev_t *vd = zio->io_vd;
   2124 	vdev_t *cvd;
   2125 	raidz_map_t *rm = zio->io_vsd;
   2126 	raidz_col_t *rc;
   2127 	int unexpected_errors = 0;
   2128 	int parity_errors = 0;
   2129 	int parity_untried = 0;
   2130 	int data_errors = 0;
   2131 	int total_errors = 0;
   2132 	int n, c;
   2133 	int tgts[VDEV_RAIDZ_MAXPARITY];
   2134 	int code;
   2135 
   2136 	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
   2137 
   2138 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
   2139 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
   2140 
   2141 	for (c = 0; c < rm->rm_cols; c++) {
   2142 		rc = &rm->rm_col[c];
   2143 
   2144 		if (rc->rc_error) {
   2145 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
   2146 
   2147 			if (c < rm->rm_firstdatacol)
   2148 				parity_errors++;
   2149 			else
   2150 				data_errors++;
   2151 
   2152 			if (!rc->rc_skipped)
   2153 				unexpected_errors++;
   2154 
   2155 			total_errors++;
   2156 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
   2157 			parity_untried++;
   2158 		}
   2159 	}
   2160 
   2161 	if (zio->io_type == ZIO_TYPE_WRITE) {
   2162 		/*
   2163 		 * XXX -- for now, treat partial writes as a success.
   2164 		 * (If we couldn't write enough columns to reconstruct
   2165 		 * the data, the I/O failed.  Otherwise, good enough.)
   2166 		 *
   2167 		 * Now that we support write reallocation, it would be better
   2168 		 * to treat partial failure as real failure unless there are
   2169 		 * no non-degraded top-level vdevs left, and not update DTLs
   2170 		 * if we intend to reallocate.
   2171 		 */
   2172 		/* XXPOLICY */
   2173 		if (total_errors > rm->rm_firstdatacol)
   2174 			zio->io_error = vdev_raidz_worst_error(rm);
   2175 
   2176 		return;
   2177 	} else if (zio->io_type == ZIO_TYPE_FREE) {
   2178 		return;
   2179 	}
   2180 
   2181 	ASSERT(zio->io_type == ZIO_TYPE_READ);
   2182 	/*
   2183 	 * There are three potential phases for a read:
   2184 	 *	1. produce valid data from the columns read
   2185 	 *	2. read all disks and try again
   2186 	 *	3. perform combinatorial reconstruction
   2187 	 *
   2188 	 * Each phase is progressively both more expensive and less likely to
   2189 	 * occur. If we encounter more errors than we can repair or all phases
   2190 	 * fail, we have no choice but to return an error.
   2191 	 */
   2192 
   2193 	/*
   2194 	 * If the number of errors we saw was correctable -- less than or equal
   2195 	 * to the number of parity disks read -- attempt to produce data that
   2196 	 * has a valid checksum. Naturally, this case applies in the absence of
   2197 	 * any errors.
   2198 	 */
   2199 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
   2200 		if (data_errors == 0) {
   2201 			if (raidz_checksum_verify(zio) == 0) {
   2202 				/*
   2203 				 * If we read parity information (unnecessarily
   2204 				 * as it happens since no reconstruction was
   2205 				 * needed) regenerate and verify the parity.
   2206 				 * We also regenerate parity when resilvering
   2207 				 * so we can write it out to the failed device
   2208 				 * later.
   2209 				 */
   2210 				if (parity_errors + parity_untried <
   2211 				    rm->rm_firstdatacol ||
   2212 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
   2213 					n = raidz_parity_verify(zio, rm);
   2214 					unexpected_errors += n;
   2215 					ASSERT(parity_errors + n <=
   2216 					    rm->rm_firstdatacol);
   2217 				}
   2218 				goto done;
   2219 			}
   2220 		} else {
   2221 			/*
   2222 			 * We either attempt to read all the parity columns or
   2223 			 * none of them. If we didn't try to read parity, we
   2224 			 * wouldn't be here in the correctable case. There must
   2225 			 * also have been fewer parity errors than parity
   2226 			 * columns or, again, we wouldn't be in this code path.
   2227 			 */
   2228 			ASSERT(parity_untried == 0);
   2229 			ASSERT(parity_errors < rm->rm_firstdatacol);
   2230 
   2231 			/*
   2232 			 * Identify the data columns that reported an error.
   2233 			 */
   2234 			n = 0;
   2235 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
   2236 				rc = &rm->rm_col[c];
   2237 				if (rc->rc_error != 0) {
   2238 					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
   2239 					tgts[n++] = c;
   2240 				}
   2241 			}
   2242 
   2243 			ASSERT(rm->rm_firstdatacol >= n);
   2244 
   2245 			code = vdev_raidz_reconstruct(rm, tgts, n);
   2246 
   2247 			if (raidz_checksum_verify(zio) == 0) {
   2248 				atomic_inc_64(&raidz_corrected[code]);
   2249 
   2250 				/*
   2251 				 * If we read more parity disks than were used
   2252 				 * for reconstruction, confirm that the other
   2253 				 * parity disks produced correct data. This
   2254 				 * routine is suboptimal in that it regenerates
   2255 				 * the parity that we already used in addition
   2256 				 * to the parity that we're attempting to
   2257 				 * verify, but this should be a relatively
   2258 				 * uncommon case, and can be optimized if it
   2259 				 * becomes a problem. Note that we regenerate
   2260 				 * parity when resilvering so we can write it
   2261 				 * out to failed devices later.
   2262 				 */
   2263 				if (parity_errors < rm->rm_firstdatacol - n ||
   2264 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
   2265 					n = raidz_parity_verify(zio, rm);
   2266 					unexpected_errors += n;
   2267 					ASSERT(parity_errors + n <=
   2268 					    rm->rm_firstdatacol);
   2269 				}
   2270 
   2271 				goto done;
   2272 			}
   2273 		}
   2274 	}
   2275 
   2276 	/*
   2277 	 * This isn't a typical situation -- either we got a read error or
   2278 	 * a child silently returned bad data. Read every block so we can
   2279 	 * try again with as much data and parity as we can track down. If
   2280 	 * we've already been through once before, all children will be marked
   2281 	 * as tried so we'll proceed to combinatorial reconstruction.
   2282 	 */
   2283 	unexpected_errors = 1;
   2284 	rm->rm_missingdata = 0;
   2285 	rm->rm_missingparity = 0;
   2286 
   2287 	for (c = 0; c < rm->rm_cols; c++) {
   2288 		if (rm->rm_col[c].rc_tried)
   2289 			continue;
   2290 
   2291 		zio_vdev_io_redone(zio);
   2292 		do {
   2293 			rc = &rm->rm_col[c];
   2294 			if (rc->rc_tried)
   2295 				continue;
   2296 			zio_nowait(zio_vdev_child_io(zio, NULL,
   2297 			    vd->vdev_child[rc->rc_devidx],
   2298 			    rc->rc_offset, rc->rc_data, rc->rc_size,
   2299 			    zio->io_type, zio->io_priority, 0,
   2300 			    vdev_raidz_child_done, rc));
   2301 		} while (++c < rm->rm_cols);
   2302 
   2303 		return;
   2304 	}
   2305 
   2306 	/*
   2307 	 * At this point we've attempted to reconstruct the data given the
   2308 	 * errors we detected, and we've attempted to read all columns. There
   2309 	 * must, therefore, be one or more additional problems -- silent errors
   2310 	 * resulting in invalid data rather than explicit I/O errors resulting
   2311 	 * in absent data. We check if there is enough additional data to
   2312 	 * possibly reconstruct the data and then perform combinatorial
   2313 	 * reconstruction over all possible combinations. If that fails,
   2314 	 * we're cooked.
   2315 	 */
   2316 	if (total_errors > rm->rm_firstdatacol) {
   2317 		zio->io_error = vdev_raidz_worst_error(rm);
   2318 
   2319 	} else if (total_errors < rm->rm_firstdatacol &&
   2320 	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
   2321 		/*
   2322 		 * If we didn't use all the available parity for the
   2323 		 * combinatorial reconstruction, verify that the remaining
   2324 		 * parity is correct.
   2325 		 */
   2326 		if (code != (1 << rm->rm_firstdatacol) - 1)
   2327 			(void) raidz_parity_verify(zio, rm);
   2328 	} else {
   2329 		/*
   2330 		 * We're here because either:
   2331 		 *
   2332 		 *	total_errors == rm_first_datacol, or
   2333 		 *	vdev_raidz_combrec() failed
   2334 		 *
   2335 		 * In either case, there is enough bad data to prevent
   2336 		 * reconstruction.
   2337 		 *
   2338 		 * Start checksum ereports for all children which haven't
   2339 		 * failed, and the IO wasn't speculative.
   2340 		 */
   2341 		zio->io_error = SET_ERROR(ECKSUM);
   2342 
   2343 		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
   2344 			for (c = 0; c < rm->rm_cols; c++) {
   2345 				rc = &rm->rm_col[c];
   2346 				if (rc->rc_error == 0) {
   2347 					zio_bad_cksum_t zbc;
   2348 					zbc.zbc_has_cksum = 0;
   2349 					zbc.zbc_injected =
   2350 					    rm->rm_ecksuminjected;
   2351 
   2352 					zfs_ereport_start_checksum(
   2353 					    zio->io_spa,
   2354 					    vd->vdev_child[rc->rc_devidx],
   2355 					    zio, rc->rc_offset, rc->rc_size,
   2356 					    (void *)(uintptr_t)c, &zbc);
   2357 				}
   2358 			}
   2359 		}
   2360 	}
   2361 
   2362 done:
   2363 	zio_checksum_verified(zio);
   2364 
   2365 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
   2366 	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
   2367 		/*
   2368 		 * Use the good data we have in hand to repair damaged children.
   2369 		 */
   2370 		for (c = 0; c < rm->rm_cols; c++) {
   2371 			rc = &rm->rm_col[c];
   2372 			cvd = vd->vdev_child[rc->rc_devidx];
   2373 
   2374 			if (rc->rc_error == 0)
   2375 				continue;
   2376 
   2377 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
   2378 			    rc->rc_offset, rc->rc_data, rc->rc_size,
   2379 			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
   2380 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
   2381 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
   2382 		}
   2383 	}
   2384 }
   2385 
   2386 static void
   2387 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
   2388 {
   2389 	if (faulted > vd->vdev_nparity)
   2390 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
   2391 		    VDEV_AUX_NO_REPLICAS);
   2392 	else if (degraded + faulted != 0)
   2393 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
   2394 	else
   2395 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
   2396 }
   2397 
   2398 vdev_ops_t vdev_raidz_ops = {
   2399 	vdev_raidz_open,
   2400 	vdev_raidz_close,
   2401 	vdev_raidz_asize,
   2402 	vdev_raidz_io_start,
   2403 	vdev_raidz_io_done,
   2404 	vdev_raidz_state_change,
   2405 	NULL,
   2406 	NULL,
   2407 	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
   2408 	B_FALSE			/* not a leaf vdev */
   2409 };
   2410