Home | History | Annotate | Line # | Download | only in udf
udf_strat_rmw.c revision 1.9.4.5
      1 /* $NetBSD: udf_strat_rmw.c,v 1.9.4.5 2008/12/10 22:18:20 snj Exp $ */
      2 
      3 /*
      4  * Copyright (c) 2006, 2008 Reinoud Zandijk
      5  * All rights reserved.
      6  *
      7  * Redistribution and use in source and binary forms, with or without
      8  * modification, are permitted provided that the following conditions
      9  * are met:
     10  * 1. Redistributions of source code must retain the above copyright
     11  *    notice, this list of conditions and the following disclaimer.
     12  * 2. Redistributions in binary form must reproduce the above copyright
     13  *    notice, this list of conditions and the following disclaimer in the
     14  *    documentation and/or other materials provided with the distribution.
     15  *
     16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
     17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
     18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
     19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
     20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
     21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
     25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26  *
     27  */
     28 
     29 #include <sys/cdefs.h>
     30 #ifndef lint
     31 __KERNEL_RCSID(0, "$NetBSD: udf_strat_rmw.c,v 1.9.4.5 2008/12/10 22:18:20 snj Exp $");
     32 #endif /* not lint */
     33 
     34 
     35 #if defined(_KERNEL_OPT)
     36 #include "opt_quota.h"
     37 #include "opt_compat_netbsd.h"
     38 #endif
     39 
     40 #include <sys/param.h>
     41 #include <sys/systm.h>
     42 #include <sys/sysctl.h>
     43 #include <sys/namei.h>
     44 #include <sys/proc.h>
     45 #include <sys/kernel.h>
     46 #include <sys/vnode.h>
     47 #include <miscfs/genfs/genfs_node.h>
     48 #include <sys/mount.h>
     49 #include <sys/buf.h>
     50 #include <sys/file.h>
     51 #include <sys/device.h>
     52 #include <sys/disklabel.h>
     53 #include <sys/ioctl.h>
     54 #include <sys/malloc.h>
     55 #include <sys/dirent.h>
     56 #include <sys/stat.h>
     57 #include <sys/conf.h>
     58 #include <sys/kauth.h>
     59 #include <sys/kthread.h>
     60 #include <dev/clock_subr.h>
     61 
     62 #include <fs/udf/ecma167-udf.h>
     63 #include <fs/udf/udf_mount.h>
     64 
     65 #include "udf.h"
     66 #include "udf_subr.h"
     67 #include "udf_bswap.h"
     68 
     69 
/* Convert a vnode to the UDF node stored in its v_data pointer. */
#define VTOI(vnode) ((struct udf_node *) (vnode)->v_data)
/* Fetch this strategy's private state from a mount's strategy_private. */
#define PRIV(ump) ((struct strat_private *) (ump)->strategy_private)
/* Convert a buffer to the eccline stored in its b_private pointer. */
#define BTOE(buf) ((struct udf_eccline *) ((buf)->b_private))
     73 
     74 /* --------------------------------------------------------------------- */
     75 
/*
 * Upper bound on the number of sectors in one packet (eccline). Per-sector
 * state is kept in 64-bit bitmaps and in arrays of this many entries.
 */
#define UDF_MAX_PACKET_SIZE	64			/* DONT change this */

/*
 * Scheduler states. Each value is also the index of the matching queue in
 * the per-state arrays of struct strat_private. Index 0 is never a queue:
 * a `queued_on' value of 0 means "not on any queue".
 */
#define UDF_SHED_WAITING	1			/* waiting on timeout */
#define UDF_SHED_READING	2
#define UDF_SHED_WRITING	3
#define UDF_SHED_SEQWRITING	4
#define UDF_SHED_IDLE		5			/* resting */
#define UDF_SHED_FREE		6			/* recycleable */
/* array size; parenthesised so it expands safely inside larger expressions */
#define UDF_SHED_MAX		(6+1)

/* eccline flags */
#define ECC_LOCKED		0x01			/* prevent access   */
#define ECC_WANTED		0x02			/* trying access    */
#define ECC_SEQWRITING		0x04			/* sequential queue */
#define ECC_FLOATING		0x08			/* not queued yet   */

/* seconds added to an eccline's wait_time when it is put back */
#define ECC_WAITTIME		10
     94 
     95 
TAILQ_HEAD(ecclineq, udf_eccline);
/*
 * One cached packet ("ECC line") of the disc. The four bitmaps hold one bit
 * per sector of the line; bit N describes sector start_sector + N.
 */
struct udf_eccline {
	struct udf_mount	 *ump;			/* owning mount */
	uint64_t		  present;		/* preserve these */
	uint64_t		  readin;		/* bitmap */
	uint64_t		  dirty;		/* bitmap */
	uint64_t		  error;		/* bitmap */
	uint32_t		  refcnt;		/* taken in udf_geteccline(), dropped in udf_puteccline() */

	struct timespec		  wait_time;		/* set to now + ECC_WAITTIME on put */
	uint32_t		  flags;		/* ECC_* flags */
	uint32_t		  start_sector;		/* physical */

	struct buf		 *buf;			/* queue handle; its b_private points back here */
	void			 *blob;			/* sector data, packet_size * lb_size bytes */

	/* per sector: nested buffer recorded by a pending read, its offset and length */
	struct buf		 *bufs[UDF_MAX_PACKET_SIZE];
	uint32_t		  bufs_bpos[UDF_MAX_PACKET_SIZE];
	int			  bufs_len[UDF_MAX_PACKET_SIZE];

	int			  queued_on;		/* on which BUFQ list */
	LIST_ENTRY(udf_eccline)   hashchain;		/* on sector lookup  */
};
    119 
    120 
/*
 * Per-mount state of the read-modify-write strategy, reached through the
 * PRIV() macro. The arrays sized UDF_SHED_MAX are indexed by UDF_SHED_*.
 */
struct strat_private {
	lwp_t			 *queue_lwp;		/* NOTE(review): presumably the strategy thread; set outside this view */
	kcondvar_t		  discstrat_cv;		/* to wait on       */
	kmutex_t		  discstrat_mutex;	/* disc strategy    */
	kmutex_t		  seqwrite_mutex;	/* protect mappings */

	int			  thread_running;	/* thread control */
	int			  run_thread;		/* thread control */
	int			  thread_finished;	/* thread control */
	int			  cur_queue;		/* NOTE(review): not used in the visible code; confirm meaning */

	int			  num_floating;		/* ecclines flagged ECC_FLOATING */
	int			  num_queued[UDF_SHED_MAX];	/* ecclines on each queue */
	struct bufq_state	 *queues[UDF_SHED_MAX];
	struct timespec		  last_queued[UDF_SHED_MAX];	/* time of last push per queue */
	struct disk_strategy	  old_strategy_setting;	/* NOTE(review): presumably saved for restore; confirm */

	struct pool		  eccline_pool;		/* struct udf_eccline allocations */
	struct pool		  ecclineblob_pool;	/* eccline data blobs */
	LIST_HEAD(, udf_eccline)  eccline_hash[UDF_ECCBUF_HASHSIZE];	/* lookup by line number */
};
    142 
    143 /* --------------------------------------------------------------------- */
    144 
/* Thin wrappers around the eccline lock/unlock functions defined below. */
#define UDF_LOCK_ECCLINE(eccline) udf_lock_eccline(eccline)
#define UDF_UNLOCK_ECCLINE(eccline) udf_unlock_eccline(eccline)
    147 
    148 /* can be called with or without discstrat lock */
    149 static void
    150 udf_lock_eccline(struct udf_eccline *eccline)
    151 {
    152 	struct strat_private *priv = PRIV(eccline->ump);
    153 	int waslocked, ret;
    154 
    155 	waslocked = mutex_owned(&priv->discstrat_mutex);
    156 	if (!waslocked)
    157 		mutex_enter(&priv->discstrat_mutex);
    158 
    159 	/* wait until its unlocked first */
    160 	while (eccline->flags & ECC_LOCKED) {
    161 		eccline->flags |= ECC_WANTED;
    162 		ret = cv_timedwait(&priv->discstrat_cv, &priv->discstrat_mutex,
    163 			hz/8);
    164 		if (ret == EWOULDBLOCK)
    165 			DPRINTF(LOCKING, ("eccline lock helt, waiting for "
    166 				"release"));
    167 	}
    168 	eccline->flags |= ECC_LOCKED;
    169 	eccline->flags &= ~ECC_WANTED;
    170 
    171 	if (!waslocked)
    172 		mutex_exit(&priv->discstrat_mutex);
    173 }
    174 
    175 
    176 /* can be called with or without discstrat lock */
    177 static void
    178 udf_unlock_eccline(struct udf_eccline *eccline)
    179 {
    180 	struct strat_private *priv = PRIV(eccline->ump);
    181 	int waslocked;
    182 
    183 	waslocked = mutex_owned(&priv->discstrat_mutex);
    184 	if (!waslocked)
    185 		mutex_enter(&priv->discstrat_mutex);
    186 
    187 	eccline->flags &= ~ECC_LOCKED;
    188 	cv_broadcast(&priv->discstrat_cv);
    189 
    190 	if (!waslocked)
    191 		mutex_exit(&priv->discstrat_mutex);
    192 }
    193 
    194 
    195 /* NOTE discstrat_mutex should be held! */
    196 static void
    197 udf_dispose_eccline(struct udf_eccline *eccline)
    198 {
    199 	struct strat_private *priv = PRIV(eccline->ump);
    200 	struct buf *ret;
    201 
    202 	KASSERT(mutex_owned(&priv->discstrat_mutex));
    203 
    204 	KASSERT(eccline->refcnt == 0);
    205 	KASSERT(eccline->dirty  == 0);
    206 
    207 	DPRINTF(ECCLINE, ("dispose eccline with start sector %d, "
    208 		"present %0"PRIx64"\n", eccline->start_sector,
    209 		eccline->present));
    210 
    211 	if (eccline->queued_on) {
    212 		ret = BUFQ_CANCEL(priv->queues[eccline->queued_on], eccline->buf);
    213 		KASSERT(ret == eccline->buf);
    214 		priv->num_queued[eccline->queued_on]--;
    215 	}
    216 	LIST_REMOVE(eccline, hashchain);
    217 
    218 	if (eccline->flags & ECC_FLOATING) {
    219 		eccline->flags &= ~ECC_FLOATING;
    220 		priv->num_floating--;
    221 	}
    222 
    223 	putiobuf(eccline->buf);
    224 	pool_put(&priv->ecclineblob_pool, eccline->blob);
    225 	pool_put(&priv->eccline_pool, eccline);
    226 }
    227 
    228 
    229 /* NOTE discstrat_mutex should be held! */
    230 static void
    231 udf_push_eccline(struct udf_eccline *eccline, int newqueue)
    232 {
    233 	struct strat_private *priv = PRIV(eccline->ump);
    234 	struct buf *ret;
    235 	int curqueue;
    236 
    237 	KASSERT(mutex_owned(&priv->discstrat_mutex));
    238 
    239 	DPRINTF(PARANOIA, ("DEBUG: buf %p pushed on queue %d\n", eccline->buf, newqueue));
    240 
    241 	/* requeue */
    242 	curqueue = eccline->queued_on;
    243 	if (curqueue) {
    244 		ret = BUFQ_CANCEL(priv->queues[curqueue], eccline->buf);
    245 
    246 		DPRINTF(PARANOIA, ("push_eccline BUFQ_CANCEL returned %p when "
    247 			"requested to remove %p from queue %d\n", ret,
    248 			eccline->buf, curqueue));
    249 #ifdef DIAGNOSTIC
    250 		if (ret == NULL) {
    251 			int i;
    252 
    253 			printf("udf_push_eccline: bufq_cancel can't find "
    254 				"buffer; dumping queues\n");
    255 			for (i = 1; i < UDF_SHED_MAX; i++) {
    256 				printf("queue %d\n\t", i);
    257 				ret = BUFQ_GET(priv->queues[i]);
    258 				while (ret) {
    259 					printf("%p ", ret);
    260 					if (ret == eccline->buf)
    261 						printf("[<-] ");
    262 					ret = BUFQ_GET(priv->queues[i]);
    263 				}
    264 				printf("\n");
    265 			}
    266 			panic("fatal queue bug; exit");
    267 		}
    268 #endif
    269 
    270 		KASSERT(ret == eccline->buf);
    271 		priv->num_queued[curqueue]--;
    272 	}
    273 
    274 	/* set buffer block numbers to make sure its queued correctly */
    275 	eccline->buf->b_lblkno   = eccline->start_sector;
    276 	eccline->buf->b_blkno    = eccline->start_sector;
    277 	eccline->buf->b_rawblkno = eccline->start_sector;
    278 
    279 	BUFQ_PUT(priv->queues[newqueue], eccline->buf);
    280 	eccline->queued_on = newqueue;
    281 	priv->num_queued[newqueue]++;
    282 	vfs_timestamp(&priv->last_queued[newqueue]);
    283 
    284 	if (eccline->flags & ECC_FLOATING) {
    285 		eccline->flags &= ~ECC_FLOATING;
    286 		priv->num_floating--;
    287 	}
    288 
    289 	/* tickle disc strategy statemachine */
    290 	if (newqueue != UDF_SHED_IDLE)
    291 		cv_signal(&priv->discstrat_cv);
    292 }
    293 
    294 
    295 static struct udf_eccline *
    296 udf_pop_eccline(struct strat_private *priv, int queued_on)
    297 {
    298 	struct udf_eccline *eccline;
    299 	struct buf *buf;
    300 
    301 	KASSERT(mutex_owned(&priv->discstrat_mutex));
    302 
    303 	buf = BUFQ_GET(priv->queues[queued_on]);
    304 	if (!buf) {
    305 		KASSERT(priv->num_queued[queued_on] == 0);
    306 		return NULL;
    307 	}
    308 
    309 	eccline = BTOE(buf);
    310 	KASSERT(eccline->queued_on == queued_on);
    311 	eccline->queued_on = 0;
    312 	priv->num_queued[queued_on]--;
    313 
    314 	if (eccline->flags & ECC_FLOATING)
    315 		panic("popping already marked floating eccline");
    316 	eccline->flags |= ECC_FLOATING;
    317 	priv->num_floating++;
    318 
    319 	DPRINTF(PARANOIA, ("DEBUG: buf %p popped from queue %d\n",
    320 		eccline->buf, queued_on));
    321 
    322 	return eccline;
    323 }
    324 
    325 
    326 static struct udf_eccline *
    327 udf_geteccline(struct udf_mount *ump, uint32_t sector, int flags)
    328 {
    329 	struct strat_private *priv = PRIV(ump);
    330 	struct udf_eccline *eccline;
    331 	uint32_t start_sector, lb_size, blobsize;
    332 	uint8_t *eccline_blob;
    333 	int line, line_offset;
    334 	int num_busy, ret;
    335 
    336 	line_offset  = sector % ump->packet_size;
    337 	start_sector = sector - line_offset;
    338 	line = (start_sector/ump->packet_size) & UDF_ECCBUF_HASHMASK;
    339 
    340 	mutex_enter(&priv->discstrat_mutex);
    341 	KASSERT(priv->thread_running);
    342 
    343 retry:
    344 	DPRINTF(ECCLINE, ("get line sector %d, line %d\n", sector, line));
    345 	LIST_FOREACH(eccline, &priv->eccline_hash[line], hashchain) {
    346 		if (eccline->start_sector == start_sector) {
    347 			DPRINTF(ECCLINE, ("\tfound eccline, start_sector %d\n",
    348 				eccline->start_sector));
    349 
    350 			UDF_LOCK_ECCLINE(eccline);
    351 			/* move from freelist (!) */
    352 			if (eccline->queued_on == UDF_SHED_FREE) {
    353 				DPRINTF(ECCLINE, ("was on freelist\n"));
    354 				KASSERT(eccline->refcnt == 0);
    355 				udf_push_eccline(eccline, UDF_SHED_IDLE);
    356 			}
    357 			eccline->refcnt++;
    358 			mutex_exit(&priv->discstrat_mutex);
    359 			return eccline;
    360 		}
    361 	}
    362 
    363 	DPRINTF(ECCLINE, ("\tnot found in eccline cache\n"));
    364 	/* not found in eccline cache */
    365 
    366 	lb_size  = udf_rw32(ump->logical_vol->lb_size);
    367 	blobsize = ump->packet_size * lb_size;
    368 
    369 	/* dont allow too many pending requests */
    370 	DPRINTF(ECCLINE, ("\tallocating new eccline\n"));
    371 	num_busy = (priv->num_queued[UDF_SHED_SEQWRITING] + priv->num_floating);
    372 	if ((flags & ECC_SEQWRITING) && (num_busy > UDF_ECCLINE_MAXBUSY)) {
    373 		ret = cv_timedwait(&priv->discstrat_cv,
    374 			&priv->discstrat_mutex, hz/8);
    375 		goto retry;
    376 	}
    377 
    378 	eccline_blob = pool_get(&priv->ecclineblob_pool, PR_NOWAIT);
    379 	eccline = pool_get(&priv->eccline_pool, PR_NOWAIT);
    380 	if ((eccline_blob == NULL) || (eccline == NULL)) {
    381 		if (eccline_blob)
    382 			pool_put(&priv->ecclineblob_pool, eccline_blob);
    383 		if (eccline)
    384 			pool_put(&priv->eccline_pool, eccline);
    385 
    386 		/* out of memory for now; canibalise freelist */
    387 		eccline = udf_pop_eccline(priv, UDF_SHED_FREE);
    388 		if (eccline == NULL) {
    389 			/* serious trouble; wait and retry */
    390 			cv_timedwait(&priv->discstrat_cv,
    391 				&priv->discstrat_mutex, hz/8);
    392 			goto retry;
    393 		}
    394 		/* push back line if we're waiting for it */
    395 		if (eccline->flags & ECC_WANTED) {
    396 			udf_push_eccline(eccline, UDF_SHED_IDLE);
    397 			goto retry;
    398 		}
    399 
    400 		/* unlink this entry */
    401 		LIST_REMOVE(eccline, hashchain);
    402 
    403 		KASSERT(eccline->flags & ECC_FLOATING);
    404 
    405 		eccline_blob = eccline->blob;
    406 		memset(eccline, 0, sizeof(struct udf_eccline));
    407 		eccline->flags = ECC_FLOATING;
    408 	} else {
    409 		memset(eccline, 0, sizeof(struct udf_eccline));
    410 		eccline->flags = ECC_FLOATING;
    411 		priv->num_floating++;
    412 	}
    413 
    414 	eccline->queued_on = 0;
    415 	eccline->blob = eccline_blob;
    416 	eccline->buf  = getiobuf(NULL, true);
    417 	eccline->buf->b_private = eccline;	/* IMPORTANT */
    418 
    419 	/* initialise eccline blob */
    420 	memset(eccline->blob, 0, blobsize);
    421 
    422 	eccline->ump = ump;
    423 	eccline->present = eccline->readin = eccline->dirty = 0;
    424 	eccline->error = 0;
    425 	eccline->refcnt = 0;
    426 
    427 	eccline->start_sector    = start_sector;
    428 	eccline->buf->b_lblkno   = start_sector;
    429 	eccline->buf->b_blkno    = start_sector;
    430 	eccline->buf->b_rawblkno = start_sector;
    431 
    432 	LIST_INSERT_HEAD(&priv->eccline_hash[line], eccline, hashchain);
    433 
    434 	/*
    435 	 * TODO possible optimalisation for checking overlap with partitions
    436 	 * to get a clue on future eccline usage
    437 	 */
    438 	eccline->refcnt++;
    439 	UDF_LOCK_ECCLINE(eccline);
    440 
    441 	mutex_exit(&priv->discstrat_mutex);
    442 
    443 	return eccline;
    444 }
    445 
    446 
    447 static void
    448 udf_puteccline(struct udf_eccline *eccline)
    449 {
    450 	struct strat_private *priv = PRIV(eccline->ump);
    451 	struct udf_mount *ump = eccline->ump;
    452 	uint64_t allbits = ((uint64_t) 1 << ump->packet_size)-1;
    453 
    454 	mutex_enter(&priv->discstrat_mutex);
    455 
    456 	/* clear directly all readin requests from present ones */
    457 	if (eccline->readin & eccline->present) {
    458 		/* clear all read bits that are already read in */
    459 		eccline->readin &= (~eccline->present) & allbits;
    460 		wakeup(eccline);
    461 	}
    462 
    463 	DPRINTF(ECCLINE, ("put eccline start sector %d, refcnt %d\n",
    464 		eccline->start_sector, eccline->refcnt));
    465 
    466 	/* if we have active nodes we dont set it on seqwriting */
    467 	if (eccline->refcnt > 1)
    468 		eccline->flags &= ~ECC_SEQWRITING;
    469 
    470 	vfs_timestamp(&eccline->wait_time);
    471 	eccline->wait_time.tv_sec += ECC_WAITTIME;
    472 	udf_push_eccline(eccline, UDF_SHED_WAITING);
    473 
    474 	KASSERT(eccline->refcnt >= 1);
    475 	eccline->refcnt--;
    476 	UDF_UNLOCK_ECCLINE(eccline);
    477 
    478 	wakeup(eccline);
    479 	mutex_exit(&priv->discstrat_mutex);
    480 }
    481 
    482 /* --------------------------------------------------------------------- */
    483 
    484 static int
    485 udf_create_nodedscr_rmw(struct udf_strat_args *args)
    486 {
    487 	union dscrptr   **dscrptr  = &args->dscr;
    488 	struct udf_mount *ump      = args->ump;
    489 	struct long_ad   *icb      = args->icb;
    490 	struct udf_eccline *eccline;
    491 	uint64_t bit;
    492 	uint32_t sectornr, lb_size, dummy;
    493 	uint8_t *mem;
    494 	int error, eccsect;
    495 
    496 	error = udf_translate_vtop(ump, icb, &sectornr, &dummy);
    497 	if (error)
    498 		return error;
    499 
    500 	lb_size  = udf_rw32(ump->logical_vol->lb_size);
    501 
    502 	/* get our eccline */
    503 	eccline = udf_geteccline(ump, sectornr, 0);
    504 	eccsect = sectornr - eccline->start_sector;
    505 
    506 	bit = (uint64_t) 1 << eccsect;
    507 	eccline->readin  &= ~bit;	/* just in case */
    508 	eccline->present |=  bit;
    509 	eccline->dirty   &= ~bit;	/* Err... euhm... clean? */
    510 
    511 	eccline->refcnt++;
    512 
    513 	/* clear space */
    514 	mem = ((uint8_t *) eccline->blob) + eccsect * lb_size;
    515 	memset(mem, 0, lb_size);
    516 
    517 	udf_puteccline(eccline);
    518 
    519 	*dscrptr = (union dscrptr *) mem;
    520 	return 0;
    521 }
    522 
    523 
    524 static void
    525 udf_free_nodedscr_rmw(struct udf_strat_args *args)
    526 {
    527 	struct udf_mount *ump  = args->ump;
    528 	struct long_ad   *icb  = args->icb;
    529 	struct udf_eccline *eccline;
    530 	uint64_t bit;
    531 	uint32_t sectornr, dummy;
    532 	int error, eccsect;
    533 
    534 	error = udf_translate_vtop(ump, icb, &sectornr, &dummy);
    535 	if (error)
    536 		return;
    537 
    538 	/* get our eccline */
    539 	eccline = udf_geteccline(ump, sectornr, 0);
    540 	eccsect = sectornr - eccline->start_sector;
    541 
    542 	bit = (uint64_t) 1 << eccsect;
    543 	eccline->readin &= ~bit;	/* just in case */
    544 
    545 	KASSERT(eccline->refcnt >= 1);
    546 	eccline->refcnt--;
    547 
    548 	udf_puteccline(eccline);
    549 }
    550 
    551 
    552 static int
    553 udf_read_nodedscr_rmw(struct udf_strat_args *args)
    554 {
    555 	union dscrptr   **dscrptr = &args->dscr;
    556 	struct udf_mount *ump = args->ump;
    557 	struct long_ad   *icb = args->icb;
    558 	struct udf_eccline *eccline;
    559 	uint64_t bit;
    560 	uint32_t sectornr, dummy;
    561 	uint8_t *pos;
    562 	int sector_size = ump->discinfo.sector_size;
    563 	int lb_size = udf_rw32(ump->logical_vol->lb_size);
    564 	int i, error, dscrlen, eccsect;
    565 
    566 	lb_size = lb_size;
    567 	KASSERT(sector_size == lb_size);
    568 	error = udf_translate_vtop(ump, icb, &sectornr, &dummy);
    569 	if (error)
    570 		return error;
    571 
    572 	/* get our eccline */
    573 	eccline = udf_geteccline(ump, sectornr, 0);
    574 	eccsect = sectornr - eccline->start_sector;
    575 
    576 	bit = (uint64_t) 1 << eccsect;
    577 	if ((eccline->present & bit) == 0) {
    578 		/* mark bit for readin */
    579 		eccline->readin |= bit;
    580 		eccline->refcnt++;	/* prevent recycling */
    581 		KASSERT(eccline->bufs[eccsect] == NULL);
    582 		udf_puteccline(eccline);
    583 
    584 		/* wait for completion; XXX remodel to lock bit code */
    585 		error = 0;
    586 		while ((eccline->present & bit) == 0) {
    587 			tsleep(eccline, PRIBIO+1, "udflvdrd", hz/8);
    588 			if (eccline->error & bit) {
    589 				KASSERT(eccline->refcnt >= 1);
    590 				eccline->refcnt--;	/* undo temp refcnt */
    591 				*dscrptr = NULL;
    592 				return EIO;		/* XXX error code */
    593 			}
    594 		}
    595 
    596 		/* reget our line */
    597 		eccline = udf_geteccline(ump, sectornr, 0);
    598 		KASSERT(eccline->refcnt >= 1);
    599 		eccline->refcnt--;	/* undo refcnt */
    600 	}
    601 
    602 	*dscrptr = (union dscrptr *)
    603 		(((uint8_t *) eccline->blob) + eccsect * sector_size);
    604 
    605 	/* code from read_phys_descr */
    606 	/* check if its a valid tag */
    607 	error = udf_check_tag(*dscrptr);
    608 	if (error) {
    609 		/* check if its an empty block */
    610 		pos = (uint8_t *) *dscrptr;
    611 		for (i = 0; i < sector_size; i++, pos++) {
    612 			if (*pos) break;
    613 		}
    614 		if (i == sector_size) {
    615 			/* return no error but with no dscrptr */
    616 			error = 0;
    617 		}
    618 		*dscrptr = NULL;
    619 		udf_puteccline(eccline);
    620 		return error;
    621 	}
    622 
    623 	/* calculate descriptor size */
    624 	dscrlen = udf_tagsize(*dscrptr, sector_size);
    625 	error = udf_check_tag_payload(*dscrptr, dscrlen);
    626 	if (error) {
    627 		*dscrptr = NULL;
    628 		udf_puteccline(eccline);
    629 		return error;
    630 	}
    631 
    632 	eccline->refcnt++;
    633 	udf_puteccline(eccline);
    634 
    635 	return 0;
    636 }
    637 
    638 
    639 static int
    640 udf_write_nodedscr_rmw(struct udf_strat_args *args)
    641 {
    642 	union dscrptr    *dscrptr = args->dscr;
    643 	struct udf_mount *ump = args->ump;
    644 	struct long_ad   *icb = args->icb;
    645 	struct udf_node *udf_node = args->udf_node;
    646 	struct udf_eccline *eccline;
    647 	uint64_t bit;
    648 	uint32_t sectornr, logsectornr, dummy;
    649 	// int waitfor  = args->waitfor;
    650 	int sector_size = ump->discinfo.sector_size;
    651 	int lb_size = udf_rw32(ump->logical_vol->lb_size);
    652 	int error, eccsect;
    653 
    654 	lb_size = lb_size;
    655 	KASSERT(sector_size == lb_size);
    656 	sectornr    = 0;
    657 	error = udf_translate_vtop(ump, icb, &sectornr, &dummy);
    658 	if (error)
    659 		return error;
    660 
    661 	/* add reference to the vnode to prevent recycling */
    662 	vhold(udf_node->vnode);
    663 
    664 	/* get our eccline */
    665 	eccline = udf_geteccline(ump, sectornr, 0);
    666 	eccsect = sectornr - eccline->start_sector;
    667 
    668 	bit = (uint64_t) 1 << eccsect;
    669 
    670 	/* old callback still pending? */
    671 	if (eccline->bufs[eccsect]) {
    672 		DPRINTF(WRITE, ("udf_write_nodedscr_rmw: writing descriptor"
    673 					" over buffer?\n"));
    674 		nestiobuf_done(eccline->bufs[eccsect],
    675 				eccline->bufs_len[eccsect],
    676 				0);
    677 		eccline->bufs[eccsect] = NULL;
    678 	}
    679 
    680 	/* set sector number in the descriptor and validate */
    681 	dscrptr = (union dscrptr *)
    682 		(((uint8_t *) eccline->blob) + eccsect * sector_size);
    683 	KASSERT(dscrptr == args->dscr);
    684 
    685 	logsectornr = udf_rw32(icb->loc.lb_num);
    686 	dscrptr->tag.tag_loc = udf_rw32(logsectornr);
    687 	udf_validate_tag_and_crc_sums(dscrptr);
    688 
    689 	udf_fixup_node_internals(ump, (uint8_t *) dscrptr, UDF_C_NODE);
    690 
    691 	/* set our flags */
    692 	KASSERT(eccline->present & bit);
    693 	eccline->dirty |= bit;
    694 
    695 	KASSERT(udf_tagsize(dscrptr, sector_size) <= sector_size);
    696 
    697 	udf_puteccline(eccline);
    698 
    699 	holdrele(udf_node->vnode);
    700 	udf_node->outstanding_nodedscr--;
    701 	if (udf_node->outstanding_nodedscr == 0) {
    702 		UDF_UNLOCK_NODE(udf_node, udf_node->i_flags & IN_CALLBACK_ULK);
    703 		wakeup(&udf_node->outstanding_nodedscr);
    704 	}
    705 
    706 	/* XXX waitfor not used */
    707 	return 0;
    708 }
    709 
    710 
    711 static void
    712 udf_queuebuf_rmw(struct udf_strat_args *args)
    713 {
    714 	struct udf_mount *ump = args->ump;
    715 	struct buf *buf = args->nestbuf;
    716 	struct desc_tag *tag;
    717 	struct strat_private *priv = PRIV(ump);
    718 	struct udf_eccline *eccline;
    719 	struct long_ad *node_ad_cpy;
    720 	uint64_t bit, *lmapping, *pmapping, *lmappos, *pmappos, blknr;
    721 	uint32_t buf_len, len, sectors, sectornr, our_sectornr;
    722 	uint32_t bpos;
    723 	uint16_t vpart_num;
    724 	uint8_t *fidblk, *src, *dst;
    725 	int sector_size = ump->discinfo.sector_size;
    726 	int blks = sector_size / DEV_BSIZE;
    727 	int eccsect, what, queue, error;
    728 
    729 	KASSERT(ump);
    730 	KASSERT(buf);
    731 	KASSERT(buf->b_iodone == nestiobuf_iodone);
    732 
    733 	blknr        = buf->b_blkno;
    734 	our_sectornr = blknr / blks;
    735 
    736 	what = buf->b_udf_c_type;
    737 	queue = UDF_SHED_READING;
    738 	if ((buf->b_flags & B_READ) == 0) {
    739 		/* writing */
    740 		queue = UDF_SHED_SEQWRITING;
    741 		if (what == UDF_C_DSCR)
    742 			queue = UDF_SHED_WRITING;
    743 		if (what == UDF_C_NODE)
    744 			queue = UDF_SHED_WRITING;
    745 	}
    746 
    747 	if (queue == UDF_SHED_READING) {
    748 		DPRINTF(SHEDULE, ("\nudf_queuebuf_rmw READ %p : sector %d type %d,"
    749 			"b_resid %d, b_bcount %d, b_bufsize %d\n",
    750 			buf, (uint32_t) buf->b_blkno / blks, buf->b_udf_c_type,
    751 			buf->b_resid, buf->b_bcount, buf->b_bufsize));
    752 
    753 		/* mark bits for reading */
    754 		buf_len = buf->b_bcount;
    755 		sectornr = our_sectornr;
    756 		eccline = udf_geteccline(ump, sectornr, 0);
    757 		eccsect = sectornr - eccline->start_sector;
    758 		bpos = 0;
    759 		while (buf_len) {
    760 			len = MIN(buf_len, sector_size);
    761 			if (eccsect == ump->packet_size) {
    762 				udf_puteccline(eccline);
    763 				eccline = udf_geteccline(ump, sectornr, 0);
    764 				eccsect = sectornr - eccline->start_sector;
    765 			}
    766 			bit = (uint64_t) 1 << eccsect;
    767 			error = eccline->error & bit ? EIO : 0;
    768 			if (eccline->present & bit) {
    769 				src = (uint8_t *) eccline->blob +
    770 					eccsect * sector_size;
    771 				dst = (uint8_t *) buf->b_data + bpos;
    772 				if (!error)
    773 					memcpy(dst, src, len);
    774 				nestiobuf_done(buf, len, error);
    775 			} else {
    776 				eccline->readin |= bit;
    777 				KASSERT(eccline->bufs[eccsect] == NULL);
    778 				eccline->bufs[eccsect] = buf;
    779 				eccline->bufs_bpos[eccsect] = bpos;
    780 				eccline->bufs_len[eccsect] = len;
    781 			}
    782 			bpos += sector_size;
    783 			eccsect++;
    784 			sectornr++;
    785 			buf_len -= len;
    786 		}
    787 		udf_puteccline(eccline);
    788 		return;
    789 	}
    790 
    791 	if (queue == UDF_SHED_WRITING) {
    792 		DPRINTF(SHEDULE, ("\nudf_queuebuf_rmw WRITE %p : sector %d "
    793 			"type %d, b_resid %d, b_bcount %d, b_bufsize %d\n",
    794 			buf, (uint32_t) buf->b_blkno / blks, buf->b_udf_c_type,
    795 			buf->b_resid, buf->b_bcount, buf->b_bufsize));
    796 		/* if we have FIDs fixup using buffer's sector number(s) */
    797 		if (buf->b_udf_c_type == UDF_C_FIDS) {
    798 			panic("UDF_C_FIDS in SHED_WRITING!\n");
    799 #if 0
    800 			buf_len = buf->b_bcount;
    801 			sectornr = our_sectornr;
    802 			bpos = 0;
    803 			while (buf_len) {
    804 				len = MIN(buf_len, sector_size);
    805 				fidblk = (uint8_t *) buf->b_data + bpos;
    806 				udf_fixup_fid_block(fidblk, sector_size,
    807 					0, len, sectornr);
    808 				sectornr++;
    809 				bpos += len;
    810 				buf_len -= len;
    811 			}
    812 #endif
    813 		}
    814 		udf_fixup_node_internals(ump, buf->b_data, buf->b_udf_c_type);
    815 
    816 		/* copy parts into the bufs and set for writing */
    817 		buf_len = buf->b_bcount;
    818 		sectornr = our_sectornr;
    819 		eccline = udf_geteccline(ump, sectornr, 0);
    820 		eccsect = sectornr - eccline->start_sector;
    821 		bpos = 0;
    822 		while (buf_len) {
    823 			len = MIN(buf_len, sector_size);
    824 			if (eccsect == ump->packet_size) {
    825 				udf_puteccline(eccline);
    826 				eccline = udf_geteccline(ump, sectornr, 0);
    827 				eccsect = sectornr - eccline->start_sector;
    828 			}
    829 			bit = (uint64_t) 1 << eccsect;
    830 			KASSERT((eccline->readin & bit) == 0);
    831 			eccline->present |= bit;
    832 			eccline->dirty   |= bit;
    833 			if (eccline->bufs[eccsect]) {
    834 				/* old callback still pending */
    835 				nestiobuf_done(eccline->bufs[eccsect],
    836 						eccline->bufs_len[eccsect],
    837 						0);
    838 				eccline->bufs[eccsect] = NULL;
    839 			}
    840 
    841 			src = (uint8_t *) buf->b_data + bpos;
    842 			dst = (uint8_t *) eccline->blob + eccsect * sector_size;
    843 			if (len != sector_size)
    844 				memset(dst, 0, sector_size);
    845 			memcpy(dst, src, len);
    846 
    847 			/* note that its finished for this extent */
    848 			eccline->bufs[eccsect] = NULL;
    849 			nestiobuf_done(buf, len, 0);
    850 
    851 			bpos += sector_size;
    852 			eccsect++;
    853 			sectornr++;
    854 			buf_len -= len;
    855 		}
    856 		udf_puteccline(eccline);
    857 		return;
    858 
    859 	}
    860 
    861 	/* sequential writing */
    862 	KASSERT(queue == UDF_SHED_SEQWRITING);
    863 	DPRINTF(SHEDULE, ("\nudf_queuebuf_rmw SEQWRITE %p : sector XXXX "
    864 		"type %d, b_resid %d, b_bcount %d, b_bufsize %d\n",
    865 		buf, buf->b_udf_c_type, buf->b_resid, buf->b_bcount,
    866 		buf->b_bufsize));
    867 	/*
    868 	 * Buffers should not have been allocated to disc addresses yet on
    869 	 * this queue. Note that a buffer can get multiple extents allocated.
    870 	 * Note that it *looks* like the normal writing but its different in
    871 	 * the details.
    872 	 *
    873 	 * lmapping contains lb_num relative to base partition.
    874 	 *
    875 	 * XXX should we try to claim/organize the allocated memory to
    876 	 * block-aligned pieces?
    877 	 */
    878 	mutex_enter(&priv->seqwrite_mutex);
    879 
    880 	lmapping    = ump->la_lmapping;
    881 	node_ad_cpy = ump->la_node_ad_cpy;
    882 
    883 	/* logically allocate buf and map it in the file */
    884 	udf_late_allocate_buf(ump, buf, lmapping, node_ad_cpy, &vpart_num);
    885 
    886 	/* if we have FIDs, fixup using the new allocation table */
    887 	if (buf->b_udf_c_type == UDF_C_FIDS) {
    888 		buf_len = buf->b_bcount;
    889 		bpos = 0;
    890 		lmappos = lmapping;
    891 		while (buf_len) {
    892 			sectornr = *lmappos++;
    893 			len = MIN(buf_len, sector_size);
    894 			fidblk = (uint8_t *) buf->b_data + bpos;
    895 			udf_fixup_fid_block(fidblk, sector_size,
    896 				0, len, sectornr);
    897 			bpos += len;
    898 			buf_len -= len;
    899 		}
    900 	}
    901 	if (buf->b_udf_c_type == UDF_C_METADATA_SBM) {
    902 		if (buf->b_lblkno == 0) {
    903 			/* update the tag location inside */
    904 			tag = (struct desc_tag *) buf->b_data;
    905 			tag->tag_loc = udf_rw32(*lmapping);
    906 			udf_validate_tag_and_crc_sums(buf->b_data);
    907 		}
    908 	}
    909 	udf_fixup_node_internals(ump, buf->b_data, buf->b_udf_c_type);
    910 
    911 	/*
    912 	 * Translate new mappings in lmapping to pmappings.
    913 	 * pmapping to contain lb_nums as used for disc adressing.
    914 	 */
    915 	pmapping = ump->la_pmapping;
    916 	sectors  = (buf->b_bcount + sector_size -1) / sector_size;
    917 	udf_translate_vtop_list(ump, sectors, vpart_num, lmapping, pmapping);
    918 
    919 	/* copy parts into the bufs and set for writing */
    920 	pmappos = pmapping;
    921 	buf_len = buf->b_bcount;
    922 	sectornr = *pmappos++;
    923 	eccline = udf_geteccline(ump, sectornr, ECC_SEQWRITING);
    924 	eccsect = sectornr - eccline->start_sector;
    925 	bpos = 0;
    926 	while (buf_len) {
    927 		len = MIN(buf_len, sector_size);
    928 		eccsect = sectornr - eccline->start_sector;
    929 		if ((eccsect < 0) || (eccsect >= ump->packet_size)) {
    930 			eccline->flags |= ECC_SEQWRITING;
    931 			udf_puteccline(eccline);
    932 			eccline = udf_geteccline(ump, sectornr, ECC_SEQWRITING);
    933 			eccsect = sectornr - eccline->start_sector;
    934 		}
    935 		bit = (uint64_t) 1 << eccsect;
    936 		KASSERT((eccline->readin & bit) == 0);
    937 		eccline->present |= bit;
    938 		eccline->dirty   |= bit;
    939 		eccline->bufs[eccsect] = NULL;
    940 
    941 		src = (uint8_t *) buf->b_data + bpos;
    942 		dst = (uint8_t *)
    943 			eccline->blob + eccsect * sector_size;
    944 		if (len != sector_size)
    945 			memset(dst, 0, sector_size);
    946 		memcpy(dst, src, len);
    947 
    948 		/* note that its finished for this extent */
    949 		nestiobuf_done(buf, len, 0);
    950 
    951 		bpos += sector_size;
    952 		sectornr = *pmappos++;
    953 		buf_len -= len;
    954 	}
    955 	eccline->flags |= ECC_SEQWRITING;
    956 	udf_puteccline(eccline);
    957 	mutex_exit(&priv->seqwrite_mutex);
    958 }
    959 
    960 /* --------------------------------------------------------------------- */
    961 
    962 static void
    963 udf_shedule_read_callback(struct buf *buf)
    964 {
    965 	struct udf_eccline *eccline = BTOE(buf);
    966 	struct udf_mount *ump = eccline->ump;
    967 	uint64_t bit;
    968 	uint8_t *src, *dst;
    969 	int sector_size = ump->discinfo.sector_size;
    970 	int error, i, len;
    971 
    972 	DPRINTF(ECCLINE, ("read callback called\n"));
    973 	/* post process read action */
    974 	error = buf->b_error;
    975 	for (i = 0; i < ump->packet_size; i++) {
    976 		bit = (uint64_t) 1 << i;
    977 		src = (uint8_t *) buf->b_data +   i * sector_size;
    978 		dst = (uint8_t *) eccline->blob + i * sector_size;
    979 		if (eccline->present & bit)
    980 			continue;
    981 		eccline->present |= bit;
    982 		if (error)
    983 			eccline->error |= bit;
    984 		if (eccline->bufs[i]) {
    985 			dst = (uint8_t *) eccline->bufs[i]->b_data +
    986 				eccline->bufs_bpos[i];
    987 			len = eccline->bufs_len[i];
    988 			if (!error)
    989 				memcpy(dst, src, len);
    990 			nestiobuf_done(eccline->bufs[i], len, error);
    991 			eccline->bufs[i] = NULL;
    992 		}
    993 
    994 	}
    995 	KASSERT(buf->b_data == eccline->blob);
    996 	KASSERT(eccline->present == ((uint64_t) 1 << ump->packet_size)-1);
    997 
    998 	/*
    999 	 * XXX TODO what to do on read errors? read in all sectors
   1000 	 * synchronously and allocate a sparable entry?
   1001 	 */
   1002 
   1003 	udf_puteccline(eccline);
   1004 	DPRINTF(ECCLINE, ("read callback finished\n"));
   1005 }
   1006 
   1007 
   1008 static void
   1009 udf_shedule_write_callback(struct buf *buf)
   1010 {
   1011 	struct udf_eccline *eccline = BTOE(buf);
   1012 	struct udf_mount *ump = eccline->ump;
   1013 	uint64_t bit;
   1014 	int error, i, len;
   1015 
   1016 	DPRINTF(ECCLINE, ("write callback called\n"));
   1017 	/* post process write action */
   1018 	error = buf->b_error;
   1019 	for (i = 0; i < ump->packet_size; i++) {
   1020 		bit = (uint64_t) 1 << i;
   1021 		if ((eccline->dirty & bit) == 0)
   1022 			continue;
   1023 		if (error) {
   1024 			eccline->error |= bit;
   1025 		} else {
   1026 			eccline->dirty &= ~bit;
   1027 		}
   1028 		if (eccline->bufs[i]) {
   1029 			len = eccline->bufs_len[i];
   1030 			nestiobuf_done(eccline->bufs[i], len, error);
   1031 			eccline->bufs[i] = NULL;
   1032 		}
   1033 	}
   1034 	KASSERT(eccline->dirty == 0);
   1035 
   1036 	KASSERT(error == 0);
   1037 	/*
   1038 	 * XXX TODO on write errors allocate a sparable entry and reissue
   1039 	 */
   1040 
   1041 	udf_puteccline(eccline);
   1042 }
   1043 
   1044 
   1045 static void
   1046 udf_issue_eccline(struct udf_eccline *eccline, int queued_on)
   1047 {
   1048 	struct udf_mount *ump = eccline->ump;
   1049 	struct strat_private *priv = PRIV(ump);
   1050 	struct buf *buf, *nestbuf;
   1051 	uint64_t bit, allbits = ((uint64_t) 1 << ump->packet_size)-1;
   1052 	uint32_t start;
   1053 	int sector_size = ump->discinfo.sector_size;
   1054 	int blks = sector_size / DEV_BSIZE;
   1055 	int i;
   1056 
   1057 	if (queued_on == UDF_SHED_READING) {
   1058 		DPRINTF(SHEDULE, ("udf_issue_eccline reading : "));
   1059 		/* read all bits that are not yet present */
   1060 		eccline->readin = (~eccline->present) & allbits;
   1061 		KASSERT(eccline->readin);
   1062 		start = eccline->start_sector;
   1063 		buf = eccline->buf;
   1064 		buf_init(buf);
   1065 		buf->b_flags    = B_READ | B_ASYNC;
   1066 		SET(buf->b_cflags, BC_BUSY);	/* mark buffer busy */
   1067 		buf->b_oflags   = 0;
   1068 		buf->b_iodone   = udf_shedule_read_callback;
   1069 		buf->b_data     = eccline->blob;
   1070 		buf->b_bcount   = ump->packet_size * sector_size;
   1071 		buf->b_resid    = buf->b_bcount;
   1072 		buf->b_bufsize  = buf->b_bcount;
   1073 		buf->b_private  = eccline;
   1074 		BIO_SETPRIO(buf, BPRIO_DEFAULT);
   1075 		buf->b_lblkno   = buf->b_blkno = buf->b_rawblkno = start * blks;
   1076 		buf->b_proc     = NULL;
   1077 
   1078 		if (eccline->present != 0) {
   1079 			for (i = 0; i < ump->packet_size; i++) {
   1080 				bit = (uint64_t) 1 << i;
   1081 				if (eccline->present & bit) {
   1082 					nestiobuf_done(buf, sector_size, 0);
   1083 					continue;
   1084 				}
   1085 				nestbuf = getiobuf(NULL, true);
   1086 				nestiobuf_setup(buf, nestbuf, i * sector_size,
   1087 					sector_size);
   1088 				/* adjust blocknumber to read */
   1089 				nestbuf->b_blkno = buf->b_blkno + i*blks;
   1090 				nestbuf->b_rawblkno = buf->b_rawblkno + i*blks;
   1091 
   1092 				DPRINTF(SHEDULE, ("sector %d ",
   1093 					start + i));
   1094 				/* call asynchronous */
   1095 				VOP_STRATEGY(ump->devvp, nestbuf);
   1096 			}
   1097 			DPRINTF(SHEDULE, ("\n"));
   1098 			return;
   1099 		}
   1100 	} else {
   1101 		/* write or seqwrite */
   1102 		DPRINTF(SHEDULE, ("udf_issue_eccline writing or seqwriting : "));
   1103 		DPRINTF(SHEDULE, ("\n\tpresent %"PRIx64", readin %"PRIx64", "
   1104 			"dirty %"PRIx64"\n\t", eccline->present, eccline->readin,
   1105 			eccline->dirty));
   1106 		if (eccline->present != allbits) {
   1107 			/* requeue to read-only */
   1108 			DPRINTF(SHEDULE, ("\n\t-> not complete, requeue to "
   1109 				"reading\n"));
   1110 			udf_push_eccline(eccline, UDF_SHED_READING);
   1111 			return;
   1112 		}
   1113 		start = eccline->start_sector;
   1114 		buf = eccline->buf;
   1115 		buf_init(buf);
   1116 		buf->b_flags    = B_WRITE | B_ASYNC;
   1117 		SET(buf->b_cflags, BC_BUSY);	/* mark buffer busy */
   1118 		buf->b_oflags   = 0;
   1119 		buf->b_iodone   = udf_shedule_write_callback;
   1120 		buf->b_data     = eccline->blob;
   1121 		buf->b_bcount   = ump->packet_size * sector_size;
   1122 		buf->b_resid    = buf->b_bcount;
   1123 		buf->b_bufsize  = buf->b_bcount;
   1124 		buf->b_private  = eccline;
   1125 		BIO_SETPRIO(buf, BPRIO_DEFAULT);
   1126 		buf->b_lblkno   = buf->b_blkno = buf->b_rawblkno = start * blks;
   1127 		buf->b_proc     = NULL;
   1128 	}
   1129 
   1130 	mutex_exit(&priv->discstrat_mutex);
   1131 		/* call asynchronous */
   1132 		DPRINTF(SHEDULE, ("sector %d for %d\n",
   1133 			start, ump->packet_size));
   1134 		VOP_STRATEGY(ump->devvp, buf);
   1135 	mutex_enter(&priv->discstrat_mutex);
   1136 }
   1137 
   1138 
   1139 static void
   1140 udf_discstrat_thread(void *arg)
   1141 {
   1142 	struct udf_mount *ump = (struct udf_mount *) arg;
   1143 	struct strat_private *priv = PRIV(ump);
   1144 	struct udf_eccline *eccline;
   1145 	struct timespec now, *last;
   1146 	uint64_t allbits = ((uint64_t) 1 << ump->packet_size)-1;
   1147 	int new_queue, wait, work, num, cnt;
   1148 
   1149 	work = 1;
   1150 	priv->thread_running = 1;
   1151 	mutex_enter(&priv->discstrat_mutex);
   1152 	priv->num_floating = 0;
   1153 	while (priv->run_thread || work || priv->num_floating) {
   1154 		/* get our time */
   1155 		vfs_timestamp(&now);
   1156 
   1157 		/* maintenance: handle eccline state machine */
   1158 		num = priv->num_queued[UDF_SHED_WAITING];
   1159 		cnt = 0;
   1160 		while (cnt < num) {
   1161 			eccline = udf_pop_eccline(priv, UDF_SHED_WAITING);
   1162 			/* requeue */
   1163 			new_queue = UDF_SHED_FREE;
   1164 			if (eccline->refcnt > 0)
   1165 				new_queue = UDF_SHED_IDLE;
   1166 			if (eccline->flags & ECC_WANTED)
   1167 				new_queue = UDF_SHED_IDLE;
   1168 			if (eccline->readin)
   1169 				new_queue = UDF_SHED_READING;
   1170 			if (eccline->dirty) {
   1171 				new_queue = UDF_SHED_WAITING;
   1172 				if ((eccline->wait_time.tv_sec - now.tv_sec <= 0) ||
   1173 				   ((eccline->present == allbits) &&
   1174 				    (eccline->flags & ECC_SEQWRITING)))
   1175 				{
   1176 					new_queue = UDF_SHED_WRITING;
   1177 					if (eccline->flags & ECC_SEQWRITING)
   1178 						new_queue = UDF_SHED_SEQWRITING;
   1179 					if (eccline->present != allbits)
   1180 						new_queue = UDF_SHED_READING;
   1181 				}
   1182 			}
   1183 			udf_push_eccline(eccline, new_queue);
   1184 			cnt++;
   1185 		}
   1186 
   1187 		/* maintenance: free exess ecclines */
   1188 		while (priv->num_queued[UDF_SHED_FREE] > UDF_ECCLINE_MAXFREE) {
   1189 			eccline = udf_pop_eccline(priv, UDF_SHED_FREE);
   1190 			KASSERT(eccline);
   1191 			KASSERT(eccline->refcnt == 0);
   1192 			if (eccline->flags & ECC_WANTED) {
   1193 				udf_push_eccline(eccline, UDF_SHED_IDLE);
   1194 				DPRINTF(ECCLINE, ("Tried removing, pushed back to free list\n"));
   1195 			} else {
   1196 				DPRINTF(ECCLINE, ("Removing entry from free list\n"));
   1197 				udf_dispose_eccline(eccline);
   1198 			}
   1199 		}
   1200 
   1201 		/* process the current selected queue */
   1202 		/* get our time */
   1203 		vfs_timestamp(&now);
   1204 		last = &priv->last_queued[priv->cur_queue];
   1205 
   1206 		/* get our line */
   1207 		eccline = udf_pop_eccline(priv, priv->cur_queue);
   1208 		if (eccline) {
   1209 			wait = 0;
   1210 			new_queue = priv->cur_queue;
   1211 			DPRINTF(ECCLINE, ("UDF_ISSUE_ECCLINE\n"));
   1212 
   1213 			/* complete the `get' by locking and refcounting it */
   1214 			UDF_LOCK_ECCLINE(eccline);
   1215 			eccline->refcnt++;
   1216 
   1217 			udf_issue_eccline(eccline, priv->cur_queue);
   1218 		} else {
   1219 			/* don't switch too quickly */
   1220 			if (now.tv_sec - last->tv_sec < 2) {
   1221 				/* wait some time */
   1222 				cv_timedwait(&priv->discstrat_cv,
   1223 					&priv->discstrat_mutex, hz);
   1224 				/* we assume there is work to be done */
   1225 				work = 1;
   1226 				continue;
   1227 			}
   1228 
   1229 			/* XXX select on queue lengths ? */
   1230 			wait = 1;
   1231 			/* check if we can/should switch */
   1232 			new_queue = priv->cur_queue;
   1233 			if (BUFQ_PEEK(priv->queues[UDF_SHED_READING]))
   1234 				new_queue = UDF_SHED_READING;
   1235 			if (BUFQ_PEEK(priv->queues[UDF_SHED_WRITING]))
   1236 				new_queue = UDF_SHED_WRITING;
   1237 			if (BUFQ_PEEK(priv->queues[UDF_SHED_SEQWRITING]))
   1238 				new_queue = UDF_SHED_SEQWRITING;
   1239 		}
   1240 
   1241 		/* give room */
   1242 		mutex_exit(&priv->discstrat_mutex);
   1243 
   1244 		if (new_queue != priv->cur_queue) {
   1245 			wait = 0;
   1246 			DPRINTF(SHEDULE, ("switching from %d to %d\n",
   1247 				priv->cur_queue, new_queue));
   1248 			priv->cur_queue = new_queue;
   1249 		}
   1250 		mutex_enter(&priv->discstrat_mutex);
   1251 
   1252 		/* wait for more if needed */
   1253 		if (wait)
   1254 			cv_timedwait(&priv->discstrat_cv,
   1255 				&priv->discstrat_mutex, hz/4);	/* /8 */
   1256 
   1257 		work  = (BUFQ_PEEK(priv->queues[UDF_SHED_WAITING]) != NULL);
   1258 		work |= (BUFQ_PEEK(priv->queues[UDF_SHED_READING]) != NULL);
   1259 		work |= (BUFQ_PEEK(priv->queues[UDF_SHED_WRITING]) != NULL);
   1260 		work |= (BUFQ_PEEK(priv->queues[UDF_SHED_SEQWRITING]) != NULL);
   1261 
   1262 		DPRINTF(PARANOIA, ("work : (%d, %d, %d) -> work %d, float %d\n",
   1263 			(BUFQ_PEEK(priv->queues[UDF_SHED_READING]) != NULL),
   1264 			(BUFQ_PEEK(priv->queues[UDF_SHED_WRITING]) != NULL),
   1265 			(BUFQ_PEEK(priv->queues[UDF_SHED_SEQWRITING]) != NULL),
   1266 			work, priv->num_floating));
   1267 	}
   1268 
   1269 	mutex_exit(&priv->discstrat_mutex);
   1270 
   1271 	/* tear down remaining ecclines */
   1272 	mutex_enter(&priv->discstrat_mutex);
   1273 	KASSERT(priv->num_queued[UDF_SHED_WAITING] == 0);
   1274 	KASSERT(priv->num_queued[UDF_SHED_IDLE] == 0);
   1275 	KASSERT(priv->num_queued[UDF_SHED_READING] == 0);
   1276 	KASSERT(priv->num_queued[UDF_SHED_WRITING] == 0);
   1277 	KASSERT(priv->num_queued[UDF_SHED_SEQWRITING] == 0);
   1278 
   1279 	KASSERT(BUFQ_PEEK(priv->queues[UDF_SHED_WAITING]) == NULL);
   1280 	KASSERT(BUFQ_PEEK(priv->queues[UDF_SHED_IDLE]) == NULL);
   1281 	KASSERT(BUFQ_PEEK(priv->queues[UDF_SHED_READING]) == NULL);
   1282 	KASSERT(BUFQ_PEEK(priv->queues[UDF_SHED_WRITING]) == NULL);
   1283 	KASSERT(BUFQ_PEEK(priv->queues[UDF_SHED_SEQWRITING]) == NULL);
   1284 	eccline = udf_pop_eccline(priv, UDF_SHED_FREE);
   1285 	while (eccline) {
   1286 		udf_dispose_eccline(eccline);
   1287 		eccline = udf_pop_eccline(priv, UDF_SHED_FREE);
   1288 	}
   1289 	KASSERT(priv->num_queued[UDF_SHED_FREE] == 0);
   1290 	mutex_exit(&priv->discstrat_mutex);
   1291 
   1292 	priv->thread_running  = 0;
   1293 	priv->thread_finished = 1;
   1294 	wakeup(&priv->run_thread);
   1295 	kthread_exit(0);
   1296 	/* not reached */
   1297 }
   1298 
   1299 /* --------------------------------------------------------------------- */
   1300 
   1301 /*
   1302  * Buffer memory pool allocator.
   1303  */
   1304 
   1305 static void *
   1306 ecclinepool_page_alloc(struct pool *pp, int flags)
   1307 {
   1308         return (void *)uvm_km_alloc(kernel_map,
   1309             MAXBSIZE, MAXBSIZE,
   1310             ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK)
   1311 	    	| UVM_KMF_WIRED /* UVM_KMF_PAGABLE? */);
   1312 }
   1313 
   1314 static void
   1315 ecclinepool_page_free(struct pool *pp, void *v)
   1316 {
   1317         uvm_km_free(kernel_map, (vaddr_t)v, MAXBSIZE, UVM_KMF_WIRED);
   1318 }
   1319 
   1320 static struct pool_allocator ecclinepool_allocator = {
   1321         .pa_alloc = ecclinepool_page_alloc,
   1322         .pa_free  = ecclinepool_page_free,
   1323         .pa_pagesz = MAXBSIZE,
   1324 };
   1325 
   1326 
   1327 static void
   1328 udf_discstrat_init_rmw(struct udf_strat_args *args)
   1329 {
   1330 	struct udf_mount *ump = args->ump;
   1331 	struct strat_private *priv = PRIV(ump);
   1332 	uint32_t lb_size, blobsize, hashline;
   1333 	int i;
   1334 
   1335 	KASSERT(ump);
   1336 	KASSERT(ump->logical_vol);
   1337 	KASSERT(priv == NULL);
   1338 
   1339 	lb_size = udf_rw32(ump->logical_vol->lb_size);
   1340 	blobsize = ump->packet_size * lb_size;
   1341 	KASSERT(lb_size > 0);
   1342 	KASSERT(ump->packet_size <= 64);
   1343 
   1344 	/* initialise our memory space */
   1345 	ump->strategy_private = malloc(sizeof(struct strat_private),
   1346 		M_UDFTEMP, M_WAITOK);
   1347 	priv = ump->strategy_private;
   1348 	memset(priv, 0 , sizeof(struct strat_private));
   1349 
   1350 	/* initialise locks */
   1351 	cv_init(&priv->discstrat_cv, "udfstrat");
   1352 	mutex_init(&priv->discstrat_mutex, MUTEX_DRIVER, IPL_BIO);
   1353 	mutex_init(&priv->seqwrite_mutex, MUTEX_DEFAULT, IPL_NONE);
   1354 
   1355 	/* initialise struct eccline pool */
   1356 	pool_init(&priv->eccline_pool, sizeof(struct udf_eccline),
   1357 		0, 0, 0, "udf_eccline_pool", NULL, IPL_NONE);
   1358 
   1359 	/* initialise eccline blob pool */
   1360         ecclinepool_allocator.pa_pagesz = blobsize;
   1361 	pool_init(&priv->ecclineblob_pool, blobsize,
   1362 		0, 0, 0, "udf_eccline_blob", &ecclinepool_allocator, IPL_NONE);
   1363 
   1364 	/* initialise main queues */
   1365 	for (i = 0; i < UDF_SHED_MAX; i++) {
   1366 		priv->num_queued[i] = 0;
   1367 		vfs_timestamp(&priv->last_queued[i]);
   1368 	}
   1369 	bufq_alloc(&priv->queues[UDF_SHED_WAITING], "fcfs",
   1370 		BUFQ_SORT_RAWBLOCK);
   1371 	bufq_alloc(&priv->queues[UDF_SHED_READING], "disksort",
   1372 		BUFQ_SORT_RAWBLOCK);
   1373 	bufq_alloc(&priv->queues[UDF_SHED_WRITING], "disksort",
   1374 		BUFQ_SORT_RAWBLOCK);
   1375 	bufq_alloc(&priv->queues[UDF_SHED_SEQWRITING], "disksort", 0);
   1376 
   1377 	/* initialise administrative queues */
   1378 	bufq_alloc(&priv->queues[UDF_SHED_IDLE], "fcfs", 0);
   1379 	bufq_alloc(&priv->queues[UDF_SHED_FREE], "fcfs", 0);
   1380 
   1381 	for (hashline = 0; hashline < UDF_ECCBUF_HASHSIZE; hashline++) {
   1382 		LIST_INIT(&priv->eccline_hash[hashline]);
   1383 	}
   1384 
   1385 	/* create our disk strategy thread */
   1386 	priv->cur_queue = UDF_SHED_READING;
   1387 	priv->thread_finished = 0;
   1388 	priv->thread_running  = 0;
   1389 	priv->run_thread      = 1;
   1390 	if (kthread_create(PRI_NONE, 0 /* KTHREAD_MPSAFE*/, NULL /* cpu_info*/,
   1391 		udf_discstrat_thread, ump, &priv->queue_lwp,
   1392 		"%s", "udf_rw")) {
   1393 		panic("fork udf_rw");
   1394 	}
   1395 
   1396 	/* wait for thread to spin up */
   1397 	while (!priv->thread_running) {
   1398 		tsleep(&priv->thread_running, PRIBIO+1, "udfshedstart", hz);
   1399 	}
   1400 }
   1401 
   1402 
   1403 static void
   1404 udf_discstrat_finish_rmw(struct udf_strat_args *args)
   1405 {
   1406 	struct udf_mount *ump = args->ump;
   1407 	struct strat_private *priv = PRIV(ump);
   1408 	int error;
   1409 
   1410 	if (ump == NULL)
   1411 		return;
   1412 
   1413 	/* stop our sheduling thread */
   1414 	KASSERT(priv->run_thread == 1);
   1415 	priv->run_thread = 0;
   1416 	wakeup(priv->queue_lwp);
   1417 	while (!priv->thread_finished) {
   1418 		error = tsleep(&priv->run_thread, PRIBIO+1,
   1419 			"udfshedfin", hz);
   1420 	}
   1421 	/* kthread should be finished now */
   1422 
   1423 	/* cleanup our pools */
   1424 	pool_destroy(&priv->eccline_pool);
   1425 	pool_destroy(&priv->ecclineblob_pool);
   1426 
   1427 	cv_destroy(&priv->discstrat_cv);
   1428 	mutex_destroy(&priv->discstrat_mutex);
   1429 	mutex_destroy(&priv->seqwrite_mutex);
   1430 
   1431 	/* free our private space */
   1432 	free(ump->strategy_private, M_UDFTEMP);
   1433 	ump->strategy_private = NULL;
   1434 }
   1435 
   1436 /* --------------------------------------------------------------------- */
   1437 
   1438 struct udf_strategy udf_strat_rmw =
   1439 {
   1440 	udf_create_nodedscr_rmw,
   1441 	udf_free_nodedscr_rmw,
   1442 	udf_read_nodedscr_rmw,
   1443 	udf_write_nodedscr_rmw,
   1444 	udf_queuebuf_rmw,
   1445 	udf_discstrat_init_rmw,
   1446 	udf_discstrat_finish_rmw
   1447 };
   1448 
   1449