Home | History | Annotate | Line # | Download | only in udf
      1 /* $NetBSD: udf_strat_rmw.c,v 1.31 2023/06/27 09:58:50 reinoud Exp $ */
      2 
      3 /*
      4  * Copyright (c) 2006, 2008 Reinoud Zandijk
      5  * All rights reserved.
      6  *
      7  * Redistribution and use in source and binary forms, with or without
      8  * modification, are permitted provided that the following conditions
      9  * are met:
     10  * 1. Redistributions of source code must retain the above copyright
     11  *    notice, this list of conditions and the following disclaimer.
     12  * 2. Redistributions in binary form must reproduce the above copyright
     13  *    notice, this list of conditions and the following disclaimer in the
     14  *    documentation and/or other materials provided with the distribution.
     15  *
     16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
     17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
     18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
     19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
     20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
     21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
     25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26  *
     27  */
     28 
     29 #include <sys/cdefs.h>
     30 #ifndef lint
     31 __KERNEL_RCSID(0, "$NetBSD: udf_strat_rmw.c,v 1.31 2023/06/27 09:58:50 reinoud Exp $");
     32 #endif /* not lint */
     33 
     34 
     35 #if defined(_KERNEL_OPT)
     36 #include "opt_compat_netbsd.h"
     37 #endif
     38 
     39 #include <sys/param.h>
     40 #include <sys/systm.h>
     41 #include <sys/sysctl.h>
     42 #include <sys/namei.h>
     43 #include <sys/proc.h>
     44 #include <sys/kernel.h>
     45 #include <sys/vnode.h>
     46 #include <miscfs/genfs/genfs_node.h>
     47 #include <sys/mount.h>
     48 #include <sys/buf.h>
     49 #include <sys/file.h>
     50 #include <sys/device.h>
     51 #include <sys/disklabel.h>
     52 #include <sys/ioctl.h>
     53 #include <sys/malloc.h>
     54 #include <sys/dirent.h>
     55 #include <sys/stat.h>
     56 #include <sys/conf.h>
     57 #include <sys/kauth.h>
     58 #include <sys/kthread.h>
     59 #include <dev/clock_subr.h>
     60 
     61 #include <fs/udf/ecma167-udf.h>
     62 #include <fs/udf/udf_mount.h>
     63 
     64 #include "udf.h"
     65 #include "udf_subr.h"
     66 #include "udf_bswap.h"
     67 
     68 
/* get the UDF node stored in a vnode's v_data */
#define VTOI(vnode) ((struct udf_node *) (vnode)->v_data)
/* get this strategy's private state from the mount's strategy_private */
#define PRIV(ump) ((struct strat_private *) (ump)->strategy_private)
/* get the eccline a buffer points back to through its b_private field */
#define BTOE(buf) ((struct udf_eccline *) ((buf)->b_private))
     72 
     73 /* --------------------------------------------------------------------- */
     74 
/*
 * Maximum number of sectors in one eccline.  The per-sector state of a line
 * is kept in uint64_t bitmaps, one bit per sector.
 */
#define UDF_MAX_PACKET_SIZE	64			/* DONT change this */

/*
 * scheduler states; they double as index into the per-state queue arrays.
 * The value 0 is used for `not on any queue'.
 */
#define UDF_SHED_WAITING	1			/* waiting on timeout */
#define UDF_SHED_READING	2
#define UDF_SHED_WRITING	3
#define UDF_SHED_SEQWRITING	4
#define UDF_SHED_IDLE		5			/* refcnt'd */
#define UDF_SHED_FREE		6			/* recycleable */
/* number of slots needed; parenthesised so it is safe in any expression */
#define UDF_SHED_MAX		(6+1)

/* eccline flags */
#define ECC_LOCKED		0x01			/* prevent access   */
#define ECC_WANTED		0x02			/* trying access    */
#define ECC_SEQWRITING		0x04			/* sequential queue */
#define ECC_FLOATING		0x08			/* not queued yet   */

/* seconds added to the current time for a dirty line's wait_time */
#define ECC_WAITTIME		10
     93 
     94 
/* tail queue head type for ecclines */
TAILQ_HEAD(ecclineq, udf_eccline);

/*
 * One cached ECC line: the run of sectors starting at start_sector whose
 * contents are kept in `blob'.  The four bitmaps carry one bit per sector of
 * the line, bit N describing sector start_sector + N.
 */
struct udf_eccline {
	struct udf_mount	 *ump;			/* mount this line is for */
	uint64_t		  present;		/* preserve these */
	uint64_t		  readin;		/* bitmap */
	uint64_t		  dirty;		/* bitmap */
	uint64_t		  error;		/* bitmap */
	uint32_t		  refcnt;		/* holds; non-zero keeps it off the free queue */

	struct timespec		  wait_time;		/* set to now + ECC_WAITTIME when put back dirty */
	uint32_t		  flags;		/* ECC_* flags */
	uint32_t		  start_sector;		/* physical */

	/* call site that last took the line lock, recorded for debugging */
	const char		 *fname;
	int			  sline;

	struct buf		 *buf;			/* iobuf that represents the line on the queues */
	void			 *blob;			/* sector data of the whole line */

	/* per sector: read request waiting for it, offset in its data, length */
	struct buf		 *bufs[UDF_MAX_PACKET_SIZE];
	uint32_t		  bufs_bpos[UDF_MAX_PACKET_SIZE];
	int			  bufs_len[UDF_MAX_PACKET_SIZE];

	int			  queued_on;		/* on which BUFQ list */
	LIST_ENTRY(udf_eccline)   hashchain;		/* on sector lookup  */
};
    121 
    122 
/*
 * Private state of the read-modify-write strategy, reached from a mount
 * through PRIV().
 */
struct strat_private {
	lwp_t			 *queue_lwp;		/* presumably the scheduler thread; not used in this part of the file */
	kcondvar_t		  discstrat_cv;		/* to wait on       */
	kmutex_t		  discstrat_mutex;	/* disc strategy    */
	kmutex_t		  seqwrite_mutex;	/* protect mappings */

	int			  thread_running;	/* thread control */
	int			  run_thread;		/* thread control */
	int			  thread_finished;	/* thread control */
	int			  cur_queue;

	int			  num_floating;		/* lines that carry ECC_FLOATING */
	/* the arrays below are indexed by scheduler state (UDF_SHED_*) */
	int			  num_queued[UDF_SHED_MAX];	/* lines per queue */
	struct bufq_state	 *queues[UDF_SHED_MAX];		/* the queues themselves */
	struct timespec		  last_queued[UDF_SHED_MAX];	/* time of last push */
	struct disk_strategy	  old_strategy_setting;	/* NOTE(review): looks like the saved device setting -- confirm */

	struct pool		  eccline_pool;		/* struct udf_eccline allocations */
	struct pool		  ecclineblob_pool;	/* line data blobs */
	/* lookup by (start_sector / packet_size) & UDF_ECCBUF_HASHMASK */
	LIST_HEAD(, udf_eccline)  eccline_hash[UDF_ECCBUF_HASHSIZE];
};
    144 
    145 /* --------------------------------------------------------------------- */
    146 
/* lock/unlock an eccline, passing along the call site (__FILE__, __LINE__) */
#define UDF_LOCK_ECCLINE(eccline) udf_lock_eccline(eccline, __FILE__, __LINE__)
#define UDF_UNLOCK_ECCLINE(eccline) udf_unlock_eccline(eccline, __FILE__, __LINE__)
    149 
    150 /* can be called with or without discstrat lock */
    151 static void
    152 udf_lock_eccline(struct udf_eccline *eccline, const char *fname, int sline)
    153 {
    154 	struct strat_private *priv = PRIV(eccline->ump);
    155 	int waslocked, ret;
    156 
    157 	KASSERT(mutex_owned(&priv->discstrat_mutex));
    158 
    159 	waslocked = mutex_owned(&priv->discstrat_mutex);
    160 	if (!waslocked)
    161 		mutex_enter(&priv->discstrat_mutex);
    162 
    163 	/* wait until its unlocked first */
    164 	eccline->refcnt++;
    165 	while (eccline->flags & ECC_LOCKED) {
    166 		DPRINTF(ECCLINE, ("waiting for lock at %s:%d\n",
    167 					fname, sline));
    168 		DPRINTF(ECCLINE, ("was locked at %s:%d\n",
    169 					eccline->fname, eccline->sline));
    170 		eccline->flags |= ECC_WANTED;
    171 		ret = cv_timedwait(&priv->discstrat_cv, &priv->discstrat_mutex,
    172 			hz/8);
    173 		if (ret == EWOULDBLOCK)
    174 			DPRINTF(LOCKING, ("eccline lock held, waiting for "
    175 				"release"));
    176 	}
    177 	eccline->flags |= ECC_LOCKED;
    178 	eccline->flags &= ~ECC_WANTED;
    179 	eccline->refcnt--;
    180 
    181 	eccline->fname = fname;
    182 	eccline->sline = sline;
    183 
    184 	if (!waslocked)
    185 		mutex_exit(&priv->discstrat_mutex);
    186 }
    187 
    188 
    189 /* can be called with or without discstrat lock */
    190 static void
    191 udf_unlock_eccline(struct udf_eccline *eccline, const char *fname, int sline)
    192 {
    193 	struct strat_private *priv = PRIV(eccline->ump);
    194 	int waslocked;
    195 
    196 	KASSERT(mutex_owned(&priv->discstrat_mutex));
    197 
    198 	waslocked = mutex_owned(&priv->discstrat_mutex);
    199 	if (!waslocked)
    200 		mutex_enter(&priv->discstrat_mutex);
    201 
    202 	eccline->flags &= ~ECC_LOCKED;
    203 	cv_broadcast(&priv->discstrat_cv);
    204 
    205 	if (!waslocked)
    206 		mutex_exit(&priv->discstrat_mutex);
    207 }
    208 
    209 
    210 /* NOTE discstrat_mutex should be held! */
    211 static void
    212 udf_dispose_eccline(struct udf_eccline *eccline)
    213 {
    214 	struct strat_private *priv = PRIV(eccline->ump);
    215 
    216 	KASSERT(mutex_owned(&priv->discstrat_mutex));
    217 
    218 	DPRINTF(ECCLINE, ("dispose eccline with start sector %d, "
    219 		"present %0"PRIx64"\n", eccline->start_sector,
    220 		eccline->present));
    221 
    222 	KASSERT(eccline->refcnt == 0);
    223 	KASSERT(eccline->dirty  == 0);
    224 	KASSERT(eccline->queued_on == 0);
    225 	KASSERT(eccline->flags & ECC_FLOATING);
    226 	KASSERT(eccline->flags & ECC_LOCKED);
    227 
    228 	LIST_REMOVE(eccline, hashchain);
    229 	priv->num_floating--;
    230 
    231 	putiobuf(eccline->buf);
    232 	pool_put(&priv->ecclineblob_pool, eccline->blob);
    233 	pool_put(&priv->eccline_pool, eccline);
    234 }
    235 
    236 
    237 /* NOTE discstrat_mutex should be held! */
    238 static void
    239 udf_push_eccline(struct udf_eccline *eccline, int newqueue)
    240 {
    241 	struct strat_private *priv = PRIV(eccline->ump);
    242 
    243 	KASSERT(mutex_owned(&priv->discstrat_mutex));
    244 
    245 	DPRINTF(PARANOIA, ("DEBUG: buf %p pushed on queue %d\n", eccline->buf, newqueue));
    246 
    247 	KASSERT(eccline->queued_on == 0);
    248 	KASSERT(eccline->flags & ECC_FLOATING);
    249 
    250 	/* set buffer block numbers to make sure its queued correctly */
    251 	eccline->buf->b_lblkno   = eccline->start_sector;
    252 	eccline->buf->b_blkno    = eccline->start_sector;
    253 	eccline->buf->b_rawblkno = eccline->start_sector;
    254 
    255 	vfs_timestamp(&priv->last_queued[newqueue]);
    256 	eccline->flags &= ~ECC_FLOATING;
    257 	priv->num_floating--;
    258 	eccline->queued_on = newqueue;
    259 	priv->num_queued[newqueue]++;
    260 	bufq_put(priv->queues[newqueue], eccline->buf);
    261 
    262 	UDF_UNLOCK_ECCLINE(eccline);
    263 
    264 	/* XXX tickle disc strategy statemachine */
    265 	if (newqueue != UDF_SHED_IDLE)
    266 		cv_signal(&priv->discstrat_cv);
    267 }
    268 
    269 
    270 static struct udf_eccline *
    271 udf_peek_eccline(struct strat_private *priv, int queued_on)
    272 {
    273 	struct udf_eccline *eccline;
    274 	struct buf *buf;
    275 
    276 	KASSERT(mutex_owned(&priv->discstrat_mutex));
    277 
    278 	for(;;) {
    279 		buf = bufq_peek(priv->queues[queued_on]);
    280 		/* could have been a race, but we'll revisit later */
    281 		if (buf == NULL)
    282 			return NULL;
    283 
    284 		eccline = BTOE(buf);
    285 		UDF_LOCK_ECCLINE(eccline);
    286 
    287 		/* might have changed before we obtained the lock */
    288 		if (eccline->queued_on == queued_on)
    289 			break;
    290 
    291 		UDF_UNLOCK_ECCLINE(eccline);
    292 	}
    293 
    294 	KASSERT(eccline->queued_on == queued_on);
    295 	KASSERT((eccline->flags & ECC_FLOATING) == 0);
    296 
    297 	DPRINTF(PARANOIA, ("DEBUG: buf %p peeked at queue %d\n",
    298 		eccline->buf, queued_on));
    299 
    300 	return eccline;
    301 }
    302 
    303 
    304 static struct udf_eccline *
    305 udf_pop_eccline(struct strat_private *priv, int queued_on)
    306 {
    307 	struct udf_eccline *eccline;
    308 	struct buf *buf;
    309 
    310 	KASSERT(mutex_owned(&priv->discstrat_mutex));
    311 
    312 	for(;;) {
    313 		buf = bufq_get(priv->queues[queued_on]);
    314 		if (buf == NULL) {
    315 			// KASSERT(priv->num_queued[queued_on] == 0);
    316 			return NULL;
    317 		}
    318 
    319 		eccline = BTOE(buf);
    320 		UDF_LOCK_ECCLINE(eccline);
    321 
    322 		/* might have changed before we obtained the lock */
    323 		if (eccline->queued_on == queued_on)
    324 			break;
    325 
    326 		UDF_UNLOCK_ECCLINE(eccline);
    327 	}
    328 
    329 	KASSERT(eccline->queued_on == queued_on);
    330 	KASSERT((eccline->flags & ECC_FLOATING) == 0);
    331 
    332 	priv->num_queued[queued_on]--;
    333 	eccline->queued_on = 0;
    334 
    335 	eccline->flags |= ECC_FLOATING;
    336 	priv->num_floating++;
    337 
    338 	DPRINTF(PARANOIA, ("DEBUG: buf %p popped from queue %d\n",
    339 		eccline->buf, queued_on));
    340 
    341 	return eccline;
    342 }
    343 
    344 
    345 static void
    346 udf_unqueue_eccline(struct strat_private *priv, struct udf_eccline *eccline)
    347 {
    348 	struct buf *ret __diagused;
    349 
    350 	UDF_LOCK_ECCLINE(eccline);
    351 	if (eccline->queued_on == 0) {
    352 		KASSERT(eccline->flags & ECC_FLOATING);
    353 		return;
    354 	}
    355 
    356 	ret = bufq_cancel(priv->queues[eccline->queued_on], eccline->buf);
    357 	KASSERT(ret == eccline->buf);
    358 
    359 	priv->num_queued[eccline->queued_on]--;
    360 	eccline->queued_on = 0;
    361 
    362 	eccline->flags |= ECC_FLOATING;
    363 	priv->num_floating++;
    364 }
    365 
    366 
    367 static struct udf_eccline *
    368 udf_geteccline(struct udf_mount *ump, uint32_t sector, int flags)
    369 {
    370 	struct strat_private *priv = PRIV(ump);
    371 	struct udf_eccline *eccline;
    372 	uint32_t start_sector, lb_size, blobsize;
    373 	uint8_t *eccline_blob;
    374 	int line, line_offset;
    375 	int num_busy;
    376 
    377 	mutex_enter(&priv->discstrat_mutex);
    378 
    379 	/* lookup in our line cache hashtable */
    380 	line_offset  = sector % ump->packet_size;
    381 	start_sector = sector - line_offset;
    382 	line = (start_sector/ump->packet_size) & UDF_ECCBUF_HASHMASK;
    383 
    384 	KASSERT(priv->thread_running);
    385 
    386 retry:
    387 	DPRINTF(ECCLINE, ("get line sector %d, line %d\n", sector, line));
    388 	LIST_FOREACH(eccline, &priv->eccline_hash[line], hashchain) {
    389 		if (eccline->start_sector == start_sector) {
    390 			DPRINTF(ECCLINE, ("\tfound eccline, start_sector %d\n",
    391 				eccline->start_sector));
    392 			udf_unqueue_eccline(priv, eccline);
    393 
    394 			mutex_exit(&priv->discstrat_mutex);
    395 			return eccline;
    396 		}
    397 	}
    398 
    399 	/* not found in eccline cache */
    400 	DPRINTF(ECCLINE, ("\tnot found in eccline cache\n"));
    401 
    402 	lb_size  = udf_rw32(ump->logical_vol->lb_size);
    403 	blobsize = ump->packet_size * lb_size;
    404 
    405 	/* dont allow too many pending requests */
    406 	DPRINTF(ECCLINE, ("\tallocating new eccline\n"));
    407 	num_busy = (priv->num_queued[UDF_SHED_SEQWRITING] + priv->num_floating);
    408 	if ((flags & ECC_SEQWRITING) && (num_busy > UDF_ECCLINE_MAXBUSY)) {
    409 		cv_timedwait(&priv->discstrat_cv,
    410 			&priv->discstrat_mutex, hz/8);
    411 		goto retry;
    412 	}
    413 
    414 	eccline_blob = pool_get(&priv->ecclineblob_pool, PR_NOWAIT);
    415 	eccline = pool_get(&priv->eccline_pool, PR_NOWAIT);
    416 	if ((eccline_blob == NULL) || (eccline == NULL)) {
    417 		if (eccline_blob)
    418 			pool_put(&priv->ecclineblob_pool, eccline_blob);
    419 		if (eccline)
    420 			pool_put(&priv->eccline_pool, eccline);
    421 
    422 		/* out of memory for now; canibalise freelist */
    423 		eccline = udf_pop_eccline(priv, UDF_SHED_FREE);
    424 		if (eccline == NULL) {
    425 			/* serious trouble; wait and retry */
    426 			cv_timedwait(&priv->discstrat_cv,
    427 				&priv->discstrat_mutex, hz/8);
    428 			goto retry;
    429 		}
    430 
    431 		/* push back line if we're waiting for it or its locked */
    432 		if (eccline->flags & ECC_WANTED) {
    433 			/* we won a race, but someone else needed it */
    434 			udf_push_eccline(eccline, UDF_SHED_FREE);
    435 			goto retry;
    436 		}
    437 
    438 		/* unlink this entry */
    439 		LIST_REMOVE(eccline, hashchain);
    440 		KASSERT(eccline->flags & ECC_FLOATING);
    441 		KASSERT(eccline->queued_on == 0);
    442 
    443 		eccline_blob = eccline->blob;
    444 		eccline->flags = ECC_FLOATING | ECC_LOCKED;
    445 	} else {
    446 		eccline->flags = ECC_FLOATING | ECC_LOCKED;
    447 		priv->num_floating++;
    448 	}
    449 
    450 	eccline->queued_on = 0;
    451 	eccline->blob = eccline_blob;
    452 	eccline->buf  = getiobuf(NULL, true);
    453 	eccline->buf->b_private = eccline;	/* IMPORTANT */
    454 
    455 	/* initialise eccline blob */
    456 	/* XXX memset expensive and strictly not needed XXX */
    457 	memset(eccline->blob, 0, blobsize);
    458 
    459 	eccline->ump = ump;
    460 	eccline->present = eccline->readin = eccline->dirty = 0;
    461 	eccline->error = 0;
    462 	eccline->refcnt = 0;
    463 	memset(eccline->bufs, 0, UDF_MAX_PACKET_SIZE * sizeof(struct buf *));
    464 
    465 	eccline->start_sector    = start_sector;
    466 	eccline->buf->b_lblkno   = start_sector;
    467 	eccline->buf->b_blkno    = start_sector;
    468 	eccline->buf->b_rawblkno = start_sector;
    469 
    470 	LIST_INSERT_HEAD(&priv->eccline_hash[line], eccline, hashchain);
    471 
    472 	/*
    473 	 * TODO possible optimalisation for checking overlap with partitions
    474 	 * to get a clue on future eccline usage
    475 	 */
    476 
    477 	KASSERT(eccline->refcnt == 0);
    478 	KASSERT(eccline->flags & ECC_FLOATING);
    479 	KASSERT(eccline->flags & ECC_LOCKED);
    480 	mutex_exit(&priv->discstrat_mutex);
    481 
    482 	return eccline;
    483 }
    484 
    485 
    486 static void
    487 udf_puteccline(struct udf_eccline *eccline)
    488 {
    489 	struct strat_private *priv = PRIV(eccline->ump);
    490 	struct udf_mount *ump = eccline->ump;
    491 	uint64_t allbits = ((uint64_t) 1 << ump->packet_size)-1;
    492 	int new_queue;
    493 
    494 	mutex_enter(&priv->discstrat_mutex);
    495 
    496 	DPRINTF(ECCLINE, ("put eccline start sector %d, refcnt %d\n",
    497 		eccline->start_sector, eccline->refcnt));
    498 
    499 	KASSERT(eccline->flags & ECC_LOCKED);
    500 	KASSERT(eccline->flags & ECC_FLOATING);
    501 
    502 	/* clear all read bits that are already read in */
    503 	if (eccline->readin & eccline->present)
    504 		eccline->readin &= (~eccline->present) & allbits;
    505 
    506 	/* if we have active nodes we dont set it on seqwriting */
    507 	if (eccline->refcnt > 1)
    508 		eccline->flags &= ~ECC_SEQWRITING;
    509 
    510 	/* select state */
    511 	new_queue = UDF_SHED_FREE;
    512 	if (eccline->refcnt > 0)
    513 		new_queue = UDF_SHED_IDLE;
    514 	if (eccline->flags & ECC_WANTED)
    515 		new_queue = UDF_SHED_IDLE;
    516 	if (eccline->readin)
    517 		new_queue = UDF_SHED_READING;
    518 	if (eccline->dirty) {
    519 		new_queue = UDF_SHED_WAITING;
    520 		vfs_timestamp(&eccline->wait_time);
    521 		eccline->wait_time.tv_sec += ECC_WAITTIME;
    522 
    523 		if (eccline->present == allbits) {
    524 			new_queue = UDF_SHED_WRITING;
    525 			if (eccline->flags & ECC_SEQWRITING)
    526 				new_queue = UDF_SHED_SEQWRITING;
    527 		}
    528 	}
    529 	udf_push_eccline(eccline, new_queue);
    530 
    531 	mutex_exit(&priv->discstrat_mutex);
    532 }
    533 
    534 /* --------------------------------------------------------------------- */
    535 
    536 static int
    537 udf_create_nodedscr_rmw(struct udf_strat_args *args)
    538 {
    539 	union dscrptr   **dscrptr  = &args->dscr;
    540 	struct udf_mount *ump      = args->ump;
    541 	struct long_ad   *icb      = args->icb;
    542 	struct udf_eccline *eccline;
    543 	uint64_t bit;
    544 	uint32_t sectornr, lb_size, dummy;
    545 	uint8_t *mem;
    546 	int error, eccsect;
    547 
    548 	error = udf_translate_vtop(ump, icb, &sectornr, &dummy);
    549 	if (error)
    550 		return error;
    551 
    552 	lb_size  = udf_rw32(ump->logical_vol->lb_size);
    553 
    554 	/* get our eccline */
    555 	eccline = udf_geteccline(ump, sectornr, 0);
    556 	eccsect = sectornr - eccline->start_sector;
    557 
    558 	bit = (uint64_t) 1 << eccsect;
    559 	eccline->readin  &= ~bit;	/* just in case */
    560 	eccline->present |=  bit;
    561 	eccline->dirty   &= ~bit;	/* Err... euhm... clean? */
    562 
    563 	eccline->refcnt++;
    564 
    565 	/* clear space */
    566 	mem = ((uint8_t *) eccline->blob) + eccsect * lb_size;
    567 	memset(mem, 0, lb_size);
    568 
    569 	udf_puteccline(eccline);
    570 
    571 	*dscrptr = (union dscrptr *) mem;
    572 	return 0;
    573 }
    574 
    575 
    576 static void
    577 udf_free_nodedscr_rmw(struct udf_strat_args *args)
    578 {
    579 	struct udf_mount *ump  = args->ump;
    580 	struct long_ad   *icb  = args->icb;
    581 	struct udf_eccline *eccline;
    582 	uint64_t bit;
    583 	uint32_t sectornr, dummy;
    584 	int error, eccsect;
    585 
    586 	error = udf_translate_vtop(ump, icb, &sectornr, &dummy);
    587 	if (error)
    588 		return;
    589 
    590 	/* get our eccline */
    591 	eccline = udf_geteccline(ump, sectornr, 0);
    592 	eccsect = sectornr - eccline->start_sector;
    593 
    594 	bit = (uint64_t) 1 << eccsect;
    595 	KASSERT(eccline->present & bit);
    596 
    597 	eccline->readin &= ~bit;	/* just in case */
    598 	/* XXX eccline->dirty? */
    599 
    600 	KASSERT(eccline->refcnt >= 1);
    601 	eccline->refcnt--;
    602 
    603 	udf_puteccline(eccline);
    604 }
    605 
    606 
/*
 * Read the node descriptor at location args->icb by way of the eccline cache.
 *
 * If the sector isn't present in its eccline yet it is marked for reading and
 * the call sleeps until the sector turns up as present or as failed.
 *
 * Returns, with the result in args->dscr:
 *   0 and a pointer into the eccline's blob for a valid descriptor; the line
 *     keeps a reference for it,
 *   0 and NULL when the sector has an invalid tag but is completely zero,
 *   an error and NULL when the tag or its payload is invalid or when the
 *     sector could not be read (EIO).
 * If the address can't be translated that error is returned and args->dscr
 * is left alone.
 */
static int
udf_read_nodedscr_rmw(struct udf_strat_args *args)
{
	union dscrptr   **dscrptr = &args->dscr;
	struct udf_mount *ump = args->ump;
	struct long_ad   *icb = args->icb;
	struct strat_private *priv;
	struct udf_eccline *eccline;
	uint64_t bit;
	uint32_t sectornr, dummy;
	uint8_t *pos;
	int sector_size = ump->discinfo.sector_size;
	int lb_size __diagused = udf_rw32(ump->logical_vol->lb_size);
	int i, error, dscrlen, eccsect;

	/* sector offsets in the blob are only valid if both sizes agree */
	KASSERT(sector_size == lb_size);
	error = udf_translate_vtop(ump, icb, &sectornr, &dummy);
	if (error)
		return error;

	/* get our eccline */
	eccline = udf_geteccline(ump, sectornr, 0);
	eccsect = sectornr - eccline->start_sector;

	bit = (uint64_t) 1 << eccsect;
	if ((eccline->present & bit) == 0) {
		/* mark bit for readin */
		eccline->readin |= bit;
		eccline->refcnt++;	/* prevent recycling */
		KASSERT(eccline->bufs[eccsect] == NULL);
		/* with a readin bit set this puts the line on the reading queue */
		udf_puteccline(eccline);

		/* wait for completion */
		priv = PRIV(eccline->ump);
		mutex_enter(&priv->discstrat_mutex);
		while (((eccline->present | eccline->error) & bit) == 0) {
			/* time-limited sleep; the bits are rechecked every round */
			error = cv_timedwait(&priv->discstrat_cv,
				&priv->discstrat_mutex,
				hz/8);
			if (error == EWOULDBLOCK)
				DPRINTF(LOCKING, ("eccline waiting for read\n"));
		}
		mutex_exit(&priv->discstrat_mutex);

		/* reget our line */
		eccline = udf_geteccline(ump, sectornr, 0);
		KASSERT(eccline->refcnt >= 1);
		eccline->refcnt--;	/* undo refcnt */

		if (eccline->error & bit) {
			*dscrptr = NULL;
			udf_puteccline(eccline);
			return EIO;		/* XXX error code */
		}
	}

	/* the descriptor is looked at in place, inside the line's blob */
	*dscrptr = (union dscrptr *)
		(((uint8_t *) eccline->blob) + eccsect * sector_size);

	/* code from read_phys_descr */
	/* check if its a valid tag */
	error = udf_check_tag(*dscrptr);
	if (error) {
		/* check if its an empty block */
		pos = (uint8_t *) *dscrptr;
		for (i = 0; i < sector_size; i++, pos++) {
			if (*pos) break;
		}
		if (i == sector_size) {
			/* return no error but with no dscrptr */
			error = 0;
		}
		*dscrptr = NULL;
		udf_puteccline(eccline);
		return error;
	}

	/* calculate descriptor size */
	dscrlen = udf_tagsize(*dscrptr, sector_size);
	error = udf_check_tag_payload(*dscrptr, dscrlen);
	if (error) {
		*dscrptr = NULL;
		udf_puteccline(eccline);
		return error;
	}

	/* we have a hold since it has a node descriptor */
	eccline->refcnt++;
	udf_puteccline(eccline);

	return 0;
}
    699 
    700 
    701 static int
    702 udf_write_nodedscr_rmw(struct udf_strat_args *args)
    703 {
    704 	union dscrptr    *dscrptr = args->dscr;
    705 	struct udf_mount *ump = args->ump;
    706 	struct long_ad   *icb = args->icb;
    707 	struct udf_node *udf_node = args->udf_node;
    708 	struct udf_eccline *eccline;
    709 	uint64_t bit;
    710 	uint32_t sectornr, logsectornr, dummy;
    711 	// int waitfor  = args->waitfor;
    712 	int sector_size = ump->discinfo.sector_size;
    713 	int lb_size __diagused = udf_rw32(ump->logical_vol->lb_size);
    714 	int error, eccsect;
    715 
    716 	KASSERT(sector_size == lb_size);
    717 	sectornr    = 0;
    718 	error = udf_translate_vtop(ump, icb, &sectornr, &dummy);
    719 	if (error)
    720 		return error;
    721 
    722 	/* get our eccline */
    723 	eccline = udf_geteccline(ump, sectornr, 0);
    724 	eccsect = sectornr - eccline->start_sector;
    725 
    726 	bit = (uint64_t) 1 << eccsect;
    727 
    728 	/* old callback still pending? */
    729 	if (eccline->bufs[eccsect]) {
    730 		DPRINTF(WRITE, ("udf_write_nodedscr_rmw: writing descriptor"
    731 					" over buffer?\n"));
    732 		nestiobuf_done(eccline->bufs[eccsect],
    733 				eccline->bufs_len[eccsect],
    734 				0);
    735 		eccline->bufs[eccsect] = NULL;
    736 	}
    737 
    738 	/* set sector number in the descriptor and validate */
    739 	dscrptr = (union dscrptr *)
    740 		(((uint8_t *) eccline->blob) + eccsect * sector_size);
    741 	KASSERT(dscrptr == args->dscr);
    742 
    743 	logsectornr = udf_rw32(icb->loc.lb_num);
    744 	dscrptr->tag.tag_loc = udf_rw32(logsectornr);
    745 	udf_validate_tag_and_crc_sums(dscrptr);
    746 
    747 	udf_fixup_node_internals(ump, (uint8_t *) dscrptr, UDF_C_NODE);
    748 
    749 	/* set our flags */
    750 	KASSERT(eccline->present & bit);
    751 	eccline->dirty |= bit;
    752 
    753 	KASSERT(udf_tagsize(dscrptr, sector_size) <= sector_size);
    754 
    755 	udf_node->outstanding_nodedscr--;
    756 	if (udf_node->outstanding_nodedscr == 0) {
    757 		/* XXX still using wakeup! */
    758 		UDF_UNLOCK_NODE(udf_node, 0);
    759 		cv_broadcast(&udf_node->node_lock);
    760 	}
    761 	udf_puteccline(eccline);
    762 
    763 	/* XXX waitfor not used */
    764 	return 0;
    765 }
    766 
    767 
    768 static void
    769 udf_queuebuf_rmw(struct udf_strat_args *args)
    770 {
    771 	struct udf_mount *ump = args->ump;
    772 	struct buf *buf = args->nestbuf;
    773 	struct desc_tag *tag;
    774 	struct strat_private *priv = PRIV(ump);
    775 	struct udf_eccline *eccline;
    776 	struct long_ad *node_ad_cpy;
    777 	uint64_t bit, *lmapping, *pmapping, *lmappos, *pmappos, blknr;
    778 	uint32_t buf_len, len, sectors, sectornr, our_sectornr;
    779 	uint32_t bpos;
    780 	uint16_t vpart_num;
    781 	uint8_t *fidblk, *src, *dst;
    782 	int sector_size = ump->discinfo.sector_size;
    783 	int blks = sector_size / DEV_BSIZE;
    784 	int eccsect, what, queue, error;
    785 
    786 	KASSERT(ump);
    787 	KASSERT(buf);
    788 	KASSERT(buf->b_iodone == nestiobuf_iodone);
    789 
    790 	blknr        = buf->b_blkno;
    791 	our_sectornr = blknr / blks;
    792 
    793 	what = buf->b_udf_c_type;
    794 	queue = UDF_SHED_READING;
    795 	if ((buf->b_flags & B_READ) == 0) {
    796 		/* writing */
    797 		queue = UDF_SHED_SEQWRITING;
    798 		if (what == UDF_C_ABSOLUTE)
    799 			queue = UDF_SHED_WRITING;
    800 		if (what == UDF_C_DSCR)
    801 			queue = UDF_SHED_WRITING;
    802 		if (what == UDF_C_NODE)
    803 			queue = UDF_SHED_WRITING;
    804 	}
    805 
    806 	if (queue == UDF_SHED_READING) {
    807 		DPRINTF(SHEDULE, ("\nudf_queuebuf_rmw READ %p : sector %d type %d,"
    808 			"b_resid %d, b_bcount %d, b_bufsize %d\n",
    809 			buf, (uint32_t) buf->b_blkno / blks, buf->b_udf_c_type,
    810 			buf->b_resid, buf->b_bcount, buf->b_bufsize));
    811 
    812 		/* mark bits for reading */
    813 		buf_len = buf->b_bcount;
    814 		sectornr = our_sectornr;
    815 		eccline = udf_geteccline(ump, sectornr, 0);
    816 		eccsect = sectornr - eccline->start_sector;
    817 		bpos = 0;
    818 		while (buf_len) {
    819 			len = MIN(buf_len, sector_size);
    820 			if ((eccsect < 0) || (eccsect >= ump->packet_size)) {
    821 				udf_puteccline(eccline);
    822 				eccline = udf_geteccline(ump, sectornr, 0);
    823 				eccsect = sectornr - eccline->start_sector;
    824 			}
    825 			bit = (uint64_t) 1 << eccsect;
    826 			error = eccline->error & bit ? EIO : 0;
    827 			if (eccline->present & bit) {
    828 				src = (uint8_t *) eccline->blob +
    829 					eccsect * sector_size;
    830 				dst = (uint8_t *) buf->b_data + bpos;
    831 				if (!error)
    832 					memcpy(dst, src, len);
    833 				nestiobuf_done(buf, len, error);
    834 			} else {
    835 				eccline->readin |= bit;
    836 				KASSERT(eccline->bufs[eccsect] == NULL);
    837 				eccline->bufs[eccsect] = buf;
    838 				eccline->bufs_bpos[eccsect] = bpos;
    839 				eccline->bufs_len[eccsect] = len;
    840 			}
    841 			bpos += sector_size;
    842 			eccsect++;
    843 			sectornr++;
    844 			buf_len -= len;
    845 		}
    846 		udf_puteccline(eccline);
    847 		return;
    848 	}
    849 
    850 	if (queue == UDF_SHED_WRITING) {
    851 		DPRINTF(SHEDULE, ("\nudf_queuebuf_rmw WRITE %p : sector %d "
    852 			"type %d, b_resid %d, b_bcount %d, b_bufsize %d\n",
    853 			buf, (uint32_t) buf->b_blkno / blks, buf->b_udf_c_type,
    854 			buf->b_resid, buf->b_bcount, buf->b_bufsize));
    855 
    856 		/* if we have FIDs fixup using buffer's sector number(s) */
    857 		if (buf->b_udf_c_type == UDF_C_FIDS)
    858 			panic("UDF_C_FIDS in SHED_WRITING!\n");
    859 
    860 		udf_fixup_node_internals(ump, buf->b_data, buf->b_udf_c_type);
    861 
    862 		/* copy parts into the bufs and set for writing */
    863 		buf_len = buf->b_bcount;
    864 		sectornr = our_sectornr;
    865 		eccline = udf_geteccline(ump, sectornr, 0);
    866 		eccsect = sectornr - eccline->start_sector;
    867 		bpos = 0;
    868 		while (buf_len) {
    869 			len = MIN(buf_len, sector_size);
    870 			if ((eccsect < 0) || (eccsect >= ump->packet_size)) {
    871 				udf_puteccline(eccline);
    872 				eccline = udf_geteccline(ump, sectornr, 0);
    873 				eccsect = sectornr - eccline->start_sector;
    874 			}
    875 			bit = (uint64_t) 1 << eccsect;
    876 			KASSERT((eccline->readin & bit) == 0);
    877 			eccline->present |= bit;
    878 			eccline->dirty   |= bit;
    879 			if (eccline->bufs[eccsect]) {
    880 				/* old callback still pending */
    881 				nestiobuf_done(eccline->bufs[eccsect],
    882 						eccline->bufs_len[eccsect],
    883 						0);
    884 				eccline->bufs[eccsect] = NULL;
    885 			}
    886 
    887 			src = (uint8_t *) buf->b_data + bpos;
    888 			dst = (uint8_t *) eccline->blob + eccsect * sector_size;
    889 			if (len != sector_size)
    890 				memset(dst, 0, sector_size);
    891 			memcpy(dst, src, len);
    892 
    893 			/* note that its finished for this extent */
    894 			eccline->bufs[eccsect] = NULL;
    895 			nestiobuf_done(buf, len, 0);
    896 
    897 			bpos += sector_size;
    898 			eccsect++;
    899 			sectornr++;
    900 			buf_len -= len;
    901 		}
    902 		udf_puteccline(eccline);
    903 		return;
    904 
    905 	}
    906 
    907 	/* sequential writing */
    908 	KASSERT(queue == UDF_SHED_SEQWRITING);
    909 	DPRINTF(SHEDULE, ("\nudf_queuebuf_rmw SEQWRITE %p : sector XXXX "
    910 		"type %d, b_resid %d, b_bcount %d, b_bufsize %d\n",
    911 		buf, buf->b_udf_c_type, buf->b_resid, buf->b_bcount,
    912 		buf->b_bufsize));
    913 	/*
    914 	 * Buffers should not have been allocated to disc addresses yet on
    915 	 * this queue. Note that a buffer can get multiple extents allocated.
    916 	 * Note that it *looks* like the normal writing but its different in
    917 	 * the details.
    918 	 *
    919 	 * lmapping contains lb_num relative to base partition.
    920 	 *
    921 	 * XXX should we try to claim/organize the allocated memory to
    922 	 * block-aligned pieces?
    923 	 */
    924 	mutex_enter(&priv->seqwrite_mutex);
    925 
    926 	lmapping    = ump->la_lmapping;
    927 	node_ad_cpy = ump->la_node_ad_cpy;
    928 
    929 	/* logically allocate buf and map it in the file */
    930 	udf_late_allocate_buf(ump, buf, lmapping, node_ad_cpy, &vpart_num);
    931 
    932 	/* if we have FIDs, fixup using the new allocation table */
    933 	if (buf->b_udf_c_type == UDF_C_FIDS) {
    934 		buf_len = buf->b_bcount;
    935 		bpos = 0;
    936 		lmappos = lmapping;
    937 		while (buf_len) {
    938 			sectornr = *lmappos++;
    939 			len = MIN(buf_len, sector_size);
    940 			fidblk = (uint8_t *) buf->b_data + bpos;
    941 			udf_fixup_fid_block(fidblk, sector_size,
    942 				0, len, sectornr);
    943 			bpos += len;
    944 			buf_len -= len;
    945 		}
    946 	}
    947 	if (buf->b_udf_c_type == UDF_C_METADATA_SBM) {
    948 		if (buf->b_lblkno == 0) {
    949 			/* update the tag location inside */
    950 			tag = (struct desc_tag *) buf->b_data;
    951 			tag->tag_loc = udf_rw32(*lmapping);
    952 			udf_validate_tag_and_crc_sums(buf->b_data);
    953 		}
    954 	}
    955 	udf_fixup_node_internals(ump, buf->b_data, buf->b_udf_c_type);
    956 
    957 	/*
    958 	 * Translate new mappings in lmapping to pmappings.
    959 	 * pmapping to contain lb_nums as used for disc addressing.
    960 	 */
    961 	pmapping = ump->la_pmapping;
    962 	sectors  = (buf->b_bcount + sector_size -1) / sector_size;
    963 	udf_translate_vtop_list(ump, sectors, vpart_num, lmapping, pmapping);
    964 
    965 	/* copy parts into the bufs and set for writing */
    966 	pmappos = pmapping;
    967 	buf_len = buf->b_bcount;
    968 	sectornr = *pmappos++;
    969 	eccline = udf_geteccline(ump, sectornr, ECC_SEQWRITING);
    970 	eccsect = sectornr - eccline->start_sector;
    971 	bpos = 0;
    972 	while (buf_len) {
    973 		len = MIN(buf_len, sector_size);
    974 		eccsect = sectornr - eccline->start_sector;
    975 		if ((eccsect < 0) || (eccsect >= ump->packet_size)) {
    976 			eccline->flags |= ECC_SEQWRITING;
    977 			udf_puteccline(eccline);
    978 			eccline = udf_geteccline(ump, sectornr, ECC_SEQWRITING);
    979 			eccsect = sectornr - eccline->start_sector;
    980 		}
    981 		bit = (uint64_t) 1 << eccsect;
    982 		KASSERT((eccline->readin & bit) == 0);
    983 		eccline->present |= bit;
    984 		eccline->dirty   |= bit;
    985 		eccline->bufs[eccsect] = NULL;
    986 
    987 		src = (uint8_t *) buf->b_data + bpos;
    988 		dst = (uint8_t *)
    989 			eccline->blob + eccsect * sector_size;
    990 		if (len != sector_size)
    991 			memset(dst, 0, sector_size);
    992 		memcpy(dst, src, len);
    993 
    994 		/* note that its finished for this extent */
    995 		nestiobuf_done(buf, len, 0);
    996 
    997 		bpos += sector_size;
    998 		sectornr = *pmappos++;
    999 		buf_len -= len;
   1000 	}
   1001 	eccline->flags |= ECC_SEQWRITING;
   1002 	udf_puteccline(eccline);
   1003 	mutex_exit(&priv->seqwrite_mutex);
   1004 }
   1005 
   1006 /* --------------------------------------------------------------------- */
   1007 
   1008 static void
   1009 udf_sync_caches_rmw(struct udf_strat_args *args)
   1010 {
   1011 	struct udf_mount *ump = args->ump;
   1012 
   1013 	udf_mmc_synchronise_caches(ump);
   1014 }
   1015 
   1016 /* --------------------------------------------------------------------- */
   1017 
   1018 static void
   1019 udf_shedule_read_callback(struct buf *buf)
   1020 {
   1021 	struct udf_eccline *eccline = BTOE(buf);
   1022 	struct udf_mount *ump = eccline->ump;
   1023 	uint64_t bit;
   1024 	uint8_t *src, *dst;
   1025 	int sector_size = ump->discinfo.sector_size;
   1026 	int error, i, len;
   1027 
   1028 	DPRINTF(ECCLINE, ("read callback called on buf %p\n", buf));
   1029 
   1030 	/* post process read action */
   1031 	KASSERT(eccline->flags & ECC_LOCKED);
   1032 	error = buf->b_error;
   1033 	for (i = 0; i < ump->packet_size; i++) {
   1034 		bit = (uint64_t) 1 << i;
   1035 		src = (uint8_t *) buf->b_data +   i * sector_size;
   1036 		dst = (uint8_t *) eccline->blob + i * sector_size;
   1037 		if (eccline->present & bit)
   1038 			continue;
   1039 		eccline->present |= bit;
   1040 		if (error)
   1041 			eccline->error |= bit;
   1042 		if (eccline->bufs[i]) {
   1043 			dst = (uint8_t *) eccline->bufs[i]->b_data +
   1044 				eccline->bufs_bpos[i];
   1045 			len = eccline->bufs_len[i];
   1046 			if (!error)
   1047 				memcpy(dst, src, len);
   1048 			nestiobuf_done(eccline->bufs[i], len, error);
   1049 			eccline->bufs[i] = NULL;
   1050 		}
   1051 
   1052 	}
   1053 	KASSERT(buf->b_data == eccline->blob);
   1054 	KASSERT(eccline->present == ((uint64_t) 1 << ump->packet_size)-1);
   1055 
   1056 	/*
   1057 	 * XXX TODO what to do on read errors? read in all sectors
   1058 	 * synchronously and allocate a sparable entry?
   1059 	 */
   1060 
   1061 	udf_puteccline(eccline);
   1062 	DPRINTF(ECCLINE, ("read callback finished\n"));
   1063 }
   1064 
   1065 
   1066 static void
   1067 udf_shedule_write_callback(struct buf *buf)
   1068 {
   1069 	struct udf_eccline *eccline = BTOE(buf);
   1070 	struct udf_mount *ump = eccline->ump;
   1071 	uint64_t bit;
   1072 	int error, i;
   1073 
   1074 	DPRINTF(ECCLINE, ("write callback called on buf %p\n", buf));
   1075 
   1076 	/* post process write action */
   1077 	KASSERT(eccline->flags & ECC_LOCKED);
   1078 	error = buf->b_error;
   1079 	for (i = 0; i < ump->packet_size; i++) {
   1080 		bit = (uint64_t) 1 << i;
   1081 		if ((eccline->dirty & bit) == 0)
   1082 			continue;
   1083 		if (error) {
   1084 			eccline->error |= bit;
   1085 		} else {
   1086 			eccline->dirty &= ~bit;
   1087 		}
   1088 
   1089 		KASSERT(eccline->bufs[i] == 0);
   1090 	}
   1091 	KASSERT(eccline->dirty == 0);
   1092 	KASSERT(error == 0);
   1093 
   1094 	/*
   1095 	 * XXX TODO on write errors allocate a sparable entry and reissue
   1096 	 */
   1097 
   1098 	udf_puteccline(eccline);
   1099 	DPRINTF(ECCLINE, ("write callback finished\n"));
   1100 }
   1101 
   1102 
   1103 static void
   1104 udf_issue_eccline(struct udf_eccline *eccline, int queued_on)
   1105 {
   1106 	struct udf_mount *ump = eccline->ump;
   1107 	struct strat_private *priv = PRIV(ump);
   1108 	struct buf *buf, *nestbuf;
   1109 	uint64_t bit, allbits = ((uint64_t) 1 << ump->packet_size)-1;
   1110 	uint32_t start;
   1111 	int sector_size = ump->discinfo.sector_size;
   1112 	int blks = sector_size / DEV_BSIZE;
   1113 	int i;
   1114 
   1115 	KASSERT(eccline->flags & ECC_LOCKED);
   1116 
   1117 	if (queued_on == UDF_SHED_READING) {
   1118 		DPRINTF(SHEDULE, ("udf_issue_eccline reading : "));
   1119 		/* read all bits that are not yet present */
   1120 		eccline->readin = (~eccline->present) & allbits;
   1121 		KASSERT(eccline->readin);
   1122 		start = eccline->start_sector;
   1123 		buf = eccline->buf;
   1124 		buf->b_flags    = B_READ | B_ASYNC;
   1125 		SET(buf->b_cflags, BC_BUSY);	/* mark buffer busy */
   1126 		buf->b_oflags   = 0;
   1127 		buf->b_iodone   = udf_shedule_read_callback;
   1128 		buf->b_data     = eccline->blob;
   1129 		buf->b_bcount   = ump->packet_size * sector_size;
   1130 		buf->b_resid    = buf->b_bcount;
   1131 		buf->b_bufsize  = buf->b_bcount;
   1132 		buf->b_private  = eccline;
   1133 		BIO_SETPRIO(buf, BPRIO_DEFAULT);
   1134 		buf->b_lblkno   = buf->b_blkno = buf->b_rawblkno = start * blks;
   1135 		buf->b_proc     = NULL;
   1136 
   1137 		if (eccline->present != 0) {
   1138 			for (i = 0; i < ump->packet_size; i++) {
   1139 				bit = (uint64_t) 1 << i;
   1140 				if (eccline->present & bit) {
   1141 					nestiobuf_done(buf, sector_size, 0);
   1142 					continue;
   1143 				}
   1144 				nestbuf = getiobuf(NULL, true);
   1145 				nestiobuf_setup(buf, nestbuf, i * sector_size,
   1146 					sector_size);
   1147 				/* adjust blocknumber to read */
   1148 				nestbuf->b_blkno = buf->b_blkno + i*blks;
   1149 				nestbuf->b_rawblkno = buf->b_rawblkno + i*blks;
   1150 
   1151 				DPRINTF(SHEDULE, ("sector %d ", start + i));
   1152 
   1153 				/* mutex dance since it could lock */
   1154 				mutex_exit(&priv->discstrat_mutex);
   1155 					/* call asynchronous */
   1156 					VOP_STRATEGY(ump->devvp, nestbuf);
   1157 				mutex_enter(&priv->discstrat_mutex);
   1158 			}
   1159 			DPRINTF(SHEDULE, ("\n"));
   1160 			return;
   1161 		}
   1162 	} else {
   1163 		/* write or seqwrite */
   1164 		DPRINTF(SHEDULE, ("udf_issue_eccline writing or seqwriting : "));
   1165 		DPRINTF(SHEDULE, ("\n\tpresent %"PRIx64", readin %"PRIx64", "
   1166 			"dirty %"PRIx64"\n\t", eccline->present, eccline->readin,
   1167 			eccline->dirty));
   1168 		KASSERT(eccline->present == allbits);
   1169 
   1170 		start = eccline->start_sector;
   1171 		buf = eccline->buf;
   1172 		buf->b_flags    = B_WRITE | B_ASYNC;
   1173 		SET(buf->b_cflags, BC_BUSY);	/* mark buffer busy */
   1174 		buf->b_oflags   = 0;
   1175 		buf->b_iodone   = udf_shedule_write_callback;
   1176 		buf->b_data     = eccline->blob;
   1177 		buf->b_bcount   = ump->packet_size * sector_size;
   1178 		buf->b_resid    = buf->b_bcount;
   1179 		buf->b_bufsize  = buf->b_bcount;
   1180 		buf->b_private  = eccline;
   1181 		BIO_SETPRIO(buf, BPRIO_DEFAULT);
   1182 		buf->b_lblkno   = buf->b_blkno = buf->b_rawblkno = start * blks;
   1183 		buf->b_proc     = NULL;
   1184 	}
   1185 
   1186 	/* mutex dance since it could lock */
   1187 	mutex_exit(&priv->discstrat_mutex);
   1188 		/* call asynchronous */
   1189 		DPRINTF(SHEDULE, ("sector %d for %d\n",
   1190 			start, ump->packet_size));
   1191 		VOP_STRATEGY(ump->devvp, buf);
   1192 	mutex_enter(&priv->discstrat_mutex);
   1193 }
   1194 
   1195 
   1196 static void
   1197 udf_discstrat_thread(void *arg)
   1198 {
   1199 	struct udf_mount *ump = (struct udf_mount *) arg;
   1200 	struct strat_private *priv = PRIV(ump);
   1201 	struct udf_eccline *eccline;
   1202 	struct timespec now, *last;
   1203 	uint64_t allbits = ((uint64_t) 1 << ump->packet_size)-1;
   1204 	int new_queue, wait, work;
   1205 
   1206 	work = 1;
   1207 	priv->thread_running = 1;
   1208 	cv_broadcast(&priv->discstrat_cv);
   1209 
   1210 	mutex_enter(&priv->discstrat_mutex);
   1211 	priv->num_floating = 0;
   1212 	while (priv->run_thread || work || priv->num_floating) {
   1213 		/* get our time */
   1214 		vfs_timestamp(&now);
   1215 
   1216 		/* maintenance: handle eccline state machine */
   1217 		for(;;) {
   1218 			/* only peek at it */
   1219 			eccline = udf_peek_eccline(priv, UDF_SHED_WAITING);
   1220 			if (eccline == NULL)
   1221 				break;
   1222 
   1223 			/* if not reading, wait until the time has come */
   1224 			if ((priv->cur_queue != UDF_SHED_READING) &&
   1225 				(eccline->wait_time.tv_sec - now.tv_sec > 0)) {
   1226 					UDF_UNLOCK_ECCLINE(eccline);
   1227 					/* all others are later, so break off */
   1228 					break;
   1229 			}
   1230 
   1231 			/* release */
   1232 			UDF_UNLOCK_ECCLINE(eccline);
   1233 
   1234 			/* do get it */
   1235 			eccline = udf_pop_eccline(priv, UDF_SHED_WAITING);
   1236 
   1237 			/* requeue according to state */
   1238 			new_queue = UDF_SHED_FREE;	/* unlikely */
   1239 			if (eccline->refcnt > 0)
   1240 				new_queue = UDF_SHED_IDLE;
   1241 			if (eccline->flags & ECC_WANTED)
   1242 				new_queue = UDF_SHED_IDLE;
   1243 			if (eccline->readin)
   1244 				new_queue = UDF_SHED_READING;
   1245 			if (eccline->dirty) {
   1246 				new_queue = UDF_SHED_READING;
   1247 				if (eccline->present == allbits) {
   1248 					new_queue = UDF_SHED_WRITING;
   1249 					if (eccline->flags & ECC_SEQWRITING)
   1250 						new_queue = UDF_SHED_SEQWRITING;
   1251 				}
   1252 			}
   1253 			udf_push_eccline(eccline, new_queue);
   1254 		}
   1255 
   1256 		/* maintenance: free excess ecclines */
   1257 		while (priv->num_queued[UDF_SHED_FREE] > UDF_ECCLINE_MAXFREE) {
   1258 			eccline = udf_pop_eccline(priv, UDF_SHED_FREE);
   1259 			KASSERT(eccline);
   1260 			KASSERT(eccline->refcnt == 0);
   1261 			if (eccline->flags & ECC_WANTED) {
   1262 				/* we won the race, but we dont want to win */
   1263 				DPRINTF(ECCLINE, ("Tried removing, pushed back to free list\n"));
   1264 				udf_push_eccline(eccline, UDF_SHED_IDLE);
   1265 			} else {
   1266 				DPRINTF(ECCLINE, ("Removing entry from free list\n"));
   1267 				udf_dispose_eccline(eccline);
   1268 			}
   1269 		}
   1270 
   1271 		/* process the current selected queue */
   1272 		/* get our time */
   1273 		vfs_timestamp(&now);
   1274 		last = &priv->last_queued[priv->cur_queue];
   1275 
   1276 		/* get our line */
   1277 		eccline = udf_pop_eccline(priv, priv->cur_queue);
   1278 		if (eccline) {
   1279 			wait = 0;
   1280 			new_queue = priv->cur_queue;
   1281 			DPRINTF(ECCLINE, ("UDF_ISSUE_ECCLINE\n"));
   1282 
   1283 			udf_issue_eccline(eccline, priv->cur_queue);
   1284 		} else {
   1285 			/* don't switch too quickly */
   1286 			if (now.tv_sec - last->tv_sec < 2) {
   1287 				/* wait some time */
   1288 				cv_timedwait(&priv->discstrat_cv,
   1289 					&priv->discstrat_mutex, hz);
   1290 				/* we assume there is work to be done */
   1291 				work = 1;
   1292 				continue;
   1293 			}
   1294 
   1295 			/* XXX select on queue lengths ? */
   1296 			wait = 1;
   1297 			/* check if we can/should switch */
   1298 			new_queue = priv->cur_queue;
   1299 			if (bufq_peek(priv->queues[UDF_SHED_READING]))
   1300 				new_queue = UDF_SHED_READING;
   1301 			if (bufq_peek(priv->queues[UDF_SHED_WRITING]))
   1302 				new_queue = UDF_SHED_WRITING;
   1303 			if (bufq_peek(priv->queues[UDF_SHED_SEQWRITING]))
   1304 				new_queue = UDF_SHED_SEQWRITING;
   1305 		}
   1306 
   1307 		/* give room */
   1308 		mutex_exit(&priv->discstrat_mutex);
   1309 
   1310 		if (new_queue != priv->cur_queue) {
   1311 			wait = 0;
   1312 			DPRINTF(SHEDULE, ("switching from %d to %d\n",
   1313 				priv->cur_queue, new_queue));
   1314 			priv->cur_queue = new_queue;
   1315 		}
   1316 		mutex_enter(&priv->discstrat_mutex);
   1317 
   1318 		/* wait for more if needed */
   1319 		if (wait)
   1320 			cv_timedwait(&priv->discstrat_cv,
   1321 				&priv->discstrat_mutex, hz/4);	/* /8 */
   1322 
   1323 		work  = (bufq_peek(priv->queues[UDF_SHED_WAITING]) != NULL);
   1324 		work |= (bufq_peek(priv->queues[UDF_SHED_READING]) != NULL);
   1325 		work |= (bufq_peek(priv->queues[UDF_SHED_WRITING]) != NULL);
   1326 		work |= (bufq_peek(priv->queues[UDF_SHED_SEQWRITING]) != NULL);
   1327 
   1328 		DPRINTF(PARANOIA, ("work : (%d, %d, %d) -> work %d, float %d\n",
   1329 			(bufq_peek(priv->queues[UDF_SHED_READING]) != NULL),
   1330 			(bufq_peek(priv->queues[UDF_SHED_WRITING]) != NULL),
   1331 			(bufq_peek(priv->queues[UDF_SHED_SEQWRITING]) != NULL),
   1332 			work, priv->num_floating));
   1333 	}
   1334 
   1335 	mutex_exit(&priv->discstrat_mutex);
   1336 
   1337 	/* tear down remaining ecclines */
   1338 	mutex_enter(&priv->discstrat_mutex);
   1339 	KASSERT(bufq_peek(priv->queues[UDF_SHED_WAITING]) == NULL);
   1340 	KASSERT(bufq_peek(priv->queues[UDF_SHED_IDLE]) == NULL);
   1341 	KASSERT(bufq_peek(priv->queues[UDF_SHED_READING]) == NULL);
   1342 	KASSERT(bufq_peek(priv->queues[UDF_SHED_WRITING]) == NULL);
   1343 	KASSERT(bufq_peek(priv->queues[UDF_SHED_SEQWRITING]) == NULL);
   1344 
   1345 	KASSERT(priv->num_queued[UDF_SHED_WAITING] == 0);
   1346 	KASSERT(priv->num_queued[UDF_SHED_IDLE] == 0);
   1347 	KASSERT(priv->num_queued[UDF_SHED_READING] == 0);
   1348 	KASSERT(priv->num_queued[UDF_SHED_WRITING] == 0);
   1349 	KASSERT(priv->num_queued[UDF_SHED_SEQWRITING] == 0);
   1350 
   1351 	eccline = udf_pop_eccline(priv, UDF_SHED_FREE);
   1352 	while (eccline) {
   1353 		udf_dispose_eccline(eccline);
   1354 		eccline = udf_pop_eccline(priv, UDF_SHED_FREE);
   1355 	}
   1356 	KASSERT(priv->num_queued[UDF_SHED_FREE] == 0);
   1357 	mutex_exit(&priv->discstrat_mutex);
   1358 
   1359 	priv->thread_running  = 0;
   1360 	priv->thread_finished = 1;
   1361 	cv_broadcast(&priv->discstrat_cv);
   1362 
   1363 	kthread_exit(0);
   1364 	/* not reached */
   1365 }
   1366 
   1367 /* --------------------------------------------------------------------- */
   1368 
   1369 /*
   1370  * Buffer memory pool allocator.
   1371  */
   1372 
   1373 static void *
   1374 ecclinepool_page_alloc(struct pool *pp, int flags)
   1375 {
   1376         return (void *)uvm_km_alloc(kernel_map,
   1377             MAXBSIZE, MAXBSIZE,
   1378             ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK)
   1379 	    	| UVM_KMF_WIRED /* UVM_KMF_PAGABLE? */);
   1380 }
   1381 
   1382 static void
   1383 ecclinepool_page_free(struct pool *pp, void *v)
   1384 {
   1385         uvm_km_free(kernel_map, (vaddr_t)v, MAXBSIZE, UVM_KMF_WIRED);
   1386 }
   1387 
   1388 static struct pool_allocator ecclinepool_allocator = {
   1389         .pa_alloc = ecclinepool_page_alloc,
   1390         .pa_free  = ecclinepool_page_free,
   1391         .pa_pagesz = MAXBSIZE,
   1392 };
   1393 
   1394 
   1395 static void
   1396 udf_discstrat_init_rmw(struct udf_strat_args *args)
   1397 {
   1398 	struct udf_mount *ump = args->ump;
   1399 	struct strat_private *priv = PRIV(ump);
   1400 	uint32_t lb_size, blobsize, hashline;
   1401 	int i;
   1402 
   1403 	KASSERT(ump);
   1404 	KASSERT(ump->logical_vol);
   1405 	KASSERT(priv == NULL);
   1406 
   1407 	lb_size = udf_rw32(ump->logical_vol->lb_size);
   1408 	blobsize = ump->packet_size * lb_size;
   1409 	KASSERT(lb_size > 0);
   1410 	KASSERT(ump->packet_size <= 64);
   1411 
   1412 	/* initialise our memory space */
   1413 	ump->strategy_private = malloc(sizeof(struct strat_private),
   1414 		M_UDFTEMP, M_WAITOK);
   1415 	priv = ump->strategy_private;
   1416 	memset(priv, 0 , sizeof(struct strat_private));
   1417 
   1418 	/* initialise locks */
   1419 	cv_init(&priv->discstrat_cv, "udfstrat");
   1420 	mutex_init(&priv->discstrat_mutex, MUTEX_DEFAULT, IPL_NONE);
   1421 	mutex_init(&priv->seqwrite_mutex, MUTEX_DEFAULT, IPL_NONE);
   1422 
   1423 	/* initialise struct eccline pool */
   1424 	pool_init(&priv->eccline_pool, sizeof(struct udf_eccline),
   1425 		0, 0, 0, "udf_eccline_pool", NULL, IPL_NONE);
   1426 
   1427 	/* initialise eccline blob pool */
   1428         ecclinepool_allocator.pa_pagesz = blobsize;
   1429 	pool_init(&priv->ecclineblob_pool, blobsize,
   1430 		0, 0, 0, "udf_eccline_blob", &ecclinepool_allocator, IPL_NONE);
   1431 
   1432 	/* initialise main queues */
   1433 	for (i = 0; i < UDF_SHED_MAX; i++) {
   1434 		priv->num_queued[i] = 0;
   1435 		vfs_timestamp(&priv->last_queued[i]);
   1436 	}
   1437 	bufq_alloc(&priv->queues[UDF_SHED_WAITING], "fcfs",
   1438 		BUFQ_SORT_RAWBLOCK);
   1439 	bufq_alloc(&priv->queues[UDF_SHED_READING], "disksort",
   1440 		BUFQ_SORT_RAWBLOCK);
   1441 	bufq_alloc(&priv->queues[UDF_SHED_WRITING], "disksort",
   1442 		BUFQ_SORT_RAWBLOCK);
   1443 	bufq_alloc(&priv->queues[UDF_SHED_SEQWRITING], "disksort", 0);
   1444 
   1445 	/* initialise administrative queues */
   1446 	bufq_alloc(&priv->queues[UDF_SHED_IDLE], "fcfs", 0);
   1447 	bufq_alloc(&priv->queues[UDF_SHED_FREE], "fcfs", 0);
   1448 
   1449 	for (hashline = 0; hashline < UDF_ECCBUF_HASHSIZE; hashline++) {
   1450 		LIST_INIT(&priv->eccline_hash[hashline]);
   1451 	}
   1452 
   1453 	/* create our disk strategy thread */
   1454 	priv->cur_queue = UDF_SHED_READING;
   1455 	priv->thread_finished = 0;
   1456 	priv->thread_running  = 0;
   1457 	priv->run_thread      = 1;
   1458 	if (kthread_create(PRI_NONE, 0 /* KTHREAD_MPSAFE*/, NULL /* cpu_info*/,
   1459 		udf_discstrat_thread, ump, &priv->queue_lwp,
   1460 		"%s", "udf_rw")) {
   1461 		panic("fork udf_rw");
   1462 	}
   1463 
   1464 	/* wait for thread to spin up */
   1465 	mutex_enter(&priv->discstrat_mutex);
   1466 	while (!priv->thread_running) {
   1467 		cv_timedwait(&priv->discstrat_cv, &priv->discstrat_mutex, hz);
   1468 	}
   1469 	mutex_exit(&priv->discstrat_mutex);
   1470 }
   1471 
   1472 
   1473 static void
   1474 udf_discstrat_finish_rmw(struct udf_strat_args *args)
   1475 {
   1476 	struct udf_mount *ump = args->ump;
   1477 	struct strat_private *priv = PRIV(ump);
   1478 
   1479 	if (ump == NULL)
   1480 		return;
   1481 
   1482 	/* stop our sheduling thread */
   1483 	KASSERT(priv->run_thread == 1);
   1484 	priv->run_thread = 0;
   1485 
   1486 	mutex_enter(&priv->discstrat_mutex);
   1487 	while (!priv->thread_finished) {
   1488 		cv_broadcast(&priv->discstrat_cv);
   1489 		cv_timedwait(&priv->discstrat_cv, &priv->discstrat_mutex, hz);
   1490 	}
   1491 	mutex_exit(&priv->discstrat_mutex);
   1492 
   1493 	/* kthread should be finished now */
   1494 	cv_destroy(&priv->discstrat_cv);
   1495 	mutex_destroy(&priv->discstrat_mutex);
   1496 	mutex_destroy(&priv->seqwrite_mutex);
   1497 
   1498 	/* cleanup our pools */
   1499 	pool_destroy(&priv->eccline_pool);
   1500 	pool_destroy(&priv->ecclineblob_pool);
   1501 
   1502 	/* free our private space */
   1503 	free(ump->strategy_private, M_UDFTEMP);
   1504 	ump->strategy_private = NULL;
   1505 }
   1506 
   1507 /* --------------------------------------------------------------------- */
   1508 
/*
 * Strategy descriptor that exports the read-modify-write implementation in
 * this file.  The initializer is positional: the order of the entries has
 * to follow the member order of struct udf_strategy, which is declared
 * outside this file.
 */
struct udf_strategy udf_strat_rmw =
{
	udf_create_nodedscr_rmw,
	udf_free_nodedscr_rmw,
	udf_read_nodedscr_rmw,
	udf_write_nodedscr_rmw,
	udf_queuebuf_rmw,
	udf_sync_caches_rmw,		/* forwards to udf_mmc_synchronise_caches() */
	udf_discstrat_init_rmw,		/* sets up private state, starts the thread */
	udf_discstrat_finish_rmw	/* stops the thread, frees private state */
};
   1520 
   1521