      1 /*      $NetBSD: xbdback_xenbus.c,v 1.107 2024/06/20 15:17:27 bouyer Exp $      */
      2 
      3 /*
      4  * Copyright (c) 2006,2024 Manuel Bouyer.
      5  *
      6  * Redistribution and use in source and binary forms, with or without
      7  * modification, are permitted provided that the following conditions
      8  * are met:
      9  * 1. Redistributions of source code must retain the above copyright
     10  *    notice, this list of conditions and the following disclaimer.
     11  * 2. Redistributions in binary form must reproduce the above copyright
     12  *    notice, this list of conditions and the following disclaimer in the
     13  *    documentation and/or other materials provided with the distribution.
     14  *
     15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
     16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
     17  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
     18  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
     19  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
     20  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     21  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     22  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
     24  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25  *
     26  */
     27 
     28 #include <sys/cdefs.h>
     29 __KERNEL_RCSID(0, "$NetBSD: xbdback_xenbus.c,v 1.107 2024/06/20 15:17:27 bouyer Exp $");
     30 
     31 #include <sys/buf.h>
     32 #include <sys/condvar.h>
     33 #include <sys/conf.h>
     34 #include <sys/disk.h>
     35 #include <sys/device.h>
     36 #include <sys/fcntl.h>
     37 #include <sys/kauth.h>
     38 #include <sys/kernel.h>
     39 #include <sys/kmem.h>
     40 #include <sys/kthread.h>
     41 #include <sys/mutex.h>
     42 #include <sys/param.h>
     43 #include <sys/queue.h>
     44 #include <sys/systm.h>
     45 #include <sys/time.h>
     46 #include <sys/types.h>
     47 #include <sys/vnode.h>
     48 
     49 #include <xen/intr.h>
     50 #include <xen/hypervisor.h>
     51 #include <xen/xen.h>
     52 #include <xen/xen_shm.h>
     53 #include <xen/evtchn.h>
     54 #include <xen/xenbus.h>
     55 #include <xen/xenring.h>
     56 #include <xen/include/public/io/protocols.h>
     57 
     58 /* #define XENDEBUG_VBD */
     59 #ifdef XENDEBUG_VBD
     60 #define XENPRINTF(x) printf x
     61 #else
     62 #define XENPRINTF(x)
     63 #endif
     64 
     65 #define BLKIF_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
     66 
     67 /*
     68  * Backend block device driver for Xen
     69  */
     70 
     71 /* Values are expressed in 512-byte sectors */
     72 #define VBD_BSIZE 512
     73 #define VBD_MAXSECT ((PAGE_SIZE / VBD_BSIZE) - 1)
     74 
     75 #define VBD_VA_SIZE			MAXPHYS
     76 #define VBD_MAX_INDIRECT_SEGMENTS	(VBD_VA_SIZE >> PAGE_SHIFT)
     77 
     78 CTASSERT(XENSHM_MAX_PAGES_PER_REQUEST >= VBD_MAX_INDIRECT_SEGMENTS);
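         /*
          * Worked example, assuming the usual x86 values of PAGE_SIZE = 4096
          * and MAXPHYS = 64 KB: VBD_MAXSECT is (4096 / 512) - 1 = 7, i.e.
          * sectors 0..7 within a page, and VBD_MAX_INDIRECT_SEGMENTS is
          * 65536 >> 12 = 16, so a single request can map at most 16 pages
          * (64 KB) of data.
          */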
     79 
     80 struct xbdback_instance;
     81 
     82 /*
     83  * status of a xbdback instance:
     84  * WAITING: xbdback instance is connected, waiting for requests
     85  * RUN: xbdi thread must be woken up, I/Os have to be processed
     86  * DISCONNECTING: the instance is closing, no more I/Os can be scheduled
     87  * DISCONNECTED: no I/Os, no ring, the thread should terminate.
     88  */
     89 typedef enum {WAITING, RUN, DISCONNECTING, DISCONNECTED} xbdback_state_t;
     90 
     91 /*
     92  * Each xbdback instance is managed by a single thread that handles all
     93  * the I/O processing. As there are a variety of conditions that can block,
     94  * everything will be done in a sort of continuation-passing style.
     95  *
     96  * When the execution has to block to delay processing, for example to
      97  * allow the system to recover from a memory shortage (via the shared
      98  * memory callback), the return value of a continuation can be set to NULL.
      99  * In that case, the thread will go back to sleep and wait for the proper
     100  * condition before it starts processing requests again from where it left off.
    101  * Continuation state is "stored" in the xbdback instance (xbdi_cont),
    102  * and should only be manipulated by the instance thread.
    103  * If a continuation has to be restarted from a specific point,
    104  * the callback and argument can be stored in xbdi_cont_restart and
     105  * xbdi_cont_restart_obj.
    106  *
    107  *
     108  * As xbdback(4) has to handle different sorts of asynchronous events (Xen
    109  * event channels, biointr() soft interrupts, xenbus commands), the xbdi_lock
    110  * mutex is used to protect specific elements of the xbdback instance from
    111  * concurrent access: thread status and ring access (when pushing responses).
    112  *
    113  * Here's how the call graph is supposed to be for a single I/O:
    114  *
    115  * xbdback_co_main()
    116  *        |               --> xbdback_co_cache_flush()
    117  *        |               |    |
    118  *        |               |    -> xbdback_co_do_io() or NULL
    119  * xbdback_co_main_loop()-|
    120  *        |               |-> xbdback_co_main_done2() or NULL
    121  *        |               |
    122  *        |               --> xbdback_co_main_incr() -> xbdback_co_main_loop()
    123  *        |
    124  *     xbdback_co_io() -> xbdback_co_main_incr() -> xbdback_co_main_loop()
    125  *        |
    126  *     xbdback_co_io_gotio() -> xbdback_co_main_incr() -> xbdback_co_main_loop()
    127  *        |
    128  *     xbdback_co_do_io()
    129  *        |
    130  *     xbdback_co_main_incr() -> xbdback_co_main_loop()
    131  */
    132 typedef void *(* xbdback_cont_t)(struct xbdback_instance *, void *);
    133 
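         /*
          * Illustrative sketch of the calling convention described above, as a
          * trampoline loop would consume it.  This block is never compiled; the
          * real implementation is xbdback_trampoline() below, and the name
          * example_trampoline is purely hypothetical.
          */
         #if 0
         static void
         example_trampoline(struct xbdback_instance *xbdi, void *obj)
         {
         	KASSERT(mutex_owned(&xbdi->xbdi_lock));
         	/*
         	 * Each continuation stores the next step in xbdi->xbdi_cont and
         	 * returns the object to pass to it; a NULL return (or a NULL
         	 * xbdi_cont) pauses processing until the thread is woken up again.
         	 */
         	while (obj != NULL && xbdi->xbdi_cont != NULL) {
         		xbdback_cont_t cont = xbdi->xbdi_cont;
         		obj = (*cont)(xbdi, obj);
         	}
         }
         #endif
         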
    134 enum xbdi_proto {
    135 	XBDIP_NATIVE,
    136 	XBDIP_32,
    137 	XBDIP_64
    138 };
    139 
    140 struct xbdback_va {
    141 	SLIST_ENTRY(xbdback_va) xv_next;
    142 	vaddr_t xv_vaddr;
    143 };
    144 
    145 /*
    146  * For each I/O operation associated with one of those requests, an
    147  * xbdback_io is allocated from a pool.  It may correspond to multiple
    148  * Xen disk requests, or parts of them, if several arrive at once that
    149  * can be coalesced.
    150  */
    151 struct xbdback_io {
    152 	SLIST_ENTRY(xbdback_io) xio_next;
    153 	/* The instance pointer is duplicated for convenience. */
    154 	struct xbdback_instance *xio_xbdi; /* our xbd instance */
    155 	/* _request state: track requests fetched from ring */
    156 	blkif_request_t xio_xen_req;
    157 	/* array of segments[VBD_MAX_INDIRECT_SEGMENTS] allocated separately */
    158 	struct blkif_request_segment *xio_seg;
    159 	bus_dmamap_t xio_seg_dmamap;
    160 	/* internal states */
    161 	union {
    162 		struct {
    163 			struct buf xio_buf; /* our I/O */
    164 			/* the virtual address to map the request at */
    165 			vaddr_t xio_vaddr;
    166 			struct xbdback_va *xio_xv;
    167 			vaddr_t xio_start_offset;	/* I/O start offset */
    168 			/* grants to map */
    169 			grant_ref_t xio_gref[VBD_MAX_INDIRECT_SEGMENTS];
     170 			/* grant handles, to release the mappings */
    171 			grant_handle_t xio_gh[VBD_MAX_INDIRECT_SEGMENTS];
    172 			bool xio_need_bounce; /* request is not contiguous */
    173 		} xio_rw;
    174 	} u;
    175 };
    176 #define xio_buf		u.xio_rw.xio_buf
    177 #define xio_vaddr	u.xio_rw.xio_vaddr
    178 #define xio_start_offset	u.xio_rw.xio_start_offset
    179 #define xio_xv		u.xio_rw.xio_xv
    180 #define xio_gref	u.xio_rw.xio_gref
    181 #define xio_gh		u.xio_rw.xio_gh
    182 #define xio_need_bounce	u.xio_rw.xio_need_bounce
    183 
    184 /* we keep the xbdback instances in a linked list */
    185 struct xbdback_instance {
    186 	SLIST_ENTRY(xbdback_instance) next;
    187 	struct xenbus_device *xbdi_xbusd; /* our xenstore entry */
    188 	struct xenbus_watch xbdi_watch; /* to watch our store */
    189 	domid_t xbdi_domid;	/* attached to this domain */
    190 	uint32_t xbdi_handle;	/* domain-specific handle */
    191 	char xbdi_name[16];	/* name of this instance */
    192 	/* mutex that protects concurrent access to the xbdback instance */
    193 	kmutex_t xbdi_lock;
    194 	kcondvar_t xbdi_cv;	/* wait channel for thread work */
    195 	xbdback_state_t xbdi_status; /* thread's status */
    196 	/* context and KVA for mapping transfers */
    197 	struct xbdback_io xbdi_io[BLKIF_RING_SIZE];
    198 	SLIST_HEAD(, xbdback_io) xbdi_io_free;
    199 	struct xbdback_va xbdi_va[BLKIF_RING_SIZE];
    200 	SLIST_HEAD(, xbdback_va) xbdi_va_free;
    201 	/* segments structure allocated in page-aligned chunks */
    202 	struct blkif_request_segment *xbdi_segs;
    203 	/* bounce buffer in case a transfer is not contiguous */
    204 	vaddr_t xbdi_bouncebuf;
    205 	int xbdi_bouncebuf_use; /* is bounce buffer in use? */
    206 	/* backing device parameters */
    207 	dev_t xbdi_dev;
    208 	const struct bdevsw *xbdi_bdevsw; /* pointer to the device's bdevsw */
    209 	struct vnode *xbdi_vp;
    210 	uint64_t xbdi_size;
     211 	bool xbdi_ro; /* is device read-only? */
    212 	/* parameters for the communication */
    213 	unsigned int xbdi_evtchn;
    214 	struct intrhand *xbdi_ih;
    215 	/* private parameters for communication */
    216 	blkif_back_ring_proto_t xbdi_ring;
    217 	enum xbdi_proto xbdi_proto;
    218 	grant_handle_t xbdi_ring_handle; /* to unmap the ring */
    219 	vaddr_t xbdi_ring_va; /* to unmap the ring */
    220 	/* disconnection must be postponed until all I/O is done */
    221 	int xbdi_refcnt;
    222 	/*
    223 	 * State for I/O processing/coalescing follows; this has to
    224 	 * live here instead of on the stack because of the
    225 	 * continuation-ness (see above).
    226 	 */
    227 	RING_IDX xbdi_req_prod; /* limit on request indices */
    228 	xbdback_cont_t xbdi_cont;
    229 	/* if not NULL, will restart here after thread wakes up */
    230 	xbdback_cont_t xbdi_cont_restart;
    231 	void *xbdi_cont_restart_obj;
    232 	/* other state */
     233 	uint xbdi_pendingreqs; /* number of I/Os in flight */
    234 	struct timeval xbdi_lasterr_time;    /* error time tracking */
    235 };
    236 /* Manipulation of the above reference count. */
    237 #define xbdi_get(xbdip) 					\
    238 do {								\
    239 	KASSERT(mutex_owned(&xbdip->xbdi_lock));		\
    240 	(xbdip)->xbdi_refcnt++;					\
    241 } while (0)
    242 
    243 #define xbdi_put(xbdip)						\
    244 do {								\
    245 	KASSERT(mutex_owned(&xbdip->xbdi_lock));		\
    246 	if (--((xbdip)->xbdi_refcnt) == 0)  			\
    247                xbdback_finish_disconnect(xbdip);		\
    248 } while (0)
    249 
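         /*
          * Illustrative sketch of how the reference count is used (never
          * compiled): an in-flight I/O takes a reference before being scheduled
          * and drops it from its completion path, always with xbdi_lock held;
          * the final xbdi_put() calls xbdback_finish_disconnect().
          */
         #if 0
         	mutex_enter(&xbdi->xbdi_lock);
         	xbdi_get(xbdi);		/* pin the instance for this I/O */
         	/* ... hand the I/O to the disk driver ... */
         	mutex_exit(&xbdi->xbdi_lock);
         
         	/* later, from the completion path, with xbdi_lock held again: */
         	xbdi_put(xbdi);		/* the last reference finishes the disconnect */
         #endif
         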
    250 static SLIST_HEAD(, xbdback_instance) xbdback_instances;
    251 static kmutex_t xbdback_lock;
    252 
    253 /* Interval between reports of I/O errors from frontend */
    254 static const struct timeval xbdback_err_intvl = { 1, 0 };
    255 
    256        void xbdbackattach(int);
    257 static int  xbdback_xenbus_create(struct xenbus_device *);
    258 static int  xbdback_xenbus_destroy(void *);
    259 static void xbdback_frontend_changed(void *, XenbusState);
    260 static void xbdback_backend_changed(struct xenbus_watch *,
    261     const char **, unsigned int);
    262 static int  xbdback_evthandler(void *);
    263 
    264 static int  xbdback_connect(struct xbdback_instance *);
    265 static void xbdback_disconnect(struct xbdback_instance *);
    266 static void xbdback_finish_disconnect(struct xbdback_instance *);
    267 
    268 static bool xbdif_lookup(domid_t, uint32_t);
    269 
    270 static void *xbdback_co_main(struct xbdback_instance *, void *);
    271 static void *xbdback_co_main_loop(struct xbdback_instance *, void *);
    272 static void *xbdback_co_main_incr(struct xbdback_instance *, void *);
    273 static void *xbdback_co_main_done2(struct xbdback_instance *, void *);
    274 
    275 static void *xbdback_co_cache_flush(struct xbdback_instance *, void *);
    276 
    277 static void *xbdback_co_io(struct xbdback_instance *, void *);
    278 static void *xbdback_co_io_gotio(struct xbdback_instance *, void *);
    279 
    280 static void *xbdback_co_do_io(struct xbdback_instance *, void *);
    281 
    282 static void xbdback_io_error(struct xbdback_io *, int);
    283 static void xbdback_iodone(struct buf *);
    284 static void xbdback_iodone_locked(struct xbdback_instance *,
    285 		struct xbdback_io *, struct buf *);
    286 static void xbdback_send_reply(struct xbdback_instance *, uint64_t , int , int);
    287 
    288 static int  xbdback_map_shm(struct xbdback_io *);
    289 static void xbdback_unmap_shm(struct xbdback_io *);
    290 
    291 static struct xbdback_io *xbdback_io_get(struct xbdback_instance *);
    292 static void xbdback_io_put(struct xbdback_instance *, struct xbdback_io *);
    293 static void xbdback_thread(void *);
    294 static void xbdback_wakeup_thread(struct xbdback_instance *);
    295 static void xbdback_trampoline(struct xbdback_instance *, void *);
    296 
    297 static struct xenbus_backend_driver xbd_backend_driver = {
    298 	.xbakd_create = xbdback_xenbus_create,
    299 	.xbakd_type = "vbd"
    300 };
    301 
    302 void
    303 xbdbackattach(int n)
    304 {
    305 	XENPRINTF(("xbdbackattach\n"));
    306 
    307 	/*
     308 	 * initialize the backend driver, register the control message handler,
     309 	 * and send a driver-up message.
    310 	 */
    311 	SLIST_INIT(&xbdback_instances);
    312 	mutex_init(&xbdback_lock, MUTEX_DEFAULT, IPL_NONE);
    313 
    314 	xenbus_backend_register(&xbd_backend_driver);
    315 }
    316 
    317 static int
    318 xbdback_xenbus_create(struct xenbus_device *xbusd)
    319 {
    320 	struct xbdback_instance *xbdi;
    321 	long domid, handle;
    322 	int error, i;
    323 	int segalloc = 0;
    324 	char *ep;
    325 
    326 	if ((error = xenbus_read_ul(NULL, xbusd->xbusd_path,
    327 	    "frontend-id", &domid, 10)) != 0) {
    328 		aprint_error("xbdback: can't read %s/frontend-id: %d\n",
    329 		    xbusd->xbusd_path, error);
    330 		return error;
    331 	}
    332 
    333 	/*
     334 	 * get handle: this is the last component of the path, which is
    335 	 * a decimal number. $path/dev contains the device name, which is not
    336 	 * appropriate.
    337 	 */
    338 	for (i = strlen(xbusd->xbusd_path); i > 0; i--) {
    339 		if (xbusd->xbusd_path[i] == '/')
    340 			break;
    341 	}
    342 	if (i == 0) {
    343 		aprint_error("xbdback: can't parse %s\n",
    344 		    xbusd->xbusd_path);
    345 		return EFTYPE;
    346 	}
    347 	handle = strtoul(&xbusd->xbusd_path[i+1], &ep, 10);
    348 	if (*ep != '\0') {
    349 		aprint_error("xbdback: can't parse %s\n",
    350 		    xbusd->xbusd_path);
    351 		return EFTYPE;
    352 	}
    353 
    354 	xbdi = kmem_zalloc(sizeof(*xbdi), KM_SLEEP);
    355 
    356 	xbdi->xbdi_domid = domid;
    357 	xbdi->xbdi_handle = handle;
    358 	snprintf(xbdi->xbdi_name, sizeof(xbdi->xbdi_name), "xbdb%di%d",
    359 	    xbdi->xbdi_domid, xbdi->xbdi_handle);
    360 
    361 	mutex_enter(&xbdback_lock);
    362 	if (xbdif_lookup(domid, handle)) {
    363 		mutex_exit(&xbdback_lock);
    364 		kmem_free(xbdi, sizeof(*xbdi));
    365 		return EEXIST;
    366 	}
    367 	SLIST_INSERT_HEAD(&xbdback_instances, xbdi, next);
    368 	mutex_exit(&xbdback_lock);
    369 
    370 	/* initialize status and reference counter */
    371 	xbdi->xbdi_status = DISCONNECTED;
    372 
    373 	mutex_init(&xbdi->xbdi_lock, MUTEX_DEFAULT, IPL_BIO);
    374 	cv_init(&xbdi->xbdi_cv, xbdi->xbdi_name);
    375 
    376 	mutex_enter(&xbdi->xbdi_lock);
    377 	xbdi_get(xbdi);
    378 	mutex_exit(&xbdi->xbdi_lock);
    379 
    380 	xbusd->xbusd_u.b.b_cookie = xbdi;
    381 	xbusd->xbusd_u.b.b_detach = xbdback_xenbus_destroy;
    382 	xbusd->xbusd_otherend_changed = xbdback_frontend_changed;
    383 	xbdi->xbdi_xbusd = xbusd;
    384 
    385 	SLIST_INIT(&xbdi->xbdi_va_free);
    386 	for (i = 0; i < BLKIF_RING_SIZE; i++) {
    387 		xbdi->xbdi_va[i].xv_vaddr = uvm_km_alloc(kernel_map,
    388 		    VBD_VA_SIZE, 0, UVM_KMF_VAONLY|UVM_KMF_WAITVA);
    389 		SLIST_INSERT_HEAD(&xbdi->xbdi_va_free, &xbdi->xbdi_va[i],
    390 		    xv_next);
    391 	}
    392 
    393 	/*
    394 	 * allocate page-aligned memory for segments, so that for each
    395 	 * xbdback_io its segments are in a single page.
    396 	 * sizeof(struct blkif_request_segment) * VBD_MAX_INDIRECT_SEGMENTS
     397 	 * is 128, so this helps us avoid a page boundary within a
    398 	 * block of VBD_MAX_INDIRECT_SEGMENTS segments.
    399 	 */
    400 	CTASSERT(sizeof(struct blkif_request_segment) * VBD_MAX_INDIRECT_SEGMENTS == 128);
    401 	xbdi->xbdi_segs = (void *)uvm_km_alloc(kernel_map, round_page(
    402 	    sizeof(struct blkif_request_segment) * VBD_MAX_INDIRECT_SEGMENTS * BLKIF_RING_SIZE),
    403 	    PAGE_SIZE, UVM_KMF_WIRED | UVM_KMF_WAITVA);
    404 
    405 	SLIST_INIT(&xbdi->xbdi_io_free);
    406 	for (i = 0; i < BLKIF_RING_SIZE; i++) {
    407 		struct xbdback_io *xbd_io = &xbdi->xbdi_io[i];
    408 		xbd_io->xio_seg =
    409 		    &xbdi->xbdi_segs[i * VBD_MAX_INDIRECT_SEGMENTS];
    410 		error = bus_dmamap_create(xbdi->xbdi_xbusd->xbusd_dmat,
    411 		    PAGE_SIZE, 1, PAGE_SIZE, PAGE_SIZE,
    412 		    BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW,
    413 		    &xbd_io->xio_seg_dmamap);
    414 		if (error != 0) {
    415 			printf("%s: can't create dma map for indirect segments %d\n",
    416 			    xbdi->xbdi_name, i);
    417 			goto fail;
    418 		}
    419 		error = bus_dmamap_load(xbdi->xbdi_xbusd->xbusd_dmat,
    420 		    xbd_io->xio_seg_dmamap, xbd_io->xio_seg,
    421 		    sizeof(struct blkif_request_segment) * VBD_MAX_INDIRECT_SEGMENTS,
    422 		    NULL, BUS_DMA_WAITOK);
    423 		if (error != 0) {
    424 			printf("%s: can't load dma map for indirect segments %d @%p (%d, %zu)\n",
    425 			    xbdi->xbdi_name, i, xbd_io->xio_seg, error, sizeof(xbd_io->xio_seg));
    426 			bus_dmamap_destroy(xbdi->xbdi_xbusd->xbusd_dmat,
    427 			    xbd_io->xio_seg_dmamap);
    428 			goto fail;
    429 		}
    430 		KASSERT(xbd_io->xio_seg_dmamap->dm_nsegs == 1);
    431 		segalloc = i;
    432 		SLIST_INSERT_HEAD(&xbdi->xbdi_io_free, xbd_io, xio_next);
    433 	}
    434 
    435 	error = xenbus_watch_path2(xbusd, xbusd->xbusd_path, "physical-device",
    436 	    &xbdi->xbdi_watch, xbdback_backend_changed);
    437 	if (error) {
    438 		printf("failed to watch on %s/physical-device: %d\n",
    439 		    xbusd->xbusd_path, error);
    440 		goto fail;
    441 	}
    442 	xbdi->xbdi_watch.xbw_dev = xbusd;
    443 	error = xenbus_switch_state(xbusd, NULL, XenbusStateInitWait);
    444 	if (error) {
    445 		printf("failed to switch state on %s: %d\n",
    446 		    xbusd->xbusd_path, error);
    447 		goto fail2;
    448 	}
    449 
    450 	xbdi->xbdi_bouncebuf = uvm_km_alloc(kernel_map, MAXPHYS, PAGE_SIZE,
    451 	    UVM_KMF_WIRED | UVM_KMF_WAITVA);
    452 	return 0;
    453 fail2:
    454 	unregister_xenbus_watch(&xbdi->xbdi_watch);
    455 fail:
    456 	for (i = 0; i < segalloc; i++) {
    457 		struct xbdback_io *xbd_io = &xbdi->xbdi_io[i];
    458 		bus_dmamap_unload(xbdi->xbdi_xbusd->xbusd_dmat,
    459 		    xbd_io->xio_seg_dmamap);
    460 		bus_dmamap_destroy(xbdi->xbdi_xbusd->xbusd_dmat,
    461 		    xbd_io->xio_seg_dmamap);
    462 	}
    463 	mutex_enter(&xbdback_lock);
    464 	SLIST_REMOVE(&xbdback_instances, xbdi, xbdback_instance, next);
    465 	mutex_exit(&xbdback_lock);
    466 	kmem_free(xbdi, sizeof(*xbdi));
    467 	return error;
    468 }
    469 
    470 static int
    471 xbdback_xenbus_destroy(void *arg)
    472 {
    473 	struct xbdback_instance *xbdi = arg;
    474 
    475 	XENPRINTF(("xbdback_xenbus_destroy state %d\n", xbdi->xbdi_status));
    476 
    477 	xbdback_disconnect(xbdi);
    478 
    479 	/* unregister watch */
    480 	if (xbdi->xbdi_watch.node)
    481 		xenbus_unwatch_path(&xbdi->xbdi_watch);
    482 	/* unmap ring */
    483 	if (xbdi->xbdi_ring_handle) {
    484 		xen_shm_unmap(xbdi->xbdi_ring_va, 1, &xbdi->xbdi_ring_handle);
    485 	}
    486 
    487 	if (xbdi->xbdi_ring_va != 0) {
    488 		uvm_km_free(kernel_map, xbdi->xbdi_ring_va,
    489 		    PAGE_SIZE, UVM_KMF_VAONLY);
    490 	}
    491 
    492 	/* close device */
    493 	if (xbdi->xbdi_size) {
    494 		const char *name;
    495 		struct dkwedge_info wi;
    496 		if (getdiskinfo(xbdi->xbdi_vp, &wi) == 0)
    497 			name = wi.dkw_devname;
    498 		else
    499 			name = "*unknown*";
    500 		printf("xbd backend: detach device %s for domain %d\n",
    501 		    name, xbdi->xbdi_domid);
    502 		vn_close(xbdi->xbdi_vp, FREAD, NOCRED);
    503 	}
    504 	mutex_enter(&xbdback_lock);
    505 	SLIST_REMOVE(&xbdback_instances, xbdi, xbdback_instance, next);
    506 	mutex_exit(&xbdback_lock);
    507 
    508 	for (int i = 0; i < BLKIF_RING_SIZE; i++) {
    509 		struct xbdback_io *xbd_io = &xbdi->xbdi_io[i];
    510 		bus_dmamap_unload(xbdi->xbdi_xbusd->xbusd_dmat,
    511 		    xbd_io->xio_seg_dmamap);
    512 		bus_dmamap_destroy(xbdi->xbdi_xbusd->xbusd_dmat,
    513 		    xbd_io->xio_seg_dmamap);
    514 		if (xbdi->xbdi_va[i].xv_vaddr != 0) {
    515 			uvm_km_free(kernel_map, xbdi->xbdi_va[i].xv_vaddr,
    516 			    VBD_VA_SIZE, UVM_KMF_VAONLY);
    517 			xbdi->xbdi_va[i].xv_vaddr = 0;
    518 		}
    519 	}
    520 
    521 
    522 	mutex_destroy(&xbdi->xbdi_lock);
    523 	cv_destroy(&xbdi->xbdi_cv);
    524 	kmem_free(xbdi, sizeof(*xbdi));
    525 	return 0;
    526 }
    527 
    528 static int
    529 xbdback_connect(struct xbdback_instance *xbdi)
    530 {
    531 	int err;
    532 	evtchn_op_t evop;
    533 	grant_ref_t gring_ref;
    534 	u_long ring_ref, revtchn;
    535 	char xsproto[32];
    536 	const char *proto;
    537 	struct xenbus_device *xbusd = xbdi->xbdi_xbusd;
    538 
    539 	XENPRINTF(("xbdback %s: connect\n", xbusd->xbusd_path));
     540 	/* read communication information */
    541 	err = xenbus_read_ul(NULL, xbusd->xbusd_otherend,
    542 	    "ring-ref", &ring_ref, 10);
    543 	if (err) {
    544 		xenbus_dev_fatal(xbusd, err, "reading %s/ring-ref",
    545 		    xbusd->xbusd_otherend);
    546 		return -1;
    547 	}
    548 	XENPRINTF(("xbdback %s: connect ring-ref %lu\n", xbusd->xbusd_path, ring_ref));
    549 	err = xenbus_read_ul(NULL, xbusd->xbusd_otherend,
    550 	    "event-channel", &revtchn, 10);
    551 	if (err) {
    552 		xenbus_dev_fatal(xbusd, err, "reading %s/event-channel",
    553 		    xbusd->xbusd_otherend);
    554 		return -1;
    555 	}
    556 	XENPRINTF(("xbdback %s: connect revtchn %lu\n", xbusd->xbusd_path, revtchn));
    557 	err = xenbus_read(NULL, xbusd->xbusd_otherend, "protocol",
    558 	    xsproto, sizeof(xsproto));
    559 	if (err) {
    560 		xbdi->xbdi_proto = XBDIP_NATIVE;
    561 		proto = "unspecified";
    562 		XENPRINTF(("xbdback %s: connect no xsproto\n", xbusd->xbusd_path));
    563 	} else {
    564 		XENPRINTF(("xbdback %s: connect xsproto %s\n", xbusd->xbusd_path, xsproto));
    565 		if (strcmp(xsproto, XEN_IO_PROTO_ABI_NATIVE) == 0) {
    566 			xbdi->xbdi_proto = XBDIP_NATIVE;
    567 			proto = XEN_IO_PROTO_ABI_NATIVE;
    568 		} else if (strcmp(xsproto, XEN_IO_PROTO_ABI_X86_32) == 0) {
    569 			xbdi->xbdi_proto = XBDIP_32;
    570 			proto = XEN_IO_PROTO_ABI_X86_32;
    571 		} else if (strcmp(xsproto, XEN_IO_PROTO_ABI_X86_64) == 0) {
    572 			xbdi->xbdi_proto = XBDIP_64;
    573 			proto = XEN_IO_PROTO_ABI_X86_64;
    574 		} else {
    575 			aprint_error("xbd domain %d: unknown proto %s\n",
    576 			    xbdi->xbdi_domid, xsproto);
    577 			return -1;
    578 		}
    579 	}
    580 
    581 	/* allocate VA space and map rings */
    582 	xbdi->xbdi_ring_va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
    583 	    UVM_KMF_VAONLY);
    584 	if (xbdi->xbdi_ring_va == 0) {
    585 		xenbus_dev_fatal(xbusd, ENOMEM,
    586 		    "can't get VA for ring", xbusd->xbusd_otherend);
    587 		return -1;
    588 	}
    589 	XENPRINTF(("xbdback %s: connect va 0x%" PRIxVADDR "\n", xbusd->xbusd_path, xbdi->xbdi_ring_va));
    590 
    591 	gring_ref = ring_ref;
    592 	if (xen_shm_map(1, xbdi->xbdi_domid, &gring_ref, xbdi->xbdi_ring_va,
    593 	    &xbdi->xbdi_ring_handle, 0) != 0) {
    594 		aprint_error("xbdback %s: can't map grant ref\n",
    595 		    xbusd->xbusd_path);
    596 		xenbus_dev_fatal(xbusd, EINVAL,
    597 		    "can't map ring", xbusd->xbusd_otherend);
    598 		goto err1;
    599 	}
    600 	XENPRINTF(("xbdback %s: connect grhandle %d\n", xbusd->xbusd_path, xbdi->xbdi_ring_handle));
    601 
    602 	switch(xbdi->xbdi_proto) {
    603 	case XBDIP_NATIVE:
    604 	{
    605 		blkif_sring_t *sring = (void *)xbdi->xbdi_ring_va;
    606 		BACK_RING_INIT(&xbdi->xbdi_ring.ring_n, sring, PAGE_SIZE);
    607 		break;
    608 	}
    609 	case XBDIP_32:
    610 	{
    611 		blkif_x86_32_sring_t *sring = (void *)xbdi->xbdi_ring_va;
    612 		BACK_RING_INIT(&xbdi->xbdi_ring.ring_32, sring, PAGE_SIZE);
    613 		break;
    614 	}
    615 	case XBDIP_64:
    616 	{
    617 		blkif_x86_64_sring_t *sring = (void *)xbdi->xbdi_ring_va;
    618 		BACK_RING_INIT(&xbdi->xbdi_ring.ring_64, sring, PAGE_SIZE);
    619 		break;
    620 	}
    621 	}
    622 
    623 	evop.cmd = EVTCHNOP_bind_interdomain;
    624 	evop.u.bind_interdomain.remote_dom = xbdi->xbdi_domid;
    625 	evop.u.bind_interdomain.remote_port = revtchn;
    626 	err = HYPERVISOR_event_channel_op(&evop);
    627 	if (err) {
    628 		aprint_error("blkback %s: "
    629 		    "can't get event channel: %d\n",
    630 		    xbusd->xbusd_otherend, err);
    631 		xenbus_dev_fatal(xbusd, err,
    632 		    "can't bind event channel", xbusd->xbusd_otherend);
    633 		goto err2;
    634 	}
    635 	xbdi->xbdi_evtchn = evop.u.bind_interdomain.local_port;
    636 	XENPRINTF(("xbdback %s: connect evchannel %d\n", xbusd->xbusd_path, xbdi->xbdi_evtchn));
    637 
    638 	xbdi->xbdi_ih = xen_intr_establish_xname(-1, &xen_pic,
    639 	    xbdi->xbdi_evtchn, IST_LEVEL, IPL_BIO, xbdback_evthandler, xbdi,
    640 	    true, xbdi->xbdi_name);
    641 	KASSERT(xbdi->xbdi_ih != NULL);
    642 	aprint_verbose("xbd backend domain %d handle %#x (%d) "
    643 	    "using event channel %d, protocol %s\n", xbdi->xbdi_domid,
    644 	    xbdi->xbdi_handle, xbdi->xbdi_handle, xbdi->xbdi_evtchn, proto);
    645 
    646 	/* enable the xbdback event handler machinery */
    647 	xbdi->xbdi_status = WAITING;
    648 	hypervisor_unmask_event(xbdi->xbdi_evtchn);
    649 	hypervisor_notify_via_evtchn(xbdi->xbdi_evtchn);
    650 
    651 	if (kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL,
    652 	    xbdback_thread, xbdi, NULL, "%s", xbdi->xbdi_name) == 0)
    653 		return 0;
    654 
    655 err2:
    656 	/* unmap ring */
    657 	xen_shm_unmap(xbdi->xbdi_ring_va, 1, &xbdi->xbdi_ring_handle);
    658 err1:
    659 	/* free ring VA space */
    660 	uvm_km_free(kernel_map, xbdi->xbdi_ring_va, PAGE_SIZE, UVM_KMF_VAONLY);
    661 	return -1;
    662 }
    663 
    664 /*
    665  * Signal a xbdback thread to disconnect. Done in 'xenwatch' thread context.
    666  */
    667 static void
    668 xbdback_disconnect(struct xbdback_instance *xbdi)
    669 {
    670 
    671 	mutex_enter(&xbdi->xbdi_lock);
    672 	if (xbdi->xbdi_status == DISCONNECTED) {
    673 		mutex_exit(&xbdi->xbdi_lock);
    674 		return;
    675 	}
    676 	hypervisor_mask_event(xbdi->xbdi_evtchn);
    677 
    678 	/* signal thread that we want to disconnect, then wait for it */
    679 	xbdi->xbdi_status = DISCONNECTING;
    680 	cv_signal(&xbdi->xbdi_cv);
    681 
    682 	while (xbdi->xbdi_status != DISCONNECTED)
    683 		cv_wait(&xbdi->xbdi_cv, &xbdi->xbdi_lock);
    684 
    685 	mutex_exit(&xbdi->xbdi_lock);
    686 	xen_intr_disestablish(xbdi->xbdi_ih);
    687 
    688 	xenbus_switch_state(xbdi->xbdi_xbusd, NULL, XenbusStateClosing);
    689 }
    690 
    691 static void
    692 xbdback_frontend_changed(void *arg, XenbusState new_state)
    693 {
    694 	struct xbdback_instance *xbdi = arg;
    695 	struct xenbus_device *xbusd = xbdi->xbdi_xbusd;
    696 
    697 	XENPRINTF(("xbdback %s: new state %d\n", xbusd->xbusd_path, new_state));
    698 	switch(new_state) {
    699 	case XenbusStateInitialising:
    700 		break;
    701 	case XenbusStateInitialised:
    702 	case XenbusStateConnected:
    703 		if (xbdi->xbdi_status == WAITING || xbdi->xbdi_status == RUN)
    704 			break;
    705 		xbdback_connect(xbdi);
    706 		break;
    707 	case XenbusStateClosing:
    708 		xbdback_disconnect(xbdi);
    709 		break;
    710 	case XenbusStateClosed:
    711 		/* otherend_changed() should handle it for us */
    712 		panic("xbdback_frontend_changed: closed\n");
    713 	case XenbusStateUnknown:
    714 	case XenbusStateInitWait:
    715 	default:
    716 		aprint_error("xbdback %s: invalid frontend state %d\n",
    717 		    xbusd->xbusd_path, new_state);
    718 	}
    719 	return;
    720 }
    721 
    722 static void
    723 xbdback_backend_changed(struct xenbus_watch *watch,
    724     const char **vec, unsigned int len)
    725 {
    726 	struct xenbus_device *xbusd = watch->xbw_dev;
    727 	struct xbdback_instance *xbdi = xbusd->xbusd_u.b.b_cookie;
    728 	int err;
    729 	long dev;
    730 	char mode[32];
    731 	struct xenbus_transaction *xbt;
    732 	const char *devname;
    733 	int major;
    734 
    735 	err = xenbus_read_ul(NULL, xbusd->xbusd_path, "physical-device",
    736 	    &dev, 10);
    737 	/*
     738 	 * An error can occur here, as the watch can fire just after being
     739 	 * registered. So we have to ignore the error :(
    740 	 */
    741 	if (err)
    742 		return;
    743 	/*
     744 	 * The watch can also fire after we have already opened the device;
     745 	 * don't try to open it twice.
    746 	 */
    747 	if (xbdi->xbdi_vp != NULL) {
    748 		if (xbdi->xbdi_status == WAITING || xbdi->xbdi_status == RUN) {
    749 			if (xbdi->xbdi_dev != dev) {
    750 				printf("xbdback %s: changing physical device "
    751 				    "from %#"PRIx64" to %#lx not supported\n",
    752 				    xbusd->xbusd_path, xbdi->xbdi_dev, dev);
    753 			}
    754 		}
    755 		return;
    756 	}
    757 	xbdi->xbdi_dev = dev;
    758 	err = xenbus_read(NULL, xbusd->xbusd_path, "mode", mode, sizeof(mode));
    759 	if (err) {
    760 		printf("xbdback: failed to read %s/mode: %d\n",
    761 		    xbusd->xbusd_path, err);
    762 		return;
    763 	}
    764 	if (mode[0] == 'w')
    765 		xbdi->xbdi_ro = false;
    766 	else
    767 		xbdi->xbdi_ro = true;
    768 	major = major(xbdi->xbdi_dev);
    769 	devname = devsw_blk2name(major);
    770 	if (devname == NULL) {
    771 		printf("xbdback %s: unknown device 0x%"PRIx64"\n",
    772 		    xbusd->xbusd_path, xbdi->xbdi_dev);
    773 		return;
    774 	}
    775 	xbdi->xbdi_bdevsw = bdevsw_lookup(xbdi->xbdi_dev);
    776 	if (xbdi->xbdi_bdevsw == NULL) {
    777 		printf("xbdback %s: no bdevsw for device 0x%"PRIx64"\n",
    778 		    xbusd->xbusd_path, xbdi->xbdi_dev);
    779 		return;
    780 	}
    781 	err = bdevvp(xbdi->xbdi_dev, &xbdi->xbdi_vp);
    782 	if (err) {
    783 		printf("xbdback %s: can't open device 0x%"PRIx64": %d\n",
    784 		    xbusd->xbusd_path, xbdi->xbdi_dev, err);
    785 		return;
    786 	}
    787 	err = vn_lock(xbdi->xbdi_vp, LK_EXCLUSIVE | LK_RETRY);
    788 	if (err) {
    789 		printf("xbdback %s: can't vn_lock device 0x%"PRIx64": %d\n",
    790 		    xbusd->xbusd_path, xbdi->xbdi_dev, err);
    791 		vrele(xbdi->xbdi_vp);
    792 		return;
    793 	}
    794 	err  = VOP_OPEN(xbdi->xbdi_vp, FREAD, NOCRED);
    795 	if (err) {
    796 		printf("xbdback %s: can't VOP_OPEN device 0x%"PRIx64": %d\n",
    797 		    xbusd->xbusd_path, xbdi->xbdi_dev, err);
    798 		vput(xbdi->xbdi_vp);
    799 		return;
    800 	}
    801 	VOP_UNLOCK(xbdi->xbdi_vp);
    802 
    803 	/* dk device; get wedge data */
    804 	struct dkwedge_info wi;
    805 	if ((err = getdiskinfo(xbdi->xbdi_vp, &wi)) == 0) {
    806 		xbdi->xbdi_size = wi.dkw_size;
    807 		printf("xbd backend: attach device %s (size %" PRIu64 ") "
    808 		    "for domain %d\n", wi.dkw_devname, xbdi->xbdi_size,
    809 		    xbdi->xbdi_domid);
    810 	} else {
     811 		/* getdiskinfo() failed: set device size to 0 and return */
    812 		printf("xbdback %s: can't DIOCGWEDGEINFO device "
    813 		    "0x%"PRIx64": %d\n", xbusd->xbusd_path,
    814 		    xbdi->xbdi_dev, err);
    815 		xbdi->xbdi_size = xbdi->xbdi_dev = 0;
    816 		vn_close(xbdi->xbdi_vp, FREAD, NOCRED);
    817 		xbdi->xbdi_vp = NULL;
    818 		return;
    819 	}
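         	/*
         	 * Publish the backend parameters in the xenstore.  The whole
         	 * transaction is retried if xenbus_transaction_end() returns
         	 * EAGAIN (it raced with a concurrent xenstore update).
         	 */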
    820 again:
    821 	xbt = xenbus_transaction_start();
    822 	if (xbt == NULL) {
    823 		printf("xbdback %s: can't start transaction\n",
    824 		    xbusd->xbusd_path);
     825 		return;
    826 	}
    827 	err = xenbus_printf(xbt, xbusd->xbusd_path, "sectors", "%" PRIu64 ,
    828 	    xbdi->xbdi_size);
    829 	if (err) {
    830 		printf("xbdback: failed to write %s/sectors: %d\n",
    831 		    xbusd->xbusd_path, err);
    832 		goto abort;
    833 	}
    834 	err = xenbus_printf(xbt, xbusd->xbusd_path, "info", "%u",
    835 	    xbdi->xbdi_ro ? VDISK_READONLY : 0);
    836 	if (err) {
    837 		printf("xbdback: failed to write %s/info: %d\n",
    838 		    xbusd->xbusd_path, err);
    839 		goto abort;
    840 	}
    841 	err = xenbus_printf(xbt, xbusd->xbusd_path, "sector-size", "%lu",
    842 	    (u_long)DEV_BSIZE);
    843 	if (err) {
    844 		printf("xbdback: failed to write %s/sector-size: %d\n",
    845 		    xbusd->xbusd_path, err);
    846 		goto abort;
    847 	}
    848 	err = xenbus_printf(xbt, xbusd->xbusd_path, "feature-flush-cache",
    849 	    "%u", 1);
    850 	if (err) {
    851 		printf("xbdback: failed to write %s/feature-flush-cache: %d\n",
    852 		    xbusd->xbusd_path, err);
    853 		goto abort;
    854 	}
    855 	err = xenbus_printf(xbt, xbusd->xbusd_path,
    856 	    "feature-max-indirect-segments", "%u", VBD_MAX_INDIRECT_SEGMENTS);
    857 	if (err) {
    858 		printf("xbdback: failed to write %s/feature-indirect: %d\n",
    859 		    xbusd->xbusd_path, err);
    860 		goto abort;
    861 	}
    862 	err = xenbus_transaction_end(xbt, 0);
    863 	if (err == EAGAIN)
    864 		goto again;
    865 	if (err) {
    866 		printf("xbdback %s: can't end transaction: %d\n",
    867 		    xbusd->xbusd_path, err);
    868 	}
    869 	err = xenbus_switch_state(xbusd, NULL, XenbusStateConnected);
    870 	if (err) {
    871 		printf("xbdback %s: can't switch state: %d\n",
    872 		    xbusd->xbusd_path, err);
    873 	}
    874 	return;
    875 abort:
    876 	xenbus_transaction_end(xbt, 1);
    877 }
    878 
    879 /*
    880  * Used by a xbdi thread to signal that it is now disconnected.
    881  */
    882 static void
    883 xbdback_finish_disconnect(struct xbdback_instance *xbdi)
    884 {
    885 	KASSERT(mutex_owned(&xbdi->xbdi_lock));
    886 	KASSERT(xbdi->xbdi_status == DISCONNECTING);
    887 
    888 	xbdi->xbdi_status = DISCONNECTED;
    889 
    890 	cv_broadcast(&xbdi->xbdi_cv);
    891 }
    892 
    893 static bool
    894 xbdif_lookup(domid_t dom , uint32_t handle)
    895 {
    896 	struct xbdback_instance *xbdi;
    897 	bool found = false;
    898 
    899 	KASSERT(mutex_owned(&xbdback_lock));
    900 
    901 	SLIST_FOREACH(xbdi, &xbdback_instances, next) {
    902 		if (xbdi->xbdi_domid == dom && xbdi->xbdi_handle == handle) {
    903 			found = true;
    904 			break;
    905 		}
    906 	}
    907 
    908 	return found;
    909 }
    910 
    911 static int
    912 xbdback_evthandler(void *arg)
    913 {
    914 	struct xbdback_instance *xbdi = arg;
    915 
    916 	XENPRINTF(("xbdback_evthandler domain %d: cont %p\n",
    917 	    xbdi->xbdi_domid, xbdi->xbdi_cont));
    918 
    919 	mutex_enter(&xbdi->xbdi_lock);
    920 	xbdback_wakeup_thread(xbdi);
    921 	mutex_exit(&xbdi->xbdi_lock);
    922 
    923 	return 1;
    924 }
    925 
    926 /*
    927  * Main thread routine for one xbdback instance. Woken up by
     928  * xbdback_evthandler when a domain has I/O work scheduled in an I/O ring.
    929  */
    930 static void
    931 xbdback_thread(void *arg)
    932 {
    933 	struct xbdback_instance *xbdi = arg;
    934 	void *obj;
    935 
    936 	mutex_enter(&xbdi->xbdi_lock);
    937 	for (;;) {
    938 		switch (xbdi->xbdi_status) {
    939 		case WAITING:
    940 			cv_wait(&xbdi->xbdi_cv, &xbdi->xbdi_lock);
    941 			break;
    942 		case RUN:
    943 			xbdi->xbdi_status = WAITING; /* reset state */
    944 			obj = xbdi;
    945 			if (xbdi->xbdi_cont_restart != NULL) {
    946 				KASSERT(xbdi->xbdi_cont == NULL);
    947 				xbdi->xbdi_cont = xbdi->xbdi_cont_restart;
    948 				obj = xbdi->xbdi_cont_restart_obj;
    949 				xbdi->xbdi_cont_restart = NULL;
    950 				xbdi->xbdi_cont_restart_obj = NULL;
    951 			}
    952 			if (xbdi->xbdi_cont == NULL) {
    953 				xbdi->xbdi_cont = xbdback_co_main;
    954 			}
    955 
    956 			xbdback_trampoline(xbdi, obj);
    957 			break;
    958 		case DISCONNECTING:
    959 			if (xbdi->xbdi_pendingreqs > 0) {
    960 				/* there are pending I/Os. Wait for them. */
    961 				cv_wait(&xbdi->xbdi_cv, &xbdi->xbdi_lock);
    962 				continue;
    963 			}
    964 
    965 			/* All I/Os should have been processed by now,
    966 			 * xbdi_refcnt should drop to 0 */
    967 			xbdi_put(xbdi);
    968 			KASSERT(xbdi->xbdi_refcnt == 0);
    969 			goto out;
    970 			/* NOTREACHED */
    971 		default:
    972 			panic("%s: invalid state %d",
    973 			    xbdi->xbdi_name, xbdi->xbdi_status);
    974 		}
    975 	}
    976 out:
    977 	mutex_exit(&xbdi->xbdi_lock);
    978 
    979 	kthread_exit(0);
    980 }
    981 
    982 static void *
    983 xbdback_co_main(struct xbdback_instance *xbdi, void *obj)
    984 {
    985 	(void)obj;
    986 
    987 	xbdi->xbdi_req_prod = xbdi->xbdi_ring.ring_n.sring->req_prod;
    988 	xen_rmb(); /* ensure we see all requests up to req_prod */
    989 	/*
     990 	 * note that we'll eventually get a full ring of requests.
     991 	 * In this case, MASK_BLKIF_IDX(req_cons) == MASK_BLKIF_IDX(req_prod).
    992 	 */
    993 	xbdi->xbdi_cont = xbdback_co_main_loop;
    994 	return xbdi;
    995 }
    996 
    997 /*
    998  * Fetch a blkif request from the ring, and pass control to the appropriate
    999  * continuation.
    1000  * If someone asked for disconnection, do not fetch any more requests from
   1001  * the ring.
   1002  */
   1003 static void *
   1004 xbdback_co_main_loop(struct xbdback_instance *xbdi, void *obj __unused)
   1005 {
   1006 	blkif_request_t *req, *reqn;
   1007 	blkif_x86_32_request_t *req32;
   1008 	blkif_x86_64_request_t *req64;
   1009 	blkif_request_indirect_t *rinn;
   1010 	blkif_x86_32_request_indirect_t *rin32;
   1011 	blkif_x86_64_request_indirect_t *rin64;
   1012 
   1013 	if (xbdi->xbdi_ring.ring_n.req_cons != xbdi->xbdi_req_prod) {
   1014 		struct xbdback_io *xbd_io = xbdback_io_get(xbdi);
   1015 		uint8_t real_op = 0xff;
   1016 
   1017 		if (xbd_io == NULL) {
   1018 			/* retry after iodone */
   1019 			xbdi->xbdi_cont = NULL;
   1020 			return NULL;
   1021 		}
   1022 		memset(&xbd_io->u, 0, sizeof(xbd_io->u));
   1023 
   1024 		buf_init(&xbd_io->xio_buf);
   1025 		xbd_io->xio_xbdi = xbdi;
   1026 
   1027 		req = &xbd_io->xio_xen_req;
   1028 		memset(req, 0, sizeof(*req));
   1029 
   1030 		switch(xbdi->xbdi_proto) {
   1031 		case XBDIP_NATIVE:
   1032 			reqn = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_n,
   1033 			    xbdi->xbdi_ring.ring_n.req_cons);
   1034 			real_op = req->operation = reqn->operation;
   1035 			if (real_op == BLKIF_OP_INDIRECT) {
   1036 				rinn = (blkif_request_indirect_t *)reqn;
   1037 				real_op = rinn->indirect_op;
   1038 			}
   1039 			req->id = reqn->id;
   1040 			break;
   1041 		case XBDIP_32:
   1042 			req32 = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_32,
   1043 			    xbdi->xbdi_ring.ring_n.req_cons);
   1044 			real_op = req->operation = req32->operation;
   1045 			if (real_op == BLKIF_OP_INDIRECT) {
   1046 				rin32 = (blkif_x86_32_request_indirect_t*)req32;
   1047 				real_op = rin32->indirect_op;
   1048 			}
   1049 			req->id = req32->id;
   1050 			break;
   1051 		case XBDIP_64:
   1052 			req64 = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_64,
   1053 			    xbdi->xbdi_ring.ring_n.req_cons);
   1054 			real_op = req->operation = req64->operation;
   1055 			if (real_op == BLKIF_OP_INDIRECT) {
   1056 				rin64 = (blkif_x86_64_request_indirect_t*)req64;
   1057 				real_op = rin64->indirect_op;
   1058 			}
   1059 			req->id = req64->id;
   1060 			break;
   1061 		}
   1062 		__insn_barrier();
   1063 		XENPRINTF(("xbdback op %d req_cons 0x%x req_prod 0x%x "
   1064 		    "resp_prod 0x%x id %" PRIu64 "\n", req->operation,
   1065 			xbdi->xbdi_ring.ring_n.req_cons,
   1066 			xbdi->xbdi_req_prod,
   1067 			xbdi->xbdi_ring.ring_n.rsp_prod_pvt,
   1068 			req->id));
   1069 		switch (req->operation) {
   1070 		case BLKIF_OP_INDIRECT:
   1071 			/* just check indirect_op, rest is handled later */
   1072 			if (real_op != BLKIF_OP_READ &&
   1073 			    real_op != BLKIF_OP_WRITE) {
   1074 				if (ratecheck(&xbdi->xbdi_lasterr_time,
   1075 				    &xbdback_err_intvl)) {
   1076 					printf("%s: unknown ind operation %d\n",
   1077 					    xbdi->xbdi_name,
   1078 					    real_op);
   1079 				}
   1080 				goto fail;
   1081 			}
   1082 			/* FALLTHROUGH */
   1083 		case BLKIF_OP_READ:
   1084 		case BLKIF_OP_WRITE:
   1085 			xbdi->xbdi_cont = xbdback_co_io;
   1086 			return xbd_io;
   1087 		case BLKIF_OP_FLUSH_DISKCACHE:
   1088 			xbdi->xbdi_cont = xbdback_co_cache_flush;
   1089 			return xbd_io;
   1090 		default:
   1091 			if (ratecheck(&xbdi->xbdi_lasterr_time,
   1092 			    &xbdback_err_intvl)) {
   1093 				printf("%s: unknown operation %d\n",
   1094 				    xbdi->xbdi_name, req->operation);
   1095 			}
   1096 fail:
   1097 			xbdback_send_reply(xbdi, req->id, real_op,
   1098 			    BLKIF_RSP_ERROR);
   1099 			xbdi->xbdi_cont = xbdback_co_main_incr;
   1100 			return xbdi;
   1101 		}
   1102 	} else {
   1103 		xbdi->xbdi_cont = xbdback_co_main_done2;
   1104 		return xbdi;
   1105 	}
   1106 }
   1107 
   1108 /*
    1109  * Increment the consumer index and move on to the next request. If a
    1110  * disconnect has been requested, leave the continuation chain now.
   1111  */
   1112 static void *
   1113 xbdback_co_main_incr(struct xbdback_instance *xbdi, void *obj __unused)
   1114 {
   1115 	KASSERT(mutex_owned(&xbdi->xbdi_lock));
   1116 
   1117 	blkif_back_ring_t *ring = &xbdi->xbdi_ring.ring_n;
   1118 
   1119 	ring->req_cons++;
   1120 
   1121 	if (xbdi->xbdi_status == DISCONNECTING)
   1122 		xbdi->xbdi_cont = NULL;
   1123 	else
   1124 		xbdi->xbdi_cont = xbdback_co_main_loop;
   1125 
   1126 	return xbdi;
   1127 }
   1128 
   1129 /*
    1130  * Check for requests in the instance's ring. If there are any, start again
   1131  * from the beginning. If not, stall.
   1132  */
   1133 static void *
   1134 xbdback_co_main_done2(struct xbdback_instance *xbdi, void *obj)
   1135 {
   1136 	int work_to_do;
   1137 
   1138 	xen_wmb();
   1139 	RING_FINAL_CHECK_FOR_REQUESTS(&xbdi->xbdi_ring.ring_n, work_to_do);
   1140 	if (work_to_do)
   1141 		xbdi->xbdi_cont = xbdback_co_main;
   1142 	else
   1143 		xbdi->xbdi_cont = NULL;
   1144 
   1145 	return xbdi;
   1146 }
   1147 
   1148 /*
   1149  * Frontend requested a cache flush operation.
   1150  */
   1151 static void *
   1152 xbdback_co_cache_flush(struct xbdback_instance *xbdi, void *obj)
   1153 {
   1154 	struct xbdback_io *xbd_io = obj;
   1155 	KASSERT(xbd_io->xio_xen_req.operation == BLKIF_OP_FLUSH_DISKCACHE);
   1156 	if (xbdi->xbdi_pendingreqs > 0) {
   1157 		/*
   1158 		 * There are pending requests.
   1159 		 * Event or iodone() will restart processing
   1160 		 */
   1161 		xbdi->xbdi_cont_restart = xbdback_co_cache_flush;
   1162 		xbdi->xbdi_cont_restart_obj = xbd_io;
   1163 		xbdi->xbdi_cont = NULL;
   1164 		return NULL;
   1165 	}
   1166 	xbdi_get(xbdi);
   1167 	xbdi->xbdi_cont = xbdback_co_do_io;
   1168 	return xbd_io;
   1169 }
   1170 
   1171 /*
   1172  * A read or write I/O request must be processed. Do some checks first,
   1173  * then get the segment information directly from the ring request.
   1174  */
   1175 static void *
   1176 xbdback_co_io(struct xbdback_instance *xbdi, void *obj)
   1177 {
   1178 	int i, error;
   1179 	blkif_request_t *req, *reqn;
   1180 	blkif_x86_32_request_t *req32;
   1181 	blkif_x86_64_request_t *req64;
   1182 	blkif_request_indirect_t *rinn;
   1183 	blkif_x86_32_request_indirect_t *rin32;
   1184 	blkif_x86_64_request_indirect_t *rin64;
   1185 	const char *errstr;
   1186 	struct xbdback_io *xbd_io = obj;
   1187 	grant_ref_t in_gntref = 0;
   1188 
   1189 	req = &xbd_io->xio_xen_req;
   1190 
   1191 	/* some sanity checks */
   1192 	KASSERT(req->operation == BLKIF_OP_READ ||
   1193 	    req->operation == BLKIF_OP_WRITE ||
   1194 	    req->operation == BLKIF_OP_INDIRECT);
   1195 
   1196 	/* copy request segments */
   1197 	switch (xbdi->xbdi_proto) {
   1198 	case XBDIP_NATIVE:
   1199 		reqn = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_n,
   1200 		    xbdi->xbdi_ring.ring_n.req_cons);
   1201 		req->handle = reqn->handle;
   1202 		req->sector_number = reqn->sector_number;
   1203 		if (reqn->operation == BLKIF_OP_INDIRECT) {
   1204 			rinn = (blkif_request_indirect_t *)reqn;
   1205 			req->operation = rinn->indirect_op;
   1206 			req->nr_segments = (uint8_t)rinn->nr_segments;
   1207 			if (req->nr_segments > VBD_MAX_INDIRECT_SEGMENTS) {
   1208 				errstr = "too many indirect segments";
   1209 				goto bad_segments;
   1210 			}
   1211 			in_gntref = rinn->indirect_grefs[0];
   1212 			/* first_sect and segment grefs fetched later */
   1213 		} else {
   1214 			req->nr_segments = reqn->nr_segments;
   1215 			if (req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
   1216 				errstr = "too many segments";
   1217 				goto bad_segments;
   1218 			}
   1219 			for (i = 0; i < req->nr_segments; i++)
   1220 				xbd_io->xio_seg[i] = reqn->seg[i];
   1221 		}
   1222 		break;
   1223 	case XBDIP_32:
   1224 		req32 = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_32,
   1225 		    xbdi->xbdi_ring.ring_n.req_cons);
   1226 		req->handle = req32->handle;
   1227 		req->sector_number = req32->sector_number;
   1228 		if (req32->operation == BLKIF_OP_INDIRECT) {
   1229 			rin32 = (blkif_x86_32_request_indirect_t *)req32;
   1230 			req->operation = rin32->indirect_op;
   1231 			req->nr_segments = (uint8_t)rin32->nr_segments;
   1232 			if (req->nr_segments > VBD_MAX_INDIRECT_SEGMENTS) {
   1233 				errstr = "too many indirect segments";
   1234 				goto bad_segments;
   1235 			}
   1236 			in_gntref = rin32->indirect_grefs[0];
   1237 			/* first_sect and segment grefs fetched later */
   1238 		} else {
   1239 			req->nr_segments = req32->nr_segments;
   1240 			if (req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
   1241 				errstr = "too many segments";
   1242 				goto bad_segments;
   1243 			}
   1244 			for (i = 0; i < req->nr_segments; i++)
   1245 				xbd_io->xio_seg[i] = req32->seg[i];
   1246 		}
   1247 		break;
   1248 	case XBDIP_64:
   1249 		req64 = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_64,
   1250 		    xbdi->xbdi_ring.ring_n.req_cons);
   1251 		req->handle = req64->handle;
   1252 		req->sector_number = req64->sector_number;
   1253 		if (req64->operation == BLKIF_OP_INDIRECT) {
   1254 			rin64 = (blkif_x86_64_request_indirect_t *)req64;
   1255 			req->nr_segments = (uint8_t)rin64->nr_segments;
   1256 			if (req->nr_segments > VBD_MAX_INDIRECT_SEGMENTS) {
   1257 				errstr = "too many indirect segments";
   1258 				goto bad_segments;
   1259 			}
   1260 			in_gntref = rin64->indirect_grefs[0];
   1261 			/* first_sect and segment grefs fetched later */
   1262 		} else {
   1263 			req->nr_segments = req64->nr_segments;
   1264 			if (req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
   1265 				errstr = "too many segments";
   1266 				goto bad_segments;
   1267 			}
   1268 			for (i = 0; i < req->nr_segments; i++)
   1269 				xbd_io->xio_seg[i] = req64->seg[i];
   1270 		}
   1271 		break;
   1272 	}
   1273 
   1274 	if (req->operation == BLKIF_OP_WRITE) {
   1275 		if (xbdi->xbdi_ro) {
   1276 			error = EROFS;
   1277 			goto end;
   1278 		}
   1279 	}
   1280 
   1281 	/* Max value checked already earlier */
   1282 	if (req->nr_segments < 1) {
   1283 		errstr = "invalid number of segments";
   1284 		goto bad_segments;
   1285 	}
   1286 
   1287 	/* If segments are on an indirect page, copy them now */
   1288 	if (in_gntref) {
   1289 		gnttab_copy_t gop;
   1290 		paddr_t ma;
   1291 
   1292 		gop.flags = GNTCOPY_source_gref;
   1293 		gop.len = req->nr_segments
   1294 		    * sizeof(struct blkif_request_segment);
   1295 
   1296 		gop.source.u.ref = in_gntref;
   1297 		gop.source.offset = 0;
   1298 		gop.source.domid = xbdi->xbdi_domid;
   1299 
   1300 		ma = xbd_io->xio_seg_dmamap->dm_segs[0].ds_addr;
   1301 		gop.dest.offset = ma & PAGE_MASK;
   1302 		gop.dest.domid = DOMID_SELF;
   1303 		gop.dest.u.gmfn = ma >> PAGE_SHIFT;
   1304 
   1305 		if (HYPERVISOR_grant_table_op(GNTTABOP_copy, &gop, 1) != 0) {
   1306 			errstr = "GNTTABOP_copy failed";
   1307 			goto bad_segments;
   1308 		}
   1309 	}
   1310 
   1311 	xbdi_get(xbdi);
   1312 	xbdi->xbdi_cont = xbdback_co_io_gotio;
   1313 	return xbd_io;
   1314 
   1315  bad_segments:
   1316 	if (ratecheck(&xbdi->xbdi_lasterr_time, &xbdback_err_intvl)) {
   1317 		printf("%s: %s\n", xbdi->xbdi_name, errstr);
   1318 	}
   1319 	error = EINVAL;
   1320 	/* FALLTHROUGH */
   1321 
   1322  end:
   1323 	xbdback_send_reply(xbdi, req->id, req->operation,
   1324 	    (error == EROFS) ? BLKIF_RSP_EOPNOTSUPP : BLKIF_RSP_ERROR);
   1325 	xbdi->xbdi_cont = xbdback_co_main_incr;
   1326 	return xbdi;
   1327 }
   1328 
   1329 /* Prepare an I/O buffer for a xbdback instance */
   1330 static void *
   1331 xbdback_co_io_gotio(struct xbdback_instance *xbdi, void *obj)
   1332 {
   1333 	struct xbdback_io *xbd_io = obj;
   1334 	int buf_flags;
   1335 	size_t bcount;
   1336 	blkif_request_t *req = &xbd_io->xio_xen_req;
   1337 	uint8_t last_sect;
   1338 	int error;
   1339 
   1340 	KASSERT(mutex_owned(&xbdi->xbdi_lock));
   1341 	KASSERT(xbdi->xbdi_refcnt > 0);
   1342 
   1343 	/* Process segments */
   1344 	bcount = 0;
   1345 	for (int i = 0; i < req->nr_segments; i++) {
   1346 		struct blkif_request_segment *seg = &xbd_io->xio_seg[i];
   1347 		if (seg->last_sect > VBD_MAXSECT ||
   1348 		    seg->first_sect > VBD_MAXSECT) {
   1349 			if (ratecheck(&xbdi->xbdi_lasterr_time,
   1350 			    &xbdback_err_intvl)) {
   1351 				printf("%s: invalid segment sectors %d %d\n",
   1352 				    xbdi->xbdi_name,
   1353 				    seg->first_sect, seg->last_sect);
   1354 			}
    1355 			xbdi->xbdi_pendingreqs++; /* xbdback_io_error() decrements it */
   1356 			xbdback_io_error(xbd_io, EINVAL);
   1357 			/* do not retry */
   1358 			xbdi->xbdi_cont = xbdback_co_main_incr;
   1359 			return xbdi;
   1360 		}
   1361 
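         		/*
         		 * The mapped segments can only be handed to the disk driver
         		 * as one contiguous buffer if every intermediate segment
         		 * boundary falls on a page boundary, i.e. the previous
         		 * segment ends at VBD_MAXSECT and this one starts at sector
         		 * 0.  Otherwise the data must go through the bounce buffer.
         		 */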
   1362 		if (i > 0) {
   1363 			if (last_sect != VBD_MAXSECT ||
   1364 			    seg->first_sect != 0) {
   1365 				xbd_io->xio_need_bounce = 1;
   1366 			}
   1367 		}
   1368 		last_sect = seg->last_sect;
   1369 		xbd_io->xio_gref[i] = seg->gref;
   1370 		bcount += (seg->last_sect - seg->first_sect + 1)
   1371 			* VBD_BSIZE;
   1372 	}
   1373 	xbd_io->xio_start_offset = xbd_io->xio_seg[0].first_sect * VBD_BSIZE;
   1374 
   1375 	KASSERT(bcount <= MAXPHYS);
   1376 	KASSERT(xbd_io->xio_start_offset < PAGE_SIZE);
   1377 	KASSERT(bcount + xbd_io->xio_start_offset <= VBD_VA_SIZE);
   1378 
   1379 	/* Fill-in the buf */
   1380 	if (req->operation == BLKIF_OP_WRITE) {
   1381 		buf_flags = B_WRITE;
   1382 	} else {
   1383 		buf_flags = B_READ;
   1384 	}
   1385 
   1386 	xbd_io->xio_buf.b_flags = buf_flags;
   1387 	xbd_io->xio_buf.b_cflags = 0;
   1388 	xbd_io->xio_buf.b_oflags = 0;
   1389 	xbd_io->xio_buf.b_iodone = xbdback_iodone;
   1390 	xbd_io->xio_buf.b_proc = NULL;
   1391 	xbd_io->xio_buf.b_vp = xbdi->xbdi_vp;
   1392 	xbd_io->xio_buf.b_objlock = xbdi->xbdi_vp->v_interlock;
   1393 	xbd_io->xio_buf.b_dev = xbdi->xbdi_dev;
   1394 	xbd_io->xio_buf.b_blkno = req->sector_number;
   1395 	xbd_io->xio_buf.b_bcount = bcount;
   1396 	if (__predict_false(xbd_io->xio_need_bounce)) {
   1397 		if (__predict_false(xbdi->xbdi_bouncebuf_use)) {
   1398 			KASSERT(xbdi->xbdi_pendingreqs > 1);
   1399 			/* retry later */
   1400 			xbdi->xbdi_cont_restart = xbdback_co_io_gotio;
   1401 			xbdi->xbdi_cont_restart_obj = xbd_io;
   1402 			xbdi->xbdi_cont = NULL;
   1403 			return NULL;
   1404 		}
   1405 		xbdi->xbdi_bouncebuf_use++;
   1406 		KASSERT(xbdi->xbdi_bouncebuf_use == 1);
   1407 		xbd_io->xio_buf.b_data = (void *)xbdi->xbdi_bouncebuf;
   1408 	}
   1409 	xbdi->xbdi_pendingreqs++;
   1410 	if ((error = xbdback_map_shm(xbd_io)) != 0) {
   1411 		xbdback_io_error(xbd_io, error);
   1412 		/* do not retry */
   1413 		xbdi->xbdi_cont = xbdback_co_main_incr;
   1414 		return xbdi;
   1415 	}
   1416 	if (__predict_true(xbd_io->xio_need_bounce == 0)) {
   1417 		xbd_io->xio_buf.b_data = (void *)
   1418 		    (xbd_io->xio_vaddr + xbd_io->xio_start_offset);
   1419 	}
   1420 
   1421 
   1422 	xbd_io->xio_buf.b_private = xbd_io;
   1423 
   1424 	xbdi->xbdi_cont = xbdback_co_do_io;
   1425 	return xbd_io;
   1426 }
   1427 
   1428 static void
   1429 xbdback_io_error(struct xbdback_io *xbd_io, int error)
   1430 {
   1431 	KASSERT(mutex_owned(&xbd_io->xio_xbdi->xbdi_lock));
   1432 
   1433 	struct buf *bp = &xbd_io->xio_buf;
   1434 
   1435 	bp->b_error = error;
   1436 	xbdback_iodone_locked(xbd_io->xio_xbdi, xbd_io, bp);
   1437 }
   1438 
   1439 /*
   1440  * Main xbdback I/O routine. It can either perform a flush operation or
   1441  * schedule a read/write operation.
   1442  */
   1443 static void *
   1444 xbdback_co_do_io(struct xbdback_instance *xbdi, void *obj)
   1445 {
   1446 	struct xbdback_io *xbd_io = obj;
   1447 	blkif_request_t *req = &xbd_io->xio_xen_req;
   1448 
   1449 	KASSERT(xbdi->xbdi_refcnt > 0);
   1450 
   1451 	switch (req->operation) {
   1452 	case BLKIF_OP_FLUSH_DISKCACHE:
   1453 	{
   1454 		int error;
   1455 		int force = 1;
   1456 
   1457 		KASSERT(mutex_owned(&xbdi->xbdi_lock));
   1458 		mutex_exit(&xbdi->xbdi_lock);
   1459 		error = VOP_IOCTL(xbdi->xbdi_vp, DIOCCACHESYNC, &force, FWRITE,
   1460 		    kauth_cred_get());
   1461 		mutex_enter(&xbdi->xbdi_lock);
   1462 		if (error) {
   1463 			aprint_error("xbdback %s: DIOCCACHESYNC returned %d\n",
   1464 			    xbdi->xbdi_xbusd->xbusd_path, error);
   1465 			 if (error == EOPNOTSUPP || error == ENOTTY)
   1466 				error = BLKIF_RSP_EOPNOTSUPP;
   1467 			 else
   1468 				error = BLKIF_RSP_ERROR;
   1469 		} else
   1470 			error = BLKIF_RSP_OKAY;
   1471 		xbdback_send_reply(xbdi, req->id, req->operation, error);
   1472 		xbdback_io_put(xbdi, xbd_io);
   1473 		xbdi_put(xbdi);
   1474 		xbdi->xbdi_cont = xbdback_co_main_incr;
   1475 		return xbdi;
   1476 	}
   1477 	case BLKIF_OP_READ:
   1478 	case BLKIF_OP_WRITE:
   1479 		if (__predict_false(xbd_io->xio_need_bounce) &&
   1480 		    req->operation == BLKIF_OP_WRITE) {
   1481 			vaddr_t boffset = 0;
   1482 			for (int i = 0; i < req->nr_segments; i++) {
   1483 				struct blkif_request_segment *seg =
   1484 				    &xbd_io->xio_seg[i];
   1485 				vaddr_t segoffset = seg->first_sect * VBD_BSIZE;
   1486 				size_t segbcount =
   1487 				   (seg->last_sect - seg->first_sect + 1) *
   1488 				    VBD_BSIZE;
   1489 				KASSERT(segoffset + segbcount <= PAGE_SIZE);
   1490 				KASSERT(boffset + segbcount < MAXPHYS);
   1491 				segoffset += PAGE_SIZE * i;
   1492 				memcpy(
   1493 				    (void *)(xbdi->xbdi_bouncebuf + boffset),
   1494 				    (void *)(xbd_io->xio_vaddr + segoffset),
   1495 				    segbcount);
   1496 				boffset += segbcount;
   1497 			}
   1498 		}
   1499 		KASSERT(mutex_owned(&xbdi->xbdi_lock));
   1500 		mutex_exit(&xbdi->xbdi_lock);
   1501 		if ((xbd_io->xio_buf.b_flags & B_READ) == 0) {
   1502 			mutex_enter(xbd_io->xio_buf.b_vp->v_interlock);
   1503 			xbd_io->xio_buf.b_vp->v_numoutput++;
   1504 			mutex_exit(xbd_io->xio_buf.b_vp->v_interlock);
   1505 		}
   1506 		/* will call xbdback_iodone() asynchronously when done */
   1507 		bdev_strategy(&xbd_io->xio_buf);
   1508 		mutex_enter(&xbdi->xbdi_lock);
   1509 		xbdi->xbdi_cont = xbdback_co_main_incr;
   1510 		return xbdi;
   1511 	default:
   1512 		/* Should never happen */
   1513 		panic("xbdback_co_do_io: unsupported operation %d",
   1514 		    req->operation);
   1515 	}
   1516 }
   1517 
   1518 /*
   1519  * Called from softint(9) context when an I/O is done: for each request, send
   1520  * back the associated reply to the domain.
   1521  */
   1522 static void
   1523 xbdback_iodone(struct buf *bp)
   1524 {
   1525 	struct xbdback_io *xbd_io;
   1526 	struct xbdback_instance *xbdi;
   1527 
   1528 	xbd_io = bp->b_private;
   1529 	KASSERT(bp == &xbd_io->xio_buf);
   1530 	xbdi = xbd_io->xio_xbdi;
   1531 
   1532 	mutex_enter(&xbdi->xbdi_lock);
   1533 	xbdback_iodone_locked(xbdi, xbd_io, bp);
   1534 	mutex_exit(&xbdi->xbdi_lock);
   1535 }
   1536 
   1537 /*
   1538  * Core of xbdback_iodone(); also used directly by xbdback_io_error()
         * to report errors from other sources.  Called with xbdi_lock held.
   1539  */
   1540 static void
   1541 xbdback_iodone_locked(struct xbdback_instance *xbdi, struct xbdback_io *xbd_io,
   1542     struct buf *bp)
   1543 {
   1544 	int status;
   1545 	blkif_request_t *req = &xbd_io->xio_xen_req;
   1546 
   1547 	XENPRINTF(("xbdback_io domain %d: iodone ptr 0x%lx\n",
   1548 		   xbdi->xbdi_domid, (long)xbd_io));
   1549 
   1550 	KASSERT(mutex_owned(&xbdi->xbdi_lock));
   1551 
   1552 	KASSERT(bp->b_error != 0 || xbd_io->xio_xv != NULL);
   1553 	if (__predict_false(xbd_io->xio_need_bounce)) {
   1554 		KASSERT(xbd_io->xio_buf.b_data == (void *)xbdi->xbdi_bouncebuf);
   1555 
   1556 		KASSERT(req->operation == BLKIF_OP_WRITE ||
   1557 		    req->operation == BLKIF_OP_READ);
   1558 
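        		/*
        		 * A read that used the bounce buffer: scatter the data
        		 * back from the bounce buffer into the mapped guest
        		 * pages, but only if the I/O actually succeeded.
        		 */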
   1559 		if (req->operation == BLKIF_OP_READ && bp->b_error == 0) {
   1560 			vaddr_t boffset = 0;
   1561 			for (int i = 0; i < req->nr_segments; i++) {
   1562 				struct blkif_request_segment *seg =
   1563 				    &xbd_io->xio_seg[i];
   1564 				vaddr_t segoffset = seg->first_sect * VBD_BSIZE;
   1565 				size_t segbcount =
   1566 				   (seg->last_sect - seg->first_sect + 1) *
   1567 				    VBD_BSIZE;
   1568 				KASSERT(segoffset + segbcount <= PAGE_SIZE);
   1569 				KASSERT(boffset + segbcount < MAXPHYS);
   1570 				segoffset += PAGE_SIZE * i;
   1571 				memcpy(
   1572 				    (void *)(xbd_io->xio_vaddr + segoffset),
   1573 				    (void *)(xbdi->xbdi_bouncebuf + boffset),
   1574 				    segbcount);
   1575 				boffset += segbcount;
   1576 			}
   1577 		}
   1578 		KASSERT(xbdi->xbdi_bouncebuf_use == 1);
   1579 		xbdi->xbdi_bouncebuf_use--;
   1580 	}
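        	/* Unmap the guest pages if the request got as far as mapping them. */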
   1581 	if (xbd_io->xio_xv != NULL)
   1582 		xbdback_unmap_shm(xbd_io);
   1583 
   1584 	if (bp->b_error != 0) {
   1585 		printf("xbd IO domain %d: error %d\n",
   1586 		       xbdi->xbdi_domid, bp->b_error);
   1587 		status = BLKIF_RSP_ERROR;
   1588 	} else
   1589 		status = BLKIF_RSP_OKAY;
   1590 
   1591 	xbdback_send_reply(xbdi, req->id, req->operation, status);
   1592 
   1593 	xbdi_put(xbdi);
   1594 	KASSERT(xbdi->xbdi_pendingreqs > 0);
   1595 	xbdi->xbdi_pendingreqs--;
   1596 	buf_destroy(&xbd_io->xio_buf);
   1597 	xbdback_io_put(xbdi, xbd_io);
   1598 
   1599 	xbdback_wakeup_thread(xbdi);
   1600 }
   1601 
   1602 /*
   1603  * Wake up the per-instance xbdback thread.
   1604  */
   1605 static void
   1606 xbdback_wakeup_thread(struct xbdback_instance *xbdi)
   1607 {
   1608 	KASSERT(mutex_owned(&xbdi->xbdi_lock));
   1609 
   1610 	/* only set RUN state when we are WAITING for work */
   1611 	if (xbdi->xbdi_status == WAITING)
   1612 		xbdi->xbdi_status = RUN;
   1613 	cv_signal(&xbdi->xbdi_cv);
   1614 }
   1615 
   1616 /*
   1617  * Called once a request has completed: place the reply in the ring and
   1618  * notify the guest OS.
   1619  */
   1620 static void
   1621 xbdback_send_reply(struct xbdback_instance *xbdi, uint64_t id,
   1622     int op, int status)
   1623 {
   1624 	blkif_response_t *resp_n;
   1625 	blkif_x86_32_response_t *resp32;
   1626 	blkif_x86_64_response_t *resp64;
   1627 	int notify;
   1628 
   1629 	KASSERT(mutex_owned(&xbdi->xbdi_lock));
   1630 
   1631 	/*
   1632 	 * The ring can be accessed by the xbdback thread, xbdback_iodone()
   1633 	 * handler, or any handler that triggered the shm callback. So
   1634 	 * protect ring access via the xbdi_lock mutex.
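        	 * The response layout depends on the frontend's ABI (native,
        	 * 32-bit or 64-bit), but the three ring views share the same
        	 * bookkeeping fields, so the counters are always accessed
        	 * through ring_n.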
   1635 	 */
   1636 	switch (xbdi->xbdi_proto) {
   1637 	case XBDIP_NATIVE:
   1638 		resp_n = RING_GET_RESPONSE(&xbdi->xbdi_ring.ring_n,
   1639 		    xbdi->xbdi_ring.ring_n.rsp_prod_pvt);
   1640 		resp_n->id        = id;
   1641 		resp_n->operation = op;
   1642 		resp_n->status    = status;
   1643 		break;
   1644 	case XBDIP_32:
   1645 		resp32 = RING_GET_RESPONSE(&xbdi->xbdi_ring.ring_32,
   1646 		    xbdi->xbdi_ring.ring_n.rsp_prod_pvt);
   1647 		resp32->id        = id;
   1648 		resp32->operation = op;
   1649 		resp32->status    = status;
   1650 		break;
   1651 	case XBDIP_64:
   1652 		resp64 = RING_GET_RESPONSE(&xbdi->xbdi_ring.ring_64,
   1653 		    xbdi->xbdi_ring.ring_n.rsp_prod_pvt);
   1654 		resp64->id        = id;
   1655 		resp64->operation = op;
   1656 		resp64->status    = status;
   1657 		break;
   1658 	}
   1659 	xbdi->xbdi_ring.ring_n.rsp_prod_pvt++;
   1660 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbdi->xbdi_ring.ring_n, notify);
   1661 
   1662 	if (notify) {
   1663 		XENPRINTF(("xbdback_send_reply notify %d\n", xbdi->xbdi_domid));
   1664 		hypervisor_notify_via_evtchn(xbdi->xbdi_evtchn);
   1665 	}
   1666 }
   1667 
   1668 /*
   1669  * Map multiple entries of an I/O request into the backend's VA space.
   1670  * The xbd_io->xio_gref array has to be filled out by the caller.
   1671  */
   1672 static int
   1673 xbdback_map_shm(struct xbdback_io *xbd_io)
   1674 {
   1675 	struct xbdback_instance *xbdi = xbd_io->xio_xbdi;
   1676 	blkif_request_t *req = &xbd_io->xio_xen_req;
   1677 	int error;
   1678 
   1679 #ifdef XENDEBUG_VBD
   1680 	int i;
   1681 	printf("xbdback_map_shm map grant ");
   1682 	for (i = 0; i < req->nr_segments; i++) {
   1683 		printf("%u ", (u_int)xbd_io->xio_gref[i]);
   1684 	}
   1685 #endif
   1686 
   1687 	KASSERT(mutex_owned(&xbdi->xbdi_lock));
   1688 	KASSERT(xbd_io->xio_xv == NULL);
   1689 
   1690 	xbd_io->xio_xv = SLIST_FIRST(&xbdi->xbdi_va_free);
   1691 	KASSERT(xbd_io->xio_xv != NULL);
   1692 	SLIST_REMOVE_HEAD(&xbdi->xbdi_va_free, xv_next);
   1693 	xbd_io->xio_vaddr = xbd_io->xio_xv->xv_vaddr;
   1694 
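        	/*
        	 * Map the granted pages into the reserved VA slot.  For a write
        	 * the backend only reads the guest's data, so the grants are
        	 * mapped read-only in that case.
        	 */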
   1695 	error = xen_shm_map(req->nr_segments, xbdi->xbdi_domid,
   1696 	    xbd_io->xio_gref, xbd_io->xio_vaddr, xbd_io->xio_gh,
   1697 	    (req->operation == BLKIF_OP_WRITE) ? XSHM_RO : 0);
   1698 
   1699 	switch (error) {
   1700 	case 0:
   1701 #ifdef XENDEBUG_VBD
   1702 		printf("handle");
   1703 		for (i = 0; i < req->nr_segments; i++) {
   1704 			printf(" %u ", (u_int)xbd_io->xio_gh[i]);
   1705 		}
   1706 		printf("\n");
   1707 #endif
   1708 		return 0;
   1709 	default:
   1710 		/* reset xio_xv so error handling won't try to unmap it */
   1711 		SLIST_INSERT_HEAD(&xbdi->xbdi_va_free, xbd_io->xio_xv, xv_next);
   1712 		xbd_io->xio_xv = NULL;
   1713 		return error;
   1714 	}
   1715 }
   1716 
   1717 /* unmap a request from our virtual address space (request is done) */
   1718 static void
   1719 xbdback_unmap_shm(struct xbdback_io *xbd_io)
   1720 {
   1721 	struct xbdback_instance *xbdi = xbd_io->xio_xbdi;
   1722 	blkif_request_t *req = &xbd_io->xio_xen_req;
   1723 
   1724 #ifdef XENDEBUG_VBD
   1725 	int i;
   1726 	printf("xbdback_unmap_shm handle ");
   1727 	for (i = 0; i < req->nr_segments; i++) {
   1728 		printf("%u ", (u_int)xbd_io->xio_gh[i]);
   1729 	}
   1730 	printf("\n");
   1731 #endif
   1732 
   1733 	KASSERT(xbd_io->xio_xv != NULL);
   1734 	xen_shm_unmap(xbd_io->xio_vaddr, req->nr_segments,
   1735 	    xbd_io->xio_gh);
   1736 	SLIST_INSERT_HEAD(&xbdi->xbdi_va_free, xbd_io->xio_xv, xv_next);
   1737 	xbd_io->xio_xv = NULL;
   1738 	xbd_io->xio_vaddr = -1;
   1739 }
   1740 
   1741 /* Obtain a struct xbdback_io from the instance's free list */
   1742 static struct xbdback_io *
   1743 xbdback_io_get(struct xbdback_instance *xbdi)
   1744 {
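        	/*
        	 * SLIST_REMOVE_HEAD() does not check for an empty list; the
        	 * caller is expected to ensure a free xbdback_io is available
        	 * (the number of outstanding requests is presumably bounded by
        	 * the number of pre-allocated structures).
        	 */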
   1745 	struct xbdback_io *xbd_io = SLIST_FIRST(&xbdi->xbdi_io_free);
   1746 	SLIST_REMOVE_HEAD(&xbdi->xbdi_io_free, xio_next);
   1747 	return xbd_io;
   1748 }
   1749 
   1750 /* Return a struct xbdback_io to the instance's free list */
   1751 static void
   1752 xbdback_io_put(struct xbdback_instance *xbdi, struct xbdback_io *xbd_io)
   1753 {
   1754 	KASSERT(xbd_io != NULL);
   1755 	KASSERT(xbd_io->xio_xv == NULL);
   1756 	SLIST_INSERT_HEAD(&xbdi->xbdi_io_free, xbd_io, xio_next);
   1757 }
   1758 
   1759 /*
   1760  * Trampoline routine. Calls continuations in a loop and only exits when
   1761  * either the returned object or the next callback is NULL.
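         * A continuation can also suspend processing by recording a restart
         * point in xbdi_cont_restart/xbdi_cont_restart_obj and returning NULL;
         * the loop then exits and the state machine is resumed later from that
         * point.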
   1762  */
   1763 static void
   1764 xbdback_trampoline(struct xbdback_instance *xbdi, void *obj)
   1765 {
   1766 	xbdback_cont_t cont;
   1767 
   1768 	while (obj != NULL && xbdi->xbdi_cont != NULL) {
   1769 		KASSERT(xbdi->xbdi_cont_restart == NULL);
   1770 		KASSERT(xbdi->xbdi_cont_restart_obj == NULL);
   1771 		cont = xbdi->xbdi_cont;
   1772 #ifdef DIAGNOSTIC
   1773 		xbdi->xbdi_cont = (xbdback_cont_t)0xDEADBEEF;
   1774 #endif
   1775 		obj = (*cont)(xbdi, obj);
   1776 #ifdef DIAGNOSTIC
   1777 		if (xbdi->xbdi_cont == (xbdback_cont_t)0xDEADBEEF) {
   1778 			printf("xbdback_trampoline: 0x%lx didn't set "
   1779 			       "xbdi->xbdi_cont!\n", (long)cont);
   1780 			panic("xbdback_trampoline: bad continuation");
   1781 		}
   1782 		if (xbdi->xbdi_cont_restart != NULL ||
   1783 		    xbdi->xbdi_cont_restart_obj != NULL) {
   1784 			KASSERT(xbdi->xbdi_cont_restart != NULL);
   1785 			KASSERT(xbdi->xbdi_cont_restart_obj != NULL);
   1786 			KASSERT(xbdi->xbdi_cont == NULL);
   1787 			KASSERT(obj == NULL);
   1788 		}
   1789 #endif
   1790 	}
   1791 }
   1792