/*	$NetBSD: xbd_xenbus.c,v 1.135 2025/07/29 19:07:53 andvar Exp $	*/

/*
 * Copyright (c) 2006 Manuel Bouyer.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

/*
 * This file contains the xbd frontend code required for block-level
 * communications (similar to hard disks) between two Xen domains.
 *
 * We are not supposed to receive solicitations spontaneously from the
 * backend. The protocol is therefore fairly simple and uses only one ring
 * to communicate with the backend: the frontend posts requests to the ring
 * and then waits for their replies asynchronously.
 *
 * xbd follows NetBSD's disk(9) convention. At any time, a LWP can schedule
 * an operation request for the device (be it open(), read(), write(), ...).
 * Calls are typically processed this way:
 * - initiate request: xbdread/write/open/ioctl/..
 * - depending on the operation, it is handled directly by the disk(9)
 *   subsystem or goes through physio(9) first.
 * - the request is ultimately processed by xbd_diskstart(), which prepares
 *   the xbd requests, posts them in the ring I/O queue, then signals the
 *   backend.
 *
 * When a response is available in the queue, the backend signals the frontend
 * via its event channel. This triggers xbd_handler(), which will link back
 * the response to its request through the request ID, and mark the I/O as
 * completed.
 */
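
/*
 * Illustrative summary of the life of a single transfer through this
 * driver (a sketch of the code below, not additional functionality):
 *
 *	xbdread()/xbdwrite() -> physio(9) -> xbdstrategy() -> dk_strategy(9)
 *	    -> xbd_diskstart(): take a free struct xbd_req, bus_dmamap_load()
 *	       the buffer, grant the pages to the backend, queue one or two
 *	       blkif requests on sc_ring and notify the backend's event
 *	       channel.
 *	    ... the backend performs the I/O ...
 *	xbd_handler(): match each blkif response to sc_reqs[rep->id],
 *	       revoke the grants, unload the DMA map, complete the buf with
 *	       dk_done() and put the request back on the free list.
 */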

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: xbd_xenbus.c,v 1.135 2025/07/29 19:07:53 andvar Exp $");

#include "opt_xen.h"


#include <sys/param.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/device.h>
#include <sys/disk.h>
#include <sys/disklabel.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/stat.h>
#include <sys/vnode.h>
#include <sys/mutex.h>

#include <dev/dkvar.h>

#include <uvm/uvm.h>

#include <xen/intr.h>
#include <xen/hypervisor.h>
#include <xen/evtchn.h>
#include <xen/granttables.h>
#include <xen/include/public/io/blkif.h>
#include <xen/include/public/io/protocols.h>

#include <xen/xenbus.h>
#include "locators.h"

#undef XBD_DEBUG
#ifdef XBD_DEBUG
#define DPRINTF(x) printf x;
#else
#define DPRINTF(x)
#endif

#define GRANT_INVALID_REF -1

#define XBD_RING_SIZE	__CONST_RING_SIZE(blkif, PAGE_SIZE)
#define XBD_MAX_XFER	(PAGE_SIZE * BLKIF_MAX_SEGMENTS_PER_REQUEST)
#define XBD_MAX_CHUNK	32*1024		/* max I/O size we process in 1 req */
#define XBD_XFER_LIMIT	(2*XBD_MAX_XFER)

#define XEN_BSHIFT	9		/* log2(XEN_BSIZE) */
#define XEN_BSIZE	(1 << XEN_BSHIFT)

CTASSERT((MAXPHYS <= 2*XBD_MAX_CHUNK));
CTASSERT(XEN_BSIZE == DEV_BSIZE);

struct xbd_indirect {
	SLIST_ENTRY(xbd_indirect) in_next;
	struct blkif_request_segment *in_addr;
	grant_ref_t in_gntref;
};

struct xbd_req {
	SLIST_ENTRY(xbd_req) req_next;
	uint16_t req_id; /* ID passed to backend */
	bus_dmamap_t req_dmamap;
	struct xbd_req *req_parent, *req_child;
	bool req_parent_done;
	union {
	    struct {
		grant_ref_t req_gntref[XBD_XFER_LIMIT >> PAGE_SHIFT];
		struct buf *req_bp; /* buffer associated with this request */
		void *req_data; /* pointer to the data buffer */
		struct xbd_indirect *req_indirect; /* indirect page */
	    } req_rw;
	    struct {
		int s_error;
		int s_done;
	    } req_sync;
	} u;
};
#define req_gntref	u.req_rw.req_gntref
#define req_bp		u.req_rw.req_bp
#define req_data	u.req_rw.req_data
#define req_indirect	u.req_rw.req_indirect
#define req_sync	u.req_sync
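
/*
 * Transfer splitting, non-indirect case: a struct buf larger than
 * XBD_MAX_CHUNK is carried by two ring requests.  The first xbd_req acts
 * as the "parent" and owns the DMA map and the grant references; the
 * second one is the "child" and only points back through req_parent.
 * Whichever of the two completes last performs the common cleanup in
 * xbd_handler(), tracked by req_parent_done.
 */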

struct xbd_xenbus_softc {
	struct dk_softc sc_dksc;	/* Must be first in this struct */
	struct xenbus_device *sc_xbusd;
	unsigned int sc_evtchn;

	struct intrhand *sc_ih; /* Interrupt handler for this instance. */
	kmutex_t sc_lock;
	kcondvar_t sc_cache_flush_cv;
	kcondvar_t sc_req_cv;
	kcondvar_t sc_detach_cv;
	kcondvar_t sc_suspend_cv;

	blkif_front_ring_t sc_ring;
	grant_ref_t sc_ring_gntref;

	struct xbd_req sc_reqs[XBD_RING_SIZE];
	SLIST_HEAD(,xbd_req) sc_xbdreq_head; /* list of free requests */

	struct xbd_indirect sc_indirect[XBD_RING_SIZE];
	SLIST_HEAD(,xbd_indirect) sc_indirect_head;

	vmem_addr_t sc_unalign_buffer;
	void *sc_unalign_used;

	int sc_backend_status; /* our status with backend */
#define BLKIF_STATE_DISCONNECTED 0
#define BLKIF_STATE_CONNECTED    1
#define BLKIF_STATE_SUSPENDED    2

	int sc_shutdown;
#define BLKIF_SHUTDOWN_RUN    0 /* no shutdown */
#define BLKIF_SHUTDOWN_REMOTE 1 /* backend-initiated shutdown in progress */
#define BLKIF_SHUTDOWN_LOCAL  2 /* locally-initiated shutdown in progress */

	uint64_t sc_sectors; /* number of sc_secsize sectors for this device */
	u_long sc_secsize; /* sector size */
	uint64_t sc_xbdsize; /* size of disk in DEV_BSIZE */
	u_long sc_info; /* VDISK_* */
	u_long sc_handle; /* from backend */
	int sc_features;
#define BLKIF_FEATURE_CACHE_FLUSH	0x1
#define BLKIF_FEATURE_BARRIER		0x2
#define BLKIF_FEATURE_PERSISTENT	0x4
#define BLKIF_FEATURE_INDIRECT		0x8
#define BLKIF_FEATURE_BITS		\
	"\20\1CACHE-FLUSH\2BARRIER\3PERSISTENT\4INDIRECT"
	struct evcnt sc_cnt_map_unalign;
	struct evcnt sc_cnt_unalign_busy;
	struct evcnt sc_cnt_queue_full;
	struct evcnt sc_cnt_indirect;
};
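
/*
 * sc_lock protects the ring, the free request and indirect-page lists and
 * the condition variables above; it is taken both from thread context
 * (xbd_diskstart(), ioctl, detach/suspend) and from the event channel
 * interrupt handler xbd_handler(), hence IPL_BIO.
 */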

static int  xbd_xenbus_match(device_t, cfdata_t, void *);
static void xbd_xenbus_attach(device_t, device_t, void *);
static int  xbd_xenbus_detach(device_t, int);

static bool xbd_xenbus_suspend(device_t, const pmf_qual_t *);
static bool xbd_xenbus_resume(device_t, const pmf_qual_t *);

static int  xbd_handler(void *);
static int  xbd_diskstart(device_t, struct buf *);
static void xbd_iosize(device_t, int *);
static void xbd_backend_changed(void *, XenbusState);
static void xbd_connect(struct xbd_xenbus_softc *);
static void xbd_features(struct xbd_xenbus_softc *);

static void xbd_diskstart_submit(struct xbd_xenbus_softc *, int,
	struct buf *bp, int, bus_dmamap_t, grant_ref_t *);
static void xbd_diskstart_submit_indirect(struct xbd_xenbus_softc *,
	struct xbd_req *, struct buf *bp);
static int  xbd_map_align(struct xbd_xenbus_softc *, struct xbd_req *);
static void xbd_unmap_align(struct xbd_xenbus_softc *, struct xbd_req *,
	struct buf *);

static void xbdminphys(struct buf *);

CFATTACH_DECL3_NEW(xbd, sizeof(struct xbd_xenbus_softc),
    xbd_xenbus_match, xbd_xenbus_attach, xbd_xenbus_detach, NULL, NULL, NULL,
    DVF_DETACH_SHUTDOWN);

static dev_type_open(xbdopen);
static dev_type_close(xbdclose);
static dev_type_read(xbdread);
static dev_type_write(xbdwrite);
static dev_type_ioctl(xbdioctl);
static dev_type_strategy(xbdstrategy);
static dev_type_dump(xbddump);
static dev_type_size(xbdsize);

const struct bdevsw xbd_bdevsw = {
	.d_open = xbdopen,
	.d_close = xbdclose,
	.d_strategy = xbdstrategy,
	.d_ioctl = xbdioctl,
	.d_dump = xbddump,
	.d_psize = xbdsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK | D_MPSAFE
};

const struct cdevsw xbd_cdevsw = {
	.d_open = xbdopen,
	.d_close = xbdclose,
	.d_read = xbdread,
	.d_write = xbdwrite,
	.d_ioctl = xbdioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK | D_MPSAFE
};

extern struct cfdriver xbd_cd;

static const struct dkdriver xbddkdriver = {
	.d_strategy = xbdstrategy,
	.d_minphys = xbdminphys,
	.d_open = xbdopen,
	.d_close = xbdclose,
	.d_diskstart = xbd_diskstart,
	.d_iosize = xbd_iosize,
};

static int
xbd_xenbus_match(device_t parent, cfdata_t match, void *aux)
{
	struct xenbusdev_attach_args *xa = aux;

	if (strcmp(xa->xa_type, "vbd") != 0)
		return 0;

	if (match->cf_loc[XENBUSCF_ID] != XENBUSCF_ID_DEFAULT &&
	    match->cf_loc[XENBUSCF_ID] != xa->xa_id)
		return 0;

	return 1;
}

static void
xbd_xenbus_attach(device_t parent, device_t self, void *aux)
{
	struct xbd_xenbus_softc *sc = device_private(self);
	struct xenbusdev_attach_args *xa = aux;
	blkif_sring_t *ring;
	RING_IDX i;

	config_pending_incr(self);
	aprint_normal(": Xen Virtual Block Device Interface\n");

	dk_init(&sc->sc_dksc, self, DKTYPE_ESDI);
	disk_init(&sc->sc_dksc.sc_dkdev, device_xname(self), &xbddkdriver);

	sc->sc_xbusd = xa->xa_xbusd;
	sc->sc_xbusd->xbusd_otherend_changed = xbd_backend_changed;

	mutex_init(&sc->sc_lock, MUTEX_DEFAULT, IPL_BIO);
	cv_init(&sc->sc_cache_flush_cv, "xbdsync");
	cv_init(&sc->sc_req_cv, "xbdreq");
	cv_init(&sc->sc_detach_cv, "xbddetach");
	cv_init(&sc->sc_suspend_cv, "xbdsuspend");

	xbd_features(sc);

	/* initialize free requests list */
	SLIST_INIT(&sc->sc_xbdreq_head);
	for (i = 0; i < XBD_RING_SIZE; i++) {
		sc->sc_reqs[i].req_id = i;
		SLIST_INSERT_HEAD(&sc->sc_xbdreq_head, &sc->sc_reqs[i],
		    req_next);
	}

	if (sc->sc_features & BLKIF_FEATURE_INDIRECT) {
		/* initialize indirect page list */
		for (i = 0; i < XBD_RING_SIZE; i++) {
			vmem_addr_t va;
			if (uvm_km_kmem_alloc(kmem_va_arena,
			    PAGE_SIZE, VM_SLEEP | VM_INSTANTFIT, &va) != 0) {
				aprint_error_dev(self,
				    "can't alloc indirect pages\n");
				return;
			}
			sc->sc_indirect[i].in_addr = (void *)va;
			SLIST_INSERT_HEAD(&sc->sc_indirect_head,
			    &sc->sc_indirect[i], in_next);
		}
	}

	sc->sc_backend_status = BLKIF_STATE_DISCONNECTED;
	sc->sc_shutdown = BLKIF_SHUTDOWN_REMOTE;

	ring = (void *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_WIRED);
	if (ring == NULL)
		panic("%s: can't alloc ring", device_xname(self));
	sc->sc_ring.sring = ring;

	evcnt_attach_dynamic(&sc->sc_cnt_map_unalign, EVCNT_TYPE_MISC,
	    NULL, device_xname(self), "map unaligned");
	evcnt_attach_dynamic(&sc->sc_cnt_unalign_busy, EVCNT_TYPE_MISC,
	    NULL, device_xname(self), "map unaligned");
	evcnt_attach_dynamic(&sc->sc_cnt_queue_full, EVCNT_TYPE_MISC,
	    NULL, device_xname(self), "queue full");
	evcnt_attach_dynamic(&sc->sc_cnt_indirect, EVCNT_TYPE_MISC,
	    NULL, device_xname(self), "indirect segment");

	for (i = 0; i < XBD_RING_SIZE; i++) {
		if (bus_dmamap_create(sc->sc_xbusd->xbusd_dmat,
		    MAXPHYS, XBD_XFER_LIMIT >> PAGE_SHIFT,
		    PAGE_SIZE, PAGE_SIZE, BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW,
		    &sc->sc_reqs[i].req_dmamap) != 0) {
			aprint_error_dev(self, "can't alloc dma maps\n");
			return;
		}
	}
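
	/*
	 * Single MAXPHYS-sized bounce buffer, used by xbd_map_align() when
	 * a caller hands us data that is not aligned to the sector size.
	 * Only one unaligned transfer can be in flight at a time (see
	 * sc_unalign_used).
	 */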
	if (uvm_km_kmem_alloc(kmem_va_arena,
	    MAXPHYS, VM_SLEEP | VM_INSTANTFIT, &sc->sc_unalign_buffer) != 0) {
		aprint_error_dev(self, "can't alloc align buffer\n");
		return;
	}

	/* resume shared structures and tell backend that we are ready */
	if (xbd_xenbus_resume(self, PMF_Q_NONE) == false) {
		uvm_km_free(kernel_map, (vaddr_t)ring, PAGE_SIZE,
		    UVM_KMF_WIRED);
		return;
	}

	if (!pmf_device_register(self, xbd_xenbus_suspend, xbd_xenbus_resume))
		aprint_error_dev(self, "couldn't establish power handler\n");
}

static int
xbd_xenbus_detach(device_t dev, int flags)
{
	struct xbd_xenbus_softc *sc = device_private(dev);
	int bmaj, cmaj, i, mn, rc;

	DPRINTF(("%s: xbd_detach\n", device_xname(dev)));

	rc = disk_begindetach(&sc->sc_dksc.sc_dkdev, NULL, dev, flags);
	if (rc != 0)
		return rc;

	mutex_enter(&sc->sc_lock);
	if (sc->sc_shutdown == BLKIF_SHUTDOWN_RUN) {
		sc->sc_shutdown = BLKIF_SHUTDOWN_LOCAL;

		/* wait for requests to complete */
		while (sc->sc_backend_status == BLKIF_STATE_CONNECTED &&
		    disk_isbusy(&sc->sc_dksc.sc_dkdev)) {
			cv_timedwait(&sc->sc_detach_cv, &sc->sc_lock, hz/2);
		}
		mutex_exit(&sc->sc_lock);

		/* Trigger state transition with backend */
		xenbus_switch_state(sc->sc_xbusd, NULL, XenbusStateClosing);

		mutex_enter(&sc->sc_lock);
	}
	if ((flags & DETACH_FORCE) == 0) {
		/* xbd_xenbus_detach already in progress */
		cv_broadcast(&sc->sc_detach_cv);
		mutex_exit(&sc->sc_lock);
		return EALREADY;
	}
	mutex_exit(&sc->sc_lock);
	while (xenbus_read_driver_state(sc->sc_xbusd->xbusd_otherend)
	    != XenbusStateClosed) {
		mutex_enter(&sc->sc_lock);
		cv_timedwait(&sc->sc_detach_cv, &sc->sc_lock, hz/2);
		mutex_exit(&sc->sc_lock);
	}

	/* locate the major number */
	bmaj = bdevsw_lookup_major(&xbd_bdevsw);
	cmaj = cdevsw_lookup_major(&xbd_cdevsw);

	/* Nuke the vnodes for any open instances. */
	for (i = 0; i < MAXPARTITIONS; i++) {
		mn = DISKMINOR(device_unit(dev), i);
		vdevgone(bmaj, mn, mn, VBLK);
		vdevgone(cmaj, mn, mn, VCHR);
	}

	if (sc->sc_backend_status == BLKIF_STATE_CONNECTED) {
		/* Delete all of our wedges. */
		dkwedge_delall(&sc->sc_dksc.sc_dkdev);

		/* Kill off any queued buffers. */
		dk_drain(&sc->sc_dksc);
		bufq_free(sc->sc_dksc.sc_bufq);

		/* detach disk */
		disk_detach(&sc->sc_dksc.sc_dkdev);
		disk_destroy(&sc->sc_dksc.sc_dkdev);
		dk_detach(&sc->sc_dksc);
	}

	hypervisor_mask_event(sc->sc_evtchn);
	if (sc->sc_ih != NULL) {
		xen_intr_disestablish(sc->sc_ih);
		sc->sc_ih = NULL;
	}
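
	/*
	 * Wait for the backend to release the ring grant before revoking
	 * it and freeing the ring page.
	 */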
	mutex_enter(&sc->sc_lock);
	while (xengnt_status(sc->sc_ring_gntref))
		cv_timedwait(&sc->sc_detach_cv, &sc->sc_lock, hz/2);
	mutex_exit(&sc->sc_lock);

	xengnt_revoke_access(sc->sc_ring_gntref);
	uvm_km_free(kernel_map, (vaddr_t)sc->sc_ring.sring,
	    PAGE_SIZE, UVM_KMF_WIRED);

	for (i = 0; i < XBD_RING_SIZE; i++) {
		if (sc->sc_reqs[i].req_dmamap != NULL) {
			bus_dmamap_destroy(sc->sc_xbusd->xbusd_dmat,
			    sc->sc_reqs[i].req_dmamap);
			sc->sc_reqs[i].req_dmamap = NULL;
		}
	}

	if (sc->sc_unalign_buffer != 0) {
		uvm_km_kmem_free(kmem_va_arena, sc->sc_unalign_buffer, MAXPHYS);
		sc->sc_unalign_buffer = 0;
	}

	mutex_destroy(&sc->sc_lock);

	evcnt_detach(&sc->sc_cnt_map_unalign);
	evcnt_detach(&sc->sc_cnt_unalign_busy);
	evcnt_detach(&sc->sc_cnt_queue_full);
	evcnt_detach(&sc->sc_cnt_indirect);

	pmf_device_deregister(dev);

	return 0;
}

static bool
xbd_xenbus_suspend(device_t dev, const pmf_qual_t *qual) {

	struct xbd_xenbus_softc *sc;

	sc = device_private(dev);

	mutex_enter(&sc->sc_lock);
	/* wait for requests to complete, then suspend device */
	while (sc->sc_backend_status == BLKIF_STATE_CONNECTED &&
	    disk_isbusy(&sc->sc_dksc.sc_dkdev)) {
		cv_timedwait(&sc->sc_suspend_cv, &sc->sc_lock, hz/2);
	}

	hypervisor_mask_event(sc->sc_evtchn);
	sc->sc_backend_status = BLKIF_STATE_SUSPENDED;

#ifdef DIAGNOSTIC
	/* Check that all requests are finished and device ready for resume */
	int reqcnt = 0;
	struct xbd_req *req;
	SLIST_FOREACH(req, &sc->sc_xbdreq_head, req_next)
		reqcnt++;
	KASSERT(reqcnt == __arraycount(sc->sc_reqs));

	int incnt = 0;
	struct xbd_indirect *in;
	SLIST_FOREACH(in, &sc->sc_indirect_head, in_next)
		incnt++;
	KASSERT(incnt == __arraycount(sc->sc_indirect));
#endif

	mutex_exit(&sc->sc_lock);

	xenbus_device_suspend(sc->sc_xbusd);
	aprint_verbose_dev(dev, "removed event channel %d\n", sc->sc_evtchn);

	return true;
}

static bool
xbd_xenbus_resume(device_t dev, const pmf_qual_t *qual)
{
	struct xbd_xenbus_softc *sc;
	struct xenbus_transaction *xbt;
	int error;
	blkif_sring_t *ring;
	paddr_t ma;
	const char *errmsg;

	sc = device_private(dev);

	/* All grants were removed during suspend */
	sc->sc_ring_gntref = GRANT_INVALID_REF;

	/* Initialize ring */
	ring = sc->sc_ring.sring;
	memset(ring, 0, PAGE_SIZE);
	SHARED_RING_INIT(ring);
	FRONT_RING_INIT(&sc->sc_ring, ring, PAGE_SIZE);

	/*
	 * get MA address of the ring, and use it to set up the grant entry
	 * for the block device
	 */
	(void)pmap_extract_ma(pmap_kernel(), (vaddr_t)ring, &ma);
	error = xenbus_grant_ring(sc->sc_xbusd, ma, &sc->sc_ring_gntref);
	if (error)
		goto abort_resume;

	if (sc->sc_features & BLKIF_FEATURE_INDIRECT) {
		for (int i = 0; i < XBD_RING_SIZE; i++) {
			vaddr_t va = (vaddr_t)sc->sc_indirect[i].in_addr;
			KASSERT(va != 0);
			KASSERT((va & PAGE_MASK) == 0);
			(void)pmap_extract_ma(pmap_kernel(), va, &ma);
			if (xengnt_grant_access(
			    sc->sc_xbusd->xbusd_otherend_id,
			    ma, true, &sc->sc_indirect[i].in_gntref)) {
				aprint_error_dev(dev,
				    "indirect page grant failed\n");
				goto abort_resume;
			}
		}
	}

	error = xenbus_alloc_evtchn(sc->sc_xbusd, &sc->sc_evtchn);
	if (error)
		goto abort_resume;

	if (sc->sc_ih != NULL) {
		xen_intr_disestablish(sc->sc_ih);
		sc->sc_ih = NULL;
	}
	aprint_verbose_dev(dev, "using event channel %d\n",
	    sc->sc_evtchn);
	sc->sc_ih = xen_intr_establish_xname(-1, &xen_pic, sc->sc_evtchn,
	    IST_LEVEL, IPL_BIO, &xbd_handler, sc, true, device_xname(dev));
	KASSERT(sc->sc_ih != NULL);
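
	/*
	 * Publish the ring reference, the event channel and the ring ABI in
	 * our xenstore directory so the backend can map the ring and bind
	 * the channel; the transaction is retried if it races with another
	 * writer (EAGAIN).
	 */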
again:
	xbt = xenbus_transaction_start();
	if (xbt == NULL)
		return false;
	error = xenbus_printf(xbt, sc->sc_xbusd->xbusd_path,
	    "ring-ref","%u", sc->sc_ring_gntref);
	if (error) {
		errmsg = "writing ring-ref";
		goto abort_transaction;
	}
	error = xenbus_printf(xbt, sc->sc_xbusd->xbusd_path,
	    "event-channel", "%u", sc->sc_evtchn);
	if (error) {
		errmsg = "writing event channel";
		goto abort_transaction;
	}
	error = xenbus_printf(xbt, sc->sc_xbusd->xbusd_path,
	    "protocol", "%s", XEN_IO_PROTO_ABI_NATIVE);
	if (error) {
		errmsg = "writing protocol";
		goto abort_transaction;
	}
	error = xenbus_transaction_end(xbt, 0);
	if (error == EAGAIN)
		goto again;
	if (error != 0) {
		xenbus_dev_fatal(sc->sc_xbusd, error,
		    "completing transaction");
		return false;
	}

	xenbus_switch_state(sc->sc_xbusd, NULL, XenbusStateInitialised);

	if (sc->sc_backend_status == BLKIF_STATE_SUSPENDED) {
		/*
		 * device was suspended, softc structures are
		 * already initialized - we use a shortcut
		 */
		sc->sc_backend_status = BLKIF_STATE_CONNECTED;
		xenbus_device_resume(sc->sc_xbusd);
		hypervisor_unmask_event(sc->sc_evtchn);
		xenbus_switch_state(sc->sc_xbusd, NULL, XenbusStateConnected);
	}

	return true;

abort_resume:
	xenbus_dev_fatal(sc->sc_xbusd, error, "resuming device");
	return false;

abort_transaction:
	xenbus_transaction_end(xbt, 1);
	xenbus_dev_fatal(sc->sc_xbusd, error, "%s", errmsg);
	return false;
}

static void
xbd_backend_changed(void *arg, XenbusState new_state)
{
	struct xbd_xenbus_softc *sc = device_private((device_t)arg);
	struct disk_geom *dg;

	char buf[64];
	DPRINTF(("%s: new backend state %d\n",
	    device_xname(sc->sc_dksc.sc_dev), new_state));

	switch (new_state) {
	case XenbusStateUnknown:
	case XenbusStateInitialising:
	case XenbusStateInitWait:
	case XenbusStateInitialised:
		break;
	case XenbusStateClosing:
		mutex_enter(&sc->sc_lock);
		if (sc->sc_shutdown == BLKIF_SHUTDOWN_RUN)
			sc->sc_shutdown = BLKIF_SHUTDOWN_REMOTE;
		/* wait for requests to complete */
		while (sc->sc_backend_status == BLKIF_STATE_CONNECTED &&
		    disk_isbusy(&sc->sc_dksc.sc_dkdev)) {
			cv_timedwait(&sc->sc_detach_cv, &sc->sc_lock, hz/2);
		}
		mutex_exit(&sc->sc_lock);
		xenbus_switch_state(sc->sc_xbusd, NULL, XenbusStateClosed);
		break;
	case XenbusStateConnected:
		/*
		 * note that xbd_backend_changed() can only be called by
		 * the xenbus thread.
		 */

		if (sc->sc_backend_status == BLKIF_STATE_CONNECTED ||
		    sc->sc_backend_status == BLKIF_STATE_SUSPENDED)
			/* already connected */
			return;

		xbd_connect(sc);
		sc->sc_shutdown = BLKIF_SHUTDOWN_RUN;
		sc->sc_xbdsize =
		    sc->sc_sectors * (uint64_t)sc->sc_secsize / DEV_BSIZE;
		dg = &sc->sc_dksc.sc_dkdev.dk_geom;
		memset(dg, 0, sizeof(*dg));

		dg->dg_secperunit = sc->sc_sectors;
		dg->dg_secsize = sc->sc_secsize;
		dg->dg_ntracks = 1;
		dg->dg_nsectors = (1024 * 1024) / dg->dg_secsize;
		dg->dg_ncylinders = dg->dg_secperunit / dg->dg_nsectors;

		bufq_alloc(&sc->sc_dksc.sc_bufq, "fcfs", 0);
		dk_attach(&sc->sc_dksc);
		disk_attach(&sc->sc_dksc.sc_dkdev);

		sc->sc_backend_status = BLKIF_STATE_CONNECTED;
		hypervisor_unmask_event(sc->sc_evtchn);

		format_bytes(buf, uimin(9, sizeof(buf)),
		    sc->sc_sectors * dg->dg_secsize);
		aprint_normal_dev(sc->sc_dksc.sc_dev,
		    "%s, %d bytes/sect x %" PRIu64 " sectors\n",
		    buf, (int)dg->dg_secsize, sc->sc_sectors);
		snprintb(buf, sizeof(buf), BLKIF_FEATURE_BITS,
		    sc->sc_features);
		aprint_normal_dev(sc->sc_dksc.sc_dev,
		    "backend features %s\n", buf);

		/* Discover wedges on this disk. */
		dkwedge_discover(&sc->sc_dksc.sc_dkdev);

		disk_set_info(sc->sc_dksc.sc_dev, &sc->sc_dksc.sc_dkdev, NULL);

		/* the disk should be working now */
		config_pending_decr(sc->sc_dksc.sc_dev);
		break;
	default:
		panic("bad backend state %d", new_state);
	}
}

static void
xbd_connect(struct xbd_xenbus_softc *sc)
{
	int err;
	unsigned long long sectors;
	u_long val;

	/*
	 * Must read feature-persistent later, e.g. Linux Dom0 writes
	 * this together with the device info.
	 */
	err = xenbus_read_ul(NULL, sc->sc_xbusd->xbusd_otherend,
	    "feature-persistent", &val, 10);
	if (err)
		val = 0;
	if (val > 0)
		sc->sc_features |= BLKIF_FEATURE_PERSISTENT;

	err = xenbus_read_ul(NULL,
	    sc->sc_xbusd->xbusd_path, "virtual-device", &sc->sc_handle, 10);
	if (err)
		panic("%s: can't read number from %s/virtual-device\n",
		    device_xname(sc->sc_dksc.sc_dev),
		    sc->sc_xbusd->xbusd_otherend);
	err = xenbus_read_ul(NULL,
	    sc->sc_xbusd->xbusd_otherend, "info", &sc->sc_info, 10);
	if (err)
		panic("%s: can't read number from %s/info\n",
		    device_xname(sc->sc_dksc.sc_dev),
		    sc->sc_xbusd->xbusd_otherend);
	err = xenbus_read_ul(NULL,
	    sc->sc_xbusd->xbusd_otherend, "sector-size", &sc->sc_secsize, 10);
	if (err)
		panic("%s: can't read number from %s/sector-size\n",
		    device_xname(sc->sc_dksc.sc_dev),
		    sc->sc_xbusd->xbusd_otherend);

	err = xenbus_read_ull(NULL,
	    sc->sc_xbusd->xbusd_otherend, "sectors", &sectors, 10);
	if (err)
		panic("%s: can't read number from %s/sectors\n",
		    device_xname(sc->sc_dksc.sc_dev),
		    sc->sc_xbusd->xbusd_otherend);
	sc->sc_sectors = sectors * (uint64_t)XEN_BSIZE / sc->sc_secsize;

	xenbus_switch_state(sc->sc_xbusd, NULL, XenbusStateConnected);
}

static void
xbd_features(struct xbd_xenbus_softc *sc)
{
	int err;
	u_long val;

	err = xenbus_read_ul(NULL, sc->sc_xbusd->xbusd_otherend,
	    "feature-flush-cache", &val, 10);
	if (err)
		val = 0;
	if (val > 0)
		sc->sc_features |= BLKIF_FEATURE_CACHE_FLUSH;

	err = xenbus_read_ul(NULL, sc->sc_xbusd->xbusd_otherend,
	    "feature-barrier", &val, 10);
	if (err)
		val = 0;
	if (val > 0)
		sc->sc_features |= BLKIF_FEATURE_BARRIER;

	err = xenbus_read_ul(NULL, sc->sc_xbusd->xbusd_otherend,
	    "feature-max-indirect-segments", &val, 10);
	if (err)
		val = 0;
	if (val >= (MAXPHYS >> PAGE_SHIFT) + 1) {
		/* We can use indirect segments, the limit is big enough */
		sc->sc_features |= BLKIF_FEATURE_INDIRECT;
	}
}
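
/*
 * Interrupt handler for the block event channel: walk the response ring,
 * match each blkif response back to its struct xbd_req via rep->id, tear
 * down the grants and the DMA map, and complete the buf with dk_done().
 * Runs at IPL_BIO; sc_lock is held while the ring and free lists are
 * manipulated.
 */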
static int
xbd_handler(void *arg)
{
	struct xbd_xenbus_softc *sc = arg;
	struct buf *bp;
	RING_IDX resp_prod, i;
	int more_to_do;
	int seg;
	grant_ref_t gntref;

	DPRINTF(("xbd_handler(%s)\n", device_xname(sc->sc_dksc.sc_dev)));

	if (__predict_false(sc->sc_backend_status != BLKIF_STATE_CONNECTED))
		return 0;

	mutex_enter(&sc->sc_lock);
again:
	resp_prod = sc->sc_ring.sring->rsp_prod;
	xen_rmb(); /* ensure we see replies up to resp_prod */
	for (i = sc->sc_ring.rsp_cons; i != resp_prod; i++) {
		blkif_response_t *rep = RING_GET_RESPONSE(&sc->sc_ring, i);
		struct xbd_req *xbdreq = &sc->sc_reqs[rep->id];

		if (rep->operation == BLKIF_OP_FLUSH_DISKCACHE) {
			KASSERT(xbdreq->req_bp == NULL);
			xbdreq->req_sync.s_error = rep->status;
			xbdreq->req_sync.s_done = 1;
			cv_broadcast(&sc->sc_cache_flush_cv);
			/* caller will free the req */
			continue;
		}

		if (rep->operation != BLKIF_OP_READ &&
		    rep->operation != BLKIF_OP_WRITE) {
			aprint_error_dev(sc->sc_dksc.sc_dev,
			    "bad operation %d from backend\n", rep->operation);
			continue;
		}

		bp = xbdreq->req_bp;
		xbdreq->req_bp = NULL;
		KASSERT(bp != NULL && bp->b_data != NULL);
		DPRINTF(("%s(%p): b_bcount = %ld\n", __func__,
		    bp, (long)bp->b_bcount));

		if (bp->b_error != 0 || rep->status != BLKIF_RSP_OKAY) {
			DPRINTF(("%s: error %d status %d\n", __func__,
			    bp->b_error, rep->status));
			bp->b_error = EIO;
			bp->b_resid = bp->b_bcount;
		}

		if (xbdreq->req_parent) {
			struct xbd_req *req_parent = xbdreq->req_parent;

			/* Unhook and recycle child */
			xbdreq->req_parent = NULL;
			req_parent->req_child = NULL;
			SLIST_INSERT_HEAD(&sc->sc_xbdreq_head, xbdreq,
			    req_next);

			if (!req_parent->req_parent_done) {
				/* Finished before parent, nothing else to do */
				continue;
			}

			/* Must do the cleanup now */
			xbdreq = req_parent;
		}
		if (xbdreq->req_child) {
			/* Finished before child, child will cleanup */
			xbdreq->req_parent_done = true;
			continue;
		}

		if (bp->b_error == 0)
			bp->b_resid = 0;

		KASSERT(xbdreq->req_dmamap->dm_nsegs > 0);
		for (seg = 0; seg < xbdreq->req_dmamap->dm_nsegs; seg++) {
			/*
			 * We are not allowing persistent mappings, so
			 * expect the backend to release the grant
			 * immediately.
			 */
			if (xbdreq->req_indirect) {
				gntref =
				    xbdreq->req_indirect->in_addr[seg].gref;
			} else
				gntref = xbdreq->req_gntref[seg];
			KASSERT(xengnt_status(gntref) == 0);
			xengnt_revoke_access(gntref);
		}

		bus_dmamap_unload(sc->sc_xbusd->xbusd_dmat, xbdreq->req_dmamap);

		if (__predict_false(bp->b_data != xbdreq->req_data))
			xbd_unmap_align(sc, xbdreq, bp);
		xbdreq->req_data = NULL;

		dk_done(&sc->sc_dksc, bp);

		if (xbdreq->req_indirect) {
			/* No persistent mappings, so check that
			 * backend unmapped the indirect segment grant too.
			 */
			KASSERT(xengnt_status(xbdreq->req_indirect->in_gntref)
			    == 0);
			SLIST_INSERT_HEAD(&sc->sc_indirect_head,
			    xbdreq->req_indirect, in_next);
			xbdreq->req_indirect = NULL;
		}
		SLIST_INSERT_HEAD(&sc->sc_xbdreq_head, xbdreq, req_next);
	}
	sc->sc_ring.rsp_cons = i;

	xen_wmb();
	RING_FINAL_CHECK_FOR_RESPONSES(&sc->sc_ring, more_to_do);
	if (more_to_do)
		goto again;

	cv_signal(&sc->sc_req_cv);
	mutex_exit(&sc->sc_lock);

	dk_start(&sc->sc_dksc, NULL);

	return 1;
}

static void
xbdminphys(struct buf *bp)
{
	if (bp->b_bcount > XBD_XFER_LIMIT) {
		bp->b_bcount = XBD_XFER_LIMIT;
	}
	minphys(bp);
}

static void
xbd_iosize(device_t dev, int *maxxfer)
{
	/*
	 * Always restrict dumps to XBD_MAX_XFER to avoid indirect segments,
	 * so that it uses as little memory as possible.
	 */
	if (*maxxfer > XBD_MAX_XFER)
		*maxxfer = XBD_MAX_XFER;
}

static int
xbdopen(dev_t dev, int flags, int fmt, struct lwp *l)
{
	struct xbd_xenbus_softc *sc;

	sc = device_lookup_private(&xbd_cd, DISKUNIT(dev));
	if (sc == NULL)
		return (ENXIO);
	if ((flags & FWRITE) && (sc->sc_info & VDISK_READONLY))
		return EROFS;

	DPRINTF(("xbdopen(%" PRIx64 ", %d)\n", dev, flags));
	return dk_open(&sc->sc_dksc, dev, flags, fmt, l);
}

static int
xbdclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	struct xbd_xenbus_softc *sc;

	sc = device_lookup_private(&xbd_cd, DISKUNIT(dev));

	DPRINTF(("xbdclose(%" PRIx64 ", %d)\n", dev, flags));
	return dk_close(&sc->sc_dksc, dev, flags, fmt, l);
}

static void
xbdstrategy(struct buf *bp)
{
	struct xbd_xenbus_softc *sc;

	sc = device_lookup_private(&xbd_cd, DISKUNIT(bp->b_dev));

	DPRINTF(("xbdstrategy(%p): b_bcount = %ld\n", bp,
	    (long)bp->b_bcount));

	if (sc == NULL || sc->sc_shutdown != BLKIF_SHUTDOWN_RUN) {
		bp->b_error = EIO;
		biodone(bp);
		return;
	}
	if (__predict_false((sc->sc_info & VDISK_READONLY) &&
	    (bp->b_flags & B_READ) == 0)) {
		bp->b_error = EROFS;
		biodone(bp);
		return;
	}

	dk_strategy(&sc->sc_dksc, bp);
	return;
}

static int
xbdsize(dev_t dev)
{
	struct xbd_xenbus_softc *sc;

	DPRINTF(("xbdsize(%" PRIx64 ")\n", dev));

	sc = device_lookup_private(&xbd_cd, DISKUNIT(dev));
	if (sc == NULL || sc->sc_shutdown != BLKIF_SHUTDOWN_RUN)
		return -1;
	return dk_size(&sc->sc_dksc, dev);
}

static int
xbdread(dev_t dev, struct uio *uio, int flags)
{
	struct xbd_xenbus_softc *sc =
	    device_lookup_private(&xbd_cd, DISKUNIT(dev));
	struct dk_softc *dksc = &sc->sc_dksc;

	if (!DK_ATTACHED(dksc))
		return ENXIO;
	return physio(xbdstrategy, NULL, dev, B_READ, xbdminphys, uio);
}

static int
xbdwrite(dev_t dev, struct uio *uio, int flags)
{
	struct xbd_xenbus_softc *sc =
	    device_lookup_private(&xbd_cd, DISKUNIT(dev));
	struct dk_softc *dksc = &sc->sc_dksc;

	if (!DK_ATTACHED(dksc))
		return ENXIO;
	if (__predict_false(sc->sc_info & VDISK_READONLY))
		return EROFS;
	return physio(xbdstrategy, NULL, dev, B_WRITE, xbdminphys, uio);
}

static int
xbdioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
	struct xbd_xenbus_softc *sc =
	    device_lookup_private(&xbd_cd, DISKUNIT(dev));
	struct dk_softc *dksc;
	int error;
	struct xbd_req *xbdreq;
	blkif_request_t *req;
	int notify;

	DPRINTF(("xbdioctl(%" PRIx64 ", %08lx, %p, %d, %p)\n",
	    dev, cmd, data, flag, l));
	dksc = &sc->sc_dksc;

	switch (cmd) {
	case DIOCGCACHE:
	    {
		/* Assume there is write cache if cache-flush is supported */
		int *bitsp = (int *)data;
		*bitsp = 0;
		if (sc->sc_features & BLKIF_FEATURE_CACHE_FLUSH)
			*bitsp |= DKCACHE_WRITE;
		error = 0;
		break;
	    }
	case DIOCCACHESYNC:
		if ((sc->sc_features & BLKIF_FEATURE_CACHE_FLUSH) == 0)
			return EOPNOTSUPP;

		mutex_enter(&sc->sc_lock);
		while ((xbdreq = SLIST_FIRST(&sc->sc_xbdreq_head)) == NULL)
			cv_wait(&sc->sc_req_cv, &sc->sc_lock);
		KASSERT(!RING_FULL(&sc->sc_ring));

		SLIST_REMOVE_HEAD(&sc->sc_xbdreq_head, req_next);
		req = RING_GET_REQUEST(&sc->sc_ring,
		    sc->sc_ring.req_prod_pvt);
		req->id = xbdreq->req_id;
		req->operation = BLKIF_OP_FLUSH_DISKCACHE;
		req->handle = sc->sc_handle;
		xbdreq->req_sync.s_done = 0;
		sc->sc_ring.req_prod_pvt++;
		RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&sc->sc_ring, notify);
		if (notify)
			hypervisor_notify_via_evtchn(sc->sc_evtchn);
		/* request sent, now wait for completion */
		while (xbdreq->req_sync.s_done == 0)
			cv_wait(&sc->sc_cache_flush_cv, &sc->sc_lock);

		if (xbdreq->req_sync.s_error == BLKIF_RSP_EOPNOTSUPP)
			error = EOPNOTSUPP;
		else if (xbdreq->req_sync.s_error == BLKIF_RSP_OKAY)
			error = 0;
		else
			error = EIO;
		SLIST_INSERT_HEAD(&sc->sc_xbdreq_head, xbdreq, req_next);
		cv_signal(&sc->sc_req_cv);
		mutex_exit(&sc->sc_lock);

		/* Restart I/O if it was waiting for req */
		dk_start(&sc->sc_dksc, NULL);
		break;

	default:
		error = dk_ioctl(dksc, dev, cmd, data, flag, l);
		break;
	}

	return error;
}

static int
xbddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	struct xbd_xenbus_softc *sc;

	sc = device_lookup_private(&xbd_cd, DISKUNIT(dev));
	if (sc == NULL)
		return (ENXIO);

	DPRINTF(("xbddump(%" PRIx64 ", %" PRId64 ", %p, %lu)\n", dev, blkno, va,
	    (unsigned long)size));
	return dk_dump(&sc->sc_dksc, dev, blkno, va, size, 0);
}
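
/*
 * dk(9) start routine.  Take a free struct xbd_req, DMA-map and grant the
 * buffer's pages, then queue one request on the ring (or two when the
 * transfer exceeds XBD_MAX_CHUNK and indirect segments are not available).
 * Returning EAGAIN makes dk(9) retry the buf later, e.g. when no request
 * slot or bounce buffer is free.
 */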
static int
xbd_diskstart(device_t self, struct buf *bp)
{
	struct xbd_xenbus_softc *sc = device_private(self);
	struct xbd_req *xbdreq;
	int error = 0;
	int notify;

	KASSERT(bp->b_bcount <= MAXPHYS);

	DPRINTF(("xbd_diskstart(%p): b_bcount = %ld\n",
	    bp, (long)bp->b_bcount));

	mutex_enter(&sc->sc_lock);

	if (sc->sc_shutdown != BLKIF_SHUTDOWN_RUN) {
		error = EIO;
		goto out;
	}

	if (bp->b_rawblkno < 0 || bp->b_rawblkno > sc->sc_sectors) {
		/* invalid block number */
		error = EINVAL;
		goto out;
	}

	if (__predict_false(
	    sc->sc_backend_status == BLKIF_STATE_SUSPENDED)) {
		/* device is suspended, do not consume buffer */
		DPRINTF(("%s: (xbd_diskstart) device suspended\n",
		    sc->sc_dksc.sc_xname));
		error = EAGAIN;
		goto out;
	}

	xbdreq = SLIST_FIRST(&sc->sc_xbdreq_head);
	if (__predict_false(xbdreq == NULL)) {
		sc->sc_cnt_queue_full.ev_count++;
		DPRINTF(("xbd_diskstart: no req\n"));
		error = EAGAIN;
		goto out;
	}
	KASSERT(!RING_FULL(&sc->sc_ring));

	if ((sc->sc_features & BLKIF_FEATURE_INDIRECT) == 0
	    && bp->b_bcount > XBD_MAX_CHUNK) {
		if (!SLIST_NEXT(xbdreq, req_next)) {
			DPRINTF(("%s: need extra req\n", __func__));
			error = EAGAIN;
			goto out;
		}
	}

	bp->b_resid = bp->b_bcount;
	xbdreq->req_bp = bp;
	xbdreq->req_data = bp->b_data;
	if (__predict_false((vaddr_t)bp->b_data & (sc->sc_secsize - 1))) {
		if (__predict_false(xbd_map_align(sc, xbdreq) != 0)) {
			DPRINTF(("xbd_diskstart: no align\n"));
			error = EAGAIN;
			goto out;
		}
	}

	if (__predict_false(bus_dmamap_load(sc->sc_xbusd->xbusd_dmat,
	    xbdreq->req_dmamap, xbdreq->req_data, bp->b_bcount, NULL,
	    BUS_DMA_NOWAIT) != 0)) {
		printf("%s: %s: bus_dmamap_load failed\n",
		    device_xname(sc->sc_dksc.sc_dev), __func__);
		if (__predict_false(bp->b_data != xbdreq->req_data))
			xbd_unmap_align(sc, xbdreq, NULL);
		error = EINVAL;
		goto out;
	}
	KASSERTMSG(xbdreq->req_dmamap->dm_nsegs > 0,
	    "dm_nsegs == 0 with bcount %d", bp->b_bcount);

	for (int seg = 0; seg < xbdreq->req_dmamap->dm_nsegs; seg++) {
		KASSERT(seg < __arraycount(xbdreq->req_gntref));

		paddr_t ma = xbdreq->req_dmamap->dm_segs[seg].ds_addr;
		if (__predict_false(xengnt_grant_access(
		    sc->sc_xbusd->xbusd_otherend_id,
		    (ma & ~PAGE_MASK), (bp->b_flags & B_READ) == 0,
		    &xbdreq->req_gntref[seg]))) {
			printf("%s: %s: xengnt_grant_access failed\n",
			    device_xname(sc->sc_dksc.sc_dev), __func__);
			if (seg > 0) {
				for (; --seg >= 0; ) {
					xengnt_revoke_access(
					    xbdreq->req_gntref[seg]);
				}
			}
			bus_dmamap_unload(sc->sc_xbusd->xbusd_dmat,
			    xbdreq->req_dmamap);
			if (__predict_false(bp->b_data != xbdreq->req_data))
				xbd_unmap_align(sc, xbdreq, NULL);
			error = EAGAIN;
			goto out;
		}
	}

	KASSERT(xbdreq->req_parent == NULL);
	KASSERT(xbdreq->req_child == NULL);

	/* We are now committed to the transfer */
	SLIST_REMOVE_HEAD(&sc->sc_xbdreq_head, req_next);

	if ((sc->sc_features & BLKIF_FEATURE_INDIRECT) != 0 &&
	    bp->b_bcount > XBD_MAX_CHUNK) {
		xbd_diskstart_submit_indirect(sc, xbdreq, bp);
		goto push;
	}

	xbd_diskstart_submit(sc, xbdreq->req_id,
	    bp, 0, xbdreq->req_dmamap, xbdreq->req_gntref);

	if (bp->b_bcount > XBD_MAX_CHUNK) {
		KASSERT(!RING_FULL(&sc->sc_ring));
		struct xbd_req *xbdreq2 = SLIST_FIRST(&sc->sc_xbdreq_head);
		KASSERT(xbdreq2 != NULL); /* Checked earlier */
		SLIST_REMOVE_HEAD(&sc->sc_xbdreq_head, req_next);
		xbdreq->req_child = xbdreq2;
		xbdreq->req_parent_done = false;
		xbdreq2->req_parent = xbdreq;
		xbdreq2->req_bp = bp;
		xbdreq2->req_data = xbdreq->req_data;
		xbd_diskstart_submit(sc, xbdreq2->req_id,
		    bp, XBD_MAX_CHUNK, xbdreq->req_dmamap,
		    xbdreq->req_gntref);
	}

push:
	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&sc->sc_ring, notify);
	if (notify)
		hypervisor_notify_via_evtchn(sc->sc_evtchn);
out:
	mutex_exit(&sc->sc_lock);
	return error;
}
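
/*
 * Fill one blkif request covering at most XBD_MAX_CHUNK bytes of bp,
 * starting 'start' bytes into the transfer.  sector_number is always in
 * XEN_BSIZE (512-byte) units: for example, with 4096-byte sectors,
 * b_rawblkno == 10 and start == 32768, the request starts at
 * 10 * 4096 / 512 + (32768 >> 9) == 144.
 */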
static void
xbd_diskstart_submit(struct xbd_xenbus_softc *sc,
    int req_id, struct buf *bp, int start, bus_dmamap_t dmamap,
    grant_ref_t *gntref)
{
	blkif_request_t *req;
	paddr_t ma;
	int nsects, nbytes, dmaseg, first_sect, size, segidx = 0;
	struct blkif_request_segment *reqseg;

	KASSERT(mutex_owned(&sc->sc_lock));

	req = RING_GET_REQUEST(&sc->sc_ring, sc->sc_ring.req_prod_pvt);
	req->id = req_id;
	req->operation =
	    bp->b_flags & B_READ ? BLKIF_OP_READ : BLKIF_OP_WRITE;
	req->sector_number = (bp->b_rawblkno * sc->sc_secsize / XEN_BSIZE) +
	    (start >> XEN_BSHIFT);
	req->handle = sc->sc_handle;
	DPRINTF(("%s: id %" PRIu64 " op %d sn %" PRIu64 " handle %d\n",
	    __func__, req->id, req->operation, req->sector_number,
	    req->handle));

	size = uimin(bp->b_bcount - start, XBD_MAX_CHUNK);
	for (dmaseg = 0; dmaseg < dmamap->dm_nsegs && size > 0; dmaseg++) {
		bus_dma_segment_t *ds = &dmamap->dm_segs[dmaseg];

		ma = ds->ds_addr;
		nbytes = ds->ds_len;

		if (start > 0) {
			if (start >= nbytes) {
				start -= nbytes;
				continue;
			}
			ma += start;
			nbytes -= start;
			start = 0;
		}
		size -= nbytes;

		KASSERT(((ma & PAGE_MASK) & (sc->sc_secsize - 1)) == 0);
		KASSERT((nbytes & (sc->sc_secsize - 1)) == 0);
		KASSERT((size & (sc->sc_secsize - 1)) == 0);
		first_sect = (ma & PAGE_MASK) >> XEN_BSHIFT;
		nsects = nbytes >> XEN_BSHIFT;

		reqseg = &req->seg[segidx++];
		reqseg->first_sect = first_sect;
		reqseg->last_sect = first_sect + nsects - 1;
		KASSERT(reqseg->first_sect <= reqseg->last_sect);
		KASSERT(reqseg->last_sect < (PAGE_SIZE / XEN_BSIZE));
		DPRINTF(("%s: seg %d fs %d ls %d\n", __func__, segidx,
		    reqseg->first_sect, reqseg->last_sect));

		reqseg->gref = gntref[dmaseg];
	}
	KASSERT(segidx > 0);
	req->nr_segments = segidx;
	sc->sc_ring.req_prod_pvt++;
}
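
/*
 * Indirect variant: the segment descriptors are written into a separate,
 * already-granted indirect page instead of the request itself, which lets
 * a whole MAXPHYS transfer fit into a single ring request.
 */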
static void
xbd_diskstart_submit_indirect(struct xbd_xenbus_softc *sc,
    struct xbd_req *xbdreq, struct buf *bp)
{
	blkif_request_indirect_t *req;
	paddr_t ma;
	int nsects, nbytes, dmaseg, first_sect;
	struct blkif_request_segment *reqseg;

	KASSERT(mutex_owned(&sc->sc_lock));

	req = (blkif_request_indirect_t *)RING_GET_REQUEST(&sc->sc_ring,
	    sc->sc_ring.req_prod_pvt);
	req->id = xbdreq->req_id;
	req->operation = BLKIF_OP_INDIRECT;
	req->indirect_op =
	    bp->b_flags & B_READ ? BLKIF_OP_READ : BLKIF_OP_WRITE;
	req->sector_number = bp->b_rawblkno * sc->sc_secsize / XEN_BSIZE;
	req->handle = sc->sc_handle;
	DPRINTF(("%s: id %" PRIu64 " op %d sn %" PRIu64 " handle %d\n",
	    __func__, req->id, req->indirect_op, req->sector_number,
	    req->handle));

	xbdreq->req_indirect = SLIST_FIRST(&sc->sc_indirect_head);
	KASSERT(xbdreq->req_indirect != NULL);	/* always as many as reqs */
	SLIST_REMOVE_HEAD(&sc->sc_indirect_head, in_next);
	req->indirect_grefs[0] = xbdreq->req_indirect->in_gntref;

	reqseg = xbdreq->req_indirect->in_addr;
	for (dmaseg = 0; dmaseg < xbdreq->req_dmamap->dm_nsegs; dmaseg++) {
		bus_dma_segment_t *ds = &xbdreq->req_dmamap->dm_segs[dmaseg];

		ma = ds->ds_addr;
		nbytes = ds->ds_len;

		KASSERT(((ma & PAGE_MASK) & (sc->sc_secsize - 1)) == 0);
		KASSERT((nbytes & (sc->sc_secsize - 1)) == 0);

		first_sect = (ma & PAGE_MASK) >> XEN_BSHIFT;
		nsects = nbytes >> XEN_BSHIFT;

		reqseg->first_sect = first_sect;
		reqseg->last_sect = first_sect + nsects - 1;
		reqseg->gref = xbdreq->req_gntref[dmaseg];
		DPRINTF(("%s: seg %d fs %d ls %d\n", __func__, dmaseg,
		    reqseg->first_sect, reqseg->last_sect));

		KASSERT(reqseg->first_sect <= reqseg->last_sect);
		KASSERT(reqseg->last_sect < (PAGE_SIZE / XEN_BSIZE));

		reqseg++;
	}
	req->nr_segments = dmaseg;
	sc->sc_ring.req_prod_pvt++;

	sc->sc_cnt_indirect.ev_count++;
}

static int
xbd_map_align(struct xbd_xenbus_softc *sc, struct xbd_req *req)
{
	sc->sc_cnt_map_unalign.ev_count++;

	if (sc->sc_unalign_used) {
		sc->sc_cnt_unalign_busy.ev_count++;
		return EAGAIN;
	}
	sc->sc_unalign_used = req->req_bp;

	KASSERT(req->req_bp->b_bcount <= MAXPHYS);
	req->req_data = (void *)sc->sc_unalign_buffer;
	if ((req->req_bp->b_flags & B_READ) == 0)
		memcpy(req->req_data, req->req_bp->b_data,
		    req->req_bp->b_bcount);
	return 0;
}

static void
xbd_unmap_align(struct xbd_xenbus_softc *sc, struct xbd_req *req,
    struct buf *bp)
{
	KASSERT(!bp || sc->sc_unalign_used == bp);
	if (bp && bp->b_flags & B_READ)
		memcpy(bp->b_data, req->req_data, bp->b_bcount);
	sc->sc_unalign_used = NULL;
}