/*	$NetBSD: xenevt.c,v 1.68 2022/09/22 16:21:34 bouyer Exp $	*/

/*
 * Copyright (c) 2005 Manuel Bouyer.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: xenevt.c,v 1.68 2022/09/22 16:21:34 bouyer Exp $");

#include "opt_xen.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/mutex.h>
#include <sys/systm.h>
#include <sys/device.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/poll.h>
#include <sys/select.h>
#include <sys/proc.h>
#include <sys/conf.h>
#include <sys/intr.h>
#include <sys/kmem.h>

#include <uvm/uvm_extern.h>

#include <xen/hypervisor.h>
#include <xen/evtchn.h>
#include <xen/intr.h>
#ifdef XENPV
#include <xen/xenpmap.h>
#endif
#include <xen/xenio.h>
#include <xen/xenio3.h>
#include <xen/xen.h>

#include "ioconf.h"

/*
 * Interface between the event channel and userland.
 * Each process with a xenevt device instance open can register the events
 * it wants to receive.  It retrieves pending events with read(), possibly
 * blocking until one is available.  Pending events are acknowledged by
 * writing the corresponding channel numbers back to the device.  Special
 * operations (such as event binding) are done through ioctl().
 * Processes get a device instance by opening a cloning device.
 */
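
/*
 * A hypothetical userland client for the protocol above, kept under
 * #if 0 so it is not compiled into the kernel.  It assumes the
 * /dev/xenevt device node and that the xenio.h ioctl definitions are
 * visible to userland; the peer domain and remote port numbers are
 * made up, and error handling is trimmed.  Note the asymmetry in the
 * protocol: read() yields 32-bit evtchn_port_t values, while the
 * acknowledging write() takes 16-bit channel numbers.
 */
#if 0
#include <sys/ioctl.h>
#include <xen/xenio.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct ioctl_evtchn_bind_interdomain bind;
	uint32_t ports[64];	/* events arrive as 32-bit port numbers */
	uint16_t chan;		/* ... but are ack'd as 16-bit channels */
	ssize_t n, i;
	int fd;

	if ((fd = open("/dev/xenevt", O_RDWR)) == -1)
		return 1;

	/* bind to a channel offered by a (hypothetical) peer domain */
	bind.remote_domain = 1;
	bind.remote_port = 3;
	if (ioctl(fd, IOCTL_EVTCHN_BIND_INTERDOMAIN, &bind) == -1)
		return 1;
	printf("bound local port %u\n", bind.port);

	/* block until at least one event is pending */
	n = read(fd, ports, sizeof(ports));
	for (i = 0; i < n / (ssize_t)sizeof(ports[0]); i++) {
		printf("event on port %u\n", ports[i]);
		chan = (uint16_t)ports[i];
		(void)write(fd, &chan, sizeof(chan));	/* ack + unmask */
	}
	return 0;
}
#endif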

static int	xenevt_fread(struct file *, off_t *, struct uio *,
    kauth_cred_t, int);
static int	xenevt_fwrite(struct file *, off_t *, struct uio *,
    kauth_cred_t, int);
static int	xenevt_fioctl(struct file *, u_long, void *);
static int	xenevt_fpoll(struct file *, int);
static int	xenevt_fclose(struct file *);
/* static int	xenevt_fkqfilter(struct file *, struct knote *); */

static const struct fileops xenevt_fileops = {
	.fo_name = "xenevt",
	.fo_read = xenevt_fread,
	.fo_write = xenevt_fwrite,
	.fo_ioctl = xenevt_fioctl,
	.fo_fcntl = fnullop_fcntl,
	.fo_poll = xenevt_fpoll,
	.fo_stat = fbadop_stat,
	.fo_close = xenevt_fclose,
	.fo_kqfilter = /* xenevt_fkqfilter */ fnullop_kqfilter,
	.fo_restart = fnullop_restart,
};

dev_type_open(xenevtopen);
dev_type_read(xenevtread);
dev_type_mmap(xenevtmmap);
const struct cdevsw xenevt_cdevsw = {
	.d_open = xenevtopen,
	.d_close = nullclose,
	.d_read = xenevtread,
	.d_write = nowrite,
	.d_ioctl = noioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = xenevtmmap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_OTHER
};

/* minor numbers */
#define DEV_EVT 0
#define DEV_XSD 1

/* per-instance data */
#define XENEVT_RING_SIZE 2048
#define XENEVT_RING_MASK (XENEVT_RING_SIZE - 1)

#define BYTES_PER_PORT (sizeof(evtchn_port_t) / sizeof(uint8_t))

struct xenevt_d {
	kmutex_t lock;
	kcondvar_t cv;
	STAILQ_ENTRY(xenevt_d) pendingq;
	bool pending;
	evtchn_port_t ring[XENEVT_RING_SIZE];
	u_int ring_read;	/* pointer of the reader */
	u_int ring_write;	/* pointer of the writer */
	u_int flags;
#define XENEVT_F_OVERFLOW 0x01	/* ring overflow */
#define XENEVT_F_FREE 0x02	/* free entry */
	struct selinfo sel;	/* used by poll */
	struct cpu_info *ci;	/* preferred CPU for events for this device */
};

static struct intrhand *xenevt_ih;
static evtchn_port_t xenevt_ev;

/* event -> user device mapping */
static struct xenevt_d *devevent[NR_EVENT_CHANNELS];

/* pending events */
static void *devevent_sih;
static kmutex_t devevent_lock;
static STAILQ_HEAD(, xenevt_d) devevent_pending;

static void xenevt_record(struct xenevt_d *, evtchn_port_t);
static void xenevt_free(struct xenevt_d *);

/* pending events */
long xenevt_ev1;
long xenevt_ev2[NR_EVENT_CHANNELS];
static int xenevt_processevt(void *);

static evtchn_port_t
xenevt_alloc_event(void)
{
	evtchn_op_t op;
	op.cmd = EVTCHNOP_alloc_unbound;
	op.u.alloc_unbound.dom = DOMID_SELF;
	op.u.alloc_unbound.remote_dom = DOMID_SELF;
	if (HYPERVISOR_event_channel_op(&op) != 0)
		panic("%s: Failed to allocate loopback event\n", __func__);

	return op.u.alloc_unbound.port;
}

/* called at boot time */
void
xenevtattach(int n)
{
	int level = IPL_HIGH;

	if (!xendomain_is_privileged())
		return;
#ifndef XENPV
	if (vm_guest != VM_GUEST_XENPVH)
		return;
#endif

	mutex_init(&devevent_lock, MUTEX_DEFAULT, IPL_HIGH);
	STAILQ_INIT(&devevent_pending);

	devevent_sih = softint_establish(SOFTINT_SERIAL,
	    (void (*)(void *))xenevt_notify, NULL);
	memset(devevent, 0, sizeof(devevent));
	xenevt_ev1 = 0;
	memset(xenevt_ev2, 0, sizeof(xenevt_ev2));

	/*
	 * Allocate a loopback event port.
	 * The port itself is never notified; allocating it is what lets
	 * us register an interrupt handler at the desired IPL.
	 */
	xenevt_ev = xenevt_alloc_event();

	/*
	 * The real objective here is to hook into the interrupt handler
	 * chain for this IPL on vCPU 0 (events are bound to vCPU 0 by
	 * default).
	 */
	xenevt_ih = event_set_handler(xenevt_ev, xenevt_processevt, NULL,
	    level, NULL, "xenevt", true, &cpu_info_primary);

	KASSERT(xenevt_ih != NULL);
}

/* register a pending event - always called with interrupts disabled */
void
xenevt_setipending(int l1, int l2)
{
	KASSERT(curcpu() == xenevt_ih->ih_cpu);
	KASSERT(xenevt_ih->ih_cpu->ci_ilevel >= IPL_HIGH);
	atomic_or_ulong(&xenevt_ev1, 1UL << l1);
	atomic_or_ulong(&xenevt_ev2[l1], 1UL << l2);
	atomic_or_64(&xenevt_ih->ih_cpu->ci_istate, (1ULL << SIR_XENIPL_HIGH));
	atomic_add_int(&xenevt_ih->ih_pending, 1);
	evtsource[xenevt_ev]->ev_evcnt.ev_count++;
}

/* process pending events */
static int
xenevt_processevt(void *v)
{
	long l1, l2;
	int l1i, l2i;
	int port;

	l1 = xen_atomic_xchg(&xenevt_ev1, 0);
	while ((l1i = xen_ffs(l1)) != 0) {
		l1i--;
		l1 &= ~(1UL << l1i);
		l2 = xen_atomic_xchg(&xenevt_ev2[l1i], 0);
		while ((l2i = xen_ffs(l2)) != 0) {
			l2i--;
			l2 &= ~(1UL << l2i);
			port = (l1i << LONG_SHIFT) + l2i;
			xenevt_event(port);
		}
	}

	return 0;
}
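
/*
 * A minimal standalone sketch (not compiled) of the two-level scan
 * that xenevt_setipending() and xenevt_processevt() implement: the
 * level-1 word has one bit per level-2 word, and each level-2 word
 * has one bit per port, so a port number decodes as
 * (l1i << LONG_SHIFT) + l2i.  DEMO_LONG_SHIFT assumes 64-bit longs,
 * and __builtin_ffsl stands in for the kernel's xen_ffs().
 */
#if 0
#define DEMO_LONG_SHIFT 6	/* log2(64): 64 ports per level-2 word */

static void
scan_pending(unsigned long *l1p, unsigned long *l2p,
    void (*handler)(int))
{
	unsigned long l1, l2;
	int l1i, l2i;

	l1 = *l1p;		/* the driver uses an atomic xchg here */
	*l1p = 0;
	while ((l1i = __builtin_ffsl(l1)) != 0) {
		l1i--;
		l1 &= ~(1UL << l1i);
		l2 = l2p[l1i];
		l2p[l1i] = 0;
		while ((l2i = __builtin_ffsl(l2)) != 0) {
			l2i--;
			l2 &= ~(1UL << l2i);
			handler((l1i << DEMO_LONG_SHIFT) + l2i);
		}
	}
}
#endif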

/* event callback, called at splhigh() */
void
xenevt_event(int port)
{
	struct xenevt_d *d;

	mutex_enter(&devevent_lock);
	d = devevent[port];
	if (d != NULL) {
		xenevt_record(d, port);

		if (d->pending == false) {
			STAILQ_INSERT_TAIL(&devevent_pending, d, pendingq);
			d->pending = true;
			mutex_exit(&devevent_lock);
			softint_schedule(devevent_sih);
			return;
		}
	}
	mutex_exit(&devevent_lock);
}

void
xenevt_notify(void)
{
	struct xenevt_d *d;

	for (;;) {
		mutex_enter(&devevent_lock);
		d = STAILQ_FIRST(&devevent_pending);
		if (d == NULL) {
			mutex_exit(&devevent_lock);
			break;
		}
		STAILQ_REMOVE_HEAD(&devevent_pending, pendingq);
		d->pending = false;
		mutex_enter(&d->lock);
		if (d->flags & XENEVT_F_FREE) {
			xenevt_free(d); /* releases devevent_lock */
		} else {
			mutex_exit(&devevent_lock);
			selnotify(&d->sel, 0, 1);
			cv_broadcast(&d->cv);
			mutex_exit(&d->lock);
		}
	}
}

static void
xenevt_record(struct xenevt_d *d, evtchn_port_t port)
{

	/*
	 * This scheme declares the ring full while one slot is still
	 * free: that is how read/write pointers distinguish "full" from
	 * "empty" without a separate counter.  Not really an issue, and
	 * avoiding it would be more complex.
	 */

	mutex_enter(&d->lock);
	if (d->ring_read ==
	    ((d->ring_write + 1) & XENEVT_RING_MASK)) {
		d->flags |= XENEVT_F_OVERFLOW;
		printf("%s: ring overflow port %d\n", __func__, port);
	} else {
		d->ring[d->ring_write] = port;
		d->ring_write = (d->ring_write + 1) & XENEVT_RING_MASK;
	}
	mutex_exit(&d->lock);
}
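
/*
 * The ring arithmetic above relies on XENEVT_RING_SIZE being a power
 * of two, so that masking is a cheap modulo.  A sketch of the
 * invariants with local names (not compiled, not driver API):
 */
#if 0
#define DEMO_RING_SIZE	2048u
#define DEMO_RING_MASK	(DEMO_RING_SIZE - 1)

/* read == write means empty ... */
static int
demo_ring_empty(unsigned rd, unsigned wr)
{
	return rd == wr;
}

/* ... so "full" must leave one slot unused to stay distinguishable */
static int
demo_ring_full(unsigned rd, unsigned wr)
{
	return ((wr + 1) & DEMO_RING_MASK) == rd;
}

/* entries currently stored: 0 .. DEMO_RING_SIZE - 1 */
static unsigned
demo_ring_count(unsigned rd, unsigned wr)
{
	return (wr - rd) & DEMO_RING_MASK;
}
#endif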

/* open the xenevt device; this is where we clone */
int
xenevtopen(dev_t dev, int flags, int mode, struct lwp *l)
{
	struct xenevt_d *d;
	struct file *fp;
	int fd, error;

	switch (minor(dev)) {
	case DEV_EVT:
		/* fd_allocfile() will fill in the descriptor for us. */
		if ((error = fd_allocfile(&fp, &fd)) != 0)
			return error;

		d = kmem_zalloc(sizeof(*d), KM_SLEEP);
		d->ci = xenevt_ih->ih_cpu;
		mutex_init(&d->lock, MUTEX_DEFAULT, IPL_HIGH);
		cv_init(&d->cv, "xenevt");
		selinit(&d->sel);
		return fd_clone(fp, fd, flags, &xenevt_fileops, d);
	case DEV_XSD:
		/* no clone for /dev/xsd_kva */
		return (0);
	default:
		break;
	}
	return ENODEV;
}

/* read from the device: only for /dev/xsd_kva; xenevt goes through fread */
int
xenevtread(dev_t dev, struct uio *uio, int flags)
{
#define LD_STRLEN 21 /* a 64bit integer needs 20 digits in base10 */
	if (minor(dev) == DEV_XSD) {
		char strbuf[LD_STRLEN], *bf;
		int off, error;
		size_t len;

		off = (int)uio->uio_offset;
		if (off < 0)
			return EINVAL;
		len = snprintf(strbuf, sizeof(strbuf), "%lu\n",
		    xen_start_info.store_mfn);
		if (off >= len) {
			bf = strbuf;
			len = 0;
		} else {
			bf = &strbuf[off];
			len -= off;
		}
		error = uiomove(bf, len, uio);
		return error;
	}
	return ENODEV;
}

/* mmap: only for xsd_kva */
paddr_t
xenevtmmap(dev_t dev, off_t off, int prot)
{
	if (minor(dev) == DEV_XSD) {
		/* only one page, so off is always 0 */
		if (off != 0)
			return -1;
#ifdef XENPV
		return x86_btop(
		    xpmap_mtop((paddr_t)xen_start_info.store_mfn << PAGE_SHIFT));
#else
		return x86_btop(
		    (paddr_t)xen_start_info.store_mfn << PAGE_SHIFT);
#endif
	}
	return -1;
}

static void
xenevt_free(struct xenevt_d *d)
{
	int i;
	KASSERT(mutex_owned(&devevent_lock));
	KASSERT(mutex_owned(&d->lock));

	for (i = 0; i < NR_EVENT_CHANNELS; i++) {
		if (devevent[i] == d) {
			evtchn_op_t op = { .cmd = 0 };
			int error;

			hypervisor_mask_event(i);
			xen_atomic_clear_bit(&d->ci->ci_evtmask[0], i);
			devevent[i] = NULL;
			op.cmd = EVTCHNOP_close;
			op.u.close.port = i;
			if ((error = HYPERVISOR_event_channel_op(&op))) {
				printf("xenevt_fclose: error %d from "
				    "hypervisor\n", -error);
			}
		}
	}
	mutex_exit(&d->lock);
	mutex_exit(&devevent_lock);
	seldestroy(&d->sel);
	cv_destroy(&d->cv);
	mutex_destroy(&d->lock);
	kmem_free(d, sizeof(*d));
}

static int
xenevt_fclose(struct file *fp)
{
	struct xenevt_d *d = fp->f_data;

	mutex_enter(&devevent_lock);
	mutex_enter(&d->lock);
	if (d->pending) {
		d->flags |= XENEVT_F_FREE;
		mutex_exit(&d->lock);
		mutex_exit(&devevent_lock);
	} else {
		xenevt_free(d); /* releases devevent_lock */
	}

	fp->f_data = NULL;
	return (0);
}

static int
xenevt_fread(struct file *fp, off_t *offp, struct uio *uio,
    kauth_cred_t cred, int flags)
{
	struct xenevt_d *d = fp->f_data;
	int error, ring_read, ring_write;
	size_t len, uio_len;

	error = 0;
	mutex_enter(&d->lock);
	while (error == 0) {
		ring_read = d->ring_read;
		ring_write = d->ring_write;
		if (ring_read != ring_write) {
			break;
		}
		if (d->flags & XENEVT_F_OVERFLOW) {
			break;
		}

		/* nothing to read */
		if ((fp->f_flag & FNONBLOCK) == 0) {
			error = cv_wait_sig(&d->cv, &d->lock);
		} else {
			error = EAGAIN;
		}
	}
	if (error == 0 && (d->flags & XENEVT_F_OVERFLOW)) {
		error = EFBIG;
	}
	mutex_exit(&d->lock);

	if (error) {
		return error;
	}

	uio_len = uio->uio_resid / BYTES_PER_PORT;
	if (ring_read <= ring_write)
		len = ring_write - ring_read;
	else
		len = XENEVT_RING_SIZE - ring_read;
	if (len > uio_len)
		len = uio_len;
	error = uiomove(&d->ring[ring_read], len * BYTES_PER_PORT, uio);
	if (error)
		return error;
	ring_read = (ring_read + len) & XENEVT_RING_MASK;
	uio_len = uio->uio_resid / BYTES_PER_PORT;
	if (uio_len == 0)
		goto done;
	/* ring wrapped, read the second part */
	len = ring_write - ring_read;
	if (len > uio_len)
		len = uio_len;
	error = uiomove(&d->ring[ring_read], len * BYTES_PER_PORT, uio);
	if (error)
		return error;
	ring_read = (ring_read + len) & XENEVT_RING_MASK;

done:
	mutex_enter(&d->lock);
	d->ring_read = ring_read;
	mutex_exit(&d->lock);

	return 0;
}
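
/*
 * xenevt_fread() above drains the ring with at most two uiomove()
 * calls because a wrapped ring is two contiguous segments in memory.
 * A worked sketch with local names (not compiled, not driver API):
 */
#if 0
#define DEMO_RING_SIZE 2048u

static void
demo_ring_segments(unsigned rd, unsigned wr, unsigned *seg1,
    unsigned *seg2)
{
	if (rd <= wr) {
		*seg1 = wr - rd;		/* no wrap: one segment */
		*seg2 = 0;
	} else {
		*seg1 = DEMO_RING_SIZE - rd;	/* tail of the buffer... */
		*seg2 = wr;			/* ...then the wrapped head */
	}
}
/* e.g. rd = 2046, wr = 2 yields two segments of 2 entries each */
#endif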

static int
xenevt_fwrite(struct file *fp, off_t *offp, struct uio *uio,
    kauth_cred_t cred, int flags)
{
	struct xenevt_d *d = fp->f_data;
	uint16_t *chans;
	int i, nentries, error;

	if (uio->uio_resid == 0)
		return (0);
	nentries = uio->uio_resid / sizeof(uint16_t);
	if (nentries >= NR_EVENT_CHANNELS)
		return EMSGSIZE;
	chans = kmem_alloc(nentries * sizeof(uint16_t), KM_SLEEP);
	/* copy in whole entries only, in case uio_resid is odd */
	error = uiomove(chans, nentries * sizeof(uint16_t), uio);
	if (error)
		goto out;
	mutex_enter(&devevent_lock);
	for (i = 0; i < nentries; i++) {
		if (chans[i] < NR_EVENT_CHANNELS &&
		    devevent[chans[i]] == d) {
			hypervisor_unmask_event(chans[i]);
		}
	}
	mutex_exit(&devevent_lock);
out:
	kmem_free(chans, nentries * sizeof(uint16_t));
	return error;
}

static int
xenevt_fioctl(struct file *fp, u_long cmd, void *addr)
{
	struct xenevt_d *d = fp->f_data;
	evtchn_op_t op = { .cmd = 0 };
	int error;

	switch (cmd) {
	case EVTCHN_RESET:
	case IOCTL_EVTCHN_RESET:
		mutex_enter(&d->lock);
		d->ring_read = d->ring_write = 0;
		d->flags = 0;
		mutex_exit(&d->lock);
		break;
	case IOCTL_EVTCHN_BIND_VIRQ:
	{
		struct ioctl_evtchn_bind_virq *bind_virq = addr;
		op.cmd = EVTCHNOP_bind_virq;
		op.u.bind_virq.virq = bind_virq->virq;
		op.u.bind_virq.vcpu = 0;
		if ((error = HYPERVISOR_event_channel_op(&op))) {
			printf("IOCTL_EVTCHN_BIND_VIRQ failed: virq %d "
			    "error %d\n", bind_virq->virq, error);
			return -error;
		}
		bind_virq->port = op.u.bind_virq.port;
		mutex_enter(&devevent_lock);
		KASSERT(devevent[bind_virq->port] == NULL);
		devevent[bind_virq->port] = d;
		mutex_exit(&devevent_lock);
		xen_atomic_set_bit(&d->ci->ci_evtmask[0], bind_virq->port);
		hypervisor_unmask_event(bind_virq->port);
		break;
	}
	case IOCTL_EVTCHN_BIND_INTERDOMAIN:
	{
		struct ioctl_evtchn_bind_interdomain *bind_intd = addr;
		op.cmd = EVTCHNOP_bind_interdomain;
		op.u.bind_interdomain.remote_dom = bind_intd->remote_domain;
		op.u.bind_interdomain.remote_port = bind_intd->remote_port;
		if ((error = HYPERVISOR_event_channel_op(&op)))
			return -error;
		bind_intd->port = op.u.bind_interdomain.local_port;
		mutex_enter(&devevent_lock);
		KASSERT(devevent[bind_intd->port] == NULL);
		devevent[bind_intd->port] = d;
		mutex_exit(&devevent_lock);
		xen_atomic_set_bit(&d->ci->ci_evtmask[0], bind_intd->port);
		hypervisor_unmask_event(bind_intd->port);
		break;
	}
	case IOCTL_EVTCHN_BIND_UNBOUND_PORT:
	{
		struct ioctl_evtchn_bind_unbound_port *bind_unbound = addr;
		op.cmd = EVTCHNOP_alloc_unbound;
		op.u.alloc_unbound.dom = DOMID_SELF;
		op.u.alloc_unbound.remote_dom = bind_unbound->remote_domain;
		if ((error = HYPERVISOR_event_channel_op(&op)))
			return -error;
		bind_unbound->port = op.u.alloc_unbound.port;
		mutex_enter(&devevent_lock);
		KASSERT(devevent[bind_unbound->port] == NULL);
		devevent[bind_unbound->port] = d;
		mutex_exit(&devevent_lock);
		xen_atomic_set_bit(&d->ci->ci_evtmask[0], bind_unbound->port);
		hypervisor_unmask_event(bind_unbound->port);
		break;
	}
	case IOCTL_EVTCHN_UNBIND:
	{
		struct ioctl_evtchn_unbind *unbind = addr;

		if (unbind->port >= NR_EVENT_CHANNELS)
			return EINVAL;
		mutex_enter(&devevent_lock);
		if (devevent[unbind->port] != d) {
			mutex_exit(&devevent_lock);
			return ENOTCONN;
		}
		devevent[unbind->port] = NULL;
		mutex_exit(&devevent_lock);
		hypervisor_mask_event(unbind->port);
		xen_atomic_clear_bit(&d->ci->ci_evtmask[0], unbind->port);
		op.cmd = EVTCHNOP_close;
		op.u.close.port = unbind->port;
		if ((error = HYPERVISOR_event_channel_op(&op)))
			return -error;
		break;
	}
	case IOCTL_EVTCHN_NOTIFY:
	{
		struct ioctl_evtchn_notify *notify = addr;

		if (notify->port >= NR_EVENT_CHANNELS)
			return EINVAL;
		mutex_enter(&devevent_lock);
		if (devevent[notify->port] != d) {
			mutex_exit(&devevent_lock);
			return ENOTCONN;
		}
		hypervisor_notify_via_evtchn(notify->port);
		mutex_exit(&devevent_lock);
		break;
	}
	case FIONBIO:
		break;
	default:
		return EINVAL;
	}
	return 0;
}
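
/*
 * Hypothetical userland fragment (not compiled) for the NOTIFY and
 * UNBIND ioctls handled above; fd is assumed to be an open
 * /dev/xenevt descriptor and port a value returned by one of the
 * bind ioctls.
 */
#if 0
#include <sys/ioctl.h>
#include <xen/xenio.h>
#include <err.h>

static void
kick_and_close(int fd, unsigned int port)
{
	struct ioctl_evtchn_notify notify = { .port = port };
	struct ioctl_evtchn_unbind unbind = { .port = port };

	if (ioctl(fd, IOCTL_EVTCHN_NOTIFY, &notify) == -1)
		err(1, "IOCTL_EVTCHN_NOTIFY");	/* kick the remote end */
	if (ioctl(fd, IOCTL_EVTCHN_UNBIND, &unbind) == -1)
		err(1, "IOCTL_EVTCHN_UNBIND");	/* close the channel */
}
#endif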

/*
 * Support for the poll() system call.
 *
 * Return true if the specified operation will not block indefinitely.
 */
static int
xenevt_fpoll(struct file *fp, int events)
{
	struct xenevt_d *d = fp->f_data;
	int revents = events & (POLLOUT | POLLWRNORM); /* we can always write */

	mutex_enter(&d->lock);
	if (events & (POLLIN | POLLRDNORM)) {
		if (d->ring_read != d->ring_write) {
			revents |= events & (POLLIN | POLLRDNORM);
		} else {
			/* record that someone is waiting */
			selrecord(curlwp, &d->sel);
		}
	}
	mutex_exit(&d->lock);
	return (revents);
}
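
/*
 * Hypothetical userland event loop (not compiled) matching the poll
 * semantics above: writes never block, and POLLIN fires once the
 * ring is non-empty.
 */
#if 0
#include <poll.h>
#include <stdint.h>
#include <unistd.h>

static void
event_loop(int fd)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN };
	uint32_t ports[32];
	uint16_t chan;
	ssize_t n, i;

	for (;;) {
		if (poll(&pfd, 1, -1) == -1)
			break;
		if ((pfd.revents & POLLIN) == 0)
			continue;
		if ((n = read(fd, ports, sizeof(ports))) <= 0)
			break;
		for (i = 0; i < n / (ssize_t)sizeof(ports[0]); i++) {
			chan = (uint16_t)ports[i];
			(void)write(fd, &chan, sizeof(chan));	/* ack */
		}
	}
}
#endif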