Home | History | Annotate | Line # | Download | only in librumphijack
hijack.c revision 1.7
      1 /*      $NetBSD: hijack.c,v 1.7 2011/01/09 19:56:33 pooka Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2011 Antti Kantee.  All Rights Reserved.
      5  *
      6  * Redistribution and use in source and binary forms, with or without
      7  * modification, are permitted provided that the following conditions
      8  * are met:
      9  * 1. Redistributions of source code must retain the above copyright
     10  *    notice, this list of conditions and the following disclaimer.
     11  * 2. Redistributions in binary form must reproduce the above copyright
     12  *    notice, this list of conditions and the following disclaimer in the
     13  *    documentation and/or other materials provided with the distribution.
     14  *
     15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
     16  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     17  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     18  * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
     19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
     21  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     25  * SUCH DAMAGE.
     26  */
     27 
     28 #include <sys/cdefs.h>
     29 __RCSID("$NetBSD: hijack.c,v 1.7 2011/01/09 19:56:33 pooka Exp $");
     30 
     31 #include <sys/param.h>
     32 #include <sys/types.h>
     33 #include <sys/ioctl.h>
     34 #include <sys/socket.h>
     35 #include <sys/poll.h>
     36 
     37 #include <rump/rump.h>
     38 #include <rump/rumpclient.h>
     39 #include <rump/rump_syscalls.h>
     40 
     41 #include <assert.h>
     42 #include <dlfcn.h>
     43 #include <err.h>
     44 #include <errno.h>
     45 #include <fcntl.h>
     46 #include <poll.h>
     47 #include <pthread.h>
     48 #include <signal.h>
     49 #include <stdarg.h>
     50 #include <stdio.h>
     51 #include <stdlib.h>
     52 #include <time.h>
     53 #include <unistd.h>
     54 
/*
 * Indices into rumpcalls[]: one slot for every system call that can be
 * forwarded to the rump kernel.  RUMPCALL__NUM is the slot count.
 */
enum {
	RUMPCALL_SOCKET,
	RUMPCALL_ACCEPT,
	RUMPCALL_BIND,
	RUMPCALL_CONNECT,
	RUMPCALL_GETPEERNAME,
	RUMPCALL_GETSOCKNAME,
	RUMPCALL_LISTEN,
	RUMPCALL_RECVFROM,
	RUMPCALL_RECVMSG,
	RUMPCALL_SENDTO,
	RUMPCALL_SENDMSG,
	RUMPCALL_GETSOCKOPT,
	RUMPCALL_SETSOCKOPT,
	RUMPCALL_SHUTDOWN,
	RUMPCALL_READ,
	RUMPCALL_READV,
	RUMPCALL_WRITE,
	RUMPCALL_WRITEV,
	RUMPCALL_IOCTL,
	RUMPCALL_FCNTL,
	RUMPCALL_CLOSE,
	RUMPCALL_POLLTS,
	RUMPCALL__NUM
};

/*
 * Symbol name of each call, keyed by its RUMPCALL_* index.  The
 * constructor prefixes every entry with "rump_sys_" before looking it
 * up, so each slot above needs an entry here.
 */
const char *sysnames[] = {
	[RUMPCALL_SOCKET]	= "__socket30",
	[RUMPCALL_ACCEPT]	= "accept",
	[RUMPCALL_BIND]		= "bind",
	[RUMPCALL_CONNECT]	= "connect",
	[RUMPCALL_GETPEERNAME]	= "getpeername",
	[RUMPCALL_GETSOCKNAME]	= "getsockname",
	[RUMPCALL_LISTEN]	= "listen",
	[RUMPCALL_RECVFROM]	= "recvfrom",
	[RUMPCALL_RECVMSG]	= "recvmsg",
	[RUMPCALL_SENDTO]	= "sendto",
	[RUMPCALL_SENDMSG]	= "sendmsg",
	[RUMPCALL_GETSOCKOPT]	= "getsockopt",
	[RUMPCALL_SETSOCKOPT]	= "setsockopt",
	[RUMPCALL_SHUTDOWN]	= "shutdown",
	[RUMPCALL_READ]		= "read",
	[RUMPCALL_READV]	= "readv",
	[RUMPCALL_WRITE]	= "write",
	[RUMPCALL_WRITEV]	= "writev",
	[RUMPCALL_IOCTL]	= "ioctl",
	[RUMPCALL_FCNTL]	= "fcntl",
	[RUMPCALL_CLOSE]	= "close",
	[RUMPCALL_POLLTS]	= "__pollts50",
};
     93 
/*
 * Host (non-rump) implementations of the interposed calls.  All of
 * these start out NULL and are resolved with dlsym(RTLD_NEXT, ...)
 * by the constructor.
 */

/* socket calls */
static int	(*host_socket)(int domain, int type, int protocol);
static int	(*host_connect)(int s, const struct sockaddr *name,
				socklen_t namelen);
static int	(*host_bind)(int s, const struct sockaddr *name,
			     socklen_t namelen);
static int	(*host_listen)(int s, int backlog);
static int	(*host_accept)(int s, struct sockaddr *addr,
			       socklen_t *addrlen);
static int	(*host_getpeername)(int s, struct sockaddr *name,
				    socklen_t *namelen);
static int	(*host_getsockname)(int s, struct sockaddr *name,
				    socklen_t *namelen);
static int	(*host_setsockopt)(int s, int level, int optname,
				   const void *optval, socklen_t optlen);

/* generic descriptor calls */
static ssize_t	(*host_read)(int fd, void *buf, size_t len);
static ssize_t	(*host_readv)(int fd, const struct iovec *iov, int iovcnt);
static ssize_t	(*host_write)(int fd, const void *buf, size_t len);
static ssize_t	(*host_writev)(int fd, const struct iovec *iov, int iovcnt);
static int	(*host_ioctl)(int fd, unsigned long cmd, ...);
static int	(*host_fcntl)(int fd, int cmd, ...);
static int	(*host_close)(int fd);
static int	(*host_pollts)(struct pollfd *fds, nfds_t nfds,
			       const struct timespec *ts,
			       const sigset_t *sigmask);

/* process and descriptor-table calls */
static pid_t	(*host_fork)(void);
static int	(*host_dup2)(int oldd, int newd);
    114 
/*
 * Entry points of the rump kernel system calls, indexed by RUMPCALL_*.
 * Filled in by rcinit() through dlsym(); each wrapper assigns the slot
 * it needs to a suitably typed function pointer before calling it.
 */
static void *rumpcalls[RUMPCALL__NUM];
    116 
/*
 * dlsym() trampoline handed to librumpclient for the LD_PRELOAD case.
 * Doing the lookup from inside this library ensures that RTLD_NEXT is
 * resolved correctly.
 *
 * handle: handle to pass to dlsym()
 * symbol: name of the symbol to look up
 * Returns whatever dlsym() returns for the pair.
 */
static void *
hijackdlsym(void *handle, const char *symbol)
{
	void *addr;

	addr = dlsym(handle, symbol);
	return addr;
}
    127 
/*
 * low calorie sockets?
 *
 * When true, __socket30() creates AF_LOCAL sockets in the host kernel
 * instead of the rump kernel.  Nothing in the visible code ever sets
 * this to true.
 */
static bool hostlocalsockets = false;
    130 
    131 static void __attribute__((constructor))
    132 rcinit(void)
    133 {
    134 	int (*rumpcinit)(void);
    135 	void **rumpcdlsym;
    136 	void *hand;
    137 	int i;
    138 
    139 	hand = dlopen("librumpclient.so", RTLD_LAZY|RTLD_GLOBAL);
    140 	if (!hand)
    141 		err(1, "cannot open librumpclient.so");
    142 	rumpcinit = dlsym(hand, "rumpclient_init");
    143 	_DIAGASSERT(rumpcinit);
    144 
    145 	rumpcdlsym = dlsym(hand, "rumpclient_dlsym");
    146 	*rumpcdlsym = hijackdlsym;
    147 
    148 	host_socket = dlsym(RTLD_NEXT, "__socket30");
    149 	host_listen = dlsym(RTLD_NEXT, "listen");
    150 	host_connect = dlsym(RTLD_NEXT, "connect");
    151 	host_bind = dlsym(RTLD_NEXT, "bind");
    152 	host_accept = dlsym(RTLD_NEXT, "accept");
    153 	host_getpeername = dlsym(RTLD_NEXT, "getpeername");
    154 	host_getsockname = dlsym(RTLD_NEXT, "getsockname");
    155 	host_setsockopt = dlsym(RTLD_NEXT, "setsockopt");
    156 
    157 	host_read = dlsym(RTLD_NEXT, "read");
    158 	host_readv = dlsym(RTLD_NEXT, "readv");
    159 	host_write = dlsym(RTLD_NEXT, "write");
    160 	host_writev = dlsym(RTLD_NEXT, "writev");
    161 	host_ioctl = dlsym(RTLD_NEXT, "ioctl");
    162 	host_fcntl = dlsym(RTLD_NEXT, "fcntl");
    163 	host_close = dlsym(RTLD_NEXT, "close");
    164 	host_pollts = dlsym(RTLD_NEXT, "pollts");
    165 	host_fork = dlsym(RTLD_NEXT, "fork");
    166 	host_dup2 = dlsym(RTLD_NEXT, "dup2");
    167 
    168 	for (i = 0; i < RUMPCALL__NUM; i++) {
    169 		char sysname[128];
    170 
    171 		snprintf(sysname, sizeof(sysname), "rump_sys_%s", sysnames[i]);
    172 		rumpcalls[i] = dlsym(hand, sysname);
    173 		if (!rumpcalls[i]) {
    174 			fprintf(stderr, "cannot find symbol: %s\n", sysname);
    175 			exit(1);
    176 		}
    177 	}
    178 
    179 	if (rumpcinit() == -1)
    180 		err(1, "rumpclient init");
    181 }
    182 
/*
 * Record of the low descriptors that dup2() has pointed at a rump
 * kernel file.  dup2() ORs newd+1 into the mask, and ISDUP2D(fd) is
 * true when every bit of fd+1 is present in the mask.
 *
 * NOTE(review): this is not a one-bit-per-descriptor encoding.  For
 * fd == -1 the test is always true (fd+1 is 0), and registering fd 2
 * (value 3) also makes fds 0 and 1 test true.  Fixing the encoding
 * requires changing dup2() in the same commit.
 *
 * The macro argument is fully parenthesized so that expression
 * arguments such as ISDUP2D(a & b) are evaluated as intended.
 */
static unsigned dup2mask;
#define ISDUP2D(fd) ((((fd)+1) & dup2mask) == ((fd)+1))
    185 
/*
 * Debug tracing.  With DEBUGJACK defined, DPRINTF((fmt, ...)) prints
 * to stderr through mydprintf(); otherwise it expands to nothing.
 * Note the double parentheses required at the call site.
 */
//#define DEBUGJACK
#ifndef DEBUGJACK
#define DPRINTF(x)
#else
#define DPRINTF(x) mydprintf x
/* printf-style trace output; silent once stderr tests as dup2()'d */
static void
mydprintf(const char *fmt, ...)
{
	va_list args;

	if (!ISDUP2D(STDERR_FILENO)) {
		va_start(args, fmt);
		vfprintf(stderr, fmt, args);
		va_end(args);
	}
}

#endif
    205 
/* XXX: need runtime selection.  low for now due to FD_SETSIZE */
/*
 * HIJACK_FDOFF: offset between a rump kernel descriptor and the value
 *	the application sees; application descriptors at or above it
 *	are treated as rump kernel descriptors.
 * HIJACK_SELECT: not referenced in the visible code.
 * HIJACK_ASSERT: lower bound accepted by assertfd().
 */
#define HIJACK_FDOFF 128
#define HIJACK_SELECT 128 /* XXX */
#define HIJACK_ASSERT 128 /* XXX */
    210 static int
    211 fd_rump2host(int fd)
    212 {
    213 
    214 	if (fd == -1)
    215 		return fd;
    216 
    217 	if (!ISDUP2D(fd))
    218 		fd += HIJACK_FDOFF;
    219 
    220 	return fd;
    221 }
    222 
    223 static int
    224 fd_host2rump(int fd)
    225 {
    226 
    227 	if (!ISDUP2D(fd))
    228 		fd -= HIJACK_FDOFF;
    229 	return fd;
    230 }
    231 
    232 static bool
    233 fd_isrump(int fd)
    234 {
    235 
    236 	return ISDUP2D(fd) || fd >= HIJACK_FDOFF;
    237 }
    238 
/* assert() that _fd_ tests as dup2()'d or is at least HIJACK_ASSERT */
#define assertfd(_fd_) assert(ISDUP2D(_fd_) || (_fd_) >= HIJACK_ASSERT)
/* from here on the offset is reachable only through the fd_* helpers */
#undef HIJACK_FDOFF
    241 
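/*
 * Socket call wrappers.  Calls on descriptors that belong to the rump
 * kernel are forwarded there.  Some wrappers fall back to the host
 * implementation for other descriptors; the remaining ones assert that
 * they were given a rump kernel descriptor.
 */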
    245 
    246 int __socket30(int, int, int);
    247 int
    248 __socket30(int domain, int type, int protocol)
    249 {
    250 	int (*rc_socket)(int, int, int);
    251 	int fd;
    252 	bool dohost;
    253 
    254 	dohost = hostlocalsockets && (domain == AF_LOCAL);
    255 
    256 	if (dohost)
    257 		rc_socket = host_socket;
    258 	else
    259 		rc_socket = rumpcalls[RUMPCALL_SOCKET];
    260 	fd = rc_socket(domain, type, protocol);
    261 
    262 	if (!dohost)
    263 		fd = fd_rump2host(fd);
    264 	DPRINTF(("socket <- %d\n", fd));
    265 
    266 	return fd;
    267 }
    268 
    269 int
    270 accept(int s, struct sockaddr *addr, socklen_t *addrlen)
    271 {
    272 	int (*rc_accept)(int, struct sockaddr *, socklen_t *);
    273 	int fd;
    274 	bool isrump;
    275 
    276 	isrump = fd_isrump(s);
    277 
    278 	DPRINTF(("accept -> %d", s));
    279 	if (isrump) {
    280 		rc_accept = rumpcalls[RUMPCALL_ACCEPT];
    281 		s = fd_host2rump(s);
    282 	} else {
    283 		rc_accept = host_accept;
    284 	}
    285 	fd = rc_accept(s, addr, addrlen);
    286 	if (fd != -1 && isrump)
    287 		fd = fd_rump2host(fd);
    288 
    289 	DPRINTF((" <- %d\n", fd));
    290 
    291 	return fd;
    292 }
    293 
    294 int
    295 bind(int s, const struct sockaddr *name, socklen_t namelen)
    296 {
    297 	int (*rc_bind)(int, const struct sockaddr *, socklen_t);
    298 
    299 	DPRINTF(("bind -> %d\n", s));
    300 	if (fd_isrump(s)) {
    301 		rc_bind = rumpcalls[RUMPCALL_BIND];
    302 		s = fd_host2rump(s);
    303 	} else {
    304 		rc_bind = host_bind;
    305 	}
    306 	return rc_bind(s, name, namelen);
    307 }
    308 
    309 int
    310 connect(int s, const struct sockaddr *name, socklen_t namelen)
    311 {
    312 	int (*rc_connect)(int, const struct sockaddr *, socklen_t);
    313 
    314 	DPRINTF(("connect -> %d\n", s));
    315 	if (fd_isrump(s)) {
    316 		rc_connect = rumpcalls[RUMPCALL_CONNECT];
    317 		s = fd_host2rump(s);
    318 	} else {
    319 		rc_connect = host_connect;
    320 	}
    321 
    322 	return rc_connect(s, name, namelen);
    323 }
    324 
    325 int
    326 getpeername(int s, struct sockaddr *name, socklen_t *namelen)
    327 {
    328 	int (*rc_getpeername)(int, struct sockaddr *, socklen_t *);
    329 
    330 	DPRINTF(("getpeername -> %d\n", s));
    331 	if (fd_isrump(s)) {
    332 		rc_getpeername = rumpcalls[RUMPCALL_GETPEERNAME];
    333 		s = fd_host2rump(s);
    334 	} else {
    335 		rc_getpeername = host_getpeername;
    336 	}
    337 	return rc_getpeername(s, name, namelen);
    338 }
    339 
    340 int
    341 getsockname(int s, struct sockaddr *name, socklen_t *namelen)
    342 {
    343 	int (*rc_getsockname)(int, struct sockaddr *, socklen_t *);
    344 
    345 	DPRINTF(("getsockname -> %d\n", s));
    346 	if (fd_isrump(s)) {
    347 		rc_getsockname = rumpcalls[RUMPCALL_GETSOCKNAME];
    348 		s = fd_host2rump(s);
    349 	} else {
    350 		rc_getsockname = host_getsockname;
    351 	}
    352 	return rc_getsockname(s, name, namelen);
    353 }
    354 
    355 int
    356 listen(int s, int backlog)
    357 {
    358 	int (*rc_listen)(int, int);
    359 
    360 	DPRINTF(("listen -> %d\n", s));
    361 	if (fd_isrump(s)) {
    362 		rc_listen = rumpcalls[RUMPCALL_LISTEN];
    363 		s = fd_host2rump(s);
    364 	} else {
    365 		rc_listen = host_listen;
    366 	}
    367 	return rc_listen(s, backlog);
    368 }
    369 
/*
 * recv(2) wrapper, expressed as recvfrom() without a source address.
 */
ssize_t
recv(int s, void *buf, size_t len, int flags)
{
	ssize_t nrecv;

	nrecv = recvfrom(s, buf, len, flags, NULL, NULL);
	return nrecv;
}
    376 
    377 ssize_t
    378 recvfrom(int s, void *buf, size_t len, int flags, struct sockaddr *from,
    379 	socklen_t *fromlen)
    380 {
    381 	int (*rc_recvfrom)(int, void *, size_t, int,
    382 	    struct sockaddr *, socklen_t *);
    383 
    384 	DPRINTF(("recvfrom\n"));
    385 	assertfd(s);
    386 	rc_recvfrom = rumpcalls[RUMPCALL_RECVFROM];
    387 	return rc_recvfrom(fd_host2rump(s), buf, len, flags, from, fromlen);
    388 }
    389 
    390 ssize_t
    391 recvmsg(int s, struct msghdr *msg, int flags)
    392 {
    393 	int (*rc_recvmsg)(int, struct msghdr *, int);
    394 
    395 	DPRINTF(("recvmsg\n"));
    396 	assertfd(s);
    397 	rc_recvmsg = rumpcalls[RUMPCALL_RECVMSG];
    398 	return rc_recvmsg(fd_host2rump(s), msg, flags);
    399 }
    400 
/*
 * send(2) wrapper, expressed as sendto() without a destination
 * address.
 */
ssize_t
send(int s, const void *buf, size_t len, int flags)
{
	ssize_t nsent;

	nsent = sendto(s, buf, len, flags, NULL, 0);
	return nsent;
}
    407 
    408 ssize_t
    409 sendto(int s, const void *buf, size_t len, int flags,
    410 	const struct sockaddr *to, socklen_t tolen)
    411 {
    412 	int (*rc_sendto)(int, const void *, size_t, int,
    413 	    const struct sockaddr *, socklen_t);
    414 
    415 	if (s == -1)
    416 		return len;
    417 
    418 	DPRINTF(("sendto\n"));
    419 	assertfd(s);
    420 	rc_sendto = rumpcalls[RUMPCALL_SENDTO];
    421 	return rc_sendto(fd_host2rump(s), buf, len, flags, to, tolen);
    422 }
    423 
    424 ssize_t
    425 sendmsg(int s, const struct msghdr *msg, int flags)
    426 {
    427 	int (*rc_sendmsg)(int, const struct msghdr *, int);
    428 
    429 	DPRINTF(("sendmsg\n"));
    430 	assertfd(s);
    431 	rc_sendmsg = rumpcalls[RUMPCALL_SENDTO];
    432 	return rc_sendmsg(fd_host2rump(s), msg, flags);
    433 }
    434 
    435 int
    436 getsockopt(int s, int level, int optname, void *optval, socklen_t *optlen)
    437 {
    438 	int (*rc_getsockopt)(int, int, int, void *, socklen_t *);
    439 
    440 	DPRINTF(("getsockopt -> %d\n", s));
    441 	assertfd(s);
    442 	rc_getsockopt = rumpcalls[RUMPCALL_GETSOCKOPT];
    443 	return rc_getsockopt(fd_host2rump(s), level, optname, optval, optlen);
    444 }
    445 
    446 int
    447 setsockopt(int s, int level, int optname, const void *optval, socklen_t optlen)
    448 {
    449 	int (*rc_setsockopt)(int, int, int, const void *, socklen_t);
    450 
    451 	DPRINTF(("setsockopt -> %d\n", s));
    452 	if (fd_isrump(s)) {
    453 		rc_setsockopt = rumpcalls[RUMPCALL_SETSOCKOPT];
    454 		s = fd_host2rump(s);
    455 	} else {
    456 		rc_setsockopt = host_setsockopt;
    457 	}
    458 	return rc_setsockopt(s, level, optname, optval, optlen);
    459 }
    460 
    461 int
    462 shutdown(int s, int how)
    463 {
    464 	int (*rc_shutdown)(int, int);
    465 
    466 	DPRINTF(("shutdown -> %d\n", s));
    467 	assertfd(s);
    468 	rc_shutdown = rumpcalls[RUMPCALL_SHUTDOWN];
    469 	return rc_shutdown(fd_host2rump(s), how);
    470 }
    471 
    472 /*
    473  * dup2 is special.  we allow dup2 of a rump kernel fd to 0-2 since
    474  * many programs do that.  dup2 of a rump kernel fd to another value
    475  * not >= fdoff is an error.
    476  *
    477  * Note: cannot rump2host newd, because it is often hardcoded.
    478  *
    479  * XXX: should disable debug prints after stdout/stderr are dup2'd
    480  */
    481 int
    482 dup2(int oldd, int newd)
    483 {
    484 	int rv;
    485 
    486 	DPRINTF(("dup2 -> %d (o) -> %d (n)\n", oldd, newd));
    487 
    488 	if (fd_isrump(oldd)) {
    489 		if (!(newd >= 0 && newd <= 2))
    490 			return EBADF;
    491 		oldd = fd_host2rump(oldd);
    492 		rv = rump_sys_dup2(oldd, newd);
    493 		if (rv != -1)
    494 			dup2mask |= newd+1;
    495 		return rv;
    496 	} else {
    497 		return host_dup2(oldd, newd);
    498 	}
    499 }
    500 
/*
 * fork(2) wrapper.  The host fork is bracketed by the rump client
 * fork calls so that the child keeps the file descriptors of the
 * forked parent while double use of the connection fd is prevented.
 *
 * Returns what the host fork returns, except that -1 is returned if
 * rumpclient_prefork() fails (no fork is attempted) and the child
 * sees -1 if rumpclient_fork_init() fails.
 */
pid_t
fork(void)
{
	struct rumpclient_fork *rf;
	pid_t rv;

	DPRINTF(("fork\n"));

	if ((rf = rumpclient_prefork()) == NULL)
		return -1;

	switch ((rv = host_fork())) {
	case -1:
		/* XXX: cancel rf */
		break;
	case 0:
		/* child: set up the rump client from the prefork state */
		if (rumpclient_fork_init(rf) == -1)
			rv = -1;
		break;
	default:
		/* parent: nothing further to do */
		break;
	}

	DPRINTF(("fork returns %d\n", rv));
	return rv;
}
    533 
    534 /*
    535  * Hybrids
    536  */
    537 
    538 ssize_t
    539 read(int fd, void *buf, size_t len)
    540 {
    541 	int (*op_read)(int, void *, size_t);
    542 	ssize_t n;
    543 
    544 	DPRINTF(("read %d\n", fd));
    545 	if (fd_isrump(fd)) {
    546 		fd = fd_host2rump(fd);
    547 		op_read = rumpcalls[RUMPCALL_READ];
    548 	} else {
    549 		op_read = host_read;
    550 	}
    551 
    552 	n = op_read(fd, buf, len);
    553 	return n;
    554 }
    555 
    556 ssize_t
    557 readv(int fd, const struct iovec *iov, int iovcnt)
    558 {
    559 	int (*op_readv)(int, const struct iovec *, int);
    560 
    561 	DPRINTF(("readv %d\n", fd));
    562 	if (fd_isrump(fd)) {
    563 		fd = fd_host2rump(fd);
    564 		op_readv = rumpcalls[RUMPCALL_READV];
    565 	} else {
    566 		op_readv = host_readv;
    567 	}
    568 
    569 	return op_readv(fd, iov, iovcnt);
    570 }
    571 
    572 ssize_t
    573 write(int fd, const void *buf, size_t len)
    574 {
    575 	int (*op_write)(int, const void *, size_t);
    576 
    577 	if (fd_isrump(fd)) {
    578 		fd = fd_host2rump(fd);
    579 		op_write = rumpcalls[RUMPCALL_WRITE];
    580 	} else {
    581 		op_write = host_write;
    582 	}
    583 
    584 	return op_write(fd, buf, len);
    585 }
    586 
    587 ssize_t
    588 writev(int fd, const struct iovec *iov, int iovcnt)
    589 {
    590 	int (*op_writev)(int, const struct iovec *, int);
    591 
    592 	DPRINTF(("writev %d\n", fd));
    593 	if (fd_isrump(fd)) {
    594 		fd = fd_host2rump(fd);
    595 		op_writev = rumpcalls[RUMPCALL_WRITEV];
    596 	} else {
    597 		op_writev = host_writev;
    598 	}
    599 
    600 	return op_writev(fd, iov, iovcnt);
    601 }
    602 
    603 int
    604 ioctl(int fd, unsigned long cmd, ...)
    605 {
    606 	int (*op_ioctl)(int, unsigned long cmd, ...);
    607 	va_list ap;
    608 	int rv;
    609 
    610 	DPRINTF(("ioctl\n"));
    611 	if (fd_isrump(fd)) {
    612 		fd = fd_host2rump(fd);
    613 		op_ioctl = rumpcalls[RUMPCALL_IOCTL];
    614 	} else {
    615 		op_ioctl = host_ioctl;
    616 	}
    617 
    618 	va_start(ap, cmd);
    619 	rv = op_ioctl(fd, cmd, va_arg(ap, void *));
    620 	va_end(ap);
    621 	return rv;
    622 }
    623 
    624 int
    625 fcntl(int fd, int cmd, ...)
    626 {
    627 	int (*op_fcntl)(int, int, ...);
    628 	va_list ap;
    629 	int rv;
    630 
    631 	DPRINTF(("fcntl\n"));
    632 	if (fd_isrump(fd)) {
    633 		fd = fd_host2rump(fd);
    634 		op_fcntl = rumpcalls[RUMPCALL_FCNTL];
    635 	} else {
    636 		op_fcntl = host_fcntl;
    637 	}
    638 
    639 	va_start(ap, cmd);
    640 	rv = op_fcntl(fd, cmd, va_arg(ap, void *));
    641 	va_end(ap);
    642 	return rv;
    643 }
    644 
    645 int
    646 close(int fd)
    647 {
    648 	int (*op_close)(int);
    649 
    650 	DPRINTF(("close %d\n", fd));
    651 	if (fd_isrump(fd)) {
    652 		fd = fd_host2rump(fd);
    653 		op_close = rumpcalls[RUMPCALL_CLOSE];
    654 	} else {
    655 		op_close = host_close;
    656 	}
    657 
    658 	return op_close(fd);
    659 }
    660 
/*
 * select(2) implemented on top of pollts().
 *
 * Every descriptor below nfds that is present in at least one of the
 * sets becomes one pollfd entry: readfds maps to POLLIN, writefds to
 * POLLOUT and exceptfds to POLLHUP|POLLERR.  After a successful
 * pollts() the sets are cleared and refilled from the revents of the
 * entries; on a timeout (0) this leaves all sets empty.  On error the
 * sets are left as the caller passed them.
 *
 * Returns the value returned by pollts(), i.e. the number of
 * descriptors with events rather than the number of bits set, or -1
 * with errno set (EINVAL for a negative nfds, or whatever malloc() or
 * pollts() reported).
 */
int
select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds,
	struct timeval *timeout)
{
	struct pollfd *pfds = NULL;
	struct timespec ts, *tsp = NULL;
	nfds_t j, realnfds;
	int i, rv, sverrno;
	short events;

	DPRINTF(("select\n"));

	/* a negative count must not be compared as an unsigned value */
	if (nfds < 0) {
		errno = EINVAL;
		return -1;
	}

	/*
	 * Well, first we must scan the fds to figure out how many
	 * fds there really are.  This is because up to and including
	 * nb5 poll() silently refuses nfds > process_open_fds.
	 * Seems to be fixed in current, thank the maker.
	 */
	realnfds = 0;
	for (i = 0; i < nfds; i++) {
		if ((readfds && FD_ISSET(i, readfds)) ||
		    (writefds && FD_ISSET(i, writefds)) ||
		    (exceptfds && FD_ISSET(i, exceptfds)))
			realnfds++;
	}

	if (realnfds) {
		pfds = malloc(sizeof(*pfds) * realnfds);
		if (!pfds)
			return -1;
	}

	/*
	 * Fill one entry per selected descriptor.  An entry is touched
	 * only once the descriptor is known to be selected, so nothing
	 * is written past the realnfds entries allocated above (or
	 * through a NULL vector when nothing is selected).
	 */
	for (i = 0, j = 0; i < nfds; i++) {
		events = 0;
		if (readfds && FD_ISSET(i, readfds))
			events |= POLLIN;
		if (writefds && FD_ISSET(i, writefds))
			events |= POLLOUT;
		if (exceptfds && FD_ISSET(i, exceptfds))
			events |= POLLHUP|POLLERR;
		if (events == 0)
			continue;

		pfds[j].fd = i;
		pfds[j].events = events;
		pfds[j].revents = 0;
		j++;
	}

	if (timeout) {
		TIMEVAL_TO_TIMESPEC(timeout, &ts);
		tsp = &ts;
	}
	rv = pollts(pfds, realnfds, tsp, NULL);
	if (rv < 0)
		goto out;

	/*
	 * ok, harvest results.  first zero out entries (can't use
	 * FD_ZERO for the obvious select-me-not reason).  This is done
	 * for a timeout as well, so that the caller does not see its
	 * own input bits as ready descriptors.
	 */
	for (i = 0; i < nfds; i++) {
		if (readfds)
			FD_CLR(i, readfds);
		if (writefds)
			FD_CLR(i, writefds);
		if (exceptfds)
			FD_CLR(i, exceptfds);
	}

	/* and then plug in the results */
	for (j = 0; j < realnfds; j++) {
		if (readfds && (pfds[j].revents & POLLIN))
			FD_SET(pfds[j].fd, readfds);
		if (writefds && (pfds[j].revents & POLLOUT))
			FD_SET(pfds[j].fd, writefds);
		if (exceptfds && (pfds[j].revents & (POLLHUP|POLLERR)))
			FD_SET(pfds[j].fd, exceptfds);
	}

 out:
	/* keep the errno of a failed pollts() across the cleanup */
	sverrno = errno;
	free(pfds);
	if (rv < 0)
		errno = sverrno;
	return rv;
}
    769 
/*
 * Count how many entries of a poll vector refer to the rump kernel
 * and how many to the host.  The counters are incremented, not reset,
 * so the caller must initialize them.
 *
 * Entries with fd == -1 are skipped: fd_isrump() reports -1 as a rump
 * descriptor, and counting such placeholder entries would make a pure
 * host vector look like a mixed one.
 */
static void
checkpoll(struct pollfd *fds, nfds_t nfds, int *hostcall, int *rumpcall)
{
	nfds_t i;

	for (i = 0; i < nfds; i++) {
		if (fds[i].fd == -1)
			continue;

		if (fd_isrump(fds[i].fd))
			(*rumpcall)++;
		else
			(*hostcall)++;
	}
}
    782 
/*
 * Rewrite the fd member of each of the nfds entries in fds with the
 * result of applying fdadj to it.  Entries are processed in order;
 * events and revents are left alone.
 */
static void
adjustpoll(struct pollfd *fds, nfds_t nfds, int (*fdadj)(int))
{
	nfds_t idx = 0;

	while (idx < nfds) {
		struct pollfd *cur = &fds[idx];

		cur->fd = fdadj(cur->fd);
		idx++;
	}
}
    792 
/*
 * Timespec layout with an explicit 64-bit seconds field.  Only the
 * disabled (#if 0) compatibility kludge in pollts() refers to it.
 */
struct mytimespec {
	uint64_t tv_sec;
	long tv_nsec;
};
    797 
/*
 * poll is easy as long as the fds in a call all belong to a single
 * kernel.  Otherwise it is quite tricky: the host part runs in a
 * helper thread, and this structure carries its arguments and result.
 */
struct pollarg {
	struct pollfd *pfds;		/* vector for host_pollts() */
	nfds_t nfds;			/* number of entries in pfds */
	const struct timespec *ts;	/* timeout passed to host_pollts() */
	const sigset_t *sigmask;	/* signal mask passed to host_pollts() */
	int pipefd;			/* rump fd written when the host poll ends */
	int errnum;			/* errno of a failed host poll */
};
    810 
    811 static void *
    812 hostpoll(void *arg)
    813 {
    814 	struct pollarg *parg = arg;
    815 	intptr_t rv;
    816 
    817 	rv = host_pollts(parg->pfds, parg->nfds, parg->ts, parg->sigmask);
    818 	if (rv == -1)
    819 		parg->errnum = errno;
    820 	rump_sys_write(parg->pipefd, &rv, sizeof(rv));
    821 
    822 	return (void *)(intptr_t)rv;
    823 }
    824 
    825 int
    826 pollts(struct pollfd *fds, nfds_t nfds, const struct timespec *ts,
    827 	const sigset_t *sigmask)
    828 {
    829 	int (*op_pollts)(struct pollfd *, nfds_t, const struct timespec *,
    830 			 const sigset_t *);
    831 	int hostcall = 0, rumpcall = 0;
    832 	pthread_t pt;
    833 	nfds_t i;
    834 	int rv;
    835 
    836 #if 0
    837 	/* XXX: quick 5.0 kludge.  do syscall compat in rumpclient properly */
    838 	struct mytimespec mts;
    839 	if (ts) {
    840 		mts.tv_sec = ts->tv_sec;
    841 		mts.tv_nsec = ts->tv_nsec;
    842 		ts = (struct timespec *)&mts;
    843 	}
    844 #endif
    845 
    846 	DPRINTF(("poll\n"));
    847 	checkpoll(fds, nfds, &hostcall, &rumpcall);
    848 
    849 	if (hostcall && rumpcall) {
    850 		struct pollfd *pfd_host = NULL, *pfd_rump = NULL;
    851 		int rpipe[2] = {-1,-1}, hpipe[2] = {-1,-1};
    852 		struct pollarg parg;
    853 		uintptr_t lrv;
    854 		int sverrno = 0, trv;
    855 
    856 		/*
    857 		 * ok, this is where it gets tricky.  We must support
    858 		 * this since it's a very common operation in certain
    859 		 * types of software (telnet, netcat, etc).  We allocate
    860 		 * two vectors and run two poll commands in separate
    861 		 * threads.  Whichever returns first "wins" and the
    862 		 * other kernel's fds won't show activity.
    863 		 */
    864 		rv = -1;
    865 
    866 		/* allocate full vector for O(n) joining after call */
    867 		pfd_host = malloc(sizeof(*pfd_host)*(nfds+1));
    868 		if (!pfd_host)
    869 			goto out;
    870 		pfd_rump = malloc(sizeof(*pfd_rump)*(nfds+1));
    871 		if (!pfd_rump) {
    872 			goto out;
    873 		}
    874 
    875 		/* split vectors */
    876 		for (i = 0; i < nfds; i++) {
    877 			if (fds[i].fd == -1) {
    878 				pfd_host[i].fd = -1;
    879 				pfd_rump[i].fd = -1;
    880 			} else if (fd_isrump(fds[i].fd)) {
    881 				pfd_host[i].fd = -1;
    882 				pfd_rump[i].fd = fd_host2rump(fds[i].fd);
    883 				pfd_rump[i].events = fds[i].events;
    884 			} else {
    885 				pfd_rump[i].fd = -1;
    886 				pfd_host[i].fd = fds[i].fd;
    887 				pfd_host[i].events = fds[i].events;
    888 			}
    889 		}
    890 
    891 		/*
    892 		 * then, open two pipes, one for notifications
    893 		 * to each kernel.
    894 		 */
    895 		if (rump_sys_pipe(rpipe) == -1)
    896 			goto out;
    897 		if (pipe(hpipe) == -1)
    898 			goto out;
    899 
    900 		pfd_host[nfds].fd = hpipe[0];
    901 		pfd_host[nfds].events = POLLIN;
    902 		pfd_rump[nfds].fd = rpipe[0];
    903 		pfd_rump[nfds].events = POLLIN;
    904 
    905 		/*
    906 		 * then, create a thread to do host part and meanwhile
    907 		 * do rump kernel part right here
    908 		 */
    909 
    910 		parg.pfds = pfd_host;
    911 		parg.nfds = nfds+1;
    912 		parg.ts = ts;
    913 		parg.sigmask = sigmask;
    914 		parg.pipefd = rpipe[1];
    915 		pthread_create(&pt, NULL, hostpoll, &parg);
    916 
    917 		op_pollts = rumpcalls[RUMPCALL_POLLTS];
    918 		lrv = op_pollts(pfd_rump, nfds+1, ts, NULL);
    919 		sverrno = errno;
    920 		write(hpipe[1], &rv, sizeof(rv));
    921 		pthread_join(pt, (void *)&trv);
    922 
    923 		/* check who "won" and merge results */
    924 		if (lrv != 0 && pfd_host[nfds].revents & POLLIN) {
    925 			rv = trv;
    926 
    927 			for (i = 0; i < nfds; i++) {
    928 				if (pfd_rump[i].fd != -1)
    929 					fds[i].revents = pfd_rump[i].revents;
    930 			}
    931 			sverrno = parg.errnum;
    932 		} else if (trv != 0 && pfd_rump[nfds].revents & POLLIN) {
    933 			rv = trv;
    934 
    935 			for (i = 0; i < nfds; i++) {
    936 				if (pfd_host[i].fd != -1)
    937 					fds[i].revents = pfd_host[i].revents;
    938 			}
    939 		} else {
    940 			rv = 0;
    941 		}
    942 
    943  out:
    944 		if (rpipe[0] != -1)
    945 			rump_sys_close(rpipe[0]);
    946 		if (rpipe[1] != -1)
    947 			rump_sys_close(rpipe[1]);
    948 		if (hpipe[0] != -1)
    949 			close(hpipe[0]);
    950 		if (hpipe[1] != -1)
    951 			close(hpipe[1]);
    952 		free(pfd_host);
    953 		free(pfd_rump);
    954 		errno = sverrno;
    955 	} else {
    956 		if (hostcall) {
    957 			op_pollts = host_pollts;
    958 		} else {
    959 			op_pollts = rumpcalls[RUMPCALL_POLLTS];
    960 			adjustpoll(fds, nfds, fd_host2rump);
    961 		}
    962 
    963 		rv = op_pollts(fds, nfds, ts, sigmask);
    964 		if (rumpcall)
    965 			adjustpoll(fds, nfds, fd_rump2host);
    966 	}
    967 
    968 	return rv;
    969 }
    970 
    971 int
    972 poll(struct pollfd *fds, nfds_t nfds, int timeout)
    973 {
    974 	struct timespec ts;
    975 	struct timespec *tsp = NULL;
    976 
    977 	if (timeout != INFTIM) {
    978 		ts.tv_sec = timeout / 1000;
    979 		ts.tv_nsec = (timeout % 1000) * 1000;
    980 
    981 		tsp = &ts;
    982 	}
    983 
    984 	return pollts(fds, nfds, tsp, NULL);
    985 }
    986