Home | History | Annotate | Line # | Download | only in librumphijack
hijack.c revision 1.8
      1 /*      $NetBSD: hijack.c,v 1.8 2011/01/17 16:27:54 pooka Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2011 Antti Kantee.  All Rights Reserved.
      5  *
      6  * Redistribution and use in source and binary forms, with or without
      7  * modification, are permitted provided that the following conditions
      8  * are met:
      9  * 1. Redistributions of source code must retain the above copyright
     10  *    notice, this list of conditions and the following disclaimer.
     11  * 2. Redistributions in binary form must reproduce the above copyright
     12  *    notice, this list of conditions and the following disclaimer in the
     13  *    documentation and/or other materials provided with the distribution.
     14  *
     15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
     16  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     17  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     18  * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
     19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
     21  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     25  * SUCH DAMAGE.
     26  */
     27 
     28 #include <sys/cdefs.h>
     29 __RCSID("$NetBSD: hijack.c,v 1.8 2011/01/17 16:27:54 pooka Exp $");
     30 
     31 #include <sys/param.h>
     32 #include <sys/types.h>
     33 #include <sys/ioctl.h>
     34 #include <sys/socket.h>
     35 #include <sys/poll.h>
     36 
     37 #include <rump/rumpclient.h>
     38 #include <rump/rump_syscalls.h>
     39 
     40 #include <assert.h>
     41 #include <dlfcn.h>
     42 #include <err.h>
     43 #include <errno.h>
     44 #include <fcntl.h>
     45 #include <poll.h>
     46 #include <pthread.h>
     47 #include <signal.h>
     48 #include <stdarg.h>
     49 #include <stdbool.h>
     50 #include <stdio.h>
     51 #include <stdlib.h>
     52 #include <time.h>
     53 #include <unistd.h>
     54 
/*
 * Indices of the hijacked system calls.  They are used to index both
 * rumpcalls[] and sysnames[], so the order here must match the order
 * of the entries in sysnames[].  RUMPCALL__NUM is the number of calls.
 */
enum {
	RUMPCALL_SOCKET,
	RUMPCALL_ACCEPT,
	RUMPCALL_BIND,
	RUMPCALL_CONNECT,
	RUMPCALL_GETPEERNAME,
	RUMPCALL_GETSOCKNAME,
	RUMPCALL_LISTEN,
	RUMPCALL_RECVFROM,
	RUMPCALL_RECVMSG,
	RUMPCALL_SENDTO,
	RUMPCALL_SENDMSG,
	RUMPCALL_GETSOCKOPT,
	RUMPCALL_SETSOCKOPT,
	RUMPCALL_SHUTDOWN,
	RUMPCALL_READ,
	RUMPCALL_READV,
	RUMPCALL_WRITE,
	RUMPCALL_WRITEV,
	RUMPCALL_IOCTL,
	RUMPCALL_FCNTL,
	RUMPCALL_CLOSE,
	RUMPCALL_POLLTS,
	RUMPCALL__NUM
};
     68 
/*
 * RSYS_NAME(FOO) produces a string literal from RUMP_SYS_RENAME_FOO
 * (presumably provided by <rump/rump_syscalls.h> -- confirm).  The
 * RSYS_STRING indirection is the usual two-level idiom that lets the
 * argument be macro-expanded before it is stringified.
 */
#define RSYS_STRING(a) __STRING(a)
#define RSYS_NAME(a) RSYS_STRING(__CONCAT(RUMP_SYS_RENAME_,a))

/*
 * Symbol names that rcinit() looks up in librumpclient to fill in
 * rumpcalls[].  The table is indexed by the RUMPCALL_* constants, so
 * the entries must stay in the same order as that enum.
 */
const char *sysnames[] = {
	RSYS_NAME(SOCKET),
	RSYS_NAME(ACCEPT),
	RSYS_NAME(BIND),
	RSYS_NAME(CONNECT),
	RSYS_NAME(GETPEERNAME),
	RSYS_NAME(GETSOCKNAME),
	RSYS_NAME(LISTEN),
	RSYS_NAME(RECVFROM),
	RSYS_NAME(RECVMSG),
	RSYS_NAME(SENDTO),
	RSYS_NAME(SENDMSG),
	RSYS_NAME(GETSOCKOPT),
	RSYS_NAME(SETSOCKOPT),
	RSYS_NAME(SHUTDOWN),
	RSYS_NAME(READ),
	RSYS_NAME(READV),
	RSYS_NAME(WRITE),
	RSYS_NAME(WRITEV),
	RSYS_NAME(IOCTL),
	RSYS_NAME(FCNTL),
	RSYS_NAME(CLOSE),
	RSYS_NAME(POLLTS),
};
     96 
/*
 * Host (non-rump) implementations of the interposed calls.  rcinit()
 * resolves each of them with dlsym(RTLD_NEXT, ...); the wrappers below
 * call through these pointers when a descriptor belongs to the host.
 */
static int	(*host_socket)(int, int, int);
static int	(*host_connect)(int, const struct sockaddr *, socklen_t);
static int	(*host_bind)(int, const struct sockaddr *, socklen_t);
static int	(*host_listen)(int, int);
static int	(*host_accept)(int, struct sockaddr *, socklen_t *);
static int	(*host_getpeername)(int, struct sockaddr *, socklen_t *);
static int	(*host_getsockname)(int, struct sockaddr *, socklen_t *);
static int	(*host_setsockopt)(int, int, int, const void *, socklen_t);

static ssize_t	(*host_read)(int, void *, size_t);
static ssize_t	(*host_readv)(int, const struct iovec *, int);
static ssize_t	(*host_write)(int, const void *, size_t);
static ssize_t	(*host_writev)(int, const struct iovec *, int);
static int	(*host_ioctl)(int, unsigned long, ...);
static int	(*host_fcntl)(int, int, ...);
static int	(*host_close)(int);
static int	(*host_pollts)(struct pollfd *, nfds_t,
			       const struct timespec *, const sigset_t *);
static pid_t	(*host_fork)(void);
static int	(*host_dup2)(int, int);

/*
 * Rump kernel entry points, indexed by RUMPCALL_*.  Filled in by
 * rcinit() from the names in sysnames[]; stored untyped, so each
 * wrapper assigns its entry to a correctly typed function pointer
 * before calling it.
 */
static void *rumpcalls[RUMPCALL__NUM];
    119 
/*
 * dlsym() trampoline handed to librumpclient (see rcinit()).  When this
 * library is LD_PRELOADed, librumpclient performs its lookups through
 * this function so that RTLD_NEXT resolves correctly.
 */
static void *
hijackdlsym(void *handle, const char *symbol)
{
	void *addr;

	addr = dlsym(handle, symbol);
	return addr;
}
    130 
/* low calorie sockets? */
/*
 * When true, __socket30() creates AF_LOCAL sockets in the host kernel
 * instead of the rump kernel.  Nothing in the code visible here ever
 * sets it to true.
 */
static bool hostlocalsockets = false;
    133 
    134 static void __attribute__((constructor))
    135 rcinit(void)
    136 {
    137 	int (*rumpcinit)(void);
    138 	void **rumpcdlsym;
    139 	void *hand;
    140 	int i;
    141 
    142 	hand = dlopen("librumpclient.so", RTLD_LAZY|RTLD_GLOBAL);
    143 	if (!hand)
    144 		err(1, "cannot open librumpclient.so");
    145 	rumpcinit = dlsym(hand, "rumpclient_init");
    146 	_DIAGASSERT(rumpcinit);
    147 
    148 	rumpcdlsym = dlsym(hand, "rumpclient_dlsym");
    149 	*rumpcdlsym = hijackdlsym;
    150 
    151 	host_socket = dlsym(RTLD_NEXT, "__socket30");
    152 	host_listen = dlsym(RTLD_NEXT, "listen");
    153 	host_connect = dlsym(RTLD_NEXT, "connect");
    154 	host_bind = dlsym(RTLD_NEXT, "bind");
    155 	host_accept = dlsym(RTLD_NEXT, "accept");
    156 	host_getpeername = dlsym(RTLD_NEXT, "getpeername");
    157 	host_getsockname = dlsym(RTLD_NEXT, "getsockname");
    158 	host_setsockopt = dlsym(RTLD_NEXT, "setsockopt");
    159 
    160 	host_read = dlsym(RTLD_NEXT, "read");
    161 	host_readv = dlsym(RTLD_NEXT, "readv");
    162 	host_write = dlsym(RTLD_NEXT, "write");
    163 	host_writev = dlsym(RTLD_NEXT, "writev");
    164 	host_ioctl = dlsym(RTLD_NEXT, "ioctl");
    165 	host_fcntl = dlsym(RTLD_NEXT, "fcntl");
    166 	host_close = dlsym(RTLD_NEXT, "close");
    167 	host_pollts = dlsym(RTLD_NEXT, "pollts");
    168 	host_fork = dlsym(RTLD_NEXT, "fork");
    169 	host_dup2 = dlsym(RTLD_NEXT, "dup2");
    170 
    171 	for (i = 0; i < RUMPCALL__NUM; i++) {
    172 		rumpcalls[i] = dlsym(hand, sysnames[i]);
    173 		if (!rumpcalls[i]) {
    174 			fprintf(stderr, "rumphijack: cannot find symbol: %s\n",
    175 			    sysnames[i]);
    176 			exit(1);
    177 		}
    178 	}
    179 
    180 	if (rumpcinit() == -1)
    181 		err(1, "rumpclient init");
    182 }
    183 
/*
 * Bookkeeping for low descriptors that dup2() has redirected to the
 * rump kernel.  dup2() ORs (newd + 1) into dup2mask and ISDUP2D(fd)
 * tests whether every bit of (fd + 1) is set in the mask.
 *
 * Consequences of this encoding worth knowing:
 *  - ISDUP2D(-1) is always true, because fd + 1 is 0.
 *  - fd 2 is encoded as 3 (both low bits), so marking fd 2 also makes
 *    fds 0 and 1 test true, and marking fds 0 and 1 makes fd 2 test
 *    true.
 *
 * The macro argument is fully parenthesized so that expressions may be
 * passed safely.
 */
static unsigned dup2mask;
#define ISDUP2D(fd) ((((fd)+1) & dup2mask) == ((fd)+1))
    186 
/*
 * Debug tracing.  With DEBUGJACK defined, DPRINTF((fmt, ...)) prints to
 * stderr via mydprintf(); otherwise DPRINTF() expands to nothing.
 * Note the double parentheses required at the call site.
 */
//#define DEBUGJACK
#ifndef DEBUGJACK
#define DPRINTF(x)
#else
#define DPRINTF(x) mydprintf x
static void
mydprintf(const char *fmt, ...)
{
	va_list args;

	/* stay quiet once stderr is marked as dup2'd to the rump kernel */
	if (!ISDUP2D(STDERR_FILENO)) {
		va_start(args, fmt);
		vfprintf(stderr, fmt, args);
		va_end(args);
	}
}

#endif
    206 
/* XXX: need runtime selection.  low for now due to FD_SETSIZE */
/*
 * HIJACK_FDOFF:  offset added to a rump kernel descriptor to form the
 *                number the application sees (fd_rump2host() and
 *                fd_host2rump()); descriptors at or above it are
 *                treated as rump kernel descriptors by fd_isrump().
 * HIJACK_SELECT: not referenced in the code visible here.
 * HIJACK_ASSERT: lower bound checked by assertfd().
 */
#define HIJACK_FDOFF 128
#define HIJACK_SELECT 128 /* XXX */
#define HIJACK_ASSERT 128 /* XXX */
    211 static int
    212 fd_rump2host(int fd)
    213 {
    214 
    215 	if (fd == -1)
    216 		return fd;
    217 
    218 	if (!ISDUP2D(fd))
    219 		fd += HIJACK_FDOFF;
    220 
    221 	return fd;
    222 }
    223 
    224 static int
    225 fd_host2rump(int fd)
    226 {
    227 
    228 	if (!ISDUP2D(fd))
    229 		fd -= HIJACK_FDOFF;
    230 	return fd;
    231 }
    232 
    233 static bool
    234 fd_isrump(int fd)
    235 {
    236 
    237 	return ISDUP2D(fd) || fd >= HIJACK_FDOFF;
    238 }
    239 
/*
 * Assert that the descriptor belongs to the rump kernel: it is either
 * marked by ISDUP2D() or is at/above HIJACK_ASSERT.  Used by the
 * wrappers that have no host fallback.
 */
#define assertfd(_fd_) assert(ISDUP2D(_fd_) || (_fd_) >= HIJACK_ASSERT)
/* the offset is only meant to be used by the translation helpers */
#undef HIJACK_FDOFF
    242 
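/*
 * Socket call wrappers.  Some of them dispatch to the host or to the
 * rump kernel depending on the descriptor; the ones guarded by
 * assertfd() always call the rump kernel.
 */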
    246 
    247 int __socket30(int, int, int);
    248 int
    249 __socket30(int domain, int type, int protocol)
    250 {
    251 	int (*rc_socket)(int, int, int);
    252 	int fd;
    253 	bool dohost;
    254 
    255 	dohost = hostlocalsockets && (domain == AF_LOCAL);
    256 
    257 	if (dohost)
    258 		rc_socket = host_socket;
    259 	else
    260 		rc_socket = rumpcalls[RUMPCALL_SOCKET];
    261 	fd = rc_socket(domain, type, protocol);
    262 
    263 	if (!dohost)
    264 		fd = fd_rump2host(fd);
    265 	DPRINTF(("socket <- %d\n", fd));
    266 
    267 	return fd;
    268 }
    269 
    270 int
    271 accept(int s, struct sockaddr *addr, socklen_t *addrlen)
    272 {
    273 	int (*rc_accept)(int, struct sockaddr *, socklen_t *);
    274 	int fd;
    275 	bool isrump;
    276 
    277 	isrump = fd_isrump(s);
    278 
    279 	DPRINTF(("accept -> %d", s));
    280 	if (isrump) {
    281 		rc_accept = rumpcalls[RUMPCALL_ACCEPT];
    282 		s = fd_host2rump(s);
    283 	} else {
    284 		rc_accept = host_accept;
    285 	}
    286 	fd = rc_accept(s, addr, addrlen);
    287 	if (fd != -1 && isrump)
    288 		fd = fd_rump2host(fd);
    289 
    290 	DPRINTF((" <- %d\n", fd));
    291 
    292 	return fd;
    293 }
    294 
    295 int
    296 bind(int s, const struct sockaddr *name, socklen_t namelen)
    297 {
    298 	int (*rc_bind)(int, const struct sockaddr *, socklen_t);
    299 
    300 	DPRINTF(("bind -> %d\n", s));
    301 	if (fd_isrump(s)) {
    302 		rc_bind = rumpcalls[RUMPCALL_BIND];
    303 		s = fd_host2rump(s);
    304 	} else {
    305 		rc_bind = host_bind;
    306 	}
    307 	return rc_bind(s, name, namelen);
    308 }
    309 
    310 int
    311 connect(int s, const struct sockaddr *name, socklen_t namelen)
    312 {
    313 	int (*rc_connect)(int, const struct sockaddr *, socklen_t);
    314 
    315 	DPRINTF(("connect -> %d\n", s));
    316 	if (fd_isrump(s)) {
    317 		rc_connect = rumpcalls[RUMPCALL_CONNECT];
    318 		s = fd_host2rump(s);
    319 	} else {
    320 		rc_connect = host_connect;
    321 	}
    322 
    323 	return rc_connect(s, name, namelen);
    324 }
    325 
    326 int
    327 getpeername(int s, struct sockaddr *name, socklen_t *namelen)
    328 {
    329 	int (*rc_getpeername)(int, struct sockaddr *, socklen_t *);
    330 
    331 	DPRINTF(("getpeername -> %d\n", s));
    332 	if (fd_isrump(s)) {
    333 		rc_getpeername = rumpcalls[RUMPCALL_GETPEERNAME];
    334 		s = fd_host2rump(s);
    335 	} else {
    336 		rc_getpeername = host_getpeername;
    337 	}
    338 	return rc_getpeername(s, name, namelen);
    339 }
    340 
    341 int
    342 getsockname(int s, struct sockaddr *name, socklen_t *namelen)
    343 {
    344 	int (*rc_getsockname)(int, struct sockaddr *, socklen_t *);
    345 
    346 	DPRINTF(("getsockname -> %d\n", s));
    347 	if (fd_isrump(s)) {
    348 		rc_getsockname = rumpcalls[RUMPCALL_GETSOCKNAME];
    349 		s = fd_host2rump(s);
    350 	} else {
    351 		rc_getsockname = host_getsockname;
    352 	}
    353 	return rc_getsockname(s, name, namelen);
    354 }
    355 
    356 int
    357 listen(int s, int backlog)
    358 {
    359 	int (*rc_listen)(int, int);
    360 
    361 	DPRINTF(("listen -> %d\n", s));
    362 	if (fd_isrump(s)) {
    363 		rc_listen = rumpcalls[RUMPCALL_LISTEN];
    364 		s = fd_host2rump(s);
    365 	} else {
    366 		rc_listen = host_listen;
    367 	}
    368 	return rc_listen(s, backlog);
    369 }
    370 
/*
 * recv(2) wrapper: implemented as recvfrom() without a source address.
 */
ssize_t
recv(int s, void *buf, size_t len, int flags)
{
	ssize_t nrecv;

	nrecv = recvfrom(s, buf, len, flags, NULL, NULL);
	return nrecv;
}
    377 
    378 ssize_t
    379 recvfrom(int s, void *buf, size_t len, int flags, struct sockaddr *from,
    380 	socklen_t *fromlen)
    381 {
    382 	int (*rc_recvfrom)(int, void *, size_t, int,
    383 	    struct sockaddr *, socklen_t *);
    384 
    385 	DPRINTF(("recvfrom\n"));
    386 	assertfd(s);
    387 	rc_recvfrom = rumpcalls[RUMPCALL_RECVFROM];
    388 	return rc_recvfrom(fd_host2rump(s), buf, len, flags, from, fromlen);
    389 }
    390 
    391 ssize_t
    392 recvmsg(int s, struct msghdr *msg, int flags)
    393 {
    394 	int (*rc_recvmsg)(int, struct msghdr *, int);
    395 
    396 	DPRINTF(("recvmsg\n"));
    397 	assertfd(s);
    398 	rc_recvmsg = rumpcalls[RUMPCALL_RECVMSG];
    399 	return rc_recvmsg(fd_host2rump(s), msg, flags);
    400 }
    401 
/*
 * send(2) wrapper: implemented as sendto() without a destination
 * address.
 */
ssize_t
send(int s, const void *buf, size_t len, int flags)
{
	ssize_t nsent;

	nsent = sendto(s, buf, len, flags, NULL, 0);
	return nsent;
}
    408 
    409 ssize_t
    410 sendto(int s, const void *buf, size_t len, int flags,
    411 	const struct sockaddr *to, socklen_t tolen)
    412 {
    413 	int (*rc_sendto)(int, const void *, size_t, int,
    414 	    const struct sockaddr *, socklen_t);
    415 
    416 	if (s == -1)
    417 		return len;
    418 
    419 	DPRINTF(("sendto\n"));
    420 	assertfd(s);
    421 	rc_sendto = rumpcalls[RUMPCALL_SENDTO];
    422 	return rc_sendto(fd_host2rump(s), buf, len, flags, to, tolen);
    423 }
    424 
    425 ssize_t
    426 sendmsg(int s, const struct msghdr *msg, int flags)
    427 {
    428 	int (*rc_sendmsg)(int, const struct msghdr *, int);
    429 
    430 	DPRINTF(("sendmsg\n"));
    431 	assertfd(s);
    432 	rc_sendmsg = rumpcalls[RUMPCALL_SENDTO];
    433 	return rc_sendmsg(fd_host2rump(s), msg, flags);
    434 }
    435 
    436 int
    437 getsockopt(int s, int level, int optname, void *optval, socklen_t *optlen)
    438 {
    439 	int (*rc_getsockopt)(int, int, int, void *, socklen_t *);
    440 
    441 	DPRINTF(("getsockopt -> %d\n", s));
    442 	assertfd(s);
    443 	rc_getsockopt = rumpcalls[RUMPCALL_GETSOCKOPT];
    444 	return rc_getsockopt(fd_host2rump(s), level, optname, optval, optlen);
    445 }
    446 
    447 int
    448 setsockopt(int s, int level, int optname, const void *optval, socklen_t optlen)
    449 {
    450 	int (*rc_setsockopt)(int, int, int, const void *, socklen_t);
    451 
    452 	DPRINTF(("setsockopt -> %d\n", s));
    453 	if (fd_isrump(s)) {
    454 		rc_setsockopt = rumpcalls[RUMPCALL_SETSOCKOPT];
    455 		s = fd_host2rump(s);
    456 	} else {
    457 		rc_setsockopt = host_setsockopt;
    458 	}
    459 	return rc_setsockopt(s, level, optname, optval, optlen);
    460 }
    461 
    462 int
    463 shutdown(int s, int how)
    464 {
    465 	int (*rc_shutdown)(int, int);
    466 
    467 	DPRINTF(("shutdown -> %d\n", s));
    468 	assertfd(s);
    469 	rc_shutdown = rumpcalls[RUMPCALL_SHUTDOWN];
    470 	return rc_shutdown(fd_host2rump(s), how);
    471 }
    472 
    473 /*
    474  * dup2 is special.  we allow dup2 of a rump kernel fd to 0-2 since
    475  * many programs do that.  dup2 of a rump kernel fd to another value
    476  * not >= fdoff is an error.
    477  *
    478  * Note: cannot rump2host newd, because it is often hardcoded.
    479  *
    480  * XXX: should disable debug prints after stdout/stderr are dup2'd
    481  */
    482 int
    483 dup2(int oldd, int newd)
    484 {
    485 	int rv;
    486 
    487 	DPRINTF(("dup2 -> %d (o) -> %d (n)\n", oldd, newd));
    488 
    489 	if (fd_isrump(oldd)) {
    490 		if (!(newd >= 0 && newd <= 2))
    491 			return EBADF;
    492 		oldd = fd_host2rump(oldd);
    493 		rv = rump_sys_dup2(oldd, newd);
    494 		if (rv != -1)
    495 			dup2mask |= newd+1;
    496 		return rv;
    497 	} else {
    498 		return host_dup2(oldd, newd);
    499 	}
    500 }
    501 
/*
 * fork() is wrapped with the appropriate rump client calls so that the
 * file descriptors of the forking parent are preserved in the child
 * while double use of the connection fd is prevented.
 *
 * Returns what the host fork() returns, except that the child gets -1
 * if rumpclient_fork_init() fails, and -1 is returned without forking
 * if rumpclient_prefork() fails.
 */

pid_t
fork()
{
	struct rumpclient_fork *rf;
	pid_t pid;

	DPRINTF(("fork\n"));

	rf = rumpclient_prefork();
	if (rf == NULL)
		return -1;

	pid = host_fork();
	/* XXX: rf is not cancelled when host_fork() fails (pid == -1) */
	if (pid == 0) {
		/* child: set up its own rump client state */
		if (rumpclient_fork_init(rf) == -1)
			pid = -1;
	}

	DPRINTF(("fork returns %d\n", pid));
	return pid;
}
    534 
    535 /*
    536  * Hybrids
    537  */
    538 
    539 ssize_t
    540 read(int fd, void *buf, size_t len)
    541 {
    542 	int (*op_read)(int, void *, size_t);
    543 	ssize_t n;
    544 
    545 	DPRINTF(("read %d\n", fd));
    546 	if (fd_isrump(fd)) {
    547 		fd = fd_host2rump(fd);
    548 		op_read = rumpcalls[RUMPCALL_READ];
    549 	} else {
    550 		op_read = host_read;
    551 	}
    552 
    553 	n = op_read(fd, buf, len);
    554 	return n;
    555 }
    556 
    557 ssize_t
    558 readv(int fd, const struct iovec *iov, int iovcnt)
    559 {
    560 	int (*op_readv)(int, const struct iovec *, int);
    561 
    562 	DPRINTF(("readv %d\n", fd));
    563 	if (fd_isrump(fd)) {
    564 		fd = fd_host2rump(fd);
    565 		op_readv = rumpcalls[RUMPCALL_READV];
    566 	} else {
    567 		op_readv = host_readv;
    568 	}
    569 
    570 	return op_readv(fd, iov, iovcnt);
    571 }
    572 
    573 ssize_t
    574 write(int fd, const void *buf, size_t len)
    575 {
    576 	int (*op_write)(int, const void *, size_t);
    577 
    578 	if (fd_isrump(fd)) {
    579 		fd = fd_host2rump(fd);
    580 		op_write = rumpcalls[RUMPCALL_WRITE];
    581 	} else {
    582 		op_write = host_write;
    583 	}
    584 
    585 	return op_write(fd, buf, len);
    586 }
    587 
    588 ssize_t
    589 writev(int fd, const struct iovec *iov, int iovcnt)
    590 {
    591 	int (*op_writev)(int, const struct iovec *, int);
    592 
    593 	DPRINTF(("writev %d\n", fd));
    594 	if (fd_isrump(fd)) {
    595 		fd = fd_host2rump(fd);
    596 		op_writev = rumpcalls[RUMPCALL_WRITEV];
    597 	} else {
    598 		op_writev = host_writev;
    599 	}
    600 
    601 	return op_writev(fd, iov, iovcnt);
    602 }
    603 
    604 int
    605 ioctl(int fd, unsigned long cmd, ...)
    606 {
    607 	int (*op_ioctl)(int, unsigned long cmd, ...);
    608 	va_list ap;
    609 	int rv;
    610 
    611 	DPRINTF(("ioctl\n"));
    612 	if (fd_isrump(fd)) {
    613 		fd = fd_host2rump(fd);
    614 		op_ioctl = rumpcalls[RUMPCALL_IOCTL];
    615 	} else {
    616 		op_ioctl = host_ioctl;
    617 	}
    618 
    619 	va_start(ap, cmd);
    620 	rv = op_ioctl(fd, cmd, va_arg(ap, void *));
    621 	va_end(ap);
    622 	return rv;
    623 }
    624 
    625 int
    626 fcntl(int fd, int cmd, ...)
    627 {
    628 	int (*op_fcntl)(int, int, ...);
    629 	va_list ap;
    630 	int rv;
    631 
    632 	DPRINTF(("fcntl\n"));
    633 	if (fd_isrump(fd)) {
    634 		fd = fd_host2rump(fd);
    635 		op_fcntl = rumpcalls[RUMPCALL_FCNTL];
    636 	} else {
    637 		op_fcntl = host_fcntl;
    638 	}
    639 
    640 	va_start(ap, cmd);
    641 	rv = op_fcntl(fd, cmd, va_arg(ap, void *));
    642 	va_end(ap);
    643 	return rv;
    644 }
    645 
    646 int
    647 close(int fd)
    648 {
    649 	int (*op_close)(int);
    650 
    651 	DPRINTF(("close %d\n", fd));
    652 	if (fd_isrump(fd)) {
    653 		fd = fd_host2rump(fd);
    654 		op_close = rumpcalls[RUMPCALL_CLOSE];
    655 	} else {
    656 		op_close = host_close;
    657 	}
    658 
    659 	return op_close(fd);
    660 }
    661 
/*
 * select(2) emulated on top of pollts(), so that descriptor sets
 * spanning both the host and the rump kernel work.
 *
 * The descriptor sets are converted to a pollfd vector (read -> POLLIN,
 * write -> POLLOUT, except -> POLLHUP|POLLERR), pollts() is called, and
 * the results are converted back into the sets.
 *
 * Returns the value of pollts(): -1 on error (the sets are left
 * untouched), 0 on timeout (the sets are cleared), otherwise the number
 * of descriptors with events.  Note that the latter counts descriptors,
 * not set bits.  A negative nfds fails with EINVAL; an allocation
 * failure returns -1 with errno as set by malloc().
 */
int
select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds,
	struct timeval *timeout)
{
	struct pollfd *pfds = NULL;
	struct timespec ts, *tsp = NULL;
	nfds_t realnfds, j;
	short events;
	int i, rv;

	DPRINTF(("select\n"));

	if (nfds < 0) {
		errno = EINVAL;
		return -1;
	}

	/*
	 * Well, first we must scan the fds to figure out how many
	 * fds there really are.  This is because up to and including
	 * nb5 poll() silently refuses nfds > process_open_fds.
	 * Seems to be fixed in current, thank the maker.
	 */
	realnfds = 0;
	for (i = 0; i < nfds; i++) {
		if ((readfds && FD_ISSET(i, readfds)) ||
		    (writefds && FD_ISSET(i, writefds)) ||
		    (exceptfds && FD_ISSET(i, exceptfds)))
			realnfds++;
	}

	if (realnfds) {
		pfds = malloc(sizeof(*pfds) * realnfds);
		if (!pfds)
			return -1;
	}

	/*
	 * Build the poll vector.  An entry is touched only once we know
	 * the descriptor is selected; this keeps us inside the vector
	 * (and away from a NULL vector when nothing is selected).
	 */
	for (i = 0, j = 0; i < nfds; i++) {
		events = 0;
		if (readfds && FD_ISSET(i, readfds))
			events |= POLLIN;
		if (writefds && FD_ISSET(i, writefds))
			events |= POLLOUT;
		if (exceptfds && FD_ISSET(i, exceptfds))
			events |= POLLHUP|POLLERR;
		if (events == 0)
			continue;

		pfds[j].fd = i;
		pfds[j].events = events;
		pfds[j].revents = 0;
		j++;
	}

	if (timeout) {
		TIMEVAL_TO_TIMESPEC(timeout, &ts);
		tsp = &ts;
	}
	rv = pollts(pfds, realnfds, tsp, NULL);
	if (rv < 0)
		goto out;

	/*
	 * ok, harvest results.  first zero out entries (can't use
	 * FD_ZERO for the obvious select-me-not reason).  This is done
	 * for a timeout (rv == 0) as well, since select() must then
	 * report empty sets.
	 */
	for (i = 0; i < nfds; i++) {
		if (readfds)
			FD_CLR(i, readfds);
		if (writefds)
			FD_CLR(i, writefds);
		if (exceptfds)
			FD_CLR(i, exceptfds);
	}

	/* and then plug in the results (nothing to plug in on timeout) */
	for (j = 0; rv > 0 && j < realnfds; j++) {
		if (readfds && (pfds[j].revents & POLLIN))
			FD_SET(pfds[j].fd, readfds);
		if (writefds && (pfds[j].revents & POLLOUT))
			FD_SET(pfds[j].fd, writefds);
		if (exceptfds && (pfds[j].revents & (POLLHUP|POLLERR)))
			FD_SET(pfds[j].fd, exceptfds);
	}

 out:
	free(pfds);
	return rv;
}
    770 
/*
 * Count how many entries of a poll vector refer to the host kernel
 * (*hostcall) and how many to the rump kernel (*rumpcall).  The
 * counters are incremented, not reset; the caller initializes them.
 *
 * Entries with fd == -1 are ignored by poll and belong to neither
 * kernel, so they are skipped.  Counting them would make a host-only
 * vector containing such an entry look like it spans both kernels.
 */
static void
checkpoll(struct pollfd *fds, nfds_t nfds, int *hostcall, int *rumpcall)
{
	nfds_t i;

	for (i = 0; i < nfds; i++) {
		if (fds[i].fd == -1)
			continue;

		if (fd_isrump(fds[i].fd))
			(*rumpcall)++;
		else
			(*hostcall)++;
	}
}
    783 
/*
 * Rewrite the descriptor of each of the first nfds entries of a poll
 * vector by running it through fdadj (e.g. fd_host2rump() before and
 * fd_rump2host() after a rump kernel poll).  Only the fd member is
 * modified.
 */
static void
adjustpoll(struct pollfd *fds, nfds_t nfds, int (*fdadj)(int))
{

	for (; nfds > 0; nfds--, fds++)
		fds->fd = fdadj(fds->fd);
}
    793 
/*
 * poll is easy as long as the call comes in the fds only in one
 * kernel.  otherwise it's quite tricky...
 *
 * struct pollarg carries the arguments for the host half of such a
 * split poll to the hostpoll() thread, and its error back.
 */
struct pollarg {
	struct pollfd *pfds;		/* host poll vector */
	nfds_t nfds;			/* number of entries in pfds */
	const struct timespec *ts;	/* timeout, passed to host_pollts() */
	const sigset_t *sigmask;	/* signal mask, passed to host_pollts() */
	int pipefd;			/* rump kernel fd written when done */
	int errnum;			/* errno if host_pollts() failed */
};
    806 
    807 static void *
    808 hostpoll(void *arg)
    809 {
    810 	struct pollarg *parg = arg;
    811 	intptr_t rv;
    812 
    813 	rv = host_pollts(parg->pfds, parg->nfds, parg->ts, parg->sigmask);
    814 	if (rv == -1)
    815 		parg->errnum = errno;
    816 	rump_sys_write(parg->pipefd, &rv, sizeof(rv));
    817 
    818 	return (void *)(intptr_t)rv;
    819 }
    820 
    821 int
    822 pollts(struct pollfd *fds, nfds_t nfds, const struct timespec *ts,
    823 	const sigset_t *sigmask)
    824 {
    825 	int (*op_pollts)(struct pollfd *, nfds_t, const struct timespec *,
    826 			 const sigset_t *);
    827 	int hostcall = 0, rumpcall = 0;
    828 	pthread_t pt;
    829 	nfds_t i;
    830 	int rv;
    831 
    832 	DPRINTF(("poll\n"));
    833 	checkpoll(fds, nfds, &hostcall, &rumpcall);
    834 
    835 	if (hostcall && rumpcall) {
    836 		struct pollfd *pfd_host = NULL, *pfd_rump = NULL;
    837 		int rpipe[2] = {-1,-1}, hpipe[2] = {-1,-1};
    838 		struct pollarg parg;
    839 		uintptr_t lrv;
    840 		int sverrno = 0, trv;
    841 
    842 		/*
    843 		 * ok, this is where it gets tricky.  We must support
    844 		 * this since it's a very common operation in certain
    845 		 * types of software (telnet, netcat, etc).  We allocate
    846 		 * two vectors and run two poll commands in separate
    847 		 * threads.  Whichever returns first "wins" and the
    848 		 * other kernel's fds won't show activity.
    849 		 */
    850 		rv = -1;
    851 
    852 		/* allocate full vector for O(n) joining after call */
    853 		pfd_host = malloc(sizeof(*pfd_host)*(nfds+1));
    854 		if (!pfd_host)
    855 			goto out;
    856 		pfd_rump = malloc(sizeof(*pfd_rump)*(nfds+1));
    857 		if (!pfd_rump) {
    858 			goto out;
    859 		}
    860 
    861 		/* split vectors */
    862 		for (i = 0; i < nfds; i++) {
    863 			if (fds[i].fd == -1) {
    864 				pfd_host[i].fd = -1;
    865 				pfd_rump[i].fd = -1;
    866 			} else if (fd_isrump(fds[i].fd)) {
    867 				pfd_host[i].fd = -1;
    868 				pfd_rump[i].fd = fd_host2rump(fds[i].fd);
    869 				pfd_rump[i].events = fds[i].events;
    870 			} else {
    871 				pfd_rump[i].fd = -1;
    872 				pfd_host[i].fd = fds[i].fd;
    873 				pfd_host[i].events = fds[i].events;
    874 			}
    875 		}
    876 
    877 		/*
    878 		 * then, open two pipes, one for notifications
    879 		 * to each kernel.
    880 		 */
    881 		if (rump_sys_pipe(rpipe) == -1)
    882 			goto out;
    883 		if (pipe(hpipe) == -1)
    884 			goto out;
    885 
    886 		pfd_host[nfds].fd = hpipe[0];
    887 		pfd_host[nfds].events = POLLIN;
    888 		pfd_rump[nfds].fd = rpipe[0];
    889 		pfd_rump[nfds].events = POLLIN;
    890 
    891 		/*
    892 		 * then, create a thread to do host part and meanwhile
    893 		 * do rump kernel part right here
    894 		 */
    895 
    896 		parg.pfds = pfd_host;
    897 		parg.nfds = nfds+1;
    898 		parg.ts = ts;
    899 		parg.sigmask = sigmask;
    900 		parg.pipefd = rpipe[1];
    901 		pthread_create(&pt, NULL, hostpoll, &parg);
    902 
    903 		op_pollts = rumpcalls[RUMPCALL_POLLTS];
    904 		lrv = op_pollts(pfd_rump, nfds+1, ts, NULL);
    905 		sverrno = errno;
    906 		write(hpipe[1], &rv, sizeof(rv));
    907 		pthread_join(pt, (void *)&trv);
    908 
    909 		/* check who "won" and merge results */
    910 		if (lrv != 0 && pfd_host[nfds].revents & POLLIN) {
    911 			rv = trv;
    912 
    913 			for (i = 0; i < nfds; i++) {
    914 				if (pfd_rump[i].fd != -1)
    915 					fds[i].revents = pfd_rump[i].revents;
    916 			}
    917 			sverrno = parg.errnum;
    918 		} else if (trv != 0 && pfd_rump[nfds].revents & POLLIN) {
    919 			rv = trv;
    920 
    921 			for (i = 0; i < nfds; i++) {
    922 				if (pfd_host[i].fd != -1)
    923 					fds[i].revents = pfd_host[i].revents;
    924 			}
    925 		} else {
    926 			rv = 0;
    927 		}
    928 
    929  out:
    930 		if (rpipe[0] != -1)
    931 			rump_sys_close(rpipe[0]);
    932 		if (rpipe[1] != -1)
    933 			rump_sys_close(rpipe[1]);
    934 		if (hpipe[0] != -1)
    935 			close(hpipe[0]);
    936 		if (hpipe[1] != -1)
    937 			close(hpipe[1]);
    938 		free(pfd_host);
    939 		free(pfd_rump);
    940 		errno = sverrno;
    941 	} else {
    942 		if (hostcall) {
    943 			op_pollts = host_pollts;
    944 		} else {
    945 			op_pollts = rumpcalls[RUMPCALL_POLLTS];
    946 			adjustpoll(fds, nfds, fd_host2rump);
    947 		}
    948 
    949 		rv = op_pollts(fds, nfds, ts, sigmask);
    950 		if (rumpcall)
    951 			adjustpoll(fds, nfds, fd_rump2host);
    952 	}
    953 
    954 	return rv;
    955 }
    956 
    957 int
    958 poll(struct pollfd *fds, nfds_t nfds, int timeout)
    959 {
    960 	struct timespec ts;
    961 	struct timespec *tsp = NULL;
    962 
    963 	if (timeout != INFTIM) {
    964 		ts.tv_sec = timeout / 1000;
    965 		ts.tv_nsec = (timeout % 1000) * 1000;
    966 
    967 		tsp = &ts;
    968 	}
    969 
    970 	return pollts(fds, nfds, tsp, NULL);
    971 }
    972