Home | History | Annotate | Line # | Download | only in librumphijack
hijack.c revision 1.10
      1 /*      $NetBSD: hijack.c,v 1.10 2011/01/18 11:04:10 pooka Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2011 Antti Kantee.  All Rights Reserved.
      5  *
      6  * Redistribution and use in source and binary forms, with or without
      7  * modification, are permitted provided that the following conditions
      8  * are met:
      9  * 1. Redistributions of source code must retain the above copyright
     10  *    notice, this list of conditions and the following disclaimer.
     11  * 2. Redistributions in binary form must reproduce the above copyright
     12  *    notice, this list of conditions and the following disclaimer in the
     13  *    documentation and/or other materials provided with the distribution.
     14  *
     15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
     16  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     17  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     18  * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
     19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
     21  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     25  * SUCH DAMAGE.
     26  */
     27 
     28 #include <sys/cdefs.h>
     29 __RCSID("$NetBSD: hijack.c,v 1.10 2011/01/18 11:04:10 pooka Exp $");
     30 
     31 #include <sys/param.h>
     32 #include <sys/types.h>
     33 #include <sys/event.h>
     34 #include <sys/ioctl.h>
     35 #include <sys/socket.h>
     36 #include <sys/poll.h>
     37 
     38 #include <rump/rumpclient.h>
     39 #include <rump/rump_syscalls.h>
     40 
     41 #include <assert.h>
     42 #include <dlfcn.h>
     43 #include <err.h>
     44 #include <errno.h>
     45 #include <fcntl.h>
     46 #include <poll.h>
     47 #include <pthread.h>
     48 #include <signal.h>
     49 #include <stdarg.h>
     50 #include <stdbool.h>
     51 #include <stdio.h>
     52 #include <stdlib.h>
     53 #include <time.h>
     54 #include <unistd.h>
     55 
/*
 * Indices into rumpcalls[] (and sysnames[]): one slot per rump kernel
 * system call that this library dispatches to.  RUMPCALL__NUM is the
 * number of slots and must stay last.
 */
enum {
	RUMPCALL_SOCKET,
	RUMPCALL_ACCEPT,
	RUMPCALL_BIND,
	RUMPCALL_CONNECT,
	RUMPCALL_GETPEERNAME,
	RUMPCALL_GETSOCKNAME,
	RUMPCALL_LISTEN,
	RUMPCALL_RECVFROM,
	RUMPCALL_RECVMSG,
	RUMPCALL_SENDTO,
	RUMPCALL_SENDMSG,
	RUMPCALL_GETSOCKOPT,
	RUMPCALL_SETSOCKOPT,
	RUMPCALL_SHUTDOWN,
	RUMPCALL_READ,
	RUMPCALL_READV,
	RUMPCALL_WRITE,
	RUMPCALL_WRITEV,
	RUMPCALL_IOCTL,
	RUMPCALL_FCNTL,
	RUMPCALL_CLOSE,
	RUMPCALL_POLLTS,
	RUMPCALL__NUM
};
     69 
/*
 * RSYS_NAME(X) expands to the string form of RUMP_SYS_RENAME_X, i.e. the
 * symbol name under which the rump client library exports syscall X
 * (the RUMP_SYS_RENAME_* macros are presumably provided by
 * <rump/rump_syscalls.h> -- not visible in this file).  The extra
 * RSYS_STRING level forces macro expansion before stringification.
 */
#define RSYS_STRING(a) __STRING(a)
#define RSYS_NAME(a) RSYS_STRING(__CONCAT(RUMP_SYS_RENAME_,a))

/*
 * Symbol names looked up with dlsym() in rcinit().  The order must match
 * the RUMPCALL_* enum exactly: sysnames[i] is resolved into rumpcalls[i].
 */
const char *sysnames[] = {
	RSYS_NAME(SOCKET),
	RSYS_NAME(ACCEPT),
	RSYS_NAME(BIND),
	RSYS_NAME(CONNECT),
	RSYS_NAME(GETPEERNAME),
	RSYS_NAME(GETSOCKNAME),
	RSYS_NAME(LISTEN),
	RSYS_NAME(RECVFROM),
	RSYS_NAME(RECVMSG),
	RSYS_NAME(SENDTO),
	RSYS_NAME(SENDMSG),
	RSYS_NAME(GETSOCKOPT),
	RSYS_NAME(SETSOCKOPT),
	RSYS_NAME(SHUTDOWN),
	RSYS_NAME(READ),
	RSYS_NAME(READV),
	RSYS_NAME(WRITE),
	RSYS_NAME(WRITEV),
	RSYS_NAME(IOCTL),
	RSYS_NAME(FCNTL),
	RSYS_NAME(CLOSE),
	RSYS_NAME(POLLTS),
};
     97 
/*
 * Pointers to the host implementations of the calls this library
 * overrides.  They are filled in by rcinit() with dlsym(RTLD_NEXT, ...)
 * so the wrappers below can fall through to the host for descriptors
 * that do not belong to the rump kernel.
 */
static int	(*host_socket)(int, int, int);
static int	(*host_connect)(int, const struct sockaddr *, socklen_t);
static int	(*host_bind)(int, const struct sockaddr *, socklen_t);
static int	(*host_listen)(int, int);
static int	(*host_accept)(int, struct sockaddr *, socklen_t *);
static int	(*host_getpeername)(int, struct sockaddr *, socklen_t *);
static int	(*host_getsockname)(int, struct sockaddr *, socklen_t *);
static int	(*host_setsockopt)(int, int, int, const void *, socklen_t);

static ssize_t	(*host_read)(int, void *, size_t);
static ssize_t	(*host_readv)(int, const struct iovec *, int);
static ssize_t	(*host_write)(int, const void *, size_t);
static ssize_t	(*host_writev)(int, const struct iovec *, int);
static int	(*host_ioctl)(int, unsigned long, ...);
static int	(*host_fcntl)(int, int, ...);
static int	(*host_close)(int);
static int	(*host_pollts)(struct pollfd *, nfds_t,
			       const struct timespec *, const sigset_t *);
static pid_t	(*host_fork)(void);
static int	(*host_dup2)(int, int);

/*
 * Rump kernel syscall entry points, indexed by RUMPCALL_*.  Stored as
 * untyped pointers; each wrapper assigns its slot to a correctly typed
 * function pointer before calling it.
 */
static void *rumpcalls[RUMPCALL__NUM];
    120 
/*
 * dlsym() trampoline handed to librumpclient by rcinit() (stored through
 * the "rumpclient_dlsym" pointer) for the LD_PRELOAD case.  Because the
 * lookup is issued from inside this library, a handle of RTLD_NEXT is
 * resolved relative to this object, which is what gives the "correct
 * RTLD_NEXT" the original note refers to.
 *
 * handle: dlopen() handle or a special handle such as RTLD_NEXT.
 * symbol: name of the symbol to resolve.
 * Returns whatever dlsym() returns (NULL when the symbol is not found).
 */
static void *
hijackdlsym(void *handle, const char *symbol)
{
	void *addr;

	addr = dlsym(handle, symbol);
	return addr;
}
    131 
/*
 * low calorie sockets?
 *
 * When true, __socket30() creates AF_LOCAL sockets in the host kernel
 * instead of the rump kernel.  Nothing in the visible code sets this to
 * true, so all sockets currently go to the rump kernel.
 */
static bool hostlocalsockets = false;
    134 
    135 static void __attribute__((constructor))
    136 rcinit(void)
    137 {
    138 	int (*rumpcinit)(void);
    139 	void **rumpcdlsym;
    140 	void *hand;
    141 	int i;
    142 
    143 	hand = dlopen("librumpclient.so", RTLD_LAZY|RTLD_GLOBAL);
    144 	if (!hand)
    145 		err(1, "cannot open librumpclient.so");
    146 	rumpcinit = dlsym(hand, "rumpclient_init");
    147 	_DIAGASSERT(rumpcinit);
    148 
    149 	rumpcdlsym = dlsym(hand, "rumpclient_dlsym");
    150 	*rumpcdlsym = hijackdlsym;
    151 
    152 	host_socket = dlsym(RTLD_NEXT, "__socket30");
    153 	host_listen = dlsym(RTLD_NEXT, "listen");
    154 	host_connect = dlsym(RTLD_NEXT, "connect");
    155 	host_bind = dlsym(RTLD_NEXT, "bind");
    156 	host_accept = dlsym(RTLD_NEXT, "accept");
    157 	host_getpeername = dlsym(RTLD_NEXT, "getpeername");
    158 	host_getsockname = dlsym(RTLD_NEXT, "getsockname");
    159 	host_setsockopt = dlsym(RTLD_NEXT, "setsockopt");
    160 
    161 	host_read = dlsym(RTLD_NEXT, "read");
    162 	host_readv = dlsym(RTLD_NEXT, "readv");
    163 	host_write = dlsym(RTLD_NEXT, "write");
    164 	host_writev = dlsym(RTLD_NEXT, "writev");
    165 	host_ioctl = dlsym(RTLD_NEXT, "ioctl");
    166 	host_fcntl = dlsym(RTLD_NEXT, "fcntl");
    167 	host_close = dlsym(RTLD_NEXT, "close");
    168 	host_pollts = dlsym(RTLD_NEXT, "pollts");
    169 	host_fork = dlsym(RTLD_NEXT, "fork");
    170 	host_dup2 = dlsym(RTLD_NEXT, "dup2");
    171 
    172 	for (i = 0; i < RUMPCALL__NUM; i++) {
    173 		rumpcalls[i] = dlsym(hand, sysnames[i]);
    174 		if (!rumpcalls[i]) {
    175 			fprintf(stderr, "rumphijack: cannot find symbol: %s\n",
    176 			    sysnames[i]);
    177 			exit(1);
    178 		}
    179 	}
    180 
    181 	if (rumpcinit() == -1)
    182 		err(1, "rumpclient init");
    183 }
    184 
/*
 * Bitmask of low-numbered descriptors that have been dup2()'d from a
 * rump kernel fd (bit n set => fd n refers to the rump kernel even
 * though it is below the hijack offset).  Maintained by dup2().
 */
static unsigned dup2mask;

/* number of descriptors that dup2mask is able to track */
#define DUP2MASK_BITS ((int)(sizeof(dup2mask) * 8))

/*
 * True if fd is a dup2()'d rump kernel descriptor.  The range check is
 * required: shifting by a negative count or by >= the width of the type
 * is undefined, and on hardware that wraps the shift count it made host
 * descriptors such as 32 or 64 alias bit 0 of the mask.
 * Note: evaluates its argument more than once.
 */
#define ISDUP2D(fd) \
	((fd) >= 0 && (fd) < DUP2MASK_BITS && ((1U << (fd)) & dup2mask) != 0)
    187 
/*
 * Debug tracing.  Enable by defining DEBUGJACK (uncomment the line
 * below).  DPRINTF takes a parenthesized printf-style argument list,
 * e.g. DPRINTF(("fd %d\n", fd)), and expands to nothing when tracing
 * is disabled.
 */
//#define DEBUGJACK
#ifdef DEBUGJACK
#define DPRINTF(x) mydprintf x
/*
 * printf-style trace output to stderr.  Output is suppressed once
 * stderr has been dup2()'d onto a rump kernel descriptor.
 */
static void
mydprintf(const char *fmt, ...)
{
	va_list ap;

	/* fd 2 now belongs to the rump kernel: do not write traces to it */
	if (ISDUP2D(STDERR_FILENO))
		return;

	va_start(ap, fmt);
	vfprintf(stderr, fmt, ap);
	va_end(ap);
}

#else
#define DPRINTF(x)
#endif
    207 
/*
 * XXX: need runtime selection.  low for now due to FD_SETSIZE
 *
 * HIJACK_FDOFF is the offset added to a rump kernel descriptor to form
 * the descriptor number handed to the application; application
 * descriptors at or above it are treated as rump kernel descriptors.
 * HIJACK_ASSERT is the threshold used by assertfd().  HIJACK_SELECT is
 * not referenced in the visible part of this file.
 */
#define HIJACK_FDOFF 128
#define HIJACK_SELECT 128 /* XXX */
#define HIJACK_ASSERT 128 /* XXX */
    212 static int
    213 fd_rump2host(int fd)
    214 {
    215 
    216 	if (fd == -1)
    217 		return fd;
    218 
    219 	if (!ISDUP2D(fd))
    220 		fd += HIJACK_FDOFF;
    221 
    222 	return fd;
    223 }
    224 
    225 static int
    226 fd_host2rump(int fd)
    227 {
    228 
    229 	if (!ISDUP2D(fd))
    230 		fd -= HIJACK_FDOFF;
    231 	return fd;
    232 }
    233 
    234 static bool
    235 fd_isrump(int fd)
    236 {
    237 
    238 	return ISDUP2D(fd) || fd >= HIJACK_FDOFF;
    239 }
    240 
/*
 * Assert that _fd_ is a rump kernel descriptor.  Used by the wrappers
 * that have no host fallback.  Compiled out when NDEBUG is defined
 * (standard assert() behaviour).
 */
#define assertfd(_fd_) assert(ISDUP2D(_fd_) || (_fd_) >= HIJACK_ASSERT)
/* the fd_* helpers above are the only intended users of the raw offset */
#undef HIJACK_FDOFF
    243 
    244 int __socket30(int, int, int);
    245 int
    246 __socket30(int domain, int type, int protocol)
    247 {
    248 	int (*rc_socket)(int, int, int);
    249 	int fd;
    250 	bool dohost;
    251 
    252 	dohost = hostlocalsockets && (domain == AF_LOCAL);
    253 
    254 	if (dohost)
    255 		rc_socket = host_socket;
    256 	else
    257 		rc_socket = rumpcalls[RUMPCALL_SOCKET];
    258 	fd = rc_socket(domain, type, protocol);
    259 
    260 	if (!dohost)
    261 		fd = fd_rump2host(fd);
    262 	DPRINTF(("socket <- %d\n", fd));
    263 
    264 	return fd;
    265 }
    266 
    267 int
    268 accept(int s, struct sockaddr *addr, socklen_t *addrlen)
    269 {
    270 	int (*rc_accept)(int, struct sockaddr *, socklen_t *);
    271 	int fd;
    272 	bool isrump;
    273 
    274 	isrump = fd_isrump(s);
    275 
    276 	DPRINTF(("accept -> %d", s));
    277 	if (isrump) {
    278 		rc_accept = rumpcalls[RUMPCALL_ACCEPT];
    279 		s = fd_host2rump(s);
    280 	} else {
    281 		rc_accept = host_accept;
    282 	}
    283 	fd = rc_accept(s, addr, addrlen);
    284 	if (fd != -1 && isrump)
    285 		fd = fd_rump2host(fd);
    286 
    287 	DPRINTF((" <- %d\n", fd));
    288 
    289 	return fd;
    290 }
    291 
    292 int
    293 bind(int s, const struct sockaddr *name, socklen_t namelen)
    294 {
    295 	int (*rc_bind)(int, const struct sockaddr *, socklen_t);
    296 
    297 	DPRINTF(("bind -> %d\n", s));
    298 	if (fd_isrump(s)) {
    299 		rc_bind = rumpcalls[RUMPCALL_BIND];
    300 		s = fd_host2rump(s);
    301 	} else {
    302 		rc_bind = host_bind;
    303 	}
    304 	return rc_bind(s, name, namelen);
    305 }
    306 
    307 int
    308 connect(int s, const struct sockaddr *name, socklen_t namelen)
    309 {
    310 	int (*rc_connect)(int, const struct sockaddr *, socklen_t);
    311 
    312 	DPRINTF(("connect -> %d\n", s));
    313 	if (fd_isrump(s)) {
    314 		rc_connect = rumpcalls[RUMPCALL_CONNECT];
    315 		s = fd_host2rump(s);
    316 	} else {
    317 		rc_connect = host_connect;
    318 	}
    319 
    320 	return rc_connect(s, name, namelen);
    321 }
    322 
    323 int
    324 getpeername(int s, struct sockaddr *name, socklen_t *namelen)
    325 {
    326 	int (*rc_getpeername)(int, struct sockaddr *, socklen_t *);
    327 
    328 	DPRINTF(("getpeername -> %d\n", s));
    329 	if (fd_isrump(s)) {
    330 		rc_getpeername = rumpcalls[RUMPCALL_GETPEERNAME];
    331 		s = fd_host2rump(s);
    332 	} else {
    333 		rc_getpeername = host_getpeername;
    334 	}
    335 	return rc_getpeername(s, name, namelen);
    336 }
    337 
    338 int
    339 getsockname(int s, struct sockaddr *name, socklen_t *namelen)
    340 {
    341 	int (*rc_getsockname)(int, struct sockaddr *, socklen_t *);
    342 
    343 	DPRINTF(("getsockname -> %d\n", s));
    344 	if (fd_isrump(s)) {
    345 		rc_getsockname = rumpcalls[RUMPCALL_GETSOCKNAME];
    346 		s = fd_host2rump(s);
    347 	} else {
    348 		rc_getsockname = host_getsockname;
    349 	}
    350 	return rc_getsockname(s, name, namelen);
    351 }
    352 
    353 int
    354 listen(int s, int backlog)
    355 {
    356 	int (*rc_listen)(int, int);
    357 
    358 	DPRINTF(("listen -> %d\n", s));
    359 	if (fd_isrump(s)) {
    360 		rc_listen = rumpcalls[RUMPCALL_LISTEN];
    361 		s = fd_host2rump(s);
    362 	} else {
    363 		rc_listen = host_listen;
    364 	}
    365 	return rc_listen(s, backlog);
    366 }
    367 
/*
 * recv(2) replacement, expressed as recvfrom() without a source address
 * so that all dispatch logic lives in one place.
 */
ssize_t
recv(int s, void *buf, size_t len, int flags)
{
	struct sockaddr *nofrom = NULL;
	socklen_t *nofromlen = NULL;

	return recvfrom(s, buf, len, flags, nofrom, nofromlen);
}
    374 
    375 ssize_t
    376 recvfrom(int s, void *buf, size_t len, int flags, struct sockaddr *from,
    377 	socklen_t *fromlen)
    378 {
    379 	int (*rc_recvfrom)(int, void *, size_t, int,
    380 	    struct sockaddr *, socklen_t *);
    381 
    382 	DPRINTF(("recvfrom\n"));
    383 	assertfd(s);
    384 	rc_recvfrom = rumpcalls[RUMPCALL_RECVFROM];
    385 	return rc_recvfrom(fd_host2rump(s), buf, len, flags, from, fromlen);
    386 }
    387 
    388 ssize_t
    389 recvmsg(int s, struct msghdr *msg, int flags)
    390 {
    391 	int (*rc_recvmsg)(int, struct msghdr *, int);
    392 
    393 	DPRINTF(("recvmsg\n"));
    394 	assertfd(s);
    395 	rc_recvmsg = rumpcalls[RUMPCALL_RECVMSG];
    396 	return rc_recvmsg(fd_host2rump(s), msg, flags);
    397 }
    398 
/*
 * send(2) replacement, expressed as sendto() without a destination
 * address so that all dispatch logic lives in one place.
 */
ssize_t
send(int s, const void *buf, size_t len, int flags)
{
	const struct sockaddr *noaddr = NULL;

	return sendto(s, buf, len, flags, noaddr, 0);
}
    405 
    406 ssize_t
    407 sendto(int s, const void *buf, size_t len, int flags,
    408 	const struct sockaddr *to, socklen_t tolen)
    409 {
    410 	int (*rc_sendto)(int, const void *, size_t, int,
    411 	    const struct sockaddr *, socklen_t);
    412 
    413 	if (s == -1)
    414 		return len;
    415 
    416 	DPRINTF(("sendto\n"));
    417 	assertfd(s);
    418 	rc_sendto = rumpcalls[RUMPCALL_SENDTO];
    419 	return rc_sendto(fd_host2rump(s), buf, len, flags, to, tolen);
    420 }
    421 
    422 ssize_t
    423 sendmsg(int s, const struct msghdr *msg, int flags)
    424 {
    425 	int (*rc_sendmsg)(int, const struct msghdr *, int);
    426 
    427 	DPRINTF(("sendmsg\n"));
    428 	assertfd(s);
    429 	rc_sendmsg = rumpcalls[RUMPCALL_SENDTO];
    430 	return rc_sendmsg(fd_host2rump(s), msg, flags);
    431 }
    432 
    433 int
    434 getsockopt(int s, int level, int optname, void *optval, socklen_t *optlen)
    435 {
    436 	int (*rc_getsockopt)(int, int, int, void *, socklen_t *);
    437 
    438 	DPRINTF(("getsockopt -> %d\n", s));
    439 	assertfd(s);
    440 	rc_getsockopt = rumpcalls[RUMPCALL_GETSOCKOPT];
    441 	return rc_getsockopt(fd_host2rump(s), level, optname, optval, optlen);
    442 }
    443 
    444 int
    445 setsockopt(int s, int level, int optname, const void *optval, socklen_t optlen)
    446 {
    447 	int (*rc_setsockopt)(int, int, int, const void *, socklen_t);
    448 
    449 	DPRINTF(("setsockopt -> %d\n", s));
    450 	if (fd_isrump(s)) {
    451 		rc_setsockopt = rumpcalls[RUMPCALL_SETSOCKOPT];
    452 		s = fd_host2rump(s);
    453 	} else {
    454 		rc_setsockopt = host_setsockopt;
    455 	}
    456 	return rc_setsockopt(s, level, optname, optval, optlen);
    457 }
    458 
    459 int
    460 shutdown(int s, int how)
    461 {
    462 	int (*rc_shutdown)(int, int);
    463 
    464 	DPRINTF(("shutdown -> %d\n", s));
    465 	assertfd(s);
    466 	rc_shutdown = rumpcalls[RUMPCALL_SHUTDOWN];
    467 	return rc_shutdown(fd_host2rump(s), how);
    468 }
    469 
    470 /*
    471  * dup2 is special.  we allow dup2 of a rump kernel fd to 0-2 since
    472  * many programs do that.  dup2 of a rump kernel fd to another value
    473  * not >= fdoff is an error.
    474  *
    475  * Note: cannot rump2host newd, because it is often hardcoded.
    476  *
    477  * XXX: should disable debug prints after stdout/stderr are dup2'd
    478  */
    479 int
    480 dup2(int oldd, int newd)
    481 {
    482 	int rv;
    483 
    484 	DPRINTF(("dup2 -> %d (o) -> %d (n)\n", oldd, newd));
    485 
    486 	if (fd_isrump(oldd)) {
    487 		if (!(newd >= 0 && newd <= 2))
    488 			return EBADF;
    489 		oldd = fd_host2rump(oldd);
    490 		rv = rump_sys_dup2(oldd, newd);
    491 		if (rv != -1)
    492 			dup2mask |= 1<<newd;
    493 	} else {
    494 		rv = host_dup2(oldd, newd);
    495 	}
    496 
    497 	return rv;
    498 }
    499 
/*
 * fork(2) replacement.  The host fork is bracketed by the rump client
 * calls rumpclient_prefork() and rumpclient_fork_init(), so that the
 * child keeps the parent's rump kernel file descriptors without both
 * processes using the same connection fd.
 *
 * Returns what the host fork returns, except that -1 is returned when
 * rumpclient_prefork() fails (before forking) and, in the child, when
 * rumpclient_fork_init() fails.
 */

pid_t
fork()
{
	struct rumpclient_fork *rf;
	pid_t pid;

	DPRINTF(("fork\n"));

	rf = rumpclient_prefork();
	if (rf == NULL)
		return -1;

	pid = host_fork();
	if (pid == 0) {
		/* child: set up the rump client state for this process */
		if (rumpclient_fork_init(rf) == -1)
			pid = -1;
	}
	/* XXX: on host fork failure (-1) rf should be cancelled */

	DPRINTF(("fork returns %d\n", pid));
	return pid;
}
    532 
    533 /*
    534  * Hybrids
    535  */
    536 
    537 ssize_t
    538 read(int fd, void *buf, size_t len)
    539 {
    540 	int (*op_read)(int, void *, size_t);
    541 	ssize_t n;
    542 
    543 	DPRINTF(("read %d\n", fd));
    544 	if (fd_isrump(fd)) {
    545 		fd = fd_host2rump(fd);
    546 		op_read = rumpcalls[RUMPCALL_READ];
    547 	} else {
    548 		op_read = host_read;
    549 	}
    550 
    551 	n = op_read(fd, buf, len);
    552 	return n;
    553 }
    554 
    555 ssize_t
    556 readv(int fd, const struct iovec *iov, int iovcnt)
    557 {
    558 	int (*op_readv)(int, const struct iovec *, int);
    559 
    560 	DPRINTF(("readv %d\n", fd));
    561 	if (fd_isrump(fd)) {
    562 		fd = fd_host2rump(fd);
    563 		op_readv = rumpcalls[RUMPCALL_READV];
    564 	} else {
    565 		op_readv = host_readv;
    566 	}
    567 
    568 	return op_readv(fd, iov, iovcnt);
    569 }
    570 
    571 ssize_t
    572 write(int fd, const void *buf, size_t len)
    573 {
    574 	int (*op_write)(int, const void *, size_t);
    575 
    576 	if (fd_isrump(fd)) {
    577 		fd = fd_host2rump(fd);
    578 		op_write = rumpcalls[RUMPCALL_WRITE];
    579 	} else {
    580 		op_write = host_write;
    581 	}
    582 
    583 	return op_write(fd, buf, len);
    584 }
    585 
    586 ssize_t
    587 writev(int fd, const struct iovec *iov, int iovcnt)
    588 {
    589 	int (*op_writev)(int, const struct iovec *, int);
    590 
    591 	DPRINTF(("writev %d\n", fd));
    592 	if (fd_isrump(fd)) {
    593 		fd = fd_host2rump(fd);
    594 		op_writev = rumpcalls[RUMPCALL_WRITEV];
    595 	} else {
    596 		op_writev = host_writev;
    597 	}
    598 
    599 	return op_writev(fd, iov, iovcnt);
    600 }
    601 
    602 int
    603 ioctl(int fd, unsigned long cmd, ...)
    604 {
    605 	int (*op_ioctl)(int, unsigned long cmd, ...);
    606 	va_list ap;
    607 	int rv;
    608 
    609 	DPRINTF(("ioctl\n"));
    610 	if (fd_isrump(fd)) {
    611 		fd = fd_host2rump(fd);
    612 		op_ioctl = rumpcalls[RUMPCALL_IOCTL];
    613 	} else {
    614 		op_ioctl = host_ioctl;
    615 	}
    616 
    617 	va_start(ap, cmd);
    618 	rv = op_ioctl(fd, cmd, va_arg(ap, void *));
    619 	va_end(ap);
    620 	return rv;
    621 }
    622 
    623 int
    624 fcntl(int fd, int cmd, ...)
    625 {
    626 	int (*op_fcntl)(int, int, ...);
    627 	va_list ap;
    628 	int rv;
    629 
    630 	DPRINTF(("fcntl\n"));
    631 	if (fd_isrump(fd)) {
    632 		fd = fd_host2rump(fd);
    633 		op_fcntl = rumpcalls[RUMPCALL_FCNTL];
    634 	} else {
    635 		op_fcntl = host_fcntl;
    636 	}
    637 
    638 	va_start(ap, cmd);
    639 	rv = op_fcntl(fd, cmd, va_arg(ap, void *));
    640 	va_end(ap);
    641 	return rv;
    642 }
    643 
    644 int
    645 close(int fd)
    646 {
    647 	int (*op_close)(int);
    648 
    649 	DPRINTF(("close %d\n", fd));
    650 	if (fd_isrump(fd)) {
    651 		fd = fd_host2rump(fd);
    652 		op_close = rumpcalls[RUMPCALL_CLOSE];
    653 	} else {
    654 		op_close = host_close;
    655 	}
    656 
    657 	return op_close(fd);
    658 }
    659 
/*
 * select(2) replacement, implemented on top of pollts() so that host
 * and rump kernel descriptors can be mixed in one call.
 *
 * readfds maps to POLLIN, writefds to POLLOUT and exceptfds to
 * POLLHUP|POLLERR.  On success and on timeout the sets are rewritten to
 * contain only the descriptors that reported the corresponding event;
 * on error they are left untouched.
 *
 * Returns the value reported by pollts() (number of descriptors with
 * events, 0 on timeout), or -1 with errno set on failure.
 */
int
select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds,
	struct timeval *timeout)
{
	struct pollfd *pfds = NULL;
	struct timespec ts, *tsp = NULL;
	nfds_t realnfds = 0, j;
	int fd, rv;

	DPRINTF(("select\n"));

	if (nfds < 0) {
		errno = EINVAL;
		return -1;
	}

	/*
	 * Well, first we must scan the fds to figure out how many
	 * fds there really are.  This is because up to and including
	 * nb5 poll() silently refuses nfds > process_open_fds.
	 */
	for (fd = 0; fd < nfds; fd++) {
		if ((readfds && FD_ISSET(fd, readfds)) ||
		    (writefds && FD_ISSET(fd, writefds)) ||
		    (exceptfds && FD_ISSET(fd, exceptfds)))
			realnfds++;
	}

	if (realnfds) {
		/* calloc: zeroed entries and overflow-checked size */
		pfds = calloc(realnfds, sizeof(*pfds));
		if (!pfds)
			return -1;
	}

	/*
	 * Build the poll vector.  An entry is touched only when the
	 * descriptor is in at least one set, so nothing is written when
	 * pfds is NULL or past the last counted entry.
	 */
	for (fd = 0, j = 0; fd < nfds; fd++) {
		short events = 0;

		if (readfds && FD_ISSET(fd, readfds))
			events |= POLLIN;
		if (writefds && FD_ISSET(fd, writefds))
			events |= POLLOUT;
		if (exceptfds && FD_ISSET(fd, exceptfds))
			events |= POLLHUP|POLLERR;
		if (events == 0)
			continue;

		pfds[j].fd = fd;
		pfds[j].events = events;
		pfds[j].revents = 0;
		j++;
	}

	if (timeout) {
		TIMEVAL_TO_TIMESPEC(timeout, &ts);
		tsp = &ts;
	}
	rv = pollts(pfds, realnfds, tsp, NULL);
	if (rv < 0)
		goto out;

	/*
	 * ok, harvest results.  first zero out entries (can't use
	 * FD_ZERO for the obvious select-me-not reason).  This is done
	 * for a timeout (rv == 0) as well, so the caller does not see
	 * its own request bits as "ready".
	 */
	for (fd = 0; fd < nfds; fd++) {
		if (readfds)
			FD_CLR(fd, readfds);
		if (writefds)
			FD_CLR(fd, writefds);
		if (exceptfds)
			FD_CLR(fd, exceptfds);
	}

	/* and then plug in the results */
	for (j = 0; j < realnfds; j++) {
		if (readfds && (pfds[j].revents & POLLIN))
			FD_SET(pfds[j].fd, readfds);
		if (writefds && (pfds[j].revents & POLLOUT))
			FD_SET(pfds[j].fd, writefds);
		if (exceptfds && (pfds[j].revents & (POLLHUP|POLLERR)))
			FD_SET(pfds[j].fd, exceptfds);
	}

 out:
	free(pfds);
	return rv;
}
    768 
/*
 * Classify the descriptors of a poll vector.  For every entry either
 * *rumpcall or *hostcall is incremented, depending on fd_isrump().
 * The counters are not reset here; the caller initializes them.
 */
static void
checkpoll(struct pollfd *fds, nfds_t nfds, int *hostcall, int *rumpcall)
{
	nfds_t idx;
	int *counter;

	for (idx = 0; idx < nfds; idx++) {
		if (fd_isrump(fds[idx].fd))
			counter = rumpcall;
		else
			counter = hostcall;
		*counter += 1;
	}
}
    781 
/*
 * Rewrite the descriptor of every entry in a poll vector in place by
 * passing it through fdadj (in this file: fd_host2rump or fd_rump2host).
 * events and revents are left untouched.
 */
static void
adjustpoll(struct pollfd *fds, nfds_t nfds, int (*fdadj)(int))
{
	struct pollfd *cur = fds;
	nfds_t left = nfds;

	while (left > 0) {
		cur->fd = fdadj(cur->fd);
		cur++;
		left--;
	}
}
    791 
/*
 * poll is easy as long as all the fds in a call belong to only one
 * kernel.  Otherwise it's quite tricky: the host part of the poll is
 * run in a helper thread (see hostpoll()) and this structure carries
 * its arguments and its error result.
 */
struct pollarg {
	struct pollfd *pfds;		/* poll vector for the host kernel */
	nfds_t nfds;			/* number of entries in pfds */
	const struct timespec *ts;	/* timeout, NULL to block */
	const sigset_t *sigmask;	/* signal mask passed to host pollts */
	int pipefd;			/* rump kernel pipe fd written to when
					 * the host poll returns */
	int errnum;			/* errno of the host poll; set only
					 * when it returned -1 */
};
    804 
    805 static void *
    806 hostpoll(void *arg)
    807 {
    808 	struct pollarg *parg = arg;
    809 	intptr_t rv;
    810 
    811 	rv = host_pollts(parg->pfds, parg->nfds, parg->ts, parg->sigmask);
    812 	if (rv == -1)
    813 		parg->errnum = errno;
    814 	rump_sys_write(parg->pipefd, &rv, sizeof(rv));
    815 
    816 	return (void *)(intptr_t)rv;
    817 }
    818 
    819 int
    820 pollts(struct pollfd *fds, nfds_t nfds, const struct timespec *ts,
    821 	const sigset_t *sigmask)
    822 {
    823 	int (*op_pollts)(struct pollfd *, nfds_t, const struct timespec *,
    824 			 const sigset_t *);
    825 	int hostcall = 0, rumpcall = 0;
    826 	pthread_t pt;
    827 	nfds_t i;
    828 	int rv;
    829 
    830 	DPRINTF(("poll\n"));
    831 	checkpoll(fds, nfds, &hostcall, &rumpcall);
    832 
    833 	if (hostcall && rumpcall) {
    834 		struct pollfd *pfd_host = NULL, *pfd_rump = NULL;
    835 		int rpipe[2] = {-1,-1}, hpipe[2] = {-1,-1};
    836 		struct pollarg parg;
    837 		uintptr_t lrv;
    838 		int sverrno = 0, trv;
    839 
    840 		/*
    841 		 * ok, this is where it gets tricky.  We must support
    842 		 * this since it's a very common operation in certain
    843 		 * types of software (telnet, netcat, etc).  We allocate
    844 		 * two vectors and run two poll commands in separate
    845 		 * threads.  Whichever returns first "wins" and the
    846 		 * other kernel's fds won't show activity.
    847 		 */
    848 		rv = -1;
    849 
    850 		/* allocate full vector for O(n) joining after call */
    851 		pfd_host = malloc(sizeof(*pfd_host)*(nfds+1));
    852 		if (!pfd_host)
    853 			goto out;
    854 		pfd_rump = malloc(sizeof(*pfd_rump)*(nfds+1));
    855 		if (!pfd_rump) {
    856 			goto out;
    857 		}
    858 
    859 		/* split vectors */
    860 		for (i = 0; i < nfds; i++) {
    861 			if (fds[i].fd == -1) {
    862 				pfd_host[i].fd = -1;
    863 				pfd_rump[i].fd = -1;
    864 			} else if (fd_isrump(fds[i].fd)) {
    865 				pfd_host[i].fd = -1;
    866 				pfd_rump[i].fd = fd_host2rump(fds[i].fd);
    867 				pfd_rump[i].events = fds[i].events;
    868 			} else {
    869 				pfd_rump[i].fd = -1;
    870 				pfd_host[i].fd = fds[i].fd;
    871 				pfd_host[i].events = fds[i].events;
    872 			}
    873 		}
    874 
    875 		/*
    876 		 * then, open two pipes, one for notifications
    877 		 * to each kernel.
    878 		 */
    879 		if (rump_sys_pipe(rpipe) == -1)
    880 			goto out;
    881 		if (pipe(hpipe) == -1)
    882 			goto out;
    883 
    884 		pfd_host[nfds].fd = hpipe[0];
    885 		pfd_host[nfds].events = POLLIN;
    886 		pfd_rump[nfds].fd = rpipe[0];
    887 		pfd_rump[nfds].events = POLLIN;
    888 
    889 		/*
    890 		 * then, create a thread to do host part and meanwhile
    891 		 * do rump kernel part right here
    892 		 */
    893 
    894 		parg.pfds = pfd_host;
    895 		parg.nfds = nfds+1;
    896 		parg.ts = ts;
    897 		parg.sigmask = sigmask;
    898 		parg.pipefd = rpipe[1];
    899 		pthread_create(&pt, NULL, hostpoll, &parg);
    900 
    901 		op_pollts = rumpcalls[RUMPCALL_POLLTS];
    902 		lrv = op_pollts(pfd_rump, nfds+1, ts, NULL);
    903 		sverrno = errno;
    904 		write(hpipe[1], &rv, sizeof(rv));
    905 		pthread_join(pt, (void *)&trv);
    906 
    907 		/* check who "won" and merge results */
    908 		if (lrv != 0 && pfd_host[nfds].revents & POLLIN) {
    909 			rv = trv;
    910 
    911 			for (i = 0; i < nfds; i++) {
    912 				if (pfd_rump[i].fd != -1)
    913 					fds[i].revents = pfd_rump[i].revents;
    914 			}
    915 			sverrno = parg.errnum;
    916 		} else if (trv != 0 && pfd_rump[nfds].revents & POLLIN) {
    917 			rv = trv;
    918 
    919 			for (i = 0; i < nfds; i++) {
    920 				if (pfd_host[i].fd != -1)
    921 					fds[i].revents = pfd_host[i].revents;
    922 			}
    923 		} else {
    924 			rv = 0;
    925 		}
    926 
    927  out:
    928 		if (rpipe[0] != -1)
    929 			rump_sys_close(rpipe[0]);
    930 		if (rpipe[1] != -1)
    931 			rump_sys_close(rpipe[1]);
    932 		if (hpipe[0] != -1)
    933 			host_close(hpipe[0]);
    934 		if (hpipe[1] != -1)
    935 			host_close(hpipe[1]);
    936 		free(pfd_host);
    937 		free(pfd_rump);
    938 		errno = sverrno;
    939 	} else {
    940 		if (hostcall) {
    941 			op_pollts = host_pollts;
    942 		} else {
    943 			op_pollts = rumpcalls[RUMPCALL_POLLTS];
    944 			adjustpoll(fds, nfds, fd_host2rump);
    945 		}
    946 
    947 		rv = op_pollts(fds, nfds, ts, sigmask);
    948 		if (rumpcall)
    949 			adjustpoll(fds, nfds, fd_rump2host);
    950 	}
    951 
    952 	return rv;
    953 }
    954 
    955 int
    956 poll(struct pollfd *fds, nfds_t nfds, int timeout)
    957 {
    958 	struct timespec ts;
    959 	struct timespec *tsp = NULL;
    960 
    961 	if (timeout != INFTIM) {
    962 		ts.tv_sec = timeout / 1000;
    963 		ts.tv_nsec = (timeout % 1000) * 1000;
    964 
    965 		tsp = &ts;
    966 	}
    967 
    968 	return pollts(fds, nfds, tsp, NULL);
    969 }
    970 
/*
 * kqueue(2) is not supported by this library.  The call is intercepted
 * and terminates the process with abort(); it never returns.
 */
int
kqueue(void)
{
	abort();
}
    977 
/*
 * kevent(2) is not supported by this library.  The call is intercepted
 * and terminates the process with abort(); it never returns and none of
 * the arguments are used.
 */
int
kevent(int kq, const struct kevent *changelist, size_t nchanges,
	struct kevent *eventlist, size_t nevents,
	const struct timespec *timeout)
{
	(void)kq;
	(void)changelist;
	(void)nchanges;
	(void)eventlist;
	(void)nevents;
	(void)timeout;

	abort();
}
    986