Home | History | Annotate | Line # | Download | only in librumphijack
hijack.c revision 1.16
      1 /*      $NetBSD: hijack.c,v 1.16 2011/01/19 11:27:01 pooka Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2011 Antti Kantee.  All Rights Reserved.
      5  *
      6  * Redistribution and use in source and binary forms, with or without
      7  * modification, are permitted provided that the following conditions
      8  * are met:
      9  * 1. Redistributions of source code must retain the above copyright
     10  *    notice, this list of conditions and the following disclaimer.
     11  * 2. Redistributions in binary form must reproduce the above copyright
     12  *    notice, this list of conditions and the following disclaimer in the
     13  *    documentation and/or other materials provided with the distribution.
     14  *
     15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
     16  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     17  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     18  * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
     19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
     21  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     25  * SUCH DAMAGE.
     26  */
     27 
     28 #include <sys/cdefs.h>
     29 __RCSID("$NetBSD: hijack.c,v 1.16 2011/01/19 11:27:01 pooka Exp $");
     30 
     31 #include <sys/param.h>
     32 #include <sys/types.h>
     33 #include <sys/event.h>
     34 #include <sys/ioctl.h>
     35 #include <sys/socket.h>
     36 #include <sys/poll.h>
     37 
     38 #include <rump/rumpclient.h>
     39 #include <rump/rump_syscalls.h>
     40 
     41 #include <assert.h>
     42 #include <dlfcn.h>
     43 #include <err.h>
     44 #include <errno.h>
     45 #include <fcntl.h>
     46 #include <poll.h>
     47 #include <pthread.h>
     48 #include <signal.h>
     49 #include <stdarg.h>
     50 #include <stdbool.h>
     51 #include <stdio.h>
     52 #include <stdlib.h>
     53 #include <time.h>
     54 #include <unistd.h>
     55 
/*
 * Indices into rumpcalls[] / sysnames[], one per hijacked rump kernel
 * system call.  The order here must match the order of sysnames[].
 */
enum {
	/* socket calls */
	RUMPCALL_SOCKET,
	RUMPCALL_ACCEPT,
	RUMPCALL_BIND,
	RUMPCALL_CONNECT,
	RUMPCALL_GETPEERNAME,
	RUMPCALL_GETSOCKNAME,
	RUMPCALL_LISTEN,
	RUMPCALL_RECVFROM,
	RUMPCALL_RECVMSG,
	RUMPCALL_SENDTO,
	RUMPCALL_SENDMSG,
	RUMPCALL_GETSOCKOPT,
	RUMPCALL_SETSOCKOPT,
	RUMPCALL_SHUTDOWN,

	/* generic descriptor calls */
	RUMPCALL_READ,
	RUMPCALL_READV,
	RUMPCALL_WRITE,
	RUMPCALL_WRITEV,
	RUMPCALL_IOCTL,
	RUMPCALL_FCNTL,
	RUMPCALL_CLOSE,
	RUMPCALL_POLLTS,

	/* number of entries; keep last */
	RUMPCALL__NUM
};
     69 
     70 #define RSYS_STRING(a) __STRING(a)
     71 #define RSYS_NAME(a) RSYS_STRING(__CONCAT(RUMP_SYS_RENAME_,a))
     72 
     73 const char *sysnames[] = {
     74 	RSYS_NAME(SOCKET),
     75 	RSYS_NAME(ACCEPT),
     76 	RSYS_NAME(BIND),
     77 	RSYS_NAME(CONNECT),
     78 	RSYS_NAME(GETPEERNAME),
     79 	RSYS_NAME(GETSOCKNAME),
     80 	RSYS_NAME(LISTEN),
     81 	RSYS_NAME(RECVFROM),
     82 	RSYS_NAME(RECVMSG),
     83 	RSYS_NAME(SENDTO),
     84 	RSYS_NAME(SENDMSG),
     85 	RSYS_NAME(GETSOCKOPT),
     86 	RSYS_NAME(SETSOCKOPT),
     87 	RSYS_NAME(SHUTDOWN),
     88 	RSYS_NAME(READ),
     89 	RSYS_NAME(READV),
     90 	RSYS_NAME(WRITE),
     91 	RSYS_NAME(WRITEV),
     92 	RSYS_NAME(IOCTL),
     93 	RSYS_NAME(FCNTL),
     94 	RSYS_NAME(CLOSE),
     95 	RSYS_NAME(POLLTS),
     96 };
     97 
     98 static int	(*host_socket)(int, int, int);
     99 static int	(*host_connect)(int, const struct sockaddr *, socklen_t);
    100 static int	(*host_bind)(int, const struct sockaddr *, socklen_t);
    101 static int	(*host_listen)(int, int);
    102 static int	(*host_accept)(int, struct sockaddr *, socklen_t *);
    103 static int	(*host_getpeername)(int, struct sockaddr *, socklen_t *);
    104 static int	(*host_getsockname)(int, struct sockaddr *, socklen_t *);
    105 static int	(*host_setsockopt)(int, int, int, const void *, socklen_t);
    106 
    107 static ssize_t	(*host_read)(int, void *, size_t);
    108 static ssize_t	(*host_readv)(int, const struct iovec *, int);
    109 static ssize_t	(*host_write)(int, const void *, size_t);
    110 static ssize_t	(*host_writev)(int, const struct iovec *, int);
    111 static int	(*host_ioctl)(int, unsigned long, ...);
    112 static int	(*host_fcntl)(int, int, ...);
    113 static int	(*host_close)(int);
    114 static int	(*host_pollts)(struct pollfd *, nfds_t,
    115 			       const struct timespec *, const sigset_t *);
    116 static pid_t	(*host_fork)(void);
    117 static int	(*host_dup2)(int, int);
    118 static int	(*host_shutdown)(int, int);
    119 /* XXX */
    120 static void	*host_sendto;
    121 static void	*host_recvfrom;
    122 
    123 static void *rumpcalls[RUMPCALL__NUM];
    124 
    125 /*
    126  * Would be nice to get this automatically in sync with libc.
    127  * Also, this does not work for compat-using binaries!
    128  */
    129 
    130 #if !__NetBSD_Prereq__(5,99,7)
    131 #define SELECT select
    132 #define POLLTS pollts
    133 #define POLL poll
    134 #else
    135 #define SELECT __select50
    136 #define POLLTS __pollts50
    137 #define POLL __poll50
    138 
    139 int SELECT(int, fd_set *, fd_set *, fd_set *, struct timeval *);
    140 int POLLTS(struct pollfd *, nfds_t, const struct timespec *, const sigset_t *);
    141 int POLL(struct pollfd *, nfds_t, int);
    142 #endif
    143 
/*
 * dlsym() trampoline living inside this library.  rcinit() installs it
 * into librumpclient's "rumpclient_dlsym" hook; it is called from
 * librumpclient in case of LD_PRELOAD and ensures correct RTLD_NEXT.
 */
static void *
hijackdlsym(void *handle, const char *symbol)
{
	void *addr;

	addr = dlsym(handle, symbol);
	return addr;
}
    154 
    155 /* low calorie sockets? */
    156 static bool hostlocalsockets = true;
    157 
    158 static void __attribute__((constructor))
    159 rcinit(void)
    160 {
    161 	int (*rumpcinit)(void);
    162 	void **rumpcdlsym;
    163 	void *hand;
    164 	int i;
    165 
    166 	hand = dlopen("librumpclient.so", RTLD_LAZY|RTLD_GLOBAL);
    167 	if (!hand)
    168 		err(1, "cannot open librumpclient.so");
    169 	rumpcinit = dlsym(hand, "rumpclient_init");
    170 	_DIAGASSERT(rumpcinit);
    171 
    172 	rumpcdlsym = dlsym(hand, "rumpclient_dlsym");
    173 	*rumpcdlsym = hijackdlsym;
    174 
    175 	host_socket = dlsym(RTLD_NEXT, "__socket30");
    176 	host_listen = dlsym(RTLD_NEXT, "listen");
    177 	host_connect = dlsym(RTLD_NEXT, "connect");
    178 	host_bind = dlsym(RTLD_NEXT, "bind");
    179 	host_accept = dlsym(RTLD_NEXT, "accept");
    180 	host_getpeername = dlsym(RTLD_NEXT, "getpeername");
    181 	host_getsockname = dlsym(RTLD_NEXT, "getsockname");
    182 	host_setsockopt = dlsym(RTLD_NEXT, "setsockopt");
    183 
    184 	host_read = dlsym(RTLD_NEXT, "read");
    185 	host_readv = dlsym(RTLD_NEXT, "readv");
    186 	host_write = dlsym(RTLD_NEXT, "write");
    187 	host_writev = dlsym(RTLD_NEXT, "writev");
    188 	host_ioctl = dlsym(RTLD_NEXT, "ioctl");
    189 	host_fcntl = dlsym(RTLD_NEXT, "fcntl");
    190 	host_close = dlsym(RTLD_NEXT, "close");
    191 	host_pollts = dlsym(RTLD_NEXT, "pollts");
    192 	host_fork = dlsym(RTLD_NEXT, "fork");
    193 	host_dup2 = dlsym(RTLD_NEXT, "dup2");
    194 	host_shutdown = dlsym(RTLD_NEXT, "shutdown");
    195 	host_sendto = dlsym(RTLD_NEXT, "sendto");
    196 	host_recvfrom = dlsym(RTLD_NEXT, "recvfrom");
    197 
    198 	for (i = 0; i < RUMPCALL__NUM; i++) {
    199 		rumpcalls[i] = dlsym(hand, sysnames[i]);
    200 		if (!rumpcalls[i]) {
    201 			fprintf(stderr, "rumphijack: cannot find symbol: %s\n",
    202 			    sysnames[i]);
    203 			exit(1);
    204 		}
    205 	}
    206 
    207 	if (rumpcinit() == -1)
    208 		err(1, "rumpclient init");
    209 }
    210 
/*
 * Bitmask of descriptor numbers onto which dup2() has duplicated a rump
 * kernel descriptor: bit n set means fd n refers to the rump kernel
 * without the usual hijack offset.
 */
static unsigned dup2mask;

/*
 * Return nonzero iff fd is marked in dup2mask.  Descriptors that do not
 * fit in the mask (negative, or >= the number of bits) are never marked;
 * checking the range first also avoids an undefined shift, which the
 * hijacked descriptors (>= 128) would otherwise trigger.
 */
static inline int
isdup2d(int fd)
{

	if (fd < 0 || fd >= (int)(sizeof(dup2mask) * 8))
		return 0;
	return (dup2mask & (1u << fd)) != 0;
}
#define ISDUP2D(fd) isdup2d(fd)

//#define DEBUGJACK
#ifdef DEBUGJACK
#define DPRINTF(x) mydprintf x
/*
 * Debug printf to stderr.  Silenced once stderr itself has been
 * redirected to a rump kernel descriptor with dup2().
 */
static void
mydprintf(const char *fmt, ...)
{
	va_list ap;

	if (ISDUP2D(STDERR_FILENO))
		return;

	va_start(ap, fmt);
	vfprintf(stderr, fmt, ap);
	va_end(ap);
}

#else
#define DPRINTF(x)
#endif
    233 
/*
 * Descriptor number translation.  Rump kernel descriptors are shown to
 * the application shifted up by HIJACK_FDOFF, except for descriptors
 * marked in dup2mask, which keep their number.
 *
 * XXX: need runtime selection.  low for now due to FD_SETSIZE
 */
#define HIJACK_FDOFF 128
#define HIJACK_SELECT 128 /* XXX */
#define HIJACK_ASSERT 128 /* XXX */

/* rump kernel fd -> application-visible fd; -1 is passed through */
static int
fd_rump2host(int fd)
{

	if (fd != -1 && !ISDUP2D(fd))
		return fd + HIJACK_FDOFF;
	return fd;
}

/* application-visible fd -> rump kernel fd */
static int
fd_host2rump(int fd)
{

	return ISDUP2D(fd) ? fd : fd - HIJACK_FDOFF;
}

/* does this application-visible fd belong to the rump kernel? */
static bool
fd_isrump(int fd)
{

	if (ISDUP2D(fd))
		return true;
	return fd >= HIJACK_FDOFF;
}

/* for calls which have no host fallback: insist on a rump kernel fd */
#define assertfd(_fd_) assert(ISDUP2D(_fd_) || (_fd_) >= HIJACK_ASSERT)
#undef HIJACK_FDOFF
    269 
    270 int __socket30(int, int, int);
    271 int
    272 __socket30(int domain, int type, int protocol)
    273 {
    274 	int (*rc_socket)(int, int, int);
    275 	int fd;
    276 	bool dohost;
    277 
    278 	dohost = hostlocalsockets && (domain == AF_LOCAL);
    279 
    280 	if (dohost)
    281 		rc_socket = host_socket;
    282 	else
    283 		rc_socket = rumpcalls[RUMPCALL_SOCKET];
    284 	fd = rc_socket(domain, type, protocol);
    285 
    286 	if (!dohost)
    287 		fd = fd_rump2host(fd);
    288 	DPRINTF(("socket <- %d\n", fd));
    289 
    290 	return fd;
    291 }
    292 
    293 int
    294 accept(int s, struct sockaddr *addr, socklen_t *addrlen)
    295 {
    296 	int (*rc_accept)(int, struct sockaddr *, socklen_t *);
    297 	int fd;
    298 	bool isrump;
    299 
    300 	isrump = fd_isrump(s);
    301 
    302 	DPRINTF(("accept -> %d", s));
    303 	if (isrump) {
    304 		rc_accept = rumpcalls[RUMPCALL_ACCEPT];
    305 		s = fd_host2rump(s);
    306 	} else {
    307 		rc_accept = host_accept;
    308 	}
    309 	fd = rc_accept(s, addr, addrlen);
    310 	if (fd != -1 && isrump)
    311 		fd = fd_rump2host(fd);
    312 
    313 	DPRINTF((" <- %d\n", fd));
    314 
    315 	return fd;
    316 }
    317 
    318 int
    319 bind(int s, const struct sockaddr *name, socklen_t namelen)
    320 {
    321 	int (*rc_bind)(int, const struct sockaddr *, socklen_t);
    322 
    323 	DPRINTF(("bind -> %d\n", s));
    324 	if (fd_isrump(s)) {
    325 		rc_bind = rumpcalls[RUMPCALL_BIND];
    326 		s = fd_host2rump(s);
    327 	} else {
    328 		rc_bind = host_bind;
    329 	}
    330 	return rc_bind(s, name, namelen);
    331 }
    332 
    333 int
    334 connect(int s, const struct sockaddr *name, socklen_t namelen)
    335 {
    336 	int (*rc_connect)(int, const struct sockaddr *, socklen_t);
    337 
    338 	DPRINTF(("connect -> %d\n", s));
    339 	if (fd_isrump(s)) {
    340 		rc_connect = rumpcalls[RUMPCALL_CONNECT];
    341 		s = fd_host2rump(s);
    342 	} else {
    343 		rc_connect = host_connect;
    344 	}
    345 
    346 	return rc_connect(s, name, namelen);
    347 }
    348 
    349 int
    350 getpeername(int s, struct sockaddr *name, socklen_t *namelen)
    351 {
    352 	int (*rc_getpeername)(int, struct sockaddr *, socklen_t *);
    353 
    354 	DPRINTF(("getpeername -> %d\n", s));
    355 	if (fd_isrump(s)) {
    356 		rc_getpeername = rumpcalls[RUMPCALL_GETPEERNAME];
    357 		s = fd_host2rump(s);
    358 	} else {
    359 		rc_getpeername = host_getpeername;
    360 	}
    361 	return rc_getpeername(s, name, namelen);
    362 }
    363 
    364 int
    365 getsockname(int s, struct sockaddr *name, socklen_t *namelen)
    366 {
    367 	int (*rc_getsockname)(int, struct sockaddr *, socklen_t *);
    368 
    369 	DPRINTF(("getsockname -> %d\n", s));
    370 	if (fd_isrump(s)) {
    371 		rc_getsockname = rumpcalls[RUMPCALL_GETSOCKNAME];
    372 		s = fd_host2rump(s);
    373 	} else {
    374 		rc_getsockname = host_getsockname;
    375 	}
    376 	return rc_getsockname(s, name, namelen);
    377 }
    378 
    379 int
    380 listen(int s, int backlog)
    381 {
    382 	int (*rc_listen)(int, int);
    383 
    384 	DPRINTF(("listen -> %d\n", s));
    385 	if (fd_isrump(s)) {
    386 		rc_listen = rumpcalls[RUMPCALL_LISTEN];
    387 		s = fd_host2rump(s);
    388 	} else {
    389 		rc_listen = host_listen;
    390 	}
    391 	return rc_listen(s, backlog);
    392 }
    393 
/* recv(): recvfrom() without a source address; routing happens there. */
ssize_t
recv(int s, void *buf, size_t len, int flags)
{
	ssize_t nrecv;

	nrecv = recvfrom(s, buf, len, flags, NULL, NULL);
	return nrecv;
}
    400 
    401 ssize_t
    402 recvfrom(int s, void *buf, size_t len, int flags, struct sockaddr *from,
    403 	socklen_t *fromlen)
    404 {
    405 	int (*rc_recvfrom)(int, void *, size_t, int,
    406 	    struct sockaddr *, socklen_t *);
    407 
    408 	DPRINTF(("recvfrom\n"));
    409 	if (fd_isrump(s)) {
    410 		rc_recvfrom = rumpcalls[RUMPCALL_RECVFROM];
    411 		s = fd_host2rump(s);
    412 	} else {
    413 		rc_recvfrom = host_recvfrom;
    414 	}
    415 
    416 	return rc_recvfrom(s, buf, len, flags, from, fromlen);
    417 }
    418 
    419 ssize_t
    420 recvmsg(int s, struct msghdr *msg, int flags)
    421 {
    422 	int (*rc_recvmsg)(int, struct msghdr *, int);
    423 
    424 	DPRINTF(("recvmsg\n"));
    425 	assertfd(s);
    426 	rc_recvmsg = rumpcalls[RUMPCALL_RECVMSG];
    427 	return rc_recvmsg(fd_host2rump(s), msg, flags);
    428 }
    429 
/* send(): sendto() without a destination address; routing happens there. */
ssize_t
send(int s, const void *buf, size_t len, int flags)
{
	ssize_t nsent;

	nsent = sendto(s, buf, len, flags, NULL, 0);
	return nsent;
}
    436 
    437 ssize_t
    438 sendto(int s, const void *buf, size_t len, int flags,
    439 	const struct sockaddr *to, socklen_t tolen)
    440 {
    441 	int (*rc_sendto)(int, const void *, size_t, int,
    442 	    const struct sockaddr *, socklen_t);
    443 
    444 	if (s == -1)
    445 		return len;
    446 	DPRINTF(("sendto\n"));
    447 
    448 	if (fd_isrump(s)) {
    449 		rc_sendto = rumpcalls[RUMPCALL_SENDTO];
    450 		s = fd_host2rump(s);
    451 	} else {
    452 		rc_sendto = host_sendto;
    453 	}
    454 	return rc_sendto(s, buf, len, flags, to, tolen);
    455 }
    456 
    457 ssize_t
    458 sendmsg(int s, const struct msghdr *msg, int flags)
    459 {
    460 	int (*rc_sendmsg)(int, const struct msghdr *, int);
    461 
    462 	DPRINTF(("sendmsg\n"));
    463 	assertfd(s);
    464 	rc_sendmsg = rumpcalls[RUMPCALL_SENDTO];
    465 	return rc_sendmsg(fd_host2rump(s), msg, flags);
    466 }
    467 
    468 int
    469 getsockopt(int s, int level, int optname, void *optval, socklen_t *optlen)
    470 {
    471 	int (*rc_getsockopt)(int, int, int, void *, socklen_t *);
    472 
    473 	DPRINTF(("getsockopt -> %d\n", s));
    474 	assertfd(s);
    475 	rc_getsockopt = rumpcalls[RUMPCALL_GETSOCKOPT];
    476 	return rc_getsockopt(fd_host2rump(s), level, optname, optval, optlen);
    477 }
    478 
    479 int
    480 setsockopt(int s, int level, int optname, const void *optval, socklen_t optlen)
    481 {
    482 	int (*rc_setsockopt)(int, int, int, const void *, socklen_t);
    483 
    484 	DPRINTF(("setsockopt -> %d\n", s));
    485 	if (fd_isrump(s)) {
    486 		rc_setsockopt = rumpcalls[RUMPCALL_SETSOCKOPT];
    487 		s = fd_host2rump(s);
    488 	} else {
    489 		rc_setsockopt = host_setsockopt;
    490 	}
    491 	return rc_setsockopt(s, level, optname, optval, optlen);
    492 }
    493 
    494 int
    495 shutdown(int s, int how)
    496 {
    497 	int (*rc_shutdown)(int, int);
    498 
    499 	DPRINTF(("shutdown -> %d\n", s));
    500 	if (fd_isrump(s)) {
    501 		rc_shutdown = rumpcalls[RUMPCALL_SHUTDOWN];
    502 		s = fd_host2rump(s);
    503 	} else {
    504 		rc_shutdown = host_shutdown;
    505 	}
    506 	return rc_shutdown(s, how);
    507 }
    508 
    509 /*
    510  * dup2 is special.  we allow dup2 of a rump kernel fd to 0-2 since
    511  * many programs do that.  dup2 of a rump kernel fd to another value
    512  * not >= fdoff is an error.
    513  *
    514  * Note: cannot rump2host newd, because it is often hardcoded.
    515  *
    516  * XXX: should disable debug prints after stdout/stderr are dup2'd
    517  */
    518 int
    519 dup2(int oldd, int newd)
    520 {
    521 	int rv;
    522 
    523 	DPRINTF(("dup2 -> %d (o) -> %d (n)\n", oldd, newd));
    524 
    525 	if (fd_isrump(oldd)) {
    526 		if (!(newd >= 0 && newd <= 2))
    527 			return EBADF;
    528 		oldd = fd_host2rump(oldd);
    529 		rv = rump_sys_dup2(oldd, newd);
    530 		if (rv != -1)
    531 			dup2mask |= 1<<newd;
    532 	} else {
    533 		rv = host_dup2(oldd, newd);
    534 	}
    535 
    536 	return rv;
    537 }
    538 
/*
 * fork() is wrapped with the rump client fork calls: the state
 * obtained from rumpclient_prefork() in the parent is handed to
 * rumpclient_fork_init() in the child.  This preserves the file
 * descriptors of the forked parent in the child, but prevents double
 * use of the connection fd.
 */

pid_t
fork()
{
	struct rumpclient_fork *rf;
	pid_t pid;

	DPRINTF(("fork\n"));

	rf = rumpclient_prefork();
	if (rf == NULL)
		return -1;

	pid = host_fork();
	if (pid == 0) {
		/* child: establish our own connection to the rump kernel */
		if (rumpclient_fork_init(rf) == -1)
			pid = -1;
	}
	/* XXX: rf is not cancelled if host_fork() failed */

	DPRINTF(("fork returns %d\n", pid));
	return pid;
}
    571 
    572 /*
    573  * Hybrids
    574  */
    575 
    576 ssize_t
    577 read(int fd, void *buf, size_t len)
    578 {
    579 	ssize_t (*op_read)(int, void *, size_t);
    580 	ssize_t n;
    581 
    582 	DPRINTF(("read %d\n", fd));
    583 	if (fd_isrump(fd)) {
    584 		fd = fd_host2rump(fd);
    585 		op_read = rumpcalls[RUMPCALL_READ];
    586 	} else {
    587 		op_read = host_read;
    588 	}
    589 
    590 	n = op_read(fd, buf, len);
    591 	return n;
    592 }
    593 
    594 ssize_t
    595 readv(int fd, const struct iovec *iov, int iovcnt)
    596 {
    597 	ssize_t (*op_readv)(int, const struct iovec *, int);
    598 
    599 	DPRINTF(("readv %d\n", fd));
    600 	if (fd_isrump(fd)) {
    601 		fd = fd_host2rump(fd);
    602 		op_readv = rumpcalls[RUMPCALL_READV];
    603 	} else {
    604 		op_readv = host_readv;
    605 	}
    606 
    607 	return op_readv(fd, iov, iovcnt);
    608 }
    609 
    610 ssize_t
    611 write(int fd, const void *buf, size_t len)
    612 {
    613 	ssize_t (*op_write)(int, const void *, size_t);
    614 
    615 	if (fd_isrump(fd)) {
    616 		fd = fd_host2rump(fd);
    617 		op_write = rumpcalls[RUMPCALL_WRITE];
    618 	} else {
    619 		op_write = host_write;
    620 	}
    621 
    622 	return op_write(fd, buf, len);
    623 }
    624 
    625 ssize_t
    626 writev(int fd, const struct iovec *iov, int iovcnt)
    627 {
    628 	ssize_t (*op_writev)(int, const struct iovec *, int);
    629 
    630 	DPRINTF(("writev %d\n", fd));
    631 	if (fd_isrump(fd)) {
    632 		fd = fd_host2rump(fd);
    633 		op_writev = rumpcalls[RUMPCALL_WRITEV];
    634 	} else {
    635 		op_writev = host_writev;
    636 	}
    637 
    638 	return op_writev(fd, iov, iovcnt);
    639 }
    640 
    641 int
    642 ioctl(int fd, unsigned long cmd, ...)
    643 {
    644 	int (*op_ioctl)(int, unsigned long cmd, ...);
    645 	va_list ap;
    646 	int rv;
    647 
    648 	DPRINTF(("ioctl\n"));
    649 	if (fd_isrump(fd)) {
    650 		fd = fd_host2rump(fd);
    651 		op_ioctl = rumpcalls[RUMPCALL_IOCTL];
    652 	} else {
    653 		op_ioctl = host_ioctl;
    654 	}
    655 
    656 	va_start(ap, cmd);
    657 	rv = op_ioctl(fd, cmd, va_arg(ap, void *));
    658 	va_end(ap);
    659 	return rv;
    660 }
    661 
    662 int
    663 fcntl(int fd, int cmd, ...)
    664 {
    665 	int (*op_fcntl)(int, int, ...);
    666 	va_list ap;
    667 	int rv;
    668 
    669 	DPRINTF(("fcntl\n"));
    670 	if (fd_isrump(fd)) {
    671 		fd = fd_host2rump(fd);
    672 		op_fcntl = rumpcalls[RUMPCALL_FCNTL];
    673 	} else {
    674 		op_fcntl = host_fcntl;
    675 	}
    676 
    677 	va_start(ap, cmd);
    678 	rv = op_fcntl(fd, cmd, va_arg(ap, void *));
    679 	va_end(ap);
    680 	return rv;
    681 }
    682 
    683 int
    684 close(int fd)
    685 {
    686 	int (*op_close)(int);
    687 
    688 	DPRINTF(("close %d\n", fd));
    689 	if (fd_isrump(fd)) {
    690 		fd = fd_host2rump(fd);
    691 		op_close = rumpcalls[RUMPCALL_CLOSE];
    692 	} else {
    693 		op_close = host_close;
    694 	}
    695 
    696 	return op_close(fd);
    697 }
    698 
    699 int
    700 SELECT(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds,
    701 	struct timeval *timeout)
    702 {
    703 	struct pollfd *pfds;
    704 	struct timespec ts, *tsp = NULL;
    705 	nfds_t i, j, realnfds;
    706 	int rv, incr;
    707 
    708 	DPRINTF(("select\n"));
    709 
    710 	/*
    711 	 * Well, first we must scan the fds to figure out how many
    712 	 * fds there really are.  This is because up to and including
    713 	 * nb5 poll() silently refuses nfds > process_open_fds.
    714 	 * Seems to be fixed in current, thank the maker.
    715 	 * god damn cluster...bomb.
    716 	 */
    717 
    718 	for (i = 0, realnfds = 0; i < nfds; i++) {
    719 		if (readfds && FD_ISSET(i, readfds)) {
    720 			realnfds++;
    721 			continue;
    722 		}
    723 		if (writefds && FD_ISSET(i, writefds)) {
    724 			realnfds++;
    725 			continue;
    726 		}
    727 		if (exceptfds && FD_ISSET(i, exceptfds)) {
    728 			realnfds++;
    729 			continue;
    730 		}
    731 	}
    732 
    733 	if (realnfds) {
    734 		pfds = malloc(sizeof(*pfds) * realnfds);
    735 		if (!pfds)
    736 			return -1;
    737 	} else {
    738 		pfds = NULL;
    739 	}
    740 
    741 	for (i = 0, j = 0; i < nfds; i++) {
    742 		incr = 0;
    743 		pfds[j].events = pfds[j].revents = 0;
    744 		if (readfds && FD_ISSET(i, readfds)) {
    745 			pfds[j].fd = i;
    746 			pfds[j].events |= POLLIN;
    747 			incr=1;
    748 		}
    749 		if (writefds && FD_ISSET(i, writefds)) {
    750 			pfds[j].fd = i;
    751 			pfds[j].events |= POLLOUT;
    752 			incr=1;
    753 		}
    754 		if (exceptfds && FD_ISSET(i, exceptfds)) {
    755 			pfds[j].fd = i;
    756 			pfds[j].events |= POLLHUP|POLLERR;
    757 			incr=1;
    758 		}
    759 		if (incr)
    760 			j++;
    761 	}
    762 
    763 	if (timeout) {
    764 		TIMEVAL_TO_TIMESPEC(timeout, &ts);
    765 		tsp = &ts;
    766 	}
    767 	rv = pollts(pfds, realnfds, tsp, NULL);
    768 	if (rv <= 0)
    769 		goto out;
    770 
    771 	/*
    772 	 * ok, harvest results.  first zero out entries (can't use
    773 	 * FD_ZERO for the obvious select-me-not reason).  whee.
    774 	 */
    775 	for (i = 0; i < nfds; i++) {
    776 		if (readfds)
    777 			FD_CLR(i, readfds);
    778 		if (writefds)
    779 			FD_CLR(i, writefds);
    780 		if (exceptfds)
    781 			FD_CLR(i, exceptfds);
    782 	}
    783 
    784 	/* and then plug in the results */
    785 	for (i = 0; i < realnfds; i++) {
    786 		if (readfds) {
    787 			if (pfds[i].revents & POLLIN) {
    788 				FD_SET(pfds[i].fd, readfds);
    789 			}
    790 		}
    791 		if (writefds) {
    792 			if (pfds[i].revents & POLLOUT) {
    793 				FD_SET(pfds[i].fd, writefds);
    794 			}
    795 		}
    796 		if (exceptfds) {
    797 			if (pfds[i].revents & (POLLHUP|POLLERR)) {
    798 				FD_SET(pfds[i].fd, exceptfds);
    799 			}
    800 		}
    801 	}
    802 
    803  out:
    804 	free(pfds);
    805 	return rv;
    806 }
    807 
/*
 * Count how many entries of a poll vector refer to the host kernel and
 * how many to the rump kernel.  Entries with fd -1 are not counted.
 * The counters are incremented, not reset.
 */
static void
checkpoll(struct pollfd *fds, nfds_t nfds, int *hostcall, int *rumpcall)
{
	struct pollfd *pfd;

	for (pfd = fds; pfd < fds + nfds; pfd++) {
		if (pfd->fd == -1)
			continue;

		if (fd_isrump(pfd->fd))
			*rumpcall += 1;
		else
			*hostcall += 1;
	}
}
    823 
/*
 * Rewrite the descriptor number of every entry in a poll vector, in
 * place, using the given translation function.
 */
static void
adjustpoll(struct pollfd *fds, nfds_t nfds, int (*fdadj)(int))
{
	struct pollfd *pfd = fds;
	nfds_t left;

	for (left = nfds; left > 0; left--, pfd++)
		pfd->fd = fdadj(pfd->fd);
}
    833 
/*
 * poll is easy as long as all the fds in a call belong to one kernel
 * only.  Otherwise it is quite tricky: the host part is run in a
 * separate thread, and this structure carries its arguments and
 * results.
 */
struct pollarg {
	struct pollfd *pfds;		/* vector handed to the host poll */
	nfds_t nfds;			/* number of entries in pfds */
	const struct timespec *ts;	/* timeout, passed through */
	const sigset_t *sigmask;	/* signal mask, passed through */
	int pipefd;			/* rump kernel fd written when done */
	int errnum;			/* errno of a failed host poll */
};
    846 
    847 static void *
    848 hostpoll(void *arg)
    849 {
    850 	struct pollarg *parg = arg;
    851 	intptr_t rv;
    852 
    853 	rv = host_pollts(parg->pfds, parg->nfds, parg->ts, parg->sigmask);
    854 	if (rv == -1)
    855 		parg->errnum = errno;
    856 	rump_sys_write(parg->pipefd, &rv, sizeof(rv));
    857 
    858 	return (void *)(intptr_t)rv;
    859 }
    860 
    861 int
    862 POLLTS(struct pollfd *fds, nfds_t nfds, const struct timespec *ts,
    863 	const sigset_t *sigmask)
    864 {
    865 	int (*op_pollts)(struct pollfd *, nfds_t, const struct timespec *,
    866 			 const sigset_t *);
    867 	int hostcall = 0, rumpcall = 0;
    868 	pthread_t pt;
    869 	nfds_t i;
    870 	int rv;
    871 
    872 	DPRINTF(("poll\n"));
    873 	checkpoll(fds, nfds, &hostcall, &rumpcall);
    874 
    875 	if (hostcall && rumpcall) {
    876 		struct pollfd *pfd_host = NULL, *pfd_rump = NULL;
    877 		int rpipe[2] = {-1,-1}, hpipe[2] = {-1,-1};
    878 		struct pollarg parg;
    879 		uintptr_t lrv;
    880 		int sverrno = 0, trv;
    881 
    882 		/*
    883 		 * ok, this is where it gets tricky.  We must support
    884 		 * this since it's a very common operation in certain
    885 		 * types of software (telnet, netcat, etc).  We allocate
    886 		 * two vectors and run two poll commands in separate
    887 		 * threads.  Whichever returns first "wins" and the
    888 		 * other kernel's fds won't show activity.
    889 		 */
    890 		rv = -1;
    891 
    892 		/* allocate full vector for O(n) joining after call */
    893 		pfd_host = malloc(sizeof(*pfd_host)*(nfds+1));
    894 		if (!pfd_host)
    895 			goto out;
    896 		pfd_rump = malloc(sizeof(*pfd_rump)*(nfds+1));
    897 		if (!pfd_rump) {
    898 			goto out;
    899 		}
    900 
    901 		/* split vectors */
    902 		for (i = 0; i < nfds; i++) {
    903 			if (fds[i].fd == -1) {
    904 				pfd_host[i].fd = -1;
    905 				pfd_rump[i].fd = -1;
    906 			} else if (fd_isrump(fds[i].fd)) {
    907 				pfd_host[i].fd = -1;
    908 				pfd_rump[i].fd = fd_host2rump(fds[i].fd);
    909 				pfd_rump[i].events = fds[i].events;
    910 			} else {
    911 				pfd_rump[i].fd = -1;
    912 				pfd_host[i].fd = fds[i].fd;
    913 				pfd_host[i].events = fds[i].events;
    914 			}
    915 			fds[i].revents = 0;
    916 		}
    917 
    918 		/*
    919 		 * then, open two pipes, one for notifications
    920 		 * to each kernel.
    921 		 */
    922 		if (rump_sys_pipe(rpipe) == -1)
    923 			goto out;
    924 		if (pipe(hpipe) == -1)
    925 			goto out;
    926 
    927 		pfd_host[nfds].fd = hpipe[0];
    928 		pfd_host[nfds].events = POLLIN;
    929 		pfd_rump[nfds].fd = rpipe[0];
    930 		pfd_rump[nfds].events = POLLIN;
    931 
    932 		/*
    933 		 * then, create a thread to do host part and meanwhile
    934 		 * do rump kernel part right here
    935 		 */
    936 
    937 		parg.pfds = pfd_host;
    938 		parg.nfds = nfds+1;
    939 		parg.ts = ts;
    940 		parg.sigmask = sigmask;
    941 		parg.pipefd = rpipe[1];
    942 		pthread_create(&pt, NULL, hostpoll, &parg);
    943 
    944 		op_pollts = rumpcalls[RUMPCALL_POLLTS];
    945 		lrv = op_pollts(pfd_rump, nfds+1, ts, NULL);
    946 		sverrno = errno;
    947 		write(hpipe[1], &rv, sizeof(rv));
    948 		pthread_join(pt, (void *)&trv);
    949 
    950 		/* check who "won" and merge results */
    951 		if (lrv != 0 && pfd_host[nfds].revents & POLLIN) {
    952 			rv = trv;
    953 
    954 			for (i = 0; i < nfds; i++) {
    955 				if (pfd_rump[i].fd != -1)
    956 					fds[i].revents = pfd_rump[i].revents;
    957 			}
    958 			sverrno = parg.errnum;
    959 		} else if (trv != 0 && pfd_rump[nfds].revents & POLLIN) {
    960 			rv = trv;
    961 
    962 			for (i = 0; i < nfds; i++) {
    963 				if (pfd_host[i].fd != -1)
    964 					fds[i].revents = pfd_host[i].revents;
    965 			}
    966 		} else {
    967 			rv = 0;
    968 		}
    969 
    970  out:
    971 		if (rpipe[0] != -1)
    972 			rump_sys_close(rpipe[0]);
    973 		if (rpipe[1] != -1)
    974 			rump_sys_close(rpipe[1]);
    975 		if (hpipe[0] != -1)
    976 			host_close(hpipe[0]);
    977 		if (hpipe[1] != -1)
    978 			host_close(hpipe[1]);
    979 		free(pfd_host);
    980 		free(pfd_rump);
    981 		errno = sverrno;
    982 	} else {
    983 		if (hostcall) {
    984 			op_pollts = host_pollts;
    985 		} else {
    986 			op_pollts = rumpcalls[RUMPCALL_POLLTS];
    987 			adjustpoll(fds, nfds, fd_host2rump);
    988 		}
    989 
    990 		rv = op_pollts(fds, nfds, ts, sigmask);
    991 		if (rumpcall)
    992 			adjustpoll(fds, nfds, fd_rump2host);
    993 	}
    994 
    995 	return rv;
    996 }
    997 
    998 int
    999 POLL(struct pollfd *fds, nfds_t nfds, int timeout)
   1000 {
   1001 	struct timespec ts;
   1002 	struct timespec *tsp = NULL;
   1003 
   1004 	if (timeout != INFTIM) {
   1005 		ts.tv_sec = timeout / 1000;
   1006 		ts.tv_nsec = (timeout % 1000) * 1000*1000;
   1007 
   1008 		tsp = &ts;
   1009 	}
   1010 
   1011 	return pollts(fds, nfds, tsp, NULL);
   1012 }
   1013 
/* kqueue() is not supported by this library: calling it aborts. */
int
kqueue(void)
{

	abort();
	/* NOTREACHED */
}
   1020 
/*
 * kevent() is not supported by this library: calling it aborts,
 * whatever the arguments.
 */
int
kevent(int kq, const struct kevent *changelist, size_t nchanges,
	struct kevent *eventlist, size_t nevents,
	const struct timespec *timeout)
{

	abort();
	/* NOTREACHED */
}
   1029