hijack.c revision 1.42 1 /* $NetBSD: hijack.c,v 1.42 2011/02/16 15:33:46 pooka Exp $ */
2
3 /*-
4 * Copyright (c) 2011 Antti Kantee. All Rights Reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
16 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
21 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27
28 #include <sys/cdefs.h>
29 __RCSID("$NetBSD: hijack.c,v 1.42 2011/02/16 15:33:46 pooka Exp $");
30
31 #define __ssp_weak_name(fun) _hijack_ ## fun
32
33 #include <sys/param.h>
34 #include <sys/types.h>
35 #include <sys/event.h>
36 #include <sys/ioctl.h>
37 #include <sys/socket.h>
38 #include <sys/poll.h>
39
40 #include <rump/rumpclient.h>
41 #include <rump/rump_syscalls.h>
42
43 #include <assert.h>
44 #include <dlfcn.h>
45 #include <err.h>
46 #include <errno.h>
47 #include <fcntl.h>
48 #include <poll.h>
49 #include <pthread.h>
50 #include <signal.h>
51 #include <stdarg.h>
52 #include <stdbool.h>
53 #include <stdio.h>
54 #include <stdlib.h>
55 #include <string.h>
56 #include <time.h>
57 #include <unistd.h>
58
/*
 * One identifier per call which has both a host and a rump kernel
 * implementation.  The values are used as indices into syscalls[],
 * so DUALCALL__NUM must stay last.
 */
enum dualcall {
	DUALCALL_WRITE,
	DUALCALL_WRITEV,
	DUALCALL_IOCTL,
	DUALCALL_FCNTL,
	DUALCALL_SOCKET,
	DUALCALL_ACCEPT,
	DUALCALL_BIND,
	DUALCALL_CONNECT,
	DUALCALL_GETPEERNAME,
	DUALCALL_GETSOCKNAME,
	DUALCALL_LISTEN,
	DUALCALL_RECVFROM,
	DUALCALL_RECVMSG,
	DUALCALL_SENDTO,
	DUALCALL_SENDMSG,
	DUALCALL_GETSOCKOPT,
	DUALCALL_SETSOCKOPT,
	DUALCALL_SHUTDOWN,
	DUALCALL_READ,
	DUALCALL_READV,
	DUALCALL_DUP2,
	DUALCALL_CLOSE,
	DUALCALL_POLLTS,
	DUALCALL_KEVENT,
	DUALCALL__NUM		/* number of entries, not a call */
};
75
76 #define RSYS_STRING(a) __STRING(a)
77 #define RSYS_NAME(a) RSYS_STRING(__CONCAT(RUMP_SYS_RENAME_,a))
78
79 /*
80 * Would be nice to get this automatically in sync with libc.
81 * Also, this does not work for compat-using binaries!
82 */
83 #if !__NetBSD_Prereq__(5,99,7)
84 #define REALSELECT select
85 #define REALPOLLTS pollts
86 #define REALKEVENT kevent
87 #else
88 #define REALSELECT _sys___select50
89 #define REALPOLLTS _sys___pollts50
90 #define REALKEVENT _sys___kevent50
91 #endif
92 #define REALREAD _sys_read
93
94 int REALSELECT(int, fd_set *, fd_set *, fd_set *, struct timeval *);
95 int REALPOLLTS(struct pollfd *, nfds_t,
96 const struct timespec *, const sigset_t *);
97 int REALKEVENT(int, const struct kevent *, size_t, struct kevent *, size_t,
98 const struct timespec *);
99 ssize_t REALREAD(int, void *, size_t);
100
101 #define S(a) __STRING(a)
102 struct sysnames {
103 enum dualcall scm_callnum;
104 const char *scm_hostname;
105 const char *scm_rumpname;
106 } syscnames[] = {
107 { DUALCALL_SOCKET, "__socket30", RSYS_NAME(SOCKET) },
108 { DUALCALL_ACCEPT, "accept", RSYS_NAME(ACCEPT) },
109 { DUALCALL_BIND, "bind", RSYS_NAME(BIND) },
110 { DUALCALL_CONNECT, "connect", RSYS_NAME(CONNECT) },
111 { DUALCALL_GETPEERNAME, "getpeername", RSYS_NAME(GETPEERNAME) },
112 { DUALCALL_GETSOCKNAME, "getsockname", RSYS_NAME(GETSOCKNAME) },
113 { DUALCALL_LISTEN, "listen", RSYS_NAME(LISTEN) },
114 { DUALCALL_RECVFROM, "recvfrom", RSYS_NAME(RECVFROM) },
115 { DUALCALL_RECVMSG, "recvmsg", RSYS_NAME(RECVMSG) },
116 { DUALCALL_SENDTO, "sendto", RSYS_NAME(SENDTO) },
117 { DUALCALL_SENDMSG, "sendmsg", RSYS_NAME(SENDMSG) },
118 { DUALCALL_GETSOCKOPT, "getsockopt", RSYS_NAME(GETSOCKOPT) },
119 { DUALCALL_SETSOCKOPT, "setsockopt", RSYS_NAME(SETSOCKOPT) },
120 { DUALCALL_SHUTDOWN, "shutdown", RSYS_NAME(SHUTDOWN) },
121 { DUALCALL_READ, S(REALREAD), RSYS_NAME(READ) },
122 { DUALCALL_READV, "readv", RSYS_NAME(READV) },
123 { DUALCALL_WRITE, "write", RSYS_NAME(WRITE) },
124 { DUALCALL_WRITEV, "writev", RSYS_NAME(WRITEV) },
125 { DUALCALL_IOCTL, "ioctl", RSYS_NAME(IOCTL) },
126 { DUALCALL_FCNTL, "fcntl", RSYS_NAME(FCNTL) },
127 { DUALCALL_DUP2, "dup2", RSYS_NAME(DUP2) },
128 { DUALCALL_CLOSE, "close", RSYS_NAME(CLOSE) },
129 { DUALCALL_POLLTS, S(REALPOLLTS), RSYS_NAME(POLLTS) },
130 { DUALCALL_KEVENT, S(REALKEVENT), RSYS_NAME(KEVENT) },
131 };
132 #undef S
133
/*
 * Resolved entry points for one dualcall: the host implementation
 * and the rump kernel implementation.  Filled in by rcinit().
 */
struct bothsys {
	void *bs_host;
	void *bs_rump;
};
struct bothsys syscalls[DUALCALL__NUM];

/* GETSYSCALL(host|rump, NAME): fetch the resolved pointer for DUALCALL_NAME */
#define GETSYSCALL(which, name) (syscalls[DUALCALL_##name].bs_##which)
139
140 pid_t (*host_fork)(void);
141 int (*host_daemon)(int, int);
142 int (*host_execve)(const char *, char *const[], char *const[]);
143
/*
 * Bitmask of low-numbered host descriptors (0..31) which have been
 * dup2()'d from a rump kernel descriptor.  Descriptors outside the
 * range 0..31 are never tracked: ISDUP2D() is false for them and
 * SETDUP2()/CLRDUP2() are no-ops.  Negative values (e.g. -1 from a
 * failed call) are rejected explicitly, and the shift is done on an
 * unsigned constant so that bit 31 is well-defined.
 */
static uint32_t dup2mask;
#define ISDUP2D(fd) \
	((fd) >= 0 && (fd) < 32 && ((UINT32_C(1) << (fd)) & dup2mask) != 0)
#define SETDUP2(fd) \
	do { if ((fd) >= 0 && (fd) < 32) \
		dup2mask |= (UINT32_C(1) << (fd)); } while (/*CONSTCOND*/0)
#define CLRDUP2(fd) \
	do { if ((fd) >= 0 && (fd) < 32) \
		dup2mask &= ~(UINT32_C(1) << (fd)); } while (/*CONSTCOND*/0)
150
151 //#define DEBUGJACK
#ifndef DEBUGJACK
#define DPRINTF(x)
#else
#define DPRINTF(x) mydprintf x
/*
 * Debug printf to stderr.  Prints nothing when descriptor 2 is
 * marked in dup2mask.
 */
static void
mydprintf(const char *fmt, ...)
{
	va_list args;

	if (!ISDUP2D(STDERR_FILENO)) {
		va_start(args, fmt);
		vfprintf(stderr, fmt, args);
		va_end(args);
	}
}

#endif
170
/*
 * Generate a wrapper for a call whose first argument is a descriptor
 * named "fd".  If fd refers to the rump kernel, it is translated and
 * the rump implementation is called; otherwise the host one is.
 *
 *	type	return type of the call
 *	name	name of the generated function
 *	rcname	DUALCALL_* index into syscalls[]
 *	args	parenthesized parameter list (must declare "int fd")
 *	proto	parenthesized parameter types
 *	vars	parenthesized argument list passed on to the callee
 */
#define FDCALL(type, name, rcname, args, proto, vars)			\
type name args								\
{									\
	type (*op_call) proto;						\
									\
	DPRINTF(("%s -> %d\n", __STRING(name), fd));			\
	if (!fd_isrump(fd)) {						\
		op_call = syscalls[rcname].bs_host;			\
	} else {							\
		op_call = syscalls[rcname].bs_rump;			\
		fd = fd_host2rump(fd);					\
	}								\
									\
	return op_call vars;						\
}
186
187 /*
188 * This is called from librumpclient in case of LD_PRELOAD.
189 * It ensures correct RTLD_NEXT.
190 *
191 * ... except, it's apparently extremely difficult to force
192 * at least gcc to generate an actual stack frame here. So
193 * sprinkle some volatile foobar and baz to throw the optimizer
194 * off the scent and generate a variable assignment with the
195 * return value. The posterboy for this meltdown is amd64
196 * with -O2. At least with gcc 4.1.3 i386 works regardless of
197 * optimization.
198 */
volatile int rumphijack_unrope; /* there, unhang yourself */

/*
 * dlsym() wrapper handed to librumpclient via rumpclient_dlsym.
 *
 * Returns whatever dlsym() returns for (handle, symbol), NULL included.
 * The volatile read through the result is there only to keep the
 * compiler from turning the dlsym() call into a tail call; it is
 * skipped when the lookup failed so that a missing symbol is reported
 * to the caller as NULL instead of crashing here.
 */
static void *
hijackdlsym(void *handle, const char *symbol)
{
	void *rv;

	rv = dlsym(handle, symbol);
	if (rv != NULL)
		rumphijack_unrope = *(volatile int *)rv;

	return rv;
}
210
211 /* low calorie sockets? */
212 static bool hostlocalsockets = true;
213
214 static void __attribute__((constructor))
215 rcinit(void)
216 {
217 char buf[64];
218 extern void *(*rumpclient_dlsym)(void *, const char *);
219 unsigned i, j;
220
221 rumpclient_dlsym = hijackdlsym;
222 host_fork = dlsym(RTLD_NEXT, "fork");
223 host_daemon = dlsym(RTLD_NEXT, "daemon");
224 host_execve = dlsym(RTLD_NEXT, "execve");
225
226 /*
227 * In theory cannot print anything during lookups because
228 * we might not have the call vector set up. so, the errx()
229 * is a bit of a strech, but it might work.
230 */
231
232 for (i = 0; i < DUALCALL__NUM; i++) {
233 /* build runtime O(1) access */
234 for (j = 0; j < __arraycount(syscnames); j++) {
235 if (syscnames[j].scm_callnum == i)
236 break;
237 }
238
239 if (j == __arraycount(syscnames))
240 errx(1, "rumphijack error: syscall pos %d missing", i);
241
242 syscalls[i].bs_host = dlsym(RTLD_NEXT,
243 syscnames[j].scm_hostname);
244 if (syscalls[i].bs_host == NULL)
245 errx(1, "hostcall %s not found missing",
246 syscnames[j].scm_hostname);
247
248 syscalls[i].bs_rump = dlsym(RTLD_NEXT,
249 syscnames[j].scm_rumpname);
250 if (syscalls[i].bs_rump == NULL)
251 errx(1, "rumpcall %s not found missing",
252 syscnames[j].scm_rumpname);
253 }
254
255 if (rumpclient_init() == -1)
256 err(1, "rumpclient init");
257
258 /* set client persistence level */
259 if (getenv_r("RUMPHIJACK_RETRY", buf, sizeof(buf)) == -1) {
260 if (errno == ERANGE)
261 err(1, "invalid RUMPHIJACK_RETRY");
262 rumpclient_setconnretry(RUMPCLIENT_RETRYCONN_INFTIME);
263 } else {
264 if (strcmp(buf, "die") == 0)
265 rumpclient_setconnretry(RUMPCLIENT_RETRYCONN_DIE);
266 else if (strcmp(buf, "inftime") == 0)
267 rumpclient_setconnretry(RUMPCLIENT_RETRYCONN_INFTIME);
268 else if (strcmp(buf, "once") == 0)
269 rumpclient_setconnretry(RUMPCLIENT_RETRYCONN_ONCE);
270 else {
271 time_t timeout;
272
273 timeout = (time_t)strtoll(buf, NULL, 10);
274 if (timeout <= 0)
275 errx(1, "RUMPHIJACK_RETRY must be keyword "
276 "or a positive integer, got: %s", buf);
277
278 rumpclient_setconnretry(timeout);
279 }
280 }
281
282 if (getenv_r("RUMPHIJACK__DUP2MASK", buf, sizeof(buf)) == 0) {
283 dup2mask = strtoul(buf, NULL, 10);
284 }
285 }
286
287 /* XXX: need runtime selection. low for now due to FD_SETSIZE */
288 #define HIJACK_FDOFF 128
289 static int
290 fd_rump2host(int fd)
291 {
292
293 if (fd == -1)
294 return fd;
295
296 if (!ISDUP2D(fd))
297 fd += HIJACK_FDOFF;
298
299 return fd;
300 }
301
302 static int
303 fd_host2rump(int fd)
304 {
305
306 if (!ISDUP2D(fd))
307 fd -= HIJACK_FDOFF;
308 return fd;
309 }
310
311 static bool
312 fd_isrump(int fd)
313 {
314
315 return ISDUP2D(fd) || fd >= HIJACK_FDOFF;
316 }
317
/* assert that _fd_ is a rump kernel descriptor (same test as fd_isrump()) */
#define assertfd(_fd_) assert(fd_isrump(_fd_))
319
320 static int
321 dodup(int oldd, int minfd)
322 {
323 int (*op_fcntl)(int, int, ...);
324 int newd;
325 int isrump;
326
327 DPRINTF(("dup -> %d (minfd %d)\n", oldd, minfd));
328 if (fd_isrump(oldd)) {
329 op_fcntl = GETSYSCALL(rump, FCNTL);
330 oldd = fd_host2rump(oldd);
331 isrump = 1;
332 } else {
333 op_fcntl = GETSYSCALL(host, FCNTL);
334 isrump = 0;
335 }
336
337 newd = op_fcntl(oldd, F_DUPFD, minfd);
338
339 if (isrump)
340 newd = fd_rump2host(newd);
341 DPRINTF(("dup <- %d\n", newd));
342
343 return newd;
344 }
345
346 int __socket30(int, int, int);
347 int
348 __socket30(int domain, int type, int protocol)
349 {
350 int (*op_socket)(int, int, int);
351 int fd;
352 bool dohost;
353
354 dohost = hostlocalsockets && (domain == AF_LOCAL);
355
356 if (dohost)
357 op_socket = GETSYSCALL(host, SOCKET);
358 else
359 op_socket = GETSYSCALL(rump, SOCKET);
360 fd = op_socket(domain, type, protocol);
361
362 if (!dohost)
363 fd = fd_rump2host(fd);
364 DPRINTF(("socket <- %d\n", fd));
365
366 return fd;
367 }
368
369 int
370 accept(int s, struct sockaddr *addr, socklen_t *addrlen)
371 {
372 int (*op_accept)(int, struct sockaddr *, socklen_t *);
373 int fd;
374 bool isrump;
375
376 isrump = fd_isrump(s);
377
378 DPRINTF(("accept -> %d", s));
379 if (isrump) {
380 op_accept = GETSYSCALL(rump, ACCEPT);
381 s = fd_host2rump(s);
382 } else {
383 op_accept = GETSYSCALL(host, ACCEPT);
384 }
385 fd = op_accept(s, addr, addrlen);
386 if (fd != -1 && isrump)
387 fd = fd_rump2host(fd);
388
389 DPRINTF((" <- %d\n", fd));
390
391 return fd;
392 }
393
394 /*
395 * ioctl and fcntl are varargs calls and need special treatment
396 */
397 int
398 ioctl(int fd, unsigned long cmd, ...)
399 {
400 int (*op_ioctl)(int, unsigned long cmd, ...);
401 va_list ap;
402 int rv;
403
404 DPRINTF(("ioctl -> %d\n", fd));
405 if (fd_isrump(fd)) {
406 fd = fd_host2rump(fd);
407 op_ioctl = GETSYSCALL(rump, IOCTL);
408 } else {
409 op_ioctl = GETSYSCALL(host, IOCTL);
410 }
411
412 va_start(ap, cmd);
413 rv = op_ioctl(fd, cmd, va_arg(ap, void *));
414 va_end(ap);
415 return rv;
416 }
417
418 #include <syslog.h>
419 int
420 fcntl(int fd, int cmd, ...)
421 {
422 int (*op_fcntl)(int, int, ...);
423 va_list ap;
424 int rv, minfd, i;
425
426 DPRINTF(("fcntl -> %d (cmd %d)\n", fd, cmd));
427
428 switch (cmd) {
429 case F_DUPFD:
430 va_start(ap, cmd);
431 minfd = va_arg(ap, int);
432 va_end(ap);
433 return dodup(fd, minfd);
434
435 case F_CLOSEM:
436 /*
437 * So, if fd < HIJACKOFF, we want to do a host closem.
438 */
439
440 if (fd < HIJACK_FDOFF) {
441 int closemfd = fd;
442
443 if (rumpclient__closenotify(&closemfd,
444 RUMPCLIENT_CLOSE_FCLOSEM) == -1)
445 return -1;
446 op_fcntl = GETSYSCALL(host, FCNTL);
447 rv = op_fcntl(closemfd, cmd);
448 if (rv)
449 return rv;
450 }
451
452 /*
453 * Additionally, we want to do a rump closem, but only
454 * for the file descriptors not within the dup2mask.
455 */
456
457 /* why don't we offer fls()? */
458 for (i = 31; i >= 0; i--) {
459 if (dup2mask & 1<<i)
460 break;
461 }
462
463 if (fd >= HIJACK_FDOFF)
464 fd -= HIJACK_FDOFF;
465 else
466 fd = 0;
467 fd = MAX(i+1, fd);
468
469 /* hmm, maybe we should close rump fd's not within dup2mask? */
470
471 return rump_sys_fcntl(fd, F_CLOSEM);
472
473 case F_MAXFD:
474 /*
475 * For maxfd, if there's a rump kernel fd, return
476 * it hostified. Otherwise, return host's MAXFD
477 * return value.
478 */
479 if ((rv = rump_sys_fcntl(fd, F_MAXFD)) != -1) {
480 /*
481 * This might go a little wrong in case
482 * of dup2 to [012], but I'm not sure if
483 * there's a justification for tracking
484 * that info. Consider e.g.
485 * dup2(rumpfd, 2) followed by rump_sys_open()
486 * returning 1. We should return 1+HIJACKOFF,
487 * not 2+HIJACKOFF. However, if [01] is not
488 * open, the correct return value is 2.
489 */
490 return fd_rump2host(fd);
491 } else {
492 op_fcntl = GETSYSCALL(host, FCNTL);
493 return op_fcntl(fd, F_MAXFD);
494 }
495 /*NOTREACHED*/
496
497 default:
498 if (fd_isrump(fd)) {
499 fd = fd_host2rump(fd);
500 op_fcntl = GETSYSCALL(rump, FCNTL);
501 } else {
502 op_fcntl = GETSYSCALL(host, FCNTL);
503 }
504
505 va_start(ap, cmd);
506 rv = op_fcntl(fd, cmd, va_arg(ap, void *));
507 va_end(ap);
508 return rv;
509 }
510 /*NOTREACHED*/
511 }
512
513 int
514 close(int fd)
515 {
516 int (*op_close)(int);
517 int rv;
518
519 DPRINTF(("close -> %d\n", fd));
520 if (fd_isrump(fd)) {
521 int undup2 = 0;
522
523 if (ISDUP2D(fd))
524 undup2 = 1;
525 fd = fd_host2rump(fd);
526 op_close = GETSYSCALL(rump, CLOSE);
527 rv = op_close(fd);
528 if (rv == 0 && undup2)
529 CLRDUP2(fd);
530 } else {
531 if (rumpclient__closenotify(&fd, RUMPCLIENT_CLOSE_CLOSE) == -1)
532 return -1;
533 op_close = GETSYSCALL(host, CLOSE);
534 rv = op_close(fd);
535 }
536
537 return rv;
538 }
539
540 /*
541 * write cannot issue a standard debug printf due to recursion
542 */
543 ssize_t
544 write(int fd, const void *buf, size_t blen)
545 {
546 ssize_t (*op_write)(int, const void *, size_t);
547
548 if (fd_isrump(fd)) {
549 fd = fd_host2rump(fd);
550 op_write = GETSYSCALL(rump, WRITE);
551 } else {
552 op_write = GETSYSCALL(host, WRITE);
553 }
554
555 return op_write(fd, buf, blen);
556 }
557
558 /*
559 * dup2 is special. we allow dup2 of a rump kernel fd to 0-2 since
560 * many programs do that. dup2 of a rump kernel fd to another value
561 * not >= fdoff is an error.
562 *
563 * Note: cannot rump2host newd, because it is often hardcoded.
564 */
565 int
566 dup2(int oldd, int newd)
567 {
568 int (*host_dup2)(int, int);
569 int rv;
570
571 DPRINTF(("dup2 -> %d (o) -> %d (n)\n", oldd, newd));
572
573 if (fd_isrump(oldd)) {
574 if (!(newd >= 0 && newd <= 2))
575 return EBADF;
576 oldd = fd_host2rump(oldd);
577 rv = rump_sys_dup2(oldd, newd);
578 if (rv != -1)
579 SETDUP2(newd);
580 } else {
581 host_dup2 = syscalls[DUALCALL_DUP2].bs_host;
582 if (rumpclient__closenotify(&newd, RUMPCLIENT_CLOSE_DUP2) == -1)
583 return -1;
584 rv = host_dup2(oldd, newd);
585 }
586
587 return rv;
588 }
589
/* dup(): same as F_DUPFD with the lowest acceptable descriptor being 0 */
int
dup(int oldd)
{
	const int lowest = 0;

	return dodup(oldd, lowest);
}
596
597 pid_t
598 fork()
599 {
600 pid_t rv;
601
602 DPRINTF(("fork\n"));
603
604 rv = rumpclient_fork(host_fork);
605
606 DPRINTF(("fork returns %d\n", rv));
607 return rv;
608 }
609
/*
 * daemon(): bracket the host's daemon() with the rumpclient
 * prefork / fork_init pair.  Returns 0 on success and -1 if any of
 * the three steps fails.
 */
int
daemon(int nochdir, int noclose)
{
	struct rumpclient_fork *forkstate;

	forkstate = rumpclient_prefork();
	if (forkstate == NULL)
		return -1;

	if (host_daemon(nochdir, noclose) == -1)
		return -1;

	return rumpclient_fork_init(forkstate) == -1 ? -1 : 0;
}
626
627 int
628 execve(const char *path, char *const argv[], char *const envp[])
629 {
630 char buf[128];
631 char *dup2str;
632 char **newenv;
633 size_t nelem;
634 int rv, sverrno;
635
636 snprintf(buf, sizeof(buf), "RUMPHIJACK__DUP2MASK=%u", dup2mask);
637 dup2str = malloc(strlen(buf)+1);
638 if (dup2str == NULL)
639 return ENOMEM;
640 strcpy(dup2str, buf);
641
642 for (nelem = 0; envp && envp[nelem]; nelem++)
643 continue;
644 newenv = malloc(sizeof(*newenv) * nelem+2);
645 if (newenv == NULL) {
646 free(dup2str);
647 return ENOMEM;
648 }
649 memcpy(newenv, envp, nelem*sizeof(*newenv));
650 newenv[nelem] = dup2str;
651 newenv[nelem+1] = NULL;
652
653 rv = rumpclient_exec(path, argv, newenv);
654
655 _DIAGASSERT(rv != 0);
656 sverrno = errno;
657 free(newenv);
658 free(dup2str);
659 errno = sverrno;
660 return rv;
661 }
662
663 /*
664 * select is done by calling poll.
665 */
/*
 * select(): convert the descriptor sets into a pollfd vector, run
 * the hijacked pollts on it and convert the results back.
 *
 * Returns the pollts return value.  On error (including failure to
 * allocate the vector) the descriptor sets are left untouched; on
 * timeout and success they are cleared and, for success, refilled
 * from the poll results.
 */
int
REALSELECT(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds,
	struct timeval *timeout)
{
	struct timespec tsbuf, *tsarg;
	struct pollfd *pv;
	nfds_t npoll, k;
	int fd, slot, rv;

	DPRINTF(("select\n"));

	/*
	 * Count the descriptors present in at least one set, so that
	 * the vector handed to poll contains only those (see the nb5
	 * remark: poll used to refuse nfds > process_maxopen_fds).
	 */
	npoll = 0;
	for (fd = 0; fd < nfds; fd++) {
		if ((readfds && FD_ISSET(fd, readfds)) ||
		    (writefds && FD_ISSET(fd, writefds)) ||
		    (exceptfds && FD_ISSET(fd, exceptfds)))
			npoll++;
	}

	pv = NULL;
	if (npoll != 0) {
		pv = calloc(npoll, sizeof(*pv));
		if (pv == NULL)
			return -1;
	}

	/* one vector slot per descriptor, with the union of its events */
	slot = 0;
	for (fd = 0; fd < nfds; fd++) {
		short want = 0;

		if (readfds && FD_ISSET(fd, readfds))
			want |= POLLIN;
		if (writefds && FD_ISSET(fd, writefds))
			want |= POLLOUT;
		if (exceptfds && FD_ISSET(fd, exceptfds))
			want |= POLLHUP|POLLERR;
		if (want == 0)
			continue;

		pv[slot].fd = fd;
		pv[slot].events = want;
		slot++;
	}
	assert(slot == (int)npoll);

	tsarg = NULL;
	if (timeout) {
		TIMEVAL_TO_TIMESPEC(timeout, &tsbuf);
		tsarg = &tsbuf;
	}

	rv = REALPOLLTS(pv, npoll, tsarg, NULL);

	/*
	 * "If select() returns with an error the descriptor sets
	 * will be unmodified", hence only touch them for rv >= 0.
	 */
	if (rv >= 0) {
		/*
		 * Clear the sets bit by bit (FD_ZERO is not usable
		 * here).  This is done for the timeout case as well,
		 * since some software ignores select's return value.
		 */
		for (fd = 0; fd < nfds; fd++) {
			if (readfds)
				FD_CLR(fd, readfds);
			if (writefds)
				FD_CLR(fd, writefds);
			if (exceptfds)
				FD_CLR(fd, exceptfds);
		}

		/* activity on >0 descriptors: harvest the results */
		if (rv > 0) {
			for (k = 0; k < npoll; k++) {
				const short got = pv[k].revents;

				if (readfds && (got & POLLIN))
					FD_SET(pv[k].fd, readfds);
				if (writefds && (got & POLLOUT))
					FD_SET(pv[k].fd, writefds);
				if (exceptfds && (got & (POLLHUP|POLLERR)))
					FD_SET(pv[k].fd, exceptfds);
			}
		}
	}

	free(pv);
	return rv;
}
787
/*
 * Count how many entries of fds[0..nfds) refer to the host and how
 * many to the rump kernel.  Entries with fd == -1 are not counted.
 * The counters are incremented, not reset.
 */
static void
checkpoll(struct pollfd *fds, nfds_t nfds, int *hostcall, int *rumpcall)
{
	nfds_t idx = 0;

	while (idx < nfds) {
		const int cur = fds[idx++].fd;
		int *counter;

		if (cur == -1)
			continue;

		counter = fd_isrump(cur) ? rumpcall : hostcall;
		(*counter)++;
	}
}
803
/*
 * Replace the descriptor of every entry in fds[0..nfds) with
 * fdadj(descriptor).
 */
static void
adjustpoll(struct pollfd *fds, nfds_t nfds, int (*fdadj)(int))
{

	while (nfds-- > 0) {
		fds->fd = fdadj(fds->fd);
		fds++;
	}
}
813
814 /*
 * poll is easy as long as all the fds in the call belong to only one
 * kernel.  otherwise it's quite tricky...
817 */
818 struct pollarg {
819 struct pollfd *pfds;
820 nfds_t nfds;
821 const struct timespec *ts;
822 const sigset_t *sigmask;
823 int pipefd;
824 int errnum;
825 };
826
827 static void *
828 hostpoll(void *arg)
829 {
830 int (*op_pollts)(struct pollfd *, nfds_t, const struct timespec *,
831 const sigset_t *);
832 struct pollarg *parg = arg;
833 intptr_t rv;
834
835 op_pollts = GETSYSCALL(host, POLLTS);
836 rv = op_pollts(parg->pfds, parg->nfds, parg->ts, parg->sigmask);
837 if (rv == -1)
838 parg->errnum = errno;
839 rump_sys_write(parg->pipefd, &rv, sizeof(rv));
840
841 return (void *)(intptr_t)rv;
842 }
843
844 int
845 REALPOLLTS(struct pollfd *fds, nfds_t nfds, const struct timespec *ts,
846 const sigset_t *sigmask)
847 {
848 int (*op_pollts)(struct pollfd *, nfds_t, const struct timespec *,
849 const sigset_t *);
850 int (*host_close)(int);
851 int hostcall = 0, rumpcall = 0;
852 pthread_t pt;
853 nfds_t i;
854 int rv;
855
856 DPRINTF(("poll\n"));
857 checkpoll(fds, nfds, &hostcall, &rumpcall);
858
859 if (hostcall && rumpcall) {
860 struct pollfd *pfd_host = NULL, *pfd_rump = NULL;
861 int rpipe[2] = {-1,-1}, hpipe[2] = {-1,-1};
862 struct pollarg parg;
863 uintptr_t lrv;
864 int sverrno = 0, trv;
865
866 /*
867 * ok, this is where it gets tricky. We must support
868 * this since it's a very common operation in certain
869 * types of software (telnet, netcat, etc). We allocate
870 * two vectors and run two poll commands in separate
871 * threads. Whichever returns first "wins" and the
872 * other kernel's fds won't show activity.
873 */
874 rv = -1;
875
876 /* allocate full vector for O(n) joining after call */
877 pfd_host = malloc(sizeof(*pfd_host)*(nfds+1));
878 if (!pfd_host)
879 goto out;
880 pfd_rump = malloc(sizeof(*pfd_rump)*(nfds+1));
881 if (!pfd_rump) {
882 goto out;
883 }
884
885 /* split vectors */
886 for (i = 0; i < nfds; i++) {
887 if (fds[i].fd == -1) {
888 pfd_host[i].fd = -1;
889 pfd_rump[i].fd = -1;
890 } else if (fd_isrump(fds[i].fd)) {
891 pfd_host[i].fd = -1;
892 pfd_rump[i].fd = fd_host2rump(fds[i].fd);
893 pfd_rump[i].events = fds[i].events;
894 } else {
895 pfd_rump[i].fd = -1;
896 pfd_host[i].fd = fds[i].fd;
897 pfd_host[i].events = fds[i].events;
898 }
899 pfd_rump[i].revents = pfd_host[i].revents = 0;
900 fds[i].revents = 0;
901 }
902
903 /*
904 * then, open two pipes, one for notifications
905 * to each kernel.
906 */
907 if (rump_sys_pipe(rpipe) == -1)
908 goto out;
909 if (pipe(hpipe) == -1)
910 goto out;
911
912 pfd_host[nfds].fd = hpipe[0];
913 pfd_host[nfds].events = POLLIN;
914 pfd_rump[nfds].fd = rpipe[0];
915 pfd_rump[nfds].events = POLLIN;
916
917 /*
918 * then, create a thread to do host part and meanwhile
919 * do rump kernel part right here
920 */
921
922 parg.pfds = pfd_host;
923 parg.nfds = nfds+1;
924 parg.ts = ts;
925 parg.sigmask = sigmask;
926 parg.pipefd = rpipe[1];
927 pthread_create(&pt, NULL, hostpoll, &parg);
928
929 op_pollts = GETSYSCALL(rump, POLLTS);
930 lrv = op_pollts(pfd_rump, nfds+1, ts, NULL);
931 sverrno = errno;
932 write(hpipe[1], &rv, sizeof(rv));
933 pthread_join(pt, (void *)&trv);
934
935 /* check who "won" and merge results */
936 if (lrv != 0 && pfd_host[nfds].revents & POLLIN) {
937 rv = trv;
938
939 for (i = 0; i < nfds; i++) {
940 if (pfd_rump[i].fd != -1)
941 fds[i].revents = pfd_rump[i].revents;
942 }
943 sverrno = parg.errnum;
944 } else if (trv != 0 && pfd_rump[nfds].revents & POLLIN) {
945 rv = trv;
946
947 for (i = 0; i < nfds; i++) {
948 if (pfd_host[i].fd != -1)
949 fds[i].revents = pfd_host[i].revents;
950 }
951 } else {
952 rv = 0;
953 }
954
955 out:
956 host_close = GETSYSCALL(host, CLOSE);
957 if (rpipe[0] != -1)
958 rump_sys_close(rpipe[0]);
959 if (rpipe[1] != -1)
960 rump_sys_close(rpipe[1]);
961 if (hpipe[0] != -1)
962 host_close(hpipe[0]);
963 if (hpipe[1] != -1)
964 host_close(hpipe[1]);
965 free(pfd_host);
966 free(pfd_rump);
967 errno = sverrno;
968 } else {
969 if (hostcall) {
970 op_pollts = GETSYSCALL(host, POLLTS);
971 } else {
972 op_pollts = GETSYSCALL(rump, POLLTS);
973 adjustpoll(fds, nfds, fd_host2rump);
974 }
975
976 rv = op_pollts(fds, nfds, ts, sigmask);
977 if (rumpcall)
978 adjustpoll(fds, nfds, fd_rump2host);
979 }
980
981 return rv;
982 }
983
984 int
985 poll(struct pollfd *fds, nfds_t nfds, int timeout)
986 {
987 struct timespec ts;
988 struct timespec *tsp = NULL;
989
990 if (timeout != INFTIM) {
991 ts.tv_sec = timeout / 1000;
992 ts.tv_nsec = (timeout % 1000) * 1000*1000;
993
994 tsp = &ts;
995 }
996
997 return REALPOLLTS(fds, nfds, tsp, NULL);
998 }
999
1000 int
1001 REALKEVENT(int kq, const struct kevent *changelist, size_t nchanges,
1002 struct kevent *eventlist, size_t nevents,
1003 const struct timespec *timeout)
1004 {
1005 int (*op_kevent)(int, const struct kevent *, size_t,
1006 struct kevent *, size_t, const struct timespec *);
1007 const struct kevent *ev;
1008 size_t i;
1009
1010 /*
1011 * Check that we don't attempt to kevent rump kernel fd's.
1012 * That needs similar treatment to select/poll, but is slightly
1013 * trickier since we need to manage to different kq descriptors.
1014 * (TODO, in case you're wondering).
1015 */
1016 for (i = 0; i < nchanges; i++) {
1017 ev = &changelist[i];
1018 if (ev->filter == EVFILT_READ || ev->filter == EVFILT_WRITE ||
1019 ev->filter == EVFILT_VNODE) {
1020 if (fd_isrump((int)ev->ident))
1021 return ENOTSUP;
1022 }
1023 }
1024
1025 op_kevent = GETSYSCALL(host, KEVENT);
1026 return op_kevent(kq, changelist, nchanges, eventlist, nevents, timeout);
1027 }
1028
1029 /*
1030 * Rest are std type calls.
1031 */
1032
1033 FDCALL(int, bind, DUALCALL_BIND, \
1034 (int fd, const struct sockaddr *name, socklen_t namelen), \
1035 (int, const struct sockaddr *, socklen_t), \
1036 (fd, name, namelen))
1037
1038 FDCALL(int, connect, DUALCALL_CONNECT, \
1039 (int fd, const struct sockaddr *name, socklen_t namelen), \
1040 (int, const struct sockaddr *, socklen_t), \
1041 (fd, name, namelen))
1042
1043 FDCALL(int, getpeername, DUALCALL_GETPEERNAME, \
1044 (int fd, struct sockaddr *name, socklen_t *namelen), \
1045 (int, struct sockaddr *, socklen_t *), \
1046 (fd, name, namelen))
1047
1048 FDCALL(int, getsockname, DUALCALL_GETSOCKNAME, \
1049 (int fd, struct sockaddr *name, socklen_t *namelen), \
1050 (int, struct sockaddr *, socklen_t *), \
1051 (fd, name, namelen))
1052
1053 FDCALL(int, listen, DUALCALL_LISTEN, \
1054 (int fd, int backlog), \
1055 (int, int), \
1056 (fd, backlog))
1057
1058 FDCALL(ssize_t, recvfrom, DUALCALL_RECVFROM, \
1059 (int fd, void *buf, size_t len, int flags, \
1060 struct sockaddr *from, socklen_t *fromlen), \
1061 (int, void *, size_t, int, struct sockaddr *, socklen_t *), \
1062 (fd, buf, len, flags, from, fromlen))
1063
1064 FDCALL(ssize_t, sendto, DUALCALL_SENDTO, \
1065 (int fd, const void *buf, size_t len, int flags, \
1066 const struct sockaddr *to, socklen_t tolen), \
1067 (int, const void *, size_t, int, \
1068 const struct sockaddr *, socklen_t), \
1069 (fd, buf, len, flags, to, tolen))
1070
1071 FDCALL(ssize_t, recvmsg, DUALCALL_RECVMSG, \
1072 (int fd, struct msghdr *msg, int flags), \
1073 (int, struct msghdr *, int), \
1074 (fd, msg, flags))
1075
1076 FDCALL(ssize_t, sendmsg, DUALCALL_SENDMSG, \
1077 (int fd, const struct msghdr *msg, int flags), \
1078 (int, const struct msghdr *, int), \
1079 (fd, msg, flags))
1080
1081 FDCALL(int, getsockopt, DUALCALL_GETSOCKOPT, \
1082 (int fd, int level, int optn, void *optval, socklen_t *optlen), \
1083 (int, int, int, void *, socklen_t *), \
1084 (fd, level, optn, optval, optlen))
1085
1086 FDCALL(int, setsockopt, DUALCALL_SETSOCKOPT, \
1087 (int fd, int level, int optn, \
1088 const void *optval, socklen_t optlen), \
1089 (int, int, int, const void *, socklen_t), \
1090 (fd, level, optn, optval, optlen))
1091
1092 FDCALL(int, shutdown, DUALCALL_SHUTDOWN, \
1093 (int fd, int how), \
1094 (int, int), \
1095 (fd, how))
1096
#if !(_FORTIFY_SOURCE > 0)
#define STUB(fun) fun
#else
/*
 * With _FORTIFY_SOURCE, provide the _hijack_-prefixed variants named
 * by __ssp_weak_name(); they simply call the _sys_ versions.
 */
#define STUB(fun) __ssp_weak_name(fun)
ssize_t _sys_readlink(const char * __restrict, char * __restrict, size_t);
ssize_t
STUB(readlink)(const char * __restrict path, char * __restrict buf,
	size_t bufsiz)
{
	ssize_t len;

	len = _sys_readlink(path, buf, bufsiz);
	return len;
}

char *_sys_getcwd(char *, size_t);
char *
STUB(getcwd)(char *buf, size_t size)
{
	char *cwd;

	cwd = _sys_getcwd(buf, size);
	return cwd;
}
#endif
1116
1117 FDCALL(ssize_t, REALREAD, DUALCALL_READ, \
1118 (int fd, void *buf, size_t buflen), \
1119 (int, void *, size_t), \
1120 (fd, buf, buflen))
1121
1122 FDCALL(ssize_t, readv, DUALCALL_READV, \
1123 (int fd, const struct iovec *iov, int iovcnt), \
1124 (int, const struct iovec *, int), \
1125 (fd, iov, iovcnt))
1126
1127 FDCALL(ssize_t, writev, DUALCALL_WRITEV, \
1128 (int fd, const struct iovec *iov, int iovcnt), \
1129 (int, const struct iovec *, int), \
1130 (fd, iov, iovcnt))
1131