/*	$NetBSD: hijack.c,v 1.36 2011/02/11 12:46:41 pooka Exp $	*/

/*-
 * Copyright (c) 2011 Antti Kantee.  All Rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__RCSID("$NetBSD: hijack.c,v 1.36 2011/02/11 12:46:41 pooka Exp $");

#define __ssp_weak_name(fun) _hijack_ ## fun

#include <sys/param.h>
#include <sys/types.h>
#include <sys/event.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/poll.h>

#include <rump/rumpclient.h>
#include <rump/rump_syscalls.h>

#include <assert.h>
#include <dlfcn.h>
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <pthread.h>
#include <signal.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
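/*
 * Each hijacked system call exists in two flavors: the host libc
 * version and the rump kernel version.  The enum below indexes the
 * table of resolved function pointers (struct bothsys further down);
 * which flavor gets invoked is decided per call, normally based on
 * the file descriptor involved.
 */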
enum dualcall {
        DUALCALL_WRITE, DUALCALL_WRITEV,
        DUALCALL_IOCTL, DUALCALL_FCNTL,
        DUALCALL_SOCKET, DUALCALL_ACCEPT, DUALCALL_BIND, DUALCALL_CONNECT,
        DUALCALL_GETPEERNAME, DUALCALL_GETSOCKNAME, DUALCALL_LISTEN,
        DUALCALL_RECVFROM, DUALCALL_RECVMSG,
        DUALCALL_SENDTO, DUALCALL_SENDMSG,
        DUALCALL_GETSOCKOPT, DUALCALL_SETSOCKOPT,
        DUALCALL_SHUTDOWN,
        DUALCALL_READ, DUALCALL_READV,
        DUALCALL_DUP, DUALCALL_DUP2,
        DUALCALL_CLOSE,
        DUALCALL_POLLTS,
        DUALCALL_KEVENT,
        DUALCALL__NUM
};

#define RSYS_STRING(a) __STRING(a)
#define RSYS_NAME(a) RSYS_STRING(__CONCAT(RUMP_SYS_RENAME_,a))

/*
 * Would be nice to get this automatically in sync with libc.
 * Also, this does not work for compat-using binaries!
 */
#if !__NetBSD_Prereq__(5,99,7)
#define REALSELECT select
#define REALPOLLTS pollts
#define REALKEVENT kevent
#else
#define REALSELECT _sys___select50
#define REALPOLLTS _sys___pollts50
#define REALKEVENT _sys___kevent50
#endif
#define REALREAD _sys_read

int REALSELECT(int, fd_set *, fd_set *, fd_set *, struct timeval *);
int REALPOLLTS(struct pollfd *, nfds_t,
    const struct timespec *, const sigset_t *);
int REALKEVENT(int, const struct kevent *, size_t, struct kevent *, size_t,
    const struct timespec *);
ssize_t REALREAD(int, void *, size_t);

#define S(a) __STRING(a)
struct sysnames {
        enum dualcall scm_callnum;
        const char *scm_hostname;
        const char *scm_rumpname;
} syscnames[] = {
        { DUALCALL_SOCKET,      "__socket30",   RSYS_NAME(SOCKET)       },
        { DUALCALL_ACCEPT,      "accept",       RSYS_NAME(ACCEPT)       },
        { DUALCALL_BIND,        "bind",         RSYS_NAME(BIND)         },
        { DUALCALL_CONNECT,     "connect",      RSYS_NAME(CONNECT)      },
        { DUALCALL_GETPEERNAME, "getpeername",  RSYS_NAME(GETPEERNAME)  },
        { DUALCALL_GETSOCKNAME, "getsockname",  RSYS_NAME(GETSOCKNAME)  },
        { DUALCALL_LISTEN,      "listen",       RSYS_NAME(LISTEN)       },
        { DUALCALL_RECVFROM,    "recvfrom",     RSYS_NAME(RECVFROM)     },
        { DUALCALL_RECVMSG,     "recvmsg",      RSYS_NAME(RECVMSG)      },
        { DUALCALL_SENDTO,      "sendto",       RSYS_NAME(SENDTO)       },
        { DUALCALL_SENDMSG,     "sendmsg",      RSYS_NAME(SENDMSG)      },
        { DUALCALL_GETSOCKOPT,  "getsockopt",   RSYS_NAME(GETSOCKOPT)   },
        { DUALCALL_SETSOCKOPT,  "setsockopt",   RSYS_NAME(SETSOCKOPT)   },
        { DUALCALL_SHUTDOWN,    "shutdown",     RSYS_NAME(SHUTDOWN)     },
        { DUALCALL_READ,        S(REALREAD),    RSYS_NAME(READ)         },
        { DUALCALL_READV,       "readv",        RSYS_NAME(READV)        },
        { DUALCALL_WRITE,       "write",        RSYS_NAME(WRITE)        },
        { DUALCALL_WRITEV,      "writev",       RSYS_NAME(WRITEV)       },
        { DUALCALL_IOCTL,       "ioctl",        RSYS_NAME(IOCTL)        },
        { DUALCALL_FCNTL,       "fcntl",        RSYS_NAME(FCNTL)        },
        { DUALCALL_DUP,         "dup",          RSYS_NAME(DUP)          },
        { DUALCALL_DUP2,        "dup2",         RSYS_NAME(DUP2)         },
        { DUALCALL_CLOSE,       "close",        RSYS_NAME(CLOSE)        },
        { DUALCALL_POLLTS,      S(REALPOLLTS),  RSYS_NAME(POLLTS)       },
        { DUALCALL_KEVENT,      S(REALKEVENT),  RSYS_NAME(KEVENT)       },
};
#undef S

struct bothsys {
        void *bs_host;
        void *bs_rump;
} syscalls[DUALCALL__NUM];
#define GETSYSCALL(which, name) syscalls[DUALCALL_##name].bs_##which

pid_t (*host_fork)(void);
int (*host_daemon)(int, int);

static unsigned dup2mask;
#define ISDUP2D(fd) (1<<(fd) & dup2mask)
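/*
 * dup2mask records which of the low file descriptors (in practice
 * only 0-2, see dup2() below) currently refer to rump kernel
 * descriptors.  Those descriptors are exempt from the usual
 * "offset by HIJACK_FDOFF" mapping done by fd_rump2host()/fd_host2rump().
 */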

//#define DEBUGJACK
#ifdef DEBUGJACK
#define DPRINTF(x) mydprintf x
static void
mydprintf(const char *fmt, ...)
{
        va_list ap;

        if (ISDUP2D(STDERR_FILENO))
                return;

        va_start(ap, fmt);
        vfprintf(stderr, fmt, ap);
        va_end(ap);
}

#else
#define DPRINTF(x)
#endif

#define FDCALL(type, name, rcname, args, proto, vars)                  \
type name args                                                         \
{                                                                      \
        type (*fun) proto;                                             \
                                                                       \
        DPRINTF(("%s -> %d\n", __STRING(name), fd));                   \
        if (fd_isrump(fd)) {                                           \
                fun = syscalls[rcname].bs_rump;                        \
                fd = fd_host2rump(fd);                                 \
        } else {                                                       \
                fun = syscalls[rcname].bs_host;                        \
        }                                                              \
                                                                       \
        return fun vars;                                               \
}
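/*
 * As a rough illustration, FDCALL(int, listen, DUALCALL_LISTEN, ...)
 * near the bottom of this file expands to approximately:
 *
 *      int
 *      listen(int fd, int backlog)
 *      {
 *              int (*fun)(int, int);
 *
 *              if (fd_isrump(fd)) {
 *                      fun = syscalls[DUALCALL_LISTEN].bs_rump;
 *                      fd = fd_host2rump(fd);
 *              } else {
 *                      fun = syscalls[DUALCALL_LISTEN].bs_host;
 *              }
 *              return fun(fd, backlog);
 *      }
 */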

/*
 * This is called from librumpclient in case of LD_PRELOAD.
 * It ensures correct RTLD_NEXT.
 *
 * ... except, it's apparently extremely difficult to force
 * at least gcc to generate an actual stack frame here.  So
 * sprinkle some volatile foobar and baz to throw the optimizer
 * off the scent and generate a variable assignment with the
 * return value.  The posterboy for this meltdown is amd64
 * with -O2.  At least with gcc 4.1.3 i386 works regardless of
 * optimization.
 */
volatile int rumphijack_unrope; /* there, unhang yourself */
static void *
hijackdlsym(void *handle, const char *symbol)
{
        void *rv;

        rv = dlsym(handle, symbol);
        rumphijack_unrope = *(volatile int *)rv;

        return (void *)rv;
}

/* low calorie sockets? */
static bool hostlocalsockets = true;

static void __attribute__((constructor))
rcinit(void)
{
        char buf[64];
        extern void *(*rumpclient_dlsym)(void *, const char *);
        unsigned i, j;

        rumpclient_dlsym = hijackdlsym;
        host_fork = dlsym(RTLD_NEXT, "fork");
        host_daemon = dlsym(RTLD_NEXT, "daemon");

        /*
         * In theory we cannot print anything during the lookups, because
         * the call vectors might not be set up yet.  So the errx() calls
         * below are a bit of a stretch, but they might work.
         */

        for (i = 0; i < DUALCALL__NUM; i++) {
                /* build runtime O(1) access */
                for (j = 0; j < __arraycount(syscnames); j++) {
                        if (syscnames[j].scm_callnum == i)
                                break;
                }

                if (j == __arraycount(syscnames))
                        errx(1, "rumphijack error: syscall pos %d missing", i);

                syscalls[i].bs_host = dlsym(RTLD_NEXT,
                    syscnames[j].scm_hostname);
                if (syscalls[i].bs_host == NULL)
                        errx(1, "hostcall %s not found",
                            syscnames[j].scm_hostname);

                syscalls[i].bs_rump = dlsym(RTLD_NEXT,
                    syscnames[j].scm_rumpname);
                if (syscalls[i].bs_rump == NULL)
                        errx(1, "rumpcall %s not found",
                            syscnames[j].scm_rumpname);
        }

        if (rumpclient_init() == -1)
                err(1, "rumpclient init");

        /* set client persistence level */
        if (getenv_r("RUMPHIJACK_RETRY", buf, sizeof(buf)) == -1) {
                if (errno == ERANGE)
                        err(1, "invalid RUMPHIJACK_RETRY");
                rumpclient_setconnretry(RUMPCLIENT_RETRYCONN_INFTIME);
        } else {
                if (strcmp(buf, "die") == 0)
                        rumpclient_setconnretry(RUMPCLIENT_RETRYCONN_DIE);
                else if (strcmp(buf, "inftime") == 0)
                        rumpclient_setconnretry(RUMPCLIENT_RETRYCONN_INFTIME);
                else if (strcmp(buf, "once") == 0)
                        rumpclient_setconnretry(RUMPCLIENT_RETRYCONN_ONCE);
                else {
                        time_t timeout;

                        timeout = (time_t)strtoll(buf, NULL, 10);
                        if (timeout <= 0)
                                errx(1, "RUMPHIJACK_RETRY must be keyword "
                                    "or a positive integer, got: %s", buf);

                        rumpclient_setconnretry(timeout);
                }
        }
}
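/*
 * The connection retry policy above is controlled by the
 * RUMPHIJACK_RETRY environment variable: "die", "inftime", "once",
 * or a positive number of seconds; if unset, reconnection is retried
 * indefinitely.  As a purely illustrative example of running a
 * hijacked program (the server address and program are hypothetical):
 *
 *      $ export RUMP_SERVER=unix:///tmp/rumpserver
 *      $ RUMPHIJACK_RETRY=once LD_PRELOAD=librumphijack.so ./netcat ...
 *
 * (RUMP_SERVER is consumed by librumpclient, not by this file.)
 */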

/* XXX: need runtime selection.  low for now due to FD_SETSIZE */
#define HIJACK_FDOFF 128
#define HIJACK_ASSERT 128 /* XXX */
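/*
 * Descriptor mapping in a nutshell: a descriptor returned by the rump
 * kernel is presented to the application offset by HIJACK_FDOFF, so
 * e.g. rump fd 3 shows up as 131.  Anything at or above the offset is
 * therefore treated as a rump descriptor and anything below it as a
 * host descriptor, with the dup2'd fds 0-2 (see dup2mask) as the
 * exception.
 */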
static int
fd_rump2host(int fd)
{

        if (fd == -1)
                return fd;

        if (!ISDUP2D(fd))
                fd += HIJACK_FDOFF;

        return fd;
}

static int
fd_host2rump(int fd)
{

        if (!ISDUP2D(fd))
                fd -= HIJACK_FDOFF;
        return fd;
}

static bool
fd_isrump(int fd)
{

        return ISDUP2D(fd) || fd >= HIJACK_FDOFF;
}

#define assertfd(_fd_) assert(ISDUP2D(_fd_) || (_fd_) >= HIJACK_ASSERT)
#undef HIJACK_FDOFF

int __socket30(int, int, int);
int
__socket30(int domain, int type, int protocol)
{
        int (*op_socket)(int, int, int);
        int fd;
        bool dohost;

        dohost = hostlocalsockets && (domain == AF_LOCAL);

        if (dohost)
                op_socket = GETSYSCALL(host, SOCKET);
        else
                op_socket = GETSYSCALL(rump, SOCKET);
        fd = op_socket(domain, type, protocol);

        if (!dohost)
                fd = fd_rump2host(fd);
        DPRINTF(("socket <- %d\n", fd));

        return fd;
}
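/*
 * Note that, with hostlocalsockets set, AF_LOCAL sockets are created
 * in the host kernel above; all other domains go to the rump kernel.
 */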

int
accept(int s, struct sockaddr *addr, socklen_t *addrlen)
{
        int (*op_accept)(int, struct sockaddr *, socklen_t *);
        int fd;
        bool isrump;

        isrump = fd_isrump(s);

        DPRINTF(("accept -> %d", s));
        if (isrump) {
                op_accept = GETSYSCALL(rump, ACCEPT);
                s = fd_host2rump(s);
        } else {
                op_accept = GETSYSCALL(host, ACCEPT);
        }
        fd = op_accept(s, addr, addrlen);
        if (fd != -1 && isrump)
                fd = fd_rump2host(fd);

        DPRINTF((" <- %d\n", fd));

        return fd;
}

/*
 * ioctl and fcntl are varargs calls and need special treatment
 */
int
ioctl(int fd, unsigned long cmd, ...)
{
        int (*op_ioctl)(int, unsigned long cmd, ...);
        va_list ap;
        int rv;

        DPRINTF(("ioctl -> %d\n", fd));
        if (fd_isrump(fd)) {
                fd = fd_host2rump(fd);
                op_ioctl = GETSYSCALL(rump, IOCTL);
        } else {
                op_ioctl = GETSYSCALL(host, IOCTL);
        }

        va_start(ap, cmd);
        rv = op_ioctl(fd, cmd, va_arg(ap, void *));
        va_end(ap);
        return rv;
}


/* TODO: support F_DUPFD, F_CLOSEM, F_MAXFD */
int
fcntl(int fd, int cmd, ...)
{
        int (*op_fcntl)(int, int, ...);
        va_list ap;
        int rv;

        DPRINTF(("fcntl -> %d\n", fd));
        if (fd_isrump(fd)) {
                fd = fd_host2rump(fd);
                op_fcntl = GETSYSCALL(rump, FCNTL);
        } else {
                op_fcntl = GETSYSCALL(host, FCNTL);
        }

        va_start(ap, cmd);
        rv = op_fcntl(fd, cmd, va_arg(ap, void *));
        va_end(ap);
        return rv;
}

/*
 * write cannot issue a standard debug printf due to recursion
 */
ssize_t
write(int fd, const void *buf, size_t blen)
{
        ssize_t (*op_write)(int, const void *, size_t);

        if (fd_isrump(fd)) {
                fd = fd_host2rump(fd);
                op_write = GETSYSCALL(rump, WRITE);
        } else {
                op_write = GETSYSCALL(host, WRITE);
        }

        return op_write(fd, buf, blen);
}

/*
 * dup2 is special.  we allow dup2 of a rump kernel fd to 0-2 since
 * many programs do that.  dup2 of a rump kernel fd to another value
 * not >= fdoff is an error.
 *
 * Note: cannot rump2host newd, because it is often hardcoded.
 */
int
dup2(int oldd, int newd)
{
        int (*host_dup2)(int, int);
        int rv;

        DPRINTF(("dup2 -> %d (o) -> %d (n)\n", oldd, newd));

        if (fd_isrump(oldd)) {
                if (!(newd >= 0 && newd <= 2)) {
                        errno = EBADF;
                        return -1;
                }
                oldd = fd_host2rump(oldd);
                rv = rump_sys_dup2(oldd, newd);
                if (rv != -1)
                        dup2mask |= 1<<newd;
        } else {
                host_dup2 = syscalls[DUALCALL_DUP2].bs_host;
                rv = host_dup2(oldd, newd);
        }

        return rv;
}

int
dup(int oldd)
{
        int (*op_dup)(int);
        int newd;

        DPRINTF(("dup -> %d\n", oldd));
        if (fd_isrump(oldd)) {
                op_dup = GETSYSCALL(rump, DUP);
        } else {
                op_dup = GETSYSCALL(host, DUP);
        }

        newd = op_dup(oldd);

        if (fd_isrump(oldd))
                newd = fd_rump2host(newd);
        DPRINTF(("dup <- %d\n", newd));

        return newd;
}

/*
 * We just wrap fork with the appropriate rump client calls to preserve
 * the file descriptors of the forked parent in the child, but
 * prevent double use of the connection fd.
 */
pid_t
fork()
{
        struct rumpclient_fork *rf;
        pid_t rv;

        DPRINTF(("fork\n"));

        if ((rf = rumpclient_prefork()) == NULL)
                return -1;

        switch ((rv = host_fork())) {
        case -1:
                /* XXX: cancel rf */
                break;
        case 0:
                if (rumpclient_fork_init(rf) == -1)
                        rv = -1;
                break;
        default:
                break;
        }

        DPRINTF(("fork returns %d\n", rv));
        return rv;
}

int
daemon(int nochdir, int noclose)
{
        struct rumpclient_fork *rf;

        if ((rf = rumpclient_prefork()) == NULL)
                return -1;

        if (host_daemon(nochdir, noclose) == -1)
                return -1;

        if (rumpclient_fork_init(rf) == -1)
                return -1;

        return 0;
}
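/*
 * (daemon() forks internally on the host, so the same
 * prefork/fork_init dance as in fork() above is needed to keep the
 * rump kernel connection valid in the surviving process.)
 */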

/*
 * select is done by calling poll.
 */
int
REALSELECT(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds,
    struct timeval *timeout)
{
        struct pollfd *pfds;
        struct timespec ts, *tsp = NULL;
        nfds_t realnfds;
        int i, j;
        int rv, incr;

        DPRINTF(("select\n"));

        /*
         * Well, first we must scan the fds to figure out how many
         * fds there really are.  This is because up to and including
         * nb5 poll() silently refuses nfds > process_maxopen_fds.
         * Seems to be fixed in current, thank the maker.
         * god damn cluster...bomb.
         */
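        /*
         * The select-to-poll conversion below maps readfds to POLLIN,
         * writefds to POLLOUT and exceptfds to POLLHUP|POLLERR.  Host
         * and rump descriptors may be freely mixed here, since the
         * pollts implementation further down splits them as needed.
         */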

        for (i = 0, realnfds = 0; i < nfds; i++) {
                if (readfds && FD_ISSET(i, readfds)) {
                        realnfds++;
                        continue;
                }
                if (writefds && FD_ISSET(i, writefds)) {
                        realnfds++;
                        continue;
                }
                if (exceptfds && FD_ISSET(i, exceptfds)) {
                        realnfds++;
                        continue;
                }
        }

        if (realnfds) {
                pfds = malloc(sizeof(*pfds) * realnfds);
                if (!pfds)
                        return -1;
        } else {
                pfds = NULL;
        }

        /* skip the fill loop entirely if no descriptors were actually set */
        for (i = 0, j = 0; pfds && i < nfds; i++) {
                incr = 0;
                pfds[j].events = pfds[j].revents = 0;
                if (readfds && FD_ISSET(i, readfds)) {
                        pfds[j].fd = i;
                        pfds[j].events |= POLLIN;
                        incr = 1;
                }
                if (writefds && FD_ISSET(i, writefds)) {
                        pfds[j].fd = i;
                        pfds[j].events |= POLLOUT;
                        incr = 1;
                }
                if (exceptfds && FD_ISSET(i, exceptfds)) {
                        pfds[j].fd = i;
                        pfds[j].events |= POLLHUP|POLLERR;
                        incr = 1;
                }
                if (incr)
                        j++;
        }
        assert(j == realnfds);

        if (timeout) {
                TIMEVAL_TO_TIMESPEC(timeout, &ts);
                tsp = &ts;
        }
        rv = REALPOLLTS(pfds, realnfds, tsp, NULL);
        /*
         * "If select() returns with an error the descriptor sets
         * will be unmodified"
         */
        if (rv < 0)
                goto out;

        /*
         * zero out results (can't use FD_ZERO for the
         * obvious select-me-not reason).  whee.
         *
         * We do this here since some software ignores the return
         * value of select, and hence if the timeout expires, it may
         * assume all input descriptors have activity.
         */
        for (i = 0; i < nfds; i++) {
                if (readfds)
                        FD_CLR(i, readfds);
                if (writefds)
                        FD_CLR(i, writefds);
                if (exceptfds)
                        FD_CLR(i, exceptfds);
        }
        if (rv == 0)
                goto out;

        /*
         * We have >0 fds with activity.  Harvest the results.
         */
        for (i = 0; i < (int)realnfds; i++) {
                if (readfds) {
                        if (pfds[i].revents & POLLIN) {
                                FD_SET(pfds[i].fd, readfds);
                        }
                }
                if (writefds) {
                        if (pfds[i].revents & POLLOUT) {
                                FD_SET(pfds[i].fd, writefds);
                        }
                }
                if (exceptfds) {
                        if (pfds[i].revents & (POLLHUP|POLLERR)) {
                                FD_SET(pfds[i].fd, exceptfds);
                        }
                }
        }

 out:
        free(pfds);
        return rv;
}

static void
checkpoll(struct pollfd *fds, nfds_t nfds, int *hostcall, int *rumpcall)
{
        nfds_t i;

        for (i = 0; i < nfds; i++) {
                if (fds[i].fd == -1)
                        continue;

                if (fd_isrump(fds[i].fd))
                        (*rumpcall)++;
                else
                        (*hostcall)++;
        }
}

static void
adjustpoll(struct pollfd *fds, nfds_t nfds, int (*fdadj)(int))
{
        nfds_t i;

        for (i = 0; i < nfds; i++) {
                fds[i].fd = fdadj(fds[i].fd);
        }
}

/*
 * poll is easy as long as all the fds in one call live in a single
 * kernel.  Otherwise it's quite tricky...
 */
struct pollarg {
        struct pollfd *pfds;
        nfds_t nfds;
        const struct timespec *ts;
        const sigset_t *sigmask;
        int pipefd;
        int errnum;
};

static void *
hostpoll(void *arg)
{
        int (*op_pollts)(struct pollfd *, nfds_t, const struct timespec *,
            const sigset_t *);
        struct pollarg *parg = arg;
        intptr_t rv;

        op_pollts = GETSYSCALL(host, POLLTS);
        rv = op_pollts(parg->pfds, parg->nfds, parg->ts, parg->sigmask);
        if (rv == -1)
                parg->errnum = errno;
        rump_sys_write(parg->pipefd, &rv, sizeof(rv));

        return (void *)(intptr_t)rv;
}

int
REALPOLLTS(struct pollfd *fds, nfds_t nfds, const struct timespec *ts,
    const sigset_t *sigmask)
{
        int (*op_pollts)(struct pollfd *, nfds_t, const struct timespec *,
            const sigset_t *);
        int (*host_close)(int);
        int hostcall = 0, rumpcall = 0;
        pthread_t pt;
        nfds_t i;
        int rv;

        DPRINTF(("poll\n"));
        checkpoll(fds, nfds, &hostcall, &rumpcall);

        if (hostcall && rumpcall) {
                struct pollfd *pfd_host = NULL, *pfd_rump = NULL;
                int rpipe[2] = {-1,-1}, hpipe[2] = {-1,-1};
                struct pollarg parg;
                uintptr_t lrv;
                int sverrno = 0, trv;

                /*
                 * ok, this is where it gets tricky.  We must support
                 * this since it's a very common operation in certain
                 * types of software (telnet, netcat, etc).  We allocate
                 * two vectors and run two poll commands in separate
                 * threads.  Whichever returns first "wins" and the
                 * other kernel's fds won't show activity.
                 */
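                /*
                 * The extra slot at index nfds allocated in each
                 * vector below is a notification pipe: whichever poll
                 * finishes first writes to the other kernel's pipe so
                 * that the other poll wakes up too instead of blocking
                 * indefinitely.
                 */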
                rv = -1;

                /* allocate full vector for O(n) joining after call */
                pfd_host = malloc(sizeof(*pfd_host)*(nfds+1));
                if (!pfd_host)
                        goto out;
                pfd_rump = malloc(sizeof(*pfd_rump)*(nfds+1));
                if (!pfd_rump) {
                        goto out;
                }

                /* split vectors */
                for (i = 0; i < nfds; i++) {
                        if (fds[i].fd == -1) {
                                pfd_host[i].fd = -1;
                                pfd_rump[i].fd = -1;
                        } else if (fd_isrump(fds[i].fd)) {
                                pfd_host[i].fd = -1;
                                pfd_rump[i].fd = fd_host2rump(fds[i].fd);
                                pfd_rump[i].events = fds[i].events;
                        } else {
                                pfd_rump[i].fd = -1;
                                pfd_host[i].fd = fds[i].fd;
                                pfd_host[i].events = fds[i].events;
                        }
                        fds[i].revents = 0;
                }

                /*
                 * then, open two pipes, one for notifications
                 * to each kernel.
                 */
                if (rump_sys_pipe(rpipe) == -1)
                        goto out;
                if (pipe(hpipe) == -1)
                        goto out;

                pfd_host[nfds].fd = hpipe[0];
                pfd_host[nfds].events = POLLIN;
                pfd_rump[nfds].fd = rpipe[0];
                pfd_rump[nfds].events = POLLIN;

                /*
                 * then, create a thread to do host part and meanwhile
                 * do rump kernel part right here
                 */

                parg.pfds = pfd_host;
                parg.nfds = nfds+1;
                parg.ts = ts;
                parg.sigmask = sigmask;
                parg.pipefd = rpipe[1];
                pthread_create(&pt, NULL, hostpoll, &parg);

                op_pollts = GETSYSCALL(rump, POLLTS);
                lrv = op_pollts(pfd_rump, nfds+1, ts, NULL);
                sverrno = errno;
                write(hpipe[1], &rv, sizeof(rv));
                pthread_join(pt, (void *)&trv);

                /* check who "won" and merge results */
                if (lrv != 0 && pfd_host[nfds].revents & POLLIN) {
                        rv = trv;

                        for (i = 0; i < nfds; i++) {
                                if (pfd_rump[i].fd != -1)
                                        fds[i].revents = pfd_rump[i].revents;
                        }
                        sverrno = parg.errnum;
                } else if (trv != 0 && pfd_rump[nfds].revents & POLLIN) {
                        rv = trv;

                        for (i = 0; i < nfds; i++) {
                                if (pfd_host[i].fd != -1)
                                        fds[i].revents = pfd_host[i].revents;
                        }
                } else {
                        rv = 0;
                }

 out:
                host_close = GETSYSCALL(host, CLOSE);
                if (rpipe[0] != -1)
                        rump_sys_close(rpipe[0]);
                if (rpipe[1] != -1)
                        rump_sys_close(rpipe[1]);
                if (hpipe[0] != -1)
                        host_close(hpipe[0]);
                if (hpipe[1] != -1)
                        host_close(hpipe[1]);
                free(pfd_host);
                free(pfd_rump);
                errno = sverrno;
        } else {
                if (hostcall) {
                        op_pollts = GETSYSCALL(host, POLLTS);
                } else {
                        op_pollts = GETSYSCALL(rump, POLLTS);
                        adjustpoll(fds, nfds, fd_host2rump);
                }

                rv = op_pollts(fds, nfds, ts, sigmask);
                if (rumpcall)
                        adjustpoll(fds, nfds, fd_rump2host);
        }

        return rv;
}

int
poll(struct pollfd *fds, nfds_t nfds, int timeout)
{
        struct timespec ts;
        struct timespec *tsp = NULL;

        if (timeout != INFTIM) {
                ts.tv_sec = timeout / 1000;
                ts.tv_nsec = (timeout % 1000) * 1000*1000;

                tsp = &ts;
        }

        return REALPOLLTS(fds, nfds, tsp, NULL);
}
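/*
 * (poll() above simply converts the millisecond timeout to a timespec,
 * with INFTIM becoming a NULL timespec, i.e. wait forever, and funnels
 * the call into pollts.)
 */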

int
REALKEVENT(int kq, const struct kevent *changelist, size_t nchanges,
    struct kevent *eventlist, size_t nevents,
    const struct timespec *timeout)
{
        int (*op_kevent)(int, const struct kevent *, size_t,
            struct kevent *, size_t, const struct timespec *);
        const struct kevent *ev;
        size_t i;

        /*
         * Check that we don't attempt to kevent rump kernel fd's.
         * That needs similar treatment to select/poll, but is slightly
         * trickier since we need to manage two different kq descriptors.
         * (TODO, in case you're wondering).
         */
        for (i = 0; i < nchanges; i++) {
                ev = &changelist[i];
                if (ev->filter == EVFILT_READ || ev->filter == EVFILT_WRITE ||
                    ev->filter == EVFILT_VNODE) {
                        if (fd_isrump(ev->ident)) {
                                errno = ENOTSUP;
                                return -1;
                        }
                }
        }

        op_kevent = GETSYSCALL(host, KEVENT);
        return op_kevent(kq, changelist, nchanges, eventlist, nevents, timeout);
}

/*
 * Rest are std type calls.
 */

FDCALL(int, bind, DUALCALL_BIND, \
        (int fd, const struct sockaddr *name, socklen_t namelen), \
        (int, const struct sockaddr *, socklen_t), \
        (fd, name, namelen))

FDCALL(int, connect, DUALCALL_CONNECT, \
        (int fd, const struct sockaddr *name, socklen_t namelen), \
        (int, const struct sockaddr *, socklen_t), \
        (fd, name, namelen))

FDCALL(int, getpeername, DUALCALL_GETPEERNAME, \
        (int fd, struct sockaddr *name, socklen_t *namelen), \
        (int, struct sockaddr *, socklen_t *), \
        (fd, name, namelen))

FDCALL(int, getsockname, DUALCALL_GETSOCKNAME, \
        (int fd, struct sockaddr *name, socklen_t *namelen), \
        (int, struct sockaddr *, socklen_t *), \
        (fd, name, namelen))

FDCALL(int, listen, DUALCALL_LISTEN, \
        (int fd, int backlog), \
        (int, int), \
        (fd, backlog))

FDCALL(ssize_t, recvfrom, DUALCALL_RECVFROM, \
        (int fd, void *buf, size_t len, int flags, \
            struct sockaddr *from, socklen_t *fromlen), \
        (int, void *, size_t, int, struct sockaddr *, socklen_t *), \
        (fd, buf, len, flags, from, fromlen))

FDCALL(ssize_t, sendto, DUALCALL_SENDTO, \
        (int fd, const void *buf, size_t len, int flags, \
            const struct sockaddr *to, socklen_t tolen), \
        (int, const void *, size_t, int, \
            const struct sockaddr *, socklen_t), \
        (fd, buf, len, flags, to, tolen))

FDCALL(ssize_t, recvmsg, DUALCALL_RECVMSG, \
        (int fd, struct msghdr *msg, int flags), \
        (int, struct msghdr *, int), \
        (fd, msg, flags))

FDCALL(ssize_t, sendmsg, DUALCALL_SENDMSG, \
        (int fd, const struct msghdr *msg, int flags), \
        (int, const struct msghdr *, int), \
        (fd, msg, flags))

FDCALL(int, getsockopt, DUALCALL_GETSOCKOPT, \
        (int fd, int level, int optn, void *optval, socklen_t *optlen), \
        (int, int, int, void *, socklen_t *), \
        (fd, level, optn, optval, optlen))

FDCALL(int, setsockopt, DUALCALL_SETSOCKOPT, \
        (int fd, int level, int optn, \
            const void *optval, socklen_t optlen), \
        (int, int, int, const void *, socklen_t), \
        (fd, level, optn, optval, optlen))

FDCALL(int, shutdown, DUALCALL_SHUTDOWN, \
        (int fd, int how), \
        (int, int), \
        (fd, how))

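/*
 * When building with _FORTIFY_SOURCE, the ssp headers redirect certain
 * calls through __ssp_weak_name(), which was redefined at the top of
 * this file to produce "_hijack_"-prefixed names.  The stubs below are,
 * presumably, there to satisfy those references for readlink() and
 * getcwd(), passing the calls straight through to the host system
 * calls, since these are path-based rather than fd-based operations.
 */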
#if _FORTIFY_SOURCE > 0
#define STUB(fun) __ssp_weak_name(fun)
ssize_t _sys_readlink(const char * __restrict, char * __restrict, size_t);
ssize_t
STUB(readlink)(const char * __restrict path, char * __restrict buf,
    size_t bufsiz)
{
        return _sys_readlink(path, buf, bufsiz);
}

char *_sys_getcwd(char *, size_t);
char *
STUB(getcwd)(char *buf, size_t size)
{
        return _sys_getcwd(buf, size);
}
#else
#define STUB(fun) fun
#endif

FDCALL(ssize_t, REALREAD, DUALCALL_READ, \
        (int fd, void *buf, size_t buflen), \
        (int, void *, size_t), \
        (fd, buf, buflen))

FDCALL(ssize_t, readv, DUALCALL_READV, \
        (int fd, const struct iovec *iov, int iovcnt), \
        (int, const struct iovec *, int), \
        (fd, iov, iovcnt))

FDCALL(ssize_t, writev, DUALCALL_WRITEV, \
        (int fd, const struct iovec *iov, int iovcnt), \
        (int, const struct iovec *, int), \
        (fd, iov, iovcnt))

FDCALL(int, close, DUALCALL_CLOSE, \
        (int fd), \
        (int), \
        (fd))