kern_syscall.c revision 1.10 1 /* $NetBSD: kern_syscall.c,v 1.10 2015/03/07 16:38:07 christos Exp $ */
2
3 /*-
4 * Copyright (c) 2008 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software developed for The NetBSD Foundation
8 * by Andrew Doran.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD: kern_syscall.c,v 1.10 2015/03/07 16:38:07 christos Exp $");
34
35 #ifdef _KERNEL_OPT
36 #include "opt_modular.h"
37 #include "opt_syscall_debug.h"
38 #include "opt_ktrace.h"
39 #include "opt_ptrace.h"
40 #include "opt_dtrace.h"
41 #endif
42
43 /* XXX To get syscall prototypes. */
44 #define SYSVSHM
45 #define SYSVSEM
46 #define SYSVMSG
47
48 #include <sys/param.h>
49 #include <sys/module.h>
50 #include <sys/sched.h>
51 #include <sys/syscall.h>
52 #include <sys/syscallargs.h>
53 #include <sys/syscallvar.h>
54 #include <sys/systm.h>
55 #include <sys/xcall.h>
56 #include <sys/ktrace.h>
57 #include <sys/ptrace.h>
58
59 int
60 sys_nomodule(struct lwp *l, const void *v, register_t *retval)
61 {
62 #ifdef MODULAR
63 static struct {
64 u_int al_code;
65 const char *al_module;
66 } const autoload[] = {
67 { SYS_aio_cancel, "aio" },
68 { SYS_aio_error, "aio" },
69 { SYS_aio_fsync, "aio" },
70 { SYS_aio_read, "aio" },
71 { SYS_aio_return, "aio" },
72 { SYS___aio_suspend50, "aio" },
73 { SYS_aio_write, "aio" },
74 { SYS_lio_listio, "aio" },
75 { SYS_mq_open, "mqueue" },
76 { SYS_mq_close, "mqueue" },
77 { SYS_mq_unlink, "mqueue" },
78 { SYS_mq_getattr, "mqueue" },
79 { SYS_mq_setattr, "mqueue" },
80 { SYS_mq_notify, "mqueue" },
81 { SYS_mq_send, "mqueue" },
82 { SYS_mq_receive, "mqueue" },
83 { SYS___mq_timedsend50, "mqueue" },
84 { SYS___mq_timedreceive50, "mqueue" },
85 { SYS_compat_43_fstat43, "compat" },
86 { SYS_compat_43_lstat43, "compat" },
87 { SYS_compat_43_oaccept, "compat" },
88 { SYS_compat_43_ocreat, "compat" },
89 { SYS_compat_43_oftruncate, "compat" },
90 { SYS_compat_43_ogetdirentries, "compat" },
91 { SYS_compat_43_ogetdtablesize, "compat" },
92 { SYS_compat_43_ogethostid, "compat" },
93 { SYS_compat_43_ogethostname, "compat" },
94 { SYS_compat_43_ogetkerninfo, "compat" },
95 { SYS_compat_43_ogetpagesize, "compat" },
96 { SYS_compat_43_ogetpeername, "compat" },
97 { SYS_compat_43_ogetrlimit, "compat" },
98 { SYS_compat_43_ogetsockname, "compat" },
99 { SYS_compat_43_okillpg, "compat" },
100 { SYS_compat_43_olseek, "compat" },
101 { SYS_compat_43_ommap, "compat" },
102 { SYS_compat_43_oquota, "compat" },
103 { SYS_compat_43_orecv, "compat" },
104 { SYS_compat_43_orecvfrom, "compat" },
105 { SYS_compat_43_orecvmsg, "compat" },
106 { SYS_compat_43_osend, "compat" },
107 { SYS_compat_43_osendmsg, "compat" },
108 { SYS_compat_43_osethostid, "compat" },
109 { SYS_compat_43_osethostname, "compat" },
110 { SYS_compat_43_osetrlimit, "compat" },
111 { SYS_compat_43_osigblock, "compat" },
112 { SYS_compat_43_osigsetmask, "compat" },
113 { SYS_compat_43_osigstack, "compat" },
114 { SYS_compat_43_osigvec, "compat" },
115 { SYS_compat_43_otruncate, "compat" },
116 { SYS_compat_43_owait, "compat" },
117 { SYS_compat_43_stat43, "compat" },
118 { SYS_compat_09_ogetdomainname, "compat" },
119 { SYS_compat_09_osetdomainname, "compat" },
120 { SYS_compat_09_ouname, "compat" },
121 #ifndef _LP64
122 { SYS_compat_10_omsgsys, "compat" },
123 { SYS_compat_10_osemsys, "compat" },
124 { SYS_compat_10_oshmsys, "compat" },
125 #endif
126 { SYS_compat_12_fstat12, "compat" },
127 { SYS_compat_12_getdirentries, "compat" },
128 { SYS_compat_12_lstat12, "compat" },
129 { SYS_compat_12_msync, "compat" },
130 { SYS_compat_12_oreboot, "compat" },
131 { SYS_compat_12_oswapon, "compat" },
132 { SYS_compat_12_stat12, "compat" },
133 { SYS_compat_13_sigaction13, "compat" },
134 { SYS_compat_13_sigaltstack13, "compat" },
135 { SYS_compat_13_sigpending13, "compat" },
136 { SYS_compat_13_sigprocmask13, "compat" },
137 { SYS_compat_13_sigreturn13, "compat" },
138 { SYS_compat_13_sigsuspend13, "compat" },
139 { SYS_compat_14___semctl, "compat" },
140 { SYS_compat_14_msgctl, "compat" },
141 { SYS_compat_14_shmctl, "compat" },
142 { SYS_compat_16___sigaction14, "compat" },
143 { SYS_compat_16___sigreturn14, "compat" },
144 { SYS_compat_20_fhstatfs, "compat" },
145 { SYS_compat_20_fstatfs, "compat" },
146 { SYS_compat_20_getfsstat, "compat" },
147 { SYS_compat_20_statfs, "compat" },
148 { SYS_compat_30___fhstat30, "compat" },
149 { SYS_compat_30___fstat13, "compat" },
150 { SYS_compat_30___lstat13, "compat" },
151 { SYS_compat_30___stat13, "compat" },
152 { SYS_compat_30_fhopen, "compat" },
153 { SYS_compat_30_fhstat, "compat" },
154 { SYS_compat_30_fhstatvfs1, "compat" },
155 { SYS_compat_30_getdents, "compat" },
156 { SYS_compat_30_getfh, "compat" },
157 { SYS_compat_30_socket, "compat" },
158 { SYS_compat_40_mount, "compat" },
159 { SYS_compat_50_wait4, "compat" },
160 { SYS_compat_50_mknod, "compat" },
161 { SYS_compat_50_setitimer, "compat" },
162 { SYS_compat_50_getitimer, "compat" },
163 { SYS_compat_50_select, "compat" },
164 { SYS_compat_50_gettimeofday, "compat" },
165 { SYS_compat_50_getrusage, "compat" },
166 { SYS_compat_50_settimeofday, "compat" },
167 { SYS_compat_50_utimes, "compat" },
168 { SYS_compat_50_adjtime, "compat" },
169 { SYS_compat_50_lfs_segwait, "compat" },
170 { SYS_compat_50_futimes, "compat" },
171 { SYS_compat_50_clock_gettime, "compat" },
172 { SYS_compat_50_clock_settime, "compat" },
173 { SYS_compat_50_clock_getres, "compat" },
174 { SYS_compat_50_timer_settime, "compat" },
175 { SYS_compat_50_timer_gettime, "compat" },
176 { SYS_compat_50_nanosleep, "compat" },
177 { SYS_compat_50___sigtimedwait, "compat" },
178 { SYS_compat_50_mq_timedsend, "compat" },
179 { SYS_compat_50_mq_timedreceive, "compat" },
180 { SYS_compat_50_lutimes, "compat" },
181 { SYS_compat_50_____semctl13, "compat" },
182 { SYS_compat_50___msgctl13, "compat" },
183 { SYS_compat_50___shmctl13, "compat" },
184 { SYS_compat_50__lwp_park, "compat" },
185 { SYS_compat_50_kevent, "compat" },
186 { SYS_compat_50_pselect, "compat" },
187 { SYS_compat_50_pollts, "compat" },
188 { SYS_compat_50___stat30, "compat" },
189 { SYS_compat_50___fstat30, "compat" },
190 { SYS_compat_50___lstat30, "compat" },
191 { SYS_compat_50___ntp_gettime30, "compat" },
192 { SYS_compat_50___fhstat40, "compat" },
193 { SYS_compat_50_aio_suspend, "compat" },
194 { SYS_compat_60__lwp_park, "compat" },
195 { SYS__ksem_init, "ksem" },
196 { SYS__ksem_open, "ksem" },
197 { SYS__ksem_unlink, "ksem" },
198 { SYS__ksem_close, "ksem" },
199 { SYS__ksem_post, "ksem" },
200 { SYS__ksem_wait, "ksem" },
201 { SYS__ksem_trywait, "ksem" },
202 { SYS__ksem_getvalue, "ksem" },
203 { SYS__ksem_destroy, "ksem" },
204 { SYS__ksem_timedwait, "ksem" },
205 { SYS_nfssvc, "nfsserver" },
206 { SYS_afssys, "openafs" },
207 };
208 const struct sysent *sy;
209 const struct emul *em;
210 int code, i;
211
212 /*
213 * Restart the syscall if we interrupted a module unload that
214 * failed. Acquiring kernconfig_lock delays us until any unload
215 * has been completed or rolled back.
216 */
217 kernconfig_lock();
218 sy = l->l_sysent;
219 if (sy->sy_call != sys_nomodule) {
220 kernconfig_unlock();
221 return ERESTART;
222 }
223 /*
224 * Try to autoload a module to satisfy the request. If it
225 * works, retry the request.
226 */
227 em = l->l_proc->p_emul;
228 if (em == &emul_netbsd) {
229 code = sy - em->e_sysent;
230 for (i = 0; i < __arraycount(autoload); i++) {
231 if (autoload[i].al_code != code) {
232 continue;
233 }
234 if (module_autoload(autoload[i].al_module,
235 MODULE_CLASS_ANY) != 0 ||
236 sy->sy_call == sys_nomodule) {
237 break;
238 }
239 kernconfig_unlock();
240 return ERESTART;
241 }
242 }
243 kernconfig_unlock();
244 #endif /* MODULAR */
245
246 return sys_nosys(l, v, retval);
247 }
248
249 int
250 syscall_establish(const struct emul *em, const struct syscall_package *sp)
251 {
252 struct sysent *sy;
253 int i;
254
255 KASSERT(kernconfig_is_held());
256
257 if (em == NULL) {
258 em = &emul_netbsd;
259 }
260 sy = em->e_sysent;
261
262 /*
263 * Ensure that all preconditions are valid, since this is
264 * an all or nothing deal. Once a system call is entered,
265 * it can become busy and we could be unable to remove it
266 * on error.
267 */
268 for (i = 0; sp[i].sp_call != NULL; i++) {
269 if (sy[sp[i].sp_code].sy_call != sys_nomodule) {
270 #ifdef DIAGNOSTIC
271 printf("syscall %d is busy\n", sp[i].sp_code);
272 #endif
273 return EBUSY;
274 }
275 }
276 /* Everything looks good, patch them in. */
277 for (i = 0; sp[i].sp_call != NULL; i++) {
278 sy[sp[i].sp_code].sy_call = sp[i].sp_call;
279 }
280
281 return 0;
282 }
283
284 int
285 syscall_disestablish(const struct emul *em, const struct syscall_package *sp)
286 {
287 struct sysent *sy;
288 uint64_t where;
289 lwp_t *l;
290 int i;
291
292 KASSERT(kernconfig_is_held());
293
294 if (em == NULL) {
295 em = &emul_netbsd;
296 }
297 sy = em->e_sysent;
298
299 /*
300 * First, patch the system calls to sys_nomodule to gate further
301 * activity.
302 */
303 for (i = 0; sp[i].sp_call != NULL; i++) {
304 KASSERT(sy[sp[i].sp_code].sy_call == sp[i].sp_call);
305 sy[sp[i].sp_code].sy_call = sys_nomodule;
306 }
307
308 /*
309 * Run a cross call to cycle through all CPUs. This does two
310 * things: lock activity provides a barrier and makes our update
311 * of sy_call visible to all CPUs, and upon return we can be sure
312 * that we see pertinent values of l_sysent posted by remote CPUs.
313 */
314 where = xc_broadcast(0, (xcfunc_t)nullop, NULL, NULL);
315 xc_wait(where);
316
317 /*
318 * Now it's safe to check l_sysent. Run through all LWPs and see
319 * if anyone is still using the system call.
320 */
321 for (i = 0; sp[i].sp_call != NULL; i++) {
322 mutex_enter(proc_lock);
323 LIST_FOREACH(l, &alllwp, l_list) {
324 if (l->l_sysent == &sy[sp[i].sp_code]) {
325 break;
326 }
327 }
328 mutex_exit(proc_lock);
329 if (l == NULL) {
330 continue;
331 }
332 /*
333 * We lose: one or more calls are still in use. Put back
334 * the old entrypoints and act like nothing happened.
335 * When we drop kernconfig_lock, any system calls held in
336 * sys_nomodule() will be restarted.
337 */
338 for (i = 0; sp[i].sp_call != NULL; i++) {
339 sy[sp[i].sp_code].sy_call = sp[i].sp_call;
340 }
341 return EBUSY;
342 }
343
344 return 0;
345 }
346
/*
 * Report whether system call tracing is in effect for process `p'.
 *
 * Always true on SYSCALL_DEBUG kernels.  Otherwise true when ktrace has
 * KTRFAC_SYSCALL or KTRFAC_SYSRET set in p_traceflag (KTRACE), or when
 * PSL_SYSCALL is set in p_slflag (PTRACE).
 */
bool
trace_is_enabled(struct proc *p)
{
	bool enabled = false;

#ifdef SYSCALL_DEBUG
	enabled = true;
#endif
#ifdef KTRACE
	if (ISSET(p->p_traceflag, (KTRFAC_SYSCALL | KTRFAC_SYSRET))) {
		enabled = true;
	}
#endif
#ifdef PTRACE
	if (ISSET(p->p_slflag, PSL_SYSCALL)) {
		enabled = true;
	}
#endif

	return enabled;
}
367
368 /*
369 * Start trace of particular system call. If process is being traced,
370 * this routine is called by MD syscall dispatch code just before
371 * a system call is actually executed.
372 */
373 int
374 trace_enter(register_t code, const struct sysent *sy, const void *args)
375 {
376 int error = 0;
377
378 #ifdef KDTRACE_HOOKS
379 if (sy->sy_entry) {
380 struct emul *e = curlwp->l_proc->p_emul;
381 (*e->e_dtrace_syscall)(sy->sy_entry, code, sy, args, NULL, 0);
382 }
383 #endif
384
385 #ifdef SYSCALL_DEBUG
386 scdebug_call(code, args);
387 #endif /* SYSCALL_DEBUG */
388
389 ktrsyscall(code, args, sy->sy_narg);
390
391 #ifdef PTRACE
392 if ((curlwp->l_proc->p_slflag & (PSL_SYSCALL|PSL_TRACED)) ==
393 (PSL_SYSCALL|PSL_TRACED)) {
394 process_stoptrace();
395 if (curlwp->l_proc->p_slflag & PSL_SYSCALLEMU) {
396 /* tracer will emulate syscall for us */
397 error = EJUSTRETURN;
398 }
399 }
400 #endif
401 return error;
402 }
403
404 /*
405 * End trace of particular system call. If process is being traced,
406 * this routine is called by MD syscall dispatch code just after
407 * a system call finishes.
408 * MD caller guarantees the passed 'code' is within the supported
409 * system call number range for emulation the process runs under.
410 */
411 void
412 trace_exit(register_t code, const struct sysent *sy, const void *args,
413 register_t rval[], int error)
414 {
415 #if defined(PTRACE) || defined(KDTRACE_HOOKS)
416 struct proc *p = curlwp->l_proc;
417 #endif
418
419 #ifdef KDTRACE_HOOKS
420 if (sy->sy_return) {
421 (*p->p_emul->e_dtrace_syscall)(sy->sy_return, code, sy, args,
422 rval, error);
423 }
424 #endif
425
426 #ifdef SYSCALL_DEBUG
427 scdebug_ret(code, error, rval);
428 #endif /* SYSCALL_DEBUG */
429
430 ktrsysret(code, error, rval);
431
432 #ifdef PTRACE
433 if ((p->p_slflag & (PSL_SYSCALL|PSL_TRACED|PSL_SYSCALLEMU)) ==
434 (PSL_SYSCALL|PSL_TRACED))
435 process_stoptrace();
436 CLR(p->p_slflag, PSL_SYSCALLEMU);
437 #endif
438 }
439