/*	$NetBSD: kern_syscall.c,v 1.23 2026/02/01 03:32:44 riastradh Exp $	*/

/*-
 * Copyright (c) 2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software developed for The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_syscall.c,v 1.23 2026/02/01 03:32:44 riastradh Exp $");

#ifdef _KERNEL_OPT
#include "opt_dtrace.h"
#include "opt_ktrace.h"
#include "opt_modular.h"
#include "opt_ptrace.h"
#include "opt_syscall_debug.h"
#endif

/* XXX To get syscall prototypes. */
#define SYSVSHM
#define SYSVSEM
#define SYSVMSG

#include <sys/param.h>

#include <sys/ktrace.h>
#include <sys/module.h>
#include <sys/ptrace.h>
#include <sys/sched.h>
#include <sys/sdt.h>
#include <sys/syscall.h>
#include <sys/syscallargs.h>
#include <sys/syscallvar.h>
#include <sys/systm.h>
#include <sys/xcall.h>

int
sys_nomodule(struct lwp *l, const void *v, register_t *retval)
{
#ifdef MODULAR

	const struct sysent *sy;
	const struct emul *em;
	const struct sc_autoload *auto_list;
	u_int code;

	/*
	 * Restart the syscall if we interrupted a module unload that
	 * failed.  Acquiring kernconfig_lock delays us until any unload
	 * has been completed or rolled back.
	 */
	kernconfig_lock();
	sy = l->l_sysent;
	if (sy->sy_call != sys_nomodule) {
		kernconfig_unlock();
		return SET_ERROR(ERESTART);
	}
	/*
	 * Try to autoload a module to satisfy the request.  If it
	 * works, retry the request.
	 */
	em = l->l_proc->p_emul;
	code = sy - em->e_sysent;

	if ((auto_list = em->e_sc_autoload) != NULL)
		for (; auto_list->al_code > 0; auto_list++) {
			if (auto_list->al_code != code) {
				continue;
			}
			if (module_autoload(auto_list->al_module,
			    MODULE_CLASS_ANY) != 0 ||
			    sy->sy_call == sys_nomodule) {
				break;
			}
			kernconfig_unlock();
			return SET_ERROR(ERESTART);
		}
	kernconfig_unlock();
#endif	/* MODULAR */

	return sys_nosys(l, v, retval);
}
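
/*
 * For reference, the per-emulation autoload list walked above is an
 * array of { syscall code, module name } pairs, scanned until an entry
 * whose al_code is not positive.  A sketch of its shape (the specific
 * entries below are illustrative only; the native table is generated
 * from syscalls.master):
 *
 *	static struct sc_autoload example_autoload[] = {
 *		{ .al_code = SYS_mq_open, .al_module = "mqueue" },
 *		{ .al_code = SYS_aio_read, .al_module = "aio" },
 *		{ .al_code = 0, .al_module = NULL },
 *	};
 */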

int
syscall_establish(const struct emul *em, const struct syscall_package *sp)
{
	struct sysent *sy;
	int i;

	KASSERT(kernconfig_is_held());

	if (em == NULL) {
		em = &emul_netbsd;
	}
	sy = em->e_sysent;

	/*
	 * Ensure that all preconditions are valid, since this is
	 * an all or nothing deal.  Once a system call is entered,
	 * it can become busy and we could be unable to remove it
	 * on error.
	 */
	for (i = 0; sp[i].sp_call != NULL; i++) {
		if (sp[i].sp_code >= SYS_NSYSENT)
			return SET_ERROR(EINVAL);
		if (sy[sp[i].sp_code].sy_call != sys_nomodule &&
		    sy[sp[i].sp_code].sy_call != sys_nosys) {
#ifdef DIAGNOSTIC
			printf("syscall %d is busy\n", sp[i].sp_code);
#endif
			return SET_ERROR(EBUSY);
		}
	}
	/* Everything looks good, patch them in. */
	for (i = 0; sp[i].sp_call != NULL; i++) {
		sy[sp[i].sp_code].sy_call = sp[i].sp_call;
	}

	return 0;
}
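
/*
 * A minimal sketch of the expected caller: a module builds a
 * syscall_package array terminated by a NULL sp_call and passes it to
 * syscall_establish() from its MODULE_CMD_INIT handler, with
 * kernconfig_lock already held by the module framework.  The names
 * SYS_example and sys_example here are hypothetical:
 *
 *	static const struct syscall_package example_syscalls[] = {
 *		{ .sp_code = SYS_example,
 *		  .sp_call = (sy_call_t *)sys_example },
 *		{ .sp_call = NULL },
 *	};
 *
 *	case MODULE_CMD_INIT:
 *		return syscall_establish(NULL, example_syscalls);
 */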

int
syscall_disestablish(const struct emul *em, const struct syscall_package *sp)
{
	struct sysent *sy;
	const uint32_t *sb;
	lwp_t *l;
	int i;

	KASSERT(kernconfig_is_held());

	if (em == NULL) {
		em = &emul_netbsd;
	}
	sy = em->e_sysent;
	sb = em->e_nomodbits;

	/*
	 * First, patch the system calls to sys_nomodule or sys_nosys
	 * to gate further activity.
	 */
	for (i = 0; sp[i].sp_call != NULL; i++) {
		KASSERT(sy[sp[i].sp_code].sy_call == sp[i].sp_call);
		sy[sp[i].sp_code].sy_call =
		    sb[sp[i].sp_code / 32] & (1 << (sp[i].sp_code % 32)) ?
		      sys_nomodule : sys_nosys;
	}

	/*
	 * Run a cross call to cycle through all CPUs.  This does two
	 * things: lock activity provides a barrier and makes our update
	 * of sy_call visible to all CPUs, and upon return we can be sure
	 * that we see pertinent values of l_sysent posted by remote CPUs.
	 */
	xc_barrier(0);

	/*
	 * Now it's safe to check l_sysent.  Run through all LWPs and see
	 * if anyone is still using the system call.
	 */
	for (i = 0; sp[i].sp_call != NULL; i++) {
		mutex_enter(&proc_lock);
		LIST_FOREACH(l, &alllwp, l_list) {
			if (l->l_sysent == &sy[sp[i].sp_code]) {
				break;
			}
		}
		mutex_exit(&proc_lock);
		if (l == NULL) {
			continue;
		}
		/*
		 * We lose: one or more calls are still in use.  Put back
		 * the old entrypoints and act like nothing happened.
		 * When we drop kernconfig_lock, any system calls held in
		 * sys_nomodule() will be restarted.
		 */
		for (i = 0; sp[i].sp_call != NULL; i++) {
			sy[sp[i].sp_code].sy_call = sp[i].sp_call;
		}
		return SET_ERROR(EBUSY);
	}

	return 0;
}
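
/*
 * The matching teardown, continuing the hypothetical example shown
 * after syscall_establish(), simply forwards the return value so that
 * EBUSY from a still-active call makes the module unload fail while
 * the original entry points stay patched in:
 *
 *	case MODULE_CMD_FINI:
 *		return syscall_disestablish(NULL, example_syscalls);
 */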

/*
 * Return true if system call tracing is enabled for the specified process.
 */
bool
trace_is_enabled(struct proc *p)
{
#ifdef SYSCALL_DEBUG
	return (true);
#endif
#ifdef KTRACE
	if (ISSET(p->p_traceflag, (KTRFAC_SYSCALL | KTRFAC_SYSRET)))
		return (true);
#endif
#ifdef PTRACE
	if (ISSET(p->p_slflag, PSL_SYSCALL))
		return (true);
#endif

	return (false);
}
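
/*
 * Note that this is not recomputed on every system call: the ktrace
 * and ptrace code is expected to refresh a cached copy in
 * p->p_trace_enabled whenever the tracing state changes, and the
 * syscall dispatch path tests that flag to decide whether
 * trace_enter() and trace_exit() below need to run at all.
 */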

/*
 * Start tracing of a particular system call.  If the process is being
 * traced, this routine is called by MD syscall dispatch code just
 * before the system call is actually executed.
 */
int
trace_enter(register_t code, const struct sysent *sy, const void *args)
{
	int error = 0;
#if defined(PTRACE) || defined(KDTRACE_HOOKS)
	struct proc *p = curlwp->l_proc;
#endif

#ifdef KDTRACE_HOOKS
	if (sy->sy_entry) {
		struct emul *e = p->p_emul;
		if (e->e_dtrace_syscall)
			(*e->e_dtrace_syscall)(sy->sy_entry, code, sy, args,
			    NULL, 0);
	}
#endif

#ifdef SYSCALL_DEBUG
	scdebug_call(code, args);
#endif /* SYSCALL_DEBUG */

	ktrsyscall(code, args, sy->sy_narg);

#ifdef PTRACE
	if ((p->p_slflag & (PSL_SYSCALL|PSL_TRACED)) ==
	    (PSL_SYSCALL|PSL_TRACED)) {
		proc_stoptrace(TRAP_SCE, code, args, NULL, 0);
		if (curlwp->l_proc->p_slflag & PSL_SYSCALLEMU) {
			/* tracer will emulate syscall for us */
			error = SET_ERROR(EJUSTRETURN);
		}
	}
#endif
	return error;
}

/*
 * End trace of a particular system call.  If the process is being
 * traced, this routine is called by MD syscall dispatch code just
 * after the system call finishes.
 * The MD caller guarantees that the passed 'code' is within the
 * supported system call number range for the emulation the process
 * runs under.
 */
void
trace_exit(register_t code, const struct sysent *sy, const void *args,
    register_t rval[], int error)
{
#if defined(PTRACE) || defined(KDTRACE_HOOKS)
	struct proc *p = curlwp->l_proc;
#endif

#ifdef KDTRACE_HOOKS
	if (sy->sy_return) {
		struct emul *e = p->p_emul;
		if (e->e_dtrace_syscall)
			(*e->e_dtrace_syscall)(sy->sy_return, code, sy,
			    args, rval, error);
	}
#endif

#ifdef SYSCALL_DEBUG
	scdebug_ret(code, error, rval);
#endif /* SYSCALL_DEBUG */

	ktrsysret(code, error, rval);

#ifdef PTRACE
	if ((p->p_slflag & (PSL_SYSCALL|PSL_TRACED|PSL_SYSCALLEMU)) ==
	    (PSL_SYSCALL|PSL_TRACED)) {
		proc_stoptrace(TRAP_SCX, code, args, rval, error);
	}
	CLR(p->p_slflag, PSL_SYSCALLEMU);
#endif
}
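
/*
 * For context, a sketch of how the dispatch path is expected to use
 * the two hooks above (simplified; indirect syscalls and the
 * DTrace-only case are ignored here, cf. sy_invoke() in
 * <sys/syscallvar.h>):
 *
 *	error = 0;
 *	if (!p->p_trace_enabled ||
 *	    (error = trace_enter(code, sy, args)) == 0) {
 *		rval[0] = rval[1] = 0;
 *		error = sy_call(sy, l, args, rval);
 *	}
 *	if (p->p_trace_enabled)
 *		trace_exit(code, sy, args, rval, error);
 */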