/* sys_sched.c (NetBSD sys/kern), revision 1.33 — source listing header */
      1 /*	$NetBSD: sys_sched.c,v 1.33 2009/03/03 21:55:06 rmind Exp $	*/
      2 
      3 /*
      4  * Copyright (c) 2008, Mindaugas Rasiukevicius <rmind at NetBSD org>
      5  * All rights reserved.
      6  *
      7  * Redistribution and use in source and binary forms, with or without
      8  * modification, are permitted provided that the following conditions
      9  * are met:
     10  * 1. Redistributions of source code must retain the above copyright
     11  *    notice, this list of conditions and the following disclaimer.
     12  * 2. Redistributions in binary form must reproduce the above copyright
     13  *    notice, this list of conditions and the following disclaimer in the
     14  *    documentation and/or other materials provided with the distribution.
     15  *
     16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
     17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
     20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     26  * SUCH DAMAGE.
     27  */
     28 
     29 /*
     30  * System calls relating to the scheduler.
     31  *
     32  * Lock order:
     33  *
     34  *	cpu_lock ->
     35  *	    proc_lock ->
     36  *		proc_t::p_lock ->
     37  *		    lwp_t::lwp_lock
     38  *
     39  * TODO:
     40  *  - Handle pthread_setschedprio() as defined by POSIX;
     41  *  - Handle sched_yield() case for SCHED_FIFO as defined by POSIX;
     42  */
     43 
     44 #include <sys/cdefs.h>
     45 __KERNEL_RCSID(0, "$NetBSD: sys_sched.c,v 1.33 2009/03/03 21:55:06 rmind Exp $");
     46 
     47 #include <sys/param.h>
     48 
     49 #include <sys/cpu.h>
     50 #include <sys/kauth.h>
     51 #include <sys/kmem.h>
     52 #include <sys/lwp.h>
     53 #include <sys/mutex.h>
     54 #include <sys/proc.h>
     55 #include <sys/pset.h>
     56 #include <sys/sa.h>
     57 #include <sys/savar.h>
     58 #include <sys/sched.h>
     59 #include <sys/syscallargs.h>
     60 #include <sys/sysctl.h>
     61 #include <sys/systm.h>
     62 #include <sys/types.h>
     63 #include <sys/unistd.h>
     64 
     65 #include "opt_sa.h"
     66 
     67 /*
     68  * Convert user priority or the in-kernel priority or convert the current
     69  * priority to the appropriate range according to the policy change.
     70  */
     71 static pri_t
     72 convert_pri(lwp_t *l, int policy, pri_t pri)
     73 {
     74 
     75 	/* Convert user priority to the in-kernel */
     76 	if (pri != PRI_NONE) {
     77 		/* Only for real-time threads */
     78 		KASSERT(pri >= SCHED_PRI_MIN && pri <= SCHED_PRI_MAX);
     79 		KASSERT(policy != SCHED_OTHER);
     80 		return PRI_USER_RT + pri;
     81 	}
     82 
     83 	/* Neither policy, nor priority change */
     84 	if (l->l_class == policy)
     85 		return l->l_priority;
     86 
     87 	/* Time-sharing -> real-time */
     88 	if (l->l_class == SCHED_OTHER) {
     89 		KASSERT(policy == SCHED_FIFO || policy == SCHED_RR);
     90 		return PRI_USER_RT;
     91 	}
     92 
     93 	/* Real-time -> time-sharing */
     94 	if (policy == SCHED_OTHER) {
     95 		KASSERT(l->l_class == SCHED_FIFO || l->l_class == SCHED_RR);
     96 		return l->l_priority - PRI_USER_RT;
     97 	}
     98 
     99 	/* Real-time -> real-time */
    100 	return l->l_priority;
    101 }
    102 
/*
 * do_sched_setparam:
 *
 *	Common guts of sched_setparam(): set the scheduling policy and/or
 *	priority of one LWP (lid != 0) or of every LWP (lid == 0) of the
 *	process "pid" (0 means the calling process).  "policy" may be
 *	SCHED_NONE and the priority in "params" may be PRI_NONE to leave
 *	that attribute unchanged.
 *
 *	Returns 0 on success; ESRCH if no process/LWP matched; EINVAL for
 *	an invalid policy/priority combination; EPERM for system processes
 *	or when kauth denies the operation.
 */
int
do_sched_setparam(pid_t pid, lwpid_t lid, int policy,
    const struct sched_param *params)
{
	struct proc *p;
	struct lwp *t;
	pri_t pri;
	u_int lcnt;
	int error;

	error = 0;

	pri = params->sched_priority;

	/* If no parameters specified, just return (this should not happen) */
	if (pri == PRI_NONE && policy == SCHED_NONE)
		return 0;

	/* Validate scheduling class */
	if (policy != SCHED_NONE && (policy < SCHED_OTHER || policy > SCHED_RR))
		return EINVAL;

	/* Validate priority */
	if (pri != PRI_NONE && (pri < SCHED_PRI_MIN || pri > SCHED_PRI_MAX))
		return EINVAL;

	if (pid != 0) {
		/* Find the process */
		mutex_enter(proc_lock);
		p = p_find(pid, PFIND_LOCKED);
		if (p == NULL) {
			mutex_exit(proc_lock);
			return ESRCH;
		}
		/* Lock order: proc_lock -> p_lock (see top of file). */
		mutex_enter(p->p_lock);
		mutex_exit(proc_lock);
		/* Disallow modification of system processes */
		if ((p->p_flag & PK_SYSTEM) != 0) {
			mutex_exit(p->p_lock);
			return EPERM;
		}
	} else {
		/* Use the calling process */
		p = curlwp->l_proc;
		mutex_enter(p->p_lock);
	}

	/* Find the LWP(s) */
	lcnt = 0;
	LIST_FOREACH(t, &p->p_lwps, l_sibling) {
		pri_t kpri;
		int lpolicy;

		if (lid && lid != t->l_lid)
			continue;

		lcnt++;
		lwp_lock(t);
		/* SCHED_NONE means "keep the LWP's current class". */
		lpolicy = (policy == SCHED_NONE) ? t->l_class : policy;

		/* Disallow setting of priority for SCHED_OTHER threads */
		if (lpolicy == SCHED_OTHER && pri != PRI_NONE) {
			lwp_unlock(t);
			error = EINVAL;
			break;
		}

		/* Convert priority, if needed */
		kpri = convert_pri(t, lpolicy, pri);

		/* Check the permission */
		error = kauth_authorize_process(kauth_cred_get(),
		    KAUTH_PROCESS_SCHEDULER_SETPARAM, p, t, KAUTH_ARG(lpolicy),
		    KAUTH_ARG(kpri));
		if (error) {
			lwp_unlock(t);
			break;
		}

		/* Set the scheduling class, change the priority */
		t->l_class = lpolicy;
		lwp_changepri(t, kpri);
		lwp_unlock(t);
	}
	mutex_exit(p->p_lock);
	/* No LWP matched -> ESRCH; otherwise the last error seen (if any). */
	return (lcnt == 0) ? ESRCH : error;
}
    190 
    191 /*
    192  * Set scheduling parameters.
    193  */
    194 int
    195 sys__sched_setparam(struct lwp *l, const struct sys__sched_setparam_args *uap,
    196     register_t *retval)
    197 {
    198 	/* {
    199 		syscallarg(pid_t) pid;
    200 		syscallarg(lwpid_t) lid;
    201 		syscallarg(int) policy;
    202 		syscallarg(const struct sched_param *) params;
    203 	} */
    204 	struct sched_param params;
    205 	int error;
    206 
    207 	/* Get the parameters from the user-space */
    208 	error = copyin(SCARG(uap, params), &params, sizeof(params));
    209 	if (error)
    210 		goto out;
    211 
    212 	error = do_sched_setparam(SCARG(uap, pid), SCARG(uap, lid),
    213 	    SCARG(uap, policy), &params);
    214 out:
    215 	return error;
    216 }
    217 
/*
 * do_sched_getparam:
 *
 *	Common guts of sched_getparam(): fetch the scheduling policy and
 *	user-visible priority of the LWP identified by (pid, lid).  Either
 *	output pointer may be NULL if the caller does not want that value.
 *
 *	Returns 0 on success, ESRCH if the LWP was not found, or the
 *	kauth error on a permission failure.
 */
int
do_sched_getparam(pid_t pid, lwpid_t lid, int *policy,
    struct sched_param *params)
{
	struct sched_param lparams;
	struct lwp *t;
	int error, lpolicy;

	/*
	 * Locks the LWP: lwp_find2() returns with the owning process's
	 * p_lock held (released on both exit paths below).
	 */
	t = lwp_find2(pid, lid);
	if (t == NULL)
		return ESRCH;

	/* Check the permission */
	error = kauth_authorize_process(kauth_cred_get(),
	    KAUTH_PROCESS_SCHEDULER_GETPARAM, t->l_proc, NULL, NULL, NULL);
	if (error != 0) {
		mutex_exit(t->l_proc->p_lock);
		return error;
	}

	/* Take a consistent snapshot under the LWP lock. */
	lwp_lock(t);
	lparams.sched_priority = t->l_priority;
	lpolicy = t->l_class;

	/* Convert the in-kernel priority back to the user-visible range. */
	switch (lpolicy) {
	case SCHED_OTHER:
		lparams.sched_priority -= PRI_USER;
		break;
	case SCHED_RR:
	case SCHED_FIFO:
		lparams.sched_priority -= PRI_USER_RT;
		break;
	}

	if (policy != NULL)
		*policy = lpolicy;

	if (params != NULL)
		*params = lparams;

	lwp_unlock(t);
	mutex_exit(t->l_proc->p_lock);
	return error;
}
    263 
    264 /*
    265  * Get scheduling parameters.
    266  */
    267 int
    268 sys__sched_getparam(struct lwp *l, const struct sys__sched_getparam_args *uap,
    269     register_t *retval)
    270 {
    271 	/* {
    272 		syscallarg(pid_t) pid;
    273 		syscallarg(lwpid_t) lid;
    274 		syscallarg(int *) policy;
    275 		syscallarg(struct sched_param *) params;
    276 	} */
    277 	struct sched_param params;
    278 	int error, policy;
    279 
    280 	error = do_sched_getparam(SCARG(uap, pid), SCARG(uap, lid), &policy,
    281 	    &params);
    282 	if (error)
    283 		goto out;
    284 
    285 	error = copyout(&params, SCARG(uap, params), sizeof(params));
    286 	if (error == 0 && SCARG(uap, policy) != NULL)
    287 		error = copyout(&policy, SCARG(uap, policy), sizeof(int));
    288 out:
    289 	return error;
    290 }
    291 
/*
 * genkcpuset: allocate a kernel CPU set and fill it from the user-space
 * set "sset" of the given size.
 *
 * => On success, *dset holds one reference owned by the caller.
 * => On failure, that reference is released via kcpuset_unuse(); *dset
 *    must not be used afterwards (callers only check the error code).
 */
static int
genkcpuset(kcpuset_t **dset, const cpuset_t *sset, size_t size)
{
	int error;

	*dset = kcpuset_create();
	error = kcpuset_copyin(sset, *dset, size);
	if (error != 0)
		kcpuset_unuse(*dset, NULL);
	return error;
}
    306 
/*
 * Set affinity: bind the LWP(s) of a process to the CPUs given in the
 * user-supplied mask, or clear the affinity if the mask is empty.
 *
 * cpu_lock is held across the whole operation to keep the CPU set
 * stable against online/offline and processor-set changes.
 */
int
sys__sched_setaffinity(struct lwp *l,
    const struct sys__sched_setaffinity_args *uap, register_t *retval)
{
	/* {
		syscallarg(pid_t) pid;
		syscallarg(lwpid_t) lid;
		syscallarg(size_t) size;
		syscallarg(const cpuset_t *) cpuset;
	} */
	kcpuset_t *cpuset, *cpulst = NULL;
	struct cpu_info *ici, *ci;
	struct proc *p;
	struct lwp *t;
	CPU_INFO_ITERATOR cii;
	bool alloff;
	lwpid_t lid;
	u_int lcnt;
	int error;

	/* Copy the CPU mask in from userspace. */
	error = genkcpuset(&cpuset, SCARG(uap, cpuset), SCARG(uap, size));
	if (error)
		return error;

	/*
	 * Traverse _each_ CPU to:
	 *  - Check that CPUs in the mask have no assigned processor set.
	 *  - Check that at least one CPU from the mask is online.
	 *  - Find the first target CPU to migrate.
	 *
	 * To avoid the race with CPU online/offline calls and processor sets,
	 * cpu_lock will be locked for the entire operation.
	 */
	ci = NULL;
	alloff = false;
	mutex_enter(&cpu_lock);
	for (CPU_INFO_FOREACH(cii, ici)) {
		struct schedstate_percpu *ispc;

		/* Only consider CPUs present in the requested mask. */
		if (kcpuset_isset(cpu_index(ici), cpuset) == 0)
			continue;

		ispc = &ici->ci_schedstate;
		/* Check that CPU is not in the processor-set */
		if (ispc->spc_psid != PS_NONE) {
			error = EPERM;
			goto out;
		}
		/* Skip offline CPUs */
		if (ispc->spc_flags & SPCF_OFFLINE) {
			alloff = true;
			continue;
		}
		/* Target CPU to migrate */
		if (ci == NULL) {
			ci = ici;
		}
	}
	if (ci == NULL) {
		if (alloff) {
			/* All CPUs in the set are offline */
			error = EPERM;
			goto out;
		}
		/* Empty set: treated as a request to clear the affinity. */
		kcpuset_unuse(cpuset, &cpulst);
		cpuset = NULL;
	}

	if (SCARG(uap, pid) != 0) {
		/* Find the process */
		mutex_enter(proc_lock);
		p = p_find(SCARG(uap, pid), PFIND_LOCKED);
		if (p == NULL) {
			mutex_exit(proc_lock);
			error = ESRCH;
			goto out;
		}
		/* Lock order: proc_lock -> p_lock (see top of file). */
		mutex_enter(p->p_lock);
		mutex_exit(proc_lock);
		/* Disallow modification of system processes. */
		if ((p->p_flag & PK_SYSTEM) != 0) {
			mutex_exit(p->p_lock);
			error = EPERM;
			goto out;
		}
	} else {
		/* Use the calling process */
		p = l->l_proc;
		mutex_enter(p->p_lock);
	}

	/*
	 * Check the permission.
	 */
	error = kauth_authorize_process(l->l_cred,
	    KAUTH_PROCESS_SCHEDULER_SETAFFINITY, p, NULL, NULL, NULL);
	if (error != 0) {
		mutex_exit(p->p_lock);
		goto out;
	}

#ifdef KERN_SA
	/* Changing the affinity of a SA process is not supported */
	if ((p->p_sflag & (PS_SA | PS_WEXIT)) != 0 || p->p_sa != NULL) {
		mutex_exit(p->p_lock);
		error = EINVAL;
		goto out;
	}
#endif

	/* Find the LWP(s) */
	lcnt = 0;
	lid = SCARG(uap, lid);
	LIST_FOREACH(t, &p->p_lwps, l_sibling) {
		if (lid && lid != t->l_lid)
			continue;
		lwp_lock(t);
		/* It is not allowed to set the affinity for zombie LWPs */
		if (t->l_stat == LSZOMB) {
			lwp_unlock(t);
			continue;
		}
		if (cpuset) {
			/* Set the affinity flag and new CPU set */
			t->l_flag |= LW_AFFINITY;
			kcpuset_use(cpuset);
			/* Release any previous mask (freed after cpu_lock). */
			if (t->l_affinity != NULL)
				kcpuset_unuse(t->l_affinity, &cpulst);
			t->l_affinity = cpuset;
			/* Migrate to another CPU, unlocks LWP */
			lwp_migrate(t, ci);
		} else {
			/* Unset the affinity flag */
			t->l_flag &= ~LW_AFFINITY;
			if (t->l_affinity != NULL)
				kcpuset_unuse(t->l_affinity, &cpulst);
			t->l_affinity = NULL;
			lwp_unlock(t);
		}
		lcnt++;
	}
	mutex_exit(p->p_lock);
	if (lcnt == 0)
		error = ESRCH;
out:
	mutex_exit(&cpu_lock);
	/* Drop our reference; destroy deferred sets outside cpu_lock. */
	if (cpuset != NULL)
		kcpuset_unuse(cpuset, &cpulst);
	kcpuset_destroy(cpulst);
	return error;
}
    462 
/*
 * Get affinity: copy the LWP's CPU affinity mask out to userspace;
 * an all-zero mask is returned if the LWP has no affinity set.
 */
int
sys__sched_getaffinity(struct lwp *l,
    const struct sys__sched_getaffinity_args *uap, register_t *retval)
{
	/* {
		syscallarg(pid_t) pid;
		syscallarg(lwpid_t) lid;
		syscallarg(size_t) size;
		syscallarg(cpuset_t *) cpuset;
	} */
	struct lwp *t;
	kcpuset_t *cpuset;
	int error;

	/*
	 * Allocate a kernel set and copy in the user's buffer; this also
	 * validates the user-supplied size before any lookup is done.
	 */
	error = genkcpuset(&cpuset, SCARG(uap, cpuset), SCARG(uap, size));
	if (error)
		return error;

	/* Locks the LWP (returns with the owning process's p_lock held) */
	t = lwp_find2(SCARG(uap, pid), SCARG(uap, lid));
	if (t == NULL) {
		error = ESRCH;
		goto out;
	}
	/* Check the permission */
	if (kauth_authorize_process(l->l_cred,
	    KAUTH_PROCESS_SCHEDULER_GETAFFINITY, t->l_proc, NULL, NULL, NULL)) {
		mutex_exit(t->l_proc->p_lock);
		error = EPERM;
		goto out;
	}
	/* Snapshot the affinity mask under the LWP lock. */
	lwp_lock(t);
	if (t->l_flag & LW_AFFINITY) {
		KASSERT(t->l_affinity != NULL);
		kcpuset_copy(cpuset, t->l_affinity);
	} else
		kcpuset_zero(cpuset);
	lwp_unlock(t);
	mutex_exit(t->l_proc->p_lock);

	error = kcpuset_copyout(cpuset, SCARG(uap, cpuset), SCARG(uap, size));
out:
	/* Drop the reference taken in genkcpuset(). */
	kcpuset_unuse(cpuset, NULL);
	return error;
}
    511 
/*
 * Yield: voluntarily give up the CPU to other runnable LWPs.
 */
int
sys_sched_yield(struct lwp *l, const void *v, register_t *retval)
{

	yield();
#ifdef KERN_SA
	/* Scheduler activations: notify the SA machinery of the yield. */
	if (l->l_flag & LW_SA) {
		sa_preempt(l);
	}
#endif
	return 0;
}
    527 
/*
 * Sysctl nodes and initialization.
 */
SYSCTL_SETUP(sysctl_sched_setup, "sysctl sched setup")
{
	const struct sysctlnode *node = NULL;

	/* Ensure the top-level "kern" node exists. */
	sysctl_createv(clog, 0, NULL, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "kern", NULL,
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_EOL);
	/* kern.posix_sched: advertised POSIX Process Scheduling version. */
	sysctl_createv(clog, 0, NULL, NULL,
		CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE,
		CTLTYPE_INT, "posix_sched",
		SYSCTL_DESCR("Version of IEEE Std 1003.1 and its "
			     "Process Scheduling option to which the "
			     "system attempts to conform"),
		NULL, _POSIX_PRIORITY_SCHEDULING, NULL, 0,
		CTL_KERN, CTL_CREATE, CTL_EOL);
	/* kern.sched: parent node for the scheduler options below. */
	sysctl_createv(clog, 0, NULL, &node,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "sched",
		SYSCTL_DESCR("Scheduler options"),
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_CREATE, CTL_EOL);

	/* If creating "kern.sched" failed, there is nothing to attach to. */
	if (node == NULL)
		return;

	/* kern.sched.pri_min / pri_max: POSIX real-time priority range. */
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE,
		CTLTYPE_INT, "pri_min",
		SYSCTL_DESCR("Minimal POSIX real-time priority"),
		NULL, SCHED_PRI_MIN, NULL, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE,
		CTLTYPE_INT, "pri_max",
		SYSCTL_DESCR("Maximal POSIX real-time priority"),
		NULL, SCHED_PRI_MAX, NULL, 0,
		CTL_CREATE, CTL_EOL);
}
    571