Home | History | Annotate | Line # | Download | only in kern
sys_sched.c revision 1.36
      1 /*	$NetBSD: sys_sched.c,v 1.36 2011/08/07 13:33:01 rmind Exp $	*/
      2 
      3 /*
      4  * Copyright (c) 2008, 2011 Mindaugas Rasiukevicius <rmind at NetBSD org>
      5  * All rights reserved.
      6  *
      7  * Redistribution and use in source and binary forms, with or without
      8  * modification, are permitted provided that the following conditions
      9  * are met:
     10  * 1. Redistributions of source code must retain the above copyright
     11  *    notice, this list of conditions and the following disclaimer.
     12  * 2. Redistributions in binary form must reproduce the above copyright
     13  *    notice, this list of conditions and the following disclaimer in the
     14  *    documentation and/or other materials provided with the distribution.
     15  *
     16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
     17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
     20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     26  * SUCH DAMAGE.
     27  */
     28 
     29 /*
     30  * System calls relating to the scheduler.
     31  *
     32  * Lock order:
     33  *
     34  *	cpu_lock ->
     35  *	    proc_lock ->
     36  *		proc_t::p_lock ->
     37  *		    lwp_t::lwp_lock
     38  *
     39  * TODO:
     40  *  - Handle pthread_setschedprio() as defined by POSIX;
     41  *  - Handle sched_yield() case for SCHED_FIFO as defined by POSIX;
     42  */
     43 
     44 #include <sys/cdefs.h>
     45 __KERNEL_RCSID(0, "$NetBSD: sys_sched.c,v 1.36 2011/08/07 13:33:01 rmind Exp $");
     46 
     47 #include <sys/param.h>
     48 
     49 #include <sys/cpu.h>
     50 #include <sys/kauth.h>
     51 #include <sys/kmem.h>
     52 #include <sys/lwp.h>
     53 #include <sys/mutex.h>
     54 #include <sys/proc.h>
     55 #include <sys/pset.h>
     56 #include <sys/sa.h>
     57 #include <sys/savar.h>
     58 #include <sys/sched.h>
     59 #include <sys/syscallargs.h>
     60 #include <sys/sysctl.h>
     61 #include <sys/systm.h>
     62 #include <sys/types.h>
     63 #include <sys/unistd.h>
     64 
     65 #include "opt_sa.h"
     66 
     67 static struct sysctllog *sched_sysctl_log;	/* sysctl log handed to sysctl_sched_setup() by sched_init() */
     68 static kauth_listener_t sched_listener;	/* listener handle returned by kauth_listen_scope() in sched_init() */
     69 
     70 /*
     71  * Convert user priority or the in-kernel priority or convert the current
     72  * priority to the appropriate range according to the policy change.
     73  */
     74 static pri_t
     75 convert_pri(lwp_t *l, int policy, pri_t pri)
     76 {
     77 
     78 	/* Convert user priority to the in-kernel */
     79 	if (pri != PRI_NONE) {
     80 		/* Only for real-time threads */
     81 		KASSERT(pri >= SCHED_PRI_MIN && pri <= SCHED_PRI_MAX);
     82 		KASSERT(policy != SCHED_OTHER);
     83 		return PRI_USER_RT + pri;
     84 	}
     85 
     86 	/* Neither policy, nor priority change */
     87 	if (l->l_class == policy)
     88 		return l->l_priority;
     89 
     90 	/* Time-sharing -> real-time */
     91 	if (l->l_class == SCHED_OTHER) {
     92 		KASSERT(policy == SCHED_FIFO || policy == SCHED_RR);
     93 		return PRI_USER_RT;
     94 	}
     95 
     96 	/* Real-time -> time-sharing */
     97 	if (policy == SCHED_OTHER) {
     98 		KASSERT(l->l_class == SCHED_FIFO || l->l_class == SCHED_RR);
     99 		return l->l_priority - PRI_USER_RT;
    100 	}
    101 
    102 	/* Real-time -> real-time */
    103 	return l->l_priority;
    104 }
    105 
    106 int
    107 do_sched_setparam(pid_t pid, lwpid_t lid, int policy,
    108     const struct sched_param *params)
    109 {
    110 	struct proc *p;
    111 	struct lwp *t;
    112 	pri_t pri;
    113 	u_int lcnt;
    114 	int error;
    115 
    116 	error = 0;
    117 
    118 	pri = params->sched_priority;
    119 
    120 	/* If no parameters specified, just return (this should not happen) */
    121 	if (pri == PRI_NONE && policy == SCHED_NONE)
    122 		return 0;
    123 
    124 	/* Validate scheduling class */
    125 	if (policy != SCHED_NONE && (policy < SCHED_OTHER || policy > SCHED_RR))
    126 		return EINVAL;
    127 
    128 	/* Validate priority */
    129 	if (pri != PRI_NONE && (pri < SCHED_PRI_MIN || pri > SCHED_PRI_MAX))
    130 		return EINVAL;
    131 
    132 	if (pid != 0) {
    133 		/* Find the process */
    134 		mutex_enter(proc_lock);
    135 		p = proc_find(pid);
    136 		if (p == NULL) {
    137 			mutex_exit(proc_lock);
    138 			return ESRCH;
    139 		}
    140 		mutex_enter(p->p_lock);
    141 		mutex_exit(proc_lock);
    142 		/* Disallow modification of system processes */
    143 		if ((p->p_flag & PK_SYSTEM) != 0) {
    144 			mutex_exit(p->p_lock);
    145 			return EPERM;
    146 		}
    147 	} else {
    148 		/* Use the calling process */
    149 		p = curlwp->l_proc;
    150 		mutex_enter(p->p_lock);
    151 	}
    152 
    153 	/* Find the LWP(s) */
    154 	lcnt = 0;
    155 	LIST_FOREACH(t, &p->p_lwps, l_sibling) {
    156 		pri_t kpri;
    157 		int lpolicy;
    158 
    159 		if (lid && lid != t->l_lid)
    160 			continue;
    161 
    162 		lcnt++;
    163 		lwp_lock(t);
    164 		lpolicy = (policy == SCHED_NONE) ? t->l_class : policy;
    165 
    166 		/* Disallow setting of priority for SCHED_OTHER threads */
    167 		if (lpolicy == SCHED_OTHER && pri != PRI_NONE) {
    168 			lwp_unlock(t);
    169 			error = EINVAL;
    170 			break;
    171 		}
    172 
    173 		/* Convert priority, if needed */
    174 		kpri = convert_pri(t, lpolicy, pri);
    175 
    176 		/* Check the permission */
    177 		error = kauth_authorize_process(kauth_cred_get(),
    178 		    KAUTH_PROCESS_SCHEDULER_SETPARAM, p, t, KAUTH_ARG(lpolicy),
    179 		    KAUTH_ARG(kpri));
    180 		if (error) {
    181 			lwp_unlock(t);
    182 			break;
    183 		}
    184 
    185 		/* Set the scheduling class, change the priority */
    186 		t->l_class = lpolicy;
    187 		lwp_changepri(t, kpri);
    188 		lwp_unlock(t);
    189 	}
    190 	mutex_exit(p->p_lock);
    191 	return (lcnt == 0) ? ESRCH : error;
    192 }
    193 
    194 /*
    195  * Set scheduling parameters.
    196  */
    197 int
    198 sys__sched_setparam(struct lwp *l, const struct sys__sched_setparam_args *uap,
    199     register_t *retval)
    200 {
    201 	/* {
    202 		syscallarg(pid_t) pid;
    203 		syscallarg(lwpid_t) lid;
    204 		syscallarg(int) policy;
    205 		syscallarg(const struct sched_param *) params;
    206 	} */
    207 	struct sched_param params;
    208 	int error;
    209 
    210 	/* Get the parameters from the user-space */
    211 	error = copyin(SCARG(uap, params), &params, sizeof(params));
    212 	if (error)
    213 		goto out;
    214 
    215 	error = do_sched_setparam(SCARG(uap, pid), SCARG(uap, lid),
    216 	    SCARG(uap, policy), &params);
    217 out:
    218 	return error;
    219 }
    220 
    221 int
    222 do_sched_getparam(pid_t pid, lwpid_t lid, int *policy,
    223     struct sched_param *params)
    224 {
    225 	struct sched_param lparams;
    226 	struct lwp *t;
    227 	int error, lpolicy;
    228 
    229 	/* Locks the LWP */
    230 	t = lwp_find2(pid, lid);
    231 	if (t == NULL)
    232 		return ESRCH;
    233 
    234 	/* Check the permission */
    235 	error = kauth_authorize_process(kauth_cred_get(),
    236 	    KAUTH_PROCESS_SCHEDULER_GETPARAM, t->l_proc, NULL, NULL, NULL);
    237 	if (error != 0) {
    238 		mutex_exit(t->l_proc->p_lock);
    239 		return error;
    240 	}
    241 
    242 	lwp_lock(t);
    243 	lparams.sched_priority = t->l_priority;
    244 	lpolicy = t->l_class;
    245 
    246 	switch (lpolicy) {
    247 	case SCHED_OTHER:
    248 		lparams.sched_priority -= PRI_USER;
    249 		break;
    250 	case SCHED_RR:
    251 	case SCHED_FIFO:
    252 		lparams.sched_priority -= PRI_USER_RT;
    253 		break;
    254 	}
    255 
    256 	if (policy != NULL)
    257 		*policy = lpolicy;
    258 
    259 	if (params != NULL)
    260 		*params = lparams;
    261 
    262 	lwp_unlock(t);
    263 	mutex_exit(t->l_proc->p_lock);
    264 	return error;
    265 }
    266 
    267 /*
    268  * Get scheduling parameters.
    269  */
    270 int
    271 sys__sched_getparam(struct lwp *l, const struct sys__sched_getparam_args *uap,
    272     register_t *retval)
    273 {
    274 	/* {
    275 		syscallarg(pid_t) pid;
    276 		syscallarg(lwpid_t) lid;
    277 		syscallarg(int *) policy;
    278 		syscallarg(struct sched_param *) params;
    279 	} */
    280 	struct sched_param params;
    281 	int error, policy;
    282 
    283 	error = do_sched_getparam(SCARG(uap, pid), SCARG(uap, lid), &policy,
    284 	    &params);
    285 	if (error)
    286 		goto out;
    287 
    288 	error = copyout(&params, SCARG(uap, params), sizeof(params));
    289 	if (error == 0 && SCARG(uap, policy) != NULL)
    290 		error = copyout(&policy, SCARG(uap, policy), sizeof(int));
    291 out:
    292 	return error;
    293 }
    294 
    295 /*
    296  * Allocate the CPU set, and get it from userspace.
    297  */
    298 static int
    299 genkcpuset(kcpuset_t **dset, const cpuset_t *sset, size_t size)
    300 {
    301 	kcpuset_t *kset;
    302 	int error;
    303 
    304 	kcpuset_create(&kset);
    305 	error = kcpuset_copyin(sset, kset, size);
    306 	if (error) {
    307 		kcpuset_unuse(kset, NULL);
    308 	} else {
    309 		*dset = kset;
    310 	}
    311 	return error;
    312 }
    313 
    314 /*
    315  * Set affinity.
    316  */
    317 int
    318 sys__sched_setaffinity(struct lwp *l,
    319     const struct sys__sched_setaffinity_args *uap, register_t *retval)
    320 {
    321 	/* {
    322 		syscallarg(pid_t) pid;
    323 		syscallarg(lwpid_t) lid;
    324 		syscallarg(size_t) size;
    325 		syscallarg(const cpuset_t *) cpuset;
    326 	} */
    327 	kcpuset_t *kcset, *kcpulst = NULL;
    328 	struct cpu_info *ici, *ci;
    329 	struct proc *p;
    330 	struct lwp *t;
    331 	CPU_INFO_ITERATOR cii;
    332 	bool alloff;
    333 	lwpid_t lid;
    334 	u_int lcnt;
    335 	int error;
    336 
    337 	error = genkcpuset(&kcset, SCARG(uap, cpuset), SCARG(uap, size));
    338 	if (error)
    339 		return error;
    340 
    341 	/*
    342 	 * Traverse _each_ CPU to:
    343 	 *  - Check that CPUs in the mask have no assigned processor set.
    344 	 *  - Check that at least one CPU from the mask is online.
    345 	 *  - Find the first target CPU to migrate.
    346 	 *
    347 	 * To avoid the race with CPU online/offline calls and processor sets,
    348 	 * cpu_lock will be locked for the entire operation.
    349 	 */
    350 	ci = NULL;
    351 	alloff = false;
    352 	mutex_enter(&cpu_lock);
    353 	for (CPU_INFO_FOREACH(cii, ici)) {
    354 		struct schedstate_percpu *ispc;
    355 
    356 		if (kcpuset_isset(kcset, cpu_index(ici)) == 0)
    357 			continue;
    358 
    359 		ispc = &ici->ci_schedstate;
    360 		/* Check that CPU is not in the processor-set */
    361 		if (ispc->spc_psid != PS_NONE) {
    362 			error = EPERM;
    363 			goto out;
    364 		}
    365 		/* Skip offline CPUs */
    366 		if (ispc->spc_flags & SPCF_OFFLINE) {
    367 			alloff = true;
    368 			continue;
    369 		}
    370 		/* Target CPU to migrate */
    371 		if (ci == NULL) {
    372 			ci = ici;
    373 		}
    374 	}
    375 	if (ci == NULL) {
    376 		if (alloff) {
    377 			/* All CPUs in the set are offline */
    378 			error = EPERM;
    379 			goto out;
    380 		}
    381 		/* Empty set */
    382 		kcpuset_unuse(kcset, &kcpulst);
    383 		kcset = NULL;
    384 	}
    385 
    386 	if (SCARG(uap, pid) != 0) {
    387 		/* Find the process */
    388 		mutex_enter(proc_lock);
    389 		p = proc_find(SCARG(uap, pid));
    390 		if (p == NULL) {
    391 			mutex_exit(proc_lock);
    392 			error = ESRCH;
    393 			goto out;
    394 		}
    395 		mutex_enter(p->p_lock);
    396 		mutex_exit(proc_lock);
    397 		/* Disallow modification of system processes. */
    398 		if ((p->p_flag & PK_SYSTEM) != 0) {
    399 			mutex_exit(p->p_lock);
    400 			error = EPERM;
    401 			goto out;
    402 		}
    403 	} else {
    404 		/* Use the calling process */
    405 		p = l->l_proc;
    406 		mutex_enter(p->p_lock);
    407 	}
    408 
    409 	/*
    410 	 * Check the permission.
    411 	 */
    412 	error = kauth_authorize_process(l->l_cred,
    413 	    KAUTH_PROCESS_SCHEDULER_SETAFFINITY, p, NULL, NULL, NULL);
    414 	if (error != 0) {
    415 		mutex_exit(p->p_lock);
    416 		goto out;
    417 	}
    418 
    419 #ifdef KERN_SA
    420 	/* Changing the affinity of a SA process is not supported */
    421 	if ((p->p_sflag & (PS_SA | PS_WEXIT)) != 0 || p->p_sa != NULL) {
    422 		mutex_exit(p->p_lock);
    423 		error = EINVAL;
    424 		goto out;
    425 	}
    426 #endif
    427 
    428 	/* Find the LWP(s) */
    429 	lcnt = 0;
    430 	lid = SCARG(uap, lid);
    431 	LIST_FOREACH(t, &p->p_lwps, l_sibling) {
    432 		if (lid && lid != t->l_lid)
    433 			continue;
    434 		lwp_lock(t);
    435 		/* It is not allowed to set the affinity for zombie LWPs */
    436 		if (t->l_stat == LSZOMB) {
    437 			lwp_unlock(t);
    438 			continue;
    439 		}
    440 		if (kcset) {
    441 			/* Set the affinity flag and new CPU set */
    442 			t->l_flag |= LW_AFFINITY;
    443 			kcpuset_use(kcset);
    444 			if (t->l_affinity != NULL)
    445 				kcpuset_unuse(t->l_affinity, &kcpulst);
    446 			t->l_affinity = kcset;
    447 			/* Migrate to another CPU, unlocks LWP */
    448 			lwp_migrate(t, ci);
    449 		} else {
    450 			/* Unset the affinity flag */
    451 			t->l_flag &= ~LW_AFFINITY;
    452 			if (t->l_affinity != NULL)
    453 				kcpuset_unuse(t->l_affinity, &kcpulst);
    454 			t->l_affinity = NULL;
    455 			lwp_unlock(t);
    456 		}
    457 		lcnt++;
    458 	}
    459 	mutex_exit(p->p_lock);
    460 	if (lcnt == 0) {
    461 		error = ESRCH;
    462 	}
    463 out:
    464 	mutex_exit(&cpu_lock);
    465 
    466 	/*
    467 	 * Drop the initial reference (LWPs, if any, have the ownership now),
    468 	 * and destroy whatever is in the G/C list, if filled.
    469 	 */
    470 	if (kcset) {
    471 		kcpuset_unuse(kcset, &kcpulst);
    472 	}
    473 	if (kcpulst) {
    474 		kcpuset_destroy(kcpulst);
    475 	}
    476 	return error;
    477 }
    478 
    479 /*
    480  * Get affinity.
    481  */
    482 int
    483 sys__sched_getaffinity(struct lwp *l,
    484     const struct sys__sched_getaffinity_args *uap, register_t *retval)
    485 {
    486 	/* {
    487 		syscallarg(pid_t) pid;
    488 		syscallarg(lwpid_t) lid;
    489 		syscallarg(size_t) size;
    490 		syscallarg(cpuset_t *) cpuset;
    491 	} */
    492 	struct lwp *t;
    493 	kcpuset_t *kcset;
    494 	int error;
    495 
    496 	error = genkcpuset(&kcset, SCARG(uap, cpuset), SCARG(uap, size));
    497 	if (error)
    498 		return error;
    499 
    500 	/* Locks the LWP */
    501 	t = lwp_find2(SCARG(uap, pid), SCARG(uap, lid));
    502 	if (t == NULL) {
    503 		error = ESRCH;
    504 		goto out;
    505 	}
    506 	/* Check the permission */
    507 	if (kauth_authorize_process(l->l_cred,
    508 	    KAUTH_PROCESS_SCHEDULER_GETAFFINITY, t->l_proc, NULL, NULL, NULL)) {
    509 		mutex_exit(t->l_proc->p_lock);
    510 		error = EPERM;
    511 		goto out;
    512 	}
    513 	lwp_lock(t);
    514 	if (t->l_flag & LW_AFFINITY) {
    515 		KASSERT(t->l_affinity != NULL);
    516 		kcpuset_copy(kcset, t->l_affinity);
    517 	} else {
    518 		kcpuset_zero(kcset);
    519 	}
    520 	lwp_unlock(t);
    521 	mutex_exit(t->l_proc->p_lock);
    522 
    523 	error = kcpuset_copyout(kcset, SCARG(uap, cpuset), SCARG(uap, size));
    524 out:
    525 	kcpuset_unuse(kcset, NULL);
    526 	return error;
    527 }
    528 
    529 /*
    530  * Yield.
    531  */
    532 int
    533 sys_sched_yield(struct lwp *l, const void *v, register_t *retval)
    534 {
    535 
    536 	yield();
    537 #ifdef KERN_SA
    538 	if (l->l_flag & LW_SA) {
    539 		sa_preempt(l);
    540 	}
    541 #endif
    542 	return 0;
    543 }
    544 
    545 /*
    546  * Sysctl nodes and initialization.
    547  */
    548 static void
    549 sysctl_sched_setup(struct sysctllog **clog)
    550 {
    551 	const struct sysctlnode *node = NULL;
    552 
    553 	sysctl_createv(clog, 0, NULL, NULL,
    554 		CTLFLAG_PERMANENT,
    555 		CTLTYPE_NODE, "kern", NULL,
    556 		NULL, 0, NULL, 0,
    557 		CTL_KERN, CTL_EOL);
    558 	sysctl_createv(clog, 0, NULL, NULL,
    559 		CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE,
    560 		CTLTYPE_INT, "posix_sched",
    561 		SYSCTL_DESCR("Version of IEEE Std 1003.1 and its "
    562 			     "Process Scheduling option to which the "
    563 			     "system attempts to conform"),
    564 		NULL, _POSIX_PRIORITY_SCHEDULING, NULL, 0,
    565 		CTL_KERN, CTL_CREATE, CTL_EOL);
    566 	sysctl_createv(clog, 0, NULL, &node,
    567 		CTLFLAG_PERMANENT,
    568 		CTLTYPE_NODE, "sched",
    569 		SYSCTL_DESCR("Scheduler options"),
    570 		NULL, 0, NULL, 0,
    571 		CTL_KERN, CTL_CREATE, CTL_EOL);
    572 
    573 	if (node == NULL)
    574 		return;
    575 
    576 	sysctl_createv(clog, 0, &node, NULL,
    577 		CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE,
    578 		CTLTYPE_INT, "pri_min",
    579 		SYSCTL_DESCR("Minimal POSIX real-time priority"),
    580 		NULL, SCHED_PRI_MIN, NULL, 0,
    581 		CTL_CREATE, CTL_EOL);
    582 	sysctl_createv(clog, 0, &node, NULL,
    583 		CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE,
    584 		CTLTYPE_INT, "pri_max",
    585 		SYSCTL_DESCR("Maximal POSIX real-time priority"),
    586 		NULL, SCHED_PRI_MAX, NULL, 0,
    587 		CTL_CREATE, CTL_EOL);
    588 }
    589 
    590 static int
    591 sched_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
    592     void *arg0, void *arg1, void *arg2, void *arg3)
    593 {
    594 	struct proc *p;
    595 	int result;
    596 
    597 	result = KAUTH_RESULT_DEFER;
    598 	p = arg0;
    599 
    600 	switch (action) {
    601 	case KAUTH_PROCESS_SCHEDULER_GETPARAM:
    602 		if (kauth_cred_uidmatch(cred, p->p_cred))
    603 			result = KAUTH_RESULT_ALLOW;
    604 		break;
    605 
    606 	case KAUTH_PROCESS_SCHEDULER_SETPARAM:
    607 		if (kauth_cred_uidmatch(cred, p->p_cred)) {
    608 			struct lwp *l;
    609 			int policy;
    610 			pri_t priority;
    611 
    612 			l = arg1;
    613 			policy = (int)(unsigned long)arg2;
    614 			priority = (pri_t)(unsigned long)arg3;
    615 
    616 			if ((policy == l->l_class ||
    617 			    (policy != SCHED_FIFO && policy != SCHED_RR)) &&
    618 			    priority <= l->l_priority)
    619 				result = KAUTH_RESULT_ALLOW;
    620 		}
    621 
    622 		break;
    623 
    624 	case KAUTH_PROCESS_SCHEDULER_GETAFFINITY:
    625 		result = KAUTH_RESULT_ALLOW;
    626 		break;
    627 
    628 	case KAUTH_PROCESS_SCHEDULER_SETAFFINITY:
    629 		/* Privileged; we let the secmodel handle this. */
    630 		break;
    631 
    632 	default:
    633 		break;
    634 	}
    635 
    636 	return result;
    637 }
    638 
    639 void
    640 sched_init(void)
    641 {
    642 
    643 	sysctl_sched_setup(&sched_sysctl_log);
    644 
    645 	sched_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
    646 	    sched_listener_cb, NULL);
    647 }
    648