/*	$NetBSD: kern_synch.c,v 1.366 2023/11/22 13:18:48 riastradh Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008, 2009, 2019, 2020, 2023
 *    The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran and
 * Daniel Sieger.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.366 2023/11/22 13:18:48 riastradh Exp $");

#include "opt_kstack.h"
#include "opt_ddb.h"
#include "opt_dtrace.h"

#define	__MUTEX_PRIVATE

#include <sys/param.h>

#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/dtrace_bsd.h>
#include <sys/evcnt.h>
#include <sys/intr.h>
#include <sys/kernel.h>
#include <sys/lockdebug.h>
#include <sys/lwpctl.h>
#include <sys/proc.h>
#include <sys/pserialize.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/sleepq.h>
#include <sys/syncobj.h>
#include <sys/syscall_stats.h>
#include <sys/syslog.h>
#include <sys/systm.h>

#include <uvm/uvm_extern.h>

#include <dev/lockstat.h>

int				dtrace_vtime_active = 0;
dtrace_vtime_switch_func_t	dtrace_vtime_switch_func;

#ifdef DDB
#include <ddb/ddb.h>
#endif

static void	sched_unsleep(struct lwp *, bool);
static void	sched_changepri(struct lwp *, pri_t);
static void	sched_lendpri(struct lwp *, pri_t);

syncobj_t sleep_syncobj = {
	.sobj_name	= "sleep",
	.sobj_flag	= SOBJ_SLEEPQ_SORTED,
	.sobj_boostpri	= PRI_KERNEL,
	.sobj_unsleep	= sleepq_unsleep,
	.sobj_changepri	= sleepq_changepri,
	.sobj_lendpri	= sleepq_lendpri,
	.sobj_owner	= syncobj_noowner,
};

syncobj_t sched_syncobj = {
	.sobj_name	= "sched",
	.sobj_flag	= SOBJ_SLEEPQ_SORTED,
	.sobj_boostpri	= PRI_USER,
	.sobj_unsleep	= sched_unsleep,
	.sobj_changepri	= sched_changepri,
	.sobj_lendpri	= sched_lendpri,
	.sobj_owner	= syncobj_noowner,
};

syncobj_t kpause_syncobj = {
	.sobj_name	= "kpause",
	.sobj_flag	= SOBJ_SLEEPQ_NULL,
	.sobj_boostpri	= PRI_KERNEL,
	.sobj_unsleep	= sleepq_unsleep,
	.sobj_changepri	= sleepq_changepri,
	.sobj_lendpri	= sleepq_lendpri,
	.sobj_owner	= syncobj_noowner,
};

/* "Lightning bolt": once a second sleep address. */
kcondvar_t		lbolt			__cacheline_aligned;

u_int			sched_pstats_ticks	__cacheline_aligned;

/* Preemption event counters. */
static struct evcnt	kpreempt_ev_crit	__cacheline_aligned;
static struct evcnt	kpreempt_ev_klock	__cacheline_aligned;
static struct evcnt	kpreempt_ev_immed	__cacheline_aligned;

void
synch_init(void)
{

	cv_init(&lbolt, "lbolt");

	evcnt_attach_dynamic(&kpreempt_ev_crit, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: critical section");
	evcnt_attach_dynamic(&kpreempt_ev_klock, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: kernel_lock");
	evcnt_attach_dynamic(&kpreempt_ev_immed, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "immediate");
}

/*
 * OBSOLETE INTERFACE
 *
 * General sleep call.  Suspends the current LWP until a wakeup is
 * performed on the specified identifier.  The LWP will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds (0
 * means no timeout).  If pri includes PCATCH flag, signals are checked
 * before and after sleeping, else signals are not checked.  Returns 0 if
 * awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
 * signal needs to be delivered, ERESTART is returned if the current system
 * call should be restarted if possible, and EINTR is returned if the system
 * call should be interrupted by the signal (return EINTR).
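 *
 * A hedged, illustrative sketch only (the softc and its sc_count field
 * below are made-up names, not from this file): the classic use of the
 * obsolete pair was
 *
 *	while (sc->sc_count == 0)
 *		(void)tsleep(&sc->sc_count, PZERO, "scwait", 0);
 *
 * with the waker doing wakeup(&sc->sc_count) after updating sc_count.
 * Note that the check and the sleep are not covered by a lock here,
 * which is why new code should prefer condition variables (cv_wait(9))
 * or mtsleep() with an interlock.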
 */
int
tsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	kmutex_t *mp;
	bool catch_p;
	int nlocks;

	KASSERT((l->l_pflag & LP_INTR) == 0);
	KASSERT(ident != &lbolt);
	//KASSERT(KERNEL_LOCKED_P());

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(NULL, 0);
		return 0;
	}

	catch_p = priority & PCATCH;
	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	nlocks = sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj, catch_p);
	return sleepq_block(timo, catch_p, &sleep_syncobj, nlocks);
}

int
mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
	kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	kmutex_t *mp;
	bool catch_p;
	int error, nlocks;

	KASSERT((l->l_pflag & LP_INTR) == 0);
	KASSERT(ident != &lbolt);

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(mtx, (priority & PNORELOCK) != 0);
		return 0;
	}

	catch_p = priority & PCATCH;
	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	nlocks = sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj, catch_p);
	mutex_exit(mtx);
	error = sleepq_block(timo, catch_p, &sleep_syncobj, nlocks);

	if ((priority & PNORELOCK) == 0)
		mutex_enter(mtx);

	return error;
}

/*
 * General sleep call for situations where a wake-up is not expected.
 */
int
kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	int error, nlocks;

	KASSERTMSG(timo != 0 || intr, "wmesg=%s intr=%s timo=%d mtx=%p",
	    wmesg, intr ? "true" : "false", timo, mtx);

	if (sleepq_dontsleep(l))
		return sleepq_abort(NULL, 0);

	if (mtx != NULL)
		mutex_exit(mtx);
	nlocks = sleepq_enter(NULL, l, NULL);
	sleepq_enqueue(NULL, l, wmesg, &kpause_syncobj, intr);
	error = sleepq_block(timo, intr, &kpause_syncobj, nlocks);
	if (mtx != NULL)
		mutex_enter(mtx);

	return error;
}

/*
 * OBSOLETE INTERFACE
 *
 * Make all LWPs sleeping on the specified identifier runnable.
 */
void
wakeup(wchan_t ident)
{
	sleepq_t *sq;
	kmutex_t *mp;

	if (__predict_false(cold))
		return;

	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_wake(sq, ident, (u_int)-1, mp);
}

/*
 * General yield call.  Puts the current LWP back on its run queue and
 * performs a context switch.
 */
void
yield(void)
{
	struct lwp *l = curlwp;
	int nlocks;

	KERNEL_UNLOCK_ALL(l, &nlocks);
	lwp_lock(l);

	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);

	spc_lock(l->l_cpu);
	mi_switch(l);
	KERNEL_LOCK(nlocks, l);
}

/*
 * General preemption call.  Puts the current LWP back on its run queue
 * and performs an involuntary context switch.  Different from yield()
 * in that:
 *
 * - It's counted differently (involuntary vs. voluntary).
 * - Realtime threads go to the head of their runqueue vs. tail for yield().
 */
void
preempt(void)
{
	struct lwp *l = curlwp;
	int nlocks;

	KERNEL_UNLOCK_ALL(l, &nlocks);
	lwp_lock(l);

	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);

	spc_lock(l->l_cpu);
	l->l_pflag |= LP_PREEMPTING;
	mi_switch(l);
	KERNEL_LOCK(nlocks, l);
}

/*
 * Return true if the current LWP should yield the processor.  Intended to
 * be used by long-running code in kernel.
 */
inline bool
preempt_needed(void)
{
	lwp_t *l = curlwp;
	int needed;

	KPREEMPT_DISABLE(l);
	needed = l->l_cpu->ci_want_resched;
	KPREEMPT_ENABLE(l);

	return (needed != 0);
}

/*
 * A breathing point for long running code in kernel.
 */
void
preempt_point(void)
{

	if (__predict_false(preempt_needed())) {
		preempt();
	}
}

/*
 * Handle a request made by another agent to preempt the current LWP
 * in-kernel.  Usually called when l_dopreempt may be non-zero.
 *
 * Character addresses for lockstat only.
 */
static char	kpreempt_is_disabled;
static char	kernel_lock_held;
static char	is_softint_lwp;
static char	spl_is_raised;

bool
kpreempt(uintptr_t where)
{
	uintptr_t failed;
	lwp_t *l;
	int s, dop, lsflag;

	l = curlwp;
	failed = 0;
	while ((dop = l->l_dopreempt) != 0) {
		if (l->l_stat != LSONPROC) {
			/*
			 * About to block (or die), let it happen.
			 * Doesn't really count as "preemption has
			 * been blocked", since we're going to
			 * context switch.
			 */
			atomic_swap_uint(&l->l_dopreempt, 0);
			return true;
		}
		KASSERT((l->l_flag & LW_IDLE) == 0);
		if (__predict_false(l->l_nopreempt != 0)) {
			/* LWP holds preemption disabled, explicitly. */
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_crit.ev_count++;
			}
			failed = (uintptr_t)&kpreempt_is_disabled;
			break;
		}
		if (__predict_false((l->l_pflag & LP_INTR) != 0)) {
			/* Can't preempt soft interrupts yet. */
			atomic_swap_uint(&l->l_dopreempt, 0);
			failed = (uintptr_t)&is_softint_lwp;
			break;
		}
		s = splsched();
		if (__predict_false(l->l_blcnt != 0 ||
		    curcpu()->ci_biglock_wanted != NULL)) {
			/* Hold or want kernel_lock, code is not MT safe. */
			splx(s);
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_klock.ev_count++;
			}
			failed = (uintptr_t)&kernel_lock_held;
			break;
		}
		if (__predict_false(!cpu_kpreempt_enter(where, s))) {
			/*
			 * It may be that the IPL is too high.
			 * kpreempt_enter() can schedule an
			 * interrupt to retry later.
			 */
			splx(s);
			failed = (uintptr_t)&spl_is_raised;
			break;
		}
		/* Do it! */
		if (__predict_true((dop & DOPREEMPT_COUNTED) == 0)) {
			kpreempt_ev_immed.ev_count++;
		}
		lwp_lock(l);
		l->l_pflag |= LP_PREEMPTING;
		spc_lock(l->l_cpu);
		mi_switch(l);
		l->l_nopreempt++;
		splx(s);

		/* Take care of any MD cleanup. */
		cpu_kpreempt_exit(where);
		l->l_nopreempt--;
	}

	if (__predict_true(!failed)) {
		return false;
	}

	/* Record preemption failure for reporting via lockstat. */
	atomic_or_uint(&l->l_dopreempt, DOPREEMPT_COUNTED);
	lsflag = 0;
	LOCKSTAT_ENTER(lsflag);
	if (__predict_false(lsflag)) {
		if (where == 0) {
			where = (uintptr_t)__builtin_return_address(0);
		}
		/* Preemption is on, might recurse, so make it atomic. */
		if (atomic_cas_ptr_ni((void *)&l->l_pfailaddr, NULL,
		    (void *)where) == NULL) {
			LOCKSTAT_START_TIMER(lsflag, l->l_pfailtime);
			l->l_pfaillock = failed;
		}
	}
	LOCKSTAT_EXIT(lsflag);
	return true;
}

/*
 * Return true if preemption is explicitly disabled.
 */
bool
kpreempt_disabled(void)
{
	const lwp_t *l = curlwp;

	return l->l_nopreempt != 0 || l->l_stat == LSZOMB ||
	    (l->l_flag & LW_IDLE) != 0 || (l->l_pflag & LP_INTR) != 0 ||
	    cpu_kpreempt_disabled();
}

/*
 * Disable kernel preemption.
 */
void
kpreempt_disable(void)
{

	KPREEMPT_DISABLE(curlwp);
}

/*
 * Reenable kernel preemption.
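 *
 * A hedged usage sketch (not taken from this file): code that must stay
 * on the current CPU while it touches per-CPU state typically brackets
 * the access with
 *
 *	kpreempt_disable();
 *	... work on curcpu()'s private data ...
 *	kpreempt_enable();
 *
 * The pair nests, since KPREEMPT_DISABLE()/KPREEMPT_ENABLE() only adjust
 * curlwp's l_nopreempt count; a preemption deferred while the count was
 * raised is processed once it drops back to zero.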
 */
void
kpreempt_enable(void)
{

	KPREEMPT_ENABLE(curlwp);
}

/*
 * Compute the amount of time during which the current lwp was running.
 *
 * - update l_rtime unless it's an idle lwp.
 */

void
updatertime(lwp_t *l, const struct bintime *now)
{
	static bool backwards = false;

	if (__predict_false(l->l_flag & LW_IDLE))
		return;

	if (__predict_false(bintimecmp(now, &l->l_stime, <)) && !backwards) {
		char caller[128];

#ifdef DDB
		db_symstr(caller, sizeof(caller),
		    (db_expr_t)(intptr_t)__builtin_return_address(0),
		    DB_STGY_PROC);
#else
		snprintf(caller, sizeof(caller), "%p",
		    __builtin_return_address(0));
#endif
		backwards = true;
		printf("WARNING: lwp %ld (%s%s%s) flags 0x%x:"
		    " timecounter went backwards"
		    " from (%jd + 0x%016"PRIx64"/2^64) sec"
		    " to (%jd + 0x%016"PRIx64"/2^64) sec"
		    " in %s\n",
		    (long)l->l_lid,
		    l->l_proc->p_comm,
		    l->l_name ? " " : "",
		    l->l_name ? l->l_name : "",
		    l->l_pflag,
		    (intmax_t)l->l_stime.sec, l->l_stime.frac,
		    (intmax_t)now->sec, now->frac,
		    caller);
	}

	/* rtime += now - stime */
	bintime_add(&l->l_rtime, now);
	bintime_sub(&l->l_rtime, &l->l_stime);
}

/*
 * Select the next LWP from the current CPU to run.
 */
static inline lwp_t *
nextlwp(struct cpu_info *ci, struct schedstate_percpu *spc)
{
	lwp_t *newl;

	/*
	 * Let sched_nextlwp() select the LWP to run the CPU next.
	 * If no LWP is runnable, select the idle LWP.
	 *
	 * On arrival here LWPs on a run queue are locked by spc_mutex which
	 * is currently held.  Idle LWPs are always locked by spc_lwplock,
	 * which may or may not be held here.  On exit from this code block,
	 * in all cases newl is locked by spc_lwplock.
	 */
	newl = sched_nextlwp();
	if (newl != NULL) {
		sched_dequeue(newl);
		KASSERT(lwp_locked(newl, spc->spc_mutex));
		KASSERT(newl->l_cpu == ci);
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
		newl->l_boostpri = PRI_NONE;
		spc->spc_curpriority = lwp_eprio(newl);
		spc->spc_flags &= ~(SPCF_SWITCHCLEAR | SPCF_IDLE);
		lwp_setlock(newl, spc->spc_lwplock);
	} else {
		/*
		 * The idle LWP does not get set to LSONPROC, because
		 * otherwise it screws up the output from top(1) etc.
		 */
		newl = ci->ci_data.cpu_idlelwp;
		newl->l_pflag |= LP_RUNNING;
		spc->spc_curpriority = PRI_IDLE;
		spc->spc_flags = (spc->spc_flags & ~SPCF_SWITCHCLEAR) |
		    SPCF_IDLE;
	}

	/*
	 * Only clear want_resched if there are no pending (slow) software
	 * interrupts.  We can do this without an atomic, because no new
	 * LWPs can appear in the queue due to our hold on spc_mutex, and
	 * the update to ci_want_resched will become globally visible before
	 * the release of spc_mutex becomes globally visible.
	 */
	if (ci->ci_data.cpu_softints == 0)
		ci->ci_want_resched = 0;

	return newl;
}

/*
 * The machine independent parts of context switch.
 *
 * NOTE: l->l_cpu is not changed in this routine, because an LWP never
 * changes its own l_cpu (that would screw up curcpu on many ports and could
 * cause all kinds of other evil stuff).  l_cpu is always changed by some
 * other actor, when it's known the LWP is not running (the LP_RUNNING flag
 * is checked under lock).
 */
void
mi_switch(lwp_t *l)
{
	struct cpu_info *ci;
	struct schedstate_percpu *spc;
	struct lwp *newl;
	kmutex_t *lock;
	int oldspl;
	struct bintime bt;
	bool returning;

	KASSERT(lwp_locked(l, NULL));
	KASSERT(kpreempt_disabled());
	KASSERT(mutex_owned(curcpu()->ci_schedstate.spc_mutex));
	KASSERTMSG(l->l_blcnt == 0, "kernel_lock leaked");

	kstack_check_magic(l);

	binuptime(&bt);

	KASSERTMSG(l == curlwp, "l %p curlwp %p", l, curlwp);
	KASSERT((l->l_pflag & LP_RUNNING) != 0);
	KASSERT(l->l_cpu == curcpu() || l->l_stat == LSRUN);
	ci = curcpu();
	spc = &ci->ci_schedstate;
	returning = false;
	newl = NULL;

	/*
	 * If we have been asked to switch to a specific LWP, then there
	 * is no need to inspect the run queues.  If a soft interrupt is
	 * blocking, then return to the interrupted thread without adjusting
	 * VM context or its start time: neither have been changed in order
	 * to take the interrupt.
	 */
	if (l->l_switchto != NULL) {
		if ((l->l_pflag & LP_INTR) != 0) {
			returning = true;
			softint_block(l);
			if ((l->l_pflag & LP_TIMEINTR) != 0)
				updatertime(l, &bt);
		}
		newl = l->l_switchto;
		l->l_switchto = NULL;
	}
#ifndef __HAVE_FAST_SOFTINTS
	else if (ci->ci_data.cpu_softints != 0) {
		/* There are pending soft interrupts, so pick one. */
		newl = softint_picklwp();
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
	}
#endif	/* !__HAVE_FAST_SOFTINTS */

	/*
	 * If on the CPU and we have gotten this far, then we must yield.
	 */
	if (l->l_stat == LSONPROC && l != newl) {
		KASSERT(lwp_locked(l, spc->spc_lwplock));
		KASSERT((l->l_flag & LW_IDLE) == 0);
		l->l_stat = LSRUN;
		lwp_setlock(l, spc->spc_mutex);
		sched_enqueue(l);
		sched_preempted(l);

		/*
		 * Handle migration.  Note that "migrating LWP" may
		 * be reset here, if interrupt/preemption happens
		 * early in idle LWP.
		 */
		if (l->l_target_cpu != NULL && (l->l_pflag & LP_BOUND) == 0) {
			KASSERT((l->l_pflag & LP_INTR) == 0);
			spc->spc_migrating = l;
		}
	}

	/* Pick new LWP to run. */
	if (newl == NULL) {
		newl = nextlwp(ci, spc);
	}

	/* Items that must be updated with the CPU locked. */
	if (!returning) {
		/* Count time spent in current system call */
		SYSCALL_TIME_SLEEP(l);

		updatertime(l, &bt);

		/* Update the new LWP's start time. */
		newl->l_stime = bt;

		/*
		 * ci_curlwp changes when a fast soft interrupt occurs.
		 * We use ci_onproc to keep track of which kernel or
		 * user thread is running 'underneath' the software
		 * interrupt.  This is important for time accounting,
		 * itimers and forcing user threads to preempt (aston).
		 */
		ci->ci_onproc = newl;
	}

	/*
	 * Preemption related tasks.  Must be done holding spc_mutex.  Clear
	 * l_dopreempt without an atomic - it's only ever set non-zero by
	 * sched_resched_cpu() which also holds spc_mutex, and only ever
	 * cleared by the LWP itself (us) with atomics when not under lock.
	 */
	l->l_dopreempt = 0;
	if (__predict_false(l->l_pfailaddr != 0)) {
		LOCKSTAT_FLAG(lsflag);
		LOCKSTAT_ENTER(lsflag);
		LOCKSTAT_STOP_TIMER(lsflag, l->l_pfailtime);
		LOCKSTAT_EVENT_RA(lsflag, l->l_pfaillock, LB_NOPREEMPT|LB_SPIN,
		    1, l->l_pfailtime, l->l_pfailaddr);
		LOCKSTAT_EXIT(lsflag);
		l->l_pfailtime = 0;
		l->l_pfaillock = 0;
		l->l_pfailaddr = 0;
	}

	if (l != newl) {
		struct lwp *prevlwp;

		/* Release all locks, but leave the current LWP locked */
		if (l->l_mutex == spc->spc_mutex) {
			/*
			 * Drop spc_lwplock, if the current LWP has been moved
			 * to the run queue (it is now locked by spc_mutex).
			 */
			mutex_spin_exit(spc->spc_lwplock);
		} else {
			/*
			 * Otherwise, drop the spc_mutex, we are done with the
			 * run queues.
			 */
			mutex_spin_exit(spc->spc_mutex);
		}

		/* We're down to only one lock, so do debug checks. */
		LOCKDEBUG_BARRIER(l->l_mutex, 1);

		/* Count the context switch. */
		CPU_COUNT(CPU_COUNT_NSWTCH, 1);
		if ((l->l_pflag & LP_PREEMPTING) != 0) {
			l->l_ru.ru_nivcsw++;
			l->l_pflag &= ~LP_PREEMPTING;
		} else {
			l->l_ru.ru_nvcsw++;
		}

		/*
		 * Increase the count of spin-mutexes before the release
		 * of the last lock - we must remain at IPL_SCHED after
		 * releasing the lock.
		 */
		KASSERTMSG(ci->ci_mtx_count == -1,
		    "%s: cpu%u: ci_mtx_count (%d) != -1 "
		    "(block with spin-mutex held)",
		     __func__, cpu_index(ci), ci->ci_mtx_count);
		oldspl = MUTEX_SPIN_OLDSPL(ci);
		ci->ci_mtx_count = -2;

		/* Update status for lwpctl, if present. */
		if (l->l_lwpctl != NULL) {
			l->l_lwpctl->lc_curcpu = (l->l_stat == LSZOMB ?
			    LWPCTL_CPU_EXITED : LWPCTL_CPU_NONE);
		}

		/*
		 * If curlwp is a soft interrupt LWP, there's nobody on the
		 * other side to unlock - we're returning into an assembly
		 * trampoline.  Unlock now.  This is safe because this is a
		 * kernel LWP and is bound to current CPU: the worst anyone
		 * else will do to it, is to put it back onto this CPU's run
		 * queue (and the CPU is busy here right now!).
		 */
		if (returning) {
			/* Keep IPL_SCHED after this; MD code will fix up. */
			l->l_pflag &= ~LP_RUNNING;
			lwp_unlock(l);
		} else {
			/* A normal LWP: save old VM context. */
			pmap_deactivate(l);
		}

		/*
		 * If DTrace has set the active vtime enum to anything
		 * other than INACTIVE (0), then it should have set the
		 * function to call.
		 */
		if (__predict_false(dtrace_vtime_active)) {
			(*dtrace_vtime_switch_func)(newl);
		}

		/*
		 * We must ensure not to come here from inside a read section.
		 */
		KASSERT(pserialize_not_in_read_section());

		/* Switch to the new LWP. */
#ifdef MULTIPROCESSOR
		KASSERT(curlwp == ci->ci_curlwp);
#endif
		KASSERTMSG(l == curlwp, "l %p curlwp %p", l, curlwp);
		prevlwp = cpu_switchto(l, newl, returning);
		ci = curcpu();
#ifdef MULTIPROCESSOR
		KASSERT(curlwp == ci->ci_curlwp);
#endif
		KASSERTMSG(l == curlwp, "l %p curlwp %p prevlwp %p",
		    l, curlwp, prevlwp);
		KASSERT(prevlwp != NULL);
		KASSERT(l->l_cpu == ci);
		KASSERT(ci->ci_mtx_count == -2);

		/*
		 * Immediately mark the previous LWP as no longer running
		 * and unlock (to keep lock wait times as short as possible).
		 * We'll still be at IPL_SCHED afterwards.  If a zombie,
		 * don't touch after clearing LP_RUNNING as it could be
		 * reaped by another CPU.  Issue a memory barrier to ensure
		 * this.
		 *
		 * atomic_store_release matches atomic_load_acquire in
		 * lwp_free.
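		 *
		 * (Illustrative sketch only, not a verbatim quote of
		 * lwp_free(): before tearing the zombie down, the reaper
		 * waits until it observes
		 *
		 *	(atomic_load_acquire(&l->l_pflag) & LP_RUNNING) == 0
		 *
		 * so the release below publishes this CPU's stores to the
		 * LWP before the reaper can free it.)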
		 */
		KASSERT((prevlwp->l_pflag & LP_RUNNING) != 0);
		lock = prevlwp->l_mutex;
		if (__predict_false(prevlwp->l_stat == LSZOMB)) {
			atomic_store_release(&prevlwp->l_pflag,
			    prevlwp->l_pflag & ~LP_RUNNING);
		} else {
			prevlwp->l_pflag &= ~LP_RUNNING;
		}
		mutex_spin_exit(lock);

		/*
		 * Switched away - we have new curlwp.
		 * Restore VM context and IPL.
		 */
		pmap_activate(l);
		pcu_switchpoint(l);

		/* Update status for lwpctl, if present. */
		if (l->l_lwpctl != NULL) {
			l->l_lwpctl->lc_curcpu = (int)cpu_index(ci);
			l->l_lwpctl->lc_pctr++;
		}

		/*
		 * Normalize the spin mutex count and restore the previous
		 * SPL.  Note that, unless the caller disabled preemption,
		 * we can be preempted at any time after this splx().
		 */
		KASSERT(l->l_cpu == ci);
		KASSERT(ci->ci_mtx_count == -1);
		ci->ci_mtx_count = 0;
		splx(oldspl);
	} else {
		/* Nothing to do - just unlock and return. */
		mutex_spin_exit(spc->spc_mutex);
		l->l_pflag &= ~LP_PREEMPTING;
		lwp_unlock(l);
	}

	KASSERT(l == curlwp);
	KASSERT(l->l_stat == LSONPROC || (l->l_flag & LW_IDLE) != 0);

	SYSCALL_TIME_WAKEUP(l);
	LOCKDEBUG_BARRIER(NULL, 1);
}

/*
 * setrunnable: change LWP state to be runnable, placing it on the run queue.
 *
 * Call with the process and LWP locked.  Will return with the LWP unlocked.
 */
void
setrunnable(struct lwp *l)
{
	struct proc *p = l->l_proc;
	struct cpu_info *ci;
	kmutex_t *oldlock;

	KASSERT((l->l_flag & LW_IDLE) == 0);
	KASSERT((l->l_flag & LW_DBGSUSPEND) == 0);
	KASSERT(mutex_owned(p->p_lock));
	KASSERT(lwp_locked(l, NULL));
	KASSERT(l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex);

	switch (l->l_stat) {
	case LSSTOP:
		/*
		 * If we're being traced (possibly because someone attached us
		 * while we were stopped), check for a signal from the debugger.
		 */
		if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xsig != 0)
			signotify(l);
		p->p_nrlwps++;
		break;
	case LSSUSPENDED:
		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
		l->l_flag &= ~LW_WSUSPEND;
		p->p_nrlwps++;
		cv_broadcast(&p->p_lwpcv);
		break;
	case LSSLEEP:
		KASSERT(l->l_wchan != NULL);
		break;
	case LSIDL:
		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
		break;
	default:
		panic("setrunnable: lwp %p state was %d", l, l->l_stat);
	}

	/*
	 * If the LWP was sleeping, start it again.
	 */
	if (l->l_wchan != NULL) {
		l->l_stat = LSSLEEP;
		/* lwp_unsleep() will release the lock. */
		lwp_unsleep(l, true);
		return;
	}

	/*
	 * If the LWP is still on the CPU, mark it as LSONPROC.  It may be
	 * about to call mi_switch(), in which case it will yield.
	 */
	if ((l->l_pflag & LP_RUNNING) != 0) {
		l->l_stat = LSONPROC;
		l->l_slptime = 0;
		lwp_unlock(l);
		return;
	}

	/*
	 * Look for a CPU to run.
	 * Set the LWP runnable.
	 */
	ci = sched_takecpu(l);
	l->l_cpu = ci;
	spc_lock(ci);
	oldlock = lwp_setlock(l, l->l_cpu->ci_schedstate.spc_mutex);
	sched_setrunnable(l);
	l->l_stat = LSRUN;
	l->l_slptime = 0;
	sched_enqueue(l);
	sched_resched_lwp(l, true);
	/* SPC & LWP now unlocked. */
	mutex_spin_exit(oldlock);
}

/*
 * suspendsched:
 *
 *	Convert all non-LW_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
 */
void
suspendsched(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	struct lwp *l;
	struct proc *p;

	/*
	 * We do this by process in order not to violate the locking rules.
	 */
	mutex_enter(&proc_lock);
	PROCLIST_FOREACH(p, &allproc) {
		mutex_enter(p->p_lock);
		if ((p->p_flag & PK_SYSTEM) != 0) {
			mutex_exit(p->p_lock);
			continue;
		}

		if (p->p_stat != SSTOP) {
			if (p->p_stat != SZOMB && p->p_stat != SDEAD) {
				p->p_pptr->p_nstopchild++;
				p->p_waited = 0;
			}
			p->p_stat = SSTOP;
		}

		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if (l == curlwp)
				continue;

			lwp_lock(l);

			/*
			 * Set LW_WREBOOT so that the LWP will suspend itself
			 * when it tries to return to user mode.  We want to
			 * try to get as many LWPs as possible to
			 * the user / kernel boundary, so that they will
			 * release any locks that they hold.
			 */
			l->l_flag |= (LW_WREBOOT | LW_WSUSPEND);

			if (l->l_stat == LSSLEEP &&
			    (l->l_flag & LW_SINTR) != 0) {
				/* setrunnable() will release the lock. */
				setrunnable(l);
				continue;
			}

			lwp_unlock(l);
		}

		mutex_exit(p->p_lock);
	}
	mutex_exit(&proc_lock);

	/*
	 * Kick all CPUs to make them preempt any LWPs running in user mode.
	 * They'll trap into the kernel and suspend themselves in userret().
	 *
	 * Unusually, we don't hold any other scheduler object locked, which
	 * would keep preemption off for sched_resched_cpu(), so disable it
	 * explicitly.
	 */
	kpreempt_disable();
	for (CPU_INFO_FOREACH(cii, ci)) {
		spc_lock(ci);
		sched_resched_cpu(ci, PRI_KERNEL, true);
		/* spc now unlocked */
	}
	kpreempt_enable();
}

/*
 * sched_unsleep:
 *
 *	This is called when the LWP has not been awoken normally but instead
 *	interrupted: for example, if the sleep timed out.  Because of this,
 *	it's not a valid action for running or idle LWPs.
 */
static void
sched_unsleep(struct lwp *l, bool cleanup)
{

	lwp_unlock(l);
	panic("sched_unsleep");
}

static void
sched_changepri(struct lwp *l, pri_t pri)
{
	struct schedstate_percpu *spc;
	struct cpu_info *ci;

	KASSERT(lwp_locked(l, NULL));

	ci = l->l_cpu;
	spc = &ci->ci_schedstate;

	if (l->l_stat == LSRUN) {
		KASSERT(lwp_locked(l, spc->spc_mutex));
		sched_dequeue(l);
		l->l_priority = pri;
		sched_enqueue(l);
		sched_resched_lwp(l, false);
	} else if (l->l_stat == LSONPROC && l->l_class != SCHED_OTHER) {
		/* On priority drop, only evict realtime LWPs. */
		KASSERT(lwp_locked(l, spc->spc_lwplock));
		l->l_priority = pri;
		spc_lock(ci);
		sched_resched_cpu(ci, spc->spc_maxpriority, true);
		/* spc now unlocked */
	} else {
		l->l_priority = pri;
	}
}

static void
sched_lendpri(struct lwp *l, pri_t pri)
{
	struct schedstate_percpu *spc;
	struct cpu_info *ci;

	KASSERT(lwp_locked(l, NULL));

	ci = l->l_cpu;
	spc = &ci->ci_schedstate;

	if (l->l_stat == LSRUN) {
		KASSERT(lwp_locked(l, spc->spc_mutex));
		sched_dequeue(l);
		l->l_inheritedprio = pri;
		l->l_auxprio = MAX(l->l_inheritedprio, l->l_protectprio);
		sched_enqueue(l);
		sched_resched_lwp(l, false);
	} else if (l->l_stat == LSONPROC && l->l_class != SCHED_OTHER) {
		/* On priority drop, only evict realtime LWPs. */
		KASSERT(lwp_locked(l, spc->spc_lwplock));
		l->l_inheritedprio = pri;
		l->l_auxprio = MAX(l->l_inheritedprio, l->l_protectprio);
		spc_lock(ci);
		sched_resched_cpu(ci, spc->spc_maxpriority, true);
		/* spc now unlocked */
	} else {
		l->l_inheritedprio = pri;
		l->l_auxprio = MAX(l->l_inheritedprio, l->l_protectprio);
	}
}

struct lwp *
syncobj_noowner(wchan_t wchan)
{

	return NULL;
}

/* Decay 95% of proc::p_pctcpu in 60 seconds, ccpu = exp(-1/20) */
const fixpt_t ccpu = 0.95122942450071400909 * FSCALE;

/*
 * Constants for averages over 1, 5 and 15 minutes when sampling at
 * 5 second intervals.
 */
static const fixpt_t cexp[ ] = {
	0.9200444146293232 * FSCALE,	/* exp(-1/12) */
	0.9834714538216174 * FSCALE,	/* exp(-1/60) */
	0.9944598480048967 * FSCALE,	/* exp(-1/180) */
};

/*
 * sched_pstats:
 *
 * => Update process statistics and check CPU resource allocation.
 * => Call scheduler-specific hook to eventually adjust LWP priorities.
 * => Compute load average of a quantity on 1, 5 and 15 minute intervals.
 */
void
sched_pstats(void)
{
	struct loadavg *avg = &averunnable;
	const int clkhz = (stathz != 0 ? stathz : hz);
	static bool backwardslwp = false;
	static bool backwardsproc = false;
	static u_int lavg_count = 0;
	struct proc *p;
	int nrun;

	sched_pstats_ticks++;
	if (++lavg_count >= 5) {
		lavg_count = 0;
		nrun = 0;
	}
	mutex_enter(&proc_lock);
	PROCLIST_FOREACH(p, &allproc) {
		struct lwp *l;
		struct rlimit *rlim;
		time_t runtm;
		int sig;

		/* Increment sleep time (if sleeping), ignore overflow. */
		mutex_enter(p->p_lock);
		runtm = p->p_rtime.sec;
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			fixpt_t lpctcpu;
			u_int lcpticks;

			if (__predict_false((l->l_flag & LW_IDLE) != 0))
				continue;
			lwp_lock(l);
			if (__predict_false(l->l_rtime.sec < 0) &&
			    !backwardslwp) {
				backwardslwp = true;
				printf("WARNING: lwp %ld (%s%s%s): "
				    "negative runtime: "
				    "(%jd + 0x%016"PRIx64"/2^64) sec\n",
				    (long)l->l_lid,
				    l->l_proc->p_comm,
				    l->l_name ? " " : "",
				    l->l_name ? l->l_name : "",
				    (intmax_t)l->l_rtime.sec,
				    l->l_rtime.frac);
			}
			runtm += l->l_rtime.sec;
			l->l_swtime++;
			sched_lwp_stats(l);

			/* For load average calculation. */
			if (__predict_false(lavg_count == 0) &&
			    (l->l_flag & (LW_SINTR | LW_SYSTEM)) == 0) {
				switch (l->l_stat) {
				case LSSLEEP:
					if (l->l_slptime > 1) {
						break;
					}
					/* FALLTHROUGH */
				case LSRUN:
				case LSONPROC:
				case LSIDL:
					nrun++;
				}
			}
			lwp_unlock(l);

			l->l_pctcpu = (l->l_pctcpu * ccpu) >> FSHIFT;
			if (l->l_slptime != 0)
				continue;

			lpctcpu = l->l_pctcpu;
			lcpticks = atomic_swap_uint(&l->l_cpticks, 0);
			lpctcpu += ((FSCALE - ccpu) *
			    (lcpticks * FSCALE / clkhz)) >> FSHIFT;
			l->l_pctcpu = lpctcpu;
		}
		/* Calculating p_pctcpu only for ps(1) */
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;

		if (__predict_false(runtm < 0)) {
			if (!backwardsproc) {
				backwardsproc = true;
				printf("WARNING: pid %ld (%s): "
				    "negative runtime; "
				    "monotonic clock has gone backwards\n",
				    (long)p->p_pid, p->p_comm);
			}
			mutex_exit(p->p_lock);
			continue;
		}

		/*
		 * Check if the process exceeds its CPU resource allocation.
		 * If over the hard limit, kill it with SIGKILL.
		 * If over the soft limit, send SIGXCPU and raise
		 * the soft limit a little.
		 */
		rlim = &p->p_rlimit[RLIMIT_CPU];
		sig = 0;
		if (__predict_false(runtm >= rlim->rlim_cur)) {
			if (runtm >= rlim->rlim_max) {
				sig = SIGKILL;
				log(LOG_NOTICE,
				    "pid %d, command %s, is killed: %s\n",
				    p->p_pid, p->p_comm, "exceeded RLIMIT_CPU");
				uprintf("pid %d, command %s, is killed: %s\n",
				    p->p_pid, p->p_comm, "exceeded RLIMIT_CPU");
			} else {
				sig = SIGXCPU;
				if (rlim->rlim_cur < rlim->rlim_max)
					rlim->rlim_cur += 5;
			}
		}
		mutex_exit(p->p_lock);
		if (__predict_false(sig)) {
			KASSERT((p->p_flag & PK_SYSTEM) == 0);
			psignal(p, sig);
		}
	}

	/* Load average calculation. */
	if (__predict_false(lavg_count == 0)) {
		int i;
		CTASSERT(__arraycount(cexp) == __arraycount(avg->ldavg));
		for (i = 0; i < __arraycount(cexp); i++) {
			avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
			    nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
		}
	}

	/*
	 * Lightning bolt.
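	 *
	 * Waiters get a broadcast roughly once per second.  A hedged
	 * example (not from this file): a caller that wants a coarse
	 * once-a-second wakeup can do
	 *
	 *	mutex_enter(lock);
	 *	cv_wait(&lbolt, lock);
	 *	mutex_exit(lock);
	 *
	 * with any IPL_NONE kmutex of its own; kpause(9) is the more
	 * common modern choice.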
	 */
	cv_broadcast(&lbolt);

	mutex_exit(&proc_lock);
}