kern_heartbeat.c revision 1.2 1 /* $NetBSD: kern_heartbeat.c,v 1.2 2023/07/07 17:05:13 riastradh Exp $ */
2
3 /*-
4 * Copyright (c) 2023 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 /*
30 * heartbeat(9) -- periodic checks to ensure CPUs are making progress
31 *
32 * Manual tests to run when changing this file. Magic numbers are for
33 * evbarm; adjust for other platforms. Tests involving cpuctl
34 * online/offline assume a 2-CPU system -- for full testing on a >2-CPU
35 * system, offline all but one CPU.
36 *
37 * 1. cpuctl offline 0
38 * sleep 20
39 * cpuctl online 0
40 *
41 * 2. cpuctl offline 1
42 * sleep 20
43 * cpuctl online 1
44 *
45 * 3. cpuctl offline 0
46 * sysctl -w kern.heartbeat.max_period=5
47 * sleep 10
48 * sysctl -w kern.heartbeat.max_period=0
49 * sleep 10
50 * sysctl -w kern.heartbeat.max_period=5
51 * sleep 10
52 * cpuctl online 0
53 *
54 * 4. sysctl -w debug.crashme_enable=1
55 * sysctl -w debug.crashme.spl_spinout=1 # IPL_SOFTCLOCK
56 * # verify system panics after 15sec
57 *
58 * 5. sysctl -w debug.crashme_enable=1
59 * sysctl -w debug.crashme.spl_spinout=6 # IPL_SCHED
60 * # verify system panics after 15sec
61 *
62 * 6. cpuctl offline 0
63 * sysctl -w debug.crashme_enable=1
64 * sysctl -w debug.crashme.spl_spinout=1 # IPL_SOFTCLOCK
65 * # verify system panics after 15sec
66 *
67 * 7. cpuctl offline 0
68 * sysctl -w debug.crashme_enable=1
69 * sysctl -w debug.crashme.spl_spinout=5 # IPL_VM
70 * # verify system panics after 15sec
71 *
72 * # Not this -- IPL_SCHED and IPL_HIGH spinout on a single CPU
73 * # require a hardware watchdog timer.
74 * #cpuctl offline 0
75 * #sysctl -w debug.crashme_enable=1
76 * #sysctl -w debug.crashme.spl_spinout=6 # IPL_SCHED
77 * # hope watchdog timer kicks in
78 */
79
80 #include <sys/cdefs.h>
81 __KERNEL_RCSID(0, "$NetBSD: kern_heartbeat.c,v 1.2 2023/07/07 17:05:13 riastradh Exp $");
82
83 #ifdef _KERNEL_OPT
84 #include "opt_ddb.h"
85 #include "opt_heartbeat.h"
86 #endif
87
88 #include "heartbeat.h"
89
90 #include <sys/param.h>
91 #include <sys/types.h>
92
93 #include <sys/atomic.h>
94 #include <sys/cpu.h>
95 #include <sys/errno.h>
96 #include <sys/heartbeat.h>
97 #include <sys/ipi.h>
98 #include <sys/mutex.h>
99 #include <sys/sysctl.h>
100 #include <sys/systm.h>
101 #include <sys/xcall.h>
102
103 #ifdef DDB
104 #include <ddb/ddb.h>
105 #endif
106
107 static inline bool
108 curcpu_stable(void)
109 {
110
111 return kpreempt_disabled() ||
112 (curlwp->l_pflag & LP_BOUND) ||
113 cpu_intr_p() ||
114 cpu_softintr_p();
115 }
116
117 /*
118 * Global state.
119 *
120 * heartbeat_lock serializes access to heartbeat_max_period_secs
121 * and heartbeat_max_period_ticks. Two separate variables so we
122 * can avoid multiplication or division in the heartbeat routine.
123 *
124 * heartbeat_sih is stable after initialization in
125 * heartbeat_start.
126 */
127 kmutex_t heartbeat_lock __cacheline_aligned;
128 unsigned heartbeat_max_period_secs __read_mostly;
129 unsigned heartbeat_max_period_ticks __read_mostly;
130
131 void *heartbeat_sih __read_mostly;
132
/*
 * heartbeat_suspend()
 *
 *	Stop heartbeat monitoring of the CPU we are running on.
 *
 *	To be called once the current CPU has been marked offline,
 *	while it is still running.  Preemption must be disabled by
 *	the caller.
 */
void
heartbeat_suspend(void)
{

	/*
	 * There is no per-CPU state to update here: the heartbeat
	 * checks look at SPCF_OFFLINE directly, so marking the CPU
	 * offline is all that is needed.
	 */
	KASSERT(curcpu_stable());
}
151
152 /*
153 * heartbeat_resume()
154 *
155 * Resume heartbeat monitoring of the current CPU.
156 *
157 * Called after the current CPU has started running but before it
158 * has been marked online. Also used internally when starting up
159 * heartbeat monitoring at boot or when the maximum period is set
160 * from zero to nonzero. Caller must have preemption disabled.
161 */
162 void
163 heartbeat_resume(void)
164 {
165 struct cpu_info *ci = curcpu();
166 int s;
167
168 KASSERT(curcpu_stable());
169
170 /*
171 * Block heartbeats while we reset the state so we don't
172 * spuriously think we had a heart attack in the middle of
173 * resetting the count and the uptime stamp.
174 */
175 s = splsched();
176 ci->ci_heartbeat_count = 0;
177 ci->ci_heartbeat_uptime_cache = atomic_load_relaxed(&time_uptime);
178 ci->ci_heartbeat_uptime_stamp = 0;
179 splx(s);
180 }
181
/*
 * heartbeat_reset_xc(a, b)
 *
 *	Cross-call handler: reset the heartbeat state of the CPU it
 *	runs on, immediately before heartbeat checks are enabled.
 *	Both arguments are ignored.
 */
static void
heartbeat_reset_xc(void *a, void *b)
{

	/* The cross-call carries no arguments for us. */
	(void)a;
	(void)b;

	heartbeat_resume();
}
194
195 /*
196 * set_max_period(max_period)
197 *
198 * Set the maximum period, in seconds, for heartbeat checks.
199 *
200 * - If max_period is zero, disable them.
201 *
202 * - If the max period was zero and max_period is nonzero, ensure
203 * all CPUs' heartbeat uptime caches are up-to-date before
204 * re-enabling them.
205 *
206 * max_period must be below UINT_MAX/4/hz to avoid arithmetic
207 * overflow and give room for slop.
208 *
209 * Caller must hold heartbeat_lock.
210 */
211 static void
212 set_max_period(unsigned max_period)
213 {
214
215 KASSERTMSG(max_period <= UINT_MAX/4/hz,
216 "max_period=%u must not exceed UINT_MAX/4/hz=%u (hz=%u)",
217 max_period, UINT_MAX/4/hz, hz);
218 KASSERT(mutex_owned(&heartbeat_lock));
219
220 /*
221 * If we're enabling heartbeat checks, make sure we have a
222 * reasonably up-to-date time_uptime cache on all CPUs so we
223 * don't think we had an instant heart attack.
224 */
225 if (heartbeat_max_period_secs == 0 && max_period != 0)
226 xc_wait(xc_broadcast(0, &heartbeat_reset_xc, NULL, NULL));
227
228 /*
229 * Once the heartbeat state has been updated on all (online)
230 * CPUs, set the period. At this point, heartbeat checks can
231 * begin.
232 */
233 atomic_store_relaxed(&heartbeat_max_period_secs, max_period);
234 atomic_store_relaxed(&heartbeat_max_period_ticks, max_period*hz);
235 }
236
237 /*
238 * heartbeat_max_period_ticks(SYSCTLFN_ARGS)
239 *
240 * Sysctl handler for sysctl kern.heartbeat.max_period. Verifies
241 * it lies within a reasonable interval and sets it.
242 */
243 static int
244 heartbeat_max_period_sysctl(SYSCTLFN_ARGS)
245 {
246 struct sysctlnode node;
247 unsigned max_period;
248 int error;
249
250 mutex_enter(&heartbeat_lock);
251
252 max_period = heartbeat_max_period_secs;
253 node = *rnode;
254 node.sysctl_data = &max_period;
255 error = sysctl_lookup(SYSCTLFN_CALL(&node));
256 if (error || newp == NULL)
257 goto out;
258
259 /*
260 * Ensure there's plenty of slop between heartbeats.
261 */
262 if (max_period > UINT_MAX/4/hz) {
263 error = EOVERFLOW;
264 goto out;
265 }
266
267 /*
268 * Success! Set the period. This enables heartbeat checks if
269 * we went from zero period to nonzero period, or disables them
270 * if the other way around.
271 */
272 set_max_period(max_period);
273 error = 0;
274
275 out: mutex_exit(&heartbeat_lock);
276 return error;
277 }
278
279 /*
280 * sysctl_heartbeat_setup()
281 *
282 * Set up the kern.heartbeat.* sysctl subtree.
283 */
284 SYSCTL_SETUP(sysctl_heartbeat_setup, "sysctl kern.heartbeat setup")
285 {
286 const struct sysctlnode *rnode;
287 int error;
288
289 mutex_init(&heartbeat_lock, MUTEX_DEFAULT, IPL_NONE);
290
291 /* kern.heartbeat */
292 error = sysctl_createv(NULL, 0, NULL, &rnode,
293 CTLFLAG_PERMANENT,
294 CTLTYPE_NODE, "heartbeat",
295 SYSCTL_DESCR("Kernel heartbeat parameters"),
296 NULL, 0, NULL, 0,
297 CTL_KERN, CTL_CREATE, CTL_EOL);
298 if (error) {
299 printf("%s: failed to create kern.heartbeat: %d\n",
300 __func__, error);
301 return;
302 }
303
304 /* kern.heartbeat.max_period */
305 error = sysctl_createv(NULL, 0, &rnode, NULL,
306 CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
307 CTLTYPE_INT, "max_period",
308 SYSCTL_DESCR("Max seconds between heartbeats before panic"),
309 &heartbeat_max_period_sysctl, 0, NULL, 0,
310 CTL_CREATE, CTL_EOL);
311 if (error) {
312 printf("%s: failed to create kern.heartbeat.max_period: %d\n",
313 __func__, error);
314 return;
315 }
316 }
317
318 /*
319 * heartbeat_intr(cookie)
320 *
321 * Soft interrupt handler to update the local CPU's view of the
322 * system uptime. This runs at the same priority level as
323 * callouts, so if callouts are stuck on this CPU, it won't run,
324 * and eventually another CPU will notice that this one is stuck.
325 *
326 * Don't do spl* here -- keep it to a minimum so if anything goes
327 * wrong we don't end up with hard interrupts blocked and unable
328 * to detect a missed heartbeat.
329 */
330 static void
331 heartbeat_intr(void *cookie)
332 {
333 unsigned count = atomic_load_relaxed(&curcpu()->ci_heartbeat_count);
334 unsigned uptime = atomic_load_relaxed(&time_uptime);
335
336 atomic_store_relaxed(&curcpu()->ci_heartbeat_uptime_stamp, count);
337 atomic_store_relaxed(&curcpu()->ci_heartbeat_uptime_cache, uptime);
338 }
339
340 /*
341 * heartbeat_start()
342 *
343 * Start system heartbeat monitoring.
344 */
345 void
346 heartbeat_start(void)
347 {
348 const unsigned max_period = HEARTBEAT_MAX_PERIOD_DEFAULT;
349
350 /*
351 * Establish a softint so we can schedule it once ready. This
352 * should be at the lowest softint priority level so that we
353 * ensure all softint priorities are making progress.
354 */
355 heartbeat_sih = softint_establish(SOFTINT_CLOCK|SOFTINT_MPSAFE,
356 &heartbeat_intr, NULL);
357
358 /*
359 * Now that the softint is established, kick off heartbeat
360 * monitoring with the default period. This will initialize
361 * the per-CPU state to an up-to-date cache of time_uptime.
362 */
363 mutex_enter(&heartbeat_lock);
364 set_max_period(max_period);
365 mutex_exit(&heartbeat_lock);
366 }
367
368 /*
369 * defibrillator(cookie)
370 *
371 * IPI handler for defibrillation. If the CPU's heart has stopped
372 * beating normally, but the CPU can still execute things,
373 * acknowledge the IPI to the doctor and then panic so we at least
374 * get a stack trace from whatever the current CPU is stuck doing,
375 * if not a core dump.
376 *
377 * (This metaphor is a little stretched, since defibrillation is
378 * usually administered when the heart is beating errattically but
379 * hasn't stopped, and causes the heart to stop temporarily, and
380 * one hopes it is not fatal. But we're (software) engineers, so
381 * we can stretch metaphors like silly putty in a blender.)
382 */
383 static void
384 defibrillator(void *cookie)
385 {
386 bool *ack = cookie;
387
388 atomic_store_relaxed(ack, true);
389 panic("%s[%d %s]: heart stopped beating", cpu_name(curcpu()),
390 curlwp->l_lid,
391 curlwp->l_name ? curlwp->l_name : curproc->p_comm);
392 }
393
394 /*
395 * defibrillate(ci, unsigned d)
396 *
397 * The patient CPU ci's heart has stopped beating after d seconds.
398 * Force the patient CPU ci to panic, or panic on this CPU if the
399 * patient CPU doesn't respond within 1sec.
400 */
401 static void __noinline
402 defibrillate(struct cpu_info *ci, unsigned d)
403 {
404 bool ack = false;
405 ipi_msg_t msg = {
406 .func = &defibrillator,
407 .arg = &ack,
408 };
409 unsigned countdown = 1000; /* 1sec */
410
411 KASSERT(curcpu_stable());
412
413 /*
414 * First notify the console that the patient CPU's heart seems
415 * to have stopped beating.
416 */
417 printf("%s: found %s heart stopped beating after %u seconds\n",
418 cpu_name(curcpu()), cpu_name(ci), d);
419
420 /*
421 * Next, give the patient CPU a chance to panic, so we get a
422 * stack trace on that CPU even if we don't get a crash dump.
423 */
424 ipi_unicast(&msg, ci);
425
426 /*
427 * Busy-wait up to 1sec for the patient CPU to print a stack
428 * trace and panic. If the patient CPU acknowledges the IPI,
429 * or if we're panicking anyway, just give up and stop here --
430 * the system is coming down soon and we should avoid getting
431 * in the way.
432 */
433 while (countdown --> 0) {
434 if (atomic_load_relaxed(&ack) ||
435 atomic_load_relaxed(&panicstr) != NULL)
436 return;
437 DELAY(1000); /* 1ms */
438 }
439
440 /*
441 * The patient CPU failed to acknowledge the panic request.
442 * Panic now; with any luck, we'll get a crash dump.
443 */
444 panic("%s: found %s heart stopped beating and unresponsive",
445 cpu_name(curcpu()), cpu_name(ci));
446 }
447
448 /*
449 * select_patient()
450 *
451 * Select another CPU to check the heartbeat of. Returns NULL if
452 * there are no other online CPUs. Never returns curcpu().
453 * Caller must have kpreemption disabled.
454 */
455 static struct cpu_info *
456 select_patient(void)
457 {
458 CPU_INFO_ITERATOR cii;
459 struct cpu_info *first = NULL, *patient = NULL, *ci;
460 bool passedcur = false;
461
462 KASSERT(curcpu_stable());
463
464 /*
465 * In the iteration order of all CPUs, find the next online CPU
466 * after curcpu(), or the first online one if curcpu() is last
467 * in the iteration order.
468 */
469 for (CPU_INFO_FOREACH(cii, ci)) {
470 if (ci->ci_schedstate.spc_flags & SPCF_OFFLINE)
471 continue;
472 if (passedcur) {
473 /*
474 * (...|curcpu()|ci|...)
475 *
476 * Found the patient right after curcpu().
477 */
478 KASSERT(patient != ci);
479 patient = ci;
480 break;
481 }
482 if (ci == curcpu()) {
483 /*
484 * (...|prev|ci=curcpu()|next|...)
485 *
486 * Note that we want next (or first, if there's
487 * nothing after curcpu()).
488 */
489 passedcur = true;
490 continue;
491 }
492 if (first == NULL) {
493 /*
494 * (ci|...|curcpu()|...)
495 *
496 * Record ci as first in case there's nothing
497 * after curcpu().
498 */
499 first = ci;
500 continue;
501 }
502 }
503
504 /*
505 * If we hit the end, wrap around to the beginning.
506 */
507 if (patient == NULL) {
508 KASSERT(passedcur);
509 patient = first;
510 }
511
512 return patient;
513 }
514
515 /*
516 * heartbeat()
517 *
518 * 1. Count a heartbeat on the local CPU.
519 *
520 * 2. Panic if the system uptime doesn't seem to have advanced in
521 * a while.
522 *
523 * 3. Panic if the soft interrupt on this CPU hasn't advanced the
524 * local view of the system uptime.
525 *
526 * 4. Schedule the soft interrupt to advance the local view of the
527 * system uptime.
528 *
529 * 5. Select another CPU to check the heartbeat of.
530 *
531 * 6. Panic if the other CPU hasn't advanced its view of the
532 * system uptime in a while.
533 */
534 void
535 heartbeat(void)
536 {
537 unsigned period_ticks, period_secs;
538 unsigned count, uptime, cache, stamp, d;
539 struct cpu_info *patient;
540
541 KASSERT(curcpu_stable());
542
543 period_ticks = atomic_load_relaxed(&heartbeat_max_period_ticks);
544 period_secs = atomic_load_relaxed(&heartbeat_max_period_secs);
545 if (__predict_false(period_ticks == 0) ||
546 __predict_false(period_secs == 0) ||
547 __predict_false(curcpu()->ci_schedstate.spc_flags & SPCF_OFFLINE))
548 return;
549
550 /*
551 * Count a heartbeat on this CPU.
552 */
553 count = curcpu()->ci_heartbeat_count++;
554
555 /*
556 * If the uptime hasn't changed, make sure that we haven't
557 * counted too many of our own heartbeats since the uptime last
558 * changed, and stop here -- we only do the cross-CPU work once
559 * per second.
560 */
561 uptime = atomic_load_relaxed(&time_uptime);
562 cache = atomic_load_relaxed(&curcpu()->ci_heartbeat_uptime_cache);
563 if (__predict_true(cache == uptime)) {
564 /*
565 * Timecounter hasn't advanced by more than a second.
566 * Make sure the timecounter isn't stuck according to
567 * our heartbeats.
568 *
569 * Our own heartbeat count can't roll back, and
570 * time_uptime should be updated before it wraps
571 * around, so d should never go negative; hence no
572 * check for d < UINT_MAX/2.
573 */
574 stamp =
575 atomic_load_relaxed(&curcpu()->ci_heartbeat_uptime_stamp);
576 d = count - stamp;
577 if (__predict_false(d > period_ticks)) {
578 panic("%s: time has not advanced in %u heartbeats",
579 cpu_name(curcpu()), d);
580 }
581 return;
582 }
583
584 /*
585 * If the uptime has changed, make sure that it hasn't changed
586 * so much that softints must be stuck on this CPU. Since
587 * time_uptime is monotonic, this can't go negative, hence no
588 * check for d < UINT_MAX/2.
589 *
590 * This uses the hard timer interrupt handler on the current
591 * CPU to ensure soft interrupts at all priority levels have
592 * made progress.
593 */
594 d = uptime - cache;
595 if (__predict_false(d > period_secs)) {
596 panic("%s: softints stuck for %u seconds",
597 cpu_name(curcpu()), d);
598 }
599
600 /*
601 * Schedule a softint to update our cache of the system uptime
602 * so the next call to heartbeat, on this or another CPU, can
603 * detect progress on this one.
604 */
605 softint_schedule(heartbeat_sih);
606
607 /*
608 * Select a patient to check the heartbeat of. If there's no
609 * other online CPU, nothing to do.
610 */
611 patient = select_patient();
612 if (patient == NULL)
613 return;
614
615 /*
616 * Verify that time is advancing on the patient CPU. If the
617 * delta exceeds UINT_MAX/2, that means it is already ahead by
618 * a little on the other CPU, and the subtraction went
619 * negative, which is OK. If the CPU has been
620 * offlined since we selected it, no worries.
621 *
622 * This uses the current CPU to ensure the other CPU has made
623 * progress, even if the other CPU's hard timer interrupt
624 * handler is stuck for some reason.
625 *
626 * XXX Maybe confirm it hasn't gone negative by more than
627 * max_period?
628 */
629 d = uptime - atomic_load_relaxed(&patient->ci_heartbeat_uptime_cache);
630 if (__predict_false(d > period_secs) &&
631 __predict_false(d < UINT_MAX/2) &&
632 ((patient->ci_schedstate.spc_flags & SPCF_OFFLINE) == 0))
633 defibrillate(patient, d);
634 }
635
/*
 * heartbeat_dump()
 *
 *	Print, for every CPU known to ddb, its index, heartbeat count,
 *	cached uptime, and uptime stamp.  Can be called from ddb; all
 *	memory is read through db_read_bytes.
 */
#ifdef DDB
/*
 * db_read_unsigned(p)
 *
 *	Fetch the unsigned integer at p using db_read_bytes.
 */
static unsigned
db_read_unsigned(const unsigned *p)
{
	unsigned val;

	db_read_bytes((db_addr_t)p, sizeof(val), (char *)&val);
	return val;
}

void
heartbeat_dump(void)
{
	struct cpu_info *ci;

	db_printf("Heartbeats:\n");

	ci = db_cpu_first();
	while (ci != NULL) {
		const unsigned idx = db_read_unsigned(&ci->ci_index);
		const unsigned beats =
		    db_read_unsigned(&ci->ci_heartbeat_count);
		const unsigned cache =
		    db_read_unsigned(&ci->ci_heartbeat_uptime_cache);
		const unsigned stamp =
		    db_read_unsigned(&ci->ci_heartbeat_uptime_stamp);

		db_printf("cpu%u: count %u uptime %u stamp %u\n",
		    idx, beats, cache, stamp);

		ci = db_cpu_next(ci);
	}
}
#endif
667