/*	$NetBSD: kern_heartbeat.c,v 1.3 2023/07/08 13:59:05 riastradh Exp $	*/
2
3 /*-
4 * Copyright (c) 2023 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 /*
30 * heartbeat(9) -- periodic checks to ensure CPUs are making progress
31 *
32 * Manual tests to run when changing this file. Magic numbers are for
33 * evbarm; adjust for other platforms. Tests involving cpuctl
34 * online/offline assume a 2-CPU system -- for full testing on a >2-CPU
35 * system, offline all but one CPU.
36 *
37 * 1. cpuctl offline 0
38 * sleep 20
39 * cpuctl online 0
40 *
41 * 2. cpuctl offline 1
42 * sleep 20
43 * cpuctl online 1
44 *
45 * 3. cpuctl offline 0
46 * sysctl -w kern.heartbeat.max_period=5
47 * sleep 10
48 * sysctl -w kern.heartbeat.max_period=0
49 * sleep 10
50 * sysctl -w kern.heartbeat.max_period=5
51 * sleep 10
52 * cpuctl online 0
53 *
54 * 4. sysctl -w debug.crashme_enable=1
55 * sysctl -w debug.crashme.spl_spinout=1 # IPL_SOFTCLOCK
56 * # verify system panics after 15sec
57 *
58 * 5. sysctl -w debug.crashme_enable=1
59 * sysctl -w debug.crashme.spl_spinout=6 # IPL_SCHED
60 * # verify system panics after 15sec
61 *
62 * 6. cpuctl offline 0
63 * sysctl -w debug.crashme_enable=1
64 * sysctl -w debug.crashme.spl_spinout=1 # IPL_SOFTCLOCK
65 * # verify system panics after 15sec
66 *
67 * 7. cpuctl offline 0
68 * sysctl -w debug.crashme_enable=1
69 * sysctl -w debug.crashme.spl_spinout=5 # IPL_VM
70 * # verify system panics after 15sec
71 *
72 * # Not this -- IPL_SCHED and IPL_HIGH spinout on a single CPU
73 * # require a hardware watchdog timer.
74 * #cpuctl offline 0
 *	#sysctl -w debug.crashme_enable=1
76 * #sysctl -w debug.crashme.spl_spinout=6 # IPL_SCHED
77 * # hope watchdog timer kicks in
78 */
79
80 #include <sys/cdefs.h>
81 __KERNEL_RCSID(0, "$NetBSD: kern_heartbeat.c,v 1.3 2023/07/08 13:59:05 riastradh Exp $");
82
83 #ifdef _KERNEL_OPT
84 #include "opt_ddb.h"
85 #include "opt_heartbeat.h"
86 #endif
87
88 #include "heartbeat.h"
89
90 #include <sys/param.h>
91 #include <sys/types.h>
92
93 #include <sys/atomic.h>
94 #include <sys/cpu.h>
95 #include <sys/errno.h>
96 #include <sys/heartbeat.h>
97 #include <sys/ipi.h>
98 #include <sys/mutex.h>
99 #include <sys/sysctl.h>
100 #include <sys/systm.h>
101 #include <sys/xcall.h>
102
103 #ifdef DDB
104 #include <ddb/ddb.h>
105 #endif
106
107 /*
108 * Global state.
109 *
110 * heartbeat_lock serializes access to heartbeat_max_period_secs
111 * and heartbeat_max_period_ticks. Two separate variables so we
112 * can avoid multiplication or division in the heartbeat routine.
113 *
114 * heartbeat_sih is stable after initialization in
115 * heartbeat_start.
116 */
117 kmutex_t heartbeat_lock __cacheline_aligned;
118 unsigned heartbeat_max_period_secs __read_mostly;
119 unsigned heartbeat_max_period_ticks __read_mostly;
120
121 void *heartbeat_sih __read_mostly;
122
/*
 * heartbeat_suspend()
 *
 *	Stop heartbeat monitoring of the CPU we are running on.
 *
 *	To be called once the current CPU is marked offline, while it
 *	is still running.  The caller must have preemption disabled.
 */
void
heartbeat_suspend(void)
{

	/*
	 * There is no per-CPU state to tear down: heartbeat() and
	 * select_patient() test SPCF_OFFLINE directly, so all we do
	 * is verify the caller's contract.
	 */
	KASSERT(curcpu_stable());
}
141
142 /*
143 * heartbeat_resume()
144 *
145 * Resume heartbeat monitoring of the current CPU.
146 *
147 * Called after the current CPU has started running but before it
148 * has been marked online. Also used internally when starting up
149 * heartbeat monitoring at boot or when the maximum period is set
150 * from zero to nonzero. Caller must have preemption disabled.
151 */
152 void
153 heartbeat_resume(void)
154 {
155 struct cpu_info *ci = curcpu();
156 int s;
157
158 KASSERT(curcpu_stable());
159
160 /*
161 * Block heartbeats while we reset the state so we don't
162 * spuriously think we had a heart attack in the middle of
163 * resetting the count and the uptime stamp.
164 */
165 s = splsched();
166 ci->ci_heartbeat_count = 0;
167 ci->ci_heartbeat_uptime_cache = atomic_load_relaxed(&time_uptime);
168 ci->ci_heartbeat_uptime_stamp = 0;
169 splx(s);
170 }
171
/*
 * heartbeat_reset_xc(a, b)
 *
 *	Cross-call handler run on each CPU by set_max_period just
 *	before heartbeat checks are enabled.  Both arguments are
 *	ignored; the handler only resets the local heartbeat state.
 */
static void
heartbeat_reset_xc(void *a, void *b)
{

	/* Same reset as when a CPU comes back online. */
	heartbeat_resume();
}
184
185 /*
186 * set_max_period(max_period)
187 *
188 * Set the maximum period, in seconds, for heartbeat checks.
189 *
190 * - If max_period is zero, disable them.
191 *
192 * - If the max period was zero and max_period is nonzero, ensure
193 * all CPUs' heartbeat uptime caches are up-to-date before
194 * re-enabling them.
195 *
196 * max_period must be below UINT_MAX/4/hz to avoid arithmetic
197 * overflow and give room for slop.
198 *
199 * Caller must hold heartbeat_lock.
200 */
201 static void
202 set_max_period(unsigned max_period)
203 {
204
205 KASSERTMSG(max_period <= UINT_MAX/4/hz,
206 "max_period=%u must not exceed UINT_MAX/4/hz=%u (hz=%u)",
207 max_period, UINT_MAX/4/hz, hz);
208 KASSERT(mutex_owned(&heartbeat_lock));
209
210 /*
211 * If we're enabling heartbeat checks, make sure we have a
212 * reasonably up-to-date time_uptime cache on all CPUs so we
213 * don't think we had an instant heart attack.
214 */
215 if (heartbeat_max_period_secs == 0 && max_period != 0)
216 xc_wait(xc_broadcast(0, &heartbeat_reset_xc, NULL, NULL));
217
218 /*
219 * Once the heartbeat state has been updated on all (online)
220 * CPUs, set the period. At this point, heartbeat checks can
221 * begin.
222 */
223 atomic_store_relaxed(&heartbeat_max_period_secs, max_period);
224 atomic_store_relaxed(&heartbeat_max_period_ticks, max_period*hz);
225 }
226
227 /*
228 * heartbeat_max_period_ticks(SYSCTLFN_ARGS)
229 *
230 * Sysctl handler for sysctl kern.heartbeat.max_period. Verifies
231 * it lies within a reasonable interval and sets it.
232 */
233 static int
234 heartbeat_max_period_sysctl(SYSCTLFN_ARGS)
235 {
236 struct sysctlnode node;
237 unsigned max_period;
238 int error;
239
240 mutex_enter(&heartbeat_lock);
241
242 max_period = heartbeat_max_period_secs;
243 node = *rnode;
244 node.sysctl_data = &max_period;
245 error = sysctl_lookup(SYSCTLFN_CALL(&node));
246 if (error || newp == NULL)
247 goto out;
248
249 /*
250 * Ensure there's plenty of slop between heartbeats.
251 */
252 if (max_period > UINT_MAX/4/hz) {
253 error = EOVERFLOW;
254 goto out;
255 }
256
257 /*
258 * Success! Set the period. This enables heartbeat checks if
259 * we went from zero period to nonzero period, or disables them
260 * if the other way around.
261 */
262 set_max_period(max_period);
263 error = 0;
264
265 out: mutex_exit(&heartbeat_lock);
266 return error;
267 }
268
269 /*
270 * sysctl_heartbeat_setup()
271 *
272 * Set up the kern.heartbeat.* sysctl subtree.
273 */
274 SYSCTL_SETUP(sysctl_heartbeat_setup, "sysctl kern.heartbeat setup")
275 {
276 const struct sysctlnode *rnode;
277 int error;
278
279 mutex_init(&heartbeat_lock, MUTEX_DEFAULT, IPL_NONE);
280
281 /* kern.heartbeat */
282 error = sysctl_createv(NULL, 0, NULL, &rnode,
283 CTLFLAG_PERMANENT,
284 CTLTYPE_NODE, "heartbeat",
285 SYSCTL_DESCR("Kernel heartbeat parameters"),
286 NULL, 0, NULL, 0,
287 CTL_KERN, CTL_CREATE, CTL_EOL);
288 if (error) {
289 printf("%s: failed to create kern.heartbeat: %d\n",
290 __func__, error);
291 return;
292 }
293
294 /* kern.heartbeat.max_period */
295 error = sysctl_createv(NULL, 0, &rnode, NULL,
296 CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
297 CTLTYPE_INT, "max_period",
298 SYSCTL_DESCR("Max seconds between heartbeats before panic"),
299 &heartbeat_max_period_sysctl, 0, NULL, 0,
300 CTL_CREATE, CTL_EOL);
301 if (error) {
302 printf("%s: failed to create kern.heartbeat.max_period: %d\n",
303 __func__, error);
304 return;
305 }
306 }
307
308 /*
309 * heartbeat_intr(cookie)
310 *
311 * Soft interrupt handler to update the local CPU's view of the
312 * system uptime. This runs at the same priority level as
313 * callouts, so if callouts are stuck on this CPU, it won't run,
314 * and eventually another CPU will notice that this one is stuck.
315 *
316 * Don't do spl* here -- keep it to a minimum so if anything goes
317 * wrong we don't end up with hard interrupts blocked and unable
318 * to detect a missed heartbeat.
319 */
320 static void
321 heartbeat_intr(void *cookie)
322 {
323 unsigned count = atomic_load_relaxed(&curcpu()->ci_heartbeat_count);
324 unsigned uptime = atomic_load_relaxed(&time_uptime);
325
326 atomic_store_relaxed(&curcpu()->ci_heartbeat_uptime_stamp, count);
327 atomic_store_relaxed(&curcpu()->ci_heartbeat_uptime_cache, uptime);
328 }
329
330 /*
331 * heartbeat_start()
332 *
333 * Start system heartbeat monitoring.
334 */
335 void
336 heartbeat_start(void)
337 {
338 const unsigned max_period = HEARTBEAT_MAX_PERIOD_DEFAULT;
339
340 /*
341 * Establish a softint so we can schedule it once ready. This
342 * should be at the lowest softint priority level so that we
343 * ensure all softint priorities are making progress.
344 */
345 heartbeat_sih = softint_establish(SOFTINT_CLOCK|SOFTINT_MPSAFE,
346 &heartbeat_intr, NULL);
347
348 /*
349 * Now that the softint is established, kick off heartbeat
350 * monitoring with the default period. This will initialize
351 * the per-CPU state to an up-to-date cache of time_uptime.
352 */
353 mutex_enter(&heartbeat_lock);
354 set_max_period(max_period);
355 mutex_exit(&heartbeat_lock);
356 }
357
358 /*
359 * defibrillator(cookie)
360 *
361 * IPI handler for defibrillation. If the CPU's heart has stopped
362 * beating normally, but the CPU can still execute things,
363 * acknowledge the IPI to the doctor and then panic so we at least
364 * get a stack trace from whatever the current CPU is stuck doing,
365 * if not a core dump.
366 *
367 * (This metaphor is a little stretched, since defibrillation is
368 * usually administered when the heart is beating errattically but
369 * hasn't stopped, and causes the heart to stop temporarily, and
370 * one hopes it is not fatal. But we're (software) engineers, so
371 * we can stretch metaphors like silly putty in a blender.)
372 */
373 static void
374 defibrillator(void *cookie)
375 {
376 bool *ack = cookie;
377
378 atomic_store_relaxed(ack, true);
379 panic("%s[%d %s]: heart stopped beating", cpu_name(curcpu()),
380 curlwp->l_lid,
381 curlwp->l_name ? curlwp->l_name : curproc->p_comm);
382 }
383
384 /*
385 * defibrillate(ci, unsigned d)
386 *
387 * The patient CPU ci's heart has stopped beating after d seconds.
388 * Force the patient CPU ci to panic, or panic on this CPU if the
389 * patient CPU doesn't respond within 1sec.
390 */
391 static void __noinline
392 defibrillate(struct cpu_info *ci, unsigned d)
393 {
394 bool ack = false;
395 ipi_msg_t msg = {
396 .func = &defibrillator,
397 .arg = &ack,
398 };
399 unsigned countdown = 1000; /* 1sec */
400
401 KASSERT(curcpu_stable());
402
403 /*
404 * First notify the console that the patient CPU's heart seems
405 * to have stopped beating.
406 */
407 printf("%s: found %s heart stopped beating after %u seconds\n",
408 cpu_name(curcpu()), cpu_name(ci), d);
409
410 /*
411 * Next, give the patient CPU a chance to panic, so we get a
412 * stack trace on that CPU even if we don't get a crash dump.
413 */
414 ipi_unicast(&msg, ci);
415
416 /*
417 * Busy-wait up to 1sec for the patient CPU to print a stack
418 * trace and panic. If the patient CPU acknowledges the IPI,
419 * or if we're panicking anyway, just give up and stop here --
420 * the system is coming down soon and we should avoid getting
421 * in the way.
422 */
423 while (countdown --> 0) {
424 if (atomic_load_relaxed(&ack) ||
425 atomic_load_relaxed(&panicstr) != NULL)
426 return;
427 DELAY(1000); /* 1ms */
428 }
429
430 /*
431 * The patient CPU failed to acknowledge the panic request.
432 * Panic now; with any luck, we'll get a crash dump.
433 */
434 panic("%s: found %s heart stopped beating and unresponsive",
435 cpu_name(curcpu()), cpu_name(ci));
436 }
437
438 /*
439 * select_patient()
440 *
441 * Select another CPU to check the heartbeat of. Returns NULL if
442 * there are no other online CPUs. Never returns curcpu().
443 * Caller must have kpreemption disabled.
444 */
445 static struct cpu_info *
446 select_patient(void)
447 {
448 CPU_INFO_ITERATOR cii;
449 struct cpu_info *first = NULL, *patient = NULL, *ci;
450 bool passedcur = false;
451
452 KASSERT(curcpu_stable());
453
454 /*
455 * In the iteration order of all CPUs, find the next online CPU
456 * after curcpu(), or the first online one if curcpu() is last
457 * in the iteration order.
458 */
459 for (CPU_INFO_FOREACH(cii, ci)) {
460 if (ci->ci_schedstate.spc_flags & SPCF_OFFLINE)
461 continue;
462 if (passedcur) {
463 /*
464 * (...|curcpu()|ci|...)
465 *
466 * Found the patient right after curcpu().
467 */
468 KASSERT(patient != ci);
469 patient = ci;
470 break;
471 }
472 if (ci == curcpu()) {
473 /*
474 * (...|prev|ci=curcpu()|next|...)
475 *
476 * Note that we want next (or first, if there's
477 * nothing after curcpu()).
478 */
479 passedcur = true;
480 continue;
481 }
482 if (first == NULL) {
483 /*
484 * (ci|...|curcpu()|...)
485 *
486 * Record ci as first in case there's nothing
487 * after curcpu().
488 */
489 first = ci;
490 continue;
491 }
492 }
493
494 /*
495 * If we hit the end, wrap around to the beginning.
496 */
497 if (patient == NULL) {
498 KASSERT(passedcur);
499 patient = first;
500 }
501
502 return patient;
503 }
504
505 /*
506 * heartbeat()
507 *
508 * 1. Count a heartbeat on the local CPU.
509 *
510 * 2. Panic if the system uptime doesn't seem to have advanced in
511 * a while.
512 *
513 * 3. Panic if the soft interrupt on this CPU hasn't advanced the
514 * local view of the system uptime.
515 *
516 * 4. Schedule the soft interrupt to advance the local view of the
517 * system uptime.
518 *
519 * 5. Select another CPU to check the heartbeat of.
520 *
521 * 6. Panic if the other CPU hasn't advanced its view of the
522 * system uptime in a while.
523 */
524 void
525 heartbeat(void)
526 {
527 unsigned period_ticks, period_secs;
528 unsigned count, uptime, cache, stamp, d;
529 struct cpu_info *patient;
530
531 KASSERT(curcpu_stable());
532
533 period_ticks = atomic_load_relaxed(&heartbeat_max_period_ticks);
534 period_secs = atomic_load_relaxed(&heartbeat_max_period_secs);
535 if (__predict_false(period_ticks == 0) ||
536 __predict_false(period_secs == 0) ||
537 __predict_false(curcpu()->ci_schedstate.spc_flags & SPCF_OFFLINE))
538 return;
539
540 /*
541 * Count a heartbeat on this CPU.
542 */
543 count = curcpu()->ci_heartbeat_count++;
544
545 /*
546 * If the uptime hasn't changed, make sure that we haven't
547 * counted too many of our own heartbeats since the uptime last
548 * changed, and stop here -- we only do the cross-CPU work once
549 * per second.
550 */
551 uptime = atomic_load_relaxed(&time_uptime);
552 cache = atomic_load_relaxed(&curcpu()->ci_heartbeat_uptime_cache);
553 if (__predict_true(cache == uptime)) {
554 /*
555 * Timecounter hasn't advanced by more than a second.
556 * Make sure the timecounter isn't stuck according to
557 * our heartbeats.
558 *
559 * Our own heartbeat count can't roll back, and
560 * time_uptime should be updated before it wraps
561 * around, so d should never go negative; hence no
562 * check for d < UINT_MAX/2.
563 */
564 stamp =
565 atomic_load_relaxed(&curcpu()->ci_heartbeat_uptime_stamp);
566 d = count - stamp;
567 if (__predict_false(d > period_ticks)) {
568 panic("%s: time has not advanced in %u heartbeats",
569 cpu_name(curcpu()), d);
570 }
571 return;
572 }
573
574 /*
575 * If the uptime has changed, make sure that it hasn't changed
576 * so much that softints must be stuck on this CPU. Since
577 * time_uptime is monotonic, this can't go negative, hence no
578 * check for d < UINT_MAX/2.
579 *
580 * This uses the hard timer interrupt handler on the current
581 * CPU to ensure soft interrupts at all priority levels have
582 * made progress.
583 */
584 d = uptime - cache;
585 if (__predict_false(d > period_secs)) {
586 panic("%s: softints stuck for %u seconds",
587 cpu_name(curcpu()), d);
588 }
589
590 /*
591 * Schedule a softint to update our cache of the system uptime
592 * so the next call to heartbeat, on this or another CPU, can
593 * detect progress on this one.
594 */
595 softint_schedule(heartbeat_sih);
596
597 /*
598 * Select a patient to check the heartbeat of. If there's no
599 * other online CPU, nothing to do.
600 */
601 patient = select_patient();
602 if (patient == NULL)
603 return;
604
605 /*
606 * Verify that time is advancing on the patient CPU. If the
607 * delta exceeds UINT_MAX/2, that means it is already ahead by
608 * a little on the other CPU, and the subtraction went
609 * negative, which is OK. If the CPU has been
610 * offlined since we selected it, no worries.
611 *
612 * This uses the current CPU to ensure the other CPU has made
613 * progress, even if the other CPU's hard timer interrupt
614 * handler is stuck for some reason.
615 *
616 * XXX Maybe confirm it hasn't gone negative by more than
617 * max_period?
618 */
619 d = uptime - atomic_load_relaxed(&patient->ci_heartbeat_uptime_cache);
620 if (__predict_false(d > period_secs) &&
621 __predict_false(d < UINT_MAX/2) &&
622 ((patient->ci_schedstate.spc_flags & SPCF_OFFLINE) == 0))
623 defibrillate(patient, d);
624 }
625
/*
 * heartbeat_dump()
 *
 *	Print the heartbeat count, uptime cache and uptime stamp of
 *	every CPU.  Can be called from ddb; only built with DDB.
 */
#ifdef DDB
/*
 * db_read_unsigned(p)
 *
 *	Fetch the unsigned integer at p through db_read_bytes rather
 *	than by dereferencing p directly.
 */
static unsigned
db_read_unsigned(const unsigned *p)
{
	unsigned val;

	db_read_bytes((db_addr_t)p, sizeof(val), (char *)&val);

	return val;
}

void
heartbeat_dump(void)
{
	struct cpu_info *ci;
	unsigned idx, count, cache, stamp;

	db_printf("Heartbeats:\n");

	ci = db_cpu_first();
	while (ci != NULL) {
		idx = db_read_unsigned(&ci->ci_index);
		count = db_read_unsigned(&ci->ci_heartbeat_count);
		cache = db_read_unsigned(&ci->ci_heartbeat_uptime_cache);
		stamp = db_read_unsigned(&ci->ci_heartbeat_uptime_stamp);

		db_printf("cpu%u: count %u uptime %u stamp %u\n",
		    idx, count, cache, stamp);

		ci = db_cpu_next(ci);
	}
}
#endif
657