kern_heartbeat.c revision 1.14 1 /* $NetBSD: kern_heartbeat.c,v 1.14 2024/08/25 01:14:01 riastradh Exp $ */
2
3 /*-
4 * Copyright (c) 2023 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 /*
30 * heartbeat(9) -- periodic checks to ensure CPUs are making progress
31 *
32 * Manual tests to run when changing this file. Magic numbers are for
33 * evbarm; adjust for other platforms. Tests involving cpuctl
34 * online/offline assume a 2-CPU system -- for full testing on a >2-CPU
35 * system, offline all but one CPU.
36 *
37 * 1. cpuctl offline 0
38 * sleep 20
39 * cpuctl online 0
40 *
41 * 2. cpuctl offline 1
42 * sleep 20
43 * cpuctl online 1
44 *
45 * 3. cpuctl offline 0
46 * sysctl -w kern.heartbeat.max_period=5
47 * sleep 10
48 * sysctl -w kern.heartbeat.max_period=0
49 * sleep 10
50 * sysctl -w kern.heartbeat.max_period=5
51 * sleep 10
52 * cpuctl online 0
53 *
54 * 4. sysctl -w debug.crashme_enable=1
55 * sysctl -w debug.crashme.spl_spinout=1 # IPL_SOFTCLOCK
56 * # verify system panics after 15sec, with a stack trace through
57 * # crashme_spl_spinout
58 *
59 * 5. sysctl -w debug.crashme_enable=1
60 * sysctl -w debug.crashme.spl_spinout=6 # IPL_SCHED
61 * # verify system panics after 15sec, with a stack trace through
62 * # crashme_spl_spinout
63 *
64 * 6. cpuctl offline 0
65 * sysctl -w debug.crashme_enable=1
66 * sysctl -w debug.crashme.spl_spinout=1 # IPL_SOFTCLOCK
67 * # verify system panics after 15sec, with a stack trace through
68 * # crashme_spl_spinout
69 *
70 * 7. cpuctl offline 0
71 * sysctl -w debug.crashme_enable=1
72 * sysctl -w debug.crashme.spl_spinout=5 # IPL_VM
73 * # verify system panics after 15sec, with a stack trace through
74 * # crashme_spl_spinout
75 *
76 * # Not this -- IPL_SCHED and IPL_HIGH spinout on a single CPU
77 * # require a hardware watchdog timer.
78 * #cpuctl offline 0
79 * #sysctl -w debug.crashme_enable=1
80 * #sysctl -w debug.crashme.spl_spinout=6 # IPL_SCHED
81 * # hope watchdog timer kicks in
82 */
83
84 #include <sys/cdefs.h>
85 __KERNEL_RCSID(0, "$NetBSD: kern_heartbeat.c,v 1.14 2024/08/25 01:14:01 riastradh Exp $");
86
87 #ifdef _KERNEL_OPT
88 #include "opt_ddb.h"
89 #include "opt_heartbeat.h"
90 #endif
91
92 #include "heartbeat.h"
93
94 #include <sys/param.h>
95 #include <sys/types.h>
96
97 #include <sys/atomic.h>
98 #include <sys/cpu.h>
99 #include <sys/errno.h>
100 #include <sys/heartbeat.h>
101 #include <sys/ipi.h>
102 #include <sys/kernel.h>
103 #include <sys/mutex.h>
104 #include <sys/sysctl.h>
105 #include <sys/systm.h>
106 #include <sys/xcall.h>
107
108 #ifdef DDB
109 #include <ddb/ddb.h>
110 #endif
111
112 /*
113 * Global state.
114 *
115 * heartbeat_lock serializes access to heartbeat_max_period_secs
116 * and heartbeat_max_period_ticks. Two separate variables so we
117 * can avoid multiplication or division in the heartbeat routine.
118 *
119 * heartbeat_sih is stable after initialization in
120 * heartbeat_start.
121 */
122 kmutex_t heartbeat_lock __cacheline_aligned;
123 unsigned heartbeat_max_period_secs __read_mostly;
124 unsigned heartbeat_max_period_ticks __read_mostly;
125
126 void *heartbeat_sih __read_mostly;
127
128 /*
129 * heartbeat_suspend()
130 *
131 * Suspend heartbeat monitoring of the current CPU.
132 *
133 * Called after the current CPU has been marked offline but before
134 * it has stopped running, or after IPL has been raised for
135 * polling-mode console input. Nestable (but only 2^32 times, so
136 * don't do this in a loop). Reversed by heartbeat_resume.
137 *
138 * Caller must be bound to the CPU, i.e., curcpu_stable() must be
139 * true. This function does not assert curcpu_stable() since it
140 * is used in the ddb entry path, where any assertions risk
141 * infinite regress into undebuggable chaos, so callers must be
142 * careful.
143 */
144 void
145 heartbeat_suspend(void)
146 {
147 unsigned *p;
148
149 p = &curcpu()->ci_heartbeat_suspend;
150 atomic_store_relaxed(p, *p + 1);
151 }
152
153 /*
154 * heartbeat_resume_cpu(ci)
155 *
156 * Resume heartbeat monitoring of ci.
157 *
158 * Called at startup while cold, and whenever heartbeat monitoring
159 * is re-enabled after being disabled or the period is changed.
160 * When not cold, ci must be the current CPU.
161 *
162 * Must be run at splsched.
163 */
164 static void
165 heartbeat_resume_cpu(struct cpu_info *ci)
166 {
167
168 KASSERT(__predict_false(cold) || curcpu_stable());
169 KASSERT(__predict_false(cold) || ci == curcpu());
170 /* XXX KASSERT IPL_SCHED */
171
172 ci->ci_heartbeat_count = 0;
173 ci->ci_heartbeat_uptime_cache = time_uptime32;
174 ci->ci_heartbeat_uptime_stamp = 0;
175 }
176
177 /*
178 * heartbeat_resume()
179 *
180 * Resume heartbeat monitoring of the current CPU.
181 *
182 * Called after the current CPU has started running but before it
183 * has been marked online, or when ending polling-mode input
184 * before IPL is restored. Reverses heartbeat_suspend.
185 *
186 * Caller must be bound to the CPU, i.e., curcpu_stable() must be
187 * true.
188 */
189 void
190 heartbeat_resume(void)
191 {
192 struct cpu_info *ci = curcpu();
193 unsigned *p;
194 int s;
195
196 KASSERT(curcpu_stable());
197
198 /*
199 * Reset the state so nobody spuriously thinks we had a heart
200 * attack as soon as the heartbeat checks resume.
201 */
202 s = splsched();
203 heartbeat_resume_cpu(ci);
204 splx(s);
205
206 p = &ci->ci_heartbeat_suspend;
207 atomic_store_relaxed(p, *p - 1);
208 }
209
210 /*
211 * heartbeat_timecounter_suspended()
212 *
213 * True if timecounter heartbeat checks are suspended because the
214 * timecounter may not be advancing, false if heartbeat checks
215 * should check for timecounter progress.
216 */
217 static bool
218 heartbeat_timecounter_suspended(void)
219 {
220 CPU_INFO_ITERATOR cii;
221 struct cpu_info *ci;
222
223 /*
224 * The timecounter ticks only on the primary CPU. Check
225 * whether it's suspended.
226 *
227 * XXX Would be nice if we could find the primary CPU without
228 * iterating over all CPUs.
229 */
230 for (CPU_INFO_FOREACH(cii, ci)) {
231 if (CPU_IS_PRIMARY(ci))
232 return atomic_load_relaxed(&ci->ci_heartbeat_suspend);
233 }
234
235 /*
236 * This should be unreachable -- there had better be a primary
237 * CPU in the system! If not, the timecounter will be busted
238 * anyway.
239 */
240 panic("no primary CPU");
241 }
242
/*
 * heartbeat_reset_xc(a, b)
 *
 *	Cross-call handler: reset the heartbeat state of the CPU it
 *	runs on, just before heartbeat checks are enabled.  Both
 *	arguments are ignored.
 */
static void
heartbeat_reset_xc(void *a, void *b)
{
	/* heartbeat_resume_cpu must be run at splsched. */
	const int ipl = splsched();

	heartbeat_resume_cpu(curcpu());
	splx(ipl);
}
258
259 /*
260 * set_max_period(max_period)
261 *
262 * Set the maximum period, in seconds, for heartbeat checks.
263 *
264 * - If max_period is zero, disable them.
265 *
266 * - If the max period was zero and max_period is nonzero, ensure
267 * all CPUs' heartbeat uptime caches are up-to-date before
268 * re-enabling them.
269 *
270 * max_period must be below UINT_MAX/4/hz to avoid arithmetic
271 * overflow and give room for slop.
272 *
273 * Caller must hold heartbeat_lock.
274 */
275 static void
276 set_max_period(unsigned max_period)
277 {
278
279 KASSERTMSG(max_period <= UINT_MAX/4/hz,
280 "max_period=%u must not exceed UINT_MAX/4/hz=%u (hz=%u)",
281 max_period, UINT_MAX/4/hz, hz);
282 KASSERT(mutex_owned(&heartbeat_lock));
283
284 /*
285 * If we're enabling heartbeat checks, make sure we have a
286 * reasonably up-to-date time_uptime32 cache on all CPUs so we
287 * don't think we had an instant heart attack.
288 */
289 if (heartbeat_max_period_secs == 0 && max_period != 0) {
290 if (cold) {
291 CPU_INFO_ITERATOR cii;
292 struct cpu_info *ci;
293
294 for (CPU_INFO_FOREACH(cii, ci))
295 heartbeat_resume_cpu(ci);
296 } else {
297 const uint64_t ticket =
298 xc_broadcast(0, &heartbeat_reset_xc, NULL, NULL);
299 xc_wait(ticket);
300 }
301 }
302
303 /*
304 * Once the heartbeat state has been updated on all (online)
305 * CPUs, set the period. At this point, heartbeat checks can
306 * begin.
307 */
308 atomic_store_relaxed(&heartbeat_max_period_secs, max_period);
309 atomic_store_relaxed(&heartbeat_max_period_ticks, max_period*hz);
310 }
311
312 /*
313 * heartbeat_max_period_ticks(SYSCTLFN_ARGS)
314 *
315 * Sysctl handler for sysctl kern.heartbeat.max_period. Verifies
316 * it lies within a reasonable interval and sets it.
317 */
318 static int
319 heartbeat_max_period_sysctl(SYSCTLFN_ARGS)
320 {
321 struct sysctlnode node;
322 unsigned max_period;
323 int error;
324
325 mutex_enter(&heartbeat_lock);
326
327 max_period = heartbeat_max_period_secs;
328 node = *rnode;
329 node.sysctl_data = &max_period;
330 error = sysctl_lookup(SYSCTLFN_CALL(&node));
331 if (error || newp == NULL)
332 goto out;
333
334 /*
335 * Ensure there's plenty of slop between heartbeats.
336 */
337 if (max_period > UINT_MAX/4/hz) {
338 error = EOVERFLOW;
339 goto out;
340 }
341
342 /*
343 * Success! Set the period. This enables heartbeat checks if
344 * we went from zero period to nonzero period, or disables them
345 * if the other way around.
346 */
347 set_max_period(max_period);
348 error = 0;
349
350 out: mutex_exit(&heartbeat_lock);
351 return error;
352 }
353
354 /*
355 * sysctl_heartbeat_setup()
356 *
357 * Set up the kern.heartbeat.* sysctl subtree.
358 */
359 SYSCTL_SETUP(sysctl_heartbeat_setup, "sysctl kern.heartbeat setup")
360 {
361 const struct sysctlnode *rnode;
362 int error;
363
364 mutex_init(&heartbeat_lock, MUTEX_DEFAULT, IPL_NONE);
365
366 /* kern.heartbeat */
367 error = sysctl_createv(NULL, 0, NULL, &rnode,
368 CTLFLAG_PERMANENT,
369 CTLTYPE_NODE, "heartbeat",
370 SYSCTL_DESCR("Kernel heartbeat parameters"),
371 NULL, 0, NULL, 0,
372 CTL_KERN, CTL_CREATE, CTL_EOL);
373 if (error) {
374 printf("%s: failed to create kern.heartbeat: %d\n",
375 __func__, error);
376 return;
377 }
378
379 /* kern.heartbeat.max_period */
380 error = sysctl_createv(NULL, 0, &rnode, NULL,
381 CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
382 CTLTYPE_INT, "max_period",
383 SYSCTL_DESCR("Max seconds between heartbeats before panic"),
384 &heartbeat_max_period_sysctl, 0, NULL, 0,
385 CTL_CREATE, CTL_EOL);
386 if (error) {
387 printf("%s: failed to create kern.heartbeat.max_period: %d\n",
388 __func__, error);
389 return;
390 }
391 }
392
393 /*
394 * heartbeat_intr(cookie)
395 *
396 * Soft interrupt handler to update the local CPU's view of the
397 * system uptime. This runs at the same priority level as
398 * callouts, so if callouts are stuck on this CPU, it won't run,
399 * and eventually another CPU will notice that this one is stuck.
400 *
401 * Don't do spl* here -- keep it to a minimum so if anything goes
402 * wrong we don't end up with hard interrupts blocked and unable
403 * to detect a missed heartbeat.
404 */
405 static void
406 heartbeat_intr(void *cookie)
407 {
408 unsigned count = atomic_load_relaxed(&curcpu()->ci_heartbeat_count);
409 unsigned uptime = time_uptime32;
410
411 atomic_store_relaxed(&curcpu()->ci_heartbeat_uptime_stamp, count);
412 atomic_store_relaxed(&curcpu()->ci_heartbeat_uptime_cache, uptime);
413 }
414
415 /*
416 * heartbeat_start()
417 *
418 * Start system heartbeat monitoring.
419 */
420 void
421 heartbeat_start(void)
422 {
423 enum { max_period = HEARTBEAT_MAX_PERIOD_DEFAULT };
424
425 /*
426 * Ensure the maximum period is small enough that we never have
427 * to worry about 32-bit wraparound even if there's a lot of
428 * slop. (In fact this is required to be less than
429 * UINT_MAX/4/hz, but that's not a compile-time constant.)
430 */
431 __CTASSERT(max_period < UINT_MAX/4);
432
433 /*
434 * Establish a softint so we can schedule it once ready. This
435 * should be at the lowest softint priority level so that we
436 * ensure all softint priorities are making progress.
437 */
438 heartbeat_sih = softint_establish(SOFTINT_CLOCK|SOFTINT_MPSAFE,
439 &heartbeat_intr, NULL);
440
441 /*
442 * Now that the softint is established, kick off heartbeat
443 * monitoring with the default period. This will initialize
444 * the per-CPU state to an up-to-date cache of time_uptime32.
445 */
446 mutex_enter(&heartbeat_lock);
447 set_max_period(max_period);
448 mutex_exit(&heartbeat_lock);
449 }
450
451 /*
452 * defibrillator(cookie)
453 *
454 * IPI handler for defibrillation. If the CPU's heart has stopped
455 * beating normally, but the CPU can still execute things,
456 * acknowledge the IPI to the doctor and then panic so we at least
457 * get a stack trace from whatever the current CPU is stuck doing,
458 * if not a core dump.
459 *
460 * (This metaphor is a little stretched, since defibrillation is
461 * usually administered when the heart is beating errattically but
462 * hasn't stopped, and causes the heart to stop temporarily, and
463 * one hopes it is not fatal. But we're (software) engineers, so
464 * we can stretch metaphors like silly putty in a blender.)
465 */
466 static void
467 defibrillator(void *cookie)
468 {
469 bool *ack = cookie;
470
471 /*
472 * Acknowledge the interrupt so the doctor CPU won't trigger a
473 * new panic for defibrillation timeout.
474 */
475 atomic_store_relaxed(ack, true);
476
477 /*
478 * If a panic is already in progress, we may have interrupted
479 * the logic that prints a stack trace on this CPU -- so let's
480 * not make it worse by giving the misapprehension of a
481 * recursive panic.
482 */
483 if (atomic_load_relaxed(&panicstr) != NULL)
484 return;
485
486 panic("%s[%d %s]: heart stopped beating", cpu_name(curcpu()),
487 curlwp->l_lid,
488 curlwp->l_name ? curlwp->l_name : curproc->p_comm);
489 }
490
491 /*
492 * defibrillate(ci, unsigned d)
493 *
494 * The patient CPU ci's heart has stopped beating after d seconds.
495 * Force the patient CPU ci to panic, or panic on this CPU if the
496 * patient CPU doesn't respond within 1sec.
497 */
498 static void __noinline
499 defibrillate(struct cpu_info *ci, unsigned d)
500 {
501 bool ack = false;
502 ipi_msg_t msg = {
503 .func = &defibrillator,
504 .arg = &ack,
505 };
506 unsigned countdown = 1000; /* 1sec */
507
508 KASSERT(curcpu_stable());
509
510 /*
511 * First notify the console that the patient CPU's heart seems
512 * to have stopped beating.
513 */
514 printf("%s: found %s heart stopped beating after %u seconds\n",
515 cpu_name(curcpu()), cpu_name(ci), d);
516
517 /*
518 * Next, give the patient CPU a chance to panic, so we get a
519 * stack trace on that CPU even if we don't get a crash dump.
520 */
521 ipi_unicast(&msg, ci);
522
523 /*
524 * Busy-wait up to 1sec for the patient CPU to print a stack
525 * trace and panic. If the patient CPU acknowledges the IPI,
526 * just give up and stop here -- the system is coming down soon
527 * and we should avoid getting in the way.
528 */
529 while (countdown --> 0) {
530 if (atomic_load_relaxed(&ack))
531 return;
532 DELAY(1000); /* 1ms */
533 }
534
535 /*
536 * The patient CPU failed to acknowledge the panic request.
537 * Panic now; with any luck, we'll get a crash dump.
538 */
539 panic("%s: found %s heart stopped beating and unresponsive",
540 cpu_name(curcpu()), cpu_name(ci));
541 }
542
543 /*
544 * select_patient()
545 *
546 * Select another CPU to check the heartbeat of. Returns NULL if
547 * there are no other online CPUs. Never returns curcpu().
548 * Caller must have kpreemption disabled.
549 */
550 static struct cpu_info *
551 select_patient(void)
552 {
553 CPU_INFO_ITERATOR cii;
554 struct cpu_info *first = NULL, *patient = NULL, *ci;
555 bool passedcur = false;
556
557 KASSERT(curcpu_stable());
558
559 /*
560 * In the iteration order of all CPUs, find the next online CPU
561 * after curcpu(), or the first online one if curcpu() is last
562 * in the iteration order.
563 */
564 for (CPU_INFO_FOREACH(cii, ci)) {
565 if (atomic_load_relaxed(&ci->ci_heartbeat_suspend))
566 continue;
567 if (passedcur) {
568 /*
569 * (...|curcpu()|ci|...)
570 *
571 * Found the patient right after curcpu().
572 */
573 KASSERT(patient != ci);
574 patient = ci;
575 break;
576 }
577 if (ci == curcpu()) {
578 /*
579 * (...|prev|ci=curcpu()|next|...)
580 *
581 * Note that we want next (or first, if there's
582 * nothing after curcpu()).
583 */
584 passedcur = true;
585 continue;
586 }
587 if (first == NULL) {
588 /*
589 * (ci|...|curcpu()|...)
590 *
591 * Record ci as first in case there's nothing
592 * after curcpu().
593 */
594 first = ci;
595 continue;
596 }
597 }
598
599 /*
600 * If we hit the end, wrap around to the beginning.
601 */
602 if (patient == NULL) {
603 KASSERT(passedcur);
604 patient = first;
605 }
606
607 return patient;
608 }
609
610 /*
611 * heartbeat()
612 *
613 * 1. Count a heartbeat on the local CPU.
614 *
615 * 2. Panic if the system uptime doesn't seem to have advanced in
616 * a while.
617 *
618 * 3. Panic if the soft interrupt on this CPU hasn't advanced the
619 * local view of the system uptime.
620 *
621 * 4. Schedule the soft interrupt to advance the local view of the
622 * system uptime.
623 *
624 * 5. Select another CPU to check the heartbeat of.
625 *
626 * 6. Panic if the other CPU hasn't advanced its view of the
627 * system uptime in a while.
628 */
629 void
630 heartbeat(void)
631 {
632 unsigned period_ticks, period_secs;
633 unsigned count, uptime, cache, stamp, d;
634 struct cpu_info *patient;
635
636 KASSERT(curcpu_stable());
637
638 /*
639 * If heartbeat checks are disabled globally, or if they are
640 * suspended locally, or if we're already panicking so it's not
641 * helpful to trigger more panics for more reasons, do nothing.
642 */
643 period_ticks = atomic_load_relaxed(&heartbeat_max_period_ticks);
644 period_secs = atomic_load_relaxed(&heartbeat_max_period_secs);
645 if (__predict_false(period_ticks == 0) ||
646 __predict_false(period_secs == 0) ||
647 __predict_false(curcpu()->ci_heartbeat_suspend) ||
648 __predict_false(panicstr != NULL))
649 return;
650
651 /*
652 * Count a heartbeat on this CPU.
653 */
654 count = curcpu()->ci_heartbeat_count++;
655
656 /*
657 * If the uptime hasn't changed, make sure that we haven't
658 * counted too many of our own heartbeats since the uptime last
659 * changed, and stop here -- we only do the cross-CPU work once
660 * per second.
661 */
662 uptime = time_uptime32;
663 cache = atomic_load_relaxed(&curcpu()->ci_heartbeat_uptime_cache);
664 if (__predict_true(cache == uptime)) {
665 /*
666 * Timecounter hasn't advanced by more than a second.
667 * Make sure the timecounter isn't stuck according to
668 * our heartbeats -- unless timecounter heartbeats are
669 * suspended too.
670 *
671 * Our own heartbeat count can't roll back, and
672 * time_uptime32 should be updated before it wraps
673 * around, so d should never go negative; hence no
674 * check for d < UINT_MAX/2.
675 */
676 stamp =
677 atomic_load_relaxed(&curcpu()->ci_heartbeat_uptime_stamp);
678 d = count - stamp;
679 if (__predict_false(d > period_ticks) &&
680 !heartbeat_timecounter_suspended()) {
681 panic("%s: time has not advanced in %u heartbeats",
682 cpu_name(curcpu()), d);
683 }
684 return;
685 }
686
687 /*
688 * If the uptime has changed, make sure that it hasn't changed
689 * so much that softints must be stuck on this CPU. Since
690 * time_uptime32 is monotonic and our cache of it is updated at
691 * most every UINT_MAX/4/hz sec (hence no concern about
692 * wraparound even after 68 or 136 years), this can't go
693 * negative, hence no check for d < UINT_MAX/2.
694 *
695 * This uses the hard timer interrupt handler on the current
696 * CPU to ensure soft interrupts at all priority levels have
697 * made progress.
698 */
699 d = uptime - cache;
700 if (__predict_false(d > period_secs)) {
701 panic("%s: softints stuck for %u seconds",
702 cpu_name(curcpu()), d);
703 }
704
705 /*
706 * Schedule a softint to update our cache of the system uptime
707 * so the next call to heartbeat, on this or another CPU, can
708 * detect progress on this one.
709 */
710 softint_schedule(heartbeat_sih);
711
712 /*
713 * Select a patient to check the heartbeat of. If there's no
714 * other online CPU, nothing to do.
715 */
716 patient = select_patient();
717 if (patient == NULL)
718 return;
719
720 /*
721 * Verify that time is advancing on the patient CPU. If the
722 * delta exceeds UINT_MAX/2, that means it is already ahead by
723 * a little on the other CPU, and the subtraction went
724 * negative, which is OK. If the CPU's heartbeats have been
725 * suspended since we selected it, no worries.
726 *
727 * This uses the current CPU to ensure the other CPU has made
728 * progress, even if the other CPU's hard timer interrupt
729 * handler is stuck for some reason.
730 *
731 * XXX Maybe confirm it hasn't gone negative by more than
732 * max_period?
733 */
734 d = uptime - atomic_load_relaxed(&patient->ci_heartbeat_uptime_cache);
735 if (__predict_false(d > period_secs) &&
736 __predict_false(d < UINT_MAX/2) &&
737 atomic_load_relaxed(&patient->ci_heartbeat_suspend) == 0)
738 defibrillate(patient, d);
739 }
740
741 /*
742 * heartbeat_dump()
743 *
744 * Print the heartbeat data of all CPUs. Can be called from ddb.
745 */
746 #ifdef DDB
747 static unsigned
748 db_read_unsigned(const volatile unsigned *p)
749 {
750 unsigned x;
751
752 db_read_bytes((db_addr_t)(uintptr_t)p, sizeof(x), (char *)&x);
753
754 return x;
755 }
756
757 void
758 heartbeat_dump(void)
759 {
760 struct cpu_info *ci;
761
762 db_printf("Heartbeats:\n");
763 for (ci = db_cpu_first(); ci != NULL; ci = db_cpu_next(ci)) {
764 db_printf("cpu%u: count %u uptime %u stamp %u suspend %u\n",
765 db_read_unsigned(&ci->ci_index),
766 db_read_unsigned(&ci->ci_heartbeat_count),
767 db_read_unsigned(&ci->ci_heartbeat_uptime_cache),
768 db_read_unsigned(&ci->ci_heartbeat_uptime_stamp),
769 db_read_unsigned(&ci->ci_heartbeat_suspend));
770 }
771 }
772 #endif
773