/* $NetBSD: kern_heartbeat.c,v 1.10 2023/09/06 12:29:14 riastradh Exp $ */

/*-
 * Copyright (c) 2023 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * heartbeat(9) -- periodic checks to ensure CPUs are making progress
 *
 * Manual tests to run when changing this file. Magic numbers are for
 * evbarm; adjust for other platforms. Tests involving cpuctl
 * online/offline assume a 2-CPU system -- for full testing on a >2-CPU
 * system, offline all but one CPU.
 *
 * 1. cpuctl offline 0
 *    sleep 20
 *    cpuctl online 0
 *
 * 2. cpuctl offline 1
 *    sleep 20
 *    cpuctl online 1
 *
 * 3. cpuctl offline 0
 *    sysctl -w kern.heartbeat.max_period=5
 *    sleep 10
 *    sysctl -w kern.heartbeat.max_period=0
 *    sleep 10
 *    sysctl -w kern.heartbeat.max_period=5
 *    sleep 10
 *    cpuctl online 0
 *
 * 4. sysctl -w debug.crashme_enable=1
 *    sysctl -w debug.crashme.spl_spinout=1   # IPL_SOFTCLOCK
 *    # verify system panics after 15sec, with a stack trace through
 *    # crashme_spl_spinout
 *
 * 5. sysctl -w debug.crashme_enable=1
 *    sysctl -w debug.crashme.spl_spinout=6   # IPL_SCHED
 *    # verify system panics after 15sec, with a stack trace through
 *    # crashme_spl_spinout
 *
 * 6. cpuctl offline 0
 *    sysctl -w debug.crashme_enable=1
 *    sysctl -w debug.crashme.spl_spinout=1   # IPL_SOFTCLOCK
 *    # verify system panics after 15sec, with a stack trace through
 *    # crashme_spl_spinout
 *
 * 7. cpuctl offline 0
 *    sysctl -w debug.crashme_enable=1
 *    sysctl -w debug.crashme.spl_spinout=5   # IPL_VM
 *    # verify system panics after 15sec, with a stack trace through
 *    # crashme_spl_spinout
 *
 *    # Not this -- IPL_SCHED and IPL_HIGH spinout on a single CPU
 *    # require a hardware watchdog timer.
 *    #cpuctl offline 0
 *    #sysctl -w debug.crashme_enable=1
 *    #sysctl -w debug.crashme.spl_spinout=6   # IPL_SCHED
 *    # hope watchdog timer kicks in
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_heartbeat.c,v 1.10 2023/09/06 12:29:14 riastradh Exp $");

#ifdef _KERNEL_OPT
#include "opt_ddb.h"
#include "opt_heartbeat.h"
#endif

#include "heartbeat.h"

#include <sys/param.h>
#include <sys/types.h>

#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/errno.h>
#include <sys/heartbeat.h>
#include <sys/ipi.h>
#include <sys/kernel.h>
#include <sys/mutex.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/xcall.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

/*
 * Global state.
 *
 * heartbeat_lock serializes access to heartbeat_max_period_secs
 * and heartbeat_max_period_ticks. Two separate variables so we
 * can avoid multiplication or division in the heartbeat routine.
 *
 * heartbeat_sih is stable after initialization in
 * heartbeat_start.
 */
kmutex_t heartbeat_lock __cacheline_aligned;
unsigned heartbeat_max_period_secs __read_mostly;
unsigned heartbeat_max_period_ticks __read_mostly;
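
/*
 * Worked example (hz = 100 here is an illustrative value, not an
 * assumption of the code): with kern.heartbeat.max_period set to 15,
 * set_max_period stores heartbeat_max_period_secs = 15 and
 * heartbeat_max_period_ticks = 15*100 = 1500, so heartbeat() can
 * compare second-granularity uptime deltas against the former and raw
 * per-tick counter deltas against the latter without multiplying or
 * dividing by hz on every clock tick.
 */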

void *heartbeat_sih __read_mostly;

/*
 * heartbeat_suspend()
 *
 * Suspend heartbeat monitoring of the current CPU.
 *
 * Called after the current CPU has been marked offline but before
 * it has stopped running, or after IPL has been raised for
 * polling-mode console input. Binds to the current CPU as a side
 * effect. Nestable (but only up to 2^32 times, so don't do this
 * in a loop). Reversed by heartbeat_resume.
 */
void
heartbeat_suspend(void)
{
        unsigned *p;

        /*
         * We could use curlwp_bind, but we'd have to record whether we
         * were already bound or not to pass to curlwp_bindx in
         * heartbeat_resume. Using kpreempt_disable is simpler and
         * unlikely to have any adverse consequences, since this only
         * happens when we're about to go into a tight polling loop at
         * raised IPL anyway.
         */
        kpreempt_disable();
        p = &curcpu()->ci_heartbeat_suspend;
        atomic_store_relaxed(p, *p + 1);
}

/*
 * heartbeat_resume_cpu(ci)
 *
 * Resume heartbeat monitoring of ci.
 *
 * Called at startup while cold, and whenever heartbeat monitoring
 * is re-enabled after being disabled or the period is changed.
 * When not cold, ci must be the current CPU.
 *
 * Must be run at splsched.
 */
static void
heartbeat_resume_cpu(struct cpu_info *ci)
{

        KASSERT(__predict_false(cold) || curcpu_stable());
        KASSERT(__predict_false(cold) || ci == curcpu());
        /* XXX KASSERT IPL_SCHED */

        ci->ci_heartbeat_count = 0;
        ci->ci_heartbeat_uptime_cache = time_uptime;
        ci->ci_heartbeat_uptime_stamp = 0;
}

/*
 * heartbeat_resume()
 *
 * Resume heartbeat monitoring of the current CPU.
 *
 * Called after the current CPU has started running but before it
 * has been marked online, or when ending polling-mode input
 * before IPL is restored. Reverses heartbeat_suspend.
 */
void
heartbeat_resume(void)
{
        struct cpu_info *ci = curcpu();
        unsigned *p;
        int s;

        /*
         * Reset the state so nobody spuriously thinks we had a heart
         * attack as soon as the heartbeat checks resume.
         */
        s = splsched();
        heartbeat_resume_cpu(ci);
        splx(s);

        p = &ci->ci_heartbeat_suspend;
        atomic_store_relaxed(p, *p - 1);
        kpreempt_enable();
}
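
/*
 * Example pairing of heartbeat_suspend/heartbeat_resume (an
 * illustrative sketch only -- mycons_pollgetc and its softc are
 * hypothetical, not real console hooks): a driver entering a tight
 * polling loop at raised IPL suspends heartbeat checks on the current
 * CPU so the apparent stall is not misdiagnosed as a missed heartbeat,
 * and resumes them before restoring the IPL:
 *
 *        s = splhigh();
 *        heartbeat_suspend();
 *        while ((c = mycons_pollgetc(sc)) == -1)
 *                continue;
 *        heartbeat_resume();
 *        splx(s);
 */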

/*
 * heartbeat_timecounter_suspended()
 *
 * True if timecounter heartbeat checks are suspended because the
 * timecounter may not be advancing, false if heartbeat checks
 * should check for timecounter progress.
 */
static bool
heartbeat_timecounter_suspended(void)
{
        CPU_INFO_ITERATOR cii;
        struct cpu_info *ci;

        /*
         * The timecounter ticks only on the primary CPU. Check
         * whether it's suspended.
         *
         * XXX Would be nice if we could find the primary CPU without
         * iterating over all CPUs.
         */
        for (CPU_INFO_FOREACH(cii, ci)) {
                if (CPU_IS_PRIMARY(ci))
                        return atomic_load_relaxed(&ci->ci_heartbeat_suspend);
        }

        /*
         * This should be unreachable -- there had better be a primary
         * CPU in the system! If not, the timecounter will be busted
         * anyway.
         */
        panic("no primary CPU");
}

/*
 * heartbeat_reset_xc(a, b)
 *
 * Cross-call handler to reset heartbeat state just prior to
 * enabling heartbeat checks.
 */
static void
heartbeat_reset_xc(void *a, void *b)
{
        int s;

        s = splsched();
        heartbeat_resume_cpu(curcpu());
        splx(s);
}

/*
 * set_max_period(max_period)
 *
 * Set the maximum period, in seconds, for heartbeat checks.
 *
 * - If max_period is zero, disable them.
 *
 * - If the max period was zero and max_period is nonzero, ensure
 *   all CPUs' heartbeat uptime caches are up-to-date before
 *   re-enabling them.
 *
 * max_period must be below UINT_MAX/4/hz to avoid arithmetic
 * overflow and give room for slop.
 *
 * Caller must hold heartbeat_lock.
 */
static void
set_max_period(unsigned max_period)
{

        KASSERTMSG(max_period <= UINT_MAX/4/hz,
            "max_period=%u must not exceed UINT_MAX/4/hz=%u (hz=%u)",
            max_period, UINT_MAX/4/hz, hz);
        KASSERT(mutex_owned(&heartbeat_lock));

        /*
         * If we're enabling heartbeat checks, make sure we have a
         * reasonably up-to-date time_uptime cache on all CPUs so we
         * don't think we had an instant heart attack.
         */
        if (heartbeat_max_period_secs == 0 && max_period != 0) {
                if (cold) {
                        CPU_INFO_ITERATOR cii;
                        struct cpu_info *ci;

                        for (CPU_INFO_FOREACH(cii, ci))
                                heartbeat_resume_cpu(ci);
                } else {
                        const uint64_t ticket =
                            xc_broadcast(0, &heartbeat_reset_xc, NULL, NULL);
                        xc_wait(ticket);
                }
        }

        /*
         * Once the heartbeat state has been updated on all (online)
         * CPUs, set the period. At this point, heartbeat checks can
         * begin.
         */
        atomic_store_relaxed(&heartbeat_max_period_secs, max_period);
        atomic_store_relaxed(&heartbeat_max_period_ticks, max_period*hz);
}
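
/*
 * For concreteness (hz = 100 is only an example value): the bound
 * asserted above allows max_period up to UINT_MAX/4/hz =
 * 4294967295/4/100 = 10737418 seconds, roughly 124 days, so any
 * sensible period leaves ample headroom against overflow when it is
 * multiplied by hz and compared against tick counts.
 */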

/*
 * heartbeat_max_period_sysctl(SYSCTLFN_ARGS)
 *
 * Sysctl handler for sysctl kern.heartbeat.max_period. Verifies
 * it lies within a reasonable interval and sets it.
 */
static int
heartbeat_max_period_sysctl(SYSCTLFN_ARGS)
{
        struct sysctlnode node;
        unsigned max_period;
        int error;

        mutex_enter(&heartbeat_lock);

        max_period = heartbeat_max_period_secs;
        node = *rnode;
        node.sysctl_data = &max_period;
        error = sysctl_lookup(SYSCTLFN_CALL(&node));
        if (error || newp == NULL)
                goto out;

        /*
         * Ensure there's plenty of slop between heartbeats.
         */
        if (max_period > UINT_MAX/4/hz) {
                error = EOVERFLOW;
                goto out;
        }

        /*
         * Success! Set the period. This enables heartbeat checks if
         * we went from zero period to nonzero period, or disables them
         * if the other way around.
         */
        set_max_period(max_period);
        error = 0;

out:    mutex_exit(&heartbeat_lock);
        return error;
}

/*
 * sysctl_heartbeat_setup()
 *
 * Set up the kern.heartbeat.* sysctl subtree.
 */
SYSCTL_SETUP(sysctl_heartbeat_setup, "sysctl kern.heartbeat setup")
{
        const struct sysctlnode *rnode;
        int error;

        mutex_init(&heartbeat_lock, MUTEX_DEFAULT, IPL_NONE);

        /* kern.heartbeat */
        error = sysctl_createv(NULL, 0, NULL, &rnode,
            CTLFLAG_PERMANENT,
            CTLTYPE_NODE, "heartbeat",
            SYSCTL_DESCR("Kernel heartbeat parameters"),
            NULL, 0, NULL, 0,
            CTL_KERN, CTL_CREATE, CTL_EOL);
        if (error) {
                printf("%s: failed to create kern.heartbeat: %d\n",
                    __func__, error);
                return;
        }

        /* kern.heartbeat.max_period */
        error = sysctl_createv(NULL, 0, &rnode, NULL,
            CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
            CTLTYPE_INT, "max_period",
            SYSCTL_DESCR("Max seconds between heartbeats before panic"),
            &heartbeat_max_period_sysctl, 0, NULL, 0,
            CTL_CREATE, CTL_EOL);
        if (error) {
                printf("%s: failed to create kern.heartbeat.max_period: %d\n",
                    __func__, error);
                return;
        }
}
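
/*
 * Example of driving the knob created above from userland (a sketch
 * only; error handling omitted): the manual tests at the top of this
 * file use sysctl(8), and a C program could do the equivalent with
 * sysctlbyname(3):
 *
 *        int period = 5;        // seconds; 0 disables the checks
 *        sysctlbyname("kern.heartbeat.max_period", NULL, NULL,
 *            &period, sizeof(period));
 */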

/*
 * heartbeat_intr(cookie)
 *
 * Soft interrupt handler to update the local CPU's view of the
 * system uptime. This runs at the same priority level as
 * callouts, so if callouts are stuck on this CPU, it won't run,
 * and eventually another CPU will notice that this one is stuck.
 *
 * Don't do spl* here -- keep it to a minimum so if anything goes
 * wrong we don't end up with hard interrupts blocked and unable
 * to detect a missed heartbeat.
 */
static void
heartbeat_intr(void *cookie)
{
        unsigned count = atomic_load_relaxed(&curcpu()->ci_heartbeat_count);
        unsigned uptime = time_uptime;

        atomic_store_relaxed(&curcpu()->ci_heartbeat_uptime_stamp, count);
        atomic_store_relaxed(&curcpu()->ci_heartbeat_uptime_cache, uptime);
}

/*
 * heartbeat_start()
 *
 * Start system heartbeat monitoring.
 */
void
heartbeat_start(void)
{
        const unsigned max_period = HEARTBEAT_MAX_PERIOD_DEFAULT;

        /*
         * Establish a softint so we can schedule it once ready. This
         * should be at the lowest softint priority level so that we
         * ensure all softint priorities are making progress.
         */
        heartbeat_sih = softint_establish(SOFTINT_CLOCK|SOFTINT_MPSAFE,
            &heartbeat_intr, NULL);

        /*
         * Now that the softint is established, kick off heartbeat
         * monitoring with the default period. This will initialize
         * the per-CPU state to an up-to-date cache of time_uptime.
         */
        mutex_enter(&heartbeat_lock);
        set_max_period(max_period);
        mutex_exit(&heartbeat_lock);
}

/*
 * defibrillator(cookie)
 *
 * IPI handler for defibrillation. If the CPU's heart has stopped
 * beating normally, but the CPU can still execute things,
 * acknowledge the IPI to the doctor and then panic so we at least
 * get a stack trace from whatever the current CPU is stuck doing,
 * if not a core dump.
 *
 * (This metaphor is a little stretched, since defibrillation is
 * usually administered when the heart is beating erratically but
 * hasn't stopped, and causes the heart to stop temporarily, and
 * one hopes it is not fatal. But we're (software) engineers, so
 * we can stretch metaphors like silly putty in a blender.)
 */
static void
defibrillator(void *cookie)
{
        bool *ack = cookie;

        /*
         * Acknowledge the interrupt so the doctor CPU won't trigger a
         * new panic for defibrillation timeout.
         */
        atomic_store_relaxed(ack, true);

        /*
         * If a panic is already in progress, we may have interrupted
         * the logic that prints a stack trace on this CPU -- so let's
         * not make it worse by giving the misapprehension of a
         * recursive panic.
         */
        if (atomic_load_relaxed(&panicstr) != NULL)
                return;

        panic("%s[%d %s]: heart stopped beating", cpu_name(curcpu()),
            curlwp->l_lid,
            curlwp->l_name ? curlwp->l_name : curproc->p_comm);
}

/*
 * defibrillate(ci, d)
 *
 * The patient CPU ci's heart has stopped beating after d seconds.
 * Force the patient CPU ci to panic, or panic on this CPU if the
 * patient CPU doesn't respond within 1sec.
 */
static void __noinline
defibrillate(struct cpu_info *ci, unsigned d)
{
        bool ack = false;
        ipi_msg_t msg = {
                .func = &defibrillator,
                .arg = &ack,
        };
        unsigned countdown = 1000; /* 1sec */

        KASSERT(curcpu_stable());

        /*
         * First notify the console that the patient CPU's heart seems
         * to have stopped beating.
         */
        printf("%s: found %s heart stopped beating after %u seconds\n",
            cpu_name(curcpu()), cpu_name(ci), d);

        /*
         * Next, give the patient CPU a chance to panic, so we get a
         * stack trace on that CPU even if we don't get a crash dump.
         */
        ipi_unicast(&msg, ci);

        /*
         * Busy-wait up to 1sec for the patient CPU to print a stack
         * trace and panic. If the patient CPU acknowledges the IPI,
         * just give up and stop here -- the system is coming down soon
         * and we should avoid getting in the way.
         */
        while (countdown --> 0) {
                if (atomic_load_relaxed(&ack))
                        return;
                DELAY(1000); /* 1ms */
        }

        /*
         * The patient CPU failed to acknowledge the panic request.
         * Panic now; with any luck, we'll get a crash dump.
         */
        panic("%s: found %s heart stopped beating and unresponsive",
            cpu_name(curcpu()), cpu_name(ci));
}

/*
 * select_patient()
 *
 * Select another CPU to check the heartbeat of. Returns NULL if
 * there are no other online CPUs. Never returns curcpu().
 * Caller must have kpreemption disabled.
 */
static struct cpu_info *
select_patient(void)
{
        CPU_INFO_ITERATOR cii;
        struct cpu_info *first = NULL, *patient = NULL, *ci;
        bool passedcur = false;

        KASSERT(curcpu_stable());

        /*
         * In the iteration order of all CPUs, find the next online CPU
         * after curcpu(), or the first online one if curcpu() is last
         * in the iteration order.
         */
        for (CPU_INFO_FOREACH(cii, ci)) {
                if (atomic_load_relaxed(&ci->ci_heartbeat_suspend))
                        continue;
                if (passedcur) {
                        /*
                         * (...|curcpu()|ci|...)
                         *
                         * Found the patient right after curcpu().
                         */
                        KASSERT(patient != ci);
                        patient = ci;
                        break;
                }
                if (ci == curcpu()) {
                        /*
                         * (...|prev|ci=curcpu()|next|...)
                         *
                         * Note that we want next (or first, if there's
                         * nothing after curcpu()).
                         */
                        passedcur = true;
                        continue;
                }
                if (first == NULL) {
                        /*
                         * (ci|...|curcpu()|...)
                         *
                         * Record ci as first in case there's nothing
                         * after curcpu().
                         */
                        first = ci;
                        continue;
                }
        }

        /*
         * If we hit the end, wrap around to the beginning.
         */
        if (patient == NULL) {
                KASSERT(passedcur);
                patient = first;
        }

        return patient;
}
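
/*
 * For example (an illustrative scenario, not a constraint of the
 * code): on a machine whose CPU iteration order is cpu0, cpu1, cpu2
 * with all three online, select_patient returns cpu1 when called on
 * cpu0 and wraps around to return cpu0 when called on cpu2; if cpu1
 * and cpu2 have their heartbeats suspended, a call on cpu0 finds no
 * other candidate and returns NULL.
 */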

/*
 * heartbeat()
 *
 * 1. Count a heartbeat on the local CPU.
 *
 * 2. Panic if the system uptime doesn't seem to have advanced in
 *    a while.
 *
 * 3. Panic if the soft interrupt on this CPU hasn't advanced the
 *    local view of the system uptime.
 *
 * 4. Schedule the soft interrupt to advance the local view of the
 *    system uptime.
 *
 * 5. Select another CPU to check the heartbeat of.
 *
 * 6. Panic if the other CPU hasn't advanced its view of the
 *    system uptime in a while.
 */
void
heartbeat(void)
{
        unsigned period_ticks, period_secs;
        unsigned count, uptime, cache, stamp, d;
        struct cpu_info *patient;

        KASSERT(curcpu_stable());

        period_ticks = atomic_load_relaxed(&heartbeat_max_period_ticks);
        period_secs = atomic_load_relaxed(&heartbeat_max_period_secs);
        if (__predict_false(period_ticks == 0) ||
            __predict_false(period_secs == 0) ||
            __predict_false(curcpu()->ci_heartbeat_suspend))
                return;

        /*
         * Count a heartbeat on this CPU.
         */
        count = curcpu()->ci_heartbeat_count++;

        /*
         * If the uptime hasn't changed, make sure that we haven't
         * counted too many of our own heartbeats since the uptime last
         * changed, and stop here -- we only do the cross-CPU work once
         * per second.
         */
        uptime = time_uptime;
        cache = atomic_load_relaxed(&curcpu()->ci_heartbeat_uptime_cache);
        if (__predict_true(cache == uptime)) {
                /*
                 * Timecounter hasn't advanced by more than a second.
                 * Make sure the timecounter isn't stuck according to
                 * our heartbeats -- unless timecounter heartbeats are
                 * suspended too.
                 *
                 * Our own heartbeat count can't roll back, and
                 * time_uptime should be updated before it wraps
                 * around, so d should never go negative; hence no
                 * check for d < UINT_MAX/2.
                 */
                stamp =
                    atomic_load_relaxed(&curcpu()->ci_heartbeat_uptime_stamp);
                d = count - stamp;
                if (__predict_false(d > period_ticks) &&
                    !heartbeat_timecounter_suspended()) {
                        panic("%s: time has not advanced in %u heartbeats",
                            cpu_name(curcpu()), d);
                }
                return;
        }

        /*
         * If the uptime has changed, make sure that it hasn't changed
         * so much that softints must be stuck on this CPU. Since
         * time_uptime is monotonic, this can't go negative, hence no
         * check for d < UINT_MAX/2.
         *
         * This uses the hard timer interrupt handler on the current
         * CPU to ensure soft interrupts at all priority levels have
         * made progress.
         */
        d = uptime - cache;
        if (__predict_false(d > period_secs)) {
                panic("%s: softints stuck for %u seconds",
                    cpu_name(curcpu()), d);
        }

        /*
         * Schedule a softint to update our cache of the system uptime
         * so the next call to heartbeat, on this or another CPU, can
         * detect progress on this one.
         */
        softint_schedule(heartbeat_sih);

        /*
         * Select a patient to check the heartbeat of. If there's no
         * other online CPU, nothing to do.
         */
        patient = select_patient();
        if (patient == NULL)
                return;

        /*
         * Verify that time is advancing on the patient CPU. If the
         * delta exceeds UINT_MAX/2, that means it is already ahead by
         * a little on the other CPU, and the subtraction went
         * negative, which is OK. If the CPU's heartbeats have been
         * suspended since we selected it, no worries.
         *
         * This uses the current CPU to ensure the other CPU has made
         * progress, even if the other CPU's hard timer interrupt
         * handler is stuck for some reason.
         *
         * XXX Maybe confirm it hasn't gone negative by more than
         * max_period?
         */
        d = uptime - atomic_load_relaxed(&patient->ci_heartbeat_uptime_cache);
        if (__predict_false(d > period_secs) &&
            __predict_false(d < UINT_MAX/2) &&
            atomic_load_relaxed(&patient->ci_heartbeat_suspend) == 0)
                defibrillate(patient, d);
}
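
/*
 * Worked example of the checks above (hz = 100 and a 15-second max
 * period are illustrative values only): if time_uptime stops advancing
 * while this CPU keeps taking clock interrupts, the tick-count check
 * fires after about 15*100 = 1500 local heartbeats; if soft interrupts
 * stall on this CPU while time_uptime keeps advancing, the stale
 * ci_heartbeat_uptime_cache trips the softint check after about 15
 * seconds; and if this CPU stops running its clock interrupt entirely,
 * another CPU notices the stale cache within about 15 seconds and
 * defibrillates it.
 */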

/*
 * heartbeat_dump()
 *
 * Print the heartbeat data of all CPUs. Can be called from ddb.
 */
#ifdef DDB
static unsigned
db_read_unsigned(const volatile unsigned *p)
{
        unsigned x;

        db_read_bytes((db_addr_t)(uintptr_t)p, sizeof(x), (char *)&x);

        return x;
}

void
heartbeat_dump(void)
{
        struct cpu_info *ci;

        db_printf("Heartbeats:\n");
        for (ci = db_cpu_first(); ci != NULL; ci = db_cpu_next(ci)) {
                db_printf("cpu%u: count %u uptime %u stamp %u suspend %u\n",
                    db_read_unsigned(&ci->ci_index),
                    db_read_unsigned(&ci->ci_heartbeat_count),
                    db_read_unsigned(&ci->ci_heartbeat_uptime_cache),
                    db_read_unsigned(&ci->ci_heartbeat_uptime_stamp),
                    db_read_unsigned(&ci->ci_heartbeat_suspend));
        }
}
#endif