/*	$NetBSD: xen_clock.c,v 1.13 2023/07/28 10:38:44 riastradh Exp $	*/

/*-
 * Copyright (c) 2017, 2018 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Taylor R. Campbell.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "opt_xen.h"

#ifndef XEN_CLOCK_DEBUG
#define XEN_CLOCK_DEBUG 0
#endif

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: xen_clock.c,v 1.13 2023/07/28 10:38:44 riastradh Exp $");

#include <sys/param.h>
#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/callout.h>
#include <sys/cpu.h>
#include <sys/device.h>
#include <sys/evcnt.h>
#include <sys/intr.h>
#include <sys/kernel.h>
#include <sys/lwp.h>
#include <sys/proc.h>
#include <sys/sdt.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/timetc.h>

#include <dev/clock_subr.h>

#include <machine/cpu.h>
#include <machine/cpu_counter.h>
#include <machine/lock.h>

#include <xen/evtchn.h>
#include <xen/hypervisor.h>
#include <xen/include/public/vcpu.h>
#include <xen/xen.h>

#include <x86/rtc.h>

#define NS_PER_TICK ((uint64_t)1000000000ULL/hz)
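
/*
 * E.g., with the default hz = 100, NS_PER_TICK comes to 10000000 ns,
 * i.e. a 10 ms hardclock period.
 */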

static uint64_t xen_vcputime_systime_ns(void);
static uint64_t xen_vcputime_raw_systime_ns(void);
static uint64_t xen_global_systime_ns(void);
static unsigned xen_get_timecount(struct timecounter *);
static int xen_timer_handler(void *, struct clockframe *);

/*
 * dtrace probes
 */
SDT_PROBE_DEFINE7(sdt, xen, clock, tsc__backward,
    "uint64_t"/*raw_systime_ns*/,
    "uint64_t"/*tsc_timestamp*/,
    "uint64_t"/*tsc_to_system_mul*/,
    "int"/*tsc_shift*/,
    "uint64_t"/*delta_ns*/,
    "uint64_t"/*tsc*/,
    "uint64_t"/*systime_ns*/);
SDT_PROBE_DEFINE7(sdt, xen, clock, tsc__delta__negative,
    "uint64_t"/*raw_systime_ns*/,
    "uint64_t"/*tsc_timestamp*/,
    "uint64_t"/*tsc_to_system_mul*/,
    "int"/*tsc_shift*/,
    "uint64_t"/*delta_ns*/,
    "uint64_t"/*tsc*/,
    "uint64_t"/*systime_ns*/);
SDT_PROBE_DEFINE7(sdt, xen, clock, systime__wraparound,
    "uint64_t"/*raw_systime_ns*/,
    "uint64_t"/*tsc_timestamp*/,
    "uint64_t"/*tsc_to_system_mul*/,
    "int"/*tsc_shift*/,
    "uint64_t"/*delta_ns*/,
    "uint64_t"/*tsc*/,
    "uint64_t"/*systime_ns*/);
SDT_PROBE_DEFINE7(sdt, xen, clock, systime__backward,
    "uint64_t"/*raw_systime_ns*/,
    "uint64_t"/*tsc_timestamp*/,
    "uint64_t"/*tsc_to_system_mul*/,
    "int"/*tsc_shift*/,
    "uint64_t"/*delta_ns*/,
    "uint64_t"/*tsc*/,
    "uint64_t"/*systime_ns*/);

SDT_PROBE_DEFINE3(sdt, xen, timecounter, backward,
    "uint64_t"/*local*/,
    "uint64_t"/*skew*/,
    "uint64_t"/*global*/);

SDT_PROBE_DEFINE2(sdt, xen, hardclock, systime__backward,
    "uint64_t"/*last_systime_ns*/,
    "uint64_t"/*this_systime_ns*/);
SDT_PROBE_DEFINE3(sdt, xen, hardclock, missed,
    "uint64_t"/*last_systime_ns*/,
    "uint64_t"/*this_systime_ns*/,
    "uint64_t"/*remaining_ns*/);

/*
 * xen timecounter:
 *
 *	Xen vCPU system time, plus an adjustment with rdtsc.
 */
static struct timecounter xen_timecounter = {
	.tc_get_timecount = xen_get_timecount,
	.tc_poll_pps = NULL,
	.tc_counter_mask = ~0U,
	.tc_frequency = 1000000000ULL,	/* 1 GHz, i.e. units of nanoseconds */
	.tc_name = "xen_system_time",
	.tc_quality = 10000,
};
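
/*
 * Illustrative note: with tc_frequency = 1 GHz and a 32-bit counter
 * mask, the timecount wraps about every 2^32 ns ~= 4.3 s; the
 * timecounter(9) machinery copes with that as long as the counter is
 * read more often than it wraps.
 */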

/*
 * xen_global_systime_ns_stamp
 *
 *	The latest Xen vCPU system time that has been observed on any
 *	CPU, for a global monotonic view of the Xen system time clock.
 */
static volatile uint64_t xen_global_systime_ns_stamp __cacheline_aligned;

#ifdef DOM0OPS
/*
 * xen timepush state:
 *
 *	Callout to periodically, after a sysctl-configurable number of
 *	NetBSD ticks, set the Xen hypervisor's wall clock time.
 */
static struct {
	struct callout ch;
	int ticks;
} xen_timepush;

static void xen_timepush_init(void);
static void xen_timepush_intr(void *);
static int sysctl_xen_timepush(SYSCTLFN_ARGS);
#endif

/*
 * xen_rdtsc()
 *
 *	Read the local pCPU's tsc.
 */
static inline uint64_t
xen_rdtsc(void)
{
	uint32_t lo, hi;

	asm volatile("rdtsc" : "=a"(lo), "=d"(hi));

	return ((uint64_t)hi << 32) | lo;
}

/*
 * struct xen_vcputime_ticket
 *
 *	State for a vCPU read section, during which a caller may read
 *	from fields of a struct vcpu_time_info and call xen_rdtsc.
 *	Caller must enter with xen_vcputime_enter, exit with
 *	xen_vcputime_exit, and be prepared to retry if
 *	xen_vcputime_exit fails.
 */
struct xen_vcputime_ticket {
	uint64_t version;
};

/*
 * xen_vcputime_enter(tp)
 *
 *	Enter a vCPU time read section and store a ticket in *tp, which
 *	the caller must use with xen_vcputime_exit. Return a pointer
 *	to the current CPU's vcpu_time_info structure. Caller must
 *	already be bound to the CPU.
 */
static inline volatile struct vcpu_time_info *
xen_vcputime_enter(struct xen_vcputime_ticket *tp)
{
	volatile struct vcpu_time_info *vt = &curcpu()->ci_vcpu->time;

	while (__predict_false(1 & (tp->version = vt->version)))
		SPINLOCK_BACKOFF_HOOK;

	/*
	 * Must read the version before reading the tsc on the local
	 * pCPU. We are racing only with interruption by the
	 * hypervisor, so no need for a stronger memory barrier.
	 */
	__insn_barrier();

	return vt;
}

/*
 * xen_vcputime_exit(vt, tp)
 *
 *	Exit a vCPU time read section with the ticket in *tp from
 *	xen_vcputime_enter. Return true on success, false if caller
 *	must retry.
 */
static inline bool
xen_vcputime_exit(volatile struct vcpu_time_info *vt,
    struct xen_vcputime_ticket *tp)
{

	KASSERT(vt == &curcpu()->ci_vcpu->time);

	/*
	 * Must read the tsc before re-reading the version on the local
	 * pCPU. We are racing only with interruption by the
	 * hypervisor, so no need for a stronger memory barrier.
	 */
	__insn_barrier();

	return tp->version == vt->version;
}
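
/*
 * Canonical use of the read section, as in xen_vcputime_systime_ns
 * below (sketch only):
 *
 *	volatile struct vcpu_time_info *vt;
 *	struct xen_vcputime_ticket ticket;
 *
 *	do {
 *		vt = xen_vcputime_enter(&ticket);
 *		...read vt->fields, xen_rdtsc()...
 *	} while (!xen_vcputime_exit(vt, &ticket));
 */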

/*
 * xen_tsc_to_ns_delta(delta_tsc, mul_frac, shift)
 *
 *	Convert a difference in tsc units to a difference in
 *	nanoseconds given a multiplier and shift for the unit
 *	conversion.
 */
static inline uint64_t
xen_tsc_to_ns_delta(uint64_t delta_tsc, uint32_t tsc_to_system_mul,
    int8_t tsc_shift)
{
	uint32_t delta_tsc_hi, delta_tsc_lo;

	if (tsc_shift < 0)
		delta_tsc >>= -tsc_shift;
	else
		delta_tsc <<= tsc_shift;

	delta_tsc_hi = delta_tsc >> 32;
	delta_tsc_lo = delta_tsc & 0xffffffffUL;

	/* d*m/2^32 = (2^32 d_h + d_l)*m/2^32 = d_h*m + (d_l*m)/2^32 */
	return ((uint64_t)delta_tsc_hi * tsc_to_system_mul) +
	    (((uint64_t)delta_tsc_lo * tsc_to_system_mul) >> 32);
}
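
/*
 * Worked example with made-up numbers: for a 2 GHz tsc, Xen could
 * publish tsc_shift = 0 and tsc_to_system_mul = 0x80000000 (0.5
 * ns/tick in 0.32 fixed point), so a delta of 2000000 tsc ticks
 * converts to
 *
 *	(2000000 * 0x80000000) >> 32 = 1000000 ns = 1 ms.
 */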

/*
 * xen_vcputime_systime_ns()
 *
 *	Return a snapshot of the Xen system time plus an adjustment
 *	from the tsc, in units of nanoseconds. Caller must be bound to
 *	the current CPU.
 */
static uint64_t
xen_vcputime_systime_ns(void)
{
	volatile struct vcpu_time_info *vt;
	struct cpu_info *ci = curcpu();
	struct xen_vcputime_ticket ticket;
	uint64_t raw_systime_ns, tsc_timestamp, tsc, delta_tsc, delta_ns;
	uint32_t tsc_to_system_mul;
	int8_t tsc_shift;
	uint64_t systime_ns;

	/* We'd better be bound to the CPU in _some_ way. */
	KASSERT(cpu_intr_p() || cpu_softintr_p() || kpreempt_disabled() ||
	    (curlwp->l_flag & LP_BOUND));

	/*
	 * Repeatedly try to read the system time, corresponding tsc
	 * timestamp, and tsc frequency until we get a consistent view.
	 */
	do {
		vt = xen_vcputime_enter(&ticket);

		/* Grab Xen's snapshot of raw system time and tsc. */
		raw_systime_ns = vt->system_time;
		tsc_timestamp = vt->tsc_timestamp;

		/* Get Xen's current idea of how fast the tsc is counting. */
		tsc_to_system_mul = vt->tsc_to_system_mul;
		tsc_shift = vt->tsc_shift;

		/* Read the CPU's tsc. */
		tsc = xen_rdtsc();
	} while (!xen_vcputime_exit(vt, &ticket));

	/*
	 * Out of paranoia, check whether the tsc has gone backwards
	 * since Xen's timestamp.
	 *
	 * This shouldn't happen because the Xen hypervisor is supposed
	 * to have read the tsc _before_ writing to the vcpu_time_info
	 * page, _before_ we read the tsc.
	 *
	 * Further, if we switched pCPUs after reading the tsc
	 * timestamp but before reading the CPU's tsc, the hypervisor
	 * had better notify us by updating the version too and forcing
	 * us to retry the vCPU time read.
	 */
	if (__predict_false(tsc < tsc_timestamp)) {
		/*
		 * Notify the console that the CPU's tsc appeared to
		 * run behind Xen's idea of it, and pretend it hadn't.
		 */
		SDT_PROBE7(sdt, xen, clock, tsc__backward,
		    raw_systime_ns, tsc_timestamp,
		    tsc_to_system_mul, tsc_shift, /*delta_ns*/0, tsc,
		    /*systime_ns*/raw_systime_ns);
#if XEN_CLOCK_DEBUG
		device_printf(ci->ci_dev, "xen cpu tsc %"PRIu64
		    " ran backwards from timestamp %"PRIu64
		    " by %"PRIu64"\n",
		    tsc, tsc_timestamp, tsc_timestamp - tsc);
#endif
		ci->ci_xen_cpu_tsc_backwards_evcnt.ev_count++;
		delta_ns = delta_tsc = 0;
	} else {
		/* Find how far the CPU's tsc has advanced. */
		delta_tsc = tsc - tsc_timestamp;

		/* Convert the tsc delta to a nanosecond delta. */
		delta_ns = xen_tsc_to_ns_delta(delta_tsc, tsc_to_system_mul,
		    tsc_shift);
	}

	/*
	 * Notify the console if the delta computation yielded a
	 * negative value, and pretend it hadn't.
	 *
	 * This doesn't make sense but I include it out of paranoia.
	 */
	if (__predict_false((int64_t)delta_ns < 0)) {
		SDT_PROBE7(sdt, xen, clock, tsc__delta__negative,
		    raw_systime_ns, tsc_timestamp,
		    tsc_to_system_mul, tsc_shift, delta_ns, tsc,
		    /*systime_ns*/raw_systime_ns);
#if XEN_CLOCK_DEBUG
		device_printf(ci->ci_dev, "xen tsc delta in ns went negative:"
		    " %"PRId64"\n", delta_ns);
#endif
		ci->ci_xen_tsc_delta_negative_evcnt.ev_count++;
		delta_ns = 0;
	}

	/*
	 * Compute the TSC-adjusted system time.
	 */
	systime_ns = raw_systime_ns + delta_ns;

	/*
	 * Notify the console if the addition wrapped around.
	 *
	 * This shouldn't happen because system time should be relative
	 * to a reasonable reference point, not centuries in the past.
	 * (2^64 ns is approximately half a millennium.)
	 */
	if (__predict_false(systime_ns < raw_systime_ns)) {
		SDT_PROBE7(sdt, xen, clock, systime__wraparound,
		    raw_systime_ns, tsc_timestamp,
		    tsc_to_system_mul, tsc_shift, delta_ns, tsc,
		    systime_ns);
#if XEN_CLOCK_DEBUG
		printf("xen raw systime + tsc delta wrapped around:"
		    " %"PRIu64" + %"PRIu64" = %"PRIu64"\n",
		    raw_systime_ns, delta_ns, systime_ns);
#endif
		ci->ci_xen_raw_systime_wraparound_evcnt.ev_count++;
	}

	/*
	 * Notify the console if the TSC-adjusted Xen system time
	 * appears to have gone backwards, and pretend we had gone
	 * forward. This seems to happen pretty regularly under load.
	 */
	if (__predict_false(ci->ci_xen_last_systime_ns > systime_ns)) {
		SDT_PROBE7(sdt, xen, clock, systime__backward,
		    raw_systime_ns, tsc_timestamp,
		    tsc_to_system_mul, tsc_shift, delta_ns, tsc,
		    systime_ns);
#if XEN_CLOCK_DEBUG
		printf("xen raw systime + tsc delta went backwards:"
		    " %"PRIu64" > %"PRIu64"\n",
		    ci->ci_xen_last_systime_ns, systime_ns);
		printf(" raw_systime_ns=%"PRIu64"\n tsc_timestamp=%"PRIu64"\n"
		    " tsc=%"PRIu64"\n tsc_to_system_mul=%"PRIu32"\n"
		    " tsc_shift=%"PRId8"\n delta_tsc=%"PRIu64"\n"
		    " delta_ns=%"PRIu64"\n",
		    raw_systime_ns, tsc_timestamp, tsc, tsc_to_system_mul,
		    tsc_shift, delta_tsc, delta_ns);
#endif
		ci->ci_xen_raw_systime_backwards_evcnt.ev_count++;
		systime_ns = ci->ci_xen_last_systime_ns + 1;
	}

	/* Remember the TSC-adjusted Xen system time. */
	ci->ci_xen_last_systime_ns = systime_ns;

	/* We had better not have migrated CPUs. */
	KASSERT(ci == curcpu());

	/* And we're done: return the TSC-adjusted systime in nanoseconds. */
	return systime_ns;
}

/*
 * xen_vcputime_raw_systime_ns()
 *
 *	Return a snapshot of the current Xen system time to the
 *	resolution of the Xen hypervisor tick, in units of nanoseconds.
 */
static uint64_t
xen_vcputime_raw_systime_ns(void)
{
	volatile struct vcpu_time_info *vt;
	struct xen_vcputime_ticket ticket;
	uint64_t raw_systime_ns;

	do {
		vt = xen_vcputime_enter(&ticket);
		raw_systime_ns = vt->system_time;
	} while (!xen_vcputime_exit(vt, &ticket));

	return raw_systime_ns;
}

/*
 * struct xen_wallclock_ticket
 *
 *	State for a wall clock read section, during which a caller may
 *	read from the wall clock fields of HYPERVISOR_shared_info.
 *	Caller must enter with xen_wallclock_enter, exit with
 *	xen_wallclock_exit, and be prepared to retry if
 *	xen_wallclock_exit fails.
 */
struct xen_wallclock_ticket {
	uint32_t version;
};

/*
 * xen_wallclock_enter(tp)
 *
 *	Enter a wall clock read section and store a ticket in *tp,
 *	which the caller must use with xen_wallclock_exit.
 */
static inline void
xen_wallclock_enter(struct xen_wallclock_ticket *tp)
{

	while (__predict_false(1 & (tp->version =
		    HYPERVISOR_shared_info->wc_version)))
		SPINLOCK_BACKOFF_HOOK;

	/*
	 * Must read the version from memory before reading the
	 * timestamp from memory, as written potentially by another
	 * pCPU.
	 */
	membar_consumer();
}

/*
 * xen_wallclock_exit(tp)
 *
 *	Exit a wall clock read section with the ticket in *tp from
 *	xen_wallclock_enter. Return true on success, false if caller
 *	must retry.
 */
static inline bool
xen_wallclock_exit(struct xen_wallclock_ticket *tp)
{

	/*
	 * Must read the timestamp from memory before re-reading the
	 * version from memory, as written potentially by another pCPU.
	 */
	membar_consumer();

	return tp->version == HYPERVISOR_shared_info->wc_version;
}

/*
 * xen_global_systime_ns()
 *
 *	Return a global monotonic view of the system time in
 *	nanoseconds, computed by the per-CPU Xen raw system time plus
 *	an rdtsc adjustment, and advance the view of the system time
 *	for all other CPUs.
 */
static uint64_t
xen_global_systime_ns(void)
{
	struct cpu_info *ci;
	uint64_t local, global, skew, result;

	/*
	 * Find the local timecount on this CPU, and make sure it does
	 * not precede the latest global timecount witnessed so far by
	 * any CPU. If it does, add to the local CPU's skew from the
	 * fastest CPU.
	 *
	 * XXX Can we avoid retrying if the CAS fails?
	 */
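	/*
	 * E.g. (illustrative numbers): if the latest global stamp is
	 * 1000 and this CPU computes local + skew = 990, we publish
	 * and return 1001, and add 11 to this CPU's skew so that its
	 * next reading starts at or after 1001.
	 */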
	int s = splsched(); /* make sure we won't be interrupted */
	ci = curcpu();
	do {
		local = xen_vcputime_systime_ns();
		skew = ci->ci_xen_systime_ns_skew;
		global = xen_global_systime_ns_stamp;
		if (__predict_false(local + skew < global + 1)) {
			SDT_PROBE3(sdt, xen, timecounter, backward,
			    local, skew, global);
#if XEN_CLOCK_DEBUG
			device_printf(ci->ci_dev,
			    "xen timecounter went backwards:"
			    " local=%"PRIu64" skew=%"PRIu64" global=%"PRIu64","
			    " adding %"PRIu64" to skew\n",
			    local, skew, global, global + 1 - (local + skew));
#endif
			ci->ci_xen_timecounter_backwards_evcnt.ev_count++;
			result = global + 1;
			ci->ci_xen_systime_ns_skew += global + 1 -
			    (local + skew);
		} else {
			result = local + skew;
		}
	} while (atomic_cas_64(&xen_global_systime_ns_stamp, global, result)
	    != global);
	KASSERT(ci == curcpu());
	splx(s);

	return result;
}

/*
 * xen_get_timecount(tc)
 *
 *	Return the low 32 bits of a global monotonic view of the Xen
 *	system time.
 */
static unsigned
xen_get_timecount(struct timecounter *tc)
{

	KASSERT(tc == &xen_timecounter);

	return (unsigned)xen_global_systime_ns();
}

/*
 * xen_delay(n)
 *
 *	Wait approximately n microseconds.
 */
void
xen_delay(unsigned n)
{
	int bound;

	/* Bind to the CPU so we don't compare tsc on different CPUs. */
	bound = curlwp_bind();

	if (curcpu()->ci_vcpu == NULL) {
		curlwp_bindx(bound);
		return;
	}

	/* Short wait (<500ms) or long wait? */
	if (n < 500000) {
		/*
		 * Xen system time is not precise enough for short
		 * delays, so use the tsc instead.
		 *
		 * We work with the current tsc frequency, and figure
		 * that if it changes while we're delaying, we've
		 * probably delayed long enough -- up to 500ms.
		 *
		 * We do not use cpu_frequency(ci), which uses a
		 * quantity detected at boot time, and which may have
		 * changed by now if Xen has migrated this vCPU to
		 * another pCPU.
		 *
		 * XXX How long does it take to migrate pCPUs?
		 */
		volatile struct vcpu_time_info *vt;
		struct xen_vcputime_ticket ticket;
		uint64_t tsc_start, last_tsc, tsc;
		uint32_t tsc_to_system_mul;
		int8_t tsc_shift;

		/* Get the starting tsc and tsc frequency. */
		do {
			vt = xen_vcputime_enter(&ticket);
			tsc_start = last_tsc = xen_rdtsc();
			tsc_to_system_mul = vt->tsc_to_system_mul;
			tsc_shift = vt->tsc_shift;
		} while (!xen_vcputime_exit(vt, &ticket));

		/*
		 * Wait until as many tsc ticks as there are in n
		 * microseconds have elapsed, or the tsc has gone
		 * backwards meaning we've probably migrated pCPUs.
		 */
		for (;;) {
			tsc = xen_rdtsc();
			if (__predict_false(tsc < last_tsc))
				break;
			if (xen_tsc_to_ns_delta(tsc - tsc_start,
				tsc_to_system_mul, tsc_shift)/1000 >= n)
				break;
			last_tsc = tsc;
		}
	} else {
		/*
		 * Use the Xen system time for >=500ms delays. From my
		 * testing, it seems to sometimes run backward by about
		 * 110us, which is not so bad.
		 */
		uint64_t n_ns = 1000*(uint64_t)n;
		uint64_t start_ns;

		/* Get the start time. */
		start_ns = xen_vcputime_raw_systime_ns();

		/* Wait until the system time has passed the end. */
		do {
			HYPERVISOR_yield();
		} while (xen_vcputime_raw_systime_ns() - start_ns < n_ns);
	}

	/* Unbind from the CPU if we weren't already bound. */
	curlwp_bindx(bound);
}

/*
 * xen_suspendclocks(ci)
 *
 *	Stop handling the Xen timer event on the CPU of ci. Caller
 *	must be running on and bound to ci's CPU.
 *
 *	Actually, caller must have kpreemption disabled, because that's
 *	easier to assert at the moment.
 */
void
xen_suspendclocks(struct cpu_info *ci)
{
	int evtch;

	KASSERT(ci == curcpu());
	KASSERT(kpreempt_disabled());

	/*
	 * Find the VIRQ_TIMER event channel and close it so new timer
	 * interrupt events stop getting delivered to it.
	 *
	 * XXX Should this happen later? This is not the reverse order
	 * of xen_resumeclocks. It is apparently necessary in this
	 * order only because we don't stash evtchn anywhere, but we
	 * could stash it.
	 */
	evtch = unbind_virq_from_evtch(VIRQ_TIMER);
	KASSERT(evtch != -1);

	/*
	 * Mask the event channel so we stop getting new interrupts on
	 * it.
	 */
	hypervisor_mask_event(evtch);

	/*
	 * Now that we are no longer getting new interrupts, remove the
	 * handler and wait for any existing calls to the handler to
	 * complete. After this point, there can be no concurrent
	 * calls to xen_timer_handler.
	 */
	event_remove_handler(evtch,
	    __FPTRCAST(int (*)(void *), xen_timer_handler), ci);

	aprint_verbose("Xen clock: removed event channel %d\n", evtch);

	/* We'd better not have switched CPUs. */
	KASSERT(ci == curcpu());
}

/*
 * xen_resumeclocks(ci)
 *
 *	Start handling the Xen timer event on the CPU of ci. Arm the
 *	Xen timer. Caller must be running on and bound to ci's CPU.
 *
 *	Actually, caller must have kpreemption disabled, because that's
 *	easier to assert at the moment.
 */
void
xen_resumeclocks(struct cpu_info *ci)
{
	char intr_xname[INTRDEVNAMEBUF];
	int evtch;
	int error __diagused;

	KASSERT(ci == curcpu());
	KASSERT(kpreempt_disabled());

	/*
	 * Allocate an event channel to receive VIRQ_TIMER events.
	 */
	evtch = bind_virq_to_evtch(VIRQ_TIMER);
	KASSERT(evtch != -1);

	/*
	 * Set an event handler for VIRQ_TIMER events to call
	 * xen_timer_handler.
	 */
	snprintf(intr_xname, sizeof(intr_xname), "%s clock",
	    device_xname(ci->ci_dev));
	/* XXX sketchy function pointer cast -- fix the API, please */
	if (event_set_handler(evtch,
		__FPTRCAST(int (*)(void *), xen_timer_handler),
		ci, IPL_CLOCK, NULL, intr_xname, true, ci) == NULL)
		panic("failed to establish timer interrupt handler");

	aprint_verbose("Xen %s: using event channel %d\n", intr_xname, evtch);

	/* Disarm the periodic timer on Xen>=3.1 which is allegedly buggy. */
	if (XEN_MAJOR(xen_version) > 3 || XEN_MINOR(xen_version) > 0) {
		error = HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer,
		    ci->ci_vcpuid, NULL);
		KASSERT(error == 0);
	}

	/* Pretend the last hardclock happened right now. */
	ci->ci_xen_hardclock_systime_ns = xen_vcputime_systime_ns();

	/* Arm the one-shot timer. */
	error = HYPERVISOR_set_timer_op(ci->ci_xen_hardclock_systime_ns +
	    NS_PER_TICK);
	KASSERT(error == 0);

	/*
	 * Ready to go. Unmask the event. After this point, Xen may
	 * start calling xen_timer_handler.
	 */
	hypervisor_unmask_event(evtch);

	/* We'd better not have switched CPUs. */
	KASSERT(ci == curcpu());
}

/*
 * xen_timer_handler(cookie, frame)
 *
 *	Periodic Xen timer event handler for NetBSD hardclock. Calls
 *	to this may get delayed, so we run hardclock as many times as
 *	we need to in order to cover the Xen system time that elapsed.
 *	After that, re-arm the timer to run again at the next tick.
 *	The cookie is the pointer to struct cpu_info.
 */
static int
xen_timer_handler(void *cookie, struct clockframe *frame)
{
	struct cpu_info *ci = curcpu();
	uint64_t last, now, delta, next;
	int error;

	KASSERT(cpu_intr_p());
	KASSERT(cookie == ci);

#if defined(XENPV)
	frame = NULL; /* We use values cached in curcpu() */
#endif
again:
	/*
	 * Find how many nanoseconds of Xen system time have elapsed
	 * since the last hardclock tick.
	 */
	last = ci->ci_xen_hardclock_systime_ns;
	now = xen_vcputime_systime_ns();
	if (now < last) {
		SDT_PROBE2(sdt, xen, hardclock, systime__backward,
		    last, now);
#if XEN_CLOCK_DEBUG
		device_printf(ci->ci_dev, "xen systime ran backwards"
		    " in hardclock %"PRIu64"ns\n",
		    last - now);
#endif
		ci->ci_xen_systime_backwards_hardclock_evcnt.ev_count++;
		now = last;
	}
	delta = now - last;

	/*
	 * Play hardclock catchup: run the hardclock timer as many
	 * times as appears necessary based on how much time has
	 * passed.
	 */
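	/*
	 * E.g., with hz = 100 (10 ms ticks), if 35 ms of system time
	 * has elapsed since the last tick, hardclock runs three times,
	 * two of those iterations are counted as missed hardclocks,
	 * and 5 ms of residue carries into the next timer event.
	 */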
	while (delta >= NS_PER_TICK) {
		ci->ci_xen_hardclock_systime_ns += NS_PER_TICK;
		delta -= NS_PER_TICK;
		hardclock(frame);
		if (__predict_false(delta >= NS_PER_TICK)) {
			SDT_PROBE3(sdt, xen, hardclock, missed,
			    last, now, delta);
			ci->ci_xen_missed_hardclock_evcnt.ev_count++;
		}
	}

	/*
	 * Re-arm the timer. If it fails, it's probably because the
	 * time is in the past, so update our idea of what the Xen
	 * system time is and try again.
	 */
	next = ci->ci_xen_hardclock_systime_ns + NS_PER_TICK;
	error = HYPERVISOR_set_timer_op(next);
	if (error)
		goto again;

	/* Success! */
	return 0;
}

/*
 * xen_initclocks()
 *
 *	Initialize the Xen clocks on the current CPU.
 */
void
xen_initclocks(void)
{
	struct cpu_info *ci = curcpu();

	/* If this is the primary CPU, do global initialization first. */
	if (ci == &cpu_info_primary) {
		/* Initialize the systemwide Xen timecounter. */
		tc_init(&xen_timecounter);
	}

	/* Attach the event counters. */
	evcnt_attach_dynamic(&ci->ci_xen_cpu_tsc_backwards_evcnt,
	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
	    "cpu tsc ran backwards");
	evcnt_attach_dynamic(&ci->ci_xen_tsc_delta_negative_evcnt,
	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
	    "tsc delta went negative");
	evcnt_attach_dynamic(&ci->ci_xen_raw_systime_wraparound_evcnt,
	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
	    "raw systime wrapped around");
	evcnt_attach_dynamic(&ci->ci_xen_raw_systime_backwards_evcnt,
	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
	    "raw systime went backwards");
	evcnt_attach_dynamic(&ci->ci_xen_systime_backwards_hardclock_evcnt,
	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
	    "systime went backwards in hardclock");
	evcnt_attach_dynamic(&ci->ci_xen_missed_hardclock_evcnt,
	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
	    "missed hardclock");
	evcnt_attach_dynamic(&ci->ci_xen_timecounter_backwards_evcnt,
	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
	    "timecounter went backwards");

	/* Fire up the clocks. */
	xen_resumeclocks(ci);

#ifdef DOM0OPS
	/*
	 * If this is a privileged dom0, start pushing the wall
	 * clock time back to the Xen hypervisor.
	 */
	if (ci == &cpu_info_primary && xendomain_is_privileged())
		xen_timepush_init();
#endif
}

#ifdef DOM0OPS

/*
 * xen_timepush_init()
 *
 *	Initialize callout to periodically set Xen hypervisor's wall
 *	clock time.
 */
static void
xen_timepush_init(void)
{
	struct sysctllog *log = NULL;
	const struct sysctlnode *node = NULL;
	int error;

	/* Start periodically updating the hypervisor's wall clock time. */
	callout_init(&xen_timepush.ch, 0);
	callout_setfunc(&xen_timepush.ch, xen_timepush_intr, NULL);

	/* Pick a default frequency for timepush. */
	xen_timepush.ticks = 53*hz + 3; /* avoid exact # of min/sec */

	/* Create machdep.xen node. */
	/* XXX Creation of the `machdep.xen' node should be elsewhere. */
	error = sysctl_createv(&log, 0, NULL, &node, 0,
	    CTLTYPE_NODE, "xen",
	    SYSCTL_DESCR("Xen top level node"),
	    NULL, 0, NULL, 0,
	    CTL_MACHDEP, CTL_CREATE, CTL_EOL);
	if (error)
		goto fail;
	KASSERT(node != NULL);

	/* Create int machdep.xen.timepush_ticks knob. */
	error = sysctl_createv(&log, 0, &node, NULL, CTLFLAG_READWRITE,
	    CTLTYPE_INT, "timepush_ticks",
	    SYSCTL_DESCR("How often to update the hypervisor's time-of-day;"
		" 0 to disable"),
	    sysctl_xen_timepush, 0, &xen_timepush.ticks, 0,
	    CTL_CREATE, CTL_EOL);
	if (error)
		goto fail;

	/* Start the timepush callout. */
	callout_schedule(&xen_timepush.ch, xen_timepush.ticks);

	/* Success! */
	return;

fail:	sysctl_teardown(&log);
}

/*
 * xen_timepush_intr(cookie)
 *
 *	Callout interrupt handler to push NetBSD's idea of the wall
 *	clock time, usually synchronized with NTP, back to the Xen
 *	hypervisor.
 */
static void
xen_timepush_intr(void *cookie)
{

	resettodr();
	if (xen_timepush.ticks)
		callout_schedule(&xen_timepush.ch, xen_timepush.ticks);
}

/*
 * sysctl_xen_timepush(...)
 *
 *	Sysctl handler to set machdep.xen.timepush_ticks.
 */
static int
sysctl_xen_timepush(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	int ticks;
	int error;

	ticks = xen_timepush.ticks;
	node = *rnode;
	node.sysctl_data = &ticks;
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	if (ticks < 0)
		return EINVAL;

	if (ticks != xen_timepush.ticks) {
		xen_timepush.ticks = ticks;

		if (ticks == 0)
			callout_stop(&xen_timepush.ch);
		else
			callout_schedule(&xen_timepush.ch, ticks);
	}

	return 0;
}

#endif /* DOM0OPS */

static int xen_rtc_get(struct todr_chip_handle *, struct timeval *);
static int xen_rtc_set(struct todr_chip_handle *, struct timeval *);
static void xen_wallclock_time(struct timespec *);

/*
 * xen time of day register:
 *
 *	Xen wall clock time, plus a Xen vCPU system time adjustment.
 */
static struct todr_chip_handle xen_todr_chip = {
	.todr_gettime = xen_rtc_get,
	.todr_settime = xen_rtc_set,
};

/*
 * xen_startrtclock()
 *
 *	Initialize the real-time clock from x86 machdep autoconf.
 */
void
xen_startrtclock(void)
{

	todr_attach(&xen_todr_chip);
}

/*
 * xen_rtc_get(todr, tv)
 *
 *	Get the current real-time clock from the Xen wall clock time
 *	and vCPU system time adjustment.
 */
static int
xen_rtc_get(struct todr_chip_handle *todr, struct timeval *tvp)
{
	struct timespec ts;

	xen_wallclock_time(&ts);
	TIMESPEC_TO_TIMEVAL(tvp, &ts);

	return 0;
}

/*
 * xen_rtc_set(todr, tv)
 *
 *	Set the Xen wall clock time, if we can.
 */
static int
xen_rtc_set(struct todr_chip_handle *todr, struct timeval *tvp)
{
#ifdef DOM0OPS
	struct clock_ymdhms dt;
	xen_platform_op_t op;
	uint64_t systime_ns;

	if (xendomain_is_privileged()) {
		/* Convert to ymdhms and set the x86 ISA RTC. */
		clock_secs_to_ymdhms(tvp->tv_sec, &dt);
		rtc_set_ymdhms(NULL, &dt);

		/* Get the global system time so we can preserve it. */
		systime_ns = xen_global_systime_ns();

		/* Set the hypervisor wall clock time. */
		op.cmd = XENPF_settime;
		op.u.settime.secs = tvp->tv_sec;
		op.u.settime.nsecs = tvp->tv_usec * 1000;
		op.u.settime.system_time = systime_ns;
		return HYPERVISOR_platform_op(&op);
	}
#endif

	/* XXX Should this fail if not on privileged dom0? */
	return 0;
}

/*
 * xen_wallclock_time(tsp)
 *
 *	Return a snapshot of the current low-resolution wall clock
 *	time, as reported by the hypervisor, in tsp.
 */
static void
xen_wallclock_time(struct timespec *tsp)
{
	struct xen_wallclock_ticket ticket;
	uint64_t systime_ns;

	int s = splsched(); /* make sure we won't be interrupted */
	/* Read the last wall clock sample from the hypervisor. */
	do {
		xen_wallclock_enter(&ticket);
		tsp->tv_sec = HYPERVISOR_shared_info->wc_sec;
		tsp->tv_nsec = HYPERVISOR_shared_info->wc_nsec;
	} while (!xen_wallclock_exit(&ticket));

	/* Get the global system time. */
	systime_ns = xen_global_systime_ns();
	splx(s);

	/* Add the system time to the wall clock time. */
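	/*
	 * E.g. (illustrative numbers): a wall clock sample of 1000 s +
	 * 900000000 ns and a system time of 1500000000 ns yield
	 * 1002 s + 400000000 ns.
	 */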
	systime_ns += tsp->tv_nsec;
	tsp->tv_sec += systime_ns / 1000000000ull;
	tsp->tv_nsec = systime_ns % 1000000000ull;
}

#ifdef XENPV
/*
 * setstatclockrate(rate)
 *
 *	Set the statclock to run at rate, in units of ticks per second.
 *
 *	Currently Xen does not have a separate statclock, so this is a
 *	noop; instead the statclock runs in hardclock.
 */
void
setstatclockrate(int rate)
{
}
#endif /* XENPV */
