/* $NetBSD: xen_clock.c,v 1.17 2023/08/01 20:11:13 riastradh Exp $ */

/*-
 * Copyright (c) 2017, 2018 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Taylor R. Campbell.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "opt_xen.h"

#ifndef XEN_CLOCK_DEBUG
#define XEN_CLOCK_DEBUG 0
#endif

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: xen_clock.c,v 1.17 2023/08/01 20:11:13 riastradh Exp $");

#include <sys/param.h>
#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/callout.h>
#include <sys/cpu.h>
#include <sys/device.h>
#include <sys/evcnt.h>
#include <sys/intr.h>
#include <sys/kernel.h>
#include <sys/lwp.h>
#include <sys/proc.h>
#include <sys/sdt.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/timetc.h>

#include <dev/clock_subr.h>

#include <machine/cpu.h>
#include <machine/cpu_counter.h>
#include <machine/lock.h>

#include <xen/evtchn.h>
#include <xen/hypervisor.h>
#include <xen/include/public/vcpu.h>
#include <xen/xen.h>

#include <x86/rtc.h>

#define NS_PER_TICK ((uint64_t)1000000000ULL/hz)
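
/*
 * Illustrative note (assuming the common default hz = 100): NS_PER_TICK
 * then evaluates to 10000000 ns, i.e. a 10 ms hardclock period.  hz is a
 * run-time kernel variable, so the division happens at each use rather
 * than being baked in at compile time.
 */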

static uint64_t xen_vcputime_systime_ns(void);
static uint64_t xen_vcputime_raw_systime_ns(void);
static uint64_t xen_global_systime_ns(void);
static unsigned xen_get_timecount(struct timecounter *);
static int xen_timer_handler(void *, struct clockframe *);

/*
 * dtrace probes
 */
SDT_PROBE_DEFINE7(sdt, xen, clock, tsc__backward,
    "uint64_t"/*raw_systime_ns*/,
    "uint64_t"/*tsc_timestamp*/,
    "uint64_t"/*tsc_to_system_mul*/,
    "int"/*tsc_shift*/,
    "uint64_t"/*delta_ns*/,
    "uint64_t"/*tsc*/,
    "uint64_t"/*systime_ns*/);
SDT_PROBE_DEFINE7(sdt, xen, clock, tsc__delta__negative,
    "uint64_t"/*raw_systime_ns*/,
    "uint64_t"/*tsc_timestamp*/,
    "uint64_t"/*tsc_to_system_mul*/,
    "int"/*tsc_shift*/,
    "uint64_t"/*delta_ns*/,
    "uint64_t"/*tsc*/,
    "uint64_t"/*systime_ns*/);
SDT_PROBE_DEFINE7(sdt, xen, clock, systime__wraparound,
    "uint64_t"/*raw_systime_ns*/,
    "uint64_t"/*tsc_timestamp*/,
    "uint64_t"/*tsc_to_system_mul*/,
    "int"/*tsc_shift*/,
    "uint64_t"/*delta_ns*/,
    "uint64_t"/*tsc*/,
    "uint64_t"/*systime_ns*/);
SDT_PROBE_DEFINE7(sdt, xen, clock, systime__backward,
    "uint64_t"/*raw_systime_ns*/,
    "uint64_t"/*tsc_timestamp*/,
    "uint64_t"/*tsc_to_system_mul*/,
    "int"/*tsc_shift*/,
    "uint64_t"/*delta_ns*/,
    "uint64_t"/*tsc*/,
    "uint64_t"/*systime_ns*/);

SDT_PROBE_DEFINE3(sdt, xen, timecounter, backward,
    "uint64_t"/*local*/,
    "uint64_t"/*skew*/,
    "uint64_t"/*global*/);

SDT_PROBE_DEFINE2(sdt, xen, hardclock, systime__backward,
    "uint64_t"/*last_systime_ns*/,
    "uint64_t"/*this_systime_ns*/);
SDT_PROBE_DEFINE2(sdt, xen, hardclock, tick,
    "uint64_t"/*last_systime_ns*/,
    "uint64_t"/*this_systime_ns*/);
SDT_PROBE_DEFINE3(sdt, xen, hardclock, jump,
    "uint64_t"/*last_systime_ns*/,
    "uint64_t"/*this_systime_ns*/,
    "uint64_t"/*nticks*/);
SDT_PROBE_DEFINE3(sdt, xen, hardclock, missed,
    "uint64_t"/*last_systime_ns*/,
    "uint64_t"/*this_systime_ns*/,
    "uint64_t"/*remaining_ns*/);

/*
 * xen timecounter:
 *
 * Xen vCPU system time, plus an adjustment with rdtsc.
 */
static struct timecounter xen_timecounter = {
        .tc_get_timecount = xen_get_timecount,
        .tc_poll_pps = NULL,
        .tc_counter_mask = ~0U,
        .tc_frequency = 1000000000ULL, /* 1 GHz, i.e. units of nanoseconds */
        .tc_name = "xen_system_time",
        .tc_quality = 10000,
};
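
/*
 * Note: with tc_frequency = 1 GHz and a 32-bit tc_counter_mask, this
 * timecounter wraps about every 2^32 ns, i.e. roughly every 4.3 seconds,
 * so timecounter(9) requires hardclock to run at least that often;
 * xen_timer_handler warns if that contract appears to have been violated.
 */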

/*
 * xen_global_systime_ns_stamp
 *
 * The latest Xen vCPU system time that has been observed on any
 * CPU, for a global monotonic view of the Xen system time clock.
 */
static volatile uint64_t xen_global_systime_ns_stamp __cacheline_aligned;

#ifdef DOM0OPS
/*
 * xen timepush state:
 *
 * Callout to periodically, after a sysctl-configurable number of
 * NetBSD ticks, set the Xen hypervisor's wall clock time.
 */
static struct {
        struct callout ch;
        int ticks;
} xen_timepush;

static void xen_timepush_init(void);
static void xen_timepush_intr(void *);
static int sysctl_xen_timepush(SYSCTLFN_ARGS);
#endif

/*
 * xen_rdtsc()
 *
 * Read the local pCPU's tsc.
 */
static inline uint64_t
xen_rdtsc(void)
{
        uint32_t lo, hi;

        asm volatile("rdtsc" : "=a"(lo), "=d"(hi));

        return ((uint64_t)hi << 32) | lo;
}

/*
 * struct xen_vcputime_ticket
 *
 * State for a vCPU read section, during which a caller may read
 * from fields of a struct vcpu_time_info and call xen_rdtsc.
 * Caller must enter with xen_vcputime_enter, exit with
 * xen_vcputime_exit, and be prepared to retry if
 * xen_vcputime_exit fails.
 */
struct xen_vcputime_ticket {
        uint64_t version;
};

/*
 * xen_vcputime_enter(tp)
 *
 * Enter a vCPU time read section and store a ticket in *tp, which
 * the caller must use with xen_vcputime_exit. Return a pointer
 * to the current CPU's vcpu_time_info structure. Caller must
 * already be bound to the CPU.
 */
static inline volatile struct vcpu_time_info *
xen_vcputime_enter(struct xen_vcputime_ticket *tp)
{
        volatile struct vcpu_time_info *vt = &curcpu()->ci_vcpu->time;

        while (__predict_false(1 & (tp->version = vt->version)))
                SPINLOCK_BACKOFF_HOOK;

        /*
         * Must read the version before reading the tsc on the local
         * pCPU. We are racing only with interruption by the
         * hypervisor, so no need for a stronger memory barrier.
         */
        __insn_barrier();

        return vt;
}

/*
 * xen_vcputime_exit(vt, tp)
 *
 * Exit a vCPU time read section with the ticket in *tp from
 * xen_vcputime_enter. Return true on success, false if caller
 * must retry.
 */
static inline bool
xen_vcputime_exit(volatile struct vcpu_time_info *vt,
    struct xen_vcputime_ticket *tp)
{

        KASSERT(vt == &curcpu()->ci_vcpu->time);

        /*
         * Must read the tsc before re-reading the version on the local
         * pCPU. We are racing only with interruption by the
         * hypervisor, so no need for a stronger memory barrier.
         */
        __insn_barrier();

        return tp->version == vt->version;
}

/*
 * xen_tsc_to_ns_delta(delta_tsc, mul_frac, shift)
 *
 * Convert a difference in tsc units to a difference in
 * nanoseconds given a multiplier and shift for the unit
 * conversion.
 */
static inline uint64_t
xen_tsc_to_ns_delta(uint64_t delta_tsc, uint32_t tsc_to_system_mul,
    int8_t tsc_shift)
{
        uint32_t delta_tsc_hi, delta_tsc_lo;

        if (tsc_shift < 0)
                delta_tsc >>= -tsc_shift;
        else
                delta_tsc <<= tsc_shift;

        delta_tsc_hi = delta_tsc >> 32;
        delta_tsc_lo = delta_tsc & 0xffffffffUL;

        /* d*m/2^32 = (2^32 d_h + d_l)*m/2^32 = d_h*m + (d_l*m)/2^32 */
        return ((uint64_t)delta_tsc_hi * tsc_to_system_mul) +
            (((uint64_t)delta_tsc_lo * tsc_to_system_mul) >> 32);
}
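
/*
 * Worked example (illustrative values only, not taken from any particular
 * host): for a 2 GHz tsc, Xen would advertise a scale of roughly
 * tsc_shift = 0 and tsc_to_system_mul = 2^31, since
 * ns = (delta_tsc << tsc_shift) * tsc_to_system_mul / 2^32.  A delta of
 * 2000 tsc ticks then converts to 2000 * 2^31 / 2^32 = 1000 ns, i.e.
 * 0.5 ns per tsc tick.
 */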

/*
 * xen_vcputime_systime_ns()
 *
 * Return a snapshot of the Xen system time plus an adjustment
 * from the tsc, in units of nanoseconds. Caller must be bound to
 * the current CPU.
 */
static uint64_t
xen_vcputime_systime_ns(void)
{
        volatile struct vcpu_time_info *vt;
        struct cpu_info *ci = curcpu();
        struct xen_vcputime_ticket ticket;
        uint64_t raw_systime_ns, tsc_timestamp, tsc, delta_tsc, delta_ns;
        uint32_t tsc_to_system_mul;
        int8_t tsc_shift;
        uint64_t systime_ns;

        /* We'd better be bound to the CPU in _some_ way. */
        KASSERT(cpu_intr_p() || cpu_softintr_p() || kpreempt_disabled() ||
            (curlwp->l_flag & LP_BOUND));

        /*
         * Repeatedly try to read the system time, corresponding tsc
         * timestamp, and tsc frequency until we get a consistent view.
         */
        do {
                vt = xen_vcputime_enter(&ticket);

                /* Grab Xen's snapshot of raw system time and tsc. */
                raw_systime_ns = vt->system_time;
                tsc_timestamp = vt->tsc_timestamp;

                /* Get Xen's current idea of how fast the tsc is counting. */
                tsc_to_system_mul = vt->tsc_to_system_mul;
                tsc_shift = vt->tsc_shift;

                /* Read the CPU's tsc. */
                tsc = xen_rdtsc();
        } while (!xen_vcputime_exit(vt, &ticket));

        /*
         * Out of paranoia, check whether the tsc has gone backwards
         * since Xen's timestamp.
         *
         * This shouldn't happen because the Xen hypervisor is supposed
         * to have read the tsc _before_ writing to the vcpu_time_info
         * page, _before_ we read the tsc.
         *
         * Further, if we switched pCPUs after reading the tsc
         * timestamp but before reading the CPU's tsc, the hypervisor
         * had better notify us by updating the version too and forcing
         * us to retry the vCPU time read.
         */
        if (__predict_false(tsc < tsc_timestamp)) {
                /*
                 * Notify the console that the CPU's tsc appeared to
                 * run behind Xen's idea of it, and pretend it hadn't.
                 */
                SDT_PROBE7(sdt, xen, clock, tsc__backward,
                    raw_systime_ns, tsc_timestamp,
                    tsc_to_system_mul, tsc_shift, /*delta_ns*/0, tsc,
                    /*systime_ns*/raw_systime_ns);
#if XEN_CLOCK_DEBUG
                device_printf(ci->ci_dev, "xen cpu tsc %"PRIu64
                    " ran backwards from timestamp %"PRIu64
                    " by %"PRIu64"\n",
                    tsc, tsc_timestamp, tsc_timestamp - tsc);
#endif
                ci->ci_xen_cpu_tsc_backwards_evcnt.ev_count++;
                delta_ns = delta_tsc = 0;
        } else {
                /* Find how far the CPU's tsc has advanced. */
                delta_tsc = tsc - tsc_timestamp;

                /* Convert the tsc delta to a nanosecond delta. */
                delta_ns = xen_tsc_to_ns_delta(delta_tsc, tsc_to_system_mul,
                    tsc_shift);
        }

        /*
         * Notify the console if the delta computation yielded a
         * negative, and pretend it hadn't.
         *
         * This doesn't make sense but I include it out of paranoia.
         */
        if (__predict_false((int64_t)delta_ns < 0)) {
                SDT_PROBE7(sdt, xen, clock, tsc__delta__negative,
                    raw_systime_ns, tsc_timestamp,
                    tsc_to_system_mul, tsc_shift, delta_ns, tsc,
                    /*systime_ns*/raw_systime_ns);
#if XEN_CLOCK_DEBUG
                device_printf(ci->ci_dev, "xen tsc delta in ns went negative:"
                    " %"PRId64"\n", delta_ns);
#endif
                ci->ci_xen_tsc_delta_negative_evcnt.ev_count++;
                delta_ns = 0;
        }

        /*
         * Compute the TSC-adjusted system time.
         */
        systime_ns = raw_systime_ns + delta_ns;

        /*
         * Notify the console if the addition wrapped around.
         *
         * This shouldn't happen because system time should be relative
         * to a reasonable reference point, not centuries in the past.
         * (2^64 ns is approximately half a millennium.)
         */
        if (__predict_false(systime_ns < raw_systime_ns)) {
                SDT_PROBE7(sdt, xen, clock, systime__wraparound,
                    raw_systime_ns, tsc_timestamp,
                    tsc_to_system_mul, tsc_shift, delta_ns, tsc,
                    systime_ns);
#if XEN_CLOCK_DEBUG
                printf("xen raw systime + tsc delta wrapped around:"
                    " %"PRIu64" + %"PRIu64" = %"PRIu64"\n",
                    raw_systime_ns, delta_ns, systime_ns);
#endif
                ci->ci_xen_raw_systime_wraparound_evcnt.ev_count++;
        }

        /*
         * Notify the console if the TSC-adjusted Xen system time
         * appears to have gone backwards, and pretend we had gone
         * forward. This seems to happen pretty regularly under load.
         */
        if (__predict_false(ci->ci_xen_last_systime_ns > systime_ns)) {
                SDT_PROBE7(sdt, xen, clock, systime__backward,
                    raw_systime_ns, tsc_timestamp,
                    tsc_to_system_mul, tsc_shift, delta_ns, tsc,
                    systime_ns);
#if XEN_CLOCK_DEBUG
                printf("xen raw systime + tsc delta went backwards:"
                    " %"PRIu64" > %"PRIu64"\n",
                    ci->ci_xen_last_systime_ns, systime_ns);
                printf(" raw_systime_ns=%"PRIu64"\n tsc_timestamp=%"PRIu64"\n"
                    " tsc=%"PRIu64"\n tsc_to_system_mul=%"PRIu32"\n"
                    " tsc_shift=%"PRId8"\n delta_tsc=%"PRIu64"\n"
                    " delta_ns=%"PRIu64"\n",
                    raw_systime_ns, tsc_timestamp, tsc, tsc_to_system_mul,
                    tsc_shift, delta_tsc, delta_ns);
#endif
                ci->ci_xen_raw_systime_backwards_evcnt.ev_count++;
                systime_ns = ci->ci_xen_last_systime_ns + 1;
        }

        /* Remember the TSC-adjusted Xen system time. */
        ci->ci_xen_last_systime_ns = systime_ns;

        /* We had better not have migrated CPUs. */
        KASSERT(ci == curcpu());

        /* And we're done: return the TSC-adjusted systime in nanoseconds. */
        return systime_ns;
}

/*
 * xen_vcputime_raw_systime_ns()
 *
 * Return a snapshot of the current Xen system time to the
 * resolution of the Xen hypervisor tick, in units of nanoseconds.
 */
static uint64_t
xen_vcputime_raw_systime_ns(void)
{
        volatile struct vcpu_time_info *vt;
        struct xen_vcputime_ticket ticket;
        uint64_t raw_systime_ns;

        do {
                vt = xen_vcputime_enter(&ticket);
                raw_systime_ns = vt->system_time;
        } while (!xen_vcputime_exit(vt, &ticket));

        return raw_systime_ns;
}

/*
 * struct xen_wallclock_ticket
 *
 * State for a wall clock read section, during which a caller may
 * read from the wall clock fields of HYPERVISOR_shared_info.
 * Caller must enter with xen_wallclock_enter, exit with
 * xen_wallclock_exit, and be prepared to retry if
 * xen_wallclock_exit fails.
 */
struct xen_wallclock_ticket {
        uint32_t version;
};

/*
 * xen_wallclock_enter(tp)
 *
 * Enter a wall clock read section and store a ticket in *tp,
 * which the caller must use with xen_wallclock_exit.
 */
static inline void
xen_wallclock_enter(struct xen_wallclock_ticket *tp)
{

        while (__predict_false(1 & (tp->version =
                    HYPERVISOR_shared_info->wc_version)))
                SPINLOCK_BACKOFF_HOOK;

        /*
         * Must read the version from memory before reading the
         * timestamp from memory, as written potentially by another
         * pCPU.
         */
        membar_consumer();
}

/*
 * xen_wallclock_exit(tp)
 *
 * Exit a wall clock read section with the ticket in *tp from
 * xen_wallclock_enter. Return true on success, false if caller
 * must retry.
 */
static inline bool
xen_wallclock_exit(struct xen_wallclock_ticket *tp)
{

        /*
         * Must read the timestamp from memory before re-reading the
         * version from memory, as written potentially by another pCPU.
         */
        membar_consumer();

        return tp->version == HYPERVISOR_shared_info->wc_version;
}

/*
 * xen_global_systime_ns()
 *
 * Return a global monotonic view of the system time in
 * nanoseconds, computed by the per-CPU Xen raw system time plus
 * an rdtsc adjustment, and advance the view of the system time
 * for all other CPUs.
 */
static uint64_t
xen_global_systime_ns(void)
{
        struct cpu_info *ci;
        uint64_t local, global, skew, result;

        /*
         * Find the local timecount on this CPU, and make sure it does
         * not precede the latest global timecount witnessed so far by
         * any CPU. If it does, add to the local CPU's skew from the
         * fastest CPU.
         *
         * XXX Can we avoid retrying if the CAS fails?
         */
        int s = splsched(); /* make sure we won't be interrupted */
        ci = curcpu();
        do {
                local = xen_vcputime_systime_ns();
                skew = ci->ci_xen_systime_ns_skew;
                global = xen_global_systime_ns_stamp;
                if (__predict_false(local + skew < global + 1)) {
                        SDT_PROBE3(sdt, xen, timecounter, backward,
                            local, skew, global);
#if XEN_CLOCK_DEBUG
                        device_printf(ci->ci_dev,
                            "xen timecounter went backwards:"
                            " local=%"PRIu64" skew=%"PRIu64" global=%"PRIu64","
                            " adding %"PRIu64" to skew\n",
                            local, skew, global, global + 1 - (local + skew));
#endif
                        ci->ci_xen_timecounter_backwards_evcnt.ev_count++;
                        result = global + 1;
                        ci->ci_xen_systime_ns_skew += global + 1 -
                            (local + skew);
                } else {
                        result = local + skew;
                }
        } while (atomic_cas_64(&xen_global_systime_ns_stamp, global, result)
            != global);
        KASSERT(ci == curcpu());
        splx(s);

        return result;
}

/*
 * xen_get_timecount(tc)
 *
 * Return the low 32 bits of a global monotonic view of the Xen
 * system time.
 */
static unsigned
xen_get_timecount(struct timecounter *tc)
{

        KASSERT(tc == &xen_timecounter);

        return (unsigned)xen_global_systime_ns();
}

/*
 * xen_delay(n)
 *
 * Wait approximately n microseconds.
 */
void
xen_delay(unsigned n)
{
        int bound;

        /* Bind to the CPU so we don't compare tsc on different CPUs. */
        bound = curlwp_bind();

        if (curcpu()->ci_vcpu == NULL) {
                curlwp_bindx(bound);
                return;
        }

        /* Short wait (<500us) or long wait? */
        if (n < 500000) {
                /*
                 * Xen system time is not precise enough for short
                 * delays, so use the tsc instead.
                 *
                 * We work with the current tsc frequency, and figure
                 * that if it changes while we're delaying, we've
                 * probably delayed long enough -- up to 500us.
                 *
                 * We do not use cpu_frequency(ci), which uses a
                 * quantity detected at boot time, and which may have
                 * changed by now if Xen has migrated this vCPU to
                 * another pCPU.
                 *
                 * XXX How long does it take to migrate pCPUs?
                 */
                volatile struct vcpu_time_info *vt;
                struct xen_vcputime_ticket ticket;
                uint64_t tsc_start, last_tsc, tsc;
                uint32_t tsc_to_system_mul;
                int8_t tsc_shift;

                /* Get the starting tsc and tsc frequency. */
                do {
                        vt = xen_vcputime_enter(&ticket);
                        tsc_start = last_tsc = xen_rdtsc();
                        tsc_to_system_mul = vt->tsc_to_system_mul;
                        tsc_shift = vt->tsc_shift;
                } while (!xen_vcputime_exit(vt, &ticket));

                /*
                 * Wait until as many tsc ticks as there are in n
                 * microseconds have elapsed, or the tsc has gone
                 * backwards meaning we've probably migrated pCPUs.
                 */
                for (;;) {
                        tsc = xen_rdtsc();
                        if (__predict_false(tsc < last_tsc))
                                break;
                        if (xen_tsc_to_ns_delta(tsc - tsc_start,
                            tsc_to_system_mul, tsc_shift)/1000 >= n)
                                break;
                        last_tsc = tsc;
                }
        } else {
                /*
                 * Use the Xen system time for >=500us delays. From my
                 * testing, it seems to sometimes run backward by about
                 * 110us, which is not so bad.
                 */
                uint64_t n_ns = 1000*(uint64_t)n;
                uint64_t start_ns;

                /* Get the start time. */
                start_ns = xen_vcputime_raw_systime_ns();

                /* Wait until the system time has passed the end. */
                do {
                        HYPERVISOR_yield();
                } while (xen_vcputime_raw_systime_ns() - start_ns < n_ns);
        }

        /* Unbind from the CPU if we weren't already bound. */
        curlwp_bindx(bound);
}

/*
 * xen_suspendclocks(ci)
 *
 * Stop handling the Xen timer event on the CPU of ci. Caller
 * must be running on and bound to ci's CPU.
 *
 * Actually, caller must have kpreemption disabled, because that's
 * easier to assert at the moment.
 */
void
xen_suspendclocks(struct cpu_info *ci)
{
        int evtch;

        KASSERT(ci == curcpu());
        KASSERT(kpreempt_disabled());

        /*
         * Find the VIRQ_TIMER event channel and close it so new timer
         * interrupt events stop getting delivered to it.
         *
         * XXX Should this happen later? This is not the reverse order
         * of xen_resumeclocks. It is apparently necessary in this
         * order only because we don't stash evtchn anywhere, but we
         * could stash it.
         */
        evtch = unbind_virq_from_evtch(VIRQ_TIMER);
        KASSERT(evtch != -1);

        /*
         * Mask the event channel so we stop getting new interrupts on
         * it.
         */
        hypervisor_mask_event(evtch);

        /*
         * Now that we are no longer getting new interrupts, remove the
         * handler and wait for any existing calls to the handler to
         * complete. After this point, there can be no concurrent
         * calls to xen_timer_handler.
         */
        event_remove_handler(evtch,
            __FPTRCAST(int (*)(void *), xen_timer_handler), ci);

        aprint_verbose("Xen clock: removed event channel %d\n", evtch);

        /* We'd better not have switched CPUs. */
        KASSERT(ci == curcpu());
}

/*
 * xen_resumeclocks(ci)
 *
 * Start handling the Xen timer event on the CPU of ci. Arm the
 * Xen timer. Caller must be running on and bound to ci's CPU.
 *
 * Actually, caller must have kpreemption disabled, because that's
 * easier to assert at the moment.
 */
void
xen_resumeclocks(struct cpu_info *ci)
{
        char intr_xname[INTRDEVNAMEBUF];
        int evtch;
        int error __diagused;

        KASSERT(ci == curcpu());
        KASSERT(kpreempt_disabled());

        /*
         * Allocate an event channel to receive VIRQ_TIMER events.
         */
        evtch = bind_virq_to_evtch(VIRQ_TIMER);
        KASSERT(evtch != -1);

        /*
         * Set an event handler for VIRQ_TIMER events to call
         * xen_timer_handler.
         */
        snprintf(intr_xname, sizeof(intr_xname), "%s clock",
            device_xname(ci->ci_dev));
        /* XXX sketchy function pointer cast -- fix the API, please */
        if (event_set_handler(evtch,
            __FPTRCAST(int (*)(void *), xen_timer_handler),
            ci, IPL_CLOCK, NULL, intr_xname, true, ci) == NULL)
                panic("failed to establish timer interrupt handler");

        aprint_verbose("Xen %s: using event channel %d\n", intr_xname, evtch);

        /* Disarm the periodic timer on Xen>=3.1 which is allegedly buggy. */
        if (XEN_MAJOR(xen_version) > 3 || XEN_MINOR(xen_version) > 0) {
                error = HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer,
                    ci->ci_vcpuid, NULL);
                KASSERT(error == 0);
        }

        /* Pretend the last hardclock happened right now. */
        ci->ci_xen_hardclock_systime_ns = xen_vcputime_systime_ns();

        /* Arm the one-shot timer. */
        error = HYPERVISOR_set_timer_op(ci->ci_xen_hardclock_systime_ns +
            NS_PER_TICK);
        KASSERT(error == 0);

        /*
         * Ready to go. Unmask the event. After this point, Xen may
         * start calling xen_timer_handler.
         */
        hypervisor_unmask_event(evtch);

        /* We'd better not have switched CPUs. */
        KASSERT(ci == curcpu());
}

/*
 * xen_timer_handler(cookie, frame)
 *
 * Periodic Xen timer event handler for NetBSD hardclock. Calls
 * to this may get delayed, so we run hardclock as many times as
 * we need to in order to cover the Xen system time that elapsed.
 * After that, re-arm the timer to run again at the next tick.
 * The cookie is the pointer to struct cpu_info.
 */
static int
xen_timer_handler(void *cookie, struct clockframe *frame)
{
        const uint64_t ns_per_tick = NS_PER_TICK;
        struct cpu_info *ci = curcpu();
        uint64_t last, now, delta, next;
        int error;

        KASSERT(cpu_intr_p());
        KASSERT(cookie == ci);

#if defined(XENPV)
        frame = NULL; /* We use values cached in curcpu() */
#endif
again:
        /*
         * Find how many nanoseconds of Xen system time have elapsed
         * since the last hardclock tick.
         */
        last = ci->ci_xen_hardclock_systime_ns;
        now = xen_vcputime_systime_ns();
        SDT_PROBE2(sdt, xen, hardclock, tick, last, now);
        if (__predict_false(now < last)) {
                SDT_PROBE2(sdt, xen, hardclock, systime__backward,
                    last, now);
#if XEN_CLOCK_DEBUG
                device_printf(ci->ci_dev, "xen systime ran backwards"
                    " in hardclock %"PRIu64"ns\n",
                    last - now);
#endif
                ci->ci_xen_systime_backwards_hardclock_evcnt.ev_count++;
                now = last;
        }
        delta = now - last;

        /*
         * Play hardclock catchup: run the hardclock timer as many
         * times as appears necessary based on how much time has
         * passed.
         */
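        /*
         * For example (assuming hz = 100, i.e. 10 ms ticks): if 35 ms of
         * Xen system time elapsed since the last tick, the loop below runs
         * hardclock() three times and carries the remaining 5 ms of delta
         * over to the next timer event. (Illustrative numbers only.)
         */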
        if (__predict_false(delta >= 2*ns_per_tick)) {
                SDT_PROBE3(sdt, xen, hardclock, jump,
                    last, now, delta/ns_per_tick);

                /*
                 * Warn if we violate timecounter(9) contract: with a
                 * k-bit timecounter (here k = 32), and timecounter
                 * frequency f (here f = 1 GHz), the maximum period
                 * between hardclock calls is 2^k / f.
                 */
                if (delta > xen_timecounter.tc_counter_mask) {
                        printf("WARNING: hardclock skipped %"PRIu64"ns"
                            " (%"PRIu64" -> %"PRIu64"),"
                            " exceeding maximum of %"PRIu32"ns"
                            " for timecounter(9)\n",
                            delta, last, now,
                            xen_timecounter.tc_counter_mask);
                        ci->ci_xen_timecounter_jump_evcnt.ev_count++;
                }
        }
        while (delta >= ns_per_tick) {
                ci->ci_xen_hardclock_systime_ns += ns_per_tick;
                delta -= ns_per_tick;
                hardclock(frame);
                if (__predict_false(delta >= ns_per_tick)) {
                        SDT_PROBE3(sdt, xen, hardclock, missed,
                            last, now, delta);
                        ci->ci_xen_missed_hardclock_evcnt.ev_count++;
                }
        }

        /*
         * Re-arm the timer. If it fails, it's probably because the
         * time is in the past, so update our idea of what the Xen
         * system time is and try again.
         */
        next = ci->ci_xen_hardclock_systime_ns + ns_per_tick;
        error = HYPERVISOR_set_timer_op(next);
        if (error)
                goto again;

        /* Success! */
        return 0;
}

/*
 * xen_initclocks()
 *
 * Initialize the Xen clocks on the current CPU.
 */
void
xen_initclocks(void)
{
        struct cpu_info *ci = curcpu();

        /* If this is the primary CPU, do global initialization first. */
        if (ci == &cpu_info_primary) {
                /* Initialize the systemwide Xen timecounter. */
                tc_init(&xen_timecounter);
        }

        /* Attach the event counters. */
        evcnt_attach_dynamic(&ci->ci_xen_cpu_tsc_backwards_evcnt,
            EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
            "cpu tsc ran backwards");
        evcnt_attach_dynamic(&ci->ci_xen_tsc_delta_negative_evcnt,
            EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
            "tsc delta went negative");
        evcnt_attach_dynamic(&ci->ci_xen_raw_systime_wraparound_evcnt,
            EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
            "raw systime wrapped around");
        evcnt_attach_dynamic(&ci->ci_xen_raw_systime_backwards_evcnt,
            EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
            "raw systime went backwards");
        evcnt_attach_dynamic(&ci->ci_xen_systime_backwards_hardclock_evcnt,
            EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
            "systime went backwards in hardclock");
        evcnt_attach_dynamic(&ci->ci_xen_missed_hardclock_evcnt,
            EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
            "missed hardclock");
        evcnt_attach_dynamic(&ci->ci_xen_timecounter_backwards_evcnt,
            EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
            "timecounter went backwards");
        evcnt_attach_dynamic(&ci->ci_xen_timecounter_jump_evcnt,
            EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
            "hardclock jumped past timecounter max");

        /* Fire up the clocks. */
        xen_resumeclocks(ci);

#ifdef DOM0OPS
        /*
         * If this is a privileged dom0, start pushing the wall
         * clock time back to the Xen hypervisor.
         */
        if (ci == &cpu_info_primary && xendomain_is_privileged())
                xen_timepush_init();
#endif
}

#ifdef DOM0OPS

/*
 * xen_timepush_init()
 *
 * Initialize callout to periodically set Xen hypervisor's wall
 * clock time.
 */
static void
xen_timepush_init(void)
{
        struct sysctllog *log = NULL;
        const struct sysctlnode *node = NULL;
        int error;

        /* Start periodically updating the hypervisor's wall clock time. */
        callout_init(&xen_timepush.ch, 0);
        callout_setfunc(&xen_timepush.ch, xen_timepush_intr, NULL);

        /* Pick a default frequency for timepush. */
        xen_timepush.ticks = 53*hz + 3; /* avoid exact # of min/sec */
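        /* (E.g. with hz = 100 that is 5303 ticks, roughly every 53 seconds.) */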

        /* Create machdep.xen node. */
        /* XXX Creation of the `machdep.xen' node should be elsewhere. */
        error = sysctl_createv(&log, 0, NULL, &node, 0,
            CTLTYPE_NODE, "xen",
            SYSCTL_DESCR("Xen top level node"),
            NULL, 0, NULL, 0,
            CTL_MACHDEP, CTL_CREATE, CTL_EOL);
        if (error)
                goto fail;
        KASSERT(node != NULL);

        /* Create int machdep.xen.timepush_ticks knob. */
        error = sysctl_createv(&log, 0, NULL, NULL, CTLFLAG_READWRITE,
            CTLTYPE_INT, "timepush_ticks",
            SYSCTL_DESCR("How often to update the hypervisor's time-of-day;"
                " 0 to disable"),
            sysctl_xen_timepush, 0, &xen_timepush.ticks, 0,
            CTL_CREATE, CTL_EOL);
        if (error)
                goto fail;

        /* Start the timepush callout. */
        callout_schedule(&xen_timepush.ch, xen_timepush.ticks);

        /* Success! */
        return;

fail:   sysctl_teardown(&log);
}

/*
 * xen_timepush_intr(cookie)
 *
 * Callout interrupt handler to push NetBSD's idea of the wall
 * clock time, usually synchronized with NTP, back to the Xen
 * hypervisor.
 */
static void
xen_timepush_intr(void *cookie)
{

        resettodr();
        if (xen_timepush.ticks)
                callout_schedule(&xen_timepush.ch, xen_timepush.ticks);
}

/*
 * sysctl_xen_timepush(...)
 *
 * Sysctl handler to set machdep.xen.timepush_ticks.
 */
static int
sysctl_xen_timepush(SYSCTLFN_ARGS)
{
        struct sysctlnode node;
        int ticks;
        int error;

        ticks = xen_timepush.ticks;
        node = *rnode;
        node.sysctl_data = &ticks;
        error = sysctl_lookup(SYSCTLFN_CALL(&node));
        if (error || newp == NULL)
                return error;

        if (ticks < 0)
                return EINVAL;

        if (ticks != xen_timepush.ticks) {
                xen_timepush.ticks = ticks;

                if (ticks == 0)
                        callout_stop(&xen_timepush.ch);
                else
                        callout_schedule(&xen_timepush.ch, ticks);
        }

        return 0;
}

#endif /* DOM0OPS */

static int xen_rtc_get(struct todr_chip_handle *, struct timeval *);
static int xen_rtc_set(struct todr_chip_handle *, struct timeval *);
static void xen_wallclock_time(struct timespec *);
/*
 * xen time of day register:
 *
 * Xen wall clock time, plus a Xen vCPU system time adjustment.
 */
static struct todr_chip_handle xen_todr_chip = {
        .todr_gettime = xen_rtc_get,
        .todr_settime = xen_rtc_set,
};

/*
 * xen_startrtclock()
 *
 * Initialize the real-time clock from x86 machdep autoconf.
 */
void
xen_startrtclock(void)
{

        todr_attach(&xen_todr_chip);
}

/*
 * xen_rtc_get(todr, tv)
 *
 * Get the current real-time clock from the Xen wall clock time
 * and vCPU system time adjustment.
 */
static int
xen_rtc_get(struct todr_chip_handle *todr, struct timeval *tvp)
{
        struct timespec ts;

        xen_wallclock_time(&ts);
        TIMESPEC_TO_TIMEVAL(tvp, &ts);

        return 0;
}

/*
 * xen_rtc_set(todr, tv)
 *
 * Set the Xen wall clock time, if we can.
 */
static int
xen_rtc_set(struct todr_chip_handle *todr, struct timeval *tvp)
{
#ifdef DOM0OPS
        struct clock_ymdhms dt;
        xen_platform_op_t op;
        uint64_t systime_ns;

        if (xendomain_is_privileged()) {
                /* Convert to ymdhms and set the x86 ISA RTC. */
                clock_secs_to_ymdhms(tvp->tv_sec, &dt);
                rtc_set_ymdhms(NULL, &dt);

                /* Get the global system time so we can preserve it. */
                systime_ns = xen_global_systime_ns();

                /* Set the hypervisor wall clock time. */
                op.cmd = XENPF_settime;
                op.u.settime.secs = tvp->tv_sec;
                op.u.settime.nsecs = tvp->tv_usec * 1000;
                op.u.settime.system_time = systime_ns;
                return HYPERVISOR_platform_op(&op);
        }
#endif

        /* XXX Should this fail if not on privileged dom0? */
        return 0;
}

/*
 * xen_wallclock_time(tsp)
 *
 * Return a snapshot of the current low-resolution wall clock
 * time, as reported by the hypervisor, in tsp.
 */
static void
xen_wallclock_time(struct timespec *tsp)
{
        struct xen_wallclock_ticket ticket;
        uint64_t systime_ns;

        int s = splsched(); /* make sure we won't be interrupted */
        /* Read the last wall clock sample from the hypervisor. */
        do {
                xen_wallclock_enter(&ticket);
                tsp->tv_sec = HYPERVISOR_shared_info->wc_sec;
                tsp->tv_nsec = HYPERVISOR_shared_info->wc_nsec;
        } while (!xen_wallclock_exit(&ticket));

        /* Get the global system time. */
        systime_ns = xen_global_systime_ns();
        splx(s);

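        /*
         * The hypervisor's wc_sec/wc_nsec pair is (at least nominally) the
         * wall clock time corresponding to Xen system time zero, so adding
         * the current system time below yields the present wall clock time.
         */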
        /* Add the system time to the wall clock time. */
        systime_ns += tsp->tv_nsec;
        tsp->tv_sec += systime_ns / 1000000000ull;
        tsp->tv_nsec = systime_ns % 1000000000ull;
}

#ifdef XENPV
/*
 * setstatclockrate(rate)
 *
 * Set the statclock to run at rate, in units of ticks per second.
 *
 * Currently Xen does not have a separate statclock, so this is a
 * no-op; instead the statclock runs in hardclock.
 */
void
setstatclockrate(int rate)
{
}
#endif /* XENPV */