1 /* $NetBSD: xen_clock.c,v 1.23 2025/08/18 20:59:54 andvar Exp $ */ 2 3 /*- 4 * Copyright (c) 2017, 2018 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Taylor R. Campbell. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 #include "opt_xen.h" 33 34 #ifndef XEN_CLOCK_DEBUG 35 #define XEN_CLOCK_DEBUG 0 36 #endif 37 38 #include <sys/cdefs.h> 39 __KERNEL_RCSID(0, "$NetBSD: xen_clock.c,v 1.23 2025/08/18 20:59:54 andvar Exp $"); 40 41 #include <sys/param.h> 42 #include <sys/types.h> 43 #include <sys/atomic.h> 44 #include <sys/callout.h> 45 #include <sys/cpu.h> 46 #include <sys/device.h> 47 #include <sys/evcnt.h> 48 #include <sys/intr.h> 49 #include <sys/kernel.h> 50 #include <sys/lwp.h> 51 #include <sys/proc.h> 52 #include <sys/sdt.h> 53 #include <sys/sysctl.h> 54 #include <sys/systm.h> 55 #include <sys/time.h> 56 #include <sys/timetc.h> 57 58 #include <dev/clock_subr.h> 59 60 #include <machine/cpu.h> 61 #include <machine/cpu_counter.h> 62 #include <machine/lock.h> 63 64 #include <xen/evtchn.h> 65 #include <xen/hypervisor.h> 66 #include <xen/include/public/vcpu.h> 67 #include <xen/xen.h> 68 69 #include <x86/rtc.h> 70 71 #define NS_PER_TICK ((uint64_t)1000000000ULL/hz) 72 73 static uint64_t xen_vcputime_systime_ns(void); 74 static uint64_t xen_vcputime_raw_systime_ns(void); 75 static uint64_t xen_global_systime_ns(void); 76 static unsigned xen_get_timecount(struct timecounter *); 77 static int xen_timer_handler(void *, struct clockframe *); 78 79 /* 80 * dtrace probes 81 */ 82 SDT_PROBE_DEFINE7(sdt, xen, clock, tsc__backward, 83 "uint64_t"/*raw_systime_ns*/, 84 "uint64_t"/*tsc_timestamp*/, 85 "uint64_t"/*tsc_to_system_mul*/, 86 "int"/*tsc_shift*/, 87 "uint64_t"/*delta_ns*/, 88 "uint64_t"/*tsc*/, 89 "uint64_t"/*systime_ns*/); 90 SDT_PROBE_DEFINE7(sdt, xen, clock, tsc__delta__negative, 91 "uint64_t"/*raw_systime_ns*/, 92 "uint64_t"/*tsc_timestamp*/, 93 "uint64_t"/*tsc_to_system_mul*/, 94 "int"/*tsc_shift*/, 95 "uint64_t"/*delta_ns*/, 96 "uint64_t"/*tsc*/, 97 "uint64_t"/*systime_ns*/); 98 SDT_PROBE_DEFINE7(sdt, xen, clock, systime__wraparound, 99 "uint64_t"/*raw_systime_ns*/, 100 "uint64_t"/*tsc_timestamp*/, 101 "uint64_t"/*tsc_to_system_mul*/, 102 "int"/*tsc_shift*/, 103 "uint64_t"/*delta_ns*/, 104 "uint64_t"/*tsc*/, 105 "uint64_t"/*systime_ns*/); 106 SDT_PROBE_DEFINE7(sdt, xen, clock, systime__backward, 107 "uint64_t"/*raw_systime_ns*/, 108 "uint64_t"/*tsc_timestamp*/, 109 "uint64_t"/*tsc_to_system_mul*/, 110 "int"/*tsc_shift*/, 111 "uint64_t"/*delta_ns*/, 112 "uint64_t"/*tsc*/, 113 "uint64_t"/*systime_ns*/); 114 115 SDT_PROBE_DEFINE3(sdt, xen, timecounter, backward, 116 "uint64_t"/*local*/, 117 "uint64_t"/*skew*/, 118 "uint64_t"/*global*/); 119 120 SDT_PROBE_DEFINE2(sdt, xen, hardclock, systime__backward, 121 "uint64_t"/*last_systime_ns*/, 122 "uint64_t"/*this_systime_ns*/); 123 SDT_PROBE_DEFINE2(sdt, xen, hardclock, tick, 124 "uint64_t"/*last_systime_ns*/, 125 "uint64_t"/*this_systime_ns*/); 126 SDT_PROBE_DEFINE3(sdt, xen, hardclock, jump, 127 "uint64_t"/*last_systime_ns*/, 128 "uint64_t"/*this_systime_ns*/, 129 "uint64_t"/*nticks*/); 130 SDT_PROBE_DEFINE3(sdt, xen, hardclock, missed, 131 "uint64_t"/*last_systime_ns*/, 132 "uint64_t"/*this_systime_ns*/, 133 "uint64_t"/*remaining_ns*/); 134 135 /* 136 * xen timecounter: 137 * 138 * Xen vCPU system time, plus an adjustment with rdtsc. 139 */ 140 static struct timecounter xen_timecounter = { 141 .tc_get_timecount = xen_get_timecount, 142 .tc_poll_pps = NULL, 143 .tc_counter_mask = ~0U, 144 .tc_frequency = 1000000000ULL, /* 1 GHz, i.e. units of nanoseconds */ 145 .tc_name = "xen_system_time", 146 .tc_quality = 10000, 147 }; 148 149 /* 150 * xen_global_systime_ns_stamp 151 * 152 * The latest Xen vCPU system time that has been observed on any 153 * CPU, for a global monotonic view of the Xen system time clock. 154 */ 155 static volatile uint64_t xen_global_systime_ns_stamp __cacheline_aligned; 156 157 #ifdef DOM0OPS 158 /* 159 * xen timepush state: 160 * 161 * Callout to periodically, after a sysctl-configurable number of 162 * NetBSD ticks, set the Xen hypervisor's wall clock time. 163 */ 164 static struct { 165 struct callout ch; 166 int ticks; 167 } xen_timepush; 168 169 static void xen_timepush_init(void); 170 static void xen_timepush_intr(void *); 171 static int sysctl_xen_timepush(SYSCTLFN_ARGS); 172 #endif 173 174 /* 175 * struct xen_vcputime_ticket 176 * 177 * State for a vCPU read section, during which a caller may read 178 * from fields of a struct vcpu_time_info and call rdtsc. 179 * 180 * Caller must enter with xen_vcputime_enter, exit with 181 * xen_vcputime_exit, and be prepared to retry if 182 * xen_vcputime_exit fails. 183 */ 184 struct xen_vcputime_ticket { 185 uint64_t version; 186 }; 187 188 /* 189 * xen_vcputime_enter(tp) 190 * 191 * Enter a vCPU time read section and store a ticket in *tp, which 192 * the caller must use with xen_vcputime_exit. Return a pointer 193 * to the current CPU's vcpu_time_info structure. Caller must 194 * already be bound to the CPU. 195 */ 196 static inline volatile struct vcpu_time_info * 197 xen_vcputime_enter(struct xen_vcputime_ticket *tp) 198 { 199 volatile struct vcpu_time_info *vt = &curcpu()->ci_vcpu->time; 200 201 while (__predict_false(1 & (tp->version = vt->version))) 202 SPINLOCK_BACKOFF_HOOK; 203 204 /* 205 * Must read the version before reading the tsc on the local 206 * pCPU. We are racing only with interruption by the 207 * hypervisor, so no need for a stronger memory barrier. 208 */ 209 __insn_barrier(); 210 211 return vt; 212 } 213 214 /* 215 * xen_vcputime_exit(vt, tp) 216 * 217 * Exit a vCPU time read section with the ticket in *tp from 218 * xen_vcputime_enter. Return true on success, false if caller 219 * must retry. 220 */ 221 static inline bool 222 xen_vcputime_exit(volatile struct vcpu_time_info *vt, 223 struct xen_vcputime_ticket *tp) 224 { 225 226 KASSERT(vt == &curcpu()->ci_vcpu->time); 227 228 /* 229 * Must read the tsc before re-reading the version on the local 230 * pCPU. We are racing only with interruption by the 231 * hypervisor, so no need for a stronger memory barrier. 232 */ 233 __insn_barrier(); 234 235 return tp->version == vt->version; 236 } 237 238 /* 239 * xen_tsc_to_ns_delta(delta_tsc, mul_frac, shift) 240 * 241 * Convert a difference in tsc units to a difference in 242 * nanoseconds given a multiplier and shift for the unit 243 * conversion. 244 */ 245 static inline uint64_t 246 xen_tsc_to_ns_delta(uint64_t delta_tsc, uint32_t tsc_to_system_mul, 247 int8_t tsc_shift) 248 { 249 uint32_t delta_tsc_hi, delta_tsc_lo; 250 251 if (tsc_shift < 0) 252 delta_tsc >>= -tsc_shift; 253 else 254 delta_tsc <<= tsc_shift; 255 256 delta_tsc_hi = delta_tsc >> 32; 257 delta_tsc_lo = delta_tsc & 0xffffffffUL; 258 259 /* d*m/2^32 = (2^32 d_h + d_l)*m/2^32 = d_h*m + (d_l*m)/2^32 */ 260 return ((uint64_t)delta_tsc_hi * tsc_to_system_mul) + 261 (((uint64_t)delta_tsc_lo * tsc_to_system_mul) >> 32); 262 } 263 264 /* 265 * xen_vcputime_systime_ns() 266 * 267 * Return a snapshot of the Xen system time plus an adjustment 268 * from the tsc, in units of nanoseconds. Caller must be bound to 269 * the current CPU. 270 */ 271 static uint64_t 272 xen_vcputime_systime_ns(void) 273 { 274 volatile struct vcpu_time_info *vt; 275 struct cpu_info *ci = curcpu(); 276 struct xen_vcputime_ticket ticket; 277 uint64_t raw_systime_ns, tsc_timestamp, tsc, delta_tsc, delta_ns; 278 uint32_t tsc_to_system_mul; 279 int8_t tsc_shift; 280 uint64_t systime_ns; 281 282 /* We'd better be bound to the CPU in _some_ way. */ 283 KASSERT(cpu_intr_p() || cpu_softintr_p() || kpreempt_disabled() || 284 (curlwp->l_flag & LP_BOUND)); 285 286 /* 287 * Repeatedly try to read the system time, corresponding tsc 288 * timestamp, and tsc frequency until we get a consistent view. 289 */ 290 do { 291 vt = xen_vcputime_enter(&ticket); 292 293 /* Grab Xen's snapshot of raw system time and tsc. */ 294 raw_systime_ns = vt->system_time; 295 tsc_timestamp = vt->tsc_timestamp; 296 297 /* Get Xen's current idea of how fast the tsc is counting. */ 298 tsc_to_system_mul = vt->tsc_to_system_mul; 299 tsc_shift = vt->tsc_shift; 300 301 /* Read the CPU's tsc. */ 302 tsc = rdtsc(); 303 } while (!xen_vcputime_exit(vt, &ticket)); 304 305 /* 306 * Out of paranoia, check whether the tsc has gone backwards 307 * since Xen's timestamp. 308 * 309 * This shouldn't happen because the Xen hypervisor is supposed 310 * to have read the tsc _before_ writing to the vcpu_time_info 311 * page, _before_ we read the tsc. 312 * 313 * Further, if we switched pCPUs after reading the tsc 314 * timestamp but before reading the CPU's tsc, the hypervisor 315 * had better notify us by updating the version too and forcing 316 * us to retry the vCPU time read. 317 */ 318 if (__predict_false(tsc < tsc_timestamp)) { 319 /* 320 * Notify the console that the CPU's tsc appeared to 321 * run behind Xen's idea of it, and pretend it hadn't. 322 */ 323 SDT_PROBE7(sdt, xen, clock, tsc__backward, 324 raw_systime_ns, tsc_timestamp, 325 tsc_to_system_mul, tsc_shift, /*delta_ns*/0, tsc, 326 /*systime_ns*/raw_systime_ns); 327 #if XEN_CLOCK_DEBUG 328 device_printf(ci->ci_dev, "xen cpu tsc %"PRIu64 329 " ran backwards from timestamp %"PRIu64 330 " by %"PRIu64"\n", 331 tsc, tsc_timestamp, tsc_timestamp - tsc); 332 #endif 333 ci->ci_xen_cpu_tsc_backwards_evcnt.ev_count++; 334 delta_ns = delta_tsc = 0; 335 } else { 336 /* Find how far the CPU's tsc has advanced. */ 337 delta_tsc = tsc - tsc_timestamp; 338 339 /* Convert the tsc delta to a nanosecond delta. */ 340 delta_ns = xen_tsc_to_ns_delta(delta_tsc, tsc_to_system_mul, 341 tsc_shift); 342 } 343 344 /* 345 * Notify the console if the delta computation yielded a 346 * negative, and pretend it hadn't. 347 * 348 * This doesn't make sense but I include it out of paranoia. 349 */ 350 if (__predict_false((int64_t)delta_ns < 0)) { 351 SDT_PROBE7(sdt, xen, clock, tsc__delta__negative, 352 raw_systime_ns, tsc_timestamp, 353 tsc_to_system_mul, tsc_shift, delta_ns, tsc, 354 /*systime_ns*/raw_systime_ns); 355 #if XEN_CLOCK_DEBUG 356 device_printf(ci->ci_dev, "xen tsc delta in ns went negative:" 357 " %"PRId64"\n", delta_ns); 358 #endif 359 ci->ci_xen_tsc_delta_negative_evcnt.ev_count++; 360 delta_ns = 0; 361 } 362 363 /* 364 * Compute the TSC-adjusted system time. 365 */ 366 systime_ns = raw_systime_ns + delta_ns; 367 368 /* 369 * Notify the console if the addition wrapped around. 370 * 371 * This shouldn't happen because system time should be relative 372 * to a reasonable reference point, not centuries in the past. 373 * (2^64 ns is approximately half a millennium.) 374 */ 375 if (__predict_false(systime_ns < raw_systime_ns)) { 376 SDT_PROBE7(sdt, xen, clock, systime__wraparound, 377 raw_systime_ns, tsc_timestamp, 378 tsc_to_system_mul, tsc_shift, delta_ns, tsc, 379 systime_ns); 380 #if XEN_CLOCK_DEBUG 381 printf("xen raw systime + tsc delta wrapped around:" 382 " %"PRIu64" + %"PRIu64" = %"PRIu64"\n", 383 raw_systime_ns, delta_ns, systime_ns); 384 #endif 385 ci->ci_xen_raw_systime_wraparound_evcnt.ev_count++; 386 } 387 388 /* 389 * Notify the console if the TSC-adjusted Xen system time 390 * appears to have gone backwards, and pretend we had gone 391 * forward. This seems to happen pretty regularly under load. 392 */ 393 if (__predict_false(ci->ci_xen_last_systime_ns > systime_ns)) { 394 SDT_PROBE7(sdt, xen, clock, systime__backward, 395 raw_systime_ns, tsc_timestamp, 396 tsc_to_system_mul, tsc_shift, delta_ns, tsc, 397 systime_ns); 398 #if XEN_CLOCK_DEBUG 399 printf("xen raw systime + tsc delta went backwards:" 400 " %"PRIu64" > %"PRIu64"\n", 401 ci->ci_xen_last_systime_ns, systime_ns); 402 printf(" raw_systime_ns=%"PRIu64"\n tsc_timestamp=%"PRIu64"\n" 403 " tsc=%"PRIu64"\n tsc_to_system_mul=%"PRIu32"\n" 404 " tsc_shift=%"PRId8"\n delta_tsc=%"PRIu64"\n" 405 " delta_ns=%"PRIu64"\n", 406 raw_systime_ns, tsc_timestamp, tsc, tsc_to_system_mul, 407 tsc_shift, delta_tsc, delta_ns); 408 #endif 409 ci->ci_xen_raw_systime_backwards_evcnt.ev_count++; 410 systime_ns = ci->ci_xen_last_systime_ns + 1; 411 } 412 413 /* Remember the TSC-adjusted Xen system time. */ 414 ci->ci_xen_last_systime_ns = systime_ns; 415 416 /* We had better not have migrated CPUs. */ 417 KASSERT(ci == curcpu()); 418 419 /* And we're done: return the TSC-adjusted systime in nanoseconds. */ 420 return systime_ns; 421 } 422 423 /* 424 * xen_vcputime_raw_systime_ns() 425 * 426 * Return a snapshot of the current Xen system time to the 427 * resolution of the Xen hypervisor tick, in units of nanoseconds. 428 */ 429 static uint64_t 430 xen_vcputime_raw_systime_ns(void) 431 { 432 volatile struct vcpu_time_info *vt; 433 struct xen_vcputime_ticket ticket; 434 uint64_t raw_systime_ns; 435 436 do { 437 vt = xen_vcputime_enter(&ticket); 438 raw_systime_ns = vt->system_time; 439 } while (!xen_vcputime_exit(vt, &ticket)); 440 441 return raw_systime_ns; 442 } 443 444 /* 445 * struct xen_wallclock_ticket 446 * 447 * State for a wall clock read section, during which a caller may 448 * read from the wall clock fields of HYPERVISOR_shared_info. 449 * Caller must enter with xen_wallclock_enter, exit with 450 * xen_wallclock_exit, and be prepared to retry if 451 * xen_wallclock_exit fails. 452 */ 453 struct xen_wallclock_ticket { 454 uint32_t version; 455 }; 456 457 /* 458 * xen_wallclock_enter(tp) 459 * 460 * Enter a wall clock read section and store a ticket in *tp, 461 * which the caller must use with xen_wallclock_exit. 462 */ 463 static inline void 464 xen_wallclock_enter(struct xen_wallclock_ticket *tp) 465 { 466 467 while (__predict_false(1 & (tp->version = 468 HYPERVISOR_shared_info->wc_version))) 469 SPINLOCK_BACKOFF_HOOK; 470 471 /* 472 * Must read the version from memory before reading the 473 * timestamp from memory, as written potentially by another 474 * pCPU. 475 */ 476 membar_consumer(); 477 } 478 479 /* 480 * xen_wallclock_exit(tp) 481 * 482 * Exit a wall clock read section with the ticket in *tp from 483 * xen_wallclock_enter. Return true on success, false if caller 484 * must retry. 485 */ 486 static inline bool 487 xen_wallclock_exit(struct xen_wallclock_ticket *tp) 488 { 489 490 /* 491 * Must read the timestamp from memory before re-reading the 492 * version from memory, as written potentially by another pCPU. 493 */ 494 membar_consumer(); 495 496 return tp->version == HYPERVISOR_shared_info->wc_version; 497 } 498 499 /* 500 * xen_global_systime_ns() 501 * 502 * Return a global monotonic view of the system time in 503 * nanoseconds, computed by the per-CPU Xen raw system time plus 504 * an rdtsc adjustment, and advance the view of the system time 505 * for all other CPUs. 506 */ 507 static uint64_t 508 xen_global_systime_ns(void) 509 { 510 struct cpu_info *ci; 511 uint64_t local, global, skew, result; 512 513 /* 514 * Find the local timecount on this CPU, and make sure it does 515 * not precede the latest global timecount witnessed so far by 516 * any CPU. If it does, add to the local CPU's skew from the 517 * fastest CPU. 518 * 519 * XXX Can we avoid retrying if the CAS fails? 520 */ 521 int s = splsched(); /* make sure we won't be interrupted */ 522 ci = curcpu(); 523 do { 524 local = xen_vcputime_systime_ns(); 525 skew = ci->ci_xen_systime_ns_skew; 526 global = xen_global_systime_ns_stamp; 527 if (__predict_false(local + skew < global + 1)) { 528 SDT_PROBE3(sdt, xen, timecounter, backward, 529 local, skew, global); 530 #if XEN_CLOCK_DEBUG 531 device_printf(ci->ci_dev, 532 "xen timecounter went backwards:" 533 " local=%"PRIu64" skew=%"PRIu64" global=%"PRIu64"," 534 " adding %"PRIu64" to skew\n", 535 local, skew, global, global + 1 - (local + skew)); 536 #endif 537 ci->ci_xen_timecounter_backwards_evcnt.ev_count++; 538 result = global + 1; 539 ci->ci_xen_systime_ns_skew += global + 1 - 540 (local + skew); 541 } else { 542 result = local + skew; 543 } 544 } while (atomic_cas_64(&xen_global_systime_ns_stamp, global, result) 545 != global); 546 KASSERT(ci == curcpu()); 547 splx(s); 548 549 return result; 550 } 551 552 /* 553 * xen_get_timecount(tc) 554 * 555 * Return the low 32 bits of a global monotonic view of the Xen 556 * system time. 557 */ 558 static unsigned 559 xen_get_timecount(struct timecounter *tc) 560 { 561 562 KASSERT(tc == &xen_timecounter); 563 564 return (unsigned)xen_global_systime_ns(); 565 } 566 567 /* 568 * xen_delay(n) 569 * 570 * Wait approximately n microseconds. 571 */ 572 void 573 xen_delay(unsigned n) 574 { 575 int bound; 576 577 /* Bind to the CPU so we don't compare tsc on different CPUs. */ 578 bound = curlwp_bind(); 579 580 if (curcpu()->ci_vcpu == NULL) { 581 curlwp_bindx(bound); 582 return; 583 } 584 585 /* Short wait (<500us) or long wait? */ 586 if (n < 500000) { 587 /* 588 * Xen system time is not precise enough for short 589 * delays, so use the tsc instead. 590 * 591 * We work with the current tsc frequency, and figure 592 * that if it changes while we're delaying, we've 593 * probably delayed long enough -- up to 500us. 594 * 595 * We do not use cpu_frequency(ci), which uses a 596 * quantity detected at boot time, and which may have 597 * changed by now if Xen has migrated this vCPU to 598 * another pCPU. 599 * 600 * XXX How long does it take to migrate pCPUs? 601 */ 602 volatile struct vcpu_time_info *vt; 603 struct xen_vcputime_ticket ticket; 604 uint64_t tsc_start, last_tsc, tsc; 605 uint32_t tsc_to_system_mul; 606 int8_t tsc_shift; 607 608 /* Get the starting tsc and tsc frequency. */ 609 do { 610 vt = xen_vcputime_enter(&ticket); 611 tsc_start = last_tsc = rdtsc(); 612 tsc_to_system_mul = vt->tsc_to_system_mul; 613 tsc_shift = vt->tsc_shift; 614 } while (!xen_vcputime_exit(vt, &ticket)); 615 616 /* 617 * Wait until as many tsc ticks as there are in n 618 * microseconds have elapsed, or the tsc has gone 619 * backwards meaning we've probably migrated pCPUs. 620 */ 621 for (;;) { 622 tsc = rdtsc(); 623 if (__predict_false(tsc < last_tsc)) 624 break; 625 if (xen_tsc_to_ns_delta(tsc - tsc_start, 626 tsc_to_system_mul, tsc_shift)/1000 >= n) 627 break; 628 last_tsc = tsc; 629 } 630 } else { 631 /* 632 * Use the Xen system time for >=500us delays. From my 633 * testing, it seems to sometimes run backward by about 634 * 110us, which is not so bad. 635 */ 636 uint64_t n_ns = 1000*(uint64_t)n; 637 uint64_t start_ns; 638 639 /* Get the start time. */ 640 start_ns = xen_vcputime_raw_systime_ns(); 641 642 /* Wait until the system time has passed the end. */ 643 do { 644 HYPERVISOR_yield(); 645 } while (xen_vcputime_raw_systime_ns() - start_ns < n_ns); 646 } 647 648 /* Unbind from the CPU if we weren't already bound. */ 649 curlwp_bindx(bound); 650 } 651 652 /* 653 * xen_suspendclocks(ci) 654 * 655 * Stop handling the Xen timer event on the CPU of ci. Caller 656 * must be running on and bound to ci's CPU. 657 * 658 * Actually, caller must have kpreemption disabled, because that's 659 * easier to assert at the moment. 660 */ 661 void 662 xen_suspendclocks(struct cpu_info *ci) 663 { 664 int evtch; 665 666 KASSERT(ci == curcpu()); 667 KASSERT(kpreempt_disabled()); 668 669 /* 670 * Find the VIRQ_TIMER event channel and close it so new timer 671 * interrupt events stop getting delivered to it. 672 * 673 * XXX Should this happen later? This is not the reverse order 674 * of xen_resumeclocks. It is apparently necessary in this 675 * order only because we don't stash evtchn anywhere, but we 676 * could stash it. 677 */ 678 evtch = unbind_virq_from_evtch(VIRQ_TIMER); 679 KASSERT(evtch != -1); 680 681 /* 682 * Mask the event channel so we stop getting new interrupts on 683 * it. 684 */ 685 hypervisor_mask_event(evtch); 686 687 /* 688 * Now that we are no longer getting new interrupts, remove the 689 * handler and wait for any existing calls to the handler to 690 * complete. After this point, there can be no concurrent 691 * calls to xen_timer_handler. 692 */ 693 event_remove_handler(evtch, 694 __FPTRCAST(int (*)(void *), xen_timer_handler), ci); 695 696 aprint_verbose("Xen clock: removed event channel %d\n", evtch); 697 698 /* We'd better not have switched CPUs. */ 699 KASSERT(ci == curcpu()); 700 } 701 702 /* 703 * xen_resumeclocks(ci) 704 * 705 * Start handling the Xen timer event on the CPU of ci. Arm the 706 * Xen timer. Caller must be running on and bound to ci's CPU. 707 * 708 * Actually, caller must have kpreemption disabled, because that's 709 * easier to assert at the moment. 710 */ 711 void 712 xen_resumeclocks(struct cpu_info *ci) 713 { 714 char intr_xname[INTRDEVNAMEBUF]; 715 int evtch; 716 int error __diagused; 717 718 KASSERT(ci == curcpu()); 719 KASSERT(kpreempt_disabled()); 720 721 /* 722 * Allocate an event channel to receive VIRQ_TIMER events. 723 */ 724 evtch = bind_virq_to_evtch(VIRQ_TIMER); 725 KASSERT(evtch != -1); 726 727 /* 728 * Set an event handler for VIRQ_TIMER events to call 729 * xen_timer_handler. 730 */ 731 snprintf(intr_xname, sizeof(intr_xname), "%s clock", 732 device_xname(ci->ci_dev)); 733 /* XXX sketchy function pointer cast -- fix the API, please */ 734 if (event_set_handler(evtch, 735 __FPTRCAST(int (*)(void *), xen_timer_handler), 736 ci, IPL_CLOCK, NULL, intr_xname, true, ci) == NULL) 737 panic("failed to establish timer interrupt handler"); 738 739 aprint_verbose("Xen %s: using event channel %d\n", intr_xname, evtch); 740 741 /* Disarm the periodic timer on Xen>=3.1 which is allegedly buggy. */ 742 if (XEN_MAJOR(xen_version) > 3 || XEN_MINOR(xen_version) > 0) { 743 error = HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, 744 ci->ci_vcpuid, NULL); 745 KASSERT(error == 0); 746 } 747 748 /* Pretend the last hardclock happened right now. */ 749 ci->ci_xen_hardclock_systime_ns = xen_vcputime_systime_ns(); 750 751 /* Arm the one-shot timer. */ 752 error = HYPERVISOR_set_timer_op(ci->ci_xen_hardclock_systime_ns + 753 NS_PER_TICK); 754 KASSERT(error == 0); 755 756 /* 757 * Ready to go. Unmask the event. After this point, Xen may 758 * start calling xen_timer_handler. 759 */ 760 hypervisor_unmask_event(evtch); 761 762 /* We'd better not have switched CPUs. */ 763 KASSERT(ci == curcpu()); 764 } 765 766 /* 767 * xen_timer_handler(cookie, frame) 768 * 769 * Periodic Xen timer event handler for NetBSD hardclock. Calls 770 * to this may get delayed, so we run hardclock as many times as 771 * we need to in order to cover the Xen system time that elapsed. 772 * After that, re-arm the timer to run again at the next tick. 773 * The cookie is the pointer to struct cpu_info. 774 */ 775 static int 776 xen_timer_handler(void *cookie, struct clockframe *frame) 777 { 778 const uint64_t ns_per_tick = NS_PER_TICK; 779 struct cpu_info *ci = curcpu(); 780 uint64_t last, now, delta, next; 781 int error; 782 783 KASSERT(cpu_intr_p()); 784 KASSERT(cookie == ci); 785 786 #if defined(XENPV) 787 frame = NULL; /* We use values cached in curcpu() */ 788 #endif 789 /* 790 * Find how many nanoseconds of Xen system time has elapsed 791 * since the last hardclock tick. 792 */ 793 last = ci->ci_xen_hardclock_systime_ns; 794 now = xen_vcputime_systime_ns(); 795 SDT_PROBE2(sdt, xen, hardclock, tick, last, now); 796 if (__predict_false(now < last)) { 797 SDT_PROBE2(sdt, xen, hardclock, systime__backward, 798 last, now); 799 #if XEN_CLOCK_DEBUG 800 device_printf(ci->ci_dev, "xen systime ran backwards" 801 " in hardclock %"PRIu64"ns\n", 802 last - now); 803 #endif 804 ci->ci_xen_systime_backwards_hardclock_evcnt.ev_count++; 805 /* 806 * we've lost track of time. Just pretends that one 807 * tick elapsed, and reset our idea of last tick. 808 */ 809 ci->ci_xen_hardclock_systime_ns = last = now - ns_per_tick; 810 } 811 delta = now - last; 812 813 /* 814 * Play hardclock catchup: run the hardclock timer as many 815 * times as appears necessary based on how much time has 816 * passed. 817 */ 818 if (__predict_false(delta >= 2*ns_per_tick)) { 819 SDT_PROBE3(sdt, xen, hardclock, jump, 820 last, now, delta/ns_per_tick); 821 822 /* 823 * Warn if we violate timecounter(9) contract: with a 824 * k-bit timecounter (here k = 32), and timecounter 825 * frequency f (here f = 1 GHz), the maximum period 826 * between hardclock calls is 2^k / f. 827 */ 828 if (delta > xen_timecounter.tc_counter_mask) { 829 printf("WARNING: hardclock skipped %"PRIu64"ns" 830 " (%"PRIu64" -> %"PRIu64")," 831 " exceeding maximum of %"PRIu32"ns" 832 " for timecounter(9)\n", 833 last, now, delta, 834 xen_timecounter.tc_counter_mask); 835 ci->ci_xen_timecounter_jump_evcnt.ev_count++; 836 } 837 /* don't try to catch up more than one second at once */ 838 if (delta > 1000000000UL) 839 delta = 1000000000UL; 840 } 841 while (delta >= ns_per_tick) { 842 ci->ci_xen_hardclock_systime_ns += ns_per_tick; 843 delta -= ns_per_tick; 844 hardclock(frame); 845 if (__predict_false(delta >= ns_per_tick)) { 846 SDT_PROBE3(sdt, xen, hardclock, missed, 847 last, now, delta); 848 ci->ci_xen_missed_hardclock_evcnt.ev_count++; 849 } 850 } 851 852 /* 853 * Re-arm the timer. If it fails, it's probably because the 854 * time is in the past, possibly because we're in the 855 * process of catching up missed hardclock calls. 856 * In this case schedule a tick in the near future. 857 */ 858 next = ci->ci_xen_hardclock_systime_ns + ns_per_tick; 859 error = HYPERVISOR_set_timer_op(next); 860 if (error) { 861 next = xen_vcputime_systime_ns() + ns_per_tick / 2; 862 error = HYPERVISOR_set_timer_op(next); 863 if (error) { 864 panic("failed to re-arm Xen timer %d", error); 865 } 866 } 867 868 /* Success! */ 869 return 0; 870 } 871 872 /* 873 * xen_initclocks() 874 * 875 * Initialize the Xen clocks on the current CPU. 876 */ 877 void 878 xen_initclocks(void) 879 { 880 struct cpu_info *ci = curcpu(); 881 882 /* If this is the primary CPU, do global initialization first. */ 883 if (ci == &cpu_info_primary) { 884 /* Initialize the systemwide Xen timecounter. */ 885 tc_init(&xen_timecounter); 886 } 887 888 /* Attach the event counters. */ 889 evcnt_attach_dynamic(&ci->ci_xen_cpu_tsc_backwards_evcnt, 890 EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev), 891 "cpu tsc ran backwards"); 892 evcnt_attach_dynamic(&ci->ci_xen_tsc_delta_negative_evcnt, 893 EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev), 894 "tsc delta went negative"); 895 evcnt_attach_dynamic(&ci->ci_xen_raw_systime_wraparound_evcnt, 896 EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev), 897 "raw systime wrapped around"); 898 evcnt_attach_dynamic(&ci->ci_xen_raw_systime_backwards_evcnt, 899 EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev), 900 "raw systime went backwards"); 901 evcnt_attach_dynamic(&ci->ci_xen_systime_backwards_hardclock_evcnt, 902 EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev), 903 "systime went backwards in hardclock"); 904 evcnt_attach_dynamic(&ci->ci_xen_missed_hardclock_evcnt, 905 EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev), 906 "missed hardclock"); 907 evcnt_attach_dynamic(&ci->ci_xen_timecounter_backwards_evcnt, 908 EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev), 909 "timecounter went backwards"); 910 evcnt_attach_dynamic(&ci->ci_xen_timecounter_jump_evcnt, 911 EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev), 912 "hardclock jumped past timecounter max"); 913 914 /* Fire up the clocks. */ 915 xen_resumeclocks(ci); 916 917 #ifdef DOM0OPS 918 /* 919 * If this is a privileged dom0, start pushing the wall 920 * clock time back to the Xen hypervisor. 921 */ 922 if (ci == &cpu_info_primary && xendomain_is_privileged()) 923 xen_timepush_init(); 924 #endif 925 } 926 927 #ifdef DOM0OPS 928 929 /* 930 * xen_timepush_init() 931 * 932 * Initialize callout to periodically set Xen hypervisor's wall 933 * clock time. 934 */ 935 static void 936 xen_timepush_init(void) 937 { 938 struct sysctllog *log = NULL; 939 const struct sysctlnode *node = NULL; 940 int error; 941 942 /* Start periodically updating the hypervisor's wall clock time. */ 943 callout_init(&xen_timepush.ch, 0); 944 callout_setfunc(&xen_timepush.ch, xen_timepush_intr, NULL); 945 946 /* Pick a default frequency for timepush. */ 947 xen_timepush.ticks = 53*hz + 3; /* avoid exact # of min/sec */ 948 949 /* Create machdep.xen node. */ 950 /* XXX Creation of the `machdep.xen' node should be elsewhere. */ 951 error = sysctl_createv(&log, 0, NULL, &node, 0, 952 CTLTYPE_NODE, "xen", 953 SYSCTL_DESCR("Xen top level node"), 954 NULL, 0, NULL, 0, 955 CTL_MACHDEP, CTL_CREATE, CTL_EOL); 956 if (error) 957 goto fail; 958 KASSERT(node != NULL); 959 960 /* Create int machdep.xen.timepush_ticks knob. */ 961 error = sysctl_createv(&log, 0, NULL, NULL, CTLFLAG_READWRITE, 962 CTLTYPE_INT, "timepush_ticks", 963 SYSCTL_DESCR("How often to update the hypervisor's time-of-day;" 964 " 0 to disable"), 965 sysctl_xen_timepush, 0, &xen_timepush.ticks, 0, 966 CTL_CREATE, CTL_EOL); 967 if (error) 968 goto fail; 969 970 /* Start the timepush callout. */ 971 callout_schedule(&xen_timepush.ch, xen_timepush.ticks); 972 973 /* Success! */ 974 return; 975 976 fail: sysctl_teardown(&log); 977 } 978 979 /* 980 * xen_timepush_intr(cookie) 981 * 982 * Callout interrupt handler to push NetBSD's idea of the wall 983 * clock time, usually synchronized with NTP, back to the Xen 984 * hypervisor. 985 */ 986 static void 987 xen_timepush_intr(void *cookie) 988 { 989 990 resettodr(); 991 if (xen_timepush.ticks) 992 callout_schedule(&xen_timepush.ch, xen_timepush.ticks); 993 } 994 995 /* 996 * sysctl_xen_timepush(...) 997 * 998 * Sysctl handler to set machdep.xen.timepush_ticks. 999 */ 1000 static int 1001 sysctl_xen_timepush(SYSCTLFN_ARGS) 1002 { 1003 struct sysctlnode node; 1004 int ticks; 1005 int error; 1006 1007 ticks = xen_timepush.ticks; 1008 node = *rnode; 1009 node.sysctl_data = &ticks; 1010 error = sysctl_lookup(SYSCTLFN_CALL(&node)); 1011 if (error || newp == NULL) 1012 return error; 1013 1014 if (ticks < 0) 1015 return EINVAL; 1016 1017 if (ticks != xen_timepush.ticks) { 1018 xen_timepush.ticks = ticks; 1019 1020 if (ticks == 0) 1021 callout_stop(&xen_timepush.ch); 1022 else 1023 callout_schedule(&xen_timepush.ch, ticks); 1024 } 1025 1026 return 0; 1027 } 1028 1029 #endif /* DOM0OPS */ 1030 1031 static int xen_rtc_get(struct todr_chip_handle *, struct timeval *); 1032 static int xen_rtc_set(struct todr_chip_handle *, struct timeval *); 1033 static void xen_wallclock_time(struct timespec *); 1034 /* 1035 * xen time of day register: 1036 * 1037 * Xen wall clock time, plus a Xen vCPU system time adjustment. 1038 */ 1039 static struct todr_chip_handle xen_todr_chip = { 1040 .todr_gettime = xen_rtc_get, 1041 .todr_settime = xen_rtc_set, 1042 }; 1043 1044 /* 1045 * xen_startrtclock() 1046 * 1047 * Initialize the real-time clock from x86 machdep autoconf. 1048 */ 1049 void 1050 xen_startrtclock(void) 1051 { 1052 1053 todr_attach(&xen_todr_chip); 1054 } 1055 1056 /* 1057 * xen_rtc_get(todr, tv) 1058 * 1059 * Get the current real-time clock from the Xen wall clock time 1060 * and vCPU system time adjustment. 1061 */ 1062 static int 1063 xen_rtc_get(struct todr_chip_handle *todr, struct timeval *tvp) 1064 { 1065 struct timespec ts; 1066 1067 xen_wallclock_time(&ts); 1068 TIMESPEC_TO_TIMEVAL(tvp, &ts); 1069 1070 return 0; 1071 } 1072 1073 /* 1074 * xen_rtc_set(todr, tv) 1075 * 1076 * Set the Xen wall clock time, if we can. 1077 */ 1078 static int 1079 xen_rtc_set(struct todr_chip_handle *todr, struct timeval *tvp) 1080 { 1081 #ifdef DOM0OPS 1082 struct clock_ymdhms dt; 1083 xen_platform_op_t op; 1084 uint64_t systime_ns; 1085 1086 if (xendomain_is_privileged()) { 1087 /* Convert to ymdhms and set the x86 ISA RTC. */ 1088 clock_secs_to_ymdhms(tvp->tv_sec, &dt); 1089 rtc_set_ymdhms(NULL, &dt); 1090 1091 /* Get the global system time so we can preserve it. */ 1092 systime_ns = xen_global_systime_ns(); 1093 1094 /* Set the hypervisor wall clock time. */ 1095 memset(&op, 0, sizeof(op)); 1096 op.cmd = XENPF_settime; 1097 op.u.settime.secs = tvp->tv_sec; 1098 op.u.settime.nsecs = tvp->tv_usec * 1000; 1099 op.u.settime.system_time = systime_ns; 1100 return HYPERVISOR_platform_op(&op); 1101 } 1102 #endif 1103 1104 /* XXX Should this fail if not on privileged dom0? */ 1105 return 0; 1106 } 1107 1108 /* 1109 * xen_wallclock_time(tsp) 1110 * 1111 * Return a snapshot of the current low-resolution wall clock 1112 * time, as reported by the hypervisor, in tsp. 1113 */ 1114 static void 1115 xen_wallclock_time(struct timespec *tsp) 1116 { 1117 struct xen_wallclock_ticket ticket; 1118 uint64_t systime_ns; 1119 1120 int s = splsched(); /* make sure we won't be interrupted */ 1121 /* Read the last wall clock sample from the hypervisor. */ 1122 do { 1123 xen_wallclock_enter(&ticket); 1124 tsp->tv_sec = HYPERVISOR_shared_info->wc_sec; 1125 tsp->tv_nsec = HYPERVISOR_shared_info->wc_nsec; 1126 } while (!xen_wallclock_exit(&ticket)); 1127 1128 /* Get the global system time. */ 1129 systime_ns = xen_global_systime_ns(); 1130 splx(s); 1131 1132 /* Add the system time to the wall clock time. */ 1133 systime_ns += tsp->tv_nsec; 1134 tsp->tv_sec += systime_ns / 1000000000ull; 1135 tsp->tv_nsec = systime_ns % 1000000000ull; 1136 } 1137 1138 #ifdef XENPV 1139 /* 1140 * setstatclockrate(rate) 1141 * 1142 * Set the statclock to run at rate, in units of ticks per second. 1143 * 1144 * Currently Xen does not have a separate statclock, so this is a 1145 * noop; instead the statclock runs in hardclock. 1146 */ 1147 void 1148 setstatclockrate(int rate) 1149 { 1150 } 1151 #endif /* XENPV */ 1152