1 1.23 andvar /* $NetBSD: xen_clock.c,v 1.23 2025/08/18 20:59:54 andvar Exp $ */ 2 1.2 bouyer 3 1.2 bouyer /*- 4 1.2 bouyer * Copyright (c) 2017, 2018 The NetBSD Foundation, Inc. 5 1.2 bouyer * All rights reserved. 6 1.2 bouyer * 7 1.2 bouyer * This code is derived from software contributed to The NetBSD Foundation 8 1.2 bouyer * by Taylor R. Campbell. 9 1.2 bouyer * 10 1.2 bouyer * Redistribution and use in source and binary forms, with or without 11 1.2 bouyer * modification, are permitted provided that the following conditions 12 1.2 bouyer * are met: 13 1.2 bouyer * 1. Redistributions of source code must retain the above copyright 14 1.2 bouyer * notice, this list of conditions and the following disclaimer. 15 1.2 bouyer * 2. Redistributions in binary form must reproduce the above copyright 16 1.2 bouyer * notice, this list of conditions and the following disclaimer in the 17 1.2 bouyer * documentation and/or other materials provided with the distribution. 18 1.2 bouyer * 19 1.2 bouyer * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 1.2 bouyer * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 1.2 bouyer * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 1.2 bouyer * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 1.2 bouyer * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 1.2 bouyer * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 1.2 bouyer * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 1.2 bouyer * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 1.2 bouyer * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 1.2 bouyer * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 1.2 bouyer * POSSIBILITY OF SUCH DAMAGE. 
 */

#include "opt_xen.h"

#ifndef XEN_CLOCK_DEBUG
#define XEN_CLOCK_DEBUG 0
#endif

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: xen_clock.c,v 1.23 2025/08/18 20:59:54 andvar Exp $");

#include <sys/param.h>
#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/callout.h>
#include <sys/cpu.h>
#include <sys/device.h>
#include <sys/evcnt.h>
#include <sys/intr.h>
#include <sys/kernel.h>
#include <sys/lwp.h>
#include <sys/proc.h>
#include <sys/sdt.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/timetc.h>

#include <dev/clock_subr.h>

#include <machine/cpu.h>
#include <machine/cpu_counter.h>
#include <machine/lock.h>

#include <xen/evtchn.h>
#include <xen/hypervisor.h>
#include <xen/include/public/vcpu.h>
#include <xen/xen.h>

#include <x86/rtc.h>

/* Nanoseconds per NetBSD hardclock tick (hz is the tick rate). */
#define NS_PER_TICK ((uint64_t)1000000000ULL/hz)

static uint64_t xen_vcputime_systime_ns(void);
static uint64_t xen_vcputime_raw_systime_ns(void);
static uint64_t xen_global_systime_ns(void);
static unsigned xen_get_timecount(struct timecounter *);
static int xen_timer_handler(void *, struct clockframe *);

/*
 * dtrace probes
 */
SDT_PROBE_DEFINE7(sdt, xen, clock, tsc__backward,
    "uint64_t"/*raw_systime_ns*/,
    "uint64_t"/*tsc_timestamp*/,
    "uint64_t"/*tsc_to_system_mul*/,
    "int"/*tsc_shift*/,
    "uint64_t"/*delta_ns*/,
    "uint64_t"/*tsc*/,
    "uint64_t"/*systime_ns*/);
SDT_PROBE_DEFINE7(sdt, xen, clock, tsc__delta__negative,
    "uint64_t"/*raw_systime_ns*/,
    "uint64_t"/*tsc_timestamp*/,
    "uint64_t"/*tsc_to_system_mul*/,
    "int"/*tsc_shift*/,
    "uint64_t"/*delta_ns*/,
    "uint64_t"/*tsc*/,
    "uint64_t"/*systime_ns*/);
SDT_PROBE_DEFINE7(sdt, xen, clock, systime__wraparound,
    "uint64_t"/*raw_systime_ns*/,
    "uint64_t"/*tsc_timestamp*/,
    "uint64_t"/*tsc_to_system_mul*/,
    "int"/*tsc_shift*/,
    "uint64_t"/*delta_ns*/,
    "uint64_t"/*tsc*/,
    "uint64_t"/*systime_ns*/);
SDT_PROBE_DEFINE7(sdt, xen, clock, systime__backward,
    "uint64_t"/*raw_systime_ns*/,
    "uint64_t"/*tsc_timestamp*/,
    "uint64_t"/*tsc_to_system_mul*/,
    "int"/*tsc_shift*/,
    "uint64_t"/*delta_ns*/,
    "uint64_t"/*tsc*/,
    "uint64_t"/*systime_ns*/);

SDT_PROBE_DEFINE3(sdt, xen, timecounter, backward,
    "uint64_t"/*local*/,
    "uint64_t"/*skew*/,
    "uint64_t"/*global*/);

SDT_PROBE_DEFINE2(sdt, xen, hardclock, systime__backward,
    "uint64_t"/*last_systime_ns*/,
    "uint64_t"/*this_systime_ns*/);
SDT_PROBE_DEFINE2(sdt, xen, hardclock, tick,
    "uint64_t"/*last_systime_ns*/,
    "uint64_t"/*this_systime_ns*/);
SDT_PROBE_DEFINE3(sdt, xen, hardclock, jump,
    "uint64_t"/*last_systime_ns*/,
    "uint64_t"/*this_systime_ns*/,
    "uint64_t"/*nticks*/);
SDT_PROBE_DEFINE3(sdt, xen, hardclock, missed,
    "uint64_t"/*last_systime_ns*/,
    "uint64_t"/*this_systime_ns*/,
    "uint64_t"/*remaining_ns*/);

/*
 * xen timecounter:
 *
 *	Xen vCPU system time, plus an adjustment with rdtsc.
 */
static struct timecounter xen_timecounter = {
	.tc_get_timecount = xen_get_timecount,
	.tc_poll_pps = NULL,
	.tc_counter_mask = ~0U,
	.tc_frequency = 1000000000ULL,	/* 1 GHz, i.e. units of nanoseconds */
	.tc_name = "xen_system_time",
	.tc_quality = 10000,
};

/*
 * xen_global_systime_ns_stamp
 *
 *	The latest Xen vCPU system time that has been observed on any
 *	CPU, for a global monotonic view of the Xen system time clock.
 */
static volatile uint64_t xen_global_systime_ns_stamp __cacheline_aligned;

#ifdef DOM0OPS
/*
 * xen timepush state:
 *
 *	Callout to periodically, after a sysctl-configurable number of
 *	NetBSD ticks, set the Xen hypervisor's wall clock time.
 */
static struct {
	struct callout ch;	/* periodic timepush callout */
	int ticks;		/* period, in hardclock ticks (sysctl-set) */
} xen_timepush;

static void xen_timepush_init(void);
static void xen_timepush_intr(void *);
static int sysctl_xen_timepush(SYSCTLFN_ARGS);
#endif

/*
 * struct xen_vcputime_ticket
 *
 *	State for a vCPU read section, during which a caller may read
 *	from fields of a struct vcpu_time_info and call rdtsc.
 *
 *	Caller must enter with xen_vcputime_enter, exit with
 *	xen_vcputime_exit, and be prepared to retry if
 *	xen_vcputime_exit fails.
 */
struct xen_vcputime_ticket {
	uint64_t version;	/* vcpu_time_info version at entry */
};

/*
 * xen_vcputime_enter(tp)
 *
 *	Enter a vCPU time read section and store a ticket in *tp, which
 *	the caller must use with xen_vcputime_exit.  Return a pointer
 *	to the current CPU's vcpu_time_info structure.  Caller must
 *	already be bound to the CPU.
 */
static inline volatile struct vcpu_time_info *
xen_vcputime_enter(struct xen_vcputime_ticket *tp)
{
	volatile struct vcpu_time_info *vt = &curcpu()->ci_vcpu->time;

	/*
	 * Spin while the version is odd -- an odd version is how the
	 * hypervisor marks an update in progress on this record.
	 */
	while (__predict_false(1 & (tp->version = vt->version)))
		SPINLOCK_BACKOFF_HOOK;

	/*
	 * Must read the version before reading the tsc on the local
	 * pCPU.  We are racing only with interruption by the
	 * hypervisor, so no need for a stronger memory barrier.
	 */
	__insn_barrier();

	return vt;
}

/*
 * xen_vcputime_exit(vt, tp)
 *
 *	Exit a vCPU time read section with the ticket in *tp from
 *	xen_vcputime_enter.  Return true on success, false if caller
 *	must retry.
 */
static inline bool
xen_vcputime_exit(volatile struct vcpu_time_info *vt,
    struct xen_vcputime_ticket *tp)
{

	KASSERT(vt == &curcpu()->ci_vcpu->time);

	/*
	 * Must read the tsc before re-reading the version on the local
	 * pCPU.  We are racing only with interruption by the
	 * hypervisor, so no need for a stronger memory barrier.
	 */
	__insn_barrier();

	/* Unchanged version means no update raced with our reads. */
	return tp->version == vt->version;
}

/*
 * xen_tsc_to_ns_delta(delta_tsc, mul_frac, shift)
 *
 *	Convert a difference in tsc units to a difference in
 *	nanoseconds given a multiplier and shift for the unit
 *	conversion.
 */
static inline uint64_t
xen_tsc_to_ns_delta(uint64_t delta_tsc, uint32_t tsc_to_system_mul,
    int8_t tsc_shift)
{
	uint32_t delta_tsc_hi, delta_tsc_lo;

	/* Apply the binary scale factor first, as the Xen ABI specifies. */
	if (tsc_shift < 0)
		delta_tsc >>= -tsc_shift;
	else
		delta_tsc <<= tsc_shift;

	delta_tsc_hi = delta_tsc >> 32;
	delta_tsc_lo = delta_tsc & 0xffffffffUL;

	/* d*m/2^32 = (2^32 d_h + d_l)*m/2^32 = d_h*m + (d_l*m)/2^32 */
	return ((uint64_t)delta_tsc_hi * tsc_to_system_mul) +
	    (((uint64_t)delta_tsc_lo * tsc_to_system_mul) >> 32);
}

/*
 * xen_vcputime_systime_ns()
 *
 *	Return a snapshot of the Xen system time plus an adjustment
 *	from the tsc, in units of nanoseconds.  Caller must be bound to
 *	the current CPU.
 */
static uint64_t
xen_vcputime_systime_ns(void)
{
	volatile struct vcpu_time_info *vt;
	struct cpu_info *ci = curcpu();
	struct xen_vcputime_ticket ticket;
	uint64_t raw_systime_ns, tsc_timestamp, tsc, delta_tsc, delta_ns;
	uint32_t tsc_to_system_mul;
	int8_t tsc_shift;
	uint64_t systime_ns;

	/* We'd better be bound to the CPU in _some_ way. */
	KASSERT(cpu_intr_p() || cpu_softintr_p() || kpreempt_disabled() ||
	    (curlwp->l_flag & LP_BOUND));

	/*
	 * Repeatedly try to read the system time, corresponding tsc
	 * timestamp, and tsc frequency until we get a consistent view.
	 */
	do {
		vt = xen_vcputime_enter(&ticket);

		/* Grab Xen's snapshot of raw system time and tsc. */
		raw_systime_ns = vt->system_time;
		tsc_timestamp = vt->tsc_timestamp;

		/* Get Xen's current idea of how fast the tsc is counting. */
		tsc_to_system_mul = vt->tsc_to_system_mul;
		tsc_shift = vt->tsc_shift;

		/* Read the CPU's tsc. */
		tsc = rdtsc();
	} while (!xen_vcputime_exit(vt, &ticket));

	/*
	 * Out of paranoia, check whether the tsc has gone backwards
	 * since Xen's timestamp.
	 *
	 * This shouldn't happen because the Xen hypervisor is supposed
	 * to have read the tsc _before_ writing to the vcpu_time_info
	 * page, _before_ we read the tsc.
	 *
	 * Further, if we switched pCPUs after reading the tsc
	 * timestamp but before reading the CPU's tsc, the hypervisor
	 * had better notify us by updating the version too and forcing
	 * us to retry the vCPU time read.
	 */
	if (__predict_false(tsc < tsc_timestamp)) {
		/*
		 * Notify the console that the CPU's tsc appeared to
		 * run behind Xen's idea of it, and pretend it hadn't.
		 */
		SDT_PROBE7(sdt, xen, clock, tsc__backward,
		    raw_systime_ns, tsc_timestamp,
		    tsc_to_system_mul, tsc_shift, /*delta_ns*/0, tsc,
		    /*systime_ns*/raw_systime_ns);
#if XEN_CLOCK_DEBUG
		device_printf(ci->ci_dev, "xen cpu tsc %"PRIu64
		    " ran backwards from timestamp %"PRIu64
		    " by %"PRIu64"\n",
		    tsc, tsc_timestamp, tsc_timestamp - tsc);
#endif
		ci->ci_xen_cpu_tsc_backwards_evcnt.ev_count++;
		delta_ns = delta_tsc = 0;
	} else {
		/* Find how far the CPU's tsc has advanced. */
		delta_tsc = tsc - tsc_timestamp;

		/* Convert the tsc delta to a nanosecond delta. */
		delta_ns = xen_tsc_to_ns_delta(delta_tsc, tsc_to_system_mul,
		    tsc_shift);
	}

	/*
	 * Notify the console if the delta computation yielded a
	 * negative, and pretend it hadn't.
	 *
	 * This doesn't make sense but I include it out of paranoia.
	 */
	if (__predict_false((int64_t)delta_ns < 0)) {
		SDT_PROBE7(sdt, xen, clock, tsc__delta__negative,
		    raw_systime_ns, tsc_timestamp,
		    tsc_to_system_mul, tsc_shift, delta_ns, tsc,
		    /*systime_ns*/raw_systime_ns);
#if XEN_CLOCK_DEBUG
		device_printf(ci->ci_dev, "xen tsc delta in ns went negative:"
		    " %"PRId64"\n", delta_ns);
#endif
		ci->ci_xen_tsc_delta_negative_evcnt.ev_count++;
		delta_ns = 0;
	}

	/*
	 * Compute the TSC-adjusted system time.
	 */
	systime_ns = raw_systime_ns + delta_ns;

	/*
	 * Notify the console if the addition wrapped around.
	 *
	 * This shouldn't happen because system time should be relative
	 * to a reasonable reference point, not centuries in the past.
	 * (2^64 ns is approximately half a millennium.)
	 */
	if (__predict_false(systime_ns < raw_systime_ns)) {
		SDT_PROBE7(sdt, xen, clock, systime__wraparound,
		    raw_systime_ns, tsc_timestamp,
		    tsc_to_system_mul, tsc_shift, delta_ns, tsc,
		    systime_ns);
#if XEN_CLOCK_DEBUG
		printf("xen raw systime + tsc delta wrapped around:"
		    " %"PRIu64" + %"PRIu64" = %"PRIu64"\n",
		    raw_systime_ns, delta_ns, systime_ns);
#endif
		ci->ci_xen_raw_systime_wraparound_evcnt.ev_count++;
	}

	/*
	 * Notify the console if the TSC-adjusted Xen system time
	 * appears to have gone backwards, and pretend we had gone
	 * forward.  This seems to happen pretty regularly under load.
	 */
	if (__predict_false(ci->ci_xen_last_systime_ns > systime_ns)) {
		SDT_PROBE7(sdt, xen, clock, systime__backward,
		    raw_systime_ns, tsc_timestamp,
		    tsc_to_system_mul, tsc_shift, delta_ns, tsc,
		    systime_ns);
#if XEN_CLOCK_DEBUG
		printf("xen raw systime + tsc delta went backwards:"
		    " %"PRIu64" > %"PRIu64"\n",
		    ci->ci_xen_last_systime_ns, systime_ns);
		printf(" raw_systime_ns=%"PRIu64"\n tsc_timestamp=%"PRIu64"\n"
		    " tsc=%"PRIu64"\n tsc_to_system_mul=%"PRIu32"\n"
		    " tsc_shift=%"PRId8"\n delta_tsc=%"PRIu64"\n"
		    " delta_ns=%"PRIu64"\n",
		    raw_systime_ns, tsc_timestamp, tsc, tsc_to_system_mul,
		    tsc_shift, delta_tsc, delta_ns);
#endif
		ci->ci_xen_raw_systime_backwards_evcnt.ev_count++;
		systime_ns = ci->ci_xen_last_systime_ns + 1;
	}

	/* Remember the TSC-adjusted Xen system time. */
	ci->ci_xen_last_systime_ns = systime_ns;

	/* We had better not have migrated CPUs. */
	KASSERT(ci == curcpu());

	/* And we're done: return the TSC-adjusted systime in nanoseconds. */
	return systime_ns;
}

/*
 * xen_vcputime_raw_systime_ns()
 *
 *	Return a snapshot of the current Xen system time to the
 *	resolution of the Xen hypervisor tick, in units of nanoseconds.
 */
static uint64_t
xen_vcputime_raw_systime_ns(void)
{
	volatile struct vcpu_time_info *vt;
	struct xen_vcputime_ticket ticket;
	uint64_t raw_systime_ns;

	/* Retry until the version check says the read was consistent. */
	do {
		vt = xen_vcputime_enter(&ticket);
		raw_systime_ns = vt->system_time;
	} while (!xen_vcputime_exit(vt, &ticket));

	return raw_systime_ns;
}

/*
 * struct xen_wallclock_ticket
 *
 *	State for a wall clock read section, during which a caller may
 *	read from the wall clock fields of HYPERVISOR_shared_info.
 *	Caller must enter with xen_wallclock_enter, exit with
 *	xen_wallclock_exit, and be prepared to retry if
 *	xen_wallclock_exit fails.
 */
struct xen_wallclock_ticket {
	uint32_t version;	/* wc_version at entry */
};

/*
 * xen_wallclock_enter(tp)
 *
 *	Enter a wall clock read section and store a ticket in *tp,
 *	which the caller must use with xen_wallclock_exit.
 */
static inline void
xen_wallclock_enter(struct xen_wallclock_ticket *tp)
{

	/* Spin while the version is odd: an update is in progress. */
	while (__predict_false(1 & (tp->version =
		    HYPERVISOR_shared_info->wc_version)))
		SPINLOCK_BACKOFF_HOOK;

	/*
	 * Must read the version from memory before reading the
	 * timestamp from memory, as written potentially by another
	 * pCPU.
	 */
	membar_consumer();
}

/*
 * xen_wallclock_exit(tp)
 *
 *	Exit a wall clock read section with the ticket in *tp from
 *	xen_wallclock_enter.  Return true on success, false if caller
 *	must retry.
 */
static inline bool
xen_wallclock_exit(struct xen_wallclock_ticket *tp)
{

	/*
	 * Must read the timestamp from memory before re-reading the
	 * version from memory, as written potentially by another pCPU.
	 */
	membar_consumer();

	return tp->version == HYPERVISOR_shared_info->wc_version;
}

/*
 * xen_global_systime_ns()
 *
 *	Return a global monotonic view of the system time in
 *	nanoseconds, computed by the per-CPU Xen raw system time plus
 *	an rdtsc adjustment, and advance the view of the system time
 *	for all other CPUs.
 */
static uint64_t
xen_global_systime_ns(void)
{
	struct cpu_info *ci;
	uint64_t local, global, skew, result;

	/*
	 * Find the local timecount on this CPU, and make sure it does
	 * not precede the latest global timecount witnessed so far by
	 * any CPU.  If it does, add to the local CPU's skew from the
	 * fastest CPU.
	 *
	 * XXX Can we avoid retrying if the CAS fails?
	 */
	int s = splsched(); /* make sure we won't be interrupted */
	ci = curcpu();
	do {
		local = xen_vcputime_systime_ns();
		skew = ci->ci_xen_systime_ns_skew;
		global = xen_global_systime_ns_stamp;
		if (__predict_false(local + skew < global + 1)) {
			/*
			 * Local view fell behind the global stamp:
			 * return global+1 and fold the shortfall into
			 * this CPU's persistent skew so future local
			 * reads stay monotonic.
			 */
			SDT_PROBE3(sdt, xen, timecounter, backward,
			    local, skew, global);
#if XEN_CLOCK_DEBUG
			device_printf(ci->ci_dev,
			    "xen timecounter went backwards:"
			    " local=%"PRIu64" skew=%"PRIu64" global=%"PRIu64","
			    " adding %"PRIu64" to skew\n",
			    local, skew, global, global + 1 - (local + skew));
#endif
			ci->ci_xen_timecounter_backwards_evcnt.ev_count++;
			result = global + 1;
			ci->ci_xen_systime_ns_skew += global + 1 -
			    (local + skew);
		} else {
			result = local + skew;
		}
	} while (atomic_cas_64(&xen_global_systime_ns_stamp, global, result)
	    != global);
	KASSERT(ci == curcpu());
	splx(s);

	return result;
}

/*
 * xen_get_timecount(tc)
 *
 *	Return the low 32 bits of a global monotonic view of the Xen
 *	system time.
 */
static unsigned
xen_get_timecount(struct timecounter *tc)
{

	KASSERT(tc == &xen_timecounter);

	return (unsigned)xen_global_systime_ns();
}

/*
 * xen_delay(n)
 *
 *	Wait approximately n microseconds.
 */
void
xen_delay(unsigned n)
{
	int bound;

	/* Bind to the CPU so we don't compare tsc on different CPUs. */
	bound = curlwp_bind();

	/* No vCPU time info yet (very early boot?) -- nothing to wait on. */
	if (curcpu()->ci_vcpu == NULL) {
		curlwp_bindx(bound);
		return;
	}

	/* Short wait (<500us) or long wait? */
	if (n < 500000) {
		/*
		 * Xen system time is not precise enough for short
		 * delays, so use the tsc instead.
		 *
		 * We work with the current tsc frequency, and figure
		 * that if it changes while we're delaying, we've
		 * probably delayed long enough -- up to 500us.
		 *
		 * We do not use cpu_frequency(ci), which uses a
		 * quantity detected at boot time, and which may have
		 * changed by now if Xen has migrated this vCPU to
		 * another pCPU.
		 *
		 * XXX How long does it take to migrate pCPUs?
		 */
		volatile struct vcpu_time_info *vt;
		struct xen_vcputime_ticket ticket;
		uint64_t tsc_start, last_tsc, tsc;
		uint32_t tsc_to_system_mul;
		int8_t tsc_shift;

		/* Get the starting tsc and tsc frequency. */
		do {
			vt = xen_vcputime_enter(&ticket);
			tsc_start = last_tsc = rdtsc();
			tsc_to_system_mul = vt->tsc_to_system_mul;
			tsc_shift = vt->tsc_shift;
		} while (!xen_vcputime_exit(vt, &ticket));

		/*
		 * Wait until as many tsc ticks as there are in n
		 * microseconds have elapsed, or the tsc has gone
		 * backwards meaning we've probably migrated pCPUs.
		 */
		for (;;) {
			tsc = rdtsc();
			if (__predict_false(tsc < last_tsc))
				break;
			if (xen_tsc_to_ns_delta(tsc - tsc_start,
				tsc_to_system_mul, tsc_shift)/1000 >= n)
				break;
			last_tsc = tsc;
		}
	} else {
		/*
		 * Use the Xen system time for >=500us delays.  From my
		 * testing, it seems to sometimes run backward by about
		 * 110us, which is not so bad.
		 */
		uint64_t n_ns = 1000*(uint64_t)n;
		uint64_t start_ns;

		/* Get the start time. */
		start_ns = xen_vcputime_raw_systime_ns();

		/* Wait until the system time has passed the end. */
		do {
			HYPERVISOR_yield();
		} while (xen_vcputime_raw_systime_ns() - start_ns < n_ns);
	}

	/* Unbind from the CPU if we weren't already bound. */
	curlwp_bindx(bound);
}

/*
 * xen_suspendclocks(ci)
 *
 *	Stop handling the Xen timer event on the CPU of ci.  Caller
 *	must be running on and bound to ci's CPU.
 *
 *	Actually, caller must have kpreemption disabled, because that's
 *	easier to assert at the moment.
 */
void
xen_suspendclocks(struct cpu_info *ci)
{
	int evtch;

	KASSERT(ci == curcpu());
	KASSERT(kpreempt_disabled());

	/*
	 * Find the VIRQ_TIMER event channel and close it so new timer
	 * interrupt events stop getting delivered to it.
	 *
	 * XXX Should this happen later?  This is not the reverse order
	 * of xen_resumeclocks.  It is apparently necessary in this
	 * order only because we don't stash evtchn anywhere, but we
	 * could stash it.
	 */
	evtch = unbind_virq_from_evtch(VIRQ_TIMER);
	KASSERT(evtch != -1);

	/*
	 * Mask the event channel so we stop getting new interrupts on
	 * it.
	 */
	hypervisor_mask_event(evtch);

	/*
	 * Now that we are no longer getting new interrupts, remove the
	 * handler and wait for any existing calls to the handler to
	 * complete.  After this point, there can be no concurrent
	 * calls to xen_timer_handler.
	 */
	event_remove_handler(evtch,
	    __FPTRCAST(int (*)(void *), xen_timer_handler), ci);

	aprint_verbose("Xen clock: removed event channel %d\n", evtch);

	/* We'd better not have switched CPUs. */
	KASSERT(ci == curcpu());
}

/*
 * xen_resumeclocks(ci)
 *
 *	Start handling the Xen timer event on the CPU of ci.  Arm the
 *	Xen timer.  Caller must be running on and bound to ci's CPU.
 *
 *	Actually, caller must have kpreemption disabled, because that's
 *	easier to assert at the moment.
710 1.2 bouyer */ 711 1.2 bouyer void 712 1.2 bouyer xen_resumeclocks(struct cpu_info *ci) 713 1.2 bouyer { 714 1.2 bouyer char intr_xname[INTRDEVNAMEBUF]; 715 1.2 bouyer int evtch; 716 1.2 bouyer int error __diagused; 717 1.2 bouyer 718 1.2 bouyer KASSERT(ci == curcpu()); 719 1.2 bouyer KASSERT(kpreempt_disabled()); 720 1.2 bouyer 721 1.13 riastrad /* 722 1.13 riastrad * Allocate an event channel to receive VIRQ_TIMER events. 723 1.13 riastrad */ 724 1.2 bouyer evtch = bind_virq_to_evtch(VIRQ_TIMER); 725 1.2 bouyer KASSERT(evtch != -1); 726 1.2 bouyer 727 1.13 riastrad /* 728 1.13 riastrad * Set an event handler for VIRQ_TIMER events to call 729 1.13 riastrad * xen_timer_handler. 730 1.13 riastrad */ 731 1.2 bouyer snprintf(intr_xname, sizeof(intr_xname), "%s clock", 732 1.2 bouyer device_xname(ci->ci_dev)); 733 1.2 bouyer /* XXX sketchy function pointer cast -- fix the API, please */ 734 1.2 bouyer if (event_set_handler(evtch, 735 1.2 bouyer __FPTRCAST(int (*)(void *), xen_timer_handler), 736 1.5 bouyer ci, IPL_CLOCK, NULL, intr_xname, true, ci) == NULL) 737 1.2 bouyer panic("failed to establish timer interrupt handler"); 738 1.2 bouyer 739 1.2 bouyer aprint_verbose("Xen %s: using event channel %d\n", intr_xname, evtch); 740 1.2 bouyer 741 1.2 bouyer /* Disarm the periodic timer on Xen>=3.1 which is allegedly buggy. */ 742 1.2 bouyer if (XEN_MAJOR(xen_version) > 3 || XEN_MINOR(xen_version) > 0) { 743 1.2 bouyer error = HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, 744 1.2 bouyer ci->ci_vcpuid, NULL); 745 1.2 bouyer KASSERT(error == 0); 746 1.2 bouyer } 747 1.2 bouyer 748 1.2 bouyer /* Pretend the last hardclock happened right now. */ 749 1.2 bouyer ci->ci_xen_hardclock_systime_ns = xen_vcputime_systime_ns(); 750 1.2 bouyer 751 1.2 bouyer /* Arm the one-shot timer. 
*/ 752 1.2 bouyer error = HYPERVISOR_set_timer_op(ci->ci_xen_hardclock_systime_ns + 753 1.2 bouyer NS_PER_TICK); 754 1.2 bouyer KASSERT(error == 0); 755 1.13 riastrad 756 1.13 riastrad /* 757 1.13 riastrad * Ready to go. Unmask the event. After this point, Xen may 758 1.13 riastrad * start calling xen_timer_handler. 759 1.13 riastrad */ 760 1.12 bouyer hypervisor_unmask_event(evtch); 761 1.2 bouyer 762 1.2 bouyer /* We'd better not have switched CPUs. */ 763 1.2 bouyer KASSERT(ci == curcpu()); 764 1.2 bouyer } 765 1.2 bouyer 766 1.2 bouyer /* 767 1.2 bouyer * xen_timer_handler(cookie, frame) 768 1.2 bouyer * 769 1.2 bouyer * Periodic Xen timer event handler for NetBSD hardclock. Calls 770 1.2 bouyer * to this may get delayed, so we run hardclock as many times as 771 1.2 bouyer * we need to in order to cover the Xen system time that elapsed. 772 1.2 bouyer * After that, re-arm the timer to run again at the next tick. 773 1.2 bouyer * The cookie is the pointer to struct cpu_info. 774 1.2 bouyer */ 775 1.2 bouyer static int 776 1.2 bouyer xen_timer_handler(void *cookie, struct clockframe *frame) 777 1.2 bouyer { 778 1.14 riastrad const uint64_t ns_per_tick = NS_PER_TICK; 779 1.2 bouyer struct cpu_info *ci = curcpu(); 780 1.2 bouyer uint64_t last, now, delta, next; 781 1.2 bouyer int error; 782 1.2 bouyer 783 1.2 bouyer KASSERT(cpu_intr_p()); 784 1.2 bouyer KASSERT(cookie == ci); 785 1.2 bouyer 786 1.7 riastrad #if defined(XENPV) 787 1.2 bouyer frame = NULL; /* We use values cached in curcpu() */ 788 1.2 bouyer #endif 789 1.2 bouyer /* 790 1.2 bouyer * Find how many nanoseconds of Xen system time has elapsed 791 1.2 bouyer * since the last hardclock tick. 
	 */
	last = ci->ci_xen_hardclock_systime_ns;
	now = xen_vcputime_systime_ns();
	SDT_PROBE2(sdt, xen, hardclock, tick,  last, now);
	if (__predict_false(now < last)) {
		/* Xen vCPU system time went backwards -- count and recover. */
		SDT_PROBE2(sdt, xen, hardclock, systime__backward,
		    last, now);
#if XEN_CLOCK_DEBUG
		device_printf(ci->ci_dev, "xen systime ran backwards"
		    " in hardclock %"PRIu64"ns\n",
		    last - now);
#endif
		ci->ci_xen_systime_backwards_hardclock_evcnt.ev_count++;
		/*
		 * We've lost track of time.  Just pretend that one
		 * tick elapsed, and reset our idea of the last tick.
		 */
		ci->ci_xen_hardclock_systime_ns = last = now - ns_per_tick;
	}
	delta = now - last;

	/*
	 * Play hardclock catchup: run the hardclock timer as many
	 * times as appears necessary based on how much time has
	 * passed.
	 */
	if (__predict_false(delta >= 2*ns_per_tick)) {
		SDT_PROBE3(sdt, xen, hardclock, jump,
		    last, now, delta/ns_per_tick);

		/*
		 * Warn if we violate timecounter(9) contract: with a
		 * k-bit timecounter (here k = 32), and timecounter
		 * frequency f (here f = 1 GHz), the maximum period
		 * between hardclock calls is 2^k / f.
		 */
		if (delta > xen_timecounter.tc_counter_mask) {
			printf("WARNING: hardclock skipped %"PRIu64"ns"
			    " (%"PRIu64" -> %"PRIu64"),"
			    " exceeding maximum of %"PRIu32"ns"
			    " for timecounter(9)\n",
			    last, now, delta,
			    xen_timecounter.tc_counter_mask);
			ci->ci_xen_timecounter_jump_evcnt.ev_count++;
		}
		/* don't try to catch up more than one second at once */
		if (delta > 1000000000UL)
			delta = 1000000000UL;
	}
	/*
	 * One hardclock(9) call per elapsed tick; count any iteration
	 * past the first as a missed hardclock.
	 */
	while (delta >= ns_per_tick) {
		ci->ci_xen_hardclock_systime_ns += ns_per_tick;
		delta -= ns_per_tick;
		hardclock(frame);
		if (__predict_false(delta >= ns_per_tick)) {
			SDT_PROBE3(sdt, xen, hardclock, missed,
			    last, now, delta);
			ci->ci_xen_missed_hardclock_evcnt.ev_count++;
		}
	}

	/*
	 * Re-arm the timer.  If it fails, it's probably because the
	 * time is in the past, possibly because we're in the
	 * process of catching up missed hardclock calls.
	 * In this case schedule a tick in the near future.
	 */
	next = ci->ci_xen_hardclock_systime_ns + ns_per_tick;
	error = HYPERVISOR_set_timer_op(next);
	if (error) {
		/* Retry half a tick from the current time before giving up. */
		next = xen_vcputime_systime_ns() + ns_per_tick / 2;
		error = HYPERVISOR_set_timer_op(next);
		if (error) {
			panic("failed to re-arm Xen timer %d", error);
		}
	}

	/* Success! */
	return 0;
}

/*
 * xen_initclocks()
 *
 *	Initialize the Xen clocks on the current CPU.
 */
void
xen_initclocks(void)
{
	struct cpu_info *ci = curcpu();

	/* If this is the primary CPU, do global initialization first. */
	if (ci == &cpu_info_primary) {
		/* Initialize the systemwide Xen timecounter. */
		tc_init(&xen_timecounter);
	}

	/* Attach the event counters. */
	evcnt_attach_dynamic(&ci->ci_xen_cpu_tsc_backwards_evcnt,
	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
	    "cpu tsc ran backwards");
	evcnt_attach_dynamic(&ci->ci_xen_tsc_delta_negative_evcnt,
	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
	    "tsc delta went negative");
	evcnt_attach_dynamic(&ci->ci_xen_raw_systime_wraparound_evcnt,
	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
	    "raw systime wrapped around");
	evcnt_attach_dynamic(&ci->ci_xen_raw_systime_backwards_evcnt,
	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
	    "raw systime went backwards");
	evcnt_attach_dynamic(&ci->ci_xen_systime_backwards_hardclock_evcnt,
	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
	    "systime went backwards in hardclock");
	evcnt_attach_dynamic(&ci->ci_xen_missed_hardclock_evcnt,
	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
	    "missed hardclock");
	evcnt_attach_dynamic(&ci->ci_xen_timecounter_backwards_evcnt,
	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
	    "timecounter went backwards");
	evcnt_attach_dynamic(&ci->ci_xen_timecounter_jump_evcnt,
	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
	    "hardclock jumped past timecounter max");

	/* Fire up the clocks. */
	xen_resumeclocks(ci);

#ifdef DOM0OPS
	/*
	 * If this is a privileged dom0, start pushing the wall
	 * clock time back to the Xen hypervisor.
	 */
	if (ci == &cpu_info_primary && xendomain_is_privileged())
		xen_timepush_init();
#endif
}

#ifdef DOM0OPS

/*
 * xen_timepush_init()
 *
 *	Initialize callout to periodically set Xen hypervisor's wall
 *	clock time.
 */
static void
xen_timepush_init(void)
{
	struct sysctllog *log = NULL;
	const struct sysctlnode *node = NULL;
	int error;

	/* Start periodically updating the hypervisor's wall clock time. */
	callout_init(&xen_timepush.ch, 0);
	callout_setfunc(&xen_timepush.ch, xen_timepush_intr, NULL);

	/* Pick a default frequency for timepush. */
	xen_timepush.ticks = 53*hz + 3; /* avoid exact # of min/sec */

	/* Create machdep.xen node. */
	/* XXX Creation of the `machdep.xen' node should be elsewhere. */
	error = sysctl_createv(&log, 0, NULL, &node, 0,
	    CTLTYPE_NODE, "xen",
	    SYSCTL_DESCR("Xen top level node"),
	    NULL, 0, NULL, 0,
	    CTL_MACHDEP, CTL_CREATE, CTL_EOL);
	if (error)
		goto fail;
	KASSERT(node != NULL);

	/* Create int machdep.xen.timepush_ticks knob. */
	error = sysctl_createv(&log, 0, NULL, NULL, CTLFLAG_READWRITE,
	    CTLTYPE_INT, "timepush_ticks",
	    SYSCTL_DESCR("How often to update the hypervisor's time-of-day;"
		" 0 to disable"),
	    sysctl_xen_timepush, 0, &xen_timepush.ticks, 0,
	    CTL_CREATE, CTL_EOL);
	if (error)
		goto fail;

	/* Start the timepush callout. */
	callout_schedule(&xen_timepush.ch, xen_timepush.ticks);

	/* Success! */
	return;

	/* On sysctl failure, tear down whatever nodes were created. */
fail:	sysctl_teardown(&log);
}

/*
 * xen_timepush_intr(cookie)
 *
 *	Callout interrupt handler to push NetBSD's idea of the wall
 *	clock time, usually synchronized with NTP, back to the Xen
 *	hypervisor.  Reschedules itself unless timepush has been
 *	disabled by setting the tick count to zero.
 */
static void
xen_timepush_intr(void *cookie)
{

	resettodr();
	if (xen_timepush.ticks)
		callout_schedule(&xen_timepush.ch, xen_timepush.ticks);
}

/*
 * sysctl_xen_timepush(...)
 *
 *	Sysctl handler to set machdep.xen.timepush_ticks.
 */
static int
sysctl_xen_timepush(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	int ticks;
	int error;

	/*
	 * Copy the current value into a local node so sysctl_lookup
	 * can read or overwrite it without touching the real knob.
	 */
	ticks = xen_timepush.ticks;
	node = *rnode;
	node.sysctl_data = &ticks;
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	/* Negative periods are nonsense; zero means `disabled'. */
	if (ticks < 0)
		return EINVAL;

	if (ticks != xen_timepush.ticks) {
		xen_timepush.ticks = ticks;

		if (ticks == 0)
			callout_stop(&xen_timepush.ch);
		else
			callout_schedule(&xen_timepush.ch, ticks);
	}

	return 0;
}

#endif	/* DOM0OPS */

static int	xen_rtc_get(struct todr_chip_handle *, struct timeval *);
static int	xen_rtc_set(struct todr_chip_handle *, struct timeval *);
static void	xen_wallclock_time(struct timespec *);
/*
 * xen time of day register:
 *
 *	Xen wall clock time, plus a Xen vCPU system time adjustment.
 */
static struct todr_chip_handle xen_todr_chip = {
	.todr_gettime = xen_rtc_get,
	.todr_settime = xen_rtc_set,
};

/*
 * xen_startrtclock()
 *
 *	Initialize the real-time clock from x86 machdep autoconf.
 */
void
xen_startrtclock(void)
{

	todr_attach(&xen_todr_chip);
}

/*
 * xen_rtc_get(todr, tv)
 *
 *	Get the current real-time clock from the Xen wall clock time
 *	and vCPU system time adjustment.  Always succeeds.
 */
static int
xen_rtc_get(struct todr_chip_handle *todr, struct timeval *tvp)
{
	struct timespec ts;

	xen_wallclock_time(&ts);
	TIMESPEC_TO_TIMEVAL(tvp, &ts);

	return 0;
}

/*
 * xen_rtc_set(todr, tv)
 *
 *	Set the Xen wall clock time, if we can (i.e. on a privileged
 *	dom0 built with DOM0OPS; a no-op otherwise).
 */
static int
xen_rtc_set(struct todr_chip_handle *todr, struct timeval *tvp)
{
#ifdef DOM0OPS
	struct clock_ymdhms dt;
	xen_platform_op_t op;
	uint64_t systime_ns;

	if (xendomain_is_privileged()) {
		/* Convert to ymdhms and set the x86 ISA RTC. */
		clock_secs_to_ymdhms(tvp->tv_sec, &dt);
		rtc_set_ymdhms(NULL, &dt);

		/* Get the global system time so we can preserve it. */
		systime_ns = xen_global_systime_ns();

		/* Set the hypervisor wall clock time.
*/ 1095 1.19 riastrad memset(&op, 0, sizeof(op)); 1096 1.2 bouyer op.cmd = XENPF_settime; 1097 1.2 bouyer op.u.settime.secs = tvp->tv_sec; 1098 1.2 bouyer op.u.settime.nsecs = tvp->tv_usec * 1000; 1099 1.2 bouyer op.u.settime.system_time = systime_ns; 1100 1.2 bouyer return HYPERVISOR_platform_op(&op); 1101 1.2 bouyer } 1102 1.2 bouyer #endif 1103 1.2 bouyer 1104 1.2 bouyer /* XXX Should this fail if not on privileged dom0? */ 1105 1.2 bouyer return 0; 1106 1.2 bouyer } 1107 1.2 bouyer 1108 1.2 bouyer /* 1109 1.2 bouyer * xen_wallclock_time(tsp) 1110 1.2 bouyer * 1111 1.2 bouyer * Return a snapshot of the current low-resolution wall clock 1112 1.2 bouyer * time, as reported by the hypervisor, in tsp. 1113 1.2 bouyer */ 1114 1.2 bouyer static void 1115 1.2 bouyer xen_wallclock_time(struct timespec *tsp) 1116 1.2 bouyer { 1117 1.2 bouyer struct xen_wallclock_ticket ticket; 1118 1.2 bouyer uint64_t systime_ns; 1119 1.2 bouyer 1120 1.2 bouyer int s = splsched(); /* make sure we won't be interrupted */ 1121 1.2 bouyer /* Read the last wall clock sample from the hypervisor. */ 1122 1.2 bouyer do { 1123 1.2 bouyer xen_wallclock_enter(&ticket); 1124 1.2 bouyer tsp->tv_sec = HYPERVISOR_shared_info->wc_sec; 1125 1.2 bouyer tsp->tv_nsec = HYPERVISOR_shared_info->wc_nsec; 1126 1.2 bouyer } while (!xen_wallclock_exit(&ticket)); 1127 1.2 bouyer 1128 1.2 bouyer /* Get the global system time. */ 1129 1.2 bouyer systime_ns = xen_global_systime_ns(); 1130 1.2 bouyer splx(s); 1131 1.2 bouyer 1132 1.2 bouyer /* Add the system time to the wall clock time. */ 1133 1.2 bouyer systime_ns += tsp->tv_nsec; 1134 1.2 bouyer tsp->tv_sec += systime_ns / 1000000000ull; 1135 1.2 bouyer tsp->tv_nsec = systime_ns % 1000000000ull; 1136 1.2 bouyer } 1137 1.2 bouyer 1138 1.4 bouyer #ifdef XENPV 1139 1.4 bouyer /* 1140 1.4 bouyer * setstatclockrate(rate) 1141 1.4 bouyer * 1142 1.4 bouyer * Set the statclock to run at rate, in units of ticks per second. 
1143 1.4 bouyer * 1144 1.4 bouyer * Currently Xen does not have a separate statclock, so this is a 1145 1.23 andvar * noop; instead the statclock runs in hardclock. 1146 1.4 bouyer */ 1147 1.4 bouyer void 1148 1.4 bouyer setstatclockrate(int rate) 1149 1.4 bouyer { 1150 1.4 bouyer } 1151 1.2 bouyer #endif /* XENPV */ 1152