/* $NetBSD: xen_clock.c,v 1.23 2025/08/18 20:59:54 andvar Exp $ */

/*-
 * Copyright (c) 2017, 2018 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Taylor R. Campbell.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
31 1.2 bouyer
32 1.2 bouyer #include "opt_xen.h"
33 1.2 bouyer
34 1.2 bouyer #ifndef XEN_CLOCK_DEBUG
35 1.2 bouyer #define XEN_CLOCK_DEBUG 0
36 1.2 bouyer #endif
37 1.2 bouyer
38 1.2 bouyer #include <sys/cdefs.h>
39 1.23 andvar __KERNEL_RCSID(0, "$NetBSD: xen_clock.c,v 1.23 2025/08/18 20:59:54 andvar Exp $");
40 1.2 bouyer
41 1.2 bouyer #include <sys/param.h>
42 1.2 bouyer #include <sys/types.h>
43 1.2 bouyer #include <sys/atomic.h>
44 1.2 bouyer #include <sys/callout.h>
45 1.2 bouyer #include <sys/cpu.h>
46 1.2 bouyer #include <sys/device.h>
47 1.2 bouyer #include <sys/evcnt.h>
48 1.2 bouyer #include <sys/intr.h>
49 1.2 bouyer #include <sys/kernel.h>
50 1.2 bouyer #include <sys/lwp.h>
51 1.2 bouyer #include <sys/proc.h>
52 1.9 riastrad #include <sys/sdt.h>
53 1.2 bouyer #include <sys/sysctl.h>
54 1.2 bouyer #include <sys/systm.h>
55 1.2 bouyer #include <sys/time.h>
56 1.2 bouyer #include <sys/timetc.h>
57 1.2 bouyer
58 1.2 bouyer #include <dev/clock_subr.h>
59 1.2 bouyer
60 1.2 bouyer #include <machine/cpu.h>
61 1.2 bouyer #include <machine/cpu_counter.h>
62 1.2 bouyer #include <machine/lock.h>
63 1.2 bouyer
64 1.2 bouyer #include <xen/evtchn.h>
65 1.2 bouyer #include <xen/hypervisor.h>
66 1.2 bouyer #include <xen/include/public/vcpu.h>
67 1.2 bouyer #include <xen/xen.h>
68 1.2 bouyer
69 1.2 bouyer #include <x86/rtc.h>
70 1.2 bouyer
71 1.2 bouyer #define NS_PER_TICK ((uint64_t)1000000000ULL/hz)
72 1.2 bouyer
73 1.2 bouyer static uint64_t xen_vcputime_systime_ns(void);
74 1.2 bouyer static uint64_t xen_vcputime_raw_systime_ns(void);
75 1.2 bouyer static uint64_t xen_global_systime_ns(void);
76 1.2 bouyer static unsigned xen_get_timecount(struct timecounter *);
77 1.2 bouyer static int xen_timer_handler(void *, struct clockframe *);
78 1.2 bouyer
79 1.2 bouyer /*
80 1.9 riastrad * dtrace probes
81 1.9 riastrad */
82 1.9 riastrad SDT_PROBE_DEFINE7(sdt, xen, clock, tsc__backward,
83 1.9 riastrad "uint64_t"/*raw_systime_ns*/,
84 1.9 riastrad "uint64_t"/*tsc_timestamp*/,
85 1.9 riastrad "uint64_t"/*tsc_to_system_mul*/,
86 1.9 riastrad "int"/*tsc_shift*/,
87 1.9 riastrad "uint64_t"/*delta_ns*/,
88 1.9 riastrad "uint64_t"/*tsc*/,
89 1.9 riastrad "uint64_t"/*systime_ns*/);
90 1.9 riastrad SDT_PROBE_DEFINE7(sdt, xen, clock, tsc__delta__negative,
91 1.9 riastrad "uint64_t"/*raw_systime_ns*/,
92 1.9 riastrad "uint64_t"/*tsc_timestamp*/,
93 1.9 riastrad "uint64_t"/*tsc_to_system_mul*/,
94 1.9 riastrad "int"/*tsc_shift*/,
95 1.9 riastrad "uint64_t"/*delta_ns*/,
96 1.9 riastrad "uint64_t"/*tsc*/,
97 1.9 riastrad "uint64_t"/*systime_ns*/);
98 1.9 riastrad SDT_PROBE_DEFINE7(sdt, xen, clock, systime__wraparound,
99 1.9 riastrad "uint64_t"/*raw_systime_ns*/,
100 1.9 riastrad "uint64_t"/*tsc_timestamp*/,
101 1.9 riastrad "uint64_t"/*tsc_to_system_mul*/,
102 1.9 riastrad "int"/*tsc_shift*/,
103 1.9 riastrad "uint64_t"/*delta_ns*/,
104 1.9 riastrad "uint64_t"/*tsc*/,
105 1.9 riastrad "uint64_t"/*systime_ns*/);
106 1.9 riastrad SDT_PROBE_DEFINE7(sdt, xen, clock, systime__backward,
107 1.9 riastrad "uint64_t"/*raw_systime_ns*/,
108 1.9 riastrad "uint64_t"/*tsc_timestamp*/,
109 1.9 riastrad "uint64_t"/*tsc_to_system_mul*/,
110 1.9 riastrad "int"/*tsc_shift*/,
111 1.9 riastrad "uint64_t"/*delta_ns*/,
112 1.9 riastrad "uint64_t"/*tsc*/,
113 1.9 riastrad "uint64_t"/*systime_ns*/);
114 1.9 riastrad
115 1.11 riastrad SDT_PROBE_DEFINE3(sdt, xen, timecounter, backward,
116 1.11 riastrad "uint64_t"/*local*/,
117 1.11 riastrad "uint64_t"/*skew*/,
118 1.11 riastrad "uint64_t"/*global*/);
119 1.11 riastrad
120 1.9 riastrad SDT_PROBE_DEFINE2(sdt, xen, hardclock, systime__backward,
121 1.9 riastrad "uint64_t"/*last_systime_ns*/,
122 1.9 riastrad "uint64_t"/*this_systime_ns*/);
123 1.15 riastrad SDT_PROBE_DEFINE2(sdt, xen, hardclock, tick,
124 1.15 riastrad "uint64_t"/*last_systime_ns*/,
125 1.15 riastrad "uint64_t"/*this_systime_ns*/);
126 1.15 riastrad SDT_PROBE_DEFINE3(sdt, xen, hardclock, jump,
127 1.15 riastrad "uint64_t"/*last_systime_ns*/,
128 1.15 riastrad "uint64_t"/*this_systime_ns*/,
129 1.15 riastrad "uint64_t"/*nticks*/);
130 1.9 riastrad SDT_PROBE_DEFINE3(sdt, xen, hardclock, missed,
131 1.9 riastrad "uint64_t"/*last_systime_ns*/,
132 1.9 riastrad "uint64_t"/*this_systime_ns*/,
133 1.9 riastrad "uint64_t"/*remaining_ns*/);
134 1.9 riastrad
135 1.9 riastrad /*
136 1.2 bouyer * xen timecounter:
137 1.2 bouyer *
138 1.2 bouyer * Xen vCPU system time, plus an adjustment with rdtsc.
139 1.2 bouyer */
140 1.2 bouyer static struct timecounter xen_timecounter = {
141 1.2 bouyer .tc_get_timecount = xen_get_timecount,
142 1.2 bouyer .tc_poll_pps = NULL,
143 1.2 bouyer .tc_counter_mask = ~0U,
144 1.2 bouyer .tc_frequency = 1000000000ULL, /* 1 GHz, i.e. units of nanoseconds */
145 1.2 bouyer .tc_name = "xen_system_time",
146 1.2 bouyer .tc_quality = 10000,
147 1.2 bouyer };
148 1.2 bouyer
149 1.2 bouyer /*
150 1.2 bouyer * xen_global_systime_ns_stamp
151 1.2 bouyer *
152 1.2 bouyer * The latest Xen vCPU system time that has been observed on any
153 1.2 bouyer * CPU, for a global monotonic view of the Xen system time clock.
154 1.2 bouyer */
155 1.2 bouyer static volatile uint64_t xen_global_systime_ns_stamp __cacheline_aligned;
156 1.2 bouyer
#ifdef DOM0OPS
/*
 * xen timepush state:
 *
 *	Callout that periodically -- every sysctl-configurable number
 *	of NetBSD ticks -- sets the Xen hypervisor's wall clock time.
 */
static struct {
	struct callout ch;
	int ticks;
} xen_timepush;

static void	xen_timepush_init(void);
static void	xen_timepush_intr(void *);
static int	sysctl_xen_timepush(SYSCTLFN_ARGS);
#endif
173 1.2 bouyer
174 1.2 bouyer /*
175 1.2 bouyer * struct xen_vcputime_ticket
176 1.2 bouyer *
177 1.2 bouyer * State for a vCPU read section, during which a caller may read
178 1.22 riastrad * from fields of a struct vcpu_time_info and call rdtsc.
179 1.22 riastrad *
180 1.2 bouyer * Caller must enter with xen_vcputime_enter, exit with
181 1.2 bouyer * xen_vcputime_exit, and be prepared to retry if
182 1.2 bouyer * xen_vcputime_exit fails.
183 1.2 bouyer */
184 1.2 bouyer struct xen_vcputime_ticket {
185 1.2 bouyer uint64_t version;
186 1.2 bouyer };
187 1.2 bouyer
188 1.2 bouyer /*
189 1.2 bouyer * xen_vcputime_enter(tp)
190 1.2 bouyer *
191 1.2 bouyer * Enter a vCPU time read section and store a ticket in *tp, which
192 1.2 bouyer * the caller must use with xen_vcputime_exit. Return a pointer
193 1.2 bouyer * to the current CPU's vcpu_time_info structure. Caller must
194 1.2 bouyer * already be bound to the CPU.
195 1.2 bouyer */
196 1.2 bouyer static inline volatile struct vcpu_time_info *
197 1.2 bouyer xen_vcputime_enter(struct xen_vcputime_ticket *tp)
198 1.2 bouyer {
199 1.2 bouyer volatile struct vcpu_time_info *vt = &curcpu()->ci_vcpu->time;
200 1.2 bouyer
201 1.2 bouyer while (__predict_false(1 & (tp->version = vt->version)))
202 1.2 bouyer SPINLOCK_BACKOFF_HOOK;
203 1.2 bouyer
204 1.2 bouyer /*
205 1.2 bouyer * Must read the version before reading the tsc on the local
206 1.2 bouyer * pCPU. We are racing only with interruption by the
207 1.2 bouyer * hypervisor, so no need for a stronger memory barrier.
208 1.2 bouyer */
209 1.2 bouyer __insn_barrier();
210 1.2 bouyer
211 1.2 bouyer return vt;
212 1.2 bouyer }
213 1.2 bouyer
214 1.2 bouyer /*
215 1.2 bouyer * xen_vcputime_exit(vt, tp)
216 1.2 bouyer *
217 1.2 bouyer * Exit a vCPU time read section with the ticket in *tp from
218 1.2 bouyer * xen_vcputime_enter. Return true on success, false if caller
219 1.2 bouyer * must retry.
220 1.2 bouyer */
221 1.2 bouyer static inline bool
222 1.2 bouyer xen_vcputime_exit(volatile struct vcpu_time_info *vt,
223 1.2 bouyer struct xen_vcputime_ticket *tp)
224 1.2 bouyer {
225 1.2 bouyer
226 1.2 bouyer KASSERT(vt == &curcpu()->ci_vcpu->time);
227 1.2 bouyer
228 1.2 bouyer /*
229 1.2 bouyer * Must read the tsc before re-reading the version on the local
230 1.2 bouyer * pCPU. We are racing only with interruption by the
231 1.2 bouyer * hypervisor, so no need for a stronger memory barrier.
232 1.2 bouyer */
233 1.2 bouyer __insn_barrier();
234 1.2 bouyer
235 1.2 bouyer return tp->version == vt->version;
236 1.2 bouyer }
237 1.2 bouyer
/*
 * xen_tsc_to_ns_delta(delta_tsc, mul_frac, shift)
 *
 *	Convert a difference in tsc units to a difference in
 *	nanoseconds, given the 32.32 fixed-point multiplier and binary
 *	pre-shift that Xen supplies for the unit conversion.
 */
static inline uint64_t
xen_tsc_to_ns_delta(uint64_t delta_tsc, uint32_t tsc_to_system_mul,
    int8_t tsc_shift)
{
	uint64_t scaled;
	uint32_t hi, lo;

	/* Apply the hypervisor-specified pre-shift first. */
	scaled = (tsc_shift < 0)
	    ? (delta_tsc >> -tsc_shift)
	    : (delta_tsc << tsc_shift);

	hi = scaled >> 32;
	lo = scaled & 0xffffffffUL;

	/* d*m/2^32 = (2^32 d_h + d_l)*m/2^32 = d_h*m + (d_l*m)/2^32 */
	return ((uint64_t)hi * tsc_to_system_mul) +
	    (((uint64_t)lo * tsc_to_system_mul) >> 32);
}
263 1.2 bouyer
264 1.2 bouyer /*
265 1.2 bouyer * xen_vcputime_systime_ns()
266 1.2 bouyer *
267 1.2 bouyer * Return a snapshot of the Xen system time plus an adjustment
268 1.2 bouyer * from the tsc, in units of nanoseconds. Caller must be bound to
269 1.2 bouyer * the current CPU.
270 1.2 bouyer */
271 1.2 bouyer static uint64_t
272 1.2 bouyer xen_vcputime_systime_ns(void)
273 1.2 bouyer {
274 1.2 bouyer volatile struct vcpu_time_info *vt;
275 1.2 bouyer struct cpu_info *ci = curcpu();
276 1.2 bouyer struct xen_vcputime_ticket ticket;
277 1.2 bouyer uint64_t raw_systime_ns, tsc_timestamp, tsc, delta_tsc, delta_ns;
278 1.2 bouyer uint32_t tsc_to_system_mul;
279 1.2 bouyer int8_t tsc_shift;
280 1.2 bouyer uint64_t systime_ns;
281 1.2 bouyer
282 1.2 bouyer /* We'd better be bound to the CPU in _some_ way. */
283 1.2 bouyer KASSERT(cpu_intr_p() || cpu_softintr_p() || kpreempt_disabled() ||
284 1.2 bouyer (curlwp->l_flag & LP_BOUND));
285 1.2 bouyer
286 1.2 bouyer /*
287 1.2 bouyer * Repeatedly try to read the system time, corresponding tsc
288 1.2 bouyer * timestamp, and tsc frequency until we get a consistent view.
289 1.2 bouyer */
290 1.2 bouyer do {
291 1.2 bouyer vt = xen_vcputime_enter(&ticket);
292 1.2 bouyer
293 1.2 bouyer /* Grab Xen's snapshot of raw system time and tsc. */
294 1.2 bouyer raw_systime_ns = vt->system_time;
295 1.2 bouyer tsc_timestamp = vt->tsc_timestamp;
296 1.2 bouyer
297 1.2 bouyer /* Get Xen's current idea of how fast the tsc is counting. */
298 1.2 bouyer tsc_to_system_mul = vt->tsc_to_system_mul;
299 1.2 bouyer tsc_shift = vt->tsc_shift;
300 1.2 bouyer
301 1.2 bouyer /* Read the CPU's tsc. */
302 1.22 riastrad tsc = rdtsc();
303 1.2 bouyer } while (!xen_vcputime_exit(vt, &ticket));
304 1.2 bouyer
305 1.2 bouyer /*
306 1.2 bouyer * Out of paranoia, check whether the tsc has gone backwards
307 1.2 bouyer * since Xen's timestamp.
308 1.2 bouyer *
309 1.2 bouyer * This shouldn't happen because the Xen hypervisor is supposed
310 1.2 bouyer * to have read the tsc _before_ writing to the vcpu_time_info
311 1.2 bouyer * page, _before_ we read the tsc.
312 1.2 bouyer *
313 1.2 bouyer * Further, if we switched pCPUs after reading the tsc
314 1.2 bouyer * timestamp but before reading the CPU's tsc, the hypervisor
315 1.2 bouyer * had better notify us by updating the version too and forcing
316 1.2 bouyer * us to retry the vCPU time read.
317 1.2 bouyer */
318 1.2 bouyer if (__predict_false(tsc < tsc_timestamp)) {
319 1.2 bouyer /*
320 1.2 bouyer * Notify the console that the CPU's tsc appeared to
321 1.2 bouyer * run behind Xen's idea of it, and pretend it hadn't.
322 1.2 bouyer */
323 1.9 riastrad SDT_PROBE7(sdt, xen, clock, tsc__backward,
324 1.9 riastrad raw_systime_ns, tsc_timestamp,
325 1.9 riastrad tsc_to_system_mul, tsc_shift, /*delta_ns*/0, tsc,
326 1.9 riastrad /*systime_ns*/raw_systime_ns);
327 1.9 riastrad #if XEN_CLOCK_DEBUG
328 1.8 bouyer device_printf(ci->ci_dev, "xen cpu tsc %"PRIu64
329 1.2 bouyer " ran backwards from timestamp %"PRIu64
330 1.2 bouyer " by %"PRIu64"\n",
331 1.2 bouyer tsc, tsc_timestamp, tsc_timestamp - tsc);
332 1.2 bouyer #endif
333 1.2 bouyer ci->ci_xen_cpu_tsc_backwards_evcnt.ev_count++;
334 1.2 bouyer delta_ns = delta_tsc = 0;
335 1.2 bouyer } else {
336 1.2 bouyer /* Find how far the CPU's tsc has advanced. */
337 1.2 bouyer delta_tsc = tsc - tsc_timestamp;
338 1.2 bouyer
339 1.2 bouyer /* Convert the tsc delta to a nanosecond delta. */
340 1.2 bouyer delta_ns = xen_tsc_to_ns_delta(delta_tsc, tsc_to_system_mul,
341 1.2 bouyer tsc_shift);
342 1.2 bouyer }
343 1.2 bouyer
344 1.2 bouyer /*
345 1.2 bouyer * Notify the console if the delta computation yielded a
346 1.2 bouyer * negative, and pretend it hadn't.
347 1.2 bouyer *
348 1.2 bouyer * This doesn't make sense but I include it out of paranoia.
349 1.2 bouyer */
350 1.2 bouyer if (__predict_false((int64_t)delta_ns < 0)) {
351 1.9 riastrad SDT_PROBE7(sdt, xen, clock, tsc__delta__negative,
352 1.9 riastrad raw_systime_ns, tsc_timestamp,
353 1.9 riastrad tsc_to_system_mul, tsc_shift, delta_ns, tsc,
354 1.9 riastrad /*systime_ns*/raw_systime_ns);
355 1.9 riastrad #if XEN_CLOCK_DEBUG
356 1.9 riastrad device_printf(ci->ci_dev, "xen tsc delta in ns went negative:"
357 1.9 riastrad " %"PRId64"\n", delta_ns);
358 1.2 bouyer #endif
359 1.2 bouyer ci->ci_xen_tsc_delta_negative_evcnt.ev_count++;
360 1.2 bouyer delta_ns = 0;
361 1.2 bouyer }
362 1.2 bouyer
363 1.2 bouyer /*
364 1.2 bouyer * Compute the TSC-adjusted system time.
365 1.2 bouyer */
366 1.2 bouyer systime_ns = raw_systime_ns + delta_ns;
367 1.2 bouyer
368 1.2 bouyer /*
369 1.2 bouyer * Notify the console if the addition wrapped around.
370 1.2 bouyer *
371 1.2 bouyer * This shouldn't happen because system time should be relative
372 1.2 bouyer * to a reasonable reference point, not centuries in the past.
373 1.2 bouyer * (2^64 ns is approximately half a millennium.)
374 1.2 bouyer */
375 1.2 bouyer if (__predict_false(systime_ns < raw_systime_ns)) {
376 1.9 riastrad SDT_PROBE7(sdt, xen, clock, systime__wraparound,
377 1.9 riastrad raw_systime_ns, tsc_timestamp,
378 1.9 riastrad tsc_to_system_mul, tsc_shift, delta_ns, tsc,
379 1.9 riastrad systime_ns);
380 1.9 riastrad #if XEN_CLOCK_DEBUG
381 1.2 bouyer printf("xen raw systime + tsc delta wrapped around:"
382 1.2 bouyer " %"PRIu64" + %"PRIu64" = %"PRIu64"\n",
383 1.2 bouyer raw_systime_ns, delta_ns, systime_ns);
384 1.2 bouyer #endif
385 1.2 bouyer ci->ci_xen_raw_systime_wraparound_evcnt.ev_count++;
386 1.2 bouyer }
387 1.2 bouyer
388 1.2 bouyer /*
389 1.2 bouyer * Notify the console if the TSC-adjusted Xen system time
390 1.2 bouyer * appears to have gone backwards, and pretend we had gone
391 1.2 bouyer * forward. This seems to happen pretty regularly under load.
392 1.2 bouyer */
393 1.2 bouyer if (__predict_false(ci->ci_xen_last_systime_ns > systime_ns)) {
394 1.9 riastrad SDT_PROBE7(sdt, xen, clock, systime__backward,
395 1.9 riastrad raw_systime_ns, tsc_timestamp,
396 1.9 riastrad tsc_to_system_mul, tsc_shift, delta_ns, tsc,
397 1.9 riastrad systime_ns);
398 1.9 riastrad #if XEN_CLOCK_DEBUG
399 1.2 bouyer printf("xen raw systime + tsc delta went backwards:"
400 1.2 bouyer " %"PRIu64" > %"PRIu64"\n",
401 1.2 bouyer ci->ci_xen_last_systime_ns, systime_ns);
402 1.2 bouyer printf(" raw_systime_ns=%"PRIu64"\n tsc_timestamp=%"PRIu64"\n"
403 1.2 bouyer " tsc=%"PRIu64"\n tsc_to_system_mul=%"PRIu32"\n"
404 1.2 bouyer " tsc_shift=%"PRId8"\n delta_tsc=%"PRIu64"\n"
405 1.2 bouyer " delta_ns=%"PRIu64"\n",
406 1.2 bouyer raw_systime_ns, tsc_timestamp, tsc, tsc_to_system_mul,
407 1.2 bouyer tsc_shift, delta_tsc, delta_ns);
408 1.2 bouyer #endif
409 1.2 bouyer ci->ci_xen_raw_systime_backwards_evcnt.ev_count++;
410 1.2 bouyer systime_ns = ci->ci_xen_last_systime_ns + 1;
411 1.2 bouyer }
412 1.2 bouyer
413 1.2 bouyer /* Remember the TSC-adjusted Xen system time. */
414 1.2 bouyer ci->ci_xen_last_systime_ns = systime_ns;
415 1.2 bouyer
416 1.2 bouyer /* We had better not have migrated CPUs. */
417 1.2 bouyer KASSERT(ci == curcpu());
418 1.2 bouyer
419 1.2 bouyer /* And we're done: return the TSC-adjusted systime in nanoseconds. */
420 1.2 bouyer return systime_ns;
421 1.2 bouyer }
422 1.2 bouyer
423 1.2 bouyer /*
424 1.2 bouyer * xen_vcputime_raw_systime_ns()
425 1.2 bouyer *
426 1.2 bouyer * Return a snapshot of the current Xen system time to the
427 1.2 bouyer * resolution of the Xen hypervisor tick, in units of nanoseconds.
428 1.2 bouyer */
429 1.2 bouyer static uint64_t
430 1.2 bouyer xen_vcputime_raw_systime_ns(void)
431 1.2 bouyer {
432 1.2 bouyer volatile struct vcpu_time_info *vt;
433 1.2 bouyer struct xen_vcputime_ticket ticket;
434 1.2 bouyer uint64_t raw_systime_ns;
435 1.2 bouyer
436 1.2 bouyer do {
437 1.2 bouyer vt = xen_vcputime_enter(&ticket);
438 1.2 bouyer raw_systime_ns = vt->system_time;
439 1.2 bouyer } while (!xen_vcputime_exit(vt, &ticket));
440 1.2 bouyer
441 1.2 bouyer return raw_systime_ns;
442 1.2 bouyer }
443 1.2 bouyer
444 1.2 bouyer /*
445 1.2 bouyer * struct xen_wallclock_ticket
446 1.2 bouyer *
447 1.2 bouyer * State for a wall clock read section, during which a caller may
448 1.2 bouyer * read from the wall clock fields of HYPERVISOR_shared_info.
449 1.2 bouyer * Caller must enter with xen_wallclock_enter, exit with
450 1.2 bouyer * xen_wallclock_exit, and be prepared to retry if
451 1.2 bouyer * xen_wallclock_exit fails.
452 1.2 bouyer */
453 1.2 bouyer struct xen_wallclock_ticket {
454 1.2 bouyer uint32_t version;
455 1.2 bouyer };
456 1.2 bouyer
457 1.2 bouyer /*
458 1.2 bouyer * xen_wallclock_enter(tp)
459 1.2 bouyer *
460 1.2 bouyer * Enter a wall clock read section and store a ticket in *tp,
461 1.2 bouyer * which the caller must use with xen_wallclock_exit.
462 1.2 bouyer */
463 1.2 bouyer static inline void
464 1.2 bouyer xen_wallclock_enter(struct xen_wallclock_ticket *tp)
465 1.2 bouyer {
466 1.2 bouyer
467 1.2 bouyer while (__predict_false(1 & (tp->version =
468 1.2 bouyer HYPERVISOR_shared_info->wc_version)))
469 1.2 bouyer SPINLOCK_BACKOFF_HOOK;
470 1.2 bouyer
471 1.2 bouyer /*
472 1.2 bouyer * Must read the version from memory before reading the
473 1.2 bouyer * timestamp from memory, as written potentially by another
474 1.2 bouyer * pCPU.
475 1.2 bouyer */
476 1.2 bouyer membar_consumer();
477 1.2 bouyer }
478 1.2 bouyer
479 1.2 bouyer /*
480 1.2 bouyer * xen_wallclock_exit(tp)
481 1.2 bouyer *
482 1.2 bouyer * Exit a wall clock read section with the ticket in *tp from
483 1.2 bouyer * xen_wallclock_enter. Return true on success, false if caller
484 1.2 bouyer * must retry.
485 1.2 bouyer */
486 1.2 bouyer static inline bool
487 1.2 bouyer xen_wallclock_exit(struct xen_wallclock_ticket *tp)
488 1.2 bouyer {
489 1.2 bouyer
490 1.2 bouyer /*
491 1.2 bouyer * Must read the timestamp from memory before re-reading the
492 1.2 bouyer * version from memory, as written potentially by another pCPU.
493 1.2 bouyer */
494 1.2 bouyer membar_consumer();
495 1.2 bouyer
496 1.2 bouyer return tp->version == HYPERVISOR_shared_info->wc_version;
497 1.2 bouyer }
498 1.2 bouyer
499 1.2 bouyer /*
500 1.2 bouyer * xen_global_systime_ns()
501 1.2 bouyer *
502 1.2 bouyer * Return a global monotonic view of the system time in
503 1.2 bouyer * nanoseconds, computed by the per-CPU Xen raw system time plus
504 1.2 bouyer * an rdtsc adjustment, and advance the view of the system time
505 1.2 bouyer * for all other CPUs.
506 1.2 bouyer */
507 1.2 bouyer static uint64_t
508 1.2 bouyer xen_global_systime_ns(void)
509 1.2 bouyer {
510 1.2 bouyer struct cpu_info *ci;
511 1.11 riastrad uint64_t local, global, skew, result;
512 1.2 bouyer
513 1.2 bouyer /*
514 1.2 bouyer * Find the local timecount on this CPU, and make sure it does
515 1.2 bouyer * not precede the latest global timecount witnessed so far by
516 1.2 bouyer * any CPU. If it does, add to the local CPU's skew from the
517 1.2 bouyer * fastest CPU.
518 1.2 bouyer *
519 1.2 bouyer * XXX Can we avoid retrying if the CAS fails?
520 1.2 bouyer */
521 1.2 bouyer int s = splsched(); /* make sure we won't be interrupted */
522 1.2 bouyer ci = curcpu();
523 1.2 bouyer do {
524 1.2 bouyer local = xen_vcputime_systime_ns();
525 1.11 riastrad skew = ci->ci_xen_systime_ns_skew;
526 1.2 bouyer global = xen_global_systime_ns_stamp;
527 1.11 riastrad if (__predict_false(local + skew < global + 1)) {
528 1.11 riastrad SDT_PROBE3(sdt, xen, timecounter, backward,
529 1.11 riastrad local, skew, global);
530 1.11 riastrad #if XEN_CLOCK_DEBUG
531 1.11 riastrad device_printf(ci->ci_dev,
532 1.11 riastrad "xen timecounter went backwards:"
533 1.11 riastrad " local=%"PRIu64" skew=%"PRIu64" global=%"PRIu64","
534 1.11 riastrad " adding %"PRIu64" to skew\n",
535 1.11 riastrad local, skew, global, global + 1 - (local + skew));
536 1.11 riastrad #endif
537 1.11 riastrad ci->ci_xen_timecounter_backwards_evcnt.ev_count++;
538 1.2 bouyer result = global + 1;
539 1.11 riastrad ci->ci_xen_systime_ns_skew += global + 1 -
540 1.11 riastrad (local + skew);
541 1.2 bouyer } else {
542 1.11 riastrad result = local + skew;
543 1.2 bouyer }
544 1.2 bouyer } while (atomic_cas_64(&xen_global_systime_ns_stamp, global, result)
545 1.2 bouyer != global);
546 1.2 bouyer KASSERT(ci == curcpu());
547 1.2 bouyer splx(s);
548 1.2 bouyer
549 1.2 bouyer return result;
550 1.2 bouyer }
551 1.2 bouyer
552 1.2 bouyer /*
553 1.2 bouyer * xen_get_timecount(tc)
554 1.2 bouyer *
555 1.2 bouyer * Return the low 32 bits of a global monotonic view of the Xen
556 1.2 bouyer * system time.
557 1.2 bouyer */
558 1.2 bouyer static unsigned
559 1.2 bouyer xen_get_timecount(struct timecounter *tc)
560 1.2 bouyer {
561 1.2 bouyer
562 1.2 bouyer KASSERT(tc == &xen_timecounter);
563 1.2 bouyer
564 1.2 bouyer return (unsigned)xen_global_systime_ns();
565 1.2 bouyer }
566 1.2 bouyer
567 1.2 bouyer /*
568 1.2 bouyer * xen_delay(n)
569 1.2 bouyer *
570 1.2 bouyer * Wait approximately n microseconds.
571 1.2 bouyer */
572 1.2 bouyer void
573 1.2 bouyer xen_delay(unsigned n)
574 1.2 bouyer {
575 1.2 bouyer int bound;
576 1.2 bouyer
577 1.2 bouyer /* Bind to the CPU so we don't compare tsc on different CPUs. */
578 1.2 bouyer bound = curlwp_bind();
579 1.2 bouyer
580 1.2 bouyer if (curcpu()->ci_vcpu == NULL) {
581 1.2 bouyer curlwp_bindx(bound);
582 1.2 bouyer return;
583 1.2 bouyer }
584 1.2 bouyer
585 1.2 bouyer /* Short wait (<500us) or long wait? */
586 1.2 bouyer if (n < 500000) {
587 1.2 bouyer /*
588 1.2 bouyer * Xen system time is not precise enough for short
589 1.2 bouyer * delays, so use the tsc instead.
590 1.2 bouyer *
591 1.2 bouyer * We work with the current tsc frequency, and figure
592 1.2 bouyer * that if it changes while we're delaying, we've
593 1.2 bouyer * probably delayed long enough -- up to 500us.
594 1.2 bouyer *
595 1.2 bouyer * We do not use cpu_frequency(ci), which uses a
596 1.2 bouyer * quantity detected at boot time, and which may have
597 1.2 bouyer * changed by now if Xen has migrated this vCPU to
598 1.2 bouyer * another pCPU.
599 1.2 bouyer *
600 1.2 bouyer * XXX How long does it take to migrate pCPUs?
601 1.2 bouyer */
602 1.2 bouyer volatile struct vcpu_time_info *vt;
603 1.2 bouyer struct xen_vcputime_ticket ticket;
604 1.2 bouyer uint64_t tsc_start, last_tsc, tsc;
605 1.2 bouyer uint32_t tsc_to_system_mul;
606 1.2 bouyer int8_t tsc_shift;
607 1.2 bouyer
608 1.2 bouyer /* Get the starting tsc and tsc frequency. */
609 1.2 bouyer do {
610 1.2 bouyer vt = xen_vcputime_enter(&ticket);
611 1.22 riastrad tsc_start = last_tsc = rdtsc();
612 1.2 bouyer tsc_to_system_mul = vt->tsc_to_system_mul;
613 1.2 bouyer tsc_shift = vt->tsc_shift;
614 1.2 bouyer } while (!xen_vcputime_exit(vt, &ticket));
615 1.2 bouyer
616 1.2 bouyer /*
617 1.2 bouyer * Wait until as many tsc ticks as there are in n
618 1.2 bouyer * microseconds have elapsed, or the tsc has gone
619 1.2 bouyer * backwards meaning we've probably migrated pCPUs.
620 1.2 bouyer */
621 1.2 bouyer for (;;) {
622 1.22 riastrad tsc = rdtsc();
623 1.2 bouyer if (__predict_false(tsc < last_tsc))
624 1.2 bouyer break;
625 1.2 bouyer if (xen_tsc_to_ns_delta(tsc - tsc_start,
626 1.2 bouyer tsc_to_system_mul, tsc_shift)/1000 >= n)
627 1.2 bouyer break;
628 1.2 bouyer last_tsc = tsc;
629 1.2 bouyer }
630 1.2 bouyer } else {
631 1.2 bouyer /*
632 1.2 bouyer * Use the Xen system time for >=500us delays. From my
633 1.2 bouyer * testing, it seems to sometimes run backward by about
634 1.2 bouyer * 110us, which is not so bad.
635 1.2 bouyer */
636 1.2 bouyer uint64_t n_ns = 1000*(uint64_t)n;
637 1.2 bouyer uint64_t start_ns;
638 1.2 bouyer
639 1.2 bouyer /* Get the start time. */
640 1.2 bouyer start_ns = xen_vcputime_raw_systime_ns();
641 1.2 bouyer
642 1.2 bouyer /* Wait until the system time has passed the end. */
643 1.2 bouyer do {
644 1.2 bouyer HYPERVISOR_yield();
645 1.2 bouyer } while (xen_vcputime_raw_systime_ns() - start_ns < n_ns);
646 1.2 bouyer }
647 1.2 bouyer
648 1.2 bouyer /* Unbind from the CPU if we weren't already bound. */
649 1.2 bouyer curlwp_bindx(bound);
650 1.2 bouyer }
651 1.2 bouyer
652 1.2 bouyer /*
653 1.2 bouyer * xen_suspendclocks(ci)
654 1.2 bouyer *
655 1.2 bouyer * Stop handling the Xen timer event on the CPU of ci. Caller
656 1.2 bouyer * must be running on and bound to ci's CPU.
657 1.2 bouyer *
658 1.2 bouyer * Actually, caller must have kpreemption disabled, because that's
659 1.2 bouyer * easier to assert at the moment.
660 1.2 bouyer */
661 1.2 bouyer void
662 1.2 bouyer xen_suspendclocks(struct cpu_info *ci)
663 1.2 bouyer {
664 1.2 bouyer int evtch;
665 1.2 bouyer
666 1.2 bouyer KASSERT(ci == curcpu());
667 1.2 bouyer KASSERT(kpreempt_disabled());
668 1.2 bouyer
669 1.13 riastrad /*
670 1.13 riastrad * Find the VIRQ_TIMER event channel and close it so new timer
671 1.13 riastrad * interrupt events stop getting delivered to it.
672 1.13 riastrad *
673 1.13 riastrad * XXX Should this happen later? This is not the reverse order
674 1.13 riastrad * of xen_resumeclocks. It is apparently necessary in this
675 1.13 riastrad * order only because we don't stash evtchn anywhere, but we
676 1.13 riastrad * could stash it.
677 1.13 riastrad */
678 1.2 bouyer evtch = unbind_virq_from_evtch(VIRQ_TIMER);
679 1.2 bouyer KASSERT(evtch != -1);
680 1.2 bouyer
681 1.13 riastrad /*
682 1.13 riastrad * Mask the event channel so we stop getting new interrupts on
683 1.13 riastrad * it.
684 1.13 riastrad */
685 1.2 bouyer hypervisor_mask_event(evtch);
686 1.13 riastrad
687 1.13 riastrad /*
688 1.13 riastrad * Now that we are no longer getting new interrupts, remove the
689 1.13 riastrad * handler and wait for any existing calls to the handler to
690 1.13 riastrad * complete. After this point, there can be no concurrent
691 1.13 riastrad * calls to xen_timer_handler.
692 1.13 riastrad */
693 1.13 riastrad event_remove_handler(evtch,
694 1.2 bouyer __FPTRCAST(int (*)(void *), xen_timer_handler), ci);
695 1.2 bouyer
696 1.2 bouyer aprint_verbose("Xen clock: removed event channel %d\n", evtch);
697 1.2 bouyer
698 1.2 bouyer /* We'd better not have switched CPUs. */
699 1.2 bouyer KASSERT(ci == curcpu());
700 1.2 bouyer }
701 1.2 bouyer
702 1.2 bouyer /*
703 1.2 bouyer * xen_resumeclocks(ci)
704 1.2 bouyer *
705 1.2 bouyer * Start handling the Xen timer event on the CPU of ci. Arm the
706 1.2 bouyer * Xen timer. Caller must be running on and bound to ci's CPU.
707 1.2 bouyer *
708 1.2 bouyer * Actually, caller must have kpreemption disabled, because that's
709 1.2 bouyer * easier to assert at the moment.
710 1.2 bouyer */
711 1.2 bouyer void
712 1.2 bouyer xen_resumeclocks(struct cpu_info *ci)
713 1.2 bouyer {
714 1.2 bouyer char intr_xname[INTRDEVNAMEBUF];
715 1.2 bouyer int evtch;
716 1.2 bouyer int error __diagused;
717 1.2 bouyer
718 1.2 bouyer KASSERT(ci == curcpu());
719 1.2 bouyer KASSERT(kpreempt_disabled());
720 1.2 bouyer
721 1.13 riastrad /*
722 1.13 riastrad * Allocate an event channel to receive VIRQ_TIMER events.
723 1.13 riastrad */
724 1.2 bouyer evtch = bind_virq_to_evtch(VIRQ_TIMER);
725 1.2 bouyer KASSERT(evtch != -1);
726 1.2 bouyer
727 1.13 riastrad /*
728 1.13 riastrad * Set an event handler for VIRQ_TIMER events to call
729 1.13 riastrad * xen_timer_handler.
730 1.13 riastrad */
731 1.2 bouyer snprintf(intr_xname, sizeof(intr_xname), "%s clock",
732 1.2 bouyer device_xname(ci->ci_dev));
733 1.2 bouyer /* XXX sketchy function pointer cast -- fix the API, please */
734 1.2 bouyer if (event_set_handler(evtch,
735 1.2 bouyer __FPTRCAST(int (*)(void *), xen_timer_handler),
736 1.5 bouyer ci, IPL_CLOCK, NULL, intr_xname, true, ci) == NULL)
737 1.2 bouyer panic("failed to establish timer interrupt handler");
738 1.2 bouyer
739 1.2 bouyer aprint_verbose("Xen %s: using event channel %d\n", intr_xname, evtch);
740 1.2 bouyer
741 1.2 bouyer /* Disarm the periodic timer on Xen>=3.1 which is allegedly buggy. */
742 1.2 bouyer if (XEN_MAJOR(xen_version) > 3 || XEN_MINOR(xen_version) > 0) {
743 1.2 bouyer error = HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer,
744 1.2 bouyer ci->ci_vcpuid, NULL);
745 1.2 bouyer KASSERT(error == 0);
746 1.2 bouyer }
747 1.2 bouyer
748 1.2 bouyer /* Pretend the last hardclock happened right now. */
749 1.2 bouyer ci->ci_xen_hardclock_systime_ns = xen_vcputime_systime_ns();
750 1.2 bouyer
751 1.2 bouyer /* Arm the one-shot timer. */
752 1.2 bouyer error = HYPERVISOR_set_timer_op(ci->ci_xen_hardclock_systime_ns +
753 1.2 bouyer NS_PER_TICK);
754 1.2 bouyer KASSERT(error == 0);
755 1.13 riastrad
756 1.13 riastrad /*
757 1.13 riastrad * Ready to go. Unmask the event. After this point, Xen may
758 1.13 riastrad * start calling xen_timer_handler.
759 1.13 riastrad */
760 1.12 bouyer hypervisor_unmask_event(evtch);
761 1.2 bouyer
762 1.2 bouyer /* We'd better not have switched CPUs. */
763 1.2 bouyer KASSERT(ci == curcpu());
764 1.2 bouyer }
765 1.2 bouyer
/*
 * xen_timer_handler(cookie, frame)
 *
 *	Periodic Xen timer event handler for NetBSD hardclock.  Calls
 *	to this may get delayed, so we run hardclock as many times as
 *	we need to in order to cover the Xen system time that elapsed.
 *	After that, re-arm the timer to run again at the next tick.
 *	The cookie is the pointer to struct cpu_info.
 *
 *	Always returns 0 (handled); panics if the one-shot timer
 *	cannot be re-armed even at a retry time in the near future.
 */
static int
xen_timer_handler(void *cookie, struct clockframe *frame)
{
	const uint64_t ns_per_tick = NS_PER_TICK;
	struct cpu_info *ci = curcpu();
	uint64_t last, now, delta, next;
	int error;

	KASSERT(cpu_intr_p());
	KASSERT(cookie == ci);

#if defined(XENPV)
	frame = NULL; /* We use values cached in curcpu() */
#endif
	/*
	 * Find how many nanoseconds of Xen system time has elapsed
	 * since the last hardclock tick.
	 */
	last = ci->ci_xen_hardclock_systime_ns;
	now = xen_vcputime_systime_ns();
	SDT_PROBE2(sdt, xen, hardclock, tick, last, now);
	if (__predict_false(now < last)) {
		SDT_PROBE2(sdt, xen, hardclock, systime__backward,
		    last, now);
#if XEN_CLOCK_DEBUG
		device_printf(ci->ci_dev, "xen systime ran backwards"
		    " in hardclock %"PRIu64"ns\n",
		    last - now);
#endif
		ci->ci_xen_systime_backwards_hardclock_evcnt.ev_count++;
		/*
		 * We have lost track of time.  Pretend that exactly
		 * one tick has elapsed, and reset our idea of the
		 * last tick accordingly.
		 */
		ci->ci_xen_hardclock_systime_ns = last = now - ns_per_tick;
	}
	delta = now - last;

	/*
	 * Play hardclock catchup: run the hardclock timer as many
	 * times as appears necessary based on how much time has
	 * passed.
	 */
	if (__predict_false(delta >= 2*ns_per_tick)) {
		SDT_PROBE3(sdt, xen, hardclock, jump,
		    last, now, delta/ns_per_tick);

		/*
		 * Warn if we violate timecounter(9) contract: with a
		 * k-bit timecounter (here k = 32), and timecounter
		 * frequency f (here f = 1 GHz), the maximum period
		 * between hardclock calls is 2^k / f.
		 */
		if (delta > xen_timecounter.tc_counter_mask) {
			printf("WARNING: hardclock skipped %"PRIu64"ns"
			    " (%"PRIu64" -> %"PRIu64"),"
			    " exceeding maximum of %"PRIu32"ns"
			    " for timecounter(9)\n",
			    last, now, delta,
			    xen_timecounter.tc_counter_mask);
			ci->ci_xen_timecounter_jump_evcnt.ev_count++;
		}
		/* Don't try to catch up more than one second at once.  */
		if (delta > 1000000000UL)
			delta = 1000000000UL;
	}
	while (delta >= ns_per_tick) {
		ci->ci_xen_hardclock_systime_ns += ns_per_tick;
		delta -= ns_per_tick;
		hardclock(frame);
		/* Count each extra pending tick as a missed hardclock. */
		if (__predict_false(delta >= ns_per_tick)) {
			SDT_PROBE3(sdt, xen, hardclock, missed,
			    last, now, delta);
			ci->ci_xen_missed_hardclock_evcnt.ev_count++;
		}
	}

	/*
	 * Re-arm the timer.  If it fails, it's probably because the
	 * time is in the past, possibly because we're in the
	 * process of catching up missed hardclock calls.
	 * In this case schedule a tick in the near future.
	 */
	next = ci->ci_xen_hardclock_systime_ns + ns_per_tick;
	error = HYPERVISOR_set_timer_op(next);
	if (error) {
		next = xen_vcputime_systime_ns() + ns_per_tick / 2;
		error = HYPERVISOR_set_timer_op(next);
		if (error) {
			panic("failed to re-arm Xen timer %d", error);
		}
	}

	/* Success! */
	return 0;
}
871 1.2 bouyer
872 1.2 bouyer /*
873 1.6 ad * xen_initclocks()
874 1.2 bouyer *
875 1.2 bouyer * Initialize the Xen clocks on the current CPU.
876 1.2 bouyer */
877 1.2 bouyer void
878 1.6 ad xen_initclocks(void)
879 1.2 bouyer {
880 1.2 bouyer struct cpu_info *ci = curcpu();
881 1.2 bouyer
882 1.2 bouyer /* If this is the primary CPU, do global initialization first. */
883 1.2 bouyer if (ci == &cpu_info_primary) {
884 1.2 bouyer /* Initialize the systemwide Xen timecounter. */
885 1.2 bouyer tc_init(&xen_timecounter);
886 1.2 bouyer }
887 1.2 bouyer
888 1.2 bouyer /* Attach the event counters. */
889 1.2 bouyer evcnt_attach_dynamic(&ci->ci_xen_cpu_tsc_backwards_evcnt,
890 1.2 bouyer EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
891 1.2 bouyer "cpu tsc ran backwards");
892 1.2 bouyer evcnt_attach_dynamic(&ci->ci_xen_tsc_delta_negative_evcnt,
893 1.2 bouyer EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
894 1.2 bouyer "tsc delta went negative");
895 1.2 bouyer evcnt_attach_dynamic(&ci->ci_xen_raw_systime_wraparound_evcnt,
896 1.2 bouyer EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
897 1.2 bouyer "raw systime wrapped around");
898 1.2 bouyer evcnt_attach_dynamic(&ci->ci_xen_raw_systime_backwards_evcnt,
899 1.2 bouyer EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
900 1.2 bouyer "raw systime went backwards");
901 1.2 bouyer evcnt_attach_dynamic(&ci->ci_xen_systime_backwards_hardclock_evcnt,
902 1.2 bouyer EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
903 1.2 bouyer "systime went backwards in hardclock");
904 1.2 bouyer evcnt_attach_dynamic(&ci->ci_xen_missed_hardclock_evcnt,
905 1.2 bouyer EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
906 1.2 bouyer "missed hardclock");
907 1.11 riastrad evcnt_attach_dynamic(&ci->ci_xen_timecounter_backwards_evcnt,
908 1.11 riastrad EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
909 1.11 riastrad "timecounter went backwards");
910 1.17 riastrad evcnt_attach_dynamic(&ci->ci_xen_timecounter_jump_evcnt,
911 1.17 riastrad EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
912 1.17 riastrad "hardclock jumped past timecounter max");
913 1.2 bouyer
914 1.2 bouyer /* Fire up the clocks. */
915 1.2 bouyer xen_resumeclocks(ci);
916 1.2 bouyer
917 1.2 bouyer #ifdef DOM0OPS
918 1.2 bouyer /*
919 1.2 bouyer * If this is a privileged dom0, start pushing the wall
920 1.2 bouyer * clock time back to the Xen hypervisor.
921 1.2 bouyer */
922 1.6 ad if (ci == &cpu_info_primary && xendomain_is_privileged())
923 1.2 bouyer xen_timepush_init();
924 1.2 bouyer #endif
925 1.2 bouyer }
926 1.2 bouyer
927 1.2 bouyer #ifdef DOM0OPS
928 1.2 bouyer
929 1.2 bouyer /*
930 1.2 bouyer * xen_timepush_init()
931 1.2 bouyer *
932 1.2 bouyer * Initialize callout to periodically set Xen hypervisor's wall
933 1.2 bouyer * clock time.
934 1.2 bouyer */
935 1.2 bouyer static void
936 1.2 bouyer xen_timepush_init(void)
937 1.2 bouyer {
938 1.2 bouyer struct sysctllog *log = NULL;
939 1.2 bouyer const struct sysctlnode *node = NULL;
940 1.2 bouyer int error;
941 1.2 bouyer
942 1.2 bouyer /* Start periodically updating the hypervisor's wall clock time. */
943 1.2 bouyer callout_init(&xen_timepush.ch, 0);
944 1.2 bouyer callout_setfunc(&xen_timepush.ch, xen_timepush_intr, NULL);
945 1.2 bouyer
946 1.2 bouyer /* Pick a default frequency for timepush. */
947 1.2 bouyer xen_timepush.ticks = 53*hz + 3; /* avoid exact # of min/sec */
948 1.2 bouyer
949 1.2 bouyer /* Create machdep.xen node. */
950 1.2 bouyer /* XXX Creation of the `machdep.xen' node should be elsewhere. */
951 1.2 bouyer error = sysctl_createv(&log, 0, NULL, &node, 0,
952 1.2 bouyer CTLTYPE_NODE, "xen",
953 1.2 bouyer SYSCTL_DESCR("Xen top level node"),
954 1.2 bouyer NULL, 0, NULL, 0,
955 1.2 bouyer CTL_MACHDEP, CTL_CREATE, CTL_EOL);
956 1.2 bouyer if (error)
957 1.2 bouyer goto fail;
958 1.2 bouyer KASSERT(node != NULL);
959 1.2 bouyer
960 1.2 bouyer /* Create int machdep.xen.timepush_ticks knob. */
961 1.2 bouyer error = sysctl_createv(&log, 0, NULL, NULL, CTLFLAG_READWRITE,
962 1.2 bouyer CTLTYPE_INT, "timepush_ticks",
963 1.2 bouyer SYSCTL_DESCR("How often to update the hypervisor's time-of-day;"
964 1.2 bouyer " 0 to disable"),
965 1.2 bouyer sysctl_xen_timepush, 0, &xen_timepush.ticks, 0,
966 1.2 bouyer CTL_CREATE, CTL_EOL);
967 1.2 bouyer if (error)
968 1.2 bouyer goto fail;
969 1.2 bouyer
970 1.2 bouyer /* Start the timepush callout. */
971 1.2 bouyer callout_schedule(&xen_timepush.ch, xen_timepush.ticks);
972 1.2 bouyer
973 1.2 bouyer /* Success! */
974 1.2 bouyer return;
975 1.2 bouyer
976 1.2 bouyer fail: sysctl_teardown(&log);
977 1.2 bouyer }
978 1.2 bouyer
979 1.2 bouyer /*
980 1.2 bouyer * xen_timepush_intr(cookie)
981 1.2 bouyer *
982 1.2 bouyer * Callout interrupt handler to push NetBSD's idea of the wall
983 1.2 bouyer * clock time, usually synchronized with NTP, back to the Xen
984 1.2 bouyer * hypervisor.
985 1.2 bouyer */
986 1.2 bouyer static void
987 1.2 bouyer xen_timepush_intr(void *cookie)
988 1.2 bouyer {
989 1.2 bouyer
990 1.2 bouyer resettodr();
991 1.2 bouyer if (xen_timepush.ticks)
992 1.2 bouyer callout_schedule(&xen_timepush.ch, xen_timepush.ticks);
993 1.2 bouyer }
994 1.2 bouyer
995 1.2 bouyer /*
996 1.2 bouyer * sysctl_xen_timepush(...)
997 1.2 bouyer *
998 1.2 bouyer * Sysctl handler to set machdep.xen.timepush_ticks.
999 1.2 bouyer */
1000 1.2 bouyer static int
1001 1.2 bouyer sysctl_xen_timepush(SYSCTLFN_ARGS)
1002 1.2 bouyer {
1003 1.2 bouyer struct sysctlnode node;
1004 1.2 bouyer int ticks;
1005 1.2 bouyer int error;
1006 1.2 bouyer
1007 1.2 bouyer ticks = xen_timepush.ticks;
1008 1.2 bouyer node = *rnode;
1009 1.2 bouyer node.sysctl_data = &ticks;
1010 1.2 bouyer error = sysctl_lookup(SYSCTLFN_CALL(&node));
1011 1.2 bouyer if (error || newp == NULL)
1012 1.2 bouyer return error;
1013 1.2 bouyer
1014 1.2 bouyer if (ticks < 0)
1015 1.2 bouyer return EINVAL;
1016 1.2 bouyer
1017 1.2 bouyer if (ticks != xen_timepush.ticks) {
1018 1.2 bouyer xen_timepush.ticks = ticks;
1019 1.2 bouyer
1020 1.2 bouyer if (ticks == 0)
1021 1.2 bouyer callout_stop(&xen_timepush.ch);
1022 1.2 bouyer else
1023 1.2 bouyer callout_schedule(&xen_timepush.ch, ticks);
1024 1.2 bouyer }
1025 1.2 bouyer
1026 1.2 bouyer return 0;
1027 1.2 bouyer }
1028 1.2 bouyer
1029 1.2 bouyer #endif /* DOM0OPS */
1030 1.2 bouyer
/* Forward declarations for the time-of-day (todr) glue below. */
static int xen_rtc_get(struct todr_chip_handle *, struct timeval *);
static int xen_rtc_set(struct todr_chip_handle *, struct timeval *);
static void xen_wallclock_time(struct timespec *);
/*
 * xen time of day register:
 *
 *	Xen wall clock time, plus a Xen vCPU system time adjustment.
 */
static struct todr_chip_handle xen_todr_chip = {
	.todr_gettime = xen_rtc_get,
	.todr_settime = xen_rtc_set,
};
1043 1.2 bouyer
/*
 * xen_startrtclock()
 *
 *	Initialize the real-time clock from x86 machdep autoconf:
 *	register the Xen wall-clock-backed time-of-day handle with
 *	the todr(9) machinery.
 */
void
xen_startrtclock(void)
{

	todr_attach(&xen_todr_chip);
}
1055 1.2 bouyer
1056 1.2 bouyer /*
1057 1.2 bouyer * xen_rtc_get(todr, tv)
1058 1.2 bouyer *
1059 1.2 bouyer * Get the current real-time clock from the Xen wall clock time
1060 1.2 bouyer * and vCPU system time adjustment.
1061 1.2 bouyer */
1062 1.2 bouyer static int
1063 1.2 bouyer xen_rtc_get(struct todr_chip_handle *todr, struct timeval *tvp)
1064 1.2 bouyer {
1065 1.2 bouyer struct timespec ts;
1066 1.2 bouyer
1067 1.2 bouyer xen_wallclock_time(&ts);
1068 1.2 bouyer TIMESPEC_TO_TIMEVAL(tvp, &ts);
1069 1.2 bouyer
1070 1.2 bouyer return 0;
1071 1.2 bouyer }
1072 1.2 bouyer
/*
 * xen_rtc_set(todr, tv)
 *
 *	todr(9) settime method: set the Xen wall clock time, if we
 *	can (i.e., if we are a privileged dom0 with DOM0OPS);
 *	otherwise report success without doing anything.
 */
static int
xen_rtc_set(struct todr_chip_handle *todr, struct timeval *tvp)
{
#ifdef DOM0OPS
	if (xendomain_is_privileged()) {
		struct clock_ymdhms ymdhms;
		xen_platform_op_t op;
		uint64_t systime_ns;

		/* Convert to ymdhms and set the x86 ISA RTC. */
		clock_secs_to_ymdhms(tvp->tv_sec, &ymdhms);
		rtc_set_ymdhms(NULL, &ymdhms);

		/* Get the global system time so we can preserve it. */
		systime_ns = xen_global_systime_ns();

		/* Set the hypervisor wall clock time. */
		memset(&op, 0, sizeof(op));
		op.cmd = XENPF_settime;
		op.u.settime.secs = tvp->tv_sec;
		op.u.settime.nsecs = tvp->tv_usec * 1000;
		op.u.settime.system_time = systime_ns;
		return HYPERVISOR_platform_op(&op);
	}
#endif

	/* XXX Should this fail if not on privileged dom0? */
	return 0;
}
1107 1.2 bouyer
/*
 * xen_wallclock_time(tsp)
 *
 *	Return a snapshot of the current low-resolution wall clock
 *	time, as reported by the hypervisor, in tsp: the hypervisor's
 *	wall clock sample plus the elapsed global system time.
 */
static void
xen_wallclock_time(struct timespec *tsp)
{
	struct xen_wallclock_ticket ticket;
	uint64_t systime_ns;

	int s = splsched(); /* make sure we won't be interrupted */
	/*
	 * Read the last wall clock sample from the hypervisor,
	 * retrying until xen_wallclock_exit() confirms the sample
	 * was not updated while we were reading it.
	 */
	do {
		xen_wallclock_enter(&ticket);
		tsp->tv_sec = HYPERVISOR_shared_info->wc_sec;
		tsp->tv_nsec = HYPERVISOR_shared_info->wc_nsec;
	} while (!xen_wallclock_exit(&ticket));

	/* Get the global system time. */
	systime_ns = xen_global_systime_ns();
	splx(s);

	/*
	 * Add the system time to the wall clock time, carrying any
	 * nanosecond overflow into the seconds field.
	 */
	systime_ns += tsp->tv_nsec;
	tsp->tv_sec += systime_ns / 1000000000ull;
	tsp->tv_nsec = systime_ns % 1000000000ull;
}
1137 1.2 bouyer
#ifdef XENPV
/*
 * setstatclockrate(rate)
 *
 *	Set the statclock to run at rate, in units of ticks per second.
 *
 *	Currently Xen does not have a separate statclock, so this is a
 *	noop; instead the statclock runs in hardclock.
 */
void
setstatclockrate(int rate)
{
}
#endif /* XENPV */
1152