Home | History | Annotate | Line # | Download | only in x86
      1 /*	$NetBSD: tsc.c,v 1.64 2026/03/01 13:57:41 yamt Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2008, 2020 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * Redistribution and use in source and binary forms, with or without
      8  * modification, are permitted provided that the following conditions
      9  * are met:
     10  * 1. Redistributions of source code must retain the above copyright
     11  *    notice, this list of conditions and the following disclaimer.
     12  * 2. Redistributions in binary form must reproduce the above copyright
     13  *    notice, this list of conditions and the following disclaimer in the
     14  *    documentation and/or other materials provided with the distribution.
     15  *
     16  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     17  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     18  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     19  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     20  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     26  * POSSIBILITY OF SUCH DAMAGE.
     27  */
     28 
     29 #include <sys/cdefs.h>
     30 __KERNEL_RCSID(0, "$NetBSD: tsc.c,v 1.64 2026/03/01 13:57:41 yamt Exp $");
     31 
     32 #include <sys/param.h>
     33 #include <sys/systm.h>
     34 #include <sys/time.h>
     35 #include <sys/timetc.h>
     36 #include <sys/lwp.h>
     37 #include <sys/atomic.h>
     38 #include <sys/kernel.h>
     39 #include <sys/cpu.h>
     40 #include <sys/xcall.h>
     41 #include <sys/lock.h>
     42 #ifdef BOOT_DURATION
     43 #include <sys/boot_duration.h>
     44 #endif
     45 
     46 #include <machine/cpu_counter.h>
     47 #include <machine/cpuvar.h>
     48 #include <machine/cpufunc.h>
     49 #include <machine/specialreg.h>
     50 #include <machine/cputypes.h>
     51 
     52 #include "tsc.h"
     53 
        /*
         * Number of BP<->AP exchanges performed for each AP; both
         * tsc_sync_bp() and tsc_sync_ap() loop this many times.
         */
     54 #define	TSC_SYNC_ROUNDS		1000
        /* NB: evaluates its argument more than once; pass side-effect-free values. */
     55 #define	ABS(a)			((a) >= 0 ? (a) : -(a))
     56 
     57 static u_int	tsc_get_timecount(struct timecounter *);
     58 
     59 static void	tsc_delay(unsigned int);
     60 
        /*
         * Scratch word that the BP writes in tsc_read_bp() and the AP reads in
         * tsc_post_ap() right before sampling its TSC ("suffer a cache miss").
         */
     61 static uint64_t	tsc_dummy_cacheline __cacheline_aligned;
        /* Set from the current CPU's ci_data.cpu_cc_freq in tsc_tc_init(). */
     62 uint64_t	tsc_freq __read_mostly;	/* exported for sysctl */
     63 #ifdef BOOT_DURATION
        /* Low/high halves of the starting TSC value used by boot_duration_timer(). */
     64 extern uint32_t	starttsc_lo;
     65 extern uint32_t	starttsc_hi;
     66 #endif
        /* Drift above this threshold makes tsc_tc_init() demote the timecounter. */
     67 static int64_t	tsc_drift_max = 1000;	/* max cycles */
        /* Largest absolute drift reported so far through tsc_sync_drift(). */
     68 static int64_t	tsc_drift_observed;
        /*
         * Counter read entry points.  They start out as the cpuid-serialized
         * variants; tsc_setfunc() may repoint them at the lfence or mfence ones.
         */
     69 uint64_t	(*rdtsc)(void) = rdtsc_cpuid;
     70 uint64_t	(*cpu_counter)(void) = cpu_counter_cpuid;
     71 uint32_t	(*cpu_counter32)(void) = cpu_counter32_cpuid;
     72 
        /*
         * NOTE(review): not read or written anywhere else in this file;
         * presumably the user-mode TSC policy knob consumed elsewhere - confirm.
         */
     73 int tsc_user_enabled = 1;
     74 
        /*
         * Handshake state shared by tsc_read_bp() and tsc_post_ap():
         * tsc_sync_val carries the AP's posted reading, tsc_sync_cpu is the CPU
         * currently being synchronized (NULL when no exchange is in flight).
         */
     75 static volatile int64_t	tsc_sync_val;
     76 static volatile struct cpu_info	*tsc_sync_cpu;
     77 
        /*
         * The TSC timecounter.  tc_frequency is filled in, and tc_quality may be
         * lowered to -100, by tsc_tc_init() before it is handed to tc_init().
         */
     78 static struct timecounter tsc_timecounter = {
     79 	.tc_get_timecount = tsc_get_timecount,
     80 	.tc_counter_mask = ~0U,
     81 	.tc_name = "TSC",
     82 	.tc_quality = 3000,
     83 };
     84 
     85 bool
     86 tsc_is_invariant(void)
     87 {
     88 	struct cpu_info *ci;
     89 	uint32_t descs[4];
     90 	uint32_t family;
     91 	bool invariant;
     92 
     93 	if (!cpu_hascounter())
     94 		return false;
     95 
     96 	ci = curcpu();
     97 	invariant = false;
     98 
     99 	if (cpu_vendor == CPUVENDOR_INTEL) {
    100 		/*
    101 		 * From Intel(tm) 64 and IA-32 Architectures Software
    102 		 * Developer's Manual Volume 3A: System Programming Guide,
    103 		 * Part 1, 17.13 TIME_STAMP COUNTER, these are the processors
    104 		 * where the TSC is known invariant:
    105 		 *
    106 		 * Pentium 4, Intel Xeon (family 0f, models 03 and higher)
    107 		 * Core Solo and Core Duo processors (family 06, model 0e)
    108 		 * Xeon 5100 series and Core 2 Duo (family 06, model 0f)
    109 		 * Core 2 and Xeon (family 06, model 17)
    110 		 * Atom (family 06, model 1c)
    111 		 *
    112 		 * We'll also assume that it's safe on the Pentium, and
    113 		 * that it's safe on P-II and P-III Xeons due to the
    114 		 * typical configuration of those systems.
    115 		 *
    116 		 */
    117 		switch (CPUID_TO_BASEFAMILY(ci->ci_signature)) {
    118 		case 0x05:
    119 			invariant = true;
    120 			break;
    121 		case 0x06:
    122 			invariant = CPUID_TO_MODEL(ci->ci_signature) == 0x0e ||
    123 			    CPUID_TO_MODEL(ci->ci_signature) == 0x0f ||
    124 			    CPUID_TO_MODEL(ci->ci_signature) == 0x17 ||
    125 			    CPUID_TO_MODEL(ci->ci_signature) == 0x1c;
    126 			break;
    127 		case 0x0f:
    128 			invariant = CPUID_TO_MODEL(ci->ci_signature) >= 0x03;
    129 			break;
    130 		}
    131 	} else if (cpu_vendor == CPUVENDOR_AMD) {
    132 		/*
    133 		 * TSC and Power Management Events on AMD Processors
    134 		 * Nov 2, 2005 Rich Brunner, AMD Fellow
    135 		 * http://lkml.org/lkml/2005/11/4/173
    136 		 *
    137 		 * See Appendix E.4.7 CPUID Fn8000_0007_EDX Advanced Power
    138 		 * Management Features, AMD64 Architecture Programmer's
    139 		 * Manual Volume 3: General-Purpose and System Instructions.
    140 		 * The check is done below.
    141 		 */
    142 
    143 		/*
    144 		 * AMD Errata 778: Processor Core Time Stamp Counters May
    145 		 * Experience Drift
    146 		 *
    147 		 * This affects all family 15h and family 16h processors.
    148 		 */
    149 		switch (CPUID_TO_FAMILY(ci->ci_signature)) {
    150 		case 0x15:
    151 		case 0x16:
    152 			return false;
    153 		}
    154 	}
    155 
    156 	/*
    157 	 * The best way to check whether the TSC counter is invariant or not
    158 	 * is to check CPUID 80000007.
    159 	 */
    160 	family = CPUID_TO_BASEFAMILY(ci->ci_signature);
    161 	if (((cpu_vendor == CPUVENDOR_INTEL) || (cpu_vendor == CPUVENDOR_AMD))
    162 	    && ((family == 0x06) || (family == 0x0f))) {
    163 		x86_cpuid(0x80000000, descs);
    164 		if (descs[0] >= 0x80000007) {
    165 			x86_cpuid(0x80000007, descs);
    166 			invariant = (descs[3] & CPUID_APM_ITSC) != 0;
    167 		}
    168 	}
    169 
    170 	return invariant;
    171 }
    172 
    173 /* Setup function pointers for rdtsc() and timecounter(9). */
    174 void
    175 tsc_setfunc(struct cpu_info *ci)
    176 {
    177 	bool use_lfence, use_mfence;
    178 
    179 	use_lfence = use_mfence = false;
    180 
    181 	/*
    182 	 * XXX On AMD, we might be able to use lfence for some cases:
    183 	 *   a) if MSR_DE_CFG exist and the bit 1 is set.
    184 	 *   b) family == 0x0f or 0x11. Those have no MSR_DE_CFG and
    185 	 *      lfence is always serializing.
    186 	 *
    187 	 * We don't use it because the test result showed mfence was better
    188 	 * than lfence with MSR_DE_CFG.
    189 	 */
    190 	if (cpu_vendor == CPUVENDOR_AMD)
    191 		use_mfence = true;
    192 	else if (cpu_vendor == CPUVENDOR_INTEL)
    193 		use_lfence = true;
    194 
    195 	/* LFENCE and MFENCE are applicable if SSE2 is set. */
    196 	if ((ci->ci_feat_val[0] & CPUID_SSE2) == 0)
    197 		use_lfence = use_mfence = false;
    198 
    199 #define TSC_SETFUNC(fence)						      \
    200 	do {								      \
    201 		rdtsc = rdtsc_##fence;					      \
    202 		cpu_counter = cpu_counter_##fence;			      \
    203 		cpu_counter32 = cpu_counter32_##fence;			      \
    204 	} while (/* CONSTCOND */ 0)
    205 
    206 	if (use_lfence)
    207 		TSC_SETFUNC(lfence);
    208 	else if (use_mfence)
    209 		TSC_SETFUNC(mfence);
    210 	else
    211 		TSC_SETFUNC(cpuid);
    212 
    213 	aprint_verbose_dev(ci->ci_dev, "Use %s to serialize rdtsc\n",
    214 	    use_lfence ? "lfence" : (use_mfence ? "mfence" : "cpuid"));
    215 }
    216 
    217 /*
    218  * Initialize timecounter(9) and DELAY() function of TSC.
    219  *
    220  * This function is called after all secondary processors were brought up
    221  * and drift has been measured, and after any other potential delay funcs
    222  * have been installed (e.g. lapic_delay()).
    223  */
    224 void
    225 tsc_tc_init(void)
    226 {
    227 	struct cpu_info *ci;
    228 	bool invariant;
    229 
    230 	if (!cpu_hascounter())
    231 		return;
    232 
    233 	ci = curcpu();
    234 	tsc_freq = ci->ci_data.cpu_cc_freq;
    235 	invariant = tsc_is_invariant();
    236 	if (!invariant) {
    237 		aprint_debug("TSC not known invariant on this CPU\n");
    238 		tsc_timecounter.tc_quality = -100;
    239 	} else if (tsc_drift_observed > tsc_drift_max) {
    240 		aprint_error("ERROR: %lld cycle TSC drift observed\n",
    241 		    (long long)tsc_drift_observed);
    242 		tsc_timecounter.tc_quality = -100;
    243 		invariant = false;
    244 	} else if (vm_guest == VM_GUEST_NO) {
    245 		delay_func = tsc_delay;
    246 	} else if (vm_guest == VM_GUEST_VIRTUALBOX ||
    247 		   vm_guest == VM_GUEST_NVMM) {
    248 		/*
    249 		 * nvmm doesn't advance guest tsc constantly unless
    250 		 * the vcpu is pinned to a physical cpu.
    251 		 */
    252 		tsc_timecounter.tc_quality = -100;
    253 	}
    254 
    255 	if (tsc_freq != 0) {
    256 		tsc_timecounter.tc_frequency = tsc_freq;
    257 		tc_init(&tsc_timecounter);
    258 	}
    259 }
    260 
    261 /*
    262  * Record drift (in clock cycles).  Called during AP startup.
    263  */
    264 void
    265 tsc_sync_drift(int64_t drift)
    266 {
    267 
    268 	if (drift < 0)
    269 		drift = -drift;
    270 	if (drift > tsc_drift_observed)
    271 		tsc_drift_observed = drift;
    272 }
    273 
    274 /*
    275  * Called during startup of APs, by the boot processor.  Interrupts
    276  * are disabled on entry.
         *
         * Runs the boot-processor half of one TSC exchange with the CPU
         * described by ci; tsc_post_ap() is the matching half on the other
         * side.  The handshake uses the CPUF_SYNCTSC bit in ci->ci_flags and
         * the file-scope tsc_sync_cpu / tsc_sync_val variables.
         *
         * On return *bptscp holds the TSC value read here and *aptscp the
         * value the other CPU posted in tsc_sync_val.
         *
         * Panics if another exchange is already in flight, or if
         * tsc_sync_cpu ends up pointing at anything other than NULL.
         * (The panic strings name tsc_sync_bp, the caller.)
    277  */
    278 static void __noinline
    279 tsc_read_bp(struct cpu_info *ci, uint64_t *bptscp, uint64_t *aptscp)
    280 {
    281 	uint64_t bptsc;
    282 
        	/* Claim the exchange; the slot must have been free (NULL). */
    283 	if (atomic_swap_ptr(&tsc_sync_cpu, ci) != NULL) {
    284 		panic("tsc_sync_bp: 1");
    285 	}
    286 
    287 	/* Prepare a cache miss for the other side. */
    288 	(void)atomic_swap_uint((void *)&tsc_dummy_cacheline, 0);
    289 
    290 	/* Flag our readiness. */
    291 	atomic_or_uint(&ci->ci_flags, CPUF_SYNCTSC);
    292 
    293 	/* Wait for other side then read our TSC. */
        	/* tsc_post_ap() clears CPUF_SYNCTSC to release this loop. */
    294 	while ((ci->ci_flags & CPUF_SYNCTSC) != 0) {
    295 		__insn_barrier();
    296 	}
    297 	bptsc = rdtsc();
    298 
    299 	/* Wait for the results to come in. */
        	/* tsc_post_ap() swaps tsc_sync_cpu back to NULL once it has posted. */
    300 	while (tsc_sync_cpu == ci) {
    301 		x86_pause();
    302 	}
    303 	if (tsc_sync_cpu != NULL) {
    304 		panic("tsc_sync_bp: 2");
    305 	}
    306 
    307 	*bptscp = bptsc;
    308 	*aptscp = tsc_sync_val;
    309 }
    310 
    311 void
    312 tsc_sync_bp(struct cpu_info *ci)
    313 {
    314 	int64_t bptsc, aptsc, val, diff;
    315 
    316 	if (!cpu_hascounter())
    317 		return;
    318 
    319 	val = INT64_MAX;
    320 	for (int i = 0; i < TSC_SYNC_ROUNDS; i++) {
    321 		tsc_read_bp(ci, &bptsc, &aptsc);
    322 		diff = bptsc - aptsc;
    323 		if (ABS(diff) < ABS(val)) {
    324 			val = diff;
    325 		}
    326 	}
    327 
    328 	ci->ci_data.cpu_cc_skew = val;
    329 }
    330 
    331 /*
    332  * Called during startup of AP, by the AP itself.  Interrupts are
    333  * disabled on entry.
         *
         * Runs the AP half of one TSC exchange; tsc_read_bp() is the matching
         * half on the boot processor.  Waits for CPUF_SYNCTSC to be set in
         * ci->ci_flags, clears it, reads the TSC, publishes the reading in
         * tsc_sync_val and finally releases tsc_sync_cpu by swapping in NULL.
         *
         * Panics if tsc_sync_cpu did not point at ci when it was released.
    334  */
    335 static void __noinline
    336 tsc_post_ap(struct cpu_info *ci)
    337 {
    338 	uint64_t tsc;
    339 
    340 	/* Wait for go-ahead from primary. */
    341 	while ((ci->ci_flags & CPUF_SYNCTSC) == 0) {
    342 		__insn_barrier();
    343 	}
    344 
    345 	/* Instruct primary to read its counter. */
    346 	atomic_and_uint(&ci->ci_flags, ~CPUF_SYNCTSC);
    347 
    348 	/* Suffer a cache miss, then read TSC. */
        	/*
        	 * NOTE(review): the only write to tsc_dummy_cacheline in this file
        	 * stores 0, so adding it presumably leaves the reading unchanged and
        	 * just keeps the load alive - confirm.
        	 */
    349 	__insn_barrier();
    350 	tsc = tsc_dummy_cacheline;
    351 	__insn_barrier();
    352 	tsc += rdtsc();
    353 
    354 	/* Post result.  Ensure the whole value goes out atomically. */
    355 	(void)atomic_swap_64(&tsc_sync_val, tsc);
    356 
        	/* Hand the slot back; tsc_read_bp() spins until this happens. */
    357 	if (atomic_swap_ptr(&tsc_sync_cpu, NULL) != ci) {
    358 		panic("tsc_sync_ap");
    359 	}
    360 }
    361 
    362 void
    363 tsc_sync_ap(struct cpu_info *ci)
    364 {
    365 
    366 	if (!cpu_hascounter())
    367 		return;
    368 
    369 	for (int i = 0; i < TSC_SYNC_ROUNDS; i++) {
    370 		tsc_post_ap(ci);
    371 	}
    372 }
    373 
    374 static void
    375 tsc_apply_cpu(void *arg1, void *arg2)
    376 {
    377 	bool enable = arg1 != NULL;
    378 	if (enable) {
    379 		lcr4(rcr4() & ~CR4_TSD);
    380 	} else {
    381 		lcr4(rcr4() | CR4_TSD);
    382 	}
    383 }
    384 
    385 void
    386 tsc_user_enable(void)
    387 {
    388 	uint64_t xc;
    389 
    390 	xc = xc_broadcast(0, tsc_apply_cpu, (void *)true, NULL);
    391 	xc_wait(xc);
    392 }
    393 
    394 void
    395 tsc_user_disable(void)
    396 {
    397 	uint64_t xc;
    398 
    399 	xc = xc_broadcast(0, tsc_apply_cpu, (void *)false, NULL);
    400 	xc_wait(xc);
    401 }
    402 
    403 uint64_t
    404 cpu_frequency(struct cpu_info *ci)
    405 {
    406 
    407 	return ci->ci_data.cpu_cc_freq;
    408 }
    409 
    410 int
    411 cpu_hascounter(void)
    412 {
    413 
    414 	return cpu_feature[0] & CPUID_TSC;
    415 }
    416 
    417 static void
    418 tsc_delay(unsigned int us)
    419 {
    420 	uint64_t start, delta;
    421 
    422 	start = cpu_counter();
    423 	delta = (uint64_t)us * tsc_freq / 1000000;
    424 
    425 	while ((cpu_counter() - start) < delta) {
    426 		x86_pause();
    427 	}
    428 }
    429 
        /*
         * timecounter(9) read hook for the TSC: returns the low 32 bits of the
         * counter.  The tc argument is not used.
         *
         * On _LP64 kernels built with DIAGNOSTIC the full 64-bit value is also
         * compared against the value cached in the current LWP (l_md.md_tsc),
         * and a warning is printed if the counter appears to have gone
         * backwards.  All other builds simply return cpu_counter32().
         */
    430 static u_int
    431 tsc_get_timecount(struct timecounter *tc)
    432 {
    433 #if defined(_LP64) && defined(DIAGNOSTIC) /* requires atomic 64-bit store */
    434 	static __cpu_simple_lock_t lock = __SIMPLELOCK_UNLOCKED;
    435 	static int lastwarn;
    436 	uint64_t cur, prev;
    437 	lwp_t *l = curlwp;
    438 	int ticks;
    439 
    440 	/*
    441 	 * Previous value must be read before the counter and stored to
    442 	 * after, because this routine can be called from interrupt context
    443 	 * and may run over the top of an existing invocation.  Ordering is
    444 	 * guaranteed by "volatile" on md_tsc.
    445 	 */
    446 	prev = l->l_md.md_tsc;
    447 	cur = cpu_counter();
        	/*
        	 * Only complain when both samples have the same top bit
        	 * (presumably to ignore a wrap of the 64-bit counter - confirm),
        	 * and only if the lock is obtained via the "try" variant.
        	 */
    448 	if (__predict_false(cur < prev) && (cur >> 63) == (prev >> 63) &&
    449 	    __cpu_simple_lock_try(&lock)) {
    450 		ticks = getticks();
        		/* Rate limit: at least hz ticks between two warnings. */
    451 		if (ticks - lastwarn >= hz) {
    452 			printf(
    453 			    "WARNING: %s TSC went backwards by %u - "
    454 			    "change sysctl(7) kern.timecounter?\n",
    455 			    cpu_name(curcpu()), (unsigned)(prev - cur));
    456 			lastwarn = ticks;
    457 		}
    458 		__cpu_simple_unlock(&lock);
    459 	}
        	/* Cache the new sample for the next comparison on this LWP. */
    460 	l->l_md.md_tsc = cur;
    461 	return (uint32_t)cur;
    462 #else
    463 	return cpu_counter32();
    464 #endif
    465 }
    466 
    467 /*
    468  * tsc has been reset; zero the cached tsc of every lwp in the system
    469  * so we don't spuriously report that the tsc has gone backward.
    470  * Caller must ensure all LWPs are quiescent (except the current one,
    471  * obviously) and interrupts are blocked while we update this.
    472  */
    473 void
    474 tsc_tc_reset(void)
    475 {
    476 	struct lwp *l;
    477 
    478 	LIST_FOREACH(l, &alllwp, l_list)
    479 		l->l_md.md_tsc = 0;
    480 }
    481 
#ifdef BOOT_DURATION
/*
 * Returns the kernel boot time in milliseconds: the number of TSC
 * cycles elapsed since the starting value held in
 * starttsc_hi:starttsc_lo, divided by the current CPU's cycles per
 * millisecond.
 *
 * Asserts (under KASSERT) that the caller cannot migrate and is running
 * on the primary CPU.
 *
 * Returns 0 when the recorded counter frequency is below 1000 (which
 * includes 0), because the divisor would otherwise be zero.
 */
uint64_t
boot_duration_timer(void)
{
	uint64_t start, cycles_per_ms;

	KASSERT(curcpu_stable());
	KASSERT(CPU_IS_PRIMARY(curcpu()));

	cycles_per_ms = curcpu()->ci_data.cpu_cc_freq / 1000;
	if (cycles_per_ms == 0) {
		/* Frequency unknown or too low: avoid dividing by zero. */
		return 0;
	}

	start = (uint64_t)starttsc_hi << 32 | starttsc_lo;
	return (rdtsc() - start) / cycles_per_ms;
}
#endif
    493