/*	$NetBSD: tsc.c,v 1.63 2025/05/08 05:31:16 imil Exp $	*/

/*-
 * Copyright (c) 2008, 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
     28 
     29 #include <sys/cdefs.h>
     30 __KERNEL_RCSID(0, "$NetBSD: tsc.c,v 1.63 2025/05/08 05:31:16 imil Exp $");
     31 
     32 #include <sys/param.h>
     33 #include <sys/systm.h>
     34 #include <sys/time.h>
     35 #include <sys/timetc.h>
     36 #include <sys/lwp.h>
     37 #include <sys/atomic.h>
     38 #include <sys/kernel.h>
     39 #include <sys/cpu.h>
     40 #include <sys/xcall.h>
     41 #include <sys/lock.h>
     42 #ifdef BOOT_DURATION
     43 #include <sys/boot_duration.h>
     44 #endif
     45 
     46 #include <machine/cpu_counter.h>
     47 #include <machine/cpuvar.h>
     48 #include <machine/cpufunc.h>
     49 #include <machine/specialreg.h>
     50 #include <machine/cputypes.h>
     51 
     52 #include "tsc.h"
     53 
     54 #define	TSC_SYNC_ROUNDS		1000
     55 #define	ABS(a)			((a) >= 0 ? (a) : -(a))
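/*
 * The BP/AP synchronization handshake below is repeated TSC_SYNC_ROUNDS
 * times, and only the delta with the smallest magnitude is kept; the
 * intent is to filter out rounds perturbed by cache misses, SMIs and
 * similar noise.
 */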

static u_int	tsc_get_timecount(struct timecounter *);

static void	tsc_delay(unsigned int);

static uint64_t	tsc_dummy_cacheline __cacheline_aligned;
uint64_t	tsc_freq __read_mostly;	/* exported for sysctl */
#ifdef BOOT_DURATION
extern uint32_t	starttsc_lo;
extern uint32_t	starttsc_hi;
#endif
static int64_t	tsc_drift_max = 1000;	/* max cycles */
static int64_t	tsc_drift_observed;
uint64_t	(*rdtsc)(void) = rdtsc_cpuid;
uint64_t	(*cpu_counter)(void) = cpu_counter_cpuid;
uint32_t	(*cpu_counter32)(void) = cpu_counter32_cpuid;

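/*
 * Nonzero when userland is allowed to execute RDTSC directly; see
 * tsc_user_enable()/tsc_user_disable() below, which toggle CR4_TSD on
 * every CPU.
 */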
int tsc_user_enabled = 1;

static volatile uint64_t	tsc_sync_val;
static volatile struct cpu_info	*tsc_sync_cpu;

static struct timecounter tsc_timecounter = {
	.tc_get_timecount = tsc_get_timecount,
	.tc_counter_mask = ~0U,
	.tc_name = "TSC",
	.tc_quality = 3000,
};
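/*
 * timecounter(9) selects the highest-quality counter, so a quality of
 * 3000 makes the TSC the preferred source when it is usable.  A negative
 * quality (set below when the TSC looks unreliable) means the counter is
 * only used if explicitly selected via sysctl kern.timecounter.hardware.
 */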

bool
tsc_is_invariant(void)
{
	struct cpu_info *ci;
	uint32_t descs[4];
	uint32_t family;
	bool invariant;

	if (!cpu_hascounter())
		return false;

	ci = curcpu();
	invariant = false;

	if (cpu_vendor == CPUVENDOR_INTEL) {
		/*
		 * From the Intel 64 and IA-32 Architectures Software
		 * Developer's Manual Volume 3A: System Programming Guide,
		 * Part 1, 17.13 TIME-STAMP COUNTER, these are the processors
		 * where the TSC is known invariant:
		 *
		 * Pentium 4, Intel Xeon (family 0f, models 03 and higher)
		 * Core Solo and Core Duo processors (family 06, model 0e)
		 * Xeon 5100 series and Core 2 Duo (family 06, model 0f)
		 * Core 2 and Xeon (family 06, model 17)
		 * Atom (family 06, model 1c)
		 *
		 * We also assume that it is safe on the Pentium, and on
		 * P-II and P-III Xeons due to the typical configuration
		 * of those systems.
		 */
		switch (CPUID_TO_BASEFAMILY(ci->ci_signature)) {
		case 0x05:
			invariant = true;
			break;
		case 0x06:
			invariant = CPUID_TO_MODEL(ci->ci_signature) == 0x0e ||
			    CPUID_TO_MODEL(ci->ci_signature) == 0x0f ||
			    CPUID_TO_MODEL(ci->ci_signature) == 0x17 ||
			    CPUID_TO_MODEL(ci->ci_signature) == 0x1c;
			break;
		case 0x0f:
			invariant = CPUID_TO_MODEL(ci->ci_signature) >= 0x03;
			break;
		}
	} else if (cpu_vendor == CPUVENDOR_AMD) {
		/*
		 * TSC and Power Management Events on AMD Processors
		 * Nov 2, 2005 Rich Brunner, AMD Fellow
		 * http://lkml.org/lkml/2005/11/4/173
		 *
		 * See Appendix E.4.7 CPUID Fn8000_0007_EDX Advanced Power
		 * Management Features, AMD64 Architecture Programmer's
		 * Manual Volume 3: General-Purpose and System Instructions.
		 * The check is done below.
		 */

		/*
		 * AMD Errata 778: Processor Core Time Stamp Counters May
		 * Experience Drift
		 *
		 * This affects all family 15h and family 16h processors.
		 */
		switch (CPUID_TO_FAMILY(ci->ci_signature)) {
		case 0x15:
		case 0x16:
			return false;
		}
	}

	/*
	 * The most reliable way to check for an invariant TSC is the
	 * InvariantTSC bit (CPUID_APM_ITSC, %edx bit 8) of CPUID leaf
	 * 0x80000007.
	 */
	family = CPUID_TO_BASEFAMILY(ci->ci_signature);
	if (((cpu_vendor == CPUVENDOR_INTEL) || (cpu_vendor == CPUVENDOR_AMD))
	    && ((family == 0x06) || (family == 0x0f))) {
		x86_cpuid(0x80000000, descs);
		if (descs[0] >= 0x80000007) {
			x86_cpuid(0x80000007, descs);
			invariant = (descs[3] & CPUID_APM_ITSC) != 0;
		}
	}

	return invariant;
}

/* Set up the function pointers for rdtsc() and timecounter(9). */
void
tsc_setfunc(struct cpu_info *ci)
{
	bool use_lfence, use_mfence;

	use_lfence = use_mfence = false;

	/*
	 * XXX On AMD, we might be able to use lfence in some cases:
	 *   a) if MSR_DE_CFG exists and bit 1 is set.
	 *   b) family == 0x0f or 0x11.  Those have no MSR_DE_CFG and
	 *      lfence is always serializing.
	 *
	 * We don't do so because testing showed mfence performing better
	 * than lfence with MSR_DE_CFG.
	 */
	if (cpu_vendor == CPUVENDOR_AMD)
		use_mfence = true;
	else if (cpu_vendor == CPUVENDOR_INTEL)
		use_lfence = true;

	/* LFENCE and MFENCE are only available when SSE2 is present. */
	if ((ci->ci_feat_val[0] & CPUID_SSE2) == 0)
		use_lfence = use_mfence = false;

#define TSC_SETFUNC(fence)						      \
	do {								      \
		rdtsc = rdtsc_##fence;					      \
		cpu_counter = cpu_counter_##fence;			      \
		cpu_counter32 = cpu_counter32_##fence;			      \
	} while (/* CONSTCOND */ 0)

	if (use_lfence)
		TSC_SETFUNC(lfence);
	else if (use_mfence)
		TSC_SETFUNC(mfence);
	else
		TSC_SETFUNC(cpuid);

	aprint_verbose_dev(ci->ci_dev, "Use %s to serialize rdtsc\n",
	    use_lfence ? "lfence" : (use_mfence ? "mfence" : "cpuid"));
}
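
/*
 * A minimal sketch of what a fenced reader amounts to.  The real
 * rdtsc_lfence/rdtsc_mfence/rdtsc_cpuid implementations live elsewhere
 * in the x86 code; only the fence/RDTSC pairing is illustrated here,
 * and the name below is hypothetical:
 *
 *	static inline uint64_t
 *	rdtsc_lfence_sketch(void)
 *	{
 *		uint32_t lo, hi;
 *
 *		// The fence keeps RDTSC from executing speculatively
 *		// ahead of earlier instructions; RDTSC returns the low
 *		// 32 bits in %eax and the high 32 bits in %edx.
 *		__asm volatile("lfence; rdtsc" : "=a" (lo), "=d" (hi));
 *		return ((uint64_t)hi << 32) | lo;
 *	}
 */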

/*
 * Initialize the timecounter(9) and the DELAY() implementation backed
 * by the TSC.
 *
 * This function is called after all secondary processors have been
 * brought up and drift has been measured, and after any other potential
 * delay functions have been installed (e.g. lapic_delay()).
 */
void
tsc_tc_init(void)
{
	struct cpu_info *ci;
	bool invariant;

	if (!cpu_hascounter())
		return;

	ci = curcpu();
	tsc_freq = ci->ci_data.cpu_cc_freq;
	invariant = tsc_is_invariant();
	if (!invariant) {
		aprint_debug("TSC not known invariant on this CPU\n");
		tsc_timecounter.tc_quality = -100;
	} else if (tsc_drift_observed > tsc_drift_max) {
		aprint_error("ERROR: %lld cycle TSC drift observed\n",
		    (long long)tsc_drift_observed);
		tsc_timecounter.tc_quality = -100;
		invariant = false;
	} else if (vm_guest == VM_GUEST_NO) {
		delay_func = tsc_delay;
	} else if (vm_guest == VM_GUEST_VIRTUALBOX) {
		tsc_timecounter.tc_quality = -100;
	}

	if (tsc_freq != 0) {
		tsc_timecounter.tc_frequency = tsc_freq;
		tc_init(&tsc_timecounter);
	}
}

/*
 * Record drift (in clock cycles).  Called during AP startup.
 */
void
tsc_sync_drift(int64_t drift)
{

	if (drift < 0)
		drift = -drift;
	if (drift > tsc_drift_observed)
		tsc_drift_observed = drift;
}

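/*
 * BP<->AP handshake, in outline (see tsc_read_bp() and tsc_post_ap()):
 *
 *   1. BP publishes itself in tsc_sync_cpu, dirties the dummy cacheline
 *      and sets CPUF_SYNCTSC to signal readiness.
 *   2. AP sees CPUF_SYNCTSC, clears it, takes a cache miss on the dummy
 *      line and reads its TSC.
 *   3. BP sees CPUF_SYNCTSC go clear and reads its own TSC.
 *   4. AP posts its value in tsc_sync_val and clears tsc_sync_cpu; BP
 *      then computes the difference between the two readings.
 *
 * The shared cache miss is meant to bring both sides to roughly the
 * same point in time before the two reads.
 */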
/*
 * Called during startup of APs, by the boot processor.  Interrupts
 * are disabled on entry.
 */
static void __noinline
tsc_read_bp(struct cpu_info *ci, uint64_t *bptscp, uint64_t *aptscp)
{
	uint64_t bptsc;

	if (atomic_swap_ptr(&tsc_sync_cpu, ci) != NULL) {
		panic("tsc_sync_bp: 1");
	}

	/* Prepare a cache miss for the other side. */
	(void)atomic_swap_uint((void *)&tsc_dummy_cacheline, 0);

	/* Flag our readiness. */
	atomic_or_uint(&ci->ci_flags, CPUF_SYNCTSC);

	/* Wait for the other side, then read our TSC. */
	while ((ci->ci_flags & CPUF_SYNCTSC) != 0) {
		__insn_barrier();
	}
	bptsc = rdtsc();

	/* Wait for the results to come in. */
	while (tsc_sync_cpu == ci) {
		x86_pause();
	}
	if (tsc_sync_cpu != NULL) {
		panic("tsc_sync_bp: 2");
	}

	*bptscp = bptsc;
	*aptscp = tsc_sync_val;
}

void
tsc_sync_bp(struct cpu_info *ci)
{
	uint64_t bptsc, aptsc;
	int64_t val, diff;

	if (!cpu_hascounter())
		return;

	val = INT64_MAX;
	for (int i = 0; i < TSC_SYNC_ROUNDS; i++) {
		tsc_read_bp(ci, &bptsc, &aptsc);
		diff = (int64_t)(bptsc - aptsc);
		if (ABS(diff) < ABS(val)) {
			val = diff;
		}
	}

	ci->ci_data.cpu_cc_skew = val;
}

/*
 * Called during startup of an AP, by the AP itself.  Interrupts are
 * disabled on entry.
 */
static void __noinline
tsc_post_ap(struct cpu_info *ci)
{
	uint64_t tsc;

	/* Wait for go-ahead from primary. */
	while ((ci->ci_flags & CPUF_SYNCTSC) == 0) {
		__insn_barrier();
	}

	/* Instruct primary to read its counter. */
	atomic_and_uint(&ci->ci_flags, ~CPUF_SYNCTSC);

	/* Suffer a cache miss, then read TSC. */
	__insn_barrier();
	tsc = tsc_dummy_cacheline;
	__insn_barrier();
	tsc += rdtsc();

	/* Post result.  Ensure the whole value goes out atomically. */
	(void)atomic_swap_64(&tsc_sync_val, tsc);

	if (atomic_swap_ptr(&tsc_sync_cpu, NULL) != ci) {
		panic("tsc_sync_ap");
	}
}

void
tsc_sync_ap(struct cpu_info *ci)
{

	if (!cpu_hascounter())
		return;

	for (int i = 0; i < TSC_SYNC_ROUNDS; i++) {
		tsc_post_ap(ci);
	}
}

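/*
 * Setting CR4_TSD makes RDTSC (and RDTSCP) privileged: executed at
 * CPL > 0 they raise #GP, so clearing the bit is what re-enables the
 * counter for userland.
 */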
static void
tsc_apply_cpu(void *arg1, void *arg2)
{
	bool enable = arg1 != NULL;

	if (enable) {
		lcr4(rcr4() & ~CR4_TSD);
	} else {
		lcr4(rcr4() | CR4_TSD);
	}
}

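/*
 * The CR4 update has to happen on every CPU, so it is pushed out with a
 * broadcast cross-call; xc_wait() blocks until all CPUs have run it.
 */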
void
tsc_user_enable(void)
{
	uint64_t xc;

	xc = xc_broadcast(0, tsc_apply_cpu, (void *)true, NULL);
	xc_wait(xc);
}

void
tsc_user_disable(void)
{
	uint64_t xc;

	xc = xc_broadcast(0, tsc_apply_cpu, (void *)false, NULL);
	xc_wait(xc);
}

uint64_t
cpu_frequency(struct cpu_info *ci)
{

	return ci->ci_data.cpu_cc_freq;
}

int
cpu_hascounter(void)
{

	return cpu_feature[0] & CPUID_TSC;
}

static void
tsc_delay(unsigned int us)
{
	uint64_t start, delta;

	start = cpu_counter();
	delta = (uint64_t)us * tsc_freq / 1000000;

	while ((cpu_counter() - start) < delta) {
		x86_pause();
	}
}
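
/*
 * Worked example: with tsc_freq = 2000000000 (a 2 GHz invariant TSC),
 * tsc_delay(10) computes delta = 10 * 2000000000 / 1000000 = 20000
 * cycles and spins until the counter has advanced that far.
 */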

static u_int
tsc_get_timecount(struct timecounter *tc)
{
#if defined(_LP64) && defined(DIAGNOSTIC) /* requires atomic 64-bit store */
	static __cpu_simple_lock_t lock = __SIMPLELOCK_UNLOCKED;
	static int lastwarn;
	uint64_t cur, prev;
	lwp_t *l = curlwp;
	int ticks;

	/*
	 * The previous value must be read before the counter, and stored
	 * back after it, because this routine can be called from interrupt
	 * context and may run over the top of an existing invocation.
	 * Ordering is guaranteed by "volatile" on md_tsc.
	 */
	prev = l->l_md.md_tsc;
	cur = cpu_counter();
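	/*
	 * Only complain when both samples fall in the same half of the
	 * 64-bit range; if the top bits differ the counter has most
	 * likely wrapped, which is not a backwards step.
	 */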
	if (__predict_false(cur < prev) && (cur >> 63) == (prev >> 63) &&
	    __cpu_simple_lock_try(&lock)) {
		ticks = getticks();
		if (ticks - lastwarn >= hz) {
			printf(
			    "WARNING: %s TSC went backwards by %u - "
			    "change sysctl(7) kern.timecounter?\n",
			    cpu_name(curcpu()), (unsigned)(prev - cur));
			lastwarn = ticks;
		}
		__cpu_simple_unlock(&lock);
	}
	l->l_md.md_tsc = cur;
	return (uint32_t)cur;
#else
	return cpu_counter32();
#endif
}

/*
 * The TSC has been reset; zero the cached TSC of every LWP in the
 * system so we don't spuriously report that the TSC has gone backward.
 * The caller must ensure all LWPs are quiescent (except the current
 * one, obviously) and that interrupts are blocked while we update this.
 */
void
tsc_tc_reset(void)
{
	struct lwp *l;

	LIST_FOREACH(l, &alllwp, l_list)
		l->l_md.md_tsc = 0;
}

#ifdef BOOT_DURATION
/* Return the time elapsed since the kernel started, in milliseconds. */
uint64_t
boot_duration_timer(void)
{
	KASSERT(curcpu_stable());
	KASSERT(CPU_IS_PRIMARY(curcpu()));
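	/*
	 * Cycles elapsed since the TSC value recorded in
	 * starttsc_hi/starttsc_lo early in boot, divided by cycles
	 * per millisecond (cpu_cc_freq / 1000).
	 */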
	return (rdtsc() - ((uint64_t)starttsc_hi << 32 | starttsc_lo)) /
	    (curcpu()->ci_data.cpu_cc_freq / 1000);
}
#endif