/*	$NetBSD: tsc.c,v 1.63 2025/05/08 05:31:16 imil Exp $	*/

/*-
 * Copyright (c) 2008, 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tsc.c,v 1.63 2025/05/08 05:31:16 imil Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/timetc.h>
#include <sys/lwp.h>
#include <sys/atomic.h>
#include <sys/kernel.h>
#include <sys/cpu.h>
#include <sys/xcall.h>
#include <sys/lock.h>
#ifdef BOOT_DURATION
#include <sys/boot_duration.h>
#endif

#include <machine/cpu_counter.h>
#include <machine/cpuvar.h>
#include <machine/cpufunc.h>
#include <machine/specialreg.h>
#include <machine/cputypes.h>

#include "tsc.h"

#define	TSC_SYNC_ROUNDS		1000
#define	ABS(a)			((a) >= 0 ? (a) : -(a))
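/*
 * The BP/AP skew measurement below repeats TSC_SYNC_ROUNDS times and
 * keeps the sample with the smallest absolute difference; ABS() is
 * used for that comparison in tsc_sync_bp().
 */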

static u_int	tsc_get_timecount(struct timecounter *);

static void	tsc_delay(unsigned int);

static uint64_t	tsc_dummy_cacheline __cacheline_aligned;
uint64_t	tsc_freq __read_mostly;		/* exported for sysctl */
#ifdef BOOT_DURATION
extern uint32_t	starttsc_lo;
extern uint32_t	starttsc_hi;
#endif
static int64_t	tsc_drift_max = 1000;		/* max cycles */
static int64_t	tsc_drift_observed;
uint64_t	(*rdtsc)(void) = rdtsc_cpuid;
uint64_t	(*cpu_counter)(void) = cpu_counter_cpuid;
uint32_t	(*cpu_counter32)(void) = cpu_counter32_cpuid;

int tsc_user_enabled = 1;

static volatile int64_t	tsc_sync_val;
static volatile struct cpu_info	*tsc_sync_cpu;

static struct timecounter tsc_timecounter = {
	.tc_get_timecount = tsc_get_timecount,
	.tc_counter_mask = ~0U,
	.tc_name = "TSC",
	.tc_quality = 3000,
};

bool
tsc_is_invariant(void)
{
	struct cpu_info *ci;
	uint32_t descs[4];
	uint32_t family;
	bool invariant;

	if (!cpu_hascounter())
		return false;

	ci = curcpu();
	invariant = false;

	if (cpu_vendor == CPUVENDOR_INTEL) {
		/*
		 * From Intel(tm) 64 and IA-32 Architectures Software
		 * Developer's Manual Volume 3A: System Programming Guide,
		 * Part 1, 17.13 TIME_STAMP COUNTER, these are the processors
		 * where the TSC is known invariant:
		 *
		 * Pentium 4, Intel Xeon (family 0f, models 03 and higher)
		 * Core Solo and Core Duo processors (family 06, model 0e)
		 * Xeon 5100 series and Core 2 Duo (family 06, model 0f)
		 * Core 2 and Xeon (family 06, model 17)
		 * Atom (family 06, model 1c)
		 *
		 * We'll also assume that it's safe on the Pentium, and
		 * that it's safe on P-II and P-III Xeons due to the
		 * typical configuration of those systems.
		 */
		switch (CPUID_TO_BASEFAMILY(ci->ci_signature)) {
		case 0x05:
			invariant = true;
			break;
		case 0x06:
			invariant = CPUID_TO_MODEL(ci->ci_signature) == 0x0e ||
			    CPUID_TO_MODEL(ci->ci_signature) == 0x0f ||
			    CPUID_TO_MODEL(ci->ci_signature) == 0x17 ||
			    CPUID_TO_MODEL(ci->ci_signature) == 0x1c;
			break;
		case 0x0f:
			invariant = CPUID_TO_MODEL(ci->ci_signature) >= 0x03;
			break;
		}
	} else if (cpu_vendor == CPUVENDOR_AMD) {
		/*
		 * TSC and Power Management Events on AMD Processors
		 * Nov 2, 2005 Rich Brunner, AMD Fellow
		 * http://lkml.org/lkml/2005/11/4/173
		 *
		 * See Appendix E.4.7 CPUID Fn8000_0007_EDX Advanced Power
		 * Management Features, AMD64 Architecture Programmer's
		 * Manual Volume 3: General-Purpose and System Instructions.
		 * The check is done below.
		 */

		/*
		 * AMD Errata 778: Processor Core Time Stamp Counters May
		 * Experience Drift
		 *
		 * This affects all family 15h and family 16h processors.
		 */
		switch (CPUID_TO_FAMILY(ci->ci_signature)) {
		case 0x15:
		case 0x16:
			return false;
		}
	}

	/*
	 * The best way to check whether the TSC counter is invariant or not
	 * is to check CPUID 80000007.
	 */
	family = CPUID_TO_BASEFAMILY(ci->ci_signature);
	if (((cpu_vendor == CPUVENDOR_INTEL) || (cpu_vendor == CPUVENDOR_AMD))
	    && ((family == 0x06) || (family == 0x0f))) {
		x86_cpuid(0x80000000, descs);
		if (descs[0] >= 0x80000007) {
			x86_cpuid(0x80000007, descs);
			invariant = (descs[3] & CPUID_APM_ITSC) != 0;
		}
	}

	return invariant;
}

/* Setup function pointers for rdtsc() and timecounter(9). */
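/*
 * RDTSC itself is not a serializing instruction and may be reordered
 * with earlier instructions, so each variant selected below issues
 * lfence, mfence or cpuid before reading the counter to keep
 * successive reads ordered.
 */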
void
tsc_setfunc(struct cpu_info *ci)
{
	bool use_lfence, use_mfence;

	use_lfence = use_mfence = false;

	/*
	 * XXX On AMD, we might be able to use lfence for some cases:
	 *   a) if MSR_DE_CFG exists and bit 1 is set.
	 *   b) family == 0x0f or 0x11.  Those have no MSR_DE_CFG and
	 *      lfence is always serializing.
	 *
	 * We don't use it because test results showed mfence was better
	 * than lfence with MSR_DE_CFG.
	 */
	if (cpu_vendor == CPUVENDOR_AMD)
		use_mfence = true;
	else if (cpu_vendor == CPUVENDOR_INTEL)
		use_lfence = true;

	/* LFENCE and MFENCE are applicable only if SSE2 is set. */
	if ((ci->ci_feat_val[0] & CPUID_SSE2) == 0)
		use_lfence = use_mfence = false;

#define TSC_SETFUNC(fence)						\
	do {								\
		rdtsc = rdtsc_##fence;					\
		cpu_counter = cpu_counter_##fence;			\
		cpu_counter32 = cpu_counter32_##fence;			\
	} while (/* CONSTCOND */ 0)

	if (use_lfence)
		TSC_SETFUNC(lfence);
	else if (use_mfence)
		TSC_SETFUNC(mfence);
	else
		TSC_SETFUNC(cpuid);

	aprint_verbose_dev(ci->ci_dev, "Use %s to serialize rdtsc\n",
	    use_lfence ? "lfence" : (use_mfence ? "mfence" : "cpuid"));
}

/*
 * Initialize timecounter(9) and DELAY() function of TSC.
 *
 * This function is called after all secondary processors have been
 * brought up and drift has been measured, and after any other potential
 * delay funcs have been installed (e.g. lapic_delay()).
 */
void
tsc_tc_init(void)
{
	struct cpu_info *ci;
	bool invariant;

	if (!cpu_hascounter())
		return;

	ci = curcpu();
	tsc_freq = ci->ci_data.cpu_cc_freq;
	invariant = tsc_is_invariant();
	if (!invariant) {
		aprint_debug("TSC not known invariant on this CPU\n");
		tsc_timecounter.tc_quality = -100;
	} else if (tsc_drift_observed > tsc_drift_max) {
		aprint_error("ERROR: %lld cycle TSC drift observed\n",
		    (long long)tsc_drift_observed);
		tsc_timecounter.tc_quality = -100;
		invariant = false;
	} else if (vm_guest == VM_GUEST_NO) {
		delay_func = tsc_delay;
	} else if (vm_guest == VM_GUEST_VIRTUALBOX) {
		tsc_timecounter.tc_quality = -100;
	}

	if (tsc_freq != 0) {
		tsc_timecounter.tc_frequency = tsc_freq;
		tc_init(&tsc_timecounter);
	}
}

/*
 * Record drift (in clock cycles).  Called during AP startup.
 */
void
tsc_sync_drift(int64_t drift)
{

	if (drift < 0)
		drift = -drift;
	if (drift > tsc_drift_observed)
		tsc_drift_observed = drift;
}

/*
 * Called during startup of APs, by the boot processor.  Interrupts
 * are disabled on entry.
 */
static void __noinline
tsc_read_bp(struct cpu_info *ci, uint64_t *bptscp, uint64_t *aptscp)
{
	uint64_t bptsc;

	if (atomic_swap_ptr(&tsc_sync_cpu, ci) != NULL) {
		panic("tsc_sync_bp: 1");
	}

	/* Prepare a cache miss for the other side. */
	(void)atomic_swap_uint((void *)&tsc_dummy_cacheline, 0);

	/* Flag our readiness. */
	atomic_or_uint(&ci->ci_flags, CPUF_SYNCTSC);

	/* Wait for other side then read our TSC. */
	while ((ci->ci_flags & CPUF_SYNCTSC) != 0) {
		__insn_barrier();
	}
	bptsc = rdtsc();

	/* Wait for the results to come in. */
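	/*
	 * The AP (tsc_post_ap()) publishes its reading in tsc_sync_val
	 * and then swaps tsc_sync_cpu back to NULL, so once the pointer
	 * no longer names us the AP's result is available.
	 */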
	while (tsc_sync_cpu == ci) {
		x86_pause();
	}
	if (tsc_sync_cpu != NULL) {
		panic("tsc_sync_bp: 2");
	}

	*bptscp = bptsc;
	*aptscp = tsc_sync_val;
}

void
tsc_sync_bp(struct cpu_info *ci)
{
	int64_t bptsc, aptsc, val, diff;

	if (!cpu_hascounter())
		return;

	val = INT64_MAX;
	for (int i = 0; i < TSC_SYNC_ROUNDS; i++) {
		tsc_read_bp(ci, &bptsc, &aptsc);
		diff = bptsc - aptsc;
		if (ABS(diff) < ABS(val)) {
			val = diff;
		}
	}

	ci->ci_data.cpu_cc_skew = val;
}

/*
 * Called during startup of AP, by the AP itself.  Interrupts are
 * disabled on entry.
 */
static void __noinline
tsc_post_ap(struct cpu_info *ci)
{
	uint64_t tsc;

	/* Wait for go-ahead from primary. */
	while ((ci->ci_flags & CPUF_SYNCTSC) == 0) {
		__insn_barrier();
	}

	/* Instruct primary to read its counter. */
	atomic_and_uint(&ci->ci_flags, ~CPUF_SYNCTSC);

	/* Suffer a cache miss, then read TSC. */
	__insn_barrier();
	tsc = tsc_dummy_cacheline;
	__insn_barrier();
	tsc += rdtsc();

	/* Post result.  Ensure the whole value goes out atomically. */
	(void)atomic_swap_64(&tsc_sync_val, tsc);

	if (atomic_swap_ptr(&tsc_sync_cpu, NULL) != ci) {
		panic("tsc_sync_ap");
	}
}

void
tsc_sync_ap(struct cpu_info *ci)
{

	if (!cpu_hascounter())
		return;

	for (int i = 0; i < TSC_SYNC_ROUNDS; i++) {
		tsc_post_ap(ci);
	}
}

static void
tsc_apply_cpu(void *arg1, void *arg2)
{
	bool enable = arg1 != NULL;

	if (enable) {
		lcr4(rcr4() & ~CR4_TSD);
	} else {
		lcr4(rcr4() | CR4_TSD);
	}
}

void
tsc_user_enable(void)
{
	uint64_t xc;

	xc = xc_broadcast(0, tsc_apply_cpu, (void *)true, NULL);
	xc_wait(xc);
}

void
tsc_user_disable(void)
{
	uint64_t xc;

	xc = xc_broadcast(0, tsc_apply_cpu, (void *)false, NULL);
	xc_wait(xc);
}

uint64_t
cpu_frequency(struct cpu_info *ci)
{

	return ci->ci_data.cpu_cc_freq;
}

int
cpu_hascounter(void)
{

	return cpu_feature[0] & CPUID_TSC;
}

static void
tsc_delay(unsigned int us)
{
	uint64_t start, delta;

	start = cpu_counter();
	delta = (uint64_t)us * tsc_freq / 1000000;

	while ((cpu_counter() - start) < delta) {
		x86_pause();
	}
}

static u_int
tsc_get_timecount(struct timecounter *tc)
{
#if defined(_LP64) && defined(DIAGNOSTIC)	/* requires atomic 64-bit store */
	static __cpu_simple_lock_t lock = __SIMPLELOCK_UNLOCKED;
	static int lastwarn;
	uint64_t cur, prev;
	lwp_t *l = curlwp;
	int ticks;

	/*
	 * Previous value must be read before the counter and stored to
	 * after, because this routine can be called from interrupt context
	 * and may run over the top of an existing invocation.  Ordering is
	 * guaranteed by "volatile" on md_tsc.
	 */
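	/*
	 * A counter value that moves backwards between consecutive reads
	 * for the same LWP typically means the LWP observed TSCs on CPUs
	 * whose counters are not synchronized with each other.
	 */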
	prev = l->l_md.md_tsc;
	cur = cpu_counter();
	if (__predict_false(cur < prev) && (cur >> 63) == (prev >> 63) &&
	    __cpu_simple_lock_try(&lock)) {
		ticks = getticks();
		if (ticks - lastwarn >= hz) {
			printf(
			    "WARNING: %s TSC went backwards by %u - "
			    "change sysctl(7) kern.timecounter?\n",
			    cpu_name(curcpu()), (unsigned)(prev - cur));
			lastwarn = ticks;
		}
		__cpu_simple_unlock(&lock);
	}
	l->l_md.md_tsc = cur;
	return (uint32_t)cur;
#else
	return cpu_counter32();
#endif
}

/*
 * tsc has been reset; zero the cached tsc of every lwp in the system
 * so we don't spuriously report that the tsc has gone backward.
 * Caller must ensure all LWPs are quiescent (except the current one,
 * obviously) and interrupts are blocked while we update this.
 */
void
tsc_tc_reset(void)
{
	struct lwp *l;

	LIST_FOREACH(l, &alllwp, l_list)
		l->l_md.md_tsc = 0;
}

#ifdef BOOT_DURATION
/* Returns the kernel boot time in milliseconds. */
uint64_t
boot_duration_timer(void)
{
	KASSERT(curcpu_stable());
	KASSERT(CPU_IS_PRIMARY(curcpu()));
	return (rdtsc() - ((uint64_t)starttsc_hi << 32 | starttsc_lo)) /
	    (curcpu()->ci_data.cpu_cc_freq / 1000);
}
#endif