/*	$NetBSD: x86_tlb.c,v 1.21 2023/12/08 21:46:02 andvar Exp $	*/

/*-
 * Copyright (c) 2008-2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran and Mindaugas Rasiukevicius.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * x86 pmap(9) module: TLB shootdowns.
 *
 * TLB shootdowns are hard interrupts that operate outside the SPL framework.
 * They do not need to be blocked, provided that the pmap module gets the
 * order of events correct.  The calls are made by poking the LAPIC directly.
 * The interrupt handler is short and does one of the following: invalidate
 * a set of pages, all user TLB entries or the entire TLB.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: x86_tlb.c,v 1.21 2023/12/08 21:46:02 andvar Exp $");

#include <sys/param.h>
#include <sys/kernel.h>

#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/intr.h>
#include <uvm/uvm.h>

#include <machine/cpuvar.h>
#include <machine/pmap_private.h>

#ifdef XENPV
#include <xen/xenpmap.h>
#endif /* XENPV */
#include <x86/i82489reg.h>
#include <x86/i82489var.h>

/*
 * TLB shootdown packet.  Each CPU has a copy of this packet, where we build
 * sets of TLB shootdowns.  If shootdowns need to occur on remote CPUs, the
 * packet is copied into a shared mailbox kept on the initiator's kernel
 * stack.  Once the copy is made, no further updates to the mailbox are made
 * until the request is completed.  This keeps the cache line in the shared
 * state, and bus traffic to a minimum.
 *
 * In order to make maximal use of the available space, control fields are
 * overlaid into the lower 12 bits of the first 4 virtual addresses.  This
 * is very ugly, but it counts.
 *
 * On i386 the packet is 64 bytes in size.  On amd64 it's 128 bytes.  This
 * is sized in concert with UBC_WINSIZE, otherwise excessive shootdown
 * interrupts could be issued.
 */
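
/*
 * For illustration, assuming 4KB pages (PAGE_MASK == 0xfff) and
 * page-aligned VAs, a single tp_store[] slot can carry both a VA (in
 * its upper bits) and a control value (in its lower 12 bits) without
 * the two interfering, roughly:
 *
 *	tp_store[0] = (va0 & ~PAGE_MASK) | count;
 *	count = tp_store[0] & PAGE_MASK;
 *	va0   = tp_store[0] & ~PAGE_MASK;
 *
 * The TP_*() macros below implement this split for the VA count, the
 * user-pmap flag, the global flag and the done flag.
 */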

#define	TP_MAXVA	16		/* for individual mappings */
#define	TP_ALLVA	PAGE_MASK	/* special: shoot all mappings */

typedef struct {
	uintptr_t		tp_store[TP_MAXVA];
} pmap_tlb_packet_t;

#define	TP_COUNT	0
#define	TP_USERPMAP	1
#define	TP_GLOBAL	2
#define	TP_DONE		3

#define	TP_GET_COUNT(tp)	((tp)->tp_store[TP_COUNT] & PAGE_MASK)
#define	TP_GET_USERPMAP(tp)	((tp)->tp_store[TP_USERPMAP] & 1)
#define	TP_GET_GLOBAL(tp)	((tp)->tp_store[TP_GLOBAL] & 1)
#define	TP_GET_DONE(tp)	(atomic_load_relaxed(&(tp)->tp_store[TP_DONE]) & 1)
#define	TP_GET_VA(tp, i)	((tp)->tp_store[(i)] & ~PAGE_MASK)

#define	TP_INC_COUNT(tp)	((tp)->tp_store[TP_COUNT]++)
#define	TP_SET_ALLVA(tp)	((tp)->tp_store[TP_COUNT] |= TP_ALLVA)
#define	TP_SET_VA(tp, c, va)	((tp)->tp_store[(c)] |= ((va) & ~PAGE_MASK))

#define	TP_SET_USERPMAP(tp)	((tp)->tp_store[TP_USERPMAP] |= 1)
#define	TP_SET_GLOBAL(tp)	((tp)->tp_store[TP_GLOBAL] |= 1)
#define	TP_SET_DONE(tp)						\
do { \
	uintptr_t v = atomic_load_relaxed(&(tp)->tp_store[TP_DONE]); \
	atomic_store_relaxed(&(tp)->tp_store[TP_DONE], v | 1); \
} while (/* CONSTCOND */ 0);

#define	TP_CLEAR(tp)	memset(__UNVOLATILE(tp), 0, sizeof(*(tp)));

/*
 * TLB shootdown state.
 */
static volatile pmap_tlb_packet_t *volatile pmap_tlb_packet __cacheline_aligned;
static volatile u_int		pmap_tlb_pendcount	__cacheline_aligned;
static struct evcnt		pmap_tlb_evcnt		__cacheline_aligned;

/*
 * TLB shootdown statistics.
 */
#ifdef TLBSTATS
static struct evcnt		tlbstat_local[TLBSHOOT__MAX];
static struct evcnt		tlbstat_remote[TLBSHOOT__MAX];
static struct evcnt		tlbstat_kernel[TLBSHOOT__MAX];
static struct evcnt		tlbstat_single_req;
static struct evcnt		tlbstat_single_issue;
static const char *		tlbstat_name[ ] = {
	"REMOVE_ALL",
	"KENTER",
	"KREMOVE",
	"FREE_PTP",
	"REMOVE_PTE",
	"SYNC_PV",
	"WRITE_PROTECT",
	"ENTER",
	"NVMM",
	"BUS_DMA",
	"BUS_SPACE",
};
#endif

void
pmap_tlb_init(void)
{

	evcnt_attach_dynamic(&pmap_tlb_evcnt, EVCNT_TYPE_INTR,
	    NULL, "TLB", "shootdown");

#ifdef TLBSTATS
	int i;

	for (i = 0; i < TLBSHOOT__MAX; i++) {
		evcnt_attach_dynamic(&tlbstat_local[i], EVCNT_TYPE_MISC,
		    NULL, "tlbshoot local", tlbstat_name[i]);
	}
	for (i = 0; i < TLBSHOOT__MAX; i++) {
		evcnt_attach_dynamic(&tlbstat_remote[i], EVCNT_TYPE_MISC,
		    NULL, "tlbshoot remote", tlbstat_name[i]);
	}
	for (i = 0; i < TLBSHOOT__MAX; i++) {
		evcnt_attach_dynamic(&tlbstat_kernel[i], EVCNT_TYPE_MISC,
		    NULL, "tlbshoot kernel", tlbstat_name[i]);
	}
	evcnt_attach_dynamic(&tlbstat_single_req, EVCNT_TYPE_MISC,
	    NULL, "tlbshoot single page", "requests");
	evcnt_attach_dynamic(&tlbstat_single_issue, EVCNT_TYPE_MISC,
	    NULL, "tlbshoot single page", "issues");
#endif
}

void
pmap_tlb_cpu_init(struct cpu_info *ci)
{
	pmap_tlb_packet_t *tp = (pmap_tlb_packet_t *)ci->ci_pmap_data;

	memset(tp, 0, sizeof(pmap_tlb_packet_t));
	kcpuset_create(&ci->ci_tlb_cpuset, true);
}

static inline void
pmap_tlbstat_count(struct pmap *pm, vaddr_t va, tlbwhy_t why)
{
#ifdef TLBSTATS
	const cpuid_t cid = cpu_index(curcpu());
	bool local = false, remote = false;

	if (va != (vaddr_t)-1LL) {
		atomic_inc_64(&tlbstat_single_req.ev_count);
	}
	if (pm == pmap_kernel()) {
		atomic_inc_64(&tlbstat_kernel[why].ev_count);
		return;
	}

	if (va >= VM_MAXUSER_ADDRESS) {
		remote = kcpuset_isotherset(pm->pm_kernel_cpus, cid);
		local = kcpuset_isset(pm->pm_kernel_cpus, cid);
	}
	remote |= kcpuset_isotherset(pm->pm_cpus, cid);
	local |= kcpuset_isset(pm->pm_cpus, cid);

	if (local) {
		atomic_inc_64(&tlbstat_local[why].ev_count);
	}
	if (remote) {
		atomic_inc_64(&tlbstat_remote[why].ev_count);
	}
#endif
}

static inline void
pmap_tlb_invalidate(volatile pmap_tlb_packet_t *tp)
{
	int i = TP_GET_COUNT(tp);

	/* Find out what we need to invalidate. */
	if (i == TP_ALLVA) {
		if (TP_GET_GLOBAL(tp) != 0) {
			/* Invalidating all TLB entries. */
			tlbflushg();
		} else {
			/* Invalidating non-global TLB entries only. */
			tlbflush();
		}
	} else {
		/* Invalidating a single page or a range of pages. */
		KASSERT(i != 0);
		do {
			--i;
			pmap_update_pg(TP_GET_VA(tp, i));
		} while (i > 0);
	}
}

/*
 * pmap_tlb_shootdown: invalidate a page on all CPUs using pmap 'pm'.
 */
void
pmap_tlb_shootdown(struct pmap *pm, vaddr_t va, pt_entry_t pte, tlbwhy_t why)
{
	pmap_tlb_packet_t *tp;
	struct cpu_info *ci;
	uint8_t count;
	int s;

#ifndef XENPV
	KASSERT((pte & PTE_G) == 0 || pm == pmap_kernel());
#endif

	if (__predict_false(pm->pm_tlb_flush != NULL)) {
		(*pm->pm_tlb_flush)(pm);
		return;
	}

	if ((pte & PTE_PS) != 0) {
		va &= PTE_LGFRAME;
	}

	/*
	 * Add the shootdown operation to our pending set.
	 */
	s = splvm();
	ci = curcpu();
	tp = (pmap_tlb_packet_t *)ci->ci_pmap_data;

	/* Whole address flush will be needed if PTE_G is set. */
	if ((pte & PTE_G) != 0) {
		TP_SET_GLOBAL(tp);
	}
	count = TP_GET_COUNT(tp);

	if (count < TP_MAXVA && va != (vaddr_t)-1LL) {
		/* Flush a single page. */
		TP_SET_VA(tp, count, va);
		TP_INC_COUNT(tp);
	} else {
		/* Flush everything - may already be set. */
		TP_SET_ALLVA(tp);
	}

	if (pm != pmap_kernel()) {
		kcpuset_merge(ci->ci_tlb_cpuset, pm->pm_cpus);
		if (va >= VM_MAXUSER_ADDRESS) {
			kcpuset_merge(ci->ci_tlb_cpuset, pm->pm_kernel_cpus);
		}
		TP_SET_USERPMAP(tp);
	} else {
		kcpuset_copy(ci->ci_tlb_cpuset, kcpuset_running);
	}
	pmap_tlbstat_count(pm, va, why);
	splx(s);
}
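
/*
 * Illustrative caller pattern (a sketch, not taken from this file; the
 * TLBSHOOT_REMOVE_PTE reason and the surrounding locking are assumed):
 * the pmap module queues one shootdown per changed PTE and then drains
 * the queue once, roughly:
 *
 *	kpreempt_disable();
 *	... update one or more PTEs of 'pm' ...
 *	pmap_tlb_shootdown(pm, va, opte, TLBSHOOT_REMOVE_PTE);
 *	pmap_tlb_shootnow();
 *	kpreempt_enable();
 *
 * pmap_tlb_shootdown() only batches work into the per-CPU packet;
 * nothing is invalidated until pmap_tlb_shootnow() drains it.
 */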

#ifdef XENPV

static inline void
pmap_tlb_processpacket(volatile pmap_tlb_packet_t *tp, kcpuset_t *target)
{
#ifdef MULTIPROCESSOR
	int i = TP_GET_COUNT(tp);

	if (i != TP_ALLVA) {
		/* Invalidating a single page or a range of pages. */
		KASSERT(i != 0);
		do {
			--i;
			xen_mcast_invlpg(TP_GET_VA(tp, i), target);
		} while (i > 0);
	} else {
		xen_mcast_tlbflush(target);
	}

	/* Remote CPUs have been synchronously flushed. */
	pmap_tlb_pendcount = 0;
	pmap_tlb_packet = NULL;
	TP_SET_DONE(tp);
#endif /* MULTIPROCESSOR */
}

#else

static inline void
pmap_tlb_processpacket(volatile pmap_tlb_packet_t *tp, kcpuset_t *target)
{
#ifdef MULTIPROCESSOR
	int err = 0;

	if (!kcpuset_match(target, kcpuset_attached)) {
		const struct cpu_info * const self = curcpu();
		CPU_INFO_ITERATOR cii;
		struct cpu_info *lci;

		for (CPU_INFO_FOREACH(cii, lci)) {
			const cpuid_t lcid = cpu_index(lci);

			if (__predict_false(lci == self) ||
			    !kcpuset_isset(target, lcid)) {
				continue;
			}
			err |= x86_ipi(LAPIC_TLB_VECTOR,
			    lci->ci_cpuid, LAPIC_DLMODE_FIXED);
		}
	} else {
		err = x86_ipi(LAPIC_TLB_VECTOR, LAPIC_DEST_ALLEXCL,
		    LAPIC_DLMODE_FIXED);
	}
	KASSERT(err == 0);
#endif /* MULTIPROCESSOR */
}

#endif /* XENPV */
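
/*
 * Outline of the mailbox handshake between pmap_tlb_shootnow() and
 * pmap_tlb_intr() below (the XENPV variant above instead completes
 * synchronously):
 *
 *	Initiator:
 *	1. copy the per-CPU packet into a stack buffer;
 *	2. atomic_cas_ptr() pmap_tlb_packet from NULL to that buffer,
 *	   spinning while another initiator owns the pointer;
 *	3. set pmap_tlb_pendcount and send the IPIs;
 *	4. perform any local invalidation, then spin until TP_GET_DONE().
 *
 *	Each responder, in pmap_tlb_intr():
 *	1. copy the shared packet to a private buffer;
 *	2. decrement pmap_tlb_pendcount; the last CPU out clears
 *	   pmap_tlb_packet and then sets TP_DONE;
 *	3. invalidate from the private copy.
 */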

/*
 * pmap_tlb_shootnow: process pending TLB shootdowns queued on current CPU.
 *
 * => Must be called with preemption disabled.
 */
void
pmap_tlb_shootnow(void)
{
	volatile pmap_tlb_packet_t *tp, *ts;
	volatile uint8_t stackbuf[sizeof(*tp) + COHERENCY_UNIT];
	struct cpu_info *ci;
	kcpuset_t *target;
	u_int local, rcpucount;
	cpuid_t cid;
	int s;

	KASSERT(kpreempt_disabled());

	/* Pre-check first. */
	ci = curcpu();
	tp = (pmap_tlb_packet_t *)ci->ci_pmap_data;
	if (TP_GET_COUNT(tp) == 0) {
		return;
	}

	/* An interrupt may have flushed our updates, so check again. */
	s = splvm();
	if (TP_GET_COUNT(tp) == 0) {
		splx(s);
		return;
	}

	cid = cpu_index(ci);
	target = ci->ci_tlb_cpuset;
	local = kcpuset_isset(target, cid) ? 1 : 0;
	rcpucount = kcpuset_countset(target) - local;

	/*
	 * Fast path for local shootdowns only.  Do the shootdowns, and
	 * clear out the buffer for the next user.
	 */
	if (rcpucount == 0) {
		pmap_tlb_invalidate(tp);
		kcpuset_zero(ci->ci_tlb_cpuset);
		TP_CLEAR(tp);
		splx(s);
		return;
	}

	/*
	 * Copy the packet into the stack buffer, and gain ownership of the
	 * global pointer.  We must keep interrupts blocked once we own the
	 * pointer and until the IPIs are triggered, or we could deadlock
	 * against an interrupt on the current CPU trying the same.
	 */
	KASSERT(rcpucount < ncpu);
	ts = (void *)roundup2((uintptr_t)stackbuf, COHERENCY_UNIT);
	*ts = *tp;
	KASSERT(TP_GET_DONE(ts) == 0);
	while (atomic_cas_ptr(&pmap_tlb_packet, NULL,
	    __UNVOLATILE(ts)) != NULL) {
		KASSERT(atomic_load_relaxed(&pmap_tlb_packet) != ts);
		/*
		 * Don't bother with exponential backoff, as the pointer
		 * is in a dedicated cache line and only updated twice per
		 * IPI (in contrast to the pending counter).  The cache
		 * line will spend most of its time in the SHARED state.
		 */
		splx(s);
		do {
			x86_pause();
		} while (atomic_load_relaxed(&pmap_tlb_packet) != NULL);
		s = splvm();

		/*
		 * An interrupt might have done the shootdowns for
		 * us while we spun.
		 */
		if (TP_GET_COUNT(tp) == 0) {
			splx(s);
			return;
		}
	}

	/*
	 * Ownership of the global pointer provides serialization of the
	 * update to the count and the event counter.  With those values
	 * updated, start shootdowns on remote CPUs.
	 */
	pmap_tlb_pendcount = rcpucount;
	pmap_tlb_evcnt.ev_count++;
	pmap_tlb_processpacket(ts, target);

	/*
	 * Clear out the local CPU's buffer for the next user.  Once done,
	 * we can drop the IPL.
	 */
#ifdef TLBSTATS
	if (TP_GET_COUNT(tp) != TP_ALLVA) {
		atomic_add_64(&tlbstat_single_issue.ev_count,
		    TP_GET_COUNT(tp));
	}
#endif
	kcpuset_zero(ci->ci_tlb_cpuset);
	TP_CLEAR(tp);
	splx(s);

	/*
	 * Shootdowns on remote CPUs are now in flight.  In the meantime,
	 * perform local shootdown if needed, using our copy of the packet.
	 */
	if (local) {
		pmap_tlb_invalidate(ts);
	}

	/*
	 * Wait for the updates to be processed by remote CPUs.  Poll the
	 * flag in the packet in order to limit bus traffic (only the last
	 * CPU out will update it and only we are reading it).  No memory
	 * barrier required due to prior stores - yay x86.
	 */
	while (TP_GET_DONE(ts) == 0) {
		x86_pause();
	}
}

/*
 * pmap_tlb_intr: pmap shootdown interrupt handler to invalidate TLB entries.
 *
 * Called from IPI only.  We are outside the SPL framework, with interrupts
 * disabled on the CPU: be careful.
 *
 * TLB flush and the interrupt that brought us here are serializing
 * operations (they defeat speculative execution).  Any speculative load
 * producing a TLB fill between receipt of the interrupt and the TLB flush
 * will load "current" PTEs.  None of the mappings relied on by this ISR for
 * its execution will be changing.  So it's safe to acknowledge the request
 * and allow the initiator to proceed before performing the flush.
 */
void
pmap_tlb_intr(void)
{
	pmap_tlb_packet_t copy;
	volatile pmap_tlb_packet_t *source;
	struct cpu_info *ci;

	/* Make a private copy of the packet. */
	source = pmap_tlb_packet;
	copy = *source;

	/*
	 * If we are the last CPU out, clear the active pointer and mark the
	 * packet as done.  Both can be done without using an atomic, and
	 * the one atomic we do use serves as our memory barrier.
	 *
	 * It's important to clear the active pointer before setting
	 * TP_DONE, to ensure a remote CPU does not exit & re-enter
	 * pmap_tlb_shootnow() only to find its current pointer still
	 * seemingly active.
	 */
	if (atomic_dec_uint_nv(&pmap_tlb_pendcount) == 0) {
		atomic_store_relaxed(&pmap_tlb_packet, NULL);
		__insn_barrier();
		TP_SET_DONE(source);
	}
	pmap_tlb_invalidate(&copy);

	/*
	 * Check the current TLB state.  If we don't want further flushes
	 * for this pmap, then take the CPU out of the pmap's set.  The
	 * order of updates to the set and TLB state must closely align with
	 * the pmap code, as we can interrupt code running in the pmap
	 * module.
	 */
	ci = curcpu();
	if (ci->ci_tlbstate == TLBSTATE_LAZY && TP_GET_USERPMAP(&copy) != 0) {
		kcpuset_atomic_clear(ci->ci_pmap->pm_cpus, cpu_index(ci));
		ci->ci_tlbstate = TLBSTATE_STALE;
	}
}