/*	$NetBSD: x86_tlb.c,v 1.21 2023/12/08 21:46:02 andvar Exp $	*/

/*-
 * Copyright (c) 2008-2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran and Mindaugas Rasiukevicius.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * x86 pmap(9) module: TLB shootdowns.
 *
 * TLB shootdowns are hard interrupts that operate outside the SPL framework.
 * They do not need to be blocked, provided that the pmap module gets the
 * order of events correct.  The calls are made by poking the LAPIC directly.
 * The interrupt handler is short and does one of the following: invalidate
 * a set of pages, all user TLB entries, or the entire TLB.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: x86_tlb.c,v 1.21 2023/12/08 21:46:02 andvar Exp $");

#include <sys/param.h>
#include <sys/kernel.h>

#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/intr.h>
#include <uvm/uvm.h>

#include <machine/cpuvar.h>
#include <machine/pmap_private.h>

#ifdef XENPV
#include <xen/xenpmap.h>
#endif /* XENPV */
#include <x86/i82489reg.h>
#include <x86/i82489var.h>

/*
 * TLB shootdown packet.  Each CPU has a copy of this packet, where we build
 * sets of TLB shootdowns.  If shootdowns need to occur on remote CPUs, the
 * packet is copied into a shared mailbox kept on the initiator's kernel
 * stack.  Once the copy is made, no further updates to the mailbox are made
 * until the request is completed.  This keeps the cache line in the shared
 * state, and bus traffic to a minimum.
 *
 * In order to make maximal use of the available space, control fields are
 * overlaid into the lower 12 bits of the first 4 virtual addresses.  This
 * is very ugly, but it keeps the packet compact.
 *
 * On i386 the packet is 64 bytes in size.  On amd64 it's 128 bytes.  This
 * is sized in concert with UBC_WINSIZE, otherwise excessive shootdown
 * interrupts could be issued.
 */

#define	TP_MAXVA	16		/* for individual mappings */
#define	TP_ALLVA	PAGE_MASK	/* special: shoot all mappings */

typedef struct {
	uintptr_t		tp_store[TP_MAXVA];
} pmap_tlb_packet_t;

#define	TP_COUNT	0
#define	TP_USERPMAP	1
#define	TP_GLOBAL	2
#define	TP_DONE		3

#define	TP_GET_COUNT(tp)	((tp)->tp_store[TP_COUNT] & PAGE_MASK)
#define	TP_GET_USERPMAP(tp)	((tp)->tp_store[TP_USERPMAP] & 1)
#define	TP_GET_GLOBAL(tp)	((tp)->tp_store[TP_GLOBAL] & 1)
#define	TP_GET_DONE(tp)		(atomic_load_relaxed(&(tp)->tp_store[TP_DONE]) & 1)
#define	TP_GET_VA(tp, i)	((tp)->tp_store[(i)] & ~PAGE_MASK)

#define	TP_INC_COUNT(tp)	((tp)->tp_store[TP_COUNT]++)
#define	TP_SET_ALLVA(tp)	((tp)->tp_store[TP_COUNT] |= TP_ALLVA)
#define	TP_SET_VA(tp, c, va)	((tp)->tp_store[(c)] |= ((va) & ~PAGE_MASK))

#define	TP_SET_USERPMAP(tp)	((tp)->tp_store[TP_USERPMAP] |= 1)
#define	TP_SET_GLOBAL(tp)	((tp)->tp_store[TP_GLOBAL] |= 1)
#define	TP_SET_DONE(tp)							     \
	do {								     \
		uintptr_t v = atomic_load_relaxed(&(tp)->tp_store[TP_DONE]); \
		atomic_store_relaxed(&(tp)->tp_store[TP_DONE], v | 1);	     \
	} while (/* CONSTCOND */ 0);

#define	TP_CLEAR(tp)		memset(__UNVOLATILE(tp), 0, sizeof(*(tp)));
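
/*
 * For illustration only (addresses are arbitrary): queueing the three
 * page-aligned VAs 0x1000, 0x2000 and 0x3000 with TP_SET_VA() and
 * TP_INC_COUNT() leaves the packet looking like
 *
 *	tp_store[0] == 0x1003	(first VA in the upper bits, count == 3)
 *	tp_store[1] == 0x2000	(second VA; user-pmap bit clear)
 *	tp_store[2] == 0x3000	(third VA; global bit clear)
 *
 * TP_GET_COUNT() masks with PAGE_MASK to recover the count, while
 * TP_GET_VA() masks it off to recover each address.
 */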

/*
 * TLB shootdown state.
 */
static volatile pmap_tlb_packet_t *volatile pmap_tlb_packet __cacheline_aligned;
static volatile u_int		pmap_tlb_pendcount	__cacheline_aligned;
static struct evcnt		pmap_tlb_evcnt		__cacheline_aligned;

/*
 * TLB shootdown statistics.
 */
#ifdef TLBSTATS
static struct evcnt		tlbstat_local[TLBSHOOT__MAX];
static struct evcnt		tlbstat_remote[TLBSHOOT__MAX];
static struct evcnt		tlbstat_kernel[TLBSHOOT__MAX];
static struct evcnt		tlbstat_single_req;
static struct evcnt		tlbstat_single_issue;
static const char *		tlbstat_name[ ] = {
	"REMOVE_ALL",
	"KENTER",
	"KREMOVE",
	"FREE_PTP",
	"REMOVE_PTE",
	"SYNC_PV",
	"WRITE_PROTECT",
	"ENTER",
	"NVMM",
	"BUS_DMA",
	"BUS_SPACE",
};
#endif

void
pmap_tlb_init(void)
{

	evcnt_attach_dynamic(&pmap_tlb_evcnt, EVCNT_TYPE_INTR,
	    NULL, "TLB", "shootdown");

#ifdef TLBSTATS
	int i;

	for (i = 0; i < TLBSHOOT__MAX; i++) {
		evcnt_attach_dynamic(&tlbstat_local[i], EVCNT_TYPE_MISC,
		    NULL, "tlbshoot local", tlbstat_name[i]);
	}
	for (i = 0; i < TLBSHOOT__MAX; i++) {
		evcnt_attach_dynamic(&tlbstat_remote[i], EVCNT_TYPE_MISC,
		    NULL, "tlbshoot remote", tlbstat_name[i]);
	}
	for (i = 0; i < TLBSHOOT__MAX; i++) {
		evcnt_attach_dynamic(&tlbstat_kernel[i], EVCNT_TYPE_MISC,
		    NULL, "tlbshoot kernel", tlbstat_name[i]);
	}
	evcnt_attach_dynamic(&tlbstat_single_req, EVCNT_TYPE_MISC,
	    NULL, "tlbshoot single page", "requests");
	evcnt_attach_dynamic(&tlbstat_single_issue, EVCNT_TYPE_MISC,
	    NULL, "tlbshoot single page", "issues");
#endif
}

void
pmap_tlb_cpu_init(struct cpu_info *ci)
{
	pmap_tlb_packet_t *tp = (pmap_tlb_packet_t *)ci->ci_pmap_data;

	memset(tp, 0, sizeof(pmap_tlb_packet_t));
	kcpuset_create(&ci->ci_tlb_cpuset, true);
}

static inline void
pmap_tlbstat_count(struct pmap *pm, vaddr_t va, tlbwhy_t why)
{
#ifdef TLBSTATS
	const cpuid_t cid = cpu_index(curcpu());
	bool local = false, remote = false;

	if (va != (vaddr_t)-1LL) {
		atomic_inc_64(&tlbstat_single_req.ev_count);
	}
	if (pm == pmap_kernel()) {
		atomic_inc_64(&tlbstat_kernel[why].ev_count);
		return;
	}

	if (va >= VM_MAXUSER_ADDRESS) {
		remote = kcpuset_isotherset(pm->pm_kernel_cpus, cid);
		local = kcpuset_isset(pm->pm_kernel_cpus, cid);
	}
	remote |= kcpuset_isotherset(pm->pm_cpus, cid);
	local |= kcpuset_isset(pm->pm_cpus, cid);

	if (local) {
		atomic_inc_64(&tlbstat_local[why].ev_count);
	}
	if (remote) {
		atomic_inc_64(&tlbstat_remote[why].ev_count);
	}
#endif
}

static inline void
pmap_tlb_invalidate(volatile pmap_tlb_packet_t *tp)
{
	int i = TP_GET_COUNT(tp);

	/* Find out what we need to invalidate. */
	if (i == TP_ALLVA) {
		if (TP_GET_GLOBAL(tp) != 0) {
			/* Invalidating all TLB entries. */
			tlbflushg();
		} else {
			/* Invalidating non-global TLB entries only. */
			tlbflush();
		}
	} else {
		/* Invalidating a single page or a range of pages. */
		KASSERT(i != 0);
		do {
			--i;
			pmap_update_pg(TP_GET_VA(tp, i));
		} while (i > 0);
	}
}

/*
 * pmap_tlb_shootdown: invalidate a page on all CPUs using pmap 'pm'.
 */
void
pmap_tlb_shootdown(struct pmap *pm, vaddr_t va, pt_entry_t pte, tlbwhy_t why)
{
	pmap_tlb_packet_t *tp;
	struct cpu_info *ci;
	uint8_t count;
	int s;

#ifndef XENPV
	KASSERT((pte & PTE_G) == 0 || pm == pmap_kernel());
#endif

	if (__predict_false(pm->pm_tlb_flush != NULL)) {
		(*pm->pm_tlb_flush)(pm);
		return;
	}

	if ((pte & PTE_PS) != 0) {
		va &= PTE_LGFRAME;
	}

	/*
	 * Add the shootdown operation to our pending set.
	 */
	s = splvm();
	ci = curcpu();
	tp = (pmap_tlb_packet_t *)ci->ci_pmap_data;

	/* Whole address flush will be needed if PTE_G is set. */
	if ((pte & PTE_G) != 0) {
		TP_SET_GLOBAL(tp);
	}
	count = TP_GET_COUNT(tp);

	if (count < TP_MAXVA && va != (vaddr_t)-1LL) {
		/* Flush a single page. */
		TP_SET_VA(tp, count, va);
		TP_INC_COUNT(tp);
	} else {
		/* Flush everything - may already be set. */
		TP_SET_ALLVA(tp);
	}

	if (pm != pmap_kernel()) {
		kcpuset_merge(ci->ci_tlb_cpuset, pm->pm_cpus);
		if (va >= VM_MAXUSER_ADDRESS) {
			kcpuset_merge(ci->ci_tlb_cpuset, pm->pm_kernel_cpus);
		}
		TP_SET_USERPMAP(tp);
	} else {
		kcpuset_copy(ci->ci_tlb_cpuset, kcpuset_running);
	}
	pmap_tlbstat_count(pm, va, why);
	splx(s);
}
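
/*
 * A minimal usage sketch (pm, va, opte and why stand for whatever the
 * caller has at hand).  pmap_tlb_shootnow() must run with preemption
 * disabled; keeping it disabled from the queuing step also keeps the
 * pending set in this CPU's packet:
 *
 *	kpreempt_disable();
 *	pmap_tlb_shootdown(pm, va, opte, why);
 *	pmap_tlb_shootnow();
 *	kpreempt_enable();
 */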

#ifdef XENPV

static inline void
pmap_tlb_processpacket(volatile pmap_tlb_packet_t *tp, kcpuset_t *target)
{
#ifdef MULTIPROCESSOR
	int i = TP_GET_COUNT(tp);

	if (i != TP_ALLVA) {
		/* Invalidating a single page or a range of pages. */
		KASSERT(i != 0);
		do {
			--i;
			xen_mcast_invlpg(TP_GET_VA(tp, i), target);
		} while (i > 0);
	} else {
		xen_mcast_tlbflush(target);
	}

	/* Remote CPUs have been synchronously flushed. */
	pmap_tlb_pendcount = 0;
	pmap_tlb_packet = NULL;
	TP_SET_DONE(tp);
#endif /* MULTIPROCESSOR */
}

#else

static inline void
pmap_tlb_processpacket(volatile pmap_tlb_packet_t *tp, kcpuset_t *target)
{
#ifdef MULTIPROCESSOR
	int err = 0;

	if (!kcpuset_match(target, kcpuset_attached)) {
		const struct cpu_info * const self = curcpu();
		CPU_INFO_ITERATOR cii;
		struct cpu_info *lci;

		for (CPU_INFO_FOREACH(cii, lci)) {
			const cpuid_t lcid = cpu_index(lci);

			if (__predict_false(lci == self) ||
			    !kcpuset_isset(target, lcid)) {
				continue;
			}
			err |= x86_ipi(LAPIC_TLB_VECTOR,
			    lci->ci_cpuid, LAPIC_DLMODE_FIXED);
		}
	} else {
		err = x86_ipi(LAPIC_TLB_VECTOR, LAPIC_DEST_ALLEXCL,
		    LAPIC_DLMODE_FIXED);
	}
	KASSERT(err == 0);
#endif /* MULTIPROCESSOR */
}

#endif /* XENPV */
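
/*
 * Overview of the remote shootdown handshake implemented below: the
 * initiating CPU copies its packet into a stack-based mailbox, installs
 * a pointer to it in pmap_tlb_packet with a compare-and-swap, sets
 * pmap_tlb_pendcount to the number of remote CPUs and sends the IPIs.
 * Each responder copies the packet and decrements the counter; the last
 * one out clears pmap_tlb_packet and sets the packet's TP_DONE flag,
 * which the initiator polls before returning.
 */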

/*
 * pmap_tlb_shootnow: process pending TLB shootdowns queued on current CPU.
 *
 * => Must be called with preemption disabled.
 */
void
pmap_tlb_shootnow(void)
{
	volatile pmap_tlb_packet_t *tp, *ts;
	volatile uint8_t stackbuf[sizeof(*tp) + COHERENCY_UNIT];
	struct cpu_info *ci;
	kcpuset_t *target;
	u_int local, rcpucount;
	cpuid_t cid;
	int s;

	KASSERT(kpreempt_disabled());

	/* Pre-check first. */
	ci = curcpu();
	tp = (pmap_tlb_packet_t *)ci->ci_pmap_data;
	if (TP_GET_COUNT(tp) == 0) {
		return;
	}

	/* An interrupt may have flushed our updates, so check again. */
	s = splvm();
	if (TP_GET_COUNT(tp) == 0) {
		splx(s);
		return;
	}

	cid = cpu_index(ci);
	target = ci->ci_tlb_cpuset;
	local = kcpuset_isset(target, cid) ? 1 : 0;
	rcpucount = kcpuset_countset(target) - local;

	/*
	 * Fast path for local shootdowns only.  Do the shootdowns, and
	 * clear out the buffer for the next user.
	 */
	if (rcpucount == 0) {
		pmap_tlb_invalidate(tp);
		kcpuset_zero(ci->ci_tlb_cpuset);
		TP_CLEAR(tp);
		splx(s);
		return;
	}

	/*
	 * Copy the packet into the stack buffer, and gain ownership of the
	 * global pointer.  We must keep interrupts blocked once we own the
	 * pointer and until the IPIs are triggered, or we could deadlock
	 * against an interrupt on the current CPU trying the same.
	 */
	KASSERT(rcpucount < ncpu);
	ts = (void *)roundup2((uintptr_t)stackbuf, COHERENCY_UNIT);
	*ts = *tp;
	KASSERT(TP_GET_DONE(ts) == 0);
	while (atomic_cas_ptr(&pmap_tlb_packet, NULL,
	    __UNVOLATILE(ts)) != NULL) {
		KASSERT(atomic_load_relaxed(&pmap_tlb_packet) != ts);
		/*
		 * Don't bother with exponential backoff, as the pointer
		 * is in a dedicated cache line and only updated twice per
		 * IPI (in contrast to the pending counter).  The cache
		 * line will spend most of its time in the SHARED state.
		 */
		splx(s);
		do {
			x86_pause();
		} while (atomic_load_relaxed(&pmap_tlb_packet) != NULL);
		s = splvm();

		/*
		 * An interrupt might have done the shootdowns for
		 * us while we spun.
		 */
		if (TP_GET_COUNT(tp) == 0) {
			splx(s);
			return;
		}
	}

	/*
	 * Ownership of the global pointer provides serialization of the
	 * update to the count and the event counter.  With those values
	 * updated, start shootdowns on remote CPUs.
	 */
	pmap_tlb_pendcount = rcpucount;
	pmap_tlb_evcnt.ev_count++;
	pmap_tlb_processpacket(ts, target);

	/*
	 * Clear out the local CPU's buffer for the next user.  Once done,
	 * we can drop the IPL.
	 */
#ifdef TLBSTATS
	if (TP_GET_COUNT(tp) != TP_ALLVA) {
		atomic_add_64(&tlbstat_single_issue.ev_count,
		    TP_GET_COUNT(tp));
	}
#endif
	kcpuset_zero(ci->ci_tlb_cpuset);
	TP_CLEAR(tp);
	splx(s);

	/*
	 * Shootdowns on remote CPUs are now in flight.  In the meantime,
	 * perform local shootdown if needed, using our copy of the packet.
	 */
	if (local) {
		pmap_tlb_invalidate(ts);
	}

	/*
	 * Wait for the updates to be processed by remote CPUs.  Poll the
	 * flag in the packet in order to limit bus traffic (only the last
	 * CPU out will update it and only we are reading it).  No memory
	 * barrier required due to prior stores - yay x86.
	 */
	while (TP_GET_DONE(ts) == 0) {
		x86_pause();
	}
}

/*
 * pmap_tlb_intr: pmap shootdown interrupt handler to invalidate TLB entries.
 *
 * Called from IPI only.  We are outside the SPL framework, with interrupts
 * disabled on the CPU: be careful.
 *
 * TLB flush and the interrupt that brought us here are serializing
 * operations (they defeat speculative execution).  Any speculative load
 * producing a TLB fill between receipt of the interrupt and the TLB flush
 * will load "current" PTEs.  None of the mappings relied on by this ISR for
 * its execution will be changing.  So it's safe to acknowledge the request
 * and allow the initiator to proceed before performing the flush.
 */
void
pmap_tlb_intr(void)
{
	pmap_tlb_packet_t copy;
	volatile pmap_tlb_packet_t *source;
	struct cpu_info *ci;

	/* Make a private copy of the packet. */
	source = pmap_tlb_packet;
	copy = *source;

	/*
	 * If we are the last CPU out, clear the active pointer and mark the
	 * packet as done.  Both can be done without using an atomic, and
	 * the one atomic we do use serves as our memory barrier.
	 *
	 * It's important to clear the active pointer before setting
	 * TP_DONE, to ensure a remote CPU does not exit & re-enter
	 * pmap_tlb_shootnow() only to find its current pointer still
	 * seemingly active.
	 */
	if (atomic_dec_uint_nv(&pmap_tlb_pendcount) == 0) {
		atomic_store_relaxed(&pmap_tlb_packet, NULL);
		__insn_barrier();
		TP_SET_DONE(source);
	}
	pmap_tlb_invalidate(&copy);

	/*
	 * Check the current TLB state.  If we don't want further flushes
	 * for this pmap, then take the CPU out of the pmap's set.  The
	 * order of updates to the set and TLB state must closely align with
	 * the pmap code, as we can interrupt code running in the pmap
	 * module.
	 */
	ci = curcpu();
	if (ci->ci_tlbstate == TLBSTATE_LAZY && TP_GET_USERPMAP(&copy) != 0) {
		kcpuset_atomic_clear(ci->ci_pmap->pm_cpus, cpu_index(ci));
		ci->ci_tlbstate = TLBSTATE_STALE;
	}
}
    534