      1 /*	$NetBSD: uvm_pdpolicy_clock.c,v 1.42 2025/05/20 10:22:27 bouyer Exp $	*/
      2 /*	NetBSD: uvm_pdaemon.c,v 1.72 2006/01/05 10:47:33 yamt Exp $	*/
      3 
      4 /*-
      5  * Copyright (c) 2019, 2020 The NetBSD Foundation, Inc.
      6  * All rights reserved.
      7  *
      8  * This code is derived from software contributed to The NetBSD Foundation
      9  * by Andrew Doran.
     10  *
     11  * Redistribution and use in source and binary forms, with or without
     12  * modification, are permitted provided that the following conditions
     13  * are met:
     14  * 1. Redistributions of source code must retain the above copyright
     15  *    notice, this list of conditions and the following disclaimer.
     16  * 2. Redistributions in binary form must reproduce the above copyright
     17  *    notice, this list of conditions and the following disclaimer in the
     18  *    documentation and/or other materials provided with the distribution.
     19  *
     20  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     22  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     24  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     30  * POSSIBILITY OF SUCH DAMAGE.
     31  */
     32 
     33 /*
     34  * Copyright (c) 1997 Charles D. Cranor and Washington University.
     35  * Copyright (c) 1991, 1993, The Regents of the University of California.
     36  *
     37  * All rights reserved.
     38  *
     39  * This code is derived from software contributed to Berkeley by
     40  * The Mach Operating System project at Carnegie-Mellon University.
     41  *
     42  * Redistribution and use in source and binary forms, with or without
     43  * modification, are permitted provided that the following conditions
     44  * are met:
     45  * 1. Redistributions of source code must retain the above copyright
     46  *    notice, this list of conditions and the following disclaimer.
     47  * 2. Redistributions in binary form must reproduce the above copyright
     48  *    notice, this list of conditions and the following disclaimer in the
     49  *    documentation and/or other materials provided with the distribution.
     50  * 3. Neither the name of the University nor the names of its contributors
     51  *    may be used to endorse or promote products derived from this software
     52  *    without specific prior written permission.
     53  *
     54  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     55  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     56  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     57  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     58  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     59  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     60  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     61  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     62  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     63  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     64  * SUCH DAMAGE.
     65  *
     66  *	@(#)vm_pageout.c        8.5 (Berkeley) 2/14/94
     67  * from: Id: uvm_pdaemon.c,v 1.1.2.32 1998/02/06 05:26:30 chs Exp
     68  *
     69  *
     70  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
     71  * All rights reserved.
     72  *
     73  * Permission to use, copy, modify and distribute this software and
     74  * its documentation is hereby granted, provided that both the copyright
     75  * notice and this permission notice appear in all copies of the
     76  * software, derivative works or modified versions, and any portions
     77  * thereof, and that both notices appear in supporting documentation.
     78  *
     79  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     80  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     81  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     82  *
     83  * Carnegie Mellon requests users of this software to return to
     84  *
      85  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
     86  *  School of Computer Science
     87  *  Carnegie Mellon University
     88  *  Pittsburgh PA 15213-3890
     89  *
     90  * any improvements or extensions that they make and grant Carnegie the
     91  * rights to redistribute these changes.
     92  */
     93 
     94 #if defined(PDSIM)
     95 
     96 #include "pdsim.h"
     97 
     98 #else /* defined(PDSIM) */
     99 
    100 #include <sys/cdefs.h>
    101 __KERNEL_RCSID(0, "$NetBSD: uvm_pdpolicy_clock.c,v 1.42 2025/05/20 10:22:27 bouyer Exp $");
    102 
    103 #include <sys/param.h>
    104 #include <sys/proc.h>
    105 #include <sys/systm.h>
    106 #include <sys/kernel.h>
    107 #include <sys/kmem.h>
    108 #include <sys/atomic.h>
    109 
    110 #include <uvm/uvm.h>
    111 #include <uvm/uvm_pdpolicy.h>
    112 #include <uvm/uvm_pdpolicy_impl.h>
    113 #include <uvm/uvm_stat.h>
    114 
    115 #endif /* defined(PDSIM) */
    116 
    117 /*
    118  * per-CPU queue of pending page status changes.  128 entries makes for a
    119  * 1kB queue on _LP64 and has been found to be a reasonable compromise that
    120  * keeps lock contention events and wait times low, while not using too much
    121  * memory nor allowing global state to fall too far behind.
    122  */
    123 #if !defined(CLOCK_PDQ_SIZE)
    124 #define	CLOCK_PDQ_SIZE	128
    125 #endif /* !defined(CLOCK_PDQ_SIZE) */
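
         /*
          * For illustration: the "1kB on _LP64" figure above follows from
          * the default size, since each queue entry is a pointer to a
          * struct vm_page:
          *
          *	CLOCK_PDQ_SIZE * sizeof(struct vm_page *)
          *	    = 128 * 8 bytes = 1024 bytes per CPU on _LP64
          *	    (128 * 4 bytes = 512 bytes on 32-bit platforms).
          */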
    126 
    127 #define PQ_INACTIVE	0x00000010	/* page is in inactive list */
    128 #define PQ_ACTIVE	0x00000020	/* page is in active list */
    129 
    130 #if !defined(CLOCK_INACTIVEPCT)
    131 #define	CLOCK_INACTIVEPCT	33
    132 #endif /* !defined(CLOCK_INACTIVEPCT) */
    133 
    134 struct uvmpdpol_globalstate {
    135 	kmutex_t lock;			/* lock on state */
    136 					/* <= compiler pads here */
    137 	struct pglist s_activeq		/* allocated pages, in use */
    138 	    __aligned(COHERENCY_UNIT);
    139 	struct pglist s_inactiveq;	/* pages between the clock hands */
    140 	int s_active;
    141 	int s_inactive;
    142 	int s_inactarg;
    143 	struct uvm_pctparam s_anonmin;
    144 	struct uvm_pctparam s_filemin;
    145 	struct uvm_pctparam s_execmin;
    146 	struct uvm_pctparam s_anonmax;
    147 	struct uvm_pctparam s_filemax;
    148 	struct uvm_pctparam s_execmax;
    149 	struct uvm_pctparam s_inactivepct;
    150 };
    151 
    152 struct uvmpdpol_scanstate {
    153 	bool ss_anonreact, ss_filereact, ss_execreact;
    154 	struct vm_page ss_marker;
    155 };
    156 
    157 static void	uvmpdpol_pageactivate_locked(struct vm_page *);
    158 static void	uvmpdpol_pagedeactivate_locked(struct vm_page *);
    159 static void	uvmpdpol_pagedequeue_locked(struct vm_page *);
    160 static bool	uvmpdpol_pagerealize_locked(struct vm_page *);
    161 static struct uvm_cpu *uvmpdpol_flush(void);
    162 
    163 static struct uvmpdpol_globalstate pdpol_state __cacheline_aligned;
    164 static struct uvmpdpol_scanstate pdpol_scanstate;
    165 
    166 PDPOL_EVCNT_DEFINE(reactexec)
    167 PDPOL_EVCNT_DEFINE(reactfile)
    168 PDPOL_EVCNT_DEFINE(reactanon)
    169 
    170 static void
    171 clock_tune(void)
    172 {
    173 	struct uvmpdpol_globalstate *s = &pdpol_state;
    174 
    175 	s->s_inactarg = UVM_PCTPARAM_APPLY(&s->s_inactivepct,
    176 	    s->s_active + s->s_inactive);
    177 	if (s->s_inactarg <= uvmexp.freetarg) {
    178 		s->s_inactarg = uvmexp.freetarg + 1;
    179 	}
    180 }
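
         /*
          * Worked example: with the default CLOCK_INACTIVEPCT of 33, a
          * system with 60000 active and 30000 inactive pages gets an
          * inactive target of roughly
          *
          *	s_inactarg = 33% of (60000 + 30000) = 29700 pages,
          *
          * and the target is raised to uvmexp.freetarg + 1 whenever the
          * percentage would put it at or below the free-page target.
          */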
    181 
    182 void
    183 uvmpdpol_scaninit(void)
    184 {
    185 	struct uvmpdpol_globalstate *s = &pdpol_state;
    186 	struct uvmpdpol_scanstate *ss = &pdpol_scanstate;
    187 	int t;
    188 	bool anonunder, fileunder, execunder;
    189 	bool anonover, fileover, execover;
    190 	bool anonreact, filereact, execreact;
    191 	int64_t freepg, anonpg, filepg, execpg;
    192 
    193 	/*
    194 	 * decide which types of pages we want to reactivate instead of freeing
    195 	 * to keep usage within the minimum and maximum usage limits.
    196 	 * uvm_availmem() will sync the counters.
    197 	 */
    198 
    199 	freepg = uvm_availmem(false);
    200 	anonpg = cpu_count_get(CPU_COUNT_ANONCLEAN) +
    201 	    cpu_count_get(CPU_COUNT_ANONDIRTY) +
    202 	    cpu_count_get(CPU_COUNT_ANONUNKNOWN);
    203 	execpg = cpu_count_get(CPU_COUNT_EXECPAGES);
    204 	filepg = cpu_count_get(CPU_COUNT_FILECLEAN) +
    205 	    cpu_count_get(CPU_COUNT_FILEDIRTY) +
    206 	    cpu_count_get(CPU_COUNT_FILEUNKNOWN) -
    207 	    execpg;
    208 
    209 	mutex_enter(&s->lock);
    210 	t = s->s_active + s->s_inactive + freepg;
    211 	anonunder = anonpg <= UVM_PCTPARAM_APPLY(&s->s_anonmin, t);
    212 	fileunder = filepg <= UVM_PCTPARAM_APPLY(&s->s_filemin, t);
    213 	execunder = execpg <= UVM_PCTPARAM_APPLY(&s->s_execmin, t);
    214 	anonover = anonpg > UVM_PCTPARAM_APPLY(&s->s_anonmax, t);
    215 	fileover = filepg > UVM_PCTPARAM_APPLY(&s->s_filemax, t);
    216 	execover = execpg > UVM_PCTPARAM_APPLY(&s->s_execmax, t);
    217 	anonreact = anonunder || (!anonover && (fileover || execover));
    218 	filereact = fileunder || (!fileover && (anonover || execover));
    219 	execreact = execunder || (!execover && (anonover || fileover));
    220 	if (filereact && execreact && (anonreact || uvm_swapisfull())) {
    221 		anonreact = filereact = execreact = false;
    222 	}
    223 	ss->ss_anonreact = anonreact;
    224 	ss->ss_filereact = filereact;
    225 	ss->ss_execreact = execreact;
    226 	memset(&ss->ss_marker, 0, sizeof(ss->ss_marker));
    227 	ss->ss_marker.flags = PG_MARKER;
    228 	TAILQ_INSERT_HEAD(&pdpol_state.s_inactiveq, &ss->ss_marker, pdqueue);
    229 	mutex_exit(&s->lock);
    230 }
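
         /*
          * Worked example of the decision above: suppose file pages are
          * over s_filemax while anonymous pages are under s_anonmin and
          * exec pages sit between their limits.  Then
          *
          *	anonreact = anonunder -> true
          *	filereact = fileunder || (!fileover && ...) -> false
          *	execreact = execunder || (!execover && (anonover || fileover))
          *	          -> true
          *
          * so the scan will reactivate anon and exec pages found on the
          * inactive queue and only consider file pages for freeing.
          */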
    231 
    232 void
    233 uvmpdpol_scanfini(void)
    234 {
    235 	struct uvmpdpol_globalstate *s = &pdpol_state;
    236 	struct uvmpdpol_scanstate *ss = &pdpol_scanstate;
    237 
    238 	mutex_enter(&s->lock);
    239 	TAILQ_REMOVE(&pdpol_state.s_inactiveq, &ss->ss_marker, pdqueue);
    240 	mutex_exit(&s->lock);
    241 }
    242 
    243 struct vm_page *
    244 uvmpdpol_selectvictim(krwlock_t **plock)
    245 {
    246 	struct uvmpdpol_globalstate *s = &pdpol_state;
    247 	struct uvmpdpol_scanstate *ss = &pdpol_scanstate;
    248 	struct vm_page *pg;
    249 	krwlock_t *lock;
    250 
    251 	mutex_enter(&s->lock);
    252 	while (/* CONSTCOND */ 1) {
    253 		struct vm_anon *anon;
    254 		struct uvm_object *uobj;
    255 
    256 		pg = TAILQ_NEXT(&ss->ss_marker, pdqueue);
    257 		if (pg == NULL) {
    258 			break;
    259 		}
    260 		KASSERT((pg->flags & PG_MARKER) == 0);
    261 		uvmexp.pdscans++;
    262 
    263 		/*
    264 		 * acquire interlock to stabilize page identity.
    265 		 * if we have caught the page in a state of flux
    266 		 * deal with it and retry.
    267 		 */
    268 		mutex_enter(&pg->interlock);
    269 		if (uvmpdpol_pagerealize_locked(pg)) {
    270 			mutex_exit(&pg->interlock);
    271 			continue;
    272 		}
    273 
    274 		/*
    275 		 * now prepare to move on to the next page.
    276 		 */
    277 		TAILQ_REMOVE(&pdpol_state.s_inactiveq, &ss->ss_marker,
    278 		    pdqueue);
    279 		TAILQ_INSERT_AFTER(&pdpol_state.s_inactiveq, pg,
    280 		    &ss->ss_marker, pdqueue);
    281 
    282 		/*
    283 		 * enforce the minimum thresholds on different
    284 		 * types of memory usage.  if reusing the current
    285 		 * page would reduce that type of usage below its
    286 		 * minimum, reactivate the page instead and move
    287 		 * on to the next page.
    288 		 */
    289 		anon = pg->uanon;
    290 		uobj = pg->uobject;
    291 		if (uobj && UVM_OBJ_IS_VTEXT(uobj) && ss->ss_execreact) {
    292 			uvmpdpol_pageactivate_locked(pg);
    293 			mutex_exit(&pg->interlock);
    294 			PDPOL_EVCNT_INCR(reactexec);
    295 			continue;
    296 		}
    297 		if (uobj && UVM_OBJ_IS_VNODE(uobj) &&
    298 		    !UVM_OBJ_IS_VTEXT(uobj) && ss->ss_filereact) {
    299 			uvmpdpol_pageactivate_locked(pg);
    300 			mutex_exit(&pg->interlock);
    301 			PDPOL_EVCNT_INCR(reactfile);
    302 			continue;
    303 		}
    304 		if ((anon || UVM_OBJ_IS_AOBJ(uobj)) && ss->ss_anonreact) {
    305 			uvmpdpol_pageactivate_locked(pg);
    306 			mutex_exit(&pg->interlock);
    307 			PDPOL_EVCNT_INCR(reactanon);
    308 			continue;
    309 		}
    310 
    311 		/*
    312 		 * try to lock the object that owns the page.
    313 		 *
    314 		 * with the page interlock held, we can drop s->lock, which
    315 		 * could otherwise serve as a barrier to us getting the
    316 		 * object locked, because the owner of the object's lock may
    317 		 * be blocked on s->lock (i.e. a deadlock).
    318 		 *
    319 		 * whatever happens, uvmpd_trylockowner() will release the
    320 		 * interlock.  with the interlock dropped we can then
    321 		 * re-acquire our own lock.  the order is:
    322 		 *
    323 		 *	object -> pdpol -> interlock.
     324 		 */
     325 		mutex_exit(&s->lock);
     326 		lock = uvmpd_trylockowner(pg);
     327 		/* pg->interlock now released */
     328 		mutex_enter(&s->lock);
    329 		if (lock == NULL) {
    330 			/* didn't get it - try the next page. */
    331 			continue;
    332 		}
    333 
    334 		/*
    335 		 * move referenced pages back to active queue and skip to
    336 		 * next page.
    337 		 */
    338 		if (pmap_is_referenced(pg)) {
    339 			mutex_enter(&pg->interlock);
    340 			uvmpdpol_pageactivate_locked(pg);
    341 			mutex_exit(&pg->interlock);
    342 			uvmexp.pdreact++;
    343 			rw_exit(lock);
    344 			continue;
    345 		}
    346 
    347 		/* we have a potential victim. */
    348 		*plock = lock;
    349 		break;
    350 	}
    351 	mutex_exit(&s->lock);
    352 	return pg;
    353 }
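
         /*
          * A rough sketch of the expected calling pattern for the scan
          * interface above; the real scan loop lives in uvm_pdaemon.c and
          * handles more cases, stopping once its free-page targets are met:
          *
          *	krwlock_t *lock;
          *	struct vm_page *pg;
          *
          *	uvmpdpol_scaninit();
          *	while ((pg = uvmpdpol_selectvictim(&lock)) != NULL) {
          *		(pg's owner lock -- uobject or anon -- is held here)
          *		... clean or free pg, then drop the lock ...
          *		rw_exit(lock);
          *	}
          *	uvmpdpol_scanfini();
          */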
    354 
    355 void
    356 uvmpdpol_balancequeue(int swap_shortage)
    357 {
    358 	struct uvmpdpol_globalstate *s = &pdpol_state;
    359 	int inactive_shortage;
    360 	struct vm_page *p, marker;
    361 	krwlock_t *lock;
    362 
    363 	/*
    364 	 * we have done the scan to get free pages.   now we work on meeting
    365 	 * our inactive target.
    366 	 */
    367 
    368 	memset(&marker, 0, sizeof(marker));
    369 	marker.flags = PG_MARKER;
    370 
    371 	mutex_enter(&s->lock);
    372 	TAILQ_INSERT_HEAD(&pdpol_state.s_activeq, &marker, pdqueue);
    373 	for (;;) {
    374 		inactive_shortage =
    375 		    pdpol_state.s_inactarg - pdpol_state.s_inactive;
    376 		if (inactive_shortage <= 0 && swap_shortage <= 0) {
    377 			break;
    378 		}
    379 		p = TAILQ_NEXT(&marker, pdqueue);
    380 		if (p == NULL) {
    381 			break;
    382 		}
    383 		KASSERT((p->flags & PG_MARKER) == 0);
    384 
    385 		/*
    386 		 * acquire interlock to stabilize page identity.
    387 		 * if we have caught the page in a state of flux
    388 		 * deal with it and retry.
    389 		 */
    390 		mutex_enter(&p->interlock);
    391 		if (uvmpdpol_pagerealize_locked(p)) {
    392 			mutex_exit(&p->interlock);
    393 			continue;
    394 		}
    395 
    396 		/*
    397 		 * now prepare to move on to the next page.
    398 		 */
    399 		TAILQ_REMOVE(&pdpol_state.s_activeq, &marker, pdqueue);
    400 		TAILQ_INSERT_AFTER(&pdpol_state.s_activeq, p, &marker,
    401 		    pdqueue);
    402 
    403 		/*
    404 		 * try to lock the object that owns the page.  see comments
     405 		 * in uvmpdpol_selectvictim().
     406 		 */
     407 		mutex_exit(&s->lock);
     408 		lock = uvmpd_trylockowner(p);
     409 		/* p->interlock now released */
     410 		mutex_enter(&s->lock);
    411 		if (lock == NULL) {
    412 			/* didn't get it - try the next page. */
    413 			continue;
    414 		}
    415 
    416 		/*
    417 		 * if there's a shortage of swap slots, try to free it.
    418 		 */
    419 		if (swap_shortage > 0 && (p->flags & PG_SWAPBACKED) != 0 &&
    420 		    (p->flags & PG_BUSY) == 0) {
    421 			if (uvmpd_dropswap(p)) {
    422 				swap_shortage--;
    423 			}
    424 		}
    425 
    426 		/*
    427 		 * if there's a shortage of inactive pages, deactivate.
    428 		 */
    429 		if (inactive_shortage > 0) {
    430 			pmap_clear_reference(p);
    431 			mutex_enter(&p->interlock);
    432 			uvmpdpol_pagedeactivate_locked(p);
    433 			mutex_exit(&p->interlock);
    434 			uvmexp.pddeact++;
    435 			inactive_shortage--;
    436 		}
    437 		rw_exit(lock);
    438 	}
    439 	TAILQ_REMOVE(&pdpol_state.s_activeq, &marker, pdqueue);
    440 	mutex_exit(&s->lock);
    441 }
    442 
    443 static void
    444 uvmpdpol_pagedeactivate_locked(struct vm_page *pg)
    445 {
    446 	struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
    447 
    448 	KASSERT(mutex_owned(&s->lock));
    449 	KASSERT(mutex_owned(&pg->interlock));
    450 	KASSERT((pg->pqflags & (PQ_INTENT_MASK | PQ_INTENT_SET)) !=
    451 	    (PQ_INTENT_D | PQ_INTENT_SET));
    452 
    453 	if (pg->pqflags & PQ_ACTIVE) {
    454 		TAILQ_REMOVE(&pdpol_state.s_activeq, pg, pdqueue);
    455 		KASSERT(pdpol_state.s_active > 0);
    456 		pdpol_state.s_active--;
    457 	}
    458 	if ((pg->pqflags & PQ_INACTIVE) == 0) {
    459 		KASSERT(pg->wire_count == 0);
    460 		TAILQ_INSERT_TAIL(&pdpol_state.s_inactiveq, pg, pdqueue);
    461 		pdpol_state.s_inactive++;
    462 	}
    463 	pg->pqflags &= ~(PQ_ACTIVE | PQ_INTENT_SET);
    464 	pg->pqflags |= PQ_INACTIVE;
    465 }
    466 
    467 void
    468 uvmpdpol_pagedeactivate(struct vm_page *pg)
    469 {
    470 
    471 	KASSERT(uvm_page_owner_locked_p(pg, false));
    472 	KASSERT(mutex_owned(&pg->interlock));
    473 
    474 	/*
    475 	 * we have to clear the reference bit now, as when it comes time to
    476 	 * realize the intent we won't have the object locked any more.
    477 	 */
    478 	pmap_clear_reference(pg);
    479 	uvmpdpol_set_intent(pg, PQ_INTENT_I);
    480 }
    481 
    482 static void
    483 uvmpdpol_pageactivate_locked(struct vm_page *pg)
    484 {
    485 	struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
    486 
    487 	KASSERT(mutex_owned(&s->lock));
    488 	KASSERT(mutex_owned(&pg->interlock));
    489 	KASSERT((pg->pqflags & (PQ_INTENT_MASK | PQ_INTENT_SET)) !=
    490 	    (PQ_INTENT_D | PQ_INTENT_SET));
    491 
    492 	uvmpdpol_pagedequeue_locked(pg);
    493 	TAILQ_INSERT_TAIL(&pdpol_state.s_activeq, pg, pdqueue);
    494 	pdpol_state.s_active++;
    495 	pg->pqflags &= ~(PQ_INACTIVE | PQ_INTENT_SET);
    496 	pg->pqflags |= PQ_ACTIVE;
    497 }
    498 
    499 void
    500 uvmpdpol_pageactivate(struct vm_page *pg)
    501 {
    502 
    503 	KASSERT(uvm_page_owner_locked_p(pg, false));
    504 	KASSERT(mutex_owned(&pg->interlock));
    505 
    506 	uvmpdpol_set_intent(pg, PQ_INTENT_A);
    507 }
    508 
    509 static void
    510 uvmpdpol_pagedequeue_locked(struct vm_page *pg)
    511 {
    512 	struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
    513 
    514 	KASSERT(mutex_owned(&s->lock));
    515 	KASSERT(mutex_owned(&pg->interlock));
    516 
    517 	if (pg->pqflags & PQ_ACTIVE) {
    518 		TAILQ_REMOVE(&pdpol_state.s_activeq, pg, pdqueue);
    519 		KASSERT((pg->pqflags & PQ_INACTIVE) == 0);
    520 		KASSERT(pdpol_state.s_active > 0);
    521 		pdpol_state.s_active--;
    522 	} else if (pg->pqflags & PQ_INACTIVE) {
    523 		TAILQ_REMOVE(&pdpol_state.s_inactiveq, pg, pdqueue);
    524 		KASSERT(pdpol_state.s_inactive > 0);
    525 		pdpol_state.s_inactive--;
    526 	}
    527 	pg->pqflags &= ~(PQ_ACTIVE | PQ_INACTIVE | PQ_INTENT_SET);
    528 }
    529 
    530 void
    531 uvmpdpol_pagedequeue(struct vm_page *pg)
    532 {
    533 
    534 	KASSERT(uvm_page_owner_locked_p(pg, true));
    535 	KASSERT(mutex_owned(&pg->interlock));
    536 
    537 	uvmpdpol_set_intent(pg, PQ_INTENT_D);
    538 }
    539 
    540 void
    541 uvmpdpol_pageenqueue(struct vm_page *pg)
    542 {
    543 
    544 	KASSERT(uvm_page_owner_locked_p(pg, false));
    545 	KASSERT(mutex_owned(&pg->interlock));
    546 
    547 	uvmpdpol_set_intent(pg, PQ_INTENT_E);
    548 }
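
         /*
          * Summary of the deferred "intent" scheme used by the wrappers
          * above: callers holding only the page's owner lock and interlock
          * record an intent in pg->pqflags instead of touching the global
          * queues directly.  The intent is realized later, under
          * pdpol_state.lock, by uvmpdpol_pagerealize_locked():
          *
          *	PQ_INTENT_A (activate)   -> uvmpdpol_pageactivate_locked()
          *	PQ_INTENT_E (enqueue)    -> uvmpdpol_pageactivate_locked()
          *	PQ_INTENT_I (inactivate) -> uvmpdpol_pagedeactivate_locked()
          *	PQ_INTENT_D (dequeue)    -> uvmpdpol_pagedequeue_locked()
          */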
    549 
    550 void
    551 uvmpdpol_anfree(struct vm_anon *an)
    552 {
    553 }
    554 
    555 bool
    556 uvmpdpol_pageisqueued_p(struct vm_page *pg)
    557 {
    558 	uint32_t pqflags;
    559 
    560 	/*
    561 	 * if there's an intent set, we have to consider it.  otherwise,
    562 	 * return the actual state.  we may be called unlocked for the
    563 	 * purpose of assertions, which is safe due to the page lifecycle.
    564 	 */
    565 	pqflags = atomic_load_relaxed(&pg->pqflags);
    566 	if ((pqflags & PQ_INTENT_SET) != 0) {
    567 		return (pqflags & PQ_INTENT_MASK) != PQ_INTENT_D;
    568 	} else {
    569 		return (pqflags & (PQ_ACTIVE | PQ_INACTIVE)) != 0;
    570 	}
    571 }
    572 
    573 bool
    574 uvmpdpol_pageactivate_p(struct vm_page *pg)
    575 {
    576 	uint32_t pqflags;
    577 
    578 	/* consider intent in preference to actual state. */
    579 	pqflags = atomic_load_relaxed(&pg->pqflags);
    580 	if ((pqflags & PQ_INTENT_SET) != 0) {
    581 		pqflags &= PQ_INTENT_MASK;
    582 		return pqflags != PQ_INTENT_A && pqflags != PQ_INTENT_E;
    583 	} else {
    584 		/*
    585 		 * TODO: Enabling this may be too much of a big hammer,
    586 		 * since we do get useful information from activations.
    587 		 * Think about it more and maybe come up with a heuristic
    588 		 * or something.
    589 		 *
    590 		 * return (pqflags & PQ_ACTIVE) == 0;
    591 		 */
    592 		return true;
    593 	}
    594 }
    595 
    596 void
    597 uvmpdpol_estimatepageable(int *active, int *inactive)
    598 {
    599 	struct uvmpdpol_globalstate *s = &pdpol_state;
    600 
    601 	/*
    602 	 * Don't take any locks here.  This can be called from DDB, and in
    603 	 * any case the numbers are stale the instant the lock is dropped,
    604 	 * so it just doesn't matter.
    605 	 */
    606 	if (active) {
    607 		*active = s->s_active;
    608 	}
    609 	if (inactive) {
    610 		*inactive = s->s_inactive;
    611 	}
    612 }
    613 
    614 #if !defined(PDSIM)
    615 static int
    616 min_check(struct uvm_pctparam *pct, int t)
    617 {
    618 	struct uvmpdpol_globalstate *s = &pdpol_state;
    619 	int total = t;
    620 
    621 	if (pct != &s->s_anonmin) {
    622 		total += uvm_pctparam_get(&s->s_anonmin);
    623 	}
    624 	if (pct != &s->s_filemin) {
    625 		total += uvm_pctparam_get(&s->s_filemin);
    626 	}
    627 	if (pct != &s->s_execmin) {
    628 		total += uvm_pctparam_get(&s->s_execmin);
    629 	}
    630 	if (total > 95) {
    631 		return EINVAL;
    632 	}
    633 	return 0;
    634 }
    635 #endif /* !defined(PDSIM) */
    636 
    637 void
    638 uvmpdpol_init(void)
    639 {
    640 	struct uvmpdpol_globalstate *s = &pdpol_state;
    641 
    642 	mutex_init(&s->lock, MUTEX_DEFAULT, IPL_NONE);
    643 	TAILQ_INIT(&s->s_activeq);
    644 	TAILQ_INIT(&s->s_inactiveq);
    645 	uvm_pctparam_init(&s->s_inactivepct, CLOCK_INACTIVEPCT, NULL);
    646 	uvm_pctparam_init(&s->s_anonmin, 10, min_check);
    647 	uvm_pctparam_init(&s->s_filemin, 10, min_check);
    648 	uvm_pctparam_init(&s->s_execmin,  5, min_check);
    649 	uvm_pctparam_init(&s->s_anonmax, 80, NULL);
    650 	uvm_pctparam_init(&s->s_filemax, 50, NULL);
    651 	uvm_pctparam_init(&s->s_execmax, 30, NULL);
    652 }
    653 
    654 void
    655 uvmpdpol_init_cpu(struct uvm_cpu *ucpu)
    656 {
    657 
    658 	ucpu->pdq =
    659 	    kmem_alloc(CLOCK_PDQ_SIZE * sizeof(struct vm_page *), KM_SLEEP);
    660 	ucpu->pdqhead = CLOCK_PDQ_SIZE;
    661 	ucpu->pdqtail = CLOCK_PDQ_SIZE;
    662 }
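
         /*
          * Note on the queue layout: the per-CPU queue is filled like a
          * stack growing down from the end of the array and drained
          * forwards.  pdqhead == pdqtail == CLOCK_PDQ_SIZE means "empty";
          * uvmpdpol_pagerealize() stores pages at pdq[--pdqhead], and
          * uvmpdpol_flush() consumes entries from pdqhead up to pdqtail.
          */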
    663 
    664 void
    665 uvmpdpol_reinit(void)
    666 {
    667 }
    668 
    669 bool
    670 uvmpdpol_needsscan_p(void)
    671 {
    672 
    673 	/*
    674 	 * this must be an unlocked check: can be called from interrupt.
    675 	 */
    676 	return pdpol_state.s_inactive < pdpol_state.s_inactarg;
    677 }
    678 
    679 void
    680 uvmpdpol_tune(void)
    681 {
    682 	struct uvmpdpol_globalstate *s = &pdpol_state;
    683 
    684 	mutex_enter(&s->lock);
    685 	clock_tune();
    686 	mutex_exit(&s->lock);
    687 }
    688 
    689 /*
    690  * uvmpdpol_pagerealize_locked: take the intended state set on a page and
    691  * make it real.  return true if any work was done.
    692  */
    693 static bool
    694 uvmpdpol_pagerealize_locked(struct vm_page *pg)
    695 {
    696 	struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
    697 
    698 	KASSERT(mutex_owned(&s->lock));
    699 	KASSERT(mutex_owned(&pg->interlock));
    700 
    701 	switch (pg->pqflags & (PQ_INTENT_MASK | PQ_INTENT_SET)) {
    702 	case PQ_INTENT_A | PQ_INTENT_SET:
    703 	case PQ_INTENT_E | PQ_INTENT_SET:
    704 		uvmpdpol_pageactivate_locked(pg);
    705 		return true;
    706 	case PQ_INTENT_I | PQ_INTENT_SET:
    707 		uvmpdpol_pagedeactivate_locked(pg);
    708 		return true;
    709 	case PQ_INTENT_D | PQ_INTENT_SET:
    710 		uvmpdpol_pagedequeue_locked(pg);
    711 		return true;
    712 	default:
    713 		return false;
    714 	}
    715 }
    716 
    717 /*
    718  * uvmpdpol_flush: return the current uvm_cpu with all of its pending
    719  * updates flushed to the global queues.  this routine may block, and
     720  * so can switch cpu.  the idea is to empty the queue on whatever cpu
    721  * we finally end up on.
    722  * Must be called at splsoftbio()
    723  */
    724 static struct uvm_cpu *
    725 uvmpdpol_flush(void)
    726 {
    727 	struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
    728 	struct uvm_cpu *ucpu;
    729 	struct vm_page *pg;
    730 
    731 	KASSERT(kpreempt_disabled());
    732 
    733 	mutex_enter(&s->lock);
    734 	for (;;) {
    735 		/*
    736 		 * prefer scanning forwards (even though mutex_enter() is
    737 		 * serializing) so as to not defeat any prefetch logic in
     738 		 * the CPU.  that means enqueuing backwards elsewhere, like
     739 		 * a stack, but that matters less there because pages are
     740 		 * added one at a time.
    741 		 *
    742 		 * prefetch the next "struct vm_page" while working on the
    743 		 * current one.  this has a measurable and very positive
    744 		 * effect in reducing the amount of time spent here under
    745 		 * the global lock.
    746 		 */
    747 		ucpu = curcpu()->ci_data.cpu_uvm;
    748 		KASSERT(ucpu->pdqhead <= ucpu->pdqtail);
    749 		if (__predict_false(ucpu->pdqhead == ucpu->pdqtail)) {
    750 			break;
    751 		}
    752 		pg = ucpu->pdq[ucpu->pdqhead++];
    753 		if (__predict_true(ucpu->pdqhead != ucpu->pdqtail)) {
    754 			__builtin_prefetch(ucpu->pdq[ucpu->pdqhead]);
    755 		}
    756 		mutex_enter(&pg->interlock);
    757 		pg->pqflags &= ~PQ_INTENT_QUEUED;
    758 		(void)uvmpdpol_pagerealize_locked(pg);
    759 		mutex_exit(&pg->interlock);
    760 	}
    761 	mutex_exit(&s->lock);
    762 	return ucpu;
    763 }
    764 
    765 /*
    766  * uvmpdpol_pagerealize: realize any intent set on the page.  in this
    767  * implementation, that means putting the page on a per-CPU queue to be
    768  * dealt with later.
    769  */
    770 void
    771 uvmpdpol_pagerealize(struct vm_page *pg)
    772 {
    773 	struct uvm_cpu *ucpu;
    774 	int s;
    775 
    776 	/*
     777 	 * drain the per-CPU queue if it is full, then enqueue the page.
    778 	 */
    779 	s = splsoftbio();
    780 	ucpu = curcpu()->ci_data.cpu_uvm;
    781 	while (__predict_false(ucpu->pdqhead == 0)) {
    782 		ucpu = uvmpdpol_flush();
    783 	}
    784 	ucpu->pdq[--(ucpu->pdqhead)] = pg;
    785 	splx(s);
    786 }
    787 
    788 /*
    789  * uvmpdpol_idle: called from the system idle loop.  periodically purge any
    790  * pending updates back to the global queues.
    791  */
    792 void
    793 uvmpdpol_idle(struct uvm_cpu *ucpu)
    794 {
    795 	struct uvmpdpol_globalstate *s = &pdpol_state;
    796 	struct vm_page *pg;
    797 	int s_spl;
    798 
    799 	KASSERT(kpreempt_disabled());
    800 
    801 	/*
    802 	 * if no pages in the queue, we have nothing to do.
    803 	 */
    804 	if (ucpu->pdqhead == ucpu->pdqtail) {
    805 		ucpu->pdqtime = getticks();
    806 		return;
    807 	}
    808 
    809 	/*
    810 	 * don't do this more than ~8 times a second as it would needlessly
    811 	 * exert pressure.
    812 	 */
    813 	if (getticks() - ucpu->pdqtime < (hz >> 3)) {
    814 		return;
    815 	}
    816 
    817 	/*
    818 	 * the idle LWP can't block, so we have to try for the lock.  if we
    819 	 * get it, purge the per-CPU pending update queue.  continually
    820 	 * check for a pending resched: in that case exit immediately.
    821 	 */
    822 	if (mutex_tryenter(&s->lock)) {
    823 		s_spl = splsoftbio();
    824 		while (ucpu->pdqhead != ucpu->pdqtail) {
    825 			pg = ucpu->pdq[ucpu->pdqhead];
    826 			if (!mutex_tryenter(&pg->interlock)) {
    827 				break;
    828 			}
    829 			ucpu->pdqhead++;
    830 			pg->pqflags &= ~PQ_INTENT_QUEUED;
    831 			(void)uvmpdpol_pagerealize_locked(pg);
    832 			mutex_exit(&pg->interlock);
    833 			if (curcpu()->ci_want_resched) {
    834 				break;
    835 			}
    836 		}
    837 		if (ucpu->pdqhead == ucpu->pdqtail) {
    838 			ucpu->pdqtime = getticks();
    839 		}
    840 		splx(s_spl);
    841 		mutex_exit(&s->lock);
    842 	}
    843 }
    844 
    845 #if !defined(PDSIM)
    846 
    847 #include <sys/sysctl.h>	/* XXX SYSCTL_DESCR */
    848 
    849 void
    850 uvmpdpol_sysctlsetup(void)
    851 {
    852 	struct uvmpdpol_globalstate *s = &pdpol_state;
    853 
    854 	uvm_pctparam_createsysctlnode(&s->s_anonmin, "anonmin",
    855 	    SYSCTL_DESCR("Percentage of physical memory reserved "
    856 	    "for anonymous application data"));
    857 	uvm_pctparam_createsysctlnode(&s->s_filemin, "filemin",
    858 	    SYSCTL_DESCR("Percentage of physical memory reserved "
    859 	    "for cached file data"));
    860 	uvm_pctparam_createsysctlnode(&s->s_execmin, "execmin",
    861 	    SYSCTL_DESCR("Percentage of physical memory reserved "
    862 	    "for cached executable data"));
    863 
    864 	uvm_pctparam_createsysctlnode(&s->s_anonmax, "anonmax",
    865 	    SYSCTL_DESCR("Percentage of physical memory which will "
    866 	    "be reclaimed from other usage for "
    867 	    "anonymous application data"));
    868 	uvm_pctparam_createsysctlnode(&s->s_filemax, "filemax",
    869 	    SYSCTL_DESCR("Percentage of physical memory which will "
    870 	    "be reclaimed from other usage for cached "
    871 	    "file data"));
    872 	uvm_pctparam_createsysctlnode(&s->s_execmax, "execmax",
    873 	    SYSCTL_DESCR("Percentage of physical memory which will "
    874 	    "be reclaimed from other usage for cached "
    875 	    "executable data"));
    876 
    877 	uvm_pctparam_createsysctlnode(&s->s_inactivepct, "inactivepct",
    878 	    SYSCTL_DESCR("Percentage of inactive queue of "
    879 	    "the entire (active + inactive) queue"));
    880 }
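
         /*
          * Example: the parameters registered above should appear under the
          * "vm" sysctl tree and can be inspected or tuned at run time,
          * e.g.:
          *
          *	sysctl vm.anonmin vm.filemax vm.inactivepct
          *	sysctl -w vm.filemax=30
          */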
    881 
    882 #endif /* !defined(PDSIM) */
    883 
    884 #if defined(PDSIM)
    885 void
    886 pdsim_dump(const char *id)
    887 {
    888 #if defined(DEBUG)
    889 	/* XXX */
    890 #endif /* defined(DEBUG) */
    891 }
    892 #endif /* defined(PDSIM) */
    893