      1 /*	$NetBSD: uvm_pdpolicy_clock.c,v 1.28 2019/12/31 22:42:51 ad Exp $	*/
      2 /*	NetBSD: uvm_pdaemon.c,v 1.72 2006/01/05 10:47:33 yamt Exp $	*/
      3 
      4 /*-
      5  * Copyright (c) 2019 The NetBSD Foundation, Inc.
      6  * All rights reserved.
      7  *
      8  * This code is derived from software contributed to The NetBSD Foundation
      9  * by Andrew Doran.
     10  *
     11  * Redistribution and use in source and binary forms, with or without
     12  * modification, are permitted provided that the following conditions
     13  * are met:
     14  * 1. Redistributions of source code must retain the above copyright
     15  *    notice, this list of conditions and the following disclaimer.
     16  * 2. Redistributions in binary form must reproduce the above copyright
     17  *    notice, this list of conditions and the following disclaimer in the
     18  *    documentation and/or other materials provided with the distribution.
     19  *
     20  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     22  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     24  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     30  * POSSIBILITY OF SUCH DAMAGE.
     31  */
     32 
     33 /*
     34  * Copyright (c) 1997 Charles D. Cranor and Washington University.
     35  * Copyright (c) 1991, 1993, The Regents of the University of California.
     36  *
     37  * All rights reserved.
     38  *
     39  * This code is derived from software contributed to Berkeley by
     40  * The Mach Operating System project at Carnegie-Mellon University.
     41  *
     42  * Redistribution and use in source and binary forms, with or without
     43  * modification, are permitted provided that the following conditions
     44  * are met:
     45  * 1. Redistributions of source code must retain the above copyright
     46  *    notice, this list of conditions and the following disclaimer.
     47  * 2. Redistributions in binary form must reproduce the above copyright
     48  *    notice, this list of conditions and the following disclaimer in the
     49  *    documentation and/or other materials provided with the distribution.
     50  * 3. Neither the name of the University nor the names of its contributors
     51  *    may be used to endorse or promote products derived from this software
     52  *    without specific prior written permission.
     53  *
     54  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     55  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     56  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     57  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     58  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     59  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     60  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     61  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     62  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     63  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     64  * SUCH DAMAGE.
     65  *
     66  *	@(#)vm_pageout.c        8.5 (Berkeley) 2/14/94
     67  * from: Id: uvm_pdaemon.c,v 1.1.2.32 1998/02/06 05:26:30 chs Exp
     68  *
     69  *
     70  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
     71  * All rights reserved.
     72  *
     73  * Permission to use, copy, modify and distribute this software and
     74  * its documentation is hereby granted, provided that both the copyright
     75  * notice and this permission notice appear in all copies of the
     76  * software, derivative works or modified versions, and any portions
     77  * thereof, and that both notices appear in supporting documentation.
     78  *
     79  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     80  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     81  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     82  *
     83  * Carnegie Mellon requests users of this software to return to
     84  *
     85  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
     86  *  School of Computer Science
     87  *  Carnegie Mellon University
     88  *  Pittsburgh PA 15213-3890
     89  *
     90  * any improvements or extensions that they make and grant Carnegie the
     91  * rights to redistribute these changes.
     92  */
     93 
     94 #if defined(PDSIM)
     95 
     96 #include "pdsim.h"
     97 
     98 #else /* defined(PDSIM) */
     99 
    100 #include <sys/cdefs.h>
    101 __KERNEL_RCSID(0, "$NetBSD: uvm_pdpolicy_clock.c,v 1.28 2019/12/31 22:42:51 ad Exp $");
    102 
    103 #include <sys/param.h>
    104 #include <sys/proc.h>
    105 #include <sys/systm.h>
    106 #include <sys/kernel.h>
    107 #include <sys/kmem.h>
    108 
    109 #include <uvm/uvm.h>
    110 #include <uvm/uvm_pdpolicy.h>
    111 #include <uvm/uvm_pdpolicy_impl.h>
    112 #include <uvm/uvm_stat.h>
    113 
    114 #endif /* defined(PDSIM) */
    115 
    116 /*
    117  * per-CPU queue of pending page status changes.  128 entries makes for a
    118  * 1kB queue on _LP64 and has been found to be a reasonable compromise that
     119  * keeps lock contention events and wait times low, while neither using
     120  * too much memory nor allowing global state to fall too far behind.
    121  */
    122 #if !defined(CLOCK_PDQ_SIZE)
    123 #define	CLOCK_PDQ_SIZE	128
    124 #endif /* !defined(CLOCK_PDQ_SIZE) */
    125 
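         /*
          * page queue flags kept in pg->pqflags.  PQ_TIME stores a truncated
          * copy of hardclock_ticks taken when the page was last activated;
          * uvmpdpol_pageactivate() compares it against the current time to
          * avoid re-queueing a page activated within roughly the last second.
          */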
    126 #define	PQ_TIME		0xffffffc0	/* time of last activation */
    127 #define PQ_INACTIVE	0x00000010	/* page is in inactive list */
    128 #define PQ_ACTIVE	0x00000020	/* page is in active list */
    129 
    130 #if !defined(CLOCK_INACTIVEPCT)
    131 #define	CLOCK_INACTIVEPCT	33
    132 #endif /* !defined(CLOCK_INACTIVEPCT) */
    133 
    134 struct uvmpdpol_globalstate {
    135 	kmutex_t lock;			/* lock on state */
    136 					/* <= compiler pads here */
    137 	struct pglist s_activeq		/* allocated pages, in use */
    138 	    __aligned(COHERENCY_UNIT);
    139 	struct pglist s_inactiveq;	/* pages between the clock hands */
     140 	int s_active;			/* number of pages on s_activeq */
     141 	int s_inactive;			/* number of pages on s_inactiveq */
     142 	int s_inactarg;			/* target length of s_inactiveq */
    143 	struct uvm_pctparam s_anonmin;
    144 	struct uvm_pctparam s_filemin;
    145 	struct uvm_pctparam s_execmin;
    146 	struct uvm_pctparam s_anonmax;
    147 	struct uvm_pctparam s_filemax;
    148 	struct uvm_pctparam s_execmax;
    149 	struct uvm_pctparam s_inactivepct;
    150 };
    151 
    152 struct uvmpdpol_scanstate {
    153 	bool ss_anonreact, ss_filereact, ss_execreact;
    154 	struct vm_page ss_marker;
    155 };
    156 
    157 static void	uvmpdpol_pageactivate_locked(struct vm_page *);
    158 static void	uvmpdpol_pagedeactivate_locked(struct vm_page *);
    159 static void	uvmpdpol_pagedequeue_locked(struct vm_page *);
    160 static bool	uvmpdpol_pagerealize_locked(struct vm_page *);
    161 static struct uvm_cpu *uvmpdpol_flush(void);
    162 
    163 static struct uvmpdpol_globalstate pdpol_state __cacheline_aligned;
    164 static struct uvmpdpol_scanstate pdpol_scanstate;
    165 
    166 PDPOL_EVCNT_DEFINE(reactexec)
    167 PDPOL_EVCNT_DEFINE(reactfile)
    168 PDPOL_EVCNT_DEFINE(reactanon)
    169 
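         /*
          * clock_tune: recompute the inactive queue target (s_inactarg) as a
          * percentage of the total number of active + inactive pages, and
          * keep it above the free page target.
          */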
    170 static void
    171 clock_tune(void)
    172 {
    173 	struct uvmpdpol_globalstate *s = &pdpol_state;
    174 
    175 	s->s_inactarg = UVM_PCTPARAM_APPLY(&s->s_inactivepct,
    176 	    s->s_active + s->s_inactive);
    177 	if (s->s_inactarg <= uvmexp.freetarg) {
    178 		s->s_inactarg = uvmexp.freetarg + 1;
    179 	}
    180 }
    181 
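         /*
          * uvmpdpol_scaninit: prepare for a scan of the inactive queue:
          * decide which page types should be reactivated rather than freed,
          * and insert a marker page at the head of the queue for
          * uvmpdpol_selectvictim() to start from.
          */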
    182 void
    183 uvmpdpol_scaninit(void)
    184 {
    185 	struct uvmpdpol_globalstate *s = &pdpol_state;
    186 	struct uvmpdpol_scanstate *ss = &pdpol_scanstate;
    187 	int t;
    188 	bool anonunder, fileunder, execunder;
    189 	bool anonover, fileover, execover;
    190 	bool anonreact, filereact, execreact;
    191 	int64_t freepg, anonpg, filepg, execpg;
    192 
    193 	/*
     194 	 * decide which types of pages we want to reactivate, instead of freeing,
     195 	 * in order to keep usage within the minimum and maximum limits.
    196 	 */
    197 
    198 	cpu_count_sync_all();
    199 	freepg = uvm_availmem();
    200 	anonpg = cpu_count_get(CPU_COUNT_ANONPAGES);
    201 	filepg = cpu_count_get(CPU_COUNT_FILEPAGES);
    202 	execpg = cpu_count_get(CPU_COUNT_EXECPAGES);
    203 
    204 	mutex_enter(&s->lock);
    205 	t = s->s_active + s->s_inactive + freepg;
    206 	anonunder = anonpg <= UVM_PCTPARAM_APPLY(&s->s_anonmin, t);
    207 	fileunder = filepg <= UVM_PCTPARAM_APPLY(&s->s_filemin, t);
    208 	execunder = execpg <= UVM_PCTPARAM_APPLY(&s->s_execmin, t);
    209 	anonover = anonpg > UVM_PCTPARAM_APPLY(&s->s_anonmax, t);
    210 	fileover = filepg > UVM_PCTPARAM_APPLY(&s->s_filemax, t);
    211 	execover = execpg > UVM_PCTPARAM_APPLY(&s->s_execmax, t);
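         	/*
         	 * reactivate a page type if it is under its minimum, or if it
         	 * is within bounds while another type is over its maximum.  if
         	 * file and exec pages would both be reactivated, and anon pages
         	 * would be too (or cannot be paged out because swap is full),
         	 * reactivate nothing so the scan can still free pages.
         	 */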
    212 	anonreact = anonunder || (!anonover && (fileover || execover));
    213 	filereact = fileunder || (!fileover && (anonover || execover));
    214 	execreact = execunder || (!execover && (anonover || fileover));
    215 	if (filereact && execreact && (anonreact || uvm_swapisfull())) {
    216 		anonreact = filereact = execreact = false;
    217 	}
    218 	ss->ss_anonreact = anonreact;
    219 	ss->ss_filereact = filereact;
    220 	ss->ss_execreact = execreact;
    221 	memset(&ss->ss_marker, 0, sizeof(ss->ss_marker));
    222 	ss->ss_marker.flags = PG_MARKER;
    223 	TAILQ_INSERT_HEAD(&pdpol_state.s_inactiveq, &ss->ss_marker, pdqueue);
    224 	mutex_exit(&s->lock);
    225 }
    226 
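         /*
          * uvmpdpol_scanfini: a scan has completed; remove the marker page
          * inserted by uvmpdpol_scaninit() from the inactive queue.
          */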
    227 void
    228 uvmpdpol_scanfini(void)
    229 {
    230 	struct uvmpdpol_globalstate *s = &pdpol_state;
    231 	struct uvmpdpol_scanstate *ss = &pdpol_scanstate;
    232 
    233 	mutex_enter(&s->lock);
    234 	TAILQ_REMOVE(&pdpol_state.s_inactiveq, &ss->ss_marker, pdqueue);
    235 	mutex_exit(&s->lock);
    236 }
    237 
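         /*
          * uvmpdpol_selectvictim: advance the scan marker along the inactive
          * queue and return the next candidate page for reclaim, with the
          * owning object's lock held and passed back via plock.  returns NULL
          * when the end of the queue is reached.
          */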
    238 struct vm_page *
    239 uvmpdpol_selectvictim(kmutex_t **plock)
    240 {
    241 	struct uvmpdpol_globalstate *s = &pdpol_state;
    242 	struct uvmpdpol_scanstate *ss = &pdpol_scanstate;
    243 	struct vm_page *pg;
    244 	kmutex_t *lock;
    245 
    246 	mutex_enter(&s->lock);
    247 	while (/* CONSTCOND */ 1) {
    248 		struct vm_anon *anon;
    249 		struct uvm_object *uobj;
    250 
    251 		pg = TAILQ_NEXT(&ss->ss_marker, pdqueue);
    252 		if (pg == NULL) {
    253 			break;
    254 		}
    255 		KASSERT((pg->flags & PG_MARKER) == 0);
    256 		uvmexp.pdscans++;
    257 
    258 		/*
     259 		 * acquire the interlock to stabilize page identity.
     260 		 * if we have caught the page in a state of flux,
     261 		 * deal with it and retry.
    262 		 */
    263 		mutex_enter(&pg->interlock);
    264 		if (uvmpdpol_pagerealize_locked(pg)) {
    265 			mutex_exit(&pg->interlock);
    266 			continue;
    267 		}
    268 
    269 		/*
    270 		 * now prepare to move on to the next page.
    271 		 */
    272 		TAILQ_REMOVE(&pdpol_state.s_inactiveq, &ss->ss_marker,
    273 		    pdqueue);
    274 		TAILQ_INSERT_AFTER(&pdpol_state.s_inactiveq, pg,
    275 		    &ss->ss_marker, pdqueue);
    276 
    277 		/*
    278 		 * enforce the minimum thresholds on different
    279 		 * types of memory usage.  if reusing the current
    280 		 * page would reduce that type of usage below its
    281 		 * minimum, reactivate the page instead and move
    282 		 * on to the next page.
    283 		 */
    284 		anon = pg->uanon;
    285 		uobj = pg->uobject;
    286 		if (uobj && UVM_OBJ_IS_VTEXT(uobj) && ss->ss_execreact) {
    287 			uvmpdpol_pageactivate_locked(pg);
    288 			mutex_exit(&pg->interlock);
    289 			PDPOL_EVCNT_INCR(reactexec);
    290 			continue;
    291 		}
    292 		if (uobj && UVM_OBJ_IS_VNODE(uobj) &&
    293 		    !UVM_OBJ_IS_VTEXT(uobj) && ss->ss_filereact) {
    294 			uvmpdpol_pageactivate_locked(pg);
    295 			mutex_exit(&pg->interlock);
    296 			PDPOL_EVCNT_INCR(reactfile);
    297 			continue;
    298 		}
    299 		if ((anon || UVM_OBJ_IS_AOBJ(uobj)) && ss->ss_anonreact) {
    300 			uvmpdpol_pageactivate_locked(pg);
    301 			mutex_exit(&pg->interlock);
    302 			PDPOL_EVCNT_INCR(reactanon);
    303 			continue;
    304 		}
    305 
    306 		/*
    307 		 * try to lock the object that owns the page.
    308 		 *
    309 		 * with the page interlock held, we can drop s->lock, which
    310 		 * could otherwise serve as a barrier to us getting the
    311 		 * object locked, because the owner of the object's lock may
    312 		 * be blocked on s->lock (i.e. a deadlock).
    313 		 *
    314 		 * whatever happens, uvmpd_trylockowner() will release the
    315 		 * interlock.  with the interlock dropped we can then
    316 		 * re-acquire our own lock.  the order is:
    317 		 *
    318 		 *	object -> pdpol -> interlock.
     319 		 */
     320 		mutex_exit(&s->lock);
     321 		lock = uvmpd_trylockowner(pg);
     322 		/* pg->interlock now released */
     323 		mutex_enter(&s->lock);
    324 		if (lock == NULL) {
    325 			/* didn't get it - try the next page. */
    326 			continue;
    327 		}
    328 
    329 		/*
    330 		 * move referenced pages back to active queue and skip to
    331 		 * next page.
    332 		 */
    333 		if (pmap_is_referenced(pg)) {
    334 			mutex_enter(&pg->interlock);
    335 			uvmpdpol_pageactivate_locked(pg);
    336 			mutex_exit(&pg->interlock);
    337 			uvmexp.pdreact++;
    338 			mutex_exit(lock);
    339 			continue;
    340 		}
    341 
    342 		/* we have a potential victim. */
    343 		*plock = lock;
    344 		break;
    345 	}
    346 	mutex_exit(&s->lock);
    347 	return pg;
    348 }
    349 
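         /*
          * uvmpdpol_balancequeue: scan the active queue, deactivating pages
          * until the inactive target is met, and freeing swap slots from
          * swap-backed pages if there is a swap shortage.
          */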
    350 void
    351 uvmpdpol_balancequeue(int swap_shortage)
    352 {
    353 	struct uvmpdpol_globalstate *s = &pdpol_state;
    354 	int inactive_shortage;
    355 	struct vm_page *p, marker;
    356 	kmutex_t *lock;
    357 
    358 	/*
    359 	 * we have done the scan to get free pages.   now we work on meeting
    360 	 * our inactive target.
    361 	 */
    362 
    363 	memset(&marker, 0, sizeof(marker));
    364 	marker.flags = PG_MARKER;
    365 
    366 	mutex_enter(&s->lock);
    367 	TAILQ_INSERT_HEAD(&pdpol_state.s_activeq, &marker, pdqueue);
    368 	for (;;) {
    369 		inactive_shortage =
    370 		    pdpol_state.s_inactarg - pdpol_state.s_inactive;
    371 		if (inactive_shortage <= 0 && swap_shortage <= 0) {
    372 			break;
    373 		}
    374 		p = TAILQ_NEXT(&marker, pdqueue);
    375 		if (p == NULL) {
    376 			break;
    377 		}
    378 		KASSERT((p->flags & PG_MARKER) == 0);
    379 
    380 		/*
     381 		 * acquire the interlock to stabilize page identity.
     382 		 * if we have caught the page in a state of flux,
     383 		 * deal with it and retry.
    384 		 */
    385 		mutex_enter(&p->interlock);
    386 		if (uvmpdpol_pagerealize_locked(p)) {
    387 			mutex_exit(&p->interlock);
    388 			continue;
    389 		}
    390 
    391 		/*
    392 		 * now prepare to move on to the next page.
    393 		 */
    394 		TAILQ_REMOVE(&pdpol_state.s_activeq, &marker, pdqueue);
    395 		TAILQ_INSERT_AFTER(&pdpol_state.s_activeq, p, &marker,
    396 		    pdqueue);
    397 
    398 		/*
    399 		 * try to lock the object that owns the page.  see comments
     400 		 * in uvmpdpol_selectvictim().
     401 		 */
     402 		mutex_exit(&s->lock);
     403 		lock = uvmpd_trylockowner(p);
     404 		/* p->interlock now released */
     405 		mutex_enter(&s->lock);
    406 		if (lock == NULL) {
    407 			/* didn't get it - try the next page. */
    408 			continue;
    409 		}
    410 
    411 		/*
    412 		 * if there's a shortage of swap slots, try to free it.
    413 		 */
    414 		if (swap_shortage > 0 && (p->flags & PG_SWAPBACKED) != 0 &&
    415 		    (p->flags & PG_BUSY) == 0) {
    416 			if (uvmpd_dropswap(p)) {
    417 				swap_shortage--;
    418 			}
    419 		}
    420 
    421 		/*
    422 		 * if there's a shortage of inactive pages, deactivate.
    423 		 */
    424 		if (inactive_shortage > 0) {
    425 			pmap_clear_reference(p);
    426 			mutex_enter(&p->interlock);
    427 			uvmpdpol_pagedeactivate_locked(p);
    428 			mutex_exit(&p->interlock);
    429 			uvmexp.pddeact++;
    430 			inactive_shortage--;
    431 		}
    432 		mutex_exit(lock);
    433 	}
    434 	TAILQ_REMOVE(&pdpol_state.s_activeq, &marker, pdqueue);
    435 	mutex_exit(&s->lock);
    436 }
    437 
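         /*
          * uvmpdpol_pagedeactivate_locked: move a page to the tail of the
          * inactive queue, removing it from the active queue if necessary.
          * called with both the global state lock and the page interlock held.
          */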
    438 static void
    439 uvmpdpol_pagedeactivate_locked(struct vm_page *pg)
    440 {
    441 	struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
    442 
    443 	KASSERT(mutex_owned(&s->lock));
    444 	KASSERT(mutex_owned(&pg->interlock));
    445 	KASSERT((pg->pqflags & (PQ_INTENT_MASK | PQ_INTENT_SET)) !=
    446 	    (PQ_INTENT_D | PQ_INTENT_SET));
    447 
    448 	if (pg->pqflags & PQ_ACTIVE) {
    449 		TAILQ_REMOVE(&pdpol_state.s_activeq, pg, pdqueue);
    450 		KASSERT(pdpol_state.s_active > 0);
    451 		pdpol_state.s_active--;
    452 	}
    453 	if ((pg->pqflags & PQ_INACTIVE) == 0) {
    454 		KASSERT(pg->wire_count == 0);
    455 		TAILQ_INSERT_TAIL(&pdpol_state.s_inactiveq, pg, pdqueue);
    456 		pdpol_state.s_inactive++;
    457 	}
    458 	pg->pqflags = (pg->pqflags & PQ_INTENT_QUEUED) | PQ_INACTIVE;
    459 }
    460 
    461 void
    462 uvmpdpol_pagedeactivate(struct vm_page *pg)
    463 {
    464 
    465 	KASSERT(uvm_page_owner_locked_p(pg));
    466 	KASSERT(mutex_owned(&pg->interlock));
    467 
    468 	/*
    469 	 * we have to clear the reference bit now, as when it comes time to
    470 	 * realize the intent we won't have the object locked any more.
    471 	 */
    472 	pmap_clear_reference(pg);
    473 	uvmpdpol_set_intent(pg, PQ_INTENT_I);
    474 }
    475 
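         /*
          * uvmpdpol_pageactivate_locked: move a page to the tail of the active
          * queue and stamp the time of activation into the PQ_TIME bits.
          */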
    476 static void
    477 uvmpdpol_pageactivate_locked(struct vm_page *pg)
    478 {
    479 	struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
    480 
    481 	KASSERT(mutex_owned(&s->lock));
    482 	KASSERT(mutex_owned(&pg->interlock));
    483 	KASSERT((pg->pqflags & (PQ_INTENT_MASK | PQ_INTENT_SET)) !=
    484 	    (PQ_INTENT_D | PQ_INTENT_SET));
    485 
    486 	uvmpdpol_pagedequeue_locked(pg);
    487 	TAILQ_INSERT_TAIL(&pdpol_state.s_activeq, pg, pdqueue);
    488 	pdpol_state.s_active++;
    489 	pg->pqflags = (pg->pqflags & PQ_INTENT_QUEUED) | PQ_ACTIVE |
    490 	    (hardclock_ticks & PQ_TIME);
    491 }
    492 
    493 void
    494 uvmpdpol_pageactivate(struct vm_page *pg)
    495 {
    496 	uint32_t pqflags;
    497 
    498 	KASSERT(uvm_page_owner_locked_p(pg));
    499 	KASSERT(mutex_owned(&pg->interlock));
    500 
    501 	/*
    502 	 * if there is any intent set on the page, or the page is not
    503 	 * active, or the page was activated in the "distant" past, then
    504 	 * it needs to be activated anew.
    505 	 */
    506 	pqflags = pg->pqflags;
    507 	if ((pqflags & PQ_INTENT_SET) != 0 ||
    508 	    (pqflags & PQ_ACTIVE) == 0 ||
    509 	    ((hardclock_ticks & PQ_TIME) - (pqflags & PQ_TIME)) > hz) {
    510 		uvmpdpol_set_intent(pg, PQ_INTENT_A);
    511 	}
    512 }
    513 
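         /*
          * uvmpdpol_pagedequeue_locked: remove a page from whichever queue it
          * is on, clearing all of its queue flags except PQ_INTENT_QUEUED.
          */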
    514 static void
    515 uvmpdpol_pagedequeue_locked(struct vm_page *pg)
    516 {
    517 	struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
    518 
    519 	KASSERT(mutex_owned(&s->lock));
    520 	KASSERT(mutex_owned(&pg->interlock));
    521 
    522 	if (pg->pqflags & PQ_ACTIVE) {
    523 		TAILQ_REMOVE(&pdpol_state.s_activeq, pg, pdqueue);
    524 		KASSERT((pg->pqflags & PQ_INACTIVE) == 0);
    525 		KASSERT(pdpol_state.s_active > 0);
    526 		pdpol_state.s_active--;
    527 	} else if (pg->pqflags & PQ_INACTIVE) {
    528 		TAILQ_REMOVE(&pdpol_state.s_inactiveq, pg, pdqueue);
    529 		KASSERT(pdpol_state.s_inactive > 0);
    530 		pdpol_state.s_inactive--;
    531 	}
    532 	pg->pqflags &= PQ_INTENT_QUEUED;
    533 }
    534 
    535 void
    536 uvmpdpol_pagedequeue(struct vm_page *pg)
    537 {
    538 
    539 	KASSERT(uvm_page_owner_locked_p(pg));
    540 	KASSERT(mutex_owned(&pg->interlock));
    541 
    542 	uvmpdpol_set_intent(pg, PQ_INTENT_D);
    543 }
    544 
    545 void
    546 uvmpdpol_pageenqueue(struct vm_page *pg)
    547 {
    548 
    549 	KASSERT(uvm_page_owner_locked_p(pg));
    550 	KASSERT(mutex_owned(&pg->interlock));
    551 
    552 	uvmpdpol_set_intent(pg, PQ_INTENT_E);
    553 }
    554 
    555 void
    556 uvmpdpol_anfree(struct vm_anon *an)
    557 {
    558 }
    559 
    560 bool
    561 uvmpdpol_pageisqueued_p(struct vm_page *pg)
    562 {
    563 	uint32_t pqflags;
    564 
    565 	/*
    566 	 * if there's an intent set, we have to consider it.  otherwise,
    567 	 * return the actual state.  we may be called unlocked for the
    568 	 * purpose of assertions, which is safe due to the page lifecycle.
    569 	 */
    570 	pqflags = atomic_load_relaxed(&pg->pqflags);
    571 	if ((pqflags & PQ_INTENT_SET) != 0) {
    572 		return (pqflags & PQ_INTENT_MASK) != PQ_INTENT_D;
    573 	} else {
    574 		return (pqflags & (PQ_ACTIVE | PQ_INACTIVE)) != 0;
    575 	}
    576 }
    577 
    578 void
    579 uvmpdpol_estimatepageable(int *active, int *inactive)
    580 {
    581 	struct uvmpdpol_globalstate *s = &pdpol_state;
    582 
    583 	mutex_enter(&s->lock);
    584 	if (active) {
    585 		*active = pdpol_state.s_active;
    586 	}
    587 	if (inactive) {
    588 		*inactive = pdpol_state.s_inactive;
    589 	}
    590 	mutex_exit(&s->lock);
    591 }
    592 
    593 #if !defined(PDSIM)
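         /*
          * min_check: validation hook for the anonmin, filemin and execmin
          * percentage parameters: reject an update that would make the three
          * minimums total more than 95%.
          */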
    594 static int
    595 min_check(struct uvm_pctparam *pct, int t)
    596 {
    597 	struct uvmpdpol_globalstate *s = &pdpol_state;
    598 	int total = t;
    599 
    600 	if (pct != &s->s_anonmin) {
    601 		total += uvm_pctparam_get(&s->s_anonmin);
    602 	}
    603 	if (pct != &s->s_filemin) {
    604 		total += uvm_pctparam_get(&s->s_filemin);
    605 	}
    606 	if (pct != &s->s_execmin) {
    607 		total += uvm_pctparam_get(&s->s_execmin);
    608 	}
    609 	if (total > 95) {
    610 		return EINVAL;
    611 	}
    612 	return 0;
    613 }
    614 #endif /* !defined(PDSIM) */
    615 
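         /*
          * uvmpdpol_init: initialize the global state: the lock, the active
          * and inactive queues, and the default percentage parameters.
          */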
    616 void
    617 uvmpdpol_init(void)
    618 {
    619 	struct uvmpdpol_globalstate *s = &pdpol_state;
    620 
    621 	mutex_init(&s->lock, MUTEX_DEFAULT, IPL_NONE);
    622 	TAILQ_INIT(&s->s_activeq);
    623 	TAILQ_INIT(&s->s_inactiveq);
    624 	uvm_pctparam_init(&s->s_inactivepct, CLOCK_INACTIVEPCT, NULL);
    625 	uvm_pctparam_init(&s->s_anonmin, 10, min_check);
    626 	uvm_pctparam_init(&s->s_filemin, 10, min_check);
    627 	uvm_pctparam_init(&s->s_execmin,  5, min_check);
    628 	uvm_pctparam_init(&s->s_anonmax, 80, NULL);
    629 	uvm_pctparam_init(&s->s_filemax, 50, NULL);
    630 	uvm_pctparam_init(&s->s_execmax, 30, NULL);
    631 }
    632 
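         /*
          * uvmpdpol_init_cpu: allocate this CPU's queue of pending page status
          * changes.  the queue starts empty (pdqhead == pdqtail) and is filled
          * downwards, stack-like, by uvmpdpol_pagerealize().
          */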
    633 void
    634 uvmpdpol_init_cpu(struct uvm_cpu *ucpu)
    635 {
    636 
    637 	ucpu->pdq =
    638 	    kmem_alloc(CLOCK_PDQ_SIZE * sizeof(struct vm_page *), KM_SLEEP);
    639 	ucpu->pdqhead = CLOCK_PDQ_SIZE;
    640 	ucpu->pdqtail = CLOCK_PDQ_SIZE;
    641 }
    642 
    643 void
    644 uvmpdpol_reinit(void)
    645 {
    646 }
    647 
    648 bool
    649 uvmpdpol_needsscan_p(void)
    650 {
    651 
    652 	/*
    653 	 * this must be an unlocked check: can be called from interrupt.
    654 	 */
    655 	return pdpol_state.s_inactive < pdpol_state.s_inactarg;
    656 }
    657 
    658 void
    659 uvmpdpol_tune(void)
    660 {
    661 	struct uvmpdpol_globalstate *s = &pdpol_state;
    662 
    663 	mutex_enter(&s->lock);
    664 	clock_tune();
    665 	mutex_exit(&s->lock);
    666 }
    667 
    668 /*
     669  * uvmpdpol_pagerealize_locked: take the intended state set on an individual
    670  * page and make it real.  return true if any work was done.
    671  */
    672 static bool
    673 uvmpdpol_pagerealize_locked(struct vm_page *pg)
    674 {
    675 	struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
    676 
    677 	KASSERT(mutex_owned(&s->lock));
    678 	KASSERT(mutex_owned(&pg->interlock));
    679 
    680 	switch (pg->pqflags & (PQ_INTENT_MASK | PQ_INTENT_SET)) {
    681 	case PQ_INTENT_A | PQ_INTENT_SET:
    682 	case PQ_INTENT_E | PQ_INTENT_SET:
    683 		uvmpdpol_pageactivate_locked(pg);
    684 		return true;
    685 	case PQ_INTENT_I | PQ_INTENT_SET:
    686 		uvmpdpol_pagedeactivate_locked(pg);
    687 		return true;
    688 	case PQ_INTENT_D | PQ_INTENT_SET:
    689 		uvmpdpol_pagedequeue_locked(pg);
    690 		return true;
    691 	default:
    692 		return false;
    693 	}
    694 }
    695 
    696 /*
    697  * uvmpdpol_flush: return the current uvm_cpu with all of its pending
    698  * updates flushed to the global queues.  this routine may block, and
     699  * so can switch cpu.  the idea is to empty the queue on whatever cpu
    700  * we finally end up on.
    701  */
    702 static struct uvm_cpu *
    703 uvmpdpol_flush(void)
    704 {
    705 	struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
    706 	struct uvm_cpu *ucpu;
    707 	struct vm_page *pg;
    708 
    709 	KASSERT(kpreempt_disabled());
    710 
    711 	mutex_enter(&s->lock);
    712 	for (;;) {
    713 		/*
    714 		 * prefer scanning forwards (even though mutex_enter() is
     715 		 * serializing) so as not to defeat any prefetch logic in
     716 		 * the CPU.  that means enqueuing backwards elsewhere, like
     717 		 * a stack, but that is less important there as pages are
     718 		 * added one at a time.
    719 		 *
    720 		 * prefetch the next "struct vm_page" while working on the
    721 		 * current one.  this has a measurable and very positive
    722 		 * effect in reducing the amount of time spent here under
    723 		 * the global lock.
    724 		 */
    725 		ucpu = curcpu()->ci_data.cpu_uvm;
    726 		KASSERT(ucpu->pdqhead <= ucpu->pdqtail);
    727 		if (__predict_false(ucpu->pdqhead == ucpu->pdqtail)) {
    728 			break;
    729 		}
    730 		pg = ucpu->pdq[ucpu->pdqhead++];
    731 		if (__predict_true(ucpu->pdqhead != ucpu->pdqtail)) {
    732 			__builtin_prefetch(ucpu->pdq[ucpu->pdqhead]);
    733 		}
    734 		mutex_enter(&pg->interlock);
    735 		pg->pqflags &= ~PQ_INTENT_QUEUED;
    736 		(void)uvmpdpol_pagerealize_locked(pg);
    737 		mutex_exit(&pg->interlock);
    738 	}
    739 	mutex_exit(&s->lock);
    740 	return ucpu;
    741 }
    742 
    743 /*
    744  * uvmpdpol_pagerealize: realize any intent set on the page.  in this
    745  * implementation, that means putting the page on a per-CPU queue to be
    746  * dealt with later.
    747  */
    748 void
    749 uvmpdpol_pagerealize(struct vm_page *pg)
    750 {
    751 	struct uvm_cpu *ucpu;
    752 
    753 	/*
     754 	 * drain the per-CPU queue if it is full, then enqueue the page.
    755 	 */
    756 	kpreempt_disable();
    757 	ucpu = curcpu()->ci_data.cpu_uvm;
    758 	if (__predict_false(ucpu->pdqhead == 0)) {
    759 		ucpu = uvmpdpol_flush();
    760 	}
    761 	ucpu->pdq[--(ucpu->pdqhead)] = pg;
    762 	kpreempt_enable();
    763 }
    764 
    765 /*
    766  * uvmpdpol_idle: called from the system idle loop.  periodically purge any
    767  * pending updates back to the global queues.
    768  */
    769 void
    770 uvmpdpol_idle(struct uvm_cpu *ucpu)
    771 {
    772 	struct uvmpdpol_globalstate *s = &pdpol_state;
    773 	struct vm_page *pg;
    774 
    775 	KASSERT(kpreempt_disabled());
    776 
    777 	/*
    778 	 * if no pages in the queue, we have nothing to do.
    779 	 */
    780 	if (ucpu->pdqhead == ucpu->pdqtail) {
    781 		ucpu->pdqtime = hardclock_ticks;
    782 		return;
    783 	}
    784 
    785 	/*
     786 	 * don't do this more than ~8 times a second as it would put
     787 	 * needless pressure on the global lock.
    788 	 */
    789 	if (hardclock_ticks - ucpu->pdqtime < (hz >> 3)) {
    790 		return;
    791 	}
    792 
    793 	/*
    794 	 * the idle LWP can't block, so we have to try for the lock.  if we
    795 	 * get it, purge the per-CPU pending update queue.  continually
    796 	 * check for a pending resched: in that case exit immediately.
    797 	 */
    798 	if (mutex_tryenter(&s->lock)) {
    799 		while (ucpu->pdqhead != ucpu->pdqtail) {
    800 			pg = ucpu->pdq[ucpu->pdqhead];
    801 			if (!mutex_tryenter(&pg->interlock)) {
    802 				break;
    803 			}
    804 			ucpu->pdqhead++;
    805 			pg->pqflags &= ~PQ_INTENT_QUEUED;
    806 			(void)uvmpdpol_pagerealize_locked(pg);
    807 			mutex_exit(&pg->interlock);
    808 			if (curcpu()->ci_want_resched) {
    809 				break;
    810 			}
    811 		}
    812 		if (ucpu->pdqhead == ucpu->pdqtail) {
    813 			ucpu->pdqtime = hardclock_ticks;
    814 		}
    815 		mutex_exit(&s->lock);
    816 	}
    817 }
    818 
    819 #if !defined(PDSIM)
    820 
    821 #include <sys/sysctl.h>	/* XXX SYSCTL_DESCR */
    822 
    823 void
    824 uvmpdpol_sysctlsetup(void)
    825 {
    826 	struct uvmpdpol_globalstate *s = &pdpol_state;
    827 
    828 	uvm_pctparam_createsysctlnode(&s->s_anonmin, "anonmin",
    829 	    SYSCTL_DESCR("Percentage of physical memory reserved "
    830 	    "for anonymous application data"));
    831 	uvm_pctparam_createsysctlnode(&s->s_filemin, "filemin",
    832 	    SYSCTL_DESCR("Percentage of physical memory reserved "
    833 	    "for cached file data"));
    834 	uvm_pctparam_createsysctlnode(&s->s_execmin, "execmin",
    835 	    SYSCTL_DESCR("Percentage of physical memory reserved "
    836 	    "for cached executable data"));
    837 
    838 	uvm_pctparam_createsysctlnode(&s->s_anonmax, "anonmax",
    839 	    SYSCTL_DESCR("Percentage of physical memory which will "
    840 	    "be reclaimed from other usage for "
    841 	    "anonymous application data"));
    842 	uvm_pctparam_createsysctlnode(&s->s_filemax, "filemax",
    843 	    SYSCTL_DESCR("Percentage of physical memory which will "
    844 	    "be reclaimed from other usage for cached "
    845 	    "file data"));
    846 	uvm_pctparam_createsysctlnode(&s->s_execmax, "execmax",
    847 	    SYSCTL_DESCR("Percentage of physical memory which will "
    848 	    "be reclaimed from other usage for cached "
    849 	    "executable data"));
    850 
    851 	uvm_pctparam_createsysctlnode(&s->s_inactivepct, "inactivepct",
    852 	    SYSCTL_DESCR("Percentage of inactive queue of "
    853 	    "the entire (active + inactive) queue"));
    854 }
    855 
    856 #endif /* !defined(PDSIM) */
    857 
    858 #if defined(PDSIM)
    859 void
    860 pdsim_dump(const char *id)
    861 {
    862 #if defined(DEBUG)
    863 	/* XXX */
    864 #endif /* defined(DEBUG) */
    865 }
    866 #endif /* defined(PDSIM) */
    867