/*	$NetBSD: uvm_pdpolicy_clock.c,v 1.30 2020/01/01 14:33:48 ad Exp $	*/
/*	NetBSD: uvm_pdaemon.c,v 1.72 2006/01/05 10:47:33 yamt Exp $	*/

/*-
 * Copyright (c) 2019 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1997 Charles D. Cranor and Washington University.
 * Copyright (c) 1991, 1993, The Regents of the University of California.
 *
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vm_pageout.c        8.5 (Berkeley) 2/14/94
 * from: Id: uvm_pdaemon.c,v 1.1.2.32 1998/02/06 05:26:30 chs Exp
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

#if defined(PDSIM)

#include "pdsim.h"

#else /* defined(PDSIM) */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_pdpolicy_clock.c,v 1.30 2020/01/01 14:33:48 ad Exp $");

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/atomic.h>

#include <uvm/uvm.h>
#include <uvm/uvm_pdpolicy.h>
#include <uvm/uvm_pdpolicy_impl.h>
#include <uvm/uvm_stat.h>

#endif /* defined(PDSIM) */

/*
 * per-CPU queue of pending page status changes.  128 entries makes for a
 * 1kB queue on _LP64 and has been found to be a reasonable compromise that
 * keeps lock contention events and wait times low, while not using too much
 * memory nor allowing global state to fall too far behind.
 */
#if !defined(CLOCK_PDQ_SIZE)
#define	CLOCK_PDQ_SIZE	128
#endif /* !defined(CLOCK_PDQ_SIZE) */

#define	PQ_TIME		0xffffffc0	/* time of last activation */
#define PQ_INACTIVE	0x00000010	/* page is in inactive list */
#define PQ_ACTIVE	0x00000020	/* page is in active list */

#if !defined(CLOCK_INACTIVEPCT)
#define	CLOCK_INACTIVEPCT	33
#endif /* !defined(CLOCK_INACTIVEPCT) */

struct uvmpdpol_globalstate {
	kmutex_t lock;			/* lock on state */
					/* <= compiler pads here */
	struct pglist s_activeq		/* allocated pages, in use */
	    __aligned(COHERENCY_UNIT);
	struct pglist s_inactiveq;	/* pages between the clock hands */
	int s_active;
	int s_inactive;
	int s_inactarg;
	struct uvm_pctparam s_anonmin;
	struct uvm_pctparam s_filemin;
	struct uvm_pctparam s_execmin;
	struct uvm_pctparam s_anonmax;
	struct uvm_pctparam s_filemax;
	struct uvm_pctparam s_execmax;
	struct uvm_pctparam s_inactivepct;
};

struct uvmpdpol_scanstate {
	bool ss_anonreact, ss_filereact, ss_execreact;
	struct vm_page ss_marker;
};

static void	uvmpdpol_pageactivate_locked(struct vm_page *);
static void	uvmpdpol_pagedeactivate_locked(struct vm_page *);
static void	uvmpdpol_pagedequeue_locked(struct vm_page *);
static bool	uvmpdpol_pagerealize_locked(struct vm_page *);
static struct uvm_cpu *uvmpdpol_flush(void);

static struct uvmpdpol_globalstate pdpol_state __cacheline_aligned;
static struct uvmpdpol_scanstate pdpol_scanstate;

PDPOL_EVCNT_DEFINE(reactexec)
PDPOL_EVCNT_DEFINE(reactfile)
PDPOL_EVCNT_DEFINE(reactanon)

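/*
 * clock_tune: recompute the inactive page target as a percentage of the
 * total number of pages on the paging queues, keeping it above the free
 * page target so the scanner always has pages to work with.
 */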
static void
clock_tune(void)
{
	struct uvmpdpol_globalstate *s = &pdpol_state;

	s->s_inactarg = UVM_PCTPARAM_APPLY(&s->s_inactivepct,
	    s->s_active + s->s_inactive);
	if (s->s_inactarg <= uvmexp.freetarg) {
		s->s_inactarg = uvmexp.freetarg + 1;
	}
}

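/*
 * uvmpdpol_scaninit: prepare for a pass over the inactive queue.  snapshot
 * the page counters, decide which page types (anon, file, exec) should be
 * reactivated rather than reclaimed on this pass, and insert a marker at
 * the head of the inactive queue for the scan to walk from.
 */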
void
uvmpdpol_scaninit(void)
{
	struct uvmpdpol_globalstate *s = &pdpol_state;
	struct uvmpdpol_scanstate *ss = &pdpol_scanstate;
	int t;
	bool anonunder, fileunder, execunder;
	bool anonover, fileover, execover;
	bool anonreact, filereact, execreact;
	int64_t freepg, anonpg, filepg, execpg;

	/*
	 * decide which types of pages we want to reactivate instead of
	 * freeing, to keep usage within the minimum and maximum limits.
	 */

	cpu_count_sync_all();
	freepg = uvm_availmem();
	anonpg = cpu_count_get(CPU_COUNT_ANONPAGES);
	filepg = cpu_count_get(CPU_COUNT_FILEPAGES);
	execpg = cpu_count_get(CPU_COUNT_EXECPAGES);

	mutex_enter(&s->lock);
	t = s->s_active + s->s_inactive + freepg;
	anonunder = anonpg <= UVM_PCTPARAM_APPLY(&s->s_anonmin, t);
	fileunder = filepg <= UVM_PCTPARAM_APPLY(&s->s_filemin, t);
	execunder = execpg <= UVM_PCTPARAM_APPLY(&s->s_execmin, t);
	anonover = anonpg > UVM_PCTPARAM_APPLY(&s->s_anonmax, t);
	fileover = filepg > UVM_PCTPARAM_APPLY(&s->s_filemax, t);
	execover = execpg > UVM_PCTPARAM_APPLY(&s->s_execmax, t);
	anonreact = anonunder || (!anonover && (fileover || execover));
	filereact = fileunder || (!fileover && (anonover || execover));
	execreact = execunder || (!execover && (anonover || fileover));
	if (filereact && execreact && (anonreact || uvm_swapisfull())) {
		anonreact = filereact = execreact = false;
	}
	ss->ss_anonreact = anonreact;
	ss->ss_filereact = filereact;
	ss->ss_execreact = execreact;
	memset(&ss->ss_marker, 0, sizeof(ss->ss_marker));
	ss->ss_marker.flags = PG_MARKER;
	TAILQ_INSERT_HEAD(&pdpol_state.s_inactiveq, &ss->ss_marker, pdqueue);
	mutex_exit(&s->lock);
}

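/*
 * uvmpdpol_scanfini: end of a scan pass; remove the scan marker from the
 * inactive queue.
 */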
void
uvmpdpol_scanfini(void)
{
	struct uvmpdpol_globalstate *s = &pdpol_state;
	struct uvmpdpol_scanstate *ss = &pdpol_scanstate;

	mutex_enter(&s->lock);
	TAILQ_REMOVE(&pdpol_state.s_inactiveq, &ss->ss_marker, pdqueue);
	mutex_exit(&s->lock);
}

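/*
 * uvmpdpol_selectvictim: select the next inactive page to consider for
 * reclaim.  returns the page with its owner's lock held via *plock, or
 * NULL when the end of the inactive queue is reached.  pages whose type
 * is being reactivated on this pass, and pages found to be referenced,
 * are moved back to the active queue instead.
 */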
struct vm_page *
uvmpdpol_selectvictim(kmutex_t **plock)
{
	struct uvmpdpol_globalstate *s = &pdpol_state;
	struct uvmpdpol_scanstate *ss = &pdpol_scanstate;
	struct vm_page *pg;
	kmutex_t *lock;

	mutex_enter(&s->lock);
	while (/* CONSTCOND */ 1) {
		struct vm_anon *anon;
		struct uvm_object *uobj;

		pg = TAILQ_NEXT(&ss->ss_marker, pdqueue);
		if (pg == NULL) {
			break;
		}
		KASSERT((pg->flags & PG_MARKER) == 0);
		uvmexp.pdscans++;

		/*
		 * acquire interlock to stabilize page identity.
		 * if we have caught the page in a state of flux,
		 * deal with it and retry.
		 */
		mutex_enter(&pg->interlock);
		if (uvmpdpol_pagerealize_locked(pg)) {
			mutex_exit(&pg->interlock);
			continue;
		}

		/*
		 * now prepare to move on to the next page.
		 */
		TAILQ_REMOVE(&pdpol_state.s_inactiveq, &ss->ss_marker,
		    pdqueue);
		TAILQ_INSERT_AFTER(&pdpol_state.s_inactiveq, pg,
		    &ss->ss_marker, pdqueue);

		/*
		 * enforce the minimum thresholds on different
		 * types of memory usage.  if reusing the current
		 * page would reduce that type of usage below its
		 * minimum, reactivate the page instead and move
		 * on to the next page.
		 */
		anon = pg->uanon;
		uobj = pg->uobject;
		if (uobj && UVM_OBJ_IS_VTEXT(uobj) && ss->ss_execreact) {
			uvmpdpol_pageactivate_locked(pg);
			mutex_exit(&pg->interlock);
			PDPOL_EVCNT_INCR(reactexec);
			continue;
		}
		if (uobj && UVM_OBJ_IS_VNODE(uobj) &&
		    !UVM_OBJ_IS_VTEXT(uobj) && ss->ss_filereact) {
			uvmpdpol_pageactivate_locked(pg);
			mutex_exit(&pg->interlock);
			PDPOL_EVCNT_INCR(reactfile);
			continue;
		}
		if ((anon || UVM_OBJ_IS_AOBJ(uobj)) && ss->ss_anonreact) {
			uvmpdpol_pageactivate_locked(pg);
			mutex_exit(&pg->interlock);
			PDPOL_EVCNT_INCR(reactanon);
			continue;
		}

		/*
		 * try to lock the object that owns the page.
		 *
		 * with the page interlock held, we can drop s->lock, which
		 * could otherwise serve as a barrier to us getting the
		 * object locked, because the owner of the object's lock may
		 * be blocked on s->lock (i.e. a deadlock).
		 *
		 * whatever happens, uvmpd_trylockowner() will release the
		 * interlock.  with the interlock dropped we can then
		 * re-acquire our own lock.  the order is:
		 *
		 *	object -> pdpol -> interlock.
		 */
		mutex_exit(&s->lock);
		lock = uvmpd_trylockowner(pg);
		/* pg->interlock now released */
		mutex_enter(&s->lock);
		if (lock == NULL) {
			/* didn't get it - try the next page. */
			continue;
		}

		/*
		 * move referenced pages back to active queue and skip to
		 * next page.
		 */
		if (pmap_is_referenced(pg)) {
			mutex_enter(&pg->interlock);
			uvmpdpol_pageactivate_locked(pg);
			mutex_exit(&pg->interlock);
			uvmexp.pdreact++;
			mutex_exit(lock);
			continue;
		}

		/* we have a potential victim. */
		*plock = lock;
		break;
	}
	mutex_exit(&s->lock);
	return pg;
}

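/*
 * uvmpdpol_balancequeue: scan the active queue, deactivating pages until
 * the inactive target is met and, if there is a swap shortage, freeing
 * swap slots from swap-backed pages along the way.
 */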
void
uvmpdpol_balancequeue(int swap_shortage)
{
	struct uvmpdpol_globalstate *s = &pdpol_state;
	int inactive_shortage;
	struct vm_page *p, marker;
	kmutex_t *lock;

	/*
	 * we have done the scan to get free pages.  now we work on meeting
	 * our inactive target.
	 */

	memset(&marker, 0, sizeof(marker));
	marker.flags = PG_MARKER;

	mutex_enter(&s->lock);
	TAILQ_INSERT_HEAD(&pdpol_state.s_activeq, &marker, pdqueue);
	for (;;) {
		inactive_shortage =
		    pdpol_state.s_inactarg - pdpol_state.s_inactive;
		if (inactive_shortage <= 0 && swap_shortage <= 0) {
			break;
		}
		p = TAILQ_NEXT(&marker, pdqueue);
		if (p == NULL) {
			break;
		}
		KASSERT((p->flags & PG_MARKER) == 0);

		/*
		 * acquire interlock to stabilize page identity.
		 * if we have caught the page in a state of flux,
		 * deal with it and retry.
		 */
		mutex_enter(&p->interlock);
		if (uvmpdpol_pagerealize_locked(p)) {
			mutex_exit(&p->interlock);
			continue;
		}

		/*
		 * now prepare to move on to the next page.
		 */
		TAILQ_REMOVE(&pdpol_state.s_activeq, &marker, pdqueue);
		TAILQ_INSERT_AFTER(&pdpol_state.s_activeq, p, &marker,
		    pdqueue);

		/*
		 * try to lock the object that owns the page.  see comments
		 * in uvmpdpol_selectvictim().
		 */
		mutex_exit(&s->lock);
		lock = uvmpd_trylockowner(p);
		/* p->interlock now released */
		mutex_enter(&s->lock);
		if (lock == NULL) {
			/* didn't get it - try the next page. */
			continue;
		}

		/*
		 * if there's a shortage of swap slots, try to free this
		 * page's swap slot.
		 */
		if (swap_shortage > 0 && (p->flags & PG_SWAPBACKED) != 0 &&
		    (p->flags & PG_BUSY) == 0) {
			if (uvmpd_dropswap(p)) {
				swap_shortage--;
			}
		}

		/*
		 * if there's a shortage of inactive pages, deactivate.
		 */
		if (inactive_shortage > 0) {
			pmap_clear_reference(p);
			mutex_enter(&p->interlock);
			uvmpdpol_pagedeactivate_locked(p);
			mutex_exit(&p->interlock);
			uvmexp.pddeact++;
			inactive_shortage--;
		}
		mutex_exit(lock);
	}
	TAILQ_REMOVE(&pdpol_state.s_activeq, &marker, pdqueue);
	mutex_exit(&s->lock);
}

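/*
 * uvmpdpol_pagedeactivate_locked: move a page onto the tail of the
 * inactive queue, removing it from the active queue first if needed.
 * called with the global state lock and the page interlock held.
 */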
static void
uvmpdpol_pagedeactivate_locked(struct vm_page *pg)
{
	struct uvmpdpol_globalstate *s __diagused = &pdpol_state;

	KASSERT(mutex_owned(&s->lock));
	KASSERT(mutex_owned(&pg->interlock));
	KASSERT((pg->pqflags & (PQ_INTENT_MASK | PQ_INTENT_SET)) !=
	    (PQ_INTENT_D | PQ_INTENT_SET));

	if (pg->pqflags & PQ_ACTIVE) {
		TAILQ_REMOVE(&pdpol_state.s_activeq, pg, pdqueue);
		KASSERT(pdpol_state.s_active > 0);
		pdpol_state.s_active--;
	}
	if ((pg->pqflags & PQ_INACTIVE) == 0) {
		KASSERT(pg->wire_count == 0);
		TAILQ_INSERT_TAIL(&pdpol_state.s_inactiveq, pg, pdqueue);
		pdpol_state.s_inactive++;
	}
	pg->pqflags = (pg->pqflags & PQ_INTENT_QUEUED) | PQ_INACTIVE;
}

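/*
 * uvmpdpol_pagedeactivate: note a deactivation request for a page.  the
 * reference bit is cleared here and the actual queue change is deferred
 * by recording a PQ_INTENT_I intent on the page.
 */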
void
uvmpdpol_pagedeactivate(struct vm_page *pg)
{

	KASSERT(uvm_page_owner_locked_p(pg));
	KASSERT(mutex_owned(&pg->interlock));

	/*
	 * we have to clear the reference bit now, as when it comes time to
	 * realize the intent we won't have the object locked any more.
	 */
	pmap_clear_reference(pg);
	uvmpdpol_set_intent(pg, PQ_INTENT_I);
}

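/*
 * uvmpdpol_pageactivate_locked: move a page onto the tail of the active
 * queue and stamp it with the current time.  called with the global state
 * lock and the page interlock held.
 */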
static void
uvmpdpol_pageactivate_locked(struct vm_page *pg)
{
	struct uvmpdpol_globalstate *s __diagused = &pdpol_state;

	KASSERT(mutex_owned(&s->lock));
	KASSERT(mutex_owned(&pg->interlock));
	KASSERT((pg->pqflags & (PQ_INTENT_MASK | PQ_INTENT_SET)) !=
	    (PQ_INTENT_D | PQ_INTENT_SET));

	uvmpdpol_pagedequeue_locked(pg);
	TAILQ_INSERT_TAIL(&pdpol_state.s_activeq, pg, pdqueue);
	pdpol_state.s_active++;
	pg->pqflags = (pg->pqflags & PQ_INTENT_QUEUED) | PQ_ACTIVE |
	    (hardclock_ticks & PQ_TIME);
}

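/*
 * uvmpdpol_pageactivate: note an activation request for a page, deferring
 * the queue change by setting a PQ_INTENT_A intent.  pages activated
 * within the last second are left alone to avoid churning the queues.
 */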
void
uvmpdpol_pageactivate(struct vm_page *pg)
{
	uint32_t pqflags;

	KASSERT(uvm_page_owner_locked_p(pg));
	KASSERT(mutex_owned(&pg->interlock));

	/*
	 * if there is any intent set on the page, or the page is not
	 * active, or the page was activated in the "distant" past, then
	 * it needs to be activated anew.
	 */
	pqflags = pg->pqflags;
	if ((pqflags & PQ_INTENT_SET) != 0 ||
	    (pqflags & PQ_ACTIVE) == 0 ||
	    ((hardclock_ticks & PQ_TIME) - (pqflags & PQ_TIME)) > hz) {
		uvmpdpol_set_intent(pg, PQ_INTENT_A);
	}
}

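/*
 * uvmpdpol_pagedequeue_locked: remove a page from whichever paging queue
 * it is on.  called with the global state lock and the page interlock
 * held.
 */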
static void
uvmpdpol_pagedequeue_locked(struct vm_page *pg)
{
	struct uvmpdpol_globalstate *s __diagused = &pdpol_state;

	KASSERT(mutex_owned(&s->lock));
	KASSERT(mutex_owned(&pg->interlock));

	if (pg->pqflags & PQ_ACTIVE) {
		TAILQ_REMOVE(&pdpol_state.s_activeq, pg, pdqueue);
		KASSERT((pg->pqflags & PQ_INACTIVE) == 0);
		KASSERT(pdpol_state.s_active > 0);
		pdpol_state.s_active--;
	} else if (pg->pqflags & PQ_INACTIVE) {
		TAILQ_REMOVE(&pdpol_state.s_inactiveq, pg, pdqueue);
		KASSERT(pdpol_state.s_inactive > 0);
		pdpol_state.s_inactive--;
	}
	pg->pqflags &= PQ_INTENT_QUEUED;
}

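/*
 * uvmpdpol_pagedequeue: note a dequeue request for a page by setting a
 * PQ_INTENT_D intent; the page will be taken off the queues later.
 */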
void
uvmpdpol_pagedequeue(struct vm_page *pg)
{

	KASSERT(uvm_page_owner_locked_p(pg));
	KASSERT(mutex_owned(&pg->interlock));

	uvmpdpol_set_intent(pg, PQ_INTENT_D);
}

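/*
 * uvmpdpol_pageenqueue: note an enqueue request for a page by setting a
 * PQ_INTENT_E intent; in this policy the page ends up on the active
 * queue when the intent is realized.
 */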
void
uvmpdpol_pageenqueue(struct vm_page *pg)
{

	KASSERT(uvm_page_owner_locked_p(pg));
	KASSERT(mutex_owned(&pg->interlock));

	uvmpdpol_set_intent(pg, PQ_INTENT_E);
}

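/*
 * uvmpdpol_anfree: hook invoked when an anon is freed; nothing to do for
 * this policy.
 */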
void
uvmpdpol_anfree(struct vm_anon *an)
{
}

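/*
 * uvmpdpol_pageisqueued_p: return true if the page is on, or is intended
 * to be put on, a paging queue.
 */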
bool
uvmpdpol_pageisqueued_p(struct vm_page *pg)
{
	uint32_t pqflags;

	/*
	 * if there's an intent set, we have to consider it.  otherwise,
	 * return the actual state.  we may be called unlocked for the
	 * purpose of assertions, which is safe due to the page lifecycle.
	 */
	pqflags = atomic_load_relaxed(&pg->pqflags);
	if ((pqflags & PQ_INTENT_SET) != 0) {
		return (pqflags & PQ_INTENT_MASK) != PQ_INTENT_D;
	} else {
		return (pqflags & (PQ_ACTIVE | PQ_INACTIVE)) != 0;
	}
}

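/*
 * uvmpdpol_estimatepageable: report the current number of active and
 * inactive pages tracked by the policy.
 */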
void
uvmpdpol_estimatepageable(int *active, int *inactive)
{
	struct uvmpdpol_globalstate *s = &pdpol_state;

	mutex_enter(&s->lock);
	if (active) {
		*active = pdpol_state.s_active;
	}
	if (inactive) {
		*inactive = pdpol_state.s_inactive;
	}
	mutex_exit(&s->lock);
}

#if !defined(PDSIM)
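/*
 * min_check: sysctl validation hook for the anonmin/filemin/execmin
 * parameters; reject a new value if the three minimums would sum to more
 * than 95% of memory.
 */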
static int
min_check(struct uvm_pctparam *pct, int t)
{
	struct uvmpdpol_globalstate *s = &pdpol_state;
	int total = t;

	if (pct != &s->s_anonmin) {
		total += uvm_pctparam_get(&s->s_anonmin);
	}
	if (pct != &s->s_filemin) {
		total += uvm_pctparam_get(&s->s_filemin);
	}
	if (pct != &s->s_execmin) {
		total += uvm_pctparam_get(&s->s_execmin);
	}
	if (total > 95) {
		return EINVAL;
	}
	return 0;
}
#endif /* !defined(PDSIM) */

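/*
 * uvmpdpol_init: initialize the global policy state, the paging queues
 * and the default tuning parameters.
 */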
void
uvmpdpol_init(void)
{
	struct uvmpdpol_globalstate *s = &pdpol_state;

	mutex_init(&s->lock, MUTEX_DEFAULT, IPL_NONE);
	TAILQ_INIT(&s->s_activeq);
	TAILQ_INIT(&s->s_inactiveq);
	uvm_pctparam_init(&s->s_inactivepct, CLOCK_INACTIVEPCT, NULL);
	uvm_pctparam_init(&s->s_anonmin, 10, min_check);
	uvm_pctparam_init(&s->s_filemin, 10, min_check);
	uvm_pctparam_init(&s->s_execmin,  5, min_check);
	uvm_pctparam_init(&s->s_anonmax, 80, NULL);
	uvm_pctparam_init(&s->s_filemax, 50, NULL);
	uvm_pctparam_init(&s->s_execmax, 30, NULL);
}

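/*
 * uvmpdpol_init_cpu: allocate and reset the per-CPU queue of pending page
 * status changes.
 */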
void
uvmpdpol_init_cpu(struct uvm_cpu *ucpu)
{

	ucpu->pdq =
	    kmem_alloc(CLOCK_PDQ_SIZE * sizeof(struct vm_page *), KM_SLEEP);
	ucpu->pdqhead = CLOCK_PDQ_SIZE;
	ucpu->pdqtail = CLOCK_PDQ_SIZE;
}

void
uvmpdpol_reinit(void)
{
}

bool
uvmpdpol_needsscan_p(void)
{

	/*
	 * this must be an unlocked check: can be called from interrupt.
	 */
	return pdpol_state.s_inactive < pdpol_state.s_inactarg;
}

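/*
 * uvmpdpol_tune: take the global lock and recompute the inactive page
 * target.
 */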
void
uvmpdpol_tune(void)
{
	struct uvmpdpol_globalstate *s = &pdpol_state;

	mutex_enter(&s->lock);
	clock_tune();
	mutex_exit(&s->lock);
}

/*
 * uvmpdpol_pagerealize_locked: take the intended state set on a page and
 * make it real.  return true if any work was done.
 */
static bool
uvmpdpol_pagerealize_locked(struct vm_page *pg)
{
	struct uvmpdpol_globalstate *s __diagused = &pdpol_state;

	KASSERT(mutex_owned(&s->lock));
	KASSERT(mutex_owned(&pg->interlock));

	switch (pg->pqflags & (PQ_INTENT_MASK | PQ_INTENT_SET)) {
	case PQ_INTENT_A | PQ_INTENT_SET:
	case PQ_INTENT_E | PQ_INTENT_SET:
		uvmpdpol_pageactivate_locked(pg);
		return true;
	case PQ_INTENT_I | PQ_INTENT_SET:
		uvmpdpol_pagedeactivate_locked(pg);
		return true;
	case PQ_INTENT_D | PQ_INTENT_SET:
		uvmpdpol_pagedequeue_locked(pg);
		return true;
	default:
		return false;
	}
}

/*
 * uvmpdpol_flush: return the current uvm_cpu with all of its pending
 * updates flushed to the global queues.  this routine may block, and
 * so can switch cpu.  the idea is to empty the queue on whatever cpu
 * we finally end up on.
 */
static struct uvm_cpu *
uvmpdpol_flush(void)
{
	struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
	struct uvm_cpu *ucpu;
	struct vm_page *pg;

	KASSERT(kpreempt_disabled());

	mutex_enter(&s->lock);
	for (;;) {
		/*
		 * prefer scanning forwards (even though mutex_enter() is
		 * serializing) so as to not defeat any prefetch logic in
		 * the CPU.  that means elsewhere enqueuing backwards, like
		 * a stack, but that's not so important there as pages are
		 * added singly.
		 *
		 * prefetch the next "struct vm_page" while working on the
		 * current one.  this has a measurable and very positive
		 * effect in reducing the amount of time spent here under
		 * the global lock.
		 */
		ucpu = curcpu()->ci_data.cpu_uvm;
		KASSERT(ucpu->pdqhead <= ucpu->pdqtail);
		if (__predict_false(ucpu->pdqhead == ucpu->pdqtail)) {
			break;
		}
		pg = ucpu->pdq[ucpu->pdqhead++];
		if (__predict_true(ucpu->pdqhead != ucpu->pdqtail)) {
			__builtin_prefetch(ucpu->pdq[ucpu->pdqhead]);
		}
		mutex_enter(&pg->interlock);
		pg->pqflags &= ~PQ_INTENT_QUEUED;
		(void)uvmpdpol_pagerealize_locked(pg);
		mutex_exit(&pg->interlock);
	}
	mutex_exit(&s->lock);
	return ucpu;
}

/*
 * uvmpdpol_pagerealize: realize any intent set on the page.  in this
 * implementation, that means putting the page on a per-CPU queue to be
 * dealt with later.
 */
void
uvmpdpol_pagerealize(struct vm_page *pg)
{
	struct uvm_cpu *ucpu;

	/*
	 * drain the per-CPU queue if full, then enter the page.
	 */
	kpreempt_disable();
	ucpu = curcpu()->ci_data.cpu_uvm;
	if (__predict_false(ucpu->pdqhead == 0)) {
		ucpu = uvmpdpol_flush();
	}
	ucpu->pdq[--(ucpu->pdqhead)] = pg;
	kpreempt_enable();
}

/*
 * uvmpdpol_idle: called from the system idle loop.  periodically purge any
 * pending updates back to the global queues.
 */
void
uvmpdpol_idle(struct uvm_cpu *ucpu)
{
	struct uvmpdpol_globalstate *s = &pdpol_state;
	struct vm_page *pg;

	KASSERT(kpreempt_disabled());

	/*
	 * if no pages in the queue, we have nothing to do.
	 */
	if (ucpu->pdqhead == ucpu->pdqtail) {
		ucpu->pdqtime = hardclock_ticks;
		return;
	}

	/*
	 * don't do this more than ~8 times a second as it would needlessly
	 * exert pressure.
	 */
	if (hardclock_ticks - ucpu->pdqtime < (hz >> 3)) {
		return;
	}

	/*
	 * the idle LWP can't block, so we have to try for the lock.  if we
	 * get it, purge the per-CPU pending update queue.  check for a
	 * pending resched after each page and, if there is one, exit
	 * immediately.
	 */
	if (mutex_tryenter(&s->lock)) {
		while (ucpu->pdqhead != ucpu->pdqtail) {
			pg = ucpu->pdq[ucpu->pdqhead];
			if (!mutex_tryenter(&pg->interlock)) {
				break;
			}
			ucpu->pdqhead++;
			pg->pqflags &= ~PQ_INTENT_QUEUED;
			(void)uvmpdpol_pagerealize_locked(pg);
			mutex_exit(&pg->interlock);
			if (curcpu()->ci_want_resched) {
				break;
			}
		}
		if (ucpu->pdqhead == ucpu->pdqtail) {
			ucpu->pdqtime = hardclock_ticks;
		}
		mutex_exit(&s->lock);
	}
}

#if !defined(PDSIM)

#include <sys/sysctl.h>	/* XXX SYSCTL_DESCR */

void
uvmpdpol_sysctlsetup(void)
{
	struct uvmpdpol_globalstate *s = &pdpol_state;

	uvm_pctparam_createsysctlnode(&s->s_anonmin, "anonmin",
	    SYSCTL_DESCR("Percentage of physical memory reserved "
	    "for anonymous application data"));
	uvm_pctparam_createsysctlnode(&s->s_filemin, "filemin",
	    SYSCTL_DESCR("Percentage of physical memory reserved "
	    "for cached file data"));
	uvm_pctparam_createsysctlnode(&s->s_execmin, "execmin",
	    SYSCTL_DESCR("Percentage of physical memory reserved "
	    "for cached executable data"));

	uvm_pctparam_createsysctlnode(&s->s_anonmax, "anonmax",
	    SYSCTL_DESCR("Percentage of physical memory which will "
	    "be reclaimed from other usage for "
	    "anonymous application data"));
	uvm_pctparam_createsysctlnode(&s->s_filemax, "filemax",
	    SYSCTL_DESCR("Percentage of physical memory which will "
	    "be reclaimed from other usage for cached "
	    "file data"));
	uvm_pctparam_createsysctlnode(&s->s_execmax, "execmax",
	    SYSCTL_DESCR("Percentage of physical memory which will "
	    "be reclaimed from other usage for cached "
	    "executable data"));

	uvm_pctparam_createsysctlnode(&s->s_inactivepct, "inactivepct",
	    SYSCTL_DESCR("Percentage of inactive queue of "
	    "the entire (active + inactive) queue"));
}

#endif /* !defined(PDSIM) */

#if defined(PDSIM)
void
pdsim_dump(const char *id)
{
#if defined(DEBUG)
	/* XXX */
#endif /* defined(DEBUG) */
}
#endif /* defined(PDSIM) */