      1 /*	$NetBSD: uvm_pdpolicy_clock.c,v 1.35 2020/03/14 13:53:26 ad Exp $	*/
      2 /*	NetBSD: uvm_pdaemon.c,v 1.72 2006/01/05 10:47:33 yamt Exp $	*/
      3 
      4 /*-
      5  * Copyright (c) 2019, 2020 The NetBSD Foundation, Inc.
      6  * All rights reserved.
      7  *
      8  * This code is derived from software contributed to The NetBSD Foundation
      9  * by Andrew Doran.
     10  *
     11  * Redistribution and use in source and binary forms, with or without
     12  * modification, are permitted provided that the following conditions
     13  * are met:
     14  * 1. Redistributions of source code must retain the above copyright
     15  *    notice, this list of conditions and the following disclaimer.
     16  * 2. Redistributions in binary form must reproduce the above copyright
     17  *    notice, this list of conditions and the following disclaimer in the
     18  *    documentation and/or other materials provided with the distribution.
     19  *
     20  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     22  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     24  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     30  * POSSIBILITY OF SUCH DAMAGE.
     31  */
     32 
     33 /*
     34  * Copyright (c) 1997 Charles D. Cranor and Washington University.
     35  * Copyright (c) 1991, 1993, The Regents of the University of California.
     36  *
     37  * All rights reserved.
     38  *
     39  * This code is derived from software contributed to Berkeley by
     40  * The Mach Operating System project at Carnegie-Mellon University.
     41  *
     42  * Redistribution and use in source and binary forms, with or without
     43  * modification, are permitted provided that the following conditions
     44  * are met:
     45  * 1. Redistributions of source code must retain the above copyright
     46  *    notice, this list of conditions and the following disclaimer.
     47  * 2. Redistributions in binary form must reproduce the above copyright
     48  *    notice, this list of conditions and the following disclaimer in the
     49  *    documentation and/or other materials provided with the distribution.
     50  * 3. Neither the name of the University nor the names of its contributors
     51  *    may be used to endorse or promote products derived from this software
     52  *    without specific prior written permission.
     53  *
     54  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     55  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     56  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     57  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     58  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     59  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     60  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     61  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     62  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     63  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     64  * SUCH DAMAGE.
     65  *
     66  *	@(#)vm_pageout.c        8.5 (Berkeley) 2/14/94
     67  * from: Id: uvm_pdaemon.c,v 1.1.2.32 1998/02/06 05:26:30 chs Exp
     68  *
     69  *
     70  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
     71  * All rights reserved.
     72  *
     73  * Permission to use, copy, modify and distribute this software and
     74  * its documentation is hereby granted, provided that both the copyright
     75  * notice and this permission notice appear in all copies of the
     76  * software, derivative works or modified versions, and any portions
     77  * thereof, and that both notices appear in supporting documentation.
     78  *
     79  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     80  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     81  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     82  *
     83  * Carnegie Mellon requests users of this software to return to
     84  *
     85  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
     86  *  School of Computer Science
     87  *  Carnegie Mellon University
     88  *  Pittsburgh PA 15213-3890
     89  *
     90  * any improvements or extensions that they make and grant Carnegie the
     91  * rights to redistribute these changes.
     92  */
     93 
     94 #if defined(PDSIM)
     95 
     96 #include "pdsim.h"
     97 
     98 #else /* defined(PDSIM) */
     99 
    100 #include <sys/cdefs.h>
    101 __KERNEL_RCSID(0, "$NetBSD: uvm_pdpolicy_clock.c,v 1.35 2020/03/14 13:53:26 ad Exp $");
    102 
    103 #include <sys/param.h>
    104 #include <sys/proc.h>
    105 #include <sys/systm.h>
    106 #include <sys/kernel.h>
    107 #include <sys/kmem.h>
    108 #include <sys/atomic.h>
    109 
    110 #include <uvm/uvm.h>
    111 #include <uvm/uvm_pdpolicy.h>
    112 #include <uvm/uvm_pdpolicy_impl.h>
    113 #include <uvm/uvm_stat.h>
    114 
    115 #endif /* defined(PDSIM) */
    116 
    117 /*
    118  * per-CPU queue of pending page status changes.  128 entries makes for a
    119  * 1kB queue on _LP64 and has been found to be a reasonable compromise that
    120  * keeps lock contention events and wait times low, while not using too much
    121  * memory nor allowing global state to fall too far behind.
    122  */
    123 #if !defined(CLOCK_PDQ_SIZE)
    124 #define	CLOCK_PDQ_SIZE	128
    125 #endif /* !defined(CLOCK_PDQ_SIZE) */
    126 
    127 #define PQ_INACTIVE	0x00000010	/* page is in inactive list */
    128 #define PQ_ACTIVE	0x00000020	/* page is in active list */
    129 
    130 #if !defined(CLOCK_INACTIVEPCT)
    131 #define	CLOCK_INACTIVEPCT	33
    132 #endif /* !defined(CLOCK_INACTIVEPCT) */
    133 
    134 struct uvmpdpol_globalstate {
    135 	kmutex_t lock;			/* lock on state */
    136 					/* <= compiler pads here */
    137 	struct pglist s_activeq		/* allocated pages, in use */
    138 	    __aligned(COHERENCY_UNIT);
    139 	struct pglist s_inactiveq;	/* pages between the clock hands */
    140 	int s_active;
    141 	int s_inactive;
    142 	int s_inactarg;
    143 	struct uvm_pctparam s_anonmin;
    144 	struct uvm_pctparam s_filemin;
    145 	struct uvm_pctparam s_execmin;
    146 	struct uvm_pctparam s_anonmax;
    147 	struct uvm_pctparam s_filemax;
    148 	struct uvm_pctparam s_execmax;
    149 	struct uvm_pctparam s_inactivepct;
    150 };
    151 
    152 struct uvmpdpol_scanstate {
    153 	bool ss_anonreact, ss_filereact, ss_execreact;
    154 	struct vm_page ss_marker;
    155 };
    156 
    157 static void	uvmpdpol_pageactivate_locked(struct vm_page *);
    158 static void	uvmpdpol_pagedeactivate_locked(struct vm_page *);
    159 static void	uvmpdpol_pagedequeue_locked(struct vm_page *);
    160 static bool	uvmpdpol_pagerealize_locked(struct vm_page *);
    161 static struct uvm_cpu *uvmpdpol_flush(void);
    162 
    163 static struct uvmpdpol_globalstate pdpol_state __cacheline_aligned;
    164 static struct uvmpdpol_scanstate pdpol_scanstate;
    165 
    166 PDPOL_EVCNT_DEFINE(reactexec)
    167 PDPOL_EVCNT_DEFINE(reactfile)
    168 PDPOL_EVCNT_DEFINE(reactanon)
    169 
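        /*
         * clock_tune: recompute the inactive queue target as a percentage
         * (s_inactivepct) of the total number of queued pages, keeping it
         * above the free page target.
         */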
    170 static void
    171 clock_tune(void)
    172 {
    173 	struct uvmpdpol_globalstate *s = &pdpol_state;
    174 
    175 	s->s_inactarg = UVM_PCTPARAM_APPLY(&s->s_inactivepct,
    176 	    s->s_active + s->s_inactive);
    177 	if (s->s_inactarg <= uvmexp.freetarg) {
    178 		s->s_inactarg = uvmexp.freetarg + 1;
    179 	}
    180 }
    181 
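        /*
         * uvmpdpol_scaninit: prepare for a scan of the inactive queue.
         * decide which page types (anon/file/exec) should be reactivated
         * rather than freed, and insert a marker at the head of the
         * inactive queue to track progress.
         */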
    182 void
    183 uvmpdpol_scaninit(void)
    184 {
    185 	struct uvmpdpol_globalstate *s = &pdpol_state;
    186 	struct uvmpdpol_scanstate *ss = &pdpol_scanstate;
    187 	int t;
    188 	bool anonunder, fileunder, execunder;
    189 	bool anonover, fileover, execover;
    190 	bool anonreact, filereact, execreact;
    191 	int64_t freepg, anonpg, filepg, execpg;
    192 
    193 	/*
    194 	 * decide which types of pages we want to reactivate instead of freeing
    195 	 * to keep usage within the minimum and maximum usage limits.
    196 	 */
    197 
    198 	cpu_count_sync_all();
    199 	freepg = uvm_availmem();
    200 	anonpg = cpu_count_get(CPU_COUNT_ANONPAGES);
    201 	filepg = cpu_count_get(CPU_COUNT_FILEPAGES);
    202 	execpg = cpu_count_get(CPU_COUNT_EXECPAGES);
    203 
    204 	mutex_enter(&s->lock);
    205 	t = s->s_active + s->s_inactive + freepg;
    206 	anonunder = anonpg <= UVM_PCTPARAM_APPLY(&s->s_anonmin, t);
    207 	fileunder = filepg <= UVM_PCTPARAM_APPLY(&s->s_filemin, t);
    208 	execunder = execpg <= UVM_PCTPARAM_APPLY(&s->s_execmin, t);
    209 	anonover = anonpg > UVM_PCTPARAM_APPLY(&s->s_anonmax, t);
    210 	fileover = filepg > UVM_PCTPARAM_APPLY(&s->s_filemax, t);
    211 	execover = execpg > UVM_PCTPARAM_APPLY(&s->s_execmax, t);
    212 	anonreact = anonunder || (!anonover && (fileover || execover));
    213 	filereact = fileunder || (!fileover && (anonover || execover));
    214 	execreact = execunder || (!execover && (anonover || fileover));
    215 	if (filereact && execreact && (anonreact || uvm_swapisfull())) {
    216 		anonreact = filereact = execreact = false;
    217 	}
    218 	ss->ss_anonreact = anonreact;
    219 	ss->ss_filereact = filereact;
    220 	ss->ss_execreact = execreact;
    221 	memset(&ss->ss_marker, 0, sizeof(ss->ss_marker));
    222 	ss->ss_marker.flags = PG_MARKER;
    223 	TAILQ_INSERT_HEAD(&pdpol_state.s_inactiveq, &ss->ss_marker, pdqueue);
    224 	mutex_exit(&s->lock);
    225 }
    226 
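        /*
         * uvmpdpol_scanfini: remove the scan marker inserted by
         * uvmpdpol_scaninit().
         */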
    227 void
    228 uvmpdpol_scanfini(void)
    229 {
    230 	struct uvmpdpol_globalstate *s = &pdpol_state;
    231 	struct uvmpdpol_scanstate *ss = &pdpol_scanstate;
    232 
    233 	mutex_enter(&s->lock);
    234 	TAILQ_REMOVE(&pdpol_state.s_inactiveq, &ss->ss_marker, pdqueue);
    235 	mutex_exit(&s->lock);
    236 }
    237 
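        /*
         * uvmpdpol_selectvictim: scan the inactive queue from the marker
         * and return the next candidate page for reclaim, with its owner's
         * lock held and passed back via plock.  returns NULL when the end
         * of the queue is reached.
         */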
    238 struct vm_page *
    239 uvmpdpol_selectvictim(krwlock_t **plock)
    240 {
    241 	struct uvmpdpol_globalstate *s = &pdpol_state;
    242 	struct uvmpdpol_scanstate *ss = &pdpol_scanstate;
    243 	struct vm_page *pg;
    244 	krwlock_t *lock;
    245 
    246 	mutex_enter(&s->lock);
    247 	while (/* CONSTCOND */ 1) {
    248 		struct vm_anon *anon;
    249 		struct uvm_object *uobj;
    250 
    251 		pg = TAILQ_NEXT(&ss->ss_marker, pdqueue);
    252 		if (pg == NULL) {
    253 			break;
    254 		}
    255 		KASSERT((pg->flags & PG_MARKER) == 0);
    256 		uvmexp.pdscans++;
    257 
    258 		/*
    259 		 * acquire interlock to stabilize page identity.
    260 		 * if we have caught the page in a state of flux,
    261 		 * deal with it and retry.
    262 		 */
    263 		mutex_enter(&pg->interlock);
    264 		if (uvmpdpol_pagerealize_locked(pg)) {
    265 			mutex_exit(&pg->interlock);
    266 			continue;
    267 		}
    268 
    269 		/*
    270 		 * now prepare to move on to the next page.
    271 		 */
    272 		TAILQ_REMOVE(&pdpol_state.s_inactiveq, &ss->ss_marker,
    273 		    pdqueue);
    274 		TAILQ_INSERT_AFTER(&pdpol_state.s_inactiveq, pg,
    275 		    &ss->ss_marker, pdqueue);
    276 
    277 		/*
    278 		 * enforce the minimum thresholds on different
    279 		 * types of memory usage.  if reusing the current
    280 		 * page would reduce that type of usage below its
    281 		 * minimum, reactivate the page instead and move
    282 		 * on to the next page.
    283 		 */
    284 		anon = pg->uanon;
    285 		uobj = pg->uobject;
    286 		if (uobj && UVM_OBJ_IS_VTEXT(uobj) && ss->ss_execreact) {
    287 			uvmpdpol_pageactivate_locked(pg);
    288 			mutex_exit(&pg->interlock);
    289 			PDPOL_EVCNT_INCR(reactexec);
    290 			continue;
    291 		}
    292 		if (uobj && UVM_OBJ_IS_VNODE(uobj) &&
    293 		    !UVM_OBJ_IS_VTEXT(uobj) && ss->ss_filereact) {
    294 			uvmpdpol_pageactivate_locked(pg);
    295 			mutex_exit(&pg->interlock);
    296 			PDPOL_EVCNT_INCR(reactfile);
    297 			continue;
    298 		}
    299 		if ((anon || UVM_OBJ_IS_AOBJ(uobj)) && ss->ss_anonreact) {
    300 			uvmpdpol_pageactivate_locked(pg);
    301 			mutex_exit(&pg->interlock);
    302 			PDPOL_EVCNT_INCR(reactanon);
    303 			continue;
    304 		}
    305 
    306 		/*
    307 		 * try to lock the object that owns the page.
    308 		 *
    309 		 * with the page interlock held, we can drop s->lock, which
    310 		 * could otherwise serve as a barrier to us getting the
    311 		 * object locked, because the owner of the object's lock may
    312 		 * be blocked on s->lock (i.e. a deadlock).
    313 		 *
    314 		 * whatever happens, uvmpd_trylockowner() will release the
    315 		 * interlock.  with the interlock dropped we can then
    316 		 * re-acquire our own lock.  the order is:
    317 		 *
    318 		 *	object -> pdpol -> interlock.
    319 		 */
    320 		mutex_exit(&s->lock);
    321 		lock = uvmpd_trylockowner(pg);
    322 		/* pg->interlock now released */
    323 		mutex_enter(&s->lock);
    324 		if (lock == NULL) {
    325 			/* didn't get it - try the next page. */
    326 			continue;
    327 		}
    328 
    329 		/*
    330 		 * move referenced pages back to active queue and skip to
    331 		 * next page.
    332 		 */
    333 		if (pmap_is_referenced(pg)) {
    334 			mutex_enter(&pg->interlock);
    335 			uvmpdpol_pageactivate_locked(pg);
    336 			mutex_exit(&pg->interlock);
    337 			uvmexp.pdreact++;
    338 			rw_exit(lock);
    339 			continue;
    340 		}
    341 
    342 		/* we have a potential victim. */
    343 		*plock = lock;
    344 		break;
    345 	}
    346 	mutex_exit(&s->lock);
    347 	return pg;
    348 }
    349 
    350 void
    351 uvmpdpol_balancequeue(int swap_shortage)
    352 {
    353 	struct uvmpdpol_globalstate *s = &pdpol_state;
    354 	int inactive_shortage;
    355 	struct vm_page *p, marker;
    356 	krwlock_t *lock;
    357 
    358 	/*
    359 	 * we have done the scan to get free pages.   now we work on meeting
    360 	 * our inactive target.
    361 	 */
    362 
    363 	memset(&marker, 0, sizeof(marker));
    364 	marker.flags = PG_MARKER;
    365 
    366 	mutex_enter(&s->lock);
    367 	TAILQ_INSERT_HEAD(&pdpol_state.s_activeq, &marker, pdqueue);
    368 	for (;;) {
    369 		inactive_shortage =
    370 		    pdpol_state.s_inactarg - pdpol_state.s_inactive;
    371 		if (inactive_shortage <= 0 && swap_shortage <= 0) {
    372 			break;
    373 		}
    374 		p = TAILQ_NEXT(&marker, pdqueue);
    375 		if (p == NULL) {
    376 			break;
    377 		}
    378 		KASSERT((p->flags & PG_MARKER) == 0);
    379 
    380 		/*
    381 		 * acquire interlock to stabilize page identity.
    382 		 * if we have caught the page in a state of flux,
    383 		 * deal with it and retry.
    384 		 */
    385 		mutex_enter(&p->interlock);
    386 		if (uvmpdpol_pagerealize_locked(p)) {
    387 			mutex_exit(&p->interlock);
    388 			continue;
    389 		}
    390 
    391 		/*
    392 		 * now prepare to move on to the next page.
    393 		 */
    394 		TAILQ_REMOVE(&pdpol_state.s_activeq, &marker, pdqueue);
    395 		TAILQ_INSERT_AFTER(&pdpol_state.s_activeq, p, &marker,
    396 		    pdqueue);
    397 
    398 		/*
    399 		 * try to lock the object that owns the page.  see comments
    400 		 * in uvmpdpol_selectvictim().
    401 		 */
    402 		mutex_exit(&s->lock);
    403 		lock = uvmpd_trylockowner(p);
    404 		/* p->interlock now released */
    405 		mutex_enter(&s->lock);
    406 		if (lock == NULL) {
    407 			/* didn't get it - try the next page. */
    408 			continue;
    409 		}
    410 
    411 		/*
    412 		 * if there's a shortage of swap slots, try to free it.
    413 		 */
    414 		if (swap_shortage > 0 && (p->flags & PG_SWAPBACKED) != 0 &&
    415 		    (p->flags & PG_BUSY) == 0) {
    416 			if (uvmpd_dropswap(p)) {
    417 				swap_shortage--;
    418 			}
    419 		}
    420 
    421 		/*
    422 		 * if there's a shortage of inactive pages, deactivate.
    423 		 */
    424 		if (inactive_shortage > 0) {
    425 			pmap_clear_reference(p);
    426 			mutex_enter(&p->interlock);
    427 			uvmpdpol_pagedeactivate_locked(p);
    428 			mutex_exit(&p->interlock);
    429 			uvmexp.pddeact++;
    430 			inactive_shortage--;
    431 		}
    432 		rw_exit(lock);
    433 	}
    434 	TAILQ_REMOVE(&pdpol_state.s_activeq, &marker, pdqueue);
    435 	mutex_exit(&s->lock);
    436 }
    437 
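        /*
         * uvmpdpol_pagedeactivate_locked: remove a page from the active
         * queue if present and put it at the tail of the inactive queue.
         * called with the global state lock and the page interlock held.
         */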
    438 static void
    439 uvmpdpol_pagedeactivate_locked(struct vm_page *pg)
    440 {
    441 	struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
    442 
    443 	KASSERT(mutex_owned(&s->lock));
    444 	KASSERT(mutex_owned(&pg->interlock));
    445 	KASSERT((pg->pqflags & (PQ_INTENT_MASK | PQ_INTENT_SET)) !=
    446 	    (PQ_INTENT_D | PQ_INTENT_SET));
    447 
    448 	if (pg->pqflags & PQ_ACTIVE) {
    449 		TAILQ_REMOVE(&pdpol_state.s_activeq, pg, pdqueue);
    450 		KASSERT(pdpol_state.s_active > 0);
    451 		pdpol_state.s_active--;
    452 	}
    453 	if ((pg->pqflags & PQ_INACTIVE) == 0) {
    454 		KASSERT(pg->wire_count == 0);
    455 		TAILQ_INSERT_TAIL(&pdpol_state.s_inactiveq, pg, pdqueue);
    456 		pdpol_state.s_inactive++;
    457 	}
    458 	pg->pqflags &= ~(PQ_ACTIVE | PQ_INTENT_SET);
    459 	pg->pqflags |= PQ_INACTIVE;
    460 }
    461 
    462 void
    463 uvmpdpol_pagedeactivate(struct vm_page *pg)
    464 {
    465 
    466 	KASSERT(uvm_page_owner_locked_p(pg, false));
    467 	KASSERT(mutex_owned(&pg->interlock));
    468 
    469 	/*
    470 	 * we have to clear the reference bit now, as when it comes time to
    471 	 * realize the intent we won't have the object locked any more.
    472 	 */
    473 	pmap_clear_reference(pg);
    474 	uvmpdpol_set_intent(pg, PQ_INTENT_I);
    475 }
    476 
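        /*
         * uvmpdpol_pageactivate_locked: dequeue a page and place it at the
         * tail of the active queue.  called with the global state lock and
         * the page interlock held.
         */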
    477 static void
    478 uvmpdpol_pageactivate_locked(struct vm_page *pg)
    479 {
    480 	struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
    481 
    482 	KASSERT(mutex_owned(&s->lock));
    483 	KASSERT(mutex_owned(&pg->interlock));
    484 	KASSERT((pg->pqflags & (PQ_INTENT_MASK | PQ_INTENT_SET)) !=
    485 	    (PQ_INTENT_D | PQ_INTENT_SET));
    486 
    487 	uvmpdpol_pagedequeue_locked(pg);
    488 	TAILQ_INSERT_TAIL(&pdpol_state.s_activeq, pg, pdqueue);
    489 	pdpol_state.s_active++;
    490 	pg->pqflags &= ~(PQ_INACTIVE | PQ_INTENT_SET);
    491 	pg->pqflags |= PQ_ACTIVE;
    492 }
    493 
    494 void
    495 uvmpdpol_pageactivate(struct vm_page *pg)
    496 {
    497 
    498 	KASSERT(uvm_page_owner_locked_p(pg, false));
    499 	KASSERT(mutex_owned(&pg->interlock));
    500 
    501 	uvmpdpol_set_intent(pg, PQ_INTENT_A);
    502 }
    503 
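        /*
         * uvmpdpol_pagedequeue_locked: remove a page from whichever queue
         * it is on and clear its queue flags.  called with the global
         * state lock and the page interlock held.
         */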
    504 static void
    505 uvmpdpol_pagedequeue_locked(struct vm_page *pg)
    506 {
    507 	struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
    508 
    509 	KASSERT(mutex_owned(&s->lock));
    510 	KASSERT(mutex_owned(&pg->interlock));
    511 
    512 	if (pg->pqflags & PQ_ACTIVE) {
    513 		TAILQ_REMOVE(&pdpol_state.s_activeq, pg, pdqueue);
    514 		KASSERT((pg->pqflags & PQ_INACTIVE) == 0);
    515 		KASSERT(pdpol_state.s_active > 0);
    516 		pdpol_state.s_active--;
    517 	} else if (pg->pqflags & PQ_INACTIVE) {
    518 		TAILQ_REMOVE(&pdpol_state.s_inactiveq, pg, pdqueue);
    519 		KASSERT(pdpol_state.s_inactive > 0);
    520 		pdpol_state.s_inactive--;
    521 	}
    522 	pg->pqflags &= ~(PQ_ACTIVE | PQ_INACTIVE | PQ_INTENT_SET);
    523 }
    524 
    525 void
    526 uvmpdpol_pagedequeue(struct vm_page *pg)
    527 {
    528 
    529 	KASSERT(uvm_page_owner_locked_p(pg, true));
    530 	KASSERT(mutex_owned(&pg->interlock));
    531 
    532 	uvmpdpol_set_intent(pg, PQ_INTENT_D);
    533 }
    534 
    535 void
    536 uvmpdpol_pageenqueue(struct vm_page *pg)
    537 {
    538 
    539 	KASSERT(uvm_page_owner_locked_p(pg, false));
    540 	KASSERT(mutex_owned(&pg->interlock));
    541 
    542 	uvmpdpol_set_intent(pg, PQ_INTENT_E);
    543 }
    544 
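        /*
         * uvmpdpol_anfree: anon freeing hook; nothing to do for the clock
         * policy.
         */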
    545 void
    546 uvmpdpol_anfree(struct vm_anon *an)
    547 {
    548 }
    549 
    550 bool
    551 uvmpdpol_pageisqueued_p(struct vm_page *pg)
    552 {
    553 	uint32_t pqflags;
    554 
    555 	/*
    556 	 * if there's an intent set, we have to consider it.  otherwise,
    557 	 * return the actual state.  we may be called unlocked for the
    558 	 * purpose of assertions, which is safe due to the page lifecycle.
    559 	 */
    560 	pqflags = atomic_load_relaxed(&pg->pqflags);
    561 	if ((pqflags & PQ_INTENT_SET) != 0) {
    562 		return (pqflags & PQ_INTENT_MASK) != PQ_INTENT_D;
    563 	} else {
    564 		return (pqflags & (PQ_ACTIVE | PQ_INACTIVE)) != 0;
    565 	}
    566 }
    567 
    568 void
    569 uvmpdpol_estimatepageable(int *active, int *inactive)
    570 {
    571 	struct uvmpdpol_globalstate *s = &pdpol_state;
    572 
    573 	/*
    574 	 * Don't take any locks here.  This can be called from DDB, and in
    575 	 * any case the numbers are stale the instant the lock is dropped,
    576 	 * so it just doesn't matter.
    577 	 */
    578 	if (active) {
    579 		*active = s->s_active;
    580 	}
    581 	if (inactive) {
    582 		*inactive = s->s_inactive;
    583 	}
    584 }
    585 
    586 #if !defined(PDSIM)
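        /*
         * min_check: reject a setting that would push the combined anonmin,
         * filemin and execmin reserves above 95%.
         */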
    587 static int
    588 min_check(struct uvm_pctparam *pct, int t)
    589 {
    590 	struct uvmpdpol_globalstate *s = &pdpol_state;
    591 	int total = t;
    592 
    593 	if (pct != &s->s_anonmin) {
    594 		total += uvm_pctparam_get(&s->s_anonmin);
    595 	}
    596 	if (pct != &s->s_filemin) {
    597 		total += uvm_pctparam_get(&s->s_filemin);
    598 	}
    599 	if (pct != &s->s_execmin) {
    600 		total += uvm_pctparam_get(&s->s_execmin);
    601 	}
    602 	if (total > 95) {
    603 		return EINVAL;
    604 	}
    605 	return 0;
    606 }
    607 #endif /* !defined(PDSIM) */
    608 
    609 void
    610 uvmpdpol_init(void)
    611 {
    612 	struct uvmpdpol_globalstate *s = &pdpol_state;
    613 
    614 	mutex_init(&s->lock, MUTEX_DEFAULT, IPL_NONE);
    615 	TAILQ_INIT(&s->s_activeq);
    616 	TAILQ_INIT(&s->s_inactiveq);
    617 	uvm_pctparam_init(&s->s_inactivepct, CLOCK_INACTIVEPCT, NULL);
    618 	uvm_pctparam_init(&s->s_anonmin, 10, min_check);
    619 	uvm_pctparam_init(&s->s_filemin, 10, min_check);
    620 	uvm_pctparam_init(&s->s_execmin,  5, min_check);
    621 	uvm_pctparam_init(&s->s_anonmax, 80, NULL);
    622 	uvm_pctparam_init(&s->s_filemax, 50, NULL);
    623 	uvm_pctparam_init(&s->s_execmax, 30, NULL);
    624 }
    625 
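        /*
         * uvmpdpol_init_cpu: allocate this CPU's queue of pending page
         * status changes.  entries are pushed by decrementing pdqhead;
         * the queue is empty when pdqhead == pdqtail.
         */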
    626 void
    627 uvmpdpol_init_cpu(struct uvm_cpu *ucpu)
    628 {
    629 
    630 	ucpu->pdq =
    631 	    kmem_alloc(CLOCK_PDQ_SIZE * sizeof(struct vm_page *), KM_SLEEP);
    632 	ucpu->pdqhead = CLOCK_PDQ_SIZE;
    633 	ucpu->pdqtail = CLOCK_PDQ_SIZE;
    634 }
    635 
    636 void
    637 uvmpdpol_reinit(void)
    638 {
    639 }
    640 
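        /*
         * uvmpdpol_needsscan_p: return true if the inactive queue has
         * fallen below its target.
         */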
    641 bool
    642 uvmpdpol_needsscan_p(void)
    643 {
    644 
    645 	/*
    646 	 * this must be an unlocked check: can be called from interrupt.
    647 	 */
    648 	return pdpol_state.s_inactive < pdpol_state.s_inactarg;
    649 }
    650 
    651 void
    652 uvmpdpol_tune(void)
    653 {
    654 	struct uvmpdpol_globalstate *s = &pdpol_state;
    655 
    656 	mutex_enter(&s->lock);
    657 	clock_tune();
    658 	mutex_exit(&s->lock);
    659 }
    660 
    661 /*
    662  * uvmpdpol_pagerealize_locked: take the intended state set on a page and
    663  * make it real.  return true if any work was done.
    664  */
    665 static bool
    666 uvmpdpol_pagerealize_locked(struct vm_page *pg)
    667 {
    668 	struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
    669 
    670 	KASSERT(mutex_owned(&s->lock));
    671 	KASSERT(mutex_owned(&pg->interlock));
    672 
    673 	switch (pg->pqflags & (PQ_INTENT_MASK | PQ_INTENT_SET)) {
    674 	case PQ_INTENT_A | PQ_INTENT_SET:
    675 	case PQ_INTENT_E | PQ_INTENT_SET:
    676 		uvmpdpol_pageactivate_locked(pg);
    677 		return true;
    678 	case PQ_INTENT_I | PQ_INTENT_SET:
    679 		uvmpdpol_pagedeactivate_locked(pg);
    680 		return true;
    681 	case PQ_INTENT_D | PQ_INTENT_SET:
    682 		uvmpdpol_pagedequeue_locked(pg);
    683 		return true;
    684 	default:
    685 		return false;
    686 	}
    687 }
    688 
    689 /*
    690  * uvmpdpol_flush: return the current uvm_cpu with all of its pending
    691  * updates flushed to the global queues.  this routine may block, and
    692  * so can switch cpu.  the idea is to empty the queue on whatever cpu
    693  * we finally end up on.
    694  */
    695 static struct uvm_cpu *
    696 uvmpdpol_flush(void)
    697 {
    698 	struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
    699 	struct uvm_cpu *ucpu;
    700 	struct vm_page *pg;
    701 
    702 	KASSERT(kpreempt_disabled());
    703 
    704 	mutex_enter(&s->lock);
    705 	for (;;) {
    706 		/*
    707 		 * prefer scanning forwards (even though mutex_enter() is
    708 		 * serializing) so as not to defeat any prefetch logic in
    709 		 * the CPU.  that means enqueuing backwards elsewhere, like
    710 		 * a stack, but that matters less there because pages are
    711 		 * added one at a time.
    712 		 *
    713 		 * prefetch the next "struct vm_page" while working on the
    714 		 * current one.  this has a measurable and very positive
    715 		 * effect in reducing the amount of time spent here under
    716 		 * the global lock.
    717 		 */
    718 		ucpu = curcpu()->ci_data.cpu_uvm;
    719 		KASSERT(ucpu->pdqhead <= ucpu->pdqtail);
    720 		if (__predict_false(ucpu->pdqhead == ucpu->pdqtail)) {
    721 			break;
    722 		}
    723 		pg = ucpu->pdq[ucpu->pdqhead++];
    724 		if (__predict_true(ucpu->pdqhead != ucpu->pdqtail)) {
    725 			__builtin_prefetch(ucpu->pdq[ucpu->pdqhead]);
    726 		}
    727 		mutex_enter(&pg->interlock);
    728 		pg->pqflags &= ~PQ_INTENT_QUEUED;
    729 		(void)uvmpdpol_pagerealize_locked(pg);
    730 		mutex_exit(&pg->interlock);
    731 	}
    732 	mutex_exit(&s->lock);
    733 	return ucpu;
    734 }
    735 
    736 /*
    737  * uvmpdpol_pagerealize: realize any intent set on the page.  in this
    738  * implementation, that means putting the page on a per-CPU queue to be
    739  * dealt with later.
    740  */
    741 void
    742 uvmpdpol_pagerealize(struct vm_page *pg)
    743 {
    744 	struct uvm_cpu *ucpu;
    745 
    746 	/*
    747 	 * drain the per-CPU queue if full, then enter the page.
    748 	 */
    749 	kpreempt_disable();
    750 	ucpu = curcpu()->ci_data.cpu_uvm;
    751 	if (__predict_false(ucpu->pdqhead == 0)) {
    752 		ucpu = uvmpdpol_flush();
    753 	}
    754 	ucpu->pdq[--(ucpu->pdqhead)] = pg;
    755 	kpreempt_enable();
    756 }
    757 
    758 /*
    759  * uvmpdpol_idle: called from the system idle loop.  periodically purge any
    760  * pending updates back to the global queues.
    761  */
    762 void
    763 uvmpdpol_idle(struct uvm_cpu *ucpu)
    764 {
    765 	struct uvmpdpol_globalstate *s = &pdpol_state;
    766 	struct vm_page *pg;
    767 
    768 	KASSERT(kpreempt_disabled());
    769 
    770 	/*
    771 	 * if no pages in the queue, we have nothing to do.
    772 	 */
    773 	if (ucpu->pdqhead == ucpu->pdqtail) {
    774 		ucpu->pdqtime = hardclock_ticks;
    775 		return;
    776 	}
    777 
    778 	/*
    779 	 * don't do this more than ~8 times a second as it would needlessly
    780 	 * exert pressure.
    781 	 */
    782 	if (hardclock_ticks - ucpu->pdqtime < (hz >> 3)) {
    783 		return;
    784 	}
    785 
    786 	/*
    787 	 * the idle LWP can't block, so we have to try for the lock.  if we
    788 	 * get it, purge the per-CPU pending update queue.  continually
    789 	 * check for a pending resched: in that case exit immediately.
    790 	 */
    791 	if (mutex_tryenter(&s->lock)) {
    792 		while (ucpu->pdqhead != ucpu->pdqtail) {
    793 			pg = ucpu->pdq[ucpu->pdqhead];
    794 			if (!mutex_tryenter(&pg->interlock)) {
    795 				break;
    796 			}
    797 			ucpu->pdqhead++;
    798 			pg->pqflags &= ~PQ_INTENT_QUEUED;
    799 			(void)uvmpdpol_pagerealize_locked(pg);
    800 			mutex_exit(&pg->interlock);
    801 			if (curcpu()->ci_want_resched) {
    802 				break;
    803 			}
    804 		}
    805 		if (ucpu->pdqhead == ucpu->pdqtail) {
    806 			ucpu->pdqtime = hardclock_ticks;
    807 		}
    808 		mutex_exit(&s->lock);
    809 	}
    810 }
    811 
    812 #if !defined(PDSIM)
    813 
    814 #include <sys/sysctl.h>	/* XXX SYSCTL_DESCR */
    815 
    816 void
    817 uvmpdpol_sysctlsetup(void)
    818 {
    819 	struct uvmpdpol_globalstate *s = &pdpol_state;
    820 
    821 	uvm_pctparam_createsysctlnode(&s->s_anonmin, "anonmin",
    822 	    SYSCTL_DESCR("Percentage of physical memory reserved "
    823 	    "for anonymous application data"));
    824 	uvm_pctparam_createsysctlnode(&s->s_filemin, "filemin",
    825 	    SYSCTL_DESCR("Percentage of physical memory reserved "
    826 	    "for cached file data"));
    827 	uvm_pctparam_createsysctlnode(&s->s_execmin, "execmin",
    828 	    SYSCTL_DESCR("Percentage of physical memory reserved "
    829 	    "for cached executable data"));
    830 
    831 	uvm_pctparam_createsysctlnode(&s->s_anonmax, "anonmax",
    832 	    SYSCTL_DESCR("Percentage of physical memory which will "
    833 	    "be reclaimed from other usage for "
    834 	    "anonymous application data"));
    835 	uvm_pctparam_createsysctlnode(&s->s_filemax, "filemax",
    836 	    SYSCTL_DESCR("Percentage of physical memory which will "
    837 	    "be reclaimed from other usage for cached "
    838 	    "file data"));
    839 	uvm_pctparam_createsysctlnode(&s->s_execmax, "execmax",
    840 	    SYSCTL_DESCR("Percentage of physical memory which will "
    841 	    "be reclaimed from other usage for cached "
    842 	    "executable data"));
    843 
    844 	uvm_pctparam_createsysctlnode(&s->s_inactivepct, "inactivepct",
    845 	    SYSCTL_DESCR("Percentage of inactive queue of "
    846 	    "the entire (active + inactive) queue"));
    847 }
    848 
    849 #endif /* !defined(PDSIM) */
    850 
    851 #if defined(PDSIM)
    852 void
    853 pdsim_dump(const char *id)
    854 {
    855 #if defined(DEBUG)
    856 	/* XXX */
    857 #endif /* defined(DEBUG) */
    858 }
    859 #endif /* defined(PDSIM) */
    860