      1 /*	$NetBSD: uvm_pdpolicy_clock.c,v 1.33 2020/02/23 15:46:43 ad Exp $	*/
      2 /*	NetBSD: uvm_pdaemon.c,v 1.72 2006/01/05 10:47:33 yamt Exp $	*/
      3 
      4 /*-
      5  * Copyright (c) 2019 The NetBSD Foundation, Inc.
      6  * All rights reserved.
      7  *
      8  * This code is derived from software contributed to The NetBSD Foundation
      9  * by Andrew Doran.
     10  *
     11  * Redistribution and use in source and binary forms, with or without
     12  * modification, are permitted provided that the following conditions
     13  * are met:
     14  * 1. Redistributions of source code must retain the above copyright
     15  *    notice, this list of conditions and the following disclaimer.
     16  * 2. Redistributions in binary form must reproduce the above copyright
     17  *    notice, this list of conditions and the following disclaimer in the
     18  *    documentation and/or other materials provided with the distribution.
     19  *
     20  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     22  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     24  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     30  * POSSIBILITY OF SUCH DAMAGE.
     31  */
     32 
     33 /*
     34  * Copyright (c) 1997 Charles D. Cranor and Washington University.
     35  * Copyright (c) 1991, 1993, The Regents of the University of California.
     36  *
     37  * All rights reserved.
     38  *
     39  * This code is derived from software contributed to Berkeley by
     40  * The Mach Operating System project at Carnegie-Mellon University.
     41  *
     42  * Redistribution and use in source and binary forms, with or without
     43  * modification, are permitted provided that the following conditions
     44  * are met:
     45  * 1. Redistributions of source code must retain the above copyright
     46  *    notice, this list of conditions and the following disclaimer.
     47  * 2. Redistributions in binary form must reproduce the above copyright
     48  *    notice, this list of conditions and the following disclaimer in the
     49  *    documentation and/or other materials provided with the distribution.
     50  * 3. Neither the name of the University nor the names of its contributors
     51  *    may be used to endorse or promote products derived from this software
     52  *    without specific prior written permission.
     53  *
     54  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     55  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     56  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     57  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     58  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     59  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     60  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     61  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     62  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     63  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     64  * SUCH DAMAGE.
     65  *
     66  *	@(#)vm_pageout.c        8.5 (Berkeley) 2/14/94
     67  * from: Id: uvm_pdaemon.c,v 1.1.2.32 1998/02/06 05:26:30 chs Exp
     68  *
     69  *
     70  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
     71  * All rights reserved.
     72  *
     73  * Permission to use, copy, modify and distribute this software and
     74  * its documentation is hereby granted, provided that both the copyright
     75  * notice and this permission notice appear in all copies of the
     76  * software, derivative works or modified versions, and any portions
     77  * thereof, and that both notices appear in supporting documentation.
     78  *
     79  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     80  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     81  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     82  *
     83  * Carnegie Mellon requests users of this software to return to
     84  *
     85  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
     86  *  School of Computer Science
     87  *  Carnegie Mellon University
     88  *  Pittsburgh PA 15213-3890
     89  *
     90  * any improvements or extensions that they make and grant Carnegie the
     91  * rights to redistribute these changes.
     92  */
     93 
     94 #if defined(PDSIM)
     95 
     96 #include "pdsim.h"
     97 
     98 #else /* defined(PDSIM) */
     99 
    100 #include <sys/cdefs.h>
    101 __KERNEL_RCSID(0, "$NetBSD: uvm_pdpolicy_clock.c,v 1.33 2020/02/23 15:46:43 ad Exp $");
    102 
    103 #include <sys/param.h>
    104 #include <sys/proc.h>
    105 #include <sys/systm.h>
    106 #include <sys/kernel.h>
    107 #include <sys/kmem.h>
    108 #include <sys/atomic.h>
    109 
    110 #include <uvm/uvm.h>
    111 #include <uvm/uvm_pdpolicy.h>
    112 #include <uvm/uvm_pdpolicy_impl.h>
    113 #include <uvm/uvm_stat.h>
    114 
    115 #endif /* defined(PDSIM) */
    116 
    117 /*
    118  * per-CPU queue of pending page status changes.  128 entries makes for a
    119  * 1kB queue on _LP64 and has been found to be a reasonable compromise that
    120  * keeps lock contention events and wait times low, while not using too much
    121  * memory nor allowing global state to fall too far behind.
    122  */
    123 #if !defined(CLOCK_PDQ_SIZE)
    124 #define	CLOCK_PDQ_SIZE	128
    125 #endif /* !defined(CLOCK_PDQ_SIZE) */
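/*
 * for reference, each queue slot holds a struct vm_page pointer (see
 * uvmpdpol_init_cpu() below), so on _LP64 the 1kB quoted above works
 * out as 128 * 8 bytes = 1024 bytes per CPU.
 */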
    126 
    127 #define PQ_INACTIVE	0x00000010	/* page is in inactive list */
    128 #define PQ_ACTIVE	0x00000020	/* page is in active list */
    129 
    130 #if !defined(CLOCK_INACTIVEPCT)
    131 #define	CLOCK_INACTIVEPCT	33
    132 #endif /* !defined(CLOCK_INACTIVEPCT) */
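/*
 * a rough picture of the policy: the two page queues declared below
 * stand in for the hands of the two-handed clock algorithm.  pages
 * cycle from the active queue to the inactive queue, and victims are
 * taken (or reactivated) from the inactive queue.  CLOCK_INACTIVEPCT
 * is the default percentage of the active + inactive total that the
 * pagedaemon tries to keep on the inactive queue.
 */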
    133 
    134 struct uvmpdpol_globalstate {
    135 	kmutex_t lock;			/* lock on state */
    136 					/* <= compiler pads here */
    137 	struct pglist s_activeq		/* allocated pages, in use */
    138 	    __aligned(COHERENCY_UNIT);
    139 	struct pglist s_inactiveq;	/* pages between the clock hands */
    140 	int s_active;
    141 	int s_inactive;
    142 	int s_inactarg;
    143 	struct uvm_pctparam s_anonmin;
    144 	struct uvm_pctparam s_filemin;
    145 	struct uvm_pctparam s_execmin;
    146 	struct uvm_pctparam s_anonmax;
    147 	struct uvm_pctparam s_filemax;
    148 	struct uvm_pctparam s_execmax;
    149 	struct uvm_pctparam s_inactivepct;
    150 };
    151 
    152 struct uvmpdpol_scanstate {
    153 	bool ss_anonreact, ss_filereact, ss_execreact;
    154 	struct vm_page ss_marker;
    155 };
    156 
    157 static void	uvmpdpol_pageactivate_locked(struct vm_page *);
    158 static void	uvmpdpol_pagedeactivate_locked(struct vm_page *);
    159 static void	uvmpdpol_pagedequeue_locked(struct vm_page *);
    160 static bool	uvmpdpol_pagerealize_locked(struct vm_page *);
    161 static struct uvm_cpu *uvmpdpol_flush(void);
    162 
    163 static struct uvmpdpol_globalstate pdpol_state __cacheline_aligned;
    164 static struct uvmpdpol_scanstate pdpol_scanstate;
    165 
    166 PDPOL_EVCNT_DEFINE(reactexec)
    167 PDPOL_EVCNT_DEFINE(reactfile)
    168 PDPOL_EVCNT_DEFINE(reactanon)
    169 
    170 static void
    171 clock_tune(void)
    172 {
    173 	struct uvmpdpol_globalstate *s = &pdpol_state;
    174 
    175 	s->s_inactarg = UVM_PCTPARAM_APPLY(&s->s_inactivepct,
    176 	    s->s_active + s->s_inactive);
    177 	if (s->s_inactarg <= uvmexp.freetarg) {
    178 		s->s_inactarg = uvmexp.freetarg + 1;
    179 	}
    180 }
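/*
 * a worked example with made-up numbers: with the default inactivepct
 * of 33 (CLOCK_INACTIVEPCT) and, say, 60000 active + 40000 inactive
 * pages, clock_tune() sets s_inactarg to roughly 33% of 100000, i.e.
 * 33000 pages; the target is always at least uvmexp.freetarg + 1.
 */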
    181 
    182 void
    183 uvmpdpol_scaninit(void)
    184 {
    185 	struct uvmpdpol_globalstate *s = &pdpol_state;
    186 	struct uvmpdpol_scanstate *ss = &pdpol_scanstate;
    187 	int t;
    188 	bool anonunder, fileunder, execunder;
    189 	bool anonover, fileover, execover;
    190 	bool anonreact, filereact, execreact;
    191 	int64_t freepg, anonpg, filepg, execpg;
    192 
    193 	/*
    194 	 * decide which types of pages we want to reactivate instead of freeing
    195 	 * to keep usage within the minimum and maximum usage limits.
    196 	 */
    197 
    198 	cpu_count_sync_all();
    199 	freepg = uvm_availmem();
    200 	anonpg = cpu_count_get(CPU_COUNT_ANONPAGES);
    201 	filepg = cpu_count_get(CPU_COUNT_FILEPAGES);
    202 	execpg = cpu_count_get(CPU_COUNT_EXECPAGES);
    203 
    204 	mutex_enter(&s->lock);
    205 	t = s->s_active + s->s_inactive + freepg;
    206 	anonunder = anonpg <= UVM_PCTPARAM_APPLY(&s->s_anonmin, t);
    207 	fileunder = filepg <= UVM_PCTPARAM_APPLY(&s->s_filemin, t);
    208 	execunder = execpg <= UVM_PCTPARAM_APPLY(&s->s_execmin, t);
    209 	anonover = anonpg > UVM_PCTPARAM_APPLY(&s->s_anonmax, t);
    210 	fileover = filepg > UVM_PCTPARAM_APPLY(&s->s_filemax, t);
    211 	execover = execpg > UVM_PCTPARAM_APPLY(&s->s_execmax, t);
    212 	anonreact = anonunder || (!anonover && (fileover || execover));
    213 	filereact = fileunder || (!fileover && (anonover || execover));
    214 	execreact = execunder || (!execover && (anonover || fileover));
    215 	if (filereact && execreact && (anonreact || uvm_swapisfull())) {
    216 		anonreact = filereact = execreact = false;
    217 	}
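	/*
	 * an illustrative (hypothetical) case: if file pages are over
	 * filemax while anon pages are neither under anonmin nor over
	 * anonmax, then anonreact (and likewise execreact, if exec
	 * pages are not over execmax) ends up true, so the scan will
	 * reactivate anon/exec pages and preferentially free file
	 * pages.  the check above resets all three flags when file and
	 * exec pages would both be reactivated and anon pages either
	 * would be too or cannot be paged out (swap full), since then
	 * nothing useful could be freed.
	 */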
    218 	ss->ss_anonreact = anonreact;
    219 	ss->ss_filereact = filereact;
    220 	ss->ss_execreact = execreact;
    221 	memset(&ss->ss_marker, 0, sizeof(ss->ss_marker));
    222 	ss->ss_marker.flags = PG_MARKER;
    223 	TAILQ_INSERT_HEAD(&pdpol_state.s_inactiveq, &ss->ss_marker, pdqueue);
    224 	mutex_exit(&s->lock);
    225 }
    226 
    227 void
    228 uvmpdpol_scanfini(void)
    229 {
    230 	struct uvmpdpol_globalstate *s = &pdpol_state;
    231 	struct uvmpdpol_scanstate *ss = &pdpol_scanstate;
    232 
    233 	mutex_enter(&s->lock);
    234 	TAILQ_REMOVE(&pdpol_state.s_inactiveq, &ss->ss_marker, pdqueue);
    235 	mutex_exit(&s->lock);
    236 }
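/*
 * a note on ss_marker, used by the two functions above and by
 * uvmpdpol_selectvictim(): it is a dummy struct vm_page with only
 * PG_MARKER set, threaded onto the inactive queue so the scan can
 * remember its position across drops of the global lock.  the marker
 * is advanced past each page visited, and real pages never carry
 * PG_MARKER (hence the KASSERTs in the scan loops).
 */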
    237 
    238 struct vm_page *
    239 uvmpdpol_selectvictim(krwlock_t **plock)
    240 {
    241 	struct uvmpdpol_globalstate *s = &pdpol_state;
    242 	struct uvmpdpol_scanstate *ss = &pdpol_scanstate;
    243 	struct vm_page *pg;
    244 	krwlock_t *lock;
    245 
    246 	mutex_enter(&s->lock);
    247 	while (/* CONSTCOND */ 1) {
    248 		struct vm_anon *anon;
    249 		struct uvm_object *uobj;
    250 
    251 		pg = TAILQ_NEXT(&ss->ss_marker, pdqueue);
    252 		if (pg == NULL) {
    253 			break;
    254 		}
    255 		KASSERT((pg->flags & PG_MARKER) == 0);
    256 		uvmexp.pdscans++;
    257 
    258 		/*
    259 		 * acquire interlock to stabilize page identity.
    260 		 * if we have caught the page in a state of flux,
    261 		 * deal with it and retry.
    262 		 */
    263 		mutex_enter(&pg->interlock);
    264 		if (uvmpdpol_pagerealize_locked(pg)) {
    265 			mutex_exit(&pg->interlock);
    266 			continue;
    267 		}
    268 
    269 		/*
    270 		 * now prepare to move on to the next page.
    271 		 */
    272 		TAILQ_REMOVE(&pdpol_state.s_inactiveq, &ss->ss_marker,
    273 		    pdqueue);
    274 		TAILQ_INSERT_AFTER(&pdpol_state.s_inactiveq, pg,
    275 		    &ss->ss_marker, pdqueue);
    276 
    277 		/*
    278 		 * enforce the minimum thresholds on different
    279 		 * types of memory usage.  if reusing the current
    280 		 * page would reduce that type of usage below its
    281 		 * minimum, reactivate the page instead and move
    282 		 * on to the next page.
    283 		 */
    284 		anon = pg->uanon;
    285 		uobj = pg->uobject;
    286 		if (uobj && UVM_OBJ_IS_VTEXT(uobj) && ss->ss_execreact) {
    287 			uvmpdpol_pageactivate_locked(pg);
    288 			mutex_exit(&pg->interlock);
    289 			PDPOL_EVCNT_INCR(reactexec);
    290 			continue;
    291 		}
    292 		if (uobj && UVM_OBJ_IS_VNODE(uobj) &&
    293 		    !UVM_OBJ_IS_VTEXT(uobj) && ss->ss_filereact) {
    294 			uvmpdpol_pageactivate_locked(pg);
    295 			mutex_exit(&pg->interlock);
    296 			PDPOL_EVCNT_INCR(reactfile);
    297 			continue;
    298 		}
    299 		if ((anon || UVM_OBJ_IS_AOBJ(uobj)) && ss->ss_anonreact) {
    300 			uvmpdpol_pageactivate_locked(pg);
    301 			mutex_exit(&pg->interlock);
    302 			PDPOL_EVCNT_INCR(reactanon);
    303 			continue;
    304 		}
    305 
    306 		/*
    307 		 * try to lock the object that owns the page.
    308 		 *
    309 		 * with the page interlock held, we can drop s->lock, which
    310 		 * could otherwise serve as a barrier to us getting the
    311 		 * object locked, because the owner of the object's lock may
    312 		 * be blocked on s->lock (i.e. a deadlock).
    313 		 *
    314 		 * whatever happens, uvmpd_trylockowner() will release the
    315 		 * interlock.  with the interlock dropped we can then
    316 		 * re-acquire our own lock.  the order is:
    317 		 *
    318 		 *	object -> pdpol -> interlock.
    319 		 */
    320 		mutex_exit(&s->lock);
    321 		lock = uvmpd_trylockowner(pg);
    322 		/* pg->interlock now released */
    323 		mutex_enter(&s->lock);
    324 		if (lock == NULL) {
    325 			/* didn't get it - try the next page. */
    326 			continue;
    327 		}
    328 
    329 		/*
    330 		 * move referenced pages back to active queue and skip to
    331 		 * next page.
    332 		 */
    333 		if (pmap_is_referenced(pg)) {
    334 			mutex_enter(&pg->interlock);
    335 			uvmpdpol_pageactivate_locked(pg);
    336 			mutex_exit(&pg->interlock);
    337 			uvmexp.pdreact++;
    338 			rw_exit(lock);
    339 			continue;
    340 		}
    341 
    342 		/* we have a potential victim. */
    343 		*plock = lock;
    344 		break;
    345 	}
    346 	mutex_exit(&s->lock);
    347 	return pg;
    348 }
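/*
 * a sketch of how a caller is expected to drive the scan; the real
 * page daemon (uvm_pdaemon.c) does considerably more per victim, and
 * "enough_memory_freed" here is only a stand-in for its own
 * termination test:
 *
 *	krwlock_t *lock;
 *	struct vm_page *pg;
 *
 *	uvmpdpol_scaninit();
 *	while (!enough_memory_freed()) {
 *		pg = uvmpdpol_selectvictim(&lock);
 *		if (pg == NULL)
 *			break;
 *		... page out or free pg, then ...
 *		rw_exit(lock);
 *	}
 *	uvmpdpol_scanfini();
 */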
    349 
    350 void
    351 uvmpdpol_balancequeue(int swap_shortage)
    352 {
    353 	struct uvmpdpol_globalstate *s = &pdpol_state;
    354 	int inactive_shortage;
    355 	struct vm_page *p, marker;
    356 	krwlock_t *lock;
    357 
    358 	/*
    359 	 * we have done the scan to get free pages.   now we work on meeting
    360 	 * our inactive target.
    361 	 */
    362 
    363 	memset(&marker, 0, sizeof(marker));
    364 	marker.flags = PG_MARKER;
    365 
    366 	mutex_enter(&s->lock);
    367 	TAILQ_INSERT_HEAD(&pdpol_state.s_activeq, &marker, pdqueue);
    368 	for (;;) {
    369 		inactive_shortage =
    370 		    pdpol_state.s_inactarg - pdpol_state.s_inactive;
    371 		if (inactive_shortage <= 0 && swap_shortage <= 0) {
    372 			break;
    373 		}
    374 		p = TAILQ_NEXT(&marker, pdqueue);
    375 		if (p == NULL) {
    376 			break;
    377 		}
    378 		KASSERT((p->flags & PG_MARKER) == 0);
    379 
    380 		/*
    381 		 * acquire interlock to stabilize page identity.
    382 		 * if we have caught the page in a state of flux,
    383 		 * deal with it and retry.
    384 		 */
    385 		mutex_enter(&p->interlock);
    386 		if (uvmpdpol_pagerealize_locked(p)) {
    387 			mutex_exit(&p->interlock);
    388 			continue;
    389 		}
    390 
    391 		/*
    392 		 * now prepare to move on to the next page.
    393 		 */
    394 		TAILQ_REMOVE(&pdpol_state.s_activeq, &marker, pdqueue);
    395 		TAILQ_INSERT_AFTER(&pdpol_state.s_activeq, p, &marker,
    396 		    pdqueue);
    397 
    398 		/*
    399 		 * try to lock the object that owns the page.  see comments
    400 		 * in uvmpdpol_selectvictim().
    401 		 */
    402 		mutex_exit(&s->lock);
    403 		lock = uvmpd_trylockowner(p);
    404 		/* p->interlock now released */
    405 		mutex_enter(&s->lock);
    406 		if (lock == NULL) {
    407 			/* didn't get it - try the next page. */
    408 			continue;
    409 		}
    410 
    411 		/*
    412 		 * if there's a shortage of swap slots, try to free it.
    413 		 */
    414 		if (swap_shortage > 0 && (p->flags & PG_SWAPBACKED) != 0 &&
    415 		    (p->flags & PG_BUSY) == 0) {
    416 			if (uvmpd_dropswap(p)) {
    417 				swap_shortage--;
    418 			}
    419 		}
    420 
    421 		/*
    422 		 * if there's a shortage of inactive pages, deactivate.
    423 		 */
    424 		if (inactive_shortage > 0) {
    425 			pmap_clear_reference(p);
    426 			mutex_enter(&p->interlock);
    427 			uvmpdpol_pagedeactivate_locked(p);
    428 			mutex_exit(&p->interlock);
    429 			uvmexp.pddeact++;
    430 			inactive_shortage--;
    431 		}
    432 		rw_exit(lock);
    433 	}
    434 	TAILQ_REMOVE(&pdpol_state.s_activeq, &marker, pdqueue);
    435 	mutex_exit(&s->lock);
    436 }
    437 
    438 static void
    439 uvmpdpol_pagedeactivate_locked(struct vm_page *pg)
    440 {
    441 	struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
    442 
    443 	KASSERT(mutex_owned(&s->lock));
    444 	KASSERT(mutex_owned(&pg->interlock));
    445 	KASSERT((pg->pqflags & (PQ_INTENT_MASK | PQ_INTENT_SET)) !=
    446 	    (PQ_INTENT_D | PQ_INTENT_SET));
    447 
    448 	if (pg->pqflags & PQ_ACTIVE) {
    449 		TAILQ_REMOVE(&pdpol_state.s_activeq, pg, pdqueue);
    450 		KASSERT(pdpol_state.s_active > 0);
    451 		pdpol_state.s_active--;
    452 	}
    453 	if ((pg->pqflags & PQ_INACTIVE) == 0) {
    454 		KASSERT(pg->wire_count == 0);
    455 		TAILQ_INSERT_TAIL(&pdpol_state.s_inactiveq, pg, pdqueue);
    456 		pdpol_state.s_inactive++;
    457 	}
    458 	pg->pqflags = (pg->pqflags & PQ_INTENT_QUEUED) | PQ_INACTIVE;
    459 }
    460 
    461 void
    462 uvmpdpol_pagedeactivate(struct vm_page *pg)
    463 {
    464 
    465 	KASSERT(uvm_page_owner_locked_p(pg, true));
    466 	KASSERT(mutex_owned(&pg->interlock));
    467 
    468 	/*
    469 	 * we have to clear the reference bit now, as when it comes time to
    470 	 * realize the intent we won't have the object locked any more.
    471 	 */
    472 	pmap_clear_reference(pg);
    473 	uvmpdpol_set_intent(pg, PQ_INTENT_I);
    474 }
    475 
    476 static void
    477 uvmpdpol_pageactivate_locked(struct vm_page *pg)
    478 {
    479 	struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
    480 
    481 	KASSERT(mutex_owned(&s->lock));
    482 	KASSERT(mutex_owned(&pg->interlock));
    483 	KASSERT((pg->pqflags & (PQ_INTENT_MASK | PQ_INTENT_SET)) !=
    484 	    (PQ_INTENT_D | PQ_INTENT_SET));
    485 
    486 	uvmpdpol_pagedequeue_locked(pg);
    487 	TAILQ_INSERT_TAIL(&pdpol_state.s_activeq, pg, pdqueue);
    488 	pdpol_state.s_active++;
    489 	pg->pqflags = (pg->pqflags & PQ_INTENT_QUEUED) | PQ_ACTIVE;
    490 }
    491 
    492 void
    493 uvmpdpol_pageactivate(struct vm_page *pg)
    494 {
    495 
    496 	KASSERT(uvm_page_owner_locked_p(pg, true));
    497 	KASSERT(mutex_owned(&pg->interlock));
    498 
    499 	uvmpdpol_set_intent(pg, PQ_INTENT_A);
    500 }
    501 
    502 static void
    503 uvmpdpol_pagedequeue_locked(struct vm_page *pg)
    504 {
    505 	struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
    506 
    507 	KASSERT(mutex_owned(&s->lock));
    508 	KASSERT(mutex_owned(&pg->interlock));
    509 
    510 	if (pg->pqflags & PQ_ACTIVE) {
    511 		TAILQ_REMOVE(&pdpol_state.s_activeq, pg, pdqueue);
    512 		KASSERT((pg->pqflags & PQ_INACTIVE) == 0);
    513 		KASSERT(pdpol_state.s_active > 0);
    514 		pdpol_state.s_active--;
    515 	} else if (pg->pqflags & PQ_INACTIVE) {
    516 		TAILQ_REMOVE(&pdpol_state.s_inactiveq, pg, pdqueue);
    517 		KASSERT(pdpol_state.s_inactive > 0);
    518 		pdpol_state.s_inactive--;
    519 	}
    520 	pg->pqflags &= PQ_INTENT_QUEUED;
    521 }
    522 
    523 void
    524 uvmpdpol_pagedequeue(struct vm_page *pg)
    525 {
    526 
    527 	KASSERT(uvm_page_owner_locked_p(pg, true));
    528 	KASSERT(mutex_owned(&pg->interlock));
    529 
    530 	uvmpdpol_set_intent(pg, PQ_INTENT_D);
    531 }
    532 
    533 void
    534 uvmpdpol_pageenqueue(struct vm_page *pg)
    535 {
    536 
    537 	KASSERT(uvm_page_owner_locked_p(pg, true));
    538 	KASSERT(mutex_owned(&pg->interlock));
    539 
    540 	uvmpdpol_set_intent(pg, PQ_INTENT_E);
    541 }
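/*
 * uvmpdpol_pagedeactivate(), _pageactivate(), _pagedequeue() and
 * _pageenqueue() above only record an intent (PQ_INTENT_*) on the
 * page while the owner's lock and pg->interlock are held; they do not
 * touch the global queues.  a rough sketch of the intended flow (the
 * callers live outside this file):
 *
 *	uvmpdpol_pagedeactivate(pg);	records PQ_INTENT_I on pg
 *	...
 *	uvmpdpol_pagerealize(pg);	queues pg on the local CPU
 *
 * and later uvmpdpol_flush(), uvmpdpol_idle() or the scan loops apply
 * the intent under the global lock via uvmpdpol_pagerealize_locked().
 */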
    542 
    543 void
    544 uvmpdpol_anfree(struct vm_anon *an)
    545 {
    546 }
    547 
    548 bool
    549 uvmpdpol_pageisqueued_p(struct vm_page *pg)
    550 {
    551 	uint32_t pqflags;
    552 
    553 	/*
    554 	 * if there's an intent set, we have to consider it.  otherwise,
    555 	 * return the actual state.  we may be called unlocked for the
    556 	 * purpose of assertions, which is safe due to the page lifecycle.
    557 	 */
    558 	pqflags = atomic_load_relaxed(&pg->pqflags);
    559 	if ((pqflags & PQ_INTENT_SET) != 0) {
    560 		return (pqflags & PQ_INTENT_MASK) != PQ_INTENT_D;
    561 	} else {
    562 		return (pqflags & (PQ_ACTIVE | PQ_INACTIVE)) != 0;
    563 	}
    564 }
    565 
    566 void
    567 uvmpdpol_estimatepageable(int *active, int *inactive)
    568 {
    569 	struct uvmpdpol_globalstate *s = &pdpol_state;
    570 
    571 	/*
    572 	 * Don't take any locks here.  This can be called from DDB, and in
    573 	 * any case the numbers are stale the instant the lock is dropped,
    574 	 * so it just doesn't matter.
    575 	 */
    576 	if (active) {
    577 		*active = s->s_active;
    578 	}
    579 	if (inactive) {
    580 		*inactive = s->s_inactive;
    581 	}
    582 }
    583 
    584 #if !defined(PDSIM)
    585 static int
    586 min_check(struct uvm_pctparam *pct, int t)
    587 {
    588 	struct uvmpdpol_globalstate *s = &pdpol_state;
    589 	int total = t;
    590 
    591 	if (pct != &s->s_anonmin) {
    592 		total += uvm_pctparam_get(&s->s_anonmin);
    593 	}
    594 	if (pct != &s->s_filemin) {
    595 		total += uvm_pctparam_get(&s->s_filemin);
    596 	}
    597 	if (pct != &s->s_execmin) {
    598 		total += uvm_pctparam_get(&s->s_execmin);
    599 	}
    600 	if (total > 95) {
    601 		return EINVAL;
    602 	}
    603 	return 0;
    604 }
    605 #endif /* !defined(PDSIM) */
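/*
 * an example of the constraint enforced above, using the defaults set
 * in uvmpdpol_init() below: with filemin = 10 and execmin = 5, an
 * attempt to raise anonmin to 81 is rejected because 81 + 10 + 5 = 96
 * exceeds 95.
 */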
    606 
    607 void
    608 uvmpdpol_init(void)
    609 {
    610 	struct uvmpdpol_globalstate *s = &pdpol_state;
    611 
    612 	mutex_init(&s->lock, MUTEX_DEFAULT, IPL_NONE);
    613 	TAILQ_INIT(&s->s_activeq);
    614 	TAILQ_INIT(&s->s_inactiveq);
    615 	uvm_pctparam_init(&s->s_inactivepct, CLOCK_INACTIVEPCT, NULL);
    616 	uvm_pctparam_init(&s->s_anonmin, 10, min_check);
    617 	uvm_pctparam_init(&s->s_filemin, 10, min_check);
    618 	uvm_pctparam_init(&s->s_execmin,  5, min_check);
    619 	uvm_pctparam_init(&s->s_anonmax, 80, NULL);
    620 	uvm_pctparam_init(&s->s_filemax, 50, NULL);
    621 	uvm_pctparam_init(&s->s_execmax, 30, NULL);
    622 }
    623 
    624 void
    625 uvmpdpol_init_cpu(struct uvm_cpu *ucpu)
    626 {
    627 
    628 	ucpu->pdq =
    629 	    kmem_alloc(CLOCK_PDQ_SIZE * sizeof(struct vm_page *), KM_SLEEP);
    630 	ucpu->pdqhead = CLOCK_PDQ_SIZE;
    631 	ucpu->pdqtail = CLOCK_PDQ_SIZE;
    632 }
    633 
    634 void
    635 uvmpdpol_reinit(void)
    636 {
    637 }
    638 
    639 bool
    640 uvmpdpol_needsscan_p(void)
    641 {
    642 
    643 	/*
    644 	 * this must be an unlocked check: can be called from interrupt.
    645 	 */
    646 	return pdpol_state.s_inactive < pdpol_state.s_inactarg;
    647 }
    648 
    649 void
    650 uvmpdpol_tune(void)
    651 {
    652 	struct uvmpdpol_globalstate *s = &pdpol_state;
    653 
    654 	mutex_enter(&s->lock);
    655 	clock_tune();
    656 	mutex_exit(&s->lock);
    657 }
    658 
    659 /*
    660  * uvmpdpol_pagerealize_locked: take the intended state set on a page and
    661  * make it real.  return true if any work was done.
    662  */
    663 static bool
    664 uvmpdpol_pagerealize_locked(struct vm_page *pg)
    665 {
    666 	struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
    667 
    668 	KASSERT(mutex_owned(&s->lock));
    669 	KASSERT(mutex_owned(&pg->interlock));
    670 
    671 	switch (pg->pqflags & (PQ_INTENT_MASK | PQ_INTENT_SET)) {
    672 	case PQ_INTENT_A | PQ_INTENT_SET:
    673 	case PQ_INTENT_E | PQ_INTENT_SET:
    674 		uvmpdpol_pageactivate_locked(pg);
    675 		return true;
    676 	case PQ_INTENT_I | PQ_INTENT_SET:
    677 		uvmpdpol_pagedeactivate_locked(pg);
    678 		return true;
    679 	case PQ_INTENT_D | PQ_INTENT_SET:
    680 		uvmpdpol_pagedequeue_locked(pg);
    681 		return true;
    682 	default:
    683 		return false;
    684 	}
    685 }
    686 
    687 /*
    688  * uvmpdpol_flush: return the current uvm_cpu with all of its pending
    689  * updates flushed to the global queues.  this routine may block, and
    690  * so can switch cpu.  the idea is to empty the queue on whatever cpu
    691  * we finally end up on.
    692  */
    693 static struct uvm_cpu *
    694 uvmpdpol_flush(void)
    695 {
    696 	struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
    697 	struct uvm_cpu *ucpu;
    698 	struct vm_page *pg;
    699 
    700 	KASSERT(kpreempt_disabled());
    701 
    702 	mutex_enter(&s->lock);
    703 	for (;;) {
    704 		/*
    705 		 * prefer scanning forwards (even though mutex_enter() is
    706 		 * serializing) so as to not defeat any prefetch logic in
    707 		 * the CPU.  that means elsewhere enqueuing backwards, like
    708 		 * a stack, but not so important there as pages are being
    709 		 * added one at a time.
    710 		 *
    711 		 * prefetch the next "struct vm_page" while working on the
    712 		 * current one.  this has a measurable and very positive
    713 		 * effect in reducing the amount of time spent here under
    714 		 * the global lock.
    715 		 */
    716 		ucpu = curcpu()->ci_data.cpu_uvm;
    717 		KASSERT(ucpu->pdqhead <= ucpu->pdqtail);
    718 		if (__predict_false(ucpu->pdqhead == ucpu->pdqtail)) {
    719 			break;
    720 		}
    721 		pg = ucpu->pdq[ucpu->pdqhead++];
    722 		if (__predict_true(ucpu->pdqhead != ucpu->pdqtail)) {
    723 			__builtin_prefetch(ucpu->pdq[ucpu->pdqhead]);
    724 		}
    725 		mutex_enter(&pg->interlock);
    726 		pg->pqflags &= ~PQ_INTENT_QUEUED;
    727 		(void)uvmpdpol_pagerealize_locked(pg);
    728 		mutex_exit(&pg->interlock);
    729 	}
    730 	mutex_exit(&s->lock);
    731 	return ucpu;
    732 }
    733 
    734 /*
    735  * uvmpdpol_pagerealize: realize any intent set on the page.  in this
    736  * implementation, that means putting the page on a per-CPU queue to be
    737  * dealt with later.
    738  */
    739 void
    740 uvmpdpol_pagerealize(struct vm_page *pg)
    741 {
    742 	struct uvm_cpu *ucpu;
    743 
    744 	/*
    745 	 * drain the per-CPU queue if full, then enqueue the page.
    746 	 */
    747 	kpreempt_disable();
    748 	ucpu = curcpu()->ci_data.cpu_uvm;
    749 	if (__predict_false(ucpu->pdqhead == 0)) {
    750 		ucpu = uvmpdpol_flush();
    751 	}
    752 	ucpu->pdq[--(ucpu->pdqhead)] = pg;
    753 	kpreempt_enable();
    754 }
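/*
 * layout of the per-CPU queue, for reference: pdq[] is filled from the
 * top down, like a stack.  uvmpdpol_init_cpu() starts with pdqhead ==
 * pdqtail == CLOCK_PDQ_SIZE (empty), uvmpdpol_pagerealize() stores at
 * --pdqhead and flushes first if pdqhead has reached 0 (full), and
 * uvmpdpol_flush() consumes entries forwards by advancing pdqhead back
 * towards pdqtail.
 */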
    755 
    756 /*
    757  * uvmpdpol_idle: called from the system idle loop.  periodically purge any
    758  * pending updates back to the global queues.
    759  */
    760 void
    761 uvmpdpol_idle(struct uvm_cpu *ucpu)
    762 {
    763 	struct uvmpdpol_globalstate *s = &pdpol_state;
    764 	struct vm_page *pg;
    765 
    766 	KASSERT(kpreempt_disabled());
    767 
    768 	/*
    769 	 * if no pages in the queue, we have nothing to do.
    770 	 */
    771 	if (ucpu->pdqhead == ucpu->pdqtail) {
    772 		ucpu->pdqtime = hardclock_ticks;
    773 		return;
    774 	}
    775 
    776 	/*
    777 	 * don't do this more than ~8 times a second as it would needlessly
    778 	 * exert pressure.
    779 	 */
    780 	if (hardclock_ticks - ucpu->pdqtime < (hz >> 3)) {
    781 		return;
    782 	}
    783 
    784 	/*
    785 	 * the idle LWP can't block, so we have to try for the lock.  if we
    786 	 * get it, purge the per-CPU pending update queue.  continually
    787 	 * check for a pending resched: in that case exit immediately.
    788 	 */
    789 	if (mutex_tryenter(&s->lock)) {
    790 		while (ucpu->pdqhead != ucpu->pdqtail) {
    791 			pg = ucpu->pdq[ucpu->pdqhead];
    792 			if (!mutex_tryenter(&pg->interlock)) {
    793 				break;
    794 			}
    795 			ucpu->pdqhead++;
    796 			pg->pqflags &= ~PQ_INTENT_QUEUED;
    797 			(void)uvmpdpol_pagerealize_locked(pg);
    798 			mutex_exit(&pg->interlock);
    799 			if (curcpu()->ci_want_resched) {
    800 				break;
    801 			}
    802 		}
    803 		if (ucpu->pdqhead == ucpu->pdqtail) {
    804 			ucpu->pdqtime = hardclock_ticks;
    805 		}
    806 		mutex_exit(&s->lock);
    807 	}
    808 }
    809 
    810 #if !defined(PDSIM)
    811 
    812 #include <sys/sysctl.h>	/* XXX SYSCTL_DESCR */
    813 
    814 void
    815 uvmpdpol_sysctlsetup(void)
    816 {
    817 	struct uvmpdpol_globalstate *s = &pdpol_state;
    818 
    819 	uvm_pctparam_createsysctlnode(&s->s_anonmin, "anonmin",
    820 	    SYSCTL_DESCR("Percentage of physical memory reserved "
    821 	    "for anonymous application data"));
    822 	uvm_pctparam_createsysctlnode(&s->s_filemin, "filemin",
    823 	    SYSCTL_DESCR("Percentage of physical memory reserved "
    824 	    "for cached file data"));
    825 	uvm_pctparam_createsysctlnode(&s->s_execmin, "execmin",
    826 	    SYSCTL_DESCR("Percentage of physical memory reserved "
    827 	    "for cached executable data"));
    828 
    829 	uvm_pctparam_createsysctlnode(&s->s_anonmax, "anonmax",
    830 	    SYSCTL_DESCR("Percentage of physical memory which will "
    831 	    "be reclaimed from other usage for "
    832 	    "anonymous application data"));
    833 	uvm_pctparam_createsysctlnode(&s->s_filemax, "filemax",
    834 	    SYSCTL_DESCR("Percentage of physical memory which will "
    835 	    "be reclaimed from other usage for cached "
    836 	    "file data"));
    837 	uvm_pctparam_createsysctlnode(&s->s_execmax, "execmax",
    838 	    SYSCTL_DESCR("Percentage of physical memory which will "
    839 	    "be reclaimed from other usage for cached "
    840 	    "executable data"));
    841 
    842 	uvm_pctparam_createsysctlnode(&s->s_inactivepct, "inactivepct",
    843 	    SYSCTL_DESCR("Percentage of inactive queue of "
    844 	    "the entire (active + inactive) queue"));
    845 }
    846 
    847 #endif /* !defined(PDSIM) */
    848 
    849 #if defined(PDSIM)
    850 void
    851 pdsim_dump(const char *id)
    852 {
    853 #if defined(DEBUG)
    854 	/* XXX */
    855 #endif /* defined(DEBUG) */
    856 }
    857 #endif /* defined(PDSIM) */
    858