/*	$NetBSD: uvm_pdaemon.c,v 1.127 2020/05/25 19:46:20 ad Exp $	*/

/*
 * Copyright (c) 1997 Charles D. Cranor and Washington University.
 * Copyright (c) 1991, 1993, The Regents of the University of California.
 *
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vm_pageout.c	8.5 (Berkeley) 2/14/94
 * from: Id: uvm_pdaemon.c,v 1.1.2.32 1998/02/06 05:26:30 chs Exp
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 * uvm_pdaemon.c: the page daemon
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_pdaemon.c,v 1.127 2020/05/25 19:46:20 ad Exp $");

#include "opt_uvmhist.h"
#include "opt_readahead.h"

#define	__RWLOCK_PRIVATE

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/pool.h>
#include <sys/buf.h>
#include <sys/module.h>
#include <sys/atomic.h>
#include <sys/kthread.h>

#include <uvm/uvm.h>
#include <uvm/uvm_pdpolicy.h>
#include <uvm/uvm_pgflcache.h>

#ifdef UVMHIST
UVMHIST_DEFINE(pdhist);
#endif
/*
 * UVMPD_NUMDIRTYREACTS is how many dirty pages the pagedaemon will reactivate
 * in a pass thru the inactive list when swap is full.  the value should be
 * "small"... if it's too large we'll cycle the active pages thru the inactive
 * queue too quickly for them to be referenced and avoid being freed.
 */

#define	UVMPD_NUMDIRTYREACTS	16

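/*
 * UVMPD_NUMTRYLOCKOWNER bounds how many times the pagedaemon will retry a
 * failed trylock of a page owner's lock (with spin backoff) in
 * uvmpd_trylockowner() before giving up and moving on to the next page.
 */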
#define	UVMPD_NUMTRYLOCKOWNER	128

/*
 * local prototypes
 */

static void	uvmpd_scan(void);
static void	uvmpd_scan_queue(void);
static void	uvmpd_tune(void);
static void	uvmpd_pool_drain_thread(void *);
static void	uvmpd_pool_drain_wakeup(void);

static unsigned int uvm_pagedaemon_waiters;

/* State for the pool drainer thread */
static kmutex_t uvmpd_lock __cacheline_aligned;
static kcondvar_t uvmpd_pool_drain_cv;
static bool uvmpd_pool_drain_run = false;

/*
 * XXX hack to avoid hangs when large processes fork.
 */
u_int uvm_extrapages;

/*
 * uvm_wait: wait (sleep) for the page daemon to free some pages
 *
 * => should be called with all locks released
 * => should _not_ be called by the page daemon (to avoid deadlock)
 */

void
uvm_wait(const char *wmsg)
{
	int timo = 0;

	if (uvm.pagedaemon_lwp == NULL)
		panic("out of memory before the pagedaemon thread exists");

	mutex_spin_enter(&uvmpd_lock);

	/*
	 * check for page daemon going to sleep (waiting for itself)
	 */

	if (curlwp == uvm.pagedaemon_lwp && uvmexp.paging == 0) {
		/*
		 * now we have a problem: the pagedaemon wants to go to
		 * sleep until it frees more memory.  but how can it
		 * free more memory if it is asleep?  that is a deadlock.
		 * we have two options:
		 *  [1] panic now
		 *  [2] put a timeout on the sleep, thus causing the
		 *      pagedaemon to only pause (rather than sleep forever)
		 *
		 * note that option [2] will only help us if we get lucky
		 * and some other process on the system breaks the deadlock
		 * by exiting or freeing memory (thus allowing the pagedaemon
		 * to continue).  for now we panic if DEBUG is defined,
		 * otherwise we hope for the best with option [2] (better
		 * yet, this should never happen in the first place!).
		 */

		printf("pagedaemon: deadlock detected!\n");
		timo = hz >> 3;		/* set timeout */
#if defined(DEBUG)
		/* DEBUG: panic so we can debug it */
		panic("pagedaemon deadlock");
#endif
	}

	uvm_pagedaemon_waiters++;
	wakeup(&uvm.pagedaemon);		/* wake the daemon! */
	UVM_UNLOCK_AND_WAIT(&uvmexp.free, &uvmpd_lock, false, wmsg, timo);
}

/*
 * uvm_kick_pdaemon: perform checks to determine if we need to
 * give the pagedaemon a nudge, and do so if necessary.
 */

void
uvm_kick_pdaemon(void)
{
	int fpages = uvm_availmem();

	if (fpages + uvmexp.paging < uvmexp.freemin ||
	    (fpages + uvmexp.paging < uvmexp.freetarg &&
	     uvmpdpol_needsscan_p()) ||
	    uvm_km_va_starved_p()) {
		mutex_spin_enter(&uvmpd_lock);
		wakeup(&uvm.pagedaemon);
		mutex_spin_exit(&uvmpd_lock);
	}
}

/*
 * uvmpd_tune: tune paging parameters
 *
 * => called whenever memory is added to (or removed from?) the system
 */

static void
uvmpd_tune(void)
{
	int val;

	UVMHIST_FUNC("uvmpd_tune"); UVMHIST_CALLED(pdhist);

	/*
	 * try to keep 0.5% of available RAM free, but limit to between
	 * 128k and 1024k per-CPU.  XXX: what are these values good for?
	 */
	val = uvmexp.npages / 200;
	val = MAX(val, (128*1024) >> PAGE_SHIFT);
	val = MIN(val, (1024*1024) >> PAGE_SHIFT);
	val *= ncpu;
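	/*
	 * For example, with 4 KB pages (PAGE_SHIFT == 12) the clamp above
	 * works out to between 32 and 256 pages per CPU; npages / 200 is
	 * the 0.5% figure mentioned in the comment.
	 */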

	/* Make sure there's always a user page free. */
	if (val < uvmexp.reserve_kernel + 1)
		val = uvmexp.reserve_kernel + 1;
	uvmexp.freemin = val;

	/* Calculate free target. */
	val = (uvmexp.freemin * 4) / 3;
	if (val <= uvmexp.freemin)
		val = uvmexp.freemin + 1;
	uvmexp.freetarg = val + atomic_swap_uint(&uvm_extrapages, 0);
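	/*
	 * freetarg is 4/3 of freemin (and at least freemin + 1); the atomic
	 * swap consumes any pending uvm_extrapages request and folds it
	 * into the target in one shot.
	 */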

	uvmexp.wiredmax = uvmexp.npages / 3;
	UVMHIST_LOG(pdhist, "<- done, freemin=%jd, freetarg=%jd, wiredmax=%jd",
	    uvmexp.freemin, uvmexp.freetarg, uvmexp.wiredmax, 0);
}

/*
 * uvm_pageout: the main loop for the pagedaemon
 */

void
uvm_pageout(void *arg)
{
	int npages = 0;
	int extrapages = 0;
	int fpages;

	UVMHIST_FUNC("uvm_pageout"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist,"<starting uvm pagedaemon>", 0, 0, 0, 0);

	mutex_init(&uvmpd_lock, MUTEX_DEFAULT, IPL_VM);
	cv_init(&uvmpd_pool_drain_cv, "pooldrain");

	/* Create the pool drainer kernel thread. */
	if (kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL,
	    uvmpd_pool_drain_thread, NULL, NULL, "pooldrain"))
		panic("fork pooldrain");

	/*
	 * ensure correct priority and set paging parameters...
	 */

	uvm.pagedaemon_lwp = curlwp;
	npages = uvmexp.npages;
	uvmpd_tune();

	/*
	 * main loop
	 */

	for (;;) {
		bool needsscan, needsfree, kmem_va_starved;

		kmem_va_starved = uvm_km_va_starved_p();

		mutex_spin_enter(&uvmpd_lock);
		if ((uvm_pagedaemon_waiters == 0 || uvmexp.paging > 0) &&
		    !kmem_va_starved) {
			UVMHIST_LOG(pdhist," <<SLEEPING>>",0,0,0,0);
			UVM_UNLOCK_AND_WAIT(&uvm.pagedaemon,
			    &uvmpd_lock, false, "pgdaemon", 0);
			uvmexp.pdwoke++;
			UVMHIST_LOG(pdhist," <<WOKE UP>>",0,0,0,0);
		} else {
			mutex_spin_exit(&uvmpd_lock);
		}

		/*
		 * now recompute inactive count
		 */

		if (npages != uvmexp.npages || extrapages != uvm_extrapages) {
			npages = uvmexp.npages;
			extrapages = uvm_extrapages;
			uvmpd_tune();
		}

		uvmpdpol_tune();

		/*
		 * Estimate a hint.  Note that bufmem is returned to the
		 * system only when an entire pool page is empty.
		 */
		fpages = uvm_availmem();
		UVMHIST_LOG(pdhist," free/ftarg=%jd/%jd",
		    fpages, uvmexp.freetarg, 0,0);

		needsfree = fpages + uvmexp.paging < uvmexp.freetarg;
		needsscan = needsfree || uvmpdpol_needsscan_p();

		/*
		 * scan if needed
		 */
		if (needsscan) {
			uvmpd_scan();
		}

		/*
		 * if there's any free memory to be had,
		 * wake up any waiters.
		 */
		if (uvm_availmem() > uvmexp.reserve_kernel ||
		    uvmexp.paging == 0) {
			mutex_spin_enter(&uvmpd_lock);
			wakeup(&uvmexp.free);
			uvm_pagedaemon_waiters = 0;
			mutex_spin_exit(&uvmpd_lock);
		}

		/*
		 * scan done.  if we don't need free memory, we're done.
		 */

		if (!needsfree && !kmem_va_starved)
			continue;

		/*
		 * kick the pool drainer thread.
		 */

		uvmpd_pool_drain_wakeup();
	}
	/*NOTREACHED*/
}

void
uvm_pageout_start(int npages)
{

	atomic_add_int(&uvmexp.paging, npages);
}

void
uvm_pageout_done(int npages)
{

	KASSERT(atomic_load_relaxed(&uvmexp.paging) >= npages);

	if (npages == 0) {
		return;
	}

	atomic_add_int(&uvmexp.paging, -npages);

	/*
	 * wake up either the pagedaemon or any LWPs waiting for it.
	 */

	mutex_spin_enter(&uvmpd_lock);
	if (uvm_availmem() <= uvmexp.reserve_kernel) {
		wakeup(&uvm.pagedaemon);
	} else if (uvm_pagedaemon_waiters != 0) {
		wakeup(&uvmexp.free);
		uvm_pagedaemon_waiters = 0;
	}
	mutex_spin_exit(&uvmpd_lock);
}

/*
 * uvmpd_trylockowner: trylock the page's owner.
 *
 * => called with page interlock held.
 * => resolve orphaned O->A loaned page.
 * => return the locked rwlock on success.  otherwise, return NULL.
 */

krwlock_t *
uvmpd_trylockowner(struct vm_page *pg)
{
	struct uvm_object *uobj = pg->uobject;
	struct vm_anon *anon = pg->uanon;
	int tries, count;
	bool running;
	krwlock_t *slock;

	KASSERT(mutex_owned(&pg->interlock));

	if (uobj != NULL) {
		slock = uobj->vmobjlock;
		KASSERTMSG(slock != NULL, "pg %p uobj %p, NULL lock", pg, uobj);
	} else if (anon != NULL) {
		slock = anon->an_lock;
		KASSERTMSG(slock != NULL, "pg %p anon %p, NULL lock", pg, anon);
	} else {
		/* Page may be in state of flux - ignore. */
		mutex_exit(&pg->interlock);
		return NULL;
	}

	/*
	 * Now try to lock the objects.  We'll try hard, but don't really
	 * plan on spending more than a millisecond or so here.
	 */
	tries = (curlwp == uvm.pagedaemon_lwp ? UVMPD_NUMTRYLOCKOWNER : 1);
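	/*
	 * The pagedaemon itself retries up to UVMPD_NUMTRYLOCKOWNER times
	 * (with the spin backoff below); any other caller gives up after a
	 * single attempt.
	 */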
	for (;;) {
		if (rw_tryenter(slock, RW_WRITER)) {
			if (uobj == NULL) {
				/*
				 * set PG_ANON if it isn't set already.
				 */
				if ((pg->flags & PG_ANON) == 0) {
					KASSERT(pg->loan_count > 0);
					pg->loan_count--;
					pg->flags |= PG_ANON;
					/* anon now owns it */
				}
			}
			mutex_exit(&pg->interlock);
			return slock;
		}
		running = rw_owner_running(slock);
		if (!running || --tries <= 0) {
			break;
		}
		count = SPINLOCK_BACKOFF_MAX;
		SPINLOCK_BACKOFF(count);
	}

	/*
	 * We didn't get the lock; chances are the very next page on the
	 * queue also has the same lock, so if the lock owner is not running
	 * take a breather and allow them to make progress.  There could be
	 * only 1 CPU in the system, or the pagedaemon could have preempted
	 * the owner in kernel, or any number of other things could be going
	 * on.
	 */
	mutex_exit(&pg->interlock);
	if (curlwp == uvm.pagedaemon_lwp) {
		if (!running) {
			(void)kpause("pdpglock", false, 1, NULL);
		}
		uvmexp.pdbusy++;
	}
	return NULL;
}

#if defined(VMSWAP)
struct swapcluster {
	int swc_slot;
	int swc_nallocated;
	int swc_nused;
	struct vm_page *swc_pages[howmany(MAXPHYS, MIN_PAGE_SIZE)];
};

static void
swapcluster_init(struct swapcluster *swc)
{

	swc->swc_slot = 0;
	swc->swc_nused = 0;
}

static int
swapcluster_allocslots(struct swapcluster *swc)
{
	int slot;
	int npages;

	if (swc->swc_slot != 0) {
		return 0;
	}

	/* Even with strange MAXPHYS, the shift
	   implicitly rounds down to a page. */
	npages = MAXPHYS >> PAGE_SHIFT;
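	/*
	 * For example, with a typical MAXPHYS of 64 KB and 4 KB pages this
	 * asks for a 16-page cluster (actual values vary by port and
	 * configuration).
	 */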
	slot = uvm_swap_alloc(&npages, true);
	if (slot == 0) {
		return ENOMEM;
	}
	swc->swc_slot = slot;
	swc->swc_nallocated = npages;
	swc->swc_nused = 0;

	return 0;
}

static int
swapcluster_add(struct swapcluster *swc, struct vm_page *pg)
{
	int slot;
	struct uvm_object *uobj;

	KASSERT(swc->swc_slot != 0);
	KASSERT(swc->swc_nused < swc->swc_nallocated);
	KASSERT((pg->flags & PG_SWAPBACKED) != 0);

	slot = swc->swc_slot + swc->swc_nused;
	uobj = pg->uobject;
	if (uobj == NULL) {
		KASSERT(rw_write_held(pg->uanon->an_lock));
		pg->uanon->an_swslot = slot;
	} else {
		int result;

		KASSERT(rw_write_held(uobj->vmobjlock));
		result = uao_set_swslot(uobj, pg->offset >> PAGE_SHIFT, slot);
		if (result == -1) {
			return ENOMEM;
		}
	}
	swc->swc_pages[swc->swc_nused] = pg;
	swc->swc_nused++;

	return 0;
}

static void
swapcluster_flush(struct swapcluster *swc, bool now)
{
	int slot;
	int nused;
	int nallocated;
	int error __diagused;

	if (swc->swc_slot == 0) {
		return;
	}
	KASSERT(swc->swc_nused <= swc->swc_nallocated);

	slot = swc->swc_slot;
	nused = swc->swc_nused;
	nallocated = swc->swc_nallocated;

	/*
	 * if this is the final pageout we could have a few
	 * unused swap blocks.  if so, free them now.
	 */

	if (nused < nallocated) {
		if (!now) {
			return;
		}
		uvm_swap_free(slot + nused, nallocated - nused);
	}

	/*
	 * now start the pageout.
	 */

	if (nused > 0) {
		uvmexp.pdpageouts++;
		uvm_pageout_start(nused);
		error = uvm_swap_put(slot, swc->swc_pages, nused, 0);
		KASSERT(error == 0 || error == ENOMEM);
	}

	/*
	 * zero swslot to indicate that we are
	 * no longer building a swap-backed cluster.
	 */

	swc->swc_slot = 0;
	swc->swc_nused = 0;
}

static int
swapcluster_nused(struct swapcluster *swc)
{

	return swc->swc_nused;
}

/*
 * uvmpd_dropswap: free any swap allocated to this page.
 *
 * => called with owner locked.
 * => return true if a page had an associated slot.
 */

bool
uvmpd_dropswap(struct vm_page *pg)
{
	bool result = false;
	struct vm_anon *anon = pg->uanon;

	if ((pg->flags & PG_ANON) && anon->an_swslot) {
		uvm_swap_free(anon->an_swslot, 1);
		anon->an_swslot = 0;
		uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
		result = true;
	} else if (pg->flags & PG_AOBJ) {
		int slot = uao_set_swslot(pg->uobject,
		    pg->offset >> PAGE_SHIFT, 0);
		if (slot) {
			uvm_swap_free(slot, 1);
			uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
			result = true;
		}
	}

	return result;
}

#endif /* defined(VMSWAP) */

/*
 * uvmpd_scan_queue: scan a replace candidate list for pages
 * to clean or free.
 *
 * => we work on meeting our free target by converting inactive pages
 *    into free pages.
 * => we handle the building of swap-backed clusters
 */

static void
uvmpd_scan_queue(void)
{
	struct vm_page *p;
	struct uvm_object *uobj;
	struct vm_anon *anon;
#if defined(VMSWAP)
	struct swapcluster swc;
#endif /* defined(VMSWAP) */
	int dirtyreacts;
	krwlock_t *slock;
	UVMHIST_FUNC("uvmpd_scan_queue"); UVMHIST_CALLED(pdhist);

	/*
	 * swslot is non-zero if we are building a swap cluster.  we want
	 * to stay in the loop while we have a page to scan or we have
	 * a swap-cluster to build.
	 */

#if defined(VMSWAP)
	swapcluster_init(&swc);
#endif /* defined(VMSWAP) */

	dirtyreacts = 0;
	uvmpdpol_scaninit();

	while (/* CONSTCOND */ 1) {

		/*
		 * see if we've met the free target.  note that we scan
		 * until free + paging (+ clustered) pages reach four times
		 * freetarg, or until we've reactivated UVMPD_NUMDIRTYREACTS
		 * dirty pages.
		 */

		if (uvm_availmem() + uvmexp.paging
#if defined(VMSWAP)
		    + swapcluster_nused(&swc)
#endif /* defined(VMSWAP) */
		    >= uvmexp.freetarg << 2 ||
		    dirtyreacts == UVMPD_NUMDIRTYREACTS) {
			UVMHIST_LOG(pdhist," met free target: "
			    "exit loop", 0, 0, 0, 0);
			break;
		}

		/*
		 * first we have the pdpolicy select a victim page
		 * and attempt to lock the object that the page
		 * belongs to.  if our attempt fails we skip on to
		 * the next page (no harm done).  it is important to
		 * "try" locking the object as we are locking in the
		 * wrong order (pageq -> object) and we don't want to
		 * deadlock.
		 *
		 * the only time we expect to see an ownerless page
		 * (i.e. a page with no uobject and !PG_ANON) is if an
		 * anon has loaned a page from a uvm_object and the
		 * uvm_object has dropped the ownership.  in that
		 * case, the anon can "take over" the loaned page
		 * and make it its own.
		 */

		p = uvmpdpol_selectvictim(&slock);
		if (p == NULL) {
			break;
		}
		KASSERT(uvmpdpol_pageisqueued_p(p));
		KASSERT(uvm_page_owner_locked_p(p, true));
		KASSERT(p->wire_count == 0);

		/*
		 * we are below target and have a new page to consider.
		 */

		anon = p->uanon;
		uobj = p->uobject;

		if (p->flags & PG_BUSY) {
			rw_exit(slock);
			uvmexp.pdbusy++;
			continue;
		}

		/* does the page belong to an object? */
		if (uobj != NULL) {
			uvmexp.pdobscan++;
		} else {
#if defined(VMSWAP)
			KASSERT(anon != NULL);
			uvmexp.pdanscan++;
#else /* defined(VMSWAP) */
			panic("%s: anon", __func__);
#endif /* defined(VMSWAP) */
		}


		/*
		 * we now have the object locked.
		 * if the page is not swap-backed, call the object's
		 * pager to flush and free the page.
		 */

#if defined(READAHEAD_STATS)
		if ((p->flags & PG_READAHEAD) != 0) {
			p->flags &= ~PG_READAHEAD;
			uvm_ra_miss.ev_count++;
		}
#endif /* defined(READAHEAD_STATS) */

		if ((p->flags & PG_SWAPBACKED) == 0) {
			KASSERT(uobj != NULL);
			(void) (uobj->pgops->pgo_put)(uobj, p->offset,
			    p->offset + PAGE_SIZE, PGO_CLEANIT|PGO_FREE);
			continue;
		}

		/*
		 * the page is swap-backed.  remove all the permissions
		 * from the page so we can sync the modified info
		 * without any race conditions.  if the page is clean
		 * we can free it now and continue.
		 */

		pmap_page_protect(p, VM_PROT_NONE);
		if (uvm_pagegetdirty(p) == UVM_PAGE_STATUS_UNKNOWN) {
			if (pmap_clear_modify(p)) {
				uvm_pagemarkdirty(p, UVM_PAGE_STATUS_DIRTY);
			} else {
				uvm_pagemarkdirty(p, UVM_PAGE_STATUS_CLEAN);
			}
		}
		if (uvm_pagegetdirty(p) != UVM_PAGE_STATUS_DIRTY) {
			int slot;
			int pageidx;

			pageidx = p->offset >> PAGE_SHIFT;
			uvm_pagefree(p);
			atomic_inc_uint(&uvmexp.pdfreed);

			/*
			 * for anons, we need to remove the page
			 * from the anon ourselves.  for aobjs,
			 * pagefree did that for us.
			 */

			if (anon) {
				KASSERT(anon->an_swslot != 0);
				anon->an_page = NULL;
				slot = anon->an_swslot;
			} else {
				slot = uao_find_swslot(uobj, pageidx);
			}
			if (slot > 0) {
				/* this page is now only in swap. */
				KASSERT(uvmexp.swpgonly < uvmexp.swpginuse);
				atomic_inc_uint(&uvmexp.swpgonly);
			}
			rw_exit(slock);
			continue;
		}

#if defined(VMSWAP)
		/*
		 * this page is dirty, skip it if we'll have met our
		 * free target when all the current pageouts complete.
		 */

		if (uvm_availmem() + uvmexp.paging > uvmexp.freetarg << 2) {
			rw_exit(slock);
			continue;
		}

		/*
		 * free any swap space allocated to the page since
		 * we'll have to write it again with its new data.
		 */

		uvmpd_dropswap(p);

		/*
		 * start new swap pageout cluster (if necessary).
		 *
		 * if swap is full reactivate this page so that
		 * we eventually cycle all pages through the
		 * inactive queue.
		 */

		if (swapcluster_allocslots(&swc)) {
			dirtyreacts++;
			uvm_pagelock(p);
			uvm_pageactivate(p);
			uvm_pageunlock(p);
			rw_exit(slock);
			continue;
		}

		/*
		 * at this point, we're definitely going to reuse this
		 * page.  mark the page busy and delayed-free.
		 * we should remove the page from the page queues
		 * so we don't ever look at it again.
		 * adjust counters and such.
		 */

		p->flags |= PG_BUSY;
		UVM_PAGE_OWN(p, "scan_queue");
		p->flags |= PG_PAGEOUT;
		uvmexp.pgswapout++;

		uvm_pagelock(p);
		uvm_pagedequeue(p);
		uvm_pageunlock(p);

		/*
		 * add the new page to the cluster.
		 */

		if (swapcluster_add(&swc, p)) {
			p->flags &= ~(PG_BUSY|PG_PAGEOUT);
			UVM_PAGE_OWN(p, NULL);
			dirtyreacts++;
			uvm_pagelock(p);
			uvm_pageactivate(p);
			uvm_pageunlock(p);
			rw_exit(slock);
			continue;
		}
		rw_exit(slock);

		swapcluster_flush(&swc, false);

		/*
		 * the pageout is in progress.  bump counters and set up
		 * for the next loop.
		 */

		atomic_inc_uint(&uvmexp.pdpending);

#else /* defined(VMSWAP) */
		uvm_pagelock(p);
		uvm_pageactivate(p);
		uvm_pageunlock(p);
		rw_exit(slock);
#endif /* defined(VMSWAP) */
	}

	uvmpdpol_scanfini();

#if defined(VMSWAP)
	swapcluster_flush(&swc, true);
#endif /* defined(VMSWAP) */
}

/*
 * uvmpd_scan: scan the page queues and attempt to meet our targets.
 */

static void
uvmpd_scan(void)
{
	int swap_shortage, pages_freed, fpages;
	UVMHIST_FUNC("uvmpd_scan"); UVMHIST_CALLED(pdhist);

	uvmexp.pdrevs++;

	/*
	 * work on meeting our targets.  first we work on our free target
	 * by converting inactive pages into free pages.  then we work on
	 * meeting our inactive target by converting active pages to
	 * inactive ones.
	 */

	UVMHIST_LOG(pdhist, " starting 'free' loop",0,0,0,0);

	pages_freed = uvmexp.pdfreed;
	uvmpd_scan_queue();
	pages_freed = uvmexp.pdfreed - pages_freed;

	/*
	 * detect if we're not going to be able to page anything out
	 * until we free some swap resources from active pages.
	 */

	swap_shortage = 0;
	fpages = uvm_availmem();
	if (fpages < uvmexp.freetarg &&
	    uvmexp.swpginuse >= uvmexp.swpgavail &&
	    !uvm_swapisfull() &&
	    pages_freed == 0) {
		swap_shortage = uvmexp.freetarg - fpages;
	}

	uvmpdpol_balancequeue(swap_shortage);

	/*
	 * if still below the minimum target, try unloading kernel
	 * modules.
	 */

	if (uvm_availmem() < uvmexp.freemin) {
		module_thread_kick();
	}
}

/*
 * uvm_reclaimable: decide whether to wait for pagedaemon.
 *
 * => return true if it seems worth doing uvm_wait.
 *
 * XXX should be tunable.
 * XXX should consider pools, etc?
 */

bool
uvm_reclaimable(void)
{
	int filepages;
	int active, inactive;

	/*
	 * if swap is not full, no problem.
	 */

	if (!uvm_swapisfull()) {
		return true;
	}

	/*
	 * file-backed pages can be reclaimed even when swap is full.
	 * if we have more than 1/16 of pageable memory or 5MB, try to reclaim.
	 *
	 * XXX assume the worst case, ie. all wired pages are file-backed.
	 *
	 * XXX should consider other reclaimable memory.
	 * XXX ie. pools, traditional buffer cache.
	 */

	cpu_count_sync_all();
	filepages = (int)cpu_count_get(CPU_COUNT_FILEPAGES) +
	    (int)cpu_count_get(CPU_COUNT_EXECPAGES) - uvmexp.wired;
	uvm_estimatepageable(&active, &inactive);
	if (filepages >= MIN((active + inactive) >> 4,
	    5 * 1024 * 1024 >> PAGE_SHIFT)) {
		return true;
	}

	/*
	 * kill the process, fail allocation, etc..
	 */

	return false;
}

void
uvm_estimatepageable(int *active, int *inactive)
{

	uvmpdpol_estimatepageable(active, inactive);
}


/*
 * Use a separate thread for draining pools.
 * This work can't be done from the main pagedaemon thread because
 * some pool allocators need to take vm_map locks.
 */

static void
uvmpd_pool_drain_thread(void *arg)
{
	struct pool *firstpool, *curpool;
	int bufcnt, lastslept;
	bool cycled;

	firstpool = NULL;
	cycled = true;
	for (;;) {
		/*
		 * sleep until awoken by the pagedaemon.
		 */
		mutex_enter(&uvmpd_lock);
		if (!uvmpd_pool_drain_run) {
			lastslept = getticks();
			cv_wait(&uvmpd_pool_drain_cv, &uvmpd_lock);
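			/*
			 * if at least one tick passed while we were asleep,
			 * start a fresh drain cycle: forget which pool we
			 * started from and clear the rate-limit flag.
			 */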
			if (getticks() != lastslept) {
				cycled = false;
				firstpool = NULL;
			}
		}
		uvmpd_pool_drain_run = false;
		mutex_exit(&uvmpd_lock);

		/*
		 * rate limit draining, otherwise in desperate circumstances
		 * this can totally saturate the system with xcall activity.
		 */
		if (cycled) {
			kpause("uvmpdlmt", false, 1, NULL);
			cycled = false;
			firstpool = NULL;
		}

		/*
		 * drain and temporarily disable the freelist cache.
		 */
		uvm_pgflcache_pause();

		/*
		 * kill unused metadata buffers.
		 */
		bufcnt = uvmexp.freetarg - uvm_availmem();
		if (bufcnt < 0)
			bufcnt = 0;

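		/* bufcnt is in pages; buf_drain() takes a byte count. */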
		mutex_enter(&bufcache_lock);
		buf_drain(bufcnt << PAGE_SHIFT);
		mutex_exit(&bufcache_lock);

		/*
		 * drain a pool, and then re-enable the freelist cache.
		 */
		(void)pool_drain(&curpool);
		KASSERT(curpool != NULL);
		if (firstpool == NULL) {
			firstpool = curpool;
		} else if (firstpool == curpool) {
			cycled = true;
		}
		uvm_pgflcache_resume();
	}
	/*NOTREACHED*/
}

static void
uvmpd_pool_drain_wakeup(void)
{

	mutex_enter(&uvmpd_lock);
	uvmpd_pool_drain_run = true;
	cv_signal(&uvmpd_pool_drain_cv);
	mutex_exit(&uvmpd_lock);
}