/*	$NetBSD: uvm_pdaemon.c,v 1.93.4.2.4.10 2012/04/12 19:41:57 matt Exp $	*/

/*
 * Copyright (c) 1997 Charles D. Cranor and Washington University.
 * Copyright (c) 1991, 1993, The Regents of the University of California.
 *
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by Charles D. Cranor,
 *	Washington University, the University of California, Berkeley and
 *	its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vm_pageout.c	8.5 (Berkeley) 2/14/94
 * from: Id: uvm_pdaemon.c,v 1.1.2.32 1998/02/06 05:26:30 chs Exp
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 * uvm_pdaemon.c: the page daemon
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_pdaemon.c,v 1.93.4.2.4.10 2012/04/12 19:41:57 matt Exp $");

#include "opt_uvmhist.h"
#include "opt_readahead.h"

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/pool.h>
#include <sys/buf.h>
#include <sys/atomic.h>

#include <uvm/uvm.h>
#include <uvm/uvm_pdpolicy.h>

/*
 * UVMPD_NUMDIRTYREACTS is how many dirty pages the pagedaemon will reactivate
 * in a pass thru the inactive list when swap is full.  the value should be
 * "small"... if it's too large we'll cycle the active pages thru the inactive
 * queue too quickly for them to be referenced and avoid being freed.
 */

#define	UVMPD_NUMDIRTYREACTS	16

#define	UVMPD_NUMTRYLOCKOWNER	16
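
/*
 * UVMPD_NUMTRYLOCKOWNER is how many consecutive failures to trylock a
 * page's owner uvmpd_scan_queue() will tolerate before it briefly
 * kpause()s to give the lock holder a chance to run (see the "livelock"
 * pause in uvmpd_scan_queue() below).
 */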

/*
 * local prototypes
 */

static bool	uvmpd_scan(struct uvm_pggroup *);
static void	uvmpd_scan_queue(struct uvm_pggroup *);
static void	uvmpd_tune(void);

static void	uvmpd_checkgroup(const struct uvm_pggroup *);

static struct uvm_pdinfo {
	unsigned int pd_waiters;
	unsigned int pd_scans_neededs;
	struct uvm_pggrouplist pd_pagingq;
	struct uvm_pggrouplist pd_pendingq;
} uvm_pdinfo = {
	.pd_pagingq = TAILQ_HEAD_INITIALIZER(uvm_pdinfo.pd_pagingq),
	.pd_pendingq = TAILQ_HEAD_INITIALIZER(uvm_pdinfo.pd_pendingq),
};

/*
 * XXX hack to avoid hangs when large processes fork.
 */
u_int uvm_extrapages;

/*
 * uvm_wait: wait (sleep) for the page daemon to free some pages
 *
 * => should be called with all locks released
 * => should _not_ be called by the page daemon (to avoid deadlock)
 */

void
uvm_wait(const char *wmsg)
{
	int timo = 0;

	mutex_spin_enter(&uvm_fpageqlock);

	/*
	 * check for page daemon going to sleep (waiting for itself)
	 */

	if (curlwp == uvm.pagedaemon_lwp && uvmexp.paging == 0) {
		/*
		 * now we have a problem: the pagedaemon wants to go to
		 * sleep until it frees more memory.  but how can it
		 * free more memory if it is asleep?  that is a deadlock.
		 * we have two options:
		 *  [1] panic now
		 *  [2] put a timeout on the sleep, thus causing the
		 *      pagedaemon to only pause (rather than sleep forever)
		 *
		 * note that option [2] will only help us if we get lucky
		 * and some other process on the system breaks the deadlock
		 * by exiting or freeing memory (thus allowing the pagedaemon
		 * to continue).  for now we panic if DEBUG is defined,
		 * otherwise we hope for the best with option [2] (better
		 * yet, this should never happen in the first place!).
		 */

		printf("pagedaemon: deadlock detected!\n");
		timo = hz >> 3;		/* set timeout */
#if defined(DEBUG)
		/* DEBUG: panic so we can debug it */
		panic("pagedaemon deadlock");
#endif
	}

	uvm_pdinfo.pd_waiters++;
	wakeup(&uvm.pagedaemon);	/* wake the daemon! */
	UVM_UNLOCK_AND_WAIT(&uvmexp.free, &uvm_fpageqlock, false, wmsg, timo);
	uvm_pdinfo.pd_waiters--;
}
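
/*
 * Illustrative caller sketch (not from this file; "uobj", "off" and the
 * wait message are stand-ins): a caller that cannot make progress without
 * memory typically retries its allocation around uvm_wait(), e.g.
 *
 *	struct vm_page *pg;
 *
 *	while ((pg = uvm_pagealloc(uobj, off, NULL, 0)) == NULL)
 *		uvm_wait("pgalloc");
 *
 * with all of its own locks dropped before sleeping, per the rules above.
 */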

static void
uvmpd_checkgroup(const struct uvm_pggroup *grp)
{
#ifdef DEBUG
	struct uvm_pdinfo * const pdinfo = &uvm_pdinfo;
	bool in_pendingq = false;
	bool in_pagingq = false;
	const struct uvm_pggroup *tstgrp;

	TAILQ_FOREACH(tstgrp, &pdinfo->pd_pendingq, pgrp_pending_link) {
		if (tstgrp == grp) {
			in_pendingq = true;
			break;
		}
	}

	TAILQ_FOREACH(tstgrp, &pdinfo->pd_pagingq, pgrp_paging_link) {
		if (tstgrp == grp) {
			in_pagingq = true;
			break;
		}
	}

	if (grp->pgrp_paging > 0) {
		KASSERT(in_pagingq);
		KASSERT(!in_pendingq);
	} else {
		KASSERT(!in_pagingq);
		KASSERT(in_pendingq == grp->pgrp_scan_needed);
	}
#endif
}

/*
 * uvm_kick_pdaemon: perform checks to determine if we need to
 * give the pagedaemon a nudge, and do so if necessary.
 *
 * => called with uvm_fpageqlock held.
 */

void
uvm_kick_pdaemon(void)
{
	struct uvm_pdinfo * const pdinfo = &uvm_pdinfo;
	bool need_wakeup = false;
	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);

	KASSERT(mutex_owned(&uvm_fpageqlock));

	struct uvm_pggroup *grp;
	STAILQ_FOREACH(grp, &uvm.page_groups, pgrp_uvm_link) {
		const bool prev_scan_needed = grp->pgrp_scan_needed;

		KASSERT(grp->pgrp_npages > 0);
		uvmpd_checkgroup(grp);

		grp->pgrp_scan_needed =
		    grp->pgrp_free + grp->pgrp_paging < grp->pgrp_freemin
		    || (grp->pgrp_free + grp->pgrp_paging < grp->pgrp_freetarg
			&& uvmpdpol_needsscan_p(grp));

		if (prev_scan_needed != grp->pgrp_scan_needed) {
			UVMHIST_LOG(pdhist, " [%zd] %d->%d (scan=%d)",
			    grp - uvm.pggroups, prev_scan_needed,
			    grp->pgrp_scan_needed, uvmpdpol_needsscan_p(grp));
			UVMHIST_LOG(pdhist, " [%zd] %d < min(%d,%d)",
			    grp - uvm.pggroups,
			    grp->pgrp_free + grp->pgrp_paging,
			    grp->pgrp_freemin, grp->pgrp_freetarg);
		}

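		/*
		 * Note: pd_pendingq is kept sorted by increasing pgrp_free
		 * (the insertion below stops at the first group with more
		 * free pages), so the most depleted group sits at the head
		 * and is the first one uvm_pageout() picks up.
		 */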
		if (prev_scan_needed != grp->pgrp_scan_needed) {
			if (grp->pgrp_scan_needed) {
				struct uvm_pggroup *prev;
				TAILQ_FOREACH(prev, &pdinfo->pd_pendingq,
				    pgrp_pending_link) {
					if (grp->pgrp_free < prev->pgrp_free)
						break;
				}
				if (prev == NULL) {
					TAILQ_INSERT_TAIL(&pdinfo->pd_pendingq,
					    grp, pgrp_pending_link);
				} else {
					TAILQ_INSERT_BEFORE(prev, grp,
					    pgrp_pending_link);
				}
				need_wakeup = true;
			} else {
				TAILQ_REMOVE(&pdinfo->pd_pendingq,
				    grp, pgrp_pending_link);
			}
			uvmpd_checkgroup(grp);
		}
	}

	if (need_wakeup)
		wakeup(&uvm.pagedaemon);

	UVMHIST_LOG(pdhist, " <- done: wakeup=%d!",
	    need_wakeup, 0, 0, 0);
}

/*
 * uvmpd_tune: tune paging parameters
 *
 * => called whenever memory is added to (or removed from?) the system
 * => caller must call with page queues locked
 */

static void
uvmpd_tune(void)
{
	u_int extrapages = atomic_swap_uint(&uvm_extrapages, 0) / uvmexp.ncolors;
	u_int freemin = 0;
	u_int freetarg = 0;
	u_int wiredmax = 0;

	UVMHIST_FUNC("uvmpd_tune"); UVMHIST_CALLED(pdhist);

	extrapages = roundup(extrapages, uvmexp.npggroups);

	struct uvm_pggroup *grp;
	STAILQ_FOREACH(grp, &uvm.page_groups, pgrp_uvm_link) {
		KASSERT(grp->pgrp_npages > 0);

		/*
		 * try to keep 0.5% of available RAM free, but limit
		 * to between 128k and 1024k per-CPU.
		 * XXX: what are these values good for?
		 */
		u_int val = grp->pgrp_npages / 200;
		val = MAX(val, (128*1024) >> PAGE_SHIFT);
		val = MIN(val, (1024*1024) >> PAGE_SHIFT);
		val *= ncpu;

		/* Make sure there's always a user page free. */
		if (val * uvmexp.npggroups <= uvmexp.reserve_kernel)
			val = uvmexp.reserve_kernel / uvmexp.npggroups + 1;

		grp->pgrp_freemin = val;

		/* Calculate freetarg. */
		val = (grp->pgrp_freemin * 4) / 3;
		if (val <= grp->pgrp_freemin)
			val = grp->pgrp_freemin + 1;
#ifdef VM_FREELIST_NORMALOK_P
		if (!VM_FREELIST_NORMALOK_P(grp->pgrp_free_list))
			val *= 4;
#endif
		grp->pgrp_freetarg = val + extrapages / uvmexp.npggroups;
		if (grp->pgrp_freetarg > grp->pgrp_npages / 2)
			grp->pgrp_freetarg = grp->pgrp_npages / 2;

		grp->pgrp_wiredmax = grp->pgrp_npages / 3;
		UVMHIST_LOG(pdhist,
		    "[%zd]: freemin=%d, freetarg=%d, wiredmax=%d",
		    grp - uvm.pggroups, grp->pgrp_freemin, grp->pgrp_freetarg,
		    grp->pgrp_wiredmax);

		freemin += grp->pgrp_freemin;
		freetarg += grp->pgrp_freetarg;
		wiredmax += grp->pgrp_wiredmax;
	}

	uvmexp.freemin = freemin;
	uvmexp.freetarg = freetarg;
	uvmexp.wiredmax = wiredmax;

	UVMHIST_LOG(pdhist, "<- done, freemin=%d, freetarg=%d, wiredmax=%d",
	    uvmexp.freemin, uvmexp.freetarg, uvmexp.wiredmax, 0);
}
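
/*
 * Illustrative arithmetic for the tuning above (assumptions: 4 KB pages,
 * ncpu == 1, a single page group of 65536 pages, no extrapages, a small
 * reserve_kernel, and the VM_FREELIST_NORMALOK_P scaling not applicable):
 *
 *	val      = 65536 / 200            = 327
 *	val      = MIN(MAX(327, 32), 256) = 256	-> freemin  = 256 pages (1 MB)
 *	freetarg = (256 * 4) / 3          = 341 pages (~1.3 MB)
 *	wiredmax = 65536 / 3              = 21845 pages
 */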

/*
 * uvm_pageout: the main loop for the pagedaemon
 */

void
uvm_pageout(void *arg)
{
	u_int npages = 0;
	u_int extrapages = 0;
	u_int npggroups = 0;
	struct pool *pp;
	uint64_t where;
	struct uvm_pdinfo * const pdinfo = &uvm_pdinfo;
	bool progress = true;
	UVMHIST_FUNC("uvm_pageout"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist,"<starting uvm pagedaemon>", 0, 0, 0, 0);

	/*
	 * ensure correct priority and set paging parameters...
	 */

	uvm.pagedaemon_lwp = curlwp;
	mutex_enter(&uvm_pageqlock);
	npages = uvmexp.npages;
	uvmpd_tune();
	mutex_exit(&uvm_pageqlock);

	/*
	 * main loop
	 */

	for (;;) {
		struct uvm_pggroup *grp;
		bool need_free = false;
		u_int bufcnt = 0;

		mutex_spin_enter(&uvm_fpageqlock);
		/*
		 * If we made no progress, or no one is waiting and no
		 * groups are pending, then sleep.
		 */
		if (progress == false
		    || (pdinfo->pd_waiters == 0
			&& TAILQ_FIRST(&pdinfo->pd_pendingq) == NULL)) {
			UVMHIST_LOG(pdhist," <<SLEEPING>>",0,0,0,0);
			int timo = 0;
			if (!progress && pdinfo->pd_waiters > 0)
				timo = 2 * hz;
			UVM_UNLOCK_AND_WAIT(&uvm.pagedaemon,
			    &uvm_fpageqlock, false, "pgdaemon", timo);
			uvmexp.pdwoke++;
			UVMHIST_LOG(pdhist," <<WOKE UP>>",0,0,0,0);
			progress = false;
		} else if (TAILQ_FIRST(&pdinfo->pd_pendingq) == NULL) {
			/*
			 * Someone is waiting but no groups are pending.
			 * Let's kick ourselves to find groups that need work.
			 */
			uvm_kick_pdaemon();
			mutex_spin_exit(&uvm_fpageqlock);
		} else {
			mutex_spin_exit(&uvm_fpageqlock);
		}

		/*
		 * now lock page queues and recompute inactive count
		 */

		mutex_enter(&uvm_pageqlock);
		mutex_spin_enter(&uvm_fpageqlock);

		if (npages != uvmexp.npages
		    || extrapages != uvm_extrapages
		    || npggroups != uvmexp.npggroups) {
			npages = uvmexp.npages;
			extrapages = uvm_extrapages;
			npggroups = uvmexp.npggroups;
			uvmpd_tune();
		}

		/*
		 * Estimate a hint.  Note that bufmem is returned to the
		 * system only when an entire pool page is empty.
		 */
		bool need_wakeup = false;
		while ((grp = TAILQ_FIRST(&pdinfo->pd_pendingq)) != NULL) {
			KASSERT(grp->pgrp_npages > 0);

			uvmpdpol_tune(grp);

			/*
			 * While we are locked, remove this from the pendingq.
			 */
			uvmpd_checkgroup(grp);
			KASSERT(grp->pgrp_scan_needed);
			TAILQ_REMOVE(&pdinfo->pd_pendingq, grp,
			    pgrp_pending_link);
			grp->pgrp_scan_needed = false;
			uvmpd_checkgroup(grp);

			int diff = grp->pgrp_freetarg - grp->pgrp_free;
			if (diff < 0)
				diff = 0;

			bufcnt += diff;

			UVMHIST_LOG(pdhist," [%zu]: "
			    "free/ftarg/fmin=%u/%u/%u",
			    grp - uvm.pggroups, grp->pgrp_free,
			    grp->pgrp_freetarg, grp->pgrp_freemin);

			if (grp->pgrp_paging < diff)
				need_free = true;

			/*
			 * scan if needed
			 */
			if (grp->pgrp_paging < diff
			    || uvmpdpol_needsscan_p(grp)) {
				mutex_spin_exit(&uvm_fpageqlock);
				if (uvmpd_scan(grp))
					progress = true;
				mutex_spin_enter(&uvm_fpageqlock);
			} else {
				UVMHIST_LOG(pdhist,
				    " [%zu]: diff/paging=%u/%u: "
				    "scan skipped",
				    grp - uvm.pggroups, diff,
				    grp->pgrp_paging, 0);
			}

			/*
			 * if there's any free memory to be had,
			 * wake up any waiters.
			 */
			if (grp->pgrp_free * uvmexp.npggroups > uvmexp.reserve_kernel
			    || grp->pgrp_paging == 0) {
				need_wakeup = true;
			}

		}
		if (need_wakeup) {
			wakeup(&uvmexp.free);
		}
		KASSERT(!need_free || need_wakeup);
		mutex_spin_exit(&uvm_fpageqlock);

		/*
		 * scan done.  unlock page queues (the only lock
		 * we are holding)
		 */
		mutex_exit(&uvm_pageqlock);

		/*
		 * if we don't need free memory, we're done.
		 */

		if (!need_free)
			continue;

		/*
		 * start draining pool resources now that we're not
		 * holding any locks.
		 */
		pool_drain_start(&pp, &where);

		/*
		 * kill unused metadata buffers.
		 */
		if (bufcnt > 0) {
			mutex_enter(&bufcache_lock);
			buf_drain(bufcnt << PAGE_SHIFT);
			mutex_exit(&bufcache_lock);
		}

		/*
		 * complete draining the pools.
		 */
		pool_drain_end(pp, where);
	}
	/*NOTREACHED*/
}


/*
 * uvm_aiodone_worker: a workqueue callback for the aiodone daemon.
 */

void
uvm_aiodone_worker(struct work *wk, void *dummy)
{
	struct buf *bp = (void *)wk;

	KASSERT(&bp->b_work == wk);

	/*
	 * process an i/o that's done.
	 */

	(*bp->b_iodone)(bp);
}

void
uvm_pageout_start(struct uvm_pggroup *grp, u_int npages)
{
	struct uvm_pdinfo * const pdinfo = &uvm_pdinfo;

	mutex_spin_enter(&uvm_fpageqlock);

	uvmpd_checkgroup(grp);
	uvmexp.paging += npages;
	if (grp->pgrp_paging == 0) {
		/*
		 * A group on the paging queue can't also be on the pending
		 * queue, so remove it from the pending queue if it is there.
		 */
		if (grp->pgrp_scan_needed) {
			TAILQ_REMOVE(&pdinfo->pd_pendingq, grp,
			    pgrp_pending_link);
			grp->pgrp_scan_needed = false;
		}
		TAILQ_INSERT_TAIL(&pdinfo->pd_pagingq, grp, pgrp_paging_link);
	}
	grp->pgrp_paging += npages;
	uvmpd_checkgroup(grp);
	mutex_spin_exit(&uvm_fpageqlock);
}

void
uvm_pageout_done(struct vm_page *pg, bool freed)
{
	struct uvm_pdinfo * const pdinfo = &uvm_pdinfo;

	KASSERT(pg->flags & PG_PAGEOUT);

	mutex_spin_enter(&uvm_fpageqlock);
	struct uvm_pggroup * const grp = uvm_page_to_pggroup(pg);

	KASSERT(grp->pgrp_paging > 0);
	uvmpd_checkgroup(grp);
	if (--grp->pgrp_paging == 0) {
		TAILQ_REMOVE(&pdinfo->pd_pagingq, grp, pgrp_paging_link);
		uvmpd_checkgroup(grp);
	}
	KASSERT(uvmexp.paging > 0);
	uvmexp.paging--;
	grp->pgrp_pdfreed += freed;

	/*
	 * Page is no longer being paged out.
	 */
	pg->flags &= ~PG_PAGEOUT;

	/*
	 * wake up either the pagedaemon or the LWPs waiting for free memory.
	 */
	if (grp->pgrp_free * uvmexp.npggroups <= uvmexp.reserve_kernel) {
		wakeup(&uvm.pagedaemon);
	} else {
		wakeup(&uvmexp.free);
	}

	mutex_spin_exit(&uvm_fpageqlock);
}
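
/*
 * Accounting note: uvm_pageout_start() bumps uvmexp.paging and
 * grp->pgrp_paging by the whole cluster size (see swapcluster_flush()
 * below), while uvm_pageout_done() is called once per page, presumably
 * from the i/o completion path, and decrements both counters by one.
 * A group therefore stays on pd_pagingq until its last in-flight
 * pageout finishes.
 */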

/*
 * uvmpd_trylockowner: trylock the page's owner.
 *
 * => called with pageq locked.
 * => resolve orphaned O->A loaned page.
 * => return the locked mutex on success.  otherwise, return NULL.
 */

kmutex_t *
uvmpd_trylockowner(struct vm_page *pg)
{
	struct uvm_object *uobj = pg->uobject;
	kmutex_t *slock;

	KASSERT(mutex_owned(&uvm_pageqlock));

	if (uobj != NULL) {
		slock = &uobj->vmobjlock;
	} else {
		struct vm_anon *anon = pg->uanon;

		KASSERT(anon != NULL);
		slock = &anon->an_lock;
	}

	if (!mutex_tryenter(slock)) {
		return NULL;
	}

	if (uobj == NULL) {

		/*
		 * set PQ_ANON if it isn't set already.
		 */

		if ((pg->pqflags & PQ_ANON) == 0) {
			KASSERT(pg->loan_count > 0);
			pg->loan_count--;
			pg->pqflags |= PQ_ANON;
			/* anon now owns it */
		}
	}

	return slock;
}
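
/*
 * Callers in this file follow the same pattern (compare
 * uvmpd_trydropswap() below):
 *
 *	slock = uvmpd_trylockowner(pg);
 *	if (slock == NULL)
 *		continue;		(skip the page, no harm done)
 *	...operate on pg with its owner locked...
 *	mutex_exit(slock);
 *
 * A trylock is used because the natural lock order is object/anon ->
 * pageq, while here we already hold uvm_pageqlock.
 */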

#if defined(VMSWAP)
struct swapcluster {
	int swc_slot;
	int swc_nallocated;
	int swc_nused;
	struct vm_page *swc_pages[howmany(MAXPHYS, MIN_PAGE_SIZE)];
};
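
/*
 * A swapcluster batches up to howmany(MAXPHYS, MIN_PAGE_SIZE) swap-backed
 * pages into one contiguous run of swap slots so they can be written with
 * a single uvm_swap_put().  With a typical MAXPHYS of 64 KB and 4 KB pages
 * that is 16 pages per cluster (illustrative figures only).
 */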

static void
swapcluster_init(struct swapcluster *swc)
{

	swc->swc_slot = 0;
	swc->swc_nused = 0;
}

static int
swapcluster_allocslots(struct swapcluster *swc)
{
	int slot;
	int npages;

	if (swc->swc_slot != 0) {
		return 0;
	}

	/*
	 * Even with strange MAXPHYS, the shift implicitly rounds
	 * down to a page.
	 */
	npages = MAXPHYS >> PAGE_SHIFT;
	slot = uvm_swap_alloc(&npages, true);
	if (slot == 0) {
		return ENOMEM;
	}
	swc->swc_slot = slot;
	swc->swc_nallocated = npages;
	swc->swc_nused = 0;

	return 0;
}

static int
swapcluster_add(struct swapcluster *swc, struct vm_page *pg)
{
	int slot;
	struct uvm_object *uobj;

	KASSERT(swc->swc_slot != 0);
	KASSERT(swc->swc_nused < swc->swc_nallocated);
	KASSERT((pg->pqflags & PQ_SWAPBACKED) != 0);

	slot = swc->swc_slot + swc->swc_nused;
	uobj = pg->uobject;
	if (uobj == NULL) {
		KASSERT(mutex_owned(&pg->uanon->an_lock));
		pg->uanon->an_swslot = slot;
	} else {
		int result;

		KASSERT(mutex_owned(&uobj->vmobjlock));
		result = uao_set_swslot(uobj, pg->offset >> PAGE_SHIFT, slot);
		if (result == -1) {
			return ENOMEM;
		}
	}
	swc->swc_pages[swc->swc_nused] = pg;
	swc->swc_nused++;

	return 0;
}

static void
swapcluster_flush(struct uvm_pggroup *grp, struct swapcluster *swc, bool now)
{
	int slot;
	u_int nused;
	int nallocated;
	int error;

	if (swc->swc_slot == 0) {
		return;
	}
	KASSERT(swc->swc_nused <= swc->swc_nallocated);

	slot = swc->swc_slot;
	nused = swc->swc_nused;
	nallocated = swc->swc_nallocated;

	/*
	 * if this is the final pageout we could have a few
	 * unused swap blocks.  if so, free them now.
	 */

	if (nused < nallocated) {
		if (!now) {
			return;
		}
		uvm_swap_free(slot + nused, nallocated - nused);
	}

	/*
	 * now start the pageout.
	 */

	if (nused > 0) {
		grp->pgrp_pdpageouts++;
		uvmexp.pdpageouts++;	/* procfs */
		uvm_pageout_start(grp, nused);
		error = uvm_swap_put(slot, swc->swc_pages, nused, 0);
		KASSERT(error == 0 || error == ENOMEM);
	}

	/*
	 * zero swslot to indicate that we are
	 * no longer building a swap-backed cluster.
	 */

	swc->swc_slot = 0;
	swc->swc_nused = 0;
}

static int
swapcluster_nused(struct swapcluster *swc)
{

	return swc->swc_nused;
}
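
/*
 * Cluster life cycle, as used by uvmpd_scan_queue() below:
 * swapcluster_init() once per scan, then for each dirty swap-backed
 * victim swapcluster_allocslots() (a no-op while a cluster is open),
 * swapcluster_add(), and swapcluster_flush(..., false), which only
 * issues the i/o once the cluster is full.  A final
 * swapcluster_flush(..., true) at the end of the scan writes out any
 * partial cluster and frees the unused slots.
 */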

/*
 * uvmpd_dropswap: free any swap allocated to this page.
 *
 * => called with owner locked.
 * => return true if a page had an associated slot.
 */

static bool
uvmpd_dropswap(struct vm_page *pg)
{
	bool result = false;
	struct vm_anon *anon = pg->uanon;

	if ((pg->pqflags & PQ_ANON) && anon->an_swslot) {
		uvm_swap_free(anon->an_swslot, 1);
		anon->an_swslot = 0;
		pg->flags &= ~PG_CLEAN;
		result = true;
	} else if (pg->pqflags & PQ_AOBJ) {
		int slot = uao_set_swslot(pg->uobject,
		    pg->offset >> PAGE_SHIFT, 0);
		if (slot) {
			uvm_swap_free(slot, 1);
			pg->flags &= ~PG_CLEAN;
			result = true;
		}
	}

	return result;
}

/*
 * uvmpd_trydropswap: try to free any swap allocated to this page.
 *
 * => return true if a slot is successfully freed.
 */

bool
uvmpd_trydropswap(struct vm_page *pg)
{
	kmutex_t *slock;
	bool result;

	if ((pg->flags & PG_BUSY) != 0) {
		return false;
	}

	/*
	 * lock the page's owner.
	 */

	slock = uvmpd_trylockowner(pg);
	if (slock == NULL) {
		return false;
	}

	/*
	 * skip this page if it's busy.
	 */

	if ((pg->flags & PG_BUSY) != 0) {
		mutex_exit(slock);
		return false;
	}

	result = uvmpd_dropswap(pg);

	mutex_exit(slock);

	return result;
}

#endif /* defined(VMSWAP) */

/*
 * uvmpd_scan_queue: scan a replace-candidate list for pages
 * to clean or free.
 *
 * => called with page queues locked
 * => we work on meeting our free target by converting inactive pages
 *    into free pages.
 * => we handle the building of swap-backed clusters
 */

static void
uvmpd_scan_queue(struct uvm_pggroup *grp)
{
	struct vm_page *pg;
	struct uvm_object *uobj;
	struct vm_anon *anon;
#if defined(VMSWAP)
	struct swapcluster swc;
#endif /* defined(VMSWAP) */
	u_int dirtyreacts;
	u_int lockownerfail;
	u_int victims;
	u_int freed;
	u_int busy;
	kmutex_t *slock;
	UVMHIST_FUNC("uvmpd_scan_queue"); UVMHIST_CALLED(pdhist);

	/*
	 * swslot is non-zero if we are building a swap cluster.  we want
	 * to stay in the loop while we have a page to scan or we have
	 * a swap-cluster to build.
	 */

#if defined(VMSWAP)
	swapcluster_init(&swc);
#endif /* defined(VMSWAP) */

	dirtyreacts = 0;
	lockownerfail = 0;
	victims = 0;
	freed = 0;
	busy = 0;
	uvmpdpol_scaninit(grp);

	UVMHIST_LOG(pdhist," [%zd]: want free target (%u)",
	    grp - uvm.pggroups, grp->pgrp_freetarg << 2, 0, 0);
	while (/* CONSTCOND */ 1) {

		/*
		 * see if we've met the free target.
		 */

		if (grp->pgrp_free + grp->pgrp_paging
#if defined(VMSWAP)
		    + swapcluster_nused(&swc)
#endif /* defined(VMSWAP) */
		    >= grp->pgrp_freetarg << 2 ||
		    dirtyreacts == UVMPD_NUMDIRTYREACTS) {
			UVMHIST_LOG(pdhist," [%zd]: met free target (%u + %u)"
			    ", dirty reacts %u",
			    grp - uvm.pggroups, grp->pgrp_free,
			    grp->pgrp_paging, dirtyreacts);
			break;
		}

		pg = uvmpdpol_selectvictim(grp);
		if (pg == NULL) {
			UVMHIST_LOG(pdhist," [%zd]: selectvictim didn't",
			    grp - uvm.pggroups, 0, 0, 0);
			break;
		}
		victims++;
		KASSERT(uvmpdpol_pageisqueued_p(pg));
		KASSERT(pg->wire_count == 0);

		/*
		 * we are below target and have a new page to consider.
		 */

		anon = pg->uanon;
		uobj = pg->uobject;

		/*
		 * first we attempt to lock the object that this page
		 * belongs to.  if our attempt fails we skip on to
		 * the next page (no harm done).  it is important to
		 * "try" locking the object as we are locking in the
		 * wrong order (pageq -> object) and we don't want to
		 * deadlock.
		 *
		 * the only time we expect to see an ownerless page
		 * (i.e. a page with no uobject and !PQ_ANON) is if an
		 * anon has loaned a page from a uvm_object and the
		 * uvm_object has dropped the ownership.  in that
		 * case, the anon can "take over" the loaned page
		 * and make it its own.
		 */

		slock = uvmpd_trylockowner(pg);
		if (slock == NULL) {
			/*
			 * yield the cpu to give the LWP holding the lock a
			 * chance to run.  otherwise we can busy-loop for too
			 * long if the page queue is filled with many pages
			 * from only a few objects.
			 */
			lockownerfail++;
			if (lockownerfail > UVMPD_NUMTRYLOCKOWNER) {
				mutex_exit(&uvm_pageqlock);
				/* XXX Better than yielding but inadequate. */
				kpause("livelock", false, 1, NULL);
				mutex_enter(&uvm_pageqlock);
				lockownerfail = 0;
			}
			continue;
		}
		if (pg->flags & PG_BUSY) {
			mutex_exit(slock);
			busy++;
			continue;
		}

		/* does the page belong to an object? */
		if (uobj != NULL) {
			grp->pgrp_pdobscan++;
		} else {
#if defined(VMSWAP)
			KASSERT(anon != NULL);
			grp->pgrp_pdanscan++;
#else /* defined(VMSWAP) */
			panic("%s: anon", __func__);
#endif /* defined(VMSWAP) */
		}


		/*
		 * we now have the object and the page queues locked.
		 * if the page is not swap-backed, call the object's
		 * pager to flush and free the page.
		 */

#if defined(READAHEAD_STATS)
		if ((pg->pqflags & PQ_READAHEAD) != 0) {
			pg->pqflags &= ~PQ_READAHEAD;
			uvm_ra_miss.ev_count++;
		}
#endif /* defined(READAHEAD_STATS) */

		if ((pg->pqflags & PQ_SWAPBACKED) == 0) {
			KASSERT(uobj != NULL);
			mutex_exit(&uvm_pageqlock);
			(void) (uobj->pgops->pgo_put)(uobj, pg->offset,
			    pg->offset + PAGE_SIZE, PGO_CLEANIT|PGO_FREE);
			grp->pgrp_pdputs++;
			mutex_enter(&uvm_pageqlock);
			continue;
		}

		/*
		 * the page is swap-backed.  remove all the permissions
		 * from the page so we can sync the modified info
		 * without any race conditions.  if the page is clean
		 * we can free it now and continue.
		 */

		pmap_page_protect(pg, VM_PROT_NONE);
		if ((pg->flags & PG_CLEAN) && pmap_clear_modify(pg)) {
			pg->flags &= ~(PG_CLEAN);
		}
		if (pg->flags & PG_CLEAN) {
			int slot;
			int pageidx;

			pageidx = pg->offset >> PAGE_SHIFT;
			KASSERT(!uvmpdpol_pageisqueued_p(pg));
			uvm_pagefree(pg);
			freed++;

			/*
			 * for anons, we need to remove the page
			 * from the anon ourselves.  for aobjs,
			 * pagefree did that for us.
			 */

			if (anon) {
				KASSERT(anon->an_swslot != 0);
				anon->an_page = NULL;
				slot = anon->an_swslot;
			} else {
				slot = uao_find_swslot(uobj, pageidx);
			}
			mutex_exit(slock);

			if (slot > 0) {
				/* this page is now only in swap. */
				mutex_enter(&uvm_swap_data_lock);
				KASSERT(uvmexp.swpgonly < uvmexp.swpginuse);
				uvmexp.swpgonly++;
				mutex_exit(&uvm_swap_data_lock);
			}
			continue;
		}

#if defined(VMSWAP)
		/*
		 * this page is dirty, skip it if we'll have met our
		 * free target when all the current pageouts complete.
		 */

		if (grp->pgrp_free + grp->pgrp_paging > grp->pgrp_freetarg << 2) {
			mutex_exit(slock);
			continue;
		}

		/*
		 * free any swap space allocated to the page since
		 * we'll have to write it again with its new data.
		 */

		uvmpd_dropswap(pg);

		/*
		 * start new swap pageout cluster (if necessary).
		 *
		 * if swap is full reactivate this page so that
		 * we eventually cycle all pages through the
		 * inactive queue.
		 */

		if (swapcluster_allocslots(&swc)) {
			dirtyreacts++;
			uvm_pageactivate(pg);
			mutex_exit(slock);
			continue;
		}

		/*
		 * at this point, we're definitely going to reuse this
		 * page.  mark the page busy and delayed-free.
		 * we should remove the page from the page queues
		 * so we don't ever look at it again.
		 * adjust counters and such.
		 */

		pg->flags |= PG_BUSY;
		UVM_PAGE_OWN(pg, "scan_queue", NULL);

		pg->flags |= PG_PAGEOUT;
		uvm_pagedequeue(pg);

		grp->pgrp_pgswapout++;
		mutex_exit(&uvm_pageqlock);

		/*
		 * add the new page to the cluster.
		 */

		if (swapcluster_add(&swc, pg)) {
			pg->flags &= ~(PG_BUSY|PG_PAGEOUT);
			UVM_PAGE_OWN(pg, NULL, NULL);
			mutex_enter(&uvm_pageqlock);
			dirtyreacts++;
			uvm_pageactivate(pg);
			mutex_exit(slock);
			continue;
		}
		mutex_exit(slock);

		swapcluster_flush(grp, &swc, false);
		mutex_enter(&uvm_pageqlock);

		/*
		 * the pageout is in progress.  bump counters and set up
		 * for the next loop.
		 */

		uvmexp.pdpending++;
#else /* defined(VMSWAP) */
		uvm_pageactivate(pg);
		mutex_exit(slock);
#endif /* defined(VMSWAP) */
	}

	UVMHIST_LOG(pdhist," [%zd] <-- done: %u victims: %u freed, %u busy",
	    grp - uvm.pggroups, victims, freed, busy);

	grp->pgrp_pdvictims += victims;
	grp->pgrp_pdnullscans += (victims == 0);
	grp->pgrp_pdfreed += freed;
	grp->pgrp_pdbusy += busy;

#if defined(VMSWAP)
	mutex_exit(&uvm_pageqlock);
	swapcluster_flush(grp, &swc, true);
	mutex_enter(&uvm_pageqlock);
#endif /* defined(VMSWAP) */
}
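
/*
 * Note that the loop above aims for four times pgrp_freetarg and counts
 * both pages already being paged out (pgrp_paging) and pages sitting in
 * the not-yet-flushed swap cluster toward that goal, so a scan stops as
 * soon as enough reclamation is merely in flight.
 */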

/*
 * uvmpd_scan: scan the page queues and attempt to meet our targets.
 *
 * => called with pageq's locked
 */

static bool
uvmpd_scan(struct uvm_pggroup *grp)
{
	u_int swap_shortage, pages_freed;
	UVMHIST_FUNC("uvmpd_scan"); UVMHIST_CALLED(pdhist);

	grp->pgrp_pdrevs++;

	/*
	 * work on meeting our targets.  first we work on our free target
	 * by converting inactive pages into free pages.  then we work on
	 * meeting our inactive target by converting active pages to
	 * inactive ones.
	 */

	UVMHIST_LOG(pdhist, " starting 'free' loop",0,0,0,0);

	pages_freed = grp->pgrp_pdfreed;
	uvmpd_scan_queue(grp);
	pages_freed = grp->pgrp_pdfreed - pages_freed;

	/*
	 * detect if we're not going to be able to page anything out
	 * until we free some swap resources from active pages.
	 */

	swap_shortage = 0;
	if (pages_freed == 0
	    && grp->pgrp_free < grp->pgrp_freetarg
	    && uvmexp.swpginuse >= uvmexp.swpgavail
	    && !uvm_swapisfull()) {
		swap_shortage = grp->pgrp_freetarg - grp->pgrp_free;
	}

	uvmpdpol_balancequeue(grp, swap_shortage);

	/*
	 * swap out some processes if we are still below the minimum
	 * free target.  we need to unlock the page queues for this.
	 */

#ifdef VMSWAP
	if (grp->pgrp_free < grp->pgrp_freemin
	    && uvmexp.nswapdev != 0 && uvm.swapout_enabled) {
		grp->pgrp_pdswout++;
		UVMHIST_LOG(pdhist," free %d < min %d: swapout",
		    uvmexp.free, uvmexp.freemin, 0, 0);
		mutex_exit(&uvm_pageqlock);
		uvm_swapout_threads();
		mutex_enter(&uvm_pageqlock);

	}
#endif /* VMSWAP */

	return pages_freed != 0;
}

/*
 * uvm_reclaimable: decide whether to wait for pagedaemon.
 *
 * => return true if it seems worth doing uvm_wait.
 *
 * XXX should be tunable.
 * XXX should consider pools, etc?
 */

bool
uvm_reclaimable(u_int color, bool kmem_p)
{
	u_int filepages, npages;
	u_int active, inactive;

	/*
	 * if swap is not full, no problem.
	 */

#ifdef VMSWAP
	if (!uvm_swapisfull()) {
		return true;
	}
#endif

	/*
	 * file-backed pages can be reclaimed even when swap is full.
	 * if file-backed pages exceed 1/16 of pageable memory or 1/25 of
	 * all managed pages (whichever is smaller), try to reclaim.
	 *
	 * XXX assume the worst case, ie. all wired pages are file-backed.
	 *
	 * XXX should consider other reclaimable memory.
	 * XXX ie. pools, traditional buffer cache.
	 */
	active = 0;
	inactive = 0;
	filepages = 0;
	npages = 0;
	for (u_int lcv = 0; lcv < VM_NFREELIST; lcv++) {
		struct uvm_pggroup * const grp =
		    uvm.page_free[color].pgfl_pggroups[lcv];

#ifdef VM_FREELIST_NORMALOK_P
		/*
		 * If this is for kmem and it's a normal freelist, skip it.
		 */
		if (kmem_p && VM_FREELIST_NORMALOK_P(lcv))
			continue;
#endif

		npages += grp->pgrp_npages;
		filepages += grp->pgrp_filepages + grp->pgrp_execpages;
		uvm_estimatepageable(grp, &active, &inactive);
	}
	filepages -= uvmexp.wired;

	if (filepages >= MIN((active + inactive) >> 4, npages / 25)) {
		return true;
	}

	/*
	 * kill the process, fail allocation, etc..
	 */

	return false;
}
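
/*
 * Illustrative threshold arithmetic (assuming 4 KB pages and made-up
 * counts): with npages = 262144 (1 GB of managed memory) and
 * active + inactive = 160000 pageable pages, the cutoff is
 * MIN(160000 >> 4, 262144 / 25) = MIN(10000, 10485) = 10000 pages,
 * i.e. roughly 39 MB of file-backed pages (after subtracting the
 * worst-case wired count) must remain before waiting for the
 * pagedaemon is reported as worthwhile.
 */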

void
uvm_estimatepageable(const struct uvm_pggroup *grp,
    u_int *active, u_int *inactive)
{

	uvmpdpol_estimatepageable(grp, active, inactive);
}