1 /* $NetBSD: uvm_page.c,v 1.208 2019/12/21 14:33:18 ad Exp $ */
2
3 /*
4 * Copyright (c) 1997 Charles D. Cranor and Washington University.
5 * Copyright (c) 1991, 1993, The Regents of the University of California.
6 *
7 * All rights reserved.
8 *
9 * This code is derived from software contributed to Berkeley by
10 * The Mach Operating System project at Carnegie-Mellon University.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 * @(#)vm_page.c 8.3 (Berkeley) 3/21/94
37 * from: Id: uvm_page.c,v 1.1.2.18 1998/02/06 05:24:42 chs Exp
38 *
39 *
40 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
41 * All rights reserved.
42 *
43 * Permission to use, copy, modify and distribute this software and
44 * its documentation is hereby granted, provided that both the copyright
45 * notice and this permission notice appear in all copies of the
46 * software, derivative works or modified versions, and any portions
47 * thereof, and that both notices appear in supporting documentation.
48 *
49 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
50 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
51 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
52 *
53 * Carnegie Mellon requests users of this software to return to
54 *
55 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
56 * School of Computer Science
57 * Carnegie Mellon University
58 * Pittsburgh PA 15213-3890
59 *
60 * any improvements or extensions that they make and grant Carnegie the
61 * rights to redistribute these changes.
62 */
63
64 /*
65 * uvm_page.c: page ops.
66 */
67
68 #include <sys/cdefs.h>
69 __KERNEL_RCSID(0, "$NetBSD: uvm_page.c,v 1.208 2019/12/21 14:33:18 ad Exp $");
70
71 #include "opt_ddb.h"
72 #include "opt_uvm.h"
73 #include "opt_uvmhist.h"
74 #include "opt_readahead.h"
75
76 #include <sys/param.h>
77 #include <sys/systm.h>
78 #include <sys/sched.h>
79 #include <sys/kernel.h>
80 #include <sys/vnode.h>
81 #include <sys/proc.h>
82 #include <sys/radixtree.h>
83 #include <sys/atomic.h>
84 #include <sys/cpu.h>
85 #include <sys/extent.h>
86
87 #include <uvm/uvm.h>
88 #include <uvm/uvm_ddb.h>
89 #include <uvm/uvm_pdpolicy.h>
90
91 /*
92 * Some CPUs in a given architecture don't support all of the
93 * features necessary to do idle page zeroing efficiently.
94 * We therefore provide a way to enable it from machdep code here.
95 */
96 bool vm_page_zero_enable = false;
97
98 /*
99 * number of pages per-CPU to reserve for the kernel.
100 */
101 #ifndef UVM_RESERVED_PAGES_PER_CPU
102 #define UVM_RESERVED_PAGES_PER_CPU 5
103 #endif
104 int vm_page_reserve_kernel = UVM_RESERVED_PAGES_PER_CPU;
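/*
 * For illustration: uvm_page_init() below seeds uvmexp.reserve_kernel
 * with this value for the boot CPU, and uvm_cpu_attach() adds the same
 * amount again for every secondary CPU, so a 4-CPU machine using the
 * default of 5 ends up reserving 20 pages for kernel allocations.
 */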
105
106 /*
107 * physical memory size
108 */
109 psize_t physmem;
110
111 /*
112 * local variables
113 */
114
115 /*
116 * these variables record the values returned by vm_page_bootstrap,
117 * for debugging purposes. The implementation of uvm_pageboot_alloc
118 * and pmap_startup here also uses them internally.
119 */
120
121 static vaddr_t virtual_space_start;
122 static vaddr_t virtual_space_end;
123
124 /*
125 * we allocate an initial number of page colors in uvm_page_init(),
126 * and remember them. We may re-color pages as cache sizes are
127 * discovered during the autoconfiguration phase. But we can never
128 * free the initial set of buckets, since they are allocated using
129 * uvm_pageboot_alloc().
130 */
131
132 static size_t recolored_pages_memsize /* = 0 */;
133
134 #ifdef DEBUG
135 vaddr_t uvm_zerocheckkva;
136 #endif /* DEBUG */
137
138 /*
139 * These functions are reserved for uvm(9) internal use and are not
140 * exported in the header file uvm_physseg.h
141 *
142 * Thus they are declared again here.
143 */
144 void uvm_physseg_init_seg(uvm_physseg_t, struct vm_page *);
145 void uvm_physseg_seg_chomp_slab(uvm_physseg_t, struct vm_page *, size_t);
146
147 /* returns a pgs array */
148 struct vm_page *uvm_physseg_seg_alloc_from_slab(uvm_physseg_t, size_t);
149
150 /*
151 * inline functions
152 */
153
154 /*
155 * uvm_pageinsert: insert a page in the object.
156 *
157 * => caller must lock object
158 * => caller should have already set pg's object and offset pointers
159 * and bumped the version counter
160 */
161
162 static inline void
163 uvm_pageinsert_object(struct uvm_object *uobj, struct vm_page *pg)
164 {
165
166 KASSERT(uobj == pg->uobject);
167 KASSERT(mutex_owned(uobj->vmobjlock));
168 KASSERT((pg->flags & PG_TABLED) == 0);
169
170 if (UVM_OBJ_IS_VNODE(uobj)) {
171 if (uobj->uo_npages == 0) {
172 struct vnode *vp = (struct vnode *)uobj;
173
174 vholdl(vp);
175 }
176 if (UVM_OBJ_IS_VTEXT(uobj)) {
177 cpu_count(CPU_COUNT_EXECPAGES, 1);
178 } else {
179 cpu_count(CPU_COUNT_FILEPAGES, 1);
180 }
181 } else if (UVM_OBJ_IS_AOBJ(uobj)) {
182 cpu_count(CPU_COUNT_ANONPAGES, 1);
183 }
184 pg->flags |= PG_TABLED;
185 uobj->uo_npages++;
186 }
187
188 static inline int
189 uvm_pageinsert_tree(struct uvm_object *uobj, struct vm_page *pg)
190 {
191 const uint64_t idx = pg->offset >> PAGE_SHIFT;
192 int error;
193
194 error = radix_tree_insert_node(&uobj->uo_pages, idx, pg);
195 if (error != 0) {
196 return error;
197 }
198 return 0;
199 }
200
201 /*
202 * uvm_page_remove: remove page from object.
203 *
204 * => caller must lock object
205 */
206
207 static inline void
208 uvm_pageremove_object(struct uvm_object *uobj, struct vm_page *pg)
209 {
210
211 KASSERT(uobj == pg->uobject);
212 KASSERT(mutex_owned(uobj->vmobjlock));
213 KASSERT(pg->flags & PG_TABLED);
214
215 if (UVM_OBJ_IS_VNODE(uobj)) {
216 if (uobj->uo_npages == 1) {
217 struct vnode *vp = (struct vnode *)uobj;
218
219 holdrelel(vp);
220 }
221 if (UVM_OBJ_IS_VTEXT(uobj)) {
222 cpu_count(CPU_COUNT_EXECPAGES, -1);
223 } else {
224 cpu_count(CPU_COUNT_FILEPAGES, -1);
225 }
226 } else if (UVM_OBJ_IS_AOBJ(uobj)) {
227 cpu_count(CPU_COUNT_ANONPAGES, -1);
228 }
229
230 /* object should be locked */
231 uobj->uo_npages--;
232 pg->flags &= ~PG_TABLED;
233 pg->uobject = NULL;
234 }
235
236 static inline void
237 uvm_pageremove_tree(struct uvm_object *uobj, struct vm_page *pg)
238 {
239 struct vm_page *opg __unused;
240
241 opg = radix_tree_remove_node(&uobj->uo_pages, pg->offset >> PAGE_SHIFT);
242 KASSERT(pg == opg);
243 }
244
245 static void
246 uvm_page_init_buckets(struct pgfreelist *pgfl)
247 {
248 int color, i;
249
250 for (color = 0; color < uvmexp.ncolors; color++) {
251 for (i = 0; i < PGFL_NQUEUES; i++) {
252 LIST_INIT(&pgfl->pgfl_buckets[color].pgfl_queues[i]);
253 }
254 }
255 }
256
257 /*
258 * uvm_page_init: init the page system. called from uvm_init().
259 *
260 * => we return the range of kernel virtual memory in kvm_startp/kvm_endp
261 */
262
263 void
264 uvm_page_init(vaddr_t *kvm_startp, vaddr_t *kvm_endp)
265 {
266 static struct uvm_cpu boot_cpu;
267 psize_t freepages, pagecount, bucketcount, n;
268 struct pgflbucket *bucketarray, *cpuarray;
269 struct vm_page *pagearray;
270 uvm_physseg_t bank;
271 int lcv;
272
273 KASSERT(ncpu <= 1);
274 CTASSERT(sizeof(pagearray->offset) >= sizeof(struct uvm_cpu *));
275
276 /*
277 * init the page queues and free page queue lock, except the
278 * free list; we allocate that later (with the initial vm_page
279 * structures).
280 */
281
282 uvm.cpus[0] = &boot_cpu;
283 curcpu()->ci_data.cpu_uvm = &boot_cpu;
284 uvmpdpol_init();
285 mutex_init(&uvm_fpageqlock, MUTEX_DRIVER, IPL_VM);
286
287 /*
288 * allocate vm_page structures.
289 */
290
291 /*
292 * sanity check:
293 * before calling this function the MD code is expected to register
294 * some free RAM with the uvm_page_physload() function. our job
295 * now is to allocate vm_page structures for this memory.
296 */
297
298 if (uvm_physseg_get_last() == UVM_PHYSSEG_TYPE_INVALID)
299 panic("uvm_page_bootstrap: no memory pre-allocated");
300
301 /*
302 * first calculate the number of free pages...
303 *
304 * note that we use start/end rather than avail_start/avail_end.
305 * this allows us to allocate extra vm_page structures in case we
306 * want to return some memory to the pool after booting.
307 */
308
309 freepages = 0;
310
311 for (bank = uvm_physseg_get_first();
312 uvm_physseg_valid_p(bank) ;
313 bank = uvm_physseg_get_next(bank)) {
314 freepages += (uvm_physseg_get_end(bank) - uvm_physseg_get_start(bank));
315 }
316
317 /*
318 * Let MD code initialize the number of colors, or default
319 * to 1 color if MD code doesn't care.
320 */
321 if (uvmexp.ncolors == 0)
322 uvmexp.ncolors = 1;
323 uvmexp.colormask = uvmexp.ncolors - 1;
324 KASSERT((uvmexp.colormask & uvmexp.ncolors) == 0);
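/*
 * For illustration: with uvmexp.ncolors == 8 the mask is 0x7, so a
 * page's color is in effect its page frame number masked with 0x7,
 * and the KASSERT above holds because 8 is a power of two (7 & 8 == 0);
 * a value such as 6 would trip it (5 & 6 != 0).
 */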
325
326 /*
327 * we now know we have (PAGE_SIZE * freepages) bytes of memory we can
328 * use. for each page of memory we use we need a vm_page structure.
329 * thus, the total number of pages we can use is the total size of
330 * the memory divided by (PAGE_SIZE plus the size of the vm_page
331 * structure). we add one to freepages as a fudge factor to avoid
332 * truncation errors (since we can only allocate in terms of whole
333 * pages).
334 */
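/*
 * Worked example (numbers purely illustrative): with PAGE_SIZE 4096 and
 * sizeof(struct vm_page) around 120 bytes, each usable page costs roughly
 * 4216 bytes of physical memory, so pagecount works out to about 97% of
 * freepages; the rest is consumed by the vm_page array itself.
 */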
335
336 bucketcount = uvmexp.ncolors * VM_NFREELIST;
337 pagecount = ((freepages + 1) << PAGE_SHIFT) /
338 (PAGE_SIZE + sizeof(struct vm_page));
339
340 bucketarray = (void *)uvm_pageboot_alloc((bucketcount *
341 sizeof(struct pgflbucket) * 2) + (pagecount *
342 sizeof(struct vm_page)));
343 cpuarray = bucketarray + bucketcount;
344 pagearray = (struct vm_page *)(bucketarray + bucketcount * 2);
345
346 for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
347 uvm.page_free[lcv].pgfl_buckets =
348 (bucketarray + (lcv * uvmexp.ncolors));
349 uvm_page_init_buckets(&uvm.page_free[lcv]);
350 uvm.cpus[0]->page_free[lcv].pgfl_buckets =
351 (cpuarray + (lcv * uvmexp.ncolors));
352 uvm_page_init_buckets(&uvm.cpus[0]->page_free[lcv]);
353 }
354 memset(pagearray, 0, pagecount * sizeof(struct vm_page));
355
356 /*
357 * init the vm_page structures and put them in the correct place.
358 */
359 /* First init the extent */
360
361 for (bank = uvm_physseg_get_first(),
362 uvm_physseg_seg_chomp_slab(bank, pagearray, pagecount);
363 uvm_physseg_valid_p(bank);
364 bank = uvm_physseg_get_next(bank)) {
365
366 n = uvm_physseg_get_end(bank) - uvm_physseg_get_start(bank);
367 uvm_physseg_seg_alloc_from_slab(bank, n);
368 uvm_physseg_init_seg(bank, pagearray);
369
370 /* set up page array pointers */
371 pagearray += n;
372 pagecount -= n;
373 }
374
375 /*
376 * pass up the values of virtual_space_start and
377 * virtual_space_end (obtained by uvm_pageboot_alloc) to the upper
378 * layers of the VM.
379 */
380
381 *kvm_startp = round_page(virtual_space_start);
382 *kvm_endp = trunc_page(virtual_space_end);
383 #ifdef DEBUG
384 /*
385 * steal kva for uvm_pagezerocheck().
386 */
387 uvm_zerocheckkva = *kvm_startp;
388 *kvm_startp += PAGE_SIZE;
389 #endif /* DEBUG */
390
391 /*
392 * init various thresholds.
393 */
394
395 uvmexp.reserve_pagedaemon = 1;
396 uvmexp.reserve_kernel = vm_page_reserve_kernel;
397
398 /*
399 * determine if we should zero pages in the idle loop.
400 */
401
402 uvm.cpus[0]->page_idle_zero = vm_page_zero_enable;
403
404 /*
405 * done!
406 */
407
408 uvm.page_init_done = true;
409 }
410
411 /*
412 * uvm_setpagesize: set the page size
413 *
414 * => sets page_shift and page_mask from uvmexp.pagesize.
415 */
416
417 void
418 uvm_setpagesize(void)
419 {
420
421 /*
422 * If uvmexp.pagesize is 0 at this point, we expect PAGE_SIZE
423 * to be a constant (indicated by being a non-zero value).
424 */
425 if (uvmexp.pagesize == 0) {
426 if (PAGE_SIZE == 0)
427 panic("uvm_setpagesize: uvmexp.pagesize not set");
428 uvmexp.pagesize = PAGE_SIZE;
429 }
430 uvmexp.pagemask = uvmexp.pagesize - 1;
431 if ((uvmexp.pagemask & uvmexp.pagesize) != 0)
432 panic("uvm_setpagesize: page size %u (%#x) not a power of two",
433 uvmexp.pagesize, uvmexp.pagesize);
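/* compute uvmexp.pageshift = log2(pagesize), e.g. 4096 -> 12 */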
434 for (uvmexp.pageshift = 0; ; uvmexp.pageshift++)
435 if ((1 << uvmexp.pageshift) == uvmexp.pagesize)
436 break;
437 }
438
439 /*
440 * uvm_pageboot_alloc: steal memory from physmem for bootstrapping
441 */
442
443 vaddr_t
444 uvm_pageboot_alloc(vsize_t size)
445 {
446 static bool initialized = false;
447 vaddr_t addr;
448 #if !defined(PMAP_STEAL_MEMORY)
449 vaddr_t vaddr;
450 paddr_t paddr;
451 #endif
452
453 /*
454 * on first call to this function, initialize ourselves.
455 */
456 if (initialized == false) {
457 pmap_virtual_space(&virtual_space_start, &virtual_space_end);
458
459 /* round it the way we like it */
460 virtual_space_start = round_page(virtual_space_start);
461 virtual_space_end = trunc_page(virtual_space_end);
462
463 initialized = true;
464 }
465
466 /* round to page size */
467 size = round_page(size);
468 uvmexp.bootpages += atop(size);
469
470 #if defined(PMAP_STEAL_MEMORY)
471
472 /*
473 * defer bootstrap allocation to MD code (it may want to allocate
474 * from a direct-mapped segment). pmap_steal_memory should adjust
475 * virtual_space_start/virtual_space_end if necessary.
476 */
477
478 addr = pmap_steal_memory(size, &virtual_space_start,
479 &virtual_space_end);
480
481 return(addr);
482
483 #else /* !PMAP_STEAL_MEMORY */
484
485 /*
486 * allocate virtual memory for this request
487 */
488 if (virtual_space_start == virtual_space_end ||
489 (virtual_space_end - virtual_space_start) < size)
490 panic("uvm_pageboot_alloc: out of virtual space");
491
492 addr = virtual_space_start;
493
494 #ifdef PMAP_GROWKERNEL
495 /*
496 * If the kernel pmap can't map the requested space,
497 * then allocate more resources for it.
498 */
499 if (uvm_maxkaddr < (addr + size)) {
500 uvm_maxkaddr = pmap_growkernel(addr + size);
501 if (uvm_maxkaddr < (addr + size))
502 panic("uvm_pageboot_alloc: pmap_growkernel() failed");
503 }
504 #endif
505
506 virtual_space_start += size;
507
508 /*
509 * allocate and mapin physical pages to back new virtual pages
510 */
511
512 for (vaddr = round_page(addr) ; vaddr < addr + size ;
513 vaddr += PAGE_SIZE) {
514
515 if (!uvm_page_physget(&paddr))
516 panic("uvm_pageboot_alloc: out of memory");
517
518 /*
519 * Note this memory is no longer managed, so using
520 * pmap_kenter is safe.
521 */
522 pmap_kenter_pa(vaddr, paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
523 }
524 pmap_update(pmap_kernel());
525 return(addr);
526 #endif /* PMAP_STEAL_MEMORY */
527 }
528
529 #if !defined(PMAP_STEAL_MEMORY)
530 /*
531 * uvm_page_physget: "steal" one page from the vm_physmem structure.
532 *
533 * => attempt to allocate it off the end of a segment in which the "avail"
534 * values match the start/end values. if we can't do that, then we
535 * will advance both values (making them equal, and removing some
536 * vm_page structures from the non-avail area).
537 * => return false if out of memory.
538 */
539
540 /* subroutine: try to allocate from memory chunks on the specified freelist */
541 static bool uvm_page_physget_freelist(paddr_t *, int);
542
543 static bool
544 uvm_page_physget_freelist(paddr_t *paddrp, int freelist)
545 {
546 uvm_physseg_t lcv;
547
548 /* pass 1: try allocating from a matching end */
549 #if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
550 for (lcv = uvm_physseg_get_last(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_prev(lcv))
551 #else
552 for (lcv = uvm_physseg_get_first(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_next(lcv))
553 #endif
554 {
555 if (uvm.page_init_done == true)
556 panic("uvm_page_physget: called _after_ bootstrap");
557
558 /* Try to match at front or back on unused segment */
559 if (uvm_page_physunload(lcv, freelist, paddrp))
560 return true;
561 }
562
563 /* pass 2: forget about matching ends, just allocate something */
564 #if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
565 for (lcv = uvm_physseg_get_last(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_prev(lcv))
566 #else
567 for (lcv = uvm_physseg_get_first(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_next(lcv))
568 #endif
569 {
570 /* Try the front regardless. */
571 if (uvm_page_physunload_force(lcv, freelist, paddrp))
572 return true;
573 }
574 return false;
575 }
576
577 bool
578 uvm_page_physget(paddr_t *paddrp)
579 {
580 int i;
581
582 /* try in the order of freelist preference */
583 for (i = 0; i < VM_NFREELIST; i++)
584 if (uvm_page_physget_freelist(paddrp, i) == true)
585 return (true);
586 return (false);
587 }
588 #endif /* PMAP_STEAL_MEMORY */
589
590 /*
591 * PHYS_TO_VM_PAGE: find vm_page for a PA. used by MI code to get vm_pages
592 * back from an I/O mapping (ugh!). used in some MD code as well.
593 */
594 struct vm_page *
595 uvm_phys_to_vm_page(paddr_t pa)
596 {
597 paddr_t pf = atop(pa);
598 paddr_t off;
599 uvm_physseg_t upm;
600
601 upm = uvm_physseg_find(pf, &off);
602 if (upm != UVM_PHYSSEG_TYPE_INVALID)
603 return uvm_physseg_get_pg(upm, off);
604 return(NULL);
605 }
606
607 paddr_t
608 uvm_vm_page_to_phys(const struct vm_page *pg)
609 {
610
611 return pg->phys_addr;
612 }
613
614 /*
615 * uvm_page_recolor: Recolor the pages if the new bucket count is
616 * larger than the old one.
617 */
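/*
 * (Typical trigger, as a sketch only: MD cache autoconfiguration calling
 * uvm_page_recolor(ncolors) once the outermost cache geometry is known,
 * with ncolors roughly the cache way size divided by PAGE_SIZE rounded up
 * to a power of two; e.g. a 1 MB direct-mapped cache and 4 KB pages
 * suggest 256 colors.)
 */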
618
619 void
620 uvm_page_recolor(int newncolors)
621 {
622 struct pgflbucket *bucketarray, *cpuarray, *oldbucketarray;
623 struct pgfreelist gpgfl, pgfl;
624 struct vm_page *pg;
625 vsize_t bucketcount;
626 size_t bucketmemsize, oldbucketmemsize;
627 int color, i, ocolors;
628 int lcv;
629 struct uvm_cpu *ucpu;
630
631 KASSERT(((newncolors - 1) & newncolors) == 0);
632
633 if (newncolors <= uvmexp.ncolors)
634 return;
635
636 if (uvm.page_init_done == false) {
637 uvmexp.ncolors = newncolors;
638 return;
639 }
640
641 bucketcount = newncolors * VM_NFREELIST;
642 bucketmemsize = bucketcount * sizeof(struct pgflbucket) * 2;
643 bucketarray = kmem_alloc(bucketmemsize, KM_SLEEP);
644 cpuarray = bucketarray + bucketcount;
645
646 mutex_spin_enter(&uvm_fpageqlock);
647
648 /* Make sure we should still do this. */
649 if (newncolors <= uvmexp.ncolors) {
650 mutex_spin_exit(&uvm_fpageqlock);
651 kmem_free(bucketarray, bucketmemsize);
652 return;
653 }
654
655 oldbucketarray = uvm.page_free[0].pgfl_buckets;
656 ocolors = uvmexp.ncolors;
657
658 uvmexp.ncolors = newncolors;
659 uvmexp.colormask = uvmexp.ncolors - 1;
660
661 ucpu = curcpu()->ci_data.cpu_uvm;
662 for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
663 gpgfl.pgfl_buckets = (bucketarray + (lcv * newncolors));
664 pgfl.pgfl_buckets = (cpuarray + (lcv * uvmexp.ncolors));
665 uvm_page_init_buckets(&gpgfl);
666 uvm_page_init_buckets(&pgfl);
667 for (color = 0; color < ocolors; color++) {
668 for (i = 0; i < PGFL_NQUEUES; i++) {
669 while ((pg = LIST_FIRST(&uvm.page_free[
670 lcv].pgfl_buckets[color].pgfl_queues[i]))
671 != NULL) {
672 LIST_REMOVE(pg, pageq.list); /* global */
673 LIST_REMOVE(pg, listq.list); /* cpu */
674 LIST_INSERT_HEAD(&gpgfl.pgfl_buckets[
675 VM_PGCOLOR_BUCKET(pg)].pgfl_queues[
676 i], pg, pageq.list);
677 LIST_INSERT_HEAD(&pgfl.pgfl_buckets[
678 VM_PGCOLOR_BUCKET(pg)].pgfl_queues[
679 i], pg, listq.list);
680 }
681 }
682 }
683 uvm.page_free[lcv].pgfl_buckets = gpgfl.pgfl_buckets;
684 ucpu->page_free[lcv].pgfl_buckets = pgfl.pgfl_buckets;
685 }
686
687 oldbucketmemsize = recolored_pages_memsize;
688
689 recolored_pages_memsize = bucketmemsize;
690 mutex_spin_exit(&uvm_fpageqlock);
691
692 if (oldbucketmemsize) {
693 kmem_free(oldbucketarray, oldbucketmemsize);
694 }
695
696 /*
697 * this calls uvm_km_alloc() which may want to hold
698 * uvm_fpageqlock.
699 */
700 uvm_pager_realloc_emerg();
701 }
702
703 /*
704 * uvm_cpu_attach: initialize per-CPU data structures.
705 */
706
707 void
708 uvm_cpu_attach(struct cpu_info *ci)
709 {
710 struct pgflbucket *bucketarray;
711 struct pgfreelist pgfl;
712 struct uvm_cpu *ucpu;
713 vsize_t bucketcount;
714 int lcv;
715
716 if (CPU_IS_PRIMARY(ci)) {
717 /* Already done in uvm_page_init(). */
718 goto attachrnd;
719 }
720
721 /* Add more reserve pages for this CPU. */
722 uvmexp.reserve_kernel += vm_page_reserve_kernel;
723
724 /* Configure this CPU's free lists. */
725 bucketcount = uvmexp.ncolors * VM_NFREELIST;
726 bucketarray = kmem_alloc(bucketcount * sizeof(struct pgflbucket),
727 KM_SLEEP);
728 ucpu = kmem_zalloc(sizeof(*ucpu), KM_SLEEP);
729 uvm.cpus[cpu_index(ci)] = ucpu;
730 ci->ci_data.cpu_uvm = ucpu;
731 for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
732 pgfl.pgfl_buckets = (bucketarray + (lcv * uvmexp.ncolors));
733 uvm_page_init_buckets(&pgfl);
734 ucpu->page_free[lcv].pgfl_buckets = pgfl.pgfl_buckets;
735 }
736
737 attachrnd:
738 /*
739 * Attach RNG source for this CPU's VM events
740 */
741 rnd_attach_source(&uvm.cpus[cpu_index(ci)]->rs,
742 ci->ci_data.cpu_name, RND_TYPE_VM,
743 RND_FLAG_COLLECT_TIME|RND_FLAG_COLLECT_VALUE|
744 RND_FLAG_ESTIMATE_VALUE);
745
746 }
747
748 /*
749 * uvm_free: return total number of free pages in system.
750 */
751
752 int
753 uvm_free(void)
754 {
755
756 return uvmexp.free;
757 }
758
759 /*
760 * uvm_pagealloc_pgfl: helper routine for uvm_pagealloc_strat
761 */
762
763 static struct vm_page *
764 uvm_pagealloc_pgfl(struct uvm_cpu *ucpu, int flist, int try1, int try2,
765 int *trycolorp)
766 {
767 struct pgflist *freeq;
768 struct vm_page *pg;
769 int color, trycolor = *trycolorp;
770 struct pgfreelist *gpgfl, *pgfl;
771
772 KASSERT(mutex_owned(&uvm_fpageqlock));
773
774 color = trycolor;
775 pgfl = &ucpu->page_free[flist];
776 gpgfl = &uvm.page_free[flist];
777 do {
778 /* cpu, try1 */
779 if ((pg = LIST_FIRST((freeq =
780 &pgfl->pgfl_buckets[color].pgfl_queues[try1]))) != NULL) {
781 KASSERT(pg->flags & PG_FREE);
782 KASSERT(try1 == PGFL_ZEROS || !(pg->flags & PG_ZERO));
783 KASSERT(try1 == PGFL_UNKNOWN || (pg->flags & PG_ZERO));
784 KASSERT(ucpu == VM_FREE_PAGE_TO_CPU(pg));
785 VM_FREE_PAGE_TO_CPU(pg)->pages[try1]--;
786 CPU_COUNT(CPU_COUNT_CPUHIT, 1);
787 goto gotit;
788 }
789 /* global, try1 */
790 if ((pg = LIST_FIRST((freeq =
791 &gpgfl->pgfl_buckets[color].pgfl_queues[try1]))) != NULL) {
792 KASSERT(pg->flags & PG_FREE);
793 KASSERT(try1 == PGFL_ZEROS || !(pg->flags & PG_ZERO));
794 KASSERT(try1 == PGFL_UNKNOWN || (pg->flags & PG_ZERO));
795 KASSERT(ucpu != VM_FREE_PAGE_TO_CPU(pg));
796 VM_FREE_PAGE_TO_CPU(pg)->pages[try1]--;
797 CPU_COUNT(CPU_COUNT_CPUMISS, 1);
798 goto gotit;
799 }
800 /* cpu, try2 */
801 if ((pg = LIST_FIRST((freeq =
802 &pgfl->pgfl_buckets[color].pgfl_queues[try2]))) != NULL) {
803 KASSERT(pg->flags & PG_FREE);
804 KASSERT(try2 == PGFL_ZEROS || !(pg->flags & PG_ZERO));
805 KASSERT(try2 == PGFL_UNKNOWN || (pg->flags & PG_ZERO));
806 KASSERT(ucpu == VM_FREE_PAGE_TO_CPU(pg));
807 VM_FREE_PAGE_TO_CPU(pg)->pages[try2]--;
808 CPU_COUNT(CPU_COUNT_CPUHIT, 1);
809 goto gotit;
810 }
811 /* global, try2 */
812 if ((pg = LIST_FIRST((freeq =
813 &gpgfl->pgfl_buckets[color].pgfl_queues[try2]))) != NULL) {
814 KASSERT(pg->flags & PG_FREE);
815 KASSERT(try2 == PGFL_ZEROS || !(pg->flags & PG_ZERO));
816 KASSERT(try2 == PGFL_UNKNOWN || (pg->flags & PG_ZERO));
817 KASSERT(ucpu != VM_FREE_PAGE_TO_CPU(pg));
818 VM_FREE_PAGE_TO_CPU(pg)->pages[try2]--;
819 CPU_COUNT(CPU_COUNT_CPUMISS, 1);
820 goto gotit;
821 }
822 color = (color + 1) & uvmexp.colormask;
823 } while (color != trycolor);
824
825 return (NULL);
826
827 gotit:
828 LIST_REMOVE(pg, pageq.list); /* global list */
829 LIST_REMOVE(pg, listq.list); /* per-cpu list */
830 uvmexp.free--;
831
832 /* update zero'd page count */
833 if (pg->flags & PG_ZERO)
834 CPU_COUNT(CPU_COUNT_ZEROPAGES, -1);
835
836 if (color == trycolor)
837 CPU_COUNT(CPU_COUNT_COLORHIT, 1);
838 else {
839 CPU_COUNT(CPU_COUNT_COLORMISS, 1);
840 *trycolorp = color;
841 }
842
843 return (pg);
844 }
845
846 /*
847 * uvm_pagealloc_strat: allocate vm_page from a particular free list.
848 *
849 * => return null if no pages free
850 * => wake up pagedaemon if number of free pages drops below low water mark
851 * => if obj != NULL, obj must be locked (to put in obj's tree)
852 * => if anon != NULL, anon must be locked (to put in anon)
853 * => only one of obj or anon can be non-null
854 * => caller must activate/deactivate page if it is not wired.
855 * => free_list is ignored if strat == UVM_PGA_STRAT_NORMAL.
856 * => policy decision: it is more important to pull a page off of the
857 * appropriate priority free list than it is to get a zero'd or
858 * unknown contents page. This is because we live with the
859 * consequences of a bad free list decision for the entire
860 * lifetime of the page, e.g. if the page comes from memory that
861 * is slower to access.
862 */
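/*
 * Example usage (a sketch only; most callers go through the uvm_pagealloc()
 * convenience wrapper rather than calling this function directly):
 *
 *	mutex_enter(uobj->vmobjlock);
 *	while ((pg = uvm_pagealloc(uobj, off, NULL, UVM_PGA_ZERO)) == NULL) {
 *		mutex_exit(uobj->vmobjlock);
 *		uvm_wait("pgalloc");
 *		mutex_enter(uobj->vmobjlock);
 *	}
 *	mutex_exit(uobj->vmobjlock);
 */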
863
864 struct vm_page *
865 uvm_pagealloc_strat(struct uvm_object *obj, voff_t off, struct vm_anon *anon,
866 int flags, int strat, int free_list)
867 {
868 int try1, try2, zeroit = 0, color;
869 int lcv, error;
870 struct uvm_cpu *ucpu;
871 struct vm_page *pg;
872 lwp_t *l;
873
874 KASSERT(obj == NULL || anon == NULL);
875 KASSERT(anon == NULL || (flags & UVM_FLAG_COLORMATCH) || off == 0);
876 KASSERT(off == trunc_page(off));
877 KASSERT(obj == NULL || mutex_owned(obj->vmobjlock));
878 KASSERT(anon == NULL || anon->an_lock == NULL ||
879 mutex_owned(anon->an_lock));
880
881 mutex_spin_enter(&uvm_fpageqlock);
882
883 /*
884 * This implements a global round-robin page coloring
885 * algorithm.
886 */
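/*
 * In other words: UVM_FLAG_COLORMATCH forces the color implied by the
 * object offset, otherwise we continue the per-CPU rotation; the starting
 * color advances at the "gotit" label below, so with 8 colors and no
 * color misses, successive unconstrained requests begin their search at
 * colors 0, 1, ..., 7, 0, ...
 */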
887
888 ucpu = curcpu()->ci_data.cpu_uvm;
889 if (flags & UVM_FLAG_COLORMATCH) {
890 color = atop(off) & uvmexp.colormask;
891 } else {
892 color = ucpu->page_free_nextcolor;
893 }
894
895 /*
896 * check to see if we need to generate some free pages by waking
897 * the pagedaemon.
898 */
899
900 uvm_kick_pdaemon();
901
902 /*
903 * fail if any of these conditions is true:
904 * [1] there really are no free pages, or
905 * reserved pages have not been requested, or
906 * reserved pages have not been requested.
907 * [3] only pagedaemon "reserved" pages remain and
908 * the requestor isn't the pagedaemon.
909 * we make kernel reserve pages available if called by a
910 * kernel thread or a realtime thread.
911 */
912 l = curlwp;
913 if (__predict_true(l != NULL) && lwp_eprio(l) >= PRI_KTHREAD) {
914 flags |= UVM_PGA_USERESERVE;
915 }
916 if ((uvmexp.free <= uvmexp.reserve_kernel &&
917 (flags & UVM_PGA_USERESERVE) == 0) ||
918 (uvmexp.free <= uvmexp.reserve_pagedaemon &&
919 curlwp != uvm.pagedaemon_lwp))
920 goto fail;
921
922 #if PGFL_NQUEUES != 2
923 #error uvm_pagealloc_strat needs to be updated
924 #endif
925
926 /*
927 * If we want a zero'd page, try the ZEROS queue first, otherwise
928 * we try the UNKNOWN queue first.
929 */
930 if (flags & UVM_PGA_ZERO) {
931 try1 = PGFL_ZEROS;
932 try2 = PGFL_UNKNOWN;
933 } else {
934 try1 = PGFL_UNKNOWN;
935 try2 = PGFL_ZEROS;
936 }
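/*
 * (Rationale, briefly: preferring an already-zeroed page for a
 * UVM_PGA_ZERO request lets us skip pmap_zero_page() later -- counted
 * below as ZEROHIT vs. ZEROMISS -- while other requests try to leave the
 * pre-zeroed pages for callers that actually want them.)
 */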
937
938 again:
939 switch (strat) {
940 case UVM_PGA_STRAT_NORMAL:
941 /* Check freelists: descending priority (ascending id) order */
942 for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
943 pg = uvm_pagealloc_pgfl(ucpu, lcv,
944 try1, try2, &color);
945 if (pg != NULL)
946 goto gotit;
947 }
948
949 /* No pages free! */
950 goto fail;
951
952 case UVM_PGA_STRAT_ONLY:
953 case UVM_PGA_STRAT_FALLBACK:
954 /* Attempt to allocate from the specified free list. */
955 KASSERT(free_list >= 0 && free_list < VM_NFREELIST);
956 pg = uvm_pagealloc_pgfl(ucpu, free_list,
957 try1, try2, &color);
958 if (pg != NULL)
959 goto gotit;
960
961 /* Fall back, if possible. */
962 if (strat == UVM_PGA_STRAT_FALLBACK) {
963 strat = UVM_PGA_STRAT_NORMAL;
964 goto again;
965 }
966
967 /* No pages free! */
968 goto fail;
969
970 default:
971 panic("uvm_pagealloc_strat: bad strat %d", strat);
972 /* NOTREACHED */
973 }
974
975 gotit:
976 /*
977 * We now know which color we actually allocated from; set
978 * the next color accordingly.
979 */
980
981 ucpu->page_free_nextcolor = (color + 1) & uvmexp.colormask;
982
983 /*
984 * update allocation statistics and remember if we have to
985 * zero the page
986 */
987
988 if (flags & UVM_PGA_ZERO) {
989 if (pg->flags & PG_ZERO) {
990 CPU_COUNT(CPU_COUNT_PGA_ZEROHIT, 1);
991 zeroit = 0;
992 } else {
993 CPU_COUNT(CPU_COUNT_PGA_ZEROMISS, 1);
994 zeroit = 1;
995 }
996 if (ucpu->pages[PGFL_ZEROS] < ucpu->pages[PGFL_UNKNOWN]) {
997 ucpu->page_idle_zero = vm_page_zero_enable;
998 }
999 }
1000 KASSERT((pg->flags & ~(PG_ZERO|PG_FREE)) == 0);
1001
1002 /*
1003 * For now check this - later on we may do lazy dequeue, but need
1004 * to get page.queue used only by the pagedaemon policy first.
1005 */
1006 KASSERT(!uvmpdpol_pageisqueued_p(pg));
1007
1008 /*
1009 * assign the page to the object. we don't need to lock the page's
1010 * identity to do this, as the caller holds the owner locked, and
1011 * the page is not on any paging queues at this time.
1012 */
1013 pg->offset = off;
1014 pg->uobject = obj;
1015 pg->uanon = anon;
1016 KASSERT(uvm_page_locked_p(pg));
1017 pg->flags = PG_BUSY|PG_CLEAN|PG_FAKE;
1018 mutex_spin_exit(&uvm_fpageqlock);
1019 if (anon) {
1020 anon->an_page = pg;
1021 pg->flags |= PG_ANON;
1022 cpu_count(CPU_COUNT_ANONPAGES, 1);
1023 } else if (obj) {
1024 uvm_pageinsert_object(obj, pg);
1025 error = uvm_pageinsert_tree(obj, pg);
1026 if (error != 0) {
1027 uvm_pageremove_object(obj, pg);
1028 uvm_pagefree(pg);
1029 return NULL;
1030 }
1031 }
1032
1033 #if defined(UVM_PAGE_TRKOWN)
1034 pg->owner_tag = NULL;
1035 #endif
1036 UVM_PAGE_OWN(pg, "new alloc");
1037
1038 if (flags & UVM_PGA_ZERO) {
1039 /*
1040 * A zero'd page is not clean. If we got a page not already
1041 * zero'd, then we have to zero it ourselves.
1042 */
1043 pg->flags &= ~PG_CLEAN;
1044 if (zeroit)
1045 pmap_zero_page(VM_PAGE_TO_PHYS(pg));
1046 }
1047
1048 return(pg);
1049
1050 fail:
1051 mutex_spin_exit(&uvm_fpageqlock);
1052 return (NULL);
1053 }
1054
1055 /*
1056 * uvm_pagereplace: replace a page with another
1057 *
1058 * => object must be locked
1059 */
1060
1061 void
1062 uvm_pagereplace(struct vm_page *oldpg, struct vm_page *newpg)
1063 {
1064 struct uvm_object *uobj = oldpg->uobject;
1065
1066 KASSERT((oldpg->flags & PG_TABLED) != 0);
1067 KASSERT(uobj != NULL);
1068 KASSERT((newpg->flags & PG_TABLED) == 0);
1069 KASSERT(newpg->uobject == NULL);
1070 KASSERT(mutex_owned(uobj->vmobjlock));
1071
1072 newpg->offset = oldpg->offset;
1073 uvm_pageremove_tree(uobj, oldpg);
1074 uvm_pageinsert_tree(uobj, newpg);
1075
1076 /* take page interlocks during rename */
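/* (address order gives a consistent lock order between any two pages) */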
1077 if (oldpg < newpg) {
1078 mutex_enter(&oldpg->interlock);
1079 mutex_enter(&newpg->interlock);
1080 } else {
1081 mutex_enter(&newpg->interlock);
1082 mutex_enter(&oldpg->interlock);
1083 }
1084 newpg->uobject = uobj;
1085 uvm_pageinsert_object(uobj, newpg);
1086 uvm_pageremove_object(uobj, oldpg);
1087 mutex_exit(&oldpg->interlock);
1088 mutex_exit(&newpg->interlock);
1089 }
1090
1091 /*
1092 * uvm_pagerealloc: reallocate a page from one object to another
1093 *
1094 * => both objects must be locked
1095 * => both interlocks must be held
1096 */
1097
1098 void
1099 uvm_pagerealloc(struct vm_page *pg, struct uvm_object *newobj, voff_t newoff)
1100 {
1101 /*
1102 * remove it from the old object
1103 */
1104
1105 if (pg->uobject) {
1106 uvm_pageremove_tree(pg->uobject, pg);
1107 uvm_pageremove_object(pg->uobject, pg);
1108 }
1109
1110 /*
1111 * put it in the new object
1112 */
1113
1114 if (newobj) {
1115 /*
1116 * XXX we have no in-tree users of this functionality
1117 */
1118 panic("uvm_pagerealloc: no impl");
1119 }
1120 }
1121
1122 #ifdef DEBUG
1123 /*
1124 * check if page is zero-filled
1125 */
1126 void
1127 uvm_pagezerocheck(struct vm_page *pg)
1128 {
1129 int *p, *ep;
1130
1131 KASSERT(uvm_zerocheckkva != 0);
1132 KASSERT(mutex_owned(&uvm_fpageqlock));
1133
1134 /*
1135 * the uvm page allocator.
1136 *
1137 * it might be better to have a "CPU-local temporary map" pmap interface.
1138 * it might be better to have "CPU-local temporary map" pmap interface.
1139 */
1140 pmap_kenter_pa(uvm_zerocheckkva, VM_PAGE_TO_PHYS(pg), VM_PROT_READ, 0);
1141 p = (int *)uvm_zerocheckkva;
1142 ep = (int *)((char *)p + PAGE_SIZE);
1143 pmap_update(pmap_kernel());
1144 while (p < ep) {
1145 if (*p != 0)
1146 panic("PG_ZERO page isn't zero-filled");
1147 p++;
1148 }
1149 pmap_kremove(uvm_zerocheckkva, PAGE_SIZE);
1150 /*
1151 * pmap_update() is not necessary here because no one except us
1152 * uses this VA.
1153 */
1154 }
1155 #endif /* DEBUG */
1156
1157 /*
1158 * uvm_pagefree: free page
1159 *
1160 * => erase page's identity (i.e. remove from object)
1161 * => put page on free list
1162 * => caller must lock owning object (either anon or uvm_object)
1163 * => assumes all valid mappings of pg are gone
1164 */
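/*
 * (Note: under DEBUG the identity fields are poisoned with 0xdeadbeef at
 * the bottom of this function, which is what the double-free check at the
 * top of the function looks for.)
 */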
1165
1166 void
1167 uvm_pagefree(struct vm_page *pg)
1168 {
1169 struct pgflist *pgfl;
1170 struct uvm_cpu *ucpu;
1171 int index, color, queue;
1172 bool iszero, locked;
1173
1174 #ifdef DEBUG
1175 if (pg->uobject == (void *)0xdeadbeef &&
1176 pg->uanon == (void *)0xdeadbeef) {
1177 panic("uvm_pagefree: freeing free page %p", pg);
1178 }
1179 #endif /* DEBUG */
1180
1181 KASSERT((pg->flags & PG_PAGEOUT) == 0);
1182 KASSERT(!(pg->flags & PG_FREE));
1183 //KASSERT(mutex_owned(&uvm_pageqlock) || !uvmpdpol_pageisqueued_p(pg));
1184 KASSERT(pg->uobject == NULL || mutex_owned(pg->uobject->vmobjlock));
1185 KASSERT(pg->uobject != NULL || pg->uanon == NULL ||
1186 mutex_owned(pg->uanon->an_lock));
1187
1188 /*
1189 * remove the page from the object's tree before acquiring any page
1190 * interlocks: this can acquire locks to free radixtree nodes.
1191 */
1192 if (pg->uobject != NULL) {
1193 uvm_pageremove_tree(pg->uobject, pg);
1194 }
1195
1196 /*
1197 * if the page is loaned, resolve the loan instead of freeing.
1198 */
1199
1200 if (pg->loan_count) {
1201 KASSERT(pg->wire_count == 0);
1202
1203 /*
1204 * if the page is owned by an anon then we just want to
1205 * drop anon ownership. the kernel will free the page when
1206 * it is done with it. if the page is owned by an object,
1207 * remove it from the object and mark it dirty for the benefit
1208 * of possible anon owners.
1209 *
1210 * regardless of previous ownership, wakeup any waiters,
1211 * unbusy the page, and we're done.
1212 */
1213
1214 mutex_enter(&pg->interlock);
1215 locked = true;
1216 if (pg->uobject != NULL) {
1217 uvm_pageremove_object(pg->uobject, pg);
1218 pg->flags &= ~PG_CLEAN;
1219 } else if (pg->uanon != NULL) {
1220 if ((pg->flags & PG_ANON) == 0) {
1221 pg->loan_count--;
1222 } else {
1223 pg->flags &= ~PG_ANON;
1224 cpu_count(CPU_COUNT_ANONPAGES, -1);
1225 }
1226 pg->uanon->an_page = NULL;
1227 pg->uanon = NULL;
1228 }
1229 if (pg->flags & PG_WANTED) {
1230 wakeup(pg);
1231 }
1232 pg->flags &= ~(PG_WANTED|PG_BUSY|PG_RELEASED|PG_PAGER1);
1233 #ifdef UVM_PAGE_TRKOWN
1234 pg->owner_tag = NULL;
1235 #endif
1236 if (pg->loan_count) {
1237 KASSERT(pg->uobject == NULL);
1238 mutex_exit(&pg->interlock);
1239 if (pg->uanon == NULL) {
1240 uvm_pagedequeue(pg);
1241 }
1242 return;
1243 }
1244 } else if (pg->uobject != NULL || pg->uanon != NULL ||
1245 pg->wire_count != 0) {
1246 mutex_enter(&pg->interlock);
1247 locked = true;
1248 } else {
1249 locked = false;
1250 }
1251
1252 /*
1253 * remove page from its object or anon.
1254 */
1255 if (pg->uobject != NULL) {
1256 uvm_pageremove_object(pg->uobject, pg);
1257 } else if (pg->uanon != NULL) {
1258 pg->uanon->an_page = NULL;
1259 pg->uanon = NULL;
1260 cpu_count(CPU_COUNT_ANONPAGES, -1);
1261 }
1262
1263 /*
1264 * if the page was wired, unwire it now.
1265 */
1266
1267 if (pg->wire_count) {
1268 pg->wire_count = 0;
1269 atomic_dec_uint(&uvmexp.wired);
1270 }
1271 if (locked) {
1272 mutex_exit(&pg->interlock);
1273 }
1274
1275 /*
1276 * now remove the page from the queues.
1277 */
1278 uvm_pagedequeue(pg);
1279
1280 /*
1281 * and put on free queue
1282 */
1283
1284 iszero = (pg->flags & PG_ZERO);
1285 index = uvm_page_lookup_freelist(pg);
1286 color = VM_PGCOLOR_BUCKET(pg);
1287 queue = (iszero ? PGFL_ZEROS : PGFL_UNKNOWN);
1288
1289 #ifdef DEBUG
1290 pg->uobject = (void *)0xdeadbeef;
1291 pg->uanon = (void *)0xdeadbeef;
1292 #endif
1293
1294 mutex_spin_enter(&uvm_fpageqlock);
1295 pg->flags = PG_FREE;
1296
1297 #ifdef DEBUG
1298 if (iszero)
1299 uvm_pagezerocheck(pg);
1300 #endif /* DEBUG */
1301
1302
1303 /* global list */
1304 pgfl = &uvm.page_free[index].pgfl_buckets[color].pgfl_queues[queue];
1305 LIST_INSERT_HEAD(pgfl, pg, pageq.list);
1306 uvmexp.free++;
1307 if (iszero) {
1308 CPU_COUNT(CPU_COUNT_ZEROPAGES, 1);
1309 }
1310
1311 /* per-cpu list */
1312 ucpu = curcpu()->ci_data.cpu_uvm;
1313 pg->offset = (uintptr_t)ucpu;
1314 pgfl = &ucpu->page_free[index].pgfl_buckets[color].pgfl_queues[queue];
1315 LIST_INSERT_HEAD(pgfl, pg, listq.list);
1316 ucpu->pages[queue]++;
1317 if (ucpu->pages[PGFL_ZEROS] < ucpu->pages[PGFL_UNKNOWN]) {
1318 ucpu->page_idle_zero = vm_page_zero_enable;
1319 }
1320
1321 mutex_spin_exit(&uvm_fpageqlock);
1322 }
1323
1324 /*
1325 * uvm_page_unbusy: unbusy an array of pages.
1326 *
1327 * => pages must either all belong to the same object, or all belong to anons.
1328 * => if pages are object-owned, object must be locked.
1329 * => if pages are anon-owned, anons must be locked.
1330 * => caller must make sure that anon-owned pages are not PG_RELEASED.
1331 */
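/*
 * Typical use (a sketch): a pager that has finished I/O re-takes the
 * object lock and then releases the whole cluster in one call:
 *
 *	mutex_enter(uobj->vmobjlock);
 *	uvm_page_unbusy(pgs, npages);
 *	mutex_exit(uobj->vmobjlock);
 */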
1332
1333 void
1334 uvm_page_unbusy(struct vm_page **pgs, int npgs)
1335 {
1336 struct vm_page *pg;
1337 int i;
1338 UVMHIST_FUNC("uvm_page_unbusy"); UVMHIST_CALLED(ubchist);
1339
1340 for (i = 0; i < npgs; i++) {
1341 pg = pgs[i];
1342 if (pg == NULL || pg == PGO_DONTCARE) {
1343 continue;
1344 }
1345
1346 KASSERT(uvm_page_locked_p(pg));
1347 KASSERT(pg->flags & PG_BUSY);
1348 KASSERT((pg->flags & PG_PAGEOUT) == 0);
1349 if (pg->flags & PG_WANTED) {
1350 /* XXXAD thundering herd problem. */
1351 wakeup(pg);
1352 }
1353 if (pg->flags & PG_RELEASED) {
1354 UVMHIST_LOG(ubchist, "releasing pg %#jx",
1355 (uintptr_t)pg, 0, 0, 0);
1356 KASSERT(pg->uobject != NULL ||
1357 (pg->uanon != NULL && pg->uanon->an_ref > 0));
1358 pg->flags &= ~PG_RELEASED;
1359 uvm_pagefree(pg);
1360 } else {
1361 UVMHIST_LOG(ubchist, "unbusying pg %#jx",
1362 (uintptr_t)pg, 0, 0, 0);
1363 KASSERT((pg->flags & PG_FAKE) == 0);
1364 pg->flags &= ~(PG_WANTED|PG_BUSY);
1365 UVM_PAGE_OWN(pg, NULL);
1366 }
1367 }
1368 }
1369
1370 #if defined(UVM_PAGE_TRKOWN)
1371 /*
1372 * uvm_page_own: set or release page ownership
1373 *
1374 * => this is a debugging function that keeps track of who sets PG_BUSY
1375 * and where they do it. it can be used to track down problems
1376 * such as a process setting "PG_BUSY" and never releasing it.
1377 * => page's object [if any] must be locked
1378 * => if "tag" is NULL then we are releasing page ownership
1379 */
1380 void
1381 uvm_page_own(struct vm_page *pg, const char *tag)
1382 {
1383
1384 KASSERT((pg->flags & (PG_PAGEOUT|PG_RELEASED)) == 0);
1385 KASSERT((pg->flags & PG_WANTED) == 0);
1386 KASSERT(uvm_page_locked_p(pg));
1387
1388 /* gain ownership? */
1389 if (tag) {
1390 KASSERT((pg->flags & PG_BUSY) != 0);
1391 if (pg->owner_tag) {
1392 printf("uvm_page_own: page %p already owned "
1393 "by proc %d [%s]\n", pg,
1394 pg->owner, pg->owner_tag);
1395 panic("uvm_page_own");
1396 }
1397 pg->owner = curproc->p_pid;
1398 pg->lowner = curlwp->l_lid;
1399 pg->owner_tag = tag;
1400 return;
1401 }
1402
1403 /* drop ownership */
1404 KASSERT((pg->flags & PG_BUSY) == 0);
1405 if (pg->owner_tag == NULL) {
1406 printf("uvm_page_own: dropping ownership of a non-owned "
1407 "page (%p)\n", pg);
1408 panic("uvm_page_own");
1409 }
1410 if (!uvmpdpol_pageisqueued_p(pg)) {
1411 KASSERT((pg->uanon == NULL && pg->uobject == NULL) ||
1412 pg->wire_count > 0);
1413 } else {
1414 KASSERT(pg->wire_count == 0);
1415 }
1416 pg->owner_tag = NULL;
1417 }
1418 #endif
1419
1420 /*
1421 * uvm_pageidlezero: zero free pages while the system is idle.
1422 *
1423 * => try to complete one color bucket at a time, to reduce our impact
1424 * on the CPU cache.
1425 * => we loop until we either reach the target or there is a lwp ready
1426 * to run, or MD code detects a reason to break early.
1427 */
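/*
 * (Bookkeeping sketch: each page handled here leaves the per-CPU and
 * global PGFL_UNKNOWN queues, is zeroed with the free-page lock dropped,
 * and is then re-inserted on the PGFL_ZEROS queues with PG_ZERO set, so a
 * later UVM_PGA_ZERO allocation can take it without another
 * pmap_zero_page() call.)
 */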
1428 void
1429 uvm_pageidlezero(void)
1430 {
1431 struct vm_page *pg;
1432 struct pgfreelist *pgfl, *gpgfl;
1433 struct uvm_cpu *ucpu;
1434 int free_list, firstbucket, nextbucket;
1435 bool lcont = false;
1436
1437 ucpu = curcpu()->ci_data.cpu_uvm;
1438 if (!ucpu->page_idle_zero ||
1439 ucpu->pages[PGFL_UNKNOWN] < uvmexp.ncolors) {
1440 ucpu->page_idle_zero = false;
1441 return;
1442 }
1443 if (!mutex_tryenter(&uvm_fpageqlock)) {
1444 /* Contention: let other CPUs use the lock. */
1445 return;
1446 }
1447 firstbucket = ucpu->page_free_nextcolor;
1448 nextbucket = firstbucket;
1449 do {
1450 for (free_list = 0; free_list < VM_NFREELIST; free_list++) {
1451 if (sched_curcpu_runnable_p()) {
1452 goto quit;
1453 }
1454 pgfl = &ucpu->page_free[free_list];
1455 gpgfl = &uvm.page_free[free_list];
1456 while ((pg = LIST_FIRST(&pgfl->pgfl_buckets[
1457 nextbucket].pgfl_queues[PGFL_UNKNOWN])) != NULL) {
1458 if (lcont || sched_curcpu_runnable_p()) {
1459 goto quit;
1460 }
1461 LIST_REMOVE(pg, pageq.list); /* global list */
1462 LIST_REMOVE(pg, listq.list); /* per-cpu list */
1463 ucpu->pages[PGFL_UNKNOWN]--;
1464 uvmexp.free--;
1465 KASSERT(pg->flags == PG_FREE);
1466 pg->flags = 0;
1467 mutex_spin_exit(&uvm_fpageqlock);
1468 #ifdef PMAP_PAGEIDLEZERO
1469 if (!PMAP_PAGEIDLEZERO(VM_PAGE_TO_PHYS(pg))) {
1470
1471 /*
1472 * The machine-dependent code detected
1473 * some reason for us to abort zeroing
1474 * pages, probably because there is a
1475 * process now ready to run.
1476 */
1477
1478 mutex_spin_enter(&uvm_fpageqlock);
1479 pg->flags = PG_FREE;
1480 LIST_INSERT_HEAD(&gpgfl->pgfl_buckets[
1481 nextbucket].pgfl_queues[
1482 PGFL_UNKNOWN], pg, pageq.list);
1483 LIST_INSERT_HEAD(&pgfl->pgfl_buckets[
1484 nextbucket].pgfl_queues[
1485 PGFL_UNKNOWN], pg, listq.list);
1486 ucpu->pages[PGFL_UNKNOWN]++;
1487 uvmexp.free++;
1488 uvmexp.zeroaborts++;
1489 goto quit;
1490 }
1491 #else
1492 pmap_zero_page(VM_PAGE_TO_PHYS(pg));
1493 #endif /* PMAP_PAGEIDLEZERO */
1494 if (!mutex_tryenter(&uvm_fpageqlock)) {
1495 lcont = true;
1496 mutex_spin_enter(&uvm_fpageqlock);
1497 } else {
1498 lcont = false;
1499 }
1500 pg->flags = PG_FREE | PG_ZERO;
1501 LIST_INSERT_HEAD(&gpgfl->pgfl_buckets[
1502 nextbucket].pgfl_queues[PGFL_ZEROS],
1503 pg, pageq.list);
1504 LIST_INSERT_HEAD(&pgfl->pgfl_buckets[
1505 nextbucket].pgfl_queues[PGFL_ZEROS],
1506 pg, listq.list);
1507 ucpu->pages[PGFL_ZEROS]++;
1508 uvmexp.free++;
1509 CPU_COUNT(CPU_COUNT_ZEROPAGES, 1);
1510 }
1511 }
1512 if (ucpu->pages[PGFL_UNKNOWN] < uvmexp.ncolors) {
1513 break;
1514 }
1515 nextbucket = (nextbucket + 1) & uvmexp.colormask;
1516 } while (nextbucket != firstbucket);
1517 ucpu->page_idle_zero = false;
1518 quit:
1519 mutex_spin_exit(&uvm_fpageqlock);
1520 }
1521
1522 /*
1523 * uvm_pagelookup: look up a page
1524 *
1525 * => caller should lock object to keep someone from pulling the page
1526 * out from under it
1527 */
1528
1529 struct vm_page *
1530 uvm_pagelookup(struct uvm_object *obj, voff_t off)
1531 {
1532 struct vm_page *pg;
1533
1534 /* No - used from DDB. KASSERT(mutex_owned(obj->vmobjlock)); */
1535
1536 pg = radix_tree_lookup_node(&obj->uo_pages, off >> PAGE_SHIFT);
1537
1538 KASSERT(pg == NULL || obj->uo_npages != 0);
1539 KASSERT(pg == NULL || (pg->flags & (PG_RELEASED|PG_PAGEOUT)) == 0 ||
1540 (pg->flags & PG_BUSY) != 0);
1541 return pg;
1542 }
1543
1544 /*
1545 * uvm_pagewire: wire the page, thus removing it from the daemon's grasp
1546 *
1547 * => caller must lock objects
1548 */
1549
1550 void
1551 uvm_pagewire(struct vm_page *pg)
1552 {
1553
1554 KASSERT(uvm_page_locked_p(pg));
1555 #if defined(READAHEAD_STATS)
1556 if ((pg->flags & PG_READAHEAD) != 0) {
1557 uvm_ra_hit.ev_count++;
1558 pg->flags &= ~PG_READAHEAD;
1559 }
1560 #endif /* defined(READAHEAD_STATS) */
1561 if (pg->wire_count == 0) {
1562 uvm_pagedequeue(pg);
1563 atomic_inc_uint(&uvmexp.wired);
1564 }
1565 mutex_enter(&pg->interlock);
1566 pg->wire_count++;
1567 mutex_exit(&pg->interlock);
1568 KASSERT(pg->wire_count > 0); /* detect wraparound */
1569 }
1570
1571 /*
1572 * uvm_pageunwire: unwire the page.
1573 *
1574 * => activate if wire count goes to zero.
1575 * => caller must lock objects
1576 */
1577
1578 void
1579 uvm_pageunwire(struct vm_page *pg)
1580 {
1581
1582 KASSERT(uvm_page_locked_p(pg));
1583 KASSERT(pg->wire_count != 0);
1584 KASSERT(!uvmpdpol_pageisqueued_p(pg));
1585 mutex_enter(&pg->interlock);
1586 pg->wire_count--;
1587 mutex_exit(&pg->interlock);
1588 if (pg->wire_count == 0) {
1589 uvm_pageactivate(pg);
1590 KASSERT(uvmexp.wired != 0);
1591 atomic_dec_uint(&uvmexp.wired);
1592 }
1593 }
1594
1595 /*
1596 * uvm_pagedeactivate: deactivate page
1597 *
1598 * => caller must lock objects
1599 * => caller must check to make sure page is not wired
1600 * => object that page belongs to must be locked (so we can adjust pg->flags)
1601 * => caller must clear the reference on the page before calling
1602 */
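/*
 * Typical call sequence (a sketch):
 *
 *	pmap_clear_reference(pg);
 *	uvm_pagedeactivate(pg);
 */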
1603
1604 void
1605 uvm_pagedeactivate(struct vm_page *pg)
1606 {
1607
1608 KASSERT(uvm_page_locked_p(pg));
1609 if (pg->wire_count == 0) {
1610 KASSERT(uvmpdpol_pageisqueued_p(pg));
1611 uvmpdpol_pagedeactivate(pg);
1612 }
1613 }
1614
1615 /*
1616 * uvm_pageactivate: activate page
1617 *
1618 * => caller must lock objects
1619 */
1620
1621 void
1622 uvm_pageactivate(struct vm_page *pg)
1623 {
1624
1625 KASSERT(uvm_page_locked_p(pg));
1626 #if defined(READAHEAD_STATS)
1627 if ((pg->flags & PG_READAHEAD) != 0) {
1628 uvm_ra_hit.ev_count++;
1629 pg->flags &= ~PG_READAHEAD;
1630 }
1631 #endif /* defined(READAHEAD_STATS) */
1632 if (pg->wire_count == 0) {
1633 uvmpdpol_pageactivate(pg);
1634 }
1635 }
1636
1637 /*
1638 * uvm_pagedequeue: remove a page from any paging queue
1639 *
1640 * => caller must lock objects
1641 */
1642 void
1643 uvm_pagedequeue(struct vm_page *pg)
1644 {
1645
1646 KASSERT(uvm_page_locked_p(pg));
1647 if (uvmpdpol_pageisqueued_p(pg)) {
1648 uvmpdpol_pagedequeue(pg);
1649 }
1650 }
1651
1652 /*
1653 * uvm_pageenqueue: add a page to a paging queue without activating.
1654 * used where a page is not really demanded (yet), e.g. read-ahead.
1655 *
1656 * => caller must lock objects
1657 */
1658 void
1659 uvm_pageenqueue(struct vm_page *pg)
1660 {
1661
1662 KASSERT(uvm_page_locked_p(pg));
1663 if (pg->wire_count == 0 && !uvmpdpol_pageisqueued_p(pg)) {
1664 uvmpdpol_pageenqueue(pg);
1665 }
1666 }
1667
1668 /*
1669 * uvm_pagezero: zero fill a page
1670 *
1671 * => if page is part of an object then the object should be locked
1672 * to protect pg->flags.
1673 */
1674
1675 void
1676 uvm_pagezero(struct vm_page *pg)
1677 {
1678 pg->flags &= ~PG_CLEAN;
1679 pmap_zero_page(VM_PAGE_TO_PHYS(pg));
1680 }
1681
1682 /*
1683 * uvm_pagecopy: copy a page
1684 *
1685 * => if page is part of an object then the object should be locked
1686 * to protect pg->flags.
1687 */
1688
1689 void
1690 uvm_pagecopy(struct vm_page *src, struct vm_page *dst)
1691 {
1692
1693 dst->flags &= ~PG_CLEAN;
1694 pmap_copy_page(VM_PAGE_TO_PHYS(src), VM_PAGE_TO_PHYS(dst));
1695 }
1696
1697 /*
1698 * uvm_pageismanaged: test to see whether a page (specified by PA) is managed.
1699 */
1700
1701 bool
1702 uvm_pageismanaged(paddr_t pa)
1703 {
1704
1705 return (uvm_physseg_find(atop(pa), NULL) != UVM_PHYSSEG_TYPE_INVALID);
1706 }
1707
1708 /*
1709 * uvm_page_lookup_freelist: look up the free list for the specified page
1710 */
1711
1712 int
1713 uvm_page_lookup_freelist(struct vm_page *pg)
1714 {
1715 uvm_physseg_t upm;
1716
1717 upm = uvm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), NULL);
1718 KASSERT(upm != UVM_PHYSSEG_TYPE_INVALID);
1719 return uvm_physseg_get_free_list(upm);
1720 }
1721
1722 /*
1723 * uvm_page_locked_p: return true if the object associated with the page
1724 * is locked. this is a weak check for runtime assertions only.
1725 */
1726
1727 bool
1728 uvm_page_locked_p(struct vm_page *pg)
1729 {
1730
1731 if (pg->uobject != NULL) {
1732 return mutex_owned(pg->uobject->vmobjlock);
1733 }
1734 if (pg->uanon != NULL) {
1735 return mutex_owned(pg->uanon->an_lock);
1736 }
1737 return true;
1738 }
1739
1740 #ifdef PMAP_DIRECT
1741 /*
1742 * Call pmap to translate a physical address into a virtual one and run a
1743 * callback on it. This avoids actually mapping the pages; the pmap most
1744 * likely uses a direct map or equivalent.
1745 */
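/*
 * Example callback (hypothetical, for illustration only): summing the
 * bytes of each window that the pmap hands back.
 *
 *	static int
 *	sum_cb(void *win, size_t len, void *arg)
 *	{
 *		uint64_t *sump = arg;
 *		const uint8_t *p = win;
 *
 *		while (len--)
 *			*sump += *p++;
 *		return 0;
 *	}
 *
 *	error = uvm_direct_process(pgs, npages, off, len, sum_cb, &sum);
 */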
1746 int
1747 uvm_direct_process(struct vm_page **pgs, u_int npages, voff_t off, vsize_t len,
1748 int (*process)(void *, size_t, void *), void *arg)
1749 {
1750 int error = 0;
1751 paddr_t pa;
1752 size_t todo;
1753 voff_t pgoff = (off & PAGE_MASK);
1754 struct vm_page *pg;
1755
1756 KASSERT(npages > 0 && len > 0);
1757
1758 for (int i = 0; i < npages; i++) {
1759 pg = pgs[i];
1760
1761 KASSERT(len > 0);
1762
1763 /*
1764 * Caller is responsible for ensuring all the pages are
1765 * available.
1766 */
1767 KASSERT(pg != NULL && pg != PGO_DONTCARE);
1768
1769 pa = VM_PAGE_TO_PHYS(pg);
1770 todo = MIN(len, PAGE_SIZE - pgoff);
1771
1772 error = pmap_direct_process(pa, pgoff, todo, process, arg);
1773 if (error)
1774 break;
1775
1776 pgoff = 0;
1777 len -= todo;
1778 }
1779
1780 KASSERTMSG(error != 0 || len == 0, "len %lu != 0 for non-error", len);
1781 return error;
1782 }
1783 #endif /* PMAP_DIRECT */
1784
1785 #if defined(DDB) || defined(DEBUGPRINT)
1786
1787 /*
1788 * uvm_page_printit: actually print the page
1789 */
1790
1791 static const char page_flagbits[] = UVM_PGFLAGBITS;
1792
1793 void
1794 uvm_page_printit(struct vm_page *pg, bool full,
1795 void (*pr)(const char *, ...))
1796 {
1797 struct vm_page *tpg;
1798 struct uvm_object *uobj;
1799 struct pgflist *pgl;
1800 char pgbuf[128];
1801
1802 (*pr)("PAGE %p:\n", pg);
1803 snprintb(pgbuf, sizeof(pgbuf), page_flagbits, pg->flags);
1804 (*pr)(" flags=%s, pqflags=%x, wire_count=%d, pa=0x%lx\n",
1805 pgbuf, pg->pqflags, pg->wire_count, (long)VM_PAGE_TO_PHYS(pg));
1806 (*pr)(" uobject=%p, uanon=%p, offset=0x%llx loan_count=%d\n",
1807 pg->uobject, pg->uanon, (long long)pg->offset, pg->loan_count);
1808 #if defined(UVM_PAGE_TRKOWN)
1809 if (pg->flags & PG_BUSY)
1810 (*pr)(" owning process = %d, tag=%s\n",
1811 pg->owner, pg->owner_tag);
1812 else
1813 (*pr)(" page not busy, no owner\n");
1814 #else
1815 (*pr)(" [page ownership tracking disabled]\n");
1816 #endif
1817
1818 if (!full)
1819 return;
1820
1821 /* cross-verify object/anon */
1822 if ((pg->flags & PG_FREE) == 0) {
1823 if (pg->flags & PG_ANON) {
1824 if (pg->uanon == NULL || pg->uanon->an_page != pg)
1825 (*pr)(" >>> ANON DOES NOT POINT HERE <<< (%p)\n",
1826 (pg->uanon) ? pg->uanon->an_page : NULL);
1827 else
1828 (*pr)(" anon backpointer is OK\n");
1829 } else {
1830 uobj = pg->uobject;
1831 if (uobj) {
1832 (*pr)(" checking object list\n");
1833 tpg = uvm_pagelookup(uobj, pg->offset);
1834 if (tpg)
1835 (*pr)(" page found on object list\n");
1836 else
1837 (*pr)(" >>> PAGE NOT FOUND ON OBJECT LIST! <<<\n");
1838 }
1839 }
1840 }
1841
1842 /* cross-verify page queue */
1843 if (pg->flags & PG_FREE) {
1844 int fl = uvm_page_lookup_freelist(pg);
1845 int color = VM_PGCOLOR_BUCKET(pg);
1846 pgl = &uvm.page_free[fl].pgfl_buckets[color].pgfl_queues[
1847 ((pg)->flags & PG_ZERO) ? PGFL_ZEROS : PGFL_UNKNOWN];
1848 } else {
1849 pgl = NULL;
1850 }
1851
1852 if (pgl) {
1853 (*pr)(" checking pageq list\n");
1854 LIST_FOREACH(tpg, pgl, pageq.list) {
1855 if (tpg == pg) {
1856 break;
1857 }
1858 }
1859 if (tpg)
1860 (*pr)(" page found on pageq list\n");
1861 else
1862 (*pr)(" >>> PAGE NOT FOUND ON PAGEQ LIST! <<<\n");
1863 }
1864 }
1865
1866 /*
1867 * uvm_page_printall - print a summary of all managed pages
1868 */
1869
1870 void
1871 uvm_page_printall(void (*pr)(const char *, ...))
1872 {
1873 uvm_physseg_t i;
1874 paddr_t pfn;
1875 struct vm_page *pg;
1876
1877 (*pr)("%18s %4s %4s %18s %18s"
1878 #ifdef UVM_PAGE_TRKOWN
1879 " OWNER"
1880 #endif
1881 "\n", "PAGE", "FLAG", "PQ", "UOBJECT", "UANON");
1882 for (i = uvm_physseg_get_first();
1883 uvm_physseg_valid_p(i);
1884 i = uvm_physseg_get_next(i)) {
1885 for (pfn = uvm_physseg_get_start(i);
1886 pfn < uvm_physseg_get_end(i);
1887 pfn++) {
1888 pg = PHYS_TO_VM_PAGE(ptoa(pfn));
1889
1890 (*pr)("%18p %04x %08x %18p %18p",
1891 pg, pg->flags, pg->pqflags, pg->uobject,
1892 pg->uanon);
1893 #ifdef UVM_PAGE_TRKOWN
1894 if (pg->flags & PG_BUSY)
1895 (*pr)(" %d [%s]", pg->owner, pg->owner_tag);
1896 #endif
1897 (*pr)("\n");
1898 }
1899 }
1900 }
1901
1902 #endif /* DDB || DEBUGPRINT */
1903