1 /*	$NetBSD: uvm_page.c,v 1.212 2019/12/22 16:37:36 ad Exp $	*/
2
3 /*
4 * Copyright (c) 1997 Charles D. Cranor and Washington University.
5 * Copyright (c) 1991, 1993, The Regents of the University of California.
6 *
7 * All rights reserved.
8 *
9 * This code is derived from software contributed to Berkeley by
10 * The Mach Operating System project at Carnegie-Mellon University.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 * @(#)vm_page.c 8.3 (Berkeley) 3/21/94
37 * from: Id: uvm_page.c,v 1.1.2.18 1998/02/06 05:24:42 chs Exp
38 *
39 *
40 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
41 * All rights reserved.
42 *
43 * Permission to use, copy, modify and distribute this software and
44 * its documentation is hereby granted, provided that both the copyright
45 * notice and this permission notice appear in all copies of the
46 * software, derivative works or modified versions, and any portions
47 * thereof, and that both notices appear in supporting documentation.
48 *
49 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
50 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
51 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
52 *
53 * Carnegie Mellon requests users of this software to return to
54 *
55 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
56 * School of Computer Science
57 * Carnegie Mellon University
58 * Pittsburgh PA 15213-3890
59 *
60 * any improvements or extensions that they make and grant Carnegie the
61 * rights to redistribute these changes.
62 */
63
64 /*
65 * uvm_page.c: page ops.
66 */
67
68 #include <sys/cdefs.h>
69 __KERNEL_RCSID(0, "$NetBSD: uvm_page.c,v 1.212 2019/12/22 16:37:36 ad Exp $");
70
71 #include "opt_ddb.h"
72 #include "opt_uvm.h"
73 #include "opt_uvmhist.h"
74 #include "opt_readahead.h"
75
76 #include <sys/param.h>
77 #include <sys/systm.h>
78 #include <sys/sched.h>
79 #include <sys/kernel.h>
80 #include <sys/vnode.h>
81 #include <sys/proc.h>
82 #include <sys/radixtree.h>
83 #include <sys/atomic.h>
84 #include <sys/cpu.h>
85 #include <sys/extent.h>
86
87 #include <uvm/uvm.h>
88 #include <uvm/uvm_ddb.h>
89 #include <uvm/uvm_pdpolicy.h>
90
91 /*
92 * Some supported CPUs in a given architecture don't support all
93 * of the things necessary to do idle page zero'ing efficiently.
94 * We therefore provide a way to enable it from machdep code here.
95 */
96 bool vm_page_zero_enable = false;
97
98 /*
99 * number of pages per-CPU to reserve for the kernel.
100 */
101 #ifndef UVM_RESERVED_PAGES_PER_CPU
102 #define UVM_RESERVED_PAGES_PER_CPU 5
103 #endif
104 int vm_page_reserve_kernel = UVM_RESERVED_PAGES_PER_CPU;
105
106 /*
107 * physical memory size;
108 */
109 psize_t physmem;
110
111 /*
112 * local variables
113 */
114
115 /*
116 * these variables record the values returned by vm_page_bootstrap,
117 * for debugging purposes. The implementation of uvm_pageboot_alloc
118 * and pmap_startup here also uses them internally.
119 */
120
121 static vaddr_t virtual_space_start;
122 static vaddr_t virtual_space_end;
123
124 /*
125 * we allocate an initial number of page colors in uvm_page_init(),
126 * and remember them. We may re-color pages as cache sizes are
127 * discovered during the autoconfiguration phase. But we can never
128 * free the initial set of buckets, since they are allocated using
129 * uvm_pageboot_alloc().
130 */
131
132 static size_t recolored_pages_memsize /* = 0 */;
133
134 #ifdef DEBUG
135 vaddr_t uvm_zerocheckkva;
136 #endif /* DEBUG */
137
138 /*
139 * These functions are reserved for uvm(9) internal use and are not
140 * exported in the header file uvm_physseg.h
141 *
142 * Thus they are redefined here.
143 */
144 void uvm_physseg_init_seg(uvm_physseg_t, struct vm_page *);
145 void uvm_physseg_seg_chomp_slab(uvm_physseg_t, struct vm_page *, size_t);
146
147 /* returns a pgs array */
148 struct vm_page *uvm_physseg_seg_alloc_from_slab(uvm_physseg_t, size_t);
149
150 /*
151 * inline functions
152 */
153
154 /*
155 * uvm_pageinsert: insert a page in the object.
156 *
157 * => caller must lock object
158 * => caller should have already set pg's object and offset pointers
159 * and bumped the version counter
160 */
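/*
 * Illustrative sketch (not part of the original file) of the usual
 * two-step insert, as performed by uvm_pagealloc_strat() later in this
 * file; the object insert is rolled back if the radix tree node
 * allocation fails (pg->interlock is held around the object step):
 *
 *	pg->offset = off;
 *	pg->uobject = uobj;
 *	uvm_pageinsert_object(uobj, pg);
 *	error = uvm_pageinsert_tree(uobj, pg);
 *	if (error != 0) {
 *		uvm_pageremove_object(uobj, pg);
 *		... dispose of pg ...
 *	}
 */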
161
162 static inline void
163 uvm_pageinsert_object(struct uvm_object *uobj, struct vm_page *pg)
164 {
165
166 KASSERT(uobj == pg->uobject);
167 KASSERT(mutex_owned(uobj->vmobjlock));
168 KASSERT((pg->flags & PG_TABLED) == 0);
169
170 if (UVM_OBJ_IS_VNODE(uobj)) {
171 if (uobj->uo_npages == 0) {
172 struct vnode *vp = (struct vnode *)uobj;
173
174 vholdl(vp);
175 }
176 if (UVM_OBJ_IS_VTEXT(uobj)) {
177 cpu_count(CPU_COUNT_EXECPAGES, 1);
178 } else {
179 cpu_count(CPU_COUNT_FILEPAGES, 1);
180 }
181 } else if (UVM_OBJ_IS_AOBJ(uobj)) {
182 cpu_count(CPU_COUNT_ANONPAGES, 1);
183 }
184 pg->flags |= PG_TABLED;
185 uobj->uo_npages++;
186 }
187
188 static inline int
189 uvm_pageinsert_tree(struct uvm_object *uobj, struct vm_page *pg)
190 {
191 const uint64_t idx = pg->offset >> PAGE_SHIFT;
192 int error;
193
194 error = radix_tree_insert_node(&uobj->uo_pages, idx, pg);
195 if (error != 0) {
196 return error;
197 }
198 return 0;
199 }
200
201 /*
202 * uvm_page_remove: remove page from object.
203 *
204 * => caller must lock object
205 */
206
207 static inline void
208 uvm_pageremove_object(struct uvm_object *uobj, struct vm_page *pg)
209 {
210
211 KASSERT(uobj == pg->uobject);
212 KASSERT(mutex_owned(uobj->vmobjlock));
213 KASSERT(pg->flags & PG_TABLED);
214
215 if (UVM_OBJ_IS_VNODE(uobj)) {
216 if (uobj->uo_npages == 1) {
217 struct vnode *vp = (struct vnode *)uobj;
218
219 holdrelel(vp);
220 }
221 if (UVM_OBJ_IS_VTEXT(uobj)) {
222 cpu_count(CPU_COUNT_EXECPAGES, -1);
223 } else {
224 cpu_count(CPU_COUNT_FILEPAGES, -1);
225 }
226 } else if (UVM_OBJ_IS_AOBJ(uobj)) {
227 cpu_count(CPU_COUNT_ANONPAGES, -1);
228 }
229
230 /* object should be locked */
231 uobj->uo_npages--;
232 pg->flags &= ~PG_TABLED;
233 pg->uobject = NULL;
234 }
235
236 static inline void
237 uvm_pageremove_tree(struct uvm_object *uobj, struct vm_page *pg)
238 {
239 struct vm_page *opg __unused;
240
241 opg = radix_tree_remove_node(&uobj->uo_pages, pg->offset >> PAGE_SHIFT);
242 KASSERT(pg == opg);
243 }
244
245 static void
246 uvm_page_init_buckets(struct pgfreelist *pgfl)
247 {
248 int color, i;
249
250 for (color = 0; color < uvmexp.ncolors; color++) {
251 for (i = 0; i < PGFL_NQUEUES; i++) {
252 LIST_INIT(&pgfl->pgfl_buckets[color].pgfl_queues[i]);
253 }
254 }
255 }
256
257 /*
258 * uvm_page_init: init the page system. called from uvm_init().
259 *
260 * => we return the range of kernel virtual memory in kvm_startp/kvm_endp
261 */
262
263 void
264 uvm_page_init(vaddr_t *kvm_startp, vaddr_t *kvm_endp)
265 {
266 static struct uvm_cpu boot_cpu;
267 psize_t freepages, pagecount, bucketcount, n;
268 struct pgflbucket *bucketarray, *cpuarray;
269 struct vm_page *pagearray;
270 uvm_physseg_t bank;
271 int lcv;
272
273 KASSERT(ncpu <= 1);
274 CTASSERT(sizeof(pagearray->offset) >= sizeof(struct uvm_cpu *));
275
276 /*
277 * init the page queues and free page queue lock, except the
278 * free list; we allocate that later (with the initial vm_page
279 * structures).
280 */
281
282 uvm.cpus[0] = &boot_cpu;
283 curcpu()->ci_data.cpu_uvm = &boot_cpu;
284 uvmpdpol_init();
285 mutex_init(&uvm_fpageqlock, MUTEX_DRIVER, IPL_VM);
286
287 /*
288 * allocate vm_page structures.
289 */
290
291 /*
292 * sanity check:
293 * before calling this function the MD code is expected to register
294 * some free RAM with the uvm_page_physload() function. our job
295 * now is to allocate vm_page structures for this memory.
296 */
297
298 if (uvm_physseg_get_last() == UVM_PHYSSEG_TYPE_INVALID)
299 panic("uvm_page_bootstrap: no memory pre-allocated");
300
301 /*
302 * first calculate the number of free pages...
303 *
304 * note that we use start/end rather than avail_start/avail_end.
305 * this allows us to allocate extra vm_page structures in case we
306 * want to return some memory to the pool after booting.
307 */
308
309 freepages = 0;
310
311 for (bank = uvm_physseg_get_first();
312 uvm_physseg_valid_p(bank) ;
313 bank = uvm_physseg_get_next(bank)) {
314 freepages += (uvm_physseg_get_end(bank) - uvm_physseg_get_start(bank));
315 }
316
317 /*
318 * Let MD code initialize the number of colors, or default
319 * to 1 color if MD code doesn't care.
320 */
321 if (uvmexp.ncolors == 0)
322 uvmexp.ncolors = 1;
323 uvmexp.colormask = uvmexp.ncolors - 1;
324 KASSERT((uvmexp.colormask & uvmexp.ncolors) == 0);
325
326 /*
327 * we now know we have (PAGE_SIZE * freepages) bytes of memory we can
328 * use. for each page of memory we use we need a vm_page structure.
329 * thus, the total number of pages we can use is the total size of
330 * the memory divided by the PAGE_SIZE plus the size of the vm_page
331 * structure. we add one to freepages as a fudge factor to avoid
332 * truncation errors (since we can only allocate in terms of whole
333 * pages).
334 */
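	/*
	 * Worked example (illustrative only; sizeof(struct vm_page) is
	 * assumed to be about 100 bytes here and varies by architecture):
	 * with 4 KiB pages and freepages = 262144 (1 GiB of free RAM),
	 *
	 *	pagecount = ((262144 + 1) << 12) / (4096 + 100)
	 *	          = 1073745920 / 4196
	 *	          ~= 255897
	 *
	 * i.e. roughly 97.6% of the raw pages remain usable; the rest of
	 * the memory holds the vm_page array itself.
	 */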
335
336 bucketcount = uvmexp.ncolors * VM_NFREELIST;
337 pagecount = ((freepages + 1) << PAGE_SHIFT) /
338 (PAGE_SIZE + sizeof(struct vm_page));
339
340 bucketarray = (void *)uvm_pageboot_alloc((bucketcount *
341 sizeof(struct pgflbucket) * 2) + (pagecount *
342 sizeof(struct vm_page)));
343 cpuarray = bucketarray + bucketcount;
344 pagearray = (struct vm_page *)(bucketarray + bucketcount * 2);
345
346 for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
347 uvm.page_free[lcv].pgfl_buckets =
348 (bucketarray + (lcv * uvmexp.ncolors));
349 uvm_page_init_buckets(&uvm.page_free[lcv]);
350 uvm.cpus[0]->page_free[lcv].pgfl_buckets =
351 (cpuarray + (lcv * uvmexp.ncolors));
352 uvm_page_init_buckets(&uvm.cpus[0]->page_free[lcv]);
353 }
354 memset(pagearray, 0, pagecount * sizeof(struct vm_page));
355
356 /*
357 * init the vm_page structures and put them in the correct place.
358 */
359 /* First init the extent */
360
361 for (bank = uvm_physseg_get_first(),
362 uvm_physseg_seg_chomp_slab(bank, pagearray, pagecount);
363 uvm_physseg_valid_p(bank);
364 bank = uvm_physseg_get_next(bank)) {
365
366 n = uvm_physseg_get_end(bank) - uvm_physseg_get_start(bank);
367 uvm_physseg_seg_alloc_from_slab(bank, n);
368 uvm_physseg_init_seg(bank, pagearray);
369
370 /* set up page array pointers */
371 pagearray += n;
372 pagecount -= n;
373 }
374
375 /*
376 * pass up the values of virtual_space_start and
377 * virtual_space_end (obtained by uvm_pageboot_alloc) to the upper
378 * layers of the VM.
379 */
380
381 *kvm_startp = round_page(virtual_space_start);
382 *kvm_endp = trunc_page(virtual_space_end);
383 #ifdef DEBUG
384 /*
385 * steal kva for uvm_pagezerocheck().
386 */
387 uvm_zerocheckkva = *kvm_startp;
388 *kvm_startp += PAGE_SIZE;
389 #endif /* DEBUG */
390
391 /*
392 * init various thresholds.
393 */
394
395 uvmexp.reserve_pagedaemon = 1;
396 uvmexp.reserve_kernel = vm_page_reserve_kernel;
397
398 /*
399 * determine if we should zero pages in the idle loop.
400 */
401
402 uvm.cpus[0]->page_idle_zero = vm_page_zero_enable;
403
404 /*
405 * done!
406 */
407
408 uvm.page_init_done = true;
409 }
410
411 /*
412 * uvm_setpagesize: set the page size
413 *
414 * => sets page_shift and page_mask from uvmexp.pagesize.
415 */
416
417 void
418 uvm_setpagesize(void)
419 {
420
421 /*
422 * If uvmexp.pagesize is 0 at this point, we expect PAGE_SIZE
423 * to be a constant (indicated by being a non-zero value).
424 */
425 if (uvmexp.pagesize == 0) {
426 if (PAGE_SIZE == 0)
427 panic("uvm_setpagesize: uvmexp.pagesize not set");
428 uvmexp.pagesize = PAGE_SIZE;
429 }
430 uvmexp.pagemask = uvmexp.pagesize - 1;
431 if ((uvmexp.pagemask & uvmexp.pagesize) != 0)
432 panic("uvm_setpagesize: page size %u (%#x) not a power of two",
433 uvmexp.pagesize, uvmexp.pagesize);
434 for (uvmexp.pageshift = 0; ; uvmexp.pageshift++)
435 if ((1 << uvmexp.pageshift) == uvmexp.pagesize)
436 break;
437 }
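/*
 * For example, with uvmexp.pagesize = 4096 the loop above yields
 * uvmexp.pagemask = 0xfff and uvmexp.pageshift = 12.
 */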
438
439 /*
440 * uvm_pageboot_alloc: steal memory from physmem for bootstrapping
441 */
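/*
 * Illustrative use (hypothetical caller, not from this file): early
 * bootstrap code that needs permanently wired memory before
 * uvm_page_init() has run, e.g.
 *
 *	pv_table = (struct pv_entry *)uvm_pageboot_alloc(
 *	    npages * sizeof(struct pv_entry));
 *
 * Memory obtained this way is never returned; it is removed from the
 * managed page pool for good.
 */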
442
443 vaddr_t
444 uvm_pageboot_alloc(vsize_t size)
445 {
446 static bool initialized = false;
447 vaddr_t addr;
448 #if !defined(PMAP_STEAL_MEMORY)
449 vaddr_t vaddr;
450 paddr_t paddr;
451 #endif
452
453 /*
454 * on first call to this function, initialize ourselves.
455 */
456 if (initialized == false) {
457 pmap_virtual_space(&virtual_space_start, &virtual_space_end);
458
459 /* round it the way we like it */
460 virtual_space_start = round_page(virtual_space_start);
461 virtual_space_end = trunc_page(virtual_space_end);
462
463 initialized = true;
464 }
465
466 /* round to page size */
467 size = round_page(size);
468 uvmexp.bootpages += atop(size);
469
470 #if defined(PMAP_STEAL_MEMORY)
471
472 /*
473 * defer bootstrap allocation to MD code (it may want to allocate
474 * from a direct-mapped segment). pmap_steal_memory should adjust
475 * virtual_space_start/virtual_space_end if necessary.
476 */
477
478 addr = pmap_steal_memory(size, &virtual_space_start,
479 &virtual_space_end);
480
481 return(addr);
482
483 #else /* !PMAP_STEAL_MEMORY */
484
485 /*
486 * allocate virtual memory for this request
487 */
488 if (virtual_space_start == virtual_space_end ||
489 (virtual_space_end - virtual_space_start) < size)
490 panic("uvm_pageboot_alloc: out of virtual space");
491
492 addr = virtual_space_start;
493
494 #ifdef PMAP_GROWKERNEL
495 /*
496 * If the kernel pmap can't map the requested space,
497 * then allocate more resources for it.
498 */
499 if (uvm_maxkaddr < (addr + size)) {
500 uvm_maxkaddr = pmap_growkernel(addr + size);
501 if (uvm_maxkaddr < (addr + size))
502 panic("uvm_pageboot_alloc: pmap_growkernel() failed");
503 }
504 #endif
505
506 virtual_space_start += size;
507
508 /*
509 * allocate and mapin physical pages to back new virtual pages
510 */
511
512 for (vaddr = round_page(addr) ; vaddr < addr + size ;
513 vaddr += PAGE_SIZE) {
514
515 if (!uvm_page_physget(&paddr))
516 panic("uvm_pageboot_alloc: out of memory");
517
518 /*
519 * Note this memory is no longer managed, so using
520 * pmap_kenter is safe.
521 */
522 pmap_kenter_pa(vaddr, paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
523 }
524 pmap_update(pmap_kernel());
525 return(addr);
526 #endif /* PMAP_STEAL_MEMORY */
527 }
528
529 #if !defined(PMAP_STEAL_MEMORY)
530 /*
531 * uvm_page_physget: "steal" one page from the vm_physmem structure.
532 *
533 * => attempt to allocate it off the end of a segment in which the "avail"
534 * values match the start/end values. if we can't do that, then we
535 * will advance both values (making them equal, and removing some
536 * vm_page structures from the non-avail area).
537 * => return false if out of memory.
538 */
539
540 /* subroutine: try to allocate from memory chunks on the specified freelist */
541 static bool uvm_page_physget_freelist(paddr_t *, int);
542
543 static bool
544 uvm_page_physget_freelist(paddr_t *paddrp, int freelist)
545 {
546 uvm_physseg_t lcv;
547
548 /* pass 1: try allocating from a matching end */
549 #if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
550 for (lcv = uvm_physseg_get_last(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_prev(lcv))
551 #else
552 for (lcv = uvm_physseg_get_first(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_next(lcv))
553 #endif
554 {
555 if (uvm.page_init_done == true)
556 panic("uvm_page_physget: called _after_ bootstrap");
557
558 /* Try to match at front or back on unused segment */
559 if (uvm_page_physunload(lcv, freelist, paddrp))
560 return true;
561 }
562
563 /* pass2: forget about matching ends, just allocate something */
564 #if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
565 for (lcv = uvm_physseg_get_last(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_prev(lcv))
566 #else
567 for (lcv = uvm_physseg_get_first(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_next(lcv))
568 #endif
569 {
570 /* Try the front regardless. */
571 if (uvm_page_physunload_force(lcv, freelist, paddrp))
572 return true;
573 }
574 return false;
575 }
576
577 bool
578 uvm_page_physget(paddr_t *paddrp)
579 {
580 int i;
581
582 /* try in the order of freelist preference */
583 for (i = 0; i < VM_NFREELIST; i++)
584 if (uvm_page_physget_freelist(paddrp, i) == true)
585 return (true);
586 return (false);
587 }
588 #endif /* PMAP_STEAL_MEMORY */
589
590 /*
591 * PHYS_TO_VM_PAGE: find vm_page for a PA. used by MI code to get vm_pages
592 * back from an I/O mapping (ugh!). used in some MD code as well.
593 */
594 struct vm_page *
595 uvm_phys_to_vm_page(paddr_t pa)
596 {
597 paddr_t pf = atop(pa);
598 paddr_t off;
599 uvm_physseg_t upm;
600
601 upm = uvm_physseg_find(pf, &off);
602 if (upm != UVM_PHYSSEG_TYPE_INVALID)
603 return uvm_physseg_get_pg(upm, off);
604 return(NULL);
605 }
606
607 paddr_t
608 uvm_vm_page_to_phys(const struct vm_page *pg)
609 {
610
611 return pg->phys_addr & ~(PAGE_SIZE - 1);
612 }
613
614 /*
615 * uvm_page_recolor: Recolor the pages if the new bucket count is
616 * larger than the old one.
617 */
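/*
 * Illustrative sketch (hypothetical, not from this file): machine-
 * dependent cache probing code might derive a color count from the
 * size of one cache way and pass it in during autoconfiguration:
 *
 *	size_t way_size = l2_cache_size / l2_cache_assoc;
 *	uvm_page_recolor(way_size >> PAGE_SHIFT);
 *
 * The caller must pass a power of two; see the KASSERT below.
 */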
618
619 void
620 uvm_page_recolor(int newncolors)
621 {
622 struct pgflbucket *bucketarray, *cpuarray, *oldbucketarray;
623 struct pgfreelist gpgfl, pgfl;
624 struct vm_page *pg;
625 vsize_t bucketcount;
626 size_t bucketmemsize, oldbucketmemsize;
627 int color, i, ocolors;
628 int lcv;
629 struct uvm_cpu *ucpu;
630
631 KASSERT(((newncolors - 1) & newncolors) == 0);
632
633 if (newncolors <= uvmexp.ncolors)
634 return;
635
636 if (uvm.page_init_done == false) {
637 uvmexp.ncolors = newncolors;
638 return;
639 }
640
641 bucketcount = newncolors * VM_NFREELIST;
642 bucketmemsize = bucketcount * sizeof(struct pgflbucket) * 2;
643 bucketarray = kmem_alloc(bucketmemsize, KM_SLEEP);
644 cpuarray = bucketarray + bucketcount;
645
646 mutex_spin_enter(&uvm_fpageqlock);
647
648 /* Make sure we should still do this. */
649 if (newncolors <= uvmexp.ncolors) {
650 mutex_spin_exit(&uvm_fpageqlock);
651 kmem_free(bucketarray, bucketmemsize);
652 return;
653 }
654
655 oldbucketarray = uvm.page_free[0].pgfl_buckets;
656 ocolors = uvmexp.ncolors;
657
658 uvmexp.ncolors = newncolors;
659 uvmexp.colormask = uvmexp.ncolors - 1;
660
661 ucpu = curcpu()->ci_data.cpu_uvm;
662 for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
663 gpgfl.pgfl_buckets = (bucketarray + (lcv * newncolors));
664 pgfl.pgfl_buckets = (cpuarray + (lcv * uvmexp.ncolors));
665 uvm_page_init_buckets(&gpgfl);
666 uvm_page_init_buckets(&pgfl);
667 for (color = 0; color < ocolors; color++) {
668 for (i = 0; i < PGFL_NQUEUES; i++) {
669 while ((pg = LIST_FIRST(&uvm.page_free[
670 lcv].pgfl_buckets[color].pgfl_queues[i]))
671 != NULL) {
672 LIST_REMOVE(pg, pageq.list); /* global */
673 LIST_REMOVE(pg, listq.list); /* cpu */
674 LIST_INSERT_HEAD(&gpgfl.pgfl_buckets[
675 VM_PGCOLOR(pg)].pgfl_queues[
676 i], pg, pageq.list);
677 LIST_INSERT_HEAD(&pgfl.pgfl_buckets[
678 VM_PGCOLOR(pg)].pgfl_queues[
679 i], pg, listq.list);
680 }
681 }
682 }
683 uvm.page_free[lcv].pgfl_buckets = gpgfl.pgfl_buckets;
684 ucpu->page_free[lcv].pgfl_buckets = pgfl.pgfl_buckets;
685 }
686
687 oldbucketmemsize = recolored_pages_memsize;
688
689 recolored_pages_memsize = bucketmemsize;
690 mutex_spin_exit(&uvm_fpageqlock);
691
692 if (oldbucketmemsize) {
693 kmem_free(oldbucketarray, oldbucketmemsize);
694 }
695
696 /*
697 * this calls uvm_km_alloc() which may want to hold
698 * uvm_fpageqlock.
699 */
700 uvm_pager_realloc_emerg();
701 }
702
703 /*
704 * uvm_cpu_attach: initialize per-CPU data structures.
705 */
706
707 void
708 uvm_cpu_attach(struct cpu_info *ci)
709 {
710 struct pgflbucket *bucketarray;
711 struct pgfreelist pgfl;
712 struct uvm_cpu *ucpu;
713 vsize_t bucketcount;
714 int lcv;
715
716 if (CPU_IS_PRIMARY(ci)) {
717 /* Already done in uvm_page_init(). */
718 goto attachrnd;
719 }
720
721 /* Add more reserve pages for this CPU. */
722 uvmexp.reserve_kernel += vm_page_reserve_kernel;
723
724 /* Configure this CPU's free lists. */
725 bucketcount = uvmexp.ncolors * VM_NFREELIST;
726 bucketarray = kmem_alloc(bucketcount * sizeof(struct pgflbucket),
727 KM_SLEEP);
728 ucpu = kmem_zalloc(sizeof(*ucpu), KM_SLEEP);
729 uvm.cpus[cpu_index(ci)] = ucpu;
730 ci->ci_data.cpu_uvm = ucpu;
731 for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
732 pgfl.pgfl_buckets = (bucketarray + (lcv * uvmexp.ncolors));
733 uvm_page_init_buckets(&pgfl);
734 ucpu->page_free[lcv].pgfl_buckets = pgfl.pgfl_buckets;
735 }
736
737 attachrnd:
738 /*
739 * Attach RNG source for this CPU's VM events
740 */
741 rnd_attach_source(&uvm.cpus[cpu_index(ci)]->rs,
742 ci->ci_data.cpu_name, RND_TYPE_VM,
743 RND_FLAG_COLLECT_TIME|RND_FLAG_COLLECT_VALUE|
744 RND_FLAG_ESTIMATE_VALUE);
745
746 }
747
748 /*
749 * uvm_free: return total number of free pages in system.
750 */
751
752 int
753 uvm_free(void)
754 {
755
756 return uvmexp.free;
757 }
758
759 /*
760 * uvm_pagealloc_pgfl: helper routine for uvm_pagealloc_strat
761 */
762
763 static struct vm_page *
764 uvm_pagealloc_pgfl(struct uvm_cpu *ucpu, int flist, int try1, int try2,
765 int *trycolorp)
766 {
767 struct pgflist *freeq;
768 struct vm_page *pg;
769 int color, trycolor = *trycolorp;
770 struct pgfreelist *gpgfl, *pgfl;
771
772 KASSERT(mutex_owned(&uvm_fpageqlock));
773
774 color = trycolor;
775 pgfl = &ucpu->page_free[flist];
776 gpgfl = &uvm.page_free[flist];
777 do {
778 /* cpu, try1 */
779 if ((pg = LIST_FIRST((freeq =
780 &pgfl->pgfl_buckets[color].pgfl_queues[try1]))) != NULL) {
781 KASSERT(pg->flags & PG_FREE);
782 KASSERT(try1 == PGFL_ZEROS || !(pg->flags & PG_ZERO));
783 KASSERT(try1 == PGFL_UNKNOWN || (pg->flags & PG_ZERO));
784 KASSERT(ucpu == VM_FREE_PAGE_TO_CPU(pg));
785 VM_FREE_PAGE_TO_CPU(pg)->pages[try1]--;
786 CPU_COUNT(CPU_COUNT_CPUHIT, 1);
787 goto gotit;
788 }
789 /* global, try1 */
790 if ((pg = LIST_FIRST((freeq =
791 &gpgfl->pgfl_buckets[color].pgfl_queues[try1]))) != NULL) {
792 KASSERT(pg->flags & PG_FREE);
793 KASSERT(try1 == PGFL_ZEROS || !(pg->flags & PG_ZERO));
794 KASSERT(try1 == PGFL_UNKNOWN || (pg->flags & PG_ZERO));
795 KASSERT(ucpu != VM_FREE_PAGE_TO_CPU(pg));
796 VM_FREE_PAGE_TO_CPU(pg)->pages[try1]--;
797 CPU_COUNT(CPU_COUNT_CPUMISS, 1);
798 goto gotit;
799 }
800 /* cpu, try2 */
801 if ((pg = LIST_FIRST((freeq =
802 &pgfl->pgfl_buckets[color].pgfl_queues[try2]))) != NULL) {
803 KASSERT(pg->flags & PG_FREE);
804 KASSERT(try2 == PGFL_ZEROS || !(pg->flags & PG_ZERO));
805 KASSERT(try2 == PGFL_UNKNOWN || (pg->flags & PG_ZERO));
806 KASSERT(ucpu == VM_FREE_PAGE_TO_CPU(pg));
807 VM_FREE_PAGE_TO_CPU(pg)->pages[try2]--;
808 CPU_COUNT(CPU_COUNT_CPUHIT, 1);
809 goto gotit;
810 }
811 /* global, try2 */
812 if ((pg = LIST_FIRST((freeq =
813 &gpgfl->pgfl_buckets[color].pgfl_queues[try2]))) != NULL) {
814 KASSERT(pg->flags & PG_FREE);
815 KASSERT(try2 == PGFL_ZEROS || !(pg->flags & PG_ZERO));
816 KASSERT(try2 == PGFL_UNKNOWN || (pg->flags & PG_ZERO));
817 KASSERT(ucpu != VM_FREE_PAGE_TO_CPU(pg));
818 VM_FREE_PAGE_TO_CPU(pg)->pages[try2]--;
819 CPU_COUNT(CPU_COUNT_CPUMISS, 1);
820 goto gotit;
821 }
822 color = (color + 1) & uvmexp.colormask;
823 } while (color != trycolor);
824
825 return (NULL);
826
827 gotit:
828 LIST_REMOVE(pg, pageq.list); /* global list */
829 LIST_REMOVE(pg, listq.list); /* per-cpu list */
830 uvmexp.free--;
831
832 if (color == trycolor)
833 CPU_COUNT(CPU_COUNT_COLORHIT, 1);
834 else {
835 CPU_COUNT(CPU_COUNT_COLORMISS, 1);
836 *trycolorp = color;
837 }
838
839 return (pg);
840 }
841
842 /*
843 * uvm_pagealloc_strat: allocate vm_page from a particular free list.
844 *
845 * => return null if no pages free
846 * => wake up pagedaemon if number of free pages drops below low water mark
847 * => if obj != NULL, obj must be locked (to put in obj's tree)
848 * => if anon != NULL, anon must be locked (to put in anon)
849 * => only one of obj or anon can be non-null
850 * => caller must activate/deactivate page if it is not wired.
851 * => free_list is ignored if strat == UVM_PGA_STRAT_NORMAL.
852 * => policy decision: it is more important to pull a page off of the
853 * appropriate priority free list than it is to get a zero'd or
854 * unknown contents page. This is because we live with the
855 * consequences of a bad free list decision for the entire
856 * lifetime of the page, e.g. if the page comes from memory that
857 * is slower to access.
858 */
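/*
 * Illustrative caller sketch (assumes the usual uvm_pagealloc() wrapper
 * from uvm_page.h, which supplies UVM_PGA_STRAT_NORMAL and free list 0):
 *
 *	struct vm_page *pg;
 *
 *	... with uobj->vmobjlock held ...
 *	while ((pg = uvm_pagealloc(uobj, off, NULL, UVM_PGA_ZERO)) == NULL) {
 *		unlock the object, uvm_wait("pgalloc"), relock, retry
 *	}
 *
 * On success the page comes back PG_BUSY|PG_CLEAN|PG_FAKE and must be
 * unbusied by the caller once its contents are valid.
 */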
859
860 struct vm_page *
861 uvm_pagealloc_strat(struct uvm_object *obj, voff_t off, struct vm_anon *anon,
862 int flags, int strat, int free_list)
863 {
864 int try1, try2, zeroit = 0, color;
865 int lcv, error;
866 struct uvm_cpu *ucpu;
867 struct vm_page *pg;
868 lwp_t *l;
869
870 KASSERT(obj == NULL || anon == NULL);
871 KASSERT(anon == NULL || (flags & UVM_FLAG_COLORMATCH) || off == 0);
872 KASSERT(off == trunc_page(off));
873 KASSERT(obj == NULL || mutex_owned(obj->vmobjlock));
874 KASSERT(anon == NULL || anon->an_lock == NULL ||
875 mutex_owned(anon->an_lock));
876
877 /*
878 * This implements a global round-robin page coloring
879 * algorithm.
880 */
881
882 ucpu = curcpu()->ci_data.cpu_uvm;
883 if (flags & UVM_FLAG_COLORMATCH) {
884 color = atop(off) & uvmexp.colormask;
885 } else {
886 color = ucpu->page_free_nextcolor;
887 }
888
889 /*
890 * check to see if we need to generate some free pages by waking
891 * the pagedaemon.
892 */
893
894 uvm_kick_pdaemon();
895
896 /*
897 * fail if any of these conditions is true:
898 * [1] there really are no free pages, or
899 * [2] only kernel "reserved" pages remain and
900 * reserved pages have not been requested.
901 * [3] only pagedaemon "reserved" pages remain and
902 * the requestor isn't the pagedaemon.
903 * we make kernel reserve pages available if called by a
904 * kernel thread or a realtime thread.
905 */
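	/*
	 * Concrete example: with the default UVM_RESERVED_PAGES_PER_CPU (5)
	 * and 4 CPUs attached, uvmexp.reserve_kernel ends up at 20, so an
	 * ordinary allocation fails once uvmexp.free <= 20 unless
	 * UVM_PGA_USERESERVE is set, and only the pagedaemon itself may
	 * dip below uvmexp.reserve_pagedaemon (1).
	 */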
906 mutex_spin_enter(&uvm_fpageqlock);
907 l = curlwp;
908 if (__predict_true(l != NULL) && lwp_eprio(l) >= PRI_KTHREAD) {
909 flags |= UVM_PGA_USERESERVE;
910 }
911 if ((uvmexp.free <= uvmexp.reserve_kernel &&
912 (flags & UVM_PGA_USERESERVE) == 0) ||
913 (uvmexp.free <= uvmexp.reserve_pagedaemon &&
914 curlwp != uvm.pagedaemon_lwp))
915 goto fail;
916
917 #if PGFL_NQUEUES != 2
918 #error uvm_pagealloc_strat needs to be updated
919 #endif
920
921 /*
922 * If we want a zero'd page, try the ZEROS queue first, otherwise
923 * we try the UNKNOWN queue first.
924 */
925 if (flags & UVM_PGA_ZERO) {
926 try1 = PGFL_ZEROS;
927 try2 = PGFL_UNKNOWN;
928 } else {
929 try1 = PGFL_UNKNOWN;
930 try2 = PGFL_ZEROS;
931 }
932
933 again:
934 switch (strat) {
935 case UVM_PGA_STRAT_NORMAL:
936 /* Check freelists: descending priority (ascending id) order */
937 for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
938 pg = uvm_pagealloc_pgfl(ucpu, lcv,
939 try1, try2, &color);
940 if (pg != NULL)
941 goto gotit;
942 }
943
944 /* No pages free! */
945 goto fail;
946
947 case UVM_PGA_STRAT_ONLY:
948 case UVM_PGA_STRAT_FALLBACK:
949 /* Attempt to allocate from the specified free list. */
950 KASSERT(free_list >= 0 && free_list < VM_NFREELIST);
951 pg = uvm_pagealloc_pgfl(ucpu, free_list,
952 try1, try2, &color);
953 if (pg != NULL)
954 goto gotit;
955
956 /* Fall back, if possible. */
957 if (strat == UVM_PGA_STRAT_FALLBACK) {
958 strat = UVM_PGA_STRAT_NORMAL;
959 goto again;
960 }
961
962 /* No pages free! */
963 goto fail;
964
965 default:
966 panic("uvm_pagealloc_strat: bad strat %d", strat);
967 /* NOTREACHED */
968 }
969
970 gotit:
971 /*
972 * We now know which color we actually allocated from; set
973 * the next color accordingly.
974 */
975
976 ucpu->page_free_nextcolor = (color + 1) & uvmexp.colormask;
977
978 /*
979 * update allocation statistics and remember if we have to
980 * zero the page
981 */
982
983 if (flags & UVM_PGA_ZERO) {
984 if (pg->flags & PG_ZERO) {
985 CPU_COUNT(CPU_COUNT_PGA_ZEROHIT, 1);
986 zeroit = 0;
987 } else {
988 CPU_COUNT(CPU_COUNT_PGA_ZEROMISS, 1);
989 zeroit = 1;
990 }
991 if (ucpu->pages[PGFL_ZEROS] < ucpu->pages[PGFL_UNKNOWN]) {
992 ucpu->page_idle_zero = vm_page_zero_enable;
993 }
994 }
995 if (pg->flags & PG_ZERO) {
996 CPU_COUNT(CPU_COUNT_ZEROPAGES, -1);
997 }
998 if (anon) {
999 CPU_COUNT(CPU_COUNT_ANONPAGES, 1);
1000 }
1001 KASSERT((pg->flags & ~(PG_ZERO|PG_FREE)) == 0);
1002
1003 /* mark the page as allocated and then drop uvm_fpageqlock. */
1004 pg->flags &= ~PG_FREE;
1005 mutex_spin_exit(&uvm_fpageqlock);
1006
1007 /*
1008 * assign the page to the object. as the page was free, we know
1009 * that pg->uobject and pg->uanon are NULL. we only need to take
1010 * the page's interlock if we are changing the values.
1011 */
1012 if (anon != NULL || obj != NULL) {
1013 mutex_enter(&pg->interlock);
1014 }
1015 pg->offset = off;
1016 pg->uobject = obj;
1017 pg->uanon = anon;
1018 KASSERT(uvm_page_locked_p(pg));
1019 pg->flags = PG_BUSY|PG_CLEAN|PG_FAKE;
1020 if (anon) {
1021 anon->an_page = pg;
1022 pg->flags |= PG_ANON;
1023 mutex_exit(&pg->interlock);
1024 } else if (obj) {
1025 uvm_pageinsert_object(obj, pg);
1026 mutex_exit(&pg->interlock);
1027 error = uvm_pageinsert_tree(obj, pg);
1028 if (error != 0) {
1029 mutex_enter(&pg->interlock);
1030 uvm_pageremove_object(obj, pg);
1031 mutex_exit(&pg->interlock);
1032 uvm_pagefree(pg);
1033 return NULL;
1034 }
1035 }
1036
1037 #if defined(UVM_PAGE_TRKOWN)
1038 pg->owner_tag = NULL;
1039 #endif
1040 UVM_PAGE_OWN(pg, "new alloc");
1041
1042 if (flags & UVM_PGA_ZERO) {
1043 /*
1044 * A zero'd page is not clean. If we got a page not already
1045 * zero'd, then we have to zero it ourselves.
1046 */
1047 pg->flags &= ~PG_CLEAN;
1048 if (zeroit)
1049 pmap_zero_page(VM_PAGE_TO_PHYS(pg));
1050 }
1051
1052 return(pg);
1053
1054 fail:
1055 mutex_spin_exit(&uvm_fpageqlock);
1056 return (NULL);
1057 }
1058
1059 /*
1060 * uvm_pagereplace: replace a page with another
1061 *
1062 * => object must be locked
1063 */
1064
1065 void
1066 uvm_pagereplace(struct vm_page *oldpg, struct vm_page *newpg)
1067 {
1068 struct uvm_object *uobj = oldpg->uobject;
1069
1070 KASSERT((oldpg->flags & PG_TABLED) != 0);
1071 KASSERT(uobj != NULL);
1072 KASSERT((newpg->flags & PG_TABLED) == 0);
1073 KASSERT(newpg->uobject == NULL);
1074 KASSERT(mutex_owned(uobj->vmobjlock));
1075
1076 newpg->offset = oldpg->offset;
1077 uvm_pageremove_tree(uobj, oldpg);
1078 uvm_pageinsert_tree(uobj, newpg);
1079
1080 /* take page interlocks during rename */
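	/* acquire in ascending address order to avoid ABBA deadlock */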
1081 if (oldpg < newpg) {
1082 mutex_enter(&oldpg->interlock);
1083 mutex_enter(&newpg->interlock);
1084 } else {
1085 mutex_enter(&newpg->interlock);
1086 mutex_enter(&oldpg->interlock);
1087 }
1088 newpg->uobject = uobj;
1089 uvm_pageinsert_object(uobj, newpg);
1090 uvm_pageremove_object(uobj, oldpg);
1091 mutex_exit(&oldpg->interlock);
1092 mutex_exit(&newpg->interlock);
1093 }
1094
1095 /*
1096 * uvm_pagerealloc: reallocate a page from one object to another
1097 *
1098 * => both objects must be locked
1099 * => both interlocks must be held
1100 */
1101
1102 void
1103 uvm_pagerealloc(struct vm_page *pg, struct uvm_object *newobj, voff_t newoff)
1104 {
1105 /*
1106 * remove it from the old object
1107 */
1108
1109 if (pg->uobject) {
1110 uvm_pageremove_tree(pg->uobject, pg);
1111 uvm_pageremove_object(pg->uobject, pg);
1112 }
1113
1114 /*
1115 * put it in the new object
1116 */
1117
1118 if (newobj) {
1119 /*
1120 * XXX we have no in-tree users of this functionality
1121 */
1122 panic("uvm_pagerealloc: no impl");
1123 }
1124 }
1125
1126 #ifdef DEBUG
1127 /*
1128 * check if page is zero-filled
1129 */
1130 void
1131 uvm_pagezerocheck(struct vm_page *pg)
1132 {
1133 int *p, *ep;
1134
1135 KASSERT(uvm_zerocheckkva != 0);
1136 KASSERT(mutex_owned(&uvm_fpageqlock));
1137
1138 /*
1139 * XXX assuming pmap_kenter_pa and pmap_kremove never call
1140 * uvm page allocator.
1141 *
1142 * it might be better to have a "CPU-local temporary map" pmap interface.
1143 */
1144 pmap_kenter_pa(uvm_zerocheckkva, VM_PAGE_TO_PHYS(pg), VM_PROT_READ, 0);
1145 p = (int *)uvm_zerocheckkva;
1146 ep = (int *)((char *)p + PAGE_SIZE);
1147 pmap_update(pmap_kernel());
1148 while (p < ep) {
1149 if (*p != 0)
1150 panic("PG_ZERO page isn't zero-filled");
1151 p++;
1152 }
1153 pmap_kremove(uvm_zerocheckkva, PAGE_SIZE);
1154 /*
1155 * pmap_update() is not necessary here because no one except us
1156 * uses this VA.
1157 */
1158 }
1159 #endif /* DEBUG */
1160
1161 /*
1162 * uvm_pagefree: free page
1163 *
1164 * => erase page's identity (i.e. remove from object)
1165 * => put page on free list
1166 * => caller must lock owning object (either anon or uvm_object)
1167 * => assumes all valid mappings of pg are gone
1168 */
1169
1170 void
1171 uvm_pagefree(struct vm_page *pg)
1172 {
1173 struct pgflist *pgfl;
1174 struct uvm_cpu *ucpu;
1175 int index, color, queue;
1176 bool iszero, locked;
1177
1178 #ifdef DEBUG
1179 if (pg->uobject == (void *)0xdeadbeef &&
1180 pg->uanon == (void *)0xdeadbeef) {
1181 panic("uvm_pagefree: freeing free page %p", pg);
1182 }
1183 #endif /* DEBUG */
1184
1185 KASSERT((pg->flags & PG_PAGEOUT) == 0);
1186 KASSERT(!(pg->flags & PG_FREE));
1187 //KASSERT(mutex_owned(&uvm_pageqlock) || !uvmpdpol_pageisqueued_p(pg));
1188 KASSERT(pg->uobject == NULL || mutex_owned(pg->uobject->vmobjlock));
1189 KASSERT(pg->uobject != NULL || pg->uanon == NULL ||
1190 mutex_owned(pg->uanon->an_lock));
1191
1192 /*
1193 * remove the page from the object's tree before acquiring any page
1194 * interlocks: this can acquire locks to free radixtree nodes.
1195 */
1196 if (pg->uobject != NULL) {
1197 uvm_pageremove_tree(pg->uobject, pg);
1198 }
1199
1200 /*
1201 * if the page is loaned, resolve the loan instead of freeing.
1202 */
1203
1204 if (pg->loan_count) {
1205 KASSERT(pg->wire_count == 0);
1206
1207 /*
1208 * if the page is owned by an anon then we just want to
1209 * drop anon ownership. the kernel will free the page when
1210 * it is done with it. if the page is owned by an object,
1211 * remove it from the object and mark it dirty for the benefit
1212 * of possible anon owners.
1213 *
1214 * regardless of previous ownership, wakeup any waiters,
1215 * unbusy the page, and we're done.
1216 */
1217
1218 mutex_enter(&pg->interlock);
1219 locked = true;
1220 if (pg->uobject != NULL) {
1221 uvm_pageremove_object(pg->uobject, pg);
1222 pg->flags &= ~PG_CLEAN;
1223 } else if (pg->uanon != NULL) {
1224 if ((pg->flags & PG_ANON) == 0) {
1225 pg->loan_count--;
1226 } else {
1227 pg->flags &= ~PG_ANON;
1228 cpu_count(CPU_COUNT_ANONPAGES, -1);
1229 }
1230 pg->uanon->an_page = NULL;
1231 pg->uanon = NULL;
1232 }
1233 if (pg->flags & PG_WANTED) {
1234 wakeup(pg);
1235 }
1236 pg->flags &= ~(PG_WANTED|PG_BUSY|PG_RELEASED|PG_PAGER1);
1237 #ifdef UVM_PAGE_TRKOWN
1238 pg->owner_tag = NULL;
1239 #endif
1240 if (pg->loan_count) {
1241 KASSERT(pg->uobject == NULL);
1242 mutex_exit(&pg->interlock);
1243 if (pg->uanon == NULL) {
1244 uvm_pagedequeue(pg);
1245 }
1246 return;
1247 }
1248 } else if (pg->uobject != NULL || pg->uanon != NULL ||
1249 pg->wire_count != 0) {
1250 mutex_enter(&pg->interlock);
1251 locked = true;
1252 } else {
1253 locked = false;
1254 }
1255
1256 /*
1257 * remove page from its object or anon.
1258 */
1259 if (pg->uobject != NULL) {
1260 uvm_pageremove_object(pg->uobject, pg);
1261 } else if (pg->uanon != NULL) {
1262 pg->uanon->an_page = NULL;
1263 pg->uanon = NULL;
1264 cpu_count(CPU_COUNT_ANONPAGES, -1);
1265 }
1266
1267 /*
1268 * if the page was wired, unwire it now.
1269 */
1270
1271 if (pg->wire_count) {
1272 pg->wire_count = 0;
1273 atomic_dec_uint(&uvmexp.wired);
1274 }
1275 if (locked) {
1276 mutex_exit(&pg->interlock);
1277 }
1278
1279 /*
1280 * now remove the page from the queues.
1281 */
1282 uvm_pagedequeue(pg);
1283
1284 /*
1285 * and put on free queue
1286 */
1287
1288 iszero = (pg->flags & PG_ZERO);
1289 index = uvm_page_get_freelist(pg);
1290 color = VM_PGCOLOR(pg);
1291 queue = (iszero ? PGFL_ZEROS : PGFL_UNKNOWN);
1292
1293 #ifdef DEBUG
1294 pg->uobject = (void *)0xdeadbeef;
1295 pg->uanon = (void *)0xdeadbeef;
1296 #endif
1297
1298 mutex_spin_enter(&uvm_fpageqlock);
1299 pg->flags = PG_FREE;
1300
1301 #ifdef DEBUG
1302 if (iszero)
1303 uvm_pagezerocheck(pg);
1304 #endif /* DEBUG */
1305
1306
1307 /* global list */
1308 pgfl = &uvm.page_free[index].pgfl_buckets[color].pgfl_queues[queue];
1309 LIST_INSERT_HEAD(pgfl, pg, pageq.list);
1310 uvmexp.free++;
1311 if (iszero) {
1312 CPU_COUNT(CPU_COUNT_ZEROPAGES, 1);
1313 }
1314
1315 /* per-cpu list */
1316 ucpu = curcpu()->ci_data.cpu_uvm;
1317 pg->offset = (uintptr_t)ucpu;
1318 pgfl = &ucpu->page_free[index].pgfl_buckets[color].pgfl_queues[queue];
1319 LIST_INSERT_HEAD(pgfl, pg, listq.list);
1320 ucpu->pages[queue]++;
1321 if (ucpu->pages[PGFL_ZEROS] < ucpu->pages[PGFL_UNKNOWN]) {
1322 ucpu->page_idle_zero = vm_page_zero_enable;
1323 }
1324
1325 mutex_spin_exit(&uvm_fpageqlock);
1326 }
1327
1328 /*
1329 * uvm_page_unbusy: unbusy an array of pages.
1330 *
1331 * => pages must either all belong to the same object, or all belong to anons.
1332 * => if pages are object-owned, object must be locked.
1333 * => if pages are anon-owned, anons must be locked.
1334 * => caller must make sure that anon-owned pages are not PG_RELEASED.
1335 */
1336
1337 void
1338 uvm_page_unbusy(struct vm_page **pgs, int npgs)
1339 {
1340 struct vm_page *pg;
1341 int i;
1342 UVMHIST_FUNC("uvm_page_unbusy"); UVMHIST_CALLED(ubchist);
1343
1344 for (i = 0; i < npgs; i++) {
1345 pg = pgs[i];
1346 if (pg == NULL || pg == PGO_DONTCARE) {
1347 continue;
1348 }
1349
1350 KASSERT(uvm_page_locked_p(pg));
1351 KASSERT(pg->flags & PG_BUSY);
1352 KASSERT((pg->flags & PG_PAGEOUT) == 0);
1353 if (pg->flags & PG_WANTED) {
1354 /* XXXAD thundering herd problem. */
1355 wakeup(pg);
1356 }
1357 if (pg->flags & PG_RELEASED) {
1358 UVMHIST_LOG(ubchist, "releasing pg %#jx",
1359 (uintptr_t)pg, 0, 0, 0);
1360 KASSERT(pg->uobject != NULL ||
1361 (pg->uanon != NULL && pg->uanon->an_ref > 0));
1362 pg->flags &= ~PG_RELEASED;
1363 uvm_pagefree(pg);
1364 } else {
1365 UVMHIST_LOG(ubchist, "unbusying pg %#jx",
1366 (uintptr_t)pg, 0, 0, 0);
1367 KASSERT((pg->flags & PG_FAKE) == 0);
1368 pg->flags &= ~(PG_WANTED|PG_BUSY);
1369 UVM_PAGE_OWN(pg, NULL);
1370 }
1371 }
1372 }
1373
1374 #if defined(UVM_PAGE_TRKOWN)
1375 /*
1376 * uvm_page_own: set or release page ownership
1377 *
1378 * => this is a debugging function that keeps track of who sets PG_BUSY
1379 * and where they do it. it can be used to track down problems
1380 * such as a process setting "PG_BUSY" and never releasing it.
1381 * => page's object [if any] must be locked
1382 * => if "tag" is NULL then we are releasing page ownership
1383 */
1384 void
1385 uvm_page_own(struct vm_page *pg, const char *tag)
1386 {
1387
1388 KASSERT((pg->flags & (PG_PAGEOUT|PG_RELEASED)) == 0);
1389 KASSERT((pg->flags & PG_WANTED) == 0);
1390 KASSERT(uvm_page_locked_p(pg));
1391
1392 /* gain ownership? */
1393 if (tag) {
1394 KASSERT((pg->flags & PG_BUSY) != 0);
1395 if (pg->owner_tag) {
1396 printf("uvm_page_own: page %p already owned "
1397 "by proc %d [%s]\n", pg,
1398 pg->owner, pg->owner_tag);
1399 panic("uvm_page_own");
1400 }
1401 pg->owner = curproc->p_pid;
1402 pg->lowner = curlwp->l_lid;
1403 pg->owner_tag = tag;
1404 return;
1405 }
1406
1407 /* drop ownership */
1408 KASSERT((pg->flags & PG_BUSY) == 0);
1409 if (pg->owner_tag == NULL) {
1410 printf("uvm_page_own: dropping ownership of a non-owned "
1411 "page (%p)\n", pg);
1412 panic("uvm_page_own");
1413 }
1414 if (!uvmpdpol_pageisqueued_p(pg)) {
1415 KASSERT((pg->uanon == NULL && pg->uobject == NULL) ||
1416 pg->wire_count > 0);
1417 } else {
1418 KASSERT(pg->wire_count == 0);
1419 }
1420 pg->owner_tag = NULL;
1421 }
1422 #endif
1423
1424 /*
1425 * uvm_pageidlezero: zero free pages while the system is idle.
1426 *
1427 * => try to complete one color bucket at a time, to reduce our impact
1428 * on the CPU cache.
1429 * => we loop until we either reach the target or there is a lwp ready
1430 * to run, or MD code detects a reason to break early.
1431 */
1432 void
1433 uvm_pageidlezero(void)
1434 {
1435 struct vm_page *pg;
1436 struct pgfreelist *pgfl, *gpgfl;
1437 struct uvm_cpu *ucpu;
1438 int free_list, firstbucket, nextbucket;
1439 bool lcont = false;
1440
1441 ucpu = curcpu()->ci_data.cpu_uvm;
1442 if (!ucpu->page_idle_zero ||
1443 ucpu->pages[PGFL_UNKNOWN] < uvmexp.ncolors) {
1444 ucpu->page_idle_zero = false;
1445 return;
1446 }
1447 if (!mutex_tryenter(&uvm_fpageqlock)) {
1448 /* Contention: let other CPUs use the lock. */
1449 return;
1450 }
1451 firstbucket = ucpu->page_free_nextcolor;
1452 nextbucket = firstbucket;
1453 do {
1454 for (free_list = 0; free_list < VM_NFREELIST; free_list++) {
1455 if (sched_curcpu_runnable_p()) {
1456 goto quit;
1457 }
1458 pgfl = &ucpu->page_free[free_list];
1459 gpgfl = &uvm.page_free[free_list];
1460 while ((pg = LIST_FIRST(&pgfl->pgfl_buckets[
1461 nextbucket].pgfl_queues[PGFL_UNKNOWN])) != NULL) {
1462 if (lcont || sched_curcpu_runnable_p()) {
1463 goto quit;
1464 }
1465 LIST_REMOVE(pg, pageq.list); /* global list */
1466 LIST_REMOVE(pg, listq.list); /* per-cpu list */
1467 ucpu->pages[PGFL_UNKNOWN]--;
1468 uvmexp.free--;
1469 KASSERT(pg->flags == PG_FREE);
1470 pg->flags = 0;
1471 mutex_spin_exit(&uvm_fpageqlock);
1472 #ifdef PMAP_PAGEIDLEZERO
1473 if (!PMAP_PAGEIDLEZERO(VM_PAGE_TO_PHYS(pg))) {
1474
1475 /*
1476 * The machine-dependent code detected
1477 * some reason for us to abort zeroing
1478 * pages, probably because there is a
1479 * process now ready to run.
1480 */
1481
1482 mutex_spin_enter(&uvm_fpageqlock);
1483 pg->flags = PG_FREE;
1484 LIST_INSERT_HEAD(&gpgfl->pgfl_buckets[
1485 nextbucket].pgfl_queues[
1486 PGFL_UNKNOWN], pg, pageq.list);
1487 LIST_INSERT_HEAD(&pgfl->pgfl_buckets[
1488 nextbucket].pgfl_queues[
1489 PGFL_UNKNOWN], pg, listq.list);
1490 ucpu->pages[PGFL_UNKNOWN]++;
1491 uvmexp.free++;
1492 uvmexp.zeroaborts++;
1493 goto quit;
1494 }
1495 #else
1496 pmap_zero_page(VM_PAGE_TO_PHYS(pg));
1497 #endif /* PMAP_PAGEIDLEZERO */
1498 if (!mutex_tryenter(&uvm_fpageqlock)) {
1499 lcont = true;
1500 mutex_spin_enter(&uvm_fpageqlock);
1501 } else {
1502 lcont = false;
1503 }
1504 pg->flags = PG_FREE | PG_ZERO;
1505 LIST_INSERT_HEAD(&gpgfl->pgfl_buckets[
1506 nextbucket].pgfl_queues[PGFL_ZEROS],
1507 pg, pageq.list);
1508 LIST_INSERT_HEAD(&pgfl->pgfl_buckets[
1509 nextbucket].pgfl_queues[PGFL_ZEROS],
1510 pg, listq.list);
1511 ucpu->pages[PGFL_ZEROS]++;
1512 uvmexp.free++;
1513 CPU_COUNT(CPU_COUNT_ZEROPAGES, 1);
1514 }
1515 }
1516 if (ucpu->pages[PGFL_UNKNOWN] < uvmexp.ncolors) {
1517 break;
1518 }
1519 nextbucket = (nextbucket + 1) & uvmexp.colormask;
1520 } while (nextbucket != firstbucket);
1521 ucpu->page_idle_zero = false;
1522 quit:
1523 mutex_spin_exit(&uvm_fpageqlock);
1524 }
1525
1526 /*
1527 * uvm_pagelookup: look up a page
1528 *
1529 * => caller should lock object to keep someone from pulling the page
1530 * out from under it
1531 */
1532
1533 struct vm_page *
1534 uvm_pagelookup(struct uvm_object *obj, voff_t off)
1535 {
1536 struct vm_page *pg;
1537
1538 /* No - used from DDB. KASSERT(mutex_owned(obj->vmobjlock)); */
1539
1540 pg = radix_tree_lookup_node(&obj->uo_pages, off >> PAGE_SHIFT);
1541
1542 KASSERT(pg == NULL || obj->uo_npages != 0);
1543 KASSERT(pg == NULL || (pg->flags & (PG_RELEASED|PG_PAGEOUT)) == 0 ||
1544 (pg->flags & PG_BUSY) != 0);
1545 return pg;
1546 }
1547
1548 /*
1549 * uvm_pagewire: wire the page, thus removing it from the daemon's grasp
1550 *
1551 * => caller must lock objects
1552 */
1553
1554 void
1555 uvm_pagewire(struct vm_page *pg)
1556 {
1557
1558 KASSERT(uvm_page_locked_p(pg));
1559 #if defined(READAHEAD_STATS)
1560 if ((pg->flags & PG_READAHEAD) != 0) {
1561 uvm_ra_hit.ev_count++;
1562 pg->flags &= ~PG_READAHEAD;
1563 }
1564 #endif /* defined(READAHEAD_STATS) */
1565 if (pg->wire_count == 0) {
1566 uvm_pagedequeue(pg);
1567 atomic_inc_uint(&uvmexp.wired);
1568 }
1569 mutex_enter(&pg->interlock);
1570 pg->wire_count++;
1571 mutex_exit(&pg->interlock);
1572 KASSERT(pg->wire_count > 0); /* detect wraparound */
1573 }
1574
1575 /*
1576 * uvm_pageunwire: unwire the page.
1577 *
1578 * => activate if wire count goes to zero.
1579 * => caller must lock objects
1580 */
1581
1582 void
1583 uvm_pageunwire(struct vm_page *pg)
1584 {
1585
1586 KASSERT(uvm_page_locked_p(pg));
1587 KASSERT(pg->wire_count != 0);
1588 KASSERT(!uvmpdpol_pageisqueued_p(pg));
1589 mutex_enter(&pg->interlock);
1590 pg->wire_count--;
1591 mutex_exit(&pg->interlock);
1592 if (pg->wire_count == 0) {
1593 uvm_pageactivate(pg);
1594 KASSERT(uvmexp.wired != 0);
1595 atomic_dec_uint(&uvmexp.wired);
1596 }
1597 }
1598
1599 /*
1600 * uvm_pagedeactivate: deactivate page
1601 *
1602 * => caller must lock objects
1603 * => caller must check to make sure page is not wired
1604 * => object that page belongs to must be locked (so we can adjust pg->flags)
1605 * => caller must clear the reference on the page before calling
1606 */
1607
1608 void
1609 uvm_pagedeactivate(struct vm_page *pg)
1610 {
1611
1612 KASSERT(uvm_page_locked_p(pg));
1613 if (pg->wire_count == 0) {
1614 KASSERT(uvmpdpol_pageisqueued_p(pg));
1615 uvmpdpol_pagedeactivate(pg);
1616 }
1617 }
1618
1619 /*
1620 * uvm_pageactivate: activate page
1621 *
1622 * => caller must lock objects
1623 */
1624
1625 void
1626 uvm_pageactivate(struct vm_page *pg)
1627 {
1628
1629 KASSERT(uvm_page_locked_p(pg));
1630 #if defined(READAHEAD_STATS)
1631 if ((pg->flags & PG_READAHEAD) != 0) {
1632 uvm_ra_hit.ev_count++;
1633 pg->flags &= ~PG_READAHEAD;
1634 }
1635 #endif /* defined(READAHEAD_STATS) */
1636 if (pg->wire_count == 0) {
1637 uvmpdpol_pageactivate(pg);
1638 }
1639 }
1640
1641 /*
1642 * uvm_pagedequeue: remove a page from any paging queue
1643 *
1644 * => caller must lock objects
1645 */
1646 void
1647 uvm_pagedequeue(struct vm_page *pg)
1648 {
1649
1650 KASSERT(uvm_page_locked_p(pg));
1651 if (uvmpdpol_pageisqueued_p(pg)) {
1652 uvmpdpol_pagedequeue(pg);
1653 }
1654 }
1655
1656 /*
1657 * uvm_pageenqueue: add a page to a paging queue without activating.
1658 * used where a page is not really demanded (yet), e.g. read-ahead.
1659 *
1660 * => caller must lock objects
1661 */
1662 void
1663 uvm_pageenqueue(struct vm_page *pg)
1664 {
1665
1666 KASSERT(uvm_page_locked_p(pg));
1667 if (pg->wire_count == 0 && !uvmpdpol_pageisqueued_p(pg)) {
1668 uvmpdpol_pageenqueue(pg);
1669 }
1670 }
1671
1672 /*
1673 * uvm_pagezero: zero fill a page
1674 *
1675 * => if page is part of an object then the object should be locked
1676 * to protect pg->flags.
1677 */
1678
1679 void
1680 uvm_pagezero(struct vm_page *pg)
1681 {
1682 pg->flags &= ~PG_CLEAN;
1683 pmap_zero_page(VM_PAGE_TO_PHYS(pg));
1684 }
1685
1686 /*
1687 * uvm_pagecopy: copy a page
1688 *
1689 * => if page is part of an object then the object should be locked
1690 * to protect pg->flags.
1691 */
1692
1693 void
1694 uvm_pagecopy(struct vm_page *src, struct vm_page *dst)
1695 {
1696
1697 dst->flags &= ~PG_CLEAN;
1698 pmap_copy_page(VM_PAGE_TO_PHYS(src), VM_PAGE_TO_PHYS(dst));
1699 }
1700
1701 /*
1702 * uvm_pageismanaged: test whether a page (specified by PA) is managed.
1703 */
1704
1705 bool
1706 uvm_pageismanaged(paddr_t pa)
1707 {
1708
1709 return (uvm_physseg_find(atop(pa), NULL) != UVM_PHYSSEG_TYPE_INVALID);
1710 }
1711
1712 /*
1713 * uvm_page_lookup_freelist: look up the free list for the specified page
1714 */
1715
1716 int
1717 uvm_page_lookup_freelist(struct vm_page *pg)
1718 {
1719 uvm_physseg_t upm;
1720
1721 upm = uvm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), NULL);
1722 KASSERT(upm != UVM_PHYSSEG_TYPE_INVALID);
1723 return uvm_physseg_get_free_list(upm);
1724 }
1725
1726 /*
1727 * uvm_page_locked_p: return true if object associated with page is
1728 * locked. this is a weak check for runtime assertions only.
1729 */
1730
1731 bool
1732 uvm_page_locked_p(struct vm_page *pg)
1733 {
1734
1735 if (pg->uobject != NULL) {
1736 return mutex_owned(pg->uobject->vmobjlock);
1737 }
1738 if (pg->uanon != NULL) {
1739 return mutex_owned(pg->uanon->an_lock);
1740 }
1741 return true;
1742 }
1743
1744 #ifdef PMAP_DIRECT
1745 /*
1746 * Call pmap to translate each physical address into a virtual one and to
1747 * run a callback on it. Used to avoid actually mapping the pages; the pmap
1748 * most likely uses a direct map or equivalent.
1749 */
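/*
 * Illustrative callback sketch (hypothetical helper, not part of this
 * file): copy each directly-mapped chunk into a caller-supplied buffer.
 *
 *	struct copy_arg { char *dst; };
 *
 *	static int
 *	copy_cb(void *va, size_t len, void *arg)
 *	{
 *		struct copy_arg *ca = arg;
 *
 *		memcpy(ca->dst, va, len);
 *		ca->dst += len;
 *		return 0;
 *	}
 *
 * and then: uvm_direct_process(pgs, npages, off, len, copy_cb, &ca);
 */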
1750 int
1751 uvm_direct_process(struct vm_page **pgs, u_int npages, voff_t off, vsize_t len,
1752 int (*process)(void *, size_t, void *), void *arg)
1753 {
1754 int error = 0;
1755 paddr_t pa;
1756 size_t todo;
1757 voff_t pgoff = (off & PAGE_MASK);
1758 struct vm_page *pg;
1759
1760 KASSERT(npages > 0 && len > 0);
1761
1762 for (int i = 0; i < npages; i++) {
1763 pg = pgs[i];
1764
1765 KASSERT(len > 0);
1766
1767 /*
1768 * Caller is responsible for ensuring all the pages are
1769 * available.
1770 */
1771 KASSERT(pg != NULL && pg != PGO_DONTCARE);
1772
1773 pa = VM_PAGE_TO_PHYS(pg);
1774 todo = MIN(len, PAGE_SIZE - pgoff);
1775
1776 error = pmap_direct_process(pa, pgoff, todo, process, arg);
1777 if (error)
1778 break;
1779
1780 pgoff = 0;
1781 len -= todo;
1782 }
1783
1784 KASSERTMSG(error != 0 || len == 0, "len %lu != 0 for non-error", len);
1785 return error;
1786 }
1787 #endif /* PMAP_DIRECT */
1788
1789 #if defined(DDB) || defined(DEBUGPRINT)
1790
1791 /*
1792 * uvm_page_printit: actually print the page
1793 */
1794
1795 static const char page_flagbits[] = UVM_PGFLAGBITS;
1796
1797 void
1798 uvm_page_printit(struct vm_page *pg, bool full,
1799 void (*pr)(const char *, ...))
1800 {
1801 struct vm_page *tpg;
1802 struct uvm_object *uobj;
1803 struct pgflist *pgl;
1804 char pgbuf[128];
1805
1806 (*pr)("PAGE %p:\n", pg);
1807 snprintb(pgbuf, sizeof(pgbuf), page_flagbits, pg->flags);
1808 (*pr)(" flags=%s, pqflags=%x, wire_count=%d, pa=0x%lx\n",
1809 pgbuf, pg->pqflags, pg->wire_count, (long)VM_PAGE_TO_PHYS(pg));
1810 (*pr)(" uobject=%p, uanon=%p, offset=0x%llx loan_count=%d\n",
1811 pg->uobject, pg->uanon, (long long)pg->offset, pg->loan_count);
1812 (*pr)(" bucket=%d freelist=%d\n",
1813 uvm_page_get_bucket(pg), uvm_page_get_freelist(pg));
1814 #if defined(UVM_PAGE_TRKOWN)
1815 if (pg->flags & PG_BUSY)
1816 (*pr)(" owning process = %d, tag=%s\n",
1817 pg->owner, pg->owner_tag);
1818 else
1819 (*pr)(" page not busy, no owner\n");
1820 #else
1821 (*pr)(" [page ownership tracking disabled]\n");
1822 #endif
1823
1824 if (!full)
1825 return;
1826
1827 /* cross-verify object/anon */
1828 if ((pg->flags & PG_FREE) == 0) {
1829 if (pg->flags & PG_ANON) {
1830 if (pg->uanon == NULL || pg->uanon->an_page != pg)
1831 (*pr)(" >>> ANON DOES NOT POINT HERE <<< (%p)\n",
1832 (pg->uanon) ? pg->uanon->an_page : NULL);
1833 else
1834 (*pr)(" anon backpointer is OK\n");
1835 } else {
1836 uobj = pg->uobject;
1837 if (uobj) {
1838 (*pr)(" checking object list\n");
1839 tpg = uvm_pagelookup(uobj, pg->offset);
1840 if (tpg)
1841 (*pr)(" page found on object list\n");
1842 else
1843 (*pr)(" >>> PAGE NOT FOUND ON OBJECT LIST! <<<\n");
1844 }
1845 }
1846 }
1847
1848 /* cross-verify page queue */
1849 if (pg->flags & PG_FREE) {
1850 int fl = uvm_page_get_freelist(pg);
1851 int color = VM_PGCOLOR(pg);
1852 pgl = &uvm.page_free[fl].pgfl_buckets[color].pgfl_queues[
1853 ((pg)->flags & PG_ZERO) ? PGFL_ZEROS : PGFL_UNKNOWN];
1854 } else {
1855 pgl = NULL;
1856 }
1857
1858 if (pgl) {
1859 (*pr)(" checking pageq list\n");
1860 LIST_FOREACH(tpg, pgl, pageq.list) {
1861 if (tpg == pg) {
1862 break;
1863 }
1864 }
1865 if (tpg)
1866 (*pr)(" page found on pageq list\n");
1867 else
1868 (*pr)(" >>> PAGE NOT FOUND ON PAGEQ LIST! <<<\n");
1869 }
1870 }
1871
1872 /*
1873 * uvm_page_printall - print a summary of all managed pages
1874 */
1875
1876 void
1877 uvm_page_printall(void (*pr)(const char *, ...))
1878 {
1879 uvm_physseg_t i;
1880 paddr_t pfn;
1881 struct vm_page *pg;
1882
1883 (*pr)("%18s %4s %4s %18s %18s"
1884 #ifdef UVM_PAGE_TRKOWN
1885 " OWNER"
1886 #endif
1887 "\n", "PAGE", "FLAG", "PQ", "UOBJECT", "UANON");
1888 for (i = uvm_physseg_get_first();
1889 uvm_physseg_valid_p(i);
1890 i = uvm_physseg_get_next(i)) {
1891 for (pfn = uvm_physseg_get_start(i);
1892 pfn < uvm_physseg_get_end(i);
1893 pfn++) {
1894 pg = PHYS_TO_VM_PAGE(ptoa(pfn));
1895
1896 (*pr)("%18p %04x %08x %18p %18p",
1897 pg, pg->flags, pg->pqflags, pg->uobject,
1898 pg->uanon);
1899 #ifdef UVM_PAGE_TRKOWN
1900 if (pg->flags & PG_BUSY)
1901 (*pr)(" %d [%s]", pg->owner, pg->owner_tag);
1902 #endif
1903 (*pr)("\n");
1904 }
1905 }
1906 }
1907
1908 #endif /* DDB || DEBUGPRINT */
1909