1 /* $NetBSD: uvm_page.c,v 1.250 2020/12/20 11:11:34 skrll Exp $ */
2
3 /*-
4 * Copyright (c) 2019, 2020 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * Copyright (c) 1997 Charles D. Cranor and Washington University.
34 * Copyright (c) 1991, 1993, The Regents of the University of California.
35 *
36 * All rights reserved.
37 *
38 * This code is derived from software contributed to Berkeley by
39 * The Mach Operating System project at Carnegie-Mellon University.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 * 1. Redistributions of source code must retain the above copyright
45 * notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright
47 * notice, this list of conditions and the following disclaimer in the
48 * documentation and/or other materials provided with the distribution.
49 * 3. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vm_page.c 8.3 (Berkeley) 3/21/94
66 * from: Id: uvm_page.c,v 1.1.2.18 1998/02/06 05:24:42 chs Exp
67 *
68 *
69 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
70 * All rights reserved.
71 *
72 * Permission to use, copy, modify and distribute this software and
73 * its documentation is hereby granted, provided that both the copyright
74 * notice and this permission notice appear in all copies of the
75 * software, derivative works or modified versions, and any portions
76 * thereof, and that both notices appear in supporting documentation.
77 *
78 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
79 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
80 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
81 *
82 * Carnegie Mellon requests users of this software to return to
83 *
84 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
85 * School of Computer Science
86 * Carnegie Mellon University
87 * Pittsburgh PA 15213-3890
88 *
89 * any improvements or extensions that they make and grant Carnegie the
90 * rights to redistribute these changes.
91 */
92
93 /*
94 * uvm_page.c: page ops.
95 */
96
97 #include <sys/cdefs.h>
98 __KERNEL_RCSID(0, "$NetBSD: uvm_page.c,v 1.250 2020/12/20 11:11:34 skrll Exp $");
99
100 #include "opt_ddb.h"
101 #include "opt_uvm.h"
102 #include "opt_uvmhist.h"
103 #include "opt_readahead.h"
104
105 #include <sys/param.h>
106 #include <sys/systm.h>
107 #include <sys/sched.h>
108 #include <sys/kernel.h>
109 #include <sys/vnode.h>
110 #include <sys/proc.h>
111 #include <sys/radixtree.h>
112 #include <sys/atomic.h>
113 #include <sys/cpu.h>
114
115 #include <uvm/uvm.h>
116 #include <uvm/uvm_ddb.h>
117 #include <uvm/uvm_pdpolicy.h>
118 #include <uvm/uvm_pgflcache.h>
119
120 /*
121 * number of pages per-CPU to reserve for the kernel.
122 */
123 #ifndef UVM_RESERVED_PAGES_PER_CPU
124 #define UVM_RESERVED_PAGES_PER_CPU 5
125 #endif
126 int vm_page_reserve_kernel = UVM_RESERVED_PAGES_PER_CPU;
127
128 /*
129 * physical memory size;
130 */
131 psize_t physmem;
132
133 /*
134 * local variables
135 */
136
137 /*
138 * these variables record the values returned by vm_page_bootstrap,
139 * for debugging purposes. The implementation of uvm_pageboot_alloc
140 * and pmap_startup here also uses them internally.
141 */
142
143 static vaddr_t virtual_space_start;
144 static vaddr_t virtual_space_end;
145
146 /*
147 * we allocate an initial number of page colors in uvm_page_init(),
148 * and remember them. We may re-color pages as cache sizes are
149 * discovered during the autoconfiguration phase. But we can never
150 * free the initial set of buckets, since they are allocated using
151 * uvm_pageboot_alloc().
152 */
153
154 static size_t recolored_pages_memsize /* = 0 */;
155 static char *recolored_pages_mem;
156
157 /*
158 * freelist locks - one per bucket.
159 */
160
161 union uvm_freelist_lock uvm_freelist_locks[PGFL_MAX_BUCKETS]
162 __cacheline_aligned;
163
164 /*
165 * basic NUMA information.
166 */
167
168 static struct uvm_page_numa_region {
169 struct uvm_page_numa_region *next;
170 paddr_t start;
171 paddr_t size;
172 u_int numa_id;
173 } *uvm_page_numa_region;
174
175 #ifdef DEBUG
176 kmutex_t uvm_zerochecklock __cacheline_aligned;
177 vaddr_t uvm_zerocheckkva;
178 #endif /* DEBUG */
179
180 /*
181 * These functions are reserved for uvm(9) internal use and are not
182 * exported in the header file uvm_physseg.h
183 *
184 * Thus they are redefined here.
185 */
186 void uvm_physseg_init_seg(uvm_physseg_t, struct vm_page *);
187 void uvm_physseg_seg_chomp_slab(uvm_physseg_t, struct vm_page *, size_t);
188
189 /* returns a pgs array */
190 struct vm_page *uvm_physseg_seg_alloc_from_slab(uvm_physseg_t, size_t);
191
192 /*
193 * inline functions
194 */
195
196 /*
197 * uvm_pageinsert: insert a page in the object.
198 *
199 * => caller must lock object
200 * => caller should have already set pg's object and offset pointers
201 * and bumped the version counter
202 */
203
204 static inline void
205 uvm_pageinsert_object(struct uvm_object *uobj, struct vm_page *pg)
206 {
207
208 KASSERT(uobj == pg->uobject);
209 KASSERT(rw_write_held(uobj->vmobjlock));
210 KASSERT((pg->flags & PG_TABLED) == 0);
211
212 if ((pg->flags & PG_STAT) != 0) {
213 /* Cannot use uvm_pagegetdirty(): not yet in radix tree. */
214 const unsigned int status = pg->flags & (PG_CLEAN | PG_DIRTY);
215
216 if ((pg->flags & PG_FILE) != 0) {
217 if (uobj->uo_npages == 0) {
218 struct vnode *vp = (struct vnode *)uobj;
219 mutex_enter(vp->v_interlock);
220 KASSERT((vp->v_iflag & VI_PAGES) == 0);
221 vp->v_iflag |= VI_PAGES;
222 vholdl(vp);
223 mutex_exit(vp->v_interlock);
224 }
225 if (UVM_OBJ_IS_VTEXT(uobj)) {
226 cpu_count(CPU_COUNT_EXECPAGES, 1);
227 }
228 cpu_count(CPU_COUNT_FILEUNKNOWN + status, 1);
229 } else {
230 cpu_count(CPU_COUNT_ANONUNKNOWN + status, 1);
231 }
232 }
233 pg->flags |= PG_TABLED;
234 uobj->uo_npages++;
235 }
236
237 static inline int
238 uvm_pageinsert_tree(struct uvm_object *uobj, struct vm_page *pg)
239 {
240 const uint64_t idx = pg->offset >> PAGE_SHIFT;
241 int error;
242
243 KASSERT(rw_write_held(uobj->vmobjlock));
244
245 error = radix_tree_insert_node(&uobj->uo_pages, idx, pg);
246 if (error != 0) {
247 return error;
248 }
249 if ((pg->flags & PG_CLEAN) == 0) {
250 uvm_obj_page_set_dirty(pg);
251 }
252 KASSERT(((pg->flags & PG_CLEAN) == 0) ==
253 uvm_obj_page_dirty_p(pg));
254 return 0;
255 }
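
/*
 * Illustrative pairing of the two insert halves (a sketch mirroring what
 * uvm_pagealloc_strat() and uvm_pagerealloc() do below; locking and error
 * handling abbreviated):
 *
 *	uvm_pageinsert_object(uobj, pg);	  - object state and counters
 *	error = uvm_pageinsert_tree(uobj, pg);	  - can fail (e.g. ENOMEM)
 *	if (error != 0)
 *		uvm_pageremove_object(uobj, pg);  - roll back the first half
 */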
256
257 /*
258 * uvm_page_remove: remove page from object.
259 *
260 * => caller must lock object
261 */
262
263 static inline void
264 uvm_pageremove_object(struct uvm_object *uobj, struct vm_page *pg)
265 {
266
267 KASSERT(uobj == pg->uobject);
268 KASSERT(rw_write_held(uobj->vmobjlock));
269 KASSERT(pg->flags & PG_TABLED);
270
271 if ((pg->flags & PG_STAT) != 0) {
272 /* Cannot use uvm_pagegetdirty(): no longer in radix tree. */
273 const unsigned int status = pg->flags & (PG_CLEAN | PG_DIRTY);
274
275 if ((pg->flags & PG_FILE) != 0) {
276 if (uobj->uo_npages == 1) {
277 struct vnode *vp = (struct vnode *)uobj;
278 mutex_enter(vp->v_interlock);
279 KASSERT((vp->v_iflag & VI_PAGES) != 0);
280 vp->v_iflag &= ~VI_PAGES;
281 holdrelel(vp);
282 mutex_exit(vp->v_interlock);
283 }
284 if (UVM_OBJ_IS_VTEXT(uobj)) {
285 cpu_count(CPU_COUNT_EXECPAGES, -1);
286 }
287 cpu_count(CPU_COUNT_FILEUNKNOWN + status, -1);
288 } else {
289 cpu_count(CPU_COUNT_ANONUNKNOWN + status, -1);
290 }
291 }
292 uobj->uo_npages--;
293 pg->flags &= ~PG_TABLED;
294 pg->uobject = NULL;
295 }
296
297 static inline void
298 uvm_pageremove_tree(struct uvm_object *uobj, struct vm_page *pg)
299 {
300 struct vm_page *opg __unused;
301
302 KASSERT(rw_write_held(uobj->vmobjlock));
303
304 opg = radix_tree_remove_node(&uobj->uo_pages, pg->offset >> PAGE_SHIFT);
305 KASSERT(pg == opg);
306 }
307
308 static void
309 uvm_page_init_bucket(struct pgfreelist *pgfl, struct pgflbucket *pgb, int num)
310 {
311 int i;
312
313 pgb->pgb_nfree = 0;
314 for (i = 0; i < uvmexp.ncolors; i++) {
315 LIST_INIT(&pgb->pgb_colors[i]);
316 }
317 pgfl->pgfl_buckets[num] = pgb;
318 }
319
320 /*
321 * uvm_page_init: init the page system. called from uvm_init().
322 *
323 * => we return the range of kernel virtual memory in kvm_startp/kvm_endp
324 */
325
326 void
327 uvm_page_init(vaddr_t *kvm_startp, vaddr_t *kvm_endp)
328 {
329 static struct uvm_cpu boot_cpu __cacheline_aligned;
330 psize_t freepages, pagecount, bucketsize, n;
331 struct pgflbucket *pgb;
332 struct vm_page *pagearray;
333 char *bucketarray;
334 uvm_physseg_t bank;
335 int fl, b;
336
337 KASSERT(ncpu <= 1);
338
339 /*
340 * init the page queues and free page queue locks, except the
341 * free list; we allocate that later (with the initial vm_page
342 * structures).
343 */
344
345 curcpu()->ci_data.cpu_uvm = &boot_cpu;
346 uvmpdpol_init();
347 for (b = 0; b < __arraycount(uvm_freelist_locks); b++) {
348 mutex_init(&uvm_freelist_locks[b].lock, MUTEX_DEFAULT, IPL_VM);
349 }
350
351 /*
352 * allocate vm_page structures.
353 */
354
355 /*
356 * sanity check:
357 * before calling this function the MD code is expected to register
358 * some free RAM with the uvm_page_physload() function. our job
359 * now is to allocate vm_page structures for this memory.
360 */
361
362 if (uvm_physseg_get_last() == UVM_PHYSSEG_TYPE_INVALID)
363 panic("uvm_page_bootstrap: no memory pre-allocated");
364
365 /*
366 * first calculate the number of free pages...
367 *
368 * note that we use start/end rather than avail_start/avail_end.
369 * this allows us to allocate extra vm_page structures in case we
370 * want to return some memory to the pool after booting.
371 */
372
373 freepages = 0;
374
375 for (bank = uvm_physseg_get_first();
376 uvm_physseg_valid_p(bank) ;
377 bank = uvm_physseg_get_next(bank)) {
378 freepages += (uvm_physseg_get_end(bank) - uvm_physseg_get_start(bank));
379 }
380
381 /*
382 * Let MD code initialize the number of colors, or default
383 * to 1 color if MD code doesn't care.
384 */
385 if (uvmexp.ncolors == 0)
386 uvmexp.ncolors = 1;
387 uvmexp.colormask = uvmexp.ncolors - 1;
388 KASSERT((uvmexp.colormask & uvmexp.ncolors) == 0);
389
390 /* We always start with only 1 bucket. */
391 uvm.bucketcount = 1;
392
393 /*
394 * we now know we have (PAGE_SIZE * freepages) bytes of memory we can
395 * use. for each page of memory we use we need a vm_page structure.
396 * thus, the total number of pages we can use is the total size of
397 * the memory divided by the PAGE_SIZE plus the size of the vm_page
398 * structure. we add one to freepages as a fudge factor to avoid
399 * truncation errors (since we can only allocate in terms of whole
400 * pages).
401 */
402 pagecount = ((freepages + 1) << PAGE_SHIFT) /
403 (PAGE_SIZE + sizeof(struct vm_page));
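
	/*
	 * Worked example (illustrative numbers only; sizeof(struct vm_page)
	 * varies by architecture and kernel options): with PAGE_SIZE = 4096
	 * and a 128 byte vm_page, 1GB of free RAM (freepages = 262144) gives
	 * pagecount = (262145 * 4096) / (4096 + 128), roughly 254200 pages,
	 * i.e. about 3% of the RAM is consumed by the vm_page array itself.
	 */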
404 bucketsize = offsetof(struct pgflbucket, pgb_colors[uvmexp.ncolors]);
405 bucketsize = roundup2(bucketsize, coherency_unit);
406 bucketarray = (void *)uvm_pageboot_alloc(
407 bucketsize * VM_NFREELIST +
408 pagecount * sizeof(struct vm_page));
409 pagearray = (struct vm_page *)
410 (bucketarray + bucketsize * VM_NFREELIST);
411
412 for (fl = 0; fl < VM_NFREELIST; fl++) {
413 pgb = (struct pgflbucket *)(bucketarray + bucketsize * fl);
414 uvm_page_init_bucket(&uvm.page_free[fl], pgb, 0);
415 }
416 memset(pagearray, 0, pagecount * sizeof(struct vm_page));
417
418 /*
419 * init the freelist cache in the disabled state.
420 */
421 uvm_pgflcache_init();
422
423 /*
424 * init the vm_page structures and put them in the correct place.
425 */
426 /* First init the extent */
427
428 for (bank = uvm_physseg_get_first(),
429 uvm_physseg_seg_chomp_slab(bank, pagearray, pagecount);
430 uvm_physseg_valid_p(bank);
431 bank = uvm_physseg_get_next(bank)) {
432
433 n = uvm_physseg_get_end(bank) - uvm_physseg_get_start(bank);
434 uvm_physseg_seg_alloc_from_slab(bank, n);
435 uvm_physseg_init_seg(bank, pagearray);
436
437 /* set up page array pointers */
438 pagearray += n;
439 pagecount -= n;
440 }
441
442 /*
443 * pass up the values of virtual_space_start and
444 * virtual_space_end (obtained by uvm_pageboot_alloc) to the upper
445 * layers of the VM.
446 */
447
448 *kvm_startp = round_page(virtual_space_start);
449 *kvm_endp = trunc_page(virtual_space_end);
450
451 /*
452 * init various thresholds.
453 */
454
455 uvmexp.reserve_pagedaemon = 1;
456 uvmexp.reserve_kernel = vm_page_reserve_kernel;
457
458 /*
459 * done!
460 */
461
462 uvm.page_init_done = true;
463 }
464
465 /*
466 * uvm_pgfl_lock: lock all freelist buckets
467 */
468
469 void
470 uvm_pgfl_lock(void)
471 {
472 int i;
473
474 for (i = 0; i < __arraycount(uvm_freelist_locks); i++) {
475 mutex_spin_enter(&uvm_freelist_locks[i].lock);
476 }
477 }
478
479 /*
480 * uvm_pgfl_unlock: unlock all freelist buckets
481 */
482
483 void
484 uvm_pgfl_unlock(void)
485 {
486 int i;
487
488 for (i = 0; i < __arraycount(uvm_freelist_locks); i++) {
489 mutex_spin_exit(&uvm_freelist_locks[i].lock);
490 }
491 }
492
493 /*
494 * uvm_setpagesize: set the page size
495 *
496 * => sets page_shift and page_mask from uvmexp.pagesize.
497 */
498
499 void
500 uvm_setpagesize(void)
501 {
502
503 /*
504 * If uvmexp.pagesize is 0 at this point, we expect PAGE_SIZE
505 * to be a constant (indicated by being a non-zero value).
506 */
507 if (uvmexp.pagesize == 0) {
508 if (PAGE_SIZE == 0)
509 panic("uvm_setpagesize: uvmexp.pagesize not set");
510 uvmexp.pagesize = PAGE_SIZE;
511 }
512 uvmexp.pagemask = uvmexp.pagesize - 1;
513 if ((uvmexp.pagemask & uvmexp.pagesize) != 0)
514 panic("uvm_setpagesize: page size %u (%#x) not a power of two",
515 uvmexp.pagesize, uvmexp.pagesize);
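	/*
	 * For example, a pagesize of 4096 leaves pagemask = 0xfff, and the
	 * loop below ends with pageshift = 12.
	 */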
516 for (uvmexp.pageshift = 0; ; uvmexp.pageshift++)
517 if ((1 << uvmexp.pageshift) == uvmexp.pagesize)
518 break;
519 }
520
521 /*
522 * uvm_pageboot_alloc: steal memory from physmem for bootstrapping
523 */
524
525 vaddr_t
526 uvm_pageboot_alloc(vsize_t size)
527 {
528 static bool initialized = false;
529 vaddr_t addr;
530 #if !defined(PMAP_STEAL_MEMORY)
531 vaddr_t vaddr;
532 paddr_t paddr;
533 #endif
534
535 /*
536 * on first call to this function, initialize ourselves.
537 */
538 if (initialized == false) {
539 pmap_virtual_space(&virtual_space_start, &virtual_space_end);
540
541 /* round it the way we like it */
542 virtual_space_start = round_page(virtual_space_start);
543 virtual_space_end = trunc_page(virtual_space_end);
544
545 initialized = true;
546 }
547
548 /* round to page size */
549 size = round_page(size);
550 uvmexp.bootpages += atop(size);
551
552 #if defined(PMAP_STEAL_MEMORY)
553
554 /*
555 * defer bootstrap allocation to MD code (it may want to allocate
556 * from a direct-mapped segment). pmap_steal_memory should adjust
557 * virtual_space_start/virtual_space_end if necessary.
558 */
559
560 addr = pmap_steal_memory(size, &virtual_space_start,
561 &virtual_space_end);
562
563 return addr;
564
565 #else /* !PMAP_STEAL_MEMORY */
566
567 /*
568 * allocate virtual memory for this request
569 */
570 if (virtual_space_start == virtual_space_end ||
571 (virtual_space_end - virtual_space_start) < size)
572 panic("uvm_pageboot_alloc: out of virtual space");
573
574 addr = virtual_space_start;
575
576 #ifdef PMAP_GROWKERNEL
577 /*
578 * If the kernel pmap can't map the requested space,
579 * then allocate more resources for it.
580 */
581 if (uvm_maxkaddr < (addr + size)) {
582 uvm_maxkaddr = pmap_growkernel(addr + size);
583 if (uvm_maxkaddr < (addr + size))
584 panic("uvm_pageboot_alloc: pmap_growkernel() failed");
585 }
586 #endif
587
588 virtual_space_start += size;
589
590 /*
591 * allocate and mapin physical pages to back new virtual pages
592 */
593
594 for (vaddr = round_page(addr) ; vaddr < addr + size ;
595 vaddr += PAGE_SIZE) {
596
597 if (!uvm_page_physget(&paddr))
598 panic("uvm_pageboot_alloc: out of memory");
599
600 /*
601 * Note this memory is no longer managed, so using
602 * pmap_kenter is safe.
603 */
604 pmap_kenter_pa(vaddr, paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
605 }
606 pmap_update(pmap_kernel());
607 return addr;
608 #endif /* PMAP_STEAL_MEMORY */
609 }
610
611 #if !defined(PMAP_STEAL_MEMORY)
612 /*
613 * uvm_page_physget: "steal" one page from the vm_physmem structure.
614 *
615 * => attempt to allocate it off the end of a segment in which the "avail"
616 * values match the start/end values. if we can't do that, then we
617 * will advance both values (making them equal, and removing some
618 * vm_page structures from the non-avail area).
619 * => return false if out of memory.
620 */
621
622 /* subroutine: try to allocate from memory chunks on the specified freelist */
623 static bool uvm_page_physget_freelist(paddr_t *, int);
624
625 static bool
626 uvm_page_physget_freelist(paddr_t *paddrp, int freelist)
627 {
628 uvm_physseg_t lcv;
629
630 /* pass 1: try allocating from a matching end */
631 #if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
632 for (lcv = uvm_physseg_get_last(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_prev(lcv))
633 #else
634 for (lcv = uvm_physseg_get_first(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_next(lcv))
635 #endif
636 {
637 if (uvm.page_init_done == true)
638 panic("uvm_page_physget: called _after_ bootstrap");
639
640 /* Try to match at front or back on unused segment */
641 if (uvm_page_physunload(lcv, freelist, paddrp))
642 return true;
643 }
644
645 /* pass 2: forget about matching ends, just allocate something */
646 #if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
647 for (lcv = uvm_physseg_get_last(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_prev(lcv))
648 #else
649 for (lcv = uvm_physseg_get_first(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_next(lcv))
650 #endif
651 {
652 /* Try the front regardless. */
653 if (uvm_page_physunload_force(lcv, freelist, paddrp))
654 return true;
655 }
656 return false;
657 }
658
659 bool
660 uvm_page_physget(paddr_t *paddrp)
661 {
662 int i;
663
664 /* try in the order of freelist preference */
665 for (i = 0; i < VM_NFREELIST; i++)
666 if (uvm_page_physget_freelist(paddrp, i) == true)
667 return (true);
668 return (false);
669 }
670 #endif /* PMAP_STEAL_MEMORY */
671
672 /*
673 * PHYS_TO_VM_PAGE: find vm_page for a PA. used by MI code to get vm_pages
674 * back from an I/O mapping (ugh!). used in some MD code as well.
675 */
676 struct vm_page *
677 uvm_phys_to_vm_page(paddr_t pa)
678 {
679 paddr_t pf = atop(pa);
680 paddr_t off;
681 uvm_physseg_t upm;
682
683 upm = uvm_physseg_find(pf, &off);
684 if (upm != UVM_PHYSSEG_TYPE_INVALID)
685 return uvm_physseg_get_pg(upm, off);
686 return(NULL);
687 }
688
689 paddr_t
690 uvm_vm_page_to_phys(const struct vm_page *pg)
691 {
692
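	/*
	 * As of this revision the low bits of pg->phys_addr cache the free
	 * list and bucket indices (see uvm_page_get_freelist() and
	 * uvm_page_get_bucket()), so mask them off rather than returning
	 * phys_addr directly.
	 */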
693 return pg->phys_addr & ~(PAGE_SIZE - 1);
694 }
695
696 /*
697 * uvm_page_numa_load: load NUMA range description.
698 */
699 void
700 uvm_page_numa_load(paddr_t start, paddr_t size, u_int numa_id)
701 {
702 struct uvm_page_numa_region *d;
703
704 KASSERT(numa_id < PGFL_MAX_BUCKETS);
705
706 d = kmem_alloc(sizeof(*d), KM_SLEEP);
707 d->start = start;
708 d->size = size;
709 d->numa_id = numa_id;
710 d->next = uvm_page_numa_region;
711 uvm_page_numa_region = d;
712 }
713
714 /*
715 * uvm_page_numa_lookup: lookup NUMA node for the given page.
716 */
717 static u_int
718 uvm_page_numa_lookup(struct vm_page *pg)
719 {
720 struct uvm_page_numa_region *d;
721 static bool warned;
722 paddr_t pa;
723
724 KASSERT(uvm_page_numa_region != NULL);
725
726 pa = VM_PAGE_TO_PHYS(pg);
727 for (d = uvm_page_numa_region; d != NULL; d = d->next) {
728 if (pa >= d->start && pa < d->start + d->size) {
729 return d->numa_id;
730 }
731 }
732
733 if (!warned) {
734 printf("uvm_page_numa_lookup: failed, first pg=%p pa=%#"
735 PRIxPADDR "\n", pg, VM_PAGE_TO_PHYS(pg));
736 warned = true;
737 }
738
739 return 0;
740 }
741
742 /*
743 * uvm_page_redim: adjust freelist dimensions if they have changed.
744 */
745
746 static void
747 uvm_page_redim(int newncolors, int newnbuckets)
748 {
749 struct pgfreelist npgfl;
750 struct pgflbucket *opgb, *npgb;
751 struct pgflist *ohead, *nhead;
752 struct vm_page *pg;
753 size_t bucketsize, bucketmemsize, oldbucketmemsize;
754 int fl, ob, oc, nb, nc, obuckets, ocolors;
755 char *bucketarray, *oldbucketmem, *bucketmem;
756
757 KASSERT(((newncolors - 1) & newncolors) == 0);
758
759 /* Anything to do? */
760 if (newncolors <= uvmexp.ncolors &&
761 newnbuckets == uvm.bucketcount) {
762 return;
763 }
764 if (uvm.page_init_done == false) {
765 uvmexp.ncolors = newncolors;
766 return;
767 }
768
769 bucketsize = offsetof(struct pgflbucket, pgb_colors[newncolors]);
770 bucketsize = roundup2(bucketsize, coherency_unit);
771 bucketmemsize = bucketsize * newnbuckets * VM_NFREELIST +
772 coherency_unit - 1;
773 bucketmem = kmem_zalloc(bucketmemsize, KM_SLEEP);
774 bucketarray = (char *)roundup2((uintptr_t)bucketmem, coherency_unit);
775
776 ocolors = uvmexp.ncolors;
777 obuckets = uvm.bucketcount;
778
779 /* Freelist cache mustn't be enabled. */
780 uvm_pgflcache_pause();
781
782 /* Make sure we should still do this. */
783 uvm_pgfl_lock();
784 if (newncolors <= uvmexp.ncolors &&
785 newnbuckets == uvm.bucketcount) {
786 uvm_pgfl_unlock();
787 uvm_pgflcache_resume();
788 kmem_free(bucketmem, bucketmemsize);
789 return;
790 }
791
792 uvmexp.ncolors = newncolors;
793 uvmexp.colormask = uvmexp.ncolors - 1;
794 uvm.bucketcount = newnbuckets;
795
796 for (fl = 0; fl < VM_NFREELIST; fl++) {
797 /* Init new buckets in new freelist. */
798 memset(&npgfl, 0, sizeof(npgfl));
799 for (nb = 0; nb < newnbuckets; nb++) {
800 npgb = (struct pgflbucket *)bucketarray;
801 uvm_page_init_bucket(&npgfl, npgb, nb);
802 bucketarray += bucketsize;
803 }
804 /* Now transfer pages from the old freelist. */
805 for (nb = ob = 0; ob < obuckets; ob++) {
806 opgb = uvm.page_free[fl].pgfl_buckets[ob];
807 for (oc = 0; oc < ocolors; oc++) {
808 ohead = &opgb->pgb_colors[oc];
809 while ((pg = LIST_FIRST(ohead)) != NULL) {
810 LIST_REMOVE(pg, pageq.list);
811 /*
812 * Here we decide on the NEW color &
813 * bucket for the page. For NUMA
814 * we'll use the info that the
815 * hardware gave us. For non-NUMA
816 * we take the physical page frame
817 * number and cache color into
818 * account. We do this to try and
819 * avoid defeating any memory
820 * interleaving in the hardware.
821 */
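				/*
				 * Illustrative numbers only: with
				 * ncolors == 8 and newnbuckets == 4,
				 * the formula below places runs of
				 * 64 consecutive page frames (256kB
				 * with 4kB pages) into one bucket
				 * before moving to the next, which
				 * helps preserve any interleaving.
				 */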
822 KASSERT(
823 uvm_page_get_bucket(pg) == ob);
824 KASSERT(fl ==
825 uvm_page_get_freelist(pg));
826 if (uvm_page_numa_region != NULL) {
827 nb = uvm_page_numa_lookup(pg);
828 } else {
829 nb = atop(VM_PAGE_TO_PHYS(pg))
830 / uvmexp.ncolors / 8
831 % newnbuckets;
832 }
833 uvm_page_set_bucket(pg, nb);
834 npgb = npgfl.pgfl_buckets[nb];
835 npgb->pgb_nfree++;
836 nc = VM_PGCOLOR(pg);
837 nhead = &npgb->pgb_colors[nc];
838 LIST_INSERT_HEAD(nhead, pg, pageq.list);
839 }
840 }
841 }
842 /* Install the new freelist. */
843 memcpy(&uvm.page_free[fl], &npgfl, sizeof(npgfl));
844 }
845
846 /* Unlock and free the old memory. */
847 oldbucketmemsize = recolored_pages_memsize;
848 oldbucketmem = recolored_pages_mem;
849 recolored_pages_memsize = bucketmemsize;
850 recolored_pages_mem = bucketmem;
851
852 uvm_pgfl_unlock();
853 uvm_pgflcache_resume();
854
855 if (oldbucketmemsize) {
856 kmem_free(oldbucketmem, oldbucketmemsize);
857 }
858
859 /*
860 * this calls uvm_km_alloc() which may want to hold
861 * uvm_freelist_lock.
862 */
863 uvm_pager_realloc_emerg();
864 }
865
866 /*
867 * uvm_page_recolor: Recolor the pages if the new color count is
868 * larger than the old one.
869 */
870
871 void
872 uvm_page_recolor(int newncolors)
873 {
874
875 uvm_page_redim(newncolors, uvm.bucketcount);
876 }
877
878 /*
879 * uvm_page_rebucket: Determine a bucket structure and redim the free
880 * lists to match.
881 */
882
883 void
884 uvm_page_rebucket(void)
885 {
886 u_int min_numa, max_numa, npackage, shift;
887 struct cpu_info *ci, *ci2, *ci3;
888 CPU_INFO_ITERATOR cii;
889
890 /*
891 * If we have more than one NUMA node, and the maximum NUMA node ID
892 * is less than PGFL_MAX_BUCKETS, then we'll use NUMA distribution
893 * for free pages.
894 */
895 min_numa = (u_int)-1;
896 max_numa = 0;
897 for (CPU_INFO_FOREACH(cii, ci)) {
898 if (ci->ci_numa_id < min_numa) {
899 min_numa = ci->ci_numa_id;
900 }
901 if (ci->ci_numa_id > max_numa) {
902 max_numa = ci->ci_numa_id;
903 }
904 }
905 if (min_numa != max_numa && max_numa < PGFL_MAX_BUCKETS) {
906 aprint_debug("UVM: using NUMA allocation scheme\n");
907 for (CPU_INFO_FOREACH(cii, ci)) {
908 ci->ci_data.cpu_uvm->pgflbucket = ci->ci_numa_id;
909 }
910 uvm_page_redim(uvmexp.ncolors, max_numa + 1);
911 return;
912 }
913
914 /*
915 * Otherwise we'll go with a scheme to maximise L2/L3 cache locality
916 * and minimise lock contention. Count the total number of CPU
917 * packages, and then try to distribute the buckets among CPU
918 * packages evenly.
919 */
920 npackage = curcpu()->ci_nsibling[CPUREL_PACKAGE1ST];
921
922 /*
923 * Figure out how to arrange the packages & buckets, and the total
924 * number of buckets we need. XXX 2 may not be the best factor.
925 */
926 for (shift = 0; npackage > PGFL_MAX_BUCKETS; shift++) {
927 npackage >>= 1;
928 }
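
	/*
	 * For instance, if PGFL_MAX_BUCKETS were 8 (illustrative value) and
	 * there were 32 CPU packages, the loop above would finish with
	 * shift == 2 and npackage == 8, i.e. four packages per bucket.
	 */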
929 uvm_page_redim(uvmexp.ncolors, npackage);
930
931 /*
932 * Now tell each CPU which bucket to use. In the outer loop, scroll
933 * through all CPU packages.
934 */
935 npackage = 0;
936 ci = curcpu();
937 ci2 = ci->ci_sibling[CPUREL_PACKAGE1ST];
938 do {
939 /*
940 * In the inner loop, scroll through all CPUs in the package
941 * and assign the same bucket ID.
942 */
943 ci3 = ci2;
944 do {
945 ci3->ci_data.cpu_uvm->pgflbucket = npackage >> shift;
946 ci3 = ci3->ci_sibling[CPUREL_PACKAGE];
947 } while (ci3 != ci2);
948 npackage++;
949 ci2 = ci2->ci_sibling[CPUREL_PACKAGE1ST];
950 } while (ci2 != ci->ci_sibling[CPUREL_PACKAGE1ST]);
951
952 aprint_debug("UVM: using package allocation scheme, "
953 "%d package(s) per bucket\n", 1 << shift);
954 }
955
956 /*
957 * uvm_cpu_attach: initialize per-CPU data structures.
958 */
959
960 void
961 uvm_cpu_attach(struct cpu_info *ci)
962 {
963 struct uvm_cpu *ucpu;
964
965 /* Already done in uvm_page_init(). */
966 if (!CPU_IS_PRIMARY(ci)) {
967 /* Add more reserve pages for this CPU. */
968 uvmexp.reserve_kernel += vm_page_reserve_kernel;
969
970 /* Allocate per-CPU data structures. */
971 ucpu = kmem_zalloc(sizeof(struct uvm_cpu) + coherency_unit - 1,
972 KM_SLEEP);
973 ucpu = (struct uvm_cpu *)roundup2((uintptr_t)ucpu,
974 coherency_unit);
975 ci->ci_data.cpu_uvm = ucpu;
976 } else {
977 ucpu = ci->ci_data.cpu_uvm;
978 }
979
980 uvmpdpol_init_cpu(ucpu);
981
982 /*
983 * Attach RNG source for this CPU's VM events
984 */
985 rnd_attach_source(&ucpu->rs, ci->ci_data.cpu_name, RND_TYPE_VM,
986 RND_FLAG_COLLECT_TIME|RND_FLAG_COLLECT_VALUE|
987 RND_FLAG_ESTIMATE_VALUE);
988 }
989
990 /*
991 * uvm_availmem: fetch the total amount of free memory in pages. this can
992 * have a detrimental effect on performance due to false sharing; don't call
993 * unless needed.
994 *
995 * some users can request the amount of free memory so often that it begins
996 * to impact upon performance. if calling frequently and an inexact value
997 * is okay, call with cached = true.
998 */
999
1000 int
1001 uvm_availmem(bool cached)
1002 {
1003 int64_t fp;
1004
1005 cpu_count_sync(cached);
1006 if ((fp = cpu_count_get(CPU_COUNT_FREEPAGES)) < 0) {
1007 /*
1008 * XXXAD could briefly go negative because it's impossible
1009 * to get a clean snapshot. address this for other counters
1010 * used as running totals before NetBSD 10 although less
1011 * important for those.
1012 */
1013 fp = 0;
1014 }
1015 return (int)fp;
1016 }
1017
1018 /*
1019 * uvm_pagealloc_pgb: helper routine that tries to allocate any color from a
1020 * specific freelist and specific bucket only.
1021 *
1022 * => must be at IPL_VM or higher to protect per-CPU data structures.
1023 */
1024
1025 static struct vm_page *
1026 uvm_pagealloc_pgb(struct uvm_cpu *ucpu, int f, int b, int *trycolorp, int flags)
1027 {
1028 int c, trycolor, colormask;
1029 struct pgflbucket *pgb;
1030 struct vm_page *pg;
1031 kmutex_t *lock;
1032 bool fill;
1033
1034 /*
1035 * Skip the bucket if empty, no lock needed. There could be many
1036 * empty freelists/buckets.
1037 */
1038 pgb = uvm.page_free[f].pgfl_buckets[b];
1039 if (pgb->pgb_nfree == 0) {
1040 return NULL;
1041 }
1042
1043 /* Skip bucket if low on memory. */
1044 lock = &uvm_freelist_locks[b].lock;
1045 mutex_spin_enter(lock);
1046 if (__predict_false(pgb->pgb_nfree <= uvmexp.reserve_kernel)) {
1047 if ((flags & UVM_PGA_USERESERVE) == 0 ||
1048 (pgb->pgb_nfree <= uvmexp.reserve_pagedaemon &&
1049 curlwp != uvm.pagedaemon_lwp)) {
1050 mutex_spin_exit(lock);
1051 return NULL;
1052 }
1053 fill = false;
1054 } else {
1055 fill = true;
1056 }
1057
1058 /* Try all page colors as needed. */
1059 c = trycolor = *trycolorp;
1060 colormask = uvmexp.colormask;
1061 do {
1062 pg = LIST_FIRST(&pgb->pgb_colors[c]);
1063 if (__predict_true(pg != NULL)) {
1064 /*
1065 * Got a free page! PG_FREE must be cleared under
1066 * lock because of uvm_pglistalloc().
1067 */
1068 LIST_REMOVE(pg, pageq.list);
1069 KASSERT(pg->flags == PG_FREE);
1070 pg->flags = PG_BUSY | PG_CLEAN | PG_FAKE;
1071 pgb->pgb_nfree--;
1072 CPU_COUNT(CPU_COUNT_FREEPAGES, -1);
1073
1074 /*
1075 * While we have the bucket locked and our data
1076 * structures fresh in L1 cache, we have an ideal
1077 * opportunity to grab some pages for the freelist
1078 * cache without causing extra contention. Only do
1079 * so if we found pages in this CPU's preferred
1080 * bucket.
1081 */
1082 if (__predict_true(b == ucpu->pgflbucket && fill)) {
1083 uvm_pgflcache_fill(ucpu, f, b, c);
1084 }
1085 mutex_spin_exit(lock);
1086 KASSERT(uvm_page_get_bucket(pg) == b);
1087 CPU_COUNT(c == trycolor ?
1088 CPU_COUNT_COLORHIT : CPU_COUNT_COLORMISS, 1);
1089 CPU_COUNT(CPU_COUNT_CPUMISS, 1);
1090 *trycolorp = c;
1091 return pg;
1092 }
1093 c = (c + 1) & colormask;
1094 } while (c != trycolor);
1095 mutex_spin_exit(lock);
1096
1097 return NULL;
1098 }
1099
1100 /*
1101 * uvm_pagealloc_pgfl: helper routine for uvm_pagealloc_strat that allocates
1102 * any color from any bucket, in a specific freelist.
1103 *
1104 * => must be at IPL_VM or higher to protect per-CPU data structures.
1105 */
1106
1107 static struct vm_page *
1108 uvm_pagealloc_pgfl(struct uvm_cpu *ucpu, int f, int *trycolorp, int flags)
1109 {
1110 int b, trybucket, bucketcount;
1111 struct vm_page *pg;
1112
1113 /* Try for the exact thing in the per-CPU cache. */
1114 if ((pg = uvm_pgflcache_alloc(ucpu, f, *trycolorp)) != NULL) {
1115 CPU_COUNT(CPU_COUNT_CPUHIT, 1);
1116 CPU_COUNT(CPU_COUNT_COLORHIT, 1);
1117 return pg;
1118 }
1119
1120 /* Walk through all buckets, trying our preferred bucket first. */
1121 trybucket = ucpu->pgflbucket;
1122 b = trybucket;
1123 bucketcount = uvm.bucketcount;
1124 do {
1125 pg = uvm_pagealloc_pgb(ucpu, f, b, trycolorp, flags);
1126 if (pg != NULL) {
1127 return pg;
1128 }
1129 b = (b + 1 == bucketcount ? 0 : b + 1);
1130 } while (b != trybucket);
1131
1132 return NULL;
1133 }
1134
1135 /*
1136 * uvm_pagealloc_strat: allocate vm_page from a particular free list.
1137 *
1138 * => return null if no pages free
1139 * => wake up pagedaemon if number of free pages drops below low water mark
1140 * => if obj != NULL, obj must be locked (to put in obj's tree)
1141 * => if anon != NULL, anon must be locked (to put in anon)
1142 * => only one of obj or anon can be non-null
1143 * => caller must activate/deactivate page if it is not wired.
1144 * => free_list is ignored if strat == UVM_PGA_STRAT_NORMAL.
1145 * => policy decision: it is more important to pull a page off of the
1146 * appropriate priority free list than it is to get a page from the
1147 * correct bucket or color bin. This is because we live with the
1148 * consequences of a bad free list decision for the entire
1149 * lifetime of the page, e.g. if the page comes from memory that
1150 * is slower to access.
1151 */
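
/*
 * Typical caller shape (a sketch only; most code goes through the
 * uvm_pagealloc() convenience wrapper, which supplies UVM_PGA_STRAT_NORMAL
 * and free list 0; "pgalloc" is an arbitrary wait message):
 *
 *	pg = uvm_pagealloc(uobj, off, NULL, UVM_PGA_ZERO);
 *	if (pg == NULL) {
 *		rw_exit(uobj->vmobjlock);	 - drop the lock before sleeping
 *		uvm_wait("pgalloc");
 *		 ... re-lock and retry ...
 *	}
 */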
1152
1153 struct vm_page *
1154 uvm_pagealloc_strat(struct uvm_object *obj, voff_t off, struct vm_anon *anon,
1155 int flags, int strat, int free_list)
1156 {
1157 int color, lcv, error, s;
1158 struct uvm_cpu *ucpu;
1159 struct vm_page *pg;
1160 lwp_t *l;
1161
1162 KASSERT(obj == NULL || anon == NULL);
1163 KASSERT(anon == NULL || (flags & UVM_FLAG_COLORMATCH) || off == 0);
1164 KASSERT(off == trunc_page(off));
1165 KASSERT(obj == NULL || rw_write_held(obj->vmobjlock));
1166 KASSERT(anon == NULL || anon->an_lock == NULL ||
1167 rw_write_held(anon->an_lock));
1168
1169 /*
1170 * This implements a global round-robin page coloring
1171 * algorithm.
1172 */
1173
1174 s = splvm();
1175 ucpu = curcpu()->ci_data.cpu_uvm;
1176 if (flags & UVM_FLAG_COLORMATCH) {
1177 color = atop(off) & uvmexp.colormask;
1178 } else {
1179 color = ucpu->pgflcolor;
1180 }
1181
1182 /*
1183 * fail if any of these conditions is true:
1184 * [1] there really are no free pages, or
1185 * [2] only kernel "reserved" pages remain and
1186 * reserved pages have not been requested.
1187 * [3] only pagedaemon "reserved" pages remain and
1188 * the requestor isn't the pagedaemon.
1189 * we make kernel reserve pages available if called by a
1190 * kernel thread.
1191 */
1192 l = curlwp;
1193 if (__predict_true(l != NULL) && (l->l_flag & LW_SYSTEM) != 0) {
1194 flags |= UVM_PGA_USERESERVE;
1195 }
1196
1197 again:
1198 switch (strat) {
1199 case UVM_PGA_STRAT_NORMAL:
1200 /* Check freelists: descending priority (ascending id) order. */
1201 for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
1202 pg = uvm_pagealloc_pgfl(ucpu, lcv, &color, flags);
1203 if (pg != NULL) {
1204 goto gotit;
1205 }
1206 }
1207
1208 /* No pages free! Have pagedaemon free some memory. */
1209 splx(s);
1210 uvm_kick_pdaemon();
1211 return NULL;
1212
1213 case UVM_PGA_STRAT_ONLY:
1214 case UVM_PGA_STRAT_FALLBACK:
1215 /* Attempt to allocate from the specified free list. */
1216 KASSERT(free_list >= 0 && free_list < VM_NFREELIST);
1217 pg = uvm_pagealloc_pgfl(ucpu, free_list, &color, flags);
1218 if (pg != NULL) {
1219 goto gotit;
1220 }
1221
1222 /* Fall back, if possible. */
1223 if (strat == UVM_PGA_STRAT_FALLBACK) {
1224 strat = UVM_PGA_STRAT_NORMAL;
1225 goto again;
1226 }
1227
1228 /* No pages free! Have pagedaemon free some memory. */
1229 splx(s);
1230 uvm_kick_pdaemon();
1231 return NULL;
1232
1233 case UVM_PGA_STRAT_NUMA:
1234 /*
1235 * NUMA strategy (experimental): allocating from the correct
1236 * bucket is more important than observing freelist
1237 * priority. Look only to the current NUMA node; if that
1238 * fails, we need to look to other NUMA nodes, so retry with
1239 * the normal strategy.
1240 */
1241 for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
1242 pg = uvm_pgflcache_alloc(ucpu, lcv, color);
1243 if (pg != NULL) {
1244 CPU_COUNT(CPU_COUNT_CPUHIT, 1);
1245 CPU_COUNT(CPU_COUNT_COLORHIT, 1);
1246 goto gotit;
1247 }
1248 pg = uvm_pagealloc_pgb(ucpu, lcv,
1249 ucpu->pgflbucket, &color, flags);
1250 if (pg != NULL) {
1251 goto gotit;
1252 }
1253 }
1254 strat = UVM_PGA_STRAT_NORMAL;
1255 goto again;
1256
1257 default:
1258 panic("uvm_pagealloc_strat: bad strat %d", strat);
1259 /* NOTREACHED */
1260 }
1261
1262 gotit:
1263 /*
1264 * We now know which color we actually allocated from; set
1265 * the next color accordingly.
1266 */
1267
1268 ucpu->pgflcolor = (color + 1) & uvmexp.colormask;
1269
1270 /*
1271 * while still at IPL_VM, update allocation statistics.
1272 */
1273
1274 if (anon) {
1275 CPU_COUNT(CPU_COUNT_ANONCLEAN, 1);
1276 }
1277 splx(s);
1278 KASSERT(pg->flags == (PG_BUSY|PG_CLEAN|PG_FAKE));
1279
1280 /*
1281 * assign the page to the object. as the page was free, we know
1282 * that pg->uobject and pg->uanon are NULL. we only need to take
1283 * the page's interlock if we are changing the values.
1284 */
1285 if (anon != NULL || obj != NULL) {
1286 mutex_enter(&pg->interlock);
1287 }
1288 pg->offset = off;
1289 pg->uobject = obj;
1290 pg->uanon = anon;
1291 KASSERT(uvm_page_owner_locked_p(pg, true));
1292 if (anon) {
1293 anon->an_page = pg;
1294 pg->flags |= PG_ANON;
1295 mutex_exit(&pg->interlock);
1296 } else if (obj) {
1297 /*
1298 * set PG_FILE|PG_AOBJ before the first uvm_pageinsert.
1299 */
1300 if (UVM_OBJ_IS_VNODE(obj)) {
1301 pg->flags |= PG_FILE;
1302 } else if (UVM_OBJ_IS_AOBJ(obj)) {
1303 pg->flags |= PG_AOBJ;
1304 }
1305 uvm_pageinsert_object(obj, pg);
1306 mutex_exit(&pg->interlock);
1307 error = uvm_pageinsert_tree(obj, pg);
1308 if (error != 0) {
1309 mutex_enter(&pg->interlock);
1310 uvm_pageremove_object(obj, pg);
1311 mutex_exit(&pg->interlock);
1312 uvm_pagefree(pg);
1313 return NULL;
1314 }
1315 }
1316
1317 #if defined(UVM_PAGE_TRKOWN)
1318 pg->owner_tag = NULL;
1319 #endif
1320 UVM_PAGE_OWN(pg, "new alloc");
1321
1322 if (flags & UVM_PGA_ZERO) {
1323 /* A zero'd page is not clean. */
1324 if (obj != NULL || anon != NULL) {
1325 uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
1326 }
1327 pmap_zero_page(VM_PAGE_TO_PHYS(pg));
1328 }
1329
1330 return(pg);
1331 }
1332
1333 /*
1334 * uvm_pagereplace: replace a page with another
1335 *
1336 * => object must be locked
1337 * => page interlocks must be held
1338 */
1339
1340 void
1341 uvm_pagereplace(struct vm_page *oldpg, struct vm_page *newpg)
1342 {
1343 struct uvm_object *uobj = oldpg->uobject;
1344 struct vm_page *pg __diagused;
1345 uint64_t idx;
1346
1347 KASSERT((oldpg->flags & PG_TABLED) != 0);
1348 KASSERT(uobj != NULL);
1349 KASSERT((newpg->flags & PG_TABLED) == 0);
1350 KASSERT(newpg->uobject == NULL);
1351 KASSERT(rw_write_held(uobj->vmobjlock));
1352 KASSERT(mutex_owned(&oldpg->interlock));
1353 KASSERT(mutex_owned(&newpg->interlock));
1354
1355 newpg->uobject = uobj;
1356 newpg->offset = oldpg->offset;
1357 idx = newpg->offset >> PAGE_SHIFT;
1358 pg = radix_tree_replace_node(&uobj->uo_pages, idx, newpg);
1359 KASSERT(pg == oldpg);
1360 if (((oldpg->flags ^ newpg->flags) & PG_CLEAN) != 0) {
1361 if ((newpg->flags & PG_CLEAN) != 0) {
1362 uvm_obj_page_clear_dirty(newpg);
1363 } else {
1364 uvm_obj_page_set_dirty(newpg);
1365 }
1366 }
1367 /*
1368 * oldpg's PG_STAT is stable. newpg is not reachable by others yet.
1369 */
1370 newpg->flags |=
1371 (newpg->flags & ~PG_STAT) | (oldpg->flags & PG_STAT);
1372 uvm_pageinsert_object(uobj, newpg);
1373 uvm_pageremove_object(uobj, oldpg);
1374 }
1375
1376 /*
1377 * uvm_pagerealloc: reallocate a page from one object to another
1378 *
1379 * => both objects must be locked
1380 */
1381
1382 int
1383 uvm_pagerealloc(struct vm_page *pg, struct uvm_object *newobj, voff_t newoff)
1384 {
1385 int error = 0;
1386
1387 /*
1388 * remove it from the old object
1389 */
1390
1391 if (pg->uobject) {
1392 uvm_pageremove_tree(pg->uobject, pg);
1393 uvm_pageremove_object(pg->uobject, pg);
1394 }
1395
1396 /*
1397 * put it in the new object
1398 */
1399
1400 if (newobj) {
1401 mutex_enter(&pg->interlock);
1402 pg->uobject = newobj;
1403 pg->offset = newoff;
1404 if (UVM_OBJ_IS_VNODE(newobj)) {
1405 pg->flags |= PG_FILE;
1406 } else if (UVM_OBJ_IS_AOBJ(newobj)) {
1407 pg->flags |= PG_AOBJ;
1408 }
1409 uvm_pageinsert_object(newobj, pg);
1410 mutex_exit(&pg->interlock);
1411 error = uvm_pageinsert_tree(newobj, pg);
1412 if (error != 0) {
1413 mutex_enter(&pg->interlock);
1414 uvm_pageremove_object(newobj, pg);
1415 mutex_exit(&pg->interlock);
1416 }
1417 }
1418
1419 return error;
1420 }
1421
1422 /*
1423 * uvm_pagefree: free page
1424 *
1425 * => erase page's identity (i.e. remove from object)
1426 * => put page on free list
1427 * => caller must lock owning object (either anon or uvm_object)
1428 * => assumes all valid mappings of pg are gone
1429 */
1430
1431 void
1432 uvm_pagefree(struct vm_page *pg)
1433 {
1434 struct pgfreelist *pgfl;
1435 struct pgflbucket *pgb;
1436 struct uvm_cpu *ucpu;
1437 kmutex_t *lock;
1438 int bucket, s;
1439 bool locked;
1440
1441 #ifdef DEBUG
1442 if (pg->uobject == (void *)0xdeadbeef &&
1443 pg->uanon == (void *)0xdeadbeef) {
1444 panic("uvm_pagefree: freeing free page %p", pg);
1445 }
1446 #endif /* DEBUG */
1447
1448 KASSERT((pg->flags & PG_PAGEOUT) == 0);
1449 KASSERT(!(pg->flags & PG_FREE));
1450 KASSERT(pg->uobject == NULL || rw_write_held(pg->uobject->vmobjlock));
1451 KASSERT(pg->uobject != NULL || pg->uanon == NULL ||
1452 rw_write_held(pg->uanon->an_lock));
1453
1454 /*
1455 * remove the page from the object's tree before acquiring any page
1456 * interlocks: this can acquire locks to free radixtree nodes.
1457 */
1458 if (pg->uobject != NULL) {
1459 uvm_pageremove_tree(pg->uobject, pg);
1460 }
1461
1462 /*
1463 * if the page is loaned, resolve the loan instead of freeing.
1464 */
1465
1466 if (pg->loan_count) {
1467 KASSERT(pg->wire_count == 0);
1468
1469 /*
1470 * if the page is owned by an anon then we just want to
1471 * drop anon ownership. the kernel will free the page when
1472 * it is done with it. if the page is owned by an object,
1473 * remove it from the object and mark it dirty for the benefit
1474 * of possible anon owners.
1475 *
1476 * regardless of previous ownership, wakeup any waiters,
1477 * unbusy the page, and we're done.
1478 */
1479
1480 uvm_pagelock(pg);
1481 locked = true;
1482 if (pg->uobject != NULL) {
1483 uvm_pageremove_object(pg->uobject, pg);
1484 pg->flags &= ~(PG_FILE|PG_AOBJ);
1485 } else if (pg->uanon != NULL) {
1486 if ((pg->flags & PG_ANON) == 0) {
1487 pg->loan_count--;
1488 } else {
1489 const unsigned status = uvm_pagegetdirty(pg);
1490 pg->flags &= ~PG_ANON;
1491 cpu_count(CPU_COUNT_ANONUNKNOWN + status, -1);
1492 }
1493 pg->uanon->an_page = NULL;
1494 pg->uanon = NULL;
1495 }
1496 if (pg->pqflags & PQ_WANTED) {
1497 wakeup(pg);
1498 }
1499 pg->pqflags &= ~PQ_WANTED;
1500 pg->flags &= ~(PG_BUSY|PG_RELEASED|PG_PAGER1);
1501 #ifdef UVM_PAGE_TRKOWN
1502 pg->owner_tag = NULL;
1503 #endif
1504 KASSERT((pg->flags & PG_STAT) == 0);
1505 if (pg->loan_count) {
1506 KASSERT(pg->uobject == NULL);
1507 if (pg->uanon == NULL) {
1508 uvm_pagedequeue(pg);
1509 }
1510 uvm_pageunlock(pg);
1511 return;
1512 }
1513 } else if (pg->uobject != NULL || pg->uanon != NULL ||
1514 pg->wire_count != 0) {
1515 uvm_pagelock(pg);
1516 locked = true;
1517 } else {
1518 locked = false;
1519 }
1520
1521 /*
1522 * remove page from its object or anon.
1523 */
1524 if (pg->uobject != NULL) {
1525 uvm_pageremove_object(pg->uobject, pg);
1526 } else if (pg->uanon != NULL) {
1527 const unsigned int status = uvm_pagegetdirty(pg);
1528 pg->uanon->an_page = NULL;
1529 pg->uanon = NULL;
1530 cpu_count(CPU_COUNT_ANONUNKNOWN + status, -1);
1531 }
1532
1533 /*
1534 * if the page was wired, unwire it now.
1535 */
1536
1537 if (pg->wire_count) {
1538 pg->wire_count = 0;
1539 atomic_dec_uint(&uvmexp.wired);
1540 }
1541 if (locked) {
1542 /*
1543 * wake anyone waiting on the page.
1544 */
1545 if ((pg->pqflags & PQ_WANTED) != 0) {
1546 pg->pqflags &= ~PQ_WANTED;
1547 wakeup(pg);
1548 }
1549
1550 /*
1551 * now remove the page from the queues.
1552 */
1553 uvm_pagedequeue(pg);
1554 uvm_pageunlock(pg);
1555 } else {
1556 KASSERT(!uvmpdpol_pageisqueued_p(pg));
1557 }
1558
1559 /*
1560 * and put on free queue
1561 */
1562
1563 #ifdef DEBUG
1564 pg->uobject = (void *)0xdeadbeef;
1565 pg->uanon = (void *)0xdeadbeef;
1566 #endif /* DEBUG */
1567
1568 /* Try to send the page to the per-CPU cache. */
1569 s = splvm();
1570 ucpu = curcpu()->ci_data.cpu_uvm;
1571 bucket = uvm_page_get_bucket(pg);
1572 if (bucket == ucpu->pgflbucket && uvm_pgflcache_free(ucpu, pg)) {
1573 splx(s);
1574 return;
1575 }
1576
1577 /* Didn't work. Never mind, send it to a global bucket. */
1578 pgfl = &uvm.page_free[uvm_page_get_freelist(pg)];
1579 pgb = pgfl->pgfl_buckets[bucket];
1580 lock = &uvm_freelist_locks[bucket].lock;
1581
1582 mutex_spin_enter(lock);
1583 /* PG_FREE must be set under lock because of uvm_pglistalloc(). */
1584 pg->flags = PG_FREE;
1585 LIST_INSERT_HEAD(&pgb->pgb_colors[VM_PGCOLOR(pg)], pg, pageq.list);
1586 pgb->pgb_nfree++;
1587 CPU_COUNT(CPU_COUNT_FREEPAGES, 1);
1588 mutex_spin_exit(lock);
1589 splx(s);
1590 }
1591
1592 /*
1593 * uvm_page_unbusy: unbusy an array of pages.
1594 *
1595 * => pages must either all belong to the same object, or all belong to anons.
1596 * => if pages are object-owned, object must be locked.
1597 * => if pages are anon-owned, anons must be locked.
1598 * => caller must make sure that anon-owned pages are not PG_RELEASED.
1599 */
1600
1601 void
1602 uvm_page_unbusy(struct vm_page **pgs, int npgs)
1603 {
1604 struct vm_page *pg;
1605 int i, pageout_done;
1606 UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);
1607
1608 pageout_done = 0;
1609 for (i = 0; i < npgs; i++) {
1610 pg = pgs[i];
1611 if (pg == NULL || pg == PGO_DONTCARE) {
1612 continue;
1613 }
1614
1615 KASSERT(uvm_page_owner_locked_p(pg, true));
1616 KASSERT(pg->flags & PG_BUSY);
1617
1618 if (pg->flags & PG_PAGEOUT) {
1619 pg->flags &= ~PG_PAGEOUT;
1620 pg->flags |= PG_RELEASED;
1621 pageout_done++;
1622 atomic_inc_uint(&uvmexp.pdfreed);
1623 }
1624 if (pg->flags & PG_RELEASED) {
1625 UVMHIST_LOG(ubchist, "releasing pg %#jx",
1626 (uintptr_t)pg, 0, 0, 0);
1627 KASSERT(pg->uobject != NULL ||
1628 (pg->uanon != NULL && pg->uanon->an_ref > 0));
1629 pg->flags &= ~PG_RELEASED;
1630 uvm_pagefree(pg);
1631 } else {
1632 UVMHIST_LOG(ubchist, "unbusying pg %#jx",
1633 (uintptr_t)pg, 0, 0, 0);
1634 KASSERT((pg->flags & PG_FAKE) == 0);
1635 pg->flags &= ~PG_BUSY;
1636 uvm_pagelock(pg);
1637 uvm_pagewakeup(pg);
1638 uvm_pageunlock(pg);
1639 UVM_PAGE_OWN(pg, NULL);
1640 }
1641 }
1642 if (pageout_done != 0) {
1643 uvm_pageout_done(pageout_done);
1644 }
1645 }
1646
1647 /*
1648 * uvm_pagewait: wait for a busy page
1649 *
1650 * => page must be known PG_BUSY
1651 * => object must be read or write locked
1652 * => object will be unlocked on return
1653 */
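
/*
 * Typical retry pattern (a sketch; "pgwait" is an arbitrary wait message):
 *
 *	rw_enter(uobj->vmobjlock, RW_WRITER);
 *	while ((pg = uvm_pagelookup(uobj, off)) != NULL &&
 *	    (pg->flags & PG_BUSY) != 0) {
 *		uvm_pagewait(pg, uobj->vmobjlock, "pgwait");
 *		rw_enter(uobj->vmobjlock, RW_WRITER);
 *	}
 */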
1654
1655 void
1656 uvm_pagewait(struct vm_page *pg, krwlock_t *lock, const char *wmesg)
1657 {
1658
1659 KASSERT(rw_lock_held(lock));
1660 KASSERT((pg->flags & PG_BUSY) != 0);
1661 KASSERT(uvm_page_owner_locked_p(pg, false));
1662
1663 mutex_enter(&pg->interlock);
1664 pg->pqflags |= PQ_WANTED;
1665 rw_exit(lock);
1666 UVM_UNLOCK_AND_WAIT(pg, &pg->interlock, false, wmesg, 0);
1667 }
1668
1669 /*
1670 * uvm_pagewakeup: wake anyone waiting on a page
1671 *
1672 * => page interlock must be held
1673 */
1674
1675 void
1676 uvm_pagewakeup(struct vm_page *pg)
1677 {
1678 UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);
1679
1680 KASSERT(mutex_owned(&pg->interlock));
1681
1682 UVMHIST_LOG(ubchist, "waking pg %#jx", (uintptr_t)pg, 0, 0, 0);
1683
1684 if ((pg->pqflags & PQ_WANTED) != 0) {
1685 wakeup(pg);
1686 pg->pqflags &= ~PQ_WANTED;
1687 }
1688 }
1689
1690 /*
1691 * uvm_pagewanted_p: return true if someone is waiting on the page
1692 *
1693 * => object must be write locked (lock out all concurrent access)
1694 */
1695
1696 bool
1697 uvm_pagewanted_p(struct vm_page *pg)
1698 {
1699
1700 KASSERT(uvm_page_owner_locked_p(pg, true));
1701
1702 return (atomic_load_relaxed(&pg->pqflags) & PQ_WANTED) != 0;
1703 }
1704
1705 #if defined(UVM_PAGE_TRKOWN)
1706 /*
1707 * uvm_page_own: set or release page ownership
1708 *
1709 * => this is a debugging function that keeps track of who sets PG_BUSY
1710 * and where they do it. it can be used to track down problems
1711 * such as a process setting "PG_BUSY" and never releasing it.
1712 * => page's object [if any] must be locked
1713 * => if "tag" is NULL then we are releasing page ownership
1714 */
1715 void
1716 uvm_page_own(struct vm_page *pg, const char *tag)
1717 {
1718
1719 KASSERT((pg->flags & (PG_PAGEOUT|PG_RELEASED)) == 0);
1720 KASSERT(uvm_page_owner_locked_p(pg, true));
1721
1722 /* gain ownership? */
1723 if (tag) {
1724 KASSERT((pg->flags & PG_BUSY) != 0);
1725 if (pg->owner_tag) {
1726 printf("uvm_page_own: page %p already owned "
1727 "by proc %d.%d [%s]\n", pg,
1728 pg->owner, pg->lowner, pg->owner_tag);
1729 panic("uvm_page_own");
1730 }
1731 pg->owner = curproc->p_pid;
1732 pg->lowner = curlwp->l_lid;
1733 pg->owner_tag = tag;
1734 return;
1735 }
1736
1737 /* drop ownership */
1738 KASSERT((pg->flags & PG_BUSY) == 0);
1739 if (pg->owner_tag == NULL) {
1740 printf("uvm_page_own: dropping ownership of a non-owned "
1741 "page (%p)\n", pg);
1742 panic("uvm_page_own");
1743 }
1744 pg->owner_tag = NULL;
1745 }
1746 #endif
1747
1748 /*
1749 * uvm_pagelookup: look up a page
1750 *
1751 * => caller should lock object to keep someone from pulling the page
1752 * out from under it
1753 */
1754
1755 struct vm_page *
1756 uvm_pagelookup(struct uvm_object *obj, voff_t off)
1757 {
1758 struct vm_page *pg;
1759 bool ddb __diagused = false;
1760 #ifdef DDB
1761 extern int db_active;
1762 ddb = db_active != 0;
1763 #endif
1764
1765 KASSERT(ddb || rw_lock_held(obj->vmobjlock));
1766
1767 pg = radix_tree_lookup_node(&obj->uo_pages, off >> PAGE_SHIFT);
1768
1769 KASSERT(pg == NULL || obj->uo_npages != 0);
1770 KASSERT(pg == NULL || (pg->flags & (PG_RELEASED|PG_PAGEOUT)) == 0 ||
1771 (pg->flags & PG_BUSY) != 0);
1772 return pg;
1773 }
1774
1775 /*
1776 * uvm_pagewire: wire the page, thus removing it from the daemon's grasp
1777 *
1778 * => caller must lock objects
1779 * => caller must hold pg->interlock
1780 */
1781
1782 void
1783 uvm_pagewire(struct vm_page *pg)
1784 {
1785
1786 KASSERT(uvm_page_owner_locked_p(pg, true));
1787 KASSERT(mutex_owned(&pg->interlock));
1788 #if defined(READAHEAD_STATS)
1789 if ((pg->flags & PG_READAHEAD) != 0) {
1790 uvm_ra_hit.ev_count++;
1791 pg->flags &= ~PG_READAHEAD;
1792 }
1793 #endif /* defined(READAHEAD_STATS) */
1794 if (pg->wire_count == 0) {
1795 uvm_pagedequeue(pg);
1796 atomic_inc_uint(&uvmexp.wired);
1797 }
1798 pg->wire_count++;
1799 KASSERT(pg->wire_count > 0); /* detect wraparound */
1800 }
1801
1802 /*
1803 * uvm_pageunwire: unwire the page.
1804 *
1805 * => activate if wire count goes to zero.
1806 * => caller must lock objects
1807 * => caller must hold pg->interlock
1808 */
1809
1810 void
1811 uvm_pageunwire(struct vm_page *pg)
1812 {
1813
1814 KASSERT(uvm_page_owner_locked_p(pg, true));
1815 KASSERT(pg->wire_count != 0);
1816 KASSERT(!uvmpdpol_pageisqueued_p(pg));
1817 KASSERT(mutex_owned(&pg->interlock));
1818 pg->wire_count--;
1819 if (pg->wire_count == 0) {
1820 uvm_pageactivate(pg);
1821 KASSERT(uvmexp.wired != 0);
1822 atomic_dec_uint(&uvmexp.wired);
1823 }
1824 }
1825
1826 /*
1827 * uvm_pagedeactivate: deactivate page
1828 *
1829 * => caller must lock objects
1830 * => caller must check to make sure page is not wired
1831 * => object that page belongs to must be locked (so we can adjust pg->flags)
1832 * => caller must clear the reference on the page before calling
1833 * => caller must hold pg->interlock
1834 */
1835
1836 void
1837 uvm_pagedeactivate(struct vm_page *pg)
1838 {
1839
1840 KASSERT(uvm_page_owner_locked_p(pg, false));
1841 KASSERT(mutex_owned(&pg->interlock));
1842 if (pg->wire_count == 0) {
1843 KASSERT(uvmpdpol_pageisqueued_p(pg));
1844 uvmpdpol_pagedeactivate(pg);
1845 }
1846 }
1847
1848 /*
1849 * uvm_pageactivate: activate page
1850 *
1851 * => caller must lock objects
1852 * => caller must hold pg->interlock
1853 */
1854
1855 void
1856 uvm_pageactivate(struct vm_page *pg)
1857 {
1858
1859 KASSERT(uvm_page_owner_locked_p(pg, false));
1860 KASSERT(mutex_owned(&pg->interlock));
1861 #if defined(READAHEAD_STATS)
1862 if ((pg->flags & PG_READAHEAD) != 0) {
1863 uvm_ra_hit.ev_count++;
1864 pg->flags &= ~PG_READAHEAD;
1865 }
1866 #endif /* defined(READAHEAD_STATS) */
1867 if (pg->wire_count == 0) {
1868 uvmpdpol_pageactivate(pg);
1869 }
1870 }
1871
1872 /*
1873 * uvm_pagedequeue: remove a page from any paging queue
1874 *
1875 * => caller must lock objects
1876 * => caller must hold pg->interlock
1877 */
1878 void
1879 uvm_pagedequeue(struct vm_page *pg)
1880 {
1881
1882 KASSERT(uvm_page_owner_locked_p(pg, true));
1883 KASSERT(mutex_owned(&pg->interlock));
1884 if (uvmpdpol_pageisqueued_p(pg)) {
1885 uvmpdpol_pagedequeue(pg);
1886 }
1887 }
1888
1889 /*
1890 * uvm_pageenqueue: add a page to a paging queue without activating.
1891 * used where a page is not really demanded (yet). e.g. read-ahead
1892 *
1893 * => caller must lock objects
1894 * => caller must hold pg->interlock
1895 */
1896 void
1897 uvm_pageenqueue(struct vm_page *pg)
1898 {
1899
1900 KASSERT(uvm_page_owner_locked_p(pg, false));
1901 KASSERT(mutex_owned(&pg->interlock));
1902 if (pg->wire_count == 0 && !uvmpdpol_pageisqueued_p(pg)) {
1903 uvmpdpol_pageenqueue(pg);
1904 }
1905 }
1906
1907 /*
1908 * uvm_pagelock: acquire page interlock
1909 */
1910 void
1911 uvm_pagelock(struct vm_page *pg)
1912 {
1913
1914 mutex_enter(&pg->interlock);
1915 }
1916
1917 /*
1918 * uvm_pagelock2: acquire two page interlocks
1919 */
1920 void
1921 uvm_pagelock2(struct vm_page *pg1, struct vm_page *pg2)
1922 {
1923
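	/*
	 * Taking the two interlocks in ascending address order gives every
	 * caller the same ordering, so two threads locking the same pair of
	 * pages cannot deadlock against each other.
	 */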
1924 if (pg1 < pg2) {
1925 mutex_enter(&pg1->interlock);
1926 mutex_enter(&pg2->interlock);
1927 } else {
1928 mutex_enter(&pg2->interlock);
1929 mutex_enter(&pg1->interlock);
1930 }
1931 }
1932
1933 /*
1934 * uvm_pageunlock: release page interlock, and if a page replacement intent
1935 * is set on the page, pass it to uvmpdpol to make real.
1936 *
1937 * => caller must hold pg->interlock
1938 */
1939 void
1940 uvm_pageunlock(struct vm_page *pg)
1941 {
1942
1943 if ((pg->pqflags & PQ_INTENT_SET) == 0 ||
1944 (pg->pqflags & PQ_INTENT_QUEUED) != 0) {
1945 mutex_exit(&pg->interlock);
1946 return;
1947 }
1948 pg->pqflags |= PQ_INTENT_QUEUED;
1949 mutex_exit(&pg->interlock);
1950 uvmpdpol_pagerealize(pg);
1951 }
1952
1953 /*
1954 * uvm_pageunlock2: release two page interlocks, and for both pages if a
1955 * page replacement intent is set on the page, pass it to uvmpdpol to make
1956 * real.
1957 *
1958 * => caller must hold pg->interlock
1959 */
1960 void
1961 uvm_pageunlock2(struct vm_page *pg1, struct vm_page *pg2)
1962 {
1963
1964 if ((pg1->pqflags & PQ_INTENT_SET) == 0 ||
1965 (pg1->pqflags & PQ_INTENT_QUEUED) != 0) {
1966 mutex_exit(&pg1->interlock);
1967 pg1 = NULL;
1968 } else {
1969 pg1->pqflags |= PQ_INTENT_QUEUED;
1970 mutex_exit(&pg1->interlock);
1971 }
1972
1973 if ((pg2->pqflags & PQ_INTENT_SET) == 0 ||
1974 (pg2->pqflags & PQ_INTENT_QUEUED) != 0) {
1975 mutex_exit(&pg2->interlock);
1976 pg2 = NULL;
1977 } else {
1978 pg2->pqflags |= PQ_INTENT_QUEUED;
1979 mutex_exit(&pg2->interlock);
1980 }
1981
1982 if (pg1 != NULL) {
1983 uvmpdpol_pagerealize(pg1);
1984 }
1985 if (pg2 != NULL) {
1986 uvmpdpol_pagerealize(pg2);
1987 }
1988 }
1989
1990 /*
1991 * uvm_pagezero: zero fill a page
1992 *
1993 * => if page is part of an object then the object should be locked
1994 * to protect pg->flags.
1995 */
1996
1997 void
1998 uvm_pagezero(struct vm_page *pg)
1999 {
2000
2001 uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
2002 pmap_zero_page(VM_PAGE_TO_PHYS(pg));
2003 }
2004
2005 /*
2006 * uvm_pagecopy: copy a page
2007 *
2008 * => if page is part of an object then the object should be locked
2009 * to protect pg->flags.
2010 */
2011
2012 void
2013 uvm_pagecopy(struct vm_page *src, struct vm_page *dst)
2014 {
2015
2016 uvm_pagemarkdirty(dst, UVM_PAGE_STATUS_DIRTY);
2017 pmap_copy_page(VM_PAGE_TO_PHYS(src), VM_PAGE_TO_PHYS(dst));
2018 }
2019
2020 /*
2021 * uvm_pageismanaged: test whether a page (specified by PA) is managed.
2022 */
2023
2024 bool
2025 uvm_pageismanaged(paddr_t pa)
2026 {
2027
2028 return (uvm_physseg_find(atop(pa), NULL) != UVM_PHYSSEG_TYPE_INVALID);
2029 }
2030
2031 /*
2032 * uvm_page_lookup_freelist: look up the free list for the specified page
2033 */
2034
2035 int
2036 uvm_page_lookup_freelist(struct vm_page *pg)
2037 {
2038 uvm_physseg_t upm;
2039
2040 upm = uvm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), NULL);
2041 KASSERT(upm != UVM_PHYSSEG_TYPE_INVALID);
2042 return uvm_physseg_get_free_list(upm);
2043 }
2044
2045 /*
2046 * uvm_page_owner_locked_p: return true if the object associated with the
2047 * page is locked.  this is a weak check for runtime assertions only.
2048 */
2049
2050 bool
2051 uvm_page_owner_locked_p(struct vm_page *pg, bool exclusive)
2052 {
2053
2054 if (pg->uobject != NULL) {
2055 return exclusive
2056 ? rw_write_held(pg->uobject->vmobjlock)
2057 : rw_lock_held(pg->uobject->vmobjlock);
2058 }
2059 if (pg->uanon != NULL) {
2060 return exclusive
2061 ? rw_write_held(pg->uanon->an_lock)
2062 : rw_lock_held(pg->uanon->an_lock);
2063 }
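	/*
	 * A page with neither an object nor an anon has no owner lock to
	 * check; treat it as locked.  this is part of why the check is weak.
	 */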
2064 return true;
2065 }
2066
2067 /*
2068 * uvm_pagereadonly_p: return true if the page should be mapped read-only
2069 */
2070
2071 bool
2072 uvm_pagereadonly_p(struct vm_page *pg)
2073 {
2074 struct uvm_object * const uobj = pg->uobject;
2075
2076 KASSERT(uobj == NULL || rw_lock_held(uobj->vmobjlock));
2077 KASSERT(uobj != NULL || rw_lock_held(pg->uanon->an_lock));
2078 if ((pg->flags & PG_RDONLY) != 0) {
2079 return true;
2080 }
2081 if (uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_CLEAN) {
2082 return true;
2083 }
2084 if (uobj == NULL) {
2085 return false;
2086 }
2087 return UVM_OBJ_NEEDS_WRITEFAULT(uobj);
2088 }
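
/*
 * Illustrative sketch only (not from the original source): a fault handler
 * could use uvm_pagereadonly_p() to trim the protection it hands to
 * pmap_enter(), along the lines of (pmap, va, prot and flags being whatever
 * the caller has at hand):
 *
 *	vm_prot_t prot = ...;
 *	if (uvm_pagereadonly_p(pg))
 *		prot &= ~VM_PROT_WRITE;
 *	pmap_enter(pmap, va, VM_PAGE_TO_PHYS(pg), prot, flags);
 */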
2089
2090 #ifdef PMAP_DIRECT
2091 /*
2092 * uvm_direct_process: have pmap translate each page's physical address into
2093 * a virtual one and run a callback on it.  Used to avoid actually mapping the
2094 * pages; the pmap most likely uses a direct map or equivalent.
2095 */
2096 int
2097 uvm_direct_process(struct vm_page **pgs, u_int npages, voff_t off, vsize_t len,
2098 int (*process)(void *, size_t, void *), void *arg)
2099 {
2100 int error = 0;
2101 paddr_t pa;
2102 size_t todo;
2103 voff_t pgoff = (off & PAGE_MASK);
2104 struct vm_page *pg;
2105
2106 KASSERT(npages > 0 && len > 0);
2107
2108 for (int i = 0; i < npages; i++) {
2109 pg = pgs[i];
2110
2111 KASSERT(len > 0);
2112
2113 /*
2114 * Caller is responsible for ensuring all the pages are
2115 * available.
2116 */
2117 KASSERT(pg != NULL && pg != PGO_DONTCARE);
2118
2119 pa = VM_PAGE_TO_PHYS(pg);
2120 todo = MIN(len, PAGE_SIZE - pgoff);
2121
2122 error = pmap_direct_process(pa, pgoff, todo, process, arg);
2123 if (error)
2124 break;
2125
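		/*
		 * The in-page offset applies only to the first page; the
		 * remaining pages are processed from their start.
		 */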
2126 pgoff = 0;
2127 len -= todo;
2128 }
2129
2130 KASSERTMSG(error != 0 || len == 0, "len %lu != 0 for non-error", len);
2131 return error;
2132 }
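
/*
 * Illustrative sketch only (hypothetical callback, not from the original
 * source): a caller could, for example, zero a byte range spanning several
 * resident pages without mapping them:
 *
 *	static int
 *	zero_cb(void *va, size_t len, void *arg)
 *	{
 *		memset(va, 0, len);
 *		return 0;
 *	}
 *
 *	error = uvm_direct_process(pgs, npages, off, len, zero_cb, NULL);
 */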
2133 #endif /* PMAP_DIRECT */
2134
2135 #if defined(DDB) || defined(DEBUGPRINT)
2136
2137 /*
2138 * uvm_page_printit: actually print the page
2139 */
2140
2141 static const char page_flagbits[] = UVM_PGFLAGBITS;
2142 static const char page_pqflagbits[] = UVM_PQFLAGBITS;
2143
2144 void
2145 uvm_page_printit(struct vm_page *pg, bool full,
2146 void (*pr)(const char *, ...))
2147 {
2148 struct vm_page *tpg;
2149 struct uvm_object *uobj;
2150 struct pgflbucket *pgb;
2151 struct pgflist *pgl;
2152 char pgbuf[128];
2153
2154 (*pr)("PAGE %p:\n", pg);
2155 snprintb(pgbuf, sizeof(pgbuf), page_flagbits, pg->flags);
2156 (*pr)(" flags=%s\n", pgbuf);
2157 snprintb(pgbuf, sizeof(pgbuf), page_pqflagbits, pg->pqflags);
2158 (*pr)(" pqflags=%s\n", pgbuf);
2159 (*pr)(" uobject=%p, uanon=%p, offset=0x%llx\n",
2160 pg->uobject, pg->uanon, (long long)pg->offset);
2161 (*pr)(" loan_count=%d wire_count=%d bucket=%d freelist=%d\n",
2162 pg->loan_count, pg->wire_count, uvm_page_get_bucket(pg),
2163 uvm_page_get_freelist(pg));
2164 (*pr)(" pa=0x%lx\n", (long)VM_PAGE_TO_PHYS(pg));
2165 #if defined(UVM_PAGE_TRKOWN)
2166 if (pg->flags & PG_BUSY)
2167 (*pr)(" owning process = %d.%d, tag=%s\n",
2168 pg->owner, pg->lowner, pg->owner_tag);
2169 else
2170 (*pr)(" page not busy, no owner\n");
2171 #else
2172 (*pr)(" [page ownership tracking disabled]\n");
2173 #endif
2174
2175 if (!full)
2176 return;
2177
2178 /* cross-verify object/anon */
2179 if ((pg->flags & PG_FREE) == 0) {
2180 if (pg->flags & PG_ANON) {
2181 if (pg->uanon == NULL || pg->uanon->an_page != pg)
2182 (*pr)(" >>> ANON DOES NOT POINT HERE <<< (%p)\n",
2183 (pg->uanon) ? pg->uanon->an_page : NULL);
2184 else
2185 (*pr)(" anon backpointer is OK\n");
2186 } else {
2187 uobj = pg->uobject;
2188 if (uobj) {
2189 (*pr)(" checking object list\n");
2190 tpg = uvm_pagelookup(uobj, pg->offset);
2191 if (tpg)
2192 (*pr)(" page found on object list\n");
2193 else
2194 (*pr)(" >>> PAGE NOT FOUND ON OBJECT LIST! <<<\n");
2195 }
2196 }
2197 }
2198
2199 /* cross-verify page queue */
2200 if (pg->flags & PG_FREE) {
2201 int fl = uvm_page_get_freelist(pg);
2202 int b = uvm_page_get_bucket(pg);
2203 pgb = uvm.page_free[fl].pgfl_buckets[b];
2204 pgl = &pgb->pgb_colors[VM_PGCOLOR(pg)];
2205 (*pr)(" checking pageq list\n");
2206 LIST_FOREACH(tpg, pgl, pageq.list) {
2207 if (tpg == pg) {
2208 break;
2209 }
2210 }
2211 if (tpg)
2212 (*pr)(" page found on pageq list\n");
2213 else
2214 (*pr)(" >>> PAGE NOT FOUND ON PAGEQ LIST! <<<\n");
2215 }
2216 }
2217
2218 /*
2219 * uvm_page_printall - print a summary of all managed pages
2220 */
2221
2222 void
2223 uvm_page_printall(void (*pr)(const char *, ...))
2224 {
2225 uvm_physseg_t i;
2226 paddr_t pfn;
2227 struct vm_page *pg;
2228
2229 (*pr)("%18s %4s %4s %18s %18s"
2230 #ifdef UVM_PAGE_TRKOWN
2231 " OWNER"
2232 #endif
2233 "\n", "PAGE", "FLAG", "PQ", "UOBJECT", "UANON");
2234 for (i = uvm_physseg_get_first();
2235 uvm_physseg_valid_p(i);
2236 i = uvm_physseg_get_next(i)) {
2237 for (pfn = uvm_physseg_get_start(i);
2238 pfn < uvm_physseg_get_end(i);
2239 pfn++) {
2240 pg = PHYS_TO_VM_PAGE(ptoa(pfn));
2241
2242 (*pr)("%18p %04x %08x %18p %18p",
2243 pg, pg->flags, pg->pqflags, pg->uobject,
2244 pg->uanon);
2245 #ifdef UVM_PAGE_TRKOWN
2246 if (pg->flags & PG_BUSY)
2247 (*pr)(" %d [%s]", pg->owner, pg->owner_tag);
2248 #endif
2249 (*pr)("\n");
2250 }
2251 }
2252 }
2253
2254 /*
2255 * uvm_page_print_freelists - print a summary of the freelists
2256 */
2257
2258 void
2259 uvm_page_print_freelists(void (*pr)(const char *, ...))
2260 {
2261 struct pgfreelist *pgfl;
2262 struct pgflbucket *pgb;
2263 int fl, b, c;
2264
2265 (*pr)("There are %d freelists with %d buckets of %d colors.\n\n",
2266 VM_NFREELIST, uvm.bucketcount, uvmexp.ncolors);
2267
2268 for (fl = 0; fl < VM_NFREELIST; fl++) {
2269 pgfl = &uvm.page_free[fl];
2270 (*pr)("freelist(%d) @ %p\n", fl, pgfl);
2271 for (b = 0; b < uvm.bucketcount; b++) {
2272 pgb = uvm.page_free[fl].pgfl_buckets[b];
2273 (*pr)(" bucket(%d) @ %p, nfree = %d, lock @ %p:\n",
2274 b, pgb, pgb->pgb_nfree,
2275 &uvm_freelist_locks[b].lock);
2276 for (c = 0; c < uvmexp.ncolors; c++) {
2277 (*pr)(" color(%d) @ %p, ", c,
2278 &pgb->pgb_colors[c]);
2279 (*pr)("first page = %p\n",
2280 LIST_FIRST(&pgb->pgb_colors[c]));
2281 }
2282 }
2283 }
2284 }
2285
2286 #endif /* DDB || DEBUGPRINT */
2287