uvm_page.c revision 1.242 1 /* $NetBSD: uvm_page.c,v 1.242 2020/06/14 21:41:42 ad Exp $ */
2
3 /*-
4 * Copyright (c) 2019, 2020 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * Copyright (c) 1997 Charles D. Cranor and Washington University.
34 * Copyright (c) 1991, 1993, The Regents of the University of California.
35 *
36 * All rights reserved.
37 *
38 * This code is derived from software contributed to Berkeley by
39 * The Mach Operating System project at Carnegie-Mellon University.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 * 1. Redistributions of source code must retain the above copyright
45 * notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright
47 * notice, this list of conditions and the following disclaimer in the
48 * documentation and/or other materials provided with the distribution.
49 * 3. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vm_page.c 8.3 (Berkeley) 3/21/94
66 * from: Id: uvm_page.c,v 1.1.2.18 1998/02/06 05:24:42 chs Exp
67 *
68 *
69 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
70 * All rights reserved.
71 *
72 * Permission to use, copy, modify and distribute this software and
73 * its documentation is hereby granted, provided that both the copyright
74 * notice and this permission notice appear in all copies of the
75 * software, derivative works or modified versions, and any portions
76 * thereof, and that both notices appear in supporting documentation.
77 *
78 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
79 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
80 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
81 *
82 * Carnegie Mellon requests users of this software to return to
83 *
84 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
85 * School of Computer Science
86 * Carnegie Mellon University
87 * Pittsburgh PA 15213-3890
88 *
89 * any improvements or extensions that they make and grant Carnegie the
90 * rights to redistribute these changes.
91 */
92
93 /*
94 * uvm_page.c: page ops.
95 */
96
97 #include <sys/cdefs.h>
98 __KERNEL_RCSID(0, "$NetBSD: uvm_page.c,v 1.242 2020/06/14 21:41:42 ad Exp $");
99
100 #include "opt_ddb.h"
101 #include "opt_uvm.h"
102 #include "opt_uvmhist.h"
103 #include "opt_readahead.h"
104
105 #include <sys/param.h>
106 #include <sys/systm.h>
107 #include <sys/sched.h>
108 #include <sys/kernel.h>
109 #include <sys/vnode.h>
110 #include <sys/proc.h>
111 #include <sys/radixtree.h>
112 #include <sys/atomic.h>
113 #include <sys/cpu.h>
114 #include <sys/extent.h>
115
116 #include <uvm/uvm.h>
117 #include <uvm/uvm_ddb.h>
118 #include <uvm/uvm_pdpolicy.h>
119 #include <uvm/uvm_pgflcache.h>
120
121 /*
122 * number of pages per-CPU to reserve for the kernel.
123 */
124 #ifndef UVM_RESERVED_PAGES_PER_CPU
125 #define UVM_RESERVED_PAGES_PER_CPU 5
126 #endif
127 int vm_page_reserve_kernel = UVM_RESERVED_PAGES_PER_CPU;
128
129 /*
130 * physical memory size;
131 */
132 psize_t physmem;
133
134 /*
135 * local variables
136 */
137
138 /*
139 * these variables record the values returned by vm_page_bootstrap,
140 * for debugging purposes. The implementation of uvm_pageboot_alloc
141 * and pmap_startup here also uses them internally.
142 */
143
144 static vaddr_t virtual_space_start;
145 static vaddr_t virtual_space_end;
146
147 /*
148 * we allocate an initial number of page colors in uvm_page_init(),
149 * and remember them. We may re-color pages as cache sizes are
150 * discovered during the autoconfiguration phase. But we can never
151 * free the initial set of buckets, since they are allocated using
152 * uvm_pageboot_alloc().
153 */
154
155 static size_t recolored_pages_memsize /* = 0 */;
156 static char *recolored_pages_mem;
157
158 /*
159 * freelist locks - one per bucket.
160 */
161
162 union uvm_freelist_lock uvm_freelist_locks[PGFL_MAX_BUCKETS]
163 __cacheline_aligned;
164
165 /*
166 * basic NUMA information.
167 */
168
169 static struct uvm_page_numa_region {
170 struct uvm_page_numa_region *next;
171 paddr_t start;
172 paddr_t size;
173 u_int numa_id;
174 } *uvm_page_numa_region;
175
176 #ifdef DEBUG
177 kmutex_t uvm_zerochecklock __cacheline_aligned;
178 vaddr_t uvm_zerocheckkva;
179 #endif /* DEBUG */
180
181 /*
182 * These functions are reserved for uvm(9) internal use and are not
183 * exported in the header file uvm_physseg.h
184 *
185 * Thus they are redefined here.
186 */
187 void uvm_physseg_init_seg(uvm_physseg_t, struct vm_page *);
188 void uvm_physseg_seg_chomp_slab(uvm_physseg_t, struct vm_page *, size_t);
189
190 /* returns a pgs array */
191 struct vm_page *uvm_physseg_seg_alloc_from_slab(uvm_physseg_t, size_t);
192
193 /*
194 * inline functions
195 */
196
197 /*
198 * uvm_pageinsert: insert a page in the object.
199 *
200 * => caller must lock object
201 * => call should have already set pg's object and offset pointers
202 * and bumped the version counter
203 */
204
205 static inline void
206 uvm_pageinsert_object(struct uvm_object *uobj, struct vm_page *pg)
207 {
208
209 KASSERT(uobj == pg->uobject);
210 KASSERT(rw_write_held(uobj->vmobjlock));
211 KASSERT((pg->flags & PG_TABLED) == 0);
212
213 if ((pg->flags & PG_STAT) != 0) {
214 /* Cannot use uvm_pagegetdirty(): not yet in radix tree. */
215 const unsigned int status = pg->flags & (PG_CLEAN | PG_DIRTY);
216
217 if ((pg->flags & PG_FILE) != 0) {
218 if (uobj->uo_npages == 0) {
219 struct vnode *vp = (struct vnode *)uobj;
220 mutex_enter(vp->v_interlock);
221 KASSERT((vp->v_iflag & VI_PAGES) == 0);
222 vp->v_iflag |= VI_PAGES;
223 vholdl(vp);
224 mutex_exit(vp->v_interlock);
225 }
226 if (UVM_OBJ_IS_VTEXT(uobj)) {
227 cpu_count(CPU_COUNT_EXECPAGES, 1);
228 }
229 cpu_count(CPU_COUNT_FILEUNKNOWN + status, 1);
230 } else {
231 cpu_count(CPU_COUNT_ANONUNKNOWN + status, 1);
232 }
233 }
234 pg->flags |= PG_TABLED;
235 uobj->uo_npages++;
236 }
237
238 static inline int
239 uvm_pageinsert_tree(struct uvm_object *uobj, struct vm_page *pg)
240 {
241 const uint64_t idx = pg->offset >> PAGE_SHIFT;
242 int error;
243
244 error = radix_tree_insert_node(&uobj->uo_pages, idx, pg);
245 if (error != 0) {
246 return error;
247 }
248 if ((pg->flags & PG_CLEAN) == 0) {
249 radix_tree_set_tag(&uobj->uo_pages, idx, UVM_PAGE_DIRTY_TAG);
250 }
251 KASSERT(((pg->flags & PG_CLEAN) == 0) ==
252 radix_tree_get_tag(&uobj->uo_pages, idx, UVM_PAGE_DIRTY_TAG));
253 return 0;
254 }
255
256 /*
257 * uvm_page_remove: remove page from object.
258 *
259 * => caller must lock object
260 */
261
262 static inline void
263 uvm_pageremove_object(struct uvm_object *uobj, struct vm_page *pg)
264 {
265
266 KASSERT(uobj == pg->uobject);
267 KASSERT(rw_write_held(uobj->vmobjlock));
268 KASSERT(pg->flags & PG_TABLED);
269
270 if ((pg->flags & PG_STAT) != 0) {
271 /* Cannot use uvm_pagegetdirty(): no longer in radix tree. */
272 const unsigned int status = pg->flags & (PG_CLEAN | PG_DIRTY);
273
274 if ((pg->flags & PG_FILE) != 0) {
275 if (uobj->uo_npages == 1) {
276 struct vnode *vp = (struct vnode *)uobj;
277 mutex_enter(vp->v_interlock);
278 KASSERT((vp->v_iflag & VI_PAGES) != 0);
279 vp->v_iflag &= ~VI_PAGES;
280 holdrelel(vp);
281 mutex_exit(vp->v_interlock);
282 }
283 if (UVM_OBJ_IS_VTEXT(uobj)) {
284 cpu_count(CPU_COUNT_EXECPAGES, -1);
285 }
286 cpu_count(CPU_COUNT_FILEUNKNOWN + status, -1);
287 } else {
288 cpu_count(CPU_COUNT_ANONUNKNOWN + status, -1);
289 }
290 }
291 uobj->uo_npages--;
292 pg->flags &= ~PG_TABLED;
293 pg->uobject = NULL;
294 }
295
296 static inline void
297 uvm_pageremove_tree(struct uvm_object *uobj, struct vm_page *pg)
298 {
299 struct vm_page *opg __unused;
300
301 opg = radix_tree_remove_node(&uobj->uo_pages, pg->offset >> PAGE_SHIFT);
302 KASSERT(pg == opg);
303 }
304
305 static void
306 uvm_page_init_bucket(struct pgfreelist *pgfl, struct pgflbucket *pgb, int num)
307 {
308 int i;
309
310 pgb->pgb_nfree = 0;
311 for (i = 0; i < uvmexp.ncolors; i++) {
312 LIST_INIT(&pgb->pgb_colors[i]);
313 }
314 pgfl->pgfl_buckets[num] = pgb;
315 }
316
317 /*
318 * uvm_page_init: init the page system. called from uvm_init().
319 *
320 * => we return the range of kernel virtual memory in kvm_startp/kvm_endp
321 */
322
323 void
324 uvm_page_init(vaddr_t *kvm_startp, vaddr_t *kvm_endp)
325 {
326 static struct uvm_cpu boot_cpu __cacheline_aligned;
327 psize_t freepages, pagecount, bucketsize, n;
328 struct pgflbucket *pgb;
329 struct vm_page *pagearray;
330 char *bucketarray;
331 uvm_physseg_t bank;
332 int fl, b;
333
334 KASSERT(ncpu <= 1);
335
336 /*
337 * init the page queues and free page queue locks, except the
338 * free list; we allocate that later (with the initial vm_page
339 * structures).
340 */
341
342 curcpu()->ci_data.cpu_uvm = &boot_cpu;
343 uvmpdpol_init();
344 for (b = 0; b < __arraycount(uvm_freelist_locks); b++) {
345 mutex_init(&uvm_freelist_locks[b].lock, MUTEX_DEFAULT, IPL_VM);
346 }
347
348 /*
349 * allocate vm_page structures.
350 */
351
352 /*
353 * sanity check:
354 * before calling this function the MD code is expected to register
355 * some free RAM with the uvm_page_physload() function. our job
356 * now is to allocate vm_page structures for this memory.
357 */
358
359 if (uvm_physseg_get_last() == UVM_PHYSSEG_TYPE_INVALID)
360 panic("uvm_page_bootstrap: no memory pre-allocated");
361
362 /*
363 * first calculate the number of free pages...
364 *
365 * note that we use start/end rather than avail_start/avail_end.
366 * this allows us to allocate extra vm_page structures in case we
367 * want to return some memory to the pool after booting.
368 */
369
370 freepages = 0;
371
372 for (bank = uvm_physseg_get_first();
373 uvm_physseg_valid_p(bank) ;
374 bank = uvm_physseg_get_next(bank)) {
375 freepages += (uvm_physseg_get_end(bank) - uvm_physseg_get_start(bank));
376 }
377
378 /*
379 * Let MD code initialize the number of colors, or default
380 * to 1 color if MD code doesn't care.
381 */
382 if (uvmexp.ncolors == 0)
383 uvmexp.ncolors = 1;
384 uvmexp.colormask = uvmexp.ncolors - 1;
385 KASSERT((uvmexp.colormask & uvmexp.ncolors) == 0);
386
387 /* We always start with only 1 bucket. */
388 uvm.bucketcount = 1;
389
390 /*
391 * we now know we have (PAGE_SIZE * freepages) bytes of memory we can
392 * use. for each page of memory we use we need a vm_page structure.
393 * thus, the total number of pages we can use is the total size of
394 * the memory divided by the PAGE_SIZE plus the size of the vm_page
395 * structure. we add one to freepages as a fudge factor to avoid
396 * truncation errors (since we can only allocate in terms of whole
397 * pages).
398 */
399 pagecount = ((freepages + 1) << PAGE_SHIFT) /
400 (PAGE_SIZE + sizeof(struct vm_page));
401 bucketsize = offsetof(struct pgflbucket, pgb_colors[uvmexp.ncolors]);
402 bucketsize = roundup2(bucketsize, coherency_unit);
403 bucketarray = (void *)uvm_pageboot_alloc(
404 bucketsize * VM_NFREELIST +
405 pagecount * sizeof(struct vm_page));
406 pagearray = (struct vm_page *)
407 (bucketarray + bucketsize * VM_NFREELIST);
408
409 for (fl = 0; fl < VM_NFREELIST; fl++) {
410 pgb = (struct pgflbucket *)(bucketarray + bucketsize * fl);
411 uvm_page_init_bucket(&uvm.page_free[fl], pgb, 0);
412 }
413 memset(pagearray, 0, pagecount * sizeof(struct vm_page));
414
415 /*
416 * init the freelist cache in the disabled state.
417 */
418 uvm_pgflcache_init();
419
420 /*
421 * init the vm_page structures and put them in the correct place.
422 */
423 /* First init the extent */
424
425 for (bank = uvm_physseg_get_first(),
426 uvm_physseg_seg_chomp_slab(bank, pagearray, pagecount);
427 uvm_physseg_valid_p(bank);
428 bank = uvm_physseg_get_next(bank)) {
429
430 n = uvm_physseg_get_end(bank) - uvm_physseg_get_start(bank);
431 uvm_physseg_seg_alloc_from_slab(bank, n);
432 uvm_physseg_init_seg(bank, pagearray);
433
434 /* set up page array pointers */
435 pagearray += n;
436 pagecount -= n;
437 }
438
439 /*
440 * pass up the values of virtual_space_start and
441 * virtual_space_end (obtained by uvm_pageboot_alloc) to the upper
442 * layers of the VM.
443 */
444
445 *kvm_startp = round_page(virtual_space_start);
446 *kvm_endp = trunc_page(virtual_space_end);
447 #ifdef DEBUG
448 /*
449 * steal kva for uvm_pagezerocheck().
450 */
451 uvm_zerocheckkva = *kvm_startp;
452 *kvm_startp += PAGE_SIZE;
453 mutex_init(&uvm_zerochecklock, MUTEX_DEFAULT, IPL_VM);
454 #endif /* DEBUG */
455
456 /*
457 * init various thresholds.
458 */
459
460 uvmexp.reserve_pagedaemon = 1;
461 uvmexp.reserve_kernel = vm_page_reserve_kernel;
462
463 /*
464 * done!
465 */
466
467 uvm.page_init_done = true;
468 }
469
470 /*
471 * uvm_pgfl_lock: lock all freelist buckets
472 */
473
474 void
475 uvm_pgfl_lock(void)
476 {
477 int i;
478
479 for (i = 0; i < __arraycount(uvm_freelist_locks); i++) {
480 mutex_spin_enter(&uvm_freelist_locks[i].lock);
481 }
482 }
483
484 /*
485 * uvm_pgfl_unlock: unlock all freelist buckets
486 */
487
488 void
489 uvm_pgfl_unlock(void)
490 {
491 int i;
492
493 for (i = 0; i < __arraycount(uvm_freelist_locks); i++) {
494 mutex_spin_exit(&uvm_freelist_locks[i].lock);
495 }
496 }
497
498 /*
499 * uvm_setpagesize: set the page size
500 *
501 * => sets page_shift and page_mask from uvmexp.pagesize.
502 */
503
504 void
505 uvm_setpagesize(void)
506 {
507
508 /*
509 * If uvmexp.pagesize is 0 at this point, we expect PAGE_SIZE
510 * to be a constant (indicated by being a non-zero value).
511 */
512 if (uvmexp.pagesize == 0) {
513 if (PAGE_SIZE == 0)
514 panic("uvm_setpagesize: uvmexp.pagesize not set");
515 uvmexp.pagesize = PAGE_SIZE;
516 }
517 uvmexp.pagemask = uvmexp.pagesize - 1;
518 if ((uvmexp.pagemask & uvmexp.pagesize) != 0)
519 panic("uvm_setpagesize: page size %u (%#x) not a power of two",
520 uvmexp.pagesize, uvmexp.pagesize);
521 for (uvmexp.pageshift = 0; ; uvmexp.pageshift++)
522 if ((1 << uvmexp.pageshift) == uvmexp.pagesize)
523 break;
524 }
525
526 /*
527 * uvm_pageboot_alloc: steal memory from physmem for bootstrapping
528 */
529
530 vaddr_t
531 uvm_pageboot_alloc(vsize_t size)
532 {
533 static bool initialized = false;
534 vaddr_t addr;
535 #if !defined(PMAP_STEAL_MEMORY)
536 vaddr_t vaddr;
537 paddr_t paddr;
538 #endif
539
540 /*
541 * on first call to this function, initialize ourselves.
542 */
543 if (initialized == false) {
544 pmap_virtual_space(&virtual_space_start, &virtual_space_end);
545
546 /* round it the way we like it */
547 virtual_space_start = round_page(virtual_space_start);
548 virtual_space_end = trunc_page(virtual_space_end);
549
550 initialized = true;
551 }
552
553 /* round to page size */
554 size = round_page(size);
555 uvmexp.bootpages += atop(size);
556
557 #if defined(PMAP_STEAL_MEMORY)
558
559 /*
560 * defer bootstrap allocation to MD code (it may want to allocate
561 * from a direct-mapped segment). pmap_steal_memory should adjust
562 * virtual_space_start/virtual_space_end if necessary.
563 */
564
565 addr = pmap_steal_memory(size, &virtual_space_start,
566 &virtual_space_end);
567
568 return(addr);
569
570 #else /* !PMAP_STEAL_MEMORY */
571
572 /*
573 * allocate virtual memory for this request
574 */
575 if (virtual_space_start == virtual_space_end ||
576 (virtual_space_end - virtual_space_start) < size)
577 panic("uvm_pageboot_alloc: out of virtual space");
578
579 addr = virtual_space_start;
580
581 #ifdef PMAP_GROWKERNEL
582 /*
583 * If the kernel pmap can't map the requested space,
584 * then allocate more resources for it.
585 */
586 if (uvm_maxkaddr < (addr + size)) {
587 uvm_maxkaddr = pmap_growkernel(addr + size);
588 if (uvm_maxkaddr < (addr + size))
589 panic("uvm_pageboot_alloc: pmap_growkernel() failed");
590 }
591 #endif
592
593 virtual_space_start += size;
594
595 /*
596 * allocate and mapin physical pages to back new virtual pages
597 */
598
599 for (vaddr = round_page(addr) ; vaddr < addr + size ;
600 vaddr += PAGE_SIZE) {
601
602 if (!uvm_page_physget(&paddr))
603 panic("uvm_pageboot_alloc: out of memory");
604
605 /*
606 * Note this memory is no longer managed, so using
607 * pmap_kenter is safe.
608 */
609 pmap_kenter_pa(vaddr, paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
610 }
611 pmap_update(pmap_kernel());
612 return(addr);
613 #endif /* PMAP_STEAL_MEMORY */
614 }
615
616 #if !defined(PMAP_STEAL_MEMORY)
617 /*
618 * uvm_page_physget: "steal" one page from the vm_physmem structure.
619 *
620 * => attempt to allocate it off the end of a segment in which the "avail"
621 * values match the start/end values. if we can't do that, then we
622 * will advance both values (making them equal, and removing some
623 * vm_page structures from the non-avail area).
624 * => return false if out of memory.
625 */
626
627 /* subroutine: try to allocate from memory chunks on the specified freelist */
628 static bool uvm_page_physget_freelist(paddr_t *, int);
629
630 static bool
631 uvm_page_physget_freelist(paddr_t *paddrp, int freelist)
632 {
633 uvm_physseg_t lcv;
634
635 /* pass 1: try allocating from a matching end */
636 #if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
637 for (lcv = uvm_physseg_get_last(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_prev(lcv))
638 #else
639 for (lcv = uvm_physseg_get_first(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_next(lcv))
640 #endif
641 {
642 if (uvm.page_init_done == true)
643 panic("uvm_page_physget: called _after_ bootstrap");
644
645 /* Try to match at front or back on unused segment */
646 if (uvm_page_physunload(lcv, freelist, paddrp))
647 return true;
648 }
649
650 /* pass2: forget about matching ends, just allocate something */
651 #if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
652 for (lcv = uvm_physseg_get_last(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_prev(lcv))
653 #else
654 for (lcv = uvm_physseg_get_first(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_next(lcv))
655 #endif
656 {
657 /* Try the front regardless. */
658 if (uvm_page_physunload_force(lcv, freelist, paddrp))
659 return true;
660 }
661 return false;
662 }
663
664 bool
665 uvm_page_physget(paddr_t *paddrp)
666 {
667 int i;
668
669 /* try in the order of freelist preference */
670 for (i = 0; i < VM_NFREELIST; i++)
671 if (uvm_page_physget_freelist(paddrp, i) == true)
672 return (true);
673 return (false);
674 }
675 #endif /* PMAP_STEAL_MEMORY */
676
677 /*
678 * PHYS_TO_VM_PAGE: find vm_page for a PA. used by MI code to get vm_pages
679 * back from an I/O mapping (ugh!). used in some MD code as well.
680 */
681 struct vm_page *
682 uvm_phys_to_vm_page(paddr_t pa)
683 {
684 paddr_t pf = atop(pa);
685 paddr_t off;
686 uvm_physseg_t upm;
687
688 upm = uvm_physseg_find(pf, &off);
689 if (upm != UVM_PHYSSEG_TYPE_INVALID)
690 return uvm_physseg_get_pg(upm, off);
691 return(NULL);
692 }
693
694 paddr_t
695 uvm_vm_page_to_phys(const struct vm_page *pg)
696 {
697
698 return pg->phys_addr & ~(PAGE_SIZE - 1);
699 }
700
701 /*
702 * uvm_page_numa_load: load NUMA range description.
703 */
704 void
705 uvm_page_numa_load(paddr_t start, paddr_t size, u_int numa_id)
706 {
707 struct uvm_page_numa_region *d;
708
709 KASSERT(numa_id < PGFL_MAX_BUCKETS);
710
711 d = kmem_alloc(sizeof(*d), KM_SLEEP);
712 d->start = start;
713 d->size = size;
714 d->numa_id = numa_id;
715 d->next = uvm_page_numa_region;
716 uvm_page_numa_region = d;
717 }
718
719 /*
720 * uvm_page_numa_lookup: lookup NUMA node for the given page.
721 */
722 static u_int
723 uvm_page_numa_lookup(struct vm_page *pg)
724 {
725 struct uvm_page_numa_region *d;
726 static bool warned;
727 paddr_t pa;
728
729 KASSERT(uvm_page_numa_region != NULL);
730
731 pa = VM_PAGE_TO_PHYS(pg);
732 for (d = uvm_page_numa_region; d != NULL; d = d->next) {
733 if (pa >= d->start && pa < d->start + d->size) {
734 return d->numa_id;
735 }
736 }
737
738 if (!warned) {
739 printf("uvm_page_numa_lookup: failed, first pg=%p pa=%#"
740 PRIxPADDR "\n", pg, VM_PAGE_TO_PHYS(pg));
741 warned = true;
742 }
743
744 return 0;
745 }
746
747 /*
748 * uvm_page_redim: adjust freelist dimensions if they have changed.
749 */
750
751 static void
752 uvm_page_redim(int newncolors, int newnbuckets)
753 {
754 struct pgfreelist npgfl;
755 struct pgflbucket *opgb, *npgb;
756 struct pgflist *ohead, *nhead;
757 struct vm_page *pg;
758 size_t bucketsize, bucketmemsize, oldbucketmemsize;
759 int fl, ob, oc, nb, nc, obuckets, ocolors;
760 char *bucketarray, *oldbucketmem, *bucketmem;
761
762 KASSERT(((newncolors - 1) & newncolors) == 0);
763
764 /* Anything to do? */
765 if (newncolors <= uvmexp.ncolors &&
766 newnbuckets == uvm.bucketcount) {
767 return;
768 }
769 if (uvm.page_init_done == false) {
770 uvmexp.ncolors = newncolors;
771 return;
772 }
773
774 bucketsize = offsetof(struct pgflbucket, pgb_colors[newncolors]);
775 bucketsize = roundup2(bucketsize, coherency_unit);
776 bucketmemsize = bucketsize * newnbuckets * VM_NFREELIST +
777 coherency_unit - 1;
778 bucketmem = kmem_zalloc(bucketmemsize, KM_SLEEP);
779 bucketarray = (char *)roundup2((uintptr_t)bucketmem, coherency_unit);
780
781 ocolors = uvmexp.ncolors;
782 obuckets = uvm.bucketcount;
783
784 /* Freelist cache musn't be enabled. */
785 uvm_pgflcache_pause();
786
787 /* Make sure we should still do this. */
788 uvm_pgfl_lock();
789 if (newncolors <= uvmexp.ncolors &&
790 newnbuckets == uvm.bucketcount) {
791 uvm_pgfl_unlock();
792 uvm_pgflcache_resume();
793 kmem_free(bucketmem, bucketmemsize);
794 return;
795 }
796
797 uvmexp.ncolors = newncolors;
798 uvmexp.colormask = uvmexp.ncolors - 1;
799 uvm.bucketcount = newnbuckets;
800
801 for (fl = 0; fl < VM_NFREELIST; fl++) {
802 /* Init new buckets in new freelist. */
803 memset(&npgfl, 0, sizeof(npgfl));
804 for (nb = 0; nb < newnbuckets; nb++) {
805 npgb = (struct pgflbucket *)bucketarray;
806 uvm_page_init_bucket(&npgfl, npgb, nb);
807 bucketarray += bucketsize;
808 }
809 /* Now transfer pages from the old freelist. */
810 for (nb = ob = 0; ob < obuckets; ob++) {
811 opgb = uvm.page_free[fl].pgfl_buckets[ob];
812 for (oc = 0; oc < ocolors; oc++) {
813 ohead = &opgb->pgb_colors[oc];
814 while ((pg = LIST_FIRST(ohead)) != NULL) {
815 LIST_REMOVE(pg, pageq.list);
816 /*
817 * Here we decide on the NEW color &
818 * bucket for the page. For NUMA
819 * we'll use the info that the
820 * hardware gave us. For non-NUMA
821 * assign take physical page frame
822 * number and cache color into
823 * account. We do this to try and
824 * avoid defeating any memory
825 * interleaving in the hardware.
826 */
827 KASSERT(
828 uvm_page_get_bucket(pg) == ob);
829 KASSERT(fl ==
830 uvm_page_get_freelist(pg));
831 if (uvm_page_numa_region != NULL) {
832 nb = uvm_page_numa_lookup(pg);
833 } else {
834 nb = atop(VM_PAGE_TO_PHYS(pg))
835 / uvmexp.ncolors / 8
836 % newnbuckets;
837 }
838 uvm_page_set_bucket(pg, nb);
839 npgb = npgfl.pgfl_buckets[nb];
840 npgb->pgb_nfree++;
841 nc = VM_PGCOLOR(pg);
842 nhead = &npgb->pgb_colors[nc];
843 LIST_INSERT_HEAD(nhead, pg, pageq.list);
844 }
845 }
846 }
847 /* Install the new freelist. */
848 memcpy(&uvm.page_free[fl], &npgfl, sizeof(npgfl));
849 }
850
851 /* Unlock and free the old memory. */
852 oldbucketmemsize = recolored_pages_memsize;
853 oldbucketmem = recolored_pages_mem;
854 recolored_pages_memsize = bucketmemsize;
855 recolored_pages_mem = bucketmem;
856
857 uvm_pgfl_unlock();
858 uvm_pgflcache_resume();
859
860 if (oldbucketmemsize) {
861 kmem_free(oldbucketmem, oldbucketmemsize);
862 }
863
864 /*
865 * this calls uvm_km_alloc() which may want to hold
866 * uvm_freelist_lock.
867 */
868 uvm_pager_realloc_emerg();
869 }
870
871 /*
872 * uvm_page_recolor: Recolor the pages if the new color count is
873 * larger than the old one.
874 */
875
876 void
877 uvm_page_recolor(int newncolors)
878 {
879
880 uvm_page_redim(newncolors, uvm.bucketcount);
881 }
882
883 /*
884 * uvm_page_rebucket: Determine a bucket structure and redim the free
885 * lists to match.
886 */
887
888 void
889 uvm_page_rebucket(void)
890 {
891 u_int min_numa, max_numa, npackage, shift;
892 struct cpu_info *ci, *ci2, *ci3;
893 CPU_INFO_ITERATOR cii;
894
895 /*
896 * If we have more than one NUMA node, and the maximum NUMA node ID
897 * is less than PGFL_MAX_BUCKETS, then we'll use NUMA distribution
898 * for free pages.
899 */
900 min_numa = (u_int)-1;
901 max_numa = 0;
902 for (CPU_INFO_FOREACH(cii, ci)) {
903 if (ci->ci_numa_id < min_numa) {
904 min_numa = ci->ci_numa_id;
905 }
906 if (ci->ci_numa_id > max_numa) {
907 max_numa = ci->ci_numa_id;
908 }
909 }
910 if (min_numa != max_numa && max_numa < PGFL_MAX_BUCKETS) {
911 aprint_debug("UVM: using NUMA allocation scheme\n");
912 for (CPU_INFO_FOREACH(cii, ci)) {
913 ci->ci_data.cpu_uvm->pgflbucket = ci->ci_numa_id;
914 }
915 uvm_page_redim(uvmexp.ncolors, max_numa + 1);
916 return;
917 }
918
919 /*
920 * Otherwise we'll go with a scheme to maximise L2/L3 cache locality
921 * and minimise lock contention. Count the total number of CPU
922 * packages, and then try to distribute the buckets among CPU
923 * packages evenly.
924 */
925 npackage = curcpu()->ci_nsibling[CPUREL_PACKAGE1ST];
926
927 /*
928 * Figure out how to arrange the packages & buckets, and the total
929 * number of buckets we need. XXX 2 may not be the best factor.
930 */
931 for (shift = 0; npackage > PGFL_MAX_BUCKETS; shift++) {
932 npackage >>= 1;
933 }
934 uvm_page_redim(uvmexp.ncolors, npackage);
935
936 /*
937 * Now tell each CPU which bucket to use. In the outer loop, scroll
938 * through all CPU packages.
939 */
940 npackage = 0;
941 ci = curcpu();
942 ci2 = ci->ci_sibling[CPUREL_PACKAGE1ST];
943 do {
944 /*
945 * In the inner loop, scroll through all CPUs in the package
946 * and assign the same bucket ID.
947 */
948 ci3 = ci2;
949 do {
950 ci3->ci_data.cpu_uvm->pgflbucket = npackage >> shift;
951 ci3 = ci3->ci_sibling[CPUREL_PACKAGE];
952 } while (ci3 != ci2);
953 npackage++;
954 ci2 = ci2->ci_sibling[CPUREL_PACKAGE1ST];
955 } while (ci2 != ci->ci_sibling[CPUREL_PACKAGE1ST]);
956
957 aprint_debug("UVM: using package allocation scheme, "
958 "%d package(s) per bucket\n", 1 << shift);
959 }
960
961 /*
962 * uvm_cpu_attach: initialize per-CPU data structures.
963 */
964
965 void
966 uvm_cpu_attach(struct cpu_info *ci)
967 {
968 struct uvm_cpu *ucpu;
969
970 /* Already done in uvm_page_init(). */
971 if (!CPU_IS_PRIMARY(ci)) {
972 /* Add more reserve pages for this CPU. */
973 uvmexp.reserve_kernel += vm_page_reserve_kernel;
974
975 /* Allocate per-CPU data structures. */
976 ucpu = kmem_zalloc(sizeof(struct uvm_cpu) + coherency_unit - 1,
977 KM_SLEEP);
978 ucpu = (struct uvm_cpu *)roundup2((uintptr_t)ucpu,
979 coherency_unit);
980 ci->ci_data.cpu_uvm = ucpu;
981 } else {
982 ucpu = ci->ci_data.cpu_uvm;
983 }
984
985 uvmpdpol_init_cpu(ucpu);
986
987 /*
988 * Attach RNG source for this CPU's VM events
989 */
990 rnd_attach_source(&ucpu->rs, ci->ci_data.cpu_name, RND_TYPE_VM,
991 RND_FLAG_COLLECT_TIME|RND_FLAG_COLLECT_VALUE|
992 RND_FLAG_ESTIMATE_VALUE);
993 }
994
995 /*
996 * uvm_availmem: fetch the total amount of free memory in pages. this can
997 * have a detrimental effect on performance due to false sharing; don't call
998 * unless needed.
999 *
1000 * some users can request the amount of free memory so often that it begins
1001 * to impact upon performance. if calling frequently and an inexact value
1002 * is okay, call with cached = true.
1003 */
1004
1005 int
1006 uvm_availmem(bool cached)
1007 {
1008 int64_t fp;
1009
1010 cpu_count_sync(cached);
1011 if ((fp = cpu_count_get(CPU_COUNT_FREEPAGES)) < 0) {
1012 /*
1013 * XXXAD could briefly go negative because it's impossible
1014 * to get a clean snapshot. address this for other counters
1015 * used as running totals before NetBSD 10 although less
1016 * important for those.
1017 */
1018 fp = 0;
1019 }
1020 return (int)fp;
1021 }
1022
1023 /*
1024 * uvm_pagealloc_pgb: helper routine that tries to allocate any color from a
1025 * specific freelist and specific bucket only.
1026 *
1027 * => must be at IPL_VM or higher to protect per-CPU data structures.
1028 */
1029
1030 static struct vm_page *
1031 uvm_pagealloc_pgb(struct uvm_cpu *ucpu, int f, int b, int *trycolorp, int flags)
1032 {
1033 int c, trycolor, colormask;
1034 struct pgflbucket *pgb;
1035 struct vm_page *pg;
1036 kmutex_t *lock;
1037 bool fill;
1038
1039 /*
1040 * Skip the bucket if empty, no lock needed. There could be many
1041 * empty freelists/buckets.
1042 */
1043 pgb = uvm.page_free[f].pgfl_buckets[b];
1044 if (pgb->pgb_nfree == 0) {
1045 return NULL;
1046 }
1047
1048 /* Skip bucket if low on memory. */
1049 lock = &uvm_freelist_locks[b].lock;
1050 mutex_spin_enter(lock);
1051 if (__predict_false(pgb->pgb_nfree <= uvmexp.reserve_kernel)) {
1052 if ((flags & UVM_PGA_USERESERVE) == 0 ||
1053 (pgb->pgb_nfree <= uvmexp.reserve_pagedaemon &&
1054 curlwp != uvm.pagedaemon_lwp)) {
1055 mutex_spin_exit(lock);
1056 return NULL;
1057 }
1058 fill = false;
1059 } else {
1060 fill = true;
1061 }
1062
1063 /* Try all page colors as needed. */
1064 c = trycolor = *trycolorp;
1065 colormask = uvmexp.colormask;
1066 do {
1067 pg = LIST_FIRST(&pgb->pgb_colors[c]);
1068 if (__predict_true(pg != NULL)) {
1069 /*
1070 * Got a free page! PG_FREE must be cleared under
1071 * lock because of uvm_pglistalloc().
1072 */
1073 LIST_REMOVE(pg, pageq.list);
1074 KASSERT(pg->flags == PG_FREE);
1075 pg->flags = PG_BUSY | PG_CLEAN | PG_FAKE;
1076 pgb->pgb_nfree--;
1077
1078 /*
1079 * While we have the bucket locked and our data
1080 * structures fresh in L1 cache, we have an ideal
1081 * opportunity to grab some pages for the freelist
1082 * cache without causing extra contention. Only do
1083 * so if we found pages in this CPU's preferred
1084 * bucket.
1085 */
1086 if (__predict_true(b == ucpu->pgflbucket && fill)) {
1087 uvm_pgflcache_fill(ucpu, f, b, c);
1088 }
1089 mutex_spin_exit(lock);
1090 KASSERT(uvm_page_get_bucket(pg) == b);
1091 CPU_COUNT(c == trycolor ?
1092 CPU_COUNT_COLORHIT : CPU_COUNT_COLORMISS, 1);
1093 CPU_COUNT(CPU_COUNT_CPUMISS, 1);
1094 *trycolorp = c;
1095 return pg;
1096 }
1097 c = (c + 1) & colormask;
1098 } while (c != trycolor);
1099 mutex_spin_exit(lock);
1100
1101 return NULL;
1102 }
1103
1104 /*
1105 * uvm_pagealloc_pgfl: helper routine for uvm_pagealloc_strat that allocates
1106 * any color from any bucket, in a specific freelist.
1107 *
1108 * => must be at IPL_VM or higher to protect per-CPU data structures.
1109 */
1110
1111 static struct vm_page *
1112 uvm_pagealloc_pgfl(struct uvm_cpu *ucpu, int f, int *trycolorp, int flags)
1113 {
1114 int b, trybucket, bucketcount;
1115 struct vm_page *pg;
1116
1117 /* Try for the exact thing in the per-CPU cache. */
1118 if ((pg = uvm_pgflcache_alloc(ucpu, f, *trycolorp)) != NULL) {
1119 CPU_COUNT(CPU_COUNT_CPUHIT, 1);
1120 CPU_COUNT(CPU_COUNT_COLORHIT, 1);
1121 return pg;
1122 }
1123
1124 /* Walk through all buckets, trying our preferred bucket first. */
1125 trybucket = ucpu->pgflbucket;
1126 b = trybucket;
1127 bucketcount = uvm.bucketcount;
1128 do {
1129 pg = uvm_pagealloc_pgb(ucpu, f, b, trycolorp, flags);
1130 if (pg != NULL) {
1131 return pg;
1132 }
1133 b = (b + 1 == bucketcount ? 0 : b + 1);
1134 } while (b != trybucket);
1135
1136 return NULL;
1137 }
1138
1139 /*
1140 * uvm_pagealloc_strat: allocate vm_page from a particular free list.
1141 *
1142 * => return null if no pages free
1143 * => wake up pagedaemon if number of free pages drops below low water mark
1144 * => if obj != NULL, obj must be locked (to put in obj's tree)
1145 * => if anon != NULL, anon must be locked (to put in anon)
1146 * => only one of obj or anon can be non-null
1147 * => caller must activate/deactivate page if it is not wired.
1148 * => free_list is ignored if strat == UVM_PGA_STRAT_NORMAL.
1149 * => policy decision: it is more important to pull a page off of the
1150 * appropriate priority free list than it is to get a page from the
1151 * correct bucket or color bin. This is because we live with the
1152 * consequences of a bad free list decision for the entire
1153 * lifetime of the page, e.g. if the page comes from memory that
1154 * is slower to access.
1155 */
1156
1157 struct vm_page *
1158 uvm_pagealloc_strat(struct uvm_object *obj, voff_t off, struct vm_anon *anon,
1159 int flags, int strat, int free_list)
1160 {
1161 int color, lcv, error, s;
1162 struct uvm_cpu *ucpu;
1163 struct vm_page *pg;
1164 lwp_t *l;
1165
1166 KASSERT(obj == NULL || anon == NULL);
1167 KASSERT(anon == NULL || (flags & UVM_FLAG_COLORMATCH) || off == 0);
1168 KASSERT(off == trunc_page(off));
1169 KASSERT(obj == NULL || rw_write_held(obj->vmobjlock));
1170 KASSERT(anon == NULL || anon->an_lock == NULL ||
1171 rw_write_held(anon->an_lock));
1172
1173 /*
1174 * This implements a global round-robin page coloring
1175 * algorithm.
1176 */
1177
1178 s = splvm();
1179 ucpu = curcpu()->ci_data.cpu_uvm;
1180 if (flags & UVM_FLAG_COLORMATCH) {
1181 color = atop(off) & uvmexp.colormask;
1182 } else {
1183 color = ucpu->pgflcolor;
1184 }
1185
1186 /*
1187 * fail if any of these conditions is true:
1188 * [1] there really are no free pages, or
1189 * [2] only kernel "reserved" pages remain and
1190 * reserved pages have not been requested.
1191 * [3] only pagedaemon "reserved" pages remain and
1192 * the requestor isn't the pagedaemon.
1193 * we make kernel reserve pages available if called by a
1194 * kernel thread.
1195 */
1196 l = curlwp;
1197 if (__predict_true(l != NULL) && (l->l_flag & LW_SYSTEM) != 0) {
1198 flags |= UVM_PGA_USERESERVE;
1199 }
1200
1201 again:
1202 switch (strat) {
1203 case UVM_PGA_STRAT_NORMAL:
1204 /* Check freelists: descending priority (ascending id) order. */
1205 for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
1206 pg = uvm_pagealloc_pgfl(ucpu, lcv, &color, flags);
1207 if (pg != NULL) {
1208 goto gotit;
1209 }
1210 }
1211
1212 /* No pages free! Have pagedaemon free some memory. */
1213 splx(s);
1214 uvm_kick_pdaemon();
1215 return NULL;
1216
1217 case UVM_PGA_STRAT_ONLY:
1218 case UVM_PGA_STRAT_FALLBACK:
1219 /* Attempt to allocate from the specified free list. */
1220 KASSERT(free_list >= 0 && free_list < VM_NFREELIST);
1221 pg = uvm_pagealloc_pgfl(ucpu, free_list, &color, flags);
1222 if (pg != NULL) {
1223 goto gotit;
1224 }
1225
1226 /* Fall back, if possible. */
1227 if (strat == UVM_PGA_STRAT_FALLBACK) {
1228 strat = UVM_PGA_STRAT_NORMAL;
1229 goto again;
1230 }
1231
1232 /* No pages free! Have pagedaemon free some memory. */
1233 splx(s);
1234 uvm_kick_pdaemon();
1235 return NULL;
1236
1237 case UVM_PGA_STRAT_NUMA:
1238 /*
1239 * NUMA strategy (experimental): allocating from the correct
1240 * bucket is more important than observing freelist
1241 * priority. Look only to the current NUMA node; if that
1242 * fails, we need to look to other NUMA nodes, so retry with
1243 * the normal strategy.
1244 */
1245 for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
1246 pg = uvm_pgflcache_alloc(ucpu, lcv, color);
1247 if (pg != NULL) {
1248 CPU_COUNT(CPU_COUNT_CPUHIT, 1);
1249 CPU_COUNT(CPU_COUNT_COLORHIT, 1);
1250 goto gotit;
1251 }
1252 pg = uvm_pagealloc_pgb(ucpu, lcv,
1253 ucpu->pgflbucket, &color, flags);
1254 if (pg != NULL) {
1255 goto gotit;
1256 }
1257 }
1258 strat = UVM_PGA_STRAT_NORMAL;
1259 goto again;
1260
1261 default:
1262 panic("uvm_pagealloc_strat: bad strat %d", strat);
1263 /* NOTREACHED */
1264 }
1265
1266 gotit:
1267 /*
1268 * We now know which color we actually allocated from; set
1269 * the next color accordingly.
1270 */
1271
1272 ucpu->pgflcolor = (color + 1) & uvmexp.colormask;
1273
1274 /*
1275 * while still at IPL_VM, update allocation statistics.
1276 */
1277
1278 CPU_COUNT(CPU_COUNT_FREEPAGES, -1);
1279 if (anon) {
1280 CPU_COUNT(CPU_COUNT_ANONCLEAN, 1);
1281 }
1282 splx(s);
1283 KASSERT(pg->flags == (PG_BUSY|PG_CLEAN|PG_FAKE));
1284
1285 /*
1286 * assign the page to the object. as the page was free, we know
1287 * that pg->uobject and pg->uanon are NULL. we only need to take
1288 * the page's interlock if we are changing the values.
1289 */
1290 if (anon != NULL || obj != NULL) {
1291 mutex_enter(&pg->interlock);
1292 }
1293 pg->offset = off;
1294 pg->uobject = obj;
1295 pg->uanon = anon;
1296 KASSERT(uvm_page_owner_locked_p(pg, true));
1297 if (anon) {
1298 anon->an_page = pg;
1299 pg->flags |= PG_ANON;
1300 mutex_exit(&pg->interlock);
1301 } else if (obj) {
1302 /*
1303 * set PG_FILE|PG_AOBJ before the first uvm_pageinsert.
1304 */
1305 if (UVM_OBJ_IS_VNODE(obj)) {
1306 pg->flags |= PG_FILE;
1307 } else if (UVM_OBJ_IS_AOBJ(obj)) {
1308 pg->flags |= PG_AOBJ;
1309 }
1310 uvm_pageinsert_object(obj, pg);
1311 mutex_exit(&pg->interlock);
1312 error = uvm_pageinsert_tree(obj, pg);
1313 if (error != 0) {
1314 mutex_enter(&pg->interlock);
1315 uvm_pageremove_object(obj, pg);
1316 mutex_exit(&pg->interlock);
1317 uvm_pagefree(pg);
1318 return NULL;
1319 }
1320 }
1321
1322 #if defined(UVM_PAGE_TRKOWN)
1323 pg->owner_tag = NULL;
1324 #endif
1325 UVM_PAGE_OWN(pg, "new alloc");
1326
1327 if (flags & UVM_PGA_ZERO) {
1328 /* A zero'd page is not clean. */
1329 if (obj != NULL || anon != NULL) {
1330 uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
1331 }
1332 pmap_zero_page(VM_PAGE_TO_PHYS(pg));
1333 }
1334
1335 return(pg);
1336 }
1337
1338 /*
1339 * uvm_pagereplace: replace a page with another
1340 *
1341 * => object must be locked
1342 * => page interlocks must be held
1343 */
1344
1345 void
1346 uvm_pagereplace(struct vm_page *oldpg, struct vm_page *newpg)
1347 {
1348 struct uvm_object *uobj = oldpg->uobject;
1349 struct vm_page *pg __diagused;
1350 uint64_t idx;
1351
1352 KASSERT((oldpg->flags & PG_TABLED) != 0);
1353 KASSERT(uobj != NULL);
1354 KASSERT((newpg->flags & PG_TABLED) == 0);
1355 KASSERT(newpg->uobject == NULL);
1356 KASSERT(rw_write_held(uobj->vmobjlock));
1357 KASSERT(mutex_owned(&oldpg->interlock));
1358 KASSERT(mutex_owned(&newpg->interlock));
1359
1360 newpg->uobject = uobj;
1361 newpg->offset = oldpg->offset;
1362 idx = newpg->offset >> PAGE_SHIFT;
1363 pg = radix_tree_replace_node(&uobj->uo_pages, idx, newpg);
1364 KASSERT(pg == oldpg);
1365 if (((oldpg->flags ^ newpg->flags) & PG_CLEAN) != 0) {
1366 if ((newpg->flags & PG_CLEAN) != 0) {
1367 radix_tree_clear_tag(&uobj->uo_pages, idx,
1368 UVM_PAGE_DIRTY_TAG);
1369 } else {
1370 radix_tree_set_tag(&uobj->uo_pages, idx,
1371 UVM_PAGE_DIRTY_TAG);
1372 }
1373 }
1374 /*
1375 * oldpg's PG_STAT is stable. newpg is not reachable by others yet.
1376 */
1377 newpg->flags |=
1378 (newpg->flags & ~PG_STAT) | (oldpg->flags & PG_STAT);
1379 uvm_pageinsert_object(uobj, newpg);
1380 uvm_pageremove_object(uobj, oldpg);
1381 }
1382
1383 /*
1384 * uvm_pagerealloc: reallocate a page from one object to another
1385 *
1386 * => both objects must be locked
1387 */
1388
1389 int
1390 uvm_pagerealloc(struct vm_page *pg, struct uvm_object *newobj, voff_t newoff)
1391 {
1392 int error = 0;
1393
1394 /*
1395 * remove it from the old object
1396 */
1397
1398 if (pg->uobject) {
1399 uvm_pageremove_tree(pg->uobject, pg);
1400 uvm_pageremove_object(pg->uobject, pg);
1401 }
1402
1403 /*
1404 * put it in the new object
1405 */
1406
1407 if (newobj) {
1408 mutex_enter(&pg->interlock);
1409 pg->uobject = newobj;
1410 pg->offset = newoff;
1411 if (UVM_OBJ_IS_VNODE(newobj)) {
1412 pg->flags |= PG_FILE;
1413 } else if (UVM_OBJ_IS_AOBJ(newobj)) {
1414 pg->flags |= PG_AOBJ;
1415 }
1416 uvm_pageinsert_object(newobj, pg);
1417 mutex_exit(&pg->interlock);
1418 error = uvm_pageinsert_tree(newobj, pg);
1419 if (error != 0) {
1420 mutex_enter(&pg->interlock);
1421 uvm_pageremove_object(newobj, pg);
1422 mutex_exit(&pg->interlock);
1423 }
1424 }
1425
1426 return error;
1427 }
1428
1429 #ifdef DEBUG
1430 /*
1431 * check if page is zero-filled
1432 */
1433 void
1434 uvm_pagezerocheck(struct vm_page *pg)
1435 {
1436 int *p, *ep;
1437
1438 KASSERT(uvm_zerocheckkva != 0);
1439
1440 /*
1441 * XXX assuming pmap_kenter_pa and pmap_kremove never call
1442 * uvm page allocator.
1443 *
1444 * it might be better to have "CPU-local temporary map" pmap interface.
1445 */
1446 mutex_spin_enter(&uvm_zerochecklock);
1447 pmap_kenter_pa(uvm_zerocheckkva, VM_PAGE_TO_PHYS(pg), VM_PROT_READ, 0);
1448 p = (int *)uvm_zerocheckkva;
1449 ep = (int *)((char *)p + PAGE_SIZE);
1450 pmap_update(pmap_kernel());
1451 while (p < ep) {
1452 if (*p != 0)
1453 panic("zero page isn't zero-filled");
1454 p++;
1455 }
1456 pmap_kremove(uvm_zerocheckkva, PAGE_SIZE);
1457 mutex_spin_exit(&uvm_zerochecklock);
1458 /*
1459 * pmap_update() is not necessary here because no one except us
1460 * uses this VA.
1461 */
1462 }
1463 #endif /* DEBUG */
1464
1465 /*
1466 * uvm_pagefree: free page
1467 *
1468 * => erase page's identity (i.e. remove from object)
1469 * => put page on free list
1470 * => caller must lock owning object (either anon or uvm_object)
1471 * => assumes all valid mappings of pg are gone
1472 */
1473
1474 void
1475 uvm_pagefree(struct vm_page *pg)
1476 {
1477 struct pgfreelist *pgfl;
1478 struct pgflbucket *pgb;
1479 struct uvm_cpu *ucpu;
1480 kmutex_t *lock;
1481 int bucket, s;
1482 bool locked;
1483
1484 #ifdef DEBUG
1485 if (pg->uobject == (void *)0xdeadbeef &&
1486 pg->uanon == (void *)0xdeadbeef) {
1487 panic("uvm_pagefree: freeing free page %p", pg);
1488 }
1489 #endif /* DEBUG */
1490
1491 KASSERT((pg->flags & PG_PAGEOUT) == 0);
1492 KASSERT(!(pg->flags & PG_FREE));
1493 KASSERT(pg->uobject == NULL || rw_write_held(pg->uobject->vmobjlock));
1494 KASSERT(pg->uobject != NULL || pg->uanon == NULL ||
1495 rw_write_held(pg->uanon->an_lock));
1496
1497 /*
1498 * remove the page from the object's tree before acquiring any page
1499 * interlocks: this can acquire locks to free radixtree nodes.
1500 */
1501 if (pg->uobject != NULL) {
1502 uvm_pageremove_tree(pg->uobject, pg);
1503 }
1504
1505 /*
1506 * if the page is loaned, resolve the loan instead of freeing.
1507 */
1508
1509 if (pg->loan_count) {
1510 KASSERT(pg->wire_count == 0);
1511
1512 /*
1513 * if the page is owned by an anon then we just want to
1514 * drop anon ownership. the kernel will free the page when
1515 * it is done with it. if the page is owned by an object,
1516 * remove it from the object and mark it dirty for the benefit
1517 * of possible anon owners.
1518 *
1519 * regardless of previous ownership, wakeup any waiters,
1520 * unbusy the page, and we're done.
1521 */
1522
1523 uvm_pagelock(pg);
1524 locked = true;
1525 if (pg->uobject != NULL) {
1526 uvm_pageremove_object(pg->uobject, pg);
1527 pg->flags &= ~(PG_FILE|PG_AOBJ);
1528 } else if (pg->uanon != NULL) {
1529 if ((pg->flags & PG_ANON) == 0) {
1530 pg->loan_count--;
1531 } else {
1532 const unsigned status = uvm_pagegetdirty(pg);
1533 pg->flags &= ~PG_ANON;
1534 cpu_count(CPU_COUNT_ANONUNKNOWN + status, -1);
1535 }
1536 pg->uanon->an_page = NULL;
1537 pg->uanon = NULL;
1538 }
1539 if (pg->pqflags & PQ_WANTED) {
1540 wakeup(pg);
1541 }
1542 pg->pqflags &= ~PQ_WANTED;
1543 pg->flags &= ~(PG_BUSY|PG_RELEASED|PG_PAGER1);
1544 #ifdef UVM_PAGE_TRKOWN
1545 pg->owner_tag = NULL;
1546 #endif
1547 KASSERT((pg->flags & PG_STAT) == 0);
1548 if (pg->loan_count) {
1549 KASSERT(pg->uobject == NULL);
1550 if (pg->uanon == NULL) {
1551 uvm_pagedequeue(pg);
1552 }
1553 uvm_pageunlock(pg);
1554 return;
1555 }
1556 } else if (pg->uobject != NULL || pg->uanon != NULL ||
1557 pg->wire_count != 0) {
1558 uvm_pagelock(pg);
1559 locked = true;
1560 } else {
1561 locked = false;
1562 }
1563
1564 /*
1565 * remove page from its object or anon.
1566 */
1567 if (pg->uobject != NULL) {
1568 uvm_pageremove_object(pg->uobject, pg);
1569 } else if (pg->uanon != NULL) {
1570 const unsigned int status = uvm_pagegetdirty(pg);
1571 pg->uanon->an_page = NULL;
1572 pg->uanon = NULL;
1573 cpu_count(CPU_COUNT_ANONUNKNOWN + status, -1);
1574 }
1575
1576 /*
1577 * if the page was wired, unwire it now.
1578 */
1579
1580 if (pg->wire_count) {
1581 pg->wire_count = 0;
1582 atomic_dec_uint(&uvmexp.wired);
1583 }
1584 if (locked) {
1585 /*
1586 * wake anyone waiting on the page.
1587 */
1588 if ((pg->pqflags & PQ_WANTED) != 0) {
1589 pg->pqflags &= ~PQ_WANTED;
1590 wakeup(pg);
1591 }
1592
1593 /*
1594 * now remove the page from the queues.
1595 */
1596 uvm_pagedequeue(pg);
1597 uvm_pageunlock(pg);
1598 } else {
1599 KASSERT(!uvmpdpol_pageisqueued_p(pg));
1600 }
1601
1602 /*
1603 * and put on free queue
1604 */
1605
1606 #ifdef DEBUG
1607 pg->uobject = (void *)0xdeadbeef;
1608 pg->uanon = (void *)0xdeadbeef;
1609 #endif /* DEBUG */
1610
1611 /* Try to send the page to the per-CPU cache. */
1612 s = splvm();
1613 CPU_COUNT(CPU_COUNT_FREEPAGES, 1);
1614 ucpu = curcpu()->ci_data.cpu_uvm;
1615 bucket = uvm_page_get_bucket(pg);
1616 if (bucket == ucpu->pgflbucket && uvm_pgflcache_free(ucpu, pg)) {
1617 splx(s);
1618 return;
1619 }
1620
1621 /* Didn't work. Never mind, send it to a global bucket. */
1622 pgfl = &uvm.page_free[uvm_page_get_freelist(pg)];
1623 pgb = pgfl->pgfl_buckets[bucket];
1624 lock = &uvm_freelist_locks[bucket].lock;
1625
1626 mutex_spin_enter(lock);
1627 /* PG_FREE must be set under lock because of uvm_pglistalloc(). */
1628 pg->flags = PG_FREE;
1629 LIST_INSERT_HEAD(&pgb->pgb_colors[VM_PGCOLOR(pg)], pg, pageq.list);
1630 pgb->pgb_nfree++;
1631 mutex_spin_exit(lock);
1632 splx(s);
1633 }
1634
1635 /*
1636 * uvm_page_unbusy: unbusy an array of pages.
1637 *
1638 * => pages must either all belong to the same object, or all belong to anons.
1639 * => if pages are object-owned, object must be locked.
1640 * => if pages are anon-owned, anons must be locked.
1641 * => caller must make sure that anon-owned pages are not PG_RELEASED.
1642 */
1643
1644 void
1645 uvm_page_unbusy(struct vm_page **pgs, int npgs)
1646 {
1647 struct vm_page *pg;
1648 int i;
1649 UVMHIST_FUNC("uvm_page_unbusy"); UVMHIST_CALLED(ubchist);
1650
1651 for (i = 0; i < npgs; i++) {
1652 pg = pgs[i];
1653 if (pg == NULL || pg == PGO_DONTCARE) {
1654 continue;
1655 }
1656
1657 KASSERT(uvm_page_owner_locked_p(pg, true));
1658 KASSERT(pg->flags & PG_BUSY);
1659 KASSERT((pg->flags & PG_PAGEOUT) == 0);
1660 if (pg->flags & PG_RELEASED) {
1661 UVMHIST_LOG(ubchist, "releasing pg %#jx",
1662 (uintptr_t)pg, 0, 0, 0);
1663 KASSERT(pg->uobject != NULL ||
1664 (pg->uanon != NULL && pg->uanon->an_ref > 0));
1665 pg->flags &= ~PG_RELEASED;
1666 uvm_pagefree(pg);
1667 } else {
1668 UVMHIST_LOG(ubchist, "unbusying pg %#jx",
1669 (uintptr_t)pg, 0, 0, 0);
1670 KASSERT((pg->flags & PG_FAKE) == 0);
1671 pg->flags &= ~PG_BUSY;
1672 uvm_pagelock(pg);
1673 uvm_pagewakeup(pg);
1674 uvm_pageunlock(pg);
1675 UVM_PAGE_OWN(pg, NULL);
1676 }
1677 }
1678 }
1679
1680 /*
1681 * uvm_pagewait: wait for a busy page
1682 *
1683 * => page must be known PG_BUSY
1684 * => object must be read or write locked
1685 * => object will be unlocked on return
1686 */
1687
1688 void
1689 uvm_pagewait(struct vm_page *pg, krwlock_t *lock, const char *wmesg)
1690 {
1691
1692 KASSERT(rw_lock_held(lock));
1693 KASSERT((pg->flags & PG_BUSY) != 0);
1694 KASSERT(uvm_page_owner_locked_p(pg, false));
1695
1696 mutex_enter(&pg->interlock);
1697 pg->pqflags |= PQ_WANTED;
1698 rw_exit(lock);
1699 UVM_UNLOCK_AND_WAIT(pg, &pg->interlock, false, wmesg, 0);
1700 }
1701
1702 /*
1703 * uvm_pagewakeup: wake anyone waiting on a page
1704 *
1705 * => page interlock must be held
1706 */
1707
1708 void
1709 uvm_pagewakeup(struct vm_page *pg)
1710 {
1711 UVMHIST_FUNC("uvm_pagewakeup"); UVMHIST_CALLED(ubchist);
1712
1713 KASSERT(mutex_owned(&pg->interlock));
1714
1715 UVMHIST_LOG(ubchist, "waking pg %#jx", (uintptr_t)pg, 0, 0, 0);
1716
1717 if ((pg->pqflags & PQ_WANTED) != 0) {
1718 wakeup(pg);
1719 pg->pqflags &= ~PQ_WANTED;
1720 }
1721 }
1722
1723 /*
1724 * uvm_pagewanted_p: return true if someone is waiting on the page
1725 *
1726 * => object must be write locked (lock out all concurrent access)
1727 */
1728
1729 bool
1730 uvm_pagewanted_p(struct vm_page *pg)
1731 {
1732
1733 KASSERT(uvm_page_owner_locked_p(pg, true));
1734
1735 return (atomic_load_relaxed(&pg->pqflags) & PQ_WANTED) != 0;
1736 }
1737
1738 #if defined(UVM_PAGE_TRKOWN)
1739 /*
1740 * uvm_page_own: set or release page ownership
1741 *
1742 * => this is a debugging function that keeps track of who sets PG_BUSY
1743 * and where they do it. it can be used to track down problems
1744 * such a process setting "PG_BUSY" and never releasing it.
1745 * => page's object [if any] must be locked
1746 * => if "tag" is NULL then we are releasing page ownership
1747 */
1748 void
1749 uvm_page_own(struct vm_page *pg, const char *tag)
1750 {
1751
1752 KASSERT((pg->flags & (PG_PAGEOUT|PG_RELEASED)) == 0);
1753 KASSERT(uvm_page_owner_locked_p(pg, true));
1754
1755 /* gain ownership? */
1756 if (tag) {
1757 KASSERT((pg->flags & PG_BUSY) != 0);
1758 if (pg->owner_tag) {
1759 printf("uvm_page_own: page %p already owned "
1760 "by proc %d.%d [%s]\n", pg,
1761 pg->owner, pg->lowner, pg->owner_tag);
1762 panic("uvm_page_own");
1763 }
1764 pg->owner = curproc->p_pid;
1765 pg->lowner = curlwp->l_lid;
1766 pg->owner_tag = tag;
1767 return;
1768 }
1769
1770 /* drop ownership */
1771 KASSERT((pg->flags & PG_BUSY) == 0);
1772 if (pg->owner_tag == NULL) {
1773 printf("uvm_page_own: dropping ownership of an non-owned "
1774 "page (%p)\n", pg);
1775 panic("uvm_page_own");
1776 }
1777 pg->owner_tag = NULL;
1778 }
1779 #endif
1780
1781 /*
1782 * uvm_pagelookup: look up a page
1783 *
1784 * => caller should lock object to keep someone from pulling the page
1785 * out from under it
1786 */
1787
1788 struct vm_page *
1789 uvm_pagelookup(struct uvm_object *obj, voff_t off)
1790 {
1791 struct vm_page *pg;
1792
1793 /* No - used from DDB. KASSERT(rw_lock_held(obj->vmobjlock)); */
1794
1795 pg = radix_tree_lookup_node(&obj->uo_pages, off >> PAGE_SHIFT);
1796
1797 KASSERT(pg == NULL || obj->uo_npages != 0);
1798 KASSERT(pg == NULL || (pg->flags & (PG_RELEASED|PG_PAGEOUT)) == 0 ||
1799 (pg->flags & PG_BUSY) != 0);
1800 return pg;
1801 }
1802
1803 /*
1804 * uvm_pagewire: wire the page, thus removing it from the daemon's grasp
1805 *
1806 * => caller must lock objects
1807 * => caller must hold pg->interlock
1808 */
1809
1810 void
1811 uvm_pagewire(struct vm_page *pg)
1812 {
1813
1814 KASSERT(uvm_page_owner_locked_p(pg, true));
1815 KASSERT(mutex_owned(&pg->interlock));
1816 #if defined(READAHEAD_STATS)
1817 if ((pg->flags & PG_READAHEAD) != 0) {
1818 uvm_ra_hit.ev_count++;
1819 pg->flags &= ~PG_READAHEAD;
1820 }
1821 #endif /* defined(READAHEAD_STATS) */
1822 if (pg->wire_count == 0) {
1823 uvm_pagedequeue(pg);
1824 atomic_inc_uint(&uvmexp.wired);
1825 }
1826 pg->wire_count++;
1827 KASSERT(pg->wire_count > 0); /* detect wraparound */
1828 }
1829
1830 /*
1831 * uvm_pageunwire: unwire the page.
1832 *
1833 * => activate if wire count goes to zero.
1834 * => caller must lock objects
1835 * => caller must hold pg->interlock
1836 */
1837
1838 void
1839 uvm_pageunwire(struct vm_page *pg)
1840 {
1841
1842 KASSERT(uvm_page_owner_locked_p(pg, true));
1843 KASSERT(pg->wire_count != 0);
1844 KASSERT(!uvmpdpol_pageisqueued_p(pg));
1845 KASSERT(mutex_owned(&pg->interlock));
1846 pg->wire_count--;
1847 if (pg->wire_count == 0) {
1848 uvm_pageactivate(pg);
1849 KASSERT(uvmexp.wired != 0);
1850 atomic_dec_uint(&uvmexp.wired);
1851 }
1852 }
1853
1854 /*
1855 * uvm_pagedeactivate: deactivate page
1856 *
1857 * => caller must lock objects
1858 * => caller must check to make sure page is not wired
1859 * => object that page belongs to must be locked (so we can adjust pg->flags)
1860 * => caller must clear the reference on the page before calling
1861 * => caller must hold pg->interlock
1862 */
1863
1864 void
1865 uvm_pagedeactivate(struct vm_page *pg)
1866 {
1867
1868 KASSERT(uvm_page_owner_locked_p(pg, false));
1869 KASSERT(mutex_owned(&pg->interlock));
1870 if (pg->wire_count == 0) {
1871 KASSERT(uvmpdpol_pageisqueued_p(pg));
1872 uvmpdpol_pagedeactivate(pg);
1873 }
1874 }
1875
1876 /*
1877 * uvm_pageactivate: activate page
1878 *
1879 * => caller must lock objects
1880 * => caller must hold pg->interlock
1881 */
1882
1883 void
1884 uvm_pageactivate(struct vm_page *pg)
1885 {
1886
1887 KASSERT(uvm_page_owner_locked_p(pg, false));
1888 KASSERT(mutex_owned(&pg->interlock));
1889 #if defined(READAHEAD_STATS)
1890 if ((pg->flags & PG_READAHEAD) != 0) {
1891 uvm_ra_hit.ev_count++;
1892 pg->flags &= ~PG_READAHEAD;
1893 }
1894 #endif /* defined(READAHEAD_STATS) */
1895 if (pg->wire_count == 0) {
1896 uvmpdpol_pageactivate(pg);
1897 }
1898 }
1899
1900 /*
1901 * uvm_pagedequeue: remove a page from any paging queue
1902 *
1903 * => caller must lock objects
1904 * => caller must hold pg->interlock
1905 */
1906 void
1907 uvm_pagedequeue(struct vm_page *pg)
1908 {
1909
1910 KASSERT(uvm_page_owner_locked_p(pg, true));
1911 KASSERT(mutex_owned(&pg->interlock));
1912 if (uvmpdpol_pageisqueued_p(pg)) {
1913 uvmpdpol_pagedequeue(pg);
1914 }
1915 }
1916
1917 /*
1918 * uvm_pageenqueue: add a page to a paging queue without activating.
1919 * used where a page is not really demanded (yet). eg. read-ahead
1920 *
1921 * => caller must lock objects
1922 * => caller must hold pg->interlock
1923 */
1924 void
1925 uvm_pageenqueue(struct vm_page *pg)
1926 {
1927
1928 KASSERT(uvm_page_owner_locked_p(pg, false));
1929 KASSERT(mutex_owned(&pg->interlock));
1930 if (pg->wire_count == 0 && !uvmpdpol_pageisqueued_p(pg)) {
1931 uvmpdpol_pageenqueue(pg);
1932 }
1933 }
1934
1935 /*
1936 * uvm_pagelock: acquire page interlock
1937 */
1938 void
1939 uvm_pagelock(struct vm_page *pg)
1940 {
1941
1942 mutex_enter(&pg->interlock);
1943 }
1944
1945 /*
1946 * uvm_pagelock2: acquire two page interlocks
1947 */
1948 void
1949 uvm_pagelock2(struct vm_page *pg1, struct vm_page *pg2)
1950 {
1951
1952 if (pg1 < pg2) {
1953 mutex_enter(&pg1->interlock);
1954 mutex_enter(&pg2->interlock);
1955 } else {
1956 mutex_enter(&pg2->interlock);
1957 mutex_enter(&pg1->interlock);
1958 }
1959 }
1960
1961 /*
1962 * uvm_pageunlock: release page interlock, and if a page replacement intent
1963 * is set on the page, pass it to uvmpdpol to make real.
1964 *
1965 * => caller must hold pg->interlock
1966 */
1967 void
1968 uvm_pageunlock(struct vm_page *pg)
1969 {
1970
1971 if ((pg->pqflags & PQ_INTENT_SET) == 0 ||
1972 (pg->pqflags & PQ_INTENT_QUEUED) != 0) {
1973 mutex_exit(&pg->interlock);
1974 return;
1975 }
1976 pg->pqflags |= PQ_INTENT_QUEUED;
1977 mutex_exit(&pg->interlock);
1978 uvmpdpol_pagerealize(pg);
1979 }
1980
1981 /*
1982 * uvm_pageunlock2: release two page interlocks and, for each of the two
1983 * pages that has a page replacement intent set, pass it to uvmpdpol to be
1984 * realized.
1985 *
1986 * => caller must hold both pg1->interlock and pg2->interlock
1987 */
1988 void
1989 uvm_pageunlock2(struct vm_page *pg1, struct vm_page *pg2)
1990 {
1991
1992 if ((pg1->pqflags & PQ_INTENT_SET) == 0 ||
1993 (pg1->pqflags & PQ_INTENT_QUEUED) != 0) {
1994 mutex_exit(&pg1->interlock);
1995 pg1 = NULL;
1996 } else {
1997 pg1->pqflags |= PQ_INTENT_QUEUED;
1998 mutex_exit(&pg1->interlock);
1999 }
2000
2001 if ((pg2->pqflags & PQ_INTENT_SET) == 0 ||
2002 (pg2->pqflags & PQ_INTENT_QUEUED) != 0) {
2003 mutex_exit(&pg2->interlock);
2004 pg2 = NULL;
2005 } else {
2006 pg2->pqflags |= PQ_INTENT_QUEUED;
2007 mutex_exit(&pg2->interlock);
2008 }
2009
2010 if (pg1 != NULL) {
2011 uvmpdpol_pagerealize(pg1);
2012 }
2013 if (pg2 != NULL) {
2014 uvmpdpol_pagerealize(pg2);
2015 }
2016 }
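
/*
 * Illustrative sketch (not from the original source): a caller working on a
 * pair of pages, e.g. when loaning or replacing one page with another, can
 * use the paired interlock routines; any replacement intents set on either
 * page are realized on the way out:
 *
 *	uvm_pagelock2(pg1, pg2);
 *	uvm_pageactivate(pg1);
 *	uvm_pageactivate(pg2);
 *	uvm_pageunlock2(pg1, pg2);
 */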
2017
2018 /*
2019 * uvm_pagezero: zero fill a page
2020 *
2021 * => if page is part of an object then the object should be locked
2022 * to protect pg->flags.
2023 */
2024
2025 void
2026 uvm_pagezero(struct vm_page *pg)
2027 {
2028
2029 uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
2030 pmap_zero_page(VM_PAGE_TO_PHYS(pg));
2031 }
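
/*
 * Illustrative sketch (not from the original source): a caller that did not
 * ask the allocator for a pre-zeroed page might clear a freshly allocated
 * page itself ("uobj" and "off" stand for the caller's locked object and
 * offset); the page is marked dirty as a side effect:
 *
 *	pg = uvm_pagealloc(uobj, off, NULL, 0);
 *	if (pg != NULL)
 *		uvm_pagezero(pg);
 */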
2032
2033 /*
2034 * uvm_pagecopy: copy a page
2035 *
2036 * => if page is part of an object then the object should be locked
2037 * to protect pg->flags.
2038 */
2039
2040 void
2041 uvm_pagecopy(struct vm_page *src, struct vm_page *dst)
2042 {
2043
2044 uvm_pagemarkdirty(dst, UVM_PAGE_STATUS_DIRTY);
2045 pmap_copy_page(VM_PAGE_TO_PHYS(src), VM_PAGE_TO_PHYS(dst));
2046 }
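
/*
 * Illustrative sketch (not from the original source): copy-on-write style
 * code that has allocated a new page "npg" to replace "opg" copies the
 * contents across; the destination is marked dirty by uvm_pagecopy():
 *
 *	uvm_pagecopy(opg, npg);
 */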
2047
2048 /*
2049 * uvm_pageismanaged: test whether a page (specified by PA) is managed.
2050 */
2051
2052 bool
2053 uvm_pageismanaged(paddr_t pa)
2054 {
2055
2056 return (uvm_physseg_find(atop(pa), NULL) != UVM_PHYSSEG_TYPE_INVALID);
2057 }
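
/*
 * Illustrative sketch (not from the original source): code holding a raw
 * physical address can check that it is backed by a managed page before
 * asking for the corresponding vm_page structure:
 *
 *	if (uvm_pageismanaged(pa))
 *		pg = PHYS_TO_VM_PAGE(pa);
 */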
2058
2059 /*
2060 * uvm_page_lookup_freelist: look up the free list for the specified page
2061 */
2062
2063 int
2064 uvm_page_lookup_freelist(struct vm_page *pg)
2065 {
2066 uvm_physseg_t upm;
2067
2068 upm = uvm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), NULL);
2069 KASSERT(upm != UVM_PHYSSEG_TYPE_INVALID);
2070 return uvm_physseg_get_free_list(upm);
2071 }
2072
2073 /*
2074 * uvm_page_owner_locked_p: return true if object associated with page is
2075 * locked. this is a weak check for runtime assertions only.
2076 */
2077
2078 bool
2079 uvm_page_owner_locked_p(struct vm_page *pg, bool exclusive)
2080 {
2081
2082 if (pg->uobject != NULL) {
2083 return exclusive
2084 ? rw_write_held(pg->uobject->vmobjlock)
2085 : rw_lock_held(pg->uobject->vmobjlock);
2086 }
2087 if (pg->uanon != NULL) {
2088 return exclusive
2089 ? rw_write_held(pg->uanon->an_lock)
2090 : rw_lock_held(pg->uanon->an_lock);
2091 }
2092 return true;
2093 }
2094
2095 /*
2096 * uvm_pagereadonly_p: return true if the page should be mapped read-only
2097 */
2098
2099 bool
2100 uvm_pagereadonly_p(struct vm_page *pg)
2101 {
2102 struct uvm_object * const uobj = pg->uobject;
2103
2104 KASSERT(uobj == NULL || rw_lock_held(uobj->vmobjlock));
2105 KASSERT(uobj != NULL || rw_lock_held(pg->uanon->an_lock));
2106 if ((pg->flags & PG_RDONLY) != 0) {
2107 return true;
2108 }
2109 if (uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_CLEAN) {
2110 return true;
2111 }
2112 if (uobj == NULL) {
2113 return false;
2114 }
2115 return UVM_OBJ_NEEDS_WRITEFAULT(uobj);
2116 }
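
/*
 * Illustrative sketch (not from the original source): a fault handler style
 * caller could use the result to strip write permission before entering a
 * mapping for the page:
 *
 *	if (uvm_pagereadonly_p(pg))
 *		prot &= ~VM_PROT_WRITE;
 *	error = pmap_enter(pmap, va, VM_PAGE_TO_PHYS(pg), prot, flags);
 */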
2117
2118 #ifdef PMAP_DIRECT
2119 /*
2120 * Call pmap to translate a physical address into a virtual one and run a
2121 * callback on it.  Used to avoid actually mapping the pages; the pmap most
2122 * likely uses a direct map or equivalent.
2123 */
2124 int
2125 uvm_direct_process(struct vm_page **pgs, u_int npages, voff_t off, vsize_t len,
2126 int (*process)(void *, size_t, void *), void *arg)
2127 {
2128 int error = 0;
2129 paddr_t pa;
2130 size_t todo;
2131 voff_t pgoff = (off & PAGE_MASK);
2132 struct vm_page *pg;
2133
2134 KASSERT(npages > 0 && len > 0);
2135
2136 for (int i = 0; i < npages; i++) {
2137 pg = pgs[i];
2138
2139 KASSERT(len > 0);
2140
2141 /*
2142 * Caller is responsible for ensuring all the pages are
2143 * available.
2144 */
2145 KASSERT(pg != NULL && pg != PGO_DONTCARE);
2146
2147 pa = VM_PAGE_TO_PHYS(pg);
2148 todo = MIN(len, PAGE_SIZE - pgoff);
2149
2150 error = pmap_direct_process(pa, pgoff, todo, process, arg);
2151 if (error)
2152 break;
2153
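		/* Only the first page may start at a non-zero offset. */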
2154 pgoff = 0;
2155 len -= todo;
2156 }
2157
2158 KASSERTMSG(error != 0 || len == 0, "len %lu != 0 for non-error", len);
2159 return error;
2160 }
2161 #endif /* PMAP_DIRECT */
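
/*
 * Illustrative sketch (not from the original source), only meaningful where
 * PMAP_DIRECT is defined: a caller could drain a run of pages into a buffer
 * without mapping them, using a callback matching the "process" signature
 * above.  "copy_out_cb" and its cursor argument are hypothetical names used
 * only for this example:
 *
 *	static int
 *	copy_out_cb(void *data, size_t len, void *arg)
 *	{
 *		uint8_t **cursor = arg;
 *
 *		memcpy(*cursor, data, len);
 *		*cursor += len;
 *		return 0;
 *	}
 *
 *	...
 *	uint8_t *cursor = buf;
 *	error = uvm_direct_process(pgs, npages, off, len, copy_out_cb, &cursor);
 */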
2162
2163 #if defined(DDB) || defined(DEBUGPRINT)
2164
2165 /*
2166 * uvm_page_printit: actually print the page
2167 */
2168
2169 static const char page_flagbits[] = UVM_PGFLAGBITS;
2170 static const char page_pqflagbits[] = UVM_PQFLAGBITS;
2171
2172 void
2173 uvm_page_printit(struct vm_page *pg, bool full,
2174 void (*pr)(const char *, ...))
2175 {
2176 struct vm_page *tpg;
2177 struct uvm_object *uobj;
2178 struct pgflbucket *pgb;
2179 struct pgflist *pgl;
2180 char pgbuf[128];
2181
2182 (*pr)("PAGE %p:\n", pg);
2183 snprintb(pgbuf, sizeof(pgbuf), page_flagbits, pg->flags);
2184 (*pr)(" flags=%s\n", pgbuf);
2185 snprintb(pgbuf, sizeof(pgbuf), page_pqflagbits, pg->pqflags);
2186 (*pr)(" pqflags=%s\n", pgbuf);
2187 (*pr)(" uobject=%p, uanon=%p, offset=0x%llx\n",
2188 pg->uobject, pg->uanon, (long long)pg->offset);
2189 (*pr)(" loan_count=%d wire_count=%d bucket=%d freelist=%d\n",
2190 pg->loan_count, pg->wire_count, uvm_page_get_bucket(pg),
2191 uvm_page_get_freelist(pg));
2192 (*pr)(" pa=0x%lx\n", (long)VM_PAGE_TO_PHYS(pg));
2193 #if defined(UVM_PAGE_TRKOWN)
2194 if (pg->flags & PG_BUSY)
2195 (*pr)(" owning process = %d.%d, tag=%s\n",
2196 pg->owner, pg->lowner, pg->owner_tag);
2197 else
2198 (*pr)(" page not busy, no owner\n");
2199 #else
2200 (*pr)(" [page ownership tracking disabled]\n");
2201 #endif
2202
2203 if (!full)
2204 return;
2205
2206 /* cross-verify object/anon */
2207 if ((pg->flags & PG_FREE) == 0) {
2208 if (pg->flags & PG_ANON) {
2209 if (pg->uanon == NULL || pg->uanon->an_page != pg)
2210 (*pr)(" >>> ANON DOES NOT POINT HERE <<< (%p)\n",
2211 (pg->uanon) ? pg->uanon->an_page : NULL);
2212 else
2213 (*pr)(" anon backpointer is OK\n");
2214 } else {
2215 uobj = pg->uobject;
2216 if (uobj) {
2217 (*pr)(" checking object list\n");
2218 tpg = uvm_pagelookup(uobj, pg->offset);
2219 if (tpg)
2220 (*pr)(" page found on object list\n");
2221 else
2222 (*pr)(" >>> PAGE NOT FOUND ON OBJECT LIST! <<<\n");
2223 }
2224 }
2225 }
2226
2227 /* cross-verify page queue */
2228 if (pg->flags & PG_FREE) {
2229 int fl = uvm_page_get_freelist(pg);
2230 int b = uvm_page_get_bucket(pg);
2231 pgb = uvm.page_free[fl].pgfl_buckets[b];
2232 pgl = &pgb->pgb_colors[VM_PGCOLOR(pg)];
2233 (*pr)(" checking pageq list\n");
2234 LIST_FOREACH(tpg, pgl, pageq.list) {
2235 if (tpg == pg) {
2236 break;
2237 }
2238 }
2239 if (tpg)
2240 (*pr)(" page found on pageq list\n");
2241 else
2242 (*pr)(" >>> PAGE NOT FOUND ON PAGEQ LIST! <<<\n");
2243 }
2244 }
2245
2246 /*
2247 * uvm_page_printall - print a summary of all managed pages
2248 */
2249
2250 void
2251 uvm_page_printall(void (*pr)(const char *, ...))
2252 {
2253 uvm_physseg_t i;
2254 paddr_t pfn;
2255 struct vm_page *pg;
2256
2257 (*pr)("%18s %4s %4s %18s %18s"
2258 #ifdef UVM_PAGE_TRKOWN
2259 " OWNER"
2260 #endif
2261 "\n", "PAGE", "FLAG", "PQ", "UOBJECT", "UANON");
2262 for (i = uvm_physseg_get_first();
2263 uvm_physseg_valid_p(i);
2264 i = uvm_physseg_get_next(i)) {
2265 for (pfn = uvm_physseg_get_start(i);
2266 pfn < uvm_physseg_get_end(i);
2267 pfn++) {
2268 pg = PHYS_TO_VM_PAGE(ptoa(pfn));
2269
2270 (*pr)("%18p %04x %08x %18p %18p",
2271 pg, pg->flags, pg->pqflags, pg->uobject,
2272 pg->uanon);
2273 #ifdef UVM_PAGE_TRKOWN
2274 if (pg->flags & PG_BUSY)
2275 (*pr)(" %d [%s]", pg->owner, pg->owner_tag);
2276 #endif
2277 (*pr)("\n");
2278 }
2279 }
2280 }
2281
2282 /*
2283 * uvm_page_print_freelists - print a summary of the freelists
2284 */
2285
2286 void
2287 uvm_page_print_freelists(void (*pr)(const char *, ...))
2288 {
2289 struct pgfreelist *pgfl;
2290 struct pgflbucket *pgb;
2291 int fl, b, c;
2292
2293 (*pr)("There are %d freelists with %d buckets of %d colors.\n\n",
2294 VM_NFREELIST, uvm.bucketcount, uvmexp.ncolors);
2295
2296 for (fl = 0; fl < VM_NFREELIST; fl++) {
2297 pgfl = &uvm.page_free[fl];
2298 (*pr)("freelist(%d) @ %p\n", fl, pgfl);
2299 for (b = 0; b < uvm.bucketcount; b++) {
2300 pgb = uvm.page_free[fl].pgfl_buckets[b];
2301 (*pr)(" bucket(%d) @ %p, nfree = %d, lock @ %p:\n",
2302 b, pgb, pgb->pgb_nfree,
2303 &uvm_freelist_locks[b].lock);
2304 for (c = 0; c < uvmexp.ncolors; c++) {
2305 (*pr)(" color(%d) @ %p, ", c,
2306 &pgb->pgb_colors[c]);
2307 (*pr)("first page = %p\n",
2308 LIST_FIRST(&pgb->pgb_colors[c]));
2309 }
2310 }
2311 }
2312 }
2313
2314 #endif /* DDB || DEBUGPRINT */
2315