1 /* $NetBSD: uvm_page.c,v 1.241 2020/06/13 19:55:39 ad Exp $ */
2
3 /*-
4 * Copyright (c) 2019, 2020 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * Copyright (c) 1997 Charles D. Cranor and Washington University.
34 * Copyright (c) 1991, 1993, The Regents of the University of California.
35 *
36 * All rights reserved.
37 *
38 * This code is derived from software contributed to Berkeley by
39 * The Mach Operating System project at Carnegie-Mellon University.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 * 1. Redistributions of source code must retain the above copyright
45 * notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright
47 * notice, this list of conditions and the following disclaimer in the
48 * documentation and/or other materials provided with the distribution.
49 * 3. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vm_page.c 8.3 (Berkeley) 3/21/94
66 * from: Id: uvm_page.c,v 1.1.2.18 1998/02/06 05:24:42 chs Exp
67 *
68 *
69 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
70 * All rights reserved.
71 *
72 * Permission to use, copy, modify and distribute this software and
73 * its documentation is hereby granted, provided that both the copyright
74 * notice and this permission notice appear in all copies of the
75 * software, derivative works or modified versions, and any portions
76 * thereof, and that both notices appear in supporting documentation.
77 *
78 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
79 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
80 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
81 *
82 * Carnegie Mellon requests users of this software to return to
83 *
84 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
85 * School of Computer Science
86 * Carnegie Mellon University
87 * Pittsburgh PA 15213-3890
88 *
89 * any improvements or extensions that they make and grant Carnegie the
90 * rights to redistribute these changes.
91 */
92
93 /*
94 * uvm_page.c: page ops.
95 */
96
97 #include <sys/cdefs.h>
98 __KERNEL_RCSID(0, "$NetBSD: uvm_page.c,v 1.241 2020/06/13 19:55:39 ad Exp $");
99
100 #include "opt_ddb.h"
101 #include "opt_uvm.h"
102 #include "opt_uvmhist.h"
103 #include "opt_readahead.h"
104
105 #include <sys/param.h>
106 #include <sys/systm.h>
107 #include <sys/sched.h>
108 #include <sys/kernel.h>
109 #include <sys/vnode.h>
110 #include <sys/proc.h>
111 #include <sys/radixtree.h>
112 #include <sys/atomic.h>
113 #include <sys/cpu.h>
114 #include <sys/extent.h>
115
116 #include <uvm/uvm.h>
117 #include <uvm/uvm_ddb.h>
118 #include <uvm/uvm_pdpolicy.h>
119 #include <uvm/uvm_pgflcache.h>
120
121 /*
122 * Some supported CPUs in a given architecture don't support all
123 * of the things necessary to do idle page zero'ing efficiently.
124 * We therefore provide a way to enable it from machdep code here.
125 */
126 bool vm_page_zero_enable = false;
127
128 /*
129 * number of pages per-CPU to reserve for the kernel.
130 */
131 #ifndef UVM_RESERVED_PAGES_PER_CPU
132 #define UVM_RESERVED_PAGES_PER_CPU 5
133 #endif
134 int vm_page_reserve_kernel = UVM_RESERVED_PAGES_PER_CPU;
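/*
 * Worked example (illustrative): the kernel reserve scales with the CPU
 * count, because uvm_cpu_attach() below adds vm_page_reserve_kernel for
 * every CPU beyond the first.  With the default of 5 pages per CPU, an
 * 8-CPU machine ends up with uvmexp.reserve_kernel == 40 pages.
 */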
135
136 /*
137 * physical memory size.
138 */
139 psize_t physmem;
140
141 /*
142 * local variables
143 */
144
145 /*
146 * these variables record the values returned by vm_page_bootstrap,
147 * for debugging purposes. The implementation of uvm_pageboot_alloc
148 * and pmap_startup here also uses them internally.
149 */
150
151 static vaddr_t virtual_space_start;
152 static vaddr_t virtual_space_end;
153
154 /*
155 * we allocate an initial number of page colors in uvm_page_init(),
156 * and remember them. We may re-color pages as cache sizes are
157 * discovered during the autoconfiguration phase. But we can never
158 * free the initial set of buckets, since they are allocated using
159 * uvm_pageboot_alloc().
160 */
161
162 static size_t recolored_pages_memsize /* = 0 */;
163 static char *recolored_pages_mem;
164
165 /*
166 * freelist locks - one per bucket.
167 */
168
169 union uvm_freelist_lock uvm_freelist_locks[PGFL_MAX_BUCKETS]
170 __cacheline_aligned;
171
172 /*
173 * basic NUMA information.
174 */
175
176 static struct uvm_page_numa_region {
177 struct uvm_page_numa_region *next;
178 paddr_t start;
179 paddr_t size;
180 u_int numa_id;
181 } *uvm_page_numa_region;
182
183 #ifdef DEBUG
184 kmutex_t uvm_zerochecklock __cacheline_aligned;
185 vaddr_t uvm_zerocheckkva;
186 #endif /* DEBUG */
187
188 /*
189 * These functions are reserved for uvm(9) internal use and are not
190 * exported in the header file uvm_physseg.h
191 *
192 * Thus they are redefined here.
193 */
194 void uvm_physseg_init_seg(uvm_physseg_t, struct vm_page *);
195 void uvm_physseg_seg_chomp_slab(uvm_physseg_t, struct vm_page *, size_t);
196
197 /* returns a pgs array */
198 struct vm_page *uvm_physseg_seg_alloc_from_slab(uvm_physseg_t, size_t);
199
200 /*
201 * inline functions
202 */
203
204 /*
205 * uvm_pageinsert: insert a page in the object.
206 *
207 * => caller must lock object
208 * => caller should have already set pg's object and offset pointers
209 * and bumped the version counter
210 */
211
212 static inline void
213 uvm_pageinsert_object(struct uvm_object *uobj, struct vm_page *pg)
214 {
215
216 KASSERT(uobj == pg->uobject);
217 KASSERT(rw_write_held(uobj->vmobjlock));
218 KASSERT((pg->flags & PG_TABLED) == 0);
219
220 if ((pg->flags & PG_STAT) != 0) {
221 /* Cannot use uvm_pagegetdirty(): not yet in radix tree. */
222 const unsigned int status = pg->flags & (PG_CLEAN | PG_DIRTY);
223
224 if ((pg->flags & PG_FILE) != 0) {
225 if (uobj->uo_npages == 0) {
226 struct vnode *vp = (struct vnode *)uobj;
227 mutex_enter(vp->v_interlock);
228 KASSERT((vp->v_iflag & VI_PAGES) == 0);
229 vp->v_iflag |= VI_PAGES;
230 vholdl(vp);
231 mutex_exit(vp->v_interlock);
232 }
233 if (UVM_OBJ_IS_VTEXT(uobj)) {
234 cpu_count(CPU_COUNT_EXECPAGES, 1);
235 }
236 cpu_count(CPU_COUNT_FILEUNKNOWN + status, 1);
237 } else {
238 cpu_count(CPU_COUNT_ANONUNKNOWN + status, 1);
239 }
240 }
241 pg->flags |= PG_TABLED;
242 uobj->uo_npages++;
243 }
244
245 static inline int
246 uvm_pageinsert_tree(struct uvm_object *uobj, struct vm_page *pg)
247 {
248 const uint64_t idx = pg->offset >> PAGE_SHIFT;
249 int error;
250
251 error = radix_tree_insert_node(&uobj->uo_pages, idx, pg);
252 if (error != 0) {
253 return error;
254 }
255 if ((pg->flags & PG_CLEAN) == 0) {
256 radix_tree_set_tag(&uobj->uo_pages, idx, UVM_PAGE_DIRTY_TAG);
257 }
258 KASSERT(((pg->flags & PG_CLEAN) == 0) ==
259 radix_tree_get_tag(&uobj->uo_pages, idx, UVM_PAGE_DIRTY_TAG));
260 return 0;
261 }
262
263 /*
264 * uvm_page_remove: remove page from object.
265 *
266 * => caller must lock object
267 */
268
269 static inline void
270 uvm_pageremove_object(struct uvm_object *uobj, struct vm_page *pg)
271 {
272
273 KASSERT(uobj == pg->uobject);
274 KASSERT(rw_write_held(uobj->vmobjlock));
275 KASSERT(pg->flags & PG_TABLED);
276
277 if ((pg->flags & PG_STAT) != 0) {
278 /* Cannot use uvm_pagegetdirty(): no longer in radix tree. */
279 const unsigned int status = pg->flags & (PG_CLEAN | PG_DIRTY);
280
281 if ((pg->flags & PG_FILE) != 0) {
282 if (uobj->uo_npages == 1) {
283 struct vnode *vp = (struct vnode *)uobj;
284 mutex_enter(vp->v_interlock);
285 KASSERT((vp->v_iflag & VI_PAGES) != 0);
286 vp->v_iflag &= ~VI_PAGES;
287 holdrelel(vp);
288 mutex_exit(vp->v_interlock);
289 }
290 if (UVM_OBJ_IS_VTEXT(uobj)) {
291 cpu_count(CPU_COUNT_EXECPAGES, -1);
292 }
293 cpu_count(CPU_COUNT_FILEUNKNOWN + status, -1);
294 } else {
295 cpu_count(CPU_COUNT_ANONUNKNOWN + status, -1);
296 }
297 }
298 uobj->uo_npages--;
299 pg->flags &= ~PG_TABLED;
300 pg->uobject = NULL;
301 }
302
303 static inline void
304 uvm_pageremove_tree(struct uvm_object *uobj, struct vm_page *pg)
305 {
306 struct vm_page *opg __unused;
307
308 opg = radix_tree_remove_node(&uobj->uo_pages, pg->offset >> PAGE_SHIFT);
309 KASSERT(pg == opg);
310 }
311
312 static void
313 uvm_page_init_bucket(struct pgfreelist *pgfl, struct pgflbucket *pgb, int num)
314 {
315 int i;
316
317 pgb->pgb_nfree = 0;
318 for (i = 0; i < uvmexp.ncolors; i++) {
319 LIST_INIT(&pgb->pgb_colors[i]);
320 }
321 pgfl->pgfl_buckets[num] = pgb;
322 }
323
324 /*
325 * uvm_page_init: init the page system. called from uvm_init().
326 *
327 * => we return the range of kernel virtual memory in kvm_startp/kvm_endp
328 */
329
330 void
331 uvm_page_init(vaddr_t *kvm_startp, vaddr_t *kvm_endp)
332 {
333 static struct uvm_cpu boot_cpu __cacheline_aligned;
334 psize_t freepages, pagecount, bucketsize, n;
335 struct pgflbucket *pgb;
336 struct vm_page *pagearray;
337 char *bucketarray;
338 uvm_physseg_t bank;
339 int fl, b;
340
341 KASSERT(ncpu <= 1);
342
343 /*
344 * init the page queues and free page queue locks, except the
345 * free list; we allocate that later (with the initial vm_page
346 * structures).
347 */
348
349 curcpu()->ci_data.cpu_uvm = &boot_cpu;
350 uvmpdpol_init();
351 for (b = 0; b < __arraycount(uvm_freelist_locks); b++) {
352 mutex_init(&uvm_freelist_locks[b].lock, MUTEX_DEFAULT, IPL_VM);
353 }
354
355 /*
356 * allocate vm_page structures.
357 */
358
359 /*
360 * sanity check:
361 * before calling this function the MD code is expected to register
362 * some free RAM with the uvm_page_physload() function. our job
363 * now is to allocate vm_page structures for this memory.
364 */
365
366 if (uvm_physseg_get_last() == UVM_PHYSSEG_TYPE_INVALID)
367 panic("uvm_page_bootstrap: no memory pre-allocated");
368
369 /*
370 * first calculate the number of free pages...
371 *
372 * note that we use start/end rather than avail_start/avail_end.
373 * this allows us to allocate extra vm_page structures in case we
374 * want to return some memory to the pool after booting.
375 */
376
377 freepages = 0;
378
379 for (bank = uvm_physseg_get_first();
380 uvm_physseg_valid_p(bank) ;
381 bank = uvm_physseg_get_next(bank)) {
382 freepages += (uvm_physseg_get_end(bank) - uvm_physseg_get_start(bank));
383 }
384
385 /*
386 * Let MD code initialize the number of colors, or default
387 * to 1 color if MD code doesn't care.
388 */
389 if (uvmexp.ncolors == 0)
390 uvmexp.ncolors = 1;
391 uvmexp.colormask = uvmexp.ncolors - 1;
392 KASSERT((uvmexp.colormask & uvmexp.ncolors) == 0);
393
394 /* We always start with only 1 bucket. */
395 uvm.bucketcount = 1;
396
397 /*
398 * we now know we have (PAGE_SIZE * freepages) bytes of memory we can
399 * use. for each page of memory we use we need a vm_page structure.
400 * thus, the total number of pages we can use is the total size of
401 * the memory divided by the PAGE_SIZE plus the size of the vm_page
402 * structure. we add one to freepages as a fudge factor to avoid
403 * truncation errors (since we can only allocate in terms of whole
404 * pages).
405 */
406 pagecount = ((freepages + 1) << PAGE_SHIFT) /
407 (PAGE_SIZE + sizeof(struct vm_page));
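/*
 * Worked example (illustrative, assuming a 4096-byte page and a vm_page
 * structure of roughly 128 bytes): each usable page costs about
 * PAGE_SIZE + sizeof(struct vm_page) ~= 4224 bytes of raw memory, so
 * pagecount ~= freepages * 4096 / 4224, i.e. roughly 97% of the raw
 * page count, with the remainder consumed by the vm_page array itself.
 */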
408 bucketsize = offsetof(struct pgflbucket, pgb_colors[uvmexp.ncolors]);
409 bucketsize = roundup2(bucketsize, coherency_unit);
410 bucketarray = (void *)uvm_pageboot_alloc(
411 bucketsize * VM_NFREELIST +
412 pagecount * sizeof(struct vm_page));
413 pagearray = (struct vm_page *)
414 (bucketarray + bucketsize * VM_NFREELIST);
415
416 for (fl = 0; fl < VM_NFREELIST; fl++) {
417 pgb = (struct pgflbucket *)(bucketarray + bucketsize * fl);
418 uvm_page_init_bucket(&uvm.page_free[fl], pgb, 0);
419 }
420 memset(pagearray, 0, pagecount * sizeof(struct vm_page));
421
422 /*
423 * init the freelist cache in the disabled state.
424 */
425 uvm_pgflcache_init();
426
427 /*
428 * init the vm_page structures and put them in the correct place.
429 */
430 /* First init the extent */
431
432 for (bank = uvm_physseg_get_first(),
433 uvm_physseg_seg_chomp_slab(bank, pagearray, pagecount);
434 uvm_physseg_valid_p(bank);
435 bank = uvm_physseg_get_next(bank)) {
436
437 n = uvm_physseg_get_end(bank) - uvm_physseg_get_start(bank);
438 uvm_physseg_seg_alloc_from_slab(bank, n);
439 uvm_physseg_init_seg(bank, pagearray);
440
441 /* set up page array pointers */
442 pagearray += n;
443 pagecount -= n;
444 }
445
446 /*
447 * pass up the values of virtual_space_start and
448 * virtual_space_end (obtained by uvm_pageboot_alloc) to the upper
449 * layers of the VM.
450 */
451
452 *kvm_startp = round_page(virtual_space_start);
453 *kvm_endp = trunc_page(virtual_space_end);
454 #ifdef DEBUG
455 /*
456 * steal kva for uvm_pagezerocheck().
457 */
458 uvm_zerocheckkva = *kvm_startp;
459 *kvm_startp += PAGE_SIZE;
460 mutex_init(&uvm_zerochecklock, MUTEX_DEFAULT, IPL_VM);
461 #endif /* DEBUG */
462
463 /*
464 * init various thresholds.
465 */
466
467 uvmexp.reserve_pagedaemon = 1;
468 uvmexp.reserve_kernel = vm_page_reserve_kernel;
469
470 /*
471 * done!
472 */
473
474 uvm.page_init_done = true;
475 }
476
477 /*
478 * uvm_pgfl_lock: lock all freelist buckets
479 */
480
481 void
482 uvm_pgfl_lock(void)
483 {
484 int i;
485
486 for (i = 0; i < __arraycount(uvm_freelist_locks); i++) {
487 mutex_spin_enter(&uvm_freelist_locks[i].lock);
488 }
489 }
490
491 /*
492 * uvm_pgfl_unlock: unlock all freelist buckets
493 */
494
495 void
496 uvm_pgfl_unlock(void)
497 {
498 int i;
499
500 for (i = 0; i < __arraycount(uvm_freelist_locks); i++) {
501 mutex_spin_exit(&uvm_freelist_locks[i].lock);
502 }
503 }
504
505 /*
506 * uvm_setpagesize: set the page size
507 *
508 * => sets page_shift and page_mask from uvmexp.pagesize.
509 */
510
511 void
512 uvm_setpagesize(void)
513 {
514
515 /*
516 * If uvmexp.pagesize is 0 at this point, we expect PAGE_SIZE
517 * to be a constant (indicated by being a non-zero value).
518 */
519 if (uvmexp.pagesize == 0) {
520 if (PAGE_SIZE == 0)
521 panic("uvm_setpagesize: uvmexp.pagesize not set");
522 uvmexp.pagesize = PAGE_SIZE;
523 }
524 uvmexp.pagemask = uvmexp.pagesize - 1;
525 if ((uvmexp.pagemask & uvmexp.pagesize) != 0)
526 panic("uvm_setpagesize: page size %u (%#x) not a power of two",
527 uvmexp.pagesize, uvmexp.pagesize);
528 for (uvmexp.pageshift = 0; ; uvmexp.pageshift++)
529 if ((1 << uvmexp.pageshift) == uvmexp.pagesize)
530 break;
531 }
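/*
 * Example (illustrative): with uvmexp.pagesize == 4096 the loop above
 * leaves uvmexp.pageshift == 12 and uvmexp.pagemask == 0xfff, so that
 * "addr & ~uvmexp.pagemask" truncates an address to a page boundary and
 * "addr >> uvmexp.pageshift" converts it to a page number.
 */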
532
533 /*
534 * uvm_pageboot_alloc: steal memory from physmem for bootstrapping
535 */
536
537 vaddr_t
538 uvm_pageboot_alloc(vsize_t size)
539 {
540 static bool initialized = false;
541 vaddr_t addr;
542 #if !defined(PMAP_STEAL_MEMORY)
543 vaddr_t vaddr;
544 paddr_t paddr;
545 #endif
546
547 /*
548 * on first call to this function, initialize ourselves.
549 */
550 if (initialized == false) {
551 pmap_virtual_space(&virtual_space_start, &virtual_space_end);
552
553 /* round it the way we like it */
554 virtual_space_start = round_page(virtual_space_start);
555 virtual_space_end = trunc_page(virtual_space_end);
556
557 initialized = true;
558 }
559
560 /* round to page size */
561 size = round_page(size);
562 uvmexp.bootpages += atop(size);
563
564 #if defined(PMAP_STEAL_MEMORY)
565
566 /*
567 * defer bootstrap allocation to MD code (it may want to allocate
568 * from a direct-mapped segment). pmap_steal_memory should adjust
569 * virtual_space_start/virtual_space_end if necessary.
570 */
571
572 addr = pmap_steal_memory(size, &virtual_space_start,
573 &virtual_space_end);
574
575 return(addr);
576
577 #else /* !PMAP_STEAL_MEMORY */
578
579 /*
580 * allocate virtual memory for this request
581 */
582 if (virtual_space_start == virtual_space_end ||
583 (virtual_space_end - virtual_space_start) < size)
584 panic("uvm_pageboot_alloc: out of virtual space");
585
586 addr = virtual_space_start;
587
588 #ifdef PMAP_GROWKERNEL
589 /*
590 * If the kernel pmap can't map the requested space,
591 * then allocate more resources for it.
592 */
593 if (uvm_maxkaddr < (addr + size)) {
594 uvm_maxkaddr = pmap_growkernel(addr + size);
595 if (uvm_maxkaddr < (addr + size))
596 panic("uvm_pageboot_alloc: pmap_growkernel() failed");
597 }
598 #endif
599
600 virtual_space_start += size;
601
602 /*
603 * allocate and mapin physical pages to back new virtual pages
604 */
605
606 for (vaddr = round_page(addr) ; vaddr < addr + size ;
607 vaddr += PAGE_SIZE) {
608
609 if (!uvm_page_physget(&paddr))
610 panic("uvm_pageboot_alloc: out of memory");
611
612 /*
613 * Note this memory is no longer managed, so using
614 * pmap_kenter is safe.
615 */
616 pmap_kenter_pa(vaddr, paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
617 }
618 pmap_update(pmap_kernel());
619 return(addr);
620 #endif /* PMAP_STEAL_MEMORY */
621 }
622
623 #if !defined(PMAP_STEAL_MEMORY)
624 /*
625 * uvm_page_physget: "steal" one page from the vm_physmem structure.
626 *
627 * => attempt to allocate it off the end of a segment in which the "avail"
628 * values match the start/end values. if we can't do that, then we
629 * will advance both values (making them equal, and removing some
630 * vm_page structures from the non-avail area).
631 * => return false if out of memory.
632 */
633
634 /* subroutine: try to allocate from memory chunks on the specified freelist */
635 static bool uvm_page_physget_freelist(paddr_t *, int);
636
637 static bool
638 uvm_page_physget_freelist(paddr_t *paddrp, int freelist)
639 {
640 uvm_physseg_t lcv;
641
642 /* pass 1: try allocating from a matching end */
643 #if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
644 for (lcv = uvm_physseg_get_last(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_prev(lcv))
645 #else
646 for (lcv = uvm_physseg_get_first(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_next(lcv))
647 #endif
648 {
649 if (uvm.page_init_done == true)
650 panic("uvm_page_physget: called _after_ bootstrap");
651
652 /* Try to match at front or back on unused segment */
653 if (uvm_page_physunload(lcv, freelist, paddrp))
654 return true;
655 }
656
657 /* pass2: forget about matching ends, just allocate something */
658 #if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
659 for (lcv = uvm_physseg_get_last(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_prev(lcv))
660 #else
661 for (lcv = uvm_physseg_get_first(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_next(lcv))
662 #endif
663 {
664 /* Try the front regardless. */
665 if (uvm_page_physunload_force(lcv, freelist, paddrp))
666 return true;
667 }
668 return false;
669 }
670
671 bool
672 uvm_page_physget(paddr_t *paddrp)
673 {
674 int i;
675
676 /* try in the order of freelist preference */
677 for (i = 0; i < VM_NFREELIST; i++)
678 if (uvm_page_physget_freelist(paddrp, i) == true)
679 return (true);
680 return (false);
681 }
682 #endif /* PMAP_STEAL_MEMORY */
683
684 /*
685 * PHYS_TO_VM_PAGE: find vm_page for a PA. used by MI code to get vm_pages
686 * back from an I/O mapping (ugh!). used in some MD code as well.
687 */
688 struct vm_page *
689 uvm_phys_to_vm_page(paddr_t pa)
690 {
691 paddr_t pf = atop(pa);
692 paddr_t off;
693 uvm_physseg_t upm;
694
695 upm = uvm_physseg_find(pf, &off);
696 if (upm != UVM_PHYSSEG_TYPE_INVALID)
697 return uvm_physseg_get_pg(upm, off);
698 return(NULL);
699 }
700
701 paddr_t
702 uvm_vm_page_to_phys(const struct vm_page *pg)
703 {
704
705 return pg->phys_addr & ~(PAGE_SIZE - 1);
706 }
707
708 /*
709 * uvm_page_numa_load: load NUMA range description.
710 */
711 void
712 uvm_page_numa_load(paddr_t start, paddr_t size, u_int numa_id)
713 {
714 struct uvm_page_numa_region *d;
715
716 KASSERT(numa_id < PGFL_MAX_BUCKETS);
717
718 d = kmem_alloc(sizeof(*d), KM_SLEEP);
719 d->start = start;
720 d->size = size;
721 d->numa_id = numa_id;
722 d->next = uvm_page_numa_region;
723 uvm_page_numa_region = d;
724 }
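/*
 * Illustrative usage (hypothetical addresses and node IDs): machine
 * dependent code describing two 2GB nodes might do
 *
 *	uvm_page_numa_load(0x000000000, 0x080000000, 0);
 *	uvm_page_numa_load(0x080000000, 0x080000000, 1);
 *
 * before uvm_page_rebucket() runs, so that uvm_page_numa_lookup() below
 * can map each page to its node.
 */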
725
726 /*
727 * uvm_page_numa_lookup: lookup NUMA node for the given page.
728 */
729 static u_int
730 uvm_page_numa_lookup(struct vm_page *pg)
731 {
732 struct uvm_page_numa_region *d;
733 static bool warned;
734 paddr_t pa;
735
736 KASSERT(uvm_page_numa_region != NULL);
737
738 pa = VM_PAGE_TO_PHYS(pg);
739 for (d = uvm_page_numa_region; d != NULL; d = d->next) {
740 if (pa >= d->start && pa < d->start + d->size) {
741 return d->numa_id;
742 }
743 }
744
745 if (!warned) {
746 printf("uvm_page_numa_lookup: failed, first pg=%p pa=%#"
747 PRIxPADDR "\n", pg, VM_PAGE_TO_PHYS(pg));
748 warned = true;
749 }
750
751 return 0;
752 }
753
754 /*
755 * uvm_page_redim: adjust freelist dimensions if they have changed.
756 */
757
758 static void
759 uvm_page_redim(int newncolors, int newnbuckets)
760 {
761 struct pgfreelist npgfl;
762 struct pgflbucket *opgb, *npgb;
763 struct pgflist *ohead, *nhead;
764 struct vm_page *pg;
765 size_t bucketsize, bucketmemsize, oldbucketmemsize;
766 int fl, ob, oc, nb, nc, obuckets, ocolors;
767 char *bucketarray, *oldbucketmem, *bucketmem;
768
769 KASSERT(((newncolors - 1) & newncolors) == 0);
770
771 /* Anything to do? */
772 if (newncolors <= uvmexp.ncolors &&
773 newnbuckets == uvm.bucketcount) {
774 return;
775 }
776 if (uvm.page_init_done == false) {
777 uvmexp.ncolors = newncolors;
778 return;
779 }
780
781 bucketsize = offsetof(struct pgflbucket, pgb_colors[newncolors]);
782 bucketsize = roundup2(bucketsize, coherency_unit);
783 bucketmemsize = bucketsize * newnbuckets * VM_NFREELIST +
784 coherency_unit - 1;
785 bucketmem = kmem_zalloc(bucketmemsize, KM_SLEEP);
786 bucketarray = (char *)roundup2((uintptr_t)bucketmem, coherency_unit);
787
788 ocolors = uvmexp.ncolors;
789 obuckets = uvm.bucketcount;
790
791 /* Freelist cache mustn't be enabled. */
792 uvm_pgflcache_pause();
793
794 /* Make sure we should still do this. */
795 uvm_pgfl_lock();
796 if (newncolors <= uvmexp.ncolors &&
797 newnbuckets == uvm.bucketcount) {
798 uvm_pgfl_unlock();
799 uvm_pgflcache_resume();
800 kmem_free(bucketmem, bucketmemsize);
801 return;
802 }
803
804 uvmexp.ncolors = newncolors;
805 uvmexp.colormask = uvmexp.ncolors - 1;
806 uvm.bucketcount = newnbuckets;
807
808 for (fl = 0; fl < VM_NFREELIST; fl++) {
809 /* Init new buckets in new freelist. */
810 memset(&npgfl, 0, sizeof(npgfl));
811 for (nb = 0; nb < newnbuckets; nb++) {
812 npgb = (struct pgflbucket *)bucketarray;
813 uvm_page_init_bucket(&npgfl, npgb, nb);
814 bucketarray += bucketsize;
815 }
816 /* Now transfer pages from the old freelist. */
817 for (nb = ob = 0; ob < obuckets; ob++) {
818 opgb = uvm.page_free[fl].pgfl_buckets[ob];
819 for (oc = 0; oc < ocolors; oc++) {
820 ohead = &opgb->pgb_colors[oc];
821 while ((pg = LIST_FIRST(ohead)) != NULL) {
822 LIST_REMOVE(pg, pageq.list);
823 /*
824 * Here we decide on the NEW color &
825 * bucket for the page. For NUMA
826 * we'll use the info that the
827 * hardware gave us. For non-NUMA
828 * we take the physical page frame
829 * number and cache color into
830 * account. We do this to try and
831 * avoid defeating any memory
832 * interleaving in the hardware.
833 */
834 KASSERT(
835 uvm_page_get_bucket(pg) == ob);
836 KASSERT(fl ==
837 uvm_page_get_freelist(pg));
838 if (uvm_page_numa_region != NULL) {
839 nb = uvm_page_numa_lookup(pg);
840 } else {
841 nb = atop(VM_PAGE_TO_PHYS(pg))
842 / uvmexp.ncolors / 8
843 % newnbuckets;
844 }
845 uvm_page_set_bucket(pg, nb);
846 npgb = npgfl.pgfl_buckets[nb];
847 npgb->pgb_nfree++;
848 nc = VM_PGCOLOR(pg);
849 nhead = &npgb->pgb_colors[nc];
850 LIST_INSERT_HEAD(nhead, pg, pageq.list);
851 }
852 }
853 }
854 /* Install the new freelist. */
855 memcpy(&uvm.page_free[fl], &npgfl, sizeof(npgfl));
856 }
857
858 /* Unlock and free the old memory. */
859 oldbucketmemsize = recolored_pages_memsize;
860 oldbucketmem = recolored_pages_mem;
861 recolored_pages_memsize = bucketmemsize;
862 recolored_pages_mem = bucketmem;
863
864 uvm_pgfl_unlock();
865 uvm_pgflcache_resume();
866
867 if (oldbucketmemsize) {
868 kmem_free(oldbucketmem, oldbucketmemsize);
869 }
870
871 /*
872 * this calls uvm_km_alloc() which may want to hold
873 * uvm_freelist_lock.
874 */
875 uvm_pager_realloc_emerg();
876 }
877
878 /*
879 * uvm_page_recolor: Recolor the pages if the new color count is
880 * larger than the old one.
881 */
882
883 void
884 uvm_page_recolor(int newncolors)
885 {
886
887 uvm_page_redim(newncolors, uvm.bucketcount);
888 }
889
890 /*
891 * uvm_page_rebucket: Determine a bucket structure and redim the free
892 * lists to match.
893 */
894
895 void
896 uvm_page_rebucket(void)
897 {
898 u_int min_numa, max_numa, npackage, shift;
899 struct cpu_info *ci, *ci2, *ci3;
900 CPU_INFO_ITERATOR cii;
901
902 /*
903 * If we have more than one NUMA node, and the maximum NUMA node ID
904 * is less than PGFL_MAX_BUCKETS, then we'll use NUMA distribution
905 * for free pages.
906 */
907 min_numa = (u_int)-1;
908 max_numa = 0;
909 for (CPU_INFO_FOREACH(cii, ci)) {
910 if (ci->ci_numa_id < min_numa) {
911 min_numa = ci->ci_numa_id;
912 }
913 if (ci->ci_numa_id > max_numa) {
914 max_numa = ci->ci_numa_id;
915 }
916 }
917 if (min_numa != max_numa && max_numa < PGFL_MAX_BUCKETS) {
918 aprint_debug("UVM: using NUMA allocation scheme\n");
919 for (CPU_INFO_FOREACH(cii, ci)) {
920 ci->ci_data.cpu_uvm->pgflbucket = ci->ci_numa_id;
921 }
922 uvm_page_redim(uvmexp.ncolors, max_numa + 1);
923 return;
924 }
925
926 /*
927 * Otherwise we'll go with a scheme to maximise L2/L3 cache locality
928 * and minimise lock contention. Count the total number of CPU
929 * packages, and then try to distribute the buckets among CPU
930 * packages evenly.
931 */
932 npackage = curcpu()->ci_nsibling[CPUREL_PACKAGE1ST];
933
934 /*
935 * Figure out how to arrange the packages & buckets, and the total
936 * number of buckets we need. XXX 2 may not be the best factor.
937 */
938 for (shift = 0; npackage > PGFL_MAX_BUCKETS; shift++) {
939 npackage >>= 1;
940 }
941 uvm_page_redim(uvmexp.ncolors, npackage);
942
943 /*
944 * Now tell each CPU which bucket to use. In the outer loop, scroll
945 * through all CPU packages.
946 */
947 npackage = 0;
948 ci = curcpu();
949 ci2 = ci->ci_sibling[CPUREL_PACKAGE1ST];
950 do {
951 /*
952 * In the inner loop, scroll through all CPUs in the package
953 * and assign the same bucket ID.
954 */
955 ci3 = ci2;
956 do {
957 ci3->ci_data.cpu_uvm->pgflbucket = npackage >> shift;
958 ci3 = ci3->ci_sibling[CPUREL_PACKAGE];
959 } while (ci3 != ci2);
960 npackage++;
961 ci2 = ci2->ci_sibling[CPUREL_PACKAGE1ST];
962 } while (ci2 != ci->ci_sibling[CPUREL_PACKAGE1ST]);
963
964 aprint_debug("UVM: using package allocation scheme, "
965 "%d package(s) per bucket\n", 1 << shift);
966 }
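/*
 * Worked example of the arithmetic above (illustrative, assuming a
 * bucket limit of 8): with 16 CPU packages the loop halves npackage
 * once, giving shift == 1 and 8 buckets; package N is then assigned
 * bucket N >> 1, so 1 << shift == 2 packages share each bucket, as
 * reported by the aprint_debug() above.
 */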
967
968 /*
969 * uvm_cpu_attach: initialize per-CPU data structures.
970 */
971
972 void
973 uvm_cpu_attach(struct cpu_info *ci)
974 {
975 struct uvm_cpu *ucpu;
976
977 /* Already done in uvm_page_init(). */
978 if (!CPU_IS_PRIMARY(ci)) {
979 /* Add more reserve pages for this CPU. */
980 uvmexp.reserve_kernel += vm_page_reserve_kernel;
981
982 /* Allocate per-CPU data structures. */
983 ucpu = kmem_zalloc(sizeof(struct uvm_cpu) + coherency_unit - 1,
984 KM_SLEEP);
985 ucpu = (struct uvm_cpu *)roundup2((uintptr_t)ucpu,
986 coherency_unit);
987 ci->ci_data.cpu_uvm = ucpu;
988 } else {
989 ucpu = ci->ci_data.cpu_uvm;
990 }
991
992 uvmpdpol_init_cpu(ucpu);
993
994 /*
995 * Attach RNG source for this CPU's VM events
996 */
997 rnd_attach_source(&ucpu->rs, ci->ci_data.cpu_name, RND_TYPE_VM,
998 RND_FLAG_COLLECT_TIME|RND_FLAG_COLLECT_VALUE|
999 RND_FLAG_ESTIMATE_VALUE);
1000 }
1001
1002 /*
1003 * uvm_availmem: fetch the total amount of free memory in pages. this can
1004 * have a detrimental effect on performance due to false sharing; don't call
1005 * unless needed.
1006 *
1007 * some users can request the amount of free memory so often that it begins
1008 * to impact upon performance. if calling frequently and an inexact value
1009 * is okay, call with cached = true.
1010 */
1011
1012 int
1013 uvm_availmem(bool cached)
1014 {
1015 int64_t fp;
1016
1017 cpu_count_sync(cached);
1018 if ((fp = cpu_count_get(CPU_COUNT_FREEPAGES)) < 0) {
1019 /*
1020 * XXXAD could briefly go negative because it's impossible
1021 * to get a clean snapshot. address this for other counters
1022 * used as running totals before NetBSD 10 although less
1023 * important for those.
1024 */
1025 fp = 0;
1026 }
1027 return (int)fp;
1028 }
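/*
 * Illustrative usage: a caller that only reports statistics can
 * tolerate a slightly stale value and should prefer the cheap form,
 * e.g. "freepg = uvm_availmem(true);"; code about to make a paging
 * decision would pass false to force a counter sync first.
 */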
1029
1030 /*
1031 * uvm_pagealloc_pgb: helper routine that tries to allocate any color from a
1032 * specific freelist and specific bucket only.
1033 *
1034 * => must be at IPL_VM or higher to protect per-CPU data structures.
1035 */
1036
1037 static struct vm_page *
1038 uvm_pagealloc_pgb(struct uvm_cpu *ucpu, int f, int b, int *trycolorp, int flags)
1039 {
1040 int c, trycolor, colormask;
1041 struct pgflbucket *pgb;
1042 struct vm_page *pg;
1043 kmutex_t *lock;
1044 bool fill;
1045
1046 /*
1047 * Skip the bucket if empty, no lock needed. There could be many
1048 * empty freelists/buckets.
1049 */
1050 pgb = uvm.page_free[f].pgfl_buckets[b];
1051 if (pgb->pgb_nfree == 0) {
1052 return NULL;
1053 }
1054
1055 /* Skip bucket if low on memory. */
1056 lock = &uvm_freelist_locks[b].lock;
1057 mutex_spin_enter(lock);
1058 if (__predict_false(pgb->pgb_nfree <= uvmexp.reserve_kernel)) {
1059 if ((flags & UVM_PGA_USERESERVE) == 0 ||
1060 (pgb->pgb_nfree <= uvmexp.reserve_pagedaemon &&
1061 curlwp != uvm.pagedaemon_lwp)) {
1062 mutex_spin_exit(lock);
1063 return NULL;
1064 }
1065 fill = false;
1066 } else {
1067 fill = true;
1068 }
1069
1070 /* Try all page colors as needed. */
1071 c = trycolor = *trycolorp;
1072 colormask = uvmexp.colormask;
1073 do {
1074 pg = LIST_FIRST(&pgb->pgb_colors[c]);
1075 if (__predict_true(pg != NULL)) {
1076 /*
1077 * Got a free page! PG_FREE must be cleared under
1078 * lock because of uvm_pglistalloc().
1079 */
1080 LIST_REMOVE(pg, pageq.list);
1081 KASSERT(pg->flags & PG_FREE);
1082 pg->flags &= PG_ZERO;
1083 pgb->pgb_nfree--;
1084
1085 /*
1086 * While we have the bucket locked and our data
1087 * structures fresh in L1 cache, we have an ideal
1088 * opportunity to grab some pages for the freelist
1089 * cache without causing extra contention. Only do
1090 * so if we found pages in this CPU's preferred
1091 * bucket.
1092 */
1093 if (__predict_true(b == ucpu->pgflbucket && fill)) {
1094 uvm_pgflcache_fill(ucpu, f, b, c);
1095 }
1096 mutex_spin_exit(lock);
1097 KASSERT(uvm_page_get_bucket(pg) == b);
1098 CPU_COUNT(c == trycolor ?
1099 CPU_COUNT_COLORHIT : CPU_COUNT_COLORMISS, 1);
1100 CPU_COUNT(CPU_COUNT_CPUMISS, 1);
1101 *trycolorp = c;
1102 return pg;
1103 }
1104 c = (c + 1) & colormask;
1105 } while (c != trycolor);
1106 mutex_spin_exit(lock);
1107
1108 return NULL;
1109 }
1110
1111 /*
1112 * uvm_pagealloc_pgfl: helper routine for uvm_pagealloc_strat that allocates
1113 * any color from any bucket, in a specific freelist.
1114 *
1115 * => must be at IPL_VM or higher to protect per-CPU data structures.
1116 */
1117
1118 static struct vm_page *
1119 uvm_pagealloc_pgfl(struct uvm_cpu *ucpu, int f, int *trycolorp, int flags)
1120 {
1121 int b, trybucket, bucketcount;
1122 struct vm_page *pg;
1123
1124 /* Try for the exact thing in the per-CPU cache. */
1125 if ((pg = uvm_pgflcache_alloc(ucpu, f, *trycolorp)) != NULL) {
1126 CPU_COUNT(CPU_COUNT_CPUHIT, 1);
1127 CPU_COUNT(CPU_COUNT_COLORHIT, 1);
1128 return pg;
1129 }
1130
1131 /* Walk through all buckets, trying our preferred bucket first. */
1132 trybucket = ucpu->pgflbucket;
1133 b = trybucket;
1134 bucketcount = uvm.bucketcount;
1135 do {
1136 pg = uvm_pagealloc_pgb(ucpu, f, b, trycolorp, flags);
1137 if (pg != NULL) {
1138 return pg;
1139 }
1140 b = (b + 1 == bucketcount ? 0 : b + 1);
1141 } while (b != trybucket);
1142
1143 return NULL;
1144 }
1145
1146 /*
1147 * uvm_pagealloc_strat: allocate vm_page from a particular free list.
1148 *
1149 * => return null if no pages free
1150 * => wake up pagedaemon if number of free pages drops below low water mark
1151 * => if obj != NULL, obj must be locked (to put in obj's tree)
1152 * => if anon != NULL, anon must be locked (to put in anon)
1153 * => only one of obj or anon can be non-null
1154 * => caller must activate/deactivate page if it is not wired.
1155 * => free_list is ignored if strat == UVM_PGA_STRAT_NORMAL.
1156 * => policy decision: it is more important to pull a page off of the
1157 * appropriate priority free list than it is to get a zero'd or
1158 * unknown contents page. This is because we live with the
1159 * consequences of a bad free list decision for the entire
1160 * lifetime of the page, e.g. if the page comes from memory that
1161 * is slower to access.
1162 */
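/*
 * Illustrative call (a sketch, assuming the uvm_pagealloc() wrapper
 * macro from uvm_page.h, which supplies UVM_PGA_STRAT_NORMAL): with the
 * object's vmobjlock write-held, a pager might do
 *
 *	pg = uvm_pagealloc(uobj, off, NULL, UVM_PGA_ZERO);
 *	if (pg == NULL) {
 *		(drop the lock, uvm_wait(), re-take the lock and retry)
 *	}
 */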
1163
1164 struct vm_page *
1165 uvm_pagealloc_strat(struct uvm_object *obj, voff_t off, struct vm_anon *anon,
1166 int flags, int strat, int free_list)
1167 {
1168 int zeroit = 0, color;
1169 int lcv, error, s;
1170 struct uvm_cpu *ucpu;
1171 struct vm_page *pg;
1172 lwp_t *l;
1173
1174 KASSERT(obj == NULL || anon == NULL);
1175 KASSERT(anon == NULL || (flags & UVM_FLAG_COLORMATCH) || off == 0);
1176 KASSERT(off == trunc_page(off));
1177 KASSERT(obj == NULL || rw_write_held(obj->vmobjlock));
1178 KASSERT(anon == NULL || anon->an_lock == NULL ||
1179 rw_write_held(anon->an_lock));
1180
1181 /*
1182 * This implements a global round-robin page coloring
1183 * algorithm.
1184 */
1185
1186 s = splvm();
1187 ucpu = curcpu()->ci_data.cpu_uvm;
1188 if (flags & UVM_FLAG_COLORMATCH) {
1189 color = atop(off) & uvmexp.colormask;
1190 } else {
1191 color = ucpu->pgflcolor;
1192 }
1193
1194 /*
1195 * fail if any of these conditions is true:
1196 * [1] there really are no free pages, or
1197 * [2] only kernel "reserved" pages remain and
1198 * reserved pages have not been requested.
1199 * [3] only pagedaemon "reserved" pages remain and
1200 * the requestor isn't the pagedaemon.
1201 * we make kernel reserve pages available if called by a
1202 * kernel thread.
1203 */
1204 l = curlwp;
1205 if (__predict_true(l != NULL) && (l->l_flag & LW_SYSTEM) != 0) {
1206 flags |= UVM_PGA_USERESERVE;
1207 }
1208
1209 again:
1210 switch (strat) {
1211 case UVM_PGA_STRAT_NORMAL:
1212 /* Check freelists: descending priority (ascending id) order. */
1213 for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
1214 pg = uvm_pagealloc_pgfl(ucpu, lcv, &color, flags);
1215 if (pg != NULL) {
1216 goto gotit;
1217 }
1218 }
1219
1220 /* No pages free! Have pagedaemon free some memory. */
1221 splx(s);
1222 uvm_kick_pdaemon();
1223 return NULL;
1224
1225 case UVM_PGA_STRAT_ONLY:
1226 case UVM_PGA_STRAT_FALLBACK:
1227 /* Attempt to allocate from the specified free list. */
1228 KASSERT(free_list >= 0 && free_list < VM_NFREELIST);
1229 pg = uvm_pagealloc_pgfl(ucpu, free_list, &color, flags);
1230 if (pg != NULL) {
1231 goto gotit;
1232 }
1233
1234 /* Fall back, if possible. */
1235 if (strat == UVM_PGA_STRAT_FALLBACK) {
1236 strat = UVM_PGA_STRAT_NORMAL;
1237 goto again;
1238 }
1239
1240 /* No pages free! Have pagedaemon free some memory. */
1241 splx(s);
1242 uvm_kick_pdaemon();
1243 return NULL;
1244
1245 case UVM_PGA_STRAT_NUMA:
1246 /*
1247 * NUMA strategy (experimental): allocating from the correct
1248 * bucket is more important than observing freelist
1249 * priority. Look only to the current NUMA node; if that
1250 * fails, we need to look to other NUMA nodes, so retry with
1251 * the normal strategy.
1252 */
1253 for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
1254 pg = uvm_pgflcache_alloc(ucpu, lcv, color);
1255 if (pg != NULL) {
1256 CPU_COUNT(CPU_COUNT_CPUHIT, 1);
1257 CPU_COUNT(CPU_COUNT_COLORHIT, 1);
1258 goto gotit;
1259 }
1260 pg = uvm_pagealloc_pgb(ucpu, lcv,
1261 ucpu->pgflbucket, &color, flags);
1262 if (pg != NULL) {
1263 goto gotit;
1264 }
1265 }
1266 strat = UVM_PGA_STRAT_NORMAL;
1267 goto again;
1268
1269 default:
1270 panic("uvm_pagealloc_strat: bad strat %d", strat);
1271 /* NOTREACHED */
1272 }
1273
1274 gotit:
1275 /*
1276 * We now know which color we actually allocated from; set
1277 * the next color accordingly.
1278 */
1279
1280 ucpu->pgflcolor = (color + 1) & uvmexp.colormask;
1281
1282 /*
1283 * while still at IPL_VM, update allocation statistics and remember
1284 * if we have to zero the page
1285 */
1286
1287 CPU_COUNT(CPU_COUNT_FREEPAGES, -1);
1288 if (flags & UVM_PGA_ZERO) {
1289 if (pg->flags & PG_ZERO) {
1290 CPU_COUNT(CPU_COUNT_PGA_ZEROHIT, 1);
1291 zeroit = 0;
1292 } else {
1293 CPU_COUNT(CPU_COUNT_PGA_ZEROMISS, 1);
1294 zeroit = 1;
1295 }
1296 }
1297 if (pg->flags & PG_ZERO) {
1298 CPU_COUNT(CPU_COUNT_ZEROPAGES, -1);
1299 }
1300 if (anon) {
1301 CPU_COUNT(CPU_COUNT_ANONCLEAN, 1);
1302 }
1303 splx(s);
1304 KASSERT((pg->flags & ~(PG_ZERO|PG_FREE)) == 0);
1305
1306 /*
1307 * assign the page to the object. as the page was free, we know
1308 * that pg->uobject and pg->uanon are NULL. we only need to take
1309 * the page's interlock if we are changing the values.
1310 */
1311 if (anon != NULL || obj != NULL) {
1312 mutex_enter(&pg->interlock);
1313 }
1314 pg->offset = off;
1315 pg->uobject = obj;
1316 pg->uanon = anon;
1317 KASSERT(uvm_page_owner_locked_p(pg, true));
1318 pg->flags = PG_BUSY|PG_CLEAN|PG_FAKE;
1319 if (anon) {
1320 anon->an_page = pg;
1321 pg->flags |= PG_ANON;
1322 mutex_exit(&pg->interlock);
1323 } else if (obj) {
1324 /*
1325 * set PG_FILE|PG_AOBJ before the first uvm_pageinsert.
1326 */
1327 if (UVM_OBJ_IS_VNODE(obj)) {
1328 pg->flags |= PG_FILE;
1329 } else if (UVM_OBJ_IS_AOBJ(obj)) {
1330 pg->flags |= PG_AOBJ;
1331 }
1332 uvm_pageinsert_object(obj, pg);
1333 mutex_exit(&pg->interlock);
1334 error = uvm_pageinsert_tree(obj, pg);
1335 if (error != 0) {
1336 mutex_enter(&pg->interlock);
1337 uvm_pageremove_object(obj, pg);
1338 mutex_exit(&pg->interlock);
1339 uvm_pagefree(pg);
1340 return NULL;
1341 }
1342 }
1343
1344 #if defined(UVM_PAGE_TRKOWN)
1345 pg->owner_tag = NULL;
1346 #endif
1347 UVM_PAGE_OWN(pg, "new alloc");
1348
1349 if (flags & UVM_PGA_ZERO) {
1350 /*
1351 * A zero'd page is not clean. If we got a page not already
1352 * zero'd, then we have to zero it ourselves.
1353 */
1354 if (obj != NULL || anon != NULL) {
1355 uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
1356 }
1357 if (zeroit) {
1358 pmap_zero_page(VM_PAGE_TO_PHYS(pg));
1359 }
1360 }
1361
1362 return(pg);
1363 }
1364
1365 /*
1366 * uvm_pagereplace: replace a page with another
1367 *
1368 * => object must be locked
1369 * => page interlocks must be held
1370 */
1371
1372 void
1373 uvm_pagereplace(struct vm_page *oldpg, struct vm_page *newpg)
1374 {
1375 struct uvm_object *uobj = oldpg->uobject;
1376 struct vm_page *pg __diagused;
1377 uint64_t idx;
1378
1379 KASSERT((oldpg->flags & PG_TABLED) != 0);
1380 KASSERT(uobj != NULL);
1381 KASSERT((newpg->flags & PG_TABLED) == 0);
1382 KASSERT(newpg->uobject == NULL);
1383 KASSERT(rw_write_held(uobj->vmobjlock));
1384 KASSERT(mutex_owned(&oldpg->interlock));
1385 KASSERT(mutex_owned(&newpg->interlock));
1386
1387 newpg->uobject = uobj;
1388 newpg->offset = oldpg->offset;
1389 idx = newpg->offset >> PAGE_SHIFT;
1390 pg = radix_tree_replace_node(&uobj->uo_pages, idx, newpg);
1391 KASSERT(pg == oldpg);
1392 if (((oldpg->flags ^ newpg->flags) & PG_CLEAN) != 0) {
1393 if ((newpg->flags & PG_CLEAN) != 0) {
1394 radix_tree_clear_tag(&uobj->uo_pages, idx,
1395 UVM_PAGE_DIRTY_TAG);
1396 } else {
1397 radix_tree_set_tag(&uobj->uo_pages, idx,
1398 UVM_PAGE_DIRTY_TAG);
1399 }
1400 }
1401 /*
1402 * oldpg's PG_STAT is stable. newpg is not reachable by others yet.
1403 */
1404 newpg->flags |=
1405 (newpg->flags & ~PG_STAT) | (oldpg->flags & PG_STAT);
1406 uvm_pageinsert_object(uobj, newpg);
1407 uvm_pageremove_object(uobj, oldpg);
1408 }
1409
1410 /*
1411 * uvm_pagerealloc: reallocate a page from one object to another
1412 *
1413 * => both objects must be locked
1414 */
1415
1416 int
1417 uvm_pagerealloc(struct vm_page *pg, struct uvm_object *newobj, voff_t newoff)
1418 {
1419 int error = 0;
1420
1421 /*
1422 * remove it from the old object
1423 */
1424
1425 if (pg->uobject) {
1426 uvm_pageremove_tree(pg->uobject, pg);
1427 uvm_pageremove_object(pg->uobject, pg);
1428 }
1429
1430 /*
1431 * put it in the new object
1432 */
1433
1434 if (newobj) {
1435 mutex_enter(&pg->interlock);
1436 pg->uobject = newobj;
1437 pg->offset = newoff;
1438 if (UVM_OBJ_IS_VNODE(newobj)) {
1439 pg->flags |= PG_FILE;
1440 } else if (UVM_OBJ_IS_AOBJ(newobj)) {
1441 pg->flags |= PG_AOBJ;
1442 }
1443 uvm_pageinsert_object(newobj, pg);
1444 mutex_exit(&pg->interlock);
1445 error = uvm_pageinsert_tree(newobj, pg);
1446 if (error != 0) {
1447 mutex_enter(&pg->interlock);
1448 uvm_pageremove_object(newobj, pg);
1449 mutex_exit(&pg->interlock);
1450 }
1451 }
1452
1453 return error;
1454 }
1455
1456 #ifdef DEBUG
1457 /*
1458 * check if page is zero-filled
1459 */
1460 void
1461 uvm_pagezerocheck(struct vm_page *pg)
1462 {
1463 int *p, *ep;
1464
1465 KASSERT(uvm_zerocheckkva != 0);
1466
1467 /*
1468 * XXX assuming pmap_kenter_pa and pmap_kremove never call
1469 * uvm page allocator.
1470 *
1471 * it might be better to have a "CPU-local temporary map" pmap interface.
1472 */
1473 mutex_spin_enter(&uvm_zerochecklock);
1474 pmap_kenter_pa(uvm_zerocheckkva, VM_PAGE_TO_PHYS(pg), VM_PROT_READ, 0);
1475 p = (int *)uvm_zerocheckkva;
1476 ep = (int *)((char *)p + PAGE_SIZE);
1477 pmap_update(pmap_kernel());
1478 while (p < ep) {
1479 if (*p != 0)
1480 panic("PG_ZERO page isn't zero-filled");
1481 p++;
1482 }
1483 pmap_kremove(uvm_zerocheckkva, PAGE_SIZE);
1484 mutex_spin_exit(&uvm_zerochecklock);
1485 /*
1486 * pmap_update() is not necessary here because no one except us
1487 * uses this VA.
1488 */
1489 }
1490 #endif /* DEBUG */
1491
1492 /*
1493 * uvm_pagefree: free page
1494 *
1495 * => erase page's identity (i.e. remove from object)
1496 * => put page on free list
1497 * => caller must lock owning object (either anon or uvm_object)
1498 * => assumes all valid mappings of pg are gone
1499 */
1500
1501 void
1502 uvm_pagefree(struct vm_page *pg)
1503 {
1504 struct pgfreelist *pgfl;
1505 struct pgflbucket *pgb;
1506 struct uvm_cpu *ucpu;
1507 kmutex_t *lock;
1508 int bucket, s;
1509 bool locked;
1510
1511 #ifdef DEBUG
1512 if (pg->uobject == (void *)0xdeadbeef &&
1513 pg->uanon == (void *)0xdeadbeef) {
1514 panic("uvm_pagefree: freeing free page %p", pg);
1515 }
1516 #endif /* DEBUG */
1517
1518 KASSERT((pg->flags & PG_PAGEOUT) == 0);
1519 KASSERT(!(pg->flags & PG_FREE));
1520 KASSERT(pg->uobject == NULL || rw_write_held(pg->uobject->vmobjlock));
1521 KASSERT(pg->uobject != NULL || pg->uanon == NULL ||
1522 rw_write_held(pg->uanon->an_lock));
1523
1524 /*
1525 * remove the page from the object's tree before acquiring any page
1526 * interlocks: this can acquire locks to free radixtree nodes.
1527 */
1528 if (pg->uobject != NULL) {
1529 uvm_pageremove_tree(pg->uobject, pg);
1530 }
1531
1532 /*
1533 * if the page is loaned, resolve the loan instead of freeing.
1534 */
1535
1536 if (pg->loan_count) {
1537 KASSERT(pg->wire_count == 0);
1538
1539 /*
1540 * if the page is owned by an anon then we just want to
1541 * drop anon ownership. the kernel will free the page when
1542 * it is done with it. if the page is owned by an object,
1543 * remove it from the object and mark it dirty for the benefit
1544 * of possible anon owners.
1545 *
1546 * regardless of previous ownership, wakeup any waiters,
1547 * unbusy the page, and we're done.
1548 */
1549
1550 uvm_pagelock(pg);
1551 locked = true;
1552 if (pg->uobject != NULL) {
1553 uvm_pageremove_object(pg->uobject, pg);
1554 pg->flags &= ~(PG_FILE|PG_AOBJ);
1555 } else if (pg->uanon != NULL) {
1556 if ((pg->flags & PG_ANON) == 0) {
1557 pg->loan_count--;
1558 } else {
1559 const unsigned status = uvm_pagegetdirty(pg);
1560 pg->flags &= ~PG_ANON;
1561 cpu_count(CPU_COUNT_ANONUNKNOWN + status, -1);
1562 }
1563 pg->uanon->an_page = NULL;
1564 pg->uanon = NULL;
1565 }
1566 if (pg->pqflags & PQ_WANTED) {
1567 wakeup(pg);
1568 }
1569 pg->pqflags &= ~PQ_WANTED;
1570 pg->flags &= ~(PG_BUSY|PG_RELEASED|PG_PAGER1);
1571 #ifdef UVM_PAGE_TRKOWN
1572 pg->owner_tag = NULL;
1573 #endif
1574 KASSERT((pg->flags & PG_STAT) == 0);
1575 if (pg->loan_count) {
1576 KASSERT(pg->uobject == NULL);
1577 if (pg->uanon == NULL) {
1578 uvm_pagedequeue(pg);
1579 }
1580 uvm_pageunlock(pg);
1581 return;
1582 }
1583 } else if (pg->uobject != NULL || pg->uanon != NULL ||
1584 pg->wire_count != 0) {
1585 uvm_pagelock(pg);
1586 locked = true;
1587 } else {
1588 locked = false;
1589 }
1590
1591 /*
1592 * remove page from its object or anon.
1593 */
1594 if (pg->uobject != NULL) {
1595 uvm_pageremove_object(pg->uobject, pg);
1596 } else if (pg->uanon != NULL) {
1597 const unsigned int status = uvm_pagegetdirty(pg);
1598 pg->uanon->an_page = NULL;
1599 pg->uanon = NULL;
1600 cpu_count(CPU_COUNT_ANONUNKNOWN + status, -1);
1601 }
1602
1603 /*
1604 * if the page was wired, unwire it now.
1605 */
1606
1607 if (pg->wire_count) {
1608 pg->wire_count = 0;
1609 atomic_dec_uint(&uvmexp.wired);
1610 }
1611 if (locked) {
1612 /*
1613 * wake anyone waiting on the page.
1614 */
1615 if ((pg->pqflags & PQ_WANTED) != 0) {
1616 pg->pqflags &= ~PQ_WANTED;
1617 wakeup(pg);
1618 }
1619
1620 /*
1621 * now remove the page from the queues.
1622 */
1623 uvm_pagedequeue(pg);
1624 uvm_pageunlock(pg);
1625 } else {
1626 KASSERT(!uvmpdpol_pageisqueued_p(pg));
1627 }
1628
1629 /*
1630 * and put on free queue
1631 */
1632
1633 #ifdef DEBUG
1634 pg->uobject = (void *)0xdeadbeef;
1635 pg->uanon = (void *)0xdeadbeef;
1636 if (pg->flags & PG_ZERO)
1637 uvm_pagezerocheck(pg);
1638 #endif /* DEBUG */
1639
1640 /* Try to send the page to the per-CPU cache. */
1641 s = splvm();
1642 CPU_COUNT(CPU_COUNT_FREEPAGES, 1);
1643 if (pg->flags & PG_ZERO) {
1644 CPU_COUNT(CPU_COUNT_ZEROPAGES, 1);
1645 }
1646 ucpu = curcpu()->ci_data.cpu_uvm;
1647 bucket = uvm_page_get_bucket(pg);
1648 if (bucket == ucpu->pgflbucket && uvm_pgflcache_free(ucpu, pg)) {
1649 splx(s);
1650 return;
1651 }
1652
1653 /* Didn't work. Never mind, send it to a global bucket. */
1654 pgfl = &uvm.page_free[uvm_page_get_freelist(pg)];
1655 pgb = pgfl->pgfl_buckets[bucket];
1656 lock = &uvm_freelist_locks[bucket].lock;
1657
1658 mutex_spin_enter(lock);
1659 /* PG_FREE must be set under lock because of uvm_pglistalloc(). */
1660 pg->flags = (pg->flags & PG_ZERO) | PG_FREE;
1661 LIST_INSERT_HEAD(&pgb->pgb_colors[VM_PGCOLOR(pg)], pg, pageq.list);
1662 pgb->pgb_nfree++;
1663 mutex_spin_exit(lock);
1664 splx(s);
1665 }
1666
1667 /*
1668 * uvm_page_unbusy: unbusy an array of pages.
1669 *
1670 * => pages must either all belong to the same object, or all belong to anons.
1671 * => if pages are object-owned, object must be locked.
1672 * => if pages are anon-owned, anons must be locked.
1673 * => caller must make sure that anon-owned pages are not PG_RELEASED.
1674 */
1675
1676 void
1677 uvm_page_unbusy(struct vm_page **pgs, int npgs)
1678 {
1679 struct vm_page *pg;
1680 int i;
1681 UVMHIST_FUNC("uvm_page_unbusy"); UVMHIST_CALLED(ubchist);
1682
1683 for (i = 0; i < npgs; i++) {
1684 pg = pgs[i];
1685 if (pg == NULL || pg == PGO_DONTCARE) {
1686 continue;
1687 }
1688
1689 KASSERT(uvm_page_owner_locked_p(pg, true));
1690 KASSERT(pg->flags & PG_BUSY);
1691 KASSERT((pg->flags & PG_PAGEOUT) == 0);
1692 if (pg->flags & PG_RELEASED) {
1693 UVMHIST_LOG(ubchist, "releasing pg %#jx",
1694 (uintptr_t)pg, 0, 0, 0);
1695 KASSERT(pg->uobject != NULL ||
1696 (pg->uanon != NULL && pg->uanon->an_ref > 0));
1697 pg->flags &= ~PG_RELEASED;
1698 uvm_pagefree(pg);
1699 } else {
1700 UVMHIST_LOG(ubchist, "unbusying pg %#jx",
1701 (uintptr_t)pg, 0, 0, 0);
1702 KASSERT((pg->flags & PG_FAKE) == 0);
1703 pg->flags &= ~PG_BUSY;
1704 uvm_pagelock(pg);
1705 uvm_pagewakeup(pg);
1706 uvm_pageunlock(pg);
1707 UVM_PAGE_OWN(pg, NULL);
1708 }
1709 }
1710 }
1711
1712 /*
1713 * uvm_pagewait: wait for a busy page
1714 *
1715 * => page must be known PG_BUSY
1716 * => object must be read or write locked
1717 * => object will be unlocked on return
1718 */
1719
1720 void
1721 uvm_pagewait(struct vm_page *pg, krwlock_t *lock, const char *wmesg)
1722 {
1723
1724 KASSERT(rw_lock_held(lock));
1725 KASSERT((pg->flags & PG_BUSY) != 0);
1726 KASSERT(uvm_page_owner_locked_p(pg, false));
1727
1728 mutex_enter(&pg->interlock);
1729 pg->pqflags |= PQ_WANTED;
1730 rw_exit(lock);
1731 UVM_UNLOCK_AND_WAIT(pg, &pg->interlock, false, wmesg, 0);
1732 }
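/*
 * Illustrative wait/retry pattern (a sketch, not lifted from a real
 * caller): uvm_pagewait() drops the object lock, so the lookup must be
 * restarted after it returns, e.g.
 *
 *	rw_enter(uobj->vmobjlock, RW_WRITER);
 *	while ((pg = uvm_pagelookup(uobj, off)) != NULL &&
 *	    (pg->flags & PG_BUSY) != 0) {
 *		uvm_pagewait(pg, uobj->vmobjlock, "pgwait");
 *		rw_enter(uobj->vmobjlock, RW_WRITER);
 *	}
 */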
1733
1734 /*
1735 * uvm_pagewakeup: wake anyone waiting on a page
1736 *
1737 * => page interlock must be held
1738 */
1739
1740 void
1741 uvm_pagewakeup(struct vm_page *pg)
1742 {
1743 UVMHIST_FUNC("uvm_pagewakeup"); UVMHIST_CALLED(ubchist);
1744
1745 KASSERT(mutex_owned(&pg->interlock));
1746
1747 UVMHIST_LOG(ubchist, "waking pg %#jx", (uintptr_t)pg, 0, 0, 0);
1748
1749 if ((pg->pqflags & PQ_WANTED) != 0) {
1750 wakeup(pg);
1751 pg->pqflags &= ~PQ_WANTED;
1752 }
1753 }
1754
1755 /*
1756 * uvm_pagewanted_p: return true if someone is waiting on the page
1757 *
1758 * => object must be write locked (lock out all concurrent access)
1759 */
1760
1761 bool
1762 uvm_pagewanted_p(struct vm_page *pg)
1763 {
1764
1765 KASSERT(uvm_page_owner_locked_p(pg, true));
1766
1767 return (atomic_load_relaxed(&pg->pqflags) & PQ_WANTED) != 0;
1768 }
1769
1770 #if defined(UVM_PAGE_TRKOWN)
1771 /*
1772 * uvm_page_own: set or release page ownership
1773 *
1774 * => this is a debugging function that keeps track of who sets PG_BUSY
1775 * and where they do it. it can be used to track down problems
1776 * such as a process setting "PG_BUSY" and never releasing it.
1777 * => page's object [if any] must be locked
1778 * => if "tag" is NULL then we are releasing page ownership
1779 */
1780 void
1781 uvm_page_own(struct vm_page *pg, const char *tag)
1782 {
1783
1784 KASSERT((pg->flags & (PG_PAGEOUT|PG_RELEASED)) == 0);
1785 KASSERT(uvm_page_owner_locked_p(pg, true));
1786
1787 /* gain ownership? */
1788 if (tag) {
1789 KASSERT((pg->flags & PG_BUSY) != 0);
1790 if (pg->owner_tag) {
1791 printf("uvm_page_own: page %p already owned "
1792 "by proc %d.%d [%s]\n", pg,
1793 pg->owner, pg->lowner, pg->owner_tag);
1794 panic("uvm_page_own");
1795 }
1796 pg->owner = curproc->p_pid;
1797 pg->lowner = curlwp->l_lid;
1798 pg->owner_tag = tag;
1799 return;
1800 }
1801
1802 /* drop ownership */
1803 KASSERT((pg->flags & PG_BUSY) == 0);
1804 if (pg->owner_tag == NULL) {
1805 printf("uvm_page_own: dropping ownership of a non-owned "
1806 "page (%p)\n", pg);
1807 panic("uvm_page_own");
1808 }
1809 pg->owner_tag = NULL;
1810 }
1811 #endif
1812
1813 /*
1814 * uvm_pageidlezero: zero free pages while the system is idle.
1815 */
1816 void
1817 uvm_pageidlezero(void)
1818 {
1819
1820 /*
1821 * Disabled for the moment. Previous strategy too cache heavy. In
1822 * the future we may experiment with zeroing the pages held in the
1823 * per-CPU cache (uvm_pgflcache).
1824 */
1825 }
1826
1827 /*
1828 * uvm_pagelookup: look up a page
1829 *
1830 * => caller should lock object to keep someone from pulling the page
1831 * out from under it
1832 */
1833
1834 struct vm_page *
1835 uvm_pagelookup(struct uvm_object *obj, voff_t off)
1836 {
1837 struct vm_page *pg;
1838
1839 /* No - used from DDB. KASSERT(rw_lock_held(obj->vmobjlock)); */
1840
1841 pg = radix_tree_lookup_node(&obj->uo_pages, off >> PAGE_SHIFT);
1842
1843 KASSERT(pg == NULL || obj->uo_npages != 0);
1844 KASSERT(pg == NULL || (pg->flags & (PG_RELEASED|PG_PAGEOUT)) == 0 ||
1845 (pg->flags & PG_BUSY) != 0);
1846 return pg;
1847 }
1848
1849 /*
1850 * uvm_pagewire: wire the page, thus removing it from the daemon's grasp
1851 *
1852 * => caller must lock objects
1853 * => caller must hold pg->interlock
1854 */
1855
1856 void
1857 uvm_pagewire(struct vm_page *pg)
1858 {
1859
1860 KASSERT(uvm_page_owner_locked_p(pg, true));
1861 KASSERT(mutex_owned(&pg->interlock));
1862 #if defined(READAHEAD_STATS)
1863 if ((pg->flags & PG_READAHEAD) != 0) {
1864 uvm_ra_hit.ev_count++;
1865 pg->flags &= ~PG_READAHEAD;
1866 }
1867 #endif /* defined(READAHEAD_STATS) */
1868 if (pg->wire_count == 0) {
1869 uvm_pagedequeue(pg);
1870 atomic_inc_uint(&uvmexp.wired);
1871 }
1872 pg->wire_count++;
1873 KASSERT(pg->wire_count > 0); /* detect wraparound */
1874 }
1875
1876 /*
1877 * uvm_pageunwire: unwire the page.
1878 *
1879 * => activate if wire count goes to zero.
1880 * => caller must lock objects
1881 * => caller must hold pg->interlock
1882 */
1883
1884 void
1885 uvm_pageunwire(struct vm_page *pg)
1886 {
1887
1888 KASSERT(uvm_page_owner_locked_p(pg, true));
1889 KASSERT(pg->wire_count != 0);
1890 KASSERT(!uvmpdpol_pageisqueued_p(pg));
1891 KASSERT(mutex_owned(&pg->interlock));
1892 pg->wire_count--;
1893 if (pg->wire_count == 0) {
1894 uvm_pageactivate(pg);
1895 KASSERT(uvmexp.wired != 0);
1896 atomic_dec_uint(&uvmexp.wired);
1897 }
1898 }
1899
1900 /*
1901 * uvm_pagedeactivate: deactivate page
1902 *
1903 * => caller must lock objects
1904 * => caller must check to make sure page is not wired
1905 * => object that page belongs to must be locked (so we can adjust pg->flags)
1906 * => caller must clear the reference on the page before calling
1907 * => caller must hold pg->interlock
1908 */
1909
1910 void
1911 uvm_pagedeactivate(struct vm_page *pg)
1912 {
1913
1914 KASSERT(uvm_page_owner_locked_p(pg, false));
1915 KASSERT(mutex_owned(&pg->interlock));
1916 if (pg->wire_count == 0) {
1917 KASSERT(uvmpdpol_pageisqueued_p(pg));
1918 uvmpdpol_pagedeactivate(pg);
1919 }
1920 }
1921
1922 /*
1923 * uvm_pageactivate: activate page
1924 *
1925 * => caller must lock objects
1926 * => caller must hold pg->interlock
1927 */
1928
1929 void
1930 uvm_pageactivate(struct vm_page *pg)
1931 {
1932
1933 KASSERT(uvm_page_owner_locked_p(pg, false));
1934 KASSERT(mutex_owned(&pg->interlock));
1935 #if defined(READAHEAD_STATS)
1936 if ((pg->flags & PG_READAHEAD) != 0) {
1937 uvm_ra_hit.ev_count++;
1938 pg->flags &= ~PG_READAHEAD;
1939 }
1940 #endif /* defined(READAHEAD_STATS) */
1941 if (pg->wire_count == 0) {
1942 uvmpdpol_pageactivate(pg);
1943 }
1944 }
1945
1946 /*
1947 * uvm_pagedequeue: remove a page from any paging queue
1948 *
1949 * => caller must lock objects
1950 * => caller must hold pg->interlock
1951 */
1952 void
1953 uvm_pagedequeue(struct vm_page *pg)
1954 {
1955
1956 KASSERT(uvm_page_owner_locked_p(pg, true));
1957 KASSERT(mutex_owned(&pg->interlock));
1958 if (uvmpdpol_pageisqueued_p(pg)) {
1959 uvmpdpol_pagedequeue(pg);
1960 }
1961 }
1962
1963 /*
1964 * uvm_pageenqueue: add a page to a paging queue without activating.
1965 * used where a page is not really demanded (yet). eg. read-ahead
1966 *
1967 * => caller must lock objects
1968 * => caller must hold pg->interlock
1969 */
1970 void
1971 uvm_pageenqueue(struct vm_page *pg)
1972 {
1973
1974 KASSERT(uvm_page_owner_locked_p(pg, false));
1975 KASSERT(mutex_owned(&pg->interlock));
1976 if (pg->wire_count == 0 && !uvmpdpol_pageisqueued_p(pg)) {
1977 uvmpdpol_pageenqueue(pg);
1978 }
1979 }
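
/*
 * For illustration only (an assumed caller, not original code): placing a
 * read-ahead page on the queues without marking it active, so it is cheap
 * to reclaim if it is never referenced.  "pg" is hypothetical and its
 * owner is assumed to be locked by the caller.
 *
 *	uvm_pagelock(pg);
 *	uvm_pageenqueue(pg);
 *	uvm_pageunlock(pg);
 */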
1980
1981 /*
1982 * uvm_pagelock: acquire page interlock
1983 */
1984 void
1985 uvm_pagelock(struct vm_page *pg)
1986 {
1987
1988 mutex_enter(&pg->interlock);
1989 }
1990
1991 /*
1992 * uvm_pagelock2: acquire two page interlocks
1993 */
1994 void
1995 uvm_pagelock2(struct vm_page *pg1, struct vm_page *pg2)
1996 {
1997
1998 if (pg1 < pg2) {
1999 mutex_enter(&pg1->interlock);
2000 mutex_enter(&pg2->interlock);
2001 } else {
2002 mutex_enter(&pg2->interlock);
2003 mutex_enter(&pg1->interlock);
2004 }
2005 }
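
/*
 * Note on the ordering above: taking the two interlocks in ascending
 * address order means any two threads locking the same pair of pages
 * acquire them in the same order, which rules out an ABBA deadlock.
 */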
2006
2007 /*
2008  * uvm_pageunlock: release the page interlock, and if a page replacement
2009  * intent is set on the page, pass it to uvmpdpol to make it real.
2010 *
2011 * => caller must hold pg->interlock
2012 */
2013 void
2014 uvm_pageunlock(struct vm_page *pg)
2015 {
2016
2017 if ((pg->pqflags & PQ_INTENT_SET) == 0 ||
2018 (pg->pqflags & PQ_INTENT_QUEUED) != 0) {
2019 mutex_exit(&pg->interlock);
2020 return;
2021 }
2022 pg->pqflags |= PQ_INTENT_QUEUED;
2023 mutex_exit(&pg->interlock);
2024 uvmpdpol_pagerealize(pg);
2025 }
2026
2027 /*
2028  * uvm_pageunlock2: release two page interlocks, and for each page, if a
2029  * page replacement intent is set on it, pass it to uvmpdpol to make it
2030  * real.
2031  *
2032  * => caller must hold pg1->interlock and pg2->interlock
2033 */
2034 void
2035 uvm_pageunlock2(struct vm_page *pg1, struct vm_page *pg2)
2036 {
2037
2038 if ((pg1->pqflags & PQ_INTENT_SET) == 0 ||
2039 (pg1->pqflags & PQ_INTENT_QUEUED) != 0) {
2040 mutex_exit(&pg1->interlock);
2041 pg1 = NULL;
2042 } else {
2043 pg1->pqflags |= PQ_INTENT_QUEUED;
2044 mutex_exit(&pg1->interlock);
2045 }
2046
2047 if ((pg2->pqflags & PQ_INTENT_SET) == 0 ||
2048 (pg2->pqflags & PQ_INTENT_QUEUED) != 0) {
2049 mutex_exit(&pg2->interlock);
2050 pg2 = NULL;
2051 } else {
2052 pg2->pqflags |= PQ_INTENT_QUEUED;
2053 mutex_exit(&pg2->interlock);
2054 }
2055
2056 if (pg1 != NULL) {
2057 uvmpdpol_pagerealize(pg1);
2058 }
2059 if (pg2 != NULL) {
2060 uvmpdpol_pagerealize(pg2);
2061 }
2062 }
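
/*
 * A rough usage sketch (assumed, not original code): working on two pages
 * at once with the paired helpers, so that any replacement intents set
 * while the interlocks were held are realized on release.  "pg1" and
 * "pg2" are hypothetical names.
 *
 *	uvm_pagelock2(pg1, pg2);
 *	... update pg1 and pg2 under their interlocks ...
 *	uvm_pageunlock2(pg1, pg2);
 */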
2063
2064 /*
2065 * uvm_pagezero: zero fill a page
2066 *
2067 * => if page is part of an object then the object should be locked
2068 * to protect pg->flags.
2069 */
2070
2071 void
2072 uvm_pagezero(struct vm_page *pg)
2073 {
2074
2075 uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
2076 pmap_zero_page(VM_PAGE_TO_PHYS(pg));
2077 }
2078
2079 /*
2080 * uvm_pagecopy: copy a page
2081 *
2082 * => if page is part of an object then the object should be locked
2083 * to protect pg->flags.
2084 */
2085
2086 void
2087 uvm_pagecopy(struct vm_page *src, struct vm_page *dst)
2088 {
2089
2090 uvm_pagemarkdirty(dst, UVM_PAGE_STATUS_DIRTY);
2091 pmap_copy_page(VM_PAGE_TO_PHYS(src), VM_PAGE_TO_PHYS(dst));
2092 }
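
/*
 * An illustrative sketch (assumed caller, hypothetical names "uobj",
 * "off", "opg" and "npg"; the relevant object locks are assumed held):
 * zero-filling a newly allocated object page before exposing it, and
 * copying one page to another.  Both helpers mark the destination dirty.
 *
 *	npg = uvm_pagealloc(uobj, off, NULL, 0);
 *	if (npg != NULL)
 *		uvm_pagezero(npg);
 *
 *	uvm_pagecopy(opg, npg);
 */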
2093
2094 /*
2095  * uvm_pageismanaged: test to see that a page (specified by its PA) is managed.
2096 */
2097
2098 bool
2099 uvm_pageismanaged(paddr_t pa)
2100 {
2101
2102 return (uvm_physseg_find(atop(pa), NULL) != UVM_PHYSSEG_TYPE_INVALID);
2103 }
2104
2105 /*
2106 * uvm_page_lookup_freelist: look up the free list for the specified page
2107 */
2108
2109 int
2110 uvm_page_lookup_freelist(struct vm_page *pg)
2111 {
2112 uvm_physseg_t upm;
2113
2114 upm = uvm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), NULL);
2115 KASSERT(upm != UVM_PHYSSEG_TYPE_INVALID);
2116 return uvm_physseg_get_free_list(upm);
2117 }
2118
2119 /*
2120  * uvm_page_owner_locked_p: return true if the object or anon associated with
2121  * the page is locked.  this is a weak check for runtime assertions only.
2122 */
2123
2124 bool
2125 uvm_page_owner_locked_p(struct vm_page *pg, bool exclusive)
2126 {
2127
2128 if (pg->uobject != NULL) {
2129 return exclusive
2130 ? rw_write_held(pg->uobject->vmobjlock)
2131 : rw_lock_held(pg->uobject->vmobjlock);
2132 }
2133 if (pg->uanon != NULL) {
2134 return exclusive
2135 ? rw_write_held(pg->uanon->an_lock)
2136 : rw_lock_held(pg->uanon->an_lock);
2137 }
2138 return true;
2139 }
2140
2141 /*
2142  * uvm_pagereadonly_p: return true if the page should be mapped read-only
2143 */
2144
2145 bool
2146 uvm_pagereadonly_p(struct vm_page *pg)
2147 {
2148 struct uvm_object * const uobj = pg->uobject;
2149
2150 KASSERT(uobj == NULL || rw_lock_held(uobj->vmobjlock));
2151 KASSERT(uobj != NULL || rw_lock_held(pg->uanon->an_lock));
2152 if ((pg->flags & PG_RDONLY) != 0) {
2153 return true;
2154 }
2155 if (uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_CLEAN) {
2156 return true;
2157 }
2158 if (uobj == NULL) {
2159 return false;
2160 }
2161 return UVM_OBJ_NEEDS_WRITEFAULT(uobj);
2162 }
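
/*
 * A hedged example (assumed fault-handling style caller; "pg", "pmap",
 * "va", "prot" and "flags" are hypothetical, and the page's owner is
 * assumed to be locked): dropping write permission before entering a
 * mapping when the page must be mapped read-only.
 *
 *	if (uvm_pagereadonly_p(pg))
 *		prot &= ~VM_PROT_WRITE;
 *	error = pmap_enter(pmap, va, VM_PAGE_TO_PHYS(pg), prot, flags);
 */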
2163
2164 #ifdef PMAP_DIRECT
2165 /*
2166  * Call pmap to translate a physical address into a virtual one and run a
2167  * callback on it.  Used to avoid actually mapping the pages; the pmap most
2168  * likely uses a direct map or equivalent.
2169 */
2170 int
2171 uvm_direct_process(struct vm_page **pgs, u_int npages, voff_t off, vsize_t len,
2172 int (*process)(void *, size_t, void *), void *arg)
2173 {
2174 int error = 0;
2175 paddr_t pa;
2176 size_t todo;
2177 voff_t pgoff = (off & PAGE_MASK);
2178 struct vm_page *pg;
2179
2180 KASSERT(npages > 0 && len > 0);
2181
2182 for (int i = 0; i < npages; i++) {
2183 pg = pgs[i];
2184
2185 KASSERT(len > 0);
2186
2187 /*
2188 * Caller is responsible for ensuring all the pages are
2189 * available.
2190 */
2191 KASSERT(pg != NULL && pg != PGO_DONTCARE);
2192
2193 pa = VM_PAGE_TO_PHYS(pg);
2194 todo = MIN(len, PAGE_SIZE - pgoff);
2195
2196 error = pmap_direct_process(pa, pgoff, todo, process, arg);
2197 if (error)
2198 break;
2199
2200 pgoff = 0;
2201 len -= todo;
2202 }
2203
2204 KASSERTMSG(error != 0 || len == 0, "len %lu != 0 for non-error", len);
2205 return error;
2206 }
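
/*
 * A sketch of a possible callback (assumed, not original code): copying
 * the bytes handed to the callback into a uio, so a buffer spanning
 * several pages is drained without mapping the pages.  "copy_cb", "pgs",
 * "npages", "off", "len" and "uio" are hypothetical.
 *
 *	static int
 *	copy_cb(void *va, size_t todo, void *arg)
 *	{
 *		return uiomove(va, todo, arg);
 *	}
 *
 *	error = uvm_direct_process(pgs, npages, off, len, copy_cb, uio);
 */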
2207 #endif /* PMAP_DIRECT */
2208
2209 #if defined(DDB) || defined(DEBUGPRINT)
2210
2211 /*
2212 * uvm_page_printit: actually print the page
2213 */
2214
2215 static const char page_flagbits[] = UVM_PGFLAGBITS;
2216 static const char page_pqflagbits[] = UVM_PQFLAGBITS;
2217
2218 void
2219 uvm_page_printit(struct vm_page *pg, bool full,
2220 void (*pr)(const char *, ...))
2221 {
2222 struct vm_page *tpg;
2223 struct uvm_object *uobj;
2224 struct pgflbucket *pgb;
2225 struct pgflist *pgl;
2226 char pgbuf[128];
2227
2228 (*pr)("PAGE %p:\n", pg);
2229 snprintb(pgbuf, sizeof(pgbuf), page_flagbits, pg->flags);
2230 (*pr)(" flags=%s\n", pgbuf);
2231 snprintb(pgbuf, sizeof(pgbuf), page_pqflagbits, pg->pqflags);
2232 (*pr)(" pqflags=%s\n", pgbuf);
2233 (*pr)(" uobject=%p, uanon=%p, offset=0x%llx\n",
2234 pg->uobject, pg->uanon, (long long)pg->offset);
2235 (*pr)(" loan_count=%d wire_count=%d bucket=%d freelist=%d\n",
2236 pg->loan_count, pg->wire_count, uvm_page_get_bucket(pg),
2237 uvm_page_get_freelist(pg));
2238 (*pr)(" pa=0x%lx\n", (long)VM_PAGE_TO_PHYS(pg));
2239 #if defined(UVM_PAGE_TRKOWN)
2240 if (pg->flags & PG_BUSY)
2241 (*pr)(" owning process = %d.%d, tag=%s\n",
2242 pg->owner, pg->lowner, pg->owner_tag);
2243 else
2244 (*pr)(" page not busy, no owner\n");
2245 #else
2246 (*pr)(" [page ownership tracking disabled]\n");
2247 #endif
2248
2249 if (!full)
2250 return;
2251
2252 /* cross-verify object/anon */
2253 if ((pg->flags & PG_FREE) == 0) {
2254 if (pg->flags & PG_ANON) {
2255 if (pg->uanon == NULL || pg->uanon->an_page != pg)
2256 (*pr)(" >>> ANON DOES NOT POINT HERE <<< (%p)\n",
2257 (pg->uanon) ? pg->uanon->an_page : NULL);
2258 else
2259 (*pr)(" anon backpointer is OK\n");
2260 } else {
2261 uobj = pg->uobject;
2262 if (uobj) {
2263 (*pr)(" checking object list\n");
2264 tpg = uvm_pagelookup(uobj, pg->offset);
2265 if (tpg)
2266 (*pr)(" page found on object list\n");
2267 else
2268 (*pr)(" >>> PAGE NOT FOUND ON OBJECT LIST! <<<\n");
2269 }
2270 }
2271 }
2272
2273 /* cross-verify page queue */
2274 if (pg->flags & PG_FREE) {
2275 int fl = uvm_page_get_freelist(pg);
2276 int b = uvm_page_get_bucket(pg);
2277 pgb = uvm.page_free[fl].pgfl_buckets[b];
2278 pgl = &pgb->pgb_colors[VM_PGCOLOR(pg)];
2279 (*pr)(" checking pageq list\n");
2280 LIST_FOREACH(tpg, pgl, pageq.list) {
2281 if (tpg == pg) {
2282 break;
2283 }
2284 }
2285 if (tpg)
2286 (*pr)(" page found on pageq list\n");
2287 else
2288 (*pr)(" >>> PAGE NOT FOUND ON PAGEQ LIST! <<<\n");
2289 }
2290 }
2291
2292 /*
2293 * uvm_page_printall - print a summary of all managed pages
2294 */
2295
2296 void
2297 uvm_page_printall(void (*pr)(const char *, ...))
2298 {
2299 uvm_physseg_t i;
2300 paddr_t pfn;
2301 struct vm_page *pg;
2302
2303 (*pr)("%18s %4s %4s %18s %18s"
2304 #ifdef UVM_PAGE_TRKOWN
2305 " OWNER"
2306 #endif
2307 "\n", "PAGE", "FLAG", "PQ", "UOBJECT", "UANON");
2308 for (i = uvm_physseg_get_first();
2309 uvm_physseg_valid_p(i);
2310 i = uvm_physseg_get_next(i)) {
2311 for (pfn = uvm_physseg_get_start(i);
2312 pfn < uvm_physseg_get_end(i);
2313 pfn++) {
2314 pg = PHYS_TO_VM_PAGE(ptoa(pfn));
2315
2316 (*pr)("%18p %04x %08x %18p %18p",
2317 pg, pg->flags, pg->pqflags, pg->uobject,
2318 pg->uanon);
2319 #ifdef UVM_PAGE_TRKOWN
2320 if (pg->flags & PG_BUSY)
2321 (*pr)(" %d [%s]", pg->owner, pg->owner_tag);
2322 #endif
2323 (*pr)("\n");
2324 }
2325 }
2326 }
2327
2328 /*
2329  * uvm_page_print_freelists - print a summary of the free lists
2330 */
2331
2332 void
2333 uvm_page_print_freelists(void (*pr)(const char *, ...))
2334 {
2335 struct pgfreelist *pgfl;
2336 struct pgflbucket *pgb;
2337 int fl, b, c;
2338
2339 (*pr)("There are %d freelists with %d buckets of %d colors.\n\n",
2340 VM_NFREELIST, uvm.bucketcount, uvmexp.ncolors);
2341
2342 for (fl = 0; fl < VM_NFREELIST; fl++) {
2343 pgfl = &uvm.page_free[fl];
2344 (*pr)("freelist(%d) @ %p\n", fl, pgfl);
2345 for (b = 0; b < uvm.bucketcount; b++) {
2346 pgb = uvm.page_free[fl].pgfl_buckets[b];
2347 (*pr)(" bucket(%d) @ %p, nfree = %d, lock @ %p:\n",
2348 b, pgb, pgb->pgb_nfree,
2349 &uvm_freelist_locks[b].lock);
2350 for (c = 0; c < uvmexp.ncolors; c++) {
2351 (*pr)(" color(%d) @ %p, ", c,
2352 &pgb->pgb_colors[c]);
2353 (*pr)("first page = %p\n",
2354 LIST_FIRST(&pgb->pgb_colors[c]));
2355 }
2356 }
2357 }
2358 }
2359
2360 #endif /* DDB || DEBUGPRINT */
2361