     1	/*	$NetBSD: uvm_page.c,v 1.244 2020/07/09 05:57:15 skrll Exp $	*/
2
3 /*-
4 * Copyright (c) 2019, 2020 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * Copyright (c) 1997 Charles D. Cranor and Washington University.
34 * Copyright (c) 1991, 1993, The Regents of the University of California.
35 *
36 * All rights reserved.
37 *
38 * This code is derived from software contributed to Berkeley by
39 * The Mach Operating System project at Carnegie-Mellon University.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 * 1. Redistributions of source code must retain the above copyright
45 * notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright
47 * notice, this list of conditions and the following disclaimer in the
48 * documentation and/or other materials provided with the distribution.
49 * 3. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vm_page.c 8.3 (Berkeley) 3/21/94
66 * from: Id: uvm_page.c,v 1.1.2.18 1998/02/06 05:24:42 chs Exp
67 *
68 *
69 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
70 * All rights reserved.
71 *
72 * Permission to use, copy, modify and distribute this software and
73 * its documentation is hereby granted, provided that both the copyright
74 * notice and this permission notice appear in all copies of the
75 * software, derivative works or modified versions, and any portions
76 * thereof, and that both notices appear in supporting documentation.
77 *
78 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
79 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
80 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
81 *
82 * Carnegie Mellon requests users of this software to return to
83 *
    84	 *	Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
85 * School of Computer Science
86 * Carnegie Mellon University
87 * Pittsburgh PA 15213-3890
88 *
89 * any improvements or extensions that they make and grant Carnegie the
90 * rights to redistribute these changes.
91 */
92
93 /*
94 * uvm_page.c: page ops.
95 */
96
97 #include <sys/cdefs.h>
98 __KERNEL_RCSID(0, "$NetBSD: uvm_page.c,v 1.244 2020/07/09 05:57:15 skrll Exp $");
99
100 #include "opt_ddb.h"
101 #include "opt_uvm.h"
102 #include "opt_uvmhist.h"
103 #include "opt_readahead.h"
104
105 #include <sys/param.h>
106 #include <sys/systm.h>
107 #include <sys/sched.h>
108 #include <sys/kernel.h>
109 #include <sys/vnode.h>
110 #include <sys/proc.h>
111 #include <sys/radixtree.h>
112 #include <sys/atomic.h>
113 #include <sys/cpu.h>
114
115 #include <uvm/uvm.h>
116 #include <uvm/uvm_ddb.h>
117 #include <uvm/uvm_pdpolicy.h>
118 #include <uvm/uvm_pgflcache.h>
119
120 /*
   121	 * number of pages to reserve per CPU for the kernel.
122 */
123 #ifndef UVM_RESERVED_PAGES_PER_CPU
124 #define UVM_RESERVED_PAGES_PER_CPU 5
125 #endif
126 int vm_page_reserve_kernel = UVM_RESERVED_PAGES_PER_CPU;
127
128 /*
   129	 * physical memory size.
130 */
131 psize_t physmem;
132
133 /*
134 * local variables
135 */
136
137 /*
   138	 * these variables record the kernel virtual address range returned by
   139	 * pmap_virtual_space(), for debugging purposes.  The implementation of
   140	 * uvm_pageboot_alloc here also uses them internally.
141 */
142
143 static vaddr_t virtual_space_start;
144 static vaddr_t virtual_space_end;
145
146 /*
147 * we allocate an initial number of page colors in uvm_page_init(),
148 * and remember them. We may re-color pages as cache sizes are
149 * discovered during the autoconfiguration phase. But we can never
150 * free the initial set of buckets, since they are allocated using
151 * uvm_pageboot_alloc().
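 *
 * A page's color is derived from its physical page frame number masked
 * with uvmexp.colormask (see VM_PGCOLOR()); handing out pages round-robin
 * by color helps spread them evenly across a physically indexed cache.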
152 */
153
154 static size_t recolored_pages_memsize /* = 0 */;
155 static char *recolored_pages_mem;
156
157 /*
158 * freelist locks - one per bucket.
159 */
160
161 union uvm_freelist_lock uvm_freelist_locks[PGFL_MAX_BUCKETS]
162 __cacheline_aligned;
163
164 /*
165 * basic NUMA information.
166 */
167
168 static struct uvm_page_numa_region {
169 struct uvm_page_numa_region *next;
170 paddr_t start;
171 paddr_t size;
172 u_int numa_id;
173 } *uvm_page_numa_region;
174
175 #ifdef DEBUG
176 kmutex_t uvm_zerochecklock __cacheline_aligned;
177 vaddr_t uvm_zerocheckkva;
178 #endif /* DEBUG */
179
180 /*
181 * These functions are reserved for uvm(9) internal use and are not
182 * exported in the header file uvm_physseg.h
183 *
   184	 * Thus they are declared again here.
185 */
186 void uvm_physseg_init_seg(uvm_physseg_t, struct vm_page *);
187 void uvm_physseg_seg_chomp_slab(uvm_physseg_t, struct vm_page *, size_t);
188
189 /* returns a pgs array */
190 struct vm_page *uvm_physseg_seg_alloc_from_slab(uvm_physseg_t, size_t);
191
192 /*
193 * inline functions
194 */
195
196 /*
   197	 * uvm_pageinsert_object: insert a page into the object.
198 *
199 * => caller must lock object
   200	 * => caller should have already set pg's object and offset pointers
201 * and bumped the version counter
202 */
203
204 static inline void
205 uvm_pageinsert_object(struct uvm_object *uobj, struct vm_page *pg)
206 {
207
208 KASSERT(uobj == pg->uobject);
209 KASSERT(rw_write_held(uobj->vmobjlock));
210 KASSERT((pg->flags & PG_TABLED) == 0);
211
212 if ((pg->flags & PG_STAT) != 0) {
213 /* Cannot use uvm_pagegetdirty(): not yet in radix tree. */
214 const unsigned int status = pg->flags & (PG_CLEAN | PG_DIRTY);
215
216 if ((pg->flags & PG_FILE) != 0) {
217 if (uobj->uo_npages == 0) {
218 struct vnode *vp = (struct vnode *)uobj;
219 mutex_enter(vp->v_interlock);
220 KASSERT((vp->v_iflag & VI_PAGES) == 0);
221 vp->v_iflag |= VI_PAGES;
222 vholdl(vp);
223 mutex_exit(vp->v_interlock);
224 }
225 if (UVM_OBJ_IS_VTEXT(uobj)) {
226 cpu_count(CPU_COUNT_EXECPAGES, 1);
227 }
228 cpu_count(CPU_COUNT_FILEUNKNOWN + status, 1);
229 } else {
230 cpu_count(CPU_COUNT_ANONUNKNOWN + status, 1);
231 }
232 }
233 pg->flags |= PG_TABLED;
234 uobj->uo_npages++;
235 }
236
237 static inline int
238 uvm_pageinsert_tree(struct uvm_object *uobj, struct vm_page *pg)
239 {
240 const uint64_t idx = pg->offset >> PAGE_SHIFT;
241 int error;
242
243 error = radix_tree_insert_node(&uobj->uo_pages, idx, pg);
244 if (error != 0) {
245 return error;
246 }
247 if ((pg->flags & PG_CLEAN) == 0) {
248 radix_tree_set_tag(&uobj->uo_pages, idx, UVM_PAGE_DIRTY_TAG);
249 }
250 KASSERT(((pg->flags & PG_CLEAN) == 0) ==
251 radix_tree_get_tag(&uobj->uo_pages, idx, UVM_PAGE_DIRTY_TAG));
252 return 0;
253 }
254
255 /*
   256	 * uvm_pageremove_object: remove a page from the object.
257 *
258 * => caller must lock object
259 */
260
261 static inline void
262 uvm_pageremove_object(struct uvm_object *uobj, struct vm_page *pg)
263 {
264
265 KASSERT(uobj == pg->uobject);
266 KASSERT(rw_write_held(uobj->vmobjlock));
267 KASSERT(pg->flags & PG_TABLED);
268
269 if ((pg->flags & PG_STAT) != 0) {
270 /* Cannot use uvm_pagegetdirty(): no longer in radix tree. */
271 const unsigned int status = pg->flags & (PG_CLEAN | PG_DIRTY);
272
273 if ((pg->flags & PG_FILE) != 0) {
274 if (uobj->uo_npages == 1) {
275 struct vnode *vp = (struct vnode *)uobj;
276 mutex_enter(vp->v_interlock);
277 KASSERT((vp->v_iflag & VI_PAGES) != 0);
278 vp->v_iflag &= ~VI_PAGES;
279 holdrelel(vp);
280 mutex_exit(vp->v_interlock);
281 }
282 if (UVM_OBJ_IS_VTEXT(uobj)) {
283 cpu_count(CPU_COUNT_EXECPAGES, -1);
284 }
285 cpu_count(CPU_COUNT_FILEUNKNOWN + status, -1);
286 } else {
287 cpu_count(CPU_COUNT_ANONUNKNOWN + status, -1);
288 }
289 }
290 uobj->uo_npages--;
291 pg->flags &= ~PG_TABLED;
292 pg->uobject = NULL;
293 }
294
295 static inline void
296 uvm_pageremove_tree(struct uvm_object *uobj, struct vm_page *pg)
297 {
298 struct vm_page *opg __unused;
299
300 opg = radix_tree_remove_node(&uobj->uo_pages, pg->offset >> PAGE_SHIFT);
301 KASSERT(pg == opg);
302 }
303
304 static void
305 uvm_page_init_bucket(struct pgfreelist *pgfl, struct pgflbucket *pgb, int num)
306 {
307 int i;
308
309 pgb->pgb_nfree = 0;
310 for (i = 0; i < uvmexp.ncolors; i++) {
311 LIST_INIT(&pgb->pgb_colors[i]);
312 }
313 pgfl->pgfl_buckets[num] = pgb;
314 }
315
316 /*
317 * uvm_page_init: init the page system. called from uvm_init().
318 *
319 * => we return the range of kernel virtual memory in kvm_startp/kvm_endp
320 */
321
322 void
323 uvm_page_init(vaddr_t *kvm_startp, vaddr_t *kvm_endp)
324 {
325 static struct uvm_cpu boot_cpu __cacheline_aligned;
326 psize_t freepages, pagecount, bucketsize, n;
327 struct pgflbucket *pgb;
328 struct vm_page *pagearray;
329 char *bucketarray;
330 uvm_physseg_t bank;
331 int fl, b;
332
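	/* we are called early in boot, before any secondary CPUs have attached */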
333 KASSERT(ncpu <= 1);
334
335 /*
336 * init the page queues and free page queue locks, except the
337 * free list; we allocate that later (with the initial vm_page
338 * structures).
339 */
340
341 curcpu()->ci_data.cpu_uvm = &boot_cpu;
342 uvmpdpol_init();
343 for (b = 0; b < __arraycount(uvm_freelist_locks); b++) {
344 mutex_init(&uvm_freelist_locks[b].lock, MUTEX_DEFAULT, IPL_VM);
345 }
346
347 /*
348 * allocate vm_page structures.
349 */
350
351 /*
352 * sanity check:
353 * before calling this function the MD code is expected to register
354 * some free RAM with the uvm_page_physload() function. our job
355 * now is to allocate vm_page structures for this memory.
356 */
357
358 if (uvm_physseg_get_last() == UVM_PHYSSEG_TYPE_INVALID)
359 panic("uvm_page_bootstrap: no memory pre-allocated");
360
361 /*
362 * first calculate the number of free pages...
363 *
364 * note that we use start/end rather than avail_start/avail_end.
365 * this allows us to allocate extra vm_page structures in case we
366 * want to return some memory to the pool after booting.
367 */
368
369 freepages = 0;
370
371 for (bank = uvm_physseg_get_first();
372 uvm_physseg_valid_p(bank) ;
373 bank = uvm_physseg_get_next(bank)) {
374 freepages += (uvm_physseg_get_end(bank) - uvm_physseg_get_start(bank));
375 }
376
377 /*
378 * Let MD code initialize the number of colors, or default
379 * to 1 color if MD code doesn't care.
380 */
381 if (uvmexp.ncolors == 0)
382 uvmexp.ncolors = 1;
383 uvmexp.colormask = uvmexp.ncolors - 1;
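	/* ncolors must be a power of two; e.g. ncolors == 8 gives colormask == 0x7 */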
384 KASSERT((uvmexp.colormask & uvmexp.ncolors) == 0);
385
386 /* We always start with only 1 bucket. */
387 uvm.bucketcount = 1;
388
389 /*
390 * we now know we have (PAGE_SIZE * freepages) bytes of memory we can
391 * use. for each page of memory we use we need a vm_page structure.
392 * thus, the total number of pages we can use is the total size of
393 * the memory divided by the PAGE_SIZE plus the size of the vm_page
394 * structure. we add one to freepages as a fudge factor to avoid
395 * truncation errors (since we can only allocate in terms of whole
396 * pages).
397 */
398 pagecount = ((freepages + 1) << PAGE_SHIFT) /
399 (PAGE_SIZE + sizeof(struct vm_page));
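	/*
	 * e.g. with 4 KiB pages and a vm_page of roughly 128 bytes (the exact
	 * size varies by port), pagecount comes out about 3% below freepages;
	 * the difference is consumed by the vm_page array itself.
	 */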
400 bucketsize = offsetof(struct pgflbucket, pgb_colors[uvmexp.ncolors]);
401 bucketsize = roundup2(bucketsize, coherency_unit);
402 bucketarray = (void *)uvm_pageboot_alloc(
403 bucketsize * VM_NFREELIST +
404 pagecount * sizeof(struct vm_page));
405 pagearray = (struct vm_page *)
406 (bucketarray + bucketsize * VM_NFREELIST);
407
408 for (fl = 0; fl < VM_NFREELIST; fl++) {
409 pgb = (struct pgflbucket *)(bucketarray + bucketsize * fl);
410 uvm_page_init_bucket(&uvm.page_free[fl], pgb, 0);
411 }
412 memset(pagearray, 0, pagecount * sizeof(struct vm_page));
413
414 /*
415 * init the freelist cache in the disabled state.
416 */
417 uvm_pgflcache_init();
418
419 /*
420 * init the vm_page structures and put them in the correct place.
421 */
   422	/* First init the slab */
423
424 for (bank = uvm_physseg_get_first(),
425 uvm_physseg_seg_chomp_slab(bank, pagearray, pagecount);
426 uvm_physseg_valid_p(bank);
427 bank = uvm_physseg_get_next(bank)) {
428
429 n = uvm_physseg_get_end(bank) - uvm_physseg_get_start(bank);
430 uvm_physseg_seg_alloc_from_slab(bank, n);
431 uvm_physseg_init_seg(bank, pagearray);
432
433 /* set up page array pointers */
434 pagearray += n;
435 pagecount -= n;
436 }
437
438 /*
439 * pass up the values of virtual_space_start and
440 * virtual_space_end (obtained by uvm_pageboot_alloc) to the upper
441 * layers of the VM.
442 */
443
444 *kvm_startp = round_page(virtual_space_start);
445 *kvm_endp = trunc_page(virtual_space_end);
446 #ifdef DEBUG
447 /*
448 * steal kva for uvm_pagezerocheck().
449 */
450 uvm_zerocheckkva = *kvm_startp;
451 *kvm_startp += PAGE_SIZE;
452 mutex_init(&uvm_zerochecklock, MUTEX_DEFAULT, IPL_VM);
453 #endif /* DEBUG */
454
455 /*
456 * init various thresholds.
457 */
458
459 uvmexp.reserve_pagedaemon = 1;
460 uvmexp.reserve_kernel = vm_page_reserve_kernel;
461
462 /*
463 * done!
464 */
465
466 uvm.page_init_done = true;
467 }
468
469 /*
470 * uvm_pgfl_lock: lock all freelist buckets
471 */
472
473 void
474 uvm_pgfl_lock(void)
475 {
476 int i;
477
478 for (i = 0; i < __arraycount(uvm_freelist_locks); i++) {
479 mutex_spin_enter(&uvm_freelist_locks[i].lock);
480 }
481 }
482
483 /*
484 * uvm_pgfl_unlock: unlock all freelist buckets
485 */
486
487 void
488 uvm_pgfl_unlock(void)
489 {
490 int i;
491
492 for (i = 0; i < __arraycount(uvm_freelist_locks); i++) {
493 mutex_spin_exit(&uvm_freelist_locks[i].lock);
494 }
495 }
496
497 /*
498 * uvm_setpagesize: set the page size
499 *
   500	 * => sets uvmexp.pageshift and uvmexp.pagemask from uvmexp.pagesize.
501 */
502
503 void
504 uvm_setpagesize(void)
505 {
506
507 /*
508 * If uvmexp.pagesize is 0 at this point, we expect PAGE_SIZE
509 * to be a constant (indicated by being a non-zero value).
510 */
511 if (uvmexp.pagesize == 0) {
512 if (PAGE_SIZE == 0)
513 panic("uvm_setpagesize: uvmexp.pagesize not set");
514 uvmexp.pagesize = PAGE_SIZE;
515 }
516 uvmexp.pagemask = uvmexp.pagesize - 1;
517 if ((uvmexp.pagemask & uvmexp.pagesize) != 0)
518 panic("uvm_setpagesize: page size %u (%#x) not a power of two",
519 uvmexp.pagesize, uvmexp.pagesize);
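	/* derive pageshift: the smallest shift with (1 << shift) == pagesize */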
520 for (uvmexp.pageshift = 0; ; uvmexp.pageshift++)
521 if ((1 << uvmexp.pageshift) == uvmexp.pagesize)
522 break;
523 }
524
525 /*
526 * uvm_pageboot_alloc: steal memory from physmem for bootstrapping
527 */
528
529 vaddr_t
530 uvm_pageboot_alloc(vsize_t size)
531 {
532 static bool initialized = false;
533 vaddr_t addr;
534 #if !defined(PMAP_STEAL_MEMORY)
535 vaddr_t vaddr;
536 paddr_t paddr;
537 #endif
538
539 /*
540 * on first call to this function, initialize ourselves.
541 */
542 if (initialized == false) {
543 pmap_virtual_space(&virtual_space_start, &virtual_space_end);
544
545 /* round it the way we like it */
546 virtual_space_start = round_page(virtual_space_start);
547 virtual_space_end = trunc_page(virtual_space_end);
548
549 initialized = true;
550 }
551
552 /* round to page size */
553 size = round_page(size);
554 uvmexp.bootpages += atop(size);
555
556 #if defined(PMAP_STEAL_MEMORY)
557
558 /*
559 * defer bootstrap allocation to MD code (it may want to allocate
560 * from a direct-mapped segment). pmap_steal_memory should adjust
561 * virtual_space_start/virtual_space_end if necessary.
562 */
563
564 addr = pmap_steal_memory(size, &virtual_space_start,
565 &virtual_space_end);
566
567 return(addr);
568
569 #else /* !PMAP_STEAL_MEMORY */
570
571 /*
572 * allocate virtual memory for this request
573 */
574 if (virtual_space_start == virtual_space_end ||
575 (virtual_space_end - virtual_space_start) < size)
576 panic("uvm_pageboot_alloc: out of virtual space");
577
578 addr = virtual_space_start;
579
580 #ifdef PMAP_GROWKERNEL
581 /*
582 * If the kernel pmap can't map the requested space,
583 * then allocate more resources for it.
584 */
585 if (uvm_maxkaddr < (addr + size)) {
586 uvm_maxkaddr = pmap_growkernel(addr + size);
587 if (uvm_maxkaddr < (addr + size))
588 panic("uvm_pageboot_alloc: pmap_growkernel() failed");
589 }
590 #endif
591
592 virtual_space_start += size;
593
594 /*
595 * allocate and mapin physical pages to back new virtual pages
596 */
597
598 for (vaddr = round_page(addr) ; vaddr < addr + size ;
599 vaddr += PAGE_SIZE) {
600
601 if (!uvm_page_physget(&paddr))
602 panic("uvm_pageboot_alloc: out of memory");
603
604 /*
605 * Note this memory is no longer managed, so using
606 * pmap_kenter is safe.
607 */
608 pmap_kenter_pa(vaddr, paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
609 }
610 pmap_update(pmap_kernel());
611 return(addr);
612 #endif /* PMAP_STEAL_MEMORY */
613 }
614
615 #if !defined(PMAP_STEAL_MEMORY)
616 /*
617 * uvm_page_physget: "steal" one page from the vm_physmem structure.
618 *
619 * => attempt to allocate it off the end of a segment in which the "avail"
620 * values match the start/end values. if we can't do that, then we
621 * will advance both values (making them equal, and removing some
622 * vm_page structures from the non-avail area).
623 * => return false if out of memory.
624 */
625
626 /* subroutine: try to allocate from memory chunks on the specified freelist */
627 static bool uvm_page_physget_freelist(paddr_t *, int);
628
629 static bool
630 uvm_page_physget_freelist(paddr_t *paddrp, int freelist)
631 {
632 uvm_physseg_t lcv;
633
634 /* pass 1: try allocating from a matching end */
635 #if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
636 for (lcv = uvm_physseg_get_last(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_prev(lcv))
637 #else
638 for (lcv = uvm_physseg_get_first(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_next(lcv))
639 #endif
640 {
641 if (uvm.page_init_done == true)
642 panic("uvm_page_physget: called _after_ bootstrap");
643
644 /* Try to match at front or back on unused segment */
645 if (uvm_page_physunload(lcv, freelist, paddrp))
646 return true;
647 }
648
   649	/* pass 2: forget about matching ends, just allocate something */
650 #if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
651 for (lcv = uvm_physseg_get_last(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_prev(lcv))
652 #else
653 for (lcv = uvm_physseg_get_first(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_next(lcv))
654 #endif
655 {
656 /* Try the front regardless. */
657 if (uvm_page_physunload_force(lcv, freelist, paddrp))
658 return true;
659 }
660 return false;
661 }
662
663 bool
664 uvm_page_physget(paddr_t *paddrp)
665 {
666 int i;
667
668 /* try in the order of freelist preference */
669 for (i = 0; i < VM_NFREELIST; i++)
670 if (uvm_page_physget_freelist(paddrp, i) == true)
671 return (true);
672 return (false);
673 }
674 #endif /* PMAP_STEAL_MEMORY */
675
676 /*
677 * PHYS_TO_VM_PAGE: find vm_page for a PA. used by MI code to get vm_pages
678 * back from an I/O mapping (ugh!). used in some MD code as well.
679 */
680 struct vm_page *
681 uvm_phys_to_vm_page(paddr_t pa)
682 {
683 paddr_t pf = atop(pa);
684 paddr_t off;
685 uvm_physseg_t upm;
686
687 upm = uvm_physseg_find(pf, &off);
688 if (upm != UVM_PHYSSEG_TYPE_INVALID)
689 return uvm_physseg_get_pg(upm, off);
690 return(NULL);
691 }
692
693 paddr_t
694 uvm_vm_page_to_phys(const struct vm_page *pg)
695 {
696
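	/*
	 * the bits of phys_addr below the page size are borrowed to store the
	 * page's free list and bucket index, so mask them off here.
	 */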
697 return pg->phys_addr & ~(PAGE_SIZE - 1);
698 }
699
700 /*
701 * uvm_page_numa_load: load NUMA range description.
702 */
703 void
704 uvm_page_numa_load(paddr_t start, paddr_t size, u_int numa_id)
705 {
706 struct uvm_page_numa_region *d;
707
708 KASSERT(numa_id < PGFL_MAX_BUCKETS);
709
710 d = kmem_alloc(sizeof(*d), KM_SLEEP);
711 d->start = start;
712 d->size = size;
713 d->numa_id = numa_id;
714 d->next = uvm_page_numa_region;
715 uvm_page_numa_region = d;
716 }
717
718 /*
719 * uvm_page_numa_lookup: lookup NUMA node for the given page.
720 */
721 static u_int
722 uvm_page_numa_lookup(struct vm_page *pg)
723 {
724 struct uvm_page_numa_region *d;
725 static bool warned;
726 paddr_t pa;
727
728 KASSERT(uvm_page_numa_region != NULL);
729
730 pa = VM_PAGE_TO_PHYS(pg);
731 for (d = uvm_page_numa_region; d != NULL; d = d->next) {
732 if (pa >= d->start && pa < d->start + d->size) {
733 return d->numa_id;
734 }
735 }
736
737 if (!warned) {
738 printf("uvm_page_numa_lookup: failed, first pg=%p pa=%#"
739 PRIxPADDR "\n", pg, VM_PAGE_TO_PHYS(pg));
740 warned = true;
741 }
742
743 return 0;
744 }
745
746 /*
747 * uvm_page_redim: adjust freelist dimensions if they have changed.
748 */
749
750 static void
751 uvm_page_redim(int newncolors, int newnbuckets)
752 {
753 struct pgfreelist npgfl;
754 struct pgflbucket *opgb, *npgb;
755 struct pgflist *ohead, *nhead;
756 struct vm_page *pg;
757 size_t bucketsize, bucketmemsize, oldbucketmemsize;
758 int fl, ob, oc, nb, nc, obuckets, ocolors;
759 char *bucketarray, *oldbucketmem, *bucketmem;
760
761 KASSERT(((newncolors - 1) & newncolors) == 0);
762
763 /* Anything to do? */
764 if (newncolors <= uvmexp.ncolors &&
765 newnbuckets == uvm.bucketcount) {
766 return;
767 }
768 if (uvm.page_init_done == false) {
769 uvmexp.ncolors = newncolors;
770 return;
771 }
772
773 bucketsize = offsetof(struct pgflbucket, pgb_colors[newncolors]);
774 bucketsize = roundup2(bucketsize, coherency_unit);
775 bucketmemsize = bucketsize * newnbuckets * VM_NFREELIST +
776 coherency_unit - 1;
777 bucketmem = kmem_zalloc(bucketmemsize, KM_SLEEP);
778 bucketarray = (char *)roundup2((uintptr_t)bucketmem, coherency_unit);
779
780 ocolors = uvmexp.ncolors;
781 obuckets = uvm.bucketcount;
782
   783	/* Freelist cache mustn't be enabled. */
784 uvm_pgflcache_pause();
785
786 /* Make sure we should still do this. */
787 uvm_pgfl_lock();
788 if (newncolors <= uvmexp.ncolors &&
789 newnbuckets == uvm.bucketcount) {
790 uvm_pgfl_unlock();
791 uvm_pgflcache_resume();
792 kmem_free(bucketmem, bucketmemsize);
793 return;
794 }
795
796 uvmexp.ncolors = newncolors;
797 uvmexp.colormask = uvmexp.ncolors - 1;
798 uvm.bucketcount = newnbuckets;
799
800 for (fl = 0; fl < VM_NFREELIST; fl++) {
801 /* Init new buckets in new freelist. */
802 memset(&npgfl, 0, sizeof(npgfl));
803 for (nb = 0; nb < newnbuckets; nb++) {
804 npgb = (struct pgflbucket *)bucketarray;
805 uvm_page_init_bucket(&npgfl, npgb, nb);
806 bucketarray += bucketsize;
807 }
808 /* Now transfer pages from the old freelist. */
809 for (nb = ob = 0; ob < obuckets; ob++) {
810 opgb = uvm.page_free[fl].pgfl_buckets[ob];
811 for (oc = 0; oc < ocolors; oc++) {
812 ohead = &opgb->pgb_colors[oc];
813 while ((pg = LIST_FIRST(ohead)) != NULL) {
814 LIST_REMOVE(pg, pageq.list);
815 /*
816 * Here we decide on the NEW color &
817 * bucket for the page. For NUMA
818 * we'll use the info that the
819 * hardware gave us. For non-NUMA
   820					 * we take the physical page frame
821 * number and cache color into
822 * account. We do this to try and
823 * avoid defeating any memory
824 * interleaving in the hardware.
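				 * For example, with 64 colors and
				 * two buckets, physical memory ends
				 * up striped across the buckets in
				 * runs of 64 * 8 = 512 pages.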
825 */
826 KASSERT(
827 uvm_page_get_bucket(pg) == ob);
828 KASSERT(fl ==
829 uvm_page_get_freelist(pg));
830 if (uvm_page_numa_region != NULL) {
831 nb = uvm_page_numa_lookup(pg);
832 } else {
833 nb = atop(VM_PAGE_TO_PHYS(pg))
834 / uvmexp.ncolors / 8
835 % newnbuckets;
836 }
837 uvm_page_set_bucket(pg, nb);
838 npgb = npgfl.pgfl_buckets[nb];
839 npgb->pgb_nfree++;
840 nc = VM_PGCOLOR(pg);
841 nhead = &npgb->pgb_colors[nc];
842 LIST_INSERT_HEAD(nhead, pg, pageq.list);
843 }
844 }
845 }
846 /* Install the new freelist. */
847 memcpy(&uvm.page_free[fl], &npgfl, sizeof(npgfl));
848 }
849
850 /* Unlock and free the old memory. */
851 oldbucketmemsize = recolored_pages_memsize;
852 oldbucketmem = recolored_pages_mem;
853 recolored_pages_memsize = bucketmemsize;
854 recolored_pages_mem = bucketmem;
855
856 uvm_pgfl_unlock();
857 uvm_pgflcache_resume();
858
859 if (oldbucketmemsize) {
860 kmem_free(oldbucketmem, oldbucketmemsize);
861 }
862
863 /*
864 * this calls uvm_km_alloc() which may want to hold
865 * uvm_freelist_lock.
866 */
867 uvm_pager_realloc_emerg();
868 }
869
870 /*
871 * uvm_page_recolor: Recolor the pages if the new color count is
872 * larger than the old one.
873 */
874
875 void
876 uvm_page_recolor(int newncolors)
877 {
878
879 uvm_page_redim(newncolors, uvm.bucketcount);
880 }
881
882 /*
883 * uvm_page_rebucket: Determine a bucket structure and redim the free
884 * lists to match.
885 */
886
887 void
888 uvm_page_rebucket(void)
889 {
890 u_int min_numa, max_numa, npackage, shift;
891 struct cpu_info *ci, *ci2, *ci3;
892 CPU_INFO_ITERATOR cii;
893
894 /*
895 * If we have more than one NUMA node, and the maximum NUMA node ID
896 * is less than PGFL_MAX_BUCKETS, then we'll use NUMA distribution
897 * for free pages.
898 */
899 min_numa = (u_int)-1;
900 max_numa = 0;
901 for (CPU_INFO_FOREACH(cii, ci)) {
902 if (ci->ci_numa_id < min_numa) {
903 min_numa = ci->ci_numa_id;
904 }
905 if (ci->ci_numa_id > max_numa) {
906 max_numa = ci->ci_numa_id;
907 }
908 }
909 if (min_numa != max_numa && max_numa < PGFL_MAX_BUCKETS) {
910 aprint_debug("UVM: using NUMA allocation scheme\n");
911 for (CPU_INFO_FOREACH(cii, ci)) {
912 ci->ci_data.cpu_uvm->pgflbucket = ci->ci_numa_id;
913 }
914 uvm_page_redim(uvmexp.ncolors, max_numa + 1);
915 return;
916 }
917
918 /*
919 * Otherwise we'll go with a scheme to maximise L2/L3 cache locality
920 * and minimise lock contention. Count the total number of CPU
921 * packages, and then try to distribute the buckets among CPU
922 * packages evenly.
923 */
924 npackage = curcpu()->ci_nsibling[CPUREL_PACKAGE1ST];
925
926 /*
927 * Figure out how to arrange the packages & buckets, and the total
928 * number of buckets we need. XXX 2 may not be the best factor.
929 */
930 for (shift = 0; npackage > PGFL_MAX_BUCKETS; shift++) {
931 npackage >>= 1;
932 }
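	/* each bucket will now serve (1 << shift) CPU packages */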
933 uvm_page_redim(uvmexp.ncolors, npackage);
934
935 /*
936 * Now tell each CPU which bucket to use. In the outer loop, scroll
937 * through all CPU packages.
938 */
939 npackage = 0;
940 ci = curcpu();
941 ci2 = ci->ci_sibling[CPUREL_PACKAGE1ST];
942 do {
943 /*
944 * In the inner loop, scroll through all CPUs in the package
945 * and assign the same bucket ID.
946 */
947 ci3 = ci2;
948 do {
949 ci3->ci_data.cpu_uvm->pgflbucket = npackage >> shift;
950 ci3 = ci3->ci_sibling[CPUREL_PACKAGE];
951 } while (ci3 != ci2);
952 npackage++;
953 ci2 = ci2->ci_sibling[CPUREL_PACKAGE1ST];
954 } while (ci2 != ci->ci_sibling[CPUREL_PACKAGE1ST]);
955
956 aprint_debug("UVM: using package allocation scheme, "
957 "%d package(s) per bucket\n", 1 << shift);
958 }
959
960 /*
961 * uvm_cpu_attach: initialize per-CPU data structures.
962 */
963
964 void
965 uvm_cpu_attach(struct cpu_info *ci)
966 {
967 struct uvm_cpu *ucpu;
968
   969	/* The boot CPU's per-CPU data was already set up in uvm_page_init(). */
970 if (!CPU_IS_PRIMARY(ci)) {
971 /* Add more reserve pages for this CPU. */
972 uvmexp.reserve_kernel += vm_page_reserve_kernel;
973
974 /* Allocate per-CPU data structures. */
975 ucpu = kmem_zalloc(sizeof(struct uvm_cpu) + coherency_unit - 1,
976 KM_SLEEP);
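		/* over-allocated above so the structure can be cache line aligned here */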
977 ucpu = (struct uvm_cpu *)roundup2((uintptr_t)ucpu,
978 coherency_unit);
979 ci->ci_data.cpu_uvm = ucpu;
980 } else {
981 ucpu = ci->ci_data.cpu_uvm;
982 }
983
984 uvmpdpol_init_cpu(ucpu);
985
986 /*
987 * Attach RNG source for this CPU's VM events
988 */
989 rnd_attach_source(&ucpu->rs, ci->ci_data.cpu_name, RND_TYPE_VM,
990 RND_FLAG_COLLECT_TIME|RND_FLAG_COLLECT_VALUE|
991 RND_FLAG_ESTIMATE_VALUE);
992 }
993
994 /*
995 * uvm_availmem: fetch the total amount of free memory in pages. this can
996 * have a detrimental effect on performance due to false sharing; don't call
997 * unless needed.
998 *
999 * some users can request the amount of free memory so often that it begins
1000 * to impact upon performance. if calling frequently and an inexact value
1001 * is okay, call with cached = true.
1002 */
1003
1004 int
1005 uvm_availmem(bool cached)
1006 {
1007 int64_t fp;
1008
1009 cpu_count_sync(cached);
1010 if ((fp = cpu_count_get(CPU_COUNT_FREEPAGES)) < 0) {
1011 /*
  1012		 * XXXAD: this can briefly go negative because it's impossible
  1013		 * to get a clean snapshot.  address this for other counters
  1014		 * used as running totals before NetBSD 10, although it is less
  1015		 * important for those.
1016 */
1017 fp = 0;
1018 }
1019 return (int)fp;
1020 }
1021
1022 /*
1023 * uvm_pagealloc_pgb: helper routine that tries to allocate any color from a
1024 * specific freelist and specific bucket only.
1025 *
1026 * => must be at IPL_VM or higher to protect per-CPU data structures.
1027 */
1028
1029 static struct vm_page *
1030 uvm_pagealloc_pgb(struct uvm_cpu *ucpu, int f, int b, int *trycolorp, int flags)
1031 {
1032 int c, trycolor, colormask;
1033 struct pgflbucket *pgb;
1034 struct vm_page *pg;
1035 kmutex_t *lock;
1036 bool fill;
1037
1038 /*
1039 * Skip the bucket if empty, no lock needed. There could be many
1040 * empty freelists/buckets.
1041 */
1042 pgb = uvm.page_free[f].pgfl_buckets[b];
1043 if (pgb->pgb_nfree == 0) {
1044 return NULL;
1045 }
1046
1047 /* Skip bucket if low on memory. */
1048 lock = &uvm_freelist_locks[b].lock;
1049 mutex_spin_enter(lock);
1050 if (__predict_false(pgb->pgb_nfree <= uvmexp.reserve_kernel)) {
1051 if ((flags & UVM_PGA_USERESERVE) == 0 ||
1052 (pgb->pgb_nfree <= uvmexp.reserve_pagedaemon &&
1053 curlwp != uvm.pagedaemon_lwp)) {
1054 mutex_spin_exit(lock);
1055 return NULL;
1056 }
1057 fill = false;
1058 } else {
1059 fill = true;
1060 }
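	/*
	 * "fill" records whether we are above the kernel reserve; only then
	 * is it OK to also top up the per-CPU free page cache below.
	 */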
1061
1062 /* Try all page colors as needed. */
1063 c = trycolor = *trycolorp;
1064 colormask = uvmexp.colormask;
1065 do {
1066 pg = LIST_FIRST(&pgb->pgb_colors[c]);
1067 if (__predict_true(pg != NULL)) {
1068 /*
1069 * Got a free page! PG_FREE must be cleared under
1070 * lock because of uvm_pglistalloc().
1071 */
1072 LIST_REMOVE(pg, pageq.list);
1073 KASSERT(pg->flags == PG_FREE);
1074 pg->flags = PG_BUSY | PG_CLEAN | PG_FAKE;
1075 pgb->pgb_nfree--;
1076
1077 /*
1078 * While we have the bucket locked and our data
1079 * structures fresh in L1 cache, we have an ideal
1080 * opportunity to grab some pages for the freelist
1081 * cache without causing extra contention. Only do
1082 * so if we found pages in this CPU's preferred
1083 * bucket.
1084 */
1085 if (__predict_true(b == ucpu->pgflbucket && fill)) {
1086 uvm_pgflcache_fill(ucpu, f, b, c);
1087 }
1088 mutex_spin_exit(lock);
1089 KASSERT(uvm_page_get_bucket(pg) == b);
1090 CPU_COUNT(c == trycolor ?
1091 CPU_COUNT_COLORHIT : CPU_COUNT_COLORMISS, 1);
1092 CPU_COUNT(CPU_COUNT_CPUMISS, 1);
1093 *trycolorp = c;
1094 return pg;
1095 }
1096 c = (c + 1) & colormask;
1097 } while (c != trycolor);
1098 mutex_spin_exit(lock);
1099
1100 return NULL;
1101 }
1102
1103 /*
1104 * uvm_pagealloc_pgfl: helper routine for uvm_pagealloc_strat that allocates
1105 * any color from any bucket, in a specific freelist.
1106 *
1107 * => must be at IPL_VM or higher to protect per-CPU data structures.
1108 */
1109
1110 static struct vm_page *
1111 uvm_pagealloc_pgfl(struct uvm_cpu *ucpu, int f, int *trycolorp, int flags)
1112 {
1113 int b, trybucket, bucketcount;
1114 struct vm_page *pg;
1115
1116 /* Try for the exact thing in the per-CPU cache. */
1117 if ((pg = uvm_pgflcache_alloc(ucpu, f, *trycolorp)) != NULL) {
1118 CPU_COUNT(CPU_COUNT_CPUHIT, 1);
1119 CPU_COUNT(CPU_COUNT_COLORHIT, 1);
1120 return pg;
1121 }
1122
1123 /* Walk through all buckets, trying our preferred bucket first. */
1124 trybucket = ucpu->pgflbucket;
1125 b = trybucket;
1126 bucketcount = uvm.bucketcount;
1127 do {
1128 pg = uvm_pagealloc_pgb(ucpu, f, b, trycolorp, flags);
1129 if (pg != NULL) {
1130 return pg;
1131 }
1132 b = (b + 1 == bucketcount ? 0 : b + 1);
1133 } while (b != trybucket);
1134
1135 return NULL;
1136 }
1137
1138 /*
1139 * uvm_pagealloc_strat: allocate vm_page from a particular free list.
1140 *
1141 * => return null if no pages free
1142 * => wake up pagedaemon if number of free pages drops below low water mark
1143 * => if obj != NULL, obj must be locked (to put in obj's tree)
1144 * => if anon != NULL, anon must be locked (to put in anon)
1145 * => only one of obj or anon can be non-null
1146 * => caller must activate/deactivate page if it is not wired.
1147 * => free_list is ignored if strat == UVM_PGA_STRAT_NORMAL.
1148 * => policy decision: it is more important to pull a page off of the
1149 * appropriate priority free list than it is to get a page from the
1150 * correct bucket or color bin. This is because we live with the
1151 * consequences of a bad free list decision for the entire
1152 * lifetime of the page, e.g. if the page comes from memory that
1153 * is slower to access.
1154 */
1155
1156 struct vm_page *
1157 uvm_pagealloc_strat(struct uvm_object *obj, voff_t off, struct vm_anon *anon,
1158 int flags, int strat, int free_list)
1159 {
1160 int color, lcv, error, s;
1161 struct uvm_cpu *ucpu;
1162 struct vm_page *pg;
1163 lwp_t *l;
1164
1165 KASSERT(obj == NULL || anon == NULL);
1166 KASSERT(anon == NULL || (flags & UVM_FLAG_COLORMATCH) || off == 0);
1167 KASSERT(off == trunc_page(off));
1168 KASSERT(obj == NULL || rw_write_held(obj->vmobjlock));
1169 KASSERT(anon == NULL || anon->an_lock == NULL ||
1170 rw_write_held(anon->an_lock));
1171
1172 /*
  1173	 * This implements a per-CPU round-robin page coloring
1174 * algorithm.
1175 */
1176
1177 s = splvm();
1178 ucpu = curcpu()->ci_data.cpu_uvm;
1179 if (flags & UVM_FLAG_COLORMATCH) {
1180 color = atop(off) & uvmexp.colormask;
1181 } else {
1182 color = ucpu->pgflcolor;
1183 }
1184
1185 /*
1186 * fail if any of these conditions is true:
1187 * [1] there really are no free pages, or
1188 * [2] only kernel "reserved" pages remain and
  1189	 *        reserved pages have not been requested, or
1190 * [3] only pagedaemon "reserved" pages remain and
1191 * the requestor isn't the pagedaemon.
1192 * we make kernel reserve pages available if called by a
1193 * kernel thread.
1194 */
1195 l = curlwp;
1196 if (__predict_true(l != NULL) && (l->l_flag & LW_SYSTEM) != 0) {
1197 flags |= UVM_PGA_USERESERVE;
1198 }
1199
1200 again:
1201 switch (strat) {
1202 case UVM_PGA_STRAT_NORMAL:
1203 /* Check freelists: descending priority (ascending id) order. */
1204 for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
1205 pg = uvm_pagealloc_pgfl(ucpu, lcv, &color, flags);
1206 if (pg != NULL) {
1207 goto gotit;
1208 }
1209 }
1210
1211 /* No pages free! Have pagedaemon free some memory. */
1212 splx(s);
1213 uvm_kick_pdaemon();
1214 return NULL;
1215
1216 case UVM_PGA_STRAT_ONLY:
1217 case UVM_PGA_STRAT_FALLBACK:
1218 /* Attempt to allocate from the specified free list. */
1219 KASSERT(free_list >= 0 && free_list < VM_NFREELIST);
1220 pg = uvm_pagealloc_pgfl(ucpu, free_list, &color, flags);
1221 if (pg != NULL) {
1222 goto gotit;
1223 }
1224
1225 /* Fall back, if possible. */
1226 if (strat == UVM_PGA_STRAT_FALLBACK) {
1227 strat = UVM_PGA_STRAT_NORMAL;
1228 goto again;
1229 }
1230
1231 /* No pages free! Have pagedaemon free some memory. */
1232 splx(s);
1233 uvm_kick_pdaemon();
1234 return NULL;
1235
1236 case UVM_PGA_STRAT_NUMA:
1237 /*
1238 * NUMA strategy (experimental): allocating from the correct
1239 * bucket is more important than observing freelist
1240 * priority. Look only to the current NUMA node; if that
1241 * fails, we need to look to other NUMA nodes, so retry with
1242 * the normal strategy.
1243 */
1244 for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
1245 pg = uvm_pgflcache_alloc(ucpu, lcv, color);
1246 if (pg != NULL) {
1247 CPU_COUNT(CPU_COUNT_CPUHIT, 1);
1248 CPU_COUNT(CPU_COUNT_COLORHIT, 1);
1249 goto gotit;
1250 }
1251 pg = uvm_pagealloc_pgb(ucpu, lcv,
1252 ucpu->pgflbucket, &color, flags);
1253 if (pg != NULL) {
1254 goto gotit;
1255 }
1256 }
1257 strat = UVM_PGA_STRAT_NORMAL;
1258 goto again;
1259
1260 default:
1261 panic("uvm_pagealloc_strat: bad strat %d", strat);
1262 /* NOTREACHED */
1263 }
1264
1265 gotit:
1266 /*
1267 * We now know which color we actually allocated from; set
1268 * the next color accordingly.
1269 */
1270
1271 ucpu->pgflcolor = (color + 1) & uvmexp.colormask;
1272
1273 /*
1274 * while still at IPL_VM, update allocation statistics.
1275 */
1276
1277 CPU_COUNT(CPU_COUNT_FREEPAGES, -1);
1278 if (anon) {
1279 CPU_COUNT(CPU_COUNT_ANONCLEAN, 1);
1280 }
1281 splx(s);
1282 KASSERT(pg->flags == (PG_BUSY|PG_CLEAN|PG_FAKE));
1283
1284 /*
1285 * assign the page to the object. as the page was free, we know
1286 * that pg->uobject and pg->uanon are NULL. we only need to take
1287 * the page's interlock if we are changing the values.
1288 */
1289 if (anon != NULL || obj != NULL) {
1290 mutex_enter(&pg->interlock);
1291 }
1292 pg->offset = off;
1293 pg->uobject = obj;
1294 pg->uanon = anon;
1295 KASSERT(uvm_page_owner_locked_p(pg, true));
1296 if (anon) {
1297 anon->an_page = pg;
1298 pg->flags |= PG_ANON;
1299 mutex_exit(&pg->interlock);
1300 } else if (obj) {
1301 /*
1302 * set PG_FILE|PG_AOBJ before the first uvm_pageinsert.
1303 */
1304 if (UVM_OBJ_IS_VNODE(obj)) {
1305 pg->flags |= PG_FILE;
1306 } else if (UVM_OBJ_IS_AOBJ(obj)) {
1307 pg->flags |= PG_AOBJ;
1308 }
1309 uvm_pageinsert_object(obj, pg);
1310 mutex_exit(&pg->interlock);
1311 error = uvm_pageinsert_tree(obj, pg);
1312 if (error != 0) {
1313 mutex_enter(&pg->interlock);
1314 uvm_pageremove_object(obj, pg);
1315 mutex_exit(&pg->interlock);
1316 uvm_pagefree(pg);
1317 return NULL;
1318 }
1319 }
1320
1321 #if defined(UVM_PAGE_TRKOWN)
1322 pg->owner_tag = NULL;
1323 #endif
1324 UVM_PAGE_OWN(pg, "new alloc");
1325
1326 if (flags & UVM_PGA_ZERO) {
1327 /* A zero'd page is not clean. */
1328 if (obj != NULL || anon != NULL) {
1329 uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
1330 }
1331 pmap_zero_page(VM_PAGE_TO_PHYS(pg));
1332 }
1333
1334 return(pg);
1335 }
1336
1337 /*
1338 * uvm_pagereplace: replace a page with another
1339 *
1340 * => object must be locked
1341 * => page interlocks must be held
1342 */
1343
1344 void
1345 uvm_pagereplace(struct vm_page *oldpg, struct vm_page *newpg)
1346 {
1347 struct uvm_object *uobj = oldpg->uobject;
1348 struct vm_page *pg __diagused;
1349 uint64_t idx;
1350
1351 KASSERT((oldpg->flags & PG_TABLED) != 0);
1352 KASSERT(uobj != NULL);
1353 KASSERT((newpg->flags & PG_TABLED) == 0);
1354 KASSERT(newpg->uobject == NULL);
1355 KASSERT(rw_write_held(uobj->vmobjlock));
1356 KASSERT(mutex_owned(&oldpg->interlock));
1357 KASSERT(mutex_owned(&newpg->interlock));
1358
1359 newpg->uobject = uobj;
1360 newpg->offset = oldpg->offset;
1361 idx = newpg->offset >> PAGE_SHIFT;
1362 pg = radix_tree_replace_node(&uobj->uo_pages, idx, newpg);
1363 KASSERT(pg == oldpg);
1364 if (((oldpg->flags ^ newpg->flags) & PG_CLEAN) != 0) {
1365 if ((newpg->flags & PG_CLEAN) != 0) {
1366 radix_tree_clear_tag(&uobj->uo_pages, idx,
1367 UVM_PAGE_DIRTY_TAG);
1368 } else {
1369 radix_tree_set_tag(&uobj->uo_pages, idx,
1370 UVM_PAGE_DIRTY_TAG);
1371 }
1372 }
1373 /*
1374 * oldpg's PG_STAT is stable. newpg is not reachable by others yet.
1375 */
  1376	newpg->flags =
  1377	    (newpg->flags & ~PG_STAT) | (oldpg->flags & PG_STAT);
1378 uvm_pageinsert_object(uobj, newpg);
1379 uvm_pageremove_object(uobj, oldpg);
1380 }
1381
1382 /*
1383 * uvm_pagerealloc: reallocate a page from one object to another
1384 *
1385 * => both objects must be locked
1386 */
1387
1388 int
1389 uvm_pagerealloc(struct vm_page *pg, struct uvm_object *newobj, voff_t newoff)
1390 {
1391 int error = 0;
1392
1393 /*
1394 * remove it from the old object
1395 */
1396
1397 if (pg->uobject) {
1398 uvm_pageremove_tree(pg->uobject, pg);
1399 uvm_pageremove_object(pg->uobject, pg);
1400 }
1401
1402 /*
1403 * put it in the new object
1404 */
1405
1406 if (newobj) {
1407 mutex_enter(&pg->interlock);
1408 pg->uobject = newobj;
1409 pg->offset = newoff;
1410 if (UVM_OBJ_IS_VNODE(newobj)) {
1411 pg->flags |= PG_FILE;
1412 } else if (UVM_OBJ_IS_AOBJ(newobj)) {
1413 pg->flags |= PG_AOBJ;
1414 }
1415 uvm_pageinsert_object(newobj, pg);
1416 mutex_exit(&pg->interlock);
1417 error = uvm_pageinsert_tree(newobj, pg);
1418 if (error != 0) {
1419 mutex_enter(&pg->interlock);
1420 uvm_pageremove_object(newobj, pg);
1421 mutex_exit(&pg->interlock);
1422 }
1423 }
1424
1425 return error;
1426 }
1427
1428 #ifdef DEBUG
1429 /*
1430 * check if page is zero-filled
1431 */
1432 void
1433 uvm_pagezerocheck(struct vm_page *pg)
1434 {
1435 int *p, *ep;
1436
1437 KASSERT(uvm_zerocheckkva != 0);
1438
1439 /*
1440 * XXX assuming pmap_kenter_pa and pmap_kremove never call
1441 * uvm page allocator.
1442 *
1443 * it might be better to have "CPU-local temporary map" pmap interface.
1444 */
1445 mutex_spin_enter(&uvm_zerochecklock);
1446 pmap_kenter_pa(uvm_zerocheckkva, VM_PAGE_TO_PHYS(pg), VM_PROT_READ, 0);
1447 p = (int *)uvm_zerocheckkva;
1448 ep = (int *)((char *)p + PAGE_SIZE);
1449 pmap_update(pmap_kernel());
1450 while (p < ep) {
1451 if (*p != 0)
1452 panic("zero page isn't zero-filled");
1453 p++;
1454 }
1455 pmap_kremove(uvm_zerocheckkva, PAGE_SIZE);
1456 mutex_spin_exit(&uvm_zerochecklock);
1457 /*
1458 * pmap_update() is not necessary here because no one except us
1459 * uses this VA.
1460 */
1461 }
1462 #endif /* DEBUG */
1463
1464 /*
1465 * uvm_pagefree: free page
1466 *
1467 * => erase page's identity (i.e. remove from object)
1468 * => put page on free list
1469 * => caller must lock owning object (either anon or uvm_object)
1470 * => assumes all valid mappings of pg are gone
1471 */
1472
1473 void
1474 uvm_pagefree(struct vm_page *pg)
1475 {
1476 struct pgfreelist *pgfl;
1477 struct pgflbucket *pgb;
1478 struct uvm_cpu *ucpu;
1479 kmutex_t *lock;
1480 int bucket, s;
1481 bool locked;
1482
1483 #ifdef DEBUG
1484 if (pg->uobject == (void *)0xdeadbeef &&
1485 pg->uanon == (void *)0xdeadbeef) {
1486 panic("uvm_pagefree: freeing free page %p", pg);
1487 }
1488 #endif /* DEBUG */
1489
1490 KASSERT((pg->flags & PG_PAGEOUT) == 0);
1491 KASSERT(!(pg->flags & PG_FREE));
1492 KASSERT(pg->uobject == NULL || rw_write_held(pg->uobject->vmobjlock));
1493 KASSERT(pg->uobject != NULL || pg->uanon == NULL ||
1494 rw_write_held(pg->uanon->an_lock));
1495
1496 /*
1497 * remove the page from the object's tree before acquiring any page
1498 * interlocks: this can acquire locks to free radixtree nodes.
1499 */
1500 if (pg->uobject != NULL) {
1501 uvm_pageremove_tree(pg->uobject, pg);
1502 }
1503
1504 /*
1505 * if the page is loaned, resolve the loan instead of freeing.
1506 */
1507
1508 if (pg->loan_count) {
1509 KASSERT(pg->wire_count == 0);
1510
1511 /*
1512 * if the page is owned by an anon then we just want to
1513 * drop anon ownership. the kernel will free the page when
1514 * it is done with it. if the page is owned by an object,
1515 * remove it from the object and mark it dirty for the benefit
1516 * of possible anon owners.
1517 *
1518 * regardless of previous ownership, wakeup any waiters,
1519 * unbusy the page, and we're done.
1520 */
1521
1522 uvm_pagelock(pg);
1523 locked = true;
1524 if (pg->uobject != NULL) {
1525 uvm_pageremove_object(pg->uobject, pg);
1526 pg->flags &= ~(PG_FILE|PG_AOBJ);
1527 } else if (pg->uanon != NULL) {
1528 if ((pg->flags & PG_ANON) == 0) {
1529 pg->loan_count--;
1530 } else {
1531 const unsigned status = uvm_pagegetdirty(pg);
1532 pg->flags &= ~PG_ANON;
1533 cpu_count(CPU_COUNT_ANONUNKNOWN + status, -1);
1534 }
1535 pg->uanon->an_page = NULL;
1536 pg->uanon = NULL;
1537 }
1538 if (pg->pqflags & PQ_WANTED) {
1539 wakeup(pg);
1540 }
1541 pg->pqflags &= ~PQ_WANTED;
1542 pg->flags &= ~(PG_BUSY|PG_RELEASED|PG_PAGER1);
1543 #ifdef UVM_PAGE_TRKOWN
1544 pg->owner_tag = NULL;
1545 #endif
1546 KASSERT((pg->flags & PG_STAT) == 0);
1547 if (pg->loan_count) {
1548 KASSERT(pg->uobject == NULL);
1549 if (pg->uanon == NULL) {
1550 uvm_pagedequeue(pg);
1551 }
1552 uvm_pageunlock(pg);
1553 return;
1554 }
1555 } else if (pg->uobject != NULL || pg->uanon != NULL ||
1556 pg->wire_count != 0) {
1557 uvm_pagelock(pg);
1558 locked = true;
1559 } else {
1560 locked = false;
1561 }
1562
1563 /*
1564 * remove page from its object or anon.
1565 */
1566 if (pg->uobject != NULL) {
1567 uvm_pageremove_object(pg->uobject, pg);
1568 } else if (pg->uanon != NULL) {
1569 const unsigned int status = uvm_pagegetdirty(pg);
1570 pg->uanon->an_page = NULL;
1571 pg->uanon = NULL;
1572 cpu_count(CPU_COUNT_ANONUNKNOWN + status, -1);
1573 }
1574
1575 /*
1576 * if the page was wired, unwire it now.
1577 */
1578
1579 if (pg->wire_count) {
1580 pg->wire_count = 0;
1581 atomic_dec_uint(&uvmexp.wired);
1582 }
1583 if (locked) {
1584 /*
1585 * wake anyone waiting on the page.
1586 */
1587 if ((pg->pqflags & PQ_WANTED) != 0) {
1588 pg->pqflags &= ~PQ_WANTED;
1589 wakeup(pg);
1590 }
1591
1592 /*
1593 * now remove the page from the queues.
1594 */
1595 uvm_pagedequeue(pg);
1596 uvm_pageunlock(pg);
1597 } else {
1598 KASSERT(!uvmpdpol_pageisqueued_p(pg));
1599 }
1600
1601 /*
1602 * and put on free queue
1603 */
1604
1605 #ifdef DEBUG
1606 pg->uobject = (void *)0xdeadbeef;
1607 pg->uanon = (void *)0xdeadbeef;
1608 #endif /* DEBUG */
1609
1610 /* Try to send the page to the per-CPU cache. */
1611 s = splvm();
1612 CPU_COUNT(CPU_COUNT_FREEPAGES, 1);
1613 ucpu = curcpu()->ci_data.cpu_uvm;
1614 bucket = uvm_page_get_bucket(pg);
1615 if (bucket == ucpu->pgflbucket && uvm_pgflcache_free(ucpu, pg)) {
1616 splx(s);
1617 return;
1618 }
1619
1620 /* Didn't work. Never mind, send it to a global bucket. */
1621 pgfl = &uvm.page_free[uvm_page_get_freelist(pg)];
1622 pgb = pgfl->pgfl_buckets[bucket];
1623 lock = &uvm_freelist_locks[bucket].lock;
1624
1625 mutex_spin_enter(lock);
1626 /* PG_FREE must be set under lock because of uvm_pglistalloc(). */
1627 pg->flags = PG_FREE;
1628 LIST_INSERT_HEAD(&pgb->pgb_colors[VM_PGCOLOR(pg)], pg, pageq.list);
1629 pgb->pgb_nfree++;
1630 mutex_spin_exit(lock);
1631 splx(s);
1632 }
1633
1634 /*
1635 * uvm_page_unbusy: unbusy an array of pages.
1636 *
1637 * => pages must either all belong to the same object, or all belong to anons.
1638 * => if pages are object-owned, object must be locked.
1639 * => if pages are anon-owned, anons must be locked.
1640 * => caller must make sure that anon-owned pages are not PG_RELEASED.
1641 */
1642
1643 void
1644 uvm_page_unbusy(struct vm_page **pgs, int npgs)
1645 {
1646 struct vm_page *pg;
1647 int i;
1648 UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);
1649
1650 for (i = 0; i < npgs; i++) {
1651 pg = pgs[i];
1652 if (pg == NULL || pg == PGO_DONTCARE) {
1653 continue;
1654 }
1655
1656 KASSERT(uvm_page_owner_locked_p(pg, true));
1657 KASSERT(pg->flags & PG_BUSY);
1658 KASSERT((pg->flags & PG_PAGEOUT) == 0);
1659 if (pg->flags & PG_RELEASED) {
1660 UVMHIST_LOG(ubchist, "releasing pg %#jx",
1661 (uintptr_t)pg, 0, 0, 0);
1662 KASSERT(pg->uobject != NULL ||
1663 (pg->uanon != NULL && pg->uanon->an_ref > 0));
1664 pg->flags &= ~PG_RELEASED;
1665 uvm_pagefree(pg);
1666 } else {
1667 UVMHIST_LOG(ubchist, "unbusying pg %#jx",
1668 (uintptr_t)pg, 0, 0, 0);
1669 KASSERT((pg->flags & PG_FAKE) == 0);
1670 pg->flags &= ~PG_BUSY;
1671 uvm_pagelock(pg);
1672 uvm_pagewakeup(pg);
1673 uvm_pageunlock(pg);
1674 UVM_PAGE_OWN(pg, NULL);
1675 }
1676 }
1677 }
1678
1679 /*
1680 * uvm_pagewait: wait for a busy page
1681 *
1682 * => page must be known PG_BUSY
1683 * => object must be read or write locked
1684 * => object will be unlocked on return
1685 */
1686
1687 void
1688 uvm_pagewait(struct vm_page *pg, krwlock_t *lock, const char *wmesg)
1689 {
1690
1691 KASSERT(rw_lock_held(lock));
1692 KASSERT((pg->flags & PG_BUSY) != 0);
1693 KASSERT(uvm_page_owner_locked_p(pg, false));
1694
1695 mutex_enter(&pg->interlock);
1696 pg->pqflags |= PQ_WANTED;
1697 rw_exit(lock);
1698 UVM_UNLOCK_AND_WAIT(pg, &pg->interlock, false, wmesg, 0);
1699 }
1700
1701 /*
1702 * uvm_pagewakeup: wake anyone waiting on a page
1703 *
1704 * => page interlock must be held
1705 */
1706
1707 void
1708 uvm_pagewakeup(struct vm_page *pg)
1709 {
1710 UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);
1711
1712 KASSERT(mutex_owned(&pg->interlock));
1713
1714 UVMHIST_LOG(ubchist, "waking pg %#jx", (uintptr_t)pg, 0, 0, 0);
1715
1716 if ((pg->pqflags & PQ_WANTED) != 0) {
1717 wakeup(pg);
1718 pg->pqflags &= ~PQ_WANTED;
1719 }
1720 }
1721
1722 /*
1723 * uvm_pagewanted_p: return true if someone is waiting on the page
1724 *
1725 * => object must be write locked (lock out all concurrent access)
1726 */
1727
1728 bool
1729 uvm_pagewanted_p(struct vm_page *pg)
1730 {
1731
1732 KASSERT(uvm_page_owner_locked_p(pg, true));
1733
1734 return (atomic_load_relaxed(&pg->pqflags) & PQ_WANTED) != 0;
1735 }
1736
1737 #if defined(UVM_PAGE_TRKOWN)
1738 /*
1739 * uvm_page_own: set or release page ownership
1740 *
1741 * => this is a debugging function that keeps track of who sets PG_BUSY
1742 * and where they do it. it can be used to track down problems
  1743	 *	such as a process setting "PG_BUSY" and never releasing it.
1744 * => page's object [if any] must be locked
1745 * => if "tag" is NULL then we are releasing page ownership
1746 */
1747 void
1748 uvm_page_own(struct vm_page *pg, const char *tag)
1749 {
1750
1751 KASSERT((pg->flags & (PG_PAGEOUT|PG_RELEASED)) == 0);
1752 KASSERT(uvm_page_owner_locked_p(pg, true));
1753
1754 /* gain ownership? */
1755 if (tag) {
1756 KASSERT((pg->flags & PG_BUSY) != 0);
1757 if (pg->owner_tag) {
1758 printf("uvm_page_own: page %p already owned "
1759 "by proc %d.%d [%s]\n", pg,
1760 pg->owner, pg->lowner, pg->owner_tag);
1761 panic("uvm_page_own");
1762 }
1763 pg->owner = curproc->p_pid;
1764 pg->lowner = curlwp->l_lid;
1765 pg->owner_tag = tag;
1766 return;
1767 }
1768
1769 /* drop ownership */
1770 KASSERT((pg->flags & PG_BUSY) == 0);
1771 if (pg->owner_tag == NULL) {
1772 printf("uvm_page_own: dropping ownership of an non-owned "
1773 "page (%p)\n", pg);
1774 panic("uvm_page_own");
1775 }
1776 pg->owner_tag = NULL;
1777 }
1778 #endif
1779
1780 /*
1781 * uvm_pagelookup: look up a page
1782 *
1783 * => caller should lock object to keep someone from pulling the page
1784 * out from under it
1785 */
1786
1787 struct vm_page *
1788 uvm_pagelookup(struct uvm_object *obj, voff_t off)
1789 {
1790 struct vm_page *pg;
1791
1792 /* No - used from DDB. KASSERT(rw_lock_held(obj->vmobjlock)); */
1793
1794 pg = radix_tree_lookup_node(&obj->uo_pages, off >> PAGE_SHIFT);
1795
1796 KASSERT(pg == NULL || obj->uo_npages != 0);
1797 KASSERT(pg == NULL || (pg->flags & (PG_RELEASED|PG_PAGEOUT)) == 0 ||
1798 (pg->flags & PG_BUSY) != 0);
1799 return pg;
1800 }
1801
1802 /*
1803 * uvm_pagewire: wire the page, thus removing it from the daemon's grasp
1804 *
1805 * => caller must lock objects
1806 * => caller must hold pg->interlock
1807 */
1808
1809 void
1810 uvm_pagewire(struct vm_page *pg)
1811 {
1812
1813 KASSERT(uvm_page_owner_locked_p(pg, true));
1814 KASSERT(mutex_owned(&pg->interlock));
1815 #if defined(READAHEAD_STATS)
1816 if ((pg->flags & PG_READAHEAD) != 0) {
1817 uvm_ra_hit.ev_count++;
1818 pg->flags &= ~PG_READAHEAD;
1819 }
1820 #endif /* defined(READAHEAD_STATS) */
1821 if (pg->wire_count == 0) {
1822 uvm_pagedequeue(pg);
1823 atomic_inc_uint(&uvmexp.wired);
1824 }
1825 pg->wire_count++;
1826 KASSERT(pg->wire_count > 0); /* detect wraparound */
1827 }
1828
1829 /*
1830 * uvm_pageunwire: unwire the page.
1831 *
1832 * => activate if wire count goes to zero.
1833 * => caller must lock objects
1834 * => caller must hold pg->interlock
1835 */
1836
1837 void
1838 uvm_pageunwire(struct vm_page *pg)
1839 {
1840
1841 KASSERT(uvm_page_owner_locked_p(pg, true));
1842 KASSERT(pg->wire_count != 0);
1843 KASSERT(!uvmpdpol_pageisqueued_p(pg));
1844 KASSERT(mutex_owned(&pg->interlock));
1845 pg->wire_count--;
1846 if (pg->wire_count == 0) {
1847 uvm_pageactivate(pg);
1848 KASSERT(uvmexp.wired != 0);
1849 atomic_dec_uint(&uvmexp.wired);
1850 }
1851 }
1852
1853 /*
1854 * uvm_pagedeactivate: deactivate page
1855 *
1856 * => caller must lock objects
1857 * => caller must check to make sure page is not wired
1858 * => object that page belongs to must be locked (so we can adjust pg->flags)
1859 * => caller must clear the reference on the page before calling
1860 * => caller must hold pg->interlock
1861 */
1862
1863 void
1864 uvm_pagedeactivate(struct vm_page *pg)
1865 {
1866
1867 KASSERT(uvm_page_owner_locked_p(pg, false));
1868 KASSERT(mutex_owned(&pg->interlock));
1869 if (pg->wire_count == 0) {
1870 KASSERT(uvmpdpol_pageisqueued_p(pg));
1871 uvmpdpol_pagedeactivate(pg);
1872 }
1873 }
1874
1875 /*
1876 * uvm_pageactivate: activate page
1877 *
1878 * => caller must lock objects
1879 * => caller must hold pg->interlock
1880 */
1881
1882 void
1883 uvm_pageactivate(struct vm_page *pg)
1884 {
1885
1886 KASSERT(uvm_page_owner_locked_p(pg, false));
1887 KASSERT(mutex_owned(&pg->interlock));
1888 #if defined(READAHEAD_STATS)
1889 if ((pg->flags & PG_READAHEAD) != 0) {
1890 uvm_ra_hit.ev_count++;
1891 pg->flags &= ~PG_READAHEAD;
1892 }
1893 #endif /* defined(READAHEAD_STATS) */
1894 if (pg->wire_count == 0) {
1895 uvmpdpol_pageactivate(pg);
1896 }
1897 }
1898
1899 /*
1900 * uvm_pagedequeue: remove a page from any paging queue
1901 *
1902 * => caller must lock objects
1903 * => caller must hold pg->interlock
1904 */
1905 void
1906 uvm_pagedequeue(struct vm_page *pg)
1907 {
1908
1909 KASSERT(uvm_page_owner_locked_p(pg, true));
1910 KASSERT(mutex_owned(&pg->interlock));
1911 if (uvmpdpol_pageisqueued_p(pg)) {
1912 uvmpdpol_pagedequeue(pg);
1913 }
1914 }
1915
1916 /*
1917 * uvm_pageenqueue: add a page to a paging queue without activating.
  1918	 *	used where a page is not really demanded (yet).  e.g. read-ahead
1919 *
1920 * => caller must lock objects
1921 * => caller must hold pg->interlock
1922 */
1923 void
1924 uvm_pageenqueue(struct vm_page *pg)
1925 {
1926
1927 KASSERT(uvm_page_owner_locked_p(pg, false));
1928 KASSERT(mutex_owned(&pg->interlock));
1929 if (pg->wire_count == 0 && !uvmpdpol_pageisqueued_p(pg)) {
1930 uvmpdpol_pageenqueue(pg);
1931 }
1932 }
1933
1934 /*
1935 * uvm_pagelock: acquire page interlock
1936 */
1937 void
1938 uvm_pagelock(struct vm_page *pg)
1939 {
1940
1941 mutex_enter(&pg->interlock);
1942 }
1943
1944 /*
1945 * uvm_pagelock2: acquire two page interlocks
1946 */
1947 void
1948 uvm_pagelock2(struct vm_page *pg1, struct vm_page *pg2)
1949 {
1950
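	/* always lock in ascending address order so two callers cannot deadlock */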
1951 if (pg1 < pg2) {
1952 mutex_enter(&pg1->interlock);
1953 mutex_enter(&pg2->interlock);
1954 } else {
1955 mutex_enter(&pg2->interlock);
1956 mutex_enter(&pg1->interlock);
1957 }
1958 }
1959
1960 /*
1961 * uvm_pageunlock: release page interlock, and if a page replacement intent
1962 * is set on the page, pass it to uvmpdpol to make real.
1963 *
1964 * => caller must hold pg->interlock
1965 */
1966 void
1967 uvm_pageunlock(struct vm_page *pg)
1968 {
1969
1970 if ((pg->pqflags & PQ_INTENT_SET) == 0 ||
1971 (pg->pqflags & PQ_INTENT_QUEUED) != 0) {
1972 mutex_exit(&pg->interlock);
1973 return;
1974 }
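	/*
	 * mark the intent queued while still holding the interlock, so that
	 * a racing uvm_pageunlock() won't realize the same intent twice.
	 */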
1975 pg->pqflags |= PQ_INTENT_QUEUED;
1976 mutex_exit(&pg->interlock);
1977 uvmpdpol_pagerealize(pg);
1978 }
1979
1980 /*
1981  * uvm_pageunlock2: release two page interlocks, and for each page that
1982  * has a page replacement intent set, pass it to uvmpdpol to make the
1983  * intent real.
1984  *
1985  * => caller must hold the interlock of both pages
1986 */
1987 void
1988 uvm_pageunlock2(struct vm_page *pg1, struct vm_page *pg2)
1989 {
1990
1991 if ((pg1->pqflags & PQ_INTENT_SET) == 0 ||
1992 (pg1->pqflags & PQ_INTENT_QUEUED) != 0) {
1993 mutex_exit(&pg1->interlock);
1994 pg1 = NULL;
1995 } else {
1996 pg1->pqflags |= PQ_INTENT_QUEUED;
1997 mutex_exit(&pg1->interlock);
1998 }
1999
2000 if ((pg2->pqflags & PQ_INTENT_SET) == 0 ||
2001 (pg2->pqflags & PQ_INTENT_QUEUED) != 0) {
2002 mutex_exit(&pg2->interlock);
2003 pg2 = NULL;
2004 } else {
2005 pg2->pqflags |= PQ_INTENT_QUEUED;
2006 mutex_exit(&pg2->interlock);
2007 }
2008
2009 if (pg1 != NULL) {
2010 uvmpdpol_pagerealize(pg1);
2011 }
2012 if (pg2 != NULL) {
2013 uvmpdpol_pagerealize(pg2);
2014 }
2015 }
2016
2017 /*
2018 * uvm_pagezero: zero fill a page
2019 *
2020 * => if page is part of an object then the object should be locked
2021 * to protect pg->flags.
2022 */
2023
2024 void
2025 uvm_pagezero(struct vm_page *pg)
2026 {
2027
2028 uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
2029 pmap_zero_page(VM_PAGE_TO_PHYS(pg));
2030 }
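/*
 * Usage sketch (hedged; "uobj" and "off" are the caller's, and the usual
 * PG_BUSY/error handling is omitted): zero-fill a newly allocated object
 * page before exposing it; uvm_pagezero() also marks the page dirty:
 *
 *	pg = uvm_pagealloc(uobj, off, NULL, 0);
 *	if (pg != NULL)
 *		uvm_pagezero(pg);
 */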
2031
2032 /*
2033 * uvm_pagecopy: copy a page
2034 *
2035 * => if page is part of an object then the object should be locked
2036 * to protect pg->flags.
2037 */
2038
2039 void
2040 uvm_pagecopy(struct vm_page *src, struct vm_page *dst)
2041 {
2042
2043 uvm_pagemarkdirty(dst, UVM_PAGE_STATUS_DIRTY);
2044 pmap_copy_page(VM_PAGE_TO_PHYS(src), VM_PAGE_TO_PHYS(dst));
2045 }
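/*
 * Usage sketch (hedged; "opg" is an existing page and "nanon" a fresh anon,
 * both caller-supplied, locking omitted): resolve a copy-on-write by copying
 * the old page into a newly allocated anon page:
 *
 *	npg = uvm_pagealloc(NULL, 0, nanon, 0);
 *	if (npg != NULL)
 *		uvm_pagecopy(opg, npg);
 */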
2046
2047 /*
2048  * uvm_pageismanaged: test whether a page (specified by PA) is managed.
2049 */
2050
2051 bool
2052 uvm_pageismanaged(paddr_t pa)
2053 {
2054
2055 return (uvm_physseg_find(atop(pa), NULL) != UVM_PHYSSEG_TYPE_INVALID);
2056 }
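/*
 * Usage sketch (hedged; "pa" is a caller-supplied physical address): guard a
 * physical-to-vm_page conversion so unmanaged device memory is not touched:
 *
 *	if (uvm_pageismanaged(pa))
 *		pg = PHYS_TO_VM_PAGE(pa);
 */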
2057
2058 /*
2059 * uvm_page_lookup_freelist: look up the free list for the specified page
2060 */
2061
2062 int
2063 uvm_page_lookup_freelist(struct vm_page *pg)
2064 {
2065 uvm_physseg_t upm;
2066
2067 upm = uvm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), NULL);
2068 KASSERT(upm != UVM_PHYSSEG_TYPE_INVALID);
2069 return uvm_physseg_get_free_list(upm);
2070 }
2071
2072 /*
2073 * uvm_page_owner_locked_p: return true if object associated with page is
2074  * locked.  This is a weak check intended for runtime assertions only.
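 * A page with neither an object nor an anon owner always tests as locked.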
2075 */
2076
2077 bool
2078 uvm_page_owner_locked_p(struct vm_page *pg, bool exclusive)
2079 {
2080
2081 if (pg->uobject != NULL) {
2082 return exclusive
2083 ? rw_write_held(pg->uobject->vmobjlock)
2084 : rw_lock_held(pg->uobject->vmobjlock);
2085 }
2086 if (pg->uanon != NULL) {
2087 return exclusive
2088 ? rw_write_held(pg->uanon->an_lock)
2089 : rw_lock_held(pg->uanon->an_lock);
2090 }
2091 return true;
2092 }
2093
2094 /*
2095  * uvm_pagereadonly_p: return true if the page should be mapped read-only
2096 */
2097
2098 bool
2099 uvm_pagereadonly_p(struct vm_page *pg)
2100 {
2101 struct uvm_object * const uobj = pg->uobject;
2102
2103 KASSERT(uobj == NULL || rw_lock_held(uobj->vmobjlock));
2104 KASSERT(uobj != NULL || rw_lock_held(pg->uanon->an_lock));
2105 if ((pg->flags & PG_RDONLY) != 0) {
2106 return true;
2107 }
2108 if (uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_CLEAN) {
2109 return true;
2110 }
2111 if (uobj == NULL) {
2112 return false;
2113 }
2114 return UVM_OBJ_NEEDS_WRITEFAULT(uobj);
2115 }
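/*
 * Usage sketch (hedged; "pmap", "va", "prot" and "flags" are the caller's,
 * e.g. in a fault handler choosing the protection for a new mapping):
 *
 *	if (uvm_pagereadonly_p(pg))
 *		prot &= ~VM_PROT_WRITE;
 *	error = pmap_enter(pmap, va, VM_PAGE_TO_PHYS(pg), prot, flags);
 */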
2116
2117 #ifdef PMAP_DIRECT
2118 /*
2119  * Call pmap to translate each page's physical address into a virtual one and
2120  * run a callback on it.  This avoids actually mapping the pages; the pmap most
2121  * likely uses a direct map or equivalent.
2122 */
2123 int
2124 uvm_direct_process(struct vm_page **pgs, u_int npages, voff_t off, vsize_t len,
2125 int (*process)(void *, size_t, void *), void *arg)
2126 {
2127 int error = 0;
2128 paddr_t pa;
2129 size_t todo;
2130 voff_t pgoff = (off & PAGE_MASK);
2131 struct vm_page *pg;
2132
2133 KASSERT(npages > 0 && len > 0);
2134
2135 for (int i = 0; i < npages; i++) {
2136 pg = pgs[i];
2137
2138 KASSERT(len > 0);
2139
2140 /*
2141 * Caller is responsible for ensuring all the pages are
2142 * available.
2143 */
2144 KASSERT(pg != NULL && pg != PGO_DONTCARE);
2145
2146 pa = VM_PAGE_TO_PHYS(pg);
2147 todo = MIN(len, PAGE_SIZE - pgoff);
2148
2149 error = pmap_direct_process(pa, pgoff, todo, process, arg);
2150 if (error)
2151 break;
2152
2153 pgoff = 0;
2154 len -= todo;
2155 }
2156
2157 KASSERTMSG(error != 0 || len == 0, "len %lu != 0 for non-error", len);
2158 return error;
2159 }
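/*
 * Usage sketch (hedged; "copy_cb", "buf", "pgs", "npages", "off" and "len"
 * are hypothetical caller-supplied names): copy the span out of the resident
 * pages without establishing mappings for them:
 *
 *	static int
 *	copy_cb(void *kva, size_t n, void *arg)
 *	{
 *		char **bufp = arg;
 *
 *		memcpy(*bufp, kva, n);
 *		*bufp += n;
 *		return 0;
 *	}
 *
 *	char *cp = buf;
 *	error = uvm_direct_process(pgs, npages, off, len, copy_cb, &cp);
 */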
2160 #endif /* PMAP_DIRECT */
2161
2162 #if defined(DDB) || defined(DEBUGPRINT)
2163
2164 /*
2165 * uvm_page_printit: actually print the page
2166 */
2167
2168 static const char page_flagbits[] = UVM_PGFLAGBITS;
2169 static const char page_pqflagbits[] = UVM_PQFLAGBITS;
2170
2171 void
2172 uvm_page_printit(struct vm_page *pg, bool full,
2173 void (*pr)(const char *, ...))
2174 {
2175 struct vm_page *tpg;
2176 struct uvm_object *uobj;
2177 struct pgflbucket *pgb;
2178 struct pgflist *pgl;
2179 char pgbuf[128];
2180
2181 (*pr)("PAGE %p:\n", pg);
2182 snprintb(pgbuf, sizeof(pgbuf), page_flagbits, pg->flags);
2183 (*pr)(" flags=%s\n", pgbuf);
2184 snprintb(pgbuf, sizeof(pgbuf), page_pqflagbits, pg->pqflags);
2185 (*pr)(" pqflags=%s\n", pgbuf);
2186 (*pr)(" uobject=%p, uanon=%p, offset=0x%llx\n",
2187 pg->uobject, pg->uanon, (long long)pg->offset);
2188 (*pr)(" loan_count=%d wire_count=%d bucket=%d freelist=%d\n",
2189 pg->loan_count, pg->wire_count, uvm_page_get_bucket(pg),
2190 uvm_page_get_freelist(pg));
2191 (*pr)(" pa=0x%lx\n", (long)VM_PAGE_TO_PHYS(pg));
2192 #if defined(UVM_PAGE_TRKOWN)
2193 if (pg->flags & PG_BUSY)
2194 (*pr)(" owning process = %d.%d, tag=%s\n",
2195 pg->owner, pg->lowner, pg->owner_tag);
2196 else
2197 (*pr)(" page not busy, no owner\n");
2198 #else
2199 (*pr)(" [page ownership tracking disabled]\n");
2200 #endif
2201
2202 if (!full)
2203 return;
2204
2205 /* cross-verify object/anon */
2206 if ((pg->flags & PG_FREE) == 0) {
2207 if (pg->flags & PG_ANON) {
2208 if (pg->uanon == NULL || pg->uanon->an_page != pg)
2209 (*pr)(" >>> ANON DOES NOT POINT HERE <<< (%p)\n",
2210 (pg->uanon) ? pg->uanon->an_page : NULL);
2211 else
2212 (*pr)(" anon backpointer is OK\n");
2213 } else {
2214 uobj = pg->uobject;
2215 if (uobj) {
2216 (*pr)(" checking object list\n");
2217 tpg = uvm_pagelookup(uobj, pg->offset);
2218 if (tpg)
2219 (*pr)(" page found on object list\n");
2220 else
2221 (*pr)(" >>> PAGE NOT FOUND ON OBJECT LIST! <<<\n");
2222 }
2223 }
2224 }
2225
2226 /* cross-verify page queue */
2227 if (pg->flags & PG_FREE) {
2228 int fl = uvm_page_get_freelist(pg);
2229 int b = uvm_page_get_bucket(pg);
2230 pgb = uvm.page_free[fl].pgfl_buckets[b];
2231 pgl = &pgb->pgb_colors[VM_PGCOLOR(pg)];
2232 (*pr)(" checking pageq list\n");
2233 LIST_FOREACH(tpg, pgl, pageq.list) {
2234 if (tpg == pg) {
2235 break;
2236 }
2237 }
2238 if (tpg)
2239 (*pr)(" page found on pageq list\n");
2240 else
2241 (*pr)(" >>> PAGE NOT FOUND ON PAGEQ LIST! <<<\n");
2242 }
2243 }
2244
2245 /*
2246 * uvm_page_printall - print a summary of all managed pages
2247 */
2248
2249 void
2250 uvm_page_printall(void (*pr)(const char *, ...))
2251 {
2252 uvm_physseg_t i;
2253 paddr_t pfn;
2254 struct vm_page *pg;
2255
2256 (*pr)("%18s %4s %4s %18s %18s"
2257 #ifdef UVM_PAGE_TRKOWN
2258 " OWNER"
2259 #endif
2260 "\n", "PAGE", "FLAG", "PQ", "UOBJECT", "UANON");
2261 for (i = uvm_physseg_get_first();
2262 uvm_physseg_valid_p(i);
2263 i = uvm_physseg_get_next(i)) {
2264 for (pfn = uvm_physseg_get_start(i);
2265 pfn < uvm_physseg_get_end(i);
2266 pfn++) {
2267 pg = PHYS_TO_VM_PAGE(ptoa(pfn));
2268
2269 (*pr)("%18p %04x %08x %18p %18p",
2270 pg, pg->flags, pg->pqflags, pg->uobject,
2271 pg->uanon);
2272 #ifdef UVM_PAGE_TRKOWN
2273 if (pg->flags & PG_BUSY)
2274 (*pr)(" %d [%s]", pg->owner, pg->owner_tag);
2275 #endif
2276 (*pr)("\n");
2277 }
2278 }
2279 }
2280
2281 /*
2282  * uvm_page_print_freelists - print a summary of the page free lists
2283 */
2284
2285 void
2286 uvm_page_print_freelists(void (*pr)(const char *, ...))
2287 {
2288 struct pgfreelist *pgfl;
2289 struct pgflbucket *pgb;
2290 int fl, b, c;
2291
2292 (*pr)("There are %d freelists with %d buckets of %d colors.\n\n",
2293 VM_NFREELIST, uvm.bucketcount, uvmexp.ncolors);
2294
2295 for (fl = 0; fl < VM_NFREELIST; fl++) {
2296 pgfl = &uvm.page_free[fl];
2297 (*pr)("freelist(%d) @ %p\n", fl, pgfl);
2298 for (b = 0; b < uvm.bucketcount; b++) {
2299 pgb = uvm.page_free[fl].pgfl_buckets[b];
2300 (*pr)(" bucket(%d) @ %p, nfree = %d, lock @ %p:\n",
2301 b, pgb, pgb->pgb_nfree,
2302 &uvm_freelist_locks[b].lock);
2303 for (c = 0; c < uvmexp.ncolors; c++) {
2304 (*pr)(" color(%d) @ %p, ", c,
2305 &pgb->pgb_colors[c]);
2306 (*pr)("first page = %p\n",
2307 LIST_FIRST(&pgb->pgb_colors[c]));
2308 }
2309 }
2310 }
2311 }
2312
2313 #endif /* DDB || DEBUGPRINT */
2314