1 /* $NetBSD: uvm_page.c,v 1.206 2019/12/18 20:38:14 ad Exp $ */
2
3 /*
4 * Copyright (c) 1997 Charles D. Cranor and Washington University.
5 * Copyright (c) 1991, 1993, The Regents of the University of California.
6 *
7 * All rights reserved.
8 *
9 * This code is derived from software contributed to Berkeley by
10 * The Mach Operating System project at Carnegie-Mellon University.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 * @(#)vm_page.c 8.3 (Berkeley) 3/21/94
37 * from: Id: uvm_page.c,v 1.1.2.18 1998/02/06 05:24:42 chs Exp
38 *
39 *
40 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
41 * All rights reserved.
42 *
43 * Permission to use, copy, modify and distribute this software and
44 * its documentation is hereby granted, provided that both the copyright
45 * notice and this permission notice appear in all copies of the
46 * software, derivative works or modified versions, and any portions
47 * thereof, and that both notices appear in supporting documentation.
48 *
49 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
50 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
51 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
52 *
53 * Carnegie Mellon requests users of this software to return to
54 *
55 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
56 * School of Computer Science
57 * Carnegie Mellon University
58 * Pittsburgh PA 15213-3890
59 *
60 * any improvements or extensions that they make and grant Carnegie the
61 * rights to redistribute these changes.
62 */
63
64 /*
65 * uvm_page.c: page ops.
66 */
67
68 #include <sys/cdefs.h>
69 __KERNEL_RCSID(0, "$NetBSD: uvm_page.c,v 1.206 2019/12/18 20:38:14 ad Exp $");
70
71 #include "opt_ddb.h"
72 #include "opt_uvm.h"
73 #include "opt_uvmhist.h"
74 #include "opt_readahead.h"
75
76 #include <sys/param.h>
77 #include <sys/systm.h>
78 #include <sys/sched.h>
79 #include <sys/kernel.h>
80 #include <sys/vnode.h>
81 #include <sys/proc.h>
82 #include <sys/radixtree.h>
83 #include <sys/atomic.h>
84 #include <sys/cpu.h>
85 #include <sys/extent.h>
86
87 #include <uvm/uvm.h>
88 #include <uvm/uvm_ddb.h>
89 #include <uvm/uvm_pdpolicy.h>
90
91 /*
92 * Some supported CPUs in a given architecture don't support all
93 * of the things necessary to do idle page zero'ing efficiently.
94 * We therefore provide a way to enable it from machdep code here.
95 */
96 bool vm_page_zero_enable = false;
97
98 /*
99 * number of pages per-CPU to reserve for the kernel.
100 */
101 #ifndef UVM_RESERVED_PAGES_PER_CPU
102 #define UVM_RESERVED_PAGES_PER_CPU 5
103 #endif
104 int vm_page_reserve_kernel = UVM_RESERVED_PAGES_PER_CPU;
105
106 /*
107 * physical memory size.
108 */
109 psize_t physmem;
110
111 /*
112 * local variables
113 */
114
115 /*
116 * these variables record the values returned by vm_page_bootstrap,
117 * for debugging purposes. The implementation of uvm_pageboot_alloc
118 * and pmap_startup here also uses them internally.
119 */
120
121 static vaddr_t virtual_space_start;
122 static vaddr_t virtual_space_end;
123
124 /*
125 * we allocate an initial number of page colors in uvm_page_init(),
126 * and remember them. We may re-color pages as cache sizes are
127 * discovered during the autoconfiguration phase. But we can never
128 * free the initial set of buckets, since they are allocated using
129 * uvm_pageboot_alloc().
130 */
131
132 static size_t recolored_pages_memsize /* = 0 */;
133
134 #ifdef DEBUG
135 vaddr_t uvm_zerocheckkva;
136 #endif /* DEBUG */
137
138 /*
139 * These functions are reserved for uvm(9) internal use and are not
140 * exported in the header file uvm_physseg.h
141 *
142 * Thus they are redefined here.
143 */
144 void uvm_physseg_init_seg(uvm_physseg_t, struct vm_page *);
145 void uvm_physseg_seg_chomp_slab(uvm_physseg_t, struct vm_page *, size_t);
146
147 /* returns a pgs array */
148 struct vm_page *uvm_physseg_seg_alloc_from_slab(uvm_physseg_t, size_t);
149
150 /*
151 * inline functions
152 */
153
154 /*
155 * uvm_pageinsert: insert a page in the object.
156 *
157 * => caller must lock object
158 * => caller should have already set pg's object and offset pointers
159 * and bumped the version counter
160 */
161
162 static inline void
163 uvm_pageinsert_object(struct uvm_object *uobj, struct vm_page *pg)
164 {
165
166 KASSERT(uobj == pg->uobject);
167 KASSERT(mutex_owned(uobj->vmobjlock));
168 KASSERT((pg->flags & PG_TABLED) == 0);
169
170 if (UVM_OBJ_IS_VNODE(uobj)) {
171 if (uobj->uo_npages == 0) {
172 struct vnode *vp = (struct vnode *)uobj;
173
174 vholdl(vp);
175 }
176 if (UVM_OBJ_IS_VTEXT(uobj)) {
177 cpu_count(CPU_COUNT_EXECPAGES, 1);
178 } else {
179 cpu_count(CPU_COUNT_FILEPAGES, 1);
180 }
181 } else if (UVM_OBJ_IS_AOBJ(uobj)) {
182 cpu_count(CPU_COUNT_ANONPAGES, 1);
183 }
184 pg->flags |= PG_TABLED;
185 uobj->uo_npages++;
186 }
187
188 static inline int
189 uvm_pageinsert_tree(struct uvm_object *uobj, struct vm_page *pg)
190 {
191 const uint64_t idx = pg->offset >> PAGE_SHIFT;
192 int error;
193
194 error = radix_tree_insert_node(&uobj->uo_pages, idx, pg);
195 if (error != 0) {
196 return error;
197 }
198 return 0;
199 }
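/*
 * Note that uvm_pageinsert_tree() can fail (radix_tree_insert_node()
 * may be unable to allocate a tree node), while uvm_pageinsert_object()
 * cannot.  uvm_pagealloc_strat() below therefore inserts into the tree
 * last and undoes the object insertion if the tree insert fails.
 */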
200
201 /*
202 * uvm_page_remove: remove page from object.
203 *
204 * => caller must lock object
205 */
206
207 static inline void
208 uvm_pageremove_object(struct uvm_object *uobj, struct vm_page *pg)
209 {
210
211 KASSERT(uobj == pg->uobject);
212 KASSERT(mutex_owned(uobj->vmobjlock));
213 KASSERT(pg->flags & PG_TABLED);
214
215 if (UVM_OBJ_IS_VNODE(uobj)) {
216 if (uobj->uo_npages == 1) {
217 struct vnode *vp = (struct vnode *)uobj;
218
219 holdrelel(vp);
220 }
221 if (UVM_OBJ_IS_VTEXT(uobj)) {
222 cpu_count(CPU_COUNT_EXECPAGES, -1);
223 } else {
224 cpu_count(CPU_COUNT_FILEPAGES, -1);
225 }
226 } else if (UVM_OBJ_IS_AOBJ(uobj)) {
227 cpu_count(CPU_COUNT_ANONPAGES, -1);
228 }
229
230 /* object should be locked */
231 uobj->uo_npages--;
232 pg->flags &= ~PG_TABLED;
233 pg->uobject = NULL;
234 }
235
236 static inline void
237 uvm_pageremove_tree(struct uvm_object *uobj, struct vm_page *pg)
238 {
239 struct vm_page *opg __unused;
240
241 opg = radix_tree_remove_node(&uobj->uo_pages, pg->offset >> PAGE_SHIFT);
242 KASSERT(pg == opg);
243 }
244
245 static void
246 uvm_page_init_buckets(struct pgfreelist *pgfl)
247 {
248 int color, i;
249
250 for (color = 0; color < uvmexp.ncolors; color++) {
251 for (i = 0; i < PGFL_NQUEUES; i++) {
252 LIST_INIT(&pgfl->pgfl_buckets[color].pgfl_queues[i]);
253 }
254 }
255 }
256
257 /*
258 * uvm_page_init: init the page system. called from uvm_init().
259 *
260 * => we return the range of kernel virtual memory in kvm_startp/kvm_endp
261 */
262
263 void
264 uvm_page_init(vaddr_t *kvm_startp, vaddr_t *kvm_endp)
265 {
266 static struct uvm_cpu boot_cpu;
267 psize_t freepages, pagecount, bucketcount, n;
268 struct pgflbucket *bucketarray, *cpuarray;
269 struct vm_page *pagearray;
270 uvm_physseg_t bank;
271 int lcv;
272
273 KASSERT(ncpu <= 1);
274 CTASSERT(sizeof(pagearray->offset) >= sizeof(struct uvm_cpu *));
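/*
 * The CTASSERT above covers the trick used by uvm_pagefree(): while a
 * page sits on the free lists, its offset field is reused to record the
 * owning struct uvm_cpu, so offset must be at least pointer-sized.
 */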
275
276 /*
277 * init the page queues and free page queue lock, except the
278 * free list; we allocate that later (with the initial vm_page
279 * structures).
280 */
281
282 uvm.cpus[0] = &boot_cpu;
283 curcpu()->ci_data.cpu_uvm = &boot_cpu;
284 uvmpdpol_init();
285 mutex_init(&uvm_fpageqlock, MUTEX_DRIVER, IPL_VM);
286
287 /*
288 * allocate vm_page structures.
289 */
290
291 /*
292 * sanity check:
293 * before calling this function the MD code is expected to register
294 * some free RAM with the uvm_page_physload() function. our job
295 * now is to allocate vm_page structures for this memory.
296 */
297
298 if (uvm_physseg_get_last() == UVM_PHYSSEG_TYPE_INVALID)
299 panic("uvm_page_bootstrap: no memory pre-allocated");
300
301 /*
302 * first calculate the number of free pages...
303 *
304 * note that we use start/end rather than avail_start/avail_end.
305 * this allows us to allocate extra vm_page structures in case we
306 * want to return some memory to the pool after booting.
307 */
308
309 freepages = 0;
310
311 for (bank = uvm_physseg_get_first();
312 uvm_physseg_valid_p(bank) ;
313 bank = uvm_physseg_get_next(bank)) {
314 freepages += (uvm_physseg_get_end(bank) - uvm_physseg_get_start(bank));
315 }
316
317 /*
318 * Let MD code initialize the number of colors, or default
319 * to 1 color if MD code doesn't care.
320 */
321 if (uvmexp.ncolors == 0)
322 uvmexp.ncolors = 1;
323 uvmexp.colormask = uvmexp.ncolors - 1;
324 KASSERT((uvmexp.colormask & uvmexp.ncolors) == 0);
325
326 /*
327 * we now know we have (PAGE_SIZE * freepages) bytes of memory we can
328 * use. for each page of memory we use we need a vm_page structure.
329 * thus, the total number of pages we can use is the total size of
330 * the memory divided by the PAGE_SIZE plus the size of the vm_page
331 * structure. we add one to freepages as a fudge factor to avoid
332 * truncation errors (since we can only allocate in terms of whole
333 * pages).
334 */
335
336 bucketcount = uvmexp.ncolors * VM_NFREELIST;
337 pagecount = ((freepages + 1) << PAGE_SHIFT) /
338 (PAGE_SIZE + sizeof(struct vm_page));
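/*
 * The formula above comes from requiring that the managed pages plus
 * their vm_page structures fit in the available memory:
 *
 *	pagecount * (PAGE_SIZE + sizeof(struct vm_page))
 *	    <= freepages * PAGE_SIZE
 */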
339
340 bucketarray = (void *)uvm_pageboot_alloc((bucketcount *
341 sizeof(struct pgflbucket) * 2) + (pagecount *
342 sizeof(struct vm_page)));
343 cpuarray = bucketarray + bucketcount;
344 pagearray = (struct vm_page *)(bucketarray + bucketcount * 2);
345
346 for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
347 uvm.page_free[lcv].pgfl_buckets =
348 (bucketarray + (lcv * uvmexp.ncolors));
349 uvm_page_init_buckets(&uvm.page_free[lcv]);
350 uvm.cpus[0]->page_free[lcv].pgfl_buckets =
351 (cpuarray + (lcv * uvmexp.ncolors));
352 uvm_page_init_buckets(&uvm.cpus[0]->page_free[lcv]);
353 }
354 memset(pagearray, 0, pagecount * sizeof(struct vm_page));
355
356 /*
357 * init the vm_page structures and put them in the correct place.
358 */
359 /* First init the extent */
360
361 for (bank = uvm_physseg_get_first(),
362 uvm_physseg_seg_chomp_slab(bank, pagearray, pagecount);
363 uvm_physseg_valid_p(bank);
364 bank = uvm_physseg_get_next(bank)) {
365
366 n = uvm_physseg_get_end(bank) - uvm_physseg_get_start(bank);
367 uvm_physseg_seg_alloc_from_slab(bank, n);
368 uvm_physseg_init_seg(bank, pagearray);
369
370 /* set up page array pointers */
371 pagearray += n;
372 pagecount -= n;
373 }
374
375 /*
376 * pass up the values of virtual_space_start and
377 * virtual_space_end (obtained by uvm_pageboot_alloc) to the upper
378 * layers of the VM.
379 */
380
381 *kvm_startp = round_page(virtual_space_start);
382 *kvm_endp = trunc_page(virtual_space_end);
383 #ifdef DEBUG
384 /*
385 * steal kva for uvm_pagezerocheck().
386 */
387 uvm_zerocheckkva = *kvm_startp;
388 *kvm_startp += PAGE_SIZE;
389 #endif /* DEBUG */
390
391 /*
392 * init various thresholds.
393 */
394
395 uvmexp.reserve_pagedaemon = 1;
396 uvmexp.reserve_kernel = vm_page_reserve_kernel;
397
398 /*
399 * determine if we should zero pages in the idle loop.
400 */
401
402 uvm.cpus[0]->page_idle_zero = vm_page_zero_enable;
403
404 /*
405 * done!
406 */
407
408 uvm.page_init_done = true;
409 }
410
411 /*
412 * uvm_setpagesize: set the page size
413 *
414 * => sets page_shift and page_mask from uvmexp.pagesize.
415 */
416
417 void
418 uvm_setpagesize(void)
419 {
420
421 /*
422 * If uvmexp.pagesize is 0 at this point, we expect PAGE_SIZE
423 * to be a constant (indicated by being a non-zero value).
424 */
425 if (uvmexp.pagesize == 0) {
426 if (PAGE_SIZE == 0)
427 panic("uvm_setpagesize: uvmexp.pagesize not set");
428 uvmexp.pagesize = PAGE_SIZE;
429 }
430 uvmexp.pagemask = uvmexp.pagesize - 1;
431 if ((uvmexp.pagemask & uvmexp.pagesize) != 0)
432 panic("uvm_setpagesize: page size %u (%#x) not a power of two",
433 uvmexp.pagesize, uvmexp.pagesize);
434 for (uvmexp.pageshift = 0; ; uvmexp.pageshift++)
435 if ((1 << uvmexp.pageshift) == uvmexp.pagesize)
436 break;
437 }
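/*
 * For example, with uvmexp.pagesize == 4096 this yields
 * uvmexp.pagemask == 0xfff and uvmexp.pageshift == 12.
 */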
438
439 /*
440 * uvm_pageboot_alloc: steal memory from physmem for bootstrapping
441 */
442
443 vaddr_t
444 uvm_pageboot_alloc(vsize_t size)
445 {
446 static bool initialized = false;
447 vaddr_t addr;
448 #if !defined(PMAP_STEAL_MEMORY)
449 vaddr_t vaddr;
450 paddr_t paddr;
451 #endif
452
453 /*
454 * on first call to this function, initialize ourselves.
455 */
456 if (initialized == false) {
457 pmap_virtual_space(&virtual_space_start, &virtual_space_end);
458
459 /* round it the way we like it */
460 virtual_space_start = round_page(virtual_space_start);
461 virtual_space_end = trunc_page(virtual_space_end);
462
463 initialized = true;
464 }
465
466 /* round to page size */
467 size = round_page(size);
468 uvmexp.bootpages += atop(size);
469
470 #if defined(PMAP_STEAL_MEMORY)
471
472 /*
473 * defer bootstrap allocation to MD code (it may want to allocate
474 * from a direct-mapped segment). pmap_steal_memory should adjust
475 * virtual_space_start/virtual_space_end if necessary.
476 */
477
478 addr = pmap_steal_memory(size, &virtual_space_start,
479 &virtual_space_end);
480
481 return(addr);
482
483 #else /* !PMAP_STEAL_MEMORY */
484
485 /*
486 * allocate virtual memory for this request
487 */
488 if (virtual_space_start == virtual_space_end ||
489 (virtual_space_end - virtual_space_start) < size)
490 panic("uvm_pageboot_alloc: out of virtual space");
491
492 addr = virtual_space_start;
493
494 #ifdef PMAP_GROWKERNEL
495 /*
496 * If the kernel pmap can't map the requested space,
497 * then allocate more resources for it.
498 */
499 if (uvm_maxkaddr < (addr + size)) {
500 uvm_maxkaddr = pmap_growkernel(addr + size);
501 if (uvm_maxkaddr < (addr + size))
502 panic("uvm_pageboot_alloc: pmap_growkernel() failed");
503 }
504 #endif
505
506 virtual_space_start += size;
507
508 /*
509 * allocate and mapin physical pages to back new virtual pages
510 */
511
512 for (vaddr = round_page(addr) ; vaddr < addr + size ;
513 vaddr += PAGE_SIZE) {
514
515 if (!uvm_page_physget(&paddr))
516 panic("uvm_pageboot_alloc: out of memory");
517
518 /*
519 * Note this memory is no longer managed, so using
520 * pmap_kenter is safe.
521 */
522 pmap_kenter_pa(vaddr, paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
523 }
524 pmap_update(pmap_kernel());
525 return(addr);
526 #endif /* PMAP_STEAL_MEMORY */
527 }
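/*
 * Illustrative use (a sketch, mirroring uvm_page_init() above): early
 * bootstrap code that needs wired memory before the page system is up
 * might do
 *
 *	pgs = (struct vm_page *)
 *	    uvm_pageboot_alloc(npgs * sizeof(struct vm_page));
 *
 * "pgs" and "npgs" are hypothetical names; the memory is permanently
 * stolen and never returned to the free page pool.
 */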
528
529 #if !defined(PMAP_STEAL_MEMORY)
530 /*
531 * uvm_page_physget: "steal" one page from the vm_physmem structure.
532 *
533 * => attempt to allocate it off the end of a segment in which the "avail"
534 * values match the start/end values. if we can't do that, then we
535 * will advance both values (making them equal, and removing some
536 * vm_page structures from the non-avail area).
537 * => return false if out of memory.
538 */
539
540 /* subroutine: try to allocate from memory chunks on the specified freelist */
541 static bool uvm_page_physget_freelist(paddr_t *, int);
542
543 static bool
544 uvm_page_physget_freelist(paddr_t *paddrp, int freelist)
545 {
546 uvm_physseg_t lcv;
547
548 /* pass 1: try allocating from a matching end */
549 #if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
550 for (lcv = uvm_physseg_get_last(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_prev(lcv))
551 #else
552 for (lcv = uvm_physseg_get_first(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_next(lcv))
553 #endif
554 {
555 if (uvm.page_init_done == true)
556 panic("uvm_page_physget: called _after_ bootstrap");
557
558 /* Try to match at front or back on unused segment */
559 if (uvm_page_physunload(lcv, freelist, paddrp))
560 return true;
561 }
562
563 /* pass 2: forget about matching ends, just allocate something */
564 #if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
565 for (lcv = uvm_physseg_get_last(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_prev(lcv))
566 #else
567 for (lcv = uvm_physseg_get_first(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_next(lcv))
568 #endif
569 {
570 /* Try the front regardless. */
571 if (uvm_page_physunload_force(lcv, freelist, paddrp))
572 return true;
573 }
574 return false;
575 }
576
577 bool
578 uvm_page_physget(paddr_t *paddrp)
579 {
580 int i;
581
582 /* try in the order of freelist preference */
583 for (i = 0; i < VM_NFREELIST; i++)
584 if (uvm_page_physget_freelist(paddrp, i) == true)
585 return (true);
586 return (false);
587 }
588 #endif /* PMAP_STEAL_MEMORY */
589
590 /*
591 * PHYS_TO_VM_PAGE: find vm_page for a PA. used by MI code to get vm_pages
592 * back from an I/O mapping (ugh!). used in some MD code as well.
593 */
594 struct vm_page *
595 uvm_phys_to_vm_page(paddr_t pa)
596 {
597 paddr_t pf = atop(pa);
598 paddr_t off;
599 uvm_physseg_t upm;
600
601 upm = uvm_physseg_find(pf, &off);
602 if (upm != UVM_PHYSSEG_TYPE_INVALID)
603 return uvm_physseg_get_pg(upm, off);
604 return(NULL);
605 }
606
607 paddr_t
608 uvm_vm_page_to_phys(const struct vm_page *pg)
609 {
610
611 return pg->phys_addr;
612 }
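/*
 * For a page of managed RAM these two are inverses:
 * uvm_phys_to_vm_page(uvm_vm_page_to_phys(pg)) == pg.  For an
 * unmanaged physical address uvm_phys_to_vm_page() returns NULL.
 */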
613
614 /*
615 * uvm_page_recolor: Recolor the pages if the new bucket count is
616 * larger than the old one.
617 */
618
619 void
620 uvm_page_recolor(int newncolors)
621 {
622 struct pgflbucket *bucketarray, *cpuarray, *oldbucketarray;
623 struct pgfreelist gpgfl, pgfl;
624 struct vm_page *pg;
625 vsize_t bucketcount;
626 size_t bucketmemsize, oldbucketmemsize;
627 int color, i, ocolors;
628 int lcv;
629 struct uvm_cpu *ucpu;
630
631 KASSERT(((newncolors - 1) & newncolors) == 0);
632
633 if (newncolors <= uvmexp.ncolors)
634 return;
635
636 if (uvm.page_init_done == false) {
637 uvmexp.ncolors = newncolors;
638 return;
639 }
640
641 bucketcount = newncolors * VM_NFREELIST;
642 bucketmemsize = bucketcount * sizeof(struct pgflbucket) * 2;
643 bucketarray = kmem_alloc(bucketmemsize, KM_SLEEP);
644 cpuarray = bucketarray + bucketcount;
645
646 mutex_spin_enter(&uvm_fpageqlock);
647
648 /* Make sure we should still do this. */
649 if (newncolors <= uvmexp.ncolors) {
650 mutex_spin_exit(&uvm_fpageqlock);
651 kmem_free(bucketarray, bucketmemsize);
652 return;
653 }
654
655 oldbucketarray = uvm.page_free[0].pgfl_buckets;
656 ocolors = uvmexp.ncolors;
657
658 uvmexp.ncolors = newncolors;
659 uvmexp.colormask = uvmexp.ncolors - 1;
660
661 ucpu = curcpu()->ci_data.cpu_uvm;
662 for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
663 gpgfl.pgfl_buckets = (bucketarray + (lcv * newncolors));
664 pgfl.pgfl_buckets = (cpuarray + (lcv * uvmexp.ncolors));
665 uvm_page_init_buckets(&gpgfl);
666 uvm_page_init_buckets(&pgfl);
667 for (color = 0; color < ocolors; color++) {
668 for (i = 0; i < PGFL_NQUEUES; i++) {
669 while ((pg = LIST_FIRST(&uvm.page_free[
670 lcv].pgfl_buckets[color].pgfl_queues[i]))
671 != NULL) {
672 LIST_REMOVE(pg, pageq.list); /* global */
673 LIST_REMOVE(pg, listq.list); /* cpu */
674 LIST_INSERT_HEAD(&gpgfl.pgfl_buckets[
675 VM_PGCOLOR_BUCKET(pg)].pgfl_queues[
676 i], pg, pageq.list);
677 LIST_INSERT_HEAD(&pgfl.pgfl_buckets[
678 VM_PGCOLOR_BUCKET(pg)].pgfl_queues[
679 i], pg, listq.list);
680 }
681 }
682 }
683 uvm.page_free[lcv].pgfl_buckets = gpgfl.pgfl_buckets;
684 ucpu->page_free[lcv].pgfl_buckets = pgfl.pgfl_buckets;
685 }
686
687 oldbucketmemsize = recolored_pages_memsize;
688
689 recolored_pages_memsize = bucketmemsize;
690 mutex_spin_exit(&uvm_fpageqlock);
691
692 if (oldbucketmemsize) {
693 kmem_free(oldbucketarray, oldbucketmemsize);
694 }
695
696 /*
697 * this calls uvm_km_alloc() which may want to hold
698 * uvm_fpageqlock.
699 */
700 uvm_pager_realloc_emerg();
701 }
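/*
 * Illustrative call (a sketch; "way_size" is a hypothetical variable):
 * machine-dependent code invokes this once cache geometry is known,
 * typically with the number of page-sized bins in one cache way:
 *
 *	uvm_page_recolor(way_size / PAGE_SIZE);
 *
 * The count is expected to be a power of two (see the KASSERT above).
 */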
702
703 /*
704 * uvm_cpu_attach: initialize per-CPU data structures.
705 */
706
707 void
708 uvm_cpu_attach(struct cpu_info *ci)
709 {
710 struct pgflbucket *bucketarray;
711 struct pgfreelist pgfl;
712 struct uvm_cpu *ucpu;
713 vsize_t bucketcount;
714 int lcv;
715
716 if (CPU_IS_PRIMARY(ci)) {
717 /* Already done in uvm_page_init(). */
718 goto attachrnd;
719 }
720
721 /* Add more reserve pages for this CPU. */
722 uvmexp.reserve_kernel += vm_page_reserve_kernel;
723
724 /* Configure this CPU's free lists. */
725 bucketcount = uvmexp.ncolors * VM_NFREELIST;
726 bucketarray = kmem_alloc(bucketcount * sizeof(struct pgflbucket),
727 KM_SLEEP);
728 ucpu = kmem_zalloc(sizeof(*ucpu), KM_SLEEP);
729 uvm.cpus[cpu_index(ci)] = ucpu;
730 ci->ci_data.cpu_uvm = ucpu;
731 for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
732 pgfl.pgfl_buckets = (bucketarray + (lcv * uvmexp.ncolors));
733 uvm_page_init_buckets(&pgfl);
734 ucpu->page_free[lcv].pgfl_buckets = pgfl.pgfl_buckets;
735 }
736
737 attachrnd:
738 /*
739 * Attach RNG source for this CPU's VM events
740 */
741 rnd_attach_source(&uvm.cpus[cpu_index(ci)]->rs,
742 ci->ci_data.cpu_name, RND_TYPE_VM,
743 RND_FLAG_COLLECT_TIME|RND_FLAG_COLLECT_VALUE|
744 RND_FLAG_ESTIMATE_VALUE);
745
746 }
747
748 /*
749 * uvm_pagealloc_pgfl: helper routine for uvm_pagealloc_strat
750 */
751
752 static struct vm_page *
753 uvm_pagealloc_pgfl(struct uvm_cpu *ucpu, int flist, int try1, int try2,
754 int *trycolorp)
755 {
756 struct pgflist *freeq;
757 struct vm_page *pg;
758 int color, trycolor = *trycolorp;
759 struct pgfreelist *gpgfl, *pgfl;
760
761 KASSERT(mutex_owned(&uvm_fpageqlock));
762
763 color = trycolor;
764 pgfl = &ucpu->page_free[flist];
765 gpgfl = &uvm.page_free[flist];
766 do {
767 /* cpu, try1 */
768 if ((pg = LIST_FIRST((freeq =
769 &pgfl->pgfl_buckets[color].pgfl_queues[try1]))) != NULL) {
770 KASSERT(pg->flags & PG_FREE);
771 KASSERT(try1 == PGFL_ZEROS || !(pg->flags & PG_ZERO));
772 KASSERT(try1 == PGFL_UNKNOWN || (pg->flags & PG_ZERO));
773 KASSERT(ucpu == VM_FREE_PAGE_TO_CPU(pg));
774 VM_FREE_PAGE_TO_CPU(pg)->pages[try1]--;
775 CPU_COUNT(CPU_COUNT_CPUHIT, 1);
776 goto gotit;
777 }
778 /* global, try1 */
779 if ((pg = LIST_FIRST((freeq =
780 &gpgfl->pgfl_buckets[color].pgfl_queues[try1]))) != NULL) {
781 KASSERT(pg->flags & PG_FREE);
782 KASSERT(try1 == PGFL_ZEROS || !(pg->flags & PG_ZERO));
783 KASSERT(try1 == PGFL_UNKNOWN || (pg->flags & PG_ZERO));
784 KASSERT(ucpu != VM_FREE_PAGE_TO_CPU(pg));
785 VM_FREE_PAGE_TO_CPU(pg)->pages[try1]--;
786 CPU_COUNT(CPU_COUNT_CPUMISS, 1);
787 goto gotit;
788 }
789 /* cpu, try2 */
790 if ((pg = LIST_FIRST((freeq =
791 &pgfl->pgfl_buckets[color].pgfl_queues[try2]))) != NULL) {
792 KASSERT(pg->flags & PG_FREE);
793 KASSERT(try2 == PGFL_ZEROS || !(pg->flags & PG_ZERO));
794 KASSERT(try2 == PGFL_UNKNOWN || (pg->flags & PG_ZERO));
795 KASSERT(ucpu == VM_FREE_PAGE_TO_CPU(pg));
796 VM_FREE_PAGE_TO_CPU(pg)->pages[try2]--;
797 CPU_COUNT(CPU_COUNT_CPUHIT, 1);
798 goto gotit;
799 }
800 /* global, try2 */
801 if ((pg = LIST_FIRST((freeq =
802 &gpgfl->pgfl_buckets[color].pgfl_queues[try2]))) != NULL) {
803 KASSERT(pg->flags & PG_FREE);
804 KASSERT(try2 == PGFL_ZEROS || !(pg->flags & PG_ZERO));
805 KASSERT(try2 == PGFL_UNKNOWN || (pg->flags & PG_ZERO));
806 KASSERT(ucpu != VM_FREE_PAGE_TO_CPU(pg));
807 VM_FREE_PAGE_TO_CPU(pg)->pages[try2]--;
808 CPU_COUNT(CPU_COUNT_CPUMISS, 1);
809 goto gotit;
810 }
811 color = (color + 1) & uvmexp.colormask;
812 } while (color != trycolor);
813
814 return (NULL);
815
816 gotit:
817 LIST_REMOVE(pg, pageq.list); /* global list */
818 LIST_REMOVE(pg, listq.list); /* per-cpu list */
819 uvmexp.free--;
820
821 /* update zero'd page count */
822 if (pg->flags & PG_ZERO)
823 CPU_COUNT(CPU_COUNT_ZEROPAGES, -1);
824
825 if (color == trycolor)
826 CPU_COUNT(CPU_COUNT_COLORHIT, 1);
827 else {
828 CPU_COUNT(CPU_COUNT_COLORMISS, 1);
829 *trycolorp = color;
830 }
831
832 return (pg);
833 }
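/*
 * To summarize the search order above: for each color starting at
 * *trycolorp we check this CPU's try1 queue, the global try1 queue,
 * this CPU's try2 queue, then the global try2 queue; if all four are
 * empty we advance to the next color until we wrap around.
 */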
834
835 /*
836 * uvm_pagealloc_strat: allocate vm_page from a particular free list.
837 *
838 * => return null if no pages free
839 * => wake up pagedaemon if number of free pages drops below low water mark
840 * => if obj != NULL, obj must be locked (to put in obj's tree)
841 * => if anon != NULL, anon must be locked (to put in anon)
842 * => only one of obj or anon can be non-null
843 * => caller must activate/deactivate page if it is not wired.
844 * => free_list is ignored if strat == UVM_PGA_STRAT_NORMAL.
845 * => policy decision: it is more important to pull a page off of the
846 * appropriate priority free list than it is to get a zero'd or
847 * unknown contents page. This is because we live with the
848 * consequences of a bad free list decision for the entire
849 * lifetime of the page, e.g. if the page comes from memory that
850 * is slower to access.
851 */
852
853 struct vm_page *
854 uvm_pagealloc_strat(struct uvm_object *obj, voff_t off, struct vm_anon *anon,
855 int flags, int strat, int free_list)
856 {
857 int try1, try2, zeroit = 0, color;
858 int lcv, error;
859 struct uvm_cpu *ucpu;
860 struct vm_page *pg;
861 lwp_t *l;
862
863 KASSERT(obj == NULL || anon == NULL);
864 KASSERT(anon == NULL || (flags & UVM_FLAG_COLORMATCH) || off == 0);
865 KASSERT(off == trunc_page(off));
866 KASSERT(obj == NULL || mutex_owned(obj->vmobjlock));
867 KASSERT(anon == NULL || anon->an_lock == NULL ||
868 mutex_owned(anon->an_lock));
869
870 mutex_spin_enter(&uvm_fpageqlock);
871
872 /*
873 * This implements a global round-robin page coloring
874 * algorithm.
875 */
876
877 ucpu = curcpu()->ci_data.cpu_uvm;
878 if (flags & UVM_FLAG_COLORMATCH) {
879 color = atop(off) & uvmexp.colormask;
880 } else {
881 color = ucpu->page_free_nextcolor;
882 }
883
884 /*
885 * check to see if we need to generate some free pages by waking
886 * the pagedaemon.
887 */
888
889 uvm_kick_pdaemon();
890
891 /*
892 * fail if any of these conditions is true:
893 * [1] there really are no free pages, or
894 * [2] only kernel "reserved" pages remain and
895 * reserved pages have not been requested, or
896 * [3] only pagedaemon "reserved" pages remain and
897 * the requestor isn't the pagedaemon.
898 * we make kernel reserve pages available if called by a
899 * kernel thread or a realtime thread.
900 */
901 l = curlwp;
902 if (__predict_true(l != NULL) && lwp_eprio(l) >= PRI_KTHREAD) {
903 flags |= UVM_PGA_USERESERVE;
904 }
905 if ((uvmexp.free <= uvmexp.reserve_kernel &&
906 (flags & UVM_PGA_USERESERVE) == 0) ||
907 (uvmexp.free <= uvmexp.reserve_pagedaemon &&
908 curlwp != uvm.pagedaemon_lwp))
909 goto fail;
910
911 #if PGFL_NQUEUES != 2
912 #error uvm_pagealloc_strat needs to be updated
913 #endif
914
915 /*
916 * If we want a zero'd page, try the ZEROS queue first, otherwise
917 * we try the UNKNOWN queue first.
918 */
919 if (flags & UVM_PGA_ZERO) {
920 try1 = PGFL_ZEROS;
921 try2 = PGFL_UNKNOWN;
922 } else {
923 try1 = PGFL_UNKNOWN;
924 try2 = PGFL_ZEROS;
925 }
926
927 again:
928 switch (strat) {
929 case UVM_PGA_STRAT_NORMAL:
930 /* Check freelists: descending priority (ascending id) order */
931 for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
932 pg = uvm_pagealloc_pgfl(ucpu, lcv,
933 try1, try2, &color);
934 if (pg != NULL)
935 goto gotit;
936 }
937
938 /* No pages free! */
939 goto fail;
940
941 case UVM_PGA_STRAT_ONLY:
942 case UVM_PGA_STRAT_FALLBACK:
943 /* Attempt to allocate from the specified free list. */
944 KASSERT(free_list >= 0 && free_list < VM_NFREELIST);
945 pg = uvm_pagealloc_pgfl(ucpu, free_list,
946 try1, try2, &color);
947 if (pg != NULL)
948 goto gotit;
949
950 /* Fall back, if possible. */
951 if (strat == UVM_PGA_STRAT_FALLBACK) {
952 strat = UVM_PGA_STRAT_NORMAL;
953 goto again;
954 }
955
956 /* No pages free! */
957 goto fail;
958
959 default:
960 panic("uvm_pagealloc_strat: bad strat %d", strat);
961 /* NOTREACHED */
962 }
963
964 gotit:
965 /*
966 * We now know which color we actually allocated from; set
967 * the next color accordingly.
968 */
969
970 ucpu->page_free_nextcolor = (color + 1) & uvmexp.colormask;
971
972 /*
973 * update allocation statistics and remember if we have to
974 * zero the page
975 */
976
977 if (flags & UVM_PGA_ZERO) {
978 if (pg->flags & PG_ZERO) {
979 CPU_COUNT(CPU_COUNT_PGA_ZEROHIT, 1);
980 zeroit = 0;
981 } else {
982 CPU_COUNT(CPU_COUNT_PGA_ZEROMISS, 1);
983 zeroit = 1;
984 }
985 if (ucpu->pages[PGFL_ZEROS] < ucpu->pages[PGFL_UNKNOWN]) {
986 ucpu->page_idle_zero = vm_page_zero_enable;
987 }
988 }
989 KASSERT((pg->flags & ~(PG_ZERO|PG_FREE)) == 0);
990
991 /*
992 * For now check this - later on we may do lazy dequeue, but need
993 * to get page.queue used only by the pagedaemon policy first.
994 */
995 KASSERT(!uvmpdpol_pageisqueued_p(pg));
996
997 /*
998 * assign the page to the object. we don't need to lock the page's
999 * identity to do this, as the caller holds the objects locked, and
1000 * the page is not on any paging queues at this time.
1001 */
1002 pg->offset = off;
1003 pg->uobject = obj;
1004 pg->uanon = anon;
1005 KASSERT(uvm_page_locked_p(pg));
1006 pg->flags = PG_BUSY|PG_CLEAN|PG_FAKE;
1007 mutex_spin_exit(&uvm_fpageqlock);
1008 if (anon) {
1009 anon->an_page = pg;
1010 pg->flags |= PG_ANON;
1011 cpu_count(CPU_COUNT_ANONPAGES, 1);
1012 } else if (obj) {
1013 uvm_pageinsert_object(obj, pg);
1014 error = uvm_pageinsert_tree(obj, pg);
1015 if (error != 0) {
1016 uvm_pageremove_object(obj, pg);
1017 uvm_pagefree(pg);
1018 return NULL;
1019 }
1020 }
1021
1022 #if defined(UVM_PAGE_TRKOWN)
1023 pg->owner_tag = NULL;
1024 #endif
1025 UVM_PAGE_OWN(pg, "new alloc");
1026
1027 if (flags & UVM_PGA_ZERO) {
1028 /*
1029 * A zero'd page is not clean. If we got a page not already
1030 * zero'd, then we have to zero it ourselves.
1031 */
1032 pg->flags &= ~PG_CLEAN;
1033 if (zeroit)
1034 pmap_zero_page(VM_PAGE_TO_PHYS(pg));
1035 }
1036
1037 return(pg);
1038
1039 fail:
1040 mutex_spin_exit(&uvm_fpageqlock);
1041 return (NULL);
1042 }
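/*
 * Illustrative caller pattern (a sketch; the wait tag is made up): an
 * object-backed allocation typically goes through the uvm_pagealloc()
 * wrapper and waits for the pagedaemon when no pages are available:
 *
 *	for (;;) {
 *		pg = uvm_pagealloc(uobj, off, NULL, UVM_PGA_ZERO);
 *		if (pg != NULL)
 *			break;
 *		mutex_exit(uobj->vmobjlock);
 *		uvm_wait("uobjpg");
 *		mutex_enter(uobj->vmobjlock);
 *	}
 */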
1043
1044 /*
1045 * uvm_pagereplace: replace a page with another
1046 *
1047 * => object must be locked
1048 */
1049
1050 void
1051 uvm_pagereplace(struct vm_page *oldpg, struct vm_page *newpg)
1052 {
1053 struct uvm_object *uobj = oldpg->uobject;
1054
1055 KASSERT((oldpg->flags & PG_TABLED) != 0);
1056 KASSERT(uobj != NULL);
1057 KASSERT((newpg->flags & PG_TABLED) == 0);
1058 KASSERT(newpg->uobject == NULL);
1059 KASSERT(mutex_owned(uobj->vmobjlock));
1060
1061 newpg->offset = oldpg->offset;
1062 uvm_pageremove_tree(uobj, oldpg);
1063 uvm_pageinsert_tree(uobj, newpg);
1064
1065 /* take page interlocks during rename */
1066 if (oldpg < newpg) {
1067 mutex_enter(&oldpg->interlock);
1068 mutex_enter(&newpg->interlock);
1069 } else {
1070 mutex_enter(&newpg->interlock);
1071 mutex_enter(&oldpg->interlock);
1072 }
1073 newpg->uobject = uobj;
1074 uvm_pageinsert_object(uobj, newpg);
1075 uvm_pageremove_object(uobj, oldpg);
1076 mutex_exit(&oldpg->interlock);
1077 mutex_exit(&newpg->interlock);
1078 }
1079
1080 /*
1081 * uvm_pagerealloc: reallocate a page from one object to another
1082 *
1083 * => both objects must be locked
1084 * => both interlocks must be held
1085 */
1086
1087 void
1088 uvm_pagerealloc(struct vm_page *pg, struct uvm_object *newobj, voff_t newoff)
1089 {
1090 /*
1091 * remove it from the old object
1092 */
1093
1094 if (pg->uobject) {
1095 uvm_pageremove_tree(pg->uobject, pg);
1096 uvm_pageremove_object(pg->uobject, pg);
1097 }
1098
1099 /*
1100 * put it in the new object
1101 */
1102
1103 if (newobj) {
1104 /*
1105 * XXX we have no in-tree users of this functionality
1106 */
1107 panic("uvm_pagerealloc: no impl");
1108 }
1109 }
1110
1111 #ifdef DEBUG
1112 /*
1113 * check if page is zero-filled
1114 */
1115 void
1116 uvm_pagezerocheck(struct vm_page *pg)
1117 {
1118 int *p, *ep;
1119
1120 KASSERT(uvm_zerocheckkva != 0);
1121 KASSERT(mutex_owned(&uvm_fpageqlock));
1122
1123 /*
1124 * XXX assuming pmap_kenter_pa and pmap_kremove never call the
1125 * uvm page allocator.
1126 *
1127 * it might be better to have a "CPU-local temporary map" pmap interface.
1128 */
1129 pmap_kenter_pa(uvm_zerocheckkva, VM_PAGE_TO_PHYS(pg), VM_PROT_READ, 0);
1130 p = (int *)uvm_zerocheckkva;
1131 ep = (int *)((char *)p + PAGE_SIZE);
1132 pmap_update(pmap_kernel());
1133 while (p < ep) {
1134 if (*p != 0)
1135 panic("PG_ZERO page isn't zero-filled");
1136 p++;
1137 }
1138 pmap_kremove(uvm_zerocheckkva, PAGE_SIZE);
1139 /*
1140 * pmap_update() is not necessary here because no one except us
1141 * uses this VA.
1142 */
1143 }
1144 #endif /* DEBUG */
1145
1146 /*
1147 * uvm_pagefree: free page
1148 *
1149 * => erase page's identity (i.e. remove from object)
1150 * => put page on free list
1151 * => caller must lock owning object (either anon or uvm_object)
1152 * => assumes all valid mappings of pg are gone
1153 */
1154
1155 void
1156 uvm_pagefree(struct vm_page *pg)
1157 {
1158 struct pgflist *pgfl;
1159 struct uvm_cpu *ucpu;
1160 int index, color, queue;
1161 bool iszero, locked;
1162
1163 #ifdef DEBUG
1164 if (pg->uobject == (void *)0xdeadbeef &&
1165 pg->uanon == (void *)0xdeadbeef) {
1166 panic("uvm_pagefree: freeing free page %p", pg);
1167 }
1168 #endif /* DEBUG */
1169
1170 KASSERT((pg->flags & PG_PAGEOUT) == 0);
1171 KASSERT(!(pg->flags & PG_FREE));
1172 //KASSERT(mutex_owned(&uvm_pageqlock) || !uvmpdpol_pageisqueued_p(pg));
1173 KASSERT(pg->uobject == NULL || mutex_owned(pg->uobject->vmobjlock));
1174 KASSERT(pg->uobject != NULL || pg->uanon == NULL ||
1175 mutex_owned(pg->uanon->an_lock));
1176
1177 /*
1178 * remove the page from the object's tree before acquiring any page
1179 * interlocks: this can acquire locks to free radixtree nodes.
1180 */
1181 if (pg->uobject != NULL) {
1182 uvm_pageremove_tree(pg->uobject, pg);
1183 }
1184
1185 /*
1186 * if the page is loaned, resolve the loan instead of freeing.
1187 */
1188
1189 if (pg->loan_count) {
1190 KASSERT(pg->wire_count == 0);
1191
1192 /*
1193 * if the page is owned by an anon then we just want to
1194 * drop anon ownership. the kernel will free the page when
1195 * it is done with it. if the page is owned by an object,
1196 * remove it from the object and mark it dirty for the benefit
1197 * of possible anon owners.
1198 *
1199 * regardless of previous ownership, wakeup any waiters,
1200 * unbusy the page, and we're done.
1201 */
1202
1203 mutex_enter(&pg->interlock);
1204 locked = true;
1205 if (pg->uobject != NULL) {
1206 uvm_pageremove_object(pg->uobject, pg);
1207 pg->flags &= ~PG_CLEAN;
1208 } else if (pg->uanon != NULL) {
1209 if ((pg->flags & PG_ANON) == 0) {
1210 pg->loan_count--;
1211 } else {
1212 pg->flags &= ~PG_ANON;
1213 cpu_count(CPU_COUNT_ANONPAGES, -1);
1214 }
1215 pg->uanon->an_page = NULL;
1216 pg->uanon = NULL;
1217 }
1218 if (pg->flags & PG_WANTED) {
1219 wakeup(pg);
1220 }
1221 pg->flags &= ~(PG_WANTED|PG_BUSY|PG_RELEASED|PG_PAGER1);
1222 #ifdef UVM_PAGE_TRKOWN
1223 pg->owner_tag = NULL;
1224 #endif
1225 if (pg->loan_count) {
1226 KASSERT(pg->uobject == NULL);
1227 mutex_exit(&pg->interlock);
1228 if (pg->uanon == NULL) {
1229 uvm_pagedequeue(pg);
1230 }
1231 return;
1232 }
1233 } else if (pg->uobject != NULL || pg->uanon != NULL ||
1234 pg->wire_count != 0) {
1235 mutex_enter(&pg->interlock);
1236 locked = true;
1237 } else {
1238 locked = false;
1239 }
1240
1241 /*
1242 * remove page from its object or anon.
1243 */
1244 if (pg->uobject != NULL) {
1245 uvm_pageremove_object(pg->uobject, pg);
1246 } else if (pg->uanon != NULL) {
1247 pg->uanon->an_page = NULL;
1248 pg->uanon = NULL;
1249 cpu_count(CPU_COUNT_ANONPAGES, -1);
1250 }
1251
1252 /*
1253 * if the page was wired, unwire it now.
1254 */
1255
1256 if (pg->wire_count) {
1257 pg->wire_count = 0;
1258 atomic_dec_uint(&uvmexp.wired);
1259 }
1260 if (locked) {
1261 mutex_exit(&pg->interlock);
1262 }
1263
1264 /*
1265 * now remove the page from the queues.
1266 */
1267 uvm_pagedequeue(pg);
1268
1269 /*
1270 * and put on free queue
1271 */
1272
1273 iszero = (pg->flags & PG_ZERO);
1274 index = uvm_page_lookup_freelist(pg);
1275 color = VM_PGCOLOR_BUCKET(pg);
1276 queue = (iszero ? PGFL_ZEROS : PGFL_UNKNOWN);
1277
1278 #ifdef DEBUG
1279 pg->uobject = (void *)0xdeadbeef;
1280 pg->uanon = (void *)0xdeadbeef;
1281 #endif
1282
1283 mutex_spin_enter(&uvm_fpageqlock);
1284 pg->flags = PG_FREE;
1285
1286 #ifdef DEBUG
1287 if (iszero)
1288 uvm_pagezerocheck(pg);
1289 #endif /* DEBUG */
1290
1291
1292 /* global list */
1293 pgfl = &uvm.page_free[index].pgfl_buckets[color].pgfl_queues[queue];
1294 LIST_INSERT_HEAD(pgfl, pg, pageq.list);
1295 uvmexp.free++;
1296 if (iszero) {
1297 CPU_COUNT(CPU_COUNT_ZEROPAGES, 1);
1298 }
1299
1300 /* per-cpu list */
1301 ucpu = curcpu()->ci_data.cpu_uvm;
1302 pg->offset = (uintptr_t)ucpu;
1303 pgfl = &ucpu->page_free[index].pgfl_buckets[color].pgfl_queues[queue];
1304 LIST_INSERT_HEAD(pgfl, pg, listq.list);
1305 ucpu->pages[queue]++;
1306 if (ucpu->pages[PGFL_ZEROS] < ucpu->pages[PGFL_UNKNOWN]) {
1307 ucpu->page_idle_zero = vm_page_zero_enable;
1308 }
1309
1310 mutex_spin_exit(&uvm_fpageqlock);
1311 }
1312
1313 /*
1314 * uvm_page_unbusy: unbusy an array of pages.
1315 *
1316 * => pages must either all belong to the same object, or all belong to anons.
1317 * => if pages are object-owned, object must be locked.
1318 * => if pages are anon-owned, anons must be locked.
1319 * => caller must make sure that anon-owned pages are not PG_RELEASED.
1320 */
1321
1322 void
1323 uvm_page_unbusy(struct vm_page **pgs, int npgs)
1324 {
1325 struct vm_page *pg;
1326 int i;
1327 UVMHIST_FUNC("uvm_page_unbusy"); UVMHIST_CALLED(ubchist);
1328
1329 for (i = 0; i < npgs; i++) {
1330 pg = pgs[i];
1331 if (pg == NULL || pg == PGO_DONTCARE) {
1332 continue;
1333 }
1334
1335 KASSERT(uvm_page_locked_p(pg));
1336 KASSERT(pg->flags & PG_BUSY);
1337 KASSERT((pg->flags & PG_PAGEOUT) == 0);
1338 if (pg->flags & PG_WANTED) {
1339 /* XXXAD thundering herd problem. */
1340 wakeup(pg);
1341 }
1342 if (pg->flags & PG_RELEASED) {
1343 UVMHIST_LOG(ubchist, "releasing pg %#jx",
1344 (uintptr_t)pg, 0, 0, 0);
1345 KASSERT(pg->uobject != NULL ||
1346 (pg->uanon != NULL && pg->uanon->an_ref > 0));
1347 pg->flags &= ~PG_RELEASED;
1348 uvm_pagefree(pg);
1349 } else {
1350 UVMHIST_LOG(ubchist, "unbusying pg %#jx",
1351 (uintptr_t)pg, 0, 0, 0);
1352 KASSERT((pg->flags & PG_FAKE) == 0);
1353 pg->flags &= ~(PG_WANTED|PG_BUSY);
1354 UVM_PAGE_OWN(pg, NULL);
1355 }
1356 }
1357 }
1358
1359 #if defined(UVM_PAGE_TRKOWN)
1360 /*
1361 * uvm_page_own: set or release page ownership
1362 *
1363 * => this is a debugging function that keeps track of who sets PG_BUSY
1364 * and where they do it. it can be used to track down problems
1365 * such as a process setting "PG_BUSY" and never releasing it.
1366 * => page's object [if any] must be locked
1367 * => if "tag" is NULL then we are releasing page ownership
1368 */
1369 void
1370 uvm_page_own(struct vm_page *pg, const char *tag)
1371 {
1372
1373 KASSERT((pg->flags & (PG_PAGEOUT|PG_RELEASED)) == 0);
1374 KASSERT((pg->flags & PG_WANTED) == 0);
1375 KASSERT(uvm_page_locked_p(pg));
1376
1377 /* gain ownership? */
1378 if (tag) {
1379 KASSERT((pg->flags & PG_BUSY) != 0);
1380 if (pg->owner_tag) {
1381 printf("uvm_page_own: page %p already owned "
1382 "by proc %d [%s]\n", pg,
1383 pg->owner, pg->owner_tag);
1384 panic("uvm_page_own");
1385 }
1386 pg->owner = curproc->p_pid;
1387 pg->lowner = curlwp->l_lid;
1388 pg->owner_tag = tag;
1389 return;
1390 }
1391
1392 /* drop ownership */
1393 KASSERT((pg->flags & PG_BUSY) == 0);
1394 if (pg->owner_tag == NULL) {
1395 printf("uvm_page_own: dropping ownership of an non-owned "
1396 "page (%p)\n", pg);
1397 panic("uvm_page_own");
1398 }
1399 if (!uvmpdpol_pageisqueued_p(pg)) {
1400 KASSERT((pg->uanon == NULL && pg->uobject == NULL) ||
1401 pg->wire_count > 0);
1402 } else {
1403 KASSERT(pg->wire_count == 0);
1404 }
1405 pg->owner_tag = NULL;
1406 }
1407 #endif
1408
1409 /*
1410 * uvm_pageidlezero: zero free pages while the system is idle.
1411 *
1412 * => try to complete one color bucket at a time, to reduce our impact
1413 * on the CPU cache.
1414 * => we loop until we either reach the target or there is a lwp ready
1415 * to run, or MD code detects a reason to break early.
1416 */
1417 void
1418 uvm_pageidlezero(void)
1419 {
1420 struct vm_page *pg;
1421 struct pgfreelist *pgfl, *gpgfl;
1422 struct uvm_cpu *ucpu;
1423 int free_list, firstbucket, nextbucket;
1424 bool lcont = false;
1425
1426 ucpu = curcpu()->ci_data.cpu_uvm;
1427 if (!ucpu->page_idle_zero ||
1428 ucpu->pages[PGFL_UNKNOWN] < uvmexp.ncolors) {
1429 ucpu->page_idle_zero = false;
1430 return;
1431 }
1432 if (!mutex_tryenter(&uvm_fpageqlock)) {
1433 /* Contention: let other CPUs use the lock. */
1434 return;
1435 }
1436 firstbucket = ucpu->page_free_nextcolor;
1437 nextbucket = firstbucket;
1438 do {
1439 for (free_list = 0; free_list < VM_NFREELIST; free_list++) {
1440 if (sched_curcpu_runnable_p()) {
1441 goto quit;
1442 }
1443 pgfl = &ucpu->page_free[free_list];
1444 gpgfl = &uvm.page_free[free_list];
1445 while ((pg = LIST_FIRST(&pgfl->pgfl_buckets[
1446 nextbucket].pgfl_queues[PGFL_UNKNOWN])) != NULL) {
1447 if (lcont || sched_curcpu_runnable_p()) {
1448 goto quit;
1449 }
1450 LIST_REMOVE(pg, pageq.list); /* global list */
1451 LIST_REMOVE(pg, listq.list); /* per-cpu list */
1452 ucpu->pages[PGFL_UNKNOWN]--;
1453 uvmexp.free--;
1454 KASSERT(pg->flags == PG_FREE);
1455 pg->flags = 0;
1456 mutex_spin_exit(&uvm_fpageqlock);
1457 #ifdef PMAP_PAGEIDLEZERO
1458 if (!PMAP_PAGEIDLEZERO(VM_PAGE_TO_PHYS(pg))) {
1459
1460 /*
1461 * The machine-dependent code detected
1462 * some reason for us to abort zeroing
1463 * pages, probably because there is a
1464 * process now ready to run.
1465 */
1466
1467 mutex_spin_enter(&uvm_fpageqlock);
1468 pg->flags = PG_FREE;
1469 LIST_INSERT_HEAD(&gpgfl->pgfl_buckets[
1470 nextbucket].pgfl_queues[
1471 PGFL_UNKNOWN], pg, pageq.list);
1472 LIST_INSERT_HEAD(&pgfl->pgfl_buckets[
1473 nextbucket].pgfl_queues[
1474 PGFL_UNKNOWN], pg, listq.list);
1475 ucpu->pages[PGFL_UNKNOWN]++;
1476 uvmexp.free++;
1477 CPU_COUNT(CPU_COUNT_ZEROABORTS, 1);
1478 goto quit;
1479 }
1480 #else
1481 pmap_zero_page(VM_PAGE_TO_PHYS(pg));
1482 #endif /* PMAP_PAGEIDLEZERO */
1483 if (!mutex_tryenter(&uvm_fpageqlock)) {
1484 lcont = true;
1485 mutex_spin_enter(&uvm_fpageqlock);
1486 } else {
1487 lcont = false;
1488 }
1489 pg->flags = PG_FREE | PG_ZERO;
1490 LIST_INSERT_HEAD(&gpgfl->pgfl_buckets[
1491 nextbucket].pgfl_queues[PGFL_ZEROS],
1492 pg, pageq.list);
1493 LIST_INSERT_HEAD(&pgfl->pgfl_buckets[
1494 nextbucket].pgfl_queues[PGFL_ZEROS],
1495 pg, listq.list);
1496 ucpu->pages[PGFL_ZEROS]++;
1497 uvmexp.free++;
1498 CPU_COUNT(CPU_COUNT_ZEROPAGES, 1);
1499 }
1500 }
1501 if (ucpu->pages[PGFL_UNKNOWN] < uvmexp.ncolors) {
1502 break;
1503 }
1504 nextbucket = (nextbucket + 1) & uvmexp.colormask;
1505 } while (nextbucket != firstbucket);
1506 ucpu->page_idle_zero = false;
1507 quit:
1508 mutex_spin_exit(&uvm_fpageqlock);
1509 }
1510
1511 /*
1512 * uvm_pagelookup: look up a page
1513 *
1514 * => caller should lock object to keep someone from pulling the page
1515 * out from under it
1516 */
1517
1518 struct vm_page *
1519 uvm_pagelookup(struct uvm_object *obj, voff_t off)
1520 {
1521 struct vm_page *pg;
1522
1523 /* No - used from DDB. KASSERT(mutex_owned(obj->vmobjlock)); */
1524
1525 pg = radix_tree_lookup_node(&obj->uo_pages, off >> PAGE_SHIFT);
1526
1527 KASSERT(pg == NULL || obj->uo_npages != 0);
1528 KASSERT(pg == NULL || (pg->flags & (PG_RELEASED|PG_PAGEOUT)) == 0 ||
1529 (pg->flags & PG_BUSY) != 0);
1530 return pg;
1531 }
1532
1533 /*
1534 * uvm_pagewire: wire the page, thus removing it from the daemon's grasp
1535 *
1536 * => caller must lock objects
1537 */
1538
1539 void
1540 uvm_pagewire(struct vm_page *pg)
1541 {
1542
1543 KASSERT(uvm_page_locked_p(pg));
1544 #if defined(READAHEAD_STATS)
1545 if ((pg->flags & PG_READAHEAD) != 0) {
1546 uvm_ra_hit.ev_count++;
1547 pg->flags &= ~PG_READAHEAD;
1548 }
1549 #endif /* defined(READAHEAD_STATS) */
1550 if (pg->wire_count == 0) {
1551 uvm_pagedequeue(pg);
1552 atomic_inc_uint(&uvmexp.wired);
1553 }
1554 mutex_enter(&pg->interlock);
1555 pg->wire_count++;
1556 mutex_exit(&pg->interlock);
1557 KASSERT(pg->wire_count > 0); /* detect wraparound */
1558 }
1559
1560 /*
1561 * uvm_pageunwire: unwire the page.
1562 *
1563 * => activate if wire count goes to zero.
1564 * => caller must lock objects
1565 */
1566
1567 void
1568 uvm_pageunwire(struct vm_page *pg)
1569 {
1570
1571 KASSERT(uvm_page_locked_p(pg));
1572 KASSERT(pg->wire_count != 0);
1573 KASSERT(!uvmpdpol_pageisqueued_p(pg));
1574 mutex_enter(&pg->interlock);
1575 pg->wire_count--;
1576 mutex_exit(&pg->interlock);
1577 if (pg->wire_count == 0) {
1578 uvm_pageactivate(pg);
1579 KASSERT(uvmexp.wired != 0);
1580 atomic_dec_uint(&uvmexp.wired);
1581 }
1582 }
1583
1584 /*
1585 * uvm_pagedeactivate: deactivate page
1586 *
1587 * => caller must lock objects
1588 * => caller must check to make sure page is not wired
1589 * => object that page belongs to must be locked (so we can adjust pg->flags)
1590 * => caller must clear the reference on the page before calling
1591 */
1592
1593 void
1594 uvm_pagedeactivate(struct vm_page *pg)
1595 {
1596
1597 KASSERT(uvm_page_locked_p(pg));
1598 if (pg->wire_count == 0) {
1599 KASSERT(uvmpdpol_pageisqueued_p(pg));
1600 uvmpdpol_pagedeactivate(pg);
1601 }
1602 }
1603
1604 /*
1605 * uvm_pageactivate: activate page
1606 *
1607 * => caller must lock objects
1608 */
1609
1610 void
1611 uvm_pageactivate(struct vm_page *pg)
1612 {
1613
1614 KASSERT(uvm_page_locked_p(pg));
1615 #if defined(READAHEAD_STATS)
1616 if ((pg->flags & PG_READAHEAD) != 0) {
1617 uvm_ra_hit.ev_count++;
1618 pg->flags &= ~PG_READAHEAD;
1619 }
1620 #endif /* defined(READAHEAD_STATS) */
1621 if (pg->wire_count == 0) {
1622 uvmpdpol_pageactivate(pg);
1623 }
1624 }
1625
1626 /*
1627 * uvm_pagedequeue: remove a page from any paging queue
1628 *
1629 * => caller must lock objects
1630 */
1631 void
1632 uvm_pagedequeue(struct vm_page *pg)
1633 {
1634
1635 KASSERT(uvm_page_locked_p(pg));
1636 if (uvmpdpol_pageisqueued_p(pg)) {
1637 uvmpdpol_pagedequeue(pg);
1638 }
1639 }
1640
1641 /*
1642 * uvm_pageenqueue: add a page to a paging queue without activating.
1643 * used where a page is not really demanded (yet), e.g. read-ahead.
1644 *
1645 * => caller must lock objects
1646 */
1647 void
1648 uvm_pageenqueue(struct vm_page *pg)
1649 {
1650
1651 KASSERT(uvm_page_locked_p(pg));
1652 if (pg->wire_count == 0 && !uvmpdpol_pageisqueued_p(pg)) {
1653 uvmpdpol_pageenqueue(pg);
1654 }
1655 }
1656
1657 /*
1658 * uvm_pagezero: zero fill a page
1659 *
1660 * => if page is part of an object then the object should be locked
1661 * to protect pg->flags.
1662 */
1663
1664 void
1665 uvm_pagezero(struct vm_page *pg)
1666 {
1667 pg->flags &= ~PG_CLEAN;
1668 pmap_zero_page(VM_PAGE_TO_PHYS(pg));
1669 }
1670
1671 /*
1672 * uvm_pagecopy: copy a page
1673 *
1674 * => if page is part of an object then the object should be locked
1675 * to protect pg->flags.
1676 */
1677
1678 void
1679 uvm_pagecopy(struct vm_page *src, struct vm_page *dst)
1680 {
1681
1682 dst->flags &= ~PG_CLEAN;
1683 pmap_copy_page(VM_PAGE_TO_PHYS(src), VM_PAGE_TO_PHYS(dst));
1684 }
1685
1686 /*
1687 * uvm_pageismanaged: test to see whether a page (specified by PA) is managed.
1688 */
1689
1690 bool
1691 uvm_pageismanaged(paddr_t pa)
1692 {
1693
1694 return (uvm_physseg_find(atop(pa), NULL) != UVM_PHYSSEG_TYPE_INVALID);
1695 }
1696
1697 /*
1698 * uvm_page_lookup_freelist: look up the free list for the specified page
1699 */
1700
1701 int
1702 uvm_page_lookup_freelist(struct vm_page *pg)
1703 {
1704 uvm_physseg_t upm;
1705
1706 upm = uvm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), NULL);
1707 KASSERT(upm != UVM_PHYSSEG_TYPE_INVALID);
1708 return uvm_physseg_get_free_list(upm);
1709 }
1710
1711 /*
1712 * uvm_page_locked_p: return true if object associated with page is
1713 * locked. this is a weak check for runtime assertions only.
1714 */
1715
1716 bool
1717 uvm_page_locked_p(struct vm_page *pg)
1718 {
1719
1720 if (pg->uobject != NULL) {
1721 return mutex_owned(pg->uobject->vmobjlock);
1722 }
1723 if (pg->uanon != NULL) {
1724 return mutex_owned(pg->uanon->an_lock);
1725 }
1726 return true;
1727 }
1728
1729 #ifdef PMAP_DIRECT
1730 /*
1731 * Call pmap to translate a physical address into a virtual one and run a
1732 * callback on it. Used to avoid actually mapping the pages; the pmap most
1733 * likely uses a direct map or equivalent.
1734 */
1735 int
1736 uvm_direct_process(struct vm_page **pgs, u_int npages, voff_t off, vsize_t len,
1737 int (*process)(void *, size_t, void *), void *arg)
1738 {
1739 int error = 0;
1740 paddr_t pa;
1741 size_t todo;
1742 voff_t pgoff = (off & PAGE_MASK);
1743 struct vm_page *pg;
1744
1745 KASSERT(npages > 0 && len > 0);
1746
1747 for (int i = 0; i < npages; i++) {
1748 pg = pgs[i];
1749
1750 KASSERT(len > 0);
1751
1752 /*
1753 * Caller is responsible for ensuring all the pages are
1754 * available.
1755 */
1756 KASSERT(pg != NULL && pg != PGO_DONTCARE);
1757
1758 pa = VM_PAGE_TO_PHYS(pg);
1759 todo = MIN(len, PAGE_SIZE - pgoff);
1760
1761 error = pmap_direct_process(pa, pgoff, todo, process, arg);
1762 if (error)
1763 break;
1764
1765 pgoff = 0;
1766 len -= todo;
1767 }
1768
1769 KASSERTMSG(error != 0 || len == 0, "len %lu != 0 for non-error", len);
1770 return error;
1771 }
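/*
 * Illustrative callback (a sketch; the names are hypothetical): a
 * "process" function that copies each mapped window into a caller
 * supplied buffer could look like
 *
 *	static int
 *	copy_cb(void *va, size_t len, void *arg)
 *	{
 *		struct copy_arg *ca = arg;
 *
 *		memcpy(ca->dst, va, len);
 *		ca->dst += len;
 *		return 0;
 *	}
 *
 * and would be passed as "process" with a pointer to its argument
 * structure as "arg".
 */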
1772 #endif /* PMAP_DIRECT */
1773
1774 #if defined(DDB) || defined(DEBUGPRINT)
1775
1776 /*
1777 * uvm_page_printit: actually print the page
1778 */
1779
1780 static const char page_flagbits[] = UVM_PGFLAGBITS;
1781
1782 void
1783 uvm_page_printit(struct vm_page *pg, bool full,
1784 void (*pr)(const char *, ...))
1785 {
1786 struct vm_page *tpg;
1787 struct uvm_object *uobj;
1788 struct pgflist *pgl;
1789 char pgbuf[128];
1790
1791 (*pr)("PAGE %p:\n", pg);
1792 snprintb(pgbuf, sizeof(pgbuf), page_flagbits, pg->flags);
1793 (*pr)(" flags=%s, pqflags=%x, wire_count=%d, pa=0x%lx\n",
1794 pgbuf, pg->pqflags, pg->wire_count, (long)VM_PAGE_TO_PHYS(pg));
1795 (*pr)(" uobject=%p, uanon=%p, offset=0x%llx loan_count=%d\n",
1796 pg->uobject, pg->uanon, (long long)pg->offset, pg->loan_count);
1797 #if defined(UVM_PAGE_TRKOWN)
1798 if (pg->flags & PG_BUSY)
1799 (*pr)(" owning process = %d, tag=%s\n",
1800 pg->owner, pg->owner_tag);
1801 else
1802 (*pr)(" page not busy, no owner\n");
1803 #else
1804 (*pr)(" [page ownership tracking disabled]\n");
1805 #endif
1806
1807 if (!full)
1808 return;
1809
1810 /* cross-verify object/anon */
1811 if ((pg->flags & PG_FREE) == 0) {
1812 if (pg->flags & PG_ANON) {
1813 if (pg->uanon == NULL || pg->uanon->an_page != pg)
1814 (*pr)(" >>> ANON DOES NOT POINT HERE <<< (%p)\n",
1815 (pg->uanon) ? pg->uanon->an_page : NULL);
1816 else
1817 (*pr)(" anon backpointer is OK\n");
1818 } else {
1819 uobj = pg->uobject;
1820 if (uobj) {
1821 (*pr)(" checking object list\n");
1822 tpg = uvm_pagelookup(uobj, pg->offset);
1823 if (tpg)
1824 (*pr)(" page found on object list\n");
1825 else
1826 (*pr)(" >>> PAGE NOT FOUND ON OBJECT LIST! <<<\n");
1827 }
1828 }
1829 }
1830
1831 /* cross-verify page queue */
1832 if (pg->flags & PG_FREE) {
1833 int fl = uvm_page_lookup_freelist(pg);
1834 int color = VM_PGCOLOR_BUCKET(pg);
1835 pgl = &uvm.page_free[fl].pgfl_buckets[color].pgfl_queues[
1836 ((pg)->flags & PG_ZERO) ? PGFL_ZEROS : PGFL_UNKNOWN];
1837 } else {
1838 pgl = NULL;
1839 }
1840
1841 if (pgl) {
1842 (*pr)(" checking pageq list\n");
1843 LIST_FOREACH(tpg, pgl, pageq.list) {
1844 if (tpg == pg) {
1845 break;
1846 }
1847 }
1848 if (tpg)
1849 (*pr)(" page found on pageq list\n");
1850 else
1851 (*pr)(" >>> PAGE NOT FOUND ON PAGEQ LIST! <<<\n");
1852 }
1853 }
1854
1855 /*
1856 * uvm_page_printall - print a summary of all managed pages
1857 */
1858
1859 void
1860 uvm_page_printall(void (*pr)(const char *, ...))
1861 {
1862 uvm_physseg_t i;
1863 paddr_t pfn;
1864 struct vm_page *pg;
1865
1866 (*pr)("%18s %4s %4s %18s %18s"
1867 #ifdef UVM_PAGE_TRKOWN
1868 " OWNER"
1869 #endif
1870 "\n", "PAGE", "FLAG", "PQ", "UOBJECT", "UANON");
1871 for (i = uvm_physseg_get_first();
1872 uvm_physseg_valid_p(i);
1873 i = uvm_physseg_get_next(i)) {
1874 for (pfn = uvm_physseg_get_start(i);
1875 pfn < uvm_physseg_get_end(i);
1876 pfn++) {
1877 pg = PHYS_TO_VM_PAGE(ptoa(pfn));
1878
1879 (*pr)("%18p %04x %08x %18p %18p",
1880 pg, pg->flags, pg->pqflags, pg->uobject,
1881 pg->uanon);
1882 #ifdef UVM_PAGE_TRKOWN
1883 if (pg->flags & PG_BUSY)
1884 (*pr)(" %d [%s]", pg->owner, pg->owner_tag);
1885 #endif
1886 (*pr)("\n");
1887 }
1888 }
1889 }
1890
1891 #endif /* DDB || DEBUGPRINT */
1892