/*	$NetBSD: vm.c,v 1.187 2020/03/17 18:31:38 ad Exp $	*/

/*
 * Copyright (c) 2007-2011 Antti Kantee.  All Rights Reserved.
 *
 * Development of this software was supported by
 * The Finnish Cultural Foundation and the Research Foundation of
 * The Helsinki University of Technology.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Virtual memory emulation routines.
 */

/*
 * XXX: we abuse pg->uanon for the virtual address of the storage
 * for each page.  phys_addr would fit the job description better,
 * except that it will create unnecessary lossage on some platforms
 * due to not being a pointer type.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vm.c,v 1.187 2020/03/17 18:31:38 ad Exp $");

#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/vmem.h>
#include <sys/mman.h>
#include <sys/null.h>
#include <sys/vnode.h>
#include <sys/radixtree.h>

#include <machine/pmap.h>

#include <uvm/uvm.h>
#include <uvm/uvm_ddb.h>
#include <uvm/uvm_pdpolicy.h>
#include <uvm/uvm_prot.h>
#include <uvm/uvm_readahead.h>
#include <uvm/uvm_device.h>

#include <rump-sys/kern.h>
#include <rump-sys/vfs.h>

#include <rump/rumpuser.h>

kmutex_t vmpage_lruqueue_lock;	/* non-free page lock */
kmutex_t uvm_fpageqlock;	/* free page lock, non-gpl license */
kmutex_t uvm_swap_data_lock;

struct uvmexp uvmexp;
struct uvm uvm;

#ifdef __uvmexp_pagesize
const int * const uvmexp_pagesize = &uvmexp.pagesize;
const int * const uvmexp_pagemask = &uvmexp.pagemask;
const int * const uvmexp_pageshift = &uvmexp.pageshift;
#endif

static struct vm_map kernel_map_store;
struct vm_map *kernel_map = &kernel_map_store;

static struct vm_map module_map_store;
extern struct vm_map *module_map;

static struct pmap pmap_kernel;
struct pmap rump_pmap_local;
struct pmap *const kernel_pmap_ptr = &pmap_kernel;

vmem_t *kmem_arena;
vmem_t *kmem_va_arena;

static unsigned int pdaemon_waiters;
static kmutex_t pdaemonmtx;
static kcondvar_t pdaemoncv, oomwait;

/* all local non-proc0 processes share this vmspace */
struct vmspace *rump_vmspace_local;

unsigned long rump_physmemlimit = RUMPMEM_UNLIMITED;
static unsigned long pdlimit = RUMPMEM_UNLIMITED; /* page daemon memlimit */
static unsigned long curphysmem;
static unsigned long dddlim;		/* 90% of memory limit used */
#define NEED_PAGEDAEMON() \
	(rump_physmemlimit != RUMPMEM_UNLIMITED && curphysmem > dddlim)
#define PDRESERVE (2*MAXPHYS)
/*
 * Try to free two pages worth of pages from objects.
 * If this successfully frees a full page cache page, we'll
 * free the released page plus PAGE_SIZE/sizeof(vm_page).
 */
#define PAGEDAEMON_OBJCHUNK (2*PAGE_SIZE / sizeof(struct vm_page))
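
/*
 * Rough numbers for illustration only; the actual values depend on the
 * platform's PAGE_SIZE, MAXPHYS and sizeof(struct vm_page).  With 4kB
 * pages and a vm_page of roughly 128 bytes, PAGEDAEMON_OBJCHUNK comes
 * to about 64 pages per pass.  Similarly, with e.g. RUMP_MEMLIMIT=16m
 * (parsed in uvm_init() below), PDRESERVE is subtracted first and
 * dddlim is 90% of what remains, so the pagedaemon starts getting
 * interested once allocations exceed roughly 14MB.
 */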

/*
 * Keep a list of least recently used pages.  Since the only way a
 * rump kernel can "access" a page is via lookup, we put the page
 * at the back of queue every time a lookup for it is done.  If the
 * page is in front of this global queue and we're short of memory,
 * it's a candidate for pageout.
 */
static struct pglist vmpage_lruqueue;
static unsigned vmpage_onqueue;

/*
 * vm pages
 */

static int
pgctor(void *arg, void *obj, int flags)
{
	struct vm_page *pg = obj;

	memset(pg, 0, sizeof(*pg));
	pg->uanon = rump_hypermalloc(PAGE_SIZE, PAGE_SIZE,
	    (flags & PR_WAITOK) == PR_WAITOK, "pgalloc");
	return pg->uanon == NULL;
}

static void
pgdtor(void *arg, void *obj)
{
	struct vm_page *pg = obj;

	rump_hyperfree(pg->uanon, PAGE_SIZE);
}

static struct pool_cache pagecache;

/*
 * Called with the object locked.  We don't support anons.
 */
struct vm_page *
uvm_pagealloc_strat(struct uvm_object *uobj, voff_t off, struct vm_anon *anon,
    int flags, int strat, int free_list)
{
	struct vm_page *pg;

	KASSERT(uobj && rw_write_held(uobj->vmobjlock));
	KASSERT(anon == NULL);

	pg = pool_cache_get(&pagecache, PR_NOWAIT);
	if (__predict_false(pg == NULL)) {
		return NULL;
	}
	mutex_init(&pg->interlock, MUTEX_DEFAULT, IPL_NONE);

	pg->offset = off;
	pg->uobject = uobj;

	if (UVM_OBJ_IS_VNODE(uobj) && uobj->uo_npages == 0) {
		struct vnode *vp = (struct vnode *)uobj;
		mutex_enter(vp->v_interlock);
		vp->v_iflag |= VI_PAGES;
		mutex_exit(vp->v_interlock);
	}

	if (radix_tree_insert_node(&uobj->uo_pages, off >> PAGE_SHIFT,
	    pg) != 0) {
		pool_cache_put(&pagecache, pg);
		return NULL;
	}
	uobj->uo_npages++;

	pg->flags = PG_CLEAN|PG_BUSY|PG_FAKE;
	if (flags & UVM_PGA_ZERO) {
		uvm_pagezero(pg);
	}

	/*
	 * Don't put anons on the LRU page queue.  We can't flush them
	 * (there's no concept of swap in a rump kernel), so no reason
	 * to bother with them.
	 */
	if (!UVM_OBJ_IS_AOBJ(uobj)) {
		atomic_inc_uint(&vmpage_onqueue);
		mutex_enter(&vmpage_lruqueue_lock);
		TAILQ_INSERT_TAIL(&vmpage_lruqueue, pg, pageq.queue);
		mutex_exit(&vmpage_lruqueue_lock);
	}

	return pg;
}
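
/*
 * Caller pattern, as a rough sketch only (not taken verbatim from any
 * particular consumer): the object lock is held across the allocation,
 * page contents are accessed through pg->uanon in this emulation, and
 * PG_BUSY is cleared via uvm_page_unbusy() when the caller is done.
 * uvm_pagealloc() is the usual wrapper macro for uvm_pagealloc_strat().
 *
 *	rw_enter(uobj->vmobjlock, RW_WRITER);
 *	pg = uvm_pagealloc(uobj, off, NULL, UVM_PGA_ZERO);
 *	if (pg != NULL) {
 *		... fill the page via pg->uanon ...
 *		uvm_page_unbusy(&pg, 1);
 *	}
 *	rw_exit(uobj->vmobjlock);
 */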

/*
 * Release a page.
 *
 * Called with the vm object locked.
 */
void
uvm_pagefree(struct vm_page *pg)
{
	struct uvm_object *uobj = pg->uobject;
	struct vm_page *pg2 __unused;

	KASSERT(rw_write_held(uobj->vmobjlock));

	mutex_enter(&pg->interlock);
	if (pg->pqflags & PQ_WANTED) {
		pg->pqflags &= ~PQ_WANTED;
		wakeup(pg);
	}
	mutex_exit(&pg->interlock);

	uobj->uo_npages--;
	pg2 = radix_tree_remove_node(&uobj->uo_pages, pg->offset >> PAGE_SHIFT);
	KASSERT(pg == pg2);

	if (!UVM_OBJ_IS_AOBJ(uobj)) {
		mutex_enter(&vmpage_lruqueue_lock);
		TAILQ_REMOVE(&vmpage_lruqueue, pg, pageq.queue);
		mutex_exit(&vmpage_lruqueue_lock);
		atomic_dec_uint(&vmpage_onqueue);
	}

	if (UVM_OBJ_IS_VNODE(uobj) && uobj->uo_npages == 0) {
		struct vnode *vp = (struct vnode *)uobj;
		mutex_enter(vp->v_interlock);
		vp->v_iflag &= ~VI_PAGES;
		mutex_exit(vp->v_interlock);
	}

	mutex_destroy(&pg->interlock);
	pool_cache_put(&pagecache, pg);
}

void
uvm_pagezero(struct vm_page *pg)
{

	uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
	memset((void *)pg->uanon, 0, PAGE_SIZE);
}

/*
 * uvm_page_owner_locked_p: return true if object associated with page is
 * locked.  this is a weak check for runtime assertions only.
 */

bool
uvm_page_owner_locked_p(struct vm_page *pg, bool exclusive)
{

	if (exclusive)
		return rw_write_held(pg->uobject->vmobjlock);
	else
		return rw_lock_held(pg->uobject->vmobjlock);
}

/*
 * Misc routines
 */

static kmutex_t pagermtx;

void
uvm_init(void)
{
	char buf[64];

	if (rumpuser_getparam("RUMP_MEMLIMIT", buf, sizeof(buf)) == 0) {
		unsigned long tmp;
		char *ep;
		int mult;

		tmp = strtoul(buf, &ep, 10);
		if (strlen(ep) > 1)
			panic("uvm_init: invalid RUMP_MEMLIMIT: %s", buf);

		/* mini-dehumanize-number */
		mult = 1;
		switch (*ep) {
		case 'k':
			mult = 1024;
			break;
		case 'm':
			mult = 1024*1024;
			break;
		case 'g':
			mult = 1024*1024*1024;
			break;
		case 0:
			break;
		default:
			panic("uvm_init: invalid RUMP_MEMLIMIT: %s", buf);
		}
		rump_physmemlimit = tmp * mult;

		if (rump_physmemlimit / mult != tmp)
			panic("uvm_init: RUMP_MEMLIMIT overflow: %s", buf);

		/* reserve some memory for the pager */
		if (rump_physmemlimit <= PDRESERVE)
			panic("uvm_init: system reserves %d bytes of mem, "
			    "only %lu bytes given",
			    PDRESERVE, rump_physmemlimit);
		pdlimit = rump_physmemlimit;
		rump_physmemlimit -= PDRESERVE;

		if (pdlimit < 1024*1024)
			printf("uvm_init: WARNING: <1MB RAM limit, "
			    "hope you know what you're doing\n");

#define HUMANIZE_BYTES 9
		CTASSERT(sizeof(buf) >= HUMANIZE_BYTES);
		format_bytes(buf, HUMANIZE_BYTES, rump_physmemlimit);
#undef HUMANIZE_BYTES
		dddlim = 9 * (rump_physmemlimit / 10);
	} else {
		strlcpy(buf, "unlimited (host limit)", sizeof(buf));
	}
	aprint_verbose("total memory = %s\n", buf);

	TAILQ_INIT(&vmpage_lruqueue);

	if (rump_physmemlimit == RUMPMEM_UNLIMITED) {
		uvmexp.npages = physmem;
	} else {
		uvmexp.npages = pdlimit >> PAGE_SHIFT;
		uvmexp.reserve_pagedaemon = PDRESERVE >> PAGE_SHIFT;
		uvmexp.freetarg = (rump_physmemlimit-dddlim) >> PAGE_SHIFT;
	}
	/*
	 * uvmexp.free is not used internally or updated.  The reason is
	 * that the memory hypercall allocator is allowed to allocate
	 * non-page sized chunks.  We use a byte count in curphysmem
	 * instead.
	 */
	uvmexp.free = uvmexp.npages;

#ifndef __uvmexp_pagesize
	uvmexp.pagesize = PAGE_SIZE;
	uvmexp.pagemask = PAGE_MASK;
	uvmexp.pageshift = PAGE_SHIFT;
#else
#define FAKE_PAGE_SHIFT 12
	uvmexp.pageshift = FAKE_PAGE_SHIFT;
	uvmexp.pagesize = 1<<FAKE_PAGE_SHIFT;
	uvmexp.pagemask = (1<<FAKE_PAGE_SHIFT)-1;
#undef FAKE_PAGE_SHIFT
#endif

	mutex_init(&pagermtx, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&vmpage_lruqueue_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&uvm_swap_data_lock, MUTEX_DEFAULT, IPL_NONE);

	/* just to appease linkage */
	mutex_init(&uvm_fpageqlock, MUTEX_SPIN, IPL_VM);

	mutex_init(&pdaemonmtx, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&pdaemoncv, "pdaemon");
	cv_init(&oomwait, "oomwait");

	module_map = &module_map_store;

	kernel_map->pmap = pmap_kernel();

	pool_subsystem_init();

	kmem_arena = vmem_create("kmem", 0, 1024*1024, PAGE_SIZE,
	    NULL, NULL, NULL,
	    0, VM_NOSLEEP | VM_BOOTSTRAP, IPL_VM);

	vmem_subsystem_init(kmem_arena);

	kmem_va_arena = vmem_create("kva", 0, 0, PAGE_SIZE,
	    vmem_alloc, vmem_free, kmem_arena,
	    8 * PAGE_SIZE, VM_NOSLEEP | VM_BOOTSTRAP, IPL_VM);

	pool_cache_bootstrap(&pagecache, sizeof(struct vm_page), 0, 0, 0,
	    "page$", NULL, IPL_NONE, pgctor, pgdtor, NULL);

	radix_tree_init();

	/* create vmspace used by local clients */
	rump_vmspace_local = kmem_zalloc(sizeof(*rump_vmspace_local), KM_SLEEP);
	uvmspace_init(rump_vmspace_local, &rump_pmap_local, 0, 0, false);
}
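
/*
 * Illustrative example only: with the standard userspace hypercall
 * implementation, rumpuser_getparam() reads tunables such as
 * RUMP_MEMLIMIT from the host environment, so e.g.
 *
 *	$ env RUMP_MEMLIMIT=32m ./program-hosting-a-rump-kernel
 *
 * limits hypercall memory allocations to roughly 32MB.  The accepted
 * suffixes are 'k', 'm' and 'g', as parsed above.
 */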

void
uvmspace_init(struct vmspace *vm, struct pmap *pmap, vaddr_t vmin, vaddr_t vmax,
    bool topdown)
{

	vm->vm_map.pmap = pmap;
	vm->vm_refcnt = 1;
}

int
uvm_map_pageable(struct vm_map *map, vaddr_t start, vaddr_t end,
    bool new_pageable, int lockflags)
{
	return 0;
}

void
uvm_pagewire(struct vm_page *pg)
{

	/* nada */
}

void
uvm_pageunwire(struct vm_page *pg)
{

	/* nada */
}

int
uvm_availmem(void)
{

	return uvmexp.free;
}

void
uvm_pagelock(struct vm_page *pg)
{

	mutex_enter(&pg->interlock);
}

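/*
 * Lock two page interlocks.  Acquire in address order so that two
 * threads locking the same pair of pages in opposite argument order
 * cannot deadlock against each other.
 */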
void
uvm_pagelock2(struct vm_page *pg1, struct vm_page *pg2)
{

	if (pg1 < pg2) {
		mutex_enter(&pg1->interlock);
		mutex_enter(&pg2->interlock);
	} else {
		mutex_enter(&pg2->interlock);
		mutex_enter(&pg1->interlock);
	}
}

void
uvm_pageunlock(struct vm_page *pg)
{

	mutex_exit(&pg->interlock);
}

void
uvm_pageunlock2(struct vm_page *pg1, struct vm_page *pg2)
{

	mutex_exit(&pg1->interlock);
	mutex_exit(&pg2->interlock);
}

/* where's your schmonz now? */
#define PUNLIMIT(a)	\
	p->p_rlimit[a].rlim_cur = p->p_rlimit[a].rlim_max = RLIM_INFINITY;
void
uvm_init_limits(struct proc *p)
{

#ifndef DFLSSIZ
#define DFLSSIZ (16*1024*1024)
#endif
	p->p_rlimit[RLIMIT_STACK].rlim_cur = DFLSSIZ;
	p->p_rlimit[RLIMIT_STACK].rlim_max = MAXSSIZ;
	PUNLIMIT(RLIMIT_DATA);
	PUNLIMIT(RLIMIT_RSS);
	PUNLIMIT(RLIMIT_AS);
	/* nice, cascade */
}
#undef PUNLIMIT

/*
 * This satisfies the "disgusting mmap hack" used by proplib.
 */
int
uvm_mmap_anon(struct proc *p, void **addrp, size_t size)
{
	int error;

	/* no reason in particular, but cf. uvm_default_mapaddr() */
	if (*addrp != NULL)
		panic("uvm_mmap() variant unsupported");

	if (RUMP_LOCALPROC_P(curproc)) {
		error = rumpuser_anonmmap(NULL, size, 0, 0, addrp);
	} else {
		error = rump_sysproxy_anonmmap(RUMP_SPVM2CTL(p->p_vmspace),
		    size, addrp);
	}
	return error;
}

/*
 * Stubs for things referenced from vfs_vnode.c but not used.
 */
const dev_t zerodev;

struct uvm_object *
udv_attach(dev_t device, vm_prot_t accessprot, voff_t off, vsize_t size)
{
	return NULL;
}

struct pagerinfo {
	vaddr_t pgr_kva;
	int pgr_npages;
	struct vm_page **pgr_pgs;
	bool pgr_read;

	LIST_ENTRY(pagerinfo) pgr_entries;
};
static LIST_HEAD(, pagerinfo) pagerlist = LIST_HEAD_INITIALIZER(pagerlist);

/*
 * Pager "map" in routine.  Instead of mapping, we allocate memory
 * and copy page contents there.  The reason for copying instead of
 * mapping is simple: we do not assume we are running on virtual
 * memory.  Even if we could emulate virtual memory in some environments
 * such as userspace, copying is much faster than trying to awkwardly
 * cope with remapping (see "Design and Implementation" pp. 95-98).
 * The downside of the approach is that the pager requires MAXPHYS
 * free memory to perform paging, but short of virtual memory or
 * making the pager do I/O in page-sized chunks we cannot do much
 * about that.
 */
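
/*
 * Usage sketch, for illustration only (the genfs getpages/putpages code
 * is the real consumer): pages are "mapped" into a malloc'd window, the
 * device I/O is performed against that window, and the window is torn
 * down with uvm_pagermapout(), which copies the data back for reads.
 *
 *	kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_READ);
 *	... do I/O against the buffer at kva ...
 *	uvm_pagermapout(kva, npages);
 */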
vaddr_t
uvm_pagermapin(struct vm_page **pgs, int npages, int flags)
{
	struct pagerinfo *pgri;
	vaddr_t curkva;
	int i;

	/* allocate structures */
	pgri = kmem_alloc(sizeof(*pgri), KM_SLEEP);
	pgri->pgr_kva = (vaddr_t)kmem_alloc(npages * PAGE_SIZE, KM_SLEEP);
	pgri->pgr_npages = npages;
	pgri->pgr_pgs = kmem_alloc(sizeof(struct vm_page *) * npages, KM_SLEEP);
	pgri->pgr_read = (flags & UVMPAGER_MAPIN_READ) != 0;

	/* copy contents to "mapped" memory */
	for (i = 0, curkva = pgri->pgr_kva;
	    i < npages;
	    i++, curkva += PAGE_SIZE) {
		/*
		 * We need to copy the previous contents of the pages to
		 * the window even if we are reading from the
		 * device, since the device might not fill the contents of
		 * the full mapped range and we will end up corrupting
		 * data when we unmap the window.
		 */
		memcpy((void *)curkva, pgs[i]->uanon, PAGE_SIZE);
		pgri->pgr_pgs[i] = pgs[i];
	}

	mutex_enter(&pagermtx);
	LIST_INSERT_HEAD(&pagerlist, pgri, pgr_entries);
	mutex_exit(&pagermtx);

	return pgri->pgr_kva;
}

/*
 * map out the pager window.  return contents from VA to page storage
 * and free structures.
 *
 * Note: does not currently support partial frees
 */
void
uvm_pagermapout(vaddr_t kva, int npages)
{
	struct pagerinfo *pgri;
	vaddr_t curkva;
	int i;

	mutex_enter(&pagermtx);
	LIST_FOREACH(pgri, &pagerlist, pgr_entries) {
		if (pgri->pgr_kva == kva)
			break;
	}
	KASSERT(pgri);
	if (pgri->pgr_npages != npages)
		panic("uvm_pagermapout: partial unmapping not supported");
	LIST_REMOVE(pgri, pgr_entries);
	mutex_exit(&pagermtx);

	if (pgri->pgr_read) {
		for (i = 0, curkva = pgri->pgr_kva;
		    i < pgri->pgr_npages;
		    i++, curkva += PAGE_SIZE) {
			memcpy(pgri->pgr_pgs[i]->uanon,
			    (void *)curkva, PAGE_SIZE);
		}
	}

	kmem_free(pgri->pgr_pgs, npages * sizeof(struct vm_page *));
	kmem_free((void *)pgri->pgr_kva, npages * PAGE_SIZE);
	kmem_free(pgri, sizeof(*pgri));
}

/*
 * convert va in pager window to page structure.
 * XXX: how expensive is this (global lock, list traversal)?
 */
struct vm_page *
uvm_pageratop(vaddr_t va)
{
	struct pagerinfo *pgri;
	struct vm_page *pg = NULL;
	int i;

	mutex_enter(&pagermtx);
	LIST_FOREACH(pgri, &pagerlist, pgr_entries) {
		if (pgri->pgr_kva <= va
		    && va < pgri->pgr_kva + pgri->pgr_npages*PAGE_SIZE)
			break;
	}
	if (pgri) {
		i = (va - pgri->pgr_kva) >> PAGE_SHIFT;
		pg = pgri->pgr_pgs[i];
	}
	mutex_exit(&pagermtx);

	return pg;
}

/*
 * Called with the vm object locked.
 *
 * Put vnode object pages at the end of the access queue to indicate
 * they have been recently accessed and should not be immediate
 * candidates for pageout.  Do not do this for lookups done by
 * the pagedaemon to mimic pmap_kentered mappings which don't track
 * access information.
 */
struct vm_page *
uvm_pagelookup(struct uvm_object *uobj, voff_t off)
{
	struct vm_page *pg;
	bool ispagedaemon = curlwp == uvm.pagedaemon_lwp;

	pg = radix_tree_lookup_node(&uobj->uo_pages, off >> PAGE_SHIFT);
	if (pg && !UVM_OBJ_IS_AOBJ(pg->uobject) && !ispagedaemon) {
		mutex_enter(&vmpage_lruqueue_lock);
		TAILQ_REMOVE(&vmpage_lruqueue, pg, pageq.queue);
		TAILQ_INSERT_TAIL(&vmpage_lruqueue, pg, pageq.queue);
		mutex_exit(&vmpage_lruqueue_lock);
	}

	return pg;
}

void
uvm_page_unbusy(struct vm_page **pgs, int npgs)
{
	struct vm_page *pg;
	int i;

	KASSERT(npgs > 0);
	KASSERT(rw_write_held(pgs[0]->uobject->vmobjlock));

	for (i = 0; i < npgs; i++) {
		pg = pgs[i];
		if (pg == NULL)
			continue;

		KASSERT(pg->flags & PG_BUSY);
		if (pg->flags & PG_RELEASED) {
			uvm_pagefree(pg);
		} else {
			pg->flags &= ~PG_BUSY;
			uvm_pagelock(pg);
			uvm_pagewakeup(pg);
			uvm_pageunlock(pg);
		}
	}
}

void
uvm_pagewait(struct vm_page *pg, krwlock_t *lock, const char *wmesg)
{

	KASSERT(rw_lock_held(lock));
	KASSERT((pg->flags & PG_BUSY) != 0);

	mutex_enter(&pg->interlock);
	pg->pqflags |= PQ_WANTED;
	rw_exit(lock);
	UVM_UNLOCK_AND_WAIT(pg, &pg->interlock, false, wmesg, 0);
}

void
uvm_pagewakeup(struct vm_page *pg)
{

	KASSERT(mutex_owned(&pg->interlock));

	if ((pg->pqflags & PQ_WANTED) != 0) {
		pg->pqflags &= ~PQ_WANTED;
		wakeup(pg);
	}
}

void
uvm_estimatepageable(int *active, int *inactive)
{

	/* XXX: guessing game */
	*active = 1024;
	*inactive = 1024;
}

int
uvm_loan(struct vm_map *map, vaddr_t start, vsize_t len, void *v, int flags)
{

	panic("%s: unimplemented", __func__);
}

void
uvm_unloan(void *v, int npages, int flags)
{

	panic("%s: unimplemented", __func__);
}

int
uvm_loanuobjpages(struct uvm_object *uobj, voff_t pgoff, int orignpages,
    struct vm_page **opp)
{

	return EBUSY;
}

struct vm_page *
uvm_loanbreak(struct vm_page *pg)
{

	panic("%s: unimplemented", __func__);
}

void
ubc_purge(struct uvm_object *uobj)
{

}

vaddr_t
uvm_default_mapaddr(struct proc *p, vaddr_t base, vsize_t sz, int topdown)
{

	return 0;
}

int
uvm_map_protect(struct vm_map *map, vaddr_t start, vaddr_t end,
    vm_prot_t prot, bool set_max)
{

	return EOPNOTSUPP;
}

int
uvm_map(struct vm_map *map, vaddr_t *startp, vsize_t size,
    struct uvm_object *uobj, voff_t uoffset, vsize_t align,
    uvm_flag_t flags)
{

	*startp = (vaddr_t)rump_hypermalloc(size, align, true, "uvm_map");
	return *startp != 0 ? 0 : ENOMEM;
}

void
uvm_unmap1(struct vm_map *map, vaddr_t start, vaddr_t end, int flags)
{

	rump_hyperfree((void *)start, end-start);
}


/*
 * UVM km
 */

vaddr_t
uvm_km_alloc(struct vm_map *map, vsize_t size, vsize_t align, uvm_flag_t flags)
{
	void *rv, *desired = NULL;
	int alignbit, error;

#ifdef __x86_64__
	/*
	 * On amd64, allocate all module memory from the lowest 2GB.
	 * This is because NetBSD kernel modules are compiled
	 * with -mcmodel=kernel and reserve only 4 bytes for
	 * offsets.  If we load code compiled with -mcmodel=kernel
	 * anywhere except the lowest or highest 2GB, it will not
	 * work.  Since userspace does not have access to the highest
	 * 2GB, use the lowest 2GB.
	 *
	 * Note: this assumes the rump kernel resides in
	 * the lowest 2GB as well.
	 *
	 * Note2: yes, it's a quick hack, but since this the only
	 * place where we care about the map we're allocating from,
	 * just use a simple "if" instead of coming up with a fancy
	 * generic solution.
	 */
	if (map == module_map) {
		desired = (void *)(0x80000000 - size);
	}
#endif

	if (__predict_false(map == module_map)) {
		alignbit = 0;
		if (align) {
			alignbit = ffs(align)-1;
		}
		error = rumpuser_anonmmap(desired, size, alignbit,
		    flags & UVM_KMF_EXEC, &rv);
	} else {
		error = rumpuser_malloc(size, align, &rv);
	}

	if (error) {
		if (flags & (UVM_KMF_CANFAIL | UVM_KMF_NOWAIT))
			return 0;
		else
			panic("uvm_km_alloc failed");
	}

	if (flags & UVM_KMF_ZERO)
		memset(rv, 0, size);

	return (vaddr_t)rv;
}

void
uvm_km_free(struct vm_map *map, vaddr_t vaddr, vsize_t size, uvm_flag_t flags)
{

	if (__predict_false(map == module_map))
		rumpuser_unmap((void *)vaddr, size);
	else
		rumpuser_free((void *)vaddr, size);
}

int
uvm_km_protect(struct vm_map *map, vaddr_t vaddr, vsize_t size, vm_prot_t prot)
{
	return 0;
}

struct vm_map *
uvm_km_suballoc(struct vm_map *map, vaddr_t *minaddr, vaddr_t *maxaddr,
    vsize_t size, int pageable, bool fixed, struct vm_map *submap)
{

	return (struct vm_map *)417416;
}

int
uvm_km_kmem_alloc(vmem_t *vm, vmem_size_t size, vm_flag_t flags,
    vmem_addr_t *addr)
{
	vaddr_t va;
	va = (vaddr_t)rump_hypermalloc(size, PAGE_SIZE,
	    (flags & VM_SLEEP), "kmalloc");

	if (va) {
		*addr = va;
		return 0;
	} else {
		return ENOMEM;
	}
}

void
uvm_km_kmem_free(vmem_t *vm, vmem_addr_t addr, vmem_size_t size)
{

	rump_hyperfree((void *)addr, size);
}

/*
 * VM space locking routines.  We don't really have to do anything,
 * since the pages are always "wired" (both local and remote processes).
 */
int
uvm_vslock(struct vmspace *vs, void *addr, size_t len, vm_prot_t access)
{

	return 0;
}

void
uvm_vsunlock(struct vmspace *vs, void *addr, size_t len)
{

}

/*
 * For the local case the buffer mappers don't need to do anything.
 * For the remote case we need to reserve space and copy data in or
 * out, depending on B_READ/B_WRITE.
 */
int
vmapbuf(struct buf *bp, vsize_t len)
{
	int error = 0;

	bp->b_saveaddr = bp->b_data;

	/* remote case */
	if (!RUMP_LOCALPROC_P(curproc)) {
		bp->b_data = rump_hypermalloc(len, 0, true, "vmapbuf");
		if (BUF_ISWRITE(bp)) {
			error = copyin(bp->b_saveaddr, bp->b_data, len);
			if (error) {
				rump_hyperfree(bp->b_data, len);
				bp->b_data = bp->b_saveaddr;
				bp->b_saveaddr = 0;
			}
		}
	}

	return error;
}

void
vunmapbuf(struct buf *bp, vsize_t len)
{

	/* remote case */
	if (!RUMP_LOCALPROC_P(bp->b_proc)) {
		if (BUF_ISREAD(bp)) {
			bp->b_error = copyout_proc(bp->b_proc,
			    bp->b_data, bp->b_saveaddr, len);
		}
		rump_hyperfree(bp->b_data, len);
	}

	bp->b_data = bp->b_saveaddr;
	bp->b_saveaddr = 0;
}

void
uvmspace_addref(struct vmspace *vm)
{

	/*
	 * No dynamically allocated vmspaces exist.
	 */
}

void
uvmspace_free(struct vmspace *vm)
{

	/* nothing for now */
}

/*
 * page life cycle stuff.  it really doesn't exist, so just stubs.
 */

void
uvm_pageactivate(struct vm_page *pg)
{

	/* nada */
}

void
uvm_pagedeactivate(struct vm_page *pg)
{

	/* nada */
}

void
uvm_pagedequeue(struct vm_page *pg)
{

	/* nada */
}

void
uvm_pageenqueue(struct vm_page *pg)
{

	/* nada */
}

void
uvmpdpol_anfree(struct vm_anon *an)
{

	/* nada */
}

/*
 * Physical address accessors.
 */

struct vm_page *
uvm_phys_to_vm_page(paddr_t pa)
{

	return NULL;
}

paddr_t
uvm_vm_page_to_phys(const struct vm_page *pg)
{

	return 0;
}

vaddr_t
uvm_uarea_alloc(void)
{

	/* non-zero */
	return (vaddr_t)11;
}

void
uvm_uarea_free(vaddr_t uarea)
{

	/* nata, so creamy */
}

/*
 * Routines related to the Page Baroness.
 */

void
uvm_wait(const char *msg)
{

	if (__predict_false(rump_threads == 0))
		panic("pagedaemon missing (RUMP_THREADS = 0)");

	if (curlwp == uvm.pagedaemon_lwp) {
		/* is it possible for us to later get memory? */
		if (!uvmexp.paging)
			panic("pagedaemon out of memory");
	}

	mutex_enter(&pdaemonmtx);
	pdaemon_waiters++;
	cv_signal(&pdaemoncv);
	cv_wait(&oomwait, &pdaemonmtx);
	mutex_exit(&pdaemonmtx);
}

void
uvm_pageout_start(int npages)
{

	mutex_enter(&pdaemonmtx);
	uvmexp.paging += npages;
	mutex_exit(&pdaemonmtx);
}

void
uvm_pageout_done(int npages)
{

	if (!npages)
		return;

	mutex_enter(&pdaemonmtx);
	KASSERT(uvmexp.paging >= npages);
	uvmexp.paging -= npages;

	if (pdaemon_waiters) {
		pdaemon_waiters = 0;
		cv_broadcast(&oomwait);
	}
	mutex_exit(&pdaemonmtx);
}

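/*
 * Try to flush and release a single page cache page via the owning
 * object's pgo_put routine.  Called with vmpage_lruqueue_lock held;
 * on success that lock has been dropped, so the caller must restart
 * its scan of the LRU queue.
 */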
static bool
processpage(struct vm_page *pg)
{
	struct uvm_object *uobj;

	uobj = pg->uobject;
	if (rw_tryenter(uobj->vmobjlock, RW_WRITER)) {
		if ((pg->flags & PG_BUSY) == 0) {
			mutex_exit(&vmpage_lruqueue_lock);
			uobj->pgops->pgo_put(uobj, pg->offset,
			    pg->offset + PAGE_SIZE,
			    PGO_CLEANIT|PGO_FREE);
			KASSERT(!rw_write_held(uobj->vmobjlock));
			return true;
		} else {
			rw_exit(uobj->vmobjlock);
		}
	}

	return false;
}

/*
 * The Diabolical pageDaemon Director (DDD).
 *
 * This routine can always use better heuristics.
 */
void
uvm_pageout(void *arg)
{
	struct vm_page *pg;
	struct pool *pp, *pp_first;
	int cleaned, skip, skipped;
	bool succ;

	mutex_enter(&pdaemonmtx);
	for (;;) {
		if (!NEED_PAGEDAEMON()) {
			kernel_map->flags &= ~VM_MAP_WANTVA;
		}

		if (pdaemon_waiters) {
			pdaemon_waiters = 0;
			cv_broadcast(&oomwait);
		}

		cv_wait(&pdaemoncv, &pdaemonmtx);
		uvmexp.pdwoke++;

		/* tell the world that we are hungry */
		kernel_map->flags |= VM_MAP_WANTVA;
		mutex_exit(&pdaemonmtx);

		/*
		 * step one: reclaim the page cache.  this should give
		 * us the biggest earnings since whole pages are released
		 * into backing memory.
		 */
		pool_cache_reclaim(&pagecache);
		if (!NEED_PAGEDAEMON()) {
			mutex_enter(&pdaemonmtx);
			continue;
		}

		/*
		 * Ok, so that didn't help.  Next, try to hunt memory
		 * by pushing out vnode pages.  The pages might contain
		 * useful cached data, but we need the memory.
		 */
		cleaned = 0;
		skip = 0;
 again:
		mutex_enter(&vmpage_lruqueue_lock);
		while (cleaned < PAGEDAEMON_OBJCHUNK) {
			skipped = 0;
			TAILQ_FOREACH(pg, &vmpage_lruqueue, pageq.queue) {

				/*
				 * skip over pages we _might_ have tried
				 * to handle earlier.  they might not be
				 * exactly the same ones, but I'm not too
				 * concerned.
				 */
				while (skipped++ < skip)
					continue;

				if (processpage(pg)) {
					cleaned++;
					goto again;
				}

				skip++;
			}
			break;
		}
		mutex_exit(&vmpage_lruqueue_lock);

		/*
		 * Ok, someone is running with an object lock held.
		 * We want to yield the host CPU to make sure the
		 * thread is not parked on the host.  nanosleep
		 * for the smallest possible time and hope we're back in
		 * the game soon.
		 */
		if (cleaned == 0) {
			rumpuser_clock_sleep(RUMPUSER_CLOCK_RELWALL, 0, 1);

			skip = 0;

			/* and here we go again */
			goto again;
		}

		/*
		 * And of course we need to reclaim the page cache
		 * again to actually release memory.
		 */
		pool_cache_reclaim(&pagecache);
		if (!NEED_PAGEDAEMON()) {
			mutex_enter(&pdaemonmtx);
			continue;
		}

		/*
		 * And then drain the pools.  Wipe them out ... all of them.
		 */
		for (pp_first = NULL;;) {
			rump_vfs_drainbufs(10 /* XXX: estimate! */);

			succ = pool_drain(&pp);
			if (succ || pp == pp_first)
				break;

			if (pp_first == NULL)
				pp_first = pp;
		}

		/*
		 * Need to use PYEC on our bag of tricks.
		 * Unfortunately, the wife just borrowed it.
		 */

		mutex_enter(&pdaemonmtx);
		if (!succ && cleaned == 0 && pdaemon_waiters &&
		    uvmexp.paging == 0) {
			rumpuser_dprintf("pagedaemoness: failed to reclaim "
			    "memory ... sleeping (deadlock?)\n");
			kpause("pddlk", false, hz, &pdaemonmtx);
		}
	}

	panic("you can swap out any time you like, but you can never leave");
}

void
uvm_kick_pdaemon()
{

	/*
	 * Wake up the diabolical pagedaemon director if we are over
	 * 90% of the memory limit.  This is a complete and utter
	 * stetson-harrison decision which you are allowed to finetune.
	 * Don't bother locking.  If we have some unflushed caches,
	 * other waker-uppers will deal with the issue.
	 */
	if (NEED_PAGEDAEMON()) {
		cv_signal(&pdaemoncv);
	}
}

void *
rump_hypermalloc(size_t howmuch, int alignment, bool waitok, const char *wmsg)
{
	const unsigned long thelimit =
	    curlwp == uvm.pagedaemon_lwp ? pdlimit : rump_physmemlimit;
	unsigned long newmem;
	void *rv;
	int error;

	uvm_kick_pdaemon(); /* ouch */

	/* first we must be within the limit */
 limitagain:
	if (thelimit != RUMPMEM_UNLIMITED) {
		newmem = atomic_add_long_nv(&curphysmem, howmuch);
		if (newmem > thelimit) {
			newmem = atomic_add_long_nv(&curphysmem, -howmuch);
			if (!waitok) {
				return NULL;
			}
			uvm_wait(wmsg);
			goto limitagain;
		}
	}

	/* second, we must get something from the backend */
 again:
	error = rumpuser_malloc(howmuch, alignment, &rv);
	if (__predict_false(error && waitok)) {
		uvm_wait(wmsg);
		goto again;
	}

	return rv;
}
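
/*
 * Note on the accounting above: an allocation is charged to curphysmem
 * optimistically with an atomic add and backed out if it would exceed
 * the limit, so concurrent allocators may transiently observe
 * curphysmem slightly above the limit, but no allocation completes
 * above it.  The pagedaemon is the exception: it runs against pdlimit
 * and may therefore dip into the PDRESERVE headroom.
 */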

void
rump_hyperfree(void *what, size_t size)
{

	if (rump_physmemlimit != RUMPMEM_UNLIMITED) {
		atomic_add_long(&curphysmem, -size);
	}
	rumpuser_free(what, size);
}