/*	$NetBSD: uvm_mmap.c,v 1.151 2015/01/10 23:35:02 chs Exp $	*/

/*
 * Copyright (c) 1997 Charles D. Cranor and Washington University.
 * Copyright (c) 1991, 1993 The Regents of the University of California.
 * Copyright (c) 1988 University of Utah.
 *
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *      @(#)vm_mmap.c   8.5 (Berkeley) 5/19/94
 * from: Id: uvm_mmap.c,v 1.1.2.14 1998/01/05 21:04:26 chuck Exp
 */

/*
 * uvm_mmap.c: system call interface into VM system, plus kernel vm_mmap
 * function.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_mmap.c,v 1.151 2015/01/10 23:35:02 chs Exp $");

#include "opt_compat_netbsd.h"
#include "opt_pax.h"

#include <sys/types.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/resourcevar.h>
#include <sys/mman.h>

#if defined(PAX_ASLR) || defined(PAX_MPROTECT)
#include <sys/pax.h>
#endif /* PAX_ASLR || PAX_MPROTECT */

#include <sys/syscallargs.h>

#include <uvm/uvm.h>
#include <uvm/uvm_device.h>

static int uvm_mmap(struct vm_map *, vaddr_t *, vsize_t, vm_prot_t, vm_prot_t,
		    int, int, struct uvm_object *, voff_t, vsize_t);

static int
range_test(vaddr_t addr, vsize_t size, bool ismmap)
{
	vaddr_t vm_min_address = VM_MIN_ADDRESS;
	vaddr_t vm_max_address = VM_MAXUSER_ADDRESS;
	vaddr_t eaddr = addr + size;
	int res = 0;

	if (addr < vm_min_address)
		return EINVAL;
	if (eaddr > vm_max_address)
		return ismmap ? EFBIG : EINVAL;
	if (addr > eaddr) /* no wrapping! */
		return ismmap ? EOVERFLOW : EINVAL;

#ifdef MD_MMAP_RANGE_TEST
	res = MD_MMAP_RANGE_TEST(addr, eaddr);
#endif

	return res;
}
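
/*
 * Worked example of the checks above (illustrative values only,
 * assuming a 32-bit vaddr_t, VM_MIN_ADDRESS <= 0x1000 and
 * VM_MAXUSER_ADDRESS around 0xb0000000):
 *
 *	range_test(0xfffff000, 0x2000, true) == EOVERFLOW
 *		(eaddr wraps to 0x1000, so addr > eaddr)
 *	range_test(0xa0000000, 0x20000000, true) == EFBIG
 *		(eaddr = 0xc0000000 exceeds vm_max_address)
 *	range_test(0x1000, 0x1000, true) == 0
 */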

/*
 * unimplemented VM system calls:
 */

/*
 * sys_sbrk: sbrk system call.
 */

/* ARGSUSED */
int
sys_sbrk(struct lwp *l, const struct sys_sbrk_args *uap, register_t *retval)
{
	/* {
		syscallarg(intptr_t) incr;
	} */

	return (ENOSYS);
}

/*
 * sys_sstk: sstk system call.
 */

/* ARGSUSED */
int
sys_sstk(struct lwp *l, const struct sys_sstk_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) incr;
	} */

	return (ENOSYS);
}

/*
 * sys_mincore: determine if pages are in core or not.
 */

/* ARGSUSED */
int
sys_mincore(struct lwp *l, const struct sys_mincore_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(char *) vec;
	} */
	struct proc *p = l->l_proc;
	struct vm_page *pg;
	char *vec, pgi;
	struct uvm_object *uobj;
	struct vm_amap *amap;
	struct vm_anon *anon;
	struct vm_map_entry *entry;
	vaddr_t start, end, lim;
	struct vm_map *map;
	vsize_t len;
	int error = 0, npgs;

	map = &p->p_vmspace->vm_map;

	start = (vaddr_t)SCARG(uap, addr);
	len = SCARG(uap, len);
	vec = SCARG(uap, vec);

	if (start & PAGE_MASK)
		return (EINVAL);
	len = round_page(len);
	end = start + len;
	if (end <= start)
		return (EINVAL);

	/*
	 * Lock down vec, so our returned status isn't outdated by
	 * storing the status byte for a page.
	 */

	npgs = len >> PAGE_SHIFT;
	error = uvm_vslock(p->p_vmspace, vec, npgs, VM_PROT_WRITE);
	if (error) {
		return error;
	}
	vm_map_lock_read(map);

	if (uvm_map_lookup_entry(map, start, &entry) == false) {
		error = ENOMEM;
		goto out;
	}

	for (/* nothing */;
	     entry != &map->header && entry->start < end;
	     entry = entry->next) {
		KASSERT(!UVM_ET_ISSUBMAP(entry));
		KASSERT(start >= entry->start);

		/* Make sure there are no holes. */
		if (entry->end < end &&
		     (entry->next == &map->header ||
		      entry->next->start > entry->end)) {
			error = ENOMEM;
			goto out;
		}

		lim = end < entry->end ? end : entry->end;

		/*
		 * Special case for objects with no "real" pages.  Those
		 * are always considered resident (mapped devices).
		 */

		if (UVM_ET_ISOBJ(entry)) {
			KASSERT(!UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj));
			if (UVM_OBJ_IS_DEVICE(entry->object.uvm_obj)) {
				for (/* nothing */; start < lim;
				     start += PAGE_SIZE, vec++)
					subyte(vec, 1);
				continue;
			}
		}

		amap = entry->aref.ar_amap;	/* upper layer */
		uobj = entry->object.uvm_obj;	/* lower layer */

		if (amap != NULL)
			amap_lock(amap);
		if (uobj != NULL)
			mutex_enter(uobj->vmobjlock);

		for (/* nothing */; start < lim; start += PAGE_SIZE, vec++) {
			pgi = 0;
			if (amap != NULL) {
				/* Check the upper layer first. */
				anon = amap_lookup(&entry->aref,
				    start - entry->start);
				/* Don't need to lock anon here. */
				if (anon != NULL && anon->an_page != NULL) {

					/*
					 * Anon has the page for this entry
					 * offset.
					 */

					pgi = 1;
				}
			}
			if (uobj != NULL && pgi == 0) {
				/* Check the lower layer. */
				pg = uvm_pagelookup(uobj,
				    entry->offset + (start - entry->start));
				if (pg != NULL) {

					/*
					 * Object has the page for this entry
					 * offset.
					 */

					pgi = 1;
				}
			}
			(void) subyte(vec, pgi);
		}
		if (uobj != NULL)
			mutex_exit(uobj->vmobjlock);
		if (amap != NULL)
			amap_unlock(amap);
	}

 out:
	vm_map_unlock_read(map);
	uvm_vsunlock(p->p_vmspace, SCARG(uap, vec), npgs);
	return (error);
}
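
/*
 * Hedged userland sketch of the mincore(2) interface implemented
 * above (illustrative only, not part of this file); the vec buffer
 * needs one byte per page, matching the npgs computation:
 *
 *	#include <sys/mman.h>
 *	#include <unistd.h>
 *	#include <stdlib.h>
 *
 *	long pgsz = sysconf(_SC_PAGESIZE);
 *	char *vec = malloc(len / pgsz);		// len is page-aligned
 *	if (mincore(addr, len, vec) == 0) {
 *		// vec[i] nonzero iff page i of the range is resident
 *	}
 */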

/*
 * sys_mmap: mmap system call.
 *
 * => file offset and address may not be page aligned
 *    - if MAP_FIXED, the file offset and address must have the same
 *      remainder modulo PAGE_SIZE
 *    - if the address isn't page aligned, the mapping starts at
 *      trunc_page(addr) and the return value is adjusted up by the
 *      page offset.
 */
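
/*
 * Hypothetical illustration of the offset rule above (numbers made
 * up, PAGE_SIZE = 0x1000): mapping a file at pos = 0x1234 gives
 * pageoff = 0x234; the kernel maps from the page-aligned file offset
 * 0x1000 and returns addr + 0x234, so the returned pointer refers to
 * file offset 0x1234 exactly:
 *
 *	void *p = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0x1234);
 *	// on success, *(char *)p is byte 0x1234 of the file
 */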

int
sys_mmap(struct lwp *l, const struct sys_mmap_args *uap, register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) prot;
		syscallarg(int) flags;
		syscallarg(int) fd;
		syscallarg(long) pad;
		syscallarg(off_t) pos;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	off_t pos;
	vsize_t size, pageoff;
	vm_prot_t prot, maxprot;
	int flags, fd, advice;
	vaddr_t defaddr;
	struct file *fp = NULL;
	struct uvm_object *uobj;
	int error;
#ifdef PAX_ASLR
	vaddr_t orig_addr;
#endif /* PAX_ASLR */

	/*
	 * first, extract syscall args from the uap.
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	prot = SCARG(uap, prot) & VM_PROT_ALL;
	flags = SCARG(uap, flags);
	fd = SCARG(uap, fd);
	pos = SCARG(uap, pos);

#ifdef PAX_ASLR
	orig_addr = addr;
#endif /* PAX_ASLR */

	/*
	 * Fixup the old deprecated MAP_COPY into MAP_PRIVATE, and
	 * validate the flags.
	 */
	if (flags & MAP_COPY) {
		flags = (flags & ~MAP_COPY) | MAP_PRIVATE;
#if defined(COMPAT_10) && defined(__i386__)
		/*
		 * Ancient kernels did not obey PROT_EXEC, at least on
		 * i386, and ld.so did not turn it on.  We take care of
		 * this on amd64 in compat32.
		 */
		prot |= PROT_EXEC;
#endif
	}
	if ((flags & (MAP_SHARED|MAP_PRIVATE)) == (MAP_SHARED|MAP_PRIVATE))
		return (EINVAL);

	/*
	 * align file position and save offset.  adjust size.
	 */

	pageoff = (pos & PAGE_MASK);
	pos  -= pageoff;
	size += pageoff;			/* add offset */
	size = (vsize_t)round_page(size);	/* round up */
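
	/*
	 * Worked example of the adjustment above (hypothetical values,
	 * PAGE_SIZE = 0x1000): pos = 0x1234 and len = 0x100 give
	 * pageoff = 0x234, pos = 0x1000 and size = 0x334, which
	 * round_page() rounds up to 0x1000.
	 */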

	/*
	 * now check (MAP_FIXED) or get (!MAP_FIXED) the "addr"
	 */
	if (flags & MAP_FIXED) {

		/* ensure address and file offset are aligned properly */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);

		error = range_test(addr, size, true);
		if (error) {
			return error;
		}

	} else if (addr == 0 || !(flags & MAP_TRYFIXED)) {

		/*
		 * not fixed: make sure we skip over the largest
		 * possible heap for non-topdown mapping arrangements.
		 * we will refine our guess later (e.g. to account for
		 * VAC, etc)
		 */

		defaddr = p->p_emul->e_vm_default_addr(p,
		    (vaddr_t)p->p_vmspace->vm_daddr, size);

		if (addr == 0 ||
		    !(p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN))
			addr = MAX(addr, defaddr);
		else
			addr = MIN(addr, defaddr);
	}

	/*
	 * check for file mappings (i.e. not anonymous) and verify file.
	 */

	advice = UVM_ADV_NORMAL;
	if ((flags & MAP_ANON) == 0) {
		if ((fp = fd_getfile(fd)) == NULL)
			return (EBADF);

		if (fp->f_ops->fo_mmap == NULL) {
			error = ENODEV;
			goto out;
		}
		error = (*fp->f_ops->fo_mmap)(fp, &pos, size, prot, &flags,
					      &advice, &uobj, &maxprot);
		if (error) {
			goto out;
		}
		if (uobj == NULL) {
			flags |= MAP_ANON;
			fd_putfile(fd);
			fp = NULL;
			goto is_anon;
		}
	} else {		/* MAP_ANON case */
		/*
		 * XXX What do we do about (MAP_SHARED|MAP_PRIVATE) == 0?
		 */
		if (fd != -1)
			return (EINVAL);

 is_anon:		/* label for SunOS style /dev/zero */
		uobj = NULL;
		maxprot = VM_PROT_ALL;
		pos = 0;
	}

#ifdef PAX_MPROTECT
	pax_mprotect(l, &prot, &maxprot);
#endif /* PAX_MPROTECT */

#ifdef PAX_ASLR
	pax_aslr(l, &addr, orig_addr, flags);
#endif /* PAX_ASLR */

	/*
	 * now let kernel internal function uvm_mmap do the work.
	 */

	error = uvm_mmap(&p->p_vmspace->vm_map, &addr, size, prot, maxprot,
	    flags, advice, uobj, pos, p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);

	/* remember to add offset */
	*retval = (register_t)(addr + pageoff);

 out:
	if (fp != NULL)
		fd_putfile(fd);

	return (error);
}

/*
 * sys___msync13: the msync system call (a front-end for flush)
 */

int
sys___msync13(struct lwp *l, const struct sys___msync13_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) flags;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	struct vm_map *map;
	int error, rv, flags, uvmflags;

	/*
	 * extract syscall args from the uap
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	flags = SCARG(uap, flags);

	/* sanity check flags */
	if ((flags & ~(MS_ASYNC | MS_SYNC | MS_INVALIDATE)) != 0 ||
	    (flags & (MS_ASYNC | MS_SYNC | MS_INVALIDATE)) == 0 ||
	    (flags & (MS_ASYNC | MS_SYNC)) == (MS_ASYNC | MS_SYNC))
		return (EINVAL);
	if ((flags & (MS_ASYNC | MS_SYNC)) == 0)
		flags |= MS_SYNC;

	/*
	 * align the address to a page boundary and adjust the size accordingly.
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	error = range_test(addr, size, false);
	if (error)
		return error;

	/*
	 * get map
	 */

	map = &p->p_vmspace->vm_map;

	/*
	 * XXXCDC: do we really need this semantic?
	 *
	 * XXX Gak!  If size is zero we are supposed to sync "all modified
	 * pages with the region containing addr".  Unfortunately, we
	 * don't really keep track of individual mmaps so we approximate
	 * by flushing the range of the map entry containing addr.
	 * This can be incorrect if the region splits or is coalesced
	 * with a neighbor.
	 */

	if (size == 0) {
		struct vm_map_entry *entry;

		vm_map_lock_read(map);
		rv = uvm_map_lookup_entry(map, addr, &entry);
		if (rv == true) {
			addr = entry->start;
			size = entry->end - entry->start;
		}
		vm_map_unlock_read(map);
		if (rv == false)
			return (EINVAL);
	}

	/*
	 * translate MS_ flags into PGO_ flags
	 */

	uvmflags = PGO_CLEANIT;
	if (flags & MS_INVALIDATE)
		uvmflags |= PGO_FREE;
	if (flags & MS_SYNC)
		uvmflags |= PGO_SYNCIO;

	error = uvm_map_clean(map, addr, addr+size, uvmflags);
	return error;
}
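
/*
 * Hedged userland sketch of the MS_ to PGO_ translation above
 * (illustrative only): MS_SYNC | MS_INVALIDATE requests a synchronous
 * clean with page invalidation, i.e. PGO_CLEANIT | PGO_SYNCIO |
 * PGO_FREE:
 *
 *	#include <sys/mman.h>
 *	#include <err.h>
 *
 *	if (msync(addr, len, MS_SYNC | MS_INVALIDATE) == -1)
 *		err(1, "msync");
 */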

/*
 * sys_munmap: unmap a user's memory
 */

int
sys_munmap(struct lwp *l, const struct sys_munmap_args *uap, register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	struct vm_map *map;
	struct vm_map_entry *dead_entries;
	int error;

	/*
	 * get syscall args.
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);

	/*
	 * align the address to a page boundary and adjust the size accordingly.
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	if (size == 0)
		return (0);

	error = range_test(addr, size, false);
	if (error)
		return error;

	map = &p->p_vmspace->vm_map;

	/*
	 * interesting system call semantic: make sure entire range is
	 * allocated before allowing an unmap.
	 */

	vm_map_lock(map);
#if 0
	if (!uvm_map_checkprot(map, addr, addr + size, VM_PROT_NONE)) {
		vm_map_unlock(map);
		return (EINVAL);
	}
#endif
	uvm_unmap_remove(map, addr, addr + size, &dead_entries, 0);
	vm_map_unlock(map);
	if (dead_entries != NULL)
		uvm_unmap_detach(dead_entries, 0);
	return (0);
}

/*
 * sys_mprotect: the mprotect system call
 */

int
sys_mprotect(struct lwp *l, const struct sys_mprotect_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) prot;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	vm_prot_t prot;
	int error;

	/*
	 * extract syscall args from uap
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	prot = SCARG(uap, prot) & VM_PROT_ALL;

	/*
	 * align the address to a page boundary and adjust the size accordingly.
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = round_page(size);

	error = range_test(addr, size, false);
	if (error)
		return error;

	error = uvm_map_protect(&p->p_vmspace->vm_map, addr, addr + size, prot,
				false);
	return error;
}

/*
 * sys_minherit: the minherit system call
 */

int
sys_minherit(struct lwp *l, const struct sys_minherit_args *uap,
   register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(int) len;
		syscallarg(int) inherit;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	vm_inherit_t inherit;
	int error;

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	inherit = SCARG(uap, inherit);

	/*
	 * align the address to a page boundary and adjust the size accordingly.
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	error = range_test(addr, size, false);
	if (error)
		return error;

	error = uvm_map_inherit(&p->p_vmspace->vm_map, addr, addr + size,
				inherit);
	return error;
}
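
/*
 * Hedged userland sketch of the minherit(2) interface implemented
 * above (illustrative only): marking a region MAP_INHERIT_NONE keeps
 * it out of a child's address space across fork(2):
 *
 *	#include <sys/mman.h>
 *	#include <err.h>
 *
 *	if (minherit(addr, len, MAP_INHERIT_NONE) == -1)
 *		err(1, "minherit");
 *	// after fork(), the child has no mapping at addr
 */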

/*
 * sys_madvise: give advice about memory usage.
 */

/* ARGSUSED */
int
sys_madvise(struct lwp *l, const struct sys_madvise_args *uap,
   register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) behav;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	int advice, error;

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	advice = SCARG(uap, behav);

	/*
	 * align the address to a page boundary, and adjust the size accordingly
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	error = range_test(addr, size, false);
	if (error)
		return error;

	switch (advice) {
	case MADV_NORMAL:
	case MADV_RANDOM:
	case MADV_SEQUENTIAL:
		error = uvm_map_advice(&p->p_vmspace->vm_map, addr, addr + size,
		    advice);
		break;

	case MADV_WILLNEED:

		/*
		 * Activate all these pages, pre-faulting them in if
		 * necessary.
		 */
		error = uvm_map_willneed(&p->p_vmspace->vm_map,
		    addr, addr + size);
		break;

	case MADV_DONTNEED:

		/*
		 * Deactivate all these pages.  We don't need them
		 * any more.  We don't, however, toss the data in
		 * the pages.
		 */

		error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
		    PGO_DEACTIVATE);
		break;

	case MADV_FREE:

		/*
		 * These pages contain no valid data, and may be
		 * garbage-collected.  Toss all resources, including
		 * any swap space in use.
		 */

		error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
		    PGO_FREE);
		break;

	case MADV_SPACEAVAIL:

		/*
		 * XXXMRG What is this?  I think it's:
		 *
		 *	Ensure that we have allocated backing-store
		 *	for these pages.
		 *
		 * This is going to require changes to the page daemon,
		 * as it will free swap space allocated to pages in core.
		 * There's also what to do for device/file/anonymous memory.
		 */

		return (EINVAL);

	default:
		return (EINVAL);
	}

	return error;
}
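
/*
 * Hedged userland sketch of the advice values handled above
 * (illustrative only): MADV_SEQUENTIAL tunes the region for a linear
 * scan, while MADV_FREE marks its contents disposable:
 *
 *	#include <sys/mman.h>
 *
 *	(void)madvise(buf, len, MADV_SEQUENTIAL);
 *	... scan buf linearly ...
 *	(void)madvise(buf, len, MADV_FREE);
 */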

/*
 * sys_mlock: memory lock
 */

int
sys_mlock(struct lwp *l, const struct sys_mlock_args *uap, register_t *retval)
{
	/* {
		syscallarg(const void *) addr;
		syscallarg(size_t) len;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	int error;

	/*
	 * extract syscall args from uap
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);

	/*
	 * align the address to a page boundary and adjust the size accordingly
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	error = range_test(addr, size, false);
	if (error)
		return error;

	if (atop(size) + uvmexp.wired > uvmexp.wiredmax)
		return (EAGAIN);

	if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) >
			p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur)
		return (EAGAIN);

	error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, false,
	    0);
	if (error == EFAULT)
		error = ENOMEM;
	return error;
}
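
/*
 * Worked example of the two limit checks above (hypothetical numbers,
 * 4 KiB pages): with uvmexp.wiredmax - uvmexp.wired == 100 pages and
 * RLIMIT_MEMLOCK at 256 KiB, an mlock() of 512 KiB (128 pages) fails
 * the global check and would also exceed the per-process limit, so
 * EAGAIN is returned before any wiring is attempted.
 */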

/*
 * sys_munlock: unlock wired pages
 */

int
sys_munlock(struct lwp *l, const struct sys_munlock_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(const void *) addr;
		syscallarg(size_t) len;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	int error;

	/*
	 * extract syscall args from uap
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);

	/*
	 * align the address to a page boundary, and adjust the size accordingly
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	error = range_test(addr, size, false);
	if (error)
		return error;

	error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, true,
	    0);
	if (error == EFAULT)
		error = ENOMEM;
	return error;
}

/*
 * sys_mlockall: lock all pages mapped into an address space.
 */

int
sys_mlockall(struct lwp *l, const struct sys_mlockall_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int) flags;
	} */
	struct proc *p = l->l_proc;
	int error, flags;

	flags = SCARG(uap, flags);

	if (flags == 0 ||
	    (flags & ~(MCL_CURRENT|MCL_FUTURE)) != 0)
		return (EINVAL);

	error = uvm_map_pageable_all(&p->p_vmspace->vm_map, flags,
	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
	return (error);
}

/*
 * sys_munlockall: unlock all pages mapped into an address space.
 */

int
sys_munlockall(struct lwp *l, const void *v, register_t *retval)
{
	struct proc *p = l->l_proc;

	(void) uvm_map_pageable_all(&p->p_vmspace->vm_map, 0, 0);
	return (0);
}

/*
 * uvm_mmap: internal version of mmap
 *
 * - used by sys_mmap and various framebuffers
 * - uobj is a struct uvm_object pointer or NULL for MAP_ANON
 * - caller must page-align the file offset
 */

int
uvm_mmap(struct vm_map *map, vaddr_t *addr, vsize_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags, int advice, struct uvm_object *uobj,
    voff_t foff, vsize_t locklimit)
{
	vaddr_t align = 0;
	int error;
	uvm_flag_t uvmflag = 0;

	/*
	 * check params
	 */

	if (size == 0)
		return(0);
	if (foff & PAGE_MASK)
		return(EINVAL);
	if ((prot & maxprot) != prot)
		return(EINVAL);

	/*
	 * for non-fixed mappings, round off the suggested address.
	 * for fixed mappings, check alignment and zap old mappings.
	 */

	if ((flags & MAP_FIXED) == 0) {
		*addr = round_page(*addr);
	} else {
		if (*addr & PAGE_MASK)
			return(EINVAL);
		uvmflag |= UVM_FLAG_FIXED;
		(void) uvm_unmap(map, *addr, *addr + size);
	}
	/*
	 * Try to see if any requested alignment can even be attempted.
	 * Make sure we can express the alignment (asking for a >= 4GB
	 * alignment on an ILP32 architecture makes no sense) and that
	 * it is at least a page-sized quantity.  If the request was for
	 * a fixed mapping, make sure the supplied address adheres to
	 * the requested alignment.
	 */
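	/*
	 * For example (sketch, assuming the MAP_ALIGNED(n) encoding from
	 * <sys/mman.h>, which stores log2 of the alignment in the flag
	 * bits): MAP_ALIGNED(21) requests a 2 MiB boundary, so align
	 * extracts as 21 and becomes 1L << 21 below.  MAP_ALIGNED(64) on
	 * a 64-bit vaddr_t (or MAP_ALIGNED(32) on ILP32) cannot be
	 * expressed and is rejected with EINVAL.
	 */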
	align = (flags & MAP_ALIGNMENT_MASK) >> MAP_ALIGNMENT_SHIFT;
	if (align) {
		if (align >= sizeof(vaddr_t) * NBBY)
			return(EINVAL);
		align = 1L << align;
		if (align < PAGE_SIZE)
			return(EINVAL);
		if (align >= vm_map_max(map))
			return(ENOMEM);
		if (flags & MAP_FIXED) {
			if ((*addr & (align-1)) != 0)
				return(EINVAL);
			align = 0;
		}
	}

	/*
	 * check resource limits
	 */

	if (!VM_MAP_IS_KERNEL(map) &&
	    (((rlim_t)curproc->p_vmspace->vm_map.size + (rlim_t)size) >
	    curproc->p_rlimit[RLIMIT_AS].rlim_cur))
		return ENOMEM;

	/*
	 * handle anon vs. non-anon mappings.   for non-anon mappings attach
	 * to underlying vm object.
	 */

	if (flags & MAP_ANON) {
		KASSERT(uobj == NULL);
		foff = UVM_UNKNOWN_OFFSET;
		if ((flags & MAP_SHARED) == 0)
			/* XXX: defer amap create */
			uvmflag |= UVM_FLAG_COPYONW;
		else
			/* shared: create amap now */
			uvmflag |= UVM_FLAG_OVERLAY;

	} else {
		KASSERT(uobj != NULL);
		if ((flags & MAP_SHARED) == 0) {
			uvmflag |= UVM_FLAG_COPYONW;
		}
	}

	uvmflag = UVM_MAPFLAG(prot, maxprot,
			(flags & MAP_SHARED) ? UVM_INH_SHARE : UVM_INH_COPY,
			advice, uvmflag);
	error = uvm_map(map, addr, size, uobj, foff, align, uvmflag);
	if (error) {
		if (uobj)
			uobj->pgops->pgo_detach(uobj);
		return error;
	}

	/*
	 * POSIX 1003.1b -- if our address space was configured
	 * to lock all future mappings, wire the one we just made.
	 *
	 * Also handle the MAP_WIRED flag here.
	 */

	if (prot == VM_PROT_NONE) {

		/*
		 * No more work to do in this case.
		 */

		return (0);
	}
	if ((flags & MAP_WIRED) != 0 || (map->flags & VM_MAP_WIREFUTURE) != 0) {
		vm_map_lock(map);
		if (atop(size) + uvmexp.wired > uvmexp.wiredmax ||
		    (locklimit != 0 &&
		     size + ptoa(pmap_wired_count(vm_map_pmap(map))) >
		     locklimit)) {
			vm_map_unlock(map);
			uvm_unmap(map, *addr, *addr + size);
			return ENOMEM;
		}

		/*
		 * uvm_map_pageable() always returns the map unlocked.
		 */

		error = uvm_map_pageable(map, *addr, *addr + size,
					 false, UVM_LK_ENTER);
		if (error) {
			uvm_unmap(map, *addr, *addr + size);
			return error;
		}
		return (0);
	}
	return 0;
}

vaddr_t
uvm_default_mapaddr(struct proc *p, vaddr_t base, vsize_t sz)
{

	if (p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN)
		return VM_DEFAULT_ADDRESS_TOPDOWN(base, sz);
	else
		return VM_DEFAULT_ADDRESS_BOTTOMUP(base, sz);
}

int
uvm_mmap_dev(struct proc *p, void **addrp, size_t len, dev_t dev,
    off_t off)
{
	struct uvm_object *uobj;
	int error, flags, prot;

	flags = MAP_SHARED;
	prot = VM_PROT_READ | VM_PROT_WRITE;
	if (*addrp)
		flags |= MAP_FIXED;
	else
		*addrp = (void *)p->p_emul->e_vm_default_addr(p,
		    (vaddr_t)p->p_vmspace->vm_daddr, len);

	uobj = udv_attach(dev, prot, off, len);
	if (uobj == NULL)
		return EINVAL;

	error = uvm_mmap(&p->p_vmspace->vm_map, (vaddr_t *)addrp,
			 (vsize_t)len, prot, prot, flags, UVM_ADV_RANDOM,
			 uobj, off, p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
	return error;
}

int
uvm_mmap_anon(struct proc *p, void **addrp, size_t len)
{
	int error, flags, prot;

	flags = MAP_PRIVATE | MAP_ANON;
	prot = VM_PROT_READ | VM_PROT_WRITE;
	if (*addrp)
		flags |= MAP_FIXED;
	else
		*addrp = (void *)p->p_emul->e_vm_default_addr(p,
		    (vaddr_t)p->p_vmspace->vm_daddr, len);

	error = uvm_mmap(&p->p_vmspace->vm_map, (vaddr_t *)addrp,
			 (vsize_t)len, prot, prot, flags, UVM_ADV_NORMAL,
			 NULL, 0, p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
	return error;
}