/*	$NetBSD: uvm_mmap.c,v 1.154 2015/11/26 13:15:34 martin Exp $	*/

/*
 * Copyright (c) 1997 Charles D. Cranor and Washington University.
 * Copyright (c) 1991, 1993 The Regents of the University of California.
 * Copyright (c) 1988 University of Utah.
 *
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *      @(#)vm_mmap.c   8.5 (Berkeley) 5/19/94
 * from: Id: uvm_mmap.c,v 1.1.2.14 1998/01/05 21:04:26 chuck Exp
 */

/*
 * uvm_mmap.c: system call interface into VM system, plus kernel vm_mmap
 * function.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_mmap.c,v 1.154 2015/11/26 13:15:34 martin Exp $");

#include "opt_compat_netbsd.h"
#include "opt_pax.h"

#include <sys/types.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/resourcevar.h>
#include <sys/mman.h>

#if defined(PAX_ASLR) || defined(PAX_MPROTECT)
#include <sys/pax.h>
#endif /* PAX_ASLR || PAX_MPROTECT */

#include <sys/syscallargs.h>

#include <uvm/uvm.h>
#include <uvm/uvm_device.h>

static int uvm_mmap(struct vm_map *, vaddr_t *, vsize_t, vm_prot_t, vm_prot_t,
		    int, int, struct uvm_object *, voff_t, vsize_t);

static int
range_test(vaddr_t addr, vsize_t size, bool ismmap)
{
	vaddr_t vm_min_address = VM_MIN_ADDRESS;
	vaddr_t vm_max_address = VM_MAXUSER_ADDRESS;
	vaddr_t eaddr = addr + size;
	int res = 0;

	if (addr < vm_min_address)
		return EINVAL;
	if (eaddr > vm_max_address)
		return ismmap ? EFBIG : EINVAL;
	if (addr > eaddr) /* no wrapping! */
		return ismmap ? EOVERFLOW : EINVAL;

#ifdef MD_MMAP_RANGE_TEST
	res = MD_MMAP_RANGE_TEST(addr, eaddr);
#endif

	return res;
}
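
/*
 * Worked example (illustrative only): on an ILP32 port, a call with
 * addr = 0xfffff000 and size = 0x2000 computes eaddr = 0x1000, so
 * addr > eaddr and the "no wrapping" check above fires, returning
 * EOVERFLOW to mmap callers and EINVAL to everyone else.
 */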

/*
 * unimplemented VM system calls:
 */

/*
 * sys_sbrk: sbrk system call.
 */

/* ARGSUSED */
int
sys_sbrk(struct lwp *l, const struct sys_sbrk_args *uap, register_t *retval)
{
	/* {
		syscallarg(intptr_t) incr;
	} */

	return (ENOSYS);
}

/*
 * sys_sstk: sstk system call.
 */

/* ARGSUSED */
int
sys_sstk(struct lwp *l, const struct sys_sstk_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) incr;
	} */

	return (ENOSYS);
}

/*
 * sys_mincore: determine if pages are in core or not.
 */

/* ARGSUSED */
int
sys_mincore(struct lwp *l, const struct sys_mincore_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(char *) vec;
	} */
	struct proc *p = l->l_proc;
	struct vm_page *pg;
	char *vec, pgi;
	struct uvm_object *uobj;
	struct vm_amap *amap;
	struct vm_anon *anon;
	struct vm_map_entry *entry;
	vaddr_t start, end, lim;
	struct vm_map *map;
	vsize_t len;
	int error = 0, npgs;

	map = &p->p_vmspace->vm_map;

	start = (vaddr_t)SCARG(uap, addr);
	len = SCARG(uap, len);
	vec = SCARG(uap, vec);

	if (start & PAGE_MASK)
		return (EINVAL);
	len = round_page(len);
	end = start + len;
	if (end <= start)
		return (EINVAL);

	/*
	 * Lock down vec, so that storing the status byte for a page
	 * cannot fault and thereby invalidate status we have already
	 * returned.
	 */

	npgs = len >> PAGE_SHIFT;
	error = uvm_vslock(p->p_vmspace, vec, npgs, VM_PROT_WRITE);
	if (error) {
		return error;
	}
	vm_map_lock_read(map);

	if (uvm_map_lookup_entry(map, start, &entry) == false) {
		error = ENOMEM;
		goto out;
	}

	for (/* nothing */;
	     entry != &map->header && entry->start < end;
	     entry = entry->next) {
		KASSERT(!UVM_ET_ISSUBMAP(entry));
		KASSERT(start >= entry->start);

		/* Make sure there are no holes. */
		if (entry->end < end &&
		     (entry->next == &map->header ||
		      entry->next->start > entry->end)) {
			error = ENOMEM;
			goto out;
		}

		lim = end < entry->end ? end : entry->end;

		/*
		 * Special case for objects with no "real" pages.  Those
		 * are always considered resident (mapped devices).
		 */

		if (UVM_ET_ISOBJ(entry)) {
			KASSERT(!UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj));
			if (UVM_OBJ_IS_DEVICE(entry->object.uvm_obj)) {
				for (/* nothing */; start < lim;
				     start += PAGE_SIZE, vec++)
					subyte(vec, 1);
				continue;
			}
		}

		amap = entry->aref.ar_amap;	/* upper layer */
		uobj = entry->object.uvm_obj;	/* lower layer */

		if (amap != NULL)
			amap_lock(amap);
		if (uobj != NULL)
			mutex_enter(uobj->vmobjlock);

		for (/* nothing */; start < lim; start += PAGE_SIZE, vec++) {
			pgi = 0;
			if (amap != NULL) {
				/* Check the upper layer first. */
				anon = amap_lookup(&entry->aref,
				    start - entry->start);
				/* Don't need to lock anon here. */
				if (anon != NULL && anon->an_page != NULL) {

					/*
					 * Anon has the page for this entry
					 * offset.
					 */

					pgi = 1;
				}
			}
			if (uobj != NULL && pgi == 0) {
				/* Check the lower layer. */
				pg = uvm_pagelookup(uobj,
				    entry->offset + (start - entry->start));
				if (pg != NULL) {

					/*
					 * Object has the page for this entry
					 * offset.
					 */

					pgi = 1;
				}
			}
			(void) subyte(vec, pgi);
		}
		if (uobj != NULL)
			mutex_exit(uobj->vmobjlock);
		if (amap != NULL)
			amap_unlock(amap);
	}

 out:
	vm_map_unlock_read(map);
	uvm_vsunlock(p->p_vmspace, SCARG(uap, vec), npgs);
	return (error);
}
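
/*
 * Illustrative userland sketch (not part of this file): in a fresh
 * anonymous mapping, only pages that have actually been touched
 * report as resident.
 *
 *	size_t pgsz = (size_t)sysconf(_SC_PAGESIZE);
 *	char *buf = mmap(NULL, 4 * pgsz, PROT_READ | PROT_WRITE,
 *	    MAP_ANON | MAP_PRIVATE, -1, 0);
 *	char vec[4];
 *	buf[0] = 1;			(faults in the first page)
 *	mincore(buf, 4 * pgsz, vec);	(vec[0] == 1, vec[1..3] == 0)
 */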

/*
 * sys_mmap: mmap system call.
 *
 * => file offset and address may not be page aligned
 *    - if MAP_FIXED, offset and address must have the same remainder
 *      modulo PAGE_SIZE
 *    - if address isn't page aligned the mapping starts at trunc_page(addr)
 *      and the return value is adjusted up by the page offset.
 */

int
sys_mmap(struct lwp *l, const struct sys_mmap_args *uap, register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) prot;
		syscallarg(int) flags;
		syscallarg(int) fd;
		syscallarg(long) pad;
		syscallarg(off_t) pos;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	off_t pos;
	vsize_t size, pageoff, newsize;
	vm_prot_t prot, maxprot;
	int flags, fd, advice;
	vaddr_t defaddr;
	struct file *fp = NULL;
	struct uvm_object *uobj;
	int error;
#ifdef PAX_ASLR
	vaddr_t orig_addr;
#endif /* PAX_ASLR */

	/*
	 * first, extract syscall args from the uap.
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	prot = SCARG(uap, prot) & VM_PROT_ALL;
	flags = SCARG(uap, flags);
	fd = SCARG(uap, fd);
	pos = SCARG(uap, pos);

#ifdef PAX_ASLR
	orig_addr = addr;
#endif /* PAX_ASLR */

	/*
	 * Fixup the old deprecated MAP_COPY into MAP_PRIVATE, and
	 * validate the flags.
	 */
	if (flags & MAP_COPY) {
		flags = (flags & ~MAP_COPY) | MAP_PRIVATE;
#if defined(COMPAT_10) && defined(__i386__)
		/*
		 * Ancient kernels on x86 (i386 at least) did not enforce
		 * PROT_EXEC, and ld.so did not request it.  Grant it here
		 * for such old binaries; amd64 handles this in its 32-bit
		 * compat code.
		 */
		prot |= PROT_EXEC;
#endif
	}
	if ((flags & (MAP_SHARED|MAP_PRIVATE)) == (MAP_SHARED|MAP_PRIVATE))
		return (EINVAL);

	/*
	 * align file position and save offset.  adjust size.
	 */

	pageoff = (pos & PAGE_MASK);
	pos    -= pageoff;
	newsize = size + pageoff;		/* add offset */
	newsize = (vsize_t)round_page(newsize);	/* round up */

	if (newsize < size)
		return (ENOMEM);
	size = newsize;

	/*
	 * now check (MAP_FIXED) or get (!MAP_FIXED) the "addr"
	 */
	if (flags & MAP_FIXED) {

		/* ensure address and file offset are aligned properly */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);

		error = range_test(addr, size, true);
		if (error) {
			return error;
		}

	} else if (addr == 0 || !(flags & MAP_TRYFIXED)) {

		/*
		 * not fixed: make sure we skip over the largest
		 * possible heap for non-topdown mapping arrangements.
		 * we will refine our guess later (e.g. to account for
		 * VAC, etc)
		 */

		defaddr = p->p_emul->e_vm_default_addr(p,
		    (vaddr_t)p->p_vmspace->vm_daddr, size,
		    p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN);

		if (addr == 0 ||
		    !(p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN))
			addr = MAX(addr, defaddr);
		else
			addr = MIN(addr, defaddr);
	}

	/*
	 * check for file mappings (i.e. not anonymous) and verify file.
	 */

	advice = UVM_ADV_NORMAL;
	if ((flags & MAP_ANON) == 0) {
		if ((fp = fd_getfile(fd)) == NULL)
			return (EBADF);

		if (fp->f_ops->fo_mmap == NULL) {
			error = ENODEV;
			goto out;
		}
		error = (*fp->f_ops->fo_mmap)(fp, &pos, size, prot, &flags,
					      &advice, &uobj, &maxprot);
		if (error) {
			goto out;
		}
		if (uobj == NULL) {
			flags |= MAP_ANON;
			fd_putfile(fd);
			fp = NULL;
			goto is_anon;
		}
	} else {		/* MAP_ANON case */
		/*
		 * XXX What do we do about (MAP_SHARED|MAP_PRIVATE) == 0?
		 */
		if (fd != -1)
			return (EINVAL);

 is_anon:		/* label for SunOS style /dev/zero */
		uobj = NULL;
		maxprot = VM_PROT_ALL;
		pos = 0;
	}

#ifdef PAX_MPROTECT
	pax_mprotect(l, &prot, &maxprot);
#endif /* PAX_MPROTECT */

#ifdef PAX_ASLR
	pax_aslr_mmap(l, &addr, orig_addr, flags);
#endif /* PAX_ASLR */

	/*
	 * now let kernel internal function uvm_mmap do the work.
	 */

	error = uvm_mmap(&p->p_vmspace->vm_map, &addr, size, prot, maxprot,
	    flags, advice, uobj, pos, p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);

	/* remember to add offset */
	*retval = (register_t)(addr + pageoff);

 out:
	if (fp != NULL)
		fd_putfile(fd);

	return (error);
}
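
/*
 * Illustrative userland sketch: because sys_mmap truncates a non-page-
 * aligned file offset and adds the remainder back into the returned
 * address, a caller may map from an arbitrary byte offset (an extension
 * over POSIX, which lets implementations reject unaligned offsets):
 *
 *	char *p = mmap(NULL, len, PROT_READ, MAP_FILE | MAP_PRIVATE,
 *	    fd, off);
 *	(on success p addresses the file byte at offset "off", while the
 *	 mapping itself starts at the page boundary below it)
 */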

/*
 * sys___msync13: the msync system call (a front-end for flush)
 */

int
sys___msync13(struct lwp *l, const struct sys___msync13_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) flags;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	struct vm_map *map;
	int error, rv, flags, uvmflags;

	/*
	 * extract syscall args from the uap
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	flags = SCARG(uap, flags);

	/* sanity check flags */
	if ((flags & ~(MS_ASYNC | MS_SYNC | MS_INVALIDATE)) != 0 ||
	    (flags & (MS_ASYNC | MS_SYNC | MS_INVALIDATE)) == 0 ||
	    (flags & (MS_ASYNC | MS_SYNC)) == (MS_ASYNC | MS_SYNC))
		return (EINVAL);
	if ((flags & (MS_ASYNC | MS_SYNC)) == 0)
		flags |= MS_SYNC;

	/*
	 * align the address to a page boundary and adjust the size accordingly.
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	error = range_test(addr, size, false);
	if (error)
		return error;

	/*
	 * get map
	 */

	map = &p->p_vmspace->vm_map;

	/*
	 * XXXCDC: do we really need this semantic?
	 *
	 * XXX Gak!  If size is zero we are supposed to sync "all modified
	 * pages with the region containing addr".  Unfortunately, we
	 * don't really keep track of individual mmaps so we approximate
	 * by flushing the range of the map entry containing addr.
	 * This can be incorrect if the region splits or is coalesced
	 * with a neighbor.
	 */

	if (size == 0) {
		struct vm_map_entry *entry;

		vm_map_lock_read(map);
		rv = uvm_map_lookup_entry(map, addr, &entry);
		if (rv == true) {
			addr = entry->start;
			size = entry->end - entry->start;
		}
		vm_map_unlock_read(map);
		if (rv == false)
			return (EINVAL);
	}

	/*
	 * translate MS_ flags into PGO_ flags
	 */

	uvmflags = PGO_CLEANIT;
	if (flags & MS_INVALIDATE)
		uvmflags |= PGO_FREE;
	if (flags & MS_SYNC)
		uvmflags |= PGO_SYNCIO;

	error = uvm_map_clean(map, addr, addr+size, uvmflags);
	return error;
}
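
/*
 * Illustrative userland sketch: synchronously write back a modified
 * file mapping and discard the cached pages; MS_SYNC | MS_INVALIDATE
 * becomes PGO_CLEANIT | PGO_SYNCIO | PGO_FREE above.
 *
 *	if (msync(p, len, MS_SYNC | MS_INVALIDATE) == -1)
 *		err(1, "msync");
 */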

/*
 * sys_munmap: unmap a user's memory
 */

int
sys_munmap(struct lwp *l, const struct sys_munmap_args *uap, register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	struct vm_map *map;
	struct vm_map_entry *dead_entries;
	int error;

	/*
	 * get syscall args.
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);

	/*
	 * align the address to a page boundary and adjust the size accordingly.
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	if (size == 0)
		return (0);

	error = range_test(addr, size, false);
	if (error)
		return error;

	map = &p->p_vmspace->vm_map;

	/*
	 * interesting system call semantic: make sure entire range is
	 * allocated before allowing an unmap.
	 */

	vm_map_lock(map);
#if 0
	if (!uvm_map_checkprot(map, addr, addr + size, VM_PROT_NONE)) {
		vm_map_unlock(map);
		return (EINVAL);
	}
#endif
	uvm_unmap_remove(map, addr, addr + size, &dead_entries, 0);
	vm_map_unlock(map);
	if (dead_entries != NULL)
		uvm_unmap_detach(dead_entries, 0);
	return (0);
}

/*
 * sys_mprotect: the mprotect system call
 */

int
sys_mprotect(struct lwp *l, const struct sys_mprotect_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) prot;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	vm_prot_t prot;
	int error;

	/*
	 * extract syscall args from uap
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	prot = SCARG(uap, prot) & VM_PROT_ALL;

	/*
	 * align the address to a page boundary and adjust the size accordingly.
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = round_page(size);

	error = range_test(addr, size, false);
	if (error)
		return error;

	error = uvm_map_protect(&p->p_vmspace->vm_map, addr, addr + size, prot,
				false);
	return error;
}

/*
 * sys_minherit: the minherit system call
 */

int
sys_minherit(struct lwp *l, const struct sys_minherit_args *uap,
   register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(int) len;
		syscallarg(int) inherit;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	vm_inherit_t inherit;
	int error;

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	inherit = SCARG(uap, inherit);

	/*
	 * align the address to a page boundary and adjust the size accordingly.
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	error = range_test(addr, size, false);
	if (error)
		return error;

	error = uvm_map_inherit(&p->p_vmspace->vm_map, addr, addr + size,
				inherit);
	return error;
}
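
/*
 * Illustrative userland sketch (hypothetical p, pgsz): share one region
 * with children across fork() while the rest of the address space keeps
 * the default copy-on-write inheritance.
 *
 *	minherit(p, pgsz, MAP_INHERIT_SHARE);
 *	if (fork() == 0) {
 *		(writes by the child through p are seen by the parent)
 *	}
 */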

/*
 * sys_madvise: give advice about memory usage.
 */

/* ARGSUSED */
int
sys_madvise(struct lwp *l, const struct sys_madvise_args *uap,
   register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) behav;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	int advice, error;

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	advice = SCARG(uap, behav);

	/*
	 * align the address to a page boundary, and adjust the size accordingly
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	error = range_test(addr, size, false);
	if (error)
		return error;

	switch (advice) {
	case MADV_NORMAL:
	case MADV_RANDOM:
	case MADV_SEQUENTIAL:
		error = uvm_map_advice(&p->p_vmspace->vm_map, addr, addr + size,
		    advice);
		break;

	case MADV_WILLNEED:

		/*
		 * Activate all these pages, pre-faulting them in if
		 * necessary.
		 */
		error = uvm_map_willneed(&p->p_vmspace->vm_map,
		    addr, addr + size);
		break;

	case MADV_DONTNEED:

		/*
		 * Deactivate all these pages.  We don't need them
		 * any more.  We don't, however, toss the data in
		 * the pages.
		 */

		error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
		    PGO_DEACTIVATE);
		break;

	case MADV_FREE:

		/*
		 * These pages contain no valid data, and may be
		 * garbage-collected.  Toss all resources, including
		 * any swap space in use.
		 */

		error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
		    PGO_FREE);
		break;

	case MADV_SPACEAVAIL:

		/*
		 * XXXMRG What is this?  I think it's:
		 *
		 *	Ensure that we have allocated backing-store
		 *	for these pages.
		 *
		 * This is going to require changes to the page daemon,
		 * as it will free swap space allocated to pages in core.
		 * There's also what to do for device/file/anonymous memory.
		 */

		return (EINVAL);

	default:
		return (EINVAL);
	}

	return error;
}
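
/*
 * Illustrative userland sketch: hint a one-pass scan, then release the
 * pages; per the MADV_FREE case above, contents (and any swap backing)
 * are tossed, not written back.
 *
 *	madvise(p, len, MADV_SEQUENTIAL);
 *	(... scan the mapping ...)
 *	madvise(p, len, MADV_FREE);
 */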

/*
 * sys_mlock: memory lock
 */

int
sys_mlock(struct lwp *l, const struct sys_mlock_args *uap, register_t *retval)
{
	/* {
		syscallarg(const void *) addr;
		syscallarg(size_t) len;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	int error;

	/*
	 * extract syscall args from uap
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);

	/*
	 * align the address to a page boundary and adjust the size accordingly
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	error = range_test(addr, size, false);
	if (error)
		return error;

	if (atop(size) + uvmexp.wired > uvmexp.wiredmax)
		return (EAGAIN);

	if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) >
			p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur)
		return (EAGAIN);

	error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, false,
	    0);
	if (error == EFAULT)
		error = ENOMEM;
	return error;
}

/*
 * sys_munlock: unlock wired pages
 */

int
sys_munlock(struct lwp *l, const struct sys_munlock_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(const void *) addr;
		syscallarg(size_t) len;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	int error;

	/*
	 * extract syscall args from uap
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);

	/*
	 * align the address to a page boundary, and adjust the size accordingly
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	error = range_test(addr, size, false);
	if (error)
		return error;

	error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, true,
	    0);
	if (error == EFAULT)
		error = ENOMEM;
	return error;
}

/*
 * sys_mlockall: lock all pages mapped into an address space.
 */

int
sys_mlockall(struct lwp *l, const struct sys_mlockall_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int) flags;
	} */
	struct proc *p = l->l_proc;
	int error, flags;

	flags = SCARG(uap, flags);

	if (flags == 0 ||
	    (flags & ~(MCL_CURRENT|MCL_FUTURE)) != 0)
		return (EINVAL);

	error = uvm_map_pageable_all(&p->p_vmspace->vm_map, flags,
	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
	return (error);
}

/*
 * sys_munlockall: unlock all pages mapped into an address space.
 */

int
sys_munlockall(struct lwp *l, const void *v, register_t *retval)
{
	struct proc *p = l->l_proc;

	(void) uvm_map_pageable_all(&p->p_vmspace->vm_map, 0, 0);
	return (0);
}

/*
 * uvm_mmap: internal version of mmap
 *
 * - used by sys_mmap and various framebuffers
 * - uobj is a struct uvm_object pointer or NULL for MAP_ANON
 * - caller must page-align the file offset
 */

int
uvm_mmap(struct vm_map *map, vaddr_t *addr, vsize_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags, int advice, struct uvm_object *uobj,
    voff_t foff, vsize_t locklimit)
{
	vaddr_t align = 0;
	int error;
	uvm_flag_t uvmflag = 0;

	/*
	 * check params
	 */

	if (size == 0)
		return(0);
	if (foff & PAGE_MASK)
		return(EINVAL);
	if ((prot & maxprot) != prot)
		return(EINVAL);

	/*
	 * for non-fixed mappings, round off the suggested address.
	 * for fixed mappings, check alignment and zap old mappings.
	 */

	if ((flags & MAP_FIXED) == 0) {
		*addr = round_page(*addr);
	} else {
		if (*addr & PAGE_MASK)
			return(EINVAL);
		uvmflag |= UVM_FLAG_FIXED;
		(void) uvm_unmap(map, *addr, *addr + size);
	}

	/*
	 * Check whether any requested alignment can even be attempted.
	 * Make sure we can express the alignment (asking for a >= 4GB
	 * alignment on an ILP32 architecture makes no sense) and that
	 * it is at least a page-sized quantity.  If the request was for
	 * a fixed mapping, make sure the supplied address adheres to
	 * the requested alignment.
	 */
	align = (flags & MAP_ALIGNMENT_MASK) >> MAP_ALIGNMENT_SHIFT;
	if (align) {
		if (align >= sizeof(vaddr_t) * NBBY)
			return(EINVAL);
		align = 1L << align;
		if (align < PAGE_SIZE)
			return(EINVAL);
		if (align >= vm_map_max(map))
			return(ENOMEM);
		if (flags & MAP_FIXED) {
			if ((*addr & (align-1)) != 0)
				return(EINVAL);
			align = 0;
		}
	}
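
	/*
	 * For example (illustrative only): a caller wanting a 2 MiB
	 * aligned mapping passes MAP_ALIGNED(21), i.e. log2 of the
	 * alignment, in its mmap flags; the shift above recovers 21
	 * and the code converts it to align = 1L << 21.
	 */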

	/*
	 * check resource limits
	 */

	if (!VM_MAP_IS_KERNEL(map) &&
	    (((rlim_t)curproc->p_vmspace->vm_map.size + (rlim_t)size) >
	    curproc->p_rlimit[RLIMIT_AS].rlim_cur))
		return ENOMEM;

	/*
	 * handle anon vs. non-anon mappings.   for non-anon mappings attach
	 * to underlying vm object.
	 */

	if (flags & MAP_ANON) {
		KASSERT(uobj == NULL);
		foff = UVM_UNKNOWN_OFFSET;
		if ((flags & MAP_SHARED) == 0)
			/* XXX: defer amap create */
			uvmflag |= UVM_FLAG_COPYONW;
		else
			/* shared: create amap now */
			uvmflag |= UVM_FLAG_OVERLAY;

	} else {
		KASSERT(uobj != NULL);
		if ((flags & MAP_SHARED) == 0) {
			uvmflag |= UVM_FLAG_COPYONW;
		}
	}

	uvmflag = UVM_MAPFLAG(prot, maxprot,
			(flags & MAP_SHARED) ? UVM_INH_SHARE : UVM_INH_COPY,
			advice, uvmflag);
	error = uvm_map(map, addr, size, uobj, foff, align, uvmflag);
	if (error) {
		if (uobj)
			uobj->pgops->pgo_detach(uobj);
		return error;
	}

	/*
	 * POSIX 1003.1b -- if our address space was configured
	 * to lock all future mappings, wire the one we just made.
	 *
	 * Also handle the MAP_WIRED flag here.
	 */

	if (prot == VM_PROT_NONE) {

		/*
		 * No more work to do in this case.
		 */

		return (0);
	}
	if ((flags & MAP_WIRED) != 0 || (map->flags & VM_MAP_WIREFUTURE) != 0) {
		vm_map_lock(map);
		if (atop(size) + uvmexp.wired > uvmexp.wiredmax ||
		    (locklimit != 0 &&
		     size + ptoa(pmap_wired_count(vm_map_pmap(map))) >
		     locklimit)) {
			vm_map_unlock(map);
			uvm_unmap(map, *addr, *addr + size);
			return ENOMEM;
		}

		/*
		 * uvm_map_pageable() always returns the map unlocked.
		 */

		error = uvm_map_pageable(map, *addr, *addr + size,
					 false, UVM_LK_ENTER);
		if (error) {
			uvm_unmap(map, *addr, *addr + size);
			return error;
		}
		return (0);
	}
	return 0;
}

vaddr_t
uvm_default_mapaddr(struct proc *p, vaddr_t base, vsize_t sz, int topdown)
{

	if (topdown)
		return VM_DEFAULT_ADDRESS_TOPDOWN(base, sz);
	else
		return VM_DEFAULT_ADDRESS_BOTTOMUP(base, sz);
}

int
uvm_mmap_dev(struct proc *p, void **addrp, size_t len, dev_t dev,
    off_t off)
{
	struct uvm_object *uobj;
	int error, flags, prot;

	flags = MAP_SHARED;
	prot = VM_PROT_READ | VM_PROT_WRITE;
	if (*addrp)
		flags |= MAP_FIXED;
	else
		*addrp = (void *)p->p_emul->e_vm_default_addr(p,
		    (vaddr_t)p->p_vmspace->vm_daddr, len,
		    p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN);

	uobj = udv_attach(dev, prot, off, len);
	if (uobj == NULL)
		return EINVAL;

	error = uvm_mmap(&p->p_vmspace->vm_map, (vaddr_t *)addrp,
			 (vsize_t)len, prot, prot, flags, UVM_ADV_RANDOM,
			 uobj, off, p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
	return error;
}
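
/*
 * Illustrative kernel-side sketch (hypothetical driver code): passing a
 * NULL hint lets uvm_mmap_dev pick the address via e_vm_default_addr.
 *
 *	void *va = NULL;
 *	error = uvm_mmap_dev(curproc, &va, len, dev, off);
 *	(on success, va holds the user address of the device mapping)
 */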

int
uvm_mmap_anon(struct proc *p, void **addrp, size_t len)
{
	int error, flags, prot;

	flags = MAP_PRIVATE | MAP_ANON;
	prot = VM_PROT_READ | VM_PROT_WRITE;
	if (*addrp)
		flags |= MAP_FIXED;
	else
		*addrp = (void *)p->p_emul->e_vm_default_addr(p,
		    (vaddr_t)p->p_vmspace->vm_daddr, len,
		    p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN);

	error = uvm_mmap(&p->p_vmspace->vm_map, (vaddr_t *)addrp,
			 (vsize_t)len, prot, prot, flags, UVM_ADV_NORMAL,
			 NULL, 0, p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
	return error;
}