/*	$NetBSD: uvm_mmap.c,v 1.115 2007/09/23 16:05:40 yamt Exp $	*/

/*
 * Copyright (c) 1997 Charles D. Cranor and Washington University.
 * Copyright (c) 1991, 1993 The Regents of the University of California.
 * Copyright (c) 1988 University of Utah.
 *
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the Charles D. Cranor,
 *      Washington University, University of California, Berkeley and
 *      its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 * @(#)vm_mmap.c 8.5 (Berkeley) 5/19/94
 * from: Id: uvm_mmap.c,v 1.1.2.14 1998/01/05 21:04:26 chuck Exp
 */

/*
 * uvm_mmap.c: system call interface into VM system, plus kernel vm_mmap
 * function.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_mmap.c,v 1.115 2007/09/23 16:05:40 yamt Exp $");

#include "opt_compat_netbsd.h"
#include "opt_pax.h"
#include "veriexec.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/resourcevar.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <sys/stat.h>

#if NVERIEXEC > 0
#include <sys/verified_exec.h>
#endif /* NVERIEXEC > 0 */

#ifdef PAX_MPROTECT
#include <sys/pax.h>
#endif /* PAX_MPROTECT */

#include <miscfs/specfs/specdev.h>

#include <sys/syscallargs.h>

#include <uvm/uvm.h>
#include <uvm/uvm_device.h>

#ifndef COMPAT_ZERODEV
#define COMPAT_ZERODEV(dev)	(0)
#endif

static int
range_test(vaddr_t addr, vsize_t size, bool ismmap)
{
	vaddr_t vm_min_address = VM_MIN_ADDRESS;
	vaddr_t vm_max_address = VM_MAXUSER_ADDRESS;
	vaddr_t eaddr = addr + size;

	if (addr < vm_min_address)
		return EINVAL;
	if (eaddr > vm_max_address)
		return ismmap ? EFBIG : EINVAL;
	if (addr > eaddr) /* no wrapping! */
		return ismmap ? EOVERFLOW : EINVAL;
	return 0;
}
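
/*
 * Worked example (illustrative, with hypothetical numbers): on a 32-bit
 * port, a request with addr = 0xfffff000 and size = 0x2000 computes
 * eaddr = 0xfffff000 + 0x2000 = 0x1000 after wrap-around, so addr > eaddr
 * and range_test() rejects the request (EOVERFLOW for mmap, EINVAL for
 * the other callers) before the wrapped range can reach uvm_map().
 */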

/*
 * unimplemented VM system calls:
 */

/*
 * sys_sbrk: sbrk system call.
 */

/* ARGSUSED */
int
sys_sbrk(struct lwp *l, void *v, register_t *retval)
{
#if 0
	struct sys_sbrk_args /* {
		syscallarg(intptr_t) incr;
	} */ *uap = v;
#endif

	return (ENOSYS);
}

/*
 * sys_sstk: sstk system call.
 */

/* ARGSUSED */
int
sys_sstk(struct lwp *l, void *v, register_t *retval)
{
#if 0
	struct sys_sstk_args /* {
		syscallarg(int) incr;
	} */ *uap = v;
#endif

	return (ENOSYS);
}

/*
 * sys_mincore: determine if pages are in core or not.
 */

/* ARGSUSED */
int
sys_mincore(struct lwp *l, void *v, register_t *retval)
{
	struct sys_mincore_args /* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(char *) vec;
	} */ *uap = v;
	struct proc *p = l->l_proc;
	struct vm_page *pg;
	char *vec, pgi;
	struct uvm_object *uobj;
	struct vm_amap *amap;
	struct vm_anon *anon;
	struct vm_map_entry *entry;
	vaddr_t start, end, lim;
	struct vm_map *map;
	vsize_t len;
	int error = 0, npgs;

	map = &p->p_vmspace->vm_map;

	start = (vaddr_t)SCARG(uap, addr);
	len = SCARG(uap, len);
	vec = SCARG(uap, vec);

	if (start & PAGE_MASK)
		return (EINVAL);
	len = round_page(len);
	end = start + len;
	if (end <= start)
		return (EINVAL);

	/*
	 * Lock down vec, so our returned status isn't outdated by
	 * storing the status byte for a page.
	 */

	npgs = len >> PAGE_SHIFT;
	error = uvm_vslock(p->p_vmspace, vec, npgs, VM_PROT_WRITE);
	if (error) {
		return error;
	}
	vm_map_lock_read(map);

	if (uvm_map_lookup_entry(map, start, &entry) == false) {
		error = ENOMEM;
		goto out;
	}

	for (/* nothing */;
	     entry != &map->header && entry->start < end;
	     entry = entry->next) {
		KASSERT(!UVM_ET_ISSUBMAP(entry));
		KASSERT(start >= entry->start);

		/* Make sure there are no holes. */
		if (entry->end < end &&
		    (entry->next == &map->header ||
		     entry->next->start > entry->end)) {
			error = ENOMEM;
			goto out;
		}

		lim = end < entry->end ? end : entry->end;

		/*
		 * Special case for objects with no "real" pages.  Those
		 * are always considered resident (mapped devices).
		 */

		if (UVM_ET_ISOBJ(entry)) {
			KASSERT(!UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj));
			if (UVM_OBJ_IS_DEVICE(entry->object.uvm_obj)) {
				for (/* nothing */; start < lim;
				     start += PAGE_SIZE, vec++)
					subyte(vec, 1);
				continue;
			}
		}

		amap = entry->aref.ar_amap;	/* top layer */
		uobj = entry->object.uvm_obj;	/* bottom layer */

		if (amap != NULL)
			amap_lock(amap);
		if (uobj != NULL)
			simple_lock(&uobj->vmobjlock);

		for (/* nothing */; start < lim; start += PAGE_SIZE, vec++) {
			pgi = 0;
			if (amap != NULL) {
				/* Check the top layer first. */
				anon = amap_lookup(&entry->aref,
				    start - entry->start);
				/* Don't need to lock anon here. */
				if (anon != NULL && anon->an_page != NULL) {

					/*
					 * Anon has the page for this entry
					 * offset.
					 */

					pgi = 1;
				}
			}
			if (uobj != NULL && pgi == 0) {
				/* Check the bottom layer. */
				pg = uvm_pagelookup(uobj,
				    entry->offset + (start - entry->start));
				if (pg != NULL) {

					/*
					 * Object has the page for this entry
					 * offset.
					 */

					pgi = 1;
				}
			}
			(void) subyte(vec, pgi);
		}
		if (uobj != NULL)
			simple_unlock(&uobj->vmobjlock);
		if (amap != NULL)
			amap_unlock(amap);
	}

 out:
	vm_map_unlock_read(map);
	uvm_vsunlock(p->p_vmspace, SCARG(uap, vec), npgs);
	return (error);
}
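
/*
 * Illustrative userland sketch (hypothetical code, assuming the standard
 * mincore(2) interface): the syscall fills one status byte per page of
 * the range; a non-zero byte corresponds to the subyte(vec, 1) stores
 * performed above.
 *
 *	#include <sys/mman.h>
 *	#include <unistd.h>
 *
 *	size_t pgsz = (size_t)sysconf(_SC_PAGESIZE);
 *	size_t len = 16 * pgsz;
 *	char vec[16];
 *	char *base = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *	    MAP_ANON | MAP_PRIVATE, -1, 0);
 *	base[0] = 1;				// touch page 0 so it is resident
 *	int rv = mincore(base, len, vec);	// on success, vec[0] != 0
 */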

/*
 * sys_mmap: mmap system call.
 *
 * => file offset and address may not be page aligned
 *    - if MAP_FIXED, offset and address must have the same remainder
 *      modulo PAGE_SIZE
 *    - if address isn't page aligned the mapping starts at trunc_page(addr)
 *      and the return value is adjusted up by the page offset.
 */

int
sys_mmap(l, v, retval)
	struct lwp *l;
	void *v;
	register_t *retval;
{
	struct sys_mmap_args /* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) prot;
		syscallarg(int) flags;
		syscallarg(int) fd;
		syscallarg(long) pad;
		syscallarg(off_t) pos;
	} */ *uap = v;
	struct proc *p = l->l_proc;
	vaddr_t addr;
	struct vattr va;
	off_t pos;
	vsize_t size, pageoff;
	vm_prot_t prot, maxprot;
	int flags, fd;
	vaddr_t defaddr;
	struct filedesc *fdp = p->p_fd;
	struct file *fp;
	struct vnode *vp;
	void *handle;
	int error;

	/*
	 * first, extract syscall args from the uap.
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	prot = SCARG(uap, prot) & VM_PROT_ALL;
	flags = SCARG(uap, flags);
	fd = SCARG(uap, fd);
	pos = SCARG(uap, pos);

	/*
	 * Fixup the old deprecated MAP_COPY into MAP_PRIVATE, and
	 * validate the flags.
	 */
	if (flags & MAP_COPY)
		flags = (flags & ~MAP_COPY) | MAP_PRIVATE;
	if ((flags & (MAP_SHARED|MAP_PRIVATE)) == (MAP_SHARED|MAP_PRIVATE))
		return (EINVAL);

	/*
	 * align file position and save offset.  adjust size.
	 */

	pageoff = (pos & PAGE_MASK);
	pos -= pageoff;
	size += pageoff;			/* add offset */
	size = (vsize_t)round_page(size);	/* round up */

	/*
	 * now check (MAP_FIXED) or get (!MAP_FIXED) the "addr"
	 */
	if (flags & MAP_FIXED) {

		/* ensure address and file offset are aligned properly */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);

		error = range_test(addr, size, true);
		if (error)
			return error;
	} else if (addr == 0 || !(flags & MAP_TRYFIXED)) {

		/*
		 * not fixed: make sure we skip over the largest
		 * possible heap for non-topdown mapping arrangements.
		 * we will refine our guess later (e.g. to account for
		 * VAC, etc)
		 */

		defaddr = p->p_emul->e_vm_default_addr(p,
		    (vaddr_t)p->p_vmspace->vm_daddr, size);

		if (addr == 0 ||
		    !(p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN))
			addr = MAX(addr, defaddr);
		else
			addr = MIN(addr, defaddr);
	}

	/*
	 * check for file mappings (i.e. not anonymous) and verify file.
	 */

	if ((flags & MAP_ANON) == 0) {

		if ((fp = fd_getfile(fdp, fd)) == NULL)
			return (EBADF);

		simple_unlock(&fp->f_slock);

		if (fp->f_type != DTYPE_VNODE)
			return (ENODEV);		/* only mmap vnodes! */
		vp = (struct vnode *)fp->f_data;	/* convert to vnode */

		if (vp->v_type != VREG && vp->v_type != VCHR &&
		    vp->v_type != VBLK)
			return (ENODEV);  /* only REG/CHR/BLK support mmap */

		if (vp->v_type != VCHR && pos < 0)
			return (EINVAL);

		if (vp->v_type != VCHR && (pos + size) < pos)
			return (EOVERFLOW);		/* no offset wrapping */

		/* special case: catch SunOS style /dev/zero */
		if (vp->v_type == VCHR
		    && (vp->v_rdev == zerodev || COMPAT_ZERODEV(vp->v_rdev))) {
			flags |= MAP_ANON;
			goto is_anon;
		}

		/*
		 * Old programs may not select a specific sharing type, so
		 * default to an appropriate one.
		 *
		 * XXX: how does MAP_ANON fit in the picture?
		 */
		if ((flags & (MAP_SHARED|MAP_PRIVATE)) == 0) {
#if defined(DEBUG)
			printf("WARNING: defaulted mmap() share type to "
			    "%s (pid %d command %s)\n", vp->v_type == VCHR ?
			    "MAP_SHARED" : "MAP_PRIVATE", p->p_pid,
			    p->p_comm);
#endif
			if (vp->v_type == VCHR)
				flags |= MAP_SHARED;	/* for a device */
			else
				flags |= MAP_PRIVATE;	/* for a file */
		}

		/*
		 * MAP_PRIVATE device mappings don't make sense (and aren't
		 * supported anyway).  However, some programs rely on this,
		 * so just change it to MAP_SHARED.
		 */
		if (vp->v_type == VCHR && (flags & MAP_PRIVATE) != 0) {
			flags = (flags & ~MAP_PRIVATE) | MAP_SHARED;
		}

		/*
		 * now check protection
		 */

		maxprot = VM_PROT_EXECUTE;

		/* check read access */
		if (fp->f_flag & FREAD)
			maxprot |= VM_PROT_READ;
		else if (prot & PROT_READ)
			return (EACCES);

		/* check write access, shared case first */
		if (flags & MAP_SHARED) {
			/*
			 * if the file is writable, only add PROT_WRITE to
			 * maxprot if the file is not immutable, append-only.
			 * otherwise, if we have asked for PROT_WRITE, return
			 * EPERM.
			 */
			if (fp->f_flag & FWRITE) {
				if ((error =
				    VOP_GETATTR(vp, &va, l->l_cred, l)))
					return (error);
				if ((va.va_flags &
				    (SF_SNAPSHOT|IMMUTABLE|APPEND)) == 0)
					maxprot |= VM_PROT_WRITE;
				else if (prot & PROT_WRITE)
					return (EPERM);
			}
			else if (prot & PROT_WRITE)
				return (EACCES);
		} else {
			/* MAP_PRIVATE mappings can always be written to */
			maxprot |= VM_PROT_WRITE;
		}
		handle = vp;

	} else {		/* MAP_ANON case */
		/*
		 * XXX What do we do about (MAP_SHARED|MAP_PRIVATE) == 0?
		 */
		if (fd != -1)
			return (EINVAL);

 is_anon:		/* label for SunOS style /dev/zero */
		handle = NULL;
		maxprot = VM_PROT_ALL;
		pos = 0;
	}

	/*
	 * XXX (in)sanity check.  We don't do proper datasize checking
	 * XXX for anonymous (or private writable) mmap().  However,
	 * XXX know that if we're trying to allocate more than the amount
	 * XXX remaining under our current data size limit, _that_ should
	 * XXX be disallowed.
	 */
	if ((flags & MAP_ANON) != 0 ||
	    ((flags & MAP_PRIVATE) != 0 && (prot & PROT_WRITE) != 0)) {
		if (size >
		    (p->p_rlimit[RLIMIT_DATA].rlim_cur -
		     ctob(p->p_vmspace->vm_dsize))) {
			return (ENOMEM);
		}
	}

#if NVERIEXEC > 0
	if (handle != NULL) {
		/*
		 * Check if the file can be executed indirectly.
		 *
		 * XXX: This gives false warnings about "Incorrect access type"
		 * XXX: if the mapping is not executable. Harmless, but will be
		 * XXX: fixed as part of other changes.
		 */
		if (veriexec_verify(l, handle, "(mmap)", VERIEXEC_INDIRECT,
		    NULL)) {
			/*
			 * Don't allow executable mappings if we can't
			 * indirectly execute the file.
			 */
			if (prot & VM_PROT_EXECUTE)
				return (EPERM);

			/*
			 * Strip the executable bit from 'maxprot' to make sure
			 * it can't be made executable later.
			 */
			maxprot &= ~VM_PROT_EXECUTE;
		}
	}
#endif /* NVERIEXEC > 0 */

#ifdef PAX_MPROTECT
	pax_mprotect(l, &prot, &maxprot);
#endif /* PAX_MPROTECT */

	/*
	 * now let kernel internal function uvm_mmap do the work.
	 */

	error = uvm_mmap(&p->p_vmspace->vm_map, &addr, size, prot, maxprot,
	    flags, handle, pos, p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);

	if (error == 0)
		/* remember to add offset */
		*retval = (register_t)(addr + pageoff);

	return (error);
}
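
/*
 * Illustrative userland sketch (hypothetical file name and numbers): as
 * described in the header comment above, a non-page-aligned file offset
 * is allowed.  The mapping itself is set up on page boundaries and the
 * returned pointer is bumped by the in-page offset, so the caller still
 * sees the bytes starting at "pos".
 *
 *	#include <sys/mman.h>
 *	#include <fcntl.h>
 *
 *	int fd = open("/some/file", O_RDONLY);
 *	off_t pos = 100;			// deliberately unaligned
 *	char *p = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE, fd, pos);
 *	// The kernel maps the page containing file offset 0 and returns
 *	// base + 100, so p[0] is byte 100 of the file.
 */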

/*
 * sys___msync13: the msync system call (a front-end for flush)
 */

int
sys___msync13(struct lwp *l, void *v, register_t *retval)
{
	struct sys___msync13_args /* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) flags;
	} */ *uap = v;
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	struct vm_map *map;
	int error, rv, flags, uvmflags;

	/*
	 * extract syscall args from the uap
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	flags = SCARG(uap, flags);

	/* sanity check flags */
	if ((flags & ~(MS_ASYNC | MS_SYNC | MS_INVALIDATE)) != 0 ||
	    (flags & (MS_ASYNC | MS_SYNC | MS_INVALIDATE)) == 0 ||
	    (flags & (MS_ASYNC | MS_SYNC)) == (MS_ASYNC | MS_SYNC))
		return (EINVAL);
	if ((flags & (MS_ASYNC | MS_SYNC)) == 0)
		flags |= MS_SYNC;

	/*
	 * align the address to a page boundary and adjust the size accordingly.
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	error = range_test(addr, size, false);
	if (error)
		return error;

	/*
	 * get map
	 */

	map = &p->p_vmspace->vm_map;

	/*
	 * XXXCDC: do we really need this semantic?
	 *
	 * XXX Gak!  If size is zero we are supposed to sync "all modified
	 * pages within the region containing addr".  Unfortunately, we
	 * don't really keep track of individual mmaps so we approximate
	 * by flushing the range of the map entry containing addr.
	 * This can be incorrect if the region splits or is coalesced
	 * with a neighbor.
	 */

	if (size == 0) {
		struct vm_map_entry *entry;

		vm_map_lock_read(map);
		rv = uvm_map_lookup_entry(map, addr, &entry);
		if (rv == true) {
			addr = entry->start;
			size = entry->end - entry->start;
		}
		vm_map_unlock_read(map);
		if (rv == false)
			return (EINVAL);
	}

	/*
	 * translate MS_ flags into PGO_ flags
	 */

	uvmflags = PGO_CLEANIT;
	if (flags & MS_INVALIDATE)
		uvmflags |= PGO_FREE;
	if (flags & MS_SYNC)
		uvmflags |= PGO_SYNCIO;

	error = uvm_map_clean(map, addr, addr+size, uvmflags);
	return error;
}
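
/*
 * Illustrative userland sketch (hypothetical file name): MS_SYNC maps to
 * PGO_SYNCIO above, so the call does not return until the dirty pages of
 * the range have been flushed; adding MS_INVALIDATE additionally requests
 * PGO_FREE.
 *
 *	#include <sys/mman.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	size_t pgsz = (size_t)sysconf(_SC_PAGESIZE);
 *	int fd = open("data.bin", O_RDWR);
 *	char *p = mmap(NULL, pgsz, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *	p[0] = 'x';				// dirty the first page
 *	msync(p, pgsz, MS_SYNC);		// write it back synchronously
 */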

/*
 * sys_munmap: unmap a user's memory
 */

int
sys_munmap(struct lwp *l, void *v, register_t *retval)
{
	struct sys_munmap_args /* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
	} */ *uap = v;
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	struct vm_map *map;
	struct vm_map_entry *dead_entries;
	int error;

	/*
	 * get syscall args.
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);

	/*
	 * align the address to a page boundary and adjust the size accordingly.
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	if (size == 0)
		return (0);

	error = range_test(addr, size, false);
	if (error)
		return error;

	map = &p->p_vmspace->vm_map;

	/*
	 * interesting system call semantic: make sure entire range is
	 * allocated before allowing an unmap.
	 */

	vm_map_lock(map);
#if 0
	if (!uvm_map_checkprot(map, addr, addr + size, VM_PROT_NONE)) {
		vm_map_unlock(map);
		return (EINVAL);
	}
#endif
	uvm_unmap_remove(map, addr, addr + size, &dead_entries, NULL, 0);
	vm_map_unlock(map);
	if (dead_entries != NULL)
		uvm_unmap_detach(dead_entries, 0);
	return (0);
}

/*
 * sys_mprotect: the mprotect system call
 */

int
sys_mprotect(struct lwp *l, void *v, register_t *retval)
{
	struct sys_mprotect_args /* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) prot;
	} */ *uap = v;
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	vm_prot_t prot;
	int error;

	/*
	 * extract syscall args from uap
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	prot = SCARG(uap, prot) & VM_PROT_ALL;

	/*
	 * align the address to a page boundary and adjust the size accordingly.
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = round_page(size);

	error = range_test(addr, size, false);
	if (error)
		return error;

	error = uvm_map_protect(&p->p_vmspace->vm_map, addr, addr + size, prot,
	    false);
	return error;
}
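
/*
 * Illustrative userland sketch (hypothetical code): a common use of
 * mprotect(2) is flipping a buffer from writable to executable, e.g. for
 * a small JIT region.  Note that a protection beyond the mapping's
 * maxprot (see PAX_MPROTECT handling in sys_mmap above) fails, typically
 * with EACCES.
 *
 *	#include <sys/mman.h>
 *	#include <unistd.h>
 *
 *	size_t pgsz = (size_t)sysconf(_SC_PAGESIZE);
 *	char *buf = mmap(NULL, pgsz, PROT_READ | PROT_WRITE,
 *	    MAP_ANON | MAP_PRIVATE, -1, 0);
 *	// ... emit machine code into buf ...
 *	int rv = mprotect(buf, pgsz, PROT_READ | PROT_EXEC);
 */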

/*
 * sys_minherit: the minherit system call
 */

int
sys_minherit(struct lwp *l, void *v, register_t *retval)
{
	struct sys_minherit_args /* {
		syscallarg(void *) addr;
		syscallarg(int) len;
		syscallarg(int) inherit;
	} */ *uap = v;
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	vm_inherit_t inherit;
	int error;

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	inherit = SCARG(uap, inherit);

	/*
	 * align the address to a page boundary and adjust the size accordingly.
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	error = range_test(addr, size, false);
	if (error)
		return error;

	error = uvm_map_inherit(&p->p_vmspace->vm_map, addr, addr + size,
	    inherit);
	return error;
}
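
/*
 * Illustrative userland sketch (hypothetical code, assuming the
 * MAP_INHERIT_* constants from <sys/mman.h>): minherit(2) controls what
 * fork(2) does with a mapping.  Marking an anonymous region
 * MAP_INHERIT_SHARE makes parent and child share the same pages instead
 * of getting copy-on-write copies.
 *
 *	#include <sys/mman.h>
 *	#include <unistd.h>
 *
 *	size_t pgsz = (size_t)sysconf(_SC_PAGESIZE);
 *	int *counter = mmap(NULL, pgsz, PROT_READ | PROT_WRITE,
 *	    MAP_ANON | MAP_PRIVATE, -1, 0);
 *	minherit(counter, pgsz, MAP_INHERIT_SHARE);
 *	// after fork(), *counter refers to the same memory in both
 *	// processes.
 */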

/*
 * sys_madvise: give advice about memory usage.
 */

/* ARGSUSED */
int
sys_madvise(struct lwp *l, void *v, register_t *retval)
{
	struct sys_madvise_args /* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) behav;
	} */ *uap = v;
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	int advice, error;

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	advice = SCARG(uap, behav);

	/*
	 * align the address to a page boundary, and adjust the size accordingly
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	error = range_test(addr, size, false);
	if (error)
		return error;

	switch (advice) {
	case MADV_NORMAL:
	case MADV_RANDOM:
	case MADV_SEQUENTIAL:
		error = uvm_map_advice(&p->p_vmspace->vm_map, addr, addr + size,
		    advice);
		break;

	case MADV_WILLNEED:

		/*
		 * Activate all these pages, pre-faulting them in if
		 * necessary.
		 */
		/*
		 * XXX IMPLEMENT ME.
		 * Should invent a "weak" mode for uvm_fault()
		 * which would only do the PGO_LOCKED pgo_get().
		 */

		return (0);

	case MADV_DONTNEED:

		/*
		 * Deactivate all these pages.  We don't need them
		 * any more.  We don't, however, toss the data in
		 * the pages.
		 */

		error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
		    PGO_DEACTIVATE);
		break;

	case MADV_FREE:

		/*
		 * These pages contain no valid data, and may be
		 * garbage-collected.  Toss all resources, including
		 * any swap space in use.
		 */

		error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
		    PGO_FREE);
		break;

	case MADV_SPACEAVAIL:

		/*
		 * XXXMRG What is this?  I think it's:
		 *
		 *	Ensure that we have allocated backing-store
		 *	for these pages.
		 *
		 * This is going to require changes to the page daemon,
		 * as it will free swap space allocated to pages in core.
		 * There's also what to do for device/file/anonymous memory.
		 */

		return (EINVAL);

	default:
		return (EINVAL);
	}

	return error;
}
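
/*
 * Illustrative userland sketch (hypothetical names "scratch" and
 * "scratchlen" for an existing anonymous mapping): the difference between
 * the two "discard" hints handled above is that MADV_DONTNEED only
 * deactivates the pages (the data survives a later access), while
 * MADV_FREE declares the contents garbage, so the pages and any swap
 * backing them may be thrown away.
 *
 *	#include <sys/mman.h>
 *
 *	madvise(scratch, scratchlen, MADV_SEQUENTIAL);	// access-pattern hint
 *	madvise(scratch, scratchlen, MADV_FREE);	// contents now invalid
 */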

/*
 * sys_mlock: memory lock
 */

int
sys_mlock(struct lwp *l, void *v, register_t *retval)
{
	struct sys_mlock_args /* {
		syscallarg(const void *) addr;
		syscallarg(size_t) len;
	} */ *uap = v;
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	int error;

	/*
	 * extract syscall args from uap
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);

	/*
	 * align the address to a page boundary and adjust the size accordingly
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	error = range_test(addr, size, false);
	if (error)
		return error;

	if (atop(size) + uvmexp.wired > uvmexp.wiredmax)
		return (EAGAIN);

	if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) >
	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur)
		return (EAGAIN);

	error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, false,
	    0);
	if (error == EFAULT)
		error = ENOMEM;
	return error;
}

/*
 * sys_munlock: unlock wired pages
 */

int
sys_munlock(struct lwp *l, void *v, register_t *retval)
{
	struct sys_munlock_args /* {
		syscallarg(const void *) addr;
		syscallarg(size_t) len;
	} */ *uap = v;
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	int error;

	/*
	 * extract syscall args from uap
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);

	/*
	 * align the address to a page boundary, and adjust the size accordingly
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	error = range_test(addr, size, false);
	if (error)
		return error;

	error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, true,
	    0);
	if (error == EFAULT)
		error = ENOMEM;
	return error;
}

/*
 * sys_mlockall: lock all pages mapped into an address space.
 */

int
sys_mlockall(struct lwp *l, void *v, register_t *retval)
{
	struct sys_mlockall_args /* {
		syscallarg(int) flags;
	} */ *uap = v;
	struct proc *p = l->l_proc;
	int error, flags;

	flags = SCARG(uap, flags);

	if (flags == 0 ||
	    (flags & ~(MCL_CURRENT|MCL_FUTURE)) != 0)
		return (EINVAL);

	error = uvm_map_pageable_all(&p->p_vmspace->vm_map, flags,
	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
	return (error);
}

/*
 * sys_munlockall: unlock all pages mapped into an address space.
 */

int
sys_munlockall(struct lwp *l, void *v, register_t *retval)
{
	struct proc *p = l->l_proc;

	(void) uvm_map_pageable_all(&p->p_vmspace->vm_map, 0, 0);
	return (0);
}

/*
 * uvm_mmap: internal version of mmap
 *
 * - used by sys_mmap and various framebuffers
 * - handle is a vnode pointer or NULL for MAP_ANON
 * - caller must page-align the file offset
 */

int
uvm_mmap(map, addr, size, prot, maxprot, flags, handle, foff, locklimit)
	struct vm_map *map;
	vaddr_t *addr;
	vsize_t size;
	vm_prot_t prot, maxprot;
	int flags;
	void *handle;
	voff_t foff;
	vsize_t locklimit;
{
	struct uvm_object *uobj;
	struct vnode *vp;
	vaddr_t align = 0;
	int error;
	int advice = UVM_ADV_NORMAL;
	uvm_flag_t uvmflag = 0;
	bool needwritemap;

	/*
	 * check params
	 */

	if (size == 0)
		return(0);
	if (foff & PAGE_MASK)
		return(EINVAL);
	if ((prot & maxprot) != prot)
		return(EINVAL);

	/*
	 * for non-fixed mappings, round off the suggested address.
	 * for fixed mappings, check alignment and zap old mappings.
	 */

	if ((flags & MAP_FIXED) == 0) {
		*addr = round_page(*addr);
	} else {
		if (*addr & PAGE_MASK)
			return(EINVAL);
		uvmflag |= UVM_FLAG_FIXED;
		(void) uvm_unmap(map, *addr, *addr + size);
	}

	/*
	 * Try to see if any requested alignment can even be attempted.
	 * Make sure we can express the alignment (asking for a >= 4GB
	 * alignment on an ILP32 architecture makes no sense) and that the
	 * alignment is at least a page-sized quantity.  If the request
	 * was for a fixed mapping, make sure the supplied address adheres
	 * to the requested alignment.
	 */
	align = (flags & MAP_ALIGNMENT_MASK) >> MAP_ALIGNMENT_SHIFT;
	if (align) {
		if (align >= sizeof(vaddr_t) * NBBY)
			return(EINVAL);
		align = 1L << align;
		if (align < PAGE_SIZE)
			return(EINVAL);
		if (align >= vm_map_max(map))
			return(ENOMEM);
		if (flags & MAP_FIXED) {
			if ((*addr & (align-1)) != 0)
				return(EINVAL);
			align = 0;
		}
	}
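
	/*
	 * For reference (illustrative, assuming the MAP_ALIGNED() macro in
	 * <sys/mman.h> encodes the log2 alignment into MAP_ALIGNMENT_MASK):
	 * a userland request for a 1MB-aligned anonymous mapping would be
	 *
	 *	void *va = mmap(NULL, len, PROT_READ | PROT_WRITE,
	 *	    MAP_ANON | MAP_PRIVATE | MAP_ALIGNED(20), -1, 0);
	 *
	 * and with the shift value 20 the code above computes
	 * align = 1L << 20.
	 */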

	/*
	 * handle anon vs. non-anon mappings.   for non-anon mappings attach
	 * to underlying vm object.
	 */

	if (flags & MAP_ANON) {
		KASSERT(handle == NULL);
		foff = UVM_UNKNOWN_OFFSET;
		uobj = NULL;
		if ((flags & MAP_SHARED) == 0)
			/* XXX: defer amap create */
			uvmflag |= UVM_FLAG_COPYONW;
		else
			/* shared: create amap now */
			uvmflag |= UVM_FLAG_OVERLAY;

	} else {
		KASSERT(handle != NULL);
		vp = (struct vnode *)handle;

		/*
		 * Don't allow mmap for EXEC if the file system
		 * is mounted NOEXEC.
		 */
		if ((prot & PROT_EXEC) != 0 &&
		    (vp->v_mount->mnt_flag & MNT_NOEXEC) != 0)
			return (EACCES);

		if (vp->v_type != VCHR) {
			error = VOP_MMAP(vp, prot, curlwp->l_cred, curlwp);
			if (error) {
				return error;
			}
			vref(vp);
			uobj = &vp->v_uobj;

			/*
			 * If the vnode is being mapped with PROT_EXEC,
			 * then mark it as text.
			 */
			if (prot & PROT_EXEC)
				vn_markexec(vp);
		} else {
			int i = maxprot;

			/*
			 * XXX Some devices don't like to be mapped with
			 * XXX PROT_EXEC or PROT_WRITE, but we don't really
			 * XXX have a better way of handling this, right now
			 */
			do {
				uobj = udv_attach((void *) &vp->v_rdev,
				    (flags & MAP_SHARED) ? i :
				    (i & ~VM_PROT_WRITE), foff, size);
				i--;
			} while ((uobj == NULL) && (i > 0));
			advice = UVM_ADV_RANDOM;
		}
		if (uobj == NULL)
			return((vp->v_type == VREG) ? ENOMEM : EINVAL);
		if ((flags & MAP_SHARED) == 0) {
			uvmflag |= UVM_FLAG_COPYONW;
		}

		/*
		 * Set vnode flags to indicate the new kinds of mapping.
		 * We take the vnode lock in exclusive mode here to serialize
		 * with direct I/O.
		 */

		needwritemap = (vp->v_flag & VWRITEMAP) == 0 &&
		    (flags & MAP_SHARED) != 0 &&
		    (maxprot & VM_PROT_WRITE) != 0;
		if ((vp->v_flag & VMAPPED) == 0 || needwritemap) {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			simple_lock(&vp->v_interlock);
			vp->v_flag |= VMAPPED;
			if (needwritemap) {
				vp->v_flag |= VWRITEMAP;
			}
			simple_unlock(&vp->v_interlock);
			VOP_UNLOCK(vp, 0);
		}
	}

	uvmflag = UVM_MAPFLAG(prot, maxprot,
	    (flags & MAP_SHARED) ? UVM_INH_SHARE : UVM_INH_COPY,
	    advice, uvmflag);
	error = uvm_map(map, addr, size, uobj, foff, align, uvmflag);
	if (error) {
		if (uobj)
			uobj->pgops->pgo_detach(uobj);
		return error;
	}

	/*
	 * POSIX 1003.1b -- if our address space was configured
	 * to lock all future mappings, wire the one we just made.
	 *
	 * Also handle the MAP_WIRED flag here.
	 */

	if (prot == VM_PROT_NONE) {

		/*
		 * No more work to do in this case.
		 */

		return (0);
	}
	vm_map_lock(map);
	if ((flags & MAP_WIRED) != 0 || (map->flags & VM_MAP_WIREFUTURE) != 0) {
		if (atop(size) + uvmexp.wired > uvmexp.wiredmax ||
		    (locklimit != 0 &&
		     size + ptoa(pmap_wired_count(vm_map_pmap(map))) >
		     locklimit)) {
			vm_map_unlock(map);
			uvm_unmap(map, *addr, *addr + size);
			return ENOMEM;
		}

		/*
		 * uvm_map_pageable() always returns the map unlocked.
		 */

		error = uvm_map_pageable(map, *addr, *addr + size,
		    false, UVM_LK_ENTER);
		if (error) {
			uvm_unmap(map, *addr, *addr + size);
			return error;
		}
		return (0);
	}
	vm_map_unlock(map);
	return 0;
}

vaddr_t
uvm_default_mapaddr(struct proc *p, vaddr_t base, vsize_t sz)
{

	return VM_DEFAULT_ADDRESS(base, sz);
}