1 /* $NetBSD: uvm_mmap.c,v 1.41 2000/05/23 02:19:20 enami Exp $ */
2
3 /*
4 * Copyright (c) 1997 Charles D. Cranor and Washington University.
5 * Copyright (c) 1991, 1993 The Regents of the University of California.
6 * Copyright (c) 1988 University of Utah.
7 *
8 * All rights reserved.
9 *
10 * This code is derived from software contributed to Berkeley by
11 * the Systems Programming Group of the University of Utah Computer
12 * Science Department.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * 1. Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in the
21 * documentation and/or other materials provided with the distribution.
22 * 3. All advertising materials mentioning features or use of this software
23 * must display the following acknowledgement:
24 * This product includes software developed by the Charles D. Cranor,
25 * Washington University, University of California, Berkeley and
26 * its contributors.
27 * 4. Neither the name of the University nor the names of its contributors
28 * may be used to endorse or promote products derived from this software
29 * without specific prior written permission.
30 *
31 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41 * SUCH DAMAGE.
42 *
43 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
44 * @(#)vm_mmap.c 8.5 (Berkeley) 5/19/94
45 * from: Id: uvm_mmap.c,v 1.1.2.14 1998/01/05 21:04:26 chuck Exp
46 */
47
48 /*
49 * uvm_mmap.c: system call interface into VM system, plus kernel vm_mmap
50 * function.
51 */
52 #include <sys/param.h>
53 #include <sys/systm.h>
54 #include <sys/file.h>
55 #include <sys/filedesc.h>
56 #include <sys/resourcevar.h>
57 #include <sys/mman.h>
58 #include <sys/mount.h>
59 #include <sys/proc.h>
60 #include <sys/malloc.h>
61 #include <sys/vnode.h>
62 #include <sys/conf.h>
63 #include <sys/stat.h>
64
65 #include <miscfs/specfs/specdev.h>
66
67 #include <vm/vm.h>
68 #include <vm/vm_page.h>
69 #include <vm/vm_kern.h>
70
71 #include <sys/syscallargs.h>
72
73 #include <uvm/uvm.h>
74 #include <uvm/uvm_device.h>
75 #include <uvm/uvm_vnode.h>
76
77
78 /*
79 * unimplemented VM system calls:
80 */
81
82 /*
83 * sys_sbrk: sbrk system call.
84 */
85
86 /* ARGSUSED */
87 int
88 sys_sbrk(p, v, retval)
89 struct proc *p;
90 void *v;
91 register_t *retval;
92 {
93 #if 0
94 struct sys_sbrk_args /* {
95 syscallarg(intptr_t) incr;
96 } */ *uap = v;
97 #endif
98
99 return (ENOSYS);
100 }
101
102 /*
103 * sys_sstk: sstk system call.
104 */
105
106 /* ARGSUSED */
107 int
108 sys_sstk(p, v, retval)
109 struct proc *p;
110 void *v;
111 register_t *retval;
112 {
113 #if 0
114 struct sys_sstk_args /* {
115 syscallarg(int) incr;
116 } */ *uap = v;
117 #endif
118
119 return (ENOSYS);
120 }
121
122 /*
123 * sys_mincore: determine if pages are in core or not.
124 */
125
126 /* ARGSUSED */
127 int
128 sys_mincore(p, v, retval)
129 struct proc *p;
130 void *v;
131 register_t *retval;
132 {
133 struct sys_mincore_args /* {
134 syscallarg(void *) addr;
135 syscallarg(size_t) len;
136 syscallarg(char *) vec;
137 } */ *uap = v;
138 vm_page_t m;
139 char *vec, pgi;
140 struct uvm_object *uobj;
141 struct vm_amap *amap;
142 struct vm_anon *anon;
143 vm_map_entry_t entry;
144 vaddr_t start, end, lim;
145 vm_map_t map;
146 vsize_t len;
147 int error = 0, npgs;
148
149 map = &p->p_vmspace->vm_map;
150
151 start = (vaddr_t)SCARG(uap, addr);
152 len = SCARG(uap, len);
153 vec = SCARG(uap, vec);
154
155 if (start & PAGE_MASK)
156 return (EINVAL);
157 len = round_page(len);
158 end = start + len;
159 if (end <= start)
160 return (EINVAL);
161
162 npgs = len >> PAGE_SHIFT;
163
164 if (uvm_useracc(vec, npgs, B_WRITE) == FALSE)
165 return (EFAULT);
166
167 /*
168 * Lock down vec, so our returned status isn't outdated by
169 * storing the status byte for a page.
170 */
171 uvm_vslock(p, vec, npgs, VM_PROT_WRITE);
172
173 vm_map_lock_read(map);
174
175 if (uvm_map_lookup_entry(map, start, &entry) == FALSE) {
176 error = ENOMEM;
177 goto out;
178 }
179
180 for (/* nothing */;
181 entry != &map->header && entry->start < end;
182 entry = entry->next) {
183 #ifdef DIAGNOSTIC
184 if (UVM_ET_ISSUBMAP(entry))
185 panic("mincore: user map has submap");
186 if (start < entry->start)
187 panic("mincore: hole");
188 #endif
189 /* Make sure there are no holes. */
190 if (entry->end < end &&
191 (entry->next == &map->header ||
192 entry->next->start > entry->end)) {
193 error = ENOMEM;
194 goto out;
195 }
196
197 lim = end < entry->end ? end : entry->end;
198
199 /*
200 * Special case for objects with no "real" pages. Those
201 * are always considered resident (mapped devices).
202 */
203 if (UVM_ET_ISOBJ(entry)) {
204 #ifdef DIAGNOSTIC
205 if (UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj))
206 panic("mincore: user map has kernel object");
207 #endif
208 if (entry->object.uvm_obj->pgops->pgo_releasepg
209 == NULL) {
210 for (/* nothing */; start < lim;
211 start += PAGE_SIZE, vec++)
212 subyte(vec, 1);
213 continue;
214 }
215 }
216
217 amap = entry->aref.ar_amap; /* top layer */
218 uobj = entry->object.uvm_obj; /* bottom layer */
219
220 if (amap != NULL)
221 amap_lock(amap);
222 if (uobj != NULL)
223 simple_lock(&uobj->vmobjlock);
224
225 for (/* nothing */; start < lim; start += PAGE_SIZE, vec++) {
226 pgi = 0;
227 if (amap != NULL) {
228 /* Check the top layer first. */
229 anon = amap_lookup(&entry->aref,
230 start - entry->start);
231 /* Don't need to lock anon here. */
232 if (anon != NULL && anon->u.an_page != NULL) {
233 /*
234 * Anon has the page for this entry
235 * offset.
236 */
237 pgi = 1;
238 }
239 }
240
241 if (uobj != NULL && pgi == 0) {
242 /* Check the bottom layer. */
243 m = uvm_pagelookup(uobj,
244 entry->offset + (start - entry->start));
245 if (m != NULL) {
246 /*
247 * Object has the page for this entry
248 * offset.
249 */
250 pgi = 1;
251 }
252 }
253
254 (void) subyte(vec, pgi);
255 }
256
257 if (uobj != NULL)
258 simple_unlock(&uobj->vmobjlock);
259 if (amap != NULL)
260 amap_unlock(amap);
261 }
262
263 out:
264 vm_map_unlock_read(map);
265 uvm_vsunlock(p, SCARG(uap, vec), npgs);
266 return (error);
267 }
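/*
 * Usage sketch (illustrative only): a userland caller might query
 * residency of an existing page-aligned mapping like this.  "buf" and
 * "len" are assumed names for the mapping and its size; needs
 * <sys/mman.h>, <stdlib.h>, <unistd.h> and <err.h>.
 *
 *	size_t pgsz = (size_t)sysconf(_SC_PAGESIZE);
 *	size_t npg = (len + pgsz - 1) / pgsz;
 *	char *vec = malloc(npg);
 *
 *	if (vec == NULL)
 *		err(1, "malloc");
 *	if (mincore(buf, len, vec) == -1)
 *		err(1, "mincore");
 *
 * On success a nonzero vec[i] means page i of the range was resident
 * at the time of the call.
 */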
268
269 #if 0
270 /*
271 * munmapfd: unmap file descriptor
272 *
273 * XXX: is this actually a useful function? could it be useful?
274 */
275
276 void
277 munmapfd(p, fd)
278 struct proc *p;
279 int fd;
280 {
281
282 /*
283 * XXX should vm_deallocate any regions mapped to this file
284 */
285 p->p_fd->fd_ofileflags[fd] &= ~UF_MAPPED;
286 }
287 #endif
288
289 /*
290 * sys_mmap: mmap system call.
291 *
292 * => file offset and address may not be page aligned
293 * - if MAP_FIXED, offset and address must have the same remainder mod PAGE_SIZE
294 * - if address isn't page aligned the mapping starts at trunc_page(addr)
295 * and the return value is adjusted up by the page offset.
296 */
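/*
 * Worked example (illustrative only): with a 4096-byte PAGE_SIZE, a
 * non-fixed mmap(0, 100, ..., fd, 4196) gives pageoff = 100, so pos is
 * trimmed to 4096, size becomes 100 + 100 = 200 and rounds up to 4096,
 * and the caller gets back the chosen address plus 100, i.e. the byte
 * at file offset 4196.
 */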
297
298 int
299 sys_mmap(p, v, retval)
300 struct proc *p;
301 void *v;
302 register_t *retval;
303 {
304 struct sys_mmap_args /* {
305 syscallarg(caddr_t) addr;
306 syscallarg(size_t) len;
307 syscallarg(int) prot;
308 syscallarg(int) flags;
309 syscallarg(int) fd;
310 syscallarg(long) pad;
311 syscallarg(off_t) pos;
312 } */ *uap = v;
313 vaddr_t addr;
314 struct vattr va;
315 off_t pos;
316 vsize_t size, pageoff;
317 vm_prot_t prot, maxprot;
318 int flags, fd;
319 vaddr_t vm_min_address = VM_MIN_ADDRESS;
320 struct filedesc *fdp = p->p_fd;
321 struct file *fp;
322 struct vnode *vp;
323 caddr_t handle;
324 int error;
325
326 /*
327 * first, extract syscall args from the uap.
328 */
329
330 addr = (vaddr_t) SCARG(uap, addr);
331 size = (vsize_t) SCARG(uap, len);
332 prot = SCARG(uap, prot) & VM_PROT_ALL;
333 flags = SCARG(uap, flags);
334 fd = SCARG(uap, fd);
335 pos = SCARG(uap, pos);
336
337 /*
338 * Fixup the old deprecated MAP_COPY into MAP_PRIVATE, and
339 * validate the flags.
340 */
341 if (flags & MAP_COPY)
342 flags = (flags & ~MAP_COPY) | MAP_PRIVATE;
343 if ((flags & (MAP_SHARED|MAP_PRIVATE)) == (MAP_SHARED|MAP_PRIVATE))
344 return (EINVAL);
345
346 /*
347 * align file position and save offset. adjust size.
348 */
349
350 pageoff = (pos & PAGE_MASK);
351 pos -= pageoff;
352 size += pageoff; /* add offset */
353 size = (vsize_t) round_page(size); /* round up */
354 if ((ssize_t) size < 0)
355 return (EINVAL); /* don't allow wrap */
356
357 /*
358 * now check (MAP_FIXED) or get (!MAP_FIXED) the "addr"
359 */
360
361 if (flags & MAP_FIXED) {
362
363 /* ensure address and file offset are aligned properly */
364 addr -= pageoff;
365 if (addr & PAGE_MASK)
366 return (EINVAL);
367
368 if (VM_MAXUSER_ADDRESS > 0 &&
369 (addr + size) > VM_MAXUSER_ADDRESS)
370 return (EINVAL);
371 if (vm_min_address > 0 && addr < vm_min_address)
372 return (EINVAL);
373 if (addr > addr + size)
374 return (EINVAL); /* no wrapping! */
375
376 } else {
377
378 /*
379 * not fixed: make sure we skip over the largest possible heap.
380 * we will refine our guess later (e.g. to account for VAC, etc)
381 */
382 if (addr < round_page((vaddr_t)p->p_vmspace->vm_daddr+MAXDSIZ))
383 addr = round_page((vaddr_t)p->p_vmspace->vm_daddr +
384 MAXDSIZ);
385 }
386
387 /*
388 * check for file mappings (i.e. not anonymous) and verify file.
389 */
390
391 if ((flags & MAP_ANON) == 0) {
392
393 if (fd < 0 || fd >= fdp->fd_nfiles)
394 return(EBADF); /* failed range check? */
395 fp = fdp->fd_ofiles[fd]; /* convert to file pointer */
396 if (fp == NULL)
397 return(EBADF);
398
399 if (fp->f_type != DTYPE_VNODE)
400 return (ENODEV); /* only mmap vnodes! */
401 vp = (struct vnode *)fp->f_data; /* convert to vnode */
402
403 if (vp->v_type != VREG && vp->v_type != VCHR &&
404 vp->v_type != VBLK)
405 return (ENODEV); /* only REG/CHR/BLK support mmap */
406
407 if (vp->v_type == VREG && (pos + size) < pos)
408 return (EOVERFLOW); /* no offset wrapping */
409
410 /* special case: catch SunOS style /dev/zero */
411 if (vp->v_type == VCHR && iszerodev(vp->v_rdev)) {
412 flags |= MAP_ANON;
413 goto is_anon;
414 }
415
416 /*
417 * Old programs may not select a specific sharing type, so
418 * default to an appropriate one.
419 *
420 * XXX: how does MAP_ANON fit in the picture?
421 */
422 if ((flags & (MAP_SHARED|MAP_PRIVATE)) == 0) {
423 #if defined(DEBUG)
424 printf("WARNING: defaulted mmap() share type to "
425 "%s (pid %d comm %s)\n", vp->v_type == VCHR ?
426 "MAP_SHARED" : "MAP_PRIVATE", p->p_pid,
427 p->p_comm);
428 #endif
429 if (vp->v_type == VCHR)
430 flags |= MAP_SHARED; /* for a device */
431 else
432 flags |= MAP_PRIVATE; /* for a file */
433 }
434
435 /*
436 * MAP_PRIVATE device mappings don't make sense (and aren't
437 * supported anyway). However, some programs rely on this,
438 * so just change it to MAP_SHARED.
439 */
440 if (vp->v_type == VCHR && (flags & MAP_PRIVATE) != 0) {
441 #if defined(DIAGNOSTIC)
442 printf("WARNING: converted MAP_PRIVATE device mapping "
443 "to MAP_SHARED (pid %d comm %s)\n", p->p_pid,
444 p->p_comm);
445 #endif
446 flags = (flags & ~MAP_PRIVATE) | MAP_SHARED;
447 }
448
449 /*
450 * now check protection
451 */
452
453 maxprot = VM_PROT_EXECUTE;
454
455 /* check read access */
456 if (fp->f_flag & FREAD)
457 maxprot |= VM_PROT_READ;
458 else if (prot & PROT_READ)
459 return (EACCES);
460
461 /* check write access, shared case first */
462 if (flags & MAP_SHARED) {
463 /*
464 * if the file is writable, only add PROT_WRITE to
465 * maxprot if the file is not immutable or append-only.
466 * otherwise, if we have asked for PROT_WRITE, return
467 * EPERM.
468 */
469 if (fp->f_flag & FWRITE) {
470 if ((error =
471 VOP_GETATTR(vp, &va, p->p_ucred, p)))
472 return (error);
473 if ((va.va_flags & (IMMUTABLE|APPEND)) == 0)
474 maxprot |= VM_PROT_WRITE;
475 else if (prot & PROT_WRITE)
476 return (EPERM);
477 }
478 else if (prot & PROT_WRITE)
479 return (EACCES);
480 } else {
481 /* MAP_PRIVATE mappings can always be written to */
482 maxprot |= VM_PROT_WRITE;
483 }
484
485 /*
486 * set handle to vnode
487 */
488
489 handle = (caddr_t)vp;
490
491 } else { /* MAP_ANON case */
492 /*
493 * XXX What do we do about (MAP_SHARED|MAP_PRIVATE) == 0?
494 */
495 if (fd != -1)
496 return (EINVAL);
497
498 is_anon: /* label for SunOS style /dev/zero */
499 handle = NULL;
500 maxprot = VM_PROT_ALL;
501 pos = 0;
502 }
503
504 /*
505 * XXX (in)sanity check. We don't do proper datasize checking
506 * XXX for anonymous (or private writable) mmap(). However,
507 * XXX we know that if we're trying to allocate more than the amount
508 * XXX remaining under our current data size limit, _that_ should
509 * XXX be disallowed.
510 */
511 if ((flags & MAP_ANON) != 0 ||
512 ((flags & MAP_PRIVATE) != 0 && (prot & PROT_WRITE) != 0)) {
513 if (size >
514 (p->p_rlimit[RLIMIT_DATA].rlim_cur - ctob(p->p_vmspace->vm_dsize))) {
515 return (ENOMEM);
516 }
517 }
518
519 /*
520 * now let kernel internal function uvm_mmap do the work.
521 */
522
523 error = uvm_mmap(&p->p_vmspace->vm_map, &addr, size, prot, maxprot,
524 flags, handle, pos, p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
525
526 if (error == 0)
527 /* remember to add offset */
528 *retval = (register_t)(addr + pageoff);
529
530 return (error);
531 }
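/*
 * Usage sketch (illustrative only): a typical private, read-only file
 * mapping followed by an unmap.  "fd" is assumed to be an open
 * descriptor for a regular file at least "len" bytes long; needs
 * <sys/mman.h> and <err.h>.
 *
 *	void *p = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, (off_t)0);
 *	if (p == MAP_FAILED)
 *		err(1, "mmap");
 *	... read through the mapping ...
 *	if (munmap(p, len) == -1)
 *		err(1, "munmap");
 */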
532
533 /*
534 * sys___msync13: the msync system call (a front-end for flush)
535 */
536
537 int
538 sys___msync13(p, v, retval)
539 struct proc *p;
540 void *v;
541 register_t *retval;
542 {
543 struct sys___msync13_args /* {
544 syscallarg(caddr_t) addr;
545 syscallarg(size_t) len;
546 syscallarg(int) flags;
547 } */ *uap = v;
548 vaddr_t addr;
549 vsize_t size, pageoff;
550 vm_map_t map;
551 int rv, flags, uvmflags;
552
553 /*
554 * extract syscall args from the uap
555 */
556
557 addr = (vaddr_t)SCARG(uap, addr);
558 size = (vsize_t)SCARG(uap, len);
559 flags = SCARG(uap, flags);
560
561 /* sanity check flags */
562 if ((flags & ~(MS_ASYNC | MS_SYNC | MS_INVALIDATE)) != 0 ||
563 (flags & (MS_ASYNC | MS_SYNC | MS_INVALIDATE)) == 0 ||
564 (flags & (MS_ASYNC | MS_SYNC)) == (MS_ASYNC | MS_SYNC))
565 return (EINVAL);
566 if ((flags & (MS_ASYNC | MS_SYNC)) == 0)
567 flags |= MS_SYNC;
568
569 /*
570 * align the address to a page boundary, and adjust the size accordingly
571 */
572
573 pageoff = (addr & PAGE_MASK);
574 addr -= pageoff;
575 size += pageoff;
576 size = (vsize_t) round_page(size);
577
578 /* disallow wrap-around. */
579 if (addr + size < addr)
580 return (EINVAL);
581
582 /*
583 * get map
584 */
585
586 map = &p->p_vmspace->vm_map;
587
588 /*
589 * XXXCDC: do we really need this semantic?
590 *
591 * XXX Gak! If size is zero we are supposed to sync "all modified
592 * pages with the region containing addr". Unfortunately, we
593 * don't really keep track of individual mmaps so we approximate
594 * by flushing the range of the map entry containing addr.
595 * This can be incorrect if the region splits or is coalesced
596 * with a neighbor.
597 */
598 if (size == 0) {
599 vm_map_entry_t entry;
600
601 vm_map_lock_read(map);
602 rv = uvm_map_lookup_entry(map, addr, &entry);
603 if (rv == TRUE) {
604 addr = entry->start;
605 size = entry->end - entry->start;
606 }
607 vm_map_unlock_read(map);
608 if (rv == FALSE)
609 return (EINVAL);
610 }
611
612 /*
613 * translate MS_ flags into PGO_ flags
614 */
615 uvmflags = PGO_CLEANIT;
616 if (flags & MS_INVALIDATE)
617 uvmflags |= PGO_FREE;
618 if (flags & MS_SYNC)
619 uvmflags |= PGO_SYNCIO;
620 else
621 uvmflags |= PGO_SYNCIO; /* XXXCDC: force sync for now! */
622
623 /*
624 * doit!
625 */
626 rv = uvm_map_clean(map, addr, addr+size, uvmflags);
627
628 /*
629 * and return...
630 */
631 switch (rv) {
632 case KERN_SUCCESS:
633 return(0);
634 case KERN_INVALID_ADDRESS:
635 return (ENOMEM);
636 case KERN_FAILURE:
637 return (EIO);
638 case KERN_PAGES_LOCKED: /* XXXCDC: uvm doesn't return this */
639 return (EBUSY);
640 default:
641 return (EINVAL);
642 }
643 /*NOTREACHED*/
644 }
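/*
 * Usage sketch (illustrative only): synchronously flushing a modified
 * MAP_SHARED file mapping back to its vnode.  "p" and "len" are assumed
 * to describe such a mapping; needs <sys/mman.h> and <err.h>.
 *
 *	if (msync(p, len, MS_SYNC) == -1)
 *		err(1, "msync");
 *
 * Adding MS_INVALIDATE also asks the kernel to free the cached pages
 * after the flush (PGO_FREE above).
 */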
645
646 /*
647 * sys_munmap: unmap a user's memory
648 */
649
650 int
651 sys_munmap(p, v, retval)
652 struct proc *p;
653 void *v;
654 register_t *retval;
655 {
656 struct sys_munmap_args /* {
657 syscallarg(caddr_t) addr;
658 syscallarg(size_t) len;
659 } */ *uap = v;
660 vaddr_t addr;
661 vsize_t size, pageoff;
662 vm_map_t map;
663 vaddr_t vm_min_address = VM_MIN_ADDRESS;
664 struct vm_map_entry *dead_entries;
665
666 /*
667 * get syscall args...
668 */
669
670 addr = (vaddr_t) SCARG(uap, addr);
671 size = (vsize_t) SCARG(uap, len);
672
673 /*
674 * align the address to a page boundary, and adjust the size accordingly
675 */
676
677 pageoff = (addr & PAGE_MASK);
678 addr -= pageoff;
679 size += pageoff;
680 size = (vsize_t) round_page(size);
681
682 if ((int)size < 0)
683 return (EINVAL);
684 if (size == 0)
685 return (0);
686
687 /*
688 * Check for illegal addresses. Watch out for address wrap...
689 * Note that VM_*_ADDRESS are not constants due to casts (argh).
690 */
691 if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS)
692 return (EINVAL);
693 if (vm_min_address > 0 && addr < vm_min_address)
694 return (EINVAL);
695 if (addr > addr + size)
696 return (EINVAL);
697 map = &p->p_vmspace->vm_map;
698
699
700 vm_map_lock(map); /* lock map so we can checkprot */
701
702 /*
703 * interesting system call semantic: make sure entire range is
704 * allocated before allowing an unmap.
705 */
706
707 if (!uvm_map_checkprot(map, addr, addr + size, VM_PROT_NONE)) {
708 vm_map_unlock(map);
709 return (EINVAL);
710 }
711
712 /*
713 * doit!
714 */
715 (void) uvm_unmap_remove(map, addr, addr + size, &dead_entries);
716
717 vm_map_unlock(map); /* and unlock */
718
719 if (dead_entries != NULL)
720 uvm_unmap_detach(dead_entries, 0);
721
722 return (0);
723 }
724
725 /*
726 * sys_mprotect: the mprotect system call
727 */
728
729 int
730 sys_mprotect(p, v, retval)
731 struct proc *p;
732 void *v;
733 register_t *retval;
734 {
735 struct sys_mprotect_args /* {
736 syscallarg(caddr_t) addr;
737 syscallarg(int) len;
738 syscallarg(int) prot;
739 } */ *uap = v;
740 vaddr_t addr;
741 vsize_t size, pageoff;
742 vm_prot_t prot;
743 int rv;
744
745 /*
746 * extract syscall args from uap
747 */
748
749 addr = (vaddr_t)SCARG(uap, addr);
750 size = (vsize_t)SCARG(uap, len);
751 prot = SCARG(uap, prot) & VM_PROT_ALL;
752
753 /*
754 * align the address to a page boundary, and adjust the size accordingly
755 */
756 pageoff = (addr & PAGE_MASK);
757 addr -= pageoff;
758 size += pageoff;
759 size = (vsize_t) round_page(size);
760 if ((int)size < 0)
761 return (EINVAL);
762
763 /*
764 * doit
765 */
766
767 rv = uvm_map_protect(&p->p_vmspace->vm_map,
768 addr, addr+size, prot, FALSE);
769
770 if (rv == KERN_SUCCESS)
771 return (0);
772 if (rv == KERN_PROTECTION_FAILURE)
773 return (EACCES);
774 return (EINVAL);
775 }
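/*
 * Usage sketch (illustrative only): revoking write access to one page
 * of an existing mapping.  "p" is assumed page aligned and "pgsz" to
 * hold the page size; needs <sys/mman.h> and <err.h>.
 *
 *	if (mprotect(p, pgsz, PROT_READ) == -1)
 *		err(1, "mprotect");
 *
 * EACCES here corresponds to KERN_PROTECTION_FAILURE above, i.e.
 * asking for permissions beyond the mapping's maximum protection;
 * dropping permissions does not trigger it.
 */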
776
777 /*
778 * sys_minherit: the minherit system call
779 */
780
781 int
782 sys_minherit(p, v, retval)
783 struct proc *p;
784 void *v;
785 register_t *retval;
786 {
787 struct sys_minherit_args /* {
788 syscallarg(caddr_t) addr;
789 syscallarg(int) len;
790 syscallarg(int) inherit;
791 } */ *uap = v;
792 vaddr_t addr;
793 vsize_t size, pageoff;
794 vm_inherit_t inherit;
795
796 addr = (vaddr_t)SCARG(uap, addr);
797 size = (vsize_t)SCARG(uap, len);
798 inherit = SCARG(uap, inherit);
799 /*
800 * align the address to a page boundary, and adjust the size accordingly
801 */
802
803 pageoff = (addr & PAGE_MASK);
804 addr -= pageoff;
805 size += pageoff;
806 size = (vsize_t) round_page(size);
807
808 if ((int)size < 0)
809 return (EINVAL);
810
811 switch (uvm_map_inherit(&p->p_vmspace->vm_map, addr, addr+size,
812 inherit)) {
813 case KERN_SUCCESS:
814 return (0);
815 case KERN_PROTECTION_FAILURE:
816 return (EACCES);
817 }
818 return (EINVAL);
819 }
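/*
 * Usage sketch (illustrative only): keeping a scratch region out of
 * future children.  MAP_INHERIT_NONE is assumed to be the <sys/mman.h>
 * spelling of the "none" inheritance value; check the installed
 * headers.
 *
 *	if (minherit(p, len, MAP_INHERIT_NONE) == -1)
 *		err(1, "minherit");
 *
 * A child created by fork() afterwards sees that range unmapped.
 */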
820
821 /*
822 * sys_madvise: give advice about memory usage.
823 */
824
825 /* ARGSUSED */
826 int
827 sys_madvise(p, v, retval)
828 struct proc *p;
829 void *v;
830 register_t *retval;
831 {
832 struct sys_madvise_args /* {
833 syscallarg(caddr_t) addr;
834 syscallarg(size_t) len;
835 syscallarg(int) behav;
836 } */ *uap = v;
837 vaddr_t addr;
838 vsize_t size, pageoff;
839 int advice, rv;
840
841 addr = (vaddr_t)SCARG(uap, addr);
842 size = (vsize_t)SCARG(uap, len);
843 advice = SCARG(uap, behav);
844
845 /*
846 * align the address to a page boundary, and adjust the size accordingly
847 */
848 pageoff = (addr & PAGE_MASK);
849 addr -= pageoff;
850 size += pageoff;
851 size = (vsize_t) round_page(size);
852
853 if ((ssize_t)size <= 0)
854 return (EINVAL);
855
856 switch (advice) {
857 case MADV_NORMAL:
858 case MADV_RANDOM:
859 case MADV_SEQUENTIAL:
860 rv = uvm_map_advice(&p->p_vmspace->vm_map, addr, addr + size,
861 advice);
862 break;
863
864 case MADV_WILLNEED:
865 /*
866 * Activate all these pages, pre-faulting them in if
867 * necessary.
868 */
869 /*
870 * XXX IMPLEMENT ME.
871 * Should invent a "weak" mode for uvm_fault()
872 * which would only do the PGO_LOCKED pgo_get().
873 */
874 return (0);
875
876 case MADV_DONTNEED:
877 /*
878 * Deactivate all these pages. We don't need them
879 * any more. We don't, however, toss the data in
880 * the pages.
881 */
882 rv = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
883 PGO_DEACTIVATE);
884 break;
885
886 case MADV_FREE:
887 /*
888 * These pages contain no valid data, and may be
889 * garbage-collected. Toss all resources, including
890 * any swap space in use.
891 */
892 rv = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
893 PGO_FREE);
894 break;
895
896 case MADV_SPACEAVAIL:
897 /*
898 * XXXMRG What is this? I think it's:
899 *
900 * Ensure that we have allocated backing-store
901 * for these pages.
902 *
903 * This is going to require changes to the page daemon,
904 * as it will free swap space allocated to pages in core.
905 * There's also what to do for device/file/anonymous memory.
906 */
907 return (EINVAL);
908
909 default:
910 return (EINVAL);
911 }
912
913 switch (rv) {
914 case KERN_SUCCESS:
915 return (0);
916 case KERN_NO_SPACE:
917 return (EAGAIN);
918 case KERN_INVALID_ADDRESS:
919 return (ENOMEM);
920 case KERN_FAILURE:
921 return (EIO);
922 }
923
924 return (EINVAL);
925 }
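/*
 * Usage sketch (illustrative only): advising sequential access before
 * streaming once through a large mapping, then dropping the pages.
 * "p" and "len" are assumed to describe an existing mapping; needs
 * <sys/mman.h>.
 *
 *	(void)madvise(p, len, MADV_SEQUENTIAL);
 *	... read through the mapping once ...
 *	(void)madvise(p, len, MADV_DONTNEED);
 *
 * As noted above, MADV_DONTNEED only deactivates the pages; their
 * contents are kept (MADV_FREE is the destructive variant).
 */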
926
927 /*
928 * sys_mlock: memory lock
929 */
930
931 int
932 sys_mlock(p, v, retval)
933 struct proc *p;
934 void *v;
935 register_t *retval;
936 {
937 struct sys_mlock_args /* {
938 syscallarg(const void *) addr;
939 syscallarg(size_t) len;
940 } */ *uap = v;
941 vaddr_t addr;
942 vsize_t size, pageoff;
943 int error;
944
945 /*
946 * extract syscall args from uap
947 */
948 addr = (vaddr_t)SCARG(uap, addr);
949 size = (vsize_t)SCARG(uap, len);
950
951 /*
952 * align the address to a page boundary and adjust the size accordingly
953 */
954 pageoff = (addr & PAGE_MASK);
955 addr -= pageoff;
956 size += pageoff;
957 size = (vsize_t) round_page(size);
958
959 /* disallow wrap-around. */
960 if (addr + (int)size < addr)
961 return (EINVAL);
962
963 if (atop(size) + uvmexp.wired > uvmexp.wiredmax)
964 return (EAGAIN);
965
966 #ifdef pmap_wired_count
967 if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) >
968 p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur)
969 return (EAGAIN);
970 #else
971 if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
972 return (error);
973 #endif
974
975 error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, FALSE,
976 0);
977 return (error == KERN_SUCCESS ? 0 : ENOMEM);
978 }
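/*
 * Usage sketch (illustrative only): wiring a buffer that must stay
 * resident (e.g. one holding key material) and unwiring it afterwards,
 * subject to the RLIMIT_MEMLOCK / superuser checks above.  "buf" and
 * "len" are assumed names; needs <sys/mman.h> and <err.h>.
 *
 *	if (mlock(buf, len) == -1)
 *		err(1, "mlock");
 *	... buf is now guaranteed resident ...
 *	if (munlock(buf, len) == -1)
 *		err(1, "munlock");
 */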
979
980 /*
981 * sys_munlock: unlock wired pages
982 */
983
984 int
985 sys_munlock(p, v, retval)
986 struct proc *p;
987 void *v;
988 register_t *retval;
989 {
990 struct sys_munlock_args /* {
991 syscallarg(const void *) addr;
992 syscallarg(size_t) len;
993 } */ *uap = v;
994 vaddr_t addr;
995 vsize_t size, pageoff;
996 int error;
997
998 /*
999 * extract syscall args from uap
1000 */
1001
1002 addr = (vaddr_t)SCARG(uap, addr);
1003 size = (vsize_t)SCARG(uap, len);
1004
1005 /*
1006 * align the address to a page boundary, and adjust the size accordingly
1007 */
1008 pageoff = (addr & PAGE_MASK);
1009 addr -= pageoff;
1010 size += pageoff;
1011 size = (vsize_t) round_page(size);
1012
1013 /* disallow wrap-around. */
1014 if (addr + (int)size < addr)
1015 return (EINVAL);
1016
1017 #ifndef pmap_wired_count
1018 if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
1019 return (error);
1020 #endif
1021
1022 error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, TRUE,
1023 0);
1024 return (error == KERN_SUCCESS ? 0 : ENOMEM);
1025 }
1026
1027 /*
1028 * sys_mlockall: lock all pages mapped into an address space.
1029 */
1030
1031 int
1032 sys_mlockall(p, v, retval)
1033 struct proc *p;
1034 void *v;
1035 register_t *retval;
1036 {
1037 struct sys_mlockall_args /* {
1038 syscallarg(int) flags;
1039 } */ *uap = v;
1040 int error, flags;
1041
1042 flags = SCARG(uap, flags);
1043
1044 if (flags == 0 ||
1045 (flags & ~(MCL_CURRENT|MCL_FUTURE)) != 0)
1046 return (EINVAL);
1047
1048 #ifndef pmap_wired_count
1049 if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
1050 return (error);
1051 #endif
1052
1053 error = uvm_map_pageable_all(&p->p_vmspace->vm_map, flags,
1054 p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
1055 switch (error) {
1056 case KERN_SUCCESS:
1057 error = 0;
1058 break;
1059
1060 case KERN_NO_SPACE: /* XXX overloaded */
1061 error = ENOMEM;
1062 break;
1063
1064 default:
1065 /*
1066 * "Some or all of the memory could not be locked when
1067 * the call was made."
1068 */
1069 error = EAGAIN;
1070 }
1071
1072 return (error);
1073 }
1074
1075 /*
1076 * sys_munlockall: unlock all pages mapped into an address space.
1077 */
1078
1079 int
1080 sys_munlockall(p, v, retval)
1081 struct proc *p;
1082 void *v;
1083 register_t *retval;
1084 {
1085
1086 (void) uvm_map_pageable_all(&p->p_vmspace->vm_map, 0, 0);
1087 return (0);
1088 }
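/*
 * Usage sketch (illustrative only): a process locking its entire
 * address space, present and future, and releasing the locks later;
 * needs <sys/mman.h> and <err.h>.
 *
 *	if (mlockall(MCL_CURRENT | MCL_FUTURE) == -1)
 *		err(1, "mlockall");
 *	...
 *	(void)munlockall();
 */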
1089
1090 /*
1091 * uvm_mmap: internal version of mmap
1092 *
1093 * - used by sys_mmap, exec, and sysv shm
1094 * - handle is a vnode pointer or NULL for MAP_ANON (XXX: not true,
1095 * sysv shm uses "named anonymous memory")
1096 * - caller must page-align the file offset
1097 */
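/*
 * Call sketch (illustrative only): an anonymous, private user mapping
 * is requested the same way sys_mmap() above does it, with a NULL
 * handle and a zero offset:
 *
 *	error = uvm_mmap(&p->p_vmspace->vm_map, &addr, size,
 *	    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_ALL,
 *	    MAP_ANON | MAP_PRIVATE, NULL, (voff_t)0,
 *	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
 */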
1098
1099 int
1100 uvm_mmap(map, addr, size, prot, maxprot, flags, handle, foff, locklimit)
1101 vm_map_t map;
1102 vaddr_t *addr;
1103 vsize_t size;
1104 vm_prot_t prot, maxprot;
1105 int flags;
1106 caddr_t handle; /* XXX: VNODE? */
1107 voff_t foff;
1108 vsize_t locklimit;
1109 {
1110 struct uvm_object *uobj;
1111 struct vnode *vp;
1112 int retval;
1113 int advice = UVM_ADV_NORMAL;
1114 uvm_flag_t uvmflag = 0;
1115
1116 /*
1117 * check params
1118 */
1119
1120 if (size == 0)
1121 return(0);
1122 if (foff & PAGE_MASK)
1123 return(EINVAL);
1124 if ((prot & maxprot) != prot)
1125 return(EINVAL);
1126
1127 /*
1128 * for non-fixed mappings, round off the suggested address.
1129 * for fixed mappings, check alignment and zap old mappings.
1130 */
1131
1132 if ((flags & MAP_FIXED) == 0) {
1133 *addr = round_page(*addr); /* round */
1134 } else {
1135
1136 if (*addr & PAGE_MASK)
1137 return(EINVAL);
1138 uvmflag |= UVM_FLAG_FIXED;
1139 (void) uvm_unmap(map, *addr, *addr + size); /* zap! */
1140 }
1141
1142 /*
1143 * handle anon vs. non-anon mappings. for non-anon mappings attach
1144 * to underlying vm object.
1145 */
1146
1147 if (flags & MAP_ANON) {
1148 foff = UVM_UNKNOWN_OFFSET;
1149 uobj = NULL;
1150 if ((flags & MAP_SHARED) == 0)
1151 /* XXX: defer amap create */
1152 uvmflag |= UVM_FLAG_COPYONW;
1153 else
1154 /* shared: create amap now */
1155 uvmflag |= UVM_FLAG_OVERLAY;
1156
1157 } else {
1158
1159 vp = (struct vnode *) handle; /* get vnode */
1160 if (vp->v_type != VCHR) {
1161 uobj = uvn_attach((void *) vp, (flags & MAP_SHARED) ?
1162 maxprot : (maxprot & ~VM_PROT_WRITE));
1163
1164 /*
1165 * XXXCDC: hack from old code
1166 * don't allow vnodes which have been mapped
1167 * shared-writeable to persist [forces them to be
1168 * flushed out when last reference goes].
1169 * XXXCDC: interesting side effect: avoids a bug.
1170 * note that in WRITE [ufs_readwrite.c] that we
1171 * allocate buffer, uncache, and then do the write.
1172 * the problem with this is that if the uncache causes
1173 * VM data to be flushed to the same area of the file
1174 * we are writing to... in that case we've got the
1175 * buffer locked and our process goes to sleep forever.
1176 *
1177 * XXXCDC: checking maxprot protects us from the
1178 * "persistbug" program but this is not a long term
1179 * solution.
1180 *
1181 * XXXCDC: we don't bother calling uncache with the vp
1182 * VOP_LOCKed since we know that we are already
1183 * holding a valid reference to the uvn (from the
1184 * uvn_attach above), and thus it is impossible for
1185 * the uncache to kill the uvn and trigger I/O.
1186 */
1187 if (flags & MAP_SHARED) {
1188 if ((prot & VM_PROT_WRITE) ||
1189 (maxprot & VM_PROT_WRITE)) {
1190 uvm_vnp_uncache(vp);
1191 }
1192 }
1193
1194 } else {
1195 uobj = udv_attach((void *) &vp->v_rdev,
1196 (flags & MAP_SHARED) ?
1197 maxprot : (maxprot & ~VM_PROT_WRITE), foff, size);
1198 advice = UVM_ADV_RANDOM;
1199 }
1200
1201 if (uobj == NULL)
1202 return((vp->v_type == VREG) ? ENOMEM : EINVAL);
1203
1204 if ((flags & MAP_SHARED) == 0)
1205 uvmflag |= UVM_FLAG_COPYONW;
1206 }
1207
1208 /*
1209 * set up mapping flags
1210 */
1211
1212 uvmflag = UVM_MAPFLAG(prot, maxprot,
1213 (flags & MAP_SHARED) ? UVM_INH_SHARE : UVM_INH_COPY,
1214 advice, uvmflag);
1215
1216 /*
1217 * do it!
1218 */
1219
1220 retval = uvm_map(map, addr, size, uobj, foff, uvmflag);
1221
1222 if (retval == KERN_SUCCESS) {
1223 /*
1224 * POSIX 1003.1b -- if our address space was configured
1225 * to lock all future mappings, wire the one we just made.
1226 */
1227 if (prot == VM_PROT_NONE) {
1228 /*
1229 * No more work to do in this case.
1230 */
1231 return (0);
1232 }
1233
1234 vm_map_lock(map);
1235
1236 if (map->flags & VM_MAP_WIREFUTURE) {
1237 if ((atop(size) + uvmexp.wired) > uvmexp.wiredmax
1238 #ifdef pmap_wired_count
1239 || (locklimit != 0 && (size +
1240 ptoa(pmap_wired_count(vm_map_pmap(map)))) >
1241 locklimit)
1242 #endif
1243 ) {
1244 retval = KERN_RESOURCE_SHORTAGE;
1245 vm_map_unlock(map);
1246 /* unmap the region! */
1247 (void) uvm_unmap(map, *addr, *addr + size);
1248 goto bad;
1249 }
1250 /*
1251 * uvm_map_pageable() always returns the map
1252 * unlocked.
1253 */
1254 retval = uvm_map_pageable(map, *addr, *addr + size,
1255 FALSE, UVM_LK_ENTER);
1256 if (retval != KERN_SUCCESS) {
1257 /* unmap the region! */
1258 (void) uvm_unmap(map, *addr, *addr + size);
1259 goto bad;
1260 }
1261 return (0);
1262 }
1263
1264 vm_map_unlock(map);
1265
1266 return (0);
1267 }
1268
1269 /*
1270 * errors: first detach from the uobj, if any.
1271 */
1272
1273 if (uobj)
1274 uobj->pgops->pgo_detach(uobj);
1275
1276 bad:
1277 switch (retval) {
1278 case KERN_INVALID_ADDRESS:
1279 case KERN_NO_SPACE:
1280 return(ENOMEM);
1281 case KERN_RESOURCE_SHORTAGE:
1282 return (EAGAIN);
1283 case KERN_PROTECTION_FAILURE:
1284 return(EACCES);
1285 }
1286 return(EINVAL);
1287 }
1288