1 /* $NetBSD: uvm_mmap.c,v 1.49 2001/02/18 21:19:08 chs Exp $ */
2
3 /*
4 * Copyright (c) 1997 Charles D. Cranor and Washington University.
5 * Copyright (c) 1991, 1993 The Regents of the University of California.
6 * Copyright (c) 1988 University of Utah.
7 *
8 * All rights reserved.
9 *
10 * This code is derived from software contributed to Berkeley by
11 * the Systems Programming Group of the University of Utah Computer
12 * Science Department.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * 1. Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in the
21 * documentation and/or other materials provided with the distribution.
22 * 3. All advertising materials mentioning features or use of this software
23 * must display the following acknowledgement:
24 * This product includes software developed by Charles D. Cranor,
25 * Washington University, University of California, Berkeley and
26 * its contributors.
27 * 4. Neither the name of the University nor the names of its contributors
28 * may be used to endorse or promote products derived from this software
29 * without specific prior written permission.
30 *
31 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41 * SUCH DAMAGE.
42 *
43 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
44 * @(#)vm_mmap.c 8.5 (Berkeley) 5/19/94
45 * from: Id: uvm_mmap.c,v 1.1.2.14 1998/01/05 21:04:26 chuck Exp
46 */
47
48 /*
49 * uvm_mmap.c: system call interface into VM system, plus kernel vm_mmap
50 * function.
51 */
52 #include <sys/param.h>
53 #include <sys/systm.h>
54 #include <sys/file.h>
55 #include <sys/filedesc.h>
56 #include <sys/resourcevar.h>
57 #include <sys/mman.h>
58 #include <sys/mount.h>
59 #include <sys/proc.h>
60 #include <sys/malloc.h>
61 #include <sys/vnode.h>
62 #include <sys/conf.h>
63 #include <sys/stat.h>
64
65 #include <miscfs/specfs/specdev.h>
66
67 #include <sys/syscallargs.h>
68
69 #include <uvm/uvm.h>
70 #include <uvm/uvm_device.h>
71 #include <uvm/uvm_vnode.h>
72
73
74 /*
75 * unimplemented VM system calls:
76 */
77
78 /*
79 * sys_sbrk: sbrk system call.
80 */
81
82 /* ARGSUSED */
83 int
84 sys_sbrk(p, v, retval)
85 struct proc *p;
86 void *v;
87 register_t *retval;
88 {
89 #if 0
90 struct sys_sbrk_args /* {
91 syscallarg(intptr_t) incr;
92 } */ *uap = v;
93 #endif
94
95 return (ENOSYS);
96 }
97
98 /*
99 * sys_sstk: sstk system call.
100 */
101
102 /* ARGSUSED */
103 int
104 sys_sstk(p, v, retval)
105 struct proc *p;
106 void *v;
107 register_t *retval;
108 {
109 #if 0
110 struct sys_sstk_args /* {
111 syscallarg(int) incr;
112 } */ *uap = v;
113 #endif
114
115 return (ENOSYS);
116 }
117
118 /*
119 * sys_mincore: determine if pages are in core or not.
120 */
121
122 /* ARGSUSED */
123 int
124 sys_mincore(p, v, retval)
125 struct proc *p;
126 void *v;
127 register_t *retval;
128 {
129 struct sys_mincore_args /* {
130 syscallarg(void *) addr;
131 syscallarg(size_t) len;
132 syscallarg(char *) vec;
133 } */ *uap = v;
134 vm_page_t m;
135 char *vec, pgi;
136 struct uvm_object *uobj;
137 struct vm_amap *amap;
138 struct vm_anon *anon;
139 vm_map_entry_t entry;
140 vaddr_t start, end, lim;
141 vm_map_t map;
142 vsize_t len;
143 int error = 0, npgs;
144
145 map = &p->p_vmspace->vm_map;
146
147 start = (vaddr_t)SCARG(uap, addr);
148 len = SCARG(uap, len);
149 vec = SCARG(uap, vec);
150
151 if (start & PAGE_MASK)
152 return (EINVAL);
153 len = round_page(len);
154 end = start + len;
155 if (end <= start)
156 return (EINVAL);
157
158 npgs = len >> PAGE_SHIFT;
159
160 if (uvm_useracc(vec, npgs, B_WRITE) == FALSE)
161 return (EFAULT);
162
163 /*
164 * Lock down vec, so our returned status isn't outdated by
165 * storing the status byte for a page.
166 */
167 uvm_vslock(p, vec, npgs, VM_PROT_WRITE);
168
169 vm_map_lock_read(map);
170
171 if (uvm_map_lookup_entry(map, start, &entry) == FALSE) {
172 error = ENOMEM;
173 goto out;
174 }
175
176 for (/* nothing */;
177 entry != &map->header && entry->start < end;
178 entry = entry->next) {
179 KASSERT(!UVM_ET_ISSUBMAP(entry));
180 KASSERT(start >= entry->start);
181
182 /* Make sure there are no holes. */
183 if (entry->end < end &&
184 (entry->next == &map->header ||
185 entry->next->start > entry->end)) {
186 error = ENOMEM;
187 goto out;
188 }
189
190 lim = end < entry->end ? end : entry->end;
191
192 /*
193 * Special case for objects with no "real" pages. Those
194 * are always considered resident (mapped devices).
195 */
196 if (UVM_ET_ISOBJ(entry)) {
197 KASSERT(!UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj));
198 if (entry->object.uvm_obj->pgops->pgo_releasepg
199 == NULL) {
200 for (/* nothing */; start < lim;
201 start += PAGE_SIZE, vec++)
202 subyte(vec, 1);
203 continue;
204 }
205 }
206
207 amap = entry->aref.ar_amap; /* top layer */
208 uobj = entry->object.uvm_obj; /* bottom layer */
209
210 if (amap != NULL)
211 amap_lock(amap);
212 if (uobj != NULL)
213 simple_lock(&uobj->vmobjlock);
214
215 for (/* nothing */; start < lim; start += PAGE_SIZE, vec++) {
216 pgi = 0;
217 if (amap != NULL) {
218 /* Check the top layer first. */
219 anon = amap_lookup(&entry->aref,
220 start - entry->start);
221 /* Don't need to lock anon here. */
222 if (anon != NULL && anon->u.an_page != NULL) {
223 /*
224 * Anon has the page for this entry
225 * offset.
226 */
227 pgi = 1;
228 }
229 }
230
231 if (uobj != NULL && pgi == 0) {
232 /* Check the bottom layer. */
233 m = uvm_pagelookup(uobj,
234 entry->offset + (start - entry->start));
235 if (m != NULL) {
236 /*
237 * Object has the page for this entry
238 * offset.
239 */
240 pgi = 1;
241 }
242 }
243
244 (void) subyte(vec, pgi);
245 }
246
247 if (uobj != NULL)
248 simple_unlock(&uobj->vmobjlock);
249 if (amap != NULL)
250 amap_unlock(amap);
251 }
252
253 out:
254 vm_map_unlock_read(map);
255 uvm_vsunlock(p, SCARG(uap, vec), npgs);
256 return (error);
257 }
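/*
 * Illustrative sketch, kept under #if 0 so it is never compiled: a
 * hypothetical userland helper that asks mincore(2), implemented above,
 * which pages of an already-established, page-aligned mapping are
 * resident.  The kernel stores one status byte per page into "vec" with
 * subyte() and returns ENOMEM if the range contains a hole, so the caller
 * must pass a fully mapped region.
 */
#if 0
#include <sys/types.h>
#include <sys/mman.h>

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

static void
report_resident(void *base, size_t len)
{
	size_t pgsz = (size_t)sysconf(_SC_PAGESIZE);
	size_t npgs = (len + pgsz - 1) / pgsz;
	size_t i, resident = 0;
	char *vec = malloc(npgs);	/* one status byte per page */

	if (vec == NULL)
		return;
	if (mincore(base, len, vec) == 0) {
		for (i = 0; i < npgs; i++)
			if (vec[i] != 0)	/* nonzero => page in core */
				resident++;
		printf("%zu of %zu pages resident\n", resident, npgs);
	}
	free(vec);
}
#endif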
258
259 /*
260 * sys_mmap: mmap system call.
261 *
262 * => file offset and address may not be page aligned
263 * - if MAP_FIXED, offset and address must have the same remainder mod PAGE_SIZE
264 * - if address isn't page aligned the mapping starts at trunc_page(addr)
265 * and the return value is adjusted up by the page offset.
266 */
267
268 int
269 sys_mmap(p, v, retval)
270 struct proc *p;
271 void *v;
272 register_t *retval;
273 {
274 struct sys_mmap_args /* {
275 syscallarg(caddr_t) addr;
276 syscallarg(size_t) len;
277 syscallarg(int) prot;
278 syscallarg(int) flags;
279 syscallarg(int) fd;
280 syscallarg(long) pad;
281 syscallarg(off_t) pos;
282 } */ *uap = v;
283 vaddr_t addr;
284 struct vattr va;
285 off_t pos;
286 vsize_t size, pageoff;
287 vm_prot_t prot, maxprot;
288 int flags, fd;
289 vaddr_t vm_min_address = VM_MIN_ADDRESS;
290 struct filedesc *fdp = p->p_fd;
291 struct file *fp;
292 struct vnode *vp;
293 caddr_t handle;
294 int error;
295
296 /*
297 * first, extract syscall args from the uap.
298 */
299
300 addr = (vaddr_t) SCARG(uap, addr);
301 size = (vsize_t) SCARG(uap, len);
302 prot = SCARG(uap, prot) & VM_PROT_ALL;
303 flags = SCARG(uap, flags);
304 fd = SCARG(uap, fd);
305 pos = SCARG(uap, pos);
306
307 /*
308 * Fixup the old deprecated MAP_COPY into MAP_PRIVATE, and
309 * validate the flags.
310 */
311 if (flags & MAP_COPY)
312 flags = (flags & ~MAP_COPY) | MAP_PRIVATE;
313 if ((flags & (MAP_SHARED|MAP_PRIVATE)) == (MAP_SHARED|MAP_PRIVATE))
314 return (EINVAL);
315
316 /*
317 * align file position and save offset. adjust size.
318 */
319
320 pageoff = (pos & PAGE_MASK);
321 pos -= pageoff;
322 size += pageoff; /* add offset */
323 size = (vsize_t) round_page(size); /* round up */
324 if ((ssize_t) size < 0)
325 return (EINVAL); /* don't allow wrap */
326
327 /*
328 * now check (MAP_FIXED) or get (!MAP_FIXED) the "addr"
329 */
330
331 if (flags & MAP_FIXED) {
332
333 /* ensure address and file offset are aligned properly */
334 addr -= pageoff;
335 if (addr & PAGE_MASK)
336 return (EINVAL);
337
338 if (VM_MAXUSER_ADDRESS > 0 &&
339 (addr + size) > VM_MAXUSER_ADDRESS)
340 return (EINVAL);
341 if (vm_min_address > 0 && addr < vm_min_address)
342 return (EINVAL);
343 if (addr > addr + size)
344 return (EINVAL); /* no wrapping! */
345
346 } else {
347
348 /*
349 * not fixed: make sure we skip over the largest possible heap.
350 * we will refine our guess later (e.g. to account for VAC, etc)
351 */
352
353 if (addr < round_page((vaddr_t)p->p_vmspace->vm_daddr +
354 MAXDSIZ))
355 addr = round_page((vaddr_t)p->p_vmspace->vm_daddr +
356 MAXDSIZ);
357 }
358
359 /*
360 * check for file mappings (i.e. not anonymous) and verify file.
361 */
362
363 if ((flags & MAP_ANON) == 0) {
364
365 if (fd < 0 || fd >= fdp->fd_nfiles)
366 return(EBADF); /* failed range check? */
367 fp = fdp->fd_ofiles[fd]; /* convert to file pointer */
368 if (fp == NULL)
369 return(EBADF);
370
371 if (fp->f_type != DTYPE_VNODE)
372 return (ENODEV); /* only mmap vnodes! */
373 vp = (struct vnode *)fp->f_data; /* convert to vnode */
374
375 if (vp->v_type != VREG && vp->v_type != VCHR &&
376 vp->v_type != VBLK)
377 return (ENODEV); /* only REG/CHR/BLK support mmap */
378
379 if (vp->v_type == VREG && (pos + size) < pos)
380 return (EOVERFLOW); /* no offset wrapping */
381
382 /* special case: catch SunOS style /dev/zero */
383 if (vp->v_type == VCHR && iszerodev(vp->v_rdev)) {
384 flags |= MAP_ANON;
385 goto is_anon;
386 }
387
388 /*
389 * Old programs may not select a specific sharing type, so
390 * default to an appropriate one.
391 *
392 * XXX: how does MAP_ANON fit in the picture?
393 */
394 if ((flags & (MAP_SHARED|MAP_PRIVATE)) == 0) {
395 #if defined(DEBUG)
396 printf("WARNING: defaulted mmap() share type to "
397 "%s (pid %d comm %s)\n", vp->v_type == VCHR ?
398 "MAP_SHARED" : "MAP_PRIVATE", p->p_pid,
399 p->p_comm);
400 #endif
401 if (vp->v_type == VCHR)
402 flags |= MAP_SHARED; /* for a device */
403 else
404 flags |= MAP_PRIVATE; /* for a file */
405 }
406
407 /*
408 * MAP_PRIVATE device mappings don't make sense (and aren't
409 * supported anyway). However, some programs rely on this,
410 * so just change it to MAP_SHARED.
411 */
412 if (vp->v_type == VCHR && (flags & MAP_PRIVATE) != 0) {
413 flags = (flags & ~MAP_PRIVATE) | MAP_SHARED;
414 }
415
416 /*
417 * now check protection
418 */
419
420 maxprot = VM_PROT_EXECUTE;
421
422 /* check read access */
423 if (fp->f_flag & FREAD)
424 maxprot |= VM_PROT_READ;
425 else if (prot & PROT_READ)
426 return (EACCES);
427
428 /* check write access, shared case first */
429 if (flags & MAP_SHARED) {
430 /*
431 * if the file is writable, only add PROT_WRITE to
432 * maxprot if the file is neither immutable nor append-only.
433 * otherwise, if we have asked for PROT_WRITE, return
434 * EPERM.
435 */
436 if (fp->f_flag & FWRITE) {
437 if ((error =
438 VOP_GETATTR(vp, &va, p->p_ucred, p)))
439 return (error);
440 if ((va.va_flags & (IMMUTABLE|APPEND)) == 0)
441 maxprot |= VM_PROT_WRITE;
442 else if (prot & PROT_WRITE)
443 return (EPERM);
444 }
445 else if (prot & PROT_WRITE)
446 return (EACCES);
447 } else {
448 /* MAP_PRIVATE mappings can always be written to */
449 maxprot |= VM_PROT_WRITE;
450 }
451
452 /*
453 * set handle to vnode
454 */
455
456 handle = (caddr_t)vp;
457
458 } else { /* MAP_ANON case */
459 /*
460 * XXX What do we do about (MAP_SHARED|MAP_PRIVATE) == 0?
461 */
462 if (fd != -1)
463 return (EINVAL);
464
465 is_anon: /* label for SunOS style /dev/zero */
466 handle = NULL;
467 maxprot = VM_PROT_ALL;
468 pos = 0;
469 }
470
471 /*
472 * XXX (in)sanity check. We don't do proper datasize checking
473 * XXX for anonymous (or private writable) mmap(). However, we
474 * XXX know that if we're trying to allocate more than the amount
475 * XXX remaining under our current data size limit, _that_ should
476 * XXX be disallowed.
477 */
478 if ((flags & MAP_ANON) != 0 ||
479 ((flags & MAP_PRIVATE) != 0 && (prot & PROT_WRITE) != 0)) {
480 if (size >
481 (p->p_rlimit[RLIMIT_DATA].rlim_cur - ctob(p->p_vmspace->vm_dsize))) {
482 return (ENOMEM);
483 }
484 }
485
486 /*
487 * now let kernel internal function uvm_mmap do the work.
488 */
489
490 error = uvm_mmap(&p->p_vmspace->vm_map, &addr, size, prot, maxprot,
491 flags, handle, pos, p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
492
493 if (error == 0)
494 /* remember to add offset */
495 *retval = (register_t)(addr + pageoff);
496
497 return (error);
498 }
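/*
 * Illustrative sketch, not compiled (#if 0), of the offset handling that
 * sys_mmap() performs above: a caller may pass a file offset that is not
 * page aligned, the mapping is established at trunc_page() of that offset,
 * and the returned pointer is bumped by the page offset so it points at
 * exactly the byte requested.  The file name "file.dat" and its size are
 * assumptions of the sketch.
 */
#if 0
#include <sys/mman.h>

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	off_t pos = 100;		/* deliberately not page aligned */
	size_t len = 4096;
	int fd = open("file.dat", O_RDONLY);
	char *p;

	if (fd == -1)
		return 1;
	p = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, pos);
	if (p == MAP_FAILED)
		return 1;
	/* p points at byte 100 of the file, not at the page base */
	printf("byte at offset %lld is 0x%02x\n", (long long)pos,
	    (unsigned char)p[0]);
	/* sys_munmap() applies the same trunc_page() fixup, so this is fine */
	munmap(p, len);
	close(fd);
	return 0;
}
#endif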
499
500 /*
501 * sys___msync13: the msync system call (a front-end for flush)
502 */
503
504 int
505 sys___msync13(p, v, retval)
506 struct proc *p;
507 void *v;
508 register_t *retval;
509 {
510 struct sys___msync13_args /* {
511 syscallarg(caddr_t) addr;
512 syscallarg(size_t) len;
513 syscallarg(int) flags;
514 } */ *uap = v;
515 vaddr_t addr;
516 vsize_t size, pageoff;
517 vm_map_t map;
518 int rv, flags, uvmflags;
519
520 /*
521 * extract syscall args from the uap
522 */
523
524 addr = (vaddr_t)SCARG(uap, addr);
525 size = (vsize_t)SCARG(uap, len);
526 flags = SCARG(uap, flags);
527
528 /* sanity check flags */
529 if ((flags & ~(MS_ASYNC | MS_SYNC | MS_INVALIDATE)) != 0 ||
530 (flags & (MS_ASYNC | MS_SYNC | MS_INVALIDATE)) == 0 ||
531 (flags & (MS_ASYNC | MS_SYNC)) == (MS_ASYNC | MS_SYNC))
532 return (EINVAL);
533 if ((flags & (MS_ASYNC | MS_SYNC)) == 0)
534 flags |= MS_SYNC;
535
536 /*
537 * align the address to a page boundary, and adjust the size accordingly
538 */
539
540 pageoff = (addr & PAGE_MASK);
541 addr -= pageoff;
542 size += pageoff;
543 size = (vsize_t) round_page(size);
544
545 /* disallow wrap-around. */
546 if (addr + size < addr)
547 return (EINVAL);
548
549 /*
550 * get map
551 */
552
553 map = &p->p_vmspace->vm_map;
554
555 /*
556 * XXXCDC: do we really need this semantic?
557 *
558 * XXX Gak! If size is zero we are supposed to sync "all modified
559 * pages within the region containing addr". Unfortunately, we
560 * don't really keep track of individual mmaps so we approximate
561 * by flushing the range of the map entry containing addr.
562 * This can be incorrect if the region splits or is coalesced
563 * with a neighbor.
564 */
565 if (size == 0) {
566 vm_map_entry_t entry;
567
568 vm_map_lock_read(map);
569 rv = uvm_map_lookup_entry(map, addr, &entry);
570 if (rv == TRUE) {
571 addr = entry->start;
572 size = entry->end - entry->start;
573 }
574 vm_map_unlock_read(map);
575 if (rv == FALSE)
576 return (EINVAL);
577 }
578
579 /*
580 * translate MS_ flags into PGO_ flags
581 */
582 uvmflags = PGO_CLEANIT;
583 if (flags & MS_INVALIDATE)
584 uvmflags |= PGO_FREE;
585 if (flags & MS_SYNC)
586 uvmflags |= PGO_SYNCIO;
587 else
588 uvmflags |= PGO_SYNCIO; /* XXXCDC: force sync for now! */
589
590 /*
591 * doit!
592 */
593 rv = uvm_map_clean(map, addr, addr+size, uvmflags);
594
595 /*
596 * and return...
597 */
598 switch (rv) {
599 case KERN_SUCCESS:
600 return(0);
601 case KERN_INVALID_ADDRESS:
602 return (ENOMEM);
603 case KERN_FAILURE:
604 return (EIO);
605 case KERN_PAGES_LOCKED: /* XXXCDC: uvm doesn't return this */
606 return (EBUSY);
607 default:
608 return (EINVAL);
609 }
610 /*NOTREACHED*/
611 }
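/*
 * Illustrative sketch, not compiled (#if 0): flushing a dirty shared file
 * mapping through msync(2), the front-end implemented above.  MS_SYNC maps
 * to PGO_SYNCIO, so the call does not return until the pages have been
 * written back.  The file "data.bin" and its minimum size of 8192 bytes
 * are assumptions of the sketch.
 */
#if 0
#include <sys/mman.h>

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	size_t len = 8192;
	int fd = open("data.bin", O_RDWR);
	char *p;

	if (fd == -1)
		return 1;
	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;
	memset(p, 0, len);			/* dirty the pages */
	if (msync(p, len, MS_SYNC) == -1)	/* push them back to the file */
		return 1;
	munmap(p, len);
	close(fd);
	return 0;
}
#endif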
612
613 /*
614 * sys_munmap: unmap a user's memory
615 */
616
617 int
618 sys_munmap(p, v, retval)
619 struct proc *p;
620 void *v;
621 register_t *retval;
622 {
623 struct sys_munmap_args /* {
624 syscallarg(caddr_t) addr;
625 syscallarg(size_t) len;
626 } */ *uap = v;
627 vaddr_t addr;
628 vsize_t size, pageoff;
629 vm_map_t map;
630 vaddr_t vm_min_address = VM_MIN_ADDRESS;
631 struct vm_map_entry *dead_entries;
632
633 /*
634 * get syscall args...
635 */
636
637 addr = (vaddr_t) SCARG(uap, addr);
638 size = (vsize_t) SCARG(uap, len);
639
640 /*
641 * align the address to a page boundary, and adjust the size accordingly
642 */
643
644 pageoff = (addr & PAGE_MASK);
645 addr -= pageoff;
646 size += pageoff;
647 size = (vsize_t) round_page(size);
648
649 if ((int)size < 0)
650 return (EINVAL);
651 if (size == 0)
652 return (0);
653
654 /*
655 * Check for illegal addresses. Watch out for address wrap...
656 * Note that VM_*_ADDRESS are not constants due to casts (argh).
657 */
658 if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS)
659 return (EINVAL);
660 if (vm_min_address > 0 && addr < vm_min_address)
661 return (EINVAL);
662 if (addr > addr + size)
663 return (EINVAL);
664 map = &p->p_vmspace->vm_map;
665
666
667 vm_map_lock(map); /* lock map so we can checkprot */
668
669 /*
670 * interesting system call semantic: make sure entire range is
671 * allocated before allowing an unmap.
672 */
673
674 if (!uvm_map_checkprot(map, addr, addr + size, VM_PROT_NONE)) {
675 vm_map_unlock(map);
676 return (EINVAL);
677 }
678
679 /*
680 * doit!
681 */
682 (void) uvm_unmap_remove(map, addr, addr + size, &dead_entries);
683
684 vm_map_unlock(map); /* and unlock */
685
686 if (dead_entries != NULL)
687 uvm_unmap_detach(dead_entries, 0);
688
689 return (0);
690 }
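/*
 * Illustrative sketch, not compiled (#if 0), of the "entire range must be
 * allocated" semantic enforced above: unmapping a region that is already a
 * hole fails with EINVAL rather than silently succeeding.
 */
#if 0
#include <sys/mman.h>

#include <errno.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	size_t pgsz = (size_t)sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, 2 * pgsz, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	if (munmap(p, 2 * pgsz) == -1)	/* fully mapped: succeeds */
		return 1;
	if (munmap(p, 2 * pgsz) == -1 && errno == EINVAL)
		printf("second unmap rejected, as expected\n");
	return 0;
}
#endif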
691
692 /*
693 * sys_mprotect: the mprotect system call
694 */
695
696 int
697 sys_mprotect(p, v, retval)
698 struct proc *p;
699 void *v;
700 register_t *retval;
701 {
702 struct sys_mprotect_args /* {
703 syscallarg(caddr_t) addr;
704 syscallarg(int) len;
705 syscallarg(int) prot;
706 } */ *uap = v;
707 vaddr_t addr;
708 vsize_t size, pageoff;
709 vm_prot_t prot;
710 int rv;
711
712 /*
713 * extract syscall args from uap
714 */
715
716 addr = (vaddr_t)SCARG(uap, addr);
717 size = (vsize_t)SCARG(uap, len);
718 prot = SCARG(uap, prot) & VM_PROT_ALL;
719
720 /*
721 * align the address to a page boundary, and adjust the size accordingly
722 */
723 pageoff = (addr & PAGE_MASK);
724 addr -= pageoff;
725 size += pageoff;
726 size = (vsize_t) round_page(size);
727 if ((int)size < 0)
728 return (EINVAL);
729
730 /*
731 * doit
732 */
733
734 rv = uvm_map_protect(&p->p_vmspace->vm_map,
735 addr, addr+size, prot, FALSE);
736
737 if (rv == KERN_SUCCESS)
738 return (0);
739 if (rv == KERN_PROTECTION_FAILURE)
740 return (EACCES);
741 return (EINVAL);
742 }
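/*
 * Illustrative sketch, not compiled (#if 0): toggling write access on an
 * anonymous mapping with mprotect(2).  The requested protection is clipped
 * to VM_PROT_ALL above, and uvm_map_protect() rejects protections beyond
 * the entry's maximum with EACCES.
 */
#if 0
#include <sys/mman.h>

#include <string.h>
#include <unistd.h>

int
main(void)
{
	size_t pgsz = (size_t)sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, pgsz, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	memcpy(p, "hello", 6);
	if (mprotect(p, pgsz, PROT_READ) == -1)		/* read-only now */
		return 1;
	if (mprotect(p, pgsz, PROT_READ | PROT_WRITE) == -1)	/* writable */
		return 1;
	munmap(p, pgsz);
	return 0;
}
#endif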
743
744 /*
745 * sys_minherit: the minherit system call
746 */
747
748 int
749 sys_minherit(p, v, retval)
750 struct proc *p;
751 void *v;
752 register_t *retval;
753 {
754 struct sys_minherit_args /* {
755 syscallarg(caddr_t) addr;
756 syscallarg(int) len;
757 syscallarg(int) inherit;
758 } */ *uap = v;
759 vaddr_t addr;
760 vsize_t size, pageoff;
761 vm_inherit_t inherit;
762
763 addr = (vaddr_t)SCARG(uap, addr);
764 size = (vsize_t)SCARG(uap, len);
765 inherit = SCARG(uap, inherit);
766 /*
767 * align the address to a page boundary, and adjust the size accordingly
768 */
769
770 pageoff = (addr & PAGE_MASK);
771 addr -= pageoff;
772 size += pageoff;
773 size = (vsize_t) round_page(size);
774
775 if ((int)size < 0)
776 return (EINVAL);
777
778 switch (uvm_map_inherit(&p->p_vmspace->vm_map, addr, addr+size,
779 inherit)) {
780 case KERN_SUCCESS:
781 return (0);
782 case KERN_PROTECTION_FAILURE:
783 return (EACCES);
784 }
785 return (EINVAL);
786 }
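/*
 * Illustrative sketch, not compiled (#if 0): a hypothetical helper that
 * uses minherit(2), implemented above, to keep a page of sensitive data
 * out of any child created later by fork().  MAP_INHERIT_NONE from
 * <sys/mman.h> is assumed here as the userland spelling of the inheritance
 * value handed to uvm_map_inherit().
 */
#if 0
#include <sys/mman.h>

#include <stddef.h>

static void *
alloc_noinherit_page(size_t pgsz)
{
	void *p = mmap(NULL, pgsz, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);

	if (p == MAP_FAILED)
		return NULL;
	if (minherit(p, pgsz, MAP_INHERIT_NONE) == -1) {
		munmap(p, pgsz);
		return NULL;
	}
	return p;
}
#endif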
787
788 /*
789 * sys_madvise: give advice about memory usage.
790 */
791
792 /* ARGSUSED */
793 int
794 sys_madvise(p, v, retval)
795 struct proc *p;
796 void *v;
797 register_t *retval;
798 {
799 struct sys_madvise_args /* {
800 syscallarg(caddr_t) addr;
801 syscallarg(size_t) len;
802 syscallarg(int) behav;
803 } */ *uap = v;
804 vaddr_t addr;
805 vsize_t size, pageoff;
806 int advice, rv;
807
808 addr = (vaddr_t)SCARG(uap, addr);
809 size = (vsize_t)SCARG(uap, len);
810 advice = SCARG(uap, behav);
811
812 /*
813 * align the address to a page boundary, and adjust the size accordingly
814 */
815 pageoff = (addr & PAGE_MASK);
816 addr -= pageoff;
817 size += pageoff;
818 size = (vsize_t) round_page(size);
819
820 if ((ssize_t)size <= 0)
821 return (EINVAL);
822
823 switch (advice) {
824 case MADV_NORMAL:
825 case MADV_RANDOM:
826 case MADV_SEQUENTIAL:
827 rv = uvm_map_advice(&p->p_vmspace->vm_map, addr, addr + size,
828 advice);
829 break;
830
831 case MADV_WILLNEED:
832 /*
833 * Activate all these pages, pre-faulting them in if
834 * necessary.
835 */
836 /*
837 * XXX IMPLEMENT ME.
838 * Should invent a "weak" mode for uvm_fault()
839 * which would only do the PGO_LOCKED pgo_get().
840 */
841 return (0);
842
843 case MADV_DONTNEED:
844 /*
845 * Deactivate all these pages. We don't need them
846 * any more. We don't, however, toss the data in
847 * the pages.
848 */
849 rv = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
850 PGO_DEACTIVATE);
851 break;
852
853 case MADV_FREE:
854 /*
855 * These pages contain no valid data, and may be
856 * garbage-collected. Toss all resources, including
857 * any swap space in use.
858 */
859 rv = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
860 PGO_FREE);
861 break;
862
863 case MADV_SPACEAVAIL:
864 /*
865 * XXXMRG What is this? I think it's:
866 *
867 * Ensure that we have allocated backing-store
868 * for these pages.
869 *
870 * This is going to require changes to the page daemon,
871 * as it will free swap space allocated to pages in core.
872 * There's also what to do for device/file/anonymous memory.
873 */
874 return (EINVAL);
875
876 default:
877 return (EINVAL);
878 }
879
880 switch (rv) {
881 case KERN_SUCCESS:
882 return (0);
883 case KERN_NO_SPACE:
884 return (EAGAIN);
885 case KERN_INVALID_ADDRESS:
886 return (ENOMEM);
887 case KERN_FAILURE:
888 return (EIO);
889 }
890
891 return (EINVAL);
892 }
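/*
 * Illustrative sketch, not compiled (#if 0): a hypothetical single-pass
 * file scanner using two of the advice values handled above,
 * MADV_SEQUENTIAL (passed to uvm_map_advice()) and MADV_DONTNEED (mapped
 * to uvm_map_clean() with PGO_DEACTIVATE).
 */
#if 0
#include <sys/mman.h>
#include <sys/stat.h>

#include <fcntl.h>
#include <unistd.h>

static void
scan_once(const char *path)
{
	struct stat st;
	volatile unsigned char sink = 0;
	unsigned char *p;
	off_t i;
	int fd = open(path, O_RDONLY);

	if (fd == -1)
		return;
	if (fstat(fd, &st) == -1 || st.st_size == 0) {
		close(fd);
		return;
	}
	p = mmap(NULL, (size_t)st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED) {
		close(fd);
		return;
	}
	/* strictly sequential access: encourage aggressive read-ahead */
	madvise(p, (size_t)st.st_size, MADV_SEQUENTIAL);
	for (i = 0; i < st.st_size; i++)
		sink ^= p[i];
	/* done with the data: let the pages be deactivated */
	madvise(p, (size_t)st.st_size, MADV_DONTNEED);
	munmap(p, (size_t)st.st_size);
	close(fd);
}
#endif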
893
894 /*
895 * sys_mlock: memory lock
896 */
897
898 int
899 sys_mlock(p, v, retval)
900 struct proc *p;
901 void *v;
902 register_t *retval;
903 {
904 struct sys_mlock_args /* {
905 syscallarg(const void *) addr;
906 syscallarg(size_t) len;
907 } */ *uap = v;
908 vaddr_t addr;
909 vsize_t size, pageoff;
910 int error;
911
912 /*
913 * extract syscall args from uap
914 */
915 addr = (vaddr_t)SCARG(uap, addr);
916 size = (vsize_t)SCARG(uap, len);
917
918 /*
919 * align the address to a page boundary and adjust the size accordingly
920 */
921 pageoff = (addr & PAGE_MASK);
922 addr -= pageoff;
923 size += pageoff;
924 size = (vsize_t) round_page(size);
925
926 /* disallow wrap-around. */
927 if (addr + (int)size < addr)
928 return (EINVAL);
929
930 if (atop(size) + uvmexp.wired > uvmexp.wiredmax)
931 return (EAGAIN);
932
933 #ifdef pmap_wired_count
934 if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) >
935 p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur)
936 return (EAGAIN);
937 #else
938 if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
939 return (error);
940 #endif
941
942 error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, FALSE,
943 0);
944 return (error == KERN_SUCCESS ? 0 : ENOMEM);
945 }
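/*
 * Illustrative sketch, not compiled (#if 0): a hypothetical routine that
 * wires one page of key material with mlock(2) so it cannot be paged to
 * swap, then scrubs, unwires, and unmaps it.  As the checks above show,
 * the request can fail with EAGAIN against uvmexp.wiredmax or the
 * RLIMIT_MEMLOCK resource limit.
 */
#if 0
#include <sys/mman.h>

#include <string.h>
#include <unistd.h>

static int
with_wired_page(void)
{
	size_t pgsz = (size_t)sysconf(_SC_PAGESIZE);
	char *key = mmap(NULL, pgsz, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);

	if (key == MAP_FAILED)
		return -1;
	if (mlock(key, pgsz) == -1) {
		munmap(key, pgsz);
		return -1;
	}
	/* ... generate and use the key material here ... */
	memset(key, 0, pgsz);
	munlock(key, pgsz);
	munmap(key, pgsz);
	return 0;
}
#endif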
946
947 /*
948 * sys_munlock: unlock wired pages
949 */
950
951 int
952 sys_munlock(p, v, retval)
953 struct proc *p;
954 void *v;
955 register_t *retval;
956 {
957 struct sys_munlock_args /* {
958 syscallarg(const void *) addr;
959 syscallarg(size_t) len;
960 } */ *uap = v;
961 vaddr_t addr;
962 vsize_t size, pageoff;
963 int error;
964
965 /*
966 * extract syscall args from uap
967 */
968
969 addr = (vaddr_t)SCARG(uap, addr);
970 size = (vsize_t)SCARG(uap, len);
971
972 /*
973 * align the address to a page boundary, and adjust the size accordingly
974 */
975 pageoff = (addr & PAGE_MASK);
976 addr -= pageoff;
977 size += pageoff;
978 size = (vsize_t) round_page(size);
979
980 /* disallow wrap-around. */
981 if (addr + (int)size < addr)
982 return (EINVAL);
983
984 #ifndef pmap_wired_count
985 if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
986 return (error);
987 #endif
988
989 error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, TRUE,
990 0);
991 return (error == KERN_SUCCESS ? 0 : ENOMEM);
992 }
993
994 /*
995 * sys_mlockall: lock all pages mapped into an address space.
996 */
997
998 int
999 sys_mlockall(p, v, retval)
1000 struct proc *p;
1001 void *v;
1002 register_t *retval;
1003 {
1004 struct sys_mlockall_args /* {
1005 syscallarg(int) flags;
1006 } */ *uap = v;
1007 int error, flags;
1008
1009 flags = SCARG(uap, flags);
1010
1011 if (flags == 0 ||
1012 (flags & ~(MCL_CURRENT|MCL_FUTURE)) != 0)
1013 return (EINVAL);
1014
1015 #ifndef pmap_wired_count
1016 if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
1017 return (error);
1018 #endif
1019
1020 error = uvm_map_pageable_all(&p->p_vmspace->vm_map, flags,
1021 p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
1022 switch (error) {
1023 case KERN_SUCCESS:
1024 error = 0;
1025 break;
1026
1027 case KERN_NO_SPACE: /* XXX overloaded */
1028 error = ENOMEM;
1029 break;
1030
1031 default:
1032 /*
1033 * "Some or all of the memory could not be locked when
1034 * the call was made."
1035 */
1036 error = EAGAIN;
1037 }
1038
1039 return (error);
1040 }
1041
1042 /*
1043 * sys_munlockall: unlock all pages mapped into an address space.
1044 */
1045
1046 int
1047 sys_munlockall(p, v, retval)
1048 struct proc *p;
1049 void *v;
1050 register_t *retval;
1051 {
1052
1053 (void) uvm_map_pageable_all(&p->p_vmspace->vm_map, 0, 0);
1054 return (0);
1055 }
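/*
 * Illustrative sketch, not compiled (#if 0): a latency-sensitive process
 * wiring its whole address space, including future mappings, and releasing
 * everything again at teardown.  Per the code above, mlockall(2) returns
 * EINVAL for a zero or unknown flags word, ENOMEM for the (overloaded)
 * KERN_NO_SPACE case, and EAGAIN when some pages could not be locked.
 */
#if 0
#include <sys/mman.h>

static int
enter_realtime_mode(void)
{
	/* wire everything mapped now and everything mapped later */
	return mlockall(MCL_CURRENT | MCL_FUTURE);
}

static void
leave_realtime_mode(void)
{
	/* unwire the entire address space */
	(void)munlockall();
}
#endif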
1056
1057 /*
1058 * uvm_mmap: internal version of mmap
1059 *
1060 * - used by sys_mmap, exec, and sysv shm
1061 * - handle is a vnode pointer or NULL for MAP_ANON (XXX: not true,
1062 * sysv shm uses "named anonymous memory")
1063 * - caller must page-align the file offset
1064 */
1065
1066 int
1067 uvm_mmap(map, addr, size, prot, maxprot, flags, handle, foff, locklimit)
1068 vm_map_t map;
1069 vaddr_t *addr;
1070 vsize_t size;
1071 vm_prot_t prot, maxprot;
1072 int flags;
1073 caddr_t handle; /* XXX: VNODE? */
1074 voff_t foff;
1075 vsize_t locklimit;
1076 {
1077 struct uvm_object *uobj;
1078 struct vnode *vp;
1079 int retval;
1080 int advice = UVM_ADV_NORMAL;
1081 uvm_flag_t uvmflag = 0;
1082
1083 /*
1084 * check params
1085 */
1086
1087 if (size == 0)
1088 return(0);
1089 if (foff & PAGE_MASK)
1090 return(EINVAL);
1091 if ((prot & maxprot) != prot)
1092 return(EINVAL);
1093
1094 /*
1095 * for non-fixed mappings, round off the suggested address.
1096 * for fixed mappings, check alignment and zap old mappings.
1097 */
1098
1099 if ((flags & MAP_FIXED) == 0) {
1100 *addr = round_page(*addr); /* round */
1101 } else {
1102
1103 if (*addr & PAGE_MASK)
1104 return(EINVAL);
1105 uvmflag |= UVM_FLAG_FIXED;
1106 (void) uvm_unmap(map, *addr, *addr + size); /* zap! */
1107 }
1108
1109 /*
1110 * handle anon vs. non-anon mappings. for non-anon mappings attach
1111 * to underlying vm object.
1112 */
1113
1114 if (flags & MAP_ANON) {
1115 foff = UVM_UNKNOWN_OFFSET;
1116 uobj = NULL;
1117 if ((flags & MAP_SHARED) == 0)
1118 /* XXX: defer amap create */
1119 uvmflag |= UVM_FLAG_COPYONW;
1120 else
1121 /* shared: create amap now */
1122 uvmflag |= UVM_FLAG_OVERLAY;
1123
1124 } else {
1125
1126 vp = (struct vnode *) handle; /* get vnode */
1127 if (vp->v_type != VCHR) {
1128 uobj = uvn_attach((void *) vp, (flags & MAP_SHARED) ?
1129 maxprot : (maxprot & ~VM_PROT_WRITE));
1130
1131 /* XXX for now, attach doesn't gain a ref */
1132 VREF(vp);
1133 } else {
1134 uobj = udv_attach((void *) &vp->v_rdev,
1135 (flags & MAP_SHARED) ? maxprot :
1136 (maxprot & ~VM_PROT_WRITE), foff, size);
1137 /*
1138 * XXX Some devices don't like to be mapped with
1139 * XXX PROT_EXEC, but we don't really have a
1140 * XXX better way of handling this, right now
1141 */
1142 if (uobj == NULL && (prot & PROT_EXEC) == 0) {
1143 maxprot &= ~VM_PROT_EXECUTE;
1144 uobj = udv_attach((void *) &vp->v_rdev,
1145 (flags & MAP_SHARED) ? maxprot :
1146 (maxprot & ~VM_PROT_WRITE), foff, size);
1147 }
1148 advice = UVM_ADV_RANDOM;
1149 }
1150
1151 if (uobj == NULL)
1152 return((vp->v_type == VREG) ? ENOMEM : EINVAL);
1153
1154 if ((flags & MAP_SHARED) == 0)
1155 uvmflag |= UVM_FLAG_COPYONW;
1156 }
1157
1158 /*
1159 * set up mapping flags
1160 */
1161
1162 uvmflag = UVM_MAPFLAG(prot, maxprot,
1163 (flags & MAP_SHARED) ? UVM_INH_SHARE : UVM_INH_COPY,
1164 advice, uvmflag);
1165
1166 /*
1167 * do it!
1168 */
1169
1170 retval = uvm_map(map, addr, size, uobj, foff, 0, uvmflag);
1171
1172 if (retval == KERN_SUCCESS) {
1173 /*
1174 * POSIX 1003.1b -- if our address space was configured
1175 * to lock all future mappings, wire the one we just made.
1176 */
1177 if (prot == VM_PROT_NONE) {
1178 /*
1179 * No more work to do in this case.
1180 */
1181 return (0);
1182 }
1183
1184 vm_map_lock(map);
1185
1186 if (map->flags & VM_MAP_WIREFUTURE) {
1187 if ((atop(size) + uvmexp.wired) > uvmexp.wiredmax
1188 #ifdef pmap_wired_count
1189 || (locklimit != 0 && (size +
1190 ptoa(pmap_wired_count(vm_map_pmap(map)))) >
1191 locklimit)
1192 #endif
1193 ) {
1194 retval = KERN_RESOURCE_SHORTAGE;
1195 vm_map_unlock(map);
1196 /* unmap the region! */
1197 (void) uvm_unmap(map, *addr, *addr + size);
1198 goto bad;
1199 }
1200 /*
1201 * uvm_map_pageable() always returns the map
1202 * unlocked.
1203 */
1204 retval = uvm_map_pageable(map, *addr, *addr + size,
1205 FALSE, UVM_LK_ENTER);
1206 if (retval != KERN_SUCCESS) {
1207 /* unmap the region! */
1208 (void) uvm_unmap(map, *addr, *addr + size);
1209 goto bad;
1210 }
1211 return (0);
1212 }
1213
1214 vm_map_unlock(map);
1215
1216 return (0);
1217 }
1218
1219 /*
1220 * errors: first detach from the uobj, if any.
1221 */
1222
1223 if (uobj)
1224 uobj->pgops->pgo_detach(uobj);
1225
1226 bad:
1227 switch (retval) {
1228 case KERN_INVALID_ADDRESS:
1229 case KERN_NO_SPACE:
1230 return(ENOMEM);
1231 case KERN_RESOURCE_SHORTAGE:
1232 return (EAGAIN);
1233 case KERN_PROTECTION_FAILURE:
1234 return(EACCES);
1235 }
1236 return(EINVAL);
1237 }
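/*
 * Illustrative sketch, not compiled (#if 0): a hypothetical in-kernel
 * caller of uvm_mmap() above, in the style of sys_mmap(): it hands the
 * current process a fresh anonymous, copy-on-write region at a
 * kernel-chosen address above the heap.  The helper name and its policy
 * choices (hint, protections, lock limit) are assumptions of the sketch,
 * not an existing interface.
 */
#if 0
static int
give_anon_region(struct proc *p, vaddr_t *addrp, vsize_t len)
{
	/* hint past the largest possible heap, as sys_mmap() does */
	*addrp = round_page((vaddr_t)p->p_vmspace->vm_daddr + MAXDSIZ);
	len = round_page(len);

	return uvm_mmap(&p->p_vmspace->vm_map, addrp, len,
	    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_ALL,
	    MAP_ANON | MAP_PRIVATE, NULL, 0,
	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
}
#endif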
1238