/*	$NetBSD: uvm_mmap.c,v 1.49.2.3 2001/06/21 20:10:37 nathanw Exp $	*/

/*
 * Copyright (c) 1997 Charles D. Cranor and Washington University.
 * Copyright (c) 1991, 1993 The Regents of the University of California.
 * Copyright (c) 1988 University of Utah.
 *
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the Charles D. Cranor,
 *      Washington University, University of California, Berkeley and
 *      its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *      @(#)vm_mmap.c 8.5 (Berkeley) 5/19/94
 * from: Id: uvm_mmap.c,v 1.1.2.14 1998/01/05 21:04:26 chuck Exp
 */

/*
 * uvm_mmap.c: system call interface into VM system, plus kernel vm_mmap
 * function.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/resourcevar.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/lwp.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <sys/stat.h>

#include <miscfs/specfs/specdev.h>

#include <sys/syscallargs.h>

#include <uvm/uvm.h>
#include <uvm/uvm_device.h>
#include <uvm/uvm_vnode.h>


/*
 * unimplemented VM system calls:
 */

/*
 * sys_sbrk: sbrk system call.
 */

/* ARGSUSED */
int
sys_sbrk(l, v, retval)
	struct lwp *l;
	void *v;
	register_t *retval;
{
#if 0
	struct sys_sbrk_args /* {
		syscallarg(intptr_t) incr;
	} */ *uap = v;
#endif

	return (ENOSYS);
}

/*
 * sys_sstk: sstk system call.
 */

/* ARGSUSED */
int
sys_sstk(l, v, retval)
	struct lwp *l;
	void *v;
	register_t *retval;
{
#if 0
	struct sys_sstk_args /* {
		syscallarg(int) incr;
	} */ *uap = v;
#endif

	return (ENOSYS);
}

/*
 * sys_mincore: determine if pages are in core or not.
 */
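
/*
 * Residency is determined in two layers below: the anon layer (amap) is
 * checked first, then the backing uvm_object.  Entries backed by an
 * object with no pgo_releasepg op (e.g. device mappings) have no "real"
 * pages and are always reported resident.
 */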

/* ARGSUSED */
int
sys_mincore(l, v, retval)
	struct lwp *l;
	void *v;
	register_t *retval;
{
	struct sys_mincore_args /* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(char *) vec;
	} */ *uap = v;
	struct proc *p = l->l_proc;
	struct vm_page *m;
	char *vec, pgi;
	struct uvm_object *uobj;
	struct vm_amap *amap;
	struct vm_anon *anon;
	struct vm_map_entry *entry;
	vaddr_t start, end, lim;
	struct vm_map *map;
	vsize_t len;
	int error = 0, npgs;

	map = &p->p_vmspace->vm_map;

	start = (vaddr_t)SCARG(uap, addr);
	len = SCARG(uap, len);
	vec = SCARG(uap, vec);

	if (start & PAGE_MASK)
		return (EINVAL);
	len = round_page(len);
	end = start + len;
	if (end <= start)
		return (EINVAL);

	npgs = len >> PAGE_SHIFT;

	if (uvm_useracc(vec, npgs, B_WRITE) == FALSE)
		return (EFAULT);

	/*
	 * Lock down vec, so our returned status isn't outdated by
	 * storing the status byte for a page.
	 */

	uvm_vslock(p, vec, npgs, VM_PROT_WRITE);
	vm_map_lock_read(map);

	if (uvm_map_lookup_entry(map, start, &entry) == FALSE) {
		error = ENOMEM;
		goto out;
	}

	for (/* nothing */;
	    entry != &map->header && entry->start < end;
	    entry = entry->next) {
		KASSERT(!UVM_ET_ISSUBMAP(entry));
		KASSERT(start >= entry->start);

		/* Make sure there are no holes. */
		if (entry->end < end &&
		    (entry->next == &map->header ||
		    entry->next->start > entry->end)) {
			error = ENOMEM;
			goto out;
		}

		lim = end < entry->end ? end : entry->end;

		/*
		 * Special case for objects with no "real" pages.  Those
		 * are always considered resident (mapped devices).
		 */

		if (UVM_ET_ISOBJ(entry)) {
			KASSERT(!UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj));
			if (entry->object.uvm_obj->pgops->pgo_releasepg
			    == NULL) {
				for (/* nothing */; start < lim;
				    start += PAGE_SIZE, vec++)
					subyte(vec, 1);
				continue;
			}
		}

		amap = entry->aref.ar_amap;	/* top layer */
		uobj = entry->object.uvm_obj;	/* bottom layer */

		if (amap != NULL)
			amap_lock(amap);
		if (uobj != NULL)
			simple_lock(&uobj->vmobjlock);

		for (/* nothing */; start < lim; start += PAGE_SIZE, vec++) {
			pgi = 0;
			if (amap != NULL) {
				/* Check the top layer first. */
				anon = amap_lookup(&entry->aref,
				    start - entry->start);
				/* Don't need to lock anon here. */
				if (anon != NULL && anon->u.an_page != NULL) {

					/*
					 * Anon has the page for this entry
					 * offset.
					 */

					pgi = 1;
				}
			}
			if (uobj != NULL && pgi == 0) {
				/* Check the bottom layer. */
				m = uvm_pagelookup(uobj,
				    entry->offset + (start - entry->start));
				if (m != NULL) {

					/*
					 * Object has the page for this entry
					 * offset.
					 */

					pgi = 1;
				}
			}
			(void) subyte(vec, pgi);
		}
		if (uobj != NULL)
			simple_unlock(&uobj->vmobjlock);
		if (amap != NULL)
			amap_unlock(amap);
	}

 out:
	vm_map_unlock_read(map);
	uvm_vsunlock(p, SCARG(uap, vec), npgs);
	return (error);
}

/*
 * sys_mmap: mmap system call.
 *
 * => file offset and address may not be page aligned
 *    - if MAP_FIXED, offset and address must have the same remainder mod PAGE_SIZE
 *    - if address isn't page aligned the mapping starts at trunc_page(addr)
 *      and the return value is adjusted up by the page offset.
 */
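
/*
 * For example (assuming 4 KB pages): a call with pos = 0x12345 maps from
 * the page-aligned file offset 0x12000; pageoff = 0x345 is folded into the
 * size and added back to the address returned to the caller, so the
 * returned pointer still refers to the byte that was requested.
 */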

int
sys_mmap(l, v, retval)
	struct lwp *l;
	void *v;
	register_t *retval;
{
	struct sys_mmap_args /* {
		syscallarg(caddr_t) addr;
		syscallarg(size_t) len;
		syscallarg(int) prot;
		syscallarg(int) flags;
		syscallarg(int) fd;
		syscallarg(long) pad;
		syscallarg(off_t) pos;
	} */ *uap = v;
	struct proc *p = l->l_proc;
	vaddr_t addr;
	struct vattr va;
	off_t pos;
	vsize_t size, pageoff;
	vm_prot_t prot, maxprot;
	int flags, fd;
	vaddr_t vm_min_address = VM_MIN_ADDRESS;
	struct filedesc *fdp = p->p_fd;
	struct file *fp;
	struct vnode *vp;
	void *handle;
	int error;

	/*
	 * first, extract syscall args from the uap.
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	prot = SCARG(uap, prot) & VM_PROT_ALL;
	flags = SCARG(uap, flags);
	fd = SCARG(uap, fd);
	pos = SCARG(uap, pos);

	/*
	 * Fixup the old deprecated MAP_COPY into MAP_PRIVATE, and
	 * validate the flags.
	 */
	if (flags & MAP_COPY)
		flags = (flags & ~MAP_COPY) | MAP_PRIVATE;
	if ((flags & (MAP_SHARED|MAP_PRIVATE)) == (MAP_SHARED|MAP_PRIVATE))
		return (EINVAL);

	/*
	 * align file position and save offset.  adjust size.
	 */

	pageoff = (pos & PAGE_MASK);
	pos -= pageoff;
	size += pageoff;			/* add offset */
	size = (vsize_t)round_page(size);	/* round up */
	if ((ssize_t) size < 0)
		return (EINVAL);		/* don't allow wrap */

	/*
	 * now check (MAP_FIXED) or get (!MAP_FIXED) the "addr"
	 */

	if (flags & MAP_FIXED) {

		/* ensure address and file offset are aligned properly */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);

		if (VM_MAXUSER_ADDRESS > 0 &&
		    (addr + size) > VM_MAXUSER_ADDRESS)
			return (EINVAL);
		if (vm_min_address > 0 && addr < vm_min_address)
			return (EINVAL);
		if (addr > addr + size)
			return (EINVAL);	/* no wrapping! */

	} else {

		/*
		 * not fixed: make sure we skip over the largest possible heap.
		 * we will refine our guess later (e.g. to account for VAC, etc)
		 */

		addr = MAX(addr, round_page((vaddr_t)p->p_vmspace->vm_daddr +
		    MAXDSIZ));
	}

	/*
	 * check for file mappings (i.e. not anonymous) and verify file.
	 */

	if ((flags & MAP_ANON) == 0) {

		if ((fp = fd_getfile(fdp, fd)) == NULL)
			return (EBADF);

		if (fp->f_type != DTYPE_VNODE)
			return (ENODEV);	/* only mmap vnodes! */
		vp = (struct vnode *)fp->f_data;	/* convert to vnode */

		if (vp->v_type != VREG && vp->v_type != VCHR &&
		    vp->v_type != VBLK)
			return (ENODEV);	/* only REG/CHR/BLK support mmap */

		if (vp->v_type == VREG && (pos + size) < pos)
			return (EOVERFLOW);	/* no offset wrapping */

		/* special case: catch SunOS style /dev/zero */
		if (vp->v_type == VCHR && iszerodev(vp->v_rdev)) {
			flags |= MAP_ANON;
			goto is_anon;
		}

		/*
		 * Old programs may not select a specific sharing type, so
		 * default to an appropriate one.
		 *
		 * XXX: how does MAP_ANON fit in the picture?
		 */
		if ((flags & (MAP_SHARED|MAP_PRIVATE)) == 0) {
#if defined(DEBUG)
			printf("WARNING: defaulted mmap() share type to "
			    "%s (pid %d comm %s)\n", vp->v_type == VCHR ?
			    "MAP_SHARED" : "MAP_PRIVATE", p->p_pid,
			    p->p_comm);
#endif
			if (vp->v_type == VCHR)
				flags |= MAP_SHARED;	/* for a device */
			else
				flags |= MAP_PRIVATE;	/* for a file */
		}

		/*
		 * MAP_PRIVATE device mappings don't make sense (and aren't
		 * supported anyway).  However, some programs rely on this,
		 * so just change it to MAP_SHARED.
		 */
		if (vp->v_type == VCHR && (flags & MAP_PRIVATE) != 0) {
			flags = (flags & ~MAP_PRIVATE) | MAP_SHARED;
		}

		/*
		 * now check protection
		 */

		maxprot = VM_PROT_EXECUTE;

		/* check read access */
		if (fp->f_flag & FREAD)
			maxprot |= VM_PROT_READ;
		else if (prot & PROT_READ)
			return (EACCES);

		/* check write access, shared case first */
		if (flags & MAP_SHARED) {
			/*
			 * if the file is writable, only add PROT_WRITE to
			 * maxprot if the file is not immutable or append-only.
			 * otherwise, if we have asked for PROT_WRITE, return
			 * EPERM.
			 */
			if (fp->f_flag & FWRITE) {
				if ((error =
				    VOP_GETATTR(vp, &va, p->p_ucred, p)))
					return (error);
				if ((va.va_flags & (IMMUTABLE|APPEND)) == 0)
					maxprot |= VM_PROT_WRITE;
				else if (prot & PROT_WRITE)
					return (EPERM);
			}
			else if (prot & PROT_WRITE)
				return (EACCES);
		} else {
			/* MAP_PRIVATE mappings can always be written to */
			maxprot |= VM_PROT_WRITE;
		}
		handle = vp;

	} else {		/* MAP_ANON case */
		/*
		 * XXX What do we do about (MAP_SHARED|MAP_PRIVATE) == 0?
		 */
		if (fd != -1)
			return (EINVAL);

 is_anon:		/* label for SunOS style /dev/zero */
		handle = NULL;
		maxprot = VM_PROT_ALL;
		pos = 0;
	}

	/*
	 * XXX (in)sanity check.  We don't do proper datasize checking
	 * XXX for anonymous (or private writable) mmap().  However,
	 * XXX we know that if we're trying to allocate more than the amount
	 * XXX remaining under our current data size limit, _that_ should
	 * XXX be disallowed.
	 */
	if ((flags & MAP_ANON) != 0 ||
	    ((flags & MAP_PRIVATE) != 0 && (prot & PROT_WRITE) != 0)) {
		if (size >
		    (p->p_rlimit[RLIMIT_DATA].rlim_cur -
		    ctob(p->p_vmspace->vm_dsize))) {
			return (ENOMEM);
		}
	}

	/*
	 * now let kernel internal function uvm_mmap do the work.
	 */

	error = uvm_mmap(&p->p_vmspace->vm_map, &addr, size, prot, maxprot,
	    flags, handle, pos, p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);

	if (error == 0)
		/* remember to add offset */
		*retval = (register_t)(addr + pageoff);

	return (error);
}

/*
 * sys___msync13: the msync system call (a front-end for flush)
 */

int
sys___msync13(l, v, retval)
	struct lwp *l;
	void *v;
	register_t *retval;
{
	struct sys___msync13_args /* {
		syscallarg(caddr_t) addr;
		syscallarg(size_t) len;
		syscallarg(int) flags;
	} */ *uap = v;
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	struct vm_map *map;
	int error, rv, flags, uvmflags;

	/*
	 * extract syscall args from the uap
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	flags = SCARG(uap, flags);

	/* sanity check flags */
	if ((flags & ~(MS_ASYNC | MS_SYNC | MS_INVALIDATE)) != 0 ||
	    (flags & (MS_ASYNC | MS_SYNC | MS_INVALIDATE)) == 0 ||
	    (flags & (MS_ASYNC | MS_SYNC)) == (MS_ASYNC | MS_SYNC))
		return (EINVAL);
	if ((flags & (MS_ASYNC | MS_SYNC)) == 0)
		flags |= MS_SYNC;

	/*
	 * align the address to a page boundary and adjust the size accordingly.
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	/* disallow wrap-around. */
	if (addr + size < addr)
		return (EINVAL);

	/*
	 * get map
	 */

	map = &p->p_vmspace->vm_map;

	/*
	 * XXXCDC: do we really need this semantic?
	 *
	 * XXX Gak!  If size is zero we are supposed to sync "all modified
	 * pages within the region containing addr".  Unfortunately, we
	 * don't really keep track of individual mmaps so we approximate
	 * by flushing the range of the map entry containing addr.
	 * This can be incorrect if the region splits or is coalesced
	 * with a neighbor.
	 */

	if (size == 0) {
		struct vm_map_entry *entry;

		vm_map_lock_read(map);
		rv = uvm_map_lookup_entry(map, addr, &entry);
		if (rv == TRUE) {
			addr = entry->start;
			size = entry->end - entry->start;
		}
		vm_map_unlock_read(map);
		if (rv == FALSE)
			return (EINVAL);
	}

	/*
	 * translate MS_ flags into PGO_ flags
	 */

	uvmflags = PGO_CLEANIT;
	if (flags & MS_INVALIDATE)
		uvmflags |= PGO_FREE;
	if (flags & MS_SYNC)
		uvmflags |= PGO_SYNCIO;
	else
		uvmflags |= PGO_SYNCIO;	/* XXXCDC: force sync for now! */

	error = uvm_map_clean(map, addr, addr+size, uvmflags);
	return error;
}

/*
 * sys_munmap: unmap a user's memory
 */

int
sys_munmap(l, v, retval)
	struct lwp *l;
	void *v;
	register_t *retval;
{
	struct sys_munmap_args /* {
		syscallarg(caddr_t) addr;
		syscallarg(size_t) len;
	} */ *uap = v;
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	struct vm_map *map;
	vaddr_t vm_min_address = VM_MIN_ADDRESS;
	struct vm_map_entry *dead_entries;

	/*
	 * get syscall args.
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);

	/*
	 * align the address to a page boundary and adjust the size accordingly.
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	if ((int)size < 0)
		return (EINVAL);
	if (size == 0)
		return (0);

	/*
	 * Check for illegal addresses.  Watch out for address wrap...
	 * Note that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS)
		return (EINVAL);
	if (vm_min_address > 0 && addr < vm_min_address)
		return (EINVAL);
	if (addr > addr + size)
		return (EINVAL);
	map = &p->p_vmspace->vm_map;

	/*
	 * interesting system call semantic: make sure entire range is
	 * allocated before allowing an unmap.
	 */

	vm_map_lock(map);
	if (!uvm_map_checkprot(map, addr, addr + size, VM_PROT_NONE)) {
		vm_map_unlock(map);
		return (EINVAL);
	}
	uvm_unmap_remove(map, addr, addr + size, &dead_entries);
	vm_map_unlock(map);
	if (dead_entries != NULL)
		uvm_unmap_detach(dead_entries, 0);
	return (0);
}

/*
 * sys_mprotect: the mprotect system call
 */

int
sys_mprotect(l, v, retval)
	struct lwp *l;
	void *v;
	register_t *retval;
{
	struct sys_mprotect_args /* {
		syscallarg(caddr_t) addr;
		syscallarg(int) len;
		syscallarg(int) prot;
	} */ *uap = v;
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	vm_prot_t prot;
	int error;

	/*
	 * extract syscall args from uap
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	prot = SCARG(uap, prot) & VM_PROT_ALL;

	/*
	 * align the address to a page boundary and adjust the size accordingly.
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	if ((int)size < 0)
		return (EINVAL);
	error = uvm_map_protect(&p->p_vmspace->vm_map, addr, addr + size, prot,
	    FALSE);
	return error;
}

/*
 * sys_minherit: the minherit system call
 */

int
sys_minherit(l, v, retval)
	struct lwp *l;
	void *v;
	register_t *retval;
{
	struct sys_minherit_args /* {
		syscallarg(caddr_t) addr;
		syscallarg(int) len;
		syscallarg(int) inherit;
	} */ *uap = v;
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	vm_inherit_t inherit;
	int error;

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	inherit = SCARG(uap, inherit);

	/*
	 * align the address to a page boundary and adjust the size accordingly.
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	if ((int)size < 0)
		return (EINVAL);
	error = uvm_map_inherit(&p->p_vmspace->vm_map, addr, addr + size,
	    inherit);
	return error;
}

/*
 * sys_madvise: give advice about memory usage.
 */

/* ARGSUSED */
int
sys_madvise(l, v, retval)
	struct lwp *l;
	void *v;
	register_t *retval;
{
	struct sys_madvise_args /* {
		syscallarg(caddr_t) addr;
		syscallarg(size_t) len;
		syscallarg(int) behav;
	} */ *uap = v;
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	int advice, error;

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	advice = SCARG(uap, behav);

	/*
	 * align the address to a page boundary, and adjust the size accordingly
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	if ((ssize_t)size <= 0)
		return (EINVAL);

	switch (advice) {
	case MADV_NORMAL:
	case MADV_RANDOM:
	case MADV_SEQUENTIAL:
		error = uvm_map_advice(&p->p_vmspace->vm_map, addr, addr + size,
		    advice);
		break;

	case MADV_WILLNEED:

		/*
		 * Activate all these pages, pre-faulting them in if
		 * necessary.
		 */
		/*
		 * XXX IMPLEMENT ME.
		 * Should invent a "weak" mode for uvm_fault()
		 * which would only do the PGO_LOCKED pgo_get().
		 */

		return (0);

	case MADV_DONTNEED:

		/*
		 * Deactivate all these pages.  We don't need them
		 * any more.  We don't, however, toss the data in
		 * the pages.
		 */

		error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
		    PGO_DEACTIVATE);
		break;

	case MADV_FREE:

		/*
		 * These pages contain no valid data, and may be
		 * garbage-collected.  Toss all resources, including
		 * any swap space in use.
		 */

		error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
		    PGO_FREE);
		break;

	case MADV_SPACEAVAIL:

		/*
		 * XXXMRG What is this?  I think it's:
		 *
		 *	Ensure that we have allocated backing-store
		 *	for these pages.
		 *
		 * This is going to require changes to the page daemon,
		 * as it will free swap space allocated to pages in core.
		 * There is also the question of what to do for
		 * device/file/anonymous memory.
		 */

		return (EINVAL);

	default:
		return (EINVAL);
	}

	return error;
}

/*
 * sys_mlock: memory lock
 */

int
sys_mlock(l, v, retval)
	struct lwp *l;
	void *v;
	register_t *retval;
{
	struct sys_mlock_args /* {
		syscallarg(const void *) addr;
		syscallarg(size_t) len;
	} */ *uap = v;
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	int error;

	/*
	 * extract syscall args from uap
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);

	/*
	 * align the address to a page boundary and adjust the size accordingly
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	/* disallow wrap-around. */
	if (addr + size < addr)
		return (EINVAL);

	if (atop(size) + uvmexp.wired > uvmexp.wiredmax)
		return (EAGAIN);

#ifdef pmap_wired_count
	if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) >
	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur)
		return (EAGAIN);
#else
	if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
		return (error);
#endif

	error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, FALSE,
	    0);
	return error;
}

/*
 * sys_munlock: unlock wired pages
 */

int
sys_munlock(l, v, retval)
	struct lwp *l;
	void *v;
	register_t *retval;
{
	struct sys_munlock_args /* {
		syscallarg(const void *) addr;
		syscallarg(size_t) len;
	} */ *uap = v;
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	int error;

	/*
	 * extract syscall args from uap
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);

	/*
	 * align the address to a page boundary, and adjust the size accordingly
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	/* disallow wrap-around. */
	if (addr + size < addr)
		return (EINVAL);

#ifndef pmap_wired_count
	if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
		return (error);
#endif

	error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, TRUE,
	    0);
	return error;
}

/*
 * sys_mlockall: lock all pages mapped into an address space.
 */

int
sys_mlockall(l, v, retval)
	struct lwp *l;
	void *v;
	register_t *retval;
{
	struct sys_mlockall_args /* {
		syscallarg(int) flags;
	} */ *uap = v;
	struct proc *p = l->l_proc;
	int error, flags;

	flags = SCARG(uap, flags);

	if (flags == 0 ||
	    (flags & ~(MCL_CURRENT|MCL_FUTURE)) != 0)
		return (EINVAL);

#ifndef pmap_wired_count
	if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
		return (error);
#endif

	error = uvm_map_pageable_all(&p->p_vmspace->vm_map, flags,
	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
	return (error);
}

/*
 * sys_munlockall: unlock all pages mapped into an address space.
 */

int
sys_munlockall(l, v, retval)
	struct lwp *l;
	void *v;
	register_t *retval;
{
	struct proc *p = l->l_proc;

	(void) uvm_map_pageable_all(&p->p_vmspace->vm_map, 0, 0);
	return (0);
}

/*
 * uvm_mmap: internal version of mmap
 *
 * - used by sys_mmap, exec, and sysv shm
 * - handle is a vnode pointer or NULL for MAP_ANON (XXX: not true,
 *   sysv shm uses "named anonymous memory")
 * - caller must page-align the file offset
 */
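
/*
 * For reference, sys_mmap() above calls this roughly as:
 *
 *	error = uvm_mmap(&p->p_vmspace->vm_map, &addr, size, prot,
 *	    maxprot, flags, handle, pos,
 *	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
 *
 * where "handle" is the backing vnode for a file mapping (NULL for
 * MAP_ANON) and the RLIMIT_MEMLOCK limit is the "locklimit" used below
 * when VM_MAP_WIREFUTURE is set on the map.
 */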

int
uvm_mmap(map, addr, size, prot, maxprot, flags, handle, foff, locklimit)
	struct vm_map *map;
	vaddr_t *addr;
	vsize_t size;
	vm_prot_t prot, maxprot;
	int flags;
	void *handle;
	voff_t foff;
	vsize_t locklimit;
{
	struct uvm_object *uobj;
	struct vnode *vp;
	int error;
	int advice = UVM_ADV_NORMAL;
	uvm_flag_t uvmflag = 0;

	/*
	 * check params
	 */

	if (size == 0)
		return(0);
	if (foff & PAGE_MASK)
		return(EINVAL);
	if ((prot & maxprot) != prot)
		return(EINVAL);

	/*
	 * for non-fixed mappings, round off the suggested address.
	 * for fixed mappings, check alignment and zap old mappings.
	 */

	if ((flags & MAP_FIXED) == 0) {
		*addr = round_page(*addr);	/* round */
	} else {
		if (*addr & PAGE_MASK)
			return(EINVAL);
		uvmflag |= UVM_FLAG_FIXED;
		(void) uvm_unmap(map, *addr, *addr + size);	/* zap! */
	}

	/*
	 * handle anon vs. non-anon mappings.  for non-anon mappings attach
	 * to underlying vm object.
	 */

	if (flags & MAP_ANON) {
		foff = UVM_UNKNOWN_OFFSET;
		uobj = NULL;
		if ((flags & MAP_SHARED) == 0)
			/* XXX: defer amap create */
			uvmflag |= UVM_FLAG_COPYONW;
		else
			/* shared: create amap now */
			uvmflag |= UVM_FLAG_OVERLAY;

	} else {
		vp = (struct vnode *)handle;
		if (vp->v_type != VCHR) {
			uobj = uvn_attach((void *)vp, (flags & MAP_SHARED) ?
			    maxprot : (maxprot & ~VM_PROT_WRITE));

			/* XXX for now, attach doesn't gain a ref */
			VREF(vp);
		} else {
			uobj = udv_attach((void *) &vp->v_rdev,
			    (flags & MAP_SHARED) ? maxprot :
			    (maxprot & ~VM_PROT_WRITE), foff, size);
			/*
			 * XXX Some devices don't like to be mapped with
			 * XXX PROT_EXEC, but we don't really have a
			 * XXX better way of handling this, right now
			 */
			if (uobj == NULL && (prot & PROT_EXEC) == 0) {
				maxprot &= ~VM_PROT_EXECUTE;
				uobj = udv_attach((void *)&vp->v_rdev,
				    (flags & MAP_SHARED) ? maxprot :
				    (maxprot & ~VM_PROT_WRITE), foff, size);
			}
			advice = UVM_ADV_RANDOM;
		}
		if (uobj == NULL)
			return((vp->v_type == VREG) ? ENOMEM : EINVAL);
		if ((flags & MAP_SHARED) == 0)
			uvmflag |= UVM_FLAG_COPYONW;
	}

	uvmflag = UVM_MAPFLAG(prot, maxprot,
	    (flags & MAP_SHARED) ? UVM_INH_SHARE : UVM_INH_COPY,
	    advice, uvmflag);
	error = uvm_map(map, addr, size, uobj, foff, 0, uvmflag);
	if (error) {
		if (uobj)
			uobj->pgops->pgo_detach(uobj);
		return error;
	}

	/*
	 * POSIX 1003.1b -- if our address space was configured
	 * to lock all future mappings, wire the one we just made.
	 */

	if (prot == VM_PROT_NONE) {

		/*
		 * No more work to do in this case.
		 */

		return (0);
	}
	vm_map_lock(map);
	if (map->flags & VM_MAP_WIREFUTURE) {
		if ((atop(size) + uvmexp.wired) > uvmexp.wiredmax
#ifdef pmap_wired_count
		    || (locklimit != 0 && (size +
		    ptoa(pmap_wired_count(vm_map_pmap(map)))) >
		    locklimit)
#endif
		) {
			vm_map_unlock(map);
			uvm_unmap(map, *addr, *addr + size);
			return ENOMEM;
		}

		/*
		 * uvm_map_pageable() always returns the map unlocked.
		 */

		error = uvm_map_pageable(map, *addr, *addr + size,
		    FALSE, UVM_LK_ENTER);
		if (error) {
			uvm_unmap(map, *addr, *addr + size);
			return error;
		}
		return (0);
	}
	vm_map_unlock(map);
	return 0;
}