uvm_mmap.c revision 1.54.2.1 1 /* $NetBSD: uvm_mmap.c,v 1.54.2.1 2001/08/25 06:17:22 thorpej Exp $ */
2
3 /*
4 * Copyright (c) 1997 Charles D. Cranor and Washington University.
5 * Copyright (c) 1991, 1993 The Regents of the University of California.
6 * Copyright (c) 1988 University of Utah.
7 *
8 * All rights reserved.
9 *
10 * This code is derived from software contributed to Berkeley by
11 * the Systems Programming Group of the University of Utah Computer
12 * Science Department.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * 1. Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in the
21 * documentation and/or other materials provided with the distribution.
22 * 3. All advertising materials mentioning features or use of this software
23 * must display the following acknowledgement:
24 * This product includes software developed by the Charles D. Cranor,
25 * Washington University, University of California, Berkeley and
26 * its contributors.
27 * 4. Neither the name of the University nor the names of its contributors
28 * may be used to endorse or promote products derived from this software
29 * without specific prior written permission.
30 *
31 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41 * SUCH DAMAGE.
42 *
43 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
44 * @(#)vm_mmap.c 8.5 (Berkeley) 5/19/94
45 * from: Id: uvm_mmap.c,v 1.1.2.14 1998/01/05 21:04:26 chuck Exp
46 */
47
48 /*
49 * uvm_mmap.c: system call interface into VM system, plus kernel vm_mmap
50 * function.
51 */
52 #include <sys/param.h>
53 #include <sys/systm.h>
54 #include <sys/file.h>
55 #include <sys/filedesc.h>
56 #include <sys/resourcevar.h>
57 #include <sys/mman.h>
58 #include <sys/mount.h>
59 #include <sys/proc.h>
60 #include <sys/malloc.h>
61 #include <sys/vnode.h>
62 #include <sys/conf.h>
63 #include <sys/stat.h>
64
65 #include <miscfs/specfs/specdev.h>
66
67 #include <sys/syscallargs.h>
68
69 #include <uvm/uvm.h>
70 #include <uvm/uvm_device.h>
71 #include <uvm/uvm_vnode.h>
72
73
74 /*
75 * unimplemented VM system calls:
76 */
77
78 /*
79 * sys_sbrk: sbrk system call.
80 */
81
82 /* ARGSUSED */
83 int
84 sys_sbrk(p, v, retval)
85 struct proc *p;
86 void *v;
87 register_t *retval;
88 {
89 #if 0
90 struct sys_sbrk_args /* {
91 syscallarg(intptr_t) incr;
92 } */ *uap = v;
93 #endif
94
95 return (ENOSYS);
96 }
97
98 /*
99 * sys_sstk: sstk system call.
100 */
101
102 /* ARGSUSED */
103 int
104 sys_sstk(p, v, retval)
105 struct proc *p;
106 void *v;
107 register_t *retval;
108 {
109 #if 0
110 struct sys_sstk_args /* {
111 syscallarg(int) incr;
112 } */ *uap = v;
113 #endif
114
115 return (ENOSYS);
116 }
117
118 /*
119 * sys_mincore: determine if pages are in core or not.
120 */
121
122 /* ARGSUSED */
123 int
124 sys_mincore(p, v, retval)
125 struct proc *p;
126 void *v;
127 register_t *retval;
128 {
129 struct sys_mincore_args /* {
130 syscallarg(void *) addr;
131 syscallarg(size_t) len;
132 syscallarg(char *) vec;
133 } */ *uap = v;
134 struct vm_page *m;
135 char *vec, pgi;
136 struct uvm_object *uobj;
137 struct vm_amap *amap;
138 struct vm_anon *anon;
139 struct vm_map_entry *entry;
140 vaddr_t start, end, lim;
141 struct vm_map *map;
142 vsize_t len;
143 int error = 0, npgs;
144
145 map = &p->p_vmspace->vm_map;
146
147 start = (vaddr_t)SCARG(uap, addr);
148 len = SCARG(uap, len);
149 vec = SCARG(uap, vec);
150
151 if (start & PAGE_MASK)
152 return (EINVAL);
153 len = round_page(len);
154 end = start + len;
155 if (end <= start)
156 return (EINVAL);
157
158 npgs = len >> PAGE_SHIFT;
159
160 if (uvm_useracc(vec, npgs, B_WRITE) == FALSE)
161 return (EFAULT);
162
163 /*
164 * Lock down vec, so our returned status isn't outdated by
165 * storing the status byte for a page.
166 */
167
168 uvm_vslock(p, vec, npgs, VM_PROT_WRITE);
169 vm_map_lock_read(map);
170
171 if (uvm_map_lookup_entry(map, start, &entry) == FALSE) {
172 error = ENOMEM;
173 goto out;
174 }
175
176 for (/* nothing */;
177 entry != &map->header && entry->start < end;
178 entry = entry->next) {
179 KASSERT(!UVM_ET_ISSUBMAP(entry));
180 KASSERT(start >= entry->start);
181
182 /* Make sure there are no holes. */
183 if (entry->end < end &&
184 (entry->next == &map->header ||
185 entry->next->start > entry->end)) {
186 error = ENOMEM;
187 goto out;
188 }
189
190 lim = end < entry->end ? end : entry->end;
191
192 /*
193 * Special case for objects with no "real" pages. Those
194 * are always considered resident (mapped devices).
195 */
196
197 if (UVM_ET_ISOBJ(entry)) {
198 KASSERT(!UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj));
199 if (entry->object.uvm_obj->pgops->pgo_releasepg
200 == NULL) {
201 for (/* nothing */; start < lim;
202 start += PAGE_SIZE, vec++)
203 subyte(vec, 1);
204 continue;
205 }
206 }
207
208 amap = entry->aref.ar_amap; /* top layer */
209 uobj = entry->object.uvm_obj; /* bottom layer */
210
211 if (amap != NULL)
212 amap_lock(amap);
213 if (uobj != NULL)
214 simple_lock(&uobj->vmobjlock);
215
216 for (/* nothing */; start < lim; start += PAGE_SIZE, vec++) {
217 pgi = 0;
218 if (amap != NULL) {
219 /* Check the top layer first. */
220 anon = amap_lookup(&entry->aref,
221 start - entry->start);
222 /* Don't need to lock anon here. */
223 if (anon != NULL && anon->u.an_page != NULL) {
224
225 /*
226 * Anon has the page for this entry
227 * offset.
228 */
229
230 pgi = 1;
231 }
232 }
233 if (uobj != NULL && pgi == 0) {
234 /* Check the bottom layer. */
235 m = uvm_pagelookup(uobj,
236 entry->offset + (start - entry->start));
237 if (m != NULL) {
238
239 /*
240 * Object has the page for this entry
241 * offset.
242 */
243
244 pgi = 1;
245 }
246 }
247 (void) subyte(vec, pgi);
248 }
249 if (uobj != NULL)
250 simple_unlock(&uobj->vmobjlock);
251 if (amap != NULL)
252 amap_unlock(amap);
253 }
254
255 out:
256 vm_map_unlock_read(map);
257 uvm_vsunlock(p, SCARG(uap, vec), npgs);
258 return (error);
259 }
260
261 /*
262 * sys_mmap: mmap system call.
263 *
264 * => file offest and address may not be page aligned
265 * - if MAP_FIXED, offset and address must have remainder mod PAGE_SIZE
266 * - if address isn't page aligned the mapping starts at trunc_page(addr)
267 * and the return value is adjusted up by the page offset.
268 */
269
270 int
271 sys_mmap(p, v, retval)
272 struct proc *p;
273 void *v;
274 register_t *retval;
275 {
276 struct sys_mmap_args /* {
277 syscallarg(caddr_t) addr;
278 syscallarg(size_t) len;
279 syscallarg(int) prot;
280 syscallarg(int) flags;
281 syscallarg(int) fd;
282 syscallarg(long) pad;
283 syscallarg(off_t) pos;
284 } */ *uap = v;
285 vaddr_t addr;
286 struct vattr va;
287 off_t pos;
288 vsize_t size, pageoff;
289 vm_prot_t prot, maxprot;
290 int flags, fd;
291 vaddr_t vm_min_address = VM_MIN_ADDRESS;
292 struct filedesc *fdp = p->p_fd;
293 struct file *fp;
294 struct vnode *vp;
295 void *handle;
296 int error;
297
298 /*
299 * first, extract syscall args from the uap.
300 */
301
302 addr = (vaddr_t)SCARG(uap, addr);
303 size = (vsize_t)SCARG(uap, len);
304 prot = SCARG(uap, prot) & VM_PROT_ALL;
305 flags = SCARG(uap, flags);
306 fd = SCARG(uap, fd);
307 pos = SCARG(uap, pos);
308
309 /*
310 * Fixup the old deprecated MAP_COPY into MAP_PRIVATE, and
311 * validate the flags.
312 */
313 if (flags & MAP_COPY)
314 flags = (flags & ~MAP_COPY) | MAP_PRIVATE;
315 if ((flags & (MAP_SHARED|MAP_PRIVATE)) == (MAP_SHARED|MAP_PRIVATE))
316 return (EINVAL);
317
318 /*
319 * align file position and save offset. adjust size.
320 */
321
322 pageoff = (pos & PAGE_MASK);
323 pos -= pageoff;
324 size += pageoff; /* add offset */
325 size = (vsize_t)round_page(size); /* round up */
326 if ((ssize_t) size < 0)
327 return (EINVAL); /* don't allow wrap */
328
329 /*
330 * now check (MAP_FIXED) or get (!MAP_FIXED) the "addr"
331 */
332
333 if (flags & MAP_FIXED) {
334
335 /* ensure address and file offset are aligned properly */
336 addr -= pageoff;
337 if (addr & PAGE_MASK)
338 return (EINVAL);
339
340 if (VM_MAXUSER_ADDRESS > 0 &&
341 (addr + size) > VM_MAXUSER_ADDRESS)
342 return (EINVAL);
343 if (vm_min_address > 0 && addr < vm_min_address)
344 return (EINVAL);
345 if (addr > addr + size)
346 return (EINVAL); /* no wrapping! */
347
348 } else {
349
350 /*
351 * not fixed: make sure we skip over the largest possible heap.
352 * we will refine our guess later (e.g. to account for VAC, etc)
353 */
354
355 addr = MAX(addr, round_page((vaddr_t)p->p_vmspace->vm_daddr +
356 MAXDSIZ));
357 }
358
359 /*
360 * check for file mappings (i.e. not anonymous) and verify file.
361 */
362
363 if ((flags & MAP_ANON) == 0) {
364
365 if ((fp = fd_getfile(fdp, fd)) == NULL)
366 return (EBADF);
367
368 if (fp->f_type != DTYPE_VNODE)
369 return (ENODEV); /* only mmap vnodes! */
370 vp = (struct vnode *)fp->f_data; /* convert to vnode */
371
372 if (vp->v_type != VREG && vp->v_type != VCHR &&
373 vp->v_type != VBLK)
374 return (ENODEV); /* only REG/CHR/BLK support mmap */
375
376 if (vp->v_type == VREG && (pos + size) < pos)
377 return (EOVERFLOW); /* no offset wrapping */
378
379 /* special case: catch SunOS style /dev/zero */
380 if (vp->v_type == VCHR && iszerodev(vp->v_rdev)) {
381 flags |= MAP_ANON;
382 goto is_anon;
383 }
384
385 /*
386 * Old programs may not select a specific sharing type, so
387 * default to an appropriate one.
388 *
389 * XXX: how does MAP_ANON fit in the picture?
390 */
391 if ((flags & (MAP_SHARED|MAP_PRIVATE)) == 0) {
392 #if defined(DEBUG)
393 printf("WARNING: defaulted mmap() share type to "
394 "%s (pid %d comm %s)\n", vp->v_type == VCHR ?
395 "MAP_SHARED" : "MAP_PRIVATE", p->p_pid,
396 p->p_comm);
397 #endif
398 if (vp->v_type == VCHR)
399 flags |= MAP_SHARED; /* for a device */
400 else
401 flags |= MAP_PRIVATE; /* for a file */
402 }
403
404 /*
405 * MAP_PRIVATE device mappings don't make sense (and aren't
406 * supported anyway). However, some programs rely on this,
407 * so just change it to MAP_SHARED.
408 */
409 if (vp->v_type == VCHR && (flags & MAP_PRIVATE) != 0) {
410 flags = (flags & ~MAP_PRIVATE) | MAP_SHARED;
411 }
412
413 /*
414 * now check protection
415 */
416
417 maxprot = VM_PROT_EXECUTE;
418
419 /* check read access */
420 if (fp->f_flag & FREAD)
421 maxprot |= VM_PROT_READ;
422 else if (prot & PROT_READ)
423 return (EACCES);
424
425 /* check write access, shared case first */
426 if (flags & MAP_SHARED) {
427 /*
428 * if the file is writable, only add PROT_WRITE to
429 * maxprot if the file is not immutable, append-only.
430 * otherwise, if we have asked for PROT_WRITE, return
431 * EPERM.
432 */
433 if (fp->f_flag & FWRITE) {
434 if ((error =
435 VOP_GETATTR(vp, &va, p->p_ucred, p)))
436 return (error);
437 if ((va.va_flags & (IMMUTABLE|APPEND)) == 0)
438 maxprot |= VM_PROT_WRITE;
439 else if (prot & PROT_WRITE)
440 return (EPERM);
441 }
442 else if (prot & PROT_WRITE)
443 return (EACCES);
444 } else {
445 /* MAP_PRIVATE mappings can always write to */
446 maxprot |= VM_PROT_WRITE;
447 }
448 handle = vp;
449
450 } else { /* MAP_ANON case */
451 /*
452 * XXX What do we do about (MAP_SHARED|MAP_PRIVATE) == 0?
453 */
454 if (fd != -1)
455 return (EINVAL);
456
457 is_anon: /* label for SunOS style /dev/zero */
458 handle = NULL;
459 maxprot = VM_PROT_ALL;
460 pos = 0;
461 }
462
463 /*
464 * XXX (in)sanity check. We don't do proper datasize checking
465 * XXX for anonymous (or private writable) mmap(). However,
466 * XXX know that if we're trying to allocate more than the amount
467 * XXX remaining under our current data size limit, _that_ should
468 * XXX be disallowed.
469 */
470 if ((flags & MAP_ANON) != 0 ||
471 ((flags & MAP_PRIVATE) != 0 && (prot & PROT_WRITE) != 0)) {
472 if (size >
473 (p->p_rlimit[RLIMIT_DATA].rlim_cur -
474 ctob(p->p_vmspace->vm_dsize))) {
475 return (ENOMEM);
476 }
477 }
478
479 /*
480 * now let kernel internal function uvm_mmap do the work.
481 */
482
483 error = uvm_mmap(&p->p_vmspace->vm_map, &addr, size, prot, maxprot,
484 flags, handle, pos, p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
485
486 if (error == 0)
487 /* remember to add offset */
488 *retval = (register_t)(addr + pageoff);
489
490 return (error);
491 }
492
493 /*
494 * sys___msync13: the msync system call (a front-end for flush)
495 */
496
497 int
498 sys___msync13(p, v, retval)
499 struct proc *p;
500 void *v;
501 register_t *retval;
502 {
503 struct sys___msync13_args /* {
504 syscallarg(caddr_t) addr;
505 syscallarg(size_t) len;
506 syscallarg(int) flags;
507 } */ *uap = v;
508 vaddr_t addr;
509 vsize_t size, pageoff;
510 struct vm_map *map;
511 int error, rv, flags, uvmflags;
512
513 /*
514 * extract syscall args from the uap
515 */
516
517 addr = (vaddr_t)SCARG(uap, addr);
518 size = (vsize_t)SCARG(uap, len);
519 flags = SCARG(uap, flags);
520
521 /* sanity check flags */
522 if ((flags & ~(MS_ASYNC | MS_SYNC | MS_INVALIDATE)) != 0 ||
523 (flags & (MS_ASYNC | MS_SYNC | MS_INVALIDATE)) == 0 ||
524 (flags & (MS_ASYNC | MS_SYNC)) == (MS_ASYNC | MS_SYNC))
525 return (EINVAL);
526 if ((flags & (MS_ASYNC | MS_SYNC)) == 0)
527 flags |= MS_SYNC;
528
529 /*
530 * align the address to a page boundary and adjust the size accordingly.
531 */
532
533 pageoff = (addr & PAGE_MASK);
534 addr -= pageoff;
535 size += pageoff;
536 size = (vsize_t)round_page(size);
537
538 /* disallow wrap-around. */
539 if (addr + size < addr)
540 return (EINVAL);
541
542 /*
543 * get map
544 */
545
546 map = &p->p_vmspace->vm_map;
547
548 /*
549 * XXXCDC: do we really need this semantic?
550 *
551 * XXX Gak! If size is zero we are supposed to sync "all modified
552 * pages with the region containing addr". Unfortunately, we
553 * don't really keep track of individual mmaps so we approximate
554 * by flushing the range of the map entry containing addr.
555 * This can be incorrect if the region splits or is coalesced
556 * with a neighbor.
557 */
558
559 if (size == 0) {
560 struct vm_map_entry *entry;
561
562 vm_map_lock_read(map);
563 rv = uvm_map_lookup_entry(map, addr, &entry);
564 if (rv == TRUE) {
565 addr = entry->start;
566 size = entry->end - entry->start;
567 }
568 vm_map_unlock_read(map);
569 if (rv == FALSE)
570 return (EINVAL);
571 }
572
573 /*
574 * translate MS_ flags into PGO_ flags
575 */
576
577 uvmflags = PGO_CLEANIT;
578 if (flags & MS_INVALIDATE)
579 uvmflags |= PGO_FREE;
580 if (flags & MS_SYNC)
581 uvmflags |= PGO_SYNCIO;
582 else
583 uvmflags |= PGO_SYNCIO; /* XXXCDC: force sync for now! */
584
585 error = uvm_map_clean(map, addr, addr+size, uvmflags);
586 return error;
587 }
588
589 /*
590 * sys_munmap: unmap a users memory
591 */
592
593 int
594 sys_munmap(p, v, retval)
595 struct proc *p;
596 void *v;
597 register_t *retval;
598 {
599 struct sys_munmap_args /* {
600 syscallarg(caddr_t) addr;
601 syscallarg(size_t) len;
602 } */ *uap = v;
603 vaddr_t addr;
604 vsize_t size, pageoff;
605 struct vm_map *map;
606 vaddr_t vm_min_address = VM_MIN_ADDRESS;
607 struct vm_map_entry *dead_entries;
608
609 /*
610 * get syscall args.
611 */
612
613 addr = (vaddr_t)SCARG(uap, addr);
614 size = (vsize_t)SCARG(uap, len);
615
616 /*
617 * align the address to a page boundary and adjust the size accordingly.
618 */
619
620 pageoff = (addr & PAGE_MASK);
621 addr -= pageoff;
622 size += pageoff;
623 size = (vsize_t)round_page(size);
624
625 if ((int)size < 0)
626 return (EINVAL);
627 if (size == 0)
628 return (0);
629
630 /*
631 * Check for illegal addresses. Watch out for address wrap...
632 * Note that VM_*_ADDRESS are not constants due to casts (argh).
633 */
634 if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS)
635 return (EINVAL);
636 if (vm_min_address > 0 && addr < vm_min_address)
637 return (EINVAL);
638 if (addr > addr + size)
639 return (EINVAL);
640 map = &p->p_vmspace->vm_map;
641
642 /*
643 * interesting system call semantic: make sure entire range is
644 * allocated before allowing an unmap.
645 */
646
647 vm_map_lock(map);
648 if (!uvm_map_checkprot(map, addr, addr + size, VM_PROT_NONE)) {
649 vm_map_unlock(map);
650 return (EINVAL);
651 }
652 uvm_unmap_remove(map, addr, addr + size, &dead_entries);
653 vm_map_unlock(map);
654 if (dead_entries != NULL)
655 uvm_unmap_detach(dead_entries, 0);
656 return (0);
657 }
658
659 /*
660 * sys_mprotect: the mprotect system call
661 */
662
663 int
664 sys_mprotect(p, v, retval)
665 struct proc *p;
666 void *v;
667 register_t *retval;
668 {
669 struct sys_mprotect_args /* {
670 syscallarg(caddr_t) addr;
671 syscallarg(int) len;
672 syscallarg(int) prot;
673 } */ *uap = v;
674 vaddr_t addr;
675 vsize_t size, pageoff;
676 vm_prot_t prot;
677 int error;
678
679 /*
680 * extract syscall args from uap
681 */
682
683 addr = (vaddr_t)SCARG(uap, addr);
684 size = (vsize_t)SCARG(uap, len);
685 prot = SCARG(uap, prot) & VM_PROT_ALL;
686
687 /*
688 * align the address to a page boundary and adjust the size accordingly.
689 */
690
691 pageoff = (addr & PAGE_MASK);
692 addr -= pageoff;
693 size += pageoff;
694 size = (vsize_t)round_page(size);
695
696 if ((int)size < 0)
697 return (EINVAL);
698 error = uvm_map_protect(&p->p_vmspace->vm_map, addr, addr + size, prot,
699 FALSE);
700 return error;
701 }
702
703 /*
704 * sys_minherit: the minherit system call
705 */
706
707 int
708 sys_minherit(p, v, retval)
709 struct proc *p;
710 void *v;
711 register_t *retval;
712 {
713 struct sys_minherit_args /* {
714 syscallarg(caddr_t) addr;
715 syscallarg(int) len;
716 syscallarg(int) inherit;
717 } */ *uap = v;
718 vaddr_t addr;
719 vsize_t size, pageoff;
720 vm_inherit_t inherit;
721 int error;
722
723 addr = (vaddr_t)SCARG(uap, addr);
724 size = (vsize_t)SCARG(uap, len);
725 inherit = SCARG(uap, inherit);
726
727 /*
728 * align the address to a page boundary and adjust the size accordingly.
729 */
730
731 pageoff = (addr & PAGE_MASK);
732 addr -= pageoff;
733 size += pageoff;
734 size = (vsize_t)round_page(size);
735
736 if ((int)size < 0)
737 return (EINVAL);
738 error = uvm_map_inherit(&p->p_vmspace->vm_map, addr, addr + size,
739 inherit);
740 return error;
741 }
742
743 /*
744 * sys_madvise: give advice about memory usage.
745 */
746
747 /* ARGSUSED */
748 int
749 sys_madvise(p, v, retval)
750 struct proc *p;
751 void *v;
752 register_t *retval;
753 {
754 struct sys_madvise_args /* {
755 syscallarg(caddr_t) addr;
756 syscallarg(size_t) len;
757 syscallarg(int) behav;
758 } */ *uap = v;
759 vaddr_t addr;
760 vsize_t size, pageoff;
761 int advice, error;
762
763 addr = (vaddr_t)SCARG(uap, addr);
764 size = (vsize_t)SCARG(uap, len);
765 advice = SCARG(uap, behav);
766
767 /*
768 * align the address to a page boundary, and adjust the size accordingly
769 */
770
771 pageoff = (addr & PAGE_MASK);
772 addr -= pageoff;
773 size += pageoff;
774 size = (vsize_t)round_page(size);
775
776 if ((ssize_t)size <= 0)
777 return (EINVAL);
778
779 switch (advice) {
780 case MADV_NORMAL:
781 case MADV_RANDOM:
782 case MADV_SEQUENTIAL:
783 error = uvm_map_advice(&p->p_vmspace->vm_map, addr, addr + size,
784 advice);
785 break;
786
787 case MADV_WILLNEED:
788
789 /*
790 * Activate all these pages, pre-faulting them in if
791 * necessary.
792 */
793 /*
794 * XXX IMPLEMENT ME.
795 * Should invent a "weak" mode for uvm_fault()
796 * which would only do the PGO_LOCKED pgo_get().
797 */
798
799 return (0);
800
801 case MADV_DONTNEED:
802
803 /*
804 * Deactivate all these pages. We don't need them
805 * any more. We don't, however, toss the data in
806 * the pages.
807 */
808
809 error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
810 PGO_DEACTIVATE);
811 break;
812
813 case MADV_FREE:
814
815 /*
816 * These pages contain no valid data, and may be
817 * garbage-collected. Toss all resources, including
818 * any swap space in use.
819 */
820
821 error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
822 PGO_FREE);
823 break;
824
825 case MADV_SPACEAVAIL:
826
827 /*
828 * XXXMRG What is this? I think it's:
829 *
830 * Ensure that we have allocated backing-store
831 * for these pages.
832 *
833 * This is going to require changes to the page daemon,
834 * as it will free swap space allocated to pages in core.
835 * There's also what to do for device/file/anonymous memory.
836 */
837
838 return (EINVAL);
839
840 default:
841 return (EINVAL);
842 }
843
844 return error;
845 }
846
847 /*
848 * sys_mlock: memory lock
849 */
850
851 int
852 sys_mlock(p, v, retval)
853 struct proc *p;
854 void *v;
855 register_t *retval;
856 {
857 struct sys_mlock_args /* {
858 syscallarg(const void *) addr;
859 syscallarg(size_t) len;
860 } */ *uap = v;
861 vaddr_t addr;
862 vsize_t size, pageoff;
863 int error;
864
865 /*
866 * extract syscall args from uap
867 */
868
869 addr = (vaddr_t)SCARG(uap, addr);
870 size = (vsize_t)SCARG(uap, len);
871
872 /*
873 * align the address to a page boundary and adjust the size accordingly
874 */
875
876 pageoff = (addr & PAGE_MASK);
877 addr -= pageoff;
878 size += pageoff;
879 size = (vsize_t)round_page(size);
880
881 /* disallow wrap-around. */
882 if (addr + size < addr)
883 return (EINVAL);
884
885 if (atop(size) + uvmexp.wired > uvmexp.wiredmax)
886 return (EAGAIN);
887
888 #ifdef pmap_wired_count
889 if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) >
890 p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur)
891 return (EAGAIN);
892 #else
893 if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
894 return (error);
895 #endif
896
897 error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, FALSE,
898 0);
899 return error;
900 }
901
902 /*
903 * sys_munlock: unlock wired pages
904 */
905
906 int
907 sys_munlock(p, v, retval)
908 struct proc *p;
909 void *v;
910 register_t *retval;
911 {
912 struct sys_munlock_args /* {
913 syscallarg(const void *) addr;
914 syscallarg(size_t) len;
915 } */ *uap = v;
916 vaddr_t addr;
917 vsize_t size, pageoff;
918 int error;
919
920 /*
921 * extract syscall args from uap
922 */
923
924 addr = (vaddr_t)SCARG(uap, addr);
925 size = (vsize_t)SCARG(uap, len);
926
927 /*
928 * align the address to a page boundary, and adjust the size accordingly
929 */
930
931 pageoff = (addr & PAGE_MASK);
932 addr -= pageoff;
933 size += pageoff;
934 size = (vsize_t)round_page(size);
935
936 /* disallow wrap-around. */
937 if (addr + size < addr)
938 return (EINVAL);
939
940 #ifndef pmap_wired_count
941 if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
942 return (error);
943 #endif
944
945 error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, TRUE,
946 0);
947 return error;
948 }
949
950 /*
951 * sys_mlockall: lock all pages mapped into an address space.
952 */
953
954 int
955 sys_mlockall(p, v, retval)
956 struct proc *p;
957 void *v;
958 register_t *retval;
959 {
960 struct sys_mlockall_args /* {
961 syscallarg(int) flags;
962 } */ *uap = v;
963 int error, flags;
964
965 flags = SCARG(uap, flags);
966
967 if (flags == 0 ||
968 (flags & ~(MCL_CURRENT|MCL_FUTURE)) != 0)
969 return (EINVAL);
970
971 #ifndef pmap_wired_count
972 if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
973 return (error);
974 #endif
975
976 error = uvm_map_pageable_all(&p->p_vmspace->vm_map, flags,
977 p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
978 return (error);
979 }
980
981 /*
982 * sys_munlockall: unlock all pages mapped into an address space.
983 */
984
985 int
986 sys_munlockall(p, v, retval)
987 struct proc *p;
988 void *v;
989 register_t *retval;
990 {
991
992 (void) uvm_map_pageable_all(&p->p_vmspace->vm_map, 0, 0);
993 return (0);
994 }
995
996 /*
997 * uvm_mmap: internal version of mmap
998 *
999 * - used by sys_mmap, exec, and sysv shm
1000 * - handle is a vnode pointer or NULL for MAP_ANON (XXX: not true,
1001 * sysv shm uses "named anonymous memory")
1002 * - caller must page-align the file offset
1003 */
1004
1005 int
1006 uvm_mmap(map, addr, size, prot, maxprot, flags, handle, foff, locklimit)
1007 struct vm_map *map;
1008 vaddr_t *addr;
1009 vsize_t size;
1010 vm_prot_t prot, maxprot;
1011 int flags;
1012 void *handle;
1013 voff_t foff;
1014 vsize_t locklimit;
1015 {
1016 struct uvm_object *uobj;
1017 struct vnode *vp;
1018 int error;
1019 int advice = UVM_ADV_NORMAL;
1020 uvm_flag_t uvmflag = 0;
1021
1022 /*
1023 * check params
1024 */
1025
1026 if (size == 0)
1027 return(0);
1028 if (foff & PAGE_MASK)
1029 return(EINVAL);
1030 if ((prot & maxprot) != prot)
1031 return(EINVAL);
1032
1033 /*
1034 * for non-fixed mappings, round off the suggested address.
1035 * for fixed mappings, check alignment and zap old mappings.
1036 */
1037
1038 if ((flags & MAP_FIXED) == 0) {
1039 *addr = round_page(*addr); /* round */
1040 } else {
1041 if (*addr & PAGE_MASK)
1042 return(EINVAL);
1043 uvmflag |= UVM_FLAG_FIXED;
1044 (void) uvm_unmap(map, *addr, *addr + size); /* zap! */
1045 }
1046
1047 /*
1048 * handle anon vs. non-anon mappings. for non-anon mappings attach
1049 * to underlying vm object.
1050 */
1051
1052 if (flags & MAP_ANON) {
1053 foff = UVM_UNKNOWN_OFFSET;
1054 uobj = NULL;
1055 if ((flags & MAP_SHARED) == 0)
1056 /* XXX: defer amap create */
1057 uvmflag |= UVM_FLAG_COPYONW;
1058 else
1059 /* shared: create amap now */
1060 uvmflag |= UVM_FLAG_OVERLAY;
1061
1062 } else {
1063 vp = (struct vnode *)handle;
1064 if (vp->v_type != VCHR) {
1065 error = VOP_MMAP(vp, 0, curproc->p_ucred, curproc);
1066 if (error) {
1067 return error;
1068 }
1069
1070 uobj = uvn_attach((void *)vp, (flags & MAP_SHARED) ?
1071 maxprot : (maxprot & ~VM_PROT_WRITE));
1072
1073 /* XXX for now, attach doesn't gain a ref */
1074 VREF(vp);
1075 } else {
1076 uobj = udv_attach((void *) &vp->v_rdev,
1077 (flags & MAP_SHARED) ? maxprot :
1078 (maxprot & ~VM_PROT_WRITE), foff, size);
1079 /*
1080 * XXX Some devices don't like to be mapped with
1081 * XXX PROT_EXEC, but we don't really have a
1082 * XXX better way of handling this, right now
1083 */
1084 if (uobj == NULL && (prot & PROT_EXEC) == 0) {
1085 maxprot &= ~VM_PROT_EXECUTE;
1086 uobj = udv_attach((void *)&vp->v_rdev,
1087 (flags & MAP_SHARED) ? maxprot :
1088 (maxprot & ~VM_PROT_WRITE), foff, size);
1089 }
1090 advice = UVM_ADV_RANDOM;
1091 }
1092 if (uobj == NULL)
1093 return((vp->v_type == VREG) ? ENOMEM : EINVAL);
1094 if ((flags & MAP_SHARED) == 0)
1095 uvmflag |= UVM_FLAG_COPYONW;
1096 }
1097
1098 uvmflag = UVM_MAPFLAG(prot, maxprot,
1099 (flags & MAP_SHARED) ? UVM_INH_SHARE : UVM_INH_COPY,
1100 advice, uvmflag);
1101 error = uvm_map(map, addr, size, uobj, foff, 0, uvmflag);
1102 if (error) {
1103 if (uobj)
1104 uobj->pgops->pgo_detach(uobj);
1105 return error;
1106 }
1107
1108 /*
1109 * POSIX 1003.1b -- if our address space was configured
1110 * to lock all future mappings, wire the one we just made.
1111 */
1112
1113 if (prot == VM_PROT_NONE) {
1114
1115 /*
1116 * No more work to do in this case.
1117 */
1118
1119 return (0);
1120 }
1121 vm_map_lock(map);
1122 if (map->flags & VM_MAP_WIREFUTURE) {
1123 if ((atop(size) + uvmexp.wired) > uvmexp.wiredmax
1124 #ifdef pmap_wired_count
1125 || (locklimit != 0 && (size +
1126 ptoa(pmap_wired_count(vm_map_pmap(map)))) >
1127 locklimit)
1128 #endif
1129 ) {
1130 vm_map_unlock(map);
1131 uvm_unmap(map, *addr, *addr + size);
1132 return ENOMEM;
1133 }
1134
1135 /*
1136 * uvm_map_pageable() always returns the map unlocked.
1137 */
1138
1139 error = uvm_map_pageable(map, *addr, *addr + size,
1140 FALSE, UVM_LK_ENTER);
1141 if (error) {
1142 uvm_unmap(map, *addr, *addr + size);
1143 return error;
1144 }
1145 return (0);
1146 }
1147 vm_map_unlock(map);
1148 return 0;
1149 }
1150