/*  $OpenBSD: uvm_mmap.c,v 1.151 2018/08/15 20:22:13 kettenis Exp $ */
/*  $NetBSD: uvm_mmap.c,v 1.49 2001/02/18 21:19:08 chs Exp $    */

/*
 * Copyright (c) 1997 Charles D. Cranor and Washington University.
 * Copyright (c) 1991, 1993 The Regents of the University of California.
 * Copyright (c) 1988 University of Utah.
 *
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by Charles D. Cranor,
 *  Washington University, University of California, Berkeley and
 *  its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *      @(#)vm_mmap.c   8.5 (Berkeley) 5/19/94
 * from: Id: uvm_mmap.c,v 1.1.2.14 1998/01/05 21:04:26 chuck Exp
 */

/*
 * uvm_mmap.c: system call interface into VM system, plus kernel vm_mmap
 * function.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/resourcevar.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <sys/signalvar.h>
#include <sys/syslog.h>
#include <sys/stat.h>
#include <sys/specdev.h>
#include <sys/stdint.h>
#include <sys/pledge.h>
#include <sys/unistd.h>     /* for KBIND* */
#include <sys/user.h>

#include <machine/exec.h>   /* for __LDPGSZ */

#include <sys/syscallargs.h>

#include <uvm/uvm.h>
#include <uvm/uvm_device.h>
#include <uvm/uvm_vnode.h>

int uvm_mmapanon(vm_map_t, vaddr_t *, vsize_t, vm_prot_t, vm_prot_t, int,
    vsize_t, struct proc *);
int uvm_mmapfile(vm_map_t, vaddr_t *, vsize_t, vm_prot_t, vm_prot_t, int,
    struct vnode *, voff_t, vsize_t, struct proc *);


/*
 * Page align addr and size, returning EINVAL on wraparound.
 */
#define ALIGN_ADDR(addr, size, pageoff) do {                \
    pageoff = (addr & PAGE_MASK);                   \
    if (pageoff != 0) {                     \
        if (size > SIZE_MAX - pageoff)              \
            return (EINVAL);    /* wraparound */    \
        addr -= pageoff;                    \
        size += pageoff;                    \
    }                               \
    if (size != 0) {                        \
        size = (vsize_t)round_page(size);           \
        if (size == 0)                      \
            return (EINVAL);    /* wraparound */    \
    }                               \
} while (0)
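
/*
 * Worked example of ALIGN_ADDR (illustrative only, assuming
 * PAGE_SIZE == 0x1000): for addr = 0x1234 and size = 0x100,
 * pageoff = 0x234, addr becomes 0x1000 and size becomes 0x334,
 * which round_page() expands to 0x1000.  The aligned range
 * [0x1000, 0x2000) thus covers the original [0x1234, 0x1334).
 */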

/*
 * sys_mquery: provide mapping hints to applications that do fixed mappings
 *
 * flags: 0 or MAP_FIXED (MAP_FIXED - means that we insist on this addr and
 *  don't care about PMAP_PREFER or such)
 * addr: hint where we'd like to place the mapping.
 * size: size of the mapping
 * fd: fd of the file we want to map
 * off: offset within the file
 */
int
sys_mquery(struct proc *p, void *v, register_t *retval)
{
    struct sys_mquery_args /* {
        syscallarg(void *) addr;
        syscallarg(size_t) len;
        syscallarg(int) prot;
        syscallarg(int) flags;
        syscallarg(int) fd;
        syscallarg(long) pad;
        syscallarg(off_t) pos;
    } */ *uap = v;
    struct file *fp;
    voff_t uoff;
    int error;
    vaddr_t vaddr;
    int flags = 0;
    vsize_t size;
    vm_prot_t prot;
    int fd;

    vaddr = (vaddr_t) SCARG(uap, addr);
    prot = SCARG(uap, prot);
    size = (vsize_t) SCARG(uap, len);
    fd = SCARG(uap, fd);

    if ((prot & PROT_MASK) != prot)
        return (EINVAL);

    if (SCARG(uap, flags) & MAP_FIXED)
        flags |= UVM_FLAG_FIXED;

    if (fd >= 0) {
        if ((error = getvnode(p, fd, &fp)) != 0)
            return (error);
        uoff = SCARG(uap, pos);
    } else {
        fp = NULL;
        uoff = UVM_UNKNOWN_OFFSET;
    }

    if (vaddr == 0)
        vaddr = uvm_map_hint(p->p_vmspace, prot, VM_MIN_ADDRESS,
            VM_MAXUSER_ADDRESS);

    error = uvm_map_mquery(&p->p_vmspace->vm_map, &vaddr, size, uoff,
        flags);
    if (error == 0)
        *retval = (register_t)(vaddr);

    if (fp != NULL)
        FRELE(fp, p);
    return (error);
}
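
/*
 * Illustrative userland use of the interface above (a sketch, not part
 * of this file; assumes the mquery(2) wrapper declared in <sys/mman.h>):
 *
 *	void *want = (void *)0x10000000;
 *	void *va = mquery(want, 0x10000, PROT_READ, MAP_FIXED, -1, 0);
 *	if (va == MAP_FAILED)
 *		err(1, "mquery");	(that exact range is unavailable)
 *
 * Without MAP_FIXED, the kernel may instead return the closest suitable
 * address to the hint.
 */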

/*
 * sys_mincore: determine if pages are in core or not.
 */
/* ARGSUSED */
int
sys_mincore(struct proc *p, void *v, register_t *retval)
{
    struct sys_mincore_args /* {
        syscallarg(void *) addr;
        syscallarg(size_t) len;
        syscallarg(char *) vec;
    } */ *uap = v;
    vm_page_t m;
    char *vec, *pgi, *pgs;
    struct uvm_object *uobj;
    struct vm_amap *amap;
    struct vm_anon *anon;
    vm_map_entry_t entry, next;
    vaddr_t start, end, lim;
    vm_map_t map;
    vsize_t len, npgs;
    int error = 0;

    map = &p->p_vmspace->vm_map;

    start = (vaddr_t)SCARG(uap, addr);
    len = SCARG(uap, len);
    vec = SCARG(uap, vec);

    if (start & PAGE_MASK)
        return (EINVAL);
    len = round_page(len);
    end = start + len;
    if (end <= start)
        return (EINVAL);

    npgs = len >> PAGE_SHIFT;

    /*
     * < art> Anyone trying to mincore more than 4GB of address space is
     *  clearly insane.
     */
    if (npgs >= (0xffffffff >> PAGE_SHIFT))
        return (E2BIG);
    pgs = mallocarray(npgs, sizeof(*pgs), M_TEMP, M_WAITOK | M_CANFAIL);
    if (pgs == NULL)
        return (ENOMEM);
    pgi = pgs;

    /*
     * Lock down vec, so our returned status isn't outdated by
     * storing the status byte for a page.
     */
    if ((error = uvm_vslock(p, vec, npgs, PROT_WRITE)) != 0) {
        free(pgs, M_TEMP, npgs * sizeof(*pgs));
        return (error);
    }

    vm_map_lock_read(map);

    if (uvm_map_lookup_entry(map, start, &entry) == FALSE) {
        error = ENOMEM;
        goto out;
    }

    for (/* nothing */;
         entry != NULL && entry->start < end;
         entry = RBT_NEXT(uvm_map_addr, entry)) {
        KASSERT(!UVM_ET_ISSUBMAP(entry));
        KASSERT(start >= entry->start);

        /* Make sure there are no holes. */
        next = RBT_NEXT(uvm_map_addr, entry);
        if (entry->end < end &&
             (next == NULL ||
              next->start > entry->end)) {
            error = ENOMEM;
            goto out;
        }

        lim = end < entry->end ? end : entry->end;

        /*
         * Special case for objects with no "real" pages.  Those
         * are always considered resident (mapped devices).
         */
        if (UVM_ET_ISOBJ(entry)) {
            KASSERT(!UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj));
            if (entry->object.uvm_obj->pgops->pgo_fault != NULL) {
                for (/* nothing */; start < lim;
                     start += PAGE_SIZE, pgi++)
                    *pgi = 1;
                continue;
            }
        }

        amap = entry->aref.ar_amap; /* top layer */
        uobj = entry->object.uvm_obj;   /* bottom layer */

        for (/* nothing */; start < lim; start += PAGE_SIZE, pgi++) {
            *pgi = 0;
            if (amap != NULL) {
                /* Check the top layer first. */
                anon = amap_lookup(&entry->aref,
                    start - entry->start);
                if (anon != NULL && anon->an_page != NULL) {
                    /*
                     * Anon has the page for this entry
                     * offset.
                     */
                    *pgi = 1;
                }
            }

            if (uobj != NULL && *pgi == 0) {
                /* Check the bottom layer. */
                m = uvm_pagelookup(uobj,
                    entry->offset + (start - entry->start));
                if (m != NULL) {
                    /*
                     * Object has the page for this entry
                     * offset.
                     */
                    *pgi = 1;
                }
            }
        }
    }

 out:
    vm_map_unlock_read(map);
    uvm_vsunlock(p, SCARG(uap, vec), npgs);
    /* now the map is unlocked we can copyout without fear. */
    if (error == 0)
        error = copyout(pgs, vec, npgs * sizeof(char));
    free(pgs, M_TEMP, npgs * sizeof(*pgs));
    return (error);
}
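
/*
 * Illustrative userland use (a sketch, not part of this file): one
 * status byte is returned per page, non-zero meaning the page was
 * resident when the call was made.  "base" and "npages" are
 * placeholders.
 *
 *	char *vec = malloc(npages);
 *	if (mincore(base, npages * getpagesize(), vec) == 0) {
 *		for (i = 0; i < npages; i++)
 *			if (vec[i])
 *				resident++;
 *	}
 */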

int uvm_wxabort;

/*
 * W^X violations are only allowed on permitted filesystems.
 */
static inline int
uvm_wxcheck(struct proc *p, char *call)
{
    struct process *pr = p->p_p;
    int wxallowed = (pr->ps_textvp->v_mount &&
        (pr->ps_textvp->v_mount->mnt_flag & MNT_WXALLOWED));

    if (wxallowed && (pr->ps_flags & PS_WXNEEDED))
        return (0);

    /* Report W^X failures, and potentially SIGABRT */
    if (pr->ps_wxcounter++ == 0)
        log(LOG_NOTICE, "%s(%d): %s W^X violation\n",
            pr->ps_comm, pr->ps_pid, call);

    /* Send uncatchable SIGABRT for coredump */
    if (uvm_wxabort)
        sigexit(p, SIGABRT);

    return (ENOTSUP);
}
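
/*
 * Example trigger (illustrative): a call such as
 *
 *	mmap(NULL, n, PROT_READ|PROT_WRITE|PROT_EXEC,
 *	    MAP_ANON|MAP_PRIVATE, -1, 0)
 *
 * lands in uvm_wxcheck() above and fails with ENOTSUP unless the
 * running executable was marked wxneeded and resides on a filesystem
 * mounted with the wxallowed option.
 */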

/*
 * sys_mmap: mmap system call.
 *
 * => file offset and address may not be page aligned
 *    - if MAP_FIXED, offset and address must have the same remainder
 *      mod PAGE_SIZE
 *    - if address isn't page aligned the mapping starts at trunc_page(addr)
 *      and the return value is adjusted up by the page offset.
 */
int
sys_mmap(struct proc *p, void *v, register_t *retval)
{
    struct sys_mmap_args /* {
        syscallarg(void *) addr;
        syscallarg(size_t) len;
        syscallarg(int) prot;
        syscallarg(int) flags;
        syscallarg(int) fd;
        syscallarg(long) pad;
        syscallarg(off_t) pos;
    } */ *uap = v;
    vaddr_t addr;
    struct vattr va;
    off_t pos;
    vsize_t size, pageoff;
    vm_prot_t prot, maxprot;
    int flags, fd;
    vaddr_t vm_min_address = VM_MIN_ADDRESS;
    struct filedesc *fdp = p->p_fd;
    struct file *fp = NULL;
    struct vnode *vp;
    int error;

    /* first, extract syscall args from the uap. */
    addr = (vaddr_t) SCARG(uap, addr);
    size = (vsize_t) SCARG(uap, len);
    prot = SCARG(uap, prot);
    flags = SCARG(uap, flags);
    fd = SCARG(uap, fd);
    pos = SCARG(uap, pos);

    /*
     * Validate the flags.
     */
    if ((prot & PROT_MASK) != prot)
        return (EINVAL);
    if ((prot & (PROT_WRITE | PROT_EXEC)) == (PROT_WRITE | PROT_EXEC) &&
        (error = uvm_wxcheck(p, "mmap")))
        return (error);

    if ((flags & MAP_FLAGMASK) != flags)
        return (EINVAL);
    if ((flags & (MAP_SHARED|MAP_PRIVATE)) == (MAP_SHARED|MAP_PRIVATE))
        return (EINVAL);
    if ((flags & (MAP_FIXED|__MAP_NOREPLACE)) == __MAP_NOREPLACE)
        return (EINVAL);
    if (flags & MAP_STACK) {
        if ((flags & (MAP_ANON|MAP_PRIVATE)) != (MAP_ANON|MAP_PRIVATE))
            return (EINVAL);
        if (flags & ~(MAP_STACK|MAP_FIXED|MAP_ANON|MAP_PRIVATE))
            return (EINVAL);
        if (pos != 0)
            return (EINVAL);
        if ((prot & (PROT_READ|PROT_WRITE)) != (PROT_READ|PROT_WRITE))
            return (EINVAL);
    }
    if (size == 0)
        return (EINVAL);

    error = pledge_protexec(p, prot);
    if (error)
        return (error);

    /* align file position and save offset.  adjust size. */
    ALIGN_ADDR(pos, size, pageoff);

    /* now check (MAP_FIXED) or get (!MAP_FIXED) the "addr" */
    if (flags & MAP_FIXED) {
        /* adjust address by the same amount as we did the offset */
        addr -= pageoff;
        if (addr & PAGE_MASK)
            return (EINVAL);        /* not page aligned */

        if (addr > SIZE_MAX - size)
            return (EINVAL);        /* no wrapping! */
        if (VM_MAXUSER_ADDRESS > 0 &&
            (addr + size) > VM_MAXUSER_ADDRESS)
            return (EINVAL);
        if (vm_min_address > 0 && addr < vm_min_address)
            return (EINVAL);

    }

    /* check for file mappings (i.e. not anonymous) and verify file. */
    if ((flags & MAP_ANON) == 0) {
        if ((fp = fd_getfile(fdp, fd)) == NULL)
            return (EBADF);

        if (fp->f_type != DTYPE_VNODE) {
            error = ENODEV;     /* only mmap vnodes! */
            goto out;
        }
        vp = (struct vnode *)fp->f_data;    /* convert to vnode */

        if (vp->v_type != VREG && vp->v_type != VCHR &&
            vp->v_type != VBLK) {
            error = ENODEV; /* only REG/CHR/BLK support mmap */
            goto out;
        }

        if (vp->v_type == VREG && (pos + size) < pos) {
            error = EINVAL;     /* no offset wrapping */
            goto out;
        }

        /* special case: catch SunOS style /dev/zero */
        if (vp->v_type == VCHR && iszerodev(vp->v_rdev)) {
            flags |= MAP_ANON;
            FRELE(fp, p);
            fp = NULL;
            goto is_anon;
        }

        /*
         * Old programs may not select a specific sharing type, so
         * default to an appropriate one.
         */
        if ((flags & (MAP_SHARED|MAP_PRIVATE)) == 0) {
#if defined(DEBUG)
            printf("WARNING: defaulted mmap() share type to"
                " %s (pid %d comm %s)\n",
                vp->v_type == VCHR ? "MAP_SHARED" : "MAP_PRIVATE",
                p->p_p->ps_pid, p->p_p->ps_comm);
#endif
            if (vp->v_type == VCHR)
                flags |= MAP_SHARED;    /* for a device */
            else
                flags |= MAP_PRIVATE;   /* for a file */
        }

        /*
         * MAP_PRIVATE device mappings don't make sense (and aren't
         * supported anyway).  However, some programs rely on this,
         * so just change it to MAP_SHARED.
         */
        if (vp->v_type == VCHR && (flags & MAP_PRIVATE) != 0) {
            flags = (flags & ~MAP_PRIVATE) | MAP_SHARED;
        }

        /* now check protection */
        maxprot = PROT_EXEC;

        /* check read access */
        if (fp->f_flag & FREAD)
            maxprot |= PROT_READ;
        else if (prot & PROT_READ) {
            error = EACCES;
            goto out;
        }

        /* check write access, shared case first */
        if (flags & MAP_SHARED) {
            /*
             * if the file is writable, only add PROT_WRITE to
             * maxprot if the file is not immutable, append-only.
             * otherwise, if we have asked for PROT_WRITE, return
             * EPERM.
             */
            if (fp->f_flag & FWRITE) {
                KERNEL_LOCK();
                error = VOP_GETATTR(vp, &va, p->p_ucred, p);
                KERNEL_UNLOCK();
                if (error)
                    goto out;
                if ((va.va_flags & (IMMUTABLE|APPEND)) == 0)
                    maxprot |= PROT_WRITE;
                else if (prot & PROT_WRITE) {
                    error = EPERM;
                    goto out;
                }
            } else if (prot & PROT_WRITE) {
                error = EACCES;
                goto out;
            }
        } else {
            /* MAP_PRIVATE mappings can always be written to (copy-on-write) */
            maxprot |= PROT_WRITE;
        }
        if ((flags & __MAP_NOFAULT) != 0 ||
            ((flags & MAP_PRIVATE) != 0 && (prot & PROT_WRITE) != 0)) {
            if (p->p_rlimit[RLIMIT_DATA].rlim_cur < size ||
                p->p_rlimit[RLIMIT_DATA].rlim_cur - size <
                ptoa(p->p_vmspace->vm_dused)) {
                error = ENOMEM;
                goto out;
            }
        }
        KERNEL_LOCK();
        error = uvm_mmapfile(&p->p_vmspace->vm_map, &addr, size, prot, maxprot,
            flags, vp, pos, p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur, p);
        KERNEL_UNLOCK();
    } else {        /* MAP_ANON case */
        if (fd != -1)
            return EINVAL;

is_anon:    /* label for SunOS style /dev/zero */

        /* __MAP_NOFAULT only makes sense with a backing object */
        if ((flags & __MAP_NOFAULT) != 0)
            return EINVAL;

        if (p->p_rlimit[RLIMIT_DATA].rlim_cur < size ||
            p->p_rlimit[RLIMIT_DATA].rlim_cur - size <
            ptoa(p->p_vmspace->vm_dused)) {
            return ENOMEM;
        }

        /*
         * We've been treating (MAP_SHARED|MAP_PRIVATE) == 0 as
         * MAP_PRIVATE, so make that clear.
         */
        if ((flags & MAP_SHARED) == 0)
            flags |= MAP_PRIVATE;

        maxprot = PROT_MASK;
        error = uvm_mmapanon(&p->p_vmspace->vm_map, &addr, size, prot,
            maxprot, flags, p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur, p);
    }

    if (error == 0)
        /* remember to add offset */
        *retval = (register_t)(addr + pageoff);

out:
    if (fp)
        FRELE(fp, p);
    return (error);
}
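
/*
 * Worked example of the alignment rules above (illustrative, assuming
 * PAGE_SIZE == 0x1000): a non-MAP_FIXED request to map 0x100 bytes at
 * file offset 0x1234 is expanded by ALIGN_ADDR() to one whole page at
 * offset 0x1000, and the returned pointer is the mapping's base plus
 * pageoff (0x234), so it still refers to file offset 0x1234.
 */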

/*
 * sys_msync: the msync system call (a front-end for flush)
 */

int
sys_msync(struct proc *p, void *v, register_t *retval)
{
    struct sys_msync_args /* {
        syscallarg(void *) addr;
        syscallarg(size_t) len;
        syscallarg(int) flags;
    } */ *uap = v;
    vaddr_t addr;
    vsize_t size, pageoff;
    vm_map_t map;
    int flags, uvmflags;

    /* extract syscall args from the uap */
    addr = (vaddr_t)SCARG(uap, addr);
    size = (vsize_t)SCARG(uap, len);
    flags = SCARG(uap, flags);

    /* sanity check flags */
    if ((flags & ~(MS_ASYNC | MS_SYNC | MS_INVALIDATE)) != 0 ||
            (flags & (MS_ASYNC | MS_SYNC | MS_INVALIDATE)) == 0 ||
            (flags & (MS_ASYNC | MS_SYNC)) == (MS_ASYNC | MS_SYNC))
        return (EINVAL);
    if ((flags & (MS_ASYNC | MS_SYNC)) == 0)
        flags |= MS_SYNC;

    /* align the address to a page boundary, and adjust the size accordingly */
    ALIGN_ADDR(addr, size, pageoff);
    if (addr > SIZE_MAX - size)
        return (EINVAL);        /* disallow wrap-around. */

    /* get map */
    map = &p->p_vmspace->vm_map;

    /* translate MS_ flags into PGO_ flags */
    uvmflags = PGO_CLEANIT;
    if (flags & MS_INVALIDATE)
        uvmflags |= PGO_FREE;
    if (flags & MS_SYNC)
        uvmflags |= PGO_SYNCIO;
    else
        uvmflags |= PGO_SYNCIO;  /* XXXCDC: force sync for now! */

    return (uvm_map_clean(map, addr, addr+size, uvmflags));
}

/*
 * sys_munmap: unmap a user's memory
 */
int
sys_munmap(struct proc *p, void *v, register_t *retval)
{
    struct sys_munmap_args /* {
        syscallarg(void *) addr;
        syscallarg(size_t) len;
    } */ *uap = v;
    vaddr_t addr;
    vsize_t size, pageoff;
    vm_map_t map;
    vaddr_t vm_min_address = VM_MIN_ADDRESS;
    struct uvm_map_deadq dead_entries;

    /* get syscall args... */
    addr = (vaddr_t) SCARG(uap, addr);
    size = (vsize_t) SCARG(uap, len);

    /* align address to a page boundary, and adjust size accordingly */
    ALIGN_ADDR(addr, size, pageoff);

    /*
     * Check for illegal addresses.  Watch out for address wrap...
     * Note that VM_*_ADDRESS are not constants due to casts (argh).
     */
    if (addr > SIZE_MAX - size)
        return (EINVAL);
    if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS)
        return (EINVAL);
    if (vm_min_address > 0 && addr < vm_min_address)
        return (EINVAL);
    map = &p->p_vmspace->vm_map;


    vm_map_lock(map);   /* lock map so we can checkprot */

    /*
     * interesting system call semantic: make sure entire range is
     * allocated before allowing an unmap.
     */
    if (!uvm_map_checkprot(map, addr, addr + size, PROT_NONE)) {
        vm_map_unlock(map);
        return (EINVAL);
    }

    TAILQ_INIT(&dead_entries);
    uvm_unmap_remove(map, addr, addr + size, &dead_entries, FALSE, TRUE);
    vm_map_unlock(map); /* and unlock */

    uvm_unmap_detach(&dead_entries, 0);

    return (0);
}

/*
 * sys_mprotect: the mprotect system call
 */
int
sys_mprotect(struct proc *p, void *v, register_t *retval)
{
    struct sys_mprotect_args /* {
        syscallarg(void *) addr;
        syscallarg(size_t) len;
        syscallarg(int) prot;
    } */ *uap = v;
    vaddr_t addr;
    vsize_t size, pageoff;
    vm_prot_t prot;
    int error;

    /*
     * extract syscall args from uap
     */

    addr = (vaddr_t)SCARG(uap, addr);
    size = (vsize_t)SCARG(uap, len);
    prot = SCARG(uap, prot);

    if ((prot & PROT_MASK) != prot)
        return (EINVAL);
    if ((prot & (PROT_WRITE | PROT_EXEC)) == (PROT_WRITE | PROT_EXEC) &&
        (error = uvm_wxcheck(p, "mprotect")))
        return (error);

    error = pledge_protexec(p, prot);
    if (error)
        return (error);

    /*
     * align the address to a page boundary, and adjust the size accordingly
     */
    ALIGN_ADDR(addr, size, pageoff);
    if (addr > SIZE_MAX - size)
        return (EINVAL);        /* disallow wrap-around. */

    return (uvm_map_protect(&p->p_vmspace->vm_map, addr, addr+size,
        prot, FALSE));
}

/*
 * sys_minherit: the minherit system call
 */
int
sys_minherit(struct proc *p, void *v, register_t *retval)
{
    struct sys_minherit_args /* {
        syscallarg(void *) addr;
        syscallarg(size_t) len;
        syscallarg(int) inherit;
    } */ *uap = v;
    vaddr_t addr;
    vsize_t size, pageoff;
    vm_inherit_t inherit;

    addr = (vaddr_t)SCARG(uap, addr);
    size = (vsize_t)SCARG(uap, len);
    inherit = SCARG(uap, inherit);

    /*
     * align the address to a page boundary, and adjust the size accordingly
     */
    ALIGN_ADDR(addr, size, pageoff);
    if (addr > SIZE_MAX - size)
        return (EINVAL);        /* disallow wrap-around. */

    return (uvm_map_inherit(&p->p_vmspace->vm_map, addr, addr+size,
        inherit));
}

/*
 * sys_madvise: give advice about memory usage.
 */
/* ARGSUSED */
int
sys_madvise(struct proc *p, void *v, register_t *retval)
{
    struct sys_madvise_args /* {
        syscallarg(void *) addr;
        syscallarg(size_t) len;
        syscallarg(int) behav;
    } */ *uap = v;
    vaddr_t addr;
    vsize_t size, pageoff;
    int advice, error;

    addr = (vaddr_t)SCARG(uap, addr);
    size = (vsize_t)SCARG(uap, len);
    advice = SCARG(uap, behav);

    /*
     * align the address to a page boundary, and adjust the size accordingly
     */
    ALIGN_ADDR(addr, size, pageoff);
    if (addr > SIZE_MAX - size)
        return (EINVAL);        /* disallow wrap-around. */

    switch (advice) {
    case MADV_NORMAL:
    case MADV_RANDOM:
    case MADV_SEQUENTIAL:
        error = uvm_map_advice(&p->p_vmspace->vm_map, addr,
            addr + size, advice);
        break;

    case MADV_WILLNEED:
        /*
         * Activate all these pages, pre-faulting them in if
         * necessary.
         */
        /*
         * XXX IMPLEMENT ME.
         * Should invent a "weak" mode for uvm_fault()
         * which would only do the PGO_LOCKED pgo_get().
         */
        return (0);

    case MADV_DONTNEED:
        /*
         * Deactivate all these pages.  We don't need them
         * any more.  We don't, however, toss the data in
         * the pages.
         */
        error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
            PGO_DEACTIVATE);
        break;

    case MADV_FREE:
        /*
         * These pages contain no valid data, and may be
         * garbage-collected.  Toss all resources, including
         * any swap space in use.
         */
        error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
            PGO_FREE);
        break;

    case MADV_SPACEAVAIL:
        /*
         * XXXMRG What is this?  I think it's:
         *
         *  Ensure that we have allocated backing-store
         *  for these pages.
         *
         * This is going to require changes to the page daemon,
         * as it will free swap space allocated to pages in core.
         * There's also what to do for device/file/anonymous memory.
         */
        return (EINVAL);

    default:
        return (EINVAL);
    }

    return (error);
}

/*
 * sys_mlock: memory lock
 */

int
sys_mlock(struct proc *p, void *v, register_t *retval)
{
    struct sys_mlock_args /* {
        syscallarg(const void *) addr;
        syscallarg(size_t) len;
    } */ *uap = v;
    vaddr_t addr;
    vsize_t size, pageoff;
    int error;

    /* extract syscall args from uap */
    addr = (vaddr_t)SCARG(uap, addr);
    size = (vsize_t)SCARG(uap, len);

    /* align address to a page boundary and adjust size accordingly */
    ALIGN_ADDR(addr, size, pageoff);
    if (addr > SIZE_MAX - size)
        return (EINVAL);        /* disallow wrap-around. */

    if (atop(size) + uvmexp.wired > uvmexp.wiredmax)
        return (EAGAIN);

#ifdef pmap_wired_count
    if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) >
            p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur)
        return (EAGAIN);
#else
    if ((error = suser(p)) != 0)
        return (error);
#endif

    error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, FALSE,
        0);
    return (error == 0 ? 0 : ENOMEM);
}

/*
 * sys_munlock: unlock wired pages
 */

int
sys_munlock(struct proc *p, void *v, register_t *retval)
{
    struct sys_munlock_args /* {
        syscallarg(const void *) addr;
        syscallarg(size_t) len;
    } */ *uap = v;
    vaddr_t addr;
    vsize_t size, pageoff;
    int error;

    /* extract syscall args from uap */
    addr = (vaddr_t)SCARG(uap, addr);
    size = (vsize_t)SCARG(uap, len);

    /* align address to a page boundary, and adjust size accordingly */
    ALIGN_ADDR(addr, size, pageoff);
    if (addr > SIZE_MAX - size)
        return (EINVAL);        /* disallow wrap-around. */

#ifndef pmap_wired_count
    if ((error = suser(p)) != 0)
        return (error);
#endif

    error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, TRUE,
        0);
    return (error == 0 ? 0 : ENOMEM);
}

/*
 * sys_mlockall: lock all pages mapped into an address space.
 */
int
sys_mlockall(struct proc *p, void *v, register_t *retval)
{
    struct sys_mlockall_args /* {
        syscallarg(int) flags;
    } */ *uap = v;
    int error, flags;

    flags = SCARG(uap, flags);

    if (flags == 0 ||
        (flags & ~(MCL_CURRENT|MCL_FUTURE)) != 0)
        return (EINVAL);

#ifndef pmap_wired_count
    if ((error = suser(p)) != 0)
        return (error);
#endif

    error = uvm_map_pageable_all(&p->p_vmspace->vm_map, flags,
        p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
    if (error != 0 && error != ENOMEM)
        return (EAGAIN);
    return (error);
}

/*
 * sys_munlockall: unlock all pages mapped into an address space.
 */
int
sys_munlockall(struct proc *p, void *v, register_t *retval)
{

    (void) uvm_map_pageable_all(&p->p_vmspace->vm_map, 0, 0);
    return (0);
}

/*
 * common code for mmapanon and mmapfile to lock a mapping
 */
int
uvm_mmaplock(vm_map_t map, vaddr_t *addr, vsize_t size, vm_prot_t prot,
    vsize_t locklimit)
{
    int error;

    /*
     * POSIX 1003.1b -- if our address space was configured
     * to lock all future mappings, wire the one we just made.
     */
    if (prot == PROT_NONE) {
        /*
         * No more work to do in this case.
         */
        return (0);
    }

    vm_map_lock(map);
    if (map->flags & VM_MAP_WIREFUTURE) {
        KERNEL_LOCK();
        if ((atop(size) + uvmexp.wired) > uvmexp.wiredmax
#ifdef pmap_wired_count
            || (locklimit != 0 && (size +
             ptoa(pmap_wired_count(vm_map_pmap(map)))) >
            locklimit)
#endif
        ) {
            error = ENOMEM;
            vm_map_unlock(map);
            /* unmap the region! */
            uvm_unmap(map, *addr, *addr + size);
            KERNEL_UNLOCK();
            return (error);
        }
        /*
         * uvm_map_pageable() always returns the map
         * unlocked.
         */
        error = uvm_map_pageable(map, *addr, *addr + size,
            FALSE, UVM_LK_ENTER);
        if (error != 0) {
            /* unmap the region! */
            uvm_unmap(map, *addr, *addr + size);
            KERNEL_UNLOCK();
            return (error);
        }
        KERNEL_UNLOCK();
        return (0);
    }
    vm_map_unlock(map);
    return (0);
}
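
/*
 * Illustrative tie-in (an assumption about uvm_map_pageable_all(),
 * which lives elsewhere): after a process calls mlockall(MCL_FUTURE),
 * VM_MAP_WIREFUTURE is set on its map, so each later mmap() passes
 * through the wiring path above and its pages are faulted in and
 * locked immediately.
 */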

/*
 * uvm_mmapanon: internal version of mmap for anons
 *
 * - used by sys_mmap
 */
int
uvm_mmapanon(vm_map_t map, vaddr_t *addr, vsize_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags, vsize_t locklimit, struct proc *p)
{
    int error;
    int advice = MADV_NORMAL;
    unsigned int uvmflag = 0;
    vsize_t align = 0;  /* userland page size */

    /*
     * for non-fixed mappings, round off the suggested address.
     * for fixed mappings, check alignment and zap old mappings.
     */
    if ((flags & MAP_FIXED) == 0) {
        *addr = round_page(*addr);  /* round */
    } else {
        if (*addr & PAGE_MASK)
            return (EINVAL);

        uvmflag |= UVM_FLAG_FIXED;
        if ((flags & __MAP_NOREPLACE) == 0)
            uvmflag |= UVM_FLAG_UNMAP;
    }

    if ((flags & MAP_FIXED) == 0 && size >= __LDPGSZ)
        align = __LDPGSZ;
    if ((flags & MAP_SHARED) == 0)
        /* XXX: defer amap create */
        uvmflag |= UVM_FLAG_COPYONW;
    else
        /* shared: create amap now */
        uvmflag |= UVM_FLAG_OVERLAY;
    if (flags & MAP_STACK)
        uvmflag |= UVM_FLAG_STACK;

    /* set up mapping flags */
    uvmflag = UVM_MAPFLAG(prot, maxprot,
        (flags & MAP_SHARED) ? MAP_INHERIT_SHARE : MAP_INHERIT_COPY,
        advice, uvmflag);

    error = uvm_mapanon(map, addr, size, align, uvmflag);

    if (error == 0)
        error = uvm_mmaplock(map, addr, size, prot, locklimit);
    return error;
}

/*
 * uvm_mmapfile: internal version of mmap for non-anons
 *
 * - used by sys_mmap
 * - caller must page-align the file offset
 */
int
uvm_mmapfile(vm_map_t map, vaddr_t *addr, vsize_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags, struct vnode *vp, voff_t foff,
    vsize_t locklimit, struct proc *p)
{
    struct uvm_object *uobj;
    int error;
    int advice = MADV_NORMAL;
    unsigned int uvmflag = 0;
    vsize_t align = 0;  /* userland page size */

    /*
     * for non-fixed mappings, round off the suggested address.
     * for fixed mappings, check alignment and zap old mappings.
     */
    if ((flags & MAP_FIXED) == 0) {
        *addr = round_page(*addr);  /* round */
    } else {
        if (*addr & PAGE_MASK)
            return (EINVAL);

        uvmflag |= UVM_FLAG_FIXED;
        if ((flags & __MAP_NOREPLACE) == 0)
            uvmflag |= UVM_FLAG_UNMAP;
    }

    /*
     * attach to underlying vm object.
     */
    if (vp->v_type != VCHR) {
        uobj = uvn_attach(vp, (flags & MAP_SHARED) ?
           maxprot : (maxprot & ~PROT_WRITE));

        /*
         * XXXCDC: hack from old code
         * don't allow vnodes which have been mapped
         * shared-writeable to persist [forces them to be
         * flushed out when last reference goes].
         * XXXCDC: interesting side effect: avoids a bug.
         * note that in WRITE [ufs_readwrite.c] that we
         * allocate buffer, uncache, and then do the write.
         * the problem with this is that if the uncache causes
         * VM data to be flushed to the same area of the file
         * we are writing to... in that case we've got the
         * buffer locked and our process goes to sleep forever.
         *
         * XXXCDC: checking maxprot protects us from the
         * "persistbug" program but this is not a long term
         * solution.
         *
         * XXXCDC: we don't bother calling uncache with the vp
         * VOP_LOCKed since we know that we are already
         * holding a valid reference to the uvn (from the
         * uvn_attach above), and thus it is impossible for
         * the uncache to kill the uvn and trigger I/O.
         */
        if (flags & MAP_SHARED) {
            if ((prot & PROT_WRITE) ||
                (maxprot & PROT_WRITE)) {
                uvm_vnp_uncache(vp);
            }
        }
    } else {
        uobj = udv_attach(vp->v_rdev,
            (flags & MAP_SHARED) ? maxprot :
            (maxprot & ~PROT_WRITE), foff, size);
        /*
         * XXX Some devices don't like to be mapped with
         * XXX PROT_EXEC, but we don't really have a
         * XXX better way of handling this, right now
         */
        if (uobj == NULL && (prot & PROT_EXEC) == 0) {
            maxprot &= ~PROT_EXEC;
            uobj = udv_attach(vp->v_rdev,
                (flags & MAP_SHARED) ? maxprot :
                (maxprot & ~PROT_WRITE), foff, size);
        }
        advice = MADV_RANDOM;
    }

    if (uobj == NULL)
        return ((vp->v_type == VREG) ? ENOMEM : EINVAL);

    if ((flags & MAP_SHARED) == 0)
        uvmflag |= UVM_FLAG_COPYONW;
    if (flags & __MAP_NOFAULT)
        uvmflag |= (UVM_FLAG_NOFAULT | UVM_FLAG_OVERLAY);
    if (flags & MAP_STACK)
        uvmflag |= UVM_FLAG_STACK;

    /* set up mapping flags */
    uvmflag = UVM_MAPFLAG(prot, maxprot,
        (flags & MAP_SHARED) ? MAP_INHERIT_SHARE : MAP_INHERIT_COPY,
        advice, uvmflag);

    error = uvm_map(map, addr, size, uobj, foff, align, uvmflag);

    if (error == 0)
        return uvm_mmaplock(map, addr, size, prot, locklimit);

    /* errors: first detach from the uobj, if any.  */
    if (uobj)
        uobj->pgops->pgo_detach(uobj);

    return (error);
}

/* an address that can't be in userspace */
#define BOGO_PC (KERNBASE + 1)
int
sys_kbind(struct proc *p, void *v, register_t *retval)
{
    struct sys_kbind_args /* {
        syscallarg(const struct __kbind *) param;
        syscallarg(size_t) psize;
        syscallarg(uint64_t) proc_cookie;
    } */ *uap = v;
    const struct __kbind *paramp;
    union {
        struct __kbind uk[KBIND_BLOCK_MAX];
        char upad[KBIND_BLOCK_MAX * sizeof(*paramp) + KBIND_DATA_MAX];
    } param;
    struct uvm_map_deadq dead_entries;
    struct process *pr = p->p_p;
    const char *data;
    vaddr_t baseva, last_baseva, endva, pageoffset, kva;
    size_t psize, s;
    u_long pc;
    int count, i;
    int error;

    /*
     * extract syscall args from uap
     */
    paramp = SCARG(uap, param);
    psize = SCARG(uap, psize);

    /* a NULL paramp disables the syscall for the process */
    if (paramp == NULL) {
        pr->ps_kbind_addr = BOGO_PC;
        return (0);
    }

    /* security checks */
    pc = PROC_PC(p);
    if (pr->ps_kbind_addr == 0) {
        pr->ps_kbind_addr = pc;
        pr->ps_kbind_cookie = SCARG(uap, proc_cookie);
    } else if (pc != pr->ps_kbind_addr || pc == BOGO_PC)
        sigexit(p, SIGILL);
    else if (pr->ps_kbind_cookie != SCARG(uap, proc_cookie))
        sigexit(p, SIGILL);
    if (psize < sizeof(struct __kbind) || psize > sizeof(param))
        return (EINVAL);
    if ((error = copyin(paramp, &param, psize)))
        return (error);

    /*
     * The param argument points to an array of __kbind structures
     * followed by the corresponding new data areas for them.  Verify
     * that the sizes in the __kbind structures add up to the total
     * size and find the start of the new area.
     */
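    /*
     * Illustrative layout of the copied-in blob for count == 2
     * (a sketch, not an ABI definition):
     *
     *	+------------+------------+------------------+------------------+
     *	| __kbind[0] | __kbind[1] | kb_size[0] bytes | kb_size[1] bytes |
     *	+------------+------------+------------------+------------------+
     *
     * so psize must equal count * sizeof(struct __kbind) plus the sum
     * of the kb_size values, which is exactly what the loop below
     * checks.
     */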
    paramp = &param.uk[0];
    s = psize;
    for (count = 0; s > 0 && count < KBIND_BLOCK_MAX; count++) {
        if (s < sizeof(*paramp))
            return (EINVAL);
        s -= sizeof(*paramp);

        baseva = (vaddr_t)paramp[count].kb_addr;
        endva = baseva + paramp[count].kb_size - 1;
        if (paramp[count].kb_addr == NULL ||
            paramp[count].kb_size == 0 ||
            paramp[count].kb_size > KBIND_DATA_MAX ||
            baseva >= VM_MAXUSER_ADDRESS ||
            endva >= VM_MAXUSER_ADDRESS ||
            trunc_page(baseva) != trunc_page(endva) ||
            s < paramp[count].kb_size)
            return (EINVAL);

        s -= paramp[count].kb_size;
    }
    if (s > 0)
        return (EINVAL);
    data = (const char *)&paramp[count];

    /* all looks good, so do the bindings */
    last_baseva = VM_MAXUSER_ADDRESS;
    kva = 0;
    TAILQ_INIT(&dead_entries);
    for (i = 0; i < count; i++) {
        baseva = (vaddr_t)paramp[i].kb_addr;
        pageoffset = baseva & PAGE_MASK;
        baseva = trunc_page(baseva);
        /* make sure the desired page is mapped into kernel_map */
        if (baseva != last_baseva) {
            if (kva != 0) {
                vm_map_lock(kernel_map);
                uvm_unmap_remove(kernel_map, kva,
                    kva+PAGE_SIZE, &dead_entries, FALSE, TRUE);
                vm_map_unlock(kernel_map);
                kva = 0;
            }
            if ((error = uvm_map_extract(&p->p_vmspace->vm_map,
                baseva, PAGE_SIZE, &kva, UVM_EXTRACT_FIXPROT)))
                break;
            last_baseva = baseva;
        }

        /* do the update */
        if ((error = kcopy(data, (char *)kva + pageoffset,
            paramp[i].kb_size)))
            break;
        data += paramp[i].kb_size;
    }

    if (kva != 0) {
        vm_map_lock(kernel_map);
        uvm_unmap_remove(kernel_map, kva, kva+PAGE_SIZE,
            &dead_entries, FALSE, TRUE);
        vm_map_unlock(kernel_map);
    }
    uvm_unmap_detach(&dead_entries, AMAP_REFALL);

    return (error);
}
