1/*  $OpenBSD: uvm_map.c,v 1.238 2018/07/22 14:33:44 kettenis Exp $  */
2/*  $NetBSD: uvm_map.c,v 1.86 2000/11/27 08:40:03 chs Exp $ */
3
4/*
5 * Copyright (c) 2011 Ariane van der Steldt <ariane@openbsd.org>
6 *
7 * Permission to use, copy, modify, and distribute this software for any
8 * purpose with or without fee is hereby granted, provided that the above
9 * copyright notice and this permission notice appear in all copies.
10 *
11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18 *
19 *
20 * Copyright (c) 1997 Charles D. Cranor and Washington University.
21 * Copyright (c) 1991, 1993, The Regents of the University of California.
22 *
23 * All rights reserved.
24 *
25 * This code is derived from software contributed to Berkeley by
26 * The Mach Operating System project at Carnegie-Mellon University.
27 *
28 * Redistribution and use in source and binary forms, with or without
29 * modification, are permitted provided that the following conditions
30 * are met:
31 * 1. Redistributions of source code must retain the above copyright
32 *    notice, this list of conditions and the following disclaimer.
33 * 2. Redistributions in binary form must reproduce the above copyright
34 *    notice, this list of conditions and the following disclaimer in the
35 *    documentation and/or other materials provided with the distribution.
36 * 3. Neither the name of the University nor the names of its contributors
37 *    may be used to endorse or promote products derived from this software
38 *    without specific prior written permission.
39 *
40 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
41 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
43 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
44 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
45 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
46 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
48 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
49 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50 * SUCH DAMAGE.
51 *
52 *  @(#)vm_map.c    8.3 (Berkeley) 1/12/94
53 * from: Id: uvm_map.c,v 1.1.2.27 1998/02/07 01:16:54 chs Exp
54 *
55 *
56 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
57 * All rights reserved.
58 *
59 * Permission to use, copy, modify and distribute this software and
60 * its documentation is hereby granted, provided that both the copyright
61 * notice and this permission notice appear in all copies of the
62 * software, derivative works or modified versions, and any portions
63 * thereof, and that both notices appear in supporting documentation.
64 *
65 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
66 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
67 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
68 *
69 * Carnegie Mellon requests users of this software to return to
70 *
71 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
72 *  School of Computer Science
73 *  Carnegie Mellon University
74 *  Pittsburgh PA 15213-3890
75 *
76 * any improvements or extensions that they make and grant Carnegie the
77 * rights to redistribute these changes.
78 */
79
80/*
81 * uvm_map.c: uvm map operations
82 */
83
84/* #define DEBUG */
85/* #define VMMAP_DEBUG */
86
87#include <sys/param.h>
88#include <sys/systm.h>
89#include <sys/mman.h>
90#include <sys/proc.h>
91#include <sys/malloc.h>
92#include <sys/pool.h>
93#include <sys/sysctl.h>
94#include <sys/syslog.h>
95
96#ifdef SYSVSHM
97#include <sys/shm.h>
98#endif
99
100#include <uvm/uvm.h>
101
102#ifdef DDB
103#include <uvm/uvm_ddb.h>
104#endif
105
106#include <uvm/uvm_addr.h>
107
108
109vsize_t          uvmspace_dused(struct vm_map*, vaddr_t, vaddr_t);
110int          uvm_mapent_isjoinable(struct vm_map*,
111                struct vm_map_entry*, struct vm_map_entry*);
112struct vm_map_entry *uvm_mapent_merge(struct vm_map*, struct vm_map_entry*,
113                struct vm_map_entry*, struct uvm_map_deadq*);
114struct vm_map_entry *uvm_mapent_tryjoin(struct vm_map*,
115                struct vm_map_entry*, struct uvm_map_deadq*);
116struct vm_map_entry *uvm_map_mkentry(struct vm_map*, struct vm_map_entry*,
117                struct vm_map_entry*, vaddr_t, vsize_t, int,
118                struct uvm_map_deadq*, struct vm_map_entry*);
119struct vm_map_entry *uvm_mapent_alloc(struct vm_map*, int);
120void             uvm_mapent_free(struct vm_map_entry*);
121void             uvm_unmap_kill_entry(struct vm_map*,
122                struct vm_map_entry*);
123void             uvm_unmap_detach_intrsafe(struct uvm_map_deadq *);
124void             uvm_mapent_mkfree(struct vm_map*,
125                struct vm_map_entry*, struct vm_map_entry**,
126                struct uvm_map_deadq*, boolean_t);
127void             uvm_map_pageable_pgon(struct vm_map*,
128                struct vm_map_entry*, struct vm_map_entry*,
129                vaddr_t, vaddr_t);
130int          uvm_map_pageable_wire(struct vm_map*,
131                struct vm_map_entry*, struct vm_map_entry*,
132                vaddr_t, vaddr_t, int);
133void             uvm_map_setup_entries(struct vm_map*);
134void             uvm_map_setup_md(struct vm_map*);
135void             uvm_map_teardown(struct vm_map*);
136void             uvm_map_vmspace_update(struct vm_map*,
137                struct uvm_map_deadq*, int);
138void             uvm_map_kmem_grow(struct vm_map*,
139                struct uvm_map_deadq*, vsize_t, int);
140void             uvm_map_freelist_update_clear(struct vm_map*,
141                struct uvm_map_deadq*);
142void             uvm_map_freelist_update_refill(struct vm_map *, int);
143void             uvm_map_freelist_update(struct vm_map*,
144                struct uvm_map_deadq*, vaddr_t, vaddr_t,
145                vaddr_t, vaddr_t, int);
146struct vm_map_entry *uvm_map_fix_space(struct vm_map*, struct vm_map_entry*,
147                vaddr_t, vaddr_t, int);
148int          uvm_map_sel_limits(vaddr_t*, vaddr_t*, vsize_t, int,
149                struct vm_map_entry*, vaddr_t, vaddr_t, vaddr_t,
150                int);
151int          uvm_map_findspace(struct vm_map*,
152                struct vm_map_entry**, struct vm_map_entry**,
153                vaddr_t*, vsize_t, vaddr_t, vaddr_t, vm_prot_t,
154                vaddr_t);
155vsize_t          uvm_map_addr_augment_get(struct vm_map_entry*);
156void             uvm_map_addr_augment(struct vm_map_entry*);
157
158/*
159 * Tree management functions.
160 */
161
162static __inline void     uvm_mapent_copy(struct vm_map_entry*,
163                struct vm_map_entry*);
164static inline int    uvm_mapentry_addrcmp(const struct vm_map_entry*,
165                const struct vm_map_entry*);
166void             uvm_mapent_free_insert(struct vm_map*,
167                struct uvm_addr_state*, struct vm_map_entry*);
168void             uvm_mapent_free_remove(struct vm_map*,
169                struct uvm_addr_state*, struct vm_map_entry*);
170void             uvm_mapent_addr_insert(struct vm_map*,
171                struct vm_map_entry*);
172void             uvm_mapent_addr_remove(struct vm_map*,
173                struct vm_map_entry*);
174void             uvm_map_splitentry(struct vm_map*,
175                struct vm_map_entry*, struct vm_map_entry*,
176                vaddr_t);
177vsize_t          uvm_map_boundary(struct vm_map*, vaddr_t, vaddr_t);
178int          uvm_mapent_bias(struct vm_map*, struct vm_map_entry*);
179
180/*
181 * uvm_vmspace_fork helper functions.
182 */
183struct vm_map_entry *uvm_mapent_clone(struct vm_map*, vaddr_t, vsize_t,
184                vsize_t, vm_prot_t, vm_prot_t,
185                struct vm_map_entry*, struct uvm_map_deadq*, int,
186                int);
187struct vm_map_entry *uvm_mapent_share(struct vm_map*, vaddr_t, vsize_t,
188                vsize_t, vm_prot_t, vm_prot_t, struct vm_map*,
189                struct vm_map_entry*, struct uvm_map_deadq*);
190struct vm_map_entry *uvm_mapent_forkshared(struct vmspace*, struct vm_map*,
191                struct vm_map*, struct vm_map_entry*,
192                struct uvm_map_deadq*);
193struct vm_map_entry *uvm_mapent_forkcopy(struct vmspace*, struct vm_map*,
194                struct vm_map*, struct vm_map_entry*,
195                struct uvm_map_deadq*);
196struct vm_map_entry *uvm_mapent_forkzero(struct vmspace*, struct vm_map*,
197                struct vm_map*, struct vm_map_entry*,
198                struct uvm_map_deadq*);
199
200/*
201 * Tree validation.
202 */
203#ifdef VMMAP_DEBUG
204void             uvm_tree_assert(struct vm_map*, int, char*,
205                char*, int);
206#define UVM_ASSERT(map, cond, file, line)               \
207    uvm_tree_assert((map), (cond), #cond, (file), (line))
208void             uvm_tree_sanity(struct vm_map*, char*, int);
209void             uvm_tree_size_chk(struct vm_map*, char*, int);
210void             vmspace_validate(struct vm_map*);
211#else
212#define uvm_tree_sanity(_map, _file, _line)     do {} while (0)
213#define uvm_tree_size_chk(_map, _file, _line)       do {} while (0)
214#define vmspace_validate(_map)              do {} while (0)
215#endif
216
217/*
 * Provide PMAP_PREFER defaults so that all architectures have pmap_prefer.
219 */
220#ifndef PMAP_PREFER
221#define PMAP_PREFER_ALIGN() (vaddr_t)PAGE_SIZE
222#define PMAP_PREFER_OFFSET(off) 0
223#define PMAP_PREFER(addr, off)  (addr)
224#endif
225
226
227/*
228 * The kernel map will initially be VM_MAP_KSIZE_INIT bytes.
229 * Every time that gets cramped, we grow by at least VM_MAP_KSIZE_DELTA bytes.
230 *
 * We attempt to grow by VM_MAP_KSIZE_ALLOCMUL times the allocation size
232 * each time.
233 */
234#define VM_MAP_KSIZE_INIT   (512 * (vaddr_t)PAGE_SIZE)
235#define VM_MAP_KSIZE_DELTA  (256 * (vaddr_t)PAGE_SIZE)
236#define VM_MAP_KSIZE_ALLOCMUL   4
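/*
 * Illustrative arithmetic (assuming a 4 KB PAGE_SIZE; other configurations
 * differ): VM_MAP_KSIZE_INIT is 512 pages = 2 MB and VM_MAP_KSIZE_DELTA is
 * 256 pages = 1 MB, so per the description above a failing 64 KB kernel
 * allocation would be expected to grow the map by about
 * MAX(4 * 64 KB, 1 MB) = 1 MB.
 */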
237/*
238 * When selecting a random free-space block, look at most FSPACE_DELTA blocks
239 * ahead.
240 */
241#define FSPACE_DELTA        8
242/*
 * Put allocations adjacent to previous allocations when the free-space tree
244 * is larger than FSPACE_COMPACT entries.
245 *
246 * Alignment and PMAP_PREFER may still cause the entry to not be fully
 * adjacent. Note that this strategy reduces memory fragmentation (by leaving
248 * a large space before or after the allocation).
249 */
250#define FSPACE_COMPACT      128
251/*
252 * Make the address selection skip at most this many bytes from the start of
253 * the free space in which the allocation takes place.
254 *
 * The main idea behind a randomized address space is that an attacker cannot
 * know where to target an attack. Therefore, the location of objects must be
 * as random as possible. However, the goal is not to create the sparsest
 * map possible.
 * FSPACE_MAXOFF caps the considered range to a sane number of bytes,
 * thereby reducing the sparseness. The biggest randomization comes
 * from fragmentation, i.e. FSPACE_COMPACT.
262 */
263#define FSPACE_MAXOFF       ((vaddr_t)32 * 1024 * 1024)
264/*
265 * Allow for small gaps in the overflow areas.
266 * Gap size is in bytes and does not have to be a multiple of page-size.
267 */
268#define FSPACE_BIASGAP      ((vaddr_t)32 * 1024)
269
270/* auto-allocate address lower bound */
271#define VMMAP_MIN_ADDR      PAGE_SIZE
272
273
274#ifdef DEADBEEF0
275#define UVMMAP_DEADBEEF     ((unsigned long)DEADBEEF0)
276#else
277#define UVMMAP_DEADBEEF     ((unsigned long)0xdeadd0d0)
278#endif
279
280#ifdef DEBUG
281int uvm_map_printlocks = 0;
282
283#define LPRINTF(_args)                          \
284    do {                                \
285        if (uvm_map_printlocks)                 \
286            printf _args;                   \
287    } while (0)
288#else
289#define LPRINTF(_args)  do {} while (0)
290#endif
291
292static struct mutex uvm_kmapent_mtx;
293static struct timeval uvm_kmapent_last_warn_time;
294static struct timeval uvm_kmapent_warn_rate = { 10, 0 };
295
296const char vmmapbsy[] = "vmmapbsy";
297
298/*
299 * pool for vmspace structures.
300 */
301struct pool uvm_vmspace_pool;
302
303/*
304 * pool for dynamically-allocated map entries.
305 */
306struct pool uvm_map_entry_pool;
307struct pool uvm_map_entry_kmem_pool;
308
309/*
310 * This global represents the end of the kernel virtual address
311 * space. If we want to exceed this, we must grow the kernel
312 * virtual address space dynamically.
313 *
314 * Note, this variable is locked by kernel_map's lock.
315 */
316vaddr_t uvm_maxkaddr;
317
318/*
319 * Locking predicate.
320 */
321#define UVM_MAP_REQ_WRITE(_map)                     \
322    do {                                \
323        if ((_map)->ref_count > 0) {                \
324            if (((_map)->flags & VM_MAP_INTRSAFE) == 0) \
325                rw_assert_wrlock(&(_map)->lock);    \
326            else                        \
327                MUTEX_ASSERT_LOCKED(&(_map)->mtx);  \
328        }                           \
329    } while (0)
330
331/*
332 * Tree describing entries by address.
333 *
334 * Addresses are unique.
335 * Entries with start == end may only exist if they are the first entry
336 * (sorted by address) within a free-memory tree.
337 */
338
339static inline int
340uvm_mapentry_addrcmp(const struct vm_map_entry *e1,
341    const struct vm_map_entry *e2)
342{
343    return e1->start < e2->start ? -1 : e1->start > e2->start;
344}
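/*
 * Note: the expression above is the usual three-way comparison idiom.
 * It yields -1 when e1 sorts before e2, 1 when it sorts after e2 and
 * 0 when both entries start at the same address.
 */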
345
346/*
347 * Copy mapentry.
348 */
349static __inline void
350uvm_mapent_copy(struct vm_map_entry *src, struct vm_map_entry *dst)
351{
352    caddr_t csrc, cdst;
353    size_t sz;
354
355    csrc = (caddr_t)src;
356    cdst = (caddr_t)dst;
357    csrc += offsetof(struct vm_map_entry, uvm_map_entry_start_copy);
358    cdst += offsetof(struct vm_map_entry, uvm_map_entry_start_copy);
359
360    sz = offsetof(struct vm_map_entry, uvm_map_entry_stop_copy) -
361        offsetof(struct vm_map_entry, uvm_map_entry_start_copy);
362    memcpy(cdst, csrc, sz);
363}
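/*
 * Only the region between the uvm_map_entry_start_copy and
 * uvm_map_entry_stop_copy markers is copied, so fields outside that
 * range (such as the tree linkage of the destination entry) are left
 * untouched.
 */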
364
365/*
366 * Handle free-list insertion.
367 */
368void
369uvm_mapent_free_insert(struct vm_map *map, struct uvm_addr_state *uaddr,
370    struct vm_map_entry *entry)
371{
372    const struct uvm_addr_functions *fun;
373#ifdef VMMAP_DEBUG
374    vaddr_t min, max, bound;
375#endif
376
377#ifdef VMMAP_DEBUG
378    /*
379     * Boundary check.
380     * Boundaries are folded if they go on the same free list.
381     */
382    min = VMMAP_FREE_START(entry);
383    max = VMMAP_FREE_END(entry);
384
385    while (min < max) {
386        bound = uvm_map_boundary(map, min, max);
387        KASSERT(uvm_map_uaddr(map, min) == uaddr);
388        min = bound;
389    }
390#endif
391    KDASSERT((entry->fspace & (vaddr_t)PAGE_MASK) == 0);
392    KASSERT((entry->etype & UVM_ET_FREEMAPPED) == 0);
393
394    UVM_MAP_REQ_WRITE(map);
395
396    /* Actual insert: forward to uaddr pointer. */
397    if (uaddr != NULL) {
398        fun = uaddr->uaddr_functions;
399        KDASSERT(fun != NULL);
400        if (fun->uaddr_free_insert != NULL)
401            (*fun->uaddr_free_insert)(map, uaddr, entry);
402        entry->etype |= UVM_ET_FREEMAPPED;
403    }
404
405    /* Update fspace augmentation. */
406    uvm_map_addr_augment(entry);
407}
408
409/*
410 * Handle free-list removal.
411 */
412void
413uvm_mapent_free_remove(struct vm_map *map, struct uvm_addr_state *uaddr,
414    struct vm_map_entry *entry)
415{
416    const struct uvm_addr_functions *fun;
417
418    KASSERT((entry->etype & UVM_ET_FREEMAPPED) != 0 || uaddr == NULL);
419    KASSERT(uvm_map_uaddr_e(map, entry) == uaddr);
420    UVM_MAP_REQ_WRITE(map);
421
422    if (uaddr != NULL) {
423        fun = uaddr->uaddr_functions;
424        if (fun->uaddr_free_remove != NULL)
425            (*fun->uaddr_free_remove)(map, uaddr, entry);
426        entry->etype &= ~UVM_ET_FREEMAPPED;
427    }
428}
429
430/*
431 * Handle address tree insertion.
432 */
433void
434uvm_mapent_addr_insert(struct vm_map *map, struct vm_map_entry *entry)
435{
436    struct vm_map_entry *res;
437
438    if (!RBT_CHECK(uvm_map_addr, entry, UVMMAP_DEADBEEF))
439        panic("uvm_mapent_addr_insert: entry still in addr list");
440    KDASSERT(entry->start <= entry->end);
441    KDASSERT((entry->start & (vaddr_t)PAGE_MASK) == 0 &&
442        (entry->end & (vaddr_t)PAGE_MASK) == 0);
443
444    UVM_MAP_REQ_WRITE(map);
445    res = RBT_INSERT(uvm_map_addr, &map->addr, entry);
446    if (res != NULL) {
447        panic("uvm_mapent_addr_insert: map %p entry %p "
448            "(0x%lx-0x%lx G=0x%lx F=0x%lx) insert collision "
449            "with entry %p (0x%lx-0x%lx G=0x%lx F=0x%lx)",
450            map, entry,
451            entry->start, entry->end, entry->guard, entry->fspace,
452            res, res->start, res->end, res->guard, res->fspace);
453    }
454}
455
456/*
457 * Handle address tree removal.
458 */
459void
460uvm_mapent_addr_remove(struct vm_map *map, struct vm_map_entry *entry)
461{
462    struct vm_map_entry *res;
463
464    UVM_MAP_REQ_WRITE(map);
465    res = RBT_REMOVE(uvm_map_addr, &map->addr, entry);
466    if (res != entry)
467        panic("uvm_mapent_addr_remove");
468    RBT_POISON(uvm_map_addr, entry, UVMMAP_DEADBEEF);
469}
470
471/*
472 * uvm_map_reference: add reference to a map
473 *
474 * XXX check map reference counter lock
475 */
476#define uvm_map_reference(_map)                     \
477    do {                                \
        (_map)->ref_count++;                \
479    } while (0)
480
481/*
482 * Calculate the dused delta.
483 */
484vsize_t
485uvmspace_dused(struct vm_map *map, vaddr_t min, vaddr_t max)
486{
487    struct vmspace *vm;
488    vsize_t sz;
489    vaddr_t lmax;
490    vaddr_t stack_begin, stack_end; /* Position of stack. */
491
492    KASSERT(map->flags & VM_MAP_ISVMSPACE);
493    vm = (struct vmspace *)map;
494    stack_begin = MIN((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
495    stack_end = MAX((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
496
497    sz = 0;
498    while (min != max) {
499        lmax = max;
500        if (min < stack_begin && lmax > stack_begin)
501            lmax = stack_begin;
502        else if (min < stack_end && lmax > stack_end)
503            lmax = stack_end;
504
505        if (min >= stack_begin && min < stack_end) {
506            /* nothing */
507        } else
508            sz += lmax - min;
509        min = lmax;
510    }
511
512    return sz >> PAGE_SHIFT;
513}
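/*
 * Example (illustrative numbers): with the stack occupying
 * [stack_begin, stack_end), a range that straddles the stack only
 * accounts for the bytes outside of it, i.e. for min < stack_begin and
 * max > stack_end the result is
 * ((stack_begin - min) + (max - stack_end)) >> PAGE_SHIFT.
 */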
514
515/*
516 * Find the entry describing the given address.
517 */
518struct vm_map_entry*
519uvm_map_entrybyaddr(struct uvm_map_addr *atree, vaddr_t addr)
520{
521    struct vm_map_entry *iter;
522
523    iter = RBT_ROOT(uvm_map_addr, atree);
524    while (iter != NULL) {
525        if (iter->start > addr)
526            iter = RBT_LEFT(uvm_map_addr, iter);
527        else if (VMMAP_FREE_END(iter) <= addr)
528            iter = RBT_RIGHT(uvm_map_addr, iter);
529        else
530            return iter;
531    }
532    return NULL;
533}
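/*
 * Note that the lookup above treats an entry as covering both its mapped
 * range and the free space that follows it (up to VMMAP_FREE_END()), so
 * an address that falls in an entry's trailing free space still resolves
 * to that entry.
 */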
534
535/*
536 * DEAD_ENTRY_PUSH(struct vm_map_deadq *deadq, struct vm_map_entry *entry)
537 *
538 * Push dead entries into a linked list.
539 * Since the linked list abuses the address tree for storage, the entry
540 * may not be linked in a map.
541 *
 * The deadq must be initialized with TAILQ_INIT() before the first call to
 * this macro; uvm_unmap_detach(deadq, 0) will release the dead entries.
544 */
545static __inline void
546dead_entry_push(struct uvm_map_deadq *deadq, struct vm_map_entry *entry)
547{
548    TAILQ_INSERT_TAIL(deadq, entry, dfree.deadq);
549}
550#define DEAD_ENTRY_PUSH(_headptr, _entry)               \
551    dead_entry_push((_headptr), (_entry))
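/*
 * Typical usage (sketch, mirroring the callers below):
 *
 *	struct uvm_map_deadq dead;
 *
 *	TAILQ_INIT(&dead);
 *	DEAD_ENTRY_PUSH(&dead, entry);
 *	...
 *	uvm_unmap_detach(&dead, 0);
 */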
552
553/*
554 * Helper function for uvm_map_findspace_tree.
555 *
556 * Given allocation constraints and pmap constraints, finds the
557 * lowest and highest address in a range that can be used for the
558 * allocation.
559 *
560 * pmap_align and pmap_off are ignored on non-PMAP_PREFER archs.
561 *
562 *
563 * Big chunk of math with a seasoning of dragons.
564 */
565int
566uvm_map_sel_limits(vaddr_t *min, vaddr_t *max, vsize_t sz, int guardpg,
567    struct vm_map_entry *sel, vaddr_t align,
568    vaddr_t pmap_align, vaddr_t pmap_off, int bias)
569{
570    vaddr_t sel_min, sel_max;
571#ifdef PMAP_PREFER
572    vaddr_t pmap_min, pmap_max;
573#endif /* PMAP_PREFER */
574#ifdef DIAGNOSTIC
575    int bad;
576#endif /* DIAGNOSTIC */
577
578    sel_min = VMMAP_FREE_START(sel);
579    sel_max = VMMAP_FREE_END(sel) - sz - (guardpg ? PAGE_SIZE : 0);
580
581#ifdef PMAP_PREFER
582
583    /*
     * There are two special cases in which we can satisfy both the align
     * requirement and the pmap_prefer requirement:
     * - when pmap_off == 0, we always select the larger of the two
     *   alignments;
     * - when pmap_off % align == 0 and pmap_align > align, satisfying
     *   the pmap_align requirement automatically satisfies the align
     *   requirement.
590     */
591    if (align > PAGE_SIZE &&
592        !(pmap_align > align && (pmap_off & (align - 1)) == 0)) {
593        /*
594         * Simple case: only use align.
595         */
596        sel_min = roundup(sel_min, align);
597        sel_max &= ~(align - 1);
598
599        if (sel_min > sel_max)
600            return ENOMEM;
601
602        /* Correct for bias. */
603        if (sel_max - sel_min > FSPACE_BIASGAP) {
604            if (bias > 0) {
605                sel_min = sel_max - FSPACE_BIASGAP;
606                sel_min = roundup(sel_min, align);
607            } else if (bias < 0) {
608                sel_max = sel_min + FSPACE_BIASGAP;
609                sel_max &= ~(align - 1);
610            }
611        }
612    } else if (pmap_align != 0) {
613        /*
614         * Special case: satisfy both pmap_prefer and
615         * align argument.
616         */
617        pmap_max = sel_max & ~(pmap_align - 1);
618        pmap_min = sel_min;
619        if (pmap_max < sel_min)
620            return ENOMEM;
621
622        /* Adjust pmap_min for BIASGAP for top-addr bias. */
623        if (bias > 0 && pmap_max - pmap_min > FSPACE_BIASGAP)
624            pmap_min = pmap_max - FSPACE_BIASGAP;
625        /* Align pmap_min. */
626        pmap_min &= ~(pmap_align - 1);
627        if (pmap_min < sel_min)
628            pmap_min += pmap_align;
629        if (pmap_min > pmap_max)
630            return ENOMEM;
631
632        /* Adjust pmap_max for BIASGAP for bottom-addr bias. */
633        if (bias < 0 && pmap_max - pmap_min > FSPACE_BIASGAP) {
634            pmap_max = (pmap_min + FSPACE_BIASGAP) &
635                ~(pmap_align - 1);
636        }
637        if (pmap_min > pmap_max)
638            return ENOMEM;
639
640        /* Apply pmap prefer offset. */
641        pmap_max |= pmap_off;
642        if (pmap_max > sel_max)
643            pmap_max -= pmap_align;
644        pmap_min |= pmap_off;
645        if (pmap_min < sel_min)
646            pmap_min += pmap_align;
647
648        /*
649         * Fixup: it's possible that pmap_min and pmap_max
         * cross each other. In this case, try to find one
651         * address that is allowed.
         * (This usually happens in the biased case.)
653         */
654        if (pmap_min > pmap_max) {
655            if (pmap_min < sel_max)
656                pmap_max = pmap_min;
657            else if (pmap_max > sel_min)
658                pmap_min = pmap_max;
659            else
660                return ENOMEM;
661        }
662
663        /* Internal validation. */
664        KDASSERT(pmap_min <= pmap_max);
665
666        sel_min = pmap_min;
667        sel_max = pmap_max;
668    } else if (bias > 0 && sel_max - sel_min > FSPACE_BIASGAP)
669        sel_min = sel_max - FSPACE_BIASGAP;
670    else if (bias < 0 && sel_max - sel_min > FSPACE_BIASGAP)
671        sel_max = sel_min + FSPACE_BIASGAP;
672
673#else
674
675    if (align > PAGE_SIZE) {
676        sel_min = roundup(sel_min, align);
677        sel_max &= ~(align - 1);
678        if (sel_min > sel_max)
679            return ENOMEM;
680
681        if (bias != 0 && sel_max - sel_min > FSPACE_BIASGAP) {
682            if (bias > 0) {
683                sel_min = roundup(sel_max - FSPACE_BIASGAP,
684                    align);
685            } else {
686                sel_max = (sel_min + FSPACE_BIASGAP) &
687                    ~(align - 1);
688            }
689        }
690    } else if (bias > 0 && sel_max - sel_min > FSPACE_BIASGAP)
691        sel_min = sel_max - FSPACE_BIASGAP;
692    else if (bias < 0 && sel_max - sel_min > FSPACE_BIASGAP)
693        sel_max = sel_min + FSPACE_BIASGAP;
694
695#endif
696
697    if (sel_min > sel_max)
698        return ENOMEM;
699
700#ifdef DIAGNOSTIC
701    bad = 0;
702    /* Lower boundary check. */
703    if (sel_min < VMMAP_FREE_START(sel)) {
704        printf("sel_min: 0x%lx, but should be at least 0x%lx\n",
705            sel_min, VMMAP_FREE_START(sel));
706        bad++;
707    }
708    /* Upper boundary check. */
709    if (sel_max > VMMAP_FREE_END(sel) - sz - (guardpg ? PAGE_SIZE : 0)) {
710        printf("sel_max: 0x%lx, but should be at most 0x%lx\n",
711            sel_max,
712            VMMAP_FREE_END(sel) - sz - (guardpg ? PAGE_SIZE : 0));
713        bad++;
714    }
715    /* Lower boundary alignment. */
716    if (align != 0 && (sel_min & (align - 1)) != 0) {
717        printf("sel_min: 0x%lx, not aligned to 0x%lx\n",
718            sel_min, align);
719        bad++;
720    }
721    /* Upper boundary alignment. */
722    if (align != 0 && (sel_max & (align - 1)) != 0) {
723        printf("sel_max: 0x%lx, not aligned to 0x%lx\n",
724            sel_max, align);
725        bad++;
726    }
727    /* Lower boundary PMAP_PREFER check. */
728    if (pmap_align != 0 && align == 0 &&
729        (sel_min & (pmap_align - 1)) != pmap_off) {
730        printf("sel_min: 0x%lx, aligned to 0x%lx, expected 0x%lx\n",
731            sel_min, sel_min & (pmap_align - 1), pmap_off);
732        bad++;
733    }
734    /* Upper boundary PMAP_PREFER check. */
735    if (pmap_align != 0 && align == 0 &&
736        (sel_max & (pmap_align - 1)) != pmap_off) {
737        printf("sel_max: 0x%lx, aligned to 0x%lx, expected 0x%lx\n",
738            sel_max, sel_max & (pmap_align - 1), pmap_off);
739        bad++;
740    }
741
742    if (bad) {
743        panic("uvm_map_sel_limits(sz = %lu, guardpg = %c, "
744            "align = 0x%lx, pmap_align = 0x%lx, pmap_off = 0x%lx, "
745            "bias = %d, "
746            "FREE_START(sel) = 0x%lx, FREE_END(sel) = 0x%lx)",
747            sz, (guardpg ? 'T' : 'F'), align, pmap_align, pmap_off,
748            bias, VMMAP_FREE_START(sel), VMMAP_FREE_END(sel));
749    }
750#endif /* DIAGNOSTIC */
751
752    *min = sel_min;
753    *max = sel_max;
754    return 0;
755}
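/*
 * Worked example (illustrative addresses, assuming 4 KB pages): for a
 * free range [0x1000, 0x9000), sz = 0x2000 and guardpg set, the window
 * before alignment and bias adjustments is sel_min = 0x1000 and
 * sel_max = 0x9000 - 0x2000 - 0x1000 = 0x6000; any address in that
 * window leaves room for the allocation plus its guard page.
 */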
756
757/*
758 * Test if memory starting at addr with sz bytes is free.
759 *
760 * Fills in *start_ptr and *end_ptr to be the first and last entry describing
761 * the space.
 * If called with prefilled *start_ptr and *end_ptr, they must already be correct.
763 */
764int
765uvm_map_isavail(struct vm_map *map, struct uvm_addr_state *uaddr,
766    struct vm_map_entry **start_ptr, struct vm_map_entry **end_ptr,
767    vaddr_t addr, vsize_t sz)
768{
769    struct uvm_addr_state *free;
770    struct uvm_map_addr *atree;
771    struct vm_map_entry *i, *i_end;
772
773    if (addr + sz < addr)
774        return 0;
775
776    /*
777     * Kernel memory above uvm_maxkaddr is considered unavailable.
778     */
779    if ((map->flags & VM_MAP_ISVMSPACE) == 0) {
780        if (addr + sz > uvm_maxkaddr)
781            return 0;
782    }
783
784    atree = &map->addr;
785
786    /*
787     * Fill in first, last, so they point at the entries containing the
788     * first and last address of the range.
789     * Note that if they are not NULL, we don't perform the lookup.
790     */
791    KDASSERT(atree != NULL && start_ptr != NULL && end_ptr != NULL);
792    if (*start_ptr == NULL) {
793        *start_ptr = uvm_map_entrybyaddr(atree, addr);
794        if (*start_ptr == NULL)
795            return 0;
796    } else
797        KASSERT(*start_ptr == uvm_map_entrybyaddr(atree, addr));
798    if (*end_ptr == NULL) {
799        if (VMMAP_FREE_END(*start_ptr) >= addr + sz)
800            *end_ptr = *start_ptr;
801        else {
802            *end_ptr = uvm_map_entrybyaddr(atree, addr + sz - 1);
803            if (*end_ptr == NULL)
804                return 0;
805        }
806    } else
807        KASSERT(*end_ptr == uvm_map_entrybyaddr(atree, addr + sz - 1));
808
809    /* Validation. */
810    KDASSERT(*start_ptr != NULL && *end_ptr != NULL);
811    KDASSERT((*start_ptr)->start <= addr &&
812        VMMAP_FREE_END(*start_ptr) > addr &&
813        (*end_ptr)->start < addr + sz &&
814        VMMAP_FREE_END(*end_ptr) >= addr + sz);
815
816    /*
     * Check that none of the entries intersects with <addr, addr+sz>.
     * Also, if an entry belongs to uaddr_exe or uaddr_brk_stack, it is
819     * considered unavailable unless called by those allocators.
820     */
821    i = *start_ptr;
822    i_end = RBT_NEXT(uvm_map_addr, *end_ptr);
823    for (; i != i_end;
824        i = RBT_NEXT(uvm_map_addr, i)) {
825        if (i->start != i->end && i->end > addr)
826            return 0;
827
828        /*
829         * uaddr_exe and uaddr_brk_stack may only be used
830         * by these allocators and the NULL uaddr (i.e. no
831         * uaddr).
832         * Reject if this requirement is not met.
833         */
834        if (uaddr != NULL) {
835            free = uvm_map_uaddr_e(map, i);
836
837            if (uaddr != free && free != NULL &&
838                (free == map->uaddr_exe ||
839                 free == map->uaddr_brk_stack))
840                return 0;
841        }
842    }
843
844    return -1;
845}
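/*
 * Typical usage (sketch, as in the fixed-address paths below):
 *
 *	first = last = NULL;
 *	if (!uvm_map_isavail(map, NULL, &first, &last, *addr, sz)) {
 *		error = ENOMEM;
 *		goto unlock;
 *	}
 */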
846
847/*
848 * Invoke each address selector until an address is found.
849 * Will not invoke uaddr_exe.
850 */
851int
uvm_map_findspace(struct vm_map *map, struct vm_map_entry **first,
    struct vm_map_entry **last, vaddr_t *addr, vsize_t sz,
854    vaddr_t pmap_align, vaddr_t pmap_offset, vm_prot_t prot, vaddr_t hint)
855{
856    struct uvm_addr_state *uaddr;
857    int i;
858
859    /*
860     * Allocation for sz bytes at any address,
861     * using the addr selectors in order.
862     */
863    for (i = 0; i < nitems(map->uaddr_any); i++) {
864        uaddr = map->uaddr_any[i];
865
866        if (uvm_addr_invoke(map, uaddr, first, last,
867            addr, sz, pmap_align, pmap_offset, prot, hint) == 0)
868            return 0;
869    }
870
871    /* Fall back to brk() and stack() address selectors. */
872    uaddr = map->uaddr_brk_stack;
873    if (uvm_addr_invoke(map, uaddr, first, last,
874        addr, sz, pmap_align, pmap_offset, prot, hint) == 0)
875        return 0;
876
877    return ENOMEM;
878}
879
880/* Calculate entry augmentation value. */
881vsize_t
882uvm_map_addr_augment_get(struct vm_map_entry *entry)
883{
884    vsize_t          augment;
885    struct vm_map_entry *left, *right;
886
887    augment = entry->fspace;
888    if ((left = RBT_LEFT(uvm_map_addr, entry)) != NULL)
889        augment = MAX(augment, left->fspace_augment);
890    if ((right = RBT_RIGHT(uvm_map_addr, entry)) != NULL)
891        augment = MAX(augment, right->fspace_augment);
892    return augment;
893}
894
895/*
896 * Update augmentation data in entry.
897 */
898void
899uvm_map_addr_augment(struct vm_map_entry *entry)
900{
901    vsize_t          augment;
902
903    while (entry != NULL) {
904        /* Calculate value for augmentation. */
905        augment = uvm_map_addr_augment_get(entry);
906
907        /*
908         * Descend update.
909         * Once we find an entry that already has the correct value,
910         * stop, since it means all its parents will use the correct
911         * value too.
912         */
913        if (entry->fspace_augment == augment)
914            return;
915        entry->fspace_augment = augment;
916        entry = RBT_PARENT(uvm_map_addr, entry);
917    }
918}
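/*
 * fspace_augment therefore holds the largest fspace found in the subtree
 * rooted at an entry, which allows free-space searches to skip whole
 * subtrees that cannot contain a gap of the requested size.
 */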
919
920/*
921 * uvm_mapanon: establish a valid mapping in map for an anon
922 *
923 * => *addr and sz must be a multiple of PAGE_SIZE.
924 * => *addr is ignored, except if flags contains UVM_FLAG_FIXED.
925 * => map must be unlocked.
926 *
927 * => align: align vaddr, must be a power-of-2.
928 *    Align is only a hint and will be ignored if the alignment fails.
929 */
930int
931uvm_mapanon(struct vm_map *map, vaddr_t *addr, vsize_t sz,
932    vsize_t align, unsigned int flags)
933{
934    struct vm_map_entry *first, *last, *entry, *new;
935    struct uvm_map_deadq     dead;
936    vm_prot_t        prot;
937    vm_prot_t        maxprot;
938    vm_inherit_t         inherit;
939    int          advice;
940    int          error;
941    vaddr_t          pmap_align, pmap_offset;
942    vaddr_t          hint;
943
944    KASSERT((map->flags & VM_MAP_ISVMSPACE) == VM_MAP_ISVMSPACE);
945    KASSERT(map != kernel_map);
946    KASSERT((map->flags & UVM_FLAG_HOLE) == 0);
947
948    KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
949    splassert(IPL_NONE);
950
951    /*
952     * We use pmap_align and pmap_offset as alignment and offset variables.
953     *
     * Anonymous mappings carry no pmap_prefer hint, so pmap_align simply
     * reflects the align argument (at least a page) and pmap_offset is 0.
957     */
958    pmap_align = MAX(align, PAGE_SIZE);
959    pmap_offset = 0;
960
961    /* Decode parameters. */
962    prot = UVM_PROTECTION(flags);
963    maxprot = UVM_MAXPROTECTION(flags);
964    advice = UVM_ADVICE(flags);
965    inherit = UVM_INHERIT(flags);
966    error = 0;
967    hint = trunc_page(*addr);
968    TAILQ_INIT(&dead);
969    KASSERT((sz & (vaddr_t)PAGE_MASK) == 0);
970    KASSERT((align & (align - 1)) == 0);
971
972    /* Check protection. */
973    if ((prot & maxprot) != prot)
974        return EACCES;
975
976    /*
977     * Before grabbing the lock, allocate a map entry for later
978     * use to ensure we don't wait for memory while holding the
979     * vm_map_lock.
980     */
981    new = uvm_mapent_alloc(map, flags);
982    if (new == NULL)
983        return(ENOMEM);
984
985    if (flags & UVM_FLAG_TRYLOCK) {
986        if (vm_map_lock_try(map) == FALSE) {
987            error = EFAULT;
988            goto out;
989        }
990    } else
991        vm_map_lock(map);
992
993    first = last = NULL;
994    if (flags & UVM_FLAG_FIXED) {
995        /*
996         * Fixed location.
997         *
998         * Note: we ignore align, pmap_prefer.
999         * Fill in first, last and *addr.
1000         */
1001        KASSERT((*addr & PAGE_MASK) == 0);
1002
1003        /* Check that the space is available. */
1004        if (flags & UVM_FLAG_UNMAP) {
1005            if ((flags & UVM_FLAG_STACK) &&
1006                !uvm_map_is_stack_remappable(map, *addr, sz)) {
1007                error = EINVAL;
1008                goto unlock;
1009            }
1010            uvm_unmap_remove(map, *addr, *addr + sz, &dead, FALSE, TRUE);
1011        }
1012        if (!uvm_map_isavail(map, NULL, &first, &last, *addr, sz)) {
1013            error = ENOMEM;
1014            goto unlock;
1015        }
1016    } else if (*addr != 0 && (*addr & PAGE_MASK) == 0 &&
1017        (align == 0 || (*addr & (align - 1)) == 0) &&
1018        uvm_map_isavail(map, NULL, &first, &last, *addr, sz)) {
1019        /*
1020         * Address used as hint.
1021         *
1022         * Note: we enforce the alignment restriction,
1023         * but ignore pmap_prefer.
1024         */
1025    } else if ((prot & PROT_EXEC) != 0 && map->uaddr_exe != NULL) {
1026        /* Run selection algorithm for executables. */
1027        error = uvm_addr_invoke(map, map->uaddr_exe, &first, &last,
1028            addr, sz, pmap_align, pmap_offset, prot, hint);
1029
1030        if (error != 0)
1031            goto unlock;
1032    } else {
1033        /* Update freelists from vmspace. */
1034        uvm_map_vmspace_update(map, &dead, flags);
1035
1036        error = uvm_map_findspace(map, &first, &last, addr, sz,
1037            pmap_align, pmap_offset, prot, hint);
1038
1039        if (error != 0)
1040            goto unlock;
1041    }
1042
1043    /* Double-check if selected address doesn't cause overflow. */
1044    if (*addr + sz < *addr) {
1045        error = ENOMEM;
1046        goto unlock;
1047    }
1048
1049    /* If we only want a query, return now. */
1050    if (flags & UVM_FLAG_QUERY) {
1051        error = 0;
1052        goto unlock;
1053    }
1054
1055    /*
1056     * Create new entry.
1057     * first and last may be invalidated after this call.
1058     */
1059    entry = uvm_map_mkentry(map, first, last, *addr, sz, flags, &dead,
1060        new);
1061    if (entry == NULL) {
1062        error = ENOMEM;
1063        goto unlock;
1064    }
1065    new = NULL;
1066    KDASSERT(entry->start == *addr && entry->end == *addr + sz);
1067    entry->object.uvm_obj = NULL;
1068    entry->offset = 0;
1069    entry->protection = prot;
1070    entry->max_protection = maxprot;
1071    entry->inheritance = inherit;
1072    entry->wired_count = 0;
1073    entry->advice = advice;
1074    if (flags & UVM_FLAG_STACK) {
1075        entry->etype |= UVM_ET_STACK;
1076        if (flags & (UVM_FLAG_FIXED | UVM_FLAG_UNMAP))
1077            map->serial++;
1078    }
1079    if (flags & UVM_FLAG_COPYONW) {
1080        entry->etype |= UVM_ET_COPYONWRITE;
1081        if ((flags & UVM_FLAG_OVERLAY) == 0)
1082            entry->etype |= UVM_ET_NEEDSCOPY;
1083    }
1084    if (flags & UVM_FLAG_OVERLAY) {
1085        KERNEL_LOCK();
1086        entry->aref.ar_pageoff = 0;
1087        entry->aref.ar_amap = amap_alloc(sz, M_WAITOK, 0);
1088        KERNEL_UNLOCK();
1089    }
1090
1091    /* Update map and process statistics. */
1092    map->size += sz;
1093    ((struct vmspace *)map)->vm_dused += uvmspace_dused(map, *addr, *addr + sz);
1094
1095unlock:
1096    vm_map_unlock(map);
1097
1098    /*
1099     * Remove dead entries.
1100     *
1101     * Dead entries may be the result of merging.
1102     * uvm_map_mkentry may also create dead entries, when it attempts to
1103     * destroy free-space entries.
1104     */
1105    uvm_unmap_detach(&dead, 0);
1106out:
1107    if (new)
1108        uvm_mapent_free(new);
1109    return error;
1110}
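/*
 * Illustrative call (sketch only; real callers sit in the mmap(2) path and
 * build flags with UVM_MAPFLAG(); "len" and "flags" are placeholders):
 *
 *	vaddr_t addr = 0;
 *	int error;
 *
 *	error = uvm_mapanon(&p->p_vmspace->vm_map, &addr, round_page(len),
 *	    0, flags);
 *
 * On success, *addr holds the start of the new anonymous mapping.
 */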
1111
1112/*
1113 * uvm_map: establish a valid mapping in map
1114 *
1115 * => *addr and sz must be a multiple of PAGE_SIZE.
1116 * => map must be unlocked.
1117 * => <uobj,uoffset> value meanings (4 cases):
1118 *  [1] <NULL,uoffset>      == uoffset is a hint for PMAP_PREFER
1119 *  [2] <NULL,UVM_UNKNOWN_OFFSET>   == don't PMAP_PREFER
1120 *  [3] <uobj,uoffset>      == normal mapping
1121 *  [4] <uobj,UVM_UNKNOWN_OFFSET>   == uvm_map finds offset based on VA
1122 *
1123 *   case [4] is for kernel mappings where we don't know the offset until
1124 *   we've found a virtual address.   note that kernel object offsets are
1125 *   always relative to vm_map_min(kernel_map).
1126 *
1127 * => align: align vaddr, must be a power-of-2.
1128 *    Align is only a hint and will be ignored if the alignment fails.
1129 */
1130int
1131uvm_map(struct vm_map *map, vaddr_t *addr, vsize_t sz,
1132    struct uvm_object *uobj, voff_t uoffset,
1133    vsize_t align, unsigned int flags)
1134{
1135    struct vm_map_entry *first, *last, *entry, *new;
1136    struct uvm_map_deadq     dead;
1137    vm_prot_t        prot;
1138    vm_prot_t        maxprot;
1139    vm_inherit_t         inherit;
1140    int          advice;
1141    int          error;
1142    vaddr_t          pmap_align, pmap_offset;
1143    vaddr_t          hint;
1144
1145    if ((map->flags & VM_MAP_INTRSAFE) == 0)
1146        splassert(IPL_NONE);
1147    else
1148        splassert(IPL_VM);
1149
1150    /*
1151     * We use pmap_align and pmap_offset as alignment and offset variables.
1152     *
     * Because the align parameter takes precedence over pmap prefer,
     * pmap_align is set to align (with pmap_offset = 0) whenever
     * pmap_prefer cannot also satisfy the align requirement.
1156     */
1157    if (uoffset == UVM_UNKNOWN_OFFSET) {
1158        pmap_align = MAX(align, PAGE_SIZE);
1159        pmap_offset = 0;
1160    } else {
1161        pmap_align = MAX(PMAP_PREFER_ALIGN(), PAGE_SIZE);
1162        pmap_offset = PMAP_PREFER_OFFSET(uoffset);
1163
1164        if (align == 0 ||
1165            (align <= pmap_align && (pmap_offset & (align - 1)) == 0)) {
1166            /* pmap_offset satisfies align, no change. */
1167        } else {
1168            /* Align takes precedence over pmap prefer. */
1169            pmap_align = align;
1170            pmap_offset = 0;
1171        }
1172    }
1173
1174    /* Decode parameters. */
1175    prot = UVM_PROTECTION(flags);
1176    maxprot = UVM_MAXPROTECTION(flags);
1177    advice = UVM_ADVICE(flags);
1178    inherit = UVM_INHERIT(flags);
1179    error = 0;
1180    hint = trunc_page(*addr);
1181    TAILQ_INIT(&dead);
1182    KASSERT((sz & (vaddr_t)PAGE_MASK) == 0);
1183    KASSERT((align & (align - 1)) == 0);
1184
1185    /* Holes are incompatible with other types of mappings. */
1186    if (flags & UVM_FLAG_HOLE) {
1187        KASSERT(uobj == NULL && (flags & UVM_FLAG_FIXED) &&
1188            (flags & (UVM_FLAG_OVERLAY | UVM_FLAG_COPYONW)) == 0);
1189    }
1190
1191    /* Unset hint for kernel_map non-fixed allocations. */
1192    if (!(map->flags & VM_MAP_ISVMSPACE) && !(flags & UVM_FLAG_FIXED))
1193        hint = 0;
1194
1195    /* Check protection. */
1196    if ((prot & maxprot) != prot)
1197        return EACCES;
1198
1199    if (map == kernel_map &&
1200        (prot & (PROT_WRITE | PROT_EXEC)) == (PROT_WRITE | PROT_EXEC))
1201        panic("uvm_map: kernel map W^X violation requested");
1202
1203    /*
1204     * Before grabbing the lock, allocate a map entry for later
1205     * use to ensure we don't wait for memory while holding the
1206     * vm_map_lock.
1207     */
1208    new = uvm_mapent_alloc(map, flags);
1209    if (new == NULL)
1210        return(ENOMEM);
1211
1212    if (flags & UVM_FLAG_TRYLOCK) {
1213        if (vm_map_lock_try(map) == FALSE) {
1214            error = EFAULT;
1215            goto out;
1216        }
1217    } else {
1218        vm_map_lock(map);
1219    }
1220
1221    first = last = NULL;
1222    if (flags & UVM_FLAG_FIXED) {
1223        /*
1224         * Fixed location.
1225         *
1226         * Note: we ignore align, pmap_prefer.
1227         * Fill in first, last and *addr.
1228         */
1229        KASSERT((*addr & PAGE_MASK) == 0);
1230
1231        /*
1232         * Grow pmap to include allocated address.
1233         * If the growth fails, the allocation will fail too.
1234         */
1235        if ((map->flags & VM_MAP_ISVMSPACE) == 0 &&
1236            uvm_maxkaddr < (*addr + sz)) {
1237            uvm_map_kmem_grow(map, &dead,
1238                *addr + sz - uvm_maxkaddr, flags);
1239        }
1240
1241        /* Check that the space is available. */
1242        if (flags & UVM_FLAG_UNMAP)
1243            uvm_unmap_remove(map, *addr, *addr + sz, &dead, FALSE, TRUE);
1244        if (!uvm_map_isavail(map, NULL, &first, &last, *addr, sz)) {
1245            error = ENOMEM;
1246            goto unlock;
1247        }
1248    } else if (*addr != 0 && (*addr & PAGE_MASK) == 0 &&
1249        (map->flags & VM_MAP_ISVMSPACE) == VM_MAP_ISVMSPACE &&
1250        (align == 0 || (*addr & (align - 1)) == 0) &&
1251        uvm_map_isavail(map, NULL, &first, &last, *addr, sz)) {
1252        /*
1253         * Address used as hint.
1254         *
1255         * Note: we enforce the alignment restriction,
1256         * but ignore pmap_prefer.
1257         */
1258    } else if ((prot & PROT_EXEC) != 0 && map->uaddr_exe != NULL) {
1259        /* Run selection algorithm for executables. */
1260        error = uvm_addr_invoke(map, map->uaddr_exe, &first, &last,
1261            addr, sz, pmap_align, pmap_offset, prot, hint);
1262
1263        /* Grow kernel memory and try again. */
1264        if (error != 0 && (map->flags & VM_MAP_ISVMSPACE) == 0) {
1265            uvm_map_kmem_grow(map, &dead, sz, flags);
1266
1267            error = uvm_addr_invoke(map, map->uaddr_exe,
1268                &first, &last, addr, sz,
1269                pmap_align, pmap_offset, prot, hint);
1270        }
1271
1272        if (error != 0)
1273            goto unlock;
1274    } else {
1275        /* Update freelists from vmspace. */
1276        if (map->flags & VM_MAP_ISVMSPACE)
1277            uvm_map_vmspace_update(map, &dead, flags);
1278
1279        error = uvm_map_findspace(map, &first, &last, addr, sz,
1280            pmap_align, pmap_offset, prot, hint);
1281
1282        /* Grow kernel memory and try again. */
1283        if (error != 0 && (map->flags & VM_MAP_ISVMSPACE) == 0) {
1284            uvm_map_kmem_grow(map, &dead, sz, flags);
1285
1286            error = uvm_map_findspace(map, &first, &last, addr, sz,
1287                pmap_align, pmap_offset, prot, hint);
1288        }
1289
1290        if (error != 0)
1291            goto unlock;
1292    }
1293
1294    /* Double-check if selected address doesn't cause overflow. */
1295    if (*addr + sz < *addr) {
1296        error = ENOMEM;
1297        goto unlock;
1298    }
1299
1300    KASSERT((map->flags & VM_MAP_ISVMSPACE) == VM_MAP_ISVMSPACE ||
1301        uvm_maxkaddr >= *addr + sz);
1302
1303    /* If we only want a query, return now. */
1304    if (flags & UVM_FLAG_QUERY) {
1305        error = 0;
1306        goto unlock;
1307    }
1308
1309    if (uobj == NULL)
1310        uoffset = 0;
1311    else if (uoffset == UVM_UNKNOWN_OFFSET) {
1312        KASSERT(UVM_OBJ_IS_KERN_OBJECT(uobj));
1313        uoffset = *addr - vm_map_min(kernel_map);
1314    }
1315
1316    /*
1317     * Create new entry.
1318     * first and last may be invalidated after this call.
1319     */
1320    entry = uvm_map_mkentry(map, first, last, *addr, sz, flags, &dead,
1321        new);
1322    if (entry == NULL) {
1323        error = ENOMEM;
1324        goto unlock;
1325    }
1326    new = NULL;
1327    KDASSERT(entry->start == *addr && entry->end == *addr + sz);
1328    entry->object.uvm_obj = uobj;
1329    entry->offset = uoffset;
1330    entry->protection = prot;
1331    entry->max_protection = maxprot;
1332    entry->inheritance = inherit;
1333    entry->wired_count = 0;
1334    entry->advice = advice;
1335    if (flags & UVM_FLAG_STACK) {
1336        entry->etype |= UVM_ET_STACK;
1337        if (flags & UVM_FLAG_UNMAP)
1338            map->serial++;
1339    }
1340    if (uobj)
1341        entry->etype |= UVM_ET_OBJ;
1342    else if (flags & UVM_FLAG_HOLE)
1343        entry->etype |= UVM_ET_HOLE;
1344    if (flags & UVM_FLAG_NOFAULT)
1345        entry->etype |= UVM_ET_NOFAULT;
1346    if (flags & UVM_FLAG_COPYONW) {
1347        entry->etype |= UVM_ET_COPYONWRITE;
1348        if ((flags & UVM_FLAG_OVERLAY) == 0)
1349            entry->etype |= UVM_ET_NEEDSCOPY;
1350    }
1351    if (flags & UVM_FLAG_OVERLAY) {
1352        entry->aref.ar_pageoff = 0;
1353        entry->aref.ar_amap = amap_alloc(sz, M_WAITOK, 0);
1354    }
1355
1356    /* Update map and process statistics. */
1357    if (!(flags & UVM_FLAG_HOLE)) {
1358        map->size += sz;
1359        if ((map->flags & VM_MAP_ISVMSPACE) && uobj == NULL) {
1360            ((struct vmspace *)map)->vm_dused +=
1361                uvmspace_dused(map, *addr, *addr + sz);
1362        }
1363    }
1364
1365    /*
1366     * Try to merge entry.
1367     *
1368     * Userland allocations are kept separated most of the time.
1369     * Forego the effort of merging what most of the time can't be merged
1370     * and only try the merge if it concerns a kernel entry.
1371     */
1372    if ((flags & UVM_FLAG_NOMERGE) == 0 &&
1373        (map->flags & VM_MAP_ISVMSPACE) == 0)
1374        uvm_mapent_tryjoin(map, entry, &dead);
1375
1376unlock:
1377    vm_map_unlock(map);
1378
1379    /*
1380     * Remove dead entries.
1381     *
1382     * Dead entries may be the result of merging.
1383     * uvm_map_mkentry may also create dead entries, when it attempts to
1384     * destroy free-space entries.
1385     */
1386    if (map->flags & VM_MAP_INTRSAFE)
1387        uvm_unmap_detach_intrsafe(&dead);
1388    else
1389        uvm_unmap_detach(&dead, 0);
1390out:
1391    if (new)
1392        uvm_mapent_free(new);
1393    return error;
1394}
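/*
 * Illustrative call for case [2] above (sketch only; a kernel mapping with
 * no backing object and no PMAP_PREFER hint, where "len" and "flags" are
 * placeholders and flags would be built with UVM_MAPFLAG()):
 *
 *	vaddr_t va = 0;
 *	int error;
 *
 *	error = uvm_map(kernel_map, &va, round_page(len), NULL,
 *	    UVM_UNKNOWN_OFFSET, 0, flags);
 */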
1395
1396/*
1397 * True iff e1 and e2 can be joined together.
1398 */
1399int
1400uvm_mapent_isjoinable(struct vm_map *map, struct vm_map_entry *e1,
1401    struct vm_map_entry *e2)
1402{
1403    KDASSERT(e1 != NULL && e2 != NULL);
1404
1405    /* Must be the same entry type and not have free memory between. */
1406    if (e1->etype != e2->etype || e1->end != e2->start)
1407        return 0;
1408
1409    /* Submaps are never joined. */
1410    if (UVM_ET_ISSUBMAP(e1))
1411        return 0;
1412
1413    /* Never merge wired memory. */
1414    if (VM_MAPENT_ISWIRED(e1) || VM_MAPENT_ISWIRED(e2))
1415        return 0;
1416
1417    /* Protection, inheritance and advice must be equal. */
1418    if (e1->protection != e2->protection ||
1419        e1->max_protection != e2->max_protection ||
1420        e1->inheritance != e2->inheritance ||
1421        e1->advice != e2->advice)
1422        return 0;
1423
1424    /* If uvm_object: object itself and offsets within object must match. */
1425    if (UVM_ET_ISOBJ(e1)) {
1426        if (e1->object.uvm_obj != e2->object.uvm_obj)
1427            return 0;
1428        if (e1->offset + (e1->end - e1->start) != e2->offset)
1429            return 0;
1430    }
1431
1432    /*
1433     * Cannot join shared amaps.
1434     * Note: no need to lock amap to look at refs, since we don't care
1435     * about its exact value.
1436     * If it is 1 (i.e. we have the only reference) it will stay there.
1437     */
1438    if (e1->aref.ar_amap && amap_refs(e1->aref.ar_amap) != 1)
1439        return 0;
1440    if (e2->aref.ar_amap && amap_refs(e2->aref.ar_amap) != 1)
1441        return 0;
1442
    /* Apparently, e1 and e2 match. */
1444    return 1;
1445}
1446
1447/*
1448 * Join support function.
1449 *
 * Returns the merged entry on success.
1451 * Returns NULL if the merge failed.
1452 */
1453struct vm_map_entry*
1454uvm_mapent_merge(struct vm_map *map, struct vm_map_entry *e1,
1455    struct vm_map_entry *e2, struct uvm_map_deadq *dead)
1456{
1457    struct uvm_addr_state *free;
1458
1459    /*
     * Merging is not supported when e1 contains an amap.
     * This should never happen anyway, because only kernel
     * entries are merged and these do not contain amaps.
1464     * e2 contains no real information in its amap,
1465     * so it can be erased immediately.
1466     */
1467    KASSERT(e1->aref.ar_amap == NULL);
1468
1469    /*
1470     * Don't drop obj reference:
1471     * uvm_unmap_detach will do this for us.
1472     */
1473    free = uvm_map_uaddr_e(map, e1);
1474    uvm_mapent_free_remove(map, free, e1);
1475
1476    free = uvm_map_uaddr_e(map, e2);
1477    uvm_mapent_free_remove(map, free, e2);
1478    uvm_mapent_addr_remove(map, e2);
1479    e1->end = e2->end;
1480    e1->guard = e2->guard;
1481    e1->fspace = e2->fspace;
1482    uvm_mapent_free_insert(map, free, e1);
1483
1484    DEAD_ENTRY_PUSH(dead, e2);
1485    return e1;
1486}
1487
1488/*
1489 * Attempt forward and backward joining of entry.
1490 *
1491 * Returns entry after joins.
 * We are guaranteed that the amap of entry is either non-existent or
1493 * has never been used.
1494 */
1495struct vm_map_entry*
1496uvm_mapent_tryjoin(struct vm_map *map, struct vm_map_entry *entry,
1497    struct uvm_map_deadq *dead)
1498{
1499    struct vm_map_entry *other;
1500    struct vm_map_entry *merged;
1501
1502    /* Merge with previous entry. */
1503    other = RBT_PREV(uvm_map_addr, entry);
1504    if (other && uvm_mapent_isjoinable(map, other, entry)) {
1505        merged = uvm_mapent_merge(map, other, entry, dead);
1506        if (merged)
1507            entry = merged;
1508    }
1509
1510    /*
1511     * Merge with next entry.
1512     *
1513     * Because amap can only extend forward and the next entry
1514     * probably contains sensible info, only perform forward merging
1515     * in the absence of an amap.
1516     */
1517    other = RBT_NEXT(uvm_map_addr, entry);
1518    if (other && entry->aref.ar_amap == NULL &&
1519        other->aref.ar_amap == NULL &&
1520        uvm_mapent_isjoinable(map, entry, other)) {
1521        merged = uvm_mapent_merge(map, entry, other, dead);
1522        if (merged)
1523            entry = merged;
1524    }
1525
1526    return entry;
1527}
1528
1529/*
1530 * Kill entries that are no longer in a map.
1531 */
1532void
1533uvm_unmap_detach(struct uvm_map_deadq *deadq, int flags)
1534{
1535    struct vm_map_entry *entry;
1536    int waitok = flags & UVM_PLA_WAITOK;
1537
1538    if (TAILQ_EMPTY(deadq))
1539        return;
1540
1541    KERNEL_LOCK();
1542    while ((entry = TAILQ_FIRST(deadq)) != NULL) {
1543        if (waitok)
1544            uvm_pause();
1545        /* Drop reference to amap, if we've got one. */
1546        if (entry->aref.ar_amap)
1547            amap_unref(entry->aref.ar_amap,
1548                entry->aref.ar_pageoff,
1549                atop(entry->end - entry->start),
1550                flags & AMAP_REFALL);
1551
1552        /* Drop reference to our backing object, if we've got one. */
1553        if (UVM_ET_ISSUBMAP(entry)) {
1554            /* ... unlikely to happen, but play it safe */
1555            uvm_map_deallocate(entry->object.sub_map);
1556        } else if (UVM_ET_ISOBJ(entry) &&
1557            entry->object.uvm_obj->pgops->pgo_detach) {
1558            entry->object.uvm_obj->pgops->pgo_detach(
1559                entry->object.uvm_obj);
1560        }
1561
1562        /* Step to next. */
1563        TAILQ_REMOVE(deadq, entry, dfree.deadq);
1564        uvm_mapent_free(entry);
1565    }
1566    KERNEL_UNLOCK();
1567}
1568
1569void
1570uvm_unmap_detach_intrsafe(struct uvm_map_deadq *deadq)
1571{
1572    struct vm_map_entry *entry;
1573
1574    while ((entry = TAILQ_FIRST(deadq)) != NULL) {
1575        KASSERT(entry->aref.ar_amap == NULL);
1576        KASSERT(!UVM_ET_ISSUBMAP(entry));
1577        KASSERT(!UVM_ET_ISOBJ(entry));
1578        TAILQ_REMOVE(deadq, entry, dfree.deadq);
1579        uvm_mapent_free(entry);
1580    }
1581}
1582
1583/*
1584 * Create and insert new entry.
1585 *
1586 * Returned entry contains new addresses and is inserted properly in the tree.
1587 * first and last are (probably) no longer valid.
1588 */
1589struct vm_map_entry*
1590uvm_map_mkentry(struct vm_map *map, struct vm_map_entry *first,
1591    struct vm_map_entry *last, vaddr_t addr, vsize_t sz, int flags,
1592    struct uvm_map_deadq *dead, struct vm_map_entry *new)
1593{
1594    struct vm_map_entry *entry, *prev;
1595    struct uvm_addr_state *free;
1596    vaddr_t min, max;   /* free space boundaries for new entry */
1597
1598    KDASSERT(map != NULL);
1599    KDASSERT(first != NULL);
1600    KDASSERT(last != NULL);
1601    KDASSERT(dead != NULL);
1602    KDASSERT(sz > 0);
1603    KDASSERT(addr + sz > addr);
1604    KDASSERT(first->end <= addr && VMMAP_FREE_END(first) > addr);
1605    KDASSERT(last->start < addr + sz && VMMAP_FREE_END(last) >= addr + sz);
1606    KDASSERT(uvm_map_isavail(map, NULL, &first, &last, addr, sz));
1607    uvm_tree_sanity(map, __FILE__, __LINE__);
1608
1609    min = addr + sz;
1610    max = VMMAP_FREE_END(last);
1611
1612    /* Initialize new entry. */
1613    if (new == NULL)
1614        entry = uvm_mapent_alloc(map, flags);
1615    else
1616        entry = new;
1617    if (entry == NULL)
1618        return NULL;
1619    entry->offset = 0;
1620    entry->etype = 0;
1621    entry->wired_count = 0;
1622    entry->aref.ar_pageoff = 0;
1623    entry->aref.ar_amap = NULL;
1624
1625    entry->start = addr;
1626    entry->end = min;
1627    entry->guard = 0;
1628    entry->fspace = 0;
1629
1630    /* Reset free space in first. */
1631    free = uvm_map_uaddr_e(map, first);
1632    uvm_mapent_free_remove(map, free, first);
1633    first->guard = 0;
1634    first->fspace = 0;
1635
1636    /*
1637     * Remove all entries that are fully replaced.
1638     * We are iterating using last in reverse order.
1639     */
1640    for (; first != last; last = prev) {
1641        prev = RBT_PREV(uvm_map_addr, last);
1642
1643        KDASSERT(last->start == last->end);
1644        free = uvm_map_uaddr_e(map, last);
1645        uvm_mapent_free_remove(map, free, last);
1646        uvm_mapent_addr_remove(map, last);
1647        DEAD_ENTRY_PUSH(dead, last);
1648    }
1649    /* Remove first if it is entirely inside <addr, addr+sz>.  */
1650    if (first->start == addr) {
1651        uvm_mapent_addr_remove(map, first);
1652        DEAD_ENTRY_PUSH(dead, first);
1653    } else {
1654        uvm_map_fix_space(map, first, VMMAP_FREE_START(first),
1655            addr, flags);
1656    }
1657
1658    /* Finally, link in entry. */
1659    uvm_mapent_addr_insert(map, entry);
1660    uvm_map_fix_space(map, entry, min, max, flags);
1661
1662    uvm_tree_sanity(map, __FILE__, __LINE__);
1663    return entry;
1664}
1665
1666
1667/*
1668 * uvm_mapent_alloc: allocate a map entry
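 *
 * Entries come from one of three sources: the static kernel-entry list
 * (interrupt-safe maps and early boot), the kmem pool (kernel_map), or
 * the general map-entry pool (all other maps).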
1669 */
1670struct vm_map_entry *
1671uvm_mapent_alloc(struct vm_map *map, int flags)
1672{
1673    struct vm_map_entry *me, *ne;
1674    int pool_flags;
1675    int i;
1676
1677    pool_flags = PR_WAITOK;
1678    if (flags & UVM_FLAG_TRYLOCK)
1679        pool_flags = PR_NOWAIT;
1680
1681    if (map->flags & VM_MAP_INTRSAFE || cold) {
1682        mtx_enter(&uvm_kmapent_mtx);
1683        if (SLIST_EMPTY(&uvm.kentry_free)) {
1684            ne = km_alloc(PAGE_SIZE, &kv_page, &kp_dirty,
1685                &kd_nowait);
1686            if (ne == NULL)
1687                panic("uvm_mapent_alloc: cannot allocate map "
1688                    "entry");
1689            for (i = 0; i < PAGE_SIZE / sizeof(*ne); i++) {
1690                SLIST_INSERT_HEAD(&uvm.kentry_free,
1691                    &ne[i], daddrs.addr_kentry);
1692            }
1693            if (ratecheck(&uvm_kmapent_last_warn_time,
1694                &uvm_kmapent_warn_rate))
1695                printf("uvm_mapent_alloc: out of static "
1696                    "map entries\n");
1697        }
1698        me = SLIST_FIRST(&uvm.kentry_free);
1699        SLIST_REMOVE_HEAD(&uvm.kentry_free, daddrs.addr_kentry);
1700        uvmexp.kmapent++;
1701        mtx_leave(&uvm_kmapent_mtx);
1702        me->flags = UVM_MAP_STATIC;
1703    } else if (map == kernel_map) {
1704        splassert(IPL_NONE);
1705        me = pool_get(&uvm_map_entry_kmem_pool, pool_flags);
1706        if (me == NULL)
1707            goto out;
1708        me->flags = UVM_MAP_KMEM;
1709    } else {
1710        splassert(IPL_NONE);
1711        me = pool_get(&uvm_map_entry_pool, pool_flags);
1712        if (me == NULL)
1713            goto out;
1714        me->flags = 0;
1715    }
1716
1717    if (me != NULL) {
1718        RBT_POISON(uvm_map_addr, me, UVMMAP_DEADBEEF);
1719    }
1720
1721out:
1722    return(me);
1723}
1724
1725/*
1726 * uvm_mapent_free: free map entry
1727 *
1728 * => XXX: static pool for kernel map?
1729 */
1730void
1731uvm_mapent_free(struct vm_map_entry *me)
1732{
1733    if (me->flags & UVM_MAP_STATIC) {
1734        mtx_enter(&uvm_kmapent_mtx);
1735        SLIST_INSERT_HEAD(&uvm.kentry_free, me, daddrs.addr_kentry);
1736        uvmexp.kmapent--;
1737        mtx_leave(&uvm_kmapent_mtx);
1738    } else if (me->flags & UVM_MAP_KMEM) {
1739        splassert(IPL_NONE);
1740        pool_put(&uvm_map_entry_kmem_pool, me);
1741    } else {
1742        splassert(IPL_NONE);
1743        pool_put(&uvm_map_entry_pool, me);
1744    }
1745}
1746
1747/*
1748 * uvm_map_lookup_entry: find map entry at or before an address.
1749 *
1750 * => map must at least be read-locked by caller
1751 * => entry is returned in "entry"
1752 * => return value is true if address is in the returned entry
1753 * ET_HOLE entries are considered to not contain a mapping, ergo FALSE is
1754 * returned for those mappings.
1755 */
1756boolean_t
1757uvm_map_lookup_entry(struct vm_map *map, vaddr_t address,
1758    struct vm_map_entry **entry)
1759{
1760    *entry = uvm_map_entrybyaddr(&map->addr, address);
1761    return *entry != NULL && !UVM_ET_ISHOLE(*entry) &&
1762        (*entry)->start <= address && (*entry)->end > address;
1763}
1764
1765/*
1766 * Inside a vm_map, find the entry containing sp and verify that it is
1767 * marked MAP_STACK, remembering the low and high bounds of that
1768 * MAP_STACK region.  Return TRUE.
1769 * If sp isn't in a MAP_STACK region return FALSE.
1770 */
1771boolean_t
1772uvm_map_check_stack_range(struct proc *p, vaddr_t sp)
1773{
1774    vm_map_t map = &p->p_vmspace->vm_map;
1775    vm_map_entry_t entry;
1776
1777    if (sp < map->min_offset || sp >= map->max_offset)
1778        return(FALSE);
1779
1780    /* lock map */
1781    vm_map_lock_read(map);
1782
1783    /* lookup */
1784    if (!uvm_map_lookup_entry(map, trunc_page(sp), &entry)) {
1785        vm_map_unlock_read(map);
1786        return(FALSE);
1787    }
1788
1789    if ((entry->etype & UVM_ET_STACK) == 0) {
1790        vm_map_unlock_read(map);
1791        return (FALSE);
1792    }
1793    p->p_spstart = entry->start;
1794    p->p_spend = entry->end;
1795    p->p_spserial = map->serial;
1796    vm_map_unlock_read(map);
1797    return(TRUE);
1798}
1799
1800/*
1801 * Check whether the given address range can be converted to a MAP_STACK
1802 * mapping.
1803 *
1804 * Must be called with map locked.
1805 */
1806boolean_t
1807uvm_map_is_stack_remappable(struct vm_map *map, vaddr_t addr, vaddr_t sz)
1808{
1809    vaddr_t end = addr + sz;
1810    struct vm_map_entry *first, *iter, *prev = NULL;
1811
1812    if (!uvm_map_lookup_entry(map, addr, &first)) {
1813        printf("map stack 0x%lx-0x%lx of map %p failed: no mapping\n",
1814            addr, end, map);
1815        return FALSE;
1816    }
1817
1818    /*
1819     * Check that the address range exists and is contiguous.
1820     */
1821    for (iter = first; iter != NULL && iter->start < end;
1822        prev = iter, iter = RBT_NEXT(uvm_map_addr, iter)) {
1823        /*
1824         * Make sure that we do not have holes in the range.
1825         */
1826#if 0
1827        if (prev != NULL) {
1828            printf("prev->start 0x%lx, prev->end 0x%lx, "
1829                "iter->start 0x%lx, iter->end 0x%lx\n",
1830                prev->start, prev->end, iter->start, iter->end);
1831        }
1832#endif
1833
1834        if (prev != NULL && prev->end != iter->start) {
1835            printf("map stack 0x%lx-0x%lx of map %p failed: "
1836                "hole in range\n", addr, end, map);
1837            return FALSE;
1838        }
1839        if (iter->start == iter->end || UVM_ET_ISHOLE(iter)) {
1840            printf("map stack 0x%lx-0x%lx of map %p failed: "
1841                "hole in range\n", addr, end, map);
1842            return FALSE;
1843        }
1844    }
1845
1846    return TRUE;
1847}
1848
1849/*
1850 * Remap the middle-pages of an existing mapping as a stack range.
1851 * If there exists a previous contiguous mapping with the given range
1852 * [addr, addr + sz), with protection PROT_READ|PROT_WRITE, then the
1853 * mapping is dropped, and a new anon mapping is created and marked as
1854 * a stack.
1855 *
1856 * Must be called with map unlocked.
1857 */
1858int
1859uvm_map_remap_as_stack(struct proc *p, vaddr_t addr, vaddr_t sz)
1860{
1861    vm_map_t map = &p->p_vmspace->vm_map;
1862    vaddr_t start, end;
1863    int error;
1864    int flags = UVM_MAPFLAG(PROT_READ | PROT_WRITE,
1865        PROT_READ | PROT_WRITE | PROT_EXEC,
1866        MAP_INHERIT_COPY, MADV_NORMAL,
1867        UVM_FLAG_STACK | UVM_FLAG_FIXED | UVM_FLAG_UNMAP |
1868        UVM_FLAG_COPYONW);
1869
1870    start = round_page(addr);
1871    end = trunc_page(addr + sz);
1872#ifdef MACHINE_STACK_GROWS_UP
1873    if (end == addr + sz)
1874        end -= PAGE_SIZE;
1875#else
1876    if (start == addr)
1877        start += PAGE_SIZE;
1878#endif
1879
1880    if (start < map->min_offset || end >= map->max_offset || end < start)
1881        return EINVAL;
1882
1883    error = uvm_mapanon(map, &start, end - start, 0, flags);
1884    if (error != 0)
1885        printf("map stack for pid %d failed\n", p->p_p->ps_pid);
1886
1887    return error;
1888}
1889
1890/*
1891 * uvm_map_pie: return a properly aligned random load address for a
1892 * PIE executable.
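 *
 * For example, with align = 0x10000 the code below rounds VM_PIE_MIN_ADDR
 * up to the next 0x10000 boundary, divides the remaining room below
 * VM_PIE_MAX_ADDR into 0x10000-sized slots, picks one slot uniformly at
 * random and returns min + slot * align, so the result is always
 * align-aligned and lies in [min, VM_PIE_MAX_ADDR).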
1893 */
1894#ifndef VM_PIE_MAX_ADDR
1895#define VM_PIE_MAX_ADDR (VM_MAXUSER_ADDRESS / 4)
1896#endif
1897
1898#ifndef VM_PIE_MIN_ADDR
1899#define VM_PIE_MIN_ADDR VM_MIN_ADDRESS
1900#endif
1901
1902#ifndef VM_PIE_MIN_ALIGN
1903#define VM_PIE_MIN_ALIGN PAGE_SIZE
1904#endif
1905
1906vaddr_t
1907uvm_map_pie(vaddr_t align)
1908{
1909    vaddr_t addr, space, min;
1910
1911    align = MAX(align, VM_PIE_MIN_ALIGN);
1912
1913    /* round up to next alignment */
1914    min = (VM_PIE_MIN_ADDR + align - 1) & ~(align - 1);
1915
1916    if (align >= VM_PIE_MAX_ADDR || min >= VM_PIE_MAX_ADDR)
1917        return (align);
1918
1919    space = (VM_PIE_MAX_ADDR - min) / align;
1920    space = MIN(space, (u_int32_t)-1);
1921
1922    addr = (vaddr_t)arc4random_uniform((u_int32_t)space) * align;
1923    addr += min;
1924
1925    return (addr);
1926}
1927
1928void
1929uvm_unmap(struct vm_map *map, vaddr_t start, vaddr_t end)
1930{
1931    struct uvm_map_deadq dead;
1932
1933    KASSERT((start & (vaddr_t)PAGE_MASK) == 0 &&
1934        (end & (vaddr_t)PAGE_MASK) == 0);
1935    TAILQ_INIT(&dead);
1936    vm_map_lock(map);
1937    uvm_unmap_remove(map, start, end, &dead, FALSE, TRUE);
1938    vm_map_unlock(map);
1939
1940    if (map->flags & VM_MAP_INTRSAFE)
1941        uvm_unmap_detach_intrsafe(&dead);
1942    else
1943        uvm_unmap_detach(&dead, 0);
1944}
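
/*
 * The body above is the canonical teardown sequence; a caller that needs
 * to do extra work while the map is still locked could open-code the same
 * steps, roughly:
 *
 *      TAILQ_INIT(&dead);
 *      vm_map_lock(map);
 *      uvm_unmap_remove(map, start, end, &dead, FALSE, TRUE);
 *      ...                     (additional work under the map lock)
 *      vm_map_unlock(map);
 *      uvm_unmap_detach(&dead, 0);
 */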
1945
1946/*
1947 * Mark entry as free.
1948 *
1949 * entry will be put on the dead list.
1950 * The free space will be merged into the previous or a new entry,
1951 * unless markfree is false.
1952 */
1953void
1954uvm_mapent_mkfree(struct vm_map *map, struct vm_map_entry *entry,
1955    struct vm_map_entry **prev_ptr, struct uvm_map_deadq *dead,
1956    boolean_t markfree)
1957{
1958    struct uvm_addr_state   *free;
1959    struct vm_map_entry *prev;
1960    vaddr_t          addr;  /* Start of freed range. */
1961    vaddr_t          end;   /* End of freed range. */
1962
1963    prev = *prev_ptr;
1964    if (prev == entry)
1965        *prev_ptr = prev = NULL;
1966
1967    if (prev == NULL ||
1968        VMMAP_FREE_END(prev) != entry->start)
1969        prev = RBT_PREV(uvm_map_addr, entry);
1970
1971    /* Entry is describing only free memory and has nothing to drain into. */
1972    if (prev == NULL && entry->start == entry->end && markfree) {
1973        *prev_ptr = entry;
1974        return;
1975    }
1976
1977    addr = entry->start;
1978    end = VMMAP_FREE_END(entry);
1979    free = uvm_map_uaddr_e(map, entry);
1980    uvm_mapent_free_remove(map, free, entry);
1981    uvm_mapent_addr_remove(map, entry);
1982    DEAD_ENTRY_PUSH(dead, entry);
1983
1984    if (markfree) {
1985        if (prev) {
1986            free = uvm_map_uaddr_e(map, prev);
1987            uvm_mapent_free_remove(map, free, prev);
1988        }
1989        *prev_ptr = uvm_map_fix_space(map, prev, addr, end, 0);
1990    }
1991}
1992
1993/*
1994 * Unwire and release referenced amap and object from map entry.
1995 */
1996void
1997uvm_unmap_kill_entry(struct vm_map *map, struct vm_map_entry *entry)
1998{
1999    /* Unwire removed map entry. */
2000    if (VM_MAPENT_ISWIRED(entry)) {
2001        KERNEL_LOCK();
2002        entry->wired_count = 0;
2003        uvm_fault_unwire_locked(map, entry->start, entry->end);
2004        KERNEL_UNLOCK();
2005    }
2006
2007    /* Entry-type specific code. */
2008    if (UVM_ET_ISHOLE(entry)) {
2009        /* Nothing to be done for holes. */
2010    } else if (map->flags & VM_MAP_INTRSAFE) {
2011        KASSERT(vm_map_pmap(map) == pmap_kernel());
2012        uvm_km_pgremove_intrsafe(entry->start, entry->end);
2013        pmap_kremove(entry->start, entry->end - entry->start);
2014    } else if (UVM_ET_ISOBJ(entry) &&
2015        UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj)) {
2016        KASSERT(vm_map_pmap(map) == pmap_kernel());
2017        /*
2018         * Note: kernel object mappings are currently used in
2019         * two ways:
2020         *  [1] "normal" mappings of pages in the kernel object
2021         *  [2] uvm_km_valloc'd allocations in which we
2022         *      pmap_enter in some non-kernel-object page
2023         *      (e.g. vmapbuf).
2024         *
2025         * for case [1], we need to remove the mapping from
2026         * the pmap and then remove the page from the kernel
2027         * object (because, once pages in a kernel object are
2028         * unmapped they are no longer needed, unlike, say,
2029         * a vnode where you might want the data to persist
2030         * until flushed out of a queue).
2031         *
2032         * for case [2], we need to remove the mapping from
2033         * the pmap.  there shouldn't be any pages at the
2034         * specified offset in the kernel object [but it
2035         * doesn't hurt to call uvm_km_pgremove just to be
2036         * safe?]
2037         *
2038         * uvm_km_pgremove currently does the following:
2039         *   for pages in the kernel object range:
2040         *     - drops the swap slot
2041         *     - uvm_pagefree the page
2042         *
2043         * note there is a version of uvm_km_pgremove() that
2044         * is used for "intrsafe" objects.
2045         */
2046        /*
2047         * remove mappings from pmap and drop the pages
2048         * from the object.  offsets are always relative
2049         * to vm_map_min(kernel_map).
2050         */
2051        pmap_remove(pmap_kernel(), entry->start, entry->end);
2052        uvm_km_pgremove(entry->object.uvm_obj,
2053            entry->start - vm_map_min(kernel_map),
2054            entry->end - vm_map_min(kernel_map));
2055
2056        /*
2057         * null out kernel_object reference, we've just
2058         * dropped it
2059         */
2060        entry->etype &= ~UVM_ET_OBJ;
2061        entry->object.uvm_obj = NULL;  /* to be safe */
2062    } else {
2063        /* remove mappings the standard way. */
2064        pmap_remove(map->pmap, entry->start, entry->end);
2065    }
2066}
2067
2068/*
2069 * Remove all entries from start to end.
2070 *
2071 * If remove_holes, then remove ET_HOLE entries as well.
2072 * If markfree, entry will be properly marked free; otherwise, no replacement
2073 * entry will be put in the tree (corrupting the tree).
2074 */
2075void
2076uvm_unmap_remove(struct vm_map *map, vaddr_t start, vaddr_t end,
2077    struct uvm_map_deadq *dead, boolean_t remove_holes,
2078    boolean_t markfree)
2079{
2080    struct vm_map_entry *prev_hint, *next, *entry;
2081
2082    start = MAX(start, map->min_offset);
2083    end = MIN(end, map->max_offset);
2084    if (start >= end)
2085        return;
2086
2087    if ((map->flags & VM_MAP_INTRSAFE) == 0)
2088        splassert(IPL_NONE);
2089    else
2090        splassert(IPL_VM);
2091
2092    /* Find first affected entry. */
2093    entry = uvm_map_entrybyaddr(&map->addr, start);
2094    KDASSERT(entry != NULL && entry->start <= start);
2095    if (entry->end <= start && markfree)
2096        entry = RBT_NEXT(uvm_map_addr, entry);
2097    else
2098        UVM_MAP_CLIP_START(map, entry, start);
2099
2100    /*
2101     * Iterate entries until we reach end address.
2102     * prev_hint hints where the freed space can be appended to.
2103     */
2104    prev_hint = NULL;
2105    for (; entry != NULL && entry->start < end; entry = next) {
2106        KDASSERT(entry->start >= start);
2107        if (entry->end > end || !markfree)
2108            UVM_MAP_CLIP_END(map, entry, end);
2109        KDASSERT(entry->start >= start && entry->end <= end);
2110        next = RBT_NEXT(uvm_map_addr, entry);
2111
2112        /* Don't remove holes unless asked to do so. */
2113        if (UVM_ET_ISHOLE(entry)) {
2114            if (!remove_holes) {
2115                prev_hint = entry;
2116                continue;
2117            }
2118        }
2119
2120        /* A stack has been removed. */
2121        if (UVM_ET_ISSTACK(entry) && (map->flags & VM_MAP_ISVMSPACE))
2122            map->serial++;
2123
2124        /* Kill entry. */
2125        uvm_unmap_kill_entry(map, entry);
2126
2127        /* Update space usage. */
2128        if ((map->flags & VM_MAP_ISVMSPACE) &&
2129            entry->object.uvm_obj == NULL &&
2130            !UVM_ET_ISHOLE(entry)) {
2131            ((struct vmspace *)map)->vm_dused -=
2132                uvmspace_dused(map, entry->start, entry->end);
2133        }
2134        if (!UVM_ET_ISHOLE(entry))
2135            map->size -= entry->end - entry->start;
2136
2137        /* Actual removal of entry. */
2138        uvm_mapent_mkfree(map, entry, &prev_hint, dead, markfree);
2139    }
2140
2141    pmap_update(vm_map_pmap(map));
2142
2143#ifdef VMMAP_DEBUG
2144    if (markfree) {
2145        for (entry = uvm_map_entrybyaddr(&map->addr, start);
2146            entry != NULL && entry->start < end;
2147            entry = RBT_NEXT(uvm_map_addr, entry)) {
2148            KDASSERT(entry->end <= start ||
2149                entry->start == entry->end ||
2150                UVM_ET_ISHOLE(entry));
2151        }
2152    } else {
2153        vaddr_t a;
2154        for (a = start; a < end; a += PAGE_SIZE)
2155            KDASSERT(uvm_map_entrybyaddr(&map->addr, a) == NULL);
2156    }
2157#endif
2158}
2159
2160/*
2161 * Mark all entries from first until end (exclusive) as pageable.
2162 *
2163 * Lock must be exclusive on entry and will not be touched.
2164 */
2165void
2166uvm_map_pageable_pgon(struct vm_map *map, struct vm_map_entry *first,
2167    struct vm_map_entry *end, vaddr_t start_addr, vaddr_t end_addr)
2168{
2169    struct vm_map_entry *iter;
2170
2171    for (iter = first; iter != end;
2172        iter = RBT_NEXT(uvm_map_addr, iter)) {
2173        KDASSERT(iter->start >= start_addr && iter->end <= end_addr);
2174        if (!VM_MAPENT_ISWIRED(iter) || UVM_ET_ISHOLE(iter))
2175            continue;
2176
2177        iter->wired_count = 0;
2178        uvm_fault_unwire_locked(map, iter->start, iter->end);
2179    }
2180}
2181
2182/*
2183 * Mark all entries from first until end (exclusive) as wired.
2184 *
2185 * Lockflags determines the lock state on return from this function.
2186 * Lock must be exclusive on entry.
2187 */
2188int
2189uvm_map_pageable_wire(struct vm_map *map, struct vm_map_entry *first,
2190    struct vm_map_entry *end, vaddr_t start_addr, vaddr_t end_addr,
2191    int lockflags)
2192{
2193    struct vm_map_entry *iter;
2194#ifdef DIAGNOSTIC
2195    unsigned int timestamp_save;
2196#endif
2197    int error;
2198
2199    /*
2200     * Wire pages in two passes:
2201     *
2202     * 1: holding the write lock, we create any anonymous maps that need
2203     *    to be created.  then we clip each map entry to the region to
2204     *    be wired and increment its wiring count.
2205     *
2206     * 2: we downgrade to a read lock, and call uvm_fault_wire to fault
2207     *    in the pages for any newly wired area (wired_count == 1).
2208     *
2209     *    downgrading to a read lock for uvm_fault_wire avoids a possible
2210     *    deadlock with another thread that may have faulted on one of
2211     *    the pages to be wired (it would mark the page busy, blocking
2212     *    us, then in turn block on the map lock that we hold).
2213     *    because we keep the read lock on the map, the copy-on-write
2214     *    status of the entries we modify here cannot change.
2215     */
2216    for (iter = first; iter != end;
2217        iter = RBT_NEXT(uvm_map_addr, iter)) {
2218        KDASSERT(iter->start >= start_addr && iter->end <= end_addr);
2219        if (UVM_ET_ISHOLE(iter) || iter->start == iter->end ||
2220            iter->protection == PROT_NONE)
2221            continue;
2222
2223        /*
2224         * Perform actions of vm_map_lookup that need the write lock.
2225         * - create an anonymous map for copy-on-write
2226         * - anonymous map for zero-fill
2227         * Skip submaps.
2228         */
2229        if (!VM_MAPENT_ISWIRED(iter) && !UVM_ET_ISSUBMAP(iter) &&
2230            UVM_ET_ISNEEDSCOPY(iter) &&
2231            ((iter->protection & PROT_WRITE) ||
2232            iter->object.uvm_obj == NULL)) {
2233            amap_copy(map, iter, M_WAITOK,
2234                UVM_ET_ISSTACK(iter) ? FALSE : TRUE,
2235                iter->start, iter->end);
2236        }
2237        iter->wired_count++;
2238    }
2239
2240    /*
2241     * Pass 2.
2242     */
2243#ifdef DIAGNOSTIC
2244    timestamp_save = map->timestamp;
2245#endif
2246    vm_map_busy(map);
2247    vm_map_downgrade(map);
2248
2249    error = 0;
2250    for (iter = first; error == 0 && iter != end;
2251        iter = RBT_NEXT(uvm_map_addr, iter)) {
2252        if (UVM_ET_ISHOLE(iter) || iter->start == iter->end ||
2253            iter->protection == PROT_NONE)
2254            continue;
2255
2256        error = uvm_fault_wire(map, iter->start, iter->end,
2257            iter->protection);
2258    }
2259
2260    if (error) {
2261        /*
2262         * uvm_fault_wire failure
2263         *
2264         * Reacquire lock and undo our work.
2265         */
2266        vm_map_upgrade(map);
2267        vm_map_unbusy(map);
2268#ifdef DIAGNOSTIC
2269        if (timestamp_save != map->timestamp)
2270            panic("uvm_map_pageable_wire: stale map");
2271#endif
2272
2273        /*
2274         * first is no longer needed to restart loops.
2275         * Use it as iterator to unmap successful mappings.
2276         */
2277        for (; first != iter;
2278            first = RBT_NEXT(uvm_map_addr, first)) {
2279            if (UVM_ET_ISHOLE(first) ||
2280                first->start == first->end ||
2281                first->protection == PROT_NONE)
2282                continue;
2283
2284            first->wired_count--;
2285            if (!VM_MAPENT_ISWIRED(first)) {
2286                uvm_fault_unwire_locked(map,
2287                    first->start, first->end);
2288            }
2289        }
2290
2291        /* decrease counter in the rest of the entries */
2292        for (; iter != end;
2293            iter = RBT_NEXT(uvm_map_addr, iter)) {
2294            if (UVM_ET_ISHOLE(iter) || iter->start == iter->end ||
2295                iter->protection == PROT_NONE)
2296                continue;
2297
2298            iter->wired_count--;
2299        }
2300
2301        if ((lockflags & UVM_LK_EXIT) == 0)
2302            vm_map_unlock(map);
2303        return error;
2304    }
2305
2306    /* We are currently holding a read lock. */
2307    if ((lockflags & UVM_LK_EXIT) == 0) {
2308        vm_map_unbusy(map);
2309        vm_map_unlock_read(map);
2310    } else {
2311        vm_map_upgrade(map);
2312        vm_map_unbusy(map);
2313#ifdef DIAGNOSTIC
2314        if (timestamp_save != map->timestamp)
2315            panic("uvm_map_pageable_wire: stale map");
2316#endif
2317    }
2318    return 0;
2319}
2320
2321/*
2322 * uvm_map_pageable: set pageability of a range in a map.
2323 *
2324 * Flags:
2325 * UVM_LK_ENTER: map is already locked by caller
2326 * UVM_LK_EXIT:  don't unlock map on exit
2327 *
2328 * The full range must be in use (entries may not have fspace != 0).
2329 * UVM_ET_HOLE counts as unmapped.
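 *
 * Wiring (new_pageable == FALSE) faults in and pins every page in the
 * range, mlock(2)-style; unwiring makes the pages pageable again,
 * munlock(2)-style.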
2330 */
2331int
2332uvm_map_pageable(struct vm_map *map, vaddr_t start, vaddr_t end,
2333    boolean_t new_pageable, int lockflags)
2334{
2335    struct vm_map_entry *first, *last, *tmp;
2336    int error;
2337
2338    start = trunc_page(start);
2339    end = round_page(end);
2340
2341    if (start > end)
2342        return EINVAL;
2343    if (start == end)
2344        return 0;   /* nothing to do */
2345    if (start < map->min_offset)
2346        return EFAULT; /* why? see first XXX below */
2347    if (end > map->max_offset)
2348        return EINVAL; /* why? see second XXX below */
2349
2350    KASSERT(map->flags & VM_MAP_PAGEABLE);
2351    if ((lockflags & UVM_LK_ENTER) == 0)
2352        vm_map_lock(map);
2353
2354    /*
2355     * Find first entry.
2356     *
2357     * Initial test on start is different, because of the different
2358     * error returned. Rest is tested further down.
2359     */
2360    first = uvm_map_entrybyaddr(&map->addr, start);
2361    if (first->end <= start || UVM_ET_ISHOLE(first)) {
2362        /*
2363         * XXX if the first address is not mapped, it is EFAULT?
2364         */
2365        error = EFAULT;
2366        goto out;
2367    }
2368
2369    /* Check that the range has no holes. */
2370    for (last = first; last != NULL && last->start < end;
2371        last = RBT_NEXT(uvm_map_addr, last)) {
2372        if (UVM_ET_ISHOLE(last) ||
2373            (last->end < end && VMMAP_FREE_END(last) != last->end)) {
2374            /*
2375             * XXX unmapped memory in range, why is it EINVAL
2376             * instead of EFAULT?
2377             */
2378            error = EINVAL;
2379            goto out;
2380        }
2381    }
2382
2383    /*
2384     * Last ended at the first entry after the range.
2385     * Move back one step.
2386     *
2387     * Note that last may be NULL.
2388     */
2389    if (last == NULL) {
2390        last = RBT_MAX(uvm_map_addr, &map->addr);
2391        if (last->end < end) {
2392            error = EINVAL;
2393            goto out;
2394        }
2395    } else {
2396        KASSERT(last != first);
2397        last = RBT_PREV(uvm_map_addr, last);
2398    }
2399
2400    /* Wire/unwire pages here. */
2401    if (new_pageable) {
2402        /*
2403         * Mark pageable.
2404         * entries that are not wired are untouched.
2405         */
2406        if (VM_MAPENT_ISWIRED(first))
2407            UVM_MAP_CLIP_START(map, first, start);
2408        /*
2409         * Split last at end.
2410         * Make tmp be the first entry after what is to be touched.
2411         * If last is not wired, don't touch it.
2412         */
2413        if (VM_MAPENT_ISWIRED(last)) {
2414            UVM_MAP_CLIP_END(map, last, end);
2415            tmp = RBT_NEXT(uvm_map_addr, last);
2416        } else
2417            tmp = last;
2418
2419        uvm_map_pageable_pgon(map, first, tmp, start, end);
2420        error = 0;
2421
2422out:
2423        if ((lockflags & UVM_LK_EXIT) == 0)
2424            vm_map_unlock(map);
2425        return error;
2426    } else {
2427        /*
2428         * Mark entries wired.
2429         * entries are always touched (because recovery needs this).
2430         */
2431        if (!VM_MAPENT_ISWIRED(first))
2432            UVM_MAP_CLIP_START(map, first, start);
2433        /*
2434         * Split last at end.
2435         * Make tmp be the first entry after what is to be touched.
2436         * If last is not wired, don't touch it.
2437         */
2438        if (!VM_MAPENT_ISWIRED(last)) {
2439            UVM_MAP_CLIP_END(map, last, end);
2440            tmp = RBT_NEXT(uvm_map_addr, last);
2441        } else
2442            tmp = last;
2443
2444        return uvm_map_pageable_wire(map, first, tmp, start, end,
2445            lockflags);
2446    }
2447}
2448
2449/*
2450 * uvm_map_pageable_all: special case of uvm_map_pageable - affects
2451 * all mapped regions.
2452 *
2453 * Map must not be locked.
2454 * If no flags are specified, all regions are unwired.
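 * Typically reached for mlockall(2)/munlockall(2)-style requests.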
2455 */
2456int
2457uvm_map_pageable_all(struct vm_map *map, int flags, vsize_t limit)
2458{
2459    vsize_t size;
2460    struct vm_map_entry *iter;
2461
2462    KASSERT(map->flags & VM_MAP_PAGEABLE);
2463    vm_map_lock(map);
2464
2465    if (flags == 0) {
2466        uvm_map_pageable_pgon(map, RBT_MIN(uvm_map_addr, &map->addr),
2467            NULL, map->min_offset, map->max_offset);
2468
2469        vm_map_modflags(map, 0, VM_MAP_WIREFUTURE);
2470        vm_map_unlock(map);
2471        return 0;
2472    }
2473
2474    if (flags & MCL_FUTURE)
2475        vm_map_modflags(map, VM_MAP_WIREFUTURE, 0);
2476    if (!(flags & MCL_CURRENT)) {
2477        vm_map_unlock(map);
2478        return 0;
2479    }
2480
2481    /*
2482     * Count number of pages in all non-wired entries.
2483     * If the number exceeds the limit, abort.
2484     */
2485    size = 0;
2486    RBT_FOREACH(iter, uvm_map_addr, &map->addr) {
2487        if (VM_MAPENT_ISWIRED(iter) || UVM_ET_ISHOLE(iter))
2488            continue;
2489
2490        size += iter->end - iter->start;
2491    }
2492
2493    if (atop(size) + uvmexp.wired > uvmexp.wiredmax) {
2494        vm_map_unlock(map);
2495        return ENOMEM;
2496    }
2497
2498    /* XXX non-pmap_wired_count case must be handled by caller */
2499#ifdef pmap_wired_count
2500    if (limit != 0 &&
2501        size + ptoa(pmap_wired_count(vm_map_pmap(map))) > limit) {
2502        vm_map_unlock(map);
2503        return ENOMEM;
2504    }
2505#endif
2506
2507    /*
2508     * uvm_map_pageable_wire will release the lock.
2509     */
2510    return uvm_map_pageable_wire(map, RBT_MIN(uvm_map_addr, &map->addr),
2511        NULL, map->min_offset, map->max_offset, 0);
2512}
2513
2514/*
2515 * Initialize map.
2516 *
2517 * Allocates sufficient entries to describe the free memory in the map.
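 *
 * uvmspace_init() below, for example, uses this to set up the map of a
 * newly created vmspace.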
2518 */
2519void
2520uvm_map_setup(struct vm_map *map, vaddr_t min, vaddr_t max, int flags)
2521{
2522    int i;
2523
2524    KASSERT((min & (vaddr_t)PAGE_MASK) == 0);
2525    KASSERT((max & (vaddr_t)PAGE_MASK) == 0 ||
2526        (max & (vaddr_t)PAGE_MASK) == (vaddr_t)PAGE_MASK);
2527
2528    /*
2529     * Update parameters.
2530     *
2531     * This code handles (vaddr_t)-1 and other page mask ending addresses
2532     * properly.
2533     * We lose the top page if the full virtual address space is used.
2534     */
2535    if (max & (vaddr_t)PAGE_MASK) {
2536        max += 1;
2537        if (max == 0) /* overflow */
2538            max -= PAGE_SIZE;
2539    }
2540
2541    RBT_INIT(uvm_map_addr, &map->addr);
2542    map->uaddr_exe = NULL;
2543    for (i = 0; i < nitems(map->uaddr_any); ++i)
2544        map->uaddr_any[i] = NULL;
2545    map->uaddr_brk_stack = NULL;
2546
2547    map->size = 0;
2548    map->ref_count = 0;
2549    map->min_offset = min;
2550    map->max_offset = max;
2551    map->b_start = map->b_end = 0; /* Empty brk() area by default. */
2552    map->s_start = map->s_end = 0; /* Empty stack area by default. */
2553    map->flags = flags;
2554    map->timestamp = 0;
2555    rw_init_flags(&map->lock, "vmmaplk", RWL_DUPOK);
2556    mtx_init(&map->mtx, IPL_VM);
2557    mtx_init(&map->flags_lock, IPL_VM);
2558
2559    /* Configure the allocators. */
2560    if (flags & VM_MAP_ISVMSPACE)
2561        uvm_map_setup_md(map);
2562    else
2563        map->uaddr_any[3] = &uaddr_kbootstrap;
2564
2565    /*
2566     * Fill map entries.
2567     * We do not need to write-lock the map here because only the current
2568     * thread sees it right now. Initialize ref_count to 0 above to avoid
2569     * bogus triggering of lock-not-held assertions.
2570     */
2571    uvm_map_setup_entries(map);
2572    uvm_tree_sanity(map, __FILE__, __LINE__);
2573    map->ref_count = 1;
2574}
2575
2576/*
2577 * Destroy the map.
2578 *
2579 * This is the inverse operation to uvm_map_setup.
2580 */
2581void
2582uvm_map_teardown(struct vm_map *map)
2583{
2584    struct uvm_map_deadq     dead_entries;
2585    struct vm_map_entry *entry, *tmp;
2586#ifdef VMMAP_DEBUG
2587    size_t           numq, numt;
2588#endif
2589    int          i;
2590
2591    KERNEL_ASSERT_LOCKED();
2592    KERNEL_UNLOCK();
2593    KERNEL_ASSERT_UNLOCKED();
2594
2595    KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
2596
2597    /* Remove address selectors. */
2598    uvm_addr_destroy(map->uaddr_exe);
2599    map->uaddr_exe = NULL;
2600    for (i = 0; i < nitems(map->uaddr_any); i++) {
2601        uvm_addr_destroy(map->uaddr_any[i]);
2602        map->uaddr_any[i] = NULL;
2603    }
2604    uvm_addr_destroy(map->uaddr_brk_stack);
2605    map->uaddr_brk_stack = NULL;
2606
2607    /*
2608     * Remove entries.
2609     *
2610     * The following is based on graph breadth-first search.
2611     *
2612     * In color terms:
2613     * - the dead_entries set contains all nodes that are reachable
2614     *   (i.e. both the black and the grey nodes)
2615     * - any entry not in dead_entries is white
2616     * - any entry that appears in dead_entries before entry,
2617     *   is black, the rest is grey.
2618     * The set [entry, end] is also referred to as the wavefront.
2619     *
2620     * Since the tree is always a fully connected graph, the breadth-first
2621     * search guarantees that each vm_map_entry is visited exactly once.
2622     * The vm_map is broken down in linear time.
2623     */
2624    TAILQ_INIT(&dead_entries);
2625    if ((entry = RBT_ROOT(uvm_map_addr, &map->addr)) != NULL)
2626        DEAD_ENTRY_PUSH(&dead_entries, entry);
2627    while (entry != NULL) {
2628        sched_pause(yield);
2629        uvm_unmap_kill_entry(map, entry);
2630        if ((tmp = RBT_LEFT(uvm_map_addr, entry)) != NULL)
2631            DEAD_ENTRY_PUSH(&dead_entries, tmp);
2632        if ((tmp = RBT_RIGHT(uvm_map_addr, entry)) != NULL)
2633            DEAD_ENTRY_PUSH(&dead_entries, tmp);
2634        /* Update wave-front. */
2635        entry = TAILQ_NEXT(entry, dfree.deadq);
2636    }
2637
2638#ifdef VMMAP_DEBUG
2639    numt = numq = 0;
2640    RBT_FOREACH(entry, uvm_map_addr, &map->addr)
2641        numt++;
2642    TAILQ_FOREACH(entry, &dead_entries, dfree.deadq)
2643        numq++;
2644    KASSERT(numt == numq);
2645#endif
2646    uvm_unmap_detach(&dead_entries, UVM_PLA_WAITOK);
2647
2648    KERNEL_LOCK();
2649
2650    pmap_destroy(map->pmap);
2651    map->pmap = NULL;
2652}
2653
2654/*
2655 * Populate map with free-memory entries.
2656 *
2657 * Map must be initialized and empty.
2658 */
2659void
2660uvm_map_setup_entries(struct vm_map *map)
2661{
2662    KDASSERT(RBT_EMPTY(uvm_map_addr, &map->addr));
2663
2664    uvm_map_fix_space(map, NULL, map->min_offset, map->max_offset, 0);
2665}
2666
2667/*
2668 * Split entry at given address.
2669 *
2670 * orig:  entry that is to be split.
2671 * next:  a newly allocated map entry that is not linked.
2672 * split: address at which the split is done.
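 *
 * If split lies at or beyond orig->end, only the trailing free space is
 * moved into next; otherwise the mapped region itself is split and the
 * amap, submap or object references are adjusted for the new entry.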
2673 */
2674void
2675uvm_map_splitentry(struct vm_map *map, struct vm_map_entry *orig,
2676    struct vm_map_entry *next, vaddr_t split)
2677{
2678    struct uvm_addr_state *free, *free_before;
2679    vsize_t adj;
2680
2681    if ((split & PAGE_MASK) != 0) {
2682        panic("uvm_map_splitentry: split address 0x%lx "
2683            "not on page boundary!", split);
2684    }
2685    KDASSERT(map != NULL && orig != NULL && next != NULL);
2686    uvm_tree_sanity(map, __FILE__, __LINE__);
2687    KASSERT(orig->start < split && VMMAP_FREE_END(orig) > split);
2688
2689#ifdef VMMAP_DEBUG
2690    KDASSERT(RBT_FIND(uvm_map_addr, &map->addr, orig) == orig);
2691    KDASSERT(RBT_FIND(uvm_map_addr, &map->addr, next) != next);
2692#endif /* VMMAP_DEBUG */
2693
2694    /*
2695     * Free space will change, unlink from free space tree.
2696     */
2697    free = uvm_map_uaddr_e(map, orig);
2698    uvm_mapent_free_remove(map, free, orig);
2699
2700    adj = split - orig->start;
2701
2702    uvm_mapent_copy(orig, next);
2703    if (split >= orig->end) {
2704        next->etype = 0;
2705        next->offset = 0;
2706        next->wired_count = 0;
2707        next->start = next->end = split;
2708        next->guard = 0;
2709        next->fspace = VMMAP_FREE_END(orig) - split;
2710        next->aref.ar_amap = NULL;
2711        next->aref.ar_pageoff = 0;
2712        orig->guard = MIN(orig->guard, split - orig->end);
2713        orig->fspace = split - VMMAP_FREE_START(orig);
2714    } else {
2715        orig->fspace = 0;
2716        orig->guard = 0;
2717        orig->end = next->start = split;
2718
2719        if (next->aref.ar_amap) {
2720            KERNEL_LOCK();
2721            amap_splitref(&orig->aref, &next->aref, adj);
2722            KERNEL_UNLOCK();
2723        }
2724        if (UVM_ET_ISSUBMAP(orig)) {
2725            uvm_map_reference(next->object.sub_map);
2726            next->offset += adj;
2727        } else if (UVM_ET_ISOBJ(orig)) {
2728            if (next->object.uvm_obj->pgops &&
2729                next->object.uvm_obj->pgops->pgo_reference) {
2730                KERNEL_LOCK();
2731                next->object.uvm_obj->pgops->pgo_reference(
2732                    next->object.uvm_obj);
2733                KERNEL_UNLOCK();
2734            }
2735            next->offset += adj;
2736        }
2737    }
2738
2739    /*
2740     * Link next into address tree.
2741     * Link orig and next into free-space tree.
2742     *
2743     * Don't insert 'next' into the addr tree until orig has been linked,
2744     * in case the free-list looks at adjacent entries in the addr tree
2745     * for its decisions.
2746     */
2747    if (orig->fspace > 0)
2748        free_before = free;
2749    else
2750        free_before = uvm_map_uaddr_e(map, orig);
2751    uvm_mapent_free_insert(map, free_before, orig);
2752    uvm_mapent_addr_insert(map, next);
2753    uvm_mapent_free_insert(map, free, next);
2754
2755    uvm_tree_sanity(map, __FILE__, __LINE__);
2756}
2757
2758
2759#ifdef VMMAP_DEBUG
2760
2761void
2762uvm_tree_assert(struct vm_map *map, int test, char *test_str,
2763    char *file, int line)
2764{
2765    char* map_special;
2766
2767    if (test)
2768        return;
2769
2770    if (map == kernel_map)
2771        map_special = " (kernel_map)";
2772    else if (map == kmem_map)
2773        map_special = " (kmem_map)";
2774    else
2775        map_special = "";
2776    panic("uvm_tree_sanity %p%s (%s %d): %s", map, map_special, file,
2777        line, test_str);
2778}
2779
2780/*
2781 * Check that map is sane.
2782 */
2783void
2784uvm_tree_sanity(struct vm_map *map, char *file, int line)
2785{
2786    struct vm_map_entry *iter;
2787    vaddr_t          addr;
2788    vaddr_t          min, max, bound; /* Bounds checker. */
2789    struct uvm_addr_state   *free;
2790
2791    addr = vm_map_min(map);
2792    RBT_FOREACH(iter, uvm_map_addr, &map->addr) {
2793        /*
2794         * Valid start, end.
2795         * Catch overflow for end+fspace.
2796         */
2797        UVM_ASSERT(map, iter->end >= iter->start, file, line);
2798        UVM_ASSERT(map, VMMAP_FREE_END(iter) >= iter->end, file, line);
2799
2800        /* May not be empty. */
2801        UVM_ASSERT(map, iter->start < VMMAP_FREE_END(iter),
2802            file, line);
2803
2804        /* Addresses for entry must lie within map boundaries. */
2805        UVM_ASSERT(map, iter->start >= vm_map_min(map) &&
2806            VMMAP_FREE_END(iter) <= vm_map_max(map), file, line);
2807
2808        /* Tree may not have gaps. */
2809        UVM_ASSERT(map, iter->start == addr, file, line);
2810        addr = VMMAP_FREE_END(iter);
2811
2812        /*
2813         * Free space may not cross boundaries, unless the same
2814         * free list is used on both sides of the border.
2815         */
2816        min = VMMAP_FREE_START(iter);
2817        max = VMMAP_FREE_END(iter);
2818
2819        while (min < max &&
2820            (bound = uvm_map_boundary(map, min, max)) != max) {
2821            UVM_ASSERT(map,
2822                uvm_map_uaddr(map, bound - 1) ==
2823                uvm_map_uaddr(map, bound),
2824                file, line);
2825            min = bound;
2826        }
2827
2828        free = uvm_map_uaddr_e(map, iter);
2829        if (free) {
2830            UVM_ASSERT(map, (iter->etype & UVM_ET_FREEMAPPED) != 0,
2831                file, line);
2832        } else {
2833            UVM_ASSERT(map, (iter->etype & UVM_ET_FREEMAPPED) == 0,
2834                file, line);
2835        }
2836    }
2837    UVM_ASSERT(map, addr == vm_map_max(map), file, line);
2838}
2839
2840void
2841uvm_tree_size_chk(struct vm_map *map, char *file, int line)
2842{
2843    struct vm_map_entry *iter;
2844    vsize_t size;
2845
2846    size = 0;
2847    RBT_FOREACH(iter, uvm_map_addr, &map->addr) {
2848        if (!UVM_ET_ISHOLE(iter))
2849            size += iter->end - iter->start;
2850    }
2851
2852    if (map->size != size)
2853        printf("map size = 0x%lx, should be 0x%lx\n", map->size, size);
2854    UVM_ASSERT(map, map->size == size, file, line);
2855
2856    vmspace_validate(map);
2857}
2858
2859/*
2860 * This function validates the statistics on vmspace.
2861 */
2862void
2863vmspace_validate(struct vm_map *map)
2864{
2865    struct vmspace *vm;
2866    struct vm_map_entry *iter;
2867    vaddr_t imin, imax;
2868    vaddr_t stack_begin, stack_end; /* Position of stack. */
2869    vsize_t stack, heap; /* Measured sizes. */
2870
2871    if (!(map->flags & VM_MAP_ISVMSPACE))
2872        return;
2873
2874    vm = (struct vmspace *)map;
2875    stack_begin = MIN((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
2876    stack_end = MAX((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
2877
2878    stack = heap = 0;
2879    RBT_FOREACH(iter, uvm_map_addr, &map->addr) {
2880        imin = imax = iter->start;
2881
2882        if (UVM_ET_ISHOLE(iter) || iter->object.uvm_obj != NULL)
2883            continue;
2884
2885        /*
2886         * Update stack, heap.
2887         * Keep in mind that (theoretically) the entries of
2888         * userspace and stack may be joined.
2889         */
2890        while (imin != iter->end) {
2891            /*
2892             * Set imax to the first boundary crossed between
2893             * imin and stack addresses.
2894             */
2895            imax = iter->end;
2896            if (imin < stack_begin && imax > stack_begin)
2897                imax = stack_begin;
2898            else if (imin < stack_end && imax > stack_end)
2899                imax = stack_end;
2900
2901            if (imin >= stack_begin && imin < stack_end)
2902                stack += imax - imin;
2903            else
2904                heap += imax - imin;
2905            imin = imax;
2906        }
2907    }
2908
2909    heap >>= PAGE_SHIFT;
2910    if (heap != vm->vm_dused) {
2911        printf("vmspace stack range: 0x%lx-0x%lx\n",
2912            stack_begin, stack_end);
2913        panic("vmspace_validate: vmspace.vm_dused invalid, "
2914            "expected %ld pgs, got %ld pgs in map %p",
2915            heap, vm->vm_dused,
2916            map);
2917    }
2918}
2919
2920#endif /* VMMAP_DEBUG */
2921
2922/*
2923 * uvm_map_init: init mapping system at boot time.   note that we allocate
2924 * and init the static pool of structs vm_map_entry for the kernel here.
2925 */
2926void
2927uvm_map_init(void)
2928{
2929    static struct vm_map_entry kernel_map_entry[MAX_KMAPENT];
2930    int lcv;
2931
2932    /* now set up static pool of kernel map entries ... */
2933    mtx_init(&uvm_kmapent_mtx, IPL_VM);
2934    SLIST_INIT(&uvm.kentry_free);
2935    for (lcv = 0 ; lcv < MAX_KMAPENT ; lcv++) {
2936        SLIST_INSERT_HEAD(&uvm.kentry_free,
2937            &kernel_map_entry[lcv], daddrs.addr_kentry);
2938    }
2939
2940    /* initialize the map-related pools. */
2941    pool_init(&uvm_vmspace_pool, sizeof(struct vmspace), 0,
2942        IPL_NONE, PR_WAITOK, "vmsppl", NULL);
2943    pool_init(&uvm_map_entry_pool, sizeof(struct vm_map_entry), 0,
2944        IPL_VM, PR_WAITOK, "vmmpepl", NULL);
2945    pool_init(&uvm_map_entry_kmem_pool, sizeof(struct vm_map_entry), 0,
2946        IPL_VM, 0, "vmmpekpl", NULL);
2947    pool_sethiwat(&uvm_map_entry_pool, 8192);
2948
2949    uvm_addr_init();
2950}
2951
2952#if defined(DDB)
2953
2954/*
2955 * DDB hooks
2956 */
2957
2958/*
2959 * uvm_map_printit: actually prints the map
2960 */
2961void
2962uvm_map_printit(struct vm_map *map, boolean_t full,
2963    int (*pr)(const char *, ...))
2964{
2965    struct vmspace          *vm;
2966    struct vm_map_entry     *entry;
2967    struct uvm_addr_state       *free;
2968    int              in_free, i;
2969    char                 buf[8];
2970
2971    (*pr)("MAP %p: [0x%lx->0x%lx]\n", map, map->min_offset,map->max_offset);
2972    (*pr)("\tbrk() allocate range: 0x%lx-0x%lx\n",
2973        map->b_start, map->b_end);
2974    (*pr)("\tstack allocate range: 0x%lx-0x%lx\n",
2975        map->s_start, map->s_end);
2976    (*pr)("\tsz=%u, ref=%d, version=%u, flags=0x%x\n",
2977        map->size, map->ref_count, map->timestamp,
2978        map->flags);
2979    (*pr)("\tpmap=%p(resident=%d)\n", map->pmap,
2980        pmap_resident_count(map->pmap));
2981
2982    /* struct vmspace handling. */
2983    if (map->flags & VM_MAP_ISVMSPACE) {
2984        vm = (struct vmspace *)map;
2985
2986        (*pr)("\tvm_refcnt=%d vm_shm=%p vm_rssize=%u vm_swrss=%u\n",
2987            vm->vm_refcnt, vm->vm_shm, vm->vm_rssize, vm->vm_swrss);
2988        (*pr)("\tvm_tsize=%u vm_dsize=%u\n",
2989            vm->vm_tsize, vm->vm_dsize);
2990        (*pr)("\tvm_taddr=%p vm_daddr=%p\n",
2991            vm->vm_taddr, vm->vm_daddr);
2992        (*pr)("\tvm_maxsaddr=%p vm_minsaddr=%p\n",
2993            vm->vm_maxsaddr, vm->vm_minsaddr);
2994    }
2995
2996    if (!full)
2997        goto print_uaddr;
2998    RBT_FOREACH(entry, uvm_map_addr, &map->addr) {
2999        (*pr)(" - %p: 0x%lx->0x%lx: obj=%p/0x%llx, amap=%p/%d\n",
3000            entry, entry->start, entry->end, entry->object.uvm_obj,
3001            (long long)entry->offset, entry->aref.ar_amap,
3002            entry->aref.ar_pageoff);
3003        (*pr)("\tsubmap=%c, cow=%c, nc=%c, stack=%c, prot(max)=%d/%d, inh=%d, "
3004            "wc=%d, adv=%d\n",
3005            (entry->etype & UVM_ET_SUBMAP) ? 'T' : 'F',
3006            (entry->etype & UVM_ET_COPYONWRITE) ? 'T' : 'F',
3007            (entry->etype & UVM_ET_NEEDSCOPY) ? 'T' : 'F',
3008            (entry->etype & UVM_ET_STACK) ? 'T' : 'F',
3009            entry->protection, entry->max_protection,
3010            entry->inheritance, entry->wired_count, entry->advice);
3011
3012        free = uvm_map_uaddr_e(map, entry);
3013        in_free = (free != NULL);
3014        (*pr)("\thole=%c, free=%c, guard=0x%lx, "
3015            "free=0x%lx-0x%lx\n",
3016            (entry->etype & UVM_ET_HOLE) ? 'T' : 'F',
3017            in_free ? 'T' : 'F',
3018            entry->guard,
3019            VMMAP_FREE_START(entry), VMMAP_FREE_END(entry));
3020        (*pr)("\tfspace_augment=%lu\n", entry->fspace_augment);
3021        (*pr)("\tfreemapped=%c, uaddr=%p\n",
3022            (entry->etype & UVM_ET_FREEMAPPED) ? 'T' : 'F', free);
3023        if (free) {
3024            (*pr)("\t\t(0x%lx-0x%lx %s)\n",
3025                free->uaddr_minaddr, free->uaddr_maxaddr,
3026                free->uaddr_functions->uaddr_name);
3027        }
3028    }
3029
3030print_uaddr:
3031    uvm_addr_print(map->uaddr_exe, "exe", full, pr);
3032    for (i = 0; i < nitems(map->uaddr_any); i++) {
3033        snprintf(&buf[0], sizeof(buf), "any[%d]", i);
3034        uvm_addr_print(map->uaddr_any[i], &buf[0], full, pr);
3035    }
3036    uvm_addr_print(map->uaddr_brk_stack, "brk/stack", full, pr);
3037}
3038
3039/*
3040 * uvm_object_printit: actually prints the object
3041 */
3042void
3043uvm_object_printit(struct uvm_object *uobj, boolean_t full,
3044    int (*pr)(const char *, ...))
3045{
3048    struct vm_page *pg;
3049    int cnt = 0;
3050
3051    (*pr)("OBJECT %p: pgops=%p, npages=%d, ",
3052        uobj, uobj->pgops, uobj->uo_npages);
3053    if (UVM_OBJ_IS_KERN_OBJECT(uobj))
3054        (*pr)("refs=<SYSTEM>\n");
3055    else
3056        (*pr)("refs=%d\n", uobj->uo_refs);
3057
3058    if (!full) {
3059        return;
3060    }
3061    (*pr)("  PAGES <pg,offset>:\n  ");
3062    RBT_FOREACH(pg, uvm_objtree, &uobj->memt) {
3063        (*pr)("<%p,0x%llx> ", pg, (long long)pg->offset);
3064        if ((cnt % 3) == 2) {
3065            (*pr)("\n  ");
3066        }
3067        cnt++;
3068    }
3069    if ((cnt % 3) != 2) {
3070        (*pr)("\n");
3071    }
3072}
3073
3074/*
3075 * uvm_page_printit: actually print the page
3076 */
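/*
 * page_flagbits is a printf %b format string: the leading \20 selects
 * hexadecimal output and each subsequent octal escape gives the 1-based
 * bit number of the flag name that follows it.
 */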
3077static const char page_flagbits[] =
3078    "\20\1BUSY\2WANTED\3TABLED\4CLEAN\5CLEANCHK\6RELEASED\7FAKE\10RDONLY"
3079    "\11ZERO\12DEV\15PAGER1\21FREE\22INACTIVE\23ACTIVE\25ANON\26AOBJ"
3080    "\27ENCRYPT\31PMAP0\32PMAP1\33PMAP2\34PMAP3\35PMAP4\36PMAP5";
3081
3082void
3083uvm_page_printit(struct vm_page *pg, boolean_t full,
3084    int (*pr)(const char *, ...))
3085{
3088    struct vm_page *tpg;
3089    struct uvm_object *uobj;
3090    struct pglist *pgl;
3091
3092    (*pr)("PAGE %p:\n", pg);
3093    (*pr)("  flags=%b, vers=%d, wire_count=%d, pa=0x%llx\n",
3094        pg->pg_flags, page_flagbits, pg->pg_version, pg->wire_count,
3095        (long long)pg->phys_addr);
3096    (*pr)("  uobject=%p, uanon=%p, offset=0x%llx\n",
3097        pg->uobject, pg->uanon, (long long)pg->offset);
3098#if defined(UVM_PAGE_TRKOWN)
3099    if (pg->pg_flags & PG_BUSY)
3100        (*pr)("  owning thread = %d, tag=%s",
3101            pg->owner, pg->owner_tag);
3102    else
3103        (*pr)("  page not busy, no owner");
3104#else
3105    (*pr)("  [page ownership tracking disabled]");
3106#endif
3107    (*pr)("\tvm_page_md %p\n", &pg->mdpage);
3108
3109    if (!full)
3110        return;
3111
3112    /* cross-verify object/anon */
3113    if ((pg->pg_flags & PQ_FREE) == 0) {
3114        if (pg->pg_flags & PQ_ANON) {
3115            if (pg->uanon == NULL || pg->uanon->an_page != pg)
3116                (*pr)("  >>> ANON DOES NOT POINT HERE <<< (%p)\n",
3117                (pg->uanon) ? pg->uanon->an_page : NULL);
3118            else
3119                (*pr)("  anon backpointer is OK\n");
3120        } else {
3121            uobj = pg->uobject;
3122            if (uobj) {
3123                (*pr)("  checking object list\n");
3124                RBT_FOREACH(tpg, uvm_objtree, &uobj->memt) {
3125                    if (tpg == pg) {
3126                        break;
3127                    }
3128                }
3129                if (tpg)
3130                    (*pr)("  page found on object list\n");
3131                else
3132                    (*pr)("  >>> PAGE NOT FOUND "
3133                        "ON OBJECT LIST! <<<\n");
3134            }
3135        }
3136    }
3137
3138    /* cross-verify page queue */
3139    if (pg->pg_flags & PQ_FREE) {
3140        if (uvm_pmr_isfree(pg))
3141            (*pr)("  page found in uvm_pmemrange\n");
3142        else
3143            (*pr)("  >>> page not found in uvm_pmemrange <<<\n");
3144        pgl = NULL;
3145    } else if (pg->pg_flags & PQ_INACTIVE) {
3146        pgl = (pg->pg_flags & PQ_SWAPBACKED) ?
3147            &uvm.page_inactive_swp : &uvm.page_inactive_obj;
3148    } else if (pg->pg_flags & PQ_ACTIVE) {
3149        pgl = &uvm.page_active;
3150    } else {
3151        pgl = NULL;
3152    }
3153
3154    if (pgl) {
3155        (*pr)("  checking pageq list\n");
3156        TAILQ_FOREACH(tpg, pgl, pageq) {
3157            if (tpg == pg) {
3158                break;
3159            }
3160        }
3161        if (tpg)
3162            (*pr)("  page found on pageq list\n");
3163        else
3164            (*pr)("  >>> PAGE NOT FOUND ON PAGEQ LIST! <<<\n");
3165    }
3166}
3167#endif
3168
3169/*
3170 * uvm_map_protect: change map protection
3171 *
3172 * => set_max means set max_protection.
3173 * => map must be unlocked.
3174 */
3175int
3176uvm_map_protect(struct vm_map *map, vaddr_t start, vaddr_t end,
3177    vm_prot_t new_prot, boolean_t set_max)
3178{
3179    struct vm_map_entry *first, *iter;
3180    vm_prot_t old_prot;
3181    vm_prot_t mask;
3182    int error;
3183
3184    if (start > end)
3185        return EINVAL;
3186    start = MAX(start, map->min_offset);
3187    end = MIN(end, map->max_offset);
3188    if (start >= end)
3189        return 0;
3190
3191    error = 0;
3192    vm_map_lock(map);
3193
3194    /*
3195     * Set up first and last.
3196     * - first will contain first entry at or after start.
3197     */
3198    first = uvm_map_entrybyaddr(&map->addr, start);
3199    KDASSERT(first != NULL);
3200    if (first->end <= start)
3201        first = RBT_NEXT(uvm_map_addr, first);
3202
3203    /* First, check for protection violations. */
3204    for (iter = first; iter != NULL && iter->start < end;
3205        iter = RBT_NEXT(uvm_map_addr, iter)) {
3206        /* Treat memory holes as free space. */
3207        if (iter->start == iter->end || UVM_ET_ISHOLE(iter))
3208            continue;
3209
3210        if (UVM_ET_ISSUBMAP(iter)) {
3211            error = EINVAL;
3212            goto out;
3213        }
3214        if ((new_prot & iter->max_protection) != new_prot) {
3215            error = EACCES;
3216            goto out;
3217        }
3218        if (map == kernel_map &&
3219            (new_prot & (PROT_WRITE | PROT_EXEC)) == (PROT_WRITE | PROT_EXEC))
3220            panic("uvm_map_protect: kernel map W^X violation requested");
3221    }
3222
3223    /* Fix protections.  */
3224    for (iter = first; iter != NULL && iter->start < end;
3225        iter = RBT_NEXT(uvm_map_addr, iter)) {
3226        /* Treat memory holes as free space. */
3227        if (iter->start == iter->end || UVM_ET_ISHOLE(iter))
3228            continue;
3229
3230        old_prot = iter->protection;
3231
3232        /*
3233         * Skip adapting protection iff old and new protection
3234         * are equal.
3235         */
3236        if (set_max) {
3237            if (old_prot == (new_prot & old_prot) &&
3238                iter->max_protection == new_prot)
3239                continue;
3240        } else {
3241            if (old_prot == new_prot)
3242                continue;
3243        }
3244
3245        UVM_MAP_CLIP_START(map, iter, start);
3246        UVM_MAP_CLIP_END(map, iter, end);
3247
3248        if (set_max) {
3249            iter->max_protection = new_prot;
3250            iter->protection &= new_prot;
3251        } else
3252            iter->protection = new_prot;
3253
3254        /*
3255         * update physical map if necessary.  worry about copy-on-write
3256         * here -- CHECK THIS XXX
3257         */
3258        if (iter->protection != old_prot) {
3259            mask = UVM_ET_ISCOPYONWRITE(iter) ?
3260                ~PROT_WRITE : PROT_MASK;
3261
3262            /* update pmap */
3263            if ((iter->protection & mask) == PROT_NONE &&
3264                VM_MAPENT_ISWIRED(iter)) {
3265                /*
3266                 * TODO(ariane) this is stupid. wired_count
3267                 * is 0 if not wired, otherwise anything
3268                 * larger than 0 (incremented once each time
3269                 * wire is called).
3270                 * Mostly to be able to undo the damage on
3271                 * failure; not meant to actually be a wired
3272                 * refcounter...
3273                 * Originally: iter->wired_count--;
3274                 * (don't we have to unwire this in the pmap
3275                 * as well?)
3276                 */
3277                iter->wired_count = 0;
3278            }
3279            pmap_protect(map->pmap, iter->start, iter->end,
3280                iter->protection & mask);
3281        }
3282
3283        /*
3284         * If the map is configured to lock any future mappings,
3285         * wire this entry now if the old protection was PROT_NONE
3286         * and the new protection is not PROT_NONE.
3287         */
3288        if ((map->flags & VM_MAP_WIREFUTURE) != 0 &&
3289            VM_MAPENT_ISWIRED(iter) == 0 &&
3290            old_prot == PROT_NONE &&
3291            new_prot != PROT_NONE) {
3292            if (uvm_map_pageable(map, iter->start, iter->end,
3293                FALSE, UVM_LK_ENTER | UVM_LK_EXIT) != 0) {
3294                /*
3295                 * If locking the entry fails, remember the
3296                 * error if it's the first one.  Note we
3297                 * still continue setting the protection in
3298                 * the map, but it will return the resource
3299                 * storage condition regardless.
3300                 *
3301                 * XXX Ignore what the actual error is,
3302                 * XXX just call it a resource shortage
3303                 * XXX so that it doesn't get confused
3304                 * XXX what uvm_map_protect() itself would
3305                 * XXX normally return.
3306                 */
3307                error = ENOMEM;
3308            }
3309        }
3310    }
3311    pmap_update(map->pmap);
3312
3313out:
3314    vm_map_unlock(map);
3315    return error;
3316}
3317
3318/*
3319 * uvmspace_alloc: allocate a vmspace structure.
3320 *
3321 * - structure includes vm_map and pmap
3322 * - XXX: no locking on this structure
3323 * - refcnt set to 1, rest must be init'd by caller
3324 */
3325struct vmspace *
3326uvmspace_alloc(vaddr_t min, vaddr_t max, boolean_t pageable,
3327    boolean_t remove_holes)
3328{
3329    struct vmspace *vm;
3330
3331    vm = pool_get(&uvm_vmspace_pool, PR_WAITOK | PR_ZERO);
3332    uvmspace_init(vm, NULL, min, max, pageable, remove_holes);
3333    return (vm);
3334}
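/*
 * Example (sketch): a caller needing a fresh pageable vmspace for a
 * page-aligned range might do something like
 *
 *	struct vmspace *vm;
 *
 *	vm = uvmspace_alloc(round_page(minaddr), trunc_page(maxaddr),
 *	    TRUE, TRUE);
 *
 * The caller owns the single reference (vm_refcnt == 1) and releases
 * it with uvmspace_free().
 */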
3335
3336/*
3337 * uvmspace_init: initialize a vmspace structure.
3338 *
3339 * - XXX: no locking on this structure
3340 * - refcnt set to 1, rest must be init'd by caller
3341 */
3342void
3343uvmspace_init(struct vmspace *vm, struct pmap *pmap, vaddr_t min, vaddr_t max,
3344    boolean_t pageable, boolean_t remove_holes)
3345{
3346    KASSERT(pmap == NULL || pmap == pmap_kernel());
3347
3348    if (pmap)
3349        pmap_reference(pmap);
3350    else
3351        pmap = pmap_create();
3352    vm->vm_map.pmap = pmap;
3353
3354    uvm_map_setup(&vm->vm_map, min, max,
3355        (pageable ? VM_MAP_PAGEABLE : 0) | VM_MAP_ISVMSPACE);
3356
3357    vm->vm_refcnt = 1;
3358
3359    if (remove_holes)
3360        pmap_remove_holes(vm);
3361}
3362
3363/*
3364 * uvmspace_share: share a vmspace between two processes
3365 *
3366 * - XXX: no locking on vmspace
3367 * - used for vfork
3368 */
3369
3370struct vmspace *
3371uvmspace_share(struct process *pr)
3372{
3373    struct vmspace *vm = pr->ps_vmspace;
3374
3375    vm->vm_refcnt++;
3376    return vm;
3377}
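/*
 * Example (sketch): vfork-style sharing just bumps the reference count;
 * the fork path picks between
 *
 *	vm = uvmspace_share(parent);	(share address space, vfork)
 *	vm = uvmspace_fork(parent);	(copy-on-write clone, fork)
 *
 * and the extra reference is dropped later with uvmspace_free().
 */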
3378
3379/*
3380 * uvmspace_exec: the process wants to exec a new program
3381 *
3382 * - XXX: no locking on vmspace
3383 */
3384
3385void
3386uvmspace_exec(struct proc *p, vaddr_t start, vaddr_t end)
3387{
3388    struct process *pr = p->p_p;
3389    struct vmspace *nvm, *ovm = pr->ps_vmspace;
3390    struct vm_map *map = &ovm->vm_map;
3391    struct uvm_map_deadq dead_entries;
3392
3393    KASSERT((start & (vaddr_t)PAGE_MASK) == 0);
3394    KASSERT((end & (vaddr_t)PAGE_MASK) == 0 ||
3395        (end & (vaddr_t)PAGE_MASK) == (vaddr_t)PAGE_MASK);
3396
3397    pmap_unuse_final(p);   /* before stack addresses go away */
3398    TAILQ_INIT(&dead_entries);
3399
3400    /* see if more than one process is using this vmspace...  */
3401    if (ovm->vm_refcnt == 1) {
3402        /*
3403         * If pr is the only process using its vmspace then
3404         * we can safely recycle that vmspace for the program
3405         * that is being exec'd.
3406         */
3407
3408#ifdef SYSVSHM
3409        /*
3410         * SYSV SHM semantics require us to kill all segments on an exec
3411         */
3412        if (ovm->vm_shm)
3413            shmexit(ovm);
3414#endif
3415
3416        /*
3417         * POSIX 1003.1b -- "lock future mappings" is revoked
3418         * when a process execs another program image.
3419         */
3420        vm_map_lock(map);
3421        vm_map_modflags(map, 0, VM_MAP_WIREFUTURE);
3422
3423        /*
3424         * now unmap the old program
3425         *
3426         * Instead of attempting to keep the map valid, we simply
3427         * nuke all entries and ask uvm_map_setup to reinitialize
3428         * the map to the new boundaries.
3429         *
3430         * uvm_unmap_remove will actually nuke all entries for us
3431         * (as in, not replace them with free-memory entries).
3432         */
3433        uvm_unmap_remove(map, map->min_offset, map->max_offset,
3434            &dead_entries, TRUE, FALSE);
3435
3436        KDASSERT(RBT_EMPTY(uvm_map_addr, &map->addr));
3437
3438        /* Nuke statistics and boundaries. */
3439        memset(&ovm->vm_startcopy, 0,
3440            (caddr_t) (ovm + 1) - (caddr_t) &ovm->vm_startcopy);
3441
3442
3443        if (end & (vaddr_t)PAGE_MASK) {
3444            end += 1;
3445            if (end == 0) /* overflow */
3446                end -= PAGE_SIZE;
3447        }
3448
3449        /* Setup new boundaries and populate map with entries. */
3450        map->min_offset = start;
3451        map->max_offset = end;
3452        uvm_map_setup_entries(map);
3453        vm_map_unlock(map);
3454
3455        /* but keep MMU holes unavailable */
3456        pmap_remove_holes(ovm);
3457    } else {
3458        /*
3459         * pr's vmspace is being shared, so we can't reuse
3460         * it for pr while it is still being used by others.
3461         * Allocate a new vmspace for pr.
3462         */
3463        nvm = uvmspace_alloc(start, end,
3464            (map->flags & VM_MAP_PAGEABLE) ? TRUE : FALSE, TRUE);
3465
3466        /* install new vmspace and drop our ref to the old one. */
3467        pmap_deactivate(p);
3468        p->p_vmspace = pr->ps_vmspace = nvm;
3469        pmap_activate(p);
3470
3471        uvmspace_free(ovm);
3472    }
3473
3474    /* Release dead entries */
3475    uvm_unmap_detach(&dead_entries, 0);
3476}
3477
3478/*
3479 * uvmspace_free: free a vmspace data structure
3480 *
3481 * - XXX: no locking on vmspace
3482 */
3483void
3484uvmspace_free(struct vmspace *vm)
3485{
3486    if (--vm->vm_refcnt == 0) {
3487        /*
3488         * lock the map, to wait out all other references to it.  delete
3489         * all of the mappings and pages they hold, then call the pmap
3490         * module to reclaim anything left.
3491         */
3492#ifdef SYSVSHM
3493        /* Get rid of any SYSV shared memory segments. */
3494        if (vm->vm_shm != NULL)
3495            shmexit(vm);
3496#endif
3497
3498        uvm_map_teardown(&vm->vm_map);
3499        pool_put(&uvm_vmspace_pool, vm);
3500    }
3501}
3502
3503/*
3504 * uvm_share: Map the address range [srcaddr, srcaddr + sz) in
3505 * srcmap to the address range [dstaddr, dstaddr + sz) in
3506 * dstmap.
3507 *
3508 * The whole address range in srcmap must be backed by an object
3509 * (no holes).
3510 *
3511 * If successful, the address ranges share memory and the destination
3512 * address range uses the protection flags in prot.
3513 *
3514 * This routine assumes that sz is a multiple of PAGE_SIZE and
3515 * that dstaddr and srcaddr are page-aligned.
3516 */
3517int
3518uvm_share(struct vm_map *dstmap, vaddr_t dstaddr, vm_prot_t prot,
3519    struct vm_map *srcmap, vaddr_t srcaddr, vsize_t sz)
3520{
3521    int ret = 0;
3522    vaddr_t unmap_end;
3523    vaddr_t dstva;
3524    vsize_t off, len, n = sz;
3525    struct vm_map_entry *first = NULL, *last = NULL;
3526    struct vm_map_entry *src_entry, *psrc_entry = NULL;
3527    struct uvm_map_deadq dead;
3528
3529    if (srcaddr >= srcmap->max_offset || sz > srcmap->max_offset - srcaddr)
3530        return EINVAL;
3531
3532    TAILQ_INIT(&dead);
3533    vm_map_lock(dstmap);
3534    vm_map_lock_read(srcmap);
3535
3536    if (!uvm_map_isavail(dstmap, NULL, &first, &last, dstaddr, sz)) {
3537        ret = ENOMEM;
3538        goto exit_unlock;
3539    }
3540    if (!uvm_map_lookup_entry(srcmap, srcaddr, &src_entry)) {
3541        ret = EINVAL;
3542        goto exit_unlock;
3543    }
3544
3545    unmap_end = dstaddr;
3546    for (; src_entry != NULL;
3547        psrc_entry = src_entry,
3548        src_entry = RBT_NEXT(uvm_map_addr, src_entry)) {
3549        /* hole in address space, bail out */
3550        if (psrc_entry != NULL && psrc_entry->end != src_entry->start)
3551            break;
3552        if (src_entry->start >= srcaddr + sz)
3553            break;
3554
3555        if (UVM_ET_ISSUBMAP(src_entry))
3556            panic("uvm_share: encountered a submap (illegal)");
3557        if (!UVM_ET_ISCOPYONWRITE(src_entry) &&
3558            UVM_ET_ISNEEDSCOPY(src_entry))
3559            panic("uvm_share: non-copy_on_write map entries "
3560                "marked needs_copy (illegal)");
3561
3562        dstva = dstaddr;
3563        if (src_entry->start > srcaddr) {
3564            dstva += src_entry->start - srcaddr;
3565            off = 0;
3566        } else
3567            off = srcaddr - src_entry->start;
3568
3569        if (n < src_entry->end - src_entry->start)
3570            len = n;
3571        else
3572            len = src_entry->end - src_entry->start;
3573        n -= len;
3574
3575        if (uvm_mapent_share(dstmap, dstva, len, off, prot, prot,
3576            srcmap, src_entry, &dead) == NULL)
3577            break;
3578
3579        unmap_end = dstva + len;
3580        if (n == 0)
3581            goto exit_unlock;
3582    }
3583
3584    ret = EINVAL;
3585    uvm_unmap_remove(dstmap, dstaddr, unmap_end, &dead, FALSE, TRUE);
3586
3587exit_unlock:
3588    vm_map_unlock_read(srcmap);
3589    vm_map_unlock(dstmap);
3590    uvm_unmap_detach(&dead, 0);
3591
3592    return ret;
3593}
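/*
 * Example (sketch): sharing a single page of an existing mapping
 * read-only into another map could look like
 *
 *	error = uvm_share(dstmap, dstva, PROT_READ,
 *	    srcmap, srcva, PAGE_SIZE);
 *
 * where dstva and srcva are page aligned and [srcva, srcva + PAGE_SIZE)
 * is fully backed in srcmap, as required above.
 */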
3594
3595/*
3596 * Clone map entry into other map.
3597 *
3598 * Mapping will be placed at dstaddr, for the same length.
3599 * Space must be available.
3600 * Reference counters are incremented.
3601 */
3602struct vm_map_entry *
3603uvm_mapent_clone(struct vm_map *dstmap, vaddr_t dstaddr, vsize_t dstlen,
3604    vsize_t off, vm_prot_t prot, vm_prot_t maxprot,
3605    struct vm_map_entry *old_entry, struct uvm_map_deadq *dead,
3606    int mapent_flags, int amap_share_flags)
3607{
3608    struct vm_map_entry *new_entry, *first, *last;
3609
3610    KDASSERT(!UVM_ET_ISSUBMAP(old_entry));
3611
3612    /* Create new entry (linked in on creation). Fill in first, last. */
3613    first = last = NULL;
3614    if (!uvm_map_isavail(dstmap, NULL, &first, &last, dstaddr, dstlen)) {
3615        panic("uvm_mapent_clone: no space in map for "
3616            "entry in empty map");
3617    }
3618    new_entry = uvm_map_mkentry(dstmap, first, last,
3619        dstaddr, dstlen, mapent_flags, dead, NULL);
3620    if (new_entry == NULL)
3621        return NULL;
3622    /* old_entry -> new_entry */
3623    new_entry->object = old_entry->object;
3624    new_entry->offset = old_entry->offset;
3625    new_entry->aref = old_entry->aref;
3626    new_entry->etype |= old_entry->etype & ~UVM_ET_FREEMAPPED;
3627    new_entry->protection = prot;
3628    new_entry->max_protection = maxprot;
3629    new_entry->inheritance = old_entry->inheritance;
3630    new_entry->advice = old_entry->advice;
3631
3632    /* gain reference to object backing the map (can't be a submap). */
3633    if (new_entry->aref.ar_amap) {
3634        new_entry->aref.ar_pageoff += off >> PAGE_SHIFT;
3635        amap_ref(new_entry->aref.ar_amap, new_entry->aref.ar_pageoff,
3636            (new_entry->end - new_entry->start) >> PAGE_SHIFT,
3637            amap_share_flags);
3638    }
3639
3640    if (UVM_ET_ISOBJ(new_entry) &&
3641        new_entry->object.uvm_obj->pgops->pgo_reference) {
3642        new_entry->offset += off;
3643        new_entry->object.uvm_obj->pgops->pgo_reference
3644            (new_entry->object.uvm_obj);
3645    }
3646
3647    return new_entry;
3648}
3649
3650struct vm_map_entry *
3651uvm_mapent_share(struct vm_map *dstmap, vaddr_t dstaddr, vsize_t dstlen,
3652    vsize_t off, vm_prot_t prot, vm_prot_t maxprot, struct vm_map *old_map,
3653    struct vm_map_entry *old_entry, struct uvm_map_deadq *dead)
3654{
3655    /*
3656     * If old_entry refers to a copy-on-write region that has not yet been
3657     * written to (needs_copy flag is set), then we need to allocate a new
3658     * amap for old_entry.
3659     *
3660     * If we do not do this and the process owning old_entry later does a
3661     * copy-on-write, old_entry and new_entry will refer to different memory
3662     * regions, and the memory between the processes is no longer shared.
3663     *
3664     * [in other words, we need to clear needs_copy]
3665     */
3666
3667    if (UVM_ET_ISNEEDSCOPY(old_entry)) {
3668        /* get our own amap, clears needs_copy */
3669        amap_copy(old_map, old_entry, M_WAITOK, FALSE,
3670            0, 0);
3671        /* XXXCDC: WAITOK??? */
3672    }
3673
3674    return uvm_mapent_clone(dstmap, dstaddr, dstlen, off,
3675        prot, maxprot, old_entry, dead, 0, AMAP_SHARED);
3676}
3677
3678/*
3679 * share the mapping: this means we want the old and
3680 * new entries to share amaps and backing objects.
3681 */
3682struct vm_map_entry *
3683uvm_mapent_forkshared(struct vmspace *new_vm, struct vm_map *new_map,
3684    struct vm_map *old_map,
3685    struct vm_map_entry *old_entry, struct uvm_map_deadq *dead)
3686{
3687    struct vm_map_entry *new_entry;
3688
3689    new_entry = uvm_mapent_share(new_map, old_entry->start,
3690        old_entry->end - old_entry->start, 0, old_entry->protection,
3691        old_entry->max_protection, old_map, old_entry, dead);
3692
3693    /*
3694     * pmap_copy the mappings: this routine is optional
3695     * but if it is there it will reduce the number of
3696     * page faults in the new proc.
3697     */
3698    if (!UVM_ET_ISHOLE(new_entry))
3699        pmap_copy(new_map->pmap, old_map->pmap, new_entry->start,
3700            (new_entry->end - new_entry->start), new_entry->start);
3701
3702    return (new_entry);
3703}
3704
3705/*
3706 * copy-on-write the mapping (using mmap's
3707 * MAP_PRIVATE semantics)
3708 *
3709 * allocate new_entry, adjust reference counts.
3710 * (note that new references are read-only).
3711 */
3712struct vm_map_entry *
3713uvm_mapent_forkcopy(struct vmspace *new_vm, struct vm_map *new_map,
3714    struct vm_map *old_map,
3715    struct vm_map_entry *old_entry, struct uvm_map_deadq *dead)
3716{
3717    struct vm_map_entry *new_entry;
3718    boolean_t        protect_child;
3719
3720    new_entry = uvm_mapent_clone(new_map, old_entry->start,
3721        old_entry->end - old_entry->start, 0, old_entry->protection,
3722        old_entry->max_protection, old_entry, dead, 0, 0);
3723
3724    new_entry->etype |=
3725        (UVM_ET_COPYONWRITE|UVM_ET_NEEDSCOPY);
3726
3727    /*
3728     * the new entry will need an amap.  it will either
3729     * need to be copied from the old entry or created
3730     * from scratch (if the old entry does not have an
3731     * amap).  can we defer this process until later
3732     * (by setting "needs_copy") or do we need to copy
3733     * the amap now?
3734     *
3735     * we must copy the amap now if any of the following
3736     * conditions hold:
3737     * 1. the old entry has an amap and that amap is
3738     *    being shared.  this means that the old (parent)
3739     *    process is sharing the amap with another
3740     *    process.  if we do not clear needs_copy here
3741     *    we will end up in a situation where both the
3742     *    parent and child process are referring to the
3743     *    same amap with "needs_copy" set.  if the
3744     *    parent write-faults, the fault routine will
3745     *    clear "needs_copy" in the parent by allocating
3746     *    a new amap.   this is wrong because the
3747     *    parent is supposed to be sharing the old amap
3748     *    and the new amap will break that.
3749     *
3750     * 2. if the old entry has an amap and a non-zero
3751     *    wire count then we are going to have to call
3752     *    amap_cow_now to avoid page faults in the
3753     *    parent process.   since amap_cow_now requires
3754     *    "needs_copy" to be clear we might as well
3755     *    clear it here as well.
3756     *
3757     */
3758    if (old_entry->aref.ar_amap != NULL &&
3759        ((amap_flags(old_entry->aref.ar_amap) &
3760        AMAP_SHARED) != 0 ||
3761        VM_MAPENT_ISWIRED(old_entry))) {
3762        amap_copy(new_map, new_entry, M_WAITOK, FALSE,
3763            0, 0);
3764        /* XXXCDC: M_WAITOK ... ok? */
3765    }
3766
3767    /*
3768     * if the parent's entry is wired down, then the
3769     * parent process does not want page faults on
3770     * access to that memory.  this means that we
3771     * cannot do copy-on-write because we can't write
3772     * protect the old entry.   in this case we
3773     * resolve all copy-on-write faults now, using
3774     * amap_cow_now.   note that we have already
3775     * allocated any needed amap (above).
3776     */
3777    if (VM_MAPENT_ISWIRED(old_entry)) {
3778        /*
3779         * resolve all copy-on-write faults now
3780         * (note that there is nothing to do if
3781         * the old mapping does not have an amap).
3782         * XXX: is it worthwhile to bother with
3783         * pmap_copy in this case?
3784         */
3785        if (old_entry->aref.ar_amap)
3786            amap_cow_now(new_map, new_entry);
3787    } else {
3788        if (old_entry->aref.ar_amap) {
3789            /*
3790             * setup mappings to trigger copy-on-write faults
3791             * we must write-protect the parent if it has
3792             * an amap and it is not already "needs_copy"...
3793             * if it is already "needs_copy" then the parent
3794             * has already been write-protected by a previous
3795             * fork operation.
3796             *
3797             * if we do not write-protect the parent, then
3798             * we must be sure to write-protect the child
3799             * after the pmap_copy() operation.
3800             *
3801             * XXX: pmap_copy should have some way of telling
3802             * us that it didn't do anything so we can avoid
3803             * calling pmap_protect needlessly.
3804             */
3805            if (!UVM_ET_ISNEEDSCOPY(old_entry)) {
3806                if (old_entry->max_protection & PROT_WRITE) {
3807                    pmap_protect(old_map->pmap,
3808                        old_entry->start,
3809                        old_entry->end,
3810                        old_entry->protection &
3811                        ~PROT_WRITE);
3812                    pmap_update(old_map->pmap);
3813                }
3814                old_entry->etype |= UVM_ET_NEEDSCOPY;
3815            }
3816
3817            /* parent must now be write-protected */
3818            protect_child = FALSE;
3819        } else {
3820            /*
3821             * we only need to protect the child if the
3822             * parent has write access.
3823             */
3824            if (old_entry->max_protection & PROT_WRITE)
3825                protect_child = TRUE;
3826            else
3827                protect_child = FALSE;
3828        }
3829        /*
3830         * copy the mappings
3831         * XXX: need a way to tell if this does anything
3832         */
3833        if (!UVM_ET_ISHOLE(new_entry))
3834            pmap_copy(new_map->pmap, old_map->pmap,
3835                new_entry->start,
3836                (old_entry->end - old_entry->start),
3837                old_entry->start);
3838
3839        /* protect the child's mappings if necessary */
3840        if (protect_child) {
3841            pmap_protect(new_map->pmap, new_entry->start,
3842                new_entry->end,
3843                new_entry->protection &
3844                ~PROT_WRITE);
3845        }
3846    }
3847
3848    return (new_entry);
3849}
3850
3851/*
3852 * zero the mapping: the new entry will be zero initialized
3853 */
3854struct vm_map_entry *
3855uvm_mapent_forkzero(struct vmspace *new_vm, struct vm_map *new_map,
3856    struct vm_map *old_map,
3857    struct vm_map_entry *old_entry, struct uvm_map_deadq *dead)
3858{
3859    struct vm_map_entry *new_entry;
3860
3861    new_entry = uvm_mapent_clone(new_map, old_entry->start,
3862        old_entry->end - old_entry->start, 0, old_entry->protection,
3863        old_entry->max_protection, old_entry, dead, 0, 0);
3864
3865    new_entry->etype |=
3866        (UVM_ET_COPYONWRITE|UVM_ET_NEEDSCOPY);
3867
3868    if (new_entry->aref.ar_amap) {
3869        amap_unref(new_entry->aref.ar_amap, new_entry->aref.ar_pageoff,
3870            atop(new_entry->end - new_entry->start), 0);
3871        new_entry->aref.ar_amap = NULL;
3872        new_entry->aref.ar_pageoff = 0;
3873    }
3874
3875    if (UVM_ET_ISOBJ(new_entry)) {
3876        if (new_entry->object.uvm_obj->pgops->pgo_detach)
3877            new_entry->object.uvm_obj->pgops->pgo_detach(
3878                new_entry->object.uvm_obj);
3879        new_entry->object.uvm_obj = NULL;
3880        new_entry->etype &= ~UVM_ET_OBJ;
3881    }
3882
3883    return (new_entry);
3884}
3885
3886/*
3887 * uvmspace_fork: fork a process' main map
3888 *
3889 * => create a new vmspace for child process from parent.
3890 * => parent's map must not be locked.
3891 */
3892struct vmspace *
3893uvmspace_fork(struct process *pr)
3894{
3895    struct vmspace *vm1 = pr->ps_vmspace;
3896    struct vmspace *vm2;
3897    struct vm_map *old_map = &vm1->vm_map;
3898    struct vm_map *new_map;
3899    struct vm_map_entry *old_entry, *new_entry;
3900    struct uvm_map_deadq dead;
3901
3902    vm_map_lock(old_map);
3903
3904    vm2 = uvmspace_alloc(old_map->min_offset, old_map->max_offset,
3905        (old_map->flags & VM_MAP_PAGEABLE) ? TRUE : FALSE, FALSE);
3906    memcpy(&vm2->vm_startcopy, &vm1->vm_startcopy,
3907        (caddr_t) (vm1 + 1) - (caddr_t) &vm1->vm_startcopy);
3908    vm2->vm_dused = 0; /* Statistic managed by us. */
3909    new_map = &vm2->vm_map;
3910    vm_map_lock(new_map);
3911
3912    /* go entry-by-entry */
3913    TAILQ_INIT(&dead);
3914    RBT_FOREACH(old_entry, uvm_map_addr, &old_map->addr) {
3915        if (old_entry->start == old_entry->end)
3916            continue;
3917
3918        /* first, some sanity checks on the old entry */
3919        if (UVM_ET_ISSUBMAP(old_entry)) {
3920            panic("fork: encountered a submap during fork "
3921                "(illegal)");
3922        }
3923
3924        if (!UVM_ET_ISCOPYONWRITE(old_entry) &&
3925            UVM_ET_ISNEEDSCOPY(old_entry)) {
3926            panic("fork: non-copy_on_write map entry marked "
3927                "needs_copy (illegal)");
3928        }
3929
3930        /* Apply inheritance. */
3931        switch (old_entry->inheritance) {
3932        case MAP_INHERIT_SHARE:
3933            new_entry = uvm_mapent_forkshared(vm2, new_map,
3934                old_map, old_entry, &dead);
3935            break;
3936        case MAP_INHERIT_COPY:
3937            new_entry = uvm_mapent_forkcopy(vm2, new_map,
3938                old_map, old_entry, &dead);
3939            break;
3940        case MAP_INHERIT_ZERO:
3941            new_entry = uvm_mapent_forkzero(vm2, new_map,
3942                old_map, old_entry, &dead);
3943            break;
3944        default:
3945            continue;
3946        }
3947
3948        /* Update process statistics. */
3949        if (!UVM_ET_ISHOLE(new_entry))
3950            new_map->size += new_entry->end - new_entry->start;
3951        if (!UVM_ET_ISOBJ(new_entry) && !UVM_ET_ISHOLE(new_entry)) {
3952            vm2->vm_dused += uvmspace_dused(
3953                new_map, new_entry->start, new_entry->end);
3954        }
3955    }
3956
3957    vm_map_unlock(old_map);
3958    vm_map_unlock(new_map);
3959
3960    /*
3961     * This can actually happen, if multiple entries described a
3962     * space in which an entry was inherited.
3963     */
3964    uvm_unmap_detach(&dead, 0);
3965
3966#ifdef SYSVSHM
3967    if (vm1->vm_shm)
3968        shmfork(vm1, vm2);
3969#endif
3970
3971    return vm2;
3972}
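/*
 * Summary of what the inheritance switch above does per entry:
 *
 *	MAP_INHERIT_SHARE -> share amap/object with the child (forkshared)
 *	MAP_INHERIT_COPY  -> copy-on-write clone (forkcopy)
 *	MAP_INHERIT_ZERO  -> fresh zero-filled mapping (forkzero)
 *	anything else     -> entry is skipped
 */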
3973
3974/*
3975 * uvm_map_hint: return the beginning of the best area suitable for
3976 * creating a new mapping with "prot" protection.
3977 */
3978vaddr_t
3979uvm_map_hint(struct vmspace *vm, vm_prot_t prot, vaddr_t minaddr,
3980    vaddr_t maxaddr)
3981{
3982    vaddr_t addr;
3983    vaddr_t spacing;
3984
3985#ifdef __i386__
3986    /*
3987     * If executable skip first two pages, otherwise start
3988     * after data + heap region.
3989     */
3990    if ((prot & PROT_EXEC) != 0 &&
3991        (vaddr_t)vm->vm_daddr >= I386_MAX_EXE_ADDR) {
3992        addr = (PAGE_SIZE*2) +
3993            (arc4random() & (I386_MAX_EXE_ADDR / 2 - 1));
3994        return (round_page(addr));
3995    }
3996#endif
3997
3998#if defined (__LP64__)
3999    spacing = MIN(4UL * 1024 * 1024 * 1024, MAXDSIZ) - 1;
4000#else
4001    spacing = MIN(1 * 1024 * 1024 * 1024, MAXDSIZ) - 1;
4002#endif
4003
4004    /*
4005     * Start malloc/mmap after the brk.
4006     */
4007    addr = (vaddr_t)vm->vm_daddr + BRKSIZ;
4008    addr = MAX(addr, minaddr);
4009
4010    if (addr < maxaddr) {
4011        while (spacing > maxaddr - addr)
4012            spacing >>= 1;
4013    }
4014    addr += arc4random() & spacing;
4015    return (round_page(addr));
4016}
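/*
 * Example: assuming minaddr <= vm_daddr + BRKSIZ and enough room below
 * maxaddr, the hint is a page-rounded address picked uniformly from
 * [vm_daddr + BRKSIZ, vm_daddr + BRKSIZ + spacing], with spacing halved
 * until that window fits below maxaddr.
 */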
4017
4018/*
4019 * uvm_map_submap: punch down part of a map into a submap
4020 *
4021 * => only the kernel_map is allowed to be submapped
4022 * => the purpose of submapping is to break up the locking granularity
4023 *  of a larger map
4024 * => the range specified must have been mapped previously with a uvm_map()
4025 *  call [with uobj==NULL] to create a blank map entry in the main map.
4026 *  [And it had better still be blank!]
4027 * => maps which contain submaps should never be copied or forked.
4028 * => to remove a submap, use uvm_unmap() on the main map
4029 *  and then uvm_map_deallocate() the submap.
4030 * => main map must be unlocked.
4031 * => submap must have been init'd and have a zero reference count.
4032 *  [need not be locked as we don't actually reference it]
4033 */
4034int
4035uvm_map_submap(struct vm_map *map, vaddr_t start, vaddr_t end,
4036    struct vm_map *submap)
4037{
4038    struct vm_map_entry *entry;
4039    int result;
4040
4041    if (start > map->max_offset || end > map->max_offset ||
4042        start < map->min_offset || end < map->min_offset)
4043        return EINVAL;
4044
4045    vm_map_lock(map);
4046
4047    if (uvm_map_lookup_entry(map, start, &entry)) {
4048        UVM_MAP_CLIP_START(map, entry, start);
4049        UVM_MAP_CLIP_END(map, entry, end);
4050    } else
4051        entry = NULL;
4052
4053    if (entry != NULL &&
4054        entry->start == start && entry->end == end &&
4055        entry->object.uvm_obj == NULL && entry->aref.ar_amap == NULL &&
4056        !UVM_ET_ISCOPYONWRITE(entry) && !UVM_ET_ISNEEDSCOPY(entry)) {
4057        entry->etype |= UVM_ET_SUBMAP;
4058        entry->object.sub_map = submap;
4059        entry->offset = 0;
4060        uvm_map_reference(submap);
4061        result = 0;
4062    } else
4063        result = EINVAL;
4064
4065    vm_map_unlock(map);
4066    return(result);
4067}
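/*
 * Example use (sketch, identifiers assumed): a subsystem wanting its own
 * locking granularity inside kernel_map reserves a blank range there and
 * then roughly
 *
 *	submap = uvm_map_create(pmap_kernel(), start, end, flags);
 *	error = uvm_map_submap(kernel_map, start, end, submap);
 *
 * so later lookups in [start, end) descend into submap; in practice this
 * is normally driven through uvm_km_suballoc().
 */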
4068
4069/*
4070 * uvm_map_checkprot: check protection in map
4071 *
4072 * => must allow specific protection in a fully allocated region.
4073 * => map must be read or write locked by caller.
4074 */
4075boolean_t
4076uvm_map_checkprot(struct vm_map *map, vaddr_t start, vaddr_t end,
4077    vm_prot_t protection)
4078{
4079    struct vm_map_entry *entry;
4080
4081    if (start < map->min_offset || end > map->max_offset || start > end)
4082        return FALSE;
4083    if (start == end)
4084        return TRUE;
4085
4086    /*
4087     * Iterate entries.
4088     */
4089    for (entry = uvm_map_entrybyaddr(&map->addr, start);
4090        entry != NULL && entry->start < end;
4091        entry = RBT_NEXT(uvm_map_addr, entry)) {
4092        /* Fail if a hole is found. */
4093        if (UVM_ET_ISHOLE(entry) ||
4094            (entry->end < end && entry->end != VMMAP_FREE_END(entry)))
4095            return FALSE;
4096
4097        /* Check protection. */
4098        if ((entry->protection & protection) != protection)
4099            return FALSE;
4100    }
4101    return TRUE;
4102}
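/*
 * Example (sketch): a caller already holding the map lock can verify a
 * buffer is fully mapped read/write before touching it:
 *
 *	if (!uvm_map_checkprot(kernel_map, kva, kva + len,
 *	    PROT_READ | PROT_WRITE))
 *		return EFAULT;
 */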
4103
4104/*
4105 * uvm_map_create: create map
4106 */
4107vm_map_t
4108uvm_map_create(pmap_t pmap, vaddr_t min, vaddr_t max, int flags)
4109{
4110    vm_map_t map;
4111
4112    map = malloc(sizeof *map, M_VMMAP, M_WAITOK);
4113    map->pmap = pmap;
4114    uvm_map_setup(map, min, max, flags);
4115    return (map);
4116}
4117
4118/*
4119 * uvm_map_deallocate: drop reference to a map
4120 *
4121 * => caller must not lock map
4122 * => we will zap map if ref count goes to zero
4123 */
4124void
4125uvm_map_deallocate(vm_map_t map)
4126{
4127    int c;
4128    struct uvm_map_deadq dead;
4129
4130    c = --map->ref_count;
4131    if (c > 0) {
4132        return;
4133    }
4134
4135    /*
4136     * all references gone.   unmap and free.
4137     *
4138     * No lock required: we are the only one to access this map.
4139     */
4140    TAILQ_INIT(&dead);
4141    uvm_tree_sanity(map, __FILE__, __LINE__);
4142    uvm_unmap_remove(map, map->min_offset, map->max_offset, &dead,
4143        TRUE, FALSE);
4144    pmap_destroy(map->pmap);
4145    KASSERT(RBT_EMPTY(uvm_map_addr, &map->addr));
4146    free(map, M_VMMAP, sizeof *map);
4147
4148    uvm_unmap_detach(&dead, 0);
4149}
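/*
 * Note: this pairs with uvm_map_reference(); e.g. a submap that was
 * removed from its parent with uvm_unmap() is finally destroyed by
 * dropping its last reference here, as described above uvm_map_submap().
 */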
4150
4151/*
4152 * uvm_map_inherit: set inheritance code for range of addrs in map.
4153 *
4154 * => map must be unlocked
4155 * => note that the inherit code is used during a "fork".  see fork
4156 *  code for details.
4157 */
4158int
4159uvm_map_inherit(struct vm_map *map, vaddr_t start, vaddr_t end,
4160    vm_inherit_t new_inheritance)
4161{
4162    struct vm_map_entry *entry;
4163
4164    switch (new_inheritance) {
4165    case MAP_INHERIT_NONE:
4166    case MAP_INHERIT_COPY:
4167    case MAP_INHERIT_SHARE:
4168    case MAP_INHERIT_ZERO:
4169        break;
4170    default:
4171        return (EINVAL);
4172    }
4173
4174    if (start > end)
4175        return EINVAL;
4176    start = MAX(start, map->min_offset);
4177    end = MIN(end, map->max_offset);
4178    if (start >= end)
4179        return 0;
4180
4181    vm_map_lock(map);
4182
4183    entry = uvm_map_entrybyaddr(&map->addr, start);
4184    if (entry->end > start)
4185        UVM_MAP_CLIP_START(map, entry, start);
4186    else
4187        entry = RBT_NEXT(uvm_map_addr, entry);
4188
4189    while (entry != NULL && entry->start < end) {
4190        UVM_MAP_CLIP_END(map, entry, end);
4191        entry->inheritance = new_inheritance;
4192        entry = RBT_NEXT(uvm_map_addr, entry);
4193    }
4194
4195    vm_map_unlock(map);
4196    return (0);
4197}
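/*
 * Example (sketch): minherit(2) is the usual consumer; a call like
 * minherit(addr, len, MAP_INHERIT_SHARE) ends up roughly as
 *
 *	uvm_map_inherit(&p->p_vmspace->vm_map, addr, addr + len,
 *	    MAP_INHERIT_SHARE);
 *
 * (the syscall glue lives elsewhere).
 */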
4198
4199/*
4200 * uvm_map_advice: set advice code for range of addrs in map.
4201 *
4202 * => map must be unlocked
4203 */
4204int
4205uvm_map_advice(struct vm_map *map, vaddr_t start, vaddr_t end, int new_advice)
4206{
4207    struct vm_map_entry *entry;
4208
4209    switch (new_advice) {
4210    case MADV_NORMAL:
4211    case MADV_RANDOM:
4212    case MADV_SEQUENTIAL:
4213        break;
4214    default:
4215        return (EINVAL);
4216    }
4217
4218    if (start > end)
4219        return EINVAL;
4220    start = MAX(start, map->min_offset);
4221    end = MIN(end, map->max_offset);
4222    if (start >= end)
4223        return 0;
4224
4225    vm_map_lock(map);
4226
4227    entry = uvm_map_entrybyaddr(&map->addr, start);
4228    if (entry != NULL && entry->end > start)
4229        UVM_MAP_CLIP_START(map, entry, start);
4230    else if (entry != NULL)
4231        entry = RBT_NEXT(uvm_map_addr, entry);
4232
4233    /*
4234     * XXXJRT: disallow holes?
4235     */
4236    while (entry != NULL && entry->start < end) {
4237        UVM_MAP_CLIP_END(map, entry, end);
4238        entry->advice = new_advice;
4239        entry = RBT_NEXT(uvm_map_addr, entry);
4240    }
4241
4242    vm_map_unlock(map);
4243    return (0);
4244}
4245
4246/*
4247 * uvm_map_extract: extract a mapping from a map and put it somewhere
4248 * in the kernel_map, setting protection to max_prot.
4249 *
4250 * => map should be unlocked (we will write lock it and kernel_map)
4251 * => returns 0 on success, error code otherwise
4252 * => start must be page aligned
4253 * => len must be page sized
4254 * => flags:
4255 *      UVM_EXTRACT_FIXPROT: set prot to maxprot as we go
4256 * Mappings are QREF's.
4257 */
4258int
4259uvm_map_extract(struct vm_map *srcmap, vaddr_t start, vsize_t len,
4260    vaddr_t *dstaddrp, int flags)
4261{
4262    struct uvm_map_deadq dead;
4263    struct vm_map_entry *first, *entry, *newentry, *tmp1, *tmp2;
4264    vaddr_t dstaddr;
4265    vaddr_t end;
4266    vaddr_t cp_start;
4267    vsize_t cp_len, cp_off;
4268    int error;
4269
4270    TAILQ_INIT(&dead);
4271    end = start + len;
4272
4273    /*
4274     * Sanity check on the parameters.
4275     * Also, since the mapping may not contain gaps, error out if the
4276     * mapped area is not in the source map.
4277     */
4278    if ((start & (vaddr_t)PAGE_MASK) != 0 ||
4279        (end & (vaddr_t)PAGE_MASK) != 0 || end < start)
4280        return EINVAL;
4281    if (start < srcmap->min_offset || end > srcmap->max_offset)
4282        return EINVAL;
4283
4284    /* Initialize dead entries. Handle len == 0 case. */
4285    if (len == 0)
4286        return 0;
4287
4288    /* Acquire lock on srcmap. */
4289    vm_map_lock(srcmap);
4290
4291    /* Lock srcmap, lookup first and last entry in <start,len>. */
4292    first = uvm_map_entrybyaddr(&srcmap->addr, start);
4293
4294    /* Check that the range is contiguous. */
4295    for (entry = first; entry != NULL && entry->end < end;
4296        entry = RBT_NEXT(uvm_map_addr, entry)) {
4297        if (VMMAP_FREE_END(entry) != entry->end ||
4298            UVM_ET_ISHOLE(entry)) {
4299            error = EINVAL;
4300            goto fail;
4301        }
4302    }
4303    if (entry == NULL || UVM_ET_ISHOLE(entry)) {
4304        error = EINVAL;
4305        goto fail;
4306    }
4307
4308    /*
4309     * Handle need-copy flag.
4310     */
4311    for (entry = first; entry != NULL && entry->start < end;
4312        entry = RBT_NEXT(uvm_map_addr, entry)) {
4313        if (UVM_ET_ISNEEDSCOPY(entry))
4314            amap_copy(srcmap, entry, M_NOWAIT,
4315                UVM_ET_ISSTACK(entry) ? FALSE : TRUE, start, end);
4316        if (UVM_ET_ISNEEDSCOPY(entry)) {
4317            /*
4318             * amap_copy failure
4319             */
4320            error = ENOMEM;
4321            goto fail;
4322        }
4323    }
4324
4325    /* Lock destination map (kernel_map). */
4326    vm_map_lock(kernel_map);
4327
4328    if (uvm_map_findspace(kernel_map, &tmp1, &tmp2, &dstaddr, len,
4329        MAX(PAGE_SIZE, PMAP_PREFER_ALIGN()), PMAP_PREFER_OFFSET(start),
4330        PROT_NONE, 0) != 0) {
4331        error = ENOMEM;
4332        goto fail2;
4333    }
4334    *dstaddrp = dstaddr;
4335
4336    /*
4337     * We now have srcmap and kernel_map locked.
4338     * dstaddr contains the destination offset in dstmap.
4339     */
4340    /* step 1: start looping through map entries, performing extraction. */
4341    for (entry = first; entry != NULL && entry->start < end;
4342        entry = RBT_NEXT(uvm_map_addr, entry)) {
4343        KDASSERT(!UVM_ET_ISNEEDSCOPY(entry));
4344        if (UVM_ET_ISHOLE(entry))
4345            continue;
4346
4347        /* Calculate uvm_mapent_clone parameters. */
4348        cp_start = entry->start;
4349        if (cp_start < start) {
4350            cp_off = start - cp_start;
4351            cp_start = start;
4352        } else
4353            cp_off = 0;
4354        cp_len = MIN(entry->end, end) - cp_start;
4355
4356        newentry = uvm_mapent_clone(kernel_map,
4357            cp_start - start + dstaddr, cp_len, cp_off,
4358            entry->protection, entry->max_protection,
4359            entry, &dead, flags, AMAP_SHARED | AMAP_REFALL);
4360        if (newentry == NULL) {
4361            error = ENOMEM;
4362            goto fail2_unmap;
4363        }
4364        kernel_map->size += cp_len;
4365        if (flags & UVM_EXTRACT_FIXPROT)
4366            newentry->protection = newentry->max_protection;
4367
4368        /*
4369         * Step 2: perform pmap copy.
4370         * (Doing this in the loop saves one RB traversal.)
4371         */
4372        pmap_copy(kernel_map->pmap, srcmap->pmap,
4373            cp_start - start + dstaddr, cp_len, cp_start);
4374    }
4375    pmap_update(kernel_map->pmap);
4376
4377    error = 0;
4378
4379    /* Unmap copied entries on failure. */
4380fail2_unmap:
4381    if (error) {
4382        uvm_unmap_remove(kernel_map, dstaddr, dstaddr + len, &dead,
4383            FALSE, TRUE);
4384    }
4385
4386    /* Release maps, release dead entries. */
4387fail2:
4388    vm_map_unlock(kernel_map);
4389
4390fail:
4391    vm_map_unlock(srcmap);
4392
4393    uvm_unmap_detach(&dead, 0);
4394
4395    return error;
4396}
4397
4398/*
4399 * uvm_map_clean: clean out a map range
4400 *
4401 * => valid flags:
4402 *   if (flags & PGO_CLEANIT): dirty pages are cleaned first
4403 *   if (flags & PGO_SYNCIO): dirty pages are written synchronously
4404 *   if (flags & PGO_DEACTIVATE): any cached pages are deactivated after clean
4405 *   if (flags & PGO_FREE): any cached pages are freed after clean
4406 * => returns an error if any part of the specified range isn't mapped
4407 * => never a need to flush amap layer since the anonymous memory has
4408 *  no permanent home, but may deactivate pages there
4409 * => called from sys_msync() and sys_madvise()
4410 * => caller must not write-lock map (read OK).
4411 * => we may sleep while cleaning if SYNCIO [with map read-locked]
4412 */
4413
4414int
4415uvm_map_clean(struct vm_map *map, vaddr_t start, vaddr_t end, int flags)
4416{
4417    struct vm_map_entry *first, *entry;
4418    struct vm_amap *amap;
4419    struct vm_anon *anon;
4420    struct vm_page *pg;
4421    struct uvm_object *uobj;
4422    vaddr_t cp_start, cp_end;
4423    int refs;
4424    int error;
4425    boolean_t rv;
4426
4427    KASSERT((flags & (PGO_FREE|PGO_DEACTIVATE)) !=
4428        (PGO_FREE|PGO_DEACTIVATE));
4429
4430    if (start > end || start < map->min_offset || end > map->max_offset)
4431        return EINVAL;
4432
4433    vm_map_lock_read(map);
4434    first = uvm_map_entrybyaddr(&map->addr, start);
4435
4436    /* Make a first pass to check for holes. */
4437    for (entry = first; entry != NULL && entry->start < end;
4438        entry = RBT_NEXT(uvm_map_addr, entry)) {
4439        if (UVM_ET_ISSUBMAP(entry)) {
4440            vm_map_unlock_read(map);
4441            return EINVAL;
4442        }
4443        if (UVM_ET_ISSUBMAP(entry) ||
4444            UVM_ET_ISHOLE(entry) ||
4445            (entry->end < end &&
4446            VMMAP_FREE_END(entry) != entry->end)) {
4447            vm_map_unlock_read(map);
4448            return EFAULT;
4449        }
4450    }
4451
4452    error = 0;
4453    for (entry = first; entry != NULL && entry->start < end;
4454        entry = RBT_NEXT(uvm_map_addr, entry)) {
4455        amap = entry->aref.ar_amap; /* top layer */
4456        if (UVM_ET_ISOBJ(entry))
4457            uobj = entry->object.uvm_obj;
4458        else
4459            uobj = NULL;
4460
4461        /*
4462         * No amap cleaning necessary if:
4463         *  - there's no amap
4464         *  - we're not deactivating or freeing pages.
4465         */
4466        if (amap == NULL || (flags & (PGO_DEACTIVATE|PGO_FREE)) == 0)
4467            goto flush_object;
4468
4469        cp_start = MAX(entry->start, start);
4470        cp_end = MIN(entry->end, end);
4471
4472        for (; cp_start != cp_end; cp_start += PAGE_SIZE) {
4473            anon = amap_lookup(&entry->aref,
4474                cp_start - entry->start);
4475            if (anon == NULL)
4476                continue;
4477
4478            pg = anon->an_page;
4479            if (pg == NULL) {
4480                continue;
4481            }
4482            KASSERT(pg->pg_flags & PQ_ANON);
4483
4484            switch (flags & (PGO_CLEANIT|PGO_FREE|PGO_DEACTIVATE)) {
4485            /*
4486             * XXX In these first 3 cases, we always just
4487             * XXX deactivate the page.  We may want to
4488             * XXX handle the different cases more
4489             * XXX specifically, in the future.
4490             */
4491            case PGO_CLEANIT|PGO_FREE:
4492            case PGO_CLEANIT|PGO_DEACTIVATE:
4493            case PGO_DEACTIVATE:
4494deactivate_it:
4495                /* skip the page if it's wired */
4496                if (pg->wire_count != 0)
4497                    break;
4498
4499                uvm_lock_pageq();
4500
4501                KASSERT(pg->uanon == anon);
4502
4503                /* zap all mappings for the page. */
4504                pmap_page_protect(pg, PROT_NONE);
4505
4506                /* ...and deactivate the page. */
4507                uvm_pagedeactivate(pg);
4508
4509                uvm_unlock_pageq();
4510                break;
4511            case PGO_FREE:
4512                /*
4513                 * If there are multiple references to
4514                 * the amap, just deactivate the page.
4515                 */
4516                if (amap_refs(amap) > 1)
4517                    goto deactivate_it;
4518
4519                /* XXX skip the page if it's wired */
4520                if (pg->wire_count != 0) {
4521                    break;
4522                }
4523                amap_unadd(&entry->aref,
4524                    cp_start - entry->start);
4525                refs = --anon->an_ref;
4526                if (refs == 0)
4527                    uvm_anfree(anon);
4528                break;
4529            default:
4530                panic("uvm_map_clean: weird flags");
4531            }
4532        }
4533
4534flush_object:
4535        cp_start = MAX(entry->start, start);
4536        cp_end = MIN(entry->end, end);
4537
4538        /*
4539         * flush pages if we've got a valid backing object.
4540         *
4541         * Don't PGO_FREE if we don't have write permission
4542         * and don't flush if this is a copy-on-write object
4543         * since we can't know our permissions on it.
4544         */
4545        if (uobj != NULL &&
4546            ((flags & PGO_FREE) == 0 ||
4547             ((entry->max_protection & PROT_WRITE) != 0 &&
4548              (entry->etype & UVM_ET_COPYONWRITE) == 0))) {
4549            rv = uobj->pgops->pgo_flush(uobj,
4550                cp_start - entry->start + entry->offset,
4551                cp_end - entry->start + entry->offset, flags);
4552
4553            if (rv == FALSE)
4554                error = EFAULT;
4555        }
4556    }
4557
4558    vm_map_unlock_read(map);
4559    return error;
4560}
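/*
 * Rough examples of flag combinations used by callers (sketch):
 *
 *	msync(MS_SYNC)         -> PGO_CLEANIT | PGO_SYNCIO
 *	msync(MS_ASYNC)        -> PGO_CLEANIT
 *	madvise(MADV_DONTNEED) -> PGO_DEACTIVATE
 *	madvise(MADV_FREE)     -> PGO_FREE
 *
 * The exact mapping is done in the syscall code, not here.
 */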
4561
4562/*
4563 * UVM_MAP_CLIP_END implementation
4564 */
4565void
4566uvm_map_clip_end(struct vm_map *map, struct vm_map_entry *entry, vaddr_t addr)
4567{
4568    struct vm_map_entry *tmp;
4569
4570    KASSERT(entry->start < addr && VMMAP_FREE_END(entry) > addr);
4571    tmp = uvm_mapent_alloc(map, 0);
4572
4573    /* Invoke splitentry. */
4574    uvm_map_splitentry(map, entry, tmp, addr);
4575}
4576
4577/*
4578 * UVM_MAP_CLIP_START implementation
4579 *
4580 * Clippers are required to not change the pointers to the entry they are
4581 * clipping on.
4582 * Since uvm_map_splitentry turns the original entry into the lowest
4583 * entry (address wise) we do a swap between the new entry and the original
4584 * entry, prior to calling uvm_map_splitentry.
4585 */
4586void
4587uvm_map_clip_start(struct vm_map *map, struct vm_map_entry *entry, vaddr_t addr)
4588{
4589    struct vm_map_entry *tmp;
4590    struct uvm_addr_state *free;
4591
4592    /* Unlink original. */
4593    free = uvm_map_uaddr_e(map, entry);
4594    uvm_mapent_free_remove(map, free, entry);
4595    uvm_mapent_addr_remove(map, entry);
4596
4597    /* Copy entry. */
4598    KASSERT(entry->start < addr && VMMAP_FREE_END(entry) > addr);
4599    tmp = uvm_mapent_alloc(map, 0);
4600    uvm_mapent_copy(entry, tmp);
4601
4602    /* Put new entry in place of original entry. */
4603    uvm_mapent_addr_insert(map, tmp);
4604    uvm_mapent_free_insert(map, free, tmp);
4605
4606    /* Invoke splitentry. */
4607    uvm_map_splitentry(map, tmp, entry, addr);
4608}
4609
4610/*
4611 * Boundary fixer.
4612 */
4613static __inline vaddr_t uvm_map_boundfix(vaddr_t, vaddr_t, vaddr_t);
4614static __inline vaddr_t
4615uvm_map_boundfix(vaddr_t min, vaddr_t max, vaddr_t bound)
4616{
4617    return (min < bound && max > bound) ? bound : max;
4618}
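/*
 * Example: with min = 0x1000 and max = 0x9000, a bound of 0x4000 lies
 * strictly inside the range, so 0x4000 is returned; a bound of 0xa000
 * is not crossed, so max (0x9000) is returned unchanged.
 */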
4619
4620/*
4621 * Choose free list based on address at start of free space.
4622 *
4623 * The uvm_addr_state returned contains addr and is the first of:
4624 * - uaddr_exe
4625 * - uaddr_brk_stack
4626 * - uaddr_any
4627 */
4628struct uvm_addr_state*
4629uvm_map_uaddr(struct vm_map *map, vaddr_t addr)
4630{
4631    struct uvm_addr_state *uaddr;
4632    int i;
4633
4634    /* Special case the first page, to prevent mmap from returning 0. */
4635    if (addr < VMMAP_MIN_ADDR)
4636        return NULL;
4637
4638    /* Upper bound for kernel maps at uvm_maxkaddr. */
4639    if ((map->flags & VM_MAP_ISVMSPACE) == 0) {
4640        if (addr >= uvm_maxkaddr)
4641            return NULL;
4642    }
4643
4644    /* Is the address inside the exe-only map? */
4645    if (map->uaddr_exe != NULL && addr >= map->uaddr_exe->uaddr_minaddr &&
4646        addr < map->uaddr_exe->uaddr_maxaddr)
4647        return map->uaddr_exe;
4648
4649    /* Check if the space falls inside brk/stack area. */
4650    if ((addr >= map->b_start && addr < map->b_end) ||
4651        (addr >= map->s_start && addr < map->s_end)) {
4652        if (map->uaddr_brk_stack != NULL &&
4653            addr >= map->uaddr_brk_stack->uaddr_minaddr &&
4654            addr < map->uaddr_brk_stack->uaddr_maxaddr) {
4655            return map->uaddr_brk_stack;
4656        } else
4657            return NULL;
4658    }
4659
4660    /*
4661     * Check the other selectors.
4662     *
4663     * These selectors are only marked as the owner, if they have insert
4664     * functions.
4665     */
4666    for (i = 0; i < nitems(map->uaddr_any); i++) {
4667        uaddr = map->uaddr_any[i];
4668        if (uaddr == NULL)
4669            continue;
4670        if (uaddr->uaddr_functions->uaddr_free_insert == NULL)
4671            continue;
4672
4673        if (addr >= uaddr->uaddr_minaddr &&
4674            addr < uaddr->uaddr_maxaddr)
4675            return uaddr;
4676    }
4677
4678    return NULL;
4679}
4680
4681/*
4682 * Choose free list for an entry, based on the address at the start of
4683 * its free space (see uvm_map_uaddr above).
4688 */
4689struct uvm_addr_state*
4690uvm_map_uaddr_e(struct vm_map *map, struct vm_map_entry *entry)
4691{
4692    return uvm_map_uaddr(map, VMMAP_FREE_START(entry));
4693}
4694
4695/*
4696 * Returns the first free-memory boundary that is crossed by [min-max].
4697 */
4698vsize_t
4699uvm_map_boundary(struct vm_map *map, vaddr_t min, vaddr_t max)
4700{
4701    struct uvm_addr_state   *uaddr;
4702    int          i;
4703
4704    /* Never return first page. */
4705    max = uvm_map_boundfix(min, max, VMMAP_MIN_ADDR);
4706
4707    /* Treat the maxkaddr special, if the map is a kernel_map. */
4708    if ((map->flags & VM_MAP_ISVMSPACE) == 0)
4709        max = uvm_map_boundfix(min, max, uvm_maxkaddr);
4710
4711    /* Check for exe-only boundaries. */
4712    if (map->uaddr_exe != NULL) {
4713        max = uvm_map_boundfix(min, max, map->uaddr_exe->uaddr_minaddr);
4714        max = uvm_map_boundfix(min, max, map->uaddr_exe->uaddr_maxaddr);
4715    }
4716
4717    /* Check for brk/stack boundaries. */
4718    if (map->uaddr_brk_stack != NULL) {
4719        max = uvm_map_boundfix(min, max,
4720            map->uaddr_brk_stack->uaddr_minaddr);
4721        max = uvm_map_boundfix(min, max,
4722            map->uaddr_brk_stack->uaddr_maxaddr);
4723    }
4724
4725    /* Check other boundaries. */
4726    for (i = 0; i < nitems(map->uaddr_any); i++) {
4727        uaddr = map->uaddr_any[i];
4728        if (uaddr != NULL) {
4729            max = uvm_map_boundfix(min, max, uaddr->uaddr_minaddr);
4730            max = uvm_map_boundfix(min, max, uaddr->uaddr_maxaddr);
4731        }
4732    }
4733
4734    /* Boundaries at stack and brk() area. */
4735    max = uvm_map_boundfix(min, max, map->s_start);
4736    max = uvm_map_boundfix(min, max, map->s_end);
4737    max = uvm_map_boundfix(min, max, map->b_start);
4738    max = uvm_map_boundfix(min, max, map->b_end);
4739
4740    return max;
4741}
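/*
 * Note: each uvm_map_boundfix() call above can only lower max, so the
 * result is the smallest boundary strictly between min and the original
 * max, or the original max if no boundary is crossed.
 */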
4742
4743/*
4744 * Update map allocation start and end addresses from proc vmspace.
4745 */
4746void
4747uvm_map_vmspace_update(struct vm_map *map,
4748    struct uvm_map_deadq *dead, int flags)
4749{
4750    struct vmspace *vm;
4751    vaddr_t b_start, b_end, s_start, s_end;
4752
4753    KASSERT(map->flags & VM_MAP_ISVMSPACE);
4754    KASSERT(offsetof(struct vmspace, vm_map) == 0);
4755
4756    /*
4757     * Derive actual allocation boundaries from vmspace.
4758     */
4759    vm = (struct vmspace *)map;
4760    b_start = (vaddr_t)vm->vm_daddr;
4761    b_end   = b_start + BRKSIZ;
4762    s_start = MIN((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
4763    s_end   = MAX((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
4764#ifdef DIAGNOSTIC
4765    if ((b_start & (vaddr_t)PAGE_MASK) != 0 ||
4766        (b_end & (vaddr_t)PAGE_MASK) != 0 ||
4767        (s_start & (vaddr_t)PAGE_MASK) != 0 ||
4768        (s_end & (vaddr_t)PAGE_MASK) != 0) {
4769        panic("uvm_map_vmspace_update: vmspace %p invalid bounds: "
4770            "b=0x%lx-0x%lx s=0x%lx-0x%lx",
4771            vm, b_start, b_end, s_start, s_end);
4772    }
4773#endif
4774
4775    if (__predict_true(map->b_start == b_start && map->b_end == b_end &&
4776        map->s_start == s_start && map->s_end == s_end))
4777        return;
4778
4779    uvm_map_freelist_update(map, dead, b_start, b_end,
4780        s_start, s_end, flags);
4781}
4782
4783/*
4784 * Grow kernel memory.
4785 *
4786 * This function is only called for kernel maps when an allocation fails.
4787 *
4788 * If the map has a gap that is large enough to accommodate alloc_sz, this
4789 * function will make sure map->free will include it.
4790 */
4791void
4792uvm_map_kmem_grow(struct vm_map *map, struct uvm_map_deadq *dead,
4793    vsize_t alloc_sz, int flags)
4794{
4795    vsize_t sz;
4796    vaddr_t end;
4797    struct vm_map_entry *entry;
4798
4799    /* Kernel memory only. */
4800    KASSERT((map->flags & VM_MAP_ISVMSPACE) == 0);
4801    /* Destroy free list. */
4802    uvm_map_freelist_update_clear(map, dead);
4803
4804    /* Include the guard page in the hard minimum requirement of alloc_sz. */
4805    if (map->flags & VM_MAP_GUARDPAGES)
4806        alloc_sz += PAGE_SIZE;
4807
4808    /*
4809     * Grow by ALLOCMUL * alloc_sz, but at least VM_MAP_KSIZE_DELTA.
4810     *
4811     * Don't handle the case where the multiplication overflows:
4812     * if that happens, the allocation is probably too big anyway.
4813     */
4814    sz = MAX(VM_MAP_KSIZE_ALLOCMUL * alloc_sz, VM_MAP_KSIZE_DELTA);
4815
4816    /*
4817     * Walk forward until a gap large enough for alloc_sz shows up.
4818     *
4819     * We assume the kernel map has no boundaries.
4820     * uvm_maxkaddr may be zero.
4821     */
4822    end = MAX(uvm_maxkaddr, map->min_offset);
4823    entry = uvm_map_entrybyaddr(&map->addr, end);
4824    while (entry && entry->fspace < alloc_sz)
4825        entry = RBT_NEXT(uvm_map_addr, entry);
4826    if (entry) {
4827        end = MAX(VMMAP_FREE_START(entry), end);
4828        end += MIN(sz, map->max_offset - end);
4829    } else
4830        end = map->max_offset;
4831
4832    /* Reserve pmap entries. */
4833#ifdef PMAP_GROWKERNEL
4834    uvm_maxkaddr = pmap_growkernel(end);
4835#else
4836    uvm_maxkaddr = MAX(uvm_maxkaddr, end);
4837#endif
4838
4839    /* Rebuild free list. */
4840    uvm_map_freelist_update_refill(map, flags);
4841}
4842
4843/*
4844 * Freelist update subfunction: unlink all entries from freelists.
4845 */
4846void
4847uvm_map_freelist_update_clear(struct vm_map *map, struct uvm_map_deadq *dead)
4848{
4849    struct uvm_addr_state *free;
4850    struct vm_map_entry *entry, *prev, *next;
4851
4852    prev = NULL;
4853    for (entry = RBT_MIN(uvm_map_addr, &map->addr); entry != NULL;
4854        entry = next) {
4855        next = RBT_NEXT(uvm_map_addr, entry);
4856
4857        free = uvm_map_uaddr_e(map, entry);
4858        uvm_mapent_free_remove(map, free, entry);
4859
4860        if (prev != NULL && entry->start == entry->end) {
4861            prev->fspace += VMMAP_FREE_END(entry) - entry->end;
4862            uvm_mapent_addr_remove(map, entry);
4863            DEAD_ENTRY_PUSH(dead, entry);
4864        } else
4865            prev = entry;
4866    }
4867}
4868
4869/*
4870 * Freelist update subfunction: refill the freelists with entries.
4871 */
4872void
4873uvm_map_freelist_update_refill(struct vm_map *map, int flags)
4874{
4875    struct vm_map_entry *entry;
4876    vaddr_t min, max;
4877
4878    RBT_FOREACH(entry, uvm_map_addr, &map->addr) {
4879        min = VMMAP_FREE_START(entry);
4880        max = VMMAP_FREE_END(entry);
4881        entry->fspace = 0;
4882
4883        entry = uvm_map_fix_space(map, entry, min, max, flags);
4884    }
4885
4886    uvm_tree_sanity(map, __FILE__, __LINE__);
4887}
4888
4889/*
4890 * Change {b,s}_{start,end} allocation ranges and associated free lists.
4891 */
4892void
4893uvm_map_freelist_update(struct vm_map *map, struct uvm_map_deadq *dead,
4894    vaddr_t b_start, vaddr_t b_end, vaddr_t s_start, vaddr_t s_end, int flags)
4895{
4896    KDASSERT(b_end >= b_start && s_end >= s_start);
4897
4898    /* Clear all free lists. */
4899    uvm_map_freelist_update_clear(map, dead);
4900
4901    /* Apply new bounds. */
4902    map->b_start = b_start;
4903    map->b_end   = b_end;
4904    map->s_start = s_start;
4905    map->s_end   = s_end;
4906
4907    /* Refill free lists. */
4908    uvm_map_freelist_update_refill(map, flags);
4909}
4910
4911/*
4912 * Assign a uvm_addr_state to the specified pointer in vm_map.
4913 *
4914 * May sleep.
4915 */
4916void
4917uvm_map_set_uaddr(struct vm_map *map, struct uvm_addr_state **which,
4918    struct uvm_addr_state *newval)
4919{
4920    struct uvm_map_deadq dead;
4921
4922    /* Pointer which must be in this map. */
4923    KASSERT(which != NULL);
4924    KASSERT((void*)map <= (void*)(which) &&
4925        (void*)(which) < (void*)(map + 1));
4926
4927    vm_map_lock(map);
4928    TAILQ_INIT(&dead);
4929    uvm_map_freelist_update_clear(map, &dead);
4930
4931    uvm_addr_destroy(*which);
4932    *which = newval;
4933
4934    uvm_map_freelist_update_refill(map, 0);
4935    vm_map_unlock(map);
4936    uvm_unmap_detach(&dead, 0);
4937}
4938
4939/*
4940 * Correct space insert.
4941 *
4942 * Entry must not be on any freelist.
4943 */
4944struct vm_map_entry*
4945uvm_map_fix_space(struct vm_map *map, struct vm_map_entry *entry,
4946    vaddr_t min, vaddr_t max, int flags)
4947{
4948    struct uvm_addr_state   *free, *entfree;
4949    vaddr_t          lmax;
4950
4951    KASSERT(entry == NULL || (entry->etype & UVM_ET_FREEMAPPED) == 0);
4952    KDASSERT(min <= max);
4953    KDASSERT((entry != NULL && VMMAP_FREE_END(entry) == min) ||
4954        min == map->min_offset);
4955
4956    /*
4957     * During the function, entfree will always point at the uaddr state
4958     * for entry.
4959     */
4960    entfree = (entry == NULL ? NULL :
4961        uvm_map_uaddr_e(map, entry));
4962
4963    while (min != max) {
4964        /* Claim guard page for entry. */
4965        if ((map->flags & VM_MAP_GUARDPAGES) && entry != NULL &&
4966            VMMAP_FREE_END(entry) == entry->end &&
4967            entry->start != entry->end) {
4968            if (max - min == 2 * PAGE_SIZE) {
4969                /*
4970                 * If the free-space gap is exactly 2 pages,
4971                 * we make the guard 2 pages instead of 1.
4972                 * Because in a guarded map, an area needs
4973                 * at least 2 pages to allocate from:
4974                 * one page for the allocation and one for
4975                 * the guard.
4976                 */
4977                entry->guard = 2 * PAGE_SIZE;
4978                min = max;
4979            } else {
4980                entry->guard = PAGE_SIZE;
4981                min += PAGE_SIZE;
4982            }
4983            continue;
4984        }
4985
4986        /*
4987         * Handle the case where entry has a 2-page guard, but the
4988         * space after entry is freed.
4989         */
4990        if (entry != NULL && entry->fspace == 0 &&
4991            entry->guard > PAGE_SIZE) {
4992            entry->guard = PAGE_SIZE;
4993            min = VMMAP_FREE_START(entry);
4994        }
4995
4996        lmax = uvm_map_boundary(map, min, max);
4997        free = uvm_map_uaddr(map, min);
4998
4999        /*
5000         * Entries are merged if they point at the same free list.
5001         * Exception to that rule: if min == uvm_maxkaddr, a new
5002         * entry is started regardless (otherwise the allocators
5003         * will get confused).
5004         */
5005        if (entry != NULL && free == entfree &&
5006            !((map->flags & VM_MAP_ISVMSPACE) == 0 &&
5007            min == uvm_maxkaddr)) {
5008            KDASSERT(VMMAP_FREE_END(entry) == min);
5009            entry->fspace += lmax - min;
5010        } else {
5011            /*
5012             * Commit entry to its free list: no more free space
5013             * will be added to it.
5014             * Start a new entry and accumulate the remaining
5015             * space there instead.
5016             */
5017            if (entry != NULL)
5018                uvm_mapent_free_insert(map, entfree, entry);
5019
5020            /* New entry for new uaddr. */
5021            entry = uvm_mapent_alloc(map, flags);
5022            KDASSERT(entry != NULL);
5023            entry->end = entry->start = min;
5024            entry->guard = 0;
5025            entry->fspace = lmax - min;
5026            entry->object.uvm_obj = NULL;
5027            entry->offset = 0;
5028            entry->etype = 0;
5029            entry->protection = entry->max_protection = 0;
5030            entry->inheritance = 0;
5031            entry->wired_count = 0;
5032            entry->advice = 0;
5033            entry->aref.ar_pageoff = 0;
5034            entry->aref.ar_amap = NULL;
5035            uvm_mapent_addr_insert(map, entry);
5036
5037            entfree = free;
5038        }
5039
5040        min = lmax;
5041    }
5042    /* Finally put entry on the uaddr state. */
5043    if (entry != NULL)
5044        uvm_mapent_free_insert(map, entfree, entry);
5045
5046    return entry;
5047}
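
/*
 * Worked example (illustrative, made-up addresses; guard pages left out
 * for brevity): in a kernel map with uvm_maxkaddr == 0x4000, a call
 * accounting for the free space min = 0x2000 .. max = 0x6000 after
 * "entry" runs the loop above twice:
 *
 *   pass 1: uvm_map_boundary() clips lmax to 0x4000; the range
 *           0x2000-0x4000 uses entry's uaddr state, so entry->fspace
 *           simply grows by 0x2000.
 *   pass 2: min == uvm_maxkaddr, so entry is committed to its free list
 *           and a fresh zero-sized entry at 0x4000 receives the
 *           remaining 0x2000 of free space.
 *
 * Whichever entry accumulated space last is inserted into its uaddr
 * state after the loop.
 */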
5048
5049/*
5050 * MQuery style of allocation.
5051 *
5052 * This allocator searches forward until sufficient space is found to map
5053 * the given size.
5054 *
5055 * XXX: factor in offset (via pmap_prefer) and protection?
5056 */
5057int
5058uvm_map_mquery(struct vm_map *map, vaddr_t *addr_p, vsize_t sz, voff_t offset,
5059    int flags)
5060{
5061    struct vm_map_entry *entry, *last;
5062    vaddr_t addr;
5063    vaddr_t tmp, pmap_align, pmap_offset;
5064    int error;
5065
5066    addr = *addr_p;
5067    vm_map_lock_read(map);
5068
5069    /* Configure pmap prefer. */
5070    if (offset != UVM_UNKNOWN_OFFSET) {
5071        pmap_align = MAX(PAGE_SIZE, PMAP_PREFER_ALIGN());
5072        pmap_offset = PMAP_PREFER_OFFSET(offset);
5073    } else {
5074        pmap_align = PAGE_SIZE;
5075        pmap_offset = 0;
5076    }
5077
5078    /* Align address to pmap_prefer unless UVM_FLAG_FIXED is set. */
5079    if (!(flags & UVM_FLAG_FIXED) && offset != UVM_UNKNOWN_OFFSET) {
5080        tmp = (addr & ~(pmap_align - 1)) | pmap_offset;
5081        if (tmp < addr)
5082            tmp += pmap_align;
5083        addr = tmp;
5084    }
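
    /*
     * Worked example of the rounding above (illustrative numbers): with
     * pmap_align = 0x10000, pmap_offset = 0x3000 and addr = 0x24000,
     * masking yields 0x20000 and OR-ing in the offset yields 0x23000.
     * That lies below the requested address, so one alignment step is
     * added and addr becomes 0x33000: the lowest address at or above
     * the original with the preferred colouring.
     */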
5085
5086    /* First, check if the requested range is fully available. */
5087    entry = uvm_map_entrybyaddr(&map->addr, addr);
5088    last = NULL;
5089    if (uvm_map_isavail(map, NULL, &entry, &last, addr, sz)) {
5090        error = 0;
5091        goto out;
5092    }
5093    if (flags & UVM_FLAG_FIXED) {
5094        error = EINVAL;
5095        goto out;
5096    }
5097
5098    error = ENOMEM; /* Default error from here. */
5099
5100    /*
5101     * At this point, the memory at <addr, sz> is not available.
5102     * The reasons are:
5103     * [1] it's outside the map,
5104     * [2] it starts in used memory (and therefore needs to move
5105     *     toward the first free page in entry),
5106     * [3] it starts in free memory but bumps into used memory.
5107     *
5108     * Note that for case [2], the forward movement is handled by the
5109     * for loop below.
5110     */
5111    if (entry == NULL) {
5112        /* [1] Outside the map. */
5113        if (addr >= map->max_offset)
5114            goto out;
5115        else
5116            entry = RBT_MIN(uvm_map_addr, &map->addr);
5117    } else if (VMMAP_FREE_START(entry) <= addr) {
5118        /* [3] Bumped into used memory. */
5119        entry = RBT_NEXT(uvm_map_addr, entry);
5120    }
5121
5122    /* Walk forward until an entry with enough free space is found. */
5123    for (; entry != NULL;
5124        entry = RBT_NEXT(uvm_map_addr, entry)) {
5125        if (entry->fspace == 0)
5126            continue;
5127        addr = VMMAP_FREE_START(entry);
5128
5129restart:    /* Restart address checks on address change. */
5130        tmp = (addr & ~(pmap_align - 1)) | pmap_offset;
5131        if (tmp < addr)
5132            tmp += pmap_align;
5133        addr = tmp;
5134        if (addr >= VMMAP_FREE_END(entry))
5135            continue;
5136
5137        /* Skip brk() allocation addresses. */
5138        if (addr + sz > map->b_start && addr < map->b_end) {
5139            if (VMMAP_FREE_END(entry) > map->b_end) {
5140                addr = map->b_end;
5141                goto restart;
5142            } else
5143                continue;
5144        }
5145        /* Skip stack allocation addresses. */
5146        if (addr + sz > map->s_start && addr < map->s_end) {
5147            if (VMMAP_FREE_END(entry) > map->s_end) {
5148                addr = map->s_end;
5149                goto restart;
5150            } else
5151                continue;
5152        }
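
        /*
         * Example of the skipping above (illustrative addresses): with
         * a brk() area of 0x1000-0x5000, a candidate addr of 0x3000
         * inside a free range reaching past 0x5000 is moved to b_end
         * (0x5000) and re-checked; if the free range ends at or before
         * 0x5000, the entry is skipped entirely and the search moves
         * on to the next entry.
         */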
5153
5154        last = NULL;
5155        if (uvm_map_isavail(map, NULL, &entry, &last, addr, sz)) {
5156            error = 0;
5157            goto out;
5158        }
5159    }
5160
5161out:
5162    vm_map_unlock_read(map);
5163    if (error == 0)
5164        *addr_p = addr;
5165    return error;
5166}
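
/*
 * Illustrative sketch (not compiled in): how a caller might probe for
 * free space with uvm_map_mquery().  Only the contract documented above
 * is relied upon; the wrapper function itself is hypothetical.
 */
#if 0
int
example_probe(struct vm_map *map, vaddr_t hint, vsize_t sz, vaddr_t *found)
{
    vaddr_t addr = hint;
    int error;

    /* Search forward from hint, without a pmap_prefer constraint. */
    error = uvm_map_mquery(map, &addr, sz, UVM_UNKNOWN_OFFSET, 0);
    if (error == 0)
        *found = addr;  /* first suitable address at or above hint */
    return error;       /* ENOMEM if nothing fits forward of hint */
}
#endif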
5167
5168/*
5169 * Determine allocation bias.
5170 *
5171 * Returns 1 for a bias towards high addresses, -1 for a bias towards
5172 * low addresses, or 0 for no bias.
5173 * The bias mechanism is intended to avoid clashing with brk() and stack
5174 * areas.
5175 */
5176int
5177uvm_mapent_bias(struct vm_map *map, struct vm_map_entry *entry)
5178{
5179    vaddr_t start, end;
5180
5181    start = VMMAP_FREE_START(entry);
5182    end = VMMAP_FREE_END(entry);
5183
5184    /* Stay at the top of the brk() area. */
5185    if (end >= map->b_start && start < map->b_end)
5186        return 1;
5187    /* Stay at the far end of the stack area. */
5188    if (end >= map->s_start && start < map->s_end) {
5189#ifdef MACHINE_STACK_GROWS_UP
5190        return 1;
5191#else
5192        return -1;
5193#endif
5194    }
5195
5196    /* No bias, this area is meant for us. */
5197    return 0;
5198}
5199
5200
5201boolean_t
5202vm_map_lock_try_ln(struct vm_map *map, char *file, int line)
5203{
5204    boolean_t rv;
5205
5206    if (map->flags & VM_MAP_INTRSAFE) {
5207        rv = _mtx_enter_try(&map->mtx LOCK_FL_ARGS);
5208    } else {
5209        mtx_enter(&map->flags_lock);
5210        if (map->flags & VM_MAP_BUSY) {
5211            mtx_leave(&map->flags_lock);
5212            return (FALSE);
5213        }
5214        mtx_leave(&map->flags_lock);
5215        rv = (_rw_enter(&map->lock, RW_WRITE|RW_NOSLEEP LOCK_FL_ARGS)
5216            == 0);
5217        /* check if the lock is busy and back out if we won the race */
5218        if (rv) {
5219            mtx_enter(&map->flags_lock);
5220            if (map->flags & VM_MAP_BUSY) {
5221                _rw_exit(&map->lock LOCK_FL_ARGS);
5222                rv = FALSE;
5223            }
5224            mtx_leave(&map->flags_lock);
5225        }
5226    }
5227
5228    if (rv) {
5229        map->timestamp++;
5230        LPRINTF(("map   lock: %p (at %s %d)\n", map, file, line));
5231        uvm_tree_sanity(map, file, line);
5232        uvm_tree_size_chk(map, file, line);
5233    }
5234
5235    return (rv);
5236}
5237
5238void
5239vm_map_lock_ln(struct vm_map *map, char *file, int line)
5240{
5241    if ((map->flags & VM_MAP_INTRSAFE) == 0) {
5242        do {
5243            mtx_enter(&map->flags_lock);
5244tryagain:
5245            while (map->flags & VM_MAP_BUSY) {
5246                map->flags |= VM_MAP_WANTLOCK;
5247                msleep(&map->flags, &map->flags_lock,
5248                    PVM, vmmapbsy, 0);
5249            }
5250            mtx_leave(&map->flags_lock);
5251        } while (_rw_enter(&map->lock, RW_WRITE|RW_SLEEPFAIL
5252            LOCK_FL_ARGS) != 0);
5253        /* check if the lock is busy and back out if we won the race */
5254        mtx_enter(&map->flags_lock);
5255        if (map->flags & VM_MAP_BUSY) {
5256            _rw_exit(&map->lock LOCK_FL_ARGS);
5257            goto tryagain;
5258        }
5259        mtx_leave(&map->flags_lock);
5260    } else {
5261        _mtx_enter(&map->mtx LOCK_FL_ARGS);
5262    }
5263
5264    map->timestamp++;
5265    LPRINTF(("map   lock: %p (at %s %d)\n", map, file, line));
5266    uvm_tree_sanity(map, file, line);
5267    uvm_tree_size_chk(map, file, line);
5268}
5269
5270void
5271vm_map_lock_read_ln(struct vm_map *map, char *file, int line)
5272{
5273    if ((map->flags & VM_MAP_INTRSAFE) == 0)
5274        _rw_enter_read(&map->lock LOCK_FL_ARGS);
5275    else
5276        _mtx_enter(&map->mtx LOCK_FL_ARGS);
5277    LPRINTF(("map   lock: %p (at %s %d)\n", map, file, line));
5278    uvm_tree_sanity(map, file, line);
5279    uvm_tree_size_chk(map, file, line);
5280}
5281
5282void
5283vm_map_unlock_ln(struct vm_map *map, char *file, int line)
5284{
5285    uvm_tree_sanity(map, file, line);
5286    uvm_tree_size_chk(map, file, line);
5287    LPRINTF(("map unlock: %p (at %s %d)\n", map, file, line));
5288    if ((map->flags & VM_MAP_INTRSAFE) == 0)
5289        _rw_exit(&map->lock LOCK_FL_ARGS);
5290    else
5291        _mtx_leave(&map->mtx LOCK_FL_ARGS);
5292}
5293
5294void
5295vm_map_unlock_read_ln(struct vm_map *map, char *file, int line)
5296{
5297    /* XXX: RO */ uvm_tree_sanity(map, file, line);
5298    /* XXX: RO */ uvm_tree_size_chk(map, file, line);
5299    LPRINTF(("map unlock: %p (at %s %d)\n", map, file, line));
5300    if ((map->flags & VM_MAP_INTRSAFE) == 0)
5301        _rw_exit_read(&map->lock LOCK_FL_ARGS);
5302    else
5303        _mtx_leave(&map->mtx LOCK_FL_ARGS);
5304}
5305
5306void
5307vm_map_downgrade_ln(struct vm_map *map, char *file, int line)
5308{
5309    uvm_tree_sanity(map, file, line);
5310    uvm_tree_size_chk(map, file, line);
5311    LPRINTF(("map unlock: %p (at %s %d)\n", map, file, line));
5312    LPRINTF(("map   lock: %p (at %s %d)\n", map, file, line));
5313    KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
5314    if ((map->flags & VM_MAP_INTRSAFE) == 0)
5315        _rw_enter(&map->lock, RW_DOWNGRADE LOCK_FL_ARGS);
5316}
5317
5318void
5319vm_map_upgrade_ln(struct vm_map *map, char *file, int line)
5320{
5321    /* XXX: RO */ uvm_tree_sanity(map, file, line);
5322    /* XXX: RO */ uvm_tree_size_chk(map, file, line);
5323    LPRINTF(("map unlock: %p (at %s %d)\n", map, file, line));
5324    KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
5325    if ((map->flags & VM_MAP_INTRSAFE) == 0) {
5326        _rw_exit_read(&map->lock LOCK_FL_ARGS);
5327        _rw_enter_write(&map->lock LOCK_FL_ARGS);
5328    }
5329    LPRINTF(("map   lock: %p (at %s %d)\n", map, file, line));
5330    uvm_tree_sanity(map, file, line);
5331}
5332
5333void
5334vm_map_busy_ln(struct vm_map *map, char *file, int line)
5335{
5336    KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
5337    mtx_enter(&map->flags_lock);
5338    map->flags |= VM_MAP_BUSY;
5339    mtx_leave(&map->flags_lock);
5340}
5341
5342void
5343vm_map_unbusy_ln(struct vm_map *map, char *file, int line)
5344{
5345    int oflags;
5346
5347    KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
5348    mtx_enter(&map->flags_lock);
5349    oflags = map->flags;
5350    map->flags &= ~(VM_MAP_BUSY|VM_MAP_WANTLOCK);
5351    mtx_leave(&map->flags_lock);
5352    if (oflags & VM_MAP_WANTLOCK)
5353        wakeup(&map->flags);
5354}
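
/*
 * Illustrative sketch (not compiled in) of the busy protocol implemented
 * by vm_map_busy_ln()/vm_map_unbusy_ln() together with the lock routines
 * above, using the usual vm_map_lock()/vm_map_busy() wrapper macros.
 * The caller shown is hypothetical.  Note that for non-INTRSAFE maps the
 * busy owner must clear the flag before re-taking the write lock, since
 * vm_map_lock() sleeps while VM_MAP_BUSY is set.
 */
#if 0
void
example_busy_section(struct vm_map *map)
{
    vm_map_lock(map);
    vm_map_busy(map);       /* keep other writers out while unlocked */
    vm_map_unlock(map);

    /* ... sleep or perform I/O without holding the map's rwlock ... */

    vm_map_unbusy(map);     /* clear VM_MAP_BUSY, wake WANTLOCK sleepers */
    vm_map_lock(map);       /* now safe to take the write lock again */
    /* ... continue with the map write-locked ... */
    vm_map_unlock(map);
}
#endif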
5355
5356#ifndef SMALL_KERNEL
5357int
5358uvm_map_fill_vmmap(struct vm_map *map, struct kinfo_vmentry *kve,
5359    size_t *lenp)
5360{
5361    struct vm_map_entry *entry;
5362    vaddr_t start;
5363    int cnt, maxcnt, error = 0;
5364
5365    KASSERT(*lenp > 0);
5366    KASSERT((*lenp % sizeof(*kve)) == 0);
5367    cnt = 0;
5368    maxcnt = *lenp / sizeof(*kve);
5369    KASSERT(maxcnt > 0);
5370
5371    /*
5372     * Return only entries whose start address is at or above the given base
5373     * address.  This allows userland to iterate without knowing the
5374     * number of entries beforehand.
5375     */
5376    start = (vaddr_t)kve[0].kve_start;
5377
5378    vm_map_lock(map);
5379    RBT_FOREACH(entry, uvm_map_addr, &map->addr) {
5380        if (cnt == maxcnt) {
5381            error = ENOMEM;
5382            break;
5383        }
5384        if (start != 0 && entry->start < start)
5385            continue;
5386        kve->kve_start = entry->start;
5387        kve->kve_end = entry->end;
5388        kve->kve_guard = entry->guard;
5389        kve->kve_fspace = entry->fspace;
5390        kve->kve_fspace_augment = entry->fspace_augment;
5391        kve->kve_offset = entry->offset;
5392        kve->kve_wired_count = entry->wired_count;
5393        kve->kve_etype = entry->etype;
5394        kve->kve_protection = entry->protection;
5395        kve->kve_max_protection = entry->max_protection;
5396        kve->kve_advice = entry->advice;
5397        kve->kve_inheritance = entry->inheritance;
5398        kve->kve_flags = entry->flags;
5399        kve++;
5400        cnt++;
5401    }
5402    vm_map_unlock(map);
5403
5404    KASSERT(cnt <= maxcnt);
5405
5406    *lenp = sizeof(*kve) * cnt;
5407    return error;
5408}
5409#endif
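
/*
 * Illustrative userland-side sketch (not compiled in) of the iteration
 * scheme described above, assuming it is reached through the
 * kern.proc.vmmap sysctl (CTL_KERN, KERN_PROC_VMMAP, pid) and assuming
 * partially filled buffers are still copied out when the kernel reports
 * ENOMEM.  Needs <sys/types.h>, <sys/sysctl.h> and <errno.h>.
 */
#if 0
    struct kinfo_vmentry kve[64];
    int mib[3] = { CTL_KERN, KERN_PROC_VMMAP, pid };
    size_t len, n;

    kve[0].kve_start = 0;           /* start below every entry */
    for (;;) {
        len = sizeof(kve);
        if (sysctl(mib, 3, kve, &len, NULL, 0) == -1 && errno != ENOMEM)
            break;                  /* real failure */
        if (len == 0)
            break;                  /* no entries left */
        n = len / sizeof(kve[0]);
        /* ... consume kve[0] .. kve[n - 1] ... */
        kve[0].kve_start = kve[n - 1].kve_end; /* resume above last end */
    }
#endif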
5410
5411
5412RBT_GENERATE_AUGMENT(uvm_map_addr, vm_map_entry, daddrs.addr_entry,
5413    uvm_mapentry_addrcmp, uvm_map_addr_augment);
5414
5415
5416/*
5417 * MD code: vmspace allocator setup.
5418 */
5419
5420#ifdef __i386__
5421void
5422uvm_map_setup_md(struct vm_map *map)
5423{
5424    vaddr_t     min, max;
5425
5426    min = map->min_offset;
5427    max = map->max_offset;
5428
5429    /*
5430     * Ensure the selectors will not try to manage page 0;
5431     * it's too special.
5432     */
5433    if (min < VMMAP_MIN_ADDR)
5434        min = VMMAP_MIN_ADDR;
5435
5436#if 0   /* Cool stuff, not yet */
5437    /* Executable code is special. */
5438    map->uaddr_exe = uaddr_rnd_create(min, I386_MAX_EXE_ADDR);
5439    /* Place normal allocations beyond executable mappings. */
5440    map->uaddr_any[3] = uaddr_pivot_create(2 * I386_MAX_EXE_ADDR, max);
5441#else   /* Crappy stuff, for now */
5442    map->uaddr_any[0] = uaddr_rnd_create(min, max);
5443#endif
5444
5445#ifndef SMALL_KERNEL
5446    map->uaddr_brk_stack = uaddr_stack_brk_create(min, max);
5447#endif /* !SMALL_KERNEL */
5448}
5449#elif __LP64__
5450void
5451uvm_map_setup_md(struct vm_map *map)
5452{
5453    vaddr_t     min, max;
5454
5455    min = map->min_offset;
5456    max = map->max_offset;
5457
5458    /*
5459     * Ensure the selectors will not try to manage page 0;
5460     * it's too special.
5461     */
5462    if (min < VMMAP_MIN_ADDR)
5463        min = VMMAP_MIN_ADDR;
5464
5465#if 0   /* Cool stuff, not yet */
5466    map->uaddr_any[3] = uaddr_pivot_create(MAX(min, 0x100000000ULL), max);
5467#else   /* Crappy stuff, for now */
5468    map->uaddr_any[0] = uaddr_rnd_create(min, max);
5469#endif
5470
5471#ifndef SMALL_KERNEL
5472    map->uaddr_brk_stack = uaddr_stack_brk_create(min, max);
5473#endif /* !SMALL_KERNEL */
5474}
5475#else   /* non-i386, 32 bit */
5476void
5477uvm_map_setup_md(struct vm_map *map)
5478{
5479    vaddr_t     min, max;
5480
5481    min = map->min_offset;
5482    max = map->max_offset;
5483
5484    /*
5485     * Ensure the selectors will not try to manage page 0;
5486     * it's too special.
5487     */
5488    if (min < VMMAP_MIN_ADDR)
5489        min = VMMAP_MIN_ADDR;
5490
5491#if 0   /* Cool stuff, not yet */
5492    map->uaddr_any[3] = uaddr_pivot_create(min, max);
5493#else   /* Crappy stuff, for now */
5494    map->uaddr_any[0] = uaddr_rnd_create(min, max);
5495#endif
5496
5497#ifndef SMALL_KERNEL
5498    map->uaddr_brk_stack = uaddr_stack_brk_create(min, max);
5499#endif /* !SMALL_KERNEL */
5500}
5501#endif
5502