/* $NetBSD: pmap.c,v 1.427 2024/10/08 21:09:08 riastradh Exp $ */ /* * Copyright (c) 2008, 2010, 2016, 2017, 2019, 2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran, and by Maxime Villard. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 2007 Manuel Bouyer. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 2006 Mathieu Ropert * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ /* * Copyright 2001 (c) Wasabi Systems, Inc. * All rights reserved. * * Written by Frank van der Linden for Wasabi Systems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed for the NetBSD Project by * Wasabi Systems, Inc. * 4. The name of Wasabi Systems, Inc. may not be used to endorse * or promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1997 Charles D. Cranor and Washington University. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.427 2024/10/08 21:09:08 riastradh Exp $"); #include "opt_user_ldt.h" #include "opt_lockdebug.h" #include "opt_multiprocessor.h" #include "opt_xen.h" #include "opt_svs.h" #include "opt_kaslr.h" #include "opt_efi.h" #define __MUTEX_PRIVATE /* for assertions */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef XEN #include #include #include #endif #ifdef __HAVE_DIRECT_MAP #include #endif /* * general info: * * - for an explanation of how the x86 MMU hardware works see * the comments in . * * - for an explanation of the general memory structure used by * this pmap (including the recursive mapping), see the comments * in . * * this file contains the code for the "pmap module." the module's * job is to manage the hardware's virtual to physical address mappings. * note that there are two levels of mapping in the VM system: * * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's * to map ranges of virtual address space to objects/files. for * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only * to the file /bin/ls starting at offset zero." note that * the upper layer mapping is not concerned with how individual * vm_pages are mapped. * * [2] the lower layer of the VM system (the pmap) maintains the mappings * from virtual addresses. it is concerned with which vm_page is * mapped where. for example, when you run /bin/ls and start * at page 0x1000 the fault routine may lookup the correct page * of the /bin/ls file and then ask the pmap layer to establish * a mapping for it. * * note that information in the lower layer of the VM system can be * thrown away since it can easily be reconstructed from the info * in the upper layer. * * data structures we use include: * * - struct pmap: describes the address space of one thread * - struct pmap_page: describes one pv-tracked page, without * necessarily a corresponding vm_page * - struct pv_entry: describes one mapping of a PA * - pmap_page::pp_pvlist: there is one list per pv-tracked page of * physical memory. the pp_pvlist points to a list of pv_entry * structures which describe all the pairs that this * page is mapped in. this is critical for page based operations * such as pmap_page_protect() [change protection on _all_ mappings * of a page] */ /* * Locking * * We have the following locks that we must deal with, listed in the order * that they are acquired: * * pg->uobject->vmobjlock, pg->uanon->an_lock * * For managed pages, these per-object locks are taken by the VM system * before calling into the pmap module - either a read or write hold. * The lock hold prevent pages from changing identity while the pmap is * operating on them. For example, the same lock is held across a call * to pmap_remove() and the following call to pmap_update(), so that a * page does not gain a new identity while its TLB visibility is stale. * * pmap->pm_lock * * This lock protects the fields in the pmap structure including the * non-kernel PDEs in the PDP, the PTEs, and PTPs and connected data * structures. For modifying unmanaged kernel PTEs it is not needed as * kernel PDEs are never freed, and the kernel is expected to be self * consistent (and the lock can't be taken for unmanaged kernel PTEs, * because they can be modified from interrupt context). 
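 *
 *	As an illustrative sketch (not code from this file; uobj, map, sva
 *	and eva are placeholder names), a typical unmap of a managed page
 *	follows the ordering described above: the object lock is held
 *	across both the removal and the following update, while
 *	pmap->pm_lock is taken internally by pmap_remove():
 *
 *		rw_enter(uobj->vmobjlock, RW_WRITER);
 *		pmap_remove(vm_map_pmap(map), sva, eva);
 *		pmap_update(vm_map_pmap(map));
 *		rw_exit(uobj->vmobjlock);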
* * pmaps_lock * * This lock protects the list of active pmaps (headed by "pmaps"). * It's acquired when adding or removing pmaps or adjusting kernel PDEs. * * pp_lock * * This per-page lock protects PV entry lists and the embedded PV entry * in each vm_page, allowing for concurrent operation on pages by * different pmaps. This is a spin mutex at IPL_VM, because at the * points it is taken context switching is usually not tolerable, and * spin mutexes must block out interrupts that could take kernel_lock. */ /* uvm_object is abused here to index pmap_pages; make assertions happy. */ #ifdef DIAGNOSTIC #define PMAP_DUMMY_LOCK(pm) rw_enter(&(pm)->pm_dummy_lock, RW_WRITER) #define PMAP_DUMMY_UNLOCK(pm) rw_exit(&(pm)->pm_dummy_lock) #else #define PMAP_DUMMY_LOCK(pm) #define PMAP_DUMMY_UNLOCK(pm) #endif static const struct uvm_pagerops pmap_pager = { /* nothing */ }; /* * pl_i(va, X) == plX_i(va) <= pl_i_roundup(va, X) */ #define pl_i(va, lvl) \ (((VA_SIGN_POS(va)) & ptp_frames[(lvl)-1]) >> ptp_shifts[(lvl)-1]) #define pl_i_roundup(va, lvl) pl_i((va)+ ~ptp_frames[(lvl)-1], (lvl)) /* * PTP macros: * a PTP's index is the PD index of the PDE that points to it * a PTP's offset is the byte-offset in the PTE space that this PTP is at * a PTP's VA is the first VA mapped by that PTP */ #define ptp_va2o(va, lvl) (pl_i(va, (lvl)+1) * PAGE_SIZE) const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER; const vaddr_t ptp_frames[] = PTP_FRAME_INITIALIZER; const int ptp_shifts[] = PTP_SHIFT_INITIALIZER; const long nkptpmax[] = NKPTPMAX_INITIALIZER; const long nbpd[] = NBPD_INITIALIZER; #ifdef i386 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER; #else pd_entry_t *normal_pdes[3]; #endif long nkptp[] = NKPTP_INITIALIZER; struct pmap_head pmaps; kmutex_t pmaps_lock __cacheline_aligned; struct pcpu_area *pcpuarea __read_mostly; static vaddr_t pmap_maxkvaddr; /* * Misc. event counters. */ struct evcnt pmap_iobmp_evcnt; struct evcnt pmap_ldt_evcnt; /* * PAT */ static bool cpu_pat_enabled __read_mostly = false; /* * Global data structures */ static struct pmap kernel_pmap_store __cacheline_aligned; /* kernel's pmap */ struct pmap *const kernel_pmap_ptr = &kernel_pmap_store; static rb_tree_t pmap_kernel_rb __cacheline_aligned; struct bootspace bootspace __read_mostly; struct slotspace slotspace __read_mostly; /* Set to PTE_NX if supported. */ pd_entry_t pmap_pg_nx __read_mostly = 0; /* Set to PTE_G if supported. */ pd_entry_t pmap_pg_g __read_mostly = 0; /* Set to true if large pages are supported. */ int pmap_largepages __read_mostly = 0; paddr_t lowmem_rsvd __read_mostly; paddr_t avail_start __read_mostly; /* PA of first available physical page */ paddr_t avail_end __read_mostly; /* PA of last available physical page */ #ifdef XENPV paddr_t pmap_pa_start; /* PA of first physical page for this domain */ paddr_t pmap_pa_end; /* PA of last physical page for this domain */ #endif #define VM_PAGE_TO_PP(pg) (&(pg)->mdpage.mp_pp) #define PMAP_CHECK_PP(pp) \ KASSERTMSG((pp)->pp_lock.mtx_ipl._ipl == IPL_VM, "bad pmap_page %p", pp) #define PAGE_ALIGNED(pp) \ __builtin_assume_aligned((void *)(pp), PAGE_SIZE) /* * Other data structures */ static pt_entry_t protection_codes[8] __read_mostly; static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */ /* * The following two vaddr_t's are used during system startup to keep track of * how much of the kernel's VM space we have used. Once the system is started, * the management of the remaining kernel VM space is turned over to the * kernel_map vm_map. 
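 *
 * For illustration, the early allocator pmap_bootstrap_valloc() defined
 * further down simply advances virtual_avail; a sketch, with the bounds
 * check added here only for clarity (the real helper does not check):
 *
 *	vaddr_t va = virtual_avail;
 *	virtual_avail += npages * PAGE_SIZE;
 *	KASSERT(virtual_avail <= virtual_end);
 *	return va;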
*/ static vaddr_t virtual_avail __read_mostly; /* VA of first free KVA */ static vaddr_t virtual_end __read_mostly; /* VA of last free KVA */ #ifndef XENPV /* * LAPIC virtual address, and fake physical address. */ volatile vaddr_t local_apic_va __read_mostly; paddr_t local_apic_pa __read_mostly; #endif /* * pool that pmap structures are allocated from */ struct pool_cache pmap_cache; static int pmap_ctor(void *, void *, int); static void pmap_dtor(void *, void *); /* * pv_page cache */ static struct pool_cache pmap_pvp_cache; #ifdef __HAVE_DIRECT_MAP vaddr_t pmap_direct_base __read_mostly; vaddr_t pmap_direct_end __read_mostly; #endif #ifndef __HAVE_DIRECT_MAP /* * Special VAs and the PTEs that map them */ static pt_entry_t *early_zero_pte; static void pmap_vpage_cpualloc(struct cpu_info *); #ifdef XENPV char *early_zerop; /* also referenced from xen_locore() */ #else static char *early_zerop; #endif #endif int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int); /* PDP pool and its callbacks */ static struct pool pmap_pdp_pool; static void pmap_pdp_init(pd_entry_t *); static void pmap_pdp_fini(pd_entry_t *); #ifdef PAE /* need to allocate items of 4 pages */ static void *pmap_pdp_alloc(struct pool *, int); static void pmap_pdp_free(struct pool *, void *); static struct pool_allocator pmap_pdp_allocator = { .pa_alloc = pmap_pdp_alloc, .pa_free = pmap_pdp_free, .pa_pagesz = PAGE_SIZE * PDP_SIZE, }; #endif extern vaddr_t idt_vaddr; extern paddr_t idt_paddr; extern vaddr_t gdt_vaddr; extern paddr_t gdt_paddr; extern vaddr_t ldt_vaddr; extern paddr_t ldt_paddr; #ifdef i386 /* stuff to fix the pentium f00f bug */ extern vaddr_t pentium_idt_vaddr; #endif /* Array of freshly allocated PTPs, for pmap_get_ptp(). */ struct pmap_ptparray { struct vm_page *pg[PTP_LEVELS + 1]; bool alloced[PTP_LEVELS + 1]; }; /* * PV entries are allocated in page-sized chunks and cached per-pmap to * avoid intense pressure on memory allocators. 
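 *
 * Layout sketch: the first pv_entry-sized slot of each page doubles as
 * the struct pv_page header (the constructor asserts that it fits), and
 * the remaining PVE_PER_PVP = PAGE_SIZE / sizeof(struct pv_entry) - 1
 * slots are free entries.  Purely as a hypothetical example, with
 * 4096-byte pages and 64-byte pv_entry structures that would be
 * 4096/64 - 1 = 63 entries per page.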
*/ struct pv_page { LIST_HEAD(, pv_entry) pvp_pves; LIST_ENTRY(pv_page) pvp_list; long pvp_nfree; struct pmap *pvp_pmap; }; #define PVE_PER_PVP ((PAGE_SIZE / sizeof(struct pv_entry)) - 1) /* * PV tree prototypes */ static int pmap_compare_key(void *, const void *, const void *); static int pmap_compare_nodes(void *, const void *, const void *); /* Read-black tree */ static const rb_tree_ops_t pmap_rbtree_ops = { .rbto_compare_nodes = pmap_compare_nodes, .rbto_compare_key = pmap_compare_key, .rbto_node_offset = offsetof(struct pv_entry, pve_rb), .rbto_context = NULL }; /* * Local prototypes */ #ifdef __HAVE_PCPU_AREA static void pmap_init_pcpu(void); #endif #ifdef __HAVE_DIRECT_MAP static void pmap_init_directmap(struct pmap *); #endif #if !defined(XENPV) static void pmap_remap_global(void); #endif #ifndef XENPV static void pmap_init_lapic(void); static void pmap_remap_largepages(void); #endif static int pmap_get_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t, int, struct vm_page **); static void pmap_unget_ptp(struct pmap *, struct pmap_ptparray *); static void pmap_install_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t, pd_entry_t * const *); static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, int); static void pmap_freepage(struct pmap *, struct vm_page *, int); static void pmap_free_ptp(struct pmap *, struct vm_page *, vaddr_t, pt_entry_t *, pd_entry_t * const *); static bool pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *, vaddr_t); static void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, vaddr_t, vaddr_t); static int pmap_pvp_ctor(void *, void *, int); static void pmap_pvp_dtor(void *, void *); static struct pv_entry *pmap_alloc_pv(struct pmap *); static void pmap_free_pv(struct pmap *, struct pv_entry *); static void pmap_drain_pv(struct pmap *); static void pmap_alloc_level(struct pmap *, vaddr_t, long *); static void pmap_load1(struct lwp *, struct pmap *, struct pmap *); static void pmap_reactivate(struct pmap *); long pmap_resident_count(struct pmap *pmap) { return pmap->pm_stats.resident_count; } long pmap_wired_count(struct pmap *pmap) { return pmap->pm_stats.wired_count; } /* * p m a p h e l p e r f u n c t i o n s */ static inline void pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff) { KASSERT(cold || mutex_owned(&pmap->pm_lock)); pmap->pm_stats.resident_count += resid_diff; pmap->pm_stats.wired_count += wired_diff; } static inline void pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte) { int resid_diff = ((npte & PTE_P) ? 1 : 0) - ((opte & PTE_P) ? 1 : 0); int wired_diff = ((npte & PTE_WIRED) ? 1 : 0) - ((opte & PTE_WIRED) ? 1 : 0); KASSERT((npte & (PTE_P | PTE_WIRED)) != PTE_WIRED); KASSERT((opte & (PTE_P | PTE_WIRED)) != PTE_WIRED); pmap_stats_update(pmap, resid_diff, wired_diff); } /* * ptp_to_pmap: lookup pmap by ptp */ static inline struct pmap * ptp_to_pmap(struct vm_page *ptp) { struct pmap *pmap; if (ptp == NULL) { return pmap_kernel(); } pmap = (struct pmap *)ptp->uobject; KASSERT(pmap != NULL); KASSERT(&pmap->pm_obj[0] == ptp->uobject); return pmap; } static inline struct pv_pte * pve_to_pvpte(struct pv_entry *pve) { if (pve == NULL) return NULL; KASSERT((void *)&pve->pve_pte == (void *)pve); return &pve->pve_pte; } static inline struct pv_entry * pvpte_to_pve(struct pv_pte *pvpte) { struct pv_entry *pve = (void *)pvpte; KASSERT(pve_to_pvpte(pve) == pvpte); return pve; } /* * Return true if the pmap page has an embedded PV entry. 
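 *
 * The embedded entry is the (pte_ptp, pte_va) pair stored directly in
 * struct pmap_page; any further mappings hang off pp_pvlist as dynamic
 * pv_entries.  For orientation, a walk over every mapping of a tracked
 * page (the same pattern pmap_check_pv() uses further down; "pp" is the
 * page in question) looks like:
 *
 *	mutex_spin_enter(&pp->pp_lock);
 *	for (pvpte = pv_pte_first(pp); pvpte != NULL;
 *	     pvpte = pv_pte_next(pp, pvpte)) {
 *		... look at pvpte->pte_ptp and pvpte->pte_va ...
 *	}
 *	mutex_spin_exit(&pp->pp_lock);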
*/ static inline bool pv_pte_embedded(struct pmap_page *pp) { KASSERT(mutex_owned(&pp->pp_lock)); return (bool)((vaddr_t)pp->pp_pte.pte_ptp | pp->pp_pte.pte_va); } /* * pv_pte_first, pv_pte_next: PV list iterator. */ static inline struct pv_pte * pv_pte_first(struct pmap_page *pp) { KASSERT(mutex_owned(&pp->pp_lock)); if (pv_pte_embedded(pp)) { return &pp->pp_pte; } return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist)); } static inline struct pv_pte * pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte) { KASSERT(mutex_owned(&pp->pp_lock)); KASSERT(pvpte != NULL); if (pvpte == &pp->pp_pte) { return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist)); } return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list)); } static inline uint8_t pmap_pte_to_pp_attrs(pt_entry_t pte) { uint8_t ret = 0; if (pte & PTE_D) ret |= PP_ATTRS_D; if (pte & PTE_A) ret |= PP_ATTRS_A; if (pte & PTE_W) ret |= PP_ATTRS_W; return ret; } static inline pt_entry_t pmap_pp_attrs_to_pte(uint8_t attrs) { pt_entry_t pte = 0; if (attrs & PP_ATTRS_D) pte |= PTE_D; if (attrs & PP_ATTRS_A) pte |= PTE_A; if (attrs & PP_ATTRS_W) pte |= PTE_W; return pte; } /* * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]? * of course the kernel is always loaded */ bool pmap_is_curpmap(struct pmap *pmap) { return ((pmap == pmap_kernel()) || (pmap == curcpu()->ci_pmap)); } inline void pmap_reference(struct pmap *pmap) { atomic_inc_uint(&pmap->pm_obj[0].uo_refs); } /* * rbtree: compare two nodes. */ static int pmap_compare_nodes(void *context, const void *n1, const void *n2) { const struct pv_entry *pve1 = n1; const struct pv_entry *pve2 = n2; KASSERT(pve1->pve_pte.pte_ptp == pve2->pve_pte.pte_ptp); if (pve1->pve_pte.pte_va < pve2->pve_pte.pte_va) { return -1; } if (pve1->pve_pte.pte_va > pve2->pve_pte.pte_va) { return 1; } return 0; } /* * rbtree: compare a node and a key. */ static int pmap_compare_key(void *context, const void *n, const void *k) { const struct pv_entry *pve = n; const vaddr_t key = (vaddr_t)k; if (pve->pve_pte.pte_va < key) { return -1; } if (pve->pve_pte.pte_va > key) { return 1; } return 0; } /* * pmap_ptp_range_set: abuse ptp->uanon to record minimum VA of PTE */ static inline void pmap_ptp_range_set(struct vm_page *ptp, vaddr_t va) { vaddr_t *min = (vaddr_t *)&ptp->uanon; if (va < *min) { *min = va; } } /* * pmap_ptp_range_clip: abuse ptp->uanon to clip range of PTEs to remove */ static inline void pmap_ptp_range_clip(struct vm_page *ptp, vaddr_t *startva, pt_entry_t **pte) { vaddr_t sclip; if (ptp == NULL) { return; } sclip = (vaddr_t)ptp->uanon; sclip = (*startva < sclip ? sclip : *startva); *pte += (sclip - *startva) / PAGE_SIZE; *startva = sclip; } /* * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in * * there are several pmaps involved. some or all of them might be same. * * - the pmap given by the first argument * our caller wants to access this pmap's PTEs. * * - pmap_kernel() * the kernel pmap. note that it only contains the kernel part * of the address space which is shared by any pmap. ie. any * pmap can be used instead of pmap_kernel() for our purpose. * * - ci->ci_pmap * pmap currently loaded on the cpu. * * - vm_map_pmap(&curproc->p_vmspace->vm_map) * current process' pmap. 
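 *
 * Illustrative call pattern (a sketch; "pmap2", "ptes" and "pdes" are
 * just local variable names), matching the requirements listed below:
 *
 *	mutex_enter(&pmap->pm_lock);
 *	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
 *	... read or modify ptes[pl1_i(va)] ...
 *	pmap_unmap_ptes(pmap, pmap2);
 *	mutex_exit(&pmap->pm_lock);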
* * => caller must lock pmap first (if not the kernel pmap) * => must be undone with pmap_unmap_ptes before returning * => disables kernel preemption */ void pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, pd_entry_t **ptepp, pd_entry_t * const **pdeppp) { struct pmap *curpmap; struct cpu_info *ci; lwp_t *l; kpreempt_disable(); /* The kernel's pmap is always accessible. */ if (pmap == pmap_kernel()) { *pmap2 = NULL; *ptepp = PTE_BASE; *pdeppp = normal_pdes; return; } KASSERT(mutex_owned(&pmap->pm_lock)); l = curlwp; ci = l->l_cpu; curpmap = ci->ci_pmap; if (pmap == curpmap) { /* * Already on the CPU: make it valid. This is very * often the case during exit(), when we have switched * to the kernel pmap in order to destroy a user pmap. */ if (__predict_false(ci->ci_tlbstate != TLBSTATE_VALID)) { pmap_reactivate(pmap); } *pmap2 = NULL; } else { /* * Toss current pmap from CPU and install new pmap, but keep * a reference to the old one. Dropping the reference can * can block as it needs to take locks, so defer that to * pmap_unmap_ptes(). */ pmap_reference(pmap); pmap_load1(l, pmap, curpmap); *pmap2 = curpmap; } KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); #ifdef DIAGNOSTIC pmap->pm_pctr = lwp_pctr(); #endif *ptepp = PTE_BASE; #if defined(XENPV) && defined(__x86_64__) KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE); ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir; *pdeppp = ci->ci_normal_pdes; #else *pdeppp = normal_pdes; #endif } /* * pmap_unmap_ptes: unlock the PTE mapping of "pmap" * * => we cannot tolerate context switches while mapped in: assert this. * => reenables kernel preemption. * => does not unlock pmap. */ void pmap_unmap_ptes(struct pmap *pmap, struct pmap * pmap2) { struct cpu_info *ci; struct pmap *mypmap; struct lwp *l; KASSERT(kpreempt_disabled()); /* The kernel's pmap is always accessible. */ if (pmap == pmap_kernel()) { kpreempt_enable(); return; } l = curlwp; ci = l->l_cpu; KASSERT(mutex_owned(&pmap->pm_lock)); KASSERT(pmap->pm_pctr == lwp_pctr()); #if defined(XENPV) && defined(__x86_64__) KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE); ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE; #endif /* If not our own pmap, mark whatever's on the CPU now as lazy. */ KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); mypmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); if (ci->ci_pmap == vm_map_pmap(&l->l_proc->p_vmspace->vm_map)) { ci->ci_want_pmapload = 0; } else { ci->ci_want_pmapload = (mypmap != pmap_kernel()); ci->ci_tlbstate = TLBSTATE_LAZY; } /* Now safe to re-enable preemption. */ kpreempt_enable(); /* Toss reference to other pmap taken earlier. */ if (pmap2 != NULL) { pmap_destroy(pmap2); } } inline static void pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte) { #if !defined(__x86_64__) if (curproc == NULL || curproc->p_vmspace == NULL || pm != vm_map_pmap(&curproc->p_vmspace->vm_map)) return; if ((opte ^ npte) & PTE_X) pmap_update_pg(va); /* * Executability was removed on the last executable change. * Reset the code segment to something conservative and * let the trap handler deal with setting the right limit. * We can't do that because of locking constraints on the vm map. */ if ((opte & PTE_X) && (npte & PTE_X) == 0 && va == pm->pm_hiexec) { struct trapframe *tf = curlwp->l_md.md_regs; tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); pm->pm_hiexec = I386_MAX_EXE_ADDR; } #endif /* !defined(__x86_64__) */ } #if !defined(__x86_64__) /* * Fixup the code segment to cover all potential executable mappings. 
* returns 0 if no changes to the code segment were made. */ int pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb) { struct vm_map_entry *ent; struct pmap *pm = vm_map_pmap(map); vaddr_t va = 0; vm_map_lock_read(map); for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) { /* * This entry has greater va than the entries before. * We need to make it point to the last page, not past it. */ if (ent->protection & VM_PROT_EXECUTE) va = trunc_page(ent->end) - PAGE_SIZE; } vm_map_unlock_read(map); if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL)) return 0; pm->pm_hiexec = va; if (pm->pm_hiexec > I386_MAX_EXE_ADDR) { tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL); } else { tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); return 0; } return 1; } #endif /* !defined(__x86_64__) */ void pat_init(struct cpu_info *ci) { #ifndef XENPV uint64_t pat; if (!(ci->ci_feat_val[0] & CPUID_PAT)) return; /* We change WT to WC. Leave all other entries the default values. */ pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) | PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) | PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) | PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC); wrmsr(MSR_CR_PAT, pat); cpu_pat_enabled = true; #endif } static pt_entry_t pmap_pat_flags(u_int flags) { u_int cacheflags = (flags & PMAP_CACHE_MASK); if (!cpu_pat_enabled) { switch (cacheflags) { case PMAP_NOCACHE: case PMAP_NOCACHE_OVR: /* results in PGC_UCMINUS on cpus which have * the cpuid PAT but PAT "disabled" */ return PTE_PCD; default: return 0; } } switch (cacheflags) { case PMAP_NOCACHE: return PGC_UC; case PMAP_WRITE_COMBINE: return PGC_WC; case PMAP_WRITE_BACK: return PGC_WB; case PMAP_NOCACHE_OVR: return PGC_UCMINUS; } return 0; } /* * p m a p k e n t e r f u n c t i o n s * * functions to quickly enter/remove pages from the kernel address * space. pmap_kremove is exported to MI kernel. we make use of * the recursive PTE mappings. */ /* * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking * * => no need to lock anything, assume va is already allocated * => should be faster than normal pmap enter function */ void pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags) { pt_entry_t *pte, opte, npte; KASSERT(!(prot & ~VM_PROT_ALL)); if (va < VM_MIN_KERNEL_ADDRESS) pte = vtopte(va); else pte = kvtopte(va); #if defined(XENPV) && defined(DOM0OPS) if (pa < pmap_pa_start || pa >= pmap_pa_end) { #ifdef DEBUG printf_nolog("%s: pa %#" PRIxPADDR " for va %#" PRIxVADDR " outside range\n", __func__, pa, va); #endif /* DEBUG */ npte = pa; } else #endif /* XENPV && DOM0OPS */ npte = pmap_pa2pte(pa); npte |= protection_codes[prot] | PTE_P | pmap_pg_g; npte |= pmap_pat_flags(flags); opte = pmap_pte_testset(pte, npte); /* zap! */ /* * XXX: make sure we are not dealing with a large page, since the only * large pages created are for the kernel image, and they should never * be kentered. */ KASSERTMSG(!(opte & PTE_PS), "PTE_PS va=%#"PRIxVADDR, va); if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A)) { /* This should not happen. */ printf_nolog("%s: mapping already present\n", __func__); kpreempt_disable(); pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER); kpreempt_enable(); } } __strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa); #if defined(__x86_64__) /* * Change protection for a virtual address. Local for a CPU only, don't * care about TLB shootdowns. 
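 *
 * Illustrative use (a sketch only; the address and protections are
 * hypothetical): temporarily make a kernel page writable on this CPU,
 * modify it, then restore it read-only:
 *
 *	kpreempt_disable();
 *	pmap_changeprot_local(va, VM_PROT_READ | VM_PROT_WRITE);
 *	... patch the page at va ...
 *	pmap_changeprot_local(va, VM_PROT_READ);
 *	kpreempt_enable();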
* * => must be called with preemption disabled */ void pmap_changeprot_local(vaddr_t va, vm_prot_t prot) { pt_entry_t *pte, opte, npte; KASSERT(kpreempt_disabled()); if (va < VM_MIN_KERNEL_ADDRESS) pte = vtopte(va); else pte = kvtopte(va); npte = opte = *pte; if ((prot & VM_PROT_WRITE) != 0) npte |= PTE_W; else npte &= ~(PTE_W|PTE_D); if (opte != npte) { pmap_pte_set(pte, npte); pmap_pte_flush(); invlpg(va); } } #endif /* defined(__x86_64__) */ /* * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking * * => no need to lock anything * => caller must dispose of any vm_page mapped in the va range * => note: not an inline function * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE * => we assume kernel only unmaps valid addresses and thus don't bother * checking the valid bit before doing TLB flushing * => must be followed by call to pmap_update() before reuse of page */ static void pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly) { pt_entry_t *pte, opte; vaddr_t va, eva; eva = sva + len; kpreempt_disable(); for (va = sva; va < eva; va += PAGE_SIZE) { pte = kvtopte(va); opte = pmap_pte_testset(pte, 0); /* zap! */ if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A) && !localonly) { pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KREMOVE); } KASSERTMSG((opte & PTE_PS) == 0, "va %#" PRIxVADDR " is a large page", va); KASSERTMSG((opte & PTE_PVLIST) == 0, "va %#" PRIxVADDR " is a pv tracked page", va); } if (localonly) { tlbflushg(); } kpreempt_enable(); } void pmap_kremove(vaddr_t sva, vsize_t len) { pmap_kremove1(sva, len, false); } /* * pmap_kremove_local: like pmap_kremove(), but only worry about * TLB invalidations on the current CPU. this is only intended * for use while writing kernel crash dumps, either after panic * or via reboot -d. */ void pmap_kremove_local(vaddr_t sva, vsize_t len) { pmap_kremove1(sva, len, true); } /* * p m a p i n i t f u n c t i o n s * * pmap_bootstrap and pmap_init are called during system startup * to init the pmap module. pmap_bootstrap() does a low level * init just to get things rolling. pmap_init() finishes the job. */ /* * pmap_bootstrap_valloc: allocate a virtual address in the bootstrap area. * This function is to be used before any VM system has been set up. * * The va is taken from virtual_avail. */ static vaddr_t pmap_bootstrap_valloc(size_t npages) { vaddr_t va = virtual_avail; virtual_avail += npages * PAGE_SIZE; return va; } /* * pmap_bootstrap_palloc: allocate a physical address in the bootstrap area. * This function is to be used before any VM system has been set up. * * The pa is taken from avail_start. */ static paddr_t pmap_bootstrap_palloc(size_t npages) { paddr_t pa = avail_start; avail_start += npages * PAGE_SIZE; return pa; } /* * pmap_bootstrap: get the system in a state where it can run with VM properly * enabled (called before main()). The VM system is fully init'd later. * * => on i386, locore.S has already enabled the MMU by allocating a PDP for the * kernel, and nkpde PTP's for the kernel. * => kva_start is the first free virtual address in kernel space. */ void pmap_bootstrap(vaddr_t kva_start) { struct pmap *kpm; int i; vaddr_t kva; pmap_pg_nx = (cpu_feature[2] & CPUID_NOX ? PTE_NX : 0); /* * Set up our local static global vars that keep track of the usage of * KVM before kernel_map is set up. 
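 *
 * (For illustration: later in this function each early structure takes
 * one page of KVA and one physical page from these counters, e.g. the
 * IDT below does
 *
 *	idt_vaddr = pmap_bootstrap_valloc(1);
 *	idt_paddr = pmap_bootstrap_palloc(1);
 *
 * and the two are associated later by the machdep code that uses them.)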
*/ virtual_avail = kva_start; /* first free KVA */ virtual_end = VM_MAX_KERNEL_ADDRESS; /* last KVA */ /* * Set up protection_codes: we need to be able to convert from a MI * protection code (some combo of VM_PROT...) to something we can jam * into a x86 PTE. */ protection_codes[VM_PROT_NONE] = pmap_pg_nx; protection_codes[VM_PROT_EXECUTE] = PTE_X; protection_codes[VM_PROT_READ] = pmap_pg_nx; protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PTE_X; protection_codes[VM_PROT_WRITE] = PTE_W | pmap_pg_nx; protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PTE_W | PTE_X; protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PTE_W | pmap_pg_nx; protection_codes[VM_PROT_ALL] = PTE_W | PTE_X; /* * Now we init the kernel's pmap. * * The kernel pmap's pm_obj is not used for much. However, in user pmaps * the pm_obj contains the list of active PTPs. */ kpm = pmap_kernel(); mutex_init(&kpm->pm_lock, MUTEX_DEFAULT, IPL_NONE); rw_init(&kpm->pm_dummy_lock); for (i = 0; i < PTP_LEVELS - 1; i++) { uvm_obj_init(&kpm->pm_obj[i], &pmap_pager, false, 1); uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_dummy_lock); kpm->pm_ptphint[i] = NULL; } memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */ kpm->pm_pdir = (pd_entry_t *)bootspace.pdir; for (i = 0; i < PDP_SIZE; i++) kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i; kpm->pm_stats.wired_count = kpm->pm_stats.resident_count = x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS); kcpuset_create(&kpm->pm_cpus, true); kcpuset_create(&kpm->pm_kernel_cpus, true); kpm->pm_ldt = NULL; kpm->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); /* * the above is just a rough estimate and not critical to the proper * operation of the system. */ #if !defined(XENPV) /* * Begin to enable global TLB entries if they are supported: add PTE_G * attribute to already mapped kernel pages. Do that only if SVS is * disabled. * * The G bit has no effect until the CR4_PGE bit is set in CR4, which * happens later in cpu_init(). */ #ifdef SVS if (!svs_enabled && (cpu_feature[0] & CPUID_PGE)) { #else if (cpu_feature[0] & CPUID_PGE) { #endif pmap_pg_g = PTE_G; pmap_remap_global(); } #endif #ifndef XENPV /* * Enable large pages if they are supported. */ if (cpu_feature[0] & CPUID_PSE) { lcr4(rcr4() | CR4_PSE); /* enable hardware (via %cr4) */ pmap_largepages = 1; /* enable software */ /* * The TLB must be flushed after enabling large pages on Pentium * CPUs, according to section 3.6.2.2 of "Intel Architecture * Software Developer's Manual, Volume 3: System Programming". */ tlbflushg(); /* Remap the kernel. */ pmap_remap_largepages(); } pmap_init_lapic(); #endif /* !XENPV */ #ifdef __HAVE_PCPU_AREA pmap_init_pcpu(); #endif #ifdef __HAVE_DIRECT_MAP pmap_init_directmap(kpm); #else pmap_vpage_cpualloc(&cpu_info_primary); if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { /* i386 */ early_zerop = (void *)cpu_info_primary.vpage[VPAGE_ZER]; early_zero_pte = cpu_info_primary.vpage_pte[VPAGE_ZER]; } else { /* amd64 */ /* * zero_pte is stuck at the end of mapped space for the kernel * image (disjunct from kva space). This is done so that it * can safely be used in pmap_growkernel (pmap_get_physpage), * when it's called for the first time. * XXXfvdl fix this for MULTIPROCESSOR later. 
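 *
 * Sketch of how such a zeroing window is used (simplified; the
 * authoritative code is pmap_get_physpage(), and under XENPV the PTE
 * update goes through the usual Xen update path rather than a direct
 * store):
 *
 *	*early_zero_pte = pmap_pa2pte(pa) | PTE_P | PTE_W | pmap_pg_nx;
 *	pmap_update_pg((vaddr_t)early_zerop);
 *	memset(early_zerop, 0, PAGE_SIZE);
 *	*early_zero_pte = 0;
 *	pmap_update_pg((vaddr_t)early_zerop);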
*/ #ifdef XENPV /* early_zerop initialized in xen_locore() */ #else early_zerop = (void *)bootspace.spareva; #endif early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop); } #endif #if defined(XENPV) && defined(__x86_64__) extern vaddr_t xen_dummy_page; paddr_t xen_dummy_user_pgd; /* * We want a dummy page directory for Xen: when deactivating a pmap, * Xen will still consider it active. So we set user PGD to this one * to lift all protection on the now inactive page tables set. */ xen_dummy_user_pgd = xen_dummy_page - KERNBASE; /* Zero fill it, the less checks in Xen it requires the better */ memset(PAGE_ALIGNED(xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE); /* Mark read-only */ HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE, pmap_pa2pte(xen_dummy_user_pgd) | PTE_P | pmap_pg_nx, UVMF_INVLPG); /* Pin as L4 */ xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd)); #endif /* * Allocate space for the Interrupt Descriptor Table (IDT), * Global Descriptor Table (GDT), and Local Descriptor Table * (LDT). * * Currently there is an initial temporary GDT allocated on the * stack by the caller of init386/init_x86_64, which is (among * other things) needed on i386 for %fs-relative addressing for * CPU-local data (CPUVAR(...), curcpu(), curlwp). This * initial temporary GDT will be popped off the stack before we * can enter main, so we need to make sure there is space for a * second temporary GDT to continue existing when we enter main * before we allocate space for the permanent GDT with * uvm_km(9) in gdt_init via cpu_startup and switch to that. */ idt_vaddr = pmap_bootstrap_valloc(1); idt_paddr = pmap_bootstrap_palloc(1); gdt_vaddr = pmap_bootstrap_valloc(1); gdt_paddr = pmap_bootstrap_palloc(1); #ifdef __HAVE_PCPU_AREA ldt_vaddr = (vaddr_t)&pcpuarea->ldt; #else ldt_vaddr = pmap_bootstrap_valloc(1); #endif ldt_paddr = pmap_bootstrap_palloc(1); #if !defined(__x86_64__) /* pentium f00f bug stuff */ pentium_idt_vaddr = pmap_bootstrap_valloc(1); #endif #if defined(XENPVHVM) /* XXX: move to hypervisor.c with appropriate API adjustments */ extern paddr_t HYPERVISOR_shared_info_pa; extern volatile struct xencons_interface *xencons_interface; /* XXX */ extern struct xenstore_domain_interface *xenstore_interface; /* XXX */ if (vm_guest != VM_GUEST_XENPVH) { HYPERVISOR_shared_info = (void *) pmap_bootstrap_valloc(1); HYPERVISOR_shared_info_pa = pmap_bootstrap_palloc(1); } xencons_interface = (void *) pmap_bootstrap_valloc(1); xenstore_interface = (void *) pmap_bootstrap_valloc(1); #endif /* * Now we reserve some VM for mapping pages when doing a crash dump. */ virtual_avail = reserve_dumppages(virtual_avail); /* * Init the global lock and global list. */ mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE); LIST_INIT(&pmaps); /* * Ensure the TLB is sync'd with reality by flushing it... */ tlbflushg(); /* * Calculate pmap_maxkvaddr from nkptp[]. */ kva = VM_MIN_KERNEL_ADDRESS; for (i = PTP_LEVELS - 1; i >= 1; i--) { kva += nkptp[i] * nbpd[i]; } pmap_maxkvaddr = kva; } #ifndef XENPV static void pmap_init_lapic(void) { /* * On CPUs that have no LAPIC, local_apic_va is never kentered. But our * x86 implementation relies a lot on this address to be valid; so just * allocate a fake physical page that will be kentered into * local_apic_va by machdep. * * If the LAPIC is present, the va will be remapped somewhere else * later in lapic_map. 
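 *
 * What that kenter amounts to, as a sketch only (the real call sites are
 * in machdep/lapic code and the flags shown here are an assumption):
 *
 *	pmap_kenter_pa(local_apic_va, local_apic_pa,
 *	    VM_PROT_READ | VM_PROT_WRITE, PMAP_NOCACHE);
 *	pmap_update(pmap_kernel());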
*/ local_apic_va = pmap_bootstrap_valloc(1); local_apic_pa = pmap_bootstrap_palloc(1); } #endif #ifdef __x86_64__ static size_t pmap_pagetree_nentries_range(vaddr_t startva, vaddr_t endva, size_t pgsz) { size_t npages; npages = (roundup(endva, pgsz) / pgsz) - (rounddown(startva, pgsz) / pgsz); return npages; } #endif #if defined(__HAVE_DIRECT_MAP) || defined(KASAN) || defined(KMSAN) static inline void slotspace_copy(int type, pd_entry_t *dst, pd_entry_t *src) { size_t sslot = slotspace.area[type].sslot; size_t nslot = slotspace.area[type].nslot; memcpy(&dst[sslot], &src[sslot], nslot * sizeof(pd_entry_t)); } #endif #ifdef __x86_64__ /* * Randomize the location of an area. We count the holes in the VM space. We * randomly select one hole, and then randomly select an area within that hole. * Finally we update the associated entry in the slotspace structure. */ vaddr_t slotspace_rand(int type, size_t sz, size_t align, size_t randhole, vaddr_t randva) { struct { int start; int end; } holes[SLSPACE_NAREAS+1]; size_t i, nholes, hole; size_t startsl, endsl, nslots, winsize; vaddr_t startva, va; sz = roundup(sz, align); /* * Take one more slot with +NBPD_L4, because we may end up choosing * an area that crosses slots: * +------+------+------+ * | Slot | Slot | Slot | * +------+------+------+ * [Chosen Area] * And in that case we must take into account the additional slot * consumed. */ nslots = roundup(sz+NBPD_L4, NBPD_L4) / NBPD_L4; /* Get the holes. */ nholes = 0; size_t curslot = 0 + 256; /* end of SLAREA_USER */ while (1) { /* * Find the first occupied slot after the current one. * The area between the two is a hole. */ size_t minsslot = 512; size_t minnslot = 0; for (i = 0; i < SLSPACE_NAREAS; i++) { if (!slotspace.area[i].active) continue; if (slotspace.area[i].sslot >= curslot && slotspace.area[i].sslot < minsslot) { minsslot = slotspace.area[i].sslot; minnslot = slotspace.area[i].nslot; } } /* No hole anymore, stop here. */ if (minsslot == 512) { break; } /* Register the hole. */ if (minsslot - curslot >= nslots) { holes[nholes].start = curslot; holes[nholes].end = minsslot; nholes++; } /* Skip that hole, and iterate again. */ curslot = minsslot + minnslot; } if (nholes == 0) { panic("%s: impossible", __func__); } /* Select a hole. */ hole = randhole; #ifdef NO_X86_ASLR hole = 0; #endif hole %= nholes; startsl = holes[hole].start; endsl = holes[hole].end; startva = VA_SIGN_NEG(startsl * NBPD_L4); /* Select an area within the hole. */ va = randva; #ifdef NO_X86_ASLR va = 0; #endif winsize = ((endsl - startsl) * NBPD_L4) - sz; va %= winsize; va = rounddown(va, align); va += startva; /* Update the entry. */ slotspace.area[type].sslot = pl4_i(va); slotspace.area[type].nslot = pmap_pagetree_nentries_range(va, va+sz, NBPD_L4); slotspace.area[type].active = true; return va; } #endif #ifdef __HAVE_PCPU_AREA static void pmap_init_pcpu(void) { const vaddr_t startva = PMAP_PCPU_BASE; size_t nL4e, nL3e, nL2e, nL1e; size_t L4e_idx, L3e_idx, L2e_idx, L1e_idx __diagused; paddr_t pa; vaddr_t endva; vaddr_t tmpva; pt_entry_t *pte; size_t size; int i; const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx; size = sizeof(struct pcpu_area); endva = startva + size; /* We will use this temporary va. 
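 *
 * (For the nL4e/nL3e/nL2e counts computed below,
 * pmap_pagetree_nentries_range() above just counts how many page-tree
 * slots a [startva, endva) span touches at a given granularity; e.g.
 * with a hypothetical pgsz of 0x1000, startva 0x1800 and endva 0x3800
 * it returns 0x4000/0x1000 - 0x1000/0x1000 = 3.)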
*/ tmpva = bootspace.spareva; pte = PTE_BASE + pl1_i(tmpva); /* Build L4 */ L4e_idx = pl4_i(startva); nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4); KASSERT(nL4e == 1); for (i = 0; i < nL4e; i++) { KASSERT(L4_BASE[L4e_idx+i] == 0); pa = pmap_bootstrap_palloc(1); *pte = (pa & PTE_FRAME) | pteflags; pmap_update_pg(tmpva); memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE); L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A; } /* Build L3 */ L3e_idx = pl3_i(startva); nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3); for (i = 0; i < nL3e; i++) { KASSERT(L3_BASE[L3e_idx+i] == 0); pa = pmap_bootstrap_palloc(1); *pte = (pa & PTE_FRAME) | pteflags; pmap_update_pg(tmpva); memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE); L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A; } /* Build L2 */ L2e_idx = pl2_i(startva); nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2); for (i = 0; i < nL2e; i++) { KASSERT(L2_BASE[L2e_idx+i] == 0); pa = pmap_bootstrap_palloc(1); *pte = (pa & PTE_FRAME) | pteflags; pmap_update_pg(tmpva); memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE); L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A; } /* Build L1 */ L1e_idx = pl1_i(startva); nL1e = pmap_pagetree_nentries_range(startva, endva, NBPD_L1); for (i = 0; i < nL1e; i++) { /* * Nothing to do, the PTEs will be entered via * pmap_kenter_pa. */ KASSERT(L1_BASE[L1e_idx+i] == 0); } *pte = 0; pmap_update_pg(tmpva); pcpuarea = (struct pcpu_area *)startva; tlbflush(); } #endif #ifdef __HAVE_DIRECT_MAP static void randomize_hole(size_t *randholep, vaddr_t *randvap) { struct nist_hash_drbg drbg; uint8_t seed[NIST_HASH_DRBG_SEEDLEN_BYTES]; const char p[] = "x86/directmap"; int error; entropy_extract(seed, sizeof(seed), 0); error = nist_hash_drbg_instantiate(&drbg, seed, sizeof(seed), /*nonce*/NULL, 0, /*personalization*/p, strlen(p)); KASSERTMSG(error == 0, "error=%d", error); error = nist_hash_drbg_generate(&drbg, randholep, sizeof(*randholep), /*additional*/NULL, 0); KASSERTMSG(error == 0, "error=%d", error); error = nist_hash_drbg_generate(&drbg, randvap, sizeof(*randvap), /*additional*/NULL, 0); KASSERTMSG(error == 0, "error=%d", error); explicit_memset(seed, 0, sizeof(seed)); explicit_memset(&drbg, 0, sizeof(drbg)); } /* * Create the amd64 direct map. Called only once at boot time. We map all of * the physical memory contiguously using 2MB large pages, with RW permissions. * However there is a hole: the kernel is mapped with RO permissions. */ static void pmap_init_directmap(struct pmap *kpm) { extern phys_ram_seg_t mem_clusters[]; extern int mem_cluster_cnt; vaddr_t startva; size_t nL4e, nL3e, nL2e; size_t L4e_idx, L3e_idx, L2e_idx; size_t spahole, epahole; paddr_t lastpa, pa; vaddr_t endva; vaddr_t tmpva; pt_entry_t *pte; phys_ram_seg_t *mc; int i; size_t randhole; vaddr_t randva; const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx; const pd_entry_t holepteflags = PTE_P | pmap_pg_nx; CTASSERT(NL4_SLOT_DIRECT * NBPD_L4 == MAXPHYSMEM); spahole = roundup(bootspace.head.pa, NBPD_L2); epahole = rounddown(bootspace.boot.pa, NBPD_L2); /* Get the last physical address available */ lastpa = 0; for (i = 0; i < mem_cluster_cnt; i++) { mc = &mem_clusters[i]; lastpa = MAX(lastpa, mc->start + mc->size); } /* * x86_add_cluster should have truncated the memory to MAXPHYSMEM. */ if (lastpa > MAXPHYSMEM) { panic("pmap_init_directmap: lastpa incorrect"); } randomize_hole(&randhole, &randva); startva = slotspace_rand(SLAREA_DMAP, lastpa, NBPD_L2, randhole, randva); endva = startva + lastpa; /* We will use this temporary va. 
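 *
 * The L4/L3 build loops below all follow the same temporary-va pattern
 * (restated here once for clarity): allocate a fresh page-table page,
 * map it at tmpva via the low-level PTE, zero it through that window,
 * then hook its physical address into the page tree:
 *
 *	pa = pmap_bootstrap_palloc(1);
 *	*pte = (pa & PTE_FRAME) | pteflags;
 *	pmap_update_pg(tmpva);
 *	memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
 *	L4_BASE[L4e_idx + i] = pa | pteflags | PTE_A;
 *
 * The L2 level differs: it maps physical memory directly with 2MB
 * (PTE_PS) entries instead of allocating backing pages.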
*/ tmpva = bootspace.spareva; pte = PTE_BASE + pl1_i(tmpva); /* Build L4 */ L4e_idx = pl4_i(startva); nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4); KASSERT(nL4e <= NL4_SLOT_DIRECT); for (i = 0; i < nL4e; i++) { KASSERT(L4_BASE[L4e_idx+i] == 0); pa = pmap_bootstrap_palloc(1); *pte = (pa & PTE_FRAME) | pteflags; pmap_update_pg(tmpva); memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE); L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A; } /* Build L3 */ L3e_idx = pl3_i(startva); nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3); for (i = 0; i < nL3e; i++) { KASSERT(L3_BASE[L3e_idx+i] == 0); pa = pmap_bootstrap_palloc(1); *pte = (pa & PTE_FRAME) | pteflags; pmap_update_pg(tmpva); memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE); L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A; } /* Build L2 */ L2e_idx = pl2_i(startva); nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2); for (i = 0; i < nL2e; i++) { KASSERT(L2_BASE[L2e_idx+i] == 0); pa = (paddr_t)(i * NBPD_L2); if (spahole <= pa && pa < epahole) { L2_BASE[L2e_idx+i] = pa | holepteflags | PTE_A | PTE_PS | pmap_pg_g; } else { L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A | PTE_PS | pmap_pg_g; } } *pte = 0; pmap_update_pg(tmpva); pmap_direct_base = startva; pmap_direct_end = endva; tlbflush(); } #endif /* __HAVE_DIRECT_MAP */ #if !defined(XENPV) /* * Remap all of the virtual pages created so far with the PTE_G bit. */ static void pmap_remap_global(void) { vaddr_t kva, kva_end; unsigned long p1i; size_t i; /* head */ kva = bootspace.head.va; kva_end = kva + bootspace.head.sz; for ( ; kva < kva_end; kva += PAGE_SIZE) { p1i = pl1_i(kva); if (pmap_valid_entry(PTE_BASE[p1i])) PTE_BASE[p1i] |= pmap_pg_g; } /* kernel segments */ for (i = 0; i < BTSPACE_NSEGS; i++) { if (bootspace.segs[i].type == BTSEG_NONE) { continue; } kva = bootspace.segs[i].va; kva_end = kva + bootspace.segs[i].sz; for ( ; kva < kva_end; kva += PAGE_SIZE) { p1i = pl1_i(kva); if (pmap_valid_entry(PTE_BASE[p1i])) PTE_BASE[p1i] |= pmap_pg_g; } } /* boot space */ kva = bootspace.boot.va; kva_end = kva + bootspace.boot.sz; for ( ; kva < kva_end; kva += PAGE_SIZE) { p1i = pl1_i(kva); if (pmap_valid_entry(PTE_BASE[p1i])) PTE_BASE[p1i] |= pmap_pg_g; } } #endif #ifndef XENPV /* * Remap several kernel segments with large pages. We cover as many pages as we * can. Called only once at boot time, if the CPU supports large pages. */ static void pmap_remap_largepages(void) { pd_entry_t *pde; vaddr_t kva, kva_end; paddr_t pa; size_t i; /* Remap the kernel text using large pages. */ for (i = 0; i < BTSPACE_NSEGS; i++) { if (bootspace.segs[i].type != BTSEG_TEXT) { continue; } kva = roundup(bootspace.segs[i].va, NBPD_L2); if (kva < bootspace.segs[i].va) { continue; } kva_end = rounddown(bootspace.segs[i].va + bootspace.segs[i].sz, NBPD_L2); pa = roundup(bootspace.segs[i].pa, NBPD_L2); for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) { pde = &L2_BASE[pl2_i(kva)]; *pde = pa | pmap_pg_g | PTE_PS | PTE_P; tlbflushg(); } } /* Remap the kernel rodata using large pages. */ for (i = 0; i < BTSPACE_NSEGS; i++) { if (bootspace.segs[i].type != BTSEG_RODATA) { continue; } kva = roundup(bootspace.segs[i].va, NBPD_L2); if (kva < bootspace.segs[i].va) { continue; } kva_end = rounddown(bootspace.segs[i].va + bootspace.segs[i].sz, NBPD_L2); pa = roundup(bootspace.segs[i].pa, NBPD_L2); for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) { pde = &L2_BASE[pl2_i(kva)]; *pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_P; tlbflushg(); } } /* Remap the kernel data+bss using large pages. 
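 *
 * As with text and rodata above, only the NBPD_L2-aligned middle of each
 * segment is promoted: for example, a segment starting 1MB before an
 * alignment boundary and ending 5MB after it gets two 2MB PDEs, while
 * its unaligned head and tail keep their 4KB mappings.  Data+bss is
 * additionally mapped writable (PTE_W) and, like rodata, non-executable.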
*/ for (i = 0; i < BTSPACE_NSEGS; i++) { if (bootspace.segs[i].type != BTSEG_DATA) { continue; } kva = roundup(bootspace.segs[i].va, NBPD_L2); if (kva < bootspace.segs[i].va) { continue; } kva_end = rounddown(bootspace.segs[i].va + bootspace.segs[i].sz, NBPD_L2); pa = roundup(bootspace.segs[i].pa, NBPD_L2); for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) { pde = &L2_BASE[pl2_i(kva)]; *pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_W | PTE_P; tlbflushg(); } } } #endif /* !XENPV */ /* * pmap_init: called from uvm_init, our job is to get the pmap system ready * to manage mappings. */ void pmap_init(void) { int flags; /* * initialize caches. */ pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), COHERENCY_UNIT, 0, 0, "pmappl", NULL, IPL_NONE, pmap_ctor, pmap_dtor, NULL); #ifdef XENPV /* * pool_cache(9) should not touch cached objects, since they * are pinned on xen and R/O for the domU */ flags = PR_NOTOUCH; #else flags = 0; #endif #ifdef PAE pool_init(&pmap_pdp_pool, PAGE_SIZE * PDP_SIZE, 0, 0, flags, "pdppl", &pmap_pdp_allocator, IPL_NONE); #else pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, 0, flags, "pdppl", NULL, IPL_NONE); #endif pool_cache_bootstrap(&pmap_pvp_cache, PAGE_SIZE, PAGE_SIZE, 0, 0, "pvpage", &pool_allocator_kmem, IPL_NONE, pmap_pvp_ctor, pmap_pvp_dtor, NULL); pmap_tlb_init(); /* XXX: Since cpu_hatch() is only for secondary CPUs. */ pmap_tlb_cpu_init(curcpu()); evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC, NULL, "x86", "io bitmap copy"); evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC, NULL, "x86", "ldt sync"); /* * The kernel doesn't keep track of PTPs, so there's nowhere handy * to hang a tree of pv_entry records. Dynamically allocated * pv_entry lists are not heavily used in the kernel's pmap (the * usual case is embedded), so cop out and use a single RB tree * to cover them. */ rb_tree_init(&pmap_kernel_rb, &pmap_rbtree_ops); /* * done: pmap module is up (and ready for business) */ pmap_initialized = true; } #ifndef XENPV /* * pmap_cpu_init_late: perform late per-CPU initialization. */ void pmap_cpu_init_late(struct cpu_info *ci) { /* * The BP has already its own PD page allocated during early * MD startup. */ if (ci == &cpu_info_primary) return; #ifdef PAE cpu_alloc_l3_page(ci); #endif } #endif #ifndef __HAVE_DIRECT_MAP CTASSERT(CACHE_LINE_SIZE > sizeof(pt_entry_t)); CTASSERT(CACHE_LINE_SIZE % sizeof(pt_entry_t) == 0); static void pmap_vpage_cpualloc(struct cpu_info *ci) { bool primary = (ci == &cpu_info_primary); size_t i, npages; vaddr_t vabase; vsize_t vrange; npages = (CACHE_LINE_SIZE / sizeof(pt_entry_t)); KASSERT(npages >= VPAGE_MAX); vrange = npages * PAGE_SIZE; if (primary) { while ((vabase = pmap_bootstrap_valloc(1)) % vrange != 0) { /* Waste some pages to align properly */ } /* The base is aligned, allocate the rest (contiguous) */ pmap_bootstrap_valloc(npages - 1); } else { vabase = uvm_km_alloc(kernel_map, vrange, vrange, UVM_KMF_VAONLY); if (vabase == 0) { panic("%s: failed to allocate tmp VA for CPU %d\n", __func__, cpu_index(ci)); } } KASSERT((vaddr_t)&PTE_BASE[pl1_i(vabase)] % CACHE_LINE_SIZE == 0); for (i = 0; i < VPAGE_MAX; i++) { ci->vpage[i] = vabase + i * PAGE_SIZE; ci->vpage_pte[i] = PTE_BASE + pl1_i(ci->vpage[i]); } } void pmap_vpage_cpu_init(struct cpu_info *ci) { if (ci == &cpu_info_primary) { /* cpu0 already taken care of in pmap_bootstrap */ return; } pmap_vpage_cpualloc(ci); } #endif /* * p v _ e n t r y f u n c t i o n s */ /* * pmap_pvp_dtor: pool_cache constructor for PV pages. 
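 *
 * That is, the constructor pmap_pvp_ctor() below treats the page as an
 * array of pv_entry-sized slots: slot 0 doubles as the struct pv_page
 * header, and slots 1..PVE_PER_PVP are threaded onto pvp_pves as free
 * entries.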
*/ static int pmap_pvp_ctor(void *arg, void *obj, int flags) { struct pv_page *pvp = (struct pv_page *)obj; struct pv_entry *pve = (struct pv_entry *)obj + 1; struct pv_entry *maxpve = pve + PVE_PER_PVP; KASSERT(sizeof(struct pv_page) <= sizeof(struct pv_entry)); KASSERT(trunc_page((vaddr_t)obj) == (vaddr_t)obj); LIST_INIT(&pvp->pvp_pves); pvp->pvp_nfree = PVE_PER_PVP; pvp->pvp_pmap = NULL; for (; pve < maxpve; pve++) { LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list); } return 0; } /* * pmap_pvp_dtor: pool_cache destructor for PV pages. */ static void pmap_pvp_dtor(void *arg, void *obj) { struct pv_page *pvp __diagused = obj; KASSERT(pvp->pvp_pmap == NULL); KASSERT(pvp->pvp_nfree == PVE_PER_PVP); } /* * pmap_alloc_pv: allocate a PV entry (likely cached with pmap). */ static struct pv_entry * pmap_alloc_pv(struct pmap *pmap) { struct pv_entry *pve; struct pv_page *pvp; KASSERT(mutex_owned(&pmap->pm_lock)); if (__predict_false((pvp = LIST_FIRST(&pmap->pm_pvp_part)) == NULL)) { if ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) { LIST_REMOVE(pvp, pvp_list); } else { pvp = pool_cache_get(&pmap_pvp_cache, PR_NOWAIT); } if (__predict_false(pvp == NULL)) { return NULL; } /* full -> part */ LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list); pvp->pvp_pmap = pmap; } KASSERT(pvp->pvp_pmap == pmap); KASSERT(pvp->pvp_nfree > 0); pve = LIST_FIRST(&pvp->pvp_pves); LIST_REMOVE(pve, pve_list); pvp->pvp_nfree--; if (__predict_false(pvp->pvp_nfree == 0)) { /* part -> empty */ KASSERT(LIST_EMPTY(&pvp->pvp_pves)); LIST_REMOVE(pvp, pvp_list); LIST_INSERT_HEAD(&pmap->pm_pvp_empty, pvp, pvp_list); } else { KASSERT(!LIST_EMPTY(&pvp->pvp_pves)); } return pve; } /* * pmap_free_pv: delayed free of a PV entry. */ static void pmap_free_pv(struct pmap *pmap, struct pv_entry *pve) { struct pv_page *pvp = (struct pv_page *)trunc_page((vaddr_t)pve); KASSERT(mutex_owned(&pmap->pm_lock)); KASSERT(pvp->pvp_pmap == pmap); KASSERT(pvp->pvp_nfree >= 0); LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list); pvp->pvp_nfree++; if (__predict_false(pvp->pvp_nfree == 1)) { /* empty -> part */ LIST_REMOVE(pvp, pvp_list); LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list); } else if (__predict_false(pvp->pvp_nfree == PVE_PER_PVP)) { /* part -> full */ LIST_REMOVE(pvp, pvp_list); LIST_INSERT_HEAD(&pmap->pm_pvp_full, pvp, pvp_list); } } /* * pmap_drain_pv: free full PV pages. 
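 *
 * For reference, the lists manipulated above move a pv_page between
 * three states as pvp_nfree changes:
 *
 *	pm_pvp_empty	pvp_nfree == 0
 *	pm_pvp_part	0 < pvp_nfree < PVE_PER_PVP
 *	pm_pvp_full	pvp_nfree == PVE_PER_PVP
 *
 * and only pages on pm_pvp_full are returned to pmap_pvp_cache here.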
*/ static void pmap_drain_pv(struct pmap *pmap) { struct pv_page *pvp; KASSERT(mutex_owned(&pmap->pm_lock)); while ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) { LIST_REMOVE(pvp, pvp_list); KASSERT(pvp->pvp_pmap == pmap); KASSERT(pvp->pvp_nfree == PVE_PER_PVP); pvp->pvp_pmap = NULL; pool_cache_put(&pmap_pvp_cache, pvp); } } /* * pmap_check_pv: verify {VA, PTP} pair is either tracked/untracked by page */ static void pmap_check_pv(struct pmap *pmap, struct vm_page *ptp, struct pmap_page *pp, vaddr_t va, bool tracked) { #ifdef DEBUG struct pv_pte *pvpte; PMAP_CHECK_PP(pp); mutex_spin_enter(&pp->pp_lock); for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { if (pvpte->pte_ptp == ptp && pvpte->pte_va == va) { break; } } mutex_spin_exit(&pp->pp_lock); if (pvpte && !tracked) { panic("pmap_check_pv: %p/%lx found on pp %p", ptp, va, pp); } else if (!pvpte && tracked) { panic("pmap_check_pv: %p/%lx missing on pp %p", ptp, va, pp); } #endif } /* * pmap_treelookup_pv: search the PV tree for a dynamic entry * * => pmap must be locked */ static struct pv_entry * pmap_treelookup_pv(const struct pmap *pmap, const struct vm_page *ptp, const rb_tree_t *tree, const vaddr_t va) { struct pv_entry *pve; rb_node_t *node; /* * Inlined lookup tailored for exactly what's needed here that is * quite a bit faster than using rb_tree_find_node(). */ for (node = tree->rbt_root;;) { if (__predict_false(RB_SENTINEL_P(node))) { return NULL; } pve = (struct pv_entry *) ((uintptr_t)node - offsetof(struct pv_entry, pve_rb)); if (pve->pve_pte.pte_va == va) { KASSERT(pve->pve_pte.pte_ptp == ptp); return pve; } node = node->rb_nodes[pve->pve_pte.pte_va < va]; } } /* * pmap_lookup_pv: look up a non-embedded pv entry for the given pmap * * => a PV entry must be known present (doesn't check for existence) * => pmap must be locked */ static struct pv_entry * pmap_lookup_pv(const struct pmap *pmap, const struct vm_page *ptp, const struct pmap_page * const old_pp, const vaddr_t va) { struct pv_entry *pve; const rb_tree_t *tree; KASSERT(mutex_owned(&pmap->pm_lock)); KASSERT(ptp != NULL || pmap == pmap_kernel()); /* * [This mostly deals with the case of process-private pages, i.e. * anonymous memory allocations or COW.] * * If the page is tracked with an embedded entry then the tree * lookup can be avoided. It's safe to check for this specific * set of values without pp_lock because both will only ever be * set together for this pmap. * */ if (atomic_load_relaxed(&old_pp->pp_pte.pte_ptp) == ptp && atomic_load_relaxed(&old_pp->pp_pte.pte_va) == va) { return NULL; } /* * [This mostly deals with shared mappings, for example shared libs * and executables.] * * Optimise for pmap_remove_ptes() which works by ascending scan: * look at the lowest numbered node in the tree first. The tree is * known non-empty because of the check above. For short lived * processes where pmap_remove() isn't used much this gets close to * a 100% hit rate. */ tree = (ptp != NULL ? &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb); KASSERT(!RB_SENTINEL_P(tree->rbt_root)); pve = (struct pv_entry *) ((uintptr_t)tree->rbt_minmax[RB_DIR_LEFT] - offsetof(struct pv_entry, pve_rb)); if (__predict_true(pve->pve_pte.pte_va == va)) { KASSERT(pve->pve_pte.pte_ptp == ptp); return pve; } /* Search the RB tree for the key (uncommon). 
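 *
 * (In effect this is the rbtree(3) lookup that the comment in
 * pmap_treelookup_pv() mentions, i.e. roughly
 *
 *	pve = rb_tree_find_node(__UNCONST(tree), (void *)va);
 *
 * with the compare callbacks open-coded for speed.)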
	 */
	return pmap_treelookup_pv(pmap, ptp, tree, va);
}

/*
 * pmap_enter_pv: enter a mapping onto a pmap_page list
 *
 * => pmap must be locked
 * => does NOT insert dynamic entries to tree (pmap_enter() does later)
 */
static int
pmap_enter_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp,
    vaddr_t va, struct pv_entry **new_pve, struct pv_entry **old_pve,
    bool *samepage, bool *new_embedded, rb_tree_t *tree)
{
	struct pv_entry *pve;
	int error;

	KASSERT(mutex_owned(&pmap->pm_lock));
	KASSERT(ptp_to_pmap(ptp) == pmap);
	KASSERT(ptp == NULL || ptp->uobject != NULL);
	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
	PMAP_CHECK_PP(pp);

	/*
	 * If entering the same page and it's already tracked with an
	 * embedded entry, we can avoid the expense below.  It's safe
	 * to check for this very specific set of values without a lock
	 * because both will only ever be set together for this pmap.
	 */
	if (atomic_load_relaxed(&pp->pp_pte.pte_ptp) == ptp &&
	    atomic_load_relaxed(&pp->pp_pte.pte_va) == va) {
		*samepage = true;
		pmap_check_pv(pmap, ptp, pp, va, true);
		return 0;
	}

	/*
	 * Check for an existing dynamic mapping at this address.  If it's
	 * for the same page, then it will be reused and nothing needs to be
	 * changed.
	 */
	*old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
	if (*old_pve != NULL && (*old_pve)->pve_pp == pp) {
		*samepage = true;
		pmap_check_pv(pmap, ptp, pp, va, true);
		return 0;
	}

	/*
	 * Need to put a new mapping in place.  Grab a spare pv_entry in
	 * case it's needed; won't know for sure until the lock is taken.
	 */
	if (pmap->pm_pve == NULL) {
		pmap->pm_pve = pmap_alloc_pv(pmap);
	}

	error = 0;
	pmap_check_pv(pmap, ptp, pp, va, false);
	mutex_spin_enter(&pp->pp_lock);
	if (!pv_pte_embedded(pp)) {
		/*
		 * Embedded PV tracking available - easy.
		 */
		pp->pp_pte.pte_ptp = ptp;
		pp->pp_pte.pte_va = va;
		*new_embedded = true;
	} else if (__predict_false(pmap->pm_pve == NULL)) {
		/*
		 * No memory.
		 */
		error = ENOMEM;
	} else {
		/*
		 * Install new pv_entry on the page.
		 */
		pve = pmap->pm_pve;
		pmap->pm_pve = NULL;
		*new_pve = pve;
		pve->pve_pte.pte_ptp = ptp;
		pve->pve_pte.pte_va = va;
		pve->pve_pp = pp;
		LIST_INSERT_HEAD(&pp->pp_pvlist, pve, pve_list);
	}
	mutex_spin_exit(&pp->pp_lock);
	if (error == 0) {
		pmap_check_pv(pmap, ptp, pp, va, true);
	}

	return error;
}

/*
 * pmap_remove_pv: try to remove a mapping from a pv_list
 *
 * => pmap must be locked
 * => removes dynamic entries from tree and frees them
 * => caller should adjust ptp's wire_count and free PTP if needed
 */
static void
pmap_remove_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp,
    vaddr_t va, struct pv_entry *pve, uint8_t oattrs)
{
	rb_tree_t *tree = (ptp != NULL ?
&VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb); KASSERT(mutex_owned(&pmap->pm_lock)); KASSERT(ptp_to_pmap(ptp) == pmap); KASSERT(ptp == NULL || ptp->uobject != NULL); KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); KASSERT(ptp != NULL || pmap == pmap_kernel()); pmap_check_pv(pmap, ptp, pp, va, true); if (pve == NULL) { mutex_spin_enter(&pp->pp_lock); KASSERT(pp->pp_pte.pte_ptp == ptp); KASSERT(pp->pp_pte.pte_va == va); pp->pp_attrs |= oattrs; pp->pp_pte.pte_ptp = NULL; pp->pp_pte.pte_va = 0; mutex_spin_exit(&pp->pp_lock); } else { mutex_spin_enter(&pp->pp_lock); KASSERT(pp->pp_pte.pte_ptp != ptp || pp->pp_pte.pte_va != va); KASSERT(pve->pve_pte.pte_ptp == ptp); KASSERT(pve->pve_pte.pte_va == va); KASSERT(pve->pve_pp == pp); pp->pp_attrs |= oattrs; LIST_REMOVE(pve, pve_list); mutex_spin_exit(&pp->pp_lock); KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == pve); rb_tree_remove_node(tree, pve); #ifdef DIAGNOSTIC memset(pve, 0, sizeof(*pve)); #endif pmap_free_pv(pmap, pve); } KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); pmap_check_pv(pmap, ptp, pp, va, false); } /* * p t p f u n c t i o n s */ static struct vm_page * pmap_find_ptp(struct pmap *pmap, vaddr_t va, int level) { int lidx = level - 1; off_t off = ptp_va2o(va, level); struct vm_page *pg; KASSERT(mutex_owned(&pmap->pm_lock)); if (pmap->pm_ptphint[lidx] && off == pmap->pm_ptphint[lidx]->offset) { KASSERT(pmap->pm_ptphint[lidx]->wire_count > 0); pg = pmap->pm_ptphint[lidx]; PMAP_CHECK_PP(VM_PAGE_TO_PP(pg)); return pg; } PMAP_DUMMY_LOCK(pmap); pg = uvm_pagelookup(&pmap->pm_obj[lidx], off); PMAP_DUMMY_UNLOCK(pmap); if (pg != NULL && __predict_false(pg->wire_count == 0)) { /* This page is queued to be freed - ignore. */ pg = NULL; } if (pg != NULL) { PMAP_CHECK_PP(VM_PAGE_TO_PP(pg)); } pmap->pm_ptphint[lidx] = pg; return pg; } static inline void pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level) { int lidx; KASSERT(ptp->wire_count <= 1); PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp)); lidx = level - 1; pmap_stats_update(pmap, -ptp->wire_count, 0); if (pmap->pm_ptphint[lidx] == ptp) pmap->pm_ptphint[lidx] = NULL; ptp->wire_count = 0; ptp->uanon = NULL; KASSERT(RB_TREE_MIN(&VM_PAGE_TO_PP(ptp)->pp_rb) == NULL); /* * Enqueue the PTP to be freed by pmap_update(). We can't remove * the page from the uvm_object, as that can take further locks * (intolerable right now because the PTEs are likely mapped in). * Instead mark the PTP as free and if we bump into it again, we'll * either ignore or reuse (depending on what's useful at the time). */ LIST_INSERT_HEAD(&pmap->pm_gc_ptp, ptp, mdpage.mp_pp.pp_link); } static void pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va, pt_entry_t *ptes, pd_entry_t * const *pdes) { unsigned long index; int level; vaddr_t invaladdr; pd_entry_t opde; KASSERT(pmap != pmap_kernel()); KASSERT(mutex_owned(&pmap->pm_lock)); KASSERT(kpreempt_disabled()); level = 1; do { index = pl_i(va, level + 1); opde = pmap_pte_testset(&pdes[level - 1][index], 0); /* * On Xen-amd64 or SVS, we need to sync the top level page * directory on each CPU. */ #if defined(XENPV) && defined(__x86_64__) if (level == PTP_LEVELS - 1) { xen_kpm_sync(pmap, index); } #elif defined(SVS) if (svs_enabled && level == PTP_LEVELS - 1 && pmap_is_user(pmap)) { svs_pmap_sync(pmap, index); } #endif invaladdr = level == 1 ? 
(vaddr_t)ptes : (vaddr_t)pdes[level - 2]; pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE, opde, TLBSHOOT_FREE_PTP); #if defined(XENPV) pmap_tlb_shootnow(); #endif pmap_freepage(pmap, ptp, level); if (level < PTP_LEVELS - 1) { ptp = pmap_find_ptp(pmap, va, level + 1); ptp->wire_count--; if (ptp->wire_count > 1) break; } } while (++level < PTP_LEVELS); pmap_pte_flush(); } /* * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one) * * => pmap should NOT be pmap_kernel() * => pmap should be locked * => we are not touching any PTEs yet, so they need not be mapped in */ static int pmap_get_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va, int flags, struct vm_page **resultp) { struct vm_page *ptp; int i, aflags; struct uvm_object *obj; voff_t off; KASSERT(pmap != pmap_kernel()); KASSERT(mutex_owned(&pmap->pm_lock)); /* * Loop through all page table levels allocating a page * for any level where we don't already have one. */ memset(pt, 0, sizeof(*pt)); aflags = ((flags & PMAP_CANFAIL) ? 0 : UVM_PGA_USERESERVE) | UVM_PGA_ZERO; for (i = PTP_LEVELS; i > 1; i--) { obj = &pmap->pm_obj[i - 2]; off = ptp_va2o(va, i - 1); PMAP_DUMMY_LOCK(pmap); pt->pg[i] = uvm_pagelookup(obj, off); if (pt->pg[i] == NULL) { pt->pg[i] = uvm_pagealloc(obj, off, NULL, aflags); pt->alloced[i] = (pt->pg[i] != NULL); } else if (pt->pg[i]->wire_count == 0) { /* This page was queued to be freed; dequeue it. */ LIST_REMOVE(pt->pg[i], mdpage.mp_pp.pp_link); pt->alloced[i] = true; } PMAP_DUMMY_UNLOCK(pmap); if (pt->pg[i] == NULL) { pmap_unget_ptp(pmap, pt); return ENOMEM; } else if (pt->alloced[i]) { pt->pg[i]->uanon = (struct vm_anon *)(vaddr_t)~0L; rb_tree_init(&VM_PAGE_TO_PP(pt->pg[i])->pp_rb, &pmap_rbtree_ops); PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i])); } } ptp = pt->pg[2]; KASSERT(ptp != NULL); *resultp = ptp; pmap->pm_ptphint[0] = ptp; return 0; } /* * pmap_install_ptp: install any freshly allocated PTPs * * => pmap should NOT be pmap_kernel() * => pmap should be locked * => PTEs must be mapped * => preemption must be disabled */ static void pmap_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va, pd_entry_t * const *pdes) { struct vm_page *ptp; unsigned long index; pd_entry_t *pva; paddr_t pa; int i; KASSERT(pmap != pmap_kernel()); KASSERT(mutex_owned(&pmap->pm_lock)); KASSERT(kpreempt_disabled()); /* * Now that we have all the pages looked up or allocated, * loop through again installing any new ones into the tree. */ for (i = PTP_LEVELS; i > 1; i--) { index = pl_i(va, i); pva = pdes[i - 2]; if (pmap_valid_entry(pva[index])) { KASSERT(!pt->alloced[i]); continue; } ptp = pt->pg[i]; ptp->flags &= ~PG_BUSY; /* never busy */ ptp->wire_count = 1; pmap->pm_ptphint[i - 2] = ptp; pa = VM_PAGE_TO_PHYS(ptp); pmap_pte_set(&pva[index], (pd_entry_t) (pmap_pa2pte(pa) | PTE_U | PTE_W | PTE_P)); /* * On Xen-amd64 or SVS, we need to sync the top level page * directory on each CPU. */ #if defined(XENPV) && defined(__x86_64__) if (i == PTP_LEVELS) { xen_kpm_sync(pmap, index); } #elif defined(SVS) if (svs_enabled && i == PTP_LEVELS && pmap_is_user(pmap)) { svs_pmap_sync(pmap, index); } #endif pmap_pte_flush(); pmap_stats_update(pmap, 1, 0); /* * If we're not in the top level, increase the * wire count of the parent page. 
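		 * Each PTP's wire_count is one more than the number of
		 * entries in use (PTEs for a leaf, child PTPs for an
		 * upper level), so a parent is only torn down once its
		 * last child goes away (see pmap_free_ptp()).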
*/ if (i < PTP_LEVELS) { pt->pg[i + 1]->wire_count++; } } } /* * pmap_unget_ptp: free unusued PTPs * * => pmap should NOT be pmap_kernel() * => pmap should be locked */ static void pmap_unget_ptp(struct pmap *pmap, struct pmap_ptparray *pt) { int i; KASSERT(pmap != pmap_kernel()); KASSERT(mutex_owned(&pmap->pm_lock)); for (i = PTP_LEVELS; i > 1; i--) { if (!pt->alloced[i]) { continue; } KASSERT(pt->pg[i]->wire_count == 0); PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i])); pmap_freepage(pmap, pt->pg[i], i - 1); } } /* * p m a p l i f e c y c l e f u n c t i o n s */ /* * pmap_pdp_init: constructor a new PDP. */ static void pmap_pdp_init(pd_entry_t *pdir) { paddr_t pdirpa = 0; vaddr_t object; int i; #if !defined(XENPV) || !defined(__x86_64__) int npde; #endif #ifdef XENPV int s; #endif memset(PAGE_ALIGNED(pdir), 0, PDP_SIZE * PAGE_SIZE); /* * NOTE: This is all done unlocked, but we will check afterwards * if we have raced with pmap_growkernel(). */ #if defined(XENPV) && defined(__x86_64__) /* Fetch the physical address of the page directory */ (void)pmap_extract(pmap_kernel(), (vaddr_t)pdir, &pdirpa); /* * This pdir will NEVER be active in kernel mode, so mark * recursive entry invalid. */ pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa); /* * PDP constructed this way won't be for the kernel, hence we * don't put kernel mappings on Xen. * * But we need to make pmap_create() happy, so put a dummy * (without PTE_P) value at the right place. */ pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] = (pd_entry_t)-1 & PTE_FRAME; #else /* XENPV && __x86_64__*/ object = (vaddr_t)pdir; for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { /* Fetch the physical address of the page directory */ (void)pmap_extract(pmap_kernel(), object, &pdirpa); /* Put in recursive PDE to map the PTEs */ pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PTE_P | pmap_pg_nx; #ifndef XENPV pdir[PDIR_SLOT_PTE + i] |= PTE_W; #endif } /* Copy the kernel's top level PDE */ npde = nkptp[PTP_LEVELS - 1]; memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN], npde * sizeof(pd_entry_t)); if (VM_MIN_KERNEL_ADDRESS != KERNBASE) { int idx = pl_i(KERNBASE, PTP_LEVELS); pdir[idx] = PDP_BASE[idx]; } #ifdef __HAVE_PCPU_AREA pdir[PDIR_SLOT_PCPU] = PDP_BASE[PDIR_SLOT_PCPU]; #endif #ifdef __HAVE_DIRECT_MAP slotspace_copy(SLAREA_DMAP, pdir, PDP_BASE); #endif #ifdef KASAN slotspace_copy(SLAREA_ASAN, pdir, PDP_BASE); #endif #ifdef KMSAN slotspace_copy(SLAREA_MSAN, pdir, PDP_BASE); #endif #endif /* XENPV && __x86_64__*/ #ifdef XENPV s = splvm(); object = (vaddr_t)pdir; pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE), VM_PROT_READ); pmap_update(pmap_kernel()); for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { /* * pin as L2/L4 page, we have to do the page with the * PDIR_SLOT_PTE entries last */ #ifdef PAE if (i == l2tol3(PDIR_SLOT_PTE)) continue; #endif (void) pmap_extract(pmap_kernel(), object, &pdirpa); #ifdef __x86_64__ xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa)); #else xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); #endif } #ifdef PAE object = ((vaddr_t)pdir) + PAGE_SIZE * l2tol3(PDIR_SLOT_PTE); (void)pmap_extract(pmap_kernel(), object, &pdirpa); xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); #endif splx(s); #endif /* XENPV */ } /* * pmap_pdp_fini: destructor for the PDPs. 
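 *
 * Under XENPV the constituent pages were write-protected and pinned as
 * page-table pages by pmap_pdp_init(), so they are unpinned and made
 * writable again before the PDP goes back to the pool; on other
 * configurations there is nothing to undo.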
 */
static void
pmap_pdp_fini(pd_entry_t *pdir)
{
#ifdef XENPV
	paddr_t pdirpa = 0;	/* XXX: GCC */
	vaddr_t object = (vaddr_t)pdir;
	int i;
	int s = splvm();
	pt_entry_t *pte;

	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
		/* fetch the physical address of the page directory. */
		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
		/* unpin page table */
		xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa));
	}
	object = (vaddr_t)pdir;
	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
		/* Set page RW again */
		pte = kvtopte(object);
		pmap_pte_set(pte, *pte | PTE_W);
		xen_bcast_invlpg((vaddr_t)object);
	}
	splx(s);
#endif /* XENPV */
}

#ifdef PAE
static void *
pmap_pdp_alloc(struct pool *pp, int flags)
{
	return (void *)uvm_km_alloc(kernel_map,
	    PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE,
	    ((flags & PR_WAITOK) ? UVM_KMF_WAITVA
		: UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) | UVM_KMF_WIRED);
}

static void
pmap_pdp_free(struct pool *pp, void *v)
{
	uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE,
	    UVM_KMF_WIRED);
}
#endif /* PAE */

/*
 * pmap_ctor: constructor for the pmap cache.
 */
static int
pmap_ctor(void *arg, void *obj, int flags)
{
	struct pmap *pmap = obj;
	pt_entry_t p;
	int i;

	KASSERT((flags & PR_WAITOK) != 0);

	mutex_init(&pmap->pm_lock, MUTEX_DEFAULT, IPL_NONE);
	rw_init(&pmap->pm_dummy_lock);
	kcpuset_create(&pmap->pm_cpus, true);
	kcpuset_create(&pmap->pm_kernel_cpus, true);
#ifdef XENPV
	kcpuset_create(&pmap->pm_xen_ptp_cpus, true);
#endif
	LIST_INIT(&pmap->pm_gc_ptp);
	pmap->pm_pve = NULL;
	LIST_INIT(&pmap->pm_pvp_full);
	LIST_INIT(&pmap->pm_pvp_part);
	LIST_INIT(&pmap->pm_pvp_empty);

	/* allocate and init PDP */
	pmap->pm_pdir = pool_get(&pmap_pdp_pool, PR_WAITOK);
	for (;;) {
		pmap_pdp_init(pmap->pm_pdir);
		mutex_enter(&pmaps_lock);
		p = pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1];
		if (__predict_true(p != 0)) {
			break;
		}
		mutex_exit(&pmaps_lock);
	}

	for (i = 0; i < PDP_SIZE; i++)
		pmap->pm_pdirpa[i] =
		    pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);

	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
	mutex_exit(&pmaps_lock);

	return 0;
}

/*
 * pmap_dtor: destructor for the pmap cache.
 */
static void
pmap_dtor(void *arg, void *obj)
{
	struct pmap *pmap = obj;

	mutex_enter(&pmaps_lock);
	LIST_REMOVE(pmap, pm_list);
	mutex_exit(&pmaps_lock);

	pmap_pdp_fini(pmap->pm_pdir);
	pool_put(&pmap_pdp_pool, pmap->pm_pdir);
	mutex_destroy(&pmap->pm_lock);
	rw_destroy(&pmap->pm_dummy_lock);
	kcpuset_destroy(pmap->pm_cpus);
	kcpuset_destroy(pmap->pm_kernel_cpus);
#ifdef XENPV
	kcpuset_destroy(pmap->pm_xen_ptp_cpus);
#endif
}

/*
 * pmap_create: create a pmap object.
 */
struct pmap *
pmap_create(void)
{
	struct pmap *pmap;
	int i;

	pmap = pool_cache_get(&pmap_cache, PR_WAITOK);

	/* init uvm_object */
	for (i = 0; i < PTP_LEVELS - 1; i++) {
		uvm_obj_init(&pmap->pm_obj[i], &pmap_pager, false, 1);
		uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_dummy_lock);
		pmap->pm_ptphint[i] = NULL;
	}
	pmap->pm_stats.wired_count = 0;
	/* count the PDP allocd below */
	pmap->pm_stats.resident_count = PDP_SIZE;
#if !defined(__x86_64__)
	pmap->pm_hiexec = 0;
#endif

	/* Used by NVMM and Xen */
	pmap->pm_enter = NULL;
	pmap->pm_extract = NULL;
	pmap->pm_remove = NULL;
	pmap->pm_sync_pv = NULL;
	pmap->pm_pp_remove_ent = NULL;
	pmap->pm_write_protect = NULL;
	pmap->pm_unwire = NULL;
	pmap->pm_tlb_flush = NULL;
	pmap->pm_data = NULL;

	/* init the LDT */
	pmap->pm_ldt = NULL;
	pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);

	return pmap;
}

/*
 * pmap_check_ptps: verify that none of the pmap's page table objects
 * have any pages allocated to them.
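 *
 * Used as a cheap sanity check by pmap_destroy() and pmap_remove_all():
 * by the time they run, every PTP should already have been freed and the
 * per-level uvm_objects should be empty.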
*/ static void pmap_check_ptps(struct pmap *pmap) { int i; for (i = 0; i < PTP_LEVELS - 1; i++) { KASSERTMSG(pmap->pm_obj[i].uo_npages == 0, "pmap %p level %d still has %d pages", pmap, i, (int)pmap->pm_obj[i].uo_npages); } } static void pmap_check_inuse(struct pmap *pmap) { #ifdef DEBUG CPU_INFO_ITERATOR cii; struct cpu_info *ci; for (CPU_INFO_FOREACH(cii, ci)) { if (ci->ci_pmap == pmap) panic("destroying pmap being used"); #if defined(XENPV) && defined(__x86_64__) for (int i = 0; i < PDIR_SLOT_USERLIM; i++) { if (pmap->pm_pdir[i] != 0 && ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) { printf("pmap_destroy(%p) pmap_kernel %p " "curcpu %d cpu %d ci_pmap %p " "ci->ci_kpm_pdir[%d]=%" PRIx64 " pmap->pm_pdir[%d]=%" PRIx64 "\n", pmap, pmap_kernel(), curcpu()->ci_index, ci->ci_index, ci->ci_pmap, i, ci->ci_kpm_pdir[i], i, pmap->pm_pdir[i]); panic("%s: used pmap", __func__); } } #endif } #endif /* DEBUG */ } /* * pmap_destroy: drop reference count on pmap. free pmap if reference * count goes to zero. * * => we can be called from pmap_unmap_ptes() with a different, unrelated * pmap's lock held. be careful! */ void pmap_destroy(struct pmap *pmap) { int i; /* * drop reference count and verify not in use. */ if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) { return; } pmap_check_inuse(pmap); /* * handle any deferred frees. */ mutex_enter(&pmap->pm_lock); if (pmap->pm_pve != NULL) { pmap_free_pv(pmap, pmap->pm_pve); pmap->pm_pve = NULL; } pmap_drain_pv(pmap); mutex_exit(&pmap->pm_lock); pmap_update(pmap); /* * Reference count is zero, free pmap resources and then free pmap. */ pmap_check_ptps(pmap); KASSERT(LIST_EMPTY(&pmap->pm_gc_ptp)); #ifdef USER_LDT if (pmap->pm_ldt != NULL) { /* * No need to switch the LDT; this address space is gone, * nothing is using it. * * No need to lock the pmap for ldt_free (or anything else), * we're the last one to use it. */ /* XXXAD can't take cpu_lock here - fix soon. */ mutex_enter(&cpu_lock); ldt_free(pmap->pm_ldt_sel); mutex_exit(&cpu_lock); uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt, MAX_USERLDT_SIZE, UVM_KMF_WIRED); } #endif for (i = 0; i < PTP_LEVELS - 1; i++) { uvm_obj_destroy(&pmap->pm_obj[i], false); } kcpuset_zero(pmap->pm_cpus); kcpuset_zero(pmap->pm_kernel_cpus); #ifdef XENPV kcpuset_zero(pmap->pm_xen_ptp_cpus); #endif KASSERT(LIST_EMPTY(&pmap->pm_pvp_full)); KASSERT(LIST_EMPTY(&pmap->pm_pvp_part)); KASSERT(LIST_EMPTY(&pmap->pm_pvp_empty)); pmap_check_ptps(pmap); if (__predict_false(pmap->pm_enter != NULL)) { /* XXX make this a different cache */ pool_cache_destruct_object(&pmap_cache, pmap); } else { pool_cache_put(&pmap_cache, pmap); } } /* * pmap_zap_ptp: clear out an entire PTP without modifying PTEs * * => caller must hold pmap's lock * => PTP must be mapped into KVA * => must be called with kernel preemption disabled * => does as little work as possible */ static void pmap_zap_ptp(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, vaddr_t startva, vaddr_t blkendva) { #ifndef XENPV struct pv_entry *pve; struct vm_page *pg; struct pmap_page *pp; pt_entry_t opte; rb_tree_t *tree; vaddr_t va; int wired; uint8_t oattrs; u_int cnt; KASSERT(mutex_owned(&pmap->pm_lock)); KASSERT(kpreempt_disabled()); KASSERT(pmap != pmap_kernel()); KASSERT(ptp->wire_count > 1); KASSERT(ptp->wire_count - 1 <= PAGE_SIZE / sizeof(pt_entry_t)); /* * Start at the lowest entered VA, and scan until there are no more * PTEs in the PTPs. 
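	 * The lowest VA entered into this PTP is kept in ptp->uanon
	 * (maintained via pmap_ptp_range_set()), and wire_count bounds
	 * the number of valid PTEs, so the scan can start late and stop
	 * early.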
	 */
	tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
	pve = RB_TREE_MIN(tree);
	wired = 0;
	va = (vaddr_t)ptp->uanon;
	pte += ((va - startva) >> PAGE_SHIFT);

	for (cnt = ptp->wire_count; cnt > 1; pte++, va += PAGE_SIZE) {
		/*
		 * No need for an atomic to clear the PTE.  Nothing else can
		 * see the address space any more and speculative access (if
		 * possible) won't modify.  Therefore there's no need to
		 * track the accessed/dirty bits.
		 */
		opte = *pte;
		if (!pmap_valid_entry(opte)) {
			continue;
		}

		/*
		 * Count the PTE.  If it's not for a managed mapping
		 * there's nothing more to do.
		 */
		cnt--;
		wired -= (opte & PTE_WIRED);
		if ((opte & PTE_PVLIST) == 0) {
#ifndef DOM0OPS
			KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte))
			    == NULL),
			    "managed page without PTE_PVLIST for %#"
			    PRIxVADDR, va);
			KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte))
			    == NULL),
			    "pv-tracked page without PTE_PVLIST for %#"
			    PRIxVADDR, va);
#endif
			KASSERT(pmap_treelookup_pv(pmap, ptp,
			    (ptp != NULL ? &VM_PAGE_TO_PP(ptp)->pp_rb :
			    &pmap_kernel_rb), va) == NULL);
			continue;
		}

		/*
		 * "pve" now points to the lowest (by VA) dynamic PV entry
		 * in the PTP.  If it's for this VA, take advantage of it to
		 * avoid calling PHYS_TO_VM_PAGE().  Avoid modifying the RB
		 * tree by skipping to the next VA in the tree whenever
		 * there is a match here.  The tree will be cleared out in
		 * one pass before return to pmap_remove_all().
		 */
		oattrs = pmap_pte_to_pp_attrs(opte);
		if (pve != NULL && pve->pve_pte.pte_va == va) {
			pp = pve->pve_pp;
			KASSERT(pve->pve_pte.pte_ptp == ptp);
			KASSERT(pp->pp_pte.pte_ptp != ptp ||
			    pp->pp_pte.pte_va != va);
			mutex_spin_enter(&pp->pp_lock);
			pp->pp_attrs |= oattrs;
			LIST_REMOVE(pve, pve_list);
			mutex_spin_exit(&pp->pp_lock);

			/*
			 * pve won't be touched again until pmap_drain_pv(),
			 * so it's still safe to traverse the tree.
			 */
			pmap_free_pv(pmap, pve);
			pve = RB_TREE_NEXT(tree, pve);
			continue;
		}

		/*
		 * No entry in the tree so it must be embedded.  Look up the
		 * page and cancel the embedded entry.
		 */
		if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
			pp = VM_PAGE_TO_PP(pg);
		} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte)))
		    == NULL) {
			paddr_t pa = pmap_pte2pa(opte);
			panic("%s: PTE_PVLIST with pv-untracked page"
			    " va = %#"PRIxVADDR"pa = %#"PRIxPADDR
			    "(%#"PRIxPADDR")", __func__, va, pa, atop(pa));
		}
		mutex_spin_enter(&pp->pp_lock);
		KASSERT(pp->pp_pte.pte_ptp == ptp);
		KASSERT(pp->pp_pte.pte_va == va);
		pp->pp_attrs |= oattrs;
		pp->pp_pte.pte_ptp = NULL;
		pp->pp_pte.pte_va = 0;
		mutex_spin_exit(&pp->pp_lock);
	}

	/* PTP now empty - adjust the tree & stats to match. */
	pmap_stats_update(pmap, -(ptp->wire_count - 1), wired / PTE_WIRED);
	ptp->wire_count = 1;
#ifdef DIAGNOSTIC
	rb_tree_init(tree, &pmap_rbtree_ops);
#endif
#else	/* !XENPV */
	/*
	 * XXXAD For XEN, it's not clear to me that we can do this, because
	 * I guess the hypervisor keeps track of PTEs too.
	 */
	pmap_remove_ptes(pmap, ptp, (vaddr_t)pte, startva, blkendva);
#endif	/* !XENPV */
}

/*
 * pmap_remove_all: remove all mappings from pmap in bulk.
 *
 * Ordinarily when removing mappings it's important to hold the UVM object's
 * lock, so that pages do not gain a new identity while retaining stale TLB
 * entries (the same lock hold covers both pmap_remove() and pmap_update()).
 * Here it's known that the address space is no longer visible to any user
 * process, so we don't need to worry about that.
 */
bool
pmap_remove_all(struct pmap *pmap)
{
	struct vm_page *ptps[32];
	vaddr_t va, blkendva;
	struct pmap *pmap2;
	pt_entry_t *ptes;
	pd_entry_t pde __diagused;
	pd_entry_t * const *pdes;
	int lvl __diagused, i, n;

	/* XXX Can't handle EPT just yet.
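	 * When the pmap has a private remove hook (pm_remove != NULL)
	 * we give up immediately and return false below, leaving the
	 * teardown to the normal remove path.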
*/ if (pmap->pm_remove != NULL) { return false; } for (;;) { /* Fetch a block of PTPs from tree. */ mutex_enter(&pmap->pm_lock); n = radix_tree_gang_lookup_node(&pmap->pm_obj[0].uo_pages, 0, (void **)ptps, __arraycount(ptps), false); if (n == 0) { mutex_exit(&pmap->pm_lock); break; } /* Remove all mappings in the set of PTPs. */ pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); for (i = 0; i < n; i++) { if (ptps[i]->wire_count == 0) { /* It's dead: pmap_update() will expunge. */ continue; } /* Determine range of block. */ va = ptps[i]->offset * PAGE_SIZE / sizeof(pt_entry_t); blkendva = x86_round_pdr(va + 1); /* Make sure everything squares up... */ KASSERT(pmap_pdes_valid(va, pdes, &pde, &lvl)); KASSERT(lvl == 1); KASSERT(pmap_find_ptp(pmap, va, 1) == ptps[i]); /* Zap! */ pmap_zap_ptp(pmap, ptps[i], &ptes[pl1_i(va)], va, blkendva); /* PTP should now be unused - free it. */ KASSERT(ptps[i]->wire_count == 1); pmap_free_ptp(pmap, ptps[i], va, ptes, pdes); } pmap_unmap_ptes(pmap, pmap2); pmap_drain_pv(pmap); pmap_tlb_shootdown(pmap, -1L, 0, TLBSHOOT_REMOVE_ALL); mutex_exit(&pmap->pm_lock); /* Process deferred frees. */ pmap_update(pmap); /* A breathing point. */ preempt_point(); } /* Verify that the pmap is now completely empty. */ pmap_check_ptps(pmap); KASSERTMSG(pmap->pm_stats.resident_count == PDP_SIZE, "pmap %p not empty", pmap); return true; } #if defined(PMAP_FORK) /* * pmap_fork: perform any necessary data structure manipulation when * a VM space is forked. */ void pmap_fork(struct pmap *pmap1, struct pmap *pmap2) { #ifdef USER_LDT union descriptor *new_ldt; int sel; if (__predict_true(pmap1->pm_ldt == NULL)) { return; } /* * Copy the LDT into the new process. * * Read pmap1's ldt pointer unlocked; if it changes behind our back * we'll retry. This will starve if there's a stream of LDT changes * in another thread but that should not happen. */ retry: if (pmap1->pm_ldt != NULL) { /* Allocate space for the new process's LDT */ new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, MAX_USERLDT_SIZE, 0, UVM_KMF_WIRED); if (new_ldt == NULL) { printf("WARNING: %s: unable to allocate LDT space\n", __func__); return; } mutex_enter(&cpu_lock); /* Get a GDT slot for it */ sel = ldt_alloc(new_ldt, MAX_USERLDT_SIZE); if (sel == -1) { mutex_exit(&cpu_lock); uvm_km_free(kernel_map, (vaddr_t)new_ldt, MAX_USERLDT_SIZE, UVM_KMF_WIRED); printf("WARNING: %s: unable to allocate LDT selector\n", __func__); return; } } else { /* Wasn't anything there after all. */ new_ldt = NULL; sel = -1; mutex_enter(&cpu_lock); } /* * Now that we have cpu_lock, ensure the LDT status is the same. */ if (pmap1->pm_ldt != NULL) { if (new_ldt == NULL) { /* A wild LDT just appeared. */ mutex_exit(&cpu_lock); goto retry; } /* Copy the LDT data and install it in pmap2 */ memcpy(new_ldt, pmap1->pm_ldt, MAX_USERLDT_SIZE); pmap2->pm_ldt = new_ldt; pmap2->pm_ldt_sel = sel; mutex_exit(&cpu_lock); } else { if (new_ldt != NULL) { /* The LDT disappeared, drop what we did. */ ldt_free(sel); mutex_exit(&cpu_lock); uvm_km_free(kernel_map, (vaddr_t)new_ldt, MAX_USERLDT_SIZE, UVM_KMF_WIRED); return; } /* We're good, just leave. */ mutex_exit(&cpu_lock); } #endif /* USER_LDT */ } #endif /* PMAP_FORK */ #ifdef USER_LDT /* * pmap_ldt_xcall: cross call used by pmap_ldt_sync. if the named pmap * is active, reload LDTR. 
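 *
 * Broadcast to every CPU by pmap_ldt_sync() via xc_broadcast(); with SVS
 * enabled the reload goes through svs_ldt_sync() rather than a direct
 * lldt().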
*/ static void pmap_ldt_xcall(void *arg1, void *arg2) { struct pmap *pm; kpreempt_disable(); pm = arg1; if (curcpu()->ci_pmap == pm) { #if defined(SVS) if (svs_enabled) { svs_ldt_sync(pm); } else #endif lldt(pm->pm_ldt_sel); } kpreempt_enable(); } /* * pmap_ldt_sync: LDT selector for the named pmap is changing. swap * in the new selector on all CPUs. */ void pmap_ldt_sync(struct pmap *pm) { uint64_t where; KASSERT(mutex_owned(&cpu_lock)); pmap_ldt_evcnt.ev_count++; where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL); xc_wait(where); } /* * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and * restore the default. */ void pmap_ldt_cleanup(struct lwp *l) { pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap; union descriptor *ldt; int sel; if (__predict_true(pmap->pm_ldt == NULL)) { return; } mutex_enter(&cpu_lock); if (pmap->pm_ldt != NULL) { sel = pmap->pm_ldt_sel; ldt = pmap->pm_ldt; pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); pmap->pm_ldt = NULL; pmap_ldt_sync(pmap); ldt_free(sel); uvm_km_free(kernel_map, (vaddr_t)ldt, MAX_USERLDT_SIZE, UVM_KMF_WIRED); } mutex_exit(&cpu_lock); } #endif /* USER_LDT */ /* * pmap_activate: activate a process' pmap * * => must be called with kernel preemption disabled * => if lwp is the curlwp, then set ci_want_pmapload so that * actual MMU context switch will be done by pmap_load() later */ void pmap_activate(struct lwp *l) { struct cpu_info *ci; struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); KASSERT(kpreempt_disabled()); ci = curcpu(); if (l != ci->ci_curlwp) return; KASSERT(ci->ci_want_pmapload == 0); KASSERT(ci->ci_tlbstate != TLBSTATE_VALID); /* * no need to switch to kernel vmspace because * it's a subset of any vmspace. */ if (pmap == pmap_kernel()) { ci->ci_want_pmapload = 0; return; } ci->ci_want_pmapload = 1; } #if defined(XENPV) && defined(__x86_64__) #define KASSERT_PDIRPA(pmap) \ KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd || \ pmap == pmap_kernel()) #elif defined(PAE) #define KASSERT_PDIRPA(pmap) \ KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])) #elif !defined(XENPV) #define KASSERT_PDIRPA(pmap) \ KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3())) #else #define KASSERT_PDIRPA(pmap) KASSERT(true) /* nothing to do */ #endif /* * pmap_reactivate: try to regain reference to the pmap. * * => Must be called with kernel preemption disabled. */ static void pmap_reactivate(struct pmap *pmap) { struct cpu_info * const ci = curcpu(); const cpuid_t cid = cpu_index(ci); KASSERT(kpreempt_disabled()); KASSERT_PDIRPA(pmap); /* * If we still have a lazy reference to this pmap, we can assume * that there was no TLB shootdown for this pmap in the meantime. * * The order of events here is important as we must synchronize * with TLB shootdown interrupts. Declare interest in invalidations * (TLBSTATE_VALID) and then check the CPU set, which the IPIs can * change only when the state is TLBSTATE_LAZY. */ ci->ci_tlbstate = TLBSTATE_VALID; KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid)); if (__predict_true(kcpuset_isset(pmap->pm_cpus, cid))) { /* We have the reference, state is valid. */ } else { /* * Must reload the TLB, pmap has been changed during * deactivated. */ kcpuset_atomic_set(pmap->pm_cpus, cid); tlbflush(); } } /* * pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register * and relevant LDT info. * * Ensures that the current process' pmap is loaded on the current CPU's * MMU and that there are no stale TLB entries. 
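 *
 * The retry logic below is roughly (sketch only):
 *
 *	pctr = lwp_pctr();
 *	... take a reference and switch page tables (may block) ...
 *	if (lwp_pctr() != pctr)
 *		goto retry;	(we were preempted or migrated)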
* * => The caller should disable kernel preemption or do check-and-retry * to prevent a preemption from undoing our efforts. * => This function may block. */ void pmap_load(void) { struct cpu_info *ci; struct pmap *pmap, *oldpmap; struct lwp *l; uint64_t pctr; int ilevel __diagused; u_long psl __diagused; kpreempt_disable(); retry: ci = curcpu(); if (!ci->ci_want_pmapload) { kpreempt_enable(); return; } l = ci->ci_curlwp; pctr = lwp_pctr(); __insn_barrier(); /* should be able to take ipis. */ KASSERTMSG((ilevel = ci->ci_ilevel) < IPL_HIGH, "ilevel=%d", ilevel); #ifdef XENPV /* Check to see if interrupts are enabled (ie; no events are masked) */ KASSERTMSG((psl = x86_read_psl()) == 0, "psl=0x%lx", psl); #else KASSERTMSG(((psl = x86_read_psl()) & PSL_I) != 0, "psl=0x%lx", psl); #endif KASSERT(l != NULL); pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); KASSERT(pmap != pmap_kernel()); oldpmap = ci->ci_pmap; if (pmap == oldpmap) { pmap_reactivate(pmap); ci->ci_want_pmapload = 0; kpreempt_enable(); return; } /* * Acquire a reference to the new pmap and perform the switch. */ pmap_reference(pmap); pmap_load1(l, pmap, oldpmap); ci->ci_want_pmapload = 0; /* * we're now running with the new pmap. drop the reference * to the old pmap. if we block, we need to go around again. */ pmap_destroy(oldpmap); __insn_barrier(); if (lwp_pctr() != pctr) { goto retry; } kpreempt_enable(); } /* * pmap_load1: the guts of pmap load, shared by pmap_map_ptes() and * pmap_load(). It's critically important that this function does not * block. */ static void pmap_load1(struct lwp *l, struct pmap *pmap, struct pmap *oldpmap) { struct cpu_info *ci; struct pcb *pcb; cpuid_t cid; KASSERT(kpreempt_disabled()); pcb = lwp_getpcb(l); ci = l->l_cpu; cid = cpu_index(ci); kcpuset_atomic_clear(oldpmap->pm_cpus, cid); kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid); KASSERT_PDIRPA(oldpmap); KASSERT(!kcpuset_isset(pmap->pm_cpus, cid)); KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid)); /* * Mark the pmap in use by this CPU. Again, we must synchronize * with TLB shootdown interrupts, so set the state VALID first, * then register us for shootdown events on this pmap. */ ci->ci_tlbstate = TLBSTATE_VALID; kcpuset_atomic_set(pmap->pm_cpus, cid); kcpuset_atomic_set(pmap->pm_kernel_cpus, cid); ci->ci_pmap = pmap; /* * update tss. now that we have registered for invalidations * from other CPUs, we're good to load the page tables. */ #ifdef PAE pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa; #else pcb->pcb_cr3 = pmap_pdirpa(pmap, 0); #endif #ifdef i386 #ifndef XENPV ci->ci_tss->tss.tss_ldt = pmap->pm_ldt_sel; ci->ci_tss->tss.tss_cr3 = pcb->pcb_cr3; #endif #endif #if defined(SVS) && defined(USER_LDT) if (svs_enabled) { svs_ldt_sync(pmap); } else #endif lldt(pmap->pm_ldt_sel); cpu_load_pmap(pmap, oldpmap); } /* * pmap_deactivate: deactivate a process' pmap. * * => Must be called with kernel preemption disabled (high IPL is enough). */ void pmap_deactivate(struct lwp *l) { struct pmap *pmap; struct cpu_info *ci; KASSERT(kpreempt_disabled()); if (l != curlwp) { return; } /* * Wait for pending TLB shootdowns to complete. Necessary because * TLB shootdown state is per-CPU, and the LWP may be coming off * the CPU before it has a chance to call pmap_update(), e.g. due * to kernel preemption or blocking routine in between. */ pmap_tlb_shootnow(); ci = curcpu(); if (ci->ci_want_pmapload) { /* * ci_want_pmapload means that our pmap is not loaded on * the CPU or TLB might be stale. note that pmap_kernel() * is always considered loaded. 
*/ KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) != pmap_kernel()); KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID); /* * userspace has not been touched. * nothing to do here. */ ci->ci_want_pmapload = 0; return; } pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); if (pmap == pmap_kernel()) { return; } KASSERT_PDIRPA(pmap); KASSERT(ci->ci_pmap == pmap); /* * we aren't interested in TLB invalidations for this pmap, * at least for the time being. */ KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); ci->ci_tlbstate = TLBSTATE_LAZY; } #ifdef EFI_RUNTIME extern struct pmap *efi_runtime_pmap; /* * pmap_is_user: true if pmap, which must not be the kernel pmap, is * for an unprivileged user process */ bool pmap_is_user(struct pmap *pmap) { KASSERT(pmap != pmap_kernel()); return (pmap != efi_runtime_pmap); } /* * pmap_activate_sync: synchronously activate specified pmap. * * => Must be called with kernel preemption disabled (high IPL is enough). * => Must not sleep before pmap_deactivate_sync. */ void * pmap_activate_sync(struct pmap *pmap) { struct cpu_info *ci = curcpu(); struct pmap *oldpmap = ci->ci_pmap; unsigned cid = cpu_index(ci); KASSERT(kpreempt_disabled()); KASSERT(pmap != pmap_kernel()); KASSERT(!kcpuset_isset(pmap->pm_cpus, cid)); KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid)); if (oldpmap) { KASSERT_PDIRPA(oldpmap); kcpuset_atomic_clear(oldpmap->pm_cpus, cid); kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid); } ci->ci_tlbstate = TLBSTATE_VALID; kcpuset_atomic_set(pmap->pm_cpus, cid); kcpuset_atomic_set(pmap->pm_kernel_cpus, cid); ci->ci_pmap = pmap; #if defined(SVS) && defined(USER_LDT) if (svs_enabled) { svs_ldt_sync(pmap); } else #endif lldt(pmap->pm_ldt_sel); cpu_load_pmap(pmap, oldpmap); return oldpmap; } /* * pmap_deactivate_sync: synchronously deactivate specified pmap and * restore whatever was active before pmap_activate_sync. * * => Must be called with kernel preemption disabled (high IPL is enough). * => Must not have slept since pmap_activate_sync. */ void pmap_deactivate_sync(struct pmap *pmap, void *cookie) { struct cpu_info *ci = curcpu(); struct pmap *oldpmap = cookie; unsigned cid = cpu_index(ci); KASSERT(kpreempt_disabled()); KASSERT(pmap != pmap_kernel()); KASSERT(ci->ci_pmap == pmap); KASSERT_PDIRPA(pmap); KASSERT(kcpuset_isset(pmap->pm_cpus, cid)); KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid)); pmap_tlb_shootnow(); kcpuset_atomic_clear(pmap->pm_cpus, cid); kcpuset_atomic_clear(pmap->pm_kernel_cpus, cid); ci->ci_tlbstate = TLBSTATE_VALID; ci->ci_pmap = oldpmap; if (oldpmap) { kcpuset_atomic_set(oldpmap->pm_cpus, cid); kcpuset_atomic_set(oldpmap->pm_kernel_cpus, cid); #if defined(SVS) && defined(USER_LDT) if (svs_enabled) { svs_ldt_sync(oldpmap); } else #endif lldt(oldpmap->pm_ldt_sel); cpu_load_pmap(oldpmap, pmap); } else { lcr3(pmap_pdirpa(pmap_kernel(), 0)); } } #endif /* EFI_RUNTIME */ /* * some misc. 
functions */ bool pmap_pdes_valid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde, int *lastlvl) { unsigned long index; pd_entry_t pde; int i; for (i = PTP_LEVELS; i > 1; i--) { index = pl_i(va, i); pde = pdes[i - 2][index]; if ((pde & PTE_P) == 0) { *lastlvl = i; return false; } if (pde & PTE_PS) break; } if (lastpde != NULL) *lastpde = pde; *lastlvl = i; return true; } /* * pmap_extract: extract a PA for the given VA */ bool pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap) { pt_entry_t *ptes, pte; pd_entry_t pde; pd_entry_t * const *pdes; struct pmap *pmap2; paddr_t pa; bool rv; int lvl; if (__predict_false(pmap->pm_extract != NULL)) { return (*pmap->pm_extract)(pmap, va, pap); } #ifdef __HAVE_DIRECT_MAP if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) { if (pap != NULL) { *pap = PMAP_DIRECT_UNMAP(va); } return true; } #endif rv = false; pa = 0; if (pmap != pmap_kernel()) { mutex_enter(&pmap->pm_lock); } pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); if (pmap_pdes_valid(va, pdes, &pde, &lvl)) { if (lvl == 2) { pa = (pde & PTE_LGFRAME) | (va & (NBPD_L2 - 1)); rv = true; } else { KASSERT(lvl == 1); pte = ptes[pl1_i(va)]; if (__predict_true((pte & PTE_P) != 0)) { pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1)); rv = true; } } } pmap_unmap_ptes(pmap, pmap2); if (pmap != pmap_kernel()) { mutex_exit(&pmap->pm_lock); } if (pap != NULL) { *pap = pa; } return rv; } /* * vtophys: virtual address to physical address. For use by * machine-dependent code only. */ paddr_t vtophys(vaddr_t va) { paddr_t pa; if (pmap_extract(pmap_kernel(), va, &pa) == true) return pa; return 0; } __strict_weak_alias(pmap_extract_ma, pmap_extract); #ifdef XENPV /* * vtomach: virtual address to machine address. For use by * machine-dependent code only. */ paddr_t vtomach(vaddr_t va) { paddr_t pa; if (pmap_extract_ma(pmap_kernel(), va, &pa) == true) return pa; return 0; } #endif /* * pmap_virtual_space: used during bootup [pmap_steal_memory] to * determine the bounds of the kernel virtual address space. */ void pmap_virtual_space(vaddr_t *startp, vaddr_t *endp) { *startp = virtual_avail; *endp = virtual_end; } void pmap_zero_page(paddr_t pa) { #if defined(__HAVE_DIRECT_MAP) memset(PAGE_ALIGNED(PMAP_DIRECT_MAP(pa)), 0, PAGE_SIZE); #else #if defined(XENPV) if (XEN_VERSION_SUPPORTED(3, 4)) { xen_pagezero(pa); return; } #endif struct cpu_info *ci; pt_entry_t *zpte; vaddr_t zerova; const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_D | PTE_A; kpreempt_disable(); ci = curcpu(); zerova = ci->vpage[VPAGE_ZER]; zpte = ci->vpage_pte[VPAGE_ZER]; KASSERTMSG(!*zpte, "pmap_zero_page: lock botch"); pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags); pmap_pte_flush(); pmap_update_pg(zerova); /* flush TLB */ memset(PAGE_ALIGNED(zerova), 0, PAGE_SIZE); #if defined(DIAGNOSTIC) || defined(XENPV) pmap_pte_set(zpte, 0); /* zap ! 
*/ pmap_pte_flush(); #endif kpreempt_enable(); #endif /* defined(__HAVE_DIRECT_MAP) */ } void pmap_copy_page(paddr_t srcpa, paddr_t dstpa) { #if defined(__HAVE_DIRECT_MAP) vaddr_t srcva = PMAP_DIRECT_MAP(srcpa); vaddr_t dstva = PMAP_DIRECT_MAP(dstpa); memcpy(PAGE_ALIGNED(dstva), PAGE_ALIGNED(srcva), PAGE_SIZE); #else #if defined(XENPV) if (XEN_VERSION_SUPPORTED(3, 4)) { xen_copy_page(srcpa, dstpa); return; } #endif struct cpu_info *ci; pt_entry_t *srcpte, *dstpte; vaddr_t srcva, dstva; const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A; kpreempt_disable(); ci = curcpu(); srcva = ci->vpage[VPAGE_SRC]; dstva = ci->vpage[VPAGE_DST]; srcpte = ci->vpage_pte[VPAGE_SRC]; dstpte = ci->vpage_pte[VPAGE_DST]; KASSERT(*srcpte == 0 && *dstpte == 0); pmap_pte_set(srcpte, pmap_pa2pte(srcpa) | pteflags); pmap_pte_set(dstpte, pmap_pa2pte(dstpa) | pteflags | PTE_D); pmap_pte_flush(); pmap_update_pg(srcva); pmap_update_pg(dstva); memcpy(PAGE_ALIGNED(dstva), PAGE_ALIGNED(srcva), PAGE_SIZE); #if defined(DIAGNOSTIC) || defined(XENPV) pmap_pte_set(srcpte, 0); pmap_pte_set(dstpte, 0); pmap_pte_flush(); #endif kpreempt_enable(); #endif /* defined(__HAVE_DIRECT_MAP) */ } static pt_entry_t * pmap_map_ptp(struct vm_page *ptp) { #ifdef __HAVE_DIRECT_MAP return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp)); #else struct cpu_info *ci; pt_entry_t *ptppte; vaddr_t ptpva; KASSERT(kpreempt_disabled()); #ifndef XENPV const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A | PTE_D; #else const pd_entry_t pteflags = PTE_P | pmap_pg_nx | PTE_A | PTE_D; #endif ci = curcpu(); ptpva = ci->vpage[VPAGE_PTP]; ptppte = ci->vpage_pte[VPAGE_PTP]; pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | pteflags); pmap_pte_flush(); pmap_update_pg(ptpva); return (pt_entry_t *)ptpva; #endif } static void pmap_unmap_ptp(void) { #ifndef __HAVE_DIRECT_MAP #if defined(DIAGNOSTIC) || defined(XENPV) struct cpu_info *ci; pt_entry_t *pte; KASSERT(kpreempt_disabled()); ci = curcpu(); pte = ci->vpage_pte[VPAGE_PTP]; if (*pte != 0) { pmap_pte_set(pte, 0); pmap_pte_flush(); } #endif #endif } static pt_entry_t * pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va) { KASSERT(kpreempt_disabled()); if (pmap_is_curpmap(pmap)) { return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */ } KASSERT(ptp != NULL); return pmap_map_ptp(ptp) + pl1_pi(va); } static void pmap_unmap_pte(void) { KASSERT(kpreempt_disabled()); pmap_unmap_ptp(); } /* * p m a p r e m o v e f u n c t i o n s * * functions that remove mappings */ /* * pmap_remove_ptes: remove PTEs from a PTP * * => caller must hold pmap's lock * => PTP must be mapped into KVA * => PTP should be null if pmap == pmap_kernel() * => must be called with kernel preemption disabled * => returns composite pte if at least one page should be shot down */ static void pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva, vaddr_t startva, vaddr_t endva) { pt_entry_t *pte = (pt_entry_t *)ptpva; KASSERT(mutex_owned(&pmap->pm_lock)); KASSERT(kpreempt_disabled()); /* * mappings are very often sparse, so clip the given range to the * range of PTEs that are known present in the PTP. */ pmap_ptp_range_clip(ptp, &startva, &pte); /* * note that ptpva points to the PTE that maps startva. this may * or may not be the first PTE in the PTP. * * we loop through the PTP while there are still PTEs to look at * and the wire_count is greater than 1 (because we use the wire_count * to keep track of the number of real PTEs in the PTP). 
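	 * (wire_count is the number of valid PTEs plus one for the PTP
	 * itself, hence the "> 1" test.)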
*/ while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) { (void)pmap_remove_pte(pmap, ptp, pte, startva); startva += PAGE_SIZE; pte++; } } /* * pmap_remove_pte: remove a single PTE from a PTP. * * => caller must hold pmap's lock * => PTP must be mapped into KVA * => PTP should be null if pmap == pmap_kernel() * => returns true if we removed a mapping * => must be called with kernel preemption disabled */ static bool pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, vaddr_t va) { struct pv_entry *pve; struct vm_page *pg; struct pmap_page *pp; pt_entry_t opte; KASSERT(mutex_owned(&pmap->pm_lock)); KASSERT(kpreempt_disabled()); if (!pmap_valid_entry(*pte)) { /* VA not mapped. */ return false; } /* Atomically save the old PTE and zap it. */ opte = pmap_pte_testset(pte, 0); if (!pmap_valid_entry(opte)) { return false; } pmap_exec_account(pmap, va, opte, 0); pmap_stats_update_bypte(pmap, 0, opte); if (ptp) { /* * Dropping a PTE. Make sure that the PDE is flushed. */ ptp->wire_count--; if (ptp->wire_count <= 1) { opte |= PTE_A; } } if ((opte & PTE_A) != 0) { pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE); } /* * If we are not on a pv list - we are done. */ if ((opte & PTE_PVLIST) == 0) { #ifndef DOM0OPS KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL), "managed page without PTE_PVLIST for %#"PRIxVADDR, va); KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL), "pv-tracked page without PTE_PVLIST for %#"PRIxVADDR, va); #endif KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ? &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL); return true; } if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) { pp = VM_PAGE_TO_PP(pg); } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) { paddr_t pa = pmap_pte2pa(opte); panic("%s: PTE_PVLIST with pv-untracked page" " va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")", __func__, va, pa, atop(pa)); } /* Sync R/M bits. */ pve = pmap_lookup_pv(pmap, ptp, pp, va); pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_pte_to_pp_attrs(opte)); return true; } static void pmap_remove_locked(struct pmap *pmap, vaddr_t sva, vaddr_t eva) { pt_entry_t *ptes; pd_entry_t pde; pd_entry_t * const *pdes; bool result; vaddr_t blkendva, va = sva; struct vm_page *ptp; struct pmap *pmap2; int lvl; KASSERT(mutex_owned(&pmap->pm_lock)); pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* * removing one page? take shortcut function. */ if (va + PAGE_SIZE == eva) { if (pmap_pdes_valid(va, pdes, &pde, &lvl)) { KASSERT(lvl == 1); /* Get PTP if non-kernel mapping. */ if (pmap != pmap_kernel()) { ptp = pmap_find_ptp(pmap, va, 1); KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected", __func__); } else { /* Never free kernel PTPs. */ ptp = NULL; } result = pmap_remove_pte(pmap, ptp, &ptes[pl1_i(va)], va); /* * if mapping removed and the PTP is no longer * being used, free it! */ if (result && ptp && ptp->wire_count <= 1) pmap_free_ptp(pmap, ptp, va, ptes, pdes); } } else for (/* null */ ; va < eva ; va = blkendva) { /* determine range of block */ blkendva = x86_round_pdr(va+1); if (blkendva > eva) blkendva = eva; if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) { /* Skip a range corresponding to an invalid pde. */ blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1]; continue; } KASSERT(lvl == 1); /* Get PTP if non-kernel mapping. */ if (pmap != pmap_kernel()) { ptp = pmap_find_ptp(pmap, va, 1); KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected", __func__); } else { /* Never free kernel PTPs. 
*/ ptp = NULL; } pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va, blkendva); /* If PTP is no longer being used, free it. */ if (ptp && ptp->wire_count <= 1) { pmap_free_ptp(pmap, ptp, va, ptes, pdes); } } pmap_unmap_ptes(pmap, pmap2); pmap_drain_pv(pmap); } /* * pmap_remove: mapping removal function. * * => caller should not be holding any pmap locks */ void pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva) { if (__predict_false(pmap->pm_remove != NULL)) { (*pmap->pm_remove)(pmap, sva, eva); return; } mutex_enter(&pmap->pm_lock); pmap_remove_locked(pmap, sva, eva); mutex_exit(&pmap->pm_lock); } /* * pmap_sync_pv: clear pte bits and return the old value of the pp_attrs. * * => The 'clearbits' parameter is either ~0 or PP_ATTRS_... * => Caller should disable kernel preemption. * => issues tlb shootdowns if necessary. */ static int pmap_sync_pv(struct pv_pte *pvpte, paddr_t pa, int clearbits, uint8_t *oattrs, pt_entry_t *optep) { struct pmap *pmap; struct vm_page *ptp; vaddr_t va; pt_entry_t *ptep; pt_entry_t opte; pt_entry_t npte; pt_entry_t expect; bool need_shootdown; ptp = pvpte->pte_ptp; va = pvpte->pte_va; KASSERT(ptp == NULL || ptp->uobject != NULL); KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); pmap = ptp_to_pmap(ptp); KASSERT(kpreempt_disabled()); if (__predict_false(pmap->pm_sync_pv != NULL)) { return (*pmap->pm_sync_pv)(ptp, va, pa, clearbits, oattrs, optep); } expect = pmap_pa2pte(pa) | PTE_P; if (clearbits != ~0) { KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0); clearbits = pmap_pp_attrs_to_pte(clearbits); } ptep = pmap_map_pte(pmap, ptp, va); do { opte = *ptep; KASSERT((opte & (PTE_D | PTE_A)) != PTE_D); KASSERT((opte & (PTE_A | PTE_P)) != PTE_A); KASSERT(opte == 0 || (opte & PTE_P) != 0); if ((opte & (PTE_FRAME | PTE_P)) != expect) { /* * We lost a race with a V->P operation like * pmap_remove(). Wait for the competitor * reflecting pte bits into mp_attrs. */ pmap_unmap_pte(); return EAGAIN; } /* * Check if there's anything to do on this PTE. */ if ((opte & clearbits) == 0) { need_shootdown = false; break; } /* * We need a shootdown if the PTE is cached (PTE_A) ... * ... Unless we are clearing only the PTE_W bit and * it isn't cached as RW (PTE_D). */ need_shootdown = (opte & PTE_A) != 0 && !(clearbits == PTE_W && (opte & PTE_D) == 0); npte = opte & ~clearbits; /* * If we need a shootdown anyway, clear PTE_A and PTE_D. */ if (need_shootdown) { npte &= ~(PTE_A | PTE_D); } KASSERT((npte & (PTE_D | PTE_A)) != PTE_D); KASSERT((npte & (PTE_A | PTE_P)) != PTE_A); KASSERT(npte == 0 || (opte & PTE_P) != 0); } while (pmap_pte_cas(ptep, opte, npte) != opte); if (need_shootdown) { pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV); } pmap_unmap_pte(); *oattrs = pmap_pte_to_pp_attrs(opte); if (optep != NULL) *optep = opte; return 0; } static void pmap_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte, vaddr_t va) { struct pmap *pmap2; pt_entry_t *ptes; pd_entry_t * const *pdes; KASSERT(mutex_owned(&pmap->pm_lock)); pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); pmap_stats_update_bypte(pmap, 0, opte); ptp->wire_count--; if (ptp->wire_count <= 1) { pmap_free_ptp(pmap, ptp, va, ptes, pdes); } pmap_unmap_ptes(pmap, pmap2); } static void pmap_pp_remove(struct pmap_page *pp, paddr_t pa) { struct pv_pte *pvpte; struct vm_page *ptp; uintptr_t sum; uint8_t oattrs; bool locked; /* * Do an unlocked check to see if the page has no mappings, eg when * pmap_remove_all() was called before amap_wipeout() for a process * private amap - common. 
The page being removed must be on the way * out, so we don't have to worry about concurrent attempts to enter * it (otherwise the caller either doesn't care or has screwed up). */ sum = (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_va); sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_ptp); sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pvlist.lh_first); if (sum == 0) { return; } kpreempt_disable(); for (;;) { struct pmap *pmap; struct pv_entry *pve; pt_entry_t opte; vaddr_t va; mutex_spin_enter(&pp->pp_lock); if ((pvpte = pv_pte_first(pp)) == NULL) { mutex_spin_exit(&pp->pp_lock); break; } /* * Add a reference to the pmap before clearing the pte. * Otherwise the pmap can disappear behind us. */ ptp = pvpte->pte_ptp; pmap = ptp_to_pmap(ptp); KASSERT(pmap->pm_obj[0].uo_refs > 0); if (ptp != NULL) { pmap_reference(pmap); } /* * Now try to lock it. We need a direct handoff between * pp_lock and pm_lock to know the pv_entry is kept intact * and kept associated with this pmap. If that can't be * had, wait for the pmap's lock to become free and then * retry. */ locked = mutex_tryenter(&pmap->pm_lock); mutex_spin_exit(&pp->pp_lock); if (!locked) { mutex_enter(&pmap->pm_lock); /* nothing, just wait for it */ mutex_exit(&pmap->pm_lock); if (ptp != NULL) { pmap_destroy(pmap); } continue; } va = pvpte->pte_va; KASSERTMSG(pmap->pm_stats.resident_count > PDP_SIZE, "va %lx pmap %p ptp %p is empty", va, pmap, ptp); KASSERTMSG(ptp == NULL || (ptp->flags & PG_FREE) == 0, "va %lx pmap %p ptp %p is free", va, pmap, ptp); KASSERTMSG(ptp == NULL || ptp->wire_count > 1, "va %lx pmap %p ptp %p is empty", va, pmap, ptp); #ifdef DEBUG pmap_check_pv(pmap, ptp, pp, pvpte->pte_va, true); rb_tree_t *tree = (ptp != NULL ? &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb); pve = pmap_treelookup_pv(pmap, ptp, tree, va); if (pve == NULL) { KASSERTMSG(&pp->pp_pte == pvpte, "va %lx pmap %p ptp %p pvpte %p pve %p oops 1", va, pmap, ptp, pvpte, pve); } else { KASSERTMSG(&pve->pve_pte == pvpte, "va %lx pmap %p ptp %p pvpte %p pve %p oops 2", va, pmap, ptp, pvpte, pve); } #endif if (pmap_sync_pv(pvpte, pa, ~0, &oattrs, &opte)) { panic("pmap_pp_remove: mapping not present"); } pve = pmap_lookup_pv(pmap, ptp, pp, va); pmap_remove_pv(pmap, pp, ptp, va, pve, oattrs); /* Update the PTP reference count. Free if last reference. */ if (ptp != NULL) { KASSERT(pmap != pmap_kernel()); pmap_tlb_shootnow(); if (__predict_false(pmap->pm_pp_remove_ent != NULL)) { (*pmap->pm_pp_remove_ent)(pmap, ptp, opte, va); } else { pmap_pp_remove_ent(pmap, ptp, opte, va); } } else { KASSERT(pmap == pmap_kernel()); pmap_stats_update_bypte(pmap, 0, opte); } pmap_tlb_shootnow(); pmap_drain_pv(pmap); mutex_exit(&pmap->pm_lock); if (ptp != NULL) { pmap_destroy(pmap); } } kpreempt_enable(); } /* * pmap_page_remove: remove a managed vm_page from all pmaps that map it * * => R/M bits are sync'd back to attrs */ void pmap_page_remove(struct vm_page *pg) { struct pmap_page *pp; paddr_t pa; pp = VM_PAGE_TO_PP(pg); pa = VM_PAGE_TO_PHYS(pg); pmap_pp_remove(pp, pa); } /* * pmap_pv_remove: remove an unmanaged pv-tracked page from all pmaps * that map it */ void pmap_pv_remove(paddr_t pa) { struct pmap_page *pp; pp = pmap_pv_tracked(pa); if (pp == NULL) panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa); pmap_pp_remove(pp, pa); } /* * p m a p a t t r i b u t e f u n c t i o n s * functions that test/change managed page's attributes * since a page can be mapped multiple times we must check each PTE that * maps it by going down the pv lists. 
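 *
 * The per-page attribute bits live in pp_attrs and are synchronized
 * lazily from the hardware A/D bits: pmap_sync_pv() harvests them from
 * each PTE and the callers below OR the result into pp_attrs.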
*/ /* * pmap_test_attrs: test a page's attributes */ bool pmap_test_attrs(struct vm_page *pg, unsigned testbits) { struct pmap_page *pp; struct pv_pte *pvpte; struct pmap *pmap; uint8_t oattrs; u_int result; paddr_t pa; pp = VM_PAGE_TO_PP(pg); if ((pp->pp_attrs & testbits) != 0) { return true; } pa = VM_PAGE_TO_PHYS(pg); startover: mutex_spin_enter(&pp->pp_lock); for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { if ((pp->pp_attrs & testbits) != 0) { break; } if (pmap_sync_pv(pvpte, pa, 0, &oattrs, NULL)) { /* * raced with a V->P operation. wait for the other * side to finish by acquiring pmap's lock. if no * wait, updates to pp_attrs by the other side may * go unseen. */ pmap = ptp_to_pmap(pvpte->pte_ptp); pmap_reference(pmap); mutex_spin_exit(&pp->pp_lock); mutex_enter(&pmap->pm_lock); /* nothing. */ mutex_exit(&pmap->pm_lock); pmap_destroy(pmap); goto startover; } pp->pp_attrs |= oattrs; } result = pp->pp_attrs & testbits; mutex_spin_exit(&pp->pp_lock); /* * note that we will exit the for loop with a non-null pve if * we have found the bits we are testing for. */ return result != 0; } static bool pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits) { struct pv_pte *pvpte; struct pmap *pmap; uint8_t oattrs; u_int result; startover: mutex_spin_enter(&pp->pp_lock); for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { if (pmap_sync_pv(pvpte, pa, clearbits, &oattrs, NULL)) { /* * raced with a V->P operation. wait for the other * side to finish by acquiring pmap's lock. it is * probably unmapping the page, and it will be gone * when the loop is restarted. */ pmap = ptp_to_pmap(pvpte->pte_ptp); pmap_reference(pmap); mutex_spin_exit(&pp->pp_lock); mutex_enter(&pmap->pm_lock); /* nothing. */ mutex_exit(&pmap->pm_lock); pmap_destroy(pmap); goto startover; } pp->pp_attrs |= oattrs; } result = pp->pp_attrs & clearbits; pp->pp_attrs &= ~clearbits; pmap_tlb_shootnow(); mutex_spin_exit(&pp->pp_lock); return result != 0; } /* * pmap_clear_attrs: clear the specified attribute for a page. * * => we return true if we cleared one of the bits we were asked to */ bool pmap_clear_attrs(struct vm_page *pg, unsigned clearbits) { struct pmap_page *pp; paddr_t pa; pp = VM_PAGE_TO_PP(pg); pa = VM_PAGE_TO_PHYS(pg); /* * If this is a new page, assert it has no mappings and simply zap * the stored attributes without taking any locks. */ if ((pg->flags & PG_FAKE) != 0) { KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_va) == 0); KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_ptp) == NULL); KASSERT(atomic_load_relaxed(&pp->pp_pvlist.lh_first) == NULL); atomic_store_relaxed(&pp->pp_attrs, 0); return false; } else { return pmap_pp_clear_attrs(pp, pa, clearbits); } } /* * pmap_pv_clear_attrs: clear the specified attributes for an unmanaged * pv-tracked page. 
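 *
 * Panics if the physical address is not pv-tracked; otherwise the work
 * is done by the same pmap_pp_clear_attrs() used by pmap_clear_attrs().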
*/ bool pmap_pv_clear_attrs(paddr_t pa, unsigned clearbits) { struct pmap_page *pp; pp = pmap_pv_tracked(pa); if (pp == NULL) panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa); return pmap_pp_clear_attrs(pp, pa, clearbits); } /* * p m a p p r o t e c t i o n f u n c t i o n s */ /* * pmap_page_protect: change the protection of all recorded mappings * of a managed page * * => NOTE: this is an inline function in pmap.h */ /* see pmap.h */ /* * pmap_pv_protect: change the protection of all recorded mappings * of an unmanaged pv-tracked page * * => NOTE: this is an inline function in pmap.h */ /* see pmap.h */ /* * pmap_protect: set the protection in of the pages in a pmap * * => NOTE: this is an inline function in pmap.h */ /* see pmap.h */ /* * pmap_write_protect: write-protect pages in a pmap. * * Note for Xen-amd64. Xen automatically adds PTE_U to the kernel pages, but we * don't need to remove this bit when re-entering the PTEs here: Xen tracks the * kernel pages with a reserved bit (_PAGE_GUEST_KERNEL), so even if PTE_U is * present the page will still be considered as a kernel page, and the privilege * separation will be enforced correctly. */ void pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot) { pt_entry_t bit_rem, bit_put; pt_entry_t *ptes; pt_entry_t * const *pdes; struct pmap *pmap2; vaddr_t blockend, va; int lvl, i; if (__predict_false(pmap->pm_write_protect != NULL)) { (*pmap->pm_write_protect)(pmap, sva, eva, prot); return; } bit_rem = 0; if (!(prot & VM_PROT_WRITE)) bit_rem = PTE_W; bit_put = 0; if (!(prot & VM_PROT_EXECUTE)) bit_put = pmap_pg_nx; sva &= ~PAGE_MASK; eva &= ~PAGE_MASK; /* * Acquire pmap. No need to lock the kernel pmap as we won't * be touching PV entries nor stats and kernel PDEs aren't * freed. */ if (pmap != pmap_kernel()) { mutex_enter(&pmap->pm_lock); } pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); for (va = sva ; va < eva; va = blockend) { pt_entry_t *spte, *epte; blockend = x86_round_pdr(va + 1); if (blockend > eva) blockend = eva; /* Is it a valid block? */ if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) { continue; } KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS); KASSERT(lvl == 1); spte = &ptes[pl1_i(va)]; epte = &ptes[pl1_i(blockend)]; for (i = 0; spte < epte; spte++, i++) { pt_entry_t opte, npte; do { opte = *spte; if (!pmap_valid_entry(opte)) { goto next; } npte = (opte & ~bit_rem) | bit_put; } while (pmap_pte_cas(spte, opte, npte) != opte); if ((opte & PTE_D) != 0) { vaddr_t tva = va + x86_ptob(i); pmap_tlb_shootdown(pmap, tva, opte, TLBSHOOT_WRITE_PROTECT); } next:; } } /* Release pmap. */ pmap_unmap_ptes(pmap, pmap2); if (pmap != pmap_kernel()) { mutex_exit(&pmap->pm_lock); } } /* * pmap_unwire: clear the wired bit in the PTE. * * => Mapping should already be present. */ void pmap_unwire(struct pmap *pmap, vaddr_t va) { pt_entry_t *ptes, *ptep, opte; pd_entry_t * const *pdes; struct pmap *pmap2; int lvl; if (__predict_false(pmap->pm_unwire != NULL)) { (*pmap->pm_unwire)(pmap, va); return; } /* * Acquire pmap. Need to lock the kernel pmap only to protect the * statistics. 
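	 * (Contrast with pmap_write_protect() above, which skips the
	 * kernel pmap's lock entirely because it touches neither stats
	 * nor PV entries.)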
*/ mutex_enter(&pmap->pm_lock); pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) { panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va); } KASSERT(lvl == 1); ptep = &ptes[pl1_i(va)]; opte = *ptep; KASSERT(pmap_valid_entry(opte)); if (opte & PTE_WIRED) { pt_entry_t npte = opte & ~PTE_WIRED; opte = pmap_pte_testset(ptep, npte); pmap_stats_update_bypte(pmap, npte, opte); } else { printf("%s: wiring for pmap %p va %#" PRIxVADDR " did not change!\n", __func__, pmap, va); } /* Release pmap. */ pmap_unmap_ptes(pmap, pmap2); mutex_exit(&pmap->pm_lock); } /* * pmap_copy: copy mappings from one pmap to another * * => optional function * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) */ /* * defined as macro in pmap.h */ __strict_weak_alias(pmap_enter, pmap_enter_default); int pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags) { if (__predict_false(pmap->pm_enter != NULL)) { return (*pmap->pm_enter)(pmap, va, pa, prot, flags); } return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0); } /* * pmap_enter: enter a mapping into a pmap * * => must be done "now" ... no lazy-evaluation */ int pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa, vm_prot_t prot, u_int flags, int domid) { pt_entry_t *ptes, opte, npte; pt_entry_t *ptep; pd_entry_t * const *pdes; struct vm_page *ptp; struct vm_page *new_pg, *old_pg; struct pmap_page *new_pp, *old_pp; struct pv_entry *old_pve, *new_pve; bool wired = (flags & PMAP_WIRED) != 0; struct pmap *pmap2; struct pmap_ptparray pt; int error; bool getptp, samepage, new_embedded; rb_tree_t *tree; KASSERT(pmap_initialized); KASSERT(va < VM_MAX_KERNEL_ADDRESS); KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#" PRIxVADDR " over PDP!", __func__, va); KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS || pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]), "%s: missing kernel PTP for va=%#" PRIxVADDR, __func__, va); #ifdef XENPV KASSERT(domid == DOMID_SELF || pa == 0); #endif npte = ma | protection_codes[prot] | PTE_P; npte |= pmap_pat_flags(flags); if (wired) npte |= PTE_WIRED; if (va < VM_MAXUSER_ADDRESS) { KASSERTMSG(pmap != pmap_kernel(), "entering user va %#"PRIxVADDR" into kernel pmap", va); if (pmap_is_user(pmap)) npte |= PTE_U; } if (pmap == pmap_kernel()) npte |= pmap_pg_g; if (flags & VM_PROT_ALL) { npte |= PTE_A; if (flags & VM_PROT_WRITE) { KASSERT((npte & PTE_W) != 0); npte |= PTE_D; } } #ifdef XENPV if (domid != DOMID_SELF) new_pg = NULL; else #endif new_pg = PHYS_TO_VM_PAGE(pa); if (new_pg != NULL) { /* This is a managed page */ npte |= PTE_PVLIST; new_pp = VM_PAGE_TO_PP(new_pg); PMAP_CHECK_PP(new_pp); } else if ((new_pp = pmap_pv_tracked(pa)) != NULL) { /* This is an unmanaged pv-tracked page */ npte |= PTE_PVLIST; PMAP_CHECK_PP(new_pp); } else { new_pp = NULL; } /* Begin by locking the pmap. */ mutex_enter(&pmap->pm_lock); /* Look up the PTP. Allocate if none present. */ ptp = NULL; getptp = false; if (pmap != pmap_kernel()) { ptp = pmap_find_ptp(pmap, va, 1); if (ptp == NULL) { getptp = true; error = pmap_get_ptp(pmap, &pt, va, flags, &ptp); if (error != 0) { if (flags & PMAP_CANFAIL) { mutex_exit(&pmap->pm_lock); return error; } panic("%s: get ptp failed, error=%d", __func__, error); } } tree = &VM_PAGE_TO_PP(ptp)->pp_rb; } else { /* Embedded PV entries rely on this. */ KASSERT(va != 0); tree = &pmap_kernel_rb; } /* * Look up the old PV entry at this VA (if any), and insert a new PV * entry if required for the new mapping. 
Temporarily track the old * and new mappings concurrently. Only after the old mapping is * evicted from the pmap will we remove its PV entry. Otherwise, * our picture of modified/accessed state for either page could get * out of sync (we need any P->V operation for either page to stall * on pmap->pm_lock until done here). */ new_pve = NULL; old_pve = NULL; samepage = false; new_embedded = false; if (new_pp != NULL) { error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve, &old_pve, &samepage, &new_embedded, tree); /* * If a new pv_entry was needed and none was available, we * can go no further. */ if (error != 0) { if (flags & PMAP_CANFAIL) { if (getptp) { pmap_unget_ptp(pmap, &pt); } mutex_exit(&pmap->pm_lock); return error; } panic("%s: alloc pve failed", __func__); } } else { old_pve = pmap_treelookup_pv(pmap, ptp, tree, va); } /* Map PTEs into address space. */ pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* Install any newly allocated PTPs. */ if (getptp) { pmap_install_ptp(pmap, &pt, va, pdes); } /* Check if there is an existing mapping. */ ptep = &ptes[pl1_i(va)]; opte = *ptep; bool have_oldpa = pmap_valid_entry(opte); paddr_t oldpa = pmap_pte2pa(opte); /* * Update the pte. */ do { opte = *ptep; /* * if the same page, inherit PTE_A and PTE_D. */ if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) { npte |= opte & (PTE_A | PTE_D); } #if defined(XENPV) if (domid != DOMID_SELF) { /* pmap_pte_cas with error handling */ int s = splvm(); if (opte != *ptep) { splx(s); continue; } error = xpq_update_foreign( vtomach((vaddr_t)ptep), npte, domid, flags); splx(s); if (error) { /* Undo pv_entry tracking - oof. */ if (new_pp != NULL) { mutex_spin_enter(&new_pp->pp_lock); if (new_pve != NULL) { LIST_REMOVE(new_pve, pve_list); KASSERT(pmap->pm_pve == NULL); pmap->pm_pve = new_pve; } else if (new_embedded) { new_pp->pp_pte.pte_ptp = NULL; new_pp->pp_pte.pte_va = 0; } mutex_spin_exit(&new_pp->pp_lock); } pmap_unmap_ptes(pmap, pmap2); /* Free new PTP. */ if (ptp != NULL && ptp->wire_count <= 1) { pmap_free_ptp(pmap, ptp, va, ptes, pdes); } mutex_exit(&pmap->pm_lock); return error; } break; } #endif /* defined(XENPV) */ } while (pmap_pte_cas(ptep, opte, npte) != opte); /* * Done with the PTEs: they can now be unmapped. */ pmap_unmap_ptes(pmap, pmap2); /* * Update statistics and PTP's reference count. */ pmap_stats_update_bypte(pmap, npte, opte); if (ptp != NULL) { if (!have_oldpa) { ptp->wire_count++; } /* Remember minimum VA in PTP. */ pmap_ptp_range_set(ptp, va); } KASSERT(ptp == NULL || ptp->wire_count > 1); /* * If the same page, we can skip pv_entry handling. */ if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) { KASSERT(((opte ^ npte) & PTE_PVLIST) == 0); if ((npte & PTE_PVLIST) != 0) { KASSERT(samepage); pmap_check_pv(pmap, ptp, new_pp, va, true); } goto same_pa; } else if ((npte & PTE_PVLIST) != 0) { KASSERT(!samepage); } /* * If old page is pv-tracked, remove pv_entry from its list. */ if ((~opte & (PTE_P | PTE_PVLIST)) == 0) { if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) { old_pp = VM_PAGE_TO_PP(old_pg); } else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) { panic("%s: PTE_PVLIST with pv-untracked page" " va = %#"PRIxVADDR " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")", __func__, va, oldpa, atop(pa)); } pmap_remove_pv(pmap, old_pp, ptp, va, old_pve, pmap_pte_to_pp_attrs(opte)); } else { KASSERT(old_pve == NULL); KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); } /* * If new page is dynamically PV tracked, insert to tree. 
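 * An embedded PV entry (new_embedded) was already published by
 * pmap_enter_pv() and needs no rb-tree insertion.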
*/ if (new_pve != NULL) { KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); old_pve = rb_tree_insert_node(tree, new_pve); KASSERT(old_pve == new_pve); pmap_check_pv(pmap, ptp, new_pp, va, true); } same_pa: /* * shootdown tlb if necessary. */ if ((~opte & (PTE_P | PTE_A)) == 0 && ((opte ^ npte) & (PTE_FRAME | PTE_W)) != 0) { pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER); } pmap_drain_pv(pmap); mutex_exit(&pmap->pm_lock); return 0; } #if defined(XEN) && defined(DOM0OPS) struct pmap_data_gnt { SLIST_ENTRY(pmap_data_gnt) pd_gnt_list; vaddr_t pd_gnt_sva; vaddr_t pd_gnt_eva; /* range covered by this gnt */ int pd_gnt_refs; /* ref counter */ struct gnttab_map_grant_ref pd_gnt_ops[1]; /* variable length */ }; SLIST_HEAD(pmap_data_gnt_head, pmap_data_gnt); static void pmap_remove_gnt(struct pmap *, vaddr_t, vaddr_t); static struct pmap_data_gnt * pmap_find_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva) { struct pmap_data_gnt_head *headp; struct pmap_data_gnt *pgnt; KASSERT(mutex_owned(&pmap->pm_lock)); headp = pmap->pm_data; KASSERT(headp != NULL); SLIST_FOREACH(pgnt, headp, pd_gnt_list) { if (pgnt->pd_gnt_sva <= sva && eva <= pgnt->pd_gnt_eva) return pgnt; /* check that we're not overlapping part of a region */ KASSERT(pgnt->pd_gnt_sva >= eva || pgnt->pd_gnt_eva <= sva); } return NULL; } static void pmap_alloc_gnt(struct pmap *pmap, vaddr_t sva, int nentries, const struct gnttab_map_grant_ref *ops) { struct pmap_data_gnt_head *headp; struct pmap_data_gnt *pgnt; vaddr_t eva = sva + nentries * PAGE_SIZE; KASSERT(mutex_owned(&pmap->pm_lock)); KASSERT(nentries >= 1); if (pmap->pm_remove == NULL) { pmap->pm_remove = pmap_remove_gnt; KASSERT(pmap->pm_data == NULL); headp = kmem_alloc(sizeof(*headp), KM_SLEEP); SLIST_INIT(headp); pmap->pm_data = headp; } else { KASSERT(pmap->pm_remove == pmap_remove_gnt); KASSERT(pmap->pm_data != NULL); headp = pmap->pm_data; } pgnt = pmap_find_gnt(pmap, sva, eva); if (pgnt != NULL) { KASSERT(pgnt->pd_gnt_sva == sva); KASSERT(pgnt->pd_gnt_eva == eva); return; } /* new entry */ pgnt = kmem_alloc(sizeof(*pgnt) + (nentries - 1) * sizeof(struct gnttab_map_grant_ref), KM_SLEEP); pgnt->pd_gnt_sva = sva; pgnt->pd_gnt_eva = eva; pgnt->pd_gnt_refs = 0; memcpy(pgnt->pd_gnt_ops, ops, sizeof(struct gnttab_map_grant_ref) * nentries); SLIST_INSERT_HEAD(headp, pgnt, pd_gnt_list); } static void pmap_free_gnt(struct pmap *pmap, struct pmap_data_gnt *pgnt) { struct pmap_data_gnt_head *headp = pmap->pm_data; int nentries = (pgnt->pd_gnt_eva - pgnt->pd_gnt_sva) / PAGE_SIZE; KASSERT(nentries >= 1); KASSERT(mutex_owned(&pmap->pm_lock)); KASSERT(pgnt->pd_gnt_refs == 0); SLIST_REMOVE(headp, pgnt, pmap_data_gnt, pd_gnt_list); kmem_free(pgnt, sizeof(*pgnt) + (nentries - 1) * sizeof(struct gnttab_map_grant_ref)); if (SLIST_EMPTY(headp)) { kmem_free(headp, sizeof(*headp)); pmap->pm_data = NULL; pmap->pm_remove = NULL; } } /* * pmap_enter_gnt: enter a grant entry into a pmap * * => must be done "now" ... 
no lazy-evaluation */ int pmap_enter_gnt(struct pmap *pmap, vaddr_t va, vaddr_t sva, int nentries, const struct gnttab_map_grant_ref *oops) { struct pmap_data_gnt *pgnt; pt_entry_t *ptes, opte; #ifndef XENPV pt_entry_t npte; #endif pt_entry_t *ptep; pd_entry_t * const *pdes; struct vm_page *ptp; struct vm_page *old_pg; struct pmap_page *old_pp; struct pv_entry *old_pve; struct pmap *pmap2; struct pmap_ptparray pt; int error; bool getptp; rb_tree_t *tree; struct gnttab_map_grant_ref *op; int ret; int idx; KASSERT(pmap_initialized); KASSERT(va < VM_MAX_KERNEL_ADDRESS); KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#" PRIxVADDR " over PDP!", __func__, va); KASSERT(pmap != pmap_kernel()); /* Begin by locking the pmap. */ mutex_enter(&pmap->pm_lock); pmap_alloc_gnt(pmap, sva, nentries, oops); pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE); KASSERT(pgnt != NULL); /* Look up the PTP. Allocate if none present. */ ptp = NULL; getptp = false; ptp = pmap_find_ptp(pmap, va, 1); if (ptp == NULL) { getptp = true; error = pmap_get_ptp(pmap, &pt, va, PMAP_CANFAIL, &ptp); if (error != 0) { mutex_exit(&pmap->pm_lock); return error; } } tree = &VM_PAGE_TO_PP(ptp)->pp_rb; /* * Look up the old PV entry at this VA (if any), and insert a new PV * entry if required for the new mapping. Temporarily track the old * and new mappings concurrently. Only after the old mapping is * evicted from the pmap will we remove its PV entry. Otherwise, * our picture of modified/accessed state for either page could get * out of sync (we need any P->V operation for either page to stall * on pmap->pm_lock until done here). */ old_pve = NULL; old_pve = pmap_treelookup_pv(pmap, ptp, tree, va); /* Map PTEs into address space. */ pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* Install any newly allocated PTPs. */ if (getptp) { pmap_install_ptp(pmap, &pt, va, pdes); } /* Check if there is an existing mapping. */ ptep = &ptes[pl1_i(va)]; opte = *ptep; bool have_oldpa = pmap_valid_entry(opte); paddr_t oldpa = pmap_pte2pa(opte); /* * Update the pte. */ idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE; op = &pgnt->pd_gnt_ops[idx]; #ifdef XENPV KASSERT(op->flags & GNTMAP_contains_pte); op->host_addr = xpmap_ptetomach(ptep); #else KASSERT((op->flags & GNTMAP_contains_pte) == 0); KASSERT(op->flags != 0); KASSERT(op->host_addr != 0); #endif op->dev_bus_addr = 0; op->status = GNTST_general_error; ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1); if (__predict_false(ret)) { printf("%s: GNTTABOP_map_grant_ref failed: %d\n", __func__, ret); op->status = GNTST_general_error; } for (int d = 0; d < 256 && op->status == GNTST_eagain; d++) { kpause("gntmap", false, mstohz(1), NULL); ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1); if (__predict_false(ret)) { printf("%s: GNTTABOP_map_grant_ref failed: %d\n", __func__, ret); op->status = GNTST_general_error; } } if (__predict_false(op->status != GNTST_okay)) { printf("%s: GNTTABOP_map_grant_ref status: %d\n", __func__, op->status); if (have_oldpa) { /* XXX did the pte really change if XENPV ?*/ ptp->wire_count--; } } else { #ifndef XENPV npte = op->host_addr | pmap_pg_nx | PTE_U | PTE_P; if ((op->flags & GNTMAP_readonly) == 0) npte |= PTE_W; do { opte = *ptep; } while (pmap_pte_cas(ptep, opte, npte) != opte); #endif pgnt->pd_gnt_refs++; if (!have_oldpa) { ptp->wire_count++; } KASSERT(ptp->wire_count > 1); /* Remember minimum VA in PTP. 
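 * pmap_ptp_range_set() records the lowest VA mapped by this PTP so that
 * later removals can clip their scan range (see pmap_ptp_range_clip()).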
*/ pmap_ptp_range_set(ptp, va); } if (ptp->wire_count <= 1) pmap_free_ptp(pmap, ptp, va, ptes, pdes); /* * Done with the PTEs: they can now be unmapped. */ pmap_unmap_ptes(pmap, pmap2); /* * Update statistics and PTP's reference count. */ pmap_stats_update_bypte(pmap, 0, opte); /* * If old page is pv-tracked, remove pv_entry from its list. */ if ((~opte & (PTE_P | PTE_PVLIST)) == 0) { if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) { old_pp = VM_PAGE_TO_PP(old_pg); } else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) { panic("%s: PTE_PVLIST with pv-untracked page" " va = %#"PRIxVADDR " pa = %#" PRIxPADDR, __func__, va, oldpa); } pmap_remove_pv(pmap, old_pp, ptp, va, old_pve, pmap_pte_to_pp_attrs(opte)); } else { KASSERT(old_pve == NULL); KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); } pmap_drain_pv(pmap); mutex_exit(&pmap->pm_lock); return op->status; } /* * pmap_remove_gnt: grant mapping removal function. * * => caller should not be holding any pmap locks */ static void pmap_remove_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva) { struct pmap_data_gnt *pgnt; pt_entry_t *ptes; pd_entry_t pde; pd_entry_t * const *pdes; struct vm_page *ptp; struct pmap *pmap2; vaddr_t va; int lvl; int idx; struct gnttab_map_grant_ref *op; struct gnttab_unmap_grant_ref unmap_op; int ret; KASSERT(pmap != pmap_kernel()); KASSERT(pmap->pm_remove == pmap_remove_gnt); mutex_enter(&pmap->pm_lock); for (va = sva; va < eva; va += PAGE_SIZE) { pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE); if (pgnt == NULL) { pmap_remove_locked(pmap, sva, eva); continue; } pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) { panic("pmap_remove_gnt pdes not valid"); } idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE; op = &pgnt->pd_gnt_ops[idx]; KASSERT(lvl == 1); /* Get PTP if non-kernel mapping. */ ptp = pmap_find_ptp(pmap, va, 1); KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected", __func__); if (op->status == GNTST_okay) { KASSERT(pmap_valid_entry(ptes[pl1_i(va)])); #ifdef XENPV unmap_op.host_addr = xpmap_ptetomach(&ptes[pl1_i(va)]); #else unmap_op.host_addr = op->host_addr; pmap_pte_testset(&ptes[pl1_i(va)], 0); #endif unmap_op.handle = op->handle; unmap_op.dev_bus_addr = 0; ret = HYPERVISOR_grant_table_op( GNTTABOP_unmap_grant_ref, &unmap_op, 1); if (ret) { printf("%s: GNTTABOP_unmap_grant_ref " "failed: %d\n", __func__, ret); } ptp->wire_count--; pgnt->pd_gnt_refs--; } if (pgnt->pd_gnt_refs == 0) { pmap_free_gnt(pmap, pgnt); } /* * if mapping removed and the PTP is no longer * being used, free it! */ if (ptp->wire_count <= 1) pmap_free_ptp(pmap, ptp, va, ptes, pdes); pmap_unmap_ptes(pmap, pmap2); } mutex_exit(&pmap->pm_lock); } #endif /* XEN && DOM0OPS */ paddr_t pmap_get_physpage(void) { struct vm_page *ptp; struct pmap *kpm = pmap_kernel(); paddr_t pa; if (!uvm.page_init_done) { /* * We're growing the kernel pmap early (from * uvm_pageboot_alloc()). This case must be * handled a little differently. 
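 * The page is taken straight from the physical memory list with
 * uvm_page_physget() and zeroed through the direct map, or through the
 * early_zerop window when no direct map is available.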
*/ if (!uvm_page_physget(&pa)) panic("%s: out of memory", __func__); #if defined(__HAVE_DIRECT_MAP) memset(PAGE_ALIGNED(PMAP_DIRECT_MAP(pa)), 0, PAGE_SIZE); #else #if defined(XENPV) if (XEN_VERSION_SUPPORTED(3, 4)) { xen_pagezero(pa); return pa; } #endif kpreempt_disable(); pmap_pte_set(early_zero_pte, pmap_pa2pte(pa) | PTE_P | PTE_W | pmap_pg_nx); pmap_pte_flush(); pmap_update_pg((vaddr_t)early_zerop); memset(PAGE_ALIGNED(early_zerop), 0, PAGE_SIZE); #if defined(DIAGNOSTIC) || defined(XENPV) pmap_pte_set(early_zero_pte, 0); pmap_pte_flush(); #endif /* defined(DIAGNOSTIC) */ kpreempt_enable(); #endif /* defined(__HAVE_DIRECT_MAP) */ } else { /* XXX */ ptp = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_USERESERVE|UVM_PGA_ZERO); if (ptp == NULL) panic("%s: out of memory", __func__); ptp->flags &= ~PG_BUSY; ptp->wire_count = 1; pa = VM_PAGE_TO_PHYS(ptp); } pmap_stats_update(kpm, 1, 0); return pa; } /* * Expand the page tree with the specified amount of PTPs, mapping virtual * addresses starting at kva. We populate all the levels but the last one * (L1). The nodes of the tree are created as RW, but the pages covered * will be kentered in L1, with proper permissions. * * Used only by pmap_growkernel. */ static void pmap_alloc_level(struct pmap *cpm, vaddr_t kva, long *needed_ptps) { unsigned long i; paddr_t pa; unsigned long index, endindex; int level; pd_entry_t *pdep; #ifdef XENPV int s = splvm(); /* protect xpq_* */ #endif for (level = PTP_LEVELS; level > 1; level--) { if (level == PTP_LEVELS) pdep = cpm->pm_pdir; else pdep = normal_pdes[level - 2]; index = pl_i_roundup(kva, level); endindex = index + needed_ptps[level - 1] - 1; for (i = index; i <= endindex; i++) { pt_entry_t pte; KASSERT(!pmap_valid_entry(pdep[i])); pa = pmap_get_physpage(); pte = pmap_pa2pte(pa) | PTE_P | PTE_W; #ifdef __x86_64__ pte |= pmap_pg_nx; #endif pmap_pte_set(&pdep[i], pte); #ifdef XENPV if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) { if (__predict_true( cpu_info_primary.ci_flags & CPUF_PRESENT)) { /* update per-cpu PMDs on all cpus */ xen_kpm_sync(pmap_kernel(), i); } else { /* * too early; update primary CPU * PMD only (without locks) */ #ifdef __x86_64__ pd_entry_t *cpu_pdep = &cpu_info_primary.ci_kpm_pdir[i]; #else pd_entry_t *cpu_pdep = &cpu_info_primary.ci_kpm_pdir[l2tol2(i)]; #endif pmap_pte_set(cpu_pdep, pte); } } #endif KASSERT(level != PTP_LEVELS || nkptp[level - 1] + pl_i(VM_MIN_KERNEL_ADDRESS, level) == i); nkptp[level - 1]++; } pmap_pte_flush(); } #ifdef XENPV splx(s); #endif } /* * pmap_growkernel: increase usage of KVM space. * * => we allocate new PTPs for the kernel and install them in all * the pmaps on the system. */ vaddr_t pmap_growkernel(vaddr_t maxkvaddr) { struct pmap *kpm = pmap_kernel(); struct pmap *cpm; #if !defined(XENPV) || !defined(__x86_64__) struct pmap *pm; long old; #endif int s, i; long needed_kptp[PTP_LEVELS], target_nptp; bool invalidate = false; s = splvm(); /* to be safe */ mutex_enter(&kpm->pm_lock); if (maxkvaddr <= pmap_maxkvaddr) { mutex_exit(&kpm->pm_lock); splx(s); return pmap_maxkvaddr; } maxkvaddr = x86_round_pdr(maxkvaddr); #if !defined(XENPV) || !defined(__x86_64__) old = nkptp[PTP_LEVELS - 1]; #endif /* Initialize needed_kptp. 
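 * For each level, needed_kptp[] records how many additional page table
 * pages must be allocated to map KVA up to the rounded maxkvaddr;
 * pmap_alloc_level() performs the allocation below.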
*/ for (i = PTP_LEVELS - 1; i >= 1; i--) { target_nptp = pl_i_roundup(maxkvaddr, i + 1) - pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1); if (target_nptp > nkptpmax[i]) panic("out of KVA space"); KASSERT(target_nptp >= nkptp[i]); needed_kptp[i] = target_nptp - nkptp[i]; } #ifdef XENPV /* only pmap_kernel(), or the per-cpu map, has kernel entries */ cpm = kpm; #else /* Get the current pmap */ if (__predict_true(cpu_info_primary.ci_flags & CPUF_PRESENT)) { cpm = curcpu()->ci_pmap; } else { cpm = kpm; } #endif kasan_shadow_map((void *)pmap_maxkvaddr, (size_t)(maxkvaddr - pmap_maxkvaddr)); kmsan_shadow_map((void *)pmap_maxkvaddr, (size_t)(maxkvaddr - pmap_maxkvaddr)); pmap_alloc_level(cpm, pmap_maxkvaddr, needed_kptp); /* * If the number of top level entries changed, update all pmaps. */ if (needed_kptp[PTP_LEVELS - 1] != 0) { #ifdef XENPV #ifdef __x86_64__ /* nothing, kernel entries are never entered in user pmap */ #else int pdkidx; mutex_enter(&pmaps_lock); LIST_FOREACH(pm, &pmaps, pm_list) { for (pdkidx = PDIR_SLOT_KERN + old; pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1]; pdkidx++) { pmap_pte_set(&pm->pm_pdir[pdkidx], kpm->pm_pdir[pdkidx]); } pmap_pte_flush(); } mutex_exit(&pmaps_lock); #endif /* __x86_64__ */ #else /* XENPV */ size_t newpdes; newpdes = nkptp[PTP_LEVELS - 1] - old; if (cpm != kpm) { memcpy(&kpm->pm_pdir[PDIR_SLOT_KERN + old], &cpm->pm_pdir[PDIR_SLOT_KERN + old], newpdes * sizeof(pd_entry_t)); } mutex_enter(&pmaps_lock); LIST_FOREACH(pm, &pmaps, pm_list) { if (__predict_false(pm->pm_enter != NULL)) { /* * Not a native pmap, the kernel is not mapped, * so nothing to synchronize. */ continue; } memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old], &kpm->pm_pdir[PDIR_SLOT_KERN + old], newpdes * sizeof(pd_entry_t)); } mutex_exit(&pmaps_lock); #endif invalidate = true; } pmap_maxkvaddr = maxkvaddr; mutex_exit(&kpm->pm_lock); splx(s); if (invalidate && pmap_initialized) { /* Invalidate the pmap cache. */ pool_cache_invalidate(&pmap_cache); } return maxkvaddr; } #ifdef DEBUG void pmap_dump(struct pmap *, vaddr_t, vaddr_t); /* * pmap_dump: dump all the mappings from a pmap * * => caller should not be holding any pmap locks */ void pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva) { pt_entry_t *ptes, *pte; pd_entry_t * const *pdes; struct pmap *pmap2; vaddr_t blkendva; int lvl; /* * if end is out of range truncate. * if (end == start) update to max. */ if (eva > VM_MAXUSER_ADDRESS || eva <= sva) eva = VM_MAXUSER_ADDRESS; mutex_enter(&pmap->pm_lock); pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* * dumping a range of pages: we dump in PTP sized blocks (4MB) */ for (/* null */ ; sva < eva ; sva = blkendva) { /* determine range of block */ blkendva = x86_round_pdr(sva+1); if (blkendva > eva) blkendva = eva; /* valid block? */ if (!pmap_pdes_valid(sva, pdes, NULL, &lvl)) continue; KASSERT(lvl == 1); pte = &ptes[pl1_i(sva)]; for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) { if (!pmap_valid_entry(*pte)) continue; printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR " (pte=%#" PRIxPADDR ")\n", sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte); } } pmap_unmap_ptes(pmap, pmap2); mutex_exit(&pmap->pm_lock); } #endif /* * pmap_update: process deferred invalidations and frees. */ void pmap_update(struct pmap *pmap) { struct pmap_page *pp; struct vm_page *ptp; /* * Initiate any pending TLB shootdowns. Wait for them to * complete before returning control to the caller. */ kpreempt_disable(); pmap_tlb_shootnow(); kpreempt_enable(); /* * Now that shootdowns are complete, process deferred frees. 
This * is an unlocked check, but is safe as we're only interested in * work done in this LWP - we won't get a false negative. */ if (atomic_load_relaxed(&pmap->pm_gc_ptp.lh_first) == NULL) { return; } mutex_enter(&pmap->pm_lock); while ((ptp = LIST_FIRST(&pmap->pm_gc_ptp)) != NULL) { KASSERT(ptp->wire_count == 0); KASSERT(ptp->uanon == NULL); LIST_REMOVE(ptp, mdpage.mp_pp.pp_link); pp = VM_PAGE_TO_PP(ptp); LIST_INIT(&pp->pp_pvlist); pp->pp_attrs = 0; pp->pp_pte.pte_ptp = NULL; pp->pp_pte.pte_va = 0; PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp)); /* * XXX Hack to avoid extra locking, and lock * assertions in uvm_pagefree(). Despite uobject * being set, this isn't a managed page. */ PMAP_DUMMY_LOCK(pmap); uvm_pagerealloc(ptp, NULL, 0); PMAP_DUMMY_UNLOCK(pmap); uvm_pagefree(ptp); } mutex_exit(&pmap->pm_lock); } #if PTP_LEVELS > 4 #error "Unsupported number of page table mappings" #endif paddr_t pmap_init_tmp_pgtbl(paddr_t pg) { static bool maps_loaded; static const paddr_t x86_tmp_pml_paddr[] = { 4 * PAGE_SIZE, /* L1 */ 5 * PAGE_SIZE, /* L2 */ 6 * PAGE_SIZE, /* L3 */ 7 * PAGE_SIZE /* L4 */ }; static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 }; pd_entry_t *tmp_pml, *kernel_pml; int level; if (!maps_loaded) { for (level = 0; level < PTP_LEVELS; ++level) { x86_tmp_pml_vaddr[level] = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_VAONLY); if (x86_tmp_pml_vaddr[level] == 0) panic("mapping of real mode PML failed\n"); pmap_kenter_pa(x86_tmp_pml_vaddr[level], x86_tmp_pml_paddr[level], VM_PROT_READ | VM_PROT_WRITE, 0); } pmap_update(pmap_kernel()); maps_loaded = true; } /* Zero levels 1-3 */ for (level = 0; level < PTP_LEVELS - 1; ++level) { tmp_pml = (void *)x86_tmp_pml_vaddr[level]; memset(PAGE_ALIGNED(tmp_pml), 0, PAGE_SIZE); } /* Copy PML4 */ kernel_pml = pmap_kernel()->pm_pdir; tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1]; memcpy(PAGE_ALIGNED(tmp_pml), PAGE_ALIGNED(kernel_pml), PAGE_SIZE); #ifdef PAE /* * Use the last 4 entries of the L2 page as L3 PD entries. These * last entries are unlikely to be used for temporary mappings. 
* 508: maps 0->1GB (userland) * 509: unused * 510: unused * 511: maps 3->4GB (kernel) */ tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PTE_P; tmp_pml[509] = 0; tmp_pml[510] = 0; tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PTE_P; #endif for (level = PTP_LEVELS - 1; level > 0; --level) { tmp_pml = (void *)x86_tmp_pml_vaddr[level]; tmp_pml[pl_i(pg, level + 1)] = (x86_tmp_pml_paddr[level - 1] & PTE_FRAME) | PTE_W | PTE_P; } tmp_pml = (void *)x86_tmp_pml_vaddr[0]; tmp_pml[pl_i(pg, 1)] = (pg & PTE_FRAME) | PTE_W | PTE_P; #ifdef PAE /* Return the PA of the L3 page (entry 508 of the L2 page) */ return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t); #endif return x86_tmp_pml_paddr[PTP_LEVELS - 1]; } u_int x86_mmap_flags(paddr_t mdpgno) { u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK; u_int pflag = 0; if (nflag & X86_MMAP_FLAG_PREFETCH) pflag |= PMAP_WRITE_COMBINE; return pflag; } #if defined(__HAVE_DIRECT_MAP) && defined(__x86_64__) && !defined(XENPV) /* * ----------------------------------------------------------------------------- * ***************************************************************************** * ***************************************************************************** * ***************************************************************************** * ***************************************************************************** * **************** HERE BEGINS THE EPT CODE, USED BY INTEL-VMX **************** * ***************************************************************************** * ***************************************************************************** * ***************************************************************************** * ***************************************************************************** * ----------------------------------------------------------------------------- * * These functions are invoked as callbacks from the code above. Contrary to * native, EPT does not have a recursive slot; therefore, it is not possible * to call pmap_map_ptes(). Instead, we use the direct map and walk down the * tree manually. * * Apart from that, the logic is mostly the same as native. Once a pmap has * been created, NVMM calls pmap_ept_transform() to make it an EPT pmap. * After that we're good, and the callbacks will handle the translations * for us. * * ----------------------------------------------------------------------------- */ /* Hardware bits. */ #define EPT_R __BIT(0) /* read */ #define EPT_W __BIT(1) /* write */ #define EPT_X __BIT(2) /* execute */ #define EPT_T __BITS(5,3) /* type */ #define TYPE_UC 0 #define TYPE_WC 1 #define TYPE_WT 4 #define TYPE_WP 5 #define TYPE_WB 6 #define EPT_NOPAT __BIT(6) #define EPT_L __BIT(7) /* large */ #define EPT_A __BIT(8) /* accessed */ #define EPT_D __BIT(9) /* dirty */ /* Software bits. */ #define EPT_PVLIST __BIT(60) #define EPT_WIRED __BIT(61) #define pmap_ept_valid_entry(pte) (pte & EPT_R) bool pmap_ept_has_ad __read_mostly; static inline void pmap_ept_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte) { int resid_diff = ((npte & EPT_R) ? 1 : 0) - ((opte & EPT_R) ? 1 : 0); int wired_diff = ((npte & EPT_WIRED) ? 1 : 0) - ((opte & EPT_WIRED) ? 
1 : 0); KASSERT((npte & (EPT_R | EPT_WIRED)) != EPT_WIRED); KASSERT((opte & (EPT_R | EPT_WIRED)) != EPT_WIRED); pmap_stats_update(pmap, resid_diff, wired_diff); } static pt_entry_t pmap_ept_type(u_int flags) { u_int cacheflags = (flags & PMAP_CACHE_MASK); pt_entry_t ret; switch (cacheflags) { case PMAP_NOCACHE: case PMAP_NOCACHE_OVR: ret = __SHIFTIN(TYPE_UC, EPT_T); break; case PMAP_WRITE_COMBINE: ret = __SHIFTIN(TYPE_WC, EPT_T); break; case PMAP_WRITE_BACK: default: ret = __SHIFTIN(TYPE_WB, EPT_T); break; } ret |= EPT_NOPAT; return ret; } static inline pt_entry_t pmap_ept_prot(vm_prot_t prot) { pt_entry_t res = 0; if (prot & VM_PROT_READ) res |= EPT_R; if (prot & VM_PROT_WRITE) res |= EPT_W; if (prot & VM_PROT_EXECUTE) res |= EPT_X; return res; } static inline uint8_t pmap_ept_to_pp_attrs(pt_entry_t ept) { uint8_t ret = 0; if (pmap_ept_has_ad) { if (ept & EPT_D) ret |= PP_ATTRS_D; if (ept & EPT_A) ret |= PP_ATTRS_A; } else { ret |= (PP_ATTRS_D|PP_ATTRS_A); } if (ept & EPT_W) ret |= PP_ATTRS_W; return ret; } static inline pt_entry_t pmap_pp_attrs_to_ept(uint8_t attrs) { pt_entry_t ept = 0; if (attrs & PP_ATTRS_D) ept |= EPT_D; if (attrs & PP_ATTRS_A) ept |= EPT_A; if (attrs & PP_ATTRS_W) ept |= EPT_W; return ept; } /* * Helper for pmap_ept_free_ptp. * tree[0] = &L2[L2idx] * tree[1] = &L3[L3idx] * tree[2] = &L4[L4idx] */ static void pmap_ept_get_tree(struct pmap *pmap, vaddr_t va, pd_entry_t **tree) { pt_entry_t *pteva; paddr_t ptepa; int i, index; ptepa = pmap->pm_pdirpa[0]; for (i = PTP_LEVELS; i > 1; i--) { index = pl_pi(va, i); pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa); KASSERT(pmap_ept_valid_entry(pteva[index])); tree[i - 2] = &pteva[index]; ptepa = pmap_pte2pa(pteva[index]); } } static void pmap_ept_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va) { pd_entry_t *tree[3]; int level; KASSERT(pmap != pmap_kernel()); KASSERT(mutex_owned(&pmap->pm_lock)); KASSERT(kpreempt_disabled()); pmap_ept_get_tree(pmap, va, tree); level = 1; do { (void)pmap_pte_testset(tree[level - 1], 0); pmap_freepage(pmap, ptp, level); if (level < PTP_LEVELS - 1) { ptp = pmap_find_ptp(pmap, va, level + 1); ptp->wire_count--; if (ptp->wire_count > 1) break; } } while (++level < PTP_LEVELS); pmap_pte_flush(); } /* Allocate L4->L3->L2. Return L2. */ static void pmap_ept_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va) { struct vm_page *ptp; unsigned long index; pd_entry_t *pteva; paddr_t ptepa; int i; KASSERT(pmap != pmap_kernel()); KASSERT(mutex_owned(&pmap->pm_lock)); KASSERT(kpreempt_disabled()); /* * Now that we have all the pages looked up or allocated, * loop through again installing any new ones into the tree. */ ptepa = pmap->pm_pdirpa[0]; for (i = PTP_LEVELS; i > 1; i--) { index = pl_pi(va, i); pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa); if (pmap_ept_valid_entry(pteva[index])) { KASSERT(!pt->alloced[i]); ptepa = pmap_pte2pa(pteva[index]); continue; } ptp = pt->pg[i]; ptp->flags &= ~PG_BUSY; /* never busy */ ptp->wire_count = 1; pmap->pm_ptphint[i - 2] = ptp; ptepa = VM_PAGE_TO_PHYS(ptp); pmap_pte_set(&pteva[index], ptepa | EPT_R | EPT_W | EPT_X); pmap_pte_flush(); pmap_stats_update(pmap, 1, 0); /* * If we're not in the top level, increase the * wire count of the parent page. 
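 * A PTP's wire_count is one plus the number of valid entries it
 * currently holds.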
*/ if (i < PTP_LEVELS) { pt->pg[i + 1]->wire_count++; } } } static int pmap_ept_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags) { pt_entry_t *ptes, opte, npte; pt_entry_t *ptep; struct vm_page *ptp; struct vm_page *new_pg, *old_pg; struct pmap_page *new_pp, *old_pp; struct pv_entry *old_pve, *new_pve; bool wired = (flags & PMAP_WIRED) != 0; bool accessed; struct pmap_ptparray pt; int error; bool getptp, samepage, new_embedded; rb_tree_t *tree; KASSERT(pmap_initialized); KASSERT(va < VM_MAXUSER_ADDRESS); npte = pa | pmap_ept_prot(prot) | pmap_ept_type(flags); if (wired) npte |= EPT_WIRED; if (flags & VM_PROT_ALL) { npte |= EPT_A; if (flags & VM_PROT_WRITE) { KASSERT((npte & EPT_W) != 0); npte |= EPT_D; } } new_pg = PHYS_TO_VM_PAGE(pa); if (new_pg != NULL) { /* This is a managed page */ npte |= EPT_PVLIST; new_pp = VM_PAGE_TO_PP(new_pg); } else if ((new_pp = pmap_pv_tracked(pa)) != NULL) { /* This is an unmanaged pv-tracked page */ npte |= EPT_PVLIST; } else { new_pp = NULL; } /* Begin by locking the pmap. */ mutex_enter(&pmap->pm_lock); /* Look up the PTP. Allocate if none present. */ ptp = NULL; getptp = false; if (pmap != pmap_kernel()) { ptp = pmap_find_ptp(pmap, va, 1); if (ptp == NULL) { getptp = true; error = pmap_get_ptp(pmap, &pt, va, flags, &ptp); if (error != 0) { if (flags & PMAP_CANFAIL) { mutex_exit(&pmap->pm_lock); return error; } panic("%s: get ptp failed, error=%d", __func__, error); } } tree = &VM_PAGE_TO_PP(ptp)->pp_rb; } else { /* Embedded PV entries rely on this. */ KASSERT(va != 0); tree = &pmap_kernel_rb; } /* * Look up the old PV entry at this VA (if any), and insert a new PV * entry if required for the new mapping. Temporarily track the old * and new mappings concurrently. Only after the old mapping is * evicted from the pmap will we remove its PV entry. Otherwise, * our picture of modified/accessed state for either page could get * out of sync (we need any P->V operation for either page to stall * on pmap->pm_lock until done here). */ new_pve = NULL; old_pve = NULL; samepage = false; new_embedded = false; if (new_pp != NULL) { error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve, &old_pve, &samepage, &new_embedded, tree); /* * If a new pv_entry was needed and none was available, we * can go no further. */ if (error != 0) { if (flags & PMAP_CANFAIL) { if (getptp) { pmap_unget_ptp(pmap, &pt); } mutex_exit(&pmap->pm_lock); return error; } panic("%s: alloc pve failed", __func__); } } else { old_pve = pmap_treelookup_pv(pmap, ptp, tree, va); } /* Map PTEs into address space. */ kpreempt_disable(); /* Install any newly allocated PTPs. */ if (getptp) { pmap_ept_install_ptp(pmap, &pt, va); } /* Check if there is an existing mapping. */ ptes = (pt_entry_t *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp)); ptep = &ptes[pl1_pi(va)]; opte = *ptep; bool have_oldpa = pmap_ept_valid_entry(opte); paddr_t oldpa = pmap_pte2pa(opte); /* * Update the pte. */ do { opte = *ptep; /* * if the same page, inherit PTE_A and PTE_D. */ if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) { npte |= opte & (EPT_A | EPT_D); } } while (pmap_pte_cas(ptep, opte, npte) != opte); /* * Done with the PTEs: they can now be unmapped. */ kpreempt_enable(); /* * Update statistics and PTP's reference count. */ pmap_ept_stats_update_bypte(pmap, npte, opte); if (ptp != NULL) { if (!have_oldpa) { ptp->wire_count++; } /* Remember minimum VA in PTP. */ pmap_ptp_range_set(ptp, va); } KASSERT(ptp == NULL || ptp->wire_count > 1); /* * If the same page, we can skip pv_entry handling. 
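 * (Same frame, old and new both valid: any existing pv_entry already
 * describes this mapping.)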
*/ if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) { KASSERT(((opte ^ npte) & EPT_PVLIST) == 0); if ((npte & EPT_PVLIST) != 0) { KASSERT(samepage); pmap_check_pv(pmap, ptp, new_pp, va, true); } goto same_pa; } else if ((npte & EPT_PVLIST) != 0) { KASSERT(!samepage); } /* * If old page is pv-tracked, remove pv_entry from its list. */ if ((~opte & (EPT_R | EPT_PVLIST)) == 0) { if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) { old_pp = VM_PAGE_TO_PP(old_pg); } else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) { panic("%s: EPT_PVLIST with pv-untracked page" " va = %#"PRIxVADDR " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")", __func__, va, oldpa, atop(pa)); } pmap_remove_pv(pmap, old_pp, ptp, va, old_pve, pmap_ept_to_pp_attrs(opte)); } else { KASSERT(old_pve == NULL); KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); } /* * If new page is dynamically PV tracked, insert to tree. */ if (new_pve != NULL) { KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); old_pve = rb_tree_insert_node(tree, new_pve); KASSERT(old_pve == new_pve); pmap_check_pv(pmap, ptp, new_pp, va, true); } same_pa: /* * shootdown tlb if necessary. */ if (pmap_ept_has_ad) { accessed = (~opte & (EPT_R | EPT_A)) == 0; } else { accessed = (opte & EPT_R) != 0; } if (accessed && ((opte ^ npte) & (PTE_FRAME | EPT_W)) != 0) { pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_ENTER); } pmap_drain_pv(pmap); mutex_exit(&pmap->pm_lock); return 0; } /* Pay close attention, this returns L2. */ static int pmap_ept_pdes_invalid(struct pmap *pmap, vaddr_t va, pd_entry_t *lastpde) { pt_entry_t *pteva; paddr_t ptepa; int i, index; KASSERT(mutex_owned(&pmap->pm_lock)); ptepa = pmap->pm_pdirpa[0]; for (i = PTP_LEVELS; i > 1; i--) { pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa); index = pl_pi(va, i); if (!pmap_ept_valid_entry(pteva[index])) return i; ptepa = pmap_pte2pa(pteva[index]); } if (lastpde != NULL) { *lastpde = pteva[index]; } return 0; } static bool pmap_ept_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap) { pt_entry_t *ptes, pte; pd_entry_t pde; paddr_t ptppa, pa; bool rv; #ifdef __HAVE_DIRECT_MAP if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) { if (pap != NULL) { *pap = PMAP_DIRECT_UNMAP(va); } return true; } #endif rv = false; pa = 0; mutex_enter(&pmap->pm_lock); kpreempt_disable(); if (!pmap_ept_pdes_invalid(pmap, va, &pde)) { ptppa = pmap_pte2pa(pde); ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa); pte = ptes[pl1_pi(va)]; if (__predict_true((pte & EPT_R) != 0)) { pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1)); rv = true; } } kpreempt_enable(); mutex_exit(&pmap->pm_lock); if (pap != NULL) { *pap = pa; } return rv; } static bool pmap_ept_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, vaddr_t va) { struct pv_entry *pve; struct vm_page *pg; struct pmap_page *pp; pt_entry_t opte; bool accessed; KASSERT(pmap != pmap_kernel()); KASSERT(mutex_owned(&pmap->pm_lock)); KASSERT(kpreempt_disabled()); if (!pmap_ept_valid_entry(*pte)) { /* VA not mapped. */ return false; } /* Atomically save the old PTE and zap it. */ opte = pmap_pte_testset(pte, 0); if (!pmap_ept_valid_entry(opte)) { return false; } pmap_ept_stats_update_bypte(pmap, 0, opte); if (ptp) { /* * Dropping a PTE. Make sure that the PDE is flushed. */ ptp->wire_count--; if (ptp->wire_count <= 1) { opte |= EPT_A; } } if (pmap_ept_has_ad) { accessed = (opte & EPT_A) != 0; } else { accessed = true; } if (accessed) { pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_REMOVE_PTE); } /* * If we are not on a pv list - we are done. 
*/ if ((opte & EPT_PVLIST) == 0) { KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL), "managed page without EPT_PVLIST for %#"PRIxVADDR, va); KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL), "pv-tracked page without EPT_PVLIST for %#"PRIxVADDR, va); KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ? &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL); return true; } if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) { pp = VM_PAGE_TO_PP(pg); } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) { paddr_t pa = pmap_pte2pa(opte); panic("%s: EPT_PVLIST with pv-untracked page" " va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")", __func__, va, pa, atop(pa)); } /* Sync R/M bits. */ pve = pmap_lookup_pv(pmap, ptp, pp, va); pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_ept_to_pp_attrs(opte)); return true; } static void pmap_ept_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva, vaddr_t startva, vaddr_t endva) { pt_entry_t *pte = (pt_entry_t *)ptpva; KASSERT(pmap != pmap_kernel()); KASSERT(mutex_owned(&pmap->pm_lock)); KASSERT(kpreempt_disabled()); /* * mappings are very often sparse, so clip the given range to the * range of PTEs that are known present in the PTP. */ pmap_ptp_range_clip(ptp, &startva, &pte); /* * note that ptpva points to the PTE that maps startva. this may * or may not be the first PTE in the PTP. * * we loop through the PTP while there are still PTEs to look at * and the wire_count is greater than 1 (because we use the wire_count * to keep track of the number of real PTEs in the PTP). */ while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) { (void)pmap_ept_remove_pte(pmap, ptp, pte, startva); startva += PAGE_SIZE; pte++; } } static void pmap_ept_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva) { pt_entry_t *ptes; pd_entry_t pde; paddr_t ptppa; vaddr_t blkendva, va = sva; struct vm_page *ptp; mutex_enter(&pmap->pm_lock); kpreempt_disable(); for (/* null */ ; va < eva ; va = blkendva) { int lvl; /* determine range of block */ blkendva = x86_round_pdr(va+1); if (blkendva > eva) blkendva = eva; lvl = pmap_ept_pdes_invalid(pmap, va, &pde); if (lvl != 0) { /* Skip a range corresponding to an invalid pde. */ blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1]; continue; } /* PA of the PTP */ ptppa = pmap_pte2pa(pde); ptp = pmap_find_ptp(pmap, va, 1); KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected", __func__); ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa); pmap_ept_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_pi(va)], va, blkendva); /* If PTP is no longer being used, free it. */ if (ptp && ptp->wire_count <= 1) { pmap_ept_free_ptp(pmap, ptp, va); } } kpreempt_enable(); pmap_drain_pv(pmap); mutex_exit(&pmap->pm_lock); } static int pmap_ept_sync_pv(struct vm_page *ptp, vaddr_t va, paddr_t pa, int clearbits, uint8_t *oattrs, pt_entry_t *optep) { struct pmap *pmap; pt_entry_t *ptep; pt_entry_t opte; pt_entry_t npte; pt_entry_t expect; bool need_shootdown; expect = pmap_pa2pte(pa) | EPT_R; pmap = ptp_to_pmap(ptp); if (clearbits != ~0) { KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0); clearbits = pmap_pp_attrs_to_ept(clearbits); } ptep = pmap_map_pte(pmap, ptp, va); do { opte = *ptep; KASSERT((opte & (EPT_D | EPT_A)) != EPT_D); KASSERT((opte & (EPT_A | EPT_R)) != EPT_A); KASSERT(opte == 0 || (opte & EPT_R) != 0); if ((opte & (PTE_FRAME | EPT_R)) != expect) { /* * We lost a race with a V->P operation like * pmap_remove(). Wait for the competitor * reflecting pte bits into mp_attrs. 
*/ pmap_unmap_pte(); return EAGAIN; } /* * Check if there's anything to do on this PTE. */ if ((opte & clearbits) == 0) { need_shootdown = false; break; } /* * We need a shootdown if the PTE is cached (EPT_A) ... * ... Unless we are clearing only the EPT_W bit and * it isn't cached as RW (EPT_D). */ if (pmap_ept_has_ad) { need_shootdown = (opte & EPT_A) != 0 && !(clearbits == EPT_W && (opte & EPT_D) == 0); } else { need_shootdown = true; } npte = opte & ~clearbits; /* * If we need a shootdown anyway, clear EPT_A and EPT_D. */ if (need_shootdown) { npte &= ~(EPT_A | EPT_D); } KASSERT((npte & (EPT_D | EPT_A)) != EPT_D); KASSERT((npte & (EPT_A | EPT_R)) != EPT_A); KASSERT(npte == 0 || (opte & EPT_R) != 0); } while (pmap_pte_cas(ptep, opte, npte) != opte); if (need_shootdown) { pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_SYNC_PV); } pmap_unmap_pte(); *oattrs = pmap_ept_to_pp_attrs(opte); if (optep != NULL) *optep = opte; return 0; } static void pmap_ept_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte, vaddr_t va) { KASSERT(mutex_owned(&pmap->pm_lock)); pmap_ept_stats_update_bypte(pmap, 0, opte); ptp->wire_count--; if (ptp->wire_count <= 1) { pmap_ept_free_ptp(pmap, ptp, va); } } static void pmap_ept_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot) { pt_entry_t bit_rem; pt_entry_t *ptes, *spte; pt_entry_t opte, npte; pd_entry_t pde; paddr_t ptppa; vaddr_t va; bool modified; bit_rem = 0; if (!(prot & VM_PROT_WRITE)) bit_rem = EPT_W; sva &= PTE_FRAME; eva &= PTE_FRAME; /* Acquire pmap. */ mutex_enter(&pmap->pm_lock); kpreempt_disable(); for (va = sva; va < eva; va += PAGE_SIZE) { if (pmap_ept_pdes_invalid(pmap, va, &pde)) { continue; } ptppa = pmap_pte2pa(pde); ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa); spte = &ptes[pl1_pi(va)]; do { opte = *spte; if (!pmap_ept_valid_entry(opte)) { goto next; } npte = (opte & ~bit_rem); } while (pmap_pte_cas(spte, opte, npte) != opte); if (pmap_ept_has_ad) { modified = (opte & EPT_D) != 0; } else { modified = true; } if (modified) { vaddr_t tva = x86_ptob(spte - ptes); pmap_tlb_shootdown(pmap, tva, 0, TLBSHOOT_WRITE_PROTECT); } next:; } kpreempt_enable(); mutex_exit(&pmap->pm_lock); } static void pmap_ept_unwire(struct pmap *pmap, vaddr_t va) { pt_entry_t *ptes, *ptep, opte; pd_entry_t pde; paddr_t ptppa; /* Acquire pmap. */ mutex_enter(&pmap->pm_lock); kpreempt_disable(); if (pmap_ept_pdes_invalid(pmap, va, &pde)) { panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va); } ptppa = pmap_pte2pa(pde); ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa); ptep = &ptes[pl1_pi(va)]; opte = *ptep; KASSERT(pmap_ept_valid_entry(opte)); if (opte & EPT_WIRED) { pt_entry_t npte = opte & ~EPT_WIRED; opte = pmap_pte_testset(ptep, npte); pmap_ept_stats_update_bypte(pmap, npte, opte); } else { printf("%s: wiring for pmap %p va %#" PRIxVADDR "did not change!\n", __func__, pmap, va); } /* Release pmap. */ kpreempt_enable(); mutex_exit(&pmap->pm_lock); } /* -------------------------------------------------------------------------- */ void pmap_ept_transform(struct pmap *pmap) { pmap->pm_enter = pmap_ept_enter; pmap->pm_extract = pmap_ept_extract; pmap->pm_remove = pmap_ept_remove; pmap->pm_sync_pv = pmap_ept_sync_pv; pmap->pm_pp_remove_ent = pmap_ept_pp_remove_ent; pmap->pm_write_protect = pmap_ept_write_protect; pmap->pm_unwire = pmap_ept_unwire; memset(PAGE_ALIGNED(pmap->pm_pdir), 0, PAGE_SIZE); } #endif /* __HAVE_DIRECT_MAP && __x86_64__ && !XENPV */
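/*
 * Illustrative sketch: per the comment at the top of the EPT section, a
 * consumer such as NVMM creates an ordinary pmap and then calls
 * pmap_ept_transform() on it.  The helper below is hypothetical (its name
 * and error handling are not from this file) and is kept out of the build.
 */
#if 0
static struct pmap *
example_create_ept_pmap(void)
{
	struct pmap *pmap;

	pmap = pmap_create();		/* ordinary x86 pmap */
	pmap_ept_transform(pmap);	/* switch its callbacks to the EPT code */
	return pmap;
}
#endif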