From fd2dae0a30e937717faf87f626630a8a5b8f724a Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Sun, 9 Jun 2019 03:36:10 +0000 Subject: [PATCH] Implement an alternative solution to the amd64 and i386 pmap problem that we previously addressed in r348246. This pmap problem also exists on arm64 and riscv. However, the original solution developed for amd64 and i386 cannot be used on arm64 and riscv. In particular, arm64 and riscv do not define a PG_PROMOTED flag in their level 2 PTEs. (A PG_PROMOTED flag makes no sense on arm64, where unlike x86 or riscv we are required to break the old 4KB mappings before making the 2MB mapping; and on riscv there are no unused bits in the PTE to define a PG_PROMOTED flag.) This commit implements an alternative solution that can be used on all four architectures. Moreover, this solution has two other advantages. First, on older AMD processors that required the Erratum 383 workaround, it is less costly. Specifically, it avoids unnecessary calls to pmap_fill_ptp() on a superpage demotion. Second, it enables the elimination of some calls to pagezero() in pmap_remove_kernel_{l2,pde}(). In addition, remove a related stale comment from pmap_enter_{l2,pde}(). 
Reviewed by: kib, markj (an earlier version) MFC after: 1 week Differential Revision: https://reviews.freebsd.org/D20538 --- sys/amd64/amd64/pmap.c | 40 ++++++++++++++++++++++++++-------------- sys/arm64/arm64/pmap.c | 39 ++++++++++++++++++++++++--------------- sys/i386/i386/pmap.c | 40 ++++++++++++++++++++++++++-------------- sys/riscv/riscv/pmap.c | 39 ++++++++++++++++++++++++--------------- 4 files changed, 100 insertions(+), 58 deletions(-) diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index f7599bf364cf..d7a35b5c9cfc 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -1053,7 +1053,7 @@ static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); -static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); +static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted); static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, vm_offset_t eva); static void pmap_invalidate_cache_range_all(vm_offset_t sva, @@ -1757,8 +1757,13 @@ pmap_init(void) mpte->pindex = pmap_pde_pindex(KERNBASE) + i; mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); mpte->wire_count = 1; + + /* + * Collect the page table pages that were replaced by a 2MB + * page in create_pagetables(). They are zero filled. + */ if (i << PDRSHIFT < KERNend && - pmap_insert_pt_page(kernel_pmap, mpte)) + pmap_insert_pt_page(kernel_pmap, mpte, false)) panic("pmap_init: pmap_insert_pt_page failed"); } PMAP_UNLOCK(kernel_pmap); @@ -3129,12 +3134,15 @@ pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, * of idle page table pages. Each of a pmap's page table pages is responsible * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. 
+ * + * If "promoted" is false, then the page table page "mpte" must be zero filled. */ static __inline int -pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) +pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); + mpte->valid = promoted ? VM_PAGE_BITS_ALL : 0; return (vm_radix_insert(&pmap->pm_root, mpte)); } @@ -4626,7 +4634,7 @@ pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, * If the page table page is not leftover from an earlier promotion, * initialize it. */ - if ((oldpde & PG_PROMOTED) == 0) + if (mpte->valid == 0) pmap_fill_ptp(firstpte, newpte); pmap_demote_pde_check(firstpte, newpte); @@ -4699,9 +4707,11 @@ pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V; /* - * Initialize the page table page. + * If this page table page was unmapped by a promotion, then it + * contains valid mappings. Zero it to invalidate those mappings. */ - pagezero((void *)PHYS_TO_DMAP(mptepa)); + if (mpte->valid != 0) + pagezero((void *)PHYS_TO_DMAP(mptepa)); /* * Demote the mapping. 
@@ -4766,6 +4776,8 @@ pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, } else { mpte = pmap_remove_pt_page(pmap, sva); if (mpte != NULL) { + KASSERT(mpte->valid == VM_PAGE_BITS_ALL, + ("pmap_remove_pde: pte page not promoted")); pmap_resident_count_dec(pmap, 1); KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pde: pte page wire count error")); @@ -5399,7 +5411,7 @@ setpte: ("pmap_promote_pde: page table page is out of range")); KASSERT(mpte->pindex == pmap_pde_pindex(va), ("pmap_promote_pde: page table page's pindex is wrong")); - if (pmap_insert_pt_page(pmap, mpte)) { + if (pmap_insert_pt_page(pmap, mpte, true)) { atomic_add_long(&pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx in pmap %p", va, @@ -5826,15 +5838,13 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, } vm_page_free_pages_toq(&free, true); if (va >= VM_MAXUSER_ADDRESS) { + /* + * Both pmap_remove_pde() and pmap_remove_ptes() will + * leave the kernel page table page zero filled. + */ mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME); - if (pmap_insert_pt_page(pmap, mt)) { - /* - * XXX Currently, this can't happen because - * we do not perform pmap_enter(psind == 1) - * on the kernel pmap. 
- */ + if (pmap_insert_pt_page(pmap, mt, false)) panic("pmap_enter_pde: trie insert failed"); - } } else KASSERT(*pde == 0, ("pmap_enter_pde: non-zero pde %p", pde)); @@ -6824,6 +6834,8 @@ pmap_remove_pages(pmap_t pmap) } mpte = pmap_remove_pt_page(pmap, pv->pv_va); if (mpte != NULL) { + KASSERT(mpte->valid == VM_PAGE_BITS_ALL, + ("pmap_remove_pages: pte page not promoted")); pmap_resident_count_dec(pmap, 1); KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pages: pte page wire count error")); diff --git a/sys/arm64/arm64/pmap.c b/sys/arm64/arm64/pmap.c index 6b41dfd59f60..4c6727de3a1b 100644 --- a/sys/arm64/arm64/pmap.c +++ b/sys/arm64/arm64/pmap.c @@ -2401,9 +2401,11 @@ pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) newl2 = ml3pa | L2_TABLE; /* - * Initialize the page table page. + * If this page table page was unmapped by a promotion, then it + * contains valid mappings. Zero it to invalidate those mappings. */ - pagezero((void *)PHYS_TO_DMAP(ml3pa)); + if (ml3->valid != 0) + pagezero((void *)PHYS_TO_DMAP(ml3pa)); /* * Demote the mapping. The caller must have already invalidated the @@ -2456,6 +2458,8 @@ pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, } else { ml3 = pmap_remove_pt_page(pmap, sva); if (ml3 != NULL) { + KASSERT(ml3->valid == VM_PAGE_BITS_ALL, + ("pmap_remove_l2: l3 page not promoted")); pmap_resident_count_dec(pmap, 1); KASSERT(ml3->wire_count == NL3PG, ("pmap_remove_l2: l3 page wire count error")); @@ -2812,12 +2816,15 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) * of idle page table pages. Each of a pmap's page table pages is responsible * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. + * + * If "promoted" is false, then the page table page "mpte" must be zero filled. 
*/ static __inline int -pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) +pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); + mpte->valid = promoted ? VM_PAGE_BITS_ALL : 0; return (vm_radix_insert(&pmap->pm_root, mpte)); } @@ -2962,7 +2969,7 @@ pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, ("pmap_promote_l2: page table page is out of range")); KASSERT(mpte->pindex == pmap_l2_pindex(va), ("pmap_promote_l2: page table page's pindex is wrong")); - if (pmap_insert_pt_page(pmap, mpte)) { + if (pmap_insert_pt_page(pmap, mpte, true)) { atomic_add_long(&pmap_l2_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx in pmap %p", va, @@ -3386,15 +3393,13 @@ pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, } vm_page_free_pages_toq(&free, true); if (va >= VM_MAXUSER_ADDRESS) { + /* + * Both pmap_remove_l2() and pmap_remove_l3() will + * leave the kernel page table page zero filled. + */ mt = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK); - if (pmap_insert_pt_page(pmap, mt)) { - /* - * XXX Currently, this can't happen bacuse - * we do not perform pmap_enter(psind == 1) - * on the kernel pmap. 
- */ + if (pmap_insert_pt_page(pmap, mt, false)) panic("pmap_enter_l2: trie insert failed"); - } } else KASSERT(pmap_load(l2) == 0, ("pmap_enter_l2: non-zero L2 entry %p", l2)); @@ -4081,6 +4086,8 @@ pmap_remove_pages(pmap_t pmap) ml3 = pmap_remove_pt_page(pmap, pv->pv_va); if (ml3 != NULL) { + KASSERT(ml3->valid == VM_PAGE_BITS_ALL, + ("pmap_remove_pages: l3 page not promoted")); pmap_resident_count_dec(pmap,1); KASSERT(ml3->wire_count == NL3PG, ("pmap_remove_pages: l3 page wire count error")); @@ -5035,8 +5042,10 @@ pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va, " in pmap %p", va, pmap); goto fail; } - if (va < VM_MAXUSER_ADDRESS) + if (va < VM_MAXUSER_ADDRESS) { + ml3->wire_count = NL3PG; pmap_resident_count_inc(pmap, 1); + } } l3phys = VM_PAGE_TO_PHYS(ml3); @@ -5048,10 +5057,10 @@ pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va, newl3 = (oldl2 & (ATTR_MASK & ~ATTR_DESCR_MASK)) | L3_PAGE; /* - * If the page table page is new, initialize it. + * If the page table page is not leftover from an earlier promotion, + * initialize it. 
*/ - if (ml3->wire_count == 1) { - ml3->wire_count = NL3PG; + if (ml3->valid == 0) { for (i = 0; i < Ln_ENTRIES; i++) { l3[i] = newl3 | phys; phys += L3_SIZE; diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c index 67699438e3d0..e312549f3f15 100644 --- a/sys/i386/i386/pmap.c +++ b/sys/i386/i386/pmap.c @@ -318,7 +318,7 @@ static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, vm_page_t m); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte); -static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); +static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted); static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde); static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); @@ -990,9 +990,14 @@ __CONCAT(PMTYPE, init)(void) mpte->pindex = i + KPTDI; mpte->phys_addr = KPTphys + ptoa(i); mpte->wire_count = 1; + + /* + * Collect the page table pages that were replaced by a 2/4MB + * page. They are filled with equivalent 4KB page mappings. + */ if (pseflag != 0 && KERNBASE <= i << PDRSHIFT && i << PDRSHIFT < KERNend && - pmap_insert_pt_page(kernel_pmap, mpte)) + pmap_insert_pt_page(kernel_pmap, mpte, true)) panic("pmap_init: pmap_insert_pt_page failed"); } PMAP_UNLOCK(kernel_pmap); @@ -1900,12 +1905,15 @@ pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, * of idle page table pages. Each of a pmap's page table pages is responsible * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. + * + * If "promoted" is false, then the page table page "mpte" must be zero filled. */ static __inline int -pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) +pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); + mpte->valid = promoted ? 
VM_PAGE_BITS_ALL : 0; return (vm_radix_insert(&pmap->pm_root, mpte)); } @@ -2823,7 +2831,7 @@ pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) * If the page table page is not leftover from an earlier promotion, * initialize it. */ - if ((oldpde & PG_PROMOTED) == 0) + if (mpte->valid == 0) pmap_fill_ptp(firstpte, newpte); KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), @@ -2895,9 +2903,11 @@ pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) newpde = mptepa | PG_M | PG_A | PG_RW | PG_V; /* - * Initialize the page table page. + * If this page table page was unmapped by a promotion, then it + * contains valid mappings. Zero it to invalidate those mappings. */ - pagezero((void *)&KPTmap[i386_btop(trunc_4mpage(va))]); + if (mpte->valid != 0) + pagezero((void *)&KPTmap[i386_btop(trunc_4mpage(va))]); /* * Remove the mapping. @@ -2960,6 +2970,8 @@ pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, } else { mpte = pmap_remove_pt_page(pmap, sva); if (mpte != NULL) { + KASSERT(mpte->valid == VM_PAGE_BITS_ALL, + ("pmap_remove_pde: pte page not promoted")); pmap->pm_stats.resident_count--; KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pde: pte page wire count error")); @@ -3533,7 +3545,7 @@ setpte: ("pmap_promote_pde: page table page is out of range")); KASSERT(mpte->pindex == va >> PDRSHIFT, ("pmap_promote_pde: page table page's pindex is wrong")); - if (pmap_insert_pt_page(pmap, mpte)) { + if (pmap_insert_pt_page(pmap, mpte, true)) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x in pmap %p", va, @@ -3911,15 +3923,13 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, } vm_page_free_pages_toq(&free, true); if (pmap == kernel_pmap) { + /* + * Both pmap_remove_pde() and pmap_remove_ptes() will + * leave the kernel page table page zero filled. 
+ */ mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME); - if (pmap_insert_pt_page(pmap, mt)) { - /* - * XXX Currently, this can't happen because - * we do not perform pmap_enter(psind == 1) - * on the kernel pmap. - */ + if (pmap_insert_pt_page(pmap, mt, false)) panic("pmap_enter_pde: trie insert failed"); - } } else KASSERT(*pde == 0, ("pmap_enter_pde: non-zero pde %p", pde)); @@ -4797,6 +4807,8 @@ __CONCAT(PMTYPE, remove_pages)(pmap_t pmap) } mpte = pmap_remove_pt_page(pmap, pv->pv_va); if (mpte != NULL) { + KASSERT(mpte->valid == VM_PAGE_BITS_ALL, + ("pmap_remove_pages: pte page not promoted")); pmap->pm_stats.resident_count--; KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pages: pte page wire count error")); diff --git a/sys/riscv/riscv/pmap.c b/sys/riscv/riscv/pmap.c index 0a6819f23bf6..9df759083163 100644 --- a/sys/riscv/riscv/pmap.c +++ b/sys/riscv/riscv/pmap.c @@ -1104,12 +1104,15 @@ pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, * of idle page table pages. Each of a pmap's page table pages is responsible * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. + * + * If "promoted" is false, then the page table page "ml3" must be zero filled. */ static __inline int -pmap_insert_pt_page(pmap_t pmap, vm_page_t ml3) +pmap_insert_pt_page(pmap_t pmap, vm_page_t ml3, bool promoted) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); + ml3->valid = promoted ? VM_PAGE_BITS_ALL : 0; return (vm_radix_insert(&pmap->pm_root, ml3)); } @@ -2002,9 +2005,11 @@ pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) newl2 = ml3pa | PTE_V; /* - * Initialize the page table page. + * If this page table page was unmapped by a promotion, then it + * contains valid mappings. Zero it to invalidate those mappings. */ - pagezero((void *)PHYS_TO_DMAP(ml3pa)); + if (ml3->valid != 0) + pagezero((void *)PHYS_TO_DMAP(ml3pa)); /* * Demote the mapping. 
@@ -2064,6 +2069,8 @@ pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, } else { ml3 = pmap_remove_pt_page(pmap, sva); if (ml3 != NULL) { + KASSERT(ml3->valid == VM_PAGE_BITS_ALL, + ("pmap_remove_l2: l3 page not promoted")); pmap_resident_count_dec(pmap, 1); KASSERT(ml3->wire_count == Ln_ENTRIES, ("pmap_remove_l2: l3 page wire count error")); @@ -2482,8 +2489,10 @@ pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, "failure for va %#lx in pmap %p", va, pmap); return (false); } - if (va < VM_MAXUSER_ADDRESS) + if (va < VM_MAXUSER_ADDRESS) { + mpte->wire_count = Ln_ENTRIES; pmap_resident_count_inc(pmap, 1); + } } mptepa = VM_PAGE_TO_PHYS(mpte); firstl3 = (pt_entry_t *)PHYS_TO_DMAP(mptepa); @@ -2495,10 +2504,10 @@ pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, newl3 = oldl2; /* - * If the page table page is new, initialize it. + * If the page table page is not leftover from an earlier promotion, + * initialize it. */ - if (mpte->wire_count == 1) { - mpte->wire_count = Ln_ENTRIES; + if (mpte->valid == 0) { for (i = 0; i < Ln_ENTRIES; i++) pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S)); } @@ -2589,7 +2598,7 @@ pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, ml3 = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); KASSERT(ml3->pindex == pmap_l2_pindex(va), ("pmap_promote_l2: page table page's pindex is wrong")); - if (pmap_insert_pt_page(pmap, ml3)) { + if (pmap_insert_pt_page(pmap, ml3, true)) { CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", va, pmap); atomic_add_long(&pmap_l2_p_failures, 1); @@ -2972,15 +2981,13 @@ pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, } vm_page_free_pages_toq(&free, true); if (va >= VM_MAXUSER_ADDRESS) { + /* + * Both pmap_remove_l2() and pmap_remove_l3() will + * leave the kernel page table page zero filled. 
+ */ mt = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); - if (pmap_insert_pt_page(pmap, mt)) { - /* - * XXX Currently, this can't happen bacuse - * we do not perform pmap_enter(psind == 1) - * on the kernel pmap. - */ + if (pmap_insert_pt_page(pmap, mt, false)) panic("pmap_enter_l2: trie insert failed"); - } } else KASSERT(pmap_load(l2) == 0, ("pmap_enter_l2: non-zero L2 entry %p", l2)); @@ -3557,6 +3564,8 @@ pmap_remove_pages_pv(pmap_t pmap, vm_page_t m, pv_entry_t pv, } mpte = pmap_remove_pt_page(pmap, pv->pv_va); if (mpte != NULL) { + KASSERT(mpte->valid == VM_PAGE_BITS_ALL, + ("pmap_remove_pages: l3 page not promoted")); pmap_resident_count_dec(pmap, 1); KASSERT(mpte->wire_count == Ln_ENTRIES, ("pmap_remove_pages: pte page wire count error"));