diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index 4bbd9ed591f2a55741ae48429a3f112031c91910..95ea25b9076e9845735634d594d71370ecf93e1a 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -303,13 +303,6 @@ static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm)
 	return true;
 }
 
-static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
-					     struct mm_struct *mm,
-					     unsigned long uaddr)
-{
-	__flush_tlb_page_nosync(mm, uaddr);
-}
-
 /*
  * If mprotect/munmap/etc occurs during TLB batched flushing, we need to
  * synchronise all the TLBI issued with a DSB to avoid the race mentioned in
@@ -419,7 +412,7 @@ static inline bool __flush_tlb_range_limit_excess(unsigned long start,
 	return false;
 }
 
-static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
+static inline void __flush_tlb_range_nosync(struct mm_struct *mm,
 				     unsigned long start, unsigned long end,
 				     unsigned long stride, bool last_level,
 				     int tlb_level)
@@ -431,19 +424,19 @@ static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
 	pages = (end - start) >> PAGE_SHIFT;
 
 	if (__flush_tlb_range_limit_excess(start, end, pages, stride)) {
-		flush_tlb_mm(vma->vm_mm);
+		flush_tlb_mm(mm);
 		return;
 	}
 
 	dsb(ishst);
-	asid = ASID(vma->vm_mm);
+	asid = ASID(mm);
 
 	if (last_level)
 		__flush_tlb_range_op(vale1is, start, pages, stride, asid, tlb_level, true);
 	else
 		__flush_tlb_range_op(vae1is, start, pages, stride, asid, tlb_level, true);
 
-	mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end);
+	mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
 }
 
 static inline void __flush_tlb_range(struct vm_area_struct *vma,
@@ -451,7 +444,7 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma,
 				     unsigned long stride, bool last_level,
 				     int tlb_level)
 {
-	__flush_tlb_range_nosync(vma, start, end, stride,
+	__flush_tlb_range_nosync(vma->vm_mm, start, end, stride,
 				 last_level, tlb_level);
 	dsb(ish);
 }
@@ -501,6 +494,12 @@ static inline void __flush_tlb_kernel_pgtable(unsigned long kaddr)
 	isb();
 }
 
+static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
+					     struct mm_struct *mm, unsigned long start, unsigned long end)
+{
+	__flush_tlb_range_nosync(mm, start, end, PAGE_SIZE, true, 3);
+}
+
 #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 static inline void vmemmap_flush_tlb_all(void)
 {
diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c
index a3edced29ac1383564f432139ed01def1913d28b..996938edc7354987febd089fbf100bb660ab22b3 100644
--- a/arch/arm64/mm/contpte.c
+++ b/arch/arm64/mm/contpte.c
@@ -335,7 +335,7 @@ int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
 		 * eliding the trailing DSB applies here.
 		 */
 		addr = ALIGN_DOWN(addr, CONT_PTE_SIZE);
-		__flush_tlb_range_nosync(vma, addr, addr + CONT_PTE_SIZE,
+		__flush_tlb_range_nosync(vma->vm_mm, addr, addr + CONT_PTE_SIZE,
 					 PAGE_SIZE, true, 3);
 	}
 
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 4bd3c1a1b355a23fe4c82fc3f655711d470d2625..8136bff62aed6427fcb971ee847d53e112863a43 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -278,8 +278,7 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
 }
 
 static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
-					     struct mm_struct *mm,
-					     unsigned long uaddr)
+					     struct mm_struct *mm, unsigned long start, unsigned long end)
 {
 	inc_mm_tlb_gen(mm);
 	cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 7045b7b7ac4aceb54f490c2f6b21480a9dad4483..260d8f3ec934094e750d45de4865a8516a1fbfac 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3077,8 +3077,12 @@ static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma,
 	int ref_count, map_count;
 	pmd_t orig_pmd = *pmdp;
 
-	if (folio_test_dirty(folio) || pmd_dirty(orig_pmd))
+	if (pmd_dirty(orig_pmd))
+		folio_set_dirty(folio);
+	if (folio_test_dirty(folio)) {
+		folio_set_swapbacked(folio);
 		return false;
+	}
 
 	orig_pmd = pmdp_huge_clear_flush(vma, addr, pmdp);
 
@@ -3105,8 +3109,15 @@ static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma,
 	 *
 	 * The only folio refs must be one from isolation plus the rmap(s).
 	 */
-	if (folio_test_dirty(folio) || pmd_dirty(orig_pmd) ||
-	    ref_count != map_count + 1) {
+	if (pmd_dirty(orig_pmd))
+		folio_set_dirty(folio);
+	if (folio_test_dirty(folio)) {
+		folio_set_swapbacked(folio);
+		set_pmd_at(mm, addr, pmdp, orig_pmd);
+		return false;
+	}
+
+	if (ref_count != map_count + 1) {
 		set_pmd_at(mm, addr, pmdp, orig_pmd);
 		return false;
 	}
@@ -3127,12 +3138,11 @@ bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
 {
 	VM_WARN_ON_FOLIO(!folio_test_pmd_mappable(folio), folio);
 	VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
+	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
+	VM_WARN_ON_FOLIO(folio_test_swapbacked(folio), folio);
 	VM_WARN_ON_ONCE(!IS_ALIGNED(addr, HPAGE_PMD_SIZE));
 
-	if (folio_test_anon(folio) && !folio_test_swapbacked(folio))
-		return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio);
-
-	return false;
+	return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio);
 }
 
 static void remap_page(struct folio *folio, unsigned long nr)
diff --git a/mm/rmap.c b/mm/rmap.c
index 86353d274d437f4e4a1046bffcd1cbe35aa10d12..f3ec869b73200709d5b77c08ca5eaf642efcdc2b 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -642,7 +642,7 @@ void try_to_unmap_flush_dirty(void)
 	(TLB_FLUSH_BATCH_PENDING_MASK / 2)
 
 static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
-				      unsigned long uaddr)
+				      unsigned long start, unsigned long end)
 {
 	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
 	int batch;
@@ -651,7 +651,7 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
 	if (!pte_accessible(mm, pteval))
 		return;
 
-	arch_tlbbatch_add_pending(&tlb_ubc->arch, mm, uaddr);
+	arch_tlbbatch_add_pending(&tlb_ubc->arch, mm, start, end);
 	tlb_ubc->flush_required = true;
 
 	/*
@@ -727,7 +727,7 @@ void flush_tlb_batched_pending(struct mm_struct *mm)
 }
 #else
 static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
-				      unsigned long uaddr)
+				      unsigned long start, unsigned long end)
 {
 }
 
@@ -1594,6 +1594,34 @@ void folio_remove_rmap_pmd(struct folio *folio, struct page *page,
 #endif
 }
 
+static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
+			struct page_vma_mapped_walk *pvmw,
+			enum ttu_flags flags, pte_t pte)
+{
+	const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
+	unsigned long end_addr, addr = pvmw->address;
+	struct vm_area_struct *vma = pvmw->vma;
+	unsigned int max_nr;
+
+	if (flags & TTU_HWPOISON)
+		return 1;
+	if (!folio_test_large(folio))
+		return 1;
+
+	/* We may only batch within a single VMA and a single page table. */
+	end_addr = pmd_addr_end(addr, vma->vm_end);
+	max_nr = (end_addr - addr) >> PAGE_SHIFT;
+
+	/* We only support lazyfree batching for now ... */
+	if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
+		return 1;
+	if (pte_unused(pte))
+		return 1;
+
+	return folio_pte_batch(folio, addr, pvmw->pte, pte, max_nr, fpb_flags,
+			       NULL, NULL, NULL);
+}
+
 /*
  * @arg: enum ttu_flags will be passed to this argument
  */
@@ -1607,6 +1635,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 	bool anon_exclusive, ret = true;
 	struct mmu_notifier_range range;
 	enum ttu_flags flags = (enum ttu_flags)(long)arg;
+	unsigned long nr_pages = 1, end_addr;
 	unsigned long pfn;
 	unsigned long hsz = 0;
 
@@ -1656,9 +1685,16 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 		}
 
 		if (!pvmw.pte) {
-			if (unmap_huge_pmd_locked(vma, pvmw.address, pvmw.pmd,
-						  folio))
-				goto walk_done;
+			if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) {
+				if (unmap_huge_pmd_locked(vma, pvmw.address, pvmw.pmd, folio))
+					goto walk_done;
+				/*
+				 * unmap_huge_pmd_locked has either already marked
+				 * the folio as swap-backed or decided to retain it
+				 * due to GUP or speculative references.
+				 */
+				goto walk_abort;
+			}
 
 			if (flags & TTU_SPLIT_HUGE_PMD) {
 				/*
@@ -1733,23 +1769,24 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 			}
 			pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
 		} else {
-			flush_cache_page(vma, address, pfn);
-			/* Nuke the page table entry. */
-			if (should_defer_flush(mm, flags)) {
-				/*
-				 * We clear the PTE but do not flush so potentially
-				 * a remote CPU could still be writing to the folio.
-				 * If the entry was previously clean then the
-				 * architecture must guarantee that a clear->dirty
-				 * transition on a cached TLB entry is written through
-				 * and traps if the PTE is unmapped.
-				 */
-				pteval = ptep_get_and_clear(mm, address, pvmw.pte);
+			nr_pages = folio_unmap_pte_batch(folio, &pvmw, flags, pteval);
+			end_addr = address + nr_pages * PAGE_SIZE;
+			flush_cache_range(vma, address, end_addr);
 
-				set_tlb_ubc_flush_pending(mm, pteval, address);
-			} else {
-				pteval = ptep_clear_flush(vma, address, pvmw.pte);
-			}
+			/* Nuke the page table entry. */
+			pteval = get_and_clear_full_ptes(mm, address, pvmw.pte, nr_pages, 0);
+			/*
+			 * We clear the PTE but do not flush so potentially
+			 * a remote CPU could still be writing to the folio.
+			 * If the entry was previously clean then the
+			 * architecture must guarantee that a clear->dirty
+			 * transition on a cached TLB entry is written through
+			 * and traps if the PTE is unmapped.
+			 */
+			if (should_defer_flush(mm, flags))
+				set_tlb_ubc_flush_pending(mm, pteval, address, end_addr);
+			else
+				flush_tlb_range(vma, address, end_addr);
 		}
 
 		/*
@@ -1824,24 +1861,30 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 				 */
 				smp_rmb();
 
-				/*
-				 * The only page refs must be one from isolation
-				 * plus the rmap(s) (dropped by discard:).
-				 */
-				if (ref_count == 1 + map_count &&
-				    !folio_test_dirty(folio)) {
-					dec_mm_counter(mm, MM_ANONPAGES);
-					add_reliable_folio_counter(folio, mm, -1);
-					goto discard;
+				if (folio_test_dirty(folio)) {
+					/*
+					 * redirtied either using the page table or a previously
+					 * obtained GUP reference.
+					 */
+					set_ptes(mm, address, pvmw.pte, pteval, nr_pages);
+					folio_set_swapbacked(folio);
+					goto walk_abort;
+				} else if (ref_count != 1 + map_count) {
+					/*
+					 * Additional reference. Could be a GUP reference or any
+					 * speculative reference. GUP users must mark the folio
+					 * dirty if there was a modification. This folio cannot be
+					 * reclaimed right now either way, so act just like nothing
+					 * happened.
+					 * We'll come back here later and detect if the folio was
+					 * dirtied when the additional reference is gone.
+					 */
+					set_ptes(mm, address, pvmw.pte, pteval, nr_pages);
+					goto walk_abort;
 				}
-
-				/*
-				 * If the folio was redirtied, it cannot be
-				 * discarded. Remap the page to page table.
-				 */
-				set_pte_at(mm, address, pvmw.pte, pteval);
-				folio_set_swapbacked(folio);
-				goto walk_abort;
+				add_mm_counter(mm, MM_ANONPAGES, -nr_pages);
+				add_reliable_folio_counter(folio, mm, -nr_pages);
+				goto discard;
 			}
 
 			if (swap_duplicate(entry) < 0) {
@@ -1894,13 +1937,21 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 			add_reliable_folio_counter(folio, mm, -1);
 		}
 discard:
-		if (unlikely(folio_test_hugetlb(folio)))
+		if (unlikely(folio_test_hugetlb(folio))) {
 			hugetlb_remove_rmap(folio);
-		else
-			folio_remove_rmap_pte(folio, subpage, vma);
+		} else {
+			folio_remove_rmap_ptes(folio, subpage, nr_pages, vma);
+		}
 		if (vma->vm_flags & VM_LOCKED)
 			mlock_drain_local();
-		folio_put(folio);
+		folio_put_refs(folio, nr_pages);
+
+		/*
+		 * If we are sure that we batched the entire folio and cleared
+		 * all PTEs, we can just optimize and stop right here.
+		 */
+		if (nr_pages == folio_nr_pages(folio))
+			goto walk_done;
 		continue;
 walk_abort:
 		ret = false;
@@ -2117,7 +2168,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 				 */
 				pteval = ptep_get_and_clear(mm, address, pvmw.pte);
 
-				set_tlb_ubc_flush_pending(mm, pteval, address);
+				set_tlb_ubc_flush_pending(mm, pteval, address, address + PAGE_SIZE);
 			} else {
 				pteval = ptep_clear_flush(vma, address, pvmw.pte);
 			}
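
Note (illustrative sketch, not part of the patch): arch_tlbbatch_add_pending() now receives a [start, end) range rather than a single user address, so an architecture can queue one ranged invalidation per batch entry. Assuming an architecture that, like the removed arm64 code above, only has a per-page __flush_tlb_page_nosync() helper and no ranged "nosync" primitive, the new hook could still be satisfied by iterating per page:

	static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
						     struct mm_struct *mm,
						     unsigned long start, unsigned long end)
	{
		unsigned long uaddr;

		/* Queue one per-page "nosync" invalidation for each page in the range. */
		for (uaddr = start; uaddr < end; uaddr += PAGE_SIZE)
			__flush_tlb_page_nosync(mm, uaddr);
	}

arm64 instead maps the whole range onto a single __flush_tlb_range_nosync() call (stride PAGE_SIZE, last_level, tlb_level 3), as in the hunk above, leaving the trailing DSB to arch_tlbbatch_flush() when the pending batch is finally flushed.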