From 92caf68b1a765fb0079fae5346f5033e4c670f47 Mon Sep 17 00:00:00 2001
From: lijiawei
Date: Sun, 15 May 2022 18:22:26 +0800
Subject: [PATCH 01/10] purgeable mem with CONFIG

Signed-off-by: Chengke Wang
Signed-off-by: lijiawei
---
 fs/proc/meminfo.c                      |  17 +-
 include/linux/mm.h                     |  16 ++
 include/linux/mm_inline.h              |   3 +
 include/linux/mm_purgeable.h           |  47 +++++
 include/linux/mm_types.h               |   4 +
 include/linux/mman.h                   |   5 +
 include/linux/mmzone.h                 |  14 +-
 include/linux/page-flags.h             |   8 +
 include/trace/events/mmflags.h         |   7 +
 include/uapi/asm-generic/mman-common.h |   3 +
 kernel/fork.c                          |   3 +
 mm/Kconfig                             |   7 +
 mm/Makefile                            |   1 +
 mm/memory.c                            |  18 ++
 mm/mmap.c                              |   6 +-
 mm/purgeable.c                         | 226 +++++++++++++++++++++++++
 mm/rmap.c                              |  16 +-
 mm/vmscan.c                            |  84 ++++++++-
 mm/vmstat.c                            |   8 +
 19 files changed, 483 insertions(+), 10 deletions(-)
 create mode 100644 include/linux/mm_purgeable.h
 create mode 100644 mm/purgeable.c

diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 248e0afeac94..0ebced4b78c6 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -38,6 +38,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 	unsigned long pages[NR_LRU_LISTS];
 	unsigned long sreclaimable, sunreclaim;
 	int lru;
+	unsigned long nr_purgeable_active = 0;
+	unsigned long nr_purgeable_inactive = 0;

 	si_meminfo(&i);
 	si_swapinfo(&i);
@@ -51,6 +53,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 	for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
 		pages[lru] = global_node_page_state(NR_LRU_BASE + lru);

+#ifdef CONFIG_MEM_PURGEABLE
+	nr_purgeable_active = pages[LRU_ACTIVE_PURGEABLE];
+	nr_purgeable_inactive = pages[LRU_INACTIVE_PURGEABLE];
+#endif
+
 	available = si_mem_available();
 	sreclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B);
 	sunreclaim = global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B);
@@ -62,13 +69,19 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 	show_val_kb(m, "Cached:         ", cached);
 	show_val_kb(m, "SwapCached:     ", total_swapcache_pages());
 	show_val_kb(m, "Active:         ", pages[LRU_ACTIVE_ANON] +
-					   pages[LRU_ACTIVE_FILE]);
+					   pages[LRU_ACTIVE_FILE] +
+					   nr_purgeable_active);
 	show_val_kb(m, "Inactive:       ", pages[LRU_INACTIVE_ANON] +
-					   pages[LRU_INACTIVE_FILE]);
+					   pages[LRU_INACTIVE_FILE] +
+					   nr_purgeable_inactive);
 	show_val_kb(m, "Active(anon):   ", pages[LRU_ACTIVE_ANON]);
 	show_val_kb(m, "Inactive(anon): ", pages[LRU_INACTIVE_ANON]);
 	show_val_kb(m, "Active(file):   ", pages[LRU_ACTIVE_FILE]);
 	show_val_kb(m, "Inactive(file): ", pages[LRU_INACTIVE_FILE]);
+#ifdef CONFIG_MEM_PURGEABLE
+	show_val_kb(m, "Active(purgeable):   ", nr_purgeable_active);
+	show_val_kb(m, "Inactive(purgeable): ", nr_purgeable_inactive);
+#endif
 	show_val_kb(m, "Unevictable:    ", pages[LRU_UNEVICTABLE]);
 	show_val_kb(m, "Mlocked:        ", global_zone_page_state(NR_MLOCK));
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c9b37ce2db0b..d49bc29eab99 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -304,13 +304,29 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_HIGH_ARCH_BIT_2	34	/* bit only usable on 64-bit architectures */
 #define VM_HIGH_ARCH_BIT_3	35	/* bit only usable on 64-bit architectures */
 #define VM_HIGH_ARCH_BIT_4	36	/* bit only usable on 64-bit architectures */
+#define VM_HIGH_ARCH_BIT_5	37	/* bit only usable on 64-bit architectures */
+#define VM_HIGH_ARCH_BIT_6	38	/* bit only usable on 64-bit architectures */
+#define VM_HIGH_ARCH_BIT_7	39	/* bit only usable on 64-bit architectures */
 #define VM_HIGH_ARCH_0	BIT(VM_HIGH_ARCH_BIT_0)
 #define VM_HIGH_ARCH_1	BIT(VM_HIGH_ARCH_BIT_1)
 #define VM_HIGH_ARCH_2	BIT(VM_HIGH_ARCH_BIT_2)
 #define VM_HIGH_ARCH_3	BIT(VM_HIGH_ARCH_BIT_3)
 #define VM_HIGH_ARCH_4	BIT(VM_HIGH_ARCH_BIT_4)
+#define VM_HIGH_ARCH_5	BIT(VM_HIGH_ARCH_BIT_5)
+#define VM_HIGH_ARCH_6	BIT(VM_HIGH_ARCH_BIT_6)
+#define VM_HIGH_ARCH_7	BIT(VM_HIGH_ARCH_BIT_7)
 #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */

+#ifdef CONFIG_MEM_PURGEABLE
+#define VM_PURGEABLE	VM_HIGH_ARCH_5
+#define VM_USERPTE	VM_HIGH_ARCH_6
+#define VM_USEREXPTE	VM_HIGH_ARCH_7
+#else /* CONFIG_MEM_PURGEABLE */
+#define VM_PURGEABLE	0
+#define VM_USERPTE	0
+#define VM_USEREXPTE	0
+#endif /* CONFIG_MEM_PURGEABLE */
+
 #ifdef CONFIG_ARCH_HAS_PKEYS
 # define VM_PKEY_SHIFT	VM_HIGH_ARCH_BIT_0
 # define VM_PKEY_BIT0	VM_HIGH_ARCH_0	/* A protection key is a 4-bit value */
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 8fc71e9d7bb0..d0e326dabb33 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -76,6 +76,9 @@ static __always_inline void del_page_from_lru_list(struct page *page,
  */
 static inline enum lru_list page_lru_base_type(struct page *page)
 {
+	if (PagePurgeable(page))
+		return LRU_INACTIVE_PURGEABLE;
+
 	if (page_is_file_lru(page))
 		return LRU_INACTIVE_FILE;
 	return LRU_INACTIVE_ANON;
diff --git a/include/linux/mm_purgeable.h b/include/linux/mm_purgeable.h
new file mode 100644
index 000000000000..1bf725ad0435
--- /dev/null
+++ b/include/linux/mm_purgeable.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2022 Huawei Device Co., Ltd.
+ */
+
+#ifndef __MM_PURGEABLE_MEM_H
+#define __MM_PURGEABLE_MEM_H
+
+#ifdef CONFIG_MEM_PURGEABLE
+
+void mm_init_expgd(struct mm_struct *mm);
+void mm_clear_expgd(struct mm_struct *mm);
+int lock_userexpte(struct vm_area_struct *vma, unsigned long addr);
+void unlock_userexpte(struct vm_area_struct *vma, unsigned long addr);
+vm_fault_t do_purgeable_page_fault(struct vm_fault *vmf, struct page *page,
+	pte_t *entry);
+int userexpte_setpresent(struct vm_area_struct *vma, unsigned long addr);
+
+#else /* CONFIG_MEM_PURGEABLE */
+
+static inline void mm_init_expgd(struct mm_struct *mm) {}
+
+static inline void mm_clear_expgd(struct mm_struct *mm) {}
+
+static inline int lock_userexpte(struct vm_area_struct *vma,
+	unsigned long addr)
+{
+	return 0;
+}
+
+static inline void unlock_userexpte(struct vm_area_struct *vma,
+	unsigned long addr) {}
+
+static inline vm_fault_t do_purgeable_page_fault(struct vm_fault *vmf,
+	struct page *page, pte_t *entry)
+{
+	return 0;
+}
+
+static inline int userexpte_setpresent(struct vm_area_struct *vma,
+	unsigned long addr)
+{
+	return 0;
+}
+#endif /* CONFIG_MEM_PURGEABLE */
+#endif /* __MM_PURGEABLE_MEM_H */
+
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index f2a7fc951350..7742c221efa1 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -422,6 +422,10 @@ struct mm_struct {
 		unsigned long task_size;	/* size of task vm space */
 		unsigned long highest_vm_end;	/* highest vma end address */
 		pgd_t * pgd;
+#ifdef CONFIG_MEM_PURGEABLE
+		void *expgd;
+		spinlock_t expgd_lock;
+#endif

 #ifdef CONFIG_MEMBARRIER
 		/**
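The two fields added to mm_struct above are deliberately opaque: expgd is a bare void * that mm/purgeable.c (added later in this patch) lazily points at a struct radix_tree_root, keyed by an aligned user base address, with each slot holding one "user extended PTE" (uxpte) page of atomic64_t counters. A sketch of that invariant, assuming 4 KiB pages and 8-byte entries; the struct and names below are illustrative only, they do not appear in the patch:

	/*
	 * Illustration only: models what one slot of mm->expgd
	 * (really a struct radix_tree_root) holds.
	 */
	#define UXPTE_PER_PAGE	(PAGE_SIZE / sizeof(atomic64_t))	/* 512 with 4 KiB pages */

	struct uxpte_window_model {
		unsigned long base;	/* radix-tree key: addr rounded down to a 2 MiB window */
		atomic64_t *uxpte;	/* page_to_virt() of the page in the slot;
					 * entry i tracks user page (base + i * PAGE_SIZE):
					 * bit 0 set = present, value -2 = locked by reclaim */
	};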
diff --git a/include/linux/mman.h b/include/linux/mman.h
index 629cefc4ecba..8f814b3396e1 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -154,6 +154,11 @@ calc_vm_flag_bits(unsigned long flags)
 	       _calc_vm_trans(flags, MAP_DENYWRITE,  VM_DENYWRITE ) |
 	       _calc_vm_trans(flags, MAP_LOCKED,     VM_LOCKED    ) |
 	       _calc_vm_trans(flags, MAP_SYNC,       VM_SYNC      ) |
+#ifdef CONFIG_MEM_PURGEABLE
+	       _calc_vm_trans(flags, MAP_PURGEABLE,  VM_PURGEABLE ) |
+	       _calc_vm_trans(flags, MAP_USERPTE,    VM_USERPTE   ) |
+	       _calc_vm_trans(flags, MAP_USEREXPTE,  VM_USEREXPTE ) |
+#endif
 	       arch_calc_vm_flag_bits(flags);
 }
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index d66cecefa84f..a1c1d1c0baaa 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -158,6 +158,10 @@ enum zone_stat_item {
 	NR_ZONE_ACTIVE_ANON,
 	NR_ZONE_INACTIVE_FILE,
 	NR_ZONE_ACTIVE_FILE,
+#ifdef CONFIG_MEM_PURGEABLE
+	NR_ZONE_INACTIVE_PURGEABLE,
+	NR_ZONE_ACTIVE_PURGEABLE,
+#endif
 	NR_ZONE_UNEVICTABLE,
 	NR_ZONE_WRITE_PENDING,	/* Count of dirty, writeback and unstable pages */
 	NR_MLOCK,		/* mlock()ed pages found and moved off LRU */
@@ -179,6 +183,10 @@ enum node_stat_item {
 	NR_ACTIVE_ANON,		/*  "     "     "   "       "         */
 	NR_INACTIVE_FILE,	/*  "     "     "   "       "         */
 	NR_ACTIVE_FILE,		/*  "     "     "   "       "         */
+#ifdef CONFIG_MEM_PURGEABLE
+	NR_INACTIVE_PURGEABLE,
+	NR_ACTIVE_PURGEABLE,
+#endif
 	NR_UNEVICTABLE,		/*  "     "     "   "       "         */
 	NR_SLAB_RECLAIMABLE_B,
 	NR_SLAB_UNRECLAIMABLE_B,
@@ -254,12 +262,15 @@ static __always_inline bool vmstat_item_in_bytes(int idx)
 #define LRU_BASE 0
 #define LRU_ACTIVE 1
 #define LRU_FILE 2
+#define LRU_PURGEABLE 4

 enum lru_list {
 	LRU_INACTIVE_ANON = LRU_BASE,
 	LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE,
 	LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
 	LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE,
+	LRU_INACTIVE_PURGEABLE = LRU_BASE + LRU_PURGEABLE,
+	LRU_ACTIVE_PURGEABLE = LRU_BASE + LRU_PURGEABLE + LRU_ACTIVE,
 	LRU_UNEVICTABLE,
 	NR_LRU_LISTS
 };
@@ -275,7 +286,8 @@ static inline bool is_file_lru(enum lru_list lru)

 static inline bool is_active_lru(enum lru_list lru)
 {
-	return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE);
+	return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE ||
+		lru == LRU_ACTIVE_PURGEABLE);
 }

 #define ANON_AND_FILE 2
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index a6446a50c39f..dcf83c01f57b 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -142,6 +142,9 @@ enum pageflags {
 #ifdef CONFIG_PAGE_TRACING
 	PG_skb,
 	PG_zspage,
+#endif
+#ifdef CONFIG_MEM_PURGEABLE
+	PG_purgeable,
 #endif
 	__NR_PAGEFLAGS,

@@ -461,6 +464,11 @@ PAGEFLAG(Idle, idle, PF_ANY)
  */
 __PAGEFLAG(Reported, reported, PF_NO_COMPOUND)

+#ifdef CONFIG_MEM_PURGEABLE
+PAGEFLAG(Purgeable, purgeable, PF_ANY)
+#else
+PAGEFLAG_FALSE(Purgeable)
+#endif
 /*
  * On an anonymous page mapped into a user virtual memory area,
  * page->mapping points to its anon_vma, not to a struct address_space;
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 67018d367b9f..2332482f7df7 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -55,6 +55,12 @@
 	__def_gfpflag_names						\
 	) : "none"

+#ifdef CONFIG_MEM_PURGEABLE
+#define IF_HAVE_PG_PURGEABLE(flag,string) ,{1UL << flag, string}
+#else
+#define IF_HAVE_PG_PURGEABLE(flag,string)
+#endif
+
 #ifdef CONFIG_MMU
 #define IF_HAVE_PG_MLOCK(flag,string) ,{1UL << flag, string}
 #else
@@ -107,6 +113,7 @@
 	{1UL << PG_reclaim,		"reclaim"	},		\
 	{1UL << PG_swapbacked,		"swapbacked"	},		\
 	{1UL << PG_unevictable,		"unevictable"	}		\
+IF_HAVE_PG_PURGEABLE(PG_purgeable,	"purgeable"	)		\
 IF_HAVE_PG_MLOCK(PG_mlocked,		"mlocked"	)		\
 IF_HAVE_PG_UNCACHED(PG_uncached,	"uncached"	)		\
 IF_HAVE_PG_HWPOISON(PG_hwpoison,	"hwpoison"	)		\
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index f94f65d429be..5487e3f01773 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -21,6 +21,9 @@
 #define MAP_TYPE	0x0f		/* Mask for type of mapping */
 #define MAP_FIXED	0x10		/* Interpret addr exactly */
 #define MAP_ANONYMOUS	0x20		/* don't use a file */
+#define MAP_PURGEABLE	0x40
+#define MAP_USERPTE	0x80
+#define MAP_USEREXPTE	0x100

 /* 0x0100 - 0x4000 flags are defined in asm-generic/mman.h */
 #define MAP_POPULATE		0x008000	/* populate (prefault) pagetables */
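With the three MAP_* values wired into calc_vm_flag_bits() above, a purgeable region can be requested from user space like any other anonymous mapping. A minimal, hedged usage sketch (the flag value is the one this patch defines; error handling is reduced to the essentials):

	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>

	#ifndef MAP_PURGEABLE
	#define MAP_PURGEABLE 0x40	/* from the patched mman-common.h */
	#endif

	int main(void)
	{
		size_t len = 16 * 4096;
		/* Anonymous private mapping, additionally marked purgeable. */
		char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS | MAP_PURGEABLE, -1, 0);
		if (buf == MAP_FAILED) {
			perror("mmap");
			return 1;
		}
		memset(buf, 0x5a, len);	/* fault pages in; the kernel sets PG_purgeable */
		munmap(buf, len);
		return 0;
	}

Note that in this first patch an unknown MAP_ bit is silently ignored on kernels without CONFIG_MEM_PURGEABLE; an explicit -EINVAL guard only appears in patch 03.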
diff --git a/kernel/fork.c b/kernel/fork.c
index 7b3ce57416cd..9d537dcb5a96 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -99,6 +99,7 @@
 #ifdef CONFIG_RECLAIM_ACCT
 #include <linux/reclaim_acct.h>
 #endif
+#include <linux/mm_purgeable.h>

 #include <asm/pgalloc.h>
 #include <linux/uaccess.h>
@@ -625,6 +626,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,

 static inline int mm_alloc_pgd(struct mm_struct *mm)
 {
+	mm_init_expgd(mm);
 	mm->pgd = pgd_alloc(mm);
 	if (unlikely(!mm->pgd))
 		return -ENOMEM;
@@ -634,6 +636,7 @@ static inline int mm_alloc_pgd(struct mm_struct *mm)
 static inline void mm_free_pgd(struct mm_struct *mm)
 {
 	pgd_free(mm, mm->pgd);
+	mm_clear_expgd(mm);
 }
 #else
 static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
diff --git a/mm/Kconfig b/mm/Kconfig
index 68aaed4cdc9f..36ef0bed11ea 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -939,4 +939,11 @@ config LMKD_DBG
 	help
 	  print processes info when lmk happen per several seconds

+config MEM_PURGEABLE
+	bool "Purgeable memory feature"
+	default n
+	select ARCH_USES_HIGH_VMA_FLAGS
+	help
+	  Support purgeable pages for processes
+
 endmenu
diff --git a/mm/Makefile b/mm/Makefile
index 6b1897637368..efef809522b4 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -126,3 +126,4 @@ obj-$(CONFIG_HYPERHOLD_FILE_LRU) += memcg_reclaim.o
 obj-$(CONFIG_HYPERHOLD_MEMCG) += memcg_control.o
 obj-$(CONFIG_HYPERHOLD_ZSWAPD) += zswapd.o zswapd_control.o
 obj-$(CONFIG_RECLAIM_ACCT) += reclaim_acct.o reclaimacct_show.o
+obj-$(CONFIG_MEM_PURGEABLE) += purgeable.o
diff --git a/mm/memory.c b/mm/memory.c
index 4fe24cd865a7..c39350ac179f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -73,6 +73,7 @@
 #include <linux/perf_event.h>
 #include <linux/ptrace.h>
 #include <linux/vmalloc.h>
+#include <linux/mm_purgeable.h>

 #include <trace/events/kmem.h>

@@ -1236,6 +1237,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			struct page *page;

 			page = vm_normal_page(vma, addr, ptent);
+			if (vma->vm_flags & (VM_USERPTE | VM_USEREXPTE))
+				page = NULL;
 			if (unlikely(details) && page) {
 				/*
 				 * unmap_shared_mapping_pages() wants to
@@ -3555,11 +3558,20 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 	if (unlikely(pmd_trans_unstable(vmf->pmd)))
 		return 0;

+	/* use extra page table for userexpte */
+	if (vma->vm_flags & VM_USEREXPTE) {
+		ret = do_purgeable_page_fault(vmf, page, &entry);
+		if (ret == VM_FAULT_OOM)
+			goto oom;
+		else
+			goto got_page;
+	}
 	/* Use the zero-page for reads */
 	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
 			!mm_forbids_zeropage(vma->vm_mm)) {
 		entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
 						vma->vm_page_prot));
+got_page:
 		vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
 				vmf->address, &vmf->ptl);
 		if (!pte_none(*vmf->pte)) {
@@ -3620,6 +3632,12 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 	inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
 	page_add_new_anon_rmap(page, vma, vmf->address, false);
+	if (vma->vm_flags & VM_PURGEABLE) {
+		pr_info("set page %lx purgeable\n", page_to_pfn(page));
+		SetPagePurgeable(page);
+		ClearPageSwapBacked(page);
+		userexpte_setpresent(vma, vmf->address);
+	}
 	lru_cache_add_inactive_or_unevictable(page, vma);
 setpte:
 	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
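The do_anonymous_page() changes above split the anonymous fault path three ways. A standalone model of that decision, simplified to plain C so it runs outside the kernel (the enum and function are illustrative only; the three outcomes mirror the hunk):

	#include <stdbool.h>
	#include <stdio.h>

	enum fault_path { PATH_UXPTE_TABLE, PATH_ZERO_PAGE, PATH_NEW_PAGE };

	/* toy restatement of the control flow added to do_anonymous_page() */
	static enum fault_path anon_fault_path(bool vm_userexpte, bool write_fault)
	{
		if (vm_userexpte)	/* VM_USEREXPTE: backed by the extra page table */
			return PATH_UXPTE_TABLE;	/* do_purgeable_page_fault() */
		if (!write_fault)
			return PATH_ZERO_PAGE;	/* read fault: shared zero page */
		return PATH_NEW_PAGE;	/* allocate; SetPagePurgeable() if VM_PURGEABLE */
	}

	int main(void)
	{
		printf("%d %d %d\n",
		       anon_fault_path(true, false),
		       anon_fault_path(false, false),
		       anon_fault_path(false, true));
		return 0;
	}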
diff --git a/mm/mmap.c b/mm/mmap.c
index 1f13b3069cb1..0587e2274a98 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1558,14 +1558,16 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 			/*
 			 * Ignore pgoff.
 			 */
-			pgoff = 0;
+			if (!(flags & (MAP_USERPTE | MAP_USEREXPTE)))
+				pgoff = 0;
 			vm_flags |= VM_SHARED | VM_MAYSHARE;
 			break;
 		case MAP_PRIVATE:
 			/*
 			 * Set pgoff according to addr for anon_vma.
 			 */
-			pgoff = addr >> PAGE_SHIFT;
+			if (!(flags & (MAP_USERPTE | MAP_USEREXPTE)))
+				pgoff = addr >> PAGE_SHIFT;
 			break;
 		default:
 			return -EINVAL;
diff --git a/mm/purgeable.c b/mm/purgeable.c
new file mode 100644
index 000000000000..68edf5cd77e3
--- /dev/null
+++ b/mm/purgeable.c
@@ -0,0 +1,226 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2022 Huawei Device Co., Ltd.
+ */
+
+#include <linux/mm.h>
+#include <linux/mm_types.h>
+#include <linux/radix-tree.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/gfp.h>
+
+#include <linux/mm_purgeable.h>
+
+void mm_init_expgd(struct mm_struct *mm)
+{
+	mm->expgd = NULL;
+	spin_lock_init(&mm->expgd_lock);
+}
+
+void mm_clear_expgd(struct mm_struct *mm)
+{
+	struct page *page = NULL;
+	void **slot = NULL;
+	struct radix_tree_iter iter;
+
+	spin_lock(&mm->expgd_lock);
+	if (!mm->expgd)
+		goto out;
+	radix_tree_for_each_slot(slot, mm->expgd, &iter, 0) {
+		page = radix_tree_lookup(mm->expgd, iter.index);
+		put_page(page);
+	}
+out:
+	spin_unlock(&mm->expgd_lock);
+}
+
+int lock_userexpte(struct vm_area_struct *vma, unsigned long addr)
+{
+	unsigned long base = addr & ~(PAGE_SIZE / sizeof(unsigned long) *
+		PAGE_SIZE - 1);
+	unsigned long offset = (addr - base) / PAGE_SIZE;
+	atomic64_t *uxpte = NULL;
+	struct page *page = NULL;
+	long val = 0;
+
+	spin_lock(&vma->vm_mm->expgd_lock);
+	if (!vma->vm_mm->expgd)
+		goto unlock;
+	page = radix_tree_lookup(vma->vm_mm->expgd, base);
+	if (!page)
+		goto unlock;
+	uxpte = page_to_virt(page);
+retry:
+	val = atomic64_read(&uxpte[offset]);
+	if (val >> 1)
+		goto unlock;
+	if (atomic64_cmpxchg(&uxpte[offset], val, -2) != val)
+		goto retry;
+	val = -2;
+unlock:
+	spin_unlock(&vma->vm_mm->expgd_lock);
+	pr_info("lock uxpte of addr %lx is %lx\n", addr, val);
+
+	return (val == -2) ? 1 : 0;
+}
+
+void unlock_userexpte(struct vm_area_struct *vma, unsigned long addr)
+{
+	unsigned long base = addr & ~(PAGE_SIZE / sizeof(unsigned long) *
+		PAGE_SIZE - 1);
+	unsigned long offset = (addr - base) / PAGE_SIZE;
+	atomic64_t *uxpte = NULL;
+	struct page *page = NULL;
+
+	spin_lock(&vma->vm_mm->expgd_lock);
+	if (!vma->vm_mm->expgd)
+		goto unlock;
+	page = radix_tree_lookup(vma->vm_mm->expgd, base);
+	if (!page)
+		goto unlock;
+	uxpte = page_to_virt(page);
+	atomic64_set(&uxpte[offset], 0);
+unlock:
+	spin_unlock(&vma->vm_mm->expgd_lock);
+	pr_info("unlock uxpte of addr %lx is %lx\n", addr, 0UL);
+}
+
+vm_fault_t do_purgeable_page_fault(struct vm_fault *vmf, struct page *page,
+	pte_t *entry)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	unsigned long offset = (vmf->address - vma->vm_start + vma->vm_pgoff *
+		PAGE_SIZE) / sizeof(pte_t) * PAGE_SIZE;
+	struct page *dup = NULL;
+
+	if (unlikely(anon_vma_prepare(vma)))
+		goto oom;
+	if (current->mm->expgd)
+		goto lookup;
+	spin_lock(&current->mm->expgd_lock);
+	if (!current->mm->expgd)
+		current->mm->expgd = kzalloc(sizeof(struct radix_tree_root),
+			GFP_KERNEL);
+	if (current->mm->expgd)
+		INIT_RADIX_TREE(current->mm->expgd, GFP_KERNEL);
+	spin_unlock(&current->mm->expgd_lock);
+	if (!current->mm->expgd) {
+		pr_err("expgd alloc failed.\n");
+		goto oom;
+	}
+lookup:
+	page = radix_tree_lookup(current->mm->expgd, offset);
+	if (page)
+		goto make_pte;
+	page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
+	if (!page)
+		goto oom;
+	if (radix_tree_preload(GFP_KERNEL)) {
+		put_page(page);
+		pr_err("radix preload fail.\n");
+		goto oom;
+	}
+	spin_lock(&current->mm->expgd_lock);
+	dup = radix_tree_lookup(current->mm->expgd, offset);
+	if (dup) {
+		put_page(page);
+		page = dup;
+	} else {
+		radix_tree_insert(current->mm->expgd, offset, page);
+	}
+	spin_unlock(&current->mm->expgd_lock);
+	radix_tree_preload_end();
+make_pte:
+	*entry = mk_pte(page, vma->vm_page_prot);
+	*entry = pte_sw_mkyoung(*entry);
+	if (vma->vm_flags & VM_WRITE)
+		*entry = pte_mkwrite(pte_mkdirty(*entry));
+	return 0;
+oom:
+	return VM_FAULT_OOM;
+}
+
+static struct page *lookup_expte_page(struct vm_area_struct *vma,
+	unsigned long addr, bool alloc)
+{
+	struct radix_tree_root *expgd = NULL;
+	struct page *page = NULL;
+	struct page *new_page = NULL;
+	struct mm_struct *mm = vma->vm_mm;
+
+	if (mm->expgd)
+		goto lookup;
+	if (!alloc)
+		goto out;
+	spin_unlock(&mm->expgd_lock);
+	expgd = kzalloc(sizeof(struct radix_tree_root), GFP_KERNEL);
+	if (!expgd) {
+		pr_err("expgd alloc failed.\n");
+		spin_lock(&mm->expgd_lock);
+		goto out;
+	}
+	INIT_RADIX_TREE(expgd, GFP_KERNEL);
+	spin_lock(&mm->expgd_lock);
+	if (mm->expgd)
+		kfree(expgd);
+	else
+		mm->expgd = expgd;
+lookup:
+	page = radix_tree_lookup(mm->expgd, addr);
+	if (page)
+		goto out;
+	if (!alloc)
+		goto out;
+	spin_unlock(&mm->expgd_lock);
+	new_page = alloc_zeroed_user_highpage_movable(vma, addr);
+	if (!new_page) {
+		pr_err("expte page alloc fail.\n");
+		spin_lock(&mm->expgd_lock);
+		goto out;
+	}
+	if (radix_tree_preload(GFP_KERNEL)) {
+		put_page(new_page);
+		pr_err("radix preload fail.\n");
+		spin_lock(&mm->expgd_lock);
+		goto out;
+	}
+	spin_lock(&mm->expgd_lock);
+	page = radix_tree_lookup(mm->expgd, addr);
+	if (page) {
+		put_page(new_page);
+	} else {
+		page = new_page;
+		radix_tree_insert(mm->expgd, addr, page);
+	}
+	radix_tree_preload_end();
+out:
+	return page;
+}
+
+int userexpte_setpresent(struct vm_area_struct *vma, unsigned long addr)
+{
+	unsigned long base = addr & ~(PAGE_SIZE / sizeof(unsigned long) * PAGE_SIZE - 1);
+	unsigned long offset = (addr - base) / PAGE_SIZE;
+	atomic64_t *uxpte = NULL;
+	struct page *page = NULL;
+	long val = 0;
+
+	spin_lock(&vma->vm_mm->expgd_lock);
+	page = lookup_expte_page(vma, base, true);
+	if (!page)
+		goto unlock;
+	uxpte = page_to_virt(page);
+retry:
+	val = atomic64_read(&uxpte[offset]);
+	if (val & 1)
+		goto unlock;
+	if (atomic64_cmpxchg(&uxpte[offset], val, val + 1) != val)
+		goto retry;
+	val++;
+unlock:
+	spin_unlock(&vma->vm_mm->expgd_lock);
+	pr_info("set present uxpte of addr %lx is %lx\n", addr, val);
+
+	return 0;
+}
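lock_userexpte() and userexpte_setpresent() above implement a tiny lock-free state machine on each 64-bit uxpte entry: bit 0 is the "present" bit the owner sets at fault time, and the whole entry is swapped to -2 when reclaim wants exclusive ownership (any non-zero val >> 1 means the entry is already claimed). A runnable userspace model of the same cmpxchg protocol, with C11 atomics standing in for the kernel's atomic64_* helpers:

	#include <stdatomic.h>
	#include <stdio.h>

	static _Atomic long uxpte;	/* models one atomic64_t entry */

	/* mirror of userexpte_setpresent(): set bit 0 unless already set */
	static void set_present(void)
	{
		long val = atomic_load(&uxpte);

		while (!(val & 1) &&
		       !atomic_compare_exchange_weak(&uxpte, &val, val + 1))
			;
	}

	/* mirror of lock_userexpte(): claim the entry by writing -2;
	 * fails if any bits above bit 0 are set (val >> 1 != 0) */
	static int lock_entry(void)
	{
		long val;

		do {
			val = atomic_load(&uxpte);
			if (val >> 1)
				return 0;	/* already locked/held */
		} while (!atomic_compare_exchange_weak(&uxpte, &val, -2));
		return 1;
	}

	int main(void)
	{
		set_present();	/* entry = 1 */
		printf("lock: %d (entry %ld)\n", lock_entry(), atomic_load(&uxpte));
		printf("relock: %d\n", lock_entry());	/* fails: -2 >> 1 != 0 */
		return 0;
	}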
diff --git a/mm/rmap.c b/mm/rmap.c
index cdf549f6f617..535d20474835 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -72,6 +72,7 @@
 #include <linux/page_idle.h>
 #include <linux/memremap.h>
 #include <linux/userfaultfd_k.h>
+#include <linux/mm_purgeable.h>

 #include <asm/tlbflush.h>

@@ -1443,6 +1444,14 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 			continue;
 		}
 #endif
+#ifdef CONFIG_MEM_PURGEABLE
+		if (PagePurgeable(page) && !lock_userexpte(vma, address)) {
+			ret = false;
+			page_vma_mapped_walk_done(&pvmw);
+			pr_info("uxpte hold purgeable page %lx\n", page_to_pfn(page));
+			break;
+		}
+#endif

 		/*
 		 * If the page is mlock()d, we cannot swap it out.
@@ -1584,7 +1593,12 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 				set_pte_at(mm, address, pvmw.pte, pteval);
 			}

-		} else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
+		} else if (PagePurgeable(page) || (pte_unused(pteval) &&
+				!userfaultfd_armed(vma))) {
+			if (PagePurgeable(page)) {
+				unlock_userexpte(vma, address);
+				pr_info("unmap purgeable page %lx\n", page_to_pfn(page));
+			}
 			/*
 			 * The guest indicated that the page content is of no
 			 * interest anymore. Simply discard the pte, vmscan
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5371b75ff477..09caa2770b03 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1030,6 +1030,9 @@ unsigned int shrink_page_list(struct list_head *page_list,
 		page = lru_to_page(page_list);
 		list_del(&page->lru);

+		if (PagePurgeable(page))
+			pr_info("try to reclaim purgeable page %lx\n", page_to_pfn(page));
+
 		if (!trylock_page(page))
 			goto keep;

@@ -1152,7 +1155,7 @@ unsigned int shrink_page_list(struct list_head *page_list,
 			}
 		}

-		if (!ignore_references)
+		if (!ignore_references && !PagePurgeable(page))
 			references = page_check_references(page, sc);

 		switch (references) {
@@ -1247,7 +1250,7 @@ unsigned int shrink_page_list(struct list_head *page_list,
 			}
 		}

-		if (PageDirty(page)) {
+		if (PageDirty(page) && !PagePurgeable(page)) {
 			/*
 			 * Only kswapd can writeback filesystem pages
 			 * to avoid risk of stack overflow. But avoid
@@ -1359,7 +1362,7 @@ unsigned int shrink_page_list(struct list_head *page_list,
 			/* follow __remove_mapping for reference */
 			if (!page_ref_freeze(page, 1))
 				goto keep_locked;
-			if (PageDirty(page)) {
+			if (PageDirty(page) && !PagePurgeable(page)) {
 				page_ref_unfreeze(page, 1);
 				goto keep_locked;
 			}
@@ -2130,7 +2133,8 @@ unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
 	reclaimacct_substage_start(stub);
 #endif
 	if (is_active_lru(lru)) {
-		if (sc->may_deactivate & (1 << is_file_lru(lru)))
+		if (sc->may_deactivate & (1 << is_file_lru(lru))
+				|| lru == LRU_ACTIVE_PURGEABLE)
 			shrink_active_list(nr_to_scan, lruvec, sc, lru);
 		else
 			sc->skipped_deactivate = 1;
@@ -4084,6 +4088,8 @@ void kswapd_stop(int nid)
 	}
 }

+static void __init purgeable_init(void);
+
 static int __init kswapd_init(void)
 {
 	int nid;
@@ -4091,6 +4097,8 @@ static int __init kswapd_init(void)
 	swap_setup();
 	for_each_node_state(nid, N_MEMORY)
 		kswapd_run(nid);
+
+	purgeable_init();
 	return 0;
 }

@@ -4339,3 +4347,71 @@ void check_move_unevictable_pages(struct pagevec *pvec)
 	}
 }
 EXPORT_SYMBOL_GPL(check_move_unevictable_pages);
+
+static unsigned long purgeable_node(pg_data_t *pgdata, struct scan_control *sc)
+{
+	struct mem_cgroup *memcg = NULL;
+	unsigned long nr = 0;
+
+	while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL) {
+		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdata);
+
+		shrink_list(LRU_ACTIVE_PURGEABLE, -1, lruvec, sc);
+		nr += shrink_list(LRU_INACTIVE_PURGEABLE, -1, lruvec, sc);
+	}
+
+	pr_info("reclaim %lu purgeable pages.\n", nr);
+
+	return nr;
+}
+
+static int purgeable(struct ctl_table *table, int write, void *buffer,
+	size_t *lenp, loff_t *ppos)
+{
+	struct scan_control sc = {
+		.gfp_mask = GFP_KERNEL,
+		.order = 0,
+		.priority = DEF_PRIORITY,
+		.may_deactivate = 1,
+		.may_writepage = 1,
+		.may_unmap = 1,
+		.may_swap = 1,
+		.reclaim_idx = MAX_NR_ZONES - 1,
+	};
+	int nid = 0;
+
+	for_each_node_state(nid, N_MEMORY)
+		purgeable_node(NODE_DATA(nid), &sc);
+	return 0;
+}
+
+static struct ctl_table ker_tab[] = {
+	{
+		.procname = "purgeable",
+		.mode = 0200,
+		.proc_handler = purgeable,
+	},
+	{},
+};
+
+static struct ctl_table sys_tab[] = {
+	{
+		.procname = "kernel",
+		.mode = 0555,
+		.child = ker_tab,
+	},
+	{},
+};
+
+static struct ctl_table_header *purgeable_header;
+
+static void __init purgeable_init(void)
+{
+	purgeable_header = register_sysctl_table(sys_tab);
+	if (!purgeable_header)
+		pr_err("register purgeable sysctl table failed.\n");
+}
+
+static void __exit purgeable_exit(void)
+{
+	unregister_sysctl_table(purgeable_header);
+}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 5b9b46f42f40..3e89021a3f75 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1154,6 +1154,10 @@ const char * const vmstat_text[] = {
 	"nr_zone_active_anon",
 	"nr_zone_inactive_file",
 	"nr_zone_active_file",
+#ifdef CONFIG_MEM_PURGEABLE
+	"nr_zone_inactive_purgeable",
+	"nr_zone_active_purgeable",
+#endif
 	"nr_zone_unevictable",
 	"nr_zone_write_pending",
 	"nr_mlock",
@@ -1182,6 +1186,10 @@ const char * const vmstat_text[] = {
 	"nr_active_anon",
 	"nr_inactive_file",
 	"nr_active_file",
+#ifdef CONFIG_MEM_PURGEABLE
+	"nr_inactive_purgeable",
+	"nr_active_purgeable",
+#endif
 	"nr_unevictable",
 	"nr_slab_reclaimable",
 	"nr_slab_unreclaimable",
--
Gitee
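The sysctl registered above exposes the whole mechanism as a write-only knob, /proc/sys/kernel/purgeable (mode 0200): any write walks every memory node and shrinks both purgeable LRU lists. A minimal trigger from user space (requires root, and a kernel built with CONFIG_MEM_PURGEABLE=y; the handler never reads the written value, so the payload is arbitrary):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		/* Path comes from the ctl_table above: "kernel" + "purgeable". */
		int fd = open("/proc/sys/kernel/purgeable", O_WRONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		/* Any write starts the per-node purgeable reclaim walk. */
		if (write(fd, "1", 1) < 0)
			perror("write");
		close(fd);
		return 0;
	}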
From c55e26ef3f64b7c376e270ad9c09e4377662c8b6 Mon Sep 17 00:00:00 2001
From: Chengke Wang
Date: Fri, 20 May 2022 10:35:56 +0800
Subject: [PATCH 02/10] use VM_PURGEABLE instead of PagePurgeable

---
 mm/memory.c |  1 -
 mm/rmap.c   | 17 +++++++++++++----
 mm/vmscan.c | 22 ++++++++++++++--------
 3 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index c39350ac179f..74bd65dbc1ed 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3635,7 +3635,6 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 	if (vma->vm_flags & VM_PURGEABLE) {
 		pr_info("set page %lx purgeable\n", page_to_pfn(page));
 		SetPagePurgeable(page);
-		ClearPageSwapBacked(page);
 		userexpte_setpresent(vma, vmf->address);
 	}
 	lru_cache_add_inactive_or_unevictable(page, vma);
diff --git a/mm/rmap.c b/mm/rmap.c
index 535d20474835..0602881d5630 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -783,6 +783,14 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
 	while (page_vma_mapped_walk(&pvmw)) {
 		address = pvmw.address;

+#ifdef CONFIG_MEM_PURGEABLE
+		if (!(vma->vm_flags & VM_PURGEABLE)) {
+			pra->vm_flags &= ~VM_PURGEABLE;
+			pr_info("page %lx mapped to non-purgeable vma %p\n", page_to_pfn(page), vma);
+		} else {
+			pr_info("page %lx mapped to purgeable vma %p\n", page_to_pfn(page), vma);
+		}
+#endif
 		if (vma->vm_flags & VM_LOCKED) {
 			page_vma_mapped_walk_done(&pvmw);
 			pra->vm_flags |= VM_LOCKED;
@@ -822,7 +830,7 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,

 	if (referenced) {
 		pra->referenced++;
-		pra->vm_flags |= vma->vm_flags;
+		pra->vm_flags |= vma->vm_flags & ~VM_PURGEABLE;
 	}

 	if (!pra->mapcount)
@@ -861,6 +869,7 @@ int page_referenced(struct page *page,
 	struct page_referenced_arg pra = {
 		.mapcount = total_mapcount(page),
 		.memcg = memcg,
+		.vm_flags = VM_PURGEABLE,
 	};
 	struct rmap_walk_control rwc = {
 		.rmap_one = page_referenced_one,
@@ -1445,7 +1454,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 	}
 #endif
 #ifdef CONFIG_MEM_PURGEABLE
-	if (PagePurgeable(page) && !lock_userexpte(vma, address)) {
+	if ((vma->vm_flags & VM_PURGEABLE) && !lock_userexpte(vma, address)) {
 		ret = false;
 		page_vma_mapped_walk_done(&pvmw);
 		pr_info("uxpte hold purgeable page %lx\n", page_to_pfn(page));
@@ -1593,9 +1602,9 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 				set_pte_at(mm, address, pvmw.pte, pteval);
 			}

-		} else if (PagePurgeable(page) || (pte_unused(pteval) &&
+		} else if ((vma->vm_flags & VM_PURGEABLE) || (pte_unused(pteval) &&
 				!userfaultfd_armed(vma))) {
-			if (PagePurgeable(page)) {
+			if (vma->vm_flags & VM_PURGEABLE) {
 				unlock_userexpte(vma, address);
 				pr_info("unmap purgeable page %lx\n", page_to_pfn(page));
 			}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 09caa2770b03..90dcdd8b34ea 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -914,6 +914,7 @@ void putback_lru_page(struct page *page)
 enum page_references {
 	PAGEREF_RECLAIM,
 	PAGEREF_RECLAIM_CLEAN,
+	PAGEREF_RECLAIM_PURGEABLE,
 	PAGEREF_KEEP,
 	PAGEREF_ACTIVATE,
 };
@@ -935,6 +936,13 @@ static enum page_references page_check_references(struct page *page,
 	if (vm_flags & VM_LOCKED)
 		return PAGEREF_RECLAIM;

+#ifdef CONFIG_MEM_PURGEABLE
+	pr_info("check page %lx references\n", page_to_pfn(page));
+	if (vm_flags & VM_PURGEABLE) {
+		pr_info("try reclaim purgeable page %lx.\n", page_to_pfn(page));
+		return PAGEREF_RECLAIM_PURGEABLE;
+	}
+#endif
 	if (referenced_ptes) {
 		/*
 		 * All mapped pages start out with page table
@@ -1030,9 +1038,6 @@ unsigned int shrink_page_list(struct list_head *page_list,
 		page = lru_to_page(page_list);
 		list_del(&page->lru);

-		if (PagePurgeable(page))
-			pr_info("try to reclaim purgeable page %lx\n", page_to_pfn(page));
-
 		if (!trylock_page(page))
 			goto keep;

@@ -1155,7 +1160,7 @@ unsigned int shrink_page_list(struct list_head *page_list,
 			}
 		}

-		if (!ignore_references && !PagePurgeable(page))
+		if (!ignore_references)
 			references = page_check_references(page, sc);

 		switch (references) {
@@ -1166,6 +1171,7 @@ unsigned int shrink_page_list(struct list_head *page_list,
 			goto keep_locked;
 		case PAGEREF_RECLAIM:
 		case PAGEREF_RECLAIM_CLEAN:
+		case PAGEREF_RECLAIM_PURGEABLE:
 			; /* try to reclaim the page below */
 		}

@@ -1175,7 +1181,7 @@ unsigned int shrink_page_list(struct list_head *page_list,
 		 * Lazyfree page could be freed directly
 		 */
 		if (PageAnon(page) && PageSwapBacked(page)) {
-			if (!PageSwapCache(page)) {
+			if (!PageSwapCache(page) && references != PAGEREF_RECLAIM_PURGEABLE) {
 				if (!(sc->gfp_mask & __GFP_IO))
 					goto keep_locked;
 				if (page_maybe_dma_pinned(page))
@@ -1250,7 +1256,7 @@ unsigned int shrink_page_list(struct list_head *page_list,
 			}
 		}

-		if (PageDirty(page) && !PagePurgeable(page)) {
+		if (PageDirty(page) && references != PAGEREF_RECLAIM_PURGEABLE) {
 			/*
 			 * Only kswapd can writeback filesystem pages
 			 * to avoid risk of stack overflow. But avoid
@@ -1358,11 +1364,11 @@ unsigned int shrink_page_list(struct list_head *page_list,
 			}
 		}

-		if (PageAnon(page) && !PageSwapBacked(page)) {
+		if (PageAnon(page) && (!PageSwapBacked(page) || references == PAGEREF_RECLAIM_PURGEABLE)) {
 			/* follow __remove_mapping for reference */
 			if (!page_ref_freeze(page, 1))
 				goto keep_locked;
-			if (PageDirty(page) && !PagePurgeable(page)) {
+			if (PageDirty(page) && references != PAGEREF_RECLAIM_PURGEABLE) {
 				page_ref_unfreeze(page, 1);
 				goto keep_locked;
 			}
--
Gitee
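The page_referenced() change in the patch above is easy to misread: pra.vm_flags is pre-seeded with VM_PURGEABLE, every visited VMA ORs in its flags minus that bit, and any non-purgeable mapping clears it, so the bit survives the rmap walk only if every mapping of the page is purgeable. A runnable model of that accumulation rule, with a plain integer bit standing in for vm_flags:

	#include <stdio.h>

	#define VM_PURGEABLE 0x1UL	/* stand-in bit for the demo */

	/* Mirrors the patch: seed with VM_PURGEABLE, clear it on the first
	 * non-purgeable VMA, never re-add it from vma->vm_flags. */
	static unsigned long walk(const unsigned long *vma_flags, int n)
	{
		unsigned long acc = VM_PURGEABLE;
		int i;

		for (i = 0; i < n; i++) {
			if (!(vma_flags[i] & VM_PURGEABLE))
				acc &= ~VM_PURGEABLE;
			acc |= vma_flags[i] & ~VM_PURGEABLE;
		}
		return acc;
	}

	int main(void)
	{
		unsigned long all_purgeable[] = { VM_PURGEABLE, VM_PURGEABLE };
		unsigned long mixed[] = { VM_PURGEABLE, 0 };

		printf("all purgeable -> %lx\n", walk(all_purgeable, 2));	/* bit kept */
		printf("mixed         -> %lx\n", walk(mixed, 2));		/* bit cleared */
		return 0;
	}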
From 494ad96def5d60702ec14d1252cf3aa12d5166fa Mon Sep 17 00:00:00 2001
From: Chengke Wang
Date: Sat, 21 May 2022 10:56:07 +0800
Subject: [PATCH 03/10] purgeable vma name

---
 arch/mips/kernel/vdso.c      |  2 +-
 fs/proc/task_mmu.c           |  2 +-
 include/linux/mm.h           | 18 +-----------------
 include/linux/mm_purgeable.h |  4 ++--
 include/linux/mm_types.h     | 31 +++++++++++++++++++++++++++++++
 include/linux/mman.h         |  5 -----
 include/linux/rmap.h         |  6 ++++--
 mm/Kconfig                   |  1 -
 mm/madvise.c                 | 14 ++++++++++++++
 mm/memory.c                  | 10 +++++-----
 mm/mmap.c                    | 31 ++++++++++++++++++++++++++++---
 mm/purgeable.c               |  2 +-
 mm/rmap.c                    | 25 ++++++++++++-------------
 mm/vmscan.c                  |  7 ++++---
 14 files changed, 104 insertions(+), 54 deletions(-)

diff --git a/arch/mips/kernel/vdso.c b/arch/mips/kernel/vdso.c
index 242dc5e83847..567f99079108 100644
--- a/arch/mips/kernel/vdso.c
+++ b/arch/mips/kernel/vdso.c
@@ -102,7 +102,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 	base = mmap_region(NULL, STACK_TOP, PAGE_SIZE,
 			   VM_READ | VM_EXEC |
 			   VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC,
-			   0, NULL);
+			   0, NULL, NULL);
 	if (IS_ERR_VALUE(base)) {
 		ret = base;
 		goto out;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 0e22a47e8a6b..3b68e578114f 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -327,7 +327,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
 	}

 	anon_name = vma_anon_name(vma);
-	if (anon_name) {
+	if (!IS_ERR(anon_name)) {
 		seq_pad(m, ' ');
 		seq_printf(m, "[anon:%s]", anon_name);
 	}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index d49bc29eab99..3ee287e79282 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -304,29 +304,13 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_HIGH_ARCH_BIT_2	34	/* bit only usable on 64-bit architectures */
 #define VM_HIGH_ARCH_BIT_3	35	/* bit only usable on 64-bit architectures */
 #define VM_HIGH_ARCH_BIT_4	36	/* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_BIT_5	37	/* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_BIT_6	38	/* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_BIT_7	39	/* bit only usable on 64-bit architectures */
 #define VM_HIGH_ARCH_0	BIT(VM_HIGH_ARCH_BIT_0)
 #define VM_HIGH_ARCH_1	BIT(VM_HIGH_ARCH_BIT_1)
 #define VM_HIGH_ARCH_2	BIT(VM_HIGH_ARCH_BIT_2)
 #define VM_HIGH_ARCH_3	BIT(VM_HIGH_ARCH_BIT_3)
 #define VM_HIGH_ARCH_4	BIT(VM_HIGH_ARCH_BIT_4)
-#define VM_HIGH_ARCH_5	BIT(VM_HIGH_ARCH_BIT_5)
-#define VM_HIGH_ARCH_6	BIT(VM_HIGH_ARCH_BIT_6)
-#define VM_HIGH_ARCH_7	BIT(VM_HIGH_ARCH_BIT_7)
 #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */

-#ifdef CONFIG_MEM_PURGEABLE
-#define VM_PURGEABLE	VM_HIGH_ARCH_5
-#define VM_USERPTE	VM_HIGH_ARCH_6
-#define VM_USEREXPTE	VM_HIGH_ARCH_7
-#else /* CONFIG_MEM_PURGEABLE */
-#define VM_PURGEABLE	0
-#define VM_USERPTE	0
-#define VM_USEREXPTE	0
-#endif /* CONFIG_MEM_PURGEABLE */
-
 #ifdef CONFIG_ARCH_HAS_PKEYS
 # define VM_PKEY_SHIFT	VM_HIGH_ARCH_BIT_0
 # define VM_PKEY_BIT0	VM_HIGH_ARCH_0	/* A protection key is a 4-bit value */
@@ -2606,7 +2590,7 @@ extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned lo
 extern unsigned long mmap_region(struct file *file, unsigned long addr,
 	unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
-	struct list_head *uf);
+	struct list_head *uf, void *name);
 extern unsigned long do_mmap(struct file *file, unsigned long addr,
 	unsigned long len, unsigned long prot, unsigned long flags,
 	unsigned long pgoff, unsigned long *populate, struct list_head *uf);
diff --git a/include/linux/mm_purgeable.h b/include/linux/mm_purgeable.h
index 1bf725ad0435..0c9bf36217df 100644
--- a/include/linux/mm_purgeable.h
+++ b/include/linux/mm_purgeable.h
@@ -12,7 +12,7 @@ void mm_init_expgd(struct mm_struct *mm);
 void mm_clear_expgd(struct mm_struct *mm);
 int lock_userexpte(struct vm_area_struct *vma, unsigned long addr);
 void unlock_userexpte(struct vm_area_struct *vma, unsigned long addr);
-vm_fault_t do_purgeable_page_fault(struct vm_fault *vmf, struct page *page,
+vm_fault_t do_uxpte_page_fault(struct vm_fault *vmf, struct page *page,
 	pte_t *entry);
 int userexpte_setpresent(struct vm_area_struct *vma, unsigned long addr);

@@ -31,7 +31,7 @@ static inline int lock_userexpte(struct vm_area_struct *vma,
 static inline void unlock_userexpte(struct vm_area_struct *vma,
 	unsigned long addr) {}

-static inline vm_fault_t do_purgeable_page_fault(struct vm_fault *vmf,
+static inline vm_fault_t do_uxpte_page_fault(struct vm_fault *vmf,
 	struct page *page, pte_t *entry)
 {
 	return 0;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7742c221efa1..3313092bf077 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -875,4 +875,35 @@ static inline bool is_same_vma_anon_name(struct vm_area_struct *vma,
 }
 #endif  /* CONFIG_ANON_VMA_NAME */

+#ifdef CONFIG_MEM_PURGEABLE
+#define ANON_NAME_PURGEABLE	(ERR_PTR(-1))
+#define ANON_NAME_UPTE		(ERR_PTR(-2))
+#define ANON_NAME_UXPTE		(ERR_PTR(-3))
+static inline bool is_vma_purgeable(struct vm_area_struct *vma)
+{
+	return vma && vma->anon_name == ANON_NAME_PURGEABLE;
+}
+static inline bool is_vma_upte(struct vm_area_struct *vma)
+{
+	return vma && vma->anon_name == ANON_NAME_UPTE;
+}
+static inline bool is_vma_uxpte(struct vm_area_struct *vma)
+{
+	return vma && vma->anon_name == ANON_NAME_UXPTE;
+}
+#else /* CONFIG_MEM_PURGEABLE */
+static inline bool is_vma_purgeable(struct vm_area_struct *vma)
+{
+	return false;
+}
+static inline bool is_vma_upte(struct vm_area_struct *vma)
+{
+	return false;
+}
+static inline bool is_vma_uxpte(struct vm_area_struct *vma)
+{
+	return false;
+}
+#endif /* CONFIG_MEM_PURGEABLE */
+
 #endif /* _LINUX_MM_TYPES_H */
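The ANON_NAME_* sentinels above reuse the kernel's ERR_PTR() convention: small negative values cast to pointers can never collide with a real anon_vma_name allocation, so IS_ERR() cheaply distinguishes the three "special" names from heap-backed strings. A runnable userspace model of that encoding (the two helpers are re-implemented here for the demo; in the kernel they come from linux/err.h):

	#include <stdio.h>

	#define MAX_ERRNO	4095
	#define ERR_PTR(x)	((void *)(long)(x))
	#define IS_ERR(p)	((unsigned long)(p) >= (unsigned long)-MAX_ERRNO)

	#define ANON_NAME_PURGEABLE	ERR_PTR(-1)
	#define ANON_NAME_UPTE		ERR_PTR(-2)
	#define ANON_NAME_UXPTE		ERR_PTR(-3)

	int main(void)
	{
		const char *real_name = "my_heap";
		void *special = ANON_NAME_PURGEABLE;

		/* show_map_vma()'s check becomes: print only non-sentinel names */
		printf("real: IS_ERR=%d\n", IS_ERR(real_name));		/* 0 -> printable */
		printf("sentinel: IS_ERR=%d, equal=%d\n", IS_ERR(special),
		       special == ANON_NAME_PURGEABLE);			/* 1, 1 */
		return 0;
	}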
diff --git a/include/linux/mman.h b/include/linux/mman.h
index 8f814b3396e1..629cefc4ecba 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -154,11 +154,6 @@ calc_vm_flag_bits(unsigned long flags)
 	       _calc_vm_trans(flags, MAP_DENYWRITE,  VM_DENYWRITE ) |
 	       _calc_vm_trans(flags, MAP_LOCKED,     VM_LOCKED    ) |
 	       _calc_vm_trans(flags, MAP_SYNC,       VM_SYNC      ) |
-#ifdef CONFIG_MEM_PURGEABLE
-	       _calc_vm_trans(flags, MAP_PURGEABLE,  VM_PURGEABLE ) |
-	       _calc_vm_trans(flags, MAP_USERPTE,    VM_USERPTE   ) |
-	       _calc_vm_trans(flags, MAP_USEREXPTE,  VM_USEREXPTE ) |
-#endif
 	       arch_calc_vm_flag_bits(flags);
 }
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 8d04e7deedc6..4802583b79ae 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -193,7 +193,7 @@ static inline void page_dup_rmap(struct page *page, bool compound)
  * Called from mm/vmscan.c to handle paging out
  */
 int page_referenced(struct page *, int is_locked,
-			struct mem_cgroup *memcg, unsigned long *vm_flags);
+			struct mem_cgroup *memcg, unsigned long *vm_flags, bool *purgeable);

 bool try_to_unmap(struct page *, enum ttu_flags flags);

@@ -284,9 +284,11 @@ void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc);
 static inline int page_referenced(struct page *page, int is_locked,
 				  struct mem_cgroup *memcg,
-				  unsigned long *vm_flags)
+				  unsigned long *vm_flags, bool *purgeable)
 {
 	*vm_flags = 0;
+	if (purgeable)
+		*purgeable = false;
 	return 0;
 }

diff --git a/mm/Kconfig b/mm/Kconfig
index 36ef0bed11ea..f2d5480badec 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -942,7 +942,6 @@ config LMKD_DBG
 config MEM_PURGEABLE
 	bool "Purgeable memory feature"
 	default n
-	select ARCH_USES_HIGH_VMA_FLAGS
 	help
 	  Support purgeable pages for processes

diff --git a/mm/madvise.c b/mm/madvise.c
index 23b48a0049cb..134d120e1fbb 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -97,6 +97,10 @@ const char *vma_anon_name(struct vm_area_struct *vma)

 	mmap_assert_locked(vma->vm_mm);

+#ifdef CONFIG_MEM_PURGEABLE
+	if (IS_ERR(vma->anon_name))
+		return (const char *)vma->anon_name;
+#endif
 	return vma->anon_name->name;
 }

@@ -106,6 +110,9 @@ void dup_vma_anon_name(struct vm_area_struct *orig_vma,
 	if (!has_vma_anon_name(orig_vma))
 		return;

+#ifdef CONFIG_MEM_PURGEABLE
+	if (!IS_ERR(orig_vma->anon_name))
+#endif
 	kref_get(&orig_vma->anon_name->kref);
 	new_vma->anon_name = orig_vma->anon_name;
 }
@@ -119,6 +126,9 @@ void free_vma_anon_name(struct vm_area_struct *vma)
 	anon_name = vma->anon_name;
 	vma->anon_name = NULL;
+#ifdef CONFIG_MEM_PURGEABLE
+	if (!IS_ERR(anon_name))
+#endif
 	kref_put(&anon_name->kref, vma_anon_name_free);
 }

@@ -135,6 +145,10 @@ static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name)
 	anon_name = vma_anon_name(vma);
 	if (anon_name) {
 		/* Same name, nothing to do here */
+#ifdef CONFIG_MEM_PURGEABLE
+		if (name == anon_name)
+			return 0;
+#endif
 		if (!strcmp(name, anon_name))
 			return 0;

diff --git a/mm/memory.c b/mm/memory.c
index 74bd65dbc1ed..1be3de317752 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1237,8 +1237,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			struct page *page;

 			page = vm_normal_page(vma, addr, ptent);
-			if (vma->vm_flags & (VM_USERPTE | VM_USEREXPTE))
-				page = NULL;
+			if (is_vma_upte(vma) || is_vma_uxpte(vma))
+				page = NULL;
 			if (unlikely(details) && page) {
 				/*
 				 * unmap_shared_mapping_pages() wants to
@@ -3559,8 +3559,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 		return 0;

 	/* use extra page table for userexpte */
-	if (vma->vm_flags & VM_USEREXPTE) {
-		ret = do_purgeable_page_fault(vmf, page, &entry);
+	if (is_vma_uxpte(vma)) {
+		ret = do_uxpte_page_fault(vmf, page, &entry);
 		if (ret == VM_FAULT_OOM)
 			goto oom;
 		else
@@ -3632,7 +3632,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 	inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
 	page_add_new_anon_rmap(page, vma, vmf->address, false);
-	if (vma->vm_flags & VM_PURGEABLE) {
+	if (is_vma_purgeable(vma)) {
 		pr_info("set page %lx purgeable\n", page_to_pfn(page));
 		SetPagePurgeable(page);
 		userexpte_setpresent(vma, vmf->address);
diff --git a/mm/mmap.c b/mm/mmap.c
index 0587e2274a98..360f4511cf26 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1404,6 +1404,19 @@ static inline bool file_mmap_ok(struct file *file, struct inode *inode,
 	return true;
 }

+void *vma_special_name(unsigned long flags)
+{
+#ifdef CONFIG_MEM_PURGEABLE
+	if (flags & MAP_PURGEABLE)
+		return ANON_NAME_PURGEABLE;
+	if (flags & MAP_USERPTE)
+		return ANON_NAME_UPTE;
+	if (flags & MAP_USEREXPTE)
+		return ANON_NAME_UXPTE;
+#endif
+	return NULL;
+}
+
 /*
  * The caller must write-lock current->mm->mmap_lock.
  */
@@ -1415,9 +1428,14 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 	struct mm_struct *mm = current->mm;
 	vm_flags_t vm_flags;
 	int pkey = 0;
+	const char *name = NULL;

 	*populate = 0;

+#ifndef CONFIG_MEM_PURGEABLE
+	if (flags & (MAP_PURGEABLE | MAP_USERPTE | MAP_USEREXPTE))
+		return -EINVAL;
+#endif
 	if (!len)
 		return -EINVAL;

@@ -1572,6 +1590,8 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 		default:
 			return -EINVAL;
 		}
+
+		name = vma_special_name(flags);
 	}

 	/*
@@ -1588,7 +1608,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 			vm_flags |= VM_NORESERVE;
 	}

-	addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
+	addr = mmap_region(file, addr, len, vm_flags, pgoff, uf, name);
 	if (!IS_ERR_VALUE(addr) &&
 	    ((vm_flags & VM_LOCKED) ||
 	     (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
@@ -1733,7 +1753,7 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)

 unsigned long mmap_region(struct file *file, unsigned long addr,
 		unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
-		struct list_head *uf)
+		struct list_head *uf, void *name)
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma, *prev, *merge;
@@ -1773,7 +1793,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 	 * Can we just expand an old mapping?
 	 */
 	vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
-			NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
+			NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX, name);
 	if (vma)
 		goto out;

@@ -1788,6 +1808,11 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 		goto unacct_error;
 	}

+	if (!file) {
+		BUG_ON(name && !IS_ERR(name));
+		vma->anon_name = name;
+	}
+
 	vma->vm_start = addr;
 	vma->vm_end = addr + len;
 	vma->vm_flags = vm_flags;
diff --git a/mm/purgeable.c b/mm/purgeable.c
index 68edf5cd77e3..cd406e67959a 100644
--- a/mm/purgeable.c
+++ b/mm/purgeable.c
@@ -86,7 +86,7 @@ void unlock_userexpte(struct vm_area_struct *vma, unsigned long addr)
 	pr_info("unlock uxpte of addr %lx is %lx\n", addr, 0UL);
 }

-vm_fault_t do_purgeable_page_fault(struct vm_fault *vmf, struct page *page,
+vm_fault_t do_uxpte_page_fault(struct vm_fault *vmf, struct page *page,
 	pte_t *entry)
 {
 	struct vm_area_struct *vma = vmf->vma;
diff --git a/mm/rmap.c b/mm/rmap.c
index 0602881d5630..529241405336 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -765,6 +765,7 @@ struct page_referenced_arg {
 	int referenced;
 	unsigned long vm_flags;
 	struct mem_cgroup *memcg;
+	bool purgeable;
 };
 /*
  * arg: page_referenced_arg will be passed
@@ -783,14 +784,12 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
 	while (page_vma_mapped_walk(&pvmw)) {
 		address = pvmw.address;

-#ifdef CONFIG_MEM_PURGEABLE
-		if (!(vma->vm_flags & VM_PURGEABLE)) {
-			pra->vm_flags &= ~VM_PURGEABLE;
+		if (!is_vma_purgeable(vma)) {
+			pra->purgeable = false;
 			pr_info("page %lx mapped to non-purgeable vma %p\n", page_to_pfn(page), vma);
 		} else {
 			pr_info("page %lx mapped to purgeable vma %p\n", page_to_pfn(page), vma);
 		}
-#endif
 		if (vma->vm_flags & VM_LOCKED) {
 			page_vma_mapped_walk_done(&pvmw);
 			pra->vm_flags |= VM_LOCKED;
@@ -830,7 +829,7 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,

 	if (referenced) {
 		pra->referenced++;
-		pra->vm_flags |= vma->vm_flags & ~VM_PURGEABLE;
+		pra->vm_flags |= vma->vm_flags;
 	}

 	if (!pra->mapcount)
@@ -863,13 +862,14 @@ static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
 int page_referenced(struct page *page,
 		    int is_locked,
 		    struct mem_cgroup *memcg,
-		    unsigned long *vm_flags)
+		    unsigned long *vm_flags,
+		    bool *purgeable)
 {
 	int we_locked = 0;
 	struct page_referenced_arg pra = {
 		.mapcount = total_mapcount(page),
 		.memcg = memcg,
-		.vm_flags = VM_PURGEABLE,
+		.purgeable = true,
 	};
 	struct rmap_walk_control rwc = {
 		.rmap_one = page_referenced_one,
@@ -901,6 +901,8 @@ int page_referenced(struct page *page,
 	rmap_walk(page, &rwc);

 	*vm_flags = pra.vm_flags;
+	if (purgeable)
+		*purgeable = pra.purgeable;
 	if (we_locked)
 		unlock_page(page);

@@ -1453,14 +1453,12 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 		continue;
 	}
 #endif
-#ifdef CONFIG_MEM_PURGEABLE
-	if ((vma->vm_flags & VM_PURGEABLE) && !lock_userexpte(vma, address)) {
+	if (is_vma_purgeable(vma) && !lock_userexpte(vma, address)) {
 		ret = false;
 		page_vma_mapped_walk_done(&pvmw);
 		pr_info("uxpte hold purgeable page %lx\n", page_to_pfn(page));
 		break;
 	}
-#endif

 	/*
 	 * If the page is mlock()d, we cannot swap it out.
@@ -1602,9 +1602,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 				set_pte_at(mm, address, pvmw.pte, pteval);
 			}

-		} else if ((vma->vm_flags & VM_PURGEABLE) || (pte_unused(pteval) &&
-				!userfaultfd_armed(vma))) {
-			if (vma->vm_flags & VM_PURGEABLE) {
+		} else if ((pte_unused(pteval) && !userfaultfd_armed(vma)) || is_vma_purgeable(vma)) {
+			if (is_vma_purgeable(vma)) {
 				unlock_userexpte(vma, address);
 				pr_info("unmap purgeable page %lx\n", page_to_pfn(page));
 			}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 90dcdd8b34ea..fe607587a10b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -924,9 +924,10 @@ static enum page_references page_check_references(struct page *page,
 {
 	int referenced_ptes, referenced_page;
 	unsigned long vm_flags;
+	bool purgeable;

 	referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
-					  &vm_flags);
+					  &vm_flags, &purgeable);
 	referenced_page = TestClearPageReferenced(page);

 	/*
@@ -938,7 +939,7 @@ static enum page_references page_check_references(struct page *page,

 #ifdef CONFIG_MEM_PURGEABLE
 	pr_info("check page %lx references\n", page_to_pfn(page));
-	if (vm_flags & VM_PURGEABLE) {
+	if (purgeable) {
 		pr_info("try reclaim purgeable page %lx.\n", page_to_pfn(page));
 		return PAGEREF_RECLAIM_PURGEABLE;
 	}
@@ -2030,7 +2031,7 @@ void shrink_active_list(unsigned long nr_to_scan,
 		}

 		if (page_referenced(page, 0, sc->target_mem_cgroup,
-				    &vm_flags)) {
+				    &vm_flags, NULL)) {
 			/*
 			 * Identify referenced, file-backed active pages and
 			 * give them one more trip around the active list. So
--
Gitee

From 4e163d79842865c84378360cf664b46cc5f3f4ee Mon Sep 17 00:00:00 2001
From: Chengke Wang
Date: Mon, 30 May 2022 21:11:46 +0800
Subject: [PATCH 04/10] clear uxpte at unmap & sizeof uxpte

---
 mm/memory.c    | 5 ++++-
 mm/purgeable.c | 8 ++++----
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 1be3de317752..7aa010ee93cc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1254,7 +1254,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			tlb_remove_tlb_entry(tlb, pte, addr);
 			if (unlikely(!page))
 				continue;
-
+			if (is_vma_purgeable(vma)) {
+				unlock_userexpte(vma, addr);
+				pr_info("unmap purgeable page %lx at addr %lx of vma %p\n", page_to_pfn(page), addr, vma);
+			}
 			if (!PageAnon(page)) {
 				if (pte_dirty(ptent)) {
 					force_flush = 1;
diff --git a/mm/purgeable.c b/mm/purgeable.c
index cd406e67959a..910732cf8036 100644
--- a/mm/purgeable.c
+++ b/mm/purgeable.c
@@ -37,7 +37,7 @@ int lock_userexpte(struct vm_area_struct *vma, unsigned long addr)
 {
-	unsigned long base = addr & ~(PAGE_SIZE / sizeof(unsigned long) *
+	unsigned long base = addr & ~(PAGE_SIZE / sizeof(atomic64_t) *
 		PAGE_SIZE - 1);
 	unsigned long offset = (addr - base) / PAGE_SIZE;
 	atomic64_t *uxpte = NULL;
@@ -67,7 +67,7 @@ int lock_userexpte(struct vm_area_struct *vma, unsigned long addr)

 void unlock_userexpte(struct vm_area_struct *vma, unsigned long addr)
 {
-	unsigned long base = addr & ~(PAGE_SIZE / sizeof(unsigned long) *
+	unsigned long base = addr & ~(PAGE_SIZE / sizeof(atomic64_t) *
 		PAGE_SIZE - 1);
 	unsigned long offset = (addr - base) / PAGE_SIZE;
 	atomic64_t *uxpte = NULL;
@@ -91,7 +91,7 @@ vm_fault_t do_uxpte_page_fault(struct vm_fault *vmf, struct page *page,
 {
 	struct vm_area_struct *vma = vmf->vma;
 	unsigned long offset = (vmf->address - vma->vm_start + vma->vm_pgoff *
-		PAGE_SIZE) / sizeof(pte_t) * PAGE_SIZE;
+		PAGE_SIZE) / sizeof(atomic64_t) * PAGE_SIZE;
 	struct page *dup = NULL;

 	if (unlikely(anon_vma_prepare(vma)))
@@ -200,7 +200,7 @@ static struct page *lookup_expte_page(struct vm_area_struct *vma,

 int userexpte_setpresent(struct vm_area_struct *vma, unsigned long addr)
 {
-	unsigned long base = addr & ~(PAGE_SIZE / sizeof(unsigned long) * PAGE_SIZE - 1);
+	unsigned long base = addr & ~(PAGE_SIZE / sizeof(atomic64_t) * PAGE_SIZE - 1);
 	unsigned long offset = (addr - base) / PAGE_SIZE;
 	atomic64_t *uxpte = NULL;
 	struct page *page = NULL;
--
Gitee
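The sizeof(unsigned long) to sizeof(atomic64_t) change in the patch above is a no-op on 64-bit targets (both are 8 bytes) but corrects the window size on 32-bit ones, where sizeof(unsigned long) is 4. A runnable check of the arithmetic the code relies on, assuming 4 KiB pages: one uxpte page then covers 4096 / 8 = 512 user pages, i.e. a 2 MiB window:

	#include <stdio.h>

	#define PAGE_SIZE	4096UL
	#define UXPTE_SIZE	8UL	/* sizeof(atomic64_t): fixed on every arch */

	int main(void)
	{
		unsigned long addr = 0x7f12345678d000UL;	/* arbitrary sample address */
		unsigned long window = PAGE_SIZE / UXPTE_SIZE * PAGE_SIZE;	/* 2 MiB */
		unsigned long base = addr & ~(window - 1);	/* same mask as the patch */
		unsigned long offset = (addr - base) / PAGE_SIZE;	/* uxpte index */

		printf("window %lu KiB, base %#lx, uxpte index %lu\n",
		       window >> 10, base, offset);
		return 0;
	}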
-200,7 +200,7 @@ static struct page *lookup_expte_page(struct vm_area_struct *vma, int userexpte_setpresent(struct vm_area_struct *vma, unsigned long addr) { - unsigned long base = addr & ~(PAGE_SIZE / sizeof(unsigned long) * PAGE_SIZE - 1); + unsigned long base = addr & ~(PAGE_SIZE / sizeof(atomic64_t) * PAGE_SIZE - 1); unsigned long offset = (addr - base) / PAGE_SIZE; atomic64_t *uxpte = NULL; struct page *page = NULL; -- Gitee From 56d95066a4589f461febdda3790d29b7381115db Mon Sep 17 00:00:00 2001 From: Chengke Wang Date: Tue, 31 May 2022 14:01:02 +0800 Subject: [PATCH 05/10] Revert "purgeable vma name" This reverts commit 494ad96def5d60702ec14d1252cf3aa12d5166fa. --- arch/mips/kernel/vdso.c | 2 +- fs/proc/task_mmu.c | 2 +- include/linux/mm.h | 18 +++++++++++++++++- include/linux/mm_purgeable.h | 4 ++-- include/linux/mm_types.h | 31 ------------------------------- include/linux/mman.h | 5 +++++ include/linux/rmap.h | 6 ++---- mm/Kconfig | 1 + mm/madvise.c | 14 -------------- mm/memory.c | 10 +++++----- mm/mmap.c | 31 +++---------------------------- mm/purgeable.c | 2 +- mm/rmap.c | 25 +++++++++++++------------ mm/vmscan.c | 7 +++---- 14 files changed, 54 insertions(+), 104 deletions(-) diff --git a/arch/mips/kernel/vdso.c b/arch/mips/kernel/vdso.c index 567f99079108..242dc5e83847 100644 --- a/arch/mips/kernel/vdso.c +++ b/arch/mips/kernel/vdso.c @@ -102,7 +102,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) base = mmap_region(NULL, STACK_TOP, PAGE_SIZE, VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC, - 0, NULL, NULL); + 0, NULL); if (IS_ERR_VALUE(base)) { ret = base; goto out; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 3b68e578114f..0e22a47e8a6b 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -327,7 +327,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) } anon_name = vma_anon_name(vma); - if (!IS_ERR(anon_name)) { + if (anon_name) { seq_pad(m, ' '); seq_printf(m, "[anon:%s]", anon_name); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 3ee287e79282..d49bc29eab99 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -304,13 +304,29 @@ extern unsigned int kobjsize(const void *objp); #define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */ +#define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */ +#define VM_HIGH_ARCH_BIT_6 38 /* bit only usable on 64-bit architectures */ +#define VM_HIGH_ARCH_BIT_7 39 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0) #define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1) #define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2) #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4) +#define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5) +#define VM_HIGH_ARCH_6 BIT(VM_HIGH_ARCH_BIT_6) +#define VM_HIGH_ARCH_7 BIT(VM_HIGH_ARCH_BIT_7) #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ +#ifdef CONFIG_MEM_PURGEABLE +#define VM_PURGEABLE VM_HIGH_ARCH_5 +#define VM_USERPTE VM_HIGH_ARCH_6 +#define VM_USEREXPTE VM_HIGH_ARCH_7 +#else /* CONFIG_MEM_PURGEABLE */ +#define VM_PURGEABLE 0 +#define VM_USERPTE 0 +#define VM_USEREXPTE 0 +#endif /* CONFIG_MEM_PURGEABLE */ + #ifdef CONFIG_ARCH_HAS_PKEYS # define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0 # define VM_PKEY_BIT0 VM_HIGH_ARCH_0 /* A protection key is a 4-bit value */ @@ -2590,7 +2606,7 
@@ extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned lo
 extern unsigned long mmap_region(struct file *file, unsigned long addr,
     unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
-    struct list_head *uf, void *name);
+    struct list_head *uf);
 extern unsigned long do_mmap(struct file *file, unsigned long addr,
     unsigned long len, unsigned long prot, unsigned long flags,
     unsigned long pgoff, unsigned long *populate, struct list_head *uf);
diff --git a/include/linux/mm_purgeable.h b/include/linux/mm_purgeable.h
index 0c9bf36217df..1bf725ad0435 100644
--- a/include/linux/mm_purgeable.h
+++ b/include/linux/mm_purgeable.h
@@ -12,7 +12,7 @@ void mm_init_expgd(struct mm_struct *mm);
 void mm_clear_expgd(struct mm_struct *mm);
 int lock_userexpte(struct vm_area_struct *vma, unsigned long addr);
 void unlock_userexpte(struct vm_area_struct *vma, unsigned long addr);
-vm_fault_t do_uxpte_page_fault(struct vm_fault *vmf, struct page *page,
+vm_fault_t do_purgeable_page_fault(struct vm_fault *vmf, struct page *page,
     pte_t *entry);
 int userexpte_setpresent(struct vm_area_struct *vma, unsigned long addr);
@@ -31,7 +31,7 @@ static inline int lock_userexpte(struct vm_area_struct *vma,
 static inline void unlock_userexpte(struct vm_area_struct *vma,
     unsigned long addr) {}
 
-static inline vm_fault_t do_uxpte_page_fault(struct vm_fault *vmf,
+static inline vm_fault_t do_purgeable_page_fault(struct vm_fault *vmf,
     struct page *page, pte_t *entry)
 {
     return 0;
 }
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 3313092bf077..7742c221efa1 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -875,35 +875,4 @@ static inline bool is_same_vma_anon_name(struct vm_area_struct *vma,
 }
 #endif /* CONFIG_ANON_VMA_NAME */
 
-#ifdef CONFIG_MEM_PURGEABLE
-#define ANON_NAME_PURGEABLE (ERR_PTR(-1))
-#define ANON_NAME_UPTE (ERR_PTR(-2))
-#define ANON_NAME_UXPTE (ERR_PTR(-3))
-static inline bool is_vma_purgeable(struct vm_area_struct *vma)
-{
-    return vma && vma->anon_name == ANON_NAME_PURGEABLE;
-}
-static inline bool is_vma_upte(struct vm_area_struct *vma)
-{
-    return vma && vma->anon_name == ANON_NAME_UPTE;
-}
-static inline bool is_vma_uxpte(struct vm_area_struct *vma)
-{
-    return vma && vma->anon_name == ANON_NAME_UXPTE;
-}
-#else /* CONFIG_MEM_PURGEABLE */
-static inline bool is_vma_purgeable(struct vm_area_struct *vma)
-{
-    return false;
-}
-static inline bool is_vma_upte(struct vm_area_struct *vma)
-{
-    return false;
-}
-static inline bool is_vma_uxpte(struct vm_area_struct *vma)
-{
-    return false;
-}
-#endif /* CONFIG_MEM_PURGEABLE */
-
 #endif /* _LINUX_MM_TYPES_H */
diff --git a/include/linux/mman.h b/include/linux/mman.h
index 629cefc4ecba..8f814b3396e1 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -154,6 +154,11 @@ calc_vm_flag_bits(unsigned long flags)
        _calc_vm_trans(flags, MAP_DENYWRITE,  VM_DENYWRITE ) |
        _calc_vm_trans(flags, MAP_LOCKED,     VM_LOCKED    ) |
        _calc_vm_trans(flags, MAP_SYNC,       VM_SYNC      ) |
+#ifdef CONFIG_MEM_PURGEABLE
+       _calc_vm_trans(flags, MAP_PURGEABLE,  VM_PURGEABLE ) |
+       _calc_vm_trans(flags, MAP_USERPTE,    VM_USERPTE   ) |
+       _calc_vm_trans(flags, MAP_USEREXPTE,  VM_USEREXPTE ) |
+#endif
        arch_calc_vm_flag_bits(flags);
 }
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 4802583b79ae..8d04e7deedc6 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -193,7 +193,7 @@ static inline void page_dup_rmap(struct page *page, bool compound)
  * Called from mm/vmscan.c to handle paging out
  */
 int page_referenced(struct page *, int is_locked,
-    struct mem_cgroup *memcg, unsigned long *vm_flags, bool *purgeable);
+    struct mem_cgroup *memcg, unsigned long *vm_flags);
 
 bool try_to_unmap(struct page *, enum ttu_flags flags);
 
@@ -284,11 +284,9 @@ void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc);
 static inline int page_referenced(struct page *page, int is_locked,
     struct mem_cgroup *memcg,
-    unsigned long *vm_flags, bool *purgealbe)
+    unsigned long *vm_flags)
 {
     *vm_flags = 0;
-    if (purgeable)
-        *purgeable = false;
     return 0;
 }
diff --git a/mm/Kconfig b/mm/Kconfig
index f2d5480badec..36ef0bed11ea 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -942,6 +942,7 @@ config LMKD_DBG
 config MEM_PURGEABLE
     bool "Purgeable memory feature"
     default n
+    select ARCH_USES_HIGH_VMA_FLAGS
     help
       Support purgeable pages for process
diff --git a/mm/madvise.c b/mm/madvise.c
index 134d120e1fbb..23b48a0049cb 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -97,10 +97,6 @@ const char *vma_anon_name(struct vm_area_struct *vma)
 
     mmap_assert_locked(vma->vm_mm);
 
-#ifdef CONFIG_MEM_PURGEABLE
-    if (IS_ERR(vma->anon_name))
-        return (const char*)vma->anon_name;
-#endif
     return vma->anon_name->name;
 }
 
@@ -110,9 +106,6 @@ void dup_vma_anon_name(struct vm_area_struct *orig_vma,
     if (!has_vma_anon_name(orig_vma))
         return;
 
-#ifdef CONFIG_MEM_PURGEABLE
-    if (!IS_ERR(orig_vma->anon_name))
-#endif
     kref_get(&orig_vma->anon_name->kref);
     new_vma->anon_name = orig_vma->anon_name;
 }
@@ -126,9 +119,6 @@ void free_vma_anon_name(struct vm_area_struct *vma)
     anon_name = vma->anon_name;
     vma->anon_name = NULL;
-#ifdef CONFIG_MEM_PURGEABLE
-    if (!IS_ERR(anon_name))
-#endif
     kref_put(&anon_name->kref, vma_anon_name_free);
 }
 
@@ -145,10 +135,6 @@ static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name)
     anon_name = vma_anon_name(vma);
     if (anon_name) {
         /* Same name, nothing to do here */
-#ifdef CONFIG_MEM_PURGEABLE
-        if (name == anon_name)
-            return 0;
-#endif
         if (!strcmp(name, anon_name))
             return 0;
diff --git a/mm/memory.c b/mm/memory.c
index 7aa010ee93cc..b9113cfc2929 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1237,8 +1237,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
             struct page *page;
 
             page = vm_normal_page(vma, addr, ptent);
-            if (is_vma_upte(vma) || is_vma_uxpte(vma))
-                page = NULL;
+            if (vma->vm_flags & (VM_USERPTE | VM_USEREXPTE))
+                page = NULL;
             if (unlikely(details) && page) {
                 /*
                  * unmap_shared_mapping_pages() wants to
@@ -3562,8 +3562,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
         return 0;
 
     /* use extra page table for userexpte */
-    if (is_vma_uxpte(vma)) {
-        ret = do_uxpte_page_fault(vmf, page, &entry);
+    if (vma->vm_flags & VM_USEREXPTE) {
+        ret = do_purgeable_page_fault(vmf, page, &entry);
         if (ret == VM_FAULT_OOM)
             goto oom;
         else
@@ -3635,7 +3635,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
     inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
     page_add_new_anon_rmap(page, vma, vmf->address, false);
-    if (is_vma_purgeable(vma)) {
+    if (vma->vm_flags & VM_PURGEABLE) {
         pr_info("set page %lx purgeable\n", page_to_pfn(page));
         SetPagePurgeable(page);
         userexpte_setpresent(vma, vmf->address);
diff --git a/mm/mmap.c b/mm/mmap.c
index 360f4511cf26..0587e2274a98 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1404,19 +1404,6 @@ static inline bool file_mmap_ok(struct file *file, struct inode *inode,
     return true;
 }
 
-void *vma_special_name(unsigned long flags)
-{
-#ifdef CONFIG_MEM_PURGEABLE
-    if (flags & MAP_PURGEABLE)
-        return ANON_NAME_PURGEABLE;
-    if (flags & MAP_USERPTE)
-        return ANON_NAME_UPTE;
-    if (flags & MAP_USEREXPTE)
-        return ANON_NAME_UXPTE;
-#endif
-    return NULL;
-}
-
 /*
  * The caller must write-lock current->mm->mmap_lock.
  */
@@ -1428,14 +1415,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
     struct mm_struct *mm = current->mm;
     vm_flags_t vm_flags;
     int pkey = 0;
-    const char *name = NULL;
 
     *populate = 0;
 
-#ifndef CONFIG_MEM_PURGEABLE
-    if (flags & (MAP_PURGEABLE | MAP_USERPTE | MAP_USEREXPTE))
-        return -EINVAL;
-#endif
     if (!len)
         return -EINVAL;
 
@@ -1590,8 +1572,6 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
         default:
             return -EINVAL;
         }
-
-        name = vma_special_name(flags);
     }
 
     /*
@@ -1608,7 +1588,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
             vm_flags |= VM_NORESERVE;
     }
 
-    addr = mmap_region(file, addr, len, vm_flags, pgoff, uf, name);
+    addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
     if (!IS_ERR_VALUE(addr) &&
         ((vm_flags & VM_LOCKED) ||
          (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
@@ -1753,7 +1733,7 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
 
 unsigned long mmap_region(struct file *file, unsigned long addr,
         unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
-        struct list_head *uf, void *name)
+        struct list_head *uf)
 {
     struct mm_struct *mm = current->mm;
     struct vm_area_struct *vma, *prev, *merge;
@@ -1793,7 +1773,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
      * Can we just expand an old mapping?
      */
     vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
-            NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX, name);
+            NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
     if (vma)
         goto out;
 
@@ -1808,11 +1788,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
         goto unacct_error;
     }
 
-    if (!file) {
-        BUG_ON(name && !IS_ERR(name));
-        vma->anon_name = name;
-    }
-
     vma->vm_start = addr;
     vma->vm_end = addr + len;
     vma->vm_flags = vm_flags;
diff --git a/mm/purgeable.c b/mm/purgeable.c
index 910732cf8036..70facfbdaba6 100644
--- a/mm/purgeable.c
+++ b/mm/purgeable.c
@@ -86,7 +86,7 @@ void unlock_userexpte(struct vm_area_struct *vma, unsigned long addr)
     pr_info("unlock uxpte of addr %lx is %lx\n", addr, 0);
 }
 
-vm_fault_t do_uxpte_page_fault(struct vm_fault *vmf, struct page *page,
+vm_fault_t do_purgeable_page_fault(struct vm_fault *vmf, struct page *page,
     pte_t *entry)
 {
     struct vm_area_struct *vma = vmf->vma;
diff --git a/mm/rmap.c b/mm/rmap.c
index 529241405336..0602881d5630 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -765,7 +765,6 @@ struct page_referenced_arg {
     int referenced;
     unsigned long vm_flags;
     struct mem_cgroup *memcg;
-    bool purgeable;
 };
 /*
  * arg: page_referenced_arg will be passed
@@ -784,12 +783,14 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
     while (page_vma_mapped_walk(&pvmw)) {
         address = pvmw.address;
 
-        if (!is_vma_purgeable(vma)) {
-            pra->purgeable = false;
+#ifdef CONFIG_MEM_PURGEABLE
+        if (!(vma->vm_flags & VM_PURGEABLE)) {
+            pra->vm_flags &= ~VM_PURGEABLE;
             pr_info("page %lx mapped to inpurgeable vma %lx.", page_to_pfn(page), vma);
         } else {
             pr_info("page %lx mapped to purgeable vma %lx.", page_to_pfn(page), vma);
         }
+#endif
         if (vma->vm_flags & VM_LOCKED) {
             page_vma_mapped_walk_done(&pvmw);
             pra->vm_flags |= VM_LOCKED;
@@ -829,7 +830,7 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
 
     if (referenced) {
         pra->referenced++;
-        pra->vm_flags |= vma->vm_flags;
+        pra->vm_flags |= vma->vm_flags & ~VM_PURGEABLE;
     }
 
     if (!pra->mapcount)
@@ -862,14 +863,13 @@ static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
 int page_referenced(struct page *page,
             int is_locked,
             struct mem_cgroup *memcg,
-            unsigned long *vm_flags,
-            bool *purgeable)
+            unsigned long *vm_flags)
 {
     int we_locked = 0;
     struct page_referenced_arg pra = {
         .mapcount = total_mapcount(page),
         .memcg = memcg,
-        .purgeable = true,
+        .vm_flags = VM_PURGEABLE,
     };
     struct rmap_walk_control rwc = {
         .rmap_one = page_referenced_one,
@@ -901,8 +901,6 @@ int page_referenced(struct page *page,
     rmap_walk(page, &rwc);
 
     *vm_flags = pra.vm_flags;
-    if (purgeable)
-        *purgeable = pra.purgeable;
 
     if (we_locked)
         unlock_page(page);
@@ -1455,12 +1453,14 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
             continue;
         }
 #endif
-        if (is_vma_purgeable(vma) && !lock_userexpte(vma, address)) {
+#ifdef CONFIG_MEM_PURGEABLE
+        if ((vma->vm_flags & VM_PURGEABLE) && !lock_userexpte(vma, address)) {
             ret = false;
             page_vma_mapped_walk_done(&pvmw);
             pr_info("uxpte hold purgeable page %lx\n", page_to_pfn(page));
             break;
         }
+#endif
 
         /*
          * If the page is mlock()d, we cannot swap it out.
@@ -1602,8 +1602,9 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                 set_pte_at(mm, address, pvmw.pte, pteval);
             }
 
-        } else if ((pte_unused(pteval) && !userfaultfd_armed(vma)) || is_vma_purgeable(vma)) {
-            if (is_vma_purgeable(vma)) {
+        } else if ((vma->vm_flags & VM_PURGEABLE) || (pte_unused(pteval) &&
+                !userfaultfd_armed(vma))) {
+            if (vma->vm_flags & VM_PURGEABLE) {
                 unlock_userexpte(vma, address);
                 pr_info("unmap purgeable page %lx\n", page_to_pfn(page));
             }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index fe607587a10b..90dcdd8b34ea 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -924,10 +924,9 @@ static enum page_references page_check_references(struct page *page,
 {
     int referenced_ptes, referenced_page;
     unsigned long vm_flags;
-    bool purgeable;
 
     referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
-                      &vm_flags, &purgeable);
+                      &vm_flags);
     referenced_page = TestClearPageReferenced(page);
 
     /*
@@ -939,7 +938,7 @@ static enum page_references page_check_references(struct page *page,
 
 #ifdef CONFIG_MEM_PURGEABLE
     pr_info("check page %lx references\n", page_to_pfn(page));
-    if (purgeable) {
+    if (vm_flags & VM_PURGEABLE) {
         pr_info("try reclaim purgeable page %lx.\n", page_to_pfn(page));
         return PAGEREF_RECLAIM_PURGEABLE;
     }
@@ -2031,7 +2030,7 @@ void shrink_active_list(unsigned long nr_to_scan,
         }
 
         if (page_referenced(page, 0, sc->target_mem_cgroup,
-                    &vm_flags, NULL)) {
+                    &vm_flags)) {
             /*
              * Identify referenced, file-backed active pages and
              * give them one more trip around the active list. So
--
Gitee
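Note on the patch above: page_referenced() no longer reports purgeability through a separate bool out-parameter; the information now rides in the returned vm_flags. pra.vm_flags starts with VM_PURGEABLE pre-set, page_referenced_one() clears the bit the moment the page turns out to be mapped by any non-purgeable VMA, and the accumulation step masks VM_PURGEABLE out of vma->vm_flags so a later purgeable VMA cannot set it back. A page is therefore reported purgeable only if every mapping agrees. A minimal userspace model of that veto rule (flag values and names here are illustrative, not the kernel's):

    #include <stdio.h>

    #define VM_PURGEABLE 0x1UL  /* stand-in for the real high VMA flag */
    #define VM_LOCKED    0x2UL

    /* Models pra.vm_flags accumulation across all VMAs mapping one page. */
    static unsigned long scan_mappings(const unsigned long *vma_flags, int n)
    {
        unsigned long acc = VM_PURGEABLE;   /* start optimistic, like pra */

        for (int i = 0; i < n; i++) {
            if (!(vma_flags[i] & VM_PURGEABLE))
                acc &= ~VM_PURGEABLE;       /* one non-purgeable mapping vetoes */
            acc |= vma_flags[i] & ~VM_PURGEABLE; /* never re-adds the bit */
        }
        return acc;
    }

    int main(void)
    {
        unsigned long all[] = { VM_PURGEABLE, VM_PURGEABLE };
        unsigned long mixed[] = { VM_PURGEABLE, VM_LOCKED };

        printf("all purgeable: %lu\n", scan_mappings(all, 2) & VM_PURGEABLE);
        printf("mixed:         %lu\n", scan_mappings(mixed, 2) & VM_PURGEABLE);
        return 0;
    }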
From 77283eb0246bae6c33370698f440fd2a0814e56e Mon Sep 17 00:00:00 2001
From: Chengke Wang
Date: Tue, 31 May 2022 14:11:14 +0800
Subject: [PATCH 06/10] fix is_vma_purgeable

---
 mm/memory.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/memory.c b/mm/memory.c
index b9113cfc2929..aeb6d1f091a6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1254,7 +1254,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
             tlb_remove_tlb_entry(tlb, pte, addr);
             if (unlikely(!page))
                 continue;
-            if (is_vma_purgeable(vma)) {
+            if (vma->vm_flags & VM_PURGEABLE) {
                 unlock_userexpte(vma, addr);
                 pr_info("unmap purgeable page %lx at addr %lx of vma %lx.\n", page_to_pfn(page), addr, vma);
             }
--
Gitee

From e88cbe51e5f138da07066c481f8f5ed2bee573e5 Mon Sep 17 00:00:00 2001
From: Chengke Wang
Date: Tue, 31 May 2022 14:37:14 +0800
Subject: [PATCH 07/10] del upte

---
 include/linux/mm.h                     | 6 +-----
 include/linux/mman.h                   | 1 -
 include/uapi/asm-generic/mman-common.h | 3 +--
 mm/memory.c                            | 2 +-
 mm/mmap.c                              | 4 ++--
 5 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index d49bc29eab99..957119d7d280 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -306,7 +306,6 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */
 #define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */
 #define VM_HIGH_ARCH_BIT_6 38 /* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_BIT_7 39 /* bit only usable on 64-bit architectures */
 #define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0)
 #define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1)
 #define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2)
@@ -314,16 +313,13 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4)
 #define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5)
 #define VM_HIGH_ARCH_6 BIT(VM_HIGH_ARCH_BIT_6)
-#define VM_HIGH_ARCH_7 BIT(VM_HIGH_ARCH_BIT_7)
 #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
 
 #ifdef CONFIG_MEM_PURGEABLE
 #define VM_PURGEABLE VM_HIGH_ARCH_5
-#define VM_USERPTE VM_HIGH_ARCH_6
-#define VM_USEREXPTE VM_HIGH_ARCH_7
+#define VM_USEREXPTE VM_HIGH_ARCH_6
 #else /* CONFIG_MEM_PURGEABLE */
 #define VM_PURGEABLE 0
-#define VM_USERPTE 0
 #define VM_USEREXPTE 0
 #endif /* CONFIG_MEM_PURGEABLE */
 
diff --git a/include/linux/mman.h b/include/linux/mman.h
index 8f814b3396e1..946641c59135 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -156,7 +156,6 @@ calc_vm_flag_bits(unsigned long flags)
        _calc_vm_trans(flags, MAP_SYNC,       VM_SYNC      ) |
 #ifdef CONFIG_MEM_PURGEABLE
        _calc_vm_trans(flags, MAP_PURGEABLE,  VM_PURGEABLE ) |
-       _calc_vm_trans(flags, MAP_USERPTE,    VM_USERPTE   ) |
        _calc_vm_trans(flags, MAP_USEREXPTE,  VM_USEREXPTE ) |
 #endif
        arch_calc_vm_flag_bits(flags);
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index 5487e3f01773..8003a595a007 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -22,8 +22,7 @@
 #define MAP_FIXED 0x10 /* Interpret addr exactly */
 #define MAP_ANONYMOUS 0x20 /* don't use a file */
 #define MAP_PURGEABLE 0x40
-#define MAP_USERPTE 0x80
-#define MAP_USEREXPTE 0x100
+#define MAP_USEREXPTE 0x80
 
 /* 0x0100 - 0x4000 flags are defined in asm-generic/mman.h */
 #define MAP_POPULATE 0x008000 /* populate (prefault) pagetables */
diff --git a/mm/memory.c b/mm/memory.c
index aeb6d1f091a6..5bbe4f0affb2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1237,7 +1237,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
             struct page *page;
 
             page = vm_normal_page(vma, addr, ptent);
-            if (vma->vm_flags & (VM_USERPTE | VM_USEREXPTE))
+            if (vma->vm_flags & VM_USEREXPTE)
                 page = NULL;
             if (unlikely(details) && page) {
                 /*
diff --git a/mm/mmap.c b/mm/mmap.c
index 0587e2274a98..bcca4cafc9fa 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1558,7 +1558,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
             /*
              * Ignore pgoff.
              */
-            if (!(flags & (MAP_USERPTE | MAP_USEREXPTE)))
+            if (!(flags & MAP_USEREXPTE))
                 pgoff = 0;
             vm_flags |= VM_SHARED | VM_MAYSHARE;
             break;
@@ -1566,7 +1566,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
             /*
              * Set pgoff according to addr for anon_vma.
              */
-            if (!(flags & (MAP_USERPTE | MAP_USEREXPTE)))
+            if (!(flags & MAP_USEREXPTE))
                 pgoff = addr >> PAGE_SHIFT;
             break;
         default:
--
Gitee
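Note on PATCH 07: with MAP_USERPTE gone, the uapi surface is down to MAP_PURGEABLE (0x40) and MAP_USEREXPTE (0x80). A sketch of how a userspace cache might request purgeable anonymous memory on a kernel built from this series — the flag is not in mainline headers, hence the local fallback define:

    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>

    /* From this series' asm-generic/mman-common.h; not in mainline uapi. */
    #ifndef MAP_PURGEABLE
    #define MAP_PURGEABLE 0x40
    #endif

    int main(void)
    {
        size_t len = 1 << 20;   /* 1 MiB of purgeable cache memory */
        void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS | MAP_PURGEABLE, -1, 0);

        if (buf == MAP_FAILED) {
            perror("mmap");     /* kernel without the feature may reject it */
            return 1;
        }
        /* Contents may be reclaimed under memory pressure; treat as a cache. */
        memset(buf, 0xa5, len);
        munmap(buf, len);
        return 0;
    }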
From c169ba067481aa89a4b591d10bf5ad0a6b4958a6 Mon Sep 17 00:00:00 2001
From: Chengke Wang
Date: Tue, 31 May 2022 15:37:29 +0800
Subject: [PATCH 08/10] var name refactory

---
 include/linux/mm_purgeable.h |  31 +++++---
 include/linux/mm_types.h     |   4 +-
 kernel/fork.c                |   4 +-
 mm/memory.c                  |   6 +-
 mm/purgeable.c               | 145 +++++++++++++++++++++--------------
 mm/rmap.c                    |   4 +-
 6 files changed, 114 insertions(+), 80 deletions(-)

diff --git a/include/linux/mm_purgeable.h b/include/linux/mm_purgeable.h
index 1bf725ad0435..c7556d256305 100644
--- a/include/linux/mm_purgeable.h
+++ b/include/linux/mm_purgeable.h
@@ -8,36 +8,43 @@
 
 #ifdef CONFIG_MEM_PURGEABLE
 
-void mm_init_expgd(struct mm_struct *mm);
-void mm_clear_expgd(struct mm_struct *mm);
-int lock_userexpte(struct vm_area_struct *vma, unsigned long addr);
-void unlock_userexpte(struct vm_area_struct *vma, unsigned long addr);
-vm_fault_t do_purgeable_page_fault(struct vm_fault *vmf, struct page *page,
+void mm_init_uxpgd(struct mm_struct *mm);
+void mm_clear_uxpgd(struct mm_struct *mm);
+int lock_uxpte(struct vm_area_struct *vma, unsigned long addr);
+void unlock_uxpte(struct vm_area_struct *vma, unsigned long addr);
+vm_fault_t do_uxpte_page_fault(struct vm_fault *vmf, struct page *page,
     pte_t *entry);
-int userexpte_setpresent(struct vm_area_struct *vma, unsigned long addr);
+int uxpte_set_present(struct vm_area_struct *vma, unsigned long addr);
+int uxpte_clear_present(struct vm_area_struct *vma, unsigned long addr);
 
 #else /* CONFIG_MEM_PURGEABLE */
 
-static inline void mm_init_expgd(struct mm_struct *mm) {}
+static inline void mm_init_uxpgd(struct mm_struct *mm) {}
 
-static inline void mm_clear_expgd(struct mm_struct *mm) {}
+static inline void mm_clear_uxpgd(struct mm_struct *mm) {}
 
-static inline int lock_userexpte(struct vm_area_struct *vma,
+static inline int lock_uxpte(struct vm_area_struct *vma,
     unsigned long addr)
 {
     return 0;
 }
 
-static inline void unlock_userexpte(struct vm_area_struct *vma,
+static inline void unlock_uxpte(struct vm_area_struct *vma,
     unsigned long addr) {}
 
-static inline vm_fault_t do_purgeable_page_fault(struct vm_fault *vmf,
+static inline vm_fault_t do_uxpte_page_fault(struct vm_fault *vmf,
     struct page *page, pte_t *entry)
 {
     return 0;
 }
 
-static inline int userexpte_setpresent(struct vm_area_struct *vma,
+static inline int uxpte_set_present(struct vm_area_struct *vma,
+    unsigned long addr)
+{
+    return 0;
+}
+
+static inline int uxpte_clear_present(struct vm_area_struct *vma,
     unsigned long addr)
 {
     return 0;
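Note on the header above: the rename settles the naming scheme — the per-mm radix tree becomes the uxpgd (user extra page global directory) and each page it holds is a table of uxpte entries, mirroring the kernel's pgd/pte split — and callers stay #ifdef-free because the real declarations are paired with static-inline no-op stubs. That stub pattern in miniature, compilable with or without the option (FEATURE_X is a stand-in for CONFIG_MEM_PURGEABLE, not a real config symbol):

    #include <stdio.h>

    #define FEATURE_X 1   /* comment out to build the no-op stub path */

    #ifdef FEATURE_X
    /* real implementation, only built when the feature is enabled */
    static int feature_lock(int id) { printf("lock %d\n", id); return 1; }
    #else
    /* static inline no-op stub: callers compile unchanged, calls vanish */
    static inline int feature_lock(int id) { (void)id; return 0; }
    #endif

    int main(void) { return !feature_lock(42); }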
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7742c221efa1..95a7fce86579 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -423,8 +423,8 @@ struct mm_struct {
         unsigned long highest_vm_end; /* highest vma end address */
         pgd_t * pgd;
 #ifdef CONFIG_MEM_PURGEABLE
-        void *expgd;
-        spinlock_t expgd_lock;
+        void *uxpgd;
+        spinlock_t uxpgd_lock;
 #endif
 
 #ifdef CONFIG_MEMBARRIER
diff --git a/kernel/fork.c b/kernel/fork.c
index 9d537dcb5a96..a79fb3a3065d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -626,7 +626,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 static inline int mm_alloc_pgd(struct mm_struct *mm)
 {
-    mm_init_expgd(mm);
+    mm_init_uxpgd(mm);
     mm->pgd = pgd_alloc(mm);
     if (unlikely(!mm->pgd))
         return -ENOMEM;
@@ -636,7 +636,7 @@ static inline int mm_alloc_pgd(struct mm_struct *mm)
 static inline void mm_free_pgd(struct mm_struct *mm)
 {
     pgd_free(mm, mm->pgd);
-    mm_clear_expgd(mm);
+    mm_clear_uxpgd(mm);
 }
 #else
 static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
diff --git a/mm/memory.c b/mm/memory.c
index 5bbe4f0affb2..f18948b277f5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1255,7 +1255,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
             if (unlikely(!page))
                 continue;
             if (vma->vm_flags & VM_PURGEABLE) {
-                unlock_userexpte(vma, addr);
+                uxpte_clear_present(vma, addr);
                 pr_info("unmap purgeable page %lx at addr %lx of vma %lx.\n", page_to_pfn(page), addr, vma);
             }
             if (!PageAnon(page)) {
@@ -3563,7 +3563,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 
     /* use extra page table for userexpte */
     if (vma->vm_flags & VM_USEREXPTE) {
-        ret = do_purgeable_page_fault(vmf, page, &entry);
+        ret = do_uxpte_page_fault(vmf, page, &entry);
         if (ret == VM_FAULT_OOM)
             goto oom;
         else
@@ -3638,7 +3638,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
     if (vma->vm_flags & VM_PURGEABLE) {
         pr_info("set page %lx purgeable\n", page_to_pfn(page));
         SetPagePurgeable(page);
-        userexpte_setpresent(vma, vmf->address);
+        uxpte_set_present(vma, vmf->address);
     }
     lru_cache_add_inactive_or_unevictable(page, vma);
 setpte:
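Note on the base/offset computation that recurs in the mm/purgeable.c changes that follow: one 64-bit entry is packed per data page into a single uxpte page, so with 4 KiB pages and 8-byte atomic64_t entries a uxpte page covers PAGE_SIZE / sizeof(atomic64_t) = 512 user pages — a naturally aligned 2 MiB region — and that region's base address is the radix-tree key. A runnable check of the arithmetic (assuming 4 KiB pages):

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SIZE 4096UL
    #define ENTRIES_PER_UXPTE_PAGE (PAGE_SIZE / sizeof(uint64_t)) /* 512 */

    int main(void)
    {
        unsigned long addr = 0x7f12345678000UL;  /* some page-aligned VA */
        unsigned long span = ENTRIES_PER_UXPTE_PAGE * PAGE_SIZE; /* 2 MiB */
        unsigned long base = addr & ~(span - 1);          /* radix tree key */
        unsigned long offset = (addr - base) / PAGE_SIZE; /* entry index */

        printf("span   = %lu KiB\n", span >> 10);
        printf("base   = %#lx\n", base);
        printf("offset = %lu (of %lu)\n", offset,
               (unsigned long)ENTRIES_PER_UXPTE_PAGE);
        return 0;
    }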
diff --git a/mm/purgeable.c b/mm/purgeable.c
index 70facfbdaba6..487781d8f472 100644
--- a/mm/purgeable.c
+++ b/mm/purgeable.c
@@ -12,30 +12,30 @@
 
 #include
 
-void mm_init_expgd(struct mm_struct *mm)
+void mm_init_uxpgd(struct mm_struct *mm)
 {
-    mm->expgd = NULL;
-    spin_lock_init(&mm->expgd_lock);
+    mm->uxpgd = NULL;
+    spin_lock_init(&mm->uxpgd_lock);
 }
 
-void mm_clear_expgd(struct mm_struct *mm)
+void mm_clear_uxpgd(struct mm_struct *mm)
 {
     struct page *page = NULL;
     void **slot = NULL;
     struct radix_tree_iter iter;
 
-    spin_lock(&mm->expgd_lock);
-    if (!mm->expgd)
+    spin_lock(&mm->uxpgd_lock);
+    if (!mm->uxpgd)
         goto out;
-    radix_tree_for_each_slot(slot, mm->expgd, &iter, 0) {
-        page = radix_tree_lookup(mm->expgd, iter.index);
+    radix_tree_for_each_slot(slot, mm->uxpgd, &iter, 0) {
+        page = radix_tree_lookup(mm->uxpgd, iter.index);
         put_page(page);
     }
 out:
-    spin_unlock(&mm->expgd_lock);
+    spin_unlock(&mm->uxpgd_lock);
 }
 
-int lock_userexpte(struct vm_area_struct *vma, unsigned long addr)
+int lock_uxpte(struct vm_area_struct *vma, unsigned long addr)
 {
     unsigned long base = addr & ~(PAGE_SIZE / sizeof(atomic64_t) * PAGE_SIZE - 1);
@@ -44,10 +44,10 @@ int lock_userexpte(struct vm_area_struct *vma, unsigned long addr)
     struct page *page = NULL;
     long val = 0;
 
-    spin_lock(&vma->vm_mm->expgd_lock);
-    if (!vma->vm_mm->expgd)
+    spin_lock(&vma->vm_mm->uxpgd_lock);
+    if (!vma->vm_mm->uxpgd)
         goto unlock;
-    page = radix_tree_lookup(vma->vm_mm->expgd, base);
+    page = radix_tree_lookup(vma->vm_mm->uxpgd, base);
     if (!page)
         goto unlock;
     uxpte = page_to_virt(page);
@@ -59,13 +59,13 @@ int lock_userexpte(struct vm_area_struct *vma, unsigned long addr)
         goto retry;
     val = -2;
 unlock:
-    spin_unlock(&vma->vm_mm->expgd_lock);
+    spin_unlock(&vma->vm_mm->uxpgd_lock);
     pr_info("lock uxpte of addr %lx is %lx\n", addr, val);
 
     return (val == -2) ? 1 : 0;
 }
 
-void unlock_userexpte(struct vm_area_struct *vma, unsigned long addr)
+void unlock_uxpte(struct vm_area_struct *vma, unsigned long addr)
 {
     unsigned long base = addr & ~(PAGE_SIZE / sizeof(atomic64_t) * PAGE_SIZE - 1);
@@ -73,20 +73,20 @@ void unlock_userexpte(struct vm_area_struct *vma, unsigned long addr)
     atomic64_t *uxpte = NULL;
     struct page *page = NULL;
 
-    spin_lock(&vma->vm_mm->expgd_lock);
-    if (!vma->vm_mm->expgd)
+    spin_lock(&vma->vm_mm->uxpgd_lock);
+    if (!vma->vm_mm->uxpgd)
         goto unlock;
-    page = radix_tree_lookup(vma->vm_mm->expgd, base);
+    page = radix_tree_lookup(vma->vm_mm->uxpgd, base);
     if (!page)
         goto unlock;
     uxpte = page_to_virt(page);
     atomic64_set(&uxpte[offset], 0);
 unlock:
-    spin_unlock(&vma->vm_mm->expgd_lock);
+    spin_unlock(&vma->vm_mm->uxpgd_lock);
     pr_info("unlock uxpte of addr %lx is %lx\n", addr, 0);
 }
 
-vm_fault_t do_purgeable_page_fault(struct vm_fault *vmf, struct page *page,
+vm_fault_t do_uxpte_page_fault(struct vm_fault *vmf, struct page *page,
     pte_t *entry)
 {
     struct vm_area_struct *vma = vmf->vma;
@@ -96,21 +96,21 @@ vm_fault_t do_purgeable_page_fault(struct vm_fault *vmf, struct page *page,
 
     if (unlikely(anon_vma_prepare(vma)))
         goto oom;
 
-    if (current->mm->expgd)
+    if (current->mm->uxpgd)
         goto lookup;
-    spin_lock(&current->mm->expgd_lock);
-    if (!current->mm->expgd)
-        current->mm->expgd = kzalloc(sizeof(struct radix_tree_root),
+    spin_lock(&current->mm->uxpgd_lock);
+    if (!current->mm->uxpgd)
+        current->mm->uxpgd = kzalloc(sizeof(struct radix_tree_root),
             GFP_KERNEL);
-    if (current->mm->expgd)
-        INIT_RADIX_TREE(current->mm->expgd, GFP_KERNEL);
-    spin_unlock(&current->mm->expgd_lock);
-    if (!current->mm->expgd) {
-        pr_err("expgd alloc failed.\n");
+    if (current->mm->uxpgd)
+        INIT_RADIX_TREE(current->mm->uxpgd, GFP_KERNEL);
+    spin_unlock(&current->mm->uxpgd_lock);
+    if (!current->mm->uxpgd) {
+        pr_err("uxpgd alloc failed.\n");
         goto oom;
     }
 lookup:
-    page = radix_tree_lookup(current->mm->expgd, offset);
+    page = radix_tree_lookup(current->mm->uxpgd, offset);
     if (page)
         goto make_pte;
     page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
@@ -121,15 +121,15 @@ vm_fault_t do_purgeable_page_fault(struct vm_fault *vmf, struct page *page,
         pr_err("radix preload fail.\n");
         goto oom;
     }
-    spin_lock(&current->mm->expgd_lock);
-    dup = radix_tree_lookup(current->mm->expgd, offset);
+    spin_lock(&current->mm->uxpgd_lock);
+    dup = radix_tree_lookup(current->mm->uxpgd, offset);
     if (dup) {
         put_page(page);
         page = dup;
     } else {
-        radix_tree_insert(current->mm->expgd, offset, page);
+        radix_tree_insert(current->mm->uxpgd, offset, page);
     }
-    spin_unlock(&current->mm->expgd_lock);
+    spin_unlock(&current->mm->uxpgd_lock);
     radix_tree_preload_end();
 make_pte:
     *entry = mk_pte(page, vma->vm_page_prot);
@@ -141,64 +141,64 @@ vm_fault_t do_purgeable_page_fault(struct vm_fault *vmf, struct page *page,
     return VM_FAULT_OOM;
 }
 
-static struct page *lookup_expte_page(struct vm_area_struct *vma,
+static struct page *lookup_uxpte_page(struct vm_area_struct *vma,
     unsigned long addr, bool alloc)
 {
-    struct radix_tree_root *expgd = NULL;
+    struct radix_tree_root *uxpgd = NULL;
     struct page *page = NULL;
     struct page *new_page = NULL;
     struct mm_struct *mm = vma->vm_mm;
 
-    if (mm->expgd)
+    if (mm->uxpgd)
         goto lookup;
     if (!alloc)
         goto out;
-    spin_unlock(&mm->expgd_lock);
-    expgd = kzalloc(sizeof(struct radix_tree_root), GFP_KERNEL);
-    if (!expgd) {
-        pr_err("expgd alloc failed.\n");
-        spin_lock(&mm->expgd_lock);
+    spin_unlock(&mm->uxpgd_lock);
+    uxpgd = kzalloc(sizeof(struct radix_tree_root), GFP_KERNEL);
+    if (!uxpgd) {
+        pr_err("uxpgd alloc failed.\n");
+        spin_lock(&mm->uxpgd_lock);
         goto out;
     }
-    INIT_RADIX_TREE(expgd, GFP_KERNEL);
-    spin_lock(&mm->expgd_lock);
-    if (mm->expgd)
-        kfree(expgd);
+    INIT_RADIX_TREE(uxpgd, GFP_KERNEL);
+    spin_lock(&mm->uxpgd_lock);
+    if (mm->uxpgd)
+        kfree(uxpgd);
     else
-        mm->expgd = expgd;
+        mm->uxpgd = uxpgd;
 lookup:
-    page = radix_tree_lookup(mm->expgd, addr);
+    page = radix_tree_lookup(mm->uxpgd, addr);
     if (page)
         goto out;
     if (!alloc)
         goto out;
-    spin_unlock(&mm->expgd_lock);
+    spin_unlock(&mm->uxpgd_lock);
     new_page = alloc_zeroed_user_highpage_movable(vma, addr);
     if (!new_page) {
-        pr_err("expte page alloc fail.\n");
-        spin_lock(&mm->expgd_lock);
+        pr_err("uxpte page alloc fail.\n");
+        spin_lock(&mm->uxpgd_lock);
         goto out;
     }
     if (radix_tree_preload(GFP_KERNEL)) {
         put_page(new_page);
         pr_err("radix preload fail.\n");
-        spin_lock(&mm->expgd_lock);
+        spin_lock(&mm->uxpgd_lock);
         goto out;
     }
-    spin_lock(&mm->expgd_lock);
-    page = radix_tree_lookup(mm->expgd, addr);
+    spin_lock(&mm->uxpgd_lock);
+    page = radix_tree_lookup(mm->uxpgd, addr);
     if (page) {
         put_page(new_page);
     } else {
         page = new_page;
-        radix_tree_insert(mm->expgd, addr, page);
+        radix_tree_insert(mm->uxpgd, addr, page);
     }
     radix_tree_preload_end();
 out:
     return page;
 }
 
-int userexpte_setpresent(struct vm_area_struct *vma, unsigned long addr)
+int uxpte_set_present(struct vm_area_struct *vma, unsigned long addr)
 {
     unsigned long base = addr & ~(PAGE_SIZE / sizeof(atomic64_t) * PAGE_SIZE - 1);
     unsigned long offset = (addr - base) / PAGE_SIZE;
@@ -206,8 +206,8 @@ int userexpte_setpresent(struct vm_area_struct *vma, unsigned long addr)
     struct page *page = NULL;
     long val = 0;
 
-    spin_lock(&vma->vm_mm->expgd_lock);
-    page = lookup_expte_page(vma, base, true);
+    spin_lock(&vma->vm_mm->uxpgd_lock);
+    page = lookup_uxpte_page(vma, base, true);
     if (!page)
         goto unlock;
     uxpte = page_to_virt(page);
@@ -219,8 +219,35 @@ int userexpte_setpresent(struct vm_area_struct *vma, unsigned long addr)
         goto retry;
     val++;
 unlock:
-    spin_unlock(&vma->vm_mm->expgd_lock);
+    spin_unlock(&vma->vm_mm->uxpgd_lock);
     pr_info("set present uxpte of addr %lx is %lx\n", addr, val);
 
     return 0;
 }
+
+int uxpte_clear_present(struct vm_area_struct *vma, unsigned long addr)
+{
+    unsigned long base = addr & ~(PAGE_SIZE / sizeof(atomic64_t) * PAGE_SIZE - 1);
+    unsigned long offset = (addr - base) / PAGE_SIZE;
+    atomic64_t *uxpte = NULL;
+    struct page *page = NULL;
+    long val = 0;
+
+    spin_lock(&vma->vm_mm->uxpgd_lock);
+    page = lookup_uxpte_page(vma, base, false);
+    if (!page)
+        goto unlock;
+    uxpte = page_to_virt(page);
+retry:
+    val = atomic64_read(&uxpte[offset]);
+    if (!(val & 1))
+        goto unlock;
+    if (atomic64_cmpxchg(&uxpte[offset], val, val - 1) != val)
+        goto retry;
+    val--;
+unlock:
+    spin_unlock(&vma->vm_mm->uxpgd_lock);
+    pr_info("clear present uxpte of addr %lx is %lx\n", addr, val);
+
+    return 0;
+}
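Note on the entry updates above: uxpte_set_present(), uxpte_clear_present() and lock_uxpte() share one lock-free shape — read the 64-bit entry, validate it, atomic64_cmpxchg() to the new value, and retry if another CPU raced in between. Bit 0 of an entry tracks whether the backing page is present. A compilable C11 model of that retry loop, with deliberately simplified semantics (the kernel's exact lock encoding differs):

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    #define PRESENT  1ULL           /* bit 0, as in the uxpte entries */
    #define RECLAIM  (1ULL << 1)    /* simplified "locked for reclaim" bit */

    /* Try to take the entry for reclaim; fails if the page is not present
     * or another reclaimer already holds it. */
    static int try_lock_entry(_Atomic uint64_t *e)
    {
        uint64_t val;

        do {
            val = atomic_load(e);
            if (!(val & PRESENT) || (val & RECLAIM))
                return 0;            /* cannot lock; caller skips the page */
        } while (!atomic_compare_exchange_weak(e, &val, val | RECLAIM));
        return 1;                    /* entry transitioned atomically */
    }

    int main(void)
    {
        _Atomic uint64_t entry = PRESENT;

        printf("first lock:  %d\n", try_lock_entry(&entry)); /* 1 */
        printf("second lock: %d\n", try_lock_entry(&entry)); /* 0 */
        return 0;
    }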
diff --git a/mm/rmap.c b/mm/rmap.c
index 0602881d5630..46a396a826b7 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1454,7 +1454,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
         }
 #endif
 #ifdef CONFIG_MEM_PURGEABLE
-        if ((vma->vm_flags & VM_PURGEABLE) && !lock_userexpte(vma, address)) {
+        if ((vma->vm_flags & VM_PURGEABLE) && !lock_uxpte(vma, address)) {
             ret = false;
             page_vma_mapped_walk_done(&pvmw);
             pr_info("uxpte hold purgeable page %lx\n", page_to_pfn(page));
@@ -1605,7 +1605,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
         } else if ((vma->vm_flags & VM_PURGEABLE) || (pte_unused(pteval) &&
                 !userfaultfd_armed(vma))) {
             if (vma->vm_flags & VM_PURGEABLE) {
-                unlock_userexpte(vma, address);
+                unlock_uxpte(vma, address);
                 pr_info("unmap purgeable page %lx\n", page_to_pfn(page));
             }
             /*
--
Gitee

From 968925fe0e36b092a1d9732b39bb50ab490cafc2 Mon Sep 17 00:00:00 2001
From: Chengke Wang
Date: Tue, 31 May 2022 15:43:42 +0800
Subject: [PATCH 09/10] fix may_deactivate

---
 mm/vmscan.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 90dcdd8b34ea..25295af3eeb5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2139,8 +2139,7 @@ unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
     reclaimacct_substage_start(stub);
 #endif
     if (is_active_lru(lru)) {
-        if (sc->may_deactivate & (1 << is_file_lru(lru))
-            || lru == LRU_ACTIVE_PURGEABLE)
+        if (sc->may_deactivate & (1 << is_file_lru(lru)))
             shrink_active_list(nr_to_scan, lruvec, sc, lru);
         else
             sc->skipped_deactivate = 1;
@@ -4377,7 +4376,7 @@ static int purgeable(struct ctl_table *table, int write, void *buffer,
         .gfp_mask = GFP_KERNEL,
         .order = 0,
         .priority = DEF_PRIORITY,
-        .may_deactivate = 1,
+        .may_deactivate = DEACTIVATE_ANON,
         .may_writepage = 1,
         .may_unmap = 1,
         .may_swap = 1,
--
Gitee

From adfc10b357b69e8a0a28b22d8dbc7b36ad465dcb Mon Sep 17 00:00:00 2001
From: Chengke Wang
Date: Tue, 31 May 2022 17:31:50 +0800
Subject: [PATCH 10/10] macro refactory

---
 include/linux/mm_inline.h |  2 ++
 include/linux/mmzone.h    | 13 ++++++++++---
 mm/Kconfig                |  7 +++++++
 mm/vmscan.c               | 15 ++++++++++-----
 4 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index d0e326dabb33..a3e5b0d9a340 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -76,8 +76,10 @@ static __always_inline void del_page_from_lru_list(struct page *page,
  */
 static inline enum lru_list page_lru_base_type(struct page *page)
 {
+#ifdef CONFIG_MEM_PURGEABLE
     if (PagePurgeable(page))
         return LRU_INACTIVE_PURGEABLE;
+#endif
 
     if (page_is_file_lru(page))
         return LRU_INACTIVE_FILE;
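Note on the LRU layout in the mmzone.h hunks below: with LRU_PURGEABLE = 4, the two purgeable lists slot in after the four anon/file lists and before LRU_UNEVICTABLE, which is why for_each_evictable_lru must now iterate by upper bound (lru < LRU_UNEVICTABLE) instead of lru <= LRU_ACTIVE_FILE. The resulting indices, reproduced as a compilable check mirroring the enum when CONFIG_MEM_PURGEABLE is set:

    #include <stdio.h>

    #define LRU_BASE 0
    #define LRU_ACTIVE 1
    #define LRU_FILE 2
    #define LRU_PURGEABLE 4

    enum lru_list {
        LRU_INACTIVE_ANON = LRU_BASE,                                 /* 0 */
        LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE,                      /* 1 */
        LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,                      /* 2 */
        LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE,           /* 3 */
        LRU_INACTIVE_PURGEABLE = LRU_BASE + LRU_PURGEABLE,            /* 4 */
        LRU_ACTIVE_PURGEABLE = LRU_BASE + LRU_PURGEABLE + LRU_ACTIVE, /* 5 */
        LRU_UNEVICTABLE,                                              /* 6 */
        NR_LRU_LISTS                                                  /* 7 */
    };

    int main(void)
    {
        /* the evictable walk now covers indices 0..5 */
        for (int lru = 0; lru < LRU_UNEVICTABLE; lru++)
            printf("evictable lru %d (active=%d)\n", lru,
                   lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE ||
                   lru == LRU_ACTIVE_PURGEABLE);
        return 0;
    }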
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index a1c1d1c0baaa..f4d955c67b54 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -262,22 +262,26 @@ static __always_inline bool vmstat_item_in_bytes(int idx)
 #define LRU_BASE 0
 #define LRU_ACTIVE 1
 #define LRU_FILE 2
+#ifdef CONFIG_MEM_PURGEABLE
 #define LRU_PURGEABLE 4
+#endif
 
 enum lru_list {
     LRU_INACTIVE_ANON = LRU_BASE,
     LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE,
     LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
     LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE,
+#ifdef CONFIG_MEM_PURGEABLE
     LRU_INACTIVE_PURGEABLE = LRU_BASE + LRU_PURGEABLE,
     LRU_ACTIVE_PURGEABLE = LRU_BASE + LRU_PURGEABLE + LRU_ACTIVE,
+#endif
     LRU_UNEVICTABLE,
     NR_LRU_LISTS
 };
 
 #define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++)
 
-#define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++)
+#define for_each_evictable_lru(lru) for (lru = 0; lru < LRU_UNEVICTABLE; lru++)
 
 static inline bool is_file_lru(enum lru_list lru)
 {
@@ -286,8 +290,11 @@
 
 static inline bool is_active_lru(enum lru_list lru)
 {
-    return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE ||
-        lru == LRU_ACTIVE_PURGEABLE);
+#ifdef CONFIG_MEM_PURGEABLE
+    if (lru == LRU_ACTIVE_PURGEABLE)
+        return true;
+#endif
+    return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE);
 }
 
 #define ANON_AND_FILE 2
diff --git a/mm/Kconfig b/mm/Kconfig
index 36ef0bed11ea..be952d88d0ac 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -946,4 +946,11 @@ config MEM_PURGEABLE
     help
       Support purgeable pages for process
 
+config MEM_PURGEABLE_DEBUG
+    bool "Purgeable memory debug"
+    default n
+    depends on MEM_PURGEABLE
+    help
+      Debug info for purgeable memory
+
 endmenu
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 25295af3eeb5..bdb7ae170233 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4093,7 +4093,9 @@ void kswapd_stop(int nid)
     }
 }
 
-static void __init purgeable_init(void);
+#ifdef CONFIG_MEM_PURGEABLE_DEBUG
+static void __init purgeable_debugfs_init(void);
+#endif
 
 static int __init kswapd_init(void)
 {
@@ -4102,8 +4104,9 @@ static int __init kswapd_init(void)
     swap_setup();
     for_each_node_state(nid, N_MEMORY)
         kswapd_run(nid);
-
-    purgeable_init();
+#ifdef CONFIG_MEM_PURGEABLE_DEBUG
+    purgeable_debugfs_init();
+#endif
     return 0;
 }
 
@@ -4353,6 +4356,7 @@ void check_move_unevictable_pages(struct pagevec *pvec)
 }
 EXPORT_SYMBOL_GPL(check_move_unevictable_pages);
 
+#ifdef CONFIG_MEM_PURGEABLE_DEBUG
 static unsigned long purgeable_node(pg_data_t *pgdata, struct scan_control *sc)
 {
     struct mem_cgroup *memcg = NULL;
@@ -4409,14 +4413,15 @@ static struct ctl_table sys_tab[] = {
 
 static struct ctl_table_header *purgeable_header;
 
-static void __init purgeable_init(void)
+static void __init purgeable_debugfs_init(void)
 {
     purgeable_header = register_sysctl_table(sys_tab);
     if (!purgeable_header)
         pr_err("register purgeable sysctl table failed.\n");
 }
 
-static void __exit purgeable_exit(void)
+static void __exit purgeable_debugfs_exit(void)
 {
     unregister_sysctl_table(purgeable_header);
 }
+#endif /* CONFIG_MEM_PURGEABLE_DEBUG */
--
Gitee
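Note on PATCH 10's debug knob: CONFIG_MEM_PURGEABLE_DEBUG registers a sysctl whose handler kicks off a purgeable-only reclaim pass (with may_deactivate = DEACTIVATE_ANON after PATCH 09, since purgeable pages live on anon-style lists). The actual sysctl path comes from sys_tab, whose entries are not shown in this series; assuming it surfaces under /proc/sys as something like kernel/purgeable, a test harness could poke it as below — the path is hypothetical:

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        /* Hypothetical path; the real one depends on sys_tab's procname. */
        int fd = open("/proc/sys/kernel/purgeable", O_WRONLY);

        if (fd < 0) {
            perror("open");  /* kernel built without MEM_PURGEABLE_DEBUG? */
            return 1;
        }
        if (write(fd, "1", 1) != 1)
            perror("write"); /* handler runs a purgeable reclaim pass */
        close(fd);
        return 0;
    }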