From 5acb1b1357ca10cfa937cccb82165e620dd0e10a Mon Sep 17 00:00:00 2001 From: Chengke Wang Date: Tue, 31 May 2022 19:42:30 +0800 Subject: [PATCH 1/2] mm: Purgeable Memory ohos inclusion category: feature issue: #I5C0RT CVE: NA ------------------------------------ Add purgeable memory kernel support Signed-off-by: Chengke Wang Signed-off-by: lijiawei --- fs/proc/meminfo.c | 17 +- include/linux/mm.h | 12 ++ include/linux/mm_inline.h | 5 + include/linux/mm_purgeable.h | 50 ++++++ include/linux/mm_types.h | 4 + include/linux/mman.h | 4 + include/linux/mmzone.h | 21 ++- include/linux/page-flags.h | 8 + include/trace/events/mmflags.h | 7 + include/uapi/asm-generic/mman-common.h | 2 + kernel/fork.c | 3 + mm/Kconfig | 14 ++ mm/Makefile | 1 + mm/memory.c | 19 +- mm/mmap.c | 6 +- mm/purgeable.c | 236 +++++++++++++++++++++++++ mm/rmap.c | 20 ++- mm/vmscan.c | 91 +++++++++- mm/vmstat.c | 8 + 19 files changed, 516 insertions(+), 12 deletions(-) create mode 100644 include/linux/mm_purgeable.h create mode 100644 mm/purgeable.c diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 248e0afeac94..0ebced4b78c6 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -38,6 +38,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) unsigned long pages[NR_LRU_LISTS]; unsigned long sreclaimable, sunreclaim; int lru; + unsigned long nr_purgeable_active = 0; + unsigned long nr_purgeable_inactive = 0; si_meminfo(&i); si_swapinfo(&i); @@ -51,6 +53,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v) for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) pages[lru] = global_node_page_state(NR_LRU_BASE + lru); +#ifdef CONFIG_MEM_PURGEABLE + nr_purgeable_active = pages[LRU_ACTIVE_PURGEABLE]; + nr_purgeable_inactive = pages[LRU_INACTIVE_PURGEABLE]; +#endif + available = si_mem_available(); sreclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B); sunreclaim = global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B); @@ -62,13 +69,19 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "Cached: ", cached); show_val_kb(m, "SwapCached: ", total_swapcache_pages()); show_val_kb(m, "Active: ", pages[LRU_ACTIVE_ANON] + - pages[LRU_ACTIVE_FILE]); + pages[LRU_ACTIVE_FILE] + + nr_purgeable_active); show_val_kb(m, "Inactive: ", pages[LRU_INACTIVE_ANON] + - pages[LRU_INACTIVE_FILE]); + pages[LRU_INACTIVE_FILE] + + nr_purgeable_inactive); show_val_kb(m, "Active(anon): ", pages[LRU_ACTIVE_ANON]); show_val_kb(m, "Inactive(anon): ", pages[LRU_INACTIVE_ANON]); show_val_kb(m, "Active(file): ", pages[LRU_ACTIVE_FILE]); show_val_kb(m, "Inactive(file): ", pages[LRU_INACTIVE_FILE]); +#ifdef CONFIG_MEM_PURGEABLE + show_val_kb(m, "Active(purgeable): ", nr_purgeable_active); + show_val_kb(m, "Inactive(purgeable): ", nr_purgeable_inactive); +#endif show_val_kb(m, "Unevictable: ", pages[LRU_UNEVICTABLE]); show_val_kb(m, "Mlocked: ", global_zone_page_state(NR_MLOCK)); diff --git a/include/linux/mm.h b/include/linux/mm.h index 6a92fac06b44..40ca6f122b9d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -304,13 +304,25 @@ extern unsigned int kobjsize(const void *objp); #define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */ +#define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */ +#define VM_HIGH_ARCH_BIT_6 38 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0) 
#define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1) #define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2) #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4) +#define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5) +#define VM_HIGH_ARCH_6 BIT(VM_HIGH_ARCH_BIT_6) #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ +#ifdef CONFIG_MEM_PURGEABLE +#define VM_PURGEABLE VM_HIGH_ARCH_5 +#define VM_USEREXPTE VM_HIGH_ARCH_6 +#else /* CONFIG_MEM_PURGEABLE */ +#define VM_PURGEABLE 0 +#define VM_USEREXPTE 0 +#endif /* CONFIG_MEM_PURGEABLE */ + #ifdef CONFIG_ARCH_HAS_PKEYS # define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0 # define VM_PKEY_BIT0 VM_HIGH_ARCH_0 /* A protection key is a 4-bit value */ diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 8939659fc32f..e7c1d78ce243 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -77,6 +77,11 @@ static __always_inline void del_page_from_lru_list(struct page *page, */ static inline enum lru_list page_lru_base_type(struct page *page) { +#ifdef CONFIG_MEM_PURGEABLE + if (PagePurgeable(page)) + return LRU_INACTIVE_PURGEABLE; +#endif + if (page_is_file_lru(page)) return LRU_INACTIVE_FILE; return LRU_INACTIVE_ANON; diff --git a/include/linux/mm_purgeable.h b/include/linux/mm_purgeable.h new file mode 100644 index 000000000000..11994d5cf59c --- /dev/null +++ b/include/linux/mm_purgeable.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. + */ + +#ifndef __MM_PURGEABLE_MEM_H +#define __MM_PURGEABLE_MEM_H + +#ifdef CONFIG_MEM_PURGEABLE + +void mm_init_uxpgd(struct mm_struct *mm); +void mm_clear_uxpgd(struct mm_struct *mm); +bool lock_uxpte(struct vm_area_struct *vma, unsigned long addr); +void unlock_uxpte(struct vm_area_struct *vma, unsigned long addr); +vm_fault_t do_uxpte_page_fault(struct vm_fault *vmf, pte_t *entry); +bool uxpte_set_present(struct vm_area_struct *vma, unsigned long addr); +void uxpte_clear_present(struct vm_area_struct *vma, unsigned long addr); + +#else /* CONFIG_MEM_PURGEABLE */ + +static inline void mm_init_uxpgd(struct mm_struct *mm) {} + +static inline void mm_clear_uxpgd(struct mm_struct *mm) {} + +static inline bool lock_uxpte(struct vm_area_struct *vma, + unsigned long addr) +{ + return false; +} + +static inline void unlock_uxpte(struct vm_area_struct *vma, + unsigned long addr) {} + +static inline vm_fault_t do_uxpte_page_fault(struct vm_fault *vmf, + pte_t *entry) +{ + return 0; +} + +static inline bool uxpte_set_present(struct vm_area_struct *vma, + unsigned long addr) +{ + return false; +} + +static inline void uxpte_clear_present(struct vm_area_struct *vma, + unsigned long addr) {} +#endif /* CONFIG_MEM_PURGEABLE */ +#endif /* __MM_PURGEABLE_MEM_H */ + diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index a9249cb18123..d86bc1d2dcc3 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -425,6 +425,10 @@ struct mm_struct { unsigned long task_size; /* size of task vm space */ unsigned long highest_vm_end; /* highest vma end address */ pgd_t * pgd; +#ifdef CONFIG_MEM_PURGEABLE + void *uxpgd; + spinlock_t uxpgd_lock; +#endif #ifdef CONFIG_MEMBARRIER /** diff --git a/include/linux/mman.h b/include/linux/mman.h index 629cefc4ecba..946641c59135 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -154,6 +154,10 @@ calc_vm_flag_bits(unsigned long flags) _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) | _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) | _calc_vm_trans(flags, MAP_SYNC, 
VM_SYNC ) | +#ifdef CONFIG_MEM_PURGEABLE + _calc_vm_trans(flags, MAP_PURGEABLE, VM_PURGEABLE ) | + _calc_vm_trans(flags, MAP_USEREXPTE, VM_USEREXPTE ) | +#endif arch_calc_vm_flag_bits(flags); } diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index d66cecefa84f..f4d955c67b54 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -158,6 +158,10 @@ enum zone_stat_item { NR_ZONE_ACTIVE_ANON, NR_ZONE_INACTIVE_FILE, NR_ZONE_ACTIVE_FILE, +#ifdef CONFIG_MEM_PURGEABLE + NR_ZONE_INACTIVE_PURGEABLE, + NR_ZONE_ACTIVE_PURGEABLE, +#endif NR_ZONE_UNEVICTABLE, NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */ NR_MLOCK, /* mlock()ed pages found and moved off LRU */ @@ -179,6 +183,10 @@ enum node_stat_item { NR_ACTIVE_ANON, /* " " " " " */ NR_INACTIVE_FILE, /* " " " " " */ NR_ACTIVE_FILE, /* " " " " " */ +#ifdef CONFIG_MEM_PURGEABLE + NR_INACTIVE_PURGEABLE, + NR_ACTIVE_PURGEABLE, +#endif NR_UNEVICTABLE, /* " " " " " */ NR_SLAB_RECLAIMABLE_B, NR_SLAB_UNRECLAIMABLE_B, @@ -254,19 +262,26 @@ static __always_inline bool vmstat_item_in_bytes(int idx) #define LRU_BASE 0 #define LRU_ACTIVE 1 #define LRU_FILE 2 +#ifdef CONFIG_MEM_PURGEABLE +#define LRU_PURGEABLE 4 +#endif enum lru_list { LRU_INACTIVE_ANON = LRU_BASE, LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE, LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE, LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE, +#ifdef CONFIG_MEM_PURGEABLE + LRU_INACTIVE_PURGEABLE = LRU_BASE + LRU_PURGEABLE, + LRU_ACTIVE_PURGEABLE = LRU_BASE + LRU_PURGEABLE + LRU_ACTIVE, +#endif LRU_UNEVICTABLE, NR_LRU_LISTS }; #define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++) -#define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++) +#define for_each_evictable_lru(lru) for (lru = 0; lru < LRU_UNEVICTABLE; lru++) static inline bool is_file_lru(enum lru_list lru) { @@ -275,6 +290,10 @@ static inline bool is_file_lru(enum lru_list lru) static inline bool is_active_lru(enum lru_list lru) { +#ifdef CONFIG_MEM_PURGEABLE + if (lru == LRU_ACTIVE_PURGEABLE) + return true; +#endif return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE); } diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index a6446a50c39f..dcf83c01f57b 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -142,6 +142,9 @@ enum pageflags { #ifdef CONFIG_PAGE_TRACING PG_skb, PG_zspage, +#endif +#ifdef CONFIG_MEM_PURGEABLE + PG_purgeable, #endif __NR_PAGEFLAGS, @@ -461,6 +464,11 @@ PAGEFLAG(Idle, idle, PF_ANY) */ __PAGEFLAG(Reported, reported, PF_NO_COMPOUND) +#ifdef CONFIG_MEM_PURGEABLE +PAGEFLAG(Purgeable, purgeable, PF_ANY) +#else +PAGEFLAG_FALSE(Purgeable) +#endif /* * On an anonymous page mapped into a user virtual memory area, * page->mapping points to its anon_vma, not to a struct address_space; diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 67018d367b9f..2332482f7df7 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -55,6 +55,12 @@ __def_gfpflag_names \ ) : "none" +#ifdef CONFIG_MEM_PURGEABLE +#define IF_HAVE_PG_PURGEABLE(flag,string) ,{1UL << flag, string} +#else +#define IF_HAVE_PG_PURGEABLE(flag,string) +#endif + #ifdef CONFIG_MMU #define IF_HAVE_PG_MLOCK(flag,string) ,{1UL << flag, string} #else @@ -107,6 +113,7 @@ {1UL << PG_reclaim, "reclaim" }, \ {1UL << PG_swapbacked, "swapbacked" }, \ {1UL << PG_unevictable, "unevictable" } \ +IF_HAVE_PG_PURGEABLE(PG_purgeable, "purgeable" ) \ IF_HAVE_PG_MLOCK(PG_mlocked, "mlocked" ) \ 
IF_HAVE_PG_UNCACHED(PG_uncached, "uncached" ) \ IF_HAVE_PG_HWPOISON(PG_hwpoison, "hwpoison" ) \ diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index f94f65d429be..8003a595a007 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -21,6 +21,8 @@ #define MAP_TYPE 0x0f /* Mask for type of mapping */ #define MAP_FIXED 0x10 /* Interpret addr exactly */ #define MAP_ANONYMOUS 0x20 /* don't use a file */ +#define MAP_PURGEABLE 0x40 +#define MAP_USEREXPTE 0x80 /* 0x0100 - 0x4000 flags are defined in asm-generic/mman.h */ #define MAP_POPULATE 0x008000 /* populate (prefault) pagetables */ diff --git a/kernel/fork.c b/kernel/fork.c index 0426eb8a4042..9b0ab3b61d65 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -100,6 +100,7 @@ #ifdef CONFIG_RECLAIM_ACCT #include #endif +#include #include #include @@ -626,6 +627,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, static inline int mm_alloc_pgd(struct mm_struct *mm) { + mm_init_uxpgd(mm); mm->pgd = pgd_alloc(mm); if (unlikely(!mm->pgd)) return -ENOMEM; @@ -635,6 +637,7 @@ static inline int mm_alloc_pgd(struct mm_struct *mm) static inline void mm_free_pgd(struct mm_struct *mm) { pgd_free(mm, mm->pgd); + mm_clear_uxpgd(mm); } #else static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) diff --git a/mm/Kconfig b/mm/Kconfig index 291e90d2de32..c39f868947e3 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -958,4 +958,18 @@ config RSS_THRESHOLD help Set a threshold to monitoring RSS in per pid +config MEM_PURGEABLE + bool "Purgeable memory feature" + default n + select ARCH_USES_HIGH_VMA_FLAGS + help + Support purgeable pages for process + +config MEM_PURGEABLE_DEBUG + bool "Purgeable memory debug" + default n + depends on MEM_PURGEABLE + help + Debug info for purgeable memory + endmenu diff --git a/mm/Makefile b/mm/Makefile index d193db7a48e0..bebdabe1fce5 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -128,3 +128,4 @@ obj-$(CONFIG_HYPERHOLD_FILE_LRU) += memcg_reclaim.o obj-$(CONFIG_HYPERHOLD_MEMCG) += memcg_control.o obj-$(CONFIG_HYPERHOLD_ZSWAPD) += zswapd.o zswapd_control.o obj-$(CONFIG_RECLAIM_ACCT) += reclaim_acct.o reclaimacct_show.o +obj-$(CONFIG_MEM_PURGEABLE) += purgeable.o diff --git a/mm/memory.c b/mm/memory.c index 4fe24cd865a7..73231717b75c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -73,6 +73,7 @@ #include #include #include +#include #include @@ -1236,6 +1237,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, struct page *page; page = vm_normal_page(vma, addr, ptent); + if (vma->vm_flags & VM_USEREXPTE) + page = NULL; if (unlikely(details) && page) { /* * unmap_shared_mapping_pages() wants to @@ -1251,7 +1254,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, tlb_remove_tlb_entry(tlb, pte, addr); if (unlikely(!page)) continue; - + if (vma->vm_flags & VM_PURGEABLE) + uxpte_clear_present(vma, addr); if (!PageAnon(page)) { if (pte_dirty(ptent)) { force_flush = 1; @@ -3555,11 +3559,20 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) if (unlikely(pmd_trans_unstable(vmf->pmd))) return 0; + /* use extra page table for userexpte */ + if (vma->vm_flags & VM_USEREXPTE) { + if (do_uxpte_page_fault(vmf, &entry)) + goto oom; + else + goto got_page; + } + /* Use the zero-page for reads */ if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm)) { entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address), vma->vm_page_prot)); +got_page: vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, 
vmf->address, &vmf->ptl); if (!pte_none(*vmf->pte)) { @@ -3620,6 +3633,10 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, vmf->address, false); + if (vma->vm_flags & VM_PURGEABLE) { + SetPagePurgeable(page); + uxpte_set_present(vma, vmf->address); + } lru_cache_add_inactive_or_unevictable(page, vma); setpte: set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); diff --git a/mm/mmap.c b/mm/mmap.c index a5e0958acd05..b134946984a9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1559,14 +1559,16 @@ unsigned long do_mmap(struct file *file, unsigned long addr, /* * Ignore pgoff. */ - pgoff = 0; + if (!(flags & MAP_USEREXPTE)) + pgoff = 0; vm_flags |= VM_SHARED | VM_MAYSHARE; break; case MAP_PRIVATE: /* * Set pgoff according to addr for anon_vma. */ - pgoff = addr >> PAGE_SHIFT; + if (!(flags & MAP_USEREXPTE)) + pgoff = addr >> PAGE_SHIFT; break; default: return -EINVAL; diff --git a/mm/purgeable.c b/mm/purgeable.c new file mode 100644 index 000000000000..5ce7459b6c82 --- /dev/null +++ b/mm/purgeable.c @@ -0,0 +1,236 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. + */ + +#include +#include +#include +#include +#include +#include + +#include + +typedef atomic64_t uxpte_t; + +#define uxpte_read atomic64_read +#define uxpte_set atomic64_set +#define uxpte_cas(v, old, new) (atomic64_cmpxchg((v), (old), (new)) == (old)) + +#define UXPTE_SIZE_SHIFT 3 +#define UXPTE_SIZE (1 << UXPTE_SIZE_SHIFT) + +#define UXPTE_PER_PAGE_SHIFT (PAGE_SHIFT - UXPTE_SIZE_SHIFT) +#define UXPTE_PER_PAGE (1 << UXPTE_PER_PAGE_SHIFT) + +#define UXPTE_PRESENT_BIT 1 +#define UXPTE_PRESENT_MASK (1 << UXPTE_PRESENT_BIT) +#define UXPTE_REFCNT_ONE (1 << UXPTE_PRESENT_BIT) +#define UXPTE_UNDER_RECLAIM (-UXPTE_REFCNT_ONE) + +#define vpn(vaddr) ((vaddr) >> PAGE_SHIFT) +#define uxpte_pn(vaddr) (vpn(vaddr) >> UXPTE_PER_PAGE_SHIFT) +#define uxpte_off(vaddr) (vpn(vaddr) & (UXPTE_PER_PAGE - 1)) +#define uxpn2addr(uxpn) ((uxpn) << (UXPTE_PER_PAGE_SHIFT + PAGE_SHIFT)) + +void mm_init_uxpgd(struct mm_struct *mm) +{ + mm->uxpgd = NULL; + spin_lock_init(&mm->uxpgd_lock); +} + +void mm_clear_uxpgd(struct mm_struct *mm) +{ + struct page *page = NULL; + void **slot = NULL; + struct radix_tree_iter iter; + + spin_lock(&mm->uxpgd_lock); + if (!mm->uxpgd) + goto out; + radix_tree_for_each_slot(slot, mm->uxpgd, &iter, 0) { + page = radix_tree_delete(mm->uxpgd, iter.index); + put_page(page); + } +out: + kfree(mm->uxpgd); + mm->uxpgd = NULL; + spin_unlock(&mm->uxpgd_lock); +} + +/* should hold uxpgd_lock before invoke */ +static struct page *lookup_uxpte_page(struct vm_area_struct *vma, + unsigned long addr, bool alloc) +{ + struct radix_tree_root *uxpgd = NULL; + struct page *page = NULL; + struct page *new_page = NULL; + struct mm_struct *mm = vma->vm_mm; + unsigned long uxpn = uxpte_pn(addr); + + if (mm->uxpgd) + goto lookup; + if (!alloc) + goto out; + spin_unlock(&mm->uxpgd_lock); + uxpgd = kzalloc(sizeof(struct radix_tree_root), GFP_KERNEL); + if (!uxpgd) { + pr_err("uxpgd alloc failed.\n"); + spin_lock(&mm->uxpgd_lock); + goto out; + } + INIT_RADIX_TREE(uxpgd, GFP_KERNEL); + spin_lock(&mm->uxpgd_lock); + if (mm->uxpgd) + kfree(uxpgd); + else + mm->uxpgd = uxpgd; +lookup: + page = radix_tree_lookup(mm->uxpgd, uxpn); + if (page) + goto out; + if (!alloc) + goto out; + spin_unlock(&mm->uxpgd_lock); + new_page = alloc_zeroed_user_highpage_movable(vma, addr); + if (!new_page) { + pr_err("uxpte page alloc fail.\n"); + 
spin_lock(&mm->uxpgd_lock); + goto out; + } + if (radix_tree_preload(GFP_KERNEL)) { + put_page(new_page); + pr_err("radix preload fail.\n"); + spin_lock(&mm->uxpgd_lock); + goto out; + } + spin_lock(&mm->uxpgd_lock); + page = radix_tree_lookup(mm->uxpgd, uxpn); + if (page) { + put_page(new_page); + } else { + page = new_page; + radix_tree_insert(mm->uxpgd, uxpn, page); + } + radix_tree_preload_end(); +out: + return page; +} + +/* should hold uxpgd_lock before invoke */ +static uxpte_t *lookup_uxpte(struct vm_area_struct *vma, + unsigned long addr, bool alloc) +{ + uxpte_t *uxpte = NULL; + struct page *page = NULL; + + page = lookup_uxpte_page(vma, addr, alloc); + if (!page) + return NULL; + uxpte = page_to_virt(page); + + return uxpte + uxpte_off(addr); +} + +bool lock_uxpte(struct vm_area_struct *vma, unsigned long addr) +{ + uxpte_t *uxpte = NULL; + long val = 0; + + spin_lock(&vma->vm_mm->uxpgd_lock); + uxpte = lookup_uxpte(vma, addr, true); + if (!uxpte) + goto unlock; +retry: + val = uxpte_read(uxpte); + if (val >> 1) + goto unlock; + if (!uxpte_cas(uxpte, val, UXPTE_UNDER_RECLAIM)) + goto retry; + val = UXPTE_UNDER_RECLAIM; +unlock: + spin_unlock(&vma->vm_mm->uxpgd_lock); + + return val == UXPTE_UNDER_RECLAIM; +} + +void unlock_uxpte(struct vm_area_struct *vma, unsigned long addr) +{ + uxpte_t *uxpte = NULL; + + spin_lock(&vma->vm_mm->uxpgd_lock); + uxpte = lookup_uxpte(vma, addr, false); + if (!uxpte) + goto unlock; + uxpte_set(uxpte, 0); +unlock: + spin_unlock(&vma->vm_mm->uxpgd_lock); +} + +bool uxpte_set_present(struct vm_area_struct *vma, unsigned long addr) +{ + uxpte_t *uxpte = NULL; + long val = 0; + + spin_lock(&vma->vm_mm->uxpgd_lock); + uxpte = lookup_uxpte(vma, addr, true); + if (!uxpte) + goto unlock; +retry: + val = uxpte_read(uxpte); + if (val & 1) + goto unlock; + if (!uxpte_cas(uxpte, val, val + 1)) + goto retry; + val++; +unlock: + spin_unlock(&vma->vm_mm->uxpgd_lock); + + return val & 1; +} + +void uxpte_clear_present(struct vm_area_struct *vma, unsigned long addr) +{ + uxpte_t *uxpte = NULL; + long val = 0; + + spin_lock(&vma->vm_mm->uxpgd_lock); + uxpte = lookup_uxpte(vma, addr, false); + if (!uxpte) + goto unlock; +retry: + val = uxpte_read(uxpte); + if (!(val & 1)) + goto unlock; + if (!uxpte_cas(uxpte, val, val - 1)) + goto retry; + val--; +unlock: + spin_unlock(&vma->vm_mm->uxpgd_lock); +} + +vm_fault_t do_uxpte_page_fault(struct vm_fault *vmf, pte_t *entry) +{ + struct vm_area_struct *vma = vmf->vma; + unsigned long vma_uxpn = vma->vm_pgoff; + unsigned long off_uxpn = vpn(vmf->address - vma->vm_start); + unsigned long addr = uxpn2addr(vma_uxpn + off_uxpn); + struct page *page = NULL; + + if (unlikely(anon_vma_prepare(vma))) + return VM_FAULT_OOM; + + spin_lock(&vma->vm_mm->uxpgd_lock); + page = lookup_uxpte_page(vma, addr, true); + spin_unlock(&vma->vm_mm->uxpgd_lock); + + if (!page) + return VM_FAULT_OOM; + + *entry = mk_pte(page, vma->vm_page_prot); + *entry = pte_sw_mkyoung(*entry); + if (vma->vm_flags & VM_WRITE) + *entry = pte_mkwrite(pte_mkdirty(*entry)); + return 0; +} diff --git a/mm/rmap.c b/mm/rmap.c index cdf549f6f617..ef7d4df28615 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -72,6 +72,7 @@ #include #include #include +#include #include @@ -782,6 +783,10 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma, while (page_vma_mapped_walk(&pvmw)) { address = pvmw.address; +#ifdef CONFIG_MEM_PURGEABLE + if (!(vma->vm_flags & VM_PURGEABLE)) + pra->vm_flags &= ~VM_PURGEABLE; +#endif if (vma->vm_flags & VM_LOCKED) { 
page_vma_mapped_walk_done(&pvmw); pra->vm_flags |= VM_LOCKED; @@ -821,7 +826,7 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma, if (referenced) { pra->referenced++; - pra->vm_flags |= vma->vm_flags; + pra->vm_flags |= vma->vm_flags & ~VM_PURGEABLE; } if (!pra->mapcount) @@ -860,6 +865,7 @@ int page_referenced(struct page *page, struct page_referenced_arg pra = { .mapcount = total_mapcount(page), .memcg = memcg, + .vm_flags = VM_PURGEABLE, }; struct rmap_walk_control rwc = { .rmap_one = page_referenced_one, @@ -1443,6 +1449,13 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, continue; } #endif +#ifdef CONFIG_MEM_PURGEABLE + if ((vma->vm_flags & VM_PURGEABLE) && !lock_uxpte(vma, address)) { + ret = false; + page_vma_mapped_walk_done(&pvmw); + break; + } +#endif /* * If the page is mlock()d, we cannot swap it out. @@ -1584,7 +1597,10 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, set_pte_at(mm, address, pvmw.pte, pteval); } - } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) { + } else if ((vma->vm_flags & VM_PURGEABLE) || (pte_unused(pteval) && + !userfaultfd_armed(vma))) { + if (vma->vm_flags & VM_PURGEABLE) + unlock_uxpte(vma, address); /* * The guest indicated that the page content is of no * interest anymore. Simply discard the pte, vmscan diff --git a/mm/vmscan.c b/mm/vmscan.c index 5f412c857d94..95599a792baf 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -914,6 +914,7 @@ void putback_lru_page(struct page *page) enum page_references { PAGEREF_RECLAIM, PAGEREF_RECLAIM_CLEAN, + PAGEREF_RECLAIM_PURGEABLE, PAGEREF_KEEP, PAGEREF_ACTIVATE, }; @@ -935,6 +936,10 @@ static enum page_references page_check_references(struct page *page, if (vm_flags & VM_LOCKED) return PAGEREF_RECLAIM; +#ifdef CONFIG_MEM_PURGEABLE + if (vm_flags & VM_PURGEABLE) + return PAGEREF_RECLAIM_PURGEABLE; +#endif if (referenced_ptes) { /* * All mapped pages start out with page table @@ -1163,6 +1168,7 @@ unsigned int shrink_page_list(struct list_head *page_list, goto keep_locked; case PAGEREF_RECLAIM: case PAGEREF_RECLAIM_CLEAN: + case PAGEREF_RECLAIM_PURGEABLE: ; /* try to reclaim the page below */ } @@ -1172,7 +1178,7 @@ unsigned int shrink_page_list(struct list_head *page_list, * Lazyfree page could be freed directly */ if (PageAnon(page) && PageSwapBacked(page)) { - if (!PageSwapCache(page)) { + if (!PageSwapCache(page) && references != PAGEREF_RECLAIM_PURGEABLE) { if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; if (page_maybe_dma_pinned(page)) @@ -1247,7 +1253,7 @@ unsigned int shrink_page_list(struct list_head *page_list, } } - if (PageDirty(page)) { + if (PageDirty(page) && references != PAGEREF_RECLAIM_PURGEABLE) { /* * Only kswapd can writeback filesystem pages * to avoid risk of stack overflow. 
But avoid @@ -1355,11 +1361,11 @@ unsigned int shrink_page_list(struct list_head *page_list, } } - if (PageAnon(page) && !PageSwapBacked(page)) { + if (PageAnon(page) && (!PageSwapBacked(page) || references == PAGEREF_RECLAIM_PURGEABLE)) { /* follow __remove_mapping for reference */ if (!page_ref_freeze(page, 1)) goto keep_locked; - if (PageDirty(page)) { + if (PageDirty(page) && references != PAGEREF_RECLAIM_PURGEABLE) { page_ref_unfreeze(page, 1); goto keep_locked; } @@ -4086,6 +4092,10 @@ void kswapd_stop(int nid) } } +#ifdef CONFIG_MEM_PURGEABLE_DEBUG +static void __init purgeable_debugfs_init(void); +#endif + static int __init kswapd_init(void) { int nid; @@ -4093,6 +4103,9 @@ static int __init kswapd_init(void) swap_setup(); for_each_node_state(nid, N_MEMORY) kswapd_run(nid); +#ifdef CONFIG_MEM_PURGEABLE_DEBUG + purgeable_debugfs_init(); +#endif return 0; } @@ -4341,3 +4354,73 @@ void check_move_unevictable_pages(struct pagevec *pvec) } } EXPORT_SYMBOL_GPL(check_move_unevictable_pages); + +#ifdef CONFIG_MEM_PURGEABLE_DEBUG +static unsigned long purgeable_node(pg_data_t *pgdata, struct scan_control *sc) +{ + struct mem_cgroup *memcg = NULL; + unsigned long nr = 0; + while (memcg = mem_cgroup_iter(NULL, memcg, NULL)) { + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdata); + + shrink_list(LRU_ACTIVE_PURGEABLE, -1, lruvec, sc); + nr += shrink_list(LRU_INACTIVE_PURGEABLE, -1, lruvec, sc); + } + + pr_info("reclaim %lu purgeable pages.\n", nr); + + return nr; +} + +static int purgeable(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .order = 0, + .priority = DEF_PRIORITY, + .may_deactivate = DEACTIVATE_ANON, + .may_writepage = 1, + .may_unmap = 1, + .may_swap = 1, + .reclaim_idx = MAX_NR_ZONES - 1, + }; + int nid = 0; + + for_each_node_state(nid, N_MEMORY) + purgeable_node(NODE_DATA(nid), &sc); + return 0; +} + +static struct ctl_table ker_tab[] = { + { + .procname = "purgeable", + .mode = 0200, + .proc_handler = purgeable, + }, + {}, +}; + +static struct ctl_table sys_tab[] = { + { + .procname = "kernel", + .mode = 0555, + .child = ker_tab, + }, + {}, +}; + +static struct ctl_table_header *purgeable_header; + +static void __init purgeable_debugfs_init(void) +{ + purgeable_header = register_sysctl_table(sys_tab); + if (!purgeable_header) + pr_err("register purgeable sysctl table failed.\n"); +} + +static void __exit purgeable_debugfs_exit(void) +{ + unregister_sysctl_table(purgeable_header); +} +#endif /* CONFIG_MEM_PURGEABLE_DEBUG */ diff --git a/mm/vmstat.c b/mm/vmstat.c index 5b9b46f42f40..3e89021a3f75 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1154,6 +1154,10 @@ const char * const vmstat_text[] = { "nr_zone_active_anon", "nr_zone_inactive_file", "nr_zone_active_file", +#ifdef CONFIG_MEM_PURGEABLE + "nr_zone_inactive_purgeable", + "nr_zone_active_purgeable", +#endif "nr_zone_unevictable", "nr_zone_write_pending", "nr_mlock", @@ -1182,6 +1186,10 @@ const char * const vmstat_text[] = { "nr_active_anon", "nr_inactive_file", "nr_active_file", +#ifdef CONFIG_MEM_PURGEABLE + "nr_inactive_purgeable", + "nr_active_purgeable", +#endif "nr_unevictable", "nr_slab_reclaimable", "nr_slab_unreclaimable", -- Gitee From 43a939629cf2ba6f0e8139d9659d96165fc5812f Mon Sep 17 00:00:00 2001 From: Chengke Wang Date: Tue, 14 Jun 2022 19:11:51 +0800 Subject: [PATCH 2/2] fixed d225fe8 from https://gitee.com/eliotc/kernel_linux_5.10/pulls/1 hpinit bugfix --- drivers/hyperhold/hp_core.c | 27 
+++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/drivers/hyperhold/hp_core.c b/drivers/hyperhold/hp_core.c index a55ee05f3fe6..e37301725b3f 100644 --- a/drivers/hyperhold/hp_core.c +++ b/drivers/hyperhold/hp_core.c @@ -71,10 +71,8 @@ void hyperhold_disable(bool force) goto out; hyperhold.inited = false; wait_for_iotab_empty(); - if (hyperhold.read_wq) - destroy_workqueue(hyperhold.read_wq); - if (hyperhold.write_wq) - destroy_workqueue(hyperhold.write_wq); + destroy_workqueue(hyperhold.read_wq); + destroy_workqueue(hyperhold.write_wq); deinit_space(&hyperhold.spc); crypto_deinit(&hyperhold.dev); unbind_bdev(&hyperhold.dev); @@ -98,27 +96,28 @@ void hyperhold_enable(void) if (hyperhold.inited) goto unlock; if (!bind_bdev(&hyperhold.dev, hyperhold.device_name)) - goto err; + goto err1; if (!crypto_init(&hyperhold.dev, hyperhold.enable_soft_crypt)) - goto err; + goto err2; if (!init_space(&hyperhold.spc, hyperhold.dev.dev_size, hyperhold.extent_size)) - goto err; + goto err3; hyperhold.read_wq = alloc_workqueue("hyperhold_read", WQ_HIGHPRI | WQ_UNBOUND, 0); if (!hyperhold.read_wq) - goto err; + goto err4; hyperhold.write_wq = alloc_workqueue("hyperhold_write", 0, 0); if (!hyperhold.write_wq) - goto err; + goto err5; hyperhold.inited = true; goto unlock; -err: - if (hyperhold.read_wq) - destroy_workqueue(hyperhold.read_wq); - if (hyperhold.write_wq) - destroy_workqueue(hyperhold.write_wq); +err5: + destroy_workqueue(hyperhold.read_wq); +err4: deinit_space(&hyperhold.spc); +err3: crypto_deinit(&hyperhold.dev); +err2: unbind_bdev(&hyperhold.dev); +err1: enable = false; unlock: mutex_unlock(&hyperhold.init_lock); -- Gitee
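
Editor's note, not part of either patch: patch 1 exposes purgeable anonymous memory to userspace through two new mmap flags, MAP_PURGEABLE (0x40) and MAP_USEREXPTE (0x80), added to include/uapi/asm-generic/mman-common.h above. The following is a minimal illustrative sketch of how a process might request a purgeable mapping on a kernel built with CONFIG_MEM_PURGEABLE; the fallback #define and the assumption that the flag is absent from libc headers are mine, and how userspace later detects that the kernel purged the pages (via the MAP_USEREXPTE extended-PTE mapping) is outside this sketch.

/*
 * Sketch only: request an anonymous purgeable region with the
 * MAP_PURGEABLE flag introduced by patch 1. The flag value 0x40 is
 * taken from the uapi hunk above; availability on any given build
 * is an assumption.
 */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MAP_PURGEABLE
#define MAP_PURGEABLE 0x40   /* value from asm-generic/mman-common.h in patch 1 */
#endif

int main(void)
{
	size_t len = 16 * 4096;

	/*
	 * Anonymous private mapping marked purgeable: under memory pressure
	 * the kernel may discard these pages outright (see the VM_PURGEABLE
	 * handling added to try_to_unmap_one/shrink_page_list) instead of
	 * swapping them.
	 */
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS | MAP_PURGEABLE, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");   /* kernel without the feature may reject or ignore the flag */
		return 1;
	}

	memset(buf, 0, len);      /* faulting the pages in places them on the purgeable LRU */
	munmap(buf, len);
	return 0;
}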