From 75097761595f8783523dbd3413dae07a54763414 Mon Sep 17 00:00:00 2001 From: Chengke Wang Date: Tue, 31 May 2022 19:42:30 +0800 Subject: [PATCH 1/2] add purgeable memory kernel surpport Signed-off-by: Chengke Wang Signed-off-by: lijiawei --- fs/proc/meminfo.c | 17 +- include/linux/mm.h | 12 ++ include/linux/mm_inline.h | 5 + include/linux/mm_purgeable.h | 54 ++++++ include/linux/mm_types.h | 4 + include/linux/mman.h | 4 + include/linux/mmzone.h | 21 +- include/linux/page-flags.h | 8 + include/trace/events/mmflags.h | 7 + include/uapi/asm-generic/mman-common.h | 2 + kernel/fork.c | 3 + mm/Kconfig | 14 ++ mm/Makefile | 1 + mm/memory.c | 22 ++- mm/mmap.c | 6 +- mm/purgeable.c | 253 +++++++++++++++++++++++++ mm/rmap.c | 27 ++- mm/vmscan.c | 94 ++++++++- mm/vmstat.c | 8 + 19 files changed, 550 insertions(+), 12 deletions(-) create mode 100644 include/linux/mm_purgeable.h create mode 100644 mm/purgeable.c diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 248e0afeac94..0ebced4b78c6 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -38,6 +38,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) unsigned long pages[NR_LRU_LISTS]; unsigned long sreclaimable, sunreclaim; int lru; + unsigned long nr_purgeable_active = 0; + unsigned long nr_purgeable_inactive = 0; si_meminfo(&i); si_swapinfo(&i); @@ -51,6 +53,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v) for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) pages[lru] = global_node_page_state(NR_LRU_BASE + lru); +#ifdef CONFIG_MEM_PURGEABLE + nr_purgeable_active = pages[LRU_ACTIVE_PURGEABLE]; + nr_purgeable_inactive = pages[LRU_INACTIVE_PURGEABLE]; +#endif + available = si_mem_available(); sreclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B); sunreclaim = global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B); @@ -62,13 +69,19 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "Cached: ", cached); show_val_kb(m, "SwapCached: ", total_swapcache_pages()); show_val_kb(m, "Active: ", pages[LRU_ACTIVE_ANON] + - pages[LRU_ACTIVE_FILE]); + pages[LRU_ACTIVE_FILE] + + nr_purgeable_active); show_val_kb(m, "Inactive: ", pages[LRU_INACTIVE_ANON] + - pages[LRU_INACTIVE_FILE]); + pages[LRU_INACTIVE_FILE] + + nr_purgeable_inactive); show_val_kb(m, "Active(anon): ", pages[LRU_ACTIVE_ANON]); show_val_kb(m, "Inactive(anon): ", pages[LRU_INACTIVE_ANON]); show_val_kb(m, "Active(file): ", pages[LRU_ACTIVE_FILE]); show_val_kb(m, "Inactive(file): ", pages[LRU_INACTIVE_FILE]); +#ifdef CONFIG_MEM_PURGEABLE + show_val_kb(m, "Active(purgeable): ", nr_purgeable_active); + show_val_kb(m, "Inactive(purgeable): ", nr_purgeable_inactive); +#endif show_val_kb(m, "Unevictable: ", pages[LRU_UNEVICTABLE]); show_val_kb(m, "Mlocked: ", global_zone_page_state(NR_MLOCK)); diff --git a/include/linux/mm.h b/include/linux/mm.h index 01d012727a27..2a0e86ed0def 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -304,13 +304,25 @@ extern unsigned int kobjsize(const void *objp); #define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */ +#define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */ +#define VM_HIGH_ARCH_BIT_6 38 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0) #define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1) #define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2) #define VM_HIGH_ARCH_3 
BIT(VM_HIGH_ARCH_BIT_3) #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4) +#define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5) +#define VM_HIGH_ARCH_6 BIT(VM_HIGH_ARCH_BIT_6) #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ +#ifdef CONFIG_MEM_PURGEABLE +#define VM_PURGEABLE VM_HIGH_ARCH_5 +#define VM_USEREXPTE VM_HIGH_ARCH_6 +#else /* CONFIG_MEM_PURGEABLE */ +#define VM_PURGEABLE 0 +#define VM_USEREXPTE 0 +#endif /* CONFIG_MEM_PURGEABLE */ + #ifdef CONFIG_ARCH_HAS_PKEYS # define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0 # define VM_PKEY_BIT0 VM_HIGH_ARCH_0 /* A protection key is a 4-bit value */ diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 8939659fc32f..e7c1d78ce243 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -77,6 +77,11 @@ static __always_inline void del_page_from_lru_list(struct page *page, */ static inline enum lru_list page_lru_base_type(struct page *page) { +#ifdef CONFIG_MEM_PURGEABLE + if (PagePurgeable(page)) + return LRU_INACTIVE_PURGEABLE; +#endif + if (page_is_file_lru(page)) return LRU_INACTIVE_FILE; return LRU_INACTIVE_ANON; diff --git a/include/linux/mm_purgeable.h b/include/linux/mm_purgeable.h new file mode 100644 index 000000000000..c7556d256305 --- /dev/null +++ b/include/linux/mm_purgeable.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. + */ + +#ifndef __MM_PURGEABLE_MEM_H +#define __MM_PURGEABLE_MEM_H + +#ifdef CONFIG_MEM_PURGEABLE + +void mm_init_uxpgd(struct mm_struct *mm); +void mm_clear_uxpgd(struct mm_struct *mm); +int lock_uxpte(struct vm_area_struct *vma, unsigned long addr); +void unlock_uxpte(struct vm_area_struct *vma, unsigned long addr); +vm_fault_t do_uxpte_page_fault(struct vm_fault *vmf, struct page *page, + pte_t *entry); +int uxpte_set_present(struct vm_area_struct *vma, unsigned long addr); +int uxpte_clear_present(struct vm_area_struct *vma, unsigned long addr); + +#else /* CONFIG_MEM_PURGEABLE */ + +static inline void mm_init_uxpgd(struct mm_struct *mm) {} + +static inline void mm_clear_uxpgd(struct mm_struct *mm) {} + +static inline int lock_uxpte(struct vm_area_struct *vma, + unsigned long addr) +{ + return 0; +} + +static inline void unlock_uxpte(struct vm_area_struct *vma, + unsigned long addr) {} + +static inline vm_fault_t do_uxpte_page_fault(struct vm_fault *vmf, + struct page *page, pte_t *entry) +{ + return 0; +} + +static inline int uxpte_set_present(struct vm_area_struct *vma, + unsigned long addr) +{ + return 0; +} + +static inline int uxpte_clear_present(struct vm_area_struct *vma, + unsigned long addr) +{ + return 0; +} +#endif /* CONFIG_MEM_PURGEABLE */ +#endif /* __MM_PURGEABLE_MEM_H */ + diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c0aedba56912..66efc374a9a7 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -425,6 +425,10 @@ struct mm_struct { unsigned long task_size; /* size of task vm space */ unsigned long highest_vm_end; /* highest vma end address */ pgd_t * pgd; +#ifdef CONFIG_MEM_PURGEABLE + void *uxpgd; + spinlock_t uxpgd_lock; +#endif #ifdef CONFIG_MEMBARRIER /** diff --git a/include/linux/mman.h b/include/linux/mman.h index 629cefc4ecba..946641c59135 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -154,6 +154,10 @@ calc_vm_flag_bits(unsigned long flags) _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) | _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) | _calc_vm_trans(flags, MAP_SYNC, VM_SYNC ) | +#ifdef CONFIG_MEM_PURGEABLE + _calc_vm_trans(flags, MAP_PURGEABLE, 
VM_PURGEABLE ) | + _calc_vm_trans(flags, MAP_USEREXPTE, VM_USEREXPTE ) | +#endif arch_calc_vm_flag_bits(flags); } diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index d66cecefa84f..f4d955c67b54 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -158,6 +158,10 @@ enum zone_stat_item { NR_ZONE_ACTIVE_ANON, NR_ZONE_INACTIVE_FILE, NR_ZONE_ACTIVE_FILE, +#ifdef CONFIG_MEM_PURGEABLE + NR_ZONE_INACTIVE_PURGEABLE, + NR_ZONE_ACTIVE_PURGEABLE, +#endif NR_ZONE_UNEVICTABLE, NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */ NR_MLOCK, /* mlock()ed pages found and moved off LRU */ @@ -179,6 +183,10 @@ enum node_stat_item { NR_ACTIVE_ANON, /* " " " " " */ NR_INACTIVE_FILE, /* " " " " " */ NR_ACTIVE_FILE, /* " " " " " */ +#ifdef CONFIG_MEM_PURGEABLE + NR_INACTIVE_PURGEABLE, + NR_ACTIVE_PURGEABLE, +#endif NR_UNEVICTABLE, /* " " " " " */ NR_SLAB_RECLAIMABLE_B, NR_SLAB_UNRECLAIMABLE_B, @@ -254,19 +262,26 @@ static __always_inline bool vmstat_item_in_bytes(int idx) #define LRU_BASE 0 #define LRU_ACTIVE 1 #define LRU_FILE 2 +#ifdef CONFIG_MEM_PURGEABLE +#define LRU_PURGEABLE 4 +#endif enum lru_list { LRU_INACTIVE_ANON = LRU_BASE, LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE, LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE, LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE, +#ifdef CONFIG_MEM_PURGEABLE + LRU_INACTIVE_PURGEABLE = LRU_BASE + LRU_PURGEABLE, + LRU_ACTIVE_PURGEABLE = LRU_BASE + LRU_PURGEABLE + LRU_ACTIVE, +#endif LRU_UNEVICTABLE, NR_LRU_LISTS }; #define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++) -#define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++) +#define for_each_evictable_lru(lru) for (lru = 0; lru < LRU_UNEVICTABLE; lru++) static inline bool is_file_lru(enum lru_list lru) { @@ -275,6 +290,10 @@ static inline bool is_file_lru(enum lru_list lru) static inline bool is_active_lru(enum lru_list lru) { +#ifdef CONFIG_MEM_PURGEABLE + if (lru == LRU_ACTIVE_PURGEABLE) + return true; +#endif return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE); } diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index a6446a50c39f..dcf83c01f57b 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -142,6 +142,9 @@ enum pageflags { #ifdef CONFIG_PAGE_TRACING PG_skb, PG_zspage, +#endif +#ifdef CONFIG_MEM_PURGEABLE + PG_purgeable, #endif __NR_PAGEFLAGS, @@ -461,6 +464,11 @@ PAGEFLAG(Idle, idle, PF_ANY) */ __PAGEFLAG(Reported, reported, PF_NO_COMPOUND) +#ifdef CONFIG_MEM_PURGEABLE +PAGEFLAG(Purgeable, purgeable, PF_ANY) +#else +PAGEFLAG_FALSE(Purgeable) +#endif /* * On an anonymous page mapped into a user virtual memory area, * page->mapping points to its anon_vma, not to a struct address_space; diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 67018d367b9f..2332482f7df7 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -55,6 +55,12 @@ __def_gfpflag_names \ ) : "none" +#ifdef CONFIG_MEM_PURGEABLE +#define IF_HAVE_PG_PURGEABLE(flag,string) ,{1UL << flag, string} +#else +#define IF_HAVE_PG_PURGEABLE(flag,string) +#endif + #ifdef CONFIG_MMU #define IF_HAVE_PG_MLOCK(flag,string) ,{1UL << flag, string} #else @@ -107,6 +113,7 @@ {1UL << PG_reclaim, "reclaim" }, \ {1UL << PG_swapbacked, "swapbacked" }, \ {1UL << PG_unevictable, "unevictable" } \ +IF_HAVE_PG_PURGEABLE(PG_purgeable, "purgeable" ) \ IF_HAVE_PG_MLOCK(PG_mlocked, "mlocked" ) \ IF_HAVE_PG_UNCACHED(PG_uncached, "uncached" ) \ IF_HAVE_PG_HWPOISON(PG_hwpoison, "hwpoison" ) \ diff 
--git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index f94f65d429be..8003a595a007 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -21,6 +21,8 @@ #define MAP_TYPE 0x0f /* Mask for type of mapping */ #define MAP_FIXED 0x10 /* Interpret addr exactly */ #define MAP_ANONYMOUS 0x20 /* don't use a file */ +#define MAP_PURGEABLE 0x40 +#define MAP_USEREXPTE 0x80 /* 0x0100 - 0x4000 flags are defined in asm-generic/mman.h */ #define MAP_POPULATE 0x008000 /* populate (prefault) pagetables */ diff --git a/kernel/fork.c b/kernel/fork.c index 298c44dc5e77..f331938e1a16 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -100,6 +100,7 @@ #ifdef CONFIG_RECLAIM_ACCT #include #endif +#include #include #include @@ -626,6 +627,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, static inline int mm_alloc_pgd(struct mm_struct *mm) { + mm_init_uxpgd(mm); mm->pgd = pgd_alloc(mm); if (unlikely(!mm->pgd)) return -ENOMEM; @@ -635,6 +637,7 @@ static inline int mm_alloc_pgd(struct mm_struct *mm) static inline void mm_free_pgd(struct mm_struct *mm) { pgd_free(mm, mm->pgd); + mm_clear_uxpgd(mm); } #else static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) diff --git a/mm/Kconfig b/mm/Kconfig index c35f9f8a0857..aadd14639887 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -948,4 +948,18 @@ config MEMTRACE_ASHMEM help Enable the Ashmem Process Info Show +config MEM_PURGEABLE + bool "Purgeable memory feature" + default n + select ARCH_USES_HIGH_VMA_FLAGS + help + Support purgeable pages for process + +config MEM_PURGEABLE_DEBUG + bool "Purgeable memory debug" + default n + depends on MEM_PURGEABLE + help + Debug info for purgeable memory + endmenu diff --git a/mm/Makefile b/mm/Makefile index 4c0ca82c4a64..11025e018458 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -127,3 +127,4 @@ obj-$(CONFIG_HYPERHOLD_FILE_LRU) += memcg_reclaim.o obj-$(CONFIG_HYPERHOLD_MEMCG) += memcg_control.o obj-$(CONFIG_HYPERHOLD_ZSWAPD) += zswapd.o zswapd_control.o obj-$(CONFIG_RECLAIM_ACCT) += reclaim_acct.o reclaimacct_show.o +obj-$(CONFIG_MEM_PURGEABLE) += purgeable.o diff --git a/mm/memory.c b/mm/memory.c index 4fe24cd865a7..f18948b277f5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -73,6 +73,7 @@ #include #include #include +#include #include @@ -1236,6 +1237,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, struct page *page; page = vm_normal_page(vma, addr, ptent); + if (vma->vm_flags & VM_USEREXPTE) + page = NULL; if (unlikely(details) && page) { /* * unmap_shared_mapping_pages() wants to @@ -1251,7 +1254,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, tlb_remove_tlb_entry(tlb, pte, addr); if (unlikely(!page)) continue; - + if (vma->vm_flags & VM_PURGEABLE) { + uxpte_clear_present(vma, addr); + pr_info("unmap purgeable page %lx at addr %lx of vma %lx.\n", page_to_pfn(page), addr, vma); + } if (!PageAnon(page)) { if (pte_dirty(ptent)) { force_flush = 1; @@ -3555,11 +3561,20 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) if (unlikely(pmd_trans_unstable(vmf->pmd))) return 0; + /* use extra page table for userexpte */ + if (vma->vm_flags & VM_USEREXPTE) { + ret = do_uxpte_page_fault(vmf, page, &entry); + if (ret == VM_FAULT_OOM) + goto oom; + else + goto got_page; + } /* Use the zero-page for reads */ if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm)) { entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address), vma->vm_page_prot)); +got_page: vmf->pte = 
pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (!pte_none(*vmf->pte)) { @@ -3620,6 +3635,11 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, vmf->address, false); + if (vma->vm_flags & VM_PURGEABLE) { + pr_info("set page %lx purgeable\n", page_to_pfn(page)); + SetPagePurgeable(page); + uxpte_set_present(vma, vmf->address); + } lru_cache_add_inactive_or_unevictable(page, vma); setpte: set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); diff --git a/mm/mmap.c b/mm/mmap.c index a5e0958acd05..b134946984a9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1559,14 +1559,16 @@ unsigned long do_mmap(struct file *file, unsigned long addr, /* * Ignore pgoff. */ - pgoff = 0; + if (!(flags & MAP_USEREXPTE)) + pgoff = 0; vm_flags |= VM_SHARED | VM_MAYSHARE; break; case MAP_PRIVATE: /* * Set pgoff according to addr for anon_vma. */ - pgoff = addr >> PAGE_SHIFT; + if (!(flags & MAP_USEREXPTE)) + pgoff = addr >> PAGE_SHIFT; break; default: return -EINVAL; diff --git a/mm/purgeable.c b/mm/purgeable.c new file mode 100644 index 000000000000..487781d8f472 --- /dev/null +++ b/mm/purgeable.c @@ -0,0 +1,253 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. + */ + +#include +#include +#include +#include +#include +#include + +#include + +void mm_init_uxpgd(struct mm_struct *mm) +{ + mm->uxpgd = NULL; + spin_lock_init(&mm->uxpgd_lock); +} + +void mm_clear_uxpgd(struct mm_struct *mm) +{ + struct page *page = NULL; + void **slot = NULL; + struct radix_tree_iter iter; + + spin_lock(&mm->uxpgd_lock); + if (!mm->uxpgd) + goto out; + radix_tree_for_each_slot(slot, mm->uxpgd, &iter, 0) { + page = radix_tree_lookup(mm->uxpgd, iter.index); + put_page(page); + } +out: + spin_unlock(&mm->uxpgd_lock); +} + +int lock_uxpte(struct vm_area_struct *vma, unsigned long addr) +{ + unsigned long base = addr & ~(PAGE_SIZE / sizeof(atomic64_t) * + PAGE_SIZE - 1); + unsigned long offset = (addr - base) / PAGE_SIZE; + atomic64_t *uxpte = NULL; + struct page *page = NULL; + long val = 0; + + spin_lock(&vma->vm_mm->uxpgd_lock); + if (!vma->vm_mm->uxpgd) + goto unlock; + page = radix_tree_lookup(vma->vm_mm->uxpgd, base); + if (!page) + goto unlock; + uxpte = page_to_virt(page); +retry: + val = atomic64_read(&uxpte[offset]); + if (val >> 1) + goto unlock; + if (atomic64_cmpxchg(&uxpte[offset], val, -2) != val) + goto retry; + val = -2; +unlock: + spin_unlock(&vma->vm_mm->uxpgd_lock); + pr_info("lock uxpte of addr %lx is %lx\n", addr, val); + + return (val == -2) ? 
1 : 0; +} + +void unlock_uxpte(struct vm_area_struct *vma, unsigned long addr) +{ + unsigned long base = addr & ~(PAGE_SIZE / sizeof(atomic64_t) * + PAGE_SIZE - 1); + unsigned long offset = (addr - base) / PAGE_SIZE; + atomic64_t *uxpte = NULL; + struct page *page = NULL; + + spin_lock(&vma->vm_mm->uxpgd_lock); + if (!vma->vm_mm->uxpgd) + goto unlock; + page = radix_tree_lookup(vma->vm_mm->uxpgd, base); + if (!page) + goto unlock; + uxpte = page_to_virt(page); + atomic64_set(&uxpte[offset], 0); +unlock: + spin_unlock(&vma->vm_mm->uxpgd_lock); + pr_info("unlock uxpte of addr %lx is %lx\n", addr, 0); +} + +vm_fault_t do_uxpte_page_fault(struct vm_fault *vmf, struct page *page, + pte_t *entry) +{ + struct vm_area_struct *vma = vmf->vma; + unsigned long offset = (vmf->address - vma->vm_start + vma->vm_pgoff * + PAGE_SIZE) / sizeof(atomic64_t) * PAGE_SIZE; + struct page *dup = NULL; + + if (unlikely(anon_vma_prepare(vma))) + goto oom; + if (current->mm->uxpgd) + goto lookup; + spin_lock(¤t->mm->uxpgd_lock); + if (!current->mm->uxpgd) + current->mm->uxpgd = kzalloc(sizeof(struct radix_tree_root), + GFP_KERNEL); + if (current->mm->uxpgd) + INIT_RADIX_TREE(current->mm->uxpgd, GFP_KERNEL); + spin_unlock(¤t->mm->uxpgd_lock); + if (!current->mm->uxpgd) { + pr_err("uxpgd alloc failed.\n"); + goto oom; + } +lookup: + page = radix_tree_lookup(current->mm->uxpgd, offset); + if (page) + goto make_pte; + page = alloc_zeroed_user_highpage_movable(vma, vmf->address); + if (!page) + goto oom; + if (radix_tree_preload(GFP_KERNEL)) { + put_page(page); + pr_err("radix preload fail.\n"); + goto oom; + } + spin_lock(¤t->mm->uxpgd_lock); + dup = radix_tree_lookup(current->mm->uxpgd, offset); + if (dup) { + put_page(page); + page = dup; + } else { + radix_tree_insert(current->mm->uxpgd, offset, page); + } + spin_unlock(¤t->mm->uxpgd_lock); + radix_tree_preload_end(); +make_pte: + *entry = mk_pte(page, vma->vm_page_prot); + *entry = pte_sw_mkyoung(*entry); + if (vma->vm_flags & VM_WRITE) + *entry = pte_mkwrite(pte_mkdirty(*entry)); + return 0; +oom: + return VM_FAULT_OOM; +} + +static struct page *lookup_uxpte_page(struct vm_area_struct *vma, + unsigned long addr, bool alloc) +{ + struct radix_tree_root *uxpgd = NULL; + struct page *page = NULL; + struct page *new_page = NULL; + struct mm_struct *mm = vma->vm_mm; + + if (mm->uxpgd) + goto lookup; + if (!alloc) + goto out; + spin_unlock(&mm->uxpgd_lock); + uxpgd = kzalloc(sizeof(struct radix_tree_root), GFP_KERNEL); + if (!uxpgd) { + pr_err("uxpgd alloc failed.\n"); + spin_lock(&mm->uxpgd_lock); + goto out; + } + INIT_RADIX_TREE(uxpgd, GFP_KERNEL); + spin_lock(&mm->uxpgd_lock); + if (mm->uxpgd) + kfree(uxpgd); + else + mm->uxpgd = uxpgd; +lookup: + page = radix_tree_lookup(mm->uxpgd, addr); + if (page) + goto out; + if (!alloc) + goto out; + spin_unlock(&mm->uxpgd_lock); + new_page = alloc_zeroed_user_highpage_movable(vma, addr); + if (!new_page) { + pr_err("uxpte page alloc fail.\n"); + spin_lock(&mm->uxpgd_lock); + goto out; + } + if (radix_tree_preload(GFP_KERNEL)) { + put_page(new_page); + pr_err("radix preload fail.\n"); + spin_lock(&mm->uxpgd_lock); + goto out; + } + spin_lock(&mm->uxpgd_lock); + page = radix_tree_lookup(mm->uxpgd, addr); + if (page) { + put_page(new_page); + } else { + page = new_page; + radix_tree_insert(mm->uxpgd, addr, page); + } + radix_tree_preload_end(); +out: + return page; +} + +int uxpte_set_present(struct vm_area_struct *vma, unsigned long addr) +{ + unsigned long base = addr & ~(PAGE_SIZE / sizeof(atomic64_t) * PAGE_SIZE - 1); + 
unsigned long offset = (addr - base) / PAGE_SIZE; + atomic64_t *uxpte = NULL; + struct page *page = NULL; + long val = 0; + + spin_lock(&vma->vm_mm->uxpgd_lock); + page = lookup_uxpte_page(vma, base, true); + if (!page) + goto unlock; + uxpte = page_to_virt(page); +retry: + val = atomic64_read(&uxpte[offset]); + if (val & 1) + goto unlock; + if (atomic64_cmpxchg(&uxpte[offset], val, val + 1) != val) + goto retry; + val++; +unlock: + spin_unlock(&vma->vm_mm->uxpgd_lock); + pr_info("set present uxpte of addr %lx is %lx\n", addr, val); + + return 0; +} + +int uxpte_clear_present(struct vm_area_struct *vma, unsigned long addr) +{ + unsigned long base = addr & ~(PAGE_SIZE / sizeof(atomic64_t) * PAGE_SIZE - 1); + unsigned long offset = (addr - base) / PAGE_SIZE; + atomic64_t *uxpte = NULL; + struct page *page = NULL; + long val = 0; + + spin_lock(&vma->vm_mm->uxpgd_lock); + page = lookup_uxpte_page(vma, base, false); + if (!page) + goto unlock; + uxpte = page_to_virt(page); +retry: + val = atomic64_read(&uxpte[offset]); + if (!(val & 1)) + goto unlock; + if (atomic64_cmpxchg(&uxpte[offset], val, val - 1) != val) + goto retry; + val--; +unlock: + spin_unlock(&vma->vm_mm->uxpgd_lock); + pr_info("clear present uxpte of addr %lx is %lx\n", addr, val); + + return 0; +} diff --git a/mm/rmap.c b/mm/rmap.c index cdf549f6f617..46a396a826b7 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -72,6 +72,7 @@ #include #include #include +#include #include @@ -782,6 +783,14 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma, while (page_vma_mapped_walk(&pvmw)) { address = pvmw.address; +#ifdef CONFIG_MEM_PURGEABLE + if (!(vma->vm_flags & VM_PURGEABLE)) { + pra->vm_flags &= ~VM_PURGEABLE; + pr_info("page %lx mapped to inpurgeable vma %lx.", page_to_pfn(page), vma); + } else { + pr_info("page %lx mapped to purgeable vma %lx.", page_to_pfn(page), vma); + } +#endif if (vma->vm_flags & VM_LOCKED) { page_vma_mapped_walk_done(&pvmw); pra->vm_flags |= VM_LOCKED; @@ -821,7 +830,7 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma, if (referenced) { pra->referenced++; - pra->vm_flags |= vma->vm_flags; + pra->vm_flags |= vma->vm_flags & ~VM_PURGEABLE; } if (!pra->mapcount) @@ -860,6 +869,7 @@ int page_referenced(struct page *page, struct page_referenced_arg pra = { .mapcount = total_mapcount(page), .memcg = memcg, + .vm_flags = VM_PURGEABLE, }; struct rmap_walk_control rwc = { .rmap_one = page_referenced_one, @@ -1443,6 +1453,14 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, continue; } #endif +#ifdef CONFIG_MEM_PURGEABLE + if ((vma->vm_flags & VM_PURGEABLE) && !lock_uxpte(vma, address)) { + ret = false; + page_vma_mapped_walk_done(&pvmw); + pr_info("uxpte hold purgeable page %lx\n", page_to_pfn(page)); + break; + } +#endif /* * If the page is mlock()d, we cannot swap it out. @@ -1584,7 +1602,12 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, set_pte_at(mm, address, pvmw.pte, pteval); } - } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) { + } else if ((vma->vm_flags & VM_PURGEABLE) || (pte_unused(pteval) && + !userfaultfd_armed(vma))) { + if (vma->vm_flags & VM_PURGEABLE) { + unlock_uxpte(vma, address); + pr_info("unmap purgeable page %lx\n", page_to_pfn(page)); + } /* * The guest indicated that the page content is of no * interest anymore. 
Simply discard the pte, vmscan diff --git a/mm/vmscan.c b/mm/vmscan.c index 5371b75ff477..bdb7ae170233 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -914,6 +914,7 @@ void putback_lru_page(struct page *page) enum page_references { PAGEREF_RECLAIM, PAGEREF_RECLAIM_CLEAN, + PAGEREF_RECLAIM_PURGEABLE, PAGEREF_KEEP, PAGEREF_ACTIVATE, }; @@ -935,6 +936,13 @@ static enum page_references page_check_references(struct page *page, if (vm_flags & VM_LOCKED) return PAGEREF_RECLAIM; +#ifdef CONFIG_MEM_PURGEABLE + pr_info("check page %lx references\n", page_to_pfn(page)); + if (vm_flags & VM_PURGEABLE) { + pr_info("try reclaim purgeable page %lx.\n", page_to_pfn(page)); + return PAGEREF_RECLAIM_PURGEABLE; + } +#endif if (referenced_ptes) { /* * All mapped pages start out with page table @@ -1163,6 +1171,7 @@ unsigned int shrink_page_list(struct list_head *page_list, goto keep_locked; case PAGEREF_RECLAIM: case PAGEREF_RECLAIM_CLEAN: + case PAGEREF_RECLAIM_PURGEABLE: ; /* try to reclaim the page below */ } @@ -1172,7 +1181,7 @@ unsigned int shrink_page_list(struct list_head *page_list, * Lazyfree page could be freed directly */ if (PageAnon(page) && PageSwapBacked(page)) { - if (!PageSwapCache(page)) { + if (!PageSwapCache(page) && references != PAGEREF_RECLAIM_PURGEABLE) { if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; if (page_maybe_dma_pinned(page)) @@ -1247,7 +1256,7 @@ unsigned int shrink_page_list(struct list_head *page_list, } } - if (PageDirty(page)) { + if (PageDirty(page) && references != PAGEREF_RECLAIM_PURGEABLE) { /* * Only kswapd can writeback filesystem pages * to avoid risk of stack overflow. But avoid @@ -1355,11 +1364,11 @@ unsigned int shrink_page_list(struct list_head *page_list, } } - if (PageAnon(page) && !PageSwapBacked(page)) { + if (PageAnon(page) && (!PageSwapBacked(page) || references == PAGEREF_RECLAIM_PURGEABLE)) { /* follow __remove_mapping for reference */ if (!page_ref_freeze(page, 1)) goto keep_locked; - if (PageDirty(page)) { + if (PageDirty(page) && references != PAGEREF_RECLAIM_PURGEABLE) { page_ref_unfreeze(page, 1); goto keep_locked; } @@ -4084,6 +4093,10 @@ void kswapd_stop(int nid) } } +#ifdef CONFIG_MEM_PURGEABLE_DEBUG +static void __init purgeable_debugfs_init(void); +#endif + static int __init kswapd_init(void) { int nid; @@ -4091,6 +4104,9 @@ static int __init kswapd_init(void) swap_setup(); for_each_node_state(nid, N_MEMORY) kswapd_run(nid); +#ifdef CONFIG_MEM_PURGEABLE_DEBUG + purgeable_debugfs_init(); +#endif return 0; } @@ -4339,3 +4355,73 @@ void check_move_unevictable_pages(struct pagevec *pvec) } } EXPORT_SYMBOL_GPL(check_move_unevictable_pages); + +#ifdef CONFIG_MEM_PURGEABLE_DEBUG +static unsigned long purgeable_node(pg_data_t *pgdata, struct scan_control *sc) +{ + struct mem_cgroup *memcg = NULL; + unsigned long nr = 0; + while (memcg = mem_cgroup_iter(NULL, memcg, NULL)) { + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdata); + + shrink_list(LRU_ACTIVE_PURGEABLE, -1, lruvec, sc); + nr += shrink_list(LRU_INACTIVE_PURGEABLE, -1, lruvec, sc); + } + + pr_info("reclaim %lu purgeable pages.\n", nr); + + return nr; +} + +static int purgeable(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .order = 0, + .priority = DEF_PRIORITY, + .may_deactivate = DEACTIVATE_ANON, + .may_writepage = 1, + .may_unmap = 1, + .may_swap = 1, + .reclaim_idx = MAX_NR_ZONES - 1, + }; + int nid = 0; + + for_each_node_state(nid, N_MEMORY) + purgeable_node(NODE_DATA(nid), &sc); + return 
0; +} + +static struct ctl_table ker_tab[] = { + { + .procname = "purgeable", + .mode = 0200, + .proc_handler = purgeable, + }, + {}, +}; + +static struct ctl_table sys_tab[] = { + { + .procname = "kernel", + .mode = 0555, + .child = ker_tab, + }, + {}, +}; + +static struct ctl_table_header *purgeable_header; + +static void __init purgeable_debugfs_init(void) +{ + purgeable_header = register_sysctl_table(sys_tab); + if (!purgeable_header) + pr_err("register purgeable sysctl table failed.\n"); +} + +static void __exit purgeable_debugfs_exit(void) +{ + unregister_sysctl_table(purgeable_header); +} +#endif /* CONFIG_MEM_PURGEABLE_DEBUG */ diff --git a/mm/vmstat.c b/mm/vmstat.c index 5b9b46f42f40..3e89021a3f75 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1154,6 +1154,10 @@ const char * const vmstat_text[] = { "nr_zone_active_anon", "nr_zone_inactive_file", "nr_zone_active_file", +#ifdef CONFIG_MEM_PURGEABLE + "nr_zone_inactive_purgeable", + "nr_zone_active_purgeable", +#endif "nr_zone_unevictable", "nr_zone_write_pending", "nr_mlock", @@ -1182,6 +1186,10 @@ const char * const vmstat_text[] = { "nr_active_anon", "nr_inactive_file", "nr_active_file", +#ifdef CONFIG_MEM_PURGEABLE + "nr_inactive_purgeable", + "nr_active_purgeable", +#endif "nr_unevictable", "nr_slab_reclaimable", "nr_slab_unreclaimable", -- Gitee From 279fa95b29af392934b0519ca720cebace4a2799 Mon Sep 17 00:00:00 2001 From: Chengke Wang Date: Wed, 1 Jun 2022 17:08:56 +0800 Subject: [PATCH 2/2] refactory Signed-off-by: Chengke Wang Signed-off-by: lijiawei --- include/linux/mm_purgeable.h | 26 ++-- mm/memory.c | 4 +- mm/purgeable.c | 247 +++++++++++++++++------------------ 3 files changed, 130 insertions(+), 147 deletions(-) diff --git a/include/linux/mm_purgeable.h b/include/linux/mm_purgeable.h index c7556d256305..11994d5cf59c 100644 --- a/include/linux/mm_purgeable.h +++ b/include/linux/mm_purgeable.h @@ -10,12 +10,11 @@ void mm_init_uxpgd(struct mm_struct *mm); void mm_clear_uxpgd(struct mm_struct *mm); -int lock_uxpte(struct vm_area_struct *vma, unsigned long addr); +bool lock_uxpte(struct vm_area_struct *vma, unsigned long addr); void unlock_uxpte(struct vm_area_struct *vma, unsigned long addr); -vm_fault_t do_uxpte_page_fault(struct vm_fault *vmf, struct page *page, - pte_t *entry); -int uxpte_set_present(struct vm_area_struct *vma, unsigned long addr); -int uxpte_clear_present(struct vm_area_struct *vma, unsigned long addr); +vm_fault_t do_uxpte_page_fault(struct vm_fault *vmf, pte_t *entry); +bool uxpte_set_present(struct vm_area_struct *vma, unsigned long addr); +void uxpte_clear_present(struct vm_area_struct *vma, unsigned long addr); #else /* CONFIG_MEM_PURGEABLE */ @@ -23,32 +22,29 @@ static inline void mm_init_uxpgd(struct mm_struct *mm) {} static inline void mm_clear_uxpgd(struct mm_struct *mm) {} -static inline int lock_uxpte(struct vm_area_struct *vma, +static inline bool lock_uxpte(struct vm_area_struct *vma, unsigned long addr) { - return 0; + return false; } static inline void unlock_uxpte(struct vm_area_struct *vma, unsigned long addr) {} static inline vm_fault_t do_uxpte_page_fault(struct vm_fault *vmf, - struct page *page, pte_t *entry) + pte_t *entry) { return 0; } -static inline int uxpte_set_present(struct vm_area_struct *vma, +static inline bool uxpte_set_present(struct vm_area_struct *vma, unsigned long addr) { - return 0; + return false; } -static inline int uxpte_clear_present(struct vm_area_struct *vma, - unsigned long addr) -{ - return 0; -} +static inline void uxpte_clear_present(struct 
vm_area_struct *vma, + unsigned long addr) {} #endif /* CONFIG_MEM_PURGEABLE */ #endif /* __MM_PURGEABLE_MEM_H */ diff --git a/mm/memory.c b/mm/memory.c index f18948b277f5..d5bf3ef1a9e1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3563,12 +3563,12 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) /* use extra page table for userexpte */ if (vma->vm_flags & VM_USEREXPTE) { - ret = do_uxpte_page_fault(vmf, page, &entry); - if (ret == VM_FAULT_OOM) + if (do_uxpte_page_fault(vmf, &entry)) goto oom; else goto got_page; } + /* Use the zero-page for reads */ if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm)) { diff --git a/mm/purgeable.c b/mm/purgeable.c index 487781d8f472..5bd8e5f0e383 100644 --- a/mm/purgeable.c +++ b/mm/purgeable.c @@ -12,6 +12,28 @@ #include +typedef atomic64_t uxpte_t; + +#define uxpte_read atomic64_read +#define uxpte_set atomic64_set +#define uxpte_cas(v, old, new) (atomic64_cmpxchg((v), (old), (new)) == (old)) + +#define UXPTE_SIZE_SHIFT 3 +#define UXPTE_SIZE (1 << UXPTE_SIZE_SHIFT) + +#define UXPTE_PER_PAGE_SHIFT (PAGE_SHIFT - UXPTE_SIZE_SHIFT) +#define UXPTE_PER_PAGE (1 << UXPTE_PER_PAGE_SHIFT) + +#define UXPTE_PRESENT_BIT 1 +#define UXPTE_PRESENT_MASK (1 << UXPTE_PRESENT_BIT) +#define UXPTE_REFCNT_ONE (1 << UXPTE_PRESENT_BIT) +#define UXPTE_UNDER_RECLAIM (-UXPTE_REFCNT_ONE) + +#define vpn(vaddr) ((vaddr) >> PAGE_SHIFT) +#define uxpte_pn(vaddr) (vpn(vaddr) >> UXPTE_PER_PAGE_SHIFT) +#define uxpte_off(vaddr) (vpn(vaddr) & (UXPTE_PER_PAGE - 1)) +#define uxpn2addr(uxpn) ((uxpn) << (UXPTE_PER_PAGE_SHIFT + PAGE_SHIFT)) + void mm_init_uxpgd(struct mm_struct *mm) { mm->uxpgd = NULL; @@ -28,119 +50,16 @@ void mm_clear_uxpgd(struct mm_struct *mm) if (!mm->uxpgd) goto out; radix_tree_for_each_slot(slot, mm->uxpgd, &iter, 0) { - page = radix_tree_lookup(mm->uxpgd, iter.index); + page = radix_tree_delete(mm->uxpgd, iter.index); put_page(page); } out: + kfree(mm->uxpgd); + mm->uxpgd = NULL; spin_unlock(&mm->uxpgd_lock); } -int lock_uxpte(struct vm_area_struct *vma, unsigned long addr) -{ - unsigned long base = addr & ~(PAGE_SIZE / sizeof(atomic64_t) * - PAGE_SIZE - 1); - unsigned long offset = (addr - base) / PAGE_SIZE; - atomic64_t *uxpte = NULL; - struct page *page = NULL; - long val = 0; - - spin_lock(&vma->vm_mm->uxpgd_lock); - if (!vma->vm_mm->uxpgd) - goto unlock; - page = radix_tree_lookup(vma->vm_mm->uxpgd, base); - if (!page) - goto unlock; - uxpte = page_to_virt(page); -retry: - val = atomic64_read(&uxpte[offset]); - if (val >> 1) - goto unlock; - if (atomic64_cmpxchg(&uxpte[offset], val, -2) != val) - goto retry; - val = -2; -unlock: - spin_unlock(&vma->vm_mm->uxpgd_lock); - pr_info("lock uxpte of addr %lx is %lx\n", addr, val); - - return (val == -2) ? 
1 : 0; -} - -void unlock_uxpte(struct vm_area_struct *vma, unsigned long addr) -{ - unsigned long base = addr & ~(PAGE_SIZE / sizeof(atomic64_t) * - PAGE_SIZE - 1); - unsigned long offset = (addr - base) / PAGE_SIZE; - atomic64_t *uxpte = NULL; - struct page *page = NULL; - - spin_lock(&vma->vm_mm->uxpgd_lock); - if (!vma->vm_mm->uxpgd) - goto unlock; - page = radix_tree_lookup(vma->vm_mm->uxpgd, base); - if (!page) - goto unlock; - uxpte = page_to_virt(page); - atomic64_set(&uxpte[offset], 0); -unlock: - spin_unlock(&vma->vm_mm->uxpgd_lock); - pr_info("unlock uxpte of addr %lx is %lx\n", addr, 0); -} - -vm_fault_t do_uxpte_page_fault(struct vm_fault *vmf, struct page *page, - pte_t *entry) -{ - struct vm_area_struct *vma = vmf->vma; - unsigned long offset = (vmf->address - vma->vm_start + vma->vm_pgoff * - PAGE_SIZE) / sizeof(atomic64_t) * PAGE_SIZE; - struct page *dup = NULL; - - if (unlikely(anon_vma_prepare(vma))) - goto oom; - if (current->mm->uxpgd) - goto lookup; - spin_lock(¤t->mm->uxpgd_lock); - if (!current->mm->uxpgd) - current->mm->uxpgd = kzalloc(sizeof(struct radix_tree_root), - GFP_KERNEL); - if (current->mm->uxpgd) - INIT_RADIX_TREE(current->mm->uxpgd, GFP_KERNEL); - spin_unlock(¤t->mm->uxpgd_lock); - if (!current->mm->uxpgd) { - pr_err("uxpgd alloc failed.\n"); - goto oom; - } -lookup: - page = radix_tree_lookup(current->mm->uxpgd, offset); - if (page) - goto make_pte; - page = alloc_zeroed_user_highpage_movable(vma, vmf->address); - if (!page) - goto oom; - if (radix_tree_preload(GFP_KERNEL)) { - put_page(page); - pr_err("radix preload fail.\n"); - goto oom; - } - spin_lock(¤t->mm->uxpgd_lock); - dup = radix_tree_lookup(current->mm->uxpgd, offset); - if (dup) { - put_page(page); - page = dup; - } else { - radix_tree_insert(current->mm->uxpgd, offset, page); - } - spin_unlock(¤t->mm->uxpgd_lock); - radix_tree_preload_end(); -make_pte: - *entry = mk_pte(page, vma->vm_page_prot); - *entry = pte_sw_mkyoung(*entry); - if (vma->vm_flags & VM_WRITE) - *entry = pte_mkwrite(pte_mkdirty(*entry)); - return 0; -oom: - return VM_FAULT_OOM; -} - +/* should hold uxpgd_lock before invoke */ static struct page *lookup_uxpte_page(struct vm_area_struct *vma, unsigned long addr, bool alloc) { @@ -148,6 +67,7 @@ static struct page *lookup_uxpte_page(struct vm_area_struct *vma, struct page *page = NULL; struct page *new_page = NULL; struct mm_struct *mm = vma->vm_mm; + unsigned long uxpn = uxpte_pn(addr); if (mm->uxpgd) goto lookup; @@ -167,7 +87,7 @@ static struct page *lookup_uxpte_page(struct vm_area_struct *vma, else mm->uxpgd = uxpgd; lookup: - page = radix_tree_lookup(mm->uxpgd, addr); + page = radix_tree_lookup(mm->uxpgd, uxpn); if (page) goto out; if (!alloc) @@ -186,68 +106,135 @@ static struct page *lookup_uxpte_page(struct vm_area_struct *vma, goto out; } spin_lock(&mm->uxpgd_lock); - page = radix_tree_lookup(mm->uxpgd, addr); + page = radix_tree_lookup(mm->uxpgd, uxpn); if (page) { put_page(new_page); } else { page = new_page; - radix_tree_insert(mm->uxpgd, addr, page); + radix_tree_insert(mm->uxpgd, uxpn, page); } radix_tree_preload_end(); out: return page; } -int uxpte_set_present(struct vm_area_struct *vma, unsigned long addr) +/* should hold uxpgd_lock before invoke */ +static uxpte_t *lookup_uxpte(struct vm_area_struct *vma, + unsigned long addr, bool alloc) { - unsigned long base = addr & ~(PAGE_SIZE / sizeof(atomic64_t) * PAGE_SIZE - 1); - unsigned long offset = (addr - base) / PAGE_SIZE; - atomic64_t *uxpte = NULL; + uxpte_t *uxpte = NULL; struct page *page = NULL; + + page 
= lookup_uxpte_page(vma, addr, alloc); + if (!page) + return NULL; + uxpte = page_to_virt(page); + + return uxpte + uxpte_off(addr); +} + +bool lock_uxpte(struct vm_area_struct *vma, unsigned long addr) +{ + uxpte_t *uxpte = NULL; long val = 0; spin_lock(&vma->vm_mm->uxpgd_lock); - page = lookup_uxpte_page(vma, base, true); - if (!page) + uxpte = lookup_uxpte(vma, addr, true); + if (!uxpte) + goto unlock; +retry: + val = uxpte_read(uxpte); + if (val >> 1) + goto unlock; + if (!uxpte_cas(uxpte, val, UXPTE_UNDER_RECLAIM)) + goto retry; + val = UXPTE_UNDER_RECLAIM; +unlock: + spin_unlock(&vma->vm_mm->uxpgd_lock); + pr_info("lock uxpte of addr %lx is %lx\n", addr, val); + + return val == UXPTE_UNDER_RECLAIM; +} + +void unlock_uxpte(struct vm_area_struct *vma, unsigned long addr) +{ + uxpte_t *uxpte = NULL; + + spin_lock(&vma->vm_mm->uxpgd_lock); + uxpte = lookup_uxpte(vma, addr, false); + if (!uxpte) + goto unlock; + uxpte_set(uxpte, 0); +unlock: + spin_unlock(&vma->vm_mm->uxpgd_lock); + pr_info("unlock uxpte of addr %lx is %lx\n", addr, 0L); +} + +bool uxpte_set_present(struct vm_area_struct *vma, unsigned long addr) +{ + uxpte_t *uxpte = NULL; + long val = 0; + + spin_lock(&vma->vm_mm->uxpgd_lock); + uxpte = lookup_uxpte(vma, addr, true); + if (!uxpte) goto unlock; - uxpte = page_to_virt(page); retry: - val = atomic64_read(&uxpte[offset]); + val = uxpte_read(uxpte); if (val & 1) goto unlock; - if (atomic64_cmpxchg(&uxpte[offset], val, val + 1) != val) + if (!uxpte_cas(uxpte, val, val + 1)) goto retry; val++; unlock: spin_unlock(&vma->vm_mm->uxpgd_lock); pr_info("set present uxpte of addr %lx is %lx\n", addr, val); - return 0; + return val & 1; } -int uxpte_clear_present(struct vm_area_struct *vma, unsigned long addr) +void uxpte_clear_present(struct vm_area_struct *vma, unsigned long addr) { - unsigned long base = addr & ~(PAGE_SIZE / sizeof(atomic64_t) * PAGE_SIZE - 1); - unsigned long offset = (addr - base) / PAGE_SIZE; - atomic64_t *uxpte = NULL; - struct page *page = NULL; + uxpte_t *uxpte = NULL; long val = 0; spin_lock(&vma->vm_mm->uxpgd_lock); - page = lookup_uxpte_page(vma, base, false); - if (!page) + uxpte = lookup_uxpte(vma, addr, false); + if (!uxpte) goto unlock; - uxpte = page_to_virt(page); retry: - val = atomic64_read(&uxpte[offset]); + val = uxpte_read(uxpte); if (!(val & 1)) goto unlock; - if (atomic64_cmpxchg(&uxpte[offset], val, val - 1) != val) + if (!uxpte_cas(uxpte, val, val - 1)) goto retry; val--; unlock: spin_unlock(&vma->vm_mm->uxpgd_lock); pr_info("clear present uxpte of addr %lx is %lx\n", addr, val); +} +vm_fault_t do_uxpte_page_fault(struct vm_fault *vmf, pte_t *entry) +{ + struct vm_area_struct *vma = vmf->vma; + unsigned long vma_uxpn = vma->vm_pgoff; + unsigned long off_uxpn = vpn(vmf->address - vma->vm_start); + unsigned long addr = uxpn2addr(vma_uxpn + off_uxpn); + struct page *page = NULL; + + if (unlikely(anon_vma_prepare(vma))) + return VM_FAULT_OOM; + + spin_lock(&vma->vm_mm->uxpgd_lock); + page = lookup_uxpte_page(vma, addr, true); + spin_unlock(&vma->vm_mm->uxpgd_lock); + + if (!page) + return VM_FAULT_OOM; + + *entry = mk_pte(page, vma->vm_page_prot); + *entry = pte_sw_mkyoung(*entry); + if (vma->vm_flags & VM_WRITE) + *entry = pte_mkwrite(pte_mkdirty(*entry)); return 0; } -- Gitee
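
The series defines MAP_PURGEABLE (0x40) and MAP_USEREXPTE (0x80) in mman-common.h but ships no userspace consumer. The sketch below shows roughly how a client could exercise the two flags; the fallback #defines, the buffer size, and the assumption that file offset 0 of a MAP_USEREXPTE window exposes the first uxpte page are illustrative only and not part of the patch.

/*
 * Minimal userspace sketch for the new mmap flags.  The flag values are
 * taken from the mman-common.h hunk above; everything else (sizes,
 * offset choice, error handling) is an assumption for illustration.
 */
#include <stdio.h>
#include <sys/mman.h>

#ifndef MAP_PURGEABLE
#define MAP_PURGEABLE	0x40	/* from include/uapi/asm-generic/mman-common.h */
#endif
#ifndef MAP_USEREXPTE
#define MAP_USEREXPTE	0x80
#endif

int main(void)
{
	size_t len = 64 * 4096;

	/* Anonymous purgeable mapping: its pages get VM_PURGEABLE, are
	 * marked PG_purgeable in do_anonymous_page(), and sit on the new
	 * purgeable LRU lists, so vmscan may discard them instead of
	 * swapping them out. */
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS | MAP_PURGEABLE, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap(MAP_PURGEABLE)");
		return 1;
	}

	/* A MAP_USEREXPTE mapping exposes the per-page uxpte words (one
	 * 64-bit counter per purgeable page) so userspace can see whether
	 * a page is still present and pin it against reclaim.  The
	 * pgoff-based addressing follows do_uxpte_page_fault(); treating
	 * offset 0 as the first uxpte page is an assumption here. */
	void *uxpte = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS | MAP_USEREXPTE, -1, 0);
	if (uxpte == MAP_FAILED) {
		perror("mmap(MAP_USEREXPTE)");
	} else {
		/* Touching a slot faults in the backing uxpte page via
		 * do_uxpte_page_fault(); slot 0 is the first page covered
		 * by this window. */
		long long first = *(volatile long long *)uxpte;
		printf("uxpte[0] = %lld\n", first);
	}

	munmap(buf, len);
	return 0;
}

Because vmscan drops unpinned purgeable pages outright (try_to_unmap_one() treats them like lazy-free pages and clears the uxpte via unlock_uxpte()), a client has to re-check the present bit in the uxpte window before trusting the data again.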
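
Patch 2/2 packs each user-extended PTE into one 64-bit word: bit 0 is the present bit maintained by uxpte_set_present()/uxpte_clear_present(), the remaining bits form a signed pin count in steps of UXPTE_REFCNT_ONE (2), and lock_uxpte() parks the value at UXPTE_UNDER_RECLAIM (-2) while vmscan discards the page. One uxpte page therefore covers UXPTE_PER_PAGE data pages. The standalone arithmetic below restates that layout with a worked example; PAGE_SHIFT = 12 and the sample address are assumptions, the shifts mirror the macros in mm/purgeable.c.

/*
 * Worked example of the uxpte layout from patch 2/2.  The shift values
 * mirror mm/purgeable.c; the sample address and the decoding of a raw
 * entry as "present bit + pin count" are restated for illustration.
 */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT		12		/* assumes 4 KiB pages */
#define UXPTE_SIZE_SHIFT	3		/* 8-byte entries */
#define UXPTE_PER_PAGE_SHIFT	(PAGE_SHIFT - UXPTE_SIZE_SHIFT)
#define UXPTE_PER_PAGE		(1UL << UXPTE_PER_PAGE_SHIFT)	/* 512 */

#define vpn(vaddr)	((vaddr) >> PAGE_SHIFT)
#define uxpte_pn(vaddr)	(vpn(vaddr) >> UXPTE_PER_PAGE_SHIFT)
#define uxpte_off(vaddr) (vpn(vaddr) & (UXPTE_PER_PAGE - 1))

int main(void)
{
	uint64_t vaddr = 0x7f1234560000ULL;	/* arbitrary sample address */

	/* One uxpte page covers 512 data pages, i.e. 2 MiB of purgeable
	 * virtual address space with 4 KiB pages. */
	printf("addr %#llx -> uxpte page %llu, slot %llu\n",
	       (unsigned long long)vaddr,
	       (unsigned long long)uxpte_pn(vaddr),
	       (unsigned long long)uxpte_off(vaddr));

	/* Decoding a raw entry: bit 0 is the present bit; val >> 1 is the
	 * pin count that lock_uxpte() requires to be zero before it may
	 * install UXPTE_UNDER_RECLAIM (-2) and discard the page. */
	int64_t val = 3;			/* present, pinned once */
	printf("val %lld: present=%lld pins=%lld\n",
	       (long long)val, (long long)(val & 1), (long long)(val >> 1));
	return 0;
}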
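
With CONFIG_MEM_PURGEABLE_DEBUG enabled, kswapd_init() registers a write-only /proc/sys/kernel/purgeable entry whose handler walks every memcg on every N_MEMORY node and shrinks both purgeable LRU lists. A minimal trigger, assuming that sysctl is present, looks like the following; the handler does not parse the written data, so the byte written is arbitrary.

/*
 * Sketch of poking the CONFIG_MEM_PURGEABLE_DEBUG reclaim trigger.  The
 * sysctl path and its 0200 (write-only) mode come from the vmscan.c
 * hunk; the written value is not interpreted by the handler.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/proc/sys/kernel/purgeable", O_WRONLY);

	if (fd < 0) {
		perror("open /proc/sys/kernel/purgeable");
		return 1;
	}
	/* Any write reaches the purgeable() handler, which shrinks the
	 * active and inactive purgeable LRU lists on every node and
	 * reports the reclaimed count via pr_info in dmesg. */
	if (write(fd, "1", 1) < 0)
		perror("write");
	close(fd);
	return 0;
}

From a root shell, echo 1 > /proc/sys/kernel/purgeable does the same; since registration happens in kswapd_init(), the entry only exists once kswapd has been set up.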