diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index daf74a119adf443bbf2798231da6ca95507358d4..b5cafc097bd13eacca928b25258e30f95c68578c 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -1228,6 +1228,7 @@ CONFIG_LRU_GEN=y CONFIG_ARM64_HAFT=y CONFIG_ARCH_SUPPORTS_PER_VMA_LOCK=y CONFIG_PER_VMA_LOCK=y +CONFIG_GMEM=y CONFIG_LOCK_MM_AND_FIND_VMA=y CONFIG_IOMMU_MM_DATA=y # CONFIG_ASCEND_FEATURES is not set @@ -7008,6 +7009,13 @@ CONFIG_CPU_INSPECTOR_ATF=m CONFIG_ROH=m CONFIG_ROH_HNS=m CONFIG_ARM_SPE_MEM_SAMPLING=y + +# +# remote pager device +# +CONFIG_REMOTE_PAGER=m +CONFIG_REMOTE_PAGER_MASTER=m +# end of remote pager device # end of Device Drivers # diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index b46394fa0f144d11b6497811dab88badcefa7bde..15146955834ac50ca745516486b0ffaf3e38a840 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -1202,6 +1202,7 @@ CONFIG_LRU_GEN=y # CONFIG_LRU_GEN_STATS is not set CONFIG_ARCH_SUPPORTS_PER_VMA_LOCK=y CONFIG_PER_VMA_LOCK=y +CONFIG_GMEM=y CONFIG_LOCK_MM_AND_FIND_VMA=y CONFIG_IOMMU_MM_DATA=y CONFIG_PAGE_CACHE_LIMIT=y @@ -8202,6 +8203,13 @@ CONFIG_INTEL_TH_PTI=m # # CONFIG_CPU_INSPECT is not set # end of CPU Inspect + +# +# remote pager device +# +CONFIG_REMOTE_PAGER=m +CONFIG_REMOTE_PAGER_MASTER=m +# end of remote pager device # end of Device Drivers # diff --git a/drivers/base/node.c b/drivers/base/node.c index 4d588f4658c85cc1471da691fecbe744811812b4..b9e095cf349822c6ddb97271d2b32fd1a227fd36 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -931,6 +931,9 @@ static struct node_attr node_state_attr[] = { [N_CPU] = _NODE_ATTR(has_cpu, N_CPU), [N_GENERIC_INITIATOR] = _NODE_ATTR(has_generic_initiator, N_GENERIC_INITIATOR), +#ifdef CONFIG_GMEM + [N_HETEROGENEOUS] = _NODE_ATTR(has_hetero_memory, N_HETEROGENEOUS), +#endif }; static struct attribute *node_state_attrs[] = { @@ -943,6 +946,9 @@ static struct attribute *node_state_attrs[] = { &node_state_attr[N_MEMORY].attr.attr, &node_state_attr[N_CPU].attr.attr, &node_state_attr[N_GENERIC_INITIATOR].attr.attr, +#ifdef CONFIG_GMEM + &node_state_attr[N_HETEROGENEOUS].attr.attr, +#endif NULL }; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 46b4c39a12dbcee7d15eb72a05294b3051502151..5f197d64e4611a694880ea0d8608e19d9550b7c7 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -698,6 +698,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR [ilog2(VM_UFFD_MINOR)] = "ui", #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ +#ifdef CONFIG_GMEM + [ilog2(VM_PEER_SHARED)] = "ps", +#endif #ifdef CONFIG_X86_USER_SHADOW_STACK [ilog2(VM_SHADOW_STACK)] = "ss", #endif diff --git a/include/linux/device.h b/include/linux/device.h index 92176316a16cdd5727ae5404d03014dcb9bc7f71..2ba9458c6b1298ec338cd02f2328431f2571e993 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -806,7 +806,11 @@ struct device { bool dma_ops_bypass : 1; #endif +#ifdef CONFIG_GMEM + KABI_USE(1, void *gm_dev) +#else KABI_RESERVE(1) +#endif KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4) diff --git a/include/linux/gmem.h b/include/linux/gmem.h new file mode 100644 index 0000000000000000000000000000000000000000..7beebc67c398f341696bc59a2ceda2d54bead41c --- /dev/null +++ b/include/linux/gmem.h @@ -0,0 +1,349 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Generalized 
Memory Management. + * + * Copyright (C) 2023- Huawei, Inc. + * Author: Weixi Zhu + * + */ +#ifndef _GMEM_H +#define _GMEM_H + +#include + +struct hnode; + +/* + * enum gm_ret - The return value of GMEM KPI that can be used to tell + * the core VM or peripheral driver whether the GMEM KPI was + * executed successfully. + * + * @GM_RET_SUCCESS: The invoked GMEM KPI behaved as expected. + * @GM_RET_FAILURE_UNKNOWN: The GMEM KPI failed with unknown reason. + * Any external status related to this KPI invocation changes must be rolled back. + */ +enum gm_ret { + GM_RET_SUCCESS = 0, + GM_RET_NOMEM, + GM_RET_PAGE_EXIST, + GM_RET_DMA_ERROR, + GM_RET_MIGRATING, + GM_RET_FAILURE_UNKNOWN, + GM_RET_UNIMPLEMENTED, +}; + +/* + * Defines a contiguous range of virtual addresses inside a struct gm_as + * As an analogy, this is conceptually similar as virtual_address_struct + */ +struct gm_region { + unsigned long start_va; + unsigned long end_va; + struct rb_node node; + struct gm_as *as; /* The address space that it belongs to */ + + /* Do we need another list_node to maintain a tailQ of allocated VMAs inside a gm_as? */ + struct list_head mapping_set_link; + + void (*callback_op)(void *args); + void *cb_args; +}; + +/* This holds a list of regions that must not be concurrently manipulated. */ +struct gm_mapping_set { + unsigned int region_cnt; + struct list_head gm_region_list; +}; + +/** + * enum gm_mmu_mode - defines the method to share a physical page table. + * + * @GM_MMU_MODE_SHARE: Literally share a physical page table with another + * attached device's MMU. Nothing is guaranteed about the allocated address. + * @GM_MMU_MODE_COHERENT_EXCLUSIVE: Maintain a coherent page table that holds + * exclusive mapping entries, so that device memory accesses can trigger fault-driven + * migration for automatic data locality optimizations. + * @GM_MMU_MODE_REPLICATE: Maintain a coherent page table that replicates physical + * mapping entries whenever a physical mapping is installed inside the address space, so + * that it may minimize the page faults to be triggered by this device. + */ +enum gm_mmu_mode { + GM_MMU_MODE_SHARE, + GM_MMU_MODE_COHERENT_EXCLUSIVE, + GM_MMU_MODE_REPLICATE, +}; + +/* + * This is the parameter list of peer_map/unmap mmu operations. + * if device should copy data to/from host, set copy and dma_addr + */ +struct gm_fault_t { + struct mm_struct *mm; + struct gm_dev *dev; + unsigned long va; + unsigned long size; + unsigned long prot; + bool copy; + dma_addr_t dma_addr; + int behavior; +}; + +struct gm_memcpy_t { + struct mm_struct *mm; + struct gm_dev *dev; + unsigned long src; + unsigned long dest; + dma_addr_t dma_addr; + size_t size; +}; + +/** + * + * This struct defines a series of MMU functions registered by a peripheral + * device that is to be invoked by GMEM. + * + * pmap is an opaque pointer that identifies a physical page table of a device. + * A physical page table holds the physical mappings that can be interpreted by + * the hardware MMU. + */ +struct gm_mmu { + /* + * Each bit indicates a supported page size for page-based TLB. + * Currently we do not consider range TLBs. + */ + unsigned long pgsize_bitmap; + + /* + * cookie identifies the type of the MMU. If two gm_mmu shares the same cookie, + * then it means their page table formats are compatible. + * In that case, they can share the same void *pmap as the input arg. 
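As an illustration of how a peer driver is expected to fill in this callback table before handing it to gm_dev_create(), a minimal sketch follows; the my_* helpers and the cookie value are invented for the example, and only the field names and signatures come from this header:

#include <linux/gmem.h>
#include <linux/huge_mm.h>

#define MY_MMU_COOKIE	0x474d454dUL	/* arbitrary format tag shared by compatible MMUs */

static enum gm_ret my_pmap_create(struct gm_dev *dev, void **pmap)
{
	*pmap = NULL;	/* allocate the device page-table root here */
	return GM_RET_SUCCESS;
}

static enum gm_ret my_peer_map(struct gm_fault_t *gmf)
{
	/* install a device mapping for [gmf->va, gmf->va + gmf->size) */
	return GM_RET_SUCCESS;
}

static struct gm_mmu my_mmu = {
	.pgsize_bitmap	= HPAGE_PMD_SIZE,	/* 2MB only, matching gmem's fault granularity */
	.cookie		= MY_MMU_COOKIE,
	.pmap_create	= my_pmap_create,
	.peer_map	= my_peer_map,
	/* peer_unmap, peer_hmemcpy, tlb_invl, ... are filled in the same way */
};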
+ */ + unsigned long cookie; + + /* Synchronize VMA in a peer OS to interact with the host OS */ + enum gm_ret (*peer_va_alloc_fixed)(struct gm_fault_t *gmf); + enum gm_ret (*peer_va_free)(struct gm_fault_t *gmf); + + /* Create physical mappings on peer host. + * If copy is set, copy data [dma_addr, dma_addr + size] to peer host + */ + enum gm_ret (*peer_map)(struct gm_fault_t *gmf); + /* + * Destroy physical mappings on peer host. + * If copy is set, copy data back to [dma_addr, dma_addr + size] + */ + enum gm_ret (*peer_unmap)(struct gm_fault_t *gmf); + + /* Create or destroy a device's physical page table. */ + enum gm_ret (*pmap_create)(struct gm_dev *dev, void **pmap); + enum gm_ret (*pmap_destroy)(void *pmap); + + /* Create or destroy a physical mapping of a created physical page table */ + enum gm_ret (*pmap_enter)(void *pmap, unsigned long va, unsigned long size, + unsigned long pa, unsigned long prot); + enum gm_ret (*pmap_release)(void *pmap, unsigned long va, unsigned long size); + + /* Change the protection of a virtual page */ + enum gm_ret (*pmap_protect)(void *pmap, unsigned long va, unsigned long size, + unsigned long new_prot); + + /* Invalidation functions of the MMU TLB */ + enum gm_ret (*tlb_invl)(void *pmap, unsigned long va, unsigned long size); + enum gm_ret (*tlb_invl_coalesced)(void *pmap, struct list_head *mappings); + + // copy one area of memory from device to host or from host to device + enum gm_ret (*peer_hmemcpy)(struct gm_memcpy_t *gmc); +}; + +/** + * unsigned long defines a composable flag to describe the capabilities of a device. + * + * @GM_DEV_CAP_REPLAYABLE: Memory accesses can be replayed to recover page faults. + * @GM_DEV_CAP_PEER: The device has its own VMA/PA management, controlled by another peer OS + */ +#define GM_DEV_CAP_REPLAYABLE 0x00000001 +#define GM_DEV_CAP_PEER 0x00000010 + +#define gm_dev_is_peer(dev) (((dev)->capability & GM_DEV_CAP_PEER) != 0) + +struct gm_context { + struct gm_as *as; + struct gm_dev *dev; + void *pmap; + /* + * consider a better container to maintain multiple ctx inside a device or multiple ctx + * inside a va space. + * A device may simultaneously have multiple contexts for time-sliced ctx switching + */ + struct list_head gm_dev_link; + + /* A va space may have multiple gm_context */ + struct list_head gm_as_link; +}; +#define get_gm_context(head) (list_entry((head)->prev, struct gm_context, ctx_link)) + +struct gm_dev { + int id; + + /* identifies the device capability + * For example, whether the device supports page faults or whether it has its + * own OS that manages the VA and PA resources. + */ + unsigned long capability; + struct gm_mmu *mmu; + void *dev_data; + /* + * TODO: Use a better container of struct gm_context to support time-sliced context switch. + * A collection of device contexts. If the device does not support time-sliced context + * switch, then the size of the collection should never be greater than one. + * We need to think about what operators should the container be optimized for. + * A list, a radix-tree or what? What would gm_dev_activate require? + * Are there any accelerators that are really going to support time-sliced context switch? + */ + struct gm_context *current_ctx; + + struct list_head gm_ctx_list; + + /* Add tracking of registered device local physical memory. 
*/ + nodemask_t registered_hnodes; + struct device *dma_dev; + + struct gm_mapping *gm_mapping; +}; + +#define GM_PAGE_DIRTY 0x8 /* Whether the page is dirty */ +#define GM_PAGE_CPU 0x10 /* Determines whether page is a pointer or a pfn number. */ +#define GM_PAGE_DEVICE 0x20 +#define GM_PAGE_NOMAP 0x40 +#define GM_PAGE_PINNED 0x80 +#define GM_PAGE_WILLNEED 0x100 + +#define GM_PAGE_TYPE_MASK (GM_PAGE_CPU | GM_PAGE_DEVICE | GM_PAGE_NOMAP) + +/* Records the status of a page-size physical page */ +struct gm_mapping { + unsigned int flag; + + union { + struct page *page; /* CPU node */ + struct gm_dev *dev; /* hetero-node */ + unsigned long pfn; + }; + + struct mutex lock; +}; + +static inline void gm_mapping_flags_set(struct gm_mapping *gm_mapping, int flags) +{ + if (flags & GM_PAGE_TYPE_MASK) + gm_mapping->flag &= ~GM_PAGE_TYPE_MASK; + + gm_mapping->flag |= flags; +} + +static inline void gm_mapping_flags_clear(struct gm_mapping *gm_mapping, int flags) +{ + gm_mapping->flag &= ~flags; +} + +static inline bool gm_mapping_cpu(struct gm_mapping *gm_mapping) +{ + return !!(gm_mapping->flag & GM_PAGE_CPU); +} + +static inline bool gm_mapping_device(struct gm_mapping *gm_mapping) +{ + return !!(gm_mapping->flag & GM_PAGE_DEVICE); +} + +static inline bool gm_mapping_nomap(struct gm_mapping *gm_mapping) +{ + return !!(gm_mapping->flag & GM_PAGE_NOMAP); +} + +static inline bool gm_mapping_willneed(struct gm_mapping *gm_mapping) +{ + return !!(gm_mapping->flag & GM_PAGE_WILLNEED); +} + +static inline bool gm_mapping_pinned(struct gm_mapping *gm_mapping) +{ + return !!(gm_mapping->flag & GM_PAGE_PINNED); +} + +#define test_gm_mapping_mapped_on_node(i) { /* implement this */ } +#define set_gm_mapping_mapped_on_node(i) { /* implement this */ } +#define unset_gm_mapping_mapped_on_node(i) { /* implement this */ } + +/* GMEM Device KPI */ +extern enum gm_ret gm_dev_create(struct gm_mmu *mmu, void *dev_data, unsigned long cap, + struct gm_dev **new_dev); +extern enum gm_ret gm_dev_switch(struct gm_dev *dev, struct gm_as *as); +extern enum gm_ret gm_dev_detach(struct gm_dev *dev, struct gm_as *as); +extern enum gm_ret gm_dev_register_physmem(struct gm_dev *dev, unsigned long begin, + unsigned long end); +enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, + struct gm_dev *dev, int behavior); +vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, unsigned int order); + +/* GMEM address space KPI */ +extern enum gm_ret gm_dev_register_physmem(struct gm_dev *dev, unsigned long begin, + unsigned long end); +extern void gm_dev_unregister_physmem(struct gm_dev *dev, unsigned int nid); +extern enum gm_ret gm_as_create(unsigned long begin, unsigned long end, enum gm_as_alloc policy, + unsigned long cache_quantum, struct gm_as **new_as); +extern enum gm_ret gm_as_destroy(struct gm_as *as); +extern enum gm_ret gm_as_attach(struct gm_as *as, struct gm_dev *dev, enum gm_mmu_mode mode, + bool activate, struct gm_context **out_ctx); +extern unsigned long gm_as_alloc(struct gm_as *as, unsigned long hint, unsigned long size, + unsigned long align, unsigned long no_cross, unsigned long max_va, + struct gm_region **new_region); + +extern int hmadvise_inner(int hnid, unsigned long start, size_t len_in, int behavior); +extern int hmemcpy(int hnid, unsigned long dest, unsigned long src, size_t size); + +enum gmem_stats_item { + NR_PAGE_MIGRATING_H2D, + NR_PAGE_MIGRATING_D2H, + NR_GMEM_STAT_ITEMS +}; + +extern void gmem_stats_counter(enum gmem_stats_item item, int val); +extern void 
gmem_stats_counter_show(void); + +/* h-NUMA topology */ +struct hnode { + unsigned int id; + + struct gm_dev *dev; + + struct xarray pages; +}; + +static inline bool is_hnode(int node) +{ + return (node < MAX_NUMNODES) && !node_isset(node, node_possible_map) && + node_isset(node, hnode_map); +} + +static inline bool is_hnode_allowed(int node) +{ + return (node < MAX_NUMNODES) && is_hnode(node) && + node_isset(node, current->mems_allowed); +} + +static inline int get_hnuma_id(struct gm_dev *gm_dev) +{ + return first_node(gm_dev->registered_hnodes); +} + +void __init hnuma_init(void); +unsigned int alloc_hnode_id(void); +void free_hnode_id(unsigned int nid); +void hnode_init(struct hnode *hnode, unsigned int hnid, struct gm_dev *dev); +void hnode_deinit(unsigned int hnid, struct gm_dev *dev); + +#define gmem_err(fmt, ...) \ + ((void)pr_err("[gmem]" fmt "\n", ##__VA_ARGS__)) + +#endif /* _GMEM_H */ diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index cfe42c43b55b679b02b35f362acee6f28d1d145c..88bdd17cadeb6daba91e1b67075a8d41981b39a1 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -9,6 +9,9 @@ #include vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf); +#ifdef CONFIG_GMEM +vm_fault_t do_huge_pmd_anonymous_page_with_peer_shared(struct vm_fault *vmf); +#endif int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); @@ -16,7 +19,6 @@ void huge_pmd_set_accessed(struct vm_fault *vmf); int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, pud_t *dst_pud, pud_t *src_pud, unsigned long addr, struct vm_area_struct *vma); - #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud); #else diff --git a/include/linux/mm.h b/include/linux/mm.h index 77a7d7c4c88c517365c5f338f980a625a41f4cba..ebe1364b005d2267f73474005d2e96a680aebb95 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -342,6 +342,12 @@ extern unsigned int kobjsize(const void *objp); #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4) #define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5) + +#ifdef CONFIG_GMEM +#define VM_PEER_SHARED BIT(56) +#else +#define VM_PEER_SHARED VM_NONE +#endif #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ #ifdef CONFIG_ARCH_HAS_PKEYS @@ -3404,6 +3410,12 @@ unsigned long randomize_page(unsigned long start, unsigned long range); extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); +#ifdef CONFIG_GMEM +extern unsigned long get_unmapped_area_aligned(struct file *file, + unsigned long addr, unsigned long len, unsigned long pgoff, + unsigned long flags, unsigned long align); +#endif + extern unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf); @@ -4211,4 +4223,28 @@ void vma_pgtable_walk_end(struct vm_area_struct *vma); /* added to mm.h to avoid every caller adding new header file */ #include + +#ifdef CONFIG_GMEM +DECLARE_STATIC_KEY_FALSE(gmem_status); + +static inline bool gmem_is_enabled(void) +{ + return static_branch_likely(&gmem_status); +} + +static inline bool vma_is_peer_shared(struct vm_area_struct *vma) +{ + if (!gmem_is_enabled()) + return false; + + return !!(vma->vm_flags & VM_PEER_SHARED); +} +#else +static inline bool gmem_is_enabled(void) { return false; } +static 
inline bool vma_is_peer_shared(struct vm_area_struct *vma) +{ + return false; +} +#endif + #endif /* _LINUX_MM_H */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 64c38b09e18d5579dd362cc160f68d6535c70428..f012f7c7c4d4a11c5532e33bf4331ce114687233 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -612,6 +612,74 @@ struct vm_userfaultfd_ctx { struct vm_userfaultfd_ctx {}; #endif /* CONFIG_USERFAULTFD */ +#ifdef CONFIG_GMEM +/* + * Defines a centralized logical mapping table that reflects the mapping information + * regardless of the underlying arch-specific MMUs. + * The implementation of this data structure borrows the VM_OBJECT from FreeBSD as well + * as the filemap address_space struct from Linux page cache. + * Only VMAs point to VM_OBJECTs and maintain logical mappings, because we assume that + * the coordiantion between page tables must happen with CPU page table involved. That + * is to say, a generalized process unit must involve in a UVA-programming model, otherwise + * there is no point to support UVA programming. + * However, a VMA only needs to maintain logical mappings if the process has been + * attached to a GMEM VA space. In normal cases, a CPU process does not need it. (unless + * we later build a reservation system on top of the logical mapping tables to support + * reservation-based superpages and rangeTLBs). + * A GM_REGION does not need to maintain logical mappings. In the case that a device wants + * to support its private address space with local physical memory, GMEM should forward address + * space management to the core VM, using VMAs, instead of using GM_REGIONs. + */ +struct vm_object { + spinlock_t lock; + struct vm_area_struct *vma; + + /* + * The logical_page_table is a container that holds the mapping + * information between a VA and a struct page. + */ + struct xarray *logical_page_table; + atomic_t nr_pages; + + /* + * a vm object might be referred by multiple VMAs to share + * memory. + */ + atomic_t ref_count; +}; + +#define GMEM_MMAP_RETRY_TIMES 10 /* gmem retry times before OOM */ + +/** + * enum gm_as_alloc - defines different allocation policy for virtual addresses. + * + * @GM_AS_ALLOC_DEFAULT: An object cache is applied to accelerate VA allocations. + * @GM_AS_ALLOC_FIRSTFIT: Prefer allocation efficiency. + * @GM_AS_ALLOC_BESTFIT: Prefer space efficiency. + * @GM_AS_ALLOC_NEXTFIT: Perform an address-ordered search for free addresses, + * beginning where the previous search ended. + */ +enum gm_as_alloc { + GM_AS_ALLOC_DEFAULT = 0, + GM_AS_ALLOC_FIRSTFIT, + GM_AS_ALLOC_BESTFIT, + GM_AS_ALLOC_NEXTFIT, +}; + +/* Defines an address space. */ +struct gm_as { + spinlock_t rbtree_lock; /* spinlock of struct gm_as */ + struct rb_root rbroot; /*root of gm_region_t */ + enum gm_as_alloc policy; + unsigned long start_va; + unsigned long end_va; + /* defines the VA unit size if an object cache is applied */ + unsigned long cache_quantum; + /* tracks device contexts attached to this va space, using gm_as_link */ + struct list_head gm_ctx_list; +}; +#endif + struct anon_vma_name { struct kref kref; /* The name needs to be at the end because it is dynamically sized. 
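Since struct vm_area_struct below gains a vm_obj pointer, the expected pattern for consulting this logical page table is sketched here; it mirrors gm_dev_fault_locked() later in the patch, and the helper name is invented for illustration:

#include <linux/mm_types.h>
#include <linux/huge_mm.h>
#include <linux/vm_object.h>
#include <linux/gmem.h>

/* Sketch: find the logical mapping entry that covers a faulting address. */
static struct gm_mapping *example_logical_lookup(struct vm_area_struct *vma,
						 unsigned long addr)
{
	struct vm_object *obj = vma->vm_obj;
	struct gm_mapping *gm_mapping;

	if (!obj)
		return NULL;	/* only peer-shared VMAs carry a vm_object */

	xa_lock(obj->logical_page_table);
	gm_mapping = vm_object_lookup(obj, addr & HPAGE_PMD_MASK);
	xa_unlock(obj->logical_page_table);

	return gm_mapping;
}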
*/ @@ -735,7 +803,11 @@ struct vm_area_struct { #ifdef CONFIG_SHARE_POOL struct sp_area *spa; #endif +#ifdef CONFIG_GMEM + KABI_USE(1, struct vm_object *vm_obj) +#else KABI_RESERVE(1) +#endif KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4) @@ -1016,7 +1088,11 @@ struct mm_struct { #else KABI_RESERVE(1) #endif +#ifdef CONFIG_GMEM + KABI_USE(2, struct gm_as *gm_as) +#else KABI_RESERVE(2) +#endif KABI_RESERVE(3) KABI_RESERVE(4) KABI_RESERVE(5) diff --git a/include/linux/mman.h b/include/linux/mman.h index 8ddca62d6460bd461b8afff731bb64a5203b822a..30ec68346f6b0409155afbf32aa3d40e8afb305b 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -55,7 +55,8 @@ | MAP_32BIT \ | MAP_ABOVE4G \ | MAP_HUGE_2MB \ - | MAP_HUGE_1GB) + | MAP_HUGE_1GB \ + | MAP_PEER_SHARED) extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 8d07116caaf1b037c3121bd8ca5011dd4568cdc2..f005f3d903aedc52d0d9423f3077b6cfedd10865 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -407,6 +407,11 @@ enum node_states { N_MEMORY, /* The node has memory(regular, high, movable) */ N_CPU, /* The node has one or more cpus */ N_GENERIC_INITIATOR, /* The node has one or more Generic Initiators */ +#ifdef CONFIG_GMEM +#ifndef __GENKSYMS__ + N_HETEROGENEOUS, /* The node has heterogeneous memory */ +#endif +#endif NR_NODE_STATES }; @@ -536,6 +541,13 @@ static inline int node_random(const nodemask_t *maskp) #define for_each_node(node) for_each_node_state(node, N_POSSIBLE) #define for_each_online_node(node) for_each_node_state(node, N_ONLINE) +#ifdef CONFIG_GMEM +/* For h-NUMA topology */ +#define hnode_map node_states[N_HETEROGENEOUS] +#define num_hnodes() num_node_state(N_HETEROGENEOUS) +#define for_each_hnode(node) for_each_node_state(node, N_HETEROGENEOUS) +#endif + /* * For nodemask scratch area. * NODEMASK_ALLOC(type, name) allocates an object with a specified type and diff --git a/include/linux/remote_pager/msg_chan.h b/include/linux/remote_pager/msg_chan.h new file mode 100644 index 0000000000000000000000000000000000000000..a8049def052d6686a59474846b83c59576cd2263 --- /dev/null +++ b/include/linux/remote_pager/msg_chan.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __RPG_MSG_CHAN_H__ +#define __RPG_MSG_CHAN_H__ + +#include + +/* + * struct phys_channel_ops - Channel physical layer ops + * @open: Open the communication channel of node nid and alloc physical resources, + * returns the channel ID + * @notify: Notify peer of chan_id to receive messages + * @copy_to: Copy the msg_data message from origin to peer + * @copy_from: Copy the msg_data message from peer to origin + * @close: Close channel and free physical resources + */ +struct phys_channel_ops { + char *name; + int (*open)(int nid); + int (*notify)(int chan_id); + int (*copy_to)(int chan_id, void *msg_data, size_t msg_len, int flags); + int (*copy_from)(int chan_id, void *buf, size_t len, int flags); + int (*migrate_page)(void *peer_addr, struct page *local_page, size_t size, int dir); + int (*close)(int chan_id); +}; + +int msg_layer_install_phy_ops(struct phys_channel_ops *ops, int default_chan_id); +int msg_layer_uninstall_phy_ops(struct phys_channel_ops *ops); + +#define log_err(fmt, ...) pr_err("[%s:%d]" fmt, __func__, __LINE__, ##__VA_ARGS__) +#define log_info(fmt, ...) 
pr_info("[%s:%d]" fmt, __func__, __LINE__, ##__VA_ARGS__) + +#define MSG_CMD_START 0x1 +#define MSG_CMD_IRQ_END 0x2 +#define MSG_CMD_FIFO_NO_MEM 0x3 +#define MSG_CMD_CHANN_OPEN 0x4 + +#define CHAN_STAT_ENABLE 1 +#define CHAN_STAT_DISABLE 0 + +#define TO_PEER 0 +#define FROM_PEER 1 + +#endif diff --git a/include/linux/vm_object.h b/include/linux/vm_object.h new file mode 100644 index 0000000000000000000000000000000000000000..ca82642eb2df62503d8a1caeaf96341b073fc778 --- /dev/null +++ b/include/linux/vm_object.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _VM_OBJECT_H +#define _VM_OBJECT_H + +#include +#include + +#ifdef CONFIG_GMEM +/* vm_object KPI */ +int __init vm_object_init(void); +struct vm_object *vm_object_create(struct vm_area_struct *vma); +void vm_object_drop_locked(struct vm_area_struct *vma); +void dup_vm_object(struct vm_area_struct *dst, struct vm_area_struct *src, bool dst_peer_shared); +void vm_object_adjust(struct vm_area_struct *vma, unsigned long start, + unsigned long end); +void dup_peer_shared_vma(struct vm_area_struct *vma); + +struct gm_mapping *alloc_gm_mapping(void); +struct gm_mapping *vm_object_lookup(struct vm_object *obj, unsigned long va); +void vm_object_mapping_create(struct vm_object *obj, unsigned long start); +void free_gm_mappings(struct vm_area_struct *vma); +#else +static inline void __init vm_object_init(void) {} +static inline struct vm_object *vm_object_create(struct vm_area_struct *vma) { return NULL; } +static inline void vm_object_drop_locked(struct vm_area_struct *vma) {} +static inline void dup_vm_object(struct vm_area_struct *dst, + struct vm_area_struct *src, bool dst_peer_shared) {} +static inline void dup_peer_shared_vma(struct vm_area_struct *vma) {} +static inline void vm_object_adjust(struct vm_area_struct *vma, unsigned long start, + unsigned long end) {} + +static inline struct gm_mapping *alloc_gm_mapping(void) { return NULL; } +static inline struct gm_mapping *vm_object_lookup(struct vm_object *obj, + unsigned long va) { return NULL; } +static inline void vm_object_mapping_create(struct vm_object *obj, + unsigned long start) {} +static inline void free_gm_mappings(struct vm_area_struct *vma) {} +#endif + +#endif /* _VM_OBJECT_H */ diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 14e5498efd7acab203c0d43e48e0536ed52ffead..d8857c71d4bb40f1c8daecbcb61bd025f2e6c5c2 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -33,6 +33,8 @@ #define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be * uninitialized */ +#define MAP_PEER_SHARED 0x1000000 + /* * Flags for mlock */ @@ -79,6 +81,11 @@ #define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */ +/* for hmadvise */ +#define MADV_GMEM_BASE 0x1000 +#define MADV_PREFETCH MADV_GMEM_BASE /* prefetch pages for hNUMA node */ +#define MADV_PINNED (MADV_GMEM_BASE+1) /* pin these pages */ + #define MADV_ETMEM_BASE 0x1100 #define MADV_SWAPFLAG MADV_ETMEM_BASE /* for memory to be swap out */ #define MADV_SWAPFLAG_REMOVE (MADV_SWAPFLAG + 1) diff --git a/kernel/fork.c b/kernel/fork.c index 96c6a9e446ac01de782450b563ba52cc3bc794b3..3461216d7b0b68dc851de292b7b667ac75d0299e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -99,6 +99,11 @@ #include #include #include + +#ifdef CONFIG_GMEM +#include +#endif + #ifdef CONFIG_QOS_SCHED_SMART_GRID #include #endif @@ -110,7 +115,9 @@ #include #include #include - +#ifdef CONFIG_GMEM +#include +#endif #include #define 
CREATE_TRACE_POINTS @@ -523,6 +530,10 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) vma_numab_state_init(new); dup_anon_vma_name(orig, new); +#ifdef CONFIG_GMEM + dup_peer_shared_vma(new); +#endif + return new; } @@ -548,6 +559,10 @@ static void vm_area_free_rcu_cb(struct rcu_head *head) void vm_area_free(struct vm_area_struct *vma) { +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma)) + vm_object_drop_locked(vma); +#endif #ifdef CONFIG_PER_VMA_LOCK call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb); #else @@ -1765,7 +1780,9 @@ static struct mm_struct *dup_mm(struct task_struct *tsk, err = dup_mmap(mm, oldmm); if (err) goto free_pt; - +#ifdef CONFIG_GMEM + mm->gm_as = NULL; +#endif mm->hiwater_rss = get_mm_rss(mm); mm->hiwater_vm = mm->total_vm; diff --git a/mm/Kconfig b/mm/Kconfig index bdd8372552ffd0fd17a1c879c5fe1545f99f0f0c..829a0d6a0fb5e50caef77fe02423ecd053119b1c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1480,6 +1480,21 @@ config NUMABALANCING_MEM_SAMPLING if unsure, say N to disable the NUMABALANCING_MEM_SAMPLING. +config GMEM + bool "gmem subsystem for multi-MMU cooperative management" + depends on (ARM64 || X86_64) && MMU && TRANSPARENT_HUGEPAGE + select ARCH_USES_HIGH_VMA_FLAGS + default y + help + This provides a high-level interface that decouples MMU-specific functions. + Device drivers can thus attach themselves to a process’s address space and + let the OS take charge of their memory management. This eliminates + the need for device drivers to reinvent the wheel and allows them to + benefit from general memory optimizations integrated by GMEM. + + say Y here to enable gmem subsystem + + source "mm/damon/Kconfig" config THP_CONTROL diff --git a/mm/Makefile b/mm/Makefile index 5e45f01f56ce94b4faab4fcea880477f50988e88..108f82ac69172f42552a0489463a90c3a56a28e6 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -41,7 +41,7 @@ mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \ mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \ msync.o page_vma_mapped.o pagewalk.o \ pgtable-generic.o rmap.o vmalloc.o - +mmu-$(CONFIG_GMEM) += gmem.o vm_object.o ifdef CONFIG_CROSS_MEMORY_ATTACH mmu-$(CONFIG_MMU) += process_vm_access.o diff --git a/mm/gmem.c b/mm/gmem.c new file mode 100644 index 0000000000000000000000000000000000000000..039f4cfe28db1795aa4c5a00e0896da6c0c3e19f --- /dev/null +++ b/mm/gmem.c @@ -0,0 +1,1064 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Generalized Memory Management. + * + * Copyright (C) 2023- Huawei, Inc. 
+ * Author: Weixi Zhu + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +DEFINE_STATIC_KEY_FALSE(gmem_status); +EXPORT_SYMBOL_GPL(gmem_status); + +static struct kmem_cache *gm_as_cache; +static struct kmem_cache *gm_dev_cache; +static struct kmem_cache *gm_ctx_cache; +static struct kmem_cache *gm_region_cache; +static DEFINE_XARRAY_ALLOC(gm_dev_id_pool); + +static bool enable_gmem; + +DEFINE_SPINLOCK(hnode_lock); +struct hnode *hnodes[MAX_NUMNODES]; + +static inline unsigned long pe_mask(unsigned int order) +{ + if (order == 0) + return PAGE_MASK; + if (order == PMD_ORDER) + return HPAGE_PMD_MASK; + if (order == PUD_ORDER) + return HPAGE_PUD_MASK; + return 0; +} + +static struct percpu_counter g_gmem_stats[NR_GMEM_STAT_ITEMS]; + +void gmem_stats_counter(enum gmem_stats_item item, int val) +{ + if (!gmem_is_enabled()) + return; + + if (WARN_ON_ONCE(unlikely(item >= NR_GMEM_STAT_ITEMS))) + return; + + percpu_counter_add(&g_gmem_stats[item], val); +} + +static int gmem_stats_init(void) +{ + int i, rc; + + for (i = 0; i < NR_GMEM_STAT_ITEMS; i++) { + rc = percpu_counter_init(&g_gmem_stats[i], 0, GFP_KERNEL); + if (rc) { + int j; + + for (j = i-1; j >= 0; j--) + percpu_counter_destroy(&g_gmem_stats[j]); + + break; /* break the initialization process */ + } + } + + return rc; +} + +#ifdef CONFIG_PROC_FS +static int gmem_stats_show(struct seq_file *m, void *arg) +{ + if (!gmem_is_enabled()) + return 0; + + seq_printf( + m, "migrating H2D : %lld\n", + percpu_counter_read_positive(&g_gmem_stats[NR_PAGE_MIGRATING_H2D])); + seq_printf( + m, "migrating D2H : %lld\n", + percpu_counter_read_positive(&g_gmem_stats[NR_PAGE_MIGRATING_D2H])); + + return 0; +} +#endif /* CONFIG_PROC_FS */ + +static struct workqueue_struct *prefetch_wq; + +#define GM_WORK_CONCURRENCY 4 + +static int __init gmem_init(void) +{ + int err = -ENOMEM; + + if (!enable_gmem) + return 0; + + gm_as_cache = KMEM_CACHE(gm_as, 0); + if (!gm_as_cache) + goto out; + + gm_dev_cache = KMEM_CACHE(gm_dev, 0); + if (!gm_dev_cache) + goto free_as; + + gm_ctx_cache = KMEM_CACHE(gm_context, 0); + if (!gm_ctx_cache) + goto free_dev; + + gm_region_cache = KMEM_CACHE(gm_region, 0); + if (!gm_region_cache) + goto free_ctx; + + err = vm_object_init(); + if (err) + goto free_region; + + err = gmem_stats_init(); + if (err) + goto free_region; + + prefetch_wq = alloc_workqueue("prefetch", + __WQ_LEGACY | WQ_UNBOUND | WQ_HIGHPRI | WQ_CPU_INTENSIVE, GM_WORK_CONCURRENCY); + if (!prefetch_wq) { + gmem_err("fail to alloc workqueue prefetch_wq\n"); + err = -EFAULT; + goto free_region; + } + +#ifdef CONFIG_PROC_FS + proc_create_single("gmemstats", 0444, NULL, gmem_stats_show); +#endif + + static_branch_enable(&gmem_status); + + return 0; + +free_region: + kmem_cache_destroy(gm_region_cache); +free_ctx: + kmem_cache_destroy(gm_ctx_cache); +free_dev: + kmem_cache_destroy(gm_dev_cache); +free_as: + kmem_cache_destroy(gm_as_cache); +out: + return -ENOMEM; +} +subsys_initcall(gmem_init); + +static int __init setup_gmem(char *str) +{ + strtobool(str, &enable_gmem); + + return 1; +} +__setup("gmem=", setup_gmem); + +/* + * Create a GMEM device, register its MMU function and the page table. + * The returned device pointer will be passed by new_dev. 
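For orientation, the registration sequence a peer driver (for instance the remote_pager module enabled in the defconfigs above) would follow looks roughly like the sketch below; my_mmu, the DMA device and the 4GB range are placeholders, and only the KPI signatures and the GM_DEV_CAP_PEER flag come from this patch:

#include <linux/gmem.h>
#include <linux/sizes.h>

static struct gm_dev *my_gm_dev;

/* Sketch of probe-time registration against the GMEM device KPIs. */
static int example_gm_register(struct gm_mmu *my_mmu, struct device *dma_dev)
{
	enum gm_ret ret;

	ret = gm_dev_create(my_mmu, NULL, GM_DEV_CAP_PEER, &my_gm_dev);
	if (ret != GM_RET_SUCCESS)
		return -ENODEV;

	/* gm_dev_fault_locked() uses dev->dma_dev for host<->device copies */
	my_gm_dev->dma_dev = dma_dev;

	/* Expose 4GB of device-local memory as an h-NUMA node (example range). */
	ret = gm_dev_register_physmem(my_gm_dev, 0, SZ_4G);
	if (ret != GM_RET_SUCCESS)
		return -ENOMEM;

	return 0;
}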
+ * A unique id will be assigned to the GMEM device, using Linux's xarray. + */ +enum gm_ret gm_dev_create(struct gm_mmu *mmu, void *dev_data, unsigned long cap, + struct gm_dev **new_dev) +{ + struct gm_dev *dev; + + if (!gmem_is_enabled()) + return GM_RET_FAILURE_UNKNOWN; + + dev = kmem_cache_alloc(gm_dev_cache, GFP_KERNEL); + if (!dev) + return GM_RET_NOMEM; + + if (xa_alloc(&gm_dev_id_pool, &dev->id, dev, xa_limit_32b, + GFP_KERNEL)) { + kmem_cache_free(gm_dev_cache, dev); + return GM_RET_NOMEM; + } + + dev->capability = cap; + dev->mmu = mmu; + dev->dev_data = dev_data; + dev->current_ctx = NULL; + INIT_LIST_HEAD(&dev->gm_ctx_list); + *new_dev = dev; + nodes_clear(dev->registered_hnodes); + return GM_RET_SUCCESS; +} +EXPORT_SYMBOL_GPL(gm_dev_create); + +/* Handle the page fault triggered by a given device with mmap lock*/ +enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, struct gm_dev *dev, + int behavior) +{ + enum gm_ret ret = GM_RET_SUCCESS; + struct gm_mmu *mmu = dev->mmu; + struct device *dma_dev = dev->dma_dev; + struct vm_area_struct *vma; + struct vm_object *obj; + struct gm_mapping *gm_mapping; + unsigned long size = HPAGE_SIZE; + struct gm_fault_t gmf = { + .mm = mm, + .va = addr, + .dev = dev, + .size = size, + .copy = false, + .behavior = behavior + }; + struct page *page = NULL; + + vma = find_vma(mm, addr); + if (!vma || vma->vm_start > addr) { + gmem_err("%s failed to find vma\n", __func__); + pr_info("gmem: %s no vma\n", __func__); + ret = GM_RET_FAILURE_UNKNOWN; + goto out; + } + obj = vma->vm_obj; + if (!obj) { + gmem_err("%s no vm_obj\n", __func__); + ret = GM_RET_FAILURE_UNKNOWN; + goto out; + } + + xa_lock(obj->logical_page_table); + gm_mapping = vm_object_lookup(obj, addr); + if (!gm_mapping) { + vm_object_mapping_create(obj, addr); + gm_mapping = vm_object_lookup(obj, addr); + } + xa_unlock(obj->logical_page_table); + + if (unlikely(!gm_mapping)) { + gmem_err("OOM when creating vm_obj!\n"); + ret = GM_RET_NOMEM; + goto out; + } + mutex_lock(&gm_mapping->lock); + if (gm_mapping_nomap(gm_mapping)) { + goto peer_map; + } else if (gm_mapping_device(gm_mapping)) { + if (behavior == MADV_WILLNEED || behavior == MADV_PINNED) { + goto peer_map; + } else { + ret = 0; + goto unlock; + } + } else if (gm_mapping_cpu(gm_mapping)) { + page = gm_mapping->page; + if (!page) { + gmem_err("host gm_mapping page is NULL. Set nomap\n"); + gm_mapping_flags_set(gm_mapping, GM_PAGE_NOMAP); + goto unlock; + } + get_page(page); + /* zap_page_range_single can be used in Linux 6.4 and later versions. */ + zap_page_range_single(vma, addr, size, NULL); + gmf.dma_addr = + dma_map_page(dma_dev, page, 0, size, DMA_BIDIRECTIONAL); + if (dma_mapping_error(dma_dev, gmf.dma_addr)) + gmem_err("dma map failed\n"); + + gmf.copy = true; + } + +peer_map: + ret = mmu->peer_map(&gmf); + if (ret != GM_RET_SUCCESS) { + if (ret == GM_RET_MIGRATING) { + /* + * gmem page is migrating due to overcommit. 
+ * update page to willneed and this will stop page evicting + */ + gm_mapping_flags_set(gm_mapping, GM_PAGE_WILLNEED); + gmem_stats_counter(NR_PAGE_MIGRATING_D2H, 1); + ret = GM_RET_SUCCESS; + } else { + gmem_err("peer map failed\n"); + if (page) { + gm_mapping_flags_set(gm_mapping, GM_PAGE_NOMAP); + put_page(page); + } + } + goto unlock; + } + + if (page) { + dma_unmap_page(dma_dev, gmf.dma_addr, size, DMA_BIDIRECTIONAL); + put_page(page); + } + + gm_mapping_flags_set(gm_mapping, GM_PAGE_DEVICE); + gm_mapping->dev = dev; +unlock: + mutex_unlock(&gm_mapping->lock); +out: + return ret; +} +EXPORT_SYMBOL_GPL(gm_dev_fault_locked); + +vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, + unsigned int order) +{ + vm_fault_t ret = 0; + struct vm_area_struct *vma = vmf->vma; + unsigned long addr = vmf->address & pe_mask(order); + struct vm_object *obj = vma->vm_obj; + struct gm_mapping *gm_mapping; + unsigned long size = HPAGE_SIZE; + struct gm_dev *dev; + struct device *dma_dev; + struct gm_fault_t gmf = { + .mm = vma->vm_mm, + .va = addr, + .size = size, + .copy = true, + }; + + gm_mapping = vm_object_lookup(obj, addr); + if (!gm_mapping) { + gmem_err("host fault gm_mapping should not be NULL\n"); + return VM_FAULT_SIGBUS; + } + + dev = gm_mapping->dev; + gmf.dev = dev; + dma_dev = dev->dma_dev; + gmf.dma_addr = + dma_map_page(dma_dev, vmf->page, 0, size, DMA_BIDIRECTIONAL); + if (dma_mapping_error(dma_dev, gmf.dma_addr)) { + gmem_err("host fault dma mapping error\n"); + return VM_FAULT_SIGBUS; + } + if (dev->mmu->peer_unmap(&gmf) != GM_RET_SUCCESS) { + gmem_err("peer unmap failed\n"); + dma_unmap_page(dma_dev, gmf.dma_addr, size, DMA_BIDIRECTIONAL); + return VM_FAULT_SIGBUS; + } + + dma_unmap_page(dma_dev, gmf.dma_addr, size, DMA_BIDIRECTIONAL); + return ret; +} + +static inline struct hnode *get_hnode(unsigned int hnid) +{ + return hnodes[hnid]; +} + +static struct gm_dev *get_gm_dev(unsigned int nid) +{ + struct hnode *hnode; + struct gm_dev *dev = NULL; + + spin_lock(&hnode_lock); + hnode = get_hnode(nid); + if (hnode) + dev = hnode->dev; + spin_unlock(&hnode_lock); + return dev; +} + +/* + * Register the local physical memory of a gmem device. + * This implies dynamically creating + * the struct page data structures. 
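Put differently, entry i in the hnode's xarray describes the device page at begin + i * PAGE_SIZE, backed by a struct gm_mapping rather than a struct page. A hypothetical helper, shown only to make the indexing explicit (not provided by this patch):

#include <linux/gmem.h>

/* 'begin' must be the same base address that was passed to gm_dev_register_physmem(). */
static struct gm_mapping *example_hnode_lookup(struct hnode *hnode,
					       unsigned long begin,
					       unsigned long pa)
{
	unsigned long idx = (pa - PAGE_ALIGN(begin)) >> PAGE_SHIFT;

	return xa_load(&hnode->pages, idx);
}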
+ */ +enum gm_ret gm_dev_register_physmem(struct gm_dev *dev, unsigned long begin, unsigned long end) +{ + struct gm_mapping *mapping; + unsigned long addr = PAGE_ALIGN(begin); + unsigned int nid; + int i, page_num = (end - addr) >> PAGE_SHIFT; + struct hnode *hnode = kmalloc(sizeof(struct hnode), GFP_KERNEL); + + if (!hnode) + goto err; + + mapping = kvmalloc_array(page_num, sizeof(struct gm_mapping), GFP_KERNEL); + if (!mapping) + goto free_hnode; + + spin_lock(&hnode_lock); + nid = alloc_hnode_id(); + if (nid == MAX_NUMNODES) + goto unlock_hnode; + hnode_init(hnode, nid, dev); + + for (i = 0; i < page_num; i++, addr += PAGE_SIZE) { + mapping[i].pfn = addr >> PAGE_SHIFT; + mapping[i].flag = 0; + } + + xa_lock(&hnode->pages); + for (i = 0; i < page_num; i++) { + if (xa_err(__xa_store(&hnode->pages, i, mapping + i, + GFP_KERNEL))) { + /* Probably nomem */ + kvfree(mapping); + xa_unlock(&hnode->pages); + goto deinit_hnode; + } + __xa_set_mark(&hnode->pages, i, XA_MARK_0); + } + xa_unlock(&hnode->pages); + + spin_unlock(&hnode_lock); + return GM_RET_SUCCESS; + +deinit_hnode: + hnode_deinit(nid, dev); + free_hnode_id(nid); +unlock_hnode: + spin_unlock(&hnode_lock); +free_hnode: + kfree(hnode); +err: + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(gm_dev_register_physmem); + +void gm_dev_unregister_physmem(struct gm_dev *dev, unsigned int nid) +{ + struct hnode *hnode = NULL; + struct gm_mapping *mapping = NULL; + + spin_lock(&hnode_lock); + + if (!node_isset(nid, dev->registered_hnodes)) + goto unlock; + + hnode = get_hnode(nid); + + if (!hnode) + goto unlock; + mapping = xa_load(&hnode->pages, 0); + + if (mapping) + kvfree(mapping); + + hnode_deinit(nid, dev); + free_hnode_id(nid); + kfree(hnode); +unlock: + spin_unlock(&hnode_lock); +} +EXPORT_SYMBOL_GPL(gm_dev_unregister_physmem); + +/* GMEM Virtual Address Space API */ +enum gm_ret gm_as_create(unsigned long begin, unsigned long end, enum gm_as_alloc policy, + unsigned long cache_quantum, struct gm_as **new_as) +{ + struct gm_as *as; + + if (!new_as) + return -EINVAL; + + as = kmem_cache_alloc(gm_as_cache, GFP_ATOMIC); + if (!as) + return -ENOMEM; + + spin_lock_init(&as->rbtree_lock); + as->rbroot = RB_ROOT; + as->start_va = begin; + as->end_va = end; + as->policy = policy; + + INIT_LIST_HEAD(&as->gm_ctx_list); + + *new_as = as; + return GM_RET_SUCCESS; +} +EXPORT_SYMBOL_GPL(gm_as_create); + +enum gm_ret gm_as_destroy(struct gm_as *as) +{ + struct gm_context *ctx, *tmp_ctx; + + list_for_each_entry_safe(ctx, tmp_ctx, &as->gm_ctx_list, gm_as_link) + kfree(ctx); + + kmem_cache_free(gm_as_cache, as); + + return GM_RET_SUCCESS; +} +EXPORT_SYMBOL_GPL(gm_as_destroy); + +enum gm_ret gm_as_attach(struct gm_as *as, struct gm_dev *dev, enum gm_mmu_mode mode, + bool activate, struct gm_context **out_ctx) +{ + struct gm_context *ctx; + int nid; + int ret; + + ctx = kmem_cache_alloc(gm_ctx_cache, GFP_KERNEL); + if (!ctx) + return GM_RET_NOMEM; + + ctx->as = as; + ctx->dev = dev; + ctx->pmap = NULL; + ret = dev->mmu->pmap_create(dev, &ctx->pmap); + if (ret) { + kmem_cache_free(gm_ctx_cache, ctx); + return ret; + } + + INIT_LIST_HEAD(&ctx->gm_dev_link); + INIT_LIST_HEAD(&ctx->gm_as_link); + list_add_tail(&dev->gm_ctx_list, &ctx->gm_dev_link); + list_add_tail(&ctx->gm_as_link, &as->gm_ctx_list); + + if (activate) { + /* + * Here we should really have a callback function to perform the context switch + * for the hardware. E.g. in x86 this function is effectively + * flushing the CR3 value. 
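Tying the address-space KPIs together, attaching a peer device to a process would look roughly like the sketch below; the function name, the error codes and the choice of VA range are illustrative, and only the signatures, the allocation policy and the MMU mode come from this patch:

#include <linux/gmem.h>
#include <linux/mm_types.h>

/* Sketch: give 'dev' a coherent, exclusively-mapped view of [begin, end). */
static int example_gm_attach(struct gm_dev *dev, unsigned long begin,
			     unsigned long end)
{
	struct gm_as *as;
	struct gm_context *ctx;
	enum gm_ret ret;

	ret = gm_as_create(begin, end, GM_AS_ALLOC_DEFAULT, PAGE_SIZE, &as);
	if (ret != GM_RET_SUCCESS)
		return -ENOMEM;

	ret = gm_as_attach(as, dev, GM_MMU_MODE_COHERENT_EXCLUSIVE, true, &ctx);
	if (ret != GM_RET_SUCCESS) {
		gm_as_destroy(as);
		return -ENODEV;
	}

	return 0;
}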
Currently we do not care time-sliced context switch, + * unless someone wants to support it. + */ + dev->current_ctx = ctx; + } + *out_ctx = ctx; + + /* + * gm_as_attach will be used to attach device to process address space. + * Handle this case and add hnodes registered by device to process mems_allowed. + */ + for_each_node_mask(nid, dev->registered_hnodes) + node_set(nid, current->mems_allowed); + return GM_RET_SUCCESS; +} +EXPORT_SYMBOL_GPL(gm_as_attach); + +void __init hnuma_init(void) +{ + unsigned int node; + spin_lock(&hnode_lock); + for_each_node(node) + node_set(node, hnode_map); + spin_unlock(&hnode_lock); +} + +unsigned int alloc_hnode_id(void) +{ + unsigned int node; + + node = first_unset_node(hnode_map); + node_set(node, hnode_map); + + return node; +} + +void free_hnode_id(unsigned int nid) +{ + spin_lock(&hnode_lock); + node_clear(nid, hnode_map); + spin_unlock(&hnode_lock); +} + +void hnode_init(struct hnode *hnode, unsigned int hnid, struct gm_dev *dev) +{ + hnodes[hnid] = hnode; + hnodes[hnid]->id = hnid; + hnodes[hnid]->dev = dev; + node_set(hnid, dev->registered_hnodes); + xa_init(&hnodes[hnid]->pages); +} + +void hnode_deinit(unsigned int hnid, struct gm_dev *dev) +{ + hnodes[hnid]->id = 0; + hnodes[hnid]->dev = NULL; + node_clear(hnid, dev->registered_hnodes); + xa_destroy(&hnodes[hnid]->pages); + hnodes[hnid] = NULL; +} + +struct prefetch_data { + struct mm_struct *mm; + struct gm_dev *dev; + unsigned long addr; + size_t size; + struct work_struct work; + int *res; +}; + +static void prefetch_work_cb(struct work_struct *work) +{ + struct prefetch_data *d = + container_of(work, struct prefetch_data, work); + unsigned long addr = d->addr, end = d->addr + d->size; + int page_size = HPAGE_SIZE; + int ret; + + do { + /* MADV_WILLNEED: dev will soon access this addr. */ + mmap_read_lock(d->mm); + ret = gm_dev_fault_locked(d->mm, addr, d->dev, MADV_WILLNEED); + mmap_read_unlock(d->mm); + if (ret == GM_RET_PAGE_EXIST) { + gmem_err("%s: device has done page fault, ignore prefetch\n", + __func__); + } else if (ret != GM_RET_SUCCESS) { + *d->res = -EFAULT; + gmem_err("%s: call dev fault error %d\n", __func__, ret); + } + } while (addr += page_size, addr != end); + + kfree(d); +} + +static int hmadvise_do_prefetch(struct gm_dev *dev, unsigned long addr, size_t size) +{ + unsigned long start, end, per_size; + int page_size = HPAGE_SIZE; + struct prefetch_data *data; + struct vm_area_struct *vma; + int res = GM_RET_SUCCESS; + unsigned long old_start; + + /* overflow */ + if (check_add_overflow(addr, size, &end)) { + gmem_err("addr plus size will cause overflow!\n"); + return -EINVAL; + } + + old_start = end; + + /* Align addr by rounding outward to make page cover addr. 
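+	 * For example, with 2MB pages a call with addr = 0x40001000 and
+	 * size = 0x1000 rounds to start = 0x40000000 and end = 0x40200000,
+	 * so the single huge page covering the requested bytes is prefetched.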
*/ + end = round_up(end, page_size); + start = round_down(addr, page_size); + size = end - start; + + if (!end && old_start) { + gmem_err("end addr align up 2M causes invalid addr\n"); + return -EINVAL; + } + + if (size == 0) + return 0; + + mmap_read_lock(current->mm); + vma = find_vma(current->mm, start); + if (!vma || start < vma->vm_start || end > vma->vm_end) { + mmap_read_unlock(current->mm); + gmem_err("failed to find vma by invalid start or size.\n"); + return GM_RET_FAILURE_UNKNOWN; + } else if (!vma_is_peer_shared(vma)) { + mmap_read_unlock(current->mm); + gmem_err("%s the vma does not use VM_PEER_SHARED\n", __func__); + return GM_RET_FAILURE_UNKNOWN; + } + mmap_read_unlock(current->mm); + + per_size = (size / GM_WORK_CONCURRENCY) & ~(page_size - 1); + + while (start < end) { + data = kzalloc(sizeof(struct prefetch_data), GFP_KERNEL); + if (!data) { + flush_workqueue(prefetch_wq); + return GM_RET_NOMEM; + } + + INIT_WORK(&data->work, prefetch_work_cb); + data->mm = current->mm; + data->dev = dev; + data->addr = start; + data->res = &res; + if (per_size == 0) + data->size = size; + else + /* Process (1.x * per_size) for the last time */ + data->size = (end - start < 2 * per_size) ? + (end - start) : + per_size; + queue_work(prefetch_wq, &data->work); + start += data->size; + } + + flush_workqueue(prefetch_wq); + return res; +} + +static int gmem_unmap_vma_pages(struct vm_area_struct *vma, unsigned long start, + unsigned long end, int page_size) +{ + struct gm_fault_t gmf = { + .mm = current->mm, + .size = page_size, + .copy = false, + }; + struct gm_mapping *gm_mapping; + struct vm_object *obj; + int ret; + + obj = vma->vm_obj; + if (!obj) { + gmem_err("peer-shared vma should have vm_object\n"); + return -EINVAL; + } + + for (; start < end; start += page_size) { + xa_lock(obj->logical_page_table); + gm_mapping = vm_object_lookup(obj, start); + if (!gm_mapping) { + xa_unlock(obj->logical_page_table); + continue; + } + xa_unlock(obj->logical_page_table); + mutex_lock(&gm_mapping->lock); + if (gm_mapping_nomap(gm_mapping)) { + mutex_unlock(&gm_mapping->lock); + continue; + } else if (gm_mapping_cpu(gm_mapping)) { + zap_page_range_single(vma, start, page_size, NULL); + } else { + gmf.va = start; + gmf.dev = gm_mapping->dev; + ret = gm_mapping->dev->mmu->peer_unmap(&gmf); + if (ret) { + gmem_err("peer_unmap failed. ret %d\n", ret); + mutex_unlock(&gm_mapping->lock); + continue; + } + } + gm_mapping_flags_set(gm_mapping, GM_PAGE_NOMAP); + mutex_unlock(&gm_mapping->lock); + } + + return 0; +} + +static int hmadvise_do_eagerfree(unsigned long addr, size_t size) +{ + unsigned long start, end, i_start, i_end; + int page_size = HPAGE_SIZE; + struct vm_area_struct *vma; + int ret = GM_RET_SUCCESS; + unsigned long old_start; + + /* overflow */ + if (check_add_overflow(addr, size, &end)) { + gmem_err("addr plus size will cause overflow!\n"); + return -EINVAL; + } + + old_start = addr; + + /* Align addr by rounding inward to avoid excessive page release. 
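+	 * For example, addr = 0x40001000 with size = 0x400000 rounds to
+	 * start = 0x40200000 and end = 0x40400000: only the one huge page
+	 * that is fully covered is released; partially covered pages are kept.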
*/ + end = round_down(end, page_size); + start = round_up(addr, page_size); + if (start >= end) { + pr_debug("gmem:start align up 2M >= end align down 2M.\n"); + return ret; + } + + /* Check to see whether len was rounded up from small -ve to zero */ + if (old_start && !start) { + gmem_err("start addr align up 2M causes invalid addr"); + return -EINVAL; + } + + mmap_read_lock(current->mm); + do { + vma = find_vma_intersection(current->mm, start, end); + if (!vma) { + gmem_err("gmem: there is no valid vma\n"); + break; + } + + if (!vma_is_peer_shared(vma)) { + pr_debug("gmem:not peer-shared vma, skip dontneed\n"); + start = vma->vm_end; + continue; + } + + i_start = start > vma->vm_start ? start : vma->vm_start; + i_end = end < vma->vm_end ? end : vma->vm_end; + ret = gmem_unmap_vma_pages(vma, i_start, i_end, page_size); + if (ret) + break; + + start = vma->vm_end; + } while (start < end); + + mmap_read_unlock(current->mm); + return ret; +} + +static bool check_hmadvise_behavior(int behavior) +{ + return behavior == MADV_DONTNEED; +} + +int hmadvise_inner(int hnid, unsigned long start, size_t len_in, int behavior) +{ + int error = -EINVAL; + struct gm_dev *dev = NULL; + + if (hnid == -1) { + if (check_hmadvise_behavior(behavior)) { + goto no_hnid; + } else { + gmem_err("hmadvise: behavior %d need hnid or is invalid\n", + behavior); + return error; + } + } + + if (hnid < 0) { + gmem_err("hmadvise: invalid hnid %d < 0\n", hnid); + return error; + } + + if (!is_hnode(hnid) || !is_hnode_allowed(hnid)) { + gmem_err("hmadvise: can't find hnode by hnid:%d or hnode is not allowed\n", hnid); + return error; + } + + dev = get_gm_dev(hnid); + if (!dev) { + gmem_err("hmadvise: hnode id %d is invalid\n", hnid); + return error; + } + +no_hnid: + switch (behavior) { + case MADV_PREFETCH: + return hmadvise_do_prefetch(dev, start, len_in); + case MADV_DONTNEED: + return hmadvise_do_eagerfree(start, len_in); + default: + gmem_err("hmadvise: unsupported behavior %d\n", behavior); + } + + return error; +} +EXPORT_SYMBOL_GPL(hmadvise_inner); + +static bool hnid_match_dest(int hnid, struct gm_mapping *dest) +{ + return (hnid < 0) ? 
gm_mapping_cpu(dest) : gm_mapping_device(dest); +} + +static void do_hmemcpy(struct mm_struct *mm, int hnid, unsigned long dest, + unsigned long src, size_t size) +{ + enum gm_ret ret; + int page_size = HPAGE_SIZE; + struct vm_area_struct *vma_dest, *vma_src; + struct gm_mapping *gm_mapping_dest, *gm_mapping_src; + struct gm_dev *dev = NULL; + struct gm_memcpy_t gmc = {0}; + + if (size == 0) + return; + + mmap_read_lock(mm); + vma_dest = find_vma(mm, dest); + vma_src = find_vma(mm, src); + + if (!vma_src || vma_src->vm_start > src || !vma_dest || vma_dest->vm_start > dest) { + gmem_err("hmemcpy: the vma find by src/dest is NULL!\n"); + goto unlock_mm; + } + + gm_mapping_dest = vm_object_lookup(vma_dest->vm_obj, dest & ~(page_size - 1)); + gm_mapping_src = vm_object_lookup(vma_src->vm_obj, src & ~(page_size - 1)); + + if (!gm_mapping_src) { + gmem_err("hmemcpy: gm_mapping_src is NULL\n"); + goto unlock_mm; + } + + if (hnid != -1) { + dev = get_gm_dev(hnid); + if (!dev) { + gmem_err("hmemcpy: hnode's dev is NULL\n"); + goto unlock_mm; + } + } + + // Trigger dest page fault on host or device + if (!gm_mapping_dest || gm_mapping_nomap(gm_mapping_dest) + || !hnid_match_dest(hnid, gm_mapping_dest)) { + if (hnid == -1) { + ret = handle_mm_fault(vma_dest, dest & ~(page_size - 1), FAULT_FLAG_USER | + FAULT_FLAG_INSTRUCTION | FAULT_FLAG_WRITE, NULL); + if (ret) { + gmem_err("%s: failed to execute host page fault, ret:%d\n", + __func__, ret); + goto unlock_mm; + } + } else { + ret = gm_dev_fault_locked(mm, dest & ~(page_size - 1), dev, MADV_WILLNEED); + if (ret != GM_RET_SUCCESS) { + gmem_err("%s: failed to excecute dev page fault.\n", __func__); + goto unlock_mm; + } + } + } + if (!gm_mapping_dest) + gm_mapping_dest = vm_object_lookup(vma_dest->vm_obj, round_down(dest, page_size)); + + if (gm_mapping_dest && gm_mapping_dest != gm_mapping_src) + mutex_lock(&gm_mapping_dest->lock); + mutex_lock(&gm_mapping_src->lock); + // Use memcpy when there is no device address, otherwise use peer_memcpy + if (hnid == -1) { + if (gm_mapping_cpu(gm_mapping_src)) { // host to host + gmem_err("hmemcpy: host to host is unimplemented\n"); + goto unlock_gm_mmaping; + } else { // device to host + dev = gm_mapping_src->dev; + gmc.dma_addr = phys_to_dma(dev->dma_dev, + page_to_phys(gm_mapping_dest->page) + (dest & (page_size - 1))); + gmc.src = src; + } + } else { + if (gm_mapping_cpu(gm_mapping_src)) { // host to device + gmc.dest = dest; + gmc.dma_addr = phys_to_dma(dev->dma_dev, + page_to_phys(gm_mapping_src->page) + (src & (page_size - 1))); + } else { // device to device + gmem_err("hmemcpy: device to device is unimplemented\n"); + goto unlock_gm_mmaping; + } + } + gmc.mm = mm; + gmc.dev = dev; + gmc.size = size; + dev->mmu->peer_hmemcpy(&gmc); + +unlock_gm_mmaping: + mutex_unlock(&gm_mapping_src->lock); + if (gm_mapping_dest && gm_mapping_dest != gm_mapping_src) + mutex_unlock(&gm_mapping_dest->lock); +unlock_mm: + mmap_read_unlock(mm); +} + +/* + * Each page needs to be copied in three parts when the address is not aligned. 
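+ * For example, src_offset = 0x1000 and dst_offset = 0x3000 give
+ * ml = { 0x1FD000, 0x2000, 0x1000 }; each chunk ends exactly on a 2MB
+ * boundary of either the source or the destination, as sketched below: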
+ * | ml <--0-->|<1><--2-> | + * | -------|--------- | + * | / /| / / | + * | / / | / / | + * | / / |/ / | + * | ----------|------ | + * | | | + * |<----page x---->|<----page y---->| + */ + +static void __hmemcpy(int hnid, unsigned long dest, unsigned long src, size_t size) +{ + int i = 0; + // offsets within the huge page for the source and destination addresses + int src_offset = src & (HPAGE_SIZE - 1); + int dst_offset = dest & (HPAGE_SIZE - 1); + // Divide each page into three parts according to the align + int ml[3] = { + HPAGE_SIZE - (src_offset < dst_offset ? dst_offset : src_offset), + src_offset < dst_offset ? (dst_offset - src_offset) : (src_offset - dst_offset), + src_offset < dst_offset ? src_offset : dst_offset + }; + struct mm_struct *mm = current->mm; + + if (size == 0) + return; + + while (size >= ml[i]) { + if (ml[i] > 0) { + do_hmemcpy(mm, hnid, dest, src, ml[i]); + src += ml[i]; + dest += ml[i]; + size -= ml[i]; + } + i = (i + 1) % 3; + } + + if (size > 0) + do_hmemcpy(mm, hnid, dest, src, size); +} + +int hmemcpy(int hnid, unsigned long dest, unsigned long src, size_t size) +{ + struct vm_area_struct *vma_dest, *vma_src; + struct mm_struct *mm = current->mm; + + if (hnid < 0) { + if (hnid != -1) { + gmem_err("hmemcpy: invalid hnid %d < 0\n", hnid); + return -EINVAL; + } + } else if (!is_hnode(hnid) || !is_hnode_allowed(hnid)) { + gmem_err("hmemcpy: can't find hnode by hnid:%d or hnode is not allowed\n", hnid); + return -EINVAL; + } + + mmap_read_lock(mm); + vma_dest = find_vma(mm, dest); + vma_src = find_vma(mm, src); + + if ((ULONG_MAX - size < src) || !vma_src || vma_src->vm_start > src || + !vma_is_peer_shared(vma_src) || vma_src->vm_end < (src + size)) { + gmem_err("failed to find peer_shared vma by invalid src or size\n"); + goto unlock; + } + + if ((ULONG_MAX - size < dest) || !vma_dest || vma_dest->vm_start > dest || + !vma_is_peer_shared(vma_dest) || vma_dest->vm_end < (dest + size)) { + gmem_err("failed to find peer_shared vma by invalid dest or size\n"); + goto unlock; + } + + if (!(vma_dest->vm_flags & VM_WRITE)) { + gmem_err("dest is not writable.\n"); + goto unlock; + } + mmap_read_unlock(mm); + + __hmemcpy(hnid, dest, src, size); + + return 0; + +unlock: + mmap_read_unlock(mm); + return -EINVAL; +} +EXPORT_SYMBOL_GPL(hmemcpy); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7045b7b7ac4aceb54f490c2f6b21480a9dad4483..ce5162d952adc41ce43b9ccadf9fd45cc1f0c374 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -38,6 +38,10 @@ #include #include #include +#ifdef CONFIG_GMEM +#include +#endif + #include #include @@ -1318,6 +1322,12 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, pgtable_t pgtable; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; vm_fault_t ret = 0; +#ifdef CONFIG_GMEM + struct gm_mapping *gm_mapping = NULL; + + if (vma_is_peer_shared(vma)) + gm_mapping = vm_object_lookup(vma->vm_obj, haddr); +#endif VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); @@ -1327,7 +1337,8 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, count_vm_event(THP_FAULT_FALLBACK_CHARGE); count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK); count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); - return VM_FAULT_FALLBACK; + ret = VM_FAULT_FALLBACK; + goto gm_mapping_release; } folio_throttle_swaprate(folio, gfp); @@ -1337,7 +1348,16 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, goto release; } +#ifdef CONFIG_GMEM + /* + * gmem device overcommit needs to reload 
the swapped page, + * so skip it to avoid clearing device data. + */ + if (!vma_is_peer_shared(vma) || !gm_mapping_cpu(gm_mapping)) + clear_huge_page(page, vmf->address, HPAGE_PMD_NR); +#else clear_huge_page(page, vmf->address, HPAGE_PMD_NR); +#endif /* * The memory barrier inside __folio_mark_uptodate makes sure that * clear_huge_page writes become visible before the set_pmd_at() @@ -1362,7 +1382,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, pte_free(vma->vm_mm, pgtable); ret = handle_userfault(vmf, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); - return ret; + goto gm_mapping_release; } entry = mk_huge_pmd(page, vma->vm_page_prot); @@ -1370,6 +1390,14 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma) && gm_mapping_device(gm_mapping)) { + vmf->page = page; + ret = gm_host_fault_locked(vmf, PMD_ORDER); + if (ret) + goto unlock_release; + } +#endif set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); @@ -1379,6 +1407,13 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, count_vm_event(THP_FAULT_ALLOC); count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma)) { + gm_mapping_flags_set(gm_mapping, GM_PAGE_CPU); + gm_mapping->page = page; + mutex_unlock(&gm_mapping->lock); + } +#endif } return 0; @@ -1388,6 +1423,11 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, if (pgtable) pte_free(vma->vm_mm, pgtable); folio_put(folio); +gm_mapping_release: +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma)) + mutex_unlock(&gm_mapping->lock); +#endif return ret; } @@ -1446,7 +1486,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; gfp_t gfp; - struct folio *folio; + struct folio *folio = NULL; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; vm_fault_t ret; @@ -1455,10 +1495,12 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) ret = vmf_anon_prepare(vmf); if (ret) return ret; + khugepaged_enter_vma(vma, vma->vm_flags); if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm) && + !vma_is_peer_shared(vma) && transparent_hugepage_use_zero_page()) { pgtable_t pgtable; struct page *zero_page; @@ -1497,14 +1539,81 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) return ret; } gfp = vma_thp_gfp_mask(vma); + folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true); + if (unlikely(!folio)) { count_vm_event(THP_FAULT_FALLBACK); count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK); - return VM_FAULT_FALLBACK; + ret = VM_FAULT_FALLBACK; + goto gm_mapping_release; + } + return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp); +gm_mapping_release: + return ret; +} + +#ifdef CONFIG_GMEM +vm_fault_t do_huge_pmd_anonymous_page_with_peer_shared(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + gfp_t gfp; + struct folio *folio = NULL; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; + vm_fault_t ret; + struct gm_mapping *gm_mapping; + + xa_lock(vma->vm_obj->logical_page_table); + gm_mapping = vm_object_lookup(vma->vm_obj, haddr); + if (!gm_mapping) { + 
vm_object_mapping_create(vma->vm_obj, haddr); + gm_mapping = vm_object_lookup(vma->vm_obj, haddr); + } + xa_unlock(vma->vm_obj->logical_page_table); + if (unlikely(!gm_mapping)) { + gmem_err("OOM when creating vm_obj!\n"); + return VM_FAULT_OOM; + } + mutex_lock(&gm_mapping->lock); + if (unlikely(!pmd_none(*vmf->pmd))) + goto gm_mapping_release; + + if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) { + ret = VM_FAULT_FALLBACK; + goto gm_mapping_release; + } + ret = vmf_anon_prepare(vmf); + if (ret) + goto gm_mapping_release; + + khugepaged_enter_vma(vma, vma->vm_flags); + + gfp = vma_thp_gfp_mask(vma); + + /* + * gmem support device memory overcommit, which uses host page + * as the device's swap space. When device needs to reload data, + * remap the swapped page. + */ + if (gm_mapping_cpu(gm_mapping)) + folio = page_folio(gm_mapping->page); + if (!folio) { + gfp = GFP_TRANSHUGE; + folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true); + } + + if (unlikely(!folio)) { + count_vm_event(THP_FAULT_FALLBACK); + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK); + ret = VM_FAULT_FALLBACK; + goto gm_mapping_release; } return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp); +gm_mapping_release: + mutex_unlock(&gm_mapping->lock); + return ret; } +#endif static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write, diff --git a/mm/memory.c b/mm/memory.c index c81a2c3be01318d2b99df4247c0674d2f1795d7a..4509798a96e90c8a0d48e66bb13bef3250d9568c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -77,6 +77,10 @@ #include #include #include +#ifdef CONFIG_GMEM +#include +#endif + #include #include @@ -1710,6 +1714,50 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, return addr; } +#ifdef CONFIG_GMEM +static inline void zap_logic_pmd_range(struct vm_area_struct *vma, + unsigned long addr, + unsigned long end) +{ + struct gm_mapping *gm_mapping = NULL; + struct page *page = NULL; + + if (!vma->vm_obj) + return; + + xa_lock(vma->vm_obj->logical_page_table); + gm_mapping = vm_object_lookup(vma->vm_obj, addr); + + if (gm_mapping && gm_mapping_cpu(gm_mapping)) { + page = gm_mapping->page; + if (page && (page_ref_count(page) != 0)) { + put_page(page); + gm_mapping->page = NULL; + } + } + xa_unlock(vma->vm_obj->logical_page_table); +} + +static inline void zap_logic_pud_range(struct vm_area_struct *vma, + unsigned long addr, + unsigned long end) +{ + unsigned long next; + + do { + next = pmd_addr_end(addr, end); + zap_logic_pmd_range(vma, addr, next); + } while (addr = next, addr != end); +} +#else +static inline void zap_logic_pmd_range(struct vm_area_struct *vma, + unsigned long addr, + unsigned long end) {} +static inline void zap_logic_pud_range(struct vm_area_struct *vma, + unsigned long addr, + unsigned long end) {} +#endif + static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, @@ -1740,6 +1788,19 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, */ spin_unlock(ptl); } +#ifdef CONFIG_GMEM + /* + * Here there can be other concurrent MADV_DONTNEED or + * trans huge page faults running, and if the pmd is + * none or trans huge it can change under us. This is + * because MADV_DONTNEED holds the mmap_lock in read + * mode. 
+ */ + if (vma_is_peer_shared(vma)) { + if (pmd_none_or_clear_bad(pmd) || pmd_trans_huge(*pmd)) + zap_logic_pmd_range(vma, addr, next); + } +#endif if (pmd_none(*pmd)) { addr = next; continue; @@ -1771,8 +1832,11 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb, goto next; /* fall through */ } - if (pud_none_or_clear_bad(pud)) + if (pud_none_or_clear_bad(pud)) { + if (vma_is_peer_shared(vma)) + zap_logic_pud_range(vma, addr, next); continue; + } next = zap_pmd_range(tlb, vma, pud, addr, next, details); next: cond_resched(); @@ -1792,8 +1856,11 @@ static inline unsigned long zap_p4d_range(struct mmu_gather *tlb, p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); - if (p4d_none_or_clear_bad(p4d)) + if (p4d_none_or_clear_bad(p4d)) { + if (vma_is_peer_shared(vma)) + zap_logic_pud_range(vma, addr, next); continue; + } next = zap_pud_range(tlb, vma, p4d, addr, next, details); } while (p4d++, addr = next, addr != end); @@ -1813,8 +1880,13 @@ void unmap_page_range(struct mmu_gather *tlb, pgd = pgd_offset(vma->vm_mm, addr); do { next = pgd_addr_end(addr, end); - if (pgd_none_or_clear_bad(pgd)) + if (pgd_none_or_clear_bad(pgd)) { +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma)) + zap_logic_pud_range(vma, addr, next); +#endif continue; + } next = zap_p4d_range(tlb, vma, pgd, addr, next, details); } while (pgd++, addr = next, addr != end); tlb_end_vma(tlb, vma); @@ -5634,8 +5706,17 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; +#ifdef CONFIG_GMEM + if (vma_is_anonymous(vma)) { + if (vma_is_peer_shared(vma)) + return do_huge_pmd_anonymous_page_with_peer_shared(vmf); + else + return do_huge_pmd_anonymous_page(vmf); + } +#else if (vma_is_anonymous(vma)) return do_huge_pmd_anonymous_page(vmf); +#endif if (vma->vm_ops->huge_fault) return vma->vm_ops->huge_fault(vmf, PMD_ORDER); return VM_FAULT_FALLBACK; @@ -5822,7 +5903,9 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, pgd_t *pgd; p4d_t *p4d; vm_fault_t ret; - +#ifdef CONFIG_GMEM + char *thp_enable_path = "/sys/kernel/mm/transparent_hugepage/enabled"; +#endif pgd = pgd_offset(mm, address); p4d = p4d_alloc(mm, pgd, address); if (!p4d) @@ -5875,9 +5958,21 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; + if (vma_is_peer_shared(vma)) + return VM_FAULT_OOM; } else { vmf.orig_pmd = pmdp_get_lockless(vmf.pmd); +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma) && pmd_none(*vmf.pmd) && + (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags))) { + /* if transparent hugepage is not enabled, return pagefault failed */ + gmem_err("transparent hugepage is not enabled. 
check %s\n", + thp_enable_path); + return VM_FAULT_SIGBUS; + } +#endif + if (unlikely(is_swap_pmd(vmf.orig_pmd))) { VM_BUG_ON(thp_migration_supported() && !is_pmd_migration_entry(vmf.orig_pmd)); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 1587efaf777e3b1fc40eb35f51d012dab3533133..d397307de792be813d4af356fc70f4ae78d53bcf 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1902,8 +1902,13 @@ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, bool vma_migratable(struct vm_area_struct *vma) { +#ifdef CONFIG_GMEM + if (vma->vm_flags & (VM_IO | VM_PFNMAP | VM_PEER_SHARED)) + return false; +#else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) return false; +#endif /* * DAX device mappings require predictable access latency, so avoid diff --git a/mm/mm_init.c b/mm/mm_init.c index 6677aaa5972d4e97fe5604d64d73dab3903fe7c6..1a3d3b6e52c9c20d73f7b557663b67eb86d71960 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -30,6 +30,9 @@ #include "internal.h" #include "slab.h" #include "shuffle.h" +#ifdef CONFIG_GMEM +#include +#endif #include @@ -2797,6 +2800,9 @@ static void __init mem_init_print_info(void) */ void __init mm_core_init(void) { +#ifdef CONFIG_GMEM + hnuma_init(); +#endif /* Initializations relying on SMP setup */ build_all_zonelists(NULL); page_alloc_init_cpuhp(); diff --git a/mm/mmap.c b/mm/mmap.c index 32799ed58022740bb08d25e62c5aa300ea286b1e..3a97de39adc81241ac8442b83d7879b7a7e9245a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -47,6 +47,10 @@ #include #include #include +#ifdef CONFIG_GMEM +#include +#endif + #include #include @@ -642,7 +646,9 @@ static inline int dup_anon_vma(struct vm_area_struct *dst, */ if (src->anon_vma && !dst->anon_vma) { int ret; - +#ifdef CONFIG_GMEM + dup_vm_object(dst, src, true); +#endif vma_assert_write_locked(dst); dst->anon_vma = src->anon_vma; ret = anon_vma_clone(dst, src); @@ -1073,6 +1079,11 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, vma_iter_store(vmi, vma); if (adj_start) { +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(adjust)) + vm_object_adjust(adjust, adjust->vm_start + adj_start, + adjust->vm_end); +#endif adjust->vm_start += adj_start; adjust->vm_pgoff += adj_start >> PAGE_SHIFT; if (adj_start < 0) { @@ -1307,11 +1318,21 @@ unsigned long __do_mmap_mm(struct mm_struct *mm, struct file *file, unsigned lon /* Obtain the address to map to. we verify (or select) it and ensure * that it represents a valid section of the address space. 
*/ +#ifdef CONFIG_GMEM + if (gmem_is_enabled() && (flags & MAP_PEER_SHARED)) { + len = round_up(len, PMD_SIZE); + addr = get_unmapped_area_aligned(file, addr, len, pgoff, flags, + PMD_SIZE); + } else { + addr = get_unmapped_area(file, addr, len, pgoff, flags); + } +#else addr = get_unmapped_area(file, addr, len, pgoff, flags); +#endif if (IS_ERR_VALUE(addr)) return addr; - if (flags & MAP_FIXED_NOREPLACE) { + if ((flags & MAP_FIXED_NOREPLACE) || (gmem_is_enabled() && (flags & MAP_PEER_SHARED))) { if (find_vma_intersection(mm, addr, addr + len)) return -EEXIST; } @@ -1430,6 +1451,14 @@ unsigned long __do_mmap_mm(struct mm_struct *mm, struct file *file, unsigned lon if (file && is_file_hugepages(file)) vm_flags |= VM_NORESERVE; } +#ifdef CONFIG_GMEM + if (flags & MAP_PEER_SHARED) { + if (gmem_is_enabled()) + vm_flags |= VM_PEER_SHARED; + else + return -EINVAL; + } +#endif addr = __mmap_region_ext(mm, file, addr, len, vm_flags, pgoff, uf); if (!IS_ERR_VALUE(addr) && @@ -1438,6 +1467,7 @@ unsigned long __do_mmap_mm(struct mm_struct *mm, struct file *file, unsigned lon *populate = len; return addr; } +EXPORT_SYMBOL(__do_mmap_mm); unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, @@ -1456,7 +1486,26 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, struct file *file = NULL; unsigned long retval; +#ifdef CONFIG_GMEM + if (gmem_is_enabled() && (flags & MAP_SHARED) && (flags & MAP_PEER_SHARED)) { + retval = -EINVAL; + gmem_err(" MAP_PEER_SHARED and MAP_SHARE cannot be used together.\n"); + goto out_fput; + } + if (gmem_is_enabled() && (flags & MAP_HUGETLB) && (flags & MAP_PEER_SHARED)) { + retval = -EINVAL; + gmem_err(" MAP_PEER_SHARED and MAP_HUGETLB cannot be used together.\n"); + goto out_fput; + } +#endif if (!(flags & MAP_ANONYMOUS)) { +#ifdef CONFIG_GMEM + if (gmem_is_enabled() && (flags & MAP_PEER_SHARED)) { + retval = -EINVAL; + gmem_err(" MAP_PEER_SHARED cannot map file page.\n"); + goto out_fput; + } +#endif audit_mmap_fd(fd, flags); file = fget(fd); if (!file) @@ -1924,6 +1973,29 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, EXPORT_SYMBOL(get_unmapped_area); +#ifdef CONFIG_GMEM +unsigned long +get_unmapped_area_aligned(struct file *file, unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags, unsigned long align) +{ + if (len > TASK_SIZE) + return -ENOMEM; + + addr = current->mm->get_unmapped_area(file, addr, len + align, pgoff, flags); + if (IS_ERR_VALUE(addr)) + return addr; + + addr = round_up(addr, align); + if (addr > TASK_SIZE - len) + return -ENOMEM; + if (!IS_ALIGNED(addr, PMD_SIZE)) + return -EINVAL; + + return addr; +} +EXPORT_SYMBOL(get_unmapped_area_aligned); +#endif + /** * find_vma_intersection() - Look up the first VMA which intersects the interval * @mm: The process address space. 
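
For reviewers, a minimal userspace sketch (not part of the diff) of how a MAP_PEER_SHARED mapping is expected to be requested given the checks added above: anonymous and private only, since MAP_SHARED, MAP_HUGETLB and fd-backed mappings are rejected, and the length is rounded up to PMD_SIZE by the kernel. The MAP_PEER_SHARED value below is only a placeholder; the real definition comes from the uapi header added elsewhere in this series.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/mman.h>

    #ifndef MAP_PEER_SHARED
    #define MAP_PEER_SHARED 0x8000000	/* placeholder only; use the uapi definition */
    #endif

    #define SZ_2M (2UL * 1024 * 1024)

    int main(void)
    {
    	size_t len = 4 * SZ_2M;	/* the kernel rounds len up to PMD_SIZE anyway */
    	void *buf;

    	/* MAP_PEER_SHARED must be anonymous + private; MAP_SHARED, MAP_HUGETLB
    	 * and file-backed mappings are rejected in ksys_mmap_pgoff(). */
    	buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
    		   MAP_PRIVATE | MAP_ANONYMOUS | MAP_PEER_SHARED, -1, 0);
    	if (buf == MAP_FAILED) {
    		perror("mmap(MAP_PEER_SHARED)");
    		return EXIT_FAILURE;
    	}

    	/* First touch faults in PMD-sized chunks through the peer-shared THP
    	 * path, so transparent hugepages must be enabled. */
    	memset(buf, 0, len);

    	/* The unmap length is rounded up as well, so a peer-shared huge page
    	 * is never partially released. */
    	munmap(buf, len);
    	return EXIT_SUCCESS;
    }

The 2MB rounding on both the mmap and munmap sides is what lets the logical page table track mapping state at huge-page granularity.
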
@@ -2462,7 +2534,9 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, err = anon_vma_clone(new, vma); if (err) goto out_free_mpol; - +#ifdef CONFIG_GMEM + dup_vm_object(new, vma, false); +#endif if (new->vm_file) get_file(new->vm_file); @@ -2477,6 +2551,18 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_prepare(&vp); vma_adjust_trans_huge(vma, vma->vm_start, addr, 0); +#ifdef CONFIG_GMEM
+ if (vma_is_peer_shared(vma)) {
+ if (new_below) {
+ vm_object_adjust(new, new->vm_start, addr);
+ vm_object_adjust(vma, addr, vma->vm_end);
+ } else {
+ vm_object_adjust(vma, vma->vm_start, addr);
+ vm_object_adjust(new, addr, new->vm_end);
+ }
+ }
+#endif
+
 if (new_below) { vma->vm_start = addr; vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT; @@ -2514,6 +2600,122 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, return __split_vma(vmi, vma, addr, new_below); } +#ifdef CONFIG_GMEM
+static void munmap_single_vma_in_peer_devices(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long start_addr, unsigned long end_addr)
+{
+ unsigned long start, end, addr;
+ struct vm_object *obj = vma->vm_obj;
+ enum gm_ret ret;
+ struct gm_context *ctx, *tmp;
+ struct gm_mapping *gm_mapping;
+
+ struct gm_fault_t gmf = {
+ .mm = mm,
+ .copy = false,
+ };
+
+ start = max(vma->vm_start, start_addr);
+ if (start >= vma->vm_end)
+ return;
+ addr = start;
+ end = min(vma->vm_end, end_addr);
+ if (end <= vma->vm_start)
+ return;
+
+ if (!obj)
+ return;
+
+ if (!mm->gm_as)
+ return;
+
+ do {
+ xa_lock(obj->logical_page_table);
+ gm_mapping = vm_object_lookup(obj, addr);
+ if (!gm_mapping) {
+ xa_unlock(obj->logical_page_table);
+ continue;
+ }
+ xa_unlock(obj->logical_page_table);
+
+ mutex_lock(&gm_mapping->lock);
+ if (!gm_mapping_device(gm_mapping)) {
+ mutex_unlock(&gm_mapping->lock);
+ continue;
+ }
+
+ gmf.va = addr;
+ gmf.size = HPAGE_SIZE;
+ gmf.dev = gm_mapping->dev;
+ ret = gm_mapping->dev->mmu->peer_unmap(&gmf);
+ if (ret != GM_RET_SUCCESS) {
+ gmem_err("%s: call dev peer_unmap error %d\n", __func__, ret);
+ mutex_unlock(&gm_mapping->lock);
+ continue;
+ }
+ mutex_unlock(&gm_mapping->lock);
+ } while (addr += HPAGE_SIZE, addr != end);
+
+ list_for_each_entry_safe(ctx, tmp, &mm->gm_as->gm_ctx_list, gm_as_link) {
+ if (!gm_dev_is_peer(ctx->dev))
+ continue;
+ if (!ctx->dev->mmu->peer_va_free)
+ continue;
+
+ gmf.va = start;
+ gmf.size = end - start;
+ gmf.dev = ctx->dev;
+
+ ret = ctx->dev->mmu->peer_va_free(&gmf);
+ if (ret != GM_RET_SUCCESS)
+ pr_debug("gmem: free_vma failed, ret %d\n", ret);
+ }
+}
+
+static void munmap_in_peer_devices(struct mm_struct *mm, unsigned long start, unsigned long end)
+{
+ struct vm_area_struct *vma;
+
+ VMA_ITERATOR(vmi, mm, start);
+ for_each_vma_range(vmi, vma, end) {
+ if (vma_is_peer_shared(vma))
+ munmap_single_vma_in_peer_devices(mm, vma, start, end);
+ }
+}
+
+static unsigned long gmem_unmap_align(struct mm_struct *mm, unsigned long start, size_t len)
+{
+ struct vm_area_struct *vma, *vma_end;
+
+ vma = find_vma_intersection(mm, start, start + len);
+ vma_end = find_vma(mm, start + len);
+ if (!vma || !vma_is_peer_shared(vma))
+ return 0;
+ if (vma_is_peer_shared(vma)) {
+ if (!IS_ALIGNED(start, PMD_SIZE))
+ return -EINVAL;
+ }
+
+ /* Prevents partial release of the peer_shared page. 
*/ + if (vma_end && vma_end->vm_start < (start + len) && vma_is_peer_shared(vma_end)) + len = round_up(len, SZ_2M); + return len; +} + +static void gmem_unmap_region(struct mm_struct *mm, unsigned long start, size_t len) +{ + unsigned long end, ret; + + ret = gmem_unmap_align(mm, start, len); + + if (!ret || IS_ERR_VALUE(ret)) + return; + + end = start + ret; + munmap_in_peer_devices(mm, start, end); +} +#endif + /* * do_vmi_align_munmap() - munmap the aligned region from @start to @end. * @vmi: The vma iterator @@ -2588,6 +2790,10 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, locked_vm += vma_pages(next); count++; +#ifdef CONFIG_GMEM + if (gmem_is_enabled()) + munmap_single_vma_in_peer_devices(mm, vma, start, end); +#endif if (unlikely(uf)) { /* * If userfaultfd_unmap_prep returns an error the vmas @@ -2644,6 +2850,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, prev = vma_iter_prev_range(vmi); next = vma_next(vmi); + if (next) vma_iter_prev_range(vmi); @@ -2702,6 +2909,17 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long end; struct vm_area_struct *vma; +#ifdef CONFIG_GMEM + if (gmem_is_enabled()) { + unsigned long ret = gmem_unmap_align(mm, start, len); + + if (IS_ERR_VALUE(ret)) + return ret; + else if (ret) + len = ret; + } +#endif + if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) return -EINVAL; @@ -2736,6 +2954,10 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, { VMA_ITERATOR(vmi, mm, start); +#ifdef CONFIG_GMEM + if (gmem_is_enabled()) + gmem_unmap_region(mm, start, len); +#endif return do_vmi_munmap(&vmi, mm, start, len, uf, false); } @@ -2765,21 +2987,24 @@ static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, nr_pages = count_vma_pages_range(mm, addr, end); if (!may_expand_vm(mm, vm_flags, - (len >> PAGE_SHIFT) - nr_pages)) + (len >> PAGE_SHIFT) - nr_pages)) { return -ENOMEM; + } } /* Unmap any existing mapping in the area */ - if (do_vmi_munmap(&vmi, mm, addr, len, uf, false)) + if (do_vmi_munmap(&vmi, mm, addr, len, uf, false)) { return -ENOMEM; + } /* * Private writable mapping: check memory availability */ if (accountable_mapping(file, vm_flags)) { charged = len >> PAGE_SHIFT; - if (security_vm_enough_memory_mm(mm, charged)) + if (security_vm_enough_memory_mm(mm, charged)) { return -ENOMEM; + } vm_flags |= VM_ACCOUNT; } @@ -3020,6 +3245,11 @@ static int __vm_munmap(unsigned long start, size_t len, bool unlock) if (sp_check_addr(start)) return -EINVAL; +#ifdef CONFIG_GMEM + if (gmem_is_enabled()) + gmem_unmap_region(mm, start, len); +#endif + if (mmap_write_lock_killable(mm)) return -EINTR; @@ -3401,6 +3631,10 @@ void exit_mmap(struct mm_struct *mm) __mt_destroy(&mm->mm_mt); mmap_write_unlock(mm); vm_unacct_memory(nr_accounted); +#ifdef CONFIG_GMEM + if (gmem_is_enabled() && mm->gm_as) + gm_as_destroy(mm->gm_as); +#endif } /* Insert vm structure into process list sorted by address diff --git a/mm/mprotect.c b/mm/mprotect.c index ed08f87e39c44e8f5f7149c2f2eb2acace593dd3..55367abe168bc4bc710b356e0f43d31f515c7994 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -693,7 +693,11 @@ static int do_mprotect_pkey(unsigned long start, size_t len, unsigned long prot, int pkey) { unsigned long nstart, end, tmp, reqprot; +#ifdef CONFIG_GMEM + struct vm_area_struct *vma, *prev, *vma_end; +#else struct vm_area_struct *vma, *prev; +#endif int error; const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP); const bool rier = 
(current->personality & READ_IMPLIES_EXEC) && @@ -736,7 +740,19 @@ static int do_mprotect_pkey(unsigned long start, size_t len, error = -ENOMEM; if (!vma) goto out; - +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma)) { + start = ALIGN_DOWN(start, HPAGE_SIZE); + vma_end = find_vma(current->mm, end); + if (vma_end && vma_end->vm_start < end && vma_is_peer_shared(vma_end)) + end = ALIGN(end, HPAGE_SIZE); + if (end <= start) { + error = -ENOMEM; + goto out; + } + len = end - start; + } +#endif if (unlikely(grows & PROT_GROWSDOWN)) { if (vma->vm_start >= end) goto out; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c233d61d0d06df9a48b779ad600d094ddd95510a..80b29d946a0d92ff57891a903cd4f252bd0434ce 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -194,6 +194,9 @@ EXPORT_SYMBOL(latent_entropy); nodemask_t node_states[NR_NODE_STATES] __read_mostly = { [N_POSSIBLE] = NODE_MASK_ALL, [N_ONLINE] = { { [0] = 1UL } }, +#ifdef CONFIG_GMEM + [N_HETEROGENEOUS] = NODE_MASK_NONE, +#endif #ifndef CONFIG_NUMA [N_NORMAL_MEMORY] = { { [0] = 1UL } }, #ifdef CONFIG_HIGHMEM diff --git a/mm/util.c b/mm/util.c index f3d6751b2f2a6fc58cd9ce1364fcad3678a01190..77149510fdd2b55af0f934ab494bfbe4492d5f35 100644 --- a/mm/util.c +++ b/mm/util.c @@ -27,6 +27,9 @@ #include #include +#ifdef CONFIG_GMEM +#include +#endif #include "internal.h" #include "swap.h" @@ -545,6 +548,114 @@ int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc) } EXPORT_SYMBOL_GPL(account_locked_vm); +#ifdef CONFIG_GMEM +static unsigned long alloc_va_in_peer_devices(unsigned long addr, unsigned long len, + unsigned long flag) +{ + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; + struct gm_context *ctx, *tmp; + unsigned long prot = VM_NONE; + enum gm_ret ret; + char *thp_enable_path = "/sys/kernel/mm/transparent_hugepage/enabled"; + + vma = find_vma(mm, addr); + if (!vma) { + gmem_err("vma for addr %lx is NULL, should not happen\n", addr); + return -EINVAL; + } + + if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags)) { + gmem_err("transparent hugepage is not enabled. check %s\n", + thp_enable_path); + return -EINVAL; + } + + prot |= vma->vm_flags; + + if (!mm->gm_as) { + ret = gm_as_create(0, ULONG_MAX, GM_AS_ALLOC_DEFAULT, HPAGE_SIZE, &mm->gm_as); + if (ret) { + gmem_err("gm_as_create failed\n"); + return ret; + } + } + + ret = -ENODEV; + // TODO: consider the concurrency problem of device attaching/detaching from the gm_as. 
+ list_for_each_entry_safe(ctx, tmp, &mm->gm_as->gm_ctx_list, gm_as_link) {
+ struct gm_fault_t gmf = {
+ .mm = mm,
+ .dev = ctx->dev,
+ .va = addr,
+ .size = len,
+ .prot = prot,
+ };
+
+ if (!gm_dev_is_peer(ctx->dev))
+ continue;
+
+ if (!ctx->dev->mmu->peer_va_alloc_fixed) {
+ pr_debug("gmem: mmu ops has no alloc_vma\n");
+ continue;
+ }
+
+ ret = ctx->dev->mmu->peer_va_alloc_fixed(&gmf);
+ if (ret != GM_RET_SUCCESS) {
+ gmem_err("device mmap failed\n");
+ return ret;
+ }
+ }
+
+ if (!vma->vm_obj)
+ vma->vm_obj = vm_object_create(vma);
+ if (!vma->vm_obj)
+ return -ENOMEM;
+
+ return ret;
+}
+
+struct gmem_vma_list {
+ unsigned long start;
+ size_t len;
+ struct list_head list;
+};
+
+static void gmem_reserve_vma(struct mm_struct *mm, unsigned long start,
+ size_t len, struct list_head *head)
+{
+ struct vm_area_struct *vma;
+ struct gmem_vma_list *node = kmalloc(sizeof(struct gmem_vma_list), GFP_KERNEL);
+
+ if (!node)
+ return;
+
+ vma = find_vma(mm, start);
+ if (!vma || vma->vm_start >= start + len) {
+ kfree(node);
+ return;
+ }
+ vm_flags_clear(vma, VM_PEER_SHARED);
+
+ node->start = start;
+ node->len = round_up(len, SZ_2M);
+ list_add_tail(&node->list, head);
+}
+
+static void gmem_release_vma(struct mm_struct *mm, struct list_head *head)
+{
+ struct gmem_vma_list *node, *next;
+
+ list_for_each_entry_safe(node, next, head, list) {
+ unsigned long start = node->start;
+ size_t len = node->len;
+
+ if (len)
+ vm_munmap(start, len);
+
+ list_del(&node->list);
+ kfree(node);
+ }
+}
+#endif
+
 unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flag, unsigned long pgoff) @@ -553,7 +664,11 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, struct mm_struct *mm = current->mm; unsigned long populate; LIST_HEAD(uf); - +#ifdef CONFIG_GMEM + unsigned int retry_times = 0; + LIST_HEAD(reserve_list); +retry: +#endif ret = security_mmap_file(file, prot, flag); if (!ret) { if (mmap_write_lock_killable(mm)) @@ -564,6 +679,27 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(ret, populate); +#ifdef CONFIG_GMEM
+ if (gmem_is_enabled() && !IS_ERR_VALUE(ret) && flag & MAP_PEER_SHARED) {
+ enum gm_ret gm_ret = 0;
+
+ gm_ret = alloc_va_in_peer_devices(ret, len, flag);
+ /*
+ * If alloc_va_in_peer_devices() fails, add the vma to reserve_list
+ * and release it after a proper vma is found.
+ */
+ if (gm_ret == GM_RET_NOMEM && retry_times < GMEM_MMAP_RETRY_TIMES) {
+ retry_times++;
+ gmem_reserve_vma(mm, ret, len, &reserve_list);
+ goto retry;
+ } else if (gm_ret != GM_RET_SUCCESS) {
+ gmem_err("alloc vma ret %lu\n", ret);
+ gmem_reserve_vma(mm, ret, len, &reserve_list);
+ ret = -ENOMEM;
+ }
+ gmem_release_vma(mm, &reserve_list);
+ }
+#endif } return ret; } diff --git a/mm/vm_object.c b/mm/vm_object.c new file mode 100644 index 0000000000000000000000000000000000000000..3c8932c47270b49a32e33c8e41b306a197435cbc --- /dev/null +++ b/mm/vm_object.c @@ -0,0 +1,244 @@ +// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Logical Mapping Management
+ *
+ * Copyright (C) 2023- Huawei, Inc. 
+ * Author: Weixi zhu, chao Liu + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Sine VM_OBJECT maintains the logical page table under each VMA, and each VMA + * points to a VM_OBJECT. Ultimately VM_OBJECTs must be maintained as long as VMA + * gets changed: merge, split, adjust + */ +static struct kmem_cache *vm_object_cachep; +static struct kmem_cache *gm_mapping_cachep; + +/* gm_mapping will not be release dynamically */ +struct gm_mapping *alloc_gm_mapping(void) +{ + struct gm_mapping *gm_mapping = kmem_cache_zalloc(gm_mapping_cachep, GFP_KERNEL); + + if (!gm_mapping) + return NULL; + + gm_mapping_flags_set(gm_mapping, GM_PAGE_NOMAP); + mutex_init(&gm_mapping->lock); + + return gm_mapping; +} +EXPORT_SYMBOL(alloc_gm_mapping); + +static inline void release_gm_mapping(struct gm_mapping *mapping) +{ + kmem_cache_free(gm_mapping_cachep, mapping); +} + +static inline struct gm_mapping *lookup_gm_mapping(struct vm_object *obj, unsigned long pindex) +{ + return xa_load(obj->logical_page_table, pindex); +} + +int __init vm_object_init(void) +{ + vm_object_cachep = KMEM_CACHE(vm_object, 0); + if (!vm_object_cachep) + goto out; + + gm_mapping_cachep = KMEM_CACHE(gm_mapping, 0); + if (!gm_mapping_cachep) + goto free_vm_object; + + return 0; +free_vm_object: + kmem_cache_destroy(vm_object_cachep); +out: + return -ENOMEM; +} + +/* + * Create a VM_OBJECT and attach it to a VMA + * This should be called when a VMA is created. + */ +struct vm_object *vm_object_create(struct vm_area_struct *vma) +{ + struct vm_object *obj = kmem_cache_alloc(vm_object_cachep, GFP_KERNEL); + + if (!obj) + return NULL; + + spin_lock_init(&obj->lock); + obj->vma = vma; + + /* + * The logical page table maps linear_page_index(obj->vma, va) + * to pointers of struct gm_mapping. + */ + obj->logical_page_table = kmalloc(sizeof(struct xarray), GFP_KERNEL); + if (!obj->logical_page_table) { + kmem_cache_free(vm_object_cachep, obj); + return NULL; + } + + xa_init(obj->logical_page_table); + atomic_set(&obj->nr_pages, 0); + atomic_set(&obj->ref_count, 1); + + return obj; +} + +/* This should be called when a VMA no longer refers to a VM_OBJECT */ +void vm_object_drop_locked(struct vm_area_struct *vma) +{ + struct vm_object *obj = vma->vm_obj; + + if (!obj) { + pr_err("vm_object: vm_obj of the vma is NULL\n"); + return; + } + + /* + * We must enter this with VMA write-locked, which is unfortunately a giant lock. 
+ * Note that Linux 6.0 has per-VMA lock: + * https://lwn.net/Articles/906852/ + * https://lwn.net/Articles/906833/ + */ + free_gm_mappings(vma); + mmap_assert_write_locked(vma->vm_mm); + vma->vm_obj = NULL; + + if (atomic_dec_and_test(&obj->ref_count)) { + xa_destroy(obj->logical_page_table); + kfree(obj->logical_page_table); + kmem_cache_free(vm_object_cachep, obj); + } +} + +void dup_vm_object(struct vm_area_struct *dst, struct vm_area_struct *src, bool dst_peer_shared) +{ + unsigned long index; + struct gm_mapping *mapping; + unsigned long moved_pages = 0; + + if (dst_peer_shared) { + if (!vma_is_peer_shared(dst)) + return; + } else { + if (!vma_is_peer_shared(src)) + return; + } + + XA_STATE(xas, src->vm_obj->logical_page_table, linear_page_index(src, src->vm_start)); + + xa_lock(dst->vm_obj->logical_page_table); + rcu_read_lock(); + xas_for_each(&xas, mapping, linear_page_index(src, src->vm_end)) { + index = xas.xa_index - src->vm_pgoff + dst->vm_pgoff + + ((src->vm_start - dst->vm_start) >> PAGE_SHIFT); + __xa_store(dst->vm_obj->logical_page_table, index, mapping, GFP_KERNEL); + moved_pages++; + } + rcu_read_unlock(); + atomic_add(moved_pages, &dst->vm_obj->nr_pages); + xa_unlock(dst->vm_obj->logical_page_table); +} + +void dup_peer_shared_vma(struct vm_area_struct *vma) +{ + if (vma_is_peer_shared(vma)) { + pr_debug("gmem: peer-shared vma should not be dup\n"); + vma->vm_obj = vm_object_create(vma); + } +} + +void vm_object_adjust(struct vm_area_struct *vma, unsigned long start, unsigned long end) +{ + /* remove logical mapping in [vma->vm_start, start) and [end, vm->vm_end) */ + unsigned long removed_pages = 0; + struct gm_mapping *mapping; + + XA_STATE(xas, vma->vm_obj->logical_page_table, linear_page_index(vma, vma->vm_start)); + + xas_lock(&xas); + if (vma->vm_start < start) { + xas_for_each(&xas, mapping, linear_page_index(vma, start)) { + xas_store(&xas, NULL); + removed_pages++; + } + } + + if (vma->vm_end > end) { + xas_set(&xas, linear_page_index(vma, end)); + + xas_for_each(&xas, mapping, linear_page_index(vma, vma->vm_end)) { + xas_store(&xas, NULL); + removed_pages++; + } + } + atomic_sub(removed_pages, &vma->vm_obj->nr_pages); + xas_unlock(&xas); +} + +/* + * Given a VA, the page_index is computed by + * page_index = linear_page_index(struct vm_area_struct *vma, unsigned long address) + */ +struct gm_mapping *vm_object_lookup(struct vm_object *obj, unsigned long va) +{ + return lookup_gm_mapping(obj, linear_page_index(obj->vma, va)); +} +EXPORT_SYMBOL_GPL(vm_object_lookup); + +void vm_object_mapping_create(struct vm_object *obj, unsigned long start) +{ + pgoff_t index = linear_page_index(obj->vma, start); + struct gm_mapping *gm_mapping; + + gm_mapping = alloc_gm_mapping(); + if (!gm_mapping) + return; + + __xa_store(obj->logical_page_table, index, gm_mapping, GFP_KERNEL); +} + +void free_gm_mappings(struct vm_area_struct *vma) +{ + struct gm_mapping *gm_mapping; + XA_STATE(xas, vma->vm_obj->logical_page_table, linear_page_index(vma, vma->vm_start)); + + xa_lock(vma->vm_obj->logical_page_table); + xas_for_each(&xas, gm_mapping, linear_page_index(vma, vma->vm_end - SZ_2M)) { + release_gm_mapping(gm_mapping); + xas_store(&xas, NULL); + } + xa_unlock(vma->vm_obj->logical_page_table); +}
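
For reference, a condensed kernel-side sketch (not part of the diff) of the lookup-or-create pattern that do_huge_pmd_anonymous_page_with_peer_shared() follows against this API. The helper name peer_shared_mapping_get() is hypothetical, and the caller is assumed to hold mmap_lock as in the fault path.

    /* Hypothetical helper illustrating the intended locking order: the xarray
     * lock only guards the logical page table itself, while per-entry state
     * transitions are serialized by gm_mapping->lock.
     */
    static struct gm_mapping *peer_shared_mapping_get(struct vm_area_struct *vma,
    						  unsigned long haddr)
    {
    	struct vm_object *obj = vma->vm_obj;
    	struct gm_mapping *gm_mapping;

    	xa_lock(obj->logical_page_table);
    	gm_mapping = vm_object_lookup(obj, haddr);
    	if (!gm_mapping) {
    		/* Stores a GM_PAGE_NOMAP entry at linear_page_index(vma, haddr). */
    		vm_object_mapping_create(obj, haddr);
    		gm_mapping = vm_object_lookup(obj, haddr);
    	}
    	xa_unlock(obj->logical_page_table);

    	if (gm_mapping)
    		mutex_lock(&gm_mapping->lock);	/* dropped by the caller */

    	return gm_mapping;
    }
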