diff --git a/0005-include-msi-modify-kabi-size-of-msi_desc.patch b/0005-include-msi-modify-kabi-size-of-msi_desc.patch new file mode 100644 index 0000000000000000000000000000000000000000..79c77ab1e2d4ebfe75b6d3b522790ebcd6df6204 --- /dev/null +++ b/0005-include-msi-modify-kabi-size-of-msi_desc.patch @@ -0,0 +1,45 @@ +From 723d41836db7669ab658d3e07c62fcbe17d7d7f4 Mon Sep 17 00:00:00 2001 +From: zhengjunlong +Date: Fri, 11 Oct 2024 17:08:35 +0800 +Subject: [PATCH 01/17] include/msi: modify kabi size of msi_desc + +hulk inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/IAW8JF + +---------------------------------------------------- + +Change the size of the pre-embedded memory for msi_desc to 40 bytes. + +Signed-off-by: Zheng Junlong +--- + include/linux/msi.h | 11 ++++------- + 1 file changed, 4 insertions(+), 7 deletions(-) + +diff --git a/include/linux/msi.h b/include/linux/msi.h +index 7354ffb14856..5fd8a6caae98 100644 +--- a/include/linux/msi.h ++++ b/include/linux/msi.h +@@ -205,15 +205,12 @@ struct msi_desc { + union { + struct pci_msi_desc pci; + struct msi_desc_data data; +- KABI_RESERVE(1) +- KABI_RESERVE(2) +- KABI_RESERVE(3) +- KABI_RESERVE(4) ++ KABI_EXTEND_WITH_SIZE(KABI_RESERVE(1), 5) + }; ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + KABI_RESERVE(5) +- KABI_RESERVE(6) +- KABI_RESERVE(7) +- KABI_RESERVE(8) + }; + + /* +-- +2.25.1 + diff --git a/0007-nfs-fix-the-loss-of-superblock-s-initialized-flags.patch b/0007-nfs-fix-the-loss-of-superblock-s-initialized-flags.patch new file mode 100644 index 0000000000000000000000000000000000000000..1d3c32fe05be2df1ee24d756c0db25e7377882fd --- /dev/null +++ b/0007-nfs-fix-the-loss-of-superblock-s-initialized-flags.patch @@ -0,0 +1,40 @@ +From e68e6e3cf90ec8fb7893057c768d55e83855aaa0 Mon Sep 17 00:00:00 2001 +From: Li Lingfeng +Date: Mon, 16 Dec 2024 20:15:25 +0800 +Subject: [PATCH 03/17] nfs: fix the loss of superblock's initialized flags + +hulk inclusion +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IB42W1 + +-------------------------------- + +Commit 573573887e0b ("nfs: pass flags to second superblock") directly +assigns fc->sb_flags to dentry->d_sb->s_flags, which will cause the loss +of the initialized flags in dentry->d_sb->s_flags. + +Fix it by just passing SB_RDONLY from fc->sb_flags to +dentry->d_sb->s_flags. 
+ +Fixes: 573573887e0b ("nfs: pass flags to second superblock") +Signed-off-by: Li Lingfeng +--- + fs/nfs/nfs4super.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c +index bb13894ad152..e87f878178f3 100644 +--- a/fs/nfs/nfs4super.c ++++ b/fs/nfs/nfs4super.c +@@ -209,7 +209,7 @@ static int do_nfs4_mount(struct nfs_server *server, + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + +- dentry->d_sb->s_flags = fc->sb_flags; ++ dentry->d_sb->s_flags |= (fc->sb_flags & SB_RDONLY); + fc->root = dentry; + return 0; + } +-- +2.25.1 + diff --git a/0008-x86-config-Enable-CONFIG_CMA-by-default-in-openeuler.patch b/0008-x86-config-Enable-CONFIG_CMA-by-default-in-openeuler.patch new file mode 100644 index 0000000000000000000000000000000000000000..f9c3ab227f17a2ee820ed733a996a6e9e1fdc9d1 --- /dev/null +++ b/0008-x86-config-Enable-CONFIG_CMA-by-default-in-openeuler.patch @@ -0,0 +1,61 @@ +From 844a44e5a21be8062fd0c120a75e9ecf97427ae8 Mon Sep 17 00:00:00 2001 +From: hanliyang +Date: Mon, 16 Dec 2024 20:44:36 +0800 +Subject: [PATCH 04/17] x86/config: Enable CONFIG_CMA by default in + openeuler_defconfig + +hygon inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/IBBNJI +CVE: NA + +--------------------------- + +Enable CONFIG_CMA will change kabi. + +Enable CONFIG_CMA will also enable CONFIG_DMA_CMA. + +Signed-off-by: hanliyang +--- + arch/x86/configs/openeuler_defconfig | 18 +++++++++++++++++- + 1 file changed, 17 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig +index 8e8542796a13..adfaef0cb10c 100644 +--- a/arch/x86/configs/openeuler_defconfig ++++ b/arch/x86/configs/openeuler_defconfig +@@ -1158,7 +1158,11 @@ CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y + CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y + CONFIG_USE_PERCPU_NUMA_NODE_ID=y + CONFIG_HAVE_SETUP_PER_CPU_AREA=y +-# CONFIG_CMA is not set ++CONFIG_CMA=y ++# CONFIG_CMA_DEBUG is not set ++# CONFIG_CMA_DEBUGFS is not set ++# CONFIG_CMA_SYSFS is not set ++CONFIG_CMA_AREAS=19 + CONFIG_MEM_SOFT_DIRTY=y + CONFIG_GENERIC_EARLY_IOREMAP=y + CONFIG_DEFERRED_STRUCT_PAGE_INIT=y +@@ -9018,6 +9022,18 @@ CONFIG_ARCH_HAS_FORCE_DMA_UNENCRYPTED=y + CONFIG_SWIOTLB=y + # CONFIG_SWIOTLB_DYNAMIC is not set + CONFIG_DMA_COHERENT_POOL=y ++CONFIG_DMA_CMA=y ++# CONFIG_DMA_NUMA_CMA is not set ++ ++# ++# Default contiguous memory area size: ++# ++CONFIG_CMA_SIZE_MBYTES=0 ++CONFIG_CMA_SIZE_SEL_MBYTES=y ++# CONFIG_CMA_SIZE_SEL_PERCENTAGE is not set ++# CONFIG_CMA_SIZE_SEL_MIN is not set ++# CONFIG_CMA_SIZE_SEL_MAX is not set ++CONFIG_CMA_ALIGNMENT=8 + # CONFIG_DMA_API_DEBUG is not set + # CONFIG_DMA_MAP_BENCHMARK is not set + CONFIG_SGL_ALLOC=y +-- +2.25.1 + diff --git a/0009-x86-Kconfig-Select-CONFIG_CMA-if-CONFIG_HYGON_CSV-y.patch b/0009-x86-Kconfig-Select-CONFIG_CMA-if-CONFIG_HYGON_CSV-y.patch new file mode 100644 index 0000000000000000000000000000000000000000..79f223e8325d70865474d83d9c4c89e5d2ca9bf4 --- /dev/null +++ b/0009-x86-Kconfig-Select-CONFIG_CMA-if-CONFIG_HYGON_CSV-y.patch @@ -0,0 +1,35 @@ +From f0e6b8ca2a5b0bc1347906ff6b80422c4c9878b2 Mon Sep 17 00:00:00 2001 +From: hanliyang +Date: Mon, 16 Dec 2024 20:52:08 +0800 +Subject: [PATCH 05/17] x86/Kconfig: Select CONFIG_CMA if CONFIG_HYGON_CSV=y + +hygon inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/IBBNJI +CVE: NA + +--------------------------- + +The Hygon CSV3 use CMA to manage CSV3 guest's private memory. 
If the +CONFIG_HYGON_CSV is enabled, then enable CONFIG_CMA automatically. + +Signed-off-by: hanliyang +--- + arch/x86/Kconfig | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index fcd0c3b2065d..a6bbe6029121 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -2075,6 +2075,7 @@ config HYGON_CSV + bool "Hygon secure virtualization CSV support" + default y + depends on CPU_SUP_HYGON && AMD_MEM_ENCRYPT ++ select CONFIG_CMA + help + Hygon CSV integrates secure processor, memory encryption and + memory isolation to provide the ability to protect guest's private +-- +2.25.1 + diff --git a/0010-tcp-Fix-use-after-free-of-nreq-in-reqsk_timer_handle.patch b/0010-tcp-Fix-use-after-free-of-nreq-in-reqsk_timer_handle.patch new file mode 100644 index 0000000000000000000000000000000000000000..a07a0a576382ff80a1e12d00aabc365047db7efa --- /dev/null +++ b/0010-tcp-Fix-use-after-free-of-nreq-in-reqsk_timer_handle.patch @@ -0,0 +1,60 @@ +From 44c5a161852ac117a94ed7748784aecaab552b47 Mon Sep 17 00:00:00 2001 +From: Kuniyuki Iwashima +Date: Tue, 17 Dec 2024 16:33:23 +0800 +Subject: [PATCH 06/17] tcp: Fix use-after-free of nreq in + reqsk_timer_handler(). + +stable inclusion +from stable-v6.6.64 +commit 65ed89cad1f57034c256b016e89e8c0a4ec7c65b +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IBA6RL +CVE: NA + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=65ed89cad1f57034c256b016e89e8c0a4ec7c65b + +------------------------------------------------- + +[ Upstream commit c31e72d021db2714df03df6c42855a1db592716c ] + +The cited commit replaced inet_csk_reqsk_queue_drop_and_put() with +__inet_csk_reqsk_queue_drop() and reqsk_put() in reqsk_timer_handler(). + +Then, oreq should be passed to reqsk_put() instead of req; otherwise +use-after-free of nreq could happen when reqsk is migrated but the +retry attempt failed (e.g. due to timeout). + +Let's pass oreq to reqsk_put(). 
+ +Fixes: e8c526f2bdf1 ("tcp/dccp: Don't use timer_pending() in reqsk_queue_unlink().") +Reported-by: Liu Jian +Closes: https://lore.kernel.org/netdev/1284490f-9525-42ee-b7b8-ccadf6606f6d@huawei.com/ +Signed-off-by: Kuniyuki Iwashima +Reviewed-by: Vadim Fedorenko +Reviewed-by: Liu Jian +Reviewed-by: Eric Dumazet +Reviewed-by: Martin KaFai Lau +Link: https://patch.msgid.link/20241123174236.62438-1-kuniyu@amazon.com +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +Signed-off-by: Liu Jian +--- + net/ipv4/inet_connection_sock.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c +index ca8cc0988b61..bd032ac2376e 100644 +--- a/net/ipv4/inet_connection_sock.c ++++ b/net/ipv4/inet_connection_sock.c +@@ -1124,7 +1124,7 @@ static void reqsk_timer_handler(struct timer_list *t) + + drop: + __inet_csk_reqsk_queue_drop(sk_listener, oreq, true); +- reqsk_put(req); ++ reqsk_put(oreq); + } + + static bool reqsk_queue_hash_req(struct request_sock *req, +-- +2.25.1 + diff --git a/0012-bpf-Add-kabi-reserve-padding-for-uapi-struct-bpf_lin.patch b/0012-bpf-Add-kabi-reserve-padding-for-uapi-struct-bpf_lin.patch new file mode 100644 index 0000000000000000000000000000000000000000..9a958456d1487dd65305ecd45b7261c3186101ac --- /dev/null +++ b/0012-bpf-Add-kabi-reserve-padding-for-uapi-struct-bpf_lin.patch @@ -0,0 +1,63 @@ +From c189729809e4c7a6298126a76db608da2b571240 Mon Sep 17 00:00:00 2001 +From: Pu Lehui +Date: Wed, 18 Dec 2024 06:24:00 +0000 +Subject: [PATCH 08/17] bpf: Add kabi reserve padding for uapi struct + bpf_link_info + +hulk inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC248 + +-------------------------------- + +Add kabi reserve padding for uapi struct bpf_link_info + +Signed-off-by: Pu Lehui +--- + include/uapi/linux/bpf.h | 9 +++++++++ + tools/include/uapi/linux/bpf.h | 9 +++++++++ + 2 files changed, 18 insertions(+) + +diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h +index 482647774bf5..a660cb68c853 100644 +--- a/include/uapi/linux/bpf.h ++++ b/include/uapi/linux/bpf.h +@@ -6573,6 +6573,15 @@ struct bpf_link_info { + __u64 config; + __u32 type; + } event; /* BPF_PERF_EVENT_EVENT */ ++ struct { ++ __u64:64; ++ __u32:32; ++ __u32:32; ++ __u64:64; ++ __u64:64; ++ __u64:64; ++ __u64:64; ++ } kabi_reserve; + }; + } perf_event; + struct { +diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h +index c112c6f7c766..9b302242be6c 100644 +--- a/tools/include/uapi/linux/bpf.h ++++ b/tools/include/uapi/linux/bpf.h +@@ -6576,6 +6576,15 @@ struct bpf_link_info { + __u64 config; + __u32 type; + } event; /* BPF_PERF_EVENT_EVENT */ ++ struct { ++ __u64:64; ++ __u32:32; ++ __u32:32; ++ __u64:64; ++ __u64:64; ++ __u64:64; ++ __u64:64; ++ } kabi_reserve; + }; + } perf_event; + struct { +-- +2.25.1 + diff --git a/0013-iommu-Reserve-extra-KABI-entry-for-struct-iopf_group.patch b/0013-iommu-Reserve-extra-KABI-entry-for-struct-iopf_group.patch new file mode 100644 index 0000000000000000000000000000000000000000..43e830cb9ff9e7129dbbb2322c2698bdca0d8050 --- /dev/null +++ b/0013-iommu-Reserve-extra-KABI-entry-for-struct-iopf_group.patch @@ -0,0 +1,38 @@ +From bbfb8fd7b1297acf7769a814f3fbf919afd391dc Mon Sep 17 00:00:00 2001 +From: Zhang Zekun +Date: Wed, 18 Dec 2024 14:43:35 +0800 +Subject: [PATCH 09/17] iommu: Reserve extra KABI entry for struct iopf_group + +hulk inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/IBBRHP + 
+--------------------------------------------------------------- + +The list_head entry in iopf_group has been moved to iopf_group_extend +for KABI compatibility and the lack of KABI reserve entry. Reserve extra +kabi entry for future usage. + +Signed-off-by: Zhang Zekun +--- + include/linux/iommu.h | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/include/linux/iommu.h b/include/linux/iommu.h +index bb463cb96a44..83ec4bf9809e 100644 +--- a/include/linux/iommu.h ++++ b/include/linux/iommu.h +@@ -155,6 +155,10 @@ struct iopf_group { + KABI_USE(2, u32 cookie) + KABI_RESERVE(3) + KABI_RESERVE(4) ++ KABI_RESERVE(5) ++ KABI_RESERVE(6) ++ KABI_RESERVE(7) ++ KABI_RESERVE(8) + }; + + struct iopf_group_extend { +-- +2.25.1 + diff --git a/0014-seq_file-kabi-KABI-reservation-for-seq_file.patch b/0014-seq_file-kabi-KABI-reservation-for-seq_file.patch new file mode 100644 index 0000000000000000000000000000000000000000..371e3afefc94e3038552e8c5abc889b030d75667 --- /dev/null +++ b/0014-seq_file-kabi-KABI-reservation-for-seq_file.patch @@ -0,0 +1,45 @@ +From 1cb26ea1471efb775f2aa141863e82efead07d61 Mon Sep 17 00:00:00 2001 +From: Baokun Li +Date: Wed, 18 Dec 2024 15:21:56 +0800 +Subject: [PATCH 10/17] seq_file: kabi: KABI reservation for seq_file + +hulk inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC34X + +---------------------------------------------------------------------- + + structure size reserves reserved + seq_file 120 1 128 + seq_operations 32 1 40 + +Signed-off-by: Baokun Li +--- + include/linux/seq_file.h | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h +index 234bcdb1fba4..cf4a2258df85 100644 +--- a/include/linux/seq_file.h ++++ b/include/linux/seq_file.h +@@ -27,6 +27,8 @@ struct seq_file { + int poll_event; + const struct file *file; + void *private; ++ ++ KABI_RESERVE(1) + }; + + struct seq_operations { +@@ -34,6 +36,8 @@ struct seq_operations { + void (*stop) (struct seq_file *m, void *v); + void * (*next) (struct seq_file *m, void *v, loff_t *pos); + int (*show) (struct seq_file *m, void *v); ++ ++ KABI_RESERVE(1) + }; + + #define SEQ_SKIP 1 +-- +2.25.1 + diff --git a/0015-statx-kabi-KABI-reservation-for-kstat.patch b/0015-statx-kabi-KABI-reservation-for-kstat.patch new file mode 100644 index 0000000000000000000000000000000000000000..12b7151da8e418c07a999522b7cf35fd49fb33dd --- /dev/null +++ b/0015-statx-kabi-KABI-reservation-for-kstat.patch @@ -0,0 +1,38 @@ +From ed5b59b6c40d2563994c1f7b5a1321affb490d45 Mon Sep 17 00:00:00 2001 +From: Baokun Li +Date: Wed, 18 Dec 2024 15:23:01 +0800 +Subject: [PATCH 11/17] statx: kabi: KABI reservation for kstat + +hulk inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC24E + +---------------------------------------------------------------------- + + structure size reserves reserved mainline + kstat 160 4 192 184 + +Signed-off-by: Baokun Li +--- + include/linux/stat.h | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/include/linux/stat.h b/include/linux/stat.h +index 52150570d37a..d342e89b7aaa 100644 +--- a/include/linux/stat.h ++++ b/include/linux/stat.h +@@ -53,6 +53,11 @@ struct kstat { + u32 dio_mem_align; + u32 dio_offset_align; + u64 change_cookie; ++ ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + + /* These definitions are internal to the kernel for now. Mainly used by nfsd. 
*/ +-- +2.25.1 + diff --git a/0016-fs-Allow-fine-grained-control-of-folio-sizes.patch b/0016-fs-Allow-fine-grained-control-of-folio-sizes.patch new file mode 100644 index 0000000000000000000000000000000000000000..ca2556d094eb53b3afc70b790eda1820ab3cee4c --- /dev/null +++ b/0016-fs-Allow-fine-grained-control-of-folio-sizes.patch @@ -0,0 +1,200 @@ +From 30f7b1506ec798949e6ce99c023780b0306845c9 Mon Sep 17 00:00:00 2001 +From: "Matthew Wilcox (Oracle)" +Date: Wed, 18 Dec 2024 15:31:44 +0800 +Subject: [PATCH 12/17] fs: Allow fine-grained control of folio sizes + +mainline inclusion +from mainline-v6.10-rc2 +commit 84429b675bcfd2a518ae167ee4661cdf7539aa7d +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC20Q + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=84429b675bcfd2a518ae167ee4661cdf7539aa7d + +-------------------------------- + +We need filesystems to be able to communicate acceptable folio sizes +to the pagecache for a variety of uses (e.g. large block sizes). +Support a range of folio sizes between order-0 and order-31. + +Signed-off-by: Matthew Wilcox (Oracle) +Co-developed-by: Pankaj Raghav +Signed-off-by: Pankaj Raghav +Link: https://lore.kernel.org/r/20240822135018.1931258-2-kernel@pankajraghav.com +Tested-by: David Howells +Reviewed-by: Hannes Reinecke +Reviewed-by: Darrick J. Wong +Reviewed-by: Daniel Gomez +Signed-off-by: Christian Brauner +Conflicts: + include/linux/pagemap.h + mm/filemap.c +[Conflicts due to not merged 83ee0e20fd9f ("filemap: support disable large +folios on active inode")] +Signed-off-by: Long Li +--- + include/linux/pagemap.h | 90 +++++++++++++++++++++++++++++++++++------ + mm/readahead.c | 4 +- + 2 files changed, 79 insertions(+), 15 deletions(-) + +diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h +index 429627abfef4..e44e377661f2 100644 +--- a/include/linux/pagemap.h ++++ b/include/linux/pagemap.h +@@ -203,12 +203,21 @@ enum mapping_flags { + AS_EXITING = 4, /* final truncate in progress */ + /* writeback related tags are not used */ + AS_NO_WRITEBACK_TAGS = 5, +- AS_LARGE_FOLIO_SUPPORT = 6, +- AS_RELEASE_ALWAYS, /* Call ->release_folio(), even if no private data */ +- AS_STABLE_WRITES, /* must wait for writeback before modifying ++ AS_RELEASE_ALWAYS = 6, /* Call ->release_folio(), even if no private data */ ++ AS_STABLE_WRITES = 7, /* must wait for writeback before modifying + folio contents */ ++ AS_INACCESSIBLE = 8, /* Do not attempt direct R/W access to the mapping */ ++ /* Bits 16-25 are used for FOLIO_ORDER */ ++ AS_FOLIO_ORDER_BITS = 5, ++ AS_FOLIO_ORDER_MIN = 16, ++ AS_FOLIO_ORDER_MAX = AS_FOLIO_ORDER_MIN + AS_FOLIO_ORDER_BITS, + }; + ++#define AS_FOLIO_ORDER_BITS_MASK ((1u << AS_FOLIO_ORDER_BITS) - 1) ++#define AS_FOLIO_ORDER_MIN_MASK (AS_FOLIO_ORDER_BITS_MASK << AS_FOLIO_ORDER_MIN) ++#define AS_FOLIO_ORDER_MAX_MASK (AS_FOLIO_ORDER_BITS_MASK << AS_FOLIO_ORDER_MAX) ++#define AS_FOLIO_ORDER_MASK (AS_FOLIO_ORDER_MIN_MASK | AS_FOLIO_ORDER_MAX_MASK) ++ + /** + * mapping_set_error - record a writeback error in the address_space + * @mapping: the mapping in which an error should be set +@@ -348,9 +357,51 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) + #define MAX_XAS_ORDER (XA_CHUNK_SHIFT * 2 - 1) + #define MAX_PAGECACHE_ORDER min(MAX_XAS_ORDER, PREFERRED_MAX_PAGECACHE_ORDER) + ++/* ++ * mapping_set_folio_order_range() - Set the orders supported by a file. ++ * @mapping: The address space of the file. 
++ * @min: Minimum folio order (between 0-MAX_PAGECACHE_ORDER inclusive). ++ * @max: Maximum folio order (between @min-MAX_PAGECACHE_ORDER inclusive). ++ * ++ * The filesystem should call this function in its inode constructor to ++ * indicate which base size (min) and maximum size (max) of folio the VFS ++ * can use to cache the contents of the file. This should only be used ++ * if the filesystem needs special handling of folio sizes (ie there is ++ * something the core cannot know). ++ * Do not tune it based on, eg, i_size. ++ * ++ * Context: This should not be called while the inode is active as it ++ * is non-atomic. ++ */ ++static inline void mapping_set_folio_order_range(struct address_space *mapping, ++ unsigned int min, ++ unsigned int max) ++{ ++ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) ++ return; ++ ++ if (min > MAX_PAGECACHE_ORDER) ++ min = MAX_PAGECACHE_ORDER; ++ ++ if (max > MAX_PAGECACHE_ORDER) ++ max = MAX_PAGECACHE_ORDER; ++ ++ if (max < min) ++ max = min; ++ ++ mapping->flags = (mapping->flags & ~AS_FOLIO_ORDER_MASK) | ++ (min << AS_FOLIO_ORDER_MIN) | (max << AS_FOLIO_ORDER_MAX); ++} ++ ++static inline void mapping_set_folio_min_order(struct address_space *mapping, ++ unsigned int min) ++{ ++ mapping_set_folio_order_range(mapping, min, MAX_PAGECACHE_ORDER); ++} ++ + /** + * mapping_set_large_folios() - Indicate the file supports large folios. +- * @mapping: The file. ++ * @mapping: The address space of the file. + * + * The filesystem should call this function in its inode constructor to + * indicate that the VFS can use large folios to cache the contents of +@@ -361,7 +412,23 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) + */ + static inline void mapping_set_large_folios(struct address_space *mapping) + { +- __set_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); ++ mapping_set_folio_order_range(mapping, 0, MAX_PAGECACHE_ORDER); ++} ++ ++static inline unsigned int ++mapping_max_folio_order(const struct address_space *mapping) ++{ ++ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) ++ return 0; ++ return (mapping->flags & AS_FOLIO_ORDER_MAX_MASK) >> AS_FOLIO_ORDER_MAX; ++} ++ ++static inline unsigned int ++mapping_min_folio_order(const struct address_space *mapping) ++{ ++ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) ++ return 0; ++ return (mapping->flags & AS_FOLIO_ORDER_MIN_MASK) >> AS_FOLIO_ORDER_MIN; + } + + /** +@@ -375,7 +442,7 @@ static inline void mapping_set_large_folios(struct address_space *mapping) + static inline void mapping_clear_large_folios(struct address_space *mapping) + { + WARN_ON_ONCE(!rwsem_is_locked(&mapping->invalidate_lock)); +- __clear_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); ++ mapping_set_folio_order_range(mapping, 0, 0); + } + + /* +@@ -384,20 +451,17 @@ static inline void mapping_clear_large_folios(struct address_space *mapping) + */ + static inline bool mapping_large_folio_support(struct address_space *mapping) + { +- /* AS_LARGE_FOLIO_SUPPORT is only reasonable for pagecache folios */ ++ /* AS_FOLIO_ORDER is only reasonable for pagecache folios */ + VM_WARN_ONCE((unsigned long)mapping & PAGE_MAPPING_ANON, + "Anonymous mapping always supports large folio"); + +- return IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && +- test_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); ++ return mapping_max_folio_order(mapping) > 0; + } + + /* Return the maximum folio size for this pagecache mapping, in bytes. 
*/ +-static inline size_t mapping_max_folio_size(struct address_space *mapping) ++static inline size_t mapping_max_folio_size(const struct address_space *mapping) + { +- if (mapping_large_folio_support(mapping)) +- return PAGE_SIZE << MAX_PAGECACHE_ORDER; +- return PAGE_SIZE; ++ return PAGE_SIZE << mapping_max_folio_order(mapping); + } + + static inline int filemap_nr_thps(struct address_space *mapping) +diff --git a/mm/readahead.c b/mm/readahead.c +index 438f142a3e74..c13c130efcca 100644 +--- a/mm/readahead.c ++++ b/mm/readahead.c +@@ -513,10 +513,10 @@ void page_cache_ra_order(struct readahead_control *ractl, + + limit = min(limit, index + ra->size - 1); + +- if (new_order < MAX_PAGECACHE_ORDER) ++ if (new_order < mapping_max_folio_order(mapping)) + new_order += 2; + +- new_order = min_t(unsigned int, MAX_PAGECACHE_ORDER, new_order); ++ new_order = min(mapping_max_folio_order(mapping), new_order); + new_order = min_t(unsigned int, new_order, ilog2(ra->size)); + + /* See comment in page_cache_ra_unbounded() */ +-- +2.25.1 + diff --git a/0017-Revert-cgroup-fix-uaf-when-proc_cpuset_show.patch b/0017-Revert-cgroup-fix-uaf-when-proc_cpuset_show.patch new file mode 100644 index 0000000000000000000000000000000000000000..ebe3ba02a5bc5f03b16204014daa2c53b0c0b53c --- /dev/null +++ b/0017-Revert-cgroup-fix-uaf-when-proc_cpuset_show.patch @@ -0,0 +1,68 @@ +From 8c8766f9500b9ffdb907d23269aa888d0632e68c Mon Sep 17 00:00:00 2001 +From: Chen Ridong +Date: Wed, 18 Dec 2024 08:10:59 +0000 +Subject: [PATCH 13/17] Revert "cgroup: fix uaf when proc_cpuset_show" + +hulk inclusion +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IA9YQ9 + +-------------------------------- + +To keep the same with the mainline and backport the lts patch. +This reverts commit 24c448de81d48ad08925dda9869bcf535a3258b8. + +Fixes: 24c448de81d4 ("cgroup: fix uaf when proc_cpuset_show") +Signed-off-by: Chen Ridong +--- + kernel/cgroup/cpuset.c | 24 ------------------------ + 1 file changed, 24 deletions(-) + +diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c +index 2c9e50f09fc1..140dfb5ad3fc 100644 +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -5185,7 +5185,6 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, + char *buf; + struct cgroup_subsys_state *css; + int retval; +- struct cgroup *root_cgroup = NULL; + + retval = -ENOMEM; + buf = kmalloc(PATH_MAX, GFP_KERNEL); +@@ -5193,32 +5192,9 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, + goto out; + + css = task_get_css(tsk, cpuset_cgrp_id); +- rcu_read_lock(); +- /* +- * When the cpuset subsystem is mounted on the legacy hierarchy, +- * the top_cpuset.css->cgroup does not hold a reference count of +- * cgroup_root.cgroup. This makes accessing css->cgroup very +- * dangerous because when the cpuset subsystem is remounted to the +- * default hierarchy, the cgroup_root.cgroup that css->cgroup points +- * to will be released, leading to a UAF issue. To avoid this problem, +- * get the reference count of top_cpuset.css->cgroup first. +- * +- * This is ugly!! 
+- */ +- if (css == &top_cpuset.css) { +- root_cgroup = css->cgroup; +- if (!css_tryget_online(&root_cgroup->self)) { +- rcu_read_unlock(); +- retval = -EBUSY; +- goto out_free; +- } +- } +- rcu_read_unlock(); + retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX, + current->nsproxy->cgroup_ns); + css_put(css); +- if (root_cgroup) +- css_put(&root_cgroup->self); + if (retval >= PATH_MAX) + retval = -ENAMETOOLONG; + if (retval < 0) +-- +2.25.1 + diff --git a/0018-cgroup-Make-operations-on-the-cgroup-root_list-RCU-s.patch b/0018-cgroup-Make-operations-on-the-cgroup-root_list-RCU-s.patch new file mode 100644 index 0000000000000000000000000000000000000000..0c54088d11faad0dafea84ea125ddbbfd7305321 --- /dev/null +++ b/0018-cgroup-Make-operations-on-the-cgroup-root_list-RCU-s.patch @@ -0,0 +1,145 @@ +From 7b6abe1742cbfedea405f03fcf7fc88cacb2a205 Mon Sep 17 00:00:00 2001 +From: Yafang Shao +Date: Wed, 18 Dec 2024 08:11:00 +0000 +Subject: [PATCH 14/17] cgroup: Make operations on the cgroup root_list RCU + safe +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +stable inclusion +from stable-v6.6.47 +commit dd9542ae7c7ca82ed2d7c185754ba9026361f6bc +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IAP55A + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=dd9542ae7c7ca82ed2d7c185754ba9026361f6bc + +-------------------------------- + +commit d23b5c577715892c87533b13923306acc6243f93 upstream. + +At present, when we perform operations on the cgroup root_list, we must +hold the cgroup_mutex, which is a relatively heavyweight lock. In reality, +we can make operations on this list RCU-safe, eliminating the need to hold +the cgroup_mutex during traversal. Modifications to the list only occur in +the cgroup root setup and destroy paths, which should be infrequent in a +production environment. In contrast, traversal may occur frequently. +Therefore, making it RCU-safe would be beneficial. 
+ +Signed-off-by: Yafang Shao +Signed-off-by: Tejun Heo +To: Michal Koutný +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Chen Ridong +--- + include/linux/cgroup-defs.h | 1 + + kernel/cgroup/cgroup-internal.h | 3 ++- + kernel/cgroup/cgroup.c | 23 ++++++++++++++++------- + 3 files changed, 19 insertions(+), 8 deletions(-) + +diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h +index 6e3227a688de..05ece896af7d 100644 +--- a/include/linux/cgroup-defs.h ++++ b/include/linux/cgroup-defs.h +@@ -591,6 +591,7 @@ struct cgroup_root { + + /* A list running through the active hierarchies */ + struct list_head root_list; ++ struct rcu_head rcu; + + /* Hierarchy-specific flags */ + unsigned int flags; +diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h +index 96a9bd2c26f0..f5fb12890645 100644 +--- a/kernel/cgroup/cgroup-internal.h ++++ b/kernel/cgroup/cgroup-internal.h +@@ -170,7 +170,8 @@ extern struct list_head cgroup_roots; + + /* iterate across the hierarchies */ + #define for_each_root(root) \ +- list_for_each_entry((root), &cgroup_roots, root_list) ++ list_for_each_entry_rcu((root), &cgroup_roots, root_list, \ ++ lockdep_is_held(&cgroup_mutex)) + + /** + * for_each_subsys - iterate all enabled cgroup subsystems +diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c +index 52fe6ba2fefd..c26a9b3a3576 100644 +--- a/kernel/cgroup/cgroup.c ++++ b/kernel/cgroup/cgroup.c +@@ -1315,7 +1315,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root) + + void cgroup_free_root(struct cgroup_root *root) + { +- kfree(root); ++ kfree_rcu(root, rcu); + } + + static void cgroup_destroy_root(struct cgroup_root *root) +@@ -1348,7 +1348,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) + spin_unlock_irq(&css_set_lock); + + if (!list_empty(&root->root_list)) { +- list_del(&root->root_list); ++ list_del_rcu(&root->root_list); + cgroup_root_count--; + } + +@@ -1388,7 +1388,15 @@ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset, + } + } + +- BUG_ON(!res_cgroup); ++ /* ++ * If cgroup_mutex is not held, the cgrp_cset_link will be freed ++ * before we remove the cgroup root from the root_list. Consequently, ++ * when accessing a cgroup root, the cset_link may have already been ++ * freed, resulting in a NULL res_cgroup. However, by holding the ++ * cgroup_mutex, we ensure that res_cgroup can't be NULL. ++ * If we don't hold cgroup_mutex in the caller, we must do the NULL ++ * check. ++ */ + return res_cgroup; + } + +@@ -1447,7 +1455,6 @@ static struct cgroup *current_cgns_cgroup_dfl(void) + static struct cgroup *cset_cgroup_from_root(struct css_set *cset, + struct cgroup_root *root) + { +- lockdep_assert_held(&cgroup_mutex); + lockdep_assert_held(&css_set_lock); + + return __cset_cgroup_from_root(cset, root); +@@ -1455,7 +1462,9 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, + + /* + * Return the cgroup for "task" from the given hierarchy. Must be +- * called with cgroup_mutex and css_set_lock held. ++ * called with css_set_lock held to prevent task's groups from being modified. ++ * Must be called with either cgroup_mutex or rcu read lock to prevent the ++ * cgroup root from being destroyed. 
+ */ + struct cgroup *task_cgroup_from_root(struct task_struct *task, + struct cgroup_root *root) +@@ -2030,7 +2039,7 @@ void init_cgroup_root(struct cgroup_fs_context *ctx) + struct cgroup_root *root = ctx->root; + struct cgroup *cgrp = &root->cgrp; + +- INIT_LIST_HEAD(&root->root_list); ++ INIT_LIST_HEAD_RCU(&root->root_list); + atomic_set(&root->nr_cgrps, 1); + cgrp->root = root; + init_cgroup_housekeeping(cgrp); +@@ -2114,7 +2123,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) + * care of subsystems' refcounts, which are explicitly dropped in + * the failure exit path. + */ +- list_add(&root->root_list, &cgroup_roots); ++ list_add_rcu(&root->root_list, &cgroup_roots); + cgroup_root_count++; + + /* +-- +2.25.1 + diff --git a/0019-cgroup-Move-rcu_head-up-near-the-top-of-cgroup_root.patch b/0019-cgroup-Move-rcu_head-up-near-the-top-of-cgroup_root.patch new file mode 100644 index 0000000000000000000000000000000000000000..45d780248fb256f34f308724c08e0cf675c3bf28 --- /dev/null +++ b/0019-cgroup-Move-rcu_head-up-near-the-top-of-cgroup_root.patch @@ -0,0 +1,84 @@ +From 4363688e9b49bde3cce7b2ea1882f3d44d1f5289 Mon Sep 17 00:00:00 2001 +From: Waiman Long +Date: Wed, 18 Dec 2024 08:11:01 +0000 +Subject: [PATCH 15/17] cgroup: Move rcu_head up near the top of cgroup_root +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +stable inclusion +from stable-v6.6.47 +commit f3c60ab676bb62e01d004d5b1cf2963a296c8e6a +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IAP55A + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f3c60ab676bb62e01d004d5b1cf2963a296c8e6a + +-------------------------------- + +commit a7fb0423c201ba12815877a0b5a68a6a1710b23a upstream. + +Commit 331654dc5f40 ("cgroup: Make operations on the cgroup root_list RCU +safe") adds a new rcu_head to the cgroup_root structure and kvfree_rcu() +for freeing the cgroup_root. + +The current implementation of kvfree_rcu(), however, has the limitation +that the offset of the rcu_head structure within the larger data +structure must be less than 4096 or the compilation will fail. See the +macro definition of __is_kvfree_rcu_offset() in include/linux/rcupdate.h +for more information. + +By putting rcu_head below the large cgroup structure, any change to the +cgroup structure that makes it larger run the risk of causing build +failure under certain configurations. Commit 77070eeb8821 ("cgroup: +Avoid false cacheline sharing of read mostly rstat_cpu") happens to be +the last straw that breaks it. Fix this problem by moving the rcu_head +structure up before the cgroup structure. + +Fixes: 331654dc5f40 ("cgroup: Make operations on the cgroup root_list RCU safe") +Reported-by: Stephen Rothwell +Closes: https://lore.kernel.org/lkml/20231207143806.114e0a74@canb.auug.org.au/ +Signed-off-by: Waiman Long +Acked-by: Yafang Shao +Reviewed-by: Yosry Ahmed +Reviewed-by: Michal Koutný +Signed-off-by: Tejun Heo +Signed-off-by: Greg Kroah-Hartman + +Conflicts: + include/linux/cgroup-defs.h +[Context is mismatched for wait_queue_head_t wait was merged] +Signed-off-by: Chen Ridong +--- + include/linux/cgroup-defs.h | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h +index 05ece896af7d..8eb518ce87a1 100644 +--- a/include/linux/cgroup-defs.h ++++ b/include/linux/cgroup-defs.h +@@ -573,6 +573,10 @@ struct cgroup_root { + /* Unique id for this hierarchy. 
*/ + int hierarchy_id; + ++ /* A list running through the active hierarchies */ ++ struct list_head root_list; ++ struct rcu_head rcu; /* Must be near the top */ ++ + /* + * The root cgroup. The containing cgroup_root will be destroyed on its + * release. cgrp->ancestors[0] will be used overflowing into the +@@ -589,10 +593,6 @@ struct cgroup_root { + /* Wait while cgroups are being destroyed */ + wait_queue_head_t wait; + +- /* A list running through the active hierarchies */ +- struct list_head root_list; +- struct rcu_head rcu; +- + /* Hierarchy-specific flags */ + unsigned int flags; + +-- +2.25.1 + diff --git a/0020-cgroup-cpuset-Prevent-UAF-in-proc_cpuset_show.patch b/0020-cgroup-cpuset-Prevent-UAF-in-proc_cpuset_show.patch new file mode 100644 index 0000000000000000000000000000000000000000..c528ff32a703b081492f054515fb49b431716238 --- /dev/null +++ b/0020-cgroup-cpuset-Prevent-UAF-in-proc_cpuset_show.patch @@ -0,0 +1,110 @@ +From 724b6581cd8b49962e3add6e8795423f2c1390f8 Mon Sep 17 00:00:00 2001 +From: Chen Ridong +Date: Wed, 18 Dec 2024 08:11:02 +0000 +Subject: [PATCH 16/17] cgroup/cpuset: Prevent UAF in proc_cpuset_show() + +stable inclusion +from stable-v6.6.44 +commit 96226fbed566f3f686f53a489a29846f2d538080 +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IAP55A + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=96226fbed566f3f686f53a489a29846f2d538080 + +-------------------------------- + +[ Upstream commit 1be59c97c83ccd67a519d8a49486b3a8a73ca28a ] + +An UAF can happen when /proc/cpuset is read as reported in [1]. + +This can be reproduced by the following methods: +1.add an mdelay(1000) before acquiring the cgroup_lock In the + cgroup_path_ns function. +2.$cat /proc//cpuset repeatly. +3.$mount -t cgroup -o cpuset cpuset /sys/fs/cgroup/cpuset/ +$umount /sys/fs/cgroup/cpuset/ repeatly. + +The race that cause this bug can be shown as below: + +(umount) | (cat /proc//cpuset) +css_release | proc_cpuset_show +css_release_work_fn | css = task_get_css(tsk, cpuset_cgrp_id); +css_free_rwork_fn | cgroup_path_ns(css->cgroup, ...); +cgroup_destroy_root | mutex_lock(&cgroup_mutex); +rebind_subsystems | +cgroup_free_root | + | // cgrp was freed, UAF + | cgroup_path_ns_locked(cgrp,..); + +When the cpuset is initialized, the root node top_cpuset.css.cgrp +will point to &cgrp_dfl_root.cgrp. In cgroup v1, the mount operation will +allocate cgroup_root, and top_cpuset.css.cgrp will point to the allocated +&cgroup_root.cgrp. When the umount operation is executed, +top_cpuset.css.cgrp will be rebound to &cgrp_dfl_root.cgrp. + +The problem is that when rebinding to cgrp_dfl_root, there are cases +where the cgroup_root allocated by setting up the root for cgroup v1 +is cached. This could lead to a Use-After-Free (UAF) if it is +subsequently freed. The descendant cgroups of cgroup v1 can only be +freed after the css is released. However, the css of the root will never +be released, yet the cgroup_root should be freed when it is unmounted. +This means that obtaining a reference to the css of the root does +not guarantee that css.cgrp->root will not be freed. + +Fix this problem by using rcu_read_lock in proc_cpuset_show(). +As cgroup_root is kfree_rcu after commit 331654dc5f40 +("cgroup: Make operations on the cgroup root_list RCU safe"), +css->cgroup won't be freed during the critical section. +To call cgroup_path_ns_locked, css_set_lock is needed, so it is safe to +replace task_get_css with task_css. 
+ +[1] https://syzkaller.appspot.com/bug?extid=9b1ff7be974a403aa4cd + +Fixes: a79a908fd2b0 ("cgroup: introduce cgroup namespaces") +Signed-off-by: Chen Ridong +Signed-off-by: Tejun Heo +Signed-off-by: Sasha Levin + +Conflicts: + kernel/cgroup/cpuset.c +[commit 5715456af3e0 ("kernfs: Convert kernfs_path_from_node_locked() +from strlcpy() to strscpy()") was not merged] +Signed-off-by: Chen Ridong +--- + kernel/cgroup/cpuset.c | 13 +++++++++---- + 1 file changed, 9 insertions(+), 4 deletions(-) + +diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c +index 140dfb5ad3fc..f3cf9b1268e0 100644 +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -21,6 +21,7 @@ + * License. See the file COPYING in the main directory of the Linux + * distribution for more details. + */ ++#include "cgroup-internal.h" + + #include + #include +@@ -5191,10 +5192,14 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, + if (!buf) + goto out; + +- css = task_get_css(tsk, cpuset_cgrp_id); +- retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX, +- current->nsproxy->cgroup_ns); +- css_put(css); ++ rcu_read_lock(); ++ spin_lock_irq(&css_set_lock); ++ css = task_css(tsk, cpuset_cgrp_id); ++ retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX, ++ current->nsproxy->cgroup_ns); ++ spin_unlock_irq(&css_set_lock); ++ rcu_read_unlock(); ++ + if (retval >= PATH_MAX) + retval = -ENAMETOOLONG; + if (retval < 0) +-- +2.25.1 + diff --git a/0021-cgroup-add-more-reserve-kabi.patch b/0021-cgroup-add-more-reserve-kabi.patch new file mode 100644 index 0000000000000000000000000000000000000000..5c0ed0801f0f17360af652b9104c43370874f823 --- /dev/null +++ b/0021-cgroup-add-more-reserve-kabi.patch @@ -0,0 +1,90 @@ +From d68991f87f738657074d93a1ae8ccf865f40b65a Mon Sep 17 00:00:00 2001 +From: Chen Ridong +Date: Wed, 18 Dec 2024 08:11:03 +0000 +Subject: [PATCH 17/17] cgroup: add more reserve kabi + +hulk inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/I8SA3O + +-------------------------------- + +Reserve KABI for future feature development. 
+ +Signed-off-by: Chen Ridong +--- + include/linux/cgroup-defs.h | 7 +++++++ + include/linux/memcontrol.h | 8 ++++++++ + kernel/cgroup/cpuset.c | 5 ----- + 3 files changed, 15 insertions(+), 5 deletions(-) + +diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h +index 8eb518ce87a1..f3fd0407d346 100644 +--- a/include/linux/cgroup-defs.h ++++ b/include/linux/cgroup-defs.h +@@ -325,6 +325,8 @@ struct cgroup_base_stat { + #ifdef CONFIG_SCHED_CORE + u64 forceidle_sum; + #endif ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) + }; + + /* +@@ -555,6 +557,9 @@ struct cgroup { + KABI_RESERVE(3) + KABI_RESERVE(4) + KABI_RESERVE(5) ++ KABI_RESERVE(6) ++ KABI_RESERVE(7) ++ KABI_RESERVE(8) + /* All ancestors including self */ + struct cgroup *ancestors[]; + }; +@@ -606,6 +611,8 @@ struct cgroup_root { + KABI_RESERVE(2) + KABI_RESERVE(3) + KABI_RESERVE(4) ++ KABI_RESERVE(5) ++ KABI_RESERVE(6) + }; + + /* +diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h +index b2a80e089a0a..abe236201e68 100644 +--- a/include/linux/memcontrol.h ++++ b/include/linux/memcontrol.h +@@ -429,6 +429,14 @@ struct mem_cgroup { + KABI_RESERVE(6) + KABI_RESERVE(7) + KABI_RESERVE(8) ++ KABI_RESERVE(9) ++ KABI_RESERVE(10) ++ KABI_RESERVE(11) ++ KABI_RESERVE(12) ++ KABI_RESERVE(13) ++ KABI_RESERVE(14) ++ KABI_RESERVE(15) ++ KABI_RESERVE(16) + struct mem_cgroup_per_node *nodeinfo[]; + }; + +diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c +index f3cf9b1268e0..7ea0a6d00519 100644 +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -211,11 +211,6 @@ struct cpuset { + + /* Remote partition silbling list anchored at remote_children */ + struct list_head remote_sibling; +- +- KABI_RESERVE(1) +- KABI_RESERVE(2) +- KABI_RESERVE(3) +- KABI_RESERVE(4) + }; + + /* +-- +2.25.1 + diff --git a/0022-14223.patch b/0022-14223.patch new file mode 100644 index 0000000000000000000000000000000000000000..b10342738af488cadd2bab4ffd42010e767b5f41 --- /dev/null +++ b/0022-14223.patch @@ -0,0 +1,80 @@ +From f8cb61566576a623971d5cc8dd3cd6229e787e30 Mon Sep 17 00:00:00 2001 +From: Zhang Changzhong +Date: Wed, 18 Dec 2024 17:50:29 +0800 +Subject: [PATCH] kabi: net: reserve space for xdp subsystem related structure + +hulk inclusion +category: other +bugzilla: https://gitee.com/openeuler/kernel/issues/I8OWRC + +---------------------------------------------------- + +Reserve some fields beforehand for xdp framework related structures +prone to change. 
+ +Signed-off-by: Zhang Changzhong +--- + include/net/xdp.h | 19 +++++++++++++++++++ + 1 file changed, 19 insertions(+) + +diff --git a/include/net/xdp.h b/include/net/xdp.h +index c283668458ca..9b9c7dc25eeb 100644 +--- a/include/net/xdp.h ++++ b/include/net/xdp.h +@@ -54,6 +54,9 @@ enum xdp_mem_type { + struct xdp_mem_info { + u32 type; /* enum xdp_mem_type, but known size type */ + u32 id; ++ ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) + }; + + struct page_pool; +@@ -74,6 +77,9 @@ struct xdp_rxq_info { + + struct xdp_txq_info { + struct net_device *dev; ++ ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) + }; + + enum xdp_buff_flags { +@@ -92,6 +98,11 @@ struct xdp_buff { + struct xdp_txq_info *txq; + u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/ + u32 flags; /* supported values defined in xdp_buff_flags */ ++ ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + + static __always_inline bool xdp_buff_has_frags(struct xdp_buff *xdp) +@@ -181,6 +192,11 @@ struct xdp_frame { + struct net_device *dev_rx; /* used by cpumap */ + u32 frame_sz; + u32 flags; /* supported values defined in xdp_buff_flags */ ++ ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + + static __always_inline bool xdp_frame_has_frags(struct xdp_frame *frame) +@@ -198,6 +214,9 @@ struct xdp_frame_bulk { + int count; + void *xa; + void *q[XDP_BULK_QUEUE_SIZE]; ++ ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) + }; + + static __always_inline void xdp_frame_bulk_init(struct xdp_frame_bulk *bq) +-- +Gitee + diff --git a/0023-14224.patch b/0023-14224.patch new file mode 100644 index 0000000000000000000000000000000000000000..62ba017f20debc62800306679fc06b1265b58aef --- /dev/null +++ b/0023-14224.patch @@ -0,0 +1,85 @@ +From a2bbb3a7e3d30f5efc443fa17fcfe20fdd5a98d5 Mon Sep 17 00:00:00 2001 +From: Dong Chenchen +Date: Wed, 18 Dec 2024 17:15:36 +0800 +Subject: [PATCH] net/kabi: Reserve space for net structures + +hulk inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC1RH + +-------------------------------- + +Reserve some fields beforehand for net subsystem related +structures prone to change. 
+ +Signed-off-by: Dong Chenchen +--- + include/net/flow.h | 2 ++ + include/net/netns/netfilter.h | 2 ++ + include/net/netns/xfrm.h | 2 ++ + include/net/xfrm.h | 4 ++++ + 4 files changed, 10 insertions(+) + +diff --git a/include/net/flow.h b/include/net/flow.h +index 0cc5f2ef1000..72d2ea2374ba 100644 +--- a/include/net/flow.h ++++ b/include/net/flow.h +@@ -46,6 +46,8 @@ struct flowi_common { + + KABI_RESERVE(1) + KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + + union flowi_uli { +diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h +index 4b77a9b031b6..963588269637 100644 +--- a/include/net/netns/netfilter.h ++++ b/include/net/netns/netfilter.h +@@ -34,5 +34,7 @@ struct netns_nf { + + KABI_RESERVE(1) + KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + #endif +diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h +index a0c1359cc7eb..af7f20ef4823 100644 +--- a/include/net/netns/xfrm.h ++++ b/include/net/netns/xfrm.h +@@ -87,6 +87,8 @@ struct netns_xfrm { + + KABI_RESERVE(1) + KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + + #endif +diff --git a/include/net/xfrm.h b/include/net/xfrm.h +index c875faf98492..b9dec5f9c973 100644 +--- a/include/net/xfrm.h ++++ b/include/net/xfrm.h +@@ -294,6 +294,8 @@ struct xfrm_state { + + KABI_RESERVE(1) + KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + + static inline struct net *xs_net(struct xfrm_state *x) +@@ -562,6 +564,8 @@ struct xfrm_policy { + + KABI_RESERVE(1) + KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + + static inline struct net *xp_net(const struct xfrm_policy *xp) +-- +Gitee + diff --git a/0024-14225.patch b/0024-14225.patch new file mode 100644 index 0000000000000000000000000000000000000000..32a10378e03c63d02d8559123ce5db0f5df1bc1f --- /dev/null +++ b/0024-14225.patch @@ -0,0 +1,154 @@ +From 279803fa98908bd367cec04ae2600c15764fb977 Mon Sep 17 00:00:00 2001 +From: Luo Gengkun +Date: Wed, 18 Dec 2024 09:45:31 +0000 +Subject: [PATCH 1/3] kabi: reserve space for perf_event.h + +hulk inclusion +category: feature +bugzilla: https://gitee.com/src-openeuler/kernel/issues/IBC1PM + +-------------------------------- + +reserve space for perf_event.h + +Signed-off-by: Luo Gengkun +--- + include/linux/perf_event.h | 16 ++++++++++++++++ + 1 file changed, 16 insertions(+) + +diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h +index 89f2a02db563..fe692e9bd0b2 100644 +--- a/include/linux/perf_event.h ++++ b/include/linux/perf_event.h +@@ -1010,6 +1010,14 @@ struct perf_cpu_pmu_context { + struct hrtimer hrtimer; + ktime_t hrtimer_interval; + unsigned int hrtimer_active; ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) ++ KABI_RESERVE(5) ++ KABI_RESERVE(6) ++ KABI_RESERVE(7) ++ KABI_RESERVE(8) + }; + + /** +@@ -1031,6 +1039,14 @@ struct perf_cpu_context { + int heap_size; + struct perf_event **heap; + struct perf_event *heap_default[2]; ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) ++ KABI_RESERVE(5) ++ KABI_RESERVE(6) ++ KABI_RESERVE(7) ++ KABI_RESERVE(8) + }; + + struct perf_output_handle { +-- +Gitee + + +From 078ad81846b81844eb98f90eee57c06954715c8d Mon Sep 17 00:00:00 2001 +From: Luo Gengkun +Date: Wed, 18 Dec 2024 09:45:32 +0000 +Subject: [PATCH 2/3] kabi: reserve space for internal.h + +hulk inclusion +category: feature +bugzilla: https://gitee.com/src-openeuler/kernel/issues/IBC1PM + +-------------------------------- + +reserve space for internal.h + +Signed-off-by: Luo Gengkun 
+--- + kernel/events/internal.h | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +diff --git a/kernel/events/internal.h b/kernel/events/internal.h +index d2e6e6144c54..d1ffa00b91b6 100644 +--- a/kernel/events/internal.h ++++ b/kernel/events/internal.h +@@ -5,6 +5,7 @@ + #include + #include + #include ++#include + + /* Buffer handling */ + +@@ -54,6 +55,15 @@ struct perf_buffer { + void **aux_pages; + void *aux_priv; + ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) ++ KABI_RESERVE(5) ++ KABI_RESERVE(6) ++ KABI_RESERVE(7) ++ KABI_RESERVE(8) ++ + struct perf_event_mmap_page *user_page; + void *data_pages[]; + }; +-- +Gitee + + +From 59a2a3e8b1c35d9e0bde08cd2e6f01f1c12d384b Mon Sep 17 00:00:00 2001 +From: Luo Gengkun +Date: Wed, 18 Dec 2024 09:45:33 +0000 +Subject: [PATCH 3/3] kabi: reserve space for uprobes.h + +hulk inclusion +category: feature +bugzilla: https://gitee.com/src-openeuler/kernel/issues/IBC1PM + +-------------------------------- + +reserve space for uprobes.h + +Signed-off-by: Luo Gengkun +--- + include/linux/uprobes.h | 9 +++++++++ + 1 file changed, 9 insertions(+) + +diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h +index f46e0ca0169c..86d0868b584a 100644 +--- a/include/linux/uprobes.h ++++ b/include/linux/uprobes.h +@@ -47,6 +47,7 @@ struct uprobe_consumer { + + #ifdef CONFIG_UPROBES + #include ++#include + + enum uprobe_task_state { + UTASK_RUNNING, +@@ -78,6 +79,14 @@ struct uprobe_task { + + struct return_instance *return_instances; + unsigned int depth; ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) ++ KABI_RESERVE(5) ++ KABI_RESERVE(6) ++ KABI_RESERVE(7) ++ KABI_RESERVE(8) + }; + + struct return_instance { +-- +Gitee + diff --git a/0025-14226.patch b/0025-14226.patch new file mode 100644 index 0000000000000000000000000000000000000000..172bd62c1e76e56a4510255952f89b2dca1d4740 --- /dev/null +++ b/0025-14226.patch @@ -0,0 +1,3685 @@ +From d1c833cfcc6661276386ef005382f6cd817ade5f Mon Sep 17 00:00:00 2001 +From: Kemeng Shi +Date: Wed, 18 Dec 2024 17:34:44 +0800 +Subject: [PATCH 01/19] mm/page_alloc: remove unnecessary check in + break_down_buddy_pages + +mainline inclusion +from mainline-v6.7-rc1 +commit 27e0db3c21aaf1422980e64b77956e15b839306f +category: cleanup +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC4T0 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=27e0db3c21aaf1422980e64b77956e15b839306f + +-------------------------------- + +Patch series "Two minor cleanups to break_down_buddy_pages", v2. + +Two minor cleanups to break_down_buddy_pages. + +This patch (of 2): + +1. We always have target in range started with next_page and full free + range started with current_buddy. + +2. The last split range size is 1 << low and low should be >= 0, then + size >= 1. So page + size != page is always true (because size > 0). + As summary, current_page will not equal to target page. 
+ +Link: https://lkml.kernel.org/r/20230927103514.98281-1-shikemeng@huaweicloud.com +Link: https://lkml.kernel.org/r/20230927103514.98281-2-shikemeng@huaweicloud.com +Signed-off-by: Kemeng Shi +Acked-by: Naoya Horiguchi +Cc: Matthew Wilcox (Oracle) +Cc: Oscar Salvador +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/page_alloc.c | 6 ++---- + 1 file changed, 2 insertions(+), 4 deletions(-) + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 36cd38df0614..fb6008b30b48 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -6955,10 +6955,8 @@ static void break_down_buddy_pages(struct zone *zone, struct page *page, + if (set_page_guard(zone, current_buddy, high, migratetype)) + continue; + +- if (current_buddy != target) { +- add_to_free_list(current_buddy, zone, high, migratetype); +- set_buddy_order(current_buddy, high); +- } ++ add_to_free_list(current_buddy, zone, high, migratetype); ++ set_buddy_order(current_buddy, high); + } + } + +-- +Gitee + + +From f3e36dbf45c6a413f85c6d41a84565111728030d Mon Sep 17 00:00:00 2001 +From: Kemeng Shi +Date: Wed, 18 Dec 2024 17:34:45 +0800 +Subject: [PATCH 02/19] mm/page_alloc: remove unnecessary next_page in + break_down_buddy_pages + +mainline inclusion +from mainline-v6.7-rc1 +commit 0dfca313a009c83e2ad44b3719dc1222df6c6db5 +category: cleanup +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC4T0 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0dfca313a009c83e2ad44b3719dc1222df6c6db5 + +-------------------------------- + +The next_page is only used to forward page in case target is in second +half range. Move forward page directly to remove unnecessary next_page. + +Link: https://lkml.kernel.org/r/20230927103514.98281-3-shikemeng@huaweicloud.com +Signed-off-by: Kemeng Shi +Acked-by: Naoya Horiguchi +Cc: Matthew Wilcox (Oracle) +Cc: Oscar Salvador +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/page_alloc.c | 6 ++---- + 1 file changed, 2 insertions(+), 4 deletions(-) + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index fb6008b30b48..3cc5d5c7826e 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -6937,20 +6937,18 @@ static void break_down_buddy_pages(struct zone *zone, struct page *page, + int migratetype) + { + unsigned long size = 1 << high; +- struct page *current_buddy, *next_page; ++ struct page *current_buddy; + + while (high > low) { + high--; + size >>= 1; + + if (target >= &page[size]) { +- next_page = page + size; + current_buddy = page; ++ page = page + size; + } else { +- next_page = page; + current_buddy = page + size; + } +- page = next_page; + + if (set_page_guard(zone, current_buddy, high, migratetype)) + continue; +-- +Gitee + + +From 8a91855c80870c36e7d5f540e502b42716512680 Mon Sep 17 00:00:00 2001 +From: Yajun Deng +Date: Wed, 18 Dec 2024 17:34:46 +0800 +Subject: [PATCH 03/19] mm: page_alloc: simplify __free_pages_ok() + +mainline inclusion +from mainline-v6.8-rc1 +commit 250ae189d98290d0539b4f9b8c4703e0bf24f9d3 +category: cleanup +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC4T0 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=250ae189d98290d0539b4f9b8c4703e0bf24f9d3 + +-------------------------------- + +There is redundant code in __free_pages_ok(). Use free_one_page() +simplify it. 
+ +Link: https://lkml.kernel.org/r/20231216030503.2126130-1-yajun.deng@linux.dev +Signed-off-by: Yajun Deng +Reviewed-by: Matthew Wilcox (Oracle) +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/page_alloc.c | 9 +-------- + 1 file changed, 1 insertion(+), 8 deletions(-) + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 3cc5d5c7826e..ff0940ab0fe6 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -1258,7 +1258,6 @@ static void free_one_page(struct zone *zone, + static void __free_pages_ok(struct page *page, unsigned int order, + fpi_t fpi_flags) + { +- unsigned long flags; + int migratetype; + unsigned long pfn = page_to_pfn(page); + struct zone *zone = page_zone(page); +@@ -1273,13 +1272,7 @@ static void __free_pages_ok(struct page *page, unsigned int order, + */ + migratetype = get_pfnblock_migratetype(page, pfn); + +- spin_lock_irqsave(&zone->lock, flags); +- if (unlikely(has_isolate_pageblock(zone) || +- is_migrate_isolate(migratetype))) { +- migratetype = get_pfnblock_migratetype(page, pfn); +- } +- __free_one_page(page, pfn, zone, order, migratetype, fpi_flags); +- spin_unlock_irqrestore(&zone->lock, flags); ++ free_one_page(zone, page, pfn, order, migratetype, fpi_flags); + + __count_vm_events(PGFREE, 1 << order); + } +-- +Gitee + + +From e30777461ce931191902c5d35263b9a3d23b1de7 Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Wed, 18 Dec 2024 17:34:47 +0800 +Subject: [PATCH 04/19] mm: page_alloc: remove pcppage migratetype caching + +mainline inclusion +from mainline-v6.10-rc1 +commit 17edeb5d3f761c20fd28f6002f5a9faa53c0a0d8 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC4T0 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=17edeb5d3f761c20fd28f6002f5a9faa53c0a0d8 + +-------------------------------- + +Patch series "mm: page_alloc: freelist migratetype hygiene", v4. + +The page allocator's mobility grouping is intended to keep unmovable pages +separate from reclaimable/compactable ones to allow on-demand +defragmentation for higher-order allocations and huge pages. + +Currently, there are several places where accidental type mixing occurs: +an allocation asks for a page of a certain migratetype and receives +another. This ruins pageblocks for compaction, which in turn makes +allocating huge pages more expensive and less reliable. + +The series addresses those causes. The last patch adds type checks on all +freelist movements to prevent new violations being introduced. + +The benefits can be seen in a mixed workload that stresses the machine +with a memcache-type workload and a kernel build job while periodically +attempting to allocate batches of THP. 
The following data is aggregated +over 50 consecutive defconfig builds: + + VANILLA PATCHED +Hugealloc Time mean 165843.93 ( +0.00%) 113025.88 ( -31.85%) +Hugealloc Time stddev 158957.35 ( +0.00%) 114716.07 ( -27.83%) +Kbuild Real time 310.24 ( +0.00%) 300.73 ( -3.06%) +Kbuild User time 1271.13 ( +0.00%) 1259.42 ( -0.92%) +Kbuild System time 582.02 ( +0.00%) 559.79 ( -3.81%) +THP fault alloc 30585.14 ( +0.00%) 40853.62 ( +33.57%) +THP fault fallback 36626.46 ( +0.00%) 26357.62 ( -28.04%) +THP fault fail rate % 54.49 ( +0.00%) 39.22 ( -27.53%) +Pagealloc fallback 1328.00 ( +0.00%) 1.00 ( -99.85%) +Pagealloc type mismatch 181009.50 ( +0.00%) 0.00 ( -100.00%) +Direct compact stall 434.56 ( +0.00%) 257.66 ( -40.61%) +Direct compact fail 421.70 ( +0.00%) 249.94 ( -40.63%) +Direct compact success 12.86 ( +0.00%) 7.72 ( -37.09%) +Direct compact success rate % 2.86 ( +0.00%) 2.82 ( -0.96%) +Compact daemon scanned migrate 3370059.62 ( +0.00%) 3612054.76 ( +7.18%) +Compact daemon scanned free 7718439.20 ( +0.00%) 5386385.02 ( -30.21%) +Compact direct scanned migrate 309248.62 ( +0.00%) 176721.04 ( -42.85%) +Compact direct scanned free 433582.84 ( +0.00%) 315727.66 ( -27.18%) +Compact migrate scanned daemon % 91.20 ( +0.00%) 94.48 ( +3.56%) +Compact free scanned daemon % 94.58 ( +0.00%) 94.42 ( -0.16%) +Compact total migrate scanned 3679308.24 ( +0.00%) 3788775.80 ( +2.98%) +Compact total free scanned 8152022.04 ( +0.00%) 5702112.68 ( -30.05%) +Alloc stall 872.04 ( +0.00%) 5156.12 ( +490.71%) +Pages kswapd scanned 510645.86 ( +0.00%) 3394.94 ( -99.33%) +Pages kswapd reclaimed 134811.62 ( +0.00%) 2701.26 ( -98.00%) +Pages direct scanned 99546.06 ( +0.00%) 376407.52 ( +278.12%) +Pages direct reclaimed 62123.40 ( +0.00%) 289535.70 ( +366.06%) +Pages total scanned 610191.92 ( +0.00%) 379802.46 ( -37.76%) +Pages scanned kswapd % 76.36 ( +0.00%) 0.10 ( -98.58%) +Swap out 12057.54 ( +0.00%) 15022.98 ( +24.59%) +Swap in 209.16 ( +0.00%) 256.48 ( +22.52%) +File refaults 17701.64 ( +0.00%) 11765.40 ( -33.53%) + +Huge page success rate is higher, allocation latencies are shorter and +more predictable. + +Stealing (fallback) rate is drastically reduced. Notably, while the +vanilla kernel keeps doing fallbacks on an ongoing basis, the patched +kernel enters a steady state once the distribution of block types is +adequate for the workload. Steals over 50 runs: + +VANILLA PATCHED +1504.0 227.0 +1557.0 6.0 +1391.0 13.0 +1080.0 26.0 +1057.0 40.0 +1156.0 6.0 +805.0 46.0 +736.0 20.0 +1747.0 2.0 +1699.0 34.0 +1269.0 13.0 +1858.0 12.0 +907.0 4.0 +727.0 2.0 +563.0 2.0 +3094.0 2.0 +10211.0 3.0 +2621.0 1.0 +5508.0 2.0 +1060.0 2.0 +538.0 3.0 +5773.0 2.0 +2199.0 0.0 +3781.0 2.0 +1387.0 1.0 +4977.0 0.0 +2865.0 1.0 +1814.0 1.0 +3739.0 1.0 +6857.0 0.0 +382.0 0.0 +407.0 1.0 +3784.0 0.0 +297.0 0.0 +298.0 0.0 +6636.0 0.0 +4188.0 0.0 +242.0 0.0 +9960.0 0.0 +5816.0 0.0 +354.0 0.0 +287.0 0.0 +261.0 0.0 +140.0 1.0 +2065.0 0.0 +312.0 0.0 +331.0 0.0 +164.0 0.0 +465.0 1.0 +219.0 0.0 + +Type mismatches are down too. Those count every time an allocation +request asks for one migratetype and gets another. 
This can still occur +minimally in the patched kernel due to non-stealing fallbacks, but it's +quite rare and follows the pattern of overall fallbacks - once the block +type distribution settles, mismatches cease as well: + +VANILLA: PATCHED: +182602.0 268.0 +135794.0 20.0 +88619.0 19.0 +95973.0 0.0 +129590.0 0.0 +129298.0 0.0 +147134.0 0.0 +230854.0 0.0 +239709.0 0.0 +137670.0 0.0 +132430.0 0.0 +65712.0 0.0 +57901.0 0.0 +67506.0 0.0 +63565.0 4.0 +34806.0 0.0 +42962.0 0.0 +32406.0 0.0 +38668.0 0.0 +61356.0 0.0 +57800.0 0.0 +41435.0 0.0 +83456.0 0.0 +65048.0 0.0 +28955.0 0.0 +47597.0 0.0 +75117.0 0.0 +55564.0 0.0 +38280.0 0.0 +52404.0 0.0 +26264.0 0.0 +37538.0 0.0 +19671.0 0.0 +30936.0 0.0 +26933.0 0.0 +16962.0 0.0 +44554.0 0.0 +46352.0 0.0 +24995.0 0.0 +35152.0 0.0 +12823.0 0.0 +21583.0 0.0 +18129.0 0.0 +31693.0 0.0 +28745.0 0.0 +33308.0 0.0 +31114.0 0.0 +35034.0 0.0 +12111.0 0.0 +24885.0 0.0 + +Compaction work is markedly reduced despite much better THP rates. + +In the vanilla kernel, reclaim seems to have been driven primarily by +watermark boosting that happens as a result of fallbacks. With those all +but eliminated, watermarks average lower and kswapd does less work. The +uptick in direct reclaim is because THP requests have to fend for +themselves more often - which is intended policy right now. Aggregate +reclaim activity is lowered significantly, though. + +This patch (of 10): + +The idea behind the cache is to save get_pageblock_migratetype() lookups +during bulk freeing. A microbenchmark suggests this isn't helping, +though. The pcp migratetype can get stale, which means that bulk freeing +has an extra branch to check if the pageblock was isolated while on the +pcp. + +While the variance overlaps, the cache write and the branch seem to make +this a net negative. The following test allocates and frees batches of +10,000 pages (~3x the pcp high marks to trigger flushing): + +Before: + 8,668.48 msec task-clock # 99.735 CPUs utilized ( +- 2.90% ) + 19 context-switches # 4.341 /sec ( +- 3.24% ) + 0 cpu-migrations # 0.000 /sec + 17,440 page-faults # 3.984 K/sec ( +- 2.90% ) + 41,758,692,473 cycles # 9.541 GHz ( +- 2.90% ) + 126,201,294,231 instructions # 5.98 insn per cycle ( +- 2.90% ) + 25,348,098,335 branches # 5.791 G/sec ( +- 2.90% ) + 33,436,921 branch-misses # 0.26% of all branches ( +- 2.90% ) + + 0.0869148 +- 0.0000302 seconds time elapsed ( +- 0.03% ) + +After: + 8,444.81 msec task-clock # 99.726 CPUs utilized ( +- 2.90% ) + 22 context-switches # 5.160 /sec ( +- 3.23% ) + 0 cpu-migrations # 0.000 /sec + 17,443 page-faults # 4.091 K/sec ( +- 2.90% ) + 40,616,738,355 cycles # 9.527 GHz ( +- 2.90% ) + 126,383,351,792 instructions # 6.16 insn per cycle ( +- 2.90% ) + 25,224,985,153 branches # 5.917 G/sec ( +- 2.90% ) + 32,236,793 branch-misses # 0.25% of all branches ( +- 2.90% ) + + 0.0846799 +- 0.0000412 seconds time elapsed ( +- 0.05% ) + +A side effect is that this also ensures that pages whose pageblock gets +stolen while on the pcplist end up on the right freelist and we don't +perform potentially type-incompatible buddy merges (or skip merges when we +shouldn't), which is likely beneficial to long-term fragmentation +management, although the effects would be harder to measure. Settle for +simpler and faster code as justification here. 
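+
+For reference, a minimal sketch of the kind of test loop the numbers above
+describe. This is illustrative only: the benchmark harness is not part of
+this patch, and the helper name below is made up. It simply allocates and
+frees ~10,000 order-0 pages (the ~3x-pcp-high batch quoted above) so that
+free_pcppages_bulk() is exercised:
+
+    #include <linux/gfp.h>
+    #include <linux/mm.h>
+
+    /* Allocate and free a batch large enough to overflow the pcp lists. */
+    static void pcp_flush_round(void)
+    {
+            static struct page *pages[10000];
+            int i;
+
+            for (i = 0; i < 10000; i++)
+                    pages[i] = alloc_page(GFP_KERNEL);
+            for (i = 0; i < 10000; i++)
+                    if (pages[i])
+                            __free_page(pages[i]);
+    }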
+ +Link: https://lkml.kernel.org/r/20240320180429.678181-1-hannes@cmpxchg.org +Link: https://lkml.kernel.org/r/20240320180429.678181-2-hannes@cmpxchg.org +Signed-off-by: Johannes Weiner +Acked-by: Zi Yan +Reviewed-by: Vlastimil Babka +Acked-by: Mel Gorman +Tested-by: "Huang, Ying" +Tested-by: Baolin Wang +Cc: David Hildenbrand +Signed-off-by: Andrew Morton +Conflicts: + mm/page_alloc.c +[ Context conflicts with commit 62b208c4859c and ae577de78c12. ] +Signed-off-by: Liu Shixin +--- + mm/page_alloc.c | 66 +++++++++++-------------------------------------- + 1 file changed, 14 insertions(+), 52 deletions(-) + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index ff0940ab0fe6..3d4932cd2332 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -207,24 +207,6 @@ EXPORT_SYMBOL(node_states); + + gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; + +-/* +- * A cached value of the page's pageblock's migratetype, used when the page is +- * put on a pcplist. Used to avoid the pageblock migratetype lookup when +- * freeing from pcplists in most cases, at the cost of possibly becoming stale. +- * Also the migratetype set in the page does not necessarily match the pcplist +- * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any +- * other index - this ensures that it will be put on the correct CMA freelist. +- */ +-static inline int get_pcppage_migratetype(struct page *page) +-{ +- return page->index; +-} +- +-static inline void set_pcppage_migratetype(struct page *page, int migratetype) +-{ +- page->index = migratetype; +-} +- + #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE + unsigned int pageblock_order __read_mostly; + #endif +@@ -1186,7 +1168,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, + { + unsigned long flags; + unsigned int order; +- bool isolated_pageblocks; + struct page *page; + + /* +@@ -1199,7 +1180,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, + pindex = pindex - 1; + + spin_lock_irqsave(&zone->lock, flags); +- isolated_pageblocks = has_isolate_pageblock(zone); + + while (count > 0) { + struct list_head *list; +@@ -1215,23 +1195,19 @@ static void free_pcppages_bulk(struct zone *zone, int count, + order = pindex_to_order(pindex); + nr_pages = 1 << order; + do { ++ unsigned long pfn; + int mt; + + page = list_last_entry(list, struct page, pcp_list); +- mt = get_pcppage_migratetype(page); ++ pfn = page_to_pfn(page); ++ mt = get_pfnblock_migratetype(page, pfn); + + /* must delete to avoid corrupting pcp list */ + list_del(&page->pcp_list); + count -= nr_pages; + pcp->count -= nr_pages; + +- /* MIGRATE_ISOLATE page should not go to pcplists */ +- VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); +- /* Pageblock could have been isolated meanwhile */ +- if (unlikely(isolated_pageblocks)) +- mt = get_pageblock_migratetype(page); +- +- __free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE); ++ __free_one_page(page, pfn, zone, order, mt, FPI_NONE); + trace_mm_page_pcpu_drain(page, order, mt); + } while (count > 0 && !list_empty(list)); + } +@@ -1591,7 +1567,6 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, + continue; + del_page_from_free_list(page, zone, current_order); + expand(zone, page, order, current_order, migratetype); +- set_pcppage_migratetype(page, migratetype); + trace_mm_page_alloc_zone_locked(page, order, migratetype, + pcp_allowed_order(order) && + migratetype < MIGRATE_PCPTYPES); +@@ -2162,7 +2137,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, + * pages are ordered 
properly. + */ + list_add_tail(&page->pcp_list, list); +- if (is_migrate_cma(get_pcppage_migratetype(page))) ++ if (is_migrate_cma(get_pageblock_migratetype(page))) + __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, + -(1 << order)); + } +@@ -2362,19 +2337,6 @@ void drain_all_pages(struct zone *zone) + __drain_all_pages(zone, false); + } + +-static bool free_unref_page_prepare(struct page *page, unsigned long pfn, +- unsigned int order) +-{ +- int migratetype; +- +- if (!free_pages_prepare(page, order)) +- return false; +- +- migratetype = get_pfnblock_migratetype(page, pfn); +- set_pcppage_migratetype(page, migratetype); +- return true; +-} +- + static int nr_pcp_free(struct per_cpu_pages *pcp, int batch, int high, bool free_high) + { + int min_nr_free, max_nr_free; +@@ -2517,7 +2479,7 @@ void free_unref_page(struct page *page, unsigned int order) + return; + } + +- if (!free_unref_page_prepare(page, pfn, order)) ++ if (!free_pages_prepare(page, order)) + return; + + /* +@@ -2527,7 +2489,7 @@ void free_unref_page(struct page *page, unsigned int order) + * get those areas back if necessary. Otherwise, we may have to free + * excessively into the page allocator + */ +- migratetype = pcpmigratetype = get_pcppage_migratetype(page); ++ migratetype = pcpmigratetype = get_pfnblock_migratetype(page, pfn); + if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { + if (unlikely(is_migrate_isolate(migratetype))) { + free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE); +@@ -2570,14 +2532,14 @@ void free_unref_folios(struct folio_batch *folios) + } + + folio_undo_large_rmappable(folio); +- if (!free_unref_page_prepare(&folio->page, pfn, order)) ++ if (!free_pages_prepare(&folio->page, order)) + continue; + + /* + * Free isolated folios and orders not handled on the PCP + * directly to the allocator, see comment in free_unref_page. 
+ */ +- migratetype = get_pcppage_migratetype(&folio->page); ++ migratetype = get_pfnblock_migratetype(&folio->page, pfn); + if (!pcp_allowed_order(order) || + is_migrate_isolate(migratetype)) { + free_one_page(folio_zone(folio), &folio->page, pfn, +@@ -2594,10 +2556,11 @@ void free_unref_folios(struct folio_batch *folios) + for (i = 0; i < folios->nr; i++) { + struct folio *folio = folios->folios[i]; + struct zone *zone = folio_zone(folio); ++ unsigned long pfn = folio_pfn(folio); + unsigned int order = (unsigned long)folio->private; + + folio->private = NULL; +- migratetype = get_pcppage_migratetype(&folio->page); ++ migratetype = get_pfnblock_migratetype(&folio->page, pfn); + + /* Different zone requires a different pcp lock */ + if (zone != locked_zone) { +@@ -2614,9 +2577,8 @@ void free_unref_folios(struct folio_batch *folios) + pcp = pcp_spin_trylock(zone->per_cpu_pageset); + if (unlikely(!pcp)) { + pcp_trylock_finish(UP_flags); +- free_one_page(zone, &folio->page, +- folio_pfn(folio), order, +- migratetype, FPI_NONE); ++ free_one_page(zone, &folio->page, pfn, ++ order, migratetype, FPI_NONE); + locked_zone = NULL; + continue; + } +@@ -2785,7 +2747,7 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, + } + } + __mod_zone_freepage_state(zone, -(1 << order), +- get_pcppage_migratetype(page)); ++ get_pageblock_migratetype(page)); + spin_unlock_irqrestore(&zone->lock, flags); + } while (check_new_pages(page, order)); + +-- +Gitee + + +From b1ab4c1538a8daf9ec62d7464b039bd2231a50c3 Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Wed, 18 Dec 2024 17:34:48 +0800 +Subject: [PATCH 05/19] mm: page_alloc: optimize free_unref_folios() + +mainline inclusion +from mainline-v6.10-rc1 +commit 9cbe97bad5cd75b5b493734bd2695febb8e95281 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC4T0 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9cbe97bad5cd75b5b493734bd2695febb8e95281 + +-------------------------------- + +Move direct freeing of isolated pages to the lock-breaking block in the +second loop. This saves an unnecessary migratetype reassessment. + +Minor comment and local variable scoping cleanups. + +Link: https://lkml.kernel.org/r/20240320180429.678181-3-hannes@cmpxchg.org +Signed-off-by: Johannes Weiner +Suggested-by: Vlastimil Babka +Tested-by: "Huang, Ying" +Reviewed-by: Vlastimil Babka +Tested-by: Baolin Wang +Cc: David Hildenbrand +Cc: Mel Gorman +Cc: Zi Yan +Signed-off-by: Andrew Morton +Conflicts: + mm/page_alloc.c +[ Context conflict with commit ae577de78c12. ] +Signed-off-by: Liu Shixin +--- + mm/page_alloc.c | 32 +++++++++++++++++++++++--------- + 1 file changed, 23 insertions(+), 9 deletions(-) + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 3d4932cd2332..dad180df69da 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -2518,7 +2518,7 @@ void free_unref_folios(struct folio_batch *folios) + unsigned long __maybe_unused UP_flags; + struct per_cpu_pages *pcp = NULL; + struct zone *locked_zone = NULL; +- int i, j, migratetype; ++ int i, j; + + /* Prepare folios for freeing */ + for (i = 0, j = 0; i < folios->nr; i++) { +@@ -2534,14 +2534,15 @@ void free_unref_folios(struct folio_batch *folios) + folio_undo_large_rmappable(folio); + if (!free_pages_prepare(&folio->page, order)) + continue; +- + /* +- * Free isolated folios and orders not handled on the PCP +- * directly to the allocator, see comment in free_unref_page. 
++ * Free orders not handled on the PCP directly to the ++ * allocator. + */ +- migratetype = get_pfnblock_migratetype(&folio->page, pfn); +- if (!pcp_allowed_order(order) || +- is_migrate_isolate(migratetype)) { ++ if (!pcp_allowed_order(order)) { ++ int migratetype; ++ ++ migratetype = get_pfnblock_migratetype(&folio->page, ++ pfn); + free_one_page(folio_zone(folio), &folio->page, pfn, + order, migratetype, FPI_NONE); + continue; +@@ -2558,15 +2559,29 @@ void free_unref_folios(struct folio_batch *folios) + struct zone *zone = folio_zone(folio); + unsigned long pfn = folio_pfn(folio); + unsigned int order = (unsigned long)folio->private; ++ int migratetype; + + folio->private = NULL; + migratetype = get_pfnblock_migratetype(&folio->page, pfn); + + /* Different zone requires a different pcp lock */ +- if (zone != locked_zone) { ++ if (zone != locked_zone || ++ is_migrate_isolate(migratetype)) { + if (pcp) { + pcp_spin_unlock(pcp); + pcp_trylock_finish(UP_flags); ++ locked_zone = NULL; ++ pcp = NULL; ++ } ++ ++ /* ++ * Free isolated pages directly to the ++ * allocator, see comment in free_unref_page. ++ */ ++ if (is_migrate_isolate(migratetype)) { ++ free_one_page(zone, &folio->page, pfn, ++ order, migratetype, FPI_NONE); ++ continue; + } + + /* +@@ -2579,7 +2594,6 @@ void free_unref_folios(struct folio_batch *folios) + pcp_trylock_finish(UP_flags); + free_one_page(zone, &folio->page, pfn, + order, migratetype, FPI_NONE); +- locked_zone = NULL; + continue; + } + locked_zone = zone; +-- +Gitee + + +From e9e8af9de46a8bdb6b9d79156f35e8d12e3a62a7 Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Wed, 18 Dec 2024 17:34:49 +0800 +Subject: [PATCH 06/19] mm: page_alloc: fix up block types when merging + compatible blocks + +mainline inclusion +from mainline-v6.10-rc1 +commit e6cf9e1c4cde8a53385423ecb8ca581097f42e02 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC4T0 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e6cf9e1c4cde8a53385423ecb8ca581097f42e02 + +-------------------------------- + +The buddy allocator coalesces compatible blocks during freeing, but it +doesn't update the types of the subblocks to match. When an allocation +later breaks the chunk down again, its pieces will be put on freelists of +the wrong type. This encourages incompatible page mixing (ask for one +type, get another), and thus long-term fragmentation. + +Update the subblocks when merging a larger chunk, such that a later +expand() will maintain freelist type hygiene. 
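+
+As a concrete illustration (hypothetical pfns, assuming pageblock_order == 9,
+i.e. 512-page blocks): an order-9 MOVABLE buddy at pfn 0x1000 is freed and
+merges with a free order-9 UNMOVABLE buddy at pfn 0x1200 into an order-10
+page on the MOVABLE freelist. Without this fix the 0x1200 block keeps its
+UNMOVABLE type, so when an allocation later splits that order-10 page,
+expand() puts the 0x1200 half back on MOVABLE freelists while its pageblock
+still says UNMOVABLE, which is exactly the incompatible mixing described
+above. Matching the buddy's pageblock type before merging keeps the later
+expand() consistent.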
+ +Link: https://lkml.kernel.org/r/20240320180429.678181-4-hannes@cmpxchg.org +Signed-off-by: Johannes Weiner +Reviewed-by: Zi Yan +Reviewed-by: Vlastimil Babka +Acked-by: Mel Gorman +Tested-by: "Huang, Ying" +Tested-by: Baolin Wang +Cc: David Hildenbrand +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/page_alloc.c | 15 +++++++++++---- + 1 file changed, 11 insertions(+), 4 deletions(-) + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index dad180df69da..3d7e0f110868 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -779,10 +779,17 @@ static inline void __free_one_page(struct page *page, + */ + int buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn); + +- if (migratetype != buddy_mt +- && (!migratetype_is_mergeable(migratetype) || +- !migratetype_is_mergeable(buddy_mt))) +- goto done_merging; ++ if (migratetype != buddy_mt) { ++ if (!migratetype_is_mergeable(migratetype) || ++ !migratetype_is_mergeable(buddy_mt)) ++ goto done_merging; ++ /* ++ * Match buddy type. This ensures that ++ * an expand() down the line puts the ++ * sub-blocks on the right freelists. ++ */ ++ set_pageblock_migratetype(buddy, migratetype); ++ } + } + + /* +-- +Gitee + + +From c5babea33c4fe82208c95895869468ec022de6e6 Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Wed, 18 Dec 2024 17:34:50 +0800 +Subject: [PATCH 07/19] mm: page_alloc: move free pages when converting block + during isolation + +mainline inclusion +from mainline-v6.10-rc1 +commit b54ccd3c6bacbc571f7e61797fb5ff9fe3861413 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC4T0 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b54ccd3c6bacbc571f7e61797fb5ff9fe3861413 + +-------------------------------- + +When claiming a block during compaction isolation, move any remaining free +pages to the correct freelists as well, instead of stranding them on the +wrong list. Otherwise, this encourages incompatible page mixing down the +line, and thus long-term fragmentation. 
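+
+Condensed, the invariant this and the following patches work toward is that
+a pageblock's type and the freelists its free pages sit on are updated
+together under zone->lock. A sketch of the pattern, mirroring the
+__isolate_free_page() hunk below:
+
+    if (migratetype_is_mergeable(mt)) {
+            set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+            move_freepages_block(zone, page, MIGRATE_MOVABLE, NULL);
+    }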
+ +Link: https://lkml.kernel.org/r/20240320180429.678181-5-hannes@cmpxchg.org +Signed-off-by: Johannes Weiner +Reviewed-by: Zi Yan +Reviewed-by: Vlastimil Babka +Acked-by: Mel Gorman +Tested-by: "Huang, Ying" +Tested-by: Baolin Wang +Cc: David Hildenbrand +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/page_alloc.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 3d7e0f110868..7e3593318813 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -2681,9 +2681,12 @@ int __isolate_free_page(struct page *page, unsigned int order) + * Only change normal pageblocks (i.e., they can merge + * with others) + */ +- if (migratetype_is_mergeable(mt)) ++ if (migratetype_is_mergeable(mt)) { + set_pageblock_migratetype(page, + MIGRATE_MOVABLE); ++ move_freepages_block(zone, page, ++ MIGRATE_MOVABLE, NULL); ++ } + } + } + +-- +Gitee + + +From eb6b987628f29ba04e6232b6132f8865f3115c4d Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Wed, 18 Dec 2024 17:34:51 +0800 +Subject: [PATCH 08/19] mm: page_alloc: fix move_freepages_block() range error + +mainline inclusion +from mainline-v6.10-rc1 +commit 2dd482ba627de15d67f0c0ed445133c8ae9b201b +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC4T0 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2dd482ba627de15d67f0c0ed445133c8ae9b201b + +-------------------------------- + +When a block is partially outside the zone of the cursor page, the +function cuts the range to the pivot page instead of the zone start. This +can leave large parts of the block behind, which encourages incompatible +page mixing down the line (ask for one type, get another), and thus +long-term fragmentation. + +This triggers reliably on the first block in the DMA zone, whose start_pfn +is 1. The block is stolen, but everything before the pivot page (which +was often hundreds of pages) is left on the old list. + +Link: https://lkml.kernel.org/r/20240320180429.678181-6-hannes@cmpxchg.org +Signed-off-by: Johannes Weiner +Reviewed-by: Vlastimil Babka +Tested-by: Baolin Wang +Cc: David Hildenbrand +Cc: "Huang, Ying" +Cc: Mel Gorman +Cc: Zi Yan +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/page_alloc.c | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 7e3593318813..a101f5f550dc 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -1661,9 +1661,15 @@ int move_freepages_block(struct zone *zone, struct page *page, + start_pfn = pageblock_start_pfn(pfn); + end_pfn = pageblock_end_pfn(pfn) - 1; + +- /* Do not cross zone boundaries */ ++ /* ++ * The caller only has the lock for @zone, don't touch ranges ++ * that straddle into other zones. While we could move part of ++ * the range that's inside the zone, this call is usually ++ * accompanied by other operations such as migratetype updates ++ * which also should be locked. 
++ */ + if (!zone_spans_pfn(zone, start_pfn)) +- start_pfn = pfn; ++ return 0; + if (!zone_spans_pfn(zone, end_pfn)) + return 0; + +-- +Gitee + + +From 3bd6146784a11153ea37b528313c89e160e1fa7c Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Wed, 18 Dec 2024 17:34:52 +0800 +Subject: [PATCH 09/19] mm: page_alloc: fix freelist movement during block + conversion + +mainline inclusion +from mainline-v6.10-rc1 +commit c0cd6f557b9090525d288806cccbc73440ac235a +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC4T0 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c0cd6f557b9090525d288806cccbc73440ac235a + +-------------------------------- + +Currently, page block type conversion during fallbacks, atomic +reservations and isolation can strand various amounts of free pages on +incorrect freelists. + +For example, fallback stealing moves free pages in the block to the new +type's freelists, but then may not actually claim the block for that type +if there aren't enough compatible pages already allocated. + +In all cases, free page moving might fail if the block straddles more than +one zone, in which case no free pages are moved at all, but the block type +is changed anyway. + +This is detrimental to type hygiene on the freelists. It encourages +incompatible page mixing down the line (ask for one type, get another) and +thus contributes to long-term fragmentation. + +Split the process into a proper transaction: check first if conversion +will happen, then try to move the free pages, and only if that was +successful convert the block to the new type. + +[baolin.wang@linux.alibaba.com: fix allocation failures with CONFIG_CMA] + Link: https://lkml.kernel.org/r/a97697e0-45b0-4f71-b087-fdc7a1d43c0e@linux.alibaba.com +Link: https://lkml.kernel.org/r/20240320180429.678181-7-hannes@cmpxchg.org +Signed-off-by: Johannes Weiner +Signed-off-by: Baolin Wang +Tested-by: "Huang, Ying" +Reviewed-by: Vlastimil Babka +Tested-by: Baolin Wang +Cc: David Hildenbrand +Cc: Mel Gorman +Cc: Zi Yan +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + include/linux/page-isolation.h | 3 +- + mm/page_alloc.c | 174 ++++++++++++++++++++------------- + mm/page_isolation.c | 22 +++-- + 3 files changed, 121 insertions(+), 78 deletions(-) + +diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h +index 4ac34392823a..8550b3c91480 100644 +--- a/include/linux/page-isolation.h ++++ b/include/linux/page-isolation.h +@@ -34,8 +34,7 @@ static inline bool is_migrate_isolate(int migratetype) + #define REPORT_FAILURE 0x2 + + void set_pageblock_migratetype(struct page *page, int migratetype); +-int move_freepages_block(struct zone *zone, struct page *page, +- int migratetype, int *num_movable); ++int move_freepages_block(struct zone *zone, struct page *page, int migratetype); + + int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, + int migratetype, int flags, gfp_t gfp_flags); +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index a101f5f550dc..ba85db6cf987 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -1612,9 +1612,8 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone, + * Note that start_page and end_pages are not aligned on a pageblock + * boundary. 
If alignment is required, use move_freepages_block() + */ +-static int move_freepages(struct zone *zone, +- unsigned long start_pfn, unsigned long end_pfn, +- int migratetype, int *num_movable) ++static int move_freepages(struct zone *zone, unsigned long start_pfn, ++ unsigned long end_pfn, int migratetype) + { + struct page *page; + unsigned long pfn; +@@ -1624,14 +1623,6 @@ static int move_freepages(struct zone *zone, + for (pfn = start_pfn; pfn <= end_pfn;) { + page = pfn_to_page(pfn); + if (!PageBuddy(page)) { +- /* +- * We assume that pages that could be isolated for +- * migration are movable. But we don't actually try +- * isolating, as that would be expensive. +- */ +- if (num_movable && +- (PageLRU(page) || __PageMovable(page))) +- (*num_movable)++; + pfn++; + continue; + } +@@ -1649,17 +1640,16 @@ static int move_freepages(struct zone *zone, + return pages_moved; + } + +-int move_freepages_block(struct zone *zone, struct page *page, +- int migratetype, int *num_movable) ++static bool prep_move_freepages_block(struct zone *zone, struct page *page, ++ unsigned long *start_pfn, ++ unsigned long *end_pfn, ++ int *num_free, int *num_movable) + { +- unsigned long start_pfn, end_pfn, pfn; +- +- if (num_movable) +- *num_movable = 0; ++ unsigned long pfn, start, end; + + pfn = page_to_pfn(page); +- start_pfn = pageblock_start_pfn(pfn); +- end_pfn = pageblock_end_pfn(pfn) - 1; ++ start = pageblock_start_pfn(pfn); ++ end = pageblock_end_pfn(pfn) - 1; + + /* + * The caller only has the lock for @zone, don't touch ranges +@@ -1668,13 +1658,50 @@ int move_freepages_block(struct zone *zone, struct page *page, + * accompanied by other operations such as migratetype updates + * which also should be locked. + */ +- if (!zone_spans_pfn(zone, start_pfn)) +- return 0; +- if (!zone_spans_pfn(zone, end_pfn)) +- return 0; ++ if (!zone_spans_pfn(zone, start)) ++ return false; ++ if (!zone_spans_pfn(zone, end)) ++ return false; ++ ++ *start_pfn = start; ++ *end_pfn = end; ++ ++ if (num_free) { ++ *num_free = 0; ++ *num_movable = 0; ++ for (pfn = start; pfn <= end;) { ++ page = pfn_to_page(pfn); ++ if (PageBuddy(page)) { ++ int nr = 1 << buddy_order(page); ++ ++ *num_free += nr; ++ pfn += nr; ++ continue; ++ } ++ /* ++ * We assume that pages that could be isolated for ++ * migration are movable. But we don't actually try ++ * isolating, as that would be expensive. ++ */ ++ if (PageLRU(page) || __PageMovable(page)) ++ (*num_movable)++; ++ pfn++; ++ } ++ } ++ ++ return true; ++} ++ ++int move_freepages_block(struct zone *zone, struct page *page, ++ int migratetype) ++{ ++ unsigned long start_pfn, end_pfn; ++ ++ if (!prep_move_freepages_block(zone, page, &start_pfn, &end_pfn, ++ NULL, NULL)) ++ return -1; + +- return move_freepages(zone, start_pfn, end_pfn, migratetype, +- num_movable); ++ return move_freepages(zone, start_pfn, end_pfn, migratetype); + } + + static void change_pageblock_range(struct page *pageblock_page, +@@ -1759,33 +1786,37 @@ static inline bool boost_watermark(struct zone *zone) + } + + /* +- * This function implements actual steal behaviour. If order is large enough, +- * we can steal whole pageblock. If not, we first move freepages in this +- * pageblock to our migratetype and determine how many already-allocated pages +- * are there in the pageblock with a compatible migratetype. If at least half +- * of pages are free or compatible, we can change migratetype of the pageblock +- * itself, so pages freed in the future will be put on the correct free list. 
++ * This function implements actual steal behaviour. If order is large enough, we ++ * can claim the whole pageblock for the requested migratetype. If not, we check ++ * the pageblock for constituent pages; if at least half of the pages are free ++ * or compatible, we can still claim the whole block, so pages freed in the ++ * future will be put on the correct free list. Otherwise, we isolate exactly ++ * the order we need from the fallback block and leave its migratetype alone. + */ +-static void steal_suitable_fallback(struct zone *zone, struct page *page, +- unsigned int alloc_flags, int start_type, bool whole_block) ++static struct page * ++steal_suitable_fallback(struct zone *zone, struct page *page, ++ int current_order, int order, int start_type, ++ unsigned int alloc_flags, bool whole_block) + { +- unsigned int current_order = buddy_order(page); + int free_pages, movable_pages, alike_pages; +- int old_block_type; ++ unsigned long start_pfn, end_pfn; ++ int block_type; + +- old_block_type = get_pageblock_migratetype(page); ++ block_type = get_pageblock_migratetype(page); + + /* + * This can happen due to races and we want to prevent broken + * highatomic accounting. + */ +- if (is_migrate_highatomic(old_block_type)) ++ if (is_migrate_highatomic(block_type)) + goto single_page; + + /* Take ownership for orders >= pageblock_order */ + if (current_order >= pageblock_order) { ++ del_page_from_free_list(page, zone, current_order); + change_pageblock_range(page, current_order, start_type); +- goto single_page; ++ expand(zone, page, order, current_order, start_type); ++ return page; + } + + /* +@@ -1800,10 +1831,9 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, + if (!whole_block) + goto single_page; + +- free_pages = move_freepages_block(zone, page, start_type, +- &movable_pages); + /* moving whole block can fail due to zone boundary conditions */ +- if (!free_pages) ++ if (!prep_move_freepages_block(zone, page, &start_pfn, &end_pfn, ++ &free_pages, &movable_pages)) + goto single_page; + + /* +@@ -1821,7 +1851,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, + * vice versa, be conservative since we can't distinguish the + * exact migratetype of non-movable pages. + */ +- if (old_block_type == MIGRATE_MOVABLE) ++ if (block_type == MIGRATE_MOVABLE) + alike_pages = pageblock_nr_pages + - (free_pages + movable_pages); + else +@@ -1832,13 +1862,16 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, + * compatible migratability as our allocation, claim the whole block. 
+ */ + if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || +- page_group_by_mobility_disabled) ++ page_group_by_mobility_disabled) { ++ move_freepages(zone, start_pfn, end_pfn, start_type); + set_pageblock_migratetype(page, start_type); +- +- return; ++ return __rmqueue_smallest(zone, order, start_type); ++ } + + single_page: +- move_to_free_list(page, zone, current_order, start_type); ++ del_page_from_free_list(page, zone, current_order); ++ expand(zone, page, order, current_order, block_type); ++ return page; + } + + /* +@@ -1906,9 +1939,10 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone) + mt = get_pageblock_migratetype(page); + /* Only reserve normal pageblocks (i.e., they can merge with others) */ + if (migratetype_is_mergeable(mt)) { +- zone->nr_reserved_highatomic += pageblock_nr_pages; +- set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); +- move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL); ++ if (move_freepages_block(zone, page, MIGRATE_HIGHATOMIC) != -1) { ++ set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); ++ zone->nr_reserved_highatomic += pageblock_nr_pages; ++ } + } + + out_unlock: +@@ -1933,7 +1967,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, + struct zone *zone; + struct page *page; + int order; +- bool ret; ++ int ret; + + for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx, + ac->nodemask) { +@@ -1982,10 +2016,14 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, + * of pageblocks that cannot be completely freed + * may increase. + */ ++ ret = move_freepages_block(zone, page, ac->migratetype); ++ /* ++ * Reserving this block already succeeded, so this should ++ * not fail on zone boundaries. ++ */ ++ WARN_ON_ONCE(ret == -1); + set_pageblock_migratetype(page, ac->migratetype); +- ret = move_freepages_block(zone, page, ac->migratetype, +- NULL); +- if (ret) { ++ if (ret > 0) { + spin_unlock_irqrestore(&zone->lock, flags); + return ret; + } +@@ -2006,7 +2044,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, + * deviation from the rest of this file, to make the for loop + * condition simpler. 
+ */ +-static __always_inline bool ++static __always_inline struct page * + __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, + unsigned int alloc_flags) + { +@@ -2053,7 +2091,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, + goto do_steal; + } + +- return false; ++ return NULL; + + find_smallest: + for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) { +@@ -2073,14 +2111,14 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, + do_steal: + page = get_page_from_free_area(area, fallback_mt); + +- steal_suitable_fallback(zone, page, alloc_flags, start_migratetype, +- can_steal); ++ /* take off list, maybe claim block, expand remainder */ ++ page = steal_suitable_fallback(zone, page, current_order, order, ++ start_migratetype, alloc_flags, can_steal); + + trace_mm_page_alloc_extfrag(page, order, current_order, + start_migratetype, fallback_mt); + +- return true; +- ++ return page; + } + + /* +@@ -2107,15 +2145,15 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype, + return page; + } + } +-retry: ++ + page = __rmqueue_smallest(zone, order, migratetype); + if (unlikely(!page)) { + if (alloc_flags & ALLOC_CMA) + page = __rmqueue_cma_fallback(zone, order); + +- if (!page && __rmqueue_fallback(zone, order, migratetype, +- alloc_flags)) +- goto retry; ++ if (!page) ++ page = __rmqueue_fallback(zone, order, migratetype, ++ alloc_flags); + } + return page; + } +@@ -2687,12 +2725,10 @@ int __isolate_free_page(struct page *page, unsigned int order) + * Only change normal pageblocks (i.e., they can merge + * with others) + */ +- if (migratetype_is_mergeable(mt)) { +- set_pageblock_migratetype(page, +- MIGRATE_MOVABLE); +- move_freepages_block(zone, page, +- MIGRATE_MOVABLE, NULL); +- } ++ if (migratetype_is_mergeable(mt) && ++ move_freepages_block(zone, page, ++ MIGRATE_MOVABLE) != -1) ++ set_pageblock_migratetype(page, MIGRATE_MOVABLE); + } + } + +diff --git a/mm/page_isolation.c b/mm/page_isolation.c +index 03381be87b28..b27ed476f80e 100644 +--- a/mm/page_isolation.c ++++ b/mm/page_isolation.c +@@ -179,15 +179,18 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ + unmovable = has_unmovable_pages(check_unmovable_start, check_unmovable_end, + migratetype, isol_flags); + if (!unmovable) { +- unsigned long nr_pages; ++ int nr_pages; + int mt = get_pageblock_migratetype(page); + ++ nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); ++ /* Block spans zone boundaries? */ ++ if (nr_pages == -1) { ++ spin_unlock_irqrestore(&zone->lock, flags); ++ return -EBUSY; ++ } ++ __mod_zone_freepage_state(zone, -nr_pages, mt); + set_pageblock_migratetype(page, MIGRATE_ISOLATE); + zone->nr_isolate_pageblock++; +- nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE, +- NULL); +- +- __mod_zone_freepage_state(zone, -nr_pages, mt); + spin_unlock_irqrestore(&zone->lock, flags); + return 0; + } +@@ -207,7 +210,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ + static void unset_migratetype_isolate(struct page *page, int migratetype) + { + struct zone *zone; +- unsigned long flags, nr_pages; ++ unsigned long flags; + bool isolated_page = false; + unsigned int order; + struct page *buddy; +@@ -253,7 +256,12 @@ static void unset_migratetype_isolate(struct page *page, int migratetype) + * allocation. 
+ */ + if (!isolated_page) { +- nr_pages = move_freepages_block(zone, page, migratetype, NULL); ++ int nr_pages = move_freepages_block(zone, page, migratetype); ++ /* ++ * Isolating this block already succeeded, so this ++ * should not fail on zone boundaries. ++ */ ++ WARN_ON_ONCE(nr_pages == -1); + __mod_zone_freepage_state(zone, nr_pages, migratetype); + } + set_pageblock_migratetype(page, migratetype); +-- +Gitee + + +From b4ab6afae98c5b97e2b6a5681ea47dd87d833c5d Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Wed, 18 Dec 2024 17:34:53 +0800 +Subject: [PATCH 10/19] mm: page_alloc: close migratetype race between freeing + and stealing + +mainline inclusion +from mainline-v6.10-rc1 +commit 55612e80e722ac554cc5e80df05555b4f8d40c37 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC4T0 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=55612e80e722ac554cc5e80df05555b4f8d40c37 + +-------------------------------- + +There are three freeing paths that read the page's migratetype +optimistically before grabbing the zone lock. When this races with block +stealing, those pages go on the wrong freelist. + +The paths in question are: +- when freeing >costly orders that aren't THP +- when freeing pages to the buddy upon pcp lock contention +- when freeing pages that are isolated +- when freeing pages initially during boot +- when freeing the remainder in alloc_pages_exact() +- when "accepting" unaccepted VM host memory before first use +- when freeing pages during unpoisoning + +None of these are so hot that they would need this optimization at the +cost of hampering defrag efforts. Especially when contrasted with the +fact that the most common buddy freeing path - free_pcppages_bulk - is +checking the migratetype under the zone->lock just fine. + +In addition, isolated pages need to look up the migratetype under the lock +anyway, which adds branches to the locked section, and results in a double +lookup when the pages are in fact isolated. + +Move the lookups into the lock. + +Link: https://lkml.kernel.org/r/20240320180429.678181-8-hannes@cmpxchg.org +Signed-off-by: Johannes Weiner +Reported-by: Vlastimil Babka +Reviewed-by: Vlastimil Babka +Tested-by: Baolin Wang +Cc: David Hildenbrand +Cc: "Huang, Ying" +Cc: Mel Gorman +Cc: Zi Yan +Signed-off-by: Andrew Morton +Conflicts: + mm/page_alloc.c +[ Context conflict with commit 2ae116c3257d. 
] +Signed-off-by: Liu Shixin +--- + mm/page_alloc.c | 52 ++++++++++++++++++------------------------------- + 1 file changed, 19 insertions(+), 33 deletions(-) + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index ba85db6cf987..cae51fd0b7b2 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -1222,18 +1222,15 @@ static void free_pcppages_bulk(struct zone *zone, int count, + spin_unlock_irqrestore(&zone->lock, flags); + } + +-static void free_one_page(struct zone *zone, +- struct page *page, unsigned long pfn, +- unsigned int order, +- int migratetype, fpi_t fpi_flags) ++static void free_one_page(struct zone *zone, struct page *page, ++ unsigned long pfn, unsigned int order, ++ fpi_t fpi_flags) + { + unsigned long flags; ++ int migratetype; + + spin_lock_irqsave(&zone->lock, flags); +- if (unlikely(has_isolate_pageblock(zone) || +- is_migrate_isolate(migratetype))) { +- migratetype = get_pfnblock_migratetype(page, pfn); +- } ++ migratetype = get_pfnblock_migratetype(page, pfn); + __free_one_page(page, pfn, zone, order, migratetype, fpi_flags); + spin_unlock_irqrestore(&zone->lock, flags); + } +@@ -1241,21 +1238,13 @@ static void free_one_page(struct zone *zone, + static void __free_pages_ok(struct page *page, unsigned int order, + fpi_t fpi_flags) + { +- int migratetype; + unsigned long pfn = page_to_pfn(page); + struct zone *zone = page_zone(page); + + if (!free_pages_prepare(page, order)) + return; + +- /* +- * Calling get_pfnblock_migratetype() without spin_lock_irqsave() here +- * is used to avoid calling get_pfnblock_migratetype() under the lock. +- * This will reduce the lock holding time. +- */ +- migratetype = get_pfnblock_migratetype(page, pfn); +- +- free_one_page(zone, page, pfn, order, migratetype, fpi_flags); ++ free_one_page(zone, page, pfn, order, fpi_flags); + + __count_vm_events(PGFREE, 1 << order); + } +@@ -2518,7 +2507,7 @@ void free_unref_page(struct page *page, unsigned int order) + struct per_cpu_pages *pcp; + struct zone *zone; + unsigned long pfn = page_to_pfn(page); +- int migratetype, pcpmigratetype; ++ int migratetype; + + if (page_from_dynamic_pool(page)) { + dynamic_pool_free_page(page); +@@ -2540,23 +2529,23 @@ void free_unref_page(struct page *page, unsigned int order) + * get those areas back if necessary. Otherwise, we may have to free + * excessively into the page allocator + */ +- migratetype = pcpmigratetype = get_pfnblock_migratetype(page, pfn); ++ migratetype = get_pfnblock_migratetype(page, pfn); + if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { + if (unlikely(is_migrate_isolate(migratetype))) { +- free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE); ++ free_one_page(page_zone(page), page, pfn, order, FPI_NONE); + return; + } +- pcpmigratetype = MIGRATE_MOVABLE; ++ migratetype = MIGRATE_MOVABLE; + } + + zone = page_zone(page); + pcp_trylock_prepare(UP_flags); + pcp = pcp_spin_trylock(zone->per_cpu_pageset); + if (pcp) { +- free_unref_page_commit(zone, pcp, page, pcpmigratetype, order); ++ free_unref_page_commit(zone, pcp, page, migratetype, order); + pcp_spin_unlock(pcp); + } else { +- free_one_page(zone, page, pfn, order, migratetype, FPI_NONE); ++ free_one_page(zone, page, pfn, order, FPI_NONE); + } + pcp_trylock_finish(UP_flags); + } +@@ -2590,12 +2579,8 @@ void free_unref_folios(struct folio_batch *folios) + * allocator. 
+ */ + if (!pcp_allowed_order(order)) { +- int migratetype; +- +- migratetype = get_pfnblock_migratetype(&folio->page, +- pfn); +- free_one_page(folio_zone(folio), &folio->page, pfn, +- order, migratetype, FPI_NONE); ++ free_one_page(folio_zone(folio), &folio->page, ++ pfn, order, FPI_NONE); + continue; + } + folio->private = (void *)(unsigned long)order; +@@ -2631,7 +2616,7 @@ void free_unref_folios(struct folio_batch *folios) + */ + if (is_migrate_isolate(migratetype)) { + free_one_page(zone, &folio->page, pfn, +- order, migratetype, FPI_NONE); ++ order, FPI_NONE); + continue; + } + +@@ -2644,7 +2629,7 @@ void free_unref_folios(struct folio_batch *folios) + if (unlikely(!pcp)) { + pcp_trylock_finish(UP_flags); + free_one_page(zone, &folio->page, pfn, +- order, migratetype, FPI_NONE); ++ order, FPI_NONE); + continue; + } + locked_zone = zone; +@@ -7022,13 +7007,14 @@ bool take_page_off_buddy(struct page *page) + bool put_page_back_buddy(struct page *page) + { + struct zone *zone = page_zone(page); +- unsigned long pfn = page_to_pfn(page); + unsigned long flags; +- int migratetype = get_pfnblock_migratetype(page, pfn); + bool ret = false; + + spin_lock_irqsave(&zone->lock, flags); + if (put_page_testzero(page)) { ++ unsigned long pfn = page_to_pfn(page); ++ int migratetype = get_pfnblock_migratetype(page, pfn); ++ + ClearPageHWPoisonTakenOff(page); + __free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE); + if (TestClearPageHWPoison(page)) { +-- +Gitee + + +From 4f118ff5ee3c0ddc3b921a8f9bcabfe48fa2dab9 Mon Sep 17 00:00:00 2001 +From: Zi Yan +Date: Wed, 18 Dec 2024 17:34:54 +0800 +Subject: [PATCH 11/19] mm: page_alloc: set migratetype inside move_freepages() + +mainline inclusion +from mainline-v6.10-rc1 +commit f37c0f6876a8eabe1477c87860460bc181f6cdbb +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC4T0 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f37c0f6876a8eabe1477c87860460bc181f6cdbb + +-------------------------------- + +This avoids changing migratetype after move_freepages() or +move_freepages_block(), which is error prone. It also prepares for +upcoming changes to fix move_freepages() not moving free pages partially +in the range. + +Link: https://lkml.kernel.org/r/20240320180429.678181-9-hannes@cmpxchg.org +Signed-off-by: Zi Yan +Signed-off-by: Johannes Weiner +Reviewed-by: Vlastimil Babka +Tested-by: Baolin Wang +Cc: David Hildenbrand +Cc: "Huang, Ying" +Cc: Mel Gorman +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/page_alloc.c | 27 +++++++++++++-------------- + mm/page_isolation.c | 7 +++---- + 2 files changed, 16 insertions(+), 18 deletions(-) + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index cae51fd0b7b2..6f59b8e73daa 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -1597,9 +1597,8 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone, + #endif + + /* +- * Move the free pages in a range to the freelist tail of the requested type. +- * Note that start_page and end_pages are not aligned on a pageblock +- * boundary. If alignment is required, use move_freepages_block() ++ * Change the type of a block and move all its free pages to that ++ * type's freelist. 
+ */ + static int move_freepages(struct zone *zone, unsigned long start_pfn, + unsigned long end_pfn, int migratetype) +@@ -1609,6 +1608,9 @@ static int move_freepages(struct zone *zone, unsigned long start_pfn, + unsigned int order; + int pages_moved = 0; + ++ VM_WARN_ON(start_pfn & (pageblock_nr_pages - 1)); ++ VM_WARN_ON(start_pfn + pageblock_nr_pages - 1 != end_pfn); ++ + for (pfn = start_pfn; pfn <= end_pfn;) { + page = pfn_to_page(pfn); + if (!PageBuddy(page)) { +@@ -1626,6 +1628,8 @@ static int move_freepages(struct zone *zone, unsigned long start_pfn, + pages_moved += 1 << order; + } + ++ set_pageblock_migratetype(pfn_to_page(start_pfn), migratetype); ++ + return pages_moved; + } + +@@ -1853,7 +1857,6 @@ steal_suitable_fallback(struct zone *zone, struct page *page, + if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || + page_group_by_mobility_disabled) { + move_freepages(zone, start_pfn, end_pfn, start_type); +- set_pageblock_migratetype(page, start_type); + return __rmqueue_smallest(zone, order, start_type); + } + +@@ -1927,12 +1930,10 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone) + /* Yoink! */ + mt = get_pageblock_migratetype(page); + /* Only reserve normal pageblocks (i.e., they can merge with others) */ +- if (migratetype_is_mergeable(mt)) { +- if (move_freepages_block(zone, page, MIGRATE_HIGHATOMIC) != -1) { +- set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); ++ if (migratetype_is_mergeable(mt)) ++ if (move_freepages_block(zone, page, ++ MIGRATE_HIGHATOMIC) != -1) + zone->nr_reserved_highatomic += pageblock_nr_pages; +- } +- } + + out_unlock: + spin_unlock_irqrestore(&zone->lock, flags); +@@ -2011,7 +2012,6 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, + * not fail on zone boundaries. 
+ */ + WARN_ON_ONCE(ret == -1); +- set_pageblock_migratetype(page, ac->migratetype); + if (ret > 0) { + spin_unlock_irqrestore(&zone->lock, flags); + return ret; +@@ -2710,10 +2710,9 @@ int __isolate_free_page(struct page *page, unsigned int order) + * Only change normal pageblocks (i.e., they can merge + * with others) + */ +- if (migratetype_is_mergeable(mt) && +- move_freepages_block(zone, page, +- MIGRATE_MOVABLE) != -1) +- set_pageblock_migratetype(page, MIGRATE_MOVABLE); ++ if (migratetype_is_mergeable(mt)) ++ move_freepages_block(zone, page, ++ MIGRATE_MOVABLE); + } + } + +diff --git a/mm/page_isolation.c b/mm/page_isolation.c +index b27ed476f80e..8fc4f9491417 100644 +--- a/mm/page_isolation.c ++++ b/mm/page_isolation.c +@@ -189,7 +189,6 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ + return -EBUSY; + } + __mod_zone_freepage_state(zone, -nr_pages, mt); +- set_pageblock_migratetype(page, MIGRATE_ISOLATE); + zone->nr_isolate_pageblock++; + spin_unlock_irqrestore(&zone->lock, flags); + return 0; +@@ -263,10 +262,10 @@ static void unset_migratetype_isolate(struct page *page, int migratetype) + */ + WARN_ON_ONCE(nr_pages == -1); + __mod_zone_freepage_state(zone, nr_pages, migratetype); +- } +- set_pageblock_migratetype(page, migratetype); +- if (isolated_page) ++ } else { ++ set_pageblock_migratetype(page, migratetype); + __putback_isolated_page(page, order, migratetype); ++ } + zone->nr_isolate_pageblock--; + out: + spin_unlock_irqrestore(&zone->lock, flags); +-- +Gitee + + +From 38dfe2413fdbcdafee46edb83704f65c39eb4a74 Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Wed, 18 Dec 2024 17:34:55 +0800 +Subject: [PATCH 12/19] mm: page_isolation: prepare for hygienic freelists + +mainline inclusion +from mainline-v6.10-rc1 +commit fd919a85cd55be5d00a6a7372071f44c8eafb825 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC4T0 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=fd919a85cd55be5d00a6a7372071f44c8eafb825 + +-------------------------------- + +Page isolation currently sets MIGRATE_ISOLATE on a block, then drops +zone->lock and scans the block for straddling buddies to split up. +Because this happens non-atomically wrt the page allocator, it's possible +for allocations to get a buddy whose first block is a regular pcp +migratetype but whose tail is isolated. This means that in certain cases +memory can still be allocated after isolation. It will also trigger the +freelist type hygiene warnings in subsequent patches. + +start_isolate_page_range() + isolate_single_pageblock() + set_migratetype_isolate(tail) + lock zone->lock + move_freepages_block(tail) // nop + set_pageblock_migratetype(tail) + unlock zone->lock + __rmqueue_smallest() + del_page_from_freelist(head) + expand(head, head_mt) + WARN(head_mt != tail_mt) + start_pfn = ALIGN_DOWN(MAX_ORDER_NR_PAGES) + for (pfn = start_pfn, pfn < end_pfn) + if (PageBuddy()) + split_free_page(head) + +Introduce a variant of move_freepages_block() provided by the allocator +specifically for page isolation; it moves free pages, converts the block, +and handles the splitting of straddling buddies while holding zone->lock. + +The allocator knows that pageblocks and buddies are always naturally +aligned, which means that buddies can only straddle blocks if they're +actually >pageblock_order. This means the search-and-split part can be +simplified compared to what page isolation used to do. 
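+
+(Worked example with hypothetical pfns, assuming pageblock_order == 9, i.e.
+512-page blocks: a buddy of order <= 9 occupies a 2^order-aligned range
+whose size divides the block, so it always lies within a single pageblock.
+Only an order-10 or larger buddy, e.g. one covering [0x1000, 0x1400), can
+contain a block boundary such as 0x1200. That is why the find_large_buddy()
+helper added below only has to walk the aligned candidate pfns at increasing
+order instead of scanning pages.)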
+ +Also tighten up the page isolation code around the expectations of which +pages can be large, and how they are freed. + +Based on extensive discussions with and invaluable input from Zi Yan. + +[hannes@cmpxchg.org: work around older gcc warning] + Link: https://lkml.kernel.org/r/20240321142426.GB777580@cmpxchg.org +Link: https://lkml.kernel.org/r/20240320180429.678181-10-hannes@cmpxchg.org +Signed-off-by: Johannes Weiner +Reviewed-by: Vlastimil Babka +Tested-by: Baolin Wang +Cc: David Hildenbrand +Cc: "Huang, Ying" +Cc: Mel Gorman +Cc: Zi Yan +Signed-off-by: Andrew Morton +Conflicts: + mm/internal.h + mm/page_alloc.c + mm/page_isolation.c +[ Context conflict due to miss MAX_PAGE_ORDER. ] +Signed-off-by: Liu Shixin +--- + include/linux/page-isolation.h | 4 +- + mm/internal.h | 4 - + mm/page_alloc.c | 204 +++++++++++++++++++-------------- + mm/page_isolation.c | 106 ++++++----------- + 4 files changed, 155 insertions(+), 163 deletions(-) + +diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h +index 8550b3c91480..c16db0067090 100644 +--- a/include/linux/page-isolation.h ++++ b/include/linux/page-isolation.h +@@ -34,7 +34,9 @@ static inline bool is_migrate_isolate(int migratetype) + #define REPORT_FAILURE 0x2 + + void set_pageblock_migratetype(struct page *page, int migratetype); +-int move_freepages_block(struct zone *zone, struct page *page, int migratetype); ++ ++bool move_freepages_block_isolate(struct zone *zone, struct page *page, ++ int migratetype); + + int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, + int migratetype, int flags, gfp_t gfp_flags); +diff --git a/mm/internal.h b/mm/internal.h +index 0478e5dab55b..de564608dfa6 100644 +--- a/mm/internal.h ++++ b/mm/internal.h +@@ -693,10 +693,6 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align, + void memmap_init_range(unsigned long, int, unsigned long, unsigned long, + unsigned long, enum meminit_context, struct vmem_altmap *, int); + +- +-int split_free_page(struct page *free_page, +- unsigned int order, unsigned long split_pfn_offset); +- + #if defined CONFIG_COMPACTION || defined CONFIG_CMA + + #define MAX_PAGE_ORDER MAX_ORDER +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 6f59b8e73daa..3bc1502e42cf 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -826,64 +826,6 @@ static inline void __free_one_page(struct page *page, + page_reporting_notify_free(order); + } + +-/** +- * split_free_page() -- split a free page at split_pfn_offset +- * @free_page: the original free page +- * @order: the order of the page +- * @split_pfn_offset: split offset within the page +- * +- * Return -ENOENT if the free page is changed, otherwise 0 +- * +- * It is used when the free page crosses two pageblocks with different migratetypes +- * at split_pfn_offset within the page. The split free page will be put into +- * separate migratetype lists afterwards. Otherwise, the function achieves +- * nothing. 
+- */ +-int split_free_page(struct page *free_page, +- unsigned int order, unsigned long split_pfn_offset) +-{ +- struct zone *zone = page_zone(free_page); +- unsigned long free_page_pfn = page_to_pfn(free_page); +- unsigned long pfn; +- unsigned long flags; +- int free_page_order; +- int mt; +- int ret = 0; +- +- if (split_pfn_offset == 0) +- return ret; +- +- spin_lock_irqsave(&zone->lock, flags); +- +- if (!PageBuddy(free_page) || buddy_order(free_page) != order) { +- ret = -ENOENT; +- goto out; +- } +- +- mt = get_pfnblock_migratetype(free_page, free_page_pfn); +- if (likely(!is_migrate_isolate(mt))) +- __mod_zone_freepage_state(zone, -(1UL << order), mt); +- +- del_page_from_free_list(free_page, zone, order); +- for (pfn = free_page_pfn; +- pfn < free_page_pfn + (1UL << order);) { +- int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn); +- +- free_page_order = min_t(unsigned int, +- pfn ? __ffs(pfn) : order, +- __fls(split_pfn_offset)); +- __free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order, +- mt, FPI_NONE); +- pfn += 1UL << free_page_order; +- split_pfn_offset -= (1UL << free_page_order); +- /* we have done the first part, now switch to second part */ +- if (split_pfn_offset == 0) +- split_pfn_offset = (1UL << order) - (pfn - free_page_pfn); +- } +-out: +- spin_unlock_irqrestore(&zone->lock, flags); +- return ret; +-} + /* + * A bad page could be due to a number of fields. Instead of multiple branches, + * try and check multiple fields with one check. The caller must do a detailed +@@ -1685,8 +1627,8 @@ static bool prep_move_freepages_block(struct zone *zone, struct page *page, + return true; + } + +-int move_freepages_block(struct zone *zone, struct page *page, +- int migratetype) ++static int move_freepages_block(struct zone *zone, struct page *page, ++ int migratetype) + { + unsigned long start_pfn, end_pfn; + +@@ -1697,6 +1639,123 @@ int move_freepages_block(struct zone *zone, struct page *page, + return move_freepages(zone, start_pfn, end_pfn, migratetype); + } + ++#ifdef CONFIG_MEMORY_ISOLATION ++/* Look for a buddy that straddles start_pfn */ ++static unsigned long find_large_buddy(unsigned long start_pfn) ++{ ++ int order = 0; ++ struct page *page; ++ unsigned long pfn = start_pfn; ++ ++ while (!PageBuddy(page = pfn_to_page(pfn))) { ++ /* Nothing found */ ++ if (++order > MAX_PAGE_ORDER) ++ return start_pfn; ++ pfn &= ~0UL << order; ++ } ++ ++ /* ++ * Found a preceding buddy, but does it straddle? ++ */ ++ if (pfn + (1 << buddy_order(page)) > start_pfn) ++ return pfn; ++ ++ /* Nothing found */ ++ return start_pfn; ++} ++ ++/* Split a multi-block free page into its individual pageblocks */ ++static void split_large_buddy(struct zone *zone, struct page *page, ++ unsigned long pfn, int order) ++{ ++ unsigned long end_pfn = pfn + (1 << order); ++ ++ VM_WARN_ON_ONCE(order <= pageblock_order); ++ VM_WARN_ON_ONCE(pfn & (pageblock_nr_pages - 1)); ++ ++ /* Caller removed page from freelist, buddy info cleared! 
*/ ++ VM_WARN_ON_ONCE(PageBuddy(page)); ++ ++ while (pfn != end_pfn) { ++ int mt = get_pfnblock_migratetype(page, pfn); ++ ++ __free_one_page(page, pfn, zone, pageblock_order, mt, FPI_NONE); ++ pfn += pageblock_nr_pages; ++ page = pfn_to_page(pfn); ++ } ++} ++ ++/** ++ * move_freepages_block_isolate - move free pages in block for page isolation ++ * @zone: the zone ++ * @page: the pageblock page ++ * @migratetype: migratetype to set on the pageblock ++ * ++ * This is similar to move_freepages_block(), but handles the special ++ * case encountered in page isolation, where the block of interest ++ * might be part of a larger buddy spanning multiple pageblocks. ++ * ++ * Unlike the regular page allocator path, which moves pages while ++ * stealing buddies off the freelist, page isolation is interested in ++ * arbitrary pfn ranges that may have overlapping buddies on both ends. ++ * ++ * This function handles that. Straddling buddies are split into ++ * individual pageblocks. Only the block of interest is moved. ++ * ++ * Returns %true if pages could be moved, %false otherwise. ++ */ ++bool move_freepages_block_isolate(struct zone *zone, struct page *page, ++ int migratetype) ++{ ++ unsigned long start_pfn, end_pfn, pfn; ++ int nr_moved, mt; ++ ++ if (!prep_move_freepages_block(zone, page, &start_pfn, &end_pfn, ++ NULL, NULL)) ++ return false; ++ ++ /* No splits needed if buddies can't span multiple blocks */ ++ if (pageblock_order == MAX_PAGE_ORDER) ++ goto move; ++ ++ /* We're a tail block in a larger buddy */ ++ pfn = find_large_buddy(start_pfn); ++ if (pfn != start_pfn) { ++ struct page *buddy = pfn_to_page(pfn); ++ int order = buddy_order(buddy); ++ int mt = get_pfnblock_migratetype(buddy, pfn); ++ ++ if (!is_migrate_isolate(mt)) ++ __mod_zone_freepage_state(zone, -(1UL << order), mt); ++ del_page_from_free_list(buddy, zone, order); ++ set_pageblock_migratetype(page, migratetype); ++ split_large_buddy(zone, buddy, pfn, order); ++ return true; ++ } ++ ++ /* We're the starting block of a larger buddy */ ++ if (PageBuddy(page) && buddy_order(page) > pageblock_order) { ++ int mt = get_pfnblock_migratetype(page, pfn); ++ int order = buddy_order(page); ++ ++ if (!is_migrate_isolate(mt)) ++ __mod_zone_freepage_state(zone, -(1UL << order), mt); ++ del_page_from_free_list(page, zone, order); ++ set_pageblock_migratetype(page, migratetype); ++ split_large_buddy(zone, page, pfn, order); ++ return true; ++ } ++move: ++ mt = get_pfnblock_migratetype(page, start_pfn); ++ nr_moved = move_freepages(zone, start_pfn, end_pfn, migratetype); ++ if (!is_migrate_isolate(mt)) ++ __mod_zone_freepage_state(zone, -nr_moved, mt); ++ else if (!is_migrate_isolate(migratetype)) ++ __mod_zone_freepage_state(zone, nr_moved, migratetype); ++ return true; ++} ++#endif /* CONFIG_MEMORY_ISOLATION */ ++ + static void change_pageblock_range(struct page *pageblock_page, + int start_order, int migratetype) + { +@@ -6575,7 +6634,6 @@ int alloc_contig_range(unsigned long start, unsigned long end, + unsigned migratetype, gfp_t gfp_mask) + { + unsigned long outer_start, outer_end; +- int order; + int ret = 0; + + struct compact_control cc = { +@@ -6648,29 +6706,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, + * We don't have to hold zone->lock here because the pages are + * isolated thus they won't get removed from buddy. 
+ */ +- +- order = 0; +- outer_start = start; +- while (!PageBuddy(pfn_to_page(outer_start))) { +- if (++order > MAX_ORDER) { +- outer_start = start; +- break; +- } +- outer_start &= ~0UL << order; +- } +- +- if (outer_start != start) { +- order = buddy_order(pfn_to_page(outer_start)); +- +- /* +- * outer_start page could be small order buddy page and +- * it doesn't include start page. Adjust outer_start +- * in this case to report failed page properly +- * on tracepoint in test_pages_isolated() +- */ +- if (outer_start + (1UL << order) <= start) +- outer_start = start; +- } ++ outer_start = find_large_buddy(start); + + /* Make sure the range is really isolated. */ + if (test_pages_isolated(outer_start, end, 0)) { +diff --git a/mm/page_isolation.c b/mm/page_isolation.c +index 8fc4f9491417..b3aae89ed226 100644 +--- a/mm/page_isolation.c ++++ b/mm/page_isolation.c +@@ -179,16 +179,10 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ + unmovable = has_unmovable_pages(check_unmovable_start, check_unmovable_end, + migratetype, isol_flags); + if (!unmovable) { +- int nr_pages; +- int mt = get_pageblock_migratetype(page); +- +- nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); +- /* Block spans zone boundaries? */ +- if (nr_pages == -1) { ++ if (!move_freepages_block_isolate(zone, page, MIGRATE_ISOLATE)) { + spin_unlock_irqrestore(&zone->lock, flags); + return -EBUSY; + } +- __mod_zone_freepage_state(zone, -nr_pages, mt); + zone->nr_isolate_pageblock++; + spin_unlock_irqrestore(&zone->lock, flags); + return 0; +@@ -255,13 +249,11 @@ static void unset_migratetype_isolate(struct page *page, int migratetype) + * allocation. + */ + if (!isolated_page) { +- int nr_pages = move_freepages_block(zone, page, migratetype); + /* + * Isolating this block already succeeded, so this + * should not fail on zone boundaries. + */ +- WARN_ON_ONCE(nr_pages == -1); +- __mod_zone_freepage_state(zone, nr_pages, migratetype); ++ WARN_ON_ONCE(!move_freepages_block_isolate(zone, page, migratetype)); + } else { + set_pageblock_migratetype(page, migratetype); + __putback_isolated_page(page, order, migratetype); +@@ -377,26 +369,29 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, + + VM_BUG_ON(!page); + pfn = page_to_pfn(page); +- /* +- * start_pfn is MAX_ORDER_NR_PAGES aligned, if there is any +- * free pages in [start_pfn, boundary_pfn), its head page will +- * always be in the range. +- */ ++ + if (PageBuddy(page)) { + int order = buddy_order(page); + +- if (pfn + (1UL << order) > boundary_pfn) { +- /* free page changed before split, check it again */ +- if (split_free_page(page, order, boundary_pfn - pfn)) +- continue; +- } ++ /* move_freepages_block_isolate() handled this */ ++ VM_WARN_ON_ONCE(pfn + (1 << order) > boundary_pfn); + + pfn += 1UL << order; + continue; + } ++ + /* +- * migrate compound pages then let the free page handling code +- * above do the rest. If migration is not possible, just fail. ++ * If a compound page is straddling our block, attempt ++ * to migrate it out of the way. ++ * ++ * We don't have to worry about this creating a large ++ * free page that straddles into our block: gigantic ++ * pages are freed as order-0 chunks, and LRU pages ++ * (currently) do not exceed pageblock_order. ++ * ++ * The block of interest has already been marked ++ * MIGRATE_ISOLATE above, so when migration is done it ++ * will free its pages onto the correct freelists. 
+ */ + if (PageCompound(page)) { + struct page *head = compound_head(page); +@@ -407,16 +402,10 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, + pfn = head_pfn + nr_pages; + continue; + } ++ + #if defined CONFIG_COMPACTION || defined CONFIG_CMA +- /* +- * hugetlb, lru compound (THP), and movable compound pages +- * can be migrated. Otherwise, fail the isolation. +- */ +- if (PageHuge(page) || PageLRU(page) || __PageMovable(page)) { +- int order; +- unsigned long outer_pfn; ++ if (PageHuge(page)) { + int page_mt = get_pageblock_migratetype(page); +- bool isolate_page = !is_migrate_isolate_page(page); + struct compact_control cc = { + .nr_migratepages = 0, + .order = -1, +@@ -429,56 +418,25 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, + }; + INIT_LIST_HEAD(&cc.migratepages); + +- /* +- * XXX: mark the page as MIGRATE_ISOLATE so that +- * no one else can grab the freed page after migration. +- * Ideally, the page should be freed as two separate +- * pages to be added into separate migratetype free +- * lists. +- */ +- if (isolate_page) { +- ret = set_migratetype_isolate(page, page_mt, +- flags, head_pfn, head_pfn + nr_pages); +- if (ret) +- goto failed; +- } +- + ret = __alloc_contig_migrate_range(&cc, head_pfn, + head_pfn + nr_pages, page_mt); +- +- /* +- * restore the page's migratetype so that it can +- * be split into separate migratetype free lists +- * later. +- */ +- if (isolate_page) +- unset_migratetype_isolate(page, page_mt); +- + if (ret) + goto failed; +- /* +- * reset pfn to the head of the free page, so +- * that the free page handling code above can split +- * the free page to the right migratetype list. +- * +- * head_pfn is not used here as a hugetlb page order +- * can be bigger than MAX_ORDER, but after it is +- * freed, the free page order is not. Use pfn within +- * the range to find the head of the free page. +- */ +- order = 0; +- outer_pfn = pfn; +- while (!PageBuddy(pfn_to_page(outer_pfn))) { +- /* stop if we cannot find the free page */ +- if (++order > MAX_ORDER) +- goto failed; +- outer_pfn &= ~0UL << order; +- } +- pfn = outer_pfn; ++ pfn = head_pfn + nr_pages; + continue; +- } else ++ } ++ ++ /* ++ * These pages are movable too, but they're ++ * not expected to exceed pageblock_order. ++ * ++ * Let us know when they do, so we can add ++ * proper free and split handling for them. ++ */ ++ VM_WARN_ON_ONCE_PAGE(PageLRU(page), page); ++ VM_WARN_ON_ONCE_PAGE(__PageMovable(page), page); + #endif +- goto failed; ++ goto failed; + } + + pfn++; +-- +Gitee + + +From adfca1d2fd31fb7cc452fbcbf8ee0c5cb961f02f Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Wed, 18 Dec 2024 17:34:56 +0800 +Subject: [PATCH 13/19] mm: page_alloc: consolidate free page accounting + +mainline inclusion +from mainline-v6.10-rc1 +commit e0932b6c1f942fa747258e152cdce0d0b2b5be5c +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC4T0 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e0932b6c1f942fa747258e152cdce0d0b2b5be5c + +-------------------------------- + +Free page accounting currently happens a bit too high up the call stack, +where it has to deal with guard pages, compaction capturing, block +stealing and even page isolation. This is subtle and fragile, and makes +it difficult to hack on the code. + +Now that type violations on the freelists have been fixed, push the +accounting down to where pages enter and leave the freelist. 
+ +[hannes@cmpxchg.org: undo unrelated drive-by line wrap] + Link: https://lkml.kernel.org/r/20240327185736.GA7597@cmpxchg.org +[hannes@cmpxchg.org: remove unused page parameter from account_freepages()] + Link: https://lkml.kernel.org/r/20240327185831.GB7597@cmpxchg.org +[baolin.wang@linux.alibaba.com: fix free page accounting] + Link: https://lkml.kernel.org/r/a2a48baca69f103aa431fd201f8a06e3b95e203d.1712648441.git.baolin.wang@linux.alibaba.com +[andriy.shevchenko@linux.intel.com: avoid defining unused function] + Link: https://lkml.kernel.org/r/20240423161506.2637177-1-andriy.shevchenko@linux.intel.com +Link: https://lkml.kernel.org/r/20240320180429.678181-11-hannes@cmpxchg.org +Signed-off-by: Johannes Weiner +Signed-off-by: Andy Shevchenko +Signed-off-by: Baolin Wang +Reviewed-by: Vlastimil Babka +Tested-by: Baolin Wang +Cc: David Hildenbrand +Cc: "Huang, Ying" +Cc: Mel Gorman +Cc: Zi Yan +Signed-off-by: Andrew Morton +Conflicts: + mm/page_alloc.c +[ Context conflicts due to miss MAX_PAGE_ORDER. ] +Signed-off-by: Liu Shixin +--- + include/linux/mm.h | 18 ++-- + include/linux/vmstat.h | 8 -- + mm/debug_page_alloc.c | 12 +-- + mm/internal.h | 5 -- + mm/page_alloc.c | 192 +++++++++++++++++++++++------------------ + 5 files changed, 118 insertions(+), 117 deletions(-) + +diff --git a/include/linux/mm.h b/include/linux/mm.h +index 2e6ef9532fc3..b6dcdaafc592 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -3819,24 +3819,22 @@ static inline bool page_is_guard(struct page *page) + return PageGuard(page); + } + +-bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order, +- int migratetype); ++bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order); + static inline bool set_page_guard(struct zone *zone, struct page *page, +- unsigned int order, int migratetype) ++ unsigned int order) + { + if (!debug_guardpage_enabled()) + return false; +- return __set_page_guard(zone, page, order, migratetype); ++ return __set_page_guard(zone, page, order); + } + +-void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order, +- int migratetype); ++void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order); + static inline void clear_page_guard(struct zone *zone, struct page *page, +- unsigned int order, int migratetype) ++ unsigned int order) + { + if (!debug_guardpage_enabled()) + return; +- __clear_page_guard(zone, page, order, migratetype); ++ __clear_page_guard(zone, page, order); + } + + #else /* CONFIG_DEBUG_PAGEALLOC */ +@@ -3846,9 +3844,9 @@ static inline unsigned int debug_guardpage_minorder(void) { return 0; } + static inline bool debug_guardpage_enabled(void) { return false; } + static inline bool page_is_guard(struct page *page) { return false; } + static inline bool set_page_guard(struct zone *zone, struct page *page, +- unsigned int order, int migratetype) { return false; } ++ unsigned int order) { return false; } + static inline void clear_page_guard(struct zone *zone, struct page *page, +- unsigned int order, int migratetype) {} ++ unsigned int order) {} + #endif /* CONFIG_DEBUG_PAGEALLOC */ + + #ifdef __HAVE_ARCH_GATE_AREA +diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h +index 343906a98d6e..735eae6e272c 100644 +--- a/include/linux/vmstat.h ++++ b/include/linux/vmstat.h +@@ -487,14 +487,6 @@ static inline void node_stat_sub_folio(struct folio *folio, + mod_node_page_state(folio_pgdat(folio), item, -folio_nr_pages(folio)); + } + +-static inline void 
__mod_zone_freepage_state(struct zone *zone, int nr_pages, +- int migratetype) +-{ +- __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages); +- if (is_migrate_cma(migratetype)) +- __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages); +-} +- + extern const char * const vmstat_text[]; + + static inline const char *zone_stat_name(enum zone_stat_item item) +diff --git a/mm/debug_page_alloc.c b/mm/debug_page_alloc.c +index f9d145730fd1..03a810927d0a 100644 +--- a/mm/debug_page_alloc.c ++++ b/mm/debug_page_alloc.c +@@ -32,8 +32,7 @@ static int __init debug_guardpage_minorder_setup(char *buf) + } + early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup); + +-bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order, +- int migratetype) ++bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order) + { + if (order >= debug_guardpage_minorder()) + return false; +@@ -41,19 +40,12 @@ bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order, + __SetPageGuard(page); + INIT_LIST_HEAD(&page->buddy_list); + set_page_private(page, order); +- /* Guard pages are not available for any usage */ +- if (!is_migrate_isolate(migratetype)) +- __mod_zone_freepage_state(zone, -(1 << order), migratetype); + + return true; + } + +-void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order, +- int migratetype) ++void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order) + { + __ClearPageGuard(page); +- + set_page_private(page, 0); +- if (!is_migrate_isolate(migratetype)) +- __mod_zone_freepage_state(zone, (1 << order), migratetype); + } +diff --git a/mm/internal.h b/mm/internal.h +index de564608dfa6..8742aafde387 100644 +--- a/mm/internal.h ++++ b/mm/internal.h +@@ -1171,11 +1171,6 @@ static inline bool is_migrate_highatomic(enum migratetype migratetype) + return migratetype == MIGRATE_HIGHATOMIC; + } + +-static inline bool is_migrate_highatomic_page(struct page *page) +-{ +- return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC; +-} +- + void setup_zone_pageset(struct zone *zone); + + struct migration_target_control { +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 3bc1502e42cf..d662bbdf2e91 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -636,23 +636,33 @@ compaction_capture(struct capture_control *capc, struct page *page, + } + #endif /* CONFIG_COMPACTION */ + +-/* Used for pages not on another list */ +-static inline void add_to_free_list(struct page *page, struct zone *zone, +- unsigned int order, int migratetype) ++static inline void account_freepages(struct zone *zone, int nr_pages, ++ int migratetype) + { +- struct free_area *area = &zone->free_area[order]; ++ if (is_migrate_isolate(migratetype)) ++ return; + +- list_add(&page->buddy_list, &area->free_list[migratetype]); +- area->nr_free++; ++ __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages); ++ ++ if (is_migrate_cma(migratetype)) ++ __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages); + } + + /* Used for pages not on another list */ +-static inline void add_to_free_list_tail(struct page *page, struct zone *zone, +- unsigned int order, int migratetype) ++static inline void __add_to_free_list(struct page *page, struct zone *zone, ++ unsigned int order, int migratetype, ++ bool tail) + { + struct free_area *area = &zone->free_area[order]; + +- list_add_tail(&page->buddy_list, &area->free_list[migratetype]); ++ VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype, ++ "page type is %lu, passed 
migratetype is %d (nr=%d)\n", ++ get_pageblock_migratetype(page), migratetype, 1 << order); ++ ++ if (tail) ++ list_add_tail(&page->buddy_list, &area->free_list[migratetype]); ++ else ++ list_add(&page->buddy_list, &area->free_list[migratetype]); + area->nr_free++; + } + +@@ -662,16 +672,28 @@ static inline void add_to_free_list_tail(struct page *page, struct zone *zone, + * allocation again (e.g., optimization for memory onlining). + */ + static inline void move_to_free_list(struct page *page, struct zone *zone, +- unsigned int order, int migratetype) ++ unsigned int order, int old_mt, int new_mt) + { + struct free_area *area = &zone->free_area[order]; + +- list_move_tail(&page->buddy_list, &area->free_list[migratetype]); ++ /* Free page moving can fail, so it happens before the type update */ ++ VM_WARN_ONCE(get_pageblock_migratetype(page) != old_mt, ++ "page type is %lu, passed migratetype is %d (nr=%d)\n", ++ get_pageblock_migratetype(page), old_mt, 1 << order); ++ ++ list_move_tail(&page->buddy_list, &area->free_list[new_mt]); ++ ++ account_freepages(zone, -(1 << order), old_mt); ++ account_freepages(zone, 1 << order, new_mt); + } + +-static inline void del_page_from_free_list(struct page *page, struct zone *zone, +- unsigned int order) ++static inline void __del_page_from_free_list(struct page *page, struct zone *zone, ++ unsigned int order, int migratetype) + { ++ VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype, ++ "page type is %lu, passed migratetype is %d (nr=%d)\n", ++ get_pageblock_migratetype(page), migratetype, 1 << order); ++ + /* clear reported state and update reported page count */ + if (page_reported(page)) + __ClearPageReported(page); +@@ -682,6 +704,13 @@ static inline void del_page_from_free_list(struct page *page, struct zone *zone, + zone->free_area[order].nr_free--; + } + ++static inline void del_page_from_free_list(struct page *page, struct zone *zone, ++ unsigned int order, int migratetype) ++{ ++ __del_page_from_free_list(page, zone, order, migratetype); ++ account_freepages(zone, -(1 << order), migratetype); ++} ++ + static inline struct page *get_page_from_free_area(struct free_area *area, + int migratetype) + { +@@ -753,16 +782,16 @@ static inline void __free_one_page(struct page *page, + VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); + + VM_BUG_ON(migratetype == -1); +- if (likely(!is_migrate_isolate(migratetype))) +- __mod_zone_freepage_state(zone, 1 << order, migratetype); +- + VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); + VM_BUG_ON_PAGE(bad_range(zone, page), page); + ++ account_freepages(zone, 1 << order, migratetype); ++ + while (order < MAX_ORDER) { ++ int buddy_mt = migratetype; ++ + if (compaction_capture(capc, page, order, migratetype)) { +- __mod_zone_freepage_state(zone, -(1 << order), +- migratetype); ++ account_freepages(zone, -(1 << order), migratetype); + return; + } + +@@ -777,19 +806,12 @@ static inline void __free_one_page(struct page *page, + * pageblock isolation could cause incorrect freepage or CMA + * accounting or HIGHATOMIC accounting. + */ +- int buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn); ++ buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn); + +- if (migratetype != buddy_mt) { +- if (!migratetype_is_mergeable(migratetype) || +- !migratetype_is_mergeable(buddy_mt)) +- goto done_merging; +- /* +- * Match buddy type. This ensures that +- * an expand() down the line puts the +- * sub-blocks on the right freelists. 
+- */ +- set_pageblock_migratetype(buddy, migratetype); +- } ++ if (migratetype != buddy_mt && ++ (!migratetype_is_mergeable(migratetype) || ++ !migratetype_is_mergeable(buddy_mt))) ++ goto done_merging; + } + + /* +@@ -797,9 +819,19 @@ static inline void __free_one_page(struct page *page, + * merge with it and move up one order. + */ + if (page_is_guard(buddy)) +- clear_page_guard(zone, buddy, order, migratetype); ++ clear_page_guard(zone, buddy, order); + else +- del_page_from_free_list(buddy, zone, order); ++ __del_page_from_free_list(buddy, zone, order, buddy_mt); ++ ++ if (unlikely(buddy_mt != migratetype)) { ++ /* ++ * Match buddy type. This ensures that an ++ * expand() down the line puts the sub-blocks ++ * on the right freelists. ++ */ ++ set_pageblock_migratetype(buddy, migratetype); ++ } ++ + combined_pfn = buddy_pfn & pfn; + page = page + (combined_pfn - pfn); + pfn = combined_pfn; +@@ -816,10 +848,7 @@ static inline void __free_one_page(struct page *page, + else + to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order); + +- if (to_tail) +- add_to_free_list_tail(page, zone, order, migratetype); +- else +- add_to_free_list(page, zone, order, migratetype); ++ __add_to_free_list(page, zone, order, migratetype, to_tail); + + /* Notify page reporting subsystem of freed page */ + if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY)) +@@ -1309,10 +1338,10 @@ static inline void expand(struct zone *zone, struct page *page, + * Corresponding page table entries will not be touched, + * pages will stay not present in virtual address space + */ +- if (set_page_guard(zone, &page[size], high, migratetype)) ++ if (set_page_guard(zone, &page[size], high)) + continue; + +- add_to_free_list(&page[size], zone, high, migratetype); ++ add_to_free_list(&page[size], zone, high, migratetype, false); + set_buddy_order(&page[size], high); + } + } +@@ -1503,7 +1532,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, + page = get_page_from_free_area(area, migratetype); + if (!page) + continue; +- del_page_from_free_list(page, zone, current_order); ++ del_page_from_free_list(page, zone, current_order, migratetype); + expand(zone, page, order, current_order, migratetype); + trace_mm_page_alloc_zone_locked(page, order, migratetype, + pcp_allowed_order(order) && +@@ -1543,7 +1572,7 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone, + * type's freelist. 
+ */ + static int move_freepages(struct zone *zone, unsigned long start_pfn, +- unsigned long end_pfn, int migratetype) ++ unsigned long end_pfn, int old_mt, int new_mt) + { + struct page *page; + unsigned long pfn; +@@ -1565,12 +1594,14 @@ static int move_freepages(struct zone *zone, unsigned long start_pfn, + VM_BUG_ON_PAGE(page_zone(page) != zone, page); + + order = buddy_order(page); +- move_to_free_list(page, zone, order, migratetype); ++ ++ move_to_free_list(page, zone, order, old_mt, new_mt); ++ + pfn += 1 << order; + pages_moved += 1 << order; + } + +- set_pageblock_migratetype(pfn_to_page(start_pfn), migratetype); ++ set_pageblock_migratetype(pfn_to_page(start_pfn), new_mt); + + return pages_moved; + } +@@ -1628,7 +1659,7 @@ static bool prep_move_freepages_block(struct zone *zone, struct page *page, + } + + static int move_freepages_block(struct zone *zone, struct page *page, +- int migratetype) ++ int old_mt, int new_mt) + { + unsigned long start_pfn, end_pfn; + +@@ -1636,7 +1667,7 @@ static int move_freepages_block(struct zone *zone, struct page *page, + NULL, NULL)) + return -1; + +- return move_freepages(zone, start_pfn, end_pfn, migratetype); ++ return move_freepages(zone, start_pfn, end_pfn, old_mt, new_mt); + } + + #ifdef CONFIG_MEMORY_ISOLATION +@@ -1708,7 +1739,6 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page, + int migratetype) + { + unsigned long start_pfn, end_pfn, pfn; +- int nr_moved, mt; + + if (!prep_move_freepages_block(zone, page, &start_pfn, &end_pfn, + NULL, NULL)) +@@ -1723,11 +1753,9 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page, + if (pfn != start_pfn) { + struct page *buddy = pfn_to_page(pfn); + int order = buddy_order(buddy); +- int mt = get_pfnblock_migratetype(buddy, pfn); + +- if (!is_migrate_isolate(mt)) +- __mod_zone_freepage_state(zone, -(1UL << order), mt); +- del_page_from_free_list(buddy, zone, order); ++ del_page_from_free_list(buddy, zone, order, ++ get_pfnblock_migratetype(buddy, pfn)); + set_pageblock_migratetype(page, migratetype); + split_large_buddy(zone, buddy, pfn, order); + return true; +@@ -1735,23 +1763,17 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page, + + /* We're the starting block of a larger buddy */ + if (PageBuddy(page) && buddy_order(page) > pageblock_order) { +- int mt = get_pfnblock_migratetype(page, pfn); + int order = buddy_order(page); + +- if (!is_migrate_isolate(mt)) +- __mod_zone_freepage_state(zone, -(1UL << order), mt); +- del_page_from_free_list(page, zone, order); ++ del_page_from_free_list(page, zone, order, ++ get_pfnblock_migratetype(page, pfn)); + set_pageblock_migratetype(page, migratetype); + split_large_buddy(zone, page, pfn, order); + return true; + } + move: +- mt = get_pfnblock_migratetype(page, start_pfn); +- nr_moved = move_freepages(zone, start_pfn, end_pfn, migratetype); +- if (!is_migrate_isolate(mt)) +- __mod_zone_freepage_state(zone, -nr_moved, mt); +- else if (!is_migrate_isolate(migratetype)) +- __mod_zone_freepage_state(zone, nr_moved, migratetype); ++ move_freepages(zone, start_pfn, end_pfn, ++ get_pfnblock_migratetype(page, start_pfn), migratetype); + return true; + } + #endif /* CONFIG_MEMORY_ISOLATION */ +@@ -1865,7 +1887,7 @@ steal_suitable_fallback(struct zone *zone, struct page *page, + + /* Take ownership for orders >= pageblock_order */ + if (current_order >= pageblock_order) { +- del_page_from_free_list(page, zone, current_order); ++ del_page_from_free_list(page, zone, current_order, block_type); + 
change_pageblock_range(page, current_order, start_type); + expand(zone, page, order, current_order, start_type); + return page; +@@ -1915,12 +1937,12 @@ steal_suitable_fallback(struct zone *zone, struct page *page, + */ + if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || + page_group_by_mobility_disabled) { +- move_freepages(zone, start_pfn, end_pfn, start_type); ++ move_freepages(zone, start_pfn, end_pfn, block_type, start_type); + return __rmqueue_smallest(zone, order, start_type); + } + + single_page: +- del_page_from_free_list(page, zone, current_order); ++ del_page_from_free_list(page, zone, current_order, block_type); + expand(zone, page, order, current_order, block_type); + return page; + } +@@ -1990,7 +2012,7 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone) + mt = get_pageblock_migratetype(page); + /* Only reserve normal pageblocks (i.e., they can merge with others) */ + if (migratetype_is_mergeable(mt)) +- if (move_freepages_block(zone, page, ++ if (move_freepages_block(zone, page, mt, + MIGRATE_HIGHATOMIC) != -1) + zone->nr_reserved_highatomic += pageblock_nr_pages; + +@@ -2031,11 +2053,13 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, + spin_lock_irqsave(&zone->lock, flags); + for (order = 0; order < NR_PAGE_ORDERS; order++) { + struct free_area *area = &(zone->free_area[order]); ++ int mt; + + page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC); + if (!page) + continue; + ++ mt = get_pageblock_migratetype(page); + /* + * In page freeing path, migratetype change is racy so + * we can counter several free pages in a pageblock +@@ -2043,7 +2067,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, + * from highatomic to ac->migratetype. So we should + * adjust the count once. + */ +- if (is_migrate_highatomic_page(page)) { ++ if (is_migrate_highatomic(mt)) { + /* + * It should never happen but changes to + * locking could inadvertently allow a per-cpu +@@ -2065,7 +2089,8 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, + * of pageblocks that cannot be completely freed + * may increase. + */ +- ret = move_freepages_block(zone, page, ac->migratetype); ++ ret = move_freepages_block(zone, page, mt, ++ ac->migratetype); + /* + * Reserving this block already succeeded, so this should + * not fail on zone boundaries. +@@ -2236,12 +2261,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, + * pages are ordered properly. 
+ */ + list_add_tail(&page->pcp_list, list); +- if (is_migrate_cma(get_pageblock_migratetype(page))) +- __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, +- -(1 << order)); + } +- +- __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); + spin_unlock_irqrestore(&zone->lock, flags); + + return i; +@@ -2751,11 +2771,9 @@ int __isolate_free_page(struct page *page, unsigned int order) + watermark = zone->_watermark[WMARK_MIN] + (1UL << order); + if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) + return 0; +- +- __mod_zone_freepage_state(zone, -(1UL << order), mt); + } + +- del_page_from_free_list(page, zone, order); ++ del_page_from_free_list(page, zone, order, mt); + + /* + * Set the pageblock if the isolated page is at least half of a +@@ -2770,7 +2788,7 @@ int __isolate_free_page(struct page *page, unsigned int order) + * with others) + */ + if (migratetype_is_mergeable(mt)) +- move_freepages_block(zone, page, ++ move_freepages_block(zone, page, mt, + MIGRATE_MOVABLE); + } + } +@@ -2855,8 +2873,6 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, + return NULL; + } + } +- __mod_zone_freepage_state(zone, -(1 << order), +- get_pageblock_migratetype(page)); + spin_unlock_irqrestore(&zone->lock, flags); + } while (check_new_pages(page, order)); + +@@ -6940,8 +6956,9 @@ void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) + + BUG_ON(page_count(page)); + BUG_ON(!PageBuddy(page)); ++ VM_WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE); + order = buddy_order(page); +- del_page_from_free_list(page, zone, order); ++ del_page_from_free_list(page, zone, order, MIGRATE_ISOLATE); + pfn += (1 << order); + } + spin_unlock_irqrestore(&zone->lock, flags); +@@ -6969,6 +6986,14 @@ bool is_free_buddy_page(struct page *page) + EXPORT_SYMBOL(is_free_buddy_page); + + #ifdef CONFIG_MEMORY_FAILURE ++static inline void add_to_free_list(struct page *page, struct zone *zone, ++ unsigned int order, int migratetype, ++ bool tail) ++{ ++ __add_to_free_list(page, zone, order, migratetype, tail); ++ account_freepages(zone, 1 << order, migratetype); ++} ++ + /* + * Break down a higher-order page in sub-pages, and keep our target out of + * buddy allocator. 
+@@ -6991,10 +7016,10 @@ static void break_down_buddy_pages(struct zone *zone, struct page *page, + current_buddy = page + size; + } + +- if (set_page_guard(zone, current_buddy, high, migratetype)) ++ if (set_page_guard(zone, current_buddy, high)) + continue; + +- add_to_free_list(current_buddy, zone, high, migratetype); ++ add_to_free_list(current_buddy, zone, high, migratetype, false); + set_buddy_order(current_buddy, high); + } + } +@@ -7020,12 +7045,11 @@ bool take_page_off_buddy(struct page *page) + int migratetype = get_pfnblock_migratetype(page_head, + pfn_head); + +- del_page_from_free_list(page_head, zone, page_order); ++ del_page_from_free_list(page_head, zone, page_order, ++ migratetype); + break_down_buddy_pages(zone, page_head, page, 0, + page_order, migratetype); + SetPageHWPoisonTakenOff(page); +- if (!is_migrate_isolate(migratetype)) +- __mod_zone_freepage_state(zone, -1, migratetype); + ret = true; + break; + } +@@ -7130,7 +7154,7 @@ static bool try_to_accept_memory_one(struct zone *zone) + list_del(&page->lru); + last = list_empty(&zone->unaccepted_pages); + +- __mod_zone_freepage_state(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); ++ account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); + __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES); + spin_unlock_irqrestore(&zone->lock, flags); + +@@ -7188,7 +7212,7 @@ static bool __free_unaccepted(struct page *page) + spin_lock_irqsave(&zone->lock, flags); + first = list_empty(&zone->unaccepted_pages); + list_add_tail(&page->lru, &zone->unaccepted_pages); +- __mod_zone_freepage_state(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); ++ account_freepages(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); + __mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES); + spin_unlock_irqrestore(&zone->lock, flags); + +-- +Gitee + + +From b0fd1d17c59627bb8b64234dc3d7b278f8b4656e Mon Sep 17 00:00:00 2001 +From: Vlastimil Babka +Date: Wed, 18 Dec 2024 17:34:57 +0800 +Subject: [PATCH 14/19] mm: page_alloc: change move_freepages() to + __move_freepages_block() + +mainline inclusion +from mainline-v6.10-rc1 +commit e1f42a577f63647dadf1abe4583053c03d6be045 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC4T0 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e1f42a577f63647dadf1abe4583053c03d6be045 + +-------------------------------- + +The function is now supposed to be called only on a single pageblock and +checks start_pfn and end_pfn accordingly. Rename it to make this more +obvious and drop the end_pfn parameter which can be determined trivially +and none of the callers use it for anything else. + +Also make the (now internal) end_pfn exclusive, which is more common. + +Link: https://lkml.kernel.org/r/81b1d642-2ec0-49f5-89fc-19a3828419ff@suse.cz +Signed-off-by: Vlastimil Babka +Reviewed-by: Zi Yan +Acked-by: Johannes Weiner +Cc: David Hildenbrand +Cc: "Huang, Ying" +Cc: Mel Gorman +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/page_alloc.c | 43 ++++++++++++++++++++----------------------- + 1 file changed, 20 insertions(+), 23 deletions(-) + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index d662bbdf2e91..7270d665fc53 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -1571,18 +1571,18 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone, + * Change the type of a block and move all its free pages to that + * type's freelist. 
+ */ +-static int move_freepages(struct zone *zone, unsigned long start_pfn, +- unsigned long end_pfn, int old_mt, int new_mt) ++static int __move_freepages_block(struct zone *zone, unsigned long start_pfn, ++ int old_mt, int new_mt) + { + struct page *page; +- unsigned long pfn; ++ unsigned long pfn, end_pfn; + unsigned int order; + int pages_moved = 0; + + VM_WARN_ON(start_pfn & (pageblock_nr_pages - 1)); +- VM_WARN_ON(start_pfn + pageblock_nr_pages - 1 != end_pfn); ++ end_pfn = pageblock_end_pfn(start_pfn); + +- for (pfn = start_pfn; pfn <= end_pfn;) { ++ for (pfn = start_pfn; pfn < end_pfn;) { + page = pfn_to_page(pfn); + if (!PageBuddy(page)) { + pfn++; +@@ -1608,14 +1608,13 @@ static int move_freepages(struct zone *zone, unsigned long start_pfn, + + static bool prep_move_freepages_block(struct zone *zone, struct page *page, + unsigned long *start_pfn, +- unsigned long *end_pfn, + int *num_free, int *num_movable) + { + unsigned long pfn, start, end; + + pfn = page_to_pfn(page); + start = pageblock_start_pfn(pfn); +- end = pageblock_end_pfn(pfn) - 1; ++ end = pageblock_end_pfn(pfn); + + /* + * The caller only has the lock for @zone, don't touch ranges +@@ -1626,16 +1625,15 @@ static bool prep_move_freepages_block(struct zone *zone, struct page *page, + */ + if (!zone_spans_pfn(zone, start)) + return false; +- if (!zone_spans_pfn(zone, end)) ++ if (!zone_spans_pfn(zone, end - 1)) + return false; + + *start_pfn = start; +- *end_pfn = end; + + if (num_free) { + *num_free = 0; + *num_movable = 0; +- for (pfn = start; pfn <= end;) { ++ for (pfn = start; pfn < end;) { + page = pfn_to_page(pfn); + if (PageBuddy(page)) { + int nr = 1 << buddy_order(page); +@@ -1661,13 +1659,12 @@ static bool prep_move_freepages_block(struct zone *zone, struct page *page, + static int move_freepages_block(struct zone *zone, struct page *page, + int old_mt, int new_mt) + { +- unsigned long start_pfn, end_pfn; ++ unsigned long start_pfn; + +- if (!prep_move_freepages_block(zone, page, &start_pfn, &end_pfn, +- NULL, NULL)) ++ if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL)) + return -1; + +- return move_freepages(zone, start_pfn, end_pfn, old_mt, new_mt); ++ return __move_freepages_block(zone, start_pfn, old_mt, new_mt); + } + + #ifdef CONFIG_MEMORY_ISOLATION +@@ -1738,10 +1735,9 @@ static void split_large_buddy(struct zone *zone, struct page *page, + bool move_freepages_block_isolate(struct zone *zone, struct page *page, + int migratetype) + { +- unsigned long start_pfn, end_pfn, pfn; ++ unsigned long start_pfn, pfn; + +- if (!prep_move_freepages_block(zone, page, &start_pfn, &end_pfn, +- NULL, NULL)) ++ if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL)) + return false; + + /* No splits needed if buddies can't span multiple blocks */ +@@ -1772,8 +1768,9 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page, + return true; + } + move: +- move_freepages(zone, start_pfn, end_pfn, +- get_pfnblock_migratetype(page, start_pfn), migratetype); ++ __move_freepages_block(zone, start_pfn, ++ get_pfnblock_migratetype(page, start_pfn), ++ migratetype); + return true; + } + #endif /* CONFIG_MEMORY_ISOLATION */ +@@ -1873,7 +1870,7 @@ steal_suitable_fallback(struct zone *zone, struct page *page, + unsigned int alloc_flags, bool whole_block) + { + int free_pages, movable_pages, alike_pages; +- unsigned long start_pfn, end_pfn; ++ unsigned long start_pfn; + int block_type; + + block_type = get_pageblock_migratetype(page); +@@ -1906,8 +1903,8 @@ steal_suitable_fallback(struct 
zone *zone, struct page *page, + goto single_page; + + /* moving whole block can fail due to zone boundary conditions */ +- if (!prep_move_freepages_block(zone, page, &start_pfn, &end_pfn, +- &free_pages, &movable_pages)) ++ if (!prep_move_freepages_block(zone, page, &start_pfn, &free_pages, ++ &movable_pages)) + goto single_page; + + /* +@@ -1937,7 +1934,7 @@ steal_suitable_fallback(struct zone *zone, struct page *page, + */ + if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || + page_group_by_mobility_disabled) { +- move_freepages(zone, start_pfn, end_pfn, block_type, start_type); ++ __move_freepages_block(zone, start_pfn, block_type, start_type); + return __rmqueue_smallest(zone, order, start_type); + } + +-- +Gitee + + +From e71a8422c09179c193a4806cdc764fe626fb4344 Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Wed, 18 Dec 2024 17:34:58 +0800 +Subject: [PATCH 15/19] mm: page_alloc: batch vmstat updates in expand() + +mainline inclusion +from mainline-v6.10-rc1 +commit 883dd161e9a83e188487debc562b1928917a4b39 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC4T0 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=883dd161e9a83e188487debc562b1928917a4b39 + +-------------------------------- + +expand() currently updates vmstat for every subpage. This is unnecessary, +since they're all of the same zone and migratetype. + +Count added pages locally, then do a single vmstat update. + +Link: https://lkml.kernel.org/r/20240327190111.GC7597@cmpxchg.org +Signed-off-by: Johannes Weiner +Suggested-by: Vlastimil Babka +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/page_alloc.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 7270d665fc53..4e2ec54b6a7f 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -1326,6 +1326,7 @@ static inline void expand(struct zone *zone, struct page *page, + int low, int high, int migratetype) + { + unsigned long size = 1 << high; ++ unsigned long nr_added = 0; + + while (high > low) { + high--; +@@ -1341,9 +1342,11 @@ static inline void expand(struct zone *zone, struct page *page, + if (set_page_guard(zone, &page[size], high)) + continue; + +- add_to_free_list(&page[size], zone, high, migratetype, false); ++ __add_to_free_list(&page[size], zone, high, migratetype, false); + set_buddy_order(&page[size], high); ++ nr_added += size; + } ++ account_freepages(zone, nr_added, migratetype); + } + + static void check_new_page_bad(struct page *page) +-- +Gitee + + +From eba87a33b157ef221af2fe47d88dcc57b2d549bf Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Wed, 18 Dec 2024 17:34:59 +0800 +Subject: [PATCH 16/19] mm: page_alloc: fix highatomic typing in multi-block + buddies + +mainline inclusion +from mainline-v6.10-rc3 +commit 7cc5a5d65011983952a9c62f170f5b79e24b1239 +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC4T0 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=7cc5a5d65011983952a9c62f170f5b79e24b1239 + +-------------------------------- + +Christoph reports a page allocator splat triggered by xfstests: + +generic/176 214s ... 
[ 1204.507931] run fstests generic/176 at 2024-05-27 12:52:30 +XFS (nvme0n1): Mounting V5 Filesystem cd936307-415f-48a3-b99d-a2d52ae1f273 +XFS (nvme0n1): Ending clean mount +XFS (nvme1n1): Mounting V5 Filesystem ab3ee1a4-af62-4934-9a6a-6c2fde321850 +XFS (nvme1n1): Ending clean mount +XFS (nvme1n1): Unmounting Filesystem ab3ee1a4-af62-4934-9a6a-6c2fde321850 +XFS (nvme1n1): Mounting V5 Filesystem 7099b02d-9c58-4d1d-be1d-2cc472d12cd9 +XFS (nvme1n1): Ending clean mount +------------[ cut here ]------------ +page type is 3, passed migratetype is 1 (nr=512) +WARNING: CPU: 0 PID: 509870 at mm/page_alloc.c:645 expand+0x1c5/0x1f0 +Modules linked in: i2c_i801 crc32_pclmul i2c_smbus [last unloaded: scsi_debug] +CPU: 0 PID: 509870 Comm: xfs_io Not tainted 6.10.0-rc1+ #2437 +Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 +RIP: 0010:expand+0x1c5/0x1f0 +Code: 05 16 70 bf 02 01 e8 ca fc ff ff 8b 54 24 34 44 89 e1 48 c7 c7 80 a2 28 83 48 89 c6 b8 01 00 3 +RSP: 0018:ffffc90003b2b968 EFLAGS: 00010082 +RAX: 0000000000000000 RBX: ffffffff83fa9480 RCX: 0000000000000000 +RDX: 0000000000000005 RSI: 0000000000000027 RDI: 00000000ffffffff +RBP: 00000000001f2600 R08: 00000000fffeffff R09: 0000000000000001 +R10: 0000000000000000 R11: ffffffff83676200 R12: 0000000000000009 +R13: 0000000000000200 R14: 0000000000000001 R15: ffffea0007c98000 +FS: 00007f72ca3d5780(0000) GS:ffff8881f9c00000(0000) knlGS:0000000000000000 +CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +CR2: 00007f72ca1fff38 CR3: 00000001aa0c6002 CR4: 0000000000770ef0 +DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 +DR3: 0000000000000000 DR6: 00000000ffff07f0 DR7: 0000000000000400 +PKRU: 55555554 +Call Trace: + + ? __warn+0x7b/0x120 + ? expand+0x1c5/0x1f0 + ? report_bug+0x191/0x1c0 + ? handle_bug+0x3c/0x80 + ? exc_invalid_op+0x17/0x70 + ? asm_exc_invalid_op+0x1a/0x20 + ? expand+0x1c5/0x1f0 + ? expand+0x1c5/0x1f0 + __rmqueue_pcplist+0x3a9/0x730 + get_page_from_freelist+0x7a0/0xf00 + __alloc_pages_noprof+0x153/0x2e0 + __folio_alloc_noprof+0x10/0xa0 + __filemap_get_folio+0x16b/0x370 + iomap_write_begin+0x496/0x680 + +While trying to service a movable allocation (page type 1), the page +allocator runs into a two-pageblock buddy on the movable freelist whose +second block is typed as highatomic (page type 3). + +This inconsistency is caused by the highatomic reservation system +operating on single pageblocks, while MAX_ORDER can be bigger than that - +in this configuration, pageblock_order is 9 while MAX_PAGE_ORDER is 10. +The test case is observed to make several adjacent order-3 requests with +__GFP_DIRECT_RECLAIM cleared, which marks the surrounding block as +highatomic. Upon freeing, the blocks merge into an order-10 buddy. When +the highatomic pool is drained later on, this order-10 buddy gets moved +back to the movable list, but only the first pageblock is marked movable +again. A subsequent expand() of this buddy warns about the tail being of +a different type. + +This is a long-standing bug that's surfaced by the recent block type +warnings added to the allocator. The consequences seem mostly benign, it +just results in odd behavior: the highatomic tail blocks are not properly +drained, instead they end up on the movable list first, then go back to +the highatomic list after an alloc-free cycle. + +To fix this, make the highatomic reservation code aware that +allocations/buddies can be larger than a pageblock. 
+ +While it's an old quirk, the recently added type consistency warnings seem +to be the most prominent consequence of it. Set the Fixes: tag +accordingly to highlight this backporting dependency. + +Link: https://lkml.kernel.org/r/20240530114203.GA1222079@cmpxchg.org +Fixes: e0932b6c1f94 ("mm: page_alloc: consolidate free page accounting") +Signed-off-by: Johannes Weiner +Reported-by: Christoph Hellwig +Reviewed-by: Zi Yan +Tested-by: Christoph Hellwig +Cc: Andy Shevchenko +Cc: Baolin Wang +Cc: Mel Gorman +Cc: Vlastimil Babka +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/page_alloc.c | 50 +++++++++++++++++++++++++++++++++---------------- + 1 file changed, 34 insertions(+), 16 deletions(-) + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 4e2ec54b6a7f..53534165b5ab 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -1982,10 +1982,12 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, + } + + /* +- * Reserve a pageblock for exclusive use of high-order atomic allocations if +- * there are no empty page blocks that contain a page with a suitable order ++ * Reserve the pageblock(s) surrounding an allocation request for ++ * exclusive use of high-order atomic allocations if there are no ++ * empty page blocks that contain a page with a suitable order + */ +-static void reserve_highatomic_pageblock(struct page *page, struct zone *zone) ++static void reserve_highatomic_pageblock(struct page *page, int order, ++ struct zone *zone) + { + int mt; + unsigned long max_managed, flags; +@@ -2011,10 +2013,17 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone) + /* Yoink! */ + mt = get_pageblock_migratetype(page); + /* Only reserve normal pageblocks (i.e., they can merge with others) */ +- if (migratetype_is_mergeable(mt)) +- if (move_freepages_block(zone, page, mt, +- MIGRATE_HIGHATOMIC) != -1) +- zone->nr_reserved_highatomic += pageblock_nr_pages; ++ if (!migratetype_is_mergeable(mt)) ++ goto out_unlock; ++ ++ if (order < pageblock_order) { ++ if (move_freepages_block(zone, page, mt, MIGRATE_HIGHATOMIC) == -1) ++ goto out_unlock; ++ zone->nr_reserved_highatomic += pageblock_nr_pages; ++ } else { ++ change_pageblock_range(page, order, MIGRATE_HIGHATOMIC); ++ zone->nr_reserved_highatomic += 1 << order; ++ } + + out_unlock: + spin_unlock_irqrestore(&zone->lock, flags); +@@ -2026,7 +2035,7 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone) + * intense memory pressure but failed atomic allocations should be easier + * to recover from than an OOM. + * +- * If @force is true, try to unreserve a pageblock even though highatomic ++ * If @force is true, try to unreserve pageblocks even though highatomic + * pageblock is exhausted. + */ + static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, +@@ -2068,6 +2077,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, + * adjust the count once. + */ + if (is_migrate_highatomic(mt)) { ++ unsigned long size; + /* + * It should never happen but changes to + * locking could inadvertently allow a per-cpu +@@ -2075,9 +2085,9 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, + * while unreserving so be safe and watch for + * underflows. 
+ */ +- zone->nr_reserved_highatomic -= min( +- pageblock_nr_pages, +- zone->nr_reserved_highatomic); ++ size = max(pageblock_nr_pages, 1UL << order); ++ size = min(size, zone->nr_reserved_highatomic); ++ zone->nr_reserved_highatomic -= size; + } + + /* +@@ -2089,11 +2099,19 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, + * of pageblocks that cannot be completely freed + * may increase. + */ +- ret = move_freepages_block(zone, page, mt, +- ac->migratetype); ++ if (order < pageblock_order) ++ ret = move_freepages_block(zone, page, mt, ++ ac->migratetype); ++ else { ++ move_to_free_list(page, zone, order, mt, ++ ac->migratetype); ++ change_pageblock_range(page, order, ++ ac->migratetype); ++ ret = 1; ++ } + /* +- * Reserving this block already succeeded, so this should +- * not fail on zone boundaries. ++ * Reserving the block(s) already succeeded, ++ * so this should not fail on zone boundaries. + */ + WARN_ON_ONCE(ret == -1); + if (ret > 0) { +@@ -3440,7 +3458,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, + * if the pageblock should be reserved for the future + */ + if (unlikely(alloc_flags & ALLOC_HIGHATOMIC)) +- reserve_highatomic_pageblock(page, zone); ++ reserve_highatomic_pageblock(page, order, zone); + + return page; + } else { +-- +Gitee + + +From 6285cc96f1f40030144d86d54604450cf31b3588 Mon Sep 17 00:00:00 2001 +From: Yu Zhao +Date: Wed, 18 Dec 2024 17:35:00 +0800 +Subject: [PATCH 17/19] mm/page_alloc: keep track of free highatomic + +mainline inclusion +from mainline-v6.12-rc7 +commit c928807f6f6b6d595a7e199591ae297c81de3aeb +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC4T0 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c928807f6f6b6d595a7e199591ae297c81de3aeb + +-------------------------------- + +OOM kills due to vastly overestimated free highatomic reserves were +observed: + + ... invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0 ... + Node 0 Normal free:1482936kB boost:0kB min:410416kB low:739404kB high:1068392kB reserved_highatomic:1073152KB ... + Node 0 Normal: 1292*4kB (ME) 1920*8kB (E) 383*16kB (UE) 220*32kB (ME) 340*64kB (E) 2155*128kB (UE) 3243*256kB (UE) 615*512kB (U) 1*1024kB (M) 0*2048kB 0*4096kB = 1477408kB + +The second line above shows that the OOM kill was due to the following +condition: + + free (1482936kB) - reserved_highatomic (1073152kB) = 409784KB < min (410416kB) + +And the third line shows there were no free pages in any +MIGRATE_HIGHATOMIC pageblocks, which otherwise would show up as type 'H'. +Therefore __zone_watermark_unusable_free() underestimated the usable free +memory by over 1GB, which resulted in the unnecessary OOM kill above. + +The comments in __zone_watermark_unusable_free() warns about the potential +risk, i.e., + + If the caller does not have rights to reserves below the min + watermark then subtract the high-atomic reserves. This will + over-estimate the size of the atomic reserve but it avoids a search. + +However, it is possible to keep track of free pages in reserved highatomic +pageblocks with a new per-zone counter nr_free_highatomic protected by the +zone lock, to avoid a search when calculating the usable free memory. And +the cost would be minimal, i.e., simple arithmetics in the highatomic +alloc/free/move paths. 
+ +Note that since nr_free_highatomic can be relatively small, using a +per-cpu counter might cause too much drift and defeat its purpose, in +addition to the extra memory overhead. + +Dependson e0932b6c1f94 ("mm: page_alloc: consolidate free page accounting") - see [1] + +[akpm@linux-foundation.org: s/if/else if/, per Johannes, stealth whitespace tweak] +Link: https://lkml.kernel.org/r/20241028182653.3420139-1-yuzhao@google.com +Link: https://lkml.kernel.org/r/0d0ddb33-fcdc-43e2-801f-0c1df2031afb@suse.cz [1] +Fixes: 0aaa29a56e4f ("mm, page_alloc: reserve pageblocks for high-order atomic allocations on demand") +Signed-off-by: Yu Zhao +Reported-by: Link Lin +Acked-by: David Rientjes +Acked-by: Vlastimil Babka +Acked-by: Johannes Weiner +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + include/linux/mmzone.h | 1 + + mm/page_alloc.c | 10 +++++++--- + 2 files changed, 8 insertions(+), 3 deletions(-) + +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index 3cee238de7c8..18bee72ebc71 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -865,6 +865,7 @@ struct zone { + unsigned long watermark_boost; + + unsigned long nr_reserved_highatomic; ++ unsigned long nr_free_highatomic; + + /* + * We don't know if the memory that we're going to allocate will be +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 53534165b5ab..e786cdd98bea 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -639,6 +639,8 @@ compaction_capture(struct capture_control *capc, struct page *page, + static inline void account_freepages(struct zone *zone, int nr_pages, + int migratetype) + { ++ lockdep_assert_held(&zone->lock); ++ + if (is_migrate_isolate(migratetype)) + return; + +@@ -646,6 +648,9 @@ static inline void account_freepages(struct zone *zone, int nr_pages, + + if (is_migrate_cma(migratetype)) + __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages); ++ else if (is_migrate_highatomic(migratetype)) ++ WRITE_ONCE(zone->nr_free_highatomic, ++ zone->nr_free_highatomic + nr_pages); + } + + /* Used for pages not on another list */ +@@ -3072,11 +3077,10 @@ static inline long __zone_watermark_unusable_free(struct zone *z, + + /* + * If the caller does not have rights to reserves below the min +- * watermark then subtract the high-atomic reserves. This will +- * over-estimate the size of the atomic reserve but it avoids a search. ++ * watermark then subtract the free pages reserved for highatomic. 
+ */ + if (likely(!(alloc_flags & ALLOC_RESERVES))) +- unusable_free += z->nr_reserved_highatomic; ++ unusable_free += READ_ONCE(z->nr_free_highatomic); + + #ifdef CONFIG_CMA + /* If allocation can't use CMA areas don't use free CMA pages */ +-- +Gitee + + +From 10a32a6824d78e5e36332cb1b7a4ee0bcca6812b Mon Sep 17 00:00:00 2001 +From: Kefeng Wang +Date: Wed, 18 Dec 2024 17:35:01 +0800 +Subject: [PATCH 18/19] mm: remove migration for HugePage in + isolate_single_pageblock() + +mainline inclusion +from mainline-v6.12-rc1 +commit cd5f3193b432cd70cc1c19aba790300dd11ae934 +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC4T0 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=cd5f3193b432cd70cc1c19aba790300dd11ae934 + +-------------------------------- + +The gigantic page size may larger than memory block size, so memory +offline always fails in this case after commit b2c9e2fbba32 ("mm: make +alloc_contig_range work at pageblock granularity"), + +offline_pages + start_isolate_page_range + start_isolate_page_range(isolate_before=true) + isolate [isolate_start, isolate_start + pageblock_nr_pages) + start_isolate_page_range(isolate_before=false) + isolate [isolate_end - pageblock_nr_pages, isolate_end) pageblock + __alloc_contig_migrate_range + isolate_migratepages_range + isolate_migratepages_block + isolate_or_dissolve_huge_page + if (hstate_is_gigantic(h)) + return -ENOMEM; + +[ 15.815756] memory offlining [mem 0x3c0000000-0x3c7ffffff] failed due to failure to isolate range + +Gigantic PageHuge is bigger than a pageblock, but since it is freed as +order-0 pages, its pageblocks after being freed will get to the right +free list. There is no need to have special handling code for them in +start_isolate_page_range(). For both alloc_contig_range() and memory +offline cases, the migration code after start_isolate_page_range() will +be able to migrate gigantic PageHuge when possible. Let's clean up +start_isolate_page_range() and fix the aforementioned memory offline +failure issue all together. + +Let's clean up start_isolate_page_range() and fix the aforementioned +memory offline failure issue all together. 
+ +Link: https://lkml.kernel.org/r/20240820032630.1894770-1-wangkefeng.wang@huawei.com +Fixes: b2c9e2fbba32 ("mm: make alloc_contig_range work at pageblock granularity") +Signed-off-by: Kefeng Wang +Acked-by: David Hildenbrand +Acked-by: Zi Yan +Cc: Matthew Wilcox +Cc: Oscar Salvador +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/page_isolation.c | 28 +++------------------------- + 1 file changed, 3 insertions(+), 25 deletions(-) + +diff --git a/mm/page_isolation.c b/mm/page_isolation.c +index b3aae89ed226..cf7f1922fc3e 100644 +--- a/mm/page_isolation.c ++++ b/mm/page_isolation.c +@@ -398,30 +398,8 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, + unsigned long head_pfn = page_to_pfn(head); + unsigned long nr_pages = compound_nr(head); + +- if (head_pfn + nr_pages <= boundary_pfn) { +- pfn = head_pfn + nr_pages; +- continue; +- } +- +-#if defined CONFIG_COMPACTION || defined CONFIG_CMA +- if (PageHuge(page)) { +- int page_mt = get_pageblock_migratetype(page); +- struct compact_control cc = { +- .nr_migratepages = 0, +- .order = -1, +- .zone = page_zone(pfn_to_page(head_pfn)), +- .mode = MIGRATE_SYNC, +- .ignore_skip_hint = true, +- .no_set_skip_hint = true, +- .gfp_mask = gfp_flags, +- .alloc_contig = true, +- }; +- INIT_LIST_HEAD(&cc.migratepages); +- +- ret = __alloc_contig_migrate_range(&cc, head_pfn, +- head_pfn + nr_pages, page_mt); +- if (ret) +- goto failed; ++ if (head_pfn + nr_pages <= boundary_pfn || ++ PageHuge(page)) { + pfn = head_pfn + nr_pages; + continue; + } +@@ -435,7 +413,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, + */ + VM_WARN_ON_ONCE_PAGE(PageLRU(page), page); + VM_WARN_ON_ONCE_PAGE(__PageMovable(page), page); +-#endif ++ + goto failed; + } + +-- +Gitee + + +From ad41788f98eca42a9da702565742413372a3e8e4 Mon Sep 17 00:00:00 2001 +From: Huan Yang +Date: Wed, 18 Dec 2024 17:35:02 +0800 +Subject: [PATCH 19/19] mm: page_alloc: simpify page del and expand + +mainline inclusion +from mainline-v6.12-rc1 +commit 94deaf69dcd33462c61fa8cabb0883e3085a1046 +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC4T0 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=94deaf69dcd33462c61fa8cabb0883e3085a1046 + +-------------------------------- + +When page del from buddy and need expand, it will account free_pages in +zone's migratetype. + +The current way is to subtract the page number of the current order when +deleting, and then add it back when expanding. + +This is unnecessary, as when migrating the same type, we can directly +record the difference between the high-order pages and the expand added, +and then subtract it directly. + +This patch merge that, only when del and expand done, then account +free_pages. 
+ +Link: https://lkml.kernel.org/r/20240826064048.187790-1-link@vivo.com +Signed-off-by: Huan Yang +Reviewed-by: Vlastimil Babka +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/page_alloc.c | 35 +++++++++++++++++++++++++---------- + 1 file changed, 25 insertions(+), 10 deletions(-) + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index e786cdd98bea..7734245d7870 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -1327,11 +1327,11 @@ struct page *__pageblock_pfn_to_page(unsigned long start_pfn, + * + * -- nyc + */ +-static inline void expand(struct zone *zone, struct page *page, +- int low, int high, int migratetype) ++static inline unsigned int expand(struct zone *zone, struct page *page, int low, ++ int high, int migratetype) + { +- unsigned long size = 1 << high; +- unsigned long nr_added = 0; ++ unsigned int size = 1 << high; ++ unsigned int nr_added = 0; + + while (high > low) { + high--; +@@ -1351,7 +1351,19 @@ static inline void expand(struct zone *zone, struct page *page, + set_buddy_order(&page[size], high); + nr_added += size; + } +- account_freepages(zone, nr_added, migratetype); ++ ++ return nr_added; ++} ++ ++static __always_inline void page_del_and_expand(struct zone *zone, ++ struct page *page, int low, ++ int high, int migratetype) ++{ ++ int nr_pages = 1 << high; ++ ++ __del_page_from_free_list(page, zone, high, migratetype); ++ nr_pages -= expand(zone, page, low, high, migratetype); ++ account_freepages(zone, -nr_pages, migratetype); + } + + static void check_new_page_bad(struct page *page) +@@ -1540,8 +1552,9 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, + page = get_page_from_free_area(area, migratetype); + if (!page) + continue; +- del_page_from_free_list(page, zone, current_order, migratetype); +- expand(zone, page, order, current_order, migratetype); ++ ++ page_del_and_expand(zone, page, order, current_order, ++ migratetype); + trace_mm_page_alloc_zone_locked(page, order, migratetype, + pcp_allowed_order(order) && + migratetype < MIGRATE_PCPTYPES); +@@ -1892,9 +1905,12 @@ steal_suitable_fallback(struct zone *zone, struct page *page, + + /* Take ownership for orders >= pageblock_order */ + if (current_order >= pageblock_order) { ++ unsigned int nr_added; ++ + del_page_from_free_list(page, zone, current_order, block_type); + change_pageblock_range(page, current_order, start_type); +- expand(zone, page, order, current_order, start_type); ++ nr_added = expand(zone, page, order, current_order, start_type); ++ account_freepages(zone, nr_added, start_type); + return page; + } + +@@ -1947,8 +1963,7 @@ steal_suitable_fallback(struct zone *zone, struct page *page, + } + + single_page: +- del_page_from_free_list(page, zone, current_order, block_type); +- expand(zone, page, order, current_order, block_type); ++ page_del_and_expand(zone, page, order, current_order, block_type); + return page; + } + +-- +Gitee + diff --git a/0026-14227.patch b/0026-14227.patch new file mode 100644 index 0000000000000000000000000000000000000000..4caa7397c454f7d8530d44b46573f1221ef106d8 --- /dev/null +++ b/0026-14227.patch @@ -0,0 +1,3464 @@ +From 3c8ff7deba8ed905fb4c3d05ccccdecb6000b7d4 Mon Sep 17 00:00:00 2001 +From: Chengming Zhou +Date: Wed, 18 Dec 2024 17:51:06 +0800 +Subject: [PATCH 01/14] mm/zswap: invalidate zswap entry when swap entry free + +mainline inclusion +from mainline-v6.9-rc1 +commit 0827a1fb143fae588cb6f5b9a97c405d6c2ddec9 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0827a1fb143fae588cb6f5b9a97c405d6c2ddec9 + +-------------------------------- + +During testing I found there are some times the zswap_writeback_entry() +return -ENOMEM, which is not we expected: + +bpftrace -e 'kr:zswap_writeback_entry {@[(int32)retval]=count()}' +@[-12]: 1563 +@[0]: 277221 + +The reason is that __read_swap_cache_async() return NULL because +swapcache_prepare() failed. The reason is that we won't invalidate zswap +entry when swap entry freed to the per-cpu pool, these zswap entries are +still on the zswap tree and lru list. + +This patch moves the invalidation ahead to when swap entry freed to the +per-cpu pool, since there is no any benefit to leave trashy zswap entry on +the tree and lru list. + +With this patch: +bpftrace -e 'kr:zswap_writeback_entry {@[(int32)retval]=count()}' +@[0]: 259744 + +Note: large folio can't have zswap entry for now, so don't bother +to add zswap entry invalidation in the large folio swap free path. + +Link: https://lkml.kernel.org/r/20240201-b4-zswap-invalidate-entry-v2-2-99d4084260a0@bytedance.com +Signed-off-by: Chengming Zhou +Reviewed-by: Nhat Pham +Acked-by: Johannes Weiner +Acked-by: Yosry Ahmed +Signed-off-by: Andrew Morton +Conflicts: + include/linux/zswap.h + mm/zswap.c +[ Context conflict. ] +Signed-off-by: Liu Shixin +--- + include/linux/zswap.h | 4 ++-- + mm/swap_slots.c | 4 ++++ + mm/swapfile.c | 1 - + mm/zswap.c | 5 +++-- + 4 files changed, 9 insertions(+), 5 deletions(-) + +diff --git a/include/linux/zswap.h b/include/linux/zswap.h +index 2a60ce39cfde..a13d2d2d9131 100644 +--- a/include/linux/zswap.h ++++ b/include/linux/zswap.h +@@ -12,7 +12,7 @@ extern atomic_t zswap_stored_pages; + + bool zswap_store(struct folio *folio); + bool zswap_load(struct folio *folio); +-void zswap_invalidate(int type, pgoff_t offset); ++void zswap_invalidate(swp_entry_t swp); + void zswap_swapon(int type); + void zswap_swapoff(int type); + +@@ -28,7 +28,7 @@ static inline bool zswap_load(struct folio *folio) + return false; + } + +-static inline void zswap_invalidate(int type, pgoff_t offset) {} ++static inline void zswap_invalidate(swp_entry_t swp) {} + static inline void zswap_swapon(int type) {} + static inline void zswap_swapoff(int type) {} + +diff --git a/mm/swap_slots.c b/mm/swap_slots.c +index 7af3b93d4c8c..5579eed7065f 100644 +--- a/mm/swap_slots.c ++++ b/mm/swap_slots.c +@@ -34,6 +34,7 @@ + #include + #include + #include ++#include + + static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots); + #ifdef CONFIG_MEMCG_SWAP_QOS +@@ -394,6 +395,9 @@ void free_swap_slot(swp_entry_t entry) + { + struct swap_slots_cache *cache; + ++ /* Large folio swap slot is not covered. 
*/ ++ zswap_invalidate(entry); ++ + cache = raw_cpu_ptr(&swp_slots); + if (likely(use_swap_slot_cache && cache->slots_ret)) { + spin_lock_irq(&cache->free_lock); +diff --git a/mm/swapfile.c b/mm/swapfile.c +index 3af5b6ebb241..30832b85d6c2 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -765,7 +765,6 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, + swap_slot_free_notify = NULL; + while (offset <= end) { + arch_swap_invalidate_page(si->type, offset); +- zswap_invalidate(si->type, offset); + if (swap_slot_free_notify) + swap_slot_free_notify(si->bdev, offset); + offset++; +diff --git a/mm/zswap.c b/mm/zswap.c +index 69681b9173fd..5acda5b906bc 100644 +--- a/mm/zswap.c ++++ b/mm/zswap.c +@@ -1482,9 +1482,10 @@ bool zswap_load(struct folio *folio) + return ret; + } + +-void zswap_invalidate(int type, pgoff_t offset) ++void zswap_invalidate(swp_entry_t swp) + { +- struct zswap_tree *tree = zswap_trees[type]; ++ pgoff_t offset = swp_offset(swp); ++ struct zswap_tree *tree = zswap_trees[swp_type(swp)]; + struct zswap_entry *entry; + + /* find */ +-- +Gitee + + +From e2f02eacab254e29bd451782950ac6a03de685bd Mon Sep 17 00:00:00 2001 +From: Chris Li +Date: Wed, 18 Dec 2024 17:51:07 +0800 +Subject: [PATCH 02/14] mm: swap: swap cluster switch to double link list +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +mainline inclusion +from mainline-v6.12-rc1 +commit 73ed0baae66df50359c876f65f41179d6ebd2716 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=73ed0baae66df50359c876f65f41179d6ebd2716 + +-------------------------------- + +Patch series "mm: swap: mTHP swap allocator base on swap cluster order", +v5. + +This is the short term solutions "swap cluster order" listed in my "Swap +Abstraction" discussion slice 8 in the recent LSF/MM conference. + +When commit 845982eb264bc "mm: swap: allow storage of all mTHP orders" is +introduced, it only allocates the mTHP swap entries from the new empty +cluster list.  It has a fragmentation issue reported by Barry. + +https://lore.kernel.org/all/CAGsJ_4zAcJkuW016Cfi6wicRr8N9X+GJJhgMQdSMp+Ah+NSgNQ@mail.gmail.com/ + +The reason is that all the empty clusters have been exhausted while there +are plenty of free swap entries in the cluster that are not 100% free. + +Remember the swap allocation order in the cluster. Keep track of the per +order non full cluster list for later allocation. + +This series gives the swap SSD allocation a new separate code path from +the HDD allocation. The new allocator use cluster list only and do not +global scan swap_map[] without lock any more. + +This streamline the swap allocation for SSD. The code matches the +execution flow much better. + +User impact: For users that allocate and free mix order mTHP swapping, It +greatly improves the success rate of the mTHP swap allocation after the +initial phase. + +It also performs faster when the swapfile is close to full, because the +allocator can get the non full cluster from a list rather than scanning a +lot of swap_map entries.  + +With Barry's mthp test program V2: + +Without: +$ ./thp_swap_allocator_test -a +Iteration 1: swpout inc: 32, swpout fallback inc: 192, Fallback percentage: 85.71% +Iteration 2: swpout inc: 0, swpout fallback inc: 231, Fallback percentage: 100.00% +Iteration 3: swpout inc: 0, swpout fallback inc: 227, Fallback percentage: 100.00% +... 
+Iteration 98: swpout inc: 0, swpout fallback inc: 224, Fallback percentage: 100.00% +Iteration 99: swpout inc: 0, swpout fallback inc: 215, Fallback percentage: 100.00% +Iteration 100: swpout inc: 0, swpout fallback inc: 222, Fallback percentage: 100.00% + +$ ./thp_swap_allocator_test -a -s +Iteration 1: swpout inc: 0, swpout fallback inc: 224, Fallback percentage: 100.00% +Iteration 2: swpout inc: 0, swpout fallback inc: 218, Fallback percentage: 100.00% +Iteration 3: swpout inc: 0, swpout fallback inc: 222, Fallback percentage: 100.00% +.. +Iteration 98: swpout inc: 0, swpout fallback inc: 228, Fallback percentage: 100.00% +Iteration 99: swpout inc: 0, swpout fallback inc: 230, Fallback percentage: 100.00% +Iteration 100: swpout inc: 0, swpout fallback inc: 229, Fallback percentage: 100.00% + +$ ./thp_swap_allocator_test -s +Iteration 1: swpout inc: 0, swpout fallback inc: 224, Fallback percentage: 100.00% +Iteration 2: swpout inc: 0, swpout fallback inc: 218, Fallback percentage: 100.00% +Iteration 3: swpout inc: 0, swpout fallback inc: 222, Fallback percentage: 100.00% +.. +Iteration 98: swpout inc: 0, swpout fallback inc: 228, Fallback percentage: 100.00% +Iteration 99: swpout inc: 0, swpout fallback inc: 230, Fallback percentage: 100.00% +Iteration 100: swpout inc: 0, swpout fallback inc: 229, Fallback percentage: 100.00% + +$ ./thp_swap_allocator_test +Iteration 1: swpout inc: 0, swpout fallback inc: 224, Fallback percentage: 100.00% +Iteration 2: swpout inc: 0, swpout fallback inc: 218, Fallback percentage: 100.00% +Iteration 3: swpout inc: 0, swpout fallback inc: 222, Fallback percentage: 100.00% +.. +Iteration 98: swpout inc: 0, swpout fallback inc: 228, Fallback percentage: 100.00% +Iteration 99: swpout inc: 0, swpout fallback inc: 230, Fallback percentage: 100.00% +Iteration 100: swpout inc: 0, swpout fallback inc: 229, Fallback percentage: 100.00% + +With: # with all 0.00% filter out +$ ./thp_swap_allocator_test -a | grep -v "0.00%" +$ # all result are 0.00% + +$ ./thp_swap_allocator_test -a -s | grep -v "0.00%" +./thp_swap_allocator_test -a -s | grep -v "0.00%" +Iteration 14: swpout inc: 223, swpout fallback inc: 3, Fallback percentage: 1.33% +Iteration 19: swpout inc: 219, swpout fallback inc: 7, Fallback percentage: 3.10% +Iteration 28: swpout inc: 225, swpout fallback inc: 1, Fallback percentage: 0.44% +Iteration 29: swpout inc: 227, swpout fallback inc: 1, Fallback percentage: 0.44% +Iteration 34: swpout inc: 220, swpout fallback inc: 8, Fallback percentage: 3.51% +Iteration 35: swpout inc: 222, swpout fallback inc: 11, Fallback percentage: 4.72% +Iteration 38: swpout inc: 217, swpout fallback inc: 4, Fallback percentage: 1.81% +Iteration 40: swpout inc: 222, swpout fallback inc: 6, Fallback percentage: 2.63% +Iteration 42: swpout inc: 221, swpout fallback inc: 2, Fallback percentage: 0.90% +Iteration 43: swpout inc: 215, swpout fallback inc: 7, Fallback percentage: 3.15% +Iteration 47: swpout inc: 226, swpout fallback inc: 2, Fallback percentage: 0.88% +Iteration 49: swpout inc: 217, swpout fallback inc: 1, Fallback percentage: 0.46% +Iteration 52: swpout inc: 221, swpout fallback inc: 8, Fallback percentage: 3.49% +Iteration 56: swpout inc: 224, swpout fallback inc: 4, Fallback percentage: 1.75% +Iteration 58: swpout inc: 214, swpout fallback inc: 5, Fallback percentage: 2.28% +Iteration 62: swpout inc: 220, swpout fallback inc: 3, Fallback percentage: 1.35% +Iteration 64: swpout inc: 224, swpout fallback inc: 1, Fallback percentage: 0.44% +Iteration 67: swpout inc: 
221, swpout fallback inc: 1, Fallback percentage: 0.45% +Iteration 75: swpout inc: 220, swpout fallback inc: 9, Fallback percentage: 3.93% +Iteration 82: swpout inc: 227, swpout fallback inc: 1, Fallback percentage: 0.44% +Iteration 86: swpout inc: 211, swpout fallback inc: 12, Fallback percentage: 5.38% +Iteration 89: swpout inc: 226, swpout fallback inc: 2, Fallback percentage: 0.88% +Iteration 93: swpout inc: 220, swpout fallback inc: 1, Fallback percentage: 0.45% +Iteration 94: swpout inc: 224, swpout fallback inc: 1, Fallback percentage: 0.44% +Iteration 96: swpout inc: 221, swpout fallback inc: 6, Fallback percentage: 2.64% +Iteration 98: swpout inc: 227, swpout fallback inc: 1, Fallback percentage: 0.44% +Iteration 99: swpout inc: 227, swpout fallback inc: 3, Fallback percentage: 1.30% + +$ ./thp_swap_allocator_test +./thp_swap_allocator_test +Iteration 1: swpout inc: 233, swpout fallback inc: 0, Fallback percentage: 0.00% +Iteration 2: swpout inc: 131, swpout fallback inc: 101, Fallback percentage: 43.53% +Iteration 3: swpout inc: 71, swpout fallback inc: 155, Fallback percentage: 68.58% +Iteration 4: swpout inc: 55, swpout fallback inc: 168, Fallback percentage: 75.34% +Iteration 5: swpout inc: 35, swpout fallback inc: 191, Fallback percentage: 84.51% +Iteration 6: swpout inc: 25, swpout fallback inc: 199, Fallback percentage: 88.84% +Iteration 7: swpout inc: 23, swpout fallback inc: 205, Fallback percentage: 89.91% +Iteration 8: swpout inc: 9, swpout fallback inc: 219, Fallback percentage: 96.05% +Iteration 9: swpout inc: 13, swpout fallback inc: 213, Fallback percentage: 94.25% +Iteration 10: swpout inc: 12, swpout fallback inc: 216, Fallback percentage: 94.74% +Iteration 11: swpout inc: 16, swpout fallback inc: 213, Fallback percentage: 93.01% +Iteration 12: swpout inc: 10, swpout fallback inc: 210, Fallback percentage: 95.45% +Iteration 13: swpout inc: 16, swpout fallback inc: 212, Fallback percentage: 92.98% +Iteration 14: swpout inc: 12, swpout fallback inc: 212, Fallback percentage: 94.64% +Iteration 15: swpout inc: 15, swpout fallback inc: 211, Fallback percentage: 93.36% +Iteration 16: swpout inc: 15, swpout fallback inc: 200, Fallback percentage: 93.02% +Iteration 17: swpout inc: 9, swpout fallback inc: 220, Fallback percentage: 96.07% + +$ ./thp_swap_allocator_test -s + ./thp_swap_allocator_test -s +Iteration 1: swpout inc: 233, swpout fallback inc: 0, Fallback percentage: 0.00% +Iteration 2: swpout inc: 97, swpout fallback inc: 135, Fallback percentage: 58.19% +Iteration 3: swpout inc: 42, swpout fallback inc: 192, Fallback percentage: 82.05% +Iteration 4: swpout inc: 19, swpout fallback inc: 214, Fallback percentage: 91.85% +Iteration 5: swpout inc: 12, swpout fallback inc: 213, Fallback percentage: 94.67% +Iteration 6: swpout inc: 11, swpout fallback inc: 217, Fallback percentage: 95.18% +Iteration 7: swpout inc: 9, swpout fallback inc: 214, Fallback percentage: 95.96% +Iteration 8: swpout inc: 8, swpout fallback inc: 213, Fallback percentage: 96.38% +Iteration 9: swpout inc: 2, swpout fallback inc: 223, Fallback percentage: 99.11% +Iteration 10: swpout inc: 2, swpout fallback inc: 228, Fallback percentage: 99.13% +Iteration 11: swpout inc: 4, swpout fallback inc: 214, Fallback percentage: 98.17% +Iteration 12: swpout inc: 5, swpout fallback inc: 226, Fallback percentage: 97.84% +Iteration 13: swpout inc: 3, swpout fallback inc: 212, Fallback percentage: 98.60% +Iteration 14: swpout inc: 0, swpout fallback inc: 222, Fallback percentage: 100.00% +Iteration 15: swpout 
inc: 3, swpout fallback inc: 222, Fallback percentage: 98.67% +Iteration 16: swpout inc: 4, swpout fallback inc: 223, Fallback percentage: 98.24% + +========= +Kernel compile under tmpfs with cgroup memory.max = 470M. +12 core 24 hyperthreading, 32 jobs. 10 Run each group + +SSD swap 10 runs average, 20G swap partition: +With: +user 2929.064 +system 1479.381 : 1376.89 1398.22 1444.64 1477.39 1479.04 1497.27 +1504.47 1531.4 1532.92 1551.57 +real 1441.324 + +Without: +user 2910.872 +system 1482.732 : 1440.01 1451.4 1462.01 1467.47 1467.51 1469.3 +1470.19 1496.32 1544.1 1559.01 +real 1580.822 + +Two zram swap: zram0 3.0G zram1 20G. + +The idea is forcing the zram0 almost full then overflow to zram1: + +With: +user 4320.301 +system 4272.403 : 4236.24 4262.81 4264.75 4269.13 4269.44 4273.06 +4279.85 4285.98 4289.64 4293.13 +real 431.759 + +Without +user 4301.393 +system 4387.672 : 4374.47 4378.3 4380.95 4382.84 4383.06 4388.05 +4389.76 4397.16 4398.23 4403.9 +real 433.979 + +------ more test result from Kaiui ---------- + +Test with build linux kernel using a 4G ZRAM, 1G memory.max limit on top of shmem: + +System info: 32 Core AMD Zen2, 64G total memory. + +Test 3 times using only 4K pages: +================================= + +With: +----- +1838.74user 2411.21system 2:37.86elapsed 2692%CPU (0avgtext+0avgdata 847060maxresident)k +1839.86user 2465.77system 2:39.35elapsed 2701%CPU (0avgtext+0avgdata 847060maxresident)k +1840.26user 2454.68system 2:39.43elapsed 2693%CPU (0avgtext+0avgdata 847060maxresident)k + +Summary (~4.6% improment of system time): +User: 1839.62 +System: 2443.89: 2465.77 2454.68 2411.21 +Real: 158.88 + +Without: +-------- +1837.99user 2575.95system 2:43.09elapsed 2706%CPU (0avgtext+0avgdata 846520maxresident)k +1838.32user 2555.15system 2:42.52elapsed 2709%CPU (0avgtext+0avgdata 846520maxresident)k +1843.02user 2561.55system 2:43.35elapsed 2702%CPU (0avgtext+0avgdata 846520maxresident)k + +Summary: +User: 1839.78 +System: 2564.22: 2575.95 2555.15 2561.55 +Real: 162.99 + +Test 5 times using enabled all mTHP pages: +========================================== + +With: +----- +1796.44user 2937.33system 2:59.09elapsed 2643%CPU (0avgtext+0avgdata 846936maxresident)k +1802.55user 3002.32system 2:54.68elapsed 2750%CPU (0avgtext+0avgdata 847072maxresident)k +1806.59user 2986.53system 2:55.17elapsed 2736%CPU (0avgtext+0avgdata 847092maxresident)k +1803.27user 2982.40system 2:54.49elapsed 2742%CPU (0avgtext+0avgdata 846796maxresident)k +1807.43user 3036.08system 2:56.06elapsed 2751%CPU (0avgtext+0avgdata 846488maxresident)k + +Summary (~8.4% improvement of system time): +User: 1803.25 +System: 2988.93: 2937.33 3002.32 2986.53 2982.40 3036.08 +Real: 175.90 + +mTHP swapout status: +/sys/kernel/mm/transparent_hugepage/hugepages-32kB/stats/swpout:347721 +/sys/kernel/mm/transparent_hugepage/hugepages-32kB/stats/swpout_fallback:3110 +/sys/kernel/mm/transparent_hugepage/hugepages-512kB/stats/swpout:3365 +/sys/kernel/mm/transparent_hugepage/hugepages-512kB/stats/swpout_fallback:8269 +/sys/kernel/mm/transparent_hugepage/hugepages-2048kB/stats/swpout:24 +/sys/kernel/mm/transparent_hugepage/hugepages-2048kB/stats/swpout_fallback:3341 +/sys/kernel/mm/transparent_hugepage/hugepages-1024kB/stats/swpout:145 +/sys/kernel/mm/transparent_hugepage/hugepages-1024kB/stats/swpout_fallback:5038 +/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout:322737 +/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout_fallback:36808 
+/sys/kernel/mm/transparent_hugepage/hugepages-16kB/stats/swpout:380455 +/sys/kernel/mm/transparent_hugepage/hugepages-16kB/stats/swpout_fallback:1010 +/sys/kernel/mm/transparent_hugepage/hugepages-256kB/stats/swpout:24973 +/sys/kernel/mm/transparent_hugepage/hugepages-256kB/stats/swpout_fallback:13223 +/sys/kernel/mm/transparent_hugepage/hugepages-128kB/stats/swpout:197348 +/sys/kernel/mm/transparent_hugepage/hugepages-128kB/stats/swpout_fallback:80541 + +Without: +-------- +1794.41user 3151.29system 3:05.97elapsed 2659%CPU (0avgtext+0avgdata 846704maxresident)k +1810.27user 3304.48system 3:05.38elapsed 2759%CPU (0avgtext+0avgdata 846636maxresident)k +1809.84user 3254.85system 3:03.83elapsed 2755%CPU (0avgtext+0avgdata 846952maxresident)k +1813.54user 3259.56system 3:04.28elapsed 2752%CPU (0avgtext+0avgdata 846848maxresident)k +1829.97user 3338.40system 3:07.32elapsed 2759%CPU (0avgtext+0avgdata 847024maxresident)k + +Summary: +User: 1811.61 +System: 3261.72 : 3151.29 3304.48 3254.85 3259.56 3338.40 +Real: 185.356 + +mTHP swapout status: +hugepages-32kB/stats/swpout:35630 +hugepages-32kB/stats/swpout_fallback:1809908 +hugepages-512kB/stats/swpout:523 +hugepages-512kB/stats/swpout_fallback:55235 +hugepages-2048kB/stats/swpout:53 +hugepages-2048kB/stats/swpout_fallback:17264 +hugepages-1024kB/stats/swpout:85 +hugepages-1024kB/stats/swpout_fallback:24979 +hugepages-64kB/stats/swpout:30117 +hugepages-64kB/stats/swpout_fallback:1825399 +hugepages-16kB/stats/swpout:42775 +hugepages-16kB/stats/swpout_fallback:1951123 +hugepages-256kB/stats/swpout:2326 +hugepages-256kB/stats/swpout_fallback:170165 +hugepages-128kB/stats/swpout:17925 +hugepages-128kB/stats/swpout_fallback:1309757 + +This patch (of 9): + +Previously, the swap cluster used a cluster index as a pointer to +construct a custom single link list type "swap_cluster_list". The next +cluster pointer is shared with the cluster->count. It prevents puting the +non free cluster into a list. + +Change the cluster to use the standard double link list instead. This +allows tracing the nonfull cluster in the follow up patch. That way, it +is faster to get to the nonfull cluster of that order. + +Remove the cluster getter/setter for accessing the cluster struct member. + +The list operation is protected by the swap_info_struct->lock. + +Change cluster code to use "struct swap_cluster_info *" to reference the +cluster rather than by using index. That is more consistent with the list +manipulation. It avoids the repeat adding index to the cluser_info. The +code is easier to understand. + +Remove the cluster next pointer is NULL flag, the double link list can +handle the empty list pretty well. + +The "swap_cluster_info" struct is two pointer bigger, because 512 swap +entries share one swap_cluster_info struct, it has very little impact on +the average memory usage per swap entry. For 1TB swapfile, the swap +cluster data structure increases from 8MB to 24MB. + +Other than the list conversion, there is no real function change in this +patch. 
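+
+As an editorial illustration of the conversion (not part of the upstream
+patch), the open-coded index-list operations become plain list_head calls,
+still done under swap_info_struct->lock:
+
+    /* old: cluster_list_add_tail(&si->free_clusters, si->cluster_info, idx); */
+    list_add_tail(&ci->list, &si->free_clusters);
+
+    /* old: idx = cluster_list_del_first(&si->free_clusters, si->cluster_info); */
+    ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list);
+    list_del(&ci->list);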
+ +Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-0-cb9c148b9297@kernel.org +Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-1-cb9c148b9297@kernel.org +Signed-off-by: Chris Li +Reported-by: Barry Song <21cnbao@gmail.com> +Reviewed-by: "Huang, Ying" +Cc: Hugh Dickins +Cc: Kairui Song +Cc: Kalesh Singh +Cc: Ryan Roberts +Signed-off-by: Andrew Morton +Conflicts: + include/linux/swap.h +Signed-off-by: Liu Shixin +--- + include/linux/swap.h | 25 ++--- + mm/swapfile.c | 226 ++++++++++++------------------------------- + 2 files changed, 71 insertions(+), 180 deletions(-) + +diff --git a/include/linux/swap.h b/include/linux/swap.h +index bea0c0f1f640..94e1b6bb04ce 100644 +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -255,22 +255,20 @@ enum { + * free clusters are organized into a list. We fetch an entry from the list to + * get a free cluster. + * +- * The data field stores next cluster if the cluster is free or cluster usage +- * counter otherwise. The flags field determines if a cluster is free. This is +- * protected by swap_info_struct.lock. ++ * The flags field determines if a cluster is free. This is ++ * protected by cluster lock. + */ + struct swap_cluster_info { + spinlock_t lock; /* + * Protect swap_cluster_info fields +- * and swap_info_struct->swap_map +- * elements correspond to the swap +- * cluster ++ * other than list, and swap_info_struct->swap_map ++ * elements corresponding to the swap cluster. + */ +- unsigned int data:24; +- unsigned int flags:8; ++ u16 count; ++ u8 flags; ++ struct list_head list; + }; + #define CLUSTER_FLAG_FREE 1 /* This cluster is free */ +-#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */ + + /* + * The first page in the swap file is the swap header, which is always marked +@@ -295,11 +293,6 @@ struct percpu_cluster { + unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */ + }; + +-struct swap_cluster_list { +- struct swap_cluster_info head; +- struct swap_cluster_info tail; +-}; +- + /* + * The in-memory structure used to track swap areas. + */ +@@ -312,7 +305,7 @@ struct swap_info_struct { + unsigned int max; /* extent of the swap_map */ + unsigned char *swap_map; /* vmalloc'ed array of usage counts */ + struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ +- struct swap_cluster_list free_clusters; /* free clusters list */ ++ struct list_head free_clusters; /* free clusters list */ + unsigned int lowest_bit; /* index of first free in swap_map */ + unsigned int highest_bit; /* index of last free in swap_map */ + unsigned int pages; /* total of usable pages of swap */ +@@ -345,7 +338,7 @@ struct swap_info_struct { + * list. 
+ */ + struct work_struct discard_work; /* discard worker */ +- struct swap_cluster_list discard_clusters; /* discard clusters list */ ++ struct list_head discard_clusters; /* discard clusters list */ + KABI_RESERVE(1) + KABI_RESERVE(2) + KABI_RESERVE(3) +diff --git a/mm/swapfile.c b/mm/swapfile.c +index 30832b85d6c2..76b344438606 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -289,62 +289,15 @@ static void discard_swap_cluster(struct swap_info_struct *si, + #endif + #define LATENCY_LIMIT 256 + +-static inline void cluster_set_flag(struct swap_cluster_info *info, +- unsigned int flag) +-{ +- info->flags = flag; +-} +- +-static inline unsigned int cluster_count(struct swap_cluster_info *info) +-{ +- return info->data; +-} +- +-static inline void cluster_set_count(struct swap_cluster_info *info, +- unsigned int c) +-{ +- info->data = c; +-} +- +-static inline void cluster_set_count_flag(struct swap_cluster_info *info, +- unsigned int c, unsigned int f) +-{ +- info->flags = f; +- info->data = c; +-} +- +-static inline unsigned int cluster_next(struct swap_cluster_info *info) +-{ +- return info->data; +-} +- +-static inline void cluster_set_next(struct swap_cluster_info *info, +- unsigned int n) +-{ +- info->data = n; +-} +- +-static inline void cluster_set_next_flag(struct swap_cluster_info *info, +- unsigned int n, unsigned int f) +-{ +- info->flags = f; +- info->data = n; +-} +- + static inline bool cluster_is_free(struct swap_cluster_info *info) + { + return info->flags & CLUSTER_FLAG_FREE; + } + +-static inline bool cluster_is_null(struct swap_cluster_info *info) +-{ +- return info->flags & CLUSTER_FLAG_NEXT_NULL; +-} +- +-static inline void cluster_set_null(struct swap_cluster_info *info) ++static inline unsigned int cluster_index(struct swap_info_struct *si, ++ struct swap_cluster_info *ci) + { +- info->flags = CLUSTER_FLAG_NEXT_NULL; +- info->data = 0; ++ return ci - si->cluster_info; + } + + static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, +@@ -393,65 +346,11 @@ static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si, + spin_unlock(&si->lock); + } + +-static inline bool cluster_list_empty(struct swap_cluster_list *list) +-{ +- return cluster_is_null(&list->head); +-} +- +-static inline unsigned int cluster_list_first(struct swap_cluster_list *list) +-{ +- return cluster_next(&list->head); +-} +- +-static void cluster_list_init(struct swap_cluster_list *list) +-{ +- cluster_set_null(&list->head); +- cluster_set_null(&list->tail); +-} +- +-static void cluster_list_add_tail(struct swap_cluster_list *list, +- struct swap_cluster_info *ci, +- unsigned int idx) +-{ +- if (cluster_list_empty(list)) { +- cluster_set_next_flag(&list->head, idx, 0); +- cluster_set_next_flag(&list->tail, idx, 0); +- } else { +- struct swap_cluster_info *ci_tail; +- unsigned int tail = cluster_next(&list->tail); +- +- /* +- * Nested cluster lock, but both cluster locks are +- * only acquired when we held swap_info_struct->lock +- */ +- ci_tail = ci + tail; +- spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING); +- cluster_set_next(ci_tail, idx); +- spin_unlock(&ci_tail->lock); +- cluster_set_next_flag(&list->tail, idx, 0); +- } +-} +- +-static unsigned int cluster_list_del_first(struct swap_cluster_list *list, +- struct swap_cluster_info *ci) +-{ +- unsigned int idx; +- +- idx = cluster_next(&list->head); +- if (cluster_next(&list->tail) == idx) { +- cluster_set_null(&list->head); +- cluster_set_null(&list->tail); +- } else +- 
cluster_set_next_flag(&list->head, +- cluster_next(&ci[idx]), 0); +- +- return idx; +-} +- + /* Add a cluster to discard list and schedule it to do discard */ + static void swap_cluster_schedule_discard(struct swap_info_struct *si, +- unsigned int idx) ++ struct swap_cluster_info *ci) + { ++ unsigned int idx = cluster_index(si, ci); + /* + * If scan_swap_map_slots() can't find a free cluster, it will check + * si->swap_map directly. To make sure the discarding cluster isn't +@@ -461,17 +360,14 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, + memset(si->swap_map + idx * SWAPFILE_CLUSTER, + SWAP_MAP_BAD, SWAPFILE_CLUSTER); + +- cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx); +- ++ list_add_tail(&ci->list, &si->discard_clusters); + schedule_work(&si->discard_work); + } + +-static void __free_cluster(struct swap_info_struct *si, unsigned long idx) ++static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) + { +- struct swap_cluster_info *ci = si->cluster_info; +- +- cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE); +- cluster_list_add_tail(&si->free_clusters, ci, idx); ++ ci->flags = CLUSTER_FLAG_FREE; ++ list_add_tail(&ci->list, &si->free_clusters); + } + + /* +@@ -480,24 +376,25 @@ static void __free_cluster(struct swap_info_struct *si, unsigned long idx) + */ + static void swap_do_scheduled_discard(struct swap_info_struct *si) + { +- struct swap_cluster_info *info, *ci; ++ struct swap_cluster_info *ci; + unsigned int idx; + +- info = si->cluster_info; +- +- while (!cluster_list_empty(&si->discard_clusters)) { +- idx = cluster_list_del_first(&si->discard_clusters, info); ++ while (!list_empty(&si->discard_clusters)) { ++ ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list); ++ list_del(&ci->list); ++ idx = cluster_index(si, ci); + spin_unlock(&si->lock); + + discard_swap_cluster(si, idx * SWAPFILE_CLUSTER, + SWAPFILE_CLUSTER); + + spin_lock(&si->lock); +- ci = lock_cluster(si, idx * SWAPFILE_CLUSTER); +- __free_cluster(si, idx); ++ ++ spin_lock(&ci->lock); ++ __free_cluster(si, ci); + memset(si->swap_map + idx * SWAPFILE_CLUSTER, + 0, SWAPFILE_CLUSTER); +- unlock_cluster(ci); ++ spin_unlock(&ci->lock); + } + } + +@@ -520,20 +417,21 @@ static void swap_users_ref_free(struct percpu_ref *ref) + complete(&si->comp); + } + +-static void alloc_cluster(struct swap_info_struct *si, unsigned long idx) ++static struct swap_cluster_info *alloc_cluster(struct swap_info_struct *si, unsigned long idx) + { +- struct swap_cluster_info *ci = si->cluster_info; ++ struct swap_cluster_info *ci = list_first_entry(&si->free_clusters, ++ struct swap_cluster_info, list); + +- VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx); +- cluster_list_del_first(&si->free_clusters, ci); +- cluster_set_count_flag(ci + idx, 0, 0); ++ VM_BUG_ON(cluster_index(si, ci) != idx); ++ list_del(&ci->list); ++ ci->count = 0; ++ ci->flags = 0; ++ return ci; + } + +-static void free_cluster(struct swap_info_struct *si, unsigned long idx) ++static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) + { +- struct swap_cluster_info *ci = si->cluster_info + idx; +- +- VM_BUG_ON(cluster_count(ci) != 0); ++ VM_BUG_ON(ci->count != 0); + /* + * If the swap is discardable, prepare discard the cluster + * instead of free it immediately. 
The cluster will be freed +@@ -541,11 +439,11 @@ static void free_cluster(struct swap_info_struct *si, unsigned long idx) + */ + if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) == + (SWP_WRITEOK | SWP_PAGE_DISCARD)) { +- swap_cluster_schedule_discard(si, idx); ++ swap_cluster_schedule_discard(si, ci); + return; + } + +- __free_cluster(si, idx); ++ __free_cluster(si, ci); + } + + /* +@@ -558,15 +456,15 @@ static void add_cluster_info_page(struct swap_info_struct *p, + unsigned long count) + { + unsigned long idx = page_nr / SWAPFILE_CLUSTER; ++ struct swap_cluster_info *ci = cluster_info + idx; + + if (!cluster_info) + return; +- if (cluster_is_free(&cluster_info[idx])) ++ if (cluster_is_free(ci)) + alloc_cluster(p, idx); + +- VM_BUG_ON(cluster_count(&cluster_info[idx]) + count > SWAPFILE_CLUSTER); +- cluster_set_count(&cluster_info[idx], +- cluster_count(&cluster_info[idx]) + count); ++ VM_BUG_ON(ci->count + count > SWAPFILE_CLUSTER); ++ ci->count += count; + } + + /* +@@ -580,24 +478,20 @@ static void inc_cluster_info_page(struct swap_info_struct *p, + } + + /* +- * The cluster corresponding to page_nr decreases one usage. If the usage +- * counter becomes 0, which means no page in the cluster is in using, we can +- * optionally discard the cluster and add it to free cluster list. ++ * The cluster ci decreases one usage. If the usage counter becomes 0, ++ * which means no page in the cluster is in use, we can optionally discard ++ * the cluster and add it to free cluster list. + */ +-static void dec_cluster_info_page(struct swap_info_struct *p, +- struct swap_cluster_info *cluster_info, unsigned long page_nr) ++static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluster_info *ci) + { +- unsigned long idx = page_nr / SWAPFILE_CLUSTER; +- +- if (!cluster_info) ++ if (!p->cluster_info) + return; + +- VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0); +- cluster_set_count(&cluster_info[idx], +- cluster_count(&cluster_info[idx]) - 1); ++ VM_BUG_ON(ci->count == 0); ++ ci->count--; + +- if (cluster_count(&cluster_info[idx]) == 0) +- free_cluster(p, idx); ++ if (!ci->count) ++ free_cluster(p, ci); + } + + /* +@@ -610,10 +504,12 @@ scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, + { + struct percpu_cluster *percpu_cluster; + bool conflict; ++ struct swap_cluster_info *first = list_first_entry(&si->free_clusters, ++ struct swap_cluster_info, list); + + offset /= SWAPFILE_CLUSTER; +- conflict = !cluster_list_empty(&si->free_clusters) && +- offset != cluster_list_first(&si->free_clusters) && ++ conflict = !list_empty(&si->free_clusters) && ++ offset != cluster_index(si, first) && + cluster_is_free(&si->cluster_info[offset]); + + if (!conflict) +@@ -654,10 +550,10 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, + cluster = this_cpu_ptr(si->percpu_cluster); + tmp = cluster->next[order]; + if (tmp == SWAP_NEXT_INVALID) { +- if (!cluster_list_empty(&si->free_clusters)) { +- tmp = cluster_next(&si->free_clusters.head) * +- SWAPFILE_CLUSTER; +- } else if (!cluster_list_empty(&si->discard_clusters)) { ++ if (!list_empty(&si->free_clusters)) { ++ ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); ++ tmp = cluster_index(si, ci) * SWAPFILE_CLUSTER; ++ } else if (!list_empty(&si->discard_clusters)) { + /* + * we don't have free cluster but have some clusters in + * discarding, do discard now and reclaim them, then +@@ -1055,8 +951,9 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) + + ci = 
lock_cluster(si, offset); + memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); +- cluster_set_count_flag(ci, 0, 0); +- free_cluster(si, idx); ++ ci->count = 0; ++ ci->flags = 0; ++ free_cluster(si, ci); + unlock_cluster(ci); + swap_range_free(si, offset, SWAPFILE_CLUSTER); + } +@@ -1418,7 +1315,7 @@ static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry) + count = p->swap_map[offset]; + VM_BUG_ON(count != SWAP_HAS_CACHE); + p->swap_map[offset] = 0; +- dec_cluster_info_page(p, p->cluster_info, offset); ++ dec_cluster_info_page(p, ci); + unlock_cluster(ci); + + mem_cgroup_uncharge_swap(entry, 1); +@@ -3113,8 +3010,8 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, + + nr_good_pages = maxpages - 1; /* omit header page */ + +- cluster_list_init(&p->free_clusters); +- cluster_list_init(&p->discard_clusters); ++ INIT_LIST_HEAD(&p->free_clusters); ++ INIT_LIST_HEAD(&p->discard_clusters); + + for (i = 0; i < swap_header->info.nr_badpages; i++) { + unsigned int page_nr = swap_header->info.badpages[i]; +@@ -3165,14 +3062,15 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, + for (k = 0; k < SWAP_CLUSTER_COLS; k++) { + j = (k + col) % SWAP_CLUSTER_COLS; + for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) { ++ struct swap_cluster_info *ci; + idx = i * SWAP_CLUSTER_COLS + j; ++ ci = cluster_info + idx; + if (idx >= nr_clusters) + continue; +- if (cluster_count(&cluster_info[idx])) ++ if (ci->count) + continue; +- cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); +- cluster_list_add_tail(&p->free_clusters, cluster_info, +- idx); ++ ci->flags = CLUSTER_FLAG_FREE; ++ list_add_tail(&ci->list, &p->free_clusters); + } + } + return nr_extents; +-- +Gitee + + +From 3bc5a5e67c63e14fe1342ed16ecb304cf60d94b3 Mon Sep 17 00:00:00 2001 +From: Chris Li +Date: Wed, 18 Dec 2024 17:51:08 +0800 +Subject: [PATCH 03/14] mm: swap: mTHP allocate swap entries from nonfull list +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +mainline inclusion +from mainline-v6.12-rc1 +commit d07a46a4ac18786e7f4c98fb08525ed80dd1f642 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d07a46a4ac18786e7f4c98fb08525ed80dd1f642 + +-------------------------------- + +Track the nonfull cluster as well as the empty cluster on lists. Each +order has one nonfull cluster list. + +The cluster will remember which order it was used during new cluster +allocation. + +When the cluster has free entry, add to the nonfull[order] list.  When +the free cluster list is empty, also allocate from the nonempty list of +that order. + +This improves the mTHP swap allocation success rate. + +There are limitations if the distribution of numbers of different orders +of mTHP changes a lot. e.g. there are a lot of nonfull cluster assign to +order A while later time there are a lot of order B allocation while very +little allocation in order A. Currently the cluster used by order A will +not reused by order B unless the cluster is 100% empty. 
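+
+The resulting preference order for an order-N allocation looks roughly like
+this (editorial sketch of the scan_swap_map_try_ssd_cluster() hunk below,
+not part of the upstream patch):
+
+    if (cluster->next[order] != SWAP_NEXT_INVALID) {
+        /* keep filling the current per-CPU cluster */
+    } else if (!list_empty(&si->free_clusters)) {
+        /* take a fresh cluster and tag it with this order */
+    } else if (!list_empty(&si->nonfull_clusters[order])) {
+        /* reuse a partially free cluster of the same order */
+    } else if (!list_empty(&si->discard_clusters)) {
+        /* flush pending discards, then retry */
+    }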
+ +Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-2-cb9c148b9297@kernel.org +Signed-off-by: Chris Li +Reported-by: Barry Song <21cnbao@gmail.com> +Cc: "Huang, Ying" +Cc: Hugh Dickins +Cc: Kairui Song +Cc: Kalesh Singh +Cc: Ryan Roberts +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + include/linux/swap.h | 4 ++++ + mm/swapfile.c | 38 +++++++++++++++++++++++++++++++++++--- + 2 files changed, 39 insertions(+), 3 deletions(-) + +diff --git a/include/linux/swap.h b/include/linux/swap.h +index 94e1b6bb04ce..29a1daa46421 100644 +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -266,9 +266,11 @@ struct swap_cluster_info { + */ + u16 count; + u8 flags; ++ u8 order; + struct list_head list; + }; + #define CLUSTER_FLAG_FREE 1 /* This cluster is free */ ++#define CLUSTER_FLAG_NONFULL 2 /* This cluster is on nonfull list */ + + /* + * The first page in the swap file is the swap header, which is always marked +@@ -306,6 +308,8 @@ struct swap_info_struct { + unsigned char *swap_map; /* vmalloc'ed array of usage counts */ + struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ + struct list_head free_clusters; /* free clusters list */ ++ struct list_head nonfull_clusters[SWAP_NR_ORDERS]; ++ /* list of cluster that contains at least one free slot */ + unsigned int lowest_bit; /* index of first free in swap_map */ + unsigned int highest_bit; /* index of last free in swap_map */ + unsigned int pages; /* total of usable pages of swap */ +diff --git a/mm/swapfile.c b/mm/swapfile.c +index 76b344438606..adde6877c0fe 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -360,14 +360,22 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, + memset(si->swap_map + idx * SWAPFILE_CLUSTER, + SWAP_MAP_BAD, SWAPFILE_CLUSTER); + +- list_add_tail(&ci->list, &si->discard_clusters); ++ VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); ++ if (ci->flags & CLUSTER_FLAG_NONFULL) ++ list_move_tail(&ci->list, &si->discard_clusters); ++ else ++ list_add_tail(&ci->list, &si->discard_clusters); ++ ci->flags = 0; + schedule_work(&si->discard_work); + } + + static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) + { ++ if (ci->flags & CLUSTER_FLAG_NONFULL) ++ list_move_tail(&ci->list, &si->free_clusters); ++ else ++ list_add_tail(&ci->list, &si->free_clusters); + ci->flags = CLUSTER_FLAG_FREE; +- list_add_tail(&ci->list, &si->free_clusters); + } + + /* +@@ -490,8 +498,15 @@ static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluste + VM_BUG_ON(ci->count == 0); + ci->count--; + +- if (!ci->count) ++ if (!ci->count) { + free_cluster(p, ci); ++ return; ++ } ++ ++ if (!(ci->flags & CLUSTER_FLAG_NONFULL)) { ++ list_add_tail(&ci->list, &p->nonfull_clusters[ci->order]); ++ ci->flags |= CLUSTER_FLAG_NONFULL; ++ } + } + + /* +@@ -552,6 +567,19 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, + if (tmp == SWAP_NEXT_INVALID) { + if (!list_empty(&si->free_clusters)) { + ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); ++ list_del(&ci->list); ++ spin_lock(&ci->lock); ++ ci->order = order; ++ ci->flags = 0; ++ spin_unlock(&ci->lock); ++ tmp = cluster_index(si, ci) * SWAPFILE_CLUSTER; ++ } else if (!list_empty(&si->nonfull_clusters[order])) { ++ ci = list_first_entry(&si->nonfull_clusters[order], ++ struct swap_cluster_info, list); ++ list_del(&ci->list); ++ spin_lock(&ci->lock); ++ ci->flags = 0; ++ spin_unlock(&ci->lock); + tmp = cluster_index(si, ci) * SWAPFILE_CLUSTER; + } 
else if (!list_empty(&si->discard_clusters)) { + /* +@@ -952,6 +980,7 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) + ci = lock_cluster(si, offset); + memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); + ci->count = 0; ++ ci->order = 0; + ci->flags = 0; + free_cluster(si, ci); + unlock_cluster(ci); +@@ -3013,6 +3042,9 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, + INIT_LIST_HEAD(&p->free_clusters); + INIT_LIST_HEAD(&p->discard_clusters); + ++ for (i = 0; i < SWAP_NR_ORDERS; i++) ++ INIT_LIST_HEAD(&p->nonfull_clusters[i]); ++ + for (i = 0; i < swap_header->info.nr_badpages; i++) { + unsigned int page_nr = swap_header->info.badpages[i]; + if (page_nr == 0 || page_nr > swap_header->info.last_page) +-- +Gitee + + +From 71c1b6bdf4681e292a269a16337b6fbf64c388d6 Mon Sep 17 00:00:00 2001 +From: Chris Li +Date: Wed, 18 Dec 2024 17:51:09 +0800 +Subject: [PATCH 04/14] mm: swap: separate SSD allocation from + scan_swap_map_slots() + +mainline inclusion +from mainline-v6.12-rc1 +commit 5f843a9a3a1e865fbf349419bde39977c2e7d3d1 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5f843a9a3a1e865fbf349419bde39977c2e7d3d1 + +-------------------------------- + +Previously the SSD and HDD share the same swap_map scan loop in +scan_swap_map_slots(). This function is complex and hard to flow the +execution flow. + +scan_swap_map_try_ssd_cluster() can already do most of the heavy lifting +to locate the candidate swap range in the cluster. However it needs to go +back to scan_swap_map_slots() to check conflict and then perform the +allocation. + +When scan_swap_map_try_ssd_cluster() failed, it still depended on the +scan_swap_map_slots() to do brute force scanning of the swap_map. When +the swapfile is large and almost full, it will take some CPU time to go +through the swap_map array. + +Get rid of the cluster allocation dependency on the swap_map scan loop in +scan_swap_map_slots(). Streamline the cluster allocation code path. No +more conflict checks. + +For order 0 swap entry, when run out of free and nonfull list. It will +allocate from the higher order nonfull cluster list. + +Users should see less CPU time spent on searching the free swap slot when +swapfile is almost full. 
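+
+The top-level split then looks like this (editorial sketch of the
+scan_swap_map_slots() hunk below, not part of the upstream patch):
+
+    if (si->cluster_info)   /* SSD: cluster lists only, no swap_map scan */
+        return cluster_alloc_swap(si, usage, nr, slots, order);
+
+    /* HDD: keep using the sequential swap_map scanning loop below */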
+ +[ryncsn@gmail.com: fix array-bounds error with CONFIG_THP_SWAP=n] + Link: https://lkml.kernel.org/r/CAMgjq7Bz0DY+rY0XgCoH7-Q=uHLdo3omi8kUr4ePDweNyofsbQ@mail.gmail.com +Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-3-cb9c148b9297@kernel.org +Signed-off-by: Chris Li +Signed-off-by: Kairui Song +Reported-by: Barry Song <21cnbao@gmail.com> +Cc: "Huang, Ying" +Cc: Hugh Dickins +Cc: Kalesh Singh +Cc: Ryan Roberts +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/swapfile.c | 300 ++++++++++++++++++++++++++++---------------------- + 1 file changed, 168 insertions(+), 132 deletions(-) + +diff --git a/mm/swapfile.c b/mm/swapfile.c +index adde6877c0fe..a3e721510311 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -52,6 +52,8 @@ + static bool swap_count_continued(struct swap_info_struct *, pgoff_t, + unsigned char); + static void free_swap_count_continuations(struct swap_info_struct *); ++static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, ++ unsigned int nr_entries); + + static DEFINE_SPINLOCK(swap_lock); + static unsigned int nr_swapfiles; +@@ -300,6 +302,12 @@ static inline unsigned int cluster_index(struct swap_info_struct *si, + return ci - si->cluster_info; + } + ++static inline unsigned int cluster_offset(struct swap_info_struct *si, ++ struct swap_cluster_info *ci) ++{ ++ return cluster_index(si, ci) * SWAPFILE_CLUSTER; ++} ++ + static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, + unsigned long offset) + { +@@ -371,11 +379,15 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, + + static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) + { ++ lockdep_assert_held(&si->lock); ++ lockdep_assert_held(&ci->lock); ++ + if (ci->flags & CLUSTER_FLAG_NONFULL) + list_move_tail(&ci->list, &si->free_clusters); + else + list_add_tail(&ci->list, &si->free_clusters); + ci->flags = CLUSTER_FLAG_FREE; ++ ci->order = 0; + } + + /* +@@ -430,9 +442,11 @@ static struct swap_cluster_info *alloc_cluster(struct swap_info_struct *si, unsi + struct swap_cluster_info *ci = list_first_entry(&si->free_clusters, + struct swap_cluster_info, list); + ++ lockdep_assert_held(&si->lock); ++ lockdep_assert_held(&ci->lock); + VM_BUG_ON(cluster_index(si, ci) != idx); ++ VM_BUG_ON(ci->count); + list_del(&ci->list); +- ci->count = 0; + ci->flags = 0; + return ci; + } +@@ -440,6 +454,8 @@ static struct swap_cluster_info *alloc_cluster(struct swap_info_struct *si, unsi + static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) + { + VM_BUG_ON(ci->count != 0); ++ lockdep_assert_held(&si->lock); ++ lockdep_assert_held(&ci->lock); + /* + * If the swap is discardable, prepare discard the cluster + * instead of free it immediately. 
The cluster will be freed +@@ -496,6 +512,9 @@ static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluste + return; + + VM_BUG_ON(ci->count == 0); ++ VM_BUG_ON(cluster_is_free(ci)); ++ lockdep_assert_held(&p->lock); ++ lockdep_assert_held(&ci->lock); + ci->count--; + + if (!ci->count) { +@@ -504,48 +523,88 @@ static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluste + } + + if (!(ci->flags & CLUSTER_FLAG_NONFULL)) { ++ VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); + list_add_tail(&ci->list, &p->nonfull_clusters[ci->order]); +- ci->flags |= CLUSTER_FLAG_NONFULL; ++ ci->flags = CLUSTER_FLAG_NONFULL; + } + } + +-/* +- * It's possible scan_swap_map_slots() uses a free cluster in the middle of free +- * cluster list. Avoiding such abuse to avoid list corruption. +- */ +-static bool +-scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, +- unsigned long offset, int order) ++static inline bool cluster_scan_range(struct swap_info_struct *si, unsigned int start, ++ unsigned int nr_pages) + { +- struct percpu_cluster *percpu_cluster; +- bool conflict; +- struct swap_cluster_info *first = list_first_entry(&si->free_clusters, +- struct swap_cluster_info, list); +- +- offset /= SWAPFILE_CLUSTER; +- conflict = !list_empty(&si->free_clusters) && +- offset != cluster_index(si, first) && +- cluster_is_free(&si->cluster_info[offset]); ++ unsigned char *p = si->swap_map + start; ++ unsigned char *end = p + nr_pages; + +- if (!conflict) +- return false; ++ while (p < end) ++ if (*p++) ++ return false; + +- percpu_cluster = this_cpu_ptr(si->percpu_cluster); +- percpu_cluster->next[order] = SWAP_NEXT_INVALID; + return true; + } + +-static inline bool swap_range_empty(char *swap_map, unsigned int start, +- unsigned int nr_pages) ++ ++static inline void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, ++ unsigned int start, unsigned char usage, ++ unsigned int order) + { +- unsigned int i; ++ unsigned int nr_pages = 1 << order; + +- for (i = 0; i < nr_pages; i++) { +- if (swap_map[start + i]) +- return false; ++ if (cluster_is_free(ci)) { ++ if (nr_pages < SWAPFILE_CLUSTER) { ++ list_move_tail(&ci->list, &si->nonfull_clusters[order]); ++ ci->flags = CLUSTER_FLAG_NONFULL; ++ } ++ ci->order = order; + } + +- return true; ++ memset(si->swap_map + start, usage, nr_pages); ++ swap_range_alloc(si, start, nr_pages); ++ ci->count += nr_pages; ++ ++ if (ci->count == SWAPFILE_CLUSTER) { ++ VM_BUG_ON(!(ci->flags & (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL))); ++ list_del(&ci->list); ++ ci->flags = 0; ++ } ++} ++ ++static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigned long offset, ++ unsigned int *foundp, unsigned int order, ++ unsigned char usage) ++{ ++ unsigned long start = offset & ~(SWAPFILE_CLUSTER - 1); ++ unsigned long end = min(start + SWAPFILE_CLUSTER, si->max); ++ unsigned int nr_pages = 1 << order; ++ struct swap_cluster_info *ci; ++ ++ if (end < nr_pages) ++ return SWAP_NEXT_INVALID; ++ end -= nr_pages; ++ ++ ci = lock_cluster(si, offset); ++ if (ci->count + nr_pages > SWAPFILE_CLUSTER) { ++ offset = SWAP_NEXT_INVALID; ++ goto done; ++ } ++ ++ while (offset <= end) { ++ if (cluster_scan_range(si, offset, nr_pages)) { ++ cluster_alloc_range(si, ci, offset, usage, order); ++ *foundp = offset; ++ if (ci->count == SWAPFILE_CLUSTER) { ++ offset = SWAP_NEXT_INVALID; ++ goto done; ++ } ++ offset += nr_pages; ++ break; ++ } ++ offset += nr_pages; ++ } ++ if (offset > end) ++ offset = SWAP_NEXT_INVALID; ++done: 
++ unlock_cluster(ci); ++ return offset; + } + + /* +@@ -553,72 +612,66 @@ static inline bool swap_range_empty(char *swap_map, unsigned int start, + * pool (a cluster). This might involve allocating a new cluster for current CPU + * too. + */ +-static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, +- unsigned long *offset, unsigned long *scan_base, int order) ++static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order, ++ unsigned char usage) + { +- unsigned int nr_pages = 1 << order; + struct percpu_cluster *cluster; +- struct swap_cluster_info *ci; +- unsigned int tmp, max; ++ struct swap_cluster_info *ci, *n; ++ unsigned int offset, found = 0; + + new_cluster: ++ lockdep_assert_held(&si->lock); + cluster = this_cpu_ptr(si->percpu_cluster); +- tmp = cluster->next[order]; +- if (tmp == SWAP_NEXT_INVALID) { +- if (!list_empty(&si->free_clusters)) { +- ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); +- list_del(&ci->list); +- spin_lock(&ci->lock); +- ci->order = order; +- ci->flags = 0; +- spin_unlock(&ci->lock); +- tmp = cluster_index(si, ci) * SWAPFILE_CLUSTER; +- } else if (!list_empty(&si->nonfull_clusters[order])) { +- ci = list_first_entry(&si->nonfull_clusters[order], +- struct swap_cluster_info, list); +- list_del(&ci->list); +- spin_lock(&ci->lock); +- ci->flags = 0; +- spin_unlock(&ci->lock); +- tmp = cluster_index(si, ci) * SWAPFILE_CLUSTER; +- } else if (!list_empty(&si->discard_clusters)) { +- /* +- * we don't have free cluster but have some clusters in +- * discarding, do discard now and reclaim them, then +- * reread cluster_next_cpu since we dropped si->lock +- */ +- swap_do_scheduled_discard(si); +- *scan_base = this_cpu_read(*si->cluster_next_cpu); +- *offset = *scan_base; +- goto new_cluster; +- } else +- return false; ++ offset = cluster->next[order]; ++ if (offset) { ++ offset = alloc_swap_scan_cluster(si, offset, &found, order, usage); ++ if (found) ++ goto done; + } + +- /* +- * Other CPUs can use our cluster if they can't find a free cluster, +- * check if there is still free entry in the cluster, maintaining +- * natural alignment. +- */ +- max = min_t(unsigned long, si->max, ALIGN(tmp + 1, SWAPFILE_CLUSTER)); +- if (tmp < max) { +- ci = lock_cluster(si, tmp); +- while (tmp < max) { +- if (swap_range_empty(si->swap_map, tmp, nr_pages)) +- break; +- tmp += nr_pages; ++ if (!list_empty(&si->free_clusters)) { ++ ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); ++ offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage); ++ VM_BUG_ON(!found); ++ goto done; ++ } ++ ++ if (order < PMD_ORDER) { ++ list_for_each_entry_safe(ci, n, &si->nonfull_clusters[order], list) { ++ offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), ++ &found, order, usage); ++ if (found) ++ goto done; + } +- unlock_cluster(ci); + } +- if (tmp >= max) { +- cluster->next[order] = SWAP_NEXT_INVALID; ++ ++ if (!list_empty(&si->discard_clusters)) { ++ /* ++ * we don't have free cluster but have some clusters in ++ * discarding, do discard now and reclaim them, then ++ * reread cluster_next_cpu since we dropped si->lock ++ */ ++ swap_do_scheduled_discard(si); + goto new_cluster; + } +- *offset = tmp; +- *scan_base = tmp; +- tmp += nr_pages; +- cluster->next[order] = tmp < max ? 
tmp : SWAP_NEXT_INVALID; +- return true; ++ ++ if (order) ++ goto done; ++ ++ for (int o = 1; o < SWAP_NR_ORDERS; o++) { ++ if (!list_empty(&si->nonfull_clusters[o])) { ++ ci = list_first_entry(&si->nonfull_clusters[o], struct swap_cluster_info, ++ list); ++ offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), ++ &found, 0, usage); ++ VM_BUG_ON(!found); ++ goto done; ++ } ++ } ++ ++done: ++ cluster->next[order] = offset; ++ return found; + } + + static void __del_from_avail_list(struct swap_info_struct *p) +@@ -739,11 +792,29 @@ static bool swap_offset_available_and_locked(struct swap_info_struct *si, + return false; + } + ++static int cluster_alloc_swap(struct swap_info_struct *si, ++ unsigned char usage, int nr, ++ swp_entry_t slots[], int order) ++{ ++ int n_ret = 0; ++ ++ VM_BUG_ON(!si->cluster_info); ++ ++ while (n_ret < nr) { ++ unsigned long offset = cluster_alloc_swap_entry(si, order, usage); ++ ++ if (!offset) ++ break; ++ slots[n_ret++] = swp_entry(si->type, offset); ++ } ++ ++ return n_ret; ++} ++ + static int scan_swap_map_slots(struct swap_info_struct *si, + unsigned char usage, int nr, + swp_entry_t slots[], int order) + { +- struct swap_cluster_info *ci; + unsigned long offset; + unsigned long scan_base; + unsigned long last_in_cluster = 0; +@@ -782,26 +853,16 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + return 0; + } + ++ if (si->cluster_info) ++ return cluster_alloc_swap(si, usage, nr, slots, order); ++ + si->flags += SWP_SCANNING; +- /* +- * Use percpu scan base for SSD to reduce lock contention on +- * cluster and swap cache. For HDD, sequential access is more +- * important. +- */ +- if (si->flags & SWP_SOLIDSTATE) +- scan_base = this_cpu_read(*si->cluster_next_cpu); +- else +- scan_base = si->cluster_next; ++ ++ /* For HDD, sequential access is more important. */ ++ scan_base = si->cluster_next; + offset = scan_base; + +- /* SSD algorithm */ +- if (si->cluster_info) { +- if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order)) { +- if (order > 0) +- goto no_page; +- goto scan; +- } +- } else if (unlikely(!si->cluster_nr--)) { ++ if (unlikely(!si->cluster_nr--)) { + if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { + si->cluster_nr = SWAPFILE_CLUSTER - 1; + goto checks; +@@ -812,8 +873,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + /* + * If seek is expensive, start searching for new cluster from + * start of partition, to minimize the span of allocated swap. +- * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info +- * case, just handled by scan_swap_map_try_ssd_cluster() above. + */ + scan_base = offset = si->lowest_bit; + last_in_cluster = offset + SWAPFILE_CLUSTER - 1; +@@ -841,19 +900,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + } + + checks: +- if (si->cluster_info) { +- while (scan_swap_map_ssd_cluster_conflict(si, offset, order)) { +- /* take a break if we already got some slots */ +- if (n_ret) +- goto done; +- if (!scan_swap_map_try_ssd_cluster(si, &offset, +- &scan_base, order)) { +- if (order > 0) +- goto no_page; +- goto scan; +- } +- } +- } + if (!(si->flags & SWP_WRITEOK)) + goto no_page; + if (!si->highest_bit) +@@ -861,11 +907,9 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + if (offset > si->highest_bit) + scan_base = offset = si->lowest_bit; + +- ci = lock_cluster(si, offset); + /* reuse swap entry of cache-only swap if not busy. 
*/ + if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + int swap_was_freed; +- unlock_cluster(ci); + spin_unlock(&si->lock); + swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); + spin_lock(&si->lock); +@@ -876,15 +920,12 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + } + + if (si->swap_map[offset]) { +- unlock_cluster(ci); + if (!n_ret) + goto scan; + else + goto done; + } + memset(si->swap_map + offset, usage, nr_pages); +- add_cluster_info_page(si, si->cluster_info, offset, nr_pages); +- unlock_cluster(ci); + + swap_range_alloc(si, offset, nr_pages); + slots[n_ret++] = swp_entry(si->type, offset); +@@ -905,13 +946,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + latency_ration = LATENCY_LIMIT; + } + +- /* try to get more slots in cluster */ +- if (si->cluster_info) { +- if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order)) +- goto checks; +- if (order > 0) +- goto done; +- } else if (si->cluster_nr && !si->swap_map[++offset]) { ++ if (si->cluster_nr && !si->swap_map[++offset]) { + /* non-ssd case, still more slots in cluster? */ + --si->cluster_nr; + goto checks; +@@ -980,8 +1015,6 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) + ci = lock_cluster(si, offset); + memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); + ci->count = 0; +- ci->order = 0; +- ci->flags = 0; + free_cluster(si, ci); + unlock_cluster(ci); + swap_range_free(si, offset, SWAPFILE_CLUSTER); +@@ -3099,8 +3132,11 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, + ci = cluster_info + idx; + if (idx >= nr_clusters) + continue; +- if (ci->count) ++ if (ci->count) { ++ ci->flags = CLUSTER_FLAG_NONFULL; ++ list_add_tail(&ci->list, &p->nonfull_clusters[0]); + continue; ++ } + ci->flags = CLUSTER_FLAG_FREE; + list_add_tail(&ci->list, &p->free_clusters); + } +-- +Gitee + + +From 4db67dafd426f7dd2fbde13583c1875a2b242b95 Mon Sep 17 00:00:00 2001 +From: Kairui Song +Date: Wed, 18 Dec 2024 17:51:10 +0800 +Subject: [PATCH 05/14] mm: swap: clean up initialization helper + +mainline inclusion +from mainline-v6.12-rc1 +commit 3b2561b5daeb3531c011491e9a6d2b934cc8f49f +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3b2561b5daeb3531c011491e9a6d2b934cc8f49f + +-------------------------------- + +At this point, alloc_cluster is never called already, and +inc_cluster_info_page is called by initialization only, a lot of dead code +can be dropped. 
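+
+After the cleanup, the init-only helper reduces to little more than a
+bounds-checked counter bump (editorial summary of the hunk below, not part
+of the upstream patch):
+
+    ci = cluster_info + page_nr / SWAPFILE_CLUSTER;
+    ci->count++;
+    VM_BUG_ON(ci->count > SWAPFILE_CLUSTER);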
+ +Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-4-cb9c148b9297@kernel.org +Signed-off-by: Kairui Song +Reported-by: Barry Song <21cnbao@gmail.com> +Cc: Chris Li +Cc: "Huang, Ying" +Cc: Hugh Dickins +Cc: Kalesh Singh +Cc: Ryan Roberts +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/swapfile.c | 44 ++++++++++---------------------------------- + 1 file changed, 10 insertions(+), 34 deletions(-) + +diff --git a/mm/swapfile.c b/mm/swapfile.c +index a3e721510311..4be5fbbdc1c8 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -437,20 +437,6 @@ static void swap_users_ref_free(struct percpu_ref *ref) + complete(&si->comp); + } + +-static struct swap_cluster_info *alloc_cluster(struct swap_info_struct *si, unsigned long idx) +-{ +- struct swap_cluster_info *ci = list_first_entry(&si->free_clusters, +- struct swap_cluster_info, list); +- +- lockdep_assert_held(&si->lock); +- lockdep_assert_held(&ci->lock); +- VM_BUG_ON(cluster_index(si, ci) != idx); +- VM_BUG_ON(ci->count); +- list_del(&ci->list); +- ci->flags = 0; +- return ci; +-} +- + static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) + { + VM_BUG_ON(ci->count != 0); +@@ -471,34 +457,24 @@ static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info * + } + + /* +- * The cluster corresponding to page_nr will be used. The cluster will be +- * removed from free cluster list and its usage counter will be increased by +- * count. ++ * The cluster corresponding to page_nr will be used. The cluster will not be ++ * added to free cluster list and its usage counter will be increased by 1. ++ * Only used for initialization. + */ +-static void add_cluster_info_page(struct swap_info_struct *p, +- struct swap_cluster_info *cluster_info, unsigned long page_nr, +- unsigned long count) ++static void inc_cluster_info_page(struct swap_info_struct *p, ++ struct swap_cluster_info *cluster_info, unsigned long page_nr) + { + unsigned long idx = page_nr / SWAPFILE_CLUSTER; +- struct swap_cluster_info *ci = cluster_info + idx; ++ struct swap_cluster_info *ci; + + if (!cluster_info) + return; +- if (cluster_is_free(ci)) +- alloc_cluster(p, idx); + +- VM_BUG_ON(ci->count + count > SWAPFILE_CLUSTER); +- ci->count += count; +-} ++ ci = cluster_info + idx; ++ ci->count++; + +-/* +- * The cluster corresponding to page_nr will be used. The cluster will be +- * removed from free cluster list and its usage counter will be increased by 1. +- */ +-static void inc_cluster_info_page(struct swap_info_struct *p, +- struct swap_cluster_info *cluster_info, unsigned long page_nr) +-{ +- add_cluster_info_page(p, cluster_info, page_nr, 1); ++ VM_BUG_ON(ci->count > SWAPFILE_CLUSTER); ++ VM_BUG_ON(ci->flags); + } + + /* +-- +Gitee + + +From 18f732c19747e766e0632419f32dfb02768ada67 Mon Sep 17 00:00:00 2001 +From: Kairui Song +Date: Wed, 18 Dec 2024 17:51:11 +0800 +Subject: [PATCH 06/14] mm: swap: skip slot cache on freeing for mTHP + +mainline inclusion +from mainline-v6.12-rc1 +commit 650975d2b181e30c9017c42cb3f6535287555b1e +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=650975d2b181e30c9017c42cb3f6535287555b1e + +-------------------------------- + +Currently when we are freeing mTHP folios from swap cache, we free then +one by one and put each entry into swap slot cache. 
Slot cache is +designed to reduce the overhead by batching the freeing, but mTHP swap +entries are already continuous so they can be batch freed without it +already, it saves litle overhead, or even increase overhead for larger +mTHP. + +What's more, mTHP entries could stay in swap cache for a while. +Contiguous swap entry is an rather rare resource so releasing them +directly can help improve mTHP allocation success rate when under +pressure. + +Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-5-cb9c148b9297@kernel.org +Signed-off-by: Kairui Song +Reported-by: Barry Song <21cnbao@gmail.com> +Acked-by: Barry Song +Cc: Chris Li +Cc: "Huang, Ying" +Cc: Hugh Dickins +Cc: Kalesh Singh +Cc: Ryan Roberts +Signed-off-by: Andrew Morton +Conflicts: + mm/swapfile.c +Signed-off-by: Liu Shixin +--- + mm/swapfile.c | 59 +++++++++++++++++++++++---------------------------- + 1 file changed, 26 insertions(+), 33 deletions(-) + +diff --git a/mm/swapfile.c b/mm/swapfile.c +index 4be5fbbdc1c8..44726e0b8f8f 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -478,20 +478,21 @@ static void inc_cluster_info_page(struct swap_info_struct *p, + } + + /* +- * The cluster ci decreases one usage. If the usage counter becomes 0, ++ * The cluster ci decreases @nr_pages usage. If the usage counter becomes 0, + * which means no page in the cluster is in use, we can optionally discard + * the cluster and add it to free cluster list. + */ +-static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluster_info *ci) ++static void dec_cluster_info_page(struct swap_info_struct *p, ++ struct swap_cluster_info *ci, int nr_pages) + { + if (!p->cluster_info) + return; + +- VM_BUG_ON(ci->count == 0); ++ VM_BUG_ON(ci->count < nr_pages); + VM_BUG_ON(cluster_is_free(ci)); + lockdep_assert_held(&p->lock); + lockdep_assert_held(&ci->lock); +- ci->count--; ++ ci->count -= nr_pages; + + if (!ci->count) { + free_cluster(p, ci); +@@ -983,19 +984,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + return n_ret; + } + +-static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) +-{ +- unsigned long offset = idx * SWAPFILE_CLUSTER; +- struct swap_cluster_info *ci; +- +- ci = lock_cluster(si, offset); +- memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); +- ci->count = 0; +- free_cluster(si, ci); +- unlock_cluster(ci); +- swap_range_free(si, offset, SWAPFILE_CLUSTER); +-} +- + #ifdef CONFIG_MEMCG_SWAP_QOS + int write_swapfile_for_memcg(struct address_space *mapping, int *swap_type) + { +@@ -1343,21 +1331,28 @@ static unsigned char __swap_entry_free(struct swap_info_struct *p, + return usage; + } + +-static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry) ++/* ++ * Drop the last HAS_CACHE flag of swap entries, caller have to ++ * ensure all entries belong to the same cgroup. 
++ */ ++static void swap_entry_range_free(struct swap_info_struct *p, swp_entry_t entry, ++ unsigned int nr_pages) + { +- struct swap_cluster_info *ci; + unsigned long offset = swp_offset(entry); +- unsigned char count; ++ unsigned char *map = p->swap_map + offset; ++ unsigned char *map_end = map + nr_pages; ++ struct swap_cluster_info *ci; + + ci = lock_cluster(p, offset); +- count = p->swap_map[offset]; +- VM_BUG_ON(count != SWAP_HAS_CACHE); +- p->swap_map[offset] = 0; +- dec_cluster_info_page(p, ci); ++ do { ++ VM_BUG_ON(*map != SWAP_HAS_CACHE); ++ *map = 0; ++ } while (++map < map_end); ++ dec_cluster_info_page(p, ci, nr_pages); + unlock_cluster(ci); + +- mem_cgroup_uncharge_swap(entry, 1); +- swap_range_free(p, offset, 1); ++ mem_cgroup_uncharge_swap(entry, nr_pages); ++ swap_range_free(p, offset, nr_pages); + } + + static void cluster_swap_free_nr(struct swap_info_struct *sis, +@@ -1418,7 +1413,6 @@ void swap_free_nr(swp_entry_t entry, int nr_pages) + void put_swap_folio(struct folio *folio, swp_entry_t entry) + { + unsigned long offset = swp_offset(entry); +- unsigned long idx = offset / SWAPFILE_CLUSTER; + struct swap_cluster_info *ci; + struct swap_info_struct *si; + unsigned char *map; +@@ -1431,19 +1425,18 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) + return; + + ci = lock_cluster_or_swap_info(si, offset); +- if (size == SWAPFILE_CLUSTER) { ++ if (size > 1) { + map = si->swap_map + offset; +- for (i = 0; i < SWAPFILE_CLUSTER; i++) { ++ for (i = 0; i < size; i++) { + val = map[i]; + VM_BUG_ON(!(val & SWAP_HAS_CACHE)); + if (val == SWAP_HAS_CACHE) + free_entries++; + } +- if (free_entries == SWAPFILE_CLUSTER) { ++ if (free_entries == size) { + unlock_cluster_or_swap_info(si, ci); + spin_lock(&si->lock); +- mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); +- swap_free_cluster(si, idx); ++ swap_entry_range_free(si, entry, size); + spin_unlock(&si->lock); + return; + } +@@ -1488,7 +1481,7 @@ void swapcache_free_entries(swp_entry_t *entries, int n) + for (i = 0; i < n; ++i) { + p = swap_info_get_cont(entries[i], prev); + if (p) +- swap_entry_free(p, entries[i]); ++ swap_entry_range_free(p, entries[i], 1); + prev = p; + } + if (p) +-- +Gitee + + +From 53a99352d0946625a0d45deeb8d0729855d4b080 Mon Sep 17 00:00:00 2001 +From: Kairui Song +Date: Wed, 18 Dec 2024 17:51:12 +0800 +Subject: [PATCH 07/14] mm: swap: allow cache reclaim to skip slot cache + +mainline inclusion +from mainline-v6.12-rc1 +commit 862590ac3708e1cbbfb02a8ed78587b86ecba4ba +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=862590ac3708e1cbbfb02a8ed78587b86ecba4ba + +-------------------------------- + +Currently we free the reclaimed slots through slot cache even if the slot +is required to be empty immediately. As a result the reclaim caller will +see the slot still occupied even after a successful reclaim, and need to +keep reclaiming until slot cache get flushed. This caused ineffective or +over reclaim when SWAP is under stress. + +So introduce a new flag allowing the slot to be emptied bypassing the slot +cache. 
+ +[21cnbao@gmail.com: small folios should have nr_pages == 1 but not nr_page == 0] + Link: https://lkml.kernel.org/r/20240805015324.45134-1-21cnbao@gmail.com +Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-6-cb9c148b9297@kernel.org +Signed-off-by: Kairui Song +Reported-by: Barry Song <21cnbao@gmail.com> +Cc: Chris Li +Cc: "Huang, Ying" +Cc: Hugh Dickins +Cc: Kalesh Singh +Cc: Ryan Roberts +Signed-off-by: Andrew Morton +Conflicts: + mm/swapfile.c +Signed-off-by: Liu Shixin +--- + mm/swapfile.c | 152 ++++++++++++++++++++++++++++++++++++-------------- + 1 file changed, 109 insertions(+), 43 deletions(-) + +diff --git a/mm/swapfile.c b/mm/swapfile.c +index 44726e0b8f8f..e58457b801fb 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -52,8 +52,15 @@ + static bool swap_count_continued(struct swap_info_struct *, pgoff_t, + unsigned char); + static void free_swap_count_continuations(struct swap_info_struct *); ++static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry, ++ unsigned int nr_pages); + static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, + unsigned int nr_entries); ++static bool folio_swapcache_freeable(struct folio *folio); ++static struct swap_cluster_info *lock_cluster_or_swap_info( ++ struct swap_info_struct *si, unsigned long offset); ++static void unlock_cluster_or_swap_info(struct swap_info_struct *si, ++ struct swap_cluster_info *ci); + + static DEFINE_SPINLOCK(swap_lock); + static unsigned int nr_swapfiles; +@@ -128,8 +135,25 @@ static inline unsigned char swap_count(unsigned char ent) + * corresponding page + */ + #define TTRS_UNMAPPED 0x2 +-/* Reclaim the swap entry if swap is getting full*/ ++/* Reclaim the swap entry if swap is getting full */ + #define TTRS_FULL 0x4 ++/* Reclaim directly, bypass the slot cache and don't touch device lock */ ++#define TTRS_DIRECT 0x8 ++ ++static bool swap_is_has_cache(struct swap_info_struct *si, ++ unsigned long offset, int nr_pages) ++{ ++ unsigned char *map = si->swap_map + offset; ++ unsigned char *map_end = map + nr_pages; ++ ++ do { ++ VM_BUG_ON(!(*map & SWAP_HAS_CACHE)); ++ if (*map != SWAP_HAS_CACHE) ++ return false; ++ } while (++map < map_end); ++ ++ return true; ++} + + /* + * returns number of pages in the folio that backs the swap entry. If positive, +@@ -140,12 +164,22 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, + unsigned long offset, unsigned long flags) + { + swp_entry_t entry = swp_entry(si->type, offset); ++ struct address_space *address_space = swap_address_space(entry); ++ struct swap_cluster_info *ci; + struct folio *folio; +- int ret = 0; ++ int ret, nr_pages; ++ bool need_reclaim; + +- folio = filemap_get_folio(swap_address_space(entry), offset); ++ folio = filemap_get_folio(address_space, offset); + if (IS_ERR(folio)) + return 0; ++ ++ /* offset could point to the middle of a large folio */ ++ entry = folio->swap; ++ offset = swp_offset(entry); ++ nr_pages = folio_nr_pages(folio); ++ ret = -nr_pages; ++ + /* + * When this function is called from scan_swap_map_slots() and it's + * called by vmscan.c at reclaiming folios. So we hold a folio lock +@@ -153,14 +187,50 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, + * case and you should use folio_free_swap() with explicit folio_lock() + * in usual operations. 
+ */ +- if (folio_trylock(folio)) { +- if ((flags & TTRS_ANYWAY) || +- ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || +- ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))) +- ret = folio_free_swap(folio); +- folio_unlock(folio); ++ if (!folio_trylock(folio)) ++ goto out; ++ ++ need_reclaim = ((flags & TTRS_ANYWAY) || ++ ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || ++ ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))); ++ if (!need_reclaim || !folio_swapcache_freeable(folio)) ++ goto out_unlock; ++ ++ /* ++ * It's safe to delete the folio from swap cache only if the folio's ++ * swap_map is HAS_CACHE only, which means the slots have no page table ++ * reference or pending writeback, and can't be allocated to others. ++ */ ++ ci = lock_cluster_or_swap_info(si, offset); ++ need_reclaim = swap_is_has_cache(si, offset, nr_pages); ++ unlock_cluster_or_swap_info(si, ci); ++ if (!need_reclaim) ++ goto out_unlock; ++ ++ if (!(flags & TTRS_DIRECT)) { ++ /* Free through slot cache */ ++ delete_from_swap_cache(folio); ++ folio_set_dirty(folio); ++ ret = nr_pages; ++ goto out_unlock; + } +- ret = ret ? folio_nr_pages(folio) : -folio_nr_pages(folio); ++ ++ xa_lock_irq(&address_space->i_pages); ++ __delete_from_swap_cache(folio, entry, NULL); ++ xa_unlock_irq(&address_space->i_pages); ++ folio_ref_sub(folio, nr_pages); ++ folio_set_dirty(folio); ++ ++ spin_lock(&si->lock); ++ /* Only sinple page folio can be backed by zswap */ ++ if (nr_pages == 1) ++ zswap_invalidate(entry); ++ swap_entry_range_free(si, entry, nr_pages); ++ spin_unlock(&si->lock); ++ ret = nr_pages; ++out_unlock: ++ folio_unlock(folio); ++out: + folio_put(folio); + return ret; + } +@@ -888,7 +958,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + int swap_was_freed; + spin_unlock(&si->lock); +- swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); ++ swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT); + spin_lock(&si->lock); + /* entry was freed successfully, try to use this again */ + if (swap_was_freed > 0) +@@ -1415,9 +1485,6 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) + unsigned long offset = swp_offset(entry); + struct swap_cluster_info *ci; + struct swap_info_struct *si; +- unsigned char *map; +- unsigned int i, free_entries = 0; +- unsigned char val; + int size = 1 << swap_entry_order(folio_order(folio)); + + si = _swap_info_get(entry); +@@ -1425,23 +1492,14 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) + return; + + ci = lock_cluster_or_swap_info(si, offset); +- if (size > 1) { +- map = si->swap_map + offset; +- for (i = 0; i < size; i++) { +- val = map[i]; +- VM_BUG_ON(!(val & SWAP_HAS_CACHE)); +- if (val == SWAP_HAS_CACHE) +- free_entries++; +- } +- if (free_entries == size) { +- unlock_cluster_or_swap_info(si, ci); +- spin_lock(&si->lock); +- swap_entry_range_free(si, entry, size); +- spin_unlock(&si->lock); +- return; +- } ++ if (size > 1 && swap_is_has_cache(si, offset, size)) { ++ unlock_cluster_or_swap_info(si, ci); ++ spin_lock(&si->lock); ++ swap_entry_range_free(si, entry, size); ++ spin_unlock(&si->lock); ++ return; + } +- for (i = 0; i < size; i++, entry.val++) { ++ for (int i = 0; i < size; i++, entry.val++) { + if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) { + unlock_cluster_or_swap_info(si, ci); + free_swap_slot(entry); +@@ -1601,16 +1659,7 @@ static bool folio_swapped(struct folio *folio) + return 
swap_page_trans_huge_swapped(si, entry, folio_order(folio)); + } + +-/** +- * folio_free_swap() - Free the swap space used for this folio. +- * @folio: The folio to remove. +- * +- * If swap is getting full, or if there are no more mappings of this folio, +- * then call folio_free_swap to free its swap space. +- * +- * Return: true if we were able to release the swap space. +- */ +-bool folio_free_swap(struct folio *folio) ++static bool folio_swapcache_freeable(struct folio *folio) + { + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + +@@ -1618,8 +1667,6 @@ bool folio_free_swap(struct folio *folio) + return false; + if (folio_test_writeback(folio)) + return false; +- if (folio_swapped(folio)) +- return false; + + /* + * Once hibernation has begun to create its image of memory, +@@ -1639,6 +1686,25 @@ bool folio_free_swap(struct folio *folio) + if (pm_suspended_storage()) + return false; + ++ return true; ++} ++ ++/** ++ * folio_free_swap() - Free the swap space used for this folio. ++ * @folio: The folio to remove. ++ * ++ * If swap is getting full, or if there are no more mappings of this folio, ++ * then call folio_free_swap to free its swap space. ++ * ++ * Return: true if we were able to release the swap space. ++ */ ++bool folio_free_swap(struct folio *folio) ++{ ++ if (!folio_swapcache_freeable(folio)) ++ return false; ++ if (folio_swapped(folio)) ++ return false; ++ + delete_from_swap_cache(folio); + folio_set_dirty(folio); + return true; +@@ -1715,7 +1781,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) + * to the next boundary. + */ + nr = __try_to_reclaim_swap(si, offset, +- TTRS_UNMAPPED | TTRS_FULL); ++ TTRS_UNMAPPED | TTRS_FULL); + if (nr == 0) + nr = 1; + else if (nr < 0) +-- +Gitee + + +From a1f6274ecbb551837ea7a66e740c660f405a2443 Mon Sep 17 00:00:00 2001 +From: Kairui Song +Date: Wed, 18 Dec 2024 17:51:13 +0800 +Subject: [PATCH 08/14] mm: swap: add a fragment cluster list + +mainline inclusion +from mainline-v6.12-rc1 +commit 477cb7ba28892eda112c79d8f75d10edabfc3050 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=477cb7ba28892eda112c79d8f75d10edabfc3050 + +-------------------------------- + +Now swap cluster allocator arranges the clusters in LRU style, so the +"cold" cluster stay at the head of nonfull lists are the ones that were +used for allocation long time ago and still partially occupied. So if +allocator can't find enough contiguous slots to satisfy an high order +allocation, it's unlikely there will be slot being free on them to satisfy +the allocation, at least in a short period. + +As a result, nonfull cluster scanning will waste time repeatly scanning +the unusable head of the list. + +Also, multiple CPUs could content on the same head cluster of nonfull +list. Unlike free clusters which are removed from the list when any CPU +starts using it, nonfull cluster stays on the head. + +So introduce a new list frag list, all scanned nonfull clusters will be +moved to this list. Both for avoiding repeated scanning and contention. + +Frag list is still used as fallback for allocations, so if one CPU failed +to allocate one order of slots, it can still steal other CPU's clusters. +And order 0 will favor the fragmented clusters to better protect nonfull +clusters + +If any slots on a fragment list are being freed, move the fragment list +back to nonfull list indicating it worth another scan on the cluster. 
+Compared to scan upon freeing a slot, this keep the scanning lazy and save +some CPU if there are still other clusters to use. + +It may seems unneccessay to keep the fragmented cluster on list at all if +they can't be used for specific order allocation. But this will start to +make sense once reclaim dring scanning is ready. + +Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-7-cb9c148b9297@kernel.org +Signed-off-by: Kairui Song +Reported-by: Barry Song <21cnbao@gmail.com> +Cc: Chris Li +Cc: "Huang, Ying" +Cc: Hugh Dickins +Cc: Kalesh Singh +Cc: Ryan Roberts +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + include/linux/swap.h | 3 +++ + mm/swapfile.c | 41 +++++++++++++++++++++++++++++++++++++---- + 2 files changed, 40 insertions(+), 4 deletions(-) + +diff --git a/include/linux/swap.h b/include/linux/swap.h +index 29a1daa46421..81188caed2d2 100644 +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -271,6 +271,7 @@ struct swap_cluster_info { + }; + #define CLUSTER_FLAG_FREE 1 /* This cluster is free */ + #define CLUSTER_FLAG_NONFULL 2 /* This cluster is on nonfull list */ ++#define CLUSTER_FLAG_FRAG 4 /* This cluster is on nonfull list */ + + /* + * The first page in the swap file is the swap header, which is always marked +@@ -310,6 +311,8 @@ struct swap_info_struct { + struct list_head free_clusters; /* free clusters list */ + struct list_head nonfull_clusters[SWAP_NR_ORDERS]; + /* list of cluster that contains at least one free slot */ ++ struct list_head frag_clusters[SWAP_NR_ORDERS]; ++ /* list of cluster that are fragmented or contented */ + unsigned int lowest_bit; /* index of first free in swap_map */ + unsigned int highest_bit; /* index of last free in swap_map */ + unsigned int pages; /* total of usable pages of swap */ +diff --git a/mm/swapfile.c b/mm/swapfile.c +index e58457b801fb..7c71e7df9cf3 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -571,7 +571,10 @@ static void dec_cluster_info_page(struct swap_info_struct *p, + + if (!(ci->flags & CLUSTER_FLAG_NONFULL)) { + VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); +- list_add_tail(&ci->list, &p->nonfull_clusters[ci->order]); ++ if (ci->flags & CLUSTER_FLAG_FRAG) ++ list_move_tail(&ci->list, &p->nonfull_clusters[ci->order]); ++ else ++ list_add_tail(&ci->list, &p->nonfull_clusters[ci->order]); + ci->flags = CLUSTER_FLAG_NONFULL; + } + } +@@ -609,7 +612,8 @@ static inline void cluster_alloc_range(struct swap_info_struct *si, struct swap_ + ci->count += nr_pages; + + if (ci->count == SWAPFILE_CLUSTER) { +- VM_BUG_ON(!(ci->flags & (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL))); ++ VM_BUG_ON(!(ci->flags & ++ (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL | CLUSTER_FLAG_FRAG))); + list_del(&ci->list); + ci->flags = 0; + } +@@ -665,6 +669,7 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o + struct percpu_cluster *cluster; + struct swap_cluster_info *ci, *n; + unsigned int offset, found = 0; ++ LIST_HEAD(fraged); + + new_cluster: + lockdep_assert_held(&si->lock); +@@ -685,13 +690,29 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o + + if (order < PMD_ORDER) { + list_for_each_entry_safe(ci, n, &si->nonfull_clusters[order], list) { ++ list_move_tail(&ci->list, &fraged); ++ ci->flags = CLUSTER_FLAG_FRAG; + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), + &found, order, usage); + if (found) +- goto done; ++ break; + } ++ ++ if (!found) { ++ list_for_each_entry_safe(ci, n, &si->frag_clusters[order], list) { ++ offset = 
alloc_swap_scan_cluster(si, cluster_offset(si, ci), ++ &found, order, usage); ++ if (found) ++ break; ++ } ++ } ++ ++ list_splice_tail(&fraged, &si->frag_clusters[order]); + } + ++ if (found) ++ goto done; ++ + if (!list_empty(&si->discard_clusters)) { + /* + * we don't have free cluster but have some clusters in +@@ -705,7 +726,17 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o + if (order) + goto done; + ++ /* Order 0 stealing from higher order */ + for (int o = 1; o < SWAP_NR_ORDERS; o++) { ++ if (!list_empty(&si->frag_clusters[o])) { ++ ci = list_first_entry(&si->frag_clusters[o], ++ struct swap_cluster_info, list); ++ offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, ++ 0, usage); ++ VM_BUG_ON(!found); ++ goto done; ++ } ++ + if (!list_empty(&si->nonfull_clusters[o])) { + ci = list_first_entry(&si->nonfull_clusters[o], struct swap_cluster_info, + list); +@@ -3110,8 +3141,10 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, + INIT_LIST_HEAD(&p->free_clusters); + INIT_LIST_HEAD(&p->discard_clusters); + +- for (i = 0; i < SWAP_NR_ORDERS; i++) ++ for (i = 0; i < SWAP_NR_ORDERS; i++) { + INIT_LIST_HEAD(&p->nonfull_clusters[i]); ++ INIT_LIST_HEAD(&p->frag_clusters[i]); ++ } + + for (i = 0; i < swap_header->info.nr_badpages; i++) { + unsigned int page_nr = swap_header->info.badpages[i]; +-- +Gitee + + +From 7c0f2c55f9a21373319df1952070b162b3c6be8a Mon Sep 17 00:00:00 2001 +From: Kairui Song +Date: Wed, 18 Dec 2024 17:51:14 +0800 +Subject: [PATCH 09/14] mm: swap: relaim the cached parts that got scanned + +mainline inclusion +from mainline-v6.12-rc1 +commit 661383c6111a38c88df61af6bfbcfacd2ff20a67 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=661383c6111a38c88df61af6bfbcfacd2ff20a67 + +-------------------------------- + +This commit implements reclaim during scan for cluster allocator. + +Cluster scanning were unable to reuse SWAP_HAS_CACHE slots, which could +result in low allocation success rate or early OOM. + +So to ensure maximum allocation success rate, integrate reclaiming with +scanning. If found a range of suitable swap slots but fragmented due to +HAS_CACHE, just try to reclaim the slots. 
+ +Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-8-cb9c148b9297@kernel.org +Signed-off-by: Kairui Song +Reported-by: Barry Song <21cnbao@gmail.com> +Cc: Chris Li +Cc: "Huang, Ying" +Cc: Hugh Dickins +Cc: Kalesh Singh +Cc: Ryan Roberts +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + include/linux/swap.h | 1 + + mm/swapfile.c | 140 +++++++++++++++++++++++++++++++++---------- + 2 files changed, 110 insertions(+), 31 deletions(-) + +diff --git a/include/linux/swap.h b/include/linux/swap.h +index 81188caed2d2..83b1bcbaf2ec 100644 +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -313,6 +313,7 @@ struct swap_info_struct { + /* list of cluster that contains at least one free slot */ + struct list_head frag_clusters[SWAP_NR_ORDERS]; + /* list of cluster that are fragmented or contented */ ++ unsigned int frag_cluster_nr[SWAP_NR_ORDERS]; + unsigned int lowest_bit; /* index of first free in swap_map */ + unsigned int highest_bit; /* index of last free in swap_map */ + unsigned int pages; /* total of usable pages of swap */ +diff --git a/mm/swapfile.c b/mm/swapfile.c +index 7c71e7df9cf3..45f73b73a92f 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -512,6 +512,10 @@ static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info * + VM_BUG_ON(ci->count != 0); + lockdep_assert_held(&si->lock); + lockdep_assert_held(&ci->lock); ++ ++ if (ci->flags & CLUSTER_FLAG_FRAG) ++ si->frag_cluster_nr[ci->order]--; ++ + /* + * If the swap is discardable, prepare discard the cluster + * instead of free it immediately. The cluster will be freed +@@ -571,31 +575,84 @@ static void dec_cluster_info_page(struct swap_info_struct *p, + + if (!(ci->flags & CLUSTER_FLAG_NONFULL)) { + VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); +- if (ci->flags & CLUSTER_FLAG_FRAG) ++ if (ci->flags & CLUSTER_FLAG_FRAG) { ++ p->frag_cluster_nr[ci->order]--; + list_move_tail(&ci->list, &p->nonfull_clusters[ci->order]); +- else ++ } else { + list_add_tail(&ci->list, &p->nonfull_clusters[ci->order]); ++ } + ci->flags = CLUSTER_FLAG_NONFULL; + } + } + +-static inline bool cluster_scan_range(struct swap_info_struct *si, unsigned int start, +- unsigned int nr_pages) ++static bool cluster_reclaim_range(struct swap_info_struct *si, ++ struct swap_cluster_info *ci, ++ unsigned long start, unsigned long end) + { +- unsigned char *p = si->swap_map + start; +- unsigned char *end = p + nr_pages; ++ unsigned char *map = si->swap_map; ++ unsigned long offset; ++ ++ spin_unlock(&ci->lock); ++ spin_unlock(&si->lock); ++ ++ for (offset = start; offset < end; offset++) { ++ switch (READ_ONCE(map[offset])) { ++ case 0: ++ continue; ++ case SWAP_HAS_CACHE: ++ if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT) > 0) ++ continue; ++ goto out; ++ default: ++ goto out; ++ } ++ } ++out: ++ spin_lock(&si->lock); ++ spin_lock(&ci->lock); + +- while (p < end) +- if (*p++) ++ /* ++ * Recheck the range no matter reclaim succeeded or not, the slot ++ * could have been be freed while we are not holding the lock. 
++ */ ++ for (offset = start; offset < end; offset++) ++ if (READ_ONCE(map[offset])) + return false; + + return true; + } + ++static bool cluster_scan_range(struct swap_info_struct *si, ++ struct swap_cluster_info *ci, ++ unsigned long start, unsigned int nr_pages) ++{ ++ unsigned long offset, end = start + nr_pages; ++ unsigned char *map = si->swap_map; ++ bool need_reclaim = false; + +-static inline void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, +- unsigned int start, unsigned char usage, +- unsigned int order) ++ for (offset = start; offset < end; offset++) { ++ switch (READ_ONCE(map[offset])) { ++ case 0: ++ continue; ++ case SWAP_HAS_CACHE: ++ if (!vm_swap_full()) ++ return false; ++ need_reclaim = true; ++ continue; ++ default: ++ return false; ++ } ++ } ++ ++ if (need_reclaim) ++ return cluster_reclaim_range(si, ci, start, end); ++ ++ return true; ++} ++ ++static void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, ++ unsigned int start, unsigned char usage, ++ unsigned int order) + { + unsigned int nr_pages = 1 << order; + +@@ -614,6 +671,8 @@ static inline void cluster_alloc_range(struct swap_info_struct *si, struct swap_ + if (ci->count == SWAPFILE_CLUSTER) { + VM_BUG_ON(!(ci->flags & + (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL | CLUSTER_FLAG_FRAG))); ++ if (ci->flags & CLUSTER_FLAG_FRAG) ++ si->frag_cluster_nr[ci->order]--; + list_del(&ci->list); + ci->flags = 0; + } +@@ -639,7 +698,7 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigne + } + + while (offset <= end) { +- if (cluster_scan_range(si, offset, nr_pages)) { ++ if (cluster_scan_range(si, ci, offset, nr_pages)) { + cluster_alloc_range(si, ci, offset, usage, order); + *foundp = offset; + if (ci->count == SWAPFILE_CLUSTER) { +@@ -667,9 +726,8 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o + unsigned char usage) + { + struct percpu_cluster *cluster; +- struct swap_cluster_info *ci, *n; ++ struct swap_cluster_info *ci; + unsigned int offset, found = 0; +- LIST_HEAD(fraged); + + new_cluster: + lockdep_assert_held(&si->lock); +@@ -689,25 +747,42 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o + } + + if (order < PMD_ORDER) { +- list_for_each_entry_safe(ci, n, &si->nonfull_clusters[order], list) { +- list_move_tail(&ci->list, &fraged); ++ unsigned int frags = 0; ++ ++ while (!list_empty(&si->nonfull_clusters[order])) { ++ ci = list_first_entry(&si->nonfull_clusters[order], ++ struct swap_cluster_info, list); ++ list_move_tail(&ci->list, &si->frag_clusters[order]); + ci->flags = CLUSTER_FLAG_FRAG; ++ si->frag_cluster_nr[order]++; + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), + &found, order, usage); ++ frags++; + if (found) + break; + } + + if (!found) { +- list_for_each_entry_safe(ci, n, &si->frag_clusters[order], list) { ++ /* ++ * Nonfull clusters are moved to frag tail if we reached ++ * here, count them too, don't over scan the frag list. ++ */ ++ while (frags < si->frag_cluster_nr[order]) { ++ ci = list_first_entry(&si->frag_clusters[order], ++ struct swap_cluster_info, list); ++ /* ++ * Rotate the frag list to iterate, they were all failing ++ * high order allocation or moved here due to per-CPU usage, ++ * this help keeping usable cluster ahead. 
++ */ ++ list_move_tail(&ci->list, &si->frag_clusters[order]); + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), + &found, order, usage); ++ frags++; + if (found) + break; + } + } +- +- list_splice_tail(&fraged, &si->frag_clusters[order]); + } + + if (found) +@@ -728,25 +803,28 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o + + /* Order 0 stealing from higher order */ + for (int o = 1; o < SWAP_NR_ORDERS; o++) { +- if (!list_empty(&si->frag_clusters[o])) { ++ /* ++ * Clusters here have at least one usable slots and can't fail order 0 ++ * allocation, but reclaim may drop si->lock and race with another user. ++ */ ++ while (!list_empty(&si->frag_clusters[o])) { + ci = list_first_entry(&si->frag_clusters[o], + struct swap_cluster_info, list); +- offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, +- 0, usage); +- VM_BUG_ON(!found); +- goto done; ++ offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), ++ &found, 0, usage); ++ if (found) ++ goto done; + } + +- if (!list_empty(&si->nonfull_clusters[o])) { +- ci = list_first_entry(&si->nonfull_clusters[o], struct swap_cluster_info, +- list); ++ while (!list_empty(&si->nonfull_clusters[o])) { ++ ci = list_first_entry(&si->nonfull_clusters[o], ++ struct swap_cluster_info, list); + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), + &found, 0, usage); +- VM_BUG_ON(!found); +- goto done; ++ if (found) ++ goto done; + } + } +- + done: + cluster->next[order] = offset; + return found; +@@ -3144,6 +3222,7 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, + for (i = 0; i < SWAP_NR_ORDERS; i++) { + INIT_LIST_HEAD(&p->nonfull_clusters[i]); + INIT_LIST_HEAD(&p->frag_clusters[i]); ++ p->frag_cluster_nr[i] = 0; + } + + for (i = 0; i < swap_header->info.nr_badpages; i++) { +@@ -3187,7 +3266,6 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, + if (!cluster_info) + return nr_extents; + +- + /* + * Reduce false cache line sharing between cluster_info and + * sharing same address space. +-- +Gitee + + +From da3342ba73e419beb8f4b793ff077b763c27b1df Mon Sep 17 00:00:00 2001 +From: Kairui Song +Date: Wed, 18 Dec 2024 17:51:15 +0800 +Subject: [PATCH 10/14] mm: swap: add a adaptive full cluster cache reclaim + +mainline inclusion +from mainline-v6.12-rc1 +commit 2cacbdfdee65b18f9952620e762eab043d71b564 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2cacbdfdee65b18f9952620e762eab043d71b564 + +-------------------------------- + +Link all full cluster with one full list, and reclaim from it when the +allocation have ran out of all usable clusters. + +There are many reason a folio can end up being in the swap cache while +having no swap count reference. So the best way to search for such slots +is still by iterating the swap clusters. + +With the list as an LRU, iterating from the oldest cluster and keep them +rotating is a very doable and clean way to free up potentially not inuse +clusters. + +When any allocation failure, try reclaim and rotate only one cluster. +This is adaptive for high order allocations they can tolerate fallback. +So this avoids latency, and give the full cluster list an fair chance to +get reclaimed. It release the usage stress for the fallback order 0 +allocation or following up high order allocation. + +If the swap device is getting very full, reclaim more aggresively to +ensure no OOM will happen. 
This ensures order 0 heavy workload won't go +OOM as order 0 won't fail if any cluster still have any space. + +[ryncsn@gmail.com: fix discard of full cluster] + Link: https://lkml.kernel.org/r/CAMgjq7CWwK75_2Zi5P40K08pk9iqOcuWKL6khu=x4Yg_nXaQag@mail.gmail.com +Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-9-cb9c148b9297@kernel.org +Signed-off-by: Kairui Song +Reported-by: Barry Song <21cnbao@gmail.com> +Cc: Chris Li +Cc: "Huang, Ying" +Cc: Hugh Dickins +Cc: Kalesh Singh +Cc: Ryan Roberts +Cc: David Hildenbrand +Cc: Kairui Song +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + include/linux/swap.h | 2 ++ + mm/swapfile.c | 68 +++++++++++++++++++++++++++++++++++--------- + 2 files changed, 57 insertions(+), 13 deletions(-) + +diff --git a/include/linux/swap.h b/include/linux/swap.h +index 83b1bcbaf2ec..1664655aa7c8 100644 +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -272,6 +272,7 @@ struct swap_cluster_info { + #define CLUSTER_FLAG_FREE 1 /* This cluster is free */ + #define CLUSTER_FLAG_NONFULL 2 /* This cluster is on nonfull list */ + #define CLUSTER_FLAG_FRAG 4 /* This cluster is on nonfull list */ ++#define CLUSTER_FLAG_FULL 8 /* This cluster is on full list */ + + /* + * The first page in the swap file is the swap header, which is always marked +@@ -309,6 +310,7 @@ struct swap_info_struct { + unsigned char *swap_map; /* vmalloc'ed array of usage counts */ + struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ + struct list_head free_clusters; /* free clusters list */ ++ struct list_head full_clusters; /* full clusters list */ + struct list_head nonfull_clusters[SWAP_NR_ORDERS]; + /* list of cluster that contains at least one free slot */ + struct list_head frag_clusters[SWAP_NR_ORDERS]; +diff --git a/mm/swapfile.c b/mm/swapfile.c +index 45f73b73a92f..389e14f0fc3c 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -439,10 +439,7 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, + SWAP_MAP_BAD, SWAPFILE_CLUSTER); + + VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); +- if (ci->flags & CLUSTER_FLAG_NONFULL) +- list_move_tail(&ci->list, &si->discard_clusters); +- else +- list_add_tail(&ci->list, &si->discard_clusters); ++ list_move_tail(&ci->list, &si->discard_clusters); + ci->flags = 0; + schedule_work(&si->discard_work); + } +@@ -452,7 +449,7 @@ static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info + lockdep_assert_held(&si->lock); + lockdep_assert_held(&ci->lock); + +- if (ci->flags & CLUSTER_FLAG_NONFULL) ++ if (ci->flags) + list_move_tail(&ci->list, &si->free_clusters); + else + list_add_tail(&ci->list, &si->free_clusters); +@@ -479,7 +476,6 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si) + SWAPFILE_CLUSTER); + + spin_lock(&si->lock); +- + spin_lock(&ci->lock); + __free_cluster(si, ci); + memset(si->swap_map + idx * SWAPFILE_CLUSTER, +@@ -575,12 +571,9 @@ static void dec_cluster_info_page(struct swap_info_struct *p, + + if (!(ci->flags & CLUSTER_FLAG_NONFULL)) { + VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); +- if (ci->flags & CLUSTER_FLAG_FRAG) { ++ if (ci->flags & CLUSTER_FLAG_FRAG) + p->frag_cluster_nr[ci->order]--; +- list_move_tail(&ci->list, &p->nonfull_clusters[ci->order]); +- } else { +- list_add_tail(&ci->list, &p->nonfull_clusters[ci->order]); +- } ++ list_move_tail(&ci->list, &p->nonfull_clusters[ci->order]); + ci->flags = CLUSTER_FLAG_NONFULL; + } + } +@@ -673,8 +666,8 @@ static void cluster_alloc_range(struct swap_info_struct *si, struct 
swap_cluster + (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL | CLUSTER_FLAG_FRAG))); + if (ci->flags & CLUSTER_FLAG_FRAG) + si->frag_cluster_nr[ci->order]--; +- list_del(&ci->list); +- ci->flags = 0; ++ list_move_tail(&ci->list, &si->full_clusters); ++ ci->flags = CLUSTER_FLAG_FULL; + } + } + +@@ -717,6 +710,46 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigne + return offset; + } + ++static void swap_reclaim_full_clusters(struct swap_info_struct *si) ++{ ++ long to_scan = 1; ++ unsigned long offset, end; ++ struct swap_cluster_info *ci; ++ unsigned char *map = si->swap_map; ++ int nr_reclaim, total_reclaimed = 0; ++ ++ if (atomic_long_read(&nr_swap_pages) <= SWAPFILE_CLUSTER) ++ to_scan = si->inuse_pages / SWAPFILE_CLUSTER; ++ ++ while (!list_empty(&si->full_clusters)) { ++ ci = list_first_entry(&si->full_clusters, struct swap_cluster_info, list); ++ list_move_tail(&ci->list, &si->full_clusters); ++ offset = cluster_offset(si, ci); ++ end = min(si->max, offset + SWAPFILE_CLUSTER); ++ to_scan--; ++ ++ while (offset < end) { ++ if (READ_ONCE(map[offset]) == SWAP_HAS_CACHE) { ++ spin_unlock(&si->lock); ++ nr_reclaim = __try_to_reclaim_swap(si, offset, ++ TTRS_ANYWAY | TTRS_DIRECT); ++ spin_lock(&si->lock); ++ if (nr_reclaim > 0) { ++ offset += nr_reclaim; ++ total_reclaimed += nr_reclaim; ++ continue; ++ } else if (nr_reclaim < 0) { ++ offset += -nr_reclaim; ++ continue; ++ } ++ } ++ offset++; ++ } ++ if (to_scan <= 0 || total_reclaimed) ++ break; ++ } ++} ++ + /* + * Try to get swap entries with specified order from current cpu's swap entry + * pool (a cluster). This might involve allocating a new cluster for current CPU +@@ -825,7 +858,15 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o + goto done; + } + } ++ + done: ++ /* Try reclaim from full clusters if device is nearfull */ ++ if (vm_swap_full() && (!found || (si->pages - si->inuse_pages) < SWAPFILE_CLUSTER)) { ++ swap_reclaim_full_clusters(si); ++ if (!found && !order && si->pages != si->inuse_pages) ++ goto new_cluster; ++ } ++ + cluster->next[order] = offset; + return found; + } +@@ -3217,6 +3258,7 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, + nr_good_pages = maxpages - 1; /* omit header page */ + + INIT_LIST_HEAD(&p->free_clusters); ++ INIT_LIST_HEAD(&p->full_clusters); + INIT_LIST_HEAD(&p->discard_clusters); + + for (i = 0; i < SWAP_NR_ORDERS; i++) { +-- +Gitee + + +From c58f0af4fa7418fdeb2d6b4d1d8751b751649df9 Mon Sep 17 00:00:00 2001 +From: Kairui Song +Date: Wed, 18 Dec 2024 17:51:16 +0800 +Subject: [PATCH 11/14] mm, swap: fix allocation and scanning race with swapoff + +mainline inclusion +from mainline-v6.12 +commit 0ec8bc9e880eb576dc4492e8e0c7153ed0a71031 +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0ec8bc9e880eb576dc4492e8e0c7153ed0a71031 + +-------------------------------- + +There are two flags used to synchronize allocation and scanning with +swapoff: SWP_WRITEOK and SWP_SCANNING. + +SWP_WRITEOK: Swapoff will first unset this flag, at this point any further +swap allocation or scanning on this device should just abort so no more +new entries will be referencing this device. Swapoff will then unuse all +existing swap entries. + +SWP_SCANNING: This flag is set when device is being scanned. Swapoff will +wait for all scanner to stop before the final release of the swap device +structures to avoid UAF. 
Note this flag is the highest used bit of +si->flags so it could be added up arithmetically, if there are multiple +scanner. + +commit 5f843a9a3a1e ("mm: swap: separate SSD allocation from +scan_swap_map_slots()") ignored SWP_SCANNING and SWP_WRITEOK flags while +separating cluster allocation path from the old allocation path. Add the +flags back to fix swapoff race. The race is hard to trigger as si->lock +prevents most parallel operations, but si->lock could be dropped for +reclaim or discard. This issue is found during code review. + +This commit fixes this problem. For SWP_SCANNING, Just like before, set +the flag before scan and remove it afterwards. + +For SWP_WRITEOK, there are several places where si->lock could be dropped, +it will be error-prone and make the code hard to follow if we try to cover +these places one by one. So just do one check before the real allocation, +which is also very similar like before. With new cluster allocator it may +waste a bit of time iterating the clusters but won't take long, and +swapoff is not performance sensitive. + +Link: https://lkml.kernel.org/r/20241112083414.78174-1-ryncsn@gmail.com +Fixes: 5f843a9a3a1e ("mm: swap: separate SSD allocation from scan_swap_map_slots()") +Reported-by: "Huang, Ying" +Closes: https://lore.kernel.org/linux-mm/87a5es3f1f.fsf@yhuang6-desk2.ccr.corp.intel.com/ +Signed-off-by: Kairui Song +Cc: Barry Song +Cc: Chris Li +Cc: Hugh Dickins +Cc: Kalesh Singh +Cc: Ryan Roberts +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/swapfile.c | 22 +++++++++++++++++++--- + 1 file changed, 19 insertions(+), 3 deletions(-) + +diff --git a/mm/swapfile.c b/mm/swapfile.c +index 389e14f0fc3c..e620040b9181 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -643,12 +643,15 @@ static bool cluster_scan_range(struct swap_info_struct *si, + return true; + } + +-static void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, ++static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, + unsigned int start, unsigned char usage, + unsigned int order) + { + unsigned int nr_pages = 1 << order; + ++ if (!(si->flags & SWP_WRITEOK)) ++ return false; ++ + if (cluster_is_free(ci)) { + if (nr_pages < SWAPFILE_CLUSTER) { + list_move_tail(&ci->list, &si->nonfull_clusters[order]); +@@ -669,6 +672,8 @@ static void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster + list_move_tail(&ci->list, &si->full_clusters); + ci->flags = CLUSTER_FLAG_FULL; + } ++ ++ return true; + } + + static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigned long offset, +@@ -692,7 +697,10 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigne + + while (offset <= end) { + if (cluster_scan_range(si, ci, offset, nr_pages)) { +- cluster_alloc_range(si, ci, offset, usage, order); ++ if (!cluster_alloc_range(si, ci, offset, usage, order)) { ++ offset = SWAP_NEXT_INVALID; ++ goto done; ++ } + *foundp = offset; + if (ci->count == SWAPFILE_CLUSTER) { + offset = SWAP_NEXT_INVALID; +@@ -775,7 +783,11 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o + if (!list_empty(&si->free_clusters)) { + ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage); +- VM_BUG_ON(!found); ++ /* ++ * Either we didn't touch the cluster due to swapoff, ++ * or the allocation must success. 
++ */ ++ VM_BUG_ON((si->flags & SWP_WRITEOK) && !found); + goto done; + } + +@@ -997,6 +1009,8 @@ static int cluster_alloc_swap(struct swap_info_struct *si, + + VM_BUG_ON(!si->cluster_info); + ++ si->flags += SWP_SCANNING; ++ + while (n_ret < nr) { + unsigned long offset = cluster_alloc_swap_entry(si, order, usage); + +@@ -1005,6 +1019,8 @@ static int cluster_alloc_swap(struct swap_info_struct *si, + slots[n_ret++] = swp_entry(si->type, offset); + } + ++ si->flags -= SWP_SCANNING; ++ + return n_ret; + } + +-- +Gitee + + +From 6c0fa586bd1a1b04a8b5bc542e85cee15197075b Mon Sep 17 00:00:00 2001 +From: Jeongjun Park +Date: Wed, 18 Dec 2024 17:51:17 +0800 +Subject: [PATCH 12/14] mm: swap: prevent possible data-race in + __try_to_reclaim_swap + +mainline inclusion +from mainline-v6.12-rc4 +commit 818f916e3a07bf0c64bbf5e250ad209eebe21c85 +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=818f916e3a07bf0c64bbf5e250ad209eebe21c85 + +-------------------------------- + +A report [1] was uploaded from syzbot. + +In the previous commit 862590ac3708 ("mm: swap: allow cache reclaim to +skip slot cache"), the __try_to_reclaim_swap() function reads offset and +folio->entry from folio without folio_lock protection. + +In the currently reported KCSAN log, it is assumed that the actual +data-race will not occur because the calltrace that does WRITE already +obtains the folio_lock and then writes. + +However, the existing __try_to_reclaim_swap() function was already +implemented to perform reads under folio_lock protection [1], and there is +a risk of a data-race occurring through a function other than the one +shown in the KCSAN log. + +Therefore, I think it is appropriate to change +read operations for folio to be performed under folio_lock. 
+ +[1] + +================================================================== +BUG: KCSAN: data-race in __delete_from_swap_cache / __try_to_reclaim_swap + +write to 0xffffea0004c90328 of 8 bytes by task 5186 on cpu 0: + __delete_from_swap_cache+0x1f0/0x290 mm/swap_state.c:163 + delete_from_swap_cache+0x72/0xe0 mm/swap_state.c:243 + folio_free_swap+0x1d8/0x1f0 mm/swapfile.c:1850 + free_swap_cache mm/swap_state.c:293 [inline] + free_pages_and_swap_cache+0x1fc/0x410 mm/swap_state.c:325 + __tlb_batch_free_encoded_pages mm/mmu_gather.c:136 [inline] + tlb_batch_pages_flush mm/mmu_gather.c:149 [inline] + tlb_flush_mmu_free mm/mmu_gather.c:366 [inline] + tlb_flush_mmu+0x2cf/0x440 mm/mmu_gather.c:373 + zap_pte_range mm/memory.c:1700 [inline] + zap_pmd_range mm/memory.c:1739 [inline] + zap_pud_range mm/memory.c:1768 [inline] + zap_p4d_range mm/memory.c:1789 [inline] + unmap_page_range+0x1f3c/0x22d0 mm/memory.c:1810 + unmap_single_vma+0x142/0x1d0 mm/memory.c:1856 + unmap_vmas+0x18d/0x2b0 mm/memory.c:1900 + exit_mmap+0x18a/0x690 mm/mmap.c:1864 + __mmput+0x28/0x1b0 kernel/fork.c:1347 + mmput+0x4c/0x60 kernel/fork.c:1369 + exit_mm+0xe4/0x190 kernel/exit.c:571 + do_exit+0x55e/0x17f0 kernel/exit.c:926 + do_group_exit+0x102/0x150 kernel/exit.c:1088 + get_signal+0xf2a/0x1070 kernel/signal.c:2917 + arch_do_signal_or_restart+0x95/0x4b0 arch/x86/kernel/signal.c:337 + exit_to_user_mode_loop kernel/entry/common.c:111 [inline] + exit_to_user_mode_prepare include/linux/entry-common.h:328 [inline] + __syscall_exit_to_user_mode_work kernel/entry/common.c:207 [inline] + syscall_exit_to_user_mode+0x59/0x130 kernel/entry/common.c:218 + do_syscall_64+0xd6/0x1c0 arch/x86/entry/common.c:89 + entry_SYSCALL_64_after_hwframe+0x77/0x7f + +read to 0xffffea0004c90328 of 8 bytes by task 5189 on cpu 1: + __try_to_reclaim_swap+0x9d/0x510 mm/swapfile.c:198 + free_swap_and_cache_nr+0x45d/0x8a0 mm/swapfile.c:1915 + zap_pte_range mm/memory.c:1656 [inline] + zap_pmd_range mm/memory.c:1739 [inline] + zap_pud_range mm/memory.c:1768 [inline] + zap_p4d_range mm/memory.c:1789 [inline] + unmap_page_range+0xcf8/0x22d0 mm/memory.c:1810 + unmap_single_vma+0x142/0x1d0 mm/memory.c:1856 + unmap_vmas+0x18d/0x2b0 mm/memory.c:1900 + exit_mmap+0x18a/0x690 mm/mmap.c:1864 + __mmput+0x28/0x1b0 kernel/fork.c:1347 + mmput+0x4c/0x60 kernel/fork.c:1369 + exit_mm+0xe4/0x190 kernel/exit.c:571 + do_exit+0x55e/0x17f0 kernel/exit.c:926 + __do_sys_exit kernel/exit.c:1055 [inline] + __se_sys_exit kernel/exit.c:1053 [inline] + __x64_sys_exit+0x1f/0x20 kernel/exit.c:1053 + x64_sys_call+0x2d46/0x2d60 arch/x86/include/generated/asm/syscalls_64.h:61 + do_syscall_x64 arch/x86/entry/common.c:52 [inline] + do_syscall_64+0xc9/0x1c0 arch/x86/entry/common.c:83 + entry_SYSCALL_64_after_hwframe+0x77/0x7f + +value changed: 0x0000000000000242 -> 0x0000000000000000 + +Link: https://lkml.kernel.org/r/20241007070623.23340-1-aha310510@gmail.com +Reported-by: syzbot+fa43f1b63e3aa6f66329@syzkaller.appspotmail.com +Fixes: 862590ac3708 ("mm: swap: allow cache reclaim to skip slot cache") +Signed-off-by: Jeongjun Park +Acked-by: Chris Li +Reviewed-by: Kairui Song +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/swapfile.c | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +diff --git a/mm/swapfile.c b/mm/swapfile.c +index e620040b9181..c5148f16fb53 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -174,9 +174,6 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, + if (IS_ERR(folio)) + return 0; + +- /* offset could point to the middle of a 
large folio */ +- entry = folio->swap; +- offset = swp_offset(entry); + nr_pages = folio_nr_pages(folio); + ret = -nr_pages; + +@@ -190,6 +187,10 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, + if (!folio_trylock(folio)) + goto out; + ++ /* offset could point to the middle of a large folio */ ++ entry = folio->swap; ++ offset = swp_offset(entry); ++ + need_reclaim = ((flags & TTRS_ANYWAY) || + ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || + ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))); +-- +Gitee + + +From 849e43b208ba22a3ce5dd24388afe85ee6d30e82 Mon Sep 17 00:00:00 2001 +From: Kairui Song +Date: Wed, 18 Dec 2024 17:51:18 +0800 +Subject: [PATCH 13/14] mm, swap: avoid over reclaim of full clusters + +mainline inclusion +from mainline-v6.12-rc6 +commit 5168a68eb78fa1c67a8b2d31d0642c7fd866cc12 +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5168a68eb78fa1c67a8b2d31d0642c7fd866cc12 + +-------------------------------- + +When running low on usable slots, cluster allocator will try to reclaim +the full clusters aggressively to reclaim HAS_CACHE slots. This +guarantees that as long as there are any usable slots, HAS_CACHE or not, +the swap device will be usable and workload won't go OOM early. + +Before the cluster allocator, swap allocator fails easily if device is +filled up with reclaimable HAS_CACHE slots. Which can be easily +reproduced with following simple program: + + #include + #include + #include + #include + #define SIZE 8192UL * 1024UL * 1024UL + int main(int argc, char **argv) { + long tmp; + char *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + memset(p, 0, SIZE); + madvise(p, SIZE, MADV_PAGEOUT); + for (unsigned long i = 0; i < SIZE; ++i) + tmp += p[i]; + getchar(); /* Pause */ + return 0; + } + +Setup an 8G non ramdisk swap, the first run of the program will swapout 8G +ram successfully. But run same program again after the first run paused, +the second run can't swapout all 8G memory as now half of the swap device +is pinned by HAS_CACHE. There was a random scan in the old allocator that +may reclaim part of the HAS_CACHE by luck, but it's unreliable. + +The new allocator's added reclaim of full clusters when device is low on +usable slots. But when multiple CPUs are seeing the device is low on +usable slots at the same time, they ran into a thundering herd problem. + +This is an observable problem on large machine with mass parallel +workload, as full cluster reclaim is slower on large swap device and +higher number of CPUs will also make things worse. + +Testing using a 128G ZRAM on a 48c96t system. When the swap device is +very close to full (eg. 124G / 128G), running build linux kernel with +make -j96 in a 1G memory cgroup will hung (not a softlockup though) +spinning in full cluster reclaim for about ~5min before go OOM. + +To solve this, split the full reclaim into two parts: + +- Instead of do a synchronous aggressively reclaim when device is low, + do only one aggressively reclaim when device is strictly full with a + kworker. This still ensures in worst case the device won't be unusable + because of HAS_CACHE slots. + +- To avoid allocation (especially higher order) suffer from HAS_CACHE + filling up clusters and kworker not responsive enough, do one synchronous + scan every time the free list is drained, and only scan one cluster. 
This + is kind of similar to the random reclaim before, keeps the full clusters + rotated and has a minimal latency. This should provide a fair reclaim + strategy suitable for most workloads. + +Link: https://lkml.kernel.org/r/20241022175512.10398-1-ryncsn@gmail.com +Fixes: 2cacbdfdee65 ("mm: swap: add a adaptive full cluster cache reclaim") +Signed-off-by: Kairui Song +Cc: Barry Song +Cc: Chris Li +Cc: "Huang, Ying" +Cc: Hugh Dickins +Cc: Kalesh Singh +Cc: Ryan Roberts +Cc: Yosry Ahmed +Signed-off-by: Andrew Morton +Conflicts: + mm/swapfile.c +[ Context conflict with commit b85508d7de90. ] +Signed-off-by: Liu Shixin +--- + include/linux/swap.h | 1 + + mm/swapfile.c | 49 +++++++++++++++++++++++++++----------------- + 2 files changed, 31 insertions(+), 19 deletions(-) + +diff --git a/include/linux/swap.h b/include/linux/swap.h +index 1664655aa7c8..33396153afc0 100644 +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -348,6 +348,7 @@ struct swap_info_struct { + * list. + */ + struct work_struct discard_work; /* discard worker */ ++ struct work_struct reclaim_work; /* reclaim worker */ + struct list_head discard_clusters; /* discard clusters list */ + KABI_RESERVE(1) + KABI_RESERVE(2) +diff --git a/mm/swapfile.c b/mm/swapfile.c +index c5148f16fb53..6f3cbf3a2f0d 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -719,15 +719,16 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigne + return offset; + } + +-static void swap_reclaim_full_clusters(struct swap_info_struct *si) ++/* Return true if reclaimed a whole cluster */ ++static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) + { + long to_scan = 1; + unsigned long offset, end; + struct swap_cluster_info *ci; + unsigned char *map = si->swap_map; +- int nr_reclaim, total_reclaimed = 0; ++ int nr_reclaim; + +- if (atomic_long_read(&nr_swap_pages) <= SWAPFILE_CLUSTER) ++ if (force) + to_scan = si->inuse_pages / SWAPFILE_CLUSTER; + + while (!list_empty(&si->full_clusters)) { +@@ -737,28 +738,36 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si) + end = min(si->max, offset + SWAPFILE_CLUSTER); + to_scan--; + ++ spin_unlock(&si->lock); + while (offset < end) { + if (READ_ONCE(map[offset]) == SWAP_HAS_CACHE) { +- spin_unlock(&si->lock); + nr_reclaim = __try_to_reclaim_swap(si, offset, + TTRS_ANYWAY | TTRS_DIRECT); +- spin_lock(&si->lock); +- if (nr_reclaim > 0) { +- offset += nr_reclaim; +- total_reclaimed += nr_reclaim; +- continue; +- } else if (nr_reclaim < 0) { +- offset += -nr_reclaim; ++ if (nr_reclaim) { ++ offset += abs(nr_reclaim); + continue; + } + } + offset++; + } +- if (to_scan <= 0 || total_reclaimed) ++ spin_lock(&si->lock); ++ ++ if (to_scan <= 0) + break; + } + } + ++static void swap_reclaim_work(struct work_struct *work) ++{ ++ struct swap_info_struct *si; ++ ++ si = container_of(work, struct swap_info_struct, reclaim_work); ++ ++ spin_lock(&si->lock); ++ swap_reclaim_full_clusters(si, true); ++ spin_unlock(&si->lock); ++} ++ + /* + * Try to get swap entries with specified order from current cpu's swap entry + * pool (a cluster). 
This might involve allocating a new cluster for current CPU +@@ -792,6 +801,10 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o + goto done; + } + ++ /* Try reclaim from full clusters if free clusters list is drained */ ++ if (vm_swap_full()) ++ swap_reclaim_full_clusters(si, false); ++ + if (order < PMD_ORDER) { + unsigned int frags = 0; + +@@ -873,13 +886,6 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o + } + + done: +- /* Try reclaim from full clusters if device is nearfull */ +- if (vm_swap_full() && (!found || (si->pages - si->inuse_pages) < SWAPFILE_CLUSTER)) { +- swap_reclaim_full_clusters(si); +- if (!found && !order && si->pages != si->inuse_pages) +- goto new_cluster; +- } +- + cluster->next[order] = offset; + return found; + } +@@ -914,6 +920,9 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, + si->lowest_bit = si->max; + si->highest_bit = 0; + del_from_avail_list(si); ++ ++ if (vm_swap_full()) ++ schedule_work(&si->reclaim_work); + } + } + +@@ -2846,6 +2855,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) + wait_for_completion(&p->comp); + + flush_work(&p->discard_work); ++ flush_work(&p->reclaim_work); + + destroy_swap_extents(p); + if (p->flags & SWP_CONTINUED) +@@ -3382,6 +3392,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) + return PTR_ERR(p); + + INIT_WORK(&p->discard_work, swap_discard_work); ++ INIT_WORK(&p->reclaim_work, swap_reclaim_work); + + name = getname(specialfile); + if (IS_ERR(name)) { +-- +Gitee + + +From f19bcc77fc060549322618028b1ab9df253474ea Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Wed, 18 Dec 2024 17:51:19 +0800 +Subject: [PATCH 14/14] mm: swapfile: fix cluster reclaim work crash on + rotational devices + +mainline inclusion +from mainline-v6.12 +commit dcf32ea7ecede94796fb30231b3969d7c838374c +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=dcf32ea7ecede94796fb30231b3969d7c838374c + +-------------------------------- + +syzbot and Daan report a NULL pointer crash in the new full swap cluster +reclaim work: + +> Oops: general protection fault, probably for non-canonical address 0xdffffc0000000001: 0000 [#1] PREEMPT SMP KASAN PTI +> KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f] +> CPU: 1 UID: 0 PID: 51 Comm: kworker/1:1 Not tainted 6.12.0-rc6-syzkaller #0 +> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 +> Workqueue: events swap_reclaim_work +> RIP: 0010:__list_del_entry_valid_or_report+0x20/0x1c0 lib/list_debug.c:49 +> Code: 90 90 90 90 90 90 90 90 90 90 f3 0f 1e fa 48 89 fe 48 83 c7 08 48 83 ec 18 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 19 01 00 00 48 89 f2 48 8b 4e 08 48 b8 00 00 00 +> RSP: 0018:ffffc90000bb7c30 EFLAGS: 00010202 +> RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffff88807b9ae078 +> RDX: 0000000000000001 RSI: 0000000000000000 RDI: 0000000000000008 +> RBP: 0000000000000001 R08: 0000000000000001 R09: 0000000000000000 +> R10: 0000000000000001 R11: 000000000000004f R12: dffffc0000000000 +> R13: ffffffffffffffb8 R14: ffff88807b9ae000 R15: ffffc90003af1000 +> FS: 0000000000000000(0000) GS:ffff8880b8700000(0000) knlGS:0000000000000000 +> CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +> CR2: 00007fffaca68fb8 CR3: 00000000791c8000 CR4: 00000000003526f0 +> DR0: 
0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 +> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 +> Call Trace: +> +> __list_del_entry_valid include/linux/list.h:124 [inline] +> __list_del_entry include/linux/list.h:215 [inline] +> list_move_tail include/linux/list.h:310 [inline] +> swap_reclaim_full_clusters+0x109/0x460 mm/swapfile.c:748 +> swap_reclaim_work+0x2e/0x40 mm/swapfile.c:779 + +The syzbot console output indicates a virtual environment where swapfile +is on a rotational device. In this case, clusters aren't actually used, +and si->full_clusters is not initialized. Daan's report is from qemu, so +likely rotational too. + +Make sure to only schedule the cluster reclaim work when clusters are +actually in use. + +Link: https://lkml.kernel.org/r/20241107142335.GB1172372@cmpxchg.org +Link: https://lore.kernel.org/lkml/672ac50b.050a0220.2edce.1517.GAE@google.com/ +Link: https://github.com/systemd/systemd/issues/35044 +Fixes: 5168a68eb78f ("mm, swap: avoid over reclaim of full clusters") +Reported-by: syzbot+078be8bfa863cb9e0c6b@syzkaller.appspotmail.com +Signed-off-by: Johannes Weiner +Reported-by: Daan De Meyer +Cc: Kairui Song +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/swapfile.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/mm/swapfile.c b/mm/swapfile.c +index 6f3cbf3a2f0d..3b48159820f2 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -921,7 +921,7 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, + si->highest_bit = 0; + del_from_avail_list(si); + +- if (vm_swap_full()) ++ if (si->cluster_info && vm_swap_full()) + schedule_work(&si->reclaim_work); + } + } +-- +Gitee + diff --git a/kernel.spec b/kernel.spec index 884d5480d44cdbbbe954315eb3b1ef5ac30ee3c9..f2533d823151296f887ed00cbba605766e77de7d 100644 --- a/kernel.spec +++ b/kernel.spec @@ -1,5 +1,5 @@ %define with_signmodules 1 -%define with_kabichk 1 +%define with_kabichk 0 # Default without toolchain_clang %bcond_with toolchain_clang @@ -42,7 +42,7 @@ rm -f test_openEuler_sign.ko test_openEuler_sign.ko.sig %global upstream_sublevel 0 %global devel_release 68 %global maintenance_release .0.0 -%global pkg_release .73 +%global pkg_release .76 %global openeuler_lts 1 %global openeuler_major 2403 @@ -128,6 +128,26 @@ Patch0001: 0001-riscv-kernel.patch Patch0002: 0002-cpupower-clang-compile-support.patch Patch0003: 0003-x86_energy_perf_policy-clang-compile-support.patch Patch0004: 0004-turbostat-clang-compile-support.patch +Patch0005: 0005-include-msi-modify-kabi-size-of-msi_desc.patch +Patch0007: 0007-nfs-fix-the-loss-of-superblock-s-initialized-flags.patch +Patch0008: 0008-x86-config-Enable-CONFIG_CMA-by-default-in-openeuler.patch +Patch0009: 0009-x86-Kconfig-Select-CONFIG_CMA-if-CONFIG_HYGON_CSV-y.patch +Patch0010: 0010-tcp-Fix-use-after-free-of-nreq-in-reqsk_timer_handle.patch +Patch0012: 0012-bpf-Add-kabi-reserve-padding-for-uapi-struct-bpf_lin.patch +Patch0013: 0013-iommu-Reserve-extra-KABI-entry-for-struct-iopf_group.patch +Patch0014: 0014-seq_file-kabi-KABI-reservation-for-seq_file.patch +Patch0015: 0015-statx-kabi-KABI-reservation-for-kstat.patch +Patch0016: 0016-fs-Allow-fine-grained-control-of-folio-sizes.patch +Patch0017: 0017-Revert-cgroup-fix-uaf-when-proc_cpuset_show.patch +Patch0018: 0018-cgroup-Make-operations-on-the-cgroup-root_list-RCU-s.patch +Patch0019: 0019-cgroup-Move-rcu_head-up-near-the-top-of-cgroup_root.patch +Patch0020: 0020-cgroup-cpuset-Prevent-UAF-in-proc_cpuset_show.patch +Patch0021: 
0021-cgroup-add-more-reserve-kabi.patch +Patch0022: 0022-14223.patch +Patch0023: 0023-14224.patch +Patch0024: 0024-14225.patch +Patch0025: 0025-14226.patch +Patch0026: 0026-14227.patch #BuildRequires: BuildRequires: module-init-tools, patch >= 2.5.4, bash >= 2.03, tar @@ -330,6 +350,27 @@ tar -xjf %{SOURCE9998} mv kernel linux-%{KernelVer} cd linux-%{KernelVer} +%patch0005 -p1 +%patch0007 -p1 +%patch0008 -p1 +%patch0009 -p1 +%patch0010 -p1 +%patch0012 -p1 +%patch0013 -p1 +%patch0014 -p1 +%patch0015 -p1 +%patch0016 -p1 +%patch0017 -p1 +%patch0018 -p1 +%patch0019 -p1 +%patch0020 -p1 +%patch0021 -p1 +%patch0022 -p1 +%patch0023 -p1 +%patch0024 -p1 +%patch0025 -p1 +%patch0026 -p1 + %if 0%{?with_patch} cp %{SOURCE9000} . cp %{SOURCE9001} . @@ -1089,6 +1130,9 @@ fi %endif %changelog +* Thu Dec 19 2024 Zheng Zengkai - 6.6.0-68.0.0.76 +- performance test for kabi exclude sched + * Tue Dec 17 2024 Xie XiuQi - 6.6.0-68.0.0.73 - kabi: add kabi_ext1 list for checking - check-kabi: fix kabi check failed when no namespace