diff --git a/0005-include-msi-modify-kabi-size-of-msi_desc.patch b/0005-include-msi-modify-kabi-size-of-msi_desc.patch new file mode 100644 index 0000000000000000000000000000000000000000..79c77ab1e2d4ebfe75b6d3b522790ebcd6df6204 --- /dev/null +++ b/0005-include-msi-modify-kabi-size-of-msi_desc.patch @@ -0,0 +1,45 @@ +From 723d41836db7669ab658d3e07c62fcbe17d7d7f4 Mon Sep 17 00:00:00 2001 +From: zhengjunlong +Date: Fri, 11 Oct 2024 17:08:35 +0800 +Subject: [PATCH 01/17] include/msi: modify kabi size of msi_desc + +hulk inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/IAW8JF + +---------------------------------------------------- + +Change the size of the pre-embedded memory for msi_desc to 40 bytes. + +Signed-off-by: Zheng Junlong +--- + include/linux/msi.h | 11 ++++------- + 1 file changed, 4 insertions(+), 7 deletions(-) + +diff --git a/include/linux/msi.h b/include/linux/msi.h +index 7354ffb14856..5fd8a6caae98 100644 +--- a/include/linux/msi.h ++++ b/include/linux/msi.h +@@ -205,15 +205,12 @@ struct msi_desc { + union { + struct pci_msi_desc pci; + struct msi_desc_data data; +- KABI_RESERVE(1) +- KABI_RESERVE(2) +- KABI_RESERVE(3) +- KABI_RESERVE(4) ++ KABI_EXTEND_WITH_SIZE(KABI_RESERVE(1), 5) + }; ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + KABI_RESERVE(5) +- KABI_RESERVE(6) +- KABI_RESERVE(7) +- KABI_RESERVE(8) + }; + + /* +-- +2.25.1 + diff --git a/0007-nfs-fix-the-loss-of-superblock-s-initialized-flags.patch b/0007-nfs-fix-the-loss-of-superblock-s-initialized-flags.patch new file mode 100644 index 0000000000000000000000000000000000000000..1d3c32fe05be2df1ee24d756c0db25e7377882fd --- /dev/null +++ b/0007-nfs-fix-the-loss-of-superblock-s-initialized-flags.patch @@ -0,0 +1,40 @@ +From e68e6e3cf90ec8fb7893057c768d55e83855aaa0 Mon Sep 17 00:00:00 2001 +From: Li Lingfeng +Date: Mon, 16 Dec 2024 20:15:25 +0800 +Subject: [PATCH 03/17] nfs: fix the loss of superblock's initialized flags + +hulk inclusion +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IB42W1 + +-------------------------------- + +Commit 573573887e0b ("nfs: pass flags to second superblock") directly +assigns fc->sb_flags to dentry->d_sb->s_flags, which will cause the loss +of the initialized flags in dentry->d_sb->s_flags. + +Fix it by just passing SB_RDONLY from fc->sb_flags to +dentry->d_sb->s_flags. 
+ +Fixes: 573573887e0b ("nfs: pass flags to second superblock") +Signed-off-by: Li Lingfeng +--- + fs/nfs/nfs4super.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c +index bb13894ad152..e87f878178f3 100644 +--- a/fs/nfs/nfs4super.c ++++ b/fs/nfs/nfs4super.c +@@ -209,7 +209,7 @@ static int do_nfs4_mount(struct nfs_server *server, + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + +- dentry->d_sb->s_flags = fc->sb_flags; ++ dentry->d_sb->s_flags |= (fc->sb_flags & SB_RDONLY); + fc->root = dentry; + return 0; + } +-- +2.25.1 + diff --git a/0008-x86-config-Enable-CONFIG_CMA-by-default-in-openeuler.patch b/0008-x86-config-Enable-CONFIG_CMA-by-default-in-openeuler.patch new file mode 100644 index 0000000000000000000000000000000000000000..f9c3ab227f17a2ee820ed733a996a6e9e1fdc9d1 --- /dev/null +++ b/0008-x86-config-Enable-CONFIG_CMA-by-default-in-openeuler.patch @@ -0,0 +1,61 @@ +From 844a44e5a21be8062fd0c120a75e9ecf97427ae8 Mon Sep 17 00:00:00 2001 +From: hanliyang +Date: Mon, 16 Dec 2024 20:44:36 +0800 +Subject: [PATCH 04/17] x86/config: Enable CONFIG_CMA by default in + openeuler_defconfig + +hygon inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/IBBNJI +CVE: NA + +--------------------------- + +Enable CONFIG_CMA will change kabi. + +Enable CONFIG_CMA will also enable CONFIG_DMA_CMA. + +Signed-off-by: hanliyang +--- + arch/x86/configs/openeuler_defconfig | 18 +++++++++++++++++- + 1 file changed, 17 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig +index 8e8542796a13..adfaef0cb10c 100644 +--- a/arch/x86/configs/openeuler_defconfig ++++ b/arch/x86/configs/openeuler_defconfig +@@ -1158,7 +1158,11 @@ CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y + CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y + CONFIG_USE_PERCPU_NUMA_NODE_ID=y + CONFIG_HAVE_SETUP_PER_CPU_AREA=y +-# CONFIG_CMA is not set ++CONFIG_CMA=y ++# CONFIG_CMA_DEBUG is not set ++# CONFIG_CMA_DEBUGFS is not set ++# CONFIG_CMA_SYSFS is not set ++CONFIG_CMA_AREAS=19 + CONFIG_MEM_SOFT_DIRTY=y + CONFIG_GENERIC_EARLY_IOREMAP=y + CONFIG_DEFERRED_STRUCT_PAGE_INIT=y +@@ -9018,6 +9022,18 @@ CONFIG_ARCH_HAS_FORCE_DMA_UNENCRYPTED=y + CONFIG_SWIOTLB=y + # CONFIG_SWIOTLB_DYNAMIC is not set + CONFIG_DMA_COHERENT_POOL=y ++CONFIG_DMA_CMA=y ++# CONFIG_DMA_NUMA_CMA is not set ++ ++# ++# Default contiguous memory area size: ++# ++CONFIG_CMA_SIZE_MBYTES=0 ++CONFIG_CMA_SIZE_SEL_MBYTES=y ++# CONFIG_CMA_SIZE_SEL_PERCENTAGE is not set ++# CONFIG_CMA_SIZE_SEL_MIN is not set ++# CONFIG_CMA_SIZE_SEL_MAX is not set ++CONFIG_CMA_ALIGNMENT=8 + # CONFIG_DMA_API_DEBUG is not set + # CONFIG_DMA_MAP_BENCHMARK is not set + CONFIG_SGL_ALLOC=y +-- +2.25.1 + diff --git a/0009-x86-Kconfig-Select-CONFIG_CMA-if-CONFIG_HYGON_CSV-y.patch b/0009-x86-Kconfig-Select-CONFIG_CMA-if-CONFIG_HYGON_CSV-y.patch new file mode 100644 index 0000000000000000000000000000000000000000..79f223e8325d70865474d83d9c4c89e5d2ca9bf4 --- /dev/null +++ b/0009-x86-Kconfig-Select-CONFIG_CMA-if-CONFIG_HYGON_CSV-y.patch @@ -0,0 +1,35 @@ +From f0e6b8ca2a5b0bc1347906ff6b80422c4c9878b2 Mon Sep 17 00:00:00 2001 +From: hanliyang +Date: Mon, 16 Dec 2024 20:52:08 +0800 +Subject: [PATCH 05/17] x86/Kconfig: Select CONFIG_CMA if CONFIG_HYGON_CSV=y + +hygon inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/IBBNJI +CVE: NA + +--------------------------- + +The Hygon CSV3 use CMA to manage CSV3 guest's private memory. 
If the +CONFIG_HYGON_CSV is enabled, then enable CONFIG_CMA automatically. + +Signed-off-by: hanliyang +--- + arch/x86/Kconfig | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index fcd0c3b2065d..a6bbe6029121 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -2075,6 +2075,7 @@ config HYGON_CSV + bool "Hygon secure virtualization CSV support" + default y + depends on CPU_SUP_HYGON && AMD_MEM_ENCRYPT ++ select CONFIG_CMA + help + Hygon CSV integrates secure processor, memory encryption and + memory isolation to provide the ability to protect guest's private +-- +2.25.1 + diff --git a/0010-tcp-Fix-use-after-free-of-nreq-in-reqsk_timer_handle.patch b/0010-tcp-Fix-use-after-free-of-nreq-in-reqsk_timer_handle.patch new file mode 100644 index 0000000000000000000000000000000000000000..a07a0a576382ff80a1e12d00aabc365047db7efa --- /dev/null +++ b/0010-tcp-Fix-use-after-free-of-nreq-in-reqsk_timer_handle.patch @@ -0,0 +1,60 @@ +From 44c5a161852ac117a94ed7748784aecaab552b47 Mon Sep 17 00:00:00 2001 +From: Kuniyuki Iwashima +Date: Tue, 17 Dec 2024 16:33:23 +0800 +Subject: [PATCH 06/17] tcp: Fix use-after-free of nreq in + reqsk_timer_handler(). + +stable inclusion +from stable-v6.6.64 +commit 65ed89cad1f57034c256b016e89e8c0a4ec7c65b +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IBA6RL +CVE: NA + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=65ed89cad1f57034c256b016e89e8c0a4ec7c65b + +------------------------------------------------- + +[ Upstream commit c31e72d021db2714df03df6c42855a1db592716c ] + +The cited commit replaced inet_csk_reqsk_queue_drop_and_put() with +__inet_csk_reqsk_queue_drop() and reqsk_put() in reqsk_timer_handler(). + +Then, oreq should be passed to reqsk_put() instead of req; otherwise +use-after-free of nreq could happen when reqsk is migrated but the +retry attempt failed (e.g. due to timeout). + +Let's pass oreq to reqsk_put(). 
+ +Fixes: e8c526f2bdf1 ("tcp/dccp: Don't use timer_pending() in reqsk_queue_unlink().") +Reported-by: Liu Jian +Closes: https://lore.kernel.org/netdev/1284490f-9525-42ee-b7b8-ccadf6606f6d@huawei.com/ +Signed-off-by: Kuniyuki Iwashima +Reviewed-by: Vadim Fedorenko +Reviewed-by: Liu Jian +Reviewed-by: Eric Dumazet +Reviewed-by: Martin KaFai Lau +Link: https://patch.msgid.link/20241123174236.62438-1-kuniyu@amazon.com +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +Signed-off-by: Liu Jian +--- + net/ipv4/inet_connection_sock.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c +index ca8cc0988b61..bd032ac2376e 100644 +--- a/net/ipv4/inet_connection_sock.c ++++ b/net/ipv4/inet_connection_sock.c +@@ -1124,7 +1124,7 @@ static void reqsk_timer_handler(struct timer_list *t) + + drop: + __inet_csk_reqsk_queue_drop(sk_listener, oreq, true); +- reqsk_put(req); ++ reqsk_put(oreq); + } + + static bool reqsk_queue_hash_req(struct request_sock *req, +-- +2.25.1 + diff --git a/0012-bpf-Add-kabi-reserve-padding-for-uapi-struct-bpf_lin.patch b/0012-bpf-Add-kabi-reserve-padding-for-uapi-struct-bpf_lin.patch new file mode 100644 index 0000000000000000000000000000000000000000..9a958456d1487dd65305ecd45b7261c3186101ac --- /dev/null +++ b/0012-bpf-Add-kabi-reserve-padding-for-uapi-struct-bpf_lin.patch @@ -0,0 +1,63 @@ +From c189729809e4c7a6298126a76db608da2b571240 Mon Sep 17 00:00:00 2001 +From: Pu Lehui +Date: Wed, 18 Dec 2024 06:24:00 +0000 +Subject: [PATCH 08/17] bpf: Add kabi reserve padding for uapi struct + bpf_link_info + +hulk inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC248 + +-------------------------------- + +Add kabi reserve padding for uapi struct bpf_link_info + +Signed-off-by: Pu Lehui +--- + include/uapi/linux/bpf.h | 9 +++++++++ + tools/include/uapi/linux/bpf.h | 9 +++++++++ + 2 files changed, 18 insertions(+) + +diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h +index 482647774bf5..a660cb68c853 100644 +--- a/include/uapi/linux/bpf.h ++++ b/include/uapi/linux/bpf.h +@@ -6573,6 +6573,15 @@ struct bpf_link_info { + __u64 config; + __u32 type; + } event; /* BPF_PERF_EVENT_EVENT */ ++ struct { ++ __u64:64; ++ __u32:32; ++ __u32:32; ++ __u64:64; ++ __u64:64; ++ __u64:64; ++ __u64:64; ++ } kabi_reserve; + }; + } perf_event; + struct { +diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h +index c112c6f7c766..9b302242be6c 100644 +--- a/tools/include/uapi/linux/bpf.h ++++ b/tools/include/uapi/linux/bpf.h +@@ -6576,6 +6576,15 @@ struct bpf_link_info { + __u64 config; + __u32 type; + } event; /* BPF_PERF_EVENT_EVENT */ ++ struct { ++ __u64:64; ++ __u32:32; ++ __u32:32; ++ __u64:64; ++ __u64:64; ++ __u64:64; ++ __u64:64; ++ } kabi_reserve; + }; + } perf_event; + struct { +-- +2.25.1 + diff --git a/0013-iommu-Reserve-extra-KABI-entry-for-struct-iopf_group.patch b/0013-iommu-Reserve-extra-KABI-entry-for-struct-iopf_group.patch new file mode 100644 index 0000000000000000000000000000000000000000..43e830cb9ff9e7129dbbb2322c2698bdca0d8050 --- /dev/null +++ b/0013-iommu-Reserve-extra-KABI-entry-for-struct-iopf_group.patch @@ -0,0 +1,38 @@ +From bbfb8fd7b1297acf7769a814f3fbf919afd391dc Mon Sep 17 00:00:00 2001 +From: Zhang Zekun +Date: Wed, 18 Dec 2024 14:43:35 +0800 +Subject: [PATCH 09/17] iommu: Reserve extra KABI entry for struct iopf_group + +hulk inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/IBBRHP + 
+--------------------------------------------------------------- + +The list_head entry in iopf_group has been moved to iopf_group_extend +for KABI compatibility and the lack of KABI reserve entry. Reserve extra +kabi entry for future usage. + +Signed-off-by: Zhang Zekun +--- + include/linux/iommu.h | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/include/linux/iommu.h b/include/linux/iommu.h +index bb463cb96a44..83ec4bf9809e 100644 +--- a/include/linux/iommu.h ++++ b/include/linux/iommu.h +@@ -155,6 +155,10 @@ struct iopf_group { + KABI_USE(2, u32 cookie) + KABI_RESERVE(3) + KABI_RESERVE(4) ++ KABI_RESERVE(5) ++ KABI_RESERVE(6) ++ KABI_RESERVE(7) ++ KABI_RESERVE(8) + }; + + struct iopf_group_extend { +-- +2.25.1 + diff --git a/0014-seq_file-kabi-KABI-reservation-for-seq_file.patch b/0014-seq_file-kabi-KABI-reservation-for-seq_file.patch new file mode 100644 index 0000000000000000000000000000000000000000..371e3afefc94e3038552e8c5abc889b030d75667 --- /dev/null +++ b/0014-seq_file-kabi-KABI-reservation-for-seq_file.patch @@ -0,0 +1,45 @@ +From 1cb26ea1471efb775f2aa141863e82efead07d61 Mon Sep 17 00:00:00 2001 +From: Baokun Li +Date: Wed, 18 Dec 2024 15:21:56 +0800 +Subject: [PATCH 10/17] seq_file: kabi: KABI reservation for seq_file + +hulk inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC34X + +---------------------------------------------------------------------- + + structure size reserves reserved + seq_file 120 1 128 + seq_operations 32 1 40 + +Signed-off-by: Baokun Li +--- + include/linux/seq_file.h | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h +index 234bcdb1fba4..cf4a2258df85 100644 +--- a/include/linux/seq_file.h ++++ b/include/linux/seq_file.h +@@ -27,6 +27,8 @@ struct seq_file { + int poll_event; + const struct file *file; + void *private; ++ ++ KABI_RESERVE(1) + }; + + struct seq_operations { +@@ -34,6 +36,8 @@ struct seq_operations { + void (*stop) (struct seq_file *m, void *v); + void * (*next) (struct seq_file *m, void *v, loff_t *pos); + int (*show) (struct seq_file *m, void *v); ++ ++ KABI_RESERVE(1) + }; + + #define SEQ_SKIP 1 +-- +2.25.1 + diff --git a/0015-statx-kabi-KABI-reservation-for-kstat.patch b/0015-statx-kabi-KABI-reservation-for-kstat.patch new file mode 100644 index 0000000000000000000000000000000000000000..12b7151da8e418c07a999522b7cf35fd49fb33dd --- /dev/null +++ b/0015-statx-kabi-KABI-reservation-for-kstat.patch @@ -0,0 +1,38 @@ +From ed5b59b6c40d2563994c1f7b5a1321affb490d45 Mon Sep 17 00:00:00 2001 +From: Baokun Li +Date: Wed, 18 Dec 2024 15:23:01 +0800 +Subject: [PATCH 11/17] statx: kabi: KABI reservation for kstat + +hulk inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC24E + +---------------------------------------------------------------------- + + structure size reserves reserved mainline + kstat 160 4 192 184 + +Signed-off-by: Baokun Li +--- + include/linux/stat.h | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/include/linux/stat.h b/include/linux/stat.h +index 52150570d37a..d342e89b7aaa 100644 +--- a/include/linux/stat.h ++++ b/include/linux/stat.h +@@ -53,6 +53,11 @@ struct kstat { + u32 dio_mem_align; + u32 dio_offset_align; + u64 change_cookie; ++ ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + + /* These definitions are internal to the kernel for now. Mainly used by nfsd. 
*/ +-- +2.25.1 + diff --git a/0016-fs-Allow-fine-grained-control-of-folio-sizes.patch b/0016-fs-Allow-fine-grained-control-of-folio-sizes.patch new file mode 100644 index 0000000000000000000000000000000000000000..ca2556d094eb53b3afc70b790eda1820ab3cee4c --- /dev/null +++ b/0016-fs-Allow-fine-grained-control-of-folio-sizes.patch @@ -0,0 +1,200 @@ +From 30f7b1506ec798949e6ce99c023780b0306845c9 Mon Sep 17 00:00:00 2001 +From: "Matthew Wilcox (Oracle)" +Date: Wed, 18 Dec 2024 15:31:44 +0800 +Subject: [PATCH 12/17] fs: Allow fine-grained control of folio sizes + +mainline inclusion +from mainline-v6.10-rc2 +commit 84429b675bcfd2a518ae167ee4661cdf7539aa7d +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC20Q + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=84429b675bcfd2a518ae167ee4661cdf7539aa7d + +-------------------------------- + +We need filesystems to be able to communicate acceptable folio sizes +to the pagecache for a variety of uses (e.g. large block sizes). +Support a range of folio sizes between order-0 and order-31. + +Signed-off-by: Matthew Wilcox (Oracle) +Co-developed-by: Pankaj Raghav +Signed-off-by: Pankaj Raghav +Link: https://lore.kernel.org/r/20240822135018.1931258-2-kernel@pankajraghav.com +Tested-by: David Howells +Reviewed-by: Hannes Reinecke +Reviewed-by: Darrick J. Wong +Reviewed-by: Daniel Gomez +Signed-off-by: Christian Brauner +Conflicts: + include/linux/pagemap.h + mm/filemap.c +[Conflicts due to not merged 83ee0e20fd9f ("filemap: support disable large +folios on active inode")] +Signed-off-by: Long Li +--- + include/linux/pagemap.h | 90 +++++++++++++++++++++++++++++++++++------ + mm/readahead.c | 4 +- + 2 files changed, 79 insertions(+), 15 deletions(-) + +diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h +index 429627abfef4..e44e377661f2 100644 +--- a/include/linux/pagemap.h ++++ b/include/linux/pagemap.h +@@ -203,12 +203,21 @@ enum mapping_flags { + AS_EXITING = 4, /* final truncate in progress */ + /* writeback related tags are not used */ + AS_NO_WRITEBACK_TAGS = 5, +- AS_LARGE_FOLIO_SUPPORT = 6, +- AS_RELEASE_ALWAYS, /* Call ->release_folio(), even if no private data */ +- AS_STABLE_WRITES, /* must wait for writeback before modifying ++ AS_RELEASE_ALWAYS = 6, /* Call ->release_folio(), even if no private data */ ++ AS_STABLE_WRITES = 7, /* must wait for writeback before modifying + folio contents */ ++ AS_INACCESSIBLE = 8, /* Do not attempt direct R/W access to the mapping */ ++ /* Bits 16-25 are used for FOLIO_ORDER */ ++ AS_FOLIO_ORDER_BITS = 5, ++ AS_FOLIO_ORDER_MIN = 16, ++ AS_FOLIO_ORDER_MAX = AS_FOLIO_ORDER_MIN + AS_FOLIO_ORDER_BITS, + }; + ++#define AS_FOLIO_ORDER_BITS_MASK ((1u << AS_FOLIO_ORDER_BITS) - 1) ++#define AS_FOLIO_ORDER_MIN_MASK (AS_FOLIO_ORDER_BITS_MASK << AS_FOLIO_ORDER_MIN) ++#define AS_FOLIO_ORDER_MAX_MASK (AS_FOLIO_ORDER_BITS_MASK << AS_FOLIO_ORDER_MAX) ++#define AS_FOLIO_ORDER_MASK (AS_FOLIO_ORDER_MIN_MASK | AS_FOLIO_ORDER_MAX_MASK) ++ + /** + * mapping_set_error - record a writeback error in the address_space + * @mapping: the mapping in which an error should be set +@@ -348,9 +357,51 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) + #define MAX_XAS_ORDER (XA_CHUNK_SHIFT * 2 - 1) + #define MAX_PAGECACHE_ORDER min(MAX_XAS_ORDER, PREFERRED_MAX_PAGECACHE_ORDER) + ++/* ++ * mapping_set_folio_order_range() - Set the orders supported by a file. ++ * @mapping: The address space of the file. 
++ * @min: Minimum folio order (between 0-MAX_PAGECACHE_ORDER inclusive). ++ * @max: Maximum folio order (between @min-MAX_PAGECACHE_ORDER inclusive). ++ * ++ * The filesystem should call this function in its inode constructor to ++ * indicate which base size (min) and maximum size (max) of folio the VFS ++ * can use to cache the contents of the file. This should only be used ++ * if the filesystem needs special handling of folio sizes (ie there is ++ * something the core cannot know). ++ * Do not tune it based on, eg, i_size. ++ * ++ * Context: This should not be called while the inode is active as it ++ * is non-atomic. ++ */ ++static inline void mapping_set_folio_order_range(struct address_space *mapping, ++ unsigned int min, ++ unsigned int max) ++{ ++ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) ++ return; ++ ++ if (min > MAX_PAGECACHE_ORDER) ++ min = MAX_PAGECACHE_ORDER; ++ ++ if (max > MAX_PAGECACHE_ORDER) ++ max = MAX_PAGECACHE_ORDER; ++ ++ if (max < min) ++ max = min; ++ ++ mapping->flags = (mapping->flags & ~AS_FOLIO_ORDER_MASK) | ++ (min << AS_FOLIO_ORDER_MIN) | (max << AS_FOLIO_ORDER_MAX); ++} ++ ++static inline void mapping_set_folio_min_order(struct address_space *mapping, ++ unsigned int min) ++{ ++ mapping_set_folio_order_range(mapping, min, MAX_PAGECACHE_ORDER); ++} ++ + /** + * mapping_set_large_folios() - Indicate the file supports large folios. +- * @mapping: The file. ++ * @mapping: The address space of the file. + * + * The filesystem should call this function in its inode constructor to + * indicate that the VFS can use large folios to cache the contents of +@@ -361,7 +412,23 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) + */ + static inline void mapping_set_large_folios(struct address_space *mapping) + { +- __set_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); ++ mapping_set_folio_order_range(mapping, 0, MAX_PAGECACHE_ORDER); ++} ++ ++static inline unsigned int ++mapping_max_folio_order(const struct address_space *mapping) ++{ ++ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) ++ return 0; ++ return (mapping->flags & AS_FOLIO_ORDER_MAX_MASK) >> AS_FOLIO_ORDER_MAX; ++} ++ ++static inline unsigned int ++mapping_min_folio_order(const struct address_space *mapping) ++{ ++ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) ++ return 0; ++ return (mapping->flags & AS_FOLIO_ORDER_MIN_MASK) >> AS_FOLIO_ORDER_MIN; + } + + /** +@@ -375,7 +442,7 @@ static inline void mapping_set_large_folios(struct address_space *mapping) + static inline void mapping_clear_large_folios(struct address_space *mapping) + { + WARN_ON_ONCE(!rwsem_is_locked(&mapping->invalidate_lock)); +- __clear_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); ++ mapping_set_folio_order_range(mapping, 0, 0); + } + + /* +@@ -384,20 +451,17 @@ static inline void mapping_clear_large_folios(struct address_space *mapping) + */ + static inline bool mapping_large_folio_support(struct address_space *mapping) + { +- /* AS_LARGE_FOLIO_SUPPORT is only reasonable for pagecache folios */ ++ /* AS_FOLIO_ORDER is only reasonable for pagecache folios */ + VM_WARN_ONCE((unsigned long)mapping & PAGE_MAPPING_ANON, + "Anonymous mapping always supports large folio"); + +- return IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && +- test_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); ++ return mapping_max_folio_order(mapping) > 0; + } + + /* Return the maximum folio size for this pagecache mapping, in bytes. 
*/ +-static inline size_t mapping_max_folio_size(struct address_space *mapping) ++static inline size_t mapping_max_folio_size(const struct address_space *mapping) + { +- if (mapping_large_folio_support(mapping)) +- return PAGE_SIZE << MAX_PAGECACHE_ORDER; +- return PAGE_SIZE; ++ return PAGE_SIZE << mapping_max_folio_order(mapping); + } + + static inline int filemap_nr_thps(struct address_space *mapping) +diff --git a/mm/readahead.c b/mm/readahead.c +index 438f142a3e74..c13c130efcca 100644 +--- a/mm/readahead.c ++++ b/mm/readahead.c +@@ -513,10 +513,10 @@ void page_cache_ra_order(struct readahead_control *ractl, + + limit = min(limit, index + ra->size - 1); + +- if (new_order < MAX_PAGECACHE_ORDER) ++ if (new_order < mapping_max_folio_order(mapping)) + new_order += 2; + +- new_order = min_t(unsigned int, MAX_PAGECACHE_ORDER, new_order); ++ new_order = min(mapping_max_folio_order(mapping), new_order); + new_order = min_t(unsigned int, new_order, ilog2(ra->size)); + + /* See comment in page_cache_ra_unbounded() */ +-- +2.25.1 + diff --git a/0017-Revert-cgroup-fix-uaf-when-proc_cpuset_show.patch b/0017-Revert-cgroup-fix-uaf-when-proc_cpuset_show.patch new file mode 100644 index 0000000000000000000000000000000000000000..ebe3ba02a5bc5f03b16204014daa2c53b0c0b53c --- /dev/null +++ b/0017-Revert-cgroup-fix-uaf-when-proc_cpuset_show.patch @@ -0,0 +1,68 @@ +From 8c8766f9500b9ffdb907d23269aa888d0632e68c Mon Sep 17 00:00:00 2001 +From: Chen Ridong +Date: Wed, 18 Dec 2024 08:10:59 +0000 +Subject: [PATCH 13/17] Revert "cgroup: fix uaf when proc_cpuset_show" + +hulk inclusion +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IA9YQ9 + +-------------------------------- + +To keep the same with the mainline and backport the lts patch. +This reverts commit 24c448de81d48ad08925dda9869bcf535a3258b8. + +Fixes: 24c448de81d4 ("cgroup: fix uaf when proc_cpuset_show") +Signed-off-by: Chen Ridong +--- + kernel/cgroup/cpuset.c | 24 ------------------------ + 1 file changed, 24 deletions(-) + +diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c +index 2c9e50f09fc1..140dfb5ad3fc 100644 +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -5185,7 +5185,6 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, + char *buf; + struct cgroup_subsys_state *css; + int retval; +- struct cgroup *root_cgroup = NULL; + + retval = -ENOMEM; + buf = kmalloc(PATH_MAX, GFP_KERNEL); +@@ -5193,32 +5192,9 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, + goto out; + + css = task_get_css(tsk, cpuset_cgrp_id); +- rcu_read_lock(); +- /* +- * When the cpuset subsystem is mounted on the legacy hierarchy, +- * the top_cpuset.css->cgroup does not hold a reference count of +- * cgroup_root.cgroup. This makes accessing css->cgroup very +- * dangerous because when the cpuset subsystem is remounted to the +- * default hierarchy, the cgroup_root.cgroup that css->cgroup points +- * to will be released, leading to a UAF issue. To avoid this problem, +- * get the reference count of top_cpuset.css->cgroup first. +- * +- * This is ugly!! 
+- */ +- if (css == &top_cpuset.css) { +- root_cgroup = css->cgroup; +- if (!css_tryget_online(&root_cgroup->self)) { +- rcu_read_unlock(); +- retval = -EBUSY; +- goto out_free; +- } +- } +- rcu_read_unlock(); + retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX, + current->nsproxy->cgroup_ns); + css_put(css); +- if (root_cgroup) +- css_put(&root_cgroup->self); + if (retval >= PATH_MAX) + retval = -ENAMETOOLONG; + if (retval < 0) +-- +2.25.1 + diff --git a/0018-cgroup-Make-operations-on-the-cgroup-root_list-RCU-s.patch b/0018-cgroup-Make-operations-on-the-cgroup-root_list-RCU-s.patch new file mode 100644 index 0000000000000000000000000000000000000000..0c54088d11faad0dafea84ea125ddbbfd7305321 --- /dev/null +++ b/0018-cgroup-Make-operations-on-the-cgroup-root_list-RCU-s.patch @@ -0,0 +1,145 @@ +From 7b6abe1742cbfedea405f03fcf7fc88cacb2a205 Mon Sep 17 00:00:00 2001 +From: Yafang Shao +Date: Wed, 18 Dec 2024 08:11:00 +0000 +Subject: [PATCH 14/17] cgroup: Make operations on the cgroup root_list RCU + safe +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +stable inclusion +from stable-v6.6.47 +commit dd9542ae7c7ca82ed2d7c185754ba9026361f6bc +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IAP55A + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=dd9542ae7c7ca82ed2d7c185754ba9026361f6bc + +-------------------------------- + +commit d23b5c577715892c87533b13923306acc6243f93 upstream. + +At present, when we perform operations on the cgroup root_list, we must +hold the cgroup_mutex, which is a relatively heavyweight lock. In reality, +we can make operations on this list RCU-safe, eliminating the need to hold +the cgroup_mutex during traversal. Modifications to the list only occur in +the cgroup root setup and destroy paths, which should be infrequent in a +production environment. In contrast, traversal may occur frequently. +Therefore, making it RCU-safe would be beneficial. 
+ +Signed-off-by: Yafang Shao +Signed-off-by: Tejun Heo +To: Michal Koutný +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Chen Ridong +--- + include/linux/cgroup-defs.h | 1 + + kernel/cgroup/cgroup-internal.h | 3 ++- + kernel/cgroup/cgroup.c | 23 ++++++++++++++++------- + 3 files changed, 19 insertions(+), 8 deletions(-) + +diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h +index 6e3227a688de..05ece896af7d 100644 +--- a/include/linux/cgroup-defs.h ++++ b/include/linux/cgroup-defs.h +@@ -591,6 +591,7 @@ struct cgroup_root { + + /* A list running through the active hierarchies */ + struct list_head root_list; ++ struct rcu_head rcu; + + /* Hierarchy-specific flags */ + unsigned int flags; +diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h +index 96a9bd2c26f0..f5fb12890645 100644 +--- a/kernel/cgroup/cgroup-internal.h ++++ b/kernel/cgroup/cgroup-internal.h +@@ -170,7 +170,8 @@ extern struct list_head cgroup_roots; + + /* iterate across the hierarchies */ + #define for_each_root(root) \ +- list_for_each_entry((root), &cgroup_roots, root_list) ++ list_for_each_entry_rcu((root), &cgroup_roots, root_list, \ ++ lockdep_is_held(&cgroup_mutex)) + + /** + * for_each_subsys - iterate all enabled cgroup subsystems +diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c +index 52fe6ba2fefd..c26a9b3a3576 100644 +--- a/kernel/cgroup/cgroup.c ++++ b/kernel/cgroup/cgroup.c +@@ -1315,7 +1315,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root) + + void cgroup_free_root(struct cgroup_root *root) + { +- kfree(root); ++ kfree_rcu(root, rcu); + } + + static void cgroup_destroy_root(struct cgroup_root *root) +@@ -1348,7 +1348,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) + spin_unlock_irq(&css_set_lock); + + if (!list_empty(&root->root_list)) { +- list_del(&root->root_list); ++ list_del_rcu(&root->root_list); + cgroup_root_count--; + } + +@@ -1388,7 +1388,15 @@ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset, + } + } + +- BUG_ON(!res_cgroup); ++ /* ++ * If cgroup_mutex is not held, the cgrp_cset_link will be freed ++ * before we remove the cgroup root from the root_list. Consequently, ++ * when accessing a cgroup root, the cset_link may have already been ++ * freed, resulting in a NULL res_cgroup. However, by holding the ++ * cgroup_mutex, we ensure that res_cgroup can't be NULL. ++ * If we don't hold cgroup_mutex in the caller, we must do the NULL ++ * check. ++ */ + return res_cgroup; + } + +@@ -1447,7 +1455,6 @@ static struct cgroup *current_cgns_cgroup_dfl(void) + static struct cgroup *cset_cgroup_from_root(struct css_set *cset, + struct cgroup_root *root) + { +- lockdep_assert_held(&cgroup_mutex); + lockdep_assert_held(&css_set_lock); + + return __cset_cgroup_from_root(cset, root); +@@ -1455,7 +1462,9 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, + + /* + * Return the cgroup for "task" from the given hierarchy. Must be +- * called with cgroup_mutex and css_set_lock held. ++ * called with css_set_lock held to prevent task's groups from being modified. ++ * Must be called with either cgroup_mutex or rcu read lock to prevent the ++ * cgroup root from being destroyed. 
+ */ + struct cgroup *task_cgroup_from_root(struct task_struct *task, + struct cgroup_root *root) +@@ -2030,7 +2039,7 @@ void init_cgroup_root(struct cgroup_fs_context *ctx) + struct cgroup_root *root = ctx->root; + struct cgroup *cgrp = &root->cgrp; + +- INIT_LIST_HEAD(&root->root_list); ++ INIT_LIST_HEAD_RCU(&root->root_list); + atomic_set(&root->nr_cgrps, 1); + cgrp->root = root; + init_cgroup_housekeeping(cgrp); +@@ -2114,7 +2123,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) + * care of subsystems' refcounts, which are explicitly dropped in + * the failure exit path. + */ +- list_add(&root->root_list, &cgroup_roots); ++ list_add_rcu(&root->root_list, &cgroup_roots); + cgroup_root_count++; + + /* +-- +2.25.1 + diff --git a/0019-cgroup-Move-rcu_head-up-near-the-top-of-cgroup_root.patch b/0019-cgroup-Move-rcu_head-up-near-the-top-of-cgroup_root.patch new file mode 100644 index 0000000000000000000000000000000000000000..45d780248fb256f34f308724c08e0cf675c3bf28 --- /dev/null +++ b/0019-cgroup-Move-rcu_head-up-near-the-top-of-cgroup_root.patch @@ -0,0 +1,84 @@ +From 4363688e9b49bde3cce7b2ea1882f3d44d1f5289 Mon Sep 17 00:00:00 2001 +From: Waiman Long +Date: Wed, 18 Dec 2024 08:11:01 +0000 +Subject: [PATCH 15/17] cgroup: Move rcu_head up near the top of cgroup_root +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +stable inclusion +from stable-v6.6.47 +commit f3c60ab676bb62e01d004d5b1cf2963a296c8e6a +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IAP55A + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f3c60ab676bb62e01d004d5b1cf2963a296c8e6a + +-------------------------------- + +commit a7fb0423c201ba12815877a0b5a68a6a1710b23a upstream. + +Commit 331654dc5f40 ("cgroup: Make operations on the cgroup root_list RCU +safe") adds a new rcu_head to the cgroup_root structure and kvfree_rcu() +for freeing the cgroup_root. + +The current implementation of kvfree_rcu(), however, has the limitation +that the offset of the rcu_head structure within the larger data +structure must be less than 4096 or the compilation will fail. See the +macro definition of __is_kvfree_rcu_offset() in include/linux/rcupdate.h +for more information. + +By putting rcu_head below the large cgroup structure, any change to the +cgroup structure that makes it larger run the risk of causing build +failure under certain configurations. Commit 77070eeb8821 ("cgroup: +Avoid false cacheline sharing of read mostly rstat_cpu") happens to be +the last straw that breaks it. Fix this problem by moving the rcu_head +structure up before the cgroup structure. + +Fixes: 331654dc5f40 ("cgroup: Make operations on the cgroup root_list RCU safe") +Reported-by: Stephen Rothwell +Closes: https://lore.kernel.org/lkml/20231207143806.114e0a74@canb.auug.org.au/ +Signed-off-by: Waiman Long +Acked-by: Yafang Shao +Reviewed-by: Yosry Ahmed +Reviewed-by: Michal Koutný +Signed-off-by: Tejun Heo +Signed-off-by: Greg Kroah-Hartman + +Conflicts: + include/linux/cgroup-defs.h +[Context is mismatched for wait_queue_head_t wait was merged] +Signed-off-by: Chen Ridong +--- + include/linux/cgroup-defs.h | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h +index 05ece896af7d..8eb518ce87a1 100644 +--- a/include/linux/cgroup-defs.h ++++ b/include/linux/cgroup-defs.h +@@ -573,6 +573,10 @@ struct cgroup_root { + /* Unique id for this hierarchy. 
*/ + int hierarchy_id; + ++ /* A list running through the active hierarchies */ ++ struct list_head root_list; ++ struct rcu_head rcu; /* Must be near the top */ ++ + /* + * The root cgroup. The containing cgroup_root will be destroyed on its + * release. cgrp->ancestors[0] will be used overflowing into the +@@ -589,10 +593,6 @@ struct cgroup_root { + /* Wait while cgroups are being destroyed */ + wait_queue_head_t wait; + +- /* A list running through the active hierarchies */ +- struct list_head root_list; +- struct rcu_head rcu; +- + /* Hierarchy-specific flags */ + unsigned int flags; + +-- +2.25.1 + diff --git a/0020-cgroup-cpuset-Prevent-UAF-in-proc_cpuset_show.patch b/0020-cgroup-cpuset-Prevent-UAF-in-proc_cpuset_show.patch new file mode 100644 index 0000000000000000000000000000000000000000..c528ff32a703b081492f054515fb49b431716238 --- /dev/null +++ b/0020-cgroup-cpuset-Prevent-UAF-in-proc_cpuset_show.patch @@ -0,0 +1,110 @@ +From 724b6581cd8b49962e3add6e8795423f2c1390f8 Mon Sep 17 00:00:00 2001 +From: Chen Ridong +Date: Wed, 18 Dec 2024 08:11:02 +0000 +Subject: [PATCH 16/17] cgroup/cpuset: Prevent UAF in proc_cpuset_show() + +stable inclusion +from stable-v6.6.44 +commit 96226fbed566f3f686f53a489a29846f2d538080 +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IAP55A + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=96226fbed566f3f686f53a489a29846f2d538080 + +-------------------------------- + +[ Upstream commit 1be59c97c83ccd67a519d8a49486b3a8a73ca28a ] + +An UAF can happen when /proc/cpuset is read as reported in [1]. + +This can be reproduced by the following methods: +1.add an mdelay(1000) before acquiring the cgroup_lock In the + cgroup_path_ns function. +2.$cat /proc//cpuset repeatly. +3.$mount -t cgroup -o cpuset cpuset /sys/fs/cgroup/cpuset/ +$umount /sys/fs/cgroup/cpuset/ repeatly. + +The race that cause this bug can be shown as below: + +(umount) | (cat /proc//cpuset) +css_release | proc_cpuset_show +css_release_work_fn | css = task_get_css(tsk, cpuset_cgrp_id); +css_free_rwork_fn | cgroup_path_ns(css->cgroup, ...); +cgroup_destroy_root | mutex_lock(&cgroup_mutex); +rebind_subsystems | +cgroup_free_root | + | // cgrp was freed, UAF + | cgroup_path_ns_locked(cgrp,..); + +When the cpuset is initialized, the root node top_cpuset.css.cgrp +will point to &cgrp_dfl_root.cgrp. In cgroup v1, the mount operation will +allocate cgroup_root, and top_cpuset.css.cgrp will point to the allocated +&cgroup_root.cgrp. When the umount operation is executed, +top_cpuset.css.cgrp will be rebound to &cgrp_dfl_root.cgrp. + +The problem is that when rebinding to cgrp_dfl_root, there are cases +where the cgroup_root allocated by setting up the root for cgroup v1 +is cached. This could lead to a Use-After-Free (UAF) if it is +subsequently freed. The descendant cgroups of cgroup v1 can only be +freed after the css is released. However, the css of the root will never +be released, yet the cgroup_root should be freed when it is unmounted. +This means that obtaining a reference to the css of the root does +not guarantee that css.cgrp->root will not be freed. + +Fix this problem by using rcu_read_lock in proc_cpuset_show(). +As cgroup_root is kfree_rcu after commit 331654dc5f40 +("cgroup: Make operations on the cgroup root_list RCU safe"), +css->cgroup won't be freed during the critical section. +To call cgroup_path_ns_locked, css_set_lock is needed, so it is safe to +replace task_get_css with task_css. 
+ +[1] https://syzkaller.appspot.com/bug?extid=9b1ff7be974a403aa4cd + +Fixes: a79a908fd2b0 ("cgroup: introduce cgroup namespaces") +Signed-off-by: Chen Ridong +Signed-off-by: Tejun Heo +Signed-off-by: Sasha Levin + +Conflicts: + kernel/cgroup/cpuset.c +[commit 5715456af3e0 ("kernfs: Convert kernfs_path_from_node_locked() +from strlcpy() to strscpy()") was not merged] +Signed-off-by: Chen Ridong +--- + kernel/cgroup/cpuset.c | 13 +++++++++---- + 1 file changed, 9 insertions(+), 4 deletions(-) + +diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c +index 140dfb5ad3fc..f3cf9b1268e0 100644 +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -21,6 +21,7 @@ + * License. See the file COPYING in the main directory of the Linux + * distribution for more details. + */ ++#include "cgroup-internal.h" + + #include + #include +@@ -5191,10 +5192,14 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, + if (!buf) + goto out; + +- css = task_get_css(tsk, cpuset_cgrp_id); +- retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX, +- current->nsproxy->cgroup_ns); +- css_put(css); ++ rcu_read_lock(); ++ spin_lock_irq(&css_set_lock); ++ css = task_css(tsk, cpuset_cgrp_id); ++ retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX, ++ current->nsproxy->cgroup_ns); ++ spin_unlock_irq(&css_set_lock); ++ rcu_read_unlock(); ++ + if (retval >= PATH_MAX) + retval = -ENAMETOOLONG; + if (retval < 0) +-- +2.25.1 + diff --git a/0021-cgroup-add-more-reserve-kabi.patch b/0021-cgroup-add-more-reserve-kabi.patch new file mode 100644 index 0000000000000000000000000000000000000000..5c0ed0801f0f17360af652b9104c43370874f823 --- /dev/null +++ b/0021-cgroup-add-more-reserve-kabi.patch @@ -0,0 +1,90 @@ +From d68991f87f738657074d93a1ae8ccf865f40b65a Mon Sep 17 00:00:00 2001 +From: Chen Ridong +Date: Wed, 18 Dec 2024 08:11:03 +0000 +Subject: [PATCH 17/17] cgroup: add more reserve kabi + +hulk inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/I8SA3O + +-------------------------------- + +Reserve KABI for future feature development. 
+ +Signed-off-by: Chen Ridong +--- + include/linux/cgroup-defs.h | 7 +++++++ + include/linux/memcontrol.h | 8 ++++++++ + kernel/cgroup/cpuset.c | 5 ----- + 3 files changed, 15 insertions(+), 5 deletions(-) + +diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h +index 8eb518ce87a1..f3fd0407d346 100644 +--- a/include/linux/cgroup-defs.h ++++ b/include/linux/cgroup-defs.h +@@ -325,6 +325,8 @@ struct cgroup_base_stat { + #ifdef CONFIG_SCHED_CORE + u64 forceidle_sum; + #endif ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) + }; + + /* +@@ -555,6 +557,9 @@ struct cgroup { + KABI_RESERVE(3) + KABI_RESERVE(4) + KABI_RESERVE(5) ++ KABI_RESERVE(6) ++ KABI_RESERVE(7) ++ KABI_RESERVE(8) + /* All ancestors including self */ + struct cgroup *ancestors[]; + }; +@@ -606,6 +611,8 @@ struct cgroup_root { + KABI_RESERVE(2) + KABI_RESERVE(3) + KABI_RESERVE(4) ++ KABI_RESERVE(5) ++ KABI_RESERVE(6) + }; + + /* +diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h +index b2a80e089a0a..abe236201e68 100644 +--- a/include/linux/memcontrol.h ++++ b/include/linux/memcontrol.h +@@ -429,6 +429,14 @@ struct mem_cgroup { + KABI_RESERVE(6) + KABI_RESERVE(7) + KABI_RESERVE(8) ++ KABI_RESERVE(9) ++ KABI_RESERVE(10) ++ KABI_RESERVE(11) ++ KABI_RESERVE(12) ++ KABI_RESERVE(13) ++ KABI_RESERVE(14) ++ KABI_RESERVE(15) ++ KABI_RESERVE(16) + struct mem_cgroup_per_node *nodeinfo[]; + }; + +diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c +index f3cf9b1268e0..7ea0a6d00519 100644 +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -211,11 +211,6 @@ struct cpuset { + + /* Remote partition silbling list anchored at remote_children */ + struct list_head remote_sibling; +- +- KABI_RESERVE(1) +- KABI_RESERVE(2) +- KABI_RESERVE(3) +- KABI_RESERVE(4) + }; + + /* +-- +2.25.1 + diff --git a/0022-14223.patch b/0022-14223.patch new file mode 100644 index 0000000000000000000000000000000000000000..b10342738af488cadd2bab4ffd42010e767b5f41 --- /dev/null +++ b/0022-14223.patch @@ -0,0 +1,80 @@ +From f8cb61566576a623971d5cc8dd3cd6229e787e30 Mon Sep 17 00:00:00 2001 +From: Zhang Changzhong +Date: Wed, 18 Dec 2024 17:50:29 +0800 +Subject: [PATCH] kabi: net: reserve space for xdp subsystem related structure + +hulk inclusion +category: other +bugzilla: https://gitee.com/openeuler/kernel/issues/I8OWRC + +---------------------------------------------------- + +Reserve some fields beforehand for xdp framework related structures +prone to change. 
+ +Signed-off-by: Zhang Changzhong +--- + include/net/xdp.h | 19 +++++++++++++++++++ + 1 file changed, 19 insertions(+) + +diff --git a/include/net/xdp.h b/include/net/xdp.h +index c283668458ca..9b9c7dc25eeb 100644 +--- a/include/net/xdp.h ++++ b/include/net/xdp.h +@@ -54,6 +54,9 @@ enum xdp_mem_type { + struct xdp_mem_info { + u32 type; /* enum xdp_mem_type, but known size type */ + u32 id; ++ ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) + }; + + struct page_pool; +@@ -74,6 +77,9 @@ struct xdp_rxq_info { + + struct xdp_txq_info { + struct net_device *dev; ++ ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) + }; + + enum xdp_buff_flags { +@@ -92,6 +98,11 @@ struct xdp_buff { + struct xdp_txq_info *txq; + u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/ + u32 flags; /* supported values defined in xdp_buff_flags */ ++ ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + + static __always_inline bool xdp_buff_has_frags(struct xdp_buff *xdp) +@@ -181,6 +192,11 @@ struct xdp_frame { + struct net_device *dev_rx; /* used by cpumap */ + u32 frame_sz; + u32 flags; /* supported values defined in xdp_buff_flags */ ++ ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + + static __always_inline bool xdp_frame_has_frags(struct xdp_frame *frame) +@@ -198,6 +214,9 @@ struct xdp_frame_bulk { + int count; + void *xa; + void *q[XDP_BULK_QUEUE_SIZE]; ++ ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) + }; + + static __always_inline void xdp_frame_bulk_init(struct xdp_frame_bulk *bq) +-- +Gitee + diff --git a/0023-14224.patch b/0023-14224.patch new file mode 100644 index 0000000000000000000000000000000000000000..62ba017f20debc62800306679fc06b1265b58aef --- /dev/null +++ b/0023-14224.patch @@ -0,0 +1,85 @@ +From a2bbb3a7e3d30f5efc443fa17fcfe20fdd5a98d5 Mon Sep 17 00:00:00 2001 +From: Dong Chenchen +Date: Wed, 18 Dec 2024 17:15:36 +0800 +Subject: [PATCH] net/kabi: Reserve space for net structures + +hulk inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC1RH + +-------------------------------- + +Reserve some fields beforehand for net subsystem related +structures prone to change. 
+ +Signed-off-by: Dong Chenchen +--- + include/net/flow.h | 2 ++ + include/net/netns/netfilter.h | 2 ++ + include/net/netns/xfrm.h | 2 ++ + include/net/xfrm.h | 4 ++++ + 4 files changed, 10 insertions(+) + +diff --git a/include/net/flow.h b/include/net/flow.h +index 0cc5f2ef1000..72d2ea2374ba 100644 +--- a/include/net/flow.h ++++ b/include/net/flow.h +@@ -46,6 +46,8 @@ struct flowi_common { + + KABI_RESERVE(1) + KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + + union flowi_uli { +diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h +index 4b77a9b031b6..963588269637 100644 +--- a/include/net/netns/netfilter.h ++++ b/include/net/netns/netfilter.h +@@ -34,5 +34,7 @@ struct netns_nf { + + KABI_RESERVE(1) + KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + #endif +diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h +index a0c1359cc7eb..af7f20ef4823 100644 +--- a/include/net/netns/xfrm.h ++++ b/include/net/netns/xfrm.h +@@ -87,6 +87,8 @@ struct netns_xfrm { + + KABI_RESERVE(1) + KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + + #endif +diff --git a/include/net/xfrm.h b/include/net/xfrm.h +index c875faf98492..b9dec5f9c973 100644 +--- a/include/net/xfrm.h ++++ b/include/net/xfrm.h +@@ -294,6 +294,8 @@ struct xfrm_state { + + KABI_RESERVE(1) + KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + + static inline struct net *xs_net(struct xfrm_state *x) +@@ -562,6 +564,8 @@ struct xfrm_policy { + + KABI_RESERVE(1) + KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) + }; + + static inline struct net *xp_net(const struct xfrm_policy *xp) +-- +Gitee + diff --git a/0024-14225.patch b/0024-14225.patch new file mode 100644 index 0000000000000000000000000000000000000000..32a10378e03c63d02d8559123ce5db0f5df1bc1f --- /dev/null +++ b/0024-14225.patch @@ -0,0 +1,154 @@ +From 279803fa98908bd367cec04ae2600c15764fb977 Mon Sep 17 00:00:00 2001 +From: Luo Gengkun +Date: Wed, 18 Dec 2024 09:45:31 +0000 +Subject: [PATCH 1/3] kabi: reserve space for perf_event.h + +hulk inclusion +category: feature +bugzilla: https://gitee.com/src-openeuler/kernel/issues/IBC1PM + +-------------------------------- + +reserve space for perf_event.h + +Signed-off-by: Luo Gengkun +--- + include/linux/perf_event.h | 16 ++++++++++++++++ + 1 file changed, 16 insertions(+) + +diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h +index 89f2a02db563..fe692e9bd0b2 100644 +--- a/include/linux/perf_event.h ++++ b/include/linux/perf_event.h +@@ -1010,6 +1010,14 @@ struct perf_cpu_pmu_context { + struct hrtimer hrtimer; + ktime_t hrtimer_interval; + unsigned int hrtimer_active; ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) ++ KABI_RESERVE(5) ++ KABI_RESERVE(6) ++ KABI_RESERVE(7) ++ KABI_RESERVE(8) + }; + + /** +@@ -1031,6 +1039,14 @@ struct perf_cpu_context { + int heap_size; + struct perf_event **heap; + struct perf_event *heap_default[2]; ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) ++ KABI_RESERVE(5) ++ KABI_RESERVE(6) ++ KABI_RESERVE(7) ++ KABI_RESERVE(8) + }; + + struct perf_output_handle { +-- +Gitee + + +From 078ad81846b81844eb98f90eee57c06954715c8d Mon Sep 17 00:00:00 2001 +From: Luo Gengkun +Date: Wed, 18 Dec 2024 09:45:32 +0000 +Subject: [PATCH 2/3] kabi: reserve space for internal.h + +hulk inclusion +category: feature +bugzilla: https://gitee.com/src-openeuler/kernel/issues/IBC1PM + +-------------------------------- + +reserve space for internal.h + +Signed-off-by: Luo Gengkun 
+--- + kernel/events/internal.h | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +diff --git a/kernel/events/internal.h b/kernel/events/internal.h +index d2e6e6144c54..d1ffa00b91b6 100644 +--- a/kernel/events/internal.h ++++ b/kernel/events/internal.h +@@ -5,6 +5,7 @@ + #include + #include + #include ++#include + + /* Buffer handling */ + +@@ -54,6 +55,15 @@ struct perf_buffer { + void **aux_pages; + void *aux_priv; + ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) ++ KABI_RESERVE(5) ++ KABI_RESERVE(6) ++ KABI_RESERVE(7) ++ KABI_RESERVE(8) ++ + struct perf_event_mmap_page *user_page; + void *data_pages[]; + }; +-- +Gitee + + +From 59a2a3e8b1c35d9e0bde08cd2e6f01f1c12d384b Mon Sep 17 00:00:00 2001 +From: Luo Gengkun +Date: Wed, 18 Dec 2024 09:45:33 +0000 +Subject: [PATCH 3/3] kabi: reserve space for uprobes.h + +hulk inclusion +category: feature +bugzilla: https://gitee.com/src-openeuler/kernel/issues/IBC1PM + +-------------------------------- + +reserve space for uprobes.h + +Signed-off-by: Luo Gengkun +--- + include/linux/uprobes.h | 9 +++++++++ + 1 file changed, 9 insertions(+) + +diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h +index f46e0ca0169c..86d0868b584a 100644 +--- a/include/linux/uprobes.h ++++ b/include/linux/uprobes.h +@@ -47,6 +47,7 @@ struct uprobe_consumer { + + #ifdef CONFIG_UPROBES + #include ++#include + + enum uprobe_task_state { + UTASK_RUNNING, +@@ -78,6 +79,14 @@ struct uprobe_task { + + struct return_instance *return_instances; + unsigned int depth; ++ KABI_RESERVE(1) ++ KABI_RESERVE(2) ++ KABI_RESERVE(3) ++ KABI_RESERVE(4) ++ KABI_RESERVE(5) ++ KABI_RESERVE(6) ++ KABI_RESERVE(7) ++ KABI_RESERVE(8) + }; + + struct return_instance { +-- +Gitee + diff --git a/0025-14226.patch b/0025-14226.patch new file mode 100644 index 0000000000000000000000000000000000000000..172bd62c1e76e56a4510255952f89b2dca1d4740 --- /dev/null +++ b/0025-14226.patch @@ -0,0 +1,3685 @@ +From d1c833cfcc6661276386ef005382f6cd817ade5f Mon Sep 17 00:00:00 2001 +From: Kemeng Shi +Date: Wed, 18 Dec 2024 17:34:44 +0800 +Subject: [PATCH 01/19] mm/page_alloc: remove unnecessary check in + break_down_buddy_pages + +mainline inclusion +from mainline-v6.7-rc1 +commit 27e0db3c21aaf1422980e64b77956e15b839306f +category: cleanup +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC4T0 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=27e0db3c21aaf1422980e64b77956e15b839306f + +-------------------------------- + +Patch series "Two minor cleanups to break_down_buddy_pages", v2. + +Two minor cleanups to break_down_buddy_pages. + +This patch (of 2): + +1. We always have target in range started with next_page and full free + range started with current_buddy. + +2. The last split range size is 1 << low and low should be >= 0, then + size >= 1. So page + size != page is always true (because size > 0). + As summary, current_page will not equal to target page. 
+ +Link: https://lkml.kernel.org/r/20230927103514.98281-1-shikemeng@huaweicloud.com +Link: https://lkml.kernel.org/r/20230927103514.98281-2-shikemeng@huaweicloud.com +Signed-off-by: Kemeng Shi +Acked-by: Naoya Horiguchi +Cc: Matthew Wilcox (Oracle) +Cc: Oscar Salvador +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/page_alloc.c | 6 ++---- + 1 file changed, 2 insertions(+), 4 deletions(-) + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 36cd38df0614..fb6008b30b48 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -6955,10 +6955,8 @@ static void break_down_buddy_pages(struct zone *zone, struct page *page, + if (set_page_guard(zone, current_buddy, high, migratetype)) + continue; + +- if (current_buddy != target) { +- add_to_free_list(current_buddy, zone, high, migratetype); +- set_buddy_order(current_buddy, high); +- } ++ add_to_free_list(current_buddy, zone, high, migratetype); ++ set_buddy_order(current_buddy, high); + } + } + +-- +Gitee + + +From f3e36dbf45c6a413f85c6d41a84565111728030d Mon Sep 17 00:00:00 2001 +From: Kemeng Shi +Date: Wed, 18 Dec 2024 17:34:45 +0800 +Subject: [PATCH 02/19] mm/page_alloc: remove unnecessary next_page in + break_down_buddy_pages + +mainline inclusion +from mainline-v6.7-rc1 +commit 0dfca313a009c83e2ad44b3719dc1222df6c6db5 +category: cleanup +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC4T0 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0dfca313a009c83e2ad44b3719dc1222df6c6db5 + +-------------------------------- + +The next_page is only used to forward page in case target is in second +half range. Move forward page directly to remove unnecessary next_page. + +Link: https://lkml.kernel.org/r/20230927103514.98281-3-shikemeng@huaweicloud.com +Signed-off-by: Kemeng Shi +Acked-by: Naoya Horiguchi +Cc: Matthew Wilcox (Oracle) +Cc: Oscar Salvador +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/page_alloc.c | 6 ++---- + 1 file changed, 2 insertions(+), 4 deletions(-) + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index fb6008b30b48..3cc5d5c7826e 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -6937,20 +6937,18 @@ static void break_down_buddy_pages(struct zone *zone, struct page *page, + int migratetype) + { + unsigned long size = 1 << high; +- struct page *current_buddy, *next_page; ++ struct page *current_buddy; + + while (high > low) { + high--; + size >>= 1; + + if (target >= &page[size]) { +- next_page = page + size; + current_buddy = page; ++ page = page + size; + } else { +- next_page = page; + current_buddy = page + size; + } +- page = next_page; + + if (set_page_guard(zone, current_buddy, high, migratetype)) + continue; +-- +Gitee + + +From 8a91855c80870c36e7d5f540e502b42716512680 Mon Sep 17 00:00:00 2001 +From: Yajun Deng +Date: Wed, 18 Dec 2024 17:34:46 +0800 +Subject: [PATCH 03/19] mm: page_alloc: simplify __free_pages_ok() + +mainline inclusion +from mainline-v6.8-rc1 +commit 250ae189d98290d0539b4f9b8c4703e0bf24f9d3 +category: cleanup +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC4T0 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=250ae189d98290d0539b4f9b8c4703e0bf24f9d3 + +-------------------------------- + +There is redundant code in __free_pages_ok(). Use free_one_page() +simplify it. 
+ +Link: https://lkml.kernel.org/r/20231216030503.2126130-1-yajun.deng@linux.dev +Signed-off-by: Yajun Deng +Reviewed-by: Matthew Wilcox (Oracle) +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/page_alloc.c | 9 +-------- + 1 file changed, 1 insertion(+), 8 deletions(-) + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 3cc5d5c7826e..ff0940ab0fe6 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -1258,7 +1258,6 @@ static void free_one_page(struct zone *zone, + static void __free_pages_ok(struct page *page, unsigned int order, + fpi_t fpi_flags) + { +- unsigned long flags; + int migratetype; + unsigned long pfn = page_to_pfn(page); + struct zone *zone = page_zone(page); +@@ -1273,13 +1272,7 @@ static void __free_pages_ok(struct page *page, unsigned int order, + */ + migratetype = get_pfnblock_migratetype(page, pfn); + +- spin_lock_irqsave(&zone->lock, flags); +- if (unlikely(has_isolate_pageblock(zone) || +- is_migrate_isolate(migratetype))) { +- migratetype = get_pfnblock_migratetype(page, pfn); +- } +- __free_one_page(page, pfn, zone, order, migratetype, fpi_flags); +- spin_unlock_irqrestore(&zone->lock, flags); ++ free_one_page(zone, page, pfn, order, migratetype, fpi_flags); + + __count_vm_events(PGFREE, 1 << order); + } +-- +Gitee + + +From e30777461ce931191902c5d35263b9a3d23b1de7 Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Wed, 18 Dec 2024 17:34:47 +0800 +Subject: [PATCH 04/19] mm: page_alloc: remove pcppage migratetype caching + +mainline inclusion +from mainline-v6.10-rc1 +commit 17edeb5d3f761c20fd28f6002f5a9faa53c0a0d8 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC4T0 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=17edeb5d3f761c20fd28f6002f5a9faa53c0a0d8 + +-------------------------------- + +Patch series "mm: page_alloc: freelist migratetype hygiene", v4. + +The page allocator's mobility grouping is intended to keep unmovable pages +separate from reclaimable/compactable ones to allow on-demand +defragmentation for higher-order allocations and huge pages. + +Currently, there are several places where accidental type mixing occurs: +an allocation asks for a page of a certain migratetype and receives +another. This ruins pageblocks for compaction, which in turn makes +allocating huge pages more expensive and less reliable. + +The series addresses those causes. The last patch adds type checks on all +freelist movements to prevent new violations being introduced. + +The benefits can be seen in a mixed workload that stresses the machine +with a memcache-type workload and a kernel build job while periodically +attempting to allocate batches of THP. 
The following data is aggregated +over 50 consecutive defconfig builds: + + VANILLA PATCHED +Hugealloc Time mean 165843.93 ( +0.00%) 113025.88 ( -31.85%) +Hugealloc Time stddev 158957.35 ( +0.00%) 114716.07 ( -27.83%) +Kbuild Real time 310.24 ( +0.00%) 300.73 ( -3.06%) +Kbuild User time 1271.13 ( +0.00%) 1259.42 ( -0.92%) +Kbuild System time 582.02 ( +0.00%) 559.79 ( -3.81%) +THP fault alloc 30585.14 ( +0.00%) 40853.62 ( +33.57%) +THP fault fallback 36626.46 ( +0.00%) 26357.62 ( -28.04%) +THP fault fail rate % 54.49 ( +0.00%) 39.22 ( -27.53%) +Pagealloc fallback 1328.00 ( +0.00%) 1.00 ( -99.85%) +Pagealloc type mismatch 181009.50 ( +0.00%) 0.00 ( -100.00%) +Direct compact stall 434.56 ( +0.00%) 257.66 ( -40.61%) +Direct compact fail 421.70 ( +0.00%) 249.94 ( -40.63%) +Direct compact success 12.86 ( +0.00%) 7.72 ( -37.09%) +Direct compact success rate % 2.86 ( +0.00%) 2.82 ( -0.96%) +Compact daemon scanned migrate 3370059.62 ( +0.00%) 3612054.76 ( +7.18%) +Compact daemon scanned free 7718439.20 ( +0.00%) 5386385.02 ( -30.21%) +Compact direct scanned migrate 309248.62 ( +0.00%) 176721.04 ( -42.85%) +Compact direct scanned free 433582.84 ( +0.00%) 315727.66 ( -27.18%) +Compact migrate scanned daemon % 91.20 ( +0.00%) 94.48 ( +3.56%) +Compact free scanned daemon % 94.58 ( +0.00%) 94.42 ( -0.16%) +Compact total migrate scanned 3679308.24 ( +0.00%) 3788775.80 ( +2.98%) +Compact total free scanned 8152022.04 ( +0.00%) 5702112.68 ( -30.05%) +Alloc stall 872.04 ( +0.00%) 5156.12 ( +490.71%) +Pages kswapd scanned 510645.86 ( +0.00%) 3394.94 ( -99.33%) +Pages kswapd reclaimed 134811.62 ( +0.00%) 2701.26 ( -98.00%) +Pages direct scanned 99546.06 ( +0.00%) 376407.52 ( +278.12%) +Pages direct reclaimed 62123.40 ( +0.00%) 289535.70 ( +366.06%) +Pages total scanned 610191.92 ( +0.00%) 379802.46 ( -37.76%) +Pages scanned kswapd % 76.36 ( +0.00%) 0.10 ( -98.58%) +Swap out 12057.54 ( +0.00%) 15022.98 ( +24.59%) +Swap in 209.16 ( +0.00%) 256.48 ( +22.52%) +File refaults 17701.64 ( +0.00%) 11765.40 ( -33.53%) + +Huge page success rate is higher, allocation latencies are shorter and +more predictable. + +Stealing (fallback) rate is drastically reduced. Notably, while the +vanilla kernel keeps doing fallbacks on an ongoing basis, the patched +kernel enters a steady state once the distribution of block types is +adequate for the workload. Steals over 50 runs: + +VANILLA PATCHED +1504.0 227.0 +1557.0 6.0 +1391.0 13.0 +1080.0 26.0 +1057.0 40.0 +1156.0 6.0 +805.0 46.0 +736.0 20.0 +1747.0 2.0 +1699.0 34.0 +1269.0 13.0 +1858.0 12.0 +907.0 4.0 +727.0 2.0 +563.0 2.0 +3094.0 2.0 +10211.0 3.0 +2621.0 1.0 +5508.0 2.0 +1060.0 2.0 +538.0 3.0 +5773.0 2.0 +2199.0 0.0 +3781.0 2.0 +1387.0 1.0 +4977.0 0.0 +2865.0 1.0 +1814.0 1.0 +3739.0 1.0 +6857.0 0.0 +382.0 0.0 +407.0 1.0 +3784.0 0.0 +297.0 0.0 +298.0 0.0 +6636.0 0.0 +4188.0 0.0 +242.0 0.0 +9960.0 0.0 +5816.0 0.0 +354.0 0.0 +287.0 0.0 +261.0 0.0 +140.0 1.0 +2065.0 0.0 +312.0 0.0 +331.0 0.0 +164.0 0.0 +465.0 1.0 +219.0 0.0 + +Type mismatches are down too. Those count every time an allocation +request asks for one migratetype and gets another. 
This can still occur +minimally in the patched kernel due to non-stealing fallbacks, but it's +quite rare and follows the pattern of overall fallbacks - once the block +type distribution settles, mismatches cease as well: + +VANILLA: PATCHED: +182602.0 268.0 +135794.0 20.0 +88619.0 19.0 +95973.0 0.0 +129590.0 0.0 +129298.0 0.0 +147134.0 0.0 +230854.0 0.0 +239709.0 0.0 +137670.0 0.0 +132430.0 0.0 +65712.0 0.0 +57901.0 0.0 +67506.0 0.0 +63565.0 4.0 +34806.0 0.0 +42962.0 0.0 +32406.0 0.0 +38668.0 0.0 +61356.0 0.0 +57800.0 0.0 +41435.0 0.0 +83456.0 0.0 +65048.0 0.0 +28955.0 0.0 +47597.0 0.0 +75117.0 0.0 +55564.0 0.0 +38280.0 0.0 +52404.0 0.0 +26264.0 0.0 +37538.0 0.0 +19671.0 0.0 +30936.0 0.0 +26933.0 0.0 +16962.0 0.0 +44554.0 0.0 +46352.0 0.0 +24995.0 0.0 +35152.0 0.0 +12823.0 0.0 +21583.0 0.0 +18129.0 0.0 +31693.0 0.0 +28745.0 0.0 +33308.0 0.0 +31114.0 0.0 +35034.0 0.0 +12111.0 0.0 +24885.0 0.0 + +Compaction work is markedly reduced despite much better THP rates. + +In the vanilla kernel, reclaim seems to have been driven primarily by +watermark boosting that happens as a result of fallbacks. With those all +but eliminated, watermarks average lower and kswapd does less work. The +uptick in direct reclaim is because THP requests have to fend for +themselves more often - which is intended policy right now. Aggregate +reclaim activity is lowered significantly, though. + +This patch (of 10): + +The idea behind the cache is to save get_pageblock_migratetype() lookups +during bulk freeing. A microbenchmark suggests this isn't helping, +though. The pcp migratetype can get stale, which means that bulk freeing +has an extra branch to check if the pageblock was isolated while on the +pcp. + +While the variance overlaps, the cache write and the branch seem to make +this a net negative. The following test allocates and frees batches of +10,000 pages (~3x the pcp high marks to trigger flushing): + +Before: + 8,668.48 msec task-clock # 99.735 CPUs utilized ( +- 2.90% ) + 19 context-switches # 4.341 /sec ( +- 3.24% ) + 0 cpu-migrations # 0.000 /sec + 17,440 page-faults # 3.984 K/sec ( +- 2.90% ) + 41,758,692,473 cycles # 9.541 GHz ( +- 2.90% ) + 126,201,294,231 instructions # 5.98 insn per cycle ( +- 2.90% ) + 25,348,098,335 branches # 5.791 G/sec ( +- 2.90% ) + 33,436,921 branch-misses # 0.26% of all branches ( +- 2.90% ) + + 0.0869148 +- 0.0000302 seconds time elapsed ( +- 0.03% ) + +After: + 8,444.81 msec task-clock # 99.726 CPUs utilized ( +- 2.90% ) + 22 context-switches # 5.160 /sec ( +- 3.23% ) + 0 cpu-migrations # 0.000 /sec + 17,443 page-faults # 4.091 K/sec ( +- 2.90% ) + 40,616,738,355 cycles # 9.527 GHz ( +- 2.90% ) + 126,383,351,792 instructions # 6.16 insn per cycle ( +- 2.90% ) + 25,224,985,153 branches # 5.917 G/sec ( +- 2.90% ) + 32,236,793 branch-misses # 0.25% of all branches ( +- 2.90% ) + + 0.0846799 +- 0.0000412 seconds time elapsed ( +- 0.05% ) + +A side effect is that this also ensures that pages whose pageblock gets +stolen while on the pcplist end up on the right freelist and we don't +perform potentially type-incompatible buddy merges (or skip merges when we +shouldn't), which is likely beneficial to long-term fragmentation +management, although the effects would be harder to measure. Settle for +simpler and faster code as justification here. 
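+
+For reference, a minimal sketch of the kind of test loop the numbers above
+describe. This is illustrative only: the benchmark harness is not part of
+this patch, and the helper name below is made up. It simply allocates and
+frees ~10,000 order-0 pages (the ~3x-pcp-high batch quoted above) so that
+free_pcppages_bulk() is exercised:
+
+    #include <linux/gfp.h>
+    #include <linux/mm.h>
+
+    /* Allocate and free a batch large enough to overflow the pcp lists. */
+    static void pcp_flush_round(void)
+    {
+            static struct page *pages[10000];
+            int i;
+
+            for (i = 0; i < 10000; i++)
+                    pages[i] = alloc_page(GFP_KERNEL);
+            for (i = 0; i < 10000; i++)
+                    if (pages[i])
+                            __free_page(pages[i]);
+    }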
+ +Link: https://lkml.kernel.org/r/20240320180429.678181-1-hannes@cmpxchg.org +Link: https://lkml.kernel.org/r/20240320180429.678181-2-hannes@cmpxchg.org +Signed-off-by: Johannes Weiner +Acked-by: Zi Yan +Reviewed-by: Vlastimil Babka +Acked-by: Mel Gorman +Tested-by: "Huang, Ying" +Tested-by: Baolin Wang +Cc: David Hildenbrand +Signed-off-by: Andrew Morton +Conflicts: + mm/page_alloc.c +[ Context conflicts with commit 62b208c4859c and ae577de78c12. ] +Signed-off-by: Liu Shixin +--- + mm/page_alloc.c | 66 +++++++++++-------------------------------------- + 1 file changed, 14 insertions(+), 52 deletions(-) + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index ff0940ab0fe6..3d4932cd2332 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -207,24 +207,6 @@ EXPORT_SYMBOL(node_states); + + gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; + +-/* +- * A cached value of the page's pageblock's migratetype, used when the page is +- * put on a pcplist. Used to avoid the pageblock migratetype lookup when +- * freeing from pcplists in most cases, at the cost of possibly becoming stale. +- * Also the migratetype set in the page does not necessarily match the pcplist +- * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any +- * other index - this ensures that it will be put on the correct CMA freelist. +- */ +-static inline int get_pcppage_migratetype(struct page *page) +-{ +- return page->index; +-} +- +-static inline void set_pcppage_migratetype(struct page *page, int migratetype) +-{ +- page->index = migratetype; +-} +- + #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE + unsigned int pageblock_order __read_mostly; + #endif +@@ -1186,7 +1168,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, + { + unsigned long flags; + unsigned int order; +- bool isolated_pageblocks; + struct page *page; + + /* +@@ -1199,7 +1180,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, + pindex = pindex - 1; + + spin_lock_irqsave(&zone->lock, flags); +- isolated_pageblocks = has_isolate_pageblock(zone); + + while (count > 0) { + struct list_head *list; +@@ -1215,23 +1195,19 @@ static void free_pcppages_bulk(struct zone *zone, int count, + order = pindex_to_order(pindex); + nr_pages = 1 << order; + do { ++ unsigned long pfn; + int mt; + + page = list_last_entry(list, struct page, pcp_list); +- mt = get_pcppage_migratetype(page); ++ pfn = page_to_pfn(page); ++ mt = get_pfnblock_migratetype(page, pfn); + + /* must delete to avoid corrupting pcp list */ + list_del(&page->pcp_list); + count -= nr_pages; + pcp->count -= nr_pages; + +- /* MIGRATE_ISOLATE page should not go to pcplists */ +- VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); +- /* Pageblock could have been isolated meanwhile */ +- if (unlikely(isolated_pageblocks)) +- mt = get_pageblock_migratetype(page); +- +- __free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE); ++ __free_one_page(page, pfn, zone, order, mt, FPI_NONE); + trace_mm_page_pcpu_drain(page, order, mt); + } while (count > 0 && !list_empty(list)); + } +@@ -1591,7 +1567,6 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, + continue; + del_page_from_free_list(page, zone, current_order); + expand(zone, page, order, current_order, migratetype); +- set_pcppage_migratetype(page, migratetype); + trace_mm_page_alloc_zone_locked(page, order, migratetype, + pcp_allowed_order(order) && + migratetype < MIGRATE_PCPTYPES); +@@ -2162,7 +2137,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, + * pages are ordered 
properly. + */ + list_add_tail(&page->pcp_list, list); +- if (is_migrate_cma(get_pcppage_migratetype(page))) ++ if (is_migrate_cma(get_pageblock_migratetype(page))) + __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, + -(1 << order)); + } +@@ -2362,19 +2337,6 @@ void drain_all_pages(struct zone *zone) + __drain_all_pages(zone, false); + } + +-static bool free_unref_page_prepare(struct page *page, unsigned long pfn, +- unsigned int order) +-{ +- int migratetype; +- +- if (!free_pages_prepare(page, order)) +- return false; +- +- migratetype = get_pfnblock_migratetype(page, pfn); +- set_pcppage_migratetype(page, migratetype); +- return true; +-} +- + static int nr_pcp_free(struct per_cpu_pages *pcp, int batch, int high, bool free_high) + { + int min_nr_free, max_nr_free; +@@ -2517,7 +2479,7 @@ void free_unref_page(struct page *page, unsigned int order) + return; + } + +- if (!free_unref_page_prepare(page, pfn, order)) ++ if (!free_pages_prepare(page, order)) + return; + + /* +@@ -2527,7 +2489,7 @@ void free_unref_page(struct page *page, unsigned int order) + * get those areas back if necessary. Otherwise, we may have to free + * excessively into the page allocator + */ +- migratetype = pcpmigratetype = get_pcppage_migratetype(page); ++ migratetype = pcpmigratetype = get_pfnblock_migratetype(page, pfn); + if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { + if (unlikely(is_migrate_isolate(migratetype))) { + free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE); +@@ -2570,14 +2532,14 @@ void free_unref_folios(struct folio_batch *folios) + } + + folio_undo_large_rmappable(folio); +- if (!free_unref_page_prepare(&folio->page, pfn, order)) ++ if (!free_pages_prepare(&folio->page, order)) + continue; + + /* + * Free isolated folios and orders not handled on the PCP + * directly to the allocator, see comment in free_unref_page. 
+ */ +- migratetype = get_pcppage_migratetype(&folio->page); ++ migratetype = get_pfnblock_migratetype(&folio->page, pfn); + if (!pcp_allowed_order(order) || + is_migrate_isolate(migratetype)) { + free_one_page(folio_zone(folio), &folio->page, pfn, +@@ -2594,10 +2556,11 @@ void free_unref_folios(struct folio_batch *folios) + for (i = 0; i < folios->nr; i++) { + struct folio *folio = folios->folios[i]; + struct zone *zone = folio_zone(folio); ++ unsigned long pfn = folio_pfn(folio); + unsigned int order = (unsigned long)folio->private; + + folio->private = NULL; +- migratetype = get_pcppage_migratetype(&folio->page); ++ migratetype = get_pfnblock_migratetype(&folio->page, pfn); + + /* Different zone requires a different pcp lock */ + if (zone != locked_zone) { +@@ -2614,9 +2577,8 @@ void free_unref_folios(struct folio_batch *folios) + pcp = pcp_spin_trylock(zone->per_cpu_pageset); + if (unlikely(!pcp)) { + pcp_trylock_finish(UP_flags); +- free_one_page(zone, &folio->page, +- folio_pfn(folio), order, +- migratetype, FPI_NONE); ++ free_one_page(zone, &folio->page, pfn, ++ order, migratetype, FPI_NONE); + locked_zone = NULL; + continue; + } +@@ -2785,7 +2747,7 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, + } + } + __mod_zone_freepage_state(zone, -(1 << order), +- get_pcppage_migratetype(page)); ++ get_pageblock_migratetype(page)); + spin_unlock_irqrestore(&zone->lock, flags); + } while (check_new_pages(page, order)); + +-- +Gitee + + +From b1ab4c1538a8daf9ec62d7464b039bd2231a50c3 Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Wed, 18 Dec 2024 17:34:48 +0800 +Subject: [PATCH 05/19] mm: page_alloc: optimize free_unref_folios() + +mainline inclusion +from mainline-v6.10-rc1 +commit 9cbe97bad5cd75b5b493734bd2695febb8e95281 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC4T0 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9cbe97bad5cd75b5b493734bd2695febb8e95281 + +-------------------------------- + +Move direct freeing of isolated pages to the lock-breaking block in the +second loop. This saves an unnecessary migratetype reassessment. + +Minor comment and local variable scoping cleanups. + +Link: https://lkml.kernel.org/r/20240320180429.678181-3-hannes@cmpxchg.org +Signed-off-by: Johannes Weiner +Suggested-by: Vlastimil Babka +Tested-by: "Huang, Ying" +Reviewed-by: Vlastimil Babka +Tested-by: Baolin Wang +Cc: David Hildenbrand +Cc: Mel Gorman +Cc: Zi Yan +Signed-off-by: Andrew Morton +Conflicts: + mm/page_alloc.c +[ Context conflict with commit ae577de78c12. ] +Signed-off-by: Liu Shixin +--- + mm/page_alloc.c | 32 +++++++++++++++++++++++--------- + 1 file changed, 23 insertions(+), 9 deletions(-) + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 3d4932cd2332..dad180df69da 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -2518,7 +2518,7 @@ void free_unref_folios(struct folio_batch *folios) + unsigned long __maybe_unused UP_flags; + struct per_cpu_pages *pcp = NULL; + struct zone *locked_zone = NULL; +- int i, j, migratetype; ++ int i, j; + + /* Prepare folios for freeing */ + for (i = 0, j = 0; i < folios->nr; i++) { +@@ -2534,14 +2534,15 @@ void free_unref_folios(struct folio_batch *folios) + folio_undo_large_rmappable(folio); + if (!free_pages_prepare(&folio->page, order)) + continue; +- + /* +- * Free isolated folios and orders not handled on the PCP +- * directly to the allocator, see comment in free_unref_page. 
++ * Free orders not handled on the PCP directly to the ++ * allocator. + */ +- migratetype = get_pfnblock_migratetype(&folio->page, pfn); +- if (!pcp_allowed_order(order) || +- is_migrate_isolate(migratetype)) { ++ if (!pcp_allowed_order(order)) { ++ int migratetype; ++ ++ migratetype = get_pfnblock_migratetype(&folio->page, ++ pfn); + free_one_page(folio_zone(folio), &folio->page, pfn, + order, migratetype, FPI_NONE); + continue; +@@ -2558,15 +2559,29 @@ void free_unref_folios(struct folio_batch *folios) + struct zone *zone = folio_zone(folio); + unsigned long pfn = folio_pfn(folio); + unsigned int order = (unsigned long)folio->private; ++ int migratetype; + + folio->private = NULL; + migratetype = get_pfnblock_migratetype(&folio->page, pfn); + + /* Different zone requires a different pcp lock */ +- if (zone != locked_zone) { ++ if (zone != locked_zone || ++ is_migrate_isolate(migratetype)) { + if (pcp) { + pcp_spin_unlock(pcp); + pcp_trylock_finish(UP_flags); ++ locked_zone = NULL; ++ pcp = NULL; ++ } ++ ++ /* ++ * Free isolated pages directly to the ++ * allocator, see comment in free_unref_page. ++ */ ++ if (is_migrate_isolate(migratetype)) { ++ free_one_page(zone, &folio->page, pfn, ++ order, migratetype, FPI_NONE); ++ continue; + } + + /* +@@ -2579,7 +2594,6 @@ void free_unref_folios(struct folio_batch *folios) + pcp_trylock_finish(UP_flags); + free_one_page(zone, &folio->page, pfn, + order, migratetype, FPI_NONE); +- locked_zone = NULL; + continue; + } + locked_zone = zone; +-- +Gitee + + +From e9e8af9de46a8bdb6b9d79156f35e8d12e3a62a7 Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Wed, 18 Dec 2024 17:34:49 +0800 +Subject: [PATCH 06/19] mm: page_alloc: fix up block types when merging + compatible blocks + +mainline inclusion +from mainline-v6.10-rc1 +commit e6cf9e1c4cde8a53385423ecb8ca581097f42e02 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC4T0 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e6cf9e1c4cde8a53385423ecb8ca581097f42e02 + +-------------------------------- + +The buddy allocator coalesces compatible blocks during freeing, but it +doesn't update the types of the subblocks to match. When an allocation +later breaks the chunk down again, its pieces will be put on freelists of +the wrong type. This encourages incompatible page mixing (ask for one +type, get another), and thus long-term fragmentation. + +Update the subblocks when merging a larger chunk, such that a later +expand() will maintain freelist type hygiene. 
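+
+As a concrete illustration (hypothetical pfns, assuming pageblock_order == 9,
+i.e. 512-page blocks): an order-9 MOVABLE buddy at pfn 0x1000 is freed and
+merges with a free order-9 UNMOVABLE buddy at pfn 0x1200 into an order-10
+page on the MOVABLE freelist. Without this fix the 0x1200 block keeps its
+UNMOVABLE type, so when an allocation later splits that order-10 page,
+expand() puts the 0x1200 half back on MOVABLE freelists while its pageblock
+still says UNMOVABLE, which is exactly the incompatible mixing described
+above. Matching the buddy's pageblock type before merging keeps the later
+expand() consistent.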
+ +Link: https://lkml.kernel.org/r/20240320180429.678181-4-hannes@cmpxchg.org +Signed-off-by: Johannes Weiner +Reviewed-by: Zi Yan +Reviewed-by: Vlastimil Babka +Acked-by: Mel Gorman +Tested-by: "Huang, Ying" +Tested-by: Baolin Wang +Cc: David Hildenbrand +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/page_alloc.c | 15 +++++++++++---- + 1 file changed, 11 insertions(+), 4 deletions(-) + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index dad180df69da..3d7e0f110868 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -779,10 +779,17 @@ static inline void __free_one_page(struct page *page, + */ + int buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn); + +- if (migratetype != buddy_mt +- && (!migratetype_is_mergeable(migratetype) || +- !migratetype_is_mergeable(buddy_mt))) +- goto done_merging; ++ if (migratetype != buddy_mt) { ++ if (!migratetype_is_mergeable(migratetype) || ++ !migratetype_is_mergeable(buddy_mt)) ++ goto done_merging; ++ /* ++ * Match buddy type. This ensures that ++ * an expand() down the line puts the ++ * sub-blocks on the right freelists. ++ */ ++ set_pageblock_migratetype(buddy, migratetype); ++ } + } + + /* +-- +Gitee + + +From c5babea33c4fe82208c95895869468ec022de6e6 Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Wed, 18 Dec 2024 17:34:50 +0800 +Subject: [PATCH 07/19] mm: page_alloc: move free pages when converting block + during isolation + +mainline inclusion +from mainline-v6.10-rc1 +commit b54ccd3c6bacbc571f7e61797fb5ff9fe3861413 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC4T0 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b54ccd3c6bacbc571f7e61797fb5ff9fe3861413 + +-------------------------------- + +When claiming a block during compaction isolation, move any remaining free +pages to the correct freelists as well, instead of stranding them on the +wrong list. Otherwise, this encourages incompatible page mixing down the +line, and thus long-term fragmentation. 
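+
+Condensed, the invariant this and the following patches work toward is that
+a pageblock's type and the freelists its free pages sit on are updated
+together under zone->lock. A sketch of the pattern, mirroring the
+__isolate_free_page() hunk below:
+
+    if (migratetype_is_mergeable(mt)) {
+            set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+            move_freepages_block(zone, page, MIGRATE_MOVABLE, NULL);
+    }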
+ +Link: https://lkml.kernel.org/r/20240320180429.678181-5-hannes@cmpxchg.org +Signed-off-by: Johannes Weiner +Reviewed-by: Zi Yan +Reviewed-by: Vlastimil Babka +Acked-by: Mel Gorman +Tested-by: "Huang, Ying" +Tested-by: Baolin Wang +Cc: David Hildenbrand +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/page_alloc.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 3d7e0f110868..7e3593318813 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -2681,9 +2681,12 @@ int __isolate_free_page(struct page *page, unsigned int order) + * Only change normal pageblocks (i.e., they can merge + * with others) + */ +- if (migratetype_is_mergeable(mt)) ++ if (migratetype_is_mergeable(mt)) { + set_pageblock_migratetype(page, + MIGRATE_MOVABLE); ++ move_freepages_block(zone, page, ++ MIGRATE_MOVABLE, NULL); ++ } + } + } + +-- +Gitee + + +From eb6b987628f29ba04e6232b6132f8865f3115c4d Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Wed, 18 Dec 2024 17:34:51 +0800 +Subject: [PATCH 08/19] mm: page_alloc: fix move_freepages_block() range error + +mainline inclusion +from mainline-v6.10-rc1 +commit 2dd482ba627de15d67f0c0ed445133c8ae9b201b +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC4T0 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2dd482ba627de15d67f0c0ed445133c8ae9b201b + +-------------------------------- + +When a block is partially outside the zone of the cursor page, the +function cuts the range to the pivot page instead of the zone start. This +can leave large parts of the block behind, which encourages incompatible +page mixing down the line (ask for one type, get another), and thus +long-term fragmentation. + +This triggers reliably on the first block in the DMA zone, whose start_pfn +is 1. The block is stolen, but everything before the pivot page (which +was often hundreds of pages) is left on the old list. + +Link: https://lkml.kernel.org/r/20240320180429.678181-6-hannes@cmpxchg.org +Signed-off-by: Johannes Weiner +Reviewed-by: Vlastimil Babka +Tested-by: Baolin Wang +Cc: David Hildenbrand +Cc: "Huang, Ying" +Cc: Mel Gorman +Cc: Zi Yan +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/page_alloc.c | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 7e3593318813..a101f5f550dc 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -1661,9 +1661,15 @@ int move_freepages_block(struct zone *zone, struct page *page, + start_pfn = pageblock_start_pfn(pfn); + end_pfn = pageblock_end_pfn(pfn) - 1; + +- /* Do not cross zone boundaries */ ++ /* ++ * The caller only has the lock for @zone, don't touch ranges ++ * that straddle into other zones. While we could move part of ++ * the range that's inside the zone, this call is usually ++ * accompanied by other operations such as migratetype updates ++ * which also should be locked. 
++ */ + if (!zone_spans_pfn(zone, start_pfn)) +- start_pfn = pfn; ++ return 0; + if (!zone_spans_pfn(zone, end_pfn)) + return 0; + +-- +Gitee + + +From 3bd6146784a11153ea37b528313c89e160e1fa7c Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Wed, 18 Dec 2024 17:34:52 +0800 +Subject: [PATCH 09/19] mm: page_alloc: fix freelist movement during block + conversion + +mainline inclusion +from mainline-v6.10-rc1 +commit c0cd6f557b9090525d288806cccbc73440ac235a +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC4T0 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c0cd6f557b9090525d288806cccbc73440ac235a + +-------------------------------- + +Currently, page block type conversion during fallbacks, atomic +reservations and isolation can strand various amounts of free pages on +incorrect freelists. + +For example, fallback stealing moves free pages in the block to the new +type's freelists, but then may not actually claim the block for that type +if there aren't enough compatible pages already allocated. + +In all cases, free page moving might fail if the block straddles more than +one zone, in which case no free pages are moved at all, but the block type +is changed anyway. + +This is detrimental to type hygiene on the freelists. It encourages +incompatible page mixing down the line (ask for one type, get another) and +thus contributes to long-term fragmentation. + +Split the process into a proper transaction: check first if conversion +will happen, then try to move the free pages, and only if that was +successful convert the block to the new type. + +[baolin.wang@linux.alibaba.com: fix allocation failures with CONFIG_CMA] + Link: https://lkml.kernel.org/r/a97697e0-45b0-4f71-b087-fdc7a1d43c0e@linux.alibaba.com +Link: https://lkml.kernel.org/r/20240320180429.678181-7-hannes@cmpxchg.org +Signed-off-by: Johannes Weiner +Signed-off-by: Baolin Wang +Tested-by: "Huang, Ying" +Reviewed-by: Vlastimil Babka +Tested-by: Baolin Wang +Cc: David Hildenbrand +Cc: Mel Gorman +Cc: Zi Yan +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + include/linux/page-isolation.h | 3 +- + mm/page_alloc.c | 174 ++++++++++++++++++++------------- + mm/page_isolation.c | 22 +++-- + 3 files changed, 121 insertions(+), 78 deletions(-) + +diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h +index 4ac34392823a..8550b3c91480 100644 +--- a/include/linux/page-isolation.h ++++ b/include/linux/page-isolation.h +@@ -34,8 +34,7 @@ static inline bool is_migrate_isolate(int migratetype) + #define REPORT_FAILURE 0x2 + + void set_pageblock_migratetype(struct page *page, int migratetype); +-int move_freepages_block(struct zone *zone, struct page *page, +- int migratetype, int *num_movable); ++int move_freepages_block(struct zone *zone, struct page *page, int migratetype); + + int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, + int migratetype, int flags, gfp_t gfp_flags); +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index a101f5f550dc..ba85db6cf987 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -1612,9 +1612,8 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone, + * Note that start_page and end_pages are not aligned on a pageblock + * boundary. 
If alignment is required, use move_freepages_block() + */ +-static int move_freepages(struct zone *zone, +- unsigned long start_pfn, unsigned long end_pfn, +- int migratetype, int *num_movable) ++static int move_freepages(struct zone *zone, unsigned long start_pfn, ++ unsigned long end_pfn, int migratetype) + { + struct page *page; + unsigned long pfn; +@@ -1624,14 +1623,6 @@ static int move_freepages(struct zone *zone, + for (pfn = start_pfn; pfn <= end_pfn;) { + page = pfn_to_page(pfn); + if (!PageBuddy(page)) { +- /* +- * We assume that pages that could be isolated for +- * migration are movable. But we don't actually try +- * isolating, as that would be expensive. +- */ +- if (num_movable && +- (PageLRU(page) || __PageMovable(page))) +- (*num_movable)++; + pfn++; + continue; + } +@@ -1649,17 +1640,16 @@ static int move_freepages(struct zone *zone, + return pages_moved; + } + +-int move_freepages_block(struct zone *zone, struct page *page, +- int migratetype, int *num_movable) ++static bool prep_move_freepages_block(struct zone *zone, struct page *page, ++ unsigned long *start_pfn, ++ unsigned long *end_pfn, ++ int *num_free, int *num_movable) + { +- unsigned long start_pfn, end_pfn, pfn; +- +- if (num_movable) +- *num_movable = 0; ++ unsigned long pfn, start, end; + + pfn = page_to_pfn(page); +- start_pfn = pageblock_start_pfn(pfn); +- end_pfn = pageblock_end_pfn(pfn) - 1; ++ start = pageblock_start_pfn(pfn); ++ end = pageblock_end_pfn(pfn) - 1; + + /* + * The caller only has the lock for @zone, don't touch ranges +@@ -1668,13 +1658,50 @@ int move_freepages_block(struct zone *zone, struct page *page, + * accompanied by other operations such as migratetype updates + * which also should be locked. + */ +- if (!zone_spans_pfn(zone, start_pfn)) +- return 0; +- if (!zone_spans_pfn(zone, end_pfn)) +- return 0; ++ if (!zone_spans_pfn(zone, start)) ++ return false; ++ if (!zone_spans_pfn(zone, end)) ++ return false; ++ ++ *start_pfn = start; ++ *end_pfn = end; ++ ++ if (num_free) { ++ *num_free = 0; ++ *num_movable = 0; ++ for (pfn = start; pfn <= end;) { ++ page = pfn_to_page(pfn); ++ if (PageBuddy(page)) { ++ int nr = 1 << buddy_order(page); ++ ++ *num_free += nr; ++ pfn += nr; ++ continue; ++ } ++ /* ++ * We assume that pages that could be isolated for ++ * migration are movable. But we don't actually try ++ * isolating, as that would be expensive. ++ */ ++ if (PageLRU(page) || __PageMovable(page)) ++ (*num_movable)++; ++ pfn++; ++ } ++ } ++ ++ return true; ++} ++ ++int move_freepages_block(struct zone *zone, struct page *page, ++ int migratetype) ++{ ++ unsigned long start_pfn, end_pfn; ++ ++ if (!prep_move_freepages_block(zone, page, &start_pfn, &end_pfn, ++ NULL, NULL)) ++ return -1; + +- return move_freepages(zone, start_pfn, end_pfn, migratetype, +- num_movable); ++ return move_freepages(zone, start_pfn, end_pfn, migratetype); + } + + static void change_pageblock_range(struct page *pageblock_page, +@@ -1759,33 +1786,37 @@ static inline bool boost_watermark(struct zone *zone) + } + + /* +- * This function implements actual steal behaviour. If order is large enough, +- * we can steal whole pageblock. If not, we first move freepages in this +- * pageblock to our migratetype and determine how many already-allocated pages +- * are there in the pageblock with a compatible migratetype. If at least half +- * of pages are free or compatible, we can change migratetype of the pageblock +- * itself, so pages freed in the future will be put on the correct free list. 
++ * This function implements actual steal behaviour. If order is large enough, we ++ * can claim the whole pageblock for the requested migratetype. If not, we check ++ * the pageblock for constituent pages; if at least half of the pages are free ++ * or compatible, we can still claim the whole block, so pages freed in the ++ * future will be put on the correct free list. Otherwise, we isolate exactly ++ * the order we need from the fallback block and leave its migratetype alone. + */ +-static void steal_suitable_fallback(struct zone *zone, struct page *page, +- unsigned int alloc_flags, int start_type, bool whole_block) ++static struct page * ++steal_suitable_fallback(struct zone *zone, struct page *page, ++ int current_order, int order, int start_type, ++ unsigned int alloc_flags, bool whole_block) + { +- unsigned int current_order = buddy_order(page); + int free_pages, movable_pages, alike_pages; +- int old_block_type; ++ unsigned long start_pfn, end_pfn; ++ int block_type; + +- old_block_type = get_pageblock_migratetype(page); ++ block_type = get_pageblock_migratetype(page); + + /* + * This can happen due to races and we want to prevent broken + * highatomic accounting. + */ +- if (is_migrate_highatomic(old_block_type)) ++ if (is_migrate_highatomic(block_type)) + goto single_page; + + /* Take ownership for orders >= pageblock_order */ + if (current_order >= pageblock_order) { ++ del_page_from_free_list(page, zone, current_order); + change_pageblock_range(page, current_order, start_type); +- goto single_page; ++ expand(zone, page, order, current_order, start_type); ++ return page; + } + + /* +@@ -1800,10 +1831,9 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, + if (!whole_block) + goto single_page; + +- free_pages = move_freepages_block(zone, page, start_type, +- &movable_pages); + /* moving whole block can fail due to zone boundary conditions */ +- if (!free_pages) ++ if (!prep_move_freepages_block(zone, page, &start_pfn, &end_pfn, ++ &free_pages, &movable_pages)) + goto single_page; + + /* +@@ -1821,7 +1851,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, + * vice versa, be conservative since we can't distinguish the + * exact migratetype of non-movable pages. + */ +- if (old_block_type == MIGRATE_MOVABLE) ++ if (block_type == MIGRATE_MOVABLE) + alike_pages = pageblock_nr_pages + - (free_pages + movable_pages); + else +@@ -1832,13 +1862,16 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, + * compatible migratability as our allocation, claim the whole block. 
+ */ + if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || +- page_group_by_mobility_disabled) ++ page_group_by_mobility_disabled) { ++ move_freepages(zone, start_pfn, end_pfn, start_type); + set_pageblock_migratetype(page, start_type); +- +- return; ++ return __rmqueue_smallest(zone, order, start_type); ++ } + + single_page: +- move_to_free_list(page, zone, current_order, start_type); ++ del_page_from_free_list(page, zone, current_order); ++ expand(zone, page, order, current_order, block_type); ++ return page; + } + + /* +@@ -1906,9 +1939,10 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone) + mt = get_pageblock_migratetype(page); + /* Only reserve normal pageblocks (i.e., they can merge with others) */ + if (migratetype_is_mergeable(mt)) { +- zone->nr_reserved_highatomic += pageblock_nr_pages; +- set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); +- move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL); ++ if (move_freepages_block(zone, page, MIGRATE_HIGHATOMIC) != -1) { ++ set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); ++ zone->nr_reserved_highatomic += pageblock_nr_pages; ++ } + } + + out_unlock: +@@ -1933,7 +1967,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, + struct zone *zone; + struct page *page; + int order; +- bool ret; ++ int ret; + + for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx, + ac->nodemask) { +@@ -1982,10 +2016,14 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, + * of pageblocks that cannot be completely freed + * may increase. + */ ++ ret = move_freepages_block(zone, page, ac->migratetype); ++ /* ++ * Reserving this block already succeeded, so this should ++ * not fail on zone boundaries. ++ */ ++ WARN_ON_ONCE(ret == -1); + set_pageblock_migratetype(page, ac->migratetype); +- ret = move_freepages_block(zone, page, ac->migratetype, +- NULL); +- if (ret) { ++ if (ret > 0) { + spin_unlock_irqrestore(&zone->lock, flags); + return ret; + } +@@ -2006,7 +2044,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, + * deviation from the rest of this file, to make the for loop + * condition simpler. 
+ */ +-static __always_inline bool ++static __always_inline struct page * + __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, + unsigned int alloc_flags) + { +@@ -2053,7 +2091,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, + goto do_steal; + } + +- return false; ++ return NULL; + + find_smallest: + for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) { +@@ -2073,14 +2111,14 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, + do_steal: + page = get_page_from_free_area(area, fallback_mt); + +- steal_suitable_fallback(zone, page, alloc_flags, start_migratetype, +- can_steal); ++ /* take off list, maybe claim block, expand remainder */ ++ page = steal_suitable_fallback(zone, page, current_order, order, ++ start_migratetype, alloc_flags, can_steal); + + trace_mm_page_alloc_extfrag(page, order, current_order, + start_migratetype, fallback_mt); + +- return true; +- ++ return page; + } + + /* +@@ -2107,15 +2145,15 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype, + return page; + } + } +-retry: ++ + page = __rmqueue_smallest(zone, order, migratetype); + if (unlikely(!page)) { + if (alloc_flags & ALLOC_CMA) + page = __rmqueue_cma_fallback(zone, order); + +- if (!page && __rmqueue_fallback(zone, order, migratetype, +- alloc_flags)) +- goto retry; ++ if (!page) ++ page = __rmqueue_fallback(zone, order, migratetype, ++ alloc_flags); + } + return page; + } +@@ -2687,12 +2725,10 @@ int __isolate_free_page(struct page *page, unsigned int order) + * Only change normal pageblocks (i.e., they can merge + * with others) + */ +- if (migratetype_is_mergeable(mt)) { +- set_pageblock_migratetype(page, +- MIGRATE_MOVABLE); +- move_freepages_block(zone, page, +- MIGRATE_MOVABLE, NULL); +- } ++ if (migratetype_is_mergeable(mt) && ++ move_freepages_block(zone, page, ++ MIGRATE_MOVABLE) != -1) ++ set_pageblock_migratetype(page, MIGRATE_MOVABLE); + } + } + +diff --git a/mm/page_isolation.c b/mm/page_isolation.c +index 03381be87b28..b27ed476f80e 100644 +--- a/mm/page_isolation.c ++++ b/mm/page_isolation.c +@@ -179,15 +179,18 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ + unmovable = has_unmovable_pages(check_unmovable_start, check_unmovable_end, + migratetype, isol_flags); + if (!unmovable) { +- unsigned long nr_pages; ++ int nr_pages; + int mt = get_pageblock_migratetype(page); + ++ nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); ++ /* Block spans zone boundaries? */ ++ if (nr_pages == -1) { ++ spin_unlock_irqrestore(&zone->lock, flags); ++ return -EBUSY; ++ } ++ __mod_zone_freepage_state(zone, -nr_pages, mt); + set_pageblock_migratetype(page, MIGRATE_ISOLATE); + zone->nr_isolate_pageblock++; +- nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE, +- NULL); +- +- __mod_zone_freepage_state(zone, -nr_pages, mt); + spin_unlock_irqrestore(&zone->lock, flags); + return 0; + } +@@ -207,7 +210,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ + static void unset_migratetype_isolate(struct page *page, int migratetype) + { + struct zone *zone; +- unsigned long flags, nr_pages; ++ unsigned long flags; + bool isolated_page = false; + unsigned int order; + struct page *buddy; +@@ -253,7 +256,12 @@ static void unset_migratetype_isolate(struct page *page, int migratetype) + * allocation. 
+ */ + if (!isolated_page) { +- nr_pages = move_freepages_block(zone, page, migratetype, NULL); ++ int nr_pages = move_freepages_block(zone, page, migratetype); ++ /* ++ * Isolating this block already succeeded, so this ++ * should not fail on zone boundaries. ++ */ ++ WARN_ON_ONCE(nr_pages == -1); + __mod_zone_freepage_state(zone, nr_pages, migratetype); + } + set_pageblock_migratetype(page, migratetype); +-- +Gitee + + +From b4ab6afae98c5b97e2b6a5681ea47dd87d833c5d Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Wed, 18 Dec 2024 17:34:53 +0800 +Subject: [PATCH 10/19] mm: page_alloc: close migratetype race between freeing + and stealing + +mainline inclusion +from mainline-v6.10-rc1 +commit 55612e80e722ac554cc5e80df05555b4f8d40c37 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC4T0 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=55612e80e722ac554cc5e80df05555b4f8d40c37 + +-------------------------------- + +There are three freeing paths that read the page's migratetype +optimistically before grabbing the zone lock. When this races with block +stealing, those pages go on the wrong freelist. + +The paths in question are: +- when freeing >costly orders that aren't THP +- when freeing pages to the buddy upon pcp lock contention +- when freeing pages that are isolated +- when freeing pages initially during boot +- when freeing the remainder in alloc_pages_exact() +- when "accepting" unaccepted VM host memory before first use +- when freeing pages during unpoisoning + +None of these are so hot that they would need this optimization at the +cost of hampering defrag efforts. Especially when contrasted with the +fact that the most common buddy freeing path - free_pcppages_bulk - is +checking the migratetype under the zone->lock just fine. + +In addition, isolated pages need to look up the migratetype under the lock +anyway, which adds branches to the locked section, and results in a double +lookup when the pages are in fact isolated. + +Move the lookups into the lock. + +Link: https://lkml.kernel.org/r/20240320180429.678181-8-hannes@cmpxchg.org +Signed-off-by: Johannes Weiner +Reported-by: Vlastimil Babka +Reviewed-by: Vlastimil Babka +Tested-by: Baolin Wang +Cc: David Hildenbrand +Cc: "Huang, Ying" +Cc: Mel Gorman +Cc: Zi Yan +Signed-off-by: Andrew Morton +Conflicts: + mm/page_alloc.c +[ Context conflict with commit 2ae116c3257d. 
] +Signed-off-by: Liu Shixin +--- + mm/page_alloc.c | 52 ++++++++++++++++++------------------------------- + 1 file changed, 19 insertions(+), 33 deletions(-) + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index ba85db6cf987..cae51fd0b7b2 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -1222,18 +1222,15 @@ static void free_pcppages_bulk(struct zone *zone, int count, + spin_unlock_irqrestore(&zone->lock, flags); + } + +-static void free_one_page(struct zone *zone, +- struct page *page, unsigned long pfn, +- unsigned int order, +- int migratetype, fpi_t fpi_flags) ++static void free_one_page(struct zone *zone, struct page *page, ++ unsigned long pfn, unsigned int order, ++ fpi_t fpi_flags) + { + unsigned long flags; ++ int migratetype; + + spin_lock_irqsave(&zone->lock, flags); +- if (unlikely(has_isolate_pageblock(zone) || +- is_migrate_isolate(migratetype))) { +- migratetype = get_pfnblock_migratetype(page, pfn); +- } ++ migratetype = get_pfnblock_migratetype(page, pfn); + __free_one_page(page, pfn, zone, order, migratetype, fpi_flags); + spin_unlock_irqrestore(&zone->lock, flags); + } +@@ -1241,21 +1238,13 @@ static void free_one_page(struct zone *zone, + static void __free_pages_ok(struct page *page, unsigned int order, + fpi_t fpi_flags) + { +- int migratetype; + unsigned long pfn = page_to_pfn(page); + struct zone *zone = page_zone(page); + + if (!free_pages_prepare(page, order)) + return; + +- /* +- * Calling get_pfnblock_migratetype() without spin_lock_irqsave() here +- * is used to avoid calling get_pfnblock_migratetype() under the lock. +- * This will reduce the lock holding time. +- */ +- migratetype = get_pfnblock_migratetype(page, pfn); +- +- free_one_page(zone, page, pfn, order, migratetype, fpi_flags); ++ free_one_page(zone, page, pfn, order, fpi_flags); + + __count_vm_events(PGFREE, 1 << order); + } +@@ -2518,7 +2507,7 @@ void free_unref_page(struct page *page, unsigned int order) + struct per_cpu_pages *pcp; + struct zone *zone; + unsigned long pfn = page_to_pfn(page); +- int migratetype, pcpmigratetype; ++ int migratetype; + + if (page_from_dynamic_pool(page)) { + dynamic_pool_free_page(page); +@@ -2540,23 +2529,23 @@ void free_unref_page(struct page *page, unsigned int order) + * get those areas back if necessary. Otherwise, we may have to free + * excessively into the page allocator + */ +- migratetype = pcpmigratetype = get_pfnblock_migratetype(page, pfn); ++ migratetype = get_pfnblock_migratetype(page, pfn); + if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { + if (unlikely(is_migrate_isolate(migratetype))) { +- free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE); ++ free_one_page(page_zone(page), page, pfn, order, FPI_NONE); + return; + } +- pcpmigratetype = MIGRATE_MOVABLE; ++ migratetype = MIGRATE_MOVABLE; + } + + zone = page_zone(page); + pcp_trylock_prepare(UP_flags); + pcp = pcp_spin_trylock(zone->per_cpu_pageset); + if (pcp) { +- free_unref_page_commit(zone, pcp, page, pcpmigratetype, order); ++ free_unref_page_commit(zone, pcp, page, migratetype, order); + pcp_spin_unlock(pcp); + } else { +- free_one_page(zone, page, pfn, order, migratetype, FPI_NONE); ++ free_one_page(zone, page, pfn, order, FPI_NONE); + } + pcp_trylock_finish(UP_flags); + } +@@ -2590,12 +2579,8 @@ void free_unref_folios(struct folio_batch *folios) + * allocator. 
+ */ + if (!pcp_allowed_order(order)) { +- int migratetype; +- +- migratetype = get_pfnblock_migratetype(&folio->page, +- pfn); +- free_one_page(folio_zone(folio), &folio->page, pfn, +- order, migratetype, FPI_NONE); ++ free_one_page(folio_zone(folio), &folio->page, ++ pfn, order, FPI_NONE); + continue; + } + folio->private = (void *)(unsigned long)order; +@@ -2631,7 +2616,7 @@ void free_unref_folios(struct folio_batch *folios) + */ + if (is_migrate_isolate(migratetype)) { + free_one_page(zone, &folio->page, pfn, +- order, migratetype, FPI_NONE); ++ order, FPI_NONE); + continue; + } + +@@ -2644,7 +2629,7 @@ void free_unref_folios(struct folio_batch *folios) + if (unlikely(!pcp)) { + pcp_trylock_finish(UP_flags); + free_one_page(zone, &folio->page, pfn, +- order, migratetype, FPI_NONE); ++ order, FPI_NONE); + continue; + } + locked_zone = zone; +@@ -7022,13 +7007,14 @@ bool take_page_off_buddy(struct page *page) + bool put_page_back_buddy(struct page *page) + { + struct zone *zone = page_zone(page); +- unsigned long pfn = page_to_pfn(page); + unsigned long flags; +- int migratetype = get_pfnblock_migratetype(page, pfn); + bool ret = false; + + spin_lock_irqsave(&zone->lock, flags); + if (put_page_testzero(page)) { ++ unsigned long pfn = page_to_pfn(page); ++ int migratetype = get_pfnblock_migratetype(page, pfn); ++ + ClearPageHWPoisonTakenOff(page); + __free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE); + if (TestClearPageHWPoison(page)) { +-- +Gitee + + +From 4f118ff5ee3c0ddc3b921a8f9bcabfe48fa2dab9 Mon Sep 17 00:00:00 2001 +From: Zi Yan +Date: Wed, 18 Dec 2024 17:34:54 +0800 +Subject: [PATCH 11/19] mm: page_alloc: set migratetype inside move_freepages() + +mainline inclusion +from mainline-v6.10-rc1 +commit f37c0f6876a8eabe1477c87860460bc181f6cdbb +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC4T0 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f37c0f6876a8eabe1477c87860460bc181f6cdbb + +-------------------------------- + +This avoids changing migratetype after move_freepages() or +move_freepages_block(), which is error prone. It also prepares for +upcoming changes to fix move_freepages() not moving free pages partially +in the range. + +Link: https://lkml.kernel.org/r/20240320180429.678181-9-hannes@cmpxchg.org +Signed-off-by: Zi Yan +Signed-off-by: Johannes Weiner +Reviewed-by: Vlastimil Babka +Tested-by: Baolin Wang +Cc: David Hildenbrand +Cc: "Huang, Ying" +Cc: Mel Gorman +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/page_alloc.c | 27 +++++++++++++-------------- + mm/page_isolation.c | 7 +++---- + 2 files changed, 16 insertions(+), 18 deletions(-) + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index cae51fd0b7b2..6f59b8e73daa 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -1597,9 +1597,8 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone, + #endif + + /* +- * Move the free pages in a range to the freelist tail of the requested type. +- * Note that start_page and end_pages are not aligned on a pageblock +- * boundary. If alignment is required, use move_freepages_block() ++ * Change the type of a block and move all its free pages to that ++ * type's freelist. 
+ */ + static int move_freepages(struct zone *zone, unsigned long start_pfn, + unsigned long end_pfn, int migratetype) +@@ -1609,6 +1608,9 @@ static int move_freepages(struct zone *zone, unsigned long start_pfn, + unsigned int order; + int pages_moved = 0; + ++ VM_WARN_ON(start_pfn & (pageblock_nr_pages - 1)); ++ VM_WARN_ON(start_pfn + pageblock_nr_pages - 1 != end_pfn); ++ + for (pfn = start_pfn; pfn <= end_pfn;) { + page = pfn_to_page(pfn); + if (!PageBuddy(page)) { +@@ -1626,6 +1628,8 @@ static int move_freepages(struct zone *zone, unsigned long start_pfn, + pages_moved += 1 << order; + } + ++ set_pageblock_migratetype(pfn_to_page(start_pfn), migratetype); ++ + return pages_moved; + } + +@@ -1853,7 +1857,6 @@ steal_suitable_fallback(struct zone *zone, struct page *page, + if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || + page_group_by_mobility_disabled) { + move_freepages(zone, start_pfn, end_pfn, start_type); +- set_pageblock_migratetype(page, start_type); + return __rmqueue_smallest(zone, order, start_type); + } + +@@ -1927,12 +1930,10 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone) + /* Yoink! */ + mt = get_pageblock_migratetype(page); + /* Only reserve normal pageblocks (i.e., they can merge with others) */ +- if (migratetype_is_mergeable(mt)) { +- if (move_freepages_block(zone, page, MIGRATE_HIGHATOMIC) != -1) { +- set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); ++ if (migratetype_is_mergeable(mt)) ++ if (move_freepages_block(zone, page, ++ MIGRATE_HIGHATOMIC) != -1) + zone->nr_reserved_highatomic += pageblock_nr_pages; +- } +- } + + out_unlock: + spin_unlock_irqrestore(&zone->lock, flags); +@@ -2011,7 +2012,6 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, + * not fail on zone boundaries. 
+ */ + WARN_ON_ONCE(ret == -1); +- set_pageblock_migratetype(page, ac->migratetype); + if (ret > 0) { + spin_unlock_irqrestore(&zone->lock, flags); + return ret; +@@ -2710,10 +2710,9 @@ int __isolate_free_page(struct page *page, unsigned int order) + * Only change normal pageblocks (i.e., they can merge + * with others) + */ +- if (migratetype_is_mergeable(mt) && +- move_freepages_block(zone, page, +- MIGRATE_MOVABLE) != -1) +- set_pageblock_migratetype(page, MIGRATE_MOVABLE); ++ if (migratetype_is_mergeable(mt)) ++ move_freepages_block(zone, page, ++ MIGRATE_MOVABLE); + } + } + +diff --git a/mm/page_isolation.c b/mm/page_isolation.c +index b27ed476f80e..8fc4f9491417 100644 +--- a/mm/page_isolation.c ++++ b/mm/page_isolation.c +@@ -189,7 +189,6 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ + return -EBUSY; + } + __mod_zone_freepage_state(zone, -nr_pages, mt); +- set_pageblock_migratetype(page, MIGRATE_ISOLATE); + zone->nr_isolate_pageblock++; + spin_unlock_irqrestore(&zone->lock, flags); + return 0; +@@ -263,10 +262,10 @@ static void unset_migratetype_isolate(struct page *page, int migratetype) + */ + WARN_ON_ONCE(nr_pages == -1); + __mod_zone_freepage_state(zone, nr_pages, migratetype); +- } +- set_pageblock_migratetype(page, migratetype); +- if (isolated_page) ++ } else { ++ set_pageblock_migratetype(page, migratetype); + __putback_isolated_page(page, order, migratetype); ++ } + zone->nr_isolate_pageblock--; + out: + spin_unlock_irqrestore(&zone->lock, flags); +-- +Gitee + + +From 38dfe2413fdbcdafee46edb83704f65c39eb4a74 Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Wed, 18 Dec 2024 17:34:55 +0800 +Subject: [PATCH 12/19] mm: page_isolation: prepare for hygienic freelists + +mainline inclusion +from mainline-v6.10-rc1 +commit fd919a85cd55be5d00a6a7372071f44c8eafb825 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC4T0 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=fd919a85cd55be5d00a6a7372071f44c8eafb825 + +-------------------------------- + +Page isolation currently sets MIGRATE_ISOLATE on a block, then drops +zone->lock and scans the block for straddling buddies to split up. +Because this happens non-atomically wrt the page allocator, it's possible +for allocations to get a buddy whose first block is a regular pcp +migratetype but whose tail is isolated. This means that in certain cases +memory can still be allocated after isolation. It will also trigger the +freelist type hygiene warnings in subsequent patches. + +start_isolate_page_range() + isolate_single_pageblock() + set_migratetype_isolate(tail) + lock zone->lock + move_freepages_block(tail) // nop + set_pageblock_migratetype(tail) + unlock zone->lock + __rmqueue_smallest() + del_page_from_freelist(head) + expand(head, head_mt) + WARN(head_mt != tail_mt) + start_pfn = ALIGN_DOWN(MAX_ORDER_NR_PAGES) + for (pfn = start_pfn, pfn < end_pfn) + if (PageBuddy()) + split_free_page(head) + +Introduce a variant of move_freepages_block() provided by the allocator +specifically for page isolation; it moves free pages, converts the block, +and handles the splitting of straddling buddies while holding zone->lock. + +The allocator knows that pageblocks and buddies are always naturally +aligned, which means that buddies can only straddle blocks if they're +actually >pageblock_order. This means the search-and-split part can be +simplified compared to what page isolation used to do. 
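+
+(Worked example with hypothetical pfns, assuming pageblock_order == 9, i.e.
+512-page blocks: a buddy of order <= 9 occupies a 2^order-aligned range
+whose size divides the block, so it always lies within a single pageblock.
+Only an order-10 or larger buddy, e.g. one covering [0x1000, 0x1400), can
+contain a block boundary such as 0x1200. That is why the find_large_buddy()
+helper added below only has to walk the aligned candidate pfns at increasing
+order instead of scanning pages.)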
+ +Also tighten up the page isolation code around the expectations of which +pages can be large, and how they are freed. + +Based on extensive discussions with and invaluable input from Zi Yan. + +[hannes@cmpxchg.org: work around older gcc warning] + Link: https://lkml.kernel.org/r/20240321142426.GB777580@cmpxchg.org +Link: https://lkml.kernel.org/r/20240320180429.678181-10-hannes@cmpxchg.org +Signed-off-by: Johannes Weiner +Reviewed-by: Vlastimil Babka +Tested-by: Baolin Wang +Cc: David Hildenbrand +Cc: "Huang, Ying" +Cc: Mel Gorman +Cc: Zi Yan +Signed-off-by: Andrew Morton +Conflicts: + mm/internal.h + mm/page_alloc.c + mm/page_isolation.c +[ Context conflict due to miss MAX_PAGE_ORDER. ] +Signed-off-by: Liu Shixin +--- + include/linux/page-isolation.h | 4 +- + mm/internal.h | 4 - + mm/page_alloc.c | 204 +++++++++++++++++++-------------- + mm/page_isolation.c | 106 ++++++----------- + 4 files changed, 155 insertions(+), 163 deletions(-) + +diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h +index 8550b3c91480..c16db0067090 100644 +--- a/include/linux/page-isolation.h ++++ b/include/linux/page-isolation.h +@@ -34,7 +34,9 @@ static inline bool is_migrate_isolate(int migratetype) + #define REPORT_FAILURE 0x2 + + void set_pageblock_migratetype(struct page *page, int migratetype); +-int move_freepages_block(struct zone *zone, struct page *page, int migratetype); ++ ++bool move_freepages_block_isolate(struct zone *zone, struct page *page, ++ int migratetype); + + int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, + int migratetype, int flags, gfp_t gfp_flags); +diff --git a/mm/internal.h b/mm/internal.h +index 0478e5dab55b..de564608dfa6 100644 +--- a/mm/internal.h ++++ b/mm/internal.h +@@ -693,10 +693,6 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align, + void memmap_init_range(unsigned long, int, unsigned long, unsigned long, + unsigned long, enum meminit_context, struct vmem_altmap *, int); + +- +-int split_free_page(struct page *free_page, +- unsigned int order, unsigned long split_pfn_offset); +- + #if defined CONFIG_COMPACTION || defined CONFIG_CMA + + #define MAX_PAGE_ORDER MAX_ORDER +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 6f59b8e73daa..3bc1502e42cf 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -826,64 +826,6 @@ static inline void __free_one_page(struct page *page, + page_reporting_notify_free(order); + } + +-/** +- * split_free_page() -- split a free page at split_pfn_offset +- * @free_page: the original free page +- * @order: the order of the page +- * @split_pfn_offset: split offset within the page +- * +- * Return -ENOENT if the free page is changed, otherwise 0 +- * +- * It is used when the free page crosses two pageblocks with different migratetypes +- * at split_pfn_offset within the page. The split free page will be put into +- * separate migratetype lists afterwards. Otherwise, the function achieves +- * nothing. 
+- */ +-int split_free_page(struct page *free_page, +- unsigned int order, unsigned long split_pfn_offset) +-{ +- struct zone *zone = page_zone(free_page); +- unsigned long free_page_pfn = page_to_pfn(free_page); +- unsigned long pfn; +- unsigned long flags; +- int free_page_order; +- int mt; +- int ret = 0; +- +- if (split_pfn_offset == 0) +- return ret; +- +- spin_lock_irqsave(&zone->lock, flags); +- +- if (!PageBuddy(free_page) || buddy_order(free_page) != order) { +- ret = -ENOENT; +- goto out; +- } +- +- mt = get_pfnblock_migratetype(free_page, free_page_pfn); +- if (likely(!is_migrate_isolate(mt))) +- __mod_zone_freepage_state(zone, -(1UL << order), mt); +- +- del_page_from_free_list(free_page, zone, order); +- for (pfn = free_page_pfn; +- pfn < free_page_pfn + (1UL << order);) { +- int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn); +- +- free_page_order = min_t(unsigned int, +- pfn ? __ffs(pfn) : order, +- __fls(split_pfn_offset)); +- __free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order, +- mt, FPI_NONE); +- pfn += 1UL << free_page_order; +- split_pfn_offset -= (1UL << free_page_order); +- /* we have done the first part, now switch to second part */ +- if (split_pfn_offset == 0) +- split_pfn_offset = (1UL << order) - (pfn - free_page_pfn); +- } +-out: +- spin_unlock_irqrestore(&zone->lock, flags); +- return ret; +-} + /* + * A bad page could be due to a number of fields. Instead of multiple branches, + * try and check multiple fields with one check. The caller must do a detailed +@@ -1685,8 +1627,8 @@ static bool prep_move_freepages_block(struct zone *zone, struct page *page, + return true; + } + +-int move_freepages_block(struct zone *zone, struct page *page, +- int migratetype) ++static int move_freepages_block(struct zone *zone, struct page *page, ++ int migratetype) + { + unsigned long start_pfn, end_pfn; + +@@ -1697,6 +1639,123 @@ int move_freepages_block(struct zone *zone, struct page *page, + return move_freepages(zone, start_pfn, end_pfn, migratetype); + } + ++#ifdef CONFIG_MEMORY_ISOLATION ++/* Look for a buddy that straddles start_pfn */ ++static unsigned long find_large_buddy(unsigned long start_pfn) ++{ ++ int order = 0; ++ struct page *page; ++ unsigned long pfn = start_pfn; ++ ++ while (!PageBuddy(page = pfn_to_page(pfn))) { ++ /* Nothing found */ ++ if (++order > MAX_PAGE_ORDER) ++ return start_pfn; ++ pfn &= ~0UL << order; ++ } ++ ++ /* ++ * Found a preceding buddy, but does it straddle? ++ */ ++ if (pfn + (1 << buddy_order(page)) > start_pfn) ++ return pfn; ++ ++ /* Nothing found */ ++ return start_pfn; ++} ++ ++/* Split a multi-block free page into its individual pageblocks */ ++static void split_large_buddy(struct zone *zone, struct page *page, ++ unsigned long pfn, int order) ++{ ++ unsigned long end_pfn = pfn + (1 << order); ++ ++ VM_WARN_ON_ONCE(order <= pageblock_order); ++ VM_WARN_ON_ONCE(pfn & (pageblock_nr_pages - 1)); ++ ++ /* Caller removed page from freelist, buddy info cleared! 
*/ ++ VM_WARN_ON_ONCE(PageBuddy(page)); ++ ++ while (pfn != end_pfn) { ++ int mt = get_pfnblock_migratetype(page, pfn); ++ ++ __free_one_page(page, pfn, zone, pageblock_order, mt, FPI_NONE); ++ pfn += pageblock_nr_pages; ++ page = pfn_to_page(pfn); ++ } ++} ++ ++/** ++ * move_freepages_block_isolate - move free pages in block for page isolation ++ * @zone: the zone ++ * @page: the pageblock page ++ * @migratetype: migratetype to set on the pageblock ++ * ++ * This is similar to move_freepages_block(), but handles the special ++ * case encountered in page isolation, where the block of interest ++ * might be part of a larger buddy spanning multiple pageblocks. ++ * ++ * Unlike the regular page allocator path, which moves pages while ++ * stealing buddies off the freelist, page isolation is interested in ++ * arbitrary pfn ranges that may have overlapping buddies on both ends. ++ * ++ * This function handles that. Straddling buddies are split into ++ * individual pageblocks. Only the block of interest is moved. ++ * ++ * Returns %true if pages could be moved, %false otherwise. ++ */ ++bool move_freepages_block_isolate(struct zone *zone, struct page *page, ++ int migratetype) ++{ ++ unsigned long start_pfn, end_pfn, pfn; ++ int nr_moved, mt; ++ ++ if (!prep_move_freepages_block(zone, page, &start_pfn, &end_pfn, ++ NULL, NULL)) ++ return false; ++ ++ /* No splits needed if buddies can't span multiple blocks */ ++ if (pageblock_order == MAX_PAGE_ORDER) ++ goto move; ++ ++ /* We're a tail block in a larger buddy */ ++ pfn = find_large_buddy(start_pfn); ++ if (pfn != start_pfn) { ++ struct page *buddy = pfn_to_page(pfn); ++ int order = buddy_order(buddy); ++ int mt = get_pfnblock_migratetype(buddy, pfn); ++ ++ if (!is_migrate_isolate(mt)) ++ __mod_zone_freepage_state(zone, -(1UL << order), mt); ++ del_page_from_free_list(buddy, zone, order); ++ set_pageblock_migratetype(page, migratetype); ++ split_large_buddy(zone, buddy, pfn, order); ++ return true; ++ } ++ ++ /* We're the starting block of a larger buddy */ ++ if (PageBuddy(page) && buddy_order(page) > pageblock_order) { ++ int mt = get_pfnblock_migratetype(page, pfn); ++ int order = buddy_order(page); ++ ++ if (!is_migrate_isolate(mt)) ++ __mod_zone_freepage_state(zone, -(1UL << order), mt); ++ del_page_from_free_list(page, zone, order); ++ set_pageblock_migratetype(page, migratetype); ++ split_large_buddy(zone, page, pfn, order); ++ return true; ++ } ++move: ++ mt = get_pfnblock_migratetype(page, start_pfn); ++ nr_moved = move_freepages(zone, start_pfn, end_pfn, migratetype); ++ if (!is_migrate_isolate(mt)) ++ __mod_zone_freepage_state(zone, -nr_moved, mt); ++ else if (!is_migrate_isolate(migratetype)) ++ __mod_zone_freepage_state(zone, nr_moved, migratetype); ++ return true; ++} ++#endif /* CONFIG_MEMORY_ISOLATION */ ++ + static void change_pageblock_range(struct page *pageblock_page, + int start_order, int migratetype) + { +@@ -6575,7 +6634,6 @@ int alloc_contig_range(unsigned long start, unsigned long end, + unsigned migratetype, gfp_t gfp_mask) + { + unsigned long outer_start, outer_end; +- int order; + int ret = 0; + + struct compact_control cc = { +@@ -6648,29 +6706,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, + * We don't have to hold zone->lock here because the pages are + * isolated thus they won't get removed from buddy. 
+ */ +- +- order = 0; +- outer_start = start; +- while (!PageBuddy(pfn_to_page(outer_start))) { +- if (++order > MAX_ORDER) { +- outer_start = start; +- break; +- } +- outer_start &= ~0UL << order; +- } +- +- if (outer_start != start) { +- order = buddy_order(pfn_to_page(outer_start)); +- +- /* +- * outer_start page could be small order buddy page and +- * it doesn't include start page. Adjust outer_start +- * in this case to report failed page properly +- * on tracepoint in test_pages_isolated() +- */ +- if (outer_start + (1UL << order) <= start) +- outer_start = start; +- } ++ outer_start = find_large_buddy(start); + + /* Make sure the range is really isolated. */ + if (test_pages_isolated(outer_start, end, 0)) { +diff --git a/mm/page_isolation.c b/mm/page_isolation.c +index 8fc4f9491417..b3aae89ed226 100644 +--- a/mm/page_isolation.c ++++ b/mm/page_isolation.c +@@ -179,16 +179,10 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ + unmovable = has_unmovable_pages(check_unmovable_start, check_unmovable_end, + migratetype, isol_flags); + if (!unmovable) { +- int nr_pages; +- int mt = get_pageblock_migratetype(page); +- +- nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); +- /* Block spans zone boundaries? */ +- if (nr_pages == -1) { ++ if (!move_freepages_block_isolate(zone, page, MIGRATE_ISOLATE)) { + spin_unlock_irqrestore(&zone->lock, flags); + return -EBUSY; + } +- __mod_zone_freepage_state(zone, -nr_pages, mt); + zone->nr_isolate_pageblock++; + spin_unlock_irqrestore(&zone->lock, flags); + return 0; +@@ -255,13 +249,11 @@ static void unset_migratetype_isolate(struct page *page, int migratetype) + * allocation. + */ + if (!isolated_page) { +- int nr_pages = move_freepages_block(zone, page, migratetype); + /* + * Isolating this block already succeeded, so this + * should not fail on zone boundaries. + */ +- WARN_ON_ONCE(nr_pages == -1); +- __mod_zone_freepage_state(zone, nr_pages, migratetype); ++ WARN_ON_ONCE(!move_freepages_block_isolate(zone, page, migratetype)); + } else { + set_pageblock_migratetype(page, migratetype); + __putback_isolated_page(page, order, migratetype); +@@ -377,26 +369,29 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, + + VM_BUG_ON(!page); + pfn = page_to_pfn(page); +- /* +- * start_pfn is MAX_ORDER_NR_PAGES aligned, if there is any +- * free pages in [start_pfn, boundary_pfn), its head page will +- * always be in the range. +- */ ++ + if (PageBuddy(page)) { + int order = buddy_order(page); + +- if (pfn + (1UL << order) > boundary_pfn) { +- /* free page changed before split, check it again */ +- if (split_free_page(page, order, boundary_pfn - pfn)) +- continue; +- } ++ /* move_freepages_block_isolate() handled this */ ++ VM_WARN_ON_ONCE(pfn + (1 << order) > boundary_pfn); + + pfn += 1UL << order; + continue; + } ++ + /* +- * migrate compound pages then let the free page handling code +- * above do the rest. If migration is not possible, just fail. ++ * If a compound page is straddling our block, attempt ++ * to migrate it out of the way. ++ * ++ * We don't have to worry about this creating a large ++ * free page that straddles into our block: gigantic ++ * pages are freed as order-0 chunks, and LRU pages ++ * (currently) do not exceed pageblock_order. ++ * ++ * The block of interest has already been marked ++ * MIGRATE_ISOLATE above, so when migration is done it ++ * will free its pages onto the correct freelists. 
+ */ + if (PageCompound(page)) { + struct page *head = compound_head(page); +@@ -407,16 +402,10 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, + pfn = head_pfn + nr_pages; + continue; + } ++ + #if defined CONFIG_COMPACTION || defined CONFIG_CMA +- /* +- * hugetlb, lru compound (THP), and movable compound pages +- * can be migrated. Otherwise, fail the isolation. +- */ +- if (PageHuge(page) || PageLRU(page) || __PageMovable(page)) { +- int order; +- unsigned long outer_pfn; ++ if (PageHuge(page)) { + int page_mt = get_pageblock_migratetype(page); +- bool isolate_page = !is_migrate_isolate_page(page); + struct compact_control cc = { + .nr_migratepages = 0, + .order = -1, +@@ -429,56 +418,25 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, + }; + INIT_LIST_HEAD(&cc.migratepages); + +- /* +- * XXX: mark the page as MIGRATE_ISOLATE so that +- * no one else can grab the freed page after migration. +- * Ideally, the page should be freed as two separate +- * pages to be added into separate migratetype free +- * lists. +- */ +- if (isolate_page) { +- ret = set_migratetype_isolate(page, page_mt, +- flags, head_pfn, head_pfn + nr_pages); +- if (ret) +- goto failed; +- } +- + ret = __alloc_contig_migrate_range(&cc, head_pfn, + head_pfn + nr_pages, page_mt); +- +- /* +- * restore the page's migratetype so that it can +- * be split into separate migratetype free lists +- * later. +- */ +- if (isolate_page) +- unset_migratetype_isolate(page, page_mt); +- + if (ret) + goto failed; +- /* +- * reset pfn to the head of the free page, so +- * that the free page handling code above can split +- * the free page to the right migratetype list. +- * +- * head_pfn is not used here as a hugetlb page order +- * can be bigger than MAX_ORDER, but after it is +- * freed, the free page order is not. Use pfn within +- * the range to find the head of the free page. +- */ +- order = 0; +- outer_pfn = pfn; +- while (!PageBuddy(pfn_to_page(outer_pfn))) { +- /* stop if we cannot find the free page */ +- if (++order > MAX_ORDER) +- goto failed; +- outer_pfn &= ~0UL << order; +- } +- pfn = outer_pfn; ++ pfn = head_pfn + nr_pages; + continue; +- } else ++ } ++ ++ /* ++ * These pages are movable too, but they're ++ * not expected to exceed pageblock_order. ++ * ++ * Let us know when they do, so we can add ++ * proper free and split handling for them. ++ */ ++ VM_WARN_ON_ONCE_PAGE(PageLRU(page), page); ++ VM_WARN_ON_ONCE_PAGE(__PageMovable(page), page); + #endif +- goto failed; ++ goto failed; + } + + pfn++; +-- +Gitee + + +From adfca1d2fd31fb7cc452fbcbf8ee0c5cb961f02f Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Wed, 18 Dec 2024 17:34:56 +0800 +Subject: [PATCH 13/19] mm: page_alloc: consolidate free page accounting + +mainline inclusion +from mainline-v6.10-rc1 +commit e0932b6c1f942fa747258e152cdce0d0b2b5be5c +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC4T0 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e0932b6c1f942fa747258e152cdce0d0b2b5be5c + +-------------------------------- + +Free page accounting currently happens a bit too high up the call stack, +where it has to deal with guard pages, compaction capturing, block +stealing and even page isolation. This is subtle and fragile, and makes +it difficult to hack on the code. + +Now that type violations on the freelists have been fixed, push the +accounting down to where pages enter and leave the freelist. 
+ +[hannes@cmpxchg.org: undo unrelated drive-by line wrap] + Link: https://lkml.kernel.org/r/20240327185736.GA7597@cmpxchg.org +[hannes@cmpxchg.org: remove unused page parameter from account_freepages()] + Link: https://lkml.kernel.org/r/20240327185831.GB7597@cmpxchg.org +[baolin.wang@linux.alibaba.com: fix free page accounting] + Link: https://lkml.kernel.org/r/a2a48baca69f103aa431fd201f8a06e3b95e203d.1712648441.git.baolin.wang@linux.alibaba.com +[andriy.shevchenko@linux.intel.com: avoid defining unused function] + Link: https://lkml.kernel.org/r/20240423161506.2637177-1-andriy.shevchenko@linux.intel.com +Link: https://lkml.kernel.org/r/20240320180429.678181-11-hannes@cmpxchg.org +Signed-off-by: Johannes Weiner +Signed-off-by: Andy Shevchenko +Signed-off-by: Baolin Wang +Reviewed-by: Vlastimil Babka +Tested-by: Baolin Wang +Cc: David Hildenbrand +Cc: "Huang, Ying" +Cc: Mel Gorman +Cc: Zi Yan +Signed-off-by: Andrew Morton +Conflicts: + mm/page_alloc.c +[ Context conflicts due to miss MAX_PAGE_ORDER. ] +Signed-off-by: Liu Shixin +--- + include/linux/mm.h | 18 ++-- + include/linux/vmstat.h | 8 -- + mm/debug_page_alloc.c | 12 +-- + mm/internal.h | 5 -- + mm/page_alloc.c | 192 +++++++++++++++++++++++------------------ + 5 files changed, 118 insertions(+), 117 deletions(-) + +diff --git a/include/linux/mm.h b/include/linux/mm.h +index 2e6ef9532fc3..b6dcdaafc592 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -3819,24 +3819,22 @@ static inline bool page_is_guard(struct page *page) + return PageGuard(page); + } + +-bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order, +- int migratetype); ++bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order); + static inline bool set_page_guard(struct zone *zone, struct page *page, +- unsigned int order, int migratetype) ++ unsigned int order) + { + if (!debug_guardpage_enabled()) + return false; +- return __set_page_guard(zone, page, order, migratetype); ++ return __set_page_guard(zone, page, order); + } + +-void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order, +- int migratetype); ++void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order); + static inline void clear_page_guard(struct zone *zone, struct page *page, +- unsigned int order, int migratetype) ++ unsigned int order) + { + if (!debug_guardpage_enabled()) + return; +- __clear_page_guard(zone, page, order, migratetype); ++ __clear_page_guard(zone, page, order); + } + + #else /* CONFIG_DEBUG_PAGEALLOC */ +@@ -3846,9 +3844,9 @@ static inline unsigned int debug_guardpage_minorder(void) { return 0; } + static inline bool debug_guardpage_enabled(void) { return false; } + static inline bool page_is_guard(struct page *page) { return false; } + static inline bool set_page_guard(struct zone *zone, struct page *page, +- unsigned int order, int migratetype) { return false; } ++ unsigned int order) { return false; } + static inline void clear_page_guard(struct zone *zone, struct page *page, +- unsigned int order, int migratetype) {} ++ unsigned int order) {} + #endif /* CONFIG_DEBUG_PAGEALLOC */ + + #ifdef __HAVE_ARCH_GATE_AREA +diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h +index 343906a98d6e..735eae6e272c 100644 +--- a/include/linux/vmstat.h ++++ b/include/linux/vmstat.h +@@ -487,14 +487,6 @@ static inline void node_stat_sub_folio(struct folio *folio, + mod_node_page_state(folio_pgdat(folio), item, -folio_nr_pages(folio)); + } + +-static inline void 
__mod_zone_freepage_state(struct zone *zone, int nr_pages, +- int migratetype) +-{ +- __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages); +- if (is_migrate_cma(migratetype)) +- __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages); +-} +- + extern const char * const vmstat_text[]; + + static inline const char *zone_stat_name(enum zone_stat_item item) +diff --git a/mm/debug_page_alloc.c b/mm/debug_page_alloc.c +index f9d145730fd1..03a810927d0a 100644 +--- a/mm/debug_page_alloc.c ++++ b/mm/debug_page_alloc.c +@@ -32,8 +32,7 @@ static int __init debug_guardpage_minorder_setup(char *buf) + } + early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup); + +-bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order, +- int migratetype) ++bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order) + { + if (order >= debug_guardpage_minorder()) + return false; +@@ -41,19 +40,12 @@ bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order, + __SetPageGuard(page); + INIT_LIST_HEAD(&page->buddy_list); + set_page_private(page, order); +- /* Guard pages are not available for any usage */ +- if (!is_migrate_isolate(migratetype)) +- __mod_zone_freepage_state(zone, -(1 << order), migratetype); + + return true; + } + +-void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order, +- int migratetype) ++void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order) + { + __ClearPageGuard(page); +- + set_page_private(page, 0); +- if (!is_migrate_isolate(migratetype)) +- __mod_zone_freepage_state(zone, (1 << order), migratetype); + } +diff --git a/mm/internal.h b/mm/internal.h +index de564608dfa6..8742aafde387 100644 +--- a/mm/internal.h ++++ b/mm/internal.h +@@ -1171,11 +1171,6 @@ static inline bool is_migrate_highatomic(enum migratetype migratetype) + return migratetype == MIGRATE_HIGHATOMIC; + } + +-static inline bool is_migrate_highatomic_page(struct page *page) +-{ +- return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC; +-} +- + void setup_zone_pageset(struct zone *zone); + + struct migration_target_control { +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 3bc1502e42cf..d662bbdf2e91 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -636,23 +636,33 @@ compaction_capture(struct capture_control *capc, struct page *page, + } + #endif /* CONFIG_COMPACTION */ + +-/* Used for pages not on another list */ +-static inline void add_to_free_list(struct page *page, struct zone *zone, +- unsigned int order, int migratetype) ++static inline void account_freepages(struct zone *zone, int nr_pages, ++ int migratetype) + { +- struct free_area *area = &zone->free_area[order]; ++ if (is_migrate_isolate(migratetype)) ++ return; + +- list_add(&page->buddy_list, &area->free_list[migratetype]); +- area->nr_free++; ++ __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages); ++ ++ if (is_migrate_cma(migratetype)) ++ __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages); + } + + /* Used for pages not on another list */ +-static inline void add_to_free_list_tail(struct page *page, struct zone *zone, +- unsigned int order, int migratetype) ++static inline void __add_to_free_list(struct page *page, struct zone *zone, ++ unsigned int order, int migratetype, ++ bool tail) + { + struct free_area *area = &zone->free_area[order]; + +- list_add_tail(&page->buddy_list, &area->free_list[migratetype]); ++ VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype, ++ "page type is %lu, passed 
migratetype is %d (nr=%d)\n", ++ get_pageblock_migratetype(page), migratetype, 1 << order); ++ ++ if (tail) ++ list_add_tail(&page->buddy_list, &area->free_list[migratetype]); ++ else ++ list_add(&page->buddy_list, &area->free_list[migratetype]); + area->nr_free++; + } + +@@ -662,16 +672,28 @@ static inline void add_to_free_list_tail(struct page *page, struct zone *zone, + * allocation again (e.g., optimization for memory onlining). + */ + static inline void move_to_free_list(struct page *page, struct zone *zone, +- unsigned int order, int migratetype) ++ unsigned int order, int old_mt, int new_mt) + { + struct free_area *area = &zone->free_area[order]; + +- list_move_tail(&page->buddy_list, &area->free_list[migratetype]); ++ /* Free page moving can fail, so it happens before the type update */ ++ VM_WARN_ONCE(get_pageblock_migratetype(page) != old_mt, ++ "page type is %lu, passed migratetype is %d (nr=%d)\n", ++ get_pageblock_migratetype(page), old_mt, 1 << order); ++ ++ list_move_tail(&page->buddy_list, &area->free_list[new_mt]); ++ ++ account_freepages(zone, -(1 << order), old_mt); ++ account_freepages(zone, 1 << order, new_mt); + } + +-static inline void del_page_from_free_list(struct page *page, struct zone *zone, +- unsigned int order) ++static inline void __del_page_from_free_list(struct page *page, struct zone *zone, ++ unsigned int order, int migratetype) + { ++ VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype, ++ "page type is %lu, passed migratetype is %d (nr=%d)\n", ++ get_pageblock_migratetype(page), migratetype, 1 << order); ++ + /* clear reported state and update reported page count */ + if (page_reported(page)) + __ClearPageReported(page); +@@ -682,6 +704,13 @@ static inline void del_page_from_free_list(struct page *page, struct zone *zone, + zone->free_area[order].nr_free--; + } + ++static inline void del_page_from_free_list(struct page *page, struct zone *zone, ++ unsigned int order, int migratetype) ++{ ++ __del_page_from_free_list(page, zone, order, migratetype); ++ account_freepages(zone, -(1 << order), migratetype); ++} ++ + static inline struct page *get_page_from_free_area(struct free_area *area, + int migratetype) + { +@@ -753,16 +782,16 @@ static inline void __free_one_page(struct page *page, + VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); + + VM_BUG_ON(migratetype == -1); +- if (likely(!is_migrate_isolate(migratetype))) +- __mod_zone_freepage_state(zone, 1 << order, migratetype); +- + VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); + VM_BUG_ON_PAGE(bad_range(zone, page), page); + ++ account_freepages(zone, 1 << order, migratetype); ++ + while (order < MAX_ORDER) { ++ int buddy_mt = migratetype; ++ + if (compaction_capture(capc, page, order, migratetype)) { +- __mod_zone_freepage_state(zone, -(1 << order), +- migratetype); ++ account_freepages(zone, -(1 << order), migratetype); + return; + } + +@@ -777,19 +806,12 @@ static inline void __free_one_page(struct page *page, + * pageblock isolation could cause incorrect freepage or CMA + * accounting or HIGHATOMIC accounting. + */ +- int buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn); ++ buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn); + +- if (migratetype != buddy_mt) { +- if (!migratetype_is_mergeable(migratetype) || +- !migratetype_is_mergeable(buddy_mt)) +- goto done_merging; +- /* +- * Match buddy type. This ensures that +- * an expand() down the line puts the +- * sub-blocks on the right freelists. 
+- */ +- set_pageblock_migratetype(buddy, migratetype); +- } ++ if (migratetype != buddy_mt && ++ (!migratetype_is_mergeable(migratetype) || ++ !migratetype_is_mergeable(buddy_mt))) ++ goto done_merging; + } + + /* +@@ -797,9 +819,19 @@ static inline void __free_one_page(struct page *page, + * merge with it and move up one order. + */ + if (page_is_guard(buddy)) +- clear_page_guard(zone, buddy, order, migratetype); ++ clear_page_guard(zone, buddy, order); + else +- del_page_from_free_list(buddy, zone, order); ++ __del_page_from_free_list(buddy, zone, order, buddy_mt); ++ ++ if (unlikely(buddy_mt != migratetype)) { ++ /* ++ * Match buddy type. This ensures that an ++ * expand() down the line puts the sub-blocks ++ * on the right freelists. ++ */ ++ set_pageblock_migratetype(buddy, migratetype); ++ } ++ + combined_pfn = buddy_pfn & pfn; + page = page + (combined_pfn - pfn); + pfn = combined_pfn; +@@ -816,10 +848,7 @@ static inline void __free_one_page(struct page *page, + else + to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order); + +- if (to_tail) +- add_to_free_list_tail(page, zone, order, migratetype); +- else +- add_to_free_list(page, zone, order, migratetype); ++ __add_to_free_list(page, zone, order, migratetype, to_tail); + + /* Notify page reporting subsystem of freed page */ + if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY)) +@@ -1309,10 +1338,10 @@ static inline void expand(struct zone *zone, struct page *page, + * Corresponding page table entries will not be touched, + * pages will stay not present in virtual address space + */ +- if (set_page_guard(zone, &page[size], high, migratetype)) ++ if (set_page_guard(zone, &page[size], high)) + continue; + +- add_to_free_list(&page[size], zone, high, migratetype); ++ add_to_free_list(&page[size], zone, high, migratetype, false); + set_buddy_order(&page[size], high); + } + } +@@ -1503,7 +1532,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, + page = get_page_from_free_area(area, migratetype); + if (!page) + continue; +- del_page_from_free_list(page, zone, current_order); ++ del_page_from_free_list(page, zone, current_order, migratetype); + expand(zone, page, order, current_order, migratetype); + trace_mm_page_alloc_zone_locked(page, order, migratetype, + pcp_allowed_order(order) && +@@ -1543,7 +1572,7 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone, + * type's freelist. 
+ */ + static int move_freepages(struct zone *zone, unsigned long start_pfn, +- unsigned long end_pfn, int migratetype) ++ unsigned long end_pfn, int old_mt, int new_mt) + { + struct page *page; + unsigned long pfn; +@@ -1565,12 +1594,14 @@ static int move_freepages(struct zone *zone, unsigned long start_pfn, + VM_BUG_ON_PAGE(page_zone(page) != zone, page); + + order = buddy_order(page); +- move_to_free_list(page, zone, order, migratetype); ++ ++ move_to_free_list(page, zone, order, old_mt, new_mt); ++ + pfn += 1 << order; + pages_moved += 1 << order; + } + +- set_pageblock_migratetype(pfn_to_page(start_pfn), migratetype); ++ set_pageblock_migratetype(pfn_to_page(start_pfn), new_mt); + + return pages_moved; + } +@@ -1628,7 +1659,7 @@ static bool prep_move_freepages_block(struct zone *zone, struct page *page, + } + + static int move_freepages_block(struct zone *zone, struct page *page, +- int migratetype) ++ int old_mt, int new_mt) + { + unsigned long start_pfn, end_pfn; + +@@ -1636,7 +1667,7 @@ static int move_freepages_block(struct zone *zone, struct page *page, + NULL, NULL)) + return -1; + +- return move_freepages(zone, start_pfn, end_pfn, migratetype); ++ return move_freepages(zone, start_pfn, end_pfn, old_mt, new_mt); + } + + #ifdef CONFIG_MEMORY_ISOLATION +@@ -1708,7 +1739,6 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page, + int migratetype) + { + unsigned long start_pfn, end_pfn, pfn; +- int nr_moved, mt; + + if (!prep_move_freepages_block(zone, page, &start_pfn, &end_pfn, + NULL, NULL)) +@@ -1723,11 +1753,9 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page, + if (pfn != start_pfn) { + struct page *buddy = pfn_to_page(pfn); + int order = buddy_order(buddy); +- int mt = get_pfnblock_migratetype(buddy, pfn); + +- if (!is_migrate_isolate(mt)) +- __mod_zone_freepage_state(zone, -(1UL << order), mt); +- del_page_from_free_list(buddy, zone, order); ++ del_page_from_free_list(buddy, zone, order, ++ get_pfnblock_migratetype(buddy, pfn)); + set_pageblock_migratetype(page, migratetype); + split_large_buddy(zone, buddy, pfn, order); + return true; +@@ -1735,23 +1763,17 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page, + + /* We're the starting block of a larger buddy */ + if (PageBuddy(page) && buddy_order(page) > pageblock_order) { +- int mt = get_pfnblock_migratetype(page, pfn); + int order = buddy_order(page); + +- if (!is_migrate_isolate(mt)) +- __mod_zone_freepage_state(zone, -(1UL << order), mt); +- del_page_from_free_list(page, zone, order); ++ del_page_from_free_list(page, zone, order, ++ get_pfnblock_migratetype(page, pfn)); + set_pageblock_migratetype(page, migratetype); + split_large_buddy(zone, page, pfn, order); + return true; + } + move: +- mt = get_pfnblock_migratetype(page, start_pfn); +- nr_moved = move_freepages(zone, start_pfn, end_pfn, migratetype); +- if (!is_migrate_isolate(mt)) +- __mod_zone_freepage_state(zone, -nr_moved, mt); +- else if (!is_migrate_isolate(migratetype)) +- __mod_zone_freepage_state(zone, nr_moved, migratetype); ++ move_freepages(zone, start_pfn, end_pfn, ++ get_pfnblock_migratetype(page, start_pfn), migratetype); + return true; + } + #endif /* CONFIG_MEMORY_ISOLATION */ +@@ -1865,7 +1887,7 @@ steal_suitable_fallback(struct zone *zone, struct page *page, + + /* Take ownership for orders >= pageblock_order */ + if (current_order >= pageblock_order) { +- del_page_from_free_list(page, zone, current_order); ++ del_page_from_free_list(page, zone, current_order, block_type); + 
change_pageblock_range(page, current_order, start_type); + expand(zone, page, order, current_order, start_type); + return page; +@@ -1915,12 +1937,12 @@ steal_suitable_fallback(struct zone *zone, struct page *page, + */ + if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || + page_group_by_mobility_disabled) { +- move_freepages(zone, start_pfn, end_pfn, start_type); ++ move_freepages(zone, start_pfn, end_pfn, block_type, start_type); + return __rmqueue_smallest(zone, order, start_type); + } + + single_page: +- del_page_from_free_list(page, zone, current_order); ++ del_page_from_free_list(page, zone, current_order, block_type); + expand(zone, page, order, current_order, block_type); + return page; + } +@@ -1990,7 +2012,7 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone) + mt = get_pageblock_migratetype(page); + /* Only reserve normal pageblocks (i.e., they can merge with others) */ + if (migratetype_is_mergeable(mt)) +- if (move_freepages_block(zone, page, ++ if (move_freepages_block(zone, page, mt, + MIGRATE_HIGHATOMIC) != -1) + zone->nr_reserved_highatomic += pageblock_nr_pages; + +@@ -2031,11 +2053,13 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, + spin_lock_irqsave(&zone->lock, flags); + for (order = 0; order < NR_PAGE_ORDERS; order++) { + struct free_area *area = &(zone->free_area[order]); ++ int mt; + + page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC); + if (!page) + continue; + ++ mt = get_pageblock_migratetype(page); + /* + * In page freeing path, migratetype change is racy so + * we can counter several free pages in a pageblock +@@ -2043,7 +2067,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, + * from highatomic to ac->migratetype. So we should + * adjust the count once. + */ +- if (is_migrate_highatomic_page(page)) { ++ if (is_migrate_highatomic(mt)) { + /* + * It should never happen but changes to + * locking could inadvertently allow a per-cpu +@@ -2065,7 +2089,8 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, + * of pageblocks that cannot be completely freed + * may increase. + */ +- ret = move_freepages_block(zone, page, ac->migratetype); ++ ret = move_freepages_block(zone, page, mt, ++ ac->migratetype); + /* + * Reserving this block already succeeded, so this should + * not fail on zone boundaries. +@@ -2236,12 +2261,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, + * pages are ordered properly. 
+ */ + list_add_tail(&page->pcp_list, list); +- if (is_migrate_cma(get_pageblock_migratetype(page))) +- __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, +- -(1 << order)); + } +- +- __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); + spin_unlock_irqrestore(&zone->lock, flags); + + return i; +@@ -2751,11 +2771,9 @@ int __isolate_free_page(struct page *page, unsigned int order) + watermark = zone->_watermark[WMARK_MIN] + (1UL << order); + if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) + return 0; +- +- __mod_zone_freepage_state(zone, -(1UL << order), mt); + } + +- del_page_from_free_list(page, zone, order); ++ del_page_from_free_list(page, zone, order, mt); + + /* + * Set the pageblock if the isolated page is at least half of a +@@ -2770,7 +2788,7 @@ int __isolate_free_page(struct page *page, unsigned int order) + * with others) + */ + if (migratetype_is_mergeable(mt)) +- move_freepages_block(zone, page, ++ move_freepages_block(zone, page, mt, + MIGRATE_MOVABLE); + } + } +@@ -2855,8 +2873,6 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, + return NULL; + } + } +- __mod_zone_freepage_state(zone, -(1 << order), +- get_pageblock_migratetype(page)); + spin_unlock_irqrestore(&zone->lock, flags); + } while (check_new_pages(page, order)); + +@@ -6940,8 +6956,9 @@ void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) + + BUG_ON(page_count(page)); + BUG_ON(!PageBuddy(page)); ++ VM_WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE); + order = buddy_order(page); +- del_page_from_free_list(page, zone, order); ++ del_page_from_free_list(page, zone, order, MIGRATE_ISOLATE); + pfn += (1 << order); + } + spin_unlock_irqrestore(&zone->lock, flags); +@@ -6969,6 +6986,14 @@ bool is_free_buddy_page(struct page *page) + EXPORT_SYMBOL(is_free_buddy_page); + + #ifdef CONFIG_MEMORY_FAILURE ++static inline void add_to_free_list(struct page *page, struct zone *zone, ++ unsigned int order, int migratetype, ++ bool tail) ++{ ++ __add_to_free_list(page, zone, order, migratetype, tail); ++ account_freepages(zone, 1 << order, migratetype); ++} ++ + /* + * Break down a higher-order page in sub-pages, and keep our target out of + * buddy allocator. 
+@@ -6991,10 +7016,10 @@ static void break_down_buddy_pages(struct zone *zone, struct page *page, + current_buddy = page + size; + } + +- if (set_page_guard(zone, current_buddy, high, migratetype)) ++ if (set_page_guard(zone, current_buddy, high)) + continue; + +- add_to_free_list(current_buddy, zone, high, migratetype); ++ add_to_free_list(current_buddy, zone, high, migratetype, false); + set_buddy_order(current_buddy, high); + } + } +@@ -7020,12 +7045,11 @@ bool take_page_off_buddy(struct page *page) + int migratetype = get_pfnblock_migratetype(page_head, + pfn_head); + +- del_page_from_free_list(page_head, zone, page_order); ++ del_page_from_free_list(page_head, zone, page_order, ++ migratetype); + break_down_buddy_pages(zone, page_head, page, 0, + page_order, migratetype); + SetPageHWPoisonTakenOff(page); +- if (!is_migrate_isolate(migratetype)) +- __mod_zone_freepage_state(zone, -1, migratetype); + ret = true; + break; + } +@@ -7130,7 +7154,7 @@ static bool try_to_accept_memory_one(struct zone *zone) + list_del(&page->lru); + last = list_empty(&zone->unaccepted_pages); + +- __mod_zone_freepage_state(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); ++ account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); + __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES); + spin_unlock_irqrestore(&zone->lock, flags); + +@@ -7188,7 +7212,7 @@ static bool __free_unaccepted(struct page *page) + spin_lock_irqsave(&zone->lock, flags); + first = list_empty(&zone->unaccepted_pages); + list_add_tail(&page->lru, &zone->unaccepted_pages); +- __mod_zone_freepage_state(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); ++ account_freepages(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); + __mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES); + spin_unlock_irqrestore(&zone->lock, flags); + +-- +Gitee + + +From b0fd1d17c59627bb8b64234dc3d7b278f8b4656e Mon Sep 17 00:00:00 2001 +From: Vlastimil Babka +Date: Wed, 18 Dec 2024 17:34:57 +0800 +Subject: [PATCH 14/19] mm: page_alloc: change move_freepages() to + __move_freepages_block() + +mainline inclusion +from mainline-v6.10-rc1 +commit e1f42a577f63647dadf1abe4583053c03d6be045 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC4T0 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e1f42a577f63647dadf1abe4583053c03d6be045 + +-------------------------------- + +The function is now supposed to be called only on a single pageblock and +checks start_pfn and end_pfn accordingly. Rename it to make this more +obvious and drop the end_pfn parameter which can be determined trivially +and none of the callers use it for anything else. + +Also make the (now internal) end_pfn exclusive, which is more common. + +Link: https://lkml.kernel.org/r/81b1d642-2ec0-49f5-89fc-19a3828419ff@suse.cz +Signed-off-by: Vlastimil Babka +Reviewed-by: Zi Yan +Acked-by: Johannes Weiner +Cc: David Hildenbrand +Cc: "Huang, Ying" +Cc: Mel Gorman +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/page_alloc.c | 43 ++++++++++++++++++++----------------------- + 1 file changed, 20 insertions(+), 23 deletions(-) + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index d662bbdf2e91..7270d665fc53 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -1571,18 +1571,18 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone, + * Change the type of a block and move all its free pages to that + * type's freelist. 
+ */ +-static int move_freepages(struct zone *zone, unsigned long start_pfn, +- unsigned long end_pfn, int old_mt, int new_mt) ++static int __move_freepages_block(struct zone *zone, unsigned long start_pfn, ++ int old_mt, int new_mt) + { + struct page *page; +- unsigned long pfn; ++ unsigned long pfn, end_pfn; + unsigned int order; + int pages_moved = 0; + + VM_WARN_ON(start_pfn & (pageblock_nr_pages - 1)); +- VM_WARN_ON(start_pfn + pageblock_nr_pages - 1 != end_pfn); ++ end_pfn = pageblock_end_pfn(start_pfn); + +- for (pfn = start_pfn; pfn <= end_pfn;) { ++ for (pfn = start_pfn; pfn < end_pfn;) { + page = pfn_to_page(pfn); + if (!PageBuddy(page)) { + pfn++; +@@ -1608,14 +1608,13 @@ static int move_freepages(struct zone *zone, unsigned long start_pfn, + + static bool prep_move_freepages_block(struct zone *zone, struct page *page, + unsigned long *start_pfn, +- unsigned long *end_pfn, + int *num_free, int *num_movable) + { + unsigned long pfn, start, end; + + pfn = page_to_pfn(page); + start = pageblock_start_pfn(pfn); +- end = pageblock_end_pfn(pfn) - 1; ++ end = pageblock_end_pfn(pfn); + + /* + * The caller only has the lock for @zone, don't touch ranges +@@ -1626,16 +1625,15 @@ static bool prep_move_freepages_block(struct zone *zone, struct page *page, + */ + if (!zone_spans_pfn(zone, start)) + return false; +- if (!zone_spans_pfn(zone, end)) ++ if (!zone_spans_pfn(zone, end - 1)) + return false; + + *start_pfn = start; +- *end_pfn = end; + + if (num_free) { + *num_free = 0; + *num_movable = 0; +- for (pfn = start; pfn <= end;) { ++ for (pfn = start; pfn < end;) { + page = pfn_to_page(pfn); + if (PageBuddy(page)) { + int nr = 1 << buddy_order(page); +@@ -1661,13 +1659,12 @@ static bool prep_move_freepages_block(struct zone *zone, struct page *page, + static int move_freepages_block(struct zone *zone, struct page *page, + int old_mt, int new_mt) + { +- unsigned long start_pfn, end_pfn; ++ unsigned long start_pfn; + +- if (!prep_move_freepages_block(zone, page, &start_pfn, &end_pfn, +- NULL, NULL)) ++ if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL)) + return -1; + +- return move_freepages(zone, start_pfn, end_pfn, old_mt, new_mt); ++ return __move_freepages_block(zone, start_pfn, old_mt, new_mt); + } + + #ifdef CONFIG_MEMORY_ISOLATION +@@ -1738,10 +1735,9 @@ static void split_large_buddy(struct zone *zone, struct page *page, + bool move_freepages_block_isolate(struct zone *zone, struct page *page, + int migratetype) + { +- unsigned long start_pfn, end_pfn, pfn; ++ unsigned long start_pfn, pfn; + +- if (!prep_move_freepages_block(zone, page, &start_pfn, &end_pfn, +- NULL, NULL)) ++ if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL)) + return false; + + /* No splits needed if buddies can't span multiple blocks */ +@@ -1772,8 +1768,9 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page, + return true; + } + move: +- move_freepages(zone, start_pfn, end_pfn, +- get_pfnblock_migratetype(page, start_pfn), migratetype); ++ __move_freepages_block(zone, start_pfn, ++ get_pfnblock_migratetype(page, start_pfn), ++ migratetype); + return true; + } + #endif /* CONFIG_MEMORY_ISOLATION */ +@@ -1873,7 +1870,7 @@ steal_suitable_fallback(struct zone *zone, struct page *page, + unsigned int alloc_flags, bool whole_block) + { + int free_pages, movable_pages, alike_pages; +- unsigned long start_pfn, end_pfn; ++ unsigned long start_pfn; + int block_type; + + block_type = get_pageblock_migratetype(page); +@@ -1906,8 +1903,8 @@ steal_suitable_fallback(struct 
zone *zone, struct page *page, + goto single_page; + + /* moving whole block can fail due to zone boundary conditions */ +- if (!prep_move_freepages_block(zone, page, &start_pfn, &end_pfn, +- &free_pages, &movable_pages)) ++ if (!prep_move_freepages_block(zone, page, &start_pfn, &free_pages, ++ &movable_pages)) + goto single_page; + + /* +@@ -1937,7 +1934,7 @@ steal_suitable_fallback(struct zone *zone, struct page *page, + */ + if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || + page_group_by_mobility_disabled) { +- move_freepages(zone, start_pfn, end_pfn, block_type, start_type); ++ __move_freepages_block(zone, start_pfn, block_type, start_type); + return __rmqueue_smallest(zone, order, start_type); + } + +-- +Gitee + + +From e71a8422c09179c193a4806cdc764fe626fb4344 Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Wed, 18 Dec 2024 17:34:58 +0800 +Subject: [PATCH 15/19] mm: page_alloc: batch vmstat updates in expand() + +mainline inclusion +from mainline-v6.10-rc1 +commit 883dd161e9a83e188487debc562b1928917a4b39 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC4T0 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=883dd161e9a83e188487debc562b1928917a4b39 + +-------------------------------- + +expand() currently updates vmstat for every subpage. This is unnecessary, +since they're all of the same zone and migratetype. + +Count added pages locally, then do a single vmstat update. + +Link: https://lkml.kernel.org/r/20240327190111.GC7597@cmpxchg.org +Signed-off-by: Johannes Weiner +Suggested-by: Vlastimil Babka +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/page_alloc.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 7270d665fc53..4e2ec54b6a7f 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -1326,6 +1326,7 @@ static inline void expand(struct zone *zone, struct page *page, + int low, int high, int migratetype) + { + unsigned long size = 1 << high; ++ unsigned long nr_added = 0; + + while (high > low) { + high--; +@@ -1341,9 +1342,11 @@ static inline void expand(struct zone *zone, struct page *page, + if (set_page_guard(zone, &page[size], high)) + continue; + +- add_to_free_list(&page[size], zone, high, migratetype, false); ++ __add_to_free_list(&page[size], zone, high, migratetype, false); + set_buddy_order(&page[size], high); ++ nr_added += size; + } ++ account_freepages(zone, nr_added, migratetype); + } + + static void check_new_page_bad(struct page *page) +-- +Gitee + + +From eba87a33b157ef221af2fe47d88dcc57b2d549bf Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Wed, 18 Dec 2024 17:34:59 +0800 +Subject: [PATCH 16/19] mm: page_alloc: fix highatomic typing in multi-block + buddies + +mainline inclusion +from mainline-v6.10-rc3 +commit 7cc5a5d65011983952a9c62f170f5b79e24b1239 +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC4T0 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=7cc5a5d65011983952a9c62f170f5b79e24b1239 + +-------------------------------- + +Christoph reports a page allocator splat triggered by xfstests: + +generic/176 214s ... 
[ 1204.507931] run fstests generic/176 at 2024-05-27 12:52:30 +XFS (nvme0n1): Mounting V5 Filesystem cd936307-415f-48a3-b99d-a2d52ae1f273 +XFS (nvme0n1): Ending clean mount +XFS (nvme1n1): Mounting V5 Filesystem ab3ee1a4-af62-4934-9a6a-6c2fde321850 +XFS (nvme1n1): Ending clean mount +XFS (nvme1n1): Unmounting Filesystem ab3ee1a4-af62-4934-9a6a-6c2fde321850 +XFS (nvme1n1): Mounting V5 Filesystem 7099b02d-9c58-4d1d-be1d-2cc472d12cd9 +XFS (nvme1n1): Ending clean mount +------------[ cut here ]------------ +page type is 3, passed migratetype is 1 (nr=512) +WARNING: CPU: 0 PID: 509870 at mm/page_alloc.c:645 expand+0x1c5/0x1f0 +Modules linked in: i2c_i801 crc32_pclmul i2c_smbus [last unloaded: scsi_debug] +CPU: 0 PID: 509870 Comm: xfs_io Not tainted 6.10.0-rc1+ #2437 +Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 +RIP: 0010:expand+0x1c5/0x1f0 +Code: 05 16 70 bf 02 01 e8 ca fc ff ff 8b 54 24 34 44 89 e1 48 c7 c7 80 a2 28 83 48 89 c6 b8 01 00 3 +RSP: 0018:ffffc90003b2b968 EFLAGS: 00010082 +RAX: 0000000000000000 RBX: ffffffff83fa9480 RCX: 0000000000000000 +RDX: 0000000000000005 RSI: 0000000000000027 RDI: 00000000ffffffff +RBP: 00000000001f2600 R08: 00000000fffeffff R09: 0000000000000001 +R10: 0000000000000000 R11: ffffffff83676200 R12: 0000000000000009 +R13: 0000000000000200 R14: 0000000000000001 R15: ffffea0007c98000 +FS: 00007f72ca3d5780(0000) GS:ffff8881f9c00000(0000) knlGS:0000000000000000 +CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +CR2: 00007f72ca1fff38 CR3: 00000001aa0c6002 CR4: 0000000000770ef0 +DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 +DR3: 0000000000000000 DR6: 00000000ffff07f0 DR7: 0000000000000400 +PKRU: 55555554 +Call Trace: + + ? __warn+0x7b/0x120 + ? expand+0x1c5/0x1f0 + ? report_bug+0x191/0x1c0 + ? handle_bug+0x3c/0x80 + ? exc_invalid_op+0x17/0x70 + ? asm_exc_invalid_op+0x1a/0x20 + ? expand+0x1c5/0x1f0 + ? expand+0x1c5/0x1f0 + __rmqueue_pcplist+0x3a9/0x730 + get_page_from_freelist+0x7a0/0xf00 + __alloc_pages_noprof+0x153/0x2e0 + __folio_alloc_noprof+0x10/0xa0 + __filemap_get_folio+0x16b/0x370 + iomap_write_begin+0x496/0x680 + +While trying to service a movable allocation (page type 1), the page +allocator runs into a two-pageblock buddy on the movable freelist whose +second block is typed as highatomic (page type 3). + +This inconsistency is caused by the highatomic reservation system +operating on single pageblocks, while MAX_ORDER can be bigger than that - +in this configuration, pageblock_order is 9 while MAX_PAGE_ORDER is 10. +The test case is observed to make several adjacent order-3 requests with +__GFP_DIRECT_RECLAIM cleared, which marks the surrounding block as +highatomic. Upon freeing, the blocks merge into an order-10 buddy. When +the highatomic pool is drained later on, this order-10 buddy gets moved +back to the movable list, but only the first pageblock is marked movable +again. A subsequent expand() of this buddy warns about the tail being of +a different type. + +This is a long-standing bug that's surfaced by the recent block type +warnings added to the allocator. The consequences seem mostly benign, it +just results in odd behavior: the highatomic tail blocks are not properly +drained, instead they end up on the movable list first, then go back to +the highatomic list after an alloc-free cycle. + +To fix this, make the highatomic reservation code aware that +allocations/buddies can be larger than a pageblock. 
+ +While it's an old quirk, the recently added type consistency warnings seem +to be the most prominent consequence of it. Set the Fixes: tag +accordingly to highlight this backporting dependency. + +Link: https://lkml.kernel.org/r/20240530114203.GA1222079@cmpxchg.org +Fixes: e0932b6c1f94 ("mm: page_alloc: consolidate free page accounting") +Signed-off-by: Johannes Weiner +Reported-by: Christoph Hellwig +Reviewed-by: Zi Yan +Tested-by: Christoph Hellwig +Cc: Andy Shevchenko +Cc: Baolin Wang +Cc: Mel Gorman +Cc: Vlastimil Babka +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/page_alloc.c | 50 +++++++++++++++++++++++++++++++++---------------- + 1 file changed, 34 insertions(+), 16 deletions(-) + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 4e2ec54b6a7f..53534165b5ab 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -1982,10 +1982,12 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, + } + + /* +- * Reserve a pageblock for exclusive use of high-order atomic allocations if +- * there are no empty page blocks that contain a page with a suitable order ++ * Reserve the pageblock(s) surrounding an allocation request for ++ * exclusive use of high-order atomic allocations if there are no ++ * empty page blocks that contain a page with a suitable order + */ +-static void reserve_highatomic_pageblock(struct page *page, struct zone *zone) ++static void reserve_highatomic_pageblock(struct page *page, int order, ++ struct zone *zone) + { + int mt; + unsigned long max_managed, flags; +@@ -2011,10 +2013,17 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone) + /* Yoink! */ + mt = get_pageblock_migratetype(page); + /* Only reserve normal pageblocks (i.e., they can merge with others) */ +- if (migratetype_is_mergeable(mt)) +- if (move_freepages_block(zone, page, mt, +- MIGRATE_HIGHATOMIC) != -1) +- zone->nr_reserved_highatomic += pageblock_nr_pages; ++ if (!migratetype_is_mergeable(mt)) ++ goto out_unlock; ++ ++ if (order < pageblock_order) { ++ if (move_freepages_block(zone, page, mt, MIGRATE_HIGHATOMIC) == -1) ++ goto out_unlock; ++ zone->nr_reserved_highatomic += pageblock_nr_pages; ++ } else { ++ change_pageblock_range(page, order, MIGRATE_HIGHATOMIC); ++ zone->nr_reserved_highatomic += 1 << order; ++ } + + out_unlock: + spin_unlock_irqrestore(&zone->lock, flags); +@@ -2026,7 +2035,7 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone) + * intense memory pressure but failed atomic allocations should be easier + * to recover from than an OOM. + * +- * If @force is true, try to unreserve a pageblock even though highatomic ++ * If @force is true, try to unreserve pageblocks even though highatomic + * pageblock is exhausted. + */ + static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, +@@ -2068,6 +2077,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, + * adjust the count once. + */ + if (is_migrate_highatomic(mt)) { ++ unsigned long size; + /* + * It should never happen but changes to + * locking could inadvertently allow a per-cpu +@@ -2075,9 +2085,9 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, + * while unreserving so be safe and watch for + * underflows. 
+ */ +- zone->nr_reserved_highatomic -= min( +- pageblock_nr_pages, +- zone->nr_reserved_highatomic); ++ size = max(pageblock_nr_pages, 1UL << order); ++ size = min(size, zone->nr_reserved_highatomic); ++ zone->nr_reserved_highatomic -= size; + } + + /* +@@ -2089,11 +2099,19 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, + * of pageblocks that cannot be completely freed + * may increase. + */ +- ret = move_freepages_block(zone, page, mt, +- ac->migratetype); ++ if (order < pageblock_order) ++ ret = move_freepages_block(zone, page, mt, ++ ac->migratetype); ++ else { ++ move_to_free_list(page, zone, order, mt, ++ ac->migratetype); ++ change_pageblock_range(page, order, ++ ac->migratetype); ++ ret = 1; ++ } + /* +- * Reserving this block already succeeded, so this should +- * not fail on zone boundaries. ++ * Reserving the block(s) already succeeded, ++ * so this should not fail on zone boundaries. + */ + WARN_ON_ONCE(ret == -1); + if (ret > 0) { +@@ -3440,7 +3458,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, + * if the pageblock should be reserved for the future + */ + if (unlikely(alloc_flags & ALLOC_HIGHATOMIC)) +- reserve_highatomic_pageblock(page, zone); ++ reserve_highatomic_pageblock(page, order, zone); + + return page; + } else { +-- +Gitee + + +From 6285cc96f1f40030144d86d54604450cf31b3588 Mon Sep 17 00:00:00 2001 +From: Yu Zhao +Date: Wed, 18 Dec 2024 17:35:00 +0800 +Subject: [PATCH 17/19] mm/page_alloc: keep track of free highatomic + +mainline inclusion +from mainline-v6.12-rc7 +commit c928807f6f6b6d595a7e199591ae297c81de3aeb +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC4T0 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c928807f6f6b6d595a7e199591ae297c81de3aeb + +-------------------------------- + +OOM kills due to vastly overestimated free highatomic reserves were +observed: + + ... invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0 ... + Node 0 Normal free:1482936kB boost:0kB min:410416kB low:739404kB high:1068392kB reserved_highatomic:1073152KB ... + Node 0 Normal: 1292*4kB (ME) 1920*8kB (E) 383*16kB (UE) 220*32kB (ME) 340*64kB (E) 2155*128kB (UE) 3243*256kB (UE) 615*512kB (U) 1*1024kB (M) 0*2048kB 0*4096kB = 1477408kB + +The second line above shows that the OOM kill was due to the following +condition: + + free (1482936kB) - reserved_highatomic (1073152kB) = 409784KB < min (410416kB) + +And the third line shows there were no free pages in any +MIGRATE_HIGHATOMIC pageblocks, which otherwise would show up as type 'H'. +Therefore __zone_watermark_unusable_free() underestimated the usable free +memory by over 1GB, which resulted in the unnecessary OOM kill above. + +The comments in __zone_watermark_unusable_free() warns about the potential +risk, i.e., + + If the caller does not have rights to reserves below the min + watermark then subtract the high-atomic reserves. This will + over-estimate the size of the atomic reserve but it avoids a search. + +However, it is possible to keep track of free pages in reserved highatomic +pageblocks with a new per-zone counter nr_free_highatomic protected by the +zone lock, to avoid a search when calculating the usable free memory. And +the cost would be minimal, i.e., simple arithmetics in the highatomic +alloc/free/move paths. 
+ +Note that since nr_free_highatomic can be relatively small, using a +per-cpu counter might cause too much drift and defeat its purpose, in +addition to the extra memory overhead. + +Dependson e0932b6c1f94 ("mm: page_alloc: consolidate free page accounting") - see [1] + +[akpm@linux-foundation.org: s/if/else if/, per Johannes, stealth whitespace tweak] +Link: https://lkml.kernel.org/r/20241028182653.3420139-1-yuzhao@google.com +Link: https://lkml.kernel.org/r/0d0ddb33-fcdc-43e2-801f-0c1df2031afb@suse.cz [1] +Fixes: 0aaa29a56e4f ("mm, page_alloc: reserve pageblocks for high-order atomic allocations on demand") +Signed-off-by: Yu Zhao +Reported-by: Link Lin +Acked-by: David Rientjes +Acked-by: Vlastimil Babka +Acked-by: Johannes Weiner +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + include/linux/mmzone.h | 1 + + mm/page_alloc.c | 10 +++++++--- + 2 files changed, 8 insertions(+), 3 deletions(-) + +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index 3cee238de7c8..18bee72ebc71 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -865,6 +865,7 @@ struct zone { + unsigned long watermark_boost; + + unsigned long nr_reserved_highatomic; ++ unsigned long nr_free_highatomic; + + /* + * We don't know if the memory that we're going to allocate will be +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 53534165b5ab..e786cdd98bea 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -639,6 +639,8 @@ compaction_capture(struct capture_control *capc, struct page *page, + static inline void account_freepages(struct zone *zone, int nr_pages, + int migratetype) + { ++ lockdep_assert_held(&zone->lock); ++ + if (is_migrate_isolate(migratetype)) + return; + +@@ -646,6 +648,9 @@ static inline void account_freepages(struct zone *zone, int nr_pages, + + if (is_migrate_cma(migratetype)) + __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages); ++ else if (is_migrate_highatomic(migratetype)) ++ WRITE_ONCE(zone->nr_free_highatomic, ++ zone->nr_free_highatomic + nr_pages); + } + + /* Used for pages not on another list */ +@@ -3072,11 +3077,10 @@ static inline long __zone_watermark_unusable_free(struct zone *z, + + /* + * If the caller does not have rights to reserves below the min +- * watermark then subtract the high-atomic reserves. This will +- * over-estimate the size of the atomic reserve but it avoids a search. ++ * watermark then subtract the free pages reserved for highatomic. 
+ */ + if (likely(!(alloc_flags & ALLOC_RESERVES))) +- unusable_free += z->nr_reserved_highatomic; ++ unusable_free += READ_ONCE(z->nr_free_highatomic); + + #ifdef CONFIG_CMA + /* If allocation can't use CMA areas don't use free CMA pages */ +-- +Gitee + + +From 10a32a6824d78e5e36332cb1b7a4ee0bcca6812b Mon Sep 17 00:00:00 2001 +From: Kefeng Wang +Date: Wed, 18 Dec 2024 17:35:01 +0800 +Subject: [PATCH 18/19] mm: remove migration for HugePage in + isolate_single_pageblock() + +mainline inclusion +from mainline-v6.12-rc1 +commit cd5f3193b432cd70cc1c19aba790300dd11ae934 +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC4T0 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=cd5f3193b432cd70cc1c19aba790300dd11ae934 + +-------------------------------- + +The gigantic page size may larger than memory block size, so memory +offline always fails in this case after commit b2c9e2fbba32 ("mm: make +alloc_contig_range work at pageblock granularity"), + +offline_pages + start_isolate_page_range + start_isolate_page_range(isolate_before=true) + isolate [isolate_start, isolate_start + pageblock_nr_pages) + start_isolate_page_range(isolate_before=false) + isolate [isolate_end - pageblock_nr_pages, isolate_end) pageblock + __alloc_contig_migrate_range + isolate_migratepages_range + isolate_migratepages_block + isolate_or_dissolve_huge_page + if (hstate_is_gigantic(h)) + return -ENOMEM; + +[ 15.815756] memory offlining [mem 0x3c0000000-0x3c7ffffff] failed due to failure to isolate range + +Gigantic PageHuge is bigger than a pageblock, but since it is freed as +order-0 pages, its pageblocks after being freed will get to the right +free list. There is no need to have special handling code for them in +start_isolate_page_range(). For both alloc_contig_range() and memory +offline cases, the migration code after start_isolate_page_range() will +be able to migrate gigantic PageHuge when possible. Let's clean up +start_isolate_page_range() and fix the aforementioned memory offline +failure issue all together. + +Let's clean up start_isolate_page_range() and fix the aforementioned +memory offline failure issue all together. 
+ +Link: https://lkml.kernel.org/r/20240820032630.1894770-1-wangkefeng.wang@huawei.com +Fixes: b2c9e2fbba32 ("mm: make alloc_contig_range work at pageblock granularity") +Signed-off-by: Kefeng Wang +Acked-by: David Hildenbrand +Acked-by: Zi Yan +Cc: Matthew Wilcox +Cc: Oscar Salvador +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/page_isolation.c | 28 +++------------------------- + 1 file changed, 3 insertions(+), 25 deletions(-) + +diff --git a/mm/page_isolation.c b/mm/page_isolation.c +index b3aae89ed226..cf7f1922fc3e 100644 +--- a/mm/page_isolation.c ++++ b/mm/page_isolation.c +@@ -398,30 +398,8 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, + unsigned long head_pfn = page_to_pfn(head); + unsigned long nr_pages = compound_nr(head); + +- if (head_pfn + nr_pages <= boundary_pfn) { +- pfn = head_pfn + nr_pages; +- continue; +- } +- +-#if defined CONFIG_COMPACTION || defined CONFIG_CMA +- if (PageHuge(page)) { +- int page_mt = get_pageblock_migratetype(page); +- struct compact_control cc = { +- .nr_migratepages = 0, +- .order = -1, +- .zone = page_zone(pfn_to_page(head_pfn)), +- .mode = MIGRATE_SYNC, +- .ignore_skip_hint = true, +- .no_set_skip_hint = true, +- .gfp_mask = gfp_flags, +- .alloc_contig = true, +- }; +- INIT_LIST_HEAD(&cc.migratepages); +- +- ret = __alloc_contig_migrate_range(&cc, head_pfn, +- head_pfn + nr_pages, page_mt); +- if (ret) +- goto failed; ++ if (head_pfn + nr_pages <= boundary_pfn || ++ PageHuge(page)) { + pfn = head_pfn + nr_pages; + continue; + } +@@ -435,7 +413,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, + */ + VM_WARN_ON_ONCE_PAGE(PageLRU(page), page); + VM_WARN_ON_ONCE_PAGE(__PageMovable(page), page); +-#endif ++ + goto failed; + } + +-- +Gitee + + +From ad41788f98eca42a9da702565742413372a3e8e4 Mon Sep 17 00:00:00 2001 +From: Huan Yang +Date: Wed, 18 Dec 2024 17:35:02 +0800 +Subject: [PATCH 19/19] mm: page_alloc: simpify page del and expand + +mainline inclusion +from mainline-v6.12-rc1 +commit 94deaf69dcd33462c61fa8cabb0883e3085a1046 +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC4T0 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=94deaf69dcd33462c61fa8cabb0883e3085a1046 + +-------------------------------- + +When page del from buddy and need expand, it will account free_pages in +zone's migratetype. + +The current way is to subtract the page number of the current order when +deleting, and then add it back when expanding. + +This is unnecessary, as when migrating the same type, we can directly +record the difference between the high-order pages and the expand added, +and then subtract it directly. + +This patch merge that, only when del and expand done, then account +free_pages. 
+ +Link: https://lkml.kernel.org/r/20240826064048.187790-1-link@vivo.com +Signed-off-by: Huan Yang +Reviewed-by: Vlastimil Babka +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/page_alloc.c | 35 +++++++++++++++++++++++++---------- + 1 file changed, 25 insertions(+), 10 deletions(-) + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index e786cdd98bea..7734245d7870 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -1327,11 +1327,11 @@ struct page *__pageblock_pfn_to_page(unsigned long start_pfn, + * + * -- nyc + */ +-static inline void expand(struct zone *zone, struct page *page, +- int low, int high, int migratetype) ++static inline unsigned int expand(struct zone *zone, struct page *page, int low, ++ int high, int migratetype) + { +- unsigned long size = 1 << high; +- unsigned long nr_added = 0; ++ unsigned int size = 1 << high; ++ unsigned int nr_added = 0; + + while (high > low) { + high--; +@@ -1351,7 +1351,19 @@ static inline void expand(struct zone *zone, struct page *page, + set_buddy_order(&page[size], high); + nr_added += size; + } +- account_freepages(zone, nr_added, migratetype); ++ ++ return nr_added; ++} ++ ++static __always_inline void page_del_and_expand(struct zone *zone, ++ struct page *page, int low, ++ int high, int migratetype) ++{ ++ int nr_pages = 1 << high; ++ ++ __del_page_from_free_list(page, zone, high, migratetype); ++ nr_pages -= expand(zone, page, low, high, migratetype); ++ account_freepages(zone, -nr_pages, migratetype); + } + + static void check_new_page_bad(struct page *page) +@@ -1540,8 +1552,9 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, + page = get_page_from_free_area(area, migratetype); + if (!page) + continue; +- del_page_from_free_list(page, zone, current_order, migratetype); +- expand(zone, page, order, current_order, migratetype); ++ ++ page_del_and_expand(zone, page, order, current_order, ++ migratetype); + trace_mm_page_alloc_zone_locked(page, order, migratetype, + pcp_allowed_order(order) && + migratetype < MIGRATE_PCPTYPES); +@@ -1892,9 +1905,12 @@ steal_suitable_fallback(struct zone *zone, struct page *page, + + /* Take ownership for orders >= pageblock_order */ + if (current_order >= pageblock_order) { ++ unsigned int nr_added; ++ + del_page_from_free_list(page, zone, current_order, block_type); + change_pageblock_range(page, current_order, start_type); +- expand(zone, page, order, current_order, start_type); ++ nr_added = expand(zone, page, order, current_order, start_type); ++ account_freepages(zone, nr_added, start_type); + return page; + } + +@@ -1947,8 +1963,7 @@ steal_suitable_fallback(struct zone *zone, struct page *page, + } + + single_page: +- del_page_from_free_list(page, zone, current_order, block_type); +- expand(zone, page, order, current_order, block_type); ++ page_del_and_expand(zone, page, order, current_order, block_type); + return page; + } + +-- +Gitee + diff --git a/0026-14227.patch b/0026-14227.patch new file mode 100644 index 0000000000000000000000000000000000000000..4caa7397c454f7d8530d44b46573f1221ef106d8 --- /dev/null +++ b/0026-14227.patch @@ -0,0 +1,3464 @@ +From 3c8ff7deba8ed905fb4c3d05ccccdecb6000b7d4 Mon Sep 17 00:00:00 2001 +From: Chengming Zhou +Date: Wed, 18 Dec 2024 17:51:06 +0800 +Subject: [PATCH 01/14] mm/zswap: invalidate zswap entry when swap entry free + +mainline inclusion +from mainline-v6.9-rc1 +commit 0827a1fb143fae588cb6f5b9a97c405d6c2ddec9 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0827a1fb143fae588cb6f5b9a97c405d6c2ddec9 + +-------------------------------- + +During testing I found there are some times the zswap_writeback_entry() +return -ENOMEM, which is not we expected: + +bpftrace -e 'kr:zswap_writeback_entry {@[(int32)retval]=count()}' +@[-12]: 1563 +@[0]: 277221 + +The reason is that __read_swap_cache_async() return NULL because +swapcache_prepare() failed. The reason is that we won't invalidate zswap +entry when swap entry freed to the per-cpu pool, these zswap entries are +still on the zswap tree and lru list. + +This patch moves the invalidation ahead to when swap entry freed to the +per-cpu pool, since there is no any benefit to leave trashy zswap entry on +the tree and lru list. + +With this patch: +bpftrace -e 'kr:zswap_writeback_entry {@[(int32)retval]=count()}' +@[0]: 259744 + +Note: large folio can't have zswap entry for now, so don't bother +to add zswap entry invalidation in the large folio swap free path. + +Link: https://lkml.kernel.org/r/20240201-b4-zswap-invalidate-entry-v2-2-99d4084260a0@bytedance.com +Signed-off-by: Chengming Zhou +Reviewed-by: Nhat Pham +Acked-by: Johannes Weiner +Acked-by: Yosry Ahmed +Signed-off-by: Andrew Morton +Conflicts: + include/linux/zswap.h + mm/zswap.c +[ Context conflict. ] +Signed-off-by: Liu Shixin +--- + include/linux/zswap.h | 4 ++-- + mm/swap_slots.c | 4 ++++ + mm/swapfile.c | 1 - + mm/zswap.c | 5 +++-- + 4 files changed, 9 insertions(+), 5 deletions(-) + +diff --git a/include/linux/zswap.h b/include/linux/zswap.h +index 2a60ce39cfde..a13d2d2d9131 100644 +--- a/include/linux/zswap.h ++++ b/include/linux/zswap.h +@@ -12,7 +12,7 @@ extern atomic_t zswap_stored_pages; + + bool zswap_store(struct folio *folio); + bool zswap_load(struct folio *folio); +-void zswap_invalidate(int type, pgoff_t offset); ++void zswap_invalidate(swp_entry_t swp); + void zswap_swapon(int type); + void zswap_swapoff(int type); + +@@ -28,7 +28,7 @@ static inline bool zswap_load(struct folio *folio) + return false; + } + +-static inline void zswap_invalidate(int type, pgoff_t offset) {} ++static inline void zswap_invalidate(swp_entry_t swp) {} + static inline void zswap_swapon(int type) {} + static inline void zswap_swapoff(int type) {} + +diff --git a/mm/swap_slots.c b/mm/swap_slots.c +index 7af3b93d4c8c..5579eed7065f 100644 +--- a/mm/swap_slots.c ++++ b/mm/swap_slots.c +@@ -34,6 +34,7 @@ + #include + #include + #include ++#include + + static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots); + #ifdef CONFIG_MEMCG_SWAP_QOS +@@ -394,6 +395,9 @@ void free_swap_slot(swp_entry_t entry) + { + struct swap_slots_cache *cache; + ++ /* Large folio swap slot is not covered. 
*/ ++ zswap_invalidate(entry); ++ + cache = raw_cpu_ptr(&swp_slots); + if (likely(use_swap_slot_cache && cache->slots_ret)) { + spin_lock_irq(&cache->free_lock); +diff --git a/mm/swapfile.c b/mm/swapfile.c +index 3af5b6ebb241..30832b85d6c2 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -765,7 +765,6 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, + swap_slot_free_notify = NULL; + while (offset <= end) { + arch_swap_invalidate_page(si->type, offset); +- zswap_invalidate(si->type, offset); + if (swap_slot_free_notify) + swap_slot_free_notify(si->bdev, offset); + offset++; +diff --git a/mm/zswap.c b/mm/zswap.c +index 69681b9173fd..5acda5b906bc 100644 +--- a/mm/zswap.c ++++ b/mm/zswap.c +@@ -1482,9 +1482,10 @@ bool zswap_load(struct folio *folio) + return ret; + } + +-void zswap_invalidate(int type, pgoff_t offset) ++void zswap_invalidate(swp_entry_t swp) + { +- struct zswap_tree *tree = zswap_trees[type]; ++ pgoff_t offset = swp_offset(swp); ++ struct zswap_tree *tree = zswap_trees[swp_type(swp)]; + struct zswap_entry *entry; + + /* find */ +-- +Gitee + + +From e2f02eacab254e29bd451782950ac6a03de685bd Mon Sep 17 00:00:00 2001 +From: Chris Li +Date: Wed, 18 Dec 2024 17:51:07 +0800 +Subject: [PATCH 02/14] mm: swap: swap cluster switch to double link list +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +mainline inclusion +from mainline-v6.12-rc1 +commit 73ed0baae66df50359c876f65f41179d6ebd2716 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=73ed0baae66df50359c876f65f41179d6ebd2716 + +-------------------------------- + +Patch series "mm: swap: mTHP swap allocator base on swap cluster order", +v5. + +This is the short term solutions "swap cluster order" listed in my "Swap +Abstraction" discussion slice 8 in the recent LSF/MM conference. + +When commit 845982eb264bc "mm: swap: allow storage of all mTHP orders" is +introduced, it only allocates the mTHP swap entries from the new empty +cluster list.  It has a fragmentation issue reported by Barry. + +https://lore.kernel.org/all/CAGsJ_4zAcJkuW016Cfi6wicRr8N9X+GJJhgMQdSMp+Ah+NSgNQ@mail.gmail.com/ + +The reason is that all the empty clusters have been exhausted while there +are plenty of free swap entries in the cluster that are not 100% free. + +Remember the swap allocation order in the cluster. Keep track of the per +order non full cluster list for later allocation. + +This series gives the swap SSD allocation a new separate code path from +the HDD allocation. The new allocator use cluster list only and do not +global scan swap_map[] without lock any more. + +This streamline the swap allocation for SSD. The code matches the +execution flow much better. + +User impact: For users that allocate and free mix order mTHP swapping, It +greatly improves the success rate of the mTHP swap allocation after the +initial phase. + +It also performs faster when the swapfile is close to full, because the +allocator can get the non full cluster from a list rather than scanning a +lot of swap_map entries.  + +With Barry's mthp test program V2: + +Without: +$ ./thp_swap_allocator_test -a +Iteration 1: swpout inc: 32, swpout fallback inc: 192, Fallback percentage: 85.71% +Iteration 2: swpout inc: 0, swpout fallback inc: 231, Fallback percentage: 100.00% +Iteration 3: swpout inc: 0, swpout fallback inc: 227, Fallback percentage: 100.00% +... 
+Iteration 98: swpout inc: 0, swpout fallback inc: 224, Fallback percentage: 100.00% +Iteration 99: swpout inc: 0, swpout fallback inc: 215, Fallback percentage: 100.00% +Iteration 100: swpout inc: 0, swpout fallback inc: 222, Fallback percentage: 100.00% + +$ ./thp_swap_allocator_test -a -s +Iteration 1: swpout inc: 0, swpout fallback inc: 224, Fallback percentage: 100.00% +Iteration 2: swpout inc: 0, swpout fallback inc: 218, Fallback percentage: 100.00% +Iteration 3: swpout inc: 0, swpout fallback inc: 222, Fallback percentage: 100.00% +.. +Iteration 98: swpout inc: 0, swpout fallback inc: 228, Fallback percentage: 100.00% +Iteration 99: swpout inc: 0, swpout fallback inc: 230, Fallback percentage: 100.00% +Iteration 100: swpout inc: 0, swpout fallback inc: 229, Fallback percentage: 100.00% + +$ ./thp_swap_allocator_test -s +Iteration 1: swpout inc: 0, swpout fallback inc: 224, Fallback percentage: 100.00% +Iteration 2: swpout inc: 0, swpout fallback inc: 218, Fallback percentage: 100.00% +Iteration 3: swpout inc: 0, swpout fallback inc: 222, Fallback percentage: 100.00% +.. +Iteration 98: swpout inc: 0, swpout fallback inc: 228, Fallback percentage: 100.00% +Iteration 99: swpout inc: 0, swpout fallback inc: 230, Fallback percentage: 100.00% +Iteration 100: swpout inc: 0, swpout fallback inc: 229, Fallback percentage: 100.00% + +$ ./thp_swap_allocator_test +Iteration 1: swpout inc: 0, swpout fallback inc: 224, Fallback percentage: 100.00% +Iteration 2: swpout inc: 0, swpout fallback inc: 218, Fallback percentage: 100.00% +Iteration 3: swpout inc: 0, swpout fallback inc: 222, Fallback percentage: 100.00% +.. +Iteration 98: swpout inc: 0, swpout fallback inc: 228, Fallback percentage: 100.00% +Iteration 99: swpout inc: 0, swpout fallback inc: 230, Fallback percentage: 100.00% +Iteration 100: swpout inc: 0, swpout fallback inc: 229, Fallback percentage: 100.00% + +With: # with all 0.00% filter out +$ ./thp_swap_allocator_test -a | grep -v "0.00%" +$ # all result are 0.00% + +$ ./thp_swap_allocator_test -a -s | grep -v "0.00%" +./thp_swap_allocator_test -a -s | grep -v "0.00%" +Iteration 14: swpout inc: 223, swpout fallback inc: 3, Fallback percentage: 1.33% +Iteration 19: swpout inc: 219, swpout fallback inc: 7, Fallback percentage: 3.10% +Iteration 28: swpout inc: 225, swpout fallback inc: 1, Fallback percentage: 0.44% +Iteration 29: swpout inc: 227, swpout fallback inc: 1, Fallback percentage: 0.44% +Iteration 34: swpout inc: 220, swpout fallback inc: 8, Fallback percentage: 3.51% +Iteration 35: swpout inc: 222, swpout fallback inc: 11, Fallback percentage: 4.72% +Iteration 38: swpout inc: 217, swpout fallback inc: 4, Fallback percentage: 1.81% +Iteration 40: swpout inc: 222, swpout fallback inc: 6, Fallback percentage: 2.63% +Iteration 42: swpout inc: 221, swpout fallback inc: 2, Fallback percentage: 0.90% +Iteration 43: swpout inc: 215, swpout fallback inc: 7, Fallback percentage: 3.15% +Iteration 47: swpout inc: 226, swpout fallback inc: 2, Fallback percentage: 0.88% +Iteration 49: swpout inc: 217, swpout fallback inc: 1, Fallback percentage: 0.46% +Iteration 52: swpout inc: 221, swpout fallback inc: 8, Fallback percentage: 3.49% +Iteration 56: swpout inc: 224, swpout fallback inc: 4, Fallback percentage: 1.75% +Iteration 58: swpout inc: 214, swpout fallback inc: 5, Fallback percentage: 2.28% +Iteration 62: swpout inc: 220, swpout fallback inc: 3, Fallback percentage: 1.35% +Iteration 64: swpout inc: 224, swpout fallback inc: 1, Fallback percentage: 0.44% +Iteration 67: swpout inc: 
221, swpout fallback inc: 1, Fallback percentage: 0.45% +Iteration 75: swpout inc: 220, swpout fallback inc: 9, Fallback percentage: 3.93% +Iteration 82: swpout inc: 227, swpout fallback inc: 1, Fallback percentage: 0.44% +Iteration 86: swpout inc: 211, swpout fallback inc: 12, Fallback percentage: 5.38% +Iteration 89: swpout inc: 226, swpout fallback inc: 2, Fallback percentage: 0.88% +Iteration 93: swpout inc: 220, swpout fallback inc: 1, Fallback percentage: 0.45% +Iteration 94: swpout inc: 224, swpout fallback inc: 1, Fallback percentage: 0.44% +Iteration 96: swpout inc: 221, swpout fallback inc: 6, Fallback percentage: 2.64% +Iteration 98: swpout inc: 227, swpout fallback inc: 1, Fallback percentage: 0.44% +Iteration 99: swpout inc: 227, swpout fallback inc: 3, Fallback percentage: 1.30% + +$ ./thp_swap_allocator_test +./thp_swap_allocator_test +Iteration 1: swpout inc: 233, swpout fallback inc: 0, Fallback percentage: 0.00% +Iteration 2: swpout inc: 131, swpout fallback inc: 101, Fallback percentage: 43.53% +Iteration 3: swpout inc: 71, swpout fallback inc: 155, Fallback percentage: 68.58% +Iteration 4: swpout inc: 55, swpout fallback inc: 168, Fallback percentage: 75.34% +Iteration 5: swpout inc: 35, swpout fallback inc: 191, Fallback percentage: 84.51% +Iteration 6: swpout inc: 25, swpout fallback inc: 199, Fallback percentage: 88.84% +Iteration 7: swpout inc: 23, swpout fallback inc: 205, Fallback percentage: 89.91% +Iteration 8: swpout inc: 9, swpout fallback inc: 219, Fallback percentage: 96.05% +Iteration 9: swpout inc: 13, swpout fallback inc: 213, Fallback percentage: 94.25% +Iteration 10: swpout inc: 12, swpout fallback inc: 216, Fallback percentage: 94.74% +Iteration 11: swpout inc: 16, swpout fallback inc: 213, Fallback percentage: 93.01% +Iteration 12: swpout inc: 10, swpout fallback inc: 210, Fallback percentage: 95.45% +Iteration 13: swpout inc: 16, swpout fallback inc: 212, Fallback percentage: 92.98% +Iteration 14: swpout inc: 12, swpout fallback inc: 212, Fallback percentage: 94.64% +Iteration 15: swpout inc: 15, swpout fallback inc: 211, Fallback percentage: 93.36% +Iteration 16: swpout inc: 15, swpout fallback inc: 200, Fallback percentage: 93.02% +Iteration 17: swpout inc: 9, swpout fallback inc: 220, Fallback percentage: 96.07% + +$ ./thp_swap_allocator_test -s + ./thp_swap_allocator_test -s +Iteration 1: swpout inc: 233, swpout fallback inc: 0, Fallback percentage: 0.00% +Iteration 2: swpout inc: 97, swpout fallback inc: 135, Fallback percentage: 58.19% +Iteration 3: swpout inc: 42, swpout fallback inc: 192, Fallback percentage: 82.05% +Iteration 4: swpout inc: 19, swpout fallback inc: 214, Fallback percentage: 91.85% +Iteration 5: swpout inc: 12, swpout fallback inc: 213, Fallback percentage: 94.67% +Iteration 6: swpout inc: 11, swpout fallback inc: 217, Fallback percentage: 95.18% +Iteration 7: swpout inc: 9, swpout fallback inc: 214, Fallback percentage: 95.96% +Iteration 8: swpout inc: 8, swpout fallback inc: 213, Fallback percentage: 96.38% +Iteration 9: swpout inc: 2, swpout fallback inc: 223, Fallback percentage: 99.11% +Iteration 10: swpout inc: 2, swpout fallback inc: 228, Fallback percentage: 99.13% +Iteration 11: swpout inc: 4, swpout fallback inc: 214, Fallback percentage: 98.17% +Iteration 12: swpout inc: 5, swpout fallback inc: 226, Fallback percentage: 97.84% +Iteration 13: swpout inc: 3, swpout fallback inc: 212, Fallback percentage: 98.60% +Iteration 14: swpout inc: 0, swpout fallback inc: 222, Fallback percentage: 100.00% +Iteration 15: swpout 
inc: 3, swpout fallback inc: 222, Fallback percentage: 98.67% +Iteration 16: swpout inc: 4, swpout fallback inc: 223, Fallback percentage: 98.24% + +========= +Kernel compile under tmpfs with cgroup memory.max = 470M. +12 core 24 hyperthreading, 32 jobs. 10 Run each group + +SSD swap 10 runs average, 20G swap partition: +With: +user 2929.064 +system 1479.381 : 1376.89 1398.22 1444.64 1477.39 1479.04 1497.27 +1504.47 1531.4 1532.92 1551.57 +real 1441.324 + +Without: +user 2910.872 +system 1482.732 : 1440.01 1451.4 1462.01 1467.47 1467.51 1469.3 +1470.19 1496.32 1544.1 1559.01 +real 1580.822 + +Two zram swap: zram0 3.0G zram1 20G. + +The idea is forcing the zram0 almost full then overflow to zram1: + +With: +user 4320.301 +system 4272.403 : 4236.24 4262.81 4264.75 4269.13 4269.44 4273.06 +4279.85 4285.98 4289.64 4293.13 +real 431.759 + +Without +user 4301.393 +system 4387.672 : 4374.47 4378.3 4380.95 4382.84 4383.06 4388.05 +4389.76 4397.16 4398.23 4403.9 +real 433.979 + +------ more test result from Kaiui ---------- + +Test with build linux kernel using a 4G ZRAM, 1G memory.max limit on top of shmem: + +System info: 32 Core AMD Zen2, 64G total memory. + +Test 3 times using only 4K pages: +================================= + +With: +----- +1838.74user 2411.21system 2:37.86elapsed 2692%CPU (0avgtext+0avgdata 847060maxresident)k +1839.86user 2465.77system 2:39.35elapsed 2701%CPU (0avgtext+0avgdata 847060maxresident)k +1840.26user 2454.68system 2:39.43elapsed 2693%CPU (0avgtext+0avgdata 847060maxresident)k + +Summary (~4.6% improment of system time): +User: 1839.62 +System: 2443.89: 2465.77 2454.68 2411.21 +Real: 158.88 + +Without: +-------- +1837.99user 2575.95system 2:43.09elapsed 2706%CPU (0avgtext+0avgdata 846520maxresident)k +1838.32user 2555.15system 2:42.52elapsed 2709%CPU (0avgtext+0avgdata 846520maxresident)k +1843.02user 2561.55system 2:43.35elapsed 2702%CPU (0avgtext+0avgdata 846520maxresident)k + +Summary: +User: 1839.78 +System: 2564.22: 2575.95 2555.15 2561.55 +Real: 162.99 + +Test 5 times using enabled all mTHP pages: +========================================== + +With: +----- +1796.44user 2937.33system 2:59.09elapsed 2643%CPU (0avgtext+0avgdata 846936maxresident)k +1802.55user 3002.32system 2:54.68elapsed 2750%CPU (0avgtext+0avgdata 847072maxresident)k +1806.59user 2986.53system 2:55.17elapsed 2736%CPU (0avgtext+0avgdata 847092maxresident)k +1803.27user 2982.40system 2:54.49elapsed 2742%CPU (0avgtext+0avgdata 846796maxresident)k +1807.43user 3036.08system 2:56.06elapsed 2751%CPU (0avgtext+0avgdata 846488maxresident)k + +Summary (~8.4% improvement of system time): +User: 1803.25 +System: 2988.93: 2937.33 3002.32 2986.53 2982.40 3036.08 +Real: 175.90 + +mTHP swapout status: +/sys/kernel/mm/transparent_hugepage/hugepages-32kB/stats/swpout:347721 +/sys/kernel/mm/transparent_hugepage/hugepages-32kB/stats/swpout_fallback:3110 +/sys/kernel/mm/transparent_hugepage/hugepages-512kB/stats/swpout:3365 +/sys/kernel/mm/transparent_hugepage/hugepages-512kB/stats/swpout_fallback:8269 +/sys/kernel/mm/transparent_hugepage/hugepages-2048kB/stats/swpout:24 +/sys/kernel/mm/transparent_hugepage/hugepages-2048kB/stats/swpout_fallback:3341 +/sys/kernel/mm/transparent_hugepage/hugepages-1024kB/stats/swpout:145 +/sys/kernel/mm/transparent_hugepage/hugepages-1024kB/stats/swpout_fallback:5038 +/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout:322737 +/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout_fallback:36808 
+/sys/kernel/mm/transparent_hugepage/hugepages-16kB/stats/swpout:380455 +/sys/kernel/mm/transparent_hugepage/hugepages-16kB/stats/swpout_fallback:1010 +/sys/kernel/mm/transparent_hugepage/hugepages-256kB/stats/swpout:24973 +/sys/kernel/mm/transparent_hugepage/hugepages-256kB/stats/swpout_fallback:13223 +/sys/kernel/mm/transparent_hugepage/hugepages-128kB/stats/swpout:197348 +/sys/kernel/mm/transparent_hugepage/hugepages-128kB/stats/swpout_fallback:80541 + +Without: +-------- +1794.41user 3151.29system 3:05.97elapsed 2659%CPU (0avgtext+0avgdata 846704maxresident)k +1810.27user 3304.48system 3:05.38elapsed 2759%CPU (0avgtext+0avgdata 846636maxresident)k +1809.84user 3254.85system 3:03.83elapsed 2755%CPU (0avgtext+0avgdata 846952maxresident)k +1813.54user 3259.56system 3:04.28elapsed 2752%CPU (0avgtext+0avgdata 846848maxresident)k +1829.97user 3338.40system 3:07.32elapsed 2759%CPU (0avgtext+0avgdata 847024maxresident)k + +Summary: +User: 1811.61 +System: 3261.72 : 3151.29 3304.48 3254.85 3259.56 3338.40 +Real: 185.356 + +mTHP swapout status: +hugepages-32kB/stats/swpout:35630 +hugepages-32kB/stats/swpout_fallback:1809908 +hugepages-512kB/stats/swpout:523 +hugepages-512kB/stats/swpout_fallback:55235 +hugepages-2048kB/stats/swpout:53 +hugepages-2048kB/stats/swpout_fallback:17264 +hugepages-1024kB/stats/swpout:85 +hugepages-1024kB/stats/swpout_fallback:24979 +hugepages-64kB/stats/swpout:30117 +hugepages-64kB/stats/swpout_fallback:1825399 +hugepages-16kB/stats/swpout:42775 +hugepages-16kB/stats/swpout_fallback:1951123 +hugepages-256kB/stats/swpout:2326 +hugepages-256kB/stats/swpout_fallback:170165 +hugepages-128kB/stats/swpout:17925 +hugepages-128kB/stats/swpout_fallback:1309757 + +This patch (of 9): + +Previously, the swap cluster used a cluster index as a pointer to +construct a custom single link list type "swap_cluster_list". The next +cluster pointer is shared with the cluster->count. It prevents puting the +non free cluster into a list. + +Change the cluster to use the standard double link list instead. This +allows tracing the nonfull cluster in the follow up patch. That way, it +is faster to get to the nonfull cluster of that order. + +Remove the cluster getter/setter for accessing the cluster struct member. + +The list operation is protected by the swap_info_struct->lock. + +Change cluster code to use "struct swap_cluster_info *" to reference the +cluster rather than by using index. That is more consistent with the list +manipulation. It avoids the repeat adding index to the cluser_info. The +code is easier to understand. + +Remove the cluster next pointer is NULL flag, the double link list can +handle the empty list pretty well. + +The "swap_cluster_info" struct is two pointer bigger, because 512 swap +entries share one swap_cluster_info struct, it has very little impact on +the average memory usage per swap entry. For 1TB swapfile, the swap +cluster data structure increases from 8MB to 24MB. + +Other than the list conversion, there is no real function change in this +patch. 
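+
+As an editorial illustration of the conversion (not part of the upstream
+patch), the open-coded index-list operations become plain list_head calls,
+still done under swap_info_struct->lock:
+
+    /* old: cluster_list_add_tail(&si->free_clusters, si->cluster_info, idx); */
+    list_add_tail(&ci->list, &si->free_clusters);
+
+    /* old: idx = cluster_list_del_first(&si->free_clusters, si->cluster_info); */
+    ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list);
+    list_del(&ci->list);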
+ +Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-0-cb9c148b9297@kernel.org +Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-1-cb9c148b9297@kernel.org +Signed-off-by: Chris Li +Reported-by: Barry Song <21cnbao@gmail.com> +Reviewed-by: "Huang, Ying" +Cc: Hugh Dickins +Cc: Kairui Song +Cc: Kalesh Singh +Cc: Ryan Roberts +Signed-off-by: Andrew Morton +Conflicts: + include/linux/swap.h +Signed-off-by: Liu Shixin +--- + include/linux/swap.h | 25 ++--- + mm/swapfile.c | 226 ++++++++++++------------------------------- + 2 files changed, 71 insertions(+), 180 deletions(-) + +diff --git a/include/linux/swap.h b/include/linux/swap.h +index bea0c0f1f640..94e1b6bb04ce 100644 +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -255,22 +255,20 @@ enum { + * free clusters are organized into a list. We fetch an entry from the list to + * get a free cluster. + * +- * The data field stores next cluster if the cluster is free or cluster usage +- * counter otherwise. The flags field determines if a cluster is free. This is +- * protected by swap_info_struct.lock. ++ * The flags field determines if a cluster is free. This is ++ * protected by cluster lock. + */ + struct swap_cluster_info { + spinlock_t lock; /* + * Protect swap_cluster_info fields +- * and swap_info_struct->swap_map +- * elements correspond to the swap +- * cluster ++ * other than list, and swap_info_struct->swap_map ++ * elements corresponding to the swap cluster. + */ +- unsigned int data:24; +- unsigned int flags:8; ++ u16 count; ++ u8 flags; ++ struct list_head list; + }; + #define CLUSTER_FLAG_FREE 1 /* This cluster is free */ +-#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */ + + /* + * The first page in the swap file is the swap header, which is always marked +@@ -295,11 +293,6 @@ struct percpu_cluster { + unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */ + }; + +-struct swap_cluster_list { +- struct swap_cluster_info head; +- struct swap_cluster_info tail; +-}; +- + /* + * The in-memory structure used to track swap areas. + */ +@@ -312,7 +305,7 @@ struct swap_info_struct { + unsigned int max; /* extent of the swap_map */ + unsigned char *swap_map; /* vmalloc'ed array of usage counts */ + struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ +- struct swap_cluster_list free_clusters; /* free clusters list */ ++ struct list_head free_clusters; /* free clusters list */ + unsigned int lowest_bit; /* index of first free in swap_map */ + unsigned int highest_bit; /* index of last free in swap_map */ + unsigned int pages; /* total of usable pages of swap */ +@@ -345,7 +338,7 @@ struct swap_info_struct { + * list. 
+ */ + struct work_struct discard_work; /* discard worker */ +- struct swap_cluster_list discard_clusters; /* discard clusters list */ ++ struct list_head discard_clusters; /* discard clusters list */ + KABI_RESERVE(1) + KABI_RESERVE(2) + KABI_RESERVE(3) +diff --git a/mm/swapfile.c b/mm/swapfile.c +index 30832b85d6c2..76b344438606 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -289,62 +289,15 @@ static void discard_swap_cluster(struct swap_info_struct *si, + #endif + #define LATENCY_LIMIT 256 + +-static inline void cluster_set_flag(struct swap_cluster_info *info, +- unsigned int flag) +-{ +- info->flags = flag; +-} +- +-static inline unsigned int cluster_count(struct swap_cluster_info *info) +-{ +- return info->data; +-} +- +-static inline void cluster_set_count(struct swap_cluster_info *info, +- unsigned int c) +-{ +- info->data = c; +-} +- +-static inline void cluster_set_count_flag(struct swap_cluster_info *info, +- unsigned int c, unsigned int f) +-{ +- info->flags = f; +- info->data = c; +-} +- +-static inline unsigned int cluster_next(struct swap_cluster_info *info) +-{ +- return info->data; +-} +- +-static inline void cluster_set_next(struct swap_cluster_info *info, +- unsigned int n) +-{ +- info->data = n; +-} +- +-static inline void cluster_set_next_flag(struct swap_cluster_info *info, +- unsigned int n, unsigned int f) +-{ +- info->flags = f; +- info->data = n; +-} +- + static inline bool cluster_is_free(struct swap_cluster_info *info) + { + return info->flags & CLUSTER_FLAG_FREE; + } + +-static inline bool cluster_is_null(struct swap_cluster_info *info) +-{ +- return info->flags & CLUSTER_FLAG_NEXT_NULL; +-} +- +-static inline void cluster_set_null(struct swap_cluster_info *info) ++static inline unsigned int cluster_index(struct swap_info_struct *si, ++ struct swap_cluster_info *ci) + { +- info->flags = CLUSTER_FLAG_NEXT_NULL; +- info->data = 0; ++ return ci - si->cluster_info; + } + + static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, +@@ -393,65 +346,11 @@ static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si, + spin_unlock(&si->lock); + } + +-static inline bool cluster_list_empty(struct swap_cluster_list *list) +-{ +- return cluster_is_null(&list->head); +-} +- +-static inline unsigned int cluster_list_first(struct swap_cluster_list *list) +-{ +- return cluster_next(&list->head); +-} +- +-static void cluster_list_init(struct swap_cluster_list *list) +-{ +- cluster_set_null(&list->head); +- cluster_set_null(&list->tail); +-} +- +-static void cluster_list_add_tail(struct swap_cluster_list *list, +- struct swap_cluster_info *ci, +- unsigned int idx) +-{ +- if (cluster_list_empty(list)) { +- cluster_set_next_flag(&list->head, idx, 0); +- cluster_set_next_flag(&list->tail, idx, 0); +- } else { +- struct swap_cluster_info *ci_tail; +- unsigned int tail = cluster_next(&list->tail); +- +- /* +- * Nested cluster lock, but both cluster locks are +- * only acquired when we held swap_info_struct->lock +- */ +- ci_tail = ci + tail; +- spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING); +- cluster_set_next(ci_tail, idx); +- spin_unlock(&ci_tail->lock); +- cluster_set_next_flag(&list->tail, idx, 0); +- } +-} +- +-static unsigned int cluster_list_del_first(struct swap_cluster_list *list, +- struct swap_cluster_info *ci) +-{ +- unsigned int idx; +- +- idx = cluster_next(&list->head); +- if (cluster_next(&list->tail) == idx) { +- cluster_set_null(&list->head); +- cluster_set_null(&list->tail); +- } else +- 
cluster_set_next_flag(&list->head, +- cluster_next(&ci[idx]), 0); +- +- return idx; +-} +- + /* Add a cluster to discard list and schedule it to do discard */ + static void swap_cluster_schedule_discard(struct swap_info_struct *si, +- unsigned int idx) ++ struct swap_cluster_info *ci) + { ++ unsigned int idx = cluster_index(si, ci); + /* + * If scan_swap_map_slots() can't find a free cluster, it will check + * si->swap_map directly. To make sure the discarding cluster isn't +@@ -461,17 +360,14 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, + memset(si->swap_map + idx * SWAPFILE_CLUSTER, + SWAP_MAP_BAD, SWAPFILE_CLUSTER); + +- cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx); +- ++ list_add_tail(&ci->list, &si->discard_clusters); + schedule_work(&si->discard_work); + } + +-static void __free_cluster(struct swap_info_struct *si, unsigned long idx) ++static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) + { +- struct swap_cluster_info *ci = si->cluster_info; +- +- cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE); +- cluster_list_add_tail(&si->free_clusters, ci, idx); ++ ci->flags = CLUSTER_FLAG_FREE; ++ list_add_tail(&ci->list, &si->free_clusters); + } + + /* +@@ -480,24 +376,25 @@ static void __free_cluster(struct swap_info_struct *si, unsigned long idx) + */ + static void swap_do_scheduled_discard(struct swap_info_struct *si) + { +- struct swap_cluster_info *info, *ci; ++ struct swap_cluster_info *ci; + unsigned int idx; + +- info = si->cluster_info; +- +- while (!cluster_list_empty(&si->discard_clusters)) { +- idx = cluster_list_del_first(&si->discard_clusters, info); ++ while (!list_empty(&si->discard_clusters)) { ++ ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list); ++ list_del(&ci->list); ++ idx = cluster_index(si, ci); + spin_unlock(&si->lock); + + discard_swap_cluster(si, idx * SWAPFILE_CLUSTER, + SWAPFILE_CLUSTER); + + spin_lock(&si->lock); +- ci = lock_cluster(si, idx * SWAPFILE_CLUSTER); +- __free_cluster(si, idx); ++ ++ spin_lock(&ci->lock); ++ __free_cluster(si, ci); + memset(si->swap_map + idx * SWAPFILE_CLUSTER, + 0, SWAPFILE_CLUSTER); +- unlock_cluster(ci); ++ spin_unlock(&ci->lock); + } + } + +@@ -520,20 +417,21 @@ static void swap_users_ref_free(struct percpu_ref *ref) + complete(&si->comp); + } + +-static void alloc_cluster(struct swap_info_struct *si, unsigned long idx) ++static struct swap_cluster_info *alloc_cluster(struct swap_info_struct *si, unsigned long idx) + { +- struct swap_cluster_info *ci = si->cluster_info; ++ struct swap_cluster_info *ci = list_first_entry(&si->free_clusters, ++ struct swap_cluster_info, list); + +- VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx); +- cluster_list_del_first(&si->free_clusters, ci); +- cluster_set_count_flag(ci + idx, 0, 0); ++ VM_BUG_ON(cluster_index(si, ci) != idx); ++ list_del(&ci->list); ++ ci->count = 0; ++ ci->flags = 0; ++ return ci; + } + +-static void free_cluster(struct swap_info_struct *si, unsigned long idx) ++static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) + { +- struct swap_cluster_info *ci = si->cluster_info + idx; +- +- VM_BUG_ON(cluster_count(ci) != 0); ++ VM_BUG_ON(ci->count != 0); + /* + * If the swap is discardable, prepare discard the cluster + * instead of free it immediately. 
The cluster will be freed +@@ -541,11 +439,11 @@ static void free_cluster(struct swap_info_struct *si, unsigned long idx) + */ + if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) == + (SWP_WRITEOK | SWP_PAGE_DISCARD)) { +- swap_cluster_schedule_discard(si, idx); ++ swap_cluster_schedule_discard(si, ci); + return; + } + +- __free_cluster(si, idx); ++ __free_cluster(si, ci); + } + + /* +@@ -558,15 +456,15 @@ static void add_cluster_info_page(struct swap_info_struct *p, + unsigned long count) + { + unsigned long idx = page_nr / SWAPFILE_CLUSTER; ++ struct swap_cluster_info *ci = cluster_info + idx; + + if (!cluster_info) + return; +- if (cluster_is_free(&cluster_info[idx])) ++ if (cluster_is_free(ci)) + alloc_cluster(p, idx); + +- VM_BUG_ON(cluster_count(&cluster_info[idx]) + count > SWAPFILE_CLUSTER); +- cluster_set_count(&cluster_info[idx], +- cluster_count(&cluster_info[idx]) + count); ++ VM_BUG_ON(ci->count + count > SWAPFILE_CLUSTER); ++ ci->count += count; + } + + /* +@@ -580,24 +478,20 @@ static void inc_cluster_info_page(struct swap_info_struct *p, + } + + /* +- * The cluster corresponding to page_nr decreases one usage. If the usage +- * counter becomes 0, which means no page in the cluster is in using, we can +- * optionally discard the cluster and add it to free cluster list. ++ * The cluster ci decreases one usage. If the usage counter becomes 0, ++ * which means no page in the cluster is in use, we can optionally discard ++ * the cluster and add it to free cluster list. + */ +-static void dec_cluster_info_page(struct swap_info_struct *p, +- struct swap_cluster_info *cluster_info, unsigned long page_nr) ++static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluster_info *ci) + { +- unsigned long idx = page_nr / SWAPFILE_CLUSTER; +- +- if (!cluster_info) ++ if (!p->cluster_info) + return; + +- VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0); +- cluster_set_count(&cluster_info[idx], +- cluster_count(&cluster_info[idx]) - 1); ++ VM_BUG_ON(ci->count == 0); ++ ci->count--; + +- if (cluster_count(&cluster_info[idx]) == 0) +- free_cluster(p, idx); ++ if (!ci->count) ++ free_cluster(p, ci); + } + + /* +@@ -610,10 +504,12 @@ scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, + { + struct percpu_cluster *percpu_cluster; + bool conflict; ++ struct swap_cluster_info *first = list_first_entry(&si->free_clusters, ++ struct swap_cluster_info, list); + + offset /= SWAPFILE_CLUSTER; +- conflict = !cluster_list_empty(&si->free_clusters) && +- offset != cluster_list_first(&si->free_clusters) && ++ conflict = !list_empty(&si->free_clusters) && ++ offset != cluster_index(si, first) && + cluster_is_free(&si->cluster_info[offset]); + + if (!conflict) +@@ -654,10 +550,10 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, + cluster = this_cpu_ptr(si->percpu_cluster); + tmp = cluster->next[order]; + if (tmp == SWAP_NEXT_INVALID) { +- if (!cluster_list_empty(&si->free_clusters)) { +- tmp = cluster_next(&si->free_clusters.head) * +- SWAPFILE_CLUSTER; +- } else if (!cluster_list_empty(&si->discard_clusters)) { ++ if (!list_empty(&si->free_clusters)) { ++ ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); ++ tmp = cluster_index(si, ci) * SWAPFILE_CLUSTER; ++ } else if (!list_empty(&si->discard_clusters)) { + /* + * we don't have free cluster but have some clusters in + * discarding, do discard now and reclaim them, then +@@ -1055,8 +951,9 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) + + ci = 
lock_cluster(si, offset); + memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); +- cluster_set_count_flag(ci, 0, 0); +- free_cluster(si, idx); ++ ci->count = 0; ++ ci->flags = 0; ++ free_cluster(si, ci); + unlock_cluster(ci); + swap_range_free(si, offset, SWAPFILE_CLUSTER); + } +@@ -1418,7 +1315,7 @@ static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry) + count = p->swap_map[offset]; + VM_BUG_ON(count != SWAP_HAS_CACHE); + p->swap_map[offset] = 0; +- dec_cluster_info_page(p, p->cluster_info, offset); ++ dec_cluster_info_page(p, ci); + unlock_cluster(ci); + + mem_cgroup_uncharge_swap(entry, 1); +@@ -3113,8 +3010,8 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, + + nr_good_pages = maxpages - 1; /* omit header page */ + +- cluster_list_init(&p->free_clusters); +- cluster_list_init(&p->discard_clusters); ++ INIT_LIST_HEAD(&p->free_clusters); ++ INIT_LIST_HEAD(&p->discard_clusters); + + for (i = 0; i < swap_header->info.nr_badpages; i++) { + unsigned int page_nr = swap_header->info.badpages[i]; +@@ -3165,14 +3062,15 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, + for (k = 0; k < SWAP_CLUSTER_COLS; k++) { + j = (k + col) % SWAP_CLUSTER_COLS; + for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) { ++ struct swap_cluster_info *ci; + idx = i * SWAP_CLUSTER_COLS + j; ++ ci = cluster_info + idx; + if (idx >= nr_clusters) + continue; +- if (cluster_count(&cluster_info[idx])) ++ if (ci->count) + continue; +- cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); +- cluster_list_add_tail(&p->free_clusters, cluster_info, +- idx); ++ ci->flags = CLUSTER_FLAG_FREE; ++ list_add_tail(&ci->list, &p->free_clusters); + } + } + return nr_extents; +-- +Gitee + + +From 3bc5a5e67c63e14fe1342ed16ecb304cf60d94b3 Mon Sep 17 00:00:00 2001 +From: Chris Li +Date: Wed, 18 Dec 2024 17:51:08 +0800 +Subject: [PATCH 03/14] mm: swap: mTHP allocate swap entries from nonfull list +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +mainline inclusion +from mainline-v6.12-rc1 +commit d07a46a4ac18786e7f4c98fb08525ed80dd1f642 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d07a46a4ac18786e7f4c98fb08525ed80dd1f642 + +-------------------------------- + +Track the nonfull cluster as well as the empty cluster on lists. Each +order has one nonfull cluster list. + +The cluster will remember which order it was used during new cluster +allocation. + +When the cluster has free entry, add to the nonfull[order] list.  When +the free cluster list is empty, also allocate from the nonempty list of +that order. + +This improves the mTHP swap allocation success rate. + +There are limitations if the distribution of numbers of different orders +of mTHP changes a lot. e.g. there are a lot of nonfull cluster assign to +order A while later time there are a lot of order B allocation while very +little allocation in order A. Currently the cluster used by order A will +not reused by order B unless the cluster is 100% empty. 
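+
+The resulting preference order for an order-N allocation looks roughly like
+this (editorial sketch of the scan_swap_map_try_ssd_cluster() hunk below,
+not part of the upstream patch):
+
+    if (cluster->next[order] != SWAP_NEXT_INVALID) {
+        /* keep filling the current per-CPU cluster */
+    } else if (!list_empty(&si->free_clusters)) {
+        /* take a fresh cluster and tag it with this order */
+    } else if (!list_empty(&si->nonfull_clusters[order])) {
+        /* reuse a partially free cluster of the same order */
+    } else if (!list_empty(&si->discard_clusters)) {
+        /* flush pending discards, then retry */
+    }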
+ +Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-2-cb9c148b9297@kernel.org +Signed-off-by: Chris Li +Reported-by: Barry Song <21cnbao@gmail.com> +Cc: "Huang, Ying" +Cc: Hugh Dickins +Cc: Kairui Song +Cc: Kalesh Singh +Cc: Ryan Roberts +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + include/linux/swap.h | 4 ++++ + mm/swapfile.c | 38 +++++++++++++++++++++++++++++++++++--- + 2 files changed, 39 insertions(+), 3 deletions(-) + +diff --git a/include/linux/swap.h b/include/linux/swap.h +index 94e1b6bb04ce..29a1daa46421 100644 +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -266,9 +266,11 @@ struct swap_cluster_info { + */ + u16 count; + u8 flags; ++ u8 order; + struct list_head list; + }; + #define CLUSTER_FLAG_FREE 1 /* This cluster is free */ ++#define CLUSTER_FLAG_NONFULL 2 /* This cluster is on nonfull list */ + + /* + * The first page in the swap file is the swap header, which is always marked +@@ -306,6 +308,8 @@ struct swap_info_struct { + unsigned char *swap_map; /* vmalloc'ed array of usage counts */ + struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ + struct list_head free_clusters; /* free clusters list */ ++ struct list_head nonfull_clusters[SWAP_NR_ORDERS]; ++ /* list of cluster that contains at least one free slot */ + unsigned int lowest_bit; /* index of first free in swap_map */ + unsigned int highest_bit; /* index of last free in swap_map */ + unsigned int pages; /* total of usable pages of swap */ +diff --git a/mm/swapfile.c b/mm/swapfile.c +index 76b344438606..adde6877c0fe 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -360,14 +360,22 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, + memset(si->swap_map + idx * SWAPFILE_CLUSTER, + SWAP_MAP_BAD, SWAPFILE_CLUSTER); + +- list_add_tail(&ci->list, &si->discard_clusters); ++ VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); ++ if (ci->flags & CLUSTER_FLAG_NONFULL) ++ list_move_tail(&ci->list, &si->discard_clusters); ++ else ++ list_add_tail(&ci->list, &si->discard_clusters); ++ ci->flags = 0; + schedule_work(&si->discard_work); + } + + static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) + { ++ if (ci->flags & CLUSTER_FLAG_NONFULL) ++ list_move_tail(&ci->list, &si->free_clusters); ++ else ++ list_add_tail(&ci->list, &si->free_clusters); + ci->flags = CLUSTER_FLAG_FREE; +- list_add_tail(&ci->list, &si->free_clusters); + } + + /* +@@ -490,8 +498,15 @@ static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluste + VM_BUG_ON(ci->count == 0); + ci->count--; + +- if (!ci->count) ++ if (!ci->count) { + free_cluster(p, ci); ++ return; ++ } ++ ++ if (!(ci->flags & CLUSTER_FLAG_NONFULL)) { ++ list_add_tail(&ci->list, &p->nonfull_clusters[ci->order]); ++ ci->flags |= CLUSTER_FLAG_NONFULL; ++ } + } + + /* +@@ -552,6 +567,19 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, + if (tmp == SWAP_NEXT_INVALID) { + if (!list_empty(&si->free_clusters)) { + ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); ++ list_del(&ci->list); ++ spin_lock(&ci->lock); ++ ci->order = order; ++ ci->flags = 0; ++ spin_unlock(&ci->lock); ++ tmp = cluster_index(si, ci) * SWAPFILE_CLUSTER; ++ } else if (!list_empty(&si->nonfull_clusters[order])) { ++ ci = list_first_entry(&si->nonfull_clusters[order], ++ struct swap_cluster_info, list); ++ list_del(&ci->list); ++ spin_lock(&ci->lock); ++ ci->flags = 0; ++ spin_unlock(&ci->lock); + tmp = cluster_index(si, ci) * SWAPFILE_CLUSTER; + } 
else if (!list_empty(&si->discard_clusters)) { + /* +@@ -952,6 +980,7 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) + ci = lock_cluster(si, offset); + memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); + ci->count = 0; ++ ci->order = 0; + ci->flags = 0; + free_cluster(si, ci); + unlock_cluster(ci); +@@ -3013,6 +3042,9 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, + INIT_LIST_HEAD(&p->free_clusters); + INIT_LIST_HEAD(&p->discard_clusters); + ++ for (i = 0; i < SWAP_NR_ORDERS; i++) ++ INIT_LIST_HEAD(&p->nonfull_clusters[i]); ++ + for (i = 0; i < swap_header->info.nr_badpages; i++) { + unsigned int page_nr = swap_header->info.badpages[i]; + if (page_nr == 0 || page_nr > swap_header->info.last_page) +-- +Gitee + + +From 71c1b6bdf4681e292a269a16337b6fbf64c388d6 Mon Sep 17 00:00:00 2001 +From: Chris Li +Date: Wed, 18 Dec 2024 17:51:09 +0800 +Subject: [PATCH 04/14] mm: swap: separate SSD allocation from + scan_swap_map_slots() + +mainline inclusion +from mainline-v6.12-rc1 +commit 5f843a9a3a1e865fbf349419bde39977c2e7d3d1 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5f843a9a3a1e865fbf349419bde39977c2e7d3d1 + +-------------------------------- + +Previously the SSD and HDD share the same swap_map scan loop in +scan_swap_map_slots(). This function is complex and hard to flow the +execution flow. + +scan_swap_map_try_ssd_cluster() can already do most of the heavy lifting +to locate the candidate swap range in the cluster. However it needs to go +back to scan_swap_map_slots() to check conflict and then perform the +allocation. + +When scan_swap_map_try_ssd_cluster() failed, it still depended on the +scan_swap_map_slots() to do brute force scanning of the swap_map. When +the swapfile is large and almost full, it will take some CPU time to go +through the swap_map array. + +Get rid of the cluster allocation dependency on the swap_map scan loop in +scan_swap_map_slots(). Streamline the cluster allocation code path. No +more conflict checks. + +For order 0 swap entry, when run out of free and nonfull list. It will +allocate from the higher order nonfull cluster list. + +Users should see less CPU time spent on searching the free swap slot when +swapfile is almost full. 
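+
+The top-level split then looks like this (editorial sketch of the
+scan_swap_map_slots() hunk below, not part of the upstream patch):
+
+    if (si->cluster_info)   /* SSD: cluster lists only, no swap_map scan */
+        return cluster_alloc_swap(si, usage, nr, slots, order);
+
+    /* HDD: keep using the sequential swap_map scanning loop below */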
+ +[ryncsn@gmail.com: fix array-bounds error with CONFIG_THP_SWAP=n] + Link: https://lkml.kernel.org/r/CAMgjq7Bz0DY+rY0XgCoH7-Q=uHLdo3omi8kUr4ePDweNyofsbQ@mail.gmail.com +Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-3-cb9c148b9297@kernel.org +Signed-off-by: Chris Li +Signed-off-by: Kairui Song +Reported-by: Barry Song <21cnbao@gmail.com> +Cc: "Huang, Ying" +Cc: Hugh Dickins +Cc: Kalesh Singh +Cc: Ryan Roberts +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/swapfile.c | 300 ++++++++++++++++++++++++++++---------------------- + 1 file changed, 168 insertions(+), 132 deletions(-) + +diff --git a/mm/swapfile.c b/mm/swapfile.c +index adde6877c0fe..a3e721510311 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -52,6 +52,8 @@ + static bool swap_count_continued(struct swap_info_struct *, pgoff_t, + unsigned char); + static void free_swap_count_continuations(struct swap_info_struct *); ++static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, ++ unsigned int nr_entries); + + static DEFINE_SPINLOCK(swap_lock); + static unsigned int nr_swapfiles; +@@ -300,6 +302,12 @@ static inline unsigned int cluster_index(struct swap_info_struct *si, + return ci - si->cluster_info; + } + ++static inline unsigned int cluster_offset(struct swap_info_struct *si, ++ struct swap_cluster_info *ci) ++{ ++ return cluster_index(si, ci) * SWAPFILE_CLUSTER; ++} ++ + static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, + unsigned long offset) + { +@@ -371,11 +379,15 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, + + static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) + { ++ lockdep_assert_held(&si->lock); ++ lockdep_assert_held(&ci->lock); ++ + if (ci->flags & CLUSTER_FLAG_NONFULL) + list_move_tail(&ci->list, &si->free_clusters); + else + list_add_tail(&ci->list, &si->free_clusters); + ci->flags = CLUSTER_FLAG_FREE; ++ ci->order = 0; + } + + /* +@@ -430,9 +442,11 @@ static struct swap_cluster_info *alloc_cluster(struct swap_info_struct *si, unsi + struct swap_cluster_info *ci = list_first_entry(&si->free_clusters, + struct swap_cluster_info, list); + ++ lockdep_assert_held(&si->lock); ++ lockdep_assert_held(&ci->lock); + VM_BUG_ON(cluster_index(si, ci) != idx); ++ VM_BUG_ON(ci->count); + list_del(&ci->list); +- ci->count = 0; + ci->flags = 0; + return ci; + } +@@ -440,6 +454,8 @@ static struct swap_cluster_info *alloc_cluster(struct swap_info_struct *si, unsi + static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) + { + VM_BUG_ON(ci->count != 0); ++ lockdep_assert_held(&si->lock); ++ lockdep_assert_held(&ci->lock); + /* + * If the swap is discardable, prepare discard the cluster + * instead of free it immediately. 
The cluster will be freed +@@ -496,6 +512,9 @@ static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluste + return; + + VM_BUG_ON(ci->count == 0); ++ VM_BUG_ON(cluster_is_free(ci)); ++ lockdep_assert_held(&p->lock); ++ lockdep_assert_held(&ci->lock); + ci->count--; + + if (!ci->count) { +@@ -504,48 +523,88 @@ static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluste + } + + if (!(ci->flags & CLUSTER_FLAG_NONFULL)) { ++ VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); + list_add_tail(&ci->list, &p->nonfull_clusters[ci->order]); +- ci->flags |= CLUSTER_FLAG_NONFULL; ++ ci->flags = CLUSTER_FLAG_NONFULL; + } + } + +-/* +- * It's possible scan_swap_map_slots() uses a free cluster in the middle of free +- * cluster list. Avoiding such abuse to avoid list corruption. +- */ +-static bool +-scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, +- unsigned long offset, int order) ++static inline bool cluster_scan_range(struct swap_info_struct *si, unsigned int start, ++ unsigned int nr_pages) + { +- struct percpu_cluster *percpu_cluster; +- bool conflict; +- struct swap_cluster_info *first = list_first_entry(&si->free_clusters, +- struct swap_cluster_info, list); +- +- offset /= SWAPFILE_CLUSTER; +- conflict = !list_empty(&si->free_clusters) && +- offset != cluster_index(si, first) && +- cluster_is_free(&si->cluster_info[offset]); ++ unsigned char *p = si->swap_map + start; ++ unsigned char *end = p + nr_pages; + +- if (!conflict) +- return false; ++ while (p < end) ++ if (*p++) ++ return false; + +- percpu_cluster = this_cpu_ptr(si->percpu_cluster); +- percpu_cluster->next[order] = SWAP_NEXT_INVALID; + return true; + } + +-static inline bool swap_range_empty(char *swap_map, unsigned int start, +- unsigned int nr_pages) ++ ++static inline void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, ++ unsigned int start, unsigned char usage, ++ unsigned int order) + { +- unsigned int i; ++ unsigned int nr_pages = 1 << order; + +- for (i = 0; i < nr_pages; i++) { +- if (swap_map[start + i]) +- return false; ++ if (cluster_is_free(ci)) { ++ if (nr_pages < SWAPFILE_CLUSTER) { ++ list_move_tail(&ci->list, &si->nonfull_clusters[order]); ++ ci->flags = CLUSTER_FLAG_NONFULL; ++ } ++ ci->order = order; + } + +- return true; ++ memset(si->swap_map + start, usage, nr_pages); ++ swap_range_alloc(si, start, nr_pages); ++ ci->count += nr_pages; ++ ++ if (ci->count == SWAPFILE_CLUSTER) { ++ VM_BUG_ON(!(ci->flags & (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL))); ++ list_del(&ci->list); ++ ci->flags = 0; ++ } ++} ++ ++static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigned long offset, ++ unsigned int *foundp, unsigned int order, ++ unsigned char usage) ++{ ++ unsigned long start = offset & ~(SWAPFILE_CLUSTER - 1); ++ unsigned long end = min(start + SWAPFILE_CLUSTER, si->max); ++ unsigned int nr_pages = 1 << order; ++ struct swap_cluster_info *ci; ++ ++ if (end < nr_pages) ++ return SWAP_NEXT_INVALID; ++ end -= nr_pages; ++ ++ ci = lock_cluster(si, offset); ++ if (ci->count + nr_pages > SWAPFILE_CLUSTER) { ++ offset = SWAP_NEXT_INVALID; ++ goto done; ++ } ++ ++ while (offset <= end) { ++ if (cluster_scan_range(si, offset, nr_pages)) { ++ cluster_alloc_range(si, ci, offset, usage, order); ++ *foundp = offset; ++ if (ci->count == SWAPFILE_CLUSTER) { ++ offset = SWAP_NEXT_INVALID; ++ goto done; ++ } ++ offset += nr_pages; ++ break; ++ } ++ offset += nr_pages; ++ } ++ if (offset > end) ++ offset = SWAP_NEXT_INVALID; ++done: 
++ unlock_cluster(ci); ++ return offset; + } + + /* +@@ -553,72 +612,66 @@ static inline bool swap_range_empty(char *swap_map, unsigned int start, + * pool (a cluster). This might involve allocating a new cluster for current CPU + * too. + */ +-static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, +- unsigned long *offset, unsigned long *scan_base, int order) ++static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order, ++ unsigned char usage) + { +- unsigned int nr_pages = 1 << order; + struct percpu_cluster *cluster; +- struct swap_cluster_info *ci; +- unsigned int tmp, max; ++ struct swap_cluster_info *ci, *n; ++ unsigned int offset, found = 0; + + new_cluster: ++ lockdep_assert_held(&si->lock); + cluster = this_cpu_ptr(si->percpu_cluster); +- tmp = cluster->next[order]; +- if (tmp == SWAP_NEXT_INVALID) { +- if (!list_empty(&si->free_clusters)) { +- ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); +- list_del(&ci->list); +- spin_lock(&ci->lock); +- ci->order = order; +- ci->flags = 0; +- spin_unlock(&ci->lock); +- tmp = cluster_index(si, ci) * SWAPFILE_CLUSTER; +- } else if (!list_empty(&si->nonfull_clusters[order])) { +- ci = list_first_entry(&si->nonfull_clusters[order], +- struct swap_cluster_info, list); +- list_del(&ci->list); +- spin_lock(&ci->lock); +- ci->flags = 0; +- spin_unlock(&ci->lock); +- tmp = cluster_index(si, ci) * SWAPFILE_CLUSTER; +- } else if (!list_empty(&si->discard_clusters)) { +- /* +- * we don't have free cluster but have some clusters in +- * discarding, do discard now and reclaim them, then +- * reread cluster_next_cpu since we dropped si->lock +- */ +- swap_do_scheduled_discard(si); +- *scan_base = this_cpu_read(*si->cluster_next_cpu); +- *offset = *scan_base; +- goto new_cluster; +- } else +- return false; ++ offset = cluster->next[order]; ++ if (offset) { ++ offset = alloc_swap_scan_cluster(si, offset, &found, order, usage); ++ if (found) ++ goto done; + } + +- /* +- * Other CPUs can use our cluster if they can't find a free cluster, +- * check if there is still free entry in the cluster, maintaining +- * natural alignment. +- */ +- max = min_t(unsigned long, si->max, ALIGN(tmp + 1, SWAPFILE_CLUSTER)); +- if (tmp < max) { +- ci = lock_cluster(si, tmp); +- while (tmp < max) { +- if (swap_range_empty(si->swap_map, tmp, nr_pages)) +- break; +- tmp += nr_pages; ++ if (!list_empty(&si->free_clusters)) { ++ ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); ++ offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage); ++ VM_BUG_ON(!found); ++ goto done; ++ } ++ ++ if (order < PMD_ORDER) { ++ list_for_each_entry_safe(ci, n, &si->nonfull_clusters[order], list) { ++ offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), ++ &found, order, usage); ++ if (found) ++ goto done; + } +- unlock_cluster(ci); + } +- if (tmp >= max) { +- cluster->next[order] = SWAP_NEXT_INVALID; ++ ++ if (!list_empty(&si->discard_clusters)) { ++ /* ++ * we don't have free cluster but have some clusters in ++ * discarding, do discard now and reclaim them, then ++ * reread cluster_next_cpu since we dropped si->lock ++ */ ++ swap_do_scheduled_discard(si); + goto new_cluster; + } +- *offset = tmp; +- *scan_base = tmp; +- tmp += nr_pages; +- cluster->next[order] = tmp < max ? 
tmp : SWAP_NEXT_INVALID; +- return true; ++ ++ if (order) ++ goto done; ++ ++ for (int o = 1; o < SWAP_NR_ORDERS; o++) { ++ if (!list_empty(&si->nonfull_clusters[o])) { ++ ci = list_first_entry(&si->nonfull_clusters[o], struct swap_cluster_info, ++ list); ++ offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), ++ &found, 0, usage); ++ VM_BUG_ON(!found); ++ goto done; ++ } ++ } ++ ++done: ++ cluster->next[order] = offset; ++ return found; + } + + static void __del_from_avail_list(struct swap_info_struct *p) +@@ -739,11 +792,29 @@ static bool swap_offset_available_and_locked(struct swap_info_struct *si, + return false; + } + ++static int cluster_alloc_swap(struct swap_info_struct *si, ++ unsigned char usage, int nr, ++ swp_entry_t slots[], int order) ++{ ++ int n_ret = 0; ++ ++ VM_BUG_ON(!si->cluster_info); ++ ++ while (n_ret < nr) { ++ unsigned long offset = cluster_alloc_swap_entry(si, order, usage); ++ ++ if (!offset) ++ break; ++ slots[n_ret++] = swp_entry(si->type, offset); ++ } ++ ++ return n_ret; ++} ++ + static int scan_swap_map_slots(struct swap_info_struct *si, + unsigned char usage, int nr, + swp_entry_t slots[], int order) + { +- struct swap_cluster_info *ci; + unsigned long offset; + unsigned long scan_base; + unsigned long last_in_cluster = 0; +@@ -782,26 +853,16 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + return 0; + } + ++ if (si->cluster_info) ++ return cluster_alloc_swap(si, usage, nr, slots, order); ++ + si->flags += SWP_SCANNING; +- /* +- * Use percpu scan base for SSD to reduce lock contention on +- * cluster and swap cache. For HDD, sequential access is more +- * important. +- */ +- if (si->flags & SWP_SOLIDSTATE) +- scan_base = this_cpu_read(*si->cluster_next_cpu); +- else +- scan_base = si->cluster_next; ++ ++ /* For HDD, sequential access is more important. */ ++ scan_base = si->cluster_next; + offset = scan_base; + +- /* SSD algorithm */ +- if (si->cluster_info) { +- if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order)) { +- if (order > 0) +- goto no_page; +- goto scan; +- } +- } else if (unlikely(!si->cluster_nr--)) { ++ if (unlikely(!si->cluster_nr--)) { + if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { + si->cluster_nr = SWAPFILE_CLUSTER - 1; + goto checks; +@@ -812,8 +873,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + /* + * If seek is expensive, start searching for new cluster from + * start of partition, to minimize the span of allocated swap. +- * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info +- * case, just handled by scan_swap_map_try_ssd_cluster() above. + */ + scan_base = offset = si->lowest_bit; + last_in_cluster = offset + SWAPFILE_CLUSTER - 1; +@@ -841,19 +900,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + } + + checks: +- if (si->cluster_info) { +- while (scan_swap_map_ssd_cluster_conflict(si, offset, order)) { +- /* take a break if we already got some slots */ +- if (n_ret) +- goto done; +- if (!scan_swap_map_try_ssd_cluster(si, &offset, +- &scan_base, order)) { +- if (order > 0) +- goto no_page; +- goto scan; +- } +- } +- } + if (!(si->flags & SWP_WRITEOK)) + goto no_page; + if (!si->highest_bit) +@@ -861,11 +907,9 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + if (offset > si->highest_bit) + scan_base = offset = si->lowest_bit; + +- ci = lock_cluster(si, offset); + /* reuse swap entry of cache-only swap if not busy. 
*/ + if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + int swap_was_freed; +- unlock_cluster(ci); + spin_unlock(&si->lock); + swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); + spin_lock(&si->lock); +@@ -876,15 +920,12 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + } + + if (si->swap_map[offset]) { +- unlock_cluster(ci); + if (!n_ret) + goto scan; + else + goto done; + } + memset(si->swap_map + offset, usage, nr_pages); +- add_cluster_info_page(si, si->cluster_info, offset, nr_pages); +- unlock_cluster(ci); + + swap_range_alloc(si, offset, nr_pages); + slots[n_ret++] = swp_entry(si->type, offset); +@@ -905,13 +946,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + latency_ration = LATENCY_LIMIT; + } + +- /* try to get more slots in cluster */ +- if (si->cluster_info) { +- if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order)) +- goto checks; +- if (order > 0) +- goto done; +- } else if (si->cluster_nr && !si->swap_map[++offset]) { ++ if (si->cluster_nr && !si->swap_map[++offset]) { + /* non-ssd case, still more slots in cluster? */ + --si->cluster_nr; + goto checks; +@@ -980,8 +1015,6 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) + ci = lock_cluster(si, offset); + memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); + ci->count = 0; +- ci->order = 0; +- ci->flags = 0; + free_cluster(si, ci); + unlock_cluster(ci); + swap_range_free(si, offset, SWAPFILE_CLUSTER); +@@ -3099,8 +3132,11 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, + ci = cluster_info + idx; + if (idx >= nr_clusters) + continue; +- if (ci->count) ++ if (ci->count) { ++ ci->flags = CLUSTER_FLAG_NONFULL; ++ list_add_tail(&ci->list, &p->nonfull_clusters[0]); + continue; ++ } + ci->flags = CLUSTER_FLAG_FREE; + list_add_tail(&ci->list, &p->free_clusters); + } +-- +Gitee + + +From 4db67dafd426f7dd2fbde13583c1875a2b242b95 Mon Sep 17 00:00:00 2001 +From: Kairui Song +Date: Wed, 18 Dec 2024 17:51:10 +0800 +Subject: [PATCH 05/14] mm: swap: clean up initialization helper + +mainline inclusion +from mainline-v6.12-rc1 +commit 3b2561b5daeb3531c011491e9a6d2b934cc8f49f +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3b2561b5daeb3531c011491e9a6d2b934cc8f49f + +-------------------------------- + +At this point, alloc_cluster is never called already, and +inc_cluster_info_page is called by initialization only, a lot of dead code +can be dropped. 
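+
+After the cleanup, the init-only helper reduces to little more than a
+bounds-checked counter bump (editorial summary of the hunk below, not part
+of the upstream patch):
+
+    ci = cluster_info + page_nr / SWAPFILE_CLUSTER;
+    ci->count++;
+    VM_BUG_ON(ci->count > SWAPFILE_CLUSTER);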
+ +Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-4-cb9c148b9297@kernel.org +Signed-off-by: Kairui Song +Reported-by: Barry Song <21cnbao@gmail.com> +Cc: Chris Li +Cc: "Huang, Ying" +Cc: Hugh Dickins +Cc: Kalesh Singh +Cc: Ryan Roberts +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/swapfile.c | 44 ++++++++++---------------------------------- + 1 file changed, 10 insertions(+), 34 deletions(-) + +diff --git a/mm/swapfile.c b/mm/swapfile.c +index a3e721510311..4be5fbbdc1c8 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -437,20 +437,6 @@ static void swap_users_ref_free(struct percpu_ref *ref) + complete(&si->comp); + } + +-static struct swap_cluster_info *alloc_cluster(struct swap_info_struct *si, unsigned long idx) +-{ +- struct swap_cluster_info *ci = list_first_entry(&si->free_clusters, +- struct swap_cluster_info, list); +- +- lockdep_assert_held(&si->lock); +- lockdep_assert_held(&ci->lock); +- VM_BUG_ON(cluster_index(si, ci) != idx); +- VM_BUG_ON(ci->count); +- list_del(&ci->list); +- ci->flags = 0; +- return ci; +-} +- + static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) + { + VM_BUG_ON(ci->count != 0); +@@ -471,34 +457,24 @@ static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info * + } + + /* +- * The cluster corresponding to page_nr will be used. The cluster will be +- * removed from free cluster list and its usage counter will be increased by +- * count. ++ * The cluster corresponding to page_nr will be used. The cluster will not be ++ * added to free cluster list and its usage counter will be increased by 1. ++ * Only used for initialization. + */ +-static void add_cluster_info_page(struct swap_info_struct *p, +- struct swap_cluster_info *cluster_info, unsigned long page_nr, +- unsigned long count) ++static void inc_cluster_info_page(struct swap_info_struct *p, ++ struct swap_cluster_info *cluster_info, unsigned long page_nr) + { + unsigned long idx = page_nr / SWAPFILE_CLUSTER; +- struct swap_cluster_info *ci = cluster_info + idx; ++ struct swap_cluster_info *ci; + + if (!cluster_info) + return; +- if (cluster_is_free(ci)) +- alloc_cluster(p, idx); + +- VM_BUG_ON(ci->count + count > SWAPFILE_CLUSTER); +- ci->count += count; +-} ++ ci = cluster_info + idx; ++ ci->count++; + +-/* +- * The cluster corresponding to page_nr will be used. The cluster will be +- * removed from free cluster list and its usage counter will be increased by 1. +- */ +-static void inc_cluster_info_page(struct swap_info_struct *p, +- struct swap_cluster_info *cluster_info, unsigned long page_nr) +-{ +- add_cluster_info_page(p, cluster_info, page_nr, 1); ++ VM_BUG_ON(ci->count > SWAPFILE_CLUSTER); ++ VM_BUG_ON(ci->flags); + } + + /* +-- +Gitee + + +From 18f732c19747e766e0632419f32dfb02768ada67 Mon Sep 17 00:00:00 2001 +From: Kairui Song +Date: Wed, 18 Dec 2024 17:51:11 +0800 +Subject: [PATCH 06/14] mm: swap: skip slot cache on freeing for mTHP + +mainline inclusion +from mainline-v6.12-rc1 +commit 650975d2b181e30c9017c42cb3f6535287555b1e +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=650975d2b181e30c9017c42cb3f6535287555b1e + +-------------------------------- + +Currently when we are freeing mTHP folios from swap cache, we free then +one by one and put each entry into swap slot cache. 
Slot cache is +designed to reduce the overhead by batching the freeing, but mTHP swap +entries are already continuous so they can be batch freed without it +already, it saves litle overhead, or even increase overhead for larger +mTHP. + +What's more, mTHP entries could stay in swap cache for a while. +Contiguous swap entry is an rather rare resource so releasing them +directly can help improve mTHP allocation success rate when under +pressure. + +Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-5-cb9c148b9297@kernel.org +Signed-off-by: Kairui Song +Reported-by: Barry Song <21cnbao@gmail.com> +Acked-by: Barry Song +Cc: Chris Li +Cc: "Huang, Ying" +Cc: Hugh Dickins +Cc: Kalesh Singh +Cc: Ryan Roberts +Signed-off-by: Andrew Morton +Conflicts: + mm/swapfile.c +Signed-off-by: Liu Shixin +--- + mm/swapfile.c | 59 +++++++++++++++++++++++---------------------------- + 1 file changed, 26 insertions(+), 33 deletions(-) + +diff --git a/mm/swapfile.c b/mm/swapfile.c +index 4be5fbbdc1c8..44726e0b8f8f 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -478,20 +478,21 @@ static void inc_cluster_info_page(struct swap_info_struct *p, + } + + /* +- * The cluster ci decreases one usage. If the usage counter becomes 0, ++ * The cluster ci decreases @nr_pages usage. If the usage counter becomes 0, + * which means no page in the cluster is in use, we can optionally discard + * the cluster and add it to free cluster list. + */ +-static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluster_info *ci) ++static void dec_cluster_info_page(struct swap_info_struct *p, ++ struct swap_cluster_info *ci, int nr_pages) + { + if (!p->cluster_info) + return; + +- VM_BUG_ON(ci->count == 0); ++ VM_BUG_ON(ci->count < nr_pages); + VM_BUG_ON(cluster_is_free(ci)); + lockdep_assert_held(&p->lock); + lockdep_assert_held(&ci->lock); +- ci->count--; ++ ci->count -= nr_pages; + + if (!ci->count) { + free_cluster(p, ci); +@@ -983,19 +984,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + return n_ret; + } + +-static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) +-{ +- unsigned long offset = idx * SWAPFILE_CLUSTER; +- struct swap_cluster_info *ci; +- +- ci = lock_cluster(si, offset); +- memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); +- ci->count = 0; +- free_cluster(si, ci); +- unlock_cluster(ci); +- swap_range_free(si, offset, SWAPFILE_CLUSTER); +-} +- + #ifdef CONFIG_MEMCG_SWAP_QOS + int write_swapfile_for_memcg(struct address_space *mapping, int *swap_type) + { +@@ -1343,21 +1331,28 @@ static unsigned char __swap_entry_free(struct swap_info_struct *p, + return usage; + } + +-static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry) ++/* ++ * Drop the last HAS_CACHE flag of swap entries, caller have to ++ * ensure all entries belong to the same cgroup. 
++ */ ++static void swap_entry_range_free(struct swap_info_struct *p, swp_entry_t entry, ++ unsigned int nr_pages) + { +- struct swap_cluster_info *ci; + unsigned long offset = swp_offset(entry); +- unsigned char count; ++ unsigned char *map = p->swap_map + offset; ++ unsigned char *map_end = map + nr_pages; ++ struct swap_cluster_info *ci; + + ci = lock_cluster(p, offset); +- count = p->swap_map[offset]; +- VM_BUG_ON(count != SWAP_HAS_CACHE); +- p->swap_map[offset] = 0; +- dec_cluster_info_page(p, ci); ++ do { ++ VM_BUG_ON(*map != SWAP_HAS_CACHE); ++ *map = 0; ++ } while (++map < map_end); ++ dec_cluster_info_page(p, ci, nr_pages); + unlock_cluster(ci); + +- mem_cgroup_uncharge_swap(entry, 1); +- swap_range_free(p, offset, 1); ++ mem_cgroup_uncharge_swap(entry, nr_pages); ++ swap_range_free(p, offset, nr_pages); + } + + static void cluster_swap_free_nr(struct swap_info_struct *sis, +@@ -1418,7 +1413,6 @@ void swap_free_nr(swp_entry_t entry, int nr_pages) + void put_swap_folio(struct folio *folio, swp_entry_t entry) + { + unsigned long offset = swp_offset(entry); +- unsigned long idx = offset / SWAPFILE_CLUSTER; + struct swap_cluster_info *ci; + struct swap_info_struct *si; + unsigned char *map; +@@ -1431,19 +1425,18 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) + return; + + ci = lock_cluster_or_swap_info(si, offset); +- if (size == SWAPFILE_CLUSTER) { ++ if (size > 1) { + map = si->swap_map + offset; +- for (i = 0; i < SWAPFILE_CLUSTER; i++) { ++ for (i = 0; i < size; i++) { + val = map[i]; + VM_BUG_ON(!(val & SWAP_HAS_CACHE)); + if (val == SWAP_HAS_CACHE) + free_entries++; + } +- if (free_entries == SWAPFILE_CLUSTER) { ++ if (free_entries == size) { + unlock_cluster_or_swap_info(si, ci); + spin_lock(&si->lock); +- mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); +- swap_free_cluster(si, idx); ++ swap_entry_range_free(si, entry, size); + spin_unlock(&si->lock); + return; + } +@@ -1488,7 +1481,7 @@ void swapcache_free_entries(swp_entry_t *entries, int n) + for (i = 0; i < n; ++i) { + p = swap_info_get_cont(entries[i], prev); + if (p) +- swap_entry_free(p, entries[i]); ++ swap_entry_range_free(p, entries[i], 1); + prev = p; + } + if (p) +-- +Gitee + + +From 53a99352d0946625a0d45deeb8d0729855d4b080 Mon Sep 17 00:00:00 2001 +From: Kairui Song +Date: Wed, 18 Dec 2024 17:51:12 +0800 +Subject: [PATCH 07/14] mm: swap: allow cache reclaim to skip slot cache + +mainline inclusion +from mainline-v6.12-rc1 +commit 862590ac3708e1cbbfb02a8ed78587b86ecba4ba +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=862590ac3708e1cbbfb02a8ed78587b86ecba4ba + +-------------------------------- + +Currently we free the reclaimed slots through slot cache even if the slot +is required to be empty immediately. As a result the reclaim caller will +see the slot still occupied even after a successful reclaim, and need to +keep reclaiming until slot cache get flushed. This caused ineffective or +over reclaim when SWAP is under stress. + +So introduce a new flag allowing the slot to be emptied bypassing the slot +cache. 
+ +[21cnbao@gmail.com: small folios should have nr_pages == 1 but not nr_page == 0] + Link: https://lkml.kernel.org/r/20240805015324.45134-1-21cnbao@gmail.com +Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-6-cb9c148b9297@kernel.org +Signed-off-by: Kairui Song +Reported-by: Barry Song <21cnbao@gmail.com> +Cc: Chris Li +Cc: "Huang, Ying" +Cc: Hugh Dickins +Cc: Kalesh Singh +Cc: Ryan Roberts +Signed-off-by: Andrew Morton +Conflicts: + mm/swapfile.c +Signed-off-by: Liu Shixin +--- + mm/swapfile.c | 152 ++++++++++++++++++++++++++++++++++++-------------- + 1 file changed, 109 insertions(+), 43 deletions(-) + +diff --git a/mm/swapfile.c b/mm/swapfile.c +index 44726e0b8f8f..e58457b801fb 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -52,8 +52,15 @@ + static bool swap_count_continued(struct swap_info_struct *, pgoff_t, + unsigned char); + static void free_swap_count_continuations(struct swap_info_struct *); ++static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry, ++ unsigned int nr_pages); + static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, + unsigned int nr_entries); ++static bool folio_swapcache_freeable(struct folio *folio); ++static struct swap_cluster_info *lock_cluster_or_swap_info( ++ struct swap_info_struct *si, unsigned long offset); ++static void unlock_cluster_or_swap_info(struct swap_info_struct *si, ++ struct swap_cluster_info *ci); + + static DEFINE_SPINLOCK(swap_lock); + static unsigned int nr_swapfiles; +@@ -128,8 +135,25 @@ static inline unsigned char swap_count(unsigned char ent) + * corresponding page + */ + #define TTRS_UNMAPPED 0x2 +-/* Reclaim the swap entry if swap is getting full*/ ++/* Reclaim the swap entry if swap is getting full */ + #define TTRS_FULL 0x4 ++/* Reclaim directly, bypass the slot cache and don't touch device lock */ ++#define TTRS_DIRECT 0x8 ++ ++static bool swap_is_has_cache(struct swap_info_struct *si, ++ unsigned long offset, int nr_pages) ++{ ++ unsigned char *map = si->swap_map + offset; ++ unsigned char *map_end = map + nr_pages; ++ ++ do { ++ VM_BUG_ON(!(*map & SWAP_HAS_CACHE)); ++ if (*map != SWAP_HAS_CACHE) ++ return false; ++ } while (++map < map_end); ++ ++ return true; ++} + + /* + * returns number of pages in the folio that backs the swap entry. If positive, +@@ -140,12 +164,22 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, + unsigned long offset, unsigned long flags) + { + swp_entry_t entry = swp_entry(si->type, offset); ++ struct address_space *address_space = swap_address_space(entry); ++ struct swap_cluster_info *ci; + struct folio *folio; +- int ret = 0; ++ int ret, nr_pages; ++ bool need_reclaim; + +- folio = filemap_get_folio(swap_address_space(entry), offset); ++ folio = filemap_get_folio(address_space, offset); + if (IS_ERR(folio)) + return 0; ++ ++ /* offset could point to the middle of a large folio */ ++ entry = folio->swap; ++ offset = swp_offset(entry); ++ nr_pages = folio_nr_pages(folio); ++ ret = -nr_pages; ++ + /* + * When this function is called from scan_swap_map_slots() and it's + * called by vmscan.c at reclaiming folios. So we hold a folio lock +@@ -153,14 +187,50 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, + * case and you should use folio_free_swap() with explicit folio_lock() + * in usual operations. 
+ */ +- if (folio_trylock(folio)) { +- if ((flags & TTRS_ANYWAY) || +- ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || +- ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))) +- ret = folio_free_swap(folio); +- folio_unlock(folio); ++ if (!folio_trylock(folio)) ++ goto out; ++ ++ need_reclaim = ((flags & TTRS_ANYWAY) || ++ ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || ++ ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))); ++ if (!need_reclaim || !folio_swapcache_freeable(folio)) ++ goto out_unlock; ++ ++ /* ++ * It's safe to delete the folio from swap cache only if the folio's ++ * swap_map is HAS_CACHE only, which means the slots have no page table ++ * reference or pending writeback, and can't be allocated to others. ++ */ ++ ci = lock_cluster_or_swap_info(si, offset); ++ need_reclaim = swap_is_has_cache(si, offset, nr_pages); ++ unlock_cluster_or_swap_info(si, ci); ++ if (!need_reclaim) ++ goto out_unlock; ++ ++ if (!(flags & TTRS_DIRECT)) { ++ /* Free through slot cache */ ++ delete_from_swap_cache(folio); ++ folio_set_dirty(folio); ++ ret = nr_pages; ++ goto out_unlock; + } +- ret = ret ? folio_nr_pages(folio) : -folio_nr_pages(folio); ++ ++ xa_lock_irq(&address_space->i_pages); ++ __delete_from_swap_cache(folio, entry, NULL); ++ xa_unlock_irq(&address_space->i_pages); ++ folio_ref_sub(folio, nr_pages); ++ folio_set_dirty(folio); ++ ++ spin_lock(&si->lock); ++ /* Only sinple page folio can be backed by zswap */ ++ if (nr_pages == 1) ++ zswap_invalidate(entry); ++ swap_entry_range_free(si, entry, nr_pages); ++ spin_unlock(&si->lock); ++ ret = nr_pages; ++out_unlock: ++ folio_unlock(folio); ++out: + folio_put(folio); + return ret; + } +@@ -888,7 +958,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si, + if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + int swap_was_freed; + spin_unlock(&si->lock); +- swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); ++ swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT); + spin_lock(&si->lock); + /* entry was freed successfully, try to use this again */ + if (swap_was_freed > 0) +@@ -1415,9 +1485,6 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) + unsigned long offset = swp_offset(entry); + struct swap_cluster_info *ci; + struct swap_info_struct *si; +- unsigned char *map; +- unsigned int i, free_entries = 0; +- unsigned char val; + int size = 1 << swap_entry_order(folio_order(folio)); + + si = _swap_info_get(entry); +@@ -1425,23 +1492,14 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) + return; + + ci = lock_cluster_or_swap_info(si, offset); +- if (size > 1) { +- map = si->swap_map + offset; +- for (i = 0; i < size; i++) { +- val = map[i]; +- VM_BUG_ON(!(val & SWAP_HAS_CACHE)); +- if (val == SWAP_HAS_CACHE) +- free_entries++; +- } +- if (free_entries == size) { +- unlock_cluster_or_swap_info(si, ci); +- spin_lock(&si->lock); +- swap_entry_range_free(si, entry, size); +- spin_unlock(&si->lock); +- return; +- } ++ if (size > 1 && swap_is_has_cache(si, offset, size)) { ++ unlock_cluster_or_swap_info(si, ci); ++ spin_lock(&si->lock); ++ swap_entry_range_free(si, entry, size); ++ spin_unlock(&si->lock); ++ return; + } +- for (i = 0; i < size; i++, entry.val++) { ++ for (int i = 0; i < size; i++, entry.val++) { + if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) { + unlock_cluster_or_swap_info(si, ci); + free_swap_slot(entry); +@@ -1601,16 +1659,7 @@ static bool folio_swapped(struct folio *folio) + return 
swap_page_trans_huge_swapped(si, entry, folio_order(folio)); + } + +-/** +- * folio_free_swap() - Free the swap space used for this folio. +- * @folio: The folio to remove. +- * +- * If swap is getting full, or if there are no more mappings of this folio, +- * then call folio_free_swap to free its swap space. +- * +- * Return: true if we were able to release the swap space. +- */ +-bool folio_free_swap(struct folio *folio) ++static bool folio_swapcache_freeable(struct folio *folio) + { + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + +@@ -1618,8 +1667,6 @@ bool folio_free_swap(struct folio *folio) + return false; + if (folio_test_writeback(folio)) + return false; +- if (folio_swapped(folio)) +- return false; + + /* + * Once hibernation has begun to create its image of memory, +@@ -1639,6 +1686,25 @@ bool folio_free_swap(struct folio *folio) + if (pm_suspended_storage()) + return false; + ++ return true; ++} ++ ++/** ++ * folio_free_swap() - Free the swap space used for this folio. ++ * @folio: The folio to remove. ++ * ++ * If swap is getting full, or if there are no more mappings of this folio, ++ * then call folio_free_swap to free its swap space. ++ * ++ * Return: true if we were able to release the swap space. ++ */ ++bool folio_free_swap(struct folio *folio) ++{ ++ if (!folio_swapcache_freeable(folio)) ++ return false; ++ if (folio_swapped(folio)) ++ return false; ++ + delete_from_swap_cache(folio); + folio_set_dirty(folio); + return true; +@@ -1715,7 +1781,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) + * to the next boundary. + */ + nr = __try_to_reclaim_swap(si, offset, +- TTRS_UNMAPPED | TTRS_FULL); ++ TTRS_UNMAPPED | TTRS_FULL); + if (nr == 0) + nr = 1; + else if (nr < 0) +-- +Gitee + + +From a1f6274ecbb551837ea7a66e740c660f405a2443 Mon Sep 17 00:00:00 2001 +From: Kairui Song +Date: Wed, 18 Dec 2024 17:51:13 +0800 +Subject: [PATCH 08/14] mm: swap: add a fragment cluster list + +mainline inclusion +from mainline-v6.12-rc1 +commit 477cb7ba28892eda112c79d8f75d10edabfc3050 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=477cb7ba28892eda112c79d8f75d10edabfc3050 + +-------------------------------- + +Now swap cluster allocator arranges the clusters in LRU style, so the +"cold" cluster stay at the head of nonfull lists are the ones that were +used for allocation long time ago and still partially occupied. So if +allocator can't find enough contiguous slots to satisfy an high order +allocation, it's unlikely there will be slot being free on them to satisfy +the allocation, at least in a short period. + +As a result, nonfull cluster scanning will waste time repeatly scanning +the unusable head of the list. + +Also, multiple CPUs could content on the same head cluster of nonfull +list. Unlike free clusters which are removed from the list when any CPU +starts using it, nonfull cluster stays on the head. + +So introduce a new list frag list, all scanned nonfull clusters will be +moved to this list. Both for avoiding repeated scanning and contention. + +Frag list is still used as fallback for allocations, so if one CPU failed +to allocate one order of slots, it can still steal other CPU's clusters. +And order 0 will favor the fragmented clusters to better protect nonfull +clusters + +If any slots on a fragment list are being freed, move the fragment list +back to nonfull list indicating it worth another scan on the cluster. 
+Compared to scan upon freeing a slot, this keep the scanning lazy and save +some CPU if there are still other clusters to use. + +It may seems unneccessay to keep the fragmented cluster on list at all if +they can't be used for specific order allocation. But this will start to +make sense once reclaim dring scanning is ready. + +Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-7-cb9c148b9297@kernel.org +Signed-off-by: Kairui Song +Reported-by: Barry Song <21cnbao@gmail.com> +Cc: Chris Li +Cc: "Huang, Ying" +Cc: Hugh Dickins +Cc: Kalesh Singh +Cc: Ryan Roberts +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + include/linux/swap.h | 3 +++ + mm/swapfile.c | 41 +++++++++++++++++++++++++++++++++++++---- + 2 files changed, 40 insertions(+), 4 deletions(-) + +diff --git a/include/linux/swap.h b/include/linux/swap.h +index 29a1daa46421..81188caed2d2 100644 +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -271,6 +271,7 @@ struct swap_cluster_info { + }; + #define CLUSTER_FLAG_FREE 1 /* This cluster is free */ + #define CLUSTER_FLAG_NONFULL 2 /* This cluster is on nonfull list */ ++#define CLUSTER_FLAG_FRAG 4 /* This cluster is on nonfull list */ + + /* + * The first page in the swap file is the swap header, which is always marked +@@ -310,6 +311,8 @@ struct swap_info_struct { + struct list_head free_clusters; /* free clusters list */ + struct list_head nonfull_clusters[SWAP_NR_ORDERS]; + /* list of cluster that contains at least one free slot */ ++ struct list_head frag_clusters[SWAP_NR_ORDERS]; ++ /* list of cluster that are fragmented or contented */ + unsigned int lowest_bit; /* index of first free in swap_map */ + unsigned int highest_bit; /* index of last free in swap_map */ + unsigned int pages; /* total of usable pages of swap */ +diff --git a/mm/swapfile.c b/mm/swapfile.c +index e58457b801fb..7c71e7df9cf3 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -571,7 +571,10 @@ static void dec_cluster_info_page(struct swap_info_struct *p, + + if (!(ci->flags & CLUSTER_FLAG_NONFULL)) { + VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); +- list_add_tail(&ci->list, &p->nonfull_clusters[ci->order]); ++ if (ci->flags & CLUSTER_FLAG_FRAG) ++ list_move_tail(&ci->list, &p->nonfull_clusters[ci->order]); ++ else ++ list_add_tail(&ci->list, &p->nonfull_clusters[ci->order]); + ci->flags = CLUSTER_FLAG_NONFULL; + } + } +@@ -609,7 +612,8 @@ static inline void cluster_alloc_range(struct swap_info_struct *si, struct swap_ + ci->count += nr_pages; + + if (ci->count == SWAPFILE_CLUSTER) { +- VM_BUG_ON(!(ci->flags & (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL))); ++ VM_BUG_ON(!(ci->flags & ++ (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL | CLUSTER_FLAG_FRAG))); + list_del(&ci->list); + ci->flags = 0; + } +@@ -665,6 +669,7 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o + struct percpu_cluster *cluster; + struct swap_cluster_info *ci, *n; + unsigned int offset, found = 0; ++ LIST_HEAD(fraged); + + new_cluster: + lockdep_assert_held(&si->lock); +@@ -685,13 +690,29 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o + + if (order < PMD_ORDER) { + list_for_each_entry_safe(ci, n, &si->nonfull_clusters[order], list) { ++ list_move_tail(&ci->list, &fraged); ++ ci->flags = CLUSTER_FLAG_FRAG; + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), + &found, order, usage); + if (found) +- goto done; ++ break; + } ++ ++ if (!found) { ++ list_for_each_entry_safe(ci, n, &si->frag_clusters[order], list) { ++ offset = 
alloc_swap_scan_cluster(si, cluster_offset(si, ci), ++ &found, order, usage); ++ if (found) ++ break; ++ } ++ } ++ ++ list_splice_tail(&fraged, &si->frag_clusters[order]); + } + ++ if (found) ++ goto done; ++ + if (!list_empty(&si->discard_clusters)) { + /* + * we don't have free cluster but have some clusters in +@@ -705,7 +726,17 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o + if (order) + goto done; + ++ /* Order 0 stealing from higher order */ + for (int o = 1; o < SWAP_NR_ORDERS; o++) { ++ if (!list_empty(&si->frag_clusters[o])) { ++ ci = list_first_entry(&si->frag_clusters[o], ++ struct swap_cluster_info, list); ++ offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, ++ 0, usage); ++ VM_BUG_ON(!found); ++ goto done; ++ } ++ + if (!list_empty(&si->nonfull_clusters[o])) { + ci = list_first_entry(&si->nonfull_clusters[o], struct swap_cluster_info, + list); +@@ -3110,8 +3141,10 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, + INIT_LIST_HEAD(&p->free_clusters); + INIT_LIST_HEAD(&p->discard_clusters); + +- for (i = 0; i < SWAP_NR_ORDERS; i++) ++ for (i = 0; i < SWAP_NR_ORDERS; i++) { + INIT_LIST_HEAD(&p->nonfull_clusters[i]); ++ INIT_LIST_HEAD(&p->frag_clusters[i]); ++ } + + for (i = 0; i < swap_header->info.nr_badpages; i++) { + unsigned int page_nr = swap_header->info.badpages[i]; +-- +Gitee + + +From 7c0f2c55f9a21373319df1952070b162b3c6be8a Mon Sep 17 00:00:00 2001 +From: Kairui Song +Date: Wed, 18 Dec 2024 17:51:14 +0800 +Subject: [PATCH 09/14] mm: swap: relaim the cached parts that got scanned + +mainline inclusion +from mainline-v6.12-rc1 +commit 661383c6111a38c88df61af6bfbcfacd2ff20a67 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=661383c6111a38c88df61af6bfbcfacd2ff20a67 + +-------------------------------- + +This commit implements reclaim during scan for cluster allocator. + +Cluster scanning were unable to reuse SWAP_HAS_CACHE slots, which could +result in low allocation success rate or early OOM. + +So to ensure maximum allocation success rate, integrate reclaiming with +scanning. If found a range of suitable swap slots but fragmented due to +HAS_CACHE, just try to reclaim the slots. 
+ +Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-8-cb9c148b9297@kernel.org +Signed-off-by: Kairui Song +Reported-by: Barry Song <21cnbao@gmail.com> +Cc: Chris Li +Cc: "Huang, Ying" +Cc: Hugh Dickins +Cc: Kalesh Singh +Cc: Ryan Roberts +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + include/linux/swap.h | 1 + + mm/swapfile.c | 140 +++++++++++++++++++++++++++++++++---------- + 2 files changed, 110 insertions(+), 31 deletions(-) + +diff --git a/include/linux/swap.h b/include/linux/swap.h +index 81188caed2d2..83b1bcbaf2ec 100644 +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -313,6 +313,7 @@ struct swap_info_struct { + /* list of cluster that contains at least one free slot */ + struct list_head frag_clusters[SWAP_NR_ORDERS]; + /* list of cluster that are fragmented or contented */ ++ unsigned int frag_cluster_nr[SWAP_NR_ORDERS]; + unsigned int lowest_bit; /* index of first free in swap_map */ + unsigned int highest_bit; /* index of last free in swap_map */ + unsigned int pages; /* total of usable pages of swap */ +diff --git a/mm/swapfile.c b/mm/swapfile.c +index 7c71e7df9cf3..45f73b73a92f 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -512,6 +512,10 @@ static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info * + VM_BUG_ON(ci->count != 0); + lockdep_assert_held(&si->lock); + lockdep_assert_held(&ci->lock); ++ ++ if (ci->flags & CLUSTER_FLAG_FRAG) ++ si->frag_cluster_nr[ci->order]--; ++ + /* + * If the swap is discardable, prepare discard the cluster + * instead of free it immediately. The cluster will be freed +@@ -571,31 +575,84 @@ static void dec_cluster_info_page(struct swap_info_struct *p, + + if (!(ci->flags & CLUSTER_FLAG_NONFULL)) { + VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); +- if (ci->flags & CLUSTER_FLAG_FRAG) ++ if (ci->flags & CLUSTER_FLAG_FRAG) { ++ p->frag_cluster_nr[ci->order]--; + list_move_tail(&ci->list, &p->nonfull_clusters[ci->order]); +- else ++ } else { + list_add_tail(&ci->list, &p->nonfull_clusters[ci->order]); ++ } + ci->flags = CLUSTER_FLAG_NONFULL; + } + } + +-static inline bool cluster_scan_range(struct swap_info_struct *si, unsigned int start, +- unsigned int nr_pages) ++static bool cluster_reclaim_range(struct swap_info_struct *si, ++ struct swap_cluster_info *ci, ++ unsigned long start, unsigned long end) + { +- unsigned char *p = si->swap_map + start; +- unsigned char *end = p + nr_pages; ++ unsigned char *map = si->swap_map; ++ unsigned long offset; ++ ++ spin_unlock(&ci->lock); ++ spin_unlock(&si->lock); ++ ++ for (offset = start; offset < end; offset++) { ++ switch (READ_ONCE(map[offset])) { ++ case 0: ++ continue; ++ case SWAP_HAS_CACHE: ++ if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT) > 0) ++ continue; ++ goto out; ++ default: ++ goto out; ++ } ++ } ++out: ++ spin_lock(&si->lock); ++ spin_lock(&ci->lock); + +- while (p < end) +- if (*p++) ++ /* ++ * Recheck the range no matter reclaim succeeded or not, the slot ++ * could have been be freed while we are not holding the lock. 
++ */ ++ for (offset = start; offset < end; offset++) ++ if (READ_ONCE(map[offset])) + return false; + + return true; + } + ++static bool cluster_scan_range(struct swap_info_struct *si, ++ struct swap_cluster_info *ci, ++ unsigned long start, unsigned int nr_pages) ++{ ++ unsigned long offset, end = start + nr_pages; ++ unsigned char *map = si->swap_map; ++ bool need_reclaim = false; + +-static inline void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, +- unsigned int start, unsigned char usage, +- unsigned int order) ++ for (offset = start; offset < end; offset++) { ++ switch (READ_ONCE(map[offset])) { ++ case 0: ++ continue; ++ case SWAP_HAS_CACHE: ++ if (!vm_swap_full()) ++ return false; ++ need_reclaim = true; ++ continue; ++ default: ++ return false; ++ } ++ } ++ ++ if (need_reclaim) ++ return cluster_reclaim_range(si, ci, start, end); ++ ++ return true; ++} ++ ++static void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, ++ unsigned int start, unsigned char usage, ++ unsigned int order) + { + unsigned int nr_pages = 1 << order; + +@@ -614,6 +671,8 @@ static inline void cluster_alloc_range(struct swap_info_struct *si, struct swap_ + if (ci->count == SWAPFILE_CLUSTER) { + VM_BUG_ON(!(ci->flags & + (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL | CLUSTER_FLAG_FRAG))); ++ if (ci->flags & CLUSTER_FLAG_FRAG) ++ si->frag_cluster_nr[ci->order]--; + list_del(&ci->list); + ci->flags = 0; + } +@@ -639,7 +698,7 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigne + } + + while (offset <= end) { +- if (cluster_scan_range(si, offset, nr_pages)) { ++ if (cluster_scan_range(si, ci, offset, nr_pages)) { + cluster_alloc_range(si, ci, offset, usage, order); + *foundp = offset; + if (ci->count == SWAPFILE_CLUSTER) { +@@ -667,9 +726,8 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o + unsigned char usage) + { + struct percpu_cluster *cluster; +- struct swap_cluster_info *ci, *n; ++ struct swap_cluster_info *ci; + unsigned int offset, found = 0; +- LIST_HEAD(fraged); + + new_cluster: + lockdep_assert_held(&si->lock); +@@ -689,25 +747,42 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o + } + + if (order < PMD_ORDER) { +- list_for_each_entry_safe(ci, n, &si->nonfull_clusters[order], list) { +- list_move_tail(&ci->list, &fraged); ++ unsigned int frags = 0; ++ ++ while (!list_empty(&si->nonfull_clusters[order])) { ++ ci = list_first_entry(&si->nonfull_clusters[order], ++ struct swap_cluster_info, list); ++ list_move_tail(&ci->list, &si->frag_clusters[order]); + ci->flags = CLUSTER_FLAG_FRAG; ++ si->frag_cluster_nr[order]++; + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), + &found, order, usage); ++ frags++; + if (found) + break; + } + + if (!found) { +- list_for_each_entry_safe(ci, n, &si->frag_clusters[order], list) { ++ /* ++ * Nonfull clusters are moved to frag tail if we reached ++ * here, count them too, don't over scan the frag list. ++ */ ++ while (frags < si->frag_cluster_nr[order]) { ++ ci = list_first_entry(&si->frag_clusters[order], ++ struct swap_cluster_info, list); ++ /* ++ * Rotate the frag list to iterate, they were all failing ++ * high order allocation or moved here due to per-CPU usage, ++ * this help keeping usable cluster ahead. 
++ */ ++ list_move_tail(&ci->list, &si->frag_clusters[order]); + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), + &found, order, usage); ++ frags++; + if (found) + break; + } + } +- +- list_splice_tail(&fraged, &si->frag_clusters[order]); + } + + if (found) +@@ -728,25 +803,28 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o + + /* Order 0 stealing from higher order */ + for (int o = 1; o < SWAP_NR_ORDERS; o++) { +- if (!list_empty(&si->frag_clusters[o])) { ++ /* ++ * Clusters here have at least one usable slots and can't fail order 0 ++ * allocation, but reclaim may drop si->lock and race with another user. ++ */ ++ while (!list_empty(&si->frag_clusters[o])) { + ci = list_first_entry(&si->frag_clusters[o], + struct swap_cluster_info, list); +- offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, +- 0, usage); +- VM_BUG_ON(!found); +- goto done; ++ offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), ++ &found, 0, usage); ++ if (found) ++ goto done; + } + +- if (!list_empty(&si->nonfull_clusters[o])) { +- ci = list_first_entry(&si->nonfull_clusters[o], struct swap_cluster_info, +- list); ++ while (!list_empty(&si->nonfull_clusters[o])) { ++ ci = list_first_entry(&si->nonfull_clusters[o], ++ struct swap_cluster_info, list); + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), + &found, 0, usage); +- VM_BUG_ON(!found); +- goto done; ++ if (found) ++ goto done; + } + } +- + done: + cluster->next[order] = offset; + return found; +@@ -3144,6 +3222,7 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, + for (i = 0; i < SWAP_NR_ORDERS; i++) { + INIT_LIST_HEAD(&p->nonfull_clusters[i]); + INIT_LIST_HEAD(&p->frag_clusters[i]); ++ p->frag_cluster_nr[i] = 0; + } + + for (i = 0; i < swap_header->info.nr_badpages; i++) { +@@ -3187,7 +3266,6 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, + if (!cluster_info) + return nr_extents; + +- + /* + * Reduce false cache line sharing between cluster_info and + * sharing same address space. +-- +Gitee + + +From da3342ba73e419beb8f4b793ff077b763c27b1df Mon Sep 17 00:00:00 2001 +From: Kairui Song +Date: Wed, 18 Dec 2024 17:51:15 +0800 +Subject: [PATCH 10/14] mm: swap: add a adaptive full cluster cache reclaim + +mainline inclusion +from mainline-v6.12-rc1 +commit 2cacbdfdee65b18f9952620e762eab043d71b564 +category: performance +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2cacbdfdee65b18f9952620e762eab043d71b564 + +-------------------------------- + +Link all full cluster with one full list, and reclaim from it when the +allocation have ran out of all usable clusters. + +There are many reason a folio can end up being in the swap cache while +having no swap count reference. So the best way to search for such slots +is still by iterating the swap clusters. + +With the list as an LRU, iterating from the oldest cluster and keep them +rotating is a very doable and clean way to free up potentially not inuse +clusters. + +When any allocation failure, try reclaim and rotate only one cluster. +This is adaptive for high order allocations they can tolerate fallback. +So this avoids latency, and give the full cluster list an fair chance to +get reclaimed. It release the usage stress for the fallback order 0 +allocation or following up high order allocation. + +If the swap device is getting very full, reclaim more aggresively to +ensure no OOM will happen. 
This ensures order 0 heavy workload won't go +OOM as order 0 won't fail if any cluster still have any space. + +[ryncsn@gmail.com: fix discard of full cluster] + Link: https://lkml.kernel.org/r/CAMgjq7CWwK75_2Zi5P40K08pk9iqOcuWKL6khu=x4Yg_nXaQag@mail.gmail.com +Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-9-cb9c148b9297@kernel.org +Signed-off-by: Kairui Song +Reported-by: Barry Song <21cnbao@gmail.com> +Cc: Chris Li +Cc: "Huang, Ying" +Cc: Hugh Dickins +Cc: Kalesh Singh +Cc: Ryan Roberts +Cc: David Hildenbrand +Cc: Kairui Song +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + include/linux/swap.h | 2 ++ + mm/swapfile.c | 68 +++++++++++++++++++++++++++++++++++--------- + 2 files changed, 57 insertions(+), 13 deletions(-) + +diff --git a/include/linux/swap.h b/include/linux/swap.h +index 83b1bcbaf2ec..1664655aa7c8 100644 +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -272,6 +272,7 @@ struct swap_cluster_info { + #define CLUSTER_FLAG_FREE 1 /* This cluster is free */ + #define CLUSTER_FLAG_NONFULL 2 /* This cluster is on nonfull list */ + #define CLUSTER_FLAG_FRAG 4 /* This cluster is on nonfull list */ ++#define CLUSTER_FLAG_FULL 8 /* This cluster is on full list */ + + /* + * The first page in the swap file is the swap header, which is always marked +@@ -309,6 +310,7 @@ struct swap_info_struct { + unsigned char *swap_map; /* vmalloc'ed array of usage counts */ + struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ + struct list_head free_clusters; /* free clusters list */ ++ struct list_head full_clusters; /* full clusters list */ + struct list_head nonfull_clusters[SWAP_NR_ORDERS]; + /* list of cluster that contains at least one free slot */ + struct list_head frag_clusters[SWAP_NR_ORDERS]; +diff --git a/mm/swapfile.c b/mm/swapfile.c +index 45f73b73a92f..389e14f0fc3c 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -439,10 +439,7 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, + SWAP_MAP_BAD, SWAPFILE_CLUSTER); + + VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); +- if (ci->flags & CLUSTER_FLAG_NONFULL) +- list_move_tail(&ci->list, &si->discard_clusters); +- else +- list_add_tail(&ci->list, &si->discard_clusters); ++ list_move_tail(&ci->list, &si->discard_clusters); + ci->flags = 0; + schedule_work(&si->discard_work); + } +@@ -452,7 +449,7 @@ static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info + lockdep_assert_held(&si->lock); + lockdep_assert_held(&ci->lock); + +- if (ci->flags & CLUSTER_FLAG_NONFULL) ++ if (ci->flags) + list_move_tail(&ci->list, &si->free_clusters); + else + list_add_tail(&ci->list, &si->free_clusters); +@@ -479,7 +476,6 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si) + SWAPFILE_CLUSTER); + + spin_lock(&si->lock); +- + spin_lock(&ci->lock); + __free_cluster(si, ci); + memset(si->swap_map + idx * SWAPFILE_CLUSTER, +@@ -575,12 +571,9 @@ static void dec_cluster_info_page(struct swap_info_struct *p, + + if (!(ci->flags & CLUSTER_FLAG_NONFULL)) { + VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); +- if (ci->flags & CLUSTER_FLAG_FRAG) { ++ if (ci->flags & CLUSTER_FLAG_FRAG) + p->frag_cluster_nr[ci->order]--; +- list_move_tail(&ci->list, &p->nonfull_clusters[ci->order]); +- } else { +- list_add_tail(&ci->list, &p->nonfull_clusters[ci->order]); +- } ++ list_move_tail(&ci->list, &p->nonfull_clusters[ci->order]); + ci->flags = CLUSTER_FLAG_NONFULL; + } + } +@@ -673,8 +666,8 @@ static void cluster_alloc_range(struct swap_info_struct *si, struct 
swap_cluster + (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL | CLUSTER_FLAG_FRAG))); + if (ci->flags & CLUSTER_FLAG_FRAG) + si->frag_cluster_nr[ci->order]--; +- list_del(&ci->list); +- ci->flags = 0; ++ list_move_tail(&ci->list, &si->full_clusters); ++ ci->flags = CLUSTER_FLAG_FULL; + } + } + +@@ -717,6 +710,46 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigne + return offset; + } + ++static void swap_reclaim_full_clusters(struct swap_info_struct *si) ++{ ++ long to_scan = 1; ++ unsigned long offset, end; ++ struct swap_cluster_info *ci; ++ unsigned char *map = si->swap_map; ++ int nr_reclaim, total_reclaimed = 0; ++ ++ if (atomic_long_read(&nr_swap_pages) <= SWAPFILE_CLUSTER) ++ to_scan = si->inuse_pages / SWAPFILE_CLUSTER; ++ ++ while (!list_empty(&si->full_clusters)) { ++ ci = list_first_entry(&si->full_clusters, struct swap_cluster_info, list); ++ list_move_tail(&ci->list, &si->full_clusters); ++ offset = cluster_offset(si, ci); ++ end = min(si->max, offset + SWAPFILE_CLUSTER); ++ to_scan--; ++ ++ while (offset < end) { ++ if (READ_ONCE(map[offset]) == SWAP_HAS_CACHE) { ++ spin_unlock(&si->lock); ++ nr_reclaim = __try_to_reclaim_swap(si, offset, ++ TTRS_ANYWAY | TTRS_DIRECT); ++ spin_lock(&si->lock); ++ if (nr_reclaim > 0) { ++ offset += nr_reclaim; ++ total_reclaimed += nr_reclaim; ++ continue; ++ } else if (nr_reclaim < 0) { ++ offset += -nr_reclaim; ++ continue; ++ } ++ } ++ offset++; ++ } ++ if (to_scan <= 0 || total_reclaimed) ++ break; ++ } ++} ++ + /* + * Try to get swap entries with specified order from current cpu's swap entry + * pool (a cluster). This might involve allocating a new cluster for current CPU +@@ -825,7 +858,15 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o + goto done; + } + } ++ + done: ++ /* Try reclaim from full clusters if device is nearfull */ ++ if (vm_swap_full() && (!found || (si->pages - si->inuse_pages) < SWAPFILE_CLUSTER)) { ++ swap_reclaim_full_clusters(si); ++ if (!found && !order && si->pages != si->inuse_pages) ++ goto new_cluster; ++ } ++ + cluster->next[order] = offset; + return found; + } +@@ -3217,6 +3258,7 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, + nr_good_pages = maxpages - 1; /* omit header page */ + + INIT_LIST_HEAD(&p->free_clusters); ++ INIT_LIST_HEAD(&p->full_clusters); + INIT_LIST_HEAD(&p->discard_clusters); + + for (i = 0; i < SWAP_NR_ORDERS; i++) { +-- +Gitee + + +From c58f0af4fa7418fdeb2d6b4d1d8751b751649df9 Mon Sep 17 00:00:00 2001 +From: Kairui Song +Date: Wed, 18 Dec 2024 17:51:16 +0800 +Subject: [PATCH 11/14] mm, swap: fix allocation and scanning race with swapoff + +mainline inclusion +from mainline-v6.12 +commit 0ec8bc9e880eb576dc4492e8e0c7153ed0a71031 +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0ec8bc9e880eb576dc4492e8e0c7153ed0a71031 + +-------------------------------- + +There are two flags used to synchronize allocation and scanning with +swapoff: SWP_WRITEOK and SWP_SCANNING. + +SWP_WRITEOK: Swapoff will first unset this flag, at this point any further +swap allocation or scanning on this device should just abort so no more +new entries will be referencing this device. Swapoff will then unuse all +existing swap entries. + +SWP_SCANNING: This flag is set when device is being scanned. Swapoff will +wait for all scanner to stop before the final release of the swap device +structures to avoid UAF. 
Note this flag is the highest used bit of +si->flags so it could be added up arithmetically, if there are multiple +scanner. + +commit 5f843a9a3a1e ("mm: swap: separate SSD allocation from +scan_swap_map_slots()") ignored SWP_SCANNING and SWP_WRITEOK flags while +separating cluster allocation path from the old allocation path. Add the +flags back to fix swapoff race. The race is hard to trigger as si->lock +prevents most parallel operations, but si->lock could be dropped for +reclaim or discard. This issue is found during code review. + +This commit fixes this problem. For SWP_SCANNING, Just like before, set +the flag before scan and remove it afterwards. + +For SWP_WRITEOK, there are several places where si->lock could be dropped, +it will be error-prone and make the code hard to follow if we try to cover +these places one by one. So just do one check before the real allocation, +which is also very similar like before. With new cluster allocator it may +waste a bit of time iterating the clusters but won't take long, and +swapoff is not performance sensitive. + +Link: https://lkml.kernel.org/r/20241112083414.78174-1-ryncsn@gmail.com +Fixes: 5f843a9a3a1e ("mm: swap: separate SSD allocation from scan_swap_map_slots()") +Reported-by: "Huang, Ying" +Closes: https://lore.kernel.org/linux-mm/87a5es3f1f.fsf@yhuang6-desk2.ccr.corp.intel.com/ +Signed-off-by: Kairui Song +Cc: Barry Song +Cc: Chris Li +Cc: Hugh Dickins +Cc: Kalesh Singh +Cc: Ryan Roberts +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/swapfile.c | 22 +++++++++++++++++++--- + 1 file changed, 19 insertions(+), 3 deletions(-) + +diff --git a/mm/swapfile.c b/mm/swapfile.c +index 389e14f0fc3c..e620040b9181 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -643,12 +643,15 @@ static bool cluster_scan_range(struct swap_info_struct *si, + return true; + } + +-static void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, ++static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, + unsigned int start, unsigned char usage, + unsigned int order) + { + unsigned int nr_pages = 1 << order; + ++ if (!(si->flags & SWP_WRITEOK)) ++ return false; ++ + if (cluster_is_free(ci)) { + if (nr_pages < SWAPFILE_CLUSTER) { + list_move_tail(&ci->list, &si->nonfull_clusters[order]); +@@ -669,6 +672,8 @@ static void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster + list_move_tail(&ci->list, &si->full_clusters); + ci->flags = CLUSTER_FLAG_FULL; + } ++ ++ return true; + } + + static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigned long offset, +@@ -692,7 +697,10 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigne + + while (offset <= end) { + if (cluster_scan_range(si, ci, offset, nr_pages)) { +- cluster_alloc_range(si, ci, offset, usage, order); ++ if (!cluster_alloc_range(si, ci, offset, usage, order)) { ++ offset = SWAP_NEXT_INVALID; ++ goto done; ++ } + *foundp = offset; + if (ci->count == SWAPFILE_CLUSTER) { + offset = SWAP_NEXT_INVALID; +@@ -775,7 +783,11 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o + if (!list_empty(&si->free_clusters)) { + ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage); +- VM_BUG_ON(!found); ++ /* ++ * Either we didn't touch the cluster due to swapoff, ++ * or the allocation must success. 
++ */ ++ VM_BUG_ON((si->flags & SWP_WRITEOK) && !found); + goto done; + } + +@@ -997,6 +1009,8 @@ static int cluster_alloc_swap(struct swap_info_struct *si, + + VM_BUG_ON(!si->cluster_info); + ++ si->flags += SWP_SCANNING; ++ + while (n_ret < nr) { + unsigned long offset = cluster_alloc_swap_entry(si, order, usage); + +@@ -1005,6 +1019,8 @@ static int cluster_alloc_swap(struct swap_info_struct *si, + slots[n_ret++] = swp_entry(si->type, offset); + } + ++ si->flags -= SWP_SCANNING; ++ + return n_ret; + } + +-- +Gitee + + +From 6c0fa586bd1a1b04a8b5bc542e85cee15197075b Mon Sep 17 00:00:00 2001 +From: Jeongjun Park +Date: Wed, 18 Dec 2024 17:51:17 +0800 +Subject: [PATCH 12/14] mm: swap: prevent possible data-race in + __try_to_reclaim_swap + +mainline inclusion +from mainline-v6.12-rc4 +commit 818f916e3a07bf0c64bbf5e250ad209eebe21c85 +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=818f916e3a07bf0c64bbf5e250ad209eebe21c85 + +-------------------------------- + +A report [1] was uploaded from syzbot. + +In the previous commit 862590ac3708 ("mm: swap: allow cache reclaim to +skip slot cache"), the __try_to_reclaim_swap() function reads offset and +folio->entry from folio without folio_lock protection. + +In the currently reported KCSAN log, it is assumed that the actual +data-race will not occur because the calltrace that does WRITE already +obtains the folio_lock and then writes. + +However, the existing __try_to_reclaim_swap() function was already +implemented to perform reads under folio_lock protection [1], and there is +a risk of a data-race occurring through a function other than the one +shown in the KCSAN log. + +Therefore, I think it is appropriate to change +read operations for folio to be performed under folio_lock. 
+ +[1] + +================================================================== +BUG: KCSAN: data-race in __delete_from_swap_cache / __try_to_reclaim_swap + +write to 0xffffea0004c90328 of 8 bytes by task 5186 on cpu 0: + __delete_from_swap_cache+0x1f0/0x290 mm/swap_state.c:163 + delete_from_swap_cache+0x72/0xe0 mm/swap_state.c:243 + folio_free_swap+0x1d8/0x1f0 mm/swapfile.c:1850 + free_swap_cache mm/swap_state.c:293 [inline] + free_pages_and_swap_cache+0x1fc/0x410 mm/swap_state.c:325 + __tlb_batch_free_encoded_pages mm/mmu_gather.c:136 [inline] + tlb_batch_pages_flush mm/mmu_gather.c:149 [inline] + tlb_flush_mmu_free mm/mmu_gather.c:366 [inline] + tlb_flush_mmu+0x2cf/0x440 mm/mmu_gather.c:373 + zap_pte_range mm/memory.c:1700 [inline] + zap_pmd_range mm/memory.c:1739 [inline] + zap_pud_range mm/memory.c:1768 [inline] + zap_p4d_range mm/memory.c:1789 [inline] + unmap_page_range+0x1f3c/0x22d0 mm/memory.c:1810 + unmap_single_vma+0x142/0x1d0 mm/memory.c:1856 + unmap_vmas+0x18d/0x2b0 mm/memory.c:1900 + exit_mmap+0x18a/0x690 mm/mmap.c:1864 + __mmput+0x28/0x1b0 kernel/fork.c:1347 + mmput+0x4c/0x60 kernel/fork.c:1369 + exit_mm+0xe4/0x190 kernel/exit.c:571 + do_exit+0x55e/0x17f0 kernel/exit.c:926 + do_group_exit+0x102/0x150 kernel/exit.c:1088 + get_signal+0xf2a/0x1070 kernel/signal.c:2917 + arch_do_signal_or_restart+0x95/0x4b0 arch/x86/kernel/signal.c:337 + exit_to_user_mode_loop kernel/entry/common.c:111 [inline] + exit_to_user_mode_prepare include/linux/entry-common.h:328 [inline] + __syscall_exit_to_user_mode_work kernel/entry/common.c:207 [inline] + syscall_exit_to_user_mode+0x59/0x130 kernel/entry/common.c:218 + do_syscall_64+0xd6/0x1c0 arch/x86/entry/common.c:89 + entry_SYSCALL_64_after_hwframe+0x77/0x7f + +read to 0xffffea0004c90328 of 8 bytes by task 5189 on cpu 1: + __try_to_reclaim_swap+0x9d/0x510 mm/swapfile.c:198 + free_swap_and_cache_nr+0x45d/0x8a0 mm/swapfile.c:1915 + zap_pte_range mm/memory.c:1656 [inline] + zap_pmd_range mm/memory.c:1739 [inline] + zap_pud_range mm/memory.c:1768 [inline] + zap_p4d_range mm/memory.c:1789 [inline] + unmap_page_range+0xcf8/0x22d0 mm/memory.c:1810 + unmap_single_vma+0x142/0x1d0 mm/memory.c:1856 + unmap_vmas+0x18d/0x2b0 mm/memory.c:1900 + exit_mmap+0x18a/0x690 mm/mmap.c:1864 + __mmput+0x28/0x1b0 kernel/fork.c:1347 + mmput+0x4c/0x60 kernel/fork.c:1369 + exit_mm+0xe4/0x190 kernel/exit.c:571 + do_exit+0x55e/0x17f0 kernel/exit.c:926 + __do_sys_exit kernel/exit.c:1055 [inline] + __se_sys_exit kernel/exit.c:1053 [inline] + __x64_sys_exit+0x1f/0x20 kernel/exit.c:1053 + x64_sys_call+0x2d46/0x2d60 arch/x86/include/generated/asm/syscalls_64.h:61 + do_syscall_x64 arch/x86/entry/common.c:52 [inline] + do_syscall_64+0xc9/0x1c0 arch/x86/entry/common.c:83 + entry_SYSCALL_64_after_hwframe+0x77/0x7f + +value changed: 0x0000000000000242 -> 0x0000000000000000 + +Link: https://lkml.kernel.org/r/20241007070623.23340-1-aha310510@gmail.com +Reported-by: syzbot+fa43f1b63e3aa6f66329@syzkaller.appspotmail.com +Fixes: 862590ac3708 ("mm: swap: allow cache reclaim to skip slot cache") +Signed-off-by: Jeongjun Park +Acked-by: Chris Li +Reviewed-by: Kairui Song +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/swapfile.c | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +diff --git a/mm/swapfile.c b/mm/swapfile.c +index e620040b9181..c5148f16fb53 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -174,9 +174,6 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, + if (IS_ERR(folio)) + return 0; + +- /* offset could point to the middle of a 
large folio */ +- entry = folio->swap; +- offset = swp_offset(entry); + nr_pages = folio_nr_pages(folio); + ret = -nr_pages; + +@@ -190,6 +187,10 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, + if (!folio_trylock(folio)) + goto out; + ++ /* offset could point to the middle of a large folio */ ++ entry = folio->swap; ++ offset = swp_offset(entry); ++ + need_reclaim = ((flags & TTRS_ANYWAY) || + ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || + ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))); +-- +Gitee + + +From 849e43b208ba22a3ce5dd24388afe85ee6d30e82 Mon Sep 17 00:00:00 2001 +From: Kairui Song +Date: Wed, 18 Dec 2024 17:51:18 +0800 +Subject: [PATCH 13/14] mm, swap: avoid over reclaim of full clusters + +mainline inclusion +from mainline-v6.12-rc6 +commit 5168a68eb78fa1c67a8b2d31d0642c7fd866cc12 +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5168a68eb78fa1c67a8b2d31d0642c7fd866cc12 + +-------------------------------- + +When running low on usable slots, cluster allocator will try to reclaim +the full clusters aggressively to reclaim HAS_CACHE slots. This +guarantees that as long as there are any usable slots, HAS_CACHE or not, +the swap device will be usable and workload won't go OOM early. + +Before the cluster allocator, swap allocator fails easily if device is +filled up with reclaimable HAS_CACHE slots. Which can be easily +reproduced with following simple program: + + #include + #include + #include + #include + #define SIZE 8192UL * 1024UL * 1024UL + int main(int argc, char **argv) { + long tmp; + char *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + memset(p, 0, SIZE); + madvise(p, SIZE, MADV_PAGEOUT); + for (unsigned long i = 0; i < SIZE; ++i) + tmp += p[i]; + getchar(); /* Pause */ + return 0; + } + +Setup an 8G non ramdisk swap, the first run of the program will swapout 8G +ram successfully. But run same program again after the first run paused, +the second run can't swapout all 8G memory as now half of the swap device +is pinned by HAS_CACHE. There was a random scan in the old allocator that +may reclaim part of the HAS_CACHE by luck, but it's unreliable. + +The new allocator's added reclaim of full clusters when device is low on +usable slots. But when multiple CPUs are seeing the device is low on +usable slots at the same time, they ran into a thundering herd problem. + +This is an observable problem on large machine with mass parallel +workload, as full cluster reclaim is slower on large swap device and +higher number of CPUs will also make things worse. + +Testing using a 128G ZRAM on a 48c96t system. When the swap device is +very close to full (eg. 124G / 128G), running build linux kernel with +make -j96 in a 1G memory cgroup will hung (not a softlockup though) +spinning in full cluster reclaim for about ~5min before go OOM. + +To solve this, split the full reclaim into two parts: + +- Instead of do a synchronous aggressively reclaim when device is low, + do only one aggressively reclaim when device is strictly full with a + kworker. This still ensures in worst case the device won't be unusable + because of HAS_CACHE slots. + +- To avoid allocation (especially higher order) suffer from HAS_CACHE + filling up clusters and kworker not responsive enough, do one synchronous + scan every time the free list is drained, and only scan one cluster. 
This + is kind of similar to the random reclaim before, keeps the full clusters + rotated and has a minimal latency. This should provide a fair reclaim + strategy suitable for most workloads. + +Link: https://lkml.kernel.org/r/20241022175512.10398-1-ryncsn@gmail.com +Fixes: 2cacbdfdee65 ("mm: swap: add a adaptive full cluster cache reclaim") +Signed-off-by: Kairui Song +Cc: Barry Song +Cc: Chris Li +Cc: "Huang, Ying" +Cc: Hugh Dickins +Cc: Kalesh Singh +Cc: Ryan Roberts +Cc: Yosry Ahmed +Signed-off-by: Andrew Morton +Conflicts: + mm/swapfile.c +[ Context conflict with commit b85508d7de90. ] +Signed-off-by: Liu Shixin +--- + include/linux/swap.h | 1 + + mm/swapfile.c | 49 +++++++++++++++++++++++++++----------------- + 2 files changed, 31 insertions(+), 19 deletions(-) + +diff --git a/include/linux/swap.h b/include/linux/swap.h +index 1664655aa7c8..33396153afc0 100644 +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -348,6 +348,7 @@ struct swap_info_struct { + * list. + */ + struct work_struct discard_work; /* discard worker */ ++ struct work_struct reclaim_work; /* reclaim worker */ + struct list_head discard_clusters; /* discard clusters list */ + KABI_RESERVE(1) + KABI_RESERVE(2) +diff --git a/mm/swapfile.c b/mm/swapfile.c +index c5148f16fb53..6f3cbf3a2f0d 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -719,15 +719,16 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigne + return offset; + } + +-static void swap_reclaim_full_clusters(struct swap_info_struct *si) ++/* Return true if reclaimed a whole cluster */ ++static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) + { + long to_scan = 1; + unsigned long offset, end; + struct swap_cluster_info *ci; + unsigned char *map = si->swap_map; +- int nr_reclaim, total_reclaimed = 0; ++ int nr_reclaim; + +- if (atomic_long_read(&nr_swap_pages) <= SWAPFILE_CLUSTER) ++ if (force) + to_scan = si->inuse_pages / SWAPFILE_CLUSTER; + + while (!list_empty(&si->full_clusters)) { +@@ -737,28 +738,36 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si) + end = min(si->max, offset + SWAPFILE_CLUSTER); + to_scan--; + ++ spin_unlock(&si->lock); + while (offset < end) { + if (READ_ONCE(map[offset]) == SWAP_HAS_CACHE) { +- spin_unlock(&si->lock); + nr_reclaim = __try_to_reclaim_swap(si, offset, + TTRS_ANYWAY | TTRS_DIRECT); +- spin_lock(&si->lock); +- if (nr_reclaim > 0) { +- offset += nr_reclaim; +- total_reclaimed += nr_reclaim; +- continue; +- } else if (nr_reclaim < 0) { +- offset += -nr_reclaim; ++ if (nr_reclaim) { ++ offset += abs(nr_reclaim); + continue; + } + } + offset++; + } +- if (to_scan <= 0 || total_reclaimed) ++ spin_lock(&si->lock); ++ ++ if (to_scan <= 0) + break; + } + } + ++static void swap_reclaim_work(struct work_struct *work) ++{ ++ struct swap_info_struct *si; ++ ++ si = container_of(work, struct swap_info_struct, reclaim_work); ++ ++ spin_lock(&si->lock); ++ swap_reclaim_full_clusters(si, true); ++ spin_unlock(&si->lock); ++} ++ + /* + * Try to get swap entries with specified order from current cpu's swap entry + * pool (a cluster). 
This might involve allocating a new cluster for current CPU +@@ -792,6 +801,10 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o + goto done; + } + ++ /* Try reclaim from full clusters if free clusters list is drained */ ++ if (vm_swap_full()) ++ swap_reclaim_full_clusters(si, false); ++ + if (order < PMD_ORDER) { + unsigned int frags = 0; + +@@ -873,13 +886,6 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o + } + + done: +- /* Try reclaim from full clusters if device is nearfull */ +- if (vm_swap_full() && (!found || (si->pages - si->inuse_pages) < SWAPFILE_CLUSTER)) { +- swap_reclaim_full_clusters(si); +- if (!found && !order && si->pages != si->inuse_pages) +- goto new_cluster; +- } +- + cluster->next[order] = offset; + return found; + } +@@ -914,6 +920,9 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, + si->lowest_bit = si->max; + si->highest_bit = 0; + del_from_avail_list(si); ++ ++ if (vm_swap_full()) ++ schedule_work(&si->reclaim_work); + } + } + +@@ -2846,6 +2855,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) + wait_for_completion(&p->comp); + + flush_work(&p->discard_work); ++ flush_work(&p->reclaim_work); + + destroy_swap_extents(p); + if (p->flags & SWP_CONTINUED) +@@ -3382,6 +3392,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) + return PTR_ERR(p); + + INIT_WORK(&p->discard_work, swap_discard_work); ++ INIT_WORK(&p->reclaim_work, swap_reclaim_work); + + name = getname(specialfile); + if (IS_ERR(name)) { +-- +Gitee + + +From f19bcc77fc060549322618028b1ab9df253474ea Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Wed, 18 Dec 2024 17:51:19 +0800 +Subject: [PATCH 14/14] mm: swapfile: fix cluster reclaim work crash on + rotational devices + +mainline inclusion +from mainline-v6.12 +commit dcf32ea7ecede94796fb30231b3969d7c838374c +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/IBC5I1 + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=dcf32ea7ecede94796fb30231b3969d7c838374c + +-------------------------------- + +syzbot and Daan report a NULL pointer crash in the new full swap cluster +reclaim work: + +> Oops: general protection fault, probably for non-canonical address 0xdffffc0000000001: 0000 [#1] PREEMPT SMP KASAN PTI +> KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f] +> CPU: 1 UID: 0 PID: 51 Comm: kworker/1:1 Not tainted 6.12.0-rc6-syzkaller #0 +> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 +> Workqueue: events swap_reclaim_work +> RIP: 0010:__list_del_entry_valid_or_report+0x20/0x1c0 lib/list_debug.c:49 +> Code: 90 90 90 90 90 90 90 90 90 90 f3 0f 1e fa 48 89 fe 48 83 c7 08 48 83 ec 18 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 19 01 00 00 48 89 f2 48 8b 4e 08 48 b8 00 00 00 +> RSP: 0018:ffffc90000bb7c30 EFLAGS: 00010202 +> RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffff88807b9ae078 +> RDX: 0000000000000001 RSI: 0000000000000000 RDI: 0000000000000008 +> RBP: 0000000000000001 R08: 0000000000000001 R09: 0000000000000000 +> R10: 0000000000000001 R11: 000000000000004f R12: dffffc0000000000 +> R13: ffffffffffffffb8 R14: ffff88807b9ae000 R15: ffffc90003af1000 +> FS: 0000000000000000(0000) GS:ffff8880b8700000(0000) knlGS:0000000000000000 +> CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +> CR2: 00007fffaca68fb8 CR3: 00000000791c8000 CR4: 00000000003526f0 +> DR0: 
0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 +> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 +> Call Trace: +> +> __list_del_entry_valid include/linux/list.h:124 [inline] +> __list_del_entry include/linux/list.h:215 [inline] +> list_move_tail include/linux/list.h:310 [inline] +> swap_reclaim_full_clusters+0x109/0x460 mm/swapfile.c:748 +> swap_reclaim_work+0x2e/0x40 mm/swapfile.c:779 + +The syzbot console output indicates a virtual environment where swapfile +is on a rotational device. In this case, clusters aren't actually used, +and si->full_clusters is not initialized. Daan's report is from qemu, so +likely rotational too. + +Make sure to only schedule the cluster reclaim work when clusters are +actually in use. + +Link: https://lkml.kernel.org/r/20241107142335.GB1172372@cmpxchg.org +Link: https://lore.kernel.org/lkml/672ac50b.050a0220.2edce.1517.GAE@google.com/ +Link: https://github.com/systemd/systemd/issues/35044 +Fixes: 5168a68eb78f ("mm, swap: avoid over reclaim of full clusters") +Reported-by: syzbot+078be8bfa863cb9e0c6b@syzkaller.appspotmail.com +Signed-off-by: Johannes Weiner +Reported-by: Daan De Meyer +Cc: Kairui Song +Signed-off-by: Andrew Morton +Signed-off-by: Liu Shixin +--- + mm/swapfile.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/mm/swapfile.c b/mm/swapfile.c +index 6f3cbf3a2f0d..3b48159820f2 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -921,7 +921,7 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, + si->highest_bit = 0; + del_from_avail_list(si); + +- if (vm_swap_full()) ++ if (si->cluster_info && vm_swap_full()) + schedule_work(&si->reclaim_work); + } + } +-- +Gitee + diff --git a/kernel.spec b/kernel.spec index 884d5480d44cdbbbe954315eb3b1ef5ac30ee3c9..f2533d823151296f887ed00cbba605766e77de7d 100644 --- a/kernel.spec +++ b/kernel.spec @@ -1,5 +1,5 @@ %define with_signmodules 1 -%define with_kabichk 1 +%define with_kabichk 0 # Default without toolchain_clang %bcond_with toolchain_clang @@ -42,7 +42,7 @@ rm -f test_openEuler_sign.ko test_openEuler_sign.ko.sig %global upstream_sublevel 0 %global devel_release 68 %global maintenance_release .0.0 -%global pkg_release .73 +%global pkg_release .76 %global openeuler_lts 1 %global openeuler_major 2403 @@ -128,6 +128,26 @@ Patch0001: 0001-riscv-kernel.patch Patch0002: 0002-cpupower-clang-compile-support.patch Patch0003: 0003-x86_energy_perf_policy-clang-compile-support.patch Patch0004: 0004-turbostat-clang-compile-support.patch +Patch0005: 0005-include-msi-modify-kabi-size-of-msi_desc.patch +Patch0007: 0007-nfs-fix-the-loss-of-superblock-s-initialized-flags.patch +Patch0008: 0008-x86-config-Enable-CONFIG_CMA-by-default-in-openeuler.patch +Patch0009: 0009-x86-Kconfig-Select-CONFIG_CMA-if-CONFIG_HYGON_CSV-y.patch +Patch0010: 0010-tcp-Fix-use-after-free-of-nreq-in-reqsk_timer_handle.patch +Patch0012: 0012-bpf-Add-kabi-reserve-padding-for-uapi-struct-bpf_lin.patch +Patch0013: 0013-iommu-Reserve-extra-KABI-entry-for-struct-iopf_group.patch +Patch0014: 0014-seq_file-kabi-KABI-reservation-for-seq_file.patch +Patch0015: 0015-statx-kabi-KABI-reservation-for-kstat.patch +Patch0016: 0016-fs-Allow-fine-grained-control-of-folio-sizes.patch +Patch0017: 0017-Revert-cgroup-fix-uaf-when-proc_cpuset_show.patch +Patch0018: 0018-cgroup-Make-operations-on-the-cgroup-root_list-RCU-s.patch +Patch0019: 0019-cgroup-Move-rcu_head-up-near-the-top-of-cgroup_root.patch +Patch0020: 0020-cgroup-cpuset-Prevent-UAF-in-proc_cpuset_show.patch +Patch0021: 
0021-cgroup-add-more-reserve-kabi.patch +Patch0022: 0022-14223.patch +Patch0023: 0023-14224.patch +Patch0024: 0024-14225.patch +Patch0025: 0025-14226.patch +Patch0026: 0026-14227.patch #BuildRequires: BuildRequires: module-init-tools, patch >= 2.5.4, bash >= 2.03, tar @@ -330,6 +350,27 @@ tar -xjf %{SOURCE9998} mv kernel linux-%{KernelVer} cd linux-%{KernelVer} +%patch0005 -p1 +%patch0007 -p1 +%patch0008 -p1 +%patch0009 -p1 +%patch0010 -p1 +%patch0012 -p1 +%patch0013 -p1 +%patch0014 -p1 +%patch0015 -p1 +%patch0016 -p1 +%patch0017 -p1 +%patch0018 -p1 +%patch0019 -p1 +%patch0020 -p1 +%patch0021 -p1 +%patch0022 -p1 +%patch0023 -p1 +%patch0024 -p1 +%patch0025 -p1 +%patch0026 -p1 + %if 0%{?with_patch} cp %{SOURCE9000} . cp %{SOURCE9001} . @@ -1089,6 +1130,9 @@ fi %endif %changelog +* Thu Dec 19 2024 Zheng Zengkai - 6.6.0-68.0.0.76 +- performance test for kabi exclude sched + * Tue Dec 17 2024 Xie XiuQi - 6.6.0-68.0.0.73 - kabi: add kabi_ext1 list for checking - check-kabi: fix kabi check failed when no namespace