From a2498b6fd8b9d88e105460d5ee544a1d55ccf03a Mon Sep 17 00:00:00 2001 From: Yang Rong Date: Mon, 7 Jul 2025 20:31:13 +0800 Subject: [PATCH 1/6] anolis: mm/memory_hotplug: add MHP_PHASE_* macros ANBZ: #18841 We added three macros to represent different phases of memory hotplug. MHP_PHASE_DEFAULT represents the original process, while MHP_PHASE_PREPARE and MHP_PHASE_DEFERRED represent the process split into the prepare phase and the deferred phase. The purpose of this change is to move time-consuming operations to the deferred phase, thereby effectively improving the speed of memory hotplug through concurrency. Signed-off-by: Yang Rong --- include/linux/memory_hotplug.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 26b4ac8c59a7..5ab6289597ae 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -73,6 +73,9 @@ static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) #endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ #ifdef CONFIG_MEMORY_HOTPLUG +#define MHP_PHASE_PREPARE 1 +#define MHP_PHASE_DEFERRED 2 +#define MHP_PHASE_DEFAULT 3 /* * Return page for the valid pfn only if the page is online. All pfn * walkers which rely on the fully initialized page->flags and others -- Gitee From 06cdcf246e1839a69f0c875d6f82e77b941108f4 Mon Sep 17 00:00:00 2001 From: Yang Rong Date: Mon, 7 Jul 2025 21:08:20 +0800 Subject: [PATCH 2/6] anolis: mm/memory_hotplug: refactor adjust_present_page_count() ANBZ: #18841 This commit refactors the `adjust_present_page_count()` function to prepare for deferred memory online support. If memory online is deferred, then the adjustment of the present pages must also be deferred. Signed-off-by: Yang Rong --- mm/memory_hotplug.c | 45 ++++++++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b91c0806228a..42ca17564df1 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -979,6 +979,33 @@ struct zone *zone_for_pfn_range(int online_type, int nid, return default_zone_for_pfn(nid, start_pfn, nr_pages); } +void __adjust_present_page_count(struct page *page, struct memory_group *group, + long nr_pages, struct zone *zone, int phase) +{ + const bool movable = zone_idx(zone) == ZONE_MOVABLE; + unsigned long flags; + + if (phase == MHP_PHASE_DEFAULT || phase == MHP_PHASE_DEFERRED) { + /* + * We only support onlining/offlining/adding/removing of complete + * memory blocks; therefore, either all is either early or hotplugged. + */ + if (early_section(__pfn_to_section(page_to_pfn(page)))) + zone->present_early_pages += nr_pages; + zone->present_pages += nr_pages; + pgdat_resize_lock(zone->zone_pgdat, &flags); + zone->zone_pgdat->node_present_pages += nr_pages; + pgdat_resize_unlock(zone->zone_pgdat, &flags); + } + + if (phase == MHP_PHASE_DEFAULT || phase == MHP_PHASE_PREPARE) { + if (group && movable) + group->present_movable_pages += nr_pages; + else if (group && !movable) + group->present_kernel_pages += nr_pages; + } +} + /* * This function should only be called by memory_block_{online,offline}, * and {online,offline}_pages. 
@@ -987,24 +1014,8 @@ void adjust_present_page_count(struct page *page, struct memory_group *group, long nr_pages) { struct zone *zone = page_zone(page); - const bool movable = zone_idx(zone) == ZONE_MOVABLE; - unsigned long flags; - - /* - * We only support onlining/offlining/adding/removing of complete - * memory blocks; therefore, either all is either early or hotplugged. - */ - if (early_section(__pfn_to_section(page_to_pfn(page)))) - zone->present_early_pages += nr_pages; - zone->present_pages += nr_pages; - pgdat_resize_lock(zone->zone_pgdat, &flags); - zone->zone_pgdat->node_present_pages += nr_pages; - pgdat_resize_unlock(zone->zone_pgdat, &flags); - if (group && movable) - group->present_movable_pages += nr_pages; - else if (group && !movable) - group->present_kernel_pages += nr_pages; + __adjust_present_page_count(page, group, nr_pages, zone, MHP_PHASE_DEFAULT); } int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, -- Gitee From 6da895ffc43849a5fd620aa76a19a9af2c83fd3a Mon Sep 17 00:00:00 2001 From: Yang Rong Date: Tue, 8 Jul 2025 11:22:01 +0800 Subject: [PATCH 3/6] anolis: mm/memory_hotplug: refactor move_pfn_range_to_zone() ANBZ: #18841 This commit refactors the `move_pfn_range_to_zone()` function to prepare for deferred memory online support. `memmap_init_zone()` is a time-consuming operation, we put it into the deferred phase for execution. Signed-off-by: Yang Rong --- mm/memory_hotplug.c | 65 +++++++++++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 26 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 42ca17564df1..fe0edca80a0a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -660,37 +660,31 @@ static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned lon pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn; } -/* - * Associate the pfn range with the given zone, initializing the memmaps - * and resizing the pgdat/zone data to span the added pages. After this - * call, all affected pages are PG_reserved. - * - * All aligned pageblocks are initialized to the specified migratetype - * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related - * zone stats (e.g., nr_isolate_pageblock) are touched. - */ -void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, - unsigned long nr_pages, - struct vmem_altmap *altmap, int migratetype) + +void __ref __move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages, struct vmem_altmap *altmap, + int migratetype, int phase) { struct pglist_data *pgdat = zone->zone_pgdat; int nid = pgdat->node_id; unsigned long flags; + if (phase == MHP_PHASE_DEFAULT || phase == MHP_PHASE_PREPARE) { #ifdef KIDLED_AGE_NOT_IN_PAGE_FLAGS - kidled_free_page_age(pgdat); + kidled_free_page_age(pgdat); #endif - clear_zone_contiguous(zone); - - /* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */ - pgdat_resize_lock(pgdat, &flags); - zone_span_writelock(zone); - if (zone_is_empty(zone)) - init_currently_empty_zone(zone, start_pfn, nr_pages); - resize_zone_range(zone, start_pfn, nr_pages); - zone_span_writeunlock(zone); - resize_pgdat_range(pgdat, start_pfn, nr_pages); - pgdat_resize_unlock(pgdat, &flags); + clear_zone_contiguous(zone); + + /* TODO Huh pgdat is irqsave while zone is not. 
It used to be like that before */ + pgdat_resize_lock(pgdat, &flags); + zone_span_writelock(zone); + if (zone_is_empty(zone)) + init_currently_empty_zone(zone, start_pfn, nr_pages); + resize_zone_range(zone, start_pfn, nr_pages); + zone_span_writeunlock(zone); + resize_pgdat_range(pgdat, start_pfn, nr_pages); + pgdat_resize_unlock(pgdat, &flags); + } /* * TODO now we have a visible range of pages which are not associated @@ -698,10 +692,29 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, * expects the zone spans the pfn range. All the pages in the range * are reserved so nobody should be touching them so we should be safe */ - memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, 0, + if (phase == MHP_PHASE_DEFAULT || phase == MHP_PHASE_DEFERRED) + memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, 0, MEMINIT_HOTPLUG, altmap, migratetype); - set_zone_contiguous(zone); + if (phase == MHP_PHASE_DEFAULT || phase == MHP_PHASE_PREPARE) + set_zone_contiguous(zone); +} + +/* + * Associate the pfn range with the given zone, initializing the memmaps + * and resizing the pgdat/zone data to span the added pages. After this + * call, all affected pages are PG_reserved. + * + * All aligned pageblocks are initialized to the specified migratetype + * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related + * zone stats (e.g., nr_isolate_pageblock) are touched. + */ +void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages, + struct vmem_altmap *altmap, int migratetype) +{ + __move_pfn_range_to_zone(zone, start_pfn, nr_pages, altmap, migratetype, + MHP_PHASE_DEFAULT); } struct auto_movable_stats { -- Gitee From c4e30b2eab16510ead4e5c2cd0daa59e3d0280eb Mon Sep 17 00:00:00 2001 From: Yang Rong Date: Tue, 8 Jul 2025 14:03:24 +0800 Subject: [PATCH 4/6] anolis: mm/memory_hotplug: refactor online_pages() ANBZ: #18841 This commit refactors the `online_pages()` function to prepare for deferred memory online support. `online_pages()` is the core function for memory hotplug. Initializing struct pages and freeing pages to buddy are the most time-consuming operations, so we move these operations and related ones to the deferred phase. Additionally, since the adjustment of `present_pages` is deferred, and `auto_movable_zone_for_pfn()` uses `present_pages` to determine which zone the new memory block belongs to, we introduce `deferred_pages` to indicate the number of deferred pages. This allows `auto_movable_zone_for_pfn()` to make decisions based on both `present_pages` and `deferred_pages`. 
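For illustration, the intended split can be sketched as follows (illustrative
snippet, not a verbatim call site from this series; the real callers are wired
up in the later patches):

	/*
	 * Prepare phase: cheap bookkeeping under the hotplug lock
	 * (zone/pgdat span resize, memory group accounting,
	 * zone->deferred_pages += nr_pages).
	 */
	ret = __online_pages(pfn, nr_pages, zone, group, MHP_PHASE_PREPARE);
	if (ret)
		return ret;

	/* ... possibly much later, e.g. from a per-node worker ... */

	/*
	 * Deferred phase: the expensive work (memmap init, freeing pages to
	 * the buddy allocator, present_pages adjustment), run without
	 * re-taking the hotplug lock.
	 */
	ret = __online_pages(pfn, nr_pages, zone, group, MHP_PHASE_DEFERRED);

While pages sit between the two phases they are accounted in
`zone->deferred_pages`, which is what lets `auto_movable_zone_for_pfn()` keep
making correct decisions.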
Signed-off-by: Yang Rong --- include/linux/mmzone.h | 6 ++++++ mm/memory_hotplug.c | 49 +++++++++++++++++++++++++++++++++++------- 2 files changed, 47 insertions(+), 8 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3b4ff5685af7..5ce3718c1719 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -873,6 +873,7 @@ struct zone { atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS]; unsigned long reported_pages; + atomic_long_t deferred_pages; CK_KABI_RESERVE(1) CK_KABI_RESERVE(2) @@ -895,6 +896,11 @@ static inline unsigned long zone_managed_pages(struct zone *zone) return (unsigned long)atomic_long_read(&zone->managed_pages); } +static inline unsigned long zone_deferred_pages(struct zone *zone) +{ + return (unsigned long)atomic_long_read(&zone->deferred_pages); +} + static inline unsigned long zone_cma_pages(struct zone *zone) { #ifdef CONFIG_CMA diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index fe0edca80a0a..023ce457a914 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -726,7 +726,8 @@ static void auto_movable_stats_account_zone(struct auto_movable_stats *stats, struct zone *zone) { if (zone_idx(zone) == ZONE_MOVABLE) { - stats->movable_pages += zone->present_pages; + stats->movable_pages += + zone->present_pages + zone_deferred_pages(zone); } else { stats->kernel_early_pages += zone->present_early_pages; #ifdef CONFIG_CMA @@ -1077,14 +1078,16 @@ void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages) kasan_remove_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages)); } -int __ref online_pages(unsigned long pfn, unsigned long nr_pages, - struct zone *zone, struct memory_group *group) +int __ref __online_pages(unsigned long pfn, unsigned long nr_pages, + struct zone *zone, struct memory_group *group, + int phase) { unsigned long flags; int need_zonelists_rebuild = 0; const int nid = zone_to_nid(zone); int ret; struct memory_notify arg; + bool need_lock = phase == MHP_PHASE_DEFAULT || phase == MHP_PHASE_PREPARE; /* * {on,off}lining is constrained to full memory sections (or more @@ -1098,10 +1101,14 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, !IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION))) return -EINVAL; - mem_hotplug_begin(); + if (need_lock) + mem_hotplug_begin(); /* associate pfn range with the zone */ - move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE); + __move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE, phase); + + if (phase == MHP_PHASE_PREPARE) + goto adjust_count; arg.start_pfn = pfn; arg.nr_pages = nr_pages; @@ -1131,7 +1138,14 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, } online_pages_range(pfn, nr_pages); - adjust_present_page_count(pfn_to_page(pfn), group, nr_pages); + +adjust_count: + __adjust_present_page_count(pfn_to_page(pfn), group, nr_pages, zone, phase); + if (phase == MHP_PHASE_PREPARE) { + atomic_long_add(nr_pages, &zone->deferred_pages); + goto out; + } else if (phase == MHP_PHASE_DEFERRED) + atomic_long_sub(nr_pages, &zone->deferred_pages); node_states_set_node(nid, &arg); if (need_zonelists_rebuild) @@ -1158,7 +1172,10 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, writeback_set_ratelimit(); memory_notify(MEM_ONLINE, &arg); - mem_hotplug_done(); + +out: + if (need_lock) + mem_hotplug_done(); return 0; failed_addition: @@ -1167,9 +1184,16 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1); 
memory_notify(MEM_CANCEL_ONLINE, &arg); remove_pfn_range_from_zone(zone, pfn, nr_pages); - mem_hotplug_done(); + if (need_lock) + mem_hotplug_done(); return ret; } + +int __ref online_pages(unsigned long pfn, unsigned long nr_pages, + struct zone *zone, struct memory_group *group) +{ + return __online_pages(pfn, nr_pages, zone, group, MHP_PHASE_DEFAULT); +} #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ static void reset_node_present_pages(pg_data_t *pgdat) @@ -1182,6 +1206,14 @@ static void reset_node_present_pages(pg_data_t *pgdat) pgdat->node_present_pages = 0; } +static void reset_node_deferred_pages(pg_data_t *pgdat) +{ + struct zone *z; + + for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) + atomic_long_set(&z->deferred_pages, 0); +} + /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ static pg_data_t __ref *hotadd_init_pgdat(int nid) { @@ -1212,6 +1244,7 @@ static pg_data_t __ref *hotadd_init_pgdat(int nid) */ reset_node_managed_pages(pgdat); reset_node_present_pages(pgdat); + reset_node_deferred_pages(pgdat); return pgdat; } -- Gitee From 263e0c94539a2754943b695353d925acd93a5c80 Mon Sep 17 00:00:00 2001 From: Yang Rong Date: Tue, 8 Jul 2025 15:56:43 +0800 Subject: [PATCH 5/6] anolis: mm/memory_hotplug: support parallel deferred memory online ANBZ: #18841 Memory hotplug is a serial process that adds memory to Linux in the granularity of memory blocks. We identified two memory initialization functions that consume significant time when onling memory blocks: - `__init_single_page`: initialize the struct page - `__free_pages_core`: add page to the buddy allocator We attempted to execute these two functions in parallel during the process of hotplugging a memory block. The experimental results showed that when the memory block size was 1GB, the hotplug speed was increased by approximately 200%. However, when the memory block size was 128MB, which is the more commonly used size, the hotplug speed was even worse than that of serial execution. Therefore, how to improve the hotplug speed when the memory block size is 128MB remains a challenge. Here is my idea: - Defer the execution of these two functions and their associated processs to the final phase of the entire hotplug process, so that the hotplug speed will no longer be limited by the memory block size. - Perform parallel execution in the final phase, as previous implementations have proven that this can accelerate the hotplug process. We introduce the new online function, `deferred_online_memory`, for deferring the actual online process of memory blocks. Additionally, we have added a command-line argument, parallel_hotplug_ratio, which sets the ratio of parallel workers to the number of CPUs on the node. When parallel_hotplug_ratio is 0, the memory online process will no longer be deferred. 
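To make the sizing concrete (worked example only): with
`parallel_hotplug_ratio=80` on a node with 64 CPUs, `deferred_online_memory()`
requests up to 64 * 80 / 100 = 51 workers, so a 400 GiB range of 128 MiB
blocks is split into chunks of ALIGN(400 GiB / 51, 128 MiB) = 8064 MiB,
i.e. 50 chunks with the division remainder folded into the last one.
A hotplug driver is expected to use the new interface roughly as follows
(illustrative sketch; virtio-mem is converted in the next patch):

	/*
	 * Sketch of the expected driver flow: the blocks in
	 * [start, start + size) were added while parallel_hotplug_ratio was
	 * non-zero, so memory_block_online() only ran the MHP_PHASE_PREPARE
	 * path for them and marked them MEM_NEED_DEFER; this call finishes
	 * the online in parallel (already-online blocks are skipped).
	 */
	rc = deferred_online_memory(nid, start, size);
	if (rc)
		pr_err("deferred online of [%#llx, %#llx) failed: %d\n",
		       start, start + size, rc);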
Signed-off-by: Yang Rong --- drivers/base/memory.c | 24 +++++- include/linux/memory.h | 10 +++ include/linux/memory_hotplug.h | 5 ++ include/linux/mmzone.h | 7 ++ mm/memory_hotplug.c | 149 +++++++++++++++++++++++++++++++-- 5 files changed, 184 insertions(+), 11 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index a82db349a3a8..da25ccba297b 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -198,8 +198,22 @@ static int memory_block_online(struct memory_block *mem) if (ret) return ret; } - - ret = online_pages(start_pfn + nr_vmemmap_pages, + /* + * Defer struct pages initialization and defer freeing pages to buddy + * allocator starting from at least the second memory block of the zone, + * as rebuilding the zone is not required from that point onwards. + */ + if (parallel_hotplug_ratio && + start_pfn + nr_vmemmap_pages >= + zone->zone_start_pfn + + (memory_block_size_bytes() >> PAGE_SHIFT)) { + ret = __online_pages(start_pfn + nr_vmemmap_pages, + nr_pages - nr_vmemmap_pages, zone, + mem->group, MHP_PHASE_PREPARE); + atomic_set(&mem->deferred_state, MEM_NEED_DEFER); + mem->deferred_zone = zone; + } else + ret = online_pages(start_pfn + nr_vmemmap_pages, nr_pages - nr_vmemmap_pages, zone, mem->group); if (ret) { if (nr_vmemmap_pages) @@ -286,7 +300,9 @@ static int memory_block_change_state(struct memory_block *mem, mem->state = MEM_GOING_OFFLINE; ret = memory_block_action(mem, to_state); - mem->state = ret ? from_state_req : to_state; + mem->state = + (ret || atomic_read(&mem->deferred_state) == MEM_NEED_DEFER) ? + from_state_req : to_state; return ret; } @@ -675,6 +691,8 @@ static int init_memory_block(unsigned long block_id, unsigned long state, mem->state = state; mem->nid = NUMA_NO_NODE; mem->nr_vmemmap_pages = nr_vmemmap_pages; + atomic_set(&mem->deferred_state, MEM_SKIP_DEFER); + mem->deferred_zone = NULL; INIT_LIST_HEAD(&mem->group_next); if (group) { diff --git a/include/linux/memory.h b/include/linux/memory.h index 4ddc3b960ae9..567682ce4c26 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -65,6 +65,10 @@ struct memory_group { }; }; +/* Memory block defer state flags */ +#define MEM_SKIP_DEFER 0 +#define MEM_NEED_DEFER 1 + struct memory_block { unsigned long start_section_nr; unsigned long state; /* serialized by the dev->lock */ @@ -76,6 +80,12 @@ struct memory_block { * lay at the beginning of the memory block. */ unsigned long nr_vmemmap_pages; + /* + * Whether struct pages initialization and free pages + * to buddy allocator needs to be deferred or not. 
+ */ + atomic_t deferred_state; + struct zone *deferred_zone; /* zone for this defered block */ struct memory_group *group; /* group (if any) for this block */ struct list_head group_next; /* next block inside memory group */ }; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 5ab6289597ae..b5d5b1b82c61 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -183,6 +183,9 @@ extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages); extern int online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone, struct memory_group *group); +extern int __online_pages(unsigned long pfn, unsigned long nr_pages, + struct zone *zone, struct memory_group *group, + int phase); extern struct zone *test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn); extern void __offline_isolated_pages(unsigned long start_pfn, @@ -203,6 +206,7 @@ extern u64 max_mem_size; extern int memhp_online_type_from_str(const char *str); extern bool skip_set_contiguous; +extern unsigned int parallel_hotplug_ratio; /* Default online_type (MMOP_*) when new memory blocks are added. */ extern int memhp_default_online_type; /* If movable_node boot option specified */ @@ -359,6 +363,7 @@ extern struct zone *zone_for_pfn_range(int online_type, int nid, struct memory_group *group, unsigned long start_pfn, unsigned long nr_pages); extern bool mhp_supports_memmap_on_memory(unsigned long size); +extern int deferred_online_memory(int nid, u64 start, u64 size); #endif /* CONFIG_MEMORY_HOTPLUG */ #endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5ce3718c1719..16e35e24cc58 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1052,6 +1052,13 @@ typedef struct pglist_data { * Nests above zone->lock and zone->span_seqlock */ spinlock_t node_size_lock; +#endif +#ifdef CONFIG_MEMORY_HOTPLUG + /* + * This workqueue is used to handle deferred pages + * initialization of hotplugged memory. + */ + struct workqueue_struct *deferred_hotplug_wq; #endif unsigned long node_start_pfn; unsigned long node_present_pages; /* total number of physical pages */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 023ce457a914..fcefbe5978e7 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -122,6 +122,13 @@ bool skip_set_contiguous __read_mostly; module_param(skip_set_contiguous, bool, 0644); MODULE_PARM_DESC(skip_set_contiguous, "Do not set zone contiguous when online/offline pages"); +unsigned int parallel_hotplug_ratio __read_mostly; +EXPORT_SYMBOL_GPL(parallel_hotplug_ratio); +module_param(parallel_hotplug_ratio, uint, 0644); +MODULE_PARM_DESC(parallel_hotplug_ratio, + "Set the ratio of parallel hotplug workers to the number of CPUs on " + "the node, with values constrained between 0 and 100. 
Default: 0"); + /* * memory_hotplug.auto_movable_numa_aware: consider numa node stats */ @@ -1107,8 +1114,13 @@ int __ref __online_pages(unsigned long pfn, unsigned long nr_pages, /* associate pfn range with the zone */ __move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE, phase); - if (phase == MHP_PHASE_PREPARE) - goto adjust_count; + if (phase == MHP_PHASE_PREPARE) { + __adjust_present_page_count(pfn_to_page(pfn), group, nr_pages, + zone, phase); + atomic_long_add(nr_pages, &zone->deferred_pages); + mem_hotplug_done(); + return 0; + } arg.start_pfn = pfn; arg.nr_pages = nr_pages; @@ -1139,12 +1151,8 @@ int __ref __online_pages(unsigned long pfn, unsigned long nr_pages, online_pages_range(pfn, nr_pages); -adjust_count: __adjust_present_page_count(pfn_to_page(pfn), group, nr_pages, zone, phase); - if (phase == MHP_PHASE_PREPARE) { - atomic_long_add(nr_pages, &zone->deferred_pages); - goto out; - } else if (phase == MHP_PHASE_DEFERRED) + if (phase == MHP_PHASE_DEFERRED) atomic_long_sub(nr_pages, &zone->deferred_pages); node_states_set_node(nid, &arg); @@ -1173,7 +1181,6 @@ int __ref __online_pages(unsigned long pfn, unsigned long nr_pages, memory_notify(MEM_ONLINE, &arg); -out: if (need_lock) mem_hotplug_done(); return 0; @@ -1194,6 +1201,132 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, { return __online_pages(pfn, nr_pages, zone, group, MHP_PHASE_DEFAULT); } + +static int deferred_memory_block_online_pages(struct memory_block *mem, + void *arg) +{ + unsigned long start_pfn, nr_pages; + unsigned long nr_vmemmap_pages; + struct zone *zone; + int ret; + + /* Continue if struct pages initialization need to be deferred */ + if (memhp_default_online_type == MMOP_OFFLINE || + mem->state == MEM_ONLINE || !mem->deferred_zone || + atomic_cmpxchg(&mem->deferred_state, MEM_NEED_DEFER, + MEM_SKIP_DEFER) != MEM_NEED_DEFER) + return 0; + + zone = mem->deferred_zone; + mem->deferred_zone = NULL; + + start_pfn = section_nr_to_pfn(mem->start_section_nr); + nr_pages = memory_block_size_bytes() >> PAGE_SHIFT; + nr_vmemmap_pages = mem->nr_vmemmap_pages; + + ret = __online_pages(start_pfn + nr_vmemmap_pages, + nr_pages - nr_vmemmap_pages, zone, mem->group, + MHP_PHASE_DEFERRED); + if (ret) { + if (nr_vmemmap_pages) + mhp_deinit_memmap_on_memory(start_pfn, + nr_vmemmap_pages); + return ret; + } + + mem->state = MEM_ONLINE; + return 0; +} + +struct deferred_walk_memory_blocks_work { + struct work_struct work; + u64 start; + u64 size; + int ret; +}; + +static void deferred_walk_memory_blocks_worker(struct work_struct *work) +{ + struct deferred_walk_memory_blocks_work *w = container_of( + work, struct deferred_walk_memory_blocks_work, work); + + w->ret = walk_memory_blocks(w->start, w->size, NULL, + deferred_memory_block_online_pages); +} + +int __ref deferred_online_memory(int nid, u64 start, u64 size) +{ + struct pglist_data *pgdat = NODE_DATA(nid); + int i, ret = 0; + struct workqueue_struct *wq; + struct deferred_walk_memory_blocks_work *ws, *w; + const struct cpumask *cpumask; + u64 chunk_start = start; + u64 chunk_size, chunk_num, chunk_remain; + + if (!parallel_hotplug_ratio) + return -EINVAL; + + wq = pgdat->deferred_hotplug_wq; + if (!wq) { + pr_warn("Deferred hotplug work queue is not initialized for node %d\n", + nid); + goto sequential; + } + + cpumask = cpumask_of_node(nid); + /* + * The number of parallel workers (chunk_num) should be less than + * or equal to the maximum number of CPUs on the node. 
+ * And the memory size handled by each worker needs to be aligned + * with the memory block size. + */ + chunk_num = + max_t(uint, 1, + max_t(uint, cpumask_weight(cpumask), 1) * + min_t(uint, parallel_hotplug_ratio, 100) / 100); + chunk_size = ALIGN(size / chunk_num, memory_block_size_bytes()); + chunk_num = size / chunk_size; + chunk_remain = size % chunk_size; + + if (chunk_num == 1) + goto sequential; + + ws = kmalloc_array_node(chunk_num, sizeof(*ws), GFP_KERNEL, nid); + if (!ws) + goto sequential; + + for (i = 0; i < chunk_num; i++) { + w = ws + i; + INIT_WORK(&w->work, deferred_walk_memory_blocks_worker); + w->start = chunk_start; + if (i == chunk_num - 1) + w->size = chunk_size + chunk_remain; + else + w->size = chunk_size; + chunk_start += w->size; + queue_work_node(nid, wq, &w->work); + } + + flush_workqueue(wq); + + for (i = 0; i < chunk_num; i++) { + w = ws + i; + if (w->ret) { + ret = w->ret; + pr_err("Deferred online memory failed for node %d, start: %#llx, size: %#llx, ret: %d\n", + nid, w->start, w->size, ret); + break; + } + } + kfree(ws); + return ret; + +sequential: + return walk_memory_blocks(start, size, NULL, + deferred_memory_block_online_pages); +} +EXPORT_SYMBOL_GPL(deferred_online_memory); #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ static void reset_node_present_pages(pg_data_t *pgdat) -- Gitee From dd774e14f7efc3a38f04f213367e498bdcd583a3 Mon Sep 17 00:00:00 2001 From: Yang Rong Date: Mon, 24 Feb 2025 11:30:33 +0800 Subject: [PATCH 6/6] anolis: virtio-mem: apply parallel deferred memory online ANBZ: #18841 We applied parallel deferred memory online to virtio-mem and observed a significant speedup in the hot-plug process. We conducted an experiment to hotplug 400G of memory, and the results were as follows: - Before applying the patch: - Total Time = Origin Hotplug Time = 5537ms (72.24 GB/s) - After applying the patch (with `parallel_hotplug_ratio=80`): - Origin Hotplug Time = 178ms - Deferred Parallel Hotplug Time = 1200ms - Total Time = 1378ms (76% reduction, 290.28 GB/s) Lastly, there's an issue regarding the guest's plug request to the VMM. The VMM relies on the plug requests sent by the guest to determine the size of the hot-plugged memory. Therefore, we should defer the sending of the plug requests after the memory has been actually onlined. Signed-off-by: Yang Rong --- drivers/virtio/virtio_mem.c | 174 +++++++++++++++++++++++++++++++++--- 1 file changed, 160 insertions(+), 14 deletions(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index c3fa9e2211d1..83841c7801f9 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -598,6 +598,15 @@ static bool virtio_mem_could_add_memory(struct virtio_mem *vm, uint64_t size) if (WARN_ON_ONCE(size > vm->offline_threshold)) return false; + /* + * TODO: If memory online is deferred, offiine_size will exceed offline_threashold + * immediately. However, even if we hotplug 400G memory on a machine with only + * 256M boot memory, OOM is still not triggered. So in most cases, adding memory + * is okay. We may have a better way to deal with it in the future. + */ + if (parallel_hotplug_ratio) + return true; + return atomic64_read(&vm->offline_size) + size <= vm->offline_threshold; } @@ -1456,14 +1465,16 @@ static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm) * of the memory block. 
*/ static int virtio_mem_sbm_plug_sb(struct virtio_mem *vm, unsigned long mb_id, - int sb_id, int count) + int sb_id, int count, bool skip_send_req) { const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) + sb_id * vm->sbm.sb_size; const uint64_t size = count * vm->sbm.sb_size; - int rc; + int rc = 0; - rc = virtio_mem_send_plug_request(vm, addr, size); + /* memory not onlined yet, so we also need defer the request. */ + if (!skip_send_req) + rc = virtio_mem_send_plug_request(vm, addr, size); if (!rc) virtio_mem_sbm_set_sb_plugged(vm, mb_id, sb_id, count); return rc; @@ -1613,7 +1624,7 @@ static int virtio_mem_sbm_plug_and_add_mb(struct virtio_mem *vm, * Plug the requested number of subblocks before adding it to linux, * so that onlining will directly online all plugged subblocks. */ - rc = virtio_mem_sbm_plug_sb(vm, mb_id, 0, count); + rc = virtio_mem_sbm_plug_sb(vm, mb_id, 0, count, parallel_hotplug_ratio); if (rc) return rc; @@ -1672,7 +1683,7 @@ static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm, !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id + count, 1)) count++; - rc = virtio_mem_sbm_plug_sb(vm, mb_id, sb_id, count); + rc = virtio_mem_sbm_plug_sb(vm, mb_id, sb_id, count, false); if (rc) return rc; *nb_sb -= count; @@ -1692,6 +1703,57 @@ static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm, return 0; } +struct deferred_mb_range { + unsigned long start_id; + unsigned long end_id; +}; + +struct deferred_mb_range_list { + struct deferred_mb_range *ranges; + unsigned long size; + unsigned long capacity; + int nid; +}; + +#define deferred_mb_range_list_for_each(_i, _ranges, _start, _end) \ + for (_i = 0; \ + _i < _ranges.size && (_start = _ranges.ranges[_i].start_id, \ + _end = _ranges.ranges[_i].end_id, true); \ + _i++) + +static int deferred_mb_range_list_add(struct deferred_mb_range_list *rs, + unsigned long mb_id) +{ + struct deferred_mb_range *new_ranges; + + if (!rs) + return -EINVAL; + + if (rs->size && rs->ranges && + rs->ranges[rs->size - 1].end_id + 1 == mb_id) { + rs->ranges[rs->size - 1].end_id = mb_id; + } else { + if (rs->size == rs->capacity) { + rs->capacity++; + new_ranges = kmalloc_array_node(rs->capacity, + sizeof(*rs->ranges), GFP_KERNEL, rs->nid); + if (!new_ranges) + return -ENOMEM; + if (rs->ranges) { + memcpy(new_ranges, rs->ranges, + rs->size * sizeof(*rs->ranges)); + kfree(rs->ranges); + } + rs->ranges = new_ranges; + } + rs->ranges[rs->size++] = (struct deferred_mb_range){ + .start_id = mb_id, + .end_id = mb_id, + }; + } + return 0; +} + static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff) { const int mb_states[] = { @@ -1701,6 +1763,17 @@ static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff) }; uint64_t nb_sb = diff / vm->sbm.sb_size; unsigned long mb_id; + struct deferred_mb_range_list rs = { + .ranges = NULL, + .size = 0, + .capacity = 0, + .nid = vm->nid, + }; + unsigned long sid, eid; + uint64_t addr, size; + /* Last deferred memory block may not plug all subblocks */ + uint64_t part_nb_sb = 0; + unsigned long timestamp; int rc, i; if (!nb_sb) @@ -1726,32 +1799,87 @@ static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff) /* Try to plug and add unused blocks */ virtio_mem_sbm_for_each_mb(vm, mb_id, VIRTIO_MEM_SBM_MB_UNUSED) { - if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes())) - return -ENOSPC; + if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes())) { + rc = -ENOSPC; + goto out_free; + } + if (!nb_sb) + break; + if (parallel_hotplug_ratio) { + if 
(nb_sb < vm->sbm.sbs_per_mb) + part_nb_sb = nb_sb; + rc = deferred_mb_range_list_add(&rs, mb_id); + if (rc) + goto out_free; + } rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb); - if (rc || !nb_sb) - return rc; + if (rc) + goto out_free; cond_resched(); } /* Try to prepare, plug and add new blocks */ while (nb_sb) { - if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes())) - return -ENOSPC; + if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes())) { + rc = -ENOSPC; + goto out_free; + } rc = virtio_mem_sbm_prepare_next_mb(vm, &mb_id); if (rc) - return rc; + goto out_free; + if (parallel_hotplug_ratio) { + if (nb_sb < vm->sbm.sbs_per_mb) + part_nb_sb = nb_sb; + rc = deferred_mb_range_list_add(&rs, mb_id); + if (rc) + goto out_free; + } rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb); if (rc) - return rc; + goto out_free; cond_resched(); } - return 0; + if (parallel_hotplug_ratio) { + timestamp = jiffies; + deferred_mb_range_list_for_each(i, rs, sid, eid) { + addr = virtio_mem_mb_id_to_phys(sid); + /* Always add complete memory block to Linux */ + size = (eid - sid + 1) * memory_block_size_bytes(); + /* + * Deferred struct pages initialization and + * Deferred free pages to buddy allocator. + */ + rc = deferred_online_memory(vm->nid, addr, size); + if (rc) + goto out_free; + + /* Deferred send plug requests */ + for (mb_id = sid; mb_id <= eid; mb_id++) { + addr = virtio_mem_mb_id_to_phys(mb_id); + if (part_nb_sb && i == rs.size - 1 && + mb_id == eid) + size = part_nb_sb * vm->sbm.sb_size; + else + size = memory_block_size_bytes(); + + rc = virtio_mem_send_plug_request(vm, addr, size); + if (rc) + goto out_free; + } + } + dev_info(&vm->vdev->dev, "deferred time: %ums", + jiffies_to_msecs(jiffies - timestamp)); + } + goto out_free; + out_unlock: mutex_unlock(&vm->hotplug_mutex); +out_free: + if (parallel_hotplug_ratio) + kfree(rs.ranges); return rc; } @@ -2496,6 +2624,8 @@ static int virtio_mem_init(struct virtio_mem *vm) const uint64_t phys_limit = 1UL << MAX_PHYSMEM_BITS; uint64_t sb_size, addr; uint16_t node_id; + struct pglist_data *pgdat; + char deferred_wq_name[24]; if (!vm->vdev->config->get) { dev_err(&vm->vdev->dev, "config access disabled\n"); @@ -2527,6 +2657,22 @@ static int virtio_mem_init(struct virtio_mem *vm) if (vm->nid == NUMA_NO_NODE) vm->nid = memory_add_physaddr_to_nid(vm->addr); + if (parallel_hotplug_ratio) { + pgdat = NODE_DATA(vm->nid); + if (!pgdat->deferred_hotplug_wq) { + snprintf(deferred_wq_name, sizeof(deferred_wq_name), + "deferred_hotplug_wq_%d", vm->nid); + pgdat->deferred_hotplug_wq = + alloc_workqueue(deferred_wq_name, + WQ_UNBOUND | WQ_HIGHPRI, 0); + if (!pgdat->deferred_hotplug_wq) + return -ENOMEM; + dev_info(&vm->vdev->dev, + "deferred workqueue created on node: %d\n", + vm->nid); + } + } + /* bad device setup - warn only */ if (!IS_ALIGNED(vm->addr, memory_block_size_bytes())) dev_warn(&vm->vdev->dev, -- Gitee
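A note on enabling the feature (usage sketch; the parameter prefix assumes the
usual module_param naming for the built-in mm/memory_hotplug.o object):

	memory_hotplug.parallel_hotplug_ratio=80

on the kernel command line, or the same value written to
/sys/module/memory_hotplug/parameters/parallel_hotplug_ratio before the
virtio-mem device is probed. The ratio has to be non-zero at probe time
because the per-node deferred_hotplug_wq is only created in virtio_mem_init()
in that case; if the workqueue is missing, deferred_online_memory() falls back
to the sequential walk_memory_blocks() path. Setting parallel_hotplug_ratio to
0 keeps the original, fully serial online behaviour.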