diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index a82db349a3a8e9ce7a7b247176cdcf888774f253..da25ccba297b312a9eae5345534953f3eea3adc4 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -198,8 +198,22 @@ static int memory_block_online(struct memory_block *mem)
 		if (ret)
 			return ret;
 	}
-
-	ret = online_pages(start_pfn + nr_vmemmap_pages,
+	/*
+	 * Defer struct page initialization and defer freeing pages to the
+	 * buddy allocator from at least the second memory block of the zone
+	 * onwards, as the zone does not need to be rebuilt from that point.
+	 */
+	if (parallel_hotplug_ratio &&
+	    start_pfn + nr_vmemmap_pages >=
+		    zone->zone_start_pfn +
+			    (memory_block_size_bytes() >> PAGE_SHIFT)) {
+		ret = __online_pages(start_pfn + nr_vmemmap_pages,
+				     nr_pages - nr_vmemmap_pages, zone,
+				     mem->group, MHP_PHASE_PREPARE);
+		atomic_set(&mem->deferred_state, MEM_NEED_DEFER);
+		mem->deferred_zone = zone;
+	} else
+		ret = online_pages(start_pfn + nr_vmemmap_pages,
 				   nr_pages - nr_vmemmap_pages, zone, mem->group);
 	if (ret) {
 		if (nr_vmemmap_pages)
@@ -286,7 +300,9 @@ static int memory_block_change_state(struct memory_block *mem,
 		mem->state = MEM_GOING_OFFLINE;
 
 	ret = memory_block_action(mem, to_state);
-	mem->state = ret ? from_state_req : to_state;
+	mem->state =
+		(ret || atomic_read(&mem->deferred_state) == MEM_NEED_DEFER) ?
+			from_state_req : to_state;
 
 	return ret;
 }
@@ -675,6 +691,8 @@ static int init_memory_block(unsigned long block_id, unsigned long state,
 	mem->state = state;
 	mem->nid = NUMA_NO_NODE;
 	mem->nr_vmemmap_pages = nr_vmemmap_pages;
+	atomic_set(&mem->deferred_state, MEM_SKIP_DEFER);
+	mem->deferred_zone = NULL;
 	INIT_LIST_HEAD(&mem->group_next);
 
 	if (group) {
diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
index c3fa9e2211d16c152249e65d719953b9ad8d872e..83841c7801f9d1bb3298ed0f2d69d08f568ccb85 100644
--- a/drivers/virtio/virtio_mem.c
+++ b/drivers/virtio/virtio_mem.c
@@ -598,6 +598,15 @@ static bool virtio_mem_could_add_memory(struct virtio_mem *vm, uint64_t size)
 	if (WARN_ON_ONCE(size > vm->offline_threshold))
 		return false;
 
+	/*
+	 * TODO: If onlining is deferred, offline_size immediately exceeds
+	 * offline_threshold. However, even when hotplugging 400G of memory on
+	 * a machine with only 256M of boot memory, no OOM is triggered, so
+	 * adding memory is fine in most cases. This may be handled better later.
+	 */
+	if (parallel_hotplug_ratio)
+		return true;
+
 	return atomic64_read(&vm->offline_size) + size <= vm->offline_threshold;
 }
 
@@ -1456,14 +1465,16 @@ static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm)
  * of the memory block.
  */
 static int virtio_mem_sbm_plug_sb(struct virtio_mem *vm, unsigned long mb_id,
-				  int sb_id, int count)
+				  int sb_id, int count, bool skip_send_req)
 {
 	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
 			      sb_id * vm->sbm.sb_size;
 	const uint64_t size = count * vm->sbm.sb_size;
-	int rc;
+	int rc = 0;
 
-	rc = virtio_mem_send_plug_request(vm, addr, size);
+	/* memory not onlined yet, so we also need to defer the request. */
+	if (!skip_send_req)
+		rc = virtio_mem_send_plug_request(vm, addr, size);
 	if (!rc)
 		virtio_mem_sbm_set_sb_plugged(vm, mb_id, sb_id, count);
 	return rc;
@@ -1613,7 +1624,7 @@ static int virtio_mem_sbm_plug_and_add_mb(struct virtio_mem *vm,
 	 * Plug the requested number of subblocks before adding it to linux,
 	 * so that onlining will directly online all plugged subblocks.
 	 */
-	rc = virtio_mem_sbm_plug_sb(vm, mb_id, 0, count);
+	rc = virtio_mem_sbm_plug_sb(vm, mb_id, 0, count, parallel_hotplug_ratio);
 	if (rc)
 		return rc;
 
@@ -1672,7 +1683,7 @@ static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm,
 		       !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id + count, 1))
 			count++;
 
-		rc = virtio_mem_sbm_plug_sb(vm, mb_id, sb_id, count);
+		rc = virtio_mem_sbm_plug_sb(vm, mb_id, sb_id, count, false);
 		if (rc)
 			return rc;
 		*nb_sb -= count;
@@ -1692,6 +1703,57 @@ static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm,
 	return 0;
 }
 
+struct deferred_mb_range {
+	unsigned long start_id;
+	unsigned long end_id;
+};
+
+struct deferred_mb_range_list {
+	struct deferred_mb_range *ranges;
+	unsigned long size;
+	unsigned long capacity;
+	int nid;
+};
+
+#define deferred_mb_range_list_for_each(_i, _ranges, _start, _end)	\
+	for (_i = 0;							\
+	     _i < _ranges.size && (_start = _ranges.ranges[_i].start_id, \
+				   _end = _ranges.ranges[_i].end_id, true); \
+	     _i++)
+
+static int deferred_mb_range_list_add(struct deferred_mb_range_list *rs,
+				      unsigned long mb_id)
+{
+	struct deferred_mb_range *new_ranges;
+
+	if (!rs)
+		return -EINVAL;
+
+	if (rs->size && rs->ranges &&
+	    rs->ranges[rs->size - 1].end_id + 1 == mb_id) {
+		rs->ranges[rs->size - 1].end_id = mb_id;
+	} else {
+		if (rs->size == rs->capacity) {
+			rs->capacity++;
+			new_ranges = kmalloc_array_node(rs->capacity,
+					sizeof(*rs->ranges), GFP_KERNEL, rs->nid);
+			if (!new_ranges)
+				return -ENOMEM;
+			if (rs->ranges) {
+				memcpy(new_ranges, rs->ranges,
+				       rs->size * sizeof(*rs->ranges));
+				kfree(rs->ranges);
+			}
+			rs->ranges = new_ranges;
+		}
+		rs->ranges[rs->size++] = (struct deferred_mb_range){
+			.start_id = mb_id,
+			.end_id = mb_id,
+		};
+	}
+	return 0;
+}
+
 static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff)
 {
 	const int mb_states[] = {
@@ -1701,6 +1763,17 @@ static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff)
 	};
 	uint64_t nb_sb = diff / vm->sbm.sb_size;
 	unsigned long mb_id;
+	struct deferred_mb_range_list rs = {
+		.ranges = NULL,
+		.size = 0,
+		.capacity = 0,
+		.nid = vm->nid,
+	};
+	unsigned long sid, eid;
+	uint64_t addr, size;
+	/* The last deferred memory block may not have all of its subblocks plugged */
+	uint64_t part_nb_sb = 0;
+	unsigned long timestamp;
 	int rc, i;
 
 	if (!nb_sb)
@@ -1726,32 +1799,87 @@ static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff)
 
 	/* Try to plug and add unused blocks */
 	virtio_mem_sbm_for_each_mb(vm, mb_id, VIRTIO_MEM_SBM_MB_UNUSED) {
-		if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes()))
-			return -ENOSPC;
+		if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes())) {
+			rc = -ENOSPC;
+			goto out_free;
+		}
+		if (!nb_sb)
+			break;
+		if (parallel_hotplug_ratio) {
+			if (nb_sb < vm->sbm.sbs_per_mb)
+				part_nb_sb = nb_sb;
+			rc = deferred_mb_range_list_add(&rs, mb_id);
+			if (rc)
+				goto out_free;
+		}
 
 		rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb);
-		if (rc || !nb_sb)
-			return rc;
+		if (rc)
+			goto out_free;
 		cond_resched();
 	}
 
 	/* Try to prepare, plug and add new blocks */
 	while (nb_sb) {
-		if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes()))
-			return -ENOSPC;
+		if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes())) {
+			rc = -ENOSPC;
+			goto out_free;
+		}
 
 		rc = virtio_mem_sbm_prepare_next_mb(vm, &mb_id);
 		if (rc)
-			return rc;
+			goto out_free;
+		if (parallel_hotplug_ratio) {
+			if (nb_sb < vm->sbm.sbs_per_mb)
+				part_nb_sb = nb_sb;
+			rc = deferred_mb_range_list_add(&rs, mb_id);
+			if (rc)
+				goto out_free;
+		}
 		rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb);
 		if (rc)
-			return rc;
+			goto out_free;
 		cond_resched();
 	}
 
-	return 0;
+	if (parallel_hotplug_ratio) {
+		timestamp = jiffies;
+		deferred_mb_range_list_for_each(i, rs, sid, eid) {
+			addr = virtio_mem_mb_id_to_phys(sid);
+			/* Always add complete memory blocks to Linux */
+			size = (eid - sid + 1) * memory_block_size_bytes();
+			/*
+			 * Perform the deferred struct page initialization and
+			 * free the deferred pages to the buddy allocator.
+			 */
+			rc = deferred_online_memory(vm->nid, addr, size);
+			if (rc)
+				goto out_free;
+
+			/* Send the deferred plug requests */
+			for (mb_id = sid; mb_id <= eid; mb_id++) {
+				addr = virtio_mem_mb_id_to_phys(mb_id);
+				if (part_nb_sb && i == rs.size - 1 &&
				    mb_id == eid)
+					size = part_nb_sb * vm->sbm.sb_size;
+				else
+					size = memory_block_size_bytes();
+
+				rc = virtio_mem_send_plug_request(vm, addr, size);
+				if (rc)
+					goto out_free;
+			}
+		}
+		dev_info(&vm->vdev->dev, "deferred time: %ums\n",
+			 jiffies_to_msecs(jiffies - timestamp));
+	}
+	goto out_free;
+
 out_unlock:
 	mutex_unlock(&vm->hotplug_mutex);
+out_free:
+	if (parallel_hotplug_ratio)
+		kfree(rs.ranges);
 	return rc;
 }
 
@@ -2496,6 +2624,8 @@ static int virtio_mem_init(struct virtio_mem *vm)
 	const uint64_t phys_limit = 1UL << MAX_PHYSMEM_BITS;
 	uint64_t sb_size, addr;
 	uint16_t node_id;
+	struct pglist_data *pgdat;
+	char deferred_wq_name[24];
 
 	if (!vm->vdev->config->get) {
 		dev_err(&vm->vdev->dev, "config access disabled\n");
@@ -2527,6 +2657,22 @@ static int virtio_mem_init(struct virtio_mem *vm)
 	if (vm->nid == NUMA_NO_NODE)
 		vm->nid = memory_add_physaddr_to_nid(vm->addr);
 
+	if (parallel_hotplug_ratio) {
+		pgdat = NODE_DATA(vm->nid);
+		if (!pgdat->deferred_hotplug_wq) {
+			snprintf(deferred_wq_name, sizeof(deferred_wq_name),
+				 "deferred_hotplug_wq_%d", vm->nid);
+			pgdat->deferred_hotplug_wq =
+				alloc_workqueue(deferred_wq_name,
+						WQ_UNBOUND | WQ_HIGHPRI, 0);
+			if (!pgdat->deferred_hotplug_wq)
+				return -ENOMEM;
+			dev_info(&vm->vdev->dev,
+				 "deferred workqueue created on node: %d\n",
+				 vm->nid);
+		}
+	}
+
 	/* bad device setup - warn only */
 	if (!IS_ALIGNED(vm->addr, memory_block_size_bytes()))
 		dev_warn(&vm->vdev->dev,
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 4ddc3b960ae95f266fedcfa249949552695d6bac..567682ce4c26d70eb62df97d90857a670a0bff7a 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -65,6 +65,10 @@ struct memory_group {
 	};
 };
 
+/* Memory block defer state flags */
+#define MEM_SKIP_DEFER	0
+#define MEM_NEED_DEFER	1
+
 struct memory_block {
 	unsigned long start_section_nr;
 	unsigned long state;		/* serialized by the dev->lock */
@@ -76,6 +80,12 @@ struct memory_block {
 	 * lay at the beginning of the memory block.
 	 */
 	unsigned long nr_vmemmap_pages;
+	/*
+	 * Whether struct page initialization and freeing pages to the buddy
+	 * allocator need to be deferred for this block.
+	 */
+	atomic_t deferred_state;
+	struct zone *deferred_zone;	/* zone for this deferred block */
 	struct memory_group *group;	/* group (if any) for this block */
 	struct list_head group_next;	/* next block inside memory group */
 };
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 26b4ac8c59a72b4ea1b01f7ddffa534d5a51085f..b5d5b1b82c617b3ee993a93ac27b1428ad7cbb03 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -73,6 +73,9 @@ static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat)
 #endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */
 
 #ifdef CONFIG_MEMORY_HOTPLUG
+#define MHP_PHASE_PREPARE	1
+#define MHP_PHASE_DEFERRED	2
+#define MHP_PHASE_DEFAULT	3
 /*
  * Return page for the valid pfn only if the page is online. All pfn
  * walkers which rely on the fully initialized page->flags and others
@@ -180,6 +183,9 @@ extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
 extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages);
 extern int online_pages(unsigned long pfn, unsigned long nr_pages,
 			struct zone *zone, struct memory_group *group);
+extern int __online_pages(unsigned long pfn, unsigned long nr_pages,
+			  struct zone *zone, struct memory_group *group,
+			  int phase);
 extern struct zone *test_pages_in_a_zone(unsigned long start_pfn,
 					 unsigned long end_pfn);
 extern void __offline_isolated_pages(unsigned long start_pfn,
@@ -200,6 +206,7 @@ extern u64 max_mem_size;
 
 extern int memhp_online_type_from_str(const char *str);
 extern bool skip_set_contiguous;
+extern unsigned int parallel_hotplug_ratio;
 /* Default online_type (MMOP_*) when new memory blocks are added. */
 extern int memhp_default_online_type;
 /* If movable_node boot option specified */
@@ -356,6 +363,7 @@ extern struct zone *zone_for_pfn_range(int online_type, int nid,
 				       struct memory_group *group,
 				       unsigned long start_pfn,
 				       unsigned long nr_pages);
 extern bool mhp_supports_memmap_on_memory(unsigned long size);
+extern int deferred_online_memory(int nid, u64 start, u64 size);
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 #endif /* __LINUX_MEMORY_HOTPLUG_H */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 3b4ff5685af72a2634d5fafb19d62f4e10ca0978..16e35e24cc589c0432fd49879937774de2f027de 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -873,6 +873,7 @@ struct zone {
 	atomic_long_t		vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
 
 	unsigned long		reported_pages;
+	atomic_long_t		deferred_pages;
 
 	CK_KABI_RESERVE(1)
 	CK_KABI_RESERVE(2)
@@ -895,6 +896,11 @@ static inline unsigned long zone_managed_pages(struct zone *zone)
 	return (unsigned long)atomic_long_read(&zone->managed_pages);
 }
 
+static inline unsigned long zone_deferred_pages(struct zone *zone)
+{
+	return (unsigned long)atomic_long_read(&zone->deferred_pages);
+}
+
 static inline unsigned long zone_cma_pages(struct zone *zone)
 {
 #ifdef CONFIG_CMA
@@ -1046,6 +1052,13 @@ typedef struct pglist_data {
 	 * Nests above zone->lock and zone->span_seqlock
 	 */
 	spinlock_t node_size_lock;
+#endif
+#ifdef CONFIG_MEMORY_HOTPLUG
+	/*
+	 * This workqueue handles the deferred initialization of struct pages
+	 * for hotplugged memory.
+	 */
+	struct workqueue_struct *deferred_hotplug_wq;
 #endif
 	unsigned long node_start_pfn;
 	unsigned long node_present_pages; /* total number of physical pages */
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b91c0806228a22b0e0cc1f8a09c0a50e06d244dd..fcefbe5978e7020853ad7c25395a6507d8ea58c9 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -122,6 +122,13 @@ bool skip_set_contiguous __read_mostly;
 module_param(skip_set_contiguous, bool, 0644);
 MODULE_PARM_DESC(skip_set_contiguous, "Do not set zone contiguous when online/offline pages");
 
+unsigned int parallel_hotplug_ratio __read_mostly;
+EXPORT_SYMBOL_GPL(parallel_hotplug_ratio);
+module_param(parallel_hotplug_ratio, uint, 0644);
+MODULE_PARM_DESC(parallel_hotplug_ratio,
+	"Percentage of the node's CPUs to use as parallel hotplug workers, "
+	"clamped between 0 and 100. Default: 0 (parallel onlining disabled)");
+
 /*
  * memory_hotplug.auto_movable_numa_aware: consider numa node stats
  */
@@ -660,37 +667,31 @@ static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned lon
 	pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) -
 					pgdat->node_start_pfn;
 }
-/*
- * Associate the pfn range with the given zone, initializing the memmaps
- * and resizing the pgdat/zone data to span the added pages. After this
- * call, all affected pages are PG_reserved.
- *
- * All aligned pageblocks are initialized to the specified migratetype
- * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
- * zone stats (e.g., nr_isolate_pageblock) are touched.
- */
-void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
-				  unsigned long nr_pages,
-				  struct vmem_altmap *altmap, int migratetype)
+
+void __ref __move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
+				    unsigned long nr_pages, struct vmem_altmap *altmap,
+				    int migratetype, int phase)
 {
 	struct pglist_data *pgdat = zone->zone_pgdat;
 	int nid = pgdat->node_id;
 	unsigned long flags;
 
+	if (phase == MHP_PHASE_DEFAULT || phase == MHP_PHASE_PREPARE) {
 #ifdef KIDLED_AGE_NOT_IN_PAGE_FLAGS
-	kidled_free_page_age(pgdat);
+		kidled_free_page_age(pgdat);
 #endif
-	clear_zone_contiguous(zone);
-
-	/* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */
-	pgdat_resize_lock(pgdat, &flags);
-	zone_span_writelock(zone);
-	if (zone_is_empty(zone))
-		init_currently_empty_zone(zone, start_pfn, nr_pages);
-	resize_zone_range(zone, start_pfn, nr_pages);
-	zone_span_writeunlock(zone);
-	resize_pgdat_range(pgdat, start_pfn, nr_pages);
-	pgdat_resize_unlock(pgdat, &flags);
+		clear_zone_contiguous(zone);
+
+		/* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */
+		pgdat_resize_lock(pgdat, &flags);
+		zone_span_writelock(zone);
+		if (zone_is_empty(zone))
+			init_currently_empty_zone(zone, start_pfn, nr_pages);
+		resize_zone_range(zone, start_pfn, nr_pages);
+		zone_span_writeunlock(zone);
+		resize_pgdat_range(pgdat, start_pfn, nr_pages);
+		pgdat_resize_unlock(pgdat, &flags);
+	}
 
 	/*
 	 * TODO now we have a visible range of pages which are not associated
@@ -698,10 +699,29 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
 	 * expects the zone spans the pfn range. All the pages in the range
 	 * are reserved so nobody should be touching them so we should be safe
 	 */
-	memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, 0,
+	if (phase == MHP_PHASE_DEFAULT || phase == MHP_PHASE_DEFERRED)
+		memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, 0,
 			 MEMINIT_HOTPLUG, altmap, migratetype);
 
-	set_zone_contiguous(zone);
+	if (phase == MHP_PHASE_DEFAULT || phase == MHP_PHASE_PREPARE)
+		set_zone_contiguous(zone);
+}
+
+/*
+ * Associate the pfn range with the given zone, initializing the memmaps
+ * and resizing the pgdat/zone data to span the added pages. After this
+ * call, all affected pages are PG_reserved.
+ *
+ * All aligned pageblocks are initialized to the specified migratetype
+ * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
+ * zone stats (e.g., nr_isolate_pageblock) are touched.
+ */
+void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
+				  unsigned long nr_pages,
+				  struct vmem_altmap *altmap, int migratetype)
+{
+	__move_pfn_range_to_zone(zone, start_pfn, nr_pages, altmap, migratetype,
+				 MHP_PHASE_DEFAULT);
 }
 
 struct auto_movable_stats {
@@ -713,7 +733,8 @@ static void auto_movable_stats_account_zone(struct auto_movable_stats *stats,
 					    struct zone *zone)
 {
 	if (zone_idx(zone) == ZONE_MOVABLE) {
-		stats->movable_pages += zone->present_pages;
+		stats->movable_pages +=
+			zone->present_pages + zone_deferred_pages(zone);
 	} else {
 		stats->kernel_early_pages += zone->present_early_pages;
 #ifdef CONFIG_CMA
@@ -979,6 +1000,33 @@ struct zone *zone_for_pfn_range(int online_type, int nid,
 	return default_zone_for_pfn(nid, start_pfn, nr_pages);
 }
 
+void __adjust_present_page_count(struct page *page, struct memory_group *group,
+				 long nr_pages, struct zone *zone, int phase)
+{
+	const bool movable = zone_idx(zone) == ZONE_MOVABLE;
+	unsigned long flags;
+
+	if (phase == MHP_PHASE_DEFAULT || phase == MHP_PHASE_DEFERRED) {
+		/*
+		 * We only support onlining/offlining/adding/removing of complete
+		 * memory blocks; therefore, either all is either early or hotplugged.
+		 */
+		if (early_section(__pfn_to_section(page_to_pfn(page))))
+			zone->present_early_pages += nr_pages;
+		zone->present_pages += nr_pages;
+		pgdat_resize_lock(zone->zone_pgdat, &flags);
+		zone->zone_pgdat->node_present_pages += nr_pages;
+		pgdat_resize_unlock(zone->zone_pgdat, &flags);
+	}
+
+	if (phase == MHP_PHASE_DEFAULT || phase == MHP_PHASE_PREPARE) {
+		if (group && movable)
+			group->present_movable_pages += nr_pages;
+		else if (group && !movable)
+			group->present_kernel_pages += nr_pages;
+	}
+}
+
 /*
  * This function should only be called by memory_block_{online,offline},
  * and {online,offline}_pages.
@@ -987,24 +1035,8 @@ void adjust_present_page_count(struct page *page, struct memory_group *group,
 			       long nr_pages)
 {
 	struct zone *zone = page_zone(page);
-	const bool movable = zone_idx(zone) == ZONE_MOVABLE;
-	unsigned long flags;
-
-	/*
-	 * We only support onlining/offlining/adding/removing of complete
-	 * memory blocks; therefore, either all is either early or hotplugged.
-	 */
-	if (early_section(__pfn_to_section(page_to_pfn(page))))
-		zone->present_early_pages += nr_pages;
-	zone->present_pages += nr_pages;
-	pgdat_resize_lock(zone->zone_pgdat, &flags);
-	zone->zone_pgdat->node_present_pages += nr_pages;
-	pgdat_resize_unlock(zone->zone_pgdat, &flags);
-	if (group && movable)
-		group->present_movable_pages += nr_pages;
-	else if (group && !movable)
-		group->present_kernel_pages += nr_pages;
+	__adjust_present_page_count(page, group, nr_pages, zone, MHP_PHASE_DEFAULT);
 }
 
 int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
@@ -1053,14 +1085,16 @@ void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages)
 	kasan_remove_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
 }
 
-int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
-		       struct zone *zone, struct memory_group *group)
+int __ref __online_pages(unsigned long pfn, unsigned long nr_pages,
+			 struct zone *zone, struct memory_group *group,
+			 int phase)
 {
 	unsigned long flags;
 	int need_zonelists_rebuild = 0;
 	const int nid = zone_to_nid(zone);
 	int ret;
 	struct memory_notify arg;
+	bool need_lock = phase == MHP_PHASE_DEFAULT || phase == MHP_PHASE_PREPARE;
 
 	/*
 	 * {on,off}lining is constrained to full memory sections (or more
@@ -1074,10 +1108,19 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
 			 !IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION)))
 		return -EINVAL;
 
-	mem_hotplug_begin();
+	if (need_lock)
+		mem_hotplug_begin();
 
 	/* associate pfn range with the zone */
-	move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE);
+	__move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE, phase);
+
+	if (phase == MHP_PHASE_PREPARE) {
+		__adjust_present_page_count(pfn_to_page(pfn), group, nr_pages,
+					    zone, phase);
+		atomic_long_add(nr_pages, &zone->deferred_pages);
+		mem_hotplug_done();
+		return 0;
+	}
 
 	arg.start_pfn = pfn;
 	arg.nr_pages = nr_pages;
@@ -1107,7 +1150,10 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
 	}
 
 	online_pages_range(pfn, nr_pages);
-	adjust_present_page_count(pfn_to_page(pfn), group, nr_pages);
+
+	__adjust_present_page_count(pfn_to_page(pfn), group, nr_pages, zone, phase);
+	if (phase == MHP_PHASE_DEFERRED)
+		atomic_long_sub(nr_pages, &zone->deferred_pages);
 
 	node_states_set_node(nid, &arg);
 	if (need_zonelists_rebuild)
@@ -1134,7 +1180,9 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
 	writeback_set_ratelimit();
 
 	memory_notify(MEM_ONLINE, &arg);
-	mem_hotplug_done();
+
+	if (need_lock)
+		mem_hotplug_done();
 	return 0;
 
 failed_addition:
@@ -1143,9 +1191,142 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
 		 (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1);
 	memory_notify(MEM_CANCEL_ONLINE, &arg);
 	remove_pfn_range_from_zone(zone, pfn, nr_pages);
-	mem_hotplug_done();
+	if (need_lock)
+		mem_hotplug_done();
+	return ret;
+}
+
+int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
+		       struct zone *zone, struct memory_group *group)
+{
+	return __online_pages(pfn, nr_pages, zone, group, MHP_PHASE_DEFAULT);
+}
+
+static int deferred_memory_block_online_pages(struct memory_block *mem,
+					      void *arg)
+{
+	unsigned long start_pfn, nr_pages;
+	unsigned long nr_vmemmap_pages;
+	struct zone *zone;
+	int ret;
+
+	/* Only process blocks whose struct page initialization was deferred */
+	if (memhp_default_online_type == MMOP_OFFLINE ||
+	    mem->state == MEM_ONLINE || !mem->deferred_zone ||
+	    atomic_cmpxchg(&mem->deferred_state, MEM_NEED_DEFER,
MEM_NEED_DEFER) + return 0; + + zone = mem->deferred_zone; + mem->deferred_zone = NULL; + + start_pfn = section_nr_to_pfn(mem->start_section_nr); + nr_pages = memory_block_size_bytes() >> PAGE_SHIFT; + nr_vmemmap_pages = mem->nr_vmemmap_pages; + + ret = __online_pages(start_pfn + nr_vmemmap_pages, + nr_pages - nr_vmemmap_pages, zone, mem->group, + MHP_PHASE_DEFERRED); + if (ret) { + if (nr_vmemmap_pages) + mhp_deinit_memmap_on_memory(start_pfn, + nr_vmemmap_pages); + return ret; + } + + mem->state = MEM_ONLINE; + return 0; +} + +struct deferred_walk_memory_blocks_work { + struct work_struct work; + u64 start; + u64 size; + int ret; +}; + +static void deferred_walk_memory_blocks_worker(struct work_struct *work) +{ + struct deferred_walk_memory_blocks_work *w = container_of( + work, struct deferred_walk_memory_blocks_work, work); + + w->ret = walk_memory_blocks(w->start, w->size, NULL, + deferred_memory_block_online_pages); +} + +int __ref deferred_online_memory(int nid, u64 start, u64 size) +{ + struct pglist_data *pgdat = NODE_DATA(nid); + int i, ret = 0; + struct workqueue_struct *wq; + struct deferred_walk_memory_blocks_work *ws, *w; + const struct cpumask *cpumask; + u64 chunk_start = start; + u64 chunk_size, chunk_num, chunk_remain; + + if (!parallel_hotplug_ratio) + return -EINVAL; + + wq = pgdat->deferred_hotplug_wq; + if (!wq) { + pr_warn("Deferred hotplug work queue is not initialized for node %d\n", + nid); + goto sequential; + } + + cpumask = cpumask_of_node(nid); + /* + * The number of parallel workers (chunk_num) should be less than + * or equal to the maximum number of CPUs on the node. + * And the memory size handled by each worker needs to be aligned + * with the memory block size. + */ + chunk_num = + max_t(uint, 1, + max_t(uint, cpumask_weight(cpumask), 1) * + min_t(uint, parallel_hotplug_ratio, 100) / 100); + chunk_size = ALIGN(size / chunk_num, memory_block_size_bytes()); + chunk_num = size / chunk_size; + chunk_remain = size % chunk_size; + + if (chunk_num == 1) + goto sequential; + + ws = kmalloc_array_node(chunk_num, sizeof(*ws), GFP_KERNEL, nid); + if (!ws) + goto sequential; + + for (i = 0; i < chunk_num; i++) { + w = ws + i; + INIT_WORK(&w->work, deferred_walk_memory_blocks_worker); + w->start = chunk_start; + if (i == chunk_num - 1) + w->size = chunk_size + chunk_remain; + else + w->size = chunk_size; + chunk_start += w->size; + queue_work_node(nid, wq, &w->work); + } + + flush_workqueue(wq); + + for (i = 0; i < chunk_num; i++) { + w = ws + i; + if (w->ret) { + ret = w->ret; + pr_err("Deferred online memory failed for node %d, start: %#llx, size: %#llx, ret: %d\n", + nid, w->start, w->size, ret); + break; + } + } + kfree(ws); return ret; + +sequential: + return walk_memory_blocks(start, size, NULL, + deferred_memory_block_online_pages); } +EXPORT_SYMBOL_GPL(deferred_online_memory); #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ static void reset_node_present_pages(pg_data_t *pgdat) @@ -1158,6 +1339,14 @@ static void reset_node_present_pages(pg_data_t *pgdat) pgdat->node_present_pages = 0; } +static void reset_node_deferred_pages(pg_data_t *pgdat) +{ + struct zone *z; + + for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) + atomic_long_set(&z->deferred_pages, 0); +} + /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ static pg_data_t __ref *hotadd_init_pgdat(int nid) { @@ -1188,6 +1377,7 @@ static pg_data_t __ref *hotadd_init_pgdat(int nid) */ reset_node_managed_pages(pgdat); reset_node_present_pages(pgdat); + 
+	reset_node_deferred_pages(pgdat);
 
 	return pgdat;
 }
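
Reviewer note (not part of the patch): the stand-alone user-space sketch below only illustrates the chunk-splitting arithmetic that deferred_online_memory() performs before queueing its workers. It is not kernel code, and the memory block size, per-node CPU count, hot-added size, and ratio used here are assumed example values, not values taken from the patch.

#include <stdio.h>

#define MEMORY_BLOCK_SIZE	(128ULL << 20)	/* assumed 128 MiB memory blocks */

static unsigned long long align_up(unsigned long long x, unsigned long long a)
{
	return (x + a - 1) / a * a;
}

int main(void)
{
	unsigned long long size = 400ULL << 30;	/* assumed 400 GiB hot-added */
	unsigned int cpus_on_node = 16;		/* assumed CPUs on the target node */
	unsigned int ratio = 50;		/* assumed parallel_hotplug_ratio */
	unsigned long long chunk_num, chunk_size, chunk_remain;

	/* Workers are a percentage of the node's CPUs, but never fewer than one. */
	chunk_num = (unsigned long long)cpus_on_node *
		    (ratio > 100 ? 100 : ratio) / 100;
	if (!chunk_num)
		chunk_num = 1;

	/* Each worker gets a memory-block-aligned share of the range ... */
	chunk_size = align_up(size / chunk_num, MEMORY_BLOCK_SIZE);
	chunk_num = size / chunk_size;
	/* ... and whatever is left over is folded into the last worker's chunk. */
	chunk_remain = size % chunk_size;

	printf("%llu workers, %llu MiB per chunk, %llu MiB extra for the last one\n",
	       chunk_num, chunk_size >> 20, chunk_remain >> 20);
	return 0;
}

With the assumed inputs this prints 8 workers of 51200 MiB each with no remainder; a range that does not divide evenly would instead report the extra MiB handled by the final worker, mirroring the w->size = chunk_size + chunk_remain case in the patch.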