diff --git a/drivers/Kconfig b/drivers/Kconfig index c9a22b0413034676f56b87906d059952d2fbaa79..826b2b19d0b8608a6423e84ca8cfe262f9139b57 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -10,6 +10,8 @@ source "drivers/pcmcia/Kconfig" source "drivers/rapidio/Kconfig" +source "drivers/hyperhold/Kconfig" + source "drivers/base/Kconfig" source "drivers/bus/Kconfig" diff --git a/drivers/Makefile b/drivers/Makefile index 71da48160b098700d04ef86e7260788f3a9397ed..ecc494918773a322108b6bc095ceae518e1fee2a 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -68,6 +68,9 @@ obj-$(CONFIG_CONNECTOR) += connector/ obj-$(CONFIG_FB_I810) += video/fbdev/i810/ obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ +# Hyperhold driver +obj-$(CONFIG_HYPERHOLD) += hyperhold/ + obj-$(CONFIG_PARPORT) += parport/ obj-$(CONFIG_NVM) += lightnvm/ obj-y += base/ block/ misc/ mfd/ nfc/ diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig index fe7a4b7d30cfe3076d4e8fffbfee51d290366720..69719562f1b2fc8a211286f586f667e79f943c25 100644 --- a/drivers/block/zram/Kconfig +++ b/drivers/block/zram/Kconfig @@ -37,3 +37,5 @@ config ZRAM_MEMORY_TRACKING /sys/kernel/debug/zram/zramX/block_state. See Documentation/admin-guide/blockdev/zram.rst for more information. + +source "drivers/block/zram/zram_group/Kconfig" diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile index de9e457907b1e9834937df323413bd11d18f5d5c..a8947f7faa980f96ce88ee9ae1d8278761175435 100644 --- a/drivers/block/zram/Makefile +++ b/drivers/block/zram/Makefile @@ -1,4 +1,9 @@ # SPDX-License-Identifier: GPL-2.0-only zram-y := zcomp.o zram_drv.o +zram-$(CONFIG_ZRAM_GROUP) += zram_group/zram_group.o zram_group/zlist.o zram_group/group_writeback.o + obj-$(CONFIG_ZRAM) += zram.o + +ccflags-$(CONFIG_ZRAM_GROUP) += -I$(srctree)/drivers/block/zram/zram_group/ +ccflags-$(CONFIG_HYPERHOLD) += -I$(srctree)/drivers/hyperhold/ diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 7dce17fd59baaec34fa045223cc58e3eebd51afd..8751ba2f63f2fcd80750fccb9fcd376a57affdd5 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -35,6 +35,10 @@ #include #include +#ifdef CONFIG_ZRAM_GROUP +#include +#endif + #include "zram_drv.h" static DEFINE_IDR(zram_index_idr); @@ -59,22 +63,6 @@ static void zram_free_page(struct zram *zram, size_t index); static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, u32 index, int offset, struct bio *bio); - -static int zram_slot_trylock(struct zram *zram, u32 index) -{ - return bit_spin_trylock(ZRAM_LOCK, &zram->table[index].flags); -} - -static void zram_slot_lock(struct zram *zram, u32 index) -{ - bit_spin_lock(ZRAM_LOCK, &zram->table[index].flags); -} - -static void zram_slot_unlock(struct zram *zram, u32 index) -{ - bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags); -} - static inline bool init_done(struct zram *zram) { return zram->disksize; @@ -85,35 +73,6 @@ static inline struct zram *dev_to_zram(struct device *dev) return (struct zram *)dev_to_disk(dev)->private_data; } -static unsigned long zram_get_handle(struct zram *zram, u32 index) -{ - return zram->table[index].handle; -} - -static void zram_set_handle(struct zram *zram, u32 index, unsigned long handle) -{ - zram->table[index].handle = handle; -} - -/* flag operations require table entry bit_spin_lock() being held */ -static bool zram_test_flag(struct zram *zram, u32 index, - enum zram_pageflags flag) -{ - return zram->table[index].flags & BIT(flag); -} - -static void zram_set_flag(struct zram 
*zram, u32 index, - enum zram_pageflags flag) -{ - zram->table[index].flags |= BIT(flag); -} - -static void zram_clear_flag(struct zram *zram, u32 index, - enum zram_pageflags flag) -{ - zram->table[index].flags &= ~BIT(flag); -} - static inline void zram_set_element(struct zram *zram, u32 index, unsigned long element) { @@ -125,19 +84,6 @@ static unsigned long zram_get_element(struct zram *zram, u32 index) return zram->table[index].element; } -static size_t zram_get_obj_size(struct zram *zram, u32 index) -{ - return zram->table[index].flags & (BIT(ZRAM_FLAG_SHIFT) - 1); -} - -static void zram_set_obj_size(struct zram *zram, - u32 index, size_t size) -{ - unsigned long flags = zram->table[index].flags >> ZRAM_FLAG_SHIFT; - - zram->table[index].flags = (flags << ZRAM_FLAG_SHIFT) | size; -} - static inline bool zram_allocated(struct zram *zram, u32 index) { return zram_get_obj_size(zram, index) || @@ -1135,6 +1081,65 @@ static DEVICE_ATTR_RO(bd_stat); #endif static DEVICE_ATTR_RO(debug_stat); +#ifdef CONFIG_ZRAM_GROUP +static ssize_t group_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct zram *zram = dev_to_zram(dev); + + down_read(&zram->init_lock); + if (zram->zgrp_ctrl == ZGRP_NONE) + strcpy(buf, "disable\n"); + else if (zram->zgrp_ctrl == ZGRP_TRACK) + strcpy(buf, "readonly\n"); +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + else if (zram->zgrp_ctrl == ZGRP_WRITE) + strcpy(buf, "readwrite"); +#endif + up_read(&zram->init_lock); + + return strlen(buf); +} + +static ssize_t group_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct zram *zram = dev_to_zram(dev); + int ret; +#ifdef CONFIG_ZRAM_GROUP_DEBUG + u32 op, gid, index; + + ret = sscanf(buf, "%u %u %u", &op, &index, &gid); + if (ret == 3) { + pr_info("op[%u] index[%u] gid[%u].\n", op, index, gid); + group_debug(zram, op, index, gid); + return len; + } +#endif + + ret = len; + down_write(&zram->init_lock); + if (init_done(zram)) { + pr_info("Can't setup group ctrl for initialized device!\n"); + ret = -EBUSY; + goto out; + } + if (!strcmp(buf, "disable\n")) + zram->zgrp_ctrl = ZGRP_NONE; + else if (!strcmp(buf, "readonly\n")) + zram->zgrp_ctrl = ZGRP_TRACK; +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + else if (!strcmp(buf, "readwrite\n")) + zram->zgrp_ctrl = ZGRP_WRITE; +#endif + else + ret = -EINVAL; +out: + up_write(&zram->init_lock); + + return ret; +} +#endif + static void zram_meta_free(struct zram *zram, u64 disksize) { size_t num_pages = disksize >> PAGE_SHIFT; @@ -1146,6 +1151,9 @@ static void zram_meta_free(struct zram *zram, u64 disksize) zs_destroy_pool(zram->mem_pool); vfree(zram->table); +#ifdef CONFIG_ZRAM_GROUP + zram_group_deinit(zram); +#endif } static bool zram_meta_alloc(struct zram *zram, u64 disksize) @@ -1165,6 +1173,10 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) if (!huge_class_size) huge_class_size = zs_huge_class_size(zram->mem_pool); +#ifdef CONFIG_ZRAM_GROUP + zram_group_init(zram, num_pages); +#endif + return true; } @@ -1177,6 +1189,10 @@ static void zram_free_page(struct zram *zram, size_t index) { unsigned long handle; +#ifdef CONFIG_ZRAM_GROUP + zram_group_untrack_obj(zram, index); +#endif + #ifdef CONFIG_ZRAM_MEMORY_TRACKING zram->table[index].ac_time = 0; #endif @@ -1242,7 +1258,20 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index, zram_get_element(zram, index), bio, partial_io); } +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + if (!bio) { + ret = zram_group_fault_obj(zram, index); + if (ret) { + 
zram_slot_unlock(zram, index); + return ret; + } + } + if (zram_test_flag(zram, index, ZRAM_GWB)) { + zram_slot_unlock(zram, index); + return -EIO; + } +#endif handle = zram_get_handle(zram, index); if (!handle || zram_test_flag(zram, index, ZRAM_SAME)) { unsigned long value; @@ -1425,6 +1454,9 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, zram_set_handle(zram, index, handle); zram_set_obj_size(zram, index, comp_len); } +#ifdef CONFIG_ZRAM_GROUP + zram_group_track_obj(zram, index, page->mem_cgroup); +#endif zram_slot_unlock(zram, index); /* Update stats */ @@ -1850,6 +1882,9 @@ static DEVICE_ATTR_WO(writeback); static DEVICE_ATTR_RW(writeback_limit); static DEVICE_ATTR_RW(writeback_limit_enable); #endif +#ifdef CONFIG_ZRAM_GROUP +static DEVICE_ATTR_RW(group); +#endif static struct attribute *zram_disk_attrs[] = { &dev_attr_disksize.attr, @@ -1873,6 +1908,9 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_bd_stat.attr, #endif &dev_attr_debug_stat.attr, +#ifdef CONFIG_ZRAM_GROUP + &dev_attr_group.attr, +#endif NULL, }; diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index f2fd46daa7604583b1c3bebaba86b484bca901c7..ae2ec81c0f8adfb36975d7e4c9e6dd18b4caea48 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -21,6 +21,10 @@ #include "zcomp.h" +#ifdef CONFIG_ZRAM_GROUP +#include "zram_group.h" +#endif + #define SECTORS_PER_PAGE_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) #define SECTORS_PER_PAGE (1 << SECTORS_PER_PAGE_SHIFT) #define ZRAM_LOGICAL_BLOCK_SHIFT 12 @@ -39,7 +43,15 @@ * The lower ZRAM_FLAG_SHIFT bits is for object size (excluding header), * the higher bits is for zram_pageflags. */ +#ifdef CONFIG_ZRAM_GROUP +/* reserve 16 bits for group id */ +#define ZRAM_SIZE_SHIFT 24 +#define ZRAM_GRPID_SHIFT 16 +#define ZRAM_GRPID_MASK (((1UL << ZRAM_GRPID_SHIFT) - 1) << ZRAM_SIZE_SHIFT) +#define ZRAM_FLAG_SHIFT (ZRAM_SIZE_SHIFT + ZRAM_GRPID_SHIFT) +#else #define ZRAM_FLAG_SHIFT 24 +#endif /* Flags for zram pages (table[page_no].flags) */ enum zram_pageflags { @@ -50,6 +62,10 @@ enum zram_pageflags { ZRAM_UNDER_WB, /* page is under writeback */ ZRAM_HUGE, /* Incompressible page */ ZRAM_IDLE, /* not accessed page since last idle marking */ +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + ZRAM_GWB, /* obj is group writeback*/ + ZRAM_FAULT, /* obj is needed by a pagefault req */ +#endif __NR_ZRAM_PAGEFLAGS, }; @@ -91,6 +107,10 @@ struct zram_stats { struct zram { struct zram_table_entry *table; +#ifdef CONFIG_ZRAM_GROUP + struct zram_group *zgrp; + unsigned int zgrp_ctrl; +#endif struct zs_pool *mem_pool; struct zcomp *comp; struct gendisk *disk; @@ -126,4 +146,86 @@ struct zram { struct dentry *debugfs_dir; #endif }; + +static inline int zram_slot_trylock(struct zram *zram, u32 index) +{ + return bit_spin_trylock(ZRAM_LOCK, &zram->table[index].flags); +} + +static inline void zram_slot_lock(struct zram *zram, u32 index) +{ + bit_spin_lock(ZRAM_LOCK, &zram->table[index].flags); +} + +static inline void zram_slot_unlock(struct zram *zram, u32 index) +{ + bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags); +} + +static inline unsigned long zram_get_handle(struct zram *zram, u32 index) +{ + return zram->table[index].handle; +} + +static inline void zram_set_handle(struct zram *zram, u32 index, unsigned long handle) +{ + zram->table[index].handle = handle; +} + +/* flag operations require table entry bit_spin_lock() being held */ +static inline bool zram_test_flag(struct zram *zram, u32 index, + enum zram_pageflags flag) +{ + 
return zram->table[index].flags & BIT(flag); +} + +static inline void zram_set_flag(struct zram *zram, u32 index, + enum zram_pageflags flag) +{ + zram->table[index].flags |= BIT(flag); +} + +static inline void zram_clear_flag(struct zram *zram, u32 index, + enum zram_pageflags flag) +{ + zram->table[index].flags &= ~BIT(flag); +} +#ifdef CONFIG_ZRAM_GROUP +static inline size_t zram_get_obj_size(struct zram *zram, u32 index) +{ + return zram->table[index].flags & (BIT(ZRAM_SIZE_SHIFT) - 1); +} + +static inline void zram_set_obj_size(struct zram *zram, u32 index, size_t size) +{ + unsigned long flags = zram->table[index].flags >> ZRAM_SIZE_SHIFT; + + zram->table[index].flags = (flags << ZRAM_SIZE_SHIFT) | size; +} + +void zram_group_init(struct zram *zram, u32 nr_obj); +void zram_group_deinit(struct zram *zram); +void zram_group_track_obj(struct zram *zram, u32 index, struct mem_cgroup *memcg); +void zram_group_untrack_obj(struct zram *zram, u32 index); +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK +int zram_group_fault_obj(struct zram *zram, u32 index); +#endif + +#ifdef CONFIG_ZRAM_GROUP_DEBUG +void group_debug(struct zram *zram, u32 op, u32 index, u32 gid); +#endif + +#else +static inline size_t zram_get_obj_size(struct zram *zram, u32 index) +{ + return zram->table[index].flags & (BIT(ZRAM_FLAG_SHIFT) - 1); +} + +static inline void zram_set_obj_size(struct zram *zram, u32 index, size_t size) +{ + unsigned long flags = zram->table[index].flags >> ZRAM_FLAG_SHIFT; + + zram->table[index].flags = (flags << ZRAM_FLAG_SHIFT) | size; +} +#endif #endif diff --git a/drivers/block/zram/zram_group/Kconfig b/drivers/block/zram/zram_group/Kconfig new file mode 100644 index 0000000000000000000000000000000000000000..0eacf79fb2594db32641d6997e463061c8da7880 --- /dev/null +++ b/drivers/block/zram/zram_group/Kconfig @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: GPL-2.0 +config ZRAM_GROUP + bool "Manage Zram objs with mem_cgroup" + depends on ZRAM && MEMCG + help + Manage Zram objs with mem_cgroup. + +config ZRAM_GROUP_DEBUG + bool "Debug info for zram group" + depends on ZRAM_GROUP + help + Debug info for ZRAM_GROUP. + +config ZLIST_DEBUG + bool "Debug info for zram group list" + depends on ZRAM_GROUP + help + Debug info for zram group list. + +config ZRAM_GROUP_WRITEBACK + bool "Write back grouped zram objs to Hyperhold driver" + depends on ZRAM_GROUP && HYPERHOLD + help + Write back grouped zram objs to hyperhold. diff --git a/drivers/block/zram/zram_group/group_writeback.c b/drivers/block/zram/zram_group/group_writeback.c new file mode 100644 index 0000000000000000000000000000000000000000..f1b2550c94ff8455833268d943d0f032edbd9ba4 --- /dev/null +++ b/drivers/block/zram/zram_group/group_writeback.c @@ -0,0 +1,704 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * drivers/block/zram/zram_group/group_writeback.c + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. + */ + +#include +#include +#include +#include + +#include "../zram_drv.h" +#include "zram_group.h" + +#ifdef CONFIG_HYPERHOLD +#include "hyperhold.h" +#endif + +#define CHECK(cond, ...) 
((cond) || (pr_err(__VA_ARGS__), false)) +#define CHECK_BOUND(var, min, max) \ + CHECK((var) >= (min) && (var) <= (max), \ + "%s %u out of bounds %u ~ %u!\n", \ + #var, (var), (min), (max)) + +static u16 zram_get_memcg_id(struct zram *zram, u32 index) +{ + return (zram->table[index].flags & ZRAM_GRPID_MASK) >> ZRAM_SIZE_SHIFT; +} + +static void zram_set_memcg_id(struct zram *zram, u32 index, u16 gid) +{ + unsigned long old = zram->table[index].flags & (~ZRAM_GRPID_MASK); + + zram->table[index].flags = old | ((u64)gid << ZRAM_SIZE_SHIFT); +} + +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK +static bool obj_can_wb(struct zram *zram, u32 index, u16 gid) +{ + /* overwrited obj, just skip */ + if (zram_get_memcg_id(zram, index) != gid) { + pr_info("obj %u is from group %u instead of group %u.\n", + index, zram_get_memcg_id(zram, index), gid); + return false; + } + if (!zgrp_obj_is_isolated(zram->zgrp, index)) { + pr_info("obj %u is not isolated.\n", index); + return false; + } + /* need not to writeback, put back the obj as HOTEST */ + if (zram_test_flag(zram, index, ZRAM_SAME)) { + pr_info("obj %u is filled with same element.\n", index); + goto insert; + } + if (zram_test_flag(zram, index, ZRAM_WB)) { + pr_info("obj %u is writeback.\n", index); + goto insert; + } + /* obj is needed by a pagefault req, do not writeback it. */ + if (zram_test_flag(zram, index, ZRAM_FAULT)) { + pr_info("obj %u is needed by a pagefault request.\n", index); + goto insert; + } + /* should never happen */ + if (zram_test_flag(zram, index, ZRAM_GWB)) { + pr_info("obj %u is group writeback.\n", index); + BUG(); + return false; + } + + return true; +insert: + zgrp_obj_insert(zram->zgrp, index, gid); + + return false; +} + +static void copy_obj(struct hpio *hpio, u32 offset, char *obj, u32 size, bool to) +{ + u32 page_id, start; + char *buf = NULL; + + page_id = offset / PAGE_SIZE; + start = offset % PAGE_SIZE; + if (size + start <= PAGE_SIZE) { + buf = page_to_virt(hyperhold_io_page(hpio, page_id)); + if (to) + memcpy(buf + start, obj, size); + else + memcpy(obj, buf + start, size); + + return; + } + buf = page_to_virt(hyperhold_io_page(hpio, page_id)); + if (to) + memcpy(buf + start, obj, PAGE_SIZE - start); + else + memcpy(obj, buf + start, PAGE_SIZE - start); + buf = page_to_virt(hyperhold_io_page(hpio, page_id + 1)); + if (to) + memcpy(buf, obj + PAGE_SIZE - start, size + start - PAGE_SIZE); + else + memcpy(obj + PAGE_SIZE - start, buf, size + start - PAGE_SIZE); +} + +static u32 move_obj_to_hpio(struct zram *zram, u32 index, u16 gid, + struct hpio *hpio, u32 offset) +{ + u32 size = 0; + unsigned long handle; + char *src = NULL; + u32 ext_size; + u32 eid; + + eid = hyperhold_io_extent(hpio); + ext_size = hyperhold_extent_size(eid); + + zram_slot_lock(zram, index); + if (!obj_can_wb(zram, index, gid)) + goto unlock; + size = zram_get_obj_size(zram, index); + /* no space, put back the obj as COLDEST */ + if (size + offset > ext_size) { + pr_info("obj %u size is %u, but ext %u only %u space left.\n", + index, size, eid, ext_size - offset); + zgrp_obj_putback(zram->zgrp, index, gid); + size = 0; + goto unlock; + } + handle = zram_get_handle(zram, index); + src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO); + copy_obj(hpio, offset, src, size, true); + zs_unmap_object(zram->mem_pool, handle); + zs_free(zram->mem_pool, handle); + zram_set_handle(zram, index, hyperhold_address(eid, offset)); + zram_set_flag(zram, index, ZRAM_GWB); + wbgrp_obj_insert(zram->zgrp, index, eid); + wbgrp_obj_stats_inc(zram->zgrp, gid, eid, size); + 
zgrp_obj_stats_dec(zram->zgrp, gid, size); + pr_info("move obj %u of group %u to hpio %p of eid %u, size = %u, offset = %u\n", + index, gid, hpio, eid, size); +unlock: + zram_slot_unlock(zram, index); + + return size; +} + +static void move_obj_from_hpio(struct zram *zram, int index, struct hpio *hpio) +{ + u32 size = 0; + unsigned long handle = 0; + u32 eid, offset; + u64 addr; + char *dst = NULL; + u16 gid; + + eid = hyperhold_io_extent(hpio); +retry: + zram_slot_lock(zram, index); + if (!zram_test_flag(zram, index, ZRAM_GWB)) + goto unlock; + addr = zram_get_handle(zram, index); + if (hyperhold_addr_extent(addr) != eid) + goto unlock; + size = zram_get_obj_size(zram, index); + if (handle) + goto move; + handle = zs_malloc(zram->mem_pool, size, GFP_NOWAIT); + if (handle) + goto move; + zram_slot_unlock(zram, index); + handle = zs_malloc(zram->mem_pool, size, GFP_NOIO | __GFP_NOFAIL); + if (handle) + goto retry; + BUG(); + + return; +move: + offset = hyperhold_addr_offset(addr); + dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO); + copy_obj(hpio, offset, dst, size, false); + zs_unmap_object(zram->mem_pool, handle); + zram_set_handle(zram, index, handle); + zram_clear_flag(zram, index, ZRAM_GWB); + gid = zram_get_memcg_id(zram, index); + zgrp_obj_insert(zram->zgrp, index, gid); + wbgrp_obj_stats_dec(zram->zgrp, gid, eid, size); + zgrp_obj_stats_inc(zram->zgrp, gid, size); + pr_info("move obj %u of group %u from hpio %p of eid %u, size = %u, offset = %u\n", + index, gid, hpio, eid, size); +unlock: + zram_slot_unlock(zram, index); +} + + +#define NR_ISOLATE 32 +static bool move_extent_from_hpio(struct zram *zram, struct hpio *hpio) +{ + u32 idxs[NR_ISOLATE]; + u32 eid; + u32 nr; + int i; + bool last = false; + + eid = hyperhold_io_extent(hpio); +repeat: + nr = wbgrp_isolate_objs(zram->zgrp, eid, idxs, NR_ISOLATE, &last); + for (i = 0; i < nr; i++) + move_obj_from_hpio(zram, idxs[i], hpio); + if (last) + return true; + if (nr) + goto repeat; + + return false; +} + +struct hpio_priv { + struct zram *zram; + u16 gid; +}; + +static void write_endio(struct hpio *hpio) +{ + struct hpio_priv *priv = hyperhold_io_private(hpio); + struct zram *zram = priv->zram; + u16 gid = priv->gid; + u32 eid = hyperhold_io_extent(hpio); + + if (hyperhold_io_success(hpio)) + goto out; + if (move_extent_from_hpio(zram, hpio)) { + zgrp_ext_delete(zram->zgrp, eid, gid); + hyperhold_should_free_extent(eid); + } +out: + hyperhold_io_complete(hpio); + hyperhold_io_put(hpio); + kfree(priv); +} + +static u32 collect_objs(struct zram *zram, u16 gid, struct hpio *hpio, u32 ext_size) +{ + u32 offset = 0; + u32 last_offset; + u32 nr; + u32 idxs[NR_ISOLATE]; + int i; + +more: + last_offset = offset; + nr = zgrp_isolate_objs(zram->zgrp, gid, idxs, NR_ISOLATE, NULL); + for (i = 0; i < nr; i++) + offset += move_obj_to_hpio(zram, idxs[i], gid, hpio, offset); + pr_info("%u data attached, offset = %u.\n", offset - last_offset, offset); + if (offset < ext_size && offset != last_offset) + goto more; + + return offset; +} + +static u64 write_one_extent(struct zram *zram, u16 gid) +{ + int eid; + struct hpio *hpio = NULL; + struct hpio_priv *priv = NULL; + u32 size = 0; + int ret; + + priv = kmalloc(sizeof(struct hpio_priv), GFP_NOIO); + if (!priv) + return 0; + priv->gid = gid; + priv->zram = zram; + eid = hyperhold_alloc_extent(); + if (eid < 0) + goto err; + hpio = hyperhold_io_get(eid, GFP_NOIO, REQ_OP_WRITE); + if (!hpio) + goto free_extent; + + size = collect_objs(zram, gid, hpio, hyperhold_extent_size(eid)); + if (size == 0) 
{ + pr_err("group %u has no data in zram.\n", gid); + goto put_hpio; + } + zgrp_ext_insert(zram->zgrp, eid, gid); + + ret = hyperhold_write_async(hpio, write_endio, priv); + if (ret) + goto move_back; + + return size; +move_back: + if (move_extent_from_hpio(zram, hpio)) { + zgrp_ext_delete(zram->zgrp, eid, gid); + hyperhold_should_free_extent(eid); + } + eid = -EINVAL; +put_hpio: + hyperhold_io_put(hpio); +free_extent: + if (eid >= 0) + hyperhold_free_extent(eid); +err: + kfree(priv); + + return 0; +} + +static void read_endio(struct hpio *hpio) +{ + struct hpio_priv *priv = hyperhold_io_private(hpio); + struct zram *zram = priv->zram; + u16 gid = priv->gid; + u32 eid = hyperhold_io_extent(hpio); + + if (!hyperhold_io_success(hpio)) { + BUG(); + goto out; + } + if (move_extent_from_hpio(zram, hpio)) { + zgrp_ext_delete(zram->zgrp, eid, gid); + hyperhold_should_free_extent(eid); + } +out: + hyperhold_io_complete(hpio); + hyperhold_io_put(hpio); + kfree(priv); +} + +static u64 read_one_extent(struct zram *zram, u32 eid, u16 gid) +{ + struct hpio *hpio = NULL; + u32 ext_size = 0; + int ret; + struct hpio_priv *priv = NULL; + + priv = kmalloc(sizeof(struct hpio_priv), GFP_NOIO); + if (!priv) + goto err; + priv->gid = gid; + priv->zram = zram; + hpio = hyperhold_io_get(eid, GFP_NOIO, REQ_OP_READ); + if (!hpio) + goto err; + ext_size = hyperhold_extent_size(eid); + ret = hyperhold_read_async(hpio, read_endio, priv); + if (ret) + goto err; + + return ext_size; +err: + hyperhold_io_put(hpio); + kfree(priv); + + return 0; +} + +static void sync_read_endio(struct hpio *hpio) +{ + hyperhold_io_complete(hpio); +} + +static int read_one_obj_sync(struct zram *zram, u32 index) +{ + struct hpio *hpio = NULL; + int ret; + u32 eid; + u16 gid; + u32 size; + + if (!zram_test_flag(zram, index, ZRAM_GWB)) + return 0; + + pr_info("read obj %u.\n", index); + + gid = zram_get_memcg_id(zram, index); + eid = hyperhold_addr_extent(zram_get_handle(zram, index)); + size = zram_get_obj_size(zram, index); + wbgrp_fault_stats_inc(zram->zgrp, gid, eid, size); +check: + if (!zram_test_flag(zram, index, ZRAM_GWB)) + return 0; + if (!zram_test_flag(zram, index, ZRAM_FAULT)) + goto read; + zram_slot_unlock(zram, index); + wait_event(zram->zgrp->wbgrp.fault_wq, !zram_test_flag(zram, index, ZRAM_FAULT)); + zram_slot_lock(zram, index); + goto check; +read: + zram_set_flag(zram, index, ZRAM_FAULT); + zram_slot_unlock(zram, index); + + hpio = hyperhold_io_get(eid, GFP_NOIO, REQ_OP_READ); + if (!hpio) { + ret = -ENOMEM; + goto out; + } + ret = hyperhold_read_async(hpio, sync_read_endio, NULL); + /* io submit error */ + if (ret && ret != -EAGAIN) + goto out; + + hyperhold_io_wait(hpio); + /* get a write io, data is ready, copy the pages even write failed */ + if (op_is_write(hyperhold_io_operate(hpio))) + goto move; + /* read io failed, return -EIO */ + if (!hyperhold_io_success(hpio)) { + ret = -EIO; + goto out; + } + /* success, copy the data and free extent */ +move: + if (move_extent_from_hpio(zram, hpio)) { + zgrp_ext_delete(zram->zgrp, eid, gid); + hyperhold_should_free_extent(eid); + } +out: + hyperhold_io_put(hpio); + zram_slot_lock(zram, index); + zram_clear_flag(zram, index, ZRAM_FAULT); + wake_up(&zram->zgrp->wbgrp.fault_wq); + + return ret; +} + +u64 read_group_objs(struct zram *zram, u16 gid, u64 req_size) +{ + u32 eid; + u64 read_size = 0; + u32 nr; + + if (!CHECK(zram->zgrp, "zram group is not enable!\n")) + return 0; + if (!CHECK_BOUND(gid, 1, zram->zgrp->nr_grp - 1)) + return 0; + + pr_info("read %llu data of group 
%u.\n", req_size, gid); + + while (!req_size || req_size > read_size) { + nr = zgrp_isolate_exts(zram->zgrp, gid, &eid, 1, NULL); + if (!nr) + break; + read_size += read_one_extent(zram, eid, gid); + } + + return read_size; +} + +u64 write_group_objs(struct zram *zram, u16 gid, u64 req_size) +{ + u64 write_size = 0; + u64 size = 0; + + if (!CHECK(zram->zgrp, "zram group is not enable!\n")) + return 0; + if (!CHECK_BOUND(gid, 1, zram->zgrp->nr_grp - 1)) + return 0; + + pr_info("write %llu data of group %u.\n", req_size, gid); + + while (!req_size || req_size > write_size) { + size = write_one_extent(zram, gid); + if (!size) + break; + write_size += size; + } + + return write_size; +} +#endif + +#ifdef CONFIG_ZRAM_GROUP_DEBUG +#include +#define ZGRP_TEST_MAX_GRP 101 +#endif + +int zram_group_fault_obj(struct zram *zram, u32 index) +{ + u16 gid; + u32 size; + + if (!CHECK(zram->zgrp, "zram group is not enable!\n")) + return 0; + if (!CHECK_BOUND(index, 0, zram->zgrp->nr_obj - 1)) + return 0; + + gid = zram_get_memcg_id(zram, index); + size = zram_get_obj_size(zram, index); + zgrp_fault_stats_inc(zram->zgrp, gid, size); +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + return read_one_obj_sync(zram, index); +#else + return 0; +#endif +} + +void zram_group_track_obj(struct zram *zram, u32 index, struct mem_cgroup *memcg) +{ + u16 gid; + + if (!CHECK(zram->zgrp, "zram group is not enable!\n")) + return; + if (!CHECK_BOUND(index, 0, zram->zgrp->nr_obj - 1)) + return; + if (!CHECK(memcg || !memcg->id.id, "obj %u has no memcg!\n", index)) + return; + gid = zram_get_memcg_id(zram, index); + if (!CHECK(!gid, "obj %u has gid %u.\n", index, gid)) + BUG(); + + gid = memcg->id.id; + zram_set_memcg_id(zram, index, gid); + zgrp_obj_insert(zram->zgrp, index, gid); + zgrp_obj_stats_inc(zram->zgrp, gid, zram_get_obj_size(zram, index)); +} + +void zram_group_untrack_obj(struct zram *zram, u32 index) +{ + u16 gid; + u32 size; + + if (!CHECK(zram->zgrp, "zram group is not enable!\n")) + return; + if (!CHECK_BOUND(index, 0, zram->zgrp->nr_obj - 1)) + return; + +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK +check: + if (!zram_test_flag(zram, index, ZRAM_FAULT)) + goto clear; + zram_slot_unlock(zram, index); + wait_event(zram->zgrp->wbgrp.fault_wq, !zram_test_flag(zram, index, ZRAM_FAULT)); + zram_slot_lock(zram, index); + goto check; +clear: +#endif + gid = zram_get_memcg_id(zram, index); + size = zram_get_obj_size(zram, index); + if (!gid) + return; +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + if (zram_test_flag(zram, index, ZRAM_GWB)) { + u32 eid = hyperhold_addr_extent(zram_get_handle(zram, index)); + + if (wbgrp_obj_delete(zram->zgrp, index, eid)) { + zgrp_ext_delete(zram->zgrp, eid, gid); + hyperhold_should_free_extent(eid); + } + zram_clear_flag(zram, index, ZRAM_GWB); + zram_set_memcg_id(zram, index, 0); + wbgrp_obj_stats_dec(zram->zgrp, gid, eid, size); + zram_set_handle(zram, index, 0); + return; + } +#endif + zgrp_obj_delete(zram->zgrp, index, gid); + zram_set_memcg_id(zram, index, 0); + zgrp_obj_stats_dec(zram->zgrp, gid, size); +} + +#ifdef CONFIG_ZRAM_GROUP_DEBUG +void group_debug(struct zram *zram, u32 op, u32 index, u32 gid) +{ + if (op == 0) + zram_group_dump(zram->zgrp, gid, index); + +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + if (op == 22) + read_group_objs(zram, gid, index); + if (op == 23) + write_group_objs(zram, gid, index); + if (op == 20) { + if (index) + zram_group_apply_writeback(zram->zgrp, hyperhold_nr_extent()); + else + zram_group_remove_writeback(zram->zgrp); + } +#endif +} +#endif + +static u64 
group_obj_stats(struct zram *zram, u16 gid, int type) +{ + if (!CHECK(zram->zgrp, "zram group is not enable!\n")) + return 0; + if (!CHECK_BOUND(gid, 0, zram->zgrp->nr_grp - 1)) + return 0; + + if (type == CACHE_SIZE) + return atomic64_read(&zram->zgrp->stats[gid].zram_size); + else if (type == CACHE_PAGE) + return atomic_read(&zram->zgrp->stats[gid].zram_pages); + else if (type == CACHE_FAULT) + return atomic64_read(&zram->zgrp->stats[gid].zram_fault); +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + else if (type == SWAP_SIZE) + return atomic64_read(&zram->zgrp->stats[gid].wb_size); + else if (type == SWAP_PAGE) + return atomic_read(&zram->zgrp->stats[gid].wb_pages); + else if (type == READ_SIZE) + return atomic64_read(&zram->zgrp->stats[gid].read_size); + else if (type == WRITE_SIZE) + return atomic64_read(&zram->zgrp->stats[gid].write_size); + else if (type == SWAP_FAULT) + return atomic64_read(&zram->zgrp->stats[gid].wb_fault); + BUG(); +#endif + + return 0; +} + +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK +static u64 zram_group_read(u16 gid, u64 req_size, void *priv) +{ + if (!CHECK(priv, "priv is NULL!\n")) + return 0; + + return read_group_objs((struct zram *)priv, gid, req_size); +} + +static u64 zram_group_write(u16 gid, u64 req_size, void *priv) +{ + if (!CHECK(priv, "priv is NULL!\n")) + return 0; + + return write_group_objs((struct zram *)priv, gid, req_size); +} +#else +static u64 zram_group_read(u16 gid, u64 req_size, void *priv) +{ + return 0; +} +static u64 zram_group_write(u16 gid, u64 req_size, void *priv) +{ + return 0; +} +#endif + + +static u64 zram_group_data_size(u16 gid, int type, void *priv) +{ + if (!CHECK(priv, "priv is NULL!\n")) + return 0; + + return group_obj_stats((struct zram *)priv, gid, type); +} + +struct group_swap_ops zram_group_ops = { + .group_read = zram_group_read, + .group_write = zram_group_write, + .group_data_size = zram_group_data_size, +}; + +static int register_zram_group(struct zram *zram) +{ + if (!CHECK(zram, "zram is NULL!\n")) + return -EINVAL; + if (!CHECK(zram->zgrp, "zram group is not enable!\n")) + return -EINVAL; + + zram->zgrp->gsdev = register_group_swap(&zram_group_ops, zram); + if (!zram->zgrp->gsdev) { + pr_err("register zram group failed!\n"); + return -ENOMEM; + } + + return 0; +} + +static void unregister_zram_group(struct zram *zram) +{ + if (!CHECK(zram, "zram is NULL!\n")) + return; + if (!CHECK(zram->zgrp, "zram group is not enable!\n")) + return; + + unregister_group_swap(zram->zgrp->gsdev); + zram->zgrp->gsdev = NULL; +} + +void zram_group_init(struct zram *zram, u32 nr_obj) +{ + unsigned int ctrl = zram->zgrp_ctrl; + + if (ctrl == ZGRP_NONE) + return; + zram->zgrp = zram_group_meta_alloc(nr_obj, ZGRP_MAX_GRP - 1); +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + if (ctrl == ZGRP_WRITE) + zram_group_apply_writeback(zram->zgrp, hyperhold_nr_extent()); +#endif + register_zram_group(zram); +} + +void zram_group_deinit(struct zram *zram) +{ + unregister_zram_group(zram); + zram_group_meta_free(zram->zgrp); + zram->zgrp = NULL; +} diff --git a/drivers/block/zram/zram_group/zlist.c b/drivers/block/zram/zram_group/zlist.c new file mode 100644 index 0000000000000000000000000000000000000000..d1fe608759492ea64bd6fb0843a05697834f5dcf --- /dev/null +++ b/drivers/block/zram/zram_group/zlist.c @@ -0,0 +1,227 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * drivers/block/zram/zram_group/zlist.c + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. 
+ */ + +#define pr_fmt(fmt) "[ZLIST]" fmt + +#include +#include +#include + +#include "zlist.h" + +#define assert(expr) \ + do { \ + if (expr) \ + break; \ + pr_err("assertion [%s] failed: in func<%s> at %s:%d\n", \ + #expr, __func__, __FILE__, __LINE__); \ + BUG(); \ + } while (0) + +static inline void zlist_node_lock(struct zlist_node *node) +{ + bit_spin_lock(ZLIST_LOCK_BIT, (unsigned long *)node); +} + +static inline void zlist_node_unlock(struct zlist_node *node) +{ + bit_spin_unlock(ZLIST_LOCK_BIT, (unsigned long *)node); +} + +#ifdef CONFIG_ZLIST_DEBUG +static inline void zlist_before_add_check(struct zlist_table *tab, + struct zlist_node *prev, struct zlist_node *node, + struct zlist_node *next) +{ + assert(idx2node(prev->next, tab) == next); + assert(idx2node(next->prev, tab) == prev); + assert(idx2node(node->prev, tab) == node); + assert(idx2node(node->next, tab) == node); +} + +static inline void zlist_after_add_check(struct zlist_table *tab, + struct zlist_node *prev, struct zlist_node *node, + struct zlist_node *next) +{ + assert(idx2node(prev->next, tab) == node); + assert(idx2node(next->prev, tab) == node); + assert(idx2node(node->prev, tab) == prev); + assert(idx2node(node->next, tab) == next); +} + +static inline void zlist_before_del_check(struct zlist_table *tab, + struct zlist_node *prev, struct zlist_node *node, + struct zlist_node *next) +{ + assert(idx2node(prev->next, tab) == node); + assert(idx2node(next->prev, tab) == node); + assert(idx2node(node->prev, tab) == prev); + assert(idx2node(node->next, tab) == next); +} + +static inline void zlist_after_del_check(struct zlist_table *tab, + struct zlist_node *prev, struct zlist_node *node, + struct zlist_node *next) +{ + assert(idx2node(prev->next, tab) == next); + assert(idx2node(next->prev, tab) == prev); + assert(idx2node(node->prev, tab) == node); + assert(idx2node(node->next, tab) == node); +} +#else +static inline void zlist_before_add_check(struct zlist_table *tab, + struct zlist_node *prev, struct zlist_node *node, + struct zlist_node *next) {}; +static inline void zlist_after_add_check(struct zlist_table *tab, + struct zlist_node *prev, struct zlist_node *node, + struct zlist_node *next) {}; +static inline void zlist_before_del_check(struct zlist_table *tab, + struct zlist_node *prev, struct zlist_node *node, + struct zlist_node *next) {}; +static inline void zlist_after_del_check(struct zlist_table *tab, + struct zlist_node *prev, struct zlist_node *node, + struct zlist_node *next) {}; +#endif + +struct zlist_table *zlist_table_alloc(struct zlist_node *(*i2n)(u32, void*), + void *private, gfp_t gfp) +{ + struct zlist_table *tab = kmalloc(sizeof(struct zlist_table), gfp); + + if (!tab) + return NULL; + tab->idx2node = i2n; + tab->private = private; + + return tab; +} + +void zlist_lock(u32 idx, struct zlist_table *tab) +{ + zlist_node_lock(idx2node(idx, tab)); +} + +void zlist_unlock(u32 idx, struct zlist_table *tab) +{ + zlist_node_unlock(idx2node(idx, tab)); +} + +void zlist_add_nolock(u32 hid, u32 idx, struct zlist_table *tab) +{ + struct zlist_node *node = idx2node(idx, tab); + struct zlist_node *head = idx2node(hid, tab); + u32 nid = head->next; + struct zlist_node *next = idx2node(nid, tab); + + zlist_before_add_check(tab, head, node, next); + if (idx != hid) + zlist_node_lock(node); + node->prev = hid; + node->next = nid; + if (idx != hid) + zlist_node_unlock(node); + head->next = idx; + if (nid != hid) + zlist_node_lock(next); + next->prev = idx; + if (nid != hid) + zlist_node_unlock(next); + 
zlist_after_add_check(tab, head, node, next); +} + +void zlist_add_tail_nolock(u32 hid, u32 idx, struct zlist_table *tab) +{ + struct zlist_node *node = idx2node(idx, tab); + struct zlist_node *head = idx2node(hid, tab); + u32 tid = head->prev; + struct zlist_node *tail = idx2node(tid, tab); + + zlist_before_add_check(tab, tail, node, head); + if (idx != hid) + zlist_node_lock(node); + node->prev = tid; + node->next = hid; + if (idx != hid) + zlist_node_unlock(node); + head->prev = idx; + if (tid != hid) + zlist_node_lock(tail); + tail->next = idx; + if (tid != hid) + zlist_node_unlock(tail); + zlist_after_add_check(tab, tail, node, head); +} + +bool zlist_del_nolock(u32 hid, u32 idx, struct zlist_table *tab) +{ + struct zlist_node *node = idx2node(idx, tab); + u32 pid = node->prev; + u32 nid = node->next; + struct zlist_node *prev = idx2node(pid, tab); + struct zlist_node *next = idx2node(nid, tab); + + zlist_before_del_check(tab, prev, node, next); + if (idx != hid) + zlist_node_lock(node); + node->prev = idx; + node->next = idx; + if (idx != hid) + zlist_node_unlock(node); + if (pid != hid) + zlist_node_lock(prev); + prev->next = nid; + if (pid != hid) + zlist_node_unlock(prev); + if (nid != hid) + zlist_node_lock(next); + next->prev = pid; + if (nid != hid) + zlist_node_unlock(next); + zlist_after_del_check(tab, prev, node, next); + + return zlist_is_isolated_nolock(hid, tab); +} + +bool zlist_is_isolated_nolock(u32 idx, struct zlist_table *tab) +{ + struct zlist_node *node = idx2node(idx, tab); + + return (node->prev == idx) && (node->next == idx); +} + +bool zlist_set_priv(u32 idx, struct zlist_table *tab) +{ + struct zlist_node *node = idx2node(idx, tab); + bool ret = false; + + zlist_node_lock(node); + ret = !test_and_set_bit(ZLIST_PRIV_BIT, (unsigned long *)node); + zlist_node_unlock(node); + + return ret; +} + +bool zlist_clr_priv(u32 idx, struct zlist_table *tab) +{ + struct zlist_node *node = idx2node(idx, tab); + bool ret = false; + + zlist_node_lock(node); + ret = !test_and_clear_bit(ZLIST_PRIV_BIT, (unsigned long *)node); + zlist_node_unlock(node); + + return ret; +} + +void zlist_node_init(u32 idx, struct zlist_table *tab) +{ + struct zlist_node *node = idx2node(idx, tab); + + memset(node, 0, sizeof(struct zlist_node)); + node->prev = idx; + node->next = idx; +} diff --git a/drivers/block/zram/zram_group/zlist.h b/drivers/block/zram/zram_group/zlist.h new file mode 100644 index 0000000000000000000000000000000000000000..430b079bcd4932388bf1d5dd22f10537e98dd124 --- /dev/null +++ b/drivers/block/zram/zram_group/zlist.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * drivers/block/zram/zram_group/zlist.h + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. 
+ */ + +#ifndef _ZLIST_H_ +#define _ZLIST_H_ + +#define ZLIST_IDX_SHIFT 30 +#define ZLIST_LOCK_BIT ZLIST_IDX_SHIFT +#define ZLIST_PRIV_BIT ((ZLIST_IDX_SHIFT << 1) + 1) + +#define ZLIST_IDX_MAX (1 << ZLIST_IDX_SHIFT) + +struct zlist_node { + u32 prev : ZLIST_IDX_SHIFT; + u32 lock : 1; + u32 next : ZLIST_IDX_SHIFT; + u32 priv : 1; +}; + +struct zlist_table { + struct zlist_node *(*idx2node)(u32 idx, void *priv); + void *private; +}; + +static inline struct zlist_node *idx2node(u32 idx, struct zlist_table *tab) +{ + return tab->idx2node(idx, tab->private); +} + +static inline u32 next_idx(u32 idx, struct zlist_table *tab) +{ + return idx2node(idx, tab)->next; +} + +static inline u32 prev_idx(u32 idx, struct zlist_table *tab) +{ + return idx2node(idx, tab)->prev; +} + +static inline void zlist_table_free(struct zlist_table *tab) +{ + kfree(tab); +} + +struct zlist_table *zlist_table_alloc(struct zlist_node *(*i2n)(u32, void*), + void *private, gfp_t gfp); + +void zlist_lock(u32 idx, struct zlist_table *tab); +void zlist_unlock(u32 idx, struct zlist_table *tab); + +void zlist_add_nolock(u32 hid, u32 idx, struct zlist_table *tab); +void zlist_add_tail_nolock(u32 hid, u32 idx, struct zlist_table *tab); +bool zlist_del_nolock(u32 hid, u32 idx, struct zlist_table *tab); +bool zlist_is_isolated_nolock(u32 idx, struct zlist_table *tab); + +static inline void zlist_add(u32 hid, u32 idx, struct zlist_table *tab) +{ + zlist_lock(hid, tab); + zlist_add_nolock(hid, idx, tab); + zlist_unlock(hid, tab); +} + +static inline void zlist_add_tail(u32 hid, u32 idx, struct zlist_table *tab) +{ + zlist_lock(hid, tab); + zlist_add_tail_nolock(hid, idx, tab); + zlist_unlock(hid, tab); +} + +static inline bool zlist_del(u32 hid, u32 idx, struct zlist_table *tab) +{ + bool ret = false; + + zlist_lock(hid, tab); + ret = zlist_del_nolock(hid, idx, tab); + zlist_unlock(hid, tab); + + return ret; +} + +bool zlist_get_priv(u32 idx, struct zlist_table *tab); +bool zlist_clr_priv(u32 idx, struct zlist_table *tab); + +void zlist_node_init(u32 idx, struct zlist_table *tab); + +#define zlist_for_each_entry(idx, hid, tab) \ + for ((idx) = next_idx(hid, tab); (idx) != (hid); \ + (idx) = next_idx(idx, tab)) +#define zlist_for_each_entry_reverse(idx, hid, tab) \ + for ((idx) = prev_idx(hid, tab); (idx) != (hid); \ + (idx) = prev_idx(idx, tab)) +#endif diff --git a/drivers/block/zram/zram_group/zram_group.c b/drivers/block/zram/zram_group/zram_group.c new file mode 100644 index 0000000000000000000000000000000000000000..ea0cdcfadc7b1d6954d1567f8a9897c033600003 --- /dev/null +++ b/drivers/block/zram/zram_group/zram_group.c @@ -0,0 +1,590 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * drivers/block/zram/zram_group/zram_group.c + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. + */ + +#define pr_fmt(fmt) "[ZRAM_GROUP]" fmt + +#include +#include +#include "zram_group.h" + +#define CHECK(cond, ...) 
((cond) || (pr_err(__VA_ARGS__), false)) +#define CHECK_BOUND(var, min, max) \ + CHECK((var) >= (min) && (var) <= (max), \ + "%s %u out of bounds %u ~ %u!\n", \ + #var, (var), (min), (max)) + +/* + * idx2node for obj table + */ +static struct zlist_node *get_obj(u32 index, void *private) +{ + struct zram_group *zgrp = private; + + if (index < zgrp->nr_obj) + return &zgrp->obj[index]; + + index -= zgrp->nr_obj; + BUG_ON(!index); + if (index < zgrp->nr_grp) + return &zgrp->grp_obj_head[index]; +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + index -= zgrp->nr_grp; + BUG_ON(index >= zgrp->wbgrp.nr_ext); + return &zgrp->wbgrp.ext_obj_head[index]; +#endif + BUG(); +} + +void zram_group_meta_free(struct zram_group *zgrp) +{ + if (!CHECK(zgrp, "zram group is not enable!\n")) + return; + +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + zram_group_remove_writeback(zgrp); +#endif + vfree(zgrp->grp_obj_head); + vfree(zgrp->obj); + zlist_table_free(zgrp->obj_tab); + vfree(zgrp->stats); + kfree(zgrp); + + pr_info("zram group freed.\n"); +} + +struct zram_group *zram_group_meta_alloc(u32 nr_obj, u32 nr_grp) +{ + struct zram_group *zgrp = NULL; + u32 i; + + if (!CHECK_BOUND(nr_grp, 1, ZGRP_MAX_GRP - 1)) + return NULL; + + /* reserve gid 0 */ + nr_grp++; + if (!CHECK_BOUND(nr_obj, 1, ZGRP_MAX_OBJ)) + return NULL; + zgrp = kzalloc(sizeof(struct zram_group), GFP_KERNEL); + if (!zgrp) + goto err; + zgrp->nr_obj = nr_obj; + zgrp->nr_grp = nr_grp; + zgrp->grp_obj_head = vmalloc(sizeof(struct zlist_node) * zgrp->nr_grp); + if (!zgrp->grp_obj_head) + goto err; + zgrp->obj = vmalloc(sizeof(struct zlist_node) * zgrp->nr_obj); + if (!zgrp->obj) + goto err; + zgrp->obj_tab = zlist_table_alloc(get_obj, zgrp, GFP_KERNEL); + if (!zgrp->obj_tab) + goto err; + zgrp->stats = vzalloc(sizeof(struct zram_group_stats) * zgrp->nr_grp); + if (!zgrp->stats) + goto err; + zgrp->gsdev = NULL; + + for (i = 0; i < zgrp->nr_obj; i++) + zlist_node_init(i, zgrp->obj_tab); + for (i = 1; i < zgrp->nr_grp; i++) + zlist_node_init(i + zgrp->nr_obj, zgrp->obj_tab); + +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + zgrp->wbgrp.enable = false; + mutex_init(&zgrp->wbgrp.init_lock); +#endif + pr_info("zram_group alloc succ.\n"); + return zgrp; +err: + pr_err("zram_group alloc failed!\n"); + zram_group_meta_free(zgrp); + + return NULL; +} + +/* + * insert obj at @index into group @gid as the HOTTEST obj + */ +void zgrp_obj_insert(struct zram_group *zgrp, u32 index, u16 gid) +{ + u32 hid; + + if (!CHECK(zgrp, "zram group is not enable!\n")) + return; + if (!CHECK_BOUND(index, 0, zgrp->nr_obj - 1)) + return; + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return; + hid = gid + zgrp->nr_obj; + zlist_add(hid, index, zgrp->obj_tab); + pr_info("insert obj %u to group %u\n", index, gid); +} + +/* + * remove obj at @index from group @gid + */ +bool zgrp_obj_delete(struct zram_group *zgrp, u32 index, u16 gid) +{ + u32 hid; + + if (!CHECK(zgrp, "zram group is not enable!\n")) + return false; + if (!CHECK_BOUND(index, 0, zgrp->nr_obj - 1)) + return false; + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return false; + pr_info("delete obj %u from group %u\n", index, gid); + hid = gid + zgrp->nr_obj; + + return zlist_del(hid, index, zgrp->obj_tab); +} + +/* + * try to isolate the last @nr objs of @gid, store their indexes in array @idxs + * and @return the obj cnt actually isolated. isolate all objs if nr is 0. 
+ */ +u32 zgrp_isolate_objs(struct zram_group *zgrp, u16 gid, u32 *idxs, u32 nr, bool *last) +{ + u32 hid, idx; + u32 cnt = 0; + u32 i; + + if (last) + *last = false; + if (!CHECK(zgrp, "zram group is not enable!\n")) + return 0; + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return 0; + if (!CHECK(idxs, "return array idxs is null!\n")) + return 0; + hid = gid + zgrp->nr_obj; + zlist_lock(hid, zgrp->obj_tab); + zlist_for_each_entry_reverse(idx, hid, zgrp->obj_tab) { + idxs[cnt++] = idx; + if (nr && cnt == nr) + break; + } + for (i = 0; i < cnt; i++) + zlist_del_nolock(hid, idxs[i], zgrp->obj_tab); + if (last) + *last = cnt && zlist_is_isolated_nolock(hid, zgrp->obj_tab); + zlist_unlock(hid, zgrp->obj_tab); + + pr_info("isolated %u objs from group %u.\n", cnt, gid); + + return cnt; +} + +/* + * check if the obj at @index is isolate from zram groups + */ +bool zgrp_obj_is_isolated(struct zram_group *zgrp, u32 index) +{ + bool ret = false; + + if (!CHECK(zgrp, "zram group is not enable!\n")) + return false; + if (!CHECK_BOUND(index, 0, zgrp->nr_obj - 1)) + return false; + + zlist_lock(index, zgrp->obj_tab); + ret = zlist_is_isolated_nolock(index, zgrp->obj_tab); + zlist_unlock(index, zgrp->obj_tab); + + return ret; +} +/* + * insert obj at @index into group @gid as the COLDEST obj + */ +void zgrp_obj_putback(struct zram_group *zgrp, u32 index, u16 gid) +{ + u32 hid; + + if (!CHECK(zgrp, "zram group is not enable!\n")) + return; + if (!CHECK_BOUND(index, 0, zgrp->nr_obj - 1)) + return; + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return; + hid = gid + zgrp->nr_obj; + zlist_add_tail(hid, index, zgrp->obj_tab); + pr_info("putback obj %u to group %u\n", index, gid); +} + +void zgrp_obj_stats_inc(struct zram_group *zgrp, u16 gid, u32 size) +{ + if (!CHECK(zgrp, "zram group is not enable!\n")) + return; + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return; + + atomic_inc(&zgrp->stats[gid].zram_pages); + atomic64_add(size, &zgrp->stats[gid].zram_size); + atomic_inc(&zgrp->stats[0].zram_pages); + atomic64_add(size, &zgrp->stats[0].zram_size); +} + +void zgrp_obj_stats_dec(struct zram_group *zgrp, u16 gid, u32 size) +{ + if (!CHECK(zgrp, "zram group is not enable!\n")) + return; + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return; + + atomic_dec(&zgrp->stats[gid].zram_pages); + atomic64_sub(size, &zgrp->stats[gid].zram_size); + atomic_dec(&zgrp->stats[0].zram_pages); + atomic64_sub(size, &zgrp->stats[0].zram_size); +} + +void zgrp_fault_stats_inc(struct zram_group *zgrp, u16 gid, u32 size) +{ + if (!CHECK(zgrp, "zram group is not enable!\n")) + return; + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return; + + atomic64_inc(&zgrp->stats[gid].zram_fault); + atomic64_inc(&zgrp->stats[0].zram_fault); +} + +#ifdef CONFIG_ZRAM_GROUP_DEBUG +void zram_group_dump(struct zram_group *zgrp, u16 gid, u32 index) +{ + u32 hid, idx; + + if (!CHECK(zgrp, "zram group is not enable!\n")) + return; + hid = gid + zgrp->nr_obj; + if (gid == 0) { + struct zlist_node *node = NULL; + + if (!CHECK_BOUND(index, 0, zgrp->nr_obj - 1)) + return; + node = idx2node(index, zgrp->obj_tab); + pr_err("dump index %u = %u %u %u %u\n", index, + node->prev, node->next, + node->lock, node->priv); + } else { + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return; + pr_err("dump index of group %u\n", gid); + zlist_for_each_entry(idx, hid, zgrp->obj_tab) + pr_err("%u\n", idx); + } +} +#endif + +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK +/* + * idx2node for ext table + */ +static struct zlist_node *get_ext(u32 index, void *private) +{ + 
struct zram_group *zgrp = private; + + if (index < zgrp->wbgrp.nr_ext) + return &zgrp->wbgrp.ext[index]; + + index -= zgrp->wbgrp.nr_ext; + BUG_ON(!index); + return &zgrp->wbgrp.grp_ext_head[index]; +} + +/* + * disable writeback for zram group @zgrp + */ +void zram_group_remove_writeback(struct zram_group *zgrp) +{ + if (!CHECK(zgrp, "zram group is not enable!\n")) + return; + if (!CHECK(zgrp->wbgrp.enable, "zram group writeback is not enable!\n")) + return; + zgrp->wbgrp.enable = false; + vfree(zgrp->wbgrp.grp_ext_head); + vfree(zgrp->wbgrp.ext); + zlist_table_free(zgrp->wbgrp.ext_tab); + vfree(zgrp->wbgrp.ext_obj_head); + pr_info("zram group writeback is removed.\n"); +} + +/* + * init & enable writeback on exist zram group @zgrp with a backing device of + * @nr_ext extents. + */ +int zram_group_apply_writeback(struct zram_group *zgrp, u32 nr_ext) +{ + struct writeback_group *wbgrp = NULL; + u32 i; + int ret = 0; + + if (!CHECK(zgrp, "zram group is not enable!\n")) + return -EINVAL; + + mutex_lock(&zgrp->wbgrp.init_lock); + if (!CHECK(!zgrp->wbgrp.enable, "zram group writeback is already enable!\n")) + goto out; + if (!CHECK_BOUND(nr_ext, 1, ZGRP_MAX_EXT)) { + ret = -EINVAL; + goto out; + } + wbgrp = &zgrp->wbgrp; + wbgrp->nr_ext = nr_ext; + wbgrp->grp_ext_head = vmalloc(sizeof(struct zlist_node) * zgrp->nr_grp); + if (!wbgrp->grp_ext_head) { + ret = -ENOMEM; + goto out; + } + wbgrp->ext = vmalloc(sizeof(struct zlist_node) * wbgrp->nr_ext); + if (!wbgrp->ext) { + ret = -ENOMEM; + goto out; + } + wbgrp->ext_obj_head = vmalloc(sizeof(struct zlist_node) * wbgrp->nr_ext); + if (!wbgrp->ext_obj_head) { + ret = -ENOMEM; + goto out; + } + + wbgrp->ext_tab = zlist_table_alloc(get_ext, zgrp, GFP_KERNEL); + if (!wbgrp->ext_tab) { + ret = -ENOMEM; + goto out; + } + + for (i = 0; i < wbgrp->nr_ext; i++) + zlist_node_init(i, wbgrp->ext_tab); + for (i = 1; i < zgrp->nr_grp; i++) + zlist_node_init(i + wbgrp->nr_ext, wbgrp->ext_tab); + + for (i = 0; i < wbgrp->nr_ext; i++) + zlist_node_init(i + zgrp->nr_obj + zgrp->nr_grp, zgrp->obj_tab); + + init_waitqueue_head(&wbgrp->fault_wq); + wbgrp->enable = true; + pr_info("zram group writeback is enabled.\n"); +out: + mutex_unlock(&zgrp->wbgrp.init_lock); + + if (ret) { + zram_group_remove_writeback(zgrp); + pr_err("zram group writeback enable failed!\n"); + } + + return ret; +} + +/* + * attach extent at @eid to group @gid as the HOTTEST extent + */ +void zgrp_ext_insert(struct zram_group *zgrp, u32 eid, u16 gid) +{ + u32 hid; + + if (!CHECK(zgrp, "zram group is not enable!\n")) + return; + if (!CHECK(zgrp->wbgrp.enable, "zram group writeback is not enable!\n")) + return; + if (!CHECK_BOUND(eid, 0, zgrp->wbgrp.nr_ext - 1)) + return; + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return; + hid = gid + zgrp->wbgrp.nr_ext; + zlist_add(hid, eid, zgrp->wbgrp.ext_tab); + pr_info("insert extent %u to group %u\n", eid, gid); +} + +/* + * remove extent at @eid from group @gid + */ +bool zgrp_ext_delete(struct zram_group *zgrp, u32 eid, u16 gid) +{ + u32 hid; + bool isolated = false; + + if (!CHECK(zgrp, "zram group is not enable!\n")) + return false; + if (!CHECK(zgrp->wbgrp.enable, "zram group writeback is not enable!\n")) + return false; + if (!CHECK_BOUND(eid, 0, zgrp->wbgrp.nr_ext - 1)) + return false; + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return false; + + zlist_lock(eid, zgrp->wbgrp.ext_tab); + isolated = zlist_is_isolated_nolock(eid, zgrp->wbgrp.ext_tab); + zlist_unlock(eid, zgrp->wbgrp.ext_tab); + if (isolated) { + pr_info("extent %u is already 
isolated, skip delete.\n", eid); + return false; + } + + pr_info("delete extent %u from group %u\n", eid, gid); + hid = gid + zgrp->wbgrp.nr_ext; + return zlist_del(hid, eid, zgrp->wbgrp.ext_tab); +} + +/* + * try to isolate the first @nr exts of @gid, store their eids in array @eids + * and @return the cnt actually isolated. isolate all exts if nr is 0. + */ +u32 zgrp_isolate_exts(struct zram_group *zgrp, u16 gid, u32 *eids, u32 nr, bool *last) +{ + u32 hid, idx; + u32 cnt = 0; + u32 i; + + if (last) + *last = false; + if (!CHECK(zgrp, "zram group is not enable!\n")) + return 0; + if (!CHECK(zgrp->wbgrp.enable, "zram group writeback is not enable!\n")) + return 0; + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return 0; + if (!CHECK(eids, "return array eids is null!\n")) + return 0; + hid = gid + zgrp->wbgrp.nr_ext; + zlist_lock(hid, zgrp->wbgrp.ext_tab); + zlist_for_each_entry_reverse(idx, hid, zgrp->wbgrp.ext_tab) { + eids[cnt++] = idx; + if (nr && cnt == nr) + break; + } + for (i = 0; i < cnt; i++) + zlist_del_nolock(hid, eids[i], zgrp->wbgrp.ext_tab); + if (last) + *last = cnt && zlist_is_isolated_nolock(hid, zgrp->wbgrp.ext_tab); + zlist_unlock(hid, zgrp->wbgrp.ext_tab); + + pr_info("isolated %u exts from group %u.\n", cnt, gid); + + return cnt; +} + +/* + * insert obj at @index into extent @eid + */ +void wbgrp_obj_insert(struct zram_group *zgrp, u32 index, u32 eid) +{ + u32 hid; + + if (!CHECK(zgrp, "zram group is not enable!\n")) + return; + if (!CHECK(zgrp->wbgrp.enable, "zram group writeback is not enable!\n")) + return; + if (!CHECK_BOUND(index, 0, zgrp->nr_obj - 1)) + return; + if (!CHECK_BOUND(eid, 0, zgrp->wbgrp.nr_ext - 1)) + return; + hid = eid + zgrp->nr_obj + zgrp->nr_grp; + zlist_add_tail(hid, index, zgrp->obj_tab); + pr_info("insert obj %u to extent %u\n", index, eid); +} + +/* + * remove obj at @index from extent @eid + */ +bool wbgrp_obj_delete(struct zram_group *zgrp, u32 index, u32 eid) +{ + u32 hid; + + if (!CHECK(zgrp, "zram group is not enable!\n")) + return false; + if (!CHECK(zgrp->wbgrp.enable, "zram group writeback is not enable!\n")) + return false; + if (!CHECK_BOUND(index, 0, zgrp->nr_obj - 1)) + return false; + if (!CHECK_BOUND(eid, 0, zgrp->wbgrp.nr_ext - 1)) + return false; + pr_info("delete obj %u from extent %u\n", index, eid); + hid = eid + zgrp->nr_obj + zgrp->nr_grp; + + return zlist_del(hid, index, zgrp->obj_tab); +} + +/* + * try to isolate the first @nr writeback objs of @eid, store their indexes in + * array @idxs and @return the obj cnt actually isolated. isolate all objs if + * @nr is 0. 
+ */ +u32 wbgrp_isolate_objs(struct zram_group *zgrp, u32 eid, u32 *idxs, u32 nr, bool *last) +{ + u32 hid, idx; + u32 cnt = 0; + u32 i; + + if (last) + *last = false; + if (!CHECK(zgrp, "zram group is not enable!\n")) + return 0; + if (!CHECK(zgrp->wbgrp.enable, "zram group writeback is not enable!\n")) + return 0; + if (!CHECK_BOUND(eid, 0, zgrp->wbgrp.nr_ext - 1)) + return 0; + if (!CHECK(idxs, "return array idxs is null!\n")) + return 0; + hid = eid + zgrp->nr_obj + zgrp->nr_grp; + zlist_lock(hid, zgrp->obj_tab); + zlist_for_each_entry(idx, hid, zgrp->obj_tab) { + idxs[cnt++] = idx; + if (nr && cnt == nr) + break; + } + for (i = 0; i < cnt; i++) + zlist_del_nolock(hid, idxs[i], zgrp->obj_tab); + if (last) + *last = cnt && zlist_is_isolated_nolock(hid, zgrp->obj_tab); + zlist_unlock(hid, zgrp->obj_tab); + + pr_info("isolated %u objs from extent %u.\n", cnt, eid); + + return cnt; +} + +void wbgrp_obj_stats_inc(struct zram_group *zgrp, u16 gid, u32 eid, u32 size) +{ + if (!CHECK(zgrp, "zram group is not enable!\n")) + return; + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return; + if (!CHECK_BOUND(eid, 0, zgrp->wbgrp.nr_ext - 1)) + return; + + atomic_inc(&zgrp->stats[gid].wb_pages); + atomic64_add(size, &zgrp->stats[gid].wb_size); + atomic_inc(&zgrp->stats[0].wb_pages); + atomic64_add(size, &zgrp->stats[0].wb_size); +} + +void wbgrp_obj_stats_dec(struct zram_group *zgrp, u16 gid, u32 eid, u32 size) +{ + if (!CHECK(zgrp, "zram group is not enable!\n")) + return; + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return; + if (!CHECK_BOUND(eid, 0, zgrp->wbgrp.nr_ext - 1)) + return; + + atomic_dec(&zgrp->stats[gid].wb_pages); + atomic64_sub(size, &zgrp->stats[gid].wb_size); + atomic_dec(&zgrp->stats[0].wb_pages); + atomic64_sub(size, &zgrp->stats[0].wb_size); +} + +void wbgrp_fault_stats_inc(struct zram_group *zgrp, u16 gid, u32 eid, u32 size) +{ + if (!CHECK(zgrp, "zram group is not enable!\n")) + return; + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return; + if (!CHECK_BOUND(eid, 0, zgrp->wbgrp.nr_ext - 1)) + return; + + atomic64_inc(&zgrp->stats[gid].wb_fault); + atomic64_inc(&zgrp->stats[0].wb_fault); +} +#endif diff --git a/drivers/block/zram/zram_group/zram_group.h b/drivers/block/zram/zram_group/zram_group.h new file mode 100644 index 0000000000000000000000000000000000000000..7ac16ba87703a02e8ab2572602ec901c8199c2e0 --- /dev/null +++ b/drivers/block/zram/zram_group/zram_group.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * drivers/block/zram/zram_group/zram_group.h + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. 
+ */
+
+#ifndef _ZRAM_GROUP_H_
+#define _ZRAM_GROUP_H_
+
+#include
+#include
+
+#include "zlist.h"
+
+#define ZGRP_MAX_GRP USHRT_MAX
+#define ZGRP_MAX_OBJ (1 << 30)
+
+enum {
+	ZGRP_NONE = 0,
+	ZGRP_TRACK,
+#ifdef CONFIG_ZRAM_GROUP_WRITEBACK
+	ZGRP_WRITE,
+#endif
+};
+
+#ifdef CONFIG_ZRAM_GROUP_WRITEBACK
+#define ZGRP_MAX_EXT (ZLIST_IDX_MAX - ZGRP_MAX_GRP - ZGRP_MAX_OBJ)
+struct writeback_group {
+	bool enable;
+	u32 nr_ext;
+	struct zlist_node *grp_ext_head;
+	struct zlist_node *ext;
+	struct zlist_table *ext_tab;
+	struct zlist_node *ext_obj_head;
+	struct mutex init_lock;
+	wait_queue_head_t fault_wq;
+};
+#endif
+
+struct zram_group_stats {
+	atomic64_t zram_size;
+	atomic_t zram_pages;
+	atomic64_t zram_fault;
+#ifdef CONFIG_ZRAM_GROUP_WRITEBACK
+	atomic64_t wb_size;
+	atomic_t wb_pages;
+	atomic64_t wb_fault;
+	atomic_t wb_exts;
+	atomic64_t write_size;
+	atomic64_t read_size;
+#endif
+};
+
+struct zram_group {
+	u32 nr_obj;
+	u32 nr_grp;
+	struct zlist_node *grp_obj_head;
+	struct zlist_node *obj;
+	struct zlist_table *obj_tab;
+#ifdef CONFIG_ZRAM_GROUP_WRITEBACK
+	struct writeback_group wbgrp;
+#endif
+	struct group_swap_device *gsdev;
+	struct zram_group_stats *stats;
+};
+
+void zram_group_meta_free(struct zram_group *zgrp);
+struct zram_group *zram_group_meta_alloc(u32 nr_obj, u32 nr_grp);
+void zgrp_obj_insert(struct zram_group *zgrp, u32 index, u16 gid);
+bool zgrp_obj_delete(struct zram_group *zgrp, u32 index, u16 gid);
+u32 zgrp_isolate_objs(struct zram_group *zgrp, u16 gid, u32 *idxs, u32 nr, bool *last);
+bool zgrp_obj_is_isolated(struct zram_group *zgrp, u32 index);
+void zgrp_obj_putback(struct zram_group *zgrp, u32 index, u16 gid);
+void zgrp_obj_stats_inc(struct zram_group *zgrp, u16 gid, u32 size);
+void zgrp_obj_stats_dec(struct zram_group *zgrp, u16 gid, u32 size);
+void zgrp_fault_stats_inc(struct zram_group *zgrp, u16 gid, u32 size);
+
+#ifdef CONFIG_ZRAM_GROUP_DEBUG
+void zram_group_dump(struct zram_group *zgrp, u16 gid, u32 index);
+#endif
+
+#ifdef CONFIG_ZRAM_GROUP_WRITEBACK
+void zram_group_remove_writeback(struct zram_group *zgrp);
+int zram_group_apply_writeback(struct zram_group *zgrp, u32 nr_ext);
+void zgrp_ext_insert(struct zram_group *zgrp, u32 eid, u16 gid);
+bool zgrp_ext_delete(struct zram_group *zgrp, u32 eid, u16 gid);
+u32 zgrp_isolate_exts(struct zram_group *zgrp, u16 gid, u32 *eids, u32 nr, bool *last);
+void wbgrp_obj_insert(struct zram_group *zgrp, u32 index, u32 eid);
+bool wbgrp_obj_delete(struct zram_group *zgrp, u32 index, u32 eid);
+u32 wbgrp_isolate_objs(struct zram_group *zgrp, u32 eid, u32 *idxs, u32 nr, bool *last);
+void wbgrp_obj_stats_inc(struct zram_group *zgrp, u16 gid, u32 eid, u32 size);
+void wbgrp_obj_stats_dec(struct zram_group *zgrp, u16 gid, u32 eid, u32 size);
+void wbgrp_fault_stats_inc(struct zram_group *zgrp, u16 gid, u32 eid, u32 size);
+#endif
+#endif
diff --git a/drivers/hyperhold/Kconfig b/drivers/hyperhold/Kconfig
new file mode 100644
index 0000000000000000000000000000000000000000..8e5e7a1ee95734f9d3ffdc786afc484cf88f905a
--- /dev/null
+++ b/drivers/hyperhold/Kconfig
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0
+config HYPERHOLD
+	bool "Hyperhold driver"
+	default n
+	help
+	  Hyperhold driver.
+
+config HYPERHOLD_DEBUG
+	bool "Debug info for Hyperhold driver"
+	depends on HYPERHOLD
+	help
+	  Debug info for Hyperhold driver.
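Note on the indexing scheme declared in zram_group.h above: per-object nodes, per-group list heads and, when writeback is enabled, per-extent object-list heads all share a single zlist index space. Object indexes occupy [0, nr_obj), the head of group gid lives at index nr_obj + gid (gid 0 is reserved for global stats), and the object-list head of extent eid lives at nr_obj + nr_grp + eid; get_obj() in zram_group.c resolves indexes exactly this way. The following is a minimal standalone userspace sketch of that mapping, with made-up sizes and helper names that are not part of the patch:

#include <stdio.h>

/* hypothetical sizes, standing in for zram_group_meta_alloc()/apply_writeback() */
#define NR_OBJ 1024	/* one node per zram slot */
#define NR_GRP 16	/* group 0 reserved for global stats */
#define NR_EXT 8	/* backing-device extents (writeback only) */

/* resolve a zlist index to the array it belongs to, like get_obj() does */
static const char *which_table(unsigned int idx, unsigned int *local)
{
	if (idx < NR_OBJ) {
		*local = idx;		/* per-object node */
		return "obj";
	}
	idx -= NR_OBJ;
	if (idx < NR_GRP) {
		*local = idx;		/* head of group idx's object list */
		return "grp_obj_head";
	}
	idx -= NR_GRP;
	if (idx < NR_EXT) {
		*local = idx;		/* head of extent idx's object list */
		return "ext_obj_head";
	}
	*local = 0;
	return "invalid";
}

int main(void)
{
	unsigned int samples[] = { 5, NR_OBJ + 3, NR_OBJ + NR_GRP + 2 };
	unsigned int local, i;

	for (i = 0; i < 3; i++) {
		const char *tab = which_table(samples[i], &local);

		printf("index %u -> %s[%u]\n", samples[i], tab, local);
	}
	return 0;
}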
diff --git a/drivers/hyperhold/Makefile b/drivers/hyperhold/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..b45a1a6784669913d3f484bd2b6f7665724e4d3b --- /dev/null +++ b/drivers/hyperhold/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 +hyperhold-y := hp_core.o hp_device.o hp_space.o hp_iotab.o + +obj-$(CONFIG_HYPERHOLD) += hyperhold.o diff --git a/drivers/hyperhold/hp_core.c b/drivers/hyperhold/hp_core.c new file mode 100644 index 0000000000000000000000000000000000000000..86a9e4704f2ea1a0250e87e7a3e1df8698075bdf --- /dev/null +++ b/drivers/hyperhold/hp_core.c @@ -0,0 +1,654 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * drivers/hyperhold/hp_core.c + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. + */ + + #define pr_fmt(fmt) "[HYPERHOLD]" fmt + +#include +#include +#include + +#include "hyperhold.h" +#include "hp_device.h" +#include "hp_space.h" +#include "hp_iotab.h" + +#ifdef CONFIG_HYPERHOLD_DEBUG +#define HP_DFLT_DEVICE "/dev/loop6" +#else +#define HP_DFLT_DEVICE "/dev/by-name/hyperhold" +#endif +#define HP_DFLT_EXT_SIZE (1 << 15) +#define HP_DEV_NAME_LEN 256 +#define HP_STATE_LEN 10 + +#define CHECK(cond, ...) ((cond) || (pr_err(__VA_ARGS__), false)) +#define CHECK_BOUND(var, min, max) \ + CHECK((var) >= (min) && (var) <= (max), \ + "%s %u out of bounds %u ~ %u!\n", #var, (var), (min), (max)) +#define CHECK_INITED CHECK(hyperhold.inited, "hyperhold is not enable!\n") +#define CHECK_ENABLE (CHECK_INITED && CHECK(hyperhold.enable, "hyperhold is readonly!\n")) + +struct hyperhold { + bool enable; + bool inited; + + char device_name[HP_DEV_NAME_LEN]; + u32 extent_size; + + struct hp_device dev; + struct hp_space spc; + + struct workqueue_struct *read_wq; + struct workqueue_struct *write_wq; + + struct mutex init_lock; +}; + +struct hyperhold hyperhold; + +atomic64_t mem_used = ATOMIC64_INIT(0); +#ifdef CONFIG_HYPERHOLD_DEBUG +/* + * return the memory overhead of hyperhold module + */ +u64 hyperhold_memory_used(void) +{ + return atomic64_read(&mem_used) + hpio_memory() + space_memory(); +} +#endif + +void hyperhold_disable(bool force) +{ + if (!CHECK_INITED) + return; + if (!force && !CHECK_ENABLE) + return; + + mutex_lock(&hyperhold.init_lock); + hyperhold.enable = false; + if (!wait_for_space_empty(&hyperhold.spc, force)) + goto out; + hyperhold.inited = false; + wait_for_iotab_empty(); + if (hyperhold.read_wq) + destroy_workqueue(hyperhold.read_wq); + if (hyperhold.write_wq) + destroy_workqueue(hyperhold.write_wq); + deinit_space(&hyperhold.spc); + unbind_bdev(&hyperhold.dev); +out: + if (hyperhold.inited) + pr_info("hyperhold is disabled, read only.\n"); + else + pr_info("hyperhold is totally disabled!\n"); + mutex_unlock(&hyperhold.init_lock); +} +EXPORT_SYMBOL(hyperhold_disable); + +void hyperhold_enable(void) +{ + bool enable = true; + + if (hyperhold.inited) + goto out; + + mutex_lock(&hyperhold.init_lock); + if (hyperhold.inited) + goto unlock; + if (!bind_bdev(&hyperhold.dev, hyperhold.device_name)) + goto err; + if (!init_space(&hyperhold.spc, hyperhold.dev.dev_size, hyperhold.extent_size)) + goto err; + hyperhold.read_wq = alloc_workqueue("hyperhold_read", WQ_HIGHPRI | WQ_UNBOUND, 0); + if (!hyperhold.read_wq) + goto err; + hyperhold.write_wq = alloc_workqueue("hyperhold_write", 0, 0); + if (!hyperhold.write_wq) + goto err; + hyperhold.inited = true; + goto unlock; +err: + if (hyperhold.read_wq) + destroy_workqueue(hyperhold.read_wq); + if (hyperhold.write_wq) + destroy_workqueue(hyperhold.write_wq); + 
deinit_space(&hyperhold.spc); + unbind_bdev(&hyperhold.dev); + enable = false; +unlock: + mutex_unlock(&hyperhold.init_lock); +out: + if (enable) { + hyperhold.enable = true; + pr_info("hyperhold is enabled.\n"); + } else { + hyperhold.enable = false; + pr_err("hyperhold enable failed!\n"); + } +} +EXPORT_SYMBOL(hyperhold_enable); + +static int hyperhold_sysctl_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + if (write) { + if (!strcmp(buffer, "enable\n")) + hyperhold_enable(); + else if (!strcmp(buffer, "disable\n")) + hyperhold_disable(false); + else if (!strcmp(buffer, "force_disable\n")) + hyperhold_disable(true); + } else { + if (*lenp < HP_STATE_LEN || *ppos) { + *lenp = 0; + return 0; + } + if (hyperhold.enable) + strcpy(buffer, "enable\n"); + else if (hyperhold.inited) + strcpy(buffer, "readonly\n"); + else + strcpy(buffer, "disable\n"); + *lenp = strlen(buffer); + *ppos += *lenp; +#ifdef CONFIG_HYPERHOLD_DEBUG + pr_info("hyperhold memory overhead = %llu.\n", hyperhold_memory_used()); +#endif + } + return 0; +} + +static struct ctl_table_header *hp_sysctl_header; +static struct ctl_table hp_table[] = { + { + .procname = "enable", + .mode = 0644, + .proc_handler = hyperhold_sysctl_handler, + }, + { + .procname = "device", + .data = &hyperhold.device_name, + .maxlen = sizeof(hyperhold.device_name), + .mode = 0644, + .proc_handler = proc_dostring, + }, + { + .procname = "extent_size", + .data = &hyperhold.extent_size, + .maxlen = sizeof(hyperhold.extent_size), + .mode = 0644, + .proc_handler = proc_douintvec, + }, + {} +}; +static struct ctl_table hp_kernel_table[] = { + { + .procname = "hyperhold", + .mode = 0555, + .child = hp_table, + }, + {} +}; +static struct ctl_table hp_sys_table[] = { + { + .procname = "kernel", + .mode = 0555, + .child = hp_kernel_table, + }, + {} +}; + +bool is_hyperhold_enable(void) +{ + return CHECK_ENABLE; +} + +static int __init hyperhold_init(void) +{ + strcpy(hyperhold.device_name, HP_DFLT_DEVICE); + hyperhold.extent_size = HP_DFLT_EXT_SIZE; + mutex_init(&hyperhold.init_lock); + hp_sysctl_header = register_sysctl_table(hp_sys_table); + if (!hp_sysctl_header) { + pr_err("register hyperhold sysctl table failed!\n"); + return -EINVAL; + } + + return 0; +} + +static void __exit hyperhold_exit(void) +{ + unregister_sysctl_table(hp_sysctl_header); + hyperhold_disable(true); +} + +static struct hp_space *space_of(u32 eid) +{ + return &hyperhold.spc; +} + +/* replace this func for multi devices */ +static struct hp_device *device_of(u32 eid) +{ + return &hyperhold.dev; +} + +/* replace this func for multi devices */ +u32 hyperhold_nr_extent(void) +{ + if (!CHECK_INITED) + return 0; + + return hyperhold.spc.nr_ext; +} +EXPORT_SYMBOL(hyperhold_nr_extent); + +u32 hyperhold_extent_size(u32 eid) +{ + struct hp_space *spc = NULL; + + if (!CHECK_INITED) + return 0; + spc = space_of(eid); + if (!CHECK(spc, "invalid eid %u!\n", eid)) + return 0; + + return spc->ext_size; +} +EXPORT_SYMBOL(hyperhold_extent_size); + +/* replace this func for multi devices */ +long hyperhold_address(u32 eid, u32 offset) +{ + struct hp_space *spc = NULL; + + if (!CHECK_INITED) + return -EINVAL; + spc = space_of(eid); + if (!CHECK(spc, "invalid eid %u!\n", eid)) + return -EINVAL; + if (!CHECK_BOUND(offset, 0, spc->ext_size - 1)) + return -EINVAL; + + return (u64)eid * spc->ext_size + offset; +} +EXPORT_SYMBOL(hyperhold_address); + +/* replace this func for multi devices */ +int hyperhold_addr_extent(u64 addr) +{ + struct hp_space *spc = NULL; + 
u32 eid; + + if (!CHECK_INITED) + return -EINVAL; + eid = addr / hyperhold.spc.ext_size; + spc = space_of(eid); + if (!CHECK(spc, "invalid eid %u!\n", eid)) + return -EINVAL; + + return eid; +} +EXPORT_SYMBOL(hyperhold_addr_extent); + +/* replace this func for multi devices */ +int hyperhold_addr_offset(u64 addr) +{ + if (!CHECK_INITED) + return -EINVAL; + + return addr % hyperhold.spc.ext_size; +} +EXPORT_SYMBOL(hyperhold_addr_offset); + +/* replace this func for multi devices */ +int hyperhold_alloc_extent(void) +{ + if (!CHECK_ENABLE) + return -EINVAL; + + return alloc_eid(&hyperhold.spc); +} +EXPORT_SYMBOL(hyperhold_alloc_extent); + +void hyperhold_free_extent(u32 eid) +{ + struct hp_space *spc = NULL; + + if (!CHECK_INITED) + return; + spc = space_of(eid); + if (!CHECK(spc, "invalid eid %u!\n", eid)) + return; + + free_eid(spc, eid); +} +EXPORT_SYMBOL(hyperhold_free_extent); + +void hyperhold_should_free_extent(u32 eid) +{ + struct hpio *hpio = NULL; + struct hp_space *spc = NULL; + + if (!CHECK_INITED) + return; + spc = space_of(eid); + if (!CHECK(spc, "invalid eid %u", eid)) + return; + + hpio = hpio_get(eid); + if (!hpio) { + free_eid(spc, eid); + return; + } + hpio->free_extent = hyperhold_free_extent; + hpio_put(hpio); +} +EXPORT_SYMBOL(hyperhold_should_free_extent); + +/* + * alloc hpio struct for r/w extent at @eid, will fill hpio with new alloced + * pages if @new_page. @return NULL on fail. + */ +struct hpio *hyperhold_io_alloc(u32 eid, gfp_t gfp, unsigned int op, bool new_page) +{ + struct hpio *hpio = NULL; + struct hp_space *spc; + u32 nr_page; + + if (!CHECK_ENABLE) + return NULL; + spc = space_of(eid); + if (!CHECK(spc, "invalid eid %u!\n", eid)) + return NULL; + + nr_page = spc->ext_size / PAGE_SIZE; + hpio = hpio_alloc(nr_page, gfp, op, new_page); + if (!hpio) + goto err; + hpio->eid = eid; + + return hpio; +err: + hpio_free(hpio); + + return NULL; +} +EXPORT_SYMBOL(hyperhold_io_alloc); + +void hyperhold_io_free(struct hpio *hpio) +{ + if (!CHECK_INITED) + return; + if (!CHECK(hpio, "hpio is null!\n")) + return; + + hpio_free(hpio); +} +EXPORT_SYMBOL(hyperhold_io_free); + +/* + * find exist read hpio of the extent @eid in iotab and inc its refcnt, + * alloc a new hpio and insert it into iotab if there is no hpio for @eid + */ +struct hpio *hyperhold_io_get(u32 eid, gfp_t gfp, unsigned int op) +{ + struct hp_space *spc = NULL; + u32 nr_page; + + if (!CHECK_INITED) + return NULL; + spc = space_of(eid); + if (!CHECK(spc, "invalid eid %u", eid)) + return NULL; + + nr_page = spc->ext_size / PAGE_SIZE; + return hpio_get_alloc(eid, nr_page, gfp, op); +} +EXPORT_SYMBOL(hyperhold_io_get); + +bool hyperhold_io_put(struct hpio *hpio) +{ + if (!CHECK_INITED) + return false; + if (!CHECK(hpio, "hpio is null!\n")) + return false; + + return hpio_put(hpio); +} +EXPORT_SYMBOL(hyperhold_io_put); + +/* + * notify all threads waiting for this hpio + */ +void hyperhold_io_complete(struct hpio *hpio) +{ + if (!CHECK_INITED) + return; + if (!CHECK(hpio, "hpio is null!\n")) + return; + + hpio_complete(hpio); +} +EXPORT_SYMBOL(hyperhold_io_complete); + +void hyperhold_io_wait(struct hpio *hpio) +{ + if (!CHECK_INITED) + return; + if (!CHECK(hpio, "hpio is null!\n")) + return; + + hpio_wait(hpio); +} +EXPORT_SYMBOL(hyperhold_io_wait); + +bool hyperhold_io_success(struct hpio *hpio) +{ + if (!CHECK_INITED) + return false; + if (!CHECK(hpio, "hpio is null!\n")) + return false; + + return hpio_get_state(hpio) == HPIO_DONE; +} +EXPORT_SYMBOL(hyperhold_io_success); + +int hyperhold_io_extent(struct 
hpio *hpio) +{ + if (!CHECK_INITED) + return -EINVAL; + if (!CHECK(hpio, "hpio is null!\n")) + return -EINVAL; + + return hpio->eid; +} +EXPORT_SYMBOL(hyperhold_io_extent); + +int hyperhold_io_operate(struct hpio *hpio) +{ + if (!CHECK_INITED) + return -EINVAL; + if (!CHECK(hpio, "hpio is null!\n")) + return -EINVAL; + + return hpio->op; +} +EXPORT_SYMBOL(hyperhold_io_operate); + +struct page *hyperhold_io_page(struct hpio *hpio, u32 index) +{ + if (!CHECK_INITED) + return NULL; + if (!CHECK(hpio, "hpio is null!\n")) + return NULL; + if (!CHECK_BOUND(index, 0, hpio->nr_page - 1)) + return NULL; + + return hpio->pages[index]; +} +EXPORT_SYMBOL(hyperhold_io_page); + +bool hyperhold_io_add_page(struct hpio *hpio, u32 index, struct page *page) +{ + if (!CHECK_INITED) + return false; + if (!CHECK(hpio, "hpio is null!\n")) + return false; + if (!CHECK(page, "page is null!\n")) + return false; + if (!CHECK_BOUND(index, 0, hpio->nr_page - 1)) + return false; + + get_page(page); + atomic64_add(PAGE_SIZE, &mem_used); + BUG_ON(hpio->pages[index]); + hpio->pages[index] = page; + + return true; +} +EXPORT_SYMBOL(hyperhold_io_add_page); + +u32 hyperhold_io_nr_page(struct hpio *hpio) +{ + if (!CHECK_INITED) + return 0; + if (!CHECK(hpio, "hpio is null!\n")) + return 0; + + return hpio->nr_page; +} +EXPORT_SYMBOL(hyperhold_io_nr_page); + +void *hyperhold_io_private(struct hpio *hpio) +{ + if (!CHECK_INITED) + return NULL; + if (!CHECK(hpio, "hpio is null!\n")) + return NULL; + + return hpio->private; +} +EXPORT_SYMBOL(hyperhold_io_private); + +static void hp_endio_work(struct work_struct *work) +{ + struct hpio *hpio = container_of(work, struct hpio, endio_work); + + if (hpio->endio) + hpio->endio(hpio); +} + +static void hpio_endio(struct bio *bio) +{ + struct hpio *hpio = bio->bi_private; + struct workqueue_struct *wq = NULL; + + pr_info("hpio %p for eid %u returned %d.\n", + hpio, hpio->eid, bio->bi_status); + hpio_set_state(hpio, bio->bi_status ? HPIO_FAIL : HPIO_DONE); + wq = op_is_write(hpio->op) ? 
hyperhold.write_wq : hyperhold.read_wq; + queue_work(wq, &hpio->endio_work); + bio_put(bio); + atomic64_sub(sizeof(struct bio), &mem_used); +} + +static int hpio_submit(struct hpio *hpio) +{ + struct hp_device *dev = NULL; + struct bio *bio = NULL; + u32 ext_size; + sector_t sec; + int i; + + bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); + if (!bio) { + pr_err("bio alloc failed!\n"); + return -ENOMEM; + } + atomic64_add(sizeof(struct bio), &mem_used); + + dev = device_of(hpio->eid); + bio_set_op_attrs(bio, hpio->op, 0); + bio_set_dev(bio, dev->bdev); + + ext_size = space_of(hpio->eid)->ext_size; + sec = (u64)hpio->eid * ext_size / dev->sec_size; + bio->bi_iter.bi_sector = sec; + for (i = 0; i < hpio->nr_page; i++) { + if (!hpio->pages[i]) + break; + hpio->pages[i]->index = sec; + if (!bio_add_page(bio, hpio->pages[i], PAGE_SIZE, 0)) + goto err; + sec += PAGE_SIZE / dev->sec_size; + } + + bio->bi_private = hpio; + bio->bi_end_io = hpio_endio; + submit_bio(bio); + pr_info("submit hpio %p for eid %u.\n", hpio, hpio->eid); + + return 0; +err: + bio_put(bio); + atomic64_sub(sizeof(struct bio), &mem_used); + return -EIO; +} + +static int rw_extent_async(struct hpio *hpio, hp_endio endio, void *priv, unsigned int op) +{ + int ret = 0; + + if (!hpio_change_state(hpio, HPIO_INIT, HPIO_SUBMIT)) + return -EAGAIN; + + hpio->private = priv; + hpio->endio = endio; + INIT_WORK(&hpio->endio_work, hp_endio_work); + + ret = hpio_submit(hpio); + if (ret) { + hpio_set_state(hpio, HPIO_FAIL); + hpio_complete(hpio); + } + + return ret; +} + +int hyperhold_write_async(struct hpio *hpio, hp_endio endio, void *priv) +{ + if (!CHECK_ENABLE) { + hpio_set_state(hpio, HPIO_FAIL); + hpio_complete(hpio); + return -EINVAL; + } + + BUG_ON(!op_is_write(hpio->op)); + + return rw_extent_async(hpio, endio, priv, REQ_OP_WRITE); +} +EXPORT_SYMBOL(hyperhold_write_async); + +int hyperhold_read_async(struct hpio *hpio, hp_endio endio, void *priv) +{ + if (!CHECK_INITED) { + hpio_set_state(hpio, HPIO_FAIL); + hpio_complete(hpio); + return -EINVAL; + } + + if (op_is_write(hpio->op)) + return -EAGAIN; + + return rw_extent_async(hpio, endio, priv, REQ_OP_READ); +} +EXPORT_SYMBOL(hyperhold_read_async); + +module_init(hyperhold_init) +module_exit(hyperhold_exit) diff --git a/drivers/hyperhold/hp_device.c b/drivers/hyperhold/hp_device.c new file mode 100644 index 0000000000000000000000000000000000000000..0fd81be5ffa819bf52beeafe7cb49b64d55888f5 --- /dev/null +++ b/drivers/hyperhold/hp_device.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * drivers/hyperhold/hp_device.c + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. 
+ */ + +#define pr_fmt(fmt) "[HYPERHOLD]" fmt + +#include + +#include "hp_device.h" + +void unbind_bdev(struct hp_device *dev) +{ + int ret; + + if (!dev->bdev) + goto close; + if (!dev->old_block_size) + goto put; + ret = set_blocksize(dev->bdev, dev->old_block_size); + if (ret) + pr_err("set old block size %d failed, err = %d!\n", + dev->old_block_size, ret); + dev->old_block_size = 0; +put: + blkdev_put(dev->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + dev->bdev = NULL; +close: + if (dev->filp) + filp_close(dev->filp, NULL); + dev->filp = NULL; + + pr_info("hyperhold bdev unbinded.\n"); +} + +bool bind_bdev(struct hp_device *dev, const char *name) +{ + struct inode *inode = NULL; + int ret; + + dev->filp = filp_open(name, O_RDWR | O_LARGEFILE, 0); + if (IS_ERR(dev->filp)) { + pr_err("open file %s failed, err = %ld!\n", name, PTR_ERR(dev->filp)); + dev->filp = NULL; + goto err; + } + inode = dev->filp->f_mapping->host; + if (!S_ISBLK(inode->i_mode)) { + pr_err("%s is not a block device!\n", name); + goto err; + } + dev->bdev = blkdev_get_by_dev(inode->i_rdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, dev); + if (IS_ERR(dev->bdev)) { + ret = PTR_ERR(dev->bdev); + dev->bdev = NULL; + pr_err("get blkdev %s failed, err = %d!\n", name, ret); + goto err; + } + dev->old_block_size = block_size(dev->bdev); + ret = set_blocksize(dev->bdev, PAGE_SIZE); + if (ret) { + pr_err("set %s block size failed, err = %d!\n", name, ret); + goto err; + } + dev->dev_size = (u64)i_size_read(inode); + dev->sec_size = SECTOR_SIZE; + + pr_info("hyperhold bind bdev %s of size %llu / %u succ.\n", + name, dev->dev_size, dev->sec_size); + + return true; +err: + unbind_bdev(dev); + + return false; +} diff --git a/drivers/hyperhold/hp_device.h b/drivers/hyperhold/hp_device.h new file mode 100644 index 0000000000000000000000000000000000000000..52d5de370fdaa9fe0e7cf1589da55ec7c86e0ae2 --- /dev/null +++ b/drivers/hyperhold/hp_device.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * drivers/hyperhold/hp_device.h + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. + */ + +#ifndef _HP_DEVICE_H_ +#define _HP_DEVICE_H_ + +#include + +struct hp_device { + struct file *filp; + struct block_device *bdev; + u32 old_block_size; + u64 dev_size; + u32 sec_size; +}; + +void unbind_bdev(struct hp_device *dev); +bool bind_bdev(struct hp_device *dev, const char *name); +#endif diff --git a/drivers/hyperhold/hp_iotab.c b/drivers/hyperhold/hp_iotab.c new file mode 100644 index 0000000000000000000000000000000000000000..258cb83a16c33e273567ba5f40ef90fa3ef60456 --- /dev/null +++ b/drivers/hyperhold/hp_iotab.c @@ -0,0 +1,271 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * drivers/hyperhold/hp_iotab.c + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. 
+ */ + +#define pr_fmt(fmt) "[HYPERHOLD]" fmt + +#include +#include + +#include "hp_iotab.h" + +atomic64_t hpio_mem = ATOMIC64_INIT(0); +u64 hpio_memory(void) +{ + return atomic64_read(&hpio_mem); +} + +struct hp_iotab { + struct list_head io_list; + rwlock_t lock; + u32 io_cnt; + wait_queue_head_t empty_wq; +}; + +/* store all inflight hpio in iotab */ +struct hp_iotab iotab = { + .io_list = LIST_HEAD_INIT(iotab.io_list), + .lock = __RW_LOCK_UNLOCKED(iotab.lock), + .io_cnt = 0, + .empty_wq = __WAIT_QUEUE_HEAD_INITIALIZER(iotab.empty_wq), +}; + +static struct hpio *__iotab_search_get(struct hp_iotab *iotab, u32 eid) +{ + struct hpio *hpio = NULL; + + list_for_each_entry(hpio, &iotab->io_list, list) + if (hpio->eid == eid && kref_get_unless_zero(&hpio->refcnt)) + return hpio; + + return NULL; +} + +static struct hpio *iotab_search_get(struct hp_iotab *iotab, u32 eid) +{ + struct hpio *hpio = NULL; + unsigned long flags; + + read_lock_irqsave(&iotab->lock, flags); + hpio = __iotab_search_get(iotab, eid); + read_unlock_irqrestore(&iotab->lock, flags); + + pr_info("find hpio %p for eid %u.\n", hpio, eid); + + return hpio; +} + +/* + * insert @hpio into @iotab, cancel insertion if there is a hpio of the same + * @eid, inc the refcnt of duplicated hpio and return it + */ +static struct hpio *iotab_insert(struct hp_iotab *iotab, struct hpio *hpio) +{ + struct hpio *dup = NULL; + unsigned long flags; + + write_lock_irqsave(&iotab->lock, flags); + dup = __iotab_search_get(iotab, hpio->eid); + if (dup) { + pr_info("find exist hpio %p for eid %u, insert hpio %p failed.\n", + dup, hpio->eid, hpio); + goto unlock; + } + list_add(&hpio->list, &iotab->io_list); + iotab->io_cnt++; + pr_info("insert new hpio %p for eid %u.\n", hpio, hpio->eid); +unlock: + write_unlock_irqrestore(&iotab->lock, flags); + + return dup; +} + +static void iotab_delete(struct hp_iotab *iotab, struct hpio *hpio) +{ + unsigned long flags; + + write_lock_irqsave(&iotab->lock, flags); + list_del(&hpio->list); + iotab->io_cnt--; + if (!iotab->io_cnt) + wake_up(&iotab->empty_wq); + write_unlock_irqrestore(&iotab->lock, flags); + + pr_info("delete hpio %p for eid %u from iotab.\n", hpio, hpio->eid); +} + +static void hpio_clear_pages(struct hpio *hpio) +{ + int i; + + if (!hpio->pages) + return; + + for (i = 0; i < hpio->nr_page; i++) + if (hpio->pages[i]) { + put_page(hpio->pages[i]); + atomic64_sub(PAGE_SIZE, &hpio_mem); + } + kfree(hpio->pages); + atomic64_sub(sizeof(struct page *) * hpio->nr_page, &hpio_mem); + hpio->nr_page = 0; + hpio->pages = NULL; +} + +/* + * alloc pages array for @hpio, fill in new alloced pages if @new_page + */ +static bool hpio_fill_pages(struct hpio *hpio, u32 nr_page, gfp_t gfp, bool new_page) +{ + int i; + + BUG_ON(hpio->pages); + hpio->nr_page = nr_page; + hpio->pages = kcalloc(hpio->nr_page, sizeof(struct page *), gfp); + if (!hpio->pages) + goto err; + atomic64_add(sizeof(struct page *) * hpio->nr_page, &hpio_mem); + + if (!new_page) + goto out; + for (i = 0; i < hpio->nr_page; i++) { + hpio->pages[i] = alloc_page(gfp); + if (!hpio->pages[i]) + goto err; + atomic64_add(PAGE_SIZE, &hpio_mem); + } +out: + return true; +err: + hpio_clear_pages(hpio); + + return false; +} + +void hpio_free(struct hpio *hpio) +{ + if (!hpio) + return; + + pr_info("free hpio = %p.\n", hpio); + + hpio_clear_pages(hpio); + kfree(hpio); + atomic64_sub(sizeof(struct hpio), &hpio_mem); +} + +struct hpio *hpio_alloc(u32 nr_page, gfp_t gfp, unsigned int op, bool new_page) +{ + struct hpio *hpio = NULL; + + hpio = 
kzalloc(sizeof(struct hpio), gfp); + if (!hpio) + goto err; + atomic64_add(sizeof(struct hpio), &hpio_mem); + if (!hpio_fill_pages(hpio, nr_page, gfp, new_page)) + goto err; + hpio->op = op; + atomic_set(&hpio->state, HPIO_INIT); + kref_init(&hpio->refcnt); + init_completion(&hpio->wait); + + return hpio; +err: + hpio_free(hpio); + + return NULL; +} + +struct hpio *hpio_get(u32 eid) +{ + return iotab_search_get(&iotab, eid); +} + +struct hpio *hpio_get_alloc(u32 eid, u32 nr_page, gfp_t gfp, unsigned int op) +{ + struct hpio *hpio = NULL; + struct hpio *dup = NULL; + + hpio = iotab_search_get(&iotab, eid); + if (hpio) { + pr_info("find exist hpio %p for eid %u.\n", hpio, eid); + goto out; + } + hpio = hpio_alloc(nr_page, gfp, op, true); + if (!hpio) + goto out; + hpio->eid = eid; + + pr_info("alloc hpio %p for eid %u.\n", hpio, eid); + + dup = iotab_insert(&iotab, hpio); + if (dup) { + hpio_free(hpio); + hpio = dup; + } +out: + return hpio; +} + +static void hpio_release(struct kref *kref) +{ + struct hpio *hpio = container_of(kref, struct hpio, refcnt); + + iotab_delete(&iotab, hpio); + if (hpio->free_extent) + hpio->free_extent(hpio->eid); + hpio_free(hpio); +} + +bool hpio_put(struct hpio *hpio) +{ + pr_info("put hpio %p for eid %u, ref = %u.\n", hpio, hpio->eid, kref_read(&hpio->refcnt)); + return kref_put(&hpio->refcnt, hpio_release); +} + +void hpio_complete(struct hpio *hpio) +{ + pr_info("complete hpio %p for eid %u.\n", hpio, hpio->eid); + complete_all(&hpio->wait); +} + +void hpio_wait(struct hpio *hpio) +{ + wait_for_completion(&hpio->wait); +} + +enum hpio_state hpio_get_state(struct hpio *hpio) +{ + return atomic_read(&hpio->state); +} + +void hpio_set_state(struct hpio *hpio, enum hpio_state state) +{ + atomic_set(&hpio->state, state); +} + +bool hpio_change_state(struct hpio *hpio, enum hpio_state from, enum hpio_state to) +{ + return atomic_cmpxchg(&hpio->state, from, to) == from; +} + +static void dump_iotab(struct hp_iotab *iotab) +{ + struct hpio *hpio = NULL; + unsigned long flags; + + pr_info("dump inflight hpio in iotab.\n"); + read_lock_irqsave(&iotab->lock, flags); + list_for_each_entry(hpio, &iotab->io_list, list) + pr_info("hpio %p for eid %u is inflight.\n", hpio, hpio->eid); + read_unlock_irqrestore(&iotab->lock, flags); +} + +void wait_for_iotab_empty(void) +{ + dump_iotab(&iotab); + wait_event(iotab.empty_wq, !iotab.io_cnt); +} diff --git a/drivers/hyperhold/hp_iotab.h b/drivers/hyperhold/hp_iotab.h new file mode 100644 index 0000000000000000000000000000000000000000..a2f03620af13c73aa5cd9a018e8d8165617bae5a --- /dev/null +++ b/drivers/hyperhold/hp_iotab.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * drivers/hyperhold/hp_iotab.h + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. 
+ */ + +#ifndef _HP_IOTAB_H_ +#define _HP_IOTAB_H_ + +#include +#include +#include +#include + +enum hpio_state { + HPIO_INIT, + HPIO_SUBMIT, + HPIO_DONE, + HPIO_FAIL, +}; + +struct hpio; + +typedef void (*hp_endio)(struct hpio *); + +struct hpio { + u32 eid; + struct page **pages; + u32 nr_page; + void *private; + + unsigned int op; + void (*free_extent)(u32 eid); + + atomic_t state; + struct kref refcnt; + struct completion wait; + hp_endio endio; + struct work_struct endio_work; + + struct list_head list; +}; + +struct hpio *hpio_alloc(u32 nr_page, gfp_t gfp, unsigned int op, bool new_page); +void hpio_free(struct hpio *hpio); + +struct hpio *hpio_get(u32 eid); +bool hpio_put(struct hpio *hpio); +struct hpio *hpio_get_alloc(u32 eid, u32 nr_page, gfp_t gfp, unsigned int op); + +void hpio_complete(struct hpio *hpio); +void hpio_wait(struct hpio *hpio); + +enum hpio_state hpio_get_state(struct hpio *hpio); +void hpio_set_state(struct hpio *hpio, enum hpio_state state); +bool hpio_change_state(struct hpio *hpio, enum hpio_state from, enum hpio_state to); + +void wait_for_iotab_empty(void); + +u64 hpio_memory(void); +#endif diff --git a/drivers/hyperhold/hp_space.c b/drivers/hyperhold/hp_space.c new file mode 100644 index 0000000000000000000000000000000000000000..95d42d064290edf311be568890362f38b575b657 --- /dev/null +++ b/drivers/hyperhold/hp_space.c @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * drivers/hyperhold/hp_space.c + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. + */ + +#define pr_fmt(fmt) "[HYPERHOLD]" fmt + +#include + +#include "hp_space.h" + +atomic64_t spc_mem = ATOMIC64_INIT(0); + +u64 space_memory(void) +{ + return atomic64_read(&spc_mem); +} + +void deinit_space(struct hp_space *spc) +{ + kvfree(spc->bitmap); + atomic64_sub(BITS_TO_LONGS(spc->nr_ext) * sizeof(long), &spc_mem); + spc->ext_size = 0; + spc->nr_ext = 0; + atomic_set(&spc->last_alloc_bit, 0); + atomic_set(&spc->nr_alloced, 0); + + pr_info("hyperhold space deinited.\n"); +} + +bool init_space(struct hp_space *spc, u64 dev_size, u32 ext_size) +{ + if (ext_size & (PAGE_SIZE - 1)) { + pr_err("extent size %u do not align to page size %lu!", ext_size, PAGE_SIZE); + return false; + } + if (dev_size & (ext_size - 1)) { + pr_err("device size %llu do not align to extent size %u!", dev_size, ext_size); + return false; + } + spc->ext_size = ext_size; + spc->nr_ext = dev_size / ext_size; + atomic_set(&spc->last_alloc_bit, 0); + atomic_set(&spc->nr_alloced, 0); + init_waitqueue_head(&spc->empty_wq); + spc->bitmap = kvzalloc(BITS_TO_LONGS(spc->nr_ext) * sizeof(long), GFP_KERNEL); + if (!spc->bitmap) { + pr_err("hyperhold bitmap alloc failed.\n"); + return false; + } + atomic64_add(BITS_TO_LONGS(spc->nr_ext) * sizeof(long), &spc_mem); + + pr_info("hyperhold space init succ, capacity = %u x %u.\n", ext_size, spc->nr_ext); + + return true; +} + +int alloc_eid(struct hp_space *spc) +{ + u32 bit; + u32 last_bit; + +retry: + last_bit = atomic_read(&spc->last_alloc_bit); + bit = find_next_zero_bit(spc->bitmap, spc->nr_ext, last_bit); + if (bit == spc->nr_ext) + bit = find_next_zero_bit(spc->bitmap, spc->nr_ext, 0); + if (bit == spc->nr_ext) + goto full; + if (test_and_set_bit(bit, spc->bitmap)) + goto retry; + + atomic_set(&spc->last_alloc_bit, bit); + atomic_inc(&spc->nr_alloced); + + pr_info("hyperhold alloc extent %u.\n", bit); + + return bit; +full: + pr_err("hyperhold space is full.\n"); + + return -ENOSPC; +} + +void free_eid(struct hp_space *spc, u32 eid) +{ + if (!test_and_clear_bit(eid, 
spc->bitmap)) { + pr_err("eid is not alloced!\n"); + BUG(); + return; + } + if (atomic_dec_and_test(&spc->nr_alloced)) { + pr_info("notify space empty.\n"); + wake_up(&spc->empty_wq); + } + pr_info("hyperhold free extent %u.\n", eid); +} + +static void dump_space(struct hp_space *spc) +{ + u32 i = 0; + + pr_info("dump alloced extent in space.\n"); + for (i = 0; i < spc->nr_ext; i++) + if (test_bit(i, spc->bitmap)) + pr_info("alloced eid %u.\n", i); +} + +bool wait_for_space_empty(struct hp_space *spc, bool force) +{ + if (!atomic_read(&spc->nr_alloced)) + return true; + if (!force) + return false; + + dump_space(spc); + wait_event(spc->empty_wq, !atomic_read(&spc->nr_alloced)); + + return true; +} diff --git a/drivers/hyperhold/hp_space.h b/drivers/hyperhold/hp_space.h new file mode 100644 index 0000000000000000000000000000000000000000..caaaf92a07f795a5a72423dcee26c8204a39873e --- /dev/null +++ b/drivers/hyperhold/hp_space.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * drivers/hyperhold/hp_space.h + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. + */ + +#ifndef _HP_SPACE_H_ +#define _HP_SPACE_H_ + +#include + +struct hp_space { + u32 ext_size; + u32 nr_ext; + unsigned long *bitmap; + atomic_t last_alloc_bit; + atomic_t nr_alloced; + wait_queue_head_t empty_wq; +}; + +void deinit_space(struct hp_space *spc); +bool init_space(struct hp_space *spc, u64 dev_size, u32 ext_size); +int alloc_eid(struct hp_space *spc); +void free_eid(struct hp_space *spc, u32 eid); + +bool wait_for_space_empty(struct hp_space *spc, bool force); + +u64 space_memory(void); +#endif diff --git a/drivers/hyperhold/hyperhold.h b/drivers/hyperhold/hyperhold.h new file mode 100644 index 0000000000000000000000000000000000000000..b65ff54445136679593e0b5c60be215c12f5ff88 --- /dev/null +++ b/drivers/hyperhold/hyperhold.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * drivers/hyperhold/hyperhold.h + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. 
+ */ + +#ifndef _HYPERHOLD_H_ +#define _HYPERHOLD_H_ + +#include + +struct hpio; + +typedef void (*hp_endio)(struct hpio *); + +void hyperhold_disable(bool force); +void hyperhold_enable(void); +bool is_hyperhold_enable(void); + +u32 hyperhold_nr_extent(void); +u32 hyperhold_extent_size(u32 eid); +long hyperhold_address(u32 eid, u32 offset); +int hyperhold_addr_extent(u64 addr); +int hyperhold_addr_offset(u64 addr); + +int hyperhold_alloc_extent(void); +void hyperhold_free_extent(u32 eid); +void hyperhold_should_free_extent(u32 eid); + +struct hpio *hyperhold_io_alloc(u32 eid, gfp_t gfp, unsigned int op, bool new_page); +void hyperhold_io_free(struct hpio *hpio); + +struct hpio *hyperhold_io_get(u32 eid, gfp_t gfp, unsigned int op); +bool hyperhold_io_put(struct hpio *hpio); + +void hyperhold_io_complete(struct hpio *hpio); +void hyperhold_io_wait(struct hpio *hpio); + +bool hyperhold_io_success(struct hpio *hpio); + +int hyperhold_io_extent(struct hpio *hpio); +int hyperhold_io_operate(struct hpio *hpio); +struct page *hyperhold_io_page(struct hpio *hpio, u32 index); +bool hyperhold_io_add_page(struct hpio *hpio, u32 index, struct page *page); +u32 hyperhold_io_nr_page(struct hpio *hpio); +void *hyperhold_io_private(struct hpio *hpio); + +int hyperhold_write_async(struct hpio *hpio, hp_endio endio, void *priv); +int hyperhold_read_async(struct hpio *hpio, hp_endio endio, void *priv); + +#endif diff --git a/include/linux/hyperhold_inf.h b/include/linux/hyperhold_inf.h new file mode 100644 index 0000000000000000000000000000000000000000..7d2bd1e88c1ca1146cf7c106c09366b53fce12bc --- /dev/null +++ b/include/linux/hyperhold_inf.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * include/linux/hyperhold_inf.h + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. + */ + +#ifndef HYPERHOLD_INF_H +#define HYPERHOLD_INF_H + +#ifdef CONFIG_HYPERHOLD + +extern bool is_hyperhold_enable(void); + +#else + +static inline bool is_hyperhold_enable(void) +{ + return false; +} +#endif + +#endif diff --git a/include/linux/memcg_policy.h b/include/linux/memcg_policy.h new file mode 100644 index 0000000000000000000000000000000000000000..201b0e973e3c47f7e6b980b5ff8f1a1eee0f293d --- /dev/null +++ b/include/linux/memcg_policy.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * include/linux/memcg_policy.h + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd.
+ * + */ +#ifndef _MEMCG_POLICY_H +#define _MEMCG_POLICY_H + +struct mem_cgroup; +struct pglist_data; +struct scan_control; + + +extern struct list_head score_head; +extern bool score_head_inited; +extern spinlock_t score_list_lock; +extern struct cgroup_subsys memory_cgrp_subsys; +#ifdef CONFIG_HYPERHOLD_FILE_LRU +void shrink_anon_memcg(struct pglist_data *pgdat, + struct mem_cgroup *memcg, struct scan_control *sc, + unsigned long *nr); +bool shrink_node_hyperhold(struct pglist_data *pgdat, struct scan_control *sc); +#endif /* CONFIG_HYPERHOLD_FILE_LRU */ + +#ifdef CONFIG_HYPERHOLD_MEMCG +struct mem_cgroup *get_next_memcg(struct mem_cgroup *prev); +void get_next_memcg_break(struct mem_cgroup *memcg); +void memcg_app_score_update(struct mem_cgroup *target); + +struct memcg_reclaim { + atomic64_t app_score; + atomic64_t ub_ufs2zram_ratio; +#ifdef CONFIG_HYPERHOLD_ZSWAPD + atomic_t ub_zram2ufs_ratio; + atomic_t ub_mem2zram_ratio; + atomic_t refault_threshold; + /* anon refault */ + unsigned long long reclaimed_pagefault; +#endif +}; +#define MAX_APP_SCORE 1000 +#endif + + +#endif /* _LINUX_MEMCG_POLICY_H */ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 4b975111b53617c16967c6ad73897655d22f2184..2469ca802798a4720ac78f56d51604c3323cba7a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -21,6 +21,7 @@ #include #include #include +#include struct mem_cgroup; struct obj_cgroup; @@ -53,6 +54,11 @@ struct mem_cgroup_reclaim_cookie { unsigned int generation; }; +static inline bool is_prot_page(struct page *page) +{ + return false; +} + #ifdef CONFIG_MEMCG #define MEM_CGROUP_ID_SHIFT 16 @@ -295,6 +301,13 @@ struct mem_cgroup { bool tcpmem_active; int tcpmem_pressure; +#ifdef CONFIG_HYPERHOLD_MEMCG + struct list_head score_node; +#define MEM_CGROUP_NAME_MAX_LEN 100 + char name[MEM_CGROUP_NAME_MAX_LEN]; + struct memcg_reclaim memcg_reclaimed; +#endif + #ifdef CONFIG_MEMCG_KMEM /* Index in the kmem_cache->memcg_params.memcg_caches array */ int kmemcg_id; @@ -549,6 +562,10 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) { if (mem_cgroup_disabled()) return 0; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (!memcg) + return -1; +#endif return memcg->id.id; } @@ -566,6 +583,11 @@ static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) if (mem_cgroup_disabled()) return NULL; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (is_node_lruvec(lruvec)) + return NULL; +#endif + mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); return mz->memcg; } @@ -763,6 +785,10 @@ static inline unsigned long lruvec_page_state(struct lruvec *lruvec, if (mem_cgroup_disabled()) return node_page_state(lruvec_pgdat(lruvec), idx); +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (is_node_lruvec(lruvec)) + return node_page_state(lruvec_pgdat(lruvec), idx); +#endif pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); x = atomic_long_read(&pn->lruvec_stat[idx]); #ifdef CONFIG_SMP @@ -782,6 +808,11 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, if (mem_cgroup_disabled()) return node_page_state(lruvec_pgdat(lruvec), idx); +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (is_node_lruvec(lruvec)) + return node_page_state(lruvec_pgdat(lruvec), idx); +#endif + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); for_each_possible_cpu(cpu) x += per_cpu(pn->lruvec_stat_local->count[idx], cpu); @@ -830,6 +861,17 @@ static inline void mod_lruvec_state(struct lruvec *lruvec, local_irq_restore(flags); } +#ifdef 
CONFIG_HYPERHOLD_FILE_LRU +static __always_inline bool is_file_page(struct page *page) +{ + if (!PageUnevictable(page) && !PageSwapBacked(page) && page_mapping(page)) + return true; + + return false; + +} +#endif + static inline void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val) { @@ -837,6 +879,14 @@ static inline void __mod_lruvec_page_state(struct page *page, pg_data_t *pgdat = page_pgdat(page); struct lruvec *lruvec; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (is_file_page(page) && !is_prot_page(page)) { + __mod_node_page_state(pgdat, idx, val); + return; + + } +#endif + /* Untracked pages have no memcg, no lruvec. Update only the node */ if (!head->mem_cgroup) { __mod_node_page_state(pgdat, idx, val); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3ac2799dcb4aea873b35c429c3c1f33d8f51bfab..855a598ff674d69ab0f7d3abcf135868f555e156 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -763,6 +763,12 @@ typedef struct pglist_data { int kswapd_failures; /* Number of 'reclaimed == 0' runs */ +#ifdef CONFIG_HYPERHOLD_ZSWAPD + wait_queue_head_t zswapd_wait; + atomic_t zswapd_wait_flag; + struct task_struct *zswapd; +#endif + #ifdef CONFIG_COMPACTION int kcompactd_max_order; enum zone_type kcompactd_highest_zoneidx; @@ -829,6 +835,11 @@ typedef struct pglist_data { #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid)) +static inline struct lruvec *node_lruvec(struct pglist_data *pgdat) +{ + return &pgdat->__lruvec; +} + static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat) { return pgdat->node_start_pfn + pgdat->node_spanned_pages; @@ -875,6 +886,13 @@ static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec) #endif } +#ifdef CONFIG_HYPERHOLD_FILE_LRU +static inline int is_node_lruvec(struct lruvec *lruvec) +{ + return &lruvec_pgdat(lruvec)->__lruvec == lruvec; +} +#endif + extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx); #ifdef CONFIG_HAVE_MEMORYLESS_NODES diff --git a/include/linux/swap.h b/include/linux/swap.h index fbc6805358da0c1f7a1cfd7e331928f5e00ad98f..517ab5adb9730e84973abb8ac43874edbf19d0fe 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -380,7 +380,22 @@ extern int sysctl_min_slab_ratio; #define node_reclaim_mode 0 #endif +struct scan_control; + +extern unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, + struct lruvec *lruvec, + struct scan_control *sc); +extern bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru); +extern bool cgroup_reclaim(struct scan_control *sc); extern void check_move_unevictable_pages(struct pagevec *pvec); +extern unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, + int priority); +extern bool writeback_throttling_sane(struct scan_control *sc); +extern inline bool should_continue_reclaim(struct pglist_data *pgdat, + unsigned long nr_reclaimed, + struct scan_control *sc); + +extern int current_may_throttle(void); extern int kswapd_run(int nid); extern void kswapd_stop(int nid); @@ -443,6 +458,9 @@ extern atomic_long_t nr_swap_pages; extern long total_swap_pages; extern atomic_t nr_rotate_swap; extern bool has_usable_swap(void); +#ifdef CONFIG_HYPERHOLD_ZSWAPD +extern bool free_swap_is_low(void); +#endif /* Swap 50% full? Release swapcache more aggressively.. 
*/ static inline bool vm_swap_full(void) diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 18e75974d4e37bd76f6b31d88951a9cededaec63..add63d0bc703a1fc916983f195aef64c7a232583 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -120,6 +120,24 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_SWAP SWAP_RA, SWAP_RA_HIT, +#endif +#ifdef CONFIG_HYPERHOLD_ZSWAPD + ZSWAPD_WAKEUP, + ZSWAPD_REFAULT, + ZSWAPD_MEDIUM_PRESS, + ZSWAPD_CRITICAL_PRESS, + ZSWAPD_MEMCG_RATIO_SKIP, + ZSWAPD_MEMCG_REFAULT_SKIP, + ZSWAPD_SWAPOUT, + ZSWAPD_EMPTY_ROUND, + ZSWAPD_EMPTY_ROUND_SKIP_TIMES, + ZSWAPD_SNAPSHOT_TIMES, + ZSWAPD_RECLAIMED, + ZSWAPD_SCANNED, +#endif +#ifdef CONFIG_HYPERHOLD_MEMCG + FREEZE_RECLAIMED, + FREEZE_RECLAIME_COUNT, #endif NR_VM_EVENT_ITEMS }; diff --git a/include/linux/zswapd.h b/include/linux/zswapd.h new file mode 100644 index 0000000000000000000000000000000000000000..44cd060b12e4ac1338e810d61afd0421faa4e22d --- /dev/null +++ b/include/linux/zswapd.h @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * include/linux/zswapd.h + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. + */ + +#ifndef _ZSWAPD_H +#define _ZSWAPD_H + +enum { + CACHE_SIZE, + SWAP_SIZE, + CACHE_PAGE, + SWAP_PAGE, + CACHE_FAULT, + SWAP_FAULT, + READ_SIZE, + WRITE_SIZE, +}; + +struct group_swap_ops { + u64 (*group_read)(u16 gid, u64 req_size, void *priv); + u64 (*group_write)(u16 gid, u64 req_size, void *priv); + u64 (*group_data_size)(u16 gid, int type, void *priv); +}; + +struct group_swap_device { + void *priv; + struct group_swap_ops *ops; + struct list_head list; +}; + +#ifdef CONFIG_HYPERHOLD_ZSWAPD +extern int zswapd_run(int nid); +extern void zswapd_stop(int nid); +extern void wakeup_zswapd(pg_data_t *pgdat); +extern bool zram_watermark_ok(void); +extern void zswapd_status_show(struct seq_file *m); +extern void wake_all_zswapd(void); +extern void set_snapshotd_init_flag(unsigned int val); +extern pid_t get_zswapd_pid(void); +extern unsigned long long get_free_swap_threshold(void); +extern struct group_swap_device *register_group_swap(struct group_swap_ops *ops, void *priv); +extern void unregister_group_swap(struct group_swap_device *gsdev); +extern void memcg_eswap_info_show(struct seq_file *m); +#else +static inline int zswapd_run(int nid) +{ + return 0; +} + +static inline void zswapd_stop(int nid) +{ +} + +static inline void wakeup_zswapd(pg_data_t *pgdat) +{ +} + +static inline bool zram_watermark_ok(void) +{ + return true; +} + +static inline void zswapd_status_show(struct seq_file *m) +{ +} + +static inline void wake_all_zswapd(void) +{ +} + +static inline void set_snapshotd_init_flag(unsigned int val) +{ +} + +static inline pid_t get_zswapd_pid(void) +{ + return -EINVAL; +} + +static inline u64 get_free_swap_threshold(void) +{ + return 0; +} + +static inline struct group_swap_device *register_group_swap(struct group_swap_ops *ops, void *priv) +{ + return NULL; +} + +static inline void unregister_group_swap(struct group_swap_device *gsdev) +{ +} +#endif + +#endif /* _LINUX_ZSWAPD_H */ diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 2070df64958ead9e736b4d6026363c6536e7897c..a71ba5860e5635b8bb67bce96380ed5221e341c6 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -336,6 +336,36 @@ TRACE_EVENT(mm_vmscan_writepage, show_reclaim_flags(__entry->reclaim_flags)) ); +#ifdef CONFIG_HYPERHOLD_ZSWAPD +TRACE_EVENT(mm_vmscan_lru_zswapd_shrink_active, + + TP_PROTO(int nid,
unsigned long nr_taken, + unsigned long nr_deactivated, int priority), + + TP_ARGS(nid, nr_taken, nr_deactivated, priority), + + TP_STRUCT__entry( + __field(int, nid) + __field(unsigned long, nr_taken) + __field(unsigned long, nr_deactivated) + __field(int, priority) + ), + + TP_fast_assign( + __entry->nid = nid; + __entry->nr_taken = nr_taken; + __entry->nr_deactivated = nr_deactivated; + __entry->priority = priority; + ), + + TP_printk("nid=%d nr_taken=%ld nr_deactivated=%ld priority=%d", + __entry->nid, + __entry->nr_taken, + __entry->nr_deactivated, + __entry->priority) +); +#endif + TRACE_EVENT(mm_vmscan_lru_shrink_inactive, TP_PROTO(int nid, diff --git a/mm/Kconfig b/mm/Kconfig index ed97e8ddd70b7ba07dec8c4239173ef48a72b9a0..6760018a1c8c27a684c134df1f0ee85f058cb2c3 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -63,6 +63,33 @@ config SPARSEMEM_MANUAL endchoice +config HYPERHOLD_FILE_LRU + bool "Enable HyperHold FILE LRU" + depends on HYPERHOLD && MEMCG + select HYPERHOLD_MEMCG + default n + help + File-LRU is a mechanism that puts file pages on a global LRU list + and anon pages on the memcg LRU lists (if MEMCG is enabled); in + addition, reclaim of anonymous pages and file pages is separated. + +config HYPERHOLD_MEMCG + bool "Enable Memcg Management in HyperHold" + depends on HYPERHOLD && MEMCG + help + Add more attributes to the memory cgroup; these attributes are + used to show information, shrink memory, swap in pages and so on. + +config HYPERHOLD_ZSWAPD + bool "Enable zswapd thread to reclaim anon pages in background" + depends on HYPERHOLD + default n + help + zswapd is a kernel thread that reclaims anonymous pages in the + background. When the use of swap pages reaches the watermark + and the refault of anonymous pages is high, the content of + zram will be exchanged to eswap by a certain percentage. + config DISCONTIGMEM def_bool y depends on (!SELECT_MEMORY_MODEL && ARCH_DISCONTIGMEM_ENABLE) || DISCONTIGMEM_MANUAL diff --git a/mm/Makefile b/mm/Makefile index d73aed0fc99c1d408090c8175f482bbd24a6f2a9..56abb804cc19f4afe2cdc00641e07b023707edea 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -120,3 +120,6 @@ obj-$(CONFIG_MEMFD_CREATE) += memfd.o obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o obj-$(CONFIG_PTDUMP_CORE) += ptdump.o obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o +obj-$(CONFIG_HYPERHOLD_FILE_LRU) += memcg_reclaim.o +obj-$(CONFIG_HYPERHOLD_MEMCG) += memcg_control.o +obj-$(CONFIG_HYPERHOLD_ZSWAPD) += zswapd.o zswapd_control.o diff --git a/mm/internal.h b/mm/internal.h index 840b8a330b9acf5c87e976214a7dd25f0aef685a..ccdee4a0368d1a32370bf0b58b60b56e16142ef6 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -11,6 +11,8 @@ #include #include #include +#include +#include /* * The set of flags that only affect watermark checking and reclaim @@ -32,6 +34,121 @@ /* Do not use these with a slab allocator */ #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) +enum reclaim_invoker { + ALL, + KSWAPD, + ZSWAPD, + DIRECT_RECLAIM, + NODE_RECLAIM, + SOFT_LIMIT, + RCC_RECLAIM, + FILE_RECLAIM, + ANON_RECLAIM +}; + +struct scan_control { + /* How many pages shrink_list() should reclaim */ + unsigned long nr_to_reclaim; + + /* + * Nodemask of nodes allowed by the caller. If NULL, all nodes + * are scanned. + */ + nodemask_t *nodemask; + + /* + * The memory cgroup that hit its limit and as a result is the + * primary target of this reclaim invocation.
+ */ + struct mem_cgroup *target_mem_cgroup; + + /* + * Scan pressure balancing between anon and file LRUs + */ + unsigned long anon_cost; + unsigned long file_cost; + + /* Can active pages be deactivated as part of reclaim? */ +#define DEACTIVATE_ANON 1 +#define DEACTIVATE_FILE 2 + unsigned int may_deactivate:2; + unsigned int force_deactivate:1; + unsigned int skipped_deactivate:1; + + /* Writepage batching in laptop mode; RECLAIM_WRITE */ + unsigned int may_writepage:1; + + /* Can mapped pages be reclaimed? */ + unsigned int may_unmap:1; + + /* Can pages be swapped as part of reclaim? */ + unsigned int may_swap:1; + + /* + * Cgroups are not reclaimed below their configured memory.low, + * unless we threaten to OOM. If any cgroups are skipped due to + * memory.low and nothing was reclaimed, go back for memory.low. + */ + unsigned int memcg_low_reclaim:1; + unsigned int memcg_low_skipped:1; + + unsigned int hibernation_mode:1; + + /* One of the zones is ready for compaction */ + unsigned int compaction_ready:1; + + /* There is easily reclaimable cold cache in the current node */ + unsigned int cache_trim_mode:1; + + /* The file pages on the current node are dangerously low */ + unsigned int file_is_tiny:1; + + /* Allocation order */ + s8 order; + + /* Scan (total_size >> priority) pages at once */ + s8 priority; + + /* The highest zone to isolate pages for reclaim from */ + s8 reclaim_idx; + + /* This context's GFP mask */ + gfp_t gfp_mask; + + /* Incremented by the number of inactive pages that were scanned */ + unsigned long nr_scanned; + + /* Number of pages freed so far during a call to shrink_zones() */ + unsigned long nr_reclaimed; + + struct { + unsigned int dirty; + unsigned int unqueued_dirty; + unsigned int congested; + unsigned int writeback; + unsigned int immediate; + unsigned int file_taken; + unsigned int taken; + } nr; + + enum reclaim_invoker invoker; + u32 isolate_count; + unsigned long nr_scanned_anon; + unsigned long nr_scanned_file; + unsigned long nr_reclaimed_anon; + unsigned long nr_reclaimed_file; + + /* for recording the reclaimed slab by now */ + struct reclaim_state reclaim_state; +}; + +enum scan_balance { + SCAN_EQUAL, + SCAN_FRACT, + SCAN_ANON, + SCAN_FILE, +}; + void page_writeback_init(void); vm_fault_t do_swap_page(struct vm_fault *vmf); @@ -110,6 +227,17 @@ extern unsigned long highest_memmap_pfn; */ extern int isolate_lru_page(struct page *page); extern void putback_lru_page(struct page *page); +extern unsigned int shrink_page_list(struct list_head *page_list, struct pglist_data *pgdat, + struct scan_control *sc, struct reclaim_stat *stat, bool ignore_references); +extern unsigned long isolate_lru_pages(unsigned long nr_to_scan, struct lruvec *lruvec, + struct list_head *dst, unsigned long *nr_scanned, struct scan_control *sc, + enum lru_list lru); +extern unsigned move_pages_to_lru(struct lruvec *lruvec, struct list_head *list); +extern void shrink_active_list(unsigned long nr_to_scan, struct lruvec *lruvec, + struct scan_control *sc, enum lru_list lru); +extern unsigned long shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, + struct scan_control *sc, enum lru_list lru); +extern void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc); /* * in mm/rmap.c: diff --git a/mm/memcg_control.c b/mm/memcg_control.c new file mode 100644 index 0000000000000000000000000000000000000000..d56a2ba665b682d63e0b2e2497da7a4e8a4098a3 --- /dev/null +++ b/mm/memcg_control.c @@ -0,0 +1,428 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * 
mm/memcg_control.c + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. + */ +#include +#include +#include +#include +#include +#include "internal.h" + +#include "zswapd_internal.h" + +#ifdef CONFIG_HYPERHOLD_MEMCG + +struct list_head score_head; +bool score_head_inited; +DEFINE_SPINLOCK(score_list_lock); +DEFINE_MUTEX(reclaim_para_lock); + +/** + * get_next_memcg - iterate over memory cgroup score_list + * @prev: previously returned memcg, NULL on first invocation + * + * Returns references to the next memg on score_list of @prev, + * or %NULL after a full round-trip. + * + * Caller must pass the return value in @prev on subsequent + * invocations for reference counting, or use get_next_memcg_break() + * to cancel a walk before the round-trip is complete. + */ +struct mem_cgroup *get_next_memcg(struct mem_cgroup *prev) +{ + struct mem_cgroup *memcg = NULL; + struct list_head *pos = NULL; + unsigned long flags; + + if (unlikely(!score_head_inited)) + return NULL; + + spin_lock_irqsave(&score_list_lock, flags); + + if (unlikely(!prev)) + pos = &score_head; + else + pos = &(prev->score_node); + + if (list_empty(pos)) /* deleted node */ + goto unlock; + + if (pos->next == &score_head) + goto unlock; + + memcg = list_entry(pos->next, + struct mem_cgroup, score_node); + + if (!css_tryget(&memcg->css)) + memcg = NULL; + +unlock: + spin_unlock_irqrestore(&score_list_lock, flags); + + if (prev) + css_put(&prev->css); + + return memcg; +} + +void get_next_memcg_break(struct mem_cgroup *memcg) +{ + if (memcg) + css_put(&memcg->css); +} + +struct mem_cgroup *get_prev_memcg(struct mem_cgroup *next) +{ + struct mem_cgroup *memcg = NULL; + struct list_head *pos = NULL; + unsigned long flags; + + if (unlikely(!score_head_inited)) + return NULL; + + spin_lock_irqsave(&score_list_lock, flags); + + if (unlikely(!next)) + pos = &score_head; + else + pos = &next->score_node; + + if (list_empty(pos)) /* deleted node */ + goto unlock; + + if (pos->prev == &score_head) + goto unlock; + + memcg = list_entry(pos->prev, + struct mem_cgroup, score_node); + + if (unlikely(!memcg)) + goto unlock; + + if (!css_tryget(&memcg->css)) + memcg = NULL; + +unlock: + spin_unlock_irqrestore(&score_list_lock, flags); + + if (next) + css_put(&next->css); + return memcg; +} + +void get_prev_memcg_break(struct mem_cgroup *memcg) +{ + if (memcg) + css_put(&memcg->css); +} + +void memcg_app_score_update(struct mem_cgroup *target) +{ + struct list_head *pos = NULL; + struct list_head *tmp; + unsigned long flags; + + spin_lock_irqsave(&score_list_lock, flags); + list_for_each_prev_safe(pos, tmp, &score_head) { + struct mem_cgroup *memcg = list_entry(pos, + struct mem_cgroup, score_node); + if (atomic64_read(&memcg->memcg_reclaimed.app_score) < + atomic64_read(&target->memcg_reclaimed.app_score)) + break; + } + list_move_tail(&target->score_node, pos); + spin_unlock_irqrestore(&score_list_lock, flags); +} + +static u64 mem_cgroup_app_score_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + return atomic64_read(&memcg->memcg_reclaimed.app_score); +} + +static int mem_cgroup_app_score_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + if (val > MAX_APP_SCORE) + return -EINVAL; + + if (atomic64_read(&memcg->memcg_reclaimed.app_score) != val) { + atomic64_set(&memcg->memcg_reclaimed.app_score, val); + memcg_app_score_update(memcg); + } + + return 0; +} + +static unsigned long 
move_pages_to_page_list(struct lruvec *lruvec, enum lru_list lru, + struct list_head *page_list) +{ + struct list_head *src = &lruvec->lists[lru]; + unsigned long nr_isolated = 0; + struct page *page; + + while (!list_empty(src)) { + page = lru_to_page(src); + + if (PageUnevictable(page)) + continue; + + if (likely(get_page_unless_zero(page))) { + if (isolate_lru_page(page)) { + put_page(page); + continue; + } + put_page(page); + + } else { + continue; + } + + + if (PageUnevictable(page)) { + putback_lru_page(page); + continue; + } + + if (PageAnon(page) && !PageSwapBacked(page)) { + putback_lru_page(page); + continue; + } + + list_add(&page->lru, page_list); + nr_isolated++; + } + + return nr_isolated; +} + + +unsigned long reclaim_all_anon_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg) +{ + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + unsigned long nr_reclaimed; + LIST_HEAD(page_list); + struct page *page; + struct reclaim_stat stat = {}; + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .may_writepage = 1, + .may_unmap = 1, + .may_swap = 1, + }; + + count_vm_event(FREEZE_RECLAIME_COUNT); + move_pages_to_page_list(lruvec, LRU_INACTIVE_ANON, &page_list); + + nr_reclaimed = shrink_page_list(&page_list, pgdat, &sc, &stat, true); + count_vm_event(FREEZE_RECLAIMED); + + while (!list_empty(&page_list)) { + page = lru_to_page(&page_list); + list_del(&page->lru); + putback_lru_page(page); + } + + return nr_reclaimed; +} + +static ssize_t memcg_force_shrink_anon(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + struct pglist_data *pgdat; + int nid; + + for_each_online_node(nid) { + pgdat = NODE_DATA(nid); + reclaim_all_anon_memcg(pgdat, memcg); + } + + return nbytes; +} + +static int memcg_name_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + + seq_printf(m, "%s\n", memcg->name); + return 0; +} + +static ssize_t memcg_name_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + + buf = strstrip(buf); + if (nbytes >= MEM_CGROUP_NAME_MAX_LEN) + return -EINVAL; + + mutex_lock(&reclaim_para_lock); + if (memcg) + strcpy(memcg->name, buf); + mutex_unlock(&reclaim_para_lock); + + return nbytes; +} + +static int memcg_total_info_per_app_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = NULL; + struct mem_cgroup_per_node *mz = NULL; + struct lruvec *lruvec = NULL; + unsigned long anon_size; + unsigned long zram_compress_size; + unsigned long eswap_compress_size; + + + while ((memcg = get_next_memcg(memcg))) { + mz = mem_cgroup_nodeinfo(memcg, 0); + if (!mz) { + get_next_memcg_break(memcg); + return 0; + } + + lruvec = &mz->lruvec; + if (!lruvec) { + get_next_memcg_break(memcg); + return 0; + } + + anon_size = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) + + lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES); + zram_compress_size = memcg_data_size(memcg, CACHE_SIZE); + eswap_compress_size = memcg_data_size(memcg, SWAP_SIZE); + anon_size *= PAGE_SIZE / SZ_1K; + zram_compress_size /= SZ_1K; + eswap_compress_size /= SZ_1K; + + if (!strlen(memcg->name)) + continue; + + seq_printf(m, "%s %lu %lu %lu\n", memcg->name, anon_size, + zram_compress_size, eswap_compress_size); + } + + return 0; +} + +static int memcg_ub_ufs2zram_ratio_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + struct mem_cgroup *memcg 
= mem_cgroup_from_css(css); + const unsigned int ratio = 100; + + if (val > ratio) + return -EINVAL; + + atomic64_set(&memcg->memcg_reclaimed.ub_ufs2zram_ratio, val); + + return 0; +} + +static u64 memcg_ub_ufs2zram_ratio_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + return atomic64_read(&memcg->memcg_reclaimed.ub_ufs2zram_ratio); +} + +static int memcg_force_swapin_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + unsigned long size; + const unsigned int ratio = 100; + + size = memcg_data_size(memcg, SWAP_SIZE); + size = atomic64_read(&memcg->memcg_reclaimed.ub_ufs2zram_ratio) * size / ratio; + + swapin_memcg(memcg, size); + + return 0; +} + +static struct cftype memcg_policy_files[] = { + { + .name = "name", + .write = memcg_name_write, + .seq_show = memcg_name_show, + }, + { + .name = "ub_ufs2zram_ratio", + .write_u64 = memcg_ub_ufs2zram_ratio_write, + .read_u64 = memcg_ub_ufs2zram_ratio_read, + }, + { + .name = "total_info_per_app", + .seq_show = memcg_total_info_per_app_show, + }, + { + .name = "app_score", + .write_u64 = mem_cgroup_app_score_write, + .read_u64 = mem_cgroup_app_score_read, + }, + { + .name = "force_shrink_anon", + .write = memcg_force_shrink_anon + }, + { + .name = "force_swapin", + .write_u64 = memcg_force_swapin_write, + }, + { }, /* terminate */ +}; + +static int __init memcg_policy_init(void) +{ + if (!mem_cgroup_disabled()) + WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, + memcg_policy_files)); + + return 0; +} +subsys_initcall(memcg_policy_init); +#else +struct mem_cgroup *get_next_memcg(struct mem_cgroup *prev) +{ + return NULL; +} + +void get_next_memcg_break(struct mem_cgroup *memcg) +{ +} + + +struct mem_cgroup *get_prev_memcg(struct mem_cgroup *next) +{ + return NULL; +} + +void get_prev_memcg_break(struct mem_cgroup *memcg) +{ +} + +static u64 mem_cgroup_app_score_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return 0; +} + +static int mem_cgroup_app_score_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + return 0; +} + +void memcg_app_score_update(struct mem_cgroup *target) +{ +} +#endif diff --git a/mm/memcg_reclaim.c b/mm/memcg_reclaim.c new file mode 100644 index 0000000000000000000000000000000000000000..f88826c13ae2e287713e5e7032ccd724cbd31416 --- /dev/null +++ b/mm/memcg_reclaim.c @@ -0,0 +1,516 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * mm/memcg_reclaim.c + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. + */ +#include +#include +#include + +#ifdef CONFIG_HYPERHOLD_FILE_LRU +#include +#include "internal.h" +#endif + +static inline bool is_swap_not_allowed(struct scan_control *sc, int swappiness) +{ + return !sc->may_swap || !swappiness || !get_nr_swap_pages(); +} + +/* + * From 0 .. 100. Higher means more swappy. + */ +#define HYPERHOLD_SWAPPINESS 100 + +static int get_hyperhold_swappiness(void) +{ + return is_hyperhold_enable() ? 
HYPERHOLD_SWAPPINESS : vm_swappiness; +} + +static void get_scan_count_hyperhold(struct pglist_data *pgdat, + struct scan_control *sc, unsigned long *nr, + unsigned long *lru_pages) +{ + int swappiness = get_hyperhold_swappiness(); + struct lruvec *lruvec = node_lruvec(pgdat); + u64 fraction[2]; + u64 denominator; + enum scan_balance scan_balance; + unsigned long ap, fp; + enum lru_list lru; + unsigned long pgdatfile; + unsigned long pgdatfree; + int z; + unsigned long anon_cost, file_cost, total_cost; + unsigned long total_high_wmark = 0; + + + if (cgroup_reclaim(sc) && !swappiness) { + scan_balance = SCAN_FILE; + goto out; + } + + /* + * Do not apply any pressure balancing cleverness when the + * system is close to OOM, scan both anon and file equally + * (unless the swappiness setting disagrees with swapping). + */ + if (!sc->priority && swappiness) { + scan_balance = SCAN_EQUAL; + goto out; + } + + if (!cgroup_reclaim(sc)) { + pgdatfree = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); + pgdatfile = node_page_state(pgdat, NR_ACTIVE_FILE) + + node_page_state(pgdat, NR_INACTIVE_FILE); + + for (z = 0; z < MAX_NR_ZONES; z++) { + struct zone *zone = &pgdat->node_zones[z]; + + if (!managed_zone(zone)) + continue; + + total_high_wmark += high_wmark_pages(zone); + } + + if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) { + /* + * Force SCAN_ANON if there are enough inactive + * anonymous pages on the LRU in eligible zones. + * Otherwise, the small LRU gets thrashed. + */ + if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON) && + (lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, + sc->reclaim_idx) >> + (unsigned int)sc->priority)) { + scan_balance = SCAN_ANON; + goto out; + } + } + } + + /* + * If there is enough inactive page cache, i.e. if the size of the + * inactive list is greater than that of the active list *and* the + * inactive list actually has some pages to scan on this priority, we + * do not reclaim anything from the anonymous working set right now. + * Without the second condition we could end up never scanning an + * lruvec even if it has plenty of old anonymous pages unless the + * system is under heavy pressure. + */ + + if (!IS_ENABLED(CONFIG_BALANCE_ANON_FILE_RECLAIM) && + !inactive_is_low(lruvec, LRU_INACTIVE_FILE) && + lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) { + scan_balance = SCAN_FILE; + goto out; + } + + scan_balance = SCAN_FRACT; + + /* + * Calculate the pressure balance between anon and file pages. + * + * The amount of pressure we put on each LRU is inversely + * proportional to the cost of reclaiming each list, as + * determined by the share of pages that are refaulting, times + * the relative IO cost of bringing back a swapped out + * anonymous page vs reloading a filesystem page (swappiness). + * + * Although we limit that influence to ensure no list gets + * left behind completely: at least a third of the pressure is + * applied, before swappiness. + * + * With swappiness at 100, anon and file have equal IO cost. 
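As an aside to the comment above: the SCAN_FRACT arithmetic that follows splits the scan target between the anon and file LRUs in proportion to swappiness and inversely to each list's reclaim cost. A standalone sketch of that split (plain C types; it folds the patch's extra cost re-weighting into a single anon/file cost each, and the helper name is mine, not the patch's):

#include <stdint.h>

/*
 * Sketch only, not part of the patch: divide a scan target between the
 * anon (file == 0) and file (file == 1) lists the way SCAN_FRACT does.
 */
static uint64_t fract_scan(uint64_t scan, int file, unsigned int swappiness,
			   uint64_t anon_cost, uint64_t file_cost)
{
	uint64_t total_cost = anon_cost + file_cost;
	/* anon pressure rises with swappiness and falls with anon_cost */
	uint64_t ap = swappiness * (total_cost + 1) / (anon_cost + 1);
	/* file pressure gets the remaining (200 - swappiness) share */
	uint64_t fp = (200 - swappiness) * (total_cost + 1) / (file_cost + 1);
	uint64_t fraction[2] = { ap, fp };

	return scan * fraction[file] / (ap + fp);
}

With HYPERHOLD_SWAPPINESS at 100 and equal costs, both lists end up with roughly half of the target, which matches the "equal IO cost" remark above.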
+ */ + total_cost = sc->anon_cost + sc->file_cost; + anon_cost = total_cost + sc->anon_cost; + file_cost = total_cost + sc->file_cost; + total_cost = anon_cost + file_cost; + + ap = swappiness * (total_cost + 1); + ap /= anon_cost + 1; + + fp = (200 - swappiness) * (total_cost + 1); + fp /= file_cost + 1; + + fraction[0] = ap; + fraction[1] = fp; + denominator = ap + fp; + +out: + *lru_pages = 0; + for_each_evictable_lru(lru) { + int file = is_file_lru(lru); + unsigned long lruvec_size; + unsigned long scan; + + lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); + scan = lruvec_size; + *lru_pages += scan; + scan >>= sc->priority; + + switch (scan_balance) { + case SCAN_EQUAL: + /* Scan lists relative to size */ + break; + case SCAN_FRACT: + /* + * Scan types proportional to swappiness and + * their relative recent reclaim efficiency. + * Make sure we don't miss the last page on + * the offlined memory cgroups because of a + * round-off error. + */ + scan = DIV64_U64_ROUND_UP(scan * fraction[file], + denominator); + break; + case SCAN_FILE: + case SCAN_ANON: + /* Scan one type exclusively */ + if ((scan_balance == SCAN_FILE) != file) + scan = 0; + break; + default: + /* Look ma, no brain */ + BUG(); + } + + nr[lru] = scan; + } +} + +#define ISOLATE_LIMIT_CNT 5 +void shrink_anon_memcg(struct pglist_data *pgdat, + struct mem_cgroup *memcg, struct scan_control *sc, + unsigned long *nr) +{ + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + unsigned long nr_to_scan; + enum lru_list lru; + unsigned long nr_reclaimed = 0; + struct blk_plug plug; + + blk_start_plug(&plug); + + while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_ANON]) { + for (lru = 0; lru <= LRU_ACTIVE_ANON; lru++) { + if (nr[lru]) { + nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); + nr[lru] -= nr_to_scan; + nr_reclaimed += + shrink_list(lru, nr_to_scan, + lruvec, sc); + } + } + if (sc->nr_reclaimed >= sc->nr_to_reclaim || + (sc->isolate_count > ISOLATE_LIMIT_CNT && + sc->invoker == DIRECT_RECLAIM)) + break; + } + blk_finish_plug(&plug); + sc->nr_reclaimed += nr_reclaimed; + sc->nr_reclaimed_anon += nr_reclaimed; +} + +static void shrink_anon(struct pglist_data *pgdat, + struct scan_control *sc, unsigned long *nr) +{ + unsigned long reclaimed; + unsigned long scanned; + struct mem_cgroup *memcg = NULL; + struct mem_cgroup *target_memcg = sc->target_mem_cgroup; + unsigned long nr_memcg[NR_LRU_LISTS]; + unsigned long nr_node_active = lruvec_lru_size( + node_lruvec(pgdat), LRU_ACTIVE_ANON, MAX_NR_ZONES); + unsigned long nr_node_inactive = lruvec_lru_size( + node_lruvec(pgdat), LRU_INACTIVE_ANON, MAX_NR_ZONES); + + while ((memcg = get_next_memcg(memcg))) { + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + + reclaimed = sc->nr_reclaimed; + scanned = sc->nr_scanned; + + nr_memcg[LRU_ACTIVE_ANON] = nr[LRU_ACTIVE_ANON] * + lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, + MAX_NR_ZONES) / (nr_node_active + 1); + nr_memcg[LRU_INACTIVE_ANON] = nr[LRU_INACTIVE_ANON] * + lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, + MAX_NR_ZONES) / (nr_node_inactive + 1); + nr_memcg[LRU_ACTIVE_FILE] = 0; + nr_memcg[LRU_INACTIVE_FILE] = 0; + + /* + * This loop can become CPU-bound when target memcgs + * aren't eligible for reclaim - either because they + * don't have any reclaimable pages, or because their + * memory is explicitly protected. Avoid soft lockups. + */ + cond_resched(); + + mem_cgroup_calculate_protection(target_memcg, memcg); + + if (mem_cgroup_below_min(memcg)) { + /* + * Hard protection. 
+ * If there is no reclaimable memory, OOM. + */ + continue; + } else if (mem_cgroup_below_low(memcg)) { + /* + * Soft protection. + * Respect the protection only as long as + * there is an unprotected supply + * of reclaimable memory from other cgroups. + */ + if (!sc->memcg_low_reclaim) { + sc->memcg_low_skipped = 1; + continue; + } + memcg_memory_event(memcg, MEMCG_LOW); + } + + shrink_anon_memcg(pgdat, memcg, sc, nr_memcg); + shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, + sc->priority); + + vmpressure(sc->gfp_mask, memcg, false, + sc->nr_scanned - scanned, + sc->nr_reclaimed - reclaimed); + + if (sc->nr_reclaimed >= sc->nr_to_reclaim || + (sc->isolate_count > ISOLATE_LIMIT_CNT && + sc->invoker == DIRECT_RECLAIM)) { + get_next_memcg_break(memcg); + break; + } + } +} + +static void shrink_file(struct pglist_data *pgdat, + struct scan_control *sc, unsigned long *nr) +{ + struct lruvec *lruvec = node_lruvec(pgdat); + unsigned long nr_to_scan; + enum lru_list lru; + unsigned long nr_reclaimed = 0; + struct blk_plug plug; + + blk_start_plug(&plug); + + while (nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { + for (lru = LRU_INACTIVE_FILE; lru <= LRU_ACTIVE_FILE; lru++) { + if (nr[lru]) { + nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); + nr[lru] -= nr_to_scan; + nr_reclaimed += + shrink_list(lru, + nr_to_scan, + lruvec, sc); + } + } + } + blk_finish_plug(&plug); + sc->nr_reclaimed += nr_reclaimed; + sc->nr_reclaimed_file += nr_reclaimed; +} + +bool shrink_node_hyperhold(struct pglist_data *pgdat, struct scan_control *sc) +{ + unsigned long nr_reclaimed, nr_scanned; + struct lruvec *target_lruvec; + bool reclaimable = false; + unsigned long file; + + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); + do { + /* Get scan count for file and anon */ + unsigned long node_lru_pages = 0; + unsigned long nr[NR_LRU_LISTS] = {0}; + + memset(&sc->nr, 0, sizeof(sc->nr)); + nr_reclaimed = sc->nr_reclaimed; + nr_scanned = sc->nr_scanned; + + /* + * Determine the scan balance between anon and file LRUs. + */ + spin_lock_irq(&pgdat->lru_lock); + sc->anon_cost = mem_cgroup_lruvec(NULL, pgdat)->anon_cost; + sc->file_cost = node_lruvec(pgdat)->file_cost; + spin_unlock_irq(&pgdat->lru_lock); + + /* + * Target desirable inactive:active list ratios for the anon + * and file LRU lists. + */ + if (!sc->force_deactivate) { + unsigned long refaults; + + refaults = lruvec_page_state(target_lruvec, + WORKINGSET_ACTIVATE_ANON); + if (refaults != target_lruvec->refaults[0] || + inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) + sc->may_deactivate |= DEACTIVATE_ANON; + else + sc->may_deactivate &= ~DEACTIVATE_ANON; + + /* + * When refaults are being observed, it means a new + * workingset is being established. Deactivate to get + * rid of any stale active pages quickly. 
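shrink_anon() above apportions the node-wide anon scan targets across memcgs according to each memcg's share of the node's anon LRU pages (the "+ 1" guards against an empty node). A minimal standalone sketch of that apportioning, under a name of my own:

#include <stdint.h>

/* Sketch only, not part of the patch: a memcg's slice of the node-wide scan target. */
static uint64_t memcg_scan_share(uint64_t node_target,
				 uint64_t memcg_lru_pages,
				 uint64_t node_lru_pages)
{
	return node_target * memcg_lru_pages / (node_lru_pages + 1);
}

A memcg holding a quarter of the node's inactive anon pages is therefore asked to scan roughly a quarter of nr[LRU_INACTIVE_ANON].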
+ */ +#ifdef CONFIG_HYPERHOLD_FILE_LRU + refaults = lruvec_page_state(node_lruvec(pgdat), + WORKINGSET_ACTIVATE_FILE); + if (refaults != node_lruvec(pgdat)->refaults[1] || + inactive_is_low(node_lruvec(pgdat), LRU_INACTIVE_FILE)) + sc->may_deactivate |= DEACTIVATE_FILE; +#else + refaults = lruvec_page_state(target_lruvec, + WORKINGSET_ACTIVATE_FILE); + if (refaults != target_lruvec->refaults[1] || + inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) + sc->may_deactivate |= DEACTIVATE_FILE; +#endif + else + sc->may_deactivate &= ~DEACTIVATE_FILE; + } else + sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; + + /* + * If we have plenty of inactive file pages that aren't + * thrashing, try to reclaim those first before touching + * anonymous pages. + */ +#ifdef CONFIG_HYPERHOLD_FILE_LRU + file = lruvec_page_state(node_lruvec(pgdat), NR_INACTIVE_FILE); +#else + file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); +#endif + if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) + sc->cache_trim_mode = 1; + else + sc->cache_trim_mode = 0; + + /* + * Prevent the reclaimer from falling into the cache trap: as + * cache pages start out inactive, every cache fault will tip + * the scan balance towards the file LRU. And as the file LRU + * shrinks, so does the window for rotation from references. + * This means we have a runaway feedback loop where a tiny + * thrashing file LRU becomes infinitely more attractive than + * anon pages. Try to detect this based on file LRU size. + */ + if (!cgroup_reclaim(sc)) { + unsigned long total_high_wmark = 0; + unsigned long free, anon; + int z; + + free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); + file = node_page_state(pgdat, NR_ACTIVE_FILE) + + node_page_state(pgdat, NR_INACTIVE_FILE); + + for (z = 0; z < MAX_NR_ZONES; z++) { + struct zone *zone = &pgdat->node_zones[z]; + + if (!managed_zone(zone)) + continue; + + total_high_wmark += high_wmark_pages(zone); + } + + /* + * Consider anon: if that's low too, this isn't a + * runaway file reclaim problem, but rather just + * extreme pressure. Reclaim as per usual then. + */ + anon = node_page_state(pgdat, NR_INACTIVE_ANON); + + sc->file_is_tiny = + file + free <= total_high_wmark && + !(sc->may_deactivate & DEACTIVATE_ANON) && + anon >> sc->priority; + } + + get_scan_count_hyperhold(pgdat, sc, nr, &node_lru_pages); + + /* Shrink the Total-File-LRU */ + shrink_file(pgdat, sc, nr); + + /* Shrink Anon by iterating score_list */ + shrink_anon(pgdat, sc, nr); + + if (sc->nr_reclaimed - nr_reclaimed) + reclaimable = true; + + if (current_is_kswapd()) { + /* + * If reclaim is isolating dirty pages under writeback, + * it implies that the long-lived page allocation rate + * is exceeding the page laundering rate. Either the + * global limits are not being effective at throttling + * processes due to the page distribution throughout + * zones or there is heavy usage of a slow backing + * device. The only option is to throttle from reclaim + * context which is not ideal as there is no guarantee + * the dirtying process is throttled in the same way + * balance_dirty_pages() manages. + * + * Once a node is flagged PGDAT_WRITEBACK, kswapd will + * count the number of pages under pages flagged for + * immediate reclaim and stall if any are encountered + * in the nr_immediate check below. + */ + if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken) + set_bit(PGDAT_WRITEBACK, &pgdat->flags); + + /* Allow kswapd to start writing pages during reclaim. 
*/ + if (sc->nr.unqueued_dirty == sc->nr.file_taken) + set_bit(PGDAT_DIRTY, &pgdat->flags); + + /* + * If kswapd scans pages marked for immediate + * reclaim and under writeback (nr_immediate), it + * implies that pages are cycling through the LRU + * faster than they are written so also forcibly stall. + */ + if (sc->nr.immediate) + congestion_wait(BLK_RW_ASYNC, HZ/10); + } + /* + * Legacy memcg will stall in page writeback so avoid forcibly + * stalling in wait_iff_congested(). + */ + if ((current_is_kswapd() || + (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) && + sc->nr.dirty && sc->nr.dirty == sc->nr.congested) + set_bit(LRUVEC_CONGESTED, &target_lruvec->flags); + + /* + * Stall direct reclaim for IO completions if underlying BDIs + * and node is congested. Allow kswapd to continue until it + * starts encountering unqueued dirty pages or cycling through + * the LRU too quickly. + */ + if (!current_is_kswapd() && current_may_throttle() && + !sc->hibernation_mode && + test_bit(LRUVEC_CONGESTED, &target_lruvec->flags)) + wait_iff_congested(BLK_RW_ASYNC, HZ/10); + + } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, + sc)); + /* + * Kswapd gives up on balancing particular nodes after too + * many failures to reclaim anything from them and goes to + * sleep. On reclaim progress, reset the failure counter. A + * successful direct reclaim run will revive a dormant kswapd. + */ + if (reclaimable) + pgdat->kswapd_failures = 0; + + return reclaimable; +} diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 167169b3907d7153ee7e9b159ea8f6dbeaa670e5..30e068e95e214f5b078f88c47d8f6ed4f2d59c18 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -65,6 +65,7 @@ #include "slab.h" #include +#include #include @@ -666,7 +667,15 @@ static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz, static unsigned long soft_limit_excess(struct mem_cgroup *memcg) { +#ifdef CONFIG_HYPERHOLD_FILE_LRU + struct mem_cgroup_per_node *mz = mem_cgroup_nodeinfo(memcg, 0); + struct lruvec *lruvec = &mz->lruvec; + unsigned long nr_pages = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, + MAX_NR_ZONES) + lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, + MAX_NR_ZONES); +#else unsigned long nr_pages = page_counter_read(&memcg->memory); +#endif unsigned long soft_limit = READ_ONCE(memcg->soft_limit); unsigned long excess = 0; @@ -854,8 +863,13 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); /* Update memcg and lruvec */ - if (!mem_cgroup_disabled()) + if (!mem_cgroup_disabled()) { +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (is_node_lruvec(lruvec)) + return; +#endif __mod_memcg_lruvec_state(lruvec, idx, val); + } } void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val) @@ -906,6 +920,10 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, if (mem_cgroup_disabled()) return; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (!memcg) + return; +#endif x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]); if (unlikely(x > MEMCG_CHARGE_BATCH)) { @@ -1350,6 +1368,13 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgd goto out; } +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (is_file_lru(page_lru(page)) && + !is_prot_page(page)) { + lruvec = node_lruvec(pgdat); + goto out; + } +#endif memcg = page->mem_cgroup; /* * Swapcache readahead pages are added to the LRU - and @@ -1392,6 +1417,10 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, if 
(mem_cgroup_disabled()) return; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (is_node_lruvec(lruvec)) + return; +#endif mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); lru_size = &mz->lru_zone_size[zid][lru]; @@ -4168,6 +4197,9 @@ static int memcg_stat_show(struct seq_file *m, void *v) } #endif +#ifdef CONFIG_HYPERHOLD_DEBUG + memcg_eswap_info_show(m); +#endif return 0; } @@ -5191,6 +5223,10 @@ static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) struct mem_cgroup *mem_cgroup_from_id(unsigned short id) { WARN_ON_ONCE(!rcu_read_lock_held()); +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (id == -1) + return NULL; +#endif return idr_find(&mem_cgroup_idr, id); } @@ -5229,6 +5265,7 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) lruvec_init(&pn->lruvec); pn->usage_in_excess = 0; + pn->lruvec.pgdat = NODE_DATA(node); pn->on_tree = false; pn->memcg = memcg; @@ -5334,6 +5371,17 @@ static struct mem_cgroup *mem_cgroup_alloc(void) INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue); memcg->deferred_split_queue.split_queue_len = 0; #endif + +#ifdef CONFIG_HYPERHOLD_MEMCG + if (unlikely(!score_head_inited)) { + INIT_LIST_HEAD(&score_head); + score_head_inited = true; + } +#endif + +#ifdef CONFIG_HYPERHOLD_MEMCG + INIT_LIST_HEAD(&memcg->score_node); +#endif idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); return memcg; fail: @@ -5355,6 +5403,14 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (IS_ERR(memcg)) return ERR_CAST(memcg); +#ifdef CONFIG_HYPERHOLD_MEMCG + atomic64_set(&memcg->memcg_reclaimed.app_score, 300); +#endif +#ifdef CONFIG_HYPERHOLD_ZSWAPD + atomic_set(&memcg->memcg_reclaimed.ub_zram2ufs_ratio, 10); + atomic_set(&memcg->memcg_reclaimed.ub_mem2zram_ratio, 60); + atomic_set(&memcg->memcg_reclaimed.refault_threshold, 50); +#endif page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); @@ -5421,6 +5477,11 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) return -ENOMEM; } +#ifdef CONFIG_HYPERHOLD_MEMCG + memcg_app_score_update(memcg); + css_get(css); +#endif + /* Online state pins memcg ID, memcg ID pins CSS */ refcount_set(&memcg->id.ref, 1); css_get(css); @@ -5432,6 +5493,15 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_event *event, *tmp; +#ifdef CONFIG_HYPERHOLD_MEMCG + unsigned long flags; + + spin_lock_irqsave(&score_list_lock, flags); + list_del_init(&memcg->score_node); + spin_unlock_irqrestore(&score_list_lock, flags); + css_put(css); +#endif + /* * Unregister events and notify userspace. 
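The css_online()/css_offline() hooks above add and remove the memcg on a global score list, but memcg_app_score_update() itself is not part of this excerpt. The following is a purely hypothetical sketch of what it plausibly does, reduced to a standalone structure; the sorted-by-app_score ordering is my assumption, and only the list, lock, and field names are taken from the surrounding hunks:

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>

/*
 * Hypothetical sketch, not the patch's code: keep entries sorted by
 * app_score (highest first) so that get_next_memcg() can walk them in
 * priority order. Names mirror the hunks above.
 */
struct score_entry {
	struct list_head score_node;
	atomic64_t app_score;
};

static LIST_HEAD(score_head);
static DEFINE_SPINLOCK(score_list_lock);

static void score_entry_update(struct score_entry *target)
{
	struct score_entry *pos;
	unsigned long flags;

	spin_lock_irqsave(&score_list_lock, flags);
	list_del_init(&target->score_node);
	list_for_each_entry(pos, &score_head, score_node) {
		if (atomic64_read(&pos->app_score) <
		    atomic64_read(&target->app_score))
			break;
	}
	/* insert before the first lower-scored entry, or at the tail */
	list_add_tail(&target->score_node, &pos->score_node);
	spin_unlock_irqrestore(&score_list_lock, flags);
}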
* Notify userspace about cgroup removing only after rmdir of cgroup diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6275b1c05f111276e7289516aac3a42e1e02a1f5..5da1c0299456b4a77b240fb8f7eef22c24422d0f 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -36,6 +36,7 @@ #include #include #include +#include #include @@ -851,6 +852,9 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, kswapd_run(nid); kcompactd_run(nid); +#ifdef CONFIG_HYPERHOLD_ZSWAPD + zswapd_run(nid); +#endif writeback_set_ratelimit(); @@ -1600,6 +1604,9 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) if (arg.status_change_nid >= 0) { kswapd_stop(node); kcompactd_stop(node); +#ifdef CONFIG_HYPERHOLD_ZSWAPD + zswapd_stop(node); +#endif } writeback_set_ratelimit(); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 83c0146cb59e6ccbac90a9b7c3acd812cdeffd9d..15d25006cfa0656f0f742fae18f9292cf2b11928 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -70,6 +70,7 @@ #include #include #include +#include #include #include @@ -4924,6 +4925,11 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); +#ifdef CONFIG_HYPERHOLD_ZSWAPD + if (gfp_mask & __GFP_KSWAPD_RECLAIM) + wake_all_zswapd(); +#endif + if (should_fail_alloc_page(gfp_mask, order)) return false; @@ -6928,10 +6934,16 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat) init_waitqueue_head(&pgdat->kswapd_wait); init_waitqueue_head(&pgdat->pfmemalloc_wait); +#ifdef CONFIG_HYPERHOLD_ZSWAPD + init_waitqueue_head(&pgdat->zswapd_wait); +#endif pgdat_page_ext_init(pgdat); spin_lock_init(&pgdat->lru_lock); lruvec_init(&pgdat->__lruvec); +#if defined(CONFIG_HYPERHOLD_FILE_LRU) && defined(CONFIG_MEMCG) + pgdat->__lruvec.pgdat = pgdat; +#endif } static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid, diff --git a/mm/swap.c b/mm/swap.c index 47a47681c86b7f79f697c007af39f3e308dbbc06..4ea819c7a9e42450a157635619d5e0acff1e36d6 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -311,6 +311,12 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages) void lru_note_cost_page(struct page *page) { +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (page_is_file_lru(page)) { + lru_note_cost(&(page_pgdat(page)->__lruvec), 1, thp_nr_pages(page)); + return; + } +#endif lru_note_cost(mem_cgroup_page_lruvec(page, page_pgdat(page)), page_is_file_lru(page), thp_nr_pages(page)); } diff --git a/mm/swapfile.c b/mm/swapfile.c index 5af6b0f770de626c8ab644563c01e8f3081c6aee..181cfc1b129683e7a46a597b8953df4f32185694 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -43,6 +43,7 @@ #include #include #include +#include static bool swap_count_continued(struct swap_info_struct *, pgoff_t, unsigned char); @@ -3441,6 +3442,28 @@ void si_swapinfo(struct sysinfo *val) spin_unlock(&swap_lock); } +#ifdef CONFIG_HYPERHOLD_ZSWAPD +bool free_swap_is_low(void) +{ + unsigned int type; + unsigned long long freeswap = 0; + unsigned long nr_to_be_unused = 0; + + spin_lock(&swap_lock); + for (type = 0; type < nr_swapfiles; type++) { + struct swap_info_struct *si = swap_info[type]; + + if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) + nr_to_be_unused += si->inuse_pages; + } + freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused; + spin_unlock(&swap_lock); + + return (freeswap < get_free_swap_threshold()); +} +EXPORT_SYMBOL(free_swap_is_low); +#endif + /* * Verify that a swap entry is valid and increment its swap map 
count. * diff --git a/mm/vmscan.c b/mm/vmscan.c index 9f292132ed88997a45bffb88f3adb4b2f6e54228..86da03e277c5d26541b0d073a76431f88e1dc66d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -63,97 +63,9 @@ #define CREATE_TRACE_POINTS #include -struct scan_control { - /* How many pages shrink_list() should reclaim */ - unsigned long nr_to_reclaim; - - /* - * Nodemask of nodes allowed by the caller. If NULL, all nodes - * are scanned. - */ - nodemask_t *nodemask; - - /* - * The memory cgroup that hit its limit and as a result is the - * primary target of this reclaim invocation. - */ - struct mem_cgroup *target_mem_cgroup; - - /* - * Scan pressure balancing between anon and file LRUs - */ - unsigned long anon_cost; - unsigned long file_cost; - - /* Can active pages be deactivated as part of reclaim? */ -#define DEACTIVATE_ANON 1 -#define DEACTIVATE_FILE 2 - unsigned int may_deactivate:2; - unsigned int force_deactivate:1; - unsigned int skipped_deactivate:1; - - /* Writepage batching in laptop mode; RECLAIM_WRITE */ - unsigned int may_writepage:1; - - /* Can mapped pages be reclaimed? */ - unsigned int may_unmap:1; - - /* Can pages be swapped as part of reclaim? */ - unsigned int may_swap:1; - - /* - * Cgroup memory below memory.low is protected as long as we - * don't threaten to OOM. If any cgroup is reclaimed at - * reduced force or passed over entirely due to its memory.low - * setting (memcg_low_skipped), and nothing is reclaimed as a - * result, then go back for one more cycle that reclaims the protected - * memory (memcg_low_reclaim) to avert OOM. - */ - unsigned int memcg_low_reclaim:1; - unsigned int memcg_low_skipped:1; - - unsigned int hibernation_mode:1; - - /* One of the zones is ready for compaction */ - unsigned int compaction_ready:1; - - /* There is easily reclaimable cold cache in the current node */ - unsigned int cache_trim_mode:1; - - /* The file pages on the current node are dangerously low */ - unsigned int file_is_tiny:1; - - /* Allocation order */ - s8 order; - - /* Scan (total_size >> priority) pages at once */ - s8 priority; - - /* The highest zone to isolate pages for reclaim from */ - s8 reclaim_idx; - - /* This context's GFP mask */ - gfp_t gfp_mask; - - /* Incremented by the number of inactive pages that were scanned */ - unsigned long nr_scanned; - - /* Number of pages freed so far during a call to shrink_zones() */ - unsigned long nr_reclaimed; - - struct { - unsigned int dirty; - unsigned int unqueued_dirty; - unsigned int congested; - unsigned int writeback; - unsigned int immediate; - unsigned int file_taken; - unsigned int taken; - } nr; - - /* for recording the reclaimed slab by now */ - struct reclaim_state reclaim_state; -}; +#ifdef CONFIG_HYPERHOLD_FILE_LRU +#include +#endif #ifdef ARCH_HAS_PREFETCHW #define prefetchw_prev_lru_page(_page, _base, _field) \ @@ -169,6 +81,10 @@ struct scan_control { #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0) #endif +#ifdef CONFIG_HYPERHOLD_FILE_LRU +unsigned int enough_inactive_file = 1; +#endif + /* * From 0 .. 200. Higher means more swappy. */ @@ -230,7 +146,7 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker) idr_remove(&shrinker_idr, id); } -static bool cgroup_reclaim(struct scan_control *sc) +bool cgroup_reclaim(struct scan_control *sc) { return sc->target_mem_cgroup; } @@ -248,7 +164,7 @@ static bool cgroup_reclaim(struct scan_control *sc) * This function tests whether the vmscan currently in progress can assume * that the normal dirty throttling mechanism is operational. 
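free_swap_is_low(), exported from mm/swapfile.c above, lets the zswapd side ask whether backing swap is close to its configured threshold before pushing more anonymous data out. A hypothetical caller (the wrapper is illustrative only; swapout() is the routine added in mm/zswapd.c further below):

/*
 * Illustrative only: back off instead of exhausting the remaining swap
 * slots. free_swap_is_low() is the helper added above; swapout() is the
 * zswapd routine introduced later in this patch.
 */
static u64 try_swapout(u64 want)
{
	if (free_swap_is_low())
		return 0;

	return swapout(want);
}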
*/ -static bool writeback_throttling_sane(struct scan_control *sc) +bool writeback_throttling_sane(struct scan_control *sc) { if (!cgroup_reclaim(sc)) return true; @@ -268,12 +184,12 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker) { } -static bool cgroup_reclaim(struct scan_control *sc) +bool cgroup_reclaim(struct scan_control *sc) { return false; } -static bool writeback_throttling_sane(struct scan_control *sc) +bool writeback_throttling_sane(struct scan_control *sc) { return true; } @@ -308,6 +224,20 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone unsigned long size = 0; int zid; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (!mem_cgroup_disabled() && is_node_lruvec(lruvec)) { + for (zid = 0; zid <= zone_idx && zid < MAX_NR_ZONES; zid++) { + struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid]; + + if (!managed_zone(zone)) + continue; + + size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru); + } + + return size; + } +#endif for (zid = 0; zid <= zone_idx && zid < MAX_NR_ZONES; zid++) { struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid]; @@ -638,9 +568,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, * * Returns the number of reclaimed slab objects. */ -static unsigned long shrink_slab(gfp_t gfp_mask, int nid, - struct mem_cgroup *memcg, - int priority) +unsigned long shrink_slab(gfp_t gfp_mask, int nid, + struct mem_cgroup *memcg, + int priority) { unsigned long ret, freed = 0; struct shrinker *shrinker; @@ -1064,11 +994,11 @@ static void page_check_dirty_writeback(struct page *page, /* * shrink_page_list() returns the number of reclaimed pages */ -static unsigned int shrink_page_list(struct list_head *page_list, - struct pglist_data *pgdat, - struct scan_control *sc, - struct reclaim_stat *stat, - bool ignore_references) +unsigned int shrink_page_list(struct list_head *page_list, + struct pglist_data *pgdat, + struct scan_control *sc, + struct reclaim_stat *stat, + bool ignore_references) { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); @@ -1642,7 +1572,7 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, * * returns how many pages were moved onto *@dst. */ -static unsigned long isolate_lru_pages(unsigned long nr_to_scan, +unsigned long isolate_lru_pages(unsigned long nr_to_scan, struct lruvec *lruvec, struct list_head *dst, unsigned long *nr_scanned, struct scan_control *sc, enum lru_list lru) @@ -1837,14 +1767,17 @@ static int too_many_isolated(struct pglist_data *pgdat, int file, * Returns the number of pages moved to the given lruvec. 
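Several hunks in this patch repeat the same decision — mem_cgroup_page_lruvec() earlier, move_pages_to_lru() just below, and the mm/workingset.c changes later: file pages that are not "protected" are accounted on the per-node lruvec rather than the owning memcg's. A sketch of that selection factored into one helper (the helper itself is mine; is_prot_page(), node_lruvec() and the rest come from the patch):

/*
 * Sketch only, not part of the patch: which lruvec a page is accounted on
 * when CONFIG_HYPERHOLD_FILE_LRU is enabled.
 */
static struct lruvec *hyperhold_page_lruvec(struct page *page,
					    struct pglist_data *pgdat)
{
	/* unprotected page-cache pages live on the node-wide file LRU */
	if (page_is_file_lru(page) && !is_prot_page(page))
		return node_lruvec(pgdat);

	/* anon and protected file pages stay on the memcg lruvec */
	return mem_cgroup_page_lruvec(page, pgdat);
}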
*/ -static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, - struct list_head *list) +unsigned move_pages_to_lru(struct lruvec *lruvec, struct list_head *list) { struct pglist_data *pgdat = lruvec_pgdat(lruvec); int nr_pages, nr_moved = 0; LIST_HEAD(pages_to_free); struct page *page; enum lru_list lru; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + bool prot; + bool file; +#endif while (!list_empty(list)) { page = lru_to_page(list); @@ -1878,8 +1811,23 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, list_add(&page->lru, &pages_to_free); } else { nr_moved += nr_pages; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (PageActive(page)) { + prot = is_prot_page(page); + file = page_is_file_lru(page); + if (!prot && file) { + lruvec = node_lruvec(pgdat); + workingset_age_nonresident(lruvec, + nr_pages); + } else { + workingset_age_nonresident(lruvec, + nr_pages); + } + } +#else if (PageActive(page)) workingset_age_nonresident(lruvec, nr_pages); +#endif } } @@ -1897,7 +1845,7 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, * In that case we should only throttle if the backing device it is * writing to is congested. In other cases it is safe to throttle. */ -static int current_may_throttle(void) +int current_may_throttle(void) { return !(current->flags & PF_LOCAL_THROTTLE) || current->backing_dev_info == NULL || @@ -1908,9 +1856,8 @@ static int current_may_throttle(void) * shrink_inactive_list() is a helper for shrink_node(). It returns the number * of reclaimed pages */ -static noinline_for_stack unsigned long -shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, - struct scan_control *sc, enum lru_list lru) +unsigned long shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, + struct scan_control *sc, enum lru_list lru) { LIST_HEAD(page_list); unsigned long nr_scanned; @@ -1926,6 +1873,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, if (stalled) return 0; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + sc->isolate_count++; +#endif /* wait a bit for the reclaimer. */ msleep(100); stalled = true; @@ -1961,7 +1911,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, move_pages_to_lru(lruvec, &page_list); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (file) + lru_note_cost(node_lruvec(pgdat), file, stat.nr_pageout); + else + lru_note_cost(lruvec, file, stat.nr_pageout); +#else lru_note_cost(lruvec, file, stat.nr_pageout); + +#endif + item = current_is_kswapd() ? 
PGSTEAL_KSWAPD : PGSTEAL_DIRECT; if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_reclaimed); @@ -2001,7 +1960,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, return nr_reclaimed; } -static void shrink_active_list(unsigned long nr_to_scan, +void shrink_active_list(unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, enum lru_list lru) @@ -2150,7 +2109,7 @@ unsigned long reclaim_pages(struct list_head *page_list) return nr_reclaimed; } -static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, +unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc) { if (is_active_lru(lru)) { @@ -2192,7 +2151,7 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, * 1TB 101 10GB * 10TB 320 32GB */ -static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru) +bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru) { enum lru_list active_lru = inactive_lru + LRU_ACTIVE; unsigned long inactive, active; @@ -2211,13 +2170,6 @@ static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru) return inactive * inactive_ratio < active; } -enum scan_balance { - SCAN_EQUAL, - SCAN_FRACT, - SCAN_ANON, - SCAN_FILE, -}; - /* * Determine how aggressively the anon and file LRU lists should be * scanned. The relative value of each set of LRU lists is determined @@ -2227,6 +2179,7 @@ enum scan_balance { * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan */ +#ifndef CONFIG_HYPERHOLD_FILE_LRU static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, unsigned long *nr) { @@ -2423,7 +2376,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, } } -static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { unsigned long nr[NR_LRU_LISTS]; unsigned long targets[NR_LRU_LISTS]; @@ -2536,6 +2489,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); } +#endif /* Use reclaim/compaction for costly allocs or under memory pressure */ static bool in_reclaim_compaction(struct scan_control *sc) @@ -2555,9 +2509,9 @@ static bool in_reclaim_compaction(struct scan_control *sc) * calls try_to_compact_pages() that it will have enough free pages to succeed. * It will give up earlier than that if there is difficulty reclaiming pages. 
*/ -static inline bool should_continue_reclaim(struct pglist_data *pgdat, - unsigned long nr_reclaimed, - struct scan_control *sc) +inline bool should_continue_reclaim(struct pglist_data *pgdat, + unsigned long nr_reclaimed, + struct scan_control *sc) { unsigned long pages_for_compaction; unsigned long inactive_lru_pages; @@ -2608,6 +2562,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, return inactive_lru_pages > pages_for_compaction; } +#ifndef CONFIG_HYPERHOLD_FILE_LRU static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) { struct mem_cgroup *target_memcg = sc->target_mem_cgroup; @@ -2856,6 +2811,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) if (reclaimable) pgdat->kswapd_failures = 0; } +#endif /* * Returns true if compaction should go ahead for a costly-order request, or @@ -2972,7 +2928,11 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) if (zone->zone_pgdat == last_pgdat) continue; last_pgdat = zone->zone_pgdat; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + shrink_node_hyperhold(zone->zone_pgdat, sc); +#else shrink_node(zone->zone_pgdat, sc); +#endif } /* @@ -2987,6 +2947,14 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat) struct lruvec *target_lruvec; unsigned long refaults; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + struct lruvec *lruvec; + + lruvec = node_lruvec(pgdat); + lruvec->refaults[0] = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE_ANON); /* modified */ + lruvec->refaults[1] = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE_FILE); /* modified */ +#endif + target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); target_lruvec->refaults[0] = refaults; @@ -3291,6 +3259,9 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, .reclaim_idx = MAX_NR_ZONES - 1, .may_swap = !noswap, }; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + unsigned long nr[NR_LRU_LISTS]; +#endif WARN_ON_ONCE(!current->reclaim_state); @@ -3307,7 +3278,17 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, * will pick up pages from other mem cgroup's as well. We hack * the priority and make it zero. */ +#ifdef CONFIG_HYPERHOLD_FILE_LRU + nr[LRU_ACTIVE_ANON] = lruvec_lru_size(lruvec, + LRU_ACTIVE_ANON, MAX_NR_ZONES); + nr[LRU_INACTIVE_ANON] = lruvec_lru_size(lruvec, + LRU_INACTIVE_ANON, MAX_NR_ZONES); + nr[LRU_ACTIVE_FILE] = 0; + nr[LRU_INACTIVE_FILE] = 0; + shrink_anon_memcg(pgdat, memcg, &sc, nr); +#else shrink_lruvec(lruvec, &sc); +#endif trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); @@ -3512,7 +3493,11 @@ static bool kswapd_shrink_node(pg_data_t *pgdat, * Historically care was taken to put equal pressure on all zones but * now pressure is applied based on node LRU order. */ +#ifdef CONFIG_HYPERHOLD_FILE_LRU + shrink_node_hyperhold(pgdat, sc); +#else shrink_node(pgdat, sc); +#endif /* * Fragmentation may mean that the system cannot be rebalanced for @@ -4198,7 +4183,11 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in * priorities until we have enough memory freed. 
*/ do { +#ifdef CONFIG_HYPERHOLD_FILE_LRU + shrink_node_hyperhold(pgdat, &sc); +#else shrink_node(pgdat, &sc); +#endif } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); } diff --git a/mm/vmstat.c b/mm/vmstat.c index 698bc0bc18d146942151348bac4012dea31b09bb..a03aa6b3e4dcb638438e969db4f0deb5f8f7ef20 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1350,6 +1350,24 @@ const char * const vmstat_text[] = { "swap_ra", "swap_ra_hit", #endif +#ifdef CONFIG_HYPERHOLD_ZSWAPD + "zswapd_running", + "zswapd_hit_refaults", + "zswapd_medium_press", + "zswapd_critical_press", + "zswapd_memcg_ratio_skip", + "zswapd_memcg_refault_skip", + "zswapd_swapout", + "zswapd_empty_round", + "zswapd_empty_round_skip_times", + "zswapd_snapshot_times", + "zswapd_reclaimed", + "zswapd_scanned", +#endif +#ifdef CONFIG_HYPERHOLD_MEMCG + "freeze_reclaimed", + "freeze_reclaim_count", +#endif #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */ }; #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */ diff --git a/mm/workingset.c b/mm/workingset.c index 975a4d2dd02eeb064a440e13cdce1cf1ee6a6c55..28d9bf0c5e5d506954cd102db7d5669e39f2f004 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -263,7 +263,16 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg) VM_BUG_ON_PAGE(!PageLocked(page), page); lruvec = mem_cgroup_lruvec(target_memcg, pgdat); +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (!is_prot_page(page) && page_is_file_lru(page)) { + lruvec = node_lruvec(pgdat); + workingset_age_nonresident(lruvec, thp_nr_pages(page)); + } else { + workingset_age_nonresident(lruvec, thp_nr_pages(page)); + } +#else workingset_age_nonresident(lruvec, thp_nr_pages(page)); +#endif /* XXX: target_memcg can be NULL, go through lruvec */ memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); eviction = atomic_long_read(&lruvec->nonresident_age); @@ -313,9 +322,19 @@ void workingset_refault(struct page *page, void *shadow) * would be better if the root_mem_cgroup existed in all * configurations instead. */ +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (memcgid == -1) + eviction_lruvec = node_lruvec(pgdat); + else { + eviction_memcg = mem_cgroup_from_id(memcgid); + if (!mem_cgroup_disabled() && !eviction_memcg) + goto out; + } +#else eviction_memcg = mem_cgroup_from_id(memcgid); if (!mem_cgroup_disabled() && !eviction_memcg) goto out; +#endif eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat); refault = atomic_long_read(&eviction_lruvec->nonresident_age); @@ -347,8 +366,15 @@ void workingset_refault(struct page *page, void *shadow) */ memcg = page_memcg(page); lruvec = mem_cgroup_lruvec(memcg, pgdat); - +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (!is_prot_page(page) && file) + inc_lruvec_state(node_lruvec(pgdat), + WORKINGSET_REFAULT_BASE + file); + else + inc_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file); +#else inc_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file); +#endif /* * Compare the distance to the existing workingset size. We @@ -357,10 +383,21 @@ void workingset_refault(struct page *page, void *shadow) * workingset competition needs to consider anon or not depends * on having swap. 
*/ +#ifdef CONFIG_HYPERHOLD_FILE_LRU + workingset_size = lruvec_page_state(node_lruvec(pgdat), NR_ACTIVE_FILE); +#else workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE); +#endif + if (!file) { +#ifdef CONFIG_HYPERHOLD_FILE_LRU + workingset_size += lruvec_page_state(node_lruvec(pgdat), + NR_INACTIVE_FILE); +#else + workingset_size += lruvec_page_state(eviction_lruvec, NR_INACTIVE_FILE); +#endif } if (mem_cgroup_get_nr_swap_pages(memcg) > 0) { workingset_size += lruvec_page_state(eviction_lruvec, @@ -374,8 +411,19 @@ void workingset_refault(struct page *page, void *shadow) goto out; SetPageActive(page); +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (!is_prot_page(page) && file) { + workingset_age_nonresident(node_lruvec(pgdat), + thp_nr_pages(page)); + inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file); + } else { + workingset_age_nonresident(lruvec, thp_nr_pages(page)); + inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file); + } +#else workingset_age_nonresident(lruvec, thp_nr_pages(page)); inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file); +#endif /* Page was active prior to eviction */ if (workingset) { @@ -384,7 +432,14 @@ void workingset_refault(struct page *page, void *shadow) spin_lock_irq(&page_pgdat(page)->lru_lock); lru_note_cost_page(page); spin_unlock_irq(&page_pgdat(page)->lru_lock); +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (!is_prot_page(page) && file) + inc_lruvec_state(node_lruvec(pgdat), WORKINGSET_RESTORE_BASE + file); + else + inc_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file); +#else inc_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file); +#endif } out: rcu_read_unlock(); @@ -411,7 +466,16 @@ void workingset_activation(struct page *page) if (!mem_cgroup_disabled() && !memcg) goto out; lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (!is_prot_page(page) && page_is_file_lru(page)) { + lruvec = node_lruvec(page_pgdat(page)); + workingset_age_nonresident(lruvec, thp_nr_pages(page)); + } else { + workingset_age_nonresident(lruvec, thp_nr_pages(page)); + } +#else workingset_age_nonresident(lruvec, thp_nr_pages(page)); +#endif out: rcu_read_unlock(); } @@ -487,6 +551,11 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE */ #ifdef CONFIG_MEMCG +#ifdef CONFIG_HYPERHOLD_FILE_LRU + pages = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) + + node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE); +#else + if (sc->memcg) { struct lruvec *lruvec; int i; @@ -500,6 +569,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, pages += lruvec_page_state_local( lruvec, NR_SLAB_UNRECLAIMABLE_B) >> PAGE_SHIFT; } else +#endif #endif pages = node_present_pages(sc->nid); diff --git a/mm/zswapd.c b/mm/zswapd.c new file mode 100644 index 0000000000000000000000000000000000000000..577d97974229d2ca25a2c661b5cc31880463b59b --- /dev/null +++ b/mm/zswapd.c @@ -0,0 +1,882 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * mm/zswapd.c + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. 
+ */ + +#include +#include +#include +#include +#include + +#include "zswapd_internal.h" +#include "internal.h" + +#define UNSET_ZRAM_WM_RATIO 0 +#define DEFAULT_ZRAM_WM_RATIO 37 +#define SWAP_MORE_ZRAM (50 * (SZ_1M)) + +static wait_queue_head_t snapshotd_wait; +static atomic_t snapshotd_wait_flag; +static atomic_t snapshotd_init_flag = ATOMIC_INIT(0); +static struct task_struct *snapshotd_task; + +static pid_t zswapd_pid = -1; +static unsigned long long last_anon_pagefault; +static unsigned long long anon_refault_ratio; +static unsigned long long zswapd_skip_interval; +static unsigned long last_zswapd_time; +static unsigned long last_snapshot_time; +bool last_round_is_empty; + + +DECLARE_RWSEM(gs_lock); +LIST_HEAD(gs_list); + +void unregister_group_swap(struct group_swap_device *gsdev) +{ + down_write(&gs_lock); + list_del(&gsdev->list); + up_write(&gs_lock); + + kfree(gsdev); +} +EXPORT_SYMBOL(unregister_group_swap); + +struct group_swap_device *register_group_swap(struct group_swap_ops *ops, void *priv) +{ + struct group_swap_device *gsdev = kzalloc(sizeof(struct group_swap_device), GFP_KERNEL); + + if (!gsdev) + return NULL; + + gsdev->priv = priv; + gsdev->ops = ops; + + down_write(&gs_lock); + list_add(&gsdev->list, &gs_list); + up_write(&gs_lock); + + return gsdev; +} +EXPORT_SYMBOL(register_group_swap); + +u64 memcg_data_size(struct mem_cgroup *memcg, int type) +{ + struct group_swap_device *gsdev = NULL; + u64 size = 0; + + down_read(&gs_lock); + list_for_each_entry(gsdev, &gs_list, list) + size += gsdev->ops->group_data_size(memcg->id.id, type, gsdev->priv); + up_read(&gs_lock); + + return size; +} + +u64 swapin_memcg(struct mem_cgroup *memcg, u64 req_size) +{ + u64 swap_size = memcg_data_size(memcg, SWAP_SIZE); + u64 read_size = 0; + u64 ratio = atomic64_read(&memcg->memcg_reclaimed.ub_ufs2zram_ratio); + struct group_swap_device *gsdev = NULL; + + if (req_size > swap_size * ratio) + req_size = swap_size * ratio; + down_read(&gs_lock); + list_for_each_entry(gsdev, &gs_list, list) { + read_size += gsdev->ops->group_write(memcg->id.id, req_size - read_size, + gsdev->priv); + if (read_size >= req_size) + break; + } + up_read(&gs_lock); + + return read_size; +} + +static u64 swapout_memcg(struct mem_cgroup *memcg, u64 req_size) +{ + u64 cache_size = memcg_data_size(memcg, CACHE_SIZE); + u64 swap_size = memcg_data_size(memcg, SWAP_SIZE); + u64 all_size = cache_size + swap_size; + u64 write_size = 0; + u32 ratio = atomic_read(&memcg->memcg_reclaimed.ub_zram2ufs_ratio); + struct group_swap_device *gsdev = NULL; + + if (all_size * ratio <= swap_size) + return 0; + if (req_size > all_size * ratio - swap_size) + req_size = all_size * ratio - swap_size; + down_read(&gs_lock); + list_for_each_entry(gsdev, &gs_list, list) { + write_size += gsdev->ops->group_write(memcg->id.id, req_size - write_size, + gsdev->priv); + if (write_size >= req_size) + break; + } + up_read(&gs_lock); + + return write_size; +} + +static u64 swapout(u64 req_size) +{ + struct mem_cgroup *memcg = NULL; + u64 write_size = 0; + + while ((memcg = get_next_memcg(memcg))) { + write_size += swapout_memcg(memcg, req_size - write_size); + if (write_size >= req_size) + break; + } + + return write_size; +} + +static unsigned long long get_zram_used_pages(void) +{ + struct mem_cgroup *memcg = NULL; + unsigned long long zram_pages = 0; + + while ((memcg = get_next_memcg(memcg))) + zram_pages += memcg_data_size(memcg, CACHE_PAGE); + + return zram_pages; +} + +static unsigned long long get_eswap_used_pages(void) +{ + struct 
mem_cgroup *memcg = NULL; + unsigned long long eswap_pages = 0; + + while ((memcg = get_next_memcg(memcg))) + eswap_pages += memcg_data_size(memcg, SWAP_PAGE); + + return eswap_pages; +} + +static unsigned long long get_zram_pagefault(void) +{ + struct mem_cgroup *memcg = NULL; + unsigned long long cache_fault = 0; + + while ((memcg = get_next_memcg(memcg))) + cache_fault += memcg_data_size(memcg, CACHE_FAULT); + + return cache_fault; +} + +static unsigned int calc_sys_cur_avail_buffers(void) +{ + const unsigned int percent_constant = 100; + unsigned long freemem; + unsigned long active_file; + unsigned long inactive_file; + unsigned long inactive_anon; + unsigned long buffers; + + freemem = global_zone_page_state(NR_FREE_PAGES) * PAGE_SIZE / SZ_1K; + active_file = global_node_page_state(NR_ACTIVE_FILE) * PAGE_SIZE / SZ_1K; + inactive_file = global_node_page_state(NR_INACTIVE_FILE) * PAGE_SIZE / SZ_1K; + inactive_anon = global_node_page_state(NR_INACTIVE_ANON) * PAGE_SIZE / SZ_1K; + + buffers = freemem + inactive_file * get_inactive_file_ratio() / percent_constant + + active_file * get_active_file_ratio() / percent_constant; + + return (buffers * SZ_1K / SZ_1M); /* kb to mb */ +} + +void zswapd_status_show(struct seq_file *m) +{ + unsigned int buffers = calc_sys_cur_avail_buffers(); + + seq_printf(m, "buffer_size:%u\n", buffers); + seq_printf(m, "recent_refault:%llu\n", anon_refault_ratio); +} + +pid_t get_zswapd_pid(void) +{ + return zswapd_pid; +} + +static bool min_buffer_is_suitable(void) +{ + unsigned int buffers = calc_sys_cur_avail_buffers(); + + if (buffers >= get_min_avail_buffers()) + return true; + + return false; +} + +static bool buffer_is_suitable(void) +{ + unsigned int buffers = calc_sys_cur_avail_buffers(); + + if (buffers >= get_avail_buffers()) + return true; + + return false; +} + +static bool high_buffer_is_suitable(void) +{ + unsigned int buffers = calc_sys_cur_avail_buffers(); + + if (buffers >= get_high_avail_buffers()) + return true; + + return false; +} + +static void snapshot_anon_refaults(void) +{ + struct mem_cgroup *memcg = NULL; + + while (memcg = get_next_memcg(memcg)) + memcg->memcg_reclaimed.reclaimed_pagefault = memcg_data_size(memcg, CACHE_FAULT); + + last_anon_pagefault = get_zram_pagefault(); + last_snapshot_time = jiffies; +} + +/* + * Return true if refault changes between two read operations. 
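register_group_swap() and memcg_data_size() above form the interface through which zswapd queries and moves per-memcg data between zram and eswap. The layout of struct group_swap_ops is not visible in this excerpt, so the callback names and signatures below are inferred from the call sites (ops->group_data_size(gid, type, priv) and ops->group_write(gid, size, priv)); the backend itself is purely hypothetical:

/*
 * Hypothetical provider registration. A real backend (e.g. zram) would
 * report its per-group footprint in group_data_size() and write groups
 * back in group_write(); the stubs below only show the shape, and the
 * parameter types are assumptions based on the callers above.
 */
static u64 demo_group_data_size(unsigned short gid, int type, void *priv)
{
	return 0;	/* bytes (or pages) of 'type' held for group 'gid' */
}

static u64 demo_group_write(unsigned short gid, u64 size, void *priv)
{
	return 0;	/* amount actually written back for group 'gid' */
}

static struct group_swap_ops demo_gs_ops = {
	.group_data_size = demo_group_data_size,
	.group_write = demo_group_write,
};

static struct group_swap_device *demo_gsdev;

static int __init demo_group_swap_init(void)
{
	demo_gsdev = register_group_swap(&demo_gs_ops, NULL);

	return demo_gsdev ? 0 : -ENOMEM;
}
module_init(demo_group_swap_init);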
+ */ +static bool get_memcg_anon_refault_status(struct mem_cgroup *memcg) +{ + const unsigned int percent_constant = 100; + unsigned long long anon_pagefault; + unsigned long anon_total; + unsigned long long ratio; + struct mem_cgroup_per_node *mz = NULL; + struct lruvec *lruvec = NULL; + + if (!memcg) + return false; + + anon_pagefault = memcg_data_size(memcg, CACHE_FAULT); + if (anon_pagefault == memcg->memcg_reclaimed.reclaimed_pagefault) + return false; + + mz = mem_cgroup_nodeinfo(memcg, 0); + if (!mz) + return false; + + lruvec = &mz->lruvec; + if (!lruvec) + return false; + + anon_total = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) + + lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES) + + memcg_data_size(memcg, SWAP_PAGE) + memcg_data_size(memcg, CACHE_PAGE); + + ratio = (anon_pagefault - memcg->memcg_reclaimed.reclaimed_pagefault) * + percent_constant / (anon_total + 1); + if (ratio > atomic_read(&memcg->memcg_reclaimed.refault_threshold)) + return true; + + return false; +} + +static bool get_area_anon_refault_status(void) +{ + const unsigned int percent_constant = 1000; + unsigned long long anon_pagefault; + unsigned long long ratio; + unsigned long long time; + + anon_pagefault = get_zram_pagefault(); + time = jiffies; + if (anon_pagefault == last_anon_pagefault || time == last_snapshot_time) + return false; + + ratio = (anon_pagefault - last_anon_pagefault) * percent_constant / + (jiffies_to_msecs(time - last_snapshot_time) + 1); + anon_refault_ratio = ratio; + + if (ratio > get_area_anon_refault_threshold()) + return true; + + return false; +} + +void wakeup_snapshotd(void) +{ + unsigned long snapshot_interval; + + snapshot_interval = jiffies_to_msecs(jiffies - last_snapshot_time); + if (snapshot_interval >= get_anon_refault_snapshot_min_interval()) { + atomic_set(&snapshotd_wait_flag, 1); + wake_up_interruptible(&snapshotd_wait); + } +} + +static int snapshotd(void *p) +{ + int ret; + + while (!kthread_should_stop()) { + ret = wait_event_interruptible(snapshotd_wait, atomic_read(&snapshotd_wait_flag)); + if (ret) + continue; + + atomic_set(&snapshotd_wait_flag, 0); + + snapshot_anon_refaults(); + count_vm_event(ZSWAPD_SNAPSHOT_TIMES); + } + + return 0; +} + +void set_snapshotd_init_flag(unsigned int val) +{ + atomic_set(&snapshotd_init_flag, val); +} + +/* + * This snapshotd start function will be called by init. 
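wakeup_zswapd() below only pokes pgdat->zswapd_wait after the snapshot, buffer, and skip-interval checks pass; the thread that sleeps on that waitqueue is outside this excerpt. A hypothetical waiter loop, matching the flag and waitqueue names used below:

/*
 * Hypothetical zswapd thread body (not part of this hunk): sleep until
 * wakeup_zswapd() sets zswapd_wait_flag, then run one reclaim pass.
 */
static int zswapd_loop(void *p)
{
	pg_data_t *pgdat = p;

	set_freezable();
	while (!kthread_should_stop()) {
		wait_event_freezable(pgdat->zswapd_wait,
				     atomic_read(&pgdat->zswapd_wait_flag) ||
				     kthread_should_stop());
		atomic_set(&pgdat->zswapd_wait_flag, 0);
		last_zswapd_time = jiffies;

		zswapd_shrink_node(pgdat);	/* defined later in this file */
	}

	return 0;
}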
+ */ +int snapshotd_run(void) +{ + atomic_set(&snapshotd_wait_flag, 0); + init_waitqueue_head(&snapshotd_wait); + + snapshotd_task = kthread_run(snapshotd, NULL, "snapshotd"); + if (IS_ERR(snapshotd_task)) { + pr_err("Failed to start snapshotd\n"); + return PTR_ERR(snapshotd_task); + } + + return 0; +} + +static int __init snapshotd_init(void) +{ + snapshotd_run(); + + return 0; +} +module_init(snapshotd_init); + +static int get_zswapd_eswap_policy(void) +{ + if (get_zram_wm_ratio() == UNSET_ZRAM_WM_RATIO) + return CHECK_BUFFER_ONLY; + else + return CHECK_BUFFER_ZRAMRATIO_BOTH; +} + +static unsigned int get_policy_zram_wm_ratio(void) +{ + enum zswapd_eswap_policy policy = get_zswapd_eswap_policy(); + + if (policy == CHECK_BUFFER_ONLY) + return DEFAULT_ZRAM_WM_RATIO; + else + return get_zram_wm_ratio(); +} + +int get_zram_current_watermark(void) +{ + long long diff_buffers; + const unsigned int percent_constant = 10; + u64 nr_total; + unsigned int zram_wm_ratio = get_policy_zram_wm_ratio(); + + nr_total = totalram_pages(); + /* B_target - B_current */ + diff_buffers = get_avail_buffers() - calc_sys_cur_avail_buffers(); + /* MB to page */ + diff_buffers *= SZ_1M / PAGE_SIZE; + /* after_comp to before_comp */ + diff_buffers *= get_compress_ratio(); + /* page to ratio */ + diff_buffers = diff_buffers * percent_constant / nr_total; + + return min(zram_wm_ratio, zram_wm_ratio - diff_buffers); +} + +bool zram_watermark_ok(void) +{ + const unsigned int percent_constant = 100; + u64 nr_zram_used; + u64 nr_wm; + u64 ratio; + + ratio = get_zram_current_watermark(); + nr_zram_used = get_zram_used_pages(); + nr_wm = totalram_pages() * ratio / percent_constant; + if (nr_zram_used > nr_wm) + return true; + + return false; +} + +bool zram_watermark_exceed(void) +{ + u64 nr_zram_used; + const unsigned long long nr_wm = get_zram_critical_threshold() * (SZ_1M / PAGE_SIZE); + + if (!nr_wm) + return false; + + nr_zram_used = get_zram_used_pages(); + if (nr_zram_used > nr_wm) + return true; + return false; +} + +void wakeup_zswapd(pg_data_t *pgdat) +{ + unsigned long interval; + + if (IS_ERR(pgdat->zswapd)) + return; + + if (!wq_has_sleeper(&pgdat->zswapd_wait)) + return; + + /* + * make anon pagefault snapshots + * wake up snapshotd + */ + if (atomic_read(&snapshotd_init_flag) == 1) + wakeup_snapshotd(); + + /* wake up when the buffer is lower than min_avail_buffer */ + if (min_buffer_is_suitable()) + return; + + interval = jiffies_to_msecs(jiffies - last_zswapd_time); + if (interval < zswapd_skip_interval) { + count_vm_event(ZSWAPD_EMPTY_ROUND_SKIP_TIMES); + return; + } + + atomic_set(&pgdat->zswapd_wait_flag, 1); + wake_up_interruptible(&pgdat->zswapd_wait); +} + +void wake_all_zswapd(void) +{ + pg_data_t *pgdat = NULL; + int nid; + + for_each_online_node(nid) { + pgdat = NODE_DATA(nid); + wakeup_zswapd(pgdat); + } +} + +static void zswapd_shrink_active_list(unsigned long nr_to_scan, + struct lruvec *lruvec, struct scan_control *sc, enum lru_list lru) +{ + unsigned int nr_deactivate; + unsigned long nr_scanned; + unsigned long nr_taken; + + struct page *page = NULL; + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + unsigned long *node_anon_cost = &pgdat->__lruvec.anon_cost; + unsigned long *anon_cost = &lruvec->anon_cost; + LIST_HEAD(l_inactive); + LIST_HEAD(l_hold); + + lru_add_drain(); + + spin_lock_irq(&pgdat->lru_lock); + nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, lru); + __mod_node_page_state(pgdat, NR_ISOLATED_ANON, nr_taken); + *anon_cost += nr_taken; + 
*node_anon_cost += nr_taken; + __count_vm_events(PGREFILL, nr_scanned); + count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); + spin_unlock_irq(&pgdat->lru_lock); + + while (!list_empty(&l_hold)) { + cond_resched(); + page = lru_to_page(&l_hold); + list_del(&page->lru); + + if (unlikely(!page_evictable(page))) { + putback_lru_page(page); + continue; + } + + ClearPageActive(page); + SetPageWorkingset(page); + list_add(&page->lru, &l_inactive); + } + + spin_lock_irq(&pgdat->lru_lock); + nr_deactivate = move_pages_to_lru(lruvec, &l_inactive); + __mod_node_page_state(pgdat, NR_ISOLATED_ANON, -nr_taken); + spin_unlock_irq(&pgdat->lru_lock); + + mem_cgroup_uncharge_list(&l_inactive); + free_unref_page_list(&l_inactive); + + trace_mm_vmscan_lru_zswapd_shrink_active(pgdat->node_id, nr_taken, + nr_deactivate, sc->priority); +} + +static unsigned long zswapd_shrink_list(enum lru_list lru, + unsigned long nr_to_scan, struct lruvec *lruvec, + struct scan_control *sc) +{ + if (is_active_lru(lru)) { + if (sc->may_deactivate & (1 << is_file_lru(lru))) + zswapd_shrink_active_list(nr_to_scan, lruvec, sc, lru); + else + sc->skipped_deactivate = 1; + return 0; + } + + return shrink_inactive_list(nr_to_scan, lruvec, sc, lru); +} + +static void zswapd_shrink_anon_memcg(struct pglist_data *pgdat, + struct mem_cgroup *memcg, struct scan_control *sc, unsigned long *nr) +{ + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + unsigned long nr_reclaimed = 0; + unsigned long nr_to_scan; + struct blk_plug plug; + enum lru_list lru; + + blk_start_plug(&plug); + + while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_ANON]) { + for (lru = 0; lru <= LRU_ACTIVE_ANON; lru++) { + if (nr[lru]) { + nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); + nr[lru] -= nr_to_scan; + nr_reclaimed += zswapd_shrink_list(lru, + nr_to_scan, lruvec, sc); + } + } + } + + blk_finish_plug(&plug); + sc->nr_reclaimed += nr_reclaimed; +} + +static bool zswapd_shrink_anon(pg_data_t *pgdat, struct scan_control *sc) +{ + const unsigned int percent_constant = 100; + struct mem_cgroup *memcg = NULL; + unsigned long nr[NR_LRU_LISTS]; + + while ((memcg = get_next_memcg(memcg))) { + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + u64 nr_active, nr_inactive, nr_zram, nr_eswap, zram_ratio; + + /* reclaim and try to meet the high buffer watermark */ + if (high_buffer_is_suitable()) { + get_next_memcg_break(memcg); + break; + } + + if (get_memcg_anon_refault_status(memcg)) { + count_vm_event(ZSWAPD_MEMCG_REFAULT_SKIP); + continue; + } + + nr_active = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES); + nr_inactive = lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES); + nr_zram = memcg_data_size(memcg, CACHE_PAGE); + nr_eswap = memcg_data_size(memcg, SWAP_PAGE); + + zram_ratio = (nr_zram + nr_eswap) * percent_constant / + (nr_inactive + nr_active + nr_zram + nr_eswap + 1); + if (zram_ratio >= (u32)atomic_read(&memcg->memcg_reclaimed.ub_mem2zram_ratio)) { + count_vm_event(ZSWAPD_MEMCG_RATIO_SKIP); + continue; + } + + nr[LRU_ACTIVE_ANON] = nr_active >> (unsigned int)sc->priority; + nr[LRU_INACTIVE_ANON] = nr_inactive >> (unsigned int)sc->priority; + nr[LRU_ACTIVE_FILE] = 0; + nr[LRU_INACTIVE_FILE] = 0; + +#ifdef CONFIG_HYPERHOLD_FILE_LRU + zswapd_shrink_anon_memcg(pgdat, memcg, sc, nr); +#else + shrink_lruvec(lruvec, sc); +#endif + shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority); + + if (sc->nr_reclaimed >= sc->nr_to_reclaim) { + get_next_memcg_break(memcg); + break; + } + } + + return sc->nr_scanned >= 
sc->nr_to_reclaim; +} + +static u64 __calc_nr_to_reclaim(void) +{ + unsigned int buffers; + unsigned int high_buffers; + unsigned int max_reclaim_size; + u64 reclaim_size = 0; + + high_buffers = get_high_avail_buffers(); + buffers = calc_sys_cur_avail_buffers(); + max_reclaim_size = get_zswapd_max_reclaim_size(); + if (buffers < high_buffers) + reclaim_size = high_buffers - buffers; + + /* cap a single round's reclaim target at max_reclaim_size */ + reclaim_size = min(reclaim_size, max_reclaim_size); + + /* MB to pages */ + return reclaim_size * SZ_1M / PAGE_SIZE; +} + +static void zswapd_shrink_node(pg_data_t *pgdat) +{ + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .order = 0, + .priority = DEF_PRIORITY / 2, + .may_writepage = !laptop_mode, + .may_unmap = 1, + .may_swap = 1, + .reclaim_idx = MAX_NR_ZONES - 1, + }; + const unsigned int increase_rate = 2; + + do { + unsigned long nr_reclaimed = sc.nr_reclaimed; + bool raise_priority = true; + + /* reclaim and try to meet the high buffer watermark */ + if (high_buffer_is_suitable()) + break; + + sc.nr_scanned = 0; + sc.nr_to_reclaim = __calc_nr_to_reclaim(); + + if (zswapd_shrink_anon(pgdat, &sc)) + raise_priority = false; + count_vm_events(ZSWAPD_SCANNED, sc.nr_scanned); + count_vm_events(ZSWAPD_RECLAIMED, sc.nr_reclaimed); + if (try_to_freeze() || kthread_should_stop()) + break; + + nr_reclaimed = sc.nr_reclaimed - nr_reclaimed; + if (raise_priority || !nr_reclaimed) + sc.priority--; + } while (sc.priority >= 1); + + /* + * On the first empty round, set the skip interval to t. + * If the following round is still empty, double the interval + * to 2t; if rounds keep coming back empty it grows to 4t, 8t, + * and so on, but never beyond max_skip_interval. + * Once a non-empty round occurs, reset the interval to 0.
+ */ + if (sc.nr_reclaimed < get_empty_round_check_threshold()) { + count_vm_event(ZSWAPD_EMPTY_ROUND); + if (last_round_is_empty) + zswapd_skip_interval = min(zswapd_skip_interval * + increase_rate, get_max_skip_interval()); + else + zswapd_skip_interval = get_empty_round_skip_interval(); + last_round_is_empty = true; + } else { + zswapd_skip_interval = 0; + last_round_is_empty = false; + } +} + +u64 zram_watermark_diff(void) +{ + const unsigned int percent_constant = 100; + u64 nr_zram_used; + u64 nr_wm; + u64 ratio; + + ratio = get_zram_current_watermark(); + nr_zram_used = get_zram_used_pages(); + nr_wm = totalram_pages() * ratio / percent_constant; + if (nr_zram_used > nr_wm) + return (nr_zram_used - nr_wm) * PAGE_SIZE + SWAP_MORE_ZRAM; + + return 0; +} + +u64 zswapd_buffer_diff(void) +{ + u64 buffers; + u64 avail; + + buffers = calc_sys_cur_avail_buffers(); + avail = get_high_avail_buffers(); + if (buffers < avail) + return (avail - buffers) * SZ_1M; + + return 0; +} + +u64 get_do_eswap_size(bool refault) +{ + u64 size = 0; + enum zswapd_eswap_policy policy = get_zswapd_eswap_policy(); + + if (policy == CHECK_BUFFER_ZRAMRATIO_BOTH) + size = max(zram_watermark_diff(), zswapd_buffer_diff()); + else if (policy == CHECK_BUFFER_ONLY && (zram_watermark_ok() || refault)) + size = zswapd_buffer_diff(); + + return size; +} + +static int zswapd(void *p) +{ + struct task_struct *tsk = current; + pg_data_t *pgdat = (pg_data_t *)p; + const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); + + /* save zswapd pid for schedule strategy */ + zswapd_pid = tsk->pid; + + if (!cpumask_empty(cpumask)) + set_cpus_allowed_ptr(tsk, cpumask); + + set_freezable(); + + while (!kthread_should_stop()) { + bool refault = false; + u64 size = 0; + + (void)wait_event_freezable(pgdat->zswapd_wait, + atomic_read(&pgdat->zswapd_wait_flag)); + atomic_set(&pgdat->zswapd_wait_flag, 0); + count_vm_event(ZSWAPD_WAKEUP); + zswapd_pressure_report(LEVEL_LOW); + + if (get_area_anon_refault_status()) { + refault = true; + count_vm_event(ZSWAPD_REFAULT); + goto do_eswap; + } + + zswapd_shrink_node(pgdat); + last_zswapd_time = jiffies; + +do_eswap: + size = get_do_eswap_size(refault); + if (size >= SZ_1M) { + count_vm_event(ZSWAPD_SWAPOUT); + size = swapout(size); + } + + if (!buffer_is_suitable()) { + if (free_swap_is_low() || zram_watermark_exceed()) { + zswapd_pressure_report(LEVEL_CRITICAL); + count_vm_event(ZSWAPD_CRITICAL_PRESS); + pr_info("%s:zrampages:%llu, eswappages:%llu\n", __func__, + get_zram_used_pages(), get_eswap_used_pages()); + } else { + zswapd_pressure_report(LEVEL_MEDIUM); + count_vm_event(ZSWAPD_MEDIUM_PRESS); + } + } + } + + return 0; +} + +/* + * This zswapd start function will be called by init and node-hot-add. + */ +int zswapd_run(int nid) +{ + const unsigned int priority_less = 5; + struct sched_param param = { + .sched_priority = MAX_PRIO - priority_less, + }; + pg_data_t *pgdat = NODE_DATA(nid); + + if (pgdat->zswapd) + return 0; + + atomic_set(&pgdat->zswapd_wait_flag, 0); + pgdat->zswapd = kthread_create(zswapd, pgdat, "zswapd%d", nid); + if (IS_ERR(pgdat->zswapd)) { + pr_err("Failed to start zswapd on node %d\n", nid); + return PTR_ERR(pgdat->zswapd); + } + + sched_setscheduler_nocheck(pgdat->zswapd, SCHED_NORMAL, &param); + set_user_nice(pgdat->zswapd, PRIO_TO_NICE(param.sched_priority)); + wake_up_process(pgdat->zswapd); + + return 0; +} + +/* + * Called by memory hotplug when all memory in a node is offlined. Caller must + * hold mem_hotplug_begin/end().
+ */ +void zswapd_stop(int nid) +{ + struct task_struct *zswapd = NODE_DATA(nid)->zswapd; + + if (zswapd) { + kthread_stop(zswapd); + NODE_DATA(nid)->zswapd = NULL; + } + + zswapd_pid = -1; +} + +/* + * It's optimal to keep kswapds on the same CPUs as their memory, but + * not required for correctness. So if the last cpu in a node goes away, + * we get changed to run anywhere: as the first one comes back, restore + * their cpu bindings. + */ +static int zswapd_cpu_online(unsigned int cpu) +{ + int nid; + + for_each_node_state(nid, N_MEMORY) { + pg_data_t *pgdat = NODE_DATA(nid); + const struct cpumask *mask; + + mask = cpumask_of_node(pgdat->node_id); + if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) + /* One of our CPUs online: restore mask */ + set_cpus_allowed_ptr(pgdat->zswapd, mask); + } + + return 0; +} + +static int __init zswapd_init(void) +{ + int nid; + int ret; + + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "mm/zswapd:online", + zswapd_cpu_online, NULL); + if (ret < 0) { + pr_err("zswapd: failed to register hotplug callbacks.\n"); + return ret; + } + + for_each_node_state(nid, N_MEMORY) + zswapd_run(nid); + + return 0; +} +module_init(zswapd_init) diff --git a/mm/zswapd_control.c b/mm/zswapd_control.c new file mode 100644 index 0000000000000000000000000000000000000000..934eff21f09b2a0a1b35a0a6459b0b1b03fe8e18 --- /dev/null +++ b/mm/zswapd_control.c @@ -0,0 +1,878 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * mm/zswapd_control.c + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. + */ + +#include +#include +#include +#include +#include +#include + +#include "zswapd_internal.h" + +#define ANON_REFAULT_SNAPSHOT_MIN_INTERVAL 200 +#define AREA_ANON_REFAULT_THRESHOLD 22000 +#define EMPTY_ROUND_CHECK_THRESHOLD 10 +#define EMPTY_ROUND_SKIP_INTERVAL 20 +#define ZSWAPD_MAX_LEVEL_NUM 10 +#define MAX_SKIP_INTERVAL 1000 +#define MAX_RECLAIM_SIZE 100 + +#define INACTIVE_FILE_RATIO 90 +#define ACTIVE_FILE_RATIO 70 +#define COMPRESS_RATIO 30 +#define ZRAM_WM_RATIO 0 +#define MAX_RATIO 100 + +struct zswapd_param { + unsigned int min_score; + unsigned int max_score; + unsigned int ub_mem2zram_ratio; + unsigned int ub_zram2ufs_ratio; + unsigned int refault_threshold; +}; + +static struct zswapd_param zswap_param[ZSWAPD_MAX_LEVEL_NUM]; +struct eventfd_ctx *zswapd_press_efd[LEVEL_COUNT]; +static DEFINE_MUTEX(pressure_event_lock); +static DEFINE_MUTEX(reclaim_para_lock); + +atomic_t avail_buffers = ATOMIC_INIT(0); +atomic_t min_avail_buffers = ATOMIC_INIT(0); +atomic_t high_avail_buffers = ATOMIC_INIT(0); +atomic_t max_reclaim_size = ATOMIC_INIT(MAX_RECLAIM_SIZE); + +atomic_t inactive_file_ratio = ATOMIC_INIT(INACTIVE_FILE_RATIO); +atomic_t active_file_ratio = ATOMIC_INIT(ACTIVE_FILE_RATIO); +atomic_t zram_wm_ratio = ATOMIC_INIT(ZRAM_WM_RATIO); +atomic_t compress_ratio = ATOMIC_INIT(COMPRESS_RATIO); + +atomic64_t zram_critical_threshold = ATOMIC_LONG_INIT(0); +atomic64_t free_swap_threshold = ATOMIC_LONG_INIT(0); +atomic64_t area_anon_refault_threshold = ATOMIC_LONG_INIT(AREA_ANON_REFAULT_THRESHOLD); +atomic64_t anon_refault_snapshot_min_interval = + ATOMIC_LONG_INIT(ANON_REFAULT_SNAPSHOT_MIN_INTERVAL); +atomic64_t empty_round_skip_interval = ATOMIC_LONG_INIT(EMPTY_ROUND_SKIP_INTERVAL); +atomic64_t max_skip_interval = ATOMIC_LONG_INIT(MAX_SKIP_INTERVAL); +atomic64_t empty_round_check_threshold = ATOMIC_LONG_INIT(EMPTY_ROUND_CHECK_THRESHOLD); + +inline unsigned int get_zram_wm_ratio(void) +{ + return atomic_read(&zram_wm_ratio); +} + +inline unsigned int 
get_compress_ratio(void) +{ + return atomic_read(&compress_ratio); +} + +inline unsigned int get_inactive_file_ratio(void) +{ + return atomic_read(&inactive_file_ratio); +} + +inline unsigned int get_active_file_ratio(void) +{ + return atomic_read(&active_file_ratio); +} + +inline unsigned int get_avail_buffers(void) +{ + return atomic_read(&avail_buffers); +} + +inline unsigned int get_min_avail_buffers(void) +{ + return atomic_read(&min_avail_buffers); +} + +inline unsigned int get_high_avail_buffers(void) +{ + return atomic_read(&high_avail_buffers); +} + +inline unsigned int get_zswapd_max_reclaim_size(void) +{ + return atomic_read(&max_reclaim_size); +} + +inline unsigned long long get_free_swap_threshold(void) +{ + return atomic64_read(&free_swap_threshold); +} + +inline unsigned long long get_area_anon_refault_threshold(void) +{ + return atomic64_read(&area_anon_refault_threshold); +} + +inline unsigned long long get_anon_refault_snapshot_min_interval(void) +{ + return atomic64_read(&anon_refault_snapshot_min_interval); +} + +inline unsigned long long get_empty_round_skip_interval(void) +{ + return atomic64_read(&empty_round_skip_interval); +} + +inline unsigned long long get_max_skip_interval(void) +{ + return atomic64_read(&max_skip_interval); +} + +inline unsigned long long get_empty_round_check_threshold(void) +{ + return atomic64_read(&empty_round_check_threshold); +} + +inline unsigned long long get_zram_critical_threshold(void) +{ + return atomic64_read(&zram_critical_threshold); +} + +static ssize_t avail_buffers_params_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + unsigned long long threshold; + unsigned int high_buffers; + unsigned int min_buffers; + unsigned int buffers; + + buf = strstrip(buf); + + if (sscanf(buf, "%u %u %u %llu", &buffers, &min_buffers, &high_buffers, &threshold) != 4) + return -EINVAL; + + atomic_set(&avail_buffers, buffers); + atomic_set(&min_avail_buffers, min_buffers); + atomic_set(&high_avail_buffers, high_buffers); + atomic64_set(&free_swap_threshold, (threshold * (SZ_1M / PAGE_SIZE))); + + if (atomic_read(&min_avail_buffers) == 0) + set_snapshotd_init_flag(0); + else + set_snapshotd_init_flag(1); + + wake_all_zswapd(); + + return nbytes; +} + +static ssize_t zswapd_max_reclaim_size_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + u32 max; + int ret; + + buf = strstrip(buf); + ret = kstrtouint(buf, 10, &max); + if (ret) + return -EINVAL; + + atomic_set(&max_reclaim_size, max); + + return nbytes; +} + +static ssize_t buffers_ratio_params_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + unsigned int inactive; + unsigned int active; + + buf = strstrip(buf); + + if (sscanf(buf, "%u %u", &inactive, &active) != 2) + return -EINVAL; + + if (inactive > MAX_RATIO || active > MAX_RATIO) + return -EINVAL; + + atomic_set(&inactive_file_ratio, inactive); + atomic_set(&active_file_ratio, active); + + return nbytes; +} + +static int area_anon_refault_threshold_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + atomic64_set(&area_anon_refault_threshold, val); + + return 0; +} + +static int empty_round_skip_interval_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + atomic64_set(&empty_round_skip_interval, val); + + return 0; +} + +static int max_skip_interval_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + atomic64_set(&max_skip_interval, val); + + return 0; +} + +static int 
empty_round_check_threshold_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + atomic64_set(&empty_round_check_threshold, val); + + return 0; +} + +static int anon_refault_snapshot_min_interval_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + atomic64_set(&anon_refault_snapshot_min_interval, val); + + return 0; +} + +static int zram_critical_thres_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + atomic64_set(&zram_critical_threshold, val); + + return 0; +} + +static ssize_t zswapd_pressure_event_control(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + unsigned int level; + unsigned int efd; + struct fd efile; + int ret; + + buf = strstrip(buf); + if (sscanf(buf, "%u %u", &efd, &level) != 2) + return -EINVAL; + + if (level >= LEVEL_COUNT) + return -EINVAL; + + mutex_lock(&pressure_event_lock); + efile = fdget(efd); + if (!efile.file) { + ret = -EBADF; + goto out; + } + + zswapd_press_efd[level] = eventfd_ctx_fileget(efile.file); + if (IS_ERR(zswapd_press_efd[level])) { + ret = PTR_ERR(zswapd_press_efd[level]); + goto out_put_efile; + } + fdput(efile); + mutex_unlock(&pressure_event_lock); + return nbytes; + +out_put_efile: + fdput(efile); +out: + mutex_unlock(&pressure_event_lock); + + return ret; +} + +void zswapd_pressure_report(enum zswapd_pressure_level level) +{ + int ret; + + if (zswapd_press_efd[level] == NULL) + return; + + ret = eventfd_signal(zswapd_press_efd[level], 1); + if (ret < 0) + pr_err("SWAP-MM: %s : level:%u, ret:%d ", __func__, level, ret); +} + +static u64 zswapd_pid_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + return get_zswapd_pid(); +} + +static void zswapd_memcgs_param_parse(int level_num) +{ + struct mem_cgroup *memcg = NULL; + u64 score; + int i; + + while ((memcg = get_next_memcg(memcg))) { + score = atomic64_read(&memcg->memcg_reclaimed.app_score); + for (i = 0; i < level_num; ++i) + if (score >= zswap_param[i].min_score && + score <= zswap_param[i].max_score) + break; + + atomic_set(&memcg->memcg_reclaimed.ub_mem2zram_ratio, + zswap_param[i].ub_mem2zram_ratio); + atomic_set(&memcg->memcg_reclaimed.ub_zram2ufs_ratio, + zswap_param[i].ub_zram2ufs_ratio); + atomic_set(&memcg->memcg_reclaimed.refault_threshold, + zswap_param[i].refault_threshold); + } +} + +static ssize_t zswapd_memcgs_param_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + char *token = NULL; + int level_num; + int i; + + buf = strstrip(buf); + token = strsep(&buf, " "); + + if (!token) + return -EINVAL; + + if (kstrtoint(token, 0, &level_num)) + return -EINVAL; + + if (level_num > ZSWAPD_MAX_LEVEL_NUM) + return -EINVAL; + + mutex_lock(&reclaim_para_lock); + for (i = 0; i < level_num; ++i) { + token = strsep(&buf, " "); + if (!token) + goto out; + + if (kstrtoint(token, 0, &zswap_param[i].min_score) || + zswap_param[i].min_score > MAX_APP_SCORE) + goto out; + + token = strsep(&buf, " "); + if (!token) + goto out; + + if (kstrtoint(token, 0, &zswap_param[i].max_score) || + zswap_param[i].max_score > MAX_APP_SCORE) + goto out; + + token = strsep(&buf, " "); + if (!token) + goto out; + + if (kstrtoint(token, 0, &zswap_param[i].ub_mem2zram_ratio) || + zswap_param[i].ub_mem2zram_ratio > MAX_RATIO) + goto out; + + token = strsep(&buf, " "); + if (!token) + goto out; + + if (kstrtoint(token, 0, &zswap_param[i].ub_zram2ufs_ratio) || + zswap_param[i].ub_zram2ufs_ratio > MAX_RATIO) + goto out; + + token = strsep(&buf, " "); + if (!token) + goto out; + + if 
(kstrtoint(token, 0, &zswap_param[i].refault_threshold)) + goto out; + } + + zswapd_memcgs_param_parse(level_num); + mutex_unlock(&reclaim_para_lock); + + return nbytes; + +out: + mutex_unlock(&reclaim_para_lock); + return -EINVAL; +} + +static ssize_t zswapd_single_memcg_param_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned int ub_mem2zram_ratio; + unsigned int ub_zram2ufs_ratio; + unsigned int refault_threshold; + + buf = strstrip(buf); + + if (sscanf(buf, "%u %u %u", &ub_mem2zram_ratio, &ub_zram2ufs_ratio, + &refault_threshold) != 3) + return -EINVAL; + + if (ub_mem2zram_ratio > MAX_RATIO || ub_zram2ufs_ratio > MAX_RATIO) + return -EINVAL; + + atomic_set(&memcg->memcg_reclaimed.ub_mem2zram_ratio, + ub_mem2zram_ratio); + atomic_set(&memcg->memcg_reclaimed.ub_zram2ufs_ratio, + ub_zram2ufs_ratio); + atomic_set(&memcg->memcg_reclaimed.refault_threshold, + refault_threshold); + + return nbytes; +} + +static ssize_t mem_cgroup_zram_wm_ratio_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + unsigned int ratio; + int ret; + + buf = strstrip(buf); + + ret = kstrtouint(buf, 10, &ratio); + if (ret) + return -EINVAL; + + if (ratio > MAX_RATIO) + return -EINVAL; + + atomic_set(&zram_wm_ratio, ratio); + + return nbytes; +} + +static ssize_t mem_cgroup_compress_ratio_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + unsigned int ratio; + int ret; + + buf = strstrip(buf); + + ret = kstrtouint(buf, 10, &ratio); + if (ret) + return -EINVAL; + + if (ratio > MAX_RATIO) + return -EINVAL; + + atomic_set(&compress_ratio, ratio); + + return nbytes; +} + +static int zswapd_pressure_show(struct seq_file *m, void *v) +{ + zswapd_status_show(m); + + return 0; +} + +static int memcg_active_app_info_list_show(struct seq_file *m, void *v) +{ + struct mem_cgroup_per_node *mz = NULL; + struct mem_cgroup *memcg = NULL; + struct lruvec *lruvec = NULL; + unsigned long eswap_size; + unsigned long anon_size; + unsigned long zram_size; + + while ((memcg = get_next_memcg(memcg))) { + u64 score = atomic64_read(&memcg->memcg_reclaimed.app_score); + + mz = mem_cgroup_nodeinfo(memcg, 0); + if (!mz) { + get_next_memcg_break(memcg); + return 0; + } + + lruvec = &mz->lruvec; + if (!lruvec) { + get_next_memcg_break(memcg); + return 0; + } + + anon_size = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, + MAX_NR_ZONES) + lruvec_lru_size(lruvec, + LRU_INACTIVE_ANON, MAX_NR_ZONES); + eswap_size = memcg_data_size(memcg, SWAP_SIZE); + zram_size = memcg_data_size(memcg, CACHE_SIZE); + + if (anon_size + zram_size + eswap_size == 0) + continue; + + if (!strlen(memcg->name)) + continue; + + anon_size *= PAGE_SIZE / SZ_1K; + zram_size *= PAGE_SIZE / SZ_1K; + eswap_size *= PAGE_SIZE / SZ_1K; + + seq_printf(m, "%s %llu %lu %lu %lu %llu\n", memcg->name, score, + anon_size, zram_size, eswap_size, + memcg->memcg_reclaimed.reclaimed_pagefault); + } + return 0; +} + +static int report_app_info_show(struct seq_file *m, void *v) +{ + struct mem_cgroup_per_node *mz = NULL; + struct mem_cgroup *memcg = NULL; + struct lruvec *lruvec = NULL; + unsigned long eswap_size; + unsigned long zram_size; + unsigned long anon_size; + + while ((memcg = get_next_memcg(memcg))) { + u64 score = atomic64_read(&memcg->memcg_reclaimed.app_score); + + mz = mem_cgroup_nodeinfo(memcg, 0); + if (!mz) { + get_next_memcg_break(memcg); + return 0; + } + + lruvec = &mz->lruvec; + if (!lruvec) { + get_next_memcg_break(memcg); + 
return 0; + } + + anon_size = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, + MAX_NR_ZONES) + lruvec_lru_size(lruvec, + LRU_INACTIVE_ANON, MAX_NR_ZONES); + eswap_size = memcg_data_size(memcg, SWAP_SIZE); + zram_size = memcg_data_size(memcg, CACHE_SIZE); + + if (anon_size + zram_size + eswap_size == 0) + continue; + + anon_size *= PAGE_SIZE / SZ_1K; + zram_size *= PAGE_SIZE / SZ_1K; + eswap_size *= PAGE_SIZE / SZ_1K; + + seq_printf(m, "%s, %llu, %lu, %lu, %lu\n", + strlen(memcg->name) ? memcg->name : "root", + score, anon_size, zram_size, eswap_size); + } + return 0; +} + +#ifdef CONFIG_HYPERHOLD_DEBUG +static int avail_buffers_params_show(struct seq_file *m, void *v) +{ + seq_printf(m, "avail_buffers: %u\n", atomic_read(&avail_buffers)); + seq_printf(m, "min_avail_buffers: %u\n", atomic_read(&min_avail_buffers)); + seq_printf(m, "high_avail_buffers: %u\n", atomic_read(&high_avail_buffers)); + seq_printf(m, "free_swap_threshold: %llu\n", + atomic64_read(&free_swap_threshold) * PAGE_SIZE / SZ_1M); + + return 0; +} + +static int zswapd_max_reclaim_size_show(struct seq_file *m, void *v) +{ + seq_printf(m, "zswapd_max_reclaim_size: %u\n", + atomic_read(&max_reclaim_size)); + + return 0; +} + +static int buffers_ratio_params_show(struct seq_file *m, void *v) +{ + seq_printf(m, "inactive_file_ratio: %u\n", atomic_read(&inactive_file_ratio)); + seq_printf(m, "active_file_ratio: %u\n", atomic_read(&active_file_ratio)); + + return 0; +} + +static u64 area_anon_refault_threshold_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return atomic64_read(&area_anon_refault_threshold); +} + +static u64 empty_round_skip_interval_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return atomic64_read(&empty_round_skip_interval); +} + +static u64 max_skip_interval_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return atomic64_read(&max_skip_interval); +} + +static u64 empty_round_check_threshold_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return atomic64_read(&empty_round_check_threshold); +} + +static u64 anon_refault_snapshot_min_interval_read( + struct cgroup_subsys_state *css, struct cftype *cft) +{ + return atomic64_read(&anon_refault_snapshot_min_interval); +} + +static u64 zram_critical_threshold_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return atomic64_read(&zram_critical_threshold); +} + +static int zswapd_memcgs_param_show(struct seq_file *m, void *v) +{ + int i; + + for (i = 0; i < ZSWAPD_MAX_LEVEL_NUM; ++i) { + seq_printf(m, "level %d min score: %u\n", i, + zswap_param[i].min_score); + seq_printf(m, "level %d max score: %u\n", i, + zswap_param[i].max_score); + seq_printf(m, "level %d ub_mem2zram_ratio: %u\n", i, + zswap_param[i].ub_mem2zram_ratio); + seq_printf(m, "level %d ub_zram2ufs_ratio: %u\n", i, + zswap_param[i].ub_zram2ufs_ratio); + seq_printf(m, "level %d refault_threshold: %u\n", i, + zswap_param[i].refault_threshold); + } + + return 0; +} + +static int zswapd_single_memcg_param_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + + seq_printf(m, "memcg score: %llu\n", + atomic64_read(&memcg->memcg_reclaimed.app_score)); + seq_printf(m, "memcg ub_mem2zram_ratio: %u\n", + atomic_read(&memcg->memcg_reclaimed.ub_mem2zram_ratio)); + seq_printf(m, "memcg ub_zram2ufs_ratio: %u\n", + atomic_read(&memcg->memcg_reclaimed.ub_zram2ufs_ratio)); + seq_printf(m, "memcg refault_threshold: %u\n", + atomic_read(&memcg->memcg_reclaimed.refault_threshold)); + + return 0; +} + 
+static int zram_wm_ratio_show(struct seq_file *m, void *v) +{ + seq_printf(m, "zram_wm_ratio: %u\n", atomic_read(&zram_wm_ratio)); + + return 0; +} + +static int compress_ratio_show(struct seq_file *m, void *v) +{ + seq_printf(m, "compress_ratio: %u\n", atomic_read(&compress_ratio)); + + return 0; +} +static int zswapd_vmstat_show(struct seq_file *m, void *v) +{ +#ifdef CONFIG_VM_EVENT_COUNTERS + unsigned long *vm_buf = NULL; + + vm_buf = kzalloc(sizeof(struct vm_event_state), GFP_KERNEL); + if (!vm_buf) + return -ENOMEM; + all_vm_events(vm_buf); + + seq_printf(m, "zswapd_wake_up:%lu\n", vm_buf[ZSWAPD_WAKEUP]); + seq_printf(m, "zswapd_area_refault:%lu\n", vm_buf[ZSWAPD_REFAULT]); + seq_printf(m, "zswapd_medium_press:%lu\n", vm_buf[ZSWAPD_MEDIUM_PRESS]); + seq_printf(m, "zswapd_critical_press:%lu\n", vm_buf[ZSWAPD_CRITICAL_PRESS]); + seq_printf(m, "zswapd_memcg_ratio_skip:%lu\n", vm_buf[ZSWAPD_MEMCG_RATIO_SKIP]); + seq_printf(m, "zswapd_memcg_refault_skip:%lu\n", vm_buf[ZSWAPD_MEMCG_REFAULT_SKIP]); + seq_printf(m, "zswapd_swapout:%lu\n", vm_buf[ZSWAPD_SWAPOUT]); + seq_printf(m, "zswapd_snapshot_times:%lu\n", vm_buf[ZSWAPD_SNAPSHOT_TIMES]); + seq_printf(m, "zswapd_reclaimed:%lu\n", vm_buf[ZSWAPD_RECLAIMED]); + seq_printf(m, "zswapd_scanned:%lu\n", vm_buf[ZSWAPD_SCANNED]); + + kfree(vm_buf); +#endif + + return 0; +} + +void memcg_eswap_info_show(struct seq_file *m) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + struct mem_cgroup_per_node *mz = NULL; + struct lruvec *lruvec = NULL; + unsigned long anon; + unsigned long file; + unsigned long zram; + unsigned long eswap; + + mz = mem_cgroup_nodeinfo(memcg, 0); + if (!mz) + return; + + lruvec = &mz->lruvec; + if (!lruvec) + return; + + anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) + + lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES); + file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) + + lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES); + zram = memcg_data_size(memcg, CACHE_SIZE) / SZ_1K; + eswap = memcg_data_size(memcg, SWAP_SIZE) / SZ_1K; + anon *= PAGE_SIZE / SZ_1K; + file *= PAGE_SIZE / SZ_1K; + seq_printf(m, "Anon:\t%12lu kB\nFile:\t%12lu kB\nzram:\t%12lu kB\nEswap:\t%12lu kB\n", + anon, file, zram, eswap); +} +#endif + +static struct cftype zswapd_policy_files[] = { + { + .name = "active_app_info_list", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = memcg_active_app_info_list_show, + }, + { + .name = "zram_wm_ratio", + .flags = CFTYPE_ONLY_ON_ROOT, + .write = mem_cgroup_zram_wm_ratio_write, +#ifdef CONFIG_HYPERHOLD_DEBUG + .seq_show = zram_wm_ratio_show, +#endif + }, + { + .name = "compress_ratio", + .flags = CFTYPE_ONLY_ON_ROOT, + .write = mem_cgroup_compress_ratio_write, +#ifdef CONFIG_HYPERHOLD_DEBUG + .seq_show = compress_ratio_show, +#endif + }, + { + .name = "zswapd_pressure", + .flags = CFTYPE_ONLY_ON_ROOT, + .write = zswapd_pressure_event_control, + }, + { + .name = "zswapd_pid", + .flags = CFTYPE_ONLY_ON_ROOT, + .read_u64 = zswapd_pid_read, + }, + { + .name = "avail_buffers", + .flags = CFTYPE_ONLY_ON_ROOT, + .write = avail_buffers_params_write, +#ifdef CONFIG_HYPERHOLD_DEBUG + .seq_show = avail_buffers_params_show, +#endif + }, + { + .name = "zswapd_max_reclaim_size", + .flags = CFTYPE_ONLY_ON_ROOT, + .write = zswapd_max_reclaim_size_write, +#ifdef CONFIG_HYPERHOLD_DEBUG + .seq_show = zswapd_max_reclaim_size_show, +#endif + }, + { + .name = "area_anon_refault_threshold", + .flags = CFTYPE_ONLY_ON_ROOT, + .write_u64 = area_anon_refault_threshold_write, +#ifdef 
CONFIG_HYPERHOLD_DEBUG + .read_u64 = area_anon_refault_threshold_read, +#endif + }, + { + .name = "empty_round_skip_interval", + .flags = CFTYPE_ONLY_ON_ROOT, + .write_u64 = empty_round_skip_interval_write, +#ifdef CONFIG_HYPERHOLD_DEBUG + .read_u64 = empty_round_skip_interval_read, +#endif + }, + { + .name = "max_skip_interval", + .flags = CFTYPE_ONLY_ON_ROOT, + .write_u64 = max_skip_interval_write, +#ifdef CONFIG_HYPERHOLD_DEBUG + .read_u64 = max_skip_interval_read, +#endif + }, + { + .name = "empty_round_check_threshold", + .flags = CFTYPE_ONLY_ON_ROOT, + .write_u64 = empty_round_check_threshold_write, +#ifdef CONFIG_HYPERHOLD_DEBUG + .read_u64 = empty_round_check_threshold_read, +#endif + }, + { + .name = "anon_refault_snapshot_min_interval", + .flags = CFTYPE_ONLY_ON_ROOT, + .write_u64 = anon_refault_snapshot_min_interval_write, +#ifdef CONFIG_HYPERHOLD_DEBUG + .read_u64 = anon_refault_snapshot_min_interval_read, +#endif + }, + { + .name = "zswapd_memcgs_param", + .flags = CFTYPE_ONLY_ON_ROOT, + .write = zswapd_memcgs_param_write, +#ifdef CONFIG_HYPERHOLD_DEBUG + .seq_show = zswapd_memcgs_param_show, +#endif + }, + { + .name = "zswapd_single_memcg_param", + .write = zswapd_single_memcg_param_write, +#ifdef CONFIG_HYPERHOLD_DEBUG + .seq_show = zswapd_single_memcg_param_show, +#endif + }, + { + .name = "buffer_ratio_params", + .flags = CFTYPE_ONLY_ON_ROOT, + .write = buffers_ratio_params_write, +#ifdef CONFIG_HYPERHOLD_DEBUG + .seq_show = buffers_ratio_params_show, +#endif + }, + { + .name = "zswapd_pressure_show", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = zswapd_pressure_show, + }, + { + .name = "zram_critical_threshold", + .flags = CFTYPE_ONLY_ON_ROOT, + .write_u64 = zram_critical_thres_write, +#ifdef CONFIG_HYPERHOLD_DEBUG + .read_u64 = zram_critical_threshold_read, +#endif + }, + +#ifdef CONFIG_HYPERHOLD_DEBUG + { + .name = "zswapd_vmstat_show", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = zswapd_vmstat_show, + }, +#endif + + { }, /* terminate */ +}; + +static int __init zswapd_policy_init(void) +{ + if (!mem_cgroup_disabled()) + WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, zswapd_policy_files)); + + return 0; +} +subsys_initcall(zswapd_policy_init); diff --git a/mm/zswapd_internal.h b/mm/zswapd_internal.h new file mode 100644 index 0000000000000000000000000000000000000000..1447882ae49725663a160ed2d7a106690dd67e9b --- /dev/null +++ b/mm/zswapd_internal.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * mm/zswapd_internal.h + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. 
+ */ + +#ifndef _ZSWAPD_INTERNAL_H +#define _ZSWAPD_INTERNAL_H + +enum zswapd_pressure_level { + LEVEL_LOW = 0, + LEVEL_MEDIUM, + LEVEL_CRITICAL, + LEVEL_COUNT +}; + +enum zswapd_eswap_policy { + CHECK_BUFFER_ONLY = 0, + CHECK_BUFFER_ZRAMRATIO_BOTH +}; + +void zswapd_pressure_report(enum zswapd_pressure_level level); +inline unsigned int get_zram_wm_ratio(void); +inline unsigned int get_compress_ratio(void); +inline unsigned int get_avail_buffers(void); +inline unsigned int get_min_avail_buffers(void); +inline unsigned int get_high_avail_buffers(void); +inline unsigned int get_zswapd_max_reclaim_size(void); +inline unsigned int get_inactive_file_ratio(void); +inline unsigned int get_active_file_ratio(void); +inline unsigned long long get_area_anon_refault_threshold(void); +inline unsigned long long get_anon_refault_snapshot_min_interval(void); +inline unsigned long long get_empty_round_skip_interval(void); +inline unsigned long long get_max_skip_interval(void); +inline unsigned long long get_empty_round_check_threshold(void); +inline unsigned long long get_zram_critical_threshold(void); +u64 memcg_data_size(struct mem_cgroup *memcg, int type); +u64 swapin_memcg(struct mem_cgroup *memcg, u64 req_size); + +#endif /* _ZSWAPD_INTERNAL_H */
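
For reference, a minimal userspace sketch (not part of the patch) of how the memory.zswapd_pressure interface added above could be consumed. Only the "<eventfd> <level>" write format, the LEVEL_LOW/MEDIUM/CRITICAL numbering (0/1/2), and the fact that zswapd_pressure_report() signals the registered eventfd with 1 come from the code above; the cgroup v1 memory mount point used here (/sys/fs/cgroup/memory) is an assumption and may differ between systems.

/* hypothetical listener for zswapd LEVEL_CRITICAL (2) pressure events */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/eventfd.h>

int main(void)
{
	char cmd[64];
	uint64_t count;
	int efd = eventfd(0, 0);
	/* root-level control file created by the "zswapd_pressure" cftype */
	int cfd = open("/sys/fs/cgroup/memory/memory.zswapd_pressure", O_WRONLY);

	if (efd < 0 || cfd < 0)
		return 1;

	/* register "<eventfd> <level>", matching zswapd_pressure_event_control() */
	snprintf(cmd, sizeof(cmd), "%d 2", efd);
	if (write(cfd, cmd, strlen(cmd)) < 0)
		return 1;

	/* each zswapd_pressure_report(LEVEL_CRITICAL) adds 1 to the eventfd counter */
	while (read(efd, &count, sizeof(count)) == sizeof(count))
		printf("zswapd critical pressure (%llu events)\n",
		       (unsigned long long)count);

	return 0;
}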