diff --git a/drivers/Kconfig b/drivers/Kconfig index 4f1149db289853a1121d1a31c753fdd1ac5a47c0..199d56f5c3ddc61638edd5faf680f720b7ea4a43 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -11,6 +11,8 @@ source "drivers/pcmcia/Kconfig" source "drivers/rapidio/Kconfig" +source "drivers/hyperhold/Kconfig" + source "drivers/base/Kconfig" source "drivers/bus/Kconfig" diff --git a/drivers/Makefile b/drivers/Makefile index 71129b9f75b2b6e675a1c950a94629e1df9e0d89..6b899d76afc813a3ef3585e67817f3eef2866201 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -73,6 +73,9 @@ obj-$(CONFIG_CONNECTOR) += connector/ obj-$(CONFIG_FB_I810) += video/fbdev/i810/ obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ +# Hyperhold driver +obj-$(CONFIG_HYPERHOLD) += hyperhold/ + obj-$(CONFIG_PARPORT) += parport/ obj-y += base/ block/ misc/ mfd/ nfc/ obj-$(CONFIG_LIBNVDIMM) += nvdimm/ diff --git a/drivers/accesstokenid/access_tokenid.c b/drivers/accesstokenid/access_tokenid.c index 33a61ef163b3e8833cf265892fc20b934150a9e3..c69a7fbd2d119b06450940adb71966c053f95f65 100755 --- a/drivers/accesstokenid/access_tokenid.c +++ b/drivers/accesstokenid/access_tokenid.c @@ -186,6 +186,7 @@ static int add_node_to_tree(struct token_perm_node *root_node, struct token_perm static struct token_perm_node *remove_node_by_token(struct token_perm_node *root_node, uint32_t token) { + struct token_perm_node **new_node_addr = NULL; struct token_perm_node *target_node = NULL; struct token_perm_node *parent_node = NULL; find_node_by_token(root_node, token, &target_node, &parent_node); @@ -194,7 +195,6 @@ static struct token_perm_node *remove_node_by_token(struct token_perm_node *root return NULL; } - struct token_perm_node **new_node_addr = NULL; if (parent_node == NULL) { new_node_addr = &root_node; } else if (parent_node->perm_data.token > token) { @@ -304,12 +304,15 @@ int access_tokenid_get_permission(struct file *file, void __user *uarg) struct token_perm_node *parent_node = NULL; read_lock(&token_rwlock); find_node_by_token(g_token_perm_root, get_perm_data.token, &target_node, &parent_node); - read_unlock(&token_rwlock); - if (target_node == NULL) + if (target_node == NULL) { + read_unlock(&token_rwlock); return -ENODATA; + } uint32_t bit_idx = get_perm_data.op_code % UINT32_T_BITS; - return (target_node->perm_data.perm[idx] & ((uint32_t)0x01 << bit_idx)) >> bit_idx; + int ret = (target_node->perm_data.perm[idx] & ((uint32_t)0x01 << bit_idx)) >> bit_idx; + read_unlock(&token_rwlock); + return ret; } typedef int (*access_token_id_func)(struct file *file, void __user *arg); diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig index 0386b7da02aa3ba46d187358d5fe3a0302b97a8d..6326e4a1462efc2338a24f45f962dd7e1f68f95e 100644 --- a/drivers/block/zram/Kconfig +++ b/drivers/block/zram/Kconfig @@ -87,3 +87,5 @@ config ZRAM_MULTI_COMP re-compress pages using a potentially slower but more effective compression algorithm. Note, that IDLE page recompression requires ZRAM_MEMORY_TRACKING. 
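A note on the access_tokenid.c hunk above before the zram Kconfig hook continues: the change deliberately moves read_unlock() to after the last dereference of target_node, so the permission word is read while the rwlock still protects the node against a concurrent remove_node_by_token(). The following is only a minimal sketch of that pattern; the demo_* names and demo_lookup() helper are illustrative stand-ins for the driver's find_node_by_token(), not code from this patch.

#include <linux/errno.h>
#include <linux/spinlock.h>
#include <linux/types.h>

static DEFINE_RWLOCK(demo_rwlock);

struct demo_node {
	u32 token;
	u32 perm;
};

/* hypothetical stand-in for find_node_by_token(); returns NULL if absent */
struct demo_node *demo_lookup(u32 token);

static int demo_get_perm_bit(u32 token, u32 bit_idx)
{
	struct demo_node *node;
	int ret;

	read_lock(&demo_rwlock);
	node = demo_lookup(token);
	if (!node) {
		read_unlock(&demo_rwlock);
		return -ENODATA;
	}
	/* dereference while the read lock is still held, then drop it */
	ret = (node->perm >> bit_idx) & 0x1;
	read_unlock(&demo_rwlock);

	return ret;
}

With the unlock placed before the dereference, a writer holding write_lock could free the node in the window between lookup and use; keeping the read lock across the access closes that window, which is exactly what the hunk does.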
+ +source "drivers/block/zram/zram_group/Kconfig" diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile index de9e457907b1e9834937df323413bd11d18f5d5c..a8947f7faa980f96ce88ee9ae1d8278761175435 100644 --- a/drivers/block/zram/Makefile +++ b/drivers/block/zram/Makefile @@ -1,4 +1,9 @@ # SPDX-License-Identifier: GPL-2.0-only zram-y := zcomp.o zram_drv.o +zram-$(CONFIG_ZRAM_GROUP) += zram_group/zram_group.o zram_group/zlist.o zram_group/group_writeback.o + obj-$(CONFIG_ZRAM) += zram.o + +ccflags-$(CONFIG_ZRAM_GROUP) += -I$(srctree)/drivers/block/zram/zram_group/ +ccflags-$(CONFIG_HYPERHOLD) += -I$(srctree)/drivers/hyperhold/ diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index aa490da3cef233409e2b85db33a7f8c88d3cba29..604c7bc2bff36a2d0b01f91af88e861e4680e263 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -34,6 +34,10 @@ #include #include +#ifdef CONFIG_ZRAM_GROUP +#include +#endif + #include "zram_drv.h" static DEFINE_IDR(zram_index_idr); @@ -58,21 +62,6 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, u32 index, int offset, struct bio *bio); -static int zram_slot_trylock(struct zram *zram, u32 index) -{ - return bit_spin_trylock(ZRAM_LOCK, &zram->table[index].flags); -} - -static void zram_slot_lock(struct zram *zram, u32 index) -{ - bit_spin_lock(ZRAM_LOCK, &zram->table[index].flags); -} - -static void zram_slot_unlock(struct zram *zram, u32 index) -{ - bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags); -} - static inline bool init_done(struct zram *zram) { return zram->disksize; @@ -83,35 +72,6 @@ static inline struct zram *dev_to_zram(struct device *dev) return (struct zram *)dev_to_disk(dev)->private_data; } -static unsigned long zram_get_handle(struct zram *zram, u32 index) -{ - return zram->table[index].handle; -} - -static void zram_set_handle(struct zram *zram, u32 index, unsigned long handle) -{ - zram->table[index].handle = handle; -} - -/* flag operations require table entry bit_spin_lock() being held */ -static bool zram_test_flag(struct zram *zram, u32 index, - enum zram_pageflags flag) -{ - return zram->table[index].flags & BIT(flag); -} - -static void zram_set_flag(struct zram *zram, u32 index, - enum zram_pageflags flag) -{ - zram->table[index].flags |= BIT(flag); -} - -static void zram_clear_flag(struct zram *zram, u32 index, - enum zram_pageflags flag) -{ - zram->table[index].flags &= ~BIT(flag); -} - static inline void zram_set_element(struct zram *zram, u32 index, unsigned long element) { @@ -123,19 +83,6 @@ static unsigned long zram_get_element(struct zram *zram, u32 index) return zram->table[index].element; } -static size_t zram_get_obj_size(struct zram *zram, u32 index) -{ - return zram->table[index].flags & (BIT(ZRAM_FLAG_SHIFT) - 1); -} - -static void zram_set_obj_size(struct zram *zram, - u32 index, size_t size) -{ - unsigned long flags = zram->table[index].flags >> ZRAM_FLAG_SHIFT; - - zram->table[index].flags = (flags << ZRAM_FLAG_SHIFT) | size; -} - static inline bool zram_allocated(struct zram *zram, u32 index) { return zram_get_obj_size(zram, index) || @@ -643,9 +590,6 @@ static int read_from_bdev_async(struct zram *zram, struct bio_vec *bvec, return 1; } -#define PAGE_WB_SIG "page_index=" - -#define PAGE_WRITEBACK 0 #define HUGE_WRITEBACK (1<<0) #define IDLE_WRITEBACK (1<<1) #define INCOMPRESSIBLE_WRITEBACK (1<<2) @@ -671,17 +615,8 @@ static ssize_t writeback_store(struct device *dev, mode = IDLE_WRITEBACK | HUGE_WRITEBACK; else if (sysfs_streq(buf, 
"incompressible")) mode = INCOMPRESSIBLE_WRITEBACK; - else { - if (strncmp(buf, PAGE_WB_SIG, sizeof(PAGE_WB_SIG) - 1)) - return -EINVAL; - - if (kstrtol(buf + sizeof(PAGE_WB_SIG) - 1, 10, &index) || - index >= nr_pages) - return -EINVAL; - - nr_pages = 1; - mode = PAGE_WRITEBACK; - } + else + return -EINVAL; down_read(&zram->init_lock); if (!init_done(zram)) { @@ -700,7 +635,7 @@ static ssize_t writeback_store(struct device *dev, goto release_init_lock; } - for (; nr_pages != 0; index++, nr_pages--) { + for (index = 0; index < nr_pages; index++) { struct bio_vec bvec; bvec_set_page(&bvec, page, PAGE_SIZE, 0); @@ -1281,6 +1216,66 @@ static DEVICE_ATTR_RO(bd_stat); #endif static DEVICE_ATTR_RO(debug_stat); +#ifdef CONFIG_ZRAM_GROUP +static ssize_t group_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct zram *zram = dev_to_zram(dev); + int ret = 0; + + down_read(&zram->init_lock); + if (zram->zgrp_ctrl == ZGRP_NONE) + ret = snprintf(buf, PAGE_SIZE - 1, "disable\n"); + else if (zram->zgrp_ctrl == ZGRP_TRACK) + ret = snprintf(buf, PAGE_SIZE - 1, "readonly\n"); +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + else if (zram->zgrp_ctrl == ZGRP_WRITE) + ret = snprintf(buf, PAGE_SIZE - 1, "readwrite\n"); +#endif + up_read(&zram->init_lock); + + return ret; +} + +static ssize_t group_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct zram *zram = dev_to_zram(dev); + int ret; +#ifdef CONFIG_ZRAM_GROUP_DEBUG + u32 op, gid, index; + + ret = sscanf(buf, "%u %u %u", &op, &index, &gid); + if (ret == 3) { + pr_info("op[%u] index[%u] gid[%u].\n", op, index, gid); + group_debug(zram, op, index, gid); + return len; + } +#endif + + ret = len; + down_write(&zram->init_lock); + if (init_done(zram)) { + pr_info("Can't setup group ctrl for initialized device!\n"); + ret = -EBUSY; + goto out; + } + if (!strcmp(buf, "disable\n")) + zram->zgrp_ctrl = ZGRP_NONE; + else if (!strcmp(buf, "readonly\n")) + zram->zgrp_ctrl = ZGRP_TRACK; +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + else if (!strcmp(buf, "readwrite\n")) + zram->zgrp_ctrl = ZGRP_WRITE; +#endif + else + ret = -EINVAL; +out: + up_write(&zram->init_lock); + + return ret; +} +#endif + static void zram_meta_free(struct zram *zram, u64 disksize) { size_t num_pages = disksize >> PAGE_SHIFT; @@ -1292,6 +1287,9 @@ static void zram_meta_free(struct zram *zram, u64 disksize) zs_destroy_pool(zram->mem_pool); vfree(zram->table); +#ifdef CONFIG_ZRAM_GROUP + zram_group_deinit(zram); +#endif } static bool zram_meta_alloc(struct zram *zram, u64 disksize) @@ -1311,6 +1309,10 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) if (!huge_class_size) huge_class_size = zs_huge_class_size(zram->mem_pool); +#ifdef CONFIG_ZRAM_GROUP + zram_group_init(zram, num_pages); +#endif + return true; } @@ -1323,6 +1325,10 @@ static void zram_free_page(struct zram *zram, size_t index) { unsigned long handle; +#ifdef CONFIG_ZRAM_GROUP + zram_group_untrack_obj(zram, index); +#endif + #ifdef CONFIG_ZRAM_MEMORY_TRACKING zram->table[index].ac_time = 0; #endif @@ -1440,6 +1446,20 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index, int ret; zram_slot_lock(zram, index); +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + if (!parent) { + ret = zram_group_fault_obj(zram, index); + if (ret) { + zram_slot_unlock(zram, index); + return ret; + } + } + + if (zram_test_flag(zram, index, ZRAM_GWB)) { + zram_slot_unlock(zram, index); + return -EIO; + } +#endif if (!zram_test_flag(zram, index, ZRAM_WB)) { /* Slot should be 
locked through out the function call */ ret = zram_read_from_zspool(zram, page, index); @@ -1610,6 +1630,9 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, zram_set_handle(zram, index, handle); zram_set_obj_size(zram, index, comp_len); } +#ifdef CONFIG_ZRAM_GROUP + zram_group_track_obj(zram, index, page_memcg(page)); +#endif zram_slot_unlock(zram, index); /* Update stats */ @@ -2246,6 +2269,9 @@ static DEVICE_ATTR_RW(writeback_limit_enable); static DEVICE_ATTR_RW(recomp_algorithm); static DEVICE_ATTR_WO(recompress); #endif +#ifdef CONFIG_ZRAM_GROUP +static DEVICE_ATTR_RW(group); +#endif static struct attribute *zram_disk_attrs[] = { &dev_attr_disksize.attr, @@ -2272,6 +2298,9 @@ static struct attribute *zram_disk_attrs[] = { #ifdef CONFIG_ZRAM_MULTI_COMP &dev_attr_recomp_algorithm.attr, &dev_attr_recompress.attr, +#endif +#ifdef CONFIG_ZRAM_GROUP + &dev_attr_group.attr, #endif NULL, }; diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index c5254626f051faebcf06453242d8711467b68981..782ac75b32c537eb694a3da6c4144036dc8caa5b 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -21,6 +21,10 @@ #include "zcomp.h" +#ifdef CONFIG_ZRAM_GROUP +#include "zram_group.h" +#endif + #define SECTORS_PER_PAGE_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) #define SECTORS_PER_PAGE (1 << SECTORS_PER_PAGE_SHIFT) #define ZRAM_LOGICAL_BLOCK_SHIFT 12 @@ -38,7 +42,15 @@ * * We use BUILD_BUG_ON() to make sure that zram pageflags don't overflow. */ +#ifdef CONFIG_ZRAM_GROUP +/* reserve 16 bits for group id */ +#define ZRAM_SIZE_SHIFT 24 +#define ZRAM_GRPID_SHIFT 16 +#define ZRAM_GRPID_MASK (((1UL << ZRAM_GRPID_SHIFT) - 1) << ZRAM_SIZE_SHIFT) +#define ZRAM_FLAG_SHIFT (ZRAM_SIZE_SHIFT + ZRAM_GRPID_SHIFT) +#else #define ZRAM_FLAG_SHIFT (PAGE_SHIFT + 1) +#endif /* Only 2 bits are allowed for comp priority index */ #define ZRAM_COMP_PRIORITY_MASK 0x3 @@ -52,6 +64,10 @@ enum zram_pageflags { ZRAM_UNDER_WB, /* page is under writeback */ ZRAM_HUGE, /* Incompressible page */ ZRAM_IDLE, /* not accessed page since last idle marking */ +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + ZRAM_GWB, /* obj is group writeback*/ + ZRAM_FAULT, /* obj is needed by a pagefault req */ +#endif ZRAM_INCOMPRESSIBLE, /* none of the algorithms could compress it */ ZRAM_COMP_PRIORITY_BIT1, /* First bit of comp priority index */ @@ -106,6 +122,10 @@ struct zram_stats { struct zram { struct zram_table_entry *table; +#ifdef CONFIG_ZRAM_GROUP + struct zram_group *zgrp; + unsigned int zgrp_ctrl; +#endif struct zs_pool *mem_pool; struct zcomp *comps[ZRAM_MAX_COMPS]; struct gendisk *disk; @@ -141,4 +161,86 @@ struct zram { struct dentry *debugfs_dir; #endif }; + +static inline int zram_slot_trylock(struct zram *zram, u32 index) +{ + return bit_spin_trylock(ZRAM_LOCK, &zram->table[index].flags); +} + +static inline void zram_slot_lock(struct zram *zram, u32 index) +{ + bit_spin_lock(ZRAM_LOCK, &zram->table[index].flags); +} + +static inline void zram_slot_unlock(struct zram *zram, u32 index) +{ + bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags); +} + +static inline unsigned long zram_get_handle(struct zram *zram, u32 index) +{ + return zram->table[index].handle; +} + +static inline void zram_set_handle(struct zram *zram, u32 index, unsigned long handle) +{ + zram->table[index].handle = handle; +} + +/* flag operations require table entry bit_spin_lock() being held */ +static inline bool zram_test_flag(struct zram *zram, u32 index, + enum zram_pageflags flag) +{ + return zram->table[index].flags 
& BIT(flag); +} + +static inline void zram_set_flag(struct zram *zram, u32 index, + enum zram_pageflags flag) +{ + zram->table[index].flags |= BIT(flag); +} + +static inline void zram_clear_flag(struct zram *zram, u32 index, + enum zram_pageflags flag) +{ + zram->table[index].flags &= ~BIT(flag); +} +#ifdef CONFIG_ZRAM_GROUP +static inline size_t zram_get_obj_size(struct zram *zram, u32 index) +{ + return zram->table[index].flags & (BIT(ZRAM_SIZE_SHIFT) - 1); +} + +static inline void zram_set_obj_size(struct zram *zram, u32 index, size_t size) +{ + unsigned long flags = zram->table[index].flags >> ZRAM_SIZE_SHIFT; + + zram->table[index].flags = (flags << ZRAM_SIZE_SHIFT) | size; +} + +void zram_group_init(struct zram *zram, u32 nr_obj); +void zram_group_deinit(struct zram *zram); +void zram_group_track_obj(struct zram *zram, u32 index, struct mem_cgroup *memcg); +void zram_group_untrack_obj(struct zram *zram, u32 index); +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK +int zram_group_fault_obj(struct zram *zram, u32 index); +#endif + +#ifdef CONFIG_ZRAM_GROUP_DEBUG +void group_debug(struct zram *zram, u32 op, u32 index, u32 gid); +#endif + +#else +static inline size_t zram_get_obj_size(struct zram *zram, u32 index) +{ + return zram->table[index].flags & (BIT(ZRAM_FLAG_SHIFT) - 1); +} + +static inline void zram_set_obj_size(struct zram *zram, u32 index, size_t size) +{ + unsigned long flags = zram->table[index].flags >> ZRAM_FLAG_SHIFT; + + zram->table[index].flags = (flags << ZRAM_FLAG_SHIFT) | size; +} +#endif #endif diff --git a/drivers/block/zram/zram_group/Kconfig b/drivers/block/zram/zram_group/Kconfig new file mode 100644 index 0000000000000000000000000000000000000000..0eacf79fb2594db32641d6997e463061c8da7880 --- /dev/null +++ b/drivers/block/zram/zram_group/Kconfig @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: GPL-2.0 +config ZRAM_GROUP + bool "Manage Zram objs with mem_cgroup" + depends on ZRAM && MEMCG + help + Manage Zram objs with mem_cgroup. + +config ZRAM_GROUP_DEBUG + bool "Debug info for zram group" + depends on ZRAM_GROUP + help + Debug info for ZRAM_GROUP. + +config ZLIST_DEBUG + bool "Debug info for zram group list" + depends on ZRAM_GROUP + help + Debug info for zram group list. + +config ZRAM_GROUP_WRITEBACK + bool "Write back grouped zram objs to Hyperhold driver" + depends on ZRAM_GROUP && HYPERHOLD + help + Write back grouped zram objs to hyperhold. diff --git a/drivers/block/zram/zram_group/group_writeback.c b/drivers/block/zram/zram_group/group_writeback.c new file mode 100644 index 0000000000000000000000000000000000000000..0956a2eb939a2b312fcc00a7a8325e99eedb24c7 --- /dev/null +++ b/drivers/block/zram/zram_group/group_writeback.c @@ -0,0 +1,735 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * drivers/block/zram/zram_group/group_writeback.c + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. + */ + +#include +#include +#include +#include + +#include "../zram_drv.h" +#include "zram_group.h" + +#ifdef CONFIG_HYPERHOLD +#include "hyperhold.h" +#endif + +#define CHECK(cond, ...) 
((cond) || (pr_err(__VA_ARGS__), false)) +#define CHECK_BOUND(var, min, max) \ + CHECK((var) >= (min) && (var) <= (max), \ + "%s %u out of bounds %u ~ %u!\n", \ + #var, (var), (min), (max)) + +static u16 zram_get_memcg_id(struct zram *zram, u32 index) +{ + return (zram->table[index].flags & ZRAM_GRPID_MASK) >> ZRAM_SIZE_SHIFT; +} + +static void zram_set_memcg_id(struct zram *zram, u32 index, u16 gid) +{ + unsigned long old = zram->table[index].flags & (~ZRAM_GRPID_MASK); + + zram->table[index].flags = old | ((u64)gid << ZRAM_SIZE_SHIFT); +} + +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK +static bool obj_can_wb(struct zram *zram, u32 index, u16 gid) +{ + /* overwrited obj, just skip */ + if (zram_get_memcg_id(zram, index) != gid) { + pr_debug("obj %u is from group %u instead of group %u.\n", + index, zram_get_memcg_id(zram, index), gid); + return false; + } + if (!zgrp_obj_is_isolated(zram->zgrp, index)) { + pr_debug("obj %u is not isolated.\n", index); + return false; + } + /* need not to writeback, put back the obj as HOTEST */ + if (zram_test_flag(zram, index, ZRAM_SAME)) { + pr_debug("obj %u is filled with same element.\n", index); + goto insert; + } + if (zram_test_flag(zram, index, ZRAM_WB)) { + pr_debug("obj %u is writeback.\n", index); + goto insert; + } + /* obj is needed by a pagefault req, do not writeback it. */ + if (zram_test_flag(zram, index, ZRAM_FAULT)) { + pr_debug("obj %u is needed by a pagefault request.\n", index); + goto insert; + } + /* should never happen */ + if (zram_test_flag(zram, index, ZRAM_GWB)) { + pr_debug("obj %u is group writeback.\n", index); + BUG(); + return false; + } + + return true; +insert: + zgrp_obj_insert(zram->zgrp, index, gid); + + return false; +} + +static void copy_obj(struct hpio *hpio, u32 offset, char *obj, u32 size, bool to) +{ + u32 page_id, start; + char *buf = NULL; + + page_id = offset / PAGE_SIZE; + start = offset % PAGE_SIZE; + if (size + start <= PAGE_SIZE) { + buf = page_to_virt(hyperhold_io_page(hpio, page_id)); + if (to) + memcpy(buf + start, obj, size); + else + memcpy(obj, buf + start, size); + + return; + } + buf = page_to_virt(hyperhold_io_page(hpio, page_id)); + if (to) + memcpy(buf + start, obj, PAGE_SIZE - start); + else + memcpy(obj, buf + start, PAGE_SIZE - start); + buf = page_to_virt(hyperhold_io_page(hpio, page_id + 1)); + if (to) + memcpy(buf, obj + PAGE_SIZE - start, size + start - PAGE_SIZE); + else + memcpy(obj + PAGE_SIZE - start, buf, size + start - PAGE_SIZE); +} + +static u32 move_obj_to_hpio(struct zram *zram, u32 index, u16 gid, + struct hpio *hpio, u32 offset) +{ + u32 size = 0; + unsigned long handle; + char *src = NULL; + u32 ext_size; + u32 eid; + + eid = hyperhold_io_extent(hpio); + ext_size = hyperhold_extent_size(eid); + + zram_slot_lock(zram, index); + if (!obj_can_wb(zram, index, gid)) + goto unlock; + size = zram_get_obj_size(zram, index); + /* no space, put back the obj as COLDEST */ + if (size + offset > ext_size) { + pr_debug("obj %u size is %u, but ext %u only %u space left.\n", + index, size, eid, ext_size - offset); + zgrp_obj_putback(zram->zgrp, index, gid); + size = 0; + goto unlock; + } + handle = zram_get_handle(zram, index); + src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO); + copy_obj(hpio, offset, src, size, true); + zs_unmap_object(zram->mem_pool, handle); + zs_free(zram->mem_pool, handle); + zram_set_handle(zram, index, hyperhold_address(eid, offset)); + zram_set_flag(zram, index, ZRAM_GWB); + wbgrp_obj_insert(zram->zgrp, index, eid); + wbgrp_obj_stats_inc(zram->zgrp, gid, eid, size); 
+ zgrp_obj_stats_dec(zram->zgrp, gid, size); + pr_debug("move obj %u of group %u to hpio %p of eid %u, size = %u, offset = %u\n", + index, gid, hpio, eid, size, offset); +unlock: + zram_slot_unlock(zram, index); + + return size; +} + +static void move_obj_from_hpio(struct zram *zram, int index, struct hpio *hpio) +{ + u32 size = 0; + unsigned long handle = 0; + u32 eid, offset; + u64 addr; + char *dst = NULL; + u16 gid; + + eid = hyperhold_io_extent(hpio); +retry: + zram_slot_lock(zram, index); + if (!zram_test_flag(zram, index, ZRAM_GWB)) + goto unlock; + addr = zram_get_handle(zram, index); + if (hyperhold_addr_extent(addr) != eid) + goto unlock; + size = zram_get_obj_size(zram, index); + if (handle) + goto move; + handle = zs_malloc(zram->mem_pool, size, GFP_NOWAIT); + if (handle) + goto move; + zram_slot_unlock(zram, index); + handle = zs_malloc(zram->mem_pool, size, GFP_NOIO | __GFP_NOFAIL); + if (handle) + goto retry; + BUG(); + + return; +move: + offset = hyperhold_addr_offset(addr); + dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO); + copy_obj(hpio, offset, dst, size, false); + zs_unmap_object(zram->mem_pool, handle); + zram_set_handle(zram, index, handle); + zram_clear_flag(zram, index, ZRAM_GWB); + gid = zram_get_memcg_id(zram, index); + zgrp_obj_insert(zram->zgrp, index, gid); + wbgrp_obj_stats_dec(zram->zgrp, gid, eid, size); + zgrp_obj_stats_inc(zram->zgrp, gid, size); + pr_debug("move obj %u of group %u from hpio %p of eid %u, size = %u, offset = %u\n", + index, gid, hpio, eid, size, offset); +unlock: + zram_slot_unlock(zram, index); +} + + +#define NR_ISOLATE 32 +static bool move_extent_from_hpio(struct zram *zram, struct hpio *hpio) +{ + u32 idxs[NR_ISOLATE]; + u32 eid; + u32 nr; + int i; + bool last = false; + + eid = hyperhold_io_extent(hpio); +repeat: + nr = wbgrp_isolate_objs(zram->zgrp, eid, idxs, NR_ISOLATE, &last); + for (i = 0; i < nr; i++) + move_obj_from_hpio(zram, idxs[i], hpio); + if (last) + return true; + if (nr) + goto repeat; + + return false; +} + +struct hpio_priv { + struct zram *zram; + u16 gid; +}; + +static void write_endio(struct hpio *hpio) +{ + struct hpio_priv *priv = hyperhold_io_private(hpio); + struct zram *zram = priv->zram; + u16 gid = priv->gid; + u32 eid = hyperhold_io_extent(hpio); + + if (hyperhold_io_success(hpio)) + goto out; + if (move_extent_from_hpio(zram, hpio)) { + zgrp_ext_delete(zram->zgrp, eid, gid); + hyperhold_should_free_extent(eid); + } +out: + hyperhold_io_complete(hpio); + hyperhold_io_put(hpio); + kfree(priv); +} + +static u32 collect_objs(struct zram *zram, u16 gid, struct hpio *hpio, u32 ext_size) +{ + u32 offset = 0; + u32 last_offset; + u32 nr; + u32 idxs[NR_ISOLATE]; + int i; + +more: + last_offset = offset; + nr = zgrp_isolate_objs(zram->zgrp, gid, idxs, NR_ISOLATE, NULL); + for (i = 0; i < nr; i++) + offset += move_obj_to_hpio(zram, idxs[i], gid, hpio, offset); + pr_debug("%u data attached, offset = %u.\n", offset - last_offset, offset); + if (offset < ext_size && offset != last_offset) + goto more; + + return offset; +} + +static u64 write_one_extent(struct zram *zram, u16 gid) +{ + int eid; + struct hpio *hpio = NULL; + struct hpio_priv *priv = NULL; + u32 size = 0; + int ret; + + priv = kmalloc(sizeof(struct hpio_priv), GFP_NOIO); + if (!priv) + return 0; + priv->gid = gid; + priv->zram = zram; + eid = hyperhold_alloc_extent(); + if (eid < 0) + goto err; + hpio = hyperhold_io_get(eid, GFP_NOIO, REQ_OP_WRITE); + if (!hpio) + goto free_extent; + + zgrp_get_ext(zram->zgrp, eid); + size = collect_objs(zram, 
gid, hpio, hyperhold_extent_size(eid)); + if (size == 0) { + pr_err("group %u has no data in zram.\n", gid); + zgrp_put_ext(zram->zgrp, eid); + goto put_hpio; + } + zgrp_ext_insert(zram->zgrp, eid, gid); + if (zgrp_put_ext(zram->zgrp, eid)) { + zgrp_ext_delete(zram->zgrp, eid, gid); + hyperhold_should_free_extent(eid); + } + + ret = hyperhold_write_async(hpio, write_endio, priv); + if (ret) + goto move_back; + + return size; +move_back: + if (move_extent_from_hpio(zram, hpio)) { + zgrp_ext_delete(zram->zgrp, eid, gid); + hyperhold_should_free_extent(eid); + } + eid = -EINVAL; +put_hpio: + hyperhold_io_put(hpio); +free_extent: + if (eid >= 0) + hyperhold_free_extent(eid); +err: + kfree(priv); + + return 0; +} + +static void read_endio(struct hpio *hpio) +{ + struct hpio_priv *priv = hyperhold_io_private(hpio); + struct zram *zram = priv->zram; + u16 gid = priv->gid; + u32 eid = hyperhold_io_extent(hpio); + + if (!hyperhold_io_success(hpio)) { + BUG(); + goto out; + } + if (move_extent_from_hpio(zram, hpio)) { + zgrp_ext_delete(zram->zgrp, eid, gid); + hyperhold_should_free_extent(eid); + } +out: + hyperhold_io_complete(hpio); + hyperhold_io_put(hpio); + kfree(priv); +} + +static u64 read_one_extent(struct zram *zram, u32 eid, u16 gid) +{ + struct hpio *hpio = NULL; + u32 ext_size = 0; + int ret; + struct hpio_priv *priv = NULL; + + priv = kmalloc(sizeof(struct hpio_priv), GFP_NOIO); + if (!priv) + goto err; + priv->gid = gid; + priv->zram = zram; + hpio = hyperhold_io_get(eid, GFP_NOIO, REQ_OP_READ); + if (!hpio) + goto err; + ext_size = hyperhold_extent_size(eid); + ret = hyperhold_read_async(hpio, read_endio, priv); + if (ret) + goto err; + + return ext_size; +err: + hyperhold_io_put(hpio); + kfree(priv); + + return 0; +} + +static void sync_read_endio(struct hpio *hpio) +{ + hyperhold_io_complete(hpio); +} + +static int read_one_obj_sync(struct zram *zram, u32 index) +{ + struct hpio *hpio = NULL; + int ret; + u32 eid; + u16 gid; + u32 size; + + if (!zram_test_flag(zram, index, ZRAM_GWB)) + return 0; + + pr_debug("read obj %u.\n", index); + + gid = zram_get_memcg_id(zram, index); + eid = hyperhold_addr_extent(zram_get_handle(zram, index)); + size = zram_get_obj_size(zram, index); + wbgrp_fault_stats_inc(zram->zgrp, gid, eid, size); +check: + if (!zram_test_flag(zram, index, ZRAM_GWB)) + return 0; + if (!zram_test_flag(zram, index, ZRAM_FAULT)) + goto read; + zram_slot_unlock(zram, index); + wait_event(zram->zgrp->wbgrp.fault_wq, !zram_test_flag(zram, index, ZRAM_FAULT)); + zram_slot_lock(zram, index); + goto check; +read: + zram_set_flag(zram, index, ZRAM_FAULT); + zram_slot_unlock(zram, index); + + hpio = hyperhold_io_get(eid, GFP_NOIO, REQ_OP_READ); + if (!hpio) { + ret = -ENOMEM; + goto out; + } + ret = hyperhold_read_async(hpio, sync_read_endio, NULL); + /* io submit error */ + if (ret && ret != -EAGAIN) + goto out; + + hyperhold_io_wait(hpio); + + /* if not reset to zero, will return err sometimes and cause SIG_BUS error */ + ret = 0; + + /* get a write io, data is ready, copy the pages even write failed */ + if (op_is_write(hyperhold_io_operate(hpio))) + goto move; + /* read io failed, return -EIO */ + if (!hyperhold_io_success(hpio)) { + ret = -EIO; + goto out; + } + /* success, copy the data and free extent */ +move: + if (move_extent_from_hpio(zram, hpio)) { + zgrp_ext_delete(zram->zgrp, eid, gid); + hyperhold_should_free_extent(eid); + } + move_obj_from_hpio(zram, index, hpio); +out: + hyperhold_io_put(hpio); + zram_slot_lock(zram, index); + zram_clear_flag(zram, index, 
ZRAM_FAULT); + wake_up(&zram->zgrp->wbgrp.fault_wq); + + return ret; +} + +u64 read_group_objs(struct zram *zram, u16 gid, u64 req_size) +{ + u32 eid; + u64 read_size = 0; + u32 nr; + + if (!(zram->zgrp)) { + pr_debug("zram group is not enable!\n"); + return 0; + } + if (!CHECK_BOUND(gid, 1, zram->zgrp->nr_grp - 1)) + return 0; + + pr_debug("read %llu data of group %u.\n", req_size, gid); + + while (!req_size || req_size > read_size) { + nr = zgrp_isolate_exts(zram->zgrp, gid, &eid, 1, NULL); + if (!nr) + break; + read_size += read_one_extent(zram, eid, gid); + } + + return read_size; +} + +u64 write_group_objs(struct zram *zram, u16 gid, u64 req_size) +{ + u64 write_size = 0; + u64 size = 0; + + if (!(zram->zgrp)) { + pr_debug("zram group is not enable!\n"); + return 0; + } + if (!CHECK(zram->zgrp->wbgrp.enable, "zram group writeback is not enable!\n")) + return 0; + if (!CHECK_BOUND(gid, 1, zram->zgrp->nr_grp - 1)) + return 0; + + pr_debug("write %llu data of group %u.\n", req_size, gid); + + while (!req_size || req_size > write_size) { + size = write_one_extent(zram, gid); + if (!size) + break; + write_size += size; + } + + atomic64_add(write_size, &zram->zgrp->stats[0].write_size); + atomic64_add(write_size, &zram->zgrp->stats[gid].write_size); + return write_size; +} +#endif + +#ifdef CONFIG_ZRAM_GROUP_DEBUG +#include +#define ZGRP_TEST_MAX_GRP 101 +#endif + +int zram_group_fault_obj(struct zram *zram, u32 index) +{ + u16 gid; + u32 size; + + if (!(zram->zgrp)) { + pr_debug("zram group is not enable!\n"); + return 0; + } + if (!CHECK_BOUND(index, 0, zram->zgrp->nr_obj - 1)) + return 0; + + gid = zram_get_memcg_id(zram, index); + size = zram_get_obj_size(zram, index); + zgrp_fault_stats_inc(zram->zgrp, gid, size); +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + return read_one_obj_sync(zram, index); +#else + return 0; +#endif +} + +void zram_group_track_obj(struct zram *zram, u32 index, struct mem_cgroup *memcg) +{ + u16 gid; + + if (!(zram->zgrp)) { + pr_debug("zram group is not enable!\n"); + return; + } + if (!CHECK_BOUND(index, 0, zram->zgrp->nr_obj - 1)) + return; + if (!CHECK(memcg || !memcg->id.id, "obj %u has no memcg!\n", index)) + return; + gid = zram_get_memcg_id(zram, index); + if (!CHECK(!gid, "obj %u has gid %u.\n", index, gid)) + BUG(); + + gid = memcg->id.id; + zram_set_memcg_id(zram, index, gid); + zgrp_obj_insert(zram->zgrp, index, gid); + zgrp_obj_stats_inc(zram->zgrp, gid, zram_get_obj_size(zram, index)); +} + +void zram_group_untrack_obj(struct zram *zram, u32 index) +{ + u16 gid; + u32 size; + + if (!(zram->zgrp)) { + pr_debug("zram group is not enable!\n"); + return; + } + if (!CHECK_BOUND(index, 0, zram->zgrp->nr_obj - 1)) + return; + +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK +check: + if (!zram_test_flag(zram, index, ZRAM_FAULT)) + goto clear; + zram_slot_unlock(zram, index); + wait_event(zram->zgrp->wbgrp.fault_wq, !zram_test_flag(zram, index, ZRAM_FAULT)); + zram_slot_lock(zram, index); + goto check; +clear: +#endif + gid = zram_get_memcg_id(zram, index); + size = zram_get_obj_size(zram, index); + if (!gid) + return; +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + if (zram_test_flag(zram, index, ZRAM_GWB)) { + u32 eid = hyperhold_addr_extent(zram_get_handle(zram, index)); + + if (wbgrp_obj_delete(zram->zgrp, index, eid)) { + zgrp_ext_delete(zram->zgrp, eid, gid); + hyperhold_should_free_extent(eid); + } + zram_clear_flag(zram, index, ZRAM_GWB); + zram_set_memcg_id(zram, index, 0); + wbgrp_obj_stats_dec(zram->zgrp, gid, eid, size); + zram_set_handle(zram, index, 0); + return; + } 
+#endif + zgrp_obj_delete(zram->zgrp, index, gid); + zram_set_memcg_id(zram, index, 0); + zgrp_obj_stats_dec(zram->zgrp, gid, size); +} + +#ifdef CONFIG_ZRAM_GROUP_DEBUG +void group_debug(struct zram *zram, u32 op, u32 index, u32 gid) +{ + if (op == 0) + zram_group_dump(zram->zgrp, gid, index); + +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + if (op == 22) + read_group_objs(zram, gid, index); + if (op == 23) + write_group_objs(zram, gid, index); + if (op == 20) { + if (index) + zram_group_apply_writeback(zram->zgrp, hyperhold_nr_extent()); + else + zram_group_remove_writeback(zram->zgrp); + } +#endif +} +#endif + +static u64 group_obj_stats(struct zram *zram, u16 gid, int type) +{ + if (!(zram->zgrp)) { + pr_debug("zram group is not enable!\n"); + return 0; + } + if (!CHECK_BOUND(gid, 0, zram->zgrp->nr_grp - 1)) + return 0; + + if (type == CACHE_SIZE) + return atomic64_read(&zram->zgrp->stats[gid].zram_size); + else if (type == CACHE_PAGE) + return atomic_read(&zram->zgrp->stats[gid].zram_pages); + else if (type == CACHE_FAULT) + return atomic64_read(&zram->zgrp->stats[gid].zram_fault); +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + else if (type == SWAP_SIZE) + return atomic64_read(&zram->zgrp->stats[gid].wb_size); + else if (type == SWAP_PAGE) + return atomic_read(&zram->zgrp->stats[gid].wb_pages); + else if (type == READ_SIZE) + return atomic64_read(&zram->zgrp->stats[gid].read_size); + else if (type == WRITE_SIZE) + return atomic64_read(&zram->zgrp->stats[gid].write_size); + else if (type == SWAP_FAULT) + return atomic64_read(&zram->zgrp->stats[gid].wb_fault); + BUG(); +#endif + + return 0; +} + +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK +static u64 zram_group_read(u16 gid, u64 req_size, void *priv) +{ + if (!CHECK(priv, "priv is NULL!\n")) + return 0; + + return read_group_objs((struct zram *)priv, gid, req_size); +} + +static u64 zram_group_write(u16 gid, u64 req_size, void *priv) +{ + if (!CHECK(priv, "priv is NULL!\n")) + return 0; + + return write_group_objs((struct zram *)priv, gid, req_size); +} +#else +static u64 zram_group_read(u16 gid, u64 req_size, void *priv) +{ + return 0; +} +static u64 zram_group_write(u16 gid, u64 req_size, void *priv) +{ + return 0; +} +#endif + + +static u64 zram_group_data_size(u16 gid, int type, void *priv) +{ + if (!CHECK(priv, "priv is NULL!\n")) + return 0; + + return group_obj_stats((struct zram *)priv, gid, type); +} + +struct group_swap_ops zram_group_ops = { + .group_read = zram_group_read, + .group_write = zram_group_write, + .group_data_size = zram_group_data_size, +}; + +static int register_zram_group(struct zram *zram) +{ + if (!CHECK(zram, "zram is NULL!\n")) + return -EINVAL; + if (!(zram->zgrp)) { + pr_debug("zram group is not enable!\n"); + return -EINVAL; + } + + zram->zgrp->gsdev = register_group_swap(&zram_group_ops, zram); + if (!zram->zgrp->gsdev) { + pr_err("register zram group failed!\n"); + return -ENOMEM; + } + + return 0; +} + +static void unregister_zram_group(struct zram *zram) +{ + if (!CHECK(zram, "zram is NULL!\n")) + return; + if (!(zram->zgrp)) { + pr_debug("zram group is not enable!\n"); + return; + } + + unregister_group_swap(zram->zgrp->gsdev); + zram->zgrp->gsdev = NULL; +} + +void zram_group_init(struct zram *zram, u32 nr_obj) +{ + unsigned int ctrl = zram->zgrp_ctrl; + + if (ctrl == ZGRP_NONE) + return; + zram->zgrp = zram_group_meta_alloc(nr_obj, ZGRP_MAX_GRP - 1); +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + if (ctrl == ZGRP_WRITE) + zram_group_apply_writeback(zram->zgrp, hyperhold_nr_extent()); +#endif + register_zram_group(zram); +} + 
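register_zram_group() above wires zram_group_ops into the backing side through register_group_swap(); the consumer of those callbacks (the hyperhold driver) is not part of this hunk. The fragment below is therefore only a hedged sketch of how a caller could drive the ops table: the struct layout is copied from zram_group_ops in this patch, while demo_reclaim_group(), its priv handling and the 32 MB budget are assumptions for illustration.

#include <linux/types.h>

struct group_swap_ops {
	u64 (*group_read)(u16 gid, u64 req_size, void *priv);
	u64 (*group_write)(u16 gid, u64 req_size, void *priv);
	u64 (*group_data_size)(u16 gid, int type, void *priv);
};

/* hypothetical caller on the backing-device side */
static u64 demo_reclaim_group(const struct group_swap_ops *ops, void *priv,
			      u16 gid)
{
	/*
	 * write_group_objs() above treats req_size == 0 as "write back
	 * everything", so a caller passes a byte budget to stay bounded.
	 * The return value is the number of bytes actually written back.
	 */
	return ops->group_write(gid, 32ULL << 20, priv);
}

In this patch priv is the struct zram pointer passed to register_group_swap(), so the wrappers zram_group_read()/zram_group_write() simply cast it back before calling read_group_objs()/write_group_objs().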
+void zram_group_deinit(struct zram *zram) +{ + unregister_zram_group(zram); + zram_group_meta_free(zram->zgrp); + zram->zgrp = NULL; +} diff --git a/drivers/block/zram/zram_group/zlist.c b/drivers/block/zram/zram_group/zlist.c new file mode 100644 index 0000000000000000000000000000000000000000..fd8295ecadaacb27312f7bde75cc48dd9940f54e --- /dev/null +++ b/drivers/block/zram/zram_group/zlist.c @@ -0,0 +1,235 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * drivers/block/zram/zram_group/zlist.c + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. + */ + +#define pr_fmt(fmt) "[ZLIST]" fmt + +#include +#include +#include + +#include "zlist.h" + +#define assert(expr) \ + do { \ + if (expr) \ + break; \ + pr_err("assertion [%s] failed: in func<%s> at %s:%d\n", \ + #expr, __func__, __FILE__, __LINE__); \ + BUG(); \ + } while (0) + +static inline void zlist_node_lock(struct zlist_node *node) +{ + bit_spin_lock(ZLIST_LOCK_BIT, (unsigned long *)node); +} + +static inline void zlist_node_unlock(struct zlist_node *node) +{ + bit_spin_unlock(ZLIST_LOCK_BIT, (unsigned long *)node); +} + +#ifdef CONFIG_ZLIST_DEBUG +static inline void zlist_before_add_check(struct zlist_table *tab, + struct zlist_node *prev, struct zlist_node *node, + struct zlist_node *next) +{ + assert(idx2node(prev->next, tab) == next); + assert(idx2node(next->prev, tab) == prev); + assert(idx2node(node->prev, tab) == node); + assert(idx2node(node->next, tab) == node); +} + +static inline void zlist_after_add_check(struct zlist_table *tab, + struct zlist_node *prev, struct zlist_node *node, + struct zlist_node *next) +{ + assert(idx2node(prev->next, tab) == node); + assert(idx2node(next->prev, tab) == node); + assert(idx2node(node->prev, tab) == prev); + assert(idx2node(node->next, tab) == next); +} + +static inline void zlist_before_del_check(struct zlist_table *tab, + struct zlist_node *prev, struct zlist_node *node, + struct zlist_node *next) +{ + assert(idx2node(prev->next, tab) == node); + assert(idx2node(next->prev, tab) == node); + assert(idx2node(node->prev, tab) == prev); + assert(idx2node(node->next, tab) == next); +} + +static inline void zlist_after_del_check(struct zlist_table *tab, + struct zlist_node *prev, struct zlist_node *node, + struct zlist_node *next) +{ + assert(idx2node(prev->next, tab) == next); + assert(idx2node(next->prev, tab) == prev); + assert(idx2node(node->prev, tab) == node); + assert(idx2node(node->next, tab) == node); +} +#else +static inline void zlist_before_add_check(struct zlist_table *tab, + struct zlist_node *prev, struct zlist_node *node, + struct zlist_node *next) {}; +static inline void zlist_after_add_check(struct zlist_table *tab, + struct zlist_node *prev, struct zlist_node *node, + struct zlist_node *next) {}; +static inline void zlist_before_del_check(struct zlist_table *tab, + struct zlist_node *prev, struct zlist_node *node, + struct zlist_node *next) {}; +static inline void zlist_after_del_check(struct zlist_table *tab, + struct zlist_node *prev, struct zlist_node *node, + struct zlist_node *next) {}; +#endif + +struct zlist_table *zlist_table_alloc(struct zlist_node *(*i2n)(u32, void*), + void *private, gfp_t gfp) +{ + struct zlist_table *tab = kmalloc(sizeof(struct zlist_table), gfp); + + if (!tab) + return NULL; + tab->idx2node = i2n; + tab->private = private; + + return tab; +} + +void zlist_lock(u32 idx, struct zlist_table *tab) +{ + zlist_node_lock(idx2node(idx, tab)); +} + +void zlist_unlock(u32 idx, struct zlist_table *tab) +{ + zlist_node_unlock(idx2node(idx, tab)); 
+} + +void zlist_add_nolock(u32 hid, u32 idx, struct zlist_table *tab) +{ + struct zlist_node *node = idx2node(idx, tab); + struct zlist_node *head = idx2node(hid, tab); + u32 nid = head->next; + struct zlist_node *next = idx2node(nid, tab); + + zlist_before_add_check(tab, head, node, next); + if (idx != hid) + zlist_node_lock(node); + node->prev = hid; + node->next = nid; + if (idx != hid) + zlist_node_unlock(node); + head->next = idx; + if (nid != hid) + zlist_node_lock(next); + next->prev = idx; + if (nid != hid) + zlist_node_unlock(next); + zlist_after_add_check(tab, head, node, next); +} + +void zlist_add_tail_nolock(u32 hid, u32 idx, struct zlist_table *tab) +{ + struct zlist_node *node = idx2node(idx, tab); + struct zlist_node *head = idx2node(hid, tab); + u32 tid = head->prev; + struct zlist_node *tail = idx2node(tid, tab); + + zlist_before_add_check(tab, tail, node, head); + if (idx != hid) + zlist_node_lock(node); + node->prev = tid; + node->next = hid; + if (idx != hid) + zlist_node_unlock(node); + head->prev = idx; + if (tid != hid) + zlist_node_lock(tail); + tail->next = idx; + if (tid != hid) + zlist_node_unlock(tail); + zlist_after_add_check(tab, tail, node, head); +} + +bool zlist_del_nolock(u32 hid, u32 idx, struct zlist_table *tab) +{ + struct zlist_node *node = idx2node(idx, tab); + u32 pid = node->prev; + u32 nid = node->next; + struct zlist_node *prev = idx2node(pid, tab); + struct zlist_node *next = idx2node(nid, tab); + + zlist_before_del_check(tab, prev, node, next); + if (idx != hid) + zlist_node_lock(node); + node->prev = idx; + node->next = idx; + if (idx != hid) + zlist_node_unlock(node); + if (pid != hid) + zlist_node_lock(prev); + prev->next = nid; + if (pid != hid) + zlist_node_unlock(prev); + if (nid != hid) + zlist_node_lock(next); + next->prev = pid; + if (nid != hid) + zlist_node_unlock(next); + zlist_after_del_check(tab, prev, node, next); + + return zlist_is_isolated_nolock(hid, tab); +} + +bool zlist_is_isolated_nolock(u32 idx, struct zlist_table *tab) +{ + struct zlist_node *node = idx2node(idx, tab); + + return (node->prev == idx) && (node->next == idx); +} + +bool zlist_set_priv(u32 idx, struct zlist_table *tab) +{ + struct zlist_node *node = idx2node(idx, tab); + bool ret = false; + + zlist_node_lock(node); + ret = !test_and_set_bit(ZLIST_PRIV_BIT, (unsigned long *)node); + zlist_node_unlock(node); + + return ret; +} + +bool zlist_clr_priv_nolock(u32 idx, struct zlist_table *tab) +{ + struct zlist_node *node = idx2node(idx, tab); + bool ret = false; + + ret = !test_and_clear_bit(ZLIST_PRIV_BIT, (unsigned long *)node); + + return ret; +} + +bool zlist_test_priv_nolock(u32 idx, struct zlist_table *tab) +{ + struct zlist_node *node = idx2node(idx, tab); + bool ret = false; + + ret = test_bit(ZLIST_PRIV_BIT, (unsigned long *)node); + + return ret; +} + +void zlist_node_init(u32 idx, struct zlist_table *tab) +{ + struct zlist_node *node = idx2node(idx, tab); + + memset(node, 0, sizeof(struct zlist_node)); + node->prev = idx; + node->next = idx; +} diff --git a/drivers/block/zram/zram_group/zlist.h b/drivers/block/zram/zram_group/zlist.h new file mode 100644 index 0000000000000000000000000000000000000000..a7cbf37509e9291a1feee9dbc9ac78f79a924f42 --- /dev/null +++ b/drivers/block/zram/zram_group/zlist.h @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * drivers/block/zram/zram_group/zlist.h + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. 
+ */ + +#ifndef _ZLIST_H_ +#define _ZLIST_H_ + +#define ZLIST_IDX_SHIFT 30 +#define ZLIST_LOCK_BIT ZLIST_IDX_SHIFT +#define ZLIST_PRIV_BIT ((ZLIST_IDX_SHIFT << 1) + 1) + +#define ZLIST_IDX_MAX (1 << ZLIST_IDX_SHIFT) + +struct zlist_node { + u64 prev : ZLIST_IDX_SHIFT; + u64 lock : 1; + u64 next : ZLIST_IDX_SHIFT; + u64 priv : 1; +}; + +struct zlist_table { + struct zlist_node *(*idx2node)(u32 idx, void *priv); + void *private; +}; + +static inline struct zlist_node *idx2node(u32 idx, struct zlist_table *tab) +{ + return tab->idx2node(idx, tab->private); +} + +static inline u32 next_idx(u32 idx, struct zlist_table *tab) +{ + return idx2node(idx, tab)->next; +} + +static inline u32 prev_idx(u32 idx, struct zlist_table *tab) +{ + return idx2node(idx, tab)->prev; +} + +static inline void zlist_table_free(struct zlist_table *tab) +{ + kfree(tab); +} + +struct zlist_table *zlist_table_alloc(struct zlist_node *(*i2n)(u32, void*), + void *private, gfp_t gfp); + +void zlist_lock(u32 idx, struct zlist_table *tab); +void zlist_unlock(u32 idx, struct zlist_table *tab); + +void zlist_add_nolock(u32 hid, u32 idx, struct zlist_table *tab); +void zlist_add_tail_nolock(u32 hid, u32 idx, struct zlist_table *tab); +bool zlist_del_nolock(u32 hid, u32 idx, struct zlist_table *tab); +bool zlist_is_isolated_nolock(u32 idx, struct zlist_table *tab); + +static inline void zlist_add(u32 hid, u32 idx, struct zlist_table *tab) +{ + zlist_lock(hid, tab); + zlist_add_nolock(hid, idx, tab); + zlist_unlock(hid, tab); +} + +static inline void zlist_add_tail(u32 hid, u32 idx, struct zlist_table *tab) +{ + zlist_lock(hid, tab); + zlist_add_tail_nolock(hid, idx, tab); + zlist_unlock(hid, tab); +} + +static inline bool zlist_del(u32 hid, u32 idx, struct zlist_table *tab) +{ + bool ret = false; + + zlist_lock(hid, tab); + ret = zlist_del_nolock(hid, idx, tab); + zlist_unlock(hid, tab); + + return ret; +} + +bool zlist_set_priv(u32 idx, struct zlist_table *tab); +bool zlist_clr_priv_nolock(u32 idx, struct zlist_table *tab); +bool zlist_test_priv_nolock(u32 idx, struct zlist_table *tab); + +void zlist_node_init(u32 idx, struct zlist_table *tab); + +#define zlist_for_each_entry(idx, hid, tab) \ + for ((idx) = next_idx(hid, tab); (idx) != (hid); \ + (idx) = next_idx(idx, tab)) +#define zlist_for_each_entry_reverse(idx, hid, tab) \ + for ((idx) = prev_idx(hid, tab); (idx) != (hid); \ + (idx) = prev_idx(idx, tab)) +#endif diff --git a/drivers/block/zram/zram_group/zram_group.c b/drivers/block/zram/zram_group/zram_group.c new file mode 100644 index 0000000000000000000000000000000000000000..9a023e77d5cdb9c90f2b5c2682d1373135e7d86f --- /dev/null +++ b/drivers/block/zram/zram_group/zram_group.c @@ -0,0 +1,672 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * drivers/block/zram/zram_group/zram_group.c + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. + */ + +#define pr_fmt(fmt) "[ZRAM_GROUP]" fmt + +#include +#include +#include "zram_group.h" + +#define CHECK(cond, ...) 
((cond) || (pr_err(__VA_ARGS__), false)) +#define CHECK_BOUND(var, min, max) \ + CHECK((var) >= (min) && (var) <= (max), \ + "%s %u out of bounds %u ~ %u!\n", \ + #var, (var), (min), (max)) + +/* + * idx2node for obj table + */ +static struct zlist_node *get_obj(u32 index, void *private) +{ + struct zram_group *zgrp = private; + + if (index < zgrp->nr_obj) + return &zgrp->obj[index]; + + index -= zgrp->nr_obj; + BUG_ON(!index); + if (index < zgrp->nr_grp) + return &zgrp->grp_obj_head[index]; +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + index -= zgrp->nr_grp; + BUG_ON(index >= zgrp->wbgrp.nr_ext); + return &zgrp->wbgrp.ext_obj_head[index]; +#endif + BUG(); +} + +void zram_group_meta_free(struct zram_group *zgrp) +{ + if (!CHECK(zgrp, "zram group is not enable!\n")) + return; + +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + zram_group_remove_writeback(zgrp); +#endif + vfree(zgrp->grp_obj_head); + vfree(zgrp->obj); + zlist_table_free(zgrp->obj_tab); + vfree(zgrp->stats); + kfree(zgrp); + + pr_info("zram group freed.\n"); +} + +struct zram_group *zram_group_meta_alloc(u32 nr_obj, u32 nr_grp) +{ + struct zram_group *zgrp = NULL; + u32 i; + + if (!CHECK_BOUND(nr_grp, 1, ZGRP_MAX_GRP - 1)) + return NULL; + + /* reserve gid 0 */ + nr_grp++; + if (!CHECK_BOUND(nr_obj, 1, ZGRP_MAX_OBJ)) + return NULL; + zgrp = kzalloc(sizeof(struct zram_group), GFP_KERNEL); + if (!zgrp) + goto err; + zgrp->nr_obj = nr_obj; + zgrp->nr_grp = nr_grp; + zgrp->grp_obj_head = vmalloc(sizeof(struct zlist_node) * zgrp->nr_grp); + if (!zgrp->grp_obj_head) + goto err; + zgrp->obj = vmalloc(sizeof(struct zlist_node) * zgrp->nr_obj); + if (!zgrp->obj) + goto err; + zgrp->obj_tab = zlist_table_alloc(get_obj, zgrp, GFP_KERNEL); + if (!zgrp->obj_tab) + goto err; + zgrp->stats = vzalloc(sizeof(struct zram_group_stats) * zgrp->nr_grp); + if (!zgrp->stats) + goto err; + zgrp->gsdev = NULL; + + for (i = 0; i < zgrp->nr_obj; i++) + zlist_node_init(i, zgrp->obj_tab); + for (i = 1; i < zgrp->nr_grp; i++) + zlist_node_init(i + zgrp->nr_obj, zgrp->obj_tab); + +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + zgrp->wbgrp.enable = false; + mutex_init(&zgrp->wbgrp.init_lock); +#endif + pr_info("zram_group alloc succ.\n"); + return zgrp; +err: + pr_err("zram_group alloc failed!\n"); + zram_group_meta_free(zgrp); + + return NULL; +} + +/* + * insert obj at @index into group @gid as the HOTTEST obj + */ +void zgrp_obj_insert(struct zram_group *zgrp, u32 index, u16 gid) +{ + u32 hid; + + if (!zgrp) { + pr_debug("zram group is not enable!"); + return; + } + if (!CHECK_BOUND(index, 0, zgrp->nr_obj - 1)) + return; + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return; + hid = gid + zgrp->nr_obj; + zlist_add(hid, index, zgrp->obj_tab); + pr_debug("insert obj %u to group %u\n", index, gid); +} + +/* + * remove obj at @index from group @gid + */ +bool zgrp_obj_delete(struct zram_group *zgrp, u32 index, u16 gid) +{ + u32 hid; + + if (!zgrp) { + pr_debug("zram group is not enable!"); + return false; + } + if (!CHECK_BOUND(index, 0, zgrp->nr_obj - 1)) + return false; + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return false; + pr_debug("delete obj %u from group %u\n", index, gid); + hid = gid + zgrp->nr_obj; + + return zlist_del(hid, index, zgrp->obj_tab); +} + +/* + * try to isolate the last @nr objs of @gid, store their indexes in array @idxs + * and @return the obj cnt actually isolated. isolate all objs if nr is 0. 
+ */ +u32 zgrp_isolate_objs(struct zram_group *zgrp, u16 gid, u32 *idxs, u32 nr, bool *last) +{ + u32 hid, idx; + u32 cnt = 0; + u32 i; + + if (last) + *last = false; + if (!zgrp) { + pr_debug("zram group is not enable!"); + return 0; + } + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return 0; + if (!CHECK(idxs, "return array idxs is null!\n")) + return 0; + hid = gid + zgrp->nr_obj; + zlist_lock(hid, zgrp->obj_tab); + zlist_for_each_entry_reverse(idx, hid, zgrp->obj_tab) { + idxs[cnt++] = idx; + if (nr && cnt == nr) + break; + } + for (i = 0; i < cnt; i++) + zlist_del_nolock(hid, idxs[i], zgrp->obj_tab); + if (last) + *last = cnt && zlist_is_isolated_nolock(hid, zgrp->obj_tab); + zlist_unlock(hid, zgrp->obj_tab); + + pr_debug("isolated %u objs from group %u.\n", cnt, gid); + + return cnt; +} + +/* + * check if the obj at @index is isolate from zram groups + */ +bool zgrp_obj_is_isolated(struct zram_group *zgrp, u32 index) +{ + bool ret = false; + + if (!zgrp) { + pr_debug("zram group is not enable!"); + return false; + } + if (!CHECK_BOUND(index, 0, zgrp->nr_obj - 1)) + return false; + + zlist_lock(index, zgrp->obj_tab); + ret = zlist_is_isolated_nolock(index, zgrp->obj_tab); + zlist_unlock(index, zgrp->obj_tab); + + return ret; +} +/* + * insert obj at @index into group @gid as the COLDEST obj + */ +void zgrp_obj_putback(struct zram_group *zgrp, u32 index, u16 gid) +{ + u32 hid; + + if (!zgrp) { + pr_debug("zram group is not enable!"); + return; + } + if (!CHECK_BOUND(index, 0, zgrp->nr_obj - 1)) + return; + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return; + hid = gid + zgrp->nr_obj; + zlist_add_tail(hid, index, zgrp->obj_tab); + pr_debug("putback obj %u to group %u\n", index, gid); +} + +void zgrp_obj_stats_inc(struct zram_group *zgrp, u16 gid, u32 size) +{ + if (!zgrp) { + pr_debug("zram group is not enable!"); + return; + } + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return; + + atomic_inc(&zgrp->stats[gid].zram_pages); + atomic64_add(size, &zgrp->stats[gid].zram_size); + atomic_inc(&zgrp->stats[0].zram_pages); + atomic64_add(size, &zgrp->stats[0].zram_size); +} + +void zgrp_obj_stats_dec(struct zram_group *zgrp, u16 gid, u32 size) +{ + if (!zgrp) { + pr_debug("zram group is not enable!"); + return; + } + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return; + + atomic_dec(&zgrp->stats[gid].zram_pages); + atomic64_sub(size, &zgrp->stats[gid].zram_size); + atomic_dec(&zgrp->stats[0].zram_pages); + atomic64_sub(size, &zgrp->stats[0].zram_size); +} + +void zgrp_fault_stats_inc(struct zram_group *zgrp, u16 gid, u32 size) +{ + if (!zgrp) { + pr_debug("zram group is not enable!"); + return; + } + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return; + + atomic64_inc(&zgrp->stats[gid].zram_fault); + atomic64_inc(&zgrp->stats[0].zram_fault); +} + +#ifdef CONFIG_ZRAM_GROUP_DEBUG +void zram_group_dump(struct zram_group *zgrp, u16 gid, u32 index) +{ + u32 hid, idx; + + if (!zgrp) { + pr_debug("zram group is not enable!"); + return; + } + hid = gid + zgrp->nr_obj; + if (gid == 0) { + struct zlist_node *node = NULL; + + if (!CHECK_BOUND(index, 0, zgrp->nr_obj - 1)) + return; + node = idx2node(index, zgrp->obj_tab); + pr_err("dump index %u = %u %u %u %u\n", index, + node->prev, node->next, + node->lock, node->priv); + } else { + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return; + pr_err("dump index of group %u\n", gid); + zlist_for_each_entry(idx, hid, zgrp->obj_tab) + pr_err("%u\n", idx); + } +} +#endif + +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK +/* + * idx2node for ext table + */ 
+static struct zlist_node *get_ext(u32 index, void *private) +{ + struct zram_group *zgrp = private; + + if (index < zgrp->wbgrp.nr_ext) + return &zgrp->wbgrp.ext[index]; + + index -= zgrp->wbgrp.nr_ext; + BUG_ON(!index); + return &zgrp->wbgrp.grp_ext_head[index]; +} + +/* + * disable writeback for zram group @zgrp + */ +void zram_group_remove_writeback(struct zram_group *zgrp) +{ + if (!CHECK(zgrp, "zram group is not enable!\n")) + return; + if (!CHECK(zgrp->wbgrp.enable, "zram group writeback is not enable!\n")) + return; + zgrp->wbgrp.enable = false; + vfree(zgrp->wbgrp.grp_ext_head); + vfree(zgrp->wbgrp.ext); + zlist_table_free(zgrp->wbgrp.ext_tab); + vfree(zgrp->wbgrp.ext_obj_head); + pr_info("zram group writeback is removed.\n"); +} + +/* + * init & enable writeback on exist zram group @zgrp with a backing device of + * @nr_ext extents. + */ +int zram_group_apply_writeback(struct zram_group *zgrp, u32 nr_ext) +{ + struct writeback_group *wbgrp = NULL; + u32 i; + int ret = 0; + + if (!CHECK(zgrp, "zram group is not enable!\n")) + return -EINVAL; + + mutex_lock(&zgrp->wbgrp.init_lock); + if (!CHECK(!zgrp->wbgrp.enable, "zram group writeback is already enable!\n")) + goto out; + if (!CHECK_BOUND(nr_ext, 1, ZGRP_MAX_EXT)) { + ret = -EINVAL; + goto out; + } + wbgrp = &zgrp->wbgrp; + wbgrp->nr_ext = nr_ext; + wbgrp->grp_ext_head = vmalloc(sizeof(struct zlist_node) * zgrp->nr_grp); + if (!wbgrp->grp_ext_head) { + ret = -ENOMEM; + goto out; + } + wbgrp->ext = vmalloc(sizeof(struct zlist_node) * wbgrp->nr_ext); + if (!wbgrp->ext) { + ret = -ENOMEM; + goto out; + } + wbgrp->ext_obj_head = vmalloc(sizeof(struct zlist_node) * wbgrp->nr_ext); + if (!wbgrp->ext_obj_head) { + ret = -ENOMEM; + goto out; + } + + wbgrp->ext_tab = zlist_table_alloc(get_ext, zgrp, GFP_KERNEL); + if (!wbgrp->ext_tab) { + ret = -ENOMEM; + goto out; + } + + for (i = 0; i < wbgrp->nr_ext; i++) + zlist_node_init(i, wbgrp->ext_tab); + for (i = 1; i < zgrp->nr_grp; i++) + zlist_node_init(i + wbgrp->nr_ext, wbgrp->ext_tab); + + for (i = 0; i < wbgrp->nr_ext; i++) + zlist_node_init(i + zgrp->nr_obj + zgrp->nr_grp, zgrp->obj_tab); + + init_waitqueue_head(&wbgrp->fault_wq); + wbgrp->enable = true; + pr_info("zram group writeback is enabled.\n"); +out: + mutex_unlock(&zgrp->wbgrp.init_lock); + + if (ret) { + zram_group_remove_writeback(zgrp); + pr_err("zram group writeback enable failed!\n"); + } + + return ret; +} + +/* + * attach extent at @eid to group @gid as the HOTTEST extent + */ +void zgrp_ext_insert(struct zram_group *zgrp, u32 eid, u16 gid) +{ + u32 hid; + + if (!zgrp) { + pr_debug("zram group is not enable!"); + return; + } + if (!CHECK(zgrp->wbgrp.enable, "zram group writeback is not enable!\n")) + return; + if (!CHECK_BOUND(eid, 0, zgrp->wbgrp.nr_ext - 1)) + return; + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return; + hid = gid + zgrp->wbgrp.nr_ext; + zlist_add(hid, eid, zgrp->wbgrp.ext_tab); + pr_debug("insert extent %u to group %u\n", eid, gid); +} + +/* + * remove extent at @eid from group @gid + */ +bool zgrp_ext_delete(struct zram_group *zgrp, u32 eid, u16 gid) +{ + u32 hid; + bool isolated = false; + + if (!zgrp) { + pr_debug("zram group is not enable!"); + return false; + } + if (!CHECK(zgrp->wbgrp.enable, "zram group writeback is not enable!\n")) + return false; + if (!CHECK_BOUND(eid, 0, zgrp->wbgrp.nr_ext - 1)) + return false; + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return false; + + zlist_lock(eid, zgrp->wbgrp.ext_tab); + isolated = zlist_is_isolated_nolock(eid, zgrp->wbgrp.ext_tab); + 
zlist_unlock(eid, zgrp->wbgrp.ext_tab); + if (isolated) { + pr_debug("extent %u is already isolated, skip delete.\n", eid); + return false; + } + + pr_debug("delete extent %u from group %u\n", eid, gid); + hid = gid + zgrp->wbgrp.nr_ext; + return zlist_del(hid, eid, zgrp->wbgrp.ext_tab); +} + +/* + * try to isolate the first @nr exts of @gid, store their eids in array @eids + * and @return the cnt actually isolated. isolate all exts if nr is 0. + */ +u32 zgrp_isolate_exts(struct zram_group *zgrp, u16 gid, u32 *eids, u32 nr, bool *last) +{ + u32 hid, idx; + u32 cnt = 0; + u32 i; + + if (last) + *last = false; + if (!zgrp) { + pr_debug("zram group is not enable!"); + return 0; + } + if (!CHECK(zgrp->wbgrp.enable, "zram group writeback is not enable!\n")) + return 0; + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return 0; + if (!CHECK(eids, "return array eids is null!\n")) + return 0; + hid = gid + zgrp->wbgrp.nr_ext; + zlist_lock(hid, zgrp->wbgrp.ext_tab); + zlist_for_each_entry_reverse(idx, hid, zgrp->wbgrp.ext_tab) { + eids[cnt++] = idx; + if (nr && cnt == nr) + break; + } + for (i = 0; i < cnt; i++) + zlist_del_nolock(hid, eids[i], zgrp->wbgrp.ext_tab); + if (last) + *last = cnt && zlist_is_isolated_nolock(hid, zgrp->wbgrp.ext_tab); + zlist_unlock(hid, zgrp->wbgrp.ext_tab); + + pr_debug("isolated %u exts from group %u.\n", cnt, gid); + + return cnt; +} + +void zgrp_get_ext(struct zram_group *zgrp, u32 eid) +{ + u32 hid; + + if (!CHECK(zgrp, "zram group is not enable!\n")) + return; + if (!CHECK(zgrp->wbgrp.enable, "zram group writeback is not enable!\n")) + return; + if (!CHECK_BOUND(eid, 0, zgrp->wbgrp.nr_ext - 1)) + return; + + hid = eid + zgrp->nr_obj + zgrp->nr_grp; + zlist_set_priv(hid, zgrp->obj_tab); + pr_info("get extent %u\n", eid); +} + +bool zgrp_put_ext(struct zram_group *zgrp, u32 eid) +{ + u32 hid; + bool ret = false; + + if (!CHECK(zgrp, "zram group is not enable!\n")) + return false; + if (!CHECK(zgrp->wbgrp.enable, "zram group writeback is not enable!\n")) + return false; + if (!CHECK_BOUND(eid, 0, zgrp->wbgrp.nr_ext - 1)) + return false; + + hid = eid + zgrp->nr_obj + zgrp->nr_grp; + zlist_lock(hid, zgrp->obj_tab); + zlist_clr_priv_nolock(hid, zgrp->obj_tab); + ret = zlist_is_isolated_nolock(hid, zgrp->obj_tab); + zlist_unlock(hid, zgrp->obj_tab); + + pr_info("put extent %u, ret = %d\n", eid, ret); + + return ret; +} + +/* + * insert obj at @index into extent @eid + */ +void wbgrp_obj_insert(struct zram_group *zgrp, u32 index, u32 eid) +{ + u32 hid; + + if (!zgrp) { + pr_debug("zram group is not enable!"); + return; + } + if (!CHECK(zgrp->wbgrp.enable, "zram group writeback is not enable!\n")) + return; + if (!CHECK_BOUND(index, 0, zgrp->nr_obj - 1)) + return; + if (!CHECK_BOUND(eid, 0, zgrp->wbgrp.nr_ext - 1)) + return; + hid = eid + zgrp->nr_obj + zgrp->nr_grp; + zlist_add_tail(hid, index, zgrp->obj_tab); + pr_debug("insert obj %u to extent %u\n", index, eid); +} + +/* + * remove obj at @index from extent @eid + */ +bool wbgrp_obj_delete(struct zram_group *zgrp, u32 index, u32 eid) +{ + u32 hid; + bool ret = false; + + if (!zgrp) { + pr_debug("zram group is not enable!"); + return false; + } + if (!CHECK(zgrp->wbgrp.enable, "zram group writeback is not enable!\n")) + return false; + if (!CHECK_BOUND(index, 0, zgrp->nr_obj - 1)) + return false; + if (!CHECK_BOUND(eid, 0, zgrp->wbgrp.nr_ext - 1)) + return false; + pr_debug("delete obj %u from extent %u\n", index, eid); + hid = eid + zgrp->nr_obj + zgrp->nr_grp; + + zlist_lock(hid, zgrp->obj_tab); + ret = 
zlist_del_nolock(hid, index, zgrp->obj_tab) + && !zlist_test_priv_nolock(hid, zgrp->obj_tab); + zlist_unlock(hid, zgrp->obj_tab); + + return ret; +} + +/* + * try to isolate the first @nr writeback objs of @eid, store their indexes in + * array @idxs and @return the obj cnt actually isolated. isolate all objs if + * @nr is 0. + */ +u32 wbgrp_isolate_objs(struct zram_group *zgrp, u32 eid, u32 *idxs, u32 nr, bool *last) +{ + u32 hid, idx; + u32 cnt = 0; + u32 i; + + if (last) + *last = false; + if (!zgrp) { + pr_debug("zram group is not enable!"); + return 0; + } + if (!CHECK(zgrp->wbgrp.enable, "zram group writeback is not enable!\n")) + return 0; + if (!CHECK_BOUND(eid, 0, zgrp->wbgrp.nr_ext - 1)) + return 0; + if (!CHECK(idxs, "return array idxs is null!\n")) + return 0; + hid = eid + zgrp->nr_obj + zgrp->nr_grp; + zlist_lock(hid, zgrp->obj_tab); + zlist_for_each_entry(idx, hid, zgrp->obj_tab) { + idxs[cnt++] = idx; + if (nr && cnt == nr) + break; + } + for (i = 0; i < cnt; i++) + zlist_del_nolock(hid, idxs[i], zgrp->obj_tab); + if (last) + *last = cnt && zlist_is_isolated_nolock(hid, zgrp->obj_tab) + && !zlist_test_priv_nolock(hid, zgrp->obj_tab); + zlist_unlock(hid, zgrp->obj_tab); + + pr_debug("isolated %u objs from extent %u.\n", cnt, eid); + + return cnt; +} + +void wbgrp_obj_stats_inc(struct zram_group *zgrp, u16 gid, u32 eid, u32 size) +{ + if (!zgrp) { + pr_debug("zram group is not enable!"); + return; + } + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return; + if (!CHECK_BOUND(eid, 0, zgrp->wbgrp.nr_ext - 1)) + return; + + atomic_inc(&zgrp->stats[gid].wb_pages); + atomic64_add(size, &zgrp->stats[gid].wb_size); + atomic_inc(&zgrp->stats[0].wb_pages); + atomic64_add(size, &zgrp->stats[0].wb_size); +} + +void wbgrp_obj_stats_dec(struct zram_group *zgrp, u16 gid, u32 eid, u32 size) +{ + if (!zgrp) { + pr_debug("zram group is not enable!"); + return; + } + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return; + if (!CHECK_BOUND(eid, 0, zgrp->wbgrp.nr_ext - 1)) + return; + + atomic_dec(&zgrp->stats[gid].wb_pages); + atomic64_sub(size, &zgrp->stats[gid].wb_size); + atomic_dec(&zgrp->stats[0].wb_pages); + atomic64_sub(size, &zgrp->stats[0].wb_size); +} + +void wbgrp_fault_stats_inc(struct zram_group *zgrp, u16 gid, u32 eid, u32 size) +{ + if (!zgrp) { + pr_debug("zram group is not enable!"); + return; + } + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return; + if (!CHECK_BOUND(eid, 0, zgrp->wbgrp.nr_ext - 1)) + return; + + atomic64_inc(&zgrp->stats[gid].wb_fault); + atomic64_inc(&zgrp->stats[0].wb_fault); +} +#endif diff --git a/drivers/block/zram/zram_group/zram_group.h b/drivers/block/zram/zram_group/zram_group.h new file mode 100644 index 0000000000000000000000000000000000000000..9b184b7bda77b55ddb78b37a0d0af06c2e04caa3 --- /dev/null +++ b/drivers/block/zram/zram_group/zram_group.h @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * drivers/block/zram/zram_group/zram_group.h + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. 
+ */ + +#ifndef _ZRAM_GROUP_H_ +#define _ZRAM_GROUP_H_ + +#include +#include + +#include "zlist.h" + +#define ZGRP_MAX_GRP USHRT_MAX +#define ZGRP_MAX_OBJ (1 << 30) + +enum { + ZGRP_NONE = 0, + ZGRP_TRACK, +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + ZGRP_WRITE, +#endif +}; + +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK +#define ZGRP_MAX_EXT (ZLIST_IDX_MAX - ZGRP_MAX_GRP - ZGRP_MAX_OBJ) +struct writeback_group { + bool enable; + u32 nr_ext; + struct zlist_node *grp_ext_head; + struct zlist_node *ext; + struct zlist_table *ext_tab; + struct zlist_node *ext_obj_head; + struct mutex init_lock; + wait_queue_head_t fault_wq; +}; +#endif + +struct zram_group_stats { + atomic64_t zram_size; + atomic_t zram_pages; + atomic64_t zram_fault; +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + atomic64_t wb_size; + atomic_t wb_pages; + atomic64_t wb_fault; + atomic_t wb_exts; + atomic64_t write_size; + atomic64_t read_size; +#endif +}; + +struct zram_group { + u32 nr_obj; + u32 nr_grp; + struct zlist_node *grp_obj_head; + struct zlist_node *obj; + struct zlist_table *obj_tab; +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + struct writeback_group wbgrp; +#endif + struct group_swap_device *gsdev; + struct zram_group_stats *stats; +}; + +void zram_group_meta_free(struct zram_group *zgrp); +struct zram_group *zram_group_meta_alloc(u32 nr_obj, u32 nr_grp); +void zgrp_obj_insert(struct zram_group *zgrp, u32 index, u16 gid); +bool zgrp_obj_delete(struct zram_group *zgrp, u32 index, u16 gid); +u32 zgrp_isolate_objs(struct zram_group *zgrp, u16 gid, u32 *idxs, u32 nr, bool *last); +bool zgrp_obj_is_isolated(struct zram_group *zgrp, u32 index); +void zgrp_obj_putback(struct zram_group *zgrp, u32 index, u16 gid); +void zgrp_obj_stats_inc(struct zram_group *zgrp, u16 gid, u32 size); +void zgrp_obj_stats_dec(struct zram_group *zgrp, u16 gid, u32 size); +void zgrp_fault_stats_inc(struct zram_group *zgrp, u16 gid, u32 size); + +#ifdef CONFIG_ZRAM_GROUP_DEBUG +void zram_group_dump(struct zram_group *zgrp, u16 gid, u32 index); +#endif + +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK +void zram_group_remove_writeback(struct zram_group *zgrp); +int zram_group_apply_writeback(struct zram_group *zgrp, u32 nr_ext); +void zgrp_ext_insert(struct zram_group *zgrp, u32 eid, u16 gid); +bool zgrp_ext_delete(struct zram_group *zgrp, u32 eid, u16 gid); +u32 zgrp_isolate_exts(struct zram_group *zgrp, u16 gid, u32 *eids, u32 nr, bool *last); +void zgrp_get_ext(struct zram_group *zgrp, u32 eid); +bool zgrp_put_ext(struct zram_group *zgrp, u32 eid); +void wbgrp_obj_insert(struct zram_group *zgrp, u32 index, u32 eid); +bool wbgrp_obj_delete(struct zram_group *zgrp, u32 index, u32 eid); +u32 wbgrp_isolate_objs(struct zram_group *zgrp, u32 eid, u32 *idxs, u32 nr, bool *last); +void wbgrp_obj_stats_inc(struct zram_group *zgrp, u16 gid, u32 eid, u32 size); +void wbgrp_obj_stats_dec(struct zram_group *zgrp, u16 gid, u32 eid, u32 size); +void wbgrp_fault_stats_inc(struct zram_group *zgrp, u16 gid, u32 eid, u32 size); +#endif +#endif diff --git a/drivers/dma-buf/Kconfig b/drivers/dma-buf/Kconfig index e4dc53a364282457e6a85fbc9a09f27ad561c401..c86a4756a29b8d5addd9ed0e15b2f331f54a361b 100644 --- a/drivers/dma-buf/Kconfig +++ b/drivers/dma-buf/Kconfig @@ -65,6 +65,19 @@ config DMABUF_SELFTESTS default n depends on DMA_SHARED_BUFFER +config DMABUF_PROCESS_INFO + bool "Show dmabuf usage of all processes" + default n + depends on DMA_SHARED_BUFFER + depends on PROC_FS || DEBUG_FS + help + Choose this option to show dmabuf objects usage of all processes. 
+ Firstly, with this option, when a process creates a dmabuf object, + its pid and task_comm will be recorded in the dmabuf. + Secondly, this option creates dma_buf/process_bufinfo file in + debugfs (if DEBUG_FS enabled) and process_dmabuf_info file in procfs + (if PROC_FS enabled) to show dmabuf objects usage of all processes. + menuconfig DMABUF_HEAPS bool "DMA-BUF Userland Memory Heaps" select DMA_SHARED_BUFFER diff --git a/drivers/dma-buf/Makefile b/drivers/dma-buf/Makefile index 70ec901edf2c59f427e9b09e159426eb65405377..cdb3bb0493a94eb40d0b4a9498945b7194f159ad 100644 --- a/drivers/dma-buf/Makefile +++ b/drivers/dma-buf/Makefile @@ -16,3 +16,5 @@ dmabuf_selftests-y := \ st-dma-resv.o obj-$(CONFIG_DMABUF_SELFTESTS) += dmabuf_selftests.o + +obj-$(CONFIG_DMABUF_PROCESS_INFO) += dma-buf-process-info.o diff --git a/drivers/dma-buf/dma-buf-process-info.c b/drivers/dma-buf/dma-buf-process-info.c new file mode 100755 index 0000000000000000000000000000000000000000..ec8ff826574b84610bd4cf7f7021b9be62d0d725 --- /dev/null +++ b/drivers/dma-buf/dma-buf-process-info.c @@ -0,0 +1,167 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DMA-BUF: dmabuf usage of all processes statistics. + * + * Copyright (c) 2022 Huawei Device Co., Ltd. + */ + +#include +#include +#include +#include +#include + +#include "dma-buf-process-info.h" + +static struct proc_dir_entry *proc_dmabuf_entry; + +struct dmabuf_task_info_args { + struct seq_file *seq; + struct task_struct *tsk; + size_t tsk_dmabuf_bytes; +}; + +void init_dma_buf_task_info(struct dma_buf *buf) +{ + struct task_struct *tsk = NULL; + + if (IS_ERR_OR_NULL(buf)) + return; + + get_task_struct(current->group_leader); + task_lock(current->group_leader); + tsk = current->group_leader; + buf->exp_pid = task_pid_nr(tsk); + if (tsk->flags & PF_KTHREAD) + tsk = NULL; + task_unlock(current->group_leader); + put_task_struct(current->group_leader); + + if (tsk) + get_task_comm(buf->exp_task_comm, tsk); + else /* kernel task */ + strncpy(buf->exp_task_comm, "[kernel task]", + sizeof(buf->exp_task_comm)); +} + +pid_t dma_buf_exp_pid(const struct dma_buf *buf) +{ + if (IS_ERR_OR_NULL(buf)) + return 0; + + return buf->exp_pid; +} + +const char *dma_buf_exp_task_comm(const struct dma_buf *buf) +{ + if (IS_ERR_OR_NULL(buf)) + return NULL; + + return buf->exp_task_comm; +} + +static int dma_buf_single_file_show(const void *data, struct file *f, + unsigned int fd) +{ + struct dmabuf_task_info_args *tsk_info = NULL; + struct task_struct *tsk = NULL; + struct dma_buf *buf = NULL; + + tsk_info = (struct dmabuf_task_info_args *)data; + if (IS_ERR_OR_NULL(tsk_info) || IS_ERR_OR_NULL(tsk_info->seq)) + return 0; + + tsk = tsk_info->tsk; + buf = get_dma_buf_from_file(f); + if (IS_ERR_OR_NULL(tsk) || IS_ERR_OR_NULL(buf)) + return 0; + + tsk_info->tsk_dmabuf_bytes += buf->size; + + spin_lock(&buf->name_lock); + seq_printf(tsk_info->seq, + "%-16s %-16d %-16u %-16zu %-16lu %-16d %-16s %s \t %s\n", + tsk->comm, + tsk->pid, + fd, + buf->size, + file_inode(buf->file)->i_ino, + buf->exp_pid, + buf->exp_task_comm, + buf->name ?: "NULL", + buf->exp_name ?: "NULL"); + spin_unlock(&buf->name_lock); + + return 0; +} + +static int dma_buf_process_info_show(struct seq_file *s, void *unused) +{ + struct dmabuf_task_info_args task_info = { NULL, NULL, 0 }; + struct task_struct *tsk = NULL; + + seq_puts(s, "Dma-buf objects usage of processes:\n"); + seq_printf(s, "%-16s %-16s %-16s %-16s %-16s %-16s %-16s %s \t %s\n", + "Process", "pid", "fd", "size_bytes", "ino", "exp_pid", + "exp_task_comm", 
"buf_name", "exp_name"); + + task_info.seq = s; + + rcu_read_lock(); + for_each_process(tsk) { + task_info.tsk = tsk; + task_info.tsk_dmabuf_bytes = 0; + + task_lock(tsk); + iterate_fd(tsk->files, 0, dma_buf_single_file_show, + (void *)&task_info); + if (task_info.tsk_dmabuf_bytes) + seq_printf(s, "Total dmabuf size of %s: %zu bytes\n", + tsk->comm, task_info.tsk_dmabuf_bytes); + task_unlock(tsk); + } + rcu_read_unlock(); + + return 0; +} + +void dma_buf_process_info_init_procfs(void) +{ + proc_dmabuf_entry = proc_create_single("process_dmabuf_info", 0444, + NULL, + dma_buf_process_info_show); + if (!proc_dmabuf_entry) + pr_err("%s: create node /proc/process_dmabuf_info failed\n", + __func__); +} + +void dma_buf_process_info_uninit_procfs(void) +{ + if (!proc_dmabuf_entry) + return; + + proc_remove(proc_dmabuf_entry); +} + +DEFINE_SHOW_ATTRIBUTE(dma_buf_process_info); + +int dma_buf_process_info_init_debugfs(struct dentry *parent) +{ + struct dentry *debugfs_file = NULL; + int err = 0; + + if (IS_ERR_OR_NULL(parent)) + return -EINVAL; + + debugfs_file = debugfs_create_file("process_bufinfo", S_IRUGO, + parent, NULL, + &dma_buf_process_info_fops); + if (IS_ERR(debugfs_file)) { + pr_err("dma_buf: debugfs: create process_bufinfo failed\n"); + err = PTR_ERR(debugfs_file); + } + + pr_err("dma_buf: debugfs: create process_bufinfo\n"); + + return err; +} diff --git a/drivers/dma-buf/dma-buf-process-info.h b/drivers/dma-buf/dma-buf-process-info.h new file mode 100755 index 0000000000000000000000000000000000000000..1275c1c7e2aaa68503962f4d9daa77541141e840 --- /dev/null +++ b/drivers/dma-buf/dma-buf-process-info.h @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * DMA-BUF: dmabuf usage of all processes statistics. + * + * Copyright (c) 2022 Huawei Device Co., Ltd. + */ + +#ifndef __DMA_BUF_PROCESS_INFO_H +#define __DMA_BUF_PROCESS_INFO_H + +#ifdef CONFIG_DMABUF_PROCESS_INFO +/** + * init_dma_buf_task_info - init exp_pid and exp_task_comm of dma_buf + * @buf: [in] pointer to struct dma_buf. If @buf IS_ERR_OR_NULL, + * return with doing nothing. + */ +void init_dma_buf_task_info(struct dma_buf *buf); + +/** + * dma_buf_exp_pid - return exp_pid of @buf + * @buf: [in] pointer to struct dma_buf + * + * Return 0 if @buf IS_ERR_OR_NULL, else return buf->exp_pid + */ +pid_t dma_buf_exp_pid(const struct dma_buf *buf); + +/** + * dma_buf_exp_task_comm - return exp_task_comm of @buf + * @buf: [in] pointer to struct dma_buf + * + * Return NULL if @buf IS_ERR_OR_NULL, else return buf->exp_task_comm + */ +const char *dma_buf_exp_task_comm(const struct dma_buf *buf); + +/** + * dma_buf_process_info_init_procfs - module init: create node in procfs + */ +void dma_buf_process_info_init_procfs(void); + +/** + * dma_buf_process_info_uninit_procfs - module exit: remove node in procfs + */ +void dma_buf_process_info_uninit_procfs(void); + +/** + * dma_buf_process_info_init_debugfs - create debug node under @parent + * in debugfs. + * @parent: [in] pointer to struct dentry. If @parent IS_ERR_OR_NULL, + * return -EINVAL + * + * Return 0 if success, otherwise return errno. + * + * Note that there is no related uninit function, since the debug node will + * be removed in dma_buf_uninit_debugfs() when dma_buf_deinit() called. 
+ */ +int dma_buf_process_info_init_debugfs(struct dentry *parent); + +#else /* CONFIG_DMABUF_PROCESS_INFO */ + +static inline void init_dma_buf_task_info(struct dma_buf *buf) {} + +static inline pid_t dma_buf_exp_pid(const struct dma_buf *buf) +{ + return 0; +} + +static inline const char *dma_buf_exp_task_comm(const struct dma_buf *buf) +{ + return NULL; +} + +static inline void dma_buf_process_info_init_procfs(void) {} + +static inline void dma_buf_process_info_uninit_procfs(void) {} + +static inline int +dma_buf_process_info_init_debugfs(struct dentry *parent) +{ + return 0; +} +#endif /* CONFIG_DMABUF_PROCESS_INFO */ +#endif /* __DMA_BUF_PROCESS_INFO_H */ + diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index 757c0fb77a6cb33aba4fc6fa28f529ae6f507201..23662cfb635cd4bc72936eb5ad17129cae6b9205 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -32,6 +32,7 @@ #include #include "dma-buf-sysfs-stats.h" +#include "dma-buf-process-info.h" static inline int is_dma_buf_file(struct file *); @@ -1700,6 +1701,7 @@ static int dma_buf_init_debugfs(void) err = PTR_ERR(d); } + dma_buf_process_info_init_debugfs(dma_buf_debugfs_dir); return err; } @@ -1717,6 +1719,19 @@ static inline void dma_buf_uninit_debugfs(void) } #endif +#ifdef CONFIG_DMABUF_PROCESS_INFO +struct dma_buf *get_dma_buf_from_file(struct file *f) +{ + if (IS_ERR_OR_NULL(f)) + return NULL; + + if (!is_dma_buf_file(f)) + return NULL; + + return f->private_data; +} +#endif /* CONFIG_DMABUF_PROCESS_INFO */ + static int __init dma_buf_init(void) { int ret; @@ -1732,6 +1747,7 @@ static int __init dma_buf_init(void) mutex_init(&db_list.lock); INIT_LIST_HEAD(&db_list.head); dma_buf_init_debugfs(); + dma_buf_process_info_init_procfs(); return 0; } subsys_initcall(dma_buf_init); @@ -1741,5 +1757,6 @@ static void __exit dma_buf_deinit(void) dma_buf_uninit_debugfs(); kern_unmount(dma_buf_mnt); dma_buf_uninit_sysfs_statistics(); + dma_buf_process_info_uninit_procfs(); } __exitcall(dma_buf_deinit); diff --git a/drivers/hyperhold/Kconfig b/drivers/hyperhold/Kconfig new file mode 100644 index 0000000000000000000000000000000000000000..4bba0efd1c3e7c50edb1f67152204814cf2892e2 --- /dev/null +++ b/drivers/hyperhold/Kconfig @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0 +config HYPERHOLD + bool "Hyperhold driver" + select HYPERHOLD_ZSWAPD + select HYPERHOLD_MEMCG + default n + help + Hyperhold driver. + +config HYPERHOLD_DEBUG + bool "Debug info for Hyperhold driver" + depends on HYPERHOLD + help + Debug info for Hyperhold driver. diff --git a/drivers/hyperhold/Makefile b/drivers/hyperhold/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..b45a1a6784669913d3f484bd2b6f7665724e4d3b --- /dev/null +++ b/drivers/hyperhold/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 +hyperhold-y := hp_core.o hp_device.o hp_space.o hp_iotab.o + +obj-$(CONFIG_HYPERHOLD) += hyperhold.o diff --git a/drivers/hyperhold/hp_core.c b/drivers/hyperhold/hp_core.c new file mode 100644 index 0000000000000000000000000000000000000000..a2288c1d3f7d2a006ee77a4c1789e946f907ef3e --- /dev/null +++ b/drivers/hyperhold/hp_core.c @@ -0,0 +1,854 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * drivers/hyperhold/hp_core.c + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. 
+ */ + + #define pr_fmt(fmt) "[HYPERHOLD]" fmt + +#include +#include +#include +#include + +#include "hyperhold.h" +#include "hp_device.h" +#include "hp_space.h" +#include "hp_iotab.h" + +#define HP_DFLT_DEVICE "/dev/by-name/hyperhold" +#define HP_DFLT_EXT_SIZE (1 << 15) +#define HP_DEV_NAME_LEN 256 +#define HP_STATE_LEN 10 + +#define CHECK(cond, ...) ((cond) || (pr_err(__VA_ARGS__), false)) +#define CHECK_BOUND(var, min, max) \ + CHECK((var) >= (min) && (var) <= (max), \ + "%s %u out of bounds %u ~ %u!\n", #var, (var), (min), (max)) +#define CHECK_INITED CHECK(hyperhold.inited, "hyperhold is not enable!\n") +#define CHECK_ENABLE (CHECK_INITED && CHECK(hyperhold.enable, "hyperhold is readonly!\n")) + +struct hyperhold { + bool enable; + bool inited; + + char device_name[HP_DEV_NAME_LEN]; + u32 extent_size; + u32 enable_soft_crypt; + + struct hp_device dev; + struct hp_space spc; + + struct workqueue_struct *read_wq; + struct workqueue_struct *write_wq; + + struct mutex init_lock; +}; + +struct hyperhold hyperhold; + +atomic64_t mem_used = ATOMIC64_INIT(0); +#ifdef CONFIG_HYPERHOLD_DEBUG +/* + * return the memory overhead of hyperhold module + */ +u64 hyperhold_memory_used(void) +{ + return atomic64_read(&mem_used) + hpio_memory() + space_memory(); +} +#endif + +void hyperhold_disable(bool force) +{ + if (!CHECK_INITED) + return; + if (!force && !CHECK_ENABLE) + return; + + mutex_lock(&hyperhold.init_lock); + hyperhold.enable = false; + if (!wait_for_space_empty(&hyperhold.spc, force)) + goto out; + hyperhold.inited = false; + wait_for_iotab_empty(); + destroy_workqueue(hyperhold.read_wq); + destroy_workqueue(hyperhold.write_wq); + deinit_space(&hyperhold.spc); + crypto_deinit(&hyperhold.dev); + unbind_bdev(&hyperhold.dev); +out: + if (hyperhold.inited) + pr_info("hyperhold is disabled, read only.\n"); + else + pr_info("hyperhold is totally disabled!\n"); + mutex_unlock(&hyperhold.init_lock); +} +EXPORT_SYMBOL(hyperhold_disable); + +void hyperhold_enable(void) +{ + bool enable = true; + + if (hyperhold.inited) + goto out; + + mutex_lock(&hyperhold.init_lock); + if (hyperhold.inited) + goto unlock; + if (!bind_bdev(&hyperhold.dev, hyperhold.device_name)) + goto err1; + if (!crypto_init(&hyperhold.dev, hyperhold.enable_soft_crypt)) + goto err2; + if (!init_space(&hyperhold.spc, hyperhold.dev.dev_size, hyperhold.extent_size)) + goto err3; + hyperhold.read_wq = alloc_workqueue("hyperhold_read", WQ_HIGHPRI | WQ_UNBOUND, 0); + if (!hyperhold.read_wq) + goto err4; + hyperhold.write_wq = alloc_workqueue("hyperhold_write", 0, 0); + if (!hyperhold.write_wq) + goto err5; + hyperhold.inited = true; + goto unlock; +err5: + destroy_workqueue(hyperhold.read_wq); +err4: + deinit_space(&hyperhold.spc); +err3: + crypto_deinit(&hyperhold.dev); +err2: + unbind_bdev(&hyperhold.dev); +err1: + enable = false; +unlock: + mutex_unlock(&hyperhold.init_lock); +out: + if (enable) { + hyperhold.enable = true; + pr_info("hyperhold is enabled.\n"); + } else { + hyperhold.enable = false; + pr_err("hyperhold enable failed!\n"); + } +} +EXPORT_SYMBOL(hyperhold_enable); + +static int enable_sysctl_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + const struct cred *cred = current_cred(); + char *filter_buf; + + filter_buf = strstrip((char *)buffer); + if (write) { + if (!uid_eq(cred->euid, GLOBAL_MEMMGR_UID) && + !uid_eq(cred->euid, GLOBAL_ROOT_UID)) { + pr_err("no permission to enable/disable eswap!\n"); + return 0; + } + if (!strcmp(filter_buf, "enable")) + hyperhold_enable(); + 
else if (!strcmp(filter_buf, "disable")) + hyperhold_disable(false); + else if (!strcmp(filter_buf, "force_disable")) + hyperhold_disable(true); + } else { + if (*lenp < HP_STATE_LEN || *ppos) { + *lenp = 0; + return 0; + } + if (hyperhold.enable) + strcpy(buffer, "enable\n"); + else if (hyperhold.inited) + strcpy(buffer, "readonly\n"); + else + strcpy(buffer, "disable\n"); + *lenp = strlen(buffer); + *ppos += *lenp; +#ifdef CONFIG_HYPERHOLD_DEBUG + pr_info("hyperhold memory overhead = %llu.\n", hyperhold_memory_used()); +#endif + } + return 0; +} + +static int device_sysctl_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + mutex_lock(&hyperhold.init_lock); + if (write && hyperhold.inited) { + pr_err("hyperhold device is busy!\n"); + ret = -EBUSY; + goto unlock; + } + ret = proc_dostring(table, write, buffer, lenp, ppos); + if (write && !ret) { + hyperhold.enable_soft_crypt = 1; + pr_info("device changed, default enable soft crypt.\n"); + } +unlock: + mutex_unlock(&hyperhold.init_lock); + + return ret; +} + +static int extent_sysctl_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + mutex_lock(&hyperhold.init_lock); + if (write && hyperhold.inited) { + pr_err("hyperhold device is busy!\n"); + ret = -EBUSY; + goto unlock; + } + ret = proc_douintvec(table, write, buffer, lenp, ppos); +unlock: + mutex_unlock(&hyperhold.init_lock); + + return ret; +} + +static int crypto_sysctl_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + mutex_lock(&hyperhold.init_lock); + if (write && hyperhold.inited) { + pr_err("hyperhold device is busy!\n"); + ret = -EBUSY; + goto unlock; + } + ret = proc_douintvec_minmax(table, write, buffer, lenp, ppos); +unlock: + mutex_unlock(&hyperhold.init_lock); + + return ret; +} + +static struct ctl_table_header *hp_sysctl_header; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 3, 0) +static struct ctl_table hp_sys_table[] = { + { + .procname = "enable", + .mode = 0666, + .proc_handler = enable_sysctl_handler, + }, + { + .procname = "device", + .data = &hyperhold.device_name, + .maxlen = sizeof(hyperhold.device_name), + .mode = 0644, + .proc_handler = device_sysctl_handler, + }, + { + .procname = "extent_size", + .data = &hyperhold.extent_size, + .maxlen = sizeof(hyperhold.extent_size), + .mode = 0644, + .proc_handler = extent_sysctl_handler, + }, + { + .procname = "soft_crypt", + .data = &hyperhold.enable_soft_crypt, + .maxlen = sizeof(hyperhold.enable_soft_crypt), + .mode = 0644, + .proc_handler = crypto_sysctl_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; +#else +static struct ctl_table hp_table[] = { + { + .procname = "enable", + .mode = 0666, + .proc_handler = enable_sysctl_handler, + }, + { + .procname = "device", + .data = &hyperhold.device_name, + .maxlen = sizeof(hyperhold.device_name), + .mode = 0644, + .proc_handler = device_sysctl_handler, + }, + { + .procname = "extent_size", + .data = &hyperhold.extent_size, + .maxlen = sizeof(hyperhold.extent_size), + .mode = 0644, + .proc_handler = extent_sysctl_handler, + }, + { + .procname = "soft_crypt", + .data = &hyperhold.enable_soft_crypt, + .maxlen = sizeof(hyperhold.enable_soft_crypt), + .mode = 0644, + .proc_handler = crypto_sysctl_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; +static struct ctl_table hp_kernel_table[] = { + { + .procname = "hyperhold", + .mode = 0555, + .child = hp_table, + }, + {} 
+}; +static struct ctl_table hp_sys_table[] = { + { + .procname = "kernel", + .mode = 0555, + .child = hp_kernel_table, + }, + {} +}; +#endif + +bool is_hyperhold_enable(void) +{ + return hyperhold.enable; +} + +static int __init hyperhold_init(void) +{ + strcpy(hyperhold.device_name, HP_DFLT_DEVICE); + hyperhold.extent_size = HP_DFLT_EXT_SIZE; + hyperhold.enable_soft_crypt = 1; + mutex_init(&hyperhold.init_lock); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 3, 0) + hp_sysctl_header = register_sysctl("kernel/hyperhold", hp_sys_table); +#else + hp_sysctl_header = register_sysctl_table(hp_sys_table); +#endif + if (!hp_sysctl_header) { + pr_err("register hyperhold sysctl table failed!\n"); + return -EINVAL; + } + + return 0; +} + +static void __exit hyperhold_exit(void) +{ + unregister_sysctl_table(hp_sysctl_header); + hyperhold_disable(true); +} + +static struct hp_space *space_of(u32 eid) +{ + return &hyperhold.spc; +} + +/* replace this func for multi devices */ +static struct hp_device *device_of(u32 eid) +{ + return &hyperhold.dev; +} + +/* replace this func for multi devices */ +u32 hyperhold_nr_extent(void) +{ + if (!CHECK_INITED) + return 0; + + return hyperhold.spc.nr_ext; +} +EXPORT_SYMBOL(hyperhold_nr_extent); + +u32 hyperhold_extent_size(u32 eid) +{ + struct hp_space *spc = NULL; + + if (!CHECK_INITED) + return 0; + spc = space_of(eid); + if (!CHECK(spc, "invalid eid %u!\n", eid)) + return 0; + + return spc->ext_size; +} +EXPORT_SYMBOL(hyperhold_extent_size); + +/* replace this func for multi devices */ +long hyperhold_address(u32 eid, u32 offset) +{ + struct hp_space *spc = NULL; + + if (!CHECK_INITED) + return -EINVAL; + spc = space_of(eid); + if (!CHECK(spc, "invalid eid %u!\n", eid)) + return -EINVAL; + if (!CHECK_BOUND(offset, 0, spc->ext_size - 1)) + return -EINVAL; + + return (u64)eid * spc->ext_size + offset; +} +EXPORT_SYMBOL(hyperhold_address); + +/* replace this func for multi devices */ +int hyperhold_addr_extent(u64 addr) +{ + struct hp_space *spc = NULL; + u32 eid; + + if (!CHECK_INITED) + return -EINVAL; + eid = div_u64(addr, hyperhold.spc.ext_size); + spc = space_of(eid); + if (!CHECK(spc, "invalid eid %u!\n", eid)) + return -EINVAL; + + return eid; +} +EXPORT_SYMBOL(hyperhold_addr_extent); + +/* replace this func for multi devices */ +int hyperhold_addr_offset(u64 addr) +{ + if (!CHECK_INITED) + return -EINVAL; + + return do_div(addr, hyperhold.spc.ext_size); +} +EXPORT_SYMBOL(hyperhold_addr_offset); + +/* replace this func for multi devices */ +int hyperhold_alloc_extent(void) +{ + if (!CHECK_ENABLE) + return -EINVAL; + + return alloc_eid(&hyperhold.spc); +} +EXPORT_SYMBOL(hyperhold_alloc_extent); + +void hyperhold_free_extent(u32 eid) +{ + struct hp_space *spc = NULL; + + if (!CHECK_INITED) + return; + spc = space_of(eid); + if (!CHECK(spc, "invalid eid %u!\n", eid)) + return; + + free_eid(spc, eid); +} +EXPORT_SYMBOL(hyperhold_free_extent); + +void hyperhold_should_free_extent(u32 eid) +{ + struct hpio *hpio = NULL; + struct hp_space *spc = NULL; + + if (!CHECK_INITED) + return; + spc = space_of(eid); + if (!CHECK(spc, "invalid eid %u", eid)) + return; + + hpio = hpio_get(eid); + if (!hpio) { + free_eid(spc, eid); + return; + } + hpio->free_extent = hyperhold_free_extent; + hpio_put(hpio); +} +EXPORT_SYMBOL(hyperhold_should_free_extent); + +/* + * alloc hpio struct for r/w extent at @eid, will fill hpio with new alloced + * pages if @new_page. @return NULL on fail. 
+ */ +struct hpio *hyperhold_io_alloc(u32 eid, gfp_t gfp, unsigned int op, bool new_page) +{ + struct hpio *hpio = NULL; + struct hp_space *spc; + u32 nr_page; + + if (!CHECK_ENABLE) + return NULL; + spc = space_of(eid); + if (!CHECK(spc, "invalid eid %u!\n", eid)) + return NULL; + + nr_page = spc->ext_size / PAGE_SIZE; + hpio = hpio_alloc(nr_page, gfp, op, new_page); + if (!hpio) + goto err; + hpio->eid = eid; + + return hpio; +err: + hpio_free(hpio); + + return NULL; +} +EXPORT_SYMBOL(hyperhold_io_alloc); + +void hyperhold_io_free(struct hpio *hpio) +{ + if (!CHECK_INITED) + return; + if (!CHECK(hpio, "hpio is null!\n")) + return; + + hpio_free(hpio); +} +EXPORT_SYMBOL(hyperhold_io_free); + +/* + * find exist read hpio of the extent @eid in iotab and inc its refcnt, + * alloc a new hpio and insert it into iotab if there is no hpio for @eid + */ +struct hpio *hyperhold_io_get(u32 eid, gfp_t gfp, unsigned int op) +{ + struct hp_space *spc = NULL; + u32 nr_page; + + if (!CHECK_INITED) + return NULL; + spc = space_of(eid); + if (!CHECK(spc, "invalid eid %u", eid)) + return NULL; + + nr_page = spc->ext_size / PAGE_SIZE; + return hpio_get_alloc(eid, nr_page, gfp, op); +} +EXPORT_SYMBOL(hyperhold_io_get); + +bool hyperhold_io_put(struct hpio *hpio) +{ + if (!CHECK_INITED) + return false; + if (!CHECK(hpio, "hpio is null!\n")) + return false; + + return hpio_put(hpio); +} +EXPORT_SYMBOL(hyperhold_io_put); + +/* + * notify all threads waiting for this hpio + */ +void hyperhold_io_complete(struct hpio *hpio) +{ + if (!CHECK_INITED) + return; + if (!CHECK(hpio, "hpio is null!\n")) + return; + + hpio_complete(hpio); +} +EXPORT_SYMBOL(hyperhold_io_complete); + +void hyperhold_io_wait(struct hpio *hpio) +{ + if (!CHECK_INITED) + return; + if (!CHECK(hpio, "hpio is null!\n")) + return; + + hpio_wait(hpio); +} +EXPORT_SYMBOL(hyperhold_io_wait); + +bool hyperhold_io_success(struct hpio *hpio) +{ + if (!CHECK_INITED) + return false; + if (!CHECK(hpio, "hpio is null!\n")) + return false; + + return hpio_get_state(hpio) == HPIO_DONE; +} +EXPORT_SYMBOL(hyperhold_io_success); + +int hyperhold_io_extent(struct hpio *hpio) +{ + if (!CHECK_INITED) + return -EINVAL; + if (!CHECK(hpio, "hpio is null!\n")) + return -EINVAL; + + return hpio->eid; +} +EXPORT_SYMBOL(hyperhold_io_extent); + +int hyperhold_io_operate(struct hpio *hpio) +{ + if (!CHECK_INITED) + return -EINVAL; + if (!CHECK(hpio, "hpio is null!\n")) + return -EINVAL; + + return hpio->op; +} +EXPORT_SYMBOL(hyperhold_io_operate); + +struct page *hyperhold_io_page(struct hpio *hpio, u32 index) +{ + if (!CHECK_INITED) + return NULL; + if (!CHECK(hpio, "hpio is null!\n")) + return NULL; + if (!CHECK_BOUND(index, 0, hpio->nr_page - 1)) + return NULL; + + return hpio->pages[index]; +} +EXPORT_SYMBOL(hyperhold_io_page); + +bool hyperhold_io_add_page(struct hpio *hpio, u32 index, struct page *page) +{ + if (!CHECK_INITED) + return false; + if (!CHECK(hpio, "hpio is null!\n")) + return false; + if (!CHECK(page, "page is null!\n")) + return false; + if (!CHECK_BOUND(index, 0, hpio->nr_page - 1)) + return false; + + get_page(page); + atomic64_add(PAGE_SIZE, &mem_used); + BUG_ON(hpio->pages[index]); + hpio->pages[index] = page; + + return true; +} +EXPORT_SYMBOL(hyperhold_io_add_page); + +u32 hyperhold_io_nr_page(struct hpio *hpio) +{ + if (!CHECK_INITED) + return 0; + if (!CHECK(hpio, "hpio is null!\n")) + return 0; + + return hpio->nr_page; +} +EXPORT_SYMBOL(hyperhold_io_nr_page); + +void *hyperhold_io_private(struct hpio *hpio) +{ + if (!CHECK_INITED) + return 
NULL; + if (!CHECK(hpio, "hpio is null!\n")) + return NULL; + + return hpio->private; +} +EXPORT_SYMBOL(hyperhold_io_private); + +static struct page *get_encrypted_page(struct hp_device *dev, struct page *page, unsigned int op) +{ + struct page *encrypted_page = NULL; + + if (!dev->ctfm) { + encrypted_page = page; + get_page(encrypted_page); + goto out; + } + + encrypted_page = alloc_page(GFP_NOIO); + if (!encrypted_page) { + pr_err("alloc encrypted page failed!\n"); + goto out; + } + encrypted_page->index = page->index; + + /* just alloc a new page for read */ + if (!op_is_write(op)) + goto out; + + /* encrypt page for write */ + if (soft_crypt_page(dev->ctfm, encrypted_page, page, HP_DEV_ENCRYPT)) { + put_page(encrypted_page); + encrypted_page = NULL; + } +out: + return encrypted_page; +} + +static void put_encrypted_pages(struct bio *bio) +{ + struct bio_vec *bv = NULL; + struct bvec_iter_all iter; + + bio_for_each_segment_all(bv, bio, iter) + put_page(bv->bv_page); +} + +static void hp_endio_work(struct work_struct *work) +{ + struct hpio *hpio = container_of(work, struct hpio, endio_work); + struct hp_device *dev = NULL; + struct bio_vec *bv = NULL; + struct bvec_iter_all iter; + struct page *page = NULL; + u32 ext_size; + sector_t sec; + int i; + + if (op_is_write(hpio->op)) + goto endio; + ext_size = space_of(hpio->eid)->ext_size; + dev = device_of(hpio->eid); + sec = hpio->eid * ext_size / dev->sec_size; + i = 0; + bio_for_each_segment_all(bv, hpio->bio, iter) { + page = bv->bv_page; + BUG_ON(i >= hpio->nr_page); + BUG_ON(!hpio->pages[i]); + if (dev->ctfm) + BUG_ON(soft_crypt_page(dev->ctfm, hpio->pages[i], page, HP_DEV_DECRYPT)); + sec += PAGE_SIZE / dev->sec_size; + i++; + } +endio: + put_encrypted_pages(hpio->bio); + bio_put(hpio->bio); + if (hpio->endio) + hpio->endio(hpio); +} + +static void hpio_endio(struct bio *bio) +{ + struct hpio *hpio = bio->bi_private; + struct workqueue_struct *wq = NULL; + + pr_info("hpio %p for eid %u returned %d.\n", + hpio, hpio->eid, bio->bi_status); + hpio_set_state(hpio, bio->bi_status ? HPIO_FAIL : HPIO_DONE); + wq = op_is_write(hpio->op) ? 
hyperhold.write_wq : hyperhold.read_wq; + queue_work(wq, &hpio->endio_work); + atomic64_sub(sizeof(struct bio), &mem_used); +} + +static int hpio_submit(struct hpio *hpio) +{ + struct hp_device *dev = NULL; + struct bio *bio = NULL; + struct page *page = NULL; + u32 ext_size; + sector_t sec; + int i; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 3, 0) + dev = device_of(hpio->eid); + bio = bio_alloc(dev->bdev, BIO_MAX_VECS, + hpio->op, GFP_NOIO); +#else + bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); +#endif + if (!bio) { + pr_err("bio alloc failed!\n"); + return -ENOMEM; + } + atomic64_add(sizeof(struct bio), &mem_used); + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 3, 0) + bio->bi_opf = hpio->op; +#else + dev = device_of(hpio->eid); + bio_set_op_attrs(bio, hpio->op, 0); +#endif + bio_set_dev(bio, dev->bdev); + + ext_size = space_of(hpio->eid)->ext_size; + sec = div_u64((u64)hpio->eid * ext_size, dev->sec_size); + bio->bi_iter.bi_sector = sec; + for (i = 0; i < hpio->nr_page; i++) { + if (!hpio->pages[i]) + break; + hpio->pages[i]->index = sec; + page = get_encrypted_page(dev, hpio->pages[i], hpio->op); + if (!page) + goto err; + if (!bio_add_page(bio, page, PAGE_SIZE, 0)) { + put_page(page); + goto err; + } + sec += PAGE_SIZE / dev->sec_size; + } + + if (dev->blk_key) + inline_crypt_bio(dev->blk_key, bio); + bio->bi_private = hpio; + bio->bi_end_io = hpio_endio; + hpio->bio = bio; + submit_bio(bio); + pr_info("submit hpio %p for eid %u.\n", hpio, hpio->eid); + + return 0; +err: + put_encrypted_pages(bio); + bio_put(bio); + atomic64_sub(sizeof(struct bio), &mem_used); + return -EIO; +} + +static int rw_extent_async(struct hpio *hpio, hp_endio endio, void *priv, unsigned int op) +{ + int ret = 0; + + if (!hpio_change_state(hpio, HPIO_INIT, HPIO_SUBMIT)) + return -EAGAIN; + + hpio->private = priv; + hpio->endio = endio; + INIT_WORK(&hpio->endio_work, hp_endio_work); + + ret = hpio_submit(hpio); + if (ret) { + hpio_set_state(hpio, HPIO_FAIL); + hpio_complete(hpio); + } + + return ret; +} + +int hyperhold_write_async(struct hpio *hpio, hp_endio endio, void *priv) +{ + if (!CHECK_ENABLE) { + hpio_set_state(hpio, HPIO_FAIL); + hpio_complete(hpio); + return -EINVAL; + } + + BUG_ON(!op_is_write(hpio->op)); + + return rw_extent_async(hpio, endio, priv, REQ_OP_WRITE); +} +EXPORT_SYMBOL(hyperhold_write_async); + +int hyperhold_read_async(struct hpio *hpio, hp_endio endio, void *priv) +{ + if (!CHECK_INITED) { + hpio_set_state(hpio, HPIO_FAIL); + hpio_complete(hpio); + return -EINVAL; + } + + if (op_is_write(hpio->op)) + return -EAGAIN; + + return rw_extent_async(hpio, endio, priv, REQ_OP_READ); +} +EXPORT_SYMBOL(hyperhold_read_async); + +module_init(hyperhold_init) +module_exit(hyperhold_exit) diff --git a/drivers/hyperhold/hp_device.c b/drivers/hyperhold/hp_device.c new file mode 100644 index 0000000000000000000000000000000000000000..e0dd9334b266eb4663ecc654a07de856e307a80b --- /dev/null +++ b/drivers/hyperhold/hp_device.c @@ -0,0 +1,240 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * drivers/hyperhold/hp_device.c + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. 
+ */ + +#define pr_fmt(fmt) "[HYPERHOLD]" fmt + +#include +#include +#include +#include + +#include "hp_device.h" + +#define HP_CIPHER_MODE BLK_ENCRYPTION_MODE_AES_256_XTS +#define HP_CIPHER_NAME "xts(aes)" +#define HP_KEY_SIZE (64) +#define HP_IV_SIZE (16) + +union hp_iv { + __le64 index; + __le64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; +}; + +void unbind_bdev(struct hp_device *dev) +{ + int ret; + + if (!dev->bdev) + goto close; + if (!dev->old_block_size) + goto put; + ret = set_blocksize(dev->bdev, dev->old_block_size); + if (ret) + pr_err("set old block size %d failed, err = %d!\n", + dev->old_block_size, ret); + dev->old_block_size = 0; +put: + blkdev_put(dev->bdev, FMODE_READ | FMODE_WRITE); + dev->bdev = NULL; +close: + if (dev->filp) + filp_close(dev->filp, NULL); + dev->filp = NULL; + + pr_info("hyperhold bdev unbinded.\n"); +} + +bool bind_bdev(struct hp_device *dev, const char *name) +{ + struct inode *inode = NULL; + int ret; + + dev->filp = filp_open(name, O_RDWR | O_LARGEFILE, 0); + if (IS_ERR(dev->filp)) { + pr_err("open file %s failed, err = %ld!\n", name, PTR_ERR(dev->filp)); + dev->filp = NULL; + goto err; + } + inode = dev->filp->f_mapping->host; + if (!S_ISBLK(inode->i_mode)) { + pr_err("%s is not a block device!\n", name); + goto err; + } +#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 3, 0) + dev->bdev = blkdev_get_by_dev(inode->i_rdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, dev); +#else + dev->bdev = blkdev_get_by_dev(inode->i_rdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, dev); +#endif + if (IS_ERR(dev->bdev)) { + ret = PTR_ERR(dev->bdev); + dev->bdev = NULL; + pr_err("get blkdev %s failed, err = %d!\n", name, ret); + goto err; + } + dev->old_block_size = block_size(dev->bdev); + ret = set_blocksize(dev->bdev, PAGE_SIZE); + if (ret) { + pr_err("set %s block size failed, err = %d!\n", name, ret); + goto err; + } + dev->dev_size = (u64)i_size_read(inode); + dev->sec_size = SECTOR_SIZE; + + pr_info("hyperhold bind bdev %s of size %llu / %u succ.\n", + name, dev->dev_size, dev->sec_size); + + return true; +err: + unbind_bdev(dev); + + return false; +} + +int soft_crypt_page(struct crypto_skcipher *ctfm, struct page *dst_page, + struct page *src_page, unsigned int op) +{ + struct skcipher_request *req = NULL; + DECLARE_CRYPTO_WAIT(wait); + struct scatterlist dst, src; + int ret = 0; + union hp_iv iv; + + memset(&iv, 0, sizeof(union hp_iv)); + iv.index = cpu_to_le64(src_page->index); + + req = skcipher_request_alloc(ctfm, GFP_NOIO); + if (!req) { + pr_err("alloc skcipher request failed!\n"); + return -ENOMEM; + } + + skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + crypto_req_done, &wait); + sg_init_table(&dst, 1); + sg_set_page(&dst, dst_page, PAGE_SIZE, 0); + sg_init_table(&src, 1); + sg_set_page(&src, src_page, PAGE_SIZE, 0); + skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, &iv); + if (op == HP_DEV_ENCRYPT) + ret = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); + else if (op == HP_DEV_DECRYPT) + ret = crypto_wait_req(crypto_skcipher_decrypt(req), &wait); + else + BUG(); + + skcipher_request_free(req); + + if (ret) + pr_err("%scrypt failed!\n", op == HP_DEV_ENCRYPT ? 
"en" : "de"); + + return ret; +} + +static struct crypto_skcipher *soft_crypto_init(const u8 *key) +{ + char *cipher = HP_CIPHER_NAME; + u32 key_len = HP_KEY_SIZE; + struct crypto_skcipher *ctfm = NULL; + int ret; + + ctfm = crypto_alloc_skcipher(cipher, 0, 0); + if (IS_ERR(ctfm)) { + pr_err("alloc ctfm failed, ret = %ld!\n", PTR_ERR(ctfm)); + ctfm = NULL; + goto err; + } + crypto_skcipher_clear_flags(ctfm, ~0); + crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); + ret = crypto_skcipher_setkey(ctfm, key, key_len); + if (ret) { + pr_err("ctfm setkey failed, ret = %d!\n", ret); + goto err; + } + + return ctfm; +err: + if (ctfm) + crypto_free_skcipher(ctfm); + + return NULL; +} + +#ifdef CONFIG_BLK_INLINE_ENCRYPTION +void inline_crypt_bio(struct blk_crypto_key *blk_key, struct bio *bio) +{ + union hp_iv iv; + + memset(&iv, 0, sizeof(union hp_iv)); + iv.index = cpu_to_le64(bio->bi_iter.bi_sector); + + bio_crypt_set_ctx(bio, blk_key, iv.dun, GFP_NOIO); +} + +static struct blk_crypto_key *inline_crypto_init(const u8 *key) +{ + struct blk_crypto_key *blk_key = NULL; + u32 dun_bytes = HP_IV_SIZE - sizeof(__le64); + int ret; + + blk_key = kzalloc(sizeof(struct blk_crypto_key), GFP_KERNEL); + if (!blk_key) { + pr_err("blk key alloc failed!\n"); + goto err; + } + ret = blk_crypto_init_key(blk_key, key, HP_CIPHER_MODE, dun_bytes, PAGE_SIZE); + if (ret) { + pr_err("blk key init failed, ret = %d!\n", ret); + goto err; + } + + return blk_key; +err: + if (blk_key) + kfree_sensitive(blk_key); + + return NULL; +} +#else +void inline_crypt_bio(struct blk_crypto_key *blk_key, struct bio *bio) {} +static struct blk_crypto_key *inline_crypto_init(const u8 *key) +{ + pr_err("CONFIG_BLK_INLINE_ENCRYPTION is not enabled!\n"); + return NULL; +} +#endif + +bool crypto_init(struct hp_device *dev, bool soft) +{ + u8 key[HP_KEY_SIZE]; + bool ret = false; + + get_random_bytes(key, HP_KEY_SIZE); + if (soft) { + dev->ctfm = soft_crypto_init(key); + ret = dev->ctfm; + } else { + dev->blk_key = inline_crypto_init(key); + ret = dev->blk_key; + if (ret) + pr_warn("soft crypt has been turned off, now apply hard crypt!\n"); + } + memzero_explicit(key, HP_KEY_SIZE); + + return ret; +} + +void crypto_deinit(struct hp_device *dev) +{ + if (dev->ctfm) { + crypto_free_skcipher(dev->ctfm); + dev->ctfm = NULL; + } + if (dev->blk_key) { + kfree_sensitive(dev->blk_key); + dev->blk_key = NULL; + } +} diff --git a/drivers/hyperhold/hp_device.h b/drivers/hyperhold/hp_device.h new file mode 100644 index 0000000000000000000000000000000000000000..06f0078914819f62f6ce2e5254d798bcb3668e32 --- /dev/null +++ b/drivers/hyperhold/hp_device.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * drivers/hyperhold/hp_device.h + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. 
+ */ + +#ifndef _HP_DEVICE_H_ +#define _HP_DEVICE_H_ + +#include +#include +#include + +enum { + HP_DEV_ENCRYPT, + HP_DEV_DECRYPT, +}; + +struct hp_device { + struct file *filp; + struct block_device *bdev; + u32 old_block_size; + u64 dev_size; + u32 sec_size; + + struct crypto_skcipher *ctfm; + struct blk_crypto_key *blk_key; +}; + +void unbind_bdev(struct hp_device *dev); +bool bind_bdev(struct hp_device *dev, const char *name); +bool crypto_init(struct hp_device *dev, bool soft); +void crypto_deinit(struct hp_device *dev); +int soft_crypt_page(struct crypto_skcipher *ctfm, + struct page *dst_page, struct page *src_page, unsigned int op); +void inline_crypt_bio(struct blk_crypto_key *blk_key, struct bio *bio); +#endif diff --git a/drivers/hyperhold/hp_iotab.c b/drivers/hyperhold/hp_iotab.c new file mode 100644 index 0000000000000000000000000000000000000000..258cb83a16c33e273567ba5f40ef90fa3ef60456 --- /dev/null +++ b/drivers/hyperhold/hp_iotab.c @@ -0,0 +1,271 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * drivers/hyperhold/hp_iotab.c + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. + */ + +#define pr_fmt(fmt) "[HYPERHOLD]" fmt + +#include +#include + +#include "hp_iotab.h" + +atomic64_t hpio_mem = ATOMIC64_INIT(0); +u64 hpio_memory(void) +{ + return atomic64_read(&hpio_mem); +} + +struct hp_iotab { + struct list_head io_list; + rwlock_t lock; + u32 io_cnt; + wait_queue_head_t empty_wq; +}; + +/* store all inflight hpio in iotab */ +struct hp_iotab iotab = { + .io_list = LIST_HEAD_INIT(iotab.io_list), + .lock = __RW_LOCK_UNLOCKED(iotab.lock), + .io_cnt = 0, + .empty_wq = __WAIT_QUEUE_HEAD_INITIALIZER(iotab.empty_wq), +}; + +static struct hpio *__iotab_search_get(struct hp_iotab *iotab, u32 eid) +{ + struct hpio *hpio = NULL; + + list_for_each_entry(hpio, &iotab->io_list, list) + if (hpio->eid == eid && kref_get_unless_zero(&hpio->refcnt)) + return hpio; + + return NULL; +} + +static struct hpio *iotab_search_get(struct hp_iotab *iotab, u32 eid) +{ + struct hpio *hpio = NULL; + unsigned long flags; + + read_lock_irqsave(&iotab->lock, flags); + hpio = __iotab_search_get(iotab, eid); + read_unlock_irqrestore(&iotab->lock, flags); + + pr_info("find hpio %p for eid %u.\n", hpio, eid); + + return hpio; +} + +/* + * insert @hpio into @iotab, cancel insertion if there is a hpio of the same + * @eid, inc the refcnt of duplicated hpio and return it + */ +static struct hpio *iotab_insert(struct hp_iotab *iotab, struct hpio *hpio) +{ + struct hpio *dup = NULL; + unsigned long flags; + + write_lock_irqsave(&iotab->lock, flags); + dup = __iotab_search_get(iotab, hpio->eid); + if (dup) { + pr_info("find exist hpio %p for eid %u, insert hpio %p failed.\n", + dup, hpio->eid, hpio); + goto unlock; + } + list_add(&hpio->list, &iotab->io_list); + iotab->io_cnt++; + pr_info("insert new hpio %p for eid %u.\n", hpio, hpio->eid); +unlock: + write_unlock_irqrestore(&iotab->lock, flags); + + return dup; +} + +static void iotab_delete(struct hp_iotab *iotab, struct hpio *hpio) +{ + unsigned long flags; + + write_lock_irqsave(&iotab->lock, flags); + list_del(&hpio->list); + iotab->io_cnt--; + if (!iotab->io_cnt) + wake_up(&iotab->empty_wq); + write_unlock_irqrestore(&iotab->lock, flags); + + pr_info("delete hpio %p for eid %u from iotab.\n", hpio, hpio->eid); +} + +static void hpio_clear_pages(struct hpio *hpio) +{ + int i; + + if (!hpio->pages) + return; + + for (i = 0; i < hpio->nr_page; i++) + if (hpio->pages[i]) { + put_page(hpio->pages[i]); + atomic64_sub(PAGE_SIZE, &hpio_mem); + } + 
kfree(hpio->pages); + atomic64_sub(sizeof(struct page *) * hpio->nr_page, &hpio_mem); + hpio->nr_page = 0; + hpio->pages = NULL; +} + +/* + * alloc pages array for @hpio, fill in new alloced pages if @new_page + */ +static bool hpio_fill_pages(struct hpio *hpio, u32 nr_page, gfp_t gfp, bool new_page) +{ + int i; + + BUG_ON(hpio->pages); + hpio->nr_page = nr_page; + hpio->pages = kcalloc(hpio->nr_page, sizeof(struct page *), gfp); + if (!hpio->pages) + goto err; + atomic64_add(sizeof(struct page *) * hpio->nr_page, &hpio_mem); + + if (!new_page) + goto out; + for (i = 0; i < hpio->nr_page; i++) { + hpio->pages[i] = alloc_page(gfp); + if (!hpio->pages[i]) + goto err; + atomic64_add(PAGE_SIZE, &hpio_mem); + } +out: + return true; +err: + hpio_clear_pages(hpio); + + return false; +} + +void hpio_free(struct hpio *hpio) +{ + if (!hpio) + return; + + pr_info("free hpio = %p.\n", hpio); + + hpio_clear_pages(hpio); + kfree(hpio); + atomic64_sub(sizeof(struct hpio), &hpio_mem); +} + +struct hpio *hpio_alloc(u32 nr_page, gfp_t gfp, unsigned int op, bool new_page) +{ + struct hpio *hpio = NULL; + + hpio = kzalloc(sizeof(struct hpio), gfp); + if (!hpio) + goto err; + atomic64_add(sizeof(struct hpio), &hpio_mem); + if (!hpio_fill_pages(hpio, nr_page, gfp, new_page)) + goto err; + hpio->op = op; + atomic_set(&hpio->state, HPIO_INIT); + kref_init(&hpio->refcnt); + init_completion(&hpio->wait); + + return hpio; +err: + hpio_free(hpio); + + return NULL; +} + +struct hpio *hpio_get(u32 eid) +{ + return iotab_search_get(&iotab, eid); +} + +struct hpio *hpio_get_alloc(u32 eid, u32 nr_page, gfp_t gfp, unsigned int op) +{ + struct hpio *hpio = NULL; + struct hpio *dup = NULL; + + hpio = iotab_search_get(&iotab, eid); + if (hpio) { + pr_info("find exist hpio %p for eid %u.\n", hpio, eid); + goto out; + } + hpio = hpio_alloc(nr_page, gfp, op, true); + if (!hpio) + goto out; + hpio->eid = eid; + + pr_info("alloc hpio %p for eid %u.\n", hpio, eid); + + dup = iotab_insert(&iotab, hpio); + if (dup) { + hpio_free(hpio); + hpio = dup; + } +out: + return hpio; +} + +static void hpio_release(struct kref *kref) +{ + struct hpio *hpio = container_of(kref, struct hpio, refcnt); + + iotab_delete(&iotab, hpio); + if (hpio->free_extent) + hpio->free_extent(hpio->eid); + hpio_free(hpio); +} + +bool hpio_put(struct hpio *hpio) +{ + pr_info("put hpio %p for eid %u, ref = %u.\n", hpio, hpio->eid, kref_read(&hpio->refcnt)); + return kref_put(&hpio->refcnt, hpio_release); +} + +void hpio_complete(struct hpio *hpio) +{ + pr_info("complete hpio %p for eid %u.\n", hpio, hpio->eid); + complete_all(&hpio->wait); +} + +void hpio_wait(struct hpio *hpio) +{ + wait_for_completion(&hpio->wait); +} + +enum hpio_state hpio_get_state(struct hpio *hpio) +{ + return atomic_read(&hpio->state); +} + +void hpio_set_state(struct hpio *hpio, enum hpio_state state) +{ + atomic_set(&hpio->state, state); +} + +bool hpio_change_state(struct hpio *hpio, enum hpio_state from, enum hpio_state to) +{ + return atomic_cmpxchg(&hpio->state, from, to) == from; +} + +static void dump_iotab(struct hp_iotab *iotab) +{ + struct hpio *hpio = NULL; + unsigned long flags; + + pr_info("dump inflight hpio in iotab.\n"); + read_lock_irqsave(&iotab->lock, flags); + list_for_each_entry(hpio, &iotab->io_list, list) + pr_info("hpio %p for eid %u is inflight.\n", hpio, hpio->eid); + read_unlock_irqrestore(&iotab->lock, flags); +} + +void wait_for_iotab_empty(void) +{ + dump_iotab(&iotab); + wait_event(iotab.empty_wq, !iotab.io_cnt); +} diff --git a/drivers/hyperhold/hp_iotab.h 
b/drivers/hyperhold/hp_iotab.h new file mode 100644 index 0000000000000000000000000000000000000000..b3785f7aaad968bcfe62a2b40af652c1b170e520 --- /dev/null +++ b/drivers/hyperhold/hp_iotab.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * drivers/hyperhold/hp_iotab.h + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. + */ + +#ifndef _HP_IOTAB_H_ +#define _HP_IOTAB_H_ + +#include +#include +#include +#include + +enum hpio_state { + HPIO_INIT, + HPIO_SUBMIT, + HPIO_DONE, + HPIO_FAIL, +}; + +struct hpio; + +typedef void (*hp_endio)(struct hpio *); + +struct hpio { + u32 eid; + struct page **pages; + u32 nr_page; + void *private; + + unsigned int op; + void (*free_extent)(u32 eid); + + atomic_t state; + struct kref refcnt; + struct completion wait; + hp_endio endio; + struct work_struct endio_work; + + struct bio *bio; + struct list_head list; +}; + +struct hpio *hpio_alloc(u32 nr_page, gfp_t gfp, unsigned int op, bool new_page); +void hpio_free(struct hpio *hpio); + +struct hpio *hpio_get(u32 eid); +bool hpio_put(struct hpio *hpio); +struct hpio *hpio_get_alloc(u32 eid, u32 nr_page, gfp_t gfp, unsigned int op); + +void hpio_complete(struct hpio *hpio); +void hpio_wait(struct hpio *hpio); + +enum hpio_state hpio_get_state(struct hpio *hpio); +void hpio_set_state(struct hpio *hpio, enum hpio_state state); +bool hpio_change_state(struct hpio *hpio, enum hpio_state from, enum hpio_state to); + +void wait_for_iotab_empty(void); + +u64 hpio_memory(void); +#endif diff --git a/drivers/hyperhold/hp_space.c b/drivers/hyperhold/hp_space.c new file mode 100644 index 0000000000000000000000000000000000000000..cb3d3439c5a601e93f77dd56c3dd5cc146d8bc41 --- /dev/null +++ b/drivers/hyperhold/hp_space.c @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * drivers/hyperhold/hp_space.c + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. 
+ */ + +#define pr_fmt(fmt) "[HYPERHOLD]" fmt + +#include + +#include "hp_space.h" + +atomic64_t spc_mem = ATOMIC64_INIT(0); + +u64 space_memory(void) +{ + return atomic64_read(&spc_mem); +} + +void deinit_space(struct hp_space *spc) +{ + kvfree(spc->bitmap); + atomic64_sub(BITS_TO_LONGS(spc->nr_ext) * sizeof(long), &spc_mem); + spc->ext_size = 0; + spc->nr_ext = 0; + atomic_set(&spc->last_alloc_bit, 0); + atomic_set(&spc->nr_alloced, 0); + + pr_info("hyperhold space deinited.\n"); +} + +bool init_space(struct hp_space *spc, u64 dev_size, u32 ext_size) +{ + if (ext_size & (PAGE_SIZE - 1)) { + pr_err("extent size %u do not align to page size %lu!", ext_size, PAGE_SIZE); + return false; + } + if (dev_size & (ext_size - 1)) { + pr_err("device size %llu do not align to extent size %u!", dev_size, ext_size); + return false; + } + spc->ext_size = ext_size; + spc->nr_ext = div_u64(dev_size, ext_size); + atomic_set(&spc->last_alloc_bit, 0); + atomic_set(&spc->nr_alloced, 0); + init_waitqueue_head(&spc->empty_wq); + spc->bitmap = kvzalloc(BITS_TO_LONGS(spc->nr_ext) * sizeof(long), GFP_KERNEL); + if (!spc->bitmap) { + pr_err("hyperhold bitmap alloc failed.\n"); + return false; + } + atomic64_add(BITS_TO_LONGS(spc->nr_ext) * sizeof(long), &spc_mem); + + pr_info("hyperhold space init succ, capacity = %u x %u.\n", ext_size, spc->nr_ext); + + return true; +} + +int alloc_eid(struct hp_space *spc) +{ + u32 bit; + u32 last_bit; + +retry: + last_bit = atomic_read(&spc->last_alloc_bit); + bit = find_next_zero_bit(spc->bitmap, spc->nr_ext, last_bit); + if (bit == spc->nr_ext) + bit = find_next_zero_bit(spc->bitmap, spc->nr_ext, 0); + if (bit == spc->nr_ext) + goto full; + if (test_and_set_bit(bit, spc->bitmap)) + goto retry; + + atomic_set(&spc->last_alloc_bit, bit); + atomic_inc(&spc->nr_alloced); + + pr_info("hyperhold alloc extent %u.\n", bit); + + return bit; +full: + pr_err("hyperhold space is full.\n"); + + return -ENOSPC; +} + +void free_eid(struct hp_space *spc, u32 eid) +{ + if (!test_and_clear_bit(eid, spc->bitmap)) { + pr_err("eid is not alloced!\n"); + BUG(); + return; + } + if (atomic_dec_and_test(&spc->nr_alloced)) { + pr_info("notify space empty.\n"); + wake_up(&spc->empty_wq); + } + pr_info("hyperhold free extent %u.\n", eid); +} + +static void dump_space(struct hp_space *spc) +{ + u32 i = 0; + + pr_info("dump alloced extent in space.\n"); + for (i = 0; i < spc->nr_ext; i++) + if (test_bit(i, spc->bitmap)) + pr_info("alloced eid %u.\n", i); +} + +bool wait_for_space_empty(struct hp_space *spc, bool force) +{ + if (!atomic_read(&spc->nr_alloced)) + return true; + if (!force) + return false; + + dump_space(spc); + wait_event(spc->empty_wq, !atomic_read(&spc->nr_alloced)); + + return true; +} diff --git a/drivers/hyperhold/hp_space.h b/drivers/hyperhold/hp_space.h new file mode 100644 index 0000000000000000000000000000000000000000..caaaf92a07f795a5a72423dcee26c8204a39873e --- /dev/null +++ b/drivers/hyperhold/hp_space.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * drivers/hyperhold/hp_space.h + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. 
+ */ + +#ifndef _HP_SPACE_H_ +#define _HP_SPACE_H_ + +#include + +struct hp_space { + u32 ext_size; + u32 nr_ext; + unsigned long *bitmap; + atomic_t last_alloc_bit; + atomic_t nr_alloced; + wait_queue_head_t empty_wq; +}; + +void deinit_space(struct hp_space *spc); +bool init_space(struct hp_space *spc, u64 dev_size, u32 ext_size); +int alloc_eid(struct hp_space *spc); +void free_eid(struct hp_space *spc, u32 eid); + +bool wait_for_space_empty(struct hp_space *spc, bool force); + +u64 space_memory(void); +#endif diff --git a/drivers/hyperhold/hyperhold.h b/drivers/hyperhold/hyperhold.h new file mode 100644 index 0000000000000000000000000000000000000000..b65ff54445136679593e0b5c60be215c12f5ff88 --- /dev/null +++ b/drivers/hyperhold/hyperhold.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * drivers/hyperhold/hyperhold.h + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. + */ + +#ifndef _HYPERHOLD_H_ +#define _HYPERHOLD_H_ + +#include + +struct hpio; + +typedef void (*hp_endio)(struct hpio *); + +void hyperhold_disable(bool force); +void hyperhold_enable(void); +bool is_hyperhold_enable(void); + +u32 hyperhold_nr_extent(void); +u32 hyperhold_extent_size(u32 eid); +long hyperhold_address(u32 eid, u32 offset); +int hyperhold_addr_extent(u64 addr); +int hyperhold_addr_offset(u64 addr); + +int hyperhold_alloc_extent(void); +void hyperhold_free_extent(u32 eid); +void hyperhold_should_free_extent(u32 eid); + +struct hpio *hyperhold_io_alloc(u32 eid, gfp_t gfp, unsigned int op, bool new_page); +void hyperhold_io_free(struct hpio *hpio); + +struct hpio *hyperhold_io_get(u32 eid, gfp_t gfp, unsigned int op); +bool hyperhold_io_put(struct hpio *hpio); + +void hyperhold_io_complete(struct hpio *hpio); +void hyperhold_io_wait(struct hpio *hpio); + +bool hyperhold_io_success(struct hpio *hpio); + +int hyperhold_io_extent(struct hpio *hpio); +int hyperhold_io_operate(struct hpio *hpio); +struct page *hyperhold_io_page(struct hpio *hpio, u32 index); +bool hyperhold_io_add_page(struct hpio *hpio, u32 index, struct page *page); +u32 hyperhold_io_nr_page(struct hpio *hpio); +void *hyperhold_io_private(struct hpio *hpio); + +int hyperhold_write_async(struct hpio *hpio, hp_endio endio, void *priv); +int hyperhold_read_async(struct hpio *hpio, hp_endio endio, void *priv); + +#endif diff --git a/fs/epfs/inode.c b/fs/epfs/inode.c index 2e3e9c62cacdc68395846e10ff2dd6b1d05229e0..3714cf71fd62caf20f055282de9927b625ea28c8 100644 --- a/fs/epfs/inode.c +++ b/fs/epfs/inode.c @@ -41,7 +41,7 @@ static int epfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) d_tmpfile(file, inode); if (IS_ENABLED(CONFIG_EPFS_DEBUG)) epfs_debug("epfs: tmpfile %p", inode); - return 0; + return finish_open_simple(file, 0);; } const struct inode_operations epfs_dir_iops = { diff --git a/fs/epfs/super.c b/fs/epfs/super.c index 4d708f855d1fb14185ce242f1d2a54e7326687e7..7368af775c8d77fc091a157d3c4df19bd22de2ce 100644 --- a/fs/epfs/super.c +++ b/fs/epfs/super.c @@ -79,7 +79,7 @@ static int epfs_statfs(struct dentry *dentry, struct kstatfs *buf) struct super_operations epfs_sops = { .alloc_inode = epfs_alloc_inode, .destroy_inode = epfs_destroy_inode, -#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 2, 0) +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 2, 0) .free_inode = epfs_free_inode, #endif .evict_inode = epfs_evict_inode, diff --git a/fs/hmdfs/comm/connection.c b/fs/hmdfs/comm/connection.c index aec9cabf69311e20792ee517459efb708fe2b302..44a4cb93354fcd89fd223b2ced9820ab1964cc7b 100755 --- 
a/fs/hmdfs/comm/connection.c +++ b/fs/hmdfs/comm/connection.c @@ -640,11 +640,6 @@ void connection_handshake_recv_handler(struct connection *conn_impl, void *buf, hmdfs_info( "Recved handshake response: device_id = %llu, cmd->status = %hhu, tcp->fd = %d", conn_impl->node->device_id, status, fd); - if (status == CONNECT_STAT_WAIT_REQUEST) { - // must be 10.1 device, no need to set ktls - connection_to_working(conn_impl->node); - goto out; - } ret = hs_proc_msg_data(conn_impl, ops, data, data_len); if (ret) @@ -1241,21 +1236,24 @@ void head_put(struct hmdfs_msg_idr_head *head) kref_put_lock(&head->ref, head_release, &head->peer->idr_lock); } -struct hmdfs_msg_idr_head *hmdfs_find_msg_head(struct hmdfs_peer *peer, int id) +struct hmdfs_msg_idr_head *hmdfs_find_msg_head(struct hmdfs_peer *peer, + int id, struct hmdfs_cmd operations) { struct hmdfs_msg_idr_head *head = NULL; spin_lock(&peer->idr_lock); head = idr_find(&peer->msg_idr, id); - if (head) + if (head && head->send_cmd_operations.command == operations.command) kref_get(&head->ref); + else + head = NULL; spin_unlock(&peer->idr_lock); return head; } int hmdfs_alloc_msg_idr(struct hmdfs_peer *peer, enum MSG_IDR_TYPE type, - void *ptr) + void *ptr, struct hmdfs_cmd operations) { int ret = -EAGAIN; struct hmdfs_msg_idr_head *head = ptr; @@ -1270,6 +1268,7 @@ int hmdfs_alloc_msg_idr(struct hmdfs_peer *peer, enum MSG_IDR_TYPE type, head->msg_id = ret; head->type = type; head->peer = peer; + head->send_cmd_operations = operations; peer->msg_idr_process++; ret = 0; } diff --git a/fs/hmdfs/comm/connection.h b/fs/hmdfs/comm/connection.h index 8178590d4e3d1aefe54423bfe406c03214d15763..1988e99f78089aba55f67925786bce73e4b749f1 100755 --- a/fs/hmdfs/comm/connection.h +++ b/fs/hmdfs/comm/connection.h @@ -331,8 +331,9 @@ void hmdfs_disconnect_node(struct hmdfs_peer *node); void connection_to_working(struct hmdfs_peer *node); int hmdfs_alloc_msg_idr(struct hmdfs_peer *peer, enum MSG_IDR_TYPE type, - void *ptr); -struct hmdfs_msg_idr_head *hmdfs_find_msg_head(struct hmdfs_peer *peer, int id); + void *ptr, struct hmdfs_cmd operations); +struct hmdfs_msg_idr_head *hmdfs_find_msg_head(struct hmdfs_peer *peer, int id, + struct hmdfs_cmd operations); static inline void hmdfs_start_process_offline(struct hmdfs_peer *peer) { diff --git a/fs/hmdfs/comm/device_node.c b/fs/hmdfs/comm/device_node.c index 2a5a54fb1c58f01ec531bc8640c64560d634880d..ed568e0c1ee5773b8d113afdf4f96adcd0228445 100755 --- a/fs/hmdfs/comm/device_node.c +++ b/fs/hmdfs/comm/device_node.c @@ -201,20 +201,21 @@ static ssize_t sbi_status_show(struct kobject *kobj, struct sbi_attribute *attr, struct tcp_handle *tcp = NULL; sbi = to_sbi(kobj); - size += sprintf(buf + size, "peers status\n"); + size += snprintf(buf + size, PAGE_SIZE - size, "peers status\n"); mutex_lock(&sbi->connections.node_lock); list_for_each_entry(peer, &sbi->connections.node_list, list) { - size += sprintf(buf + size, "%s %d\n", peer->cid, - peer->status); + size += snprintf(buf + size, PAGE_SIZE - size, "%s %d\n", + peer->cid, peer->status); // connection information - size += sprintf( - buf + size, + size += snprintf( + buf + size, PAGE_SIZE - size, "\t socket_fd connection_status tcp_status ... 
refcnt\n"); mutex_lock(&peer->conn_impl_list_lock); list_for_each_entry(conn_impl, &peer->conn_impl_list, list) { tcp = conn_impl->connect_handle; - size += sprintf(buf + size, "\t %d \t%d \t%d \t%p \t%ld\n", + size += snprintf(buf + size, PAGE_SIZE - size, + "\t %d \t%d \t%d \t%p \t%ld\n", tcp->fd, conn_impl->status, tcp->sock->state, tcp->sock, file_count(tcp->sock->file)); } @@ -250,12 +251,13 @@ static ssize_t sbi_stat_show(struct kobject *kobj, struct sbi_attribute *attr, mutex_lock(&peer->conn_impl_list_lock); list_for_each_entry(conn_impl, &peer->conn_impl_list, list) { tcp = conn_impl->connect_handle; - size += sprintf(buf + size, "socket_fd: %d\n", tcp->fd); - size += sprintf(buf + size, + size += snprintf(buf + size, PAGE_SIZE - size, + "socket_fd: %d\n", tcp->fd); + size += snprintf(buf + size, PAGE_SIZE - size, "\tsend_msg %d \tsend_bytes %llu\n", conn_impl->stat.send_message_count, conn_impl->stat.send_bytes); - size += sprintf(buf + size, + size += snprintf(buf + size, PAGE_SIZE - size, "\trecv_msg %d \trecv_bytes %llu\n", conn_impl->stat.recv_message_count, conn_impl->stat.recv_bytes); diff --git a/fs/hmdfs/comm/message_verify.c b/fs/hmdfs/comm/message_verify.c index 2ef046016eada7b0ed797352b3678efb586fc607..4c593390778c07917e9793e3f0d786efa2041566 100755 --- a/fs/hmdfs/comm/message_verify.c +++ b/fs/hmdfs/comm/message_verify.c @@ -271,50 +271,6 @@ void hmdfs_message_verify_init(void) MESSAGE_LEN_JUDGE_RANGE; } -static void find_first_no_slash(const char **name, int *len) -{ - const char *s = *name; - int l = *len; - - while (*s == '/' && l > 0) { - s++; - l--; - } - - *name = s; - *len = l; -} - -static void find_first_slash(const char **name, int *len) -{ - const char *s = *name; - int l = *len; - - while (*s != '/' && l > 0) { - s++; - l--; - } - - *name = s; - *len = l; -} - -static bool path_contain_dotdot(const char *name, int len) -{ - while (true) { - find_first_no_slash(&name, &len); - - if (len == 0) - return false; - - if (len >= 2 && name[0] == '.' && name[1] == '.' && - (len == 2 || name[2] == '/')) - return true; - - find_first_slash(&name, &len); - } -} - static int is_str_msg_valid(char *msg, int str_len[], size_t str_num) { int i = 0; @@ -345,15 +301,6 @@ static int verify_open_req(size_t msg_len, void *msg) if (is_str_msg_valid(req->buf, str_len, sizeof(str_len) / sizeof(int))) return -EINVAL; - /* - * We only allow server to open file in hmdfs, thus we need to - * make sure path don't contain "..". 
- */ - if (path_contain_dotdot(req->buf, req->path_len)) { - hmdfs_err("verify fail, path contain dotdot"); - return -EINVAL; - } - return 0; } @@ -708,6 +655,9 @@ static int verify_getxattr_resp(size_t msg_len, void *msg) { struct getxattr_response *resp = msg; + if (resp->size != sizeof(*resp->value)) + return -EINVAL; + if (msg_len < sizeof(*resp)) return -EINVAL; @@ -786,6 +736,9 @@ static int verify_listxattr_resp(size_t msg_len, void *msg) { struct listxattr_response *resp = msg; + if (resp->size != sizeof(*resp->list)) + return -EINVAL; + if (msg_len < sizeof(*resp)) return -EINVAL; diff --git a/fs/hmdfs/comm/protocol.h b/fs/hmdfs/comm/protocol.h index e140963989de9a4ff9da15c3dbc3d387413ae5fb..beaa5adf4ba13782fc2c86afa4349d51c222f9a7 100755 --- a/fs/hmdfs/comm/protocol.h +++ b/fs/hmdfs/comm/protocol.h @@ -60,6 +60,7 @@ enum MSG_IDR_TYPE { struct hmdfs_msg_idr_head { __u32 type; __u32 msg_id; + struct hmdfs_cmd send_cmd_operations; struct kref ref; struct hmdfs_peer *peer; }; diff --git a/fs/hmdfs/comm/socket_adapter.c b/fs/hmdfs/comm/socket_adapter.c index e6b340b4ee72ac35592f8a67e323696218b8b312..b9f35b9e1626b8bc9940d60c190daf3aaaebccfd 100755 --- a/fs/hmdfs/comm/socket_adapter.c +++ b/fs/hmdfs/comm/socket_adapter.c @@ -96,12 +96,13 @@ static void recv_info_init(struct file_recv_info *recv_info) atomic_set(&recv_info->state, FILE_RECV_PROCESS); } -static int msg_init(struct hmdfs_peer *con, struct sendmsg_wait_queue *msg_wq) +static int msg_init(struct hmdfs_peer *con, struct sendmsg_wait_queue *msg_wq, + struct hmdfs_cmd operations) { int ret = 0; struct file_recv_info *recv_info = &msg_wq->recv_info; - ret = hmdfs_alloc_msg_idr(con, MSG_IDR_MESSAGE_SYNC, msg_wq); + ret = hmdfs_alloc_msg_idr(con, MSG_IDR_MESSAGE_SYNC, msg_wq, operations); if (unlikely(ret)) return ret; @@ -279,7 +280,8 @@ static struct hmdfs_msg_parasite *mp_alloc(struct hmdfs_peer *peer, if (unlikely(!mp)) return ERR_PTR(-ENOMEM); - ret = hmdfs_alloc_msg_idr(peer, MSG_IDR_MESSAGE_ASYNC, mp); + ret = hmdfs_alloc_msg_idr(peer, MSG_IDR_MESSAGE_ASYNC, mp, + req->operations); if (unlikely(ret)) { kfree(mp); return ERR_PTR(ret); @@ -437,7 +439,7 @@ int hmdfs_sendmessage_request(struct hmdfs_peer *con, ret = -ENOMEM; goto free_filp; } - ret = msg_init(con, msg_wq); + ret = msg_init(con, msg_wq, sm->operations); if (ret) { kfree(msg_wq); msg_wq = NULL; @@ -674,7 +676,7 @@ int hmdfs_sendpage_request(struct hmdfs_peer *con, goto unlock; } async_work->start = start; - ret = hmdfs_alloc_msg_idr(con, MSG_IDR_PAGE, async_work); + ret = hmdfs_alloc_msg_idr(con, MSG_IDR_PAGE, async_work, sm->operations); if (ret) { hmdfs_err("alloc msg_id failed, err %d", ret); goto unlock; @@ -878,6 +880,11 @@ static int hmdfs_readfile_slice(struct sendmsg_wait_queue *msg_info, loff_t offset; ssize_t written_size; + if (filp == NULL) { + hmdfs_warning("recv_info filp is NULL \n"); + return -EINVAL; + } + if (atomic_read(&recv_info->state) != FILE_RECV_PROCESS) return -EBUSY; @@ -911,7 +918,7 @@ static void hmdfs_file_response_work_fn(struct work_struct *ptr) hmdfs_override_creds(desp->peer->sbi->cred); msg_info = (struct sendmsg_wait_queue *)hmdfs_find_msg_head(desp->peer, - le32_to_cpu(desp->head->msg_id)); + le32_to_cpu(desp->head->msg_id), desp->head->operations); if (!msg_info || atomic_read(&msg_info->valid) != MSG_Q_SEND) { hmdfs_client_resp_statis(desp->peer->sbi, cmd, HMDFS_RESP_DELAY, 0, 0); @@ -963,7 +970,7 @@ int hmdfs_response_handle_sync(struct hmdfs_peer *con, bool woke = false; u8 cmd = head->operations.command; - msg_head = 
hmdfs_find_msg_head(con, msg_id); + msg_head = hmdfs_find_msg_head(con, msg_id, head->operations); if (!msg_head) goto out; diff --git a/fs/hmdfs/comm/transport.c b/fs/hmdfs/comm/transport.c index 3c616a0a04b8363c541944df9b3884670df147ed..fdd7fd98fd2ca65e1c449dede2f8c8b3d72f1ddc 100755 --- a/fs/hmdfs/comm/transport.c +++ b/fs/hmdfs/comm/transport.c @@ -235,7 +235,7 @@ static int tcp_recvpage_tls(struct connection *connect, node->device_id, rd_err); async_work = (struct hmdfs_async_work *)hmdfs_find_msg_head(node, - le32_to_cpu(recv->msg_id)); + le32_to_cpu(recv->msg_id), recv->operations); if (!async_work || !cancel_delayed_work(&async_work->d_work)) goto out; @@ -896,12 +896,15 @@ static bool is_tcp_socket(struct tcp_handle *tcp) return false; } - if (tcp->sock->sk->sk_protocol != IPPROTO_TCP) { + lock_sock(tcp->sock->sk); + if (tcp->sock->sk->sk_protocol != IPPROTO_TCP || + tcp->sock->type != SOCK_STREAM || + tcp->sock->sk->sk_family != AF_INET) { hmdfs_err("invalid socket protocol"); + release_sock(tcp->sock->sk); return false; } - lock_sock(tcp->sock->sk); icsk = inet_csk(tcp->sock->sk); if (icsk->icsk_ulp_ops) { hmdfs_err("ulp not NULL"); diff --git a/fs/hmdfs/dentry.c b/fs/hmdfs/dentry.c index d12ef45f3071152c31a193587c9d722a02469cfe..040d698e17850bcb70ce65c816441ae6ae0f584f 100644 --- a/fs/hmdfs/dentry.c +++ b/fs/hmdfs/dentry.c @@ -289,6 +289,8 @@ static int d_revalidate_merge(struct dentry *direntry, unsigned int flags) struct hmdfs_dentry_comrade *comrade = NULL; struct dentry *parent_dentry = NULL; struct dentry *lower_cur_parent_dentry = NULL; + struct inode *dinode = NULL; + struct hmdfs_inode_info *info = NULL; int ret = 1; if (flags & LOOKUP_RCU) { @@ -299,6 +301,14 @@ static int d_revalidate_merge(struct dentry *direntry, unsigned int flags) return 0; } + dinode = d_inode(direntry); + if (!dinode) + return 0; + + info = hmdfs_i(dinode); + if (info->inode_type == HMDFS_LAYER_FIRST_MERGE_CLOUD) + return 1; + parent_dentry = dget_parent(direntry); mutex_lock(&dim->comrade_list_lock); list_for_each_entry(comrade, &(dim->comrade_list), list) { diff --git a/fs/hmdfs/file_cloud.c b/fs/hmdfs/file_cloud.c index c3bb8b561d1f478ba00abb22198ed31c90c7fd5d..088d89929e52902db0f39a607a9e2f0a8054aca0 100755 --- a/fs/hmdfs/file_cloud.c +++ b/fs/hmdfs/file_cloud.c @@ -31,33 +31,6 @@ static const struct vm_operations_struct hmdfs_cloud_vm_ops = { .page_mkwrite = NULL, }; -struct cloud_readpages_work { - struct file *filp; - loff_t pos; - int cnt; - struct work_struct work; - struct page *pages[0]; -}; - -static ssize_t hmdfs_file_read_iter_cloud(struct kiocb *iocb, - struct iov_iter *iter) -{ - ssize_t ret = -ENOENT; - struct file *filp = iocb->ki_filp; - struct hmdfs_file_info *gfi = filp->private_data; - struct file *lower_file = NULL; - - if (gfi) - lower_file = gfi->lower_file; - - if (lower_file) { - kiocb_clone(iocb, iocb, lower_file); - ret = vfs_iter_read(lower_file, iter, &iocb->ki_pos, 0); - } - - return ret; -} - int hmdfs_file_open_cloud(struct inode *inode, struct file *file) { const char *dir_path; @@ -91,7 +64,7 @@ int hmdfs_file_open_cloud(struct inode *inode, struct file *file) } lower_file = file_open_root(&root_path, dir_path, - file->f_flags, file->f_mode); + file->f_flags | O_DIRECT, file->f_mode); path_put(&root_path); if (IS_ERR(lower_file)) { hmdfs_info("file_open_root failed: %ld", PTR_ERR(lower_file)); @@ -158,69 +131,63 @@ int hmdfs_file_mmap_cloud(struct file *file, struct vm_area_struct *vma) return ret; } -static void cloud_readpages_work_func(struct work_struct 
*work) +static int hmdfs_do_readpages_cloud(struct file *filp, int cnt, + struct page **vec) { - void *pages_buf; + struct hmdfs_file_info *gfi = filp->private_data; + struct file *lower_filp; + loff_t pos = (loff_t)(vec[0]->index) << HMDFS_PAGE_OFFSET; + void *pages_buf = NULL; int idx, ret; - ssize_t read_len; - struct cloud_readpages_work *cr_work; - cr_work = container_of(work, struct cloud_readpages_work, work); + if (gfi) { + lower_filp = gfi->lower_file; + } + else { + ret = -EINVAL; + goto out_err; + } - read_len = cr_work->cnt * HMDFS_PAGE_SIZE; - pages_buf = vmap(cr_work->pages, cr_work->cnt, VM_MAP, PAGE_KERNEL); - if (!pages_buf) - goto out; + pages_buf = vmap(vec, cnt, VM_MAP, PAGE_KERNEL); + if (!pages_buf) { + ret = -ENOMEM; + goto out_err; + } - ret = kernel_read(cr_work->filp, pages_buf, read_len, &cr_work->pos); - if (ret < 0) - goto out_vunmap; + trace_hmdfs_do_readpages_cloud_begin(cnt, pos); + ret = kernel_read(lower_filp, pages_buf, cnt * HMDFS_PAGE_SIZE, &pos); + trace_hmdfs_do_readpages_cloud_end(cnt, pos, ret); - if (ret != read_len) - memset(pages_buf + ret, 0, read_len - ret); + if (ret >= 0) + memset(pages_buf + ret, 0, cnt * HMDFS_PAGE_SIZE - ret); + else + goto out_err; -out_vunmap: vunmap(pages_buf); -out: - for (idx = 0; idx < cr_work->cnt; ++idx) { - SetPageUptodate(cr_work->pages[idx]); - unlock_page(cr_work->pages[idx]); + for (idx = 0; idx < cnt; ++idx) { + SetPageUptodate(vec[idx]); + unlock_page(vec[idx]); } - kfree(cr_work); -} - -static int prepare_cloud_readpage_work(struct file *filp, int cnt, - struct page **vec) -{ - struct cloud_readpages_work *cr_work; - struct hmdfs_file_info *gfi = filp->private_data; - - cr_work = kzalloc(sizeof(*cr_work) + - sizeof(cr_work->pages[0]) * cnt, - GFP_KERNEL); - if (!cr_work) { - hmdfs_warning("cannot alloc work"); - return -ENOMEM; + goto out; + +out_err: + if (pages_buf) + vunmap(pages_buf); + for (idx = 0; idx < cnt; ++idx) { + folio_clear_uptodate((struct folio *)vec[idx]); + filemap_remove_folio((struct folio *)vec[idx]); + unlock_page(vec[idx]); + put_page(vec[idx]); } - - if (gfi) - cr_work->filp = gfi->lower_file; - else - cr_work->filp = filp; - cr_work->pos = (loff_t)(vec[0]->index) << HMDFS_PAGE_OFFSET; - cr_work->cnt = cnt; - memcpy(cr_work->pages, vec, cnt * sizeof(*vec)); - - INIT_WORK(&cr_work->work, cloud_readpages_work_func); - schedule_work(&cr_work->work); - return 0; +out: + return ret; } -static int hmdfs_readpages_cloud(struct file *filp, - struct address_space *mapping, - struct list_head *pages, - unsigned int nr_pages) +static void hmdfs_readahead(struct readahead_control *ractl) { + struct file *filp = ractl->file; + struct address_space *mapping = ractl->mapping; + unsigned int nr_pages = readahead_count(ractl); struct hmdfs_sb_info *sbi = hmdfs_sb(file_inode(filp)->i_sb); unsigned int ret = 0, idx, cnt, limit; unsigned long next_index; @@ -231,42 +198,79 @@ static int hmdfs_readpages_cloud(struct file *filp, vec = kmalloc(limit * sizeof(*vec), GFP_KERNEL); if (!vec) { hmdfs_warning("cannot alloc vec (%u pages)", limit); - return -ENOMEM; + return; } cnt = 0; next_index = 0; for (idx = 0; idx < nr_pages; ++idx) { - struct page *page = lru_to_page(pages); + struct page *page = readahead_page(ractl); - list_del(&page->lru); - if (add_to_page_cache_lru(page, mapping, page->index, gfp)) - goto next_page; + if (add_to_page_cache_lru(page, mapping, page->index, gfp)) { + unlock_page(page); + put_page(page); + continue; + } if (cnt && (cnt >= limit || page->index != next_index)) { - ret = 
prepare_cloud_readpage_work(filp, cnt, vec); + ret = hmdfs_do_readpages_cloud(filp, cnt, vec); cnt = 0; if (ret) break; } next_index = page->index + 1; vec[cnt++] = page; -next_page: - put_page(page); } if (cnt) - ret = prepare_cloud_readpage_work(filp, cnt, vec); + ret = hmdfs_do_readpages_cloud(filp, cnt, vec); kfree(vec); trace_hmdfs_readpages_cloud(nr_pages, ret); + return; +} + +static int hmdfs_readpage(struct file *file, struct page *page) +{ + loff_t offset = page_file_offset(page); + int ret = -EACCES; + char *page_buf; + struct hmdfs_file_info *gfi = file->private_data; + struct file *lower_file; + + if (gfi) + lower_file = gfi->lower_file; + else + goto out; + + page_buf = kmap(page); + if (!page_buf) + goto out; + ret = kernel_read(lower_file, page_buf, PAGE_SIZE, &offset); + + if (ret >= 0 && ret <= PAGE_SIZE) { + memset(page_buf + ret, 0, PAGE_SIZE - ret); + ret = 0; + } + + kunmap(page); + if (ret == 0) + SetPageUptodate(page); +out: + unlock_page(page); return ret; } +static int hmdfs_read_folio(struct file *file, struct folio *folio) +{ + struct page *page = &folio->page; + return hmdfs_readpage(file, page); +} + const struct file_operations hmdfs_dev_file_fops_cloud = { .owner = THIS_MODULE, .llseek = generic_file_llseek, - .read_iter = hmdfs_file_read_iter_cloud, + .read_iter = generic_file_read_iter, .write_iter = NULL, .mmap = hmdfs_file_mmap_cloud, .open = hmdfs_file_open_cloud, @@ -279,7 +283,8 @@ const struct file_operations hmdfs_dev_file_fops_cloud = { const struct address_space_operations hmdfs_dev_file_aops_cloud = { - .read_folio = NULL, + .read_folio = hmdfs_read_folio, + .readahead = hmdfs_readahead, .write_begin = NULL, .write_end = NULL, .writepage = NULL, @@ -287,7 +292,8 @@ const struct address_space_operations hmdfs_dev_file_aops_cloud = { }; const struct address_space_operations hmdfs_aops_cloud = { - .read_folio = NULL, + .read_folio = hmdfs_read_folio, + .readahead = hmdfs_readahead, }; int analysis_dentry_file_from_cloud(struct hmdfs_sb_info *sbi, @@ -412,7 +418,7 @@ static int hmdfs_dir_release_cloud(struct inode *inode, struct file *file) const struct file_operations hmdfs_dev_dir_ops_cloud = { .owner = THIS_MODULE, - .iterate = hmdfs_iterate_cloud, + .iterate_shared = hmdfs_iterate_cloud, .open = hmdfs_dir_open_cloud, .release = hmdfs_dir_release_cloud, .fsync = __generic_file_fsync, diff --git a/fs/hmdfs/file_local.c b/fs/hmdfs/file_local.c index c9aaaaa9ebc9d06916015750e577a8fceab5513b..e15f31484b2861ed973c522fb46bb5d93cb66981 100755 --- a/fs/hmdfs/file_local.c +++ b/fs/hmdfs/file_local.c @@ -36,7 +36,12 @@ int hmdfs_file_open_local(struct inode *inode, struct file *file) } hmdfs_get_lower_path(file->f_path.dentry, &lower_path); - lower_file = dentry_open(&lower_path, file->f_flags, cred); + if (inode->i_mapping != NULL && + inode->i_mapping->a_ops == &hmdfs_aops_cloud) + lower_file = dentry_open(&lower_path, file->f_flags | O_DIRECT, + cred); + else + lower_file = dentry_open(&lower_path, file->f_flags, cred); hmdfs_put_lower_path(&lower_path); if (IS_ERR(lower_file)) { err = PTR_ERR(lower_file); @@ -44,6 +49,7 @@ int hmdfs_file_open_local(struct inode *inode, struct file *file) } else { gfi->lower_file = lower_file; file->private_data = gfi; + hmdfs_update_upper_file(file, lower_file); if (file->f_flags & (O_RDWR | O_WRONLY)) atomic_inc(&info->write_opened); } @@ -86,11 +92,13 @@ ssize_t hmdfs_do_read_iter(struct file *file, struct iov_iter *iter, if (!iov_iter_count(iter)) return 0; - if (file->f_inode->i_mapping->a_ops == &hmdfs_aops_cloud)
{ + if (file->f_inode->i_mapping != NULL && + file->f_inode->i_mapping->a_ops == &hmdfs_aops_cloud) { iocb = container_of(ppos, struct kiocb, ki_pos); ret = generic_file_read_iter(iocb, iter); - } else + } else { ret = vfs_iter_read(lower_file, iter, ppos, 0); + } hmdfs_file_accessed(file); return ret; @@ -287,7 +295,7 @@ static int hmdfs_dir_release_local(struct inode *inode, struct file *file) const struct file_operations hmdfs_dir_ops_local = { .owner = THIS_MODULE, - .iterate = hmdfs_iterate_local, + .iterate_shared = hmdfs_iterate_local, .open = hmdfs_dir_open_local, .release = hmdfs_dir_release_local, .fsync = hmdfs_fsync_local, @@ -388,7 +396,7 @@ static long hmdfs_dir_ioctl_local(struct file *file, unsigned int cmd, const struct file_operations hmdfs_dir_ops_share = { .owner = THIS_MODULE, - .iterate = hmdfs_iterate_local, + .iterate_shared = hmdfs_iterate_local, .open = hmdfs_dir_open_local, .release = hmdfs_dir_release_local, .fsync = hmdfs_fsync_local, diff --git a/fs/hmdfs/file_merge.c b/fs/hmdfs/file_merge.c index d31fc93d566b07fc78868d96d759e7200d17ab15..a247e9934fdda0902dba34350354771aed03f989 100755 --- a/fs/hmdfs/file_merge.c +++ b/fs/hmdfs/file_merge.c @@ -19,12 +19,12 @@ struct hmdfs_iterate_callback_merge { /* * Record the return value of 'caller->actor': * - * -EINVAL, buffer is exhausted - * -EINTR, current task is pending - * -EFAULT, something is wrong - * 0, success and can do more + * false, buffer is exhausted + * false, current task is pending + * false, something is wrong + * true, success and can do more */ - int result; + bool result ; struct rb_root *root; uint64_t dev_id; }; @@ -210,7 +210,7 @@ static bool hmdfs_actor_merge(struct dir_context *ctx, const char *name, int namelen, long long offset, unsigned long long ino, unsigned int d_type) { - int ret = 0; + bool ret = true; int insert_res = 0; int max_devid_len = 2; char *dentry_name = NULL; @@ -219,14 +219,20 @@ static bool hmdfs_actor_merge(struct dir_context *ctx, const char *name, struct hmdfs_iterate_callback_merge *iterate_callback_merge = NULL; struct dir_context *org_ctx = NULL; - if (hmdfs_file_type(name) != HMDFS_TYPE_COMMON) - return 0; + if (hmdfs_file_type(name) != HMDFS_TYPE_COMMON) { + /* + * return true here, so that the caller can continue to next + * dentry even if failed on this dentry somehow. + */ + return true; + } + if (namelen > NAME_MAX) - return -EINVAL; + return false; dentry_name = kzalloc(NAME_MAX + 1, GFP_KERNEL); if (!dentry_name) - return -ENOMEM; + return false; strncpy(dentry_name, name, dentry_len); @@ -245,7 +251,7 @@ static bool hmdfs_actor_merge(struct dir_context *ctx, const char *name, } else if (d_type == DT_DIR && (insert_res == DT_REG || insert_res == DT_LNK)) { if (strlen(CONFLICTING_DIR_SUFFIX) > NAME_MAX - dentry_len) { - ret = -ENAMETOOLONG; + ret = false; goto delete; } rename_conflicting_directory(dentry_name, &dentry_len); @@ -253,7 +259,7 @@ static bool hmdfs_actor_merge(struct dir_context *ctx, const char *name, } else if ((d_type == DT_REG || d_type == DT_LNK) && insert_res > 0) { if (strlen(CONFLICTING_FILE_SUFFIX) + max_devid_len > NAME_MAX - dentry_len) { - ret = -ENAMETOOLONG; + ret = false; goto delete; } rename_conflicting_file(dentry_name, &dentry_len, @@ -268,13 +274,12 @@ static bool hmdfs_actor_merge(struct dir_context *ctx, const char *name, * different situations. */ iterate_callback_merge->result = ret; - ret = ret == 0 ? 
0 : 1; - if (ret && d_type == DT_DIR && cache_entry->file_type == DT_DIR && + if (!ret && d_type == DT_DIR && cache_entry->file_type == DT_DIR && (insert_res == DT_REG || insert_res == DT_LNK)) cache_entry->file_type = DT_REG; delete: - if (ret && !insert_res) + if (!ret && !insert_res) delete_filename(iterate_callback_merge->root, cache_entry); done: kfree(dentry_name); @@ -520,7 +525,7 @@ long hmdfs_dir_compat_ioctl_merge(struct file *file, unsigned int cmd, const struct file_operations hmdfs_dir_fops_merge = { .owner = THIS_MODULE, - .iterate = hmdfs_iterate_merge, + .iterate_shared = hmdfs_iterate_merge, .open = hmdfs_dir_open_merge, .release = hmdfs_dir_release_merge, .unlocked_ioctl = hmdfs_dir_unlocked_ioctl_merge, @@ -567,6 +572,7 @@ int hmdfs_file_open_merge(struct inode *inode, struct file *file) } else { gfi->lower_file = lower_file; file->private_data = gfi; + hmdfs_update_upper_file(file, lower_file); } dput(parent); out_err: diff --git a/fs/hmdfs/file_remote.c b/fs/hmdfs/file_remote.c index a39b09c602705e0a6e34989cd62a5f7570110383..80cc773e1e93561895a10839141fc39ac6a50c10 100755 --- a/fs/hmdfs/file_remote.c +++ b/fs/hmdfs/file_remote.c @@ -1056,7 +1056,7 @@ static int hmdfs_dir_release_remote(struct inode *inode, struct file *file) const struct file_operations hmdfs_dev_dir_ops_remote = { .owner = THIS_MODULE, - .iterate = hmdfs_iterate_remote, + .iterate_shared = hmdfs_iterate_remote, .open = hmdfs_dir_open_remote, .release = hmdfs_dir_release_remote, .fsync = __generic_file_fsync, diff --git a/fs/hmdfs/file_root.c b/fs/hmdfs/file_root.c index 02f331511da8f43cb47900e71553598da541e977..60d04f921b4dc74c1b6cd7b59d3618128b84c023 100755 --- a/fs/hmdfs/file_root.c +++ b/fs/hmdfs/file_root.c @@ -165,10 +165,10 @@ int hmdfs_root_iterate(struct file *file, struct dir_context *ctx) const struct file_operations hmdfs_root_fops = { .owner = THIS_MODULE, - .iterate = hmdfs_root_iterate, + .iterate_shared = hmdfs_root_iterate, }; const struct file_operations hmdfs_device_fops = { .owner = THIS_MODULE, - .iterate = hmdfs_device_iterate, + .iterate_shared = hmdfs_device_iterate, }; diff --git a/fs/hmdfs/hmdfs_client.c b/fs/hmdfs/hmdfs_client.c index fa8e7435c1efe38d6de68695b3b16c75f6ef023e..827d6b533f66bf74d8edc7f532ead4ed2ab85d1c 100755 --- a/fs/hmdfs/hmdfs_client.c +++ b/fs/hmdfs/hmdfs_client.c @@ -40,6 +40,8 @@ int hmdfs_send_open(struct hmdfs_peer *con, const char *send_buf, struct hmdfs_send_command sm = { .data = open_req, .len = send_len, + .out_buf = NULL, + .local_filp = NULL, }; hmdfs_init_cmd(&sm.operations, F_OPEN); @@ -80,6 +82,7 @@ void hmdfs_send_close(struct hmdfs_peer *con, const struct hmdfs_fid *fid) struct hmdfs_send_command sm = { .data = release_req, .len = send_len, + .local_filp = NULL, }; hmdfs_init_cmd(&sm.operations, F_RELEASE); @@ -102,6 +105,8 @@ int hmdfs_send_fsync(struct hmdfs_peer *con, const struct hmdfs_fid *fid, struct hmdfs_send_command sm = { .data = fsync_req, .len = sizeof(struct fsync_request), + .out_buf = NULL, + .local_filp = NULL, }; hmdfs_init_cmd(&sm.operations, F_FSYNC); @@ -130,6 +135,7 @@ int hmdfs_client_readpage(struct hmdfs_peer *con, const struct hmdfs_fid *fid, struct hmdfs_send_command sm = { .data = read_data, .len = send_len, + .local_filp = NULL, }; hmdfs_init_cmd(&sm.operations, F_READPAGE); @@ -464,6 +470,8 @@ int hmdfs_client_start_mkdir(struct hmdfs_peer *con, struct hmdfs_send_command sm = { .data = mkdir_req, .len = send_len, + .out_buf = NULL, + .local_filp = NULL, }; hmdfs_init_cmd(&sm.operations, F_MKDIR); @@ -511,6 +519,8 
@@ int hmdfs_client_start_create(struct hmdfs_peer *con, struct hmdfs_send_command sm = { .data = create_req, .len = send_len, + .out_buf = NULL, + .local_filp = NULL, }; hmdfs_init_cmd(&sm.operations, F_CREATE); @@ -556,6 +566,8 @@ int hmdfs_client_start_rmdir(struct hmdfs_peer *con, const char *path, struct hmdfs_send_command sm = { .data = rmdir_req, .len = send_len, + .out_buf = NULL, + .local_filp = NULL, }; hmdfs_init_cmd(&sm.operations, F_RMDIR); @@ -585,6 +597,8 @@ int hmdfs_client_start_unlink(struct hmdfs_peer *con, const char *path, struct hmdfs_send_command sm = { .data = unlink_req, .len = send_len, + .out_buf = NULL, + .local_filp = NULL, }; hmdfs_init_cmd(&sm.operations, F_UNLINK); @@ -619,6 +633,8 @@ int hmdfs_client_start_rename(struct hmdfs_peer *con, const char *old_path, struct hmdfs_send_command sm = { .data = rename_req, .len = send_len, + .out_buf = NULL, + .local_filp = NULL, }; hmdfs_init_cmd(&sm.operations, F_RENAME); @@ -656,6 +672,7 @@ int hmdfs_send_setattr(struct hmdfs_peer *con, const char *send_buf, struct hmdfs_send_command sm = { .data = setattr_req, .len = send_len, + .local_filp = NULL, }; hmdfs_init_cmd(&sm.operations, F_SETATTR); @@ -714,6 +731,8 @@ int hmdfs_send_getattr(struct hmdfs_peer *con, const char *send_buf, struct hmdfs_send_command sm = { .data = req, .len = send_len, + .out_buf = NULL, + .local_filp = NULL, }; hmdfs_init_cmd(&sm.operations, F_GETATTR); @@ -768,6 +787,8 @@ int hmdfs_send_statfs(struct hmdfs_peer *con, const char *path, struct hmdfs_send_command sm = { .data = req, .len = send_len, + .out_buf = NULL, + .local_filp = NULL, }; hmdfs_init_cmd(&sm.operations, F_STATFS); @@ -850,6 +871,8 @@ int hmdfs_send_getxattr(struct hmdfs_peer *con, const char *send_buf, struct hmdfs_send_command sm = { .data = req, .len = send_len, + .out_buf = NULL, + .local_filp = NULL, }; hmdfs_init_cmd(&sm.operations, F_GETXATTR); @@ -888,6 +911,7 @@ int hmdfs_send_setxattr(struct hmdfs_peer *con, const char *send_buf, struct hmdfs_send_command sm = { .data = req, .len = send_len, + .local_filp = NULL, }; hmdfs_init_cmd(&sm.operations, F_SETXATTR); @@ -937,6 +961,8 @@ ssize_t hmdfs_send_listxattr(struct hmdfs_peer *con, const char *send_buf, struct hmdfs_send_command sm = { .data = req, .len = send_len, + .out_buf = NULL, + .local_filp = NULL, }; hmdfs_init_cmd(&sm.operations, F_LISTXATTR); @@ -1008,6 +1034,7 @@ void hmdfs_send_drop_push(struct hmdfs_peer *con, const char *path) struct hmdfs_send_command sm = { .data = dp_req, .len = send_len, + .local_filp = NULL, }; hmdfs_init_cmd(&sm.operations, F_DROP_PUSH); diff --git a/fs/hmdfs/hmdfs_dentryfile.c b/fs/hmdfs/hmdfs_dentryfile.c index 12c04e73675ffd09310a08d9d0ca68b0d4ece01e..46e0deae724042342871a1c7cd1ba33c3f686514 100755 --- a/fs/hmdfs/hmdfs_dentryfile.c +++ b/fs/hmdfs/hmdfs_dentryfile.c @@ -684,6 +684,7 @@ static struct hmdfs_dentry *find_in_block(struct hmdfs_dentry_group *dentry_blk, if (!test_bit_le(bit_pos, dentry_blk->bitmap)) { bit_pos++; max_len++; + continue; } de = &dentry_blk->nsl[bit_pos]; if (unlikely(!de->namelen)) { @@ -1826,25 +1827,25 @@ static bool cache_file_iterate(struct dir_context *ctx, const char *name, if (name_len > NAME_MAX) { hmdfs_err("name_len:%d NAME_MAX:%u", name_len, NAME_MAX); - return 0; + return true; } if (d_type != DT_REG) - return 0; + return true; cfi = kmalloc(sizeof(*cfi), GFP_KERNEL); if (!cfi) - return -ENOMEM; + return false; cfi->name = kstrndup(name, name_len, GFP_KERNEL); if (!cfi->name) { kfree(cfi); - return -ENOMEM; + return false; } 
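The readdir callback changes in this series (cache_file_iterate here, plus hmdfs_actor_merge, hmdfs_filldir_real, hmdfs_name_match and the stash iterators elsewhere) all follow the same kernel convention: dir_context actors return bool, where true means "keep iterating" and false makes iterate_dir() stop early. A minimal standalone sketch of an actor under that convention follows; the names example_readdir_ctx and example_actor are illustrative only and not part of this patch.

#include <linux/fs.h>

struct example_readdir_ctx {
	struct dir_context ctx;		/* must be first so the VFS hands it back to the actor */
	unsigned int nr_regular;	/* regular files seen so far */
};

static bool example_actor(struct dir_context *ctx, const char *name,
			  int namelen, loff_t offset, u64 ino,
			  unsigned int d_type)
{
	struct example_readdir_ctx *ectx =
		container_of(ctx, struct example_readdir_ctx, ctx);

	/* skip entries we do not care about, but keep iterating */
	if (d_type != DT_REG)
		return true;

	ectx->nr_regular++;

	/* returning false stops iterate_dir() early, here after 16 hits */
	return ectx->nr_regular < 16;
}

The caller wires it up through the embedded dir_context (ectx.ctx.actor = example_actor) before passing &ectx.ctx to iterate_dir(), which is the same pattern the converted hmdfs actors rely on.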
list_add_tail(&cfi->list, &cb->list); - return 0; + return true; } void hmdfs_do_load(struct hmdfs_sb_info *sbi, const char *fullname, bool server) @@ -2520,6 +2521,7 @@ static void hmdfs_rename_bak(struct dentry *dentry) struct dentry *lower_parent = NULL; struct dentry *lower_dentry = NULL; struct dentry *new_dentry = NULL; + struct renamedata rename_data; char *name = NULL; int len = 0; int err = 0; @@ -2553,17 +2555,14 @@ static void hmdfs_rename_bak(struct dentry *dentry) goto unlock_parent; } - struct renamedata rd = { - .old_mnt_idmap = &nop_mnt_idmap, - .old_dir = d_inode(lower_parent), - .old_dentry = lower_dentry, - .new_mnt_idmap = &nop_mnt_idmap, - .new_dir = d_inode(lower_parent), - .new_dentry = new_dentry, - .flags = 0, - }; - - err = vfs_rename(&rd); + rename_data.old_mnt_idmap = &nop_mnt_idmap; + rename_data.old_dir = d_inode(lower_parent); + rename_data.old_dentry = lower_dentry; + rename_data.new_mnt_idmap = &nop_mnt_idmap; + rename_data.new_dir = d_inode(lower_parent); + rename_data.new_dentry = new_dentry; + rename_data.flags = 0; + err = vfs_rename(&rename_data); dput(new_dentry); unlock_parent: @@ -2784,6 +2783,7 @@ int hmdfs_root_rename(struct hmdfs_sb_info *sbi, uint64_t device_id, struct dentry *trap = NULL; struct dentry *old_dentry = NULL; struct dentry *new_dentry = NULL; + struct renamedata rename_data; err = kern_path(sbi->local_dst, 0, &path_dst); if (err) { @@ -2852,17 +2852,14 @@ int hmdfs_root_rename(struct hmdfs_sb_info *sbi, uint64_t device_id, if (path_old.dentry != path_new.dentry) hmdfs_mark_drop_flag(device_id, path_new.dentry); - struct renamedata rd = { - .old_mnt_idmap = &nop_mnt_idmap, - .old_dir = d_inode(path_old.dentry), - .old_dentry = old_dentry, - .new_mnt_idmap = &nop_mnt_idmap, - .new_dir = d_inode(path_new.dentry), - .new_dentry = new_dentry, - .flags = flags, - }; - - err = vfs_rename(&rd); + rename_data.old_mnt_idmap = &nop_mnt_idmap; + rename_data.old_dir = d_inode(path_old.dentry); + rename_data.old_dentry = old_dentry; + rename_data.new_mnt_idmap = &nop_mnt_idmap; + rename_data.new_dir = d_inode(path_new.dentry); + rename_data.new_dentry = new_dentry; + rename_data.flags = flags; + err = vfs_rename(&rename_data); put_new_dentry: dput(new_dentry); diff --git a/fs/hmdfs/hmdfs_server.c b/fs/hmdfs/hmdfs_server.c index 4961e37e2681028ef6165bf3d6bf29b7d579e445..b10052934f730c53b5ca0b9e1e581affad825ae8 100755 --- a/fs/hmdfs/hmdfs_server.c +++ b/fs/hmdfs/hmdfs_server.c @@ -32,6 +32,50 @@ struct hmdfs_open_info { int file_id; }; +static void find_first_no_slash(const char **name, int *len) +{ + const char *s = *name; + int l = *len; + + while (l > 0 && *s == '/') { + s++; + l--; + } + + *name = s; + *len = l; +} + +static void find_first_slash(const char **name, int *len) +{ + const char *s = *name; + int l = *len; + + while (l > 0 && *s != '/') { + s++; + l--; + } + + *name = s; + *len = l; +} + +static bool path_contain_dotdot(const char *name, int len) +{ + while (true) { + find_first_no_slash(&name, &len); + + if (len == 0) + return false; + + if (len >= 2 && name[0] == '.' && name[1] == '.' 
&& + (len == 2 || name[2] == '/')) + return true; + + find_first_slash(&name, &len); + } +} + static int insert_file_into_conn(struct hmdfs_peer *conn, struct file *file) { struct idr *idr = &(conn->file_id_idr); @@ -63,14 +107,21 @@ static struct file *get_file_from_conn(struct hmdfs_peer *conn, __u32 file_id) return file; } -void remove_file_from_conn(struct hmdfs_peer *conn, __u32 file_id) +int remove_file_from_conn(struct hmdfs_peer *conn, __u32 file_id) { spinlock_t *lock = &(conn->file_id_lock); struct idr *idr = &(conn->file_id_idr); + struct file *file; spin_lock(lock); - idr_remove(idr, file_id); + file = idr_remove(idr, file_id); spin_unlock(lock); + + if (!file) { + return -ENOENT; + } else { + return 0; + } } struct file *hmdfs_open_link(struct hmdfs_sb_info *sbi, @@ -324,11 +375,13 @@ static struct file *hmdfs_open_file(struct hmdfs_peer *con, return file; } + get_file(file); id = insert_file_into_conn(con, file); if (id < 0) { hmdfs_err("file_id alloc failed! err=%d", id); reset_item_opened_status(con->sbi, filename); hmdfs_close_path(file); + hmdfs_close_path(file); return ERR_PTR(id); } *file_id = id; @@ -499,6 +552,11 @@ void hmdfs_server_open(struct hmdfs_peer *con, struct hmdfs_head_cmd *cmd, goto err_free; } + if (path_contain_dotdot(recv->buf, recv->path_len)) { + ret = -EINVAL; + goto err_free; + } + info->file = hmdfs_open_file(con, recv->buf, recv->file_type, &info->file_id); if (IS_ERR(info->file)) { @@ -520,11 +578,13 @@ void hmdfs_server_open(struct hmdfs_peer *con, struct hmdfs_head_cmd *cmd, remove_file_from_conn(con, info->file_id); hmdfs_close_path(info->file); } + hmdfs_close_path(info->file); kfree(resp); kfree(info); return; err_close: + hmdfs_close_path(info->file); remove_file_from_conn(con, info->file_id); hmdfs_close_path(info->file); err_free: @@ -624,11 +684,13 @@ static int hmdfs_dentry_open(struct hmdfs_peer *con, return err; } + get_file(info->file); info->file_id = insert_file_into_conn(con, info->file); if (info->file_id < 0) { err = info->file_id; hmdfs_err("file_id alloc failed! 
err %d", err); hmdfs_close_path(info->file); + hmdfs_close_path(info->file); return err; } @@ -671,6 +733,7 @@ static int hmdfs_server_do_atomic_open(struct hmdfs_peer *con, if (err) { remove_file_from_conn(con, info->file_id); hmdfs_close_path(info->file); + hmdfs_close_path(info->file); } put_child: path_put(&child_path); @@ -684,6 +747,17 @@ void hmdfs_server_atomic_open(struct hmdfs_peer *con, struct atomic_open_request *recv = data; struct atomic_open_response *resp = NULL; struct hmdfs_open_info *info = NULL; + char *file_path = recv->buf; + char *file = recv->buf + recv->path_len + 1; + + if (path_contain_dotdot(file_path, recv->path_len)) { + err = -EINVAL; + goto out; + } + if (path_contain_dotdot(file, recv->file_len)) { + err = -EINVAL; + goto out; + } info = kmalloc(sizeof(*info), GFP_KERNEL); resp = kzalloc(sizeof(*resp), GFP_KERNEL); @@ -735,7 +809,11 @@ void hmdfs_server_release(struct hmdfs_peer *con, struct hmdfs_head_cmd *cmd, /* put the reference acquired by get_file_by_fid_and_ver() */ hmdfs_close_path(file); hmdfs_info("close %u", file_id); - remove_file_from_conn(con, file_id); + ret = remove_file_from_conn(con, file_id); + if (ret) { + hmdfs_err("cannot find after close %u", file_id); + goto out; + } hmdfs_close_path(file); @@ -1021,6 +1099,11 @@ void hmdfs_server_readdir(struct hmdfs_peer *con, struct hmdfs_head_cmd *cmd, trace_hmdfs_server_readdir(readdir_recv); + if (path_contain_dotdot(readdir_recv->path, readdir_recv->path_len)) { + err = -EINVAL; + goto send_err; + } + lo_p_name = server_lookup_lower(con, readdir_recv->path, &lo_p); if (IS_ERR(lo_p_name)) { err = PTR_ERR(lo_p_name); @@ -1080,6 +1163,14 @@ void hmdfs_server_mkdir(struct hmdfs_peer *con, struct hmdfs_head_cmd *cmd, mkdir_dir = mkdir_recv->path; mkdir_name = mkdir_recv->path + path_len + 1; + if (path_contain_dotdot(mkdir_dir, mkdir_recv->path_len)) { + err = -EINVAL; + goto mkdir_out; + } + if (path_contain_dotdot(mkdir_name, mkdir_recv->name_len)) { + err = -EINVAL; + goto mkdir_out; + } dent = hmdfs_root_mkdir(con->device_id, con->sbi->local_dst, mkdir_dir, mkdir_name, @@ -1122,6 +1213,14 @@ void hmdfs_server_create(struct hmdfs_peer *con, struct hmdfs_head_cmd *cmd, create_dir = create_recv->path; create_name = create_recv->path + path_len + 1; + if (path_contain_dotdot(create_dir, create_recv->path_len)) { + err = -EINVAL; + goto create_out; + } + if (path_contain_dotdot(create_name, create_recv->name_len)) { + err = -EINVAL; + goto create_out; + } dent = hmdfs_root_create(con->device_id, con->sbi->local_dst, create_dir, create_name, @@ -1161,12 +1260,22 @@ void hmdfs_server_rmdir(struct hmdfs_peer *con, struct hmdfs_head_cmd *cmd, path = rmdir_recv->path; name = rmdir_recv->path + le32_to_cpu(rmdir_recv->path_len) + 1; + if (path_contain_dotdot(path, rmdir_recv->path_len)) { + err = -EINVAL; + goto rmdir_out; + } + if (path_contain_dotdot(name, rmdir_recv->name_len)) { + err = -EINVAL; + goto rmdir_out; + } + err = kern_path(con->sbi->local_dst, 0, &root_path); if (!err) { err = hmdfs_root_rmdir(con->device_id, &root_path, path, name); path_put(&root_path); } +rmdir_out: hmdfs_send_err_response(con, cmd, err); } @@ -1181,12 +1290,22 @@ void hmdfs_server_unlink(struct hmdfs_peer *con, struct hmdfs_head_cmd *cmd, path = unlink_recv->path; name = unlink_recv->path + le32_to_cpu(unlink_recv->path_len) + 1; + if (path_contain_dotdot(path, unlink_recv->path_len)) { + err = -EINVAL; + goto unlink_out; + } + if (path_contain_dotdot(name, unlink_recv->name_len)) { + err = -EINVAL; + goto unlink_out; 
+ } + err = kern_path(con->sbi->local_dst, 0, &root_path); if (!err) { err = hmdfs_root_unlink(con->device_id, &root_path, path, name); path_put(&root_path); } +unlink_out: hmdfs_send_err_response(con, cmd, err); } @@ -1216,10 +1335,27 @@ void hmdfs_server_rename(struct hmdfs_peer *con, struct hmdfs_head_cmd *cmd, name_old = recv->path + old_path_len + 1 + new_path_len + 1; name_new = recv->path + old_path_len + 1 + new_path_len + 1 + old_name_len + 1; + if (path_contain_dotdot(path_old, old_path_len)) { + err = -EINVAL; + goto rename_out; + } + if (path_contain_dotdot(path_new, new_path_len)) { + err = -EINVAL; + goto rename_out; + } + if (path_contain_dotdot(name_old, old_name_len)) { + err = -EINVAL; + goto rename_out; + } + if (path_contain_dotdot(name_new, new_name_len)) { + err = -EINVAL; + goto rename_out; + } err = hmdfs_root_rename(con->sbi, con->device_id, path_old, name_old, path_new, name_new, flags); +rename_out: hmdfs_send_err_response(con, cmd, err); } @@ -1327,10 +1463,10 @@ static bool hmdfs_filldir_real(struct dir_context *ctx, const char *name, out: /* - * we always return 0 here, so that the caller can continue to next + * we always return true here, so that the caller can continue to next * dentry even if failed on this dentry somehow. */ - return 0; + return true; } static void hmdfs_server_set_header(struct hmdfs_dcache_header *header, @@ -1517,6 +1653,11 @@ void hmdfs_server_setattr(struct hmdfs_peer *con, struct hmdfs_head_cmd *cmd, struct iattr attr; __u32 valid = le32_to_cpu(recv->valid); + if (path_contain_dotdot(recv->buf, recv->path_len)) { + err = -EINVAL; + goto out; + } + err = kern_path(con->sbi->local_dst, 0, &root_path); if (err) { hmdfs_err("kern_path failed err = %d", err); @@ -1605,6 +1746,11 @@ void hmdfs_server_getattr(struct hmdfs_peer *con, struct hmdfs_head_cmd *cmd, unsigned int recv_flags = le32_to_cpu(recv->lookup_flags); unsigned int lookup_flags = 0; + if (path_contain_dotdot(recv->buf, recv->path_len)) { + err = -EINVAL; + goto err; + } + err = hmdfs_convert_lookup_flags(recv_flags, &lookup_flags); if (err) goto err; @@ -1696,6 +1842,11 @@ void hmdfs_server_statfs(struct hmdfs_peer *con, struct hmdfs_head_cmd *cmd, struct kstatfs *st = NULL; int err = 0; + if (path_contain_dotdot(recv->path, recv->path_len)) { + err = -EINVAL; + goto out; + } + st = kzalloc(sizeof(*st), GFP_KERNEL); if (!st) { err = -ENOMEM; @@ -1768,9 +1919,20 @@ void hmdfs_server_getxattr(struct hmdfs_peer *con, char *name = recv->buf + recv->path_len + 1; int err = -ENOMEM; + if (path_contain_dotdot(file_path, recv->path_len)) { + err = -EINVAL; + goto err; + } + if (path_contain_dotdot(name, recv->name_len)) { + err = -EINVAL; + goto err; + } + resp = kzalloc(size_read, GFP_KERNEL); - if (!resp) + if (!resp) { + err = -ENOMEM; goto err; + } err = kern_path(con->sbi->local_dst, LOOKUP_DIRECTORY, &root_path); if (err) { @@ -1820,20 +1982,25 @@ void hmdfs_server_setxattr(struct hmdfs_peer *con, bool del = recv->del; struct path root_path; struct path path; - const char *file_path = NULL; - const char *name = NULL; - const void *value = NULL; + const char *file_path = recv->buf; + const char *name = recv->buf + recv->path_len + 1; + const void *value = name + recv->name_len + 1; int err; + if (path_contain_dotdot(file_path, recv->path_len)) { + err = -EINVAL; + goto err; + } + if (path_contain_dotdot(name, recv->name_len)) { + err = -EINVAL; + goto err; + } + err = kern_path(con->sbi->local_dst, LOOKUP_DIRECTORY, &root_path); if (err) { hmdfs_info("kern_path failed err = 
%d", err); goto err; } - - file_path = recv->buf; - name = recv->buf + recv->path_len + 1; - value = name + recv->name_len + 1; err = vfs_path_lookup(root_path.dentry, root_path.mnt, file_path, 0, &path); if (err) { @@ -1862,11 +2029,16 @@ void hmdfs_server_listxattr(struct hmdfs_peer *con, size_t size = le32_to_cpu(recv->size); int size_read = sizeof(struct listxattr_response) + size; struct listxattr_response *resp = NULL; - const char *file_path = NULL; + const char *file_path = recv->buf; struct path root_path; struct path path; int err = 0; + if (path_contain_dotdot(file_path, recv->path_len)) { + err = -EINVAL; + goto err; + } + resp = kzalloc(size_read, GFP_KERNEL); if (!resp) { err = -ENOMEM; @@ -1878,8 +2050,6 @@ void hmdfs_server_listxattr(struct hmdfs_peer *con, hmdfs_info("kern_path failed err = %d", err); goto err_free_resp; } - - file_path = recv->buf; err = vfs_path_lookup(root_path.dentry, root_path.mnt, file_path, 0, &path); if (err) { @@ -1921,6 +2091,11 @@ void hmdfs_server_get_drop_push(struct hmdfs_peer *con, int err; char *tmp_path = NULL; + if (path_contain_dotdot(dp_recv->path, dp_recv->path_len)) { + err = -EINVAL; + goto quickack; + } + err = kern_path(con->sbi->real_dst, 0, &root_path); if (err) { hmdfs_err("kern_path failed err = %d", err); diff --git a/fs/hmdfs/hmdfs_share.c b/fs/hmdfs/hmdfs_share.c index 6b9557d022632cde378ed070dfc6d90f2fd21199..436d3324fc19908223c8fc41a5c86b6bd74fbc49 100644 --- a/fs/hmdfs/hmdfs_share.c +++ b/fs/hmdfs/hmdfs_share.c @@ -157,13 +157,13 @@ bool in_share_dir(struct dentry *child_dentry) inline bool is_share_dir(struct inode *inode, const char *name) { - return (S_ISDIR(inode->i_mode) && - !strncmp(name, SHARE_RESERVED_DIR, strlen(SHARE_RESERVED_DIR))); + return (S_ISDIR(inode->i_mode) && + !strncmp(name, SHARE_RESERVED_DIR, sizeof(SHARE_RESERVED_DIR))); } int get_path_from_share_table(struct hmdfs_sb_info *sbi, - struct dentry *cur_dentry, - struct path *src_path) + struct dentry *cur_dentry, + struct path *src_path) { struct hmdfs_share_item *item; const char *path_name; @@ -329,7 +329,8 @@ void hmdfs_clear_share_table(struct hmdfs_sb_info *sbi) } spin_unlock(&sbi->share_table.item_list_lock); - destroy_workqueue(st->share_item_timeout_wq); + if (st->share_item_timeout_wq != NULL) + destroy_workqueue(st->share_item_timeout_wq); } int hmdfs_clear_first_item(struct hmdfs_share_table *st) diff --git a/fs/hmdfs/hmdfs_trace.h b/fs/hmdfs/hmdfs_trace.h index 15bedbaa5cfaf938a31d4bd2d00f85f37d694d5b..0660d06404d2e5cc3c60e288c14aa092163ca3e4 100755 --- a/fs/hmdfs/hmdfs_trace.h +++ b/fs/hmdfs/hmdfs_trace.h @@ -546,6 +546,48 @@ TRACE_EVENT(hmdfs_readpages_cloud, __entry->nr_pages, __entry->err) ); +TRACE_EVENT(hmdfs_do_readpages_cloud_begin, + + TP_PROTO(int cnt, loff_t pos), + + TP_ARGS(cnt, pos), + + TP_STRUCT__entry( + __field(int, cnt) + __field(loff_t, pos) + ), + + TP_fast_assign( + __entry->cnt = cnt; + __entry->pos = pos; + ), + + TP_printk("cnt:%d, pos:%llx", + __entry->cnt, __entry->pos) +); + +TRACE_EVENT(hmdfs_do_readpages_cloud_end, + + TP_PROTO(int cnt, loff_t pos, int ret), + + TP_ARGS(cnt, pos, ret), + + TP_STRUCT__entry( + __field(int, cnt) + __field(loff_t, pos) + __field(int, ret) + ), + + TP_fast_assign( + __entry->cnt = cnt; + __entry->pos = pos; + __entry->ret = ret; + ), + + TP_printk("cnt:%d, pos:%llx", + __entry->cnt, __entry->pos, __entry->ret) +); + TRACE_EVENT(hmdfs_client_recv_readpage, TP_PROTO(struct hmdfs_peer *con, unsigned long long remote_ino, diff --git a/fs/hmdfs/inode.c b/fs/hmdfs/inode.c index 
0b3ec919c3e7808e1d11dcdc0c14df3ac6fc5591..33cc8c7419d5c89ee3292df056f6164907b65972 100755 --- a/fs/hmdfs/inode.c +++ b/fs/hmdfs/inode.c @@ -88,8 +88,9 @@ static int iget_test(struct inode *inode, void *data) WARN_ON(ia->ino.domain < DOMAIN_ROOT || ia->ino.domain >= DOMAIN_INVALID); - if ((read_ino_domain(inode->i_ino) == DOMAIN_ROOT) || - (read_ino_domain(inode->i_ino) != ia->ino.domain)) + if (read_ino_domain(inode->i_ino) == DOMAIN_ROOT) + return 1; + if (read_ino_domain(inode->i_ino) != ia->ino.domain) return 0; switch (ia->ino.domain) { @@ -342,3 +343,15 @@ struct inode *hmdfs_iget_locked_root(struct super_block *sb, uint64_t root_ino, return iget5_locked(sb, ia.ino.ino_output, iget_test, iget_set, &ia); } + + +void hmdfs_update_upper_file(struct file *upper_file, struct file *lower_file) +{ + loff_t upper_size = i_size_read(upper_file->f_inode); + loff_t lower_size = i_size_read(lower_file->f_inode); + + if (upper_file->f_inode->i_mapping && upper_size != lower_size) { + i_size_write(upper_file->f_inode, lower_size); + truncate_inode_pages(upper_file->f_inode->i_mapping, 0); + } +} \ No newline at end of file diff --git a/fs/hmdfs/inode.h b/fs/hmdfs/inode.h index 8877a53a0110cb6d6a98350ae2ac3eba5a30675a..fb9bd2929d581e6e48dee17bd369962869440e02 100755 --- a/fs/hmdfs/inode.h +++ b/fs/hmdfs/inode.h @@ -259,5 +259,6 @@ struct inode *hmdfs_iget5_locked_cloud(struct super_block *sb, struct hmdfs_peer *peer, struct hmdfs_lookup_cloud_ret *res); +void hmdfs_update_upper_file(struct file *upper_file, struct file *lower_file); uint32_t make_ino_raw_cloud(uint8_t *cloud_id); #endif // INODE_H diff --git a/fs/hmdfs/inode_cloud.c b/fs/hmdfs/inode_cloud.c index 5cf488486cbb538ef48f66fae0759c94b2b1fc2e..9510dd5f3be565e2719cc3964e3819bc041a5258 100755 --- a/fs/hmdfs/inode_cloud.c +++ b/fs/hmdfs/inode_cloud.c @@ -286,6 +286,10 @@ static struct dentry *hmdfs_lookup_cloud_dentry(struct inode *parent_inode, if (in_share_dir(child_dentry)) gdi->file_type = HM_SHARE; inode = fill_inode_cloud(sb, lookup_result, parent_inode); + if (IS_ERR(inode)) { + ret = ERR_CAST(inode); + goto out; + } check_and_fixup_ownership_remote(parent_inode, inode, @@ -296,7 +300,7 @@ static struct dentry *hmdfs_lookup_cloud_dentry(struct inode *parent_inode, } else { ret = ERR_PTR(-ENOENT); } - +out: kfree(lookup_result); return ret; } @@ -364,7 +368,7 @@ int hmdfs_rmdir_cloud(struct inode *dir, struct dentry *dentry) int hmdfs_unlink_cloud(struct inode *dir, struct dentry *dentry) { - return -EPERM; + return 0; } int hmdfs_rename_cloud(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, diff --git a/fs/hmdfs/inode_cloud_merge.c b/fs/hmdfs/inode_cloud_merge.c index fef58ddb927c7a644415a854938f4244ce6e3258..dc733b82b8bda72b04d979aecff48b128feab204 100755 --- a/fs/hmdfs/inode_cloud_merge.c +++ b/fs/hmdfs/inode_cloud_merge.c @@ -322,6 +322,10 @@ struct dentry *hmdfs_lookup_cloud_merge(struct inode *parent_inode, child_inode = fill_inode_merge(parent_inode->i_sb, parent_inode, child_dentry, NULL); + if (IS_ERR(child_inode)) { + err = PTR_ERR(child_inode); + goto out; + } info = hmdfs_i(child_inode); if (info->inode_type == HMDFS_LAYER_FIRST_MERGE) hmdfs_root_inode_perm_init(child_inode); @@ -679,11 +683,6 @@ static int hmdfs_rename_cloud_merge(struct mnt_idmap *idmap, struct inode *old_d goto rename_out; } - if (hmdfs_d(old_dentry)->device_id != hmdfs_d(new_dentry)->device_id) { - ret = -EXDEV; - goto rename_out; - } - rec_op_para = kmalloc(sizeof(*rec_op_para), GFP_KERNEL); if (!rec_op_para) { ret = -ENOMEM; 
diff --git a/fs/hmdfs/inode_local.c b/fs/hmdfs/inode_local.c index d57d268182191542c5401347b3b70c5e1f72292c..4c65516aafff0553894e10a1f962cf18e70017dd 100755 --- a/fs/hmdfs/inode_local.c +++ b/fs/hmdfs/inode_local.c @@ -38,7 +38,6 @@ int init_hmdfs_dentry_info(struct hmdfs_sb_info *sbi, struct dentry *dentry, if (!info) return -ENOMEM; - dentry->d_fsdata = info; INIT_LIST_HEAD(&info->cache_list_head); INIT_LIST_HEAD(&info->remote_cache_list_head); spin_lock_init(&info->cache_list_lock); @@ -47,6 +46,7 @@ int init_hmdfs_dentry_info(struct hmdfs_sb_info *sbi, struct dentry *dentry, spin_lock_init(&info->lock); info->dentry_type = dentry_type; info->device_id = 0; + dentry->d_fsdata = info; if (dentry_type == HMDFS_LAYER_ZERO || dentry_type == HMDFS_LAYER_FIRST_DEVICE || dentry_type == HMDFS_LAYER_SECOND_LOCAL || @@ -183,9 +183,9 @@ static bool hmdfs_name_match(struct dir_context *ctx, const char *name, memcpy(buf->name, name, namelen); buf->name[namelen] = 0; buf->found = true; - return 1; + return false; } - return 0; + return true; } static int __lookup_nosensitive(struct path *lower_parent_path, @@ -266,9 +266,6 @@ struct dentry *hmdfs_lookup_local(struct inode *parent_inode, flags &= ~LOOKUP_FOLLOW; err = vfs_path_lookup(lower_parent_path.dentry, lower_parent_path.mnt, (child_dentry->d_name.name), 0, &lower_path); - if (err == -ENOENT && !sbi->s_case_sensitive) - err = __lookup_nosensitive(&lower_parent_path, child_dentry, 0, - &lower_path); if (err && err != -ENOENT) { ret = ERR_PTR(err); goto out_err; @@ -601,9 +598,9 @@ int hmdfs_unlink_local_dentry(struct inode *dir, struct dentry *dentry) hmdfs_drop_remote_cache_dents(dentry->d_parent); d_drop(dentry); - hmdfs_put_lower_path(&lower_path); path_err: + hmdfs_put_lower_path(&lower_path); if (error) hmdfs_clear_drop_flag(dentry->d_parent); return error; @@ -628,6 +625,7 @@ int hmdfs_rename_local_dentry(struct inode *old_dir, struct dentry *old_dentry, struct dentry *lower_old_dir_dentry = NULL; struct dentry *lower_new_dir_dentry = NULL; struct dentry *trap = NULL; + struct renamedata rename_data; int rc = 0; kuid_t old_dir_uid, new_dir_uid; @@ -667,17 +665,14 @@ int hmdfs_rename_local_dentry(struct inode *old_dir, struct dentry *old_dentry, goto out_lock; } - struct renamedata rd = { - .old_mnt_idmap = &nop_mnt_idmap, - .old_dir = d_inode(lower_old_dir_dentry), - .old_dentry = lower_old_dentry, - .new_mnt_idmap = &nop_mnt_idmap, - .new_dir = d_inode(lower_new_dir_dentry), - .new_dentry = lower_new_dentry, - .flags = flags, - }; - - rc = vfs_rename(&rd); + rename_data.old_mnt_idmap = &nop_mnt_idmap; + rename_data.old_dir = d_inode(lower_old_dir_dentry); + rename_data.old_dentry = lower_old_dentry; + rename_data.new_mnt_idmap = &nop_mnt_idmap; + rename_data.new_dir = d_inode(lower_new_dir_dentry); + rename_data.new_dentry = lower_new_dentry; + rename_data.flags = flags; + rc = vfs_rename(&rename_data); out_lock: dget(old_dentry); @@ -755,28 +750,23 @@ int hmdfs_rename_local(struct mnt_idmap *idmap, struct inode *old_dir, struct de static bool symname_is_allowed(const char *symname) { - char *p; - char *buf = 0; - size_t symname_len; + char *p = NULL; + size_t len; - symname_len = strnlen(symname, PATH_MAX); - if (symname_len >= PATH_MAX) + len = strnlen(symname, PATH_MAX); + if (len >= PATH_MAX) return false; - buf = kzalloc(PATH_MAX + 2, GFP_KERNEL); - if (!buf) - return false; - - buf[0] = '/'; - strncpy(buf + 1, symname, symname_len); - strcat(buf, "/"); p = strstr(symname, "/../"); - if (p) { - kfree(buf); + if (p) return false; - } - 
kfree(buf); + if (len == 2u && strncmp(symname, "..", 2u) == 0) + return false; + if (len >= 3u && strncmp(symname, "../", 3u) == 0) + return false; + if (len >= 3u && strncmp(symname + len - 3u, "/..", 3u) == 0) + return false; return true; } @@ -930,11 +920,18 @@ static int hmdfs_getattr_local(struct mnt_idmap *idmap, const struct path *path, struct path lower_path; int ret; + if (path->dentry == NULL || hmdfs_d(path->dentry) == NULL) { + hmdfs_err("dentry is NULL"); + return -ENOENT; + } + hmdfs_get_lower_path(path->dentry, &lower_path); ret = vfs_getattr(&lower_path, stat, request_mask, flags); stat->ino = d_inode(path->dentry)->i_ino; stat->uid = d_inode(path->dentry)->i_uid; stat->gid = d_inode(path->dentry)->i_gid; + stat->dev = 0; + stat->rdev = 0; hmdfs_put_lower_path(&lower_path); return ret; diff --git a/fs/hmdfs/inode_merge.c b/fs/hmdfs/inode_merge.c index 3b7e477a4ff0f54091da8fd1eac558b077555ca3..2da71b9d0737d1724008186daa1cd5a5d0cd8d07 100755 --- a/fs/hmdfs/inode_merge.c +++ b/fs/hmdfs/inode_merge.c @@ -739,6 +739,10 @@ struct dentry *hmdfs_lookup_merge(struct inode *parent_inode, child_inode = fill_inode_merge(parent_inode->i_sb, parent_inode, child_dentry, NULL); + if (IS_ERR(child_inode)) { + err = PTR_ERR(child_inode); + goto out; + } info = hmdfs_i(child_inode); if (info->inode_type == HMDFS_LAYER_FIRST_MERGE) hmdfs_root_inode_perm_init(child_inode); @@ -1183,7 +1187,7 @@ int do_unlink_merge(struct inode *dir, struct dentry *dentry) mutex_lock(&dim->comrade_list_lock); list_for_each_entry(comrade, &(dim->comrade_list), list) { lo_d = comrade->lo_d; - dget(lo_d); + dget(lo_d); lo_d_dir = lock_parent(lo_d); /* lo_d could be unhashed, need to lookup again here */ lo_d_lookup = lookup_one_len(lo_d->d_name.name, lo_d_dir, @@ -1196,7 +1200,7 @@ int do_unlink_merge(struct inode *dir, struct dentry *dentry) break; } lo_i_dir = d_inode(lo_d_dir); - ret = vfs_unlink(&nop_mnt_idmap, lo_i_dir, lo_d_lookup, NULL); // lo_d GET + ret = vfs_unlink(&nop_mnt_idmap, lo_i_dir, lo_d_lookup, NULL); dput(lo_d_lookup); unlock_dir(lo_d_dir); dput(lo_d); @@ -1247,6 +1251,7 @@ int do_rename_merge(struct inode *old_dir, struct dentry *old_dentry, char *abs_path_buf = kmalloc(PATH_MAX, GFP_KERNEL); char *path_name = NULL; struct hmdfs_dentry_info_merge *pmdi = NULL; + struct renamedata rename_data; if (flags & ~RENAME_NOREPLACE) { ret = -EINVAL; @@ -1302,17 +1307,14 @@ int do_rename_merge(struct inode *old_dir, struct dentry *old_dentry, lo_d_old_dir = dget_parent(lo_d_old); lo_i_old_dir = d_inode(lo_d_old_dir); - struct renamedata rd = { - .old_mnt_idmap = &nop_mnt_idmap, - .old_dir = lo_i_old_dir, - .old_dentry = lo_d_old, - .new_mnt_idmap = &nop_mnt_idmap, - .new_dir = lo_i_new_dir, - .new_dentry = lo_d_new, - .flags = flags, - }; - - ret = vfs_rename(&rd); + rename_data.old_mnt_idmap = &nop_mnt_idmap; + rename_data.old_dir = lo_i_old_dir; + rename_data.old_dentry = lo_d_old; + rename_data.new_mnt_idmap = &nop_mnt_idmap; + rename_data.new_dir = lo_i_new_dir; + rename_data.new_dentry = lo_d_new; + rename_data.flags = flags; + ret = vfs_rename(&rename_data); new_comrade = alloc_comrade(lo_p_new.dentry, comrade->dev_id); if (IS_ERR(new_comrade)) { diff --git a/fs/hmdfs/inode_remote.c b/fs/hmdfs/inode_remote.c index 658bae037a4358b64a410dec3de2d940c0455729..900f08aeeeca370a48e556b7ea53af78810def92 100755 --- a/fs/hmdfs/inode_remote.c +++ b/fs/hmdfs/inode_remote.c @@ -827,7 +827,7 @@ int hmdfs_rename_remote(struct mnt_idmap *idmap, struct inode *old_dir, struct d rename_in_cache_file(con->device_id, 
old_dentry, new_dentry); } else if (S_ISDIR(old_dentry->d_inode->i_mode)) { - if ((con->status == NODE_STAT_ONLINE)) { + if (con->status == NODE_STAT_ONLINE) { ret = hmdfs_client_start_rename( con, relative_old_dir_path, old_dentry_d_name, relative_new_dir_path, new_dentry_d_name, diff --git a/fs/hmdfs/inode_root.c b/fs/hmdfs/inode_root.c index 91ca37bb6b7c3872533fd2d42230459359b2c13a..d1d996eef369cb04c78e1a57d44cc1a5aac461e9 100755 --- a/fs/hmdfs/inode_root.c +++ b/fs/hmdfs/inode_root.c @@ -143,7 +143,7 @@ struct dentry *hmdfs_device_lookup(struct inode *parent_inode, trace_hmdfs_device_lookup(parent_inode, child_dentry, flags); if (!strncmp(d_name, DEVICE_VIEW_LOCAL, - sizeof(DEVICE_VIEW_LOCAL) - 1)) { + sizeof(DEVICE_VIEW_LOCAL))) { err = init_hmdfs_dentry_info(sbi, child_dentry, HMDFS_LAYER_SECOND_LOCAL); if (err) { @@ -170,7 +170,7 @@ struct dentry *hmdfs_device_lookup(struct inode *parent_inode, goto out; } } else if (!strncmp(d_name, DEVICE_VIEW_CLOUD, - sizeof(DEVICE_VIEW_CLOUD) - 1)) { + sizeof(DEVICE_VIEW_CLOUD))) { err = init_hmdfs_dentry_info(sbi, child_dentry, HMDFS_LAYER_SECOND_CLOUD); if (err) { diff --git a/fs/hmdfs/main.c b/fs/hmdfs/main.c index 2718e66d3d64a46173e1da637d928708bb873f76..f08ed9fd9fb1c6aabb96ebbc630a186c633da396 100755 --- a/fs/hmdfs/main.c +++ b/fs/hmdfs/main.c @@ -948,8 +948,10 @@ static int hmdfs_fill_super(struct super_block *sb, void *data, int silent) err = -ENOMEM; goto out_sput; } - - err = init_hmdfs_dentry_info(sbi, root_dentry, HMDFS_LAYER_ZERO); + if (sbi->s_cloud_disk_switch) + err = init_hmdfs_dentry_info(sbi, root_dentry, HMDFS_LAYER_SECOND_LOCAL); + else + err = init_hmdfs_dentry_info(sbi, root_dentry, HMDFS_LAYER_ZERO); if (err) goto out_freeroot; hmdfs_set_lower_path(root_dentry, &lower_path); diff --git a/fs/hmdfs/stash.c b/fs/hmdfs/stash.c index 71ea8098aab31c99e624dd8df3b3f5c110a56566..d62c2c3ad789166f222724b50ecc97edb5499f93 100755 --- a/fs/hmdfs/stash.c +++ b/fs/hmdfs/stash.c @@ -1008,10 +1008,10 @@ static bool hmdfs_has_stash_file(struct dir_context *dctx, const char *name, err = hmdfs_parse_stash_file_name(dctx, name, namelen, d_type, &stash_inum); if (!err) - return 0; + return true; ctx->tbl->cnt++; - return 1; + return false; } static bool hmdfs_fill_stash_file(struct dir_context *dctx, const char *name, @@ -1026,13 +1026,13 @@ static bool hmdfs_fill_stash_file(struct dir_context *dctx, const char *name, err = hmdfs_parse_stash_file_name(dctx, name, namelen, d_type, &stash_inum); if (!err) - return 0; + return true; if (ctx->tbl->cnt >= ctx->tbl->max) - return 1; + return false; ctx->tbl->inodes[ctx->tbl->cnt++] = stash_inum; - return 0; + return true; } static int hmdfs_del_stash_file(struct dentry *parent, struct dentry *child) diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index 3f31baa3293f9819e271911db351456ad2a14519..e4c217c743f98bcf7469ab3a5e74eddd54c3ce08 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -422,6 +422,10 @@ struct dma_buf { * obeying fences. See enum dma_resv_usage for further descriptions. 
 */ struct dma_resv *resv; +#ifdef CONFIG_DMABUF_PROCESS_INFO + pid_t exp_pid; + char exp_task_comm[TASK_COMM_LEN]; +#endif /** @poll: for userspace poll support */ wait_queue_head_t poll; @@ -631,4 +635,16 @@ int dma_buf_vmap(struct dma_buf *dmabuf, struct iosys_map *map); void dma_buf_vunmap(struct dma_buf *dmabuf, struct iosys_map *map); int dma_buf_vmap_unlocked(struct dma_buf *dmabuf, struct iosys_map *map); void dma_buf_vunmap_unlocked(struct dma_buf *dmabuf, struct iosys_map *map); + +#ifdef CONFIG_DMABUF_PROCESS_INFO +/** + * get_dma_buf_from_file - Get struct dma_buf* from struct file* + * @f: [in] pointer to struct file, which is associated with a + * dma_buf object. + * + * If @f IS_ERR_OR_NULL, return NULL. + * If @f is not a file associated with dma_buf, return NULL. + */ +struct dma_buf *get_dma_buf_from_file(struct file *f); +#endif /* CONFIG_DMABUF_PROCESS_INFO */ #endif /* __DMA_BUF_H__ */ diff --git a/include/linux/hyperhold_inf.h b/include/linux/hyperhold_inf.h new file mode 100644 index 0000000000000000000000000000000000000000..7d2bd1e88c1ca1146cf7c106c09366b53fce12bc --- /dev/null +++ b/include/linux/hyperhold_inf.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * include/linux/hyperhold_inf.h + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. + */ + +#ifndef HYPERHOLD_INF_H +#define HYPERHOLD_INF_H + +#ifdef CONFIG_HYPERHOLD + +extern bool is_hyperhold_enable(void); + +#else + +static inline bool is_hyperhold_enable(void) +{ + return false; +} +#endif + +#endif diff --git a/include/linux/memcg_policy.h b/include/linux/memcg_policy.h new file mode 100644 index 0000000000000000000000000000000000000000..4aec2a1bb3ecf8a8e7f2c4835a524f54c09bb7de --- /dev/null +++ b/include/linux/memcg_policy.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * include/linux/memcg_policy.h + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. 
+ * + */ +#ifndef _MEMCG_POLICY_H +#define _MEMCG_POLICY_H + +struct mem_cgroup; +struct pglist_data; +struct scan_control; + + +extern struct list_head score_head; +extern bool score_head_inited; +extern rwlock_t score_list_lock; +extern struct cgroup_subsys memory_cgrp_subsys; +#ifdef CONFIG_HYPERHOLD_FILE_LRU +void shrink_anon_memcg(struct pglist_data *pgdat, + struct mem_cgroup *memcg, struct scan_control *sc, + unsigned long *nr); +bool shrink_node_hyperhold(struct pglist_data *pgdat, struct scan_control *sc); +#endif /* CONFIG_HYPERHOLD_FILE_LRU */ + +#ifdef CONFIG_HYPERHOLD_MEMCG +struct mem_cgroup *get_next_memcg(struct mem_cgroup *prev); +void get_next_memcg_break(struct mem_cgroup *memcg); +void memcg_app_score_update(struct mem_cgroup *target); + +struct memcg_reclaim { + atomic64_t app_score; + atomic64_t ub_ufs2zram_ratio; +#ifdef CONFIG_HYPERHOLD_ZSWAPD + atomic_t ub_zram2ufs_ratio; + atomic_t ub_mem2zram_ratio; + atomic_t refault_threshold; + /* anon refault */ + unsigned long long reclaimed_pagefault; +#endif +}; +#define MAX_APP_SCORE 1000 +#endif + + +#endif /* _LINUX_MEMCG_POLICY_H */ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b6eda2ab205dc7133472ba56a4f658cfaec5ff4f..3bb4599b6ab4fd0e3493068a6d2fdf90a647d42f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -21,6 +21,8 @@ #include #include #include +#include +#include struct mem_cgroup; struct obj_cgroup; @@ -58,6 +60,11 @@ struct mem_cgroup_reclaim_cookie { unsigned int generation; }; +static inline bool is_prot_page(struct page *page) +{ + return false; +} + #ifdef CONFIG_MEMCG #define MEM_CGROUP_ID_SHIFT 16 @@ -289,6 +296,13 @@ struct mem_cgroup { bool tcpmem_active; int tcpmem_pressure; +#ifdef CONFIG_HYPERHOLD_MEMCG + struct list_head score_node; +#define MEM_CGROUP_NAME_MAX_LEN 100 + char name[MEM_CGROUP_NAME_MAX_LEN]; + struct memcg_reclaim memcg_reclaimed; +#endif + #ifdef CONFIG_MEMCG_KMEM int kmemcg_id; struct obj_cgroup __rcu *objcg; @@ -707,6 +721,12 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list) void mem_cgroup_migrate(struct folio *old, struct folio *new); +static inline struct mem_cgroup_per_node *mem_cgroup_nodeinfo(struct mem_cgroup *memcg, + int nid) +{ + return memcg->nodeinfo[nid]; +} + /** * mem_cgroup_lruvec - get the lru list vector for a memcg & node * @memcg: memcg of the wanted lruvec @@ -826,6 +846,10 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) { if (mem_cgroup_disabled()) return 0; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (!memcg) + return -1; +#endif return memcg->id.id; } @@ -852,6 +876,11 @@ static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) if (mem_cgroup_disabled()) return NULL; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (is_node_lruvec(lruvec)) + return NULL; +#endif + mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); return mz->memcg; } @@ -1007,6 +1036,10 @@ static inline unsigned long lruvec_page_state(struct lruvec *lruvec, if (mem_cgroup_disabled()) return node_page_state(lruvec_pgdat(lruvec), idx); +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (is_node_lruvec(lruvec)) + return node_page_state(lruvec_pgdat(lruvec), idx); +#endif pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); x = READ_ONCE(pn->lruvec_stats.state[idx]); #ifdef CONFIG_SMP @@ -1026,6 +1059,11 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, if (mem_cgroup_disabled()) return node_page_state(lruvec_pgdat(lruvec), idx); +#ifdef 
CONFIG_HYPERHOLD_FILE_LRU + if (is_node_lruvec(lruvec)) + return node_page_state(lruvec_pgdat(lruvec), idx); +#endif + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); for_each_possible_cpu(cpu) x += per_cpu(pn->lruvec_stats_percpu->state[idx], cpu); @@ -1063,6 +1101,17 @@ static inline void mod_memcg_lruvec_state(struct lruvec *lruvec, local_irq_restore(flags); } +#ifdef CONFIG_HYPERHOLD_FILE_LRU +static __always_inline bool is_file_page(struct page *page) +{ + if (!PageUnevictable(page) && !PageSwapBacked(page) && page_mapping(page)) + return true; + + return false; + +} +#endif + void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 9fb1b03b83b2308f5489bd908f774cb83ba537e2..0c22aa8d1d114f5ccdb13824f63ebb13cbc5cb41 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1305,6 +1305,12 @@ typedef struct pglist_data { int kswapd_failures; /* Number of 'reclaimed == 0' runs */ +#ifdef CONFIG_HYPERHOLD_ZSWAPD + wait_queue_head_t zswapd_wait; + atomic_t zswapd_wait_flag; + struct task_struct *zswapd; +#endif + #ifdef CONFIG_COMPACTION int kcompactd_max_order; enum zone_type kcompactd_highest_zoneidx; @@ -1393,6 +1399,11 @@ typedef struct pglist_data { #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid)) +static inline struct lruvec *node_lruvec(struct pglist_data *pgdat) +{ + return &pgdat->__lruvec; +} + static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat) { return pgdat->node_start_pfn + pgdat->node_spanned_pages; @@ -1434,6 +1445,15 @@ static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec) #endif } +#ifdef CONFIG_HYPERHOLD_FILE_LRU +static inline int is_node_lruvec(struct lruvec *lruvec) +{ + return &lruvec_pgdat(lruvec)->__lruvec == lruvec; +} +#endif + +extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx); + #ifdef CONFIG_HAVE_MEMORYLESS_NODES int local_memory_node(int node_id); #else diff --git a/include/linux/reclaim_acct.h b/include/linux/reclaim_acct.h new file mode 100644 index 0000000000000000000000000000000000000000..5cf26f3267d1b73e6caa33fa7cd47b3257ff86f3 --- /dev/null +++ b/include/linux/reclaim_acct.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * include/linux/reclaim_acct.h + * + * Copyright (c) 2022 Huawei Technologies Co., Ltd. 
+ */ + +#ifndef _RECLAIM_ACCT_H +#define _RECLAIM_ACCT_H + +#include +#include + +/* RA is the abbreviation of reclaim accouting */ +enum reclaimacct_stubs { + RA_RECLAIM = 0, + RA_DRAINALLPAGES, + RA_SHRINKFILE, + RA_SHRINKANON, + RA_SHRINKSLAB, + NR_RA_STUBS +}; + +enum reclaim_type { + DIRECT_RECLAIMS = 0, + KSWAPD_RECLAIM, + ZSWAPD_RECLAIM, + RECLAIM_TYPES +}; + +#ifdef CONFIG_RECLAIM_ACCT +static inline bool is_system_reclaim(enum reclaim_type type) +{ + return (type == KSWAPD_RECLAIM || type == ZSWAPD_RECLAIM); +} + +void reclaimacct_tsk_init(struct task_struct *tsk); +void reclaimacct_init(void); + +void reclaimacct_start(enum reclaim_type type, struct reclaim_acct *ra); +void reclaimacct_end(enum reclaim_type type); + +void reclaimacct_substage_start(enum reclaimacct_stubs stub); +void reclaimacct_substage_end(enum reclaimacct_stubs stub, unsigned long freed, + const struct shrinker *shrinker); +#endif + +#endif /* _RECLAIM_ACCT_H */ diff --git a/include/linux/swap.h b/include/linux/swap.h index 209a425739a9f9bec472a2009b5ad9ce084d3949..060f8c84d91510bcd921e577938380261d20da90 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -436,6 +436,23 @@ extern int sysctl_min_slab_ratio; #define node_reclaim_mode 0 #endif +struct scan_control; + +extern unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, + struct lruvec *lruvec, + struct scan_control *sc); +extern bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru); +extern bool cgroup_reclaim(struct scan_control *sc); +extern void check_move_unevictable_pages(struct pagevec *pvec); +extern unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, + int priority); +extern bool writeback_throttling_sane(struct scan_control *sc); +extern inline bool should_continue_reclaim(struct pglist_data *pgdat, + unsigned long nr_reclaimed, + struct scan_control *sc); + +extern int current_may_throttle(void); + static inline bool node_reclaim_enabled(void) { /* Is any node_reclaim_mode bit set? */ @@ -468,6 +485,9 @@ extern atomic_long_t nr_swap_pages; extern long total_swap_pages; extern atomic_t nr_rotate_swap; extern bool has_usable_swap(void); +#ifdef CONFIG_HYPERHOLD_ZSWAPD +extern bool free_swap_is_low(void); +#endif /* Swap 50% full? Release swapcache more aggressively.. */ static inline bool vm_swap_full(void) diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 7f5d1caf5890e4c0cc97058640ff9e92bdc113aa..75f1bd32bebb3fb592d0b1229def6497701b9e55 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -149,6 +149,24 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_X86 DIRECT_MAP_LEVEL2_SPLIT, DIRECT_MAP_LEVEL3_SPLIT, +#endif +#ifdef CONFIG_HYPERHOLD_ZSWAPD + ZSWAPD_WAKEUP, + ZSWAPD_REFAULT, + ZSWAPD_MEDIUM_PRESS, + ZSWAPD_CRITICAL_PRESS, + ZSWAPD_MEMCG_RATIO_SKIP, + ZSWAPD_MEMCG_REFAULT_SKIP, + ZSWAPD_SWAPOUT, + ZSWAPD_EMPTY_ROUND, + ZSWAPD_EMPTY_ROUND_SKIP_TIMES, + ZSWAPD_SNAPSHOT_TIMES, + ZSWAPD_RECLAIMED, + ZSWAPD_SCANNED, +#endif +#ifdef CONFIG_HYPERHOLD_MEMCG + FREEZE_RECLAIMED, + FREEZE_RECLAIME_COUNT, #endif NR_VM_EVENT_ITEMS }; diff --git a/include/linux/zswapd.h b/include/linux/zswapd.h new file mode 100644 index 0000000000000000000000000000000000000000..3a9768a358a8eb31666cfdf58f4efe10bd4e8263 --- /dev/null +++ b/include/linux/zswapd.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * include/linux/zswapd.h + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. 
+ */ + +#ifndef _ZSWAPD_H +#define _ZSWAPD_H + +enum { + CACHE_SIZE, + SWAP_SIZE, + CACHE_PAGE, + SWAP_PAGE, + CACHE_FAULT, + SWAP_FAULT, + READ_SIZE, + WRITE_SIZE, +}; + +struct group_swap_ops { + u64 (*group_read)(u16 gid, u64 req_size, void *priv); + u64 (*group_write)(u16 gid, u64 req_size, void *priv); + u64 (*group_data_size)(u16 gid, int type, void *priv); +}; + +struct group_swap_device { + void *priv; + struct group_swap_ops *ops; + struct list_head list; +}; + +#ifdef CONFIG_HYPERHOLD_ZSWAPD +extern int zswapd_run(int nid); +extern void zswapd_stop(int nid); +extern void zswapd_status_show(struct seq_file *m); +extern void wake_all_zswapd(void); +extern void set_snapshotd_init_flag(unsigned int val); +extern pid_t get_zswapd_pid(void); +extern unsigned long long get_free_swap_threshold(void); +extern struct group_swap_device *register_group_swap(struct group_swap_ops *ops, void *priv); +extern void unregister_group_swap(struct group_swap_device *gsdev); + +#ifdef CONFIG_HYPERHOLD_DEBUG +extern void memcg_eswap_info_show(struct seq_file *m); +#endif +#endif + +#endif /* _LINUX_ZSWAPD_H */ diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index d2123dd960d59b41408310d13310b5b41a2f40cc..bef2cf6f986d868b4c5b525b3c7e2b48a509ce37 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -350,6 +350,36 @@ TRACE_EVENT(mm_vmscan_write_folio, show_reclaim_flags(__entry->reclaim_flags)) ); +#ifdef CONFIG_HYPERHOLD_ZSWAPD +TRACE_EVENT(mm_vmscan_lru_zswapd_shrink_active, + + TP_PROTO(int nid, unsigned long nr_taken, + unsigned long nr_deactivated, int priority), + + TP_ARGS(nid, nr_taken, nr_deactivated, priority), + + TP_STRUCT__entry( + __field(int, nid) + __field(unsigned long, nr_taken) + __field(unsigned long, nr_deactivated) + __field(int, priority) + ), + + TP_fast_assign( + __entry->nid = nid; + __entry->nr_taken = nr_taken; + __entry->nr_deactivated = nr_deactivated; + __entry->priority = priority; + ), + + TP_printk("nid=%d nr_taken=%ld nr_deactivated=%ld priority=%d", + __entry->nid, + __entry->nr_taken, + __entry->nr_deactivated, + __entry->priority) +); +#endif + TRACE_EVENT(mm_vmscan_lru_shrink_inactive, TP_PROTO(int nid, diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 52bb5a74a23b98f8721466126ec84a8059c6a3f7..7138facb00a5a947fa59d7902df3a47c81f70ab5 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -512,7 +512,12 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, */ cred = of->file->f_cred; tcred = get_task_cred(task); +#ifdef CONFIG_HYPERHOLD + if (!uid_eq(cred->euid, GLOBAL_MEMMGR_UID) && + !uid_eq(cred->euid, GLOBAL_ROOT_UID) && +#else if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && +#endif !uid_eq(cred->euid, tcred->uid) && !uid_eq(cred->euid, tcred->suid)) ret = -EACCES; diff --git a/mm/Kconfig b/mm/Kconfig index 281e116b5ce417add29abe3eff758100a750836e..32c4c3a107e151a87e0732b495892dd356ea3237 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -438,6 +438,41 @@ config SPARSEMEM_MANUAL endchoice +config MEMORY_MONITOR + bool "ENABLE MEMORY_MONITOR" + depends on PROC_FS + default n + help + MEMORY_MONITOR is a monitor of some memory reclaim method. + Now, kswapd wake up monitor use it. 
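(Illustrative aside, not part of the patch: the MEMORY_MONITOR option above is consumed by a userspace kswapd wake-up monitor through /proc/kswapd_monitor, which is created by mm/memory_monitor.c later in this diff. The proc node reports POLLPRI whenever kswapd_monitor_wake_up_queue() bumps the counter, so a monitor daemon can simply block in poll() and re-read the node after each wake-up. A minimal userspace sketch of that usage, assuming the option is enabled and the node exists:)

/* sketch: block until kswapd wakes the monitor, then dump the counter */
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[128];
	ssize_t n;
	int fd = open("/proc/kswapd_monitor", O_RDONLY);

	if (fd < 0)
		return 1;

	for (;;) {
		struct pollfd pfd = { .fd = fd, .events = POLLPRI };

		if (poll(&pfd, 1, -1) < 0)
			break;
		if (!(pfd.revents & POLLPRI))
			continue;

		lseek(fd, 0, SEEK_SET);
		n = read(fd, buf, sizeof(buf) - 1);
		if (n > 0) {
			buf[n] = '\0';
			fputs(buf, stdout); /* "kswapd_monitor_show kswapd_monitor <n>" */
		}
	}

	close(fd);
	return 0;
}

(Because the poll callback only advertises EPOLLIN when the counter is unchanged, polling for POLLPRI alone blocks without busy-looping until the next kswapd wake-up.)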
+ +config HYPERHOLD_FILE_LRU + bool "Enable HyperHold FILE LRU" + depends on HYPERHOLD && MEMCG + select HYPERHOLD_MEMCG + default n + help + File-LRU is a mechanism that puts file pages on a global LRU list + and anonymous pages on the memcg LRU lists (if MEMCG is enabled). + In addition, reclaim of anonymous pages and file pages is separated. + +config HYPERHOLD_MEMCG + bool "Enable Memcg Management in HyperHold" + depends on HYPERHOLD && MEMCG + help + Add more attributes to the memory cgroup. These attributes are used + to show information, shrink memory, swap pages in, and so on. + +config HYPERHOLD_ZSWAPD + bool "Enable zswapd thread to reclaim anon pages in background" + depends on HYPERHOLD && ZRAM + default n + help + zswapd is a kernel thread that reclaims anonymous pages in the + background. When swap usage reaches the watermark and the refault + rate of anonymous pages is high, a certain percentage of the zram + content is exchanged to eswap. + config SPARSEMEM def_bool y depends on (!SELECT_MEMORY_MODEL && ARCH_SPARSEMEM_ENABLE) || SPARSEMEM_MANUAL diff --git a/mm/Makefile b/mm/Makefile index 358e2333a5b8968bb8941cac6f4070183eb265cd..81eab541cde2efdf21206f48651269a77b17082f 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -139,3 +139,7 @@ obj-$(CONFIG_IO_MAPPING) += io-mapping.o obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o +obj-$(CONFIG_HYPERHOLD_FILE_LRU) += memcg_reclaim.o +obj-$(CONFIG_HYPERHOLD_MEMCG) += memcg_control.o +obj-$(CONFIG_HYPERHOLD_ZSWAPD) += zswapd.o zswapd_control.o +obj-$(CONFIG_MEMORY_MONITOR) += memory_monitor.o diff --git a/mm/internal.h b/mm/internal.h index 7920a8b7982ec3b9753f520217d16bcc0f8270e2..65a5a42b8cb6e6fac8bcbc258d02d9951328b43f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -10,8 +10,11 @@ #include #include #include +#include #include #include +#include +#include struct folio_batch; @@ -35,6 +38,130 @@ struct folio_batch; /* Do not use these with a slab allocator */ #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) +enum reclaim_invoker { + ALL, + KSWAPD, + ZSWAPD, + DIRECT_RECLAIM, + NODE_RECLAIM, + SOFT_LIMIT, + RCC_RECLAIM, + FILE_RECLAIM, + ANON_RECLAIM +}; + +struct scan_control { + /* How many pages shrink_list() should reclaim */ + unsigned long nr_to_reclaim; + + /* + * Nodemask of nodes allowed by the caller. If NULL, all nodes + * are scanned. + */ + nodemask_t *nodemask; + + /* + * The memory cgroup that hit its limit and as a result is the + * primary target of this reclaim invocation. + */ + struct mem_cgroup *target_mem_cgroup; + + /* + * Scan pressure balancing between anon and file LRUs + */ + unsigned long anon_cost; + unsigned long file_cost; + + /* Can active folios be deactivated as part of reclaim? */ +#define DEACTIVATE_ANON 1 +#define DEACTIVATE_FILE 2 + unsigned int may_deactivate:2; + unsigned int force_deactivate:1; + unsigned int skipped_deactivate:1; + + /* Writepage batching in laptop mode; RECLAIM_WRITE */ + unsigned int may_writepage:1; + + /* Can mapped folios be reclaimed? */ + unsigned int may_unmap:1; + + /* Can folios be swapped as part of reclaim? */ + unsigned int may_swap:1; + + /* Proactive reclaim invoked by userspace through memory.reclaim */ + unsigned int proactive:1; + + /* + * Cgroup memory below memory.low is protected as long as we + * don't threaten to OOM. 
If any cgroup is reclaimed at + * reduced force or passed over entirely due to its memory.low + * setting (memcg_low_skipped), and nothing is reclaimed as a + * result, then go back for one more cycle that reclaims the protected + * memory (memcg_low_reclaim) to avert OOM. + */ + unsigned int memcg_low_reclaim:1; + unsigned int memcg_low_skipped:1; + + unsigned int hibernation_mode:1; + + /* One of the zones is ready for compaction */ + unsigned int compaction_ready:1; + + /* There is easily reclaimable cold cache in the current node */ + unsigned int cache_trim_mode:1; + + /* The file folios on the current node are dangerously low */ + unsigned int file_is_tiny:1; + + /* Always discard instead of demoting to lower tier memory */ + unsigned int no_demotion:1; + + /* Allocation order */ + s8 order; + + /* Scan (total_size >> priority) pages at once */ + s8 priority; + + /* The highest zone to isolate folios for reclaim from */ + s8 reclaim_idx; + + /* This context's GFP mask */ + gfp_t gfp_mask; + + /* Incremented by the number of inactive pages that were scanned */ + unsigned long nr_scanned; + + /* Number of pages freed so far during a call to shrink_zones() */ + unsigned long nr_reclaimed; + + struct { + unsigned int dirty; + unsigned int unqueued_dirty; + unsigned int congested; + unsigned int writeback; + unsigned int immediate; + unsigned int file_taken; + unsigned int taken; + } nr; + + enum reclaim_invoker invoker; + u32 isolate_count; + unsigned long nr_scanned_anon; + unsigned long nr_scanned_file; + unsigned long nr_reclaimed_anon; + unsigned long nr_reclaimed_file; + + /* for recording the reclaimed slab by now */ + struct reclaim_state reclaim_state; +}; + +enum scan_balance { + SCAN_EQUAL, + SCAN_FRACT, + SCAN_ANON, + SCAN_FILE, +}; + /* * Different from WARN_ON_ONCE(), no warning will be issued * when we specify __GFP_NOWARN. @@ -187,11 +314,25 @@ pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr, /* * in mm/vmscan.c: */ +#ifdef CONFIG_MEMORY_MONITOR +extern void kswapd_monitor_wake_up_queue(void); +#endif bool isolate_lru_page(struct page *page); bool folio_isolate_lru(struct folio *folio); void putback_lru_page(struct page *page); void folio_putback_lru(struct folio *folio); extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason); +extern unsigned int shrink_folio_list(struct list_head *page_list, struct pglist_data *pgdat, + struct scan_control *sc, struct reclaim_stat *stat, bool ignore_references); +extern unsigned long isolate_lru_folios(unsigned long nr_to_scan, struct lruvec *lruvec, + struct list_head *dst, unsigned long *nr_scanned, struct scan_control *sc, + enum lru_list lru); +extern unsigned move_folios_to_lru(struct lruvec *lruvec, struct list_head *list); +extern void shrink_active_list(unsigned long nr_to_scan, struct lruvec *lruvec, + struct scan_control *sc, enum lru_list lru); +extern unsigned long shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, + struct scan_control *sc, enum lru_list lru); +extern void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc); /* * in mm/rmap.c: diff --git a/mm/memcg_control.c b/mm/memcg_control.c new file mode 100644 index 0000000000000000000000000000000000000000..4ca565174add4c5ec54ae12e58d916032b06b76a --- /dev/null +++ b/mm/memcg_control.c @@ -0,0 +1,488 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * mm/memcg_control.c + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. 
+ */ +#include +#include +#include +#include +#include +#include "internal.h" + +#include "zswapd_internal.h" + +#ifdef CONFIG_HYPERHOLD_MEMCG + +struct list_head score_head; +bool score_head_inited; +DEFINE_RWLOCK(score_list_lock); +DEFINE_MUTEX(reclaim_para_lock); + +/** + * get_next_memcg - iterate over memory cgroup score_list + * @prev: previously returned memcg, NULL on first invocation + * + * Returns references to the next memg on score_list of @prev, + * or %NULL after a full round-trip. + * + * Caller must pass the return value in @prev on subsequent + * invocations for reference counting, or use get_next_memcg_break() + * to cancel a walk before the round-trip is complete. + */ +struct mem_cgroup *get_next_memcg(struct mem_cgroup *prev) +{ + struct mem_cgroup *memcg = NULL; + struct list_head *pos = NULL; + unsigned long flags; + + if (unlikely(!score_head_inited)) + return NULL; + + read_lock_irqsave(&score_list_lock, flags); + + if (unlikely(!prev)) + pos = &score_head; + else + pos = &(prev->score_node); + + if (list_empty(pos)) /* deleted node */ + goto unlock; + + if (pos->next == &score_head) + goto unlock; + + memcg = list_entry(pos->next, + struct mem_cgroup, score_node); + + if (!css_tryget(&memcg->css)) + memcg = NULL; + +unlock: + read_unlock_irqrestore(&score_list_lock, flags); + + if (prev) + css_put(&prev->css); + + return memcg; +} + +void get_next_memcg_break(struct mem_cgroup *memcg) +{ + if (memcg) + css_put(&memcg->css); +} + +struct mem_cgroup *get_prev_memcg(struct mem_cgroup *next) +{ + struct mem_cgroup *memcg = NULL; + struct list_head *pos = NULL; + unsigned long flags; + + if (unlikely(!score_head_inited)) + return NULL; + + read_lock_irqsave(&score_list_lock, flags); + + if (unlikely(!next)) + pos = &score_head; + else + pos = &next->score_node; + + if (list_empty(pos)) /* deleted node */ + goto unlock; + + if (pos->prev == &score_head) + goto unlock; + + memcg = list_entry(pos->prev, + struct mem_cgroup, score_node); + + if (unlikely(!memcg)) + goto unlock; + + if (!css_tryget(&memcg->css)) + memcg = NULL; + +unlock: + read_unlock_irqrestore(&score_list_lock, flags); + + if (next) + css_put(&next->css); + return memcg; +} + +void get_prev_memcg_break(struct mem_cgroup *memcg) +{ + if (memcg) + css_put(&memcg->css); +} + +void memcg_app_score_update(struct mem_cgroup *target) +{ + struct list_head *pos = NULL; + struct list_head *tmp; + unsigned long flags; + + write_lock_irqsave(&score_list_lock, flags); + list_for_each_prev_safe(pos, tmp, &score_head) { + struct mem_cgroup *memcg = list_entry(pos, + struct mem_cgroup, score_node); + if (atomic64_read(&memcg->memcg_reclaimed.app_score) < + atomic64_read(&target->memcg_reclaimed.app_score)) + break; + } + list_move_tail(&target->score_node, pos); + write_unlock_irqrestore(&score_list_lock, flags); +} + +static u64 mem_cgroup_app_score_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + return atomic64_read(&memcg->memcg_reclaimed.app_score); +} + +static int mem_cgroup_app_score_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + if (val > MAX_APP_SCORE) + return -EINVAL; + + if (atomic64_read(&memcg->memcg_reclaimed.app_score) != val) { + atomic64_set(&memcg->memcg_reclaimed.app_score, val); + memcg_app_score_update(memcg); + } + + return 0; +} + +static unsigned long move_pages_to_page_list(struct lruvec *lruvec, enum lru_list lru, + struct list_head 
*page_list) +{ + struct list_head *src = &lruvec->lists[lru]; + unsigned long nr_isolated = 0; + struct page *page; + + while (!list_empty(src)) { + page = lru_to_page(src); + + if (PageUnevictable(page)) + continue; + + if (likely(get_page_unless_zero(page))) { + if (isolate_lru_page(page)) { + put_page(page); + continue; + } + put_page(page); + + } else { + continue; + } + + + if (PageUnevictable(page)) { + putback_lru_page(page); + continue; + } + + if (PageAnon(page) && !PageSwapBacked(page)) { + putback_lru_page(page); + continue; + } + + list_add(&page->lru, page_list); + nr_isolated++; + } + + return nr_isolated; +} + + +unsigned long reclaim_all_anon_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg) +{ + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + unsigned long nr_reclaimed; + LIST_HEAD(page_list); + struct page *page; + struct reclaim_stat stat = {}; + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .may_writepage = 1, + .may_unmap = 1, + .may_swap = 1, + }; + +#ifdef CONFIG_RECLAIM_ACCT + reclaimacct_substage_start(RA_SHRINKANON); +#endif + count_vm_event(FREEZE_RECLAIME_COUNT); + move_pages_to_page_list(lruvec, LRU_INACTIVE_ANON, &page_list); + + nr_reclaimed = shrink_folio_list(&page_list, pgdat, &sc, &stat, true); + count_vm_event(FREEZE_RECLAIMED); + + while (!list_empty(&page_list)) { + page = lru_to_page(&page_list); + list_del(&page->lru); + putback_lru_page(page); + } + +#ifdef CONFIG_RECLAIM_ACCT + reclaimacct_substage_end(RA_SHRINKANON, nr_reclaimed, NULL); +#endif + + return nr_reclaimed; +} + +static ssize_t memcg_force_shrink_anon(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + struct pglist_data *pgdat; + int nid; + + for_each_online_node(nid) { + pgdat = NODE_DATA(nid); + reclaim_all_anon_memcg(pgdat, memcg); + } + + return nbytes; +} + +static int memcg_name_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + + seq_printf(m, "%s\n", memcg->name); + return 0; +} + +static ssize_t memcg_name_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + + buf = strstrip(buf); + if (nbytes >= MEM_CGROUP_NAME_MAX_LEN) + return -EINVAL; + + mutex_lock(&reclaim_para_lock); + if (memcg) + strcpy(memcg->name, buf); + mutex_unlock(&reclaim_para_lock); + + return nbytes; +} + +static int memcg_total_info_per_app_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = NULL; + struct mem_cgroup_per_node *mz = NULL; + struct lruvec *lruvec = NULL; + unsigned long anon_size; + unsigned long zram_compress_size; + unsigned long eswap_compress_size; + + + while ((memcg = get_next_memcg(memcg))) { + mz = mem_cgroup_nodeinfo(memcg, 0); + if (!mz) { + get_next_memcg_break(memcg); + return 0; + } + + lruvec = &mz->lruvec; + if (!lruvec) { + get_next_memcg_break(memcg); + return 0; + } + + anon_size = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) + + lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES); + zram_compress_size = memcg_data_size(memcg, CACHE_SIZE); + eswap_compress_size = memcg_data_size(memcg, SWAP_SIZE); + anon_size *= PAGE_SIZE / SZ_1K; + zram_compress_size /= SZ_1K; + eswap_compress_size /= SZ_1K; + + if (!strlen(memcg->name)) + continue; + + seq_printf(m, "%s %lu %lu %lu\n", memcg->name, anon_size, + zram_compress_size, eswap_compress_size); + } + + return 0; +} + +static int 
memcg_ub_ufs2zram_ratio_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + const unsigned int ratio = 100; + + if (val > ratio) + return -EINVAL; + + atomic64_set(&memcg->memcg_reclaimed.ub_ufs2zram_ratio, val); + + return 0; +} + +static u64 memcg_ub_ufs2zram_ratio_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + return atomic64_read(&memcg->memcg_reclaimed.ub_ufs2zram_ratio); +} + +static int memcg_force_swapin_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + u64 size; + const unsigned int ratio = 100; + + size = memcg_data_size(memcg, SWAP_SIZE); + size = div_u64(atomic64_read(&memcg->memcg_reclaimed.ub_ufs2zram_ratio) * size, ratio); + + swapin_memcg(memcg, size); + + return 0; +} + +#ifdef CONFIG_MEM_PURGEABLE +static unsigned long purgeable_memcg_node(pg_data_t *pgdata, + struct scan_control *sc, struct mem_cgroup *memcg) +{ + unsigned long nr = 0; + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdata); + if (!lruvec) + return 0; + + shrink_list(LRU_ACTIVE_PURGEABLE, -1, lruvec, sc); + nr += shrink_list(LRU_INACTIVE_PURGEABLE, -1, lruvec, sc); + + pr_info("reclaim %lu purgeable pages \n", nr); + return nr; +} + +static int memcg_force_shrink_purgeable_bysize(struct cgroup_subsys_state *css, + struct cftype *cft, u64 reclaim_size) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + if (!memcg) + return 0; + + if (reclaim_size == 0) { + pr_err("reclaim_size is zero, skip shrink\n"); + return 0; + } + + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .order = 0, + .priority = DEF_PRIORITY, + .may_deactivate = DEACTIVATE_ANON, + .may_writepage = 1, + .may_unmap = 1, + .may_swap = 1, + .reclaim_idx = MAX_NR_ZONES -1, + }; + int nid = 0; + sc.nr_to_reclaim = div_u64(reclaim_size, PAGE_SIZE); + + for_each_node_state(nid, N_MEMORY) + purgeable_memcg_node(NODE_DATA(nid), &sc, memcg); + return 0; +} +#endif + +static struct cftype memcg_policy_files[] = { + { + .name = "name", + .write = memcg_name_write, + .seq_show = memcg_name_show, + }, + { + .name = "ub_ufs2zram_ratio", + .write_u64 = memcg_ub_ufs2zram_ratio_write, + .read_u64 = memcg_ub_ufs2zram_ratio_read, + }, + { + .name = "total_info_per_app", + .seq_show = memcg_total_info_per_app_show, + }, + { + .name = "app_score", + .write_u64 = mem_cgroup_app_score_write, + .read_u64 = mem_cgroup_app_score_read, + }, + { + .name = "force_shrink_anon", + .write = memcg_force_shrink_anon + }, + { + .name = "force_swapin", + .write_u64 = memcg_force_swapin_write, + }, +#ifdef CONFIG_MEM_PURGEABLE + { + .name = "force_shrink_purgeable_bysize", + .write_u64 = memcg_force_shrink_purgeable_bysize, + }, +#endif + { }, /* terminate */ +}; + +static int __init memcg_policy_init(void) +{ + if (!mem_cgroup_disabled()) + WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, + memcg_policy_files)); + + return 0; +} +subsys_initcall(memcg_policy_init); +#else +struct mem_cgroup *get_next_memcg(struct mem_cgroup *prev) +{ + return NULL; +} + +void get_next_memcg_break(struct mem_cgroup *memcg) +{ +} + + +struct mem_cgroup *get_prev_memcg(struct mem_cgroup *next) +{ + return NULL; +} + +void get_prev_memcg_break(struct mem_cgroup *memcg) +{ +} + +static u64 mem_cgroup_app_score_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return 0; +} + +static int mem_cgroup_app_score_write(struct 
cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + return 0; +} + +void memcg_app_score_update(struct mem_cgroup *target) +{ +} +#endif diff --git a/mm/memcg_reclaim.c b/mm/memcg_reclaim.c new file mode 100644 index 0000000000000000000000000000000000000000..03e47713a8cc98a23784afa1d68ca814114d364a --- /dev/null +++ b/mm/memcg_reclaim.c @@ -0,0 +1,539 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * mm/memcg_reclaim.c + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. + */ +#include +#include +#include +#include +#include + +#ifdef CONFIG_HYPERHOLD_FILE_LRU +#include +#include "internal.h" +#endif + +static inline bool is_swap_not_allowed(struct scan_control *sc, int swappiness) +{ + return !sc->may_swap || !swappiness || !get_nr_swap_pages(); +} + +/* + * From 0 .. 100. Higher means more swappy. + */ +#define HYPERHOLD_SWAPPINESS 100 + +static int get_hyperhold_swappiness(void) +{ + return is_hyperhold_enable() ? HYPERHOLD_SWAPPINESS : vm_swappiness; +} + +static void get_scan_count_hyperhold(struct pglist_data *pgdat, + struct scan_control *sc, unsigned long *nr, + unsigned long *lru_pages) +{ + int swappiness = get_hyperhold_swappiness(); + struct lruvec *lruvec = node_lruvec(pgdat); + u64 fraction[2]; + u64 denominator; + enum scan_balance scan_balance; + unsigned long ap, fp; + enum lru_list lru; + unsigned long pgdatfile; + unsigned long pgdatfree; + int z; + unsigned long anon_cost, file_cost, total_cost; + unsigned long total_high_wmark = 0; + + + if (cgroup_reclaim(sc) && !swappiness) { + scan_balance = SCAN_FILE; + goto out; + } + + /* + * Do not apply any pressure balancing cleverness when the + * system is close to OOM, scan both anon and file equally + * (unless the swappiness setting disagrees with swapping). + */ + if (!sc->priority && swappiness) { + scan_balance = SCAN_EQUAL; + goto out; + } + + if (!cgroup_reclaim(sc)) { + pgdatfree = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); + pgdatfile = node_page_state(pgdat, NR_ACTIVE_FILE) + + node_page_state(pgdat, NR_INACTIVE_FILE); + + for (z = 0; z < MAX_NR_ZONES; z++) { + struct zone *zone = &pgdat->node_zones[z]; + + if (!managed_zone(zone)) + continue; + + total_high_wmark += high_wmark_pages(zone); + } + + if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) { + /* + * Force SCAN_ANON if there are enough inactive + * anonymous pages on the LRU in eligible zones. + * Otherwise, the small LRU gets thrashed. + */ + if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON) && + (lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, + sc->reclaim_idx) >> + (unsigned int)sc->priority)) { + scan_balance = SCAN_ANON; + goto out; + } + } + } + + /* + * If there is enough inactive page cache, i.e. if the size of the + * inactive list is greater than that of the active list *and* the + * inactive list actually has some pages to scan on this priority, we + * do not reclaim anything from the anonymous working set right now. + * Without the second condition we could end up never scanning an + * lruvec even if it has plenty of old anonymous pages unless the + * system is under heavy pressure. + */ + + if (!IS_ENABLED(CONFIG_BALANCE_ANON_FILE_RECLAIM) && + !inactive_is_low(lruvec, LRU_INACTIVE_FILE) && + lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) { + scan_balance = SCAN_FILE; + goto out; + } + + scan_balance = SCAN_FRACT; + + /* + * Calculate the pressure balance between anon and file pages. 
+ * + * The amount of pressure we put on each LRU is inversely + * proportional to the cost of reclaiming each list, as + * determined by the share of pages that are refaulting, times + * the relative IO cost of bringing back a swapped out + * anonymous page vs reloading a filesystem page (swappiness). + * + * Although we limit that influence to ensure no list gets + * left behind completely: at least a third of the pressure is + * applied, before swappiness. + * + * With swappiness at 100, anon and file have equal IO cost. + */ + total_cost = sc->anon_cost + sc->file_cost; + anon_cost = total_cost + sc->anon_cost; + file_cost = total_cost + sc->file_cost; + total_cost = anon_cost + file_cost; + + ap = swappiness * (total_cost + 1); + ap /= anon_cost + 1; + + fp = (200 - swappiness) * (total_cost + 1); + fp /= file_cost + 1; + + fraction[0] = ap; + fraction[1] = fp; + denominator = ap + fp; + +out: + *lru_pages = 0; + for_each_evictable_lru(lru) { + int file = is_file_lru(lru); + unsigned long lruvec_size; + unsigned long scan; + + lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); + scan = lruvec_size; + *lru_pages += scan; + scan >>= sc->priority; + + switch (scan_balance) { + case SCAN_EQUAL: + /* Scan lists relative to size */ + break; + case SCAN_FRACT: + /* + * Scan types proportional to swappiness and + * their relative recent reclaim efficiency. + * Make sure we don't miss the last page on + * the offlined memory cgroups because of a + * round-off error. + */ + scan = DIV64_U64_ROUND_UP(scan * fraction[file], + denominator); + break; + case SCAN_FILE: + case SCAN_ANON: + /* Scan one type exclusively */ + if ((scan_balance == SCAN_FILE) != file) + scan = 0; + break; + default: + /* Look ma, no brain */ + BUG(); + } + + nr[lru] = scan; + } +} + +#define ISOLATE_LIMIT_CNT 5 +void shrink_anon_memcg(struct pglist_data *pgdat, + struct mem_cgroup *memcg, struct scan_control *sc, + unsigned long *nr) +{ + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + unsigned long nr_to_scan; + enum lru_list lru; + unsigned long nr_reclaimed = 0; + struct blk_plug plug; + + blk_start_plug(&plug); + + while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_ANON]) { + for (lru = 0; lru <= LRU_ACTIVE_ANON; lru++) { + if (nr[lru]) { + nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); + nr[lru] -= nr_to_scan; + nr_reclaimed += + shrink_list(lru, nr_to_scan, + lruvec, sc); + } + } + if (sc->nr_reclaimed >= sc->nr_to_reclaim || + (sc->isolate_count > ISOLATE_LIMIT_CNT && + sc->invoker == DIRECT_RECLAIM)) + break; + } + blk_finish_plug(&plug); + sc->nr_reclaimed += nr_reclaimed; + sc->nr_reclaimed_anon += nr_reclaimed; +} + +static inline bool memcg_is_child_of(struct mem_cgroup *mcg, struct mem_cgroup *tmcg) +{ + if (tmcg == NULL) + return true; + + while (!mem_cgroup_is_root(mcg)) { + if (mcg == tmcg) + break; + + mcg = parent_mem_cgroup(mcg); + } + + return (mcg == tmcg); +} + +static void shrink_anon(struct pglist_data *pgdat, + struct scan_control *sc, unsigned long *nr) +{ + unsigned long reclaimed; + unsigned long scanned; + struct mem_cgroup *memcg = NULL; + struct mem_cgroup *target_memcg = sc->target_mem_cgroup; + unsigned long nr_memcg[NR_LRU_LISTS]; + unsigned long nr_node_active = lruvec_lru_size( + node_lruvec(pgdat), LRU_ACTIVE_ANON, MAX_NR_ZONES); + unsigned long nr_node_inactive = lruvec_lru_size( + node_lruvec(pgdat), LRU_INACTIVE_ANON, MAX_NR_ZONES); + + while ((memcg = get_next_memcg(memcg))) { + struct lruvec *lruvec = NULL; + + if (!memcg_is_child_of(memcg, target_memcg)) + continue; 
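+		/* Note on the scaling below: nr[] carries the node-wide anon scan targets; each memcg is given a slice proportional to its share of the node's active/inactive anon LRUs, and the "+ 1" in the divisor avoids a division by zero on empty node lists. */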
+ + lruvec = mem_cgroup_lruvec(memcg, pgdat); + + reclaimed = sc->nr_reclaimed; + scanned = sc->nr_scanned; + + nr_memcg[LRU_ACTIVE_ANON] = nr[LRU_ACTIVE_ANON] * + lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, + MAX_NR_ZONES) / (nr_node_active + 1); + nr_memcg[LRU_INACTIVE_ANON] = nr[LRU_INACTIVE_ANON] * + lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, + MAX_NR_ZONES) / (nr_node_inactive + 1); + nr_memcg[LRU_ACTIVE_FILE] = 0; + nr_memcg[LRU_INACTIVE_FILE] = 0; + + /* + * This loop can become CPU-bound when target memcgs + * aren't eligible for reclaim - either because they + * don't have any reclaimable pages, or because their + * memory is explicitly protected. Avoid soft lockups. + */ + cond_resched(); + + mem_cgroup_calculate_protection(target_memcg, memcg); + + if (mem_cgroup_below_min(target_memcg, memcg)) { + /* + * Hard protection. + * If there is no reclaimable memory, OOM. + */ + continue; + } else if (mem_cgroup_below_low(target_memcg, memcg)) { + /* + * Soft protection. + * Respect the protection only as long as + * there is an unprotected supply + * of reclaimable memory from other cgroups. + */ + if (!sc->memcg_low_reclaim) { + sc->memcg_low_skipped = 1; + continue; + } + memcg_memory_event(memcg, MEMCG_LOW); + } + + shrink_anon_memcg(pgdat, memcg, sc, nr_memcg); + shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, + sc->priority); + + vmpressure(sc->gfp_mask, memcg, false, + sc->nr_scanned - scanned, + sc->nr_reclaimed - reclaimed); + + if (sc->nr_reclaimed >= sc->nr_to_reclaim || + (sc->isolate_count > ISOLATE_LIMIT_CNT && + sc->invoker == DIRECT_RECLAIM)) { + get_next_memcg_break(memcg); + break; + } + } +} + +static void shrink_file(struct pglist_data *pgdat, + struct scan_control *sc, unsigned long *nr) +{ + struct lruvec *lruvec = node_lruvec(pgdat); + unsigned long nr_to_scan; + enum lru_list lru; + unsigned long nr_reclaimed = 0; + struct blk_plug plug; + + blk_start_plug(&plug); + + while (nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { + for (lru = LRU_INACTIVE_FILE; lru <= LRU_ACTIVE_FILE; lru++) { + if (nr[lru]) { + nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); + nr[lru] -= nr_to_scan; + nr_reclaimed += + shrink_list(lru, + nr_to_scan, + lruvec, sc); + } + } + } + blk_finish_plug(&plug); + sc->nr_reclaimed += nr_reclaimed; + sc->nr_reclaimed_file += nr_reclaimed; +} + +bool shrink_node_hyperhold(struct pglist_data *pgdat, struct scan_control *sc) +{ + unsigned long nr_reclaimed; + struct lruvec *target_lruvec; + bool reclaimable = false; + unsigned long file; + + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); + do { + /* Get scan count for file and anon */ + unsigned long node_lru_pages = 0; + unsigned long nr[NR_LRU_LISTS] = {0}; + + memset(&sc->nr, 0, sizeof(sc->nr)); + nr_reclaimed = sc->nr_reclaimed; + + /* + * Determine the scan balance between anon and file LRUs. + */ + spin_lock_irq(&target_lruvec->lru_lock); + sc->anon_cost = mem_cgroup_lruvec(NULL, pgdat)->anon_cost; + sc->file_cost = node_lruvec(pgdat)->file_cost; + spin_unlock_irq(&target_lruvec->lru_lock); + + /* + * Target desirable inactive:active list ratios for the anon + * and file LRU lists. 
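+		 * (Under CONFIG_HYPERHOLD_FILE_LRU, the file-side refault signal below is read from the node-level lruvec rather than the target memcg lruvec.)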
+ */ + if (!sc->force_deactivate) { + unsigned long refaults; + + refaults = lruvec_page_state(target_lruvec, + WORKINGSET_ACTIVATE_ANON); + if (refaults != target_lruvec->refaults[0] || + inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) + sc->may_deactivate |= DEACTIVATE_ANON; + else + sc->may_deactivate &= ~DEACTIVATE_ANON; + + /* + * When refaults are being observed, it means a new + * workingset is being established. Deactivate to get + * rid of any stale active pages quickly. + */ +#ifdef CONFIG_HYPERHOLD_FILE_LRU + refaults = lruvec_page_state(node_lruvec(pgdat), + WORKINGSET_ACTIVATE_FILE); + if (refaults != node_lruvec(pgdat)->refaults[1] || + inactive_is_low(node_lruvec(pgdat), LRU_INACTIVE_FILE)) + sc->may_deactivate |= DEACTIVATE_FILE; +#else + refaults = lruvec_page_state(target_lruvec, + WORKINGSET_ACTIVATE_FILE); + if (refaults != target_lruvec->refaults[1] || + inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) + sc->may_deactivate |= DEACTIVATE_FILE; +#endif + else + sc->may_deactivate &= ~DEACTIVATE_FILE; + } else + sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; + + /* + * If we have plenty of inactive file pages that aren't + * thrashing, try to reclaim those first before touching + * anonymous pages. + */ +#ifdef CONFIG_HYPERHOLD_FILE_LRU + file = lruvec_page_state(node_lruvec(pgdat), NR_INACTIVE_FILE); +#else + file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); +#endif + if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) + sc->cache_trim_mode = 1; + else + sc->cache_trim_mode = 0; + + /* + * Prevent the reclaimer from falling into the cache trap: as + * cache pages start out inactive, every cache fault will tip + * the scan balance towards the file LRU. And as the file LRU + * shrinks, so does the window for rotation from references. + * This means we have a runaway feedback loop where a tiny + * thrashing file LRU becomes infinitely more attractive than + * anon pages. Try to detect this based on file LRU size. + */ + if (!cgroup_reclaim(sc)) { + unsigned long total_high_wmark = 0; + unsigned long free, anon; + int z; + + free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); + file = node_page_state(pgdat, NR_ACTIVE_FILE) + + node_page_state(pgdat, NR_INACTIVE_FILE); + + for (z = 0; z < MAX_NR_ZONES; z++) { + struct zone *zone = &pgdat->node_zones[z]; + + if (!managed_zone(zone)) + continue; + + total_high_wmark += high_wmark_pages(zone); + } + + /* + * Consider anon: if that's low too, this isn't a + * runaway file reclaim problem, but rather just + * extreme pressure. Reclaim as per usual then. + */ + anon = node_page_state(pgdat, NR_INACTIVE_ANON); + + sc->file_is_tiny = + file + free <= total_high_wmark && + !(sc->may_deactivate & DEACTIVATE_ANON) && + anon >> sc->priority; + } + + get_scan_count_hyperhold(pgdat, sc, nr, &node_lru_pages); + + if (!cgroup_reclaim(sc)) { + /* Shrink the Total-File-LRU */ + shrink_file(pgdat, sc, nr); + } + + /* Shrink Anon by iterating score_list */ + shrink_anon(pgdat, sc, nr); + + if (sc->nr_reclaimed - nr_reclaimed) + reclaimable = true; + + if (current_is_kswapd()) { + /* + * If reclaim is isolating dirty pages under writeback, + * it implies that the long-lived page allocation rate + * is exceeding the page laundering rate. Either the + * global limits are not being effective at throttling + * processes due to the page distribution throughout + * zones or there is heavy usage of a slow backing + * device. 
The only option is to throttle from reclaim + * context which is not ideal as there is no guarantee + * the dirtying process is throttled in the same way + * balance_dirty_pages() manages. + * + * Once a node is flagged PGDAT_WRITEBACK, kswapd will + * count the number of pages under pages flagged for + * immediate reclaim and stall if any are encountered + * in the nr_immediate check below. + */ + if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken) + set_bit(PGDAT_WRITEBACK, &pgdat->flags); + + /* Allow kswapd to start writing pages during reclaim. */ + if (sc->nr.unqueued_dirty == sc->nr.file_taken) + set_bit(PGDAT_DIRTY, &pgdat->flags); + + /* + * If kswapd scans pages marked for immediate + * reclaim and under writeback (nr_immediate), it + * implies that pages are cycling through the LRU + * faster than they are written so also forcibly stall. + */ + if (sc->nr.immediate) + reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); + } + /* + * Legacy memcg will stall in page writeback so avoid forcibly + * stalling in reclaim_throttle(). + */ + if ((current_is_kswapd() || + (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) && + sc->nr.dirty && sc->nr.dirty == sc->nr.congested) + set_bit(LRUVEC_CONGESTED, &target_lruvec->flags); + + /* + * Stall direct reclaim for IO completions if underlying BDIs + * and node is congested. Allow kswapd to continue until it + * starts encountering unqueued dirty pages or cycling through + * the LRU too quickly. + */ + if (!current_is_kswapd() && current_may_throttle() && + !sc->hibernation_mode && + test_bit(LRUVEC_CONGESTED, &target_lruvec->flags)) + reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); + + } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, + sc)); + /* + * Kswapd gives up on balancing particular nodes after too + * many failures to reclaim anything from them and goes to + * sleep. On reclaim progress, reset the failure counter. A + * successful direct reclaim run will revive a dormant kswapd. + */ + if (reclaimable) + pgdat->kswapd_failures = 0; + + return reclaimable; +} diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5abffe6f8389e27a705068e028dee875c91efa91..15b5bf8bbc2d317b92e7d4a6bc840a0667599db2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -70,6 +70,7 @@ #include "swap.h" #include +#include #include @@ -86,7 +87,7 @@ EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg); static bool cgroup_memory_nosocket __ro_after_init; /* Kernel memory accounting disabled? */ -static bool cgroup_memory_nokmem __ro_after_init; +static bool cgroup_memory_nokmem = true; /* BPF memory accounting disabled? 
*/ static bool cgroup_memory_nobpf __ro_after_init; @@ -465,7 +466,15 @@ static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz, static unsigned long soft_limit_excess(struct mem_cgroup *memcg) { +#ifdef CONFIG_HYPERHOLD_FILE_LRU + struct mem_cgroup_per_node *mz = mem_cgroup_nodeinfo(memcg, 0); + struct lruvec *lruvec = &mz->lruvec; + unsigned long nr_pages = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, + MAX_NR_ZONES) + lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, + MAX_NR_ZONES); +#else unsigned long nr_pages = page_counter_read(&memcg->memory); +#endif unsigned long soft_limit = READ_ONCE(memcg->soft_limit); unsigned long excess = 0; @@ -829,8 +838,13 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); /* Update memcg and lruvec */ - if (!mem_cgroup_disabled()) + if (!mem_cgroup_disabled()) { +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (is_node_lruvec(lruvec)) + return; +#endif __mod_memcg_lruvec_state(lruvec, idx, val); + } } void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx, @@ -841,6 +855,13 @@ void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx, pg_data_t *pgdat = page_pgdat(page); struct lruvec *lruvec; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (is_file_page(page) && !is_prot_page(page)) { + __mod_node_page_state(pgdat, idx, val); + return; + } +#endif + rcu_read_lock(); memcg = page_memcg(head); /* Untracked pages have no memcg, no lruvec. Update only the node */ @@ -893,6 +914,10 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, if (mem_cgroup_disabled() || index < 0) return; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (!memcg) + return; +#endif memcg_stats_lock(); __this_cpu_add(memcg->vmstats_percpu->events[index], count); @@ -1380,6 +1405,11 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, if (mem_cgroup_disabled()) return; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (is_node_lruvec(lruvec)) + return; +#endif + mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); lru_size = &mz->lru_zone_size[zid][lru]; @@ -5191,6 +5221,10 @@ static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) struct mem_cgroup *mem_cgroup_from_id(unsigned short id) { WARN_ON_ONCE(!rcu_read_lock_held()); +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (id == -1) + return NULL; +#endif return idr_find(&mem_cgroup_idr, id); } @@ -5233,6 +5267,9 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) } lruvec_init(&pn->lruvec); +#if defined(CONFIG_HYPERHOLD_FILE_LRU) && defined(CONFIG_MEMCG) + pn->lruvec.pgdat = NODE_DATA(node); +#endif pn->memcg = memcg; memcg->nodeinfo[node] = pn; @@ -5326,6 +5363,16 @@ static struct mem_cgroup *mem_cgroup_alloc(void) memcg->deferred_split_queue.split_queue_len = 0; #endif idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); +#ifdef CONFIG_HYPERHOLD_MEMCG + if (unlikely(!score_head_inited)) { + INIT_LIST_HEAD(&score_head); + score_head_inited = true; + } +#endif + +#ifdef CONFIG_HYPERHOLD_MEMCG + INIT_LIST_HEAD(&memcg->score_node); +#endif lru_gen_init_memcg(memcg); return memcg; fail: @@ -5346,6 +5393,14 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (IS_ERR(memcg)) return ERR_CAST(memcg); +#ifdef CONFIG_HYPERHOLD_MEMCG + atomic64_set(&memcg->memcg_reclaimed.app_score, 300); +#endif +#ifdef CONFIG_HYPERHOLD_ZSWAPD + atomic_set(&memcg->memcg_reclaimed.ub_zram2ufs_ratio, 10); + atomic_set(&memcg->memcg_reclaimed.ub_mem2zram_ratio, 60); + 
atomic_set(&memcg->memcg_reclaimed.refault_threshold, 50); +#endif page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); memcg->soft_limit = PAGE_COUNTER_MAX; #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) @@ -5397,6 +5452,11 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) if (alloc_shrinker_info(memcg)) goto offline_kmem; +#ifdef CONFIG_HYPERHOLD_MEMCG + memcg_app_score_update(memcg); + css_get(css); +#endif + /* Online state pins memcg ID, memcg ID pins CSS */ refcount_set(&memcg->id.ref, 1); css_get(css); @@ -5418,6 +5478,15 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_event *event, *tmp; +#ifdef CONFIG_HYPERHOLD_MEMCG + unsigned long flags; + + write_lock_irqsave(&score_list_lock, flags); + list_del_init(&memcg->score_node); + write_unlock_irqrestore(&score_list_lock, flags); + css_put(css); +#endif + /* * Unregister events and notify userspace. * Notify userspace about cgroup removing only after rmdir of cgroup @@ -6579,6 +6648,9 @@ static int memory_stat_show(struct seq_file *m, void *v) memory_stat_format(memcg, buf, PAGE_SIZE); seq_puts(m, buf); kfree(buf); +#ifdef CONFIG_HYPERHOLD_DEBUG + memcg_eswap_info_show(m); +#endif return 0; } @@ -7325,6 +7397,8 @@ static int __init cgroup_memory(char *s) cgroup_memory_nokmem = true; if (!strcmp(token, "nobpf")) cgroup_memory_nobpf = true; + if (!strcmp(token, "kmem")) + cgroup_memory_nokmem = false; } return 1; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index db3b270254f1ed6dd09d730f24127d986a5de0b0..e82004895bd8e92d75968b5e254bedc368c713df 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -36,6 +36,7 @@ #include #include #include +#include #include @@ -1144,6 +1145,9 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, kswapd_run(nid); kcompactd_run(nid); +#ifdef CONFIG_HYPERHOLD_ZSWAPD + zswapd_run(nid); +#endif writeback_set_ratelimit(); @@ -1938,6 +1942,9 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, if (arg.status_change_nid >= 0) { kcompactd_stop(node); kswapd_stop(node); +#ifdef CONFIG_HYPERHOLD_ZSWAPD + zswapd_stop(node); +#endif } writeback_set_ratelimit(); diff --git a/mm/memory_monitor.c b/mm/memory_monitor.c new file mode 100644 index 0000000000000000000000000000000000000000..88fb97466b247eba470a1125ac74418f0c9d7cb2 --- /dev/null +++ b/mm/memory_monitor.c @@ -0,0 +1,58 @@ +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +static atomic_t kswapd_monitor = ATOMIC_INIT(0); +static DECLARE_WAIT_QUEUE_HEAD(kswapd_poll_wait); + +void kswapd_monitor_wake_up_queue(void) +{ + atomic_inc(&kswapd_monitor); + wake_up_interruptible(&kswapd_poll_wait); +} + +static __poll_t kswapd_monitor_poll(struct file *file, struct poll_table_struct *wait) +{ + struct seq_file *seq = file->private_data; + + poll_wait(file, &kswapd_poll_wait, wait); + + if (seq->poll_event != atomic_read(&kswapd_monitor)) { + seq->poll_event = atomic_read(&kswapd_monitor); + return EPOLLPRI; + } + + return EPOLLIN | EPOLLRDNORM; +} + +static int kswapd_monitor_show(struct seq_file *m, void *v) +{ + seq_printf(m, "kswapd_monitor_show kswapd_monitor %d\n", atomic_read(&kswapd_monitor)); + return 0; +} + +static int kswapd_monitor_open(struct inode *inode, struct file *file) +{ + return single_open(file, kswapd_monitor_show, NULL); +} + +static const struct proc_ops proc_kswapd_monitor_operations = { + .proc_open = kswapd_monitor_open, + 
.proc_poll = kswapd_monitor_poll, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, +}; + +static int __init memory_monitor_init(void) +{ + proc_create("kswapd_monitor", 0, NULL, &proc_kswapd_monitor_operations); + return 0; +} + +__initcall(memory_monitor_init) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8e39705c7bdc29f7669fae143af05100b0d93a61..1277c417701a46bd743fae7e70ba6d83f5c61cee 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -79,6 +79,10 @@ #include #include #include +#include +#ifdef CONFIG_RECLAIM_ACCT +#include +#endif #include "internal.h" #include "shuffle.h" #include "page_reporting.h" @@ -5362,6 +5366,11 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, might_alloc(gfp_mask); +#ifdef CONFIG_HYPERHOLD_ZSWAPD + if (gfp_mask & __GFP_KSWAPD_RECLAIM) + wake_all_zswapd(); +#endif + if (should_fail_alloc_page(gfp_mask, order)) return false; @@ -7772,12 +7781,18 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat) init_waitqueue_head(&pgdat->kswapd_wait); init_waitqueue_head(&pgdat->pfmemalloc_wait); +#ifdef CONFIG_HYPERHOLD_ZSWAPD + init_waitqueue_head(&pgdat->zswapd_wait); +#endif for (i = 0; i < NR_VMSCAN_THROTTLE; i++) init_waitqueue_head(&pgdat->reclaim_wait[i]); pgdat_page_ext_init(pgdat); lruvec_init(&pgdat->__lruvec); +#if defined(CONFIG_HYPERHOLD_FILE_LRU) && defined(CONFIG_MEMCG) + pgdat->__lruvec.pgdat = pgdat; +#endif } static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid, diff --git a/mm/swap.c b/mm/swap.c index 423199ee8478c19542126c029c7dbd2f7a4db6bc..ca861ede90117c526b1f9c49711e28237d10cb03 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -320,6 +320,13 @@ void lru_note_cost(struct lruvec *lruvec, bool file, void lru_note_cost_refault(struct folio *folio) { +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (page_is_file_lru(folio_page(folio, 0))) { + lru_note_cost(&(folio_pgdat(folio)->__lruvec), 1, folio_nr_pages(folio), 0); + return; + } +#endif + lru_note_cost(folio_lruvec(folio), folio_is_file_lru(folio), folio_nr_pages(folio), 0); } diff --git a/mm/swapfile.c b/mm/swapfile.c index 2c718f45745f8c87fdac889fa3d255ffa58b774a..cf9a9cf6bd98476b8c28d5d0b1c0c01d774835e9 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -45,6 +45,7 @@ #include #include #include +#include #include "swap.h" static bool swap_count_continued(struct swap_info_struct *, pgoff_t, @@ -3268,6 +3269,28 @@ void si_swapinfo(struct sysinfo *val) spin_unlock(&swap_lock); } +#ifdef CONFIG_HYPERHOLD_ZSWAPD +bool free_swap_is_low(void) +{ + unsigned int type; + unsigned long long freeswap = 0; + unsigned long nr_to_be_unused = 0; + + spin_lock(&swap_lock); + for (type = 0; type < nr_swapfiles; type++) { + struct swap_info_struct *si = swap_info[type]; + + if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) + nr_to_be_unused += si->inuse_pages; + } + freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused; + spin_unlock(&swap_lock); + + return (freeswap < get_free_swap_threshold()); +} +EXPORT_SYMBOL(free_swap_is_low); +#endif + /* * Verify that a swap entry is valid and increment its swap map count. 
* diff --git a/mm/vmscan.c b/mm/vmscan.c index 2bb7ce0a934a7b08d908009de7cb5d91b8324c22..92cc71da36066b2baaa3cf9fbf82e286b291d51e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -71,103 +71,12 @@ #define CREATE_TRACE_POINTS #include -struct scan_control { - /* How many pages shrink_list() should reclaim */ - unsigned long nr_to_reclaim; - - /* - * Nodemask of nodes allowed by the caller. If NULL, all nodes - * are scanned. - */ - nodemask_t *nodemask; - - /* - * The memory cgroup that hit its limit and as a result is the - * primary target of this reclaim invocation. - */ - struct mem_cgroup *target_mem_cgroup; - - /* - * Scan pressure balancing between anon and file LRUs - */ - unsigned long anon_cost; - unsigned long file_cost; - - /* Can active folios be deactivated as part of reclaim? */ -#define DEACTIVATE_ANON 1 -#define DEACTIVATE_FILE 2 - unsigned int may_deactivate:2; - unsigned int force_deactivate:1; - unsigned int skipped_deactivate:1; - - /* Writepage batching in laptop mode; RECLAIM_WRITE */ - unsigned int may_writepage:1; - - /* Can mapped folios be reclaimed? */ - unsigned int may_unmap:1; - - /* Can folios be swapped as part of reclaim? */ - unsigned int may_swap:1; - - /* Proactive reclaim invoked by userspace through memory.reclaim */ - unsigned int proactive:1; - - /* - * Cgroup memory below memory.low is protected as long as we - * don't threaten to OOM. If any cgroup is reclaimed at - * reduced force or passed over entirely due to its memory.low - * setting (memcg_low_skipped), and nothing is reclaimed as a - * result, then go back for one more cycle that reclaims the protected - * memory (memcg_low_reclaim) to avert OOM. - */ - unsigned int memcg_low_reclaim:1; - unsigned int memcg_low_skipped:1; - - unsigned int hibernation_mode:1; - - /* One of the zones is ready for compaction */ - unsigned int compaction_ready:1; - - /* There is easily reclaimable cold cache in the current node */ - unsigned int cache_trim_mode:1; - - /* The file folios on the current node are dangerously low */ - unsigned int file_is_tiny:1; - - /* Always discard instead of demoting to lower tier memory */ - unsigned int no_demotion:1; - - /* Allocation order */ - s8 order; - - /* Scan (total_size >> priority) pages at once */ - s8 priority; - - /* The highest zone to isolate folios for reclaim from */ - s8 reclaim_idx; - - /* This context's GFP mask */ - gfp_t gfp_mask; - - /* Incremented by the number of inactive pages that were scanned */ - unsigned long nr_scanned; - - /* Number of pages freed so far during a call to shrink_zones() */ - unsigned long nr_reclaimed; - - struct { - unsigned int dirty; - unsigned int unqueued_dirty; - unsigned int congested; - unsigned int writeback; - unsigned int immediate; - unsigned int file_taken; - unsigned int taken; - } nr; - - /* for recording the reclaimed slab by now */ - struct reclaim_state reclaim_state; -}; +#ifdef CONFIG_HYPERHOLD_FILE_LRU +#include +#endif +#ifdef CONFIG_RECLAIM_ACCT +#include +#endif #ifdef ARCH_HAS_PREFETCHW #define prefetchw_prev_lru_folio(_folio, _base, _field) \ @@ -183,6 +92,10 @@ struct scan_control { #define prefetchw_prev_lru_folio(_folio, _base, _field) do { } while (0) #endif +#ifdef CONFIG_HYPERHOLD_FILE_LRU +unsigned int enough_inactive_file = 1; +#endif + /* * From 0 .. 200. Higher means more swappy. 
*/ @@ -440,7 +353,8 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) up_read(&shrinker_rwsem); } -static bool cgroup_reclaim(struct scan_control *sc) +/* Returns true for reclaim through cgroup limits or cgroup interfaces. */ +bool cgroup_reclaim(struct scan_control *sc) { return sc->target_mem_cgroup; } @@ -463,7 +377,7 @@ static bool global_reclaim(struct scan_control *sc) * This function tests whether the vmscan currently in progress can assume * that the normal dirty throttling mechanism is operational. */ -static bool writeback_throttling_sane(struct scan_control *sc) +bool writeback_throttling_sane(struct scan_control *sc) { if (!cgroup_reclaim(sc)) return true; @@ -495,7 +409,7 @@ static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, return 0; } -static bool cgroup_reclaim(struct scan_control *sc) +bool cgroup_reclaim(struct scan_control *sc) { return false; } @@ -505,7 +419,7 @@ static bool global_reclaim(struct scan_control *sc) return true; } -static bool writeback_throttling_sane(struct scan_control *sc) +bool writeback_throttling_sane(struct scan_control *sc) { return true; } @@ -605,12 +519,27 @@ unsigned long zone_reclaimable_pages(struct zone *zone) * @lru: lru to use * @zone_idx: zones to consider (use MAX_NR_ZONES - 1 for the whole LRU list) */ -static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, +unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) { unsigned long size = 0; int zid; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (!mem_cgroup_disabled() && is_node_lruvec(lruvec)) { + for (zid = 0; zid <= zone_idx && zid < MAX_NR_ZONES; zid++) { + struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid]; + + if (!managed_zone(zone)) + continue; + + size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru); + } + + return size; + } +#endif + for (zid = 0; zid <= zone_idx; zid++) { struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid]; @@ -983,7 +912,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, * * Returns the number of reclaimed slab objects. */ -static unsigned long shrink_slab(gfp_t gfp_mask, int nid, +unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority) { @@ -1654,7 +1583,7 @@ static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask) /* * shrink_folio_list() returns the number of reclaimed pages */ -static unsigned int shrink_folio_list(struct list_head *folio_list, +unsigned int shrink_folio_list(struct list_head *folio_list, struct pglist_data *pgdat, struct scan_control *sc, struct reclaim_stat *stat, bool ignore_references) { @@ -2234,7 +2163,7 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, * * returns how many pages were moved onto *@dst. */ -static unsigned long isolate_lru_folios(unsigned long nr_to_scan, +unsigned long isolate_lru_folios(unsigned long nr_to_scan, struct lruvec *lruvec, struct list_head *dst, unsigned long *nr_scanned, struct scan_control *sc, enum lru_list lru) @@ -2419,11 +2348,15 @@ static int too_many_isolated(struct pglist_data *pgdat, int file, * * Returns the number of pages moved to the given lruvec. 
*/ -static unsigned int move_folios_to_lru(struct lruvec *lruvec, +unsigned int move_folios_to_lru(struct lruvec *lruvec, struct list_head *list) { int nr_pages, nr_moved = 0; LIST_HEAD(folios_to_free); +#ifdef CONFIG_HYPERHOLD_FILE_LRU + bool prot; + bool file; +#endif while (!list_empty(list)) { struct folio *folio = lru_to_folio(list); @@ -2471,8 +2404,23 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec, lruvec_add_folio(lruvec, folio); nr_pages = folio_nr_pages(folio); nr_moved += nr_pages; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (folio_test_active(folio)) { + prot = is_prot_page(folio_page(folio, 0)); + file = page_is_file_lru(folio_page(folio, 0)); + if (!prot && file) { + lruvec = folio_lruvec(folio); + workingset_age_nonresident(lruvec, + nr_pages); + } else { + workingset_age_nonresident(lruvec, + nr_pages); + } + } +#else if (folio_test_active(folio)) workingset_age_nonresident(lruvec, nr_pages); +#endif } /* @@ -2488,7 +2436,7 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec, * device by writing to the page cache it sets PF_LOCAL_THROTTLE. In this case * we should not throttle. Otherwise it is safe to do so. */ -static int current_may_throttle(void) +int current_may_throttle(void) { return !(current->flags & PF_LOCAL_THROTTLE); } @@ -2497,7 +2445,7 @@ static int current_may_throttle(void) * shrink_inactive_list() is a helper for shrink_node(). It returns the number * of reclaimed pages */ -static unsigned long shrink_inactive_list(unsigned long nr_to_scan, +unsigned long shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, enum lru_list lru) { @@ -2515,6 +2463,9 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, if (stalled) return 0; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + sc->isolate_count++; +#endif /* wait a bit for the reclaimer. */ stalled = true; reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED); @@ -2556,7 +2507,14 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed); spin_unlock_irq(&lruvec->lru_lock); +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (file) + lru_note_cost(node_lruvec(pgdat), file, stat.nr_pageout, nr_scanned - nr_reclaimed); + else + lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed); +#else lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed); +#endif mem_cgroup_uncharge_list(&folio_list); free_unref_page_list(&folio_list); @@ -2617,7 +2575,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, * The downside is that we have to touch folio->_refcount against each folio. * But we had to alter folio->flags anyway. 
*/ -static void shrink_active_list(unsigned long nr_to_scan, +void shrink_active_list(unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, enum lru_list lru) @@ -2773,7 +2731,7 @@ unsigned long reclaim_pages(struct list_head *folio_list) return nr_reclaimed; } -static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, +unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc) { if (is_active_lru(lru)) { @@ -2815,7 +2773,7 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, * 1TB 101 10GB * 10TB 320 32GB */ -static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru) +bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru) { enum lru_list active_lru = inactive_lru + LRU_ACTIVE; unsigned long inactive, active; @@ -2834,13 +2792,6 @@ static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru) return inactive * inactive_ratio < active; } -enum scan_balance { - SCAN_EQUAL, - SCAN_FRACT, - SCAN_ANON, - SCAN_FILE, -}; - static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc) { unsigned long file; @@ -5429,6 +5380,7 @@ static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) goto restart; } +#ifndef CONFIG_HYPERHOLD_FILE_LRU static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { struct blk_plug plug; @@ -5449,6 +5401,7 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc blk_finish_plug(&plug); } +#endif #else /* !CONFIG_MEMCG */ @@ -5457,10 +5410,12 @@ static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) BUILD_BUG(); } +#ifndef CONFIG_HYPERHOLD_FILE_LRU static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { BUILD_BUG(); } +#endif #endif @@ -6211,7 +6166,7 @@ static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control * #endif /* CONFIG_LRU_GEN */ -static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { unsigned long nr[NR_LRU_LISTS]; unsigned long targets[NR_LRU_LISTS]; @@ -6400,6 +6355,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, return inactive_lru_pages > pages_for_compaction; } +#ifndef CONFIG_HYPERHOLD_FILE_LRU static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) { struct mem_cgroup *target_memcg = sc->target_mem_cgroup; @@ -6568,6 +6524,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) if (reclaimable) pgdat->kswapd_failures = 0; } +#endif /* * Returns true if compaction should go ahead for a costly-order request, or @@ -6718,7 +6675,11 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) if (zone->zone_pgdat == last_pgdat) continue; last_pgdat = zone->zone_pgdat; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + shrink_node_hyperhold(zone->zone_pgdat, sc); +#else shrink_node(zone->zone_pgdat, sc); +#endif } if (first_pgdat) @@ -6735,10 +6696,19 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat) { struct lruvec *target_lruvec; unsigned long refaults; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + struct lruvec *lruvec; +#endif if (lru_gen_enabled()) return; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + lruvec = node_lruvec(pgdat); + lruvec->refaults[0] = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE_ANON); /* modified */ + lruvec->refaults[1] = 
lruvec_page_state(lruvec, WORKINGSET_ACTIVATE_FILE); /* modified */ +#endif + target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); target_lruvec->refaults[WORKINGSET_ANON] = refaults; @@ -7040,6 +7010,9 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, .reclaim_idx = MAX_NR_ZONES - 1, .may_swap = !noswap, }; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + unsigned long nr[NR_LRU_LISTS]; +#endif WARN_ON_ONCE(!current->reclaim_state); @@ -7056,7 +7029,17 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, * will pick up pages from other mem cgroup's as well. We hack * the priority and make it zero. */ +#ifdef CONFIG_HYPERHOLD_FILE_LRU + nr[LRU_ACTIVE_ANON] = lruvec_lru_size(lruvec, + LRU_ACTIVE_ANON, MAX_NR_ZONES); + nr[LRU_INACTIVE_ANON] = lruvec_lru_size(lruvec, + LRU_INACTIVE_ANON, MAX_NR_ZONES); + nr[LRU_ACTIVE_FILE] = 0; + nr[LRU_INACTIVE_FILE] = 0; + shrink_anon_memcg(pgdat, memcg, &sc, nr); +#else shrink_lruvec(lruvec, &sc); +#endif trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); @@ -7269,7 +7252,11 @@ static bool kswapd_shrink_node(pg_data_t *pgdat, * Historically care was taken to put equal pressure on all zones but * now pressure is applied based on node LRU order. */ +#ifdef CONFIG_HYPERHOLD_FILE_LRU + shrink_node_hyperhold(pgdat, sc); +#else shrink_node(pgdat, sc); +#endif /* * Fragmentation may mean that the system cannot be rebalanced for @@ -7719,6 +7706,9 @@ static int kswapd(void *p) */ trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx, alloc_order); +#ifdef CONFIG_MEMORY_MONITOR + kswapd_monitor_wake_up_queue(); +#endif reclaim_order = balance_pgdat(pgdat, alloc_order, highest_zoneidx); if (reclaim_order < alloc_order) @@ -7981,7 +7971,11 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in * priorities until we have enough memory freed. 
*/ do { +#ifdef CONFIG_HYPERHOLD_FILE_LRU + shrink_node_hyperhold(pgdat, &sc); +#else shrink_node(pgdat, &sc); +#endif } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); } diff --git a/mm/vmstat.c b/mm/vmstat.c index 1ea6a5ce1c4161b5f41387a82e64c2446ad93a50..e2c81b9d47fcebd4dda48b203b52df3f8a92c23a 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1399,6 +1399,24 @@ const char * const vmstat_text[] = { "direct_map_level2_splits", "direct_map_level3_splits", #endif +#ifdef CONFIG_HYPERHOLD_ZSWAPD + "zswapd_running", + "zswapd_hit_refaults", + "zswapd_medium_press", + "zswapd_critical_press", + "zswapd_memcg_ratio_skip", + "zswapd_memcg_refault_skip", + "zswapd_swapout", + "zswapd_empty_round", + "zswapd_empty_round_skip_times", + "zswapd_snapshot_times", + "zswapd_reclaimed", + "zswapd_scanned", +#endif +#ifdef CONFIG_HYPERHOLD_MEMCG + "freeze_reclaimed", + "freeze_reclaim_count", +#endif #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */ }; #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */ diff --git a/mm/workingset.c b/mm/workingset.c index 00c6f4d9d9be5ae8a09a85c87bce47440b1a0a76..f006fbc96aadadeafd257c3071f09c2bcd8aa9a8 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -368,7 +368,16 @@ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg) memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); eviction = atomic_long_read(&lruvec->nonresident_age); eviction >>= bucket_order; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (!is_prot_page(folio_page(folio, 0)) && page_is_file_lru(folio_page(folio, 0))) { + lruvec = folio_lruvec(folio); + workingset_age_nonresident(lruvec, folio_nr_pages(folio)); + } else { + workingset_age_nonresident(lruvec, folio_nr_pages(folio)); + } +#else workingset_age_nonresident(lruvec, folio_nr_pages(folio)); +#endif return pack_shadow(memcgid, pgdat, eviction, folio_test_workingset(folio)); } @@ -395,7 +404,7 @@ void workingset_refault(struct folio *folio, void *shadow) struct lruvec *lruvec; unsigned long refault; bool workingset; - int memcgid; + int memcgid = 0; long nr; if (lru_gen_enabled()) { @@ -423,9 +432,17 @@ void workingset_refault(struct folio *folio, void *shadow) * would be better if the root_mem_cgroup existed in all * configurations instead. */ +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (memcgid != -1) { + eviction_memcg = mem_cgroup_from_id(memcgid); + if (!mem_cgroup_disabled() && !eviction_memcg) + goto out; + } +#else eviction_memcg = mem_cgroup_from_id(memcgid); if (!mem_cgroup_disabled() && !eviction_memcg) goto out; +#endif eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat); refault = atomic_long_read(&eviction_lruvec->nonresident_age); @@ -460,7 +477,15 @@ void workingset_refault(struct folio *folio, void *shadow) pgdat = folio_pgdat(folio); lruvec = mem_cgroup_lruvec(memcg, pgdat); +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (!is_prot_page(folio_page(folio, 0)) && file) + mod_lruvec_state(node_lruvec(pgdat), + WORKINGSET_REFAULT_BASE + file, folio_nr_pages(folio)); + else + mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr); +#else mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr); +#endif mem_cgroup_flush_stats_delayed(); /* @@ -470,10 +495,21 @@ void workingset_refault(struct folio *folio, void *shadow) * workingset competition needs to consider anon or not depends * on having swap. 
*/ +#ifdef CONFIG_HYPERHOLD_FILE_LRU + workingset_size = lruvec_page_state(node_lruvec(pgdat), NR_ACTIVE_FILE); +#else workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE); +#endif + if (!file) { +#ifdef CONFIG_HYPERHOLD_FILE_LRU + workingset_size += lruvec_page_state(node_lruvec(pgdat), + NR_INACTIVE_FILE); +#else + workingset_size += lruvec_page_state(eviction_lruvec, NR_INACTIVE_FILE); +#endif } if (mem_cgroup_get_nr_swap_pages(eviction_memcg) > 0) { workingset_size += lruvec_page_state(eviction_lruvec, @@ -487,8 +523,19 @@ void workingset_refault(struct folio *folio, void *shadow) goto out; folio_set_active(folio); +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (!is_prot_page(folio_page(folio, 0)) && file) { + workingset_age_nonresident(node_lruvec(pgdat), + folio_nr_pages(folio)); + mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, folio_nr_pages(folio)); + } else { + workingset_age_nonresident(lruvec, nr); + mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, nr); + } +#else workingset_age_nonresident(lruvec, nr); mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, nr); +#endif /* Folio was active prior to eviction */ if (workingset) { @@ -498,7 +545,14 @@ void workingset_refault(struct folio *folio, void *shadow) * putback */ lru_note_cost_refault(folio); +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (!is_prot_page(folio_page(folio, 0)) && file) + mod_lruvec_state(node_lruvec(pgdat), WORKINGSET_RESTORE_BASE + file, folio_nr_pages(folio)); + else + mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr); +#else mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr); +#endif } out: rcu_read_unlock(); @@ -511,6 +565,7 @@ void workingset_refault(struct folio *folio, void *shadow) void workingset_activation(struct folio *folio) { struct mem_cgroup *memcg; + struct lruvec *lruvec; rcu_read_lock(); /* @@ -523,7 +578,16 @@ void workingset_activation(struct folio *folio) memcg = folio_memcg_rcu(folio); if (!mem_cgroup_disabled() && !memcg) goto out; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (!is_prot_page(folio_page(folio, 0)) && page_is_file_lru(folio_page(folio, 0))) { + lruvec = folio_lruvec(folio); + workingset_age_nonresident(lruvec, folio_nr_pages(folio)); + } else { + workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio)); + } +#else workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio)); +#endif out: rcu_read_unlock(); } @@ -604,6 +668,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE */ #ifdef CONFIG_MEMCG +#ifndef CONFIG_HYPERHOLD_FILE_LRU if (sc->memcg) { struct lruvec *lruvec; int i; @@ -617,6 +682,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, pages += lruvec_page_state_local( lruvec, NR_SLAB_UNRECLAIMABLE_B) >> PAGE_SHIFT; } else +#endif #endif pages = node_present_pages(sc->nid); diff --git a/mm/zswapd.c b/mm/zswapd.c new file mode 100644 index 0000000000000000000000000000000000000000..d80a00d9f1fd90b8571c0f45df2416f7712db03f --- /dev/null +++ b/mm/zswapd.c @@ -0,0 +1,911 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * mm/zswapd.c + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. 
+ */ + +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_RECLAIM_ACCT +#include +#endif + +#include "zswapd_internal.h" +#include "internal.h" + +#define UNSET_ZRAM_WM_RATIO 0 +#define ESWAP_PERCENT_CONSTANT 100 +#define DEFAULT_ZRAM_WM_RATIO 37 +#define SWAP_MORE_ZRAM (50 * (SZ_1M)) + +static wait_queue_head_t snapshotd_wait; +static atomic_t snapshotd_wait_flag; +static atomic_t snapshotd_init_flag = ATOMIC_INIT(0); +static struct task_struct *snapshotd_task; + +static pid_t zswapd_pid = -1; +static unsigned long long last_anon_pagefault; +static unsigned long long anon_refault_ratio; +static unsigned long long zswapd_skip_interval; +static unsigned long last_zswapd_time; +static unsigned long last_snapshot_time; +bool last_round_is_empty; + + +DECLARE_RWSEM(gs_lock); +LIST_HEAD(gs_list); + +void unregister_group_swap(struct group_swap_device *gsdev) +{ + down_write(&gs_lock); + list_del(&gsdev->list); + up_write(&gs_lock); + + kfree(gsdev); +} +EXPORT_SYMBOL(unregister_group_swap); + +struct group_swap_device *register_group_swap(struct group_swap_ops *ops, void *priv) +{ + struct group_swap_device *gsdev = kzalloc(sizeof(struct group_swap_device), GFP_KERNEL); + + if (!gsdev) + return NULL; + + gsdev->priv = priv; + gsdev->ops = ops; + + down_write(&gs_lock); + list_add(&gsdev->list, &gs_list); + up_write(&gs_lock); + + return gsdev; +} +EXPORT_SYMBOL(register_group_swap); + +u64 memcg_data_size(struct mem_cgroup *memcg, int type) +{ + struct group_swap_device *gsdev = NULL; + u64 size = 0; + + down_read(&gs_lock); + list_for_each_entry(gsdev, &gs_list, list) + size += gsdev->ops->group_data_size(memcg->id.id, type, gsdev->priv); + up_read(&gs_lock); + + return size; +} + +u64 swapin_memcg(struct mem_cgroup *memcg, u64 req_size) +{ + u64 swap_size = memcg_data_size(memcg, SWAP_SIZE); + u64 read_size = 0; + u64 ratio = atomic64_read(&memcg->memcg_reclaimed.ub_ufs2zram_ratio); + struct group_swap_device *gsdev = NULL; + + if (req_size > div_u64(swap_size * ratio, ESWAP_PERCENT_CONSTANT)) + req_size = div_u64(swap_size * ratio, ESWAP_PERCENT_CONSTANT); + down_read(&gs_lock); + list_for_each_entry(gsdev, &gs_list, list) { + read_size += gsdev->ops->group_read(memcg->id.id, req_size - read_size, + gsdev->priv); + if (read_size >= req_size) + break; + } + up_read(&gs_lock); + + return read_size; +} + +static u64 swapout_memcg(struct mem_cgroup *memcg, u64 req_size) +{ + u64 cache_size = memcg_data_size(memcg, CACHE_SIZE); + u64 swap_size = memcg_data_size(memcg, SWAP_SIZE); + u64 all_size = cache_size + swap_size; + u64 write_size = 0; + u32 ratio = atomic_read(&memcg->memcg_reclaimed.ub_zram2ufs_ratio); + struct group_swap_device *gsdev = NULL; + + if (div_u64(all_size * ratio, ESWAP_PERCENT_CONSTANT) <= swap_size) + return 0; + if (req_size > div_u64(all_size * ratio, ESWAP_PERCENT_CONSTANT) - swap_size) + req_size = div_u64(all_size * ratio, ESWAP_PERCENT_CONSTANT) - swap_size; + down_read(&gs_lock); + list_for_each_entry(gsdev, &gs_list, list) { + write_size += gsdev->ops->group_write(memcg->id.id, req_size - write_size, + gsdev->priv); + if (write_size >= req_size) + break; + } + up_read(&gs_lock); + + return write_size; +} + +static u64 swapout(u64 req_size) +{ + struct mem_cgroup *memcg = NULL; + u64 write_size = 0; + + while ((memcg = get_next_memcg(memcg)) != NULL) { + write_size += swapout_memcg(memcg, req_size - write_size); + if (write_size >= req_size) + break; + } + + return write_size; +} + +static unsigned long long get_zram_used_pages(void) +{ + 
struct mem_cgroup *memcg = NULL; + unsigned long long zram_pages = 0; + + while ((memcg = get_next_memcg(memcg)) != NULL) + zram_pages += memcg_data_size(memcg, CACHE_PAGE); + + return zram_pages; +} + +static unsigned long long get_eswap_used_pages(void) +{ + struct mem_cgroup *memcg = NULL; + unsigned long long eswap_pages = 0; + + while ((memcg = get_next_memcg(memcg)) != NULL) + eswap_pages += memcg_data_size(memcg, SWAP_PAGE); + + return eswap_pages; +} + +static unsigned long long get_zram_pagefault(void) +{ + struct mem_cgroup *memcg = NULL; + unsigned long long cache_fault = 0; + + while ((memcg = get_next_memcg(memcg)) != NULL) + cache_fault += memcg_data_size(memcg, CACHE_FAULT); + + return cache_fault; +} + +static unsigned int calc_sys_cur_avail_buffers(void) +{ + const unsigned int percent_constant = 100; + unsigned long freemem; + unsigned long active_file; + unsigned long inactive_file; + unsigned long buffers; + + freemem = global_zone_page_state(NR_FREE_PAGES) * PAGE_SIZE / SZ_1K; + active_file = global_node_page_state(NR_ACTIVE_FILE) * PAGE_SIZE / SZ_1K; + inactive_file = global_node_page_state(NR_INACTIVE_FILE) * PAGE_SIZE / SZ_1K; + + buffers = freemem + inactive_file * get_inactive_file_ratio() / percent_constant + + active_file * get_active_file_ratio() / percent_constant; + + return (buffers * SZ_1K / SZ_1M); /* KB to MB */ +} + +void zswapd_status_show(struct seq_file *m) +{ + unsigned int buffers = calc_sys_cur_avail_buffers(); + + seq_printf(m, "buffer_size:%u\n", buffers); + seq_printf(m, "recent_refault:%llu\n", anon_refault_ratio); +} + +pid_t get_zswapd_pid(void) +{ + return zswapd_pid; +} + +static bool min_buffer_is_suitable(void) +{ + unsigned int buffers = calc_sys_cur_avail_buffers(); + + if (buffers >= get_min_avail_buffers()) + return true; + + return false; +} + +static bool buffer_is_suitable(void) +{ + unsigned int buffers = calc_sys_cur_avail_buffers(); + + if (buffers >= get_avail_buffers()) + return true; + + return false; +} + +static bool high_buffer_is_suitable(void) +{ + unsigned int buffers = calc_sys_cur_avail_buffers(); + + if (buffers >= get_high_avail_buffers()) + return true; + + return false; +} + +static void snapshot_anon_refaults(void) +{ + struct mem_cgroup *memcg = NULL; + + while ((memcg = get_next_memcg(memcg)) != NULL) + memcg->memcg_reclaimed.reclaimed_pagefault = memcg_data_size(memcg, CACHE_FAULT); + + last_anon_pagefault = get_zram_pagefault(); + last_snapshot_time = jiffies; +} + +/* + * Return true if the memcg's anon refault ratio since the last + * snapshot exceeds its refault_threshold. 
+ */ +static bool get_memcg_anon_refault_status(struct mem_cgroup *memcg) +{ + const unsigned int percent_constant = 100; + unsigned long long anon_pagefault; + unsigned long long anon_total; + unsigned long long ratio; + struct mem_cgroup_per_node *mz = NULL; + struct lruvec *lruvec = NULL; + + if (!memcg) + return false; + + anon_pagefault = memcg_data_size(memcg, CACHE_FAULT); + if (anon_pagefault == memcg->memcg_reclaimed.reclaimed_pagefault) + return false; + + mz = mem_cgroup_nodeinfo(memcg, 0); + if (!mz) + return false; + + lruvec = &mz->lruvec; + if (!lruvec) + return false; + + anon_total = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) + + lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES) + + memcg_data_size(memcg, SWAP_PAGE) + memcg_data_size(memcg, CACHE_PAGE); + + ratio = div64_u64((anon_pagefault - memcg->memcg_reclaimed.reclaimed_pagefault) * + percent_constant, (anon_total + 1)); + if (ratio > atomic_read(&memcg->memcg_reclaimed.refault_threshold)) + return true; + + return false; +} + +static bool get_area_anon_refault_status(void) +{ + const unsigned int percent_constant = 1000; + unsigned long long anon_pagefault; + unsigned long long ratio; + unsigned long long time; + + anon_pagefault = get_zram_pagefault(); + time = jiffies; + if (anon_pagefault == last_anon_pagefault || time == last_snapshot_time) + return false; + + ratio = div_u64((anon_pagefault - last_anon_pagefault) * percent_constant, + (jiffies_to_msecs(time - last_snapshot_time) + 1)); + anon_refault_ratio = ratio; + + if (ratio > get_area_anon_refault_threshold()) + return true; + + return false; +} + +void wakeup_snapshotd(void) +{ + unsigned long snapshot_interval; + + snapshot_interval = jiffies_to_msecs(jiffies - last_snapshot_time); + if (snapshot_interval >= get_anon_refault_snapshot_min_interval()) { + atomic_set(&snapshotd_wait_flag, 1); + wake_up_interruptible(&snapshotd_wait); + } +} + +static int snapshotd(void *p) +{ + int ret; + + while (!kthread_should_stop()) { + ret = wait_event_interruptible(snapshotd_wait, atomic_read(&snapshotd_wait_flag)); + if (ret) + continue; + + atomic_set(&snapshotd_wait_flag, 0); + + snapshot_anon_refaults(); + count_vm_event(ZSWAPD_SNAPSHOT_TIMES); + } + + return 0; +} + +void set_snapshotd_init_flag(unsigned int val) +{ + atomic_set(&snapshotd_init_flag, val); +} + +/* + * This snapshotd start function will be called by init. 
+ */ +int snapshotd_run(void) +{ + atomic_set(&snapshotd_wait_flag, 0); + init_waitqueue_head(&snapshotd_wait); + + snapshotd_task = kthread_run(snapshotd, NULL, "snapshotd"); + if (IS_ERR(snapshotd_task)) { + pr_err("Failed to start snapshotd\n"); + return PTR_ERR(snapshotd_task); + } + + return 0; +} + +static int __init snapshotd_init(void) +{ + snapshotd_run(); + + return 0; +} +module_init(snapshotd_init); + +static int get_zswapd_eswap_policy(void) +{ + if (get_zram_wm_ratio() == UNSET_ZRAM_WM_RATIO) + return CHECK_BUFFER_ONLY; + else + return CHECK_BUFFER_ZRAMRATIO_BOTH; +} + +static unsigned int get_policy_zram_wm_ratio(void) +{ + enum zswapd_eswap_policy policy = get_zswapd_eswap_policy(); + + if (policy == CHECK_BUFFER_ONLY) + return DEFAULT_ZRAM_WM_RATIO; + else + return get_zram_wm_ratio(); +} + +int get_zram_current_watermark(void) +{ + long long diff_buffers; + const unsigned int percent_constant = 10; + u64 nr_total; + unsigned int zram_wm_ratio = get_policy_zram_wm_ratio(); + + nr_total = totalram_pages(); + /* B_target - B_current */ + diff_buffers = get_avail_buffers() - calc_sys_cur_avail_buffers(); + /* MB to page */ + diff_buffers *= SZ_1M / PAGE_SIZE; + /* after_comp to before_comp */ + diff_buffers *= get_compress_ratio(); + /* page to ratio */ + diff_buffers = div64_s64(diff_buffers * percent_constant, nr_total); + + return min((long long)zram_wm_ratio, zram_wm_ratio - diff_buffers); +} + +bool zram_watermark_ok(void) +{ + const unsigned int percent_constant = 100; + u64 nr_zram_used; + u64 nr_wm; + u64 ratio; + + ratio = get_zram_current_watermark(); + nr_zram_used = get_zram_used_pages(); + nr_wm = div_u64(totalram_pages() * ratio, percent_constant); + if (nr_zram_used > nr_wm) + return true; + + return false; +} + +bool zram_watermark_exceed(void) +{ + u64 nr_zram_used; + const unsigned long long nr_wm = get_zram_critical_threshold() * (SZ_1M / PAGE_SIZE); + + if (!nr_wm) + return false; + + nr_zram_used = get_zram_used_pages(); + if (nr_zram_used > nr_wm) + return true; + return false; +} + +void wakeup_zswapd(pg_data_t *pgdat) +{ + unsigned long interval; + + if (IS_ERR(pgdat->zswapd)) + return; + + if (!wq_has_sleeper(&pgdat->zswapd_wait)) + return; + + /* + * make anon pagefault snapshots + * wake up snapshotd + */ + if (atomic_read(&snapshotd_init_flag) == 1) + wakeup_snapshotd(); + + /* wake up when the buffer is lower than min_avail_buffer */ + if (min_buffer_is_suitable()) + return; + + interval = jiffies_to_msecs(jiffies - last_zswapd_time); + if (interval < zswapd_skip_interval) { + count_vm_event(ZSWAPD_EMPTY_ROUND_SKIP_TIMES); + return; + } + + atomic_set(&pgdat->zswapd_wait_flag, 1); + wake_up_interruptible(&pgdat->zswapd_wait); +} + +void wake_all_zswapd(void) +{ + pg_data_t *pgdat = NULL; + int nid; + + for_each_online_node(nid) { + pgdat = NODE_DATA(nid); + wakeup_zswapd(pgdat); + } +} + +#ifdef CONFIG_HYPERHOLD_FILE_LRU +static void zswapd_shrink_active_list(unsigned long nr_to_scan, + struct lruvec *lruvec, struct scan_control *sc, enum lru_list lru) +{ + unsigned int nr_deactivate; + unsigned long nr_scanned; + unsigned long nr_taken; + + struct page *page = NULL; + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + unsigned long *node_anon_cost = &pgdat->__lruvec.anon_cost; + unsigned long *anon_cost = &lruvec->anon_cost; + LIST_HEAD(l_inactive); + LIST_HEAD(l_hold); + + lru_add_drain(); + + spin_lock_irq(&lruvec->lru_lock); + nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, lru); + __mod_node_page_state(pgdat, 
NR_ISOLATED_ANON, nr_taken); + *anon_cost += nr_taken; + *node_anon_cost += nr_taken; + __count_vm_events(PGREFILL, nr_scanned); + count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); + spin_unlock_irq(&lruvec->lru_lock); + + while (!list_empty(&l_hold)) { + cond_resched(); + page = lru_to_page(&l_hold); + list_del(&page->lru); + + if (unlikely(!folio_evictable(page_folio(page)))) { + putback_lru_page(page); + continue; + } + + ClearPageActive(page); + SetPageWorkingset(page); + list_add(&page->lru, &l_inactive); + } + + spin_lock_irq(&lruvec->lru_lock); + nr_deactivate = move_folios_to_lru(lruvec, &l_inactive); + __mod_node_page_state(pgdat, NR_ISOLATED_ANON, -nr_taken); + spin_unlock_irq(&lruvec->lru_lock); + + mem_cgroup_uncharge_list(&l_inactive); + free_unref_page_list(&l_inactive); + + trace_mm_vmscan_lru_zswapd_shrink_active(pgdat->node_id, nr_taken, + nr_deactivate, sc->priority); +} + +static unsigned long zswapd_shrink_list(enum lru_list lru, + unsigned long nr_to_scan, struct lruvec *lruvec, + struct scan_control *sc) +{ +#ifdef CONFIG_RECLAIM_ACCT + unsigned long nr_reclaimed; + + reclaimacct_substage_start(RA_SHRINKANON); +#endif + if (is_active_lru(lru)) { + if (sc->may_deactivate & (1 << is_file_lru(lru))) + zswapd_shrink_active_list(nr_to_scan, lruvec, sc, lru); + else + sc->skipped_deactivate = 1; +#ifdef CONFIG_RECLAIM_ACCT + reclaimacct_substage_end(RA_SHRINKANON, 0, NULL); +#endif + return 0; + } + +#ifdef CONFIG_RECLAIM_ACCT + nr_reclaimed = shrink_inactive_list(nr_to_scan, lruvec, sc, lru); + reclaimacct_substage_end(RA_SHRINKANON, nr_reclaimed, NULL); + return nr_reclaimed; +#else + return shrink_inactive_list(nr_to_scan, lruvec, sc, lru); +#endif +} + +static void zswapd_shrink_anon_memcg(struct pglist_data *pgdat, + struct mem_cgroup *memcg, struct scan_control *sc, unsigned long *nr) +{ + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + unsigned long nr_reclaimed = 0; + unsigned long nr_to_scan; + struct blk_plug plug; + enum lru_list lru; + + blk_start_plug(&plug); + + while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_ANON]) { + for (lru = 0; lru <= LRU_ACTIVE_ANON; lru++) { + if (nr[lru]) { + nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); + nr[lru] -= nr_to_scan; + nr_reclaimed += zswapd_shrink_list(lru, + nr_to_scan, lruvec, sc); + } + } + } + + blk_finish_plug(&plug); + sc->nr_reclaimed += nr_reclaimed; +} +#endif + +static bool zswapd_shrink_anon(pg_data_t *pgdat, struct scan_control *sc) +{ + const unsigned int percent_constant = 100; + struct mem_cgroup *memcg = NULL; + unsigned long nr[NR_LRU_LISTS]; + + while ((memcg = get_next_memcg(memcg)) != NULL) { + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + u64 nr_active, nr_inactive, nr_zram, nr_eswap, zram_ratio; + + /* reclaim and try to meet the high buffer watermark */ + if (high_buffer_is_suitable()) { + get_next_memcg_break(memcg); + break; + } + + if (get_memcg_anon_refault_status(memcg)) { + count_vm_event(ZSWAPD_MEMCG_REFAULT_SKIP); + continue; + } + + nr_active = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES); + nr_inactive = lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES); + nr_zram = memcg_data_size(memcg, CACHE_PAGE); + nr_eswap = memcg_data_size(memcg, SWAP_PAGE); + + zram_ratio = div64_u64((nr_zram + nr_eswap) * percent_constant, + (nr_inactive + nr_active + nr_zram + nr_eswap + 1)); + if (zram_ratio >= (u32)atomic_read(&memcg->memcg_reclaimed.ub_mem2zram_ratio)) { + count_vm_event(ZSWAPD_MEMCG_RATIO_SKIP); + continue; + } + + nr[LRU_ACTIVE_ANON] = 
nr_active >> (unsigned int)sc->priority; + nr[LRU_INACTIVE_ANON] = nr_inactive >> (unsigned int)sc->priority; + nr[LRU_ACTIVE_FILE] = 0; + nr[LRU_INACTIVE_FILE] = 0; + +#ifdef CONFIG_HYPERHOLD_FILE_LRU + zswapd_shrink_anon_memcg(pgdat, memcg, sc, nr); +#else + shrink_lruvec(lruvec, sc); +#endif + shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority); + + if (sc->nr_reclaimed >= sc->nr_to_reclaim) { + get_next_memcg_break(memcg); + break; + } + } + + return sc->nr_scanned >= sc->nr_to_reclaim; +} + +static u64 __calc_nr_to_reclaim(void) +{ + unsigned int buffers; + unsigned int high_buffers; + unsigned int max_reclaim_size; + u64 reclaim_size = 0; + + high_buffers = get_high_avail_buffers(); + buffers = calc_sys_cur_avail_buffers(); + max_reclaim_size = get_zswapd_max_reclaim_size(); + if (buffers < high_buffers) + reclaim_size = high_buffers - buffers; + + /* cap a single round's reclaim target at max_reclaim_size */ + reclaim_size = min(reclaim_size, (u64)max_reclaim_size); + + /* MB to pages */ + return div_u64(reclaim_size * SZ_1M, PAGE_SIZE); +} + +static void zswapd_shrink_node(pg_data_t *pgdat) +{ + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .order = 0, + .priority = DEF_PRIORITY / 2, + .may_writepage = !laptop_mode, + .may_unmap = 1, + .may_swap = 1, + .reclaim_idx = MAX_NR_ZONES - 1, + }; + const unsigned int increase_rate = 2; + + do { + unsigned long nr_reclaimed = sc.nr_reclaimed; + bool raise_priority = true; + + /* reclaim and try to meet the high buffer watermark */ + if (high_buffer_is_suitable()) + break; + + sc.nr_scanned = 0; + sc.nr_to_reclaim = __calc_nr_to_reclaim(); + + if (zswapd_shrink_anon(pgdat, &sc)) + raise_priority = false; + count_vm_events(ZSWAPD_SCANNED, sc.nr_scanned); + count_vm_events(ZSWAPD_RECLAIMED, sc.nr_reclaimed); + if (try_to_freeze() || kthread_should_stop()) + break; + + nr_reclaimed = sc.nr_reclaimed - nr_reclaimed; + if (raise_priority || !nr_reclaimed) + sc.priority--; + } while (sc.priority >= 1); + + /* + * When the first empty round occurs, set the interval to t. + * If the following round is still empty, set the interval + * to 2t. If the round is always empty, then 4t, 8t, and so on. + * But make sure the interval is not more than the max_skip_interval. + * Once a non-empty round occurs, reset the interval to 0. 
+ */ + if (sc.nr_reclaimed < get_empty_round_check_threshold()) { + count_vm_event(ZSWAPD_EMPTY_ROUND); + if (last_round_is_empty) + zswapd_skip_interval = min(zswapd_skip_interval * + increase_rate, get_max_skip_interval()); + else + zswapd_skip_interval = get_empty_round_skip_interval(); + last_round_is_empty = true; + } else { + zswapd_skip_interval = 0; + last_round_is_empty = false; + } +} + +u64 zram_watermark_diff(void) +{ + const unsigned int percent_constant = 100; + u64 nr_zram_used; + u64 nr_wm; + u64 ratio; + + ratio = get_zram_current_watermark(); + nr_zram_used = get_zram_used_pages(); + nr_wm = div_u64(totalram_pages() * ratio, percent_constant); + if (nr_zram_used > nr_wm) + return (nr_zram_used - nr_wm) * PAGE_SIZE + SWAP_MORE_ZRAM; + + return 0; +} + +u64 zswapd_buffer_diff(void) +{ + u64 buffers; + u64 avail; + + buffers = calc_sys_cur_avail_buffers(); + avail = get_high_avail_buffers(); + if (buffers < avail) + return (avail - buffers) * SZ_1M; + + return 0; +} + +u64 get_do_eswap_size(bool refault) +{ + u64 size = 0; + enum zswapd_eswap_policy policy = get_zswapd_eswap_policy(); + + if (policy == CHECK_BUFFER_ZRAMRATIO_BOTH) + size = max(zram_watermark_diff(), zswapd_buffer_diff()); + else if (policy == CHECK_BUFFER_ONLY && (zram_watermark_ok() || refault)) + size = zswapd_buffer_diff(); + + return size; +} + +static int zswapd(void *p) +{ + struct task_struct *tsk = current; + pg_data_t *pgdat = (pg_data_t *)p; + const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); +#ifdef CONFIG_RECLAIM_ACCT + struct reclaim_acct ra = {0}; +#endif + + /* save zswapd pid for schedule strategy */ + zswapd_pid = tsk->pid; + + + if (!cpumask_empty(cpumask)) + set_cpus_allowed_ptr(tsk, cpumask); + + set_freezable(); + + while (!kthread_should_stop()) { + bool refault = false; + u64 size = 0; + + (void)wait_event_freezable(pgdat->zswapd_wait, + atomic_read(&pgdat->zswapd_wait_flag)); + atomic_set(&pgdat->zswapd_wait_flag, 0); + count_vm_event(ZSWAPD_WAKEUP); + zswapd_pressure_report(LEVEL_LOW); + + if (get_area_anon_refault_status()) { + refault = true; + count_vm_event(ZSWAPD_REFAULT); + goto do_eswap; + } + +#ifdef CONFIG_RECLAIM_ACCT + reclaimacct_start(ZSWAPD_RECLAIM, &ra); +#endif + zswapd_shrink_node(pgdat); +#ifdef CONFIG_RECLAIM_ACCT + reclaimacct_end(ZSWAPD_RECLAIM); +#endif + last_zswapd_time = jiffies; + +do_eswap: + size = get_do_eswap_size(refault); + if (size >= SZ_1M) { + count_vm_event(ZSWAPD_SWAPOUT); + size = swapout(size); + } + + if (!buffer_is_suitable()) { + if (free_swap_is_low() || zram_watermark_exceed()) { + zswapd_pressure_report(LEVEL_CRITICAL); + count_vm_event(ZSWAPD_CRITICAL_PRESS); + pr_info("%s:zrampages:%llu, eswappages:%llu\n", __func__, + get_zram_used_pages(), get_eswap_used_pages()); + } else { + zswapd_pressure_report(LEVEL_MEDIUM); + count_vm_event(ZSWAPD_MEDIUM_PRESS); + } + } + } + + return 0; +} + +/* + * This zswapd start function will be called by init and node-hot-add. 
+ */ +int zswapd_run(int nid) +{ + const unsigned int priority_less = 5; + struct sched_param param = { + .sched_priority = MAX_PRIO - priority_less, + }; + pg_data_t *pgdat = NODE_DATA(nid); + + if (pgdat->zswapd) + return 0; + + atomic_set(&pgdat->zswapd_wait_flag, 0); + pgdat->zswapd = kthread_create(zswapd, pgdat, "zswapd%d", nid); + if (IS_ERR(pgdat->zswapd)) { + pr_err("Failed to start zswapd on node %d\n", nid); + return PTR_ERR(pgdat->zswapd); + } + + sched_setscheduler_nocheck(pgdat->zswapd, SCHED_NORMAL, &param); + set_user_nice(pgdat->zswapd, PRIO_TO_NICE(param.sched_priority)); + wake_up_process(pgdat->zswapd); + + return 0; +} + +/* + * Called by memory hotplug when all memory in a node is offlined. Caller must + * hold mem_hotplug_begin/end(). + */ +void zswapd_stop(int nid) +{ + struct task_struct *zswapd = NODE_DATA(nid)->zswapd; + + if (zswapd) { + kthread_stop(zswapd); + NODE_DATA(nid)->zswapd = NULL; + } + + zswapd_pid = -1; +} + +/* + * It's optimal to keep zswapd threads on the same CPUs as their memory, but + * not required for correctness. So if the last cpu in a node goes away, + * we get changed to run anywhere: as the first one comes back, restore + * their cpu bindings. + */ +static int zswapd_cpu_online(unsigned int cpu) +{ + int nid; + + for_each_node_state(nid, N_MEMORY) { + pg_data_t *pgdat = NODE_DATA(nid); + const struct cpumask *mask; + + mask = cpumask_of_node(pgdat->node_id); + if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) + /* One of our CPUs online: restore mask */ + set_cpus_allowed_ptr(pgdat->zswapd, mask); + } + + return 0; +} + +static int __init zswapd_init(void) +{ + int nid; + int ret; + + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "mm/zswapd:online", + zswapd_cpu_online, NULL); + if (ret < 0) { + pr_err("zswapd: failed to register hotplug callbacks.\n"); + return ret; + } + + for_each_node_state(nid, N_MEMORY) + zswapd_run(nid); + + return 0; +} +module_init(zswapd_init) diff --git a/mm/zswapd_control.c b/mm/zswapd_control.c new file mode 100644 index 0000000000000000000000000000000000000000..340b6830619a439f0ddb8bf1a1ab3d831836830e --- /dev/null +++ b/mm/zswapd_control.c @@ -0,0 +1,860 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * mm/zswapd_control.c + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include "zswapd_internal.h" + +#define ANON_REFAULT_SNAPSHOT_MIN_INTERVAL 200 +#define AREA_ANON_REFAULT_THRESHOLD 22000 +#define EMPTY_ROUND_CHECK_THRESHOLD 10 +#define EMPTY_ROUND_SKIP_INTERVAL 20 +#define ZSWAPD_MAX_LEVEL_NUM 10 +#define MAX_SKIP_INTERVAL 1000 +#define MAX_RECLAIM_SIZE 100 + +#define INACTIVE_FILE_RATIO 90 +#define ACTIVE_FILE_RATIO 70 +#define COMPRESS_RATIO 30 +#define ZRAM_WM_RATIO 0 +#define MAX_RATIO 100 + +#define CHECK_BUFFER_VALID(var1, var2) (((var2) != 0) && ((var1) > (var2))) + +struct zswapd_param { + unsigned int min_score; + unsigned int max_score; + unsigned int ub_mem2zram_ratio; + unsigned int ub_zram2ufs_ratio; + unsigned int refault_threshold; +}; + +static struct zswapd_param zswap_param[ZSWAPD_MAX_LEVEL_NUM]; +struct eventfd_ctx *zswapd_press_efd[LEVEL_COUNT]; +static DEFINE_MUTEX(pressure_event_lock); +static DEFINE_MUTEX(reclaim_para_lock); + +atomic_t avail_buffers = ATOMIC_INIT(0); +atomic_t min_avail_buffers = ATOMIC_INIT(0); +atomic_t high_avail_buffers = ATOMIC_INIT(0); +atomic_t max_reclaim_size = ATOMIC_INIT(MAX_RECLAIM_SIZE); + +atomic_t inactive_file_ratio = ATOMIC_INIT(INACTIVE_FILE_RATIO); +atomic_t active_file_ratio = ATOMIC_INIT(ACTIVE_FILE_RATIO); +atomic_t zram_wm_ratio = ATOMIC_INIT(ZRAM_WM_RATIO); +atomic_t compress_ratio = ATOMIC_INIT(COMPRESS_RATIO); + +atomic64_t zram_critical_threshold = ATOMIC_LONG_INIT(0); +atomic64_t free_swap_threshold = ATOMIC_LONG_INIT(0); +atomic64_t area_anon_refault_threshold = ATOMIC_LONG_INIT(AREA_ANON_REFAULT_THRESHOLD); +atomic64_t anon_refault_snapshot_min_interval = + ATOMIC_LONG_INIT(ANON_REFAULT_SNAPSHOT_MIN_INTERVAL); +atomic64_t empty_round_skip_interval = ATOMIC_LONG_INIT(EMPTY_ROUND_SKIP_INTERVAL); +atomic64_t max_skip_interval = ATOMIC_LONG_INIT(MAX_SKIP_INTERVAL); +atomic64_t empty_round_check_threshold = ATOMIC_LONG_INIT(EMPTY_ROUND_CHECK_THRESHOLD); + +inline unsigned int get_zram_wm_ratio(void) +{ + return atomic_read(&zram_wm_ratio); +} + +inline unsigned int get_compress_ratio(void) +{ + return atomic_read(&compress_ratio); +} + +inline unsigned int get_inactive_file_ratio(void) +{ + return atomic_read(&inactive_file_ratio); +} + +inline unsigned int get_active_file_ratio(void) +{ + return atomic_read(&active_file_ratio); +} + +inline unsigned int get_avail_buffers(void) +{ + return atomic_read(&avail_buffers); +} + +inline unsigned int get_min_avail_buffers(void) +{ + return atomic_read(&min_avail_buffers); +} + +inline unsigned int get_high_avail_buffers(void) +{ + return atomic_read(&high_avail_buffers); +} + +inline unsigned int get_zswapd_max_reclaim_size(void) +{ + return atomic_read(&max_reclaim_size); +} + +inline unsigned long long get_free_swap_threshold(void) +{ + return atomic64_read(&free_swap_threshold); +} + +inline unsigned long long get_area_anon_refault_threshold(void) +{ + return atomic64_read(&area_anon_refault_threshold); +} + +inline unsigned long long get_anon_refault_snapshot_min_interval(void) +{ + return atomic64_read(&anon_refault_snapshot_min_interval); +} + +inline unsigned long long get_empty_round_skip_interval(void) +{ + return atomic64_read(&empty_round_skip_interval); +} + +inline unsigned long long get_max_skip_interval(void) +{ + return atomic64_read(&max_skip_interval); +} + +inline unsigned long long get_empty_round_check_threshold(void) +{ + return atomic64_read(&empty_round_check_threshold); +} + +inline unsigned long long get_zram_critical_threshold(void) +{ + 
return atomic64_read(&zram_critical_threshold); +} + +static ssize_t avail_buffers_params_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + unsigned long long threshold; + unsigned int high_buffers; + unsigned int min_buffers; + unsigned int buffers; + + buf = strstrip(buf); + + if (sscanf(buf, "%u %u %u %llu", &buffers, &min_buffers, &high_buffers, &threshold) != 4) + return -EINVAL; + + if (CHECK_BUFFER_VALID(min_buffers, buffers) || + CHECK_BUFFER_VALID(min_buffers, high_buffers) || + CHECK_BUFFER_VALID(buffers, high_buffers)) + return -EINVAL; + + atomic_set(&avail_buffers, buffers); + atomic_set(&min_avail_buffers, min_buffers); + atomic_set(&high_avail_buffers, high_buffers); + atomic64_set(&free_swap_threshold, (threshold * (SZ_1M / PAGE_SIZE))); + + if (atomic_read(&min_avail_buffers) == 0) + set_snapshotd_init_flag(0); + else + set_snapshotd_init_flag(1); + + wake_all_zswapd(); + + return nbytes; +} + +static ssize_t zswapd_max_reclaim_size_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + u32 max; + int ret; + + buf = strstrip(buf); + ret = kstrtouint(buf, 10, &max); + if (ret) + return -EINVAL; + + atomic_set(&max_reclaim_size, max); + + return nbytes; +} + +static ssize_t buffers_ratio_params_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + unsigned int inactive; + unsigned int active; + + buf = strstrip(buf); + + if (sscanf(buf, "%u %u", &inactive, &active) != 2) + return -EINVAL; + + if (inactive > MAX_RATIO || active > MAX_RATIO) + return -EINVAL; + + atomic_set(&inactive_file_ratio, inactive); + atomic_set(&active_file_ratio, active); + + return nbytes; +} + +static int area_anon_refault_threshold_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + atomic64_set(&area_anon_refault_threshold, val); + + return 0; +} + +static int empty_round_skip_interval_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + atomic64_set(&empty_round_skip_interval, val); + + return 0; +} + +static int max_skip_interval_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + atomic64_set(&max_skip_interval, val); + + return 0; +} + +static int empty_round_check_threshold_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + atomic64_set(&empty_round_check_threshold, val); + + return 0; +} + +static int anon_refault_snapshot_min_interval_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + atomic64_set(&anon_refault_snapshot_min_interval, val); + + return 0; +} + +static int zram_critical_thres_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + atomic64_set(&zram_critical_threshold, val); + + return 0; +} + +static ssize_t zswapd_pressure_event_control(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + unsigned int level; + unsigned int efd; + struct fd efile; + int ret; + + buf = strstrip(buf); + if (sscanf(buf, "%u %u", &efd, &level) != 2) + return -EINVAL; + + if (level >= LEVEL_COUNT) + return -EINVAL; + + mutex_lock(&pressure_event_lock); + efile = fdget(efd); + if (!efile.file) { + ret = -EBADF; + goto out; + } + + zswapd_press_efd[level] = eventfd_ctx_fileget(efile.file); + if (IS_ERR(zswapd_press_efd[level])) { + ret = PTR_ERR(zswapd_press_efd[level]); + goto out_put_efile; + } + fdput(efile); + mutex_unlock(&pressure_event_lock); + return nbytes; + +out_put_efile: + fdput(efile); +out: + mutex_unlock(&pressure_event_lock); + + return ret; +} + 
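For reference, the handshake implemented by zswapd_pressure_event_control() above can be exercised from userspace roughly as sketched below: create an eventfd, write "<eventfd> <level>" to the control file, then block on the eventfd until zswapd_pressure_report() signals it. This is only a sketch under stated assumptions: the control-file path ("/dev/memcg/memory.zswapd_pressure_level") and the numeric value used for the critical level are guesses, since the cftype table and the zswapd_pressure_level enum values are not part of this hunk.

/* Hypothetical userspace listener for zswapd pressure events. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/eventfd.h>

int main(void)
{
	char buf[32];
	uint64_t cnt;
	/* counter the kernel bumps via eventfd_signal() */
	int efd = eventfd(0, 0);
	/* assumed path of the cgroup file backed by zswapd_pressure_event_control() */
	int cfd = open("/dev/memcg/memory.zswapd_pressure_level", O_WRONLY);

	if (efd < 0 || cfd < 0)
		return 1;

	/* the handler parses "<eventfd> <level>"; 2 is an assumed LEVEL_CRITICAL value */
	snprintf(buf, sizeof(buf), "%d %u", efd, 2u);
	if (write(cfd, buf, strlen(buf)) < 0)
		return 1;

	/* blocks until zswapd_pressure_report() signals this level */
	if (read(efd, &cnt, sizeof(cnt)) == sizeof(cnt))
		printf("zswapd pressure event, count=%llu\n", (unsigned long long)cnt);

	return 0;
}

Because reading an eventfd returns and resets the accumulated count, the listener can loop on read() to receive each subsequent pressure report.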
+void zswapd_pressure_report(enum zswapd_pressure_level level) +{ + int ret; + + if (zswapd_press_efd[level] == NULL) + return; + + ret = eventfd_signal(zswapd_press_efd[level], 1); + if (ret < 0) + pr_err("SWAP-MM: %s : level:%u, ret:%d ", __func__, level, ret); +} + +static u64 zswapd_pid_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + return get_zswapd_pid(); +} + +static void zswapd_memcgs_param_parse(int level_num) +{ + struct mem_cgroup *memcg = NULL; + u64 score; + int i; + + while ((memcg = get_next_memcg(memcg))) { + score = atomic64_read(&memcg->memcg_reclaimed.app_score); + for (i = 0; i < level_num; ++i) + if (score >= zswap_param[i].min_score && + score <= zswap_param[i].max_score) + break; + + atomic_set(&memcg->memcg_reclaimed.ub_mem2zram_ratio, + zswap_param[i].ub_mem2zram_ratio); + atomic_set(&memcg->memcg_reclaimed.ub_zram2ufs_ratio, + zswap_param[i].ub_zram2ufs_ratio); + atomic_set(&memcg->memcg_reclaimed.refault_threshold, + zswap_param[i].refault_threshold); + } +} + +static ssize_t zswapd_memcgs_param_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + char *token = NULL; + int level_num; + int i; + + buf = strstrip(buf); + token = strsep(&buf, " "); + + if (!token) + return -EINVAL; + + if (kstrtoint(token, 0, &level_num)) + return -EINVAL; + + if (level_num > ZSWAPD_MAX_LEVEL_NUM) + return -EINVAL; + + mutex_lock(&reclaim_para_lock); + for (i = 0; i < level_num; ++i) { + token = strsep(&buf, " "); + if (!token) + goto out; + + if (kstrtoint(token, 0, &zswap_param[i].min_score) || + zswap_param[i].min_score > MAX_APP_SCORE) + goto out; + + token = strsep(&buf, " "); + if (!token) + goto out; + + if (kstrtoint(token, 0, &zswap_param[i].max_score) || + zswap_param[i].max_score > MAX_APP_SCORE) + goto out; + + token = strsep(&buf, " "); + if (!token) + goto out; + + if (kstrtoint(token, 0, &zswap_param[i].ub_mem2zram_ratio) || + zswap_param[i].ub_mem2zram_ratio > MAX_RATIO) + goto out; + + token = strsep(&buf, " "); + if (!token) + goto out; + + if (kstrtoint(token, 0, &zswap_param[i].ub_zram2ufs_ratio) || + zswap_param[i].ub_zram2ufs_ratio > MAX_RATIO) + goto out; + + token = strsep(&buf, " "); + if (!token) + goto out; + + if (kstrtoint(token, 0, &zswap_param[i].refault_threshold)) + goto out; + } + + zswapd_memcgs_param_parse(level_num); + mutex_unlock(&reclaim_para_lock); + + return nbytes; + +out: + mutex_unlock(&reclaim_para_lock); + return -EINVAL; +} + +static ssize_t zswapd_single_memcg_param_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned int ub_mem2zram_ratio; + unsigned int ub_zram2ufs_ratio; + unsigned int refault_threshold; + + buf = strstrip(buf); + + if (sscanf(buf, "%u %u %u", &ub_mem2zram_ratio, &ub_zram2ufs_ratio, + &refault_threshold) != 3) + return -EINVAL; + + if (ub_mem2zram_ratio > MAX_RATIO || ub_zram2ufs_ratio > MAX_RATIO || + refault_threshold > MAX_RATIO) + return -EINVAL; + + atomic_set(&memcg->memcg_reclaimed.ub_mem2zram_ratio, + ub_mem2zram_ratio); + atomic_set(&memcg->memcg_reclaimed.ub_zram2ufs_ratio, + ub_zram2ufs_ratio); + atomic_set(&memcg->memcg_reclaimed.refault_threshold, + refault_threshold); + + return nbytes; +} + +static ssize_t mem_cgroup_zram_wm_ratio_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + unsigned int ratio; + int ret; + + buf = strstrip(buf); + + ret = kstrtouint(buf, 10, &ratio); + if (ret) + return -EINVAL; + + if (ratio > MAX_RATIO) + 
return -EINVAL; + + atomic_set(&zram_wm_ratio, ratio); + + return nbytes; +} + +static ssize_t mem_cgroup_compress_ratio_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + unsigned int ratio; + int ret; + + buf = strstrip(buf); + + ret = kstrtouint(buf, 10, &ratio); + if (ret) + return -EINVAL; + + if (ratio > MAX_RATIO) + return -EINVAL; + + atomic_set(&compress_ratio, ratio); + + return nbytes; +} + +static int zswapd_pressure_show(struct seq_file *m, void *v) +{ + zswapd_status_show(m); + + return 0; +} + +static int memcg_active_app_info_list_show(struct seq_file *m, void *v) +{ + struct mem_cgroup_per_node *mz = NULL; + struct mem_cgroup *memcg = NULL; + struct lruvec *lruvec = NULL; + unsigned long eswap_size; + unsigned long anon_size; + unsigned long zram_size; + + while ((memcg = get_next_memcg(memcg))) { + u64 score = atomic64_read(&memcg->memcg_reclaimed.app_score); + + mz = mem_cgroup_nodeinfo(memcg, 0); + if (!mz) { + get_next_memcg_break(memcg); + return 0; + } + + lruvec = &mz->lruvec; + if (!lruvec) { + get_next_memcg_break(memcg); + return 0; + } + + anon_size = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, + MAX_NR_ZONES) + lruvec_lru_size(lruvec, + LRU_INACTIVE_ANON, MAX_NR_ZONES); + eswap_size = memcg_data_size(memcg, SWAP_SIZE); + zram_size = memcg_data_size(memcg, CACHE_SIZE); + + if (anon_size + zram_size + eswap_size == 0) + continue; + + if (!strlen(memcg->name)) + continue; + + anon_size *= PAGE_SIZE / SZ_1K; + zram_size *= PAGE_SIZE / SZ_1K; + eswap_size *= PAGE_SIZE / SZ_1K; + + seq_printf(m, "%s %llu %lu %lu %lu %llu\n", memcg->name, score, + anon_size, zram_size, eswap_size, + memcg->memcg_reclaimed.reclaimed_pagefault); + } + return 0; +} + +#ifdef CONFIG_HYPERHOLD_DEBUG +static int avail_buffers_params_show(struct seq_file *m, void *v) +{ + seq_printf(m, "avail_buffers: %u\n", atomic_read(&avail_buffers)); + seq_printf(m, "min_avail_buffers: %u\n", atomic_read(&min_avail_buffers)); + seq_printf(m, "high_avail_buffers: %u\n", atomic_read(&high_avail_buffers)); + seq_printf(m, "free_swap_threshold: %llu\n", + atomic64_read(&free_swap_threshold) * PAGE_SIZE / SZ_1M); + + return 0; +} + +static int zswapd_max_reclaim_size_show(struct seq_file *m, void *v) +{ + seq_printf(m, "zswapd_max_reclaim_size: %u\n", + atomic_read(&max_reclaim_size)); + + return 0; +} + +static int buffers_ratio_params_show(struct seq_file *m, void *v) +{ + seq_printf(m, "inactive_file_ratio: %u\n", atomic_read(&inactive_file_ratio)); + seq_printf(m, "active_file_ratio: %u\n", atomic_read(&active_file_ratio)); + + return 0; +} + +static u64 area_anon_refault_threshold_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return atomic64_read(&area_anon_refault_threshold); +} + +static u64 empty_round_skip_interval_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return atomic64_read(&empty_round_skip_interval); +} + +static u64 max_skip_interval_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return atomic64_read(&max_skip_interval); +} + +static u64 empty_round_check_threshold_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return atomic64_read(&empty_round_check_threshold); +} + +static u64 anon_refault_snapshot_min_interval_read( + struct cgroup_subsys_state *css, struct cftype *cft) +{ + return atomic64_read(&anon_refault_snapshot_min_interval); +} + +static u64 zram_critical_threshold_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return atomic64_read(&zram_critical_threshold); +} + 
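+/*
+ * Debug (CONFIG_HYPERHOLD_DEBUG) views of the reclaim tunables. The
+ * per-level parameters shown by zswapd_memcgs_param_show() below are set
+ * by writing to memory.zswapd_memcgs_param in the form:
+ *   "<level_num> {<min_score> <max_score> <ub_mem2zram_ratio> <ub_zram2ufs_ratio> <refault_threshold>} x level_num"
+ * i.e. level_num (at most ZSWAPD_MAX_LEVEL_NUM) followed by five values per
+ * level; scores are capped at MAX_APP_SCORE and the two ratios at MAX_RATIO.
+ */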
+static int zswapd_memcgs_param_show(struct seq_file *m, void *v) +{ + int i; + + for (i = 0; i < ZSWAPD_MAX_LEVEL_NUM; ++i) { + seq_printf(m, "level %d min score: %u\n", i, + zswap_param[i].min_score); + seq_printf(m, "level %d max score: %u\n", i, + zswap_param[i].max_score); + seq_printf(m, "level %d ub_mem2zram_ratio: %u\n", i, + zswap_param[i].ub_mem2zram_ratio); + seq_printf(m, "level %d ub_zram2ufs_ratio: %u\n", i, + zswap_param[i].ub_zram2ufs_ratio); + seq_printf(m, "level %d refault_threshold: %u\n", i, + zswap_param[i].refault_threshold); + } + + return 0; +} + +static int zswapd_single_memcg_param_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + + seq_printf(m, "memcg score: %llu\n", + atomic64_read(&memcg->memcg_reclaimed.app_score)); + seq_printf(m, "memcg ub_mem2zram_ratio: %u\n", + atomic_read(&memcg->memcg_reclaimed.ub_mem2zram_ratio)); + seq_printf(m, "memcg ub_zram2ufs_ratio: %u\n", + atomic_read(&memcg->memcg_reclaimed.ub_zram2ufs_ratio)); + seq_printf(m, "memcg refault_threshold: %u\n", + atomic_read(&memcg->memcg_reclaimed.refault_threshold)); + + return 0; +} + +static int zram_wm_ratio_show(struct seq_file *m, void *v) +{ + seq_printf(m, "zram_wm_ratio: %u\n", atomic_read(&zram_wm_ratio)); + + return 0; +} + +static int compress_ratio_show(struct seq_file *m, void *v) +{ + seq_printf(m, "compress_ratio: %u\n", atomic_read(&compress_ratio)); + + return 0; +} + +static int zswapd_vmstat_show(struct seq_file *m, void *v) +{ +#ifdef CONFIG_VM_EVENT_COUNTERS + unsigned long *vm_buf = NULL; + + vm_buf = kzalloc(sizeof(struct vm_event_state), GFP_KERNEL); + if (!vm_buf) + return -ENOMEM; + all_vm_events(vm_buf); + + seq_printf(m, "zswapd_wake_up:%lu\n", vm_buf[ZSWAPD_WAKEUP]); + seq_printf(m, "zswapd_area_refault:%lu\n", vm_buf[ZSWAPD_REFAULT]); + seq_printf(m, "zswapd_medium_press:%lu\n", vm_buf[ZSWAPD_MEDIUM_PRESS]); + seq_printf(m, "zswapd_critical_press:%lu\n", vm_buf[ZSWAPD_CRITICAL_PRESS]); + seq_printf(m, "zswapd_memcg_ratio_skip:%lu\n", vm_buf[ZSWAPD_MEMCG_RATIO_SKIP]); + seq_printf(m, "zswapd_memcg_refault_skip:%lu\n", vm_buf[ZSWAPD_MEMCG_REFAULT_SKIP]); + seq_printf(m, "zswapd_swapout:%lu\n", vm_buf[ZSWAPD_SWAPOUT]); + seq_printf(m, "zswapd_snapshot_times:%lu\n", vm_buf[ZSWAPD_SNAPSHOT_TIMES]); + seq_printf(m, "zswapd_reclaimed:%lu\n", vm_buf[ZSWAPD_RECLAIMED]); + seq_printf(m, "zswapd_scanned:%lu\n", vm_buf[ZSWAPD_SCANNED]); + + kfree(vm_buf); +#endif + + return 0; +} + +static int eswap_info_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long long eswap_size; + + eswap_size = memcg_data_size(memcg, WRITE_SIZE) / SZ_1K; + seq_printf(m, "Total Swapout Size: %llu kB\n", eswap_size); + + return 0; +} + +void memcg_eswap_info_show(struct seq_file *m) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + struct mem_cgroup_per_node *mz = NULL; + struct lruvec *lruvec = NULL; + unsigned long anon; + unsigned long file; + unsigned long zram; + unsigned long eswap; + + mz = mem_cgroup_nodeinfo(memcg, 0); + if (!mz) + return; + + lruvec = &mz->lruvec; + if (!lruvec) + return; + + anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) + + lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES); + file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) + + lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES); + zram = memcg_data_size(memcg, CACHE_SIZE) / SZ_1K; + eswap = memcg_data_size(memcg, SWAP_SIZE) / SZ_1K; + anon *= 
PAGE_SIZE / SZ_1K; + file *= PAGE_SIZE / SZ_1K; + seq_printf(m, "Anon:\t%12lu kB\nFile:\t%12lu kB\nzram:\t%12lu kB\nEswap:\t%12lu kB\n", + anon, file, zram, eswap); +} +#endif + +static struct cftype zswapd_policy_files[] = { + { + .name = "active_app_info_list", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = memcg_active_app_info_list_show, + }, + { + .name = "zram_wm_ratio", + .flags = CFTYPE_ONLY_ON_ROOT, + .write = mem_cgroup_zram_wm_ratio_write, +#ifdef CONFIG_HYPERHOLD_DEBUG + .seq_show = zram_wm_ratio_show, +#endif + }, + { + .name = "compress_ratio", + .flags = CFTYPE_ONLY_ON_ROOT, + .write = mem_cgroup_compress_ratio_write, +#ifdef CONFIG_HYPERHOLD_DEBUG + .seq_show = compress_ratio_show, +#endif + }, + { + .name = "zswapd_pressure", + .flags = CFTYPE_ONLY_ON_ROOT, + .write = zswapd_pressure_event_control, + }, + { + .name = "zswapd_pid", + .flags = CFTYPE_ONLY_ON_ROOT, + .read_u64 = zswapd_pid_read, + }, + { + .name = "avail_buffers", + .flags = CFTYPE_ONLY_ON_ROOT, + .write = avail_buffers_params_write, +#ifdef CONFIG_HYPERHOLD_DEBUG + .seq_show = avail_buffers_params_show, +#endif + }, + { + .name = "zswapd_max_reclaim_size", + .flags = CFTYPE_ONLY_ON_ROOT, + .write = zswapd_max_reclaim_size_write, +#ifdef CONFIG_HYPERHOLD_DEBUG + .seq_show = zswapd_max_reclaim_size_show, +#endif + }, + { + .name = "area_anon_refault_threshold", + .flags = CFTYPE_ONLY_ON_ROOT, + .write_u64 = area_anon_refault_threshold_write, +#ifdef CONFIG_HYPERHOLD_DEBUG + .read_u64 = area_anon_refault_threshold_read, +#endif + }, + { + .name = "empty_round_skip_interval", + .flags = CFTYPE_ONLY_ON_ROOT, + .write_u64 = empty_round_skip_interval_write, +#ifdef CONFIG_HYPERHOLD_DEBUG + .read_u64 = empty_round_skip_interval_read, +#endif + }, + { + .name = "max_skip_interval", + .flags = CFTYPE_ONLY_ON_ROOT, + .write_u64 = max_skip_interval_write, +#ifdef CONFIG_HYPERHOLD_DEBUG + .read_u64 = max_skip_interval_read, +#endif + }, + { + .name = "empty_round_check_threshold", + .flags = CFTYPE_ONLY_ON_ROOT, + .write_u64 = empty_round_check_threshold_write, +#ifdef CONFIG_HYPERHOLD_DEBUG + .read_u64 = empty_round_check_threshold_read, +#endif + }, + { + .name = "anon_refault_snapshot_min_interval", + .flags = CFTYPE_ONLY_ON_ROOT, + .write_u64 = anon_refault_snapshot_min_interval_write, +#ifdef CONFIG_HYPERHOLD_DEBUG + .read_u64 = anon_refault_snapshot_min_interval_read, +#endif + }, + { + .name = "zswapd_memcgs_param", + .flags = CFTYPE_ONLY_ON_ROOT, + .write = zswapd_memcgs_param_write, +#ifdef CONFIG_HYPERHOLD_DEBUG + .seq_show = zswapd_memcgs_param_show, +#endif + }, + { + .name = "zswapd_single_memcg_param", + .write = zswapd_single_memcg_param_write, +#ifdef CONFIG_HYPERHOLD_DEBUG + .seq_show = zswapd_single_memcg_param_show, +#endif + }, + { + .name = "buffer_ratio_params", + .flags = CFTYPE_ONLY_ON_ROOT, + .write = buffers_ratio_params_write, +#ifdef CONFIG_HYPERHOLD_DEBUG + .seq_show = buffers_ratio_params_show, +#endif + }, + { + .name = "zswapd_pressure_show", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = zswapd_pressure_show, + }, + { + .name = "zram_critical_threshold", + .flags = CFTYPE_ONLY_ON_ROOT, + .write_u64 = zram_critical_thres_write, +#ifdef CONFIG_HYPERHOLD_DEBUG + .read_u64 = zram_critical_threshold_read, +#endif + }, + +#ifdef CONFIG_HYPERHOLD_DEBUG + { + .name = "zswapd_vmstat_show", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = zswapd_vmstat_show, + }, +#endif + { + .name = "eswap_info", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = eswap_info_show, + }, + + { }, /* terminate */ +}; + +static 
int __init zswapd_policy_init(void)
+{
+	if (!mem_cgroup_disabled())
+		WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, zswapd_policy_files));
+
+	return 0;
+}
+subsys_initcall(zswapd_policy_init);
diff --git a/mm/zswapd_internal.h b/mm/zswapd_internal.h
new file mode 100644
index 0000000000000000000000000000000000000000..1447882ae49725663a160ed2d7a106690dd67e9b
--- /dev/null
+++ b/mm/zswapd_internal.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * mm/zswapd_internal.h
+ *
+ * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd.
+ */
+
+#ifndef _ZSWAPD_INTERNAL_H
+#define _ZSWAPD_INTERNAL_H
+
+enum zswapd_pressure_level {
+	LEVEL_LOW = 0,
+	LEVEL_MEDIUM,
+	LEVEL_CRITICAL,
+	LEVEL_COUNT
+};
+
+enum zswapd_eswap_policy {
+	CHECK_BUFFER_ONLY = 0,
+	CHECK_BUFFER_ZRAMRATIO_BOTH
+};
+
+void zswapd_pressure_report(enum zswapd_pressure_level level);
+inline unsigned int get_zram_wm_ratio(void);
+inline unsigned int get_compress_ratio(void);
+inline unsigned int get_avail_buffers(void);
+inline unsigned int get_min_avail_buffers(void);
+inline unsigned int get_high_avail_buffers(void);
+inline unsigned int get_zswapd_max_reclaim_size(void);
+inline unsigned int get_inactive_file_ratio(void);
+inline unsigned int get_active_file_ratio(void);
+inline unsigned long long get_area_anon_refault_threshold(void);
+inline unsigned long long get_anon_refault_snapshot_min_interval(void);
+inline unsigned long long get_empty_round_skip_interval(void);
+inline unsigned long long get_max_skip_interval(void);
+inline unsigned long long get_empty_round_check_threshold(void);
+inline unsigned long long get_zram_critical_threshold(void);
+u64 memcg_data_size(struct mem_cgroup *memcg, int type);
+u64 swapin_memcg(struct mem_cgroup *memcg, u64 req_size);
+
+#endif /* _ZSWAPD_INTERNAL_H */