diff --git a/Documentation/vm/kidled.rst b/Documentation/vm/kidled.rst index f5ce2fed2a8ccfdfe1563b4af43257ab6712ec53..7aa9a07ff773f5b692379126f2ef9d39f390946b 100644 --- a/Documentation/vm/kidled.rst +++ b/Documentation/vm/kidled.rst @@ -7,18 +7,29 @@ kidled Introduction ============ -kidled uses a kernel thread to scan the pages on LRU list, and supports to -output statistics for each memory cgroup (process is not supported yet). -kidled scans pages round to round indexed by pfn, and will try to finish each -round in a fixed duration which is named as scan period. Of course, users can -set the scan period whose unit is seconds. Each page has an attribute named -as 'idle age', which represents how long the page is kept in idle state, the -age's unit is in one scan period. The idle aging information (field) consumes +kidled uses a kernel thread to scan the pages and slab objects on LRU list +respectively, and supports to output statistics for each memory cgroup +(process is not supported yet). Kidled scans pages round to round indexed +by pfn, but scanning slab objects is different. Slab lru list is not stable as +time goes, hence we regard the first accessed slab lru size as the real +size that kidled should scan for the specified slab in a round. +Kidled scanning will try to finish each round in a fixed duration which +is named as scan period. Of course, users can set the scan period whose +unit is seconds. Scanned objects have an attribute named as 'idle age', +which represents how long the object is kept in idle state, the age's unit +is in one scan period. The idle aging information (field) of the page consumes one byte, which is stored in dynamically allocated array, tied with the NUMA -node or flags field of page descriptor (struct page). So the maximal age is -255. kidled eventually shows the histogram statistics through memory cgroup -files (``memory.idle_page_stats``). 
The statistics could be used to evaluate -the working-set size of that memory cgroup or the hierarchy. +node or flags field of page descriptor (struct page). Meanwhile, Slab objects +use two bytes to store the information, its lower byte to store the idle aging +information and upper byte to make a mark to avoid accessing an object more +than one time. So the maximal age is 255. kidled eventually shows the histogram +statistics through memory cgroup files (``memory.idle_page_stats``). The statistics +could be used to evaluate the working-set size of that memory cgroup or the hierarchy. + +Especially, we add a switch to control whether to scan slab or not. That isolates +page scan and slab scan effectively to avoid too many slab objects interfering +with page scan. Because it is important for us to reap cold userspace pages, which +reclaims more memory at a lower cost. Note: The implementation of kidled had referred to Michel Lespinasse's patch: https://lore.kernel.org/lkml/20110922161448.91a2e2b2.akpm@google.com/T/ @@ -63,7 +74,25 @@ Here are their functions: statistics, but it won't be very odd due to the duration are the same at least. -* ``memory.idle_page_stats.local`` (memory cgroup v1/v2) +* ``/sys/kernel/mm/kidled/scan_target`` + + It controls which type kidled will scan, there are three kinds of type + that could be selected: scan page only, scan slab only, scan both page and + slab. The users can enable them as follows. Other values will be invalid. + + To scan user page only + echo 1 > ``/sys/kernel/mm/kidled/scan_target`` + To scan slab only + echo 2 > ``/sys/kernel/mm/kidled/scan_target`` + Both scan page and slab + echo 3 > ``/sys/kernel/mm/kidled/scan_target`` + + By default, kidled will not scan slab because the cpu load will be very + high if the system has a lot of reclaimable slabs. But we need to enable + it when userspace pages have been reclaimed and a lot of reclaimable + slabs are in the system. 
We'd better mark and reclaim the cold slab in + front of the memory reclaim triggered by allocating memory request. + It shows histogram of idle statistics for the corresponding memory cgroup. @@ -77,7 +106,8 @@ Here are their functions: ----------------------------- snapshot start ----------------------------- # version: 1.0 - # scans: 1380 + # page_scans: 92 + # slab_scans: 92 # scan_period_in_seconds: 120 # buckets: 1,2,5,15,30,60,120,240 # @@ -85,27 +115,30 @@ Here are their functions: # / _----=> swap/file # | / _---=> evict/unevict # || / _--=> inactive/active - # ||| / - # |||| [1,2) [2,5) [5,15) [15,30) [30,60) [60,120) [120,240) [240,+inf) - csei 0 0 0 0 0 0 0 0 - dsei 0 0 442368 49152 0 49152 212992 7741440 - cfei 4096 233472 1171456 1032192 28672 65536 122880 147550208 - dfei 0 0 4096 20480 4096 0 12288 12288 - csui 0 0 0 0 0 0 0 0 - dsui 0 0 0 0 0 0 0 0 - cfui 0 0 0 0 0 0 0 0 - dfui 0 0 0 0 0 0 0 0 - csea 77824 331776 1216512 1069056 217088 372736 327680 33284096 - dsea 0 0 0 0 0 0 0 139264 - cfea 4096 57344 606208 13144064 53248 135168 1683456 48357376 - dfea 0 0 0 0 0 0 0 0 - csua 0 0 0 0 0 0 0 0 - dsua 0 0 0 0 0 0 0 0 - cfua 0 0 0 0 0 0 0 0 - dfua 0 0 0 0 0 0 0 0 + # ||| / _-=> slab + # |||| / + # ||||| [1,2) [2,5) [5,15) [15,30) [30,60) [60,120) [120,240) [240,+inf) + csei 0 0 0 0 0 0 0 0 + dsei 0 16384 0 0 0 360448 0 0 + cfei 774144 3624960 1744896 1298432 20676608 161087488 0 0 + dfei 0 0 16384 0 24576 0 0 0 + csui 0 0 0 0 0 0 0 0 + dsui 0 0 0 0 0 0 0 0 + cfui 0 0 0 0 0 0 0 0 + dfui 0 0 0 0 0 0 0 0 + csea 278528 3510272 389120 872448 806912 22716416 0 0 + dsea 0 12288 0 0 0 196608 0 0 + cfea 1298432 12115968 3510272 10518528 78409728 1503793152 0 0 + dfea 0 0 0 0 0 4096 0 0 + csua 0 0 0 0 0 0 0 0 + dsua 0 0 0 0 0 0 0 0 + cfua 0 0 0 0 0 0 0 0 + dfua 0 0 0 0 0 0 0 0 + slab 2704 832 15600 20800 70720 763819160 0 0 ----------------------------- snapshot end ----------------------------- - ``scans`` means how many rounds current cgroup has been 
scanned. + ``page_scans`` means how many rounds current cgroup's pagecache has been scanned. + ``slab_scans`` means how many rounds current cgroup's slab has been scanned. ``scan_period_in_seconds`` means kidled will take how long to finish one round. ``buckets`` is to allow scripts parsing easily. The table shows how many bytes are in idle state, the row is indexed by idle @@ -132,7 +165,8 @@ Here are their functions: $ sudo bash -c "echo '' > /sys/fs/cgroup/memory/test/memory.idle_page_stats" $ cat /sys/fs/cgroup/memory/test/memory.idle_page_stats # version: 1.0 - # scans: 0 + # page_scans: 0 + # slab_scans: 0 # scan_period_in_seconds: 1 # buckets: no valid bucket available ----------------------------- snapshot end ----------------------------- diff --git a/fs/dcache.c b/fs/dcache.c index 4030c010a76820d9d3f7f9abde97c9a11ebb4e73..713e686de0cc1bc24b498268be454a7ac90bb109 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "internal.h" #include "mount.h" @@ -1288,6 +1289,52 @@ long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc) return freed; } +#ifdef CONFIG_KIDLED +/* + * It will take a lot of time in spin_trylock and spin_unlock when + * scanning the slab. I remove the lock operation directly even if + * it can bring in some inaccuracy in statistics. Meanwhile, it is + * safe because the dentry will not be released when lru lock is held. 
+ */ +static enum lru_status dentry_lru_cold_count(struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) +{ + struct dentry *dentry = container_of(item, struct dentry, d_lru); + static int dentry_size; + u16 dentry_age = kidled_get_slab_age(dentry); + + /* avoid an object to scan twice in an round */ + if (dentry_age && + kidled_is_slab_scanned(dentry_age, kidled_scan_rounds)) + goto out; + + if (READ_ONCE(dentry->d_lockref.count) || + (dentry->d_flags & DCACHE_REFERENCED)) { + if (dentry_age) + kidled_set_slab_age(dentry, 0); + goto out; + } + + kidled_clear_slab_scanned(dentry); + if (unlikely(!dentry_size)) + dentry_size = ksize(dentry); + dentry_age = kidled_inc_slab_age(dentry); + kidled_mem_cgroup_slab_account(dentry, dentry_age, dentry_size); + kidled_mark_slab_scanned(dentry, kidled_scan_rounds); +out: + return LRU_ROTATE_DELAY; +} + +void cold_dcache_sb(struct super_block *sb, struct shrink_control *sc) +{ + unsigned long nr_to_walk = sc->nr_to_scan; + + list_lru_walk_node(&sb->s_dentry_lru, sc->nid, + dentry_lru_cold_count, + NULL, &nr_to_walk); +} +#endif + static enum lru_status dentry_lru_isolate_shrink(struct list_head *item, struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) { @@ -1810,6 +1857,7 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) dentry->d_lockref.count = 1; dentry->d_flags = 0; + kidled_set_slab_age(dentry, 0); spin_lock_init(&dentry->d_lock); seqcount_spinlock_init(&dentry->d_seq, &dentry->d_lock); dentry->d_inode = NULL; diff --git a/fs/inode.c b/fs/inode.c index 2c44dda61a690d254849f71292aaf268ded0ff77..6606fa242b28e8e8b806191904511f2eab407484 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -19,6 +19,7 @@ #include /* for inode_has_buffers */ #include #include +#include #include #include #include "internal.h" @@ -162,6 +163,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_sb = sb; inode->i_blkbits = sb->s_blocksize_bits; 
inode->i_flags = 0; + kidled_set_slab_age(inode, 0); atomic64_set(&inode->i_sequence, 0); atomic_set(&inode->i_count, 1); inode->i_op = &empty_iops; @@ -928,6 +930,58 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc) return freed; } +#ifdef CONFIG_KIDLED +/* + * The implementation principle is similar to the dentry's. It will + * take a lot of time in spin_lock/spin_unlock, which is useless since + * we only want to know the real free slab. + */ +static enum lru_status inode_lru_cold_count(struct list_head *item, + struct list_lru_one *lru, spinlock_t *lock, void *arg) +{ + struct inode *inode = container_of(item, struct inode, i_lru); + static int inode_size; + u16 inode_age = kidled_get_slab_age(inode); + + if (inode_age && + kidled_is_slab_scanned(inode_age, kidled_scan_rounds)) + goto out; + + if (atomic_read(&inode->i_count) || + (inode->i_state & I_REFERENCED)) { + if (unlikely(inode_age)) + kidled_set_slab_age(inode, 0); + goto out; + } + + if (inode->i_data.nrpages || + !list_empty(&inode->i_data.private_list)) { + if (unlikely(inode_age)) + kidled_set_slab_age(inode, 0); + goto out; + } + + kidled_clear_slab_scanned(inode); + if (unlikely(!inode_size)) + inode_size = ksize(inode); + inode_age = kidled_inc_slab_age(inode); + kidled_mem_cgroup_slab_account(inode, inode_age, inode_size); + kidled_mark_slab_scanned(inode, kidled_scan_rounds); +out: + return LRU_ROTATE_DELAY; +} + +void cold_icache_sb(struct super_block *sb, + struct shrink_control *sc) +{ + unsigned long nr_to_walk = sc->nr_to_scan; + + list_lru_walk_node(&sb->s_inode_lru, sc->nid, + inode_lru_cold_count, NULL, + &nr_to_walk); +} +#endif + static void __wait_on_freeing_inode(struct inode *inode); /* * Called with the inode lock held. 
@@ -1533,6 +1587,10 @@ struct inode *ilookup5(struct super_block *sb, unsigned long hashval, if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; + } else { + /* reset its age if it has already had an age */ + if (kidled_get_slab_age(inode)) + kidled_set_slab_age(inode, 0); } } return inode; @@ -1563,6 +1621,10 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino) if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; + } else { + /* reset its age if it has already had an age */ + if (kidled_get_slab_age(inode)) + kidled_set_slab_age(inode, 0); } } return inode; diff --git a/fs/internal.h b/fs/internal.h index d64ae03998cce9dcffdad33baa3e52f4f7dc8983..e527ff2f732a8fc64af66e5c957477cc68d93a6a 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -4,6 +4,9 @@ * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ +#ifdef CONFIG_KIDLED +#include +#endif struct super_block; struct file_system_type; @@ -207,6 +210,12 @@ extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *) extern char *simple_dname(struct dentry *, char *, int); extern void dput_to_list(struct dentry *, struct list_head *); extern void shrink_dentry_list(struct list_head *); +#ifdef CONFIG_KIDLED +extern void cold_dcache_sb(struct super_block *sb, + struct shrink_control *sc); +extern void cold_icache_sb(struct super_block *sb, + struct shrink_control *sc); +#endif /* * pipe.c @@ -298,3 +307,4 @@ ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *po struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns); struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap); void mnt_idmap_put(struct mnt_idmap *idmap); + diff --git a/fs/namei.c b/fs/namei.c index 887e8bfaf25ae6fd3b1ce267b3afc485ff12685a..bc3cc6554112574f345b356881180e67d1faa40c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -41,6 +41,7 @@ #include #include #include +#include #include "internal.h" #include 
"mount.h" @@ -858,10 +859,14 @@ static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry) static inline int d_revalidate(struct dentry *dentry, unsigned int flags) { + int status = 1; + if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) - return dentry->d_op->d_revalidate(dentry, flags); - else - return 1; + status = dentry->d_op->d_revalidate(dentry, flags); + /* Reset the age when lookuping the dentry successfully */ + if (status > 0 && kidled_get_slab_age(dentry)) + kidled_set_slab_age(dentry, 0); + return status; } /** diff --git a/fs/super.c b/fs/super.c index 992a6f80252dbd9d67f9f08dba5f7e877da2eb4e..49774f1814a0c07f3bad04a7780812846f9d7d10 100644 --- a/fs/super.c +++ b/fs/super.c @@ -277,6 +277,41 @@ static unsigned long super_cache_count(struct shrinker *shrink, return total_objects; } +#ifdef CONFIG_KIDLED +static unsigned long super_cache_cold(struct shrinker *shrinker, + struct shrink_control *sc) +{ + struct super_block *sb; + unsigned long dentry_objects, inode_objects; + unsigned long dentries, inodes; + unsigned long total_objects; + unsigned long nr_to_scan = sc->nr_to_scan; + + sb = shrinker->private_data; + + if (!super_trylock_shared(sb)) + return SHRINK_STOP; + + dentry_objects = list_lru_shrink_count(&sb->s_dentry_lru, sc); + inode_objects = list_lru_shrink_count(&sb->s_inode_lru, sc); + total_objects = dentry_objects + inode_objects; + if (!total_objects) + total_objects = 1; + + /* make sure dentries and inodes scan at least one object */ + dentries = mult_frac(nr_to_scan, dentry_objects, total_objects); + inodes = mult_frac(nr_to_scan, inode_objects, total_objects); + + sc->nr_to_scan = dentries + 1; + cold_dcache_sb(sb, sc); + sc->nr_to_scan = inodes + 1; + cold_icache_sb(sb, sc); + super_unlock_shared(sb); + + return nr_to_scan; +} +#endif + static void destroy_super_work(struct work_struct *work) { struct super_block *s = container_of(work, struct super_block, @@ -390,6 +425,9 @@ static struct super_block 
*alloc_super(struct file_system_type *type, int flags, s->s_shrink->scan_objects = super_cache_scan; s->s_shrink->count_objects = super_cache_count; +#ifdef CONFIG_KIDLED + s->s_shrink->cold_objects = super_cache_cold; +#endif s->s_shrink->batch = 1024; s->s_shrink->private_data = s; diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 05b9b6e86c3a9a4d836f3d665e8f03ef18d6957b..224fa7e227950e6e86965885ecdbdc581bea3f93 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -113,6 +113,7 @@ struct dentry { struct rcu_head d_rcu; } d_u; + CK_KABI_RESERVE(1) CK_KABI_RESERVE(2) } __randomize_layout; diff --git a/include/linux/kidled.h b/include/linux/kidled.h index e0ae7700fa19ad5a0a5c4062afbabb0619f906ed..0fc766b9137176b1cbf8df3a6b8d360705fd5d8a 100644 --- a/include/linux/kidled.h +++ b/include/linux/kidled.h @@ -5,8 +5,25 @@ #ifdef CONFIG_KIDLED #include +#include #define KIDLED_VERSION "1.0" +struct mem_cgroup; + +/* + * Kidled_scan_type define the scan type that kidled will + * work at. The default option is to scan page only, but + * it can be modified by a specified interface at any time. + */ +enum kidled_scan_type { + SCAN_TARGET_PAGE = 0, + SCAN_TARGET_SLAB, + SCAN_TARGET_ALL +}; + +#define KIDLED_SCAN_PAGE (1 << SCAN_TARGET_PAGE) +#define KIDLED_SCAN_SLAB (1 << SCAN_TARGET_SLAB) +#define KIDLED_SCAN_ALL (KIDLED_SCAN_PAGE | KIDLED_SCAN_SLAB) /* * We want to get more info about a specified idle page, whether it's @@ -17,17 +34,19 @@ * KIDLE_FILE : page is a page cache or not; * KIDLE_UNEVIT : page is unevictable or evictable; * KIDLE_ACTIVE : page is in active LRU list or not. + * KIDLE_SLAB : whether it belongs to a slab or not. * * Each KIDLE_ occupies one bit position in a specified idle type. - * There exist total 2^4=16 idle types. + * There exist total 2^4+1=17 idle types. 
*/ #define KIDLE_BASE 0 #define KIDLE_DIRTY (1 << 0) #define KIDLE_FILE (1 << 1) #define KIDLE_UNEVICT (1 << 2) #define KIDLE_ACTIVE (1 << 3) +#define KIDLE_SLAB (1 << 4) -#define KIDLE_NR_TYPE 16 +#define KIDLE_NR_TYPE 17 /* * Each page has an idle age which means how long the page is keeping @@ -68,6 +87,9 @@ * kidled_get_bucket(). User shouldn't use KIDLED_INVALID_BUCKET directly. */ #define KIDLED_INVALID_BUCKET (KIDLED_MAX_IDLE_AGE + 1) +/* Mark the higher byte as an sign of slab objects access in a round */ +#define KIDLED_SLAB_ACCESS_MASK 0xff00 +#define KIDLED_SLAB_ACCESS_SHIFT 0x8 #define KIDLED_MARK_BUCKET_INVALID(buckets) \ (buckets[0] = KIDLED_INVALID_BUCKET) @@ -76,6 +98,12 @@ DECLARE_STATIC_KEY_FALSE(kidled_enabled_key); +static inline bool kidled_is_slab_scanned(unsigned short slab_age, + unsigned long scan_rounds) +{ + return slab_age >> KIDLED_SLAB_ACCESS_SHIFT == (scan_rounds & 0xff); +} + /* * We account number of idle pages depending on idle type and buckets * for a specified instance (e.g. one memory cgroup or one process...) @@ -98,7 +126,7 @@ struct idle_page_stats { * least. 
*/ #define KIDLED_MAX_SCAN_DURATION U16_MAX /* max 65536 seconds */ -struct kidled_scan_period { +struct kidled_scan_control { union { atomic_t val; struct { @@ -106,58 +134,135 @@ struct kidled_scan_period { u16 duration; /* in seconds */ }; }; + unsigned int scan_target; /* decide how kidled to scan */ }; -extern struct kidled_scan_period kidled_scan_period; +extern struct kidled_scan_control kidled_scan_control; +extern unsigned int kidled_scan_target; +extern unsigned long kidled_scan_rounds; #define KIDLED_OP_SET_DURATION (1 << 0) #define KIDLED_OP_INC_SEQ (1 << 1) -static inline struct kidled_scan_period kidled_get_current_scan_period(void) +#ifdef CONFIG_MEMCG_KMEM +extern bool cgroup_memory_nokmem; +#else +#define cgroup_memory_nokmem 1 +#endif +extern int kidled_alloc_slab_age(struct slab *slab, struct kmem_cache *s, gfp_t flags); +extern void kidled_free_slab_age(struct slab *slab); +extern void kidled_mem_cgroup_account(struct folio *folio, + void *ptr, int age, unsigned long size); +static inline void kidled_mem_cgroup_slab_account(void *object, + int age, int size) +{ + struct folio *folio; + + folio = virt_to_folio(object); + kidled_mem_cgroup_account(folio, object, age, size); +} + +static inline struct kidled_scan_control kidled_get_current_scan_control(void) { - struct kidled_scan_period scan_period; + struct kidled_scan_control scan_control; - atomic_set(&scan_period.val, atomic_read(&kidled_scan_period.val)); - return scan_period; + atomic_set(&scan_control.val, atomic_read(&kidled_scan_control.val)); + scan_control.scan_target = kidled_scan_target; + return scan_control; } static inline unsigned int kidled_get_current_scan_duration(void) { - struct kidled_scan_period scan_period = - kidled_get_current_scan_period(); + struct kidled_scan_control scan_control = + kidled_get_current_scan_control(); - return scan_period.duration; + return scan_control.duration; } -static inline void kidled_reset_scan_period(struct kidled_scan_period *p) +static 
inline void kidled_reset_scan_control(struct kidled_scan_control *p) { atomic_set(&p->val, 0); + p->scan_target = KIDLED_SCAN_PAGE; } /* - * Compare with global kidled_scan_period, return true if equals. + * Compare with global kidled_scan_control, return true if equals. */ -static inline bool kidled_is_scan_period_equal(struct kidled_scan_period *p) +static inline bool kidled_is_scan_period_equal(struct kidled_scan_control *p) +{ + return atomic_read(&p->val) == atomic_read(&kidled_scan_control.val); +} + +static inline bool kidled_has_slab_target(struct kidled_scan_control *p) +{ + return p->scan_target & KIDLED_SCAN_SLAB; +} + +static inline bool kidled_has_page_target(struct kidled_scan_control *p) +{ + return p->scan_target & KIDLED_SCAN_PAGE; +} + +static inline bool kidled_has_slab_target_equal(struct kidled_scan_control *p) +{ + if (!kidled_has_slab_target(p)) + return false; + + return kidled_scan_target & KIDLED_SCAN_SLAB; +} + +static inline bool +kidled_is_scan_target_equal(struct kidled_scan_control *p) +{ + return p->scan_target == kidled_scan_target; +} + +static inline bool +kidled_has_slab_target_only(struct kidled_scan_control *p) +{ + return p->scan_target == KIDLED_SCAN_SLAB; +} + +static inline bool +kidled_has_page_target_only(struct kidled_scan_control *p) +{ + return p->scan_target == KIDLED_SCAN_PAGE; +} + +static inline bool +kidled_has_page_target_equal(struct kidled_scan_control *p) { - return atomic_read(&p->val) == atomic_read(&kidled_scan_period.val); + if (!kidled_has_page_target(p)) + return false; + + return kidled_scan_target & KIDLED_SCAN_PAGE; +} + +static inline void kidled_get_reset_type(struct kidled_scan_control *p, + bool *page_disabled, bool *slab_disabled) +{ + if (kidled_has_page_target(p) && !kidled_has_page_target_equal(p)) + *page_disabled = 1; + if (kidled_has_slab_target(p) && !kidled_has_slab_target_equal(p)) + *slab_disabled = 1; } -static inline bool kidled_set_scan_period(int op, u16 duration, - struct 
kidled_scan_period *orig) +static inline bool kidled_set_scan_control(int op, u16 duration, + struct kidled_scan_control *orig) { bool retry = false; /* - * atomic_cmpxchg() tries to update kidled_scan_period, shouldn't + * atomic_cmpxchg() tries to update kidled_scan_control, shouldn't * retry to avoid endless loop when caller specify a period. */ if (!orig) { - orig = &kidled_scan_period; + orig = &kidled_scan_control; retry = true; } while (true) { int new_period_val, old_period_val; - struct kidled_scan_period new_period; + struct kidled_scan_control new_period; old_period_val = atomic_read(&orig->val); atomic_set(&new_period.val, old_period_val); @@ -167,7 +272,7 @@ static inline bool kidled_set_scan_period(int op, u16 duration, new_period.duration = duration; new_period_val = atomic_read(&new_period.val); - if (atomic_cmpxchg(&kidled_scan_period.val, + if (atomic_cmpxchg(&kidled_scan_control.val, old_period_val, new_period_val) == old_period_val) return true; @@ -179,7 +284,7 @@ static inline bool kidled_set_scan_period(int op, u16 duration, static inline void kidled_set_scan_duration(u16 duration) { - kidled_set_scan_period(KIDLED_OP_INC_SEQ | + kidled_set_scan_control(KIDLED_OP_INC_SEQ | KIDLED_OP_SET_DURATION, duration, NULL); } @@ -193,7 +298,8 @@ static inline bool is_kidled_enabled(void) * Caller must specify the original scan period, avoid the race between * the double operation and user's updates through sysfs interface. 
*/ -static inline bool kidled_try_double_scan_period(struct kidled_scan_period orig) +static inline bool +kidled_try_double_scan_control(struct kidled_scan_control orig) { u16 duration = orig.duration; @@ -203,7 +309,7 @@ static inline bool kidled_try_double_scan_period(struct kidled_scan_period orig) duration <<= 1; if (duration < orig.duration) duration = KIDLED_MAX_SCAN_DURATION; - return kidled_set_scan_period(KIDLED_OP_INC_SEQ | + return kidled_set_scan_control(KIDLED_OP_INC_SEQ | KIDLED_OP_SET_DURATION, duration, &orig); @@ -215,7 +321,38 @@ static inline bool kidled_try_double_scan_period(struct kidled_scan_period orig) */ static inline void kidled_inc_scan_seq(void) { - kidled_set_scan_period(KIDLED_OP_INC_SEQ, 0, NULL); + kidled_set_scan_control(KIDLED_OP_INC_SEQ, 0, NULL); +} + +extern bool page_has_slab_age(struct slab *slab); +extern unsigned short kidled_get_slab_age(void *object); +extern void kidled_set_slab_age(void *object, unsigned short age); +static inline unsigned short kidled_inc_slab_age(void *object) +{ + unsigned short slab_age = kidled_get_slab_age(object); + + if (slab_age < KIDLED_MAX_IDLE_AGE) { + slab_age++; + kidled_set_slab_age(object, slab_age); + } + + return slab_age; +} + +static inline void kidled_clear_slab_scanned(void *object) +{ + unsigned short slab_age = kidled_get_slab_age(object); + + slab_age &= ~KIDLED_SLAB_ACCESS_MASK; + kidled_set_slab_age(object, slab_age); +} + +static inline void kidled_mark_slab_scanned(void *object, unsigned long scan_rounds) +{ + unsigned short slab_age = kidled_get_slab_age(object); + + slab_age |= (scan_rounds & 0xff) << KIDLED_SLAB_ACCESS_SHIFT; + kidled_set_slab_age(object, slab_age); } extern const int kidled_default_buckets[NUM_KIDLED_BUCKETS]; @@ -224,7 +361,7 @@ extern const int kidled_default_buckets[NUM_KIDLED_BUCKETS]; void kidled_mem_cgroup_move_stats(struct mem_cgroup *from, struct mem_cgroup *to, struct folio *folio, - unsigned int nr_pages); + unsigned long size); #endif /* 
CONFIG_MEMCG */ #ifdef KIDLED_AGE_NOT_IN_PAGE_FLAGS @@ -237,11 +374,33 @@ void kidled_free_folio_age(pg_data_t *pgdat); static inline void kidled_mem_cgroup_move_stats(struct mem_cgroup *from, struct mem_cgroup *to, struct folio *folio, - unsigned int nr_pages) + unsigned long size) { } #endif /* CONFIG_MEMCG */ +static inline unsigned short kidled_get_slab_age(void *object) +{ + return 0; +} + +static inline void kidled_set_slab_age(void *object, unsigned short age) +{ +} + +static inline int kidled_alloc_slab_age(struct slab *slab, struct kmem_cache *s, gfp_t flags) +{ + return 0; +} + +static inline void kidled_free_slab_age(struct slab *slab) +{ +} + +static inline bool page_has_slab_age(struct slab *slab) +{ + return false; +} static inline unsigned int kidled_get_current_scan_duration(void) { return 0; diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 069d0515b50c35e6b1cd43cbb76556f4fef55b4a..b3ece2916b5a1c9922fef515128d897caa1fc8fa 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -21,6 +21,9 @@ enum lru_status { LRU_REMOVED_RETRY, /* item removed, but lock has been dropped and reacquired */ LRU_ROTATE, /* item referenced, give another pass */ +#ifdef CONFIG_KIDLED + LRU_ROTATE_DELAY, /* item rotate, but not execute immediately */ +#endif LRU_SKIP, /* item cannot be locked, skip */ LRU_RETRY, /* item not freeable. May drop the lock internally, but has to return locked. 
*/ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 79dac3a65e6976bdd447bddaeb5becfeae625ba1..fc07bd75557b611135c31f5cfc81dec75fc1bc2a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -445,8 +445,9 @@ struct mem_cgroup { #ifdef CONFIG_KIDLED struct rw_semaphore idle_stats_rwsem; - unsigned long idle_scans; - struct kidled_scan_period scan_period; + unsigned long idle_page_scans; + unsigned long idle_slab_scans; + struct kidled_scan_control scan_control; int idle_stable_idx; struct idle_page_stats idle_stats[KIDLED_STATS_NR_TYPE]; #endif @@ -477,8 +478,10 @@ enum page_memcg_data_flags { MEMCG_DATA_OBJCGS = (1UL << 0), /* page has been accounted as a non-slab kernel page */ MEMCG_DATA_KMEM = (1UL << 1), + /* page->memcg_data is a pointer to the slab age */ + MEMCG_DATA_SLAB_AGE = (1UL << 2), /* the next bit after the last actual flag */ - __NR_MEMCG_DATA_FLAGS = (1UL << 2), + __NR_MEMCG_DATA_FLAGS = (1UL << 3), }; #define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1) @@ -630,7 +633,7 @@ static inline struct mem_cgroup *folio_memcg_check(struct folio *folio) */ unsigned long memcg_data = READ_ONCE(folio->memcg_data); - if (memcg_data & MEMCG_DATA_OBJCGS) + if ((memcg_data & MEMCG_DATA_OBJCGS) || (memcg_data & MEMCG_DATA_SLAB_AGE)) return NULL; if (memcg_data & MEMCG_DATA_KMEM) { diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index a49e6d6aecbfb42fa84d269c6261f03c77fae5e6..aff540cf1f018d2182e62cc4212e46adbcaf9b5c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -180,7 +180,7 @@ struct page { /* Usage count. *DO NOT USE DIRECTLY*. 
See page_ref.h */ atomic_t _refcount; -#ifdef CONFIG_MEMCG +#if defined(CONFIG_MEMCG) || defined(CONFIG_KIDLED) unsigned long memcg_data; #endif @@ -342,7 +342,7 @@ struct folio { }; atomic_t _mapcount; atomic_t _refcount; -#ifdef CONFIG_MEMCG +#if defined(CONFIG_MEMCG) || defined(CONFIG_KIDLED) unsigned long memcg_data; #endif #if defined(WANT_PAGE_VIRTUAL) diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 2ee653b4b3e1a59ad1244ab2dd43585529b059b3..bfc79f579bf5801b1a15b1b49e1c3512d8a5cc6f 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -86,6 +86,10 @@ struct shrinker { unsigned long (*scan_objects)(struct shrinker *, struct shrink_control *sc); +#ifdef CONFIG_KIDLED + unsigned long (*cold_objects)(struct shrinker *, + struct shrink_control *sc); +#endif long batch; /* reclaim batch size, 0 = default */ int seeks; /* seeks to recreate an obj */ unsigned flags; diff --git a/include/linux/swap.h b/include/linux/swap.h index b19ff4234661672d5fd6770f7fb0c55a6ec5cc2b..0cf9dafde89b531bc7edca5a9922e729b48042c9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -437,6 +437,10 @@ extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem, pg_data_t *pgdat, unsigned long *nr_scanned); extern unsigned long shrink_all_memory(unsigned long nr_pages); +#ifdef CONFIG_KIDLED +extern void kidled_scan_slab(int nid, struct mem_cgroup *memcg, + struct kidled_scan_control scan_control); +#endif extern int vm_swappiness; long remove_mapping(struct address_space *mapping, struct folio *folio); diff --git a/mm/kidled.c b/mm/kidled.c index a1db033f2bb98efff465cb27b2c4587557f26932..a7de9b18e4f1c9b93b4767417043c16636178984 100644 --- a/mm/kidled.c +++ b/mm/kidled.c @@ -11,6 +11,9 @@ #include #include #include +#include +#include "slab.h" +#include #include #include @@ -67,7 +70,8 @@ DEFINE_STATIC_KEY_FALSE(kidled_enabled_key); -struct kidled_scan_period kidled_scan_period; +unsigned int kidled_scan_target __read_mostly = 
KIDLED_SCAN_PAGE; +struct kidled_scan_control kidled_scan_control; /* * These bucket values are copied from Michel Lespinasse's patch, they are * the default buckets to do histogram sampling. @@ -79,7 +83,8 @@ struct kidled_scan_period kidled_scan_period; const int kidled_default_buckets[NUM_KIDLED_BUCKETS] = { 1, 2, 5, 15, 30, 60, 120, 240 }; static DECLARE_WAIT_QUEUE_HEAD(kidled_wait); -static unsigned long kidled_scan_rounds __read_mostly; +static DEFINE_STATIC_KEY_FALSE(kidled_slab_key); +unsigned long kidled_scan_rounds __read_mostly; static inline int kidled_get_bucket(int *idle_buckets, int age) { @@ -100,6 +105,11 @@ static inline int kidled_get_idle_type(struct folio *folio) { int idle_type = KIDLE_BASE; + if (folio_test_slab(folio)) { + idle_type |= KIDLE_SLAB; + goto out; + } + if (folio_test_dirty(folio) || folio_test_writeback(folio)) idle_type |= KIDLE_DIRTY; if (folio_is_file_lru(folio)) @@ -113,6 +123,7 @@ static inline int kidled_get_idle_type(struct folio *folio) idle_type |= KIDLE_UNEVICT; if (folio_test_active(folio)) idle_type |= KIDLE_ACTIVE; +out: return idle_type; } @@ -154,38 +165,49 @@ EXPORT_SYMBOL_GPL(kidled_set_folio_age); #endif /* !KIDLED_AGE_NOT_IN_PAGE_FLAGS */ #ifdef CONFIG_MEMCG -static inline void kidled_mem_cgroup_account(struct folio *folio, - int age, - int nr_pages) +void kidled_mem_cgroup_account(struct folio *folio, + void *ptr, int age, unsigned long size) { struct mem_cgroup *memcg; struct idle_page_stats *stats; int type, bucket; + bool locked = false; if (mem_cgroup_disabled()) return; type = kidled_get_idle_type(folio); - - folio_memcg_lock(folio); - memcg = folio_memcg(folio); - if (unlikely(!memcg)) { - folio_memcg_unlock(folio); - return; + if (type == KIDLE_SLAB) { + if (!memcg_kmem_online()) + memcg = root_mem_cgroup; + else { + memcg = mem_cgroup_from_obj(ptr); + if (!memcg) + return; + } + } else { + folio_memcg_lock(folio); + memcg = folio_memcg(folio); + if (unlikely(!memcg)) { + folio_memcg_unlock(folio); + 
return; + } + locked = true; } stats = mem_cgroup_get_unstable_idle_stats(memcg); bucket = kidled_get_bucket(stats->buckets, age); if (bucket >= 0) - stats->count[type][bucket] += nr_pages; + stats->count[type][bucket] += size; - folio_memcg_unlock(folio); + if (locked) + folio_memcg_unlock(folio); } void kidled_mem_cgroup_move_stats(struct mem_cgroup *from, struct mem_cgroup *to, struct folio *folio, - unsigned int nr_pages) + unsigned long size) { pg_data_t *pgdat = folio_pgdat(folio); unsigned long pfn = folio_pfn(folio); @@ -220,13 +242,13 @@ void kidled_mem_cgroup_move_stats(struct mem_cgroup *from, return; /* Remove from the source memory cgroup */ - if (stats[0]->count[type][bucket] > nr_pages) - stats[0]->count[type][bucket] -= nr_pages; + if (stats[0]->count[type][bucket] > size) + stats[0]->count[type][bucket] -= size; else stats[0]->count[type][bucket] = 0; if (pgdat->node_idle_scan_pfn >= pfn) { - if (stats[1]->count[type][bucket] > nr_pages) - stats[1]->count[type][bucket] -= nr_pages; + if (stats[1]->count[type][bucket] > size) + stats[1]->count[type][bucket] -= size; else stats[1]->count[type][bucket] = 0; } @@ -239,16 +261,18 @@ void kidled_mem_cgroup_move_stats(struct mem_cgroup *from, if (bucket < 0) return; - stats[2]->count[type][bucket] += nr_pages; + stats[2]->count[type][bucket] += size; if (pgdat->node_idle_scan_pfn >= pfn) - stats[3]->count[type][bucket] += nr_pages; + stats[3]->count[type][bucket] += size; } EXPORT_SYMBOL_GPL(kidled_mem_cgroup_move_stats); -static inline void kidled_mem_cgroup_scan_done(struct kidled_scan_period period) +static inline void +kidled_mem_cgroup_scan_done(struct kidled_scan_control scan_control) { struct mem_cgroup *memcg; struct idle_page_stats *stable_stats, *unstable_stats; + bool slab_only = false; for (memcg = mem_cgroup_iter(NULL, NULL, NULL); memcg != NULL; @@ -265,22 +289,38 @@ static inline void kidled_mem_cgroup_scan_done(struct kidled_scan_period period) */ if 
(!KIDLED_IS_BUCKET_INVALID(unstable_stats->buckets)) { mem_cgroup_idle_page_stats_switch(memcg); - memcg->idle_scans++; + if (kidled_has_page_target(&scan_control)) + memcg->idle_page_scans++; + if (kidled_has_slab_target(&scan_control) && + (memcg_kmem_online() || mem_cgroup_is_root(memcg))) + memcg->idle_slab_scans++; + + slab_only = kidled_has_slab_target_only(&scan_control); } else { memcpy(unstable_stats->buckets, stable_stats->buckets, sizeof(unstable_stats->buckets)); } - memcg->scan_period = period; + memcg->scan_control = scan_control; up_write(&memcg->idle_stats_rwsem); unstable_stats = mem_cgroup_get_unstable_idle_stats(memcg); memset(&unstable_stats->count, 0, sizeof(unstable_stats->count)); + + if (slab_only && !memcg_kmem_online()) + break; } } -static inline void kidled_mem_cgroup_reset(void) +/* + * Reset the specified statistics by scan_type when users want to + * change the scan target. For example, we should clear the slab + * statistics when we only want to scan the page and vice versa. + * Otherwise it will mislead the user about the statistics. 
+ */ +static inline void +kidled_mem_cgroup_reset(enum kidled_scan_type scan_type) { struct mem_cgroup *memcg; struct idle_page_stats *stable_stats, *unstable_stats; @@ -291,27 +331,48 @@ static inline void kidled_mem_cgroup_reset(void) down_write(&memcg->idle_stats_rwsem); stable_stats = mem_cgroup_get_stable_idle_stats(memcg); unstable_stats = mem_cgroup_get_unstable_idle_stats(memcg); - memset(&stable_stats->count, 0, sizeof(stable_stats->count)); - - memcg->idle_scans = 0; - kidled_reset_scan_period(&memcg->scan_period); - up_write(&memcg->idle_stats_rwsem); - - memset(&unstable_stats->count, 0, - sizeof(unstable_stats->count)); + if (scan_type == SCAN_TARGET_PAGE) { + int i; + + for (i = 0; i < KIDLE_NR_TYPE - 1; i++) + memset(&stable_stats->count[i], 0, + sizeof(stable_stats->count[i])); + memcg->scan_control.scan_target = kidled_scan_target; + up_write(&memcg->idle_stats_rwsem); + for (i = 0; i < KIDLE_NR_TYPE - 1; i++) + memset(&unstable_stats->count[i], 0, + sizeof(unstable_stats->count[i])); + } else if (scan_type == SCAN_TARGET_SLAB) { + memset(&stable_stats->count[KIDLE_SLAB], 0, + sizeof(stable_stats->count[KIDLE_SLAB])); + memcg->scan_control.scan_target = kidled_scan_target; + up_write(&memcg->idle_stats_rwsem); + memset(&unstable_stats->count[KIDLE_SLAB], 0, + sizeof(unstable_stats->count[KIDLE_SLAB])); + + if (!memcg_kmem_online()) + break; + } else { + memset(&stable_stats->count, 0, + sizeof(stable_stats->count)); + memcg->idle_page_scans = 0; + kidled_reset_scan_control(&memcg->scan_control); + up_write(&memcg->idle_stats_rwsem); + memset(&unstable_stats->count, 0, + sizeof(unstable_stats->count)); + } } } #else /* !CONFIG_MEMCG */ static inline void kidled_mem_cgroup_account(struct folio *folio, - int age, - int nr_pages) + void *ptr, int age, unsigned long size) { } -static inline void kidled_mem_cgroup_scan_done(struct kidled_scan_period - scan_period) +static inline void kidled_mem_cgroup_scan_done(struct kidled_scan_control + scan_control) 
{ } -static inline void kidled_mem_cgroup_reset(void) +static inline void kidled_mem_cgroup_reset(enum kidled_scan_type scan_type) { } #endif /* CONFIG_MEMCG */ @@ -424,7 +485,8 @@ static inline int kidled_scan_folio(pg_data_t *pgdat, unsigned long pfn) if (idle) { age = kidled_inc_folio_age(pgdat, pfn); if (age > 0) - kidled_mem_cgroup_account(folio, age, nr_pages); + kidled_mem_cgroup_account(folio, NULL, age, + nr_pages << PAGE_SHIFT); else age = 0; } else { @@ -442,7 +504,7 @@ static inline int kidled_scan_folio(pg_data_t *pgdat, unsigned long pfn) } static bool kidled_scan_node(pg_data_t *pgdat, - struct kidled_scan_period scan_period, + struct kidled_scan_control scan_control, unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn = start_pfn; @@ -452,6 +514,11 @@ static bool kidled_scan_node(pg_data_t *pgdat, int nr_nodes = num_online_nodes(); #endif + if (kidled_has_slab_target_only(&scan_control)) + return false; + else if (pgdat->node_idle_scan_pfn >= node_end) + return true; + #ifdef KIDLED_AGE_NOT_IN_PAGE_FLAGS if (unlikely(!pgdat->node_folio_age)) { u8 *age; @@ -468,7 +535,8 @@ static bool kidled_scan_node(pg_data_t *pgdat, while (pfn < end_pfn) { /* Restart new scanning when user updates the period */ - if (unlikely(!kidled_is_scan_period_equal(&scan_period))) + if (unlikely(!kidled_is_scan_period_equal(&scan_control) || + !kidled_has_page_target_equal(&scan_control))) break; #if !defined(CONFIG_ARCH_KEEP_MEMBLOCK) && !defined(CONFIG_MEMORY_HOTPLUG) @@ -498,7 +566,7 @@ static bool kidled_scan_node(pg_data_t *pgdat, * happen if caller executes to them. 
*/ #if defined(CONFIG_ARCH_KEEP_MEMBLOCK) || defined(CONFIG_MEMORY_HOTPLUG) -static __kidled_ref bool kidled_scan_nodes(struct kidled_scan_period scan_period, +static __kidled_ref bool kidled_scan_nodes(struct kidled_scan_control scan_control, bool restart) { int i, nid; @@ -508,7 +576,7 @@ static __kidled_ref bool kidled_scan_nodes(struct kidled_scan_period scan_period for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); unsigned long pages_to_scan = DIV_ROUND_UP(pgdat->node_present_pages, - scan_period.duration); + scan_control.duration); bool init = !restart; if (restart) @@ -544,7 +612,7 @@ static __kidled_ref bool kidled_scan_nodes(struct kidled_scan_period scan_period if ((end_pfn - start_pfn) > pages_to_scan) end_pfn = start_pfn + pages_to_scan; - scan_done &= kidled_scan_node(pgdat, scan_period, + scan_done &= kidled_scan_node(pgdat, scan_control, start_pfn, end_pfn); /* * That empirical value mainly to ensure that @@ -559,7 +627,7 @@ static __kidled_ref bool kidled_scan_nodes(struct kidled_scan_period scan_period return scan_done; } #else -static bool kidled_scan_nodes(struct kidled_scan_period scan_period, +static bool kidled_scan_nodes(struct kidled_scan_control scan_control, bool restart) { unsigned long start_pfn, end_pfn; @@ -577,8 +645,8 @@ static bool kidled_scan_nodes(struct kidled_scan_period scan_period, pgdat->node_idle_scan_pfn = pgdat->node_start_pfn; start_pfn = pgdat->node_idle_scan_pfn; end_pfn = min(start_pfn + DIV_ROUND_UP(pgdat->node_spanned_pages, - scan_period.duration), node_end); - scan_done &= kidled_scan_node(pgdat, scan_period, start_pfn, + scan_control.duration), node_end); + scan_done &= kidled_scan_node(pgdat, scan_control, start_pfn, end_pfn); } @@ -600,9 +668,33 @@ void kidled_free_folio_age(pg_data_t *pgdat) } #endif -static inline void kidled_scan_done(struct kidled_scan_period scan_period) +static inline void kidled_scan_slab_node(int nid, + struct kidled_scan_control scan_control) { - 
kidled_mem_cgroup_scan_done(scan_period); + struct mem_cgroup *memcg; + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + kidled_scan_slab(nid, memcg, scan_control); + if (!memcg_kmem_online()) + break; + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); +} + +static inline void kidled_scan_slabs(struct kidled_scan_control scan_control) +{ + int nid; + + if (!kidled_has_slab_target(&scan_control)) + return; + + for_each_online_node(nid) + kidled_scan_slab_node(nid, scan_control); +} + +static inline void kidled_scan_done(struct kidled_scan_control scan_control) +{ + kidled_mem_cgroup_scan_done(scan_control); kidled_scan_rounds++; } @@ -611,7 +703,7 @@ static void kidled_reset(bool free) { pg_data_t *pgdat; - kidled_mem_cgroup_reset(); + kidled_mem_cgroup_reset(SCAN_TARGET_ALL); get_online_mems(); @@ -636,7 +728,7 @@ static __kidled_ref void kidled_reset(void) pg_data_t *pgdat; int i, nid; - kidled_mem_cgroup_reset(); + kidled_mem_cgroup_reset(SCAN_TARGET_ALL); get_online_mems(); for_each_online_node(nid) { @@ -660,7 +752,7 @@ static void kidled_reset(void) { pg_data_t *pgdat; - kidled_mem_cgroup_reset(); + kidled_mem_cgroup_reset(SCAN_TARGET_ALL); get_online_mems(); for_each_online_pgdat(pgdat) { @@ -678,24 +770,48 @@ static void kidled_reset(void) } #endif -static inline bool kidled_should_run(struct kidled_scan_period *p, bool *new) +static inline bool kidled_should_run(struct kidled_scan_control *p, + bool *new, int *count_slab_scan) { if (unlikely(!kidled_is_scan_period_equal(p))) { - struct kidled_scan_period scan_period; + struct kidled_scan_control scan_control; - scan_period = kidled_get_current_scan_period(); + scan_control = kidled_get_current_scan_control(); if (p->duration) { #ifdef KIDLED_AGE_NOT_IN_PAGE_FLAGS - kidled_reset(!scan_period.duration); + kidled_reset(!scan_control.duration); #else kidled_reset(); #endif } - if (!scan_period.duration) + if (!scan_control.duration) static_branch_disable(&kidled_enabled_key); - *p = 
scan_period; + *p = scan_control; *new = true; + } else if (unlikely(!kidled_is_scan_target_equal(p))) { + struct kidled_scan_control scan_control; + bool page_disabled = false; + bool slab_disabled = false; + + scan_control = kidled_get_current_scan_control(); + kidled_get_reset_type(p, &page_disabled, &slab_disabled); + if (slab_disabled) { + kidled_mem_cgroup_reset(SCAN_TARGET_SLAB); + *count_slab_scan = 0; + } + if (page_disabled) + kidled_mem_cgroup_reset(SCAN_TARGET_PAGE); + + /* + * It need to restart the page scan when user enable + * the specified scan type again. + */ + if (kidled_has_slab_target_only(p)) + *new = true; + else + *new = false; + *p = scan_control; } else { *new = false; } @@ -706,38 +822,58 @@ static inline bool kidled_should_run(struct kidled_scan_period *p, bool *new) return false; } +static inline bool is_kidled_scan_done(bool scan_done, + int count_slab_scan, + struct kidled_scan_control scan_control) +{ + u16 duration = scan_control.duration; + + if (kidled_has_slab_target_only(&scan_control)) + return count_slab_scan >= duration; + else if (kidled_has_page_target_only(&scan_control)) + return scan_done; + else + return scan_done && (count_slab_scan >= duration); +} + static int kidled(void *dummy) { int busy_loop = 0; bool restart = true; - struct kidled_scan_period scan_period; + struct kidled_scan_control scan_control; + int count_slab_scan = 0; - kidled_reset_scan_period(&scan_period); + kidled_reset_scan_control(&scan_control); while (!kthread_should_stop()) { u64 start_jiffies, elapsed; bool new, scan_done = true; wait_event_interruptible(kidled_wait, - kidled_should_run(&scan_period, &new)); + kidled_should_run(&scan_control, + &new, &count_slab_scan)); if (unlikely(new)) { restart = true; busy_loop = 0; } - if (unlikely(scan_period.duration == 0)) + if (unlikely(scan_control.duration == 0)) continue; start_jiffies = jiffies_64; get_online_mems(); - scan_done = kidled_scan_nodes(scan_period, restart); + scan_done = 
kidled_scan_nodes(scan_control, restart); put_online_mems(); - if (scan_done) { - kidled_scan_done(scan_period); + kidled_scan_slabs(scan_control); + if (is_kidled_scan_done(scan_done, + count_slab_scan + 1, scan_control)) { + kidled_scan_done(scan_control); restart = true; + count_slab_scan = 0; } else { restart = false; + count_slab_scan++; } /* @@ -747,7 +883,7 @@ static int kidled(void *dummy) * neighbors (e.g. cause spike latency). * * We hope kidled can scan specified pages which depends on - * scan_period in each slice, and supposed to finish each + * scan_control in each slice, and supposed to finish each * slice in one second: * * pages_to_scan = total_pages / scan_duration @@ -760,7 +896,13 @@ * * We thought it's busy when elapsed >= (HZ / 2), and if keep * busy for several consecutive times, we'll scale up the - * scan duration. + * scan duration, except in one case: when slab scan is enabled. + * It's acceptable that the cpu load is very high for a while + * and we cannot scale up the scan duration in that case. + * Otherwise it will take a lot of time to scan a round. + * + * Because kidled runs at the lowest priority, it can be + * scheduled away easily when other tasks want to run on this cpu. * * NOTE it's a simple guard, not a promise.
*/ @@ -772,7 +914,7 @@ static int kidled(void *dummy) schedule_timeout_interruptible(HZ - elapsed); } else if (++busy_loop == KIDLED_BUSY_LOOP_THRESHOLD) { busy_loop = 0; - if (kidled_try_double_scan_period(scan_period)) { + if (kidled_try_double_scan_control(scan_control)) { pr_warn_ratelimited("%s: period -> %u\n", __func__, kidled_get_current_scan_duration()); @@ -786,6 +928,147 @@ static int kidled(void *dummy) return 0; } +static inline bool kidled_allow_scan_slab(void) +{ + struct kidled_scan_control scan_control = + kidled_get_current_scan_control(); + + if (!scan_control.duration) + return false; + + if (!kidled_has_slab_target(&scan_control)) + return false; + + return true; +} + +static inline void kidled_slab_scan_enabled(void) +{ + if (!static_key_enabled(&kidled_slab_key)) { + if (kidled_allow_scan_slab()) + static_branch_enable(&kidled_slab_key); + } else { + if (!kidled_allow_scan_slab()) + static_branch_disable(&kidled_slab_key); + } +} + +static inline unsigned short *kidled_slab_age(struct slab *slab) +{ + return (unsigned short *)((unsigned long)slab->memcg_data & ~MEMCG_DATA_SLAB_AGE); +} + +bool page_has_slab_age(struct slab *slab) +{ + return (((unsigned long)slab->memcg_data & MEMCG_DATA_FLAGS_MASK) == MEMCG_DATA_SLAB_AGE); +} + +static unsigned short *kidled_get_slab_age_array(void *object) +{ + struct slab *slab = virt_to_slab(object); + unsigned int objects = objs_per_slab(slab->slab_cache, slab); + unsigned short *slab_age = NULL; + + if (!kidled_available_slab(slab_folio(slab), slab->slab_cache)) + goto out; + + if (!cgroup_memory_nokmem) { + /* In case fail to allocate memory for cold slab */ + if (likely(slab_objcgs(slab))) + slab_age = (unsigned short *)slab_objcgs(slab)[objects]; + } else + slab_age = kidled_slab_age(slab); + +out: + return slab_age; +} + +unsigned short kidled_get_slab_age(void *object) +{ + unsigned short *slab_age; + struct slab *slab; + unsigned int off; + + if (!static_branch_unlikely(&kidled_slab_key)) + return 
0; + + slab_age = kidled_get_slab_age_array(object); + if (!slab_age) + return 0; + + slab = virt_to_slab(object); + off = obj_to_index(slab->slab_cache, slab, object); + + return *(slab_age + off); +} + +void kidled_set_slab_age(void *object, unsigned short age) +{ + unsigned short *slab_age; + struct slab *slab; + unsigned int off; + + if (!static_branch_unlikely(&kidled_slab_key)) + return; + + slab_age = kidled_get_slab_age_array(object); + if (!slab_age) + return; + + slab = virt_to_slab(object); + off = obj_to_index(slab->slab_cache, slab, object); + + *(slab_age + off) = age; +} + +/* + * When kmem accounting is enabled, each slab object points to a memcg, + * and a slab page can be used by the root mem_cgroup and children memcgs. + * When kmem accounting is disabled, the slab object age is recorded in + * the page's slab_age; otherwise a special obj_cgroups pointer stores it. + */ +#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT) +int kidled_alloc_slab_age(struct slab *slab, struct kmem_cache *s, gfp_t flags) +{ + unsigned int objects = objs_per_slab(s, slab); + void *ver; + int ret; + + if (!kidled_available_slab(slab_folio(slab), s)) + return 0; + + /* avoid charging this memory to kmem accounting when kmem is enabled */ + flags &= ~OBJCGS_CLEAR_MASK; + ver = kzalloc_node(objects * sizeof(unsigned short), flags, slab_nid(slab)); + if (!ver) + return -ENOMEM; + + if (!cgroup_memory_nokmem) { + if (!slab_objcgs(slab)) { + ret = memcg_alloc_slab_cgroups(slab, s, flags, true); + + if (!ret) + slab_objcgs(slab)[objects] = ver; + else { + kfree(ver); + return -ENOMEM; + } + } else { + slab_objcgs(slab)[objects] = ver; + } + return 0; + } + + slab->memcg_data = ((unsigned long)ver | MEMCG_DATA_SLAB_AGE); + return 0; +} + +void kidled_free_slab_age(struct slab *slab) +{ + kfree(kidled_slab_age(slab)); + slab->memcg_data = 0; +} + static ssize_t kidled_scan_period_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -823,6 +1106,30 @@
static ssize_t kidled_scan_period_store(struct kobject *kobj, kidled_set_scan_duration(secs); wake_up_interruptible(&kidled_wait); + kidled_slab_scan_enabled(); + return count; +} + +static ssize_t kidled_scan_target_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", kidled_scan_target); +} + +static ssize_t kidled_scan_target_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int ret; + unsigned int val; + + ret = kstrtouint(buf, 10, &val); + if (ret || !val || val > KIDLED_SCAN_ALL) + return -EINVAL; + + WRITE_ONCE(kidled_scan_target, val); + kidled_slab_scan_enabled(); return count; } @@ -830,8 +1137,13 @@ static struct kobj_attribute kidled_scan_period_attr = __ATTR(scan_period_in_seconds, 0644, kidled_scan_period_show, kidled_scan_period_store); +static struct kobj_attribute kidled_scan_target_attr = + __ATTR(scan_target, 0644, + kidled_scan_target_show, kidled_scan_target_store); + static struct attribute *kidled_attrs[] = { &kidled_scan_period_attr.attr, + &kidled_scan_target_attr.attr, NULL }; static struct attribute_group kidled_attr_group = { diff --git a/mm/list_lru.c b/mm/list_lru.c index a05e5bef3b4007a1c5f7b4fa13d96c24e997c971..3c2d3ce29f65f669653692b306a23e5a9e3c2800 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -211,6 +211,10 @@ __list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx, struct list_lru_one *l; struct list_head *item, *n; unsigned long isolated = 0; +#ifdef CONFIG_KIDLED + bool kidled_slab_scan = false; + LIST_HEAD(head_temp); +#endif restart: l = list_lru_from_memcg_idx(lru, nid, memcg_idx); @@ -247,6 +251,12 @@ __list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx, case LRU_ROTATE: list_move_tail(item, &l->list); break; +#ifdef CONFIG_KIDLED + case LRU_ROTATE_DELAY: + if (unlikely(!kidled_slab_scan)) + kidled_slab_scan = true; + /* fall through */ +#endif case LRU_SKIP: break; case LRU_RETRY: @@ -260,6 +270,17 @@ 
__list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx, BUG(); } } +#ifdef CONFIG_KIDLED + if (kidled_slab_scan) { + struct list_head *head = &l->list; + struct list_head *entry = item->prev; + + if (item != head) { + list_cut_position(&head_temp, head, entry); + list_splice_tail(&head_temp, head); + } + } +#endif out: return isolated; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6bdeda54c0ae9cd4de178fa69bc06234689d3503..8db51adc57769de8b6b13375fc84adec1f338b6b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -97,7 +97,7 @@ EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg); static bool cgroup_memory_nosocket __ro_after_init; /* Kernel memory accounting disabled? */ -static bool cgroup_memory_nokmem __ro_after_init; +bool cgroup_memory_nokmem __ro_after_init; #ifdef CONFIG_MEMSLI /* Cgroup memory SLI disabled? */ @@ -3226,6 +3226,10 @@ int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s, unsigned long memcg_data; void *vec; + + /* allocate an extra special pointer for the cold slab */ + if (kidled_available_slab(slab_folio(slab), s)) + objects += 1; + gfp &= ~OBJCGS_CLEAR_MASK; vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp, slab_nid(slab)); @@ -3262,7 +3266,7 @@ struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p) * Memcg membership data for each individual object is saved in * slab->memcg_data.
*/ - if (folio_test_slab(folio)) { + if (folio_test_slab(folio) && !page_has_slab_age(folio_slab(folio))) { struct obj_cgroup **objcgs; struct slab *slab; unsigned int off; @@ -4329,9 +4333,9 @@ static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, static int mem_cgroup_idle_page_stats_show(struct seq_file *m, void *v) { struct mem_cgroup *iter, *memcg = mem_cgroup_from_css(seq_css(m)); - struct kidled_scan_period scan_period, period; + struct kidled_scan_control scan_control; struct idle_page_stats *stats, *cache; - unsigned long scans; + unsigned long page_scans, slab_scans; bool has_hierarchy = !!seq_cft(m)->private; bool no_buckets = false; int i, j, t; @@ -4343,41 +4347,72 @@ static int mem_cgroup_idle_page_stats_show(struct seq_file *m, void *v) down_read(&memcg->idle_stats_rwsem); *stats = memcg->idle_stats[memcg->idle_stable_idx]; - scans = memcg->idle_scans; - scan_period = memcg->scan_period; + page_scans = memcg->idle_page_scans; + slab_scans = memcg->idle_slab_scans; + scan_control = memcg->scan_control; up_read(&memcg->idle_stats_rwsem); /* Nothing will be outputed with invalid buckets */ if (KIDLED_IS_BUCKET_INVALID(stats->buckets)) { no_buckets = true; - scans = 0; + page_scans = 0; + slab_scans = 0; goto output; } /* Zeroes will be output with mismatched scan period */ - if (!kidled_is_scan_period_equal(&scan_period)) { + if (!kidled_is_scan_period_equal(&scan_control)) { memset(&stats->count, 0, sizeof(stats->count)); - scan_period = kidled_get_current_scan_period(); - scans = 0; + scan_control = kidled_get_current_scan_control(); + page_scans = 0; + slab_scans = 0; goto output; } + /* Zeroes will be output with mismatched scan type */ + if (!kidled_is_scan_target_equal(&scan_control)) { + bool page_disabled = false; + bool slab_disabled = false; + + kidled_get_reset_type(&scan_control, &page_disabled, &slab_disabled); + if (slab_disabled) { + memset(&stats->count[KIDLE_SLAB], 0, + sizeof(stats->count[KIDLE_SLAB])); + slab_scans = 
0; + } + if (page_disabled) { + int i; + + for (i = 0; i < KIDLE_NR_TYPE - 1; i++) { + memset(&stats->count[i], 0, sizeof(stats->count[i])); + page_scans = 0; + } + } + } else { + if (kidled_has_slab_target_only(&scan_control) && page_scans != 0) + page_scans = 0; + if (kidled_has_page_target_only(&scan_control) && slab_scans != 0) + slab_scans = 0; + } + if (has_hierarchy) { for_each_mem_cgroup_tree(iter, memcg) { + struct kidled_scan_control scan_control; + /* The root memcg was just accounted */ if (iter == memcg) continue; down_read(&iter->idle_stats_rwsem); *cache = iter->idle_stats[iter->idle_stable_idx]; - period = memcg->scan_period; + scan_control = memcg->scan_control; up_read(&iter->idle_stats_rwsem); /* * Skip to account if the scan period is mismatched * or buckets are invalid. */ - if (!kidled_is_scan_period_equal(&period) || + if (!kidled_is_scan_period_equal(&scan_control) || KIDLED_IS_BUCKET_INVALID(cache->buckets)) continue; @@ -4406,8 +4441,9 @@ static int mem_cgroup_idle_page_stats_show(struct seq_file *m, void *v) output: seq_printf(m, "# version: %s\n", KIDLED_VERSION); - seq_printf(m, "# scans: %lu\n", scans); - seq_printf(m, "# scan_period_in_seconds: %u\n", scan_period.duration); + seq_printf(m, "# page_scans: %lu\n", page_scans); + seq_printf(m, "# slab_scans: %lu\n", slab_scans); + seq_printf(m, "# scan_period_in_seconds: %u\n", scan_control.duration); seq_puts(m, "# buckets: "); if (no_buckets) { seq_puts(m, "no valid bucket available\n"); @@ -4431,9 +4467,10 @@ static int mem_cgroup_idle_page_stats_show(struct seq_file *m, void *v) seq_puts(m, "# / _----=> swap/file\n"); seq_puts(m, "# | / _---=> evict/unevict\n"); seq_puts(m, "# || / _--=> inactive/active\n"); - seq_puts(m, "# ||| /\n"); + seq_puts(m, "# ||| / _-=> slab\n"); + seq_puts(m, "# |||| /\n"); - seq_printf(m, "# %-8s", "||||"); + seq_printf(m, "# %-8s", "|||||"); for (i = 0; i < j; i++) { char region[20]; @@ -4453,16 +4490,19 @@ static int 
mem_cgroup_idle_page_stats_show(struct seq_file *m, void *v) for (t = 0; t < KIDLE_NR_TYPE; t++) { char kidled_type_str[5]; - kidled_type_str[0] = t & KIDLE_DIRTY ? 'd' : 'c'; - kidled_type_str[1] = t & KIDLE_FILE ? 'f' : 's'; - kidled_type_str[2] = t & KIDLE_UNEVICT ? 'u' : 'e'; - kidled_type_str[3] = t & KIDLE_ACTIVE ? 'a' : 'i'; - kidled_type_str[4] = '\0'; + if (t & KIDLE_SLAB) + memcpy(kidled_type_str, "slab", 5); + else { + kidled_type_str[0] = t & KIDLE_DIRTY ? 'd' : 'c'; + kidled_type_str[1] = t & KIDLE_FILE ? 'f' : 's'; + kidled_type_str[2] = t & KIDLE_UNEVICT ? 'u' : 'e'; + kidled_type_str[3] = t & KIDLE_ACTIVE ? 'a' : 'i'; + kidled_type_str[4] = '\0'; + } seq_printf(m, " %-8s", kidled_type_str); for (i = 0; i < j; i++) { - seq_printf(m, " %14lu", - stats->count[t][i] << PAGE_SHIFT); + seq_printf(m, " %14lu", stats->count[t][i]); } seq_puts(m, "\n"); @@ -4526,7 +4566,8 @@ static ssize_t mem_cgroup_idle_page_stats_write(struct kernfs_open_file *of, * holding any read side locks. 
*/ KIDLED_MARK_BUCKET_INVALID(unstable_stats->buckets); - memcg->idle_scans = 0; + memcg->idle_page_scans = 0; + memcg->idle_slab_scans = 0; up_write(&memcg->idle_stats_rwsem); return nbytes; @@ -7733,7 +7774,7 @@ static int mem_cgroup_move_account(struct page *page, ret = 0; nid = folio_nid(folio); - kidled_mem_cgroup_move_stats(from, to, folio, nr_pages); + kidled_mem_cgroup_move_stats(from, to, folio, nr_pages << PAGE_SHIFT); local_irq_disable(); mem_cgroup_charge_statistics(to, nr_pages); diff --git a/mm/shrinker.c b/mm/shrinker.c index ea6b5289e0731e3194be18e54ae52cc35b86b0d8..45d36086e44c85c605cd954efa274d24c53d9a28 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -381,6 +381,129 @@ static long add_nr_deferred(long nr, struct shrinker *shrinker, #define SHRINK_BATCH 128 +#ifdef CONFIG_KIDLED +static void kidled_scan_slab_common(struct shrinker *shrinker, + struct shrink_control *sc, + struct kidled_scan_control scan_control) +{ + long batch_size = shrinker->batch ?: SHRINK_BATCH; + long freeable, nr_free; + + if (!shrinker->cold_objects) + return; + freeable = shrinker->count_objects(shrinker, sc); + if (freeable == 0 || freeable == SHRINK_EMPTY) + return; + + nr_free = DIV_ROUND_UP(freeable, scan_control.duration); + while (nr_free > 0) { + unsigned long nr_scanned; + + sc->nr_to_scan = min(nr_free, batch_size); + nr_scanned = shrinker->cold_objects(shrinker, sc); + if (nr_scanned == SHRINK_STOP) + break; + nr_free -= nr_scanned; + cond_resched(); + + if (unlikely(!kidled_is_scan_period_equal(&scan_control) || + !kidled_has_slab_target_equal(&scan_control))) + break; + } +} + +#ifdef CONFIG_MEMCG +static void kidled_scan_slab_memcg(int nid, struct mem_cgroup *memcg, + struct kidled_scan_control scan_control) +{ + struct shrinker_info *info; + int offset, index = 0; + + if (!mem_cgroup_online(memcg)) + return; + + rcu_read_lock(); + info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); + if (unlikely(!info)) + goto out; + +again: + if (index < 
shrinker_id_to_index(info->map_nr_max)) { + struct shrinker_info_unit *unit; + + unit = info->unit[index]; + + rcu_read_unlock(); + + for_each_set_bit(offset, unit->map, SHRINKER_UNIT_BITS) { + struct shrink_control sc = { + .gfp_mask = GFP_KERNEL, + .nid = nid, + .memcg = memcg, + }; + struct shrinker *shrinker; + int shrinker_id = calc_shrinker_id(index, offset); + + rcu_read_lock(); + shrinker = idr_find(&shrinker_idr, shrinker_id); + if (unlikely(!shrinker || !shrinker_try_get(shrinker))) { + clear_bit(offset, unit->map); + rcu_read_unlock(); + continue; + } + rcu_read_unlock(); + + /* Call non-slab shrinkers even though kmem is disabled */ + if (!memcg_kmem_online() && + !(shrinker->flags & SHRINKER_NONSLAB)) + continue; + + kidled_scan_slab_common(shrinker, &sc, scan_control); + shrinker_put(shrinker); + } + + index++; + goto again; + } +out: + rcu_read_unlock(); +} +#else /* !CONFIG_MEMCG */ +static void kidled_scan_slab_memcg(int nid, struct mem_cgroup *memcg, + struct kidled_scan_control scan_control) +{ +} +#endif /* CONFIG_MEMCG */ + +void kidled_scan_slab(int nid, struct mem_cgroup *memcg, + struct kidled_scan_control scan_control) +{ + struct shrinker *shrinker; + + if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg)) + return kidled_scan_slab_memcg(nid, memcg, scan_control); + + rcu_read_lock(); + list_for_each_entry(shrinker, &shrinker_list, list) { + struct shrink_control sc = { + .gfp_mask = GFP_KERNEL, + .nid = nid, + .memcg = memcg, + }; + if (!shrinker_try_get(shrinker)) + continue; + + rcu_read_unlock(); + + kidled_scan_slab_common(shrinker, &sc, scan_control); + rcu_read_lock(); + shrinker_put(shrinker); + } + rcu_read_unlock(); + cond_resched(); +} +#endif + static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, struct shrinker *shrinker, int priority) { diff --git a/mm/slab.h b/mm/slab.h index 319a2a35aeab2b16ff7079d539dbd5f3b6f096ec..350d35ffa37d6411b1578c0bafac1637612bb498 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ 
-101,7 +101,7 @@ struct slab { #endif atomic_t __page_refcount; -#ifdef CONFIG_MEMCG +#if defined(CONFIG_MEMCG) || defined(CONFIG_KIDLED) unsigned long memcg_data; #endif }; @@ -402,6 +402,38 @@ static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s) NR_SLAB_RECLAIMABLE_B : NR_SLAB_UNRECLAIMABLE_B; } +#ifdef CONFIG_KIDLED +static inline bool kidled_available_slab(struct folio *folio, struct kmem_cache *s) +{ +#ifdef CONFIG_KFENCE + /* Do not monitor kfence memory. */ + if (unlikely(PageKfence(&folio->page))) + return false; +#endif + if (!strcmp(s->name, "inode_cache") || + !strcmp(s->name, "ext4_inode_cache") || + !strcmp(s->name, "dentry")) + return true; + return false; +} + +/* cold slab will need the special condition */ +static inline bool kidled_kmem_enabled(void) +{ + return !cgroup_memory_nokmem; +} +#else +static inline bool kidled_available_slab(struct folio *folio, struct kmem_cache *s) +{ + return false; +} + +static inline bool kidled_kmem_enabled(void) +{ + return memcg_kmem_online(); +} +#endif + #ifdef CONFIG_SLUB_DEBUG #ifdef CONFIG_SLUB_DEBUG_ON DECLARE_STATIC_KEY_TRUE(slub_debug_enabled); @@ -452,7 +484,8 @@ static inline struct obj_cgroup **slab_objcgs(struct slab *slab) VM_BUG_ON_PAGE(memcg_data && !(memcg_data & MEMCG_DATA_OBJCGS), slab_page(slab)); - VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, slab_page(slab)); + VM_BUG_ON_PAGE((memcg_data & MEMCG_DATA_FLAGS_MASK) != MEMCG_DATA_KMEM, + slab_page(slab)); return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); } @@ -462,8 +495,16 @@ int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s, void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, enum node_stat_item idx, int nr); -static inline void memcg_free_slab_cgroups(struct slab *slab) +static inline void memcg_free_slab_cgroups(struct slab *slab, struct kmem_cache *s) { + unsigned int objects = objs_per_slab(s, slab); + + if (kidled_available_slab(slab_folio(slab), s)) { + /* In 
case fail to allocate memory for cold slab */ + if (likely(slab_objcgs(slab))) + kfree(slab_objcgs(slab)[objects]); + } + kfree(slab_objcgs(slab)); slab->memcg_data = 0; } @@ -602,7 +643,7 @@ static inline int memcg_alloc_slab_cgroups(struct slab *slab, return 0; } -static inline void memcg_free_slab_cgroups(struct slab *slab) +static inline void memcg_free_slab_cgroups(struct slab *slab, struct kmem_cache *s) { } @@ -651,8 +692,12 @@ static __always_inline void account_slab(struct slab *slab, int order, static __always_inline void unaccount_slab(struct slab *slab, int order, struct kmem_cache *s) { - if (memcg_kmem_online()) - memcg_free_slab_cgroups(slab); + if (kidled_kmem_enabled()) + memcg_free_slab_cgroups(slab, s); + else { + if (page_has_slab_age(slab)) + kidled_free_slab_age(slab); + } mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), -(PAGE_SIZE << order)); diff --git a/mm/slub.c b/mm/slub.c index 30a1f3fb88c6e9138387c58590d84666fcad49f1..df3d4fc61335397b20f083b0cfa03ace05d5b9f8 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2084,6 +2084,8 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) } set_freepointer(s, p, NULL); } + if (unlikely(kidled_alloc_slab_age(slab, s, alloc_gfp))) + pr_warn("Fails to trace %s:%p cold slab distribution.\n", s->name, slab); return slab; }