diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index a5e84700b319e56e4314ab3f20b5a1d75c1d4b24..2ce5ddb7a043a4d2cb9a172d5c0749682f86389d 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -61,6 +61,7 @@ #include #include #include +#include #include #include @@ -681,8 +682,10 @@ void set_cpu_sibling_map(int cpu) if (match_pkg(c, o) && !topology_same_node(c, o)) x86_has_numa_in_package = true; - if ((i == cpu) || (has_smt && match_smt(c, o))) + if ((i == cpu) || (has_smt && match_smt(c, o))) { link_mask(topology_sibling_cpumask, cpu, i); + cgroup_ifs_set_smt(topology_sibling_cpumask(cpu)); + } if ((i == cpu) || (has_mp && match_llc(c, o))) link_mask(cpu_llc_shared_mask, cpu, i); diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 1525d3e2f9d50d17f895c1845dbb9cc2d62e19cc..a26169094712b8cbe091a9031efa53be4094a387 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -807,6 +807,8 @@ void update_siblings_masks(unsigned int cpuid) cpumask_set_cpu(cpuid, &cpu_topo->thread_sibling); cpumask_set_cpu(cpu, &cpuid_topo->thread_sibling); } + + cgroup_ifs_set_smt(&cpuid_topo->thread_sibling); } static void clear_cpu_topology(int cpu) @@ -823,6 +825,8 @@ static void clear_cpu_topology(int cpu) cpumask_set_cpu(cpu, &cpu_topo->core_sibling); cpumask_clear(&cpu_topo->thread_sibling); cpumask_set_cpu(cpu, &cpu_topo->thread_sibling); + + cgroup_ifs_set_smt(&cpu_topo->thread_sibling); } void __init reset_cpu_topology(void) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index f3fd0407d346f34996741aef5afc19bbaf517e6d..47372df6abbe18b82ed068fa05d327590f169d2b 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -552,7 +552,11 @@ struct cgroup { struct bpf_local_storage __rcu *bpf_cgrp_storage; #endif +#ifdef CONFIG_CGROUP_IFS + KABI_USE(1, struct cgroup_ifs *ifs) +#else KABI_RESERVE(1) +#endif KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 62cea15eb6df942168d2f453a818a49d9dce8cb4..d6f43bca5ecb58e22b0a88817db6670ab889ca49 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -860,4 +860,193 @@ static inline void cgroup_bpf_put(struct cgroup *cgrp) {} void cgroup_move_task_to_root(struct task_struct *tsk); #endif +#ifdef CONFIG_CGROUP_IFS + +enum ifs_types { + IFS_SMT, + IFS_RUNDELAY, + IFS_WAKELAT, + IFS_THROTTLE, +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + IFS_SOFTIRQ, + IFS_HARDIRQ, +#endif +#ifdef CONFIG_SCHEDSTATS + IFS_SLEEP, +#endif + NR_IFS_TYPES, +}; + +struct cgroup_ifs_cpu { + /* total time for each interference, in ns */ + u64 time[NR_IFS_TYPES]; +}; + +/* + * cgroup interference statistics + */ +struct cgroup_ifs { + /* per-cpu interference statistics tracking */ + struct cgroup_ifs_cpu __percpu *pcpu; +}; + +extern struct cgroup_ifs cgroup_root_ifs; + +DECLARE_STATIC_KEY_FALSE(cgrp_ifs_enabled); +static inline bool cgroup_ifs_enabled(void) +{ + return static_branch_unlikely(&cgrp_ifs_enabled); +} + +static inline struct cgroup_ifs *cgroup_ifs(struct cgroup *cgrp) +{ + return cgroup_ino(cgrp) == 1 ? 
&cgroup_root_ifs : cgrp->ifs; +} + +static inline struct cgroup_ifs *task_ifs(struct task_struct *task) +{ + return cgroup_ifs(task_dfl_cgroup(task)); +} + +static inline struct cgroup_ifs *current_ifs(void) +{ + return task_ifs(current); +} + +static inline void cgroup_ifs_account_delta(struct cgroup_ifs_cpu *ifsc, + int type, u64 delta) +{ + if (!cgroup_ifs_enabled()) + return; + + if (delta > 0) + ifsc->time[type] += delta; +} + +void cgroup_ifs_account_smttime(struct task_struct *prev, + struct task_struct *next, + struct task_struct *idle); +void cgroup_ifs_set_smt(cpumask_t *sibling); + +static inline void cgroup_ifs_account_rundelay(struct task_struct *task, + u64 delta) +{ + struct cgroup_ifs *ifs; + + if (!cgroup_ifs_enabled()) + return; + + ifs = task_ifs(task); + if (!ifs) + return; + + cgroup_ifs_account_delta(this_cpu_ptr(ifs->pcpu), IFS_RUNDELAY, delta); +} + +static inline void cgroup_ifs_account_wakelat(struct task_struct *task, + u64 delta) +{ + struct cgroup_ifs *ifs; + + if (!cgroup_ifs_enabled()) + return; + + ifs = task_ifs(task); + if (!ifs) + return; + + cgroup_ifs_account_delta(this_cpu_ptr(ifs->pcpu), IFS_WAKELAT, delta); +} + +static inline void cgroup_ifs_account_throttle(struct cgroup *cgrp, int cpu, + u64 delta) +{ + struct cgroup_ifs *ifs; + struct cgroup_ifs_cpu *ifsc; + + if (!cgroup_ifs_enabled()) + return; + + ifs = cgroup_ifs(cgrp); + if (!ifs) + return; + + ifsc = per_cpu_ptr(ifs->pcpu, cpu); /* XXX: set another cpu data ? */ + cgroup_ifs_account_delta(ifsc, IFS_THROTTLE, delta); +} + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +static inline void cgroup_ifs_account_softirq(u64 delta) +{ + struct cgroup_ifs *ifs; + struct cgroup_ifs_cpu *ifsc; + + if (!cgroup_ifs_enabled()) + return; + + ifs = current_ifs(); + if (!ifs) + return; + + ifsc = this_cpu_ptr(ifs->pcpu); + cgroup_ifs_account_delta(ifsc, IFS_SOFTIRQ, delta); +} + +static inline void cgroup_ifs_account_hardirq(u64 delta) +{ + struct cgroup_ifs *ifs; + struct cgroup_ifs_cpu *ifsc; + + if (!cgroup_ifs_enabled()) + return; + + ifs = current_ifs(); + if (!ifs) + return; + + ifsc = this_cpu_ptr(ifs->pcpu); + cgroup_ifs_account_delta(ifsc, IFS_HARDIRQ, delta); +} + +void cgroup_ifs_enable_irq_account(void); +#endif + +#ifdef CONFIG_SCHEDSTATS +static inline void cgroup_ifs_account_sleep(struct task_struct *task, + u64 delta) +{ + struct cgroup_ifs *ifs; + + if (!cgroup_ifs_enabled()) + return; + + ifs = task_ifs(task); + if (!ifs) + return; + + cgroup_ifs_account_delta(this_cpu_ptr(ifs->pcpu), IFS_SLEEP, delta); +} + +void cgroup_ifs_enable_sleep_account(void); +#endif + +#else /* !CONFIG_CGROUP_IFS */ +static inline void cgroup_ifs_account_smttime(struct task_struct *prev, + struct task_struct *next, + struct task_struct *idle) {} +static inline void cgroup_ifs_set_smt(cpumask_t *sibling) {} +static inline void cgroup_ifs_account_rundelay(struct task_struct *task, u64 delta) {} +static inline void cgroup_ifs_account_wakelat(struct task_struct *task, u64 delta) {} +static inline void cgroup_ifs_account_throttle(struct cgroup *cgrp, int cpu, u64 delta) {} +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +static inline void cgroup_ifs_account_softirq(u64 delta) {} +static inline void cgroup_ifs_account_hardirq(u64 delta) {} +static inline void cgroup_ifs_enable_irq_account(void) {} +#endif +#ifdef CONFIG_SCHEDSTATS +static inline void cgroup_ifs_account_sleep(struct task_struct *task, u64 delta) {} +static inline void cgroup_ifs_enable_sleep_account(void) {} +#endif +#endif /* CONFIG_CGROUP_IFS */ + #endif /* _LINUX_CGROUP_H 
*/ diff --git a/include/linux/sched.h b/include/linux/sched.h index a694cc11dea51ad78a0cc08b825c38870bded37c..3979c34e9b83d09343468dd5fa406de289098875 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -390,7 +390,12 @@ struct sched_info { /* When were we last queued to run? */ unsigned long long last_queued; +#ifdef CONFIG_CGROUP_IFS + /* When did we last start waking to run? */ + KABI_USE(1, unsigned long long last_waking) +#else KABI_RESERVE(1) +#endif KABI_RESERVE(2) #endif /* CONFIG_SCHED_INFO */ }; @@ -994,6 +999,10 @@ struct task_struct { /* delay due to memory thrashing */ unsigned in_thrashing:1; #endif +#ifdef CONFIG_CGROUP_IFS + /* Run delayed due to bandwidth throttling */ + KABI_FILL_HOLE(unsigned in_throttle:1) +#endif unsigned long atomic_flags; /* Flags requiring atomic access. */ diff --git a/init/Kconfig b/init/Kconfig index 22d9ac8ca08fadf435599b48515d3485e5ac76d5..e560adcd08c10e5d2801d42f7f08d767af05bf50 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1341,6 +1341,29 @@ config CGROUP_FILES This supports catching misbehaving processes and return EMFILE instead of ENOMEM for kernel memory limits. +config CGROUP_IFS + bool "Cgroup-based Interference Statistics" + default n + select KERNFS + select IRQ_TIME_ACCOUNTING + help + This option provides low-overhead, per-cgroup interference + statistics at runtime, which helps in troubleshooting + performance issues. + + Say N if unsure. + +config CGROUP_IFS_DEFAULT_ENABLED + bool "Enable Cgroup-based Interference Statistics by default" + default y + depends on CGROUP_IFS + help + This option enables cgroup-based interference statistics + by default. If this option is N, the statistics must be enabled + at boot time via the cgroup_ifs= kernel parameter. + + Say Y if unsure.
+ endif # CGROUPS menuconfig NAMESPACES diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index 12f8457ad1f90f8dfa5bdd87b8ff8237f63bc0a2..8d0cc493fb23e1253b77d6deddc80d7bfaa2ce50 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -7,3 +7,4 @@ obj-$(CONFIG_CGROUP_RDMA) += rdma.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_CGROUP_MISC) += misc.o obj-$(CONFIG_CGROUP_DEBUG) += debug.o +obj-$(CONFIG_CGROUP_IFS) += ifs.o diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index f5fb12890645c41b581d6d241e7eeb0070f7acf1..0e24ff3c661a68a6f03ae158962144353b844b91 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -297,5 +297,25 @@ void cgroup1_check_for_release(struct cgroup *cgrp); int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param); int cgroup1_get_tree(struct fs_context *fc); int cgroup1_reconfigure(struct fs_context *ctx); +int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); +int cgroup_addrm_files(struct cgroup_subsys_state *css, + struct cgroup *cgrp, struct cftype cfts[], + bool is_add); + +#ifdef CONFIG_CGROUP_IFS +int cgroup_ifs_alloc(struct cgroup *cgrp); +void cgroup_ifs_free(struct cgroup *cgrp); +void cgroup_ifs_init(void); +int cgroup_ifs_add_files(struct cgroup_subsys_state *css, struct cgroup *cgrp); +void cgroup_ifs_rm_files(struct cgroup_subsys_state *css, struct cgroup *cgrp); +#else /* !CONFIG_CGROUP_IFS */ +static inline int cgroup_ifs_alloc(struct cgroup *cgrp) { return 0; } +static inline void cgroup_ifs_free(struct cgroup *cgrp) {} +static inline void cgroup_ifs_init(void) {} +static inline int cgroup_ifs_add_files(struct cgroup_subsys_state *css, + struct cgroup *cgrp) { return 0; } +static inline void cgroup_ifs_rm_files(struct cgroup_subsys_state *css, + struct cgroup *cgrp) {} +#endif /* CONFIG_CGROUP_IFS */ #endif /* __CGROUP_INTERNAL_H */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 34f1a810f7b08a569d5787ff89d4d14ed465ff5c..01553d04c3666cd1681534e8131668f20e876104 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -247,9 +247,6 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, struct cgroup_subsys *ss); static void css_release(struct percpu_ref *ref); static void kill_css(struct cgroup_subsys_state *css); -static int cgroup_addrm_files(struct cgroup_subsys_state *css, - struct cgroup *cgrp, struct cftype cfts[], - bool is_add); #ifdef CONFIG_DEBUG_CGROUP_REF #define CGROUP_REF_FN_ATTRS noinline @@ -1703,6 +1700,7 @@ static void css_clear_dir(struct cgroup_subsys_state *css) if (cgroup_psi_enabled()) cgroup_addrm_files(css, cgrp, cgroup_psi_files, false); + cgroup_ifs_rm_files(css, cgrp); } else { cgroup_addrm_files(css, cgrp, cgroup1_base_files, false); @@ -1741,6 +1739,10 @@ static int css_populate_dir(struct cgroup_subsys_state *css) if (ret < 0) return ret; } + + ret = cgroup_ifs_add_files(css, cgrp); + if (ret < 0) + return ret; } else { ret = cgroup_addrm_files(css, cgrp, cgroup1_base_files, true); @@ -4331,9 +4333,9 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, * Depending on @is_add, add or remove files defined by @cfts on @cgrp. * For removals, this function never fails. 
*/ -static int cgroup_addrm_files(struct cgroup_subsys_state *css, - struct cgroup *cgrp, struct cftype cfts[], - bool is_add) +int cgroup_addrm_files(struct cgroup_subsys_state *css, + struct cgroup *cgrp, struct cftype cfts[], + bool is_add) { struct cftype *cft, *cft_end = NULL; int ret = 0; @@ -4414,7 +4416,7 @@ static void cgroup_exit_cftypes(struct cftype *cfts) } } -static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype *cft; int ret = 0; @@ -5480,6 +5482,7 @@ static void css_free_rwork_fn(struct work_struct *work) cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); psi_cgroup_free(cgrp); + cgroup_ifs_free(cgrp); cgroup_rstat_exit(cgrp); kfree(cgrp); } else { @@ -5727,10 +5730,14 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, if (ret) goto out_kernfs_remove; + ret = cgroup_ifs_alloc(cgrp); + if (ret) + goto out_psi_free; + if (cgrp->root == &cgrp_dfl_root) { ret = cgroup_bpf_inherit(cgrp); if (ret) - goto out_psi_free; + goto out_ifs_free; } /* @@ -5791,6 +5798,8 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, return cgrp; +out_ifs_free: + cgroup_ifs_free(cgrp); out_psi_free: psi_cgroup_free(cgrp); out_kernfs_remove: @@ -6198,6 +6207,8 @@ int __init cgroup_init(void) BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); + cgroup_ifs_init(); + cgroup_rstat_boot(); get_user_ns(init_cgroup_ns.user_ns); diff --git a/kernel/cgroup/ifs.c b/kernel/cgroup/ifs.c new file mode 100644 index 0000000000000000000000000000000000000000..7ac0519139202f1ea1669f88f2947bbf8fc5c06a --- /dev/null +++ b/kernel/cgroup/ifs.c @@ -0,0 +1,329 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Interference statistics for cgroup + * + * Copyright (C) 2025-2025 Huawei Technologies Co., Ltd + */ + +#include +#include "cgroup-internal.h" + +/* smt information */ +struct smt_info { + u64 total_time; + u64 prev_read_time; + u64 noidle_enter_time; + bool is_noidle; +}; + +static DEFINE_PER_CPU(struct smt_info, smt_info); +static DEFINE_PER_CPU_READ_MOSTLY(int, smt_sibling) = -1; + +static DEFINE_PER_CPU(struct cgroup_ifs_cpu, cgrp_root_ifs_cpu); +struct cgroup_ifs cgroup_root_ifs = { + .pcpu = &cgrp_root_ifs_cpu, +}; + +DEFINE_STATIC_KEY_FALSE(cgrp_ifs_enabled); + +#ifdef CONFIG_CGROUP_IFS_DEFAULT_ENABLED +static bool ifs_enable = true; +#else +static bool ifs_enable; +#endif + +#ifdef CONFIG_SCHEDSTATS +static bool ifs_sleep_enable; +#endif + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +static bool ifs_irq_enable; +#endif + +static int __init setup_ifs(char *str) +{ + return kstrtobool(str, &ifs_enable) == 0; +} +__setup("cgroup_ifs=", setup_ifs); + +void cgroup_ifs_set_smt(cpumask_t *sibling) +{ + int cpu; + int cpuid1 = -1; + int cpuid2 = -1; + bool off = false; + + for_each_cpu(cpu, sibling) { + if (cpuid1 == -1) { + cpuid1 = cpu; + } else if (cpuid2 == -1) { + cpuid2 = cpu; + } else { + *per_cpu_ptr(&smt_sibling, cpu) = -1; + off = true; + } + } + + if (cpuid1 != -1) + *per_cpu_ptr(&smt_sibling, cpuid1) = off ? -1 : cpuid2; + + if (cpuid2 != -1) + *per_cpu_ptr(&smt_sibling, cpuid2) = off ? 
-1 : cpuid1; +} + +static void account_smttime(struct task_struct *task) +{ + u64 delta; + struct cgroup_ifs *ifs; + struct smt_info *info; + + ifs = task_ifs(task); + if (!ifs) + return; + + info = this_cpu_ptr(&smt_info); + + delta = info->total_time - info->prev_read_time; + info->prev_read_time = info->total_time; + + cgroup_ifs_account_delta(this_cpu_ptr(ifs->pcpu), IFS_SMT, delta); +} + +void cgroup_ifs_account_smttime(struct task_struct *prev, + struct task_struct *next, + struct task_struct *idle) +{ + struct smt_info *ci, *si; + u64 now, delta; + int sibling; + + sibling = this_cpu_read(smt_sibling); + if (sibling == -1 || prev == next) + return; + + ci = this_cpu_ptr(&smt_info); + si = per_cpu_ptr(&smt_info, sibling); + + now = sched_clock_cpu(smp_processor_id()); + + /* leave noidle */ + if (prev != idle && next == idle) { + ci->is_noidle = false; + /* account interference time */ + if (ci->noidle_enter_time && si->is_noidle) { + delta = now - ci->noidle_enter_time; + + ci->total_time += delta; + si->total_time += delta; + + si->noidle_enter_time = 0; + ci->noidle_enter_time = 0; + + account_smttime(prev); + } + /* enter noidle */ + } else if (prev == idle && next != idle) { + /* if the sibling is also nonidle, there is smt interference */ + if (si->is_noidle) { + ci->noidle_enter_time = now; + si->noidle_enter_time = now; + } + ci->is_noidle = true; + /* cgroup changed */ + } else if (task_ifs(prev) != task_ifs(next)) + account_smttime(prev); +} + +int cgroup_ifs_alloc(struct cgroup *cgrp) +{ + cgrp->ifs = kzalloc(sizeof(struct cgroup_ifs), GFP_KERNEL); + if (!cgrp->ifs) + return -ENOMEM; + + cgrp->ifs->pcpu = alloc_percpu(struct cgroup_ifs_cpu); + if (!cgrp->ifs->pcpu) { + kfree(cgrp->ifs); + return -ENOMEM; + } + + return 0; +} + +void cgroup_ifs_free(struct cgroup *cgrp) +{ + free_percpu(cgrp->ifs->pcpu); + kfree(cgrp->ifs); +} + +static const char *ifs_type_name(int type) +{ + char *name = NULL; + + switch (type) { + case IFS_SMT: + name = "smt"; + break; + case IFS_RUNDELAY: + name = "rundelay"; + break; + case IFS_WAKELAT: + name = "wakelat"; + break; + case IFS_THROTTLE: + name = "throttle"; + break; +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + case IFS_SOFTIRQ: + name = "softirq"; + break; + case IFS_HARDIRQ: + name = "hardirq"; + break; +#endif +#ifdef CONFIG_SCHEDSTATS + case IFS_SLEEP: + name = "sleep"; + break; +#endif + default: + break; + } + + return name; +} + +static bool should_print(int type) +{ +#ifdef CONFIG_SCHEDSTATS + if (type == IFS_SLEEP) + return ifs_sleep_enable; +#endif +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + if (type == IFS_SOFTIRQ || type == IFS_HARDIRQ) + return ifs_irq_enable; +#endif + return true; +} + +static int print_sum_time(struct cgroup_ifs *ifs, struct seq_file *seq) +{ + u64 time[NR_IFS_TYPES] = { 0 }; + int cpu; + int i; + + for_each_possible_cpu(cpu) { + for (i = 0; i < NR_IFS_TYPES; i++) { + if (!should_print(i)) + continue; + time[i] += per_cpu_ptr(ifs->pcpu, cpu)->time[i]; + } + } + + seq_printf(seq, "%-18s%s\n", "Interference", "Total Time (ns)"); + + for (i = 0; i < NR_IFS_TYPES; i++) { + if (!should_print(i)) + continue; + seq_printf(seq, "%-18s%llu\n", ifs_type_name(i), time[i]); + } + + return 0; +} + +static int cgroup_ifs_show(struct seq_file *seq, void *v) +{ + struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup; + struct cgroup_ifs __maybe_unused *ifs = cgroup_ifs(cgrp); + int ret; + + if (!ifs) { + pr_info("cgroup_ino(cgrp) = %ld\n", cgroup_ino(cgrp)); + return -EINVAL; + } + + ret = print_sum_time(ifs, seq); + if (ret) + 
return ret; + + return 0; +} + +static ssize_t cgroup_ifs_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct cgroup *cgrp = seq_css(of->seq_file)->cgroup; + struct cgroup_ifs *ifs = cgroup_ifs(cgrp); + struct cgroup_ifs_cpu *ifsc; + bool clear; + int cpu; + + if (!ifs) + return -EOPNOTSUPP; + + if (kstrtobool(strstrip(buf), &clear) < 0) + return -EINVAL; + + if (clear) { + for_each_possible_cpu(cpu) { + ifsc = per_cpu_ptr(ifs->pcpu, cpu); + memset(ifsc->time, 0, sizeof(ifsc->time)); + } + } + + return nbytes; +} + +static struct cftype cgroup_ifs_files[] = { + { + .name = "interference.stat", + .seq_show = cgroup_ifs_show, + .write = cgroup_ifs_write, + }, + { } /* terminate */ +}; + +void cgroup_ifs_init(void) +{ + if (!ifs_enable) + return; + + BUG_ON(cgroup_init_cftypes(NULL, cgroup_ifs_files)); + + static_branch_enable(&cgrp_ifs_enabled); +} + +int cgroup_ifs_add_files(struct cgroup_subsys_state *css, struct cgroup *cgrp) +{ + int ret = 0; + + if (!cgroup_ifs_enabled()) + return 0; + + ret = cgroup_addrm_files(css, cgrp, cgroup_ifs_files, true); + if (ret < 0) { + cgroup_addrm_files(css, cgrp, cgroup_ifs_files, false); + return ret; + } + + return 0; +} + +void cgroup_ifs_rm_files(struct cgroup_subsys_state *css, struct cgroup *cgrp) +{ + if (cgroup_ifs_enabled()) + cgroup_addrm_files(css, cgrp, cgroup_ifs_files, false); +} + +#ifdef CONFIG_SCHEDSTATS +void cgroup_ifs_enable_sleep_account(void) +{ + ifs_sleep_enable = true; +} +#endif + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +void cgroup_ifs_enable_irq_account(void) +{ + ifs_irq_enable = true; +} +#endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b41f3f30ef57ebbf52dc95ca2fe9a506b77b74a2..6c9da25a3d45abded0a85d2d5075f422268e6946 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -154,6 +154,16 @@ const_debug unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK; __read_mostly int scheduler_running; +#if defined(CONFIG_SCHED_CORE) || defined(CONFIG_CGROUP_IFS) +int sched_task_is_throttled(struct task_struct *p, int cpu) +{ + if (p->sched_class->task_is_throttled) + return p->sched_class->task_is_throttled(p, cpu); + + return 0; +} +#endif + #ifdef CONFIG_SCHED_CORE DEFINE_STATIC_KEY_FALSE(__sched_core_enabled); @@ -268,14 +278,6 @@ void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) resched_curr(rq); } -static int sched_task_is_throttled(struct task_struct *p, int cpu) -{ - if (p->sched_class->task_is_throttled) - return p->sched_class->task_is_throttled(p, cpu); - - return 0; -} - static struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie) { struct rb_node *node = &p->core_node; @@ -4244,6 +4246,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) trace_sched_waking(p); ttwu_do_wakeup(p); + ifs_task_waking(p); goto out; } @@ -4259,6 +4262,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) break; trace_sched_waking(p); + ifs_task_waking(p); /* * Ensure we load p->on_rq _after_ p->state, otherwise it would @@ -4667,6 +4671,7 @@ static void set_schedstats(bool enabled) { if (enabled) { compute_skid(); + cgroup_ifs_enable_sleep_account(); static_branch_enable(&sched_schedstats); } else { static_branch_disable(&sched_schedstats); @@ -4932,6 +4937,7 @@ void wake_up_new_task(struct task_struct *p) activate_task(rq, p, ENQUEUE_NOCLOCK); trace_sched_wakeup_new(p); + ifs_task_waking(p); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP if
(p->sched_class->task_woken) { @@ -6750,6 +6756,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) migrate_disable_switch(rq, prev); psi_sched_switch(prev, next, !task_on_rq_queued(prev)); + cgroup_ifs_account_smttime(prev, next, rq->idle); trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index b453f8a6a7c76473ae8bdfb1b4ee668b3d110244..499812509f7db851ac02f41550965b1c9ee5acd4 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -27,6 +27,7 @@ static int sched_clock_irqtime; void enable_sched_clock_irqtime(void) { sched_clock_irqtime = 1; + cgroup_ifs_enable_irq_account(); } void disable_sched_clock_irqtime(void) @@ -71,10 +72,13 @@ void irqtime_account_irq(struct task_struct *curr, unsigned int offset) * in that case, so as not to confuse scheduler with a special task * that do not consume any time, but still wants to run. */ - if (pc & HARDIRQ_MASK) + if (pc & HARDIRQ_MASK) { irqtime_account_delta(irqtime, delta, CPUTIME_IRQ); - else if ((pc & SOFTIRQ_OFFSET) && curr != this_cpu_ksoftirqd()) + cgroup_ifs_account_hardirq(delta); + } else if ((pc & SOFTIRQ_OFFSET) && curr != this_cpu_ksoftirqd()) { irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ); + cgroup_ifs_account_softirq(delta); + } } static u64 irqtime_tick_accounted(u64 maxtime) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index dd13bbe8c0e4844c032350361e4208905634fa62..3aa12a73c9cef8763e8014d03667b09f70ec4e1d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2715,7 +2715,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, #endif } -#ifdef CONFIG_SCHED_CORE +#if defined(CONFIG_SCHED_CORE) || defined(CONFIG_CGROUP_IFS) static int task_is_throttled_dl(struct task_struct *p, int cpu) { return p->dl.dl_throttled; @@ -2754,7 +2754,7 @@ DEFINE_SCHED_CLASS(dl) = { .switched_to = switched_to_dl, .update_curr = update_curr_dl, -#ifdef CONFIG_SCHED_CORE +#if defined(CONFIG_SCHED_CORE) || defined(CONFIG_CGROUP_IFS) .task_is_throttled = task_is_throttled_dl, #endif }; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ddaa8dd71c3e4b7735ee0981ac826724c0ef2d65..cf3f147cba4dbabf06a22af2a649095bd122e723 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6053,8 +6053,12 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) raw_spin_lock(&cfs_b->lock); if (cfs_rq->throttled_clock) { - cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock; + u64 delta = rq_clock(rq) - cfs_rq->throttled_clock; + + cfs_b->throttled_time += delta; cfs_rq->throttled_clock = 0; + cgroup_ifs_account_throttle(cfs_rq->tg->css.cgroup, + cpu_of(rq), delta); } list_del_rcu(&cfs_rq->throttled_list); raw_spin_unlock(&cfs_b->lock); @@ -14344,7 +14348,11 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, return delta > 0; } +#else +static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {} +#endif +#if defined(CONFIG_SCHED_CORE) || defined(CONFIG_CGROUP_IFS) static int task_is_throttled_fair(struct task_struct *p, int cpu) { struct cfs_rq *cfs_rq; @@ -14356,8 +14364,6 @@ static int task_is_throttled_fair(struct task_struct *p, int cpu) #endif return throttled_hierarchy(cfs_rq); } -#else -static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {} #endif #ifdef CONFIG_SCHED_STEAL @@ -15092,7 +15098,7 @@ DEFINE_SCHED_CLASS(fair) = { .task_change_group = task_change_group_fair, #endif -#ifdef CONFIG_SCHED_CORE +#if 
defined(CONFIG_SCHED_CORE) || defined(CONFIG_CGROUP_IFS) .task_is_throttled = task_is_throttled_fair, #endif diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 77bb7ee8cce092daefdb2c8099e19447fe7426d8..e2eaa8ffd009ed31b4bfe54bc585103f9af2b773 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2692,7 +2692,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) return 0; } -#ifdef CONFIG_SCHED_CORE +#if defined(CONFIG_SCHED_CORE) || defined(CONFIG_CGROUP_IFS) static int task_is_throttled_rt(struct task_struct *p, int cpu) { struct rt_rq *rt_rq; @@ -2740,7 +2740,7 @@ DEFINE_SCHED_CLASS(rt) = { .update_curr = update_curr_rt, -#ifdef CONFIG_SCHED_CORE +#if defined(CONFIG_SCHED_CORE) || defined(CONFIG_CGROUP_IFS) .task_is_throttled = task_is_throttled_rt, #endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 761870540a2168ad17d3e6f658119b68c7a3e254..41ba8ad1a753a36bcf1bd9beb980cbee32e20dac 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -546,6 +546,15 @@ extern void sched_release_group(struct task_group *tg); extern void sched_move_task(struct task_struct *tsk); +#if defined(CONFIG_SCHED_CORE) || defined(CONFIG_CGROUP_IFS) +extern int sched_task_is_throttled(struct task_struct *p, int cpu); +#else +static inline int sched_task_is_throttled(struct task_struct *p, int cpu) +{ + return 0; +} +#endif + #ifdef CONFIG_QOS_SCHED_SMART_GRID extern void start_auto_affinity(struct auto_affinity *auto_affi); extern void stop_auto_affinity(struct auto_affinity *auto_affi); @@ -2451,7 +2460,7 @@ struct sched_class { void (*task_change_group)(struct task_struct *p); #endif -#ifdef CONFIG_SCHED_CORE +#if defined(CONFIG_SCHED_CORE) || defined(CONFIG_CGROUP_IFS) int (*task_is_throttled)(struct task_struct *p, int cpu); #endif KABI_RESERVE(1) diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 306f26fde69ad0214badbfc61891dd87ed7bc46a..e3f1026ba5e80412f3284a9669c699d189e0d396 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -67,6 +67,7 @@ void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p, if (p) { account_scheduler_latency(p, delta >> 10, 1); trace_sched_stat_sleep(p, delta); + cgroup_ifs_account_sleep(p, delta); } } @@ -91,6 +92,7 @@ void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p, } trace_sched_stat_blocked(p, delta); + cgroup_ifs_account_sleep(p, delta); /* * Blocking time is in units of nanosecs, so shift by diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index a6d7206d969de0df30fbf05bd0a5e1c593631002..1b736b93124707824521e9dfadcc4523e2d0c4a8 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -2,6 +2,8 @@ #ifndef _KERNEL_STATS_H #define _KERNEL_STATS_H +#include + #ifdef CONFIG_SCHEDSTATS extern struct static_key_false sched_schedstats; @@ -134,6 +136,47 @@ __schedstats_from_se(struct sched_entity *se) #define QOS_THROTTLED 2 #endif +#ifdef CONFIG_CGROUP_IFS +static inline void ifs_account_rundelay(struct task_struct *task, u64 delta) +{ + /* + * No need to include bandwidth throttling time in rundelay, + * leave it to the throttle metric. 
+ */ + if (unlikely(task->in_throttle)) { + task->in_throttle = 0; + return; + } + + cgroup_ifs_account_rundelay(task, delta); +} + +static inline void ifs_task_waking(struct task_struct *t) +{ + if (t->sched_info.last_waking) + return; + + t->sched_info.last_waking = sched_clock(); +} + +static inline void ifs_task_arrive(struct task_struct *t) +{ + unsigned long long now, delta = 0; + + if (!t->sched_info.last_waking) + return; + + now = sched_clock(); + delta = now - t->sched_info.last_waking; + t->sched_info.last_waking = 0; + cgroup_ifs_account_wakelat(t, delta); +} +#else +static inline void ifs_account_rundelay(struct task_struct *task, u64 delta) {} +static inline void ifs_task_waking(struct task_struct *t) {} +static inline void ifs_task_arrive(struct task_struct *t) {} +#endif + #ifdef CONFIG_PSI void psi_task_change(struct task_struct *task, int clear, int set); void psi_task_switch(struct task_struct *prev, struct task_struct *next, @@ -241,6 +284,7 @@ static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t) t->sched_info.last_queued = 0; t->sched_info.run_delay += delta; + ifs_account_rundelay(t, delta); rq_sched_info_dequeue(rq, delta); } @@ -263,6 +307,7 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t) t->sched_info.last_arrival = now; t->sched_info.pcount++; + ifs_account_rundelay(t, delta); rq_sched_info_arrive(rq, delta); } @@ -275,6 +320,11 @@ static inline void sched_info_enqueue(struct rq *rq, struct task_struct *t) { if (!t->sched_info.last_queued) t->sched_info.last_queued = rq_clock(rq); + +#ifdef CONFIG_CGROUP_IFS + if (!t->in_throttle && sched_task_is_throttled(t, task_cpu(t))) + t->in_throttle = 1; +#endif } /* @@ -311,8 +361,10 @@ sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *n if (prev != rq->idle) sched_info_depart(rq, prev); - if (next != rq->idle) + if (next != rq->idle) { sched_info_arrive(rq, next); + ifs_task_arrive(next); + } } #else /* !CONFIG_SCHED_INFO: */
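
A minimal userspace sketch (not part of this patch, illustrative only) of how the new interference.stat interface could be consumed. It assumes a kernel built with CONFIG_CGROUP_IFS=y, cgroup v2 mounted at /sys/fs/cgroup, and the target cgroup path passed as argv[1]; per print_sum_time() above, the file starts with a header line followed by one "<name> <nanoseconds>" row per interference type:

/*
 * read_ifs.c - illustrative reader for interference.stat (hypothetical
 * helper, not part of this patch). Assumes cgroup v2 at /sys/fs/cgroup.
 */
#include <stdio.h>

int main(int argc, char **argv)
{
	char path[512], line[256];
	FILE *fp;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <cgroup path under /sys/fs/cgroup>\n", argv[0]);
		return 1;
	}

	snprintf(path, sizeof(path), "/sys/fs/cgroup/%s/interference.stat", argv[1]);
	fp = fopen(path, "r");
	if (!fp) {
		perror(path);
		return 1;
	}

	/* Skip the header line: "Interference      Total Time (ns)" */
	if (!fgets(line, sizeof(line), fp)) {
		fclose(fp);
		return 1;
	}

	/* Remaining rows look like "smt               123456" */
	while (fgets(line, sizeof(line), fp)) {
		char name[32];
		unsigned long long ns;

		if (sscanf(line, "%31s %llu", name, &ns) == 2)
			printf("%-10s %llu ns\n", name, ns);
	}

	fclose(fp);
	return 0;
}

Per cgroup_ifs_write() above, writing a boolean to the same file is intended to reset the accumulated counters, and on kernels without CONFIG_CGROUP_IFS_DEFAULT_ENABLED the statistics can be switched on at boot with, e.g., cgroup_ifs=1.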