From 22dd0f44655d6de8fd8273f5b883b774c5821528 Mon Sep 17 00:00:00 2001
From: Vincent Gao
Date: Thu, 14 Aug 2025 17:39:59 +0800
Subject: [PATCH 1/2] anolis: sched: add sys tracking state and make
 scheduling aware of sys mode

ANBZ: #23574

Introduce a lightweight per-CPU sys tracking state machine that records
whether the CPU is currently running in user, kernel, or guest mode.
Enhance the id_idle_cpu() logic so that a CPU is treated as a valid
migration target for high-priority tasks only when it is not in kernel
mode, improving scheduling responsiveness.

Signed-off-by: Vincent Gao
---
 include/linux/context_tracking.h       | 51 ++++++++++++++++++++++++-
 include/linux/context_tracking_state.h | 10 +++++
 kernel/context_tracking.c              | 53 ++++++++++++++++++++++++++
 kernel/sched/core.c                    |  8 ++++
 kernel/sched/fair.c                    | 42 +++++++++++++++++++-
 kernel/sched/features.h                |  1 +
 6 files changed, 162 insertions(+), 3 deletions(-)

diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index f5d127a5d819..f7f8528de24f 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -16,22 +16,31 @@ extern void context_tracking_cpu_set(int cpu);
 /* Called with interrupts disabled. */
 extern void __context_tracking_enter(enum ctx_state state);
 extern void __context_tracking_exit(enum ctx_state state);
+extern void __sys_tracking_enter(enum sys_state state);
+extern void __sys_tracking_exit(enum sys_state state);
+extern void sys_tracking_enter(enum sys_state state);
+extern void sys_tracking_exit(enum sys_state state);
 extern void context_tracking_enter(enum ctx_state state);
 extern void context_tracking_exit(enum ctx_state state);
 extern void context_tracking_user_enter(void);
 extern void context_tracking_user_exit(void);
 
+extern bool is_sys_aware_enabled(void);
+
 static inline void user_enter(void)
 {
 	if (context_tracking_enabled())
 		context_tracking_enter(CONTEXT_USER);
-
+	if (is_sys_aware_enabled())
+		sys_tracking_enter(ST_USER);
 }
 
 static inline void user_exit(void)
 {
 	if (context_tracking_enabled())
 		context_tracking_exit(CONTEXT_USER);
+	if (is_sys_aware_enabled())
+		sys_tracking_exit(ST_USER);
 }
 
 /* Called with interrupts disabled. */
@@ -39,12 +48,16 @@ static __always_inline void user_enter_irqoff(void)
 {
 	if (context_tracking_enabled())
 		__context_tracking_enter(CONTEXT_USER);
+	if (is_sys_aware_enabled())
+		__sys_tracking_enter(ST_USER);
 }
 
 static __always_inline void user_exit_irqoff(void)
 {
 	if (context_tracking_enabled())
 		__context_tracking_exit(CONTEXT_USER);
+	if (is_sys_aware_enabled())
+		__sys_tracking_exit(ST_USER);
 }
 
 static inline enum ctx_state exception_enter(void)
@@ -61,6 +74,20 @@ static inline enum ctx_state exception_enter(void)
 	return prev_ctx;
 }
 
+static inline enum sys_state st_exception_enter(void)
+{
+	enum sys_state prev_sys;
+
+	if (!is_sys_aware_enabled())
+		return 0;
+
+	prev_sys = this_cpu_read(sys_tracking.state);
+	if (prev_sys != ST_KERNEL)
+		sys_tracking_exit(prev_sys);
+
+	return prev_sys;
+}
+
 static inline void exception_exit(enum ctx_state prev_ctx)
 {
 	if (context_tracking_enabled()) {
@@ -69,6 +96,14 @@ static inline void exception_exit(enum ctx_state prev_ctx)
 	}
 }
 
+static inline void st_exception_exit(enum sys_state prev_sys)
+{
+	if (is_sys_aware_enabled()) {
+		if (prev_sys != ST_KERNEL)
+			sys_tracking_enter(prev_sys);
+	}
+}
+
 
 /**
  * ct_state() - return the current context tracking state if known
@@ -82,14 +117,22 @@ static __always_inline enum ctx_state ct_state(void)
 	return context_tracking_enabled() ?
 		this_cpu_read(context_tracking.state) : CONTEXT_DISABLED;
 }
+static __always_inline enum sys_state sys_state(void)
+{
+	return is_sys_aware_enabled() ?
+		this_cpu_read(sys_tracking.state) : ST_DISABLED;
+}
 #else
 static inline void user_enter(void) { }
 static inline void user_exit(void) { }
 static inline void user_enter_irqoff(void) { }
 static inline void user_exit_irqoff(void) { }
 static inline enum ctx_state exception_enter(void) { return 0; }
+static inline enum sys_state st_exception_enter(void) { return 0; }
 static inline void exception_exit(enum ctx_state prev_ctx) { }
+static inline void st_exception_exit(enum sys_state prev_sys) { }
 static inline enum ctx_state ct_state(void) { return CONTEXT_DISABLED; }
+static inline enum sys_state sys_state(void) { return ST_DISABLED; }
 #endif /* !CONFIG_CONTEXT_TRACKING */
 
 #define CT_WARN_ON(cond) WARN_ON(context_tracking_enabled() && (cond))
@@ -115,6 +158,9 @@ static __always_inline void guest_enter_irqoff(void)
 	if (context_tracking_enabled())
 		__context_tracking_enter(CONTEXT_GUEST);
 
+	if (is_sys_aware_enabled())
+		__sys_tracking_enter(ST_GUEST);
+
 	/* KVM does not hold any references to rcu protected data when it
 	 * switches CPU into a guest mode. In fact switching to a guest mode
 	 * is very similar to exiting to userspace from rcu point of view. In
@@ -133,6 +179,9 @@ static __always_inline void context_tracking_guest_exit(void)
 {
 	if (context_tracking_enabled())
 		__context_tracking_exit(CONTEXT_GUEST);
+
+	if (is_sys_aware_enabled())
+		__sys_tracking_exit(ST_GUEST);
 }
 
 static __always_inline void vtime_account_guest_exit(void)
diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h
index 65a60d3313b0..d045739e01f4 100644
--- a/include/linux/context_tracking_state.h
+++ b/include/linux/context_tracking_state.h
@@ -22,9 +22,19 @@ struct context_tracking {
 	} state;
 };
 
+struct sys_tracking {
+	enum sys_state {
+		ST_DISABLED = -1, /* returned by sys_tracking_state() if unknown */
+		ST_KERNEL = 0,
+		ST_USER,
+		ST_GUEST,
+	} state;
+};
+
 #ifdef CONFIG_CONTEXT_TRACKING
 extern struct static_key_false context_tracking_key;
 DECLARE_PER_CPU(struct context_tracking, context_tracking);
+DECLARE_PER_CPU(struct sys_tracking, sys_tracking);
 
 static __always_inline bool context_tracking_enabled(void)
 {
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 36a98c48aedc..75a4e2d4c5de 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -29,6 +29,7 @@ DEFINE_STATIC_KEY_FALSE(context_tracking_key);
 EXPORT_SYMBOL_GPL(context_tracking_key);
 
 DEFINE_PER_CPU(struct context_tracking, context_tracking);
+DEFINE_PER_CPU(struct sys_tracking, sys_tracking);
 EXPORT_SYMBOL_GPL(context_tracking);
 
 static noinstr bool context_tracking_recursion_enter(void)
@@ -103,6 +104,16 @@ void noinstr __context_tracking_enter(enum ctx_state state)
 }
 EXPORT_SYMBOL_GPL(__context_tracking_enter);
 
+void noinstr __sys_tracking_enter(enum sys_state state)
+{
+	if (!is_sys_aware_enabled())
+		return;
+
+	if (__this_cpu_read(sys_tracking.state) != state)
+		__this_cpu_write(sys_tracking.state, state);
+}
+EXPORT_SYMBOL_GPL(__sys_tracking_enter);
+
 void context_tracking_enter(enum ctx_state state)
 {
 	unsigned long flags;
@@ -125,6 +136,22 @@ void context_tracking_enter(enum ctx_state state)
 NOKPROBE_SYMBOL(context_tracking_enter);
 EXPORT_SYMBOL_GPL(context_tracking_enter);
 
+void sys_tracking_enter(enum sys_state state)
+{
+	unsigned long flags;
+
+	if (!is_sys_aware_enabled())
+		return;
+
+	if (in_interrupt())
+		return;
+	local_irq_save(flags);
+	__sys_tracking_enter(state);
+	local_irq_restore(flags);
+}
+NOKPROBE_SYMBOL(sys_tracking_enter);
+EXPORT_SYMBOL_GPL(sys_tracking_enter);
+
 void context_tracking_user_enter(void)
 {
 	user_enter();
 }
@@ -168,6 +195,16 @@ void noinstr __context_tracking_exit(enum ctx_state state)
 }
 EXPORT_SYMBOL_GPL(__context_tracking_exit);
 
+void noinstr __sys_tracking_exit(enum sys_state state)
+{
+	if (!is_sys_aware_enabled())
+		return;
+
+	if (__this_cpu_read(sys_tracking.state) == state)
+		__this_cpu_write(sys_tracking.state, ST_KERNEL);
+}
+EXPORT_SYMBOL_GPL(__sys_tracking_exit);
+
 void context_tracking_exit(enum ctx_state state)
 {
 	unsigned long flags;
@@ -182,6 +219,22 @@ void context_tracking_exit(enum ctx_state state)
 NOKPROBE_SYMBOL(context_tracking_exit);
 EXPORT_SYMBOL_GPL(context_tracking_exit);
 
+void sys_tracking_exit(enum sys_state state)
+{
+	unsigned long flags;
+
+	if (!is_sys_aware_enabled())
+		return;
+
+	if (in_interrupt())
+		return;
+	local_irq_save(flags);
+	__sys_tracking_exit(state);
+	local_irq_restore(flags);
+}
+NOKPROBE_SYMBOL(sys_tracking_exit);
+EXPORT_SYMBOL_GPL(sys_tracking_exit);
+
 void context_tracking_user_exit(void)
 {
 	user_exit();
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1b325aee62a3..cb4cb2bc16d7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5887,7 +5887,9 @@ asmlinkage __visible void __sched schedule_user(void)
 	 * too frequently to make sense yet.
 	 */
 	enum ctx_state prev_state = exception_enter();
+	enum sys_state prev_st_state = st_exception_enter();
 	schedule();
+	st_exception_exit(prev_st_state);
 	exception_exit(prev_state);
 }
 #endif
@@ -5968,6 +5970,7 @@ EXPORT_SYMBOL(preempt_schedule);
 asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
 {
 	enum ctx_state prev_ctx;
+	enum sys_state prev_st_state;
 
 	if (likely(!preemptible()))
 		return;
@@ -5994,7 +5997,9 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
 		 * an infinite recursion.
 		 */
 		prev_ctx = exception_enter();
+		prev_st_state = st_exception_enter();
 		__schedule(true);
+		st_exception_exit(prev_st_state);
 		exception_exit(prev_ctx);
 
 		preempt_latency_stop(1);
@@ -6014,11 +6019,13 @@ EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
 asmlinkage __visible void __sched preempt_schedule_irq(void)
 {
 	enum ctx_state prev_state;
+	enum sys_state prev_st_state;
 
 	/* Catch callers which need to be fixed */
 	BUG_ON(preempt_count() || !irqs_disabled());
 
 	prev_state = exception_enter();
+	prev_st_state = st_exception_enter();
 
 	do {
 		preempt_disable();
@@ -6028,6 +6035,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
 		sched_preempt_enable_no_resched();
 	} while (need_resched());
 
+	st_exception_exit(prev_st_state);
 	exception_exit(prev_state);
 }
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1fc6bc53c881..dea3c1884291 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -21,7 +21,7 @@
  * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
  */
 #include "sched.h"
-
+#include <linux/context_tracking.h>
 /*
  * Targeted preemption latency for CPU-bound tasks:
  *
@@ -1056,6 +1056,33 @@ static inline int get_id_book_cpu_nr_tries(void)
 	return 0;
 }
 
+bool is_sys_aware_enabled(void)
+{
+	return sched_feat(ID_SYS_AWARE);
+}
+EXPORT_SYMBOL_GPL(is_sys_aware_enabled);
+
+#if defined(CONFIG_PREEMPT) || !defined(CONFIG_CONTEXT_TRACKING)
+static inline bool is_cpu_in_sys_mode(int cpu)
+{
+	return false;
+}
+#else
+static inline bool is_cpu_in_sys_mode(int cpu)
+{
+	if (!is_sys_aware_enabled())
+		return false;
+
+	if (!cpu_online(cpu))
+		return false;
+
+	if (cpu_rq(cpu)->curr == cpu_rq(cpu)->idle)
+		return false;
+
+	return per_cpu(sys_tracking.state, cpu) == ST_KERNEL;
+}
+#endif
+
 static noinline bool
 id_idle_cpu(struct task_struct *p, int cpu, bool expellee, bool *idle)
 {
@@ -1107,6 +1134,10 @@ id_idle_cpu(struct task_struct *p, int cpu, bool expellee, bool *idle)
 
 	/* CPU full of underclass is idle for highclass */
 	if (!is_idle) {
+
+		if (is_highclass_task(p) && is_cpu_in_sys_mode(cpu))
+			return false;
+
 		/*
 		 * For ID_LOAD_BALANCE, CPU full of underclass is also idle
 		 * for normal.
@@ -2353,7 +2384,14 @@ id_wake_affine(struct task_struct *p, int this_cpu, int prev_cpu)
 {
 	return true;
 }
-
+bool is_sys_aware_enabled(void)
+{
+	return false;
+}
+static inline bool is_cpu_in_sys_mode(int cpu)
+{
+	return false;
+}
 static inline bool
 id_idle_cpu(struct task_struct *p, int cpu, bool expellee, bool *idle)
 {
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index ee41c2abc9de..7ae9c211cd2d 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -108,6 +108,7 @@ SCHED_FEAT(ID_ABSOLUTE_EXPEL, false)
 SCHED_FEAT(ID_LOAD_BALANCE, false)
 SCHED_FEAT(ID_PUSH_EXPELLEE, false)
 SCHED_FEAT(ID_BOOK_CPU, false)
+SCHED_FEAT(ID_SYS_AWARE, false)
 #endif
 
 #ifdef CONFIG_SCHED_CORE
-- 
Gitee

From 1b3f84a7d5aafcf4c28dc3a203b86822af103962 Mon Sep 17 00:00:00 2001
From: Vincent Gao
Date: Thu, 21 Aug 2025 18:31:04 +0800
Subject: [PATCH 2/2] anolis: sched: pick a better backup CPU when sys aware
 is enabled

ANBZ: #23574

When sys aware is enabled, prefer an underclass-only CPU that is not in
sys state as the backup CPU. If no such CPU exists, fall back to an
underclass-only CPU that is in sys state.

Signed-off-by: Vincent Gao
---
 kernel/sched/fair.c | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dea3c1884291..b2fbcb49b737 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1084,7 +1084,7 @@ static inline bool is_cpu_in_sys_mode(int cpu)
 #endif
 
 static noinline bool
-id_idle_cpu(struct task_struct *p, int cpu, bool expellee, bool *idle)
+id_idle_cpu(struct task_struct *p, int cpu, bool expellee, bool *idle, bool *is_sys)
 {
 	struct rq *rq;
 	bool need_expel;
@@ -1135,8 +1135,8 @@ id_idle_cpu(struct task_struct *p, int cpu, bool expellee, bool *idle)
 	/* CPU full of underclass is idle for highclass */
 	if (!is_idle) {
 
-		if (is_highclass_task(p) && is_cpu_in_sys_mode(cpu))
-			return false;
+		if (is_sys && is_highclass_task(p) && is_cpu_in_sys_mode(cpu))
+			*is_sys = true;
 
 		/*
 		 * For ID_LOAD_BALANCE, CPU full of underclass is also idle
@@ -2393,7 +2393,7 @@ static inline bool is_cpu_in_sys_mode(int cpu)
 	return false;
 }
 static inline bool
-id_idle_cpu(struct task_struct *p, int cpu, bool expellee, bool *idle)
+id_idle_cpu(struct task_struct *p, int cpu, bool expellee, bool *idle, bool *is_sys)
 {
 	bool is_idle = available_idle_cpu(cpu);
 
@@ -6913,7 +6913,7 @@ static void __push_expellee(struct rq *rq)
 	for_each_cpu_wrap(i, traverse_mask, cpu) {
 		struct rq *tmp_rq = cpu_rq(i);
 
-		if (id_idle_cpu(p, i, true, &idle)) {
+		if (id_idle_cpu(p, i, true, &idle, NULL)) {
 			dst_cpu = i;
 			dst_rq = cpu_rq(dst_cpu);
 			/*
@@ -8890,7 +8890,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
 
 static inline int __select_idle_cpu(int cpu, struct task_struct *p, int *id_backup)
 {
-	bool idle, is_seeker, is_expellee;
+	bool idle, is_seeker, is_expellee, is_sys = false;
 
 	is_seeker = is_idle_seeker_task(p);
 	is_expellee = is_expellee_task(p);
@@ -8900,12 +8900,13 @@ static inline int __select_idle_cpu(int cpu, struct task_struct *p, int *id_back
 	 * a backup option, which will be pick only when
 	 * failed to locate a real idle one.
 	 */
-	if ((id_idle_cpu(p, cpu, is_expellee, &idle) || sched_idle_cpu(cpu)) &&
+	if ((id_idle_cpu(p, cpu, is_expellee, &idle, &is_sys) || sched_idle_cpu(cpu)) &&
 	    sched_cpu_cookie_match(cpu_rq(cpu), p)) {
 		if (!group_identity_disabled()) {
 			if (idle || !is_seeker)
 				return cpu;
-			*id_backup = cpu;
+			if (*id_backup == -1 || !is_sys)
+				*id_backup = cpu;
 		} else
 			return cpu;
 	}
@@ -9016,7 +9017,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
 		if (!cpumask_test_cpu(cpu, task_allowed_cpu(p)) ||
 		    !cpumask_test_cpu(cpu, sched_domain_span(sd)))
 			continue;
-		if (id_idle_cpu(p, cpu, is_expellee, NULL) || sched_idle_cpu(cpu))
+		if (id_idle_cpu(p, cpu, is_expellee, NULL, NULL) || sched_idle_cpu(cpu))
 			return cpu;
 	}
 
@@ -9214,7 +9215,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	 */
 	lockdep_assert_irqs_disabled();
 
-	if ((id_idle_cpu(p, target, is_expellee, NULL) || sched_idle_cpu(target)) &&
+	if ((id_idle_cpu(p, target, is_expellee, NULL, NULL) || sched_idle_cpu(target)) &&
 	    asym_fits_cpu(task_util, util_min, util_max, target))
 		return target;
 
@@ -9222,7 +9223,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	 * If the previous CPU is cache affine and idle, don't be stupid:
 	 */
 	if (prev != target && cpus_share_cache(prev, target) &&
-	    (id_idle_cpu(p, prev, is_expellee, NULL) || sched_idle_cpu(prev)) &&
+	    (id_idle_cpu(p, prev, is_expellee, NULL, NULL) || sched_idle_cpu(prev)) &&
 	    asym_fits_cpu(task_util, util_min, util_max, prev)) {
 
 		if (!static_branch_unlikely(&sched_cluster_active) ||
@@ -9253,7 +9254,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 
 	if (recent_used_cpu != prev &&
 	    recent_used_cpu != target &&
 	    cpus_share_cache(recent_used_cpu, target) &&
-	    (id_idle_cpu(p, recent_used_cpu, is_expellee, NULL) ||
+	    (id_idle_cpu(p, recent_used_cpu, is_expellee, NULL, NULL) ||
 	     sched_idle_cpu(recent_used_cpu)) &&
 	    cpumask_test_cpu(p->recent_used_cpu, task_allowed_cpu(p)) &&
 	    asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
@@ -9794,7 +9795,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 
 	if (is_highclass_task(p) && found_id_idle_cpu()) {
 		rq = cpu_rq(new_cpu);
 		rq_lock(rq, &rf);
-		if (!id_idle_cpu(p, new_cpu, false, NULL)) {
+		if (!id_idle_cpu(p, new_cpu, false, NULL, NULL)) {
 			if (nr_tries > 0) {
 				nr_tries--;
 				rq_unlock(rq, &rf);
-- 
Gitee
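
For reference, a minimal user-space sketch of the backup-CPU rule that
PATCH 2/2 adds to __select_idle_cpu(). This is an illustration only, not
code from the series: pick_backup_cpu(), struct cpu_info and its two fields
are hypothetical names standing in for the kernel-side id_idle_cpu() /
is_cpu_in_sys_mode() logic.

/*
 * Illustration-only sketch of the backup-CPU preference: any underclass-only
 * CPU may fill an empty backup slot, but only a CPU that is not in sys
 * (kernel) mode may overwrite an earlier choice, mirroring
 * "if (*id_backup == -1 || !is_sys) *id_backup = cpu;".
 */
#include <stdbool.h>
#include <stdio.h>

struct cpu_info {
	bool underclass_only;	/* CPU currently runs only underclass tasks */
	bool in_sys_mode;	/* CPU is executing in kernel (sys) mode */
};

static int pick_backup_cpu(const struct cpu_info *cpus, int nr_cpus)
{
	int backup = -1;

	for (int cpu = 0; cpu < nr_cpus; cpu++) {
		if (!cpus[cpu].underclass_only)
			continue;	/* not a backup candidate at all */
		if (backup == -1 || !cpus[cpu].in_sys_mode)
			backup = cpu;	/* prefer a CPU not in sys mode */
	}
	return backup;
}

int main(void)
{
	/* CPU1 and CPU2 are underclass-only; only CPU2 is outside sys mode. */
	struct cpu_info cpus[] = {
		{ .underclass_only = false, .in_sys_mode = false },
		{ .underclass_only = true,  .in_sys_mode = true  },
		{ .underclass_only = true,  .in_sys_mode = false },
	};

	printf("backup CPU: %d\n", pick_backup_cpu(cpus, 3));	/* prints 2 */
	return 0;
}

With the sample array this prints "backup CPU: 2": the underclass-only CPU
that is not in sys mode wins over the one that is, which matches the policy
described in the commit message of PATCH 2/2.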