diff --git a/1008-sched-fair-Introduce-ID_LOAD_BALANCE.patch b/1008-sched-fair-Introduce-ID_LOAD_BALANCE.patch new file mode 100644 index 0000000000000000000000000000000000000000..cfd46d28fae23ddeb2f22862fe54ec498203db55 --- /dev/null +++ b/1008-sched-fair-Introduce-ID_LOAD_BALANCE.patch @@ -0,0 +1,392 @@ +From 0e96d655309e95ab249be847aeba66243fc67656 Mon Sep 17 00:00:00 2001 +From: Cruz Zhao +Date: Sat, 10 Dec 2022 13:56:05 +0000 +Subject: [PATCH] sched/fair: Introduce ID_LOAD_BALANCE + +This patch introduces ID_LOAD_BALANCE, which provides a backup path for +select_idle_core and select_idle_cpu, and also gives highclass and normal +tasks a better chance to find an idle cpu. + +This patch aims at preventing expeller and other tasks +from running on the same core at the same time. + +select_idle_core: + - If all cpus of the core are id_idle, it's an available + backup core. + - If one cpu is id_idle and the other is preemptable for + highclass, it's also an available backup core. + +select_idle_cpu: + - If target is on expel, find a cpu that is not on expel as a backup. + +select_idle_sibling: + - If there's no idle cpu in the LLC, look for one in a higher sched domain. + +This patch also fixes id_wake_affine. + +This patch also changes the following default sched features: + + - ID_RESCUE_EXPELLEE false + - ID_EXPELLER_SHARE_CORE false + - ID_LOAD_BALANCE true + +Signed-off-by: Cruz Zhao +--- + kernel/sched/mod/fair.c | 147 ++++++++++++++++++++++++++++++------ + kernel/sched/mod/features.h | 5 +- + 2 files changed, 127 insertions(+), 25 deletions(-) + +diff --git a/kernel/sched/mod/fair.c b/kernel/sched/mod/fair.c +index d651430f2..7bd0e370c 100644 +--- a/kernel/sched/mod/fair.c ++++ b/kernel/sched/mod/fair.c +@@ -901,8 +901,8 @@ id_can_migrate_task(struct task_struct *p, struct rq *src_rq, struct rq *dst_rq) + && src_rq->nr_high_running < 2) + goto bad_dst; + +- if (!sched_feat(ID_EXPELLER_SHARE_CORE) && +- task_is_expeller(p) && rq_on_expel(dst_rq)) ++ /* An expeller task does not want to share its core with any other task */ ++ if (!sched_feat(ID_EXPELLER_SHARE_CORE) && rq_on_expel(dst_rq)) + goto bad_dst; + + if (!is_expellee_task(p)) +@@ -939,11 +939,12 @@ id_wake_affine(struct task_struct *p, int this_cpu, int prev_cpu) + struct rq *prev_rq = cpu_rq(prev_cpu); + + /* Last highclass should stay */ +- if (is_highclass_task(p) && prev_rq->nr_high_running < 1) +- return false; ++ if (sched_feat(ID_LAST_HIGHCLASS_STAY) && is_highclass_task(p) && prev_rq->nr_high_running < 1) ++ /* The old code returned false here by mistake */ ++ return true; + +- /* Do not pull underclass to the cpu on expel */ +- if (is_expellee_task(p) && rq_on_expel(this_rq)) ++ /* Do not pull any task to a cpu on expel */ ++ if ((!sched_feat(ID_EXPELLER_SHARE_CORE) || is_expellee_task(p)) && rq_on_expel(this_rq)) + return false; + + return true; +@@ -980,16 +981,16 @@ id_idle_cpu(struct task_struct *p, int cpu, bool expellee, bool *idle) + * highclass workload are heavy, for others they + * don't really need to worry about this.
+ */ +- if (!sched_feat(ID_EXPELLER_SHARE_CORE) && +- task_is_expeller(p) && rq_on_expel(rq)) ++ /* Expeller task doesn't wanna share core with any task*/ ++ if (!sched_feat(ID_EXPELLER_SHARE_CORE) && rq_on_expel(rq)) + return false; + + if (need_expel) + return false; + +- /* CPU full of underclass is idle for highclass */ ++ /* CPU full of underclass is idle for highclass and normal task*/ + if (!is_idle) +- return is_highclass_task(p) && underclass_only(cpu); ++ return !is_underclass_task(p) && underclass_only(cpu); + + if (!is_saver) + return true; +@@ -1001,6 +1002,23 @@ id_idle_cpu(struct task_struct *p, int cpu, bool expellee, bool *idle) + return avg_idle >= sysctl_sched_idle_saver_wmark; + } + ++/* Whether this core is id_idle for task p */ ++static bool id_idle_core(struct task_struct *p, int core) ++{ ++ int cpu; ++ ++ if (!sched_feat(ID_LOAD_BALANCE)) ++ return true; ++ ++ if (!is_highclass_task(p)) ++ return true; ++ for_each_cpu(cpu, cpu_smt_mask(core)) ++ if (!id_idle_cpu(p, cpu, false, NULL)) ++ return false; ++ ++ return true; ++} ++ + #ifdef CONFIG_CFS_BANDWIDTH + static __always_inline void + id_update_make_up(struct task_group *tg, struct rq *rq, struct cfs_rq *cfs_rq, +@@ -7640,23 +7658,46 @@ void __update_idle_core(struct rq *rq) + static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) + { + struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); +- int core, cpu; ++ int core, cpu, id_backup = -1; ++ bool is_expellee; ++ int max_id_idle = 0, max_id_idle_core = -1; + + if (!static_branch_likely(&sched_smt_present)) + return -1; + +- if (!test_idle_cores(target, false)) ++ /* If there's an available id_idle_core, find it */ ++ if (!test_idle_cores(target, false) && (!sched_feat(ID_LOAD_BALANCE) || is_underclass_task(p))) + return -1; + + cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); + ++ is_expellee = is_expellee_task(p); + for_each_cpu_wrap(core, cpus, target) { + bool idle = true; ++ int id_idle = 0; ++ bool is_idle; ++ int preemptable_cpu = -1; + +- for_each_cpu(cpu, cpu_smt_mask(core)) { +- if (!available_idle_cpu(cpu)) { +- idle = false; +- break; ++ /* If cpus of the core are all id_idle, it's an available backup core. ++ * If one cpu is id_idle and the other is preemptable for highclass, ++ * it's also an available backup core. ++ */ ++ if (sched_feat(ID_LOAD_BALANCE)) { ++ for_each_cpu(cpu, cpu_smt_mask(core)) { ++ if (id_idle_cpu(p, cpu, is_expellee, &is_idle)) ++ id_idle++; ++ else if (is_highclass_task(p) && !cpu_rq(cpu)->nr_high_running) ++ preemptable_cpu = cpu; ++ ++ if (!is_idle) ++ idle = false; ++ } ++ } else { ++ for_each_cpu(cpu, cpu_smt_mask(core)) { ++ if (!available_idle_cpu(cpu)) { ++ idle = false; ++ break; ++ } + } + } + cpumask_andnot(cpus, cpus, cpu_smt_mask(core)); +@@ -7664,6 +7705,19 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int + if (idle) + return core; + ++ if (sched_feat(ID_LOAD_BALANCE)) { ++ /* look for an available backup core */ ++ if (id_idle == 1 && preemptable_cpu != -1) ++ id_backup = preemptable_cpu; ++ else if (id_idle > 1) ++ id_backup = core; ++ ++ /* The more id_idle_cpu, the better */ ++ if (id_idle > max_id_idle) { ++ max_id_idle = id_idle; ++ max_id_idle_core = id_backup; ++ } ++ } + } + + /* +@@ -7671,7 +7725,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int + */ + set_idle_cores(target, 0); + +- return -1; ++ return sched_feat(ID_LOAD_BALANCE) ? 
max_id_idle_core : -1; + } + + /* +@@ -7725,6 +7779,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t + int this = smp_processor_id(); + int cpu, nr = INT_MAX, id_backup = -1; + bool is_seeker, is_expellee; ++ struct rq *target_rq = cpu_rq(target); + + this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); + if (!this_sd) +@@ -7756,6 +7811,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t + is_expellee = is_expellee_task(p); + for_each_cpu_wrap(cpu, cpus, target) { + bool idle; ++ struct rq *rq = cpu_rq(cpu); + + if (!--nr) + return -1; +@@ -7770,6 +7826,11 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t + if (idle || !is_seeker) + break; + id_backup = cpu; ++ /* If target is on expel, find an unexpelling cpu as backup */ ++ } else if (!sched_feat(ID_EXPELLER_SHARE_CORE) && ++ rq_on_expel(target_rq) && !rq_on_expel(rq) && ++ !is_highclass_task(p) && id_backup == -1) { ++ id_backup = cpu; + } + } + +@@ -7830,6 +7891,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) + unsigned long task_util; + int i, recent_used_cpu; + bool is_expellee = is_expellee_task(p); ++ bool retried = false; + + /* + * On asymmetric system, update task utilization because we will check +@@ -7840,16 +7902,18 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) + task_util = uclamp_task_util(p); + } + ++ /* If target isn't an id idle core, don't select it */ + if ((id_idle_cpu(p, target, is_expellee, NULL) || sched_idle_cpu(target)) && +- asym_fits_capacity(task_util, target)) ++ asym_fits_capacity(task_util, target) && id_idle_core(p, target)) + return target; + + /* + * If the previous CPU is cache affine and idle, don't be stupid: ++ * If prev isn't an id idle core, don't select it + */ + if (prev != target && cpus_share_cache(prev, target) && + (id_idle_cpu(p, prev, is_expellee, NULL) || sched_idle_cpu(prev)) && +- asym_fits_capacity(task_util, prev)) ++ asym_fits_capacity(task_util, prev) && id_idle_core(p, prev)) + return prev; + + /* +@@ -7859,16 +7923,19 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) + * per-cpu kthread that is now complete and the wakeup is + * essentially a sync wakeup. An obvious example of this + * pattern is IO completions. 
++ * If prev isn't an id idle core, don't select it + */ + if (is_per_cpu_kthread(current) && + in_task() && + prev == smp_processor_id() && + this_rq()->nr_running <= 1 && +- asym_fits_capacity(task_util, prev)) { ++ asym_fits_capacity(task_util, prev) && ++ id_idle_core(p, prev)) { + return prev; + } + + /* Check a recently used CPU as a potential idle candidate: */ ++ /* If recent_used_cpu isn't an id idle core, don't select it */ + recent_used_cpu = p->recent_used_cpu; + if (recent_used_cpu != prev && + recent_used_cpu != target && +@@ -7876,7 +7943,8 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) + (id_idle_cpu(p, recent_used_cpu, is_expellee, NULL) || + sched_idle_cpu(recent_used_cpu)) && + cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) && +- asym_fits_capacity(task_util, recent_used_cpu)) { ++ asym_fits_capacity(task_util, recent_used_cpu) && ++ id_idle_core(p, recent_used_cpu)) { + /* + * Replace recent_used_cpu with prev as it is a potential + * candidate for the next wake: +@@ -7909,10 +7977,21 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) + if (!sd) + return target; + ++select_idle_core: + i = select_idle_core(p, sd, target); + if ((unsigned)i < nr_cpumask_bits) + return i; + ++ /* ++ * highclass task prefer to find an idle core, if there's no one in LLC, ++ * find it in higher sched domain. ++ */ ++ if (sched_feat(ID_LOAD_BALANCE) && is_highclass_task(p) && i == -1 && !retried) { ++ sd = rcu_dereference(per_cpu(sd_numa, target)); ++ retried = true; ++ goto select_idle_core; ++ } ++ + i = select_idle_cpu(p, sd, target); + if ((unsigned)i < nr_cpumask_bits) + return i; +@@ -7921,6 +8000,15 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) + if ((unsigned)i < nr_cpumask_bits) + return i; + ++ /* ++ * Don't disturb expeller, find a proper one in higher sched domain. ++ */ ++ if (sched_feat(ID_LOAD_BALANCE) && !sched_feat(ID_EXPELLER_SHARE_CORE) && ++ rq_on_expel(cpu_rq(target)) && !is_expellee && !retried) { ++ sd = rcu_dereference(per_cpu(sd_numa, target)); ++ retried = true; ++ goto select_idle_core; ++ } + return target; + } + +@@ -8319,9 +8407,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f + int new_cpu = prev_cpu; + int want_affine = 0; + int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); ++ bool highclass_fork = (sd_flag & SD_BALANCE_FORK) && is_highclass_task(p); + + /* Endow LS task the ability to balance at fork */ +- if ((sd_flag & SD_BALANCE_FORK) && is_highclass_task(p)) ++ if (highclass_fork) + sd_flag |= SD_BALANCE_WAKE; + + if (sd_flag & SD_BALANCE_WAKE) { +@@ -8335,6 +8424,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f + } + + want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr) ++ && (!highclass_fork) + && id_wake_affine(p, cpu, prev_cpu); + } + +@@ -8362,9 +8452,17 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f + if (unlikely(sd)) { + /* Slow path */ + new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); ++ /* As underclass is calculated into load, rq on expel will ++ * always be selected, if so, select another cpu. ++ * ++ * TODO: take underclass out of load. ++ */ ++ if (!sched_feat(ID_EXPELLER_SHARE_CORE) && rq_on_expel(cpu_rq(new_cpu)) && ++ !is_underclass_task(p)) ++ goto fast_path; + } else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? 
*/ + /* Fast path */ +- ++fast_path: + new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); + + if (want_affine) +@@ -12244,6 +12342,9 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) + if (!cpu_active(this_cpu)) + return 0; + ++ /* Do not pull anyone to the cpu on expel */ ++ if (!sched_feat(ID_EXPELLER_SHARE_CORE) && rq_on_expel(this_rq)) ++ return 0; + /* + * This is OK, because current is on_cpu, which avoids it being picked + * for load-balance and preemption/IRQs are still disabled avoiding +diff --git a/kernel/sched/mod/features.h b/kernel/sched/mod/features.h +index fc317163d..2faab8698 100644 +--- a/kernel/sched/mod/features.h ++++ b/kernel/sched/mod/features.h +@@ -98,9 +98,10 @@ SCHED_FEAT(BASE_SLICE, true) + + #ifdef CONFIG_GROUP_IDENTITY + SCHED_FEAT(ID_IDLE_AVG, true) +-SCHED_FEAT(ID_RESCUE_EXPELLEE, true) ++SCHED_FEAT(ID_RESCUE_EXPELLEE, false) + SCHED_FEAT(ID_EXPELLEE_NEVER_HOT, false) + SCHED_FEAT(ID_LAST_HIGHCLASS_STAY, false) + SCHED_FEAT(ID_LOOSE_EXPEL, false) +-SCHED_FEAT(ID_EXPELLER_SHARE_CORE, true) ++SCHED_FEAT(ID_EXPELLER_SHARE_CORE, false) ++SCHED_FEAT(ID_LOAD_BALANCE, true) + #endif +-- +2.27.0 + diff --git a/scheduler-group-identity.spec b/scheduler-group-identity.spec index 1f904347fdac126e7820d6de0f636e509522aeb5..51736280b5ae4e62a5e251b790f63f6802655ab6 100644 --- a/scheduler-group-identity.spec +++ b/scheduler-group-identity.spec @@ -4,7 +4,7 @@ %define anolis_version 1 %define KVER 5.10.134 %define KREL 12.2 -%define anolis_release 2 +%define anolis_release 3 Name: scheduler-group-identity Version: %{KVER}.%{KREL}.%{anolis_version} @@ -29,6 +29,7 @@ Patch1004: 1004-sched-work-around-AliSecGuard.patch Patch1005: 1005-sched-fix-panic-when-CPU-hotplug-on-5.10.patch Patch1006: 1006-sched-add-the-sidecar-of-smp.c-to-fix-the-scheduler_.patch Patch1007: 1007-sched-miss-put_prev_task-after-pick_next_task.patch +Patch1008: 1008-sched-fair-Introduce-ID_LOAD_BALANCE.patch ExclusiveArch: x86_64 @@ -113,6 +114,9 @@ systemctl reset-failed plugsched %changelog +* Tue Dec 13 2022 Cruz Zhao - 5.10.134.12.2.1-3 +- introduce sched_feat ID_LOAD_BALANCE + * Wed Dec 7 2022 Erwei Deng - 5.10.134.12.2.1-2 - fix a bug of missing put_prev_task_fair after pick_next_task_fair
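For reference, the backup-core heuristic that the patch description above outlines (count the id_idle siblings of each core; treat a core as a backup when every sibling is id_idle, or when one sibling is id_idle and the other is preemptable for a highclass task; prefer the candidate with the most id_idle siblings) can be sketched in isolation as follows. This is a minimal user-space illustration and not the kernel implementation: struct smt_core, struct cpu_state, core_backup_cpu and pick_backup are hypothetical simplifications introduced only for this example.

/*
 * Standalone sketch of the ID_LOAD_BALANCE backup-core selection idea.
 * All types and helpers here are illustrative; the real logic lives in
 * select_idle_core() in the patch above.
 */
#include <stdio.h>
#include <stdbool.h>

#define SMT_WIDTH 2	/* assume two hardware threads per core */

struct cpu_state {
	bool id_idle;		/* id_idle_cpu() would accept this cpu */
	int  nr_high_running;	/* highclass tasks currently on the cpu */
};

struct smt_core {
	int cpu[SMT_WIDTH];			/* cpu ids of the siblings */
	struct cpu_state state[SMT_WIDTH];
};

/*
 * Score one core:
 *  - every sibling id_idle                      -> the core itself is a backup
 *  - one sibling id_idle, the other preemptable -> that sibling is a backup cpu
 * Returns the backup cpu id (or -1) and reports the id_idle count so the
 * caller can keep the core with the most id_idle siblings.
 */
static int core_backup_cpu(const struct smt_core *core, bool highclass,
			   int *id_idle_out)
{
	int id_idle = 0, preemptable_cpu = -1, i;

	for (i = 0; i < SMT_WIDTH; i++) {
		if (core->state[i].id_idle)
			id_idle++;
		else if (highclass && core->state[i].nr_high_running == 0)
			preemptable_cpu = core->cpu[i];
	}

	*id_idle_out = id_idle;
	if (id_idle == SMT_WIDTH)
		return core->cpu[0];		/* whole core is a backup */
	if (id_idle >= 1 && preemptable_cpu != -1)
		return preemptable_cpu;		/* half idle + preemptable */
	return -1;
}

/* Walk the cores and keep the backup with the most id_idle siblings. */
static int pick_backup(const struct smt_core *cores, int nr, bool highclass)
{
	int best = -1, best_id_idle = 0, i;

	for (i = 0; i < nr; i++) {
		int id_idle, backup;

		backup = core_backup_cpu(&cores[i], highclass, &id_idle);
		if (backup != -1 && id_idle > best_id_idle) {
			best_id_idle = id_idle;
			best = backup;
		}
	}
	return best;
}

int main(void)
{
	struct smt_core cores[2] = {
		/* core 0: both siblings busy running highclass work */
		{ { 0, 1 }, { { false, 1 }, { false, 1 } } },
		/* core 1: cpu 2 id_idle, cpu 3 busy but preemptable */
		{ { 2, 3 }, { { true, 0 }, { false, 0 } } },
	};

	printf("backup cpu: %d\n", pick_backup(cores, 2, true));
	return 0;
}

In this example topology, core 1 has one id_idle sibling and one preemptable sibling, so cpu 3 is reported as the backup, which mirrors the fallback that the patch adds to select_idle_core() when no fully idle core exists.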