diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index e3aec81fd92de8002e1722963a2351ff3d649155..ec21e9723c84c7390f2b8e692613435015e4aa26 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -79,6 +79,10 @@ extern unsigned int sysctl_overload_detect_period;
 extern unsigned int sysctl_offline_wait_interval;
 #endif
 
+#ifdef CONFIG_SCHED_PRIO_LB
+extern unsigned int sysctl_sched_prio_load_balance_enabled;
+#endif
+
 #ifdef CONFIG_SCHED_AUTOGROUP
 extern unsigned int sysctl_sched_autogroup_enabled;
 #endif
diff --git a/init/Kconfig b/init/Kconfig
index 27c5ed16fef173303a64de3c228e002df858defa..0ac3206686dbb4be00f8dbca108d154b45180541 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -975,6 +975,15 @@ config QOS_SCHED_SMT_EXPELLER
	  This feature enable online tasks to expel offline tasks on the smt
	  sibling cpus, and exclusively occupy CPU resources.
 
+config SCHED_PRIO_LB
+	bool "Priority load balance for CFS"
+	depends on SMP
+	default n
+	help
+	  This feature enables priority load balance
+	  for CFS, which prefers to migrate online tasks
+	  and migrates offline tasks only as a fallback.
+
 config FAIR_GROUP_SCHED
	bool "Group scheduling for SCHED_OTHER"
	depends on CGROUP_SCHED
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e00b39d4e2e267fd4ec558103398e7e5d9d54e9e..51c707897c8d696f315506cda91d227271615be9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7432,6 +7432,9 @@ void __init sched_init(void)
		rq->max_idle_balance_cost = sysctl_sched_migration_cost;
 
		INIT_LIST_HEAD(&rq->cfs_tasks);
+#ifdef CONFIG_SCHED_PRIO_LB
+		INIT_LIST_HEAD(&rq->cfs_offline_tasks);
+#endif
 
		rq_attach_root(rq, &def_root_domain);
 #ifdef CONFIG_NO_HZ_COMMON
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 50d457979db61fa44f73f64c2a082f30fbeab8ee..e6e3f15ac9ca3769740982285b7aeb23d6062e52 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -131,6 +131,10 @@ unsigned int sysctl_offline_wait_interval = 100;	/* in ms */
 static int unthrottle_qos_cfs_rqs(int cpu);
 #endif
 
+#ifdef CONFIG_SCHED_PRIO_LB
+unsigned int sysctl_sched_prio_load_balance_enabled;
+#endif
+
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
 static DEFINE_PER_CPU(int, qos_smt_status);
 #endif
@@ -3018,6 +3022,20 @@ static inline void update_scan_period(struct task_struct *p, int new_cpu)
 
 #endif /* CONFIG_NUMA_BALANCING */
 
+#ifdef CONFIG_SCHED_PRIO_LB
+static void
+adjust_rq_cfs_tasks(void (*list_op)(struct list_head *, struct list_head *),
+		    struct rq *rq,
+		    struct sched_entity *se)
+{
+	if (sysctl_sched_prio_load_balance_enabled &&
+	    task_has_idle_policy(task_of(se)))
+		(*list_op)(&se->group_node, &rq->cfs_offline_tasks);
+	else
+		(*list_op)(&se->group_node, &rq->cfs_tasks);
+}
+#endif
+
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
@@ -3027,7 +3045,11 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
		struct rq *rq = rq_of(cfs_rq);
 
		account_numa_enqueue(rq, task_of(se));
+#ifdef CONFIG_SCHED_PRIO_LB
+		adjust_rq_cfs_tasks(list_add, rq, se);
+#else
		list_add(&se->group_node, &rq->cfs_tasks);
+#endif
	}
 #endif
	cfs_rq->nr_running++;
@@ -7736,7 +7758,11 @@ done: __maybe_unused;
	 * the list, so our cfs_tasks list becomes MRU
	 * one.
	 */
+#ifdef CONFIG_SCHED_PRIO_LB
+	adjust_rq_cfs_tasks(list_move, rq, &p->se);
+#else
	list_move(&p->se.group_node, &rq->cfs_tasks);
+#endif
 #endif
 
	if (hrtick_enabled(rq))
@@ -8106,6 +8132,14 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
			 &p->se == cfs_rq_of(&p->se)->last))
		return 1;
 
+#ifdef CONFIG_SCHED_PRIO_LB
+	/* Preempting a SCHED_IDLE cpu: do not consider migration cost */
+	if (sysctl_sched_prio_load_balance_enabled &&
+	    cpus_share_cache(env->src_cpu, env->dst_cpu) &&
+	    sched_idle_cpu(env->dst_cpu))
+		return 0;
+#endif
+
	if (sysctl_sched_migration_cost == -1)
		return 1;
	if (sysctl_sched_migration_cost == 0)
@@ -8311,11 +8345,18 @@ static void detach_task(struct task_struct *p, struct rq *src_rq, int dst_cpu)
 static struct task_struct *detach_one_task(struct lb_env *env)
 {
	struct task_struct *p;
+	struct list_head *tasks = &env->src_rq->cfs_tasks;
+#ifdef CONFIG_SCHED_PRIO_LB
+	int loop = 0;
+#endif
 
	lockdep_assert_held(&env->src_rq->lock);
 
+#ifdef CONFIG_SCHED_PRIO_LB
+again:
+#endif
	list_for_each_entry_reverse(p,
-			&env->src_rq->cfs_tasks, se.group_node) {
+			tasks, se.group_node) {
		if (!can_migrate_task(p, env))
			continue;
 
@@ -8330,6 +8371,15 @@ static struct task_struct *detach_one_task(struct lb_env *env)
		schedstat_inc(env->sd->lb_gained[env->idle]);
		return p;
	}
+#ifdef CONFIG_SCHED_PRIO_LB
+	if (sysctl_sched_prio_load_balance_enabled) {
+		loop++;
+		if (loop == 1) {
+			tasks = &env->src_rq->cfs_offline_tasks;
+			goto again;
+		}
+	}
+#endif
	return NULL;
 }
 
@@ -8347,12 +8397,18 @@ static int detach_tasks(struct lb_env *env)
	unsigned long util, load;
	struct task_struct *p;
	int detached = 0;
+#ifdef CONFIG_SCHED_PRIO_LB
+	int loop = 0;
+#endif
 
	lockdep_assert_held(&env->src_rq->lock);
 
	if (env->imbalance <= 0)
		return 0;
 
+#ifdef CONFIG_SCHED_PRIO_LB
+again:
+#endif
	while (!list_empty(tasks)) {
		/*
		 * We don't want to steal all, otherwise we may be treated likewise,
@@ -8454,6 +8510,15 @@ static int detach_tasks(struct lb_env *env)
		list_move(&p->se.group_node, tasks);
	}
 
+#ifdef CONFIG_SCHED_PRIO_LB
+	if (sysctl_sched_prio_load_balance_enabled && env->imbalance > 0) {
+		loop++;
+		if (loop == 1) {
+			tasks = &env->src_rq->cfs_offline_tasks;
+			goto again;
+		}
+	}
+#endif
	/*
	 * Right now, this is one of only two places we collect this stat
	 * so we can safely collect detach_one_task() stats here rather
@@ -11780,7 +11845,11 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
		 * Move the next running task to the front of the list, so our
		 * cfs_tasks list becomes MRU one.
		 */
+#ifdef CONFIG_SCHED_PRIO_LB
+		adjust_rq_cfs_tasks(list_move, rq, se);
+#else
		list_move(&se->group_node, &rq->cfs_tasks);
+#endif
	}
 #endif
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e41a5207a212edf05ab3804ee1c4c20f2481f58c..b556aee36dcd2452e4bdd6f801c95a1f132f832d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1107,8 +1107,12 @@ struct rq {
	struct cpuidle_state	*idle_state;
 #endif
 
+#if defined(CONFIG_SCHED_PRIO_LB) && !defined(__GENKSYMS__)
+	struct list_head	cfs_offline_tasks;
+#else
	KABI_RESERVE(1)
	KABI_RESERVE(2)
+#endif
	KABI_RESERVE(3)
	KABI_RESERVE(4)
	KABI_RESERVE(5)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 91812d673c6ba541b1cb1fd9e81870f73c1a56bf..d1243d1150b25790feb3ac718d62b7a00e77c2c9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2718,6 +2718,17 @@ static struct ctl_table kern_table[] = {
		.extra1		= &one_hundred,
		.extra2		= &one_thousand,
	},
+#endif
+#ifdef CONFIG_SCHED_PRIO_LB
+	{
+		.procname	= "sched_prio_load_balance_enabled",
+		.data		= &sysctl_sched_prio_load_balance_enabled,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
 #endif
	{ }
 };
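
For readers skimming the patch, the following is a minimal userspace sketch (not kernel code) of the idea the fair.c hunks implement: SCHED_IDLE ("offline") tasks live on a separate per-rq list, and the balancer scans the normal list first, touching the offline list only as a fallback. The names fake_rq, fake_task, enqueue_by_policy and pick_migration_candidate are illustrative stand-ins, not symbols from the patch.

/*
 * Standalone sketch of the two-list priority load balance idea.
 * Build with: cc -std=c99 -Wall prio_lb_sketch.c
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct fake_task {
	const char *name;
	bool idle_policy;		/* stand-in for task_has_idle_policy() */
	struct fake_task *next;
};

struct fake_rq {
	struct fake_task *cfs_tasks;		/* normal (online) tasks */
	struct fake_task *cfs_offline_tasks;	/* SCHED_IDLE tasks */
};

/* Mirrors adjust_rq_cfs_tasks(): choose the list by task policy. */
static void enqueue_by_policy(struct fake_rq *rq, struct fake_task *p)
{
	struct fake_task **head = p->idle_policy ?
			&rq->cfs_offline_tasks : &rq->cfs_tasks;

	p->next = *head;
	*head = p;
}

/* Mirrors detach_one_task(): online list first, offline list second. */
static struct fake_task *pick_migration_candidate(struct fake_rq *rq)
{
	struct fake_task *lists[] = { rq->cfs_tasks, rq->cfs_offline_tasks };

	for (size_t i = 0; i < 2; i++) {
		for (struct fake_task *p = lists[i]; p; p = p->next) {
			/* a real balancer would also call can_migrate_task() */
			return p;
		}
	}
	return NULL;
}

int main(void)
{
	struct fake_rq rq = { 0 };
	struct fake_task idle_job = { .name = "offline-job", .idle_policy = true };
	struct fake_task batch_job = { .name = "online-job", .idle_policy = false };

	enqueue_by_policy(&rq, &idle_job);
	enqueue_by_policy(&rq, &batch_job);

	struct fake_task *p = pick_migration_candidate(&rq);
	printf("would migrate: %s\n", p ? p->name : "none");
	return 0;
}

In the kernel itself the behaviour is gated at runtime by the sysctl the kernel/sysctl.c hunk registers (procname "sched_prio_load_balance_enabled", range 0..1), which kern_table entries normally expose under /proc/sys/kernel/.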