diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 5422d1502fd6f2817c80d962fe872708f053a401..b1f550c8c82adfee461fb09c7531a4e8c44a46d7 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -105,6 +105,7 @@ config ARM64
 	select ARCH_SUPPORTS_ATOMIC_RMW
 	select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
 	select ARCH_SUPPORTS_NUMA_BALANCING
+	select ARCH_SUPPORTS_SCHED_PARAL
 	select ARCH_SUPPORTS_PAGE_TABLE_CHECK
 	select ARCH_SUPPORTS_PER_VMA_LOCK
 	select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index 6b2116f83cbf8096026f325b6cd83fe309dafcac..773389ef15651fcb91dcb26732a82f7a05d9aa59 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -209,6 +209,7 @@ CONFIG_USER_NS=y
 CONFIG_PID_NS=y
 CONFIG_NET_NS=y
 CONFIG_SCHED_STEAL=y
+CONFIG_SCHED_PARAL=y
 CONFIG_CHECKPOINT_RESTORE=y
 CONFIG_SCHED_AUTOGROUP=y
 CONFIG_RELAY=y
diff --git a/arch/arm64/include/asm/prefer_numa.h b/arch/arm64/include/asm/prefer_numa.h
new file mode 100644
index 0000000000000000000000000000000000000000..7e579cd9355bffe88bd75d94d5b0bb0addd1cb02
--- /dev/null
+++ b/arch/arm64/include/asm/prefer_numa.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef __ASM_PREFER_NUMA_H
+#define __ASM_PREFER_NUMA_H
+
+#include <linux/sched.h>
+
+void set_task_paral_node(struct task_struct *p);
+
+#endif /* __ASM_PREFER_NUMA_H */
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index 20b8c646696541246c559661ee22b807040ad1ed..c0612c173baabafa6836eb17ff4ece28aa2ecf19 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -85,6 +85,7 @@ obj-$(CONFIG_IPI_AS_NMI)		+= ipi_nmi.o
 obj-$(CONFIG_HISI_VIRTCCA_GUEST)	+= virtcca_cvm_guest.o virtcca_cvm_tsi.o
 obj-$(CONFIG_HISI_VIRTCCA_HOST)		+= virtcca_cvm_host.o
 CFLAGS_patch-scs.o			+= -mbranch-protection=none
+obj-$(CONFIG_SCHED_PARAL)		+= prefer_numa.o
 
 # Force dependency (vdso*-wrap.S includes vdso.so through incbin)
 $(obj)/vdso-wrap.o: $(obj)/vdso/vdso.so
diff --git a/arch/arm64/kernel/prefer_numa.c b/arch/arm64/kernel/prefer_numa.c
new file mode 100644
index 0000000000000000000000000000000000000000..8dcd6c746df8bfefe88e32093d9362229e0a1c06
--- /dev/null
+++ b/arch/arm64/kernel/prefer_numa.c
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * choose a preferred NUMA node
+ *
+ * Copyright (C) 2025 Huawei Limited.
+ */
+#include <linux/sched.h>
+#include <linux/topology.h>
+#include <asm/prefer_numa.h>
+
+static atomic_t paral_nid_last = ATOMIC_INIT(-1);
+
+int is_prefer_numa(void)
+{
+	if (num_possible_nodes() <= 1)
+		return 0;
+
+	return 1;
+}
+
+static inline unsigned int update_sched_paral_nid(void)
+{
+	return (unsigned int)atomic_inc_return(&paral_nid_last);
+}
+
+void set_task_paral_node(struct task_struct *p)
+{
+	int nid;
+	int i = 0;
+	const cpumask_t *cpus_mask;
+
+	if (is_global_init(current))
+		return;
+
+	if (p->flags & PF_KTHREAD || p->tgid != p->pid)
+		return;
+
+	while (i < nr_node_ids) {
+		nid = update_sched_paral_nid() % nr_node_ids;
+		cpus_mask = cpumask_of_node(nid);
+
+		if (cpumask_empty(cpus_mask) ||
+		    !cpumask_subset(cpus_mask, p->cpus_ptr)) {
+			i++;
+			continue;
+		}
+
+		cpumask_copy(p->prefer_cpus, cpus_mask);
+		break;
+	}
+}
diff --git a/fs/proc/array.c b/fs/proc/array.c
index a933a878df3c32e89dba772cd7ea5751610c9ce5..6a4b0a850dcec015cd64370b915aeb86fe51305d 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -439,9 +439,6 @@ __weak void arch_proc_pid_thread_features(struct seq_file *m,
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
 static void task_cpus_preferred(struct seq_file *m, struct task_struct *task)
 {
-	if (!dynamic_affinity_enabled())
-		return;
-
 	seq_printf(m, "Cpus_preferred:\t%*pb\n",
 		   cpumask_pr_args(task->prefer_cpus));
 	seq_printf(m, "Cpus_preferred_list:\t%*pbl\n",
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b6bc8d72309af4551faa7bff8c221d8cd2010ab1..cf394b636b790583f68328d8a99488d2b8ead148 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2627,6 +2627,12 @@ static inline bool dynamic_affinity_enabled(void)
 {
 	return static_branch_unlikely(&__dynamic_affinity_switch);
 }
+
+#ifdef CONFIG_SCHED_PARAL
+bool sched_paral_used(void);
+#else
+static inline bool sched_paral_used(void) { return false; }
+#endif
 #endif
 
 #ifdef CONFIG_QOS_SCHED_SMART_GRID
diff --git a/init/Kconfig b/init/Kconfig
index c8bd58347a871946bf3c1ea9f91ca8a2b5eb349c..925e8517a7e801e8cf3b0efd7e99a340d251b50d 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1484,6 +1484,28 @@ config SCHED_STEAL
 
 	  If unsure, say N here.
 
+#
+# For architectures that want to enable the support for SCHED_PARAL
+#
+config ARCH_SUPPORTS_SCHED_PARAL
+	bool
+
+config SCHED_PARAL
+	bool "Schedule processes on different NUMA nodes in parallel"
+	depends on ARCH_SUPPORTS_SCHED_PARAL
+	depends on QOS_SCHED_DYNAMIC_AFFINITY
+	default n
+	help
+	  By enabling this feature, processes can be scheduled in parallel
+	  on various NUMA nodes to better utilize the cache on each NUMA
+	  node. The usage is restricted to the following scenarios:
+	  1. No CPU binding is performed for user-space processes;
+	  2. It is applicable to distributed applications, such as business
+	     architectures with one master and multiple slaves running in
+	     parallel;
+	  3. The existing "qos dynamic affinity" and "qos smart grid"
+	     features must not be used simultaneously.
+
 config CHECKPOINT_RESTORE
 	bool "Checkpoint/restore support"
 	depends on PROC_FS
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 417827f2c043d745973e2be1f64661acbc9e40e6..01a9b18d80ceda23b9fa8b994f58a5f2cf9dbace 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -3488,7 +3488,8 @@ static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
 	WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
 	cpumask_copy(prefer_cpus_attach, cs->prefer_cpus);
-	set_prefer_cpus_ptr(task, prefer_cpus_attach);
+	if (!sched_paral_used() || !cpumask_empty(prefer_cpus_attach))
+		set_prefer_cpus_ptr(task, prefer_cpus_attach);
 #endif
 
 	cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
@@ -4348,7 +4349,8 @@ static void cpuset_fork(struct task_struct *task)
 		set_cpus_allowed_ptr(task, current->cpus_ptr);
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
-		set_prefer_cpus_ptr(task, current->prefer_cpus);
+		if (!sched_paral_used() || !cpumask_empty(cs->prefer_cpus))
+			set_prefer_cpus_ptr(task, current->prefer_cpus);
 #endif
 		task->mems_allowed = current->mems_allowed;
 		return;
diff --git a/kernel/fork.c b/kernel/fork.c
index 7f7c297d5f48f0ea804e6137f58c1f57cf073308..78663ca681600ff7b78150acb521d115e3f1f1a9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -634,8 +634,7 @@ void free_task(struct task_struct *tsk)
 	free_kthread_struct(tsk);
 	bpf_task_storage_free(tsk);
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
-	if (dynamic_affinity_enabled())
-		sched_prefer_cpus_free(tsk);
+	sched_prefer_cpus_free(tsk);
 #endif
 #ifdef CONFIG_QOS_SCHED_SMART_GRID
 	if (smart_grid_enabled())
@@ -2450,11 +2449,9 @@ __latent_entropy struct task_struct *copy_process(
 #endif
 
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
-	if (dynamic_affinity_enabled()) {
-		retval = sched_prefer_cpus_fork(p, current->prefer_cpus);
-		if (retval)
-			goto bad_fork_free;
-	}
+	retval = sched_prefer_cpus_fork(p, current->prefer_cpus);
+	if (retval)
+		goto bad_fork_free;
 #endif
 
 	lockdep_assert_irqs_enabled();
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1b497efc763b538f808da9942d6f60356bde21f0..fab904f44c879c4666889cbd432c31de79d5bcbf 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -12142,9 +12142,6 @@ static int __set_prefer_cpus_ptr(struct task_struct *p,
 	struct rq *rq;
 	int ret = 0;
 
-	if (!dynamic_affinity_enabled())
-		return -EPERM;
-
 	if (unlikely(!p->prefer_cpus))
 		return -EINVAL;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 7a9e6896c6991e0494a67fbab2ba64e72fb6f652..10a19e51e9b8236a9432e9de0bdbb399466e294a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -7,6 +7,10 @@
  * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
  */
 
+#ifdef CONFIG_SCHED_PARAL
+#include <asm/prefer_numa.h>
+#endif
+
 /*
  * This allows printing both to /proc/sched_debug and
  * to the console
@@ -95,6 +99,41 @@ static void sched_feat_disable(int i) { };
 static void sched_feat_enable(int i) { };
 #endif /* CONFIG_JUMP_LABEL */
 
+int __weak is_prefer_numa(void)
+{
+	return 0;
+}
+
+#ifdef CONFIG_SCHED_PARAL
+static void sched_feat_disable_paral(char *cmp)
+{
+	struct task_struct *tsk, *t;
+
+	if (strncmp(cmp, "PARAL", 5) == 0) {
+		read_lock(&tasklist_lock);
+		for_each_process(tsk) {
+			if (tsk->flags & PF_KTHREAD || is_global_init(tsk))
+				continue;
+
+			for_each_thread(tsk, t)
+				cpumask_clear(t->prefer_cpus);
+		}
+		read_unlock(&tasklist_lock);
+	}
+}
+
+static bool sched_feat_enable_paral(char *cmp)
+{
+	if (strncmp(cmp, "PARAL", 5) != 0)
+		return true;
+
+	return is_prefer_numa();
+}
+#else
+static void sched_feat_disable_paral(char *cmp) {};
+static bool sched_feat_enable_paral(char *cmp) { return true; };
+#endif /* CONFIG_SCHED_PARAL */
+
 static int sched_feat_set(char *cmp)
 {
 	int i;
@@ -111,8 +150,12 @@ static int sched_feat_set(char *cmp)
 
 	if (neg) {
 		sysctl_sched_features &= ~(1UL << i);
+		sched_feat_disable_paral(cmp);
 		sched_feat_disable(i);
 	} else {
+		if (!sched_feat_enable_paral(cmp))
+			return -EPERM;
+
 		sysctl_sched_features |= (1UL << i);
 		sched_feat_enable(i);
 	}
@@ -1045,7 +1088,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
 	P_SCHEDSTAT(nr_wakeups_passive);
 	P_SCHEDSTAT(nr_wakeups_idle);
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
-	if (dynamic_affinity_enabled()) {
+	if (dynamic_affinity_enabled() || sched_paral_used()) {
 		P_SCHEDSTAT(nr_wakeups_preferred_cpus);
 		P_SCHEDSTAT(nr_wakeups_force_preferred_cpus);
 	}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f619dd53cc497f59217fd88ca61b52c76e1ba9c9..99175318885c7d1584806afc973317878ee3d9de 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -75,6 +75,10 @@
 #endif
 #include
 
+#ifdef CONFIG_SCHED_PARAL
+#include <asm/prefer_numa.h>
+#endif
+
 /*
  * The initial- and re-scaling of tunables is configurable
  *
@@ -9057,6 +9061,12 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 }
 
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+#ifdef CONFIG_SCHED_PARAL
+bool sched_paral_used(void)
+{
+	return sched_feat(PARAL);
+}
+#endif
 
 DEFINE_STATIC_KEY_FALSE(__dynamic_affinity_switch);
 
@@ -9084,16 +9094,15 @@ __setup("dynamic_affinity=", dynamic_affinity_switch_setup);
 
 static inline bool prefer_cpus_valid(struct task_struct *p)
 {
-	struct cpumask *prefer_cpus;
+	struct cpumask *prefer_cpus = task_prefer_cpus(p);
 
-	if (!dynamic_affinity_enabled())
-		return false;
-
-	prefer_cpus = task_prefer_cpus(p);
+	if (dynamic_affinity_enabled() || sched_paral_used()) {
+		return !cpumask_empty(prefer_cpus) &&
+		       !cpumask_equal(prefer_cpus, p->cpus_ptr) &&
+		       cpumask_subset(prefer_cpus, p->cpus_ptr);
+	}
 
-	return !cpumask_empty(prefer_cpus) &&
-	       !cpumask_equal(prefer_cpus, p->cpus_ptr) &&
-	       cpumask_subset(prefer_cpus, p->cpus_ptr);
+	return false;
 }
 
 static inline unsigned long taskgroup_cpu_util(struct task_group *tg,
@@ -9193,6 +9202,14 @@ static void set_task_select_cpus(struct task_struct *p, int *idlest_cpu,
 	}
 	rcu_read_unlock();
 
+	/* Forcing prefer_cpus here may cause uneven system load in extreme cases. */
+	if (sched_paral_used() && sysctl_sched_util_low_pct == 100 && nr_cpus_valid > 0) {
+		p->select_cpus = p->prefer_cpus;
+		if (sd_flag & SD_BALANCE_WAKE)
+			schedstat_inc(p->stats.nr_wakeups_preferred_cpus);
+		return;
+	}
+
 	/*
 	 * Follow cases should select cpus_ptr, checking by condition of
	 * tg_capacity > nr_cpus_valid:
@@ -14679,6 +14696,12 @@ static void task_fork_fair(struct task_struct *p)
 	if (curr)
 		update_curr(cfs_rq);
 	place_entity(cfs_rq, se, ENQUEUE_INITIAL);
+
+#ifdef CONFIG_SCHED_PARAL
+	if (sched_paral_used())
+		set_task_paral_node(p);
+#endif
+
 	rq_unlock(rq, &rf);
 }
 
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index ea7ba74810e38b9b0f33c033a29674a5ff0f36fa..67939d04542fc14a2eba7e7a422060e2a9ed2903 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -61,6 +61,10 @@ SCHED_FEAT(SIS_UTIL, true)
 SCHED_FEAT(STEAL, false)
 #endif
 
+#ifdef CONFIG_SCHED_PARAL
+SCHED_FEAT(PARAL, false)
+#endif
+
 /*
  * Issue a WARN when we do multiple update_rq_clock() calls
  * in a single rq->lock section. Default disabled because the
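
As the debug.c hunks above show, PARAL is an ordinary scheduler feature bit, toggled through the sched features file and gated by sched_feat_enable_paral(). Below is a minimal user-space sketch (illustration only, not part of the patch) of that toggle path. It assumes debugfs is mounted at /sys/kernel/debug; per the patch, enabling is expected to fail with EPERM on a single-NUMA-node machine, because is_prefer_numa() returns 0 there.

/*
 * Sketch: toggle the PARAL scheduler feature from user space.
 * Assumes debugfs at /sys/kernel/debug and a kernel carrying this patch.
 */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_sched_feat(const char *feat)
{
	int fd = open("/sys/kernel/debug/sched/features", O_WRONLY);
	ssize_t ret;

	if (fd < 0) {
		perror("open sched features");
		return -1;
	}

	ret = write(fd, feat, strlen(feat));
	if (ret < 0)
		fprintf(stderr, "write '%s': %s\n", feat, strerror(errno));
	close(fd);
	return ret < 0 ? -1 : 0;
}

int main(void)
{
	/* Enabling fails with EPERM when num_possible_nodes() <= 1. */
	if (write_sched_feat("PARAL") == 0)
		puts("PARAL enabled");

	/* Disabling also clears prefer_cpus of all user tasks (see debug.c). */
	if (write_sched_feat("NO_PARAL") == 0)
		puts("PARAL disabled");
	return 0;
}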
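
The placement policy in set_task_paral_node() is a global round-robin: each new thread-group leader advances an atomic counter and takes it modulo nr_node_ids, so successive process creations land on successive NUMA nodes. The stand-alone model below (a simplification, not the kernel code: it omits the skipping of nodes whose CPU mask is empty or not contained in the task's cpus_ptr) shows the rotation.

/*
 * Sketch: user-space model of the node rotation in set_task_paral_node().
 * NR_NODES stands in for nr_node_ids; the counter models paral_nid_last.
 */
#include <stdio.h>

#define NR_NODES 4

static unsigned int paral_nid_last;

static unsigned int pick_node(void)
{
	/* atomic_inc_return() % nr_node_ids in the kernel version */
	return paral_nid_last++ % NR_NODES;
}

int main(void)
{
	/* Ten new "processes" rotate 0,1,2,3,0,1,... across the nodes. */
	for (int i = 0; i < 10; i++)
		printf("task %d -> preferred node %u\n", i, pick_node());
	return 0;
}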
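
prefer_cpus_valid() in the fair.c hunk accepts a preferred mask only when it is non-empty, differs from cpus_ptr, and is a subset of cpus_ptr (a mask equal to the allowed set adds no information). The same predicate can be expressed with glibc's cpu_set_t macros; this is a user-space analogue for illustration, not the kernel cpumask API.

/*
 * Sketch: the prefer_cpus_valid() condition restated with cpu_set_t.
 * Requires _GNU_SOURCE for CPU_AND/CPU_EQUAL/CPU_COUNT.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdbool.h>
#include <stdio.h>

static bool subset(cpu_set_t *a, cpu_set_t *b)
{
	cpu_set_t tmp;

	CPU_AND(&tmp, a, b);		/* tmp = a & b */
	return CPU_EQUAL(&tmp, a);	/* a is a subset of b iff (a & b) == a */
}

static bool prefer_valid(cpu_set_t *prefer, cpu_set_t *allowed)
{
	return CPU_COUNT(prefer) > 0 &&
	       !CPU_EQUAL(prefer, allowed) &&
	       subset(prefer, allowed);
}

int main(void)
{
	cpu_set_t allowed, prefer;
	int cpu;

	CPU_ZERO(&allowed);
	CPU_ZERO(&prefer);
	for (cpu = 0; cpu < 8; cpu++)
		CPU_SET(cpu, &allowed);	/* allowed = CPUs 0-7 */
	CPU_SET(0, &prefer);
	CPU_SET(1, &prefer);		/* prefer = CPUs 0-1 */

	printf("valid: %d\n", prefer_valid(&prefer, &allowed)); /* prints 1 */
	return 0;
}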
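
Finally, the fs/proc/array.c change makes the Cpus_preferred fields unconditional, so the mask assigned by set_task_paral_node() can be observed even when dynamic affinity itself is off. A quick reader (assumes a kernel with this patch; on unpatched kernels the fields may be absent):

/*
 * Sketch: print the preferred-CPU mask of the current process from
 * /proc/self/status, e.g. "Cpus_preferred_list:\t0-31" for a task
 * rotated onto node 0.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[512];
	FILE *f = fopen("/proc/self/status", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}

	while (fgets(line, sizeof(line), f)) {
		/* matches both Cpus_preferred and Cpus_preferred_list */
		if (strncmp(line, "Cpus_preferred", 14) == 0)
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}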