From 1222ec457ff1bdf3870366929ac33cc55e545be2 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Fri, 2 Jun 2023 09:54:43 +0800 Subject: [PATCH 1/7] sched/proc: Print accurate cpumask vs migrate_disable() commit a3564e6c6b54d43d9db3a2b29b223ac8cc0a4365 upstream. Ensure /proc/*/status doesn't print 'random' cpumasks due to migrate_disable(). Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Sebastian Andrzej Siewior --- fs/proc/array.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index ec53f3b145e7..260d7c996f38 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -384,9 +384,9 @@ static inline void task_context_switch_counts(struct seq_file *m, static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) { seq_printf(m, "Cpus_allowed:\t%*pb\n", - cpumask_pr_args(task->cpus_ptr)); + cpumask_pr_args(&task->cpus_mask)); seq_printf(m, "Cpus_allowed_list:\t%*pbl\n", - cpumask_pr_args(task->cpus_ptr)); + cpumask_pr_args(&task->cpus_mask)); } static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm) -- Gitee From be48cc54d699bfcddd7c703f68e98ffb1d89e3b5 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Fri, 2 Jun 2023 09:55:18 +0800 Subject: [PATCH 2/7] sched: Add migrate_disable() tracepoints commit 8e57645ec4e2ebc28b8769f8b2553b9873615074 upstream. XXX write a tracer: - 'migrate_disable() -> migrate_enable()' time in task_sched_runtime() - 'migrate_pull -> sched-in' time in task_sched_runtime() The first will give worst case for the second, which is the actual interference experienced by the task due to migration constraints of migrate_disable(). 
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Sebastian Andrzej Siewior --- include/trace/events/sched.h | 12 ++++++++++++ kernel/sched/core.c | 4 ++++ kernel/sched/deadline.c | 1 + kernel/sched/rt.c | 8 +++++++- 4 files changed, 24 insertions(+), 1 deletion(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index c96a4337afe6..e48f584abf5f 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -650,6 +650,18 @@ DECLARE_TRACE(sched_update_nr_running_tp, TP_PROTO(struct rq *rq, int change), TP_ARGS(rq, change)); +DECLARE_TRACE(sched_migrate_disable_tp, + TP_PROTO(struct task_struct *p), + TP_ARGS(p)); + +DECLARE_TRACE(sched_migrate_enable_tp, + TP_PROTO(struct task_struct *p), + TP_ARGS(p)); + +DECLARE_TRACE(sched_migrate_pull_tp, + TP_PROTO(struct task_struct *p), + TP_ARGS(p)); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0fe97a9a547b..8a9b1a8f4662 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1754,6 +1754,8 @@ void migrate_disable(void) return; } + trace_sched_migrate_disable_tp(p); + preempt_disable(); this_rq()->nr_pinned++; p->migration_disabled = 1; @@ -1786,6 +1788,8 @@ void migrate_enable(void) p->migration_disabled = 0; this_rq()->nr_pinned--; preempt_enable(); + + trace_sched_migrate_enable_tp(p); } EXPORT_SYMBOL_GPL(migrate_enable); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 69e9ac84237e..212785b9131f 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2261,6 +2261,7 @@ static void pull_dl_task(struct rq *this_rq) goto skip; if (is_migration_disabled(p)) { + trace_sched_migrate_pull_tp(p); push_task = get_push_task(src_rq); } else { deactivate_task(src_rq, p, 0); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 480d6cf3001d..0fc923765758 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1889,7 +1889,12 @@ static int 
push_rt_task(struct rq *rq, bool pull) struct task_struct *push_task = NULL; int cpu; - if (!pull || rq->push_busy) + if (!pull) + return 0; + + trace_sched_migrate_pull_tp(next_task); + + if (rq->push_busy) return 0; cpu = find_lowest_rq(rq->curr); @@ -2235,6 +2240,7 @@ static void pull_rt_task(struct rq *this_rq) goto skip; if (is_migration_disabled(p)) { + trace_sched_migrate_pull_tp(p); push_task = get_push_task(src_rq); } else { deactivate_task(src_rq, p, 0); -- Gitee From ae694b0e5d631eb24cf6350b152334bd09cae7bb Mon Sep 17 00:00:00 2001 From: meganz009 Date: Fri, 2 Jun 2023 09:55:33 +0800 Subject: [PATCH 3/7] sched: Deny self-issued __set_cpus_allowed_ptr() when migrate_disable() commit fcbab15ec7dd7b280b91cf304ef6de49b9925a65 upstream. migrate_disable(); set_cpus_allowed_ptr(current, {something excluding task_cpu(current)}); affine_move_task(); <-- never returns Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201013140116.26651-1-valentin.schneider@arm.com Signed-off-by: Sebastian Andrzej Siewior --- kernel/sched/core.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8a9b1a8f4662..8b653d4ddef2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2266,8 +2266,17 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, goto out; } - if (!(flags & SCA_MIGRATE_ENABLE) && cpumask_equal(&p->cpus_mask, new_mask)) - goto out; + if (!(flags & SCA_MIGRATE_ENABLE)) { + if (cpumask_equal(&p->cpus_mask, new_mask)) + goto out; + + if (WARN_ON_ONCE(p == current && + is_migration_disabled(p) && + !cpumask_test_cpu(task_cpu(p), new_mask))) { + ret = -EBUSY; + goto out; + } + } /* * Picking a ~random cpu helps in cases where we are changing affinity -- Gitee From 42dbdf190ee21c5937204c339e08902cfddfb043 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Fri, 2 Jun 2023 09:55:46 +0800 Subject: [PATCH 4/7] sched: Comment 
affine_move_task() commit 4e50046a61534e1287b98de1a5d148d43fe76f6a upstream. Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201013140116.26651-2-valentin.schneider@arm.com Signed-off-by: Sebastian Andrzej Siewior --- kernel/sched/core.c | 81 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 79 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8b653d4ddef2..b7b3e57e0fcd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2106,7 +2106,75 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) } /* - * This function is wildly self concurrent, consider at least 3 times. + * This function is wildly self concurrent; here be dragons. + * + * + * When given a valid mask, __set_cpus_allowed_ptr() must block until the + * designated task is enqueued on an allowed CPU. If that task is currently + * running, we have to kick it out using the CPU stopper. + * + * Migrate-Disable comes along and tramples all over our nice sandcastle. + * Consider: + * + * Initial conditions: P0->cpus_mask = [0, 1] + * + * P0@CPU0 P1 + * + * migrate_disable(); + * <preempted> + * set_cpus_allowed_ptr(P0, [1]); + * + * P1 *cannot* return from this set_cpus_allowed_ptr() call until P0 executes + * its outermost migrate_enable() (i.e. it exits its Migrate-Disable region). + * This means we need the following scheme: + * + * P0@CPU0 P1 + * + * migrate_disable(); + * <preempted> + * set_cpus_allowed_ptr(P0, [1]); + * <blocks> + * <resumes> + * migrate_enable(); + * __set_cpus_allowed_ptr(); + * <wakes local stopper> + * `--> <woken on migration completion> + * + * Now the fun stuff: there may be several P1-like tasks, i.e. multiple + * concurrent set_cpus_allowed_ptr(P0, [*]) calls. CPU affinity changes of any + * task p are serialized by p->pi_lock, which we can leverage: the one that + * should come into effect at the end of the Migrate-Disable region is the last + * one. This means we only need to track a single cpumask (i.e. 
p->cpus_mask), + * but we still need to properly signal those waiting tasks at the appropriate + * moment. + * + * This is implemented using struct set_affinity_pending. The first + * __set_cpus_allowed_ptr() caller within a given Migrate-Disable region will + * setup an instance of that struct and install it on the targeted task_struct. + * Any and all further callers will reuse that instance. Those then wait for + * a completion signaled at the tail of the CPU stopper callback (1), triggered + * on the end of the Migrate-Disable region (i.e. outermost migrate_enable()). + * + * + * (1) In the cases covered above. There is one more where the completion is + * signaled within affine_move_task() itself: when a subsequent affinity request + * cancels the need for an active migration. Consider: + * + * Initial conditions: P0->cpus_mask = [0, 1] + * + * P0@CPU0 P1 P2 + * + * migrate_disable(); + * <preempted> + * set_cpus_allowed_ptr(P0, [1]); + * <blocks> + * set_cpus_allowed_ptr(P0, [0, 1]); + * <signal completion> + * <awakes> + * + * Note that the above is safe vs a concurrent migrate_enable(), as any + * pending affinity completion is preceded by an uninstallation of + * p->migration_pending done with p->pi_lock held. */ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf, int dest_cpu, unsigned int flags) @@ -2150,6 +2218,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag if (!(flags & SCA_MIGRATE_ENABLE)) { /* serialized by p->pi_lock */ if (!p->migration_pending) { + /* Install the request */ refcount_set(&my_pending.refs, 1); init_completion(&my_pending.done); p->migration_pending = &my_pending; @@ -2193,7 +2262,11 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag } if (task_running(rq, p) || p->state == TASK_WAKING) { - + /* + * Lessen races (and headaches) by delegating + * is_migration_disabled(p) checks to the stopper, which will + * run on the same CPU as said p. 
+ */ task_rq_unlock(rq, p, rf); stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); @@ -2218,6 +2291,10 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag if (refcount_dec_and_test(&pending->refs)) wake_up_var(&pending->refs); + /* + * Block the original owner of &pending until all subsequent callers + * have seen the completion and decremented the refcount + */ wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs)); return 0; -- Gitee From 8979ef205e73f6e7b4d168c09b088e350aae9f38 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Fri, 2 Jun 2023 09:55:57 +0800 Subject: [PATCH 5/7] sched: Unlock the rq in affine_move_task() error path commit abcfef4563651ce5474c530edf53543c1146919c upstream. Unlock the rq if returned early in the error path. Reported-by: Joe Korty Signed-off-by: Sebastian Andrzej Siewior Link: https://lkml.kernel.org/r/20201106203921.GA48461@zipoli.concurrent-rt.com --- kernel/sched/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b7b3e57e0fcd..a2deeb029c46 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2240,8 +2240,10 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag * * Either way, we really should have a @pending here. */ - if (WARN_ON_ONCE(!pending)) + if (WARN_ON_ONCE(!pending)) { + task_rq_unlock(rq, p, rf); return -EINVAL; + } if (flags & SCA_MIGRATE_ENABLE) { -- Gitee From a6e1eb896a0b2334476864ebb65fa36b7b46c911 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Fri, 2 Jun 2023 09:56:07 +0800 Subject: [PATCH 6/7] sched: Fix migration_cpu_stop() WARN commit 0fa0389b21f3c8ace7f2b64b966efe3bf9fdae00 upstream. Oleksandr reported hitting the WARN in the 'task_rq(p) != rq' branch of migration_cpu_stop(). Valentin noted that using cpu_of(rq) in that case is just plain wrong to begin with, since per the earlier branch that isn't the actual CPU of the task. 
Replace both instances of is_cpu_allowed() by a direct p->cpus_mask test using task_cpu(). Reported-by: Oleksandr Natalenko Debugged-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Sebastian Andrzej Siewior --- kernel/sched/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a2deeb029c46..c86b7074f3b7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1941,7 +1941,7 @@ static int migration_cpu_stop(void *data) * and we should be valid again. Nothing to do. */ if (!pending) { - WARN_ON_ONCE(!is_cpu_allowed(p, cpu_of(rq))); + WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)); goto out; } @@ -1969,7 +1969,7 @@ static int migration_cpu_stop(void *data) * valid again. Nothing to do. */ if (!pending) { - WARN_ON_ONCE(!is_cpu_allowed(p, cpu_of(rq))); + WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)); goto out; } -- Gitee From df9a894d900d1ff8f7f0bb9bf21ddf79f9dcd7d8 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Fri, 2 Jun 2023 09:56:18 +0800 Subject: [PATCH 7/7] sched/core: Add missing completion for affine_move_task() waiters commit c77c6223740437fc398f65d0526cd50ceaca2092 upstream. Qian reported that some fuzzer issuing sched_setaffinity() ends up stuck on a wait_for_completion(). The problematic pattern seems to be: affine_move_task() // task_running() case stop_one_cpu(); wait_for_completion(&pending->done); Combined with, on the stopper side: migration_cpu_stop() // Task moved between unlocks and scheduling the stopper task_rq(p) != rq && // task_running() case dest_cpu >= 0 => no complete_all() This can happen with both PREEMPT and !PREEMPT, although !PREEMPT should be more likely to see this given the targeted task has a much bigger window to block and be woken up elsewhere before the stopper runs. 
Make migration_cpu_stop() always look at pending affinity requests; signal their completion if the stopper hits a rq mismatch but the task is still within its allowed mask. When Migrate-Disable isn't involved, this matches the previous set_cpus_allowed_ptr() vs migration_cpu_stop() behaviour. Fixes: 6d337eab041d ("sched: Fix migrate_disable() vs set_cpus_allowed_ptr()") Reported-by: Qian Cai Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/lkml/8b62fd1ad1b18def27f18e2ee2df3ff5b36d0762.camel@redhat.com Signed-off-by: Sebastian Andrzej Siewior --- kernel/sched/core.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c86b7074f3b7..479ed40374ea 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1953,7 +1953,7 @@ static int migration_cpu_stop(void *data) else p->wake_cpu = dest_cpu; - } else if (dest_cpu < 0) { + } else if (dest_cpu < 0 || pending) { /* * This happens when we get migrated between migrate_enable()'s * preempt_enable() and scheduling the stopper task. At that @@ -1963,6 +1963,17 @@ static int migration_cpu_stop(void *data) * more likely. */ + /* + * The task moved before the stopper got to run. We're holding + * ->pi_lock, so the allowed mask is stable - if it got + * somewhere allowed, we're done. + */ + if (pending && cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) { + p->migration_pending = NULL; + complete = true; + goto out; + } + /* * When this was migrate_enable() but we no longer have an * @pending, a concurrent SCA 'fixed' things and we should be -- Gitee