From 9ad5f04ba35bdae2f90dfbde37b19e55c8955fb3 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Mon, 12 Jun 2023 17:04:44 +0800
Subject: [PATCH 1/6] cgroups: use simple wait in css_release()

commit b59ff060d2629b8e98f80af20df9e89d4aa31c5d upstream.

To avoid:
|BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:914
|in_atomic(): 1, irqs_disabled(): 0, pid: 92, name: rcuc/11
|2 locks held by rcuc/11/92:
| #0:  (rcu_callback){......}, at: [<ffffffff810e037e>] rcu_cpu_kthread+0x3de/0x940
| #1:  (rcu_read_lock_sched){......}, at: [<ffffffff81328390>] percpu_ref_call_confirm_rcu+0x0/0xd0
|Preemption disabled at:[<ffffffff813284e2>] percpu_ref_switch_to_atomic_rcu+0x82/0xc0
|CPU: 11 PID: 92 Comm: rcuc/11 Not tainted 3.18.7-rt0+ #1
| ffff8802398cdf80 ffff880235f0bc28 ffffffff815b3a12 0000000000000000
| 0000000000000000 ffff880235f0bc48 ffffffff8109aa16 0000000000000000
| ffff8802398cdf80 ffff880235f0bc78 ffffffff815b8dd4 000000000000df80
|Call Trace:
| [<ffffffff815b3a12>] dump_stack+0x4f/0x7c
| [<ffffffff8109aa16>] __might_sleep+0x116/0x190
| [<ffffffff815b8dd4>] rt_spin_lock+0x24/0x60
| [<ffffffff8108d2cd>] queue_work_on+0x6d/0x1d0
| [<ffffffff8110c881>] css_release+0x81/0x90
| [<ffffffff8132844e>] percpu_ref_call_confirm_rcu+0xbe/0xd0
| [<ffffffff813284e2>] percpu_ref_switch_to_atomic_rcu+0x82/0xc0
| [<ffffffff810e03e5>] rcu_cpu_kthread+0x445/0x940
| [<ffffffff81098a2d>] smpboot_thread_fn+0x18d/0x2d0
| [<ffffffff810948d8>] kthread+0xe8/0x100
| [<ffffffff815b9c3c>] ret_from_fork+0x7c/0xb0

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 include/linux/cgroup-defs.h | 2 ++
 kernel/cgroup/cgroup.c      | 9 +++++----
 2 files changed, 7 insertions(+), 4 deletions(-)
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index a92a7b559e43..b6f896b0c2cd 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -21,6 +21,7 @@
 #include <linux/workqueue.h>
 #include <linux/bpf-cgroup.h>
 #include <linux/psi_types.h>
+#include <linux/swork.h>
 
 #ifdef CONFIG_CGROUPS
 
@@ -253,6 +254,7 @@ struct cgroup_subsys_state {
 
 	/* percpu_ref killing and RCU release */
 	struct work_struct destroy_work;
+	struct swork_event destroy_swork;
 	struct rcu_work destroy_rwork;
 
 	CK_HOTFIX_RESERVE(1)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index c429a4b61429..2a548b801ba3 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -4930,10 +4930,10 @@ static void css_free_rwork_fn(struct work_struct *work)
 	}
 }
 
-static void css_release_work_fn(struct work_struct *work)
+static void css_release_work_fn(struct swork_event *sev)
 {
 	struct cgroup_subsys_state *css =
-		container_of(work, struct cgroup_subsys_state, destroy_work);
+		container_of(sev, struct cgroup_subsys_state, destroy_swork);
 	struct cgroup_subsys *ss = css->ss;
 	struct cgroup *cgrp = css->cgroup;
 
@@ -4995,8 +4995,8 @@ static void css_release(struct percpu_ref *ref)
 	struct cgroup_subsys_state *css =
 		container_of(ref, struct cgroup_subsys_state, refcnt);
 
-	INIT_WORK(&css->destroy_work, css_release_work_fn);
-	queue_work(cgroup_destroy_wq, &css->destroy_work);
+	INIT_SWORK(&css->destroy_swork, css_release_work_fn);
+	swork_queue(&css->destroy_swork);
 }
 
 static void init_and_link_css(struct cgroup_subsys_state *css,
@@ -5890,6 +5890,7 @@ static int __init cgroup_wq_init(void)
 	 */
 	cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
 	BUG_ON(!cgroup_destroy_wq);
+	BUG_ON(swork_get());
 	return 0;
 }
 core_initcall(cgroup_wq_init);
-- 
Gitee


From 76c8ed60f2f3a22de79dbaa758795067634d798d Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Mon, 12 Jun 2023 17:14:58 +0800
Subject: [PATCH 2/6] cpuset: Convert callback_lock to raw_spinlock_t

commit 81c16cc87fefa39b56e6c4bca4e83beddf6eac24 upstream.

The two commits below add up to a cpuset might_sleep() splat for RT:

8447a0fee974 cpuset: convert callback_mutex to a spinlock
344736f29b35 cpuset: simplify cpuset_node_allowed API

BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:995
in_atomic(): 0, irqs_disabled(): 1, pid: 11718, name: cset
CPU: 135 PID: 11718 Comm: cset Tainted: G            E   4.10.0-rt1-rt #4
Hardware name: Intel Corporation BRICKLAND/BRICKLAND, BIOS BRHSXSD1.86B.0056.R01.1409242327 09/24/2014
Call Trace:
 ? dump_stack+0x5c/0x81
 ? ___might_sleep+0xf4/0x170
 ? rt_spin_lock+0x1c/0x50
 ? __cpuset_node_allowed+0x66/0xc0
 ? ___slab_alloc+0x390/0x570 <disables IRQs>
 ? anon_vma_fork+0x8f/0x140
 ? copy_page_range+0x6cf/0xb00
 ? anon_vma_fork+0x8f/0x140
 ? __slab_alloc.isra.74+0x5a/0x81
 ? anon_vma_fork+0x8f/0x140
 ? kmem_cache_alloc+0x1b5/0x1f0
 ? anon_vma_fork+0x8f/0x140
 ? copy_process.part.35+0x1670/0x1ee0
 ? _do_fork+0xdd/0x3f0
 ? _do_fork+0xdd/0x3f0
 ? do_syscall_64+0x61/0x170
 ? entry_SYSCALL64_slow_path+0x25/0x25

The later ensured that a NUMA box WILL take callback_lock in atomic
context by removing the allocator and reclaim path __GFP_HARDWALL
usage which prevented such contexts from taking callback_mutex.

One option would be to reinstate __GFP_HARDWALL protections for
RT, however, as the 8447a0fee974 changelog states:

The callback_mutex is only used to synchronize reads/updates of cpusets'
flags and cpu/node masks. These operations should always proceed fast so
there's no reason why we can't use a spinlock instead of the mutex.

Cc: stable-rt@vger.kernel.org
Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Wenya Zhang <zhang.wenya1@zte.com.cn>
Reviewed-by: Xuexin Jiang <jiang.xuexin@zte.com.cn>
---
 kernel/cgroup/cpuset.c | 66 +++++++++++++++++++++---------------------
 1 file changed, 33 insertions(+), 33 deletions(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 0945698a6784..4d4f82245d70 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -289,7 +289,7 @@ static struct cpuset top_cpuset = {
  */
 
 static DEFINE_MUTEX(cpuset_mutex);
-static DEFINE_SPINLOCK(callback_lock);
+static DEFINE_RAW_SPINLOCK(callback_lock);
 
 static struct workqueue_struct *cpuset_migrate_mm_wq;
 
@@ -931,9 +931,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 			continue;
 		rcu_read_unlock();
 
-		spin_lock_irq(&callback_lock);
+		raw_spin_lock_irq(&callback_lock);
 		cpumask_copy(cp->effective_cpus, new_cpus);
-		spin_unlock_irq(&callback_lock);
+		raw_spin_unlock_irq(&callback_lock);
 
 		WARN_ON(!is_in_v2_mode() &&
 			!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
@@ -1006,9 +1006,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	if (retval < 0)
 		return retval;
 
-	spin_lock_irq(&callback_lock);
+	raw_spin_lock_irq(&callback_lock);
 	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
-	spin_unlock_irq(&callback_lock);
+	raw_spin_unlock_irq(&callback_lock);
 
 	/* use trialcs->cpus_allowed as a temp variable */
 	update_cpumasks_hier(cs, trialcs->cpus_allowed);
@@ -1192,9 +1192,9 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
 			continue;
 		rcu_read_unlock();
 
-		spin_lock_irq(&callback_lock);
+		raw_spin_lock_irq(&callback_lock);
 		cp->effective_mems = *new_mems;
-		spin_unlock_irq(&callback_lock);
+		raw_spin_unlock_irq(&callback_lock);
 
 		WARN_ON(!is_in_v2_mode() &&
 			!nodes_equal(cp->mems_allowed, cp->effective_mems));
@@ -1262,9 +1262,9 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
 	if (retval < 0)
 		goto done;
 
-	spin_lock_irq(&callback_lock);
+	raw_spin_lock_irq(&callback_lock);
 	cs->mems_allowed = trialcs->mems_allowed;
-	spin_unlock_irq(&callback_lock);
+	raw_spin_unlock_irq(&callback_lock);
 
 	/* use trialcs->mems_allowed as a temp variable */
 	update_nodemasks_hier(cs, &trialcs->mems_allowed);
@@ -1355,9 +1355,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 	spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
 			|| (is_spread_page(cs) != is_spread_page(trialcs)));
 
-	spin_lock_irq(&callback_lock);
+	raw_spin_lock_irq(&callback_lock);
 	cs->flags = trialcs->flags;
-	spin_unlock_irq(&callback_lock);
+	raw_spin_unlock_irq(&callback_lock);
 
 	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
 		rebuild_sched_domains_locked();
@@ -1773,7 +1773,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
 	cpuset_filetype_t type = seq_cft(sf)->private;
 	int ret = 0;
 
-	spin_lock_irq(&callback_lock);
+	raw_spin_lock_irq(&callback_lock);
 
 	switch (type) {
 	case FILE_CPULIST:
@@ -1792,7 +1792,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
 		ret = -EINVAL;
 	}
 
-	spin_unlock_irq(&callback_lock);
+	raw_spin_unlock_irq(&callback_lock);
 	return ret;
 }
 
@@ -2007,12 +2007,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 
 	cpuset_inc();
 
-	spin_lock_irq(&callback_lock);
+	raw_spin_lock_irq(&callback_lock);
 	if (is_in_v2_mode()) {
 		cpumask_copy(cs->effective_cpus, parent->effective_cpus);
 		cs->effective_mems = parent->effective_mems;
 	}
-	spin_unlock_irq(&callback_lock);
+	raw_spin_unlock_irq(&callback_lock);
 
 	if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
 		goto out_unlock;
@@ -2039,13 +2039,13 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 	}
 	rcu_read_unlock();
 
-	spin_lock_irq(&callback_lock);
+	raw_spin_lock_irq(&callback_lock);
 	cs->mems_allowed = parent->mems_allowed;
 	cs->effective_mems = parent->mems_allowed;
 	cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
 	cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
 	cpuacct_cpuset_changed(cs->css.cgroup, NULL, cs->effective_cpus);
-	spin_unlock_irq(&callback_lock);
+	raw_spin_unlock_irq(&callback_lock);
 out_unlock:
 	mutex_unlock(&cpuset_mutex);
 	return 0;
@@ -2084,7 +2084,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
 static void cpuset_bind(struct cgroup_subsys_state *root_css)
 {
 	mutex_lock(&cpuset_mutex);
-	spin_lock_irq(&callback_lock);
+	raw_spin_lock_irq(&callback_lock);
 
 	if (is_in_v2_mode()) {
 		cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
@@ -2095,7 +2095,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
 		top_cpuset.mems_allowed = top_cpuset.effective_mems;
 	}
 
-	spin_unlock_irq(&callback_lock);
+	raw_spin_unlock_irq(&callback_lock);
 	mutex_unlock(&cpuset_mutex);
 }
 
@@ -2209,12 +2209,12 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
 {
 	bool is_empty;
 
-	spin_lock_irq(&callback_lock);
+	raw_spin_lock_irq(&callback_lock);
 	cpumask_copy(cs->cpus_allowed, new_cpus);
 	cpumask_copy(cs->effective_cpus, new_cpus);
 	cs->mems_allowed = *new_mems;
 	cs->effective_mems = *new_mems;
-	spin_unlock_irq(&callback_lock);
+	raw_spin_unlock_irq(&callback_lock);
 
 	/*
 	 * Don't call update_tasks_cpumask() if the cpuset becomes empty,
@@ -2251,10 +2251,10 @@ hotplug_update_tasks(struct cpuset *cs,
 	if (nodes_empty(*new_mems))
 		*new_mems = parent_cs(cs)->effective_mems;
 
-	spin_lock_irq(&callback_lock);
+	raw_spin_lock_irq(&callback_lock);
 	cpumask_copy(cs->effective_cpus, new_cpus);
 	cs->effective_mems = *new_mems;
-	spin_unlock_irq(&callback_lock);
+	raw_spin_unlock_irq(&callback_lock);
 
 	if (cpus_updated)
 		update_tasks_cpumask(cs);
@@ -2347,21 +2347,21 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 
 	/* synchronize cpus_allowed to cpu_active_mask */
 	if (cpus_updated) {
-		spin_lock_irq(&callback_lock);
+		raw_spin_lock_irq(&callback_lock);
 		if (!on_dfl)
 			cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
 		cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
-		spin_unlock_irq(&callback_lock);
+		raw_spin_unlock_irq(&callback_lock);
 		/* we don't mess with cpumasks of tasks in top_cpuset */
 	}
 
 	/* synchronize mems_allowed to N_MEMORY */
 	if (mems_updated) {
-		spin_lock_irq(&callback_lock);
+		raw_spin_lock_irq(&callback_lock);
 		if (!on_dfl)
 			top_cpuset.mems_allowed = new_mems;
 		top_cpuset.effective_mems = new_mems;
-		spin_unlock_irq(&callback_lock);
+		raw_spin_unlock_irq(&callback_lock);
 		update_tasks_nodemask(&top_cpuset);
 	}
 
@@ -2460,11 +2460,11 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&callback_lock, flags);
+	raw_spin_lock_irqsave(&callback_lock, flags);
 	rcu_read_lock();
 	guarantee_online_cpus(task_cs(tsk), pmask);
 	rcu_read_unlock();
-	spin_unlock_irqrestore(&callback_lock, flags);
+	raw_spin_unlock_irqrestore(&callback_lock, flags);
 }
 
 #ifdef CONFIG_RICH_CONTAINER
@@ -2562,11 +2562,11 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
 	nodemask_t mask;
 	unsigned long flags;
 
-	spin_lock_irqsave(&callback_lock, flags);
+	raw_spin_lock_irqsave(&callback_lock, flags);
 	rcu_read_lock();
 	guarantee_online_mems(task_cs(tsk), &mask);
 	rcu_read_unlock();
-	spin_unlock_irqrestore(&callback_lock, flags);
+	raw_spin_unlock_irqrestore(&callback_lock, flags);
 
 	return mask;
 }
@@ -2658,14 +2658,14 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
 		return true;
 
 	/* Not hardwall and node outside mems_allowed: scan up cpusets */
-	spin_lock_irqsave(&callback_lock, flags);
+	raw_spin_lock_irqsave(&callback_lock, flags);
 
 	rcu_read_lock();
 	cs = nearest_hardwall_ancestor(task_cs(current));
 	allowed = node_isset(node, cs->mems_allowed);
 	rcu_read_unlock();
 
-	spin_unlock_irqrestore(&callback_lock, flags);
+	raw_spin_unlock_irqrestore(&callback_lock, flags);
 	return allowed;
 }
 
-- 
Gitee


From d2bf8cdf693866ee0e2e2e6d82e71c17eab91965 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Mon, 12 Jun 2023 17:16:18 +0800
Subject: [PATCH 3/6] apparmor: use a locallock instead preempt_disable()

commit aa1de601999a7958fa83e1dae7b0819b6d5106a9 upstream.

get_buffers() disables preemption which acts as a lock for the per-CPU
variable. Since we can't disable preemption here on RT, a local_lock is
lock is used in order to remain on the same CPU and not to have more
than one user within the critical section.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 security/apparmor/include/path.h | 19 ++++++++++++++++---
 security/apparmor/lsm.c          |  2 +-
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/security/apparmor/include/path.h b/security/apparmor/include/path.h
index b6380c5f0097..12abfddb19c9 100644
--- a/security/apparmor/include/path.h
+++ b/security/apparmor/include/path.h
@@ -40,8 +40,10 @@ struct aa_buffers {
 
 #include <linux/percpu.h>
 #include <linux/preempt.h>
+#include <linux/locallock.h>
 
 DECLARE_PER_CPU(struct aa_buffers, aa_buffers);
+DECLARE_LOCAL_IRQ_LOCK(aa_buffers_lock);
 
 #define ASSIGN(FN, A, X, N) ((X) = FN(A, N))
 #define EVAL1(FN, A, X) ASSIGN(FN, A, X, 0) /*X = FN(0)*/
@@ -51,7 +53,17 @@ DECLARE_PER_CPU(struct aa_buffers, aa_buffers);
 
 #define for_each_cpu_buffer(I) for ((I) = 0; (I) < MAX_PATH_BUFFERS; (I)++)
 
-#ifdef CONFIG_DEBUG_PREEMPT
+#ifdef CONFIG_PREEMPT_RT_BASE
+static inline void AA_BUG_PREEMPT_ENABLED(const char *s)
+{
+	struct local_irq_lock *lv;
+
+	lv = this_cpu_ptr(&aa_buffers_lock);
+	WARN_ONCE(lv->owner != current,
+		  "__get_buffer without aa_buffers_lock\n");
+}
+
+#elif defined(CONFIG_DEBUG_PREEMPT)
 #define AA_BUG_PREEMPT_ENABLED(X) AA_BUG(preempt_count() <= 0, X)
 #else
 #define AA_BUG_PREEMPT_ENABLED(X) /* nop */
@@ -67,14 +79,15 @@ DECLARE_PER_CPU(struct aa_buffers, aa_buffers);
 
 #define get_buffers(X...)						\
 do {									\
-	struct aa_buffers *__cpu_var = get_cpu_ptr(&aa_buffers);	\
+	struct aa_buffers *__cpu_var;					\
+	__cpu_var = get_locked_ptr(aa_buffers_lock, &aa_buffers);	\
 	__get_buffers(__cpu_var, X);					\
 } while (0)
 
 #define put_buffers(X, Y...)		\
 do {					\
 	__put_buffers(X, Y);		\
-	put_cpu_ptr(&aa_buffers);	\
+	put_locked_ptr(aa_buffers_lock, &aa_buffers);	\
 } while (0)
 
 #endif /* __AA_PATH_H */
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index 8b8b70620bbe..8330ef57a784 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -45,7 +45,7 @@
 int apparmor_initialized;
 
 DEFINE_PER_CPU(struct aa_buffers, aa_buffers);
-
+DEFINE_LOCAL_IRQ_LOCK(aa_buffers_lock);
 
 /*
  * LSM hook functions
-- 
Gitee


From a0c96f7a043300415a640215c65e40440ffdfd6c Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Mon, 12 Jun 2023 17:16:42 +0800
Subject: [PATCH 4/6] workqueue: Prevent deadlock/stall on RT

commit 6b39a01e228fa1dd4b6e4a63ee0948b2e925a4c4 upstream.

Austin reported a XFS deadlock/stall on RT where scheduled work gets
never exececuted and tasks are waiting for each other for ever.

The underlying problem is the modification of the RT code to the
handling of workers which are about to go to sleep. In mainline a
worker thread which goes to sleep wakes an idle worker if there is
more work to do. This happens from the guts of the schedule()
function. On RT this must be outside and the accessed data structures
are not protected against scheduling due to the spinlock to rtmutex
conversion. So the naive solution to this was to move the code outside
of the scheduler and protect the data structures by the pool
lock. That approach turned out to be a little naive as we cannot call
into that code when the thread blocks on a lock, as it is not allowed
to block on two locks in parallel. So we dont call into the worker
wakeup magic when the worker is blocked on a lock, which causes the
deadlock/stall observed by Austin and Mike.

Looking deeper into that worker code it turns out that the only
relevant data structure which needs to be protected is the list of
idle workers which can be woken up.

So the solution is to protect the list manipulation operations with
preempt_enable/disable pairs on RT and call unconditionally into the
worker code even when the worker is blocked on a lock. The preemption
protection is safe as there is nothing which can fiddle with the list
outside of thread context.

Reported-and_tested-by: Austin Schuh <austin@peloton-tech.com>
Reported-and_tested-by: Mike Galbraith <umgwanakikbuti@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: http://vger.kernel.org/r/alpine.DEB.2.10.1406271249510.5170@nanos
Cc: Richard Weinberger <richard.weinberger@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/sched/core.c |  6 +++--
 kernel/workqueue.c  | 60 +++++++++++++++++++++++++++++++++++----------
 2 files changed, 51 insertions(+), 15 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a016bc828a55..60de304a7b09 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3653,9 +3653,8 @@ void __noreturn do_task_dead(void)
 
 static inline void sched_submit_work(struct task_struct *tsk)
 {
-	if (!tsk->state || tsk_is_pi_blocked(tsk))
+	if (!tsk->state)
 		return;
-
 	/*
 	 * If a worker went to sleep, notify and ask workqueue whether
 	 * it wants to wake up a task to maintain concurrency.
@@ -3672,6 +3671,9 @@ static inline void sched_submit_work(struct task_struct *tsk)
 		preempt_enable_no_resched();
 	}
 
+	if (tsk_is_pi_blocked(tsk))
+		return;
+
 	/*
 	 * If we are going to sleep and we have plugged IO queued,
 	 * make sure to submit it to avoid deadlocks.
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 39521eca80be..4f41609df39e 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -125,6 +125,11 @@ enum {
  *    cpu or grabbing pool->lock is enough for read access.  If
  *    POOL_DISASSOCIATED is set, it's identical to L.
  *
+ *    On RT we need the extra protection via rt_lock_idle_list() for
+ *    the list manipulations against read access from
+ *    wq_worker_sleeping(). All other places are nicely serialized via
+ *    pool->lock.
+ *
  * A: wq_pool_attach_mutex protected.
  *
  * PL: wq_pool_mutex protected.
@@ -430,6 +435,31 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
 		if (({ assert_rcu_or_wq_mutex(wq); false; })) { }	\
 		else
 
+#ifdef CONFIG_PREEMPT_RT_BASE
+static inline void rt_lock_idle_list(struct worker_pool *pool)
+{
+	preempt_disable();
+}
+static inline void rt_unlock_idle_list(struct worker_pool *pool)
+{
+	preempt_enable();
+}
+static inline void sched_lock_idle_list(struct worker_pool *pool) { }
+static inline void sched_unlock_idle_list(struct worker_pool *pool) { }
+#else
+static inline void rt_lock_idle_list(struct worker_pool *pool) { }
+static inline void rt_unlock_idle_list(struct worker_pool *pool) { }
+static inline void sched_lock_idle_list(struct worker_pool *pool)
+{
+	spin_lock_irq(&pool->lock);
+}
+static inline void sched_unlock_idle_list(struct worker_pool *pool)
+{
+	spin_unlock_irq(&pool->lock);
+}
+#endif
+
+
 #ifdef CONFIG_DEBUG_OBJECTS_WORK
 
 static struct debug_obj_descr work_debug_descr;
@@ -836,10 +866,16 @@ static struct worker *first_idle_worker(struct worker_pool *pool)
  */
 static void wake_up_worker(struct worker_pool *pool)
 {
-	struct worker *worker = first_idle_worker(pool);
+	struct worker *worker;
+
+	rt_lock_idle_list(pool);
+
+	worker = first_idle_worker(pool);
 
 	if (likely(worker))
 		wake_up_process(worker->task);
+
+	rt_unlock_idle_list(pool);
 }
 
 /**
@@ -868,7 +904,7 @@ void wq_worker_running(struct task_struct *task)
  */
 void wq_worker_sleeping(struct task_struct *task)
 {
-	struct worker *next, *worker = kthread_data(task);
+	struct worker *worker = kthread_data(task);
 	struct worker_pool *pool;
 
 	/*
@@ -885,26 +921,18 @@ void wq_worker_sleeping(struct task_struct *task)
 		return;
 
 	worker->sleeping = 1;
-	spin_lock_irq(&pool->lock);
 
 	/*
 	 * The counterpart of the following dec_and_test, implied mb,
 	 * worklist not empty test sequence is in insert_work().
 	 * Please read comment there.
-	 *
-	 * NOT_RUNNING is clear.  This means that we're bound to and
-	 * running on the local cpu w/ rq lock held and preemption
-	 * disabled, which in turn means that none else could be
-	 * manipulating idle_list, so dereferencing idle_list without pool
-	 * lock is safe.
 	 */
 	if (atomic_dec_and_test(&pool->nr_running) &&
 	    !list_empty(&pool->worklist)) {
-		next = first_idle_worker(pool);
-		if (next)
-			wake_up_process(next->task);
+		sched_lock_idle_list(pool);
+		wake_up_worker(pool);
+		sched_unlock_idle_list(pool);
 	}
-	spin_unlock_irq(&pool->lock);
 }
 
 /**
@@ -1695,7 +1723,9 @@ static void worker_enter_idle(struct worker *worker)
 	worker->last_active = jiffies;
 
 	/* idle_list is LIFO */
+	rt_lock_idle_list(pool);
 	list_add(&worker->entry, &pool->idle_list);
+	rt_unlock_idle_list(pool);
 
 	if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
 		mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
@@ -1728,7 +1758,9 @@ static void worker_leave_idle(struct worker *worker)
 		return;
 	worker_clr_flags(worker, WORKER_IDLE);
 	pool->nr_idle--;
+	rt_lock_idle_list(pool);
 	list_del_init(&worker->entry);
+	rt_unlock_idle_list(pool);
 }
 
 static struct worker *alloc_worker(int node)
@@ -1896,7 +1928,9 @@ static void destroy_worker(struct worker *worker)
 	pool->nr_workers--;
 	pool->nr_idle--;
 
+	rt_lock_idle_list(pool);
 	list_del_init(&worker->entry);
+	rt_unlock_idle_list(pool);
 	worker->flags |= WORKER_DIE;
 	wake_up_process(worker->task);
 }
-- 
Gitee


From 551ef4e15b44a48c782250cc18c5fb2a518cf362 Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Mon, 12 Jun 2023 17:16:53 +0800
Subject: [PATCH 5/6] signals: Allow rt tasks to cache one sigqueue struct

commit 3bbe691ca13a581aeb69342fe32522906eefba3c upstream.

To avoid allocation allow rt tasks to cache one sigqueue struct in
task struct.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/sched.h  |  2 ++
 include/linux/signal.h |  1 +
 kernel/exit.c          |  2 +-
 kernel/fork.c          |  1 +
 kernel/signal.c        | 69 +++++++++++++++++++++++++++++++++++++++---
 5 files changed, 70 insertions(+), 5 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0736a167d030..68fa3d093a97 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -982,6 +982,8 @@ struct task_struct {
 	/* Signal handlers: */
 	struct signal_struct		*signal;
 	struct sighand_struct		*sighand;
+	struct sigqueue			*sigqueue_cache;
+
 	sigset_t			blocked;
 	sigset_t			real_blocked;
 	/* Restored if set_restore_sigmask() was used: */
diff --git a/include/linux/signal.h b/include/linux/signal.h
index 0be5ce2375cb..6495fda18c2c 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -245,6 +245,7 @@ static inline void init_sigpending(struct sigpending *sig)
 }
 
 extern void flush_sigqueue(struct sigpending *queue);
+extern void flush_task_sigqueue(struct task_struct *tsk);
 
 /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */
 static inline int valid_signal(unsigned long sig)
diff --git a/kernel/exit.c b/kernel/exit.c
index eb5dcd9d138c..d89fc31fdccc 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -160,7 +160,7 @@ static void __exit_signal(struct task_struct *tsk)
 	 * Do this under ->siglock, we can race with another thread
 	 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
 	 */
-	flush_sigqueue(&tsk->pending);
+	flush_task_sigqueue(tsk);
 	tsk->sighand = NULL;
 	spin_unlock(&sighand->siglock);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 742ed633b741..e43babb1af4f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1977,6 +1977,7 @@ static __latent_entropy struct task_struct *copy_process(
 	spin_lock_init(&p->alloc_lock);
 
 	init_sigpending(&p->pending);
+	p->sigqueue_cache = NULL;
 
 	p->utime = p->stime = p->gtime = 0;
 #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
diff --git a/kernel/signal.c b/kernel/signal.c
index e70f9439f7be..9af6829ac69a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -19,6 +19,7 @@
 #include <linux/sched/task.h>
 #include <linux/sched/task_stack.h>
 #include <linux/sched/cputime.h>
+#include <linux/sched/rt.h>
 #include <linux/fs.h>
 #include <linux/tty.h>
 #include <linux/binfmts.h>
@@ -393,13 +394,30 @@ void task_join_group_stop(struct task_struct *task)
 	}
 }
 
+static inline struct sigqueue *get_task_cache(struct task_struct *t)
+{
+	struct sigqueue *q = t->sigqueue_cache;
+
+	if (cmpxchg(&t->sigqueue_cache, q, NULL) != q)
+		return NULL;
+	return q;
+}
+
+static inline int put_task_cache(struct task_struct *t, struct sigqueue *q)
+{
+	if (cmpxchg(&t->sigqueue_cache, NULL, q) == NULL)
+		return 0;
+	return 1;
+}
+
 /*
  * allocate a new signal queue record
  * - this may be called without locks if and only if t == current, otherwise an
  *   appropriate lock must be held to stop the target task from exiting
  */
 static struct sigqueue *
-__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
+__sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags,
+		    int override_rlimit, int fromslab)
 {
 	struct sigqueue *q = NULL;
 	struct user_struct *user;
@@ -416,7 +434,10 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
 	if (override_rlimit ||
 	    atomic_read(&user->sigpending) <=
 			task_rlimit(t, RLIMIT_SIGPENDING)) {
-		q = kmem_cache_alloc(sigqueue_cachep, flags);
+		if (!fromslab)
+			q = get_task_cache(t);
+		if (!q)
+			q = kmem_cache_alloc(sigqueue_cachep, flags);
 	} else {
 		print_dropped_signal(sig);
 	}
@@ -433,6 +454,13 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
 	return q;
 }
 
+static struct sigqueue *
+__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags,
+		 int override_rlimit)
+{
+	return __sigqueue_do_alloc(sig, t, flags, override_rlimit, 0);
+}
+
 static void __sigqueue_free(struct sigqueue *q)
 {
 	if (q->flags & SIGQUEUE_PREALLOC)
@@ -442,6 +470,21 @@ static void __sigqueue_free(struct sigqueue *q)
 	kmem_cache_free(sigqueue_cachep, q);
 }
 
+static void sigqueue_free_current(struct sigqueue *q)
+{
+	struct user_struct *up;
+
+	if (q->flags & SIGQUEUE_PREALLOC)
+		return;
+
+	up = q->user;
+	if (rt_prio(current->normal_prio) && !put_task_cache(current, q)) {
+		atomic_dec(&up->sigpending);
+		free_uid(up);
+	} else
+		  __sigqueue_free(q);
+}
+
 void flush_sigqueue(struct sigpending *queue)
 {
 	struct sigqueue *q;
@@ -454,6 +497,21 @@ void flush_sigqueue(struct sigpending *queue)
 	}
 }
 
+/*
+ * Called from __exit_signal. Flush tsk->pending and
+ * tsk->sigqueue_cache
+ */
+void flush_task_sigqueue(struct task_struct *tsk)
+{
+	struct sigqueue *q;
+
+	flush_sigqueue(&tsk->pending);
+
+	q = get_task_cache(tsk);
+	if (q)
+		kmem_cache_free(sigqueue_cachep, q);
+}
+
 /*
  * Flush all pending signals for this kthread.
  */
@@ -577,7 +635,7 @@ static void collect_signal(int sig, struct sigpending *list, siginfo_t *info,
 			(info->si_code == SI_TIMER) &&
 			(info->si_sys_private);
 
-		__sigqueue_free(first);
+		sigqueue_free_current(first);
 	} else {
 		/*
 		 * Ok, it wasn't in the queue.  This must be
@@ -614,6 +672,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
 	bool resched_timer = false;
 	int signr;
 
+	WARN_ON_ONCE(tsk != current);
+
 	/* We only dequeue private signals from ourselves, we don't let
 	 * signalfd steal them
 	 */
@@ -1752,7 +1812,8 @@ EXPORT_SYMBOL(kill_pid);
  */
 struct sigqueue *sigqueue_alloc(void)
 {
-	struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
+	/* Preallocated sigqueue objects always from the slabcache ! */
+	struct sigqueue *q = __sigqueue_do_alloc(-1, current, GFP_KERNEL, 0, 1);
 
 	if (q)
 		q->flags |= SIGQUEUE_PREALLOC;
-- 
Gitee


From 624b7ab848bcafe9d198d562e450ea011b5bf11e Mon Sep 17 00:00:00 2001
From: meganz009 <zhang.wenya1@zte.com.cn>
Date: Mon, 12 Jun 2023 17:17:08 +0800
Subject: [PATCH 6/6] Add localversion for -RT release

commit 1ce039a98b3970742c177d167d323f4a51fa51f0 upstream.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 localversion-rt | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 localversion-rt

diff --git a/localversion-rt b/localversion-rt
new file mode 100644
index 000000000000..1199ebade17b
--- /dev/null
+++ b/localversion-rt
@@ -0,0 +1 @@
+-rt16
-- 
Gitee