From 4dbb17f665526412f2e7fbf59098ed07bd2a3b33 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Fri, 9 Jun 2023 10:47:31 +0800 Subject: [PATCH 1/4] fs/dcache: use swait_queue instead of waitqueue commit 558647aa05135aa727be9198f29a041686e683af upstream. __d_lookup_done() invokes wake_up_all() while holding a hlist_bl_lock() which disables preemption. As a workaround convert it to swait. Signed-off-by: Sebastian Andrzej Siewior --- fs/cifs/readdir.c | 2 +- fs/dcache.c | 27 +++++++++++++++------------ 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 3925a7bfc74d..33f7723fb83e 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -80,7 +80,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, struct inode *inode; struct super_block *sb = parent->d_sb; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); cifs_dbg(FYI, "%s: for %s\n", __func__, name->name); diff --git a/fs/dcache.c b/fs/dcache.c index 8f868760e799..f08ed98cf372 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2497,21 +2497,24 @@ static inline void end_dir_add(struct inode *dir, unsigned n) static void d_wait_lookup(struct dentry *dentry) { - if (d_in_lookup(dentry)) { - DECLARE_WAITQUEUE(wait, current); - add_wait_queue(dentry->d_wait, &wait); - do { - set_current_state(TASK_UNINTERRUPTIBLE); - spin_unlock(&dentry->d_lock); - schedule(); - spin_lock(&dentry->d_lock); - } while (d_in_lookup(dentry)); - } + struct swait_queue __wait; + + if (!d_in_lookup(dentry)) + return; + + INIT_LIST_HEAD(&__wait.task_list); + do { + prepare_to_swait_exclusive(dentry->d_wait, &__wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&dentry->d_lock); + schedule(); + spin_lock(&dentry->d_lock); + } while (d_in_lookup(dentry)); + finish_swait(dentry->d_wait, &__wait); } struct dentry *d_alloc_parallel(struct dentry *parent, const struct qstr *name, - wait_queue_head_t *wq) + struct swait_queue_head *wq) { unsigned int hash = name->hash; struct hlist_bl_head *b = in_lookup_hash(parent, hash); @@ -2626,7 +2629,7 @@ void __d_lookup_done(struct dentry *dentry) hlist_bl_lock(b); dentry->d_flags &= ~DCACHE_PAR_LOOKUP; __hlist_bl_del(&dentry->d_u.d_in_lookup_hash); - wake_up_all(dentry->d_wait); + swake_up_all(dentry->d_wait); dentry->d_wait = NULL; hlist_bl_unlock(b); INIT_HLIST_NODE(&dentry->d_u.d_alias); -- Gitee From 42615e0e32230d71c98aacc511210f34296ae918 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Fri, 9 Jun 2023 10:47:49 +0800 Subject: [PATCH 2/4] workqueue: Use normal rcu commit a65796914416941351ac0cd0271c5da76bed792a upstream. There is no need for sched_rcu. The undocumented reason why sched_rcu is used is to avoid a few explicit rcu_read_lock()/unlock() pairs by abusing the fact that sched_rcu reader side critical sections are also protected by preempt or irq disabled regions. Signed-off-by: Thomas Gleixner --- kernel/workqueue.c | 95 +++++++++++++++++++++++++--------------------- 1 file changed, 52 insertions(+), 43 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a8a260a8f4bf..d0d9b126bc0d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -127,7 +127,7 @@ enum { * * PL: wq_pool_mutex protected. * - * PR: wq_pool_mutex protected for writes. Sched-RCU protected for reads. + * PR: wq_pool_mutex protected for writes. RCU protected for reads. * * PW: wq_pool_mutex and wq->mutex protected for writes. Either for reads. * @@ -136,7 +136,7 @@ enum { * * WQ: wq->mutex protected. 
* - * WR: wq->mutex protected for writes. Sched-RCU protected for reads. + * WR: wq->mutex protected for writes. RCU protected for reads. * * MD: wq_mayday_lock protected. */ @@ -183,7 +183,7 @@ struct worker_pool { atomic_t nr_running ____cacheline_aligned_in_smp; /* - * Destruction of pool is sched-RCU protected to allow dereferences + * Destruction of pool is RCU protected to allow dereferences * from get_work_pool(). */ struct rcu_head rcu; @@ -212,7 +212,7 @@ struct pool_workqueue { /* * Release of unbound pwq is punted to system_wq. See put_pwq() * and pwq_unbound_release_workfn() for details. pool_workqueue - * itself is also sched-RCU protected so that the first pwq can be + * itself is also RCU protected so that the first pwq can be * determined without grabbing wq->mutex. */ struct work_struct unbound_release_work; @@ -357,20 +357,20 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq); #include #define assert_rcu_or_pool_mutex() \ - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ !lockdep_is_held(&wq_pool_mutex), \ - "sched RCU or wq_pool_mutex should be held") + "RCU or wq_pool_mutex should be held") #define assert_rcu_or_wq_mutex(wq) \ - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ !lockdep_is_held(&wq->mutex), \ - "sched RCU or wq->mutex should be held") + "RCU or wq->mutex should be held") #define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \ - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ !lockdep_is_held(&wq->mutex) && \ !lockdep_is_held(&wq_pool_mutex), \ - "sched RCU, wq->mutex or wq_pool_mutex should be held") + "RCU, wq->mutex or wq_pool_mutex should be held") #define for_each_cpu_worker_pool(pool, cpu) \ for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ @@ -382,7 +382,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq); * @pool: iteration cursor * @pi: integer used for iteration * - * This must be called either with wq_pool_mutex held or sched RCU read + * This must be called either with wq_pool_mutex held or RCU read * locked. If the pool needs to be used beyond the locking in effect, the * caller is responsible for guaranteeing that the pool stays online. * @@ -414,7 +414,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq); * @pwq: iteration cursor * @wq: the target workqueue * - * This must be called either with wq->mutex held or sched RCU read locked. + * This must be called either with wq->mutex held or RCU read locked. * If the pwq needs to be used beyond the locking in effect, the caller is * responsible for guaranteeing that the pwq stays online. * @@ -550,7 +550,7 @@ static int worker_pool_assign_id(struct worker_pool *pool) * @wq: the target workqueue * @node: the node ID * - * This must be called with any of wq_pool_mutex, wq->mutex or sched RCU + * This must be called with any of wq_pool_mutex, wq->mutex or RCU * read locked. * If the pwq needs to be used beyond the locking in effect, the caller is * responsible for guaranteeing that the pwq stays online. @@ -694,8 +694,8 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work) * @work: the work item of interest * * Pools are created and destroyed under wq_pool_mutex, and allows read - * access under sched-RCU read lock. As such, this function should be - * called under wq_pool_mutex or with preemption disabled. + * access under RCU read lock. 
As such, this function should be + * called under wq_pool_mutex or inside of a rcu_read_lock() region. * * All fields of the returned pool are accessible as long as the above * mentioned locking is in effect. If the returned pool needs to be used @@ -1114,7 +1114,7 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq) { if (pwq) { /* - * As both pwqs and pools are sched-RCU protected, the + * As both pwqs and pools are RCU protected, the * following lock operations are safe. */ spin_lock_irq(&pwq->pool->lock); @@ -1242,6 +1242,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) return 0; + rcu_read_lock(); /* * The queueing is in progress, or it is already queued. Try to * steal it from ->worklist without clearing WORK_STRUCT_PENDING. @@ -1280,10 +1281,12 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, set_work_pool_and_keep_pending(work, pool->id); spin_unlock(&pool->lock); + rcu_read_unlock(); return 1; } spin_unlock(&pool->lock); fail: + rcu_read_unlock(); local_irq_restore(*flags); if (work_is_canceling(work)) return -ENOENT; @@ -1397,6 +1400,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, if (unlikely(wq->flags & __WQ_DRAINING) && WARN_ON_ONCE(!is_chained_work(wq))) return; + rcu_read_lock(); retry: if (req_cpu == WORK_CPU_UNBOUND) cpu = wq_select_unbound_cpu(raw_smp_processor_id()); @@ -1453,10 +1457,8 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, /* pwq determined, queue */ trace_workqueue_queue_work(req_cpu, pwq, work); - if (WARN_ON(!list_empty(&work->entry))) { - spin_unlock(&pwq->pool->lock); - return; - } + if (WARN_ON(!list_empty(&work->entry))) + goto out; pwq->nr_in_flight[pwq->work_color]++; work_flags = work_color_to_flags(pwq->work_color); @@ -1474,7 +1476,9 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, insert_work(pwq, work, worklist, work_flags); +out: spin_unlock(&pwq->pool->lock); + rcu_read_unlock(); } /** @@ -2878,14 +2882,14 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, might_sleep(); - local_irq_disable(); + rcu_read_lock(); pool = get_work_pool(work); if (!pool) { - local_irq_enable(); + rcu_read_unlock(); return false; } - spin_lock(&pool->lock); + spin_lock_irq(&pool->lock); /* see the comment in try_to_grab_pending() with the same code */ pwq = get_work_pwq(work); if (pwq) { @@ -2917,10 +2921,11 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, lock_map_acquire(&pwq->wq->lockdep_map); lock_map_release(&pwq->wq->lockdep_map); } - + rcu_read_unlock(); return true; already_gone: spin_unlock_irq(&pool->lock); + rcu_read_unlock(); return false; } @@ -3367,7 +3372,7 @@ static void rcu_free_pool(struct rcu_head *rcu) * put_unbound_pool - put a worker_pool * @pool: worker_pool to put * - * Put @pool. If its refcnt reaches zero, it gets destroyed in sched-RCU + * Put @pool. If its refcnt reaches zero, it gets destroyed in RCU * safe manner. get_unbound_pool() calls this function on its failure path * and this function should be able to release pools which went through, * successfully or not, init_worker_pool(). 
@@ -3421,8 +3426,8 @@ static void put_unbound_pool(struct worker_pool *pool) del_timer_sync(&pool->idle_timer); del_timer_sync(&pool->mayday_timer); - /* sched-RCU protected to allow dereferences from get_work_pool() */ - call_rcu_sched(&pool->rcu, rcu_free_pool); + /* RCU protected to allow dereferences from get_work_pool() */ + call_rcu(&pool->rcu, rcu_free_pool); } /** @@ -3529,14 +3534,14 @@ static void pwq_unbound_release_workfn(struct work_struct *work) put_unbound_pool(pool); mutex_unlock(&wq_pool_mutex); - call_rcu_sched(&pwq->rcu, rcu_free_pwq); + call_rcu(&pwq->rcu, rcu_free_pwq); /* * If we're the last pwq going away, @wq is already dead and no one * is gonna access it anymore. Schedule RCU free. */ if (is_last) - call_rcu_sched(&wq->rcu, rcu_free_wq); + call_rcu(&wq->rcu, rcu_free_wq); } /** @@ -4236,7 +4241,7 @@ void destroy_workqueue(struct workqueue_struct *wq) * The base ref is never dropped on per-cpu pwqs. Directly * schedule RCU free. */ - call_rcu_sched(&wq->rcu, rcu_free_wq); + call_rcu(&wq->rcu, rcu_free_wq); } else { /* * We're the sole accessor of @wq at this point. Directly @@ -4346,7 +4351,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) struct pool_workqueue *pwq; bool ret; - rcu_read_lock_sched(); + rcu_read_lock(); + preempt_disable(); if (cpu == WORK_CPU_UNBOUND) cpu = smp_processor_id(); @@ -4357,7 +4363,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); ret = !list_empty(&pwq->delayed_works); - rcu_read_unlock_sched(); + preempt_enable(); + rcu_read_unlock(); return ret; } @@ -4383,15 +4390,15 @@ unsigned int work_busy(struct work_struct *work) if (work_pending(work)) ret |= WORK_BUSY_PENDING; - local_irq_save(flags); + rcu_read_lock(); pool = get_work_pool(work); if (pool) { - spin_lock(&pool->lock); + spin_lock_irqsave(&pool->lock, flags); if (find_worker_executing_work(pool, work)) ret |= WORK_BUSY_RUNNING; - spin_unlock(&pool->lock); + spin_unlock_irqrestore(&pool->lock, flags); } - local_irq_restore(flags); + rcu_read_unlock(); return ret; } @@ -4576,7 +4583,7 @@ void show_workqueue_state(void) unsigned long flags; int pi; - rcu_read_lock_sched(); + rcu_read_lock(); pr_info("Showing busy workqueues and worker pools:\n"); @@ -4641,7 +4648,7 @@ void show_workqueue_state(void) touch_nmi_watchdog(); } - rcu_read_unlock_sched(); + rcu_read_unlock(); } /* used to show worker information through /proc/PID/{comm,stat,status} */ @@ -5028,16 +5035,16 @@ bool freeze_workqueues_busy(void) * nr_active is monotonically decreasing. It's safe * to peek without lock. 
*/ - rcu_read_lock_sched(); + rcu_read_lock(); for_each_pwq(pwq, wq) { WARN_ON_ONCE(pwq->nr_active < 0); if (pwq->nr_active) { busy = true; - rcu_read_unlock_sched(); + rcu_read_unlock(); goto out_unlock; } } - rcu_read_unlock_sched(); + rcu_read_unlock(); } out_unlock: mutex_unlock(&wq_pool_mutex); @@ -5232,7 +5239,8 @@ static ssize_t wq_pool_ids_show(struct device *dev, const char *delim = ""; int node, written = 0; - rcu_read_lock_sched(); + get_online_cpus(); + rcu_read_lock(); for_each_node(node) { written += scnprintf(buf + written, PAGE_SIZE - written, "%s%d:%d", delim, node, @@ -5240,7 +5248,8 @@ static ssize_t wq_pool_ids_show(struct device *dev, delim = " "; } written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); - rcu_read_unlock_sched(); + rcu_read_unlock(); + put_online_cpus(); return written; } -- Gitee From a020d31e95bcf0911144c20ff84bd34f9b96f6fc Mon Sep 17 00:00:00 2001 From: meganz009 Date: Fri, 9 Jun 2023 10:48:05 +0800 Subject: [PATCH 3/4] workqueue: Use local irq lock instead of irq disable regions commit 670717ab259111dcf589c98670ff74a206bf87d3 upstream. Use a local_irq_lock as a replacement for irq off regions. We keep the semantic of irq-off in regard to the pool->lock and remain preemptible. Signed-off-by: Thomas Gleixner --- kernel/workqueue.c | 45 ++++++++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d0d9b126bc0d..ce78223154f2 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -49,6 +49,7 @@ #include #include #include +#include #include "workqueue_internal.h" @@ -350,6 +351,8 @@ EXPORT_SYMBOL_GPL(system_power_efficient_wq); struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly; EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq); +static DEFINE_LOCAL_IRQ_LOCK(pendingb_lock); + static int worker_thread(void *__worker); static void workqueue_sysfs_unregister(struct workqueue_struct *wq); @@ -1117,9 +1120,11 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq) * As both pwqs and pools are RCU protected, the * following lock operations are safe. */ - spin_lock_irq(&pwq->pool->lock); + rcu_read_lock(); + local_spin_lock_irq(pendingb_lock, &pwq->pool->lock); put_pwq(pwq); - spin_unlock_irq(&pwq->pool->lock); + local_spin_unlock_irq(pendingb_lock, &pwq->pool->lock); + rcu_read_unlock(); } } @@ -1223,7 +1228,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, struct worker_pool *pool; struct pool_workqueue *pwq; - local_irq_save(*flags); + local_lock_irqsave(pendingb_lock, *flags); /* try to steal the timer if it exists */ if (is_dwork) { @@ -1287,7 +1292,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, spin_unlock(&pool->lock); fail: rcu_read_unlock(); - local_irq_restore(*flags); + local_unlock_irqrestore(pendingb_lock, *flags); if (work_is_canceling(work)) return -ENOENT; cpu_relax(); @@ -1392,7 +1397,13 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, * queued or lose PENDING. Grabbing PENDING and queueing should * happen with IRQ disabled. */ +#ifndef CONFIG_PREEMPT_RT_FULL + /* + * nort: On RT the "interrupts-disabled" rule has been replaced with + * pendingb_lock. 
+ */ lockdep_assert_irqs_disabled(); +#endif debug_work_activate(work); @@ -1498,14 +1509,14 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq, bool ret = false; unsigned long flags; - local_irq_save(flags); + local_lock_irqsave(pendingb_lock,flags); if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { __queue_work(cpu, wq, work); ret = true; } - local_irq_restore(flags); + local_unlock_irqrestore(pendingb_lock, flags); return ret; } EXPORT_SYMBOL(queue_work_on); @@ -1514,8 +1525,11 @@ void delayed_work_timer_fn(struct timer_list *t) { struct delayed_work *dwork = from_timer(dwork, t, timer); + /* XXX */ + /* local_lock(pendingb_lock); */ /* should have been called from irqsafe timer with irq already off */ __queue_work(dwork->cpu, dwork->wq, &dwork->work); + /* local_unlock(pendingb_lock); */ } EXPORT_SYMBOL(delayed_work_timer_fn); @@ -1570,14 +1584,14 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, unsigned long flags; /* read the comment in __queue_work() */ - local_irq_save(flags); + local_lock_irqsave(pendingb_lock, flags); if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { __queue_delayed_work(cpu, wq, dwork, delay); ret = true; } - local_irq_restore(flags); + local_unlock_irqrestore(pendingb_lock, flags); return ret; } EXPORT_SYMBOL(queue_delayed_work_on); @@ -1612,7 +1626,7 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, if (likely(ret >= 0)) { __queue_delayed_work(cpu, wq, dwork, delay); - local_irq_restore(flags); + local_unlock_irqrestore(pendingb_lock, flags); } /* -ENOENT from try_to_grab_pending() becomes %true */ @@ -1623,11 +1637,12 @@ EXPORT_SYMBOL_GPL(mod_delayed_work_on); static void rcu_work_rcufn(struct rcu_head *rcu) { struct rcu_work *rwork = container_of(rcu, struct rcu_work, rcu); + unsigned long flags; /* read the comment in __queue_work() */ - local_irq_disable(); + local_lock_irqsave(pendingb_lock, flags); __queue_work(WORK_CPU_UNBOUND, rwork->wq, &rwork->work); - local_irq_enable(); + local_unlock_irqrestore(pendingb_lock, flags); } /** @@ -3025,7 +3040,7 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) /* tell other tasks trying to grab @work to back off */ mark_work_canceling(work); - local_irq_restore(flags); + local_unlock_irqrestore(pendingb_lock, flags); /* * This allows canceling during early boot. We know that @work @@ -3086,10 +3101,10 @@ EXPORT_SYMBOL_GPL(cancel_work_sync); */ bool flush_delayed_work(struct delayed_work *dwork) { - local_irq_disable(); + local_lock_irq(pendingb_lock); if (del_timer_sync(&dwork->timer)) __queue_work(dwork->cpu, dwork->wq, &dwork->work); - local_irq_enable(); + local_unlock_irq(pendingb_lock); return flush_work(&dwork->work); } EXPORT_SYMBOL(flush_delayed_work); @@ -3127,7 +3142,7 @@ static bool __cancel_work(struct work_struct *work, bool is_dwork) return false; set_work_pool_and_clear_pending(work, get_work_pool_id(work)); - local_irq_restore(flags); + local_unlock_irqrestore(pendingb_lock, flags); return ret; } -- Gitee From c73df060c460b49dca929baeb7fe35fcf56e0df0 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Fri, 9 Jun 2023 10:48:16 +0800 Subject: [PATCH 4/4] workqueue: Prevent workqueue versus ata-piix livelock commit c6d8c462a41a7c4c64ade776387c5b9394464de6 upstream. An Intel i7 system regularly detected rcu_preempt stalls after the kernel was upgraded from 3.6-rt to 3.8-rt. When the stall happened, disk I/O was no longer possible, unless the system was restarted. 
The kernel message was: INFO: rcu_preempt self-detected stall on CPU { 6} [..] NMI backtrace for cpu 6 CPU 6 Pid: 119, comm: irq/19-ata_piix Not tainted 3.8.13-rt13 #11 Shuttle Inc. SX58/SX58 RIP: 0010:[] [] ip_compute_csum+0x30/0x30 RSP: 0018:ffff880333303cb0 EFLAGS: 00000002 RAX: 0000000000000006 RBX: 00000000000003e9 RCX: 0000000000000034 RDX: 0000000000000000 RSI: ffffffff81aa16d0 RDI: 0000000000000001 RBP: ffff880333303ce8 R08: ffffffff81aa16d0 R09: ffffffff81c1b8cc R10: 0000000000000000 R11: 0000000000000000 R12: 000000000005161f R13: 0000000000000006 R14: ffffffff81aa16d0 R15: 0000000000000002 FS: 0000000000000000(0000) GS:ffff880333300000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000003c1b2bb420 CR3: 0000000001a0f000 CR4: 00000000000007e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process irq/19-ata_piix (pid: 119, threadinfo ffff88032d88a000, task ffff88032df80000) Stack: ffffffff8124cb32 000000000005161e 00000000000003e9 0000000000001000 0000000000009022 ffffffff81aa16d0 0000000000000002 ffff880333303cf8 ffffffff8124caa9 ffff880333303d08 ffffffff8124cad2 ffff880333303d28 Call Trace: [] ? delay_tsc+0x33/0xe3 [] __delay+0xf/0x11 [] __const_udelay+0x27/0x29 [] native_safe_apic_wait_icr_idle+0x39/0x45 [] __default_send_IPI_dest_field.constprop.0+0x1e/0x58 [] default_send_IPI_mask_sequence_phys+0x49/0x7d [] physflat_send_IPI_all+0x17/0x19 [] arch_trigger_all_cpu_backtrace+0x50/0x79 [] rcu_check_callbacks+0x1cb/0x568 [] ? raise_softirq+0x2e/0x35 [] ? tick_sched_do_timer+0x38/0x38 [] update_process_times+0x44/0x55 [] tick_sched_handle+0x4a/0x59 [] tick_sched_timer+0x3c/0x5b [] __run_hrtimer+0x9b/0x158 [] hrtimer_interrupt+0x172/0x2aa [] smp_apic_timer_interrupt+0x76/0x89 [] apic_timer_interrupt+0x6d/0x80 [] ? __local_lock_irqsave+0x17/0x4a [] try_to_grab_pending+0x42/0x17e [] mod_delayed_work_on+0x32/0x88 [] mod_delayed_work+0x1c/0x1e [] blk_run_queue_async+0x37/0x39 [] flush_end_io+0xf1/0x107 [] blk_finish_request+0x21e/0x264 [] blk_end_bidi_request+0x42/0x60 [] blk_end_request+0x10/0x12 [] scsi_io_completion+0x1bf/0x492 [] ? sd_done+0x298/0x2ef [] scsi_finish_command+0xe9/0xf2 [] scsi_softirq_done+0x106/0x10f [] blk_done_softirq+0x77/0x87 [] do_current_softirqs+0x172/0x2e1 [] ? irq_thread_fn+0x3a/0x3a [] local_bh_enable+0x43/0x72 [] irq_forced_thread_fn+0x46/0x52 [] irq_thread+0x8c/0x17c [] ? irq_thread+0x17c/0x17c [] ? wake_threads_waitq+0x44/0x44 [] kthread+0x8d/0x95 [] ? __kthread_parkme+0x65/0x65 [] ret_from_fork+0x7c/0xb0 [] ? __kthread_parkme+0x65/0x65 The state of softirqd of this CPU at the time of the crash was: ksoftirqd/6 R running task 0 53 2 0x00000000 ffff88032fc39d18 0000000000000046 ffff88033330c4c0 ffff8803303f4710 ffff88032fc39fd8 ffff88032fc39fd8 0000000000000000 0000000000062500 ffff88032df88000 ffff8803303f4710 0000000000000000 ffff88032fc38000 Call Trace: [] ? __queue_work+0x27c/0x27c [] preempt_schedule+0x61/0x76 [] migrate_enable+0xe5/0x1df [] ? __queue_work+0x27c/0x27c [] run_timer_softirq+0x161/0x1d6 [] do_current_softirqs+0x172/0x2e1 [] run_ksoftirqd+0x2d/0x45 [] smpboot_thread_fn+0x2ea/0x308 [] ? test_ti_thread_flag+0xc/0xc [] ? test_ti_thread_flag+0xc/0xc [] kthread+0x8d/0x95 [] ? __kthread_parkme+0x65/0x65 [] ret_from_fork+0x7c/0xb0 [] ? __kthread_parkme+0x65/0x65 Apparently, the softirq demon and the ata_piix IRQ handler were waiting for each other to finish ending up in a livelock. 
After the below patch was applied, the system no longer crashes.

Reported-by: Carsten Emde
Proposed-by: Thomas Gleixner
Tested-by: Carsten Emde
Signed-off-by: Carsten Emde
Signed-off-by: Thomas Gleixner
Signed-off-by: Sebastian Andrzej Siewior
---
 kernel/workqueue.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ce78223154f2..39521eca80be 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -50,6 +50,7 @@
 #include
 #include
 #include
+#include
 #include "workqueue_internal.h"
@@ -1295,7 +1296,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
 	local_unlock_irqrestore(pendingb_lock, *flags);
 	if (work_is_canceling(work))
 		return -ENOENT;
-	cpu_relax();
+	cpu_chill();
 	return -EAGAIN;
 }
--
Gitee
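
The notes below sketch the techniques these patches rely on; none of the code in them is part of the patches themselves.

The dcache change in patch 1 works because the simple wait queue ("swait") wake path only takes a raw spinlock, so swake_up_all() can run inside the preemption-disabled hlist_bl_lock() section, which the old wake_up_all() could not do on PREEMPT_RT (there the regular waitqueue lock is a sleeping spinlock). A minimal sketch of the same prepare/check/schedule/finish pattern that the new d_wait_lookup() uses -- the demo_* names are invented for illustration and only <linux/swait.h> and <linux/sched.h> are assumed:

/*
 * Minimal sketch of the swait wait/wake pattern from patch 1.
 * The demo_* names are invented; not code from the patch.
 */
#include <linux/swait.h>
#include <linux/sched.h>

static DECLARE_SWAIT_QUEUE_HEAD(demo_wq);
static bool demo_done;

/* Waiter: block until demo_done is set, re-checking after every wakeup. */
static void demo_wait_for_done(void)
{
	DECLARE_SWAITQUEUE(wait);

	for (;;) {
		prepare_to_swait_exclusive(&demo_wq, &wait, TASK_UNINTERRUPTIBLE);
		if (READ_ONCE(demo_done))
			break;
		schedule();
	}
	finish_swait(&demo_wq, &wait);
}

/*
 * Waker: safe to call from a preemption-disabled (but not IRQ-disabled)
 * section, because swake_up_all() only uses the queue's raw spinlock.
 */
static void demo_complete(void)
{
	WRITE_ONCE(demo_done, true);
	swake_up_all(&demo_wq);
}

The loop mirrors d_wait_lookup(): the task state is set before the condition is re-checked, so a wakeup that arrives between the check and schedule() is not lost.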
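
Patch 2 makes the read sides explicit: wherever the workqueue code used to rely on preempt- or irq-disabled regions acting as sched-RCU readers, it now takes rcu_read_lock()/rcu_read_unlock(), and pools/pwqs are freed with call_rcu() instead of call_rcu_sched(). A generic sketch of that reader/updater shape -- the demo_pool type and all demo_* helpers below are invented for illustration, not taken from the patch:

/*
 * Hedged sketch of the explicit RCU reader/updater pattern from patch 2.
 * The demo_pool type and demo_* helpers are invented for illustration.
 */
#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo_pool {
	int id;
	struct rcu_head rcu;
};

static struct demo_pool __rcu *demo_current;

/*
 * Reader: an explicit rcu_read_lock() section instead of relying on a
 * preempt/irq-disabled region to act as a sched-RCU read side.
 */
static int demo_read_id(void)
{
	struct demo_pool *p;
	int id = -1;

	rcu_read_lock();
	p = rcu_dereference(demo_current);
	if (p)
		id = p->id;
	rcu_read_unlock();

	return id;
}

static void demo_free_pool(struct rcu_head *rcu)
{
	kfree(container_of(rcu, struct demo_pool, rcu));
}

/*
 * Updater: publish the new pool, then free the old one after a grace
 * period with call_rcu(), matching the call_rcu_sched() -> call_rcu()
 * conversion above.
 */
static void demo_replace_pool(struct demo_pool *new_pool)
{
	struct demo_pool *old;

	old = rcu_dereference_protected(demo_current, 1);
	rcu_assign_pointer(demo_current, new_pool);
	if (old)
		call_rcu(&old->rcu, demo_free_pool);
}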
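
Patches 3 and 4 belong together: the pending-bit protection moves from raw local_irq_save()/restore() to an RT local lock (pendingb_lock), which keeps the irq-off semantics relative to pool->lock while remaining preemptible, and the retry path in try_to_grab_pending() backs off with cpu_chill() instead of cpu_relax(), so it cannot livelock against a preempted owner. A minimal sketch of that idiom, assuming an RT-patched kernel that provides the locallock API and cpu_chill() -- the demo_* names are invented:

/*
 * Sketch of the RT local-lock plus cpu_chill() retry idiom from patches 3/4.
 * Assumes an RT-patched tree providing <linux/locallock.h> and cpu_chill();
 * the demo_* names are invented for illustration.
 */
#include <linux/bitops.h>
#include <linux/delay.h>
#include <linux/locallock.h>
#include <linux/workqueue.h>

static DEFINE_LOCAL_IRQ_LOCK(demo_pending_lock);

/*
 * Claim the PENDING bit of a work item. On !RT the local lock disables
 * interrupts exactly like local_irq_save(); on RT it is a per-CPU sleeping
 * lock, so the section stays preemptible.
 */
static void demo_claim_pending(struct work_struct *work)
{
	unsigned long flags;

	for (;;) {
		local_lock_irqsave(demo_pending_lock, flags);
		if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT,
				      work_data_bits(work))) {
			local_unlock_irqrestore(demo_pending_lock, flags);
			return;	/* bit claimed, work can be queued */
		}
		local_unlock_irqrestore(demo_pending_lock, flags);

		/*
		 * cpu_chill() sleeps briefly on RT instead of busy-waiting
		 * like cpu_relax(), giving a preempted owner time to run.
		 */
		cpu_chill();
	}
}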