From e6653ad20af33f336f5f5d4709d18460380d6b0f Mon Sep 17 00:00:00 2001 From: meganz009 Date: Thu, 8 Jun 2023 20:06:08 +0800 Subject: [PATCH 1/7] rt: Introduce cpu_chill() commit c021d3e1d9574028eba2cdd5b6c776e9020a0d9d upstream. Retry loops on RT might loop forever when the modifying side was preempted. Add cpu_chill() to replace cpu_relax(). cpu_chill() defaults to cpu_relax() for non RT. On RT it puts the looping task to sleep for a tick so the preempted task can make progress. Steven Rostedt changed it to use a hrtimer instead of msleep(): | |Ulrich Obergfell pointed out that cpu_chill() calls msleep() which is woken |up by the ksoftirqd running the TIMER softirq. But as the cpu_chill() is |called from softirq context, it may block the ksoftirqd() from running, in |which case, it may never wake up the msleep() causing the deadlock. + bigeasy later changed to schedule_hrtimeout() |If a task calls cpu_chill() and gets woken up by a regular or spurious |wakeup and has a signal pending, then it exits the sleep loop in |do_nanosleep() and sets up the restart block. If restart->nanosleep.type is |not TI_NONE then this results in accessing a stale user pointer from a |previously interrupted syscall and a copy to user based on the stale |pointer or a BUG() when 'type' is not supported in nanosleep_copyout(). + bigeasy: add PF_NOFREEZE: | [....] Waiting for /dev to be fully populated... | ===================================== | [ BUG: udevd/229 still has locks held! ] | 3.12.11-rt17 #23 Not tainted | ------------------------------------- | 1 lock held by udevd/229: | #0: (&type->i_mutex_dir_key#2){+.+.+.}, at: lookup_slow+0x28/0x98 | | stack backtrace: | CPU: 0 PID: 229 Comm: udevd Not tainted 3.12.11-rt17 #23 | (unwind_backtrace+0x0/0xf8) from (show_stack+0x10/0x14) | (show_stack+0x10/0x14) from (dump_stack+0x74/0xbc) | (dump_stack+0x74/0xbc) from (do_nanosleep+0x120/0x160) | (do_nanosleep+0x120/0x160) from (hrtimer_nanosleep+0x90/0x110) | (hrtimer_nanosleep+0x90/0x110) from (cpu_chill+0x30/0x38) | (cpu_chill+0x30/0x38) from (dentry_kill+0x158/0x1ec) | (dentry_kill+0x158/0x1ec) from (dput+0x74/0x15c) | (dput+0x74/0x15c) from (lookup_real+0x4c/0x50) | (lookup_real+0x4c/0x50) from (__lookup_hash+0x34/0x44) | (__lookup_hash+0x34/0x44) from (lookup_slow+0x38/0x98) | (lookup_slow+0x38/0x98) from (path_lookupat+0x208/0x7fc) | (path_lookupat+0x208/0x7fc) from (filename_lookup+0x20/0x60) | (filename_lookup+0x20/0x60) from (user_path_at_empty+0x50/0x7c) | (user_path_at_empty+0x50/0x7c) from (user_path_at+0x14/0x1c) | (user_path_at+0x14/0x1c) from (vfs_fstatat+0x48/0x94) | (vfs_fstatat+0x48/0x94) from (SyS_stat64+0x14/0x30) | (SyS_stat64+0x14/0x30) from (ret_fast_syscall+0x0/0x48) Signed-off-by: Thomas Gleixner Signed-off-by: Steven Rostedt Signed-off-by: Sebastian Andrzej Siewior --- include/linux/delay.h | 6 ++++++ kernel/time/hrtimer.c | 21 +++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/include/linux/delay.h b/include/linux/delay.h index b78bab4395d8..7c4bc414a504 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -64,4 +64,10 @@ static inline void ssleep(unsigned int seconds) msleep(seconds * 1000); } +#ifdef CONFIG_PREEMPT_RT_FULL +extern void cpu_chill(void); +#else +# define cpu_chill() cpu_relax() +#endif + #endif /* defined(_LINUX_DELAY_H) */ diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index e1a549c9e399..5570a0bb91c0 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1797,6 +1797,27 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, } #endif +#ifdef CONFIG_PREEMPT_RT_FULL +/* + * Sleep for 1 ms in hope whoever holds what we want will let it go. + */ +void cpu_chill(void) +{ + ktime_t chill_time; + unsigned int freeze_flag = current->flags & PF_NOFREEZE; + + chill_time = ktime_set(0, NSEC_PER_MSEC); + set_current_state(TASK_UNINTERRUPTIBLE); + current->flags |= PF_NOFREEZE; + sleeping_lock_inc(); + schedule_hrtimeout(&chill_time, HRTIMER_MODE_REL_HARD); + sleeping_lock_dec(); + if (!freeze_flag) + current->flags &= ~PF_NOFREEZE; +} +EXPORT_SYMBOL(cpu_chill); +#endif + /* * Functions related to boot-time initialization: */ -- Gitee From ca7c4febdf38d075dae2473df5f700ba37395d07 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Thu, 8 Jun 2023 20:07:57 +0800 Subject: [PATCH 2/7] hrtimer: Don't lose state in cpu_chill() commit d81a1f846aac209ee8c5c7ccbc613324abeba9c4 upstream. In cpu_chill() the state is set to TASK_UNINTERRUPTIBLE and a timer is programmed. On return the state is always TASK_RUNNING which means we lose the state if it was something other than RUNNING. Also set_current_state() sets ->task_state_change to within cpu_chill() which is not expected. Save the task state on entry and restore it on return. Simply set the state in order to avoid updating ->task_state_change. Cc: stable-rt@vger.kernel.org Signed-off-by: Sebastian Andrzej Siewior --- kernel/time/hrtimer.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 5570a0bb91c0..97d9022876cb 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1805,15 +1805,18 @@ void cpu_chill(void) { ktime_t chill_time; unsigned int freeze_flag = current->flags & PF_NOFREEZE; + long saved_state; + saved_state = current->state; chill_time = ktime_set(0, NSEC_PER_MSEC); - set_current_state(TASK_UNINTERRUPTIBLE); + __set_current_state_no_track(TASK_UNINTERRUPTIBLE); current->flags |= PF_NOFREEZE; sleeping_lock_inc(); schedule_hrtimeout(&chill_time, HRTIMER_MODE_REL_HARD); sleeping_lock_dec(); if (!freeze_flag) current->flags &= ~PF_NOFREEZE; + __set_current_state_no_track(saved_state); } EXPORT_SYMBOL(cpu_chill); #endif -- Gitee From 27f441946134c1142f10a75f3c8c25166a1271b2 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Thu, 8 Jun 2023 20:08:15 +0800 Subject: [PATCH 3/7] hrtimer: cpu_chill(): save task state in ->saved_state() commit fe55dde1efd56a8afb667d104442861cde103feb upstream. In the previous change I saved the current task state on stack. This was bad because while the task is scheduled-out it might receive a wake-up. The wake up changes the task state and we must not destroy it. Save the task-state in ->saved_state under a PI-lock to unsure that state changes during are not missed while the task temporary scheduled out. Reported-by: Mike Galbraith Tested-by: Mike Galbraith Signed-off-by: Sebastian Andrzej Siewior --- kernel/time/hrtimer.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 97d9022876cb..49feba6f0762 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1803,20 +1803,28 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, */ void cpu_chill(void) { - ktime_t chill_time; unsigned int freeze_flag = current->flags & PF_NOFREEZE; - long saved_state; + struct task_struct *self = current; + ktime_t chill_time; - saved_state = current->state; - chill_time = ktime_set(0, NSEC_PER_MSEC); + raw_spin_lock_irq(&self->pi_lock); + self->saved_state = self->state; __set_current_state_no_track(TASK_UNINTERRUPTIBLE); + raw_spin_unlock_irq(&self->pi_lock); + + chill_time = ktime_set(0, NSEC_PER_MSEC); + current->flags |= PF_NOFREEZE; sleeping_lock_inc(); schedule_hrtimeout(&chill_time, HRTIMER_MODE_REL_HARD); sleeping_lock_dec(); if (!freeze_flag) current->flags &= ~PF_NOFREEZE; - __set_current_state_no_track(saved_state); + + raw_spin_lock_irq(&self->pi_lock); + __set_current_state_no_track(self->saved_state); + self->saved_state = TASK_RUNNING; + raw_spin_unlock_irq(&self->pi_lock); } EXPORT_SYMBOL(cpu_chill); #endif -- Gitee From d186ac61ed4c7b790b017f08fa5bc30cd87d2673 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Thu, 8 Jun 2023 20:57:59 +0800 Subject: [PATCH 4/7] block: blk-mq: move blk_queue_usage_counter_release() into process context commit a315826e2a7f0937429019a4fc122b56a4a827b9 upstream. | BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:914 | in_atomic(): 1, irqs_disabled(): 0, pid: 255, name: kworker/u257:6 | 5 locks held by kworker/u257:6/255: | #0: ("events_unbound"){.+.+.+}, at: [] process_one_work+0x171/0x5e0 | #1: ((&entry->work)){+.+.+.}, at: [] process_one_work+0x171/0x5e0 | #2: (&shost->scan_mutex){+.+.+.}, at: [] __scsi_add_device+0xa3/0x130 [scsi_mod] | #3: (&set->tag_list_lock){+.+...}, at: [] blk_mq_init_queue+0x96a/0xa50 | #4: (rcu_read_lock_sched){......}, at: [] percpu_ref_kill_and_confirm+0x1d/0x120 | Preemption disabled at:[] blk_mq_freeze_queue_start+0x56/0x70 | | CPU: 2 PID: 255 Comm: kworker/u257:6 Not tainted 3.18.7-rt0+ #1 | Workqueue: events_unbound async_run_entry_fn | 0000000000000003 ffff8800bc29f998 ffffffff815b3a12 0000000000000000 | 0000000000000000 ffff8800bc29f9b8 ffffffff8109aa16 ffff8800bc29fa28 | ffff8800bc5d1bc8 ffff8800bc29f9e8 ffffffff815b8dd4 ffff880000000000 | Call Trace: | [] dump_stack+0x4f/0x7c | [] __might_sleep+0x116/0x190 | [] rt_spin_lock+0x24/0x60 | [] __wake_up+0x29/0x60 | [] blk_mq_usage_counter_release+0x1e/0x20 | [] percpu_ref_kill_and_confirm+0x106/0x120 | [] blk_mq_freeze_queue_start+0x56/0x70 | [] blk_mq_update_tag_set_depth+0x40/0xd0 | [] blk_mq_init_queue+0x98c/0xa50 | [] scsi_mq_alloc_queue+0x20/0x60 [scsi_mod] | [] scsi_alloc_sdev+0x2f5/0x370 [scsi_mod] | [] scsi_probe_and_add_lun+0x9e4/0xdd0 [scsi_mod] | [] __scsi_add_device+0x126/0x130 [scsi_mod] | [] ata_scsi_scan_host+0xaf/0x200 [libata] | [] async_port_probe+0x46/0x60 [libata] | [] async_run_entry_fn+0x3b/0xf0 | [] process_one_work+0x201/0x5e0 percpu_ref_kill_and_confirm() invokes blk_mq_usage_counter_release() in a rcu-sched region. swait based wake queue can't be used due to wake_up_all() usage and disabled interrupts in !RT configs (as reported by Corey Minyard). The wq_has_sleeper() check has been suggested by Peter Zijlstra. Signed-off-by: Sebastian Andrzej Siewior --- block/blk-core.c | 14 +++++++++++++- include/linux/blkdev.h | 2 ++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/block/blk-core.c b/block/blk-core.c index a4a52b742885..2111ef5ff3fc 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1017,12 +1017,21 @@ void blk_queue_exit(struct request_queue *q) percpu_ref_put(&q->q_usage_counter); } +static void blk_queue_usage_counter_release_swork(struct swork_event *sev) +{ + struct request_queue *q = + container_of(sev, struct request_queue, mq_pcpu_wake); + + wake_up_all(&q->mq_freeze_wq); +} + static void blk_queue_usage_counter_release(struct percpu_ref *ref) { struct request_queue *q = container_of(ref, struct request_queue, q_usage_counter); - wake_up_all(&q->mq_freeze_wq); + if (wq_has_sleeper(&q->mq_freeze_wq)) + swork_queue(&q->mq_pcpu_wake); } static void blk_rq_timed_out_timer(struct timer_list *t) @@ -1121,6 +1130,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id, queue_flag_set_unlocked(QUEUE_FLAG_BYPASS, q); init_waitqueue_head(&q->mq_freeze_wq); + INIT_SWORK(&q->mq_pcpu_wake, blk_queue_usage_counter_release_swork); mutex_init(&q->mq_freeze_lock); /* @@ -4005,6 +4015,8 @@ int __init blk_dev_init(void) if (!kblockd_workqueue) panic("Failed to create kblockd\n"); + BUG_ON(swork_get()); + request_cachep = kmem_cache_create("blkdev_requests", sizeof(struct request), 0, SLAB_PANIC, NULL); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a4811f216cda..9acd42a2ff2e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -27,6 +27,7 @@ #include #include #include +#include struct module; struct scsi_ioctl_command; @@ -683,6 +684,7 @@ struct request_queue { #endif struct rcu_head rcu_head; wait_queue_head_t mq_freeze_wq; + struct swork_event mq_pcpu_wake; /* * Protect concurrent access to q_usage_counter by * percpu_ref_kill() and percpu_ref_reinit(). -- Gitee From 90b21e24c34d95021282c334b1609b41ea01b754 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Thu, 8 Jun 2023 20:59:01 +0800 Subject: [PATCH 5/7] block: Use cpu_chill() for retry loops commit effa2a0476d7f71cd3c5007874c228db414f3318 upstream. Retry loops on RT might loop forever when the modifying side was preempted. Steven also observed a live lock when there was a concurrent priority boosting going on. Use cpu_chill() instead of cpu_relax() to let the system make progress. Signed-off-by: Thomas Gleixner --- block/blk-ioc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 4c810969c3e2..48089f5bcc11 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "blk.h" @@ -119,7 +120,7 @@ static void ioc_release_fn(struct work_struct *work) spin_unlock(q->queue_lock); } else { spin_unlock_irqrestore(&ioc->lock, flags); - cpu_relax(); + cpu_chill(); spin_lock_irqsave_nested(&ioc->lock, flags, 1); } } @@ -203,7 +204,7 @@ void put_io_context_active(struct io_context *ioc) spin_unlock(icq->q->queue_lock); } else { spin_unlock_irqrestore(&ioc->lock, flags); - cpu_relax(); + cpu_chill(); goto retry; } } -- Gitee From 3974f5f290fd16357da2c87f20c82bdd124ce66b Mon Sep 17 00:00:00 2001 From: meganz009 Date: Thu, 8 Jun 2023 20:59:13 +0800 Subject: [PATCH 6/7] fs: dcache: Use cpu_chill() in trylock loops commit 3757c05ac9fc60a97b20abf3681d7f869c368be6 upstream. Retry loops on RT might loop forever when the modifying side was preempted. Use cpu_chill() instead of cpu_relax() to let the system make progress. Signed-off-by: Thomas Gleixner --- fs/autofs/expire.c | 3 ++- fs/namespace.c | 8 ++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c index 70e9afe589fb..1a6b88ad4fe0 100644 --- a/fs/autofs/expire.c +++ b/fs/autofs/expire.c @@ -8,6 +8,7 @@ * option, any later version, incorporated herein by reference. */ +#include #include "autofs_i.h" /* Check if a dentry can be expired */ @@ -153,7 +154,7 @@ static struct dentry *get_next_positive_dentry(struct dentry *prev, parent = p->d_parent; if (!spin_trylock(&parent->d_lock)) { spin_unlock(&p->d_lock); - cpu_relax(); + cpu_chill(); goto relock; } spin_unlock(&p->d_lock); diff --git a/fs/namespace.c b/fs/namespace.c index 82fa909c1c20..a18d8166c315 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -328,8 +329,11 @@ int __mnt_want_write(struct vfsmount *m) * incremented count after it has set MNT_WRITE_HOLD. */ smp_mb(); - while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) - cpu_relax(); + while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) { + preempt_enable(); + cpu_chill(); + preempt_disable(); + } /* * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will * be set to match its requirements. So we must not load that until -- Gitee From 9bcff915073b100b0dca35a0ec353703fbb2e6d4 Mon Sep 17 00:00:00 2001 From: meganz009 Date: Thu, 8 Jun 2023 20:59:25 +0800 Subject: [PATCH 7/7] net: Use cpu_chill() instead of cpu_relax() commit 8f296bb8916125380ca66e63999aac396f5ab698 upstream. Retry loops on RT might loop forever when the modifying side was preempted. Use cpu_chill() instead of cpu_relax() to let the system make progress. Signed-off-by: Thomas Gleixner --- net/packet/af_packet.c | 5 +++-- net/rds/ib_rdma.c | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 042bf763a40f..f95b35d83fc1 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include #include @@ -667,7 +668,7 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t) if (BLOCK_NUM_PKTS(pbd)) { while (atomic_read(&pkc->blk_fill_in_prog)) { /* Waiting for skb_copy_bits to finish... */ - cpu_relax(); + cpu_chill(); } } @@ -929,7 +930,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *pkc, if (!(status & TP_STATUS_BLK_TMO)) { while (atomic_read(&pkc->blk_fill_in_prog)) { /* Waiting for skb_copy_bits to finish... */ - cpu_relax(); + cpu_chill(); } } prb_close_block(pkc, pbd, po, status); diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 0b347f46b2f4..f395f06031bc 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -34,6 +34,7 @@ #include #include #include +#include #include "rds_single_path.h" #include "ib_mr.h" @@ -222,7 +223,7 @@ static inline void wait_clean_list_grace(void) for_each_online_cpu(cpu) { flag = &per_cpu(clean_list_grace, cpu); while (test_bit(CLEAN_LIST_BUSY_BIT, flag)) - cpu_relax(); + cpu_chill(); } } -- Gitee