From 97c628a2017c201b72a22fcd22febd40d0c4be0b Mon Sep 17 00:00:00 2001
From: meganz009
Date: Mon, 12 Jun 2023 19:47:58 +0800
Subject: [PATCH 01/16] sched/completion: Fix a lockup in wait_for_completion()

commit d3bae006e096e47f22baad0a9cdf3f496f2bee6a upstream.

Consider the following race:

 T0                      T1                       T2
 wait_for_completion()
  do_wait_for_common()
   __prepare_to_swait()
    schedule()
                         complete()
                          x->done++ (0 -> 1)
                          raw_spin_lock_irqsave()
                           swake_up_locked()        wait_for_completion()
                            wake_up_process(T0)
                            list_del_init()
                          raw_spin_unlock_irqrestore()
                                                    raw_spin_lock_irq(&x->wait.lock)
  raw_spin_lock_irq(&x->wait.lock)                  x->done != UINT_MAX, 1 -> 0
                                                    raw_spin_unlock_irq(&x->wait.lock)
                                                    return 1
  while (!x->done && timeout),
  continue loop, not enqueued
  on &x->wait

Basically, the problem is that the original wait queues used in
completions did not remove the item from the queue in the wakeup
function, but swake_up_locked() does.

Fix it by adding the thread to the wait queue inside the do loop.
The design of swait detects if it is already in the list and doesn't
do the list add again.

Cc: stable-rt@vger.kernel.org
Fixes: a04ff6b4ec4ee7e ("completion: Use simple wait queues")
Signed-off-by: Corey Minyard
Acked-by: Steven Rostedt (VMware)
Signed-off-by: Steven Rostedt (VMware)
[bigeasy: shorten commit message]
Signed-off-by: Sebastian Andrzej Siewior
---
 kernel/sched/completion.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 755a58084978..49c14137988e 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -72,12 +72,12 @@ do_wait_for_common(struct completion *x,
 	if (!x->done) {
 		DECLARE_SWAITQUEUE(wait);
 
-		__prepare_to_swait(&x->wait, &wait);
 		do {
 			if (signal_pending_state(state, current)) {
 				timeout = -ERESTARTSYS;
 				break;
 			}
+			__prepare_to_swait(&x->wait, &wait);
 			__set_current_state(state);
 			raw_spin_unlock_irq(&x->wait.lock);
 			timeout = action(timeout);
--
Gitee

From 6ab93243ce5e5bc11de3385afe304e3ba3692028 Mon Sep 17 00:00:00 2001
From: meganz009
Date: Mon, 12 Jun 2023 19:56:55 +0800
Subject: [PATCH 02/16] kthread: add a global worker thread.

commit 9ec5aac95c498d3cd970b50a838ee738fe3861fc upstream.

[ Upstream commit 0532e87d9d44795221aa921ba7024bde689cc894 ]

Add kthread_schedule_work() which uses a global kthread for all its
jobs. Split the cgroup include to avoid recursive includes from
interrupt.h. Fix up everything that fails to build (and did not
include all headers).
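As a usage illustration of the interface this patch introduces — a minimal
sketch only; the callback and work-item names are invented for the example
and are not part of the patch:

	#include <linux/kthread.h>

	/* Runs in the global "kswork" kthread, i.e. in process context. */
	static void my_deferred_fn(struct kthread_work *work)
	{
		pr_info("deferred work ran\n");
	}

	static DEFINE_KTHREAD_WORK(my_deferred_work, my_deferred_fn);

	/*
	 * Callable from contexts that must not sleep: queueing only takes
	 * the worker's raw spinlock (see the KTHREAD_WORKER_INIT change
	 * below) and wakes the kthread.
	 */
	static void kick_deferred(void)
	{
		kthread_schedule_work(&my_deferred_work);
	}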
Signed-off-by: Sebastian Andrzej Siewior
[ Fixed up include in blk-cgroup.h reported by Juri Lelli
  http://lkml.kernel.org/r/20190722083009.GE25636@localhost.localdomain ]
Signed-off-by: Steven Rostedt (VMware)
---
 drivers/block/loop.c           |  2 +-
 drivers/spi/spi-rockchip.c     |  1 +
 include/linux/blk-cgroup.h     |  2 +-
 include/linux/kthread-cgroup.h | 17 +++++++++++++++++
 include/linux/kthread.h        | 16 +++++++---------
 init/main.c                    |  1 +
 kernel/kthread.c               | 14 ++++++++++++++
 7 files changed, 42 insertions(+), 11 deletions(-)
 create mode 100644 include/linux/kthread-cgroup.h

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 1a71e49fcda4..01b7bb2a97c0 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -70,7 +70,7 @@
 #include
 #include
 #include
-#include
+#include
 #include
 #include
 #include
diff --git a/drivers/spi/spi-rockchip.c b/drivers/spi/spi-rockchip.c
index e05426079b4d..7998278221a4 100644
--- a/drivers/spi/spi-rockchip.c
+++ b/drivers/spi/spi-rockchip.c
@@ -23,6 +23,7 @@
 #include
 #include
 #include
+#include
 
 #define DRIVER_NAME "rockchip-spi"
 
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 670f0019a67c..d7476470957e 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -14,7 +14,7 @@
  *	  Nauman Rafique
  */
 
-#include
+#include
 #include
 #include
 #include
diff --git a/include/linux/kthread-cgroup.h b/include/linux/kthread-cgroup.h
new file mode 100644
index 000000000000..53d34bca9d72
--- /dev/null
+++ b/include/linux/kthread-cgroup.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_KTHREAD_CGROUP_H
+#define _LINUX_KTHREAD_CGROUP_H
+#include
+#include
+
+#ifdef CONFIG_BLK_CGROUP
+void kthread_associate_blkcg(struct cgroup_subsys_state *css);
+struct cgroup_subsys_state *kthread_blkcg(void);
+#else
+static inline void kthread_associate_blkcg(struct cgroup_subsys_state *css) { }
+static inline struct cgroup_subsys_state *kthread_blkcg(void)
+{
+	return NULL;
+}
+#endif
+#endif
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index fd6403945fc3..4571e2511237 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -105,7 +105,7 @@ struct kthread_delayed_work {
 };
 
 #define KTHREAD_WORKER_INIT(worker)	{				\
-	.lock = __SPIN_LOCK_UNLOCKED((worker).lock),			\
+	.lock = __RAW_SPIN_LOCK_UNLOCKED((worker).lock),		\
	.work_list = LIST_HEAD_INIT((worker).work_list),		\
	.delayed_work_list = LIST_HEAD_INIT((worker).delayed_work_list),\
	}
@@ -199,14 +199,12 @@ void kthread_destroy_worker(struct kthread_worker *worker);
 
 struct cgroup_subsys_state;
 
-#ifdef CONFIG_BLK_CGROUP
-void kthread_associate_blkcg(struct cgroup_subsys_state *css);
-struct cgroup_subsys_state *kthread_blkcg(void);
-#else
-static inline void kthread_associate_blkcg(struct cgroup_subsys_state *css) { }
-static inline struct cgroup_subsys_state *kthread_blkcg(void)
+extern struct kthread_worker kthread_global_worker;
+void kthread_init_global_worker(void);
+
+static inline bool kthread_schedule_work(struct kthread_work *work)
 {
-	return NULL;
+	return kthread_queue_work(&kthread_global_worker, work);
 }
-#endif
+
 #endif /* _LINUX_KTHREAD_H */
diff --git a/init/main.c b/init/main.c
index c11aed805ef5..e244ed4045c0 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1136,6 +1136,7 @@ static noinline void __init kernel_init_freeable(void)
 	smp_prepare_cpus(setup_max_cpus);
 
 	workqueue_init();
+	kthread_init_global_worker();
 
 	init_mm_internals();
 
diff --git a/kernel/kthread.c b/kernel/kthread.c
index fcb3a1a6e14b..20d01a4bf16d 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include
 #include
 
 static DEFINE_SPINLOCK(kthread_create_lock);
@@ -1181,6 +1182,19 @@ void kthread_destroy_worker(struct kthread_worker *worker)
 }
 EXPORT_SYMBOL(kthread_destroy_worker);
 
+DEFINE_KTHREAD_WORKER(kthread_global_worker);
+EXPORT_SYMBOL(kthread_global_worker);
+
+__init void kthread_init_global_worker(void)
+{
+	kthread_global_worker.task = kthread_create(kthread_worker_fn,
+						    &kthread_global_worker,
+						    "kswork");
+	if (WARN_ON(IS_ERR(kthread_global_worker.task)))
+		return;
+	wake_up_process(kthread_global_worker.task);
+}
+
 #ifdef CONFIG_BLK_CGROUP
 /**
  * kthread_associate_blkcg - associate blkcg to current kthread
--
Gitee

From 3094d4cf96db4cfca2e572755326c568c34539f1 Mon Sep 17 00:00:00 2001
From: meganz009
Date: Mon, 12 Jun 2023 19:57:38 +0800
Subject: [PATCH 03/16] genirq: Do not invoke the affinity callback via a workqueue on RT

commit 0fcd7c5cee291d12eb90409cad4636cc17be0973 upstream.

[ Upstream commit 2122adbe011cdc0eb62ad62494e181005b23c76a ]

Joe Korty reported that __irq_set_affinity_locked() schedules a
workqueue while holding a raw lock, which results in a might_sleep()
warning.
This patch uses swork_queue() instead.

Signed-off-by: Sebastian Andrzej Siewior
Signed-off-by: Steven Rostedt (VMware)
---
 include/linux/interrupt.h |  5 ++---
 kernel/irq/manage.c       | 19 ++++---------------
 2 files changed, 6 insertions(+), 18 deletions(-)

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 6936d921aa53..26579b8233a7 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -13,7 +13,7 @@
 #include
 #include
 #include
-#include
+#include
 #include
 #include
 
@@ -248,7 +248,6 @@ extern void resume_device_irqs(void);
 * struct irq_affinity_notify - context for notification of IRQ affinity changes
 * @irq:		Interrupt to which notification applies
 * @kref:		Reference count, for internal use
- * @swork:		Swork item, for internal use
 * @work:		Work item, for internal use
 * @notify:		Function to be called on change. This will be
 *			called in process context.
@@ -261,7 +260,7 @@ struct irq_affinity_notify {
	unsigned int irq;
	struct kref kref;
 #ifdef CONFIG_PREEMPT_RT_BASE
-	struct swork_event swork;
+	struct kthread_work work;
 #else
	struct work_struct work;
 #endif
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 7e51e1d9f437..a6c8f0f9598c 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -324,7 +324,7 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
 		kref_get(&desc->affinity_notify->kref);
 
 #ifdef CONFIG_PREEMPT_RT_BASE
-		swork_queue(&desc->affinity_notify->swork);
+		kthread_schedule_work(&desc->affinity_notify->work);
 #else
 		schedule_work(&desc->affinity_notify->work);
 #endif
@@ -389,21 +389,11 @@ static void _irq_affinity_notify(struct irq_affinity_notify *notify)
 }
 
 #ifdef CONFIG_PREEMPT_RT_BASE
-static void init_helper_thread(void)
-{
-	static int init_sworker_once;
-
-	if (init_sworker_once)
-		return;
-	if (WARN_ON(swork_get()))
-		return;
-	init_sworker_once = 1;
-}
 
-static void irq_affinity_notify(struct swork_event *swork)
+static void irq_affinity_notify(struct kthread_work *work)
 {
	struct irq_affinity_notify *notify =
-		container_of(swork, struct irq_affinity_notify, swork);
+		container_of(work, struct irq_affinity_notify, work);
	_irq_affinity_notify(notify);
 }
 
@@ -446,8 +436,7 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
	notify->irq = irq;
	kref_init(&notify->kref);
 #ifdef CONFIG_PREEMPT_RT_BASE
-	INIT_SWORK(&notify->swork, irq_affinity_notify);
-	init_helper_thread();
+	kthread_init_work(&notify->work, irq_affinity_notify);
 #else
	INIT_WORK(&notify->work, irq_affinity_notify);
 #endif
--
Gitee

From f65994b251eed2d105e6c94de55ae7e237f1506d Mon Sep 17 00:00:00 2001
From: meganz009
Date: Mon, 12 Jun 2023 19:57:52 +0800
Subject: [PATCH 04/16] genirq: Handle missing work_struct in irq_set_affinity_notifier()

commit 5f103966da617ccbc58a5db958144f55725dd823 upstream.

[ Upstream commit bbc4d2a7d6ff54ba923640d9a42c7bef7185fe98 ]

The backported stable commit
  59c39840f5abf ("genirq: Prevent use-after-free and work list corruption")
added cancel_work_sync() on a work_struct element which is not
available in RT. Replace cancel_work_sync() with
kthread_cancel_work_sync() on RT.

Signed-off-by: Sebastian Andrzej Siewior
Signed-off-by: Steven Rostedt (VMware)
---
 kernel/irq/manage.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index a6c8f0f9598c..c7440dd83e23 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -448,8 +448,9 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
	raw_spin_unlock_irqrestore(&desc->lock, flags);
 
	if (old_notify) {
-#ifndef CONFIG_PREEMPT_RT_BASE
-		/* Need to address this for PREEMPT_RT */
+#ifdef CONFIG_PREEMPT_RT_BASE
+		kthread_cancel_work_sync(&notify->work);
+#else
 		cancel_work_sync(&old_notify->work);
 #endif
 		kref_put(&old_notify->kref, old_notify->release);
--
Gitee

From b633d9450d2d475954bf28ec79d81eb9348b6b53 Mon Sep 17 00:00:00 2001
From: meganz009
Date: Mon, 12 Jun 2023 19:58:02 +0800
Subject: [PATCH 05/16] arm: imx6: cpuidle: Use raw_spinlock_t

commit 499b81170e694652eb759044a65b1d2de8a02540 upstream.

[ Upstream commit 40d0332ec8312e9c090f0a5414d9c90e12b13611 ]

The idle callback is invoked with disabled interrupts and requires
raw_spinlock_t locks to work.
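The underlying rule, as a hedged sketch (the lock and function names below
are placeholders, not from this patch): with interrupts already disabled,
only a raw_spinlock_t may be taken, because on RT a spinlock_t is a
sleeping lock.

	static DEFINE_RAW_SPINLOCK(example_idle_lock);

	/* Called on the idle path with interrupts disabled. */
	static void example_idle_enter(void)
	{
		raw_spin_lock(&example_idle_lock);	/* never sleeps, even on RT */
		/* ... coordinate the shared low-power state ... */
		raw_spin_unlock(&example_idle_lock);
	}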
Signed-off-by: Sebastian Andrzej Siewior
Signed-off-by: Steven Rostedt (VMware)
---
 arch/arm/mach-imx/cpuidle-imx6q.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/arm/mach-imx/cpuidle-imx6q.c b/arch/arm/mach-imx/cpuidle-imx6q.c
index 326e870d7123..d9ac80aa1eb0 100644
--- a/arch/arm/mach-imx/cpuidle-imx6q.c
+++ b/arch/arm/mach-imx/cpuidle-imx6q.c
@@ -17,22 +17,22 @@
 #include "hardware.h"
 
 static int num_idle_cpus = 0;
-static DEFINE_SPINLOCK(cpuidle_lock);
+static DEFINE_RAW_SPINLOCK(cpuidle_lock);
 
 static int imx6q_enter_wait(struct cpuidle_device *dev,
 			    struct cpuidle_driver *drv, int index)
 {
-	spin_lock(&cpuidle_lock);
+	raw_spin_lock(&cpuidle_lock);
	if (++num_idle_cpus == num_online_cpus())
 		imx6_set_lpm(WAIT_UNCLOCKED);
-	spin_unlock(&cpuidle_lock);
+	raw_spin_unlock(&cpuidle_lock);
 
	cpu_do_idle();
 
-	spin_lock(&cpuidle_lock);
+	raw_spin_lock(&cpuidle_lock);
	if (num_idle_cpus-- == num_online_cpus())
 		imx6_set_lpm(WAIT_CLOCKED);
-	spin_unlock(&cpuidle_lock);
+	raw_spin_unlock(&cpuidle_lock);
 
	return index;
 }
--
Gitee

From 21409b355789bcb4c893e57df2dca0ff0fdb3daf Mon Sep 17 00:00:00 2001
From: meganz009
Date: Mon, 12 Jun 2023 19:58:12 +0800
Subject: [PATCH 06/16] rcu: Don't allow to change rcu_normal_after_boot on RT

commit 8ffd7db75f55aa4a603b0a6a83eb05fe77ca543c upstream.

[ Upstream commit c6c058c10577815a2491ce661876cff00a4c3b15 ]

On RT rcu_normal_after_boot is enabled by default. Don't allow it to be
disabled on RT because the "expedited RCU" would introduce latency
spikes.

Signed-off-by: Sebastian Andrzej Siewior
Signed-off-by: Steven Rostedt (VMware)
---
 kernel/rcu/update.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 16d8dba23329..ed75addd3ccd 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -69,7 +69,9 @@ module_param(rcu_expedited, int, 0);
 extern int rcu_normal;			/* from sysctl */
 module_param(rcu_normal, int, 0);
 static int rcu_normal_after_boot = IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
+#ifndef CONFIG_PREEMPT_RT_FULL
 module_param(rcu_normal_after_boot, int, 0);
+#endif
 #endif /* #ifndef CONFIG_TINY_RCU */
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
--
Gitee

From c39cc582ed09dda93779c6c64709c92d8801c25d Mon Sep 17 00:00:00 2001
From: meganz009
Date: Mon, 12 Jun 2023 19:58:22 +0800
Subject: [PATCH 07/16] pci/switchtec: fix stream_open.cocci warnings

commit 6c25ddefabb7f6bd4adaa2de6a0fdce865eb4c0c upstream.

[ Upstream commit 9462c69e29307adc95c289f50839d5d683973891 ]

drivers/pci/switch/switchtec.c:395:1-17: ERROR: switchtec_fops: .read()
can deadlock .write(); change nonseekable_open -> stream_open to fix.
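That is the following idiom — sketched with a hypothetical driver; the
names here are invented, the real change is in the diff below:

	static int example_dev_open(struct inode *inode, struct file *filp)
	{
		filp->private_data = NULL;	/* per-open state would go here */

		/*
		 * Mark the file as a stream: no f_pos, so read() and
		 * write() no longer serialize on the position lock.
		 */
		return stream_open(inode, filp);
	}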
Generated by: scripts/coccinelle/api/stream_open.cocci

Cc: Kirill Smelkov
Cc: Julia Lawall
Fixes: 8a29a3bae2a2 ("pci/switchtec: Don't use completion's wait queue")
Cc: stable-rt@vger.kernel.org # where it applies to
Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1904131849350.2536@hadrien
Signed-off-by: kbuild test robot
Signed-off-by: Sebastian Andrzej Siewior
Signed-off-by: Steven Rostedt (VMware)
---
 drivers/pci/switch/switchtec.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c
index 77d4fb86d05b..ea70bc0b06e9 100644
--- a/drivers/pci/switch/switchtec.c
+++ b/drivers/pci/switch/switchtec.c
@@ -360,7 +360,7 @@ static int switchtec_dev_open(struct inode *inode, struct file *filp)
 		return PTR_ERR(stuser);
 
	filp->private_data = stuser;
-	nonseekable_open(inode, filp);
+	stream_open(inode, filp);
 
	dev_dbg(&stdev->dev, "%s: %p\n", __func__, stuser);
 
--
Gitee

From e2a2662fecb0192415eef022979848d5f28d2041 Mon Sep 17 00:00:00 2001
From: meganz009
Date: Mon, 12 Jun 2023 19:58:33 +0800
Subject: [PATCH 08/16] sched/core: Drop a preempt_disable_rt() statement

commit e92a326589d8cf6722dac937b84c265aa10fdf1b upstream.

[ Upstream commit 761126efdcbe3fa3e99c9079fa0ad6eca2f251f2 ]

The caller holds a lock which already disables preemption. Drop the
preempt_disable_rt() statement in get_nohz_timer_target().

Signed-off-by: Sebastian Andrzej Siewior
Signed-off-by: Steven Rostedt (VMware)
---
 kernel/sched/core.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 60de304a7b09..6b064084256d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -569,14 +569,11 @@ void resched_cpu(int cpu)
 */
 int get_nohz_timer_target(void)
 {
-	int i, cpu;
+	int i, cpu = smp_processor_id();
	struct sched_domain *sd;
 
-	preempt_disable_rt();
-	cpu = smp_processor_id();
-
	if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER))
-		goto preempt_en_rt;
+		return cpu;
 
	rcu_read_lock();
	for_each_domain(cpu, sd) {
@@ -595,8 +592,6 @@ int get_nohz_timer_target(void)
 		cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
 unlock:
	rcu_read_unlock();
-preempt_en_rt:
-	preempt_enable_rt();
	return cpu;
 }
 
--
Gitee

From 439bfdad7ba55923c44e29f27a5c0ca736d7b359 Mon Sep 17 00:00:00 2001
From: meganz009
Date: Mon, 12 Jun 2023 20:13:58 +0800
Subject: [PATCH 09/16] hrtimer: consolidate hrtimer_init() + hrtimer_init_sleeper() calls

commit 8553725681eb8c0f78dad5fb1ef542301586e9ff upstream.

hrtimer_init_sleeper() calls require a prior initialisation of the
hrtimer object with hrtimer_init(). Let's make the initialisation of
the hrtimer object part of hrtimer_init_sleeper(). To remain
consistent consider init_on_stack as well.

Besides adapting the hrtimer_init_sleeper[_on_stack]() functions, call
sites need to be updated as well.
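In outline, at a call site (sketch of the before/after shape, taken from
the conversions in the diff below):

	/*
	 * Before: two-step initialisation.
	 *   hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	 *   hrtimer_init_sleeper(&t, current);
	 *
	 * After: one call initialises both the hrtimer and the sleeper.
	 */
	struct hrtimer_sleeper t;

	hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL,
				      current);
	hrtimer_set_expires(&t.timer, kt);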
Link: http://lkml.kernel.org/r/20180703092541.2870-1-anna-maria@linutronix.de
Signed-off-by: Sebastian Andrzej Siewior
[anna-maria: Updating the commit message, add staging/android/vsoc.c]
Signed-off-by: Anna-Maria Gleixner
---
 block/blk-mq.c                 |  3 +--
 drivers/staging/android/vsoc.c |  6 ++---
 include/linux/hrtimer.h        | 19 +++++++++++---
 include/linux/wait.h           |  4 +--
 kernel/futex.c                 | 19 ++++++--------
 kernel/time/hrtimer.c          | 46 ++++++++++++++++++++++++++--------
 net/core/pktgen.c              |  4 +--
 7 files changed, 67 insertions(+), 34 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4542fb021777..7ac3221b5769 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3416,10 +3416,9 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
 		kt = nsecs;
 
	mode = HRTIMER_MODE_REL;
-	hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode);
+	hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode, current);
	hrtimer_set_expires(&hs.timer, kt);
 
-	hrtimer_init_sleeper(&hs, current);
	do {
 		if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)
 			break;
diff --git a/drivers/staging/android/vsoc.c b/drivers/staging/android/vsoc.c
index 22571abcaa4e..78a529d363f3 100644
--- a/drivers/staging/android/vsoc.c
+++ b/drivers/staging/android/vsoc.c
@@ -437,12 +437,10 @@ static int handle_vsoc_cond_wait(struct file *filp, struct vsoc_cond_wait *arg)
 			return -EINVAL;
 		wake_time = ktime_set(arg->wake_time_sec, arg->wake_time_nsec);
 
-		hrtimer_init_on_stack(&to->timer, CLOCK_MONOTONIC,
-				      HRTIMER_MODE_ABS);
+		hrtimer_init_sleeper_on_stack(to, CLOCK_MONOTONIC,
+					      HRTIMER_MODE_ABS, current);
 		hrtimer_set_expires_range_ns(&to->timer, wake_time,
 					     current->timer_slack_ns);
-
-		hrtimer_init_sleeper(to, current);
	}
 
	while (1) {
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 3892e9c8b2de..b8bbaabd5aff 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -364,10 +364,17 @@ DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
 /* Initialize timers: */
 extern void hrtimer_init(struct hrtimer *timer, clockid_t which_clock,
 			 enum hrtimer_mode mode);
+extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id,
+				 enum hrtimer_mode mode,
+				 struct task_struct *task);
 
 #ifdef CONFIG_DEBUG_OBJECTS_TIMERS
 extern void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t which_clock,
 				  enum hrtimer_mode mode);
+extern void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
+					  clockid_t clock_id,
+					  enum hrtimer_mode mode,
+					  struct task_struct *task);
 
 extern void destroy_hrtimer_on_stack(struct hrtimer *timer);
 #else
@@ -377,6 +384,15 @@ static inline void hrtimer_init_on_stack(struct hrtimer *timer,
 {
	hrtimer_init(timer, which_clock, mode);
 }
+
+static inline void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
+						 clockid_t clock_id,
+						 enum hrtimer_mode mode,
+						 struct task_struct *task)
+{
+	hrtimer_init_sleeper(sl, clock_id, mode, task);
+}
+
 static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { }
 #endif
@@ -480,9 +496,6 @@ extern long hrtimer_nanosleep(const struct timespec64 *rqtp,
			      const enum hrtimer_mode mode,
			      const clockid_t clockid);
 
-extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
-				 struct task_struct *tsk);
-
 extern int schedule_hrtimeout_range(ktime_t *expires, u64 delta,
				    const enum hrtimer_mode mode);
 extern int schedule_hrtimeout_range_clock(ktime_t *expires,
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 2b5ef8e94d19..94bd2e841de6 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -489,8 +489,8 @@ do {									\
	int __ret = 0;							\
	struct hrtimer_sleeper __t;					\
									\
-	hrtimer_init_on_stack(&__t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); \
-	hrtimer_init_sleeper(&__t, current);				\
+	hrtimer_init_sleeper_on_stack(&__t, CLOCK_MONOTONIC, HRTIMER_MODE_REL,	\
+				      current);				\
	if ((timeout) != KTIME_MAX)					\
 		hrtimer_start_range_ns(&__t.timer, timeout,		\
				       current->timer_slack_ns,		\
diff --git a/kernel/futex.c b/kernel/futex.c
index 0e5b78290df6..0ef8ec8a583e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2708,10 +2708,9 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
 
	if (abs_time) {
 		to = &timeout;
 
-		hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
-				      CLOCK_REALTIME : CLOCK_MONOTONIC,
-				      HRTIMER_MODE_ABS);
-		hrtimer_init_sleeper(to, current);
+		hrtimer_init_sleeper_on_stack(to, (flags & FLAGS_CLOCKRT) ?
+					      CLOCK_REALTIME : CLOCK_MONOTONIC,
+					      HRTIMER_MODE_ABS, current);
 		hrtimer_set_expires_range_ns(&to->timer, *abs_time,
 					     current->timer_slack_ns);
	}
@@ -2810,9 +2809,8 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
 
	if (time) {
 		to = &timeout;
-		hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
-				      HRTIMER_MODE_ABS);
-		hrtimer_init_sleeper(to, current);
+		hrtimer_init_sleeper_on_stack(to, CLOCK_REALTIME,
+					      HRTIMER_MODE_ABS, current);
 		hrtimer_set_expires(&to->timer, *time);
	}
 
@@ -3249,10 +3247,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 
	if (abs_time) {
 		to = &timeout;
-		hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
-				      CLOCK_REALTIME : CLOCK_MONOTONIC,
-				      HRTIMER_MODE_ABS);
-		hrtimer_init_sleeper(to, current);
+		hrtimer_init_sleeper_on_stack(to, (flags & FLAGS_CLOCKRT) ?
+					      CLOCK_REALTIME : CLOCK_MONOTONIC,
+					      HRTIMER_MODE_ABS, current);
 		hrtimer_set_expires_range_ns(&to->timer, *abs_time,
 					     current->timer_slack_ns);
	}
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 49feba6f0762..bb0d3ebc7bdf 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1648,13 +1648,44 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
	return HRTIMER_NORESTART;
 }
 
-void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
+static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
+				   clockid_t clock_id,
+				   enum hrtimer_mode mode,
+				   struct task_struct *task)
 {
+	__hrtimer_init(&sl->timer, clock_id, mode);
	sl->timer.function = hrtimer_wakeup;
	sl->task = task;
 }
+
+/**
+ * hrtimer_init_sleeper - initialize sleeper to the given clock
+ * @sl:		sleeper to be initialized
+ * @clock_id:	the clock to be used
+ * @mode:	timer mode abs/rel
+ * @task:	the task to wake up
+ */
+void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id,
+			  enum hrtimer_mode mode, struct task_struct *task)
+{
+	debug_init(&sl->timer, clock_id, mode);
+	__hrtimer_init_sleeper(sl, clock_id, mode, task);
+
+}
 EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
 
+#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
+void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
+				   clockid_t clock_id,
+				   enum hrtimer_mode mode,
+				   struct task_struct *task)
+{
+	debug_object_init_on_stack(&sl->timer, &hrtimer_debug_descr);
+	__hrtimer_init_sleeper(sl, clock_id, mode, task);
+}
+EXPORT_SYMBOL_GPL(hrtimer_init_sleeper_on_stack);
+#endif
+
 int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts)
 {
	switch(restart->nanosleep.type) {
@@ -1678,8 +1709,6 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
 {
	struct restart_block *restart;
 
-	hrtimer_init_sleeper(t, current);
-
	do {
 		set_current_state(TASK_INTERRUPTIBLE);
 		hrtimer_start_expires(&t->timer, mode);
@@ -1716,10 +1745,9 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
	struct hrtimer_sleeper t;
	int ret;
 
-	hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid,
-			      HRTIMER_MODE_ABS);
+	hrtimer_init_sleeper_on_stack(&t, restart->nanosleep.clockid,
+				      HRTIMER_MODE_ABS, current);
	hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
-
	ret = do_nanosleep(&t, HRTIMER_MODE_ABS);
	destroy_hrtimer_on_stack(&t.timer);
	return ret;
@@ -1737,7 +1765,7 @@ long hrtimer_nanosleep(const struct timespec64 *rqtp,
	if (dl_task(current) || rt_task(current))
 		slack = 0;
 
-	hrtimer_init_on_stack(&t.timer, clockid, mode);
+	hrtimer_init_sleeper_on_stack(&t, clockid, mode, current);
	hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack);
	ret = do_nanosleep(&t, mode);
	if (ret != -ERESTART_RESTARTBLOCK)
@@ -1968,11 +1996,9 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
 		return -EINTR;
	}
 
-	hrtimer_init_on_stack(&t.timer, clock_id, mode);
+	hrtimer_init_sleeper_on_stack(&t, clock_id, mode, current);
	hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
-
-	hrtimer_init_sleeper(&t, current);
-
	hrtimer_start_expires(&t.timer, mode);
 
	if (likely(t.task))
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 092fa3d75b32..9d472d626aaa 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2160,7 +2160,8 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
	s64 remaining;
	struct hrtimer_sleeper t;
 
-	hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+	hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_ABS,
+				      current);
	hrtimer_set_expires(&t.timer, spin_until);
 
	remaining = ktime_to_ns(hrtimer_expires_remaining(&t.timer));
@@ -2175,7 +2176,6 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
 		} while (ktime_compare(end_time, spin_until) < 0);
	} else {
 		/* see do_nanosleep */
-		hrtimer_init_sleeper(&t, current);
 		do {
 			set_current_state(TASK_INTERRUPTIBLE);
 			hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
--
Gitee

From afa037c86f42b0426831b60f3cd2b193ffcfadb0 Mon Sep 17 00:00:00 2001
From: meganz009
Date: Mon, 12 Jun 2023 20:14:12 +0800
Subject: [PATCH 10/16] hrtimers: Prepare full preemption

commit e033fc3f9345af04f20c1d45fa8ee453caaf5b1c upstream.

Make cancellation of a running callback in softirq context safe
against preemption.
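Concretely, the cancellation loop this patch makes preemption-safe has the
following shape after the change (restated from the hrtimer_cancel() hunk
in the diff below):

	int hrtimer_cancel(struct hrtimer *timer)
	{
		for (;;) {
			int ret = hrtimer_try_to_cancel(timer);

			if (ret >= 0)
				return ret;
			/*
			 * The callback is running: sleep on the base's
			 * waitqueue until it finishes, instead of spinning
			 * on cpu_relax() against a preempted softirq.
			 */
			hrtimer_wait_for_timer(timer);
		}
	}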
Signed-off-by: Ingo Molnar
Signed-off-by: Thomas Gleixner
---
 fs/timerfd.c                 |  5 ++++-
 include/linux/hrtimer.h      | 13 +++++++++++-
 include/linux/posix-timers.h |  2 +-
 kernel/time/alarmtimer.c     |  2 +-
 kernel/time/hrtimer.c        | 33 +++++++++++++++++++++++++++++-
 kernel/time/itimer.c         |  1 +
 kernel/time/posix-timers.c   | 39 ++++++++++++++++++++++++++++++++++--
 7 files changed, 88 insertions(+), 7 deletions(-)

diff --git a/fs/timerfd.c b/fs/timerfd.c
index d69ad801eb80..82d0f52414a6 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -471,7 +471,10 @@ static int do_timerfd_settime(int ufd, int flags,
 				break;
 		}
 		spin_unlock_irq(&ctx->wqh.lock);
-		cpu_relax();
+		if (isalarm(ctx))
+			hrtimer_wait_for_timer(&ctx->t.alarm.timer);
+		else
+			hrtimer_wait_for_timer(&ctx->t.tmr);
	}
 
	/*
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index b8bbaabd5aff..73ad7309436a 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -22,6 +22,7 @@
 #include
 #include
 #include
+#include
 
 struct hrtimer_clock_base;
 struct hrtimer_cpu_base;
@@ -216,6 +217,9 @@ struct hrtimer_cpu_base {
	ktime_t				expires_next;
	struct hrtimer			*next_timer;
	ktime_t				softirq_expires_next;
+#ifdef CONFIG_PREEMPT_RT_BASE
+	wait_queue_head_t		wait;
+#endif
	struct hrtimer			*softirq_next_timer;
	struct hrtimer_clock_base	clock_base[HRTIMER_MAX_CLOCK_BASES];
 } ____cacheline_aligned;
@@ -433,6 +437,13 @@ static inline void hrtimer_restart(struct hrtimer *timer)
	hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
 }
 
+/* Softirq preemption could deadlock timer removal */
+#ifdef CONFIG_PREEMPT_RT_BASE
+  extern void hrtimer_wait_for_timer(const struct hrtimer *timer);
+#else
+# define hrtimer_wait_for_timer(timer)	do { cpu_relax(); } while (0)
+#endif
+
 /* Query timers: */
 extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust);
 
@@ -458,7 +469,7 @@ static inline int hrtimer_is_queued(struct hrtimer *timer)
 * Helper function to check, whether the timer is running the callback
 * function
 */
-static inline int hrtimer_callback_running(struct hrtimer *timer)
+static inline int hrtimer_callback_running(const struct hrtimer *timer)
 {
	return timer->base->running == timer;
 }
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index ee7e987ea1b4..0571b498db73 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -114,8 +114,8 @@ struct k_itimer {
 		struct {
 			struct alarm	alarmtimer;
 		} alarm;
-		struct rcu_head		rcu;
	} it;
+	struct rcu_head		rcu;
 };
 
 void run_posix_cpu_timers(struct task_struct *task);
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index f4255a65c44b..1d1f077cffb3 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -436,7 +436,7 @@ int alarm_cancel(struct alarm *alarm)
 		int ret = alarm_try_to_cancel(alarm);
 		if (ret >= 0)
 			return ret;
-		cpu_relax();
+		hrtimer_wait_for_timer(&alarm->timer);
	}
 }
 EXPORT_SYMBOL_GPL(alarm_cancel);
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index bb0d3ebc7bdf..8bf8e40bfb7c 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -939,6 +939,33 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
 }
 EXPORT_SYMBOL_GPL(hrtimer_forward);
 
+#ifdef CONFIG_PREEMPT_RT_BASE
+# define wake_up_timer_waiters(b)	wake_up(&(b)->wait)
+
+/**
+ * hrtimer_wait_for_timer - Wait for a running timer
+ *
+ * @timer:	timer to wait for
+ *
+ * The function waits in case the timers callback function is
+ * currently executed on the waitqueue of the timer base. The
+ * waitqueue is woken up after the timer callback function has
+ * finished execution.
+ */
+void hrtimer_wait_for_timer(const struct hrtimer *timer)
+{
+	struct hrtimer_clock_base *base = timer->base;
+
+	if (base && base->cpu_base &&
+	    base->index >= HRTIMER_BASE_MONOTONIC_SOFT)
+		wait_event(base->cpu_base->wait,
+			   !(hrtimer_callback_running(timer)));
+}
+
+#else
+# define wake_up_timer_waiters(b)	do { } while (0)
+#endif
+
 /*
 * enqueue_hrtimer - internal function to (re)start a timer
 *
@@ -1171,7 +1198,7 @@ int hrtimer_cancel(struct hrtimer *timer)
 
 		if (ret >= 0)
 			return ret;
-		cpu_relax();
+		hrtimer_wait_for_timer(timer);
	}
 }
 EXPORT_SYMBOL_GPL(hrtimer_cancel);
@@ -1477,6 +1504,7 @@ static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h)
	hrtimer_update_softirq_timer(cpu_base, true);
 
	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+	wake_up_timer_waiters(cpu_base);
 }
 
 #ifdef CONFIG_HIGH_RES_TIMERS
@@ -1878,6 +1906,9 @@ int hrtimers_prepare_cpu(unsigned int cpu)
	cpu_base->softirq_next_timer = NULL;
	cpu_base->expires_next = KTIME_MAX;
	cpu_base->softirq_expires_next = KTIME_MAX;
+#ifdef CONFIG_PREEMPT_RT_BASE
+	init_waitqueue_head(&cpu_base->wait);
+#endif
	return 0;
 }
 
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
index 9a65713c8309..55b0e58368bf 100644
--- a/kernel/time/itimer.c
+++ b/kernel/time/itimer.c
@@ -215,6 +215,7 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
 		/* We are sharing ->siglock with it_real_fn() */
 		if (hrtimer_try_to_cancel(timer) < 0) {
 			spin_unlock_irq(&tsk->sighand->siglock);
+			hrtimer_wait_for_timer(&tsk->signal->real_timer);
 			goto again;
 		}
 		expires = timeval_to_ktime(value->it_value);
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 5a01c4fdbfef..a5ec421e3437 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -463,7 +463,7 @@ static struct k_itimer * alloc_posix_timer(void)
 
 static void k_itimer_rcu_free(struct rcu_head *head)
 {
-	struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu);
+	struct k_itimer *tmr = container_of(head, struct k_itimer, rcu);
 
	kmem_cache_free(posix_timers_cache, tmr);
 }
@@ -480,7 +480,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
	}
	put_pid(tmr->it_pid);
	sigqueue_free(tmr->sigq);
-	call_rcu(&tmr->it.rcu, k_itimer_rcu_free);
+	call_rcu(&tmr->rcu, k_itimer_rcu_free);
 }
 
 static int common_timer_create(struct k_itimer *new_timer)
@@ -821,6 +821,22 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
 		hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
 }
 
+/*
+ * Protected by RCU!
+ */
+static void timer_wait_for_callback(const struct k_clock *kc, struct k_itimer *timr)
+{
+#ifdef CONFIG_PREEMPT_RT_FULL
+	if (kc->timer_arm == common_hrtimer_arm)
+		hrtimer_wait_for_timer(&timr->it.real.timer);
+	else if (kc == &alarm_clock)
+		hrtimer_wait_for_timer(&timr->it.alarm.alarmtimer.timer);
+	else
+		/* FIXME: Whacky hack for posix-cpu-timers */
+		schedule_timeout(1);
+#endif
+}
+
 static int common_hrtimer_try_to_cancel(struct k_itimer *timr)
 {
	return hrtimer_try_to_cancel(&timr->it.real.timer);
@@ -885,6 +901,7 @@ static int do_timer_settime(timer_t timer_id, int flags,
	if (!timr)
 		return -EINVAL;
 
+	rcu_read_lock();
	kc = timr->kclock;
	if (WARN_ON_ONCE(!kc || !kc->timer_set))
 		error = -EINVAL;
@@ -893,9 +910,12 @@ static int do_timer_settime(timer_t timer_id, int flags,
 
	unlock_timer(timr, flag);
	if (error == TIMER_RETRY) {
+		timer_wait_for_callback(kc, timr);
 		old_spec64 = NULL;	// We already got the old time...
+		rcu_read_unlock();
 		goto retry;
	}
+	rcu_read_unlock();
 
	return error;
 }
@@ -977,10 +997,15 @@ SYSCALL_DEFINE1(timer_delete, timer_t, timer_id)
	if (!timer)
 		return -EINVAL;
 
+	rcu_read_lock();
	if (timer_delete_hook(timer) == TIMER_RETRY) {
 		unlock_timer(timer, flags);
+		timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
+					timer);
+		rcu_read_unlock();
 		goto retry_delete;
	}
+	rcu_read_unlock();
 
	spin_lock(&current->sighand->siglock);
	list_del(&timer->list);
@@ -1006,8 +1031,18 @@ static void itimer_delete(struct k_itimer *timer)
 retry_delete:
	spin_lock_irqsave(&timer->it_lock, flags);
 
+	/* On RT we can race with a deletion */
+	if (!timer->it_signal) {
+		unlock_timer(timer, flags);
+		return;
+	}
+
	if (timer_delete_hook(timer) == TIMER_RETRY) {
+		rcu_read_lock();
 		unlock_timer(timer, flags);
+		timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
+					timer);
+		rcu_read_unlock();
 		goto retry_delete;
	}
	list_del(&timer->list);
--
Gitee

From 404614029e5764521e97002ab7a84425cca854ee Mon Sep 17 00:00:00 2001
From: meganz009
Date: Mon, 12 Jun 2023 20:14:27 +0800
Subject: [PATCH 11/16] hrtimer: move timers by default into the softirq context

commit b26ce46313b85d2d4f52b053c1602b3c6d3e3f9f upstream.

We can't have hrtimer callbacks running in hardirq context on RT.
Therefore the timers are deferred to the softirq context by default.
There are a few timers which expect to be run in hardirq context even
on RT.
Those are:
- very short running where low latency is critical (kvm lapic)
- timers which take raw locks and need to run in hard-irq context
  (perf, sched)
- wake up related timer (kernel side of clock_nanosleep() and so on)

Signed-off-by: Sebastian Andrzej Siewior
---
 arch/x86/kvm/lapic.c                 |  2 +-
 include/linux/hrtimer.h              |  6 ++++++
 kernel/events/core.c                 |  4 ++--
 kernel/sched/core.c                  |  2 +-
 kernel/sched/deadline.c              |  2 +-
 kernel/sched/fair.c                  |  4 ++--
 kernel/sched/rt.c                    |  4 ++--
 kernel/time/hrtimer.c                | 21 +++++++++++++++++++--
 kernel/time/tick-broadcast-hrtimer.c |  2 +-
 kernel/time/tick-sched.c             |  2 +-
 kernel/watchdog.c                    |  2 +-
 11 files changed, 37 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 3e4dd4b2771d..5840d9033b5b 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2261,7 +2261,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
	apic->vcpu = vcpu;
 
	hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
-		     HRTIMER_MODE_ABS_PINNED);
+		     HRTIMER_MODE_ABS_PINNED_HARD);
	apic->lapic_timer.timer.function = apic_timer_fn;
 
	/*
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 73ad7309436a..2bdb047c7656 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -42,6 +42,7 @@ enum hrtimer_mode {
	HRTIMER_MODE_REL	= 0x01,
	HRTIMER_MODE_PINNED	= 0x02,
	HRTIMER_MODE_SOFT	= 0x04,
+	HRTIMER_MODE_HARD	= 0x08,
 
	HRTIMER_MODE_ABS_PINNED = HRTIMER_MODE_ABS | HRTIMER_MODE_PINNED,
	HRTIMER_MODE_REL_PINNED = HRTIMER_MODE_REL | HRTIMER_MODE_PINNED,
@@ -52,6 +53,11 @@ enum hrtimer_mode {
	HRTIMER_MODE_ABS_PINNED_SOFT = HRTIMER_MODE_ABS_PINNED | HRTIMER_MODE_SOFT,
	HRTIMER_MODE_REL_PINNED_SOFT = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_SOFT,
 
+	HRTIMER_MODE_ABS_HARD	= HRTIMER_MODE_ABS | HRTIMER_MODE_HARD,
+	HRTIMER_MODE_REL_HARD	= HRTIMER_MODE_REL | HRTIMER_MODE_HARD,
+
+	HRTIMER_MODE_ABS_PINNED_HARD = HRTIMER_MODE_ABS_PINNED | HRTIMER_MODE_HARD,
+	HRTIMER_MODE_REL_PINNED_HARD = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_HARD,
 };
 
 /*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 64c820570445..ce1ebde1a0ec 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1102,7 +1102,7 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
	cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
 
	raw_spin_lock_init(&cpuctx->hrtimer_lock);
-	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
	timer->function = perf_mux_hrtimer_handler;
 }
 
@@ -9326,7 +9326,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)
	if (!is_sampling_event(event))
 		return;
 
-	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
	hwc->hrtimer.function = perf_swevent_hrtimer;
 
	/*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6b064084256d..e37d8d0911b6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -330,7 +330,7 @@ static void hrtick_rq_init(struct rq *rq)
	rq->hrtick_csd.info = rq;
 #endif
 
-	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
	rq->hrtick_timer.function = hrtick;
 }
 #else	/* CONFIG_SCHED_HRTICK */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 4b13df38c069..974a8f9b615a 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1086,7 +1086,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
 {
	struct hrtimer *timer = &dl_se->dl_timer;
 
-	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
	timer->function = dl_task_timer;
 }
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ac8f35ef3a76..497125a35941 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7060,9 +7060,9 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
	cfs_b->buffer = RUNTIME_INF;
 
	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
-	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
	cfs_b->period_timer.function = sched_cfs_period_timer;
-	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
	cfs_b->slack_timer.function = sched_cfs_slack_timer;
	cfs_b->distribute_running = 0;
	cfs_b->slack_started = false;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index e9a609cdd19b..f5f522d5151a 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -45,8 +45,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
 
	raw_spin_lock_init(&rt_b->rt_runtime_lock);
 
-	hrtimer_init(&rt_b->rt_period_timer,
-		     CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC,
+		     HRTIMER_MODE_REL_HARD);
	rt_b->rt_period_timer.function = sched_rt_period_timer;
 }
 
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 8bf8e40bfb7c..b9cbeca5e700 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1135,7 +1135,9 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
	 * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft
	 * match.
	 */
+#ifndef CONFIG_PREEMPT_RT_BASE
	WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft);
+#endif
 
	base = lock_hrtimer_base(timer, &flags);
 
@@ -1295,10 +1297,17 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id)
 static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
 			   enum hrtimer_mode mode)
 {
-	bool softtimer = !!(mode & HRTIMER_MODE_SOFT);
-	int base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0;
+	bool softtimer;
+	int base;
	struct hrtimer_cpu_base *cpu_base;
 
+	softtimer = !!(mode & HRTIMER_MODE_SOFT);
+#ifdef CONFIG_PREEMPT_RT_FULL
+	if (!softtimer && !(mode & HRTIMER_MODE_HARD))
+		softtimer = true;
+#endif
+	base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0;
+
	memset(timer, 0, sizeof(struct hrtimer));
 
	cpu_base = raw_cpu_ptr(&hrtimer_bases);
@@ -1681,6 +1690,14 @@ static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
 				   enum hrtimer_mode mode,
 				   struct task_struct *task)
 {
+#ifdef CONFIG_PREEMPT_RT_FULL
+	if (!(mode & (HRTIMER_MODE_SOFT | HRTIMER_MODE_HARD))) {
+		if (task_is_realtime(current) || system_state != SYSTEM_RUNNING)
+			mode |= HRTIMER_MODE_HARD;
+		else
+			mode |= HRTIMER_MODE_SOFT;
+	}
+#endif
	__hrtimer_init(&sl->timer, clock_id, mode);
	sl->timer.function = hrtimer_wakeup;
	sl->task = task;
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index a836efd34589..c50e8f3262de 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -107,7 +107,7 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t)
 
 void tick_setup_hrtimer_broadcast(void)
 {
-	hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+	hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
	bctimer.function = bc_handler;
	clockevents_register_device(&ce_broadcast_hrtimer);
 }
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 2778d02bb71c..0e9e5ebdf8ff 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -1302,7 +1302,7 @@ void tick_setup_sched_timer(void)
	/*
	 * Emulate tick processing via per-CPU hrtimers:
	 */
-	hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+	hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
	ts->sched_timer.function = tick_sched_timer;
 
	/* Get the next period (per-CPU) */
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 72f18b682224..1c6c542e5f36 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -558,7 +558,7 @@ static void watchdog_enable(unsigned int cpu)
	 * Start the timer first to prevent the NMI watchdog triggering
	 * before the timer has a chance to fire.
	 */
-	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
	hrtimer->function = watchdog_timer_fn;
	hrtimer_start(hrtimer, ns_to_ktime(sample_period),
		      HRTIMER_MODE_REL_PINNED);
--
Gitee

From 026fcb0b4463fa7a37a41425ce649c90a2046015 Mon Sep 17 00:00:00 2001
From: meganz009
Date: Mon, 12 Jun 2023 20:14:38 +0800
Subject: [PATCH 12/16] sched/fair: Make the hrtimers non-hard again

commit 1f9048fd4cb26be3c10ac92b6c9b2e90d5005888 upstream.

Since commit "sched/fair: Robustify CFS-bandwidth timer locking" both
hrtimers can run in softirq context because now interrupts are disabled
as part of the locking procedure.
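For reference, the choice these two patches toggle looks like this from a
timer user's side — a hedged sketch with invented names, not code from
either patch: a callback that takes only raw locks and needs low latency
opts into hard-irq expiry explicitly; anything unmarked is expired from
softirq by default on RT.

	static struct hrtimer my_timer;

	static enum hrtimer_restart my_timer_fn(struct hrtimer *t)
	{
		/* runs in hard-irq context: raw locks only, keep it short */
		return HRTIMER_NORESTART;
	}

	static void my_timer_setup(void)
	{
		hrtimer_init(&my_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
		my_timer.function = my_timer_fn;
		hrtimer_start(&my_timer, ms_to_ktime(5), HRTIMER_MODE_REL);
	}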
Signed-off-by: Sebastian Andrzej Siewior
---
 kernel/sched/fair.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 497125a35941..ac8f35ef3a76 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7060,9 +7060,9 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
	cfs_b->buffer = RUNTIME_INF;
 
	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
-	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
+	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
	cfs_b->period_timer.function = sched_cfs_period_timer;
-	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
+	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	cfs_b->slack_timer.function = sched_cfs_slack_timer;
	cfs_b->distribute_running = 0;
	cfs_b->slack_started = false;
--
Gitee

From f724da8b6557dea06b64a5718dd8fbf685d00434 Mon Sep 17 00:00:00 2001
From: meganz009
Date: Mon, 12 Jun 2023 20:14:47 +0800
Subject: [PATCH 13/16] hrtimer: Move schedule_work call to helper thread

commit 1669cbc2eb152b7ae7f94cb966fbfd888a191cc8 upstream.

When running the LTP leapsec_timer test, the following call trace is
caught:

BUG: sleeping function called from invalid context at kernel/rtmutex.c:659
in_atomic(): 1, irqs_disabled(): 1, pid: 0, name: swapper/1
Preemption disabled at: [] cpu_startup_entry+0x133/0x310
CPU: 1 PID: 0 Comm: swapper/1 Not tainted 3.10.10-rt3 #2
Hardware name: Intel Corporation Calpella platform/MATXM-CORE-411-B, BIOS 4.6.3 08/18/2010
ffffffff81c2f800 ffff880076843e40 ffffffff8169918d ffff880076843e58
ffffffff8106db31 ffff88007684b4a0 ffff880076843e70 ffffffff8169d9c0
ffff88007684b4a0 ffff880076843eb0 ffffffff81059da1 0000001876851200
Call Trace:
[] dump_stack+0x19/0x1b
[] __might_sleep+0xf1/0x170
[] rt_spin_lock+0x20/0x50
[] queue_work_on+0x61/0x100
[] clock_was_set_delayed+0x21/0x30
[] do_timer+0x40e/0x660
[] tick_do_update_jiffies64+0xf7/0x140
[] tick_check_idle+0x92/0xc0
[] irq_enter+0x57/0x70
[] smp_apic_timer_interrupt+0x3e/0x9b
[] apic_timer_interrupt+0x6a/0x70
[] ? cpuidle_enter_state+0x4c/0xc0
[] cpuidle_idle_call+0xd8/0x2d0
[] arch_cpu_idle+0xe/0x30
[] cpu_startup_entry+0x19e/0x310
[] start_secondary+0x1ad/0x1b0

clock_was_set_delayed() is called in the hard IRQ handler (timer
interrupt), and it calls schedule_work(). Under PREEMPT_RT_FULL,
schedule_work() takes spinlocks which could sleep, so it's not safe to
call schedule_work() in interrupt context.

Reference upstream commit b68d61c705ef02384c0538b8d9374545097899ca
(rt,ntp: Move call to schedule_delayed_work() to helper thread) from
git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-stable-rt.git,
which makes a similar change.
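The deferral pattern, sketched generically — the names below are invented;
the concrete version for clock_was_set() is in the diff that follows:

	static struct swork_event my_delay_work;

	static void my_delayed_fn(struct swork_event *event)
	{
		clock_was_set();	/* may take sleeping locks on RT */
	}

	/* Safe from hard-irq context: queueing is only a wake-up. */
	void my_trigger_from_hardirq(void)
	{
		swork_queue(&my_delay_work);
	}

	static __init int my_swork_init(void)
	{
		WARN_ON(swork_get());
		INIT_SWORK(&my_delay_work, my_delayed_fn);
		return 0;
	}
	early_initcall(my_swork_init);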
Signed-off-by: Yang Shi
[bigeasy: use swork_queue() instead of a helper thread]
Signed-off-by: Sebastian Andrzej Siewior
---
 kernel/time/hrtimer.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index b9cbeca5e700..bd1d8a9c5236 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -730,6 +730,29 @@ static void hrtimer_switch_to_hres(void)
	retrigger_next_event(NULL);
 }
 
+#ifdef CONFIG_PREEMPT_RT_FULL
+
+static struct swork_event clock_set_delay_work;
+
+static void run_clock_set_delay(struct swork_event *event)
+{
+	clock_was_set();
+}
+
+void clock_was_set_delayed(void)
+{
+	swork_queue(&clock_set_delay_work);
+}
+
+static __init int create_clock_set_delay_thread(void)
+{
+	WARN_ON(swork_get());
+	INIT_SWORK(&clock_set_delay_work, run_clock_set_delay);
+	return 0;
+}
+early_initcall(create_clock_set_delay_thread);
+#else /* PREEMPT_RT_FULL */
+
 static void clock_was_set_work(struct work_struct *work)
 {
	clock_was_set();
@@ -745,6 +768,7 @@ void clock_was_set_delayed(void)
 {
	schedule_work(&hrtimer_work);
 }
+#endif
 
 #else
 
--
Gitee

From d240d18ea9249a7aead3eea2e5cfd72e813ac5cd Mon Sep 17 00:00:00 2001
From: meganz009
Date: Mon, 12 Jun 2023 20:15:02 +0800
Subject: [PATCH 14/16] hrtimer: move state change before hrtimer_cancel in do_nanosleep()

commit b62f8583e5a6a8a86192ed4bc3cef3e923f295d0 upstream.

There is a small window between setting t->task to NULL and waking the
task up (which would set TASK_RUNNING). So the timer would fire, run
and set ->task to NULL while the other side / do_nanosleep() wouldn't
enter freezable_schedule(). After all we are preemptible here (in
do_nanosleep() and on the timer wake up path) and on KVM/virt the
virt-CPU might get preempted. So do_nanosleep() wouldn't enter
freezable_schedule() but cancel the timer which is still running and
wait for it via hrtimer_wait_for_timer(). Then wait_event()/might_sleep()
would complain that it is invoked with state != TASK_RUNNING. This isn't
a problem since it would be reset to TASK_RUNNING later anyway and we
don't rely on the previous state.

Move the state update to TASK_RUNNING before hrtimer_cancel() so there
are no complaints from might_sleep() about the wrong state.

Cc: stable-rt@vger.kernel.org
Reviewed-by: Daniel Bristot de Oliveira
Signed-off-by: Sebastian Andrzej Siewior
---
 kernel/time/hrtimer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index bd1d8a9c5236..e1040b80362c 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1785,12 +1785,12 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
 		if (likely(t->task))
 			freezable_schedule();
 
+		__set_current_state(TASK_RUNNING);
 		hrtimer_cancel(&t->timer);
 		mode = HRTIMER_MODE_ABS;
 
	} while (t->task && !signal_pending(current));
 
-	__set_current_state(TASK_RUNNING);
-
	if (!t->task)
 		return 0;
--
Gitee

From f6cac3462c0e121eb2458e3bd3d2f8c9fee5599b Mon Sep 17 00:00:00 2001
From: meganz009
Date: Mon, 12 Jun 2023 20:15:16 +0800
Subject: [PATCH 15/16] posix-timers: Thread posix-cpu-timers on -rt

commit e63ab953f4da79eb0d2767e029e61f4d8491f131 upstream.

posix-cpu-timer code takes non-RT-safe locks in hard IRQ context. Move
it to a thread.
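In outline, the hard-irq side only queues the task and wakes a per-CPU
SCHED_FIFO kthread, which then runs the timers in process context — a
condensed restatement of the run_posix_cpu_timers() added in the diff
below (the `?:` form replaces the patch's explicit if/else over the list
head):

	/* hard-irq hook on RT: queue and wake only */
	void run_posix_cpu_timers(struct task_struct *tsk)
	{
		unsigned int cpu = smp_processor_id();

		if (!tsk->posix_timer_list && __fastpath_timer_check(tsk)) {
			get_task_struct(tsk);
			/* list is terminated by a self-pointing task_struct */
			tsk->posix_timer_list =
				per_cpu(posix_timer_tasklist, cpu) ?: tsk;
			per_cpu(posix_timer_tasklist, cpu) = tsk;
			wake_up_process(per_cpu(posix_timer_task, cpu));
		}
	}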
[ 3.0 fixes from Peter Zijlstra ] Signed-off-by: John Stultz Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 3 + init/init_task.c | 7 ++ kernel/fork.c | 3 + kernel/time/posix-cpu-timers.c | 154 ++++++++++++++++++++++++++++++++- 4 files changed, 164 insertions(+), 3 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 68fa3d093a97..5425ccb2d689 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -938,6 +938,9 @@ struct task_struct { #ifdef CONFIG_POSIX_TIMERS struct task_cputime cputime_expires; struct list_head cpu_timers[3]; +#ifdef CONFIG_PREEMPT_RT_BASE + struct task_struct *posix_timer_list; +#endif #endif /* Process credentials: */ diff --git a/init/init_task.c b/init/init_task.c index 0b49b9cf5571..9e3362748214 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -50,6 +50,12 @@ static struct sighand_struct init_sighand = { .signalfd_wqh = __WAIT_QUEUE_HEAD_INITIALIZER(init_sighand.signalfd_wqh), }; +#if defined(CONFIG_POSIX_TIMERS) && defined(CONFIG_PREEMPT_RT_BASE) +# define INIT_TIMER_LIST .posix_timer_list = NULL, +#else +# define INIT_TIMER_LIST +#endif + /* * Set up the first task table, touch at your own risk!. Base=0, * limit=0x1fffff (=2MB) @@ -119,6 +125,7 @@ struct task_struct init_task INIT_CPU_TIMERS(init_task) .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock), .timer_slack_ns = 50000, /* 50 usec default slack */ + INIT_TIMER_LIST .thread_pid = &init_struct_pid, .thread_group = LIST_HEAD_INIT(init_task.thread_group), .thread_node = LIST_HEAD_INIT(init_signals.thread_head), diff --git a/kernel/fork.c b/kernel/fork.c index e43babb1af4f..6aef8c531a90 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1631,6 +1631,9 @@ static void rt_mutex_init_task(struct task_struct *p) */ static void posix_cpu_timers_init(struct task_struct *tsk) { +#ifdef CONFIG_PREEMPT_RT_BASE + tsk->posix_timer_list = NULL; +#endif tsk->cputime_expires.prof_exp = 0; tsk->cputime_expires.virt_exp = 0; tsk->cputime_expires.sched_exp = 0; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index d62d7ae5201c..8d95e8de98b2 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -3,8 +3,10 @@ * Implement CPU time clocks for the POSIX clock interface. */ +#include #include #include +#include #include #include #include @@ -15,6 +17,7 @@ #include #include #include +#include #include "posix-timers.h" @@ -1140,14 +1143,12 @@ static inline int fastpath_timer_check(struct task_struct *tsk) * already updated our counts. We need to check if any timers fire now. * Interrupts are disabled. */ -void run_posix_cpu_timers(struct task_struct *tsk) +static void __run_posix_cpu_timers(struct task_struct *tsk) { LIST_HEAD(firing); struct k_itimer *timer, *next; unsigned long flags; - lockdep_assert_irqs_disabled(); - /* * The fast path checks that there are no expired thread or thread * group timers. If that's so, just return. 
@@ -1200,6 +1201,153 @@ void run_posix_cpu_timers(struct task_struct *tsk) } } +#ifdef CONFIG_PREEMPT_RT_BASE +#include +#include +DEFINE_PER_CPU(struct task_struct *, posix_timer_task); +DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist); +DEFINE_PER_CPU(bool, posix_timer_th_active); + +static void posix_cpu_kthread_fn(unsigned int cpu) +{ + struct task_struct *tsk = NULL; + struct task_struct *next = NULL; + + BUG_ON(per_cpu(posix_timer_task, cpu) != current); + + /* grab task list */ + raw_local_irq_disable(); + tsk = per_cpu(posix_timer_tasklist, cpu); + per_cpu(posix_timer_tasklist, cpu) = NULL; + raw_local_irq_enable(); + + /* its possible the list is empty, just return */ + if (!tsk) + return; + + /* Process task list */ + while (1) { + /* save next */ + next = tsk->posix_timer_list; + + /* run the task timers, clear its ptr and + * unreference it + */ + __run_posix_cpu_timers(tsk); + tsk->posix_timer_list = NULL; + put_task_struct(tsk); + + /* check if this is the last on the list */ + if (next == tsk) + break; + tsk = next; + } +} + +static inline int __fastpath_timer_check(struct task_struct *tsk) +{ + /* tsk == current, ensure it is safe to use ->signal/sighand */ + if (unlikely(tsk->exit_state)) + return 0; + + if (!task_cputime_zero(&tsk->cputime_expires)) + return 1; + + if (!task_cputime_zero(&tsk->signal->cputime_expires)) + return 1; + + return 0; +} + +void run_posix_cpu_timers(struct task_struct *tsk) +{ + unsigned int cpu = smp_processor_id(); + struct task_struct *tasklist; + + BUG_ON(!irqs_disabled()); + + if (per_cpu(posix_timer_th_active, cpu) != true) + return; + + /* get per-cpu references */ + tasklist = per_cpu(posix_timer_tasklist, cpu); + + /* check to see if we're already queued */ + if (!tsk->posix_timer_list && __fastpath_timer_check(tsk)) { + get_task_struct(tsk); + if (tasklist) { + tsk->posix_timer_list = tasklist; + } else { + /* + * The list is terminated by a self-pointing + * task_struct + */ + tsk->posix_timer_list = tsk; + } + per_cpu(posix_timer_tasklist, cpu) = tsk; + + wake_up_process(per_cpu(posix_timer_task, cpu)); + } +} + +static int posix_cpu_kthread_should_run(unsigned int cpu) +{ + return __this_cpu_read(posix_timer_tasklist) != NULL; +} + +static void posix_cpu_kthread_park(unsigned int cpu) +{ + this_cpu_write(posix_timer_th_active, false); +} + +static void posix_cpu_kthread_unpark(unsigned int cpu) +{ + this_cpu_write(posix_timer_th_active, true); +} + +static void posix_cpu_kthread_setup(unsigned int cpu) +{ + struct sched_param sp; + + sp.sched_priority = MAX_RT_PRIO - 1; + sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); + posix_cpu_kthread_unpark(cpu); +} + +static struct smp_hotplug_thread posix_cpu_thread = { + .store = &posix_timer_task, + .thread_should_run = posix_cpu_kthread_should_run, + .thread_fn = posix_cpu_kthread_fn, + .thread_comm = "posixcputmr/%u", + .setup = posix_cpu_kthread_setup, + .park = posix_cpu_kthread_park, + .unpark = posix_cpu_kthread_unpark, +}; + +static int __init posix_cpu_thread_init(void) +{ + /* Start one for boot CPU. 
*/ + unsigned long cpu; + int ret; + + /* init the per-cpu posix_timer_tasklets */ + for_each_possible_cpu(cpu) + per_cpu(posix_timer_tasklist, cpu) = NULL; + + ret = smpboot_register_percpu_thread(&posix_cpu_thread); + WARN_ON(ret); + + return 0; +} +early_initcall(posix_cpu_thread_init); +#else /* CONFIG_PREEMPT_RT_BASE */ +void run_posix_cpu_timers(struct task_struct *tsk) +{ + lockdep_assert_irqs_disabled(); + __run_posix_cpu_timers(tsk); +} +#endif /* CONFIG_PREEMPT_RT_BASE */ + /* * Set one of the process-wide special case CPU timers or RLIMIT_CPU. * The tsk->sighand->siglock must be held by the caller. -- Gitee From 4b55ad96a6be946ebb5e9d8e0a1e59c3929ab20c Mon Sep 17 00:00:00 2001 From: meganz009 Date: Mon, 12 Jun 2023 20:17:59 +0800 Subject: [PATCH 16/16] timers: Redo the notification of canceling timers on -RT commit 07d9aaa5917187845b59efdf55fa02d70f35cad2 upstream. [ Upstream commit c71273154c2ad12e13333aada340ff30e826a11b ] Rework of the hrtimer, timer and posix-timer cancelation interface on -RT. Instead of the swait/schedule interface we now have locks which are taken while timer is active. During the cancellation of an active timer the lock is acquired. The lock will then either PI-boost the timer or block and wait until the timer completed. The new code looks simpler and does not trigger a warning from rcu_note_context_switch() anymore like reported by Grygorii Strashko and Daniel Wagner. The patches were contributed by Anna-Maria Gleixner. This is an all in one commit of the following patches: | [PATCH] timers: Introduce expiry spin lock | [PATCH] timers: Drop expiry lock after each timer invocation | [PATCH] hrtimer: Introduce expiry spin lock | [PATCH] posix-timers: move rcu out of union | [PATCH] posix-timers: Add expiry lock Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Steven Rostedt (VMware) --- fs/timerfd.c | 5 +- include/linux/hrtimer.h | 17 ++---- include/linux/posix-timers.h | 1 + kernel/time/alarmtimer.c | 2 +- kernel/time/hrtimer.c | 36 ++++--------- kernel/time/itimer.c | 2 +- kernel/time/posix-cpu-timers.c | 23 ++++++++ kernel/time/posix-timers.c | 69 ++++++++++-------------- kernel/time/posix-timers.h | 2 + kernel/time/timer.c | 96 ++++++++++++++++------------------ 10 files changed, 118 insertions(+), 135 deletions(-) diff --git a/fs/timerfd.c b/fs/timerfd.c index 82d0f52414a6..f845093466be 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -471,10 +471,11 @@ static int do_timerfd_settime(int ufd, int flags, break; } spin_unlock_irq(&ctx->wqh.lock); + if (isalarm(ctx)) - hrtimer_wait_for_timer(&ctx->t.alarm.timer); + hrtimer_grab_expiry_lock(&ctx->t.alarm.timer); else - hrtimer_wait_for_timer(&ctx->t.tmr); + hrtimer_grab_expiry_lock(&ctx->t.tmr); } /* diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 2bdb047c7656..6c4c38186c99 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -22,7 +22,6 @@ #include #include #include -#include struct hrtimer_clock_base; struct hrtimer_cpu_base; @@ -193,6 +192,8 @@ enum hrtimer_base_type { * @nr_retries: Total number of hrtimer interrupt retries * @nr_hangs: Total number of hrtimer interrupt hangs * @max_hang_time: Maximum time spent in hrtimer_interrupt + * @softirq_expiry_lock: Lock which is taken while softirq based hrtimer are + * expired * @expires_next: absolute time of the next event, is required for remote * hrtimer enqueue; it is the total first expiry time (hard * and soft hrtimer are taken into account) @@ -220,12 +221,10 @@ struct hrtimer_cpu_base { unsigned short 
 fs/timerfd.c                   |  5 +-
 include/linux/hrtimer.h        | 17 ++----
 include/linux/posix-timers.h   |  1 +
 kernel/time/alarmtimer.c       |  2 +-
 kernel/time/hrtimer.c          | 36 ++++---------
 kernel/time/itimer.c           |  2 +-
 kernel/time/posix-cpu-timers.c | 23 ++++++++
 kernel/time/posix-timers.c     | 69 ++++++++++--------------
 kernel/time/posix-timers.h     |  2 +
 kernel/time/timer.c            | 96 ++++++++++++++++------------------
 10 files changed, 118 insertions(+), 135 deletions(-)

diff --git a/fs/timerfd.c b/fs/timerfd.c
index 82d0f52414a6..f845093466be 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -471,10 +471,11 @@ static int do_timerfd_settime(int ufd, int flags,
 			break;
 	}
 	spin_unlock_irq(&ctx->wqh.lock);
+
 	if (isalarm(ctx))
-		hrtimer_wait_for_timer(&ctx->t.alarm.timer);
+		hrtimer_grab_expiry_lock(&ctx->t.alarm.timer);
 	else
-		hrtimer_wait_for_timer(&ctx->t.tmr);
+		hrtimer_grab_expiry_lock(&ctx->t.tmr);
 }
 
 /*
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 2bdb047c7656..6c4c38186c99 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -22,7 +22,6 @@
 #include
 #include
 #include
-#include
 
 struct hrtimer_clock_base;
 struct hrtimer_cpu_base;
@@ -193,6 +192,8 @@ enum hrtimer_base_type {
  * @nr_retries:		Total number of hrtimer interrupt retries
  * @nr_hangs:		Total number of hrtimer interrupt hangs
  * @max_hang_time:	Maximum time spent in hrtimer_interrupt
+ * @softirq_expiry_lock: Lock which is taken while softirq based hrtimers
+ *			are expired
  * @expires_next:	absolute time of the next event, is required for remote
  *			hrtimer enqueue; it is the total first expiry time (hard
  *			and soft hrtimer are taken into account)
@@ -220,12 +221,10 @@ struct hrtimer_cpu_base {
 	unsigned short			nr_hangs;
 	unsigned int			max_hang_time;
 #endif
+	spinlock_t			softirq_expiry_lock;
 	ktime_t				expires_next;
 	struct hrtimer			*next_timer;
 	ktime_t				softirq_expires_next;
-#ifdef CONFIG_PREEMPT_RT_BASE
-	wait_queue_head_t		wait;
-#endif
 	struct hrtimer			*softirq_next_timer;
 	struct hrtimer_clock_base	clock_base[HRTIMER_MAX_CLOCK_BASES];
 } ____cacheline_aligned;
@@ -426,6 +425,7 @@ static inline void hrtimer_start(struct hrtimer *timer, ktime_t tim,
 
 extern int hrtimer_cancel(struct hrtimer *timer);
 extern int hrtimer_try_to_cancel(struct hrtimer *timer);
+extern void hrtimer_grab_expiry_lock(const struct hrtimer *timer);
 
 static inline void hrtimer_start_expires(struct hrtimer *timer,
 					 enum hrtimer_mode mode)
@@ -443,13 +443,6 @@ static inline void hrtimer_restart(struct hrtimer *timer)
 	hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
 }
 
-/* Softirq preemption could deadlock timer removal */
-#ifdef CONFIG_PREEMPT_RT_BASE
-  extern void hrtimer_wait_for_timer(const struct hrtimer *timer);
-#else
-# define hrtimer_wait_for_timer(timer)	do { cpu_relax(); } while (0)
-#endif
-
 /* Query timers: */
 extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust);
 
@@ -475,7 +468,7 @@ static inline int hrtimer_is_queued(struct hrtimer *timer)
  * Helper function to check, whether the timer is running the callback
  * function
  */
-static inline int hrtimer_callback_running(const struct hrtimer *timer)
+static inline int hrtimer_callback_running(struct hrtimer *timer)
 {
 	return timer->base->running == timer;
 }
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 0571b498db73..3e6c91bdf2ef 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -15,6 +15,7 @@ struct cpu_timer_list {
 	u64 expires, incr;
 	struct task_struct *task;
 	int firing;
+	int firing_cpu;
 };
 
 /*
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 1d1f077cffb3..61ab2c923579 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -436,7 +436,7 @@ int alarm_cancel(struct alarm *alarm)
 		int ret = alarm_try_to_cancel(alarm);
 		if (ret >= 0)
 			return ret;
-		hrtimer_wait_for_timer(&alarm->timer);
+		hrtimer_grab_expiry_lock(&alarm->timer);
 	}
 }
 EXPORT_SYMBOL_GPL(alarm_cancel);
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index e1040b80362c..4534e7871c8c 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -963,33 +963,16 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
 }
 EXPORT_SYMBOL_GPL(hrtimer_forward);
 
-#ifdef CONFIG_PREEMPT_RT_BASE
-# define wake_up_timer_waiters(b)	wake_up(&(b)->wait)
-
-/**
- * hrtimer_wait_for_timer - Wait for a running timer
- *
- * @timer:	timer to wait for
- *
- * The function waits in case the timers callback function is
- * currently executed on the waitqueue of the timer base. The
- * waitqueue is woken up after the timer callback function has
- * finished execution.
- */
-void hrtimer_wait_for_timer(const struct hrtimer *timer)
+void hrtimer_grab_expiry_lock(const struct hrtimer *timer)
 {
 	struct hrtimer_clock_base *base = timer->base;
 
-	if (base && base->cpu_base &&
-	    base->index >= HRTIMER_BASE_MONOTONIC_SOFT)
-		wait_event(base->cpu_base->wait,
-			   !(hrtimer_callback_running(timer)));
+	if (base && base->cpu_base) {
+		spin_lock(&base->cpu_base->softirq_expiry_lock);
+		spin_unlock(&base->cpu_base->softirq_expiry_lock);
+	}
 }
 
-#else
-# define wake_up_timer_waiters(b)	do { } while (0)
-#endif
-
 /*
  * enqueue_hrtimer - internal function to (re)start a timer
  *
@@ -1224,7 +1207,7 @@ int hrtimer_cancel(struct hrtimer *timer)
 
 		if (ret >= 0)
 			return ret;
-		hrtimer_wait_for_timer(timer);
+		hrtimer_grab_expiry_lock(timer);
 	}
 }
 EXPORT_SYMBOL_GPL(hrtimer_cancel);
@@ -1528,6 +1511,7 @@ static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h)
 	unsigned long flags;
 	ktime_t now;
 
+	spin_lock(&cpu_base->softirq_expiry_lock);
 	raw_spin_lock_irqsave(&cpu_base->lock, flags);
 
 	now = hrtimer_update_base(cpu_base);
@@ -1537,7 +1521,7 @@ static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h)
 	hrtimer_update_softirq_timer(cpu_base, true);
 
 	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
-	wake_up_timer_waiters(cpu_base);
+	spin_unlock(&cpu_base->softirq_expiry_lock);
 }
 
 #ifdef CONFIG_HIGH_RES_TIMERS
@@ -1947,9 +1931,7 @@ int hrtimers_prepare_cpu(unsigned int cpu)
 	cpu_base->softirq_next_timer = NULL;
 	cpu_base->expires_next = KTIME_MAX;
 	cpu_base->softirq_expires_next = KTIME_MAX;
-#ifdef CONFIG_PREEMPT_RT_BASE
-	init_waitqueue_head(&cpu_base->wait);
-#endif
+	spin_lock_init(&cpu_base->softirq_expiry_lock);
 	return 0;
 }
 
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
index 55b0e58368bf..a5ff222df4c7 100644
--- a/kernel/time/itimer.c
+++ b/kernel/time/itimer.c
@@ -215,7 +215,7 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
 		/* We are sharing ->siglock with it_real_fn() */
 		if (hrtimer_try_to_cancel(timer) < 0) {
 			spin_unlock_irq(&tsk->sighand->siglock);
-			hrtimer_wait_for_timer(&tsk->signal->real_timer);
+			hrtimer_grab_expiry_lock(timer);
 			goto again;
 		}
 		expires = timeval_to_ktime(value->it_value);
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 8d95e8de98b2..765e700962ab 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -792,6 +792,7 @@ check_timers_list(struct list_head *timers,
 			return t->expires;
 
 		t->firing = 1;
+		t->firing_cpu = smp_processor_id();
 		list_move_tail(&t->entry, firing);
 	}
 
@@ -1138,6 +1139,20 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
 	return 0;
 }
 
+static DEFINE_PER_CPU(spinlock_t, cpu_timer_expiry_lock) = __SPIN_LOCK_UNLOCKED(cpu_timer_expiry_lock);
+
+void cpu_timers_grab_expiry_lock(struct k_itimer *timer)
+{
+	int cpu = timer->it.cpu.firing_cpu;
+
+	if (cpu >= 0) {
+		spinlock_t *expiry_lock = per_cpu_ptr(&cpu_timer_expiry_lock, cpu);
+
+		spin_lock_irq(expiry_lock);
+		spin_unlock_irq(expiry_lock);
+	}
+}
+
 /*
  * This is called from the timer interrupt handler.  The irq handler has
  * already updated our counts.  We need to check if any timers fire now.
@@ -1148,6 +1163,7 @@ static void __run_posix_cpu_timers(struct task_struct *tsk)
 	LIST_HEAD(firing);
 	struct k_itimer *timer, *next;
 	unsigned long flags;
+	spinlock_t *expiry_lock;
 
 	/*
 	 * The fast path checks that there are no expired thread or thread
@@ -1156,6 +1172,9 @@ static void __run_posix_cpu_timers(struct task_struct *tsk)
 	if (!fastpath_timer_check(tsk))
 		return;
 
+	expiry_lock = this_cpu_ptr(&cpu_timer_expiry_lock);
+	spin_lock(expiry_lock);
+
 	if (!lock_task_sighand(tsk, &flags))
 		return;
 	/*
@@ -1190,6 +1209,7 @@ static void __run_posix_cpu_timers(struct task_struct *tsk)
 		list_del_init(&timer->it.cpu.entry);
 		cpu_firing = timer->it.cpu.firing;
 		timer->it.cpu.firing = 0;
+		timer->it.cpu.firing_cpu = -1;
 		/*
 		 * The firing flag is -1 if we collided with a reset
 		 * of the timer, which already reported this
@@ -1199,6 +1219,7 @@ static void __run_posix_cpu_timers(struct task_struct *tsk)
 			cpu_timer_fire(timer);
 		spin_unlock(&timer->it_lock);
 	}
+	spin_unlock(expiry_lock);
 }
 
 #ifdef CONFIG_PREEMPT_RT_BASE
@@ -1466,6 +1487,8 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
 	spin_unlock_irq(&timer.it_lock);
 
 	while (error == TIMER_RETRY) {
+
+		cpu_timers_grab_expiry_lock(&timer);
 		/*
 		 * We need to handle case when timer was or is in the
 		 * middle of firing. In other cases we already freed
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index a5ec421e3437..c7e97d421590 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -821,25 +821,20 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
 		hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
 }
 
-/*
- * Protected by RCU!
- */
-static void timer_wait_for_callback(const struct k_clock *kc, struct k_itimer *timr)
+static int common_hrtimer_try_to_cancel(struct k_itimer *timr)
 {
-#ifdef CONFIG_PREEMPT_RT_FULL
-	if (kc->timer_arm == common_hrtimer_arm)
-		hrtimer_wait_for_timer(&timr->it.real.timer);
-	else if (kc == &alarm_clock)
-		hrtimer_wait_for_timer(&timr->it.alarm.alarmtimer.timer);
-	else
-		/* FIXME: Whacky hack for posix-cpu-timers */
-		schedule_timeout(1);
-#endif
+	return hrtimer_try_to_cancel(&timr->it.real.timer);
 }
 
-static int common_hrtimer_try_to_cancel(struct k_itimer *timr)
+static void timer_wait_for_callback(const struct k_clock *kc, struct k_itimer *timer)
 {
-	return hrtimer_try_to_cancel(&timr->it.real.timer);
+	if (kc->timer_arm == common_hrtimer_arm)
+		hrtimer_grab_expiry_lock(&timer->it.real.timer);
+	else if (kc == &alarm_clock)
+		hrtimer_grab_expiry_lock(&timer->it.alarm.alarmtimer.timer);
+	else
+		/* posix-cpu-timers */
+		cpu_timers_grab_expiry_lock(timer);
 }
 
 /* Set a POSIX.1b interval timer. */
@@ -901,21 +896,21 @@ static int do_timer_settime(timer_t timer_id, int flags,
 	if (!timr)
 		return -EINVAL;
 
-	rcu_read_lock();
 	kc = timr->kclock;
 	if (WARN_ON_ONCE(!kc || !kc->timer_set))
 		error = -EINVAL;
 	else
 		error = kc->timer_set(timr, flags, new_spec64, old_spec64);
 
-	unlock_timer(timr, flag);
 	if (error == TIMER_RETRY) {
+		rcu_read_lock();
+		unlock_timer(timr, flag);
 		timer_wait_for_callback(kc, timr);
-		old_spec64 = NULL;	// We already got the old time...
 		rcu_read_unlock();
+		old_spec64 = NULL;	// We already got the old time...
 		goto retry;
 	}
-	rcu_read_unlock();
+	unlock_timer(timr, flag);
 
 	return error;
 }
@@ -977,13 +972,21 @@ int common_timer_del(struct k_itimer *timer)
 	return 0;
 }
 
-static inline int timer_delete_hook(struct k_itimer *timer)
+static int timer_delete_hook(struct k_itimer *timer)
 {
 	const struct k_clock *kc = timer->kclock;
+	int ret;
 
 	if (WARN_ON_ONCE(!kc || !kc->timer_del))
 		return -EINVAL;
-	return kc->timer_del(timer);
+	ret = kc->timer_del(timer);
+	if (ret == TIMER_RETRY) {
+		rcu_read_lock();
+		spin_unlock_irq(&timer->it_lock);
+		timer_wait_for_callback(kc, timer);
+		rcu_read_unlock();
+	}
+	return ret;
 }
 
 /* Delete a POSIX.1b interval timer. */
@@ -997,15 +1000,8 @@ SYSCALL_DEFINE1(timer_delete, timer_t, timer_id)
 	if (!timer)
 		return -EINVAL;
 
-	rcu_read_lock();
-	if (timer_delete_hook(timer) == TIMER_RETRY) {
-		unlock_timer(timer, flags);
-		timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
-					timer);
-		rcu_read_unlock();
+	if (timer_delete_hook(timer) == TIMER_RETRY)
 		goto retry_delete;
-	}
-	rcu_read_unlock();
 
 	spin_lock(&current->sighand->siglock);
 	list_del(&timer->list);
@@ -1031,20 +1027,9 @@ static void itimer_delete(struct k_itimer *timer)
 retry_delete:
 	spin_lock_irqsave(&timer->it_lock, flags);
 
-	/* On RT we can race with a deletion */
-	if (!timer->it_signal) {
-		unlock_timer(timer, flags);
-		return;
-	}
-
-	if (timer_delete_hook(timer) == TIMER_RETRY) {
-		rcu_read_lock();
-		unlock_timer(timer, flags);
-		timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
-					timer);
-		rcu_read_unlock();
+	if (timer_delete_hook(timer) == TIMER_RETRY)
 		goto retry_delete;
-	}
+
 	list_del(&timer->list);
 
 	/*
 	 * This keeps any tasks waiting on the spin lock from thinking
diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h
index ddb21145211a..725bd230a8db 100644
--- a/kernel/time/posix-timers.h
+++ b/kernel/time/posix-timers.h
@@ -32,6 +32,8 @@ extern const struct k_clock clock_process;
 extern const struct k_clock clock_thread;
 extern const struct k_clock alarm_clock;
 
+extern void cpu_timers_grab_expiry_lock(struct k_itimer *timer);
+
 int posix_timer_event(struct k_itimer *timr, int si_private);
 
 void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting);
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index bdbfd52b1cfd..716364bbe5be 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -45,7 +45,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 
@@ -199,9 +198,7 @@ EXPORT_SYMBOL(jiffies_64);
 struct timer_base {
 	raw_spinlock_t		lock;
 	struct timer_list	*running_timer;
-#ifdef CONFIG_PREEMPT_RT_FULL
-	struct swait_queue_head	wait_for_running_timer;
-#endif
+	spinlock_t		expiry_lock;
 	unsigned long		clk;
 	unsigned long		next_expiry;
 	unsigned int		cpu;
@@ -1201,33 +1198,6 @@ void add_timer_on(struct timer_list *timer, int cpu)
 }
 EXPORT_SYMBOL_GPL(add_timer_on);
 
-#ifdef CONFIG_PREEMPT_RT_FULL
-/*
- * Wait for a running timer
- */
-static void wait_for_running_timer(struct timer_list *timer)
-{
-	struct timer_base *base;
-	u32 tf = timer->flags;
-
-	if (tf & TIMER_MIGRATING)
-		return;
-
-	base = get_timer_base(tf);
-	swait_event_exclusive(base->wait_for_running_timer,
-			      base->running_timer != timer);
-}
-
-# define wakeup_timer_waiters(b)	swake_up_all(&(b)->wait_for_running_timer)
-#else
-static inline void wait_for_running_timer(struct timer_list *timer)
-{
-	cpu_relax();
-}
-
-# define wakeup_timer_waiters(b)	do { } while (0)
-#endif
-
 /**
  * del_timer - deactivate a timer.
  * @timer: the timer to be deactivated
@@ -1257,14 +1227,8 @@ int del_timer(struct timer_list *timer)
 }
 EXPORT_SYMBOL(del_timer);
 
-/**
- * try_to_del_timer_sync - Try to deactivate a timer
- * @timer: timer to delete
- *
- * This function tries to deactivate a timer. Upon successful (ret >= 0)
- * exit the timer is not queued and the handler is not running on any CPU.
- */
-int try_to_del_timer_sync(struct timer_list *timer)
+static int __try_to_del_timer_sync(struct timer_list *timer,
+				   struct timer_base **basep)
 {
 	struct timer_base *base;
 	unsigned long flags;
@@ -1272,7 +1236,7 @@ int try_to_del_timer_sync(struct timer_list *timer)
 
 	debug_assert_init(timer);
 
-	base = lock_timer_base(timer, &flags);
+	*basep = base = lock_timer_base(timer, &flags);
 
 	if (base->running_timer != timer)
 		ret = detach_if_pending(timer, base, true);
@@ -1281,9 +1245,42 @@ int try_to_del_timer_sync(struct timer_list *timer)
 
 	return ret;
 }
+
+/**
+ * try_to_del_timer_sync - Try to deactivate a timer
+ * @timer: timer to delete
+ *
+ * This function tries to deactivate a timer. Upon successful (ret >= 0)
+ * exit the timer is not queued and the handler is not running on any CPU.
+ */
+int try_to_del_timer_sync(struct timer_list *timer)
+{
+	struct timer_base *base;
+
+	return __try_to_del_timer_sync(timer, &base);
+}
 EXPORT_SYMBOL(try_to_del_timer_sync);
 
 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
+static int __del_timer_sync(struct timer_list *timer)
+{
+	struct timer_base *base;
+	int ret;
+
+	for (;;) {
+		ret = __try_to_del_timer_sync(timer, &base);
+		if (ret >= 0)
+			return ret;
+
+		/*
+		 * When the lock can be acquired, the base is no longer
+		 * expiring timers, so this timer is no longer running.
+		 */
+		spin_lock(&base->expiry_lock);
+		spin_unlock(&base->expiry_lock);
+	}
+}
+
 /**
  * del_timer_sync - deactivate a timer and wait for the handler to finish.
  * @timer: the timer to be deactivated
@@ -1339,12 +1336,8 @@ int del_timer_sync(struct timer_list *timer)
 	 * could lead to deadlock.
 	 */
 	WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE));
-	for (;;) {
-		int ret = try_to_del_timer_sync(timer);
-		if (ret >= 0)
-			return ret;
-		wait_for_running_timer(timer);
-	}
+
+	return __del_timer_sync(timer);
 }
 EXPORT_SYMBOL(del_timer_sync);
 #endif
@@ -1409,11 +1402,15 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
 			raw_spin_unlock(&base->lock);
 			call_timer_fn(timer, fn);
 			base->running_timer = NULL;
+			spin_unlock(&base->expiry_lock);
+			spin_lock(&base->expiry_lock);
 			raw_spin_lock(&base->lock);
 		} else {
 			raw_spin_unlock_irq(&base->lock);
 			call_timer_fn(timer, fn);
 			base->running_timer = NULL;
+			spin_unlock(&base->expiry_lock);
+			spin_lock(&base->expiry_lock);
 			raw_spin_lock_irq(&base->lock);
 		}
 	}
@@ -1717,6 +1714,7 @@ static inline void __run_timers(struct timer_base *base)
 	if (!time_after_eq(jiffies, base->clk))
 		return;
 
+	spin_lock(&base->expiry_lock);
 	raw_spin_lock_irq(&base->lock);
 
 	/*
@@ -1744,7 +1742,7 @@ static inline void __run_timers(struct timer_base *base)
 		expire_timers(base, heads + levels);
 	}
 	raw_spin_unlock_irq(&base->lock);
-	wakeup_timer_waiters(base);
+	spin_unlock(&base->expiry_lock);
 }
 
 /*
@@ -1991,9 +1989,7 @@ static void __init init_timer_cpu(int cpu)
 		base->cpu = cpu;
 		raw_spin_lock_init(&base->lock);
 		base->clk = jiffies;
-#ifdef CONFIG_PREEMPT_RT_FULL
-		init_swait_queue_head(&base->wait_for_running_timer);
-#endif
+		spin_lock_init(&base->expiry_lock);
 	}
 }
 
-- 
Gitee
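
[ Note: to see how the timer.c pieces above compose, here is a toy
  userspace model of the __del_timer_sync() retry loop -- invented
  names throughout; base_lock stands in for the raw base->lock and
  expiry_lock for base->expiry_lock. ]

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

struct toy_timer {
	int pending;			/* queued and not yet expired */
	int running;			/* callback currently executing */
	pthread_mutex_t base_lock;	/* protects pending/running */
	pthread_mutex_t expiry_lock;	/* held across the callback */
};

/* -1 while the callback runs, otherwise 1 (dequeued) or 0 (idle) */
static int try_to_cancel(struct toy_timer *t)
{
	int ret;

	pthread_mutex_lock(&t->base_lock);
	if (t->running) {
		ret = -1;
	} else {
		ret = t->pending;
		t->pending = 0;
	}
	pthread_mutex_unlock(&t->base_lock);
	return ret;
}

static int cancel_sync(struct toy_timer *t)
{
	for (;;) {
		int ret = try_to_cancel(t);

		if (ret >= 0)
			return ret;
		/* wait out the running callback by cycling the lock */
		pthread_mutex_lock(&t->expiry_lock);
		pthread_mutex_unlock(&t->expiry_lock);
	}
}

static void *expiry(void *arg)
{
	struct toy_timer *t = arg;

	pthread_mutex_lock(&t->expiry_lock);
	pthread_mutex_lock(&t->base_lock);
	if (t->pending) {
		t->pending = 0;
		t->running = 1;
		pthread_mutex_unlock(&t->base_lock);
		usleep(50 * 1000);		/* the "callback" */
		pthread_mutex_lock(&t->base_lock);
		t->running = 0;
	}
	pthread_mutex_unlock(&t->base_lock);
	pthread_mutex_unlock(&t->expiry_lock);
	return NULL;
}

int main(void)
{
	struct toy_timer t = {
		.pending	= 1,
		.running	= 0,
		.base_lock	= PTHREAD_MUTEX_INITIALIZER,
		.expiry_lock	= PTHREAD_MUTEX_INITIALIZER,
	};
	pthread_t th;

	pthread_create(&th, NULL, expiry, &t);
	printf("cancel_sync() -> %d\n", cancel_sync(&t));
	pthread_join(th, NULL);
	return 0;
}

[ Note how expire_timers() above drops and immediately retakes
  expiry_lock after every callback: a canceller is then blocked for at
  most one callback invocation, not a whole expiry batch -- that is the
  point of the "Drop expiry lock after each timer invocation" patch
  folded into this commit. ]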