From 4c3970571d7fad3b9f98dd6925c531ff63a62467 Mon Sep 17 00:00:00 2001
From: "yang.yang29@zte.com.cn"
Date: Fri, 9 Jun 2023 11:05:13 +0800
Subject: [PATCH 1/4] irqwork: push most work into softirq context

commit 0f563ce4fffc87e5f71051a3321ffb675811f509 upstream.

Initially we deferred all irqwork into softirq because we didn't want
the latency spikes if perf or another user was busy and delayed the RT
task. The NOHZ trigger (nohz_full_kick_work) was the first user that
did not work as expected if it did not run in the original irqwork
context, so we had to bring it back somehow for it. push_irq_work_func
is the second one that requires this.

This patch adds the IRQ_WORK_HARD_IRQ flag which makes sure the
callback runs in raw-irq context. Everything else is deferred into
softirq context. Without -RT we have the original behavior.

This patch incorporates tglx's original work, reworked a little to
bring back arch_irq_work_raise() where possible, and a few fixes from
Steven Rostedt and Mike Galbraith.

[bigeasy: melt tglx's irq_work_tick_soft() which splits irq_work_tick()
 into a hard and soft variant]
Signed-off-by: Sebastian Andrzej Siewior
---
 include/linux/irq_work.h |  6 +++++
 kernel/irq_work.c        | 58 +++++++++++++++++++++++++++++++---------
 kernel/sched/topology.c  |  1 +
 kernel/time/timer.c      |  2 ++
 4 files changed, 55 insertions(+), 12 deletions(-)

diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
index 1603d24a6486..7d181ced3746 100644
--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
@@ -58,4 +58,10 @@ static inline void irq_work_run(void) { }
 static inline void irq_work_single(void *arg) { }
 #endif
 
+#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT)
+void irq_work_tick_soft(void);
+#else
+static inline void irq_work_tick_soft(void) { }
+#endif
+
 #endif /* _LINUX_IRQ_WORK_H */
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index eca83965b631..8183d30e1bb1 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -18,6 +18,7 @@
 #include <linux/cpu.h>
 #include <linux/notifier.h>
 #include <linux/smp.h>
+#include <linux/interrupt.h>
 #include <asm/processor.h>
 
 
@@ -52,13 +53,19 @@ void __weak arch_irq_work_raise(void)
 /* Enqueue on current CPU, work must already be claimed and preempt disabled */
 static void __irq_work_queue_local(struct irq_work *work)
 {
+	struct llist_head *list;
+	bool lazy_work, realtime = IS_ENABLED(CONFIG_PREEMPT_RT);
+
+	lazy_work = atomic_read(&work->flags) & IRQ_WORK_LAZY;
+
 	/* If the work is "lazy", handle it from next tick if any */
-	if (atomic_read(&work->flags) & IRQ_WORK_LAZY) {
-		if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
-		    tick_nohz_tick_stopped())
-			arch_irq_work_raise();
-	} else {
-		if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
+	if (lazy_work || (realtime && !(atomic_read(&work->flags) & IRQ_WORK_HARD_IRQ)))
+		list = this_cpu_ptr(&lazy_list);
+	else
+		list = this_cpu_ptr(&raised_list);
+
+	if (llist_add(&work->llnode, list)) {
+		if (!lazy_work || tick_nohz_tick_stopped())
 			arch_irq_work_raise();
 	}
 }
@@ -102,7 +109,13 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
 	if (cpu != smp_processor_id()) {
 		/* Arch remote IPI send/receive backend aren't NMI safe */
 		WARN_ON_ONCE(in_nmi());
-		__smp_call_single_queue(cpu, &work->llnode);
+
+		if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(atomic_read(&work->flags) & IRQ_WORK_HARD_IRQ)) {
+			if (llist_add(&work->llnode, &per_cpu(lazy_list, cpu)))
+				arch_send_call_function_single_ipi(cpu);
+		} else {
+			__smp_call_single_queue(cpu, &work->llnode);
+		}
 	} else {
 		__irq_work_queue_local(work);
 	}
@@ -120,9 +133,8 @@ bool irq_work_needs_cpu(void)
 	raised = this_cpu_ptr(&raised_list);
 	lazy = this_cpu_ptr(&lazy_list);
 
-	if (llist_empty(raised) || arch_irq_work_has_interrupt())
-		if (llist_empty(lazy))
-			return false;
+	if (llist_empty(raised) && llist_empty(lazy))
+		return false;
 
 	/* All work should have been flushed before going offline */
 	WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
@@ -160,8 +172,12 @@ static void irq_work_run_list(struct llist_head *list)
 	struct irq_work *work, *tmp;
 	struct llist_node *llnode;
 
+#ifndef CONFIG_PREEMPT_RT
+	/*
+	 * nort: On RT IRQ-work may run in SOFTIRQ context.
+	 */
 	BUG_ON(!irqs_disabled());
-
+#endif
 	if (llist_empty(list))
 		return;
 
@@ -177,7 +193,16 @@ static void irq_work_run_list(struct llist_head *list)
 void irq_work_run(void)
 {
 	irq_work_run_list(this_cpu_ptr(&raised_list));
-	irq_work_run_list(this_cpu_ptr(&lazy_list));
+	if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+		/*
+		 * NOTE: we raise softirq via IPI for safety,
+		 * and execute in irq_work_tick() to move the
+		 * overhead from hard to soft irq context.
+		 */
+		if (!llist_empty(this_cpu_ptr(&lazy_list)))
+			raise_softirq(TIMER_SOFTIRQ);
+	} else
+		irq_work_run_list(this_cpu_ptr(&lazy_list));
 }
 EXPORT_SYMBOL_GPL(irq_work_run);
 
@@ -187,8 +212,17 @@ void irq_work_tick(void)
 
 	if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
 		irq_work_run_list(raised);
+
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		irq_work_run_list(this_cpu_ptr(&lazy_list));
+}
+
+#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT)
+void irq_work_tick_soft(void)
+{
 	irq_work_run_list(this_cpu_ptr(&lazy_list));
 }
+#endif
 
 /*
  * Synchronize against the irq_work @entry, ensures the entry is not
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 94f1e6299aa1..1cb071dd908c 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -514,6 +514,7 @@ static int init_rootdomain(struct root_domain *rd)
 	rd->rto_cpu = -1;
 	raw_spin_lock_init(&rd->rto_lock);
 	init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
+	atomic_or(IRQ_WORK_HARD_IRQ, &rd->rto_push_work.flags);
 #endif
 
 	init_dl_bw(&rd->dl_bw);
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index a4fdc7cfb723..1cad0efd635c 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1770,6 +1770,8 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
 {
 	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
 
+	irq_work_tick_soft();
+
 	__run_timers(base);
 	if (IS_ENABLED(CONFIG_NO_HZ_COMMON))
 		__run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
-- 
Gitee

From 13cdec26cd009087cab2a2caa2a5f42f76e7ef70 Mon Sep 17 00:00:00 2001
From: "yang.yang29@zte.com.cn"
Date: Fri, 9 Jun 2023 11:06:02 +0800
Subject: [PATCH 2/4] panic: skip get_random_bytes for RT_FULL in init_oops_id

commit 198b6c1bbe9771b43fb349b92685fedaa0779808 upstream.

Disable on -RT. If this is invoked from irq context we will have
problems acquiring the sleeping lock.
Signed-off-by: Thomas Gleixner
---
 kernel/panic.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/panic.c b/kernel/panic.c
index 1198b962402f..1df51aa0138b 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -549,9 +549,11 @@ static u64 oops_id;
 
 static int init_oops_id(void)
 {
+#ifndef CONFIG_PREEMPT_RT
 	if (!oops_id)
 		get_random_bytes(&oops_id, sizeof(oops_id));
 	else
+#endif
 		oops_id++;
 
 	return 0;
-- 
Gitee

From 39774af5d2f4cbb405f98204ea7f0f1327fc4ec5 Mon Sep 17 00:00:00 2001
From: "yang.yang29@zte.com.cn"
Date: Fri, 9 Jun 2023 11:06:24 +0800
Subject: [PATCH 3/4] x86: stackprotector: Avoid random pool on rt

commit 36f46094a1007e3d5d19a6239dde7d7fe8fca7f2 upstream.

CPU bringup calls into the random pool to initialize the stack
canary. During boot that works nicely even on RT as the might-sleep
checks are disabled. During CPU hotplug the might-sleep checks
trigger. Making the locks in random raw is a major PITA, so avoiding
the call on RT is the only sensible solution. This is basically the
same randomness which we get during boot where the random pool has no
entropy and we rely on the TSC randomness.

Reported-by: Carsten Emde
Signed-off-by: Thomas Gleixner
---
 arch/x86/include/asm/stackprotector.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
index 7fb482f0f25b..3df0a95c9e13 100644
--- a/arch/x86/include/asm/stackprotector.h
+++ b/arch/x86/include/asm/stackprotector.h
@@ -65,7 +65,7 @@
  */
 static __always_inline void boot_init_stack_canary(void)
 {
-	u64 canary;
+	u64 canary = 0;
 	u64 tsc;
 
 #ifdef CONFIG_X86_64
@@ -76,8 +76,14 @@ static __always_inline void boot_init_stack_canary(void)
 	 * of randomness. The TSC only matters for very early init,
 	 * there it already has some randomness on most systems. Later
 	 * on during the bootup the random pool has true entropy too.
+	 * For preempt-rt we need to weaken the randomness a bit, as
+	 * we can't call into the random generator from atomic context
+	 * due to locking constraints. We just leave canary
+	 * uninitialized and use the TSC based randomness on top of it.
 	 */
+#ifndef CONFIG_PREEMPT_RT
 	get_random_bytes(&canary, sizeof(canary));
+#endif
 	tsc = rdtsc();
 	canary += tsc + (tsc << 32UL);
 	canary &= CANARY_MASK;
-- 
Gitee

From 4a361a4aef59fbdf6e00ccff92a65d2a627ac974 Mon Sep 17 00:00:00 2001
From: "yang.yang29@zte.com.cn"
Date: Fri, 9 Jun 2023 11:06:34 +0800
Subject: [PATCH 4/4] net: Remove preemption disabling in netif_rx()

commit b29ad8755a7901dda33a9134d7b223e5eb2bc40d upstream.

1) enqueue_to_backlog() (called from netif_rx) should be bound to a
   particular CPU. This can be achieved by disabling migration. No
   need to disable preemption.

2) Fixes crash "BUG: scheduling while atomic: ksoftirqd" in case of
   RT. If preemption is disabled, enqueue_to_backlog() is called in
   atomic context. And if backlog exceeds its count, kfree_skb() is
   called. But in RT, kfree_skb() might get scheduled out, so it
   expects a non-atomic context.

- Replace preempt_enable(), preempt_disable() with migrate_enable(),
  migrate_disable() respectively.
- Replace get_cpu(), put_cpu() with get_cpu_light(), put_cpu_light()
  respectively.

Signed-off-by: Priyanka Jain
Acked-by: Rajan Srivastava
Cc: 
Link: http://lkml.kernel.org/r/1337227511-2271-1-git-send-email-Priyanka.Jain@freescale.com
Signed-off-by: Thomas Gleixner
[bigeasy: Remove assumption about migrate_disable() from the description.]
Signed-off-by: Sebastian Andrzej Siewior
---
 net/core/dev.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 23a1534fd991..016b1bf29ca5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4824,7 +4824,7 @@ static int netif_rx_internal(struct sk_buff *skb)
 		struct rps_dev_flow voidflow, *rflow = &voidflow;
 		int cpu;
 
-		preempt_disable();
+		migrate_disable();
 		rcu_read_lock();
 
 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
@@ -4834,14 +4834,14 @@ static int netif_rx_internal(struct sk_buff *skb)
 		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 
 		rcu_read_unlock();
-		preempt_enable();
+		migrate_enable();
 	} else
 #endif
 	{
 		unsigned int qtail;
 
-		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
-		put_cpu();
+		ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail);
+		put_cpu_light();
 	}
 	return ret;
 }
-- 
Gitee
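
The sketch below is a quick illustration of the mechanism added in patch 1/4; it is not
part of the series itself. It shows how a caller could opt an irq_work item out of the
softirq deferral on PREEMPT_RT by setting IRQ_WORK_HARD_IRQ, mirroring what the series
does for rto_push_work in kernel/sched/topology.c. The work item and callback names are
made up for the example, and it assumes a kernel carrying this series, where
irq_work::flags is an atomic_t.

#include <linux/irq_work.h>
#include <linux/atomic.h>

/* Hypothetical callback; with IRQ_WORK_HARD_IRQ set it stays in raw IRQ context on RT. */
static void example_hard_cb(struct irq_work *work)
{
	/* Keep this short and non-sleeping: it runs with interrupts disabled. */
}

static struct irq_work example_work;

static void example_init(void)
{
	init_irq_work(&example_work, example_hard_cb);
	/* Opt out of the lazy_list/softirq path added by this series. */
	atomic_or(IRQ_WORK_HARD_IRQ, &example_work.flags);
}

static void example_trigger(void)
{
	/* Queues on this CPU's raised_list and raises the IRQ-work interrupt where supported. */
	irq_work_queue(&example_work);
}

Without the flag, a queued item on an RT kernel lands on lazy_list and is executed from
irq_work_tick_soft() in the TIMER_SOFTIRQ, which is exactly the deferral the series adds.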