From f74f3055cbe3c8d778a0e54aaf0542f23e9229b7 Mon Sep 17 00:00:00 2001
From: meganz009
Date: Fri, 9 Jun 2023 13:41:47 +0800
Subject: [PATCH 1/8] net: Use skbufhead with raw lock

commit fc3e6e30f2ab8454e5d71c17c1893c38785c1271 upstream.

Use the rps lock as a raw lock so we can keep the irq-off regions. This
keeps the latency low. However, we can't kfree() from this context, so we
defer the freeing to the softirq and use the tofree_queue list for it
(similar to process_queue).

Signed-off-by: Thomas Gleixner
---
 include/linux/netdevice.h |  1 +
 include/linux/skbuff.h    |  7 +++++++
 net/core/dev.c            | 33 +++++++++++++++++++++++++--------
 3 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9dc6c3bd2d69..16fe303edc7f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2977,6 +2977,7 @@ struct softnet_data {
 	unsigned int		dropped;
 	struct sk_buff_head	input_pkt_queue;
 	struct napi_struct	backlog;
+	struct sk_buff_head	tofree_queue;
 
 };
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 634bdf3ce111..c6526c50bd20 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -287,6 +287,7 @@ struct sk_buff_head {
 	__u32		qlen;
 
 	spinlock_t	lock;
+	raw_spinlock_t	raw_lock;
 };
 
 struct sk_buff;
@@ -1736,6 +1737,12 @@ static inline void skb_queue_head_init(struct sk_buff_head *list)
 	__skb_queue_head_init(list);
 }
 
+static inline void skb_queue_head_init_raw(struct sk_buff_head *list)
+{
+	raw_spin_lock_init(&list->raw_lock);
+	__skb_queue_head_init(list);
+}
+
 static inline void skb_queue_head_init_class(struct sk_buff_head *list,
 		struct lock_class_key *class)
 {
diff --git a/net/core/dev.c b/net/core/dev.c
index 1a065b881265..84ec21be3022 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -218,14 +218,14 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 static inline void rps_lock(struct softnet_data *sd)
 {
 #ifdef CONFIG_RPS
-	spin_lock(&sd->input_pkt_queue.lock);
+	raw_spin_lock(&sd->input_pkt_queue.raw_lock);
 #endif
 }
 
 static inline void rps_unlock(struct softnet_data *sd)
 {
 #ifdef CONFIG_RPS
-	spin_unlock(&sd->input_pkt_queue.lock);
+	raw_spin_unlock(&sd->input_pkt_queue.raw_lock);
 #endif
 }
 
@@ -5293,7 +5293,7 @@ static void flush_backlog(struct work_struct *work)
 	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
 		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
 			__skb_unlink(skb, &sd->input_pkt_queue);
-			kfree_skb(skb);
+			__skb_queue_tail(&sd->tofree_queue, skb);
 			input_queue_head_incr(sd);
 		}
 	}
@@ -5303,11 +5303,14 @@ static void flush_backlog(struct work_struct *work)
 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
 		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
 			__skb_unlink(skb, &sd->process_queue);
-			kfree_skb(skb);
+			__skb_queue_tail(&sd->tofree_queue, skb);
 			input_queue_head_incr(sd);
 		}
 	}
+	if (!skb_queue_empty(&sd->tofree_queue))
+		raise_softirq_irqoff(NET_RX_SOFTIRQ);
 	local_bh_enable();
+
 }
 
 static void flush_all_backlogs(void)
@@ -5886,7 +5889,9 @@ static int process_backlog(struct napi_struct *napi, int quota)
 	while (again) {
 		struct sk_buff *skb;
 
+		local_irq_disable();
 		while ((skb = __skb_dequeue(&sd->process_queue))) {
+			local_irq_enable();
 			rcu_read_lock();
 			__netif_receive_skb(skb);
 			rcu_read_unlock();
@@ -5894,9 +5899,9 @@ static int process_backlog(struct napi_struct *napi, int quota)
 			if (++work >= quota)
 				return work;
 
+			local_irq_disable();
 		}
 
-		local_irq_disable();
 		rps_lock(sd);
 		if (skb_queue_empty(&sd->input_pkt_queue)) {
 			/*
@@ -6374,13 +6379,21 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
 	unsigned long time_limit = jiffies +
 		usecs_to_jiffies(netdev_budget_usecs);
 	int budget = netdev_budget;
+	struct sk_buff_head tofree_q;
+	struct sk_buff *skb;
 	LIST_HEAD(list);
 	LIST_HEAD(repoll);
 
+	__skb_queue_head_init(&tofree_q);
+
 	local_irq_disable();
+	skb_queue_splice_init(&sd->tofree_queue, &tofree_q);
 	list_splice_init(&sd->poll_list, &list);
 	local_irq_enable();
 
+	while ((skb = __skb_dequeue(&tofree_q)))
+		kfree_skb(skb);
+
 	for (;;) {
 		struct napi_struct *n;
@@ -9372,10 +9385,13 @@ static int dev_cpu_dead(unsigned int oldcpu)
 		netif_rx_ni(skb);
 		input_queue_head_incr(oldsd);
 	}
-	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
+	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
 		netif_rx_ni(skb);
 		input_queue_head_incr(oldsd);
 	}
+	while ((skb = __skb_dequeue(&oldsd->tofree_queue))) {
+		kfree_skb(skb);
+	}
 
 	return 0;
 }
@@ -9686,8 +9702,9 @@ static int __init net_dev_init(void)
 
 		INIT_WORK(flush, flush_backlog);
 
-		skb_queue_head_init(&sd->input_pkt_queue);
-		skb_queue_head_init(&sd->process_queue);
+		skb_queue_head_init_raw(&sd->input_pkt_queue);
+		skb_queue_head_init_raw(&sd->process_queue);
+		skb_queue_head_init_raw(&sd->tofree_queue);
 #ifdef CONFIG_XFRM_OFFLOAD
 		skb_queue_head_init(&sd->xfrm_backlog);
 #endif
--
Gitee
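The core trick in this patch is "park now, free later": contexts that run under the raw lock (and therefore with interrupts off even on -RT) only queue skbs, and the actual kfree_skb() happens later from softirq context. The userspace sketch below models that shape with invented names and a pthread mutex standing in for the raw lock; it is an illustration of the pattern, not kernel code.

/* Hypothetical userspace model of the tofree_queue pattern above: a
 * constrained context parks buffers on a deferred list; a later, less
 * constrained context drains and frees them. All names are invented. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct buf {
	struct buf *next;
	int id;
};

static struct buf *tofree_head;		/* models sd->tofree_queue */
static pthread_mutex_t tofree_lock = PTHREAD_MUTEX_INITIALIZER;

/* "irq-off"-like context: may unlink buffers but must not free() them */
static void discard_buf(struct buf *b)
{
	pthread_mutex_lock(&tofree_lock);
	b->next = tofree_head;		/* park instead of free */
	tofree_head = b;
	pthread_mutex_unlock(&tofree_lock);
}

/* "softirq"-like context: safe to free, drains the deferred list */
static void drain_tofree(void)
{
	struct buf *b;

	pthread_mutex_lock(&tofree_lock);
	b = tofree_head;		/* splice the whole list out... */
	tofree_head = NULL;
	pthread_mutex_unlock(&tofree_lock);

	while (b) {			/* ...and free outside the lock */
		struct buf *next = b->next;
		printf("freeing buf %d\n", b->id);
		free(b);
		b = next;
	}
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct buf *b = malloc(sizeof(*b));
		b->id = i;
		discard_buf(b);
	}
	drain_tofree();
	return 0;
}

Note how drain_tofree() splices the whole list out under the lock and frees outside of it, mirroring what net_rx_action() does with skb_queue_splice_init().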
From 164e36f0237f654b20458da413f7fdb0adf65096 Mon Sep 17 00:00:00 2001
From: meganz009
Date: Fri, 9 Jun 2023 13:42:03 +0800
Subject: [PATCH 2/8] net: move xmit_recursion to per-task variable on -RT

commit c716d0b63abba6d3d382d6dc6ba74dfa4198aa10 upstream.

A softirq on -RT can be preempted. That means one task is in
__dev_queue_xmit(), gets preempted, and another task may enter
__dev_queue_xmit() as well. netperf together with a bridge device will
then trigger the `recursion alert` because each task increments the
xmit_recursion variable, which is per-CPU. A virtual device like br0 is
required to trigger this warning.

This patch moves the lock owner and counter to be per-task instead of
per-CPU so it counts the recursion properly on -RT. The owner is also a
task now and not a CPU number.

Cc: stable-rt@vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior
---
 include/linux/netdevice.h | 95 ++++++++++++++++++++++++++++++++++++---
 include/linux/sched.h     |  3 ++
 net/core/dev.c            | 15 ++++---
 net/core/filter.c         |  6 +--
 4 files changed, 104 insertions(+), 15 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 16fe303edc7f..f6c98f9eb00b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -583,7 +583,11 @@ struct netdev_queue {
 	 * write-mostly part
 	 */
 	spinlock_t	_xmit_lock ____cacheline_aligned_in_smp;
+#ifdef CONFIG_PREEMPT_RT_FULL
+	struct task_struct	*xmit_lock_owner;
+#else
 	int			xmit_lock_owner;
+#endif
 	/*
 	 * Time (in jiffies) of last Tx
 	 */
@@ -2615,14 +2619,53 @@ void netdev_freemem(struct net_device *dev);
 void synchronize_net(void);
 int init_dummy_netdev(struct net_device *dev);
 
-DECLARE_PER_CPU(int, xmit_recursion);
 #define XMIT_RECURSION_LIMIT	10
+#ifdef CONFIG_PREEMPT_RT_FULL
+static inline int dev_recursion_level(void)
+{
+	return current->xmit_recursion;
+}
+
+static inline int xmit_rec_read(void)
+{
+	return current->xmit_recursion;
+}
+
+static inline void xmit_rec_inc(void)
+{
+	current->xmit_recursion++;
+}
+
+static inline void xmit_rec_dec(void)
+{
+	current->xmit_recursion--;
+}
+
+#else
+
+DECLARE_PER_CPU(int, xmit_recursion);
 
 static inline int dev_recursion_level(void)
 {
 	return this_cpu_read(xmit_recursion);
 }
 
+static inline int xmit_rec_read(void)
+{
+	return __this_cpu_read(xmit_recursion);
+}
+
+static inline void xmit_rec_inc(void)
+{
+	__this_cpu_inc(xmit_recursion);
+}
+
+static inline void xmit_rec_dec(void)
+{
+	__this_cpu_dec(xmit_recursion);
+}
+#endif
+
 struct net_device *dev_get_by_index(struct net *net, int ifindex);
 struct net_device *__dev_get_by_index(struct net *net, int ifindex);
 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex);
@@ -3798,10 +3841,48 @@ static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits)
 	return (1U << debug_value) - 1;
 }
 
+#ifdef CONFIG_PREEMPT_RT_FULL
+static inline void netdev_queue_set_owner(struct netdev_queue *txq, int cpu)
+{
+	txq->xmit_lock_owner = current;
+}
+
+static inline void netdev_queue_clear_owner(struct netdev_queue *txq)
+{
+	txq->xmit_lock_owner = NULL;
+}
+
+static inline bool netdev_queue_has_owner(struct netdev_queue *txq)
+{
+	if (txq->xmit_lock_owner != NULL)
+		return true;
+	return false;
+}
+
+#else
+
+static inline void netdev_queue_set_owner(struct netdev_queue *txq, int cpu)
+{
+	txq->xmit_lock_owner = cpu;
+}
+
+static inline void netdev_queue_clear_owner(struct netdev_queue *txq)
+{
+	txq->xmit_lock_owner = -1;
+}
+
+static inline bool netdev_queue_has_owner(struct netdev_queue *txq)
+{
+	if (txq->xmit_lock_owner != -1)
+		return true;
+	return false;
+}
+#endif
+
 static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu)
 {
 	spin_lock(&txq->_xmit_lock);
-	txq->xmit_lock_owner = cpu;
+	netdev_queue_set_owner(txq, cpu);
 }
 
 static inline bool __netif_tx_acquire(struct netdev_queue *txq)
@@ -3818,32 +3899,32 @@ static inline void __netif_tx_release(struct netdev_queue *txq)
 static inline void __netif_tx_lock_bh(struct netdev_queue *txq)
 {
 	spin_lock_bh(&txq->_xmit_lock);
-	txq->xmit_lock_owner = smp_processor_id();
+	netdev_queue_set_owner(txq, smp_processor_id());
 }
 
 static inline bool __netif_tx_trylock(struct netdev_queue *txq)
 {
 	bool ok = spin_trylock(&txq->_xmit_lock);
 
 	if (likely(ok))
-		txq->xmit_lock_owner = smp_processor_id();
+		netdev_queue_set_owner(txq, smp_processor_id());
 	return ok;
 }
 
 static inline void __netif_tx_unlock(struct netdev_queue *txq)
 {
-	txq->xmit_lock_owner = -1;
+	netdev_queue_clear_owner(txq);
 	spin_unlock(&txq->_xmit_lock);
 }
 
 static inline void __netif_tx_unlock_bh(struct netdev_queue *txq)
 {
-	txq->xmit_lock_owner = -1;
+	netdev_queue_clear_owner(txq);
 	spin_unlock_bh(&txq->_xmit_lock);
 }
 
 static inline void txq_trans_update(struct netdev_queue *txq)
 {
-	if (txq->xmit_lock_owner != -1)
+	if (netdev_queue_has_owner(txq))
 		txq->trans_start = jiffies;
 }
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d3e60ce943b0..1f3cabb82e50 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1311,6 +1311,9 @@ struct task_struct {
 #endif
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 	unsigned long			task_state_change;
+#endif
+#ifdef CONFIG_PREEMPT_RT_FULL
+	int				xmit_recursion;
 #endif
 	int				pagefault_disabled;
 #ifdef CONFIG_MMU
diff --git a/net/core/dev.c b/net/core/dev.c
index 84ec21be3022..9c5957e5ea54 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3556,8 +3556,10 @@ static void skb_update_prio(struct sk_buff *skb)
 #define skb_update_prio(skb)
 #endif
 
+#ifndef CONFIG_PREEMPT_RT_FULL
 DEFINE_PER_CPU(int, xmit_recursion);
 EXPORT_SYMBOL(xmit_recursion);
+#endif
 
 /**
  *	dev_loopback_xmit - loop back @skb
@@ -3848,9 +3850,12 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
 	if (dev->flags & IFF_UP) {
 		int cpu = smp_processor_id(); /* ok because BHs are off */
 
+#ifdef CONFIG_PREEMPT_RT_FULL
+		if (txq->xmit_lock_owner != current) {
+#else
 		if (txq->xmit_lock_owner != cpu) {
-			if (unlikely(__this_cpu_read(xmit_recursion) >
-				     XMIT_RECURSION_LIMIT))
+#endif
+			if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT))
 				goto recursion_alert;
 
 			skb = validate_xmit_skb(skb, dev, &again);
@@ -3860,9 +3865,9 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
 
 			HARD_TX_LOCK(dev, txq, cpu);
 			if (!netif_xmit_stopped(txq)) {
-				__this_cpu_inc(xmit_recursion);
+				xmit_rec_inc();
 				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
-				__this_cpu_dec(xmit_recursion);
+				xmit_rec_dec();
 				if (dev_xmit_complete(rc)) {
 					HARD_TX_UNLOCK(dev, txq);
 					goto out;
@@ -8435,7 +8440,7 @@ static void netdev_init_one_queue(struct net_device *dev,
 	/* Initialize queue lock */
 	spin_lock_init(&queue->_xmit_lock);
 	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
-	queue->xmit_lock_owner = -1;
+	netdev_queue_clear_owner(queue);
 	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
 	queue->dev = dev;
 #ifdef CONFIG_BQL
diff --git a/net/core/filter.c b/net/core/filter.c
index 241828d0d85e..44436c2712f3 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2000,7 +2000,7 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
 {
 	int ret;
 
-	if (unlikely(__this_cpu_read(xmit_recursion) > XMIT_RECURSION_LIMIT)) {
+	if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT)) {
 		net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
 		kfree_skb(skb);
 		return -ENETDOWN;
@@ -2008,9 +2008,9 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
 
 	skb->dev = dev;
 
-	__this_cpu_inc(xmit_recursion);
+	xmit_rec_inc();
 	ret = dev_queue_xmit(skb);
-	__this_cpu_dec(xmit_recursion);
+	xmit_rec_dec();
 
 	return ret;
 }
--
Gitee
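The reason a per-task counter works where a per-CPU one misfires on -RT can be modelled with thread-local storage: each thread sees only its own depth, so a preempting thread cannot push another thread's count over the limit. A minimal, hypothetical userspace model (names invented, __thread is the GCC/Clang TLS extension):

#include <stdio.h>

#define XMIT_RECURSION_LIMIT 10

static __thread int xmit_recursion;	/* models current->xmit_recursion */

static int xmit(int depth)
{
	if (xmit_recursion > XMIT_RECURSION_LIMIT) {
		fprintf(stderr, "recursion alert\n");
		return -1;
	}
	xmit_recursion++;
	/* a stacked virtual device (e.g. a bridge) re-enters the xmit path */
	int ret = depth > 0 ? xmit(depth - 1) : 0;
	xmit_recursion--;
	return ret;
}

int main(void)
{
	printf("shallow: %d\n", xmit(3));	/* ok */
	printf("deep:    %d\n", xmit(20));	/* trips the limit */
	return 0;
}

Because the counter lives in the task (thread), a legitimate deep stack still trips the limit, but two unrelated tasks interleaving on one CPU no longer do.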
From f192b9ac74c48387b5d1b3e98d6f5e4b25196083 Mon Sep 17 00:00:00 2001
From: meganz009
Date: Fri, 9 Jun 2023 13:42:19 +0800
Subject: [PATCH 3/8] net: provide a way to delegate processing a softirq to
 ksoftirqd

commit 041b49006ea5c90319dd0e5ba91d255eb59ec635 upstream.

If NET_RX uses up all of its budget, it moves the following NAPI
invocations into ksoftirqd. On -RT it does not do so; instead it raises
the NET_RX softirq in its current context again. In order to get closer
to mainline's behaviour, this patch provides
__raise_softirq_irqoff_ksoft(), which raises the softirq in ksoftirqd.

Cc: stable-rt@vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior
---
 include/linux/interrupt.h |  8 ++++++++
 kernel/softirq.c          | 21 +++++++++++++++++++++
 net/core/dev.c            |  2 +-
 3 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index aed4f168bfd5..6936d921aa53 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -548,6 +548,14 @@ extern void thread_do_softirq(void);
 extern void open_softirq(int nr, void (*action)(struct softirq_action *));
 extern void softirq_init(void);
 extern void __raise_softirq_irqoff(unsigned int nr);
+#ifdef CONFIG_PREEMPT_RT_FULL
+extern void __raise_softirq_irqoff_ksoft(unsigned int nr);
+#else
+static inline void __raise_softirq_irqoff_ksoft(unsigned int nr)
+{
+	__raise_softirq_irqoff(nr);
+}
+#endif
 
 extern void raise_softirq_irqoff(unsigned int nr);
 extern void raise_softirq(unsigned int nr);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 27a4bb2303d0..25bcf2f2714b 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -721,6 +721,27 @@ void __raise_softirq_irqoff(unsigned int nr)
 	wakeup_proper_softirq(nr);
 }
 
+/*
+ * Same as __raise_softirq_irqoff() but will process them in ksoftirqd
+ */
+void __raise_softirq_irqoff_ksoft(unsigned int nr)
+{
+	unsigned int mask;
+
+	if (WARN_ON_ONCE(!__this_cpu_read(ksoftirqd) ||
+			 !__this_cpu_read(ktimer_softirqd)))
+		return;
+	mask = 1UL << nr;
+
+	trace_softirq_raise(nr);
+	or_softirq_pending(mask);
+	if (mask & TIMER_SOFTIRQS)
+		__this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
+	else
+		__this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
+	wakeup_proper_softirq(nr);
+}
+
 /*
  * This function must run with irqs disabled!
  */
diff --git a/net/core/dev.c b/net/core/dev.c
index 9c5957e5ea54..c48e115c1b1a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6428,7 +6428,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
 	list_splice_tail(&repoll, &list);
 	list_splice(&list, &sd->poll_list);
 	if (!list_empty(&sd->poll_list))
-		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
+		__raise_softirq_irqoff_ksoft(NET_RX_SOFTIRQ);
 
 	net_rps_action_and_irq_enable(sd);
 out:
--
Gitee
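The delegation this patch adds amounts to "set the pending bit, then wake the dedicated thread instead of looping in place". A rough userspace model of that hand-off follows, with a condition variable standing in for waking ksoftirqd; all names are invented and the kernel's per-CPU and masking details are elided:

#include <pthread.h>
#include <stdio.h>

#define NET_RX 0

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  kick = PTHREAD_COND_INITIALIZER;
static unsigned int pending;		/* models the softirq pending mask */
static int done;

static void raise_in_worker(unsigned int nr)
{
	pthread_mutex_lock(&lock);
	pending |= 1U << nr;		/* models or_softirq_pending() */
	pthread_cond_signal(&kick);	/* models waking the ksoftirqd thread */
	pthread_mutex_unlock(&lock);
}

static void *worker(void *arg)
{
	pthread_mutex_lock(&lock);
	for (;;) {
		if (pending) {
			pending = 0;
			pthread_mutex_unlock(&lock);
			printf("worker: handling NET_RX\n");
			pthread_mutex_lock(&lock);
			continue;	/* recheck before sleeping */
		}
		if (done)
			break;
		pthread_cond_wait(&kick, &lock);
	}
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);
	raise_in_worker(NET_RX);	/* budget exhausted: defer, don't loop here */

	pthread_mutex_lock(&lock);
	done = 1;
	pthread_cond_signal(&kick);
	pthread_mutex_unlock(&lock);
	pthread_join(t, NULL);
	return 0;
}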
From 8bcba42e669337c8455f9347593b2536e6e29e41 Mon Sep 17 00:00:00 2001
From: meganz009
Date: Fri, 9 Jun 2023 13:42:31 +0800
Subject: [PATCH 4/8] net: dev: always take qdisc's busylock in __dev_xmit_skb()

commit 50cd61533d1b2d579c54d2acbb14c5e7034b1982 upstream.

The root lock is dropped before dev_hard_start_xmit() is invoked and after
the __QDISC___STATE_RUNNING bit has been set. If the task is now pushed
away by a task with a higher priority, then the high-priority task won't
be able to submit packets to the NIC directly; instead they will be
enqueued into the qdisc. The NIC will remain idle until the task(s) with
the higher priority leave the CPU and the task with the lower priority
gets back and finishes the job.

If we always take the busylock, we ensure that the RT task can boost the
low-prio task and submit the packet.
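What makes unconditionally taking the busylock useful on -RT is priority inheritance: a high-priority task that blocks on the lock boosts the current owner instead of waiting behind it. The sketch below only shows the PI plumbing in POSIX terms, where PTHREAD_PRIO_INHERIT is the userspace analogue of the rtmutex behaviour the patch relies on; observing an actual boost would additionally require SCHED_FIFO priorities, which are omitted here:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t busylock;

static void *sender(void *name)
{
	/* blocking here lends the blocker's priority to the lock owner */
	pthread_mutex_lock(&busylock);
	printf("%s: submitting packet\n", (const char *)name);
	pthread_mutex_unlock(&busylock);
	return NULL;
}

int main(void)
{
	pthread_mutexattr_t attr;
	pthread_t lo, hi;

	pthread_mutexattr_init(&attr);
	/* PI protocol: a blocked high-prio thread boosts the owner */
	pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT);
	pthread_mutex_init(&busylock, &attr);

	pthread_create(&lo, NULL, sender, "low-prio");
	pthread_create(&hi, NULL, sender, "high-prio");
	pthread_join(lo, NULL);
	pthread_join(hi, NULL);
	return 0;
}

The design point mirrors the patch: spinning or queueing behind a preempted owner leaves the NIC idle, while blocking on a PI lock actively pushes the owner to finish.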
Signed-off-by: Sebastian Andrzej Siewior
---
 net/core/dev.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/net/core/dev.c b/net/core/dev.c
index c48e115c1b1a..2ab568ab60f3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3484,7 +3484,11 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 	 * This permits qdisc->running owner to get the lock more
 	 * often and dequeue packets faster.
 	 */
+#ifdef CONFIG_PREEMPT_RT_FULL
+	contended = true;
+#else
 	contended = qdisc_is_running(q);
+#endif
 	if (unlikely(contended))
 		spin_lock(&q->busylock);
 
--
Gitee

From 55d122e6c134a21ed64a4488ffa75adbf2c5fde5 Mon Sep 17 00:00:00 2001
From: meganz009
Date: Fri, 9 Jun 2023 13:42:44 +0800
Subject: [PATCH 5/8] net/Qdisc: use a seqlock instead of seqcount

commit 882fef39240fcd645d3affe57b5f9ed4cfb1aac5 upstream.

The seqcount disables preemption on -RT while it is held, which we can't
remove. We also don't want the reader to spin for ages if the writer is
scheduled out. The seqlock, on the other hand, will serialize/sleep on
the lock while the writer is active.

Signed-off-by: Sebastian Andrzej Siewior
---
 include/linux/seqlock.h    |  9 +++++++++
 include/net/gen_stats.h    |  9 +++++----
 include/net/net_seq_lock.h | 15 +++++++++++++++
 include/net/sch_generic.h  | 19 +++++++++++++++++--
 net/core/gen_estimator.c   |  6 +++---
 net/core/gen_stats.c       |  8 ++++----
 net/sched/sch_api.c        |  2 +-
 net/sched/sch_generic.c    | 12 ++++++++++++
 8 files changed, 66 insertions(+), 14 deletions(-)
 create mode 100644 include/net/net_seq_lock.h

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 689ed53016c7..58f9909d6659 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -482,6 +482,15 @@ static inline void write_seqlock(seqlock_t *sl)
 	__raw_write_seqcount_begin(&sl->seqcount);
 }
 
+static inline int try_write_seqlock(seqlock_t *sl)
+{
+	if (spin_trylock(&sl->lock)) {
+		__raw_write_seqcount_begin(&sl->seqcount);
+		return 1;
+	}
+	return 0;
+}
+
 static inline void write_sequnlock(seqlock_t *sl)
 {
 	__raw_write_seqcount_end(&sl->seqcount);
diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h
index 883bb9085f15..3b593cdeb9af 100644
--- a/include/net/gen_stats.h
+++ b/include/net/gen_stats.h
@@ -6,6 +6,7 @@
 #include <linux/socket.h>
 #include <linux/rtnetlink.h>
 #include <linux/pkt_sched.h>
+#include <net/net_seq_lock.h>
 
 struct gnet_stats_basic_cpu {
 	struct gnet_stats_basic_packed bstats;
@@ -36,11 +37,11 @@ int gnet_stats_start_copy_compat(struct sk_buff *skb, int type,
 				 spinlock_t *lock, struct gnet_dump *d,
 				 int padattr);
 
-int gnet_stats_copy_basic(const seqcount_t *running,
+int gnet_stats_copy_basic(net_seqlock_t *running,
 			  struct gnet_dump *d,
 			  struct gnet_stats_basic_cpu __percpu *cpu,
 			  struct gnet_stats_basic_packed *b);
-void __gnet_stats_copy_basic(const seqcount_t *running,
+void __gnet_stats_copy_basic(net_seqlock_t *running,
 			     struct gnet_stats_basic_packed *bstats,
 			     struct gnet_stats_basic_cpu __percpu *cpu,
 			     struct gnet_stats_basic_packed *b);
@@ -60,13 +61,13 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
 		      struct gnet_stats_basic_cpu __percpu *cpu_bstats,
 		      struct net_rate_estimator __rcu **rate_est,
 		      spinlock_t *lock,
-		      seqcount_t *running, struct nlattr *opt);
+		      net_seqlock_t *running, struct nlattr *opt);
 void gen_kill_estimator(struct net_rate_estimator __rcu **ptr);
 int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
 			  struct gnet_stats_basic_cpu __percpu *cpu_bstats,
 			  struct net_rate_estimator __rcu **ptr,
 			  spinlock_t *lock,
-			  seqcount_t *running, struct nlattr *opt);
+			  net_seqlock_t *running, struct nlattr *opt);
 bool gen_estimator_active(struct net_rate_estimator __rcu **ptr);
 bool gen_estimator_read(struct net_rate_estimator __rcu **ptr,
 			struct gnet_stats_rate_est64 *sample);
diff --git a/include/net/net_seq_lock.h b/include/net/net_seq_lock.h
new file mode 100644
index 000000000000..a7034298a82a
--- /dev/null
+++ b/include/net/net_seq_lock.h
@@ -0,0 +1,15 @@
+#ifndef __NET_NET_SEQ_LOCK_H__
+#define __NET_NET_SEQ_LOCK_H__
+
+#ifdef CONFIG_PREEMPT_RT_BASE
+# define net_seqlock_t			seqlock_t
+# define net_seq_begin(__r)		read_seqbegin(__r)
+# define net_seq_retry(__r, __s)	read_seqretry(__r, __s)
+
+#else
+# define net_seqlock_t			seqcount_t
+# define net_seq_begin(__r)		read_seqcount_begin(__r)
+# define net_seq_retry(__r, __s)	read_seqcount_retry(__r, __s)
+#endif
+
+#endif
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index c9cd5086bd54..b6328680dc71 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -10,6 +10,7 @@
 #include <linux/percpu.h>
 #include <linux/dynamic_queue_limits.h>
 #include <linux/list.h>
+#include <net/net_seq_lock.h>
 #include <linux/refcount.h>
 #include <linux/workqueue.h>
 #include <net/gen_stats.h>
@@ -100,7 +101,7 @@ struct Qdisc {
 	struct sk_buff_head	gso_skb ____cacheline_aligned_in_smp;
 	struct qdisc_skb_head	q;
 	struct gnet_stats_basic_packed bstats;
-	seqcount_t		running;
+	net_seqlock_t		running;
 	struct gnet_stats_queue	qstats;
 	unsigned long		state;
 	struct Qdisc		*next_sched;
@@ -121,7 +122,11 @@ static inline bool qdisc_is_running(struct Qdisc *qdisc)
 {
 	if (qdisc->flags & TCQ_F_NOLOCK)
 		return spin_is_locked(&qdisc->seqlock);
+#ifdef CONFIG_PREEMPT_RT_BASE
+	return spin_is_locked(&qdisc->running.lock) ? true : false;
+#else
 	return (raw_read_seqcount(&qdisc->running) & 1) ? true : false;
+#endif
 }
 
static inline bool qdisc_run_begin(struct Qdisc *qdisc)
@@ -132,17 +137,27 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
 	} else if (qdisc_is_running(qdisc)) {
 		return false;
 	}
+#ifdef CONFIG_PREEMPT_RT_BASE
+	if (try_write_seqlock(&qdisc->running))
+		return true;
+	return false;
+#else
 	/* Variant of write_seqcount_begin() telling lockdep a trylock
 	 * was attempted.
 	 */
 	raw_write_seqcount_begin(&qdisc->running);
 	seqcount_acquire(&qdisc->running.dep_map, 0, 1, _RET_IP_);
 	return true;
+#endif
 }
 
 static inline void qdisc_run_end(struct Qdisc *qdisc)
 {
+#ifdef CONFIG_PREEMPT_RT_BASE
+	write_sequnlock(&qdisc->running);
+#else
 	write_seqcount_end(&qdisc->running);
+#endif
 	if (qdisc->flags & TCQ_F_NOLOCK)
 		spin_unlock(&qdisc->seqlock);
 }
@@ -458,7 +473,7 @@ static inline spinlock_t *qdisc_root_sleeping_lock(const struct Qdisc *qdisc)
 	return qdisc_lock(root);
 }
 
-static inline seqcount_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
+static inline net_seqlock_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
 {
 	struct Qdisc *root = qdisc_root_sleeping(qdisc);
 
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index e4e442d70c2d..c8fa906733fb 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -46,7 +46,7 @@
 struct net_rate_estimator {
 	struct gnet_stats_basic_packed	*bstats;
 	spinlock_t			*stats_lock;
-	seqcount_t			*running;
+	net_seqlock_t			*running;
 	struct gnet_stats_basic_cpu __percpu *cpu_bstats;
 	u8				ewma_log;
 	u8				intvl_log; /* period : (250ms << intvl_log) */
@@ -129,7 +129,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
 		      struct gnet_stats_basic_cpu __percpu *cpu_bstats,
 		      struct net_rate_estimator __rcu **rate_est,
 		      spinlock_t *lock,
-		      seqcount_t *running,
+		      net_seqlock_t *running,
 		      struct nlattr *opt)
 {
 	struct gnet_estimator *parm = nla_data(opt);
@@ -227,7 +227,7 @@ int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
 			  struct gnet_stats_basic_cpu __percpu *cpu_bstats,
 			  struct net_rate_estimator __rcu **rate_est,
 			  spinlock_t *lock,
-			  seqcount_t *running, struct nlattr *opt)
+			  net_seqlock_t *running, struct nlattr *opt)
 {
 	return gen_new_estimator(bstats, cpu_bstats, rate_est,
 				 lock, running, opt);
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index e2fd8baec65f..8bab88738691 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -142,7 +142,7 @@ __gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats,
 }
 
 void
-__gnet_stats_copy_basic(const seqcount_t *running,
+__gnet_stats_copy_basic(net_seqlock_t *running,
 			struct gnet_stats_basic_packed *bstats,
 			struct gnet_stats_basic_cpu __percpu *cpu,
 			struct gnet_stats_basic_packed *b)
@@ -155,10 +155,10 @@ __gnet_stats_copy_basic(const seqcount_t *running,
 	}
 	do {
 		if (running)
-			seq = read_seqcount_begin(running);
+			seq = net_seq_begin(running);
 		bstats->bytes = b->bytes;
 		bstats->packets = b->packets;
-	} while (running && read_seqcount_retry(running, seq));
+	} while (running && net_seq_retry(running, seq));
 }
 EXPORT_SYMBOL(__gnet_stats_copy_basic);
 
@@ -176,7 +176,7 @@ EXPORT_SYMBOL(__gnet_stats_copy_basic);
  * if the room in the socket buffer was not sufficient.
 */
 int
-gnet_stats_copy_basic(const seqcount_t *running,
+gnet_stats_copy_basic(net_seqlock_t *running,
 		      struct gnet_dump *d,
 		      struct gnet_stats_basic_cpu __percpu *cpu,
 		      struct gnet_stats_basic_packed *b)
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 59d923130258..242d29968ed6 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1171,7 +1171,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
 		rcu_assign_pointer(sch->stab, stab);
 	}
 	if (tca[TCA_RATE]) {
-		seqcount_t *running;
+		net_seqlock_t *running;
 
 		err = -EOPNOTSUPP;
 		if (sch->flags & TCQ_F_MQROOT) {
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 4ab20f1138fd..a9ed58ca3924 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -575,7 +575,11 @@ struct Qdisc noop_qdisc = {
 	.ops		=	&noop_qdisc_ops,
 	.q.lock		=	__SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
 	.dev_queue	=	&noop_netdev_queue,
+#ifdef CONFIG_PREEMPT_RT_BASE
+	.running	=	__SEQLOCK_UNLOCKED(noop_qdisc.running),
+#else
 	.running	=	SEQCNT_ZERO(noop_qdisc.running),
+#endif
 	.busylock	=	__SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
 	.gso_skb = {
 		.next = (struct sk_buff *)&noop_qdisc.gso_skb,
@@ -876,9 +880,17 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 	lockdep_set_class(&sch->busylock,
 			  dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
 
+#ifdef CONFIG_PREEMPT_RT_BASE
+	seqlock_init(&sch->running);
+	lockdep_set_class(&sch->running.seqcount,
+			  dev->qdisc_running_key ?: &qdisc_running_key);
+	lockdep_set_class(&sch->running.lock,
+			  dev->qdisc_running_key ?: &qdisc_running_key);
+#else
 	seqcount_init(&sch->running);
 	lockdep_set_class(&sch->running,
 			  dev->qdisc_running_key ?: &qdisc_running_key);
+#endif
 
 	sch->ops = ops;
 	sch->flags = ops->static_flags;
--
Gitee
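The read side that net_seq_begin()/net_seq_retry() wrap is the classic sequence-counter retry loop: retry while the sequence is odd (a writer is active) or has changed across the read. A compact userspace model follows; it is single-threaded and elides the memory-barrier and lockdep details the kernel versions carry, so treat it as shape only:

#include <stdatomic.h>
#include <stdio.h>

static atomic_uint seq;			/* even: idle, odd: writer active */
static unsigned long long bytes, packets;

static void writer_update(void)
{
	atomic_fetch_add(&seq, 1);	/* enter: sequence becomes odd */
	bytes += 1500;
	packets += 1;
	atomic_fetch_add(&seq, 1);	/* leave: sequence becomes even */
}

static void read_stats(unsigned long long *b, unsigned long long *p)
{
	unsigned int s;

	do {
		s = atomic_load(&seq);	/* models net_seq_begin() */
		*b = bytes;
		*p = packets;
		/* retry if a writer was or became active: net_seq_retry() */
	} while ((s & 1) || s != atomic_load(&seq));
}

int main(void)
{
	unsigned long long b, p;

	writer_update();
	read_stats(&b, &p);
	printf("bytes=%llu packets=%llu\n", b, p);
	return 0;
}

On -RT the patch swaps the bare counter for a seqlock so that a reader racing a preempted writer blocks on the lock (and can boost it) instead of spinning on an odd sequence indefinitely.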
From a4f316405d4b4b0a52fa8388f40eb44049801464 Mon Sep 17 00:00:00 2001
From: meganz009
Date: Fri, 9 Jun 2023 13:42:58 +0800
Subject: [PATCH 6/8] net: add back the missing serialization in
 ip_send_unicast_reply()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

commit a3e432c3b1fd867225b7515a94ca7b0fbd48b913 upstream.

Some time ago Sami Pietikäinen reported a crash on -RT in
ip_send_unicast_reply() which was later fixed by Nicholas Mc Guire
(v3.12.8-rt11). Later (v3.18.8) the code was reworked and I dropped the
patch. As it turns out, that was a mistake. I have reports that the same
crash is possible with a similar backtrace. It seems that vanilla
protects access to this_cpu_ptr() via local_bh_disable(). This does not
work on -RT since we can have NET_RX and NET_TX running in parallel on
the same CPU. This brings back the old locks.
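The pattern this patch restores is "one private resource per CPU, guarded by a lock that only matters on -RT": mainline gets the serialization for free from local_bh_disable(), while -RT needs an explicit per-CPU lock because two softirqs can run concurrently on one CPU. Modelling each CPU slot as an array element with its own mutex shows the shape; this is a hypothetical userspace sketch with invented names, not the kernel API:

#include <pthread.h>
#include <stdio.h>

#define NR_CPUS 4

struct ctl_sock { int in_use; };

static struct ctl_sock ctl_sk[NR_CPUS];		/* models net->ipv4.tcp_sk */
static pthread_mutex_t tcp_sk_lock[NR_CPUS] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};

static void send_reset(int cpu)
{
	/* models local_lock(tcp_sk_lock): all users of this CPU's control
	 * socket serialize even if one preempts the other mid-operation */
	pthread_mutex_lock(&tcp_sk_lock[cpu]);
	ctl_sk[cpu].in_use = 1;
	printf("cpu%d: sending RST via private ctl socket\n", cpu);
	ctl_sk[cpu].in_use = 0;
	pthread_mutex_unlock(&tcp_sk_lock[cpu]);
}

int main(void)
{
	send_reset(0);
	send_reset(1);	/* a different slot: no contention with cpu 0 */
	return 0;
}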
|Unable to handle kernel NULL pointer dereference at virtual address 00000010
|PC is at __ip_make_skb+0x198/0x3e8
|[] (__ip_make_skb) from [] (ip_push_pending_frames+0x20/0x40)
|[] (ip_push_pending_frames) from [] (ip_send_unicast_reply+0x210/0x22c)
|[] (ip_send_unicast_reply) from [] (tcp_v4_send_reset+0x190/0x1c0)
|[] (tcp_v4_send_reset) from [] (tcp_v4_do_rcv+0x22c/0x288)
|[] (tcp_v4_do_rcv) from [] (release_sock+0xb4/0x150)
|[] (release_sock) from [] (tcp_close+0x240/0x454)
|[] (tcp_close) from [] (inet_release+0x74/0x7c)
|[] (inet_release) from [] (sock_release+0x30/0xb0)
|[] (sock_release) from [] (sock_close+0x1c/0x24)
|[] (sock_close) from [] (__fput+0xe8/0x20c)
|[] (__fput) from [] (____fput+0x18/0x1c)
|[] (____fput) from [] (task_work_run+0xa4/0xb8)
|[] (task_work_run) from [] (do_work_pending+0xd0/0xe4)
|[] (do_work_pending) from [] (work_pending+0xc/0x20)
|Code: e3530001 8a000001 e3a00040 ea000011 (e5973010)

Cc: stable-rt@vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior
---
 net/ipv4/tcp_ipv4.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 06577df66971..713e59bc4d80 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -62,6 +62,7 @@
 #include <linux/init.h>
 #include <linux/times.h>
 #include <linux/slab.h>
+#include <linux/locallock.h>
 
 #include <net/net_namespace.h>
 #include <net/icmp.h>
@@ -634,6 +635,7 @@ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(tcp_v4_send_check);
 
+static DEFINE_LOCAL_IRQ_LOCK(tcp_sk_lock);
 /*
  *	This routine will send an RST to the other tcp.
  *
@@ -768,6 +770,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 	arg.tos = ip_hdr(skb)->tos;
 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
 	local_bh_disable();
+	local_lock(tcp_sk_lock);
 	ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
 	if (sk)
 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
@@ -780,6 +783,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 	ctl_sk->sk_mark = 0;
 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
+	local_unlock(tcp_sk_lock);
 	local_bh_enable();
 
 #ifdef CONFIG_TCP_MD5SIG
@@ -860,6 +864,7 @@ static void tcp_v4_send_ack(const struct sock *sk,
 	arg.tos = tos;
 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
 	local_bh_disable();
+	local_lock(tcp_sk_lock);
 	ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
 	if (sk)
 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
@@ -871,6 +876,7 @@ static void tcp_v4_send_ack(const struct sock *sk,
 	ctl_sk->sk_mark = 0;
 
 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
+	local_unlock(tcp_sk_lock);
 	local_bh_enable();
 }
 
--
Gitee

From a64919ec3bb26425f4a9757856b999bb3965c551 Mon Sep 17 00:00:00 2001
From: meganz009
Date: Fri, 9 Jun 2023 13:43:10 +0800
Subject: [PATCH 7/8] net: add a lock around icmp_sk()

commit 4c0fdaffb6e6326c5c6318fcd0feb45d4bf77190 upstream.

It looks like the this_cpu_ptr() access in icmp_sk() is protected with
local_bh_disable(). To avoid missing serialization on -RT, I am adding a
local lock here. No crash has been observed; this is just a precaution.

Cc: stable-rt@vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior
---
 net/ipv4/icmp.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 3f008fe292f1..7d44c86713b5 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -77,6 +77,7 @@
 #include <linux/string.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/slab.h>
+#include <linux/locallock.h>
 #include <net/snmp.h>
 #include <net/ip.h>
 #include <net/route.h>
@@ -205,6 +206,8 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
  *
  *	On SMP we have one ICMP socket per-cpu.
 
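icmp_xmit_lock() in this patch becomes a two-level trylock with rollback: take the outer (local) lock first; if the inner socket lock cannot be taken, the outer lock must be released again before reporting failure. A minimal userspace rendering of that shape (invented names; pthread_mutex_trylock() returns 0 on success):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t icmp_sk_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t sk_slock = PTHREAD_MUTEX_INITIALIZER;

static int icmp_xmit_lock(void)
{
	if (pthread_mutex_trylock(&icmp_sk_lock))
		return -1;			/* models local_trylock() failing */
	if (pthread_mutex_trylock(&sk_slock)) {
		/* inner lock busy: undo the outer lock, as the patch does */
		pthread_mutex_unlock(&icmp_sk_lock);
		return -1;
	}
	return 0;
}

static void icmp_xmit_unlock(void)
{
	pthread_mutex_unlock(&sk_slock);	/* release in reverse order */
	pthread_mutex_unlock(&icmp_sk_lock);
}

int main(void)
{
	if (icmp_xmit_lock() == 0) {
		printf("sending ICMP reply\n");
		icmp_xmit_unlock();
	}
	return 0;
}

The rollback matters: forgetting the local_unlock() on the failure path would leave the per-CPU lock held forever, which is exactly why the patch adds the unlock inside the spin_trylock() failure branch.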
 */
+static DEFINE_LOCAL_IRQ_LOCK(icmp_sk_lock);
+
 static struct sock *icmp_sk(struct net *net)
 {
 	return *this_cpu_ptr(net->ipv4.icmp_sk);
@@ -215,12 +218,16 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
 {
 	struct sock *sk;
 
+	if (!local_trylock(icmp_sk_lock))
+		return NULL;
+
 	sk = icmp_sk(net);
 
 	if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
 		/* This can happen if the output path signals a
 		 * dst_link_failure() for an outgoing ICMP packet.
 		 */
+		local_unlock(icmp_sk_lock);
 		return NULL;
 	}
 	return sk;
@@ -229,6 +236,7 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
 static inline void icmp_xmit_unlock(struct sock *sk)
 {
 	spin_unlock(&sk->sk_lock.slock);
+	local_unlock(icmp_sk_lock);
 }
 
 int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
--
Gitee

From f022972806c4c538ba56917ef9b00342a2d3a4ad Mon Sep 17 00:00:00 2001
From: meganz009
Date: Fri, 9 Jun 2023 13:43:31 +0800
Subject: [PATCH 8/8] net: Have __napi_schedule_irqoff() disable interrupts on
 RT

commit 8da361e596b7594eb5b0f8abea644982a0c9a1d5 upstream.

A customer hit a crash where the napi sd->poll_list became corrupted.
The customer had the bnx2x driver, which does a __napi_schedule_irqoff()
in its interrupt handler. Unfortunately, when running with
CONFIG_PREEMPT_RT_FULL, this interrupt handler is run as a thread and is
preemptible. The call to ____napi_schedule() must be done with interrupts
disabled to protect the per-CPU softnet_data's "poll_list", which is
protected by disabling interrupts (disabling preemption is enough when
all interrupts are threaded and local_bh_disable() can't preempt).

As bnx2x isn't the only driver that does this, the safest thing to do is
to make __napi_schedule_irqoff() call __napi_schedule() instead when
CONFIG_PREEMPT_RT_FULL is enabled, which will call local_irq_save()
before calling ____napi_schedule().

Cc: stable-rt@vger.kernel.org
Signed-off-by: Steven Rostedt (Red Hat)
Signed-off-by: Sebastian Andrzej Siewior
---
 include/linux/netdevice.h | 12 ++++++++++++
 net/core/dev.c            |  2 ++
 2 files changed, 14 insertions(+)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f6c98f9eb00b..5ce77b21c1d4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -424,7 +424,19 @@ typedef enum rx_handler_result rx_handler_result_t;
 typedef rx_handler_result_t rx_handler_func_t(struct sk_buff **pskb);
 
 void __napi_schedule(struct napi_struct *n);
+
+/*
+ * When PREEMPT_RT_FULL is defined, all device interrupt handlers
+ * run as threads, and they can also be preempted (without PREEMPT_RT
+ * interrupt threads can not be preempted). Which means that calling
+ * __napi_schedule_irqoff() from an interrupt handler can be preempted
+ * and can corrupt the napi->poll_list.
+ */
+#ifdef CONFIG_PREEMPT_RT_FULL
+#define __napi_schedule_irqoff(n) __napi_schedule(n)
+#else
 void __napi_schedule_irqoff(struct napi_struct *n);
+#endif
 
 static inline bool napi_disable_pending(struct napi_struct *n)
 {
diff --git a/net/core/dev.c b/net/core/dev.c
index 2ab568ab60f3..87541294ee80 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5985,6 +5985,7 @@ bool napi_schedule_prep(struct napi_struct *n)
 }
 EXPORT_SYMBOL(napi_schedule_prep);
 
+#ifndef CONFIG_PREEMPT_RT_FULL
 /**
  * __napi_schedule_irqoff - schedule for receive
  * @n: entry to schedule
@@ -5996,6 +5997,7 @@ void __napi_schedule_irqoff(struct napi_struct *n)
 	____napi_schedule(this_cpu_ptr(&softnet_data), n);
 }
 EXPORT_SYMBOL(__napi_schedule_irqoff);
+#endif
 
 bool napi_complete_done(struct napi_struct *n, int work_done)
 {
--
Gitee
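The whole fix compresses into one #ifdef: when interrupt handlers are threaded, "irqoff" is a promise the caller can no longer keep, so the irq-saving variant is used instead. The self-contained sketch below compiles either way depending on whether CONFIG_PREEMPT_RT_FULL is defined at build time (e.g. cc -DCONFIG_PREEMPT_RT_FULL); the primitives are printf stand-ins for illustration, not the kernel's:

#include <stdio.h>

/* stand-ins for the kernel's irq primitives, for illustration only */
#define local_irq_save(f)    do { (f) = 1; puts("irq off"); } while (0)
#define local_irq_restore(f) do { (void)(f); puts("irq on"); } while (0)

struct napi { const char *name; };

static void ____napi_schedule(struct napi *n)
{
	/* touches the per-CPU poll_list; callers must exclude interrupts */
	printf("queued %s on poll_list\n", n->name);
}

static void __napi_schedule(struct napi *n)
{
	unsigned long flags;

	local_irq_save(flags);
	____napi_schedule(n);
	local_irq_restore(flags);
}

#ifdef CONFIG_PREEMPT_RT_FULL
/* threaded handlers can be preempted, so "irqoff" is not really irq-off:
 * route through the saving variant, exactly as the patch does */
#define __napi_schedule_irqoff(n) __napi_schedule(n)
#else
static void __napi_schedule_irqoff(struct napi *n)
{
	____napi_schedule(n);	/* caller guarantees interrupts are off */
}
#endif

int main(void)
{
	struct napi n = { "bnx2x" };

	__napi_schedule_irqoff(&n);
	__napi_schedule(&n);
	return 0;
}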