From e1c3440ffbf404fc9bd37b05a58880aab07d37e3 Mon Sep 17 00:00:00 2001
From: luzhongjun23
Date: Sun, 11 Jun 2023 20:39:36 +0800
Subject: [PATCH 1/5] net: Remove preemption disabling in netif_rx()

commit 9dcfd3052899233a5c4b7593793005967d025b3f upstream.

1) enqueue_to_backlog() (called from netif_rx) should be bound to a
   particular CPU. This can be achieved by disabling migration; there is
   no need to disable preemption.

2) Fixes the crash "BUG: scheduling while atomic: ksoftirqd" on RT.
   If preemption is disabled, enqueue_to_backlog() is called in atomic
   context, and if the backlog exceeds its limit, kfree_skb() is called.
   But on RT, kfree_skb() might get scheduled out, so it expects a
   non-atomic context.

3) When CONFIG_PREEMPT_RT_FULL is not defined, migrate_enable() and
   migrate_disable() map to preempt_enable() and preempt_disable(), so
   there is no functional change in the non-RT case.

- Replace preempt_enable()/preempt_disable() with
  migrate_enable()/migrate_disable() respectively.
- Replace get_cpu()/put_cpu() with get_cpu_light()/put_cpu_light()
  respectively.

Signed-off-by: Priyanka Jain
Acked-by: Rajan Srivastava
Cc:
Link: http://lkml.kernel.org/r/1337227511-2271-1-git-send-email-Priyanka.Jain@freescale.com
Signed-off-by: Thomas Gleixner
---
 net/core/dev.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 87541294ee80..1e223b0d9077 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4517,7 +4517,7 @@ static int netif_rx_internal(struct sk_buff *skb)
 		struct rps_dev_flow voidflow, *rflow = &voidflow;
 		int cpu;
 
-		preempt_disable();
+		migrate_disable();
 		rcu_read_lock();
 
 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
@@ -4527,14 +4527,14 @@ static int netif_rx_internal(struct sk_buff *skb)
 		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 
 		rcu_read_unlock();
-		preempt_enable();
+		migrate_enable();
 	} else
 #endif
 	{
 		unsigned int qtail;
 
-		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
-		put_cpu();
+		ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail);
+		put_cpu_light();
 	}
 	return ret;
 }
--
Gitee
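
For reference, point 3) above corresponds to the non-RT fallbacks carried
by the PREEMPT_RT patchset. A simplified sketch (the exact definitions
live in the RT tree and vary by kernel version):

    #ifndef CONFIG_PREEMPT_RT_FULL
    /* Without RT there is no separate "migration disable", so the light
     * primitives collapse to the plain preemption-disabling ones. */
    # define migrate_disable()   preempt_disable()
    # define migrate_enable()    preempt_enable()
    # define get_cpu_light()     get_cpu()
    # define put_cpu_light()     put_cpu()
    #endif

With RT enabled, migrate_disable() pins the task to the current CPU while
leaving it preemptible, which is all enqueue_to_backlog() needs.
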
From 3f1dda3c57fc04cb43effd4383f6279178487a57 Mon Sep 17 00:00:00 2001
From: luzhongjun23
Date: Sun, 11 Jun 2023 20:39:47 +0800
Subject: [PATCH 2/5] net: Another local_irq_disable/kmalloc headache

commit e6babcfe0fa1248a3e14b455738237a4e13b50eb upstream.

Replace it with a local lock. Though that's pretty inefficient :(

Signed-off-by: Thomas Gleixner
---
 net/core/skbuff.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index d10d2dd2b0d1..705cc4108f7c 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -63,6 +63,7 @@
 #include
 #include
 #include
+#include
 
 #include
 #include
@@ -335,6 +336,7 @@ struct napi_alloc_cache {
 
 static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
 static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
+static DEFINE_LOCAL_IRQ_LOCK(netdev_alloc_lock);
 
 static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
 {
@@ -342,10 +344,10 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
 	unsigned long flags;
 	void *data;
 
-	local_irq_save(flags);
+	local_lock_irqsave(netdev_alloc_lock, flags);
 	nc = this_cpu_ptr(&netdev_alloc_cache);
 	data = page_frag_alloc(nc, fragsz, gfp_mask);
-	local_irq_restore(flags);
+	local_unlock_irqrestore(netdev_alloc_lock, flags);
 	return data;
 }
 
@@ -417,13 +419,13 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
 	if (sk_memalloc_socks())
 		gfp_mask |= __GFP_MEMALLOC;
 
-	local_irq_save(flags);
+	local_lock_irqsave(netdev_alloc_lock, flags);
 
 	nc = this_cpu_ptr(&netdev_alloc_cache);
 	data = page_frag_alloc(nc, len, gfp_mask);
 	pfmemalloc = nc->pfmemalloc;
 
-	local_irq_restore(flags);
+	local_unlock_irqrestore(netdev_alloc_lock, flags);
 
 	if (unlikely(!data))
 		return NULL;
--
Gitee
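
DEFINE_LOCAL_IRQ_LOCK() and local_lock_irqsave() used above come from the
RT patchset's locallock infrastructure (linux/locallock.h in the RT tree,
presumably what the stripped #include at the top of the hunk pulled in).
A rough sketch of the semantics, not the actual implementation: on non-RT
kernels the lock variable is unused and the calls fall back to plain IRQ
disabling, so behaviour is unchanged; on RT they take a per-CPU sleeping
lock, so the section stays preemptible but is still serialized on each CPU.

    #ifndef CONFIG_PREEMPT_RT_BASE
    /* non-RT fallback (simplified) */
    # define local_lock_irqsave(lvar, flags)        local_irq_save(flags)
    # define local_unlock_irqrestore(lvar, flags)   local_irq_restore(flags)
    #endif
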
From c4606dc4616aded7de7e71bf73713d058d3824a1 Mon Sep 17 00:00:00 2001
From: luzhongjun23
Date: Sun, 11 Jun 2023 20:39:58 +0800
Subject: [PATCH 3/5] net/core: protect users of napi_alloc_cache against reentrance

commit cf637289b14f736be18788b3ba3160ba35949396 upstream.

On -RT the code running in BH cannot be moved to another CPU, so CPU-local
variables remain local. However, the code can be preempted, and another
task may enter BH on the same CPU and access the same napi_alloc_cache
variable. This patch ensures that each user of napi_alloc_cache uses a
local lock.

Cc: stable-rt@vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior
---
 net/core/skbuff.c | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 705cc4108f7c..02cb9b59b867 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -337,6 +337,7 @@ struct napi_alloc_cache {
 static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
 static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
 static DEFINE_LOCAL_IRQ_LOCK(netdev_alloc_lock);
+static DEFINE_LOCAL_IRQ_LOCK(napi_alloc_cache_lock);
 
 static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
 {
@@ -368,9 +369,13 @@ EXPORT_SYMBOL(netdev_alloc_frag);
 
 static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
 {
-	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+	struct napi_alloc_cache *nc;
+	void *data;
 
-	return page_frag_alloc(&nc->page, fragsz, gfp_mask);
+	nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
+	data = page_frag_alloc(&nc->page, fragsz, gfp_mask);
+	put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
+	return data;
 }
 
 void *napi_alloc_frag(unsigned int fragsz)
@@ -466,9 +471,10 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
 struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
 				 gfp_t gfp_mask)
 {
-	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+	struct napi_alloc_cache *nc;
 	struct sk_buff *skb;
 	void *data;
+	bool pfmemalloc;
 
 	len += NET_SKB_PAD + NET_IP_ALIGN;
 
@@ -486,7 +492,10 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
 	if (sk_memalloc_socks())
 		gfp_mask |= __GFP_MEMALLOC;
 
+	nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
 	data = page_frag_alloc(&nc->page, len, gfp_mask);
+	pfmemalloc = nc->page.pfmemalloc;
+	put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
 	if (unlikely(!data))
 		return NULL;
 
@@ -497,7 +506,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
 	}
 
 	/* use OR instead of assignment to avoid clearing of bits in mask */
-	if (nc->page.pfmemalloc)
+	if (pfmemalloc)
 		skb->pfmemalloc = 1;
 	skb->head_frag = 1;
 
@@ -729,23 +738,26 @@ void __consume_stateless_skb(struct sk_buff *skb)
 
 void __kfree_skb_flush(void)
 {
-	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+	struct napi_alloc_cache *nc;
 
+	nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
 	/* flush skb_cache if containing objects */
 	if (nc->skb_count) {
 		kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
 				     nc->skb_cache);
 		nc->skb_count = 0;
 	}
+	put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
 }
 
 static inline void _kfree_skb_defer(struct sk_buff *skb)
 {
-	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+	struct napi_alloc_cache *nc;
 
 	/* drop skb->head and call any destructors for packet */
 	skb_release_all(skb);
 
+	nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
 	/* record skb to CPU local list */
 	nc->skb_cache[nc->skb_count++] = skb;
 
@@ -760,6 +772,7 @@ static inline void _kfree_skb_defer(struct sk_buff *skb)
 				     nc->skb_cache);
 		nc->skb_count = 0;
 	}
+	put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
 }
 void __kfree_skb_defer(struct sk_buff *skb)
 {
--
Gitee
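
get_locked_var()/put_locked_var(), used throughout the hunks above, are
the locallock accessors for a per-CPU variable. Roughly (a simplified
sketch of the RT-patchset macros, not the exact code):

    #ifdef CONFIG_PREEMPT_RT_BASE
    # define get_locked_var(lvar, var) (*({ local_lock(lvar); this_cpu_ptr(&var); }))
    # define put_locked_var(lvar, var) local_unlock(lvar)
    #else
    # define get_locked_var(lvar, var) get_cpu_var(var)
    # define put_locked_var(lvar, var) put_cpu_var(var)
    #endif

So on RT a task that preempts a BH user of napi_alloc_cache blocks on the
same-CPU lock instead of corrupting the cache, while on non-RT this
degenerates to get_cpu_var()/put_cpu_var(), i.e. plain preemption
disabling, and nothing changes in practice.
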
From a79827ceca75c69177e504e80169287ad5c05a34 Mon Sep 17 00:00:00 2001
From: luzhongjun23
Date: Sun, 11 Jun 2023 20:40:18 +0800
Subject: [PATCH 4/5] net: netfilter: Serialize xt_write_recseq sections on RT

commit 28b182b074b9bb5ea0707e1f674eae8d576357c7 upstream.

The netfilter code relies only on the implicit semantics of
local_bh_disable() for serializing xt_write_recseq sections. RT breaks
that and needs explicit serialization here.

Reported-by: Peter LaDow
Signed-off-by: Thomas Gleixner
---
 include/linux/netfilter/x_tables.h | 7 +++++++
 net/netfilter/core.c               | 6 ++++++
 2 files changed, 13 insertions(+)

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 0865337e144e..d832df89f684 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -6,6 +6,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -350,6 +351,8 @@ void xt_free_table_info(struct xt_table_info *info);
  */
 DECLARE_PER_CPU(seqcount_t, xt_recseq);
 
+DECLARE_LOCAL_IRQ_LOCK(xt_write_lock);
+
 /* xt_tee_enabled - true if x_tables needs to handle reentrancy
  *
  * Enabled if current ip(6)tables ruleset has at least one -j TEE rule.
@@ -370,6 +373,9 @@ static inline unsigned int xt_write_recseq_begin(void)
 {
 	unsigned int addend;
 
+	/* RT protection */
+	local_lock(xt_write_lock);
+
 	/*
 	 * Low order bit of sequence is set if we already
 	 * called xt_write_recseq_begin().
@@ -400,6 +406,7 @@ static inline void xt_write_recseq_end(unsigned int addend)
 	/* this is kind of a write_seqcount_end(), but addend is 0 or 1 */
 	smp_wmb();
 	__this_cpu_add(xt_recseq.sequence, addend);
+	local_unlock(xt_write_lock);
 }
 
 /*
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 93aaec3a54ec..b364cf8e5776 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -20,6 +20,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -27,6 +28,11 @@
 
 #include "nf_internals.h"
 
+#ifdef CONFIG_PREEMPT_RT_BASE
+DEFINE_LOCAL_IRQ_LOCK(xt_write_lock);
+EXPORT_PER_CPU_SYMBOL(xt_write_lock);
+#endif
+
 const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly;
 EXPORT_SYMBOL_GPL(nf_ipv6_ops);
 
--
Gitee
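
For context, the xt_write_recseq section that the message refers to is
entered from the table-walk paths with BHs disabled, roughly like this
(abbreviated sketch of the pattern in ipt_do_table() and friends):

    local_bh_disable();
    addend = xt_write_recseq_begin();
    /* ... traverse the ruleset and update per-CPU counters ... */
    xt_write_recseq_end(addend);
    local_bh_enable();

On non-RT, local_bh_disable() also disables preemption, so at most one
task per CPU can be inside that window and the per-CPU xt_recseq
bookkeeping is safe. On RT, BH-disabled code stays preemptible, so two
tasks could nest on the same CPU; the added local_lock(xt_write_lock)
restores the per-CPU exclusion, and on non-RT builds it reduces to
preempt_disable(), changing nothing.
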
From 4dbd55650ced54f662d460e329c2d3ddfc88caad Mon Sep 17 00:00:00 2001
From: luzhongjun23
Date: Sun, 11 Jun 2023 20:40:35 +0800
Subject: [PATCH 5/5] net: Add a mutex around devnet_rename_seq

commit d93ef2fbc7cc016328b4e3cfc2b58960acc4b696 upstream.

On RT write_seqcount_begin() disables preemption, and device_rename()
allocates memory with GFP_KERNEL and later grabs the sysfs_mutex.
Serialize with a mutex and use the non-preemption-disabling
__write_seqcount_begin().

To avoid writer starvation, let the reader grab the mutex and release it
when it detects a writer in progress. This keeps the normal case (no
reader on the fly) fast.

[ tglx: Instead of replacing the seqcount by a mutex, add the mutex ]

Signed-off-by: Sebastian Andrzej Siewior
Signed-off-by: Thomas Gleixner
---
 net/core/dev.c | 34 ++++++++++++++++++++--------------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 1e223b0d9077..25fa55af6d81 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -196,6 +196,7 @@ static unsigned int napi_gen_id = NR_CPUS;
 static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
 
 static seqcount_t devnet_rename_seq;
+static DEFINE_MUTEX(devnet_rename_mutex);
 
 static inline void dev_base_seq_inc(struct net *net)
 {
@@ -921,7 +922,8 @@ int netdev_get_name(struct net *net, char *name, int ifindex)
 	strcpy(name, dev->name);
 	rcu_read_unlock();
 	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
-		cond_resched();
+		mutex_lock(&devnet_rename_mutex);
+		mutex_unlock(&devnet_rename_mutex);
 		goto retry;
 	}
 
@@ -1198,20 +1200,17 @@ int dev_change_name(struct net_device *dev, const char *newname)
 	    likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK)))
 		return -EBUSY;
 
-	write_seqcount_begin(&devnet_rename_seq);
+	mutex_lock(&devnet_rename_mutex);
+	__raw_write_seqcount_begin(&devnet_rename_seq);
 
-	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
-		write_seqcount_end(&devnet_rename_seq);
-		return 0;
-	}
+	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
+		goto outunlock;
 
 	memcpy(oldname, dev->name, IFNAMSIZ);
 
 	err = dev_get_valid_name(net, dev, newname);
-	if (err < 0) {
-		write_seqcount_end(&devnet_rename_seq);
-		return err;
-	}
+	if (err < 0)
+		goto outunlock;
 
 	if (oldname[0] && !strchr(oldname, '%'))
 		netdev_info(dev, "renamed from %s\n", oldname);
@@ -1224,11 +1223,12 @@ int dev_change_name(struct net_device *dev, const char *newname)
 	if (ret) {
 		memcpy(dev->name, oldname, IFNAMSIZ);
 		dev->name_assign_type = old_assign_type;
-		write_seqcount_end(&devnet_rename_seq);
-		return ret;
+		err = ret;
+		goto outunlock;
 	}
 
-	write_seqcount_end(&devnet_rename_seq);
+	__raw_write_seqcount_end(&devnet_rename_seq);
+	mutex_unlock(&devnet_rename_mutex);
 
 	netdev_adjacent_rename_links(dev, oldname);
 
@@ -1249,7 +1249,8 @@ int dev_change_name(struct net_device *dev, const char *newname)
 		/* err >= 0 after dev_alloc_name() or stores the first errno */
 		if (err >= 0) {
 			err = ret;
-			write_seqcount_begin(&devnet_rename_seq);
+			mutex_lock(&devnet_rename_mutex);
+			__raw_write_seqcount_begin(&devnet_rename_seq);
 			memcpy(dev->name, oldname, IFNAMSIZ);
 			memcpy(oldname, newname, IFNAMSIZ);
 			dev->name_assign_type = old_assign_type;
@@ -1262,6 +1263,11 @@ int dev_change_name(struct net_device *dev, const char *newname)
 	}
 
 	return err;
+
+outunlock:
+	__raw_write_seqcount_end(&devnet_rename_seq);
+	mutex_unlock(&devnet_rename_mutex);
+	return err;
 }
 
 /**
--
Gitee
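
The resulting scheme in dev_change_name()/netdev_get_name() boils down to
the following pattern (a self-contained sketch with made-up names, not
kernel code): writers serialize on a mutex and flip the seqcount with the
__raw_ helpers, which do not disable preemption; a reader that detects a
retry sleeps on the writer's mutex instead of spinning.

    static seqcount_t cfg_seq;
    static DEFINE_MUTEX(cfg_mutex);
    static char cfg_name[16];

    static void cfg_write(const char *new)
    {
            mutex_lock(&cfg_mutex);                 /* writer vs. writer */
            __raw_write_seqcount_begin(&cfg_seq);   /* no preempt_disable() */
            strscpy(cfg_name, new, sizeof(cfg_name));
            __raw_write_seqcount_end(&cfg_seq);
            mutex_unlock(&cfg_mutex);
    }

    static void cfg_read(char *buf)
    {
            unsigned int seq;
    retry:
            seq = raw_seqcount_begin(&cfg_seq);
            memcpy(buf, cfg_name, sizeof(cfg_name));
            if (read_seqcount_retry(&cfg_seq, seq)) {
                    /* writer in flight: wait on its mutex rather than spin */
                    mutex_lock(&cfg_mutex);
                    mutex_unlock(&cfg_mutex);
                    goto retry;
            }
    }

This keeps the read side lock-free when no rename is in progress, while a
reader racing a writer sleeps instead of burning CPU, which is what the
"avoid writer starvation" paragraph above is about.
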