From ea53bf320ebb87d1fbb08a4d1d7d13976d9a2b8c Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 30 Mar 2025 15:45:51 +0800 Subject: [PATCH 01/30] netfilter: nf_tables: integrate pipapo into commit protocol ANBZ: #14678 commit 212ed75dc5fb9d1423b3942c8f872a868cda3466 upstream. The pipapo set backend follows copy-on-update approach, maintaining one clone of the existing datastructure that is being updated. The clone and current datastructures are swapped via rcu from the commit step. The existing integration with the commit protocol is flawed because there is no operation to clean up the clone if the transaction is aborted. Moreover, the datastructure swap happens on set element activation. This patch adds two new operations for sets: commit and abort, these new operations are invoked from the commit and abort steps, after the transactions have been digested, and it updates the pipapo set backend to use it. This patch adds a new ->pending_update field to sets to maintain a list of sets that require this new commit and abort operations. Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges") Change-Id: Ic0f6a26e2f029ef60e030e92e2cf990cc6128942 Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin [fix contextual conflicts - yaxin] Signed-off-by: Wang Yaxin --- include/net/netfilter/nf_tables.h | 4 ++- net/netfilter/nf_tables_api.c | 56 +++++++++++++++++++++++++++++++ net/netfilter/nft_set_pipapo.c | 55 +++++++++++++++++++++--------- 3 files changed, 99 insertions(+), 16 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 327ca2cc5e7a..d5f623833a8b 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -369,7 +369,8 @@ struct nft_set_ops { const struct nft_set *set, const struct nft_set_elem *elem, unsigned int flags); - + void (*commit)(const struct nft_set *set); + void (*abort)(const struct nft_set *set); u64 (*privsize)(const struct nlattr * const nla[], const struct nft_set_desc *desc); bool (*estimate)(const struct nft_set_desc *desc, @@ -449,6 +450,7 @@ struct nft_set { u16 udlen; unsigned char *udata; struct nft_expr *expr; + struct list_head pending_update; /* runtime data below here */ const struct nft_set_ops *ops ____cacheline_aligned; u16 flags:14, diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index de4a667c28c4..691a0616dc46 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4424,6 +4424,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, } set->handle = nf_tables_alloc_handle(table); + INIT_LIST_HEAD(&set->pending_update); err = nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set); if (err < 0) @@ -7970,9 +7971,24 @@ static void nft_commit_notify(struct net *net, u32 portid) WARN_ON_ONCE(!list_empty(&net->nft.notify_list)); } +static void nft_set_commit_update(struct list_head *set_update_list) +{ + struct nft_set *set, *next; + + list_for_each_entry_safe(set, next, set_update_list, pending_update) { + list_del_init(&set->pending_update); + + if (!set->ops->commit) + continue; + + set->ops->commit(set); + } +} + static int nf_tables_commit(struct net *net, struct sk_buff *skb) { struct nft_trans *trans, *next; + LIST_HEAD(set_update_list); struct nft_trans_elem *te; struct nft_chain *chain; struct nft_table *table; @@ -8104,6 +8120,11 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nf_tables_setelem_notify(&trans->ctx, te->set, &te->elem, NFT_MSG_NEWSETELEM, 
0); + if (te->set->ops->commit && + list_empty(&te->set->pending_update)) { + list_add_tail(&te->set->pending_update, + &set_update_list); + } nft_trans_destroy(trans); break; case NFT_MSG_DELSETELEM: @@ -8115,6 +8136,11 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) te->set->ops->remove(net, te->set, &te->elem); atomic_dec(&te->set->nelems); te->set->ndeact--; + if (te->set->ops->commit && + list_empty(&te->set->pending_update)) { + list_add_tail(&te->set->pending_update, + &set_update_list); + } break; case NFT_MSG_NEWOBJ: if (nft_trans_obj_update(trans)) { @@ -8175,6 +8201,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) } } + nft_set_commit_update(&set_update_list); + nft_commit_notify(net, NETLINK_CB(skb).portid); nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); nf_tables_commit_release(net); @@ -8229,9 +8257,24 @@ static void nf_tables_abort_release(struct nft_trans *trans) kfree(trans); } +static void nft_set_abort_update(struct list_head *set_update_list) +{ + struct nft_set *set, *next; + + list_for_each_entry_safe(set, next, set_update_list, pending_update) { + list_del_init(&set->pending_update); + + if (!set->ops->abort) + continue; + + set->ops->abort(set); + } +} + static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) { struct nft_trans *trans, *next; + LIST_HEAD(set_update_list); struct nft_trans_elem *te; if (action == NFNL_ABORT_VALIDATE && @@ -8317,6 +8360,12 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) te = (struct nft_trans_elem *)trans->data; te->set->ops->remove(net, te->set, &te->elem); atomic_dec(&te->set->nelems); + + if (te->set->ops->abort && + list_empty(&te->set->pending_update)) { + list_add_tail(&te->set->pending_update, + &set_update_list); + } break; case NFT_MSG_DELSETELEM: te = (struct nft_trans_elem *)trans->data; @@ -8325,6 +8374,11 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) te->set->ops->activate(net, te->set, &te->elem); te->set->ndeact--; + if (te->set->ops->abort && + list_empty(&te->set->pending_update)) { + list_add_tail(&te->set->pending_update, + &set_update_list); + } nft_trans_destroy(trans); break; case NFT_MSG_NEWOBJ: @@ -8365,6 +8419,8 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) } } + nft_set_abort_update(&set_update_list); + synchronize_rcu(); list_for_each_entry_safe_reverse(trans, next, diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index f03258d483ef..96ed28b9e2d1 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1603,17 +1603,10 @@ static void pipapo_free_fields(struct nft_pipapo_match *m) } } -/** - * pipapo_reclaim_match - RCU callback to free fields from old matching data - * @rcu: RCU head - */ -static void pipapo_reclaim_match(struct rcu_head *rcu) +static void pipapo_free_match(struct nft_pipapo_match *m) { - struct nft_pipapo_match *m; int i; - m = container_of(rcu, struct nft_pipapo_match, rcu); - for_each_possible_cpu(i) kfree(*per_cpu_ptr(m->scratch, i)); @@ -1628,7 +1621,19 @@ static void pipapo_reclaim_match(struct rcu_head *rcu) } /** - * pipapo_commit() - Replace lookup data with current working copy + * pipapo_reclaim_match - RCU callback to free fields from old matching data + * @rcu: RCU head + */ +static void pipapo_reclaim_match(struct rcu_head *rcu) +{ + struct nft_pipapo_match *m; + + m = container_of(rcu, struct nft_pipapo_match, rcu); + pipapo_free_match(m); +} + +/** + * 
nft_pipapo_commit() - Replace lookup data with current working copy * @set: nftables API set representation * * While at it, check if we should perform garbage collection on the working @@ -1638,7 +1643,7 @@ static void pipapo_reclaim_match(struct rcu_head *rcu) * We also need to create a new working copy for subsequent insertions and * deletions. */ -static void pipapo_commit(const struct nft_set *set) +static void nft_pipapo_commit(const struct nft_set *set) { struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_match *new_clone, *old; @@ -1663,6 +1668,26 @@ static void pipapo_commit(const struct nft_set *set) priv->clone = new_clone; } +static void nft_pipapo_abort(const struct nft_set *set) +{ + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *new_clone, *m; + + if (!priv->dirty) + return; + + m = rcu_dereference(priv->match); + + new_clone = pipapo_clone(m); + if (IS_ERR(new_clone)) + return; + + priv->dirty = false; + + pipapo_free_match(priv->clone); + priv->clone = new_clone; +} + /** * nft_pipapo_activate() - Mark element reference as active given key, commit * @net: Network namespace @@ -1670,8 +1695,7 @@ static void pipapo_commit(const struct nft_set *set) * @elem: nftables API element representation containing key data * * On insertion, elements are added to a copy of the matching data currently - * in use for lookups, and not directly inserted into current lookup data, so - * we'll take care of that by calling pipapo_commit() here. Both + * in use for lookups, and not directly inserted into current lookup data. Both * nft_pipapo_insert() and nft_pipapo_activate() are called once for each * element, hence we can't purpose either one as a real commit operation. */ @@ -1687,8 +1711,6 @@ static void nft_pipapo_activate(const struct net *net, nft_set_elem_change_active(net, set, &e->ext); nft_set_elem_clear_busy(&e->ext); - - pipapo_commit(set); } /** @@ -1938,7 +1960,6 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, if (i == m->field_count) { priv->dirty = true; pipapo_drop(m, rulemap); - pipapo_commit(set); return; } @@ -2238,6 +2259,8 @@ const struct nft_set_type nft_set_pipapo_type = { .init = nft_pipapo_init, .destroy = nft_pipapo_destroy, .gc_init = nft_pipapo_gc_init, + .commit = nft_pipapo_commit, + .abort = nft_pipapo_abort, .elemsize = offsetof(struct nft_pipapo_elem, ext), }, }; @@ -2260,6 +2283,8 @@ const struct nft_set_type nft_set_pipapo_avx2_type = { .init = nft_pipapo_init, .destroy = nft_pipapo_destroy, .gc_init = nft_pipapo_gc_init, + .commit = nft_pipapo_commit, + .abort = nft_pipapo_abort, .elemsize = offsetof(struct nft_pipapo_elem, ext), }, }; -- Gitee From ab29d0284c87c17ccf08c8153e83e182da75c83b Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 30 Mar 2025 15:45:52 +0800 Subject: [PATCH 02/30] netfilter: nf_tables: GC transaction API to avoid race with control plane MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #14678 commit 5f68718b34a531a556f2f50300ead2862278da26 upstream. The set types rhashtable and rbtree use a GC worker to reclaim memory. From system work queue, in periodic intervals, a scan of the table is done. The major caveat here is that the nft transaction mutex is not held. This causes a race between control plane and GC when they attempt to delete the same element. 
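For illustration, the race reduces to the following self-contained userspace C model of the check-then-act gap; every name in it is invented for the sketch and none of it is kernel API:

#include <stdbool.h>
#include <stdio.h>

struct elem {
	bool expired;
	int frees;	/* how many paths reclaimed this element */
};

/* GC path: the decision is taken without the transaction mutex, so the
 * gap between "check" and "act" can interleave with a deletion done by
 * the control plane.
 */
static void gc_reap(struct elem *e, bool check_passed)
{
	if (check_passed)
		e->frees++;	/* stands in for the actual free/destroy */
}

/* Control-plane path: deletes the same element under the mutex. */
static void ctrl_delete(struct elem *e)
{
	e->frees++;
}

int main(void)
{
	struct elem e = { .expired = true, .frees = 0 };
	bool check = e.expired && e.frees == 0;	/* GC checks first... */

	ctrl_delete(&e);	/* ...control plane frees in between... */
	gc_reap(&e, check);	/* ...GC acts on a stale decision */

	printf("element reclaimed %d times%s\n", e.frees,
	       e.frees > 1 ? " -> double free in the real code" : "");
	return 0;
}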
We cannot grab the netlink mutex from the work queue, because the
control plane has to wait for the GC work queue in case the set is to
be removed, so we get the following deadlock:

   cpu 1                              cpu 2
     GC work                          transaction comes in, lock nft mutex
       `acquire nft mutex // BLOCKS
                                      transaction asks to remove the set
                                      set destruction calls cancel_work_sync()

cancel_work_sync will now block forever, because it is waiting for the
mutex the caller already owns.

This patch adds a new API that deals with garbage collection in two
steps:

1) Lockless GC of expired elements sets on the NFT_SET_ELEM_DEAD_BIT
   so they are not visible via lookup. Annotate current GC sequence in
   the GC transaction. Enqueue GC transaction work as soon as it is
   full. If ruleset is updated, then GC transaction is aborted and
   retried later.

2) GC work grabs the mutex. If the GC sequence has changed then this GC
   transaction lost the race with the control plane, abort it as it
   contains stale references to objects and let GC try again later. If
   the ruleset is intact, then this GC transaction deactivates and
   removes the elements and it uses call_rcu() to destroy elements.

Note that no elements are removed from the GC lockless path, the _DEAD
bit is set and pointers are collected. GC catchall does not remove the
elements anymore either.

There is a new set->dead flag that is set to abort the GC transaction,
to deal with the set->ops->destroy() path which removes the remaining
elements in the set from commit_release, where no mutex is held.

To deal with GC when the mutex is held, which allows safe deactivate
and removal, add a sync GC API which releases the set element object
via call_rcu(). This is used by the rbtree and pipapo backends which
also perform garbage collection from the control plane path.

Since element removal from sets can happen from control plane and
element garbage collection/timeout, it is necessary to keep the set
structure alive until all elements have been deactivated and
destroyed.

We cannot do a cancel_work_sync or flush_work in nft_set_destroy
because it is called with the transaction mutex held, but the
aforementioned async work queue might be blocked on the very mutex
that the nft_set_destroy() callchain is sitting on. This gives us the
choice of ABBA deadlock or UaF.

To avoid both, add a set->refs refcount_t member. The GC API can then
increment the set refcount and release it once the elements have been
free'd.

Set backends are adapted to use the GC transaction API in a follow up
patch entitled:

  ("netfilter: nf_tables: use gc transaction API in set backends")

This is joint work with Florian Westphal.

Fixes: cfed7e1b1f8e ("netfilter: nf_tables: add set garbage collection helpers")
Change-Id: Ia8fe1c53b3e256e479c1fdbf7c15609583bf1109
Signed-off-by: Pablo Neira Ayuso
Signed-off-by: Sasha Levin
[move the definition of gc_seq from the original patch’s struct
nftables_pernet to struct netns_nftables, and add it using a reserved
field to preserve KABI (Kernel ABI) compatibility.
- yaxin] Fixes: CVE-2024-4244 Signed-off-by: Wang Yaxin --- include/net/netfilter/nf_tables.h | 60 ++++++++- include/net/netns/nftables.h | 2 +- net/netfilter/nf_tables_api.c | 212 +++++++++++++++++++++++++++++- 3 files changed, 266 insertions(+), 8 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index d5f623833a8b..270514f37a5d 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -402,6 +402,7 @@ struct nft_set_type { * * @list: table set list node * @bindings: list of set bindings + * @refs: internal refcounting for async set destruction * @table: table this set belongs to * @net: netnamespace this set belongs to * @name: name of the set @@ -431,6 +432,7 @@ struct nft_set_type { struct nft_set { struct list_head list; struct list_head bindings; + refcount_t refs; struct nft_table *table; possible_net_t net; char *name; @@ -453,7 +455,8 @@ struct nft_set { struct list_head pending_update; /* runtime data below here */ const struct nft_set_ops *ops ____cacheline_aligned; - u16 flags:14, + u16 flags:13, + dead:1, genmask:2; u8 klen; u8 dlen; @@ -1420,6 +1423,32 @@ static inline void nft_set_elem_clear_busy(struct nft_set_ext *ext) clear_bit(NFT_SET_ELEM_BUSY_BIT, word); } +#define NFT_SET_ELEM_DEAD_MASK (1 << 3) + +#if defined(__LITTLE_ENDIAN_BITFIELD) +#define NFT_SET_ELEM_DEAD_BIT 3 +#elif defined(__BIG_ENDIAN_BITFIELD) +#define NFT_SET_ELEM_DEAD_BIT (BITS_PER_LONG - BITS_PER_BYTE + 3) +#else +#error +#endif + +static inline void nft_set_elem_dead(struct nft_set_ext *ext) +{ + unsigned long *word = (unsigned long *)ext; + + BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); + set_bit(NFT_SET_ELEM_DEAD_BIT, word); +} + +static inline int nft_set_elem_is_dead(const struct nft_set_ext *ext) +{ + unsigned long *word = (unsigned long *)ext; + + BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); + return test_bit(NFT_SET_ELEM_DEAD_BIT, word); +} + /** * struct nft_trans - nf_tables object update in transaction * @@ -1543,6 +1572,35 @@ struct nft_trans_flowtable { #define nft_trans_flowtable_flags(trans) \ (((struct nft_trans_flowtable *)trans->data)->flags) +#define NFT_TRANS_GC_BATCHCOUNT 256 + +struct nft_trans_gc { + struct list_head list; + struct net *net; + struct nft_set *set; + u32 seq; + u8 count; + void *priv[NFT_TRANS_GC_BATCHCOUNT]; + struct rcu_head rcu; +}; + +struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set, + unsigned int gc_seq, gfp_t gfp); +void nft_trans_gc_destroy(struct nft_trans_gc *trans); + +struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc, + unsigned int gc_seq, gfp_t gfp); +void nft_trans_gc_queue_async_done(struct nft_trans_gc *gc); + +struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp); +void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans); + +void nft_trans_gc_elem_add(struct nft_trans_gc *gc, void *priv); + +void nft_setelem_data_deactivate(const struct net *net, + const struct nft_set *set, + struct nft_set_elem *elem); + int __init nft_chain_filter_init(void); void nft_chain_filter_fini(void); diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h index ed3d917272c2..c6ea589ac569 100644 --- a/include/net/netns/nftables.h +++ b/include/net/netns/nftables.h @@ -14,8 +14,8 @@ struct netns_nftables { unsigned int base_seq; u8 gencursor; u8 validate_state; + CK_KABI_USE_SPLIT(1, unsigned int gc_seq) - CK_KABI_RESERVE(1) }; #endif diff --git a/net/netfilter/nf_tables_api.c 
b/net/netfilter/nf_tables_api.c index 691a0616dc46..824c3182cb0b 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -29,7 +29,9 @@ static LIST_HEAD(nf_tables_expressions); static LIST_HEAD(nf_tables_objects); static LIST_HEAD(nf_tables_flowtables); static LIST_HEAD(nf_tables_destroy_list); +static LIST_HEAD(nf_tables_gc_list); static DEFINE_SPINLOCK(nf_tables_destroy_list_lock); +static DEFINE_SPINLOCK(nf_tables_gc_list_lock); static u64 table_handle; enum { @@ -84,6 +86,9 @@ static void nft_validate_state_update(struct net *net, u8 new_validate_state) static void nf_tables_trans_destroy_work(struct work_struct *w); static DECLARE_WORK(trans_destroy_work, nf_tables_trans_destroy_work); +static void nft_trans_gc_work(struct work_struct *work); +static DECLARE_WORK(trans_gc_work, nft_trans_gc_work); + static void nft_ctx_init(struct nft_ctx *ctx, struct net *net, const struct sk_buff *skb, @@ -4389,6 +4394,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, } INIT_LIST_HEAD(&set->bindings); + refcount_set(&set->refs, 1); set->table = table; write_pnet(&set->net, net); set->ops = ops; @@ -4446,6 +4452,14 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, return err; } +static void nft_set_put(struct nft_set *set) +{ + if (refcount_dec_and_test(&set->refs)) { + kfree(set->name); + kvfree(set); + } +} + static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) { if (WARN_ON(set->use > 0)) @@ -4455,8 +4469,7 @@ static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) nft_expr_destroy(ctx, set->expr); set->ops->destroy(set); - kfree(set->name); - kvfree(set); + nft_set_put(set); } static int nf_tables_delset(struct net *net, struct sock *nlsk, @@ -5272,7 +5285,7 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem, } EXPORT_SYMBOL_GPL(nft_set_elem_destroy); -/* Only called from commit path, nft_set_elem_deactivate() already deals with +/* Only called from commit path, nft_setelem_data_deactivate() already deals with * the refcounting from the preparation phase. 
*/ static void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, @@ -5656,7 +5669,7 @@ static void nft_set_elem_activate(const struct net *net, (*nft_set_ext_obj(ext))->use++; } -static void nft_set_elem_deactivate(const struct net *net, +void nft_setelem_data_deactivate(const struct net *net, const struct nft_set *set, struct nft_set_elem *elem) { @@ -5735,7 +5748,7 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, kfree(elem.priv); elem.priv = priv; - nft_set_elem_deactivate(ctx->net, set, &elem); + nft_setelem_data_deactivate(ctx->net, set, &elem); nft_trans_elem(trans) = elem; list_add_tail(&trans->list, &ctx->net->nft.commit_list); @@ -5769,7 +5782,7 @@ static int nft_flush_set(const struct nft_ctx *ctx, } set->ndeact++; - nft_set_elem_deactivate(ctx->net, set, elem); + nft_setelem_data_deactivate(ctx->net, set, elem); nft_trans_elem_set(trans) = set; nft_trans_elem(trans) = *elem; list_add_tail(&trans->list, &ctx->net->nft.commit_list); @@ -7891,6 +7904,177 @@ void nft_chain_del(struct nft_chain *chain) list_del_rcu(&chain->list); } +static void nft_trans_gc_setelem_remove(struct nft_ctx *ctx, + struct nft_trans_gc *trans) +{ + void **priv = trans->priv; + unsigned int i; + + for (i = 0; i < trans->count; i++) { + struct nft_set_elem elem = { + .priv = priv[i], + }; + + nft_setelem_data_deactivate(ctx->net, trans->set, &elem); + trans->set->ops->remove(trans->net, trans->set, &elem); + } +} + +void nft_trans_gc_destroy(struct nft_trans_gc *trans) +{ + nft_set_put(trans->set); + put_net(trans->net); + kfree(trans); +} + +static void nft_trans_gc_trans_free(struct rcu_head *rcu) +{ + struct nft_set_elem elem = {}; + struct nft_trans_gc *trans; + struct nft_ctx ctx = {}; + unsigned int i; + + trans = container_of(rcu, struct nft_trans_gc, rcu); + ctx.net = read_pnet(&trans->set->net); + + for (i = 0; i < trans->count; i++) { + elem.priv = trans->priv[i]; + atomic_dec(&trans->set->nelems); + + nf_tables_set_elem_destroy(&ctx, trans->set, elem.priv); + } + + nft_trans_gc_destroy(trans); +} + +static bool nft_trans_gc_work_done(struct nft_trans_gc *trans) +{ + struct netns_nftables *net_nft = &trans->net->nft; + struct nft_ctx ctx = {}; + + mutex_lock(&net_nft->commit_mutex); + + /* Check for race with transaction, otherwise this batch refers to + * stale objects that might not be there anymore. Skip transaction if + * set has been destroyed from control plane transaction in case gc + * worker loses race. 
+ */ + if (READ_ONCE(net_nft->gc_seq) != trans->seq || trans->set->dead) { + mutex_unlock(&net_nft->commit_mutex); + return false; + } + + ctx.net = trans->net; + ctx.table = trans->set->table; + + nft_trans_gc_setelem_remove(&ctx, trans); + mutex_unlock(&net_nft->commit_mutex); + + return true; +} + +static void nft_trans_gc_work(struct work_struct *work) +{ + struct nft_trans_gc *trans, *next; + LIST_HEAD(trans_gc_list); + + spin_lock(&nf_tables_destroy_list_lock); + list_splice_init(&nf_tables_gc_list, &trans_gc_list); + spin_unlock(&nf_tables_destroy_list_lock); + + list_for_each_entry_safe(trans, next, &trans_gc_list, list) { + list_del(&trans->list); + if (!nft_trans_gc_work_done(trans)) { + nft_trans_gc_destroy(trans); + continue; + } + call_rcu(&trans->rcu, nft_trans_gc_trans_free); + } +} + +struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set, + unsigned int gc_seq, gfp_t gfp) +{ + struct net *net = read_pnet(&set->net); + struct nft_trans_gc *trans; + + trans = kzalloc(sizeof(*trans), gfp); + if (!trans) + return NULL; + + refcount_inc(&set->refs); + trans->set = set; + trans->net = get_net(net); + trans->seq = gc_seq; + + return trans; +} + +void nft_trans_gc_elem_add(struct nft_trans_gc *trans, void *priv) +{ + trans->priv[trans->count++] = priv; +} + +static void nft_trans_gc_queue_work(struct nft_trans_gc *trans) +{ + spin_lock(&nf_tables_gc_list_lock); + list_add_tail(&trans->list, &nf_tables_gc_list); + spin_unlock(&nf_tables_gc_list_lock); + + schedule_work(&trans_gc_work); +} + +static int nft_trans_gc_space(struct nft_trans_gc *trans) +{ + return NFT_TRANS_GC_BATCHCOUNT - trans->count; +} + +struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc, + unsigned int gc_seq, gfp_t gfp) +{ + if (nft_trans_gc_space(gc)) + return gc; + + nft_trans_gc_queue_work(gc); + + return nft_trans_gc_alloc(gc->set, gc_seq, gfp); +} + +void nft_trans_gc_queue_async_done(struct nft_trans_gc *trans) +{ + if (trans->count == 0) { + nft_trans_gc_destroy(trans); + return; + } + + nft_trans_gc_queue_work(trans); +} + +struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp) +{ + if (WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net))) + return NULL; + + if (nft_trans_gc_space(gc)) + return gc; + + call_rcu(&gc->rcu, nft_trans_gc_trans_free); + + return nft_trans_gc_alloc(gc->set, 0, gfp); +} + +void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans) +{ + WARN_ON_ONCE(!lockdep_commit_lock_is_held(trans->net)); + + if (trans->count == 0) { + nft_trans_gc_destroy(trans); + return; + } + + call_rcu(&trans->rcu, nft_trans_gc_trans_free); +} + static void nf_tables_module_autoload_cleanup(struct net *net) { struct nft_module_request *req, *next; @@ -7992,6 +8176,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) struct nft_trans_elem *te; struct nft_chain *chain; struct nft_table *table; + unsigned int gc_seq; int err; if (list_empty(&net->nft.commit_list)) { @@ -8035,6 +8220,10 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) */ while (++net->nft.base_seq == 0); + /* Bump gc counter, it becomes odd, this is the busy mark. */ + gc_seq = READ_ONCE(net->nft.gc_seq); + WRITE_ONCE(net->nft.gc_seq, ++gc_seq); + /* step 3. Start new generation, rules_gen_X now in use. 
*/ net->nft.gencursor = nft_gencursor_next(net); @@ -8109,6 +8298,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_trans_destroy(trans); break; case NFT_MSG_DELSET: + nft_trans_set(trans)->dead = 1; list_del_rcu(&nft_trans_set(trans)->list); nf_tables_set_notify(&trans->ctx, nft_trans_set(trans), NFT_MSG_DELSET, GFP_KERNEL); @@ -8205,6 +8395,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_commit_notify(net, NETLINK_CB(skb).portid); nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); + + WRITE_ONCE(net->nft.gc_seq, ++gc_seq); nf_tables_commit_release(net); return 0; @@ -9097,6 +9289,7 @@ static int __net_init nf_tables_init_net(struct net *net) mutex_init(&net->nft.commit_mutex); net->nft.base_seq = 1; net->nft.validate_state = NFT_VALIDATE_SKIP; + net->nft.gc_seq = 0; return 0; } @@ -9118,10 +9311,16 @@ static void __net_exit nf_tables_exit_net(struct net *net) WARN_ON_ONCE(!list_empty(&net->nft.notify_list)); } +static void nf_tables_exit_batch(struct list_head *net_exit_list) +{ + flush_work(&trans_gc_work); +} + static struct pernet_operations nf_tables_net_ops = { .init = nf_tables_init_net, .pre_exit = nf_tables_pre_exit_net, .exit = nf_tables_exit_net, + .exit_batch = nf_tables_exit_batch, }; static int __init nf_tables_module_init(void) @@ -9184,6 +9383,7 @@ static void __exit nf_tables_module_exit(void) nft_chain_filter_fini(); nft_chain_route_fini(); unregister_pernet_subsys(&nf_tables_net_ops); + cancel_work_sync(&trans_gc_work); cancel_work_sync(&trans_destroy_work); rcu_barrier(); rhltable_destroy(&nft_objname_ht); -- Gitee From f0be1e807d0a38e275cd184935512ba7cd5aa478 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 30 Mar 2025 15:45:52 +0800 Subject: [PATCH 03/30] netfilter: nft_set_rbtree: Switch to node list walk for overlap detection ANBZ: #14678 [ Upstream commit c9e6978e2725a7d4b6cd23b2facd3f11422c0643 ] ...instead of a tree descent, which became overly complicated in an attempt to cover cases where expired or inactive elements would affect comparisons with the new element being inserted. Further, it turned out that it's probably impossible to cover all those cases, as inactive nodes might entirely hide subtrees consisting of a complete interval plus a node that makes the current insertion not overlap. To speed up the overlap check, descent the tree to find a greater element that is closer to the key value to insert. Then walk down the node list for overlap detection. Starting the overlap check from rb_first() unconditionally is slow, it takes 10 times longer due to the full linear traversal of the list. Moreover, perform garbage collection of expired elements when walking down the node list to avoid bogus overlap reports. For the insertion operation itself, this essentially reverts back to the implementation before commit 7c84d41416d8 ("netfilter: nft_set_rbtree: Detect partial overlaps on insertion"), except that cases of complete overlap are already handled in the overlap detection phase itself, which slightly simplifies the loop to find the insertion point. Based on initial patch from Stefano Brivio, including text from the original patch description too. 
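The overall shape of the new check can be seen in a compressed
userspace model, with a sorted array standing in for the rbtree (a
binary search would replace the tree descent, the index walk replaces
rb_next()); the rbe_le/rbe_ge roles and the error codes follow the
patch, everything else is invented for the sketch:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct elem { int key; bool end; };	/* end: interval end element */

/* v[] sorted by key; returns 0, -EEXIST or -ENOTEMPTY like the patch */
static int overlap_check(const struct elem *v, int n, struct elem new)
{
	const struct elem *ge = NULL, *le = NULL;
	int i;

	/* walk from low to high, remembering the closest element on
	 * each side of the new key (rbe_le / rbe_ge in the patch)
	 */
	for (i = 0; i < n; i++) {
		if (v[i].key >= new.key) {
			ge = &v[i];
			break;
		}
		le = &v[i];
	}

	/* same key and same element type: full overlap, -EEXIST, which
	 * the caller clears when NLM_F_EXCL is absent
	 */
	if (ge && ge->key == new.key && ge->end == new.end)
		return -EEXIST;

	/* two consecutive starts, or two consecutive ends, around the
	 * new element: partial overlap, -ENOTEMPTY
	 */
	if (le && !le->end && !new.end)
		return -ENOTEMPTY;
	if (le && le->end && new.end)
		return -ENOTEMPTY;
	if (ge && ge->end && new.end)
		return -ENOTEMPTY;

	return 0;
}

int main(void)
{
	struct elem set[] = { {10, false}, {20, true} };	/* [10,20] */
	struct elem start = {15, false};	/* new start inside it */

	printf("insert 15: %d (-ENOTEMPTY expected)\n",
	       overlap_check(set, 2, start));
	return 0;
}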
Fixes: 7c84d41416d8 ("netfilter: nft_set_rbtree: Detect partial overlaps on insertion") Change-Id: Ibfccd2bbca58aad10edde8c923cd340f09479de0 Reviewed-by: Stefano Brivio Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin Signed-off-by: Wang Yaxin --- net/netfilter/nft_set_rbtree.c | 316 ++++++++++++++++++++------------- 1 file changed, 189 insertions(+), 127 deletions(-) diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 499be6672b77..ee20a93314ef 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -38,10 +38,12 @@ static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe) return !nft_rbtree_interval_end(rbe); } -static bool nft_rbtree_equal(const struct nft_set *set, const void *this, - const struct nft_rbtree_elem *interval) +static int nft_rbtree_cmp(const struct nft_set *set, + const struct nft_rbtree_elem *e1, + const struct nft_rbtree_elem *e2) { - return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0; + return memcmp(nft_set_ext_key(&e1->ext), nft_set_ext_key(&e2->ext), + set->klen); } static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set, @@ -52,7 +54,6 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set const struct nft_rbtree_elem *rbe, *interval = NULL; u8 genmask = nft_genmask_cur(net); const struct rb_node *parent; - const void *this; int d; parent = rcu_dereference_raw(priv->root.rb_node); @@ -62,12 +63,11 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set rbe = rb_entry(parent, struct nft_rbtree_elem, node); - this = nft_set_ext_key(&rbe->ext); - d = memcmp(this, key, set->klen); + d = memcmp(nft_set_ext_key(&rbe->ext), key, set->klen); if (d < 0) { parent = rcu_dereference_raw(parent->rb_left); if (interval && - nft_rbtree_equal(set, this, interval) && + !nft_rbtree_cmp(set, rbe, interval) && nft_rbtree_interval_end(rbe) && nft_rbtree_interval_start(interval)) continue; @@ -214,154 +214,216 @@ static void *nft_rbtree_get(const struct net *net, const struct nft_set *set, return rbe; } +static int nft_rbtree_gc_elem(const struct nft_set *__set, + struct nft_rbtree *priv, + struct nft_rbtree_elem *rbe) +{ + struct nft_set *set = (struct nft_set *)__set; + struct rb_node *prev = rb_prev(&rbe->node); + struct nft_rbtree_elem *rbe_prev; + struct nft_set_gc_batch *gcb; + + gcb = nft_set_gc_batch_check(set, NULL, GFP_ATOMIC); + if (!gcb) + return -ENOMEM; + + /* search for expired end interval coming before this element. */ + do { + rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); + if (nft_rbtree_interval_end(rbe_prev)) + break; + + prev = rb_prev(prev); + } while (prev != NULL); + + rb_erase(&rbe_prev->node, &priv->root); + rb_erase(&rbe->node, &priv->root); + atomic_sub(2, &set->nelems); + + nft_set_gc_batch_add(gcb, rbe); + nft_set_gc_batch_complete(gcb); + + return 0; +} + +static bool nft_rbtree_update_first(const struct nft_set *set, + struct nft_rbtree_elem *rbe, + struct rb_node *first) +{ + struct nft_rbtree_elem *first_elem; + + first_elem = rb_entry(first, struct nft_rbtree_elem, node); + /* this element is closest to where the new element is to be inserted: + * update the first element for the node list path. 
+ */ + if (nft_rbtree_cmp(set, rbe, first_elem) < 0) + return true; + + return false; +} + static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_rbtree_elem *new, struct nft_set_ext **ext) { - bool overlap = false, dup_end_left = false, dup_end_right = false; + struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL; + struct rb_node *node, *parent, **p, *first = NULL; struct nft_rbtree *priv = nft_set_priv(set); u8 genmask = nft_genmask_next(net); - struct nft_rbtree_elem *rbe; - struct rb_node *parent, **p; - int d; + int d, err; - /* Detect overlaps as we descend the tree. Set the flag in these cases: - * - * a1. _ _ __>| ?_ _ __| (insert end before existing end) - * a2. _ _ ___| ?_ _ _>| (insert end after existing end) - * a3. _ _ ___? >|_ _ __| (insert start before existing end) - * - * and clear it later on, as we eventually reach the points indicated by - * '?' above, in the cases described below. We'll always meet these - * later, locally, due to tree ordering, and overlaps for the intervals - * that are the closest together are always evaluated last. - * - * b1. _ _ __>| !_ _ __| (insert end before existing start) - * b2. _ _ ___| !_ _ _>| (insert end after existing start) - * b3. _ _ ___! >|_ _ __| (insert start after existing end, as a leaf) - * '--' no nodes falling in this range - * b4. >|_ _ ! (insert start before existing start) - * - * Case a3. resolves to b3.: - * - if the inserted start element is the leftmost, because the '0' - * element in the tree serves as end element - * - otherwise, if an existing end is found immediately to the left. If - * there are existing nodes in between, we need to further descend the - * tree before we can conclude the new start isn't causing an overlap - * - * or to b4., which, preceded by a3., means we already traversed one or - * more existing intervals entirely, from the right. - * - * For a new, rightmost pair of elements, we'll hit cases b3. and b2., - * in that order. - * - * The flag is also cleared in two special cases: - * - * b5. |__ _ _!|<_ _ _ (insert start right before existing end) - * b6. |__ _ >|!__ _ _ (insert end right after existing start) - * - * which always happen as last step and imply that no further - * overlapping is possible. - * - * Another special case comes from the fact that start elements matching - * an already existing start element are allowed: insertion is not - * performed but we return -EEXIST in that case, and the error will be - * cleared by the caller if NLM_F_EXCL is not present in the request. - * This way, request for insertion of an exact overlap isn't reported as - * error to userspace if not desired. - * - * However, if the existing start matches a pre-existing start, but the - * end element doesn't match the corresponding pre-existing end element, - * we need to report a partial overlap. This is a local condition that - * can be noticed without need for a tracking flag, by checking for a - * local duplicated end for a corresponding start, from left and right, - * separately. + /* Descend the tree to search for an existing element greater than the + * key value to insert that is greater than the new element. This is the + * first element to walk the ordered elements to find possible overlap. 
*/ - parent = NULL; p = &priv->root.rb_node; while (*p != NULL) { parent = *p; rbe = rb_entry(parent, struct nft_rbtree_elem, node); - d = memcmp(nft_set_ext_key(&rbe->ext), - nft_set_ext_key(&new->ext), - set->klen); + d = nft_rbtree_cmp(set, rbe, new); + if (d < 0) { p = &parent->rb_left; - - if (nft_rbtree_interval_start(new)) { - if (nft_rbtree_interval_end(rbe) && - nft_set_elem_active(&rbe->ext, genmask) && - !nft_set_elem_expired(&rbe->ext) && !*p) - overlap = false; - } else { - if (dup_end_left && !*p) - return -ENOTEMPTY; - - overlap = nft_rbtree_interval_end(rbe) && - nft_set_elem_active(&rbe->ext, - genmask) && - !nft_set_elem_expired(&rbe->ext); - - if (overlap) { - dup_end_right = true; - continue; - } - } } else if (d > 0) { - p = &parent->rb_right; + if (!first || + nft_rbtree_update_first(set, rbe, first)) + first = &rbe->node; - if (nft_rbtree_interval_end(new)) { - if (dup_end_right && !*p) - return -ENOTEMPTY; - - overlap = nft_rbtree_interval_end(rbe) && - nft_set_elem_active(&rbe->ext, - genmask) && - !nft_set_elem_expired(&rbe->ext); - - if (overlap) { - dup_end_left = true; - continue; - } - } else if (nft_set_elem_active(&rbe->ext, genmask) && - !nft_set_elem_expired(&rbe->ext)) { - overlap = nft_rbtree_interval_end(rbe); - } + p = &parent->rb_right; } else { - if (nft_rbtree_interval_end(rbe) && - nft_rbtree_interval_start(new)) { + if (nft_rbtree_interval_end(rbe)) p = &parent->rb_left; - - if (nft_set_elem_active(&rbe->ext, genmask) && - !nft_set_elem_expired(&rbe->ext)) - overlap = false; - } else if (nft_rbtree_interval_start(rbe) && - nft_rbtree_interval_end(new)) { + else p = &parent->rb_right; + } + } + + if (!first) + first = rb_first(&priv->root); + + /* Detect overlap by going through the list of valid tree nodes. + * Values stored in the tree are in reversed order, starting from + * highest to lowest value. + */ + for (node = first; node != NULL; node = rb_next(node)) { + rbe = rb_entry(node, struct nft_rbtree_elem, node); + + if (!nft_set_elem_active(&rbe->ext, genmask)) + continue; - if (nft_set_elem_active(&rbe->ext, genmask) && - !nft_set_elem_expired(&rbe->ext)) - overlap = false; - } else if (nft_set_elem_active(&rbe->ext, genmask) && - !nft_set_elem_expired(&rbe->ext)) { - *ext = &rbe->ext; - return -EEXIST; - } else { - overlap = false; - if (nft_rbtree_interval_end(rbe)) - p = &parent->rb_left; - else - p = &parent->rb_right; + /* perform garbage collection to avoid bogus overlap reports. */ + if (nft_set_elem_expired(&rbe->ext)) { + err = nft_rbtree_gc_elem(set, priv, rbe); + if (err < 0) + return err; + + continue; + } + + d = nft_rbtree_cmp(set, rbe, new); + if (d == 0) { + /* Matching end element: no need to look for an + * overlapping greater or equal element. + */ + if (nft_rbtree_interval_end(rbe)) { + rbe_le = rbe; + break; + } + + /* first element that is greater or equal to key value. */ + if (!rbe_ge) { + rbe_ge = rbe; + continue; + } + + /* this is a closer more or equal element, update it. */ + if (nft_rbtree_cmp(set, rbe_ge, new) != 0) { + rbe_ge = rbe; + continue; } + + /* element is equal to key value, make sure flags are + * the same, an existing more or equal start element + * must not be replaced by more or equal end element. + */ + if ((nft_rbtree_interval_start(new) && + nft_rbtree_interval_start(rbe_ge)) || + (nft_rbtree_interval_end(new) && + nft_rbtree_interval_end(rbe_ge))) { + rbe_ge = rbe; + continue; + } + } else if (d > 0) { + /* annotate element greater than the new element. 
*/ + rbe_ge = rbe; + continue; + } else if (d < 0) { + /* annotate element less than the new element. */ + rbe_le = rbe; + break; } + } - dup_end_left = dup_end_right = false; + /* - new start element matching existing start element: full overlap + * reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given. + */ + if (rbe_ge && !nft_rbtree_cmp(set, new, rbe_ge) && + nft_rbtree_interval_start(rbe_ge) == nft_rbtree_interval_start(new)) { + *ext = &rbe_ge->ext; + return -EEXIST; + } + + /* - new end element matching existing end element: full overlap + * reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given. + */ + if (rbe_le && !nft_rbtree_cmp(set, new, rbe_le) && + nft_rbtree_interval_end(rbe_le) == nft_rbtree_interval_end(new)) { + *ext = &rbe_le->ext; + return -EEXIST; } - if (overlap) + /* - new start element with existing closest, less or equal key value + * being a start element: partial overlap, reported as -ENOTEMPTY. + * Anonymous sets allow for two consecutive start element since they + * are constant, skip them to avoid bogus overlap reports. + */ + if (!nft_set_is_anonymous(set) && rbe_le && + nft_rbtree_interval_start(rbe_le) && nft_rbtree_interval_start(new)) + return -ENOTEMPTY; + + /* - new end element with existing closest, less or equal key value + * being a end element: partial overlap, reported as -ENOTEMPTY. + */ + if (rbe_le && + nft_rbtree_interval_end(rbe_le) && nft_rbtree_interval_end(new)) return -ENOTEMPTY; + /* - new end element with existing closest, greater or equal key value + * being an end element: partial overlap, reported as -ENOTEMPTY + */ + if (rbe_ge && + nft_rbtree_interval_end(rbe_ge) && nft_rbtree_interval_end(new)) + return -ENOTEMPTY; + + /* Accepted element: pick insertion point depending on key value */ + parent = NULL; + p = &priv->root.rb_node; + while (*p != NULL) { + parent = *p; + rbe = rb_entry(parent, struct nft_rbtree_elem, node); + d = nft_rbtree_cmp(set, rbe, new); + + if (d < 0) + p = &parent->rb_left; + else if (d > 0) + p = &parent->rb_right; + else if (nft_rbtree_interval_end(rbe)) + p = &parent->rb_left; + else + p = &parent->rb_right; + } + rb_link_node_rcu(&new->node, parent, p); rb_insert_color(&new->node, &priv->root); return 0; -- Gitee From d4b7dc525f26ba9f41db033b9da01f6506110d43 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 30 Mar 2025 15:45:52 +0800 Subject: [PATCH 04/30] netfilter: nft_set_rbtree: fix null deref on element insertion ANBZ: #14678 [ Upstream commit 61ae320a29b0540c16931816299eb86bf2b66c08 ] There is no guarantee that rb_prev() will not return NULL in nft_rbtree_gc_elem(): general protection fault, probably for non-canonical address 0xdffffc0000000003: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000018-0x000000000000001f] nft_add_set_elem+0x14b0/0x2990 nf_tables_newsetelem+0x528/0xb30 Furthermore, there is a possible use-after-free while iterating, 'node' can be free'd so we need to cache the next value to use. 
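The use-after-free fix is the classic "cache the successor before the
loop body may free the node" idiom; the same pattern on a plain linked
list, as a self-contained userspace sketch (names invented here):

#include <stdio.h>
#include <stdlib.h>

struct node { struct node *next; int expired; };

/* Freeing the current node would invalidate node->next, so fetch the
 * successor before the body runs, exactly as the patch caches
 * next = rb_next(node) at the top of the loop.
 */
static struct node *reap_expired(struct node *head)
{
	struct node **link = &head, *n, *next;

	for (n = head; n != NULL; n = next) {
		next = n->next;		/* cached before n may be freed */
		if (n->expired) {
			*link = next;
			free(n);
		} else {
			link = &n->next;
		}
	}
	return head;
}

int main(void)
{
	struct node *b = calloc(1, sizeof(*b));
	struct node *a = calloc(1, sizeof(*a));

	a->next = b;
	a->expired = 1;			/* head expires, b survives */
	a = reap_expired(a);
	printf("head now %s\n", a == b ? "the survivor" : "?");
	free(b);
	return 0;
}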
Fixes: c9e6978e2725 ("netfilter: nft_set_rbtree: Switch to node list walk for overlap detection") Change-Id: I81505470fb19dc2d9a9a18213490cf6087d80e18 Signed-off-by: Florian Westphal Signed-off-by: Sasha Levin Signed-off-by: Wang Yaxin --- net/netfilter/nft_set_rbtree.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index ee20a93314ef..1b564f40f0ba 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -220,7 +220,7 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, { struct nft_set *set = (struct nft_set *)__set; struct rb_node *prev = rb_prev(&rbe->node); - struct nft_rbtree_elem *rbe_prev; + struct nft_rbtree_elem *rbe_prev = NULL; struct nft_set_gc_batch *gcb; gcb = nft_set_gc_batch_check(set, NULL, GFP_ATOMIC); @@ -228,17 +228,21 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, return -ENOMEM; /* search for expired end interval coming before this element. */ - do { + while (prev) { rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); if (nft_rbtree_interval_end(rbe_prev)) break; prev = rb_prev(prev); - } while (prev != NULL); + } + + if (rbe_prev) { + rb_erase(&rbe_prev->node, &priv->root); + atomic_dec(&set->nelems); + } - rb_erase(&rbe_prev->node, &priv->root); rb_erase(&rbe->node, &priv->root); - atomic_sub(2, &set->nelems); + atomic_dec(&set->nelems); nft_set_gc_batch_add(gcb, rbe); nft_set_gc_batch_complete(gcb); @@ -267,7 +271,7 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_set_ext **ext) { struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL; - struct rb_node *node, *parent, **p, *first = NULL; + struct rb_node *node, *next, *parent, **p, *first = NULL; struct nft_rbtree *priv = nft_set_priv(set); u8 genmask = nft_genmask_next(net); int d, err; @@ -306,7 +310,9 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, * Values stored in the tree are in reversed order, starting from * highest to lowest value. */ - for (node = first; node != NULL; node = rb_next(node)) { + for (node = first; node != NULL; node = next) { + next = rb_next(node); + rbe = rb_entry(node, struct nft_rbtree_elem, node); if (!nft_set_elem_active(&rbe->ext, genmask)) -- Gitee From da7aec91034ce61f9c2754c4a18676b809ee9722 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 30 Mar 2025 15:45:52 +0800 Subject: [PATCH 05/30] netfilter: nft_set_rbtree: fix overlap expiration walk ANBZ: #14678 [ Upstream commit f718863aca469a109895cb855e6b81fff4827d71 ] The lazy gc on insert that should remove timed-out entries fails to release the other half of the interval, if any. Can be reproduced with tests/shell/testcases/sets/0044interval_overlap_0 in nftables.git and kmemleak enabled kernel. Second bug is the use of rbe_prev vs. prev pointer. If rbe_prev() returns NULL after at least one iteration, rbe_prev points to element that is not an end interval, hence it should not be removed. Lastly, check the genmask of the end interval if this is active in the current generation. 
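The corrected pairing logic reduces to the following shape (userspace
sketch over an array, purely illustrative): walk backwards from the
expired start element looking for an *active* end element, and only
release a neighbour if that walk actually terminated on one, so a node
cached from a previous iteration is never erased by mistake.

#include <stdbool.h>
#include <stdio.h>

struct elem { bool end, active, erased; };

static void gc_pair(struct elem *v, int idx)
{
	int prev = idx - 1;

	/* search for the end interval coming before this element; end
	 * intervals carry no timeout of their own, they are coupled
	 * with their start element
	 */
	while (prev >= 0 && !(v[prev].end && v[prev].active))
		prev--;

	if (prev >= 0)
		v[prev].erased = true;	/* release the paired end element */

	v[idx].erased = true;	/* release the expired element itself */
}

int main(void)
{
	/* end element first, then its start (tree order in the patch) */
	struct elem v[] = { { .end = true, .active = true }, { 0 } };

	gc_pair(v, 1);
	printf("end erased: %d, start erased: %d\n", v[0].erased, v[1].erased);
	return 0;
}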
Fixes: c9e6978e2725 ("netfilter: nft_set_rbtree: Switch to node list walk for overlap detection") Change-Id: I3811e1c77adaba7c9f6153bfa799bafd9f016452 Signed-off-by: Florian Westphal Signed-off-by: Sasha Levin Signed-off-by: Wang Yaxin --- net/netfilter/nft_set_rbtree.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 1b564f40f0ba..638944f20a0e 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -216,29 +216,37 @@ static void *nft_rbtree_get(const struct net *net, const struct nft_set *set, static int nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv, - struct nft_rbtree_elem *rbe) + struct nft_rbtree_elem *rbe, + u8 genmask) { struct nft_set *set = (struct nft_set *)__set; struct rb_node *prev = rb_prev(&rbe->node); - struct nft_rbtree_elem *rbe_prev = NULL; + struct nft_rbtree_elem *rbe_prev; struct nft_set_gc_batch *gcb; gcb = nft_set_gc_batch_check(set, NULL, GFP_ATOMIC); if (!gcb) return -ENOMEM; - /* search for expired end interval coming before this element. */ + /* search for end interval coming before this element. + * end intervals don't carry a timeout extension, they + * are coupled with the interval start element. + */ while (prev) { rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); - if (nft_rbtree_interval_end(rbe_prev)) + if (nft_rbtree_interval_end(rbe_prev) && + nft_set_elem_active(&rbe_prev->ext, genmask)) break; prev = rb_prev(prev); } - if (rbe_prev) { + if (prev) { + rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); + rb_erase(&rbe_prev->node, &priv->root); atomic_dec(&set->nelems); + nft_set_gc_batch_add(gcb, rbe_prev); } rb_erase(&rbe->node, &priv->root); @@ -320,7 +328,7 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, /* perform garbage collection to avoid bogus overlap reports. */ if (nft_set_elem_expired(&rbe->ext)) { - err = nft_rbtree_gc_elem(set, priv, rbe); + err = nft_rbtree_gc_elem(set, priv, rbe, genmask); if (err < 0) return err; -- Gitee From 2036d5e17792a3f3cf4653a6022093df25d9ddd2 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 30 Mar 2025 15:45:52 +0800 Subject: [PATCH 06/30] netfilter: nft_set_rbtree: skip elements in transaction from garbage collection ANBZ: #14678 [ Upstream commit 5d235d6ce75c12a7fdee375eb211e4116f7ab01b ] Skip interference with an ongoing transaction, do not perform garbage collection on inactive elements. Reset annotated previous end interval if the expired element is marked as busy (control plane removed the element right before expiration). 
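The rules this patch adds to the scan loop can be summarized in a small
skeleton (self-contained userspace model, flags invented for the
illustration): skip elements that are not active in the current
generation, cache the end element seen before its start, and drop that
cache again whenever the start element is skipped as busy.

#include <stdbool.h>
#include <stdio.h>

struct elem { bool end, active, expired, busy, collected; };

static void gc_walk(struct elem *v, int n)
{
	struct elem *rbe_end = NULL;
	int i;

	for (i = 0; i < n; i++) {	/* an end sorts before its start here */
		struct elem *rbe = &v[i];

		if (!rbe->active)
			continue;	/* owned by an ongoing transaction */

		if (rbe->end) {		/* remember end, decide at the start */
			rbe_end = rbe;
			continue;
		}
		if (!rbe->expired)
			continue;
		if (rbe->busy) {	/* control plane removed it first */
			rbe_end = NULL;	/* its end element is spoken for too */
			continue;
		}
		rbe->collected = true;
		if (rbe_end)
			rbe_end->collected = true;
		rbe_end = NULL;
	}
}

int main(void)
{
	struct elem v[] = {
		{ .end = true, .active = true },
		{ .active = true, .expired = true, .busy = true },
		{ .end = true, .active = true },
		{ .active = true, .expired = true },
	};
	int i;

	gc_walk(v, 4);
	for (i = 0; i < 4; i++)	/* expect only elements 2 and 3 collected */
		printf("elem %d collected: %d\n", i, v[i].collected);
	return 0;
}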
Fixes: 8d8540c4f5e0 ("netfilter: nft_set_rbtree: add timeout support") Change-Id: Iafa6a61722414eb10a15bf7aba6e72bf4242ff24 Reviewed-by: Stefano Brivio Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin Signed-off-by: Wang Yaxin --- net/netfilter/nft_set_rbtree.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 638944f20a0e..c2bfcc2de0c1 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -574,23 +574,37 @@ static void nft_rbtree_gc(struct work_struct *work) struct nft_rbtree *priv; struct rb_node *node; struct nft_set *set; + struct net *net; + u8 genmask; priv = container_of(work, struct nft_rbtree, gc_work.work); set = nft_set_container_of(priv); + net = read_pnet(&set->net); + genmask = nft_genmask_cur(net); write_lock_bh(&priv->lock); write_seqcount_begin(&priv->count); for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { rbe = rb_entry(node, struct nft_rbtree_elem, node); + if (!nft_set_elem_active(&rbe->ext, genmask)) + continue; + + /* elements are reversed in the rbtree for historical reasons, + * from highest to lowest value, that is why end element is + * always visited before the start element. + */ if (nft_rbtree_interval_end(rbe)) { rbe_end = rbe; continue; } if (!nft_set_elem_expired(&rbe->ext)) continue; - if (nft_set_elem_mark_busy(&rbe->ext)) + + if (nft_set_elem_mark_busy(&rbe->ext)) { + rbe_end = NULL; continue; + } if (rbe_prev) { rb_erase(&rbe_prev->node, &priv->root); -- Gitee From ad01168f57b839f8eccb63822748700428b44f40 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 30 Mar 2025 15:45:53 +0800 Subject: [PATCH 07/30] netfilter: nf_tables: adapt set backend to use GC transaction API ANBZ: #14678 commit f6c383b8c31a93752a52697f8430a71dcbc46adf upstream. Use the GC transaction API to replace the old and buggy gc API and the busy mark approach. No set elements are removed from async garbage collection anymore, instead the _DEAD bit is set on so the set element is not visible from lookup path anymore. Async GC enqueues transaction work that might be aborted and retried later. rbtree and pipapo set backends does not set on the _DEAD bit from the sync GC path since this runs in control plane path where mutex is held. In this case, set elements are deactivated, removed and then released via RCU callback, sync GC never fails. 
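The split this description makes between the two paths can be sketched
as follows (self-contained userspace model; names, flags and the
sequence check are illustrative stand-ins for the kernel code):

#include <stdbool.h>
#include <stdio.h>

struct elem { bool dead, removed; };

/* async worker: lockless and revocable, so it only marks and batches */
static bool gc_async(struct elem *e, unsigned int sampled, unsigned int cur)
{
	if (sampled != cur)
		return false;	/* ruleset changed: abort batch, retry later */
	e->dead = true;		/* hidden from lookup; removal comes later */
	return true;
}

/* sync path: runs with the mutex held, so it never needs the dead bit
 * and never fails: deactivate, remove, then free via RCU callback
 */
static void gc_sync(struct elem *e)
{
	e->removed = true;
}

int main(void)
{
	struct elem a = { 0 }, b = { 0 };

	printf("async applied: %d\n", gc_async(&a, 2, 2));	/* quiet: ok */
	printf("async applied: %d\n", gc_async(&b, 2, 3));	/* raced: no */
	gc_sync(&b);		/* sync reclaim always succeeds */
	return 0;
}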
Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges") Fixes: 8d8540c4f5e0 ("netfilter: nft_set_rbtree: add timeout support") Fixes: 9d0982927e79 ("netfilter: nft_hash: add support for timeouts") Change-Id: Iacc941a5a5c1ff19d970d8617180100a098515b4 Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin [remove the original patch's #include and extern unsigned int nf_tables_net_idc, replace nft_net with net->nft - yaxin] Signed-off-by: Wang Yaxin --- net/netfilter/nft_set_hash.c | 72 ++++++++++++++++++++++------------ net/netfilter/nft_set_pipapo.c | 44 ++++++++++++++++----- 2 files changed, 81 insertions(+), 35 deletions(-) diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 7f7a901ce17b..cc90dd7c7305 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -59,6 +59,8 @@ static inline int nft_rhash_cmp(struct rhashtable_compare_arg *arg, if (memcmp(nft_set_ext_key(&he->ext), x->key, x->set->klen)) return 1; + if (nft_set_elem_is_dead(&he->ext)) + return 1; if (nft_set_elem_expired(&he->ext)) return 1; if (!nft_set_elem_active(&he->ext, x->genmask)) @@ -187,7 +189,6 @@ static void nft_rhash_activate(const struct net *net, const struct nft_set *set, struct nft_rhash_elem *he = elem->priv; nft_set_elem_change_active(net, set, &he->ext); - nft_set_elem_clear_busy(&he->ext); } static bool nft_rhash_flush(const struct net *net, @@ -195,12 +196,9 @@ static bool nft_rhash_flush(const struct net *net, { struct nft_rhash_elem *he = priv; - if (!nft_set_elem_mark_busy(&he->ext) || - !nft_is_active(net, &he->ext)) { - nft_set_elem_change_active(net, set, &he->ext); - return true; - } - return false; + nft_set_elem_change_active(net, set, &he->ext); + + return true; } static void *nft_rhash_deactivate(const struct net *net, @@ -217,9 +215,8 @@ static void *nft_rhash_deactivate(const struct net *net, rcu_read_lock(); he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params); - if (he != NULL && - !nft_rhash_flush(net, set, he)) - he = NULL; + if (he) + nft_set_elem_change_active(net, set, &he->ext); rcu_read_unlock(); @@ -298,46 +295,70 @@ static void nft_rhash_gc(struct work_struct *work) struct nft_set *set; struct nft_rhash_elem *he; struct nft_rhash *priv; - struct nft_set_gc_batch *gcb = NULL; struct rhashtable_iter hti; + struct nft_trans_gc *gc; + struct net *net; + u32 gc_seq; priv = container_of(work, struct nft_rhash, gc_work.work); set = nft_set_container_of(priv); + net = read_pnet(&set->net); + gc_seq = READ_ONCE(net->nft.gc_seq); + + gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL); + if (!gc) + goto done; rhashtable_walk_enter(&priv->ht, &hti); rhashtable_walk_start(&hti); while ((he = rhashtable_walk_next(&hti))) { if (IS_ERR(he)) { - if (PTR_ERR(he) != -EAGAIN) - break; + if (PTR_ERR(he) != -EAGAIN) { + nft_trans_gc_destroy(gc); + gc = NULL; + goto try_later; + } continue; } + /* Ruleset has been updated, try later. 
*/ + if (READ_ONCE(net->nft.gc_seq) != gc_seq) { + nft_trans_gc_destroy(gc); + gc = NULL; + goto try_later; + } + + if (nft_set_elem_is_dead(&he->ext)) + goto dead_elem; + if (nft_set_ext_exists(&he->ext, NFT_SET_EXT_EXPR)) { struct nft_expr *expr = nft_set_ext_expr(&he->ext); if (expr->ops->gc && expr->ops->gc(read_pnet(&set->net), expr)) - goto gc; + goto needs_gc_run; } + if (!nft_set_elem_expired(&he->ext)) continue; -gc: - if (nft_set_elem_mark_busy(&he->ext)) - continue; - - gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); - if (gcb == NULL) - break; - rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params); - atomic_dec(&set->nelems); - nft_set_gc_batch_add(gcb, he); +needs_gc_run: + nft_set_elem_dead(&he->ext); +dead_elem: + gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); + if (!gc) + goto try_later; + + nft_trans_gc_elem_add(gc, he); } + +try_later: rhashtable_walk_stop(&hti); rhashtable_walk_exit(&hti); - nft_set_gc_batch_complete(gcb); + if (gc) + nft_trans_gc_queue_async_done(gc); +done: queue_delayed_work(system_power_efficient_wq, &priv->gc_work, nft_set_gc_interval(set)); } @@ -388,7 +409,6 @@ static void nft_rhash_destroy(const struct nft_set *set) struct nft_rhash *priv = nft_set_priv(set); cancel_delayed_work_sync(&priv->gc_work); - rcu_barrier(); rhashtable_free_and_destroy(&priv->ht, nft_rhash_elem_destroy, (void *)set); } diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 96ed28b9e2d1..c5ba4d124648 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1536,15 +1536,32 @@ static void pipapo_drop(struct nft_pipapo_match *m, } } +static void nft_pipapo_gc_deactivate(struct net *net, struct nft_set *set, + struct nft_pipapo_elem *e) +{ + struct nft_set_elem elem = { + .priv = e, + }; + + nft_setelem_data_deactivate(net, set, &elem); +} + /** * pipapo_gc() - Drop expired entries from set, destroy start and end elements - * @set: nftables API set representation + * @_set: nftables API set representation * @m: Matching data */ -static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) +static void pipapo_gc(const struct nft_set *_set, struct nft_pipapo_match *m) { + struct nft_set *set = (struct nft_set *) _set; struct nft_pipapo *priv = nft_set_priv(set); + struct net *net = read_pnet(&set->net); int rules_f0, first_rule = 0; + struct nft_trans_gc *gc; + + gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL); + if (!gc) + return; while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; @@ -1569,13 +1586,19 @@ static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) f--; i--; e = f->mt[rulemap[i].to].e; - if (nft_set_elem_expired(&e->ext) && - !nft_set_elem_mark_busy(&e->ext)) { + /* synchronous gc never fails, there is no need to set on + * NFT_SET_ELEM_DEAD_BIT. + */ + if (nft_set_elem_expired(&e->ext)) { priv->dirty = true; - pipapo_drop(m, rulemap); - rcu_barrier(); - nft_set_elem_destroy(set, e, true); + gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); + if (!gc) + break; + + nft_pipapo_gc_deactivate(net, set, e); + pipapo_drop(m, rulemap); + nft_trans_gc_elem_add(gc, e); /* And check again current first rule, which is now the * first we haven't checked. 
@@ -1585,7 +1608,10 @@ static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) } } - priv->last_gc = jiffies; + if (gc) { + nft_trans_gc_queue_sync_done(gc); + priv->last_gc = jiffies; + } } /** @@ -1710,7 +1736,6 @@ static void nft_pipapo_activate(const struct net *net, return; nft_set_elem_change_active(net, set, &e->ext); - nft_set_elem_clear_busy(&e->ext); } /** @@ -1736,6 +1761,7 @@ static void *pipapo_deactivate(const struct net *net, const struct nft_set *set, return NULL; nft_set_elem_change_active(net, set, &e->ext); + nft_set_elem_clear_busy(&e->ext); return e; } -- Gitee From 00e1ed874f1a28ab02b778a46d3f0e166c946637 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 30 Mar 2025 15:45:53 +0800 Subject: [PATCH 08/30] netfilter: nf_tables: fix GC transaction races with netns and netlink event exit path ANBZ: #14678 commit 6a33d8b73dfac0a41f3877894b38082bd0c9a5bc upstream. Netlink event path is missing a synchronization point with GC transactions. Add GC sequence number update to netns release path and netlink event path, any GC transaction losing race will be discarded. Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") Change-Id: Iba92a19f189b853957465dedcf64fd7ebb8f1af3 Signed-off-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal Signed-off-by: Sasha Levin [modify the functions nft_gc_seq_begin() and nft_gc_seq_end() to accept struct netns_nftables * instead of struct nftables_pernet *, since gc_seq has been moved to netns_nftables. - yaxin] Signed-off-by: Wang Yaxin --- net/netfilter/nf_tables_api.c | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 824c3182cb0b..1fab6242ded0 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8169,6 +8169,22 @@ static void nft_set_commit_update(struct list_head *set_update_list) } } +static unsigned int nft_gc_seq_begin(struct netns_nftables *nft_net) +{ + unsigned int gc_seq; + + /* Bump gc counter, it becomes odd, this is the busy mark. */ + gc_seq = READ_ONCE(nft_net->gc_seq); + WRITE_ONCE(nft_net->gc_seq, ++gc_seq); + + return gc_seq; +} + +static void nft_gc_seq_end(struct netns_nftables *nft_net, unsigned int gc_seq) +{ + WRITE_ONCE(nft_net->gc_seq, ++gc_seq); +} + static int nf_tables_commit(struct net *net, struct sk_buff *skb) { struct nft_trans *trans, *next; @@ -8220,9 +8236,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) */ while (++net->nft.base_seq == 0); - /* Bump gc counter, it becomes odd, this is the busy mark. */ - gc_seq = READ_ONCE(net->nft.gc_seq); - WRITE_ONCE(net->nft.gc_seq, ++gc_seq); + gc_seq = nft_gc_seq_begin(&net->nft); /* step 3. Start new generation, rules_gen_X now in use. 
*/ net->nft.gencursor = nft_gencursor_next(net); @@ -8396,7 +8410,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_commit_notify(net, NETLINK_CB(skb).portid); nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); - WRITE_ONCE(net->nft.gc_seq, ++gc_seq); + nft_gc_seq_end(&net->nft, gc_seq); nf_tables_commit_release(net); return 0; @@ -9301,10 +9315,18 @@ static void __net_exit nf_tables_pre_exit_net(struct net *net) static void __net_exit nf_tables_exit_net(struct net *net) { + unsigned int gc_seq; + mutex_lock(&net->nft.commit_mutex); + + gc_seq = nft_gc_seq_begin(&net->nft); + if (!list_empty(&net->nft.commit_list)) __nf_tables_abort(net, NFNL_ABORT_NONE); __nft_release_tables(net); + + nft_gc_seq_end(&net->nft, gc_seq); + mutex_unlock(&net->nft.commit_mutex); WARN_ON_ONCE(!list_empty(&net->nft.tables)); WARN_ON_ONCE(!list_empty(&net->nft.module_list)); -- Gitee From 08cc1279429a9d30e3c9d932173fd2d30a754cf7 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 30 Mar 2025 15:45:53 +0800 Subject: [PATCH 09/30] netfilter: nf_tables: GC transaction race with netns dismantle ANBZ: #14678 commit 02c6c24402bf1c1e986899c14ba22a10b510916b upstream. Use maybe_get_net() since GC workqueue might race with netns exit path. Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") Change-Id: I86ec1682b91dfe484660c4fe26a03a153267b05f Signed-off-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal Signed-off-by: Sasha Levin Signed-off-by: Wang Yaxin --- net/netfilter/nf_tables_api.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 1fab6242ded0..7a9fbfd5f1d1 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8002,9 +8002,14 @@ struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set, if (!trans) return NULL; + trans->net = maybe_get_net(net); + if (!trans->net) { + kfree(trans); + return NULL; + } + refcount_inc(&set->refs); trans->set = set; - trans->net = get_net(net); trans->seq = gc_seq; return trans; -- Gitee From 6060f90316df8c2154e027dd198ed83c5fc7be7a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 30 Mar 2025 15:45:53 +0800 Subject: [PATCH 10/30] netfilter: nf_tables: GC transaction race with abort path ANBZ: #14678 commit 720344340fb9be2765bbaab7b292ece0a4570eae upstream. Abort path is missing a synchronization point with GC transactions. Add GC sequence number hence any GC transaction losing race will be discarded. 
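For illustration, the sequence-number handshake these fixes rely on can be modeled in a few lines of userspace C11. This is only a sketch with invented names (gc_seq, seq_begin, seq_end, batch_valid are not the kernel API): the updater makes the counter odd while it runs and even-but-changed when it finishes, and the GC worker discards its batch whenever the value it sampled no longer matches.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_uint gc_seq; /* even: no transaction in flight */

static unsigned int seq_begin(void)
{
	/* counter becomes odd: the busy mark */
	return atomic_fetch_add(&gc_seq, 1) + 1;
}

static void seq_end(unsigned int seq)
{
	/* counter becomes even again, but different from before */
	atomic_store(&gc_seq, seq + 1);
}

static bool batch_valid(unsigned int sampled)
{
	unsigned int cur = atomic_load(&gc_seq);

	/* any movement, or an odd value, invalidates the batch */
	return cur == sampled && !(cur & 1);
}

int main(void)
{
	unsigned int worker_view = atomic_load(&gc_seq); /* GC samples 0 */
	unsigned int seq = seq_begin();                  /* abort begins: 1 */

	seq_end(seq);                                    /* abort ends: 2 */
	printf("batch still valid? %s\n",
	       batch_valid(worker_view) ? "yes" : "no"); /* prints "no" */
	return 0;
}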
Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") Change-Id: I10cc8d4d60d41f70d47adfd984d1103467bbc4ea Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin Signed-off-by: Wang Yaxin --- net/netfilter/nf_tables_api.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 7a9fbfd5f1d1..37ddad00d165 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8656,7 +8656,12 @@ static void nf_tables_cleanup(struct net *net) static int nf_tables_abort(struct net *net, struct sk_buff *skb, enum nfnl_abort_action action) { - int ret = __nf_tables_abort(net, action); + unsigned int gc_seq; + int ret; + + gc_seq = nft_gc_seq_begin(&net->nft); + ret = __nf_tables_abort(net, action); + nft_gc_seq_end(&net->nft, gc_seq); mutex_unlock(&net->nft.commit_mutex); -- Gitee From 3cdfe2ea3ed2781e2fe0b7284eb8ceff72e57771 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 30 Mar 2025 15:45:53 +0800 Subject: [PATCH 11/30] netfilter: nf_tables: use correct lock to protect gc_list ANBZ: #14678 commit 8357bc946a2abc2a10ca40e5a2105d2b4c57515e upstream. Use the nf_tables_gc_list_lock spinlock, not nf_tables_destroy_list_lock, to protect the gc_list. Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") Change-Id: I98d8ffb22389ef17a7c2e7871ceaed3d2d7beb60 Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin Signed-off-by: Wang Yaxin --- net/netfilter/nf_tables_api.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 37ddad00d165..32b78b9891ae 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -7978,9 +7978,9 @@ static void nft_trans_gc_work(struct work_struct *work) struct nft_trans_gc *trans, *next; LIST_HEAD(trans_gc_list); - spin_lock(&nf_tables_destroy_list_lock); + spin_lock(&nf_tables_gc_list_lock); list_splice_init(&nf_tables_gc_list, &trans_gc_list); - spin_unlock(&nf_tables_destroy_list_lock); + spin_unlock(&nf_tables_gc_list_lock); list_for_each_entry_safe(trans, next, &trans_gc_list, list) { list_del(&trans->list); -- Gitee From 5e21c183552d060f34a9c6b4e4ae655982a171e5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 30 Mar 2025 15:45:54 +0800 Subject: [PATCH 12/30] netfilter: nf_tables: defer gc run if previous batch is still pending ANBZ: #14678 commit 8e51830e29e12670b4c10df070a4ea4c9593e961 upstream. Don't queue more gc work, else we may queue the same elements multiple times. If an element is flagged as dead, this can mean that either the previous gc request was invalidated/discarded by a transaction or that the previous request is still pending in the system work queue. The latter will happen if the gc interval is set to a very low value, e.g. 1ms, and the system work queue is backlogged. The set's refcount is 1 if no previous gc requests are queued, so add a helper for this and skip the gc run if old requests are pending.
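The refcount trick is easy to model in userspace C. This is a sketch with invented names (struct set, gc_is_pending, queue_batch, batch_done are not the kernel symbols): the set itself always holds one reference and every queued GC batch takes another, so a refcount different from 1 means an earlier batch is still in flight and this run can be skipped.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct set { atomic_int refs; };

static bool gc_is_pending(struct set *s)
{
	/* only the set's own reference left? then nothing is queued */
	return atomic_load(&s->refs) != 1;
}

static void queue_batch(struct set *s) { atomic_fetch_add(&s->refs, 1); }
static void batch_done(struct set *s)  { atomic_fetch_sub(&s->refs, 1); }

int main(void)
{
	struct set s = { .refs = 1 };

	queue_batch(&s); /* a previous gc run queued a batch */
	printf("skip run? %s\n", gc_is_pending(&s) ? "yes" : "no"); /* yes */

	batch_done(&s);  /* worker released the old batch */
	printf("skip run? %s\n", gc_is_pending(&s) ? "yes" : "no"); /* no */
	return 0;
}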
Fixes: f6c383b8c31a ("netfilter: nf_tables: adapt set backend to use GC transaction API") Change-Id: I6c1f3808836cd8fced9ceda8f4a96100b3db8c23 Signed-off-by: Florian Westphal Signed-off-by: Sasha Levin Signed-off-by: Wang Yaxin --- include/net/netfilter/nf_tables.h | 5 ++ net/netfilter/nft_set_hash.c | 3 + net/netfilter/nft_set_rbtree.c | 136 +++++++++++++++++++----------- 3 files changed, 93 insertions(+), 51 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 270514f37a5d..bbd58dbc7c65 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -474,6 +474,11 @@ static inline void *nft_set_priv(const struct nft_set *set) return (void *)set->data; } +static inline bool nft_set_gc_is_pending(const struct nft_set *s) +{ + return refcount_read(&s->refs) != 1; +} + static inline struct nft_set *nft_set_container_of(const void *priv) { return (void *)priv - offsetof(struct nft_set, data); diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index cc90dd7c7305..33b1abd9b221 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -305,6 +305,9 @@ static void nft_rhash_gc(struct work_struct *work) net = read_pnet(&set->net); gc_seq = READ_ONCE(net->nft.gc_seq); + if (nft_set_gc_is_pending(set)) + goto done; + gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL); if (!gc) goto done; diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index c2bfcc2de0c1..7c4a0edf63b4 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -46,6 +46,12 @@ static int nft_rbtree_cmp(const struct nft_set *set, set->klen); } +static bool nft_rbtree_elem_expired(const struct nft_rbtree_elem *rbe) +{ + return nft_set_elem_expired(&rbe->ext) || + nft_set_elem_is_dead(&rbe->ext); +} + static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set, const u32 *key, const struct nft_set_ext **ext, unsigned int seq) @@ -80,7 +86,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set continue; } - if (nft_set_elem_expired(&rbe->ext)) + if (nft_rbtree_elem_expired(rbe)) return false; if (nft_rbtree_interval_end(rbe)) { @@ -98,7 +104,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set if (set->flags & NFT_SET_INTERVAL && interval != NULL && nft_set_elem_active(&interval->ext, genmask) && - !nft_set_elem_expired(&interval->ext) && + !nft_rbtree_elem_expired(interval) && nft_rbtree_interval_start(interval)) { *ext = &interval->ext; return true; @@ -214,6 +220,18 @@ static void *nft_rbtree_get(const struct net *net, const struct nft_set *set, return rbe; } +static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set, + struct nft_rbtree *priv, + struct nft_rbtree_elem *rbe) +{ + struct nft_set_elem elem = { + .priv = rbe, + }; + + nft_setelem_data_deactivate(net, set, &elem); + rb_erase(&rbe->node, &priv->root); +} + static int nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv, struct nft_rbtree_elem *rbe, @@ -221,11 +239,12 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, { struct nft_set *set = (struct nft_set *)__set; struct rb_node *prev = rb_prev(&rbe->node); + struct net *net = read_pnet(&set->net); struct nft_rbtree_elem *rbe_prev; - struct nft_set_gc_batch *gcb; + struct nft_trans_gc *gc; - gcb = nft_set_gc_batch_check(set, NULL, GFP_ATOMIC); - if (!gcb) + gc = nft_trans_gc_alloc(set, 0, GFP_ATOMIC); + if (!gc) return -ENOMEM; /* 
search for end interval coming before this element. @@ -243,17 +262,28 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, if (prev) { rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); + nft_rbtree_gc_remove(net, set, priv, rbe_prev); + + /* There is always room in this trans gc for this element, + * memory allocation never actually happens, hence, the warning + * splat in such case. No need to set NFT_SET_ELEM_DEAD_BIT, + * this is synchronous gc which never fails. + */ + gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); + if (WARN_ON_ONCE(!gc)) + return -ENOMEM; - rb_erase(&rbe_prev->node, &priv->root); - atomic_dec(&set->nelems); - nft_set_gc_batch_add(gcb, rbe_prev); + nft_trans_gc_elem_add(gc, rbe_prev); } - rb_erase(&rbe->node, &priv->root); - atomic_dec(&set->nelems); + nft_rbtree_gc_remove(net, set, priv, rbe); + gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); + if (WARN_ON_ONCE(!gc)) + return -ENOMEM; - nft_set_gc_batch_add(gcb, rbe); - nft_set_gc_batch_complete(gcb); + nft_trans_gc_elem_add(gc, rbe); + + nft_trans_gc_queue_sync_done(gc); return 0; } @@ -481,7 +511,6 @@ static void nft_rbtree_activate(const struct net *net, struct nft_rbtree_elem *rbe = elem->priv; nft_set_elem_change_active(net, set, &rbe->ext); - nft_set_elem_clear_busy(&rbe->ext); } static bool nft_rbtree_flush(const struct net *net, @@ -489,12 +518,9 @@ static bool nft_rbtree_flush(const struct net *net, { struct nft_rbtree_elem *rbe = priv; - if (!nft_set_elem_mark_busy(&rbe->ext) || - !nft_is_active(net, &rbe->ext)) { - nft_set_elem_change_active(net, set, &rbe->ext); - return true; - } - return false; + nft_set_elem_change_active(net, set, &rbe->ext); + + return true; } static void *nft_rbtree_deactivate(const struct net *net, @@ -569,26 +595,41 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, static void nft_rbtree_gc(struct work_struct *work) { - struct nft_rbtree_elem *rbe, *rbe_end = NULL, *rbe_prev = NULL; - struct nft_set_gc_batch *gcb = NULL; + struct nft_rbtree_elem *rbe, *rbe_end = NULL; struct nft_rbtree *priv; + struct nft_trans_gc *gc; struct rb_node *node; struct nft_set *set; + unsigned int gc_seq; struct net *net; - u8 genmask; priv = container_of(work, struct nft_rbtree, gc_work.work); set = nft_set_container_of(priv); net = read_pnet(&set->net); - genmask = nft_genmask_cur(net); + gc_seq = READ_ONCE(net->nft.gc_seq); + + if (nft_set_gc_is_pending(set)) + goto done; + + gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL); + if (!gc) + goto done; write_lock_bh(&priv->lock); write_seqcount_begin(&priv->count); for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { + + /* Ruleset has been updated, try later. 
*/ + if (READ_ONCE(net->nft.gc_seq) != gc_seq) { + nft_trans_gc_destroy(gc); + gc = NULL; + goto try_later; + } + rbe = rb_entry(node, struct nft_rbtree_elem, node); - if (!nft_set_elem_active(&rbe->ext, genmask)) - continue; + if (nft_set_elem_is_dead(&rbe->ext)) + goto dead_elem; /* elements are reversed in the rbtree for historical reasons, * from highest to lowest value, that is why end element is @@ -601,40 +642,33 @@ static void nft_rbtree_gc(struct work_struct *work) if (!nft_set_elem_expired(&rbe->ext)) continue; - if (nft_set_elem_mark_busy(&rbe->ext)) { - rbe_end = NULL; + nft_set_elem_dead(&rbe->ext); + + if (!rbe_end) continue; - } - if (rbe_prev) { - rb_erase(&rbe_prev->node, &priv->root); - rbe_prev = NULL; - } - gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); - if (!gcb) - break; + nft_set_elem_dead(&rbe_end->ext); - atomic_dec(&set->nelems); - nft_set_gc_batch_add(gcb, rbe); - rbe_prev = rbe; + gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); + if (!gc) + goto try_later; - if (rbe_end) { - atomic_dec(&set->nelems); - nft_set_gc_batch_add(gcb, rbe_end); - rb_erase(&rbe_end->node, &priv->root); - rbe_end = NULL; - } - node = rb_next(node); - if (!node) - break; + nft_trans_gc_elem_add(gc, rbe_end); + rbe_end = NULL; +dead_elem: + gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); + if (!gc) + goto try_later; + + nft_trans_gc_elem_add(gc, rbe); } - if (rbe_prev) - rb_erase(&rbe_prev->node, &priv->root); +try_later: write_seqcount_end(&priv->count); write_unlock_bh(&priv->lock); - nft_set_gc_batch_complete(gcb); - + if (gc) + nft_trans_gc_queue_async_done(gc); +done: queue_delayed_work(system_power_efficient_wq, &priv->gc_work, nft_set_gc_interval(set)); } -- Gitee From f2df7d88ea9d925620dada56b2915a2e7401a6f6 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 30 Mar 2025 15:45:54 +0800 Subject: [PATCH 13/30] netfilter: nft_set_rbtree: skip sync GC for new elements in this transaction ANBZ: #14678 commit 2ee52ae94baabf7ee09cf2a8d854b990dac5d0e4 upstream. New elements in this transaction might expire before such transaction ends. Skip sync GC for such elements, otherwise the commit path might walk over an already released object. Once the transaction is finished, async GC will collect such expired elements. Fixes: f6c383b8c31a ("netfilter: nf_tables: adapt set backend to use GC transaction API") Change-Id: I91432e2fabbe62879d37974fec170687eee0725a Signed-off-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin Signed-off-by: Wang Yaxin --- net/netfilter/nft_set_rbtree.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 7c4a0edf63b4..754efe26fd7a 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -311,6 +311,7 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL; struct rb_node *node, *next, *parent, **p, *first = NULL; struct nft_rbtree *priv = nft_set_priv(set); + u8 cur_genmask = nft_genmask_cur(net); u8 genmask = nft_genmask_next(net); int d, err; @@ -356,8 +357,11 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, if (!nft_set_elem_active(&rbe->ext, genmask)) continue; - /* perform garbage collection to avoid bogus overlap reports.
*/ - if (nft_set_elem_expired(&rbe->ext)) { + /* perform garbage collection to avoid bogus overlap reports + * but skip new elements in this transaction. + */ + if (nft_set_elem_expired(&rbe->ext) && + nft_set_elem_active(&rbe->ext, cur_genmask)) { err = nft_rbtree_gc_elem(set, priv, rbe, genmask); if (err < 0) return err; -- Gitee From 5c1b22cc63ea5b5205c29cc80ca9b32a7b22f19e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 30 Mar 2025 15:45:54 +0800 Subject: [PATCH 14/30] netfilter: nft_set_rbtree: use read spinlock to avoid datapath contention ANBZ: #14678 commit 96b33300fba880ec0eafcf3d82486f3463b4b6da upstream. rbtree GC does not modify the datastructure, instead it collects expired elements and it enqueues a GC transaction. Use a read spinlock instead to avoid data contention while GC worker is running. Fixes: f6c383b8c31a ("netfilter: nf_tables: adapt set backend to use GC transaction API") Change-Id: I046bdf80bd4ae0c1d2b7cf622ac58a555192149d Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin Signed-off-by: Wang Yaxin --- net/netfilter/nft_set_rbtree.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 754efe26fd7a..26deb8d747ba 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -619,8 +619,7 @@ static void nft_rbtree_gc(struct work_struct *work) if (!gc) goto done; - write_lock_bh(&priv->lock); - write_seqcount_begin(&priv->count); + read_lock_bh(&priv->lock); for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { /* Ruleset has been updated, try later. */ @@ -667,8 +666,7 @@ static void nft_rbtree_gc(struct work_struct *work) nft_trans_gc_elem_add(gc, rbe); } try_later: - write_seqcount_end(&priv->count); - write_unlock_bh(&priv->lock); + read_unlock_bh(&priv->lock); if (gc) nft_trans_gc_queue_async_done(gc); -- Gitee From d97e2a6b7cabd16d4864c7799787daeeed485a7a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 30 Mar 2025 15:45:54 +0800 Subject: [PATCH 15/30] netfilter: nft_set_pipapo: stop GC iteration if GC transaction allocation fails ANBZ: #14678 commit 6d365eabce3c018a80f6e0379b17df2abb17405e upstream. nft_trans_gc_queue_sync() enqueues the GC transaction and it allocates a new one. If this allocation fails, then stop this GC sync run and retry later. Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") Change-Id: Ie3a9820eea4f8992408e34606defe05c402be144 Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin Signed-off-by: Wang Yaxin --- net/netfilter/nft_set_pipapo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index c5ba4d124648..a9a4ba5b63a3 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1594,7 +1594,7 @@ static void pipapo_gc(const struct nft_set *_set, struct nft_pipapo_match *m) gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); if (!gc) - break; + return; nft_pipapo_gc_deactivate(net, set, e); pipapo_drop(m, rulemap); -- Gitee From 9151499b2fd99711dfa8fab1d09a2cd768c4132f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 30 Mar 2025 15:45:54 +0800 Subject: [PATCH 16/30] netfilter: nft_set_hash: try later when GC hits EAGAIN on iteration ANBZ: #14678 commit b079155faae94e9b3ab9337e82100a914ebb4e8d upstream. 
Skip GC run if iterator rewinds to the beginning with EAGAIN, otherwise GC might collect the same element more than once. Fixes: f6c383b8c31a ("netfilter: nf_tables: adapt set backend to use GC transaction API") Change-Id: I61c79f3bdb734fc4d4290c824f6bdfda24762768 Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin Signed-off-by: Wang Yaxin --- net/netfilter/nft_set_hash.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 33b1abd9b221..321210ddef12 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -317,12 +317,9 @@ static void nft_rhash_gc(struct work_struct *work) while ((he = rhashtable_walk_next(&hti))) { if (IS_ERR(he)) { - if (PTR_ERR(he) != -EAGAIN) { - nft_trans_gc_destroy(gc); - gc = NULL; - goto try_later; - } - continue; + nft_trans_gc_destroy(gc); + gc = NULL; + goto try_later; } /* Ruleset has been updated, try later. */ -- Gitee From 9e59353db4024d437d45bd92c1d31ddfff376f92 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 30 Mar 2025 15:45:55 +0800 Subject: [PATCH 17/30] netfilter: nf_tables: fix memleak when more than 255 elements expired ANBZ: #14678 commit cf5000a7787cbc10341091d37245a42c119d26c5 upstream. When more than 255 elements expired we're supposed to switch to a new gc container structure. This never happens: u8 type will wrap before reaching the boundary and nft_trans_gc_space() always returns true. This means we recycle the initial gc container structure and lose track of the elements that came before. While at it, don't deref 'gc' after we've passed it to call_rcu. Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") Change-Id: I53029b3b783ad46c4063cde4b332dba90844bba9 Reported-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin Signed-off-by: Wang Yaxin --- include/net/netfilter/nf_tables.h | 2 +- net/netfilter/nf_tables_api.c | 10 ++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index bbd58dbc7c65..3076737864f4 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1584,7 +1584,7 @@ struct nft_trans_gc { struct net *net; struct nft_set *set; u32 seq; - u8 count; + u16 count; void *priv[NFT_TRANS_GC_BATCHCOUNT]; struct rcu_head rcu; }; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 32b78b9891ae..c62b03de737d 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8037,12 +8037,15 @@ static int nft_trans_gc_space(struct nft_trans_gc *trans) struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc, unsigned int gc_seq, gfp_t gfp) { + struct nft_set *set; + if (nft_trans_gc_space(gc)) return gc; + set = gc->set; nft_trans_gc_queue_work(gc); - return nft_trans_gc_alloc(gc->set, gc_seq, gfp); + return nft_trans_gc_alloc(set, gc_seq, gfp); } void nft_trans_gc_queue_async_done(struct nft_trans_gc *trans) @@ -8057,15 +8060,18 @@ void nft_trans_gc_queue_async_done(struct nft_trans_gc *trans) struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp) { + struct nft_set *set; + if (WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net))) return NULL; if (nft_trans_gc_space(gc)) return gc; + set = gc->set; call_rcu(&gc->rcu, nft_trans_gc_trans_free); - return nft_trans_gc_alloc(gc->set, 0, gfp); + return 
nft_trans_gc_alloc(set, 0, gfp); } void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans) -- Gitee From 1bd88745d0c7e5423bac720bf35504bbd5daa381 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 30 Mar 2025 15:45:55 +0800 Subject: [PATCH 18/30] netfilter: nf_tables: nft_set_rbtree: fix spurious insertion failure ANBZ: #14678 [ Upstream commit 087388278e0f301f4c61ddffb1911d3a180f84b8 ] nft_rbtree_gc_elem() walks back and removes the end interval element that comes before the expired element. There is a small chance that we've cached this element as 'rbe_ge'. If this happens, we hold and test a pointer that has been queued for freeing. It also causes spurious insertion failures: $ cat test-testcases-sets-0044interval_overlap_0.1/testout.log Error: Could not process rule: File exists add element t s { 0 - 2 } ^^^^^^ Failed to insert 0 - 2 given: table ip t { set s { type inet_service flags interval,timeout timeout 2s gc-interval 2s } } The set (rbtree) is empty. The 'failure' doesn't happen on next attempt. Reason is that when we try to insert, the tree may hold an expired element that collides with the range we're adding. While we do evict/erase this element, we can trip over this check: if (rbe_ge && nft_rbtree_interval_end(rbe_ge) && nft_rbtree_interval_end(new)) return -ENOTEMPTY; rbe_ge was erased by the synchronous gc, we should not have done this check. Next attempt won't find it, so retry results in successful insertion. Restart in-kernel to avoid such spurious errors. Such restarts are rare, unless userspace intentionally adds very large numbers of elements with very short timeouts while setting a huge gc interval. Even in this case, this cannot loop forever; on each retry an existing element has been removed. As the caller is holding the transaction mutex, it's impossible for a second entity to add more expiring elements to the tree. After this it also becomes feasible to remove the async gc worker and perform all garbage collection from the commit path. Fixes: c9e6978e2725 ("netfilter: nft_set_rbtree: Switch to node list walk for overlap detection") Change-Id: I446902b9201c892795b7b08d4f7f416803ab7901 Signed-off-by: Florian Westphal Signed-off-by: Sasha Levin Signed-off-by: Wang Yaxin --- net/netfilter/nft_set_rbtree.c | 46 +++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 26deb8d747ba..379c00ea78c5 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -232,10 +232,9 @@ static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set, rb_erase(&rbe->node, &priv->root); } -static int nft_rbtree_gc_elem(const struct nft_set *__set, - struct nft_rbtree *priv, - struct nft_rbtree_elem *rbe, - u8 genmask) +static const struct nft_rbtree_elem * +nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv, + struct nft_rbtree_elem *rbe, u8 genmask) { struct nft_set *set = (struct nft_set *)__set; struct rb_node *prev = rb_prev(&rbe->node); @@ -245,7 +244,7 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, gc = nft_trans_gc_alloc(set, 0, GFP_ATOMIC); if (!gc) - return -ENOMEM; + return ERR_PTR(-ENOMEM); /* search for end interval coming before this element.
* end intervals don't carry a timeout extension, they @@ -260,6 +259,7 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, prev = rb_prev(prev); } + rbe_prev = NULL; if (prev) { rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); nft_rbtree_gc_remove(net, set, priv, rbe_prev); @@ -271,7 +271,7 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, */ gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); if (WARN_ON_ONCE(!gc)) - return -ENOMEM; + return ERR_PTR(-ENOMEM); nft_trans_gc_elem_add(gc, rbe_prev); } @@ -279,13 +279,13 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, nft_rbtree_gc_remove(net, set, priv, rbe); gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); if (WARN_ON_ONCE(!gc)) - return -ENOMEM; + return ERR_PTR(-ENOMEM); nft_trans_gc_elem_add(gc, rbe); nft_trans_gc_queue_sync_done(gc); - return 0; + return rbe_prev; } static bool nft_rbtree_update_first(const struct nft_set *set, @@ -313,7 +313,7 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_rbtree *priv = nft_set_priv(set); u8 cur_genmask = nft_genmask_cur(net); u8 genmask = nft_genmask_next(net); - int d, err; + int d; /* Descend the tree to search for an existing element greater than the * key value to insert that is greater than the new element. This is the @@ -362,9 +362,14 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, */ if (nft_set_elem_expired(&rbe->ext) && nft_set_elem_active(&rbe->ext, cur_genmask)) { - err = nft_rbtree_gc_elem(set, priv, rbe, genmask); - if (err < 0) - return err; + const struct nft_rbtree_elem *removed_end; + + removed_end = nft_rbtree_gc_elem(set, priv, rbe, genmask); + if (IS_ERR(removed_end)) + return PTR_ERR(removed_end); + + if (removed_end == rbe_le || removed_end == rbe_ge) + return -EAGAIN; continue; } @@ -485,11 +490,18 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_rbtree_elem *rbe = elem->priv; int err; - write_lock_bh(&priv->lock); - write_seqcount_begin(&priv->count); - err = __nft_rbtree_insert(net, set, rbe, ext); - write_seqcount_end(&priv->count); - write_unlock_bh(&priv->lock); + do { + if (fatal_signal_pending(current)) + return -EINTR; + + cond_resched(); + + write_lock_bh(&priv->lock); + write_seqcount_begin(&priv->count); + err = __nft_rbtree_insert(net, set, rbe, ext); + write_seqcount_end(&priv->count); + write_unlock_bh(&priv->lock); + } while (err == -EAGAIN); return err; } -- Gitee From dd4b928976410272e034e0845c22ff68213b5d2c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 30 Mar 2025 15:45:55 +0800 Subject: [PATCH 19/30] netfilter: nf_tables: fix kdoc warnings after gc rework ANBZ: #14678 commit 08713cb006b6f07434f276c5ee214fb20c7fd965 upstream. 
Jakub Kicinski says: We've got some new kdoc warnings here: net/netfilter/nft_set_pipapo.c:1557: warning: Function parameter or member '_set' not described in 'pipapo_gc' net/netfilter/nft_set_pipapo.c:1557: warning: Excess function parameter 'set' description in 'pipapo_gc' include/net/netfilter/nf_tables.h:577: warning: Function parameter or member 'dead' not described in 'nft_set' Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") Fixes: f6c383b8c31a ("netfilter: nf_tables: adapt set backend to use GC transaction API") Change-Id: I6a2b42cbee381c567999bd43029e25a6be326cfb Reported-by: Jakub Kicinski Closes: https://lore.kernel.org/netdev/20230810104638.746e46f1@kernel.org/ Signed-off-by: Florian Westphal Signed-off-by: Greg Kroah-Hartman Signed-off-by: Wang Yaxin --- include/net/netfilter/nf_tables.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 3076737864f4..6db6985c9ae4 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -424,6 +424,7 @@ struct nft_set_type { * @expr: stateful expression * @ops: set ops * @flags: set flags + * @dead: set will be freed, never cleared * @genmask: generation mask * @klen: key length * @dlen: data length -- Gitee From afb54e29f0a949f3396bf756b51d6c1e3158e83a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 30 Mar 2025 16:03:12 +0800 Subject: [PATCH 20/30] netfilter: nf_tables: skip set commit for deleted/destroyed sets ANBZ: #14678 commit 7315dc1e122c85ffdfc8defffbb8f8b616c2eb1a upstream. NFT_MSG_DELSET deactivates all elements in the set, skip set->ops->commit() to avoid the unnecessary clone (for the pipapo case) as well as the sync GC cycle, which could deactivate again expired elements in such set. Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") Change-Id: Ide879bcf5ffc10171de67858a4aae8e203b044ed Reported-by: Kevin Rich Signed-off-by: Pablo Neira Ayuso Signed-off-by: Greg Kroah-Hartman Signed-off-by: Wang Yaxin --- net/netfilter/nf_tables_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index c62b03de737d..fb58475be741 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8173,7 +8173,7 @@ static void nft_set_commit_update(struct list_head *set_update_list) list_for_each_entry_safe(set, next, set_update_list, pending_update) { list_del_init(&set->pending_update); - if (!set->ops->commit) + if (!set->ops->commit || set->dead) continue; set->ops->commit(set); -- Gitee From ced60e63ed3d6aef78714889f29fd7a2d1632faf Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 30 Mar 2025 16:03:12 +0800 Subject: [PATCH 21/30] netfilter: nf_tables: mark newset as dead on transaction abort ANBZ: #14678 [ Upstream commit 08e4c8c5919fd405a4d709b4ba43d836894a26eb ] If a transaction is aborted, we should mark the to-be-released NEWSET dead, just like commit path does for DEL and DESTROYSET commands. In both cases all remaining elements will be released via set->ops->destroy(). The existing abort code does NOT post the actual release to the work queue. Also the entire __nf_tables_abort() function is wrapped in gc_seq begin/end pair. Therefore, async gc worker will never try to release the pending set elements, as gc sequence is always stale. 
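The resulting guard is a one-line addition to the pending-update walk. Below is a simplified standalone model of the commit-side loop (struct set, commit_update, pipapo_commit_cb are invented stand-ins, not the kernel structures):

#include <stdbool.h>
#include <stdio.h>

struct set {
	const char *name;
	bool dead;                    /* deleted in this same batch */
	void (*commit)(struct set *);
};

static void pipapo_commit_cb(struct set *s)
{
	printf("%s: swap clone into place\n", s->name);
}

static void commit_update(struct set **list, int n)
{
	for (int i = 0; i < n; i++) {
		struct set *s = list[i];

		/* the "|| s->dead" part is what this patch adds */
		if (!s->commit || s->dead)
			continue;
		s->commit(s);
	}
}

int main(void)
{
	struct set live = { "live", false, pipapo_commit_cb };
	struct set gone = { "gone", true,  pipapo_commit_cb };
	struct set *list[] = { &live, &gone };

	commit_update(list, 2); /* only "live" is committed */
	return 0;
}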
It might be possible to speed up transaction aborts via work queue too, this would result in a race and a possible use-after-free. So fix this before it becomes an issue. Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") Change-Id: I109e322b576d6c54eb6c4b09cd5e8c8b80f4d73e Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin Signed-off-by: Wang Yaxin --- net/netfilter/nf_tables_api.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index fb58475be741..da6af862cf82 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8562,6 +8562,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nft_trans_destroy(trans); break; } + nft_trans_set(trans)->dead = 1; list_del_rcu(&nft_trans_set(trans)->list); break; case NFT_MSG_DELSET: -- Gitee From b74b76db3491d157b1b3a1466bf2bd42d5c9ed49 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 30 Mar 2025 16:03:12 +0800 Subject: [PATCH 22/30] netfilter: nf_tables: skip dead set elements in netlink dump ANBZ: #14678 [ Upstream commit 6b1ca88e4bb63673dc9f9c7f23c899f22c3cb17a ] Delete from packet path relies on the garbage collector to purge elements with NFT_SET_ELEM_DEAD_BIT on. Skip these dead elements from nf_tables_dump_setelem() path, I very rarely see tests/shell/testcases/maps/typeof_maps_add_delete reports [DUMP FAILED] showing a mismatch in the expected output with an element that should not be there. If the netlink dump happens before GC worker run, it might show dead elements in the ruleset listing. nft_rhash_get() already skips dead elements in nft_rhash_cmp(), therefore, it already does not show the element when getting a single element via netlink control plane. Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") Change-Id: Ic4e54241ee324d204e6cbe162cb30f236fcc2900 Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin Signed-off-by: Wang Yaxin --- net/netfilter/nf_tables_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index da6af862cf82..8b8981d9cf9d 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4815,7 +4815,7 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx, const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); struct nft_set_dump_args *args; - if (nft_set_elem_expired(ext)) + if (nft_set_elem_expired(ext) || nft_set_elem_is_dead(ext)) return 0; args = container_of(iter, struct nft_set_dump_args, iter); -- Gitee From bd9b793cdee13509716ff07e56b57fc0ca746b27 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 30 Mar 2025 16:03:12 +0800 Subject: [PATCH 23/30] netfilter: nf_tables: mark set as dead when unbinding anonymous set with timeout ANBZ: #14678 commit 552705a3650bbf46a22b1adedc1b04181490fc36 upstream. While the rhashtable set gc runs asynchronously, a race allows it to collect elements from anonymous sets with timeouts while it is being released from the commit path. Mingi Cho originally reported this issue in a different path in 6.1.x with a pipapo set with low timeouts which is not possible upstream since 7395dfacfff6 ("netfilter: nf_tables: use timestamp to check for set element timeout"). Fix this by setting on the dead flag for anonymous sets to skip async gc in this case. 
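Conceptually, the worker now observes the flag before collecting; the sketch below is a userspace illustration with invented names (struct set, gc_worker, unbind_anonymous are not the kernel symbols):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct set { atomic_bool dead; };

static void gc_worker(struct set *s)
{
	if (atomic_load(&s->dead)) {
		printf("set is dying, skip async gc\n");
		return;
	}
	printf("collect expired elements\n");
}

static void unbind_anonymous(struct set *s)
{
	atomic_store(&s->dead, true); /* what this fix adds on unbind */
}

int main(void)
{
	struct set s = { false };

	gc_worker(&s);        /* before unbind: collects normally */
	unbind_anonymous(&s); /* commit path releases the anon set */
	gc_worker(&s);        /* after unbind: bails out */
	return 0;
}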
According to 08e4c8c5919f ("netfilter: nf_tables: mark newset as dead on transaction abort"), Florian plans to accelerate abort path by releasing objects via workqueue, therefore, this sets on the dead flag for abort path too. Change-Id: Ia49f553b1781dfde3ebb8810ed0457b405642873 Cc: stable@vger.kernel.org Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") Reported-by: Mingi Cho Signed-off-by: Pablo Neira Ayuso Signed-off-by: Greg Kroah-Hartman Signed-off-by: Wang Yaxin --- net/netfilter/nf_tables_api.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 8b8981d9cf9d..57ebe5837631 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4585,6 +4585,7 @@ static void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, if (list_empty(&set->bindings) && nft_set_is_anonymous(set)) { list_del_rcu(&set->list); + set->dead = 1; if (event) nf_tables_set_notify(ctx, set, NFT_MSG_DELSET, GFP_KERNEL); -- Gitee From 856484740530999bddb1634fe5c1b50ef64dde07 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 30 Mar 2025 16:03:12 +0800 Subject: [PATCH 24/30] netfilter: nft_set_rbtree: skip end interval element from gc ANBZ: #14678 commit 60c0c230c6f046da536d3df8b39a20b9a9fd6af0 upstream. rbtree lazy gc on insert might collect an end interval element that has been just added in this transaction, skip end interval elements that are not yet active. Fixes: f718863aca46 ("netfilter: nft_set_rbtree: fix overlap expiration walk") Change-Id: I81dbd3f0ec00a02965abc9b8ebe135df5b28ad67 Cc: stable@vger.kernel.org Reported-by: lonial con Signed-off-by: Pablo Neira Ayuso Signed-off-by: Greg Kroah-Hartman Signed-off-by: Wang Yaxin --- net/netfilter/nft_set_rbtree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 379c00ea78c5..15fb21720489 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -234,7 +234,7 @@ static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set, static const struct nft_rbtree_elem * nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv, - struct nft_rbtree_elem *rbe, u8 genmask) + struct nft_rbtree_elem *rbe) { struct nft_set *set = (struct nft_set *)__set; struct rb_node *prev = rb_prev(&rbe->node); @@ -253,7 +253,7 @@ nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv, while (prev) { rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); if (nft_rbtree_interval_end(rbe_prev) && - nft_set_elem_active(&rbe_prev->ext, genmask)) + nft_set_elem_active(&rbe_prev->ext, NFT_GENMASK_ANY)) break; prev = rb_prev(prev); @@ -364,7 +364,7 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, nft_set_elem_active(&rbe->ext, cur_genmask)) { const struct nft_rbtree_elem *removed_end; - removed_end = nft_rbtree_gc_elem(set, priv, rbe, genmask); + removed_end = nft_rbtree_gc_elem(set, priv, rbe); if (IS_ERR(removed_end)) return PTR_ERR(removed_end); -- Gitee From 5f1a5b9057672f173002f0baff11d598a4d2cff1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 30 Mar 2025 16:03:13 +0800 Subject: [PATCH 25/30] netfilter: nf_tables: don't fail inserts if duplicate has expired ANBZ: #14678 commit 7845914f45f066497ac75b30c50dbc735e84e884 upstream. nftables selftests fail: run-tests.sh testcases/sets/0044interval_overlap_0 Expected: 0-2 .
0-3, got: W: [FAILED] ./testcases/sets/0044interval_overlap_0: got 1 Insertion must ignore duplicate but expired entries. Moreover, there is a strange asymmetry in nft_pipapo_activate: It refetches the current element, whereas the other ->activate callbacks (bitmap, hash, rhash, rbtree) use elem->priv. Same for .remove: other set implementations take elem->priv, nft_pipapo_remove fetches elem->priv, then does a relookup, remove this. I suspect this was the reason for the change that prompted the removal of the expired check in pipapo_get() in the first place, but skipping expired elements there makes no sense to me, this helper is used for normal get requests, insertions (duplicate check) and deactivate callback. In the first two cases expired elements must be skipped. For ->deactivate(), this gets called for DELSETELEM, so it seems to me that expired elements should be skipped as well, i.e. delete request should fail with -ENOENT error. Fixes: 24138933b97b ("netfilter: nf_tables: don't skip expired elements during walk") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin Signed-off-by: Wang Yaxin Change-Id: I6ec7a033a9ef67e1d675ec9640c5ab45605d8dfb --- net/netfilter/nft_set_pipapo.c | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index a9a4ba5b63a3..aa6ddea0696a 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -566,6 +566,8 @@ static struct nft_pipapo_elem *pipapo_get(const struct net *net, goto out; if (last) { + if (nft_set_elem_expired(&f->mt[b].e->ext)) + goto next_match; if ((genmask && !nft_set_elem_active(&f->mt[b].e->ext, genmask))) goto next_match; @@ -600,17 +602,8 @@ static struct nft_pipapo_elem *pipapo_get(const struct net *net, static void *nft_pipapo_get(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, unsigned int flags) { - struct nft_pipapo_elem *ret; - - ret = pipapo_get(net, set, (const u8 *)elem->key.val.data, - nft_genmask_cur(net)); - if (IS_ERR(ret)) - return ret; - - if (nft_set_elem_expired(&ret->ext)) - return ERR_PTR(-ENOENT); - - return ret; + return pipapo_get(net, set, (const u8 *)elem->key.val.data, + nft_genmask_cur(net)); } /** @@ -1729,11 +1722,7 @@ static void nft_pipapo_activate(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem) { - struct nft_pipapo_elem *e; - - e = pipapo_get(net, set, (const u8 *)elem->key.val.data, 0); - if (IS_ERR(e)) - return; + struct nft_pipapo_elem *e = elem->priv; nft_set_elem_change_active(net, set, &e->ext); } @@ -1948,10 +1937,6 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, data = (const u8 *)nft_set_ext_key(&e->ext); - e = pipapo_get(net, set, data, 0); - if (IS_ERR(e)) - return; - while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; const u8 *match_start, *match_end; -- Gitee From 7c61e32ee1cc199b53693a730598be2fb20695f3 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 30 Mar 2025 16:03:13 +0800 Subject: [PATCH 26/30] netfilter: nft_set_hash: skip duplicated elements pending gc run ANBZ: #14678 [ Upstream commit 7ffc7481153bbabf3332c6a19b289730c7e1edf5 ] rhashtable does not provide stable walk, duplicated elements are possible in case of resizing. I considered that checking for errors when calling rhashtable_walk_next() was sufficient to detect the resizing.
However, rhashtable_walk_next() returns -EAGAIN only at the end of the iteration, which is too late, because a gc work containing duplicated elements could have been already scheduled for removal to the worker. Add a u32 gc worker sequence number per set, bump it on every workqueue run. Annotate gc worker sequence number on the expired element. Use it to skip those already seen in this gc workqueue run. Note that this new field is never reset in case gc transaction fails, so next gc worker run on the expired element overrides it. Wraparound of gc worker sequence number should not be an issue with stale gc worker sequence number in the element, that would just postpone the element removal in one gc run. Note that it is not possible to use flags to annotate that element is pending gc run to detect duplicates, given that gc transaction can be invalidated in case of update from the control plane, therefore, not allowing to clear such flag. On x86_64, pahole reports no changes in the size of nft_rhash_elem. Fixes: f6c383b8c31a ("netfilter: nf_tables: adapt set backend to use GC transaction API") Reported-by: Laurent Fasnacht Tested-by: Laurent Fasnacht Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin Signed-off-by: Wang Yaxin Change-Id: Ic4f0f0c0e8c1df9b9528d10b29cb5696d1549b2f --- net/netfilter/nft_set_hash.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 321210ddef12..4a32453a8e08 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -24,10 +24,12 @@ struct nft_rhash { struct rhashtable ht; struct delayed_work gc_work; + u32 wq_gc_seq; }; struct nft_rhash_elem { struct rhash_head node; + u32 wq_gc_seq; struct nft_set_ext ext; }; @@ -312,6 +314,10 @@ static void nft_rhash_gc(struct work_struct *work) if (!gc) goto done; + /* Elements never collected use a zero gc worker sequence number. */ + if (unlikely(++priv->wq_gc_seq == 0)) + priv->wq_gc_seq++; + rhashtable_walk_enter(&priv->ht, &hti); rhashtable_walk_start(&hti); @@ -329,6 +335,14 @@ static void nft_rhash_gc(struct work_struct *work) goto try_later; } + /* rhashtable walk is unstable, already seen in this gc run? + * Then, skip this element. In case of (unlikely) sequence + * wraparound and stale element wq_gc_seq, next gc run will + * just find this expired element. + */ + if (he->wq_gc_seq == priv->wq_gc_seq) + continue; + if (nft_set_elem_is_dead(&he->ext)) goto dead_elem; @@ -349,6 +363,8 @@ static void nft_rhash_gc(struct work_struct *work) if (!gc) goto try_later; + /* annotate gc sequence for this attempt. 
*/ + he->wq_gc_seq = priv->wq_gc_seq; nft_trans_gc_elem_add(gc, he); } -- Gitee From 7ae48d8cf7c9db9f40092e5b399d85e2688a796c Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 30 Mar 2025 16:03:13 +0800 Subject: [PATCH 27/30] netfilter: nft_set_hash: unaligned atomic read on struct nft_set_ext ANBZ: #14678 [ Upstream commit 542ed8145e6f9392e3d0a86a0e9027d2ffd183e4 ] Access to genmask field in struct nft_set_ext results in unaligned atomic read: [ 72.130109] Unable to handle kernel paging request at virtual address ffff0000c2bb708c [ 72.131036] Mem abort info: [ 72.131213] ESR = 0x0000000096000021 [ 72.131446] EC = 0x25: DABT (current EL), IL = 32 bits [ 72.132209] SET = 0, FnV = 0 [ 72.133216] EA = 0, S1PTW = 0 [ 72.134080] FSC = 0x21: alignment fault [ 72.135593] Data abort info: [ 72.137194] ISV = 0, ISS = 0x00000021, ISS2 = 0x00000000 [ 72.142351] CM = 0, WnR = 0, TnD = 0, TagAccess = 0 [ 72.145989] GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 [ 72.150115] swapper pgtable: 4k pages, 48-bit VAs, pgdp=0000000237d27000 [ 72.154893] [ffff0000c2bb708c] pgd=0000000000000000, p4d=180000023ffff403, pud=180000023f84b403, pmd=180000023f835403, +pte=0068000102bb7707 [ 72.163021] Internal error: Oops: 0000000096000021 [#1] SMP [...] [ 72.170041] CPU: 7 UID: 0 PID: 54 Comm: kworker/7:0 Tainted: G E 6.13.0-rc3+ #2 [ 72.170509] Tainted: [E]=UNSIGNED_MODULE [ 72.170720] Hardware name: QEMU QEMU Virtual Machine, BIOS edk2-stable202302-for-qemu 03/01/2023 [ 72.171192] Workqueue: events_power_efficient nft_rhash_gc [nf_tables] [ 72.171552] pstate: 21400005 (nzCv daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--) [ 72.171915] pc : nft_rhash_gc+0x200/0x2d8 [nf_tables] [ 72.172166] lr : nft_rhash_gc+0x128/0x2d8 [nf_tables] [ 72.172546] sp : ffff800081f2bce0 [ 72.172724] x29: ffff800081f2bd40 x28: ffff0000c2bb708c x27: 0000000000000038 [ 72.173078] x26: ffff0000c6780ef0 x25: ffff0000c643df00 x24: ffff0000c6778f78 [ 72.173431] x23: 000000000000001a x22: ffff0000c4b1f000 x21: ffff0000c6780f78 [ 72.173782] x20: ffff0000c2bb70dc x19: ffff0000c2bb7080 x18: 0000000000000000 [ 72.174135] x17: ffff0000c0a4e1c0 x16: 0000000000003000 x15: 0000ac26d173b978 [ 72.174485] x14: ffffffffffffffff x13: 0000000000000030 x12: ffff0000c6780ef0 [ 72.174841] x11: 0000000000000000 x10: ffff800081f2bcf8 x9 : ffff0000c3000000 [ 72.175193] x8 : 00000000000004be x7 : 0000000000000000 x6 : 0000000000000000 [ 72.175544] x5 : 0000000000000040 x4 : ffff0000c3000010 x3 : 0000000000000000 [ 72.175871] x2 : 0000000000003a98 x1 : ffff0000c2bb708c x0 : 0000000000000004 [ 72.176207] Call trace: [ 72.176316] nft_rhash_gc+0x200/0x2d8 [nf_tables] (P) [ 72.176653] process_one_work+0x178/0x3d0 [ 72.176831] worker_thread+0x200/0x3f0 [ 72.176995] kthread+0xe8/0xf8 [ 72.177130] ret_from_fork+0x10/0x20 [ 72.177289] Code: 54fff984 d503201f d2800080 91003261 (f820303f) [ 72.177557] ---[ end trace 0000000000000000 ]--- Align struct nft_set_ext to word size to address this and documentation it. pahole reports that this increases the size of elements for rhash and pipapo in 8 bytes on x86_64. 
Fixes: 7ffc7481153b ("netfilter: nft_set_hash: skip duplicated elements pending gc run") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin Signed-off-by: Wang Yaxin Change-Id: Id0a497c09644964b5dab557d079cdb2c1b00bc44 --- include/net/netfilter/nf_tables.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 6db6985c9ae4..3d0fd8c1ecf4 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -575,15 +575,18 @@ struct nft_set_ext_tmpl { /** * struct nft_set_ext - set extensions * - * @genmask: generation mask + * @genmask: generation mask, but also flags (see NFT_SET_ELEM_DEAD_BIT) * @offset: offsets of individual extension types * @data: beginning of extension data + * + * This structure must be aligned to word size, otherwise atomic bitops + * on genmask field can cause alignment failure on some archs. */ struct nft_set_ext { u8 genmask; u8 offset[NFT_SET_EXT_NUM]; char data[]; -}; +} __aligned(BITS_PER_LONG / 8); static inline void nft_set_ext_prepare(struct nft_set_ext_tmpl *tmpl) { -- Gitee From fa8825ea179d8c3dfd362bdaad4bcbe9e45eadd1 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 30 Mar 2025 16:03:13 +0800 Subject: [PATCH 28/30] netfilter: nf_tables: release batch on table validation from abort path ANBZ: #14678 commit a45e6889575c2067d3c0212b6bc1022891e65b91 upstream. Unlike early commit path stage which triggers a call to abort, an explicit release of the batch is required on abort, otherwise mutex is released and commit_list remains in place. Add WARN_ON_ONCE to ensure commit_list is empty from the abort path before releasing the mutex. After this patch, commit_list is always assumed to be empty before grabbing the mutex, therefore 03c1f1ef1584 ("netfilter: Cleanup nft_net->module_list from nf_tables_exit_net()") only needs to release the pending modules for registration. 
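The control-flow change boils down to remembering the validation error instead of returning early, so the batch is always drained before the mutex is dropped. A minimal standalone model with invented helper names (tables_abort, release_all_transactions are stand-ins):

#include <stdio.h>

#define VALIDATION_EAGAIN (-11) /* stand-in for -EAGAIN */

static int batch_drained;

static void release_all_transactions(void)
{
	batch_drained = 1; /* stand-in for walking the commit list */
}

static int tables_abort(int validation_failed)
{
	int err = 0;

	if (validation_failed)
		err = VALIDATION_EAGAIN; /* remember the error ... */

	release_all_transactions();      /* ... but still drain the batch */

	return err;
}

int main(void)
{
	int ret = tables_abort(1);

	printf("ret=%d, commit list drained=%d\n", ret, batch_drained);
	return 0;
}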
Change-Id: I341be64cf50a8a37e1d501ac121f5e9d8897e24a Cc: stable@vger.kernel.org Fixes: c0391b6ab810 ("netfilter: nf_tables: missing validation from the abort path") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin [replace nft_net with net->nft - yaxin] Signed-off-by: Wang Yaxin --- net/netfilter/nf_tables_api.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 57ebe5837631..d1355c0e9faa 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8494,10 +8494,11 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) struct nft_trans *trans, *next; LIST_HEAD(set_update_list); struct nft_trans_elem *te; + int err = 0; if (action == NFNL_ABORT_VALIDATE && nf_tables_validate(net) < 0) - return -EAGAIN; + err = -EAGAIN; list_for_each_entry_safe_reverse(trans, next, &net->nft.commit_list, list) { @@ -8653,7 +8654,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) else nf_tables_module_autoload_cleanup(net); - return 0; + return err; } static void nf_tables_cleanup(struct net *net) @@ -8671,6 +8672,8 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb, ret = __nf_tables_abort(net, action); nft_gc_seq_end(&net->nft, gc_seq); + WARN_ON_ONCE(!list_empty(&net->nft.commit_list)); + mutex_unlock(&net->nft.commit_mutex); return ret; @@ -9339,8 +9342,11 @@ static void __net_exit nf_tables_exit_net(struct net *net) gc_seq = nft_gc_seq_begin(&net->nft); - if (!list_empty(&net->nft.commit_list)) - __nf_tables_abort(net, NFNL_ABORT_NONE); + WARN_ON_ONCE(!list_empty(&net->nft.commit_list)); + + if (!list_empty(&net->nft.module_list)) + nf_tables_module_autoload_cleanup(net); + __nft_release_tables(net); nft_gc_seq_end(&net->nft, gc_seq); -- Gitee From 549afff4ce48ff18bc2e3e764aed37a4619e9797 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 30 Mar 2025 16:03:13 +0800 Subject: [PATCH 29/30] netfilter: nf_tables: release mutex after nft_gc_seq_end from abort path ANBZ: #14678 commit 0d459e2ffb541841714839e8228b845458ed3b27 upstream. The commit mutex should not be released during the critical section between nft_gc_seq_begin() and nft_gc_seq_end(), otherwise, async GC worker could collect expired objects and get the released commit lock within the same GC sequence. nf_tables_module_autoload() temporarily releases the mutex to load module dependencies, then it goes back to replay the transaction again. Move it at the end of the abort phase after nft_gc_seq_end() is called. 
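The resulting ordering in the abort path, as a standalone sketch (gc_seq_begin, gc_seq_end, abort_transactions, tables_abort are invented stand-ins, not the kernel functions): module autoload may release and re-take the commit mutex, so it must only run once the GC sequence has been closed again.

#include <stdio.h>

static unsigned int gc_seq;

static unsigned int gc_seq_begin(void) { return ++gc_seq; } /* odd: busy */
static void gc_seq_end(unsigned int s) { gc_seq = s + 1; }  /* even again */

static int abort_transactions(void) { return 0; } /* stand-in */

static int tables_abort(int autoload)
{
	unsigned int seq = gc_seq_begin();
	int ret = abort_transactions();

	gc_seq_end(seq);

	/* only now: this step may release and re-grab the mutex */
	if (autoload)
		printf("module autoload\n");
	else
		printf("autoload cleanup\n");

	/* mutex_unlock() would follow here */
	return ret;
}

int main(void)
{
	return tables_abort(1);
}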
Change-Id: I5844daad534874eaf2542f328aaec3225bd6a1c6 Cc: stable@vger.kernel.org Fixes: 720344340fb9 ("netfilter: nf_tables: GC transaction race with abort path") Reported-by: Kuan-Ting Chen Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin [fix contextual conflicts and replace nft_net with net->nft - yaxin] Signed-off-by: Wang Yaxin --- net/netfilter/nf_tables_api.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d1355c0e9faa..93b64ba517b9 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8649,11 +8649,6 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nf_tables_abort_release(trans); } - if (action == NFNL_ABORT_AUTOLOAD) - nf_tables_module_autoload(net); - else - nf_tables_module_autoload_cleanup(net); - return err; } @@ -8674,6 +8669,14 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb, WARN_ON_ONCE(!list_empty(&net->nft.commit_list)); + /* module autoload needs to happen after GC sequence update because it + * temporarily releases and grabs mutex again. + */ + if (action == NFNL_ABORT_AUTOLOAD) + nf_tables_module_autoload(net); + else + nf_tables_module_autoload_cleanup(net); + mutex_unlock(&net->nft.commit_mutex); return ret; -- Gitee From c4eb6c5004a9cf73e8ae4fb268a325bebbb7f1e5 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 30 Mar 2025 16:03:14 +0800 Subject: [PATCH 30/30] netfilter: nft_set_pipapo: release elements in clone only from destroy path ANBZ: #14678 [ Upstream commit b0e256f3dd2ba6532f37c5c22e07cb07a36031ee ] Clone already always provides a current view of the lookup table, use it to destroy the set, otherwise it is possible to destroy elements twice. This fix requires: 212ed75dc5fb ("netfilter: nf_tables: integrate pipapo into commit protocol") which came after: 9827a0e6e23b ("netfilter: nft_set_pipapo: release elements in clone from abort path"). Fixes: 9827a0e6e23b ("netfilter: nft_set_pipapo: release elements in clone from abort path") Change-Id: Icabd1f639de605cfa972e23c72e02e2299c94678 Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin [fix contextual conflicts for the difference of nft_set_pipapo_match_destroy's parameters - yaxin] Signed-off-by: Wang Yaxin --- net/netfilter/nft_set_pipapo.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index aa6ddea0696a..895083d93549 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -2205,8 +2205,6 @@ static void nft_pipapo_destroy(const struct nft_set *set) if (m) { rcu_barrier(); - nft_set_pipapo_match_destroy(set, m); - #ifdef NFT_PIPAPO_ALIGN free_percpu(m->scratch_aligned); #endif @@ -2221,8 +2219,7 @@ static void nft_pipapo_destroy(const struct nft_set *set) if (priv->clone) { m = priv->clone; - if (priv->dirty) - nft_set_pipapo_match_destroy(set, m); + nft_set_pipapo_match_destroy(set, m); #ifdef NFT_PIPAPO_ALIGN free_percpu(priv->clone->scratch_aligned); -- Gitee