diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 327ca2cc5e7a013c4c03c878e35896fe86c15c53..3d0fd8c1ecf470cf4d425906e01ae8db138b02c7 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -369,7 +369,8 @@ struct nft_set_ops { const struct nft_set *set, const struct nft_set_elem *elem, unsigned int flags); - + void (*commit)(const struct nft_set *set); + void (*abort)(const struct nft_set *set); u64 (*privsize)(const struct nlattr * const nla[], const struct nft_set_desc *desc); bool (*estimate)(const struct nft_set_desc *desc, @@ -401,6 +402,7 @@ struct nft_set_type { * * @list: table set list node * @bindings: list of set bindings + * @refs: internal refcounting for async set destruction * @table: table this set belongs to * @net: netnamespace this set belongs to * @name: name of the set @@ -422,6 +424,7 @@ struct nft_set_type { * @expr: stateful expression * @ops: set ops * @flags: set flags + * @dead: set will be freed, never cleared * @genmask: generation mask * @klen: key length * @dlen: data length @@ -430,6 +433,7 @@ struct nft_set_type { struct nft_set { struct list_head list; struct list_head bindings; + refcount_t refs; struct nft_table *table; possible_net_t net; char *name; @@ -449,9 +453,11 @@ struct nft_set { u16 udlen; unsigned char *udata; struct nft_expr *expr; + struct list_head pending_update; /* runtime data below here */ const struct nft_set_ops *ops ____cacheline_aligned; - u16 flags:14, + u16 flags:13, + dead:1, genmask:2; u8 klen; u8 dlen; @@ -469,6 +475,11 @@ static inline void *nft_set_priv(const struct nft_set *set) return (void *)set->data; } +static inline bool nft_set_gc_is_pending(const struct nft_set *s) +{ + return refcount_read(&s->refs) != 1; +} + static inline struct nft_set *nft_set_container_of(const void *priv) { return (void *)priv - offsetof(struct nft_set, data); @@ -564,15 +575,18 @@ struct nft_set_ext_tmpl { /** * struct nft_set_ext - set extensions * - * @genmask: generation mask + * @genmask: generation mask, but also flags (see NFT_SET_ELEM_DEAD_BIT) * @offset: offsets of individual extension types * @data: beginning of extension data + * + * This structure must be aligned to word size, otherwise atomic bitops + * on genmask field can cause alignment failure on some archs. */ struct nft_set_ext { u8 genmask; u8 offset[NFT_SET_EXT_NUM]; char data[]; -}; +} __aligned(BITS_PER_LONG / 8); static inline void nft_set_ext_prepare(struct nft_set_ext_tmpl *tmpl) { @@ -1418,6 +1432,32 @@ static inline void nft_set_elem_clear_busy(struct nft_set_ext *ext) clear_bit(NFT_SET_ELEM_BUSY_BIT, word); } +#define NFT_SET_ELEM_DEAD_MASK (1 << 3) + +#if defined(__LITTLE_ENDIAN_BITFIELD) +#define NFT_SET_ELEM_DEAD_BIT 3 +#elif defined(__BIG_ENDIAN_BITFIELD) +#define NFT_SET_ELEM_DEAD_BIT (BITS_PER_LONG - BITS_PER_BYTE + 3) +#else +#error +#endif + +static inline void nft_set_elem_dead(struct nft_set_ext *ext) +{ + unsigned long *word = (unsigned long *)ext; + + BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); + set_bit(NFT_SET_ELEM_DEAD_BIT, word); +} + +static inline int nft_set_elem_is_dead(const struct nft_set_ext *ext) +{ + unsigned long *word = (unsigned long *)ext; + + BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); + return test_bit(NFT_SET_ELEM_DEAD_BIT, word); +} + /** * struct nft_trans - nf_tables object update in transaction * @@ -1541,6 +1581,35 @@ struct nft_trans_flowtable { #define nft_trans_flowtable_flags(trans) \ (((struct nft_trans_flowtable *)trans->data)->flags) +#define NFT_TRANS_GC_BATCHCOUNT 256 + +struct nft_trans_gc { + struct list_head list; + struct net *net; + struct nft_set *set; + u32 seq; + u16 count; + void *priv[NFT_TRANS_GC_BATCHCOUNT]; + struct rcu_head rcu; +}; + +struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set, + unsigned int gc_seq, gfp_t gfp); +void nft_trans_gc_destroy(struct nft_trans_gc *trans); + +struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc, + unsigned int gc_seq, gfp_t gfp); +void nft_trans_gc_queue_async_done(struct nft_trans_gc *gc); + +struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp); +void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans); + +void nft_trans_gc_elem_add(struct nft_trans_gc *gc, void *priv); + +void nft_setelem_data_deactivate(const struct net *net, + const struct nft_set *set, + struct nft_set_elem *elem); + int __init nft_chain_filter_init(void); void nft_chain_filter_fini(void); diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h index ed3d917272c2dca6a22624eae90ecd83a183bb1e..c6ea589ac569bad69e1dc92bd44a45cc1bf95ff8 100644 --- a/include/net/netns/nftables.h +++ b/include/net/netns/nftables.h @@ -14,8 +14,8 @@ struct netns_nftables { unsigned int base_seq; u8 gencursor; u8 validate_state; + CK_KABI_USE_SPLIT(1, unsigned int gc_seq) - CK_KABI_RESERVE(1) }; #endif diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index de4a667c28c45c7bcdf73fd80f19137a7fdafbb1..93b64ba517b9f876d604b53b54d3fc66dfc8fe1a 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -29,7 +29,9 @@ static LIST_HEAD(nf_tables_expressions); static LIST_HEAD(nf_tables_objects); static LIST_HEAD(nf_tables_flowtables); static LIST_HEAD(nf_tables_destroy_list); +static LIST_HEAD(nf_tables_gc_list); static DEFINE_SPINLOCK(nf_tables_destroy_list_lock); +static DEFINE_SPINLOCK(nf_tables_gc_list_lock); static u64 table_handle; enum { @@ -84,6 +86,9 @@ static void nft_validate_state_update(struct net *net, u8 new_validate_state) static void nf_tables_trans_destroy_work(struct work_struct *w); static DECLARE_WORK(trans_destroy_work, nf_tables_trans_destroy_work); +static void nft_trans_gc_work(struct work_struct *work); +static DECLARE_WORK(trans_gc_work, nft_trans_gc_work); + static void nft_ctx_init(struct nft_ctx *ctx, struct net *net, const struct sk_buff *skb, @@ -4389,6 +4394,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, } INIT_LIST_HEAD(&set->bindings); + refcount_set(&set->refs, 1); set->table = table; write_pnet(&set->net, net); set->ops = ops; @@ -4424,6 +4430,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, } set->handle = nf_tables_alloc_handle(table); + INIT_LIST_HEAD(&set->pending_update); err = nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set); if (err < 0) @@ -4445,6 +4452,14 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, return err; } +static void nft_set_put(struct nft_set *set) +{ + if (refcount_dec_and_test(&set->refs)) { + kfree(set->name); + kvfree(set); + } +} + static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) { if (WARN_ON(set->use > 0)) @@ -4454,8 +4469,7 @@ static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) nft_expr_destroy(ctx, set->expr); set->ops->destroy(set); - kfree(set->name); - kvfree(set); + nft_set_put(set); } static int nf_tables_delset(struct net *net, struct sock *nlsk, @@ -4571,6 +4585,7 @@ static void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, if (list_empty(&set->bindings) && nft_set_is_anonymous(set)) { list_del_rcu(&set->list); + set->dead = 1; if (event) nf_tables_set_notify(ctx, set, NFT_MSG_DELSET, GFP_KERNEL); @@ -4801,7 +4816,7 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx, const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); struct nft_set_dump_args *args; - if (nft_set_elem_expired(ext)) + if (nft_set_elem_expired(ext) || nft_set_elem_is_dead(ext)) return 0; args = container_of(iter, struct nft_set_dump_args, iter); @@ -5271,7 +5286,7 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem, } EXPORT_SYMBOL_GPL(nft_set_elem_destroy); -/* Only called from commit path, nft_set_elem_deactivate() already deals with +/* Only called from commit path, nft_setelem_data_deactivate() already deals with * the refcounting from the preparation phase. */ static void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, @@ -5655,7 +5670,7 @@ static void nft_set_elem_activate(const struct net *net, (*nft_set_ext_obj(ext))->use++; } -static void nft_set_elem_deactivate(const struct net *net, +void nft_setelem_data_deactivate(const struct net *net, const struct nft_set *set, struct nft_set_elem *elem) { @@ -5734,7 +5749,7 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, kfree(elem.priv); elem.priv = priv; - nft_set_elem_deactivate(ctx->net, set, &elem); + nft_setelem_data_deactivate(ctx->net, set, &elem); nft_trans_elem(trans) = elem; list_add_tail(&trans->list, &ctx->net->nft.commit_list); @@ -5768,7 +5783,7 @@ static int nft_flush_set(const struct nft_ctx *ctx, } set->ndeact++; - nft_set_elem_deactivate(ctx->net, set, elem); + nft_setelem_data_deactivate(ctx->net, set, elem); nft_trans_elem_set(trans) = set; nft_trans_elem(trans) = *elem; list_add_tail(&trans->list, &ctx->net->nft.commit_list); @@ -7890,6 +7905,188 @@ void nft_chain_del(struct nft_chain *chain) list_del_rcu(&chain->list); } +static void nft_trans_gc_setelem_remove(struct nft_ctx *ctx, + struct nft_trans_gc *trans) +{ + void **priv = trans->priv; + unsigned int i; + + for (i = 0; i < trans->count; i++) { + struct nft_set_elem elem = { + .priv = priv[i], + }; + + nft_setelem_data_deactivate(ctx->net, trans->set, &elem); + trans->set->ops->remove(trans->net, trans->set, &elem); + } +} + +void nft_trans_gc_destroy(struct nft_trans_gc *trans) +{ + nft_set_put(trans->set); + put_net(trans->net); + kfree(trans); +} + +static void nft_trans_gc_trans_free(struct rcu_head *rcu) +{ + struct nft_set_elem elem = {}; + struct nft_trans_gc *trans; + struct nft_ctx ctx = {}; + unsigned int i; + + trans = container_of(rcu, struct nft_trans_gc, rcu); + ctx.net = read_pnet(&trans->set->net); + + for (i = 0; i < trans->count; i++) { + elem.priv = trans->priv[i]; + atomic_dec(&trans->set->nelems); + + nf_tables_set_elem_destroy(&ctx, trans->set, elem.priv); + } + + nft_trans_gc_destroy(trans); +} + +static bool nft_trans_gc_work_done(struct nft_trans_gc *trans) +{ + struct netns_nftables *net_nft = &trans->net->nft; + struct nft_ctx ctx = {}; + + mutex_lock(&net_nft->commit_mutex); + + /* Check for race with transaction, otherwise this batch refers to + * stale objects that might not be there anymore. Skip transaction if + * set has been destroyed from control plane transaction in case gc + * worker loses race. + */ + if (READ_ONCE(net_nft->gc_seq) != trans->seq || trans->set->dead) { + mutex_unlock(&net_nft->commit_mutex); + return false; + } + + ctx.net = trans->net; + ctx.table = trans->set->table; + + nft_trans_gc_setelem_remove(&ctx, trans); + mutex_unlock(&net_nft->commit_mutex); + + return true; +} + +static void nft_trans_gc_work(struct work_struct *work) +{ + struct nft_trans_gc *trans, *next; + LIST_HEAD(trans_gc_list); + + spin_lock(&nf_tables_gc_list_lock); + list_splice_init(&nf_tables_gc_list, &trans_gc_list); + spin_unlock(&nf_tables_gc_list_lock); + + list_for_each_entry_safe(trans, next, &trans_gc_list, list) { + list_del(&trans->list); + if (!nft_trans_gc_work_done(trans)) { + nft_trans_gc_destroy(trans); + continue; + } + call_rcu(&trans->rcu, nft_trans_gc_trans_free); + } +} + +struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set, + unsigned int gc_seq, gfp_t gfp) +{ + struct net *net = read_pnet(&set->net); + struct nft_trans_gc *trans; + + trans = kzalloc(sizeof(*trans), gfp); + if (!trans) + return NULL; + + trans->net = maybe_get_net(net); + if (!trans->net) { + kfree(trans); + return NULL; + } + + refcount_inc(&set->refs); + trans->set = set; + trans->seq = gc_seq; + + return trans; +} + +void nft_trans_gc_elem_add(struct nft_trans_gc *trans, void *priv) +{ + trans->priv[trans->count++] = priv; +} + +static void nft_trans_gc_queue_work(struct nft_trans_gc *trans) +{ + spin_lock(&nf_tables_gc_list_lock); + list_add_tail(&trans->list, &nf_tables_gc_list); + spin_unlock(&nf_tables_gc_list_lock); + + schedule_work(&trans_gc_work); +} + +static int nft_trans_gc_space(struct nft_trans_gc *trans) +{ + return NFT_TRANS_GC_BATCHCOUNT - trans->count; +} + +struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc, + unsigned int gc_seq, gfp_t gfp) +{ + struct nft_set *set; + + if (nft_trans_gc_space(gc)) + return gc; + + set = gc->set; + nft_trans_gc_queue_work(gc); + + return nft_trans_gc_alloc(set, gc_seq, gfp); +} + +void nft_trans_gc_queue_async_done(struct nft_trans_gc *trans) +{ + if (trans->count == 0) { + nft_trans_gc_destroy(trans); + return; + } + + nft_trans_gc_queue_work(trans); +} + +struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp) +{ + struct nft_set *set; + + if (WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net))) + return NULL; + + if (nft_trans_gc_space(gc)) + return gc; + + set = gc->set; + call_rcu(&gc->rcu, nft_trans_gc_trans_free); + + return nft_trans_gc_alloc(set, 0, gfp); +} + +void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans) +{ + WARN_ON_ONCE(!lockdep_commit_lock_is_held(trans->net)); + + if (trans->count == 0) { + nft_trans_gc_destroy(trans); + return; + } + + call_rcu(&trans->rcu, nft_trans_gc_trans_free); +} + static void nf_tables_module_autoload_cleanup(struct net *net) { struct nft_module_request *req, *next; @@ -7970,12 +8167,44 @@ static void nft_commit_notify(struct net *net, u32 portid) WARN_ON_ONCE(!list_empty(&net->nft.notify_list)); } +static void nft_set_commit_update(struct list_head *set_update_list) +{ + struct nft_set *set, *next; + + list_for_each_entry_safe(set, next, set_update_list, pending_update) { + list_del_init(&set->pending_update); + + if (!set->ops->commit || set->dead) + continue; + + set->ops->commit(set); + } +} + +static unsigned int nft_gc_seq_begin(struct netns_nftables *nft_net) +{ + unsigned int gc_seq; + + /* Bump gc counter, it becomes odd, this is the busy mark. */ + gc_seq = READ_ONCE(nft_net->gc_seq); + WRITE_ONCE(nft_net->gc_seq, ++gc_seq); + + return gc_seq; +} + +static void nft_gc_seq_end(struct netns_nftables *nft_net, unsigned int gc_seq) +{ + WRITE_ONCE(nft_net->gc_seq, ++gc_seq); +} + static int nf_tables_commit(struct net *net, struct sk_buff *skb) { struct nft_trans *trans, *next; + LIST_HEAD(set_update_list); struct nft_trans_elem *te; struct nft_chain *chain; struct nft_table *table; + unsigned int gc_seq; int err; if (list_empty(&net->nft.commit_list)) { @@ -8019,6 +8248,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) */ while (++net->nft.base_seq == 0); + gc_seq = nft_gc_seq_begin(&net->nft); + /* step 3. Start new generation, rules_gen_X now in use. */ net->nft.gencursor = nft_gencursor_next(net); @@ -8093,6 +8324,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_trans_destroy(trans); break; case NFT_MSG_DELSET: + nft_trans_set(trans)->dead = 1; list_del_rcu(&nft_trans_set(trans)->list); nf_tables_set_notify(&trans->ctx, nft_trans_set(trans), NFT_MSG_DELSET, GFP_KERNEL); @@ -8104,6 +8336,11 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nf_tables_setelem_notify(&trans->ctx, te->set, &te->elem, NFT_MSG_NEWSETELEM, 0); + if (te->set->ops->commit && + list_empty(&te->set->pending_update)) { + list_add_tail(&te->set->pending_update, + &set_update_list); + } nft_trans_destroy(trans); break; case NFT_MSG_DELSETELEM: @@ -8115,6 +8352,11 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) te->set->ops->remove(net, te->set, &te->elem); atomic_dec(&te->set->nelems); te->set->ndeact--; + if (te->set->ops->commit && + list_empty(&te->set->pending_update)) { + list_add_tail(&te->set->pending_update, + &set_update_list); + } break; case NFT_MSG_NEWOBJ: if (nft_trans_obj_update(trans)) { @@ -8175,8 +8417,12 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) } } + nft_set_commit_update(&set_update_list); + nft_commit_notify(net, NETLINK_CB(skb).portid); nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); + + nft_gc_seq_end(&net->nft, gc_seq); nf_tables_commit_release(net); return 0; @@ -8229,14 +8475,30 @@ static void nf_tables_abort_release(struct nft_trans *trans) kfree(trans); } +static void nft_set_abort_update(struct list_head *set_update_list) +{ + struct nft_set *set, *next; + + list_for_each_entry_safe(set, next, set_update_list, pending_update) { + list_del_init(&set->pending_update); + + if (!set->ops->abort) + continue; + + set->ops->abort(set); + } +} + static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) { struct nft_trans *trans, *next; + LIST_HEAD(set_update_list); struct nft_trans_elem *te; + int err = 0; if (action == NFNL_ABORT_VALIDATE && nf_tables_validate(net) < 0) - return -EAGAIN; + err = -EAGAIN; list_for_each_entry_safe_reverse(trans, next, &net->nft.commit_list, list) { @@ -8302,6 +8564,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nft_trans_destroy(trans); break; } + nft_trans_set(trans)->dead = 1; list_del_rcu(&nft_trans_set(trans)->list); break; case NFT_MSG_DELSET: @@ -8317,6 +8580,12 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) te = (struct nft_trans_elem *)trans->data; te->set->ops->remove(net, te->set, &te->elem); atomic_dec(&te->set->nelems); + + if (te->set->ops->abort && + list_empty(&te->set->pending_update)) { + list_add_tail(&te->set->pending_update, + &set_update_list); + } break; case NFT_MSG_DELSETELEM: te = (struct nft_trans_elem *)trans->data; @@ -8325,6 +8594,11 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) te->set->ops->activate(net, te->set, &te->elem); te->set->ndeact--; + if (te->set->ops->abort && + list_empty(&te->set->pending_update)) { + list_add_tail(&te->set->pending_update, + &set_update_list); + } nft_trans_destroy(trans); break; case NFT_MSG_NEWOBJ: @@ -8365,6 +8639,8 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) } } + nft_set_abort_update(&set_update_list); + synchronize_rcu(); list_for_each_entry_safe_reverse(trans, next, @@ -8373,12 +8649,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nf_tables_abort_release(trans); } - if (action == NFNL_ABORT_AUTOLOAD) - nf_tables_module_autoload(net); - else - nf_tables_module_autoload_cleanup(net); - - return 0; + return err; } static void nf_tables_cleanup(struct net *net) @@ -8389,7 +8660,22 @@ static void nf_tables_cleanup(struct net *net) static int nf_tables_abort(struct net *net, struct sk_buff *skb, enum nfnl_abort_action action) { - int ret = __nf_tables_abort(net, action); + unsigned int gc_seq; + int ret; + + gc_seq = nft_gc_seq_begin(&net->nft); + ret = __nf_tables_abort(net, action); + nft_gc_seq_end(&net->nft, gc_seq); + + WARN_ON_ONCE(!list_empty(&net->nft.commit_list)); + + /* module autoload needs to happen after GC sequence update because it + * temporarily releases and grabs mutex again. + */ + if (action == NFNL_ABORT_AUTOLOAD) + nf_tables_module_autoload(net); + else + nf_tables_module_autoload_cleanup(net); mutex_unlock(&net->nft.commit_mutex); @@ -9041,6 +9327,7 @@ static int __net_init nf_tables_init_net(struct net *net) mutex_init(&net->nft.commit_mutex); net->nft.base_seq = 1; net->nft.validate_state = NFT_VALIDATE_SKIP; + net->nft.gc_seq = 0; return 0; } @@ -9052,20 +9339,37 @@ static void __net_exit nf_tables_pre_exit_net(struct net *net) static void __net_exit nf_tables_exit_net(struct net *net) { + unsigned int gc_seq; + mutex_lock(&net->nft.commit_mutex); - if (!list_empty(&net->nft.commit_list)) - __nf_tables_abort(net, NFNL_ABORT_NONE); + + gc_seq = nft_gc_seq_begin(&net->nft); + + WARN_ON_ONCE(!list_empty(&net->nft.commit_list)); + + if (!list_empty(&net->nft.module_list)) + nf_tables_module_autoload_cleanup(net); + __nft_release_tables(net); + + nft_gc_seq_end(&net->nft, gc_seq); + mutex_unlock(&net->nft.commit_mutex); WARN_ON_ONCE(!list_empty(&net->nft.tables)); WARN_ON_ONCE(!list_empty(&net->nft.module_list)); WARN_ON_ONCE(!list_empty(&net->nft.notify_list)); } +static void nf_tables_exit_batch(struct list_head *net_exit_list) +{ + flush_work(&trans_gc_work); +} + static struct pernet_operations nf_tables_net_ops = { .init = nf_tables_init_net, .pre_exit = nf_tables_pre_exit_net, .exit = nf_tables_exit_net, + .exit_batch = nf_tables_exit_batch, }; static int __init nf_tables_module_init(void) @@ -9128,6 +9432,7 @@ static void __exit nf_tables_module_exit(void) nft_chain_filter_fini(); nft_chain_route_fini(); unregister_pernet_subsys(&nf_tables_net_ops); + cancel_work_sync(&trans_gc_work); cancel_work_sync(&trans_destroy_work); rcu_barrier(); rhltable_destroy(&nft_objname_ht); diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 7f7a901ce17ba2fc30b1212d1ea71588dafea5a9..4a32453a8e08b85b80e52f3cbcab1c689a1ed0ec 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -24,10 +24,12 @@ struct nft_rhash { struct rhashtable ht; struct delayed_work gc_work; + u32 wq_gc_seq; }; struct nft_rhash_elem { struct rhash_head node; + u32 wq_gc_seq; struct nft_set_ext ext; }; @@ -59,6 +61,8 @@ static inline int nft_rhash_cmp(struct rhashtable_compare_arg *arg, if (memcmp(nft_set_ext_key(&he->ext), x->key, x->set->klen)) return 1; + if (nft_set_elem_is_dead(&he->ext)) + return 1; if (nft_set_elem_expired(&he->ext)) return 1; if (!nft_set_elem_active(&he->ext, x->genmask)) @@ -187,7 +191,6 @@ static void nft_rhash_activate(const struct net *net, const struct nft_set *set, struct nft_rhash_elem *he = elem->priv; nft_set_elem_change_active(net, set, &he->ext); - nft_set_elem_clear_busy(&he->ext); } static bool nft_rhash_flush(const struct net *net, @@ -195,12 +198,9 @@ static bool nft_rhash_flush(const struct net *net, { struct nft_rhash_elem *he = priv; - if (!nft_set_elem_mark_busy(&he->ext) || - !nft_is_active(net, &he->ext)) { - nft_set_elem_change_active(net, set, &he->ext); - return true; - } - return false; + nft_set_elem_change_active(net, set, &he->ext); + + return true; } static void *nft_rhash_deactivate(const struct net *net, @@ -217,9 +217,8 @@ static void *nft_rhash_deactivate(const struct net *net, rcu_read_lock(); he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params); - if (he != NULL && - !nft_rhash_flush(net, set, he)) - he = NULL; + if (he) + nft_set_elem_change_active(net, set, &he->ext); rcu_read_unlock(); @@ -298,46 +297,84 @@ static void nft_rhash_gc(struct work_struct *work) struct nft_set *set; struct nft_rhash_elem *he; struct nft_rhash *priv; - struct nft_set_gc_batch *gcb = NULL; struct rhashtable_iter hti; + struct nft_trans_gc *gc; + struct net *net; + u32 gc_seq; priv = container_of(work, struct nft_rhash, gc_work.work); set = nft_set_container_of(priv); + net = read_pnet(&set->net); + gc_seq = READ_ONCE(net->nft.gc_seq); + + if (nft_set_gc_is_pending(set)) + goto done; + + gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL); + if (!gc) + goto done; + + /* Elements never collected use a zero gc worker sequence number. */ + if (unlikely(++priv->wq_gc_seq == 0)) + priv->wq_gc_seq++; rhashtable_walk_enter(&priv->ht, &hti); rhashtable_walk_start(&hti); while ((he = rhashtable_walk_next(&hti))) { if (IS_ERR(he)) { - if (PTR_ERR(he) != -EAGAIN) - break; - continue; + nft_trans_gc_destroy(gc); + gc = NULL; + goto try_later; + } + + /* Ruleset has been updated, try later. */ + if (READ_ONCE(net->nft.gc_seq) != gc_seq) { + nft_trans_gc_destroy(gc); + gc = NULL; + goto try_later; } + /* rhashtable walk is unstable, already seen in this gc run? + * Then, skip this element. In case of (unlikely) sequence + * wraparound and stale element wq_gc_seq, next gc run will + * just find this expired element. + */ + if (he->wq_gc_seq == priv->wq_gc_seq) + continue; + + if (nft_set_elem_is_dead(&he->ext)) + goto dead_elem; + if (nft_set_ext_exists(&he->ext, NFT_SET_EXT_EXPR)) { struct nft_expr *expr = nft_set_ext_expr(&he->ext); if (expr->ops->gc && expr->ops->gc(read_pnet(&set->net), expr)) - goto gc; + goto needs_gc_run; } + if (!nft_set_elem_expired(&he->ext)) continue; -gc: - if (nft_set_elem_mark_busy(&he->ext)) - continue; - - gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); - if (gcb == NULL) - break; - rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params); - atomic_dec(&set->nelems); - nft_set_gc_batch_add(gcb, he); +needs_gc_run: + nft_set_elem_dead(&he->ext); +dead_elem: + gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); + if (!gc) + goto try_later; + + /* annotate gc sequence for this attempt. */ + he->wq_gc_seq = priv->wq_gc_seq; + nft_trans_gc_elem_add(gc, he); } + +try_later: rhashtable_walk_stop(&hti); rhashtable_walk_exit(&hti); - nft_set_gc_batch_complete(gcb); + if (gc) + nft_trans_gc_queue_async_done(gc); +done: queue_delayed_work(system_power_efficient_wq, &priv->gc_work, nft_set_gc_interval(set)); } @@ -388,7 +425,6 @@ static void nft_rhash_destroy(const struct nft_set *set) struct nft_rhash *priv = nft_set_priv(set); cancel_delayed_work_sync(&priv->gc_work); - rcu_barrier(); rhashtable_free_and_destroy(&priv->ht, nft_rhash_elem_destroy, (void *)set); } diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index f03258d483ef5e59ac5a6ce8268a02ad26ae04cf..895083d93549bfaf8672189a702146bc198adade 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -566,6 +566,8 @@ static struct nft_pipapo_elem *pipapo_get(const struct net *net, goto out; if (last) { + if (nft_set_elem_expired(&f->mt[b].e->ext)) + goto next_match; if ((genmask && !nft_set_elem_active(&f->mt[b].e->ext, genmask))) goto next_match; @@ -600,17 +602,8 @@ static struct nft_pipapo_elem *pipapo_get(const struct net *net, static void *nft_pipapo_get(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, unsigned int flags) { - struct nft_pipapo_elem *ret; - - ret = pipapo_get(net, set, (const u8 *)elem->key.val.data, - nft_genmask_cur(net)); - if (IS_ERR(ret)) - return ret; - - if (nft_set_elem_expired(&ret->ext)) - return ERR_PTR(-ENOENT); - - return ret; + return pipapo_get(net, set, (const u8 *)elem->key.val.data, + nft_genmask_cur(net)); } /** @@ -1536,15 +1529,32 @@ static void pipapo_drop(struct nft_pipapo_match *m, } } +static void nft_pipapo_gc_deactivate(struct net *net, struct nft_set *set, + struct nft_pipapo_elem *e) +{ + struct nft_set_elem elem = { + .priv = e, + }; + + nft_setelem_data_deactivate(net, set, &elem); +} + /** * pipapo_gc() - Drop expired entries from set, destroy start and end elements - * @set: nftables API set representation + * @_set: nftables API set representation * @m: Matching data */ -static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) +static void pipapo_gc(const struct nft_set *_set, struct nft_pipapo_match *m) { + struct nft_set *set = (struct nft_set *) _set; struct nft_pipapo *priv = nft_set_priv(set); + struct net *net = read_pnet(&set->net); int rules_f0, first_rule = 0; + struct nft_trans_gc *gc; + + gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL); + if (!gc) + return; while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; @@ -1569,13 +1579,19 @@ static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) f--; i--; e = f->mt[rulemap[i].to].e; - if (nft_set_elem_expired(&e->ext) && - !nft_set_elem_mark_busy(&e->ext)) { + /* synchronous gc never fails, there is no need to set on + * NFT_SET_ELEM_DEAD_BIT. + */ + if (nft_set_elem_expired(&e->ext)) { priv->dirty = true; - pipapo_drop(m, rulemap); - rcu_barrier(); - nft_set_elem_destroy(set, e, true); + gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); + if (!gc) + return; + + nft_pipapo_gc_deactivate(net, set, e); + pipapo_drop(m, rulemap); + nft_trans_gc_elem_add(gc, e); /* And check again current first rule, which is now the * first we haven't checked. @@ -1585,7 +1601,10 @@ static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) } } - priv->last_gc = jiffies; + if (gc) { + nft_trans_gc_queue_sync_done(gc); + priv->last_gc = jiffies; + } } /** @@ -1603,17 +1622,10 @@ static void pipapo_free_fields(struct nft_pipapo_match *m) } } -/** - * pipapo_reclaim_match - RCU callback to free fields from old matching data - * @rcu: RCU head - */ -static void pipapo_reclaim_match(struct rcu_head *rcu) +static void pipapo_free_match(struct nft_pipapo_match *m) { - struct nft_pipapo_match *m; int i; - m = container_of(rcu, struct nft_pipapo_match, rcu); - for_each_possible_cpu(i) kfree(*per_cpu_ptr(m->scratch, i)); @@ -1628,7 +1640,19 @@ static void pipapo_reclaim_match(struct rcu_head *rcu) } /** - * pipapo_commit() - Replace lookup data with current working copy + * pipapo_reclaim_match - RCU callback to free fields from old matching data + * @rcu: RCU head + */ +static void pipapo_reclaim_match(struct rcu_head *rcu) +{ + struct nft_pipapo_match *m; + + m = container_of(rcu, struct nft_pipapo_match, rcu); + pipapo_free_match(m); +} + +/** + * nft_pipapo_commit() - Replace lookup data with current working copy * @set: nftables API set representation * * While at it, check if we should perform garbage collection on the working @@ -1638,7 +1662,7 @@ static void pipapo_reclaim_match(struct rcu_head *rcu) * We also need to create a new working copy for subsequent insertions and * deletions. */ -static void pipapo_commit(const struct nft_set *set) +static void nft_pipapo_commit(const struct nft_set *set) { struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_match *new_clone, *old; @@ -1663,6 +1687,26 @@ static void pipapo_commit(const struct nft_set *set) priv->clone = new_clone; } +static void nft_pipapo_abort(const struct nft_set *set) +{ + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *new_clone, *m; + + if (!priv->dirty) + return; + + m = rcu_dereference(priv->match); + + new_clone = pipapo_clone(m); + if (IS_ERR(new_clone)) + return; + + priv->dirty = false; + + pipapo_free_match(priv->clone); + priv->clone = new_clone; +} + /** * nft_pipapo_activate() - Mark element reference as active given key, commit * @net: Network namespace @@ -1670,8 +1714,7 @@ static void pipapo_commit(const struct nft_set *set) * @elem: nftables API element representation containing key data * * On insertion, elements are added to a copy of the matching data currently - * in use for lookups, and not directly inserted into current lookup data, so - * we'll take care of that by calling pipapo_commit() here. Both + * in use for lookups, and not directly inserted into current lookup data. Both * nft_pipapo_insert() and nft_pipapo_activate() are called once for each * element, hence we can't purpose either one as a real commit operation. */ @@ -1679,16 +1722,9 @@ static void nft_pipapo_activate(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem) { - struct nft_pipapo_elem *e; - - e = pipapo_get(net, set, (const u8 *)elem->key.val.data, 0); - if (IS_ERR(e)) - return; + struct nft_pipapo_elem *e = elem->priv; nft_set_elem_change_active(net, set, &e->ext); - nft_set_elem_clear_busy(&e->ext); - - pipapo_commit(set); } /** @@ -1714,6 +1750,7 @@ static void *pipapo_deactivate(const struct net *net, const struct nft_set *set, return NULL; nft_set_elem_change_active(net, set, &e->ext); + nft_set_elem_clear_busy(&e->ext); return e; } @@ -1900,10 +1937,6 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, data = (const u8 *)nft_set_ext_key(&e->ext); - e = pipapo_get(net, set, data, 0); - if (IS_ERR(e)) - return; - while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; const u8 *match_start, *match_end; @@ -1938,7 +1971,6 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, if (i == m->field_count) { priv->dirty = true; pipapo_drop(m, rulemap); - pipapo_commit(set); return; } @@ -2173,8 +2205,6 @@ static void nft_pipapo_destroy(const struct nft_set *set) if (m) { rcu_barrier(); - nft_set_pipapo_match_destroy(set, m); - #ifdef NFT_PIPAPO_ALIGN free_percpu(m->scratch_aligned); #endif @@ -2189,8 +2219,7 @@ static void nft_pipapo_destroy(const struct nft_set *set) if (priv->clone) { m = priv->clone; - if (priv->dirty) - nft_set_pipapo_match_destroy(set, m); + nft_set_pipapo_match_destroy(set, m); #ifdef NFT_PIPAPO_ALIGN free_percpu(priv->clone->scratch_aligned); @@ -2238,6 +2267,8 @@ const struct nft_set_type nft_set_pipapo_type = { .init = nft_pipapo_init, .destroy = nft_pipapo_destroy, .gc_init = nft_pipapo_gc_init, + .commit = nft_pipapo_commit, + .abort = nft_pipapo_abort, .elemsize = offsetof(struct nft_pipapo_elem, ext), }, }; @@ -2260,6 +2291,8 @@ const struct nft_set_type nft_set_pipapo_avx2_type = { .init = nft_pipapo_init, .destroy = nft_pipapo_destroy, .gc_init = nft_pipapo_gc_init, + .commit = nft_pipapo_commit, + .abort = nft_pipapo_abort, .elemsize = offsetof(struct nft_pipapo_elem, ext), }, }; diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 499be6672b77d43d0979a9c4ed7c62a919070f71..15fb21720489596d2de9e755f71c239249bc391e 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -38,10 +38,18 @@ static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe) return !nft_rbtree_interval_end(rbe); } -static bool nft_rbtree_equal(const struct nft_set *set, const void *this, - const struct nft_rbtree_elem *interval) +static int nft_rbtree_cmp(const struct nft_set *set, + const struct nft_rbtree_elem *e1, + const struct nft_rbtree_elem *e2) { - return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0; + return memcmp(nft_set_ext_key(&e1->ext), nft_set_ext_key(&e2->ext), + set->klen); +} + +static bool nft_rbtree_elem_expired(const struct nft_rbtree_elem *rbe) +{ + return nft_set_elem_expired(&rbe->ext) || + nft_set_elem_is_dead(&rbe->ext); } static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set, @@ -52,7 +60,6 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set const struct nft_rbtree_elem *rbe, *interval = NULL; u8 genmask = nft_genmask_cur(net); const struct rb_node *parent; - const void *this; int d; parent = rcu_dereference_raw(priv->root.rb_node); @@ -62,12 +69,11 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set rbe = rb_entry(parent, struct nft_rbtree_elem, node); - this = nft_set_ext_key(&rbe->ext); - d = memcmp(this, key, set->klen); + d = memcmp(nft_set_ext_key(&rbe->ext), key, set->klen); if (d < 0) { parent = rcu_dereference_raw(parent->rb_left); if (interval && - nft_rbtree_equal(set, this, interval) && + !nft_rbtree_cmp(set, rbe, interval) && nft_rbtree_interval_end(rbe) && nft_rbtree_interval_start(interval)) continue; @@ -80,7 +86,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set continue; } - if (nft_set_elem_expired(&rbe->ext)) + if (nft_rbtree_elem_expired(rbe)) return false; if (nft_rbtree_interval_end(rbe)) { @@ -98,7 +104,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set if (set->flags & NFT_SET_INTERVAL && interval != NULL && nft_set_elem_active(&interval->ext, genmask) && - !nft_set_elem_expired(&interval->ext) && + !nft_rbtree_elem_expired(interval) && nft_rbtree_interval_start(interval)) { *ext = &interval->ext; return true; @@ -214,154 +220,263 @@ static void *nft_rbtree_get(const struct net *net, const struct nft_set *set, return rbe; } +static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set, + struct nft_rbtree *priv, + struct nft_rbtree_elem *rbe) +{ + struct nft_set_elem elem = { + .priv = rbe, + }; + + nft_setelem_data_deactivate(net, set, &elem); + rb_erase(&rbe->node, &priv->root); +} + +static const struct nft_rbtree_elem * +nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv, + struct nft_rbtree_elem *rbe) +{ + struct nft_set *set = (struct nft_set *)__set; + struct rb_node *prev = rb_prev(&rbe->node); + struct net *net = read_pnet(&set->net); + struct nft_rbtree_elem *rbe_prev; + struct nft_trans_gc *gc; + + gc = nft_trans_gc_alloc(set, 0, GFP_ATOMIC); + if (!gc) + return ERR_PTR(-ENOMEM); + + /* search for end interval coming before this element. + * end intervals don't carry a timeout extension, they + * are coupled with the interval start element. + */ + while (prev) { + rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); + if (nft_rbtree_interval_end(rbe_prev) && + nft_set_elem_active(&rbe_prev->ext, NFT_GENMASK_ANY)) + break; + + prev = rb_prev(prev); + } + + rbe_prev = NULL; + if (prev) { + rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); + nft_rbtree_gc_remove(net, set, priv, rbe_prev); + + /* There is always room in this trans gc for this element, + * memory allocation never actually happens, hence, the warning + * splat in such case. No need to set NFT_SET_ELEM_DEAD_BIT, + * this is synchronous gc which never fails. + */ + gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); + if (WARN_ON_ONCE(!gc)) + return ERR_PTR(-ENOMEM); + + nft_trans_gc_elem_add(gc, rbe_prev); + } + + nft_rbtree_gc_remove(net, set, priv, rbe); + gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); + if (WARN_ON_ONCE(!gc)) + return ERR_PTR(-ENOMEM); + + nft_trans_gc_elem_add(gc, rbe); + + nft_trans_gc_queue_sync_done(gc); + + return rbe_prev; +} + +static bool nft_rbtree_update_first(const struct nft_set *set, + struct nft_rbtree_elem *rbe, + struct rb_node *first) +{ + struct nft_rbtree_elem *first_elem; + + first_elem = rb_entry(first, struct nft_rbtree_elem, node); + /* this element is closest to where the new element is to be inserted: + * update the first element for the node list path. + */ + if (nft_rbtree_cmp(set, rbe, first_elem) < 0) + return true; + + return false; +} + static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_rbtree_elem *new, struct nft_set_ext **ext) { - bool overlap = false, dup_end_left = false, dup_end_right = false; + struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL; + struct rb_node *node, *next, *parent, **p, *first = NULL; struct nft_rbtree *priv = nft_set_priv(set); + u8 cur_genmask = nft_genmask_cur(net); u8 genmask = nft_genmask_next(net); - struct nft_rbtree_elem *rbe; - struct rb_node *parent, **p; int d; - /* Detect overlaps as we descend the tree. Set the flag in these cases: - * - * a1. _ _ __>| ?_ _ __| (insert end before existing end) - * a2. _ _ ___| ?_ _ _>| (insert end after existing end) - * a3. _ _ ___? >|_ _ __| (insert start before existing end) - * - * and clear it later on, as we eventually reach the points indicated by - * '?' above, in the cases described below. We'll always meet these - * later, locally, due to tree ordering, and overlaps for the intervals - * that are the closest together are always evaluated last. - * - * b1. _ _ __>| !_ _ __| (insert end before existing start) - * b2. _ _ ___| !_ _ _>| (insert end after existing start) - * b3. _ _ ___! >|_ _ __| (insert start after existing end, as a leaf) - * '--' no nodes falling in this range - * b4. >|_ _ ! (insert start before existing start) - * - * Case a3. resolves to b3.: - * - if the inserted start element is the leftmost, because the '0' - * element in the tree serves as end element - * - otherwise, if an existing end is found immediately to the left. If - * there are existing nodes in between, we need to further descend the - * tree before we can conclude the new start isn't causing an overlap - * - * or to b4., which, preceded by a3., means we already traversed one or - * more existing intervals entirely, from the right. - * - * For a new, rightmost pair of elements, we'll hit cases b3. and b2., - * in that order. - * - * The flag is also cleared in two special cases: - * - * b5. |__ _ _!|<_ _ _ (insert start right before existing end) - * b6. |__ _ >|!__ _ _ (insert end right after existing start) - * - * which always happen as last step and imply that no further - * overlapping is possible. - * - * Another special case comes from the fact that start elements matching - * an already existing start element are allowed: insertion is not - * performed but we return -EEXIST in that case, and the error will be - * cleared by the caller if NLM_F_EXCL is not present in the request. - * This way, request for insertion of an exact overlap isn't reported as - * error to userspace if not desired. - * - * However, if the existing start matches a pre-existing start, but the - * end element doesn't match the corresponding pre-existing end element, - * we need to report a partial overlap. This is a local condition that - * can be noticed without need for a tracking flag, by checking for a - * local duplicated end for a corresponding start, from left and right, - * separately. + /* Descend the tree to search for an existing element greater than the + * key value to insert that is greater than the new element. This is the + * first element to walk the ordered elements to find possible overlap. */ - parent = NULL; p = &priv->root.rb_node; while (*p != NULL) { parent = *p; rbe = rb_entry(parent, struct nft_rbtree_elem, node); - d = memcmp(nft_set_ext_key(&rbe->ext), - nft_set_ext_key(&new->ext), - set->klen); + d = nft_rbtree_cmp(set, rbe, new); + if (d < 0) { p = &parent->rb_left; - - if (nft_rbtree_interval_start(new)) { - if (nft_rbtree_interval_end(rbe) && - nft_set_elem_active(&rbe->ext, genmask) && - !nft_set_elem_expired(&rbe->ext) && !*p) - overlap = false; - } else { - if (dup_end_left && !*p) - return -ENOTEMPTY; - - overlap = nft_rbtree_interval_end(rbe) && - nft_set_elem_active(&rbe->ext, - genmask) && - !nft_set_elem_expired(&rbe->ext); - - if (overlap) { - dup_end_right = true; - continue; - } - } } else if (d > 0) { - p = &parent->rb_right; + if (!first || + nft_rbtree_update_first(set, rbe, first)) + first = &rbe->node; - if (nft_rbtree_interval_end(new)) { - if (dup_end_right && !*p) - return -ENOTEMPTY; - - overlap = nft_rbtree_interval_end(rbe) && - nft_set_elem_active(&rbe->ext, - genmask) && - !nft_set_elem_expired(&rbe->ext); - - if (overlap) { - dup_end_left = true; - continue; - } - } else if (nft_set_elem_active(&rbe->ext, genmask) && - !nft_set_elem_expired(&rbe->ext)) { - overlap = nft_rbtree_interval_end(rbe); - } + p = &parent->rb_right; } else { - if (nft_rbtree_interval_end(rbe) && - nft_rbtree_interval_start(new)) { + if (nft_rbtree_interval_end(rbe)) p = &parent->rb_left; - - if (nft_set_elem_active(&rbe->ext, genmask) && - !nft_set_elem_expired(&rbe->ext)) - overlap = false; - } else if (nft_rbtree_interval_start(rbe) && - nft_rbtree_interval_end(new)) { + else p = &parent->rb_right; + } + } + + if (!first) + first = rb_first(&priv->root); + + /* Detect overlap by going through the list of valid tree nodes. + * Values stored in the tree are in reversed order, starting from + * highest to lowest value. + */ + for (node = first; node != NULL; node = next) { + next = rb_next(node); + + rbe = rb_entry(node, struct nft_rbtree_elem, node); + + if (!nft_set_elem_active(&rbe->ext, genmask)) + continue; + + /* perform garbage collection to avoid bogus overlap reports + * but skip new elements in this transaction. + */ + if (nft_set_elem_expired(&rbe->ext) && + nft_set_elem_active(&rbe->ext, cur_genmask)) { + const struct nft_rbtree_elem *removed_end; + + removed_end = nft_rbtree_gc_elem(set, priv, rbe); + if (IS_ERR(removed_end)) + return PTR_ERR(removed_end); + + if (removed_end == rbe_le || removed_end == rbe_ge) + return -EAGAIN; + + continue; + } + + d = nft_rbtree_cmp(set, rbe, new); + if (d == 0) { + /* Matching end element: no need to look for an + * overlapping greater or equal element. + */ + if (nft_rbtree_interval_end(rbe)) { + rbe_le = rbe; + break; + } + + /* first element that is greater or equal to key value. */ + if (!rbe_ge) { + rbe_ge = rbe; + continue; + } - if (nft_set_elem_active(&rbe->ext, genmask) && - !nft_set_elem_expired(&rbe->ext)) - overlap = false; - } else if (nft_set_elem_active(&rbe->ext, genmask) && - !nft_set_elem_expired(&rbe->ext)) { - *ext = &rbe->ext; - return -EEXIST; - } else { - overlap = false; - if (nft_rbtree_interval_end(rbe)) - p = &parent->rb_left; - else - p = &parent->rb_right; + /* this is a closer more or equal element, update it. */ + if (nft_rbtree_cmp(set, rbe_ge, new) != 0) { + rbe_ge = rbe; + continue; + } + + /* element is equal to key value, make sure flags are + * the same, an existing more or equal start element + * must not be replaced by more or equal end element. + */ + if ((nft_rbtree_interval_start(new) && + nft_rbtree_interval_start(rbe_ge)) || + (nft_rbtree_interval_end(new) && + nft_rbtree_interval_end(rbe_ge))) { + rbe_ge = rbe; + continue; } + } else if (d > 0) { + /* annotate element greater than the new element. */ + rbe_ge = rbe; + continue; + } else if (d < 0) { + /* annotate element less than the new element. */ + rbe_le = rbe; + break; } + } - dup_end_left = dup_end_right = false; + /* - new start element matching existing start element: full overlap + * reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given. + */ + if (rbe_ge && !nft_rbtree_cmp(set, new, rbe_ge) && + nft_rbtree_interval_start(rbe_ge) == nft_rbtree_interval_start(new)) { + *ext = &rbe_ge->ext; + return -EEXIST; + } + + /* - new end element matching existing end element: full overlap + * reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given. + */ + if (rbe_le && !nft_rbtree_cmp(set, new, rbe_le) && + nft_rbtree_interval_end(rbe_le) == nft_rbtree_interval_end(new)) { + *ext = &rbe_le->ext; + return -EEXIST; } - if (overlap) + /* - new start element with existing closest, less or equal key value + * being a start element: partial overlap, reported as -ENOTEMPTY. + * Anonymous sets allow for two consecutive start element since they + * are constant, skip them to avoid bogus overlap reports. + */ + if (!nft_set_is_anonymous(set) && rbe_le && + nft_rbtree_interval_start(rbe_le) && nft_rbtree_interval_start(new)) + return -ENOTEMPTY; + + /* - new end element with existing closest, less or equal key value + * being a end element: partial overlap, reported as -ENOTEMPTY. + */ + if (rbe_le && + nft_rbtree_interval_end(rbe_le) && nft_rbtree_interval_end(new)) return -ENOTEMPTY; + /* - new end element with existing closest, greater or equal key value + * being an end element: partial overlap, reported as -ENOTEMPTY + */ + if (rbe_ge && + nft_rbtree_interval_end(rbe_ge) && nft_rbtree_interval_end(new)) + return -ENOTEMPTY; + + /* Accepted element: pick insertion point depending on key value */ + parent = NULL; + p = &priv->root.rb_node; + while (*p != NULL) { + parent = *p; + rbe = rb_entry(parent, struct nft_rbtree_elem, node); + d = nft_rbtree_cmp(set, rbe, new); + + if (d < 0) + p = &parent->rb_left; + else if (d > 0) + p = &parent->rb_right; + else if (nft_rbtree_interval_end(rbe)) + p = &parent->rb_left; + else + p = &parent->rb_right; + } + rb_link_node_rcu(&new->node, parent, p); rb_insert_color(&new->node, &priv->root); return 0; @@ -375,11 +490,18 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_rbtree_elem *rbe = elem->priv; int err; - write_lock_bh(&priv->lock); - write_seqcount_begin(&priv->count); - err = __nft_rbtree_insert(net, set, rbe, ext); - write_seqcount_end(&priv->count); - write_unlock_bh(&priv->lock); + do { + if (fatal_signal_pending(current)) + return -EINTR; + + cond_resched(); + + write_lock_bh(&priv->lock); + write_seqcount_begin(&priv->count); + err = __nft_rbtree_insert(net, set, rbe, ext); + write_seqcount_end(&priv->count); + write_unlock_bh(&priv->lock); + } while (err == -EAGAIN); return err; } @@ -405,7 +527,6 @@ static void nft_rbtree_activate(const struct net *net, struct nft_rbtree_elem *rbe = elem->priv; nft_set_elem_change_active(net, set, &rbe->ext); - nft_set_elem_clear_busy(&rbe->ext); } static bool nft_rbtree_flush(const struct net *net, @@ -413,12 +534,9 @@ static bool nft_rbtree_flush(const struct net *net, { struct nft_rbtree_elem *rbe = priv; - if (!nft_set_elem_mark_busy(&rbe->ext) || - !nft_is_active(net, &rbe->ext)) { - nft_set_elem_change_active(net, set, &rbe->ext); - return true; - } - return false; + nft_set_elem_change_active(net, set, &rbe->ext); + + return true; } static void *nft_rbtree_deactivate(const struct net *net, @@ -493,58 +611,78 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, static void nft_rbtree_gc(struct work_struct *work) { - struct nft_rbtree_elem *rbe, *rbe_end = NULL, *rbe_prev = NULL; - struct nft_set_gc_batch *gcb = NULL; + struct nft_rbtree_elem *rbe, *rbe_end = NULL; struct nft_rbtree *priv; + struct nft_trans_gc *gc; struct rb_node *node; struct nft_set *set; + unsigned int gc_seq; + struct net *net; priv = container_of(work, struct nft_rbtree, gc_work.work); set = nft_set_container_of(priv); + net = read_pnet(&set->net); + gc_seq = READ_ONCE(net->nft.gc_seq); - write_lock_bh(&priv->lock); - write_seqcount_begin(&priv->count); + if (nft_set_gc_is_pending(set)) + goto done; + + gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL); + if (!gc) + goto done; + + read_lock_bh(&priv->lock); for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { + + /* Ruleset has been updated, try later. */ + if (READ_ONCE(net->nft.gc_seq) != gc_seq) { + nft_trans_gc_destroy(gc); + gc = NULL; + goto try_later; + } + rbe = rb_entry(node, struct nft_rbtree_elem, node); + if (nft_set_elem_is_dead(&rbe->ext)) + goto dead_elem; + + /* elements are reversed in the rbtree for historical reasons, + * from highest to lowest value, that is why end element is + * always visited before the start element. + */ if (nft_rbtree_interval_end(rbe)) { rbe_end = rbe; continue; } if (!nft_set_elem_expired(&rbe->ext)) continue; - if (nft_set_elem_mark_busy(&rbe->ext)) + + nft_set_elem_dead(&rbe->ext); + + if (!rbe_end) continue; - if (rbe_prev) { - rb_erase(&rbe_prev->node, &priv->root); - rbe_prev = NULL; - } - gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); - if (!gcb) - break; + nft_set_elem_dead(&rbe_end->ext); - atomic_dec(&set->nelems); - nft_set_gc_batch_add(gcb, rbe); - rbe_prev = rbe; + gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); + if (!gc) + goto try_later; - if (rbe_end) { - atomic_dec(&set->nelems); - nft_set_gc_batch_add(gcb, rbe_end); - rb_erase(&rbe_end->node, &priv->root); - rbe_end = NULL; - } - node = rb_next(node); - if (!node) - break; - } - if (rbe_prev) - rb_erase(&rbe_prev->node, &priv->root); - write_seqcount_end(&priv->count); - write_unlock_bh(&priv->lock); + nft_trans_gc_elem_add(gc, rbe_end); + rbe_end = NULL; +dead_elem: + gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); + if (!gc) + goto try_later; - nft_set_gc_batch_complete(gcb); + nft_trans_gc_elem_add(gc, rbe); + } +try_later: + read_unlock_bh(&priv->lock); + if (gc) + nft_trans_gc_queue_async_done(gc); +done: queue_delayed_work(system_power_efficient_wq, &priv->gc_work, nft_set_gc_interval(set)); }