diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 766f7ca57e3c5430ba6d334a812219f5e8daf064..c3c89b6cc42f782f41d5e4558a282ffd3dc37812 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1181,6 +1181,7 @@ static inline struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, gfp_t gfp_mask); struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom); +struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom); struct sk_buff *skb_copy_expand(const struct sk_buff *skb, int newheadroom, int newtailroom, gfp_t priority); int __must_check skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg, diff --git a/include/net/tcp.h b/include/net/tcp.h index b69f1790b45e5b6aae4a3e3390afa8b95eec653a..bb7422b87ceb07fc99282af33f395378769124b9 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -331,7 +332,6 @@ int tcp_send_mss(struct sock *sk, int *size_goal, int flags); void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle, int size_goal); void tcp_release_cb(struct sock *sk); -void tcp_wfree(struct sk_buff *skb); void tcp_write_timer_handler(struct sock *sk); void tcp_delack_timer_handler(struct sock *sk); int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg); diff --git a/include/net/tcp_ext.h b/include/net/tcp_ext.h new file mode 100644 index 0000000000000000000000000000000000000000..733534808c4c14ee0d040804e3b1f20e2fd0369d --- /dev/null +++ b/include/net/tcp_ext.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _TCP_EXT_H +#define _TCP_EXT_H + +void tcp_wfree(struct sk_buff *skb); + +static inline bool is_skb_wmem(const struct sk_buff *skb) +{ + return skb->destructor == sock_wfree || + skb->destructor == __sock_wfree || + (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree); +} +#endif /* _TCP_EXT_H */ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 85c9bbedcb3385594088f1feaf5c58b2d563f3c1..b290db716392b973cc874bd4a65486dd0a5c2ea7 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -70,6 +70,7 @@ #include #include #include +#include #include #include @@ -1773,6 +1774,57 @@ int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) return 0; } +/** + * skb_expand_head - reallocate header of &sk_buff + * @skb: buffer to reallocate + * @headroom: needed headroom + * + * Unlike skb_realloc_headroom, this one does not allocate a new skb + * if possible; copies skb->sk to new skb as needed + * and frees original skb in case of failures. + * + * It expect increased headroom and generates warning otherwise. + */ + +struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom) +{ + int delta = headroom - skb_headroom(skb); + int osize = skb_end_offset(skb); + struct sock *sk = skb->sk; + + if (WARN_ONCE(delta <= 0, + "%s is expecting an increase in the headroom", __func__)) + return skb; + + delta = SKB_DATA_ALIGN(delta); + /* pskb_expand_head() might crash, if skb is shared. */ + if (skb_shared(skb) || !is_skb_wmem(skb)) { + struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); + + if (unlikely(!nskb)) + goto fail; + + if (sk) + skb_set_owner_w(nskb, sk); + consume_skb(skb); + skb = nskb; + } + if (pskb_expand_head(skb, delta, 0, GFP_ATOMIC)) + goto fail; + + if (sk && is_skb_wmem(skb)) { + delta = skb_end_offset(skb) - osize; + refcount_add(delta, &sk->sk_wmem_alloc); + skb->truesize += delta; + } + return skb; + +fail: + kfree_skb(skb); + return NULL; +} +EXPORT_SYMBOL(skb_expand_head); + /** * skb_copy_expand - copy and expand sk_buff * @skb: buffer to copy diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index e0e8a65d561ec6553970af7ac8948db515f774dc..9f315f74df4a0307a55314a9c35288b4dfabf2c1 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -24,6 +24,8 @@ #include #include +#include + /* Use skb->cb to track consecutive/adjacent fragments coming at * the end of the queue. Nodes in the rb-tree queue will * contain "runs" of one or more adjacent fragments. @@ -39,6 +41,7 @@ struct ipfrag_skb_cb { }; struct sk_buff *next_frag; int frag_run_len; + int ip_defrag_offset; }; #define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) @@ -359,12 +362,12 @@ int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb, */ if (!last) fragrun_create(q, skb); /* First fragment. */ - else if (last->ip_defrag_offset + last->len < end) { + else if (FRAG_CB(last)->ip_defrag_offset + last->len < end) { /* This is the common case: skb goes to the end. */ /* Detect and discard overlaps. */ - if (offset < last->ip_defrag_offset + last->len) + if (offset < FRAG_CB(last)->ip_defrag_offset + last->len) return IPFRAG_OVERLAP; - if (offset == last->ip_defrag_offset + last->len) + if (offset == FRAG_CB(last)->ip_defrag_offset + last->len) fragrun_append_to_last(q, skb); else fragrun_create(q, skb); @@ -381,13 +384,13 @@ int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb, parent = *rbn; curr = rb_to_skb(parent); - curr_run_end = curr->ip_defrag_offset + + curr_run_end = FRAG_CB(curr)->ip_defrag_offset + FRAG_CB(curr)->frag_run_len; - if (end <= curr->ip_defrag_offset) + if (end <= FRAG_CB(curr)->ip_defrag_offset) rbn = &parent->rb_left; else if (offset >= curr_run_end) rbn = &parent->rb_right; - else if (offset >= curr->ip_defrag_offset && + else if (offset >= FRAG_CB(curr)->ip_defrag_offset && end <= curr_run_end) return IPFRAG_DUP; else @@ -401,7 +404,7 @@ int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb, rb_insert_color(&skb->rbnode, &q->rb_fragments); } - skb->ip_defrag_offset = offset; + FRAG_CB(skb)->ip_defrag_offset = offset; return IPFRAG_OK; } @@ -411,13 +414,28 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, struct sk_buff *parent) { struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments); - struct sk_buff **nextp; + void (*destructor)(struct sk_buff *); + unsigned int orig_truesize = 0; + struct sk_buff **nextp = NULL; + struct sock *sk = skb->sk; int delta; + if (sk && is_skb_wmem(skb)) { + /* TX: skb->sk might have been passed as argument to + * dst->output and must remain valid until tx completes. + * + * Move sk to reassembled skb and fix up wmem accounting. + */ + orig_truesize = skb->truesize; + destructor = skb->destructor; + } + if (head != skb) { fp = skb_clone(skb, GFP_ATOMIC); - if (!fp) - return NULL; + if (!fp) { + head = skb; + goto out_restore_sk; + } FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag; if (RB_EMPTY_NODE(&skb->rbnode)) FRAG_CB(parent)->next_frag = fp; @@ -426,6 +444,12 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, &q->rb_fragments); if (q->fragments_tail == skb) q->fragments_tail = fp; + + if (orig_truesize) { + /* prevent skb_morph from releasing sk */ + skb->sk = NULL; + skb->destructor = NULL; + } skb_morph(skb, head); FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag; rb_replace_node(&head->rbnode, &skb->rbnode, @@ -433,13 +457,13 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, consume_skb(head); head = skb; } - WARN_ON(head->ip_defrag_offset != 0); + WARN_ON(FRAG_CB(head)->ip_defrag_offset != 0); delta = -head->truesize; /* Head of list must not be cloned. */ if (skb_unclone(head, GFP_ATOMIC)) - return NULL; + goto out_restore_sk; delta += head->truesize; if (delta) @@ -455,7 +479,7 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, clone = alloc_skb(0, GFP_ATOMIC); if (!clone) - return NULL; + goto out_restore_sk; skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; skb_frag_list_init(head); for (i = 0; i < skb_shinfo(head)->nr_frags; i++) @@ -472,6 +496,21 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, nextp = &skb_shinfo(head)->frag_list; } +out_restore_sk: + if (orig_truesize) { + int ts_delta = head->truesize - orig_truesize; + + /* if this reassembled skb is fragmented later, + * fraglist skbs will get skb->sk assigned from head->sk, + * and each frag skb will be released via sock_wfree. + * + * Update sk_wmem_alloc. + */ + head->sk = sk; + head->destructor = destructor; + refcount_add(ts_delta, &sk->sk_wmem_alloc); + } + return nextp; } EXPORT_SYMBOL(inet_frag_reasm_prepare); @@ -479,6 +518,8 @@ EXPORT_SYMBOL(inet_frag_reasm_prepare); void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, void *reasm_data, bool try_coalesce) { + struct sock *sk = is_skb_wmem(head) ? head->sk : NULL; + const unsigned int head_truesize = head->truesize; struct sk_buff **nextp = (struct sk_buff **)reasm_data; struct rb_node *rbn; struct sk_buff *fp; @@ -541,6 +582,9 @@ void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, skb_mark_not_on_list(head); head->prev = NULL; head->tstamp = q->stamp; + + if (sk) + refcount_add(sum_truesize - head_truesize, &sk->sk_wmem_alloc); } EXPORT_SYMBOL(inet_frag_reasm_finish); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index fad803d2d711ef0d97f7150f0f710a35ac822946..ec2264adf2a6a5b6f8e935bcbeb459b55642d7ae 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -377,6 +377,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) } skb_dst_drop(skb); + skb_orphan(skb); return -EINPROGRESS; insert_error: @@ -479,7 +480,6 @@ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user) struct ipq *qp; __IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS); - skb_orphan(skb); /* Lookup (or create) queue header */ qp = ip_find(net, ip_hdr(skb), user, vif); diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index c129ad334eb392594e1c258b31e52a9d03d7724f..8c2163f95711ce61ce69c4242fd57d722f8230b6 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -296,6 +296,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, } skb_dst_drop(skb); + skb_orphan(skb); return -EINPROGRESS; insert_error: @@ -471,7 +472,6 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) hdr = ipv6_hdr(skb); fhdr = (struct frag_hdr *)skb_transport_header(skb); - skb_orphan(skb); fq = fq_find(net, fhdr->identification, user, hdr, skb->dev ? skb->dev->ifindex : 0); if (fq == NULL) {