From b610752613e850114ee5689d30e7d2b73465fa0f Mon Sep 17 00:00:00 2001
From: shiyunfei
Date: Tue, 31 Jan 2023 10:14:01 +0800
Subject: [PATCH] NewIP: convert the out-of-order queue from a linked list to
 a red-black tree; fix non-local packets received over a Bluetooth link being
 forwarded back out through the default route
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: shiyunfei
---
 src/linux/include/net/nip_fib.h     |   2 +-
 src/linux/include/net/nip_route.h   |   4 +-
 src/linux/net/newip/nip_fib_rules.c |  11 +-
 src/linux/net/newip/nip_input.c     |  14 +-
 src/linux/net/newip/route.c         |  34 ++-
 src/linux/net/newip/tcp_nip.c       |  12 +-
 src/linux/net/newip/tcp_nip_input.c | 403 ++++++++++++++++++++++------
 7 files changed, 370 insertions(+), 110 deletions(-)

diff --git a/src/linux/include/net/nip_fib.h b/src/linux/include/net/nip_fib.h
index f2ce0b4..150e131 100644
--- a/src/linux/include/net/nip_fib.h
+++ b/src/linux/include/net/nip_fib.h
@@ -92,7 +92,7 @@ typedef struct nip_rt_info *(*nip_pol_lookup_t) (struct net *,
 struct nip_fib_table *nip_fib_get_table(struct net *net, u32 id);
 
 struct dst_entry *nip_fib_rule_lookup(struct net *net, struct flow_nip *fln,
-				      int flags, nip_pol_lookup_t lookup);
+				      int flags, int *tbl_type, nip_pol_lookup_t lookup);
 
 #define NIP_RT_EXPIRES_FLAGS 12
 static inline void nip_rt_set_expires(struct nip_rt_info *rt,
diff --git a/src/linux/include/net/nip_route.h b/src/linux/include/net/nip_route.h
index 6fa91ab..870c093 100644
--- a/src/linux/include/net/nip_route.h
+++ b/src/linux/include/net/nip_route.h
@@ -17,10 +17,10 @@ struct nip_rt_info *nip_addrconf_dst_alloc(struct ninet_dev *idev,
 					   const struct nip_addr *addr);
 
-void nip_route_input(struct sk_buff *skb);
+int nip_route_input(struct sk_buff *skb);
 
 struct dst_entry *nip_route_input_lookup(struct net *net,
 					 struct net_device *dev,
-					 struct flow_nip *fln, int flags);
+					 struct flow_nip *fln, int flags, int *tbl_type);
 
 struct dst_entry *nip_route_output_flags(struct net *net, const struct sock *sk,
 					 struct flow_nip *fln, int flags);
diff --git a/src/linux/net/newip/nip_fib_rules.c b/src/linux/net/newip/nip_fib_rules.c
index 7ae1e6a..03fcd5c 100644
--- a/src/linux/net/newip/nip_fib_rules.c
+++ b/src/linux/net/newip/nip_fib_rules.c
@@ -16,19 +16,24 @@
 #include "tcp_nip_parameter.h"
 
 struct dst_entry *nip_fib_rule_lookup(struct net *net, struct flow_nip *fln,
-				      int flags, nip_pol_lookup_t lookup)
+				      int flags, int *tbl_type, nip_pol_lookup_t lookup)
 {
	struct nip_rt_info *rt;
 
	rt = lookup(net, net->newip.nip_fib_local_tbl, fln, flags);
-	if (rt != net->newip.nip_null_entry)
+	if (rt != net->newip.nip_null_entry) {
+		*tbl_type = (int)RT_TABLE_LOCAL;
 		return &rt->dst;
+	}
	nip_rt_put(rt);
	rt = lookup(net, net->newip.nip_fib_main_tbl, fln, flags);
-	if (rt != net->newip.nip_null_entry)
+	if (rt != net->newip.nip_null_entry) {
+		*tbl_type = (int)RT_TABLE_MAIN;
 		return &rt->dst;
+	}
	nip_rt_put(rt);
 
	dst_hold(&net->newip.nip_null_entry->dst);
+	*tbl_type = (int)RT_TABLE_MAX;
	return &net->newip.nip_null_entry->dst;
 }
diff --git a/src/linux/net/newip/nip_input.c b/src/linux/net/newip/nip_input.c
index cacfd09..9941b40 100644
--- a/src/linux/net/newip/nip_input.c
+++ b/src/linux/net/newip/nip_input.c
@@ -38,7 +38,10 @@ static int _nip_update_recv_skb_len(struct sk_buff *skb,
 		return NET_RX_DROP;
 	}
 
-	skb->len = niph->total_len;
+	/* NewIP currently uses only the linear region, so skb_trim() is enough
+	 * to remove data from the end of the buffer; if the nonlinear region
+	 * is used later, switch to pskb_trim().
+	 */
+	skb_trim(skb, niph->total_len);
 	return 0;
 }
 
@@ -46,6 +49,7 @@ static int nip_rcv_finish(struct sk_buff *skb)
 {
	struct net *net = dev_net(skb->dev);
	void (*edemux)(struct sk_buff *skb) = NULL;
+	int err = 0;
 
	/* set /proc/sys/net/ipv4/ip_early_demux to change sysctl_ip_early_demux,
	 * which is used by ipv4, ipv6 and newip
@@ -65,8 +69,12 @@ static int nip_rcv_finish(struct sk_buff *skb)
	 * instead of NULL in skb when looking up failed.
	 */
	if (!skb_valid_dst(skb))
-		nip_route_input(skb);
-
+		err = nip_route_input(skb);
+	if (err) {
+		nip_dbg("nip_route_input lookup failed, drop skb");
+		kfree_skb(skb);
+		return 0;
+	}
	return dst_input(skb);
 }
 
diff --git a/src/linux/net/newip/route.c b/src/linux/net/newip/route.c
index 7e517d1..3430e31 100644
--- a/src/linux/net/newip/route.c
+++ b/src/linux/net/newip/route.c
@@ -33,6 +33,7 @@
 #include
 #include	/* copy_from_user() */
 #include	/* rtnl_lock() */
+#include
 #include
 #include
@@ -252,12 +253,12 @@ static struct nip_rt_info *nip_pol_route_input(struct net *net,
 
 struct dst_entry *nip_route_input_lookup(struct net *net,
					 struct net_device *dev,
-					 struct flow_nip *fln, int flags)
+					 struct flow_nip *fln, int flags, int *tbl_type)
 {
-	return nip_fib_rule_lookup(net, fln, flags, nip_pol_route_input);
+	return nip_fib_rule_lookup(net, fln, flags, tbl_type, nip_pol_route_input);
 }
 
-void nip_route_input(struct sk_buff *skb)
+int nip_route_input(struct sk_buff *skb)
 {
	struct net *net = dev_net(skb->dev);
	int flags = 0;
@@ -266,16 +267,36 @@ void nip_route_input(struct sk_buff *skb)
 		.daddr = NIPCB(skb)->dstaddr,
 		.saddr = NIPCB(skb)->srcaddr,
 	};
+	struct dst_entry *out_dst;
+	int tbl_type = 0;
 
	if (nip_addr_eq(&fln.daddr, &nip_broadcast_addr_arp)) {
 		nip_dbg("recv broadcast packet");
 		dst_hold(&net->newip.nip_broadcast_entry->dst);
 		skb_dst_set(skb, (struct dst_entry *)net->newip.nip_broadcast_entry);
-		return;
+		return 0;
 	}
 
-	skb_dst_set(skb, nip_route_input_lookup(net, skb->dev, &fln, flags));
+	out_dst = nip_route_input_lookup(net, skb->dev, &fln, flags, &tbl_type);
+	skb_dst_set(skb, out_dst);
+
+	if (tbl_type == RT_TABLE_MAIN) {
+		struct ninet_dev *nin_dev = rcu_dereference(skb->dev->nip_ptr);
+		struct ninet_dev *nout_dev = rcu_dereference(out_dst->dev->nip_ptr);
+
+		/* IN_DEV_TX_REDIRECTS() holds when the IPv4 global
+		 * all/send_redirects sysctl or the corresponding
+		 * per-device send_redirects sysctl is 1;
+		 * send_redirects defaults to 1.
+		 */
+		if (nin_dev == nout_dev &&
+		    IN_DEV_TX_REDIRECTS(rcu_dereference(out_dst->dev->ip_ptr))) {
+			nip_dbg("ingress and egress devices are the same");
+			return 1;
+		}
+	}
+	return 0;
 }
 
 static struct nip_rt_info *nip_pol_route_output(struct net *net,
@@ -290,8 +311,9 @@ struct dst_entry *nip_route_output_flags(struct net *net, const struct sock *sk,
 {
	struct dst_entry *dst;
	struct nip_rt_info *rt;
+	int tbl_type = 0;
 
-	dst = nip_fib_rule_lookup(net, fln, flags, nip_pol_route_output);
+	dst = nip_fib_rule_lookup(net, fln, flags, &tbl_type, nip_pol_route_output);
	rt = (struct nip_rt_info *)dst;
 
	if (!(rt->rt_flags & RTF_LOCAL))
diff --git a/src/linux/net/newip/tcp_nip.c b/src/linux/net/newip/tcp_nip.c
index 2b458e3..5dc18af 100644
--- a/src/linux/net/newip/tcp_nip.c
+++ b/src/linux/net/newip/tcp_nip.c
@@ -575,7 +575,7 @@ static struct sock *tcp_nip_syn_recv_sock(const struct sock *sk, struct sk_buff
 
	/* Negotiate MSS */
	newtp->mss_cache = TCP_BASE_MSS;
-	newtp->nip_out_of_order_queue = NULL;
+	newtp->out_of_order_queue = RB_ROOT;
	newtp->advmss = dst_metric_advmss(dst);
	if (tcp_sk(sk)->rx_opt.user_mss &&
	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
@@ -1260,15 +1260,11 @@ out:
	return err;
 }
 
-void skb_nip_ofo_queue_purge(struct sock *sk)
+static void skb_nip_rbtree_purge(struct sock *sk)
 {
	struct tcp_sock *tp = tcp_sk(sk);
-	struct sk_buff *skb;
 
-	while ((skb = tp->nip_out_of_order_queue) != NULL) {
-		tp->nip_out_of_order_queue = tp->nip_out_of_order_queue->next;
-		kfree_skb(skb);
-	}
+	skb_rbtree_purge(&tp->out_of_order_queue);
 }
 
 void tcp_nip_destroy_sock(struct sock *sk)
@@ -1279,7 +1275,7 @@ void tcp_nip_destroy_sock(struct sock *sk)
 
	tcp_nip_write_queue_purge(sk);
 
-	skb_nip_ofo_queue_purge(sk);
+	skb_nip_rbtree_purge(sk);
 
	if (inet_csk(sk)->icsk_bind_hash)
 		inet_put_port(sk);
diff --git a/src/linux/net/newip/tcp_nip_input.c b/src/linux/net/newip/tcp_nip_input.c
index 94b77c5..289973e 100644
--- a/src/linux/net/newip/tcp_nip_input.c
+++ b/src/linux/net/newip/tcp_nip_input.c
@@ -105,105 +105,340 @@ void tcp_nip_fin(struct sock *sk)
 		sk->sk_state_change(sk);
 }
 
+static void tcp_nip_drop(struct sock *sk, struct sk_buff *skb)
+{
+	sk_drops_add(sk, skb);
+	__kfree_skb(skb);
+}
+
 static void tcp_nip_overlap_handle(struct tcp_sock *tp, struct sk_buff *skb)
 {
-	u32 diff = tp->rcv_nxt - TCP_SKB_CB(skb)->seq;
-	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+	u32 diff = tp->rcv_nxt - TCP_SKB_CB(skb)->seq;
+	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
 
-	skb->data += diff;
-	skb->len -= diff;
-	tcb->seq += diff;
+	skb->data += diff;
+	skb->len -= diff;
+	tcb->seq += diff;
 }
 
-static void tcp_nip_ofo_queue(struct sock *sk)
+static void tcp_nip_left_overlap(struct sk_buff *cur, struct sk_buff *skb)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
+	u32 diff = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(cur)->seq;
+	struct tcp_skb_cb *tcb = TCP_SKB_CB(cur);
 
-	while (tp->nip_out_of_order_queue) {
-		struct sk_buff *skb = tp->nip_out_of_order_queue;
+	cur->data += diff;
+	cur->len -= diff;
+	tcb->seq += diff;
+}
 
-		if (after(TCP_SKB_CB(tp->nip_out_of_order_queue)->seq, tp->rcv_nxt))
-			return;
-		tp->nip_out_of_order_queue = tp->nip_out_of_order_queue->next;
-		skb->next = NULL;
-		if (tp->rcv_nxt != TCP_SKB_CB(skb)->seq)
-			tcp_nip_overlap_handle(tp, skb);
+static void tcp_nip_right_overlap(struct sk_buff *cur, struct sk_buff *skb)
+{
+	u32 diff = TCP_SKB_CB(cur)->end_seq - TCP_SKB_CB(skb)->seq;
+	struct tcp_skb_cb *tcb = TCP_SKB_CB(cur);
+	unsigned int len;
 
-		__skb_queue_tail(&sk->sk_receive_queue, skb);
-		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+
+	len = cur->len - diff;
+	/* NewIP currently uses only the linear region, so skb_trim() is enough
+	 * to remove data from the end of the buffer; if the nonlinear region
+	 * is used later, switch to pskb_trim().
+	 */
+	skb_trim(cur, len);
+	tcb->end_seq -= diff;
+}
+
+/* If we update tp->rcv_nxt, also update tp->bytes_received */
+static void tcp_nip_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
+{
+	u32 delta = seq - tp->rcv_nxt;
+
+	sock_owned_by_me((struct sock *)tp);
+	tp->bytes_received += delta;
+	WRITE_ONCE(tp->rcv_nxt, seq);
+}
+
+/* tcp_nip_try_coalesce - try to merge skb to prior one
+ * @sk: socket
+ * @to: prior buffer
+ * @from: buffer to add in queue
+ * @fragstolen: pointer to boolean
+ *
+ * Before queueing skb @from after @to, try to merge them
+ * to reduce overall memory use and queue lengths, if cost is small.
+ * Packets in ofo or receive queues can stay a long time.
+ * Better try to coalesce them right now to avoid future collapses.
+ * Returns true if caller should free @from instead of queueing it
+ */
+static bool tcp_nip_try_coalesce(struct sock *sk,
+				 struct sk_buff *to,
+				 struct sk_buff *from,
+				 bool *fragstolen)
+{
+	int delta;
+
+	*fragstolen = false;
 
-	while (tp->nip_out_of_order_queue &&
-	       before(TCP_SKB_CB(tp->nip_out_of_order_queue)->end_seq, tp->rcv_nxt)) {
-		struct sk_buff *tmp_skb = tp->nip_out_of_order_queue;
+	/* It's possible this segment overlaps with prior segment in queue */
+	if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
+		return false;
 
-		tp->nip_out_of_order_queue = tp->nip_out_of_order_queue->next;
-		tmp_skb->next = NULL;
-		__kfree_skb(tmp_skb);
+	if (!skb_try_coalesce(to, from, fragstolen, &delta)) {
+		nip_dbg("try to merge skb to the previous one failed");
+		return false;
+	}
+
+	atomic_add(delta, &sk->sk_rmem_alloc);
+	sk_mem_charge(sk, delta);
+	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
+	TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
+	TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
+	TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags;
+
+	if (TCP_SKB_CB(from)->has_rxtstamp) {
+		TCP_SKB_CB(to)->has_rxtstamp = true;
+		to->tstamp = from->tstamp;
+		skb_hwtstamps(to)->hwtstamp = skb_hwtstamps(from)->hwtstamp;
+	}
+
+	return true;
+}
+
+static bool tcp_nip_ooo_try_coalesce(struct sock *sk,
+				     struct sk_buff *to,
+				     struct sk_buff *from,
+				     bool *fragstolen)
+{
+	bool res = tcp_nip_try_coalesce(sk, to, from, fragstolen);
+
+	/* In case tcp_nip_drop() is called later, update to->gso_segs */
+	if (res) {
+		u32 gso_segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) +
+			       max_t(u16, 1, skb_shinfo(from)->gso_segs);
+		u32 to_gso_segs = skb_shinfo(to)->gso_segs;
+
+		nip_dbg("(to)->gso_segs %u, (from)->gso_segs %u", skb_shinfo(to)->gso_segs,
+			skb_shinfo(from)->gso_segs);
+		skb_shinfo(to)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
+		nip_dbg("gso_segs %u to %u", to_gso_segs, skb_shinfo(to)->gso_segs);
+	}
+	return res;
+}
+
+/* This one checks to see if we can put data from the
+ * out_of_order queue into the receive_queue.
+ */
+static void tcp_nip_ofo_queue(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	bool fin;
+	bool fragstolen;
+	bool eaten;
+	struct sk_buff *skb;
+	struct sk_buff *tail;
+	struct rb_node *p;
+
+	p = rb_first(&tp->out_of_order_queue);
+	while (p) {
+		skb = rb_to_skb(p);
+		if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
+			nip_dbg("nodes are all after rcv_nxt");
+			break;
+		}
+
+		p = rb_next(p);
+		rb_erase(&skb->rbnode, &tp->out_of_order_queue);
+
+		if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
+			nip_dbg("this node is before rcv_nxt, drop skb");
+			tcp_nip_drop(sk, skb);
+			continue;
+		}
+
+		tail = skb_peek_tail(&sk->sk_receive_queue);
+		eaten = tail && tcp_nip_try_coalesce(sk, tail, skb, &fragstolen);
+		tcp_nip_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
+		fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
+		if (!eaten)
+			__skb_queue_tail(&sk->sk_receive_queue, skb);
+		else
+			kfree_skb_partial(skb, fragstolen);
+
+		if (unlikely(fin)) {
+			nip_dbg("will send fin");
+			tcp_nip_fin(sk);
+			/* tcp_nip_fin() purges tp->out_of_order_queue,
+			 * so we must end this loop right now.
+			 */
+			break;
 		}
 	}
 }
 
-/* Maintain a sort list order by the seq. */
+/* tcp_nip_data_queue() is responsible for receiving socket data. For packets
+ * whose start sequence number is after the sequence expected by the socket
+ * but still within the receive window, this function is called to add them
+ * to the TCP out-of-order queue.
+ */
 static void tcp_nip_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 {
	struct tcp_sock *tp = tcp_sk(sk);
-	struct sk_buff *pre_skb, *cur_skb;
+	struct rb_node **p;
+	struct rb_node *parent;
+	struct sk_buff *skb1;
+	struct sk_buff *skb2;
+	u32 seq;
+	u32 end_seq;
+	bool fragstolen;
+
+	if (unlikely(atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)) {
+		nip_dbg("no memory, drop pkt");
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
+		sk->sk_data_ready(sk);
+		tcp_nip_drop(sk, skb);
+		return;
+	}
+	/* Disable header prediction. */
+	tp->pred_flags = 0;
+	/* Set the ICSK_ACK_SCHED flag bit to indicate that an ACK needs to be sent. */
	inet_csk_schedule_ack(sk);
-	skb->next = NULL;
-	if (!tp->nip_out_of_order_queue) {
-		tp->nip_out_of_order_queue = skb;
-		skb_set_owner_r(skb, sk);
-		return;
+
+	tp->rcv_ooopack += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
+	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
+	seq = TCP_SKB_CB(skb)->seq;
+	end_seq = TCP_SKB_CB(skb)->end_seq;
+
+	/* If this is the first out-of-order packet, out_of_order_queue is
+	 * empty: insert the skb and update the last-skb pointer ooo_last_skb.
+	 */
+	p = &tp->out_of_order_queue.rb_node;
+	if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
+		nip_dbg("add first ofo pkt");
+		rb_link_node(&skb->rbnode, NULL, p);
+		rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
+		tp->ooo_last_skb = skb;
+		goto end;
 	}
-	pre_skb = tp->nip_out_of_order_queue;
-	cur_skb = pre_skb->next;
-	if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(pre_skb)->seq) {
-		if (TCP_SKB_CB(skb)->end_seq > TCP_SKB_CB(pre_skb)->end_seq) {
-			skb->next = pre_skb->next;
-			pre_skb->next = NULL;
-			skb_set_owner_r(skb, sk);
-			__kfree_skb(pre_skb);
-			return;
+
+	/* In the typical case, we are adding an skb to the end of the list.
+	 * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
+	 */
+	if (tcp_nip_ooo_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) {
+coalesce_done:
+		/* fragstolen means the data in the skb's linear area was
+		 * merged, but its page fragments are still in use elsewhere,
+		 * so the skb cannot be fully released
+		 */
+		nip_dbg("ofo skb coalesce done");
+		kfree_skb_partial(skb, fragstolen);
+		skb = NULL;
+		goto end;
+	}
+	/* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */
+	if (!before(seq, TCP_SKB_CB(tp->ooo_last_skb)->end_seq)) {
+		nip_dbg("add skb after ooo_last_skb");
+		parent = &tp->ooo_last_skb->rbnode;
+		p = &parent->rb_right;
+		goto insert;
+	}
+
+	if (after(seq, TCP_SKB_CB(tp->ooo_last_skb)->seq)) {
+		tcp_nip_left_overlap(skb, tp->ooo_last_skb);
+		if (tcp_nip_ooo_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) {
+			nip_dbg("ofo skb coalesce ooo_last_skb done");
+			goto coalesce_done;
+		} else {
+			nip_dbg("ofo skb coalesce ooo_last_skb failed, drop pkt");
+			tcp_nip_drop(sk, skb);
+			skb = NULL;
+			goto end;
 		}
-		__kfree_skb(skb);
-		return;
-	} else if (TCP_SKB_CB(skb)->seq < TCP_SKB_CB(pre_skb)->seq) {
-		tp->nip_out_of_order_queue = skb;
-		skb->next = pre_skb;
-		skb_set_owner_r(skb, sk);
-		return;
 	}
-	while (cur_skb) {
-		if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(cur_skb)->seq) {
-			/* Same seq, if skb end_seq is bigger, replace. */
-			if (TCP_SKB_CB(skb)->end_seq > TCP_SKB_CB(cur_skb)->end_seq) {
-				pre_skb->next = skb;
-				skb->next = cur_skb->next;
-				cur_skb->next = NULL;
-				skb_set_owner_r(skb, sk);
-				__kfree_skb(cur_skb);
+
+	/* Find place to insert this segment. Handle overlaps on the way. */
+	parent = NULL;
+	while (*p) {
+		parent = *p;
+		skb1 = rb_to_skb(parent);
+		if (before(seq, TCP_SKB_CB(skb1)->seq)) {
+			p = &parent->rb_left;
+			continue;
+		}
+		if (before(seq, TCP_SKB_CB(skb1)->end_seq)) {
+			if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
+				/* skb1->seq <= seq, end_seq <= skb1->end_seq */
+				nip_dbg("completely overlapping, drop pkt");
+				NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
+				tcp_nip_drop(sk, skb);
+				skb = NULL;
+				goto end;
+			}
+			if (after(seq, TCP_SKB_CB(skb1)->seq)) {
+				/* skb1->seq < seq, end_seq > skb1->end_seq */
+				tcp_nip_left_overlap(skb, skb1);
+				skb2 = skb_rb_next(skb1);
+				if (skb2 && before(TCP_SKB_CB(skb2)->seq, TCP_SKB_CB(skb)->end_seq))
+					tcp_nip_right_overlap(skb, skb2);
+				if (tcp_nip_ooo_try_coalesce(sk, skb1, skb, &fragstolen)) {
+					nip_dbg("partial overlap, ofo skb coalesce done");
+					goto coalesce_done;
+				} else {
+					nip_dbg("partial overlap, ofo skb coalesce failed, drop pkt");
+					tcp_nip_drop(sk, skb);
+					skb = NULL;
+					goto end;
+				}
 			} else {
-				__kfree_skb(skb);
+				/* skb1->seq == seq, end_seq > skb1->end_seq
+				 * partial overlap, skb covers skb1, replace skb1 with skb.
+				 */
+				nip_dbg("partial overlap, replace old skb node");
+				rb_replace_node(&skb1->rbnode, &skb->rbnode,
+						&tp->out_of_order_queue);
+				NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
+				tcp_nip_drop(sk, skb1);
+				goto merge_right;
 			}
-			return;
-		} else if (TCP_SKB_CB(skb)->seq < TCP_SKB_CB(cur_skb)->seq) {
-			pre_skb->next = skb;
-			skb->next = cur_skb;
-			skb_set_owner_r(skb, sk);
-			return;
+		} else if (tcp_nip_ooo_try_coalesce(sk, skb1, skb, &fragstolen)) {
+			nip_dbg("ofo skb coalesce done while scan ofo queue");
+			goto coalesce_done;
 		}
-		pre_skb = pre_skb->next;
-		cur_skb = cur_skb->next;
+		p = &parent->rb_right;
 	}
-	pre_skb->next = skb;
-	skb_set_owner_r(skb, sk);
-}
+insert:
+	/* Insert segment into RB tree. */
+	nip_dbg("add skb into ofo queue");
+	rb_link_node(&skb->rbnode, parent, p);
+	rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
+
+merge_right:
+	/* Remove other segments covered by skb. */
+	while ((skb1 = skb_rb_next(skb)) != NULL) {
+		if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
+			break;
+		if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
+			tcp_nip_right_overlap(skb, skb1);
+			nip_dbg("partial overlap, trim the right side of the current packet");
+			break;
+		}
+		nip_dbg("del overlapping nodes on the right");
+		rb_erase(&skb1->rbnode, &tp->out_of_order_queue);
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
+		tcp_nip_drop(sk, skb1);
+	}
+	/* If there is no skb after us, we are the last_skb ! */
+	if (!skb1)
+		tp->ooo_last_skb = skb;
 
-static void tcp_drop(struct sock *sk, struct sk_buff *skb)
-{
-	sk_drops_add(sk, skb);
-	__kfree_skb(skb);
+end:
+	if (skb) {
+		/* Try space compression for the skb: if the skb has enough space
+		 * left in its linear area, the page fragments from its shared area
+		 * can be copied into the linear area so the fragments can be freed.
+		 * If the remaining linear space is smaller than the fragment length,
+		 * or the skb has been cloned (fragments shared with other skbs),
+		 * no compression is performed.
+		 */
+		skb_condense(skb);
+		skb_set_owner_r(skb, sk);
+	}
 }
 
 #define PKT_DISCARD_MAX 500
@@ -246,7 +481,7 @@ static void tcp_nip_data_queue(struct sock *sk, struct sk_buff *skb)
 			TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
 out_of_window:
 		inet_csk_schedule_ack(sk);
-		tcp_drop(sk, skb);
+		tcp_nip_drop(sk, skb);
 		return;
 	}
	icsk->icsk_ack.lrcvtime = tcp_jiffies32;
@@ -259,7 +494,7 @@ out_of_window:
 		/* wake up processes that are blocked for lack of data */
 		sk->sk_data_ready(sk);
 		inet_csk_schedule_ack(sk);
-		tcp_drop(sk, skb);
+		tcp_nip_drop(sk, skb);
 		return;
 	}
 
@@ -282,7 +517,7 @@ out_of_window:
	inet_csk_schedule_ack(sk);
	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
 		tcp_nip_fin(sk);
-	if (tp->nip_out_of_order_queue)
+	if (!RB_EMPTY_ROOT(&tp->out_of_order_queue))
 		tcp_nip_ofo_queue(sk);
	if (!sock_flag(sk, SOCK_DEAD))
 		sk->sk_data_ready(sk);
@@ -300,7 +535,7 @@ static inline void tcp_nip_push_pending_frames(struct sock *sk)
 {
	if (tcp_nip_send_head(sk)) {
 		struct tcp_sock *tp = tcp_sk(sk);
-		u32 cur_mss = tcp_nip_current_mss(sk); // TCP_BASE_MSS
+		u32 cur_mss = tcp_nip_current_mss(sk);	// TCP_BASE_MSS
 
 		__tcp_nip_push_pending_frames(sk, cur_mss, tp->nonagle);
	}
@@ -366,8 +601,8 @@ static void __tcp_nip_ack_snd_check(struct sock *sk, int ofo_possible)
	if (((tp->rcv_nxt - tp->rcv_wup) > get_ack_num() * inet_csk(sk)->icsk_ack.rcv_mss &&
	     __nip_tcp_select_window(sk) >= tp->rcv_wnd) ||
	    /* We have out of order data. */
-	    (ofo_possible && tp->nip_out_of_order_queue)) {
-		if (ofo_possible && tp->nip_out_of_order_queue) {
+	    (ofo_possible && (!RB_EMPTY_ROOT(&tp->out_of_order_queue)))) {
+		if (ofo_possible && (!RB_EMPTY_ROOT(&tp->out_of_order_queue))) {
 			if (tp->rcv_nxt == ntp->last_rcv_nxt) {
 				ntp->dup_ack_cnt++;
 			} else {
@@ -497,12 +732,6 @@ struct request_sock *ninet_reqsk_alloc(const struct request_sock_ops *ops,
	return req;
 }
 
-static void tcp_nip_drop(struct sock *sk, struct sk_buff *skb)
-{
-	sk_drops_add(sk, skb);
-	__kfree_skb(skb);
-}
-
 void tcp_nip_parse_mss(struct tcp_options_received *opt_rx,
		       const struct tcphdr *th,
		       const unsigned char *ptr,
@@ -1289,7 +1518,7 @@ static bool tcp_nip_validate_incoming(struct sock *sk, struct sk_buff *skb,
	return true;
 
 discard:
-	tcp_drop(sk, skb);
+	tcp_nip_drop(sk, skb);
	return false;
 }
 
@@ -1315,7 +1544,7 @@ void tcp_nip_rcv_established(struct sock *sk, struct sk_buff *skb,
	return;
 
 discard:
-	tcp_drop(sk, skb);
+	tcp_nip_drop(sk, skb);
 }
 
 static u32 tcp_default_init_rwnd(u32 mss)
@@ -1436,7 +1665,7 @@ static int tcp_nip_rcv_synsent_state_process(struct sock *sk, struct sk_buff *sk
 		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
 		tcp_nip_ack(sk, skb);
-		tp->nip_out_of_order_queue = NULL;
+		tp->out_of_order_queue = RB_ROOT;
 		/* The next data number expected to be accepted is +1 */
 		tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
 		/* Accept the left margin of the window +1 */
@@ -1485,7 +1714,7 @@ static int tcp_nip_rcv_synsent_state_process(struct sock *sk, struct sk_buff *sk
 		tcp_nip_send_ack(sk);
 		return -1;
 discard:
-	tcp_drop(sk, skb);
+	tcp_nip_drop(sk, skb);
	return 0;
 }
-- 
Gitee
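
The rbtree walks in this patch order segments with the kernel's wraparound-safe before()/after() tests on 32-bit TCP sequence numbers rather than plain integer comparison. A minimal standalone C sketch of that rule (before() and after() are re-implemented here for illustration, not taken from kernel headers):

#include <stdio.h>
#include <stdint.h>

/* The sign of the 32-bit difference decides order, so comparisons stay
 * correct even when sequence numbers wrap past 2^32 - 1.
 */
static int before(uint32_t seq1, uint32_t seq2)
{
	return (int32_t)(seq1 - seq2) < 0;
}

static int after(uint32_t seq1, uint32_t seq2)
{
	return before(seq2, seq1);
}

int main(void)
{
	/* Plain case: 1000 precedes 2000. */
	printf("before(1000, 2000) = %d\n", before(1000, 2000));
	/* Wrap case: 0x10 logically follows 0xFFFFFFF0 although it is
	 * numerically smaller.
	 */
	printf("after(0x10, 0xFFFFFFF0) = %d\n", after(0x10, 0xFFFFFFF0));
	return 0;
}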
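
The overlap handling above (tcp_nip_left_overlap(), tcp_nip_right_overlap(), and the contiguity test in tcp_nip_try_coalesce()) reduces to interval arithmetic on byte ranges [seq, end_seq). A minimal sketch with plain integers, assuming no sequence wraparound and using hypothetical trim_left/trim_right/can_coalesce helpers in place of the skb-based functions:

#include <stdio.h>
#include <stdint.h>

struct seg { uint32_t seq; uint32_t end_seq; };	/* byte range [seq, end_seq) */

/* Advance cur's head past data already covered by prev
 * (the diff computed in tcp_nip_left_overlap()).
 */
static void trim_left(struct seg *cur, const struct seg *prev)
{
	uint32_t diff = prev->end_seq - cur->seq;

	cur->seq += diff;		/* cur now starts where prev ends */
}

/* Trim cur's tail so it stops where next begins
 * (the diff computed in tcp_nip_right_overlap()).
 */
static void trim_right(struct seg *cur, const struct seg *next)
{
	uint32_t diff = cur->end_seq - next->seq;

	cur->end_seq -= diff;		/* cur now ends where next starts */
}

/* Contiguity test behind tcp_nip_try_coalesce(): merge only a segment
 * that starts exactly at the previous segment's end.
 */
static int can_coalesce(const struct seg *to, const struct seg *from)
{
	return from->seq == to->end_seq;
}

int main(void)
{
	struct seg queued = { 100, 200 };	/* already in the ofo tree */
	struct seg next   = { 260, 400 };	/* its right-hand neighbour */
	struct seg skb    = { 150, 300 };	/* new segment overlapping both */

	trim_left(&skb, &queued);		/* [150,300) -> [200,300) */
	trim_right(&skb, &next);		/* [200,300) -> [200,260) */
	printf("trimmed to [%u,%u), can_coalesce=%d\n",
	       skb.seq, skb.end_seq, can_coalesce(&queued, &skb));
	return 0;
}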
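
The forwarding fix rests on nip_fib_rule_lookup() reporting which table answered: a local-table route means deliver locally, a main-table route means forward, and nip_route_input() refuses a main-table route that would leave through the device the packet arrived on. A minimal sketch of that decision, with hypothetical struct route / fib_lookup stand-ins rather than the kernel FIB API:

#include <stdio.h>

enum tbl_type { TBL_LOCAL, TBL_MAIN, TBL_NONE };

/* Hypothetical stand-ins for the NewIP local and main FIB tables;
 * valid == 0 means the table holds no route for the destination.
 */
struct route { int valid; int out_ifindex; };

static struct route local_tbl = { 0, 0 };	/* destination is not local */
static struct route main_tbl  = { 1, 1 };	/* default route egresses ifindex 1 */

/* Two-step lookup mirroring nip_fib_rule_lookup(): local table first,
 * then main table, reporting which one answered via *tbl_type.
 */
static struct route *fib_lookup(enum tbl_type *tbl_type)
{
	if (local_tbl.valid) {
		*tbl_type = TBL_LOCAL;
		return &local_tbl;
	}
	if (main_tbl.valid) {
		*tbl_type = TBL_MAIN;
		return &main_tbl;
	}
	*tbl_type = TBL_NONE;
	return NULL;
}

int main(void)
{
	enum tbl_type tbl_type;
	int in_ifindex = 1;			/* the packet arrived on ifindex 1 */
	struct route *rt = fib_lookup(&tbl_type);

	/* Mirrors the new check in nip_route_input(): a main-table route whose
	 * egress equals the ingress would bounce the packet straight back
	 * (the Bluetooth-link symptom), so the caller drops it instead.
	 */
	if (rt && tbl_type == TBL_MAIN && rt->out_ifindex == in_ifindex)
		printf("drop: route would forward the packet back out its ingress\n");
	else
		printf("deliver or forward normally\n");
	return 0;
}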