diff --git a/0200-support-sack-in.patch b/0200-support-sack-in.patch
new file mode 100644
index 0000000000000000000000000000000000000000..ebd992cc3d2dadfe05954983cbc4cc19d807e071
--- /dev/null
+++ b/0200-support-sack-in.patch
@@ -0,0 +1,497 @@
+ src/core/tcp_in.c                | 154 +++++++++++++++++++++++++++++++++++++++
+ src/core/tcp_out.c               | 146 ++++++++++++++++++++++++++++++++-----
+ src/include/lwip/priv/tcp_priv.h |  11 +++
+ src/include/lwip/tcp.h           |   7 ++
+ src/include/lwipopts.h           |   1 +
+ 5 files changed, 299 insertions(+), 20 deletions(-)
+
+diff --git a/src/core/tcp_in.c b/src/core/tcp_in.c
+index 05c97d0..732999b 100644
+--- a/src/core/tcp_in.c
++++ b/src/core/tcp_in.c
+@@ -1261,6 +1261,10 @@ tcp_free_acked_segments(struct tcp_pcb *pcb, struct tcp_seg *seg_list, const cha
+ 
+     pcb->snd_queuelen = (u16_t)(pcb->snd_queuelen - clen);
+     recv_acked = (tcpwnd_size_t)(recv_acked + next->len);
++#if GAZELLE_SACK_IN
++    if (next->flags & TF_SEG_SACKED)
++      pcb->sacked_num--;
++#endif
+     tcp_seg_free(next);
+ 
+     LWIP_DEBUGF(TCP_QLEN_DEBUG, ("%"TCPWNDSIZE_F" (after freeing %s)\n",
+@@ -1274,6 +1278,122 @@ tcp_free_acked_segments(struct tcp_pcb *pcb, struct tcp_seg *seg_list, const cha
+   return seg_list;
+ }
+ 
++#if GAZELLE_SACK_IN
++static bool tcp_is_sackblock_valid(struct tcp_pcb *pcb, u32_t start_seq, u32_t end_seq)
++{
++  /* reject blocks that end beyond snd_nxt or whose edges are empty/reversed */
++  if (TCP_SEQ_GT(end_seq, pcb->snd_nxt) || TCP_SEQ_GEQ(start_seq, end_seq))
++    return false;
++
++  return TCP_SEQ_GT(start_seq, pcb->lastack);
++}
++
++static int tcp_sack_cache_ok(const struct tcp_pcb *pcb, const struct tcp_sack_range *cache)
++{
++  return cache < pcb->recv_sack_cache + LWIP_TCP_MAX_SACK_NUM;
++}
++
++/* tag segments of seg_list (may be NULL) that the SACK option at offset pcb->sacked acknowledges, then clear that offset */
++static struct tcp_seg *
++tcp_sack_segments(struct tcp_pcb *pcb, struct tcp_seg *seg_list)
++{
++  const unsigned char *sack_ptr = (u8_t *)tcphdr + TCP_HLEN + pcb->sacked;
++  struct tcp_sack_range *sack_opt = (struct tcp_sack_range *)(sack_ptr + 2);
++  struct tcp_sack_range sack_tmp[LWIP_TCP_MAX_SACK_NUM];
++  struct tcp_sack_range *cache;
++  int num_sacks = LWIP_MIN(LWIP_TCP_MAX_SACK_NUM, (sack_ptr[1] - LWIP_TCP_OPT_LEN_SACK_BASE) >> 3);
++
++  int used_sacks = 0;
++  for (int i = 0; i < num_sacks; i++) {
++    sack_tmp[used_sacks].left = lwip_ntohl(sack_opt[i].left);
++    sack_tmp[used_sacks].right = lwip_ntohl(sack_opt[i].right);
++    if (!tcp_is_sackblock_valid(pcb, sack_tmp[used_sacks].left, sack_tmp[used_sacks].right)) {
++      continue;
++    }
++    used_sacks++;
++  }
++
++  /* order SACK blocks by ascending left edge */
++  struct tcp_sack_range tmp;
++  for (int i = used_sacks - 1; i > 0; i--) {
++    for (int j = 0; j < i; j++) {
++      if (TCP_SEQ_GT(sack_tmp[j].left, sack_tmp[j + 1].left)) {
++        tmp = sack_tmp[j];
++        sack_tmp[j] = sack_tmp[j + 1];
++        sack_tmp[j + 1] = tmp;
++      }
++    }
++  }
++
++  /* get cached blocks */
++  if (pcb->sacked_num == 0) {
++    cache = pcb->recv_sack_cache + LWIP_TCP_MAX_SACK_NUM;
++  } else {
++    cache = pcb->recv_sack_cache;
++    /* skip empty blocks at the head of the cache */
++    while (tcp_sack_cache_ok(pcb, cache) && !cache->left && !cache->right) {
++      cache++;
++    }
++  }
++
++  struct tcp_seg *cur_seg = seg_list;
++  u32_t seqno;
++
++  for (int i = 0; i < used_sacks; i++) {
++    u32_t start_seq = sack_tmp[i].left;
++    u32_t end_seq = sack_tmp[i].right;
++
++    /* skip too early cached blocks */
++    while (tcp_sack_cache_ok(pcb, cache) && TCP_SEQ_GEQ(start_seq, cache->right)) {
++      cache++;
++    }
++
++    /* match with cached block, skip */
++    if (tcp_sack_cache_ok(pcb, cache) && start_seq == cache->left && end_seq == cache->right) {
++      continue;
++    }
++
++    /* skip, if seg was already sacked or seqno less than sack option left */
++    while (cur_seg && (cur_seg->flags & TF_SEG_SACKED || TCP_SEQ_LT(lwip_ntohl(cur_seg->tcphdr->seqno), start_seq))) {
++      cur_seg = cur_seg->next;
++    }
++
++    if (cur_seg && TCP_SEQ_GT(lwip_ntohl(cur_seg->tcphdr->seqno), end_seq)) {
++      continue;
++    }
++
++    while (cur_seg) {
++      seqno = lwip_ntohl(cur_seg->tcphdr->seqno);
++      if (TCP_SEQ_GEQ(seqno, start_seq) && TCP_SEQ_LEQ(seqno + TCP_TCPLEN(cur_seg), end_seq)) {
++        /* tag segment sacked; a segment tagged by an earlier SACK must not be counted twice */
++        pcb->sacked_num += (cur_seg->flags & TF_SEG_SACKED) ? 0 : 1;
++        cur_seg->flags |= TF_SEG_SACKED;
++      } else {
++        break;
++      }
++      cur_seg = cur_seg->next;
++    }
++
++    if (cur_seg == NULL) {
++      break;
++    }
++  }
++
++  /* cache received sack blocks, right-aligned; unused leading slots are zeroed */
++  for (int i = 0; i < LWIP_TCP_MAX_SACK_NUM - used_sacks; i++) {
++    pcb->recv_sack_cache[i].left = 0;
++    pcb->recv_sack_cache[i].right = 0;
++  }
++  for (int i = 0; i < used_sacks; i++) {
++    pcb->recv_sack_cache[LWIP_TCP_MAX_SACK_NUM - used_sacks + i] = sack_tmp[i];
++  }
++
++  pcb->sacked = 0;
++
++  return seg_list;
++}
++#endif
++
+ /**
+  * Called by tcp_process. Checks if the given segment is an ACK for outstanding
+  * data, and if so frees the memory of the buffered data. Next, it places the
+@@ -1344,6 +1464,16 @@ tcp_receive(struct tcp_pcb *pcb)
+ 
+     /* Clause 1 */
+     if (TCP_SEQ_LEQ(ackno, pcb->lastack)) {
++#if GAZELLE_SACK_IN
++      /* sacked data may be discarded by the peer, so retransmit all unacked segments */
++      if (pcb->unacked && pcb->unacked->flags & TF_SEG_SACKED && ackno == lwip_ntohl(pcb->unacked->tcphdr->seqno)) {
++        pcb->sack_retrans = 1;
++        tcp_rexmit(pcb);
++      }
++      if (pcb->sacked) { /* run even with an empty unacked list so the option offset is always consumed */
++        tcp_sack_segments(pcb, pcb->unacked);
++      }
++#endif
+       /* Clause 2 */
+       if (tcplen == 0) {
+         /* Clause 3 */
+@@ -1426,6 +1556,11 @@ tcp_receive(struct tcp_pcb *pcb)
+       pcb->unacked = tcp_free_acked_segments(pcb, pcb->unacked, "unacked", pcb->unsent);
+       if (pcb->unacked == NULL)
+         pcb->last_unacked = NULL;
++#if GAZELLE_SACK_IN
++      if (pcb->sacked) { /* run even with an empty unacked list so the option offset is always consumed */
++        tcp_sack_segments(pcb, pcb->unacked);
++      }
++#endif
+       /* We go through the ->unsent list to see if any of the segments
+          on the list are acknowledged by the ACK. This may seem
+          strange since an "unsent" segment shouldn't be acked. The
+@@ -2159,6 +2294,25 @@ tcp_parseopt(struct tcp_pcb *pcb)
+           }
+           break;
+ #endif /* LWIP_TCP_SACK_OUT */
++#if GAZELLE_SACK_IN
++        case LWIP_TCP_OPT_SACK:
++          LWIP_DEBUGF(TCP_INPUT_DEBUG, ("tcp_parseopt: SACK\n"));
++          u8_t opsize = tcp_get_next_optbyte();
++          if ((opsize < (LWIP_TCP_OPT_LEN_SACK_BASE + LWIP_TCP_OPT_LEN_SACK_PERBLOCK)) ||
++              (opsize > (LWIP_TCP_OPT_LEN_SACK_BASE + LWIP_TCP_MAX_SACK_NUM * LWIP_TCP_OPT_LEN_SACK_PERBLOCK)) ||
++              ((opsize - LWIP_TCP_OPT_LEN_SACK_BASE) % LWIP_TCP_OPT_LEN_SACK_PERBLOCK) ||
++              (tcp_optidx - 2 + opsize) > tcphdr_optlen) {
++            /* Bad length */
++            LWIP_DEBUGF(TCP_INPUT_DEBUG, ("tcp_parseopt: bad length\n"));
++            return;
++          }
++          if (tcp_is_flag_set(pcb, TF_SACK)) {
++            pcb->sacked = tcp_optidx - 2;
++          }
++          /* always step over the SACK blocks so their bytes are never parsed as option kinds */
++          tcp_optidx += opsize - 2;
++          break;
++#endif /* GAZELLE_SACK_IN */
+         default:
+           LWIP_DEBUGF(TCP_INPUT_DEBUG, ("tcp_parseopt: other\n"));
+           data = tcp_get_next_optbyte();
+diff --git a/src/core/tcp_out.c b/src/core/tcp_out.c
+index 4cf1a62..e5c1f48 100644
+--- a/src/core/tcp_out.c
++++ b/src/core/tcp_out.c
+@@ -2061,11 +2061,24 @@ tcp_rexmit_rto_prepare(struct tcp_pcb *pcb)
+     return ERR_VAL;
+   }
+ 
++#if GAZELLE_SACK_IN
++  /* the head of unacked queue is sacked, return */
++  if (pcb->unacked->flags & TF_SEG_SACKED) {
++    return ERR_VAL;
++  }
++#endif
++
+   /* Move all unacked segments to the head of the unsent queue.
+      However, give up if any of the unsent pbufs are still referenced by the
+      netif driver due to deferred transmission. No point loading the link further
+      if it is struggling to flush its buffered writes. */
+   for (seg = pcb->unacked; seg->next != NULL; seg = seg->next) {
++#if GAZELLE_SACK_IN
++    /* don't retransmit sacked segments: stop in front of the first sacked one */
++    if (pcb->sacked_num > 0 && seg->next->flags & TF_SEG_SACKED) {
++      break;
++    }
++#endif
+     if (tcp_output_segment_busy(seg)) {
+       LWIP_DEBUGF(TCP_RTO_DEBUG, ("tcp_rexmit_rto: segment busy\n"));
+       return ERR_VAL;
+@@ -2075,6 +2088,11 @@ tcp_rexmit_rto_prepare(struct tcp_pcb *pcb)
+     LWIP_DEBUGF(TCP_RTO_DEBUG, ("tcp_rexmit_rto: segment busy\n"));
+     return ERR_VAL;
+   }
++
++#if GAZELLE_SACK_IN
++  struct tcp_seg *tmp_seg = seg->next;
++#endif
++
+   /* concatenate unsent queue after unacked queue */
+   seg->next = pcb->unsent;
+ #if TCP_OVERSIZE_DBGCHECK
+@@ -2084,6 +2102,16 @@ tcp_rexmit_rto_prepare(struct tcp_pcb *pcb)
+   }
+ #endif /* TCP_OVERSIZE_DBGCHECK */
+   /* unsent queue is the concatenated queue (of unacked, unsent) */
++#if GAZELLE_SACK_IN
++  if (pcb->unsent == NULL) {
++    pcb->last_unsent = seg;
++  }
++  pcb->unsent = pcb->unacked;
++  pcb->unacked = tmp_seg;
++  if (tmp_seg == NULL) {
++    pcb->last_unacked = NULL;
++  }
++#else
+   if (pcb->unsent == NULL) {
+     pcb->last_unsent = pcb->last_unacked;
+   }
+@@ -2091,6 +2119,7 @@ tcp_rexmit_rto_prepare(struct tcp_pcb *pcb)
+   /* unacked queue is now empty */
+   pcb->unacked = NULL;
+   pcb->last_unacked = NULL;
++#endif
+ 
+   /* Mark RTO in-progress */
+   tcp_set_flags(pcb, TF_RTO);
+@@ -2140,12 +2169,13 @@ tcp_rexmit_rto(struct tcp_pcb *pcb)
+   }
+ }
+ 
++#if GAZELLE_ENABLE
+ /**
+- * Requeue the first unacked segment for retransmission
++ * Requeue all unacked segments for retransmission until the next segment is busy
+  *
+  * Called by tcp_receive() for fast retransmit.
+  *
+- * @param pcb the tcp_pcb for which to retransmit the first unacked segment
++ * @param pcb the tcp_pcb for which to re-enqueue all unacked segments
+  */
+ err_t
+ tcp_rexmit(struct tcp_pcb *pcb)
+@@ -2160,29 +2190,33 @@ tcp_rexmit(struct tcp_pcb *pcb)
+   }
+ 
+   seg = pcb->unacked;
+-#if GAZELLE_ENABLE
+-  cur_seg = &(pcb->unsent);
+-  while (seg) {
++
++  /* Give up if the segment is still referenced by the netif driver
++     due to deferred transmission. */
++  if (tcp_output_segment_busy(seg)
++#if GAZELLE_SACK_IN
++      || (pcb->sack_retrans == 0 && seg->flags & TF_SEG_SACKED)
+ #endif
+-    /* Give up if the segment is still referenced by the netif driver
+-       due to deferred transmission. */
+-    if (tcp_output_segment_busy(seg)) {
+-      LWIP_DEBUGF(TCP_RTO_DEBUG, ("tcp_rexmit busy\n"));
+-      if (seg == pcb->unacked)
+-        return ERR_VAL;
+-      else
+-        break;
+-    }
++     ) {
++    LWIP_DEBUGF(TCP_RTO_DEBUG, ("tcp_rexmit busy\n"));
++    return ERR_VAL;
++  }
+ 
++  cur_seg = &(pcb->unsent);
++  while (seg) {
+     /* Move the first unacked segment to the unsent queue */
+     /* Keep the unsent queue sorted. */
+-    if (pcb->last_unacked == pcb->unacked)
+-      pcb->last_unacked = pcb->unacked->next;
+     pcb->unacked = pcb->unacked->next;
++    if (pcb->unacked == NULL) {
++      pcb->last_unacked = NULL;
++    }
+ 
+-#if !GAZELLE_ENABLE
+-  cur_seg = &(pcb->unsent);
++#if GAZELLE_SACK_IN
++    /* clean sacked flag; only segments that carried it were counted in sacked_num */
++    pcb->sacked_num -= (seg->flags & TF_SEG_SACKED) ? 1 : 0;
++    seg->flags = (u8_t)(seg->flags & (u8_t)(~(TF_SEG_SACKED) & 0xffU));
+ #endif
++
+     while (*cur_seg &&
+            TCP_SEQ_LT(lwip_ntohl((*cur_seg)->tcphdr->seqno), lwip_ntohl(seg->tcphdr->seqno))) {
+       cur_seg = &((*cur_seg)->next);
+@@ -2198,10 +2232,18 @@ tcp_rexmit(struct tcp_pcb *pcb)
+     }
+ #endif /* TCP_OVERSIZE */
+ 
+-#if GAZELLE_ENABLE
+     seg = pcb->unacked;
+-  }
++
++    /* Give up if the segment is still referenced by the netif driver
++       due to deferred transmission. */
++    if (seg && (tcp_output_segment_busy(seg)
++#if GAZELLE_SACK_IN
++        || (pcb->sack_retrans == 0 && seg->flags & TF_SEG_SACKED)
+ #endif
++       )) {
++      break;
++    }
++  }
+ 
+   if (pcb->nrtx < 0xFF) {
+     ++pcb->nrtx;
+@@ -2211,13 +2253,77 @@ tcp_rexmit(struct tcp_pcb *pcb)
+   pcb->rttest = 0;
+   pcb->need_tso_send = 1;
+ 
++#if GAZELLE_SACK_IN
++  pcb->sack_retrans = 0;
++#endif
++
+   /* Do the actual retransmission. */
+   MIB2_STATS_INC(mib2.tcpretranssegs);
+   /* No need to call tcp_output: we are always called from tcp_input()
+      and thus tcp_output directly returns. */
+   return ERR_OK;
+ }
++# else
++/**
++ * Requeue the first unacked segment for retransmission
++ *
++ * Called by tcp_receive() for fast retransmit.
++ *
++ * @param pcb the tcp_pcb for which to retransmit the first unacked segment
++ */
++err_t
++tcp_rexmit(struct tcp_pcb *pcb)
++{
++  struct tcp_seg *seg;
++  struct tcp_seg **cur_seg;
++
++  LWIP_ASSERT("tcp_rexmit: invalid pcb", pcb != NULL);
++
++  if (pcb->unacked == NULL) {
++    return ERR_VAL;
++  }
++
++  seg = pcb->unacked;
++
++  /* Give up if the segment is still referenced by the netif driver
++     due to deferred transmission. */
++  if (tcp_output_segment_busy(seg)) {
++    LWIP_DEBUGF(TCP_RTO_DEBUG, ("tcp_rexmit busy\n"));
++    return ERR_VAL;
++  }
+ 
++  /* Move the first unacked segment to the unsent queue */
++  /* Keep the unsent queue sorted. */
++  pcb->unacked = seg->next;
++
++  cur_seg = &(pcb->unsent);
++  while (*cur_seg &&
++         TCP_SEQ_LT(lwip_ntohl((*cur_seg)->tcphdr->seqno), lwip_ntohl(seg->tcphdr->seqno))) {
++    cur_seg = &((*cur_seg)->next );
++  }
++  seg->next = *cur_seg;
++  *cur_seg = seg;
++#if TCP_OVERSIZE
++  if (seg->next == NULL) {
++    /* the retransmitted segment is last in unsent, so reset unsent_oversize */
++    pcb->unsent_oversize = 0;
++  }
++#endif /* TCP_OVERSIZE */
++
++  if (pcb->nrtx < 0xFF) {
++    ++pcb->nrtx;
++  }
++
++  /* Don't take any rtt measurements after retransmitting. */
++  pcb->rttest = 0;
++
++  /* Do the actual retransmission. */
++  MIB2_STATS_INC(mib2.tcpretranssegs);
++  /* No need to call tcp_output: we are always called from tcp_input()
++     and thus tcp_output directly returns. */
++  return ERR_OK;
++}
++#endif
+ 
+ /**
+  * Handle retransmission after three dupacks received
+diff --git a/src/include/lwip/priv/tcp_priv.h b/src/include/lwip/priv/tcp_priv.h
+index 02df1d0..e57887d 100644
+--- a/src/include/lwip/priv/tcp_priv.h
++++ b/src/include/lwip/priv/tcp_priv.h
+@@ -267,6 +267,9 @@ struct tcp_seg {
+                                                checksummed into 'chksum' */
+ #define TF_SEG_OPTS_WND_SCALE   (u8_t)0x08U /* Include WND SCALE option (only used in SYN segments) */
+ #define TF_SEG_OPTS_SACK_PERM   (u8_t)0x10U /* Include SACK Permitted option (only used in SYN segments) */
++#if GAZELLE_SACK_IN
++#define TF_SEG_SACKED           (u8_t)0x20U /* The segment was acknowledged by a received SACK block */
++#endif
+   struct tcp_hdr *tcphdr;  /* the TCP header */
+ };
+ 
+@@ -275,6 +278,9 @@ struct tcp_seg {
+ #define LWIP_TCP_OPT_MSS         2
+ #define LWIP_TCP_OPT_WS          3
+ #define LWIP_TCP_OPT_SACK_PERM   4
++#if GAZELLE_SACK_IN
++#define LWIP_TCP_OPT_SACK        5
++#endif
+ #define LWIP_TCP_OPT_TS          8
+ 
+ #define LWIP_TCP_OPT_LEN_MSS     4
+@@ -298,6 +304,11 @@ struct tcp_seg {
+ #define LWIP_TCP_OPT_LEN_SACK_PERM_OUT 0
+ #endif
+ 
++#if GAZELLE_SACK_IN
++#define LWIP_TCP_OPT_LEN_SACK_BASE     2 /* kind + length */
++#define LWIP_TCP_OPT_LEN_SACK_PERBLOCK 8 /* left edge + right edge */
++#endif
++
+ #define LWIP_TCP_OPT_LENGTH(flags) \
+   ((flags) & TF_SEG_OPTS_MSS       ? LWIP_TCP_OPT_LEN_MSS           : 0) + \
+   ((flags) & TF_SEG_OPTS_TS        ? LWIP_TCP_OPT_LEN_TS_OUT        : 0) + \
+diff --git a/src/include/lwip/tcp.h b/src/include/lwip/tcp.h
+index 9c8a2ca..00595cc 100644
+--- a/src/include/lwip/tcp.h
++++ b/src/include/lwip/tcp.h
+@@ -307,6 +307,13 @@ struct tcp_pcb {
+ #define LWIP_TCP_SACK_VALID(pcb, idx) ((pcb)->rcv_sacks[idx].left != (pcb)->rcv_sacks[idx].right)
+ #endif /* LWIP_TCP_SACK_OUT */
+ 
++#if GAZELLE_SACK_IN
++  u8_t sacked; /* offset of the received SACK option within the TCP options; 0: none pending */
++  u8_t sack_retrans; /* used in tcp_rexmit(). 1: retransmit sacked segment */
++  u32_t sacked_num; /* number of unacked segments currently tagged TF_SEG_SACKED */
++  struct tcp_sack_range recv_sack_cache[LWIP_TCP_MAX_SACK_NUM]; /* last received sack option */
++#endif
++
+   /* Retransmission timer. */
+   s16_t rtime;
+ 
+diff --git a/src/include/lwipopts.h b/src/include/lwipopts.h
+index 4483ebe..1ff3dd7 100644
+--- a/src/include/lwipopts.h
++++ b/src/include/lwipopts.h
+@@ -63,6 +63,7 @@
+ #define GAZELLE_UDP_ENABLE 1
+ #define GAZELLE_UDP_NEW_PORT 1
+ 
++#define GAZELLE_SACK_IN 1
+ 
+ /*
+    ----------------------------------
diff --git a/lwip.spec b/lwip.spec
index 14d5c71890a057c7d1e7088047864e7375d7e6ff..8ce07f62cc8aa924ec6ee9642ef42ab2e962b080 100644
--- a/lwip.spec
+++ b/lwip.spec
@@ -172,6 +172,8 @@ Patch9156: 0157-cleancode-refactor-offload.patch
 Patch9157: 0158-enable-sys_arch-failed-log.patch
 Patch9158: 0159-LOOPBACK-fix-loop-coredump.patch
 
+Patch9200: 0200-support-sack-in.patch
+
 BuildRequires:  gcc-c++ dos2unix dpdk-devel
 
 #Requires: