From 4710c19f2f51c3f26f131a3afdfb886aea11ea7e Mon Sep 17 00:00:00 2001 From: zhengjiebing Date: Mon, 4 Mar 2024 20:52:44 +0800 Subject: [PATCH] optimize rto --- 0111-optimize-rto.patch | 493 ++++++++++++++++++++++++++++++++++++++++ lwip.spec | 7 +- 2 files changed, 499 insertions(+), 1 deletion(-) create mode 100644 0111-optimize-rto.patch diff --git a/0111-optimize-rto.patch b/0111-optimize-rto.patch new file mode 100644 index 0000000..0465a2e --- /dev/null +++ b/0111-optimize-rto.patch @@ -0,0 +1,493 @@ + src/api/sys_arch.c | 21 ++++++++++++ + src/core/tcp.c | 70 ++++++++++++++++++++++++++++++++++++++++ + src/core/tcp_in.c | 44 ++++++++++++++++++++++++- + src/core/tcp_out.c | 13 ++++++++ + src/core/timeouts.c | 24 ++++++++++++++ + src/include/arch/sys_arch.h | 3 ++ + src/include/lwip/opt.h | 4 +++ + src/include/lwip/priv/tcp_priv.h | 3 ++ + src/include/lwip/tcp.h | 10 ++++++ + src/include/lwipopts.h | 3 ++ + 10 files changed, 194 insertions(+), 1 deletion(-) + +diff --git a/src/api/sys_arch.c b/src/api/sys_arch.c +index e27b215..7770753 100644 +--- a/src/api/sys_arch.c ++++ b/src/api/sys_arch.c +@@ -78,6 +78,10 @@ static PER_THREAD struct sys_mem_stats hugepage_stats; + + static uint64_t cycles_per_ms __attribute__((aligned(64))); + static uint64_t sys_start_ms __attribute__((aligned(64))); ++#if GAZELLE_ENABLE ++static uint64_t cycles_per_us __attribute__((aligned(64))); ++static uint64_t sys_start_us __attribute__((aligned(64))); ++#endif + + /* + * Mailbox +@@ -381,6 +385,15 @@ void sys_calibrate_tsc(void) + if (sys_start_ms == 0) { + sys_start_ms = rte_rdtsc() / cycles_per_ms; + } ++#if GAZELLE_ENABLE ++#define US_PER_SEC 1E6 ++ if (cycles_per_us == 0) { ++ cycles_per_us = (freq + US_PER_SEC - 1) / US_PER_SEC; ++ } ++ if (sys_start_us == 0) { ++ sys_start_us = rte_rdtsc() / cycles_per_us; ++ } ++#endif + } + + uint32_t sys_now(void) +@@ -389,6 +402,14 @@ uint32_t sys_now(void) + return (uint32_t)(cur_ms - sys_start_ms); + } + ++#if GAZELLE_ENABLE ++uint32_t 
sys_now_us(void) ++{ ++ uint64_t cur_us = rte_rdtsc() / cycles_per_us; ++ return (uint32_t)(cur_us - sys_start_us); ++} ++#endif ++ + /* + * Critical section + * */ +diff --git a/src/core/tcp.c b/src/core/tcp.c +index 69f6953..a22d5af 100644 +--- a/src/core/tcp.c ++++ b/src/core/tcp.c +@@ -301,6 +301,57 @@ tcp_tmr(void) + } + } + ++#if GAZELLE_ENABLE ++void ++tcp_rto_tmr(void) ++{ ++ struct tcp_pcb *pcb = tcp_active_pcbs; ++ u32_t now = sys_now_us(); ++ ++ while (pcb != NULL) { ++ if ((pcb->rtime > 0)) { ++ if (now - pcb->rtime >= pcb->rto) { ++ /* Time for a retransmission. */ ++ LWIP_DEBUGF(TCP_RTO_DEBUG, ("tcp_slowtmr: rtime %"U32_F ++ " pcb->rto %"U32_F"\n", ++ pcb->rtime, pcb->rto)); ++ /* If prepare phase fails but we have unsent data but no unacked data, ++ still execute the backoff calculations below, as this means we somehow ++ failed to send segment. */ ++ if ((tcp_rexmit_rto_prepare(pcb) == ERR_OK) || ((pcb->unacked == NULL) && (pcb->unsent != NULL))) { ++ /* Double retransmission time-out unless we are trying to ++ * connect to somebody (i.e., we are in SYN_SENT). */ ++ if (pcb->state != SYN_SENT) { ++ u8_t backoff_idx = LWIP_MIN(pcb->nrtx, sizeof(tcp_backoff) - 1); ++ u32_t orig_rto = LWIP_MAX((pcb->sa >> 3) + pcb->sv, GAZELLE_MIN_RTO_US); ++ pcb->rto = orig_rto << tcp_backoff[backoff_idx]; ++ } ++ ++ /* Reset the retransmission timer. */ ++ pcb->rtime = now; ++ ++ /* Reduce congestion window and ssthresh. 
*/ ++ pcb->ssthresh = LWIP_MIN(pcb->cwnd, pcb->snd_wnd) >> 1; ++ if (pcb->ssthresh < (tcpwnd_size_t)(pcb->mss << 1)) { ++ pcb->ssthresh = (tcpwnd_size_t)(pcb->mss << 1); ++ } ++ pcb->cwnd = pcb->mss; ++ LWIP_DEBUGF(TCP_CWND_DEBUG, ("tcp_slowtmr: cwnd %"TCPWNDSIZE_F ++ " ssthresh %"TCPWNDSIZE_F"\n", ++ pcb->cwnd, pcb->ssthresh)); ++ pcb->bytes_acked = 0; ++ ++ /* The following needs to be called AFTER cwnd is set to one ++ mss - STJ */ ++ tcp_rexmit_rto_commit(pcb); ++ } ++ } ++ } ++ pcb = pcb->next; ++ } ++} ++#endif ++ + #if LWIP_CALLBACK_API || TCP_LISTEN_BACKLOG + /** Called when a listen pcb is closed. Iterates one pcb list and removes the + * closed listener pcb from pcb->listener if matching. +@@ -1362,7 +1413,9 @@ void + tcp_slowtmr(void) + { + struct tcp_pcb *pcb, *prev; ++#if !GAZELLE_ENABLE + tcpwnd_size_t eff_wnd; ++#endif + u8_t pcb_remove; /* flag if a PCB should be removed */ + u8_t pcb_reset; /* flag if a RST should be sent when removing */ + err_t err; +@@ -1438,6 +1491,7 @@ tcp_slowtmr_start: + } + } + } else { ++#if !GAZELLE_ENABLE + /* Increase the retransmission timer if it is running */ + if ((pcb->rtime >= 0) && (pcb->rtime < 0x7FFF)) { + ++pcb->rtime; +@@ -1480,6 +1534,7 @@ tcp_slowtmr_start: + tcp_rexmit_rto_commit(pcb); + } + } ++#endif + } + } + /* Check if this PCB has stayed too long in FIN-WAIT-2 */ +@@ -1522,8 +1577,13 @@ tcp_slowtmr_start: + inactive for too long, will drop the data (it will eventually + be retransmitted). 
*/ + #if TCP_QUEUE_OOSEQ ++#if GAZELLE_ENABLE ++ if (pcb->ooseq != NULL && ++ (tcp_ticks - pcb->tmr >= (u32_t)(pcb->rto / 1000 + TCP_SLOW_INTERVAL) / TCP_SLOW_INTERVAL * TCP_OOSEQ_TIMEOUT)) { ++#else + if (pcb->ooseq != NULL && + (tcp_ticks - pcb->tmr >= (u32_t)pcb->rto * TCP_OOSEQ_TIMEOUT)) { ++#endif + LWIP_DEBUGF(TCP_DEBUG | GAZELLE_DEBUG_SERIOUS, ("tcp_slowtmr: dropping OOSEQ queued data local_port=%u, remote_port=%u\n", pcb->local_port, pcb->remote_port)); + tcp_free_ooseq(pcb); + } +@@ -2115,9 +2175,15 @@ tcp_alloc(u8_t prio) + pcb->mss = INITIAL_MSS; + /* Set initial TCP's retransmission timeout to 3000 ms by default. + This value could be configured in lwipopts */ ++#if GAZELLE_ENABLE ++ pcb->rto = GAZELLE_MIN_RTO_US; ++ pcb->sv = LWIP_TCP_RTO_TIME / TCP_SLOW_INTERVAL; ++ pcb->rtime = 0; ++#else + pcb->rto = LWIP_TCP_RTO_TIME / TCP_SLOW_INTERVAL; + pcb->sv = LWIP_TCP_RTO_TIME / TCP_SLOW_INTERVAL; + pcb->rtime = -1; ++#endif + pcb->cwnd = 1; + pcb->tmr = tcp_ticks; + pcb->last_timer = tcp_timer_ctr; +@@ -2383,7 +2449,11 @@ tcp_pcb_purge(struct tcp_pcb *pcb) + + /* Stop the retransmission timer as it will expect data on unacked + queue if it fires */ ++#if GAZELLE_ENABLE ++ pcb->rtime = 0; ++#else + pcb->rtime = -1; ++#endif + + tcp_segs_free(pcb->unsent); + tcp_segs_free(pcb->unacked); +diff --git a/src/core/tcp_in.c b/src/core/tcp_in.c +index 2922721..c6011f0 100644 +--- a/src/core/tcp_in.c ++++ b/src/core/tcp_in.c +@@ -61,6 +61,7 @@ + #endif /* LWIP_ND6_TCP_REACHABILITY_HINTS */ + #if GAZELLE_ENABLE + #include "lwip/api.h" ++#include "lwip/sys.h" + #endif + + #include +@@ -1059,9 +1060,15 @@ tcp_process(struct tcp_pcb *pcb) + /* If there's nothing left to acknowledge, stop the retransmit + timer, otherwise reset it to start again */ + if (pcb->unacked == NULL) { ++#if GAZELLE_ENABLE ++ pcb->rtime = 0; ++ } else { ++ pcb->rtime = sys_now_us(); ++#else + pcb->rtime = -1; + } else { + pcb->rtime = 0; ++#endif + pcb->nrtx = 0; + } + +@@ -1084,7 +1091,11 @@ 
tcp_process(struct tcp_pcb *pcb) + connection faster, but do not send more SYNs than we otherwise would + have, or we might get caught in a loop on loopback interfaces. */ + if (pcb->nrtx < TCP_SYNMAXRTX) { ++#if GAZELLE_ENABLE ++ pcb->rtime = sys_now_us(); ++#else + pcb->rtime = 0; ++#endif + tcp_rexmit_rto(pcb); + } + } +@@ -1326,7 +1337,11 @@ tcp_free_acked_segments(struct tcp_pcb *pcb, struct tcp_seg *seg_list, const cha + static void + tcp_receive(struct tcp_pcb *pcb) + { ++#if GAZELLE_ENABLE ++ long m; ++#else + s16_t m; ++#endif + u32_t right_wnd_edge; + + LWIP_ASSERT("tcp_receive: invalid pcb", pcb != NULL); +@@ -1386,7 +1401,11 @@ tcp_receive(struct tcp_pcb *pcb) + /* Clause 3 */ + if (pcb->snd_wl2 + pcb->snd_wnd == right_wnd_edge) { + /* Clause 4 */ ++#if GAZELLE_ENABLE ++ if (pcb->rtime > 0) { ++#else + if (pcb->rtime >= 0) { ++#endif + /* Clause 5 */ + if (pcb->lastack == ackno) { + if ((u8_t)(pcb->dupacks + 1) > pcb->dupacks) { +@@ -1421,7 +1440,11 @@ tcp_receive(struct tcp_pcb *pcb) + pcb->nrtx = 0; + + /* Reset the retransmission time-out. */ ++#if GAZELLE_ENABLE ++ pcb->rto = LWIP_MAX((pcb->sa >> 3) + pcb->sv, GAZELLE_MIN_RTO_US); ++#else + pcb->rto = (s16_t)((pcb->sa >> 3) + pcb->sv); ++#endif + + /* Record how much data this ACK acks */ + acked = (tcpwnd_size_t)(ackno - pcb->lastack); +@@ -1476,9 +1499,15 @@ tcp_receive(struct tcp_pcb *pcb) + /* If there's nothing left to acknowledge, stop the retransmit + timer, otherwise reset it to start again */ + if (pcb->unacked == NULL) { ++#if GAZELLE_ENABLE ++ pcb->rtime = 0; ++ } else { ++ pcb->rtime = sys_now_us(); ++#else + pcb->rtime = -1; + } else { + pcb->rtime = 0; ++#endif + } + + pcb->polltmr = 0; +@@ -1528,6 +1557,19 @@ tcp_receive(struct tcp_pcb *pcb) + if (pcb->rttest && TCP_SEQ_LT(pcb->rtseq, ackno)) { + /* diff between this shouldn't exceed 32K since this are tcp timer ticks + and a round-trip shouldn't be that long... 
*/ ++#if GAZELLE_ENABLE ++ pcb->rtt_us = sys_now_us() - pcb->rttest; /* us */ ++ m = pcb->rtt_us; ++ ++ m = m - (pcb->sa >> 3); ++ pcb->sa = pcb->sa + m; ++ if (m < 0) { ++ m = -m; ++ } ++ m = m - (pcb->sv >> 2); ++ pcb->sv = pcb->sv + m; ++ pcb->rto = LWIP_MAX((pcb->sa >> 3) + pcb->sv, GAZELLE_MIN_RTO_US); ++#else + m = (s16_t)(tcp_ticks - pcb->rttest); + + LWIP_DEBUGF(TCP_RTO_DEBUG, ("tcp_receive: experienced rtt %"U16_F" ticks (%"U16_F" msec).\n", +@@ -1545,7 +1587,7 @@ tcp_receive(struct tcp_pcb *pcb) + + LWIP_DEBUGF(TCP_RTO_DEBUG, ("tcp_receive: RTO %"U16_F" (%"U16_F" milliseconds)\n", + pcb->rto, (u16_t)(pcb->rto * TCP_SLOW_INTERVAL))); +- ++#endif + pcb->rttest = 0; + } + } +diff --git a/src/core/tcp_out.c b/src/core/tcp_out.c +index 1632a66..76245f3 100644 +--- a/src/core/tcp_out.c ++++ b/src/core/tcp_out.c +@@ -1907,12 +1907,21 @@ tcp_output_segment(struct tcp_seg *seg, struct tcp_pcb *pcb, struct netif *netif + + /* Set retransmission timer running if it is not currently enabled + This must be set before checking the route. 
*/ ++#if GAZELLE_ENABLE ++ if (pcb->rtime == 0) { ++ pcb->rtime = sys_now_us(); ++#else + if (pcb->rtime < 0) { + pcb->rtime = 0; ++#endif + } + + if (pcb->rttest == 0) { ++#if GAZELLE_ENABLE ++ pcb->rttest = sys_now_us(); ++#else + pcb->rttest = tcp_ticks; ++#endif + pcb->rtseq = lwip_ntohl(seg->tcphdr->seqno); + + LWIP_DEBUGF(TCP_RTO_DEBUG, ("tcp_output_segment: rtseq %"U32_F"\n", pcb->rtseq)); +@@ -2257,7 +2266,11 @@ tcp_rexmit_fast(struct tcp_pcb *pcb) + tcp_set_flags(pcb, TF_INFR); + + /* Reset the retransmission timer to prevent immediate rto retransmissions */ ++#if GAZELLE_ENABLE ++ pcb->rtime = sys_now_us(); ++#else + pcb->rtime = 0; ++#endif + } + } + } +diff --git a/src/core/timeouts.c b/src/core/timeouts.c +index 610a2d7..72bfb22 100644 +--- a/src/core/timeouts.c ++++ b/src/core/timeouts.c +@@ -79,6 +79,9 @@ const struct lwip_cyclic_timer lwip_cyclic_timers[] = { + /* The TCP timer is a special case: it does not have to run always and + is triggered to start from TCP using tcp_timer_needed() */ + {TCP_TMR_INTERVAL, HANDLER(tcp_tmr)}, ++#if GAZELLE_ENABLE ++ {GAZELLE_RTO_TMR_INTERVAL, HANDLER(tcp_rto_tmr)}, ++#endif + #endif /* LWIP_TCP */ + #if LWIP_IPV4 + #if IP_REASSEMBLY +@@ -157,6 +160,20 @@ tcpip_tcp_timer(void *arg) + } + } + ++#if GAZELLE_ENABLE ++void ++tcpip_rto_timer(void *arg) ++{ ++ LWIP_UNUSED_ARG(arg); ++ ++ tcp_rto_tmr(); ++ if (tcp_active_pcbs || tcp_tw_pcbs) { ++ /* restart timer */ ++ sys_timeout(GAZELLE_RTO_TMR_INTERVAL, tcpip_rto_timer, NULL); ++ } ++} ++#endif ++ + /** + * Called from TCP_REG when registering a new PCB: + * the reason is to have the TCP timer only running when +@@ -172,6 +189,9 @@ tcp_timer_needed(void) + /* enable and start timer */ + tcpip_tcp_timer_active = 1; + sys_timeout(TCP_TMR_INTERVAL, tcpip_tcp_timer, NULL); ++#if GAZELLE_ENABLE ++ sys_timeout(GAZELLE_RTO_TMR_INTERVAL, tcpip_rto_timer, NULL); ++#endif + } + } + #endif /* LWIP_TCP */ +@@ -265,7 +285,11 @@ void sys_timeouts_init(void) + { + size_t i; + /* 
tcp_tmr() at index 0 is started on demand */ ++#if GAZELLE_ENABLE ++ for (i = (LWIP_TCP ? 2 : 0); i < LWIP_ARRAYSIZE(lwip_cyclic_timers); i++) { ++#else + for (i = (LWIP_TCP ? 1 : 0); i < LWIP_ARRAYSIZE(lwip_cyclic_timers); i++) { ++#endif + /* we have to cast via size_t to get rid of const warning + (this is OK as cyclic_timer() casts back to const* */ + sys_timeout(lwip_cyclic_timers[i].interval_ms, lwip_cyclic_timer, LWIP_CONST_CAST(void *, &lwip_cyclic_timers[i])); +diff --git a/src/include/arch/sys_arch.h b/src/include/arch/sys_arch.h +index bf7e437..3607569 100644 +--- a/src/include/arch/sys_arch.h ++++ b/src/include/arch/sys_arch.h +@@ -131,6 +131,9 @@ struct rte_ring *gazelle_ring_create_fast(const char *name, uint32_t size, uint3 + + void sys_calibrate_tsc(void); + uint32_t sys_now(void); ++#if GAZELLE_ENABLE ++uint32_t sys_now_us(void); ++#endif + __attribute__((always_inline)) inline int update_timeout(int timeout, uint32_t poll_ts) + { + uint32_t used_ms = sys_now() - poll_ts; +diff --git a/src/include/lwip/opt.h b/src/include/lwip/opt.h +index 53b1946..7bf316d 100644 +--- a/src/include/lwip/opt.h ++++ b/src/include/lwip/opt.h +@@ -499,7 +499,11 @@ + * The number of sys timeouts used by the core stack (not apps) + * The default number of timeouts is calculated here for all enabled modules. + */ ++#if defined GAZELLE_ENABLE ++#define LWIP_NUM_SYS_TIMEOUT_INTERNAL ((2*LWIP_TCP) + IP_REASSEMBLY + LWIP_ARP + (2*LWIP_DHCP) + LWIP_ACD + LWIP_IGMP + LWIP_DNS + PPP_NUM_TIMEOUTS + (LWIP_IPV6 * (1 + LWIP_IPV6_REASS + LWIP_IPV6_MLD + LWIP_IPV6_DHCP6))) ++#else + #define LWIP_NUM_SYS_TIMEOUT_INTERNAL (LWIP_TCP + IP_REASSEMBLY + LWIP_ARP + (2*LWIP_DHCP) + LWIP_ACD + LWIP_IGMP + LWIP_DNS + PPP_NUM_TIMEOUTS + (LWIP_IPV6 * (1 + LWIP_IPV6_REASS + LWIP_IPV6_MLD + LWIP_IPV6_DHCP6))) ++#endif + + /** + * MEMP_NUM_SYS_TIMEOUT: the number of simultaneously active timeouts. 
+diff --git a/src/include/lwip/priv/tcp_priv.h b/src/include/lwip/priv/tcp_priv.h +index 8d7b9df..8c29575 100644 +--- a/src/include/lwip/priv/tcp_priv.h ++++ b/src/include/lwip/priv/tcp_priv.h +@@ -66,6 +66,9 @@ void tcp_tmr (void); /* Must be called every + intervals (instead of calling tcp_tmr()). */ + void tcp_slowtmr (void); + void tcp_fasttmr (void); ++#if GAZELLE_ENABLE ++void tcp_rto_tmr (void); ++#endif + + /* Call this from a netif driver (watch out for threading issues!) that has + returned a memory error on transmit and now has free buffers to send more. +diff --git a/src/include/lwip/tcp.h b/src/include/lwip/tcp.h +index 5097179..c880757 100644 +--- a/src/include/lwip/tcp.h ++++ b/src/include/lwip/tcp.h +@@ -323,16 +323,26 @@ struct tcp_pcb { + #endif /* LWIP_TCP_SACK_OUT */ + + /* Retransmission timer. */ ++#if GAZELLE_ENABLE ++ u32_t rtime; /* time (us) */ ++#else + s16_t rtime; ++#endif + + u16_t mss; /* maximum segment size */ + + /* RTT (round trip time) estimation variables */ + u32_t rttest; /* RTT estimate in 500ms ticks */ + u32_t rtseq; /* sequence number being timed */ ++#if GAZELLE_ENABLE ++ u32_t rtt_us; /* RTT (round trip time) (us) */ ++ u32_t sa, sv; /* @see "Congestion Avoidance and Control" by Van Jacobson and Karels */ ++ u32_t rto; /* retransmission time-out (us) */ ++#else + s16_t sa, sv; /* @see "Congestion Avoidance and Control" by Van Jacobson and Karels */ + + s16_t rto; /* retransmission time-out (in ticks of TCP_SLOW_INTERVAL) */ ++#endif + u8_t nrtx; /* number of retransmissions */ + + /* fast retransmit/recovery */ +diff --git a/src/include/lwipopts.h b/src/include/lwipopts.h +index 28b8aca..e1a2954 100644 +--- a/src/include/lwipopts.h ++++ b/src/include/lwipopts.h +@@ -229,6 +229,9 @@ + #define GAZELLE_TCP_MAX_CONN_PER_THREAD 65535 + #define GAZELLE_TCP_REUSE_IPPORT 1 + ++#define GAZELLE_RTO_TMR_INTERVAL 100 ++#define GAZELLE_MIN_RTO_US (GAZELLE_RTO_TMR_INTERVAL * 1000) ++ + /* + ------------------------------------ + 
---------- Socket options ---------- diff --git a/lwip.spec b/lwip.spec index f4d07f5..c353c3d 100644 --- a/lwip.spec +++ b/lwip.spec @@ -4,7 +4,7 @@ Summary: lwip is a small independent implementation of the TCP/IP protocol suite Name: lwip Version: 2.2.0 -Release: 2 +Release: 3 License: BSD URL: http://savannah.nongnu.org/projects/lwip/ Source0: http://download.savannah.nongnu.org/releases/lwip/%{name}-%{version}.zip @@ -122,6 +122,7 @@ Patch9106: 0107-fix-move-lpcb-to-the-front-of-list-error.patch Patch9107: 0108-fix-receive-fin-packet-process-error.patch Patch9108: 0109-support-udp-recv-zero-packets.patch Patch9109: 0110-adapt-lwip-2.2.0.patch +Patch9110: 0111-optimize-rto.patch BuildRequires: gcc-c++ dos2unix dpdk-devel @@ -151,6 +152,10 @@ cd %{_builddir}/%{name}-%{version}/src %{_libdir}/liblwip.a %changelog +* Tue Mar 05 2024 zhengjiebing - 2.2.0-3 +- improve RTT/RTO measurement resolution from 500 ms ticks to microseconds +- add a dedicated RTO timer (tcp_rto_tmr) + * Sun Feb 18 2024 jiangheng - 2.2.0-2 - remove backport patches - sys_mbox_new return error when rte ring create failed -- Gitee