diff --git a/0111-optimize-rto.patch b/0111-optimize-rto.patch new file mode 100644 index 0000000000000000000000000000000000000000..0465a2e57bd663e10b8cac4ede2b303c52afee61 --- /dev/null +++ b/0111-optimize-rto.patch @@ -0,0 +1,493 @@ + src/api/sys_arch.c | 21 ++++++++++++ + src/core/tcp.c | 70 ++++++++++++++++++++++++++++++++++++++++ + src/core/tcp_in.c | 44 ++++++++++++++++++++++++- + src/core/tcp_out.c | 13 ++++++++ + src/core/timeouts.c | 24 ++++++++++++++ + src/include/arch/sys_arch.h | 3 ++ + src/include/lwip/opt.h | 4 +++ + src/include/lwip/priv/tcp_priv.h | 3 ++ + src/include/lwip/tcp.h | 10 ++++++ + src/include/lwipopts.h | 3 ++ + 10 files changed, 194 insertions(+), 1 deletion(-) + +diff --git a/src/api/sys_arch.c b/src/api/sys_arch.c +index e27b215..7770753 100644 +--- a/src/api/sys_arch.c ++++ b/src/api/sys_arch.c +@@ -78,6 +78,10 @@ static PER_THREAD struct sys_mem_stats hugepage_stats; + + static uint64_t cycles_per_ms __attribute__((aligned(64))); + static uint64_t sys_start_ms __attribute__((aligned(64))); ++#if GAZELLE_ENABLE ++static uint64_t cycles_per_us __attribute__((aligned(64))); ++static uint64_t sys_start_us __attribute__((aligned(64))); ++#endif + + /* + * Mailbox +@@ -381,6 +385,15 @@ void sys_calibrate_tsc(void) + if (sys_start_ms == 0) { + sys_start_ms = rte_rdtsc() / cycles_per_ms; + } ++#if GAZELLE_ENABLE ++#define US_PER_SEC 1E6 ++ if (cycles_per_us == 0) { ++ cycles_per_us = (freq + US_PER_SEC - 1) / US_PER_SEC; ++ } ++ if (sys_start_us == 0) { ++ sys_start_us = rte_rdtsc() / cycles_per_us; ++ } ++#endif + } + + uint32_t sys_now(void) +@@ -389,6 +402,14 @@ uint32_t sys_now(void) + return (uint32_t)(cur_ms - sys_start_ms); + } + ++#if GAZELLE_ENABLE ++uint32_t sys_now_us(void) ++{ ++ uint64_t cur_us = rte_rdtsc() / cycles_per_us; ++ return (uint32_t)(cur_us - sys_start_us); ++} ++#endif ++ + /* + * Critical section + * */ +diff --git a/src/core/tcp.c b/src/core/tcp.c +index 69f6953..a22d5af 100644 +--- a/src/core/tcp.c ++++ 
b/src/core/tcp.c +@@ -301,6 +301,57 @@ tcp_tmr(void) + } + } + ++#if GAZELLE_ENABLE ++void ++tcp_rto_tmr(void) ++{ ++ struct tcp_pcb *pcb = tcp_active_pcbs; ++ u32_t now = sys_now_us(); ++ ++ while (pcb != NULL) { ++ if ((pcb->rtime > 0)) { ++ if (now - pcb->rtime >= pcb->rto) { ++ /* Time for a retransmission. */ ++ LWIP_DEBUGF(TCP_RTO_DEBUG, ("tcp_slowtmr: rtime %"U32_F ++ " pcb->rto %"U32_F"\n", ++ pcb->rtime, pcb->rto)); ++ /* If prepare phase fails but we have unsent data but no unacked data, ++ still execute the backoff calculations below, as this means we somehow ++ failed to send segment. */ ++ if ((tcp_rexmit_rto_prepare(pcb) == ERR_OK) || ((pcb->unacked == NULL) && (pcb->unsent != NULL))) { ++ /* Double retransmission time-out unless we are trying to ++ * connect to somebody (i.e., we are in SYN_SENT). */ ++ if (pcb->state != SYN_SENT) { ++ u8_t backoff_idx = LWIP_MIN(pcb->nrtx, sizeof(tcp_backoff) - 1); ++ u32_t orig_rto = LWIP_MAX((pcb->sa >> 3) + pcb->sv, GAZELLE_MIN_RTO_US); ++ pcb->rto = orig_rto << tcp_backoff[backoff_idx]; ++ } ++ ++ /* Reset the retransmission timer. */ ++ pcb->rtime = now; ++ ++ /* Reduce congestion window and ssthresh. */ ++ pcb->ssthresh = LWIP_MIN(pcb->cwnd, pcb->snd_wnd) >> 1; ++ if (pcb->ssthresh < (tcpwnd_size_t)(pcb->mss << 1)) { ++ pcb->ssthresh = (tcpwnd_size_t)(pcb->mss << 1); ++ } ++ pcb->cwnd = pcb->mss; ++ LWIP_DEBUGF(TCP_CWND_DEBUG, ("tcp_slowtmr: cwnd %"TCPWNDSIZE_F ++ " ssthresh %"TCPWNDSIZE_F"\n", ++ pcb->cwnd, pcb->ssthresh)); ++ pcb->bytes_acked = 0; ++ ++ /* The following needs to be called AFTER cwnd is set to one ++ mss - STJ */ ++ tcp_rexmit_rto_commit(pcb); ++ } ++ } ++ } ++ pcb = pcb->next; ++ } ++} ++#endif ++ + #if LWIP_CALLBACK_API || TCP_LISTEN_BACKLOG + /** Called when a listen pcb is closed. Iterates one pcb list and removes the + * closed listener pcb from pcb->listener if matching. 
+@@ -1362,7 +1413,9 @@ void + tcp_slowtmr(void) + { + struct tcp_pcb *pcb, *prev; ++#if !GAZELLE_ENABLE + tcpwnd_size_t eff_wnd; ++#endif + u8_t pcb_remove; /* flag if a PCB should be removed */ + u8_t pcb_reset; /* flag if a RST should be sent when removing */ + err_t err; +@@ -1438,6 +1491,7 @@ tcp_slowtmr_start: + } + } + } else { ++#if !GAZELLE_ENABLE + /* Increase the retransmission timer if it is running */ + if ((pcb->rtime >= 0) && (pcb->rtime < 0x7FFF)) { + ++pcb->rtime; +@@ -1480,6 +1534,7 @@ tcp_slowtmr_start: + tcp_rexmit_rto_commit(pcb); + } + } ++#endif + } + } + /* Check if this PCB has stayed too long in FIN-WAIT-2 */ +@@ -1522,8 +1577,13 @@ tcp_slowtmr_start: + inactive for too long, will drop the data (it will eventually + be retransmitted). */ + #if TCP_QUEUE_OOSEQ ++#if GAZELLE_ENABLE ++ if (pcb->ooseq != NULL && ++ (tcp_ticks - pcb->tmr >= (u32_t)(pcb->rto / 1000 + TCP_SLOW_INTERVAL) / TCP_SLOW_INTERVAL * TCP_OOSEQ_TIMEOUT)) { ++#else + if (pcb->ooseq != NULL && + (tcp_ticks - pcb->tmr >= (u32_t)pcb->rto * TCP_OOSEQ_TIMEOUT)) { ++#endif + LWIP_DEBUGF(TCP_DEBUG | GAZELLE_DEBUG_SERIOUS, ("tcp_slowtmr: dropping OOSEQ queued data local_port=%u, remote_port=%u\n", pcb->local_port, pcb->remote_port)); + tcp_free_ooseq(pcb); + } +@@ -2115,9 +2175,15 @@ tcp_alloc(u8_t prio) + pcb->mss = INITIAL_MSS; + /* Set initial TCP's retransmission timeout to 3000 ms by default. 
+ This value could be configured in lwipopts */ ++#if GAZELLE_ENABLE ++ pcb->rto = GAZELLE_MIN_RTO_US; ++ pcb->sv = LWIP_TCP_RTO_TIME / TCP_SLOW_INTERVAL; ++ pcb->rtime = 0; ++#else + pcb->rto = LWIP_TCP_RTO_TIME / TCP_SLOW_INTERVAL; + pcb->sv = LWIP_TCP_RTO_TIME / TCP_SLOW_INTERVAL; + pcb->rtime = -1; ++#endif + pcb->cwnd = 1; + pcb->tmr = tcp_ticks; + pcb->last_timer = tcp_timer_ctr; +@@ -2383,7 +2449,11 @@ tcp_pcb_purge(struct tcp_pcb *pcb) + + /* Stop the retransmission timer as it will expect data on unacked + queue if it fires */ ++#if GAZELLE_ENABLE ++ pcb->rtime = 0; ++#else + pcb->rtime = -1; ++#endif + + tcp_segs_free(pcb->unsent); + tcp_segs_free(pcb->unacked); +diff --git a/src/core/tcp_in.c b/src/core/tcp_in.c +index 2922721..c6011f0 100644 +--- a/src/core/tcp_in.c ++++ b/src/core/tcp_in.c +@@ -61,6 +61,7 @@ + #endif /* LWIP_ND6_TCP_REACHABILITY_HINTS */ + #if GAZELLE_ENABLE + #include "lwip/api.h" ++#include "lwip/sys.h" + #endif + + #include +@@ -1059,9 +1060,15 @@ tcp_process(struct tcp_pcb *pcb) + /* If there's nothing left to acknowledge, stop the retransmit + timer, otherwise reset it to start again */ + if (pcb->unacked == NULL) { ++#if GAZELLE_ENABLE ++ pcb->rtime = 0; ++ } else { ++ pcb->rtime = sys_now_us(); ++#else + pcb->rtime = -1; + } else { + pcb->rtime = 0; ++#endif + pcb->nrtx = 0; + } + +@@ -1084,7 +1091,11 @@ tcp_process(struct tcp_pcb *pcb) + connection faster, but do not send more SYNs than we otherwise would + have, or we might get caught in a loop on loopback interfaces. 
*/ + if (pcb->nrtx < TCP_SYNMAXRTX) { ++#if GAZELLE_ENABLE ++ pcb->rtime = sys_now_us(); ++#else + pcb->rtime = 0; ++#endif + tcp_rexmit_rto(pcb); + } + } +@@ -1326,7 +1337,11 @@ tcp_free_acked_segments(struct tcp_pcb *pcb, struct tcp_seg *seg_list, const cha + static void + tcp_receive(struct tcp_pcb *pcb) + { ++#if GAZELLE_ENABLE ++ long m; ++#else + s16_t m; ++#endif + u32_t right_wnd_edge; + + LWIP_ASSERT("tcp_receive: invalid pcb", pcb != NULL); +@@ -1386,7 +1401,11 @@ tcp_receive(struct tcp_pcb *pcb) + /* Clause 3 */ + if (pcb->snd_wl2 + pcb->snd_wnd == right_wnd_edge) { + /* Clause 4 */ ++#if GAZELLE_ENABLE ++ if (pcb->rtime > 0) { ++#else + if (pcb->rtime >= 0) { ++#endif + /* Clause 5 */ + if (pcb->lastack == ackno) { + if ((u8_t)(pcb->dupacks + 1) > pcb->dupacks) { +@@ -1421,7 +1440,11 @@ tcp_receive(struct tcp_pcb *pcb) + pcb->nrtx = 0; + + /* Reset the retransmission time-out. */ ++#if GAZELLE_ENABLE ++ pcb->rto = LWIP_MAX((pcb->sa >> 3) + pcb->sv, GAZELLE_MIN_RTO_US); ++#else + pcb->rto = (s16_t)((pcb->sa >> 3) + pcb->sv); ++#endif + + /* Record how much data this ACK acks */ + acked = (tcpwnd_size_t)(ackno - pcb->lastack); +@@ -1476,9 +1499,15 @@ tcp_receive(struct tcp_pcb *pcb) + /* If there's nothing left to acknowledge, stop the retransmit + timer, otherwise reset it to start again */ + if (pcb->unacked == NULL) { ++#if GAZELLE_ENABLE ++ pcb->rtime = 0; ++ } else { ++ pcb->rtime = sys_now_us(); ++#else + pcb->rtime = -1; + } else { + pcb->rtime = 0; ++#endif + } + + pcb->polltmr = 0; +@@ -1528,6 +1557,19 @@ tcp_receive(struct tcp_pcb *pcb) + if (pcb->rttest && TCP_SEQ_LT(pcb->rtseq, ackno)) { + /* diff between this shouldn't exceed 32K since this are tcp timer ticks + and a round-trip shouldn't be that long... 
*/ ++#if GAZELLE_ENABLE ++ pcb->rtt_us = sys_now_us() - pcb->rttest; /* us */ ++ m = pcb->rtt_us; ++ ++ m = m - (pcb->sa >> 3); ++ pcb->sa = pcb->sa + m; ++ if (m < 0) { ++ m = -m; ++ } ++ m = m - (pcb->sv >> 2); ++ pcb->sv = pcb->sv + m; ++ pcb->rto = LWIP_MAX((pcb->sa >> 3) + pcb->sv, GAZELLE_MIN_RTO_US); ++#else + m = (s16_t)(tcp_ticks - pcb->rttest); + + LWIP_DEBUGF(TCP_RTO_DEBUG, ("tcp_receive: experienced rtt %"U16_F" ticks (%"U16_F" msec).\n", +@@ -1545,7 +1587,7 @@ tcp_receive(struct tcp_pcb *pcb) + + LWIP_DEBUGF(TCP_RTO_DEBUG, ("tcp_receive: RTO %"U16_F" (%"U16_F" milliseconds)\n", + pcb->rto, (u16_t)(pcb->rto * TCP_SLOW_INTERVAL))); +- ++#endif + pcb->rttest = 0; + } + } +diff --git a/src/core/tcp_out.c b/src/core/tcp_out.c +index 1632a66..76245f3 100644 +--- a/src/core/tcp_out.c ++++ b/src/core/tcp_out.c +@@ -1907,12 +1907,21 @@ tcp_output_segment(struct tcp_seg *seg, struct tcp_pcb *pcb, struct netif *netif + + /* Set retransmission timer running if it is not currently enabled + This must be set before checking the route. 
*/ ++#if GAZELLE_ENABLE ++ if (pcb->rtime == 0) { ++ pcb->rtime = sys_now_us(); ++#else + if (pcb->rtime < 0) { + pcb->rtime = 0; ++#endif + } + + if (pcb->rttest == 0) { ++#if GAZELLE_ENABLE ++ pcb->rttest = sys_now_us(); ++#else + pcb->rttest = tcp_ticks; ++#endif + pcb->rtseq = lwip_ntohl(seg->tcphdr->seqno); + + LWIP_DEBUGF(TCP_RTO_DEBUG, ("tcp_output_segment: rtseq %"U32_F"\n", pcb->rtseq)); +@@ -2257,7 +2266,11 @@ tcp_rexmit_fast(struct tcp_pcb *pcb) + tcp_set_flags(pcb, TF_INFR); + + /* Reset the retransmission timer to prevent immediate rto retransmissions */ ++#if GAZELLE_ENABLE ++ pcb->rtime = sys_now_us(); ++#else + pcb->rtime = 0; ++#endif + } + } + } +diff --git a/src/core/timeouts.c b/src/core/timeouts.c +index 610a2d7..72bfb22 100644 +--- a/src/core/timeouts.c ++++ b/src/core/timeouts.c +@@ -79,6 +79,9 @@ const struct lwip_cyclic_timer lwip_cyclic_timers[] = { + /* The TCP timer is a special case: it does not have to run always and + is triggered to start from TCP using tcp_timer_needed() */ + {TCP_TMR_INTERVAL, HANDLER(tcp_tmr)}, ++#if GAZELLE_ENABLE ++ {GAZELLE_RTO_TMR_INTERVAL, HANDLER(tcp_rto_tmr)}, ++#endif + #endif /* LWIP_TCP */ + #if LWIP_IPV4 + #if IP_REASSEMBLY +@@ -157,6 +160,20 @@ tcpip_tcp_timer(void *arg) + } + } + ++#if GAZELLE_ENABLE ++void ++tcpip_rto_timer(void *arg) ++{ ++ LWIP_UNUSED_ARG(arg); ++ ++ tcp_rto_tmr(); ++ if (tcp_active_pcbs || tcp_tw_pcbs) { ++ /* restart timer */ ++ sys_timeout(GAZELLE_RTO_TMR_INTERVAL, tcpip_rto_timer, NULL); ++ } ++} ++#endif ++ + /** + * Called from TCP_REG when registering a new PCB: + * the reason is to have the TCP timer only running when +@@ -172,6 +189,9 @@ tcp_timer_needed(void) + /* enable and start timer */ + tcpip_tcp_timer_active = 1; + sys_timeout(TCP_TMR_INTERVAL, tcpip_tcp_timer, NULL); ++#if GAZELLE_ENABLE ++ sys_timeout(GAZELLE_RTO_TMR_INTERVAL, tcpip_rto_timer, NULL); ++#endif + } + } + #endif /* LWIP_TCP */ +@@ -265,7 +285,11 @@ void sys_timeouts_init(void) + { + size_t i; + /* 
tcp_tmr() at index 0 is started on demand */ ++#if GAZELLE_ENABLE ++ for (i = (LWIP_TCP ? 2 : 0); i < LWIP_ARRAYSIZE(lwip_cyclic_timers); i++) { ++#else + for (i = (LWIP_TCP ? 1 : 0); i < LWIP_ARRAYSIZE(lwip_cyclic_timers); i++) { ++#endif + /* we have to cast via size_t to get rid of const warning + (this is OK as cyclic_timer() casts back to const* */ + sys_timeout(lwip_cyclic_timers[i].interval_ms, lwip_cyclic_timer, LWIP_CONST_CAST(void *, &lwip_cyclic_timers[i])); +diff --git a/src/include/arch/sys_arch.h b/src/include/arch/sys_arch.h +index bf7e437..3607569 100644 +--- a/src/include/arch/sys_arch.h ++++ b/src/include/arch/sys_arch.h +@@ -131,6 +131,9 @@ struct rte_ring *gazelle_ring_create_fast(const char *name, uint32_t size, uint3 + + void sys_calibrate_tsc(void); + uint32_t sys_now(void); ++#if GAZELLE_ENABLE ++uint32_t sys_now_us(void); ++#endif + __attribute__((always_inline)) inline int update_timeout(int timeout, uint32_t poll_ts) + { + uint32_t used_ms = sys_now() - poll_ts; +diff --git a/src/include/lwip/opt.h b/src/include/lwip/opt.h +index 53b1946..7bf316d 100644 +--- a/src/include/lwip/opt.h ++++ b/src/include/lwip/opt.h +@@ -499,7 +499,11 @@ + * The number of sys timeouts used by the core stack (not apps) + * The default number of timeouts is calculated here for all enabled modules. + */ ++#if defined GAZELLE_ENABLE ++#define LWIP_NUM_SYS_TIMEOUT_INTERNAL ((2*LWIP_TCP) + IP_REASSEMBLY + LWIP_ARP + (2*LWIP_DHCP) + LWIP_ACD + LWIP_IGMP + LWIP_DNS + PPP_NUM_TIMEOUTS + (LWIP_IPV6 * (1 + LWIP_IPV6_REASS + LWIP_IPV6_MLD + LWIP_IPV6_DHCP6))) ++#else + #define LWIP_NUM_SYS_TIMEOUT_INTERNAL (LWIP_TCP + IP_REASSEMBLY + LWIP_ARP + (2*LWIP_DHCP) + LWIP_ACD + LWIP_IGMP + LWIP_DNS + PPP_NUM_TIMEOUTS + (LWIP_IPV6 * (1 + LWIP_IPV6_REASS + LWIP_IPV6_MLD + LWIP_IPV6_DHCP6))) ++#endif + + /** + * MEMP_NUM_SYS_TIMEOUT: the number of simultaneously active timeouts. 
+diff --git a/src/include/lwip/priv/tcp_priv.h b/src/include/lwip/priv/tcp_priv.h +index 8d7b9df..8c29575 100644 +--- a/src/include/lwip/priv/tcp_priv.h ++++ b/src/include/lwip/priv/tcp_priv.h +@@ -66,6 +66,9 @@ void tcp_tmr (void); /* Must be called every + intervals (instead of calling tcp_tmr()). */ + void tcp_slowtmr (void); + void tcp_fasttmr (void); ++#if GAZELLE_ENABLE ++void tcp_rto_tmr (void); ++#endif + + /* Call this from a netif driver (watch out for threading issues!) that has + returned a memory error on transmit and now has free buffers to send more. +diff --git a/src/include/lwip/tcp.h b/src/include/lwip/tcp.h +index 5097179..c880757 100644 +--- a/src/include/lwip/tcp.h ++++ b/src/include/lwip/tcp.h +@@ -323,16 +323,26 @@ struct tcp_pcb { + #endif /* LWIP_TCP_SACK_OUT */ + + /* Retransmission timer. */ ++#if GAZELLE_ENABLE ++ u32_t rtime; /* time (us) */ ++#else + s16_t rtime; ++#endif + + u16_t mss; /* maximum segment size */ + + /* RTT (round trip time) estimation variables */ + u32_t rttest; /* RTT estimate in 500ms ticks */ + u32_t rtseq; /* sequence number being timed */ ++#if GAZELLE_ENABLE ++ u32_t rtt_us; /* RTT (round trip time) (us) */ ++ u32_t sa, sv; /* @see "Congestion Avoidance and Control" by Van Jacobson and Karels */ ++ u32_t rto; /* retransmission time-out (us) */ ++#else + s16_t sa, sv; /* @see "Congestion Avoidance and Control" by Van Jacobson and Karels */ + + s16_t rto; /* retransmission time-out (in ticks of TCP_SLOW_INTERVAL) */ ++#endif + u8_t nrtx; /* number of retransmissions */ + + /* fast retransmit/recovery */ +diff --git a/src/include/lwipopts.h b/src/include/lwipopts.h +index 28b8aca..e1a2954 100644 +--- a/src/include/lwipopts.h ++++ b/src/include/lwipopts.h +@@ -229,6 +229,9 @@ + #define GAZELLE_TCP_MAX_CONN_PER_THREAD 65535 + #define GAZELLE_TCP_REUSE_IPPORT 1 + ++#define GAZELLE_RTO_TMR_INTERVAL 100 ++#define GAZELLE_MIN_RTO_US (GAZELLE_RTO_TMR_INTERVAL * 1000) ++ + /* + ------------------------------------ + 
---------- Socket options ---------- diff --git a/0112-add-cubic-congestion-control.patch b/0112-add-cubic-congestion-control.patch new file mode 100644 index 0000000000000000000000000000000000000000..d090659fb7ade27c2d71eeb2ecc6069aa079fa35 --- /dev/null +++ b/0112-add-cubic-congestion-control.patch @@ -0,0 +1,468 @@ +diff --git a/src/core/tcp.c b/src/core/tcp.c +index a22d5af..234334a 100644 +--- a/src/core/tcp.c ++++ b/src/core/tcp.c +@@ -185,6 +185,16 @@ PER_THREAD struct tcp_pcb *tcp_tw_pcbs; + /** An array with all (non-temporary) PCB lists, mainly used for smaller code size */ + PER_THREAD struct tcp_pcb ** tcp_pcb_lists[NUM_TCP_PCB_LISTS] = {NULL, NULL, NULL, NULL}; + ++#if GAZELLE_CUBIC ++static uint64_t cycles_per_sec; ++/* Precompute a bunch of the scaling factors that are used per-packet based on SRTT of 100ms */ ++/* W(t) = C * (t - K)^3 + Wmax, where C = 0.4 */ ++static int bic_scale = 41; ++static u32_t cube_rtt_scale; ++static u64_t cube_factor; ++static u32_t beta_scale; ++#endif ++ + #if GAZELLE_TCP_PCB_HASH + #define INIT_TCP_HTABLE(ht_ptr) \ + do { \ +@@ -245,6 +255,14 @@ tcp_init(void) + LWIP_ASSERT("malloc tcp_active_htable mem failed.", tcp_active_htable != NULL); + INIT_TCP_HTABLE(tcp_active_htable); + #endif ++ ++#if GAZELLE_CUBIC ++ /* cubictcp_register */ ++ cycles_per_sec = rte_get_tsc_hz(); ++ cube_rtt_scale = bic_scale * 10; /* = C * 1024 = 410 */ ++ cube_factor = (1ull << (10 + 3*BICTCP_HZ)) / cube_rtt_scale; ++ beta_scale = 8 * (BICTCP_BETA_SCALE + BETA) / 3 / (BICTCP_BETA_SCALE - BETA); ++#endif + } + + /** Free a tcp pcb */ +@@ -331,11 +349,19 @@ tcp_rto_tmr(void) + pcb->rtime = now; + + /* Reduce congestion window and ssthresh. 
*/ ++#if GAZELLE_CUBIC ++ pcb->ssthresh = LWIP_MIN(pcb->cwnd, pcb->snd_wnd) * BETA / BICTCP_BETA_SCALE; ++ pcb->ssthresh = LWIP_MAX(pcb->ssthresh, (tcpwnd_size_t)(pcb->mss << 1)); ++ bictcp_reset(&pcb->ca); ++ /* TODO: kernel: tp->snd_cwnd = tcp_packets_in_flight(tp) + 1; */ ++ pcb->cwnd = pcb->mss; ++#else + pcb->ssthresh = LWIP_MIN(pcb->cwnd, pcb->snd_wnd) >> 1; + if (pcb->ssthresh < (tcpwnd_size_t)(pcb->mss << 1)) { + pcb->ssthresh = (tcpwnd_size_t)(pcb->mss << 1); + } + pcb->cwnd = pcb->mss; ++#endif + LWIP_DEBUGF(TCP_CWND_DEBUG, ("tcp_slowtmr: cwnd %"TCPWNDSIZE_F + " ssthresh %"TCPWNDSIZE_F"\n", + pcb->cwnd, pcb->ssthresh)); +@@ -2996,4 +3022,209 @@ tcp_ext_arg_invoke_callbacks_passive_open(struct tcp_pcb_listen *lpcb, struct tc + } + #endif /* LWIP_TCP_PCB_NUM_EXT_ARGS */ + ++#if GAZELLE_CUBIC ++void ++bictcp_reset(struct bictcp *ca) ++{ ++ ca->cnt = 0; ++ ca->last_max_cwnd = 0; ++ ca->last_cwnd = 0; ++ ca->last_time = 0; ++ ca->bic_origin_point = 0; ++ ca->bic_K = 0; ++ ca->delay_min = 0; ++ ca->epoch_start = 0; ++ ca->ack_cnt = 0; ++ ca->tcp_cwnd = 0; ++} ++ ++static u32_t fls64(uint64_t v) ++{ ++ static const u64_t debruijn_multiplicator = 0x6c04f118e9966f6bUL; ++ static const u8_t debruijn_bit_position[128] = { ++ 0, /* change to 1 if you want bitSize(0) = 1 */ ++ 48, -1, -1, 31, -1, 15, 51, -1, 63, 5, -1, -1, -1, 19, -1, ++ 23, 28, -1, -1, -1, 40, 36, 46, -1, 13, -1, -1, -1, 34, -1, 58, ++ -1, 60, 2, 43, 55, -1, -1, -1, 50, 62, 4, -1, 18, 27, -1, 39, ++ 45, -1, -1, 33, 57, -1, 1, 54, -1, 49, -1, 17, -1, -1, 32, -1, ++ 53, -1, 16, -1, -1, 52, -1, -1, -1, 64, 6, 7, 8, -1, 9, -1, ++ -1, -1, 20, 10, -1, -1, 24, -1, 29, -1, -1, 21, -1, 11, -1, -1, ++ 41, -1, 25, 37, -1, 47, -1, 30, 14, -1, -1, -1, -1, 22, -1, -1, ++ 35, 12, -1, -1, -1, 59, 42, -1, -1, 61, 3, 26, 38, 44, -1, 56 ++ }; ++ ++ v |= v >> 1; ++ v |= v >> 2; ++ v |= v >> 4; ++ v |= v >> 8; ++ v |= v >> 16; ++ v |= v >> 32; ++ return debruijn_bit_position[(u64_t)(v * debruijn_multiplicator) >> 57]; 
++} ++ ++/* calculate the cubic root of x using a table lookup followed by one ++ * Newton-Raphson iteration. ++ * Avg err ~= 0.195% ++ */ ++static u32_t cubic_root(u64_t a) ++{ ++ u32_t x, b, shift; ++ /* ++ * cbrt(x) MSB values for x MSB values in [0..63]. ++ * Precomputed then refined by hand - Willy Tarreau ++ * ++ * For x in [0..63], ++ * v = cbrt(x << 18) - 1 ++ * cbrt(x) = (v[x] + 10) >> 6 ++ */ ++ static const u8_t v[] = { ++ /* 0x00 */ 0, 54, 54, 54, 118, 118, 118, 118, ++ /* 0x08 */ 123, 129, 134, 138, 143, 147, 151, 156, ++ /* 0x10 */ 157, 161, 164, 168, 170, 173, 176, 179, ++ /* 0x18 */ 181, 185, 187, 190, 192, 194, 197, 199, ++ /* 0x20 */ 200, 202, 204, 206, 209, 211, 213, 215, ++ /* 0x28 */ 217, 219, 221, 222, 224, 225, 227, 229, ++ /* 0x30 */ 231, 232, 234, 236, 237, 239, 240, 242, ++ /* 0x38 */ 244, 245, 246, 248, 250, 251, 252, 254, ++ }; ++ ++ b = fls64(a); ++ if (b < 7) { ++ /* a in [0..63] */ ++ return ((u32_t)v[(u32_t)a] + 35) >> 6; ++ } ++ ++ b = ((b * 84) >> 8) - 1; ++ shift = (a >> (b * 3)); ++ ++ x = ((u32_t)(((u32_t)v[shift] + 10) << b)) >> 6; ++ ++ /* ++ * Newton-Raphson iteration ++ * 2 ++ * x = ( 2 * x + a / x ) / 3 ++ * k+1 k k ++ */ ++ x = (2 * x + (u32_t)(a / ((u64_t)x * (u64_t)(x - 1)))); ++ x = ((x * 341) >> 10); ++ return x; ++} ++ ++void ++bictcp_update(struct bictcp *ca, u32_t cwnd, u32_t acked) ++{ ++ u32_t delta, bic_target, max_cnt; ++ u64_t offs, t; ++ ++ ca->ack_cnt += acked; /* count the number of ACKed packets */ ++ ++ u32_t now = (u32_t)rte_rdtsc(); ++ ++ // if (ca->last_cwnd == cwnd && (s32_t)(now - ca->last_time) <= LWIP_HZ / 32) ++ // return; ++ if (ca->last_cwnd == cwnd && (s32_t)(now - ca->last_time) <= LWIP_HZ / 200) ++ return; ++ ++ /* The CUBIC function can update ca->cnt at most once per jiffy. ++ * On all cwnd reduction events, ca->epoch_start is set to 0, ++ * which will force a recalculation of ca->cnt. 
++ */ ++ if (ca->epoch_start && now == ca->last_time) ++ goto tcp_friendliness; ++ ++ ca->last_cwnd = cwnd; ++ ca->last_time = now; ++ ++ if (ca->epoch_start == 0) { ++ ca->epoch_start = now; /* record beginning */ ++ ca->ack_cnt = acked; /* start counting */ ++ ca->tcp_cwnd = cwnd; /* sync with cubic */ ++ ++ if (ca->last_max_cwnd <= cwnd) { ++ ca->bic_K = 0; ++ ca->bic_origin_point = cwnd; ++ } else { ++ /* Compute new K based on ++ * (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ) ++ */ ++ ca->bic_K = cubic_root(cube_factor * (ca->last_max_cwnd - cwnd)); ++ ca->bic_origin_point = ca->last_max_cwnd; ++ } ++ } ++ ++ /* cubic function - calc */ ++ /* calculate c * time^3 / rtt, ++ * while considering overflow in calculation of time^3 ++ * (so time^3 is done by using 64 bit) ++ * and without the support of division of 64bit numbers ++ * (so all divisions are done by using 32 bit) ++ * also NOTE the unit of those variables ++ * time = (t - K) / 2^bictcp_HZ ++ * c = bic_scale >> 10 ++ * rtt = (srtt >> 3) / HZ ++ * !!! The following code does not have overflow problems, ++ * if the cwnd < 1 million packets !!!
++ */ ++ ++ t = (s32_t)(now - ca->epoch_start); ++ t = t + ca->delay_min * (LWIP_HZ / 1000000); /* usec to jiffies (HZ > USEC_PER_SEC) */ ++ //t = t + (ca->delay_min + (1000000 / LWIP_HZ) - 1) / (1000000 / LWIP_HZ); /* usec to jiffies (HZ < USEC_PER_SEC) */ ++ /* change the unit from HZ to bictcp_HZ */ ++ t = t * GAZELLE_CUBIC_T_SCALE; ++ t <<= BICTCP_HZ; ++ t = t / LWIP_HZ; ++ ++ if (t < ca->bic_K) /* t - K */ ++ offs = ca->bic_K - t; ++ else ++ offs = t - ca->bic_K; ++ ++ /* c/rtt * (t-K)^3 */ ++ delta = (cube_rtt_scale * offs * offs * offs) >> (10 + 3*BICTCP_HZ); ++ if (t < ca->bic_K) /* below origin*/ ++ bic_target = ca->bic_origin_point - delta; ++ else /* above origin*/ ++ bic_target = ca->bic_origin_point + delta; ++ ++ /* cubic function - calc bictcp_cnt*/ ++ if (bic_target > cwnd) { ++ ca->cnt = cwnd / (bic_target - cwnd); ++ } else { ++ ca->cnt = 100 * cwnd; /* very small increment*/ ++ } ++ ++ /* ++ * The initial growth of cubic function may be too conservative ++ * when the available bandwidth is still unknown. ++ */ ++ if (ca->last_max_cwnd == 0 && ca->cnt > 20) ++ ca->cnt = 20; /* increase cwnd 5% per RTT */ ++ ++tcp_friendliness: ++ /* TCP Friendly */ ++ if (GAZELLE_CUBIC_FRIENDLY) { ++ u32_t scale = BETA_SCALE; ++ ++ delta = (cwnd * scale) >> 3; ++ while (ca->ack_cnt > delta) { /* update tcp cwnd */ ++ ca->ack_cnt -= delta; ++ ca->tcp_cwnd++; ++ } ++ ++ if (ca->tcp_cwnd > cwnd) { /* if bic is slower than tcp */ ++ delta = ca->tcp_cwnd - cwnd; ++ max_cnt = cwnd / delta; ++ if (ca->cnt > max_cnt) ++ ca->cnt = max_cnt; ++ } ++ } ++ ++ /* The maximum rate of cwnd increase CUBIC allows is 1 packet per ++ * 2 packets ACKed, meaning cwnd grows at 1.5x per RTT. 
++ */ ++ ca->cnt = LWIP_MAX(ca->cnt, 2U); ++} ++ ++#endif /* GAZELLE_CUBIC */ ++ + #endif /* LWIP_TCP */ +diff --git a/src/core/tcp_in.c b/src/core/tcp_in.c +index c6011f0..596c141 100644 +--- a/src/core/tcp_in.c ++++ b/src/core/tcp_in.c +@@ -1456,6 +1456,32 @@ tcp_receive(struct tcp_pcb *pcb) + /* Update the congestion control variables (cwnd and + ssthresh). */ + if (pcb->state >= ESTABLISHED) { ++#if GAZELLE_CUBIC ++ if (pcb->cwnd < pcb->ssthresh) { ++ /* Slow Start */ ++ tcpwnd_size_t increase; ++ if (pcb->flags & TF_RTO) { ++ increase = (tcpwnd_size_t)(1 * pcb->mss); ++ } else { ++ increase = acked; ++ } ++ TCP_WND_INC(pcb->cwnd, increase); ++ pcb->cwnd = LWIP_MIN(pcb->cwnd, pcb->ssthresh); ++ } else { ++ /* Congestion Avoidance */ ++ bictcp_update(&pcb->ca, pcb->cwnd / pcb->mss, acked / pcb->mss); ++ if (pcb->bytes_acked >= pcb->ca.cnt * pcb->mss) { ++ pcb->bytes_acked = 0; ++ TCP_WND_INC(pcb->cwnd, pcb->mss); ++ } ++ TCP_WND_INC(pcb->bytes_acked, acked); ++ if (pcb->bytes_acked >= pcb->ca.cnt * pcb->mss) { ++ u32_t increase_pkts = pcb->bytes_acked / (pcb->ca.cnt * pcb->mss); ++ pcb->bytes_acked = (tcpwnd_size_t)(pcb->bytes_acked - increase_pkts * pcb->ca.cnt * pcb->mss); ++ TCP_WND_INC(pcb->cwnd, increase_pkts * pcb->mss); ++ } ++ } ++#else + if (pcb->cwnd < pcb->ssthresh) { + tcpwnd_size_t increase; + /* limit to 1 SMSS segment during period following RTO */ +@@ -1473,6 +1499,7 @@ tcp_receive(struct tcp_pcb *pcb) + } + LWIP_DEBUGF(TCP_CWND_DEBUG, ("tcp_receive: congestion avoidance cwnd %"TCPWNDSIZE_F"\n", pcb->cwnd)); + } ++#endif + } + LWIP_DEBUGF(TCP_INPUT_DEBUG, ("tcp_receive: ACK for %"U32_F", unacked->seqno %"U32_F":%"U32_F"\n", + ackno, +@@ -1569,6 +1596,12 @@ tcp_receive(struct tcp_pcb *pcb) + m = m - (pcb->sv >> 2); + pcb->sv = pcb->sv + m; + pcb->rto = LWIP_MAX((pcb->sa >> 3) + pcb->sv, GAZELLE_MIN_RTO_US); ++#if GAZELLE_CUBIC ++ u32_t delay = pcb->rtt_us; /* delay(us) */ ++ if (pcb->ca.delay_min == 0 || pcb->ca.delay_min > delay) { ++
pcb->ca.delay_min = delay; ++ } ++#endif + #else + m = (s16_t)(tcp_ticks - pcb->rttest); + +diff --git a/src/core/tcp_out.c b/src/core/tcp_out.c +index 67beae2..91ce710 100644 +--- a/src/core/tcp_out.c ++++ b/src/core/tcp_out.c +@@ -1905,6 +1905,37 @@ tcp_output_segment(struct tcp_seg *seg, struct tcp_pcb *pcb, struct netif *netif + rte_prefetch0((uint8_t *)(seg->p) - sizeof(struct rte_mbuf) - sizeof(uint64_t) * 2); + #endif + ++#if GAZELLE_CUBIC ++ len = (u16_t)((u8_t *)seg->tcphdr - (u8_t *)seg->p->payload); ++ if (len == 0) { ++ /** Exclude retransmitted segments from this count. */ ++ MIB2_STATS_INC(mib2.tcpoutsegs); ++ } ++ ++ seg->p->len -= len; ++ seg->p->tot_len -= len; ++ ++ /* Segment with data */ ++ if (seg->p->tot_len != TCPH_HDRLEN_BYTES(seg->tcphdr)) { ++ u32_t now = (u32_t)rte_rdtsc(); ++ ++ /* If we have no in-flight data, trigger CA_EVENT_TX_START. */ ++ if (pcb->rtime == 0) { ++ s32_t delta = now - pcb->lsndtime; ++ /* We were application limited (idle) for a while. ++ * Shift epoch_start to keep cwnd growth to cubic curve. ++ */ ++ if (pcb->ca.epoch_start && delta > 0) { ++ pcb->ca.epoch_start += delta; ++ if (pcb->ca.epoch_start > now) ++ pcb->ca.epoch_start = now; ++ } ++ } ++ ++ pcb->lsndtime = now; ++ } ++#endif ++ + /* Set retransmission timer running if it is not currently enabled + This must be set before checking the route. */ + #if GAZELLE_ENABLE +@@ -1929,7 +1960,7 @@ tcp_output_segment(struct tcp_seg *seg, struct tcp_pcb *pcb, struct netif *netif + LWIP_DEBUGF(TCP_OUTPUT_DEBUG, ("tcp_output_segment: %"U32_F":%"U32_F"\n", + lwip_htonl(seg->tcphdr->seqno), lwip_htonl(seg->tcphdr->seqno) + + seg->len)); +- ++#if !GAZELLE_CUBIC + len = (u16_t)((u8_t *)seg->tcphdr - (u8_t *)seg->p->payload); + if (len == 0) { + /** Exclude retransmitted segments from this count. 
*/ +@@ -1938,6 +1969,7 @@ tcp_output_segment(struct tcp_seg *seg, struct tcp_pcb *pcb, struct netif *netif + + seg->p->len -= len; + seg->p->tot_len -= len; ++#endif + + seg->p->payload = seg->tcphdr; + +@@ -2308,9 +2340,21 @@ tcp_rexmit_fast(struct tcp_pcb *pcb) + (u16_t)pcb->dupacks, pcb->lastack, + lwip_ntohl(pcb->unacked->tcphdr->seqno))); + if (tcp_rexmit(pcb) == ERR_OK) { ++#if GAZELLE_CUBIC ++ pcb->ca.epoch_start = 0; ++ /* Wmax and fast convergence */ ++ if (pcb->cwnd / pcb->mss < pcb->ca.last_max_cwnd && GAZELLE_CUBIC_FAST_CONVERGENCE) { ++ pcb->ca.last_max_cwnd = (pcb->cwnd / pcb->mss * (BICTCP_BETA_SCALE + BETA)) ++ / (2 * BICTCP_BETA_SCALE); ++ } else { ++ pcb->ca.last_max_cwnd = pcb->cwnd / pcb->mss; ++ } ++ pcb->ssthresh = (tcpwnd_size_t)((u64_t)pcb->cwnd * BETA / BICTCP_BETA_SCALE); /* multiply before dividing (in 64 bit): dividing by 1024 first would drop up to 1023 bytes of cwnd */ ++#else + /* Set ssthresh to half of the minimum of the current + * cwnd and the advertised window */ + pcb->ssthresh = LWIP_MIN(pcb->cwnd, pcb->snd_wnd) / 2; ++#endif + + /* The minimum value for ssthresh should be 2 MSS */ + if (pcb->ssthresh < (2U * pcb->mss)) { +diff --git a/src/include/lwip/tcp.h b/src/include/lwip/tcp.h +index c880757..10fb66c 100644 +--- a/src/include/lwip/tcp.h ++++ b/src/include/lwip/tcp.h +@@ -265,6 +265,30 @@ struct tcp_pcb_listen { + #endif + }; + ++#if GAZELLE_CUBIC ++#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation */ ++#define BICTCP_HZ 10 /* BIC HZ 2^10 = 1024 */ ++#define BETA 717 ++#define BETA_SCALE beta_scale ++#define LWIP_HZ cycles_per_sec ++ ++struct bictcp { ++ u32_t cnt; /* increase cwnd by 1 after ACKs */ ++ u32_t last_max_cwnd; /* last maximum snd_cwnd */ ++ u32_t last_cwnd; /* the last snd_cwnd */ ++ u32_t last_time; /* time when updated last_cwnd */ ++ u32_t bic_origin_point; /* origin point of bic function */ ++ u32_t bic_K; /* time to origin point ++ from the beginning of the current epoch */ ++ u32_t delay_min; /* min delay (usec) */ ++ u32_t epoch_start; /* beginning of an epoch */ ++ u32_t ack_cnt; /* number of acks */ ++
u32_t tcp_cwnd; /* estimated tcp cwnd */ ++}; ++ ++void bictcp_reset(struct bictcp *ca); ++void bictcp_update(struct bictcp *ca, u32_t cwnd, u32_t acked); ++#endif + + /** the TCP protocol control block */ + struct tcp_pcb { +@@ -353,6 +377,11 @@ struct tcp_pcb { + tcpwnd_size_t cwnd; + tcpwnd_size_t ssthresh; + ++#if GAZELLE_CUBIC ++ u32_t lsndtime; /* timestamp of last sent data packet (for restart window) */ ++ struct bictcp ca; ++#endif ++ + /* first byte following last rto byte */ + u32_t rto_end; + +diff --git a/src/include/lwipopts.h b/src/include/lwipopts.h +index e1a2954..b73ff67 100644 +--- a/src/include/lwipopts.h ++++ b/src/include/lwipopts.h +@@ -232,6 +232,11 @@ + #define GAZELLE_RTO_TMR_INTERVAL 100 + #define GAZELLE_MIN_RTO_US (GAZELLE_RTO_TMR_INTERVAL * 1000) + ++#define GAZELLE_CUBIC 1 ++#define GAZELLE_CUBIC_FRIENDLY 1 ++#define GAZELLE_CUBIC_FAST_CONVERGENCE 1 ++#define GAZELLE_CUBIC_T_SCALE 500 ++ + /* + ------------------------------------ + ---------- Socket options ---------- diff --git a/lwip.spec b/lwip.spec index f4d07f57250022774b68ae847e3e8e44dd1122c7..033ff66986b8120d233a227dc1ef054ec8c88e67 100644 --- a/lwip.spec +++ b/lwip.spec @@ -4,7 +4,7 @@ Summary: lwip is a small independent implementation of the TCP/IP protocol suite Name: lwip Version: 2.2.0 -Release: 2 +Release: 4 License: BSD URL: http://savannah.nongnu.org/projects/lwip/ Source0: http://download.savannah.nongnu.org/releases/lwip/%{name}-%{version}.zip @@ -122,6 +122,8 @@ Patch9106: 0107-fix-move-lpcb-to-the-front-of-list-error.patch Patch9107: 0108-fix-receive-fin-packet-process-error.patch Patch9108: 0109-support-udp-recv-zero-packets.patch Patch9109: 0110-adapt-lwip-2.2.0.patch +Patch9110: 0111-optimize-rto.patch +Patch9111: 0112-add-cubic-congestion-control.patch BuildRequires: gcc-c++ dos2unix dpdk-devel @@ -151,6 +153,13 @@ cd %{_builddir}/%{name}-%{version}/src %{_libdir}/liblwip.a %changelog +* Thu Mar 21 2024 zhengjiebing - 2.2.0-4 +- add cubic congestion control + 
+* Tue Mar 05 2024 zhengjiebing - 2.2.0-3 +- optimize rtt accuracy from 500ms to us +- add rto-tmr + * Sun Feb 18 2024 jiangheng - 2.2.0-2 - remove backport patches - sys_mbox_new return error when rte ring create failed