From 27f9d58b4dcc54adacb1a039bcef18197ceb69dc Mon Sep 17 00:00:00 2001 From: Lemmy Huang Date: Sat, 5 Jul 2025 13:33:27 +0800 Subject: [PATCH] tcp udp: merge RTW and RTC modes mempool: supports async and sync memory modes Signed-off-by: Lemmy Huang (cherry picked from commit c9f7760dd355fb8e4dd2c2560ccdcaa64615ebd4) --- ...ZELLE_SAME_NODE-and-GAZELLE_TCP_LAST.patch | 802 ++++ 0183-socket-refactor-sock_event.patch | 916 +++++ 0184-socket-refactor-tcp-and-udp.patch | 3394 +++++++++++++++++ 0185-tcp-add-GAZELLE_TCP_ASYNC_RECVD.patch | 166 + 0186-socket-fix-tcp-closed.patch | 30 + ...ait-cannot-be-interrupted-by-signals.patch | 89 + 0188-mempool-fix-sendmbox-not-free.patch | 56 + 0189-udp-fix-ip6_frag-nfb-and-last.patch | 111 + ...fix-recv_udp-sys_mbox_trypost-failed.patch | 41 + ...-epoll-wrong-event-notify-and-remove.patch | 54 + ...tcp-reduce-TCP_SNDQUEUELOWAT-to-1800.patch | 62 + lwip.spec | 18 +- 12 files changed, 5738 insertions(+), 1 deletion(-) create mode 100644 0182-cleancode-add-GAZELLE_SAME_NODE-and-GAZELLE_TCP_LAST.patch create mode 100644 0183-socket-refactor-sock_event.patch create mode 100644 0184-socket-refactor-tcp-and-udp.patch create mode 100644 0185-tcp-add-GAZELLE_TCP_ASYNC_RECVD.patch create mode 100644 0186-socket-fix-tcp-closed.patch create mode 100644 0187-socket-fix-sk_wait-cannot-be-interrupted-by-signals.patch create mode 100644 0188-mempool-fix-sendmbox-not-free.patch create mode 100644 0189-udp-fix-ip6_frag-nfb-and-last.patch create mode 100644 0190-udp-fix-recv_udp-sys_mbox_trypost-failed.patch create mode 100644 0191-sk_event-fix-rtw-epoll-wrong-event-notify-and-remove.patch create mode 100644 0192-tcp-reduce-TCP_SNDQUEUELOWAT-to-1800.patch diff --git a/0182-cleancode-add-GAZELLE_SAME_NODE-and-GAZELLE_TCP_LAST.patch b/0182-cleancode-add-GAZELLE_SAME_NODE-and-GAZELLE_TCP_LAST.patch new file mode 100644 index 0000000..a90d14d --- /dev/null +++ b/0182-cleancode-add-GAZELLE_SAME_NODE-and-GAZELLE_TCP_LAST.patch @@ -0,0 +1,802 @@ +From 
02063d20737eec300be8b87ba1fa62e4e650609a Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Sun, 9 Mar 2025 14:31:21 +0800 +Subject: [PATCH 01/11] cleancode: add GAZELLE_SAME_NODE and + GAZELLE_TCP_LAST_SEG + +Signed-off-by: Lemmy Huang +--- + src/api/lwipgz_sock.c | 2 + + src/core/pbuf.c | 3 + + src/core/tcp.c | 24 ++++-- + src/core/tcp_in.c | 29 +++++--- + src/core/tcp_out.c | 147 +++++++++++++++++-------------------- + src/include/dpdk_version.h | 18 +++-- + src/include/lwip/ip.h | 1 - + src/include/lwip/opt.h | 2 +- + src/include/lwip/pbuf.h | 4 +- + src/include/lwip/tcp.h | 14 ++-- + src/include/lwipgz_list.h | 2 + + src/include/lwipgz_sock.h | 9 ++- + src/include/lwipopts.h | 9 +++ + 13 files changed, 147 insertions(+), 117 deletions(-) + +diff --git a/src/api/lwipgz_sock.c b/src/api/lwipgz_sock.c +index fc98568..00e13fa 100644 +--- a/src/api/lwipgz_sock.c ++++ b/src/api/lwipgz_sock.c +@@ -104,6 +104,7 @@ int gazelle_alloc_socket(struct netconn *newconn, int accepted, int flags) + if (do_lwip_init_sock(fd) != 0) + goto out; + ++#if GAZELLE_SAME_NODE + if (accepted) { + int ret = 0; + struct tcp_pcb *pcb = newconn->pcb.tcp; +@@ -114,6 +115,7 @@ int gazelle_alloc_socket(struct netconn *newconn, int accepted, int flags) + goto out; + } + } ++#endif /* GAZELLE_SAME_NODE */ + + netconn_set_nonblocking(newconn, flags & SOCK_NONBLOCK); + return fd; +diff --git a/src/core/pbuf.c b/src/core/pbuf.c +index 32ffaeb..a5e8668 100644 +--- a/src/core/pbuf.c ++++ b/src/core/pbuf.c +@@ -193,7 +193,10 @@ pbuf_init_alloced_pbuf(struct pbuf *p, void *payload, u16_t tot_len, u16_t len, + p->flags = flags; + p->ref = 1; + p->if_idx = NETIF_NO_INDEX; ++ ++#if GAZELLE_SAME_NODE + p->pcb = NULL; ++#endif /* GAZELLE_SAME_NODE */ + } + + /** +diff --git a/src/core/tcp.c b/src/core/tcp.c +index b41036e..31bfd7a 100644 +--- a/src/core/tcp.c ++++ b/src/core/tcp.c +@@ -243,7 +243,7 @@ tcp_init(void) + void + tcp_free(struct tcp_pcb *pcb) + { +-#if GAZELLE_ENABLE ++#if GAZELLE_SAME_NODE + if 
(pcb->free_ring == 1) { + struct netconn *netconn = NULL; + struct lwip_sock *sock = NULL; +@@ -256,6 +256,8 @@ tcp_free(struct tcp_pcb *pcb) + rte_memzone_free(sock->same_node_tx_ring->mz); + rte_memzone_free(sock->same_node_tx_ring_mz); + } ++#endif /* GAZELLE_SAME_NODE */ ++#if GAZELLE_ENABLE + vdev_unreg_done(pcb); + #endif + #if GAZELLE_TCP_NEW_PORT +@@ -1004,7 +1006,8 @@ tcp_listen_with_backlog_and_err(struct tcp_pcb *pcb, u16_t backlog, err_t *err) + * local_port=0 avoid to release sock table in tcp_free */ + pcb->local_port = 0; + #endif /* GAZELLE_TCP_NEW_PORT */ +-#if GAZELLE_ENABLE ++ ++#if GAZELLE_SAME_NODE + char name[RING_NAME_LEN]; + snprintf(name, sizeof(name), "listen_rx_ring_%u", lpcb->local_port); + if (rte_ring_lookup(name) != NULL) { +@@ -1013,7 +1016,8 @@ tcp_listen_with_backlog_and_err(struct tcp_pcb *pcb, u16_t backlog, err_t *err) + } else { + same_node_ring_create(&lpcb->listen_rx_ring, SAME_NODE_RING_SIZE, lpcb->local_port, "listen", "rx"); + } +-#endif /* GAZELLE_ENABLE */ ++#endif /* GAZELLE_SAME_NODE */ ++ + tcp_free(pcb); + #if LWIP_CALLBACK_API + lpcb->accept = tcp_accept_null; +@@ -1299,7 +1303,7 @@ tcp_connect(struct tcp_pcb *pcb, const ip_addr_t *ipaddr, u16_t port, + #endif /* SO_REUSE */ + } + +-#if GAZELLE_ENABLE ++#if GAZELLE_SAME_NODE + /* communication between processes on the same node */ + if (ip_addr_cmp(&pcb->local_ip, &pcb->remote_ip)) { + ret = create_same_node_ring(pcb); +@@ -1307,7 +1311,7 @@ tcp_connect(struct tcp_pcb *pcb, const ip_addr_t *ipaddr, u16_t port, + return ret; + } + } +-#endif ++#endif /* GAZELLE_SAME_NODE */ + + iss = tcp_next_iss(pcb); + pcb->rcv_nxt = 0; +@@ -2156,16 +2160,18 @@ tcp_alloc(u8_t prio) + pcb->keep_intvl = TCP_KEEPINTVL_DEFAULT; + pcb->keep_cnt = TCP_KEEPCNT_DEFAULT; + #endif /* LWIP_TCP_KEEPALIVE */ +-#if GAZELLE_ENABLE ++ ++#if GAZELLE_SAME_NODE + pcb->client_rx_ring = NULL; + pcb->client_tx_ring = NULL; + pcb->free_ring = 0; +-#endif ++#endif /* GAZELLE_SAME_NODE */ + #if 
GAZELLE_TCP_PINGPONG_MODE + pcb->lrcvtime = 0; + pcb->lsndtime = 0; + pcb->pingpong = 0; + #endif ++ + pcb_tci_init(pcb); + } + return pcb; +@@ -2330,6 +2336,7 @@ tcp_accept(struct tcp_pcb *pcb, tcp_accept_fn accept) + } + #endif /* LWIP_CALLBACK_API */ + ++ + /** + * @ingroup tcp_raw + * Specifies the polling interval and the callback function that should +@@ -2407,8 +2414,9 @@ tcp_pcb_purge(struct tcp_pcb *pcb) + tcp_segs_free(pcb->unsent); + tcp_segs_free(pcb->unacked); + pcb->unacked = pcb->unsent = NULL; ++#if GAZELLE_TCP_LAST_SEG + pcb->last_unacked = pcb->last_unsent = NULL; +- pcb->pcb_if = NULL; ++#endif /* GAZELLE_TCP_LAST_SEG */ + #if TCP_OVERSIZE + pcb->unsent_oversize = 0; + #endif /* TCP_OVERSIZE */ +diff --git a/src/core/tcp_in.c b/src/core/tcp_in.c +index 75a2a48..346b9d1 100644 +--- a/src/core/tcp_in.c ++++ b/src/core/tcp_in.c +@@ -169,9 +169,8 @@ tcp_input(struct pbuf *p, struct netif *inp) + LWIP_ASSERT_CORE_LOCKED(); + LWIP_ASSERT("tcp_input: invalid pbuf", p != NULL); + +-#ifndef LWIP_PERF + PERF_START; +-#endif ++ + #if GAZELLE_ENABLE + TCP_STATS_INC(tcp.rx_in); + #else +@@ -214,13 +213,13 @@ tcp_input(struct pbuf *p, struct netif *inp) + } + if (ret != 0) { + LWIP_DEBUGF(TCP_INPUT_DEBUG, ("tcp_input: packet discarded due to failing checksum\n")); +-#else ++#else /* OFFLOAD_CHECKSUM_CHECK_TCP */ + u16_t chksum = ip_chksum_pseudo(p, IP_PROTO_TCP, p->tot_len, + ip_current_src_addr(), ip_current_dest_addr()); + if (chksum != 0) { + LWIP_DEBUGF(TCP_INPUT_DEBUG, ("tcp_input: packet discarded due to failing checksum 0x%04"X16_F"\n", + chksum)); +-#endif ++#endif /* OFFLOAD_CHECKSUM_CHECK_TCP */ + tcp_debug_print(tcphdr); + TCP_STATS_INC(tcp.chkerr); + goto dropped; +@@ -816,12 +815,14 @@ tcp_listen_input(struct tcp_pcb_listen *pcb) + + #if GAZELLE_ENABLE + vdev_reg_done(REG_RING_TCP_CONNECT, npcb); ++#endif /* GAZELLE_ENABLE */ ++#if GAZELLE_SAME_NODE + if (ip_addr_cmp(&npcb->local_ip, &npcb->remote_ip)) { + if (find_same_node_ring(npcb) != 0) { + 
return; + } + } +-#endif ++#endif /* GAZELLE_SAME_NODE */ + + /* Parse any options in the SYN. */ + tcp_parseopt(npcb); +@@ -1019,14 +1020,16 @@ tcp_process(struct tcp_pcb *pcb) + rseg = pcb->unsent; + LWIP_ASSERT("no segment to free", rseg != NULL); + pcb->unsent = rseg->next; +- if (pcb->last_unsent == rseg) { +- pcb->last_unsent = rseg->next; +- } ++#if GAZELLE_TCP_LAST_SEG ++ if (pcb->unsent == NULL) ++ pcb->last_unsent = NULL; ++#endif /* GAZELLE_TCP_LAST_SEG */ + } else { + pcb->unacked = rseg->next; +- if (pcb->last_unacked == rseg) { +- pcb->last_unacked = rseg->next; +- } ++#if GAZELLE_TCP_LAST_SEG ++ if (pcb->unacked == NULL) ++ pcb->last_unacked = NULL; ++#endif /* GAZELLE_TCP_LAST_SEG */ + } + tcp_seg_free(rseg); + +@@ -1423,8 +1426,10 @@ tcp_receive(struct tcp_pcb *pcb) + /* Remove segment from the unacknowledged list if the incoming + ACK acknowledges them. */ + pcb->unacked = tcp_free_acked_segments(pcb, pcb->unacked, "unacked", pcb->unsent); ++#if GAZELLE_TCP_LAST_SEG + if (pcb->unacked == NULL) + pcb->last_unacked = NULL; ++#endif /* GAZELLE_TCP_LAST_SEG */ + /* We go through the ->unsent list to see if any of the segments + on the list are acknowledged by the ACK. This may seem + strange since an "unsent" segment shouldn't be acked. The +@@ -1432,8 +1437,10 @@ tcp_receive(struct tcp_pcb *pcb) + ->unsent list after a retransmission, so these segments may + in fact have been sent once. 
*/ + pcb->unsent = tcp_free_acked_segments(pcb, pcb->unsent, "unsent", pcb->unacked); ++#if GAZELLE_TCP_LAST_SEG + if (pcb->unsent == NULL) + pcb->last_unsent = NULL; ++#endif /* GAZELLE_TCP_LAST_SEG */ + + /* If there's nothing left to acknowledge, stop the retransmit + timer, otherwise reset it to start again */ +diff --git a/src/core/tcp_out.c b/src/core/tcp_out.c +index cf93482..3c19f1d 100644 +--- a/src/core/tcp_out.c ++++ b/src/core/tcp_out.c +@@ -86,7 +86,7 @@ + #if OFFLOAD_CHECKSUM_GEN_TCP + #include "lwipgz_offload.h" + #endif +-#endif ++#endif /* GAZELLE_ENABLE */ + + #ifdef LWIP_HOOK_FILENAME + #include LWIP_HOOK_FILENAME +@@ -149,6 +149,10 @@ tcp_route(const struct tcp_pcb *pcb, const ip_addr_t *src, const ip_addr_t *dst) + if ((pcb != NULL) && (pcb->netif_idx != NETIF_NO_INDEX)) { + return netif_get_by_index(pcb->netif_idx); + } else { ++#if GAZELLE_ENABLE ++ struct netif *netif = ip_route(src, dst); ++ tcp_bind_netif((struct tcp_pcb *)pcb, netif); ++#endif /* GAZELLE_ENABLE */ + return ip_route(src, dst); + } + } +@@ -525,13 +529,13 @@ tcp_write(struct tcp_pcb *pcb, const void *arg, u16_t len, u8_t apiflags) + u16_t space; + u16_t unsent_optlen; + +-#if GAZELLE_ENABLE ++#if GAZELLE_TCP_LAST_SEG + last_unsent = pcb->last_unsent; +-#else ++#else /* GAZELLE_TCP_LAST_SEG */ + /* @todo: this could be sped up by keeping last_unsent in the pcb */ + for (last_unsent = pcb->unsent; last_unsent->next != NULL; + last_unsent = last_unsent->next); +-#endif ++#endif /* GAZELLE_TCP_LAST_SEG */ + + /* Usable space at the end of the last unsent segment */ + unsent_optlen = LWIP_TCP_OPT_LENGTH_SEGMENT(last_unsent->flags, pcb); +@@ -828,15 +832,14 @@ tcp_write(struct tcp_pcb *pcb, const void *arg, u16_t len, u8_t apiflags) + } else { + last_unsent->next = queue; + } ++#if GAZELLE_TCP_LAST_SEG ++ if (queue) ++ pcb->last_unsent = prev_seg; ++#endif /* GAZELLE_TCP_LAST_SEG */ + + /* + * Finally update the pcb state. 
+ */ +-#if GAZELLE_ENABLE +- if (queue) { +- pcb->last_unsent = prev_seg; +- } +-#endif + pcb->snd_lbb += len; + pcb->snd_buf -= len; + pcb->snd_queuelen = queuelen; +@@ -1162,8 +1165,10 @@ tcp_split_unsent_seg(struct tcp_pcb *pcb, u16_t split) + /* Finally insert remainder into queue after split (which stays head) */ + seg->next = useg->next; + useg->next = seg; ++#if GAZELLE_TCP_LAST_SEG + if (pcb->last_unsent == useg) + pcb->last_unsent = seg; ++#endif /* GAZELLE_TCP_LAST_SEG */ + + #if TCP_OVERSIZE + /* If remainder is last segment on the unsent, ensure we clear the oversize amount +@@ -1199,8 +1204,14 @@ tcp_send_fin(struct tcp_pcb *pcb) + LWIP_ASSERT("tcp_send_fin: invalid pcb", pcb != NULL); + + /* first, try to add the fin to the last unsent segment */ +- if (pcb->last_unsent != NULL) { +- struct tcp_seg *last_unsent = pcb->last_unsent; ++ if (pcb->unsent != NULL) { ++ struct tcp_seg *last_unsent; ++#if GAZELLE_TCP_LAST_SEG ++ last_unsent = pcb->last_unsent; ++#else /* GAZELLE_TCP_LAST_SEG */ ++ for (last_unsent = pcb->unsent; last_unsent->next != NULL; ++ last_unsent = last_unsent->next); ++#endif /* GAZELLE_TCP_LAST_SEG */ + + if ((TCPH_FLAGS(last_unsent->tcphdr) & (TCP_SYN | TCP_FIN | TCP_RST)) == 0) { + /* no SYN/FIN/RST flag in the header, we can add the FIN flag */ +@@ -1294,10 +1305,18 @@ tcp_enqueue_flags(struct tcp_pcb *pcb, u8_t flags) + if (pcb->unsent == NULL) { + pcb->unsent = seg; + } else { +- struct tcp_seg *useg = pcb->last_unsent; ++ struct tcp_seg *useg; ++#if GAZELLE_TCP_LAST_SEG ++ useg = pcb->last_unsent; ++#else /* GAZELLE_TCP_LAST_SEG */ ++ for (useg = pcb->unsent; useg->next != NULL; useg = useg->next); ++#endif /* GAZELLE_TCP_LAST_SEG */ + useg->next = seg; + } ++#if GAZELLE_TCP_LAST_SEG + pcb->last_unsent = seg; ++#endif /* GAZELLE_TCP_LAST_SEG */ ++ + #if TCP_OVERSIZE + /* The new unsent tail has no space */ + pcb->unsent_oversize = 0; +@@ -1515,12 +1534,7 @@ tcp_output(struct tcp_pcb *pcb) + lwip_ntohl(seg->tcphdr->seqno), 
pcb->lastack)); + } + +- if (pcb->pcb_if == NULL) { +- netif = tcp_route(pcb, &pcb->local_ip, &pcb->remote_ip); +- pcb->pcb_if = netif; +- } else { +- netif = pcb->pcb_if; +- } ++ netif = tcp_route(pcb, &pcb->local_ip, &pcb->remote_ip); + if (netif == NULL) { + return ERR_RTE; + } +@@ -1721,17 +1735,20 @@ end_loop: + if (err != ERR_OK) { + /* segment could not be sent, for whatever reason */ + tcp_set_flags(pcb, TF_NAGLEMEMERR); ++#if GAZELLE_TCP_LAST_SEG + if (pcb->unsent == NULL) + pcb->last_unsent = NULL; ++#endif /* GAZELLE_TCP_LAST_SEG */ + return err; + } + #if TCP_OVERSIZE_DBGCHECK + seg->oversize_left = 0; + #endif /* TCP_OVERSIZE_DBGCHECK */ +- if (pcb->last_unsent == pcb->unsent) { +- pcb->last_unsent = seg->next; +- } + pcb->unsent = seg->next; ++#if GAZELLE_TCP_LAST_SEG ++ if (pcb->unsent == NULL) ++ pcb->last_unsent = NULL; ++#endif /* GAZELLE_TCP_LAST_SEG */ + if (pcb->state != SYN_SENT) { + tcp_clear_flags(pcb, TF_ACK_DELAY | TF_ACK_NOW); + } +@@ -1745,7 +1762,9 @@ end_loop: + /* unacked list is empty? */ + if (pcb->unacked == NULL) { + pcb->unacked = seg; ++#if GAZELLE_TCP_LAST_SEG + pcb->last_unacked = seg; ++#endif /* GAZELLE_TCP_LAST_SEG */ + useg = seg; + /* unacked list is not empty? 
*/ + } else { +@@ -1765,7 +1784,9 @@ end_loop: + /* add segment to tail of unacked list */ + useg->next = seg; + useg = useg->next; ++#if GAZELLE_TCP_LAST_SEG + pcb->last_unacked = seg; ++#endif /* GAZELLE_TCP_LAST_SEG */ + } + } + /* do not queue empty segments on the unacked list */ +@@ -1784,8 +1805,10 @@ end_loop: + + output_done: + pcb->need_tso_send = 0; ++#if GAZELLE_TCP_LAST_SEG + if (pcb->unsent == NULL) + pcb->last_unsent = NULL; ++#endif /* GAZELLE_TCP_LAST_SEG */ + tcp_clear_flags(pcb, TF_NAGLEMEMERR); + return ERR_OK; + } +@@ -1956,7 +1979,9 @@ tcp_output_segment(struct tcp_seg *seg, struct tcp_pcb *pcb, struct netif *netif + #if OFFLOAD_CHECKSUM_GEN_TCP + if (netif_get_txol_flags(netif) & RTE_ETH_TX_OFFLOAD_TCP_CKSUM) { + ol_chksum_gen_tcp(seg->p, TCPH_HDRLEN_BYTES(seg->tcphdr)); +- } else { ++ } else ++#endif /* OFFLOAD_CHECKSUM_GEN_TCP */ ++ { + #if TCP_CHECKSUM_ON_COPY + u32_t acc; + #if TCP_CHECKSUM_ON_COPY_SANITY_CHECK +@@ -1991,53 +2016,20 @@ tcp_output_segment(struct tcp_seg *seg, struct tcp_pcb *pcb, struct netif *netif + seg->tcphdr->chksum = ip_chksum_pseudo(seg->p, IP_PROTO_TCP, + seg->p->tot_len, &pcb->local_ip, &pcb->remote_ip); + #endif /* TCP_CHECKSUM_ON_COPY */ +- + } +-#else +-#if TCP_CHECKSUM_ON_COPY +- u32_t acc; +-#if TCP_CHECKSUM_ON_COPY_SANITY_CHECK +- u16_t chksum_slow = ip_chksum_pseudo(seg->p, IP_PROTO_TCP, +- seg->p->tot_len, &pcb->local_ip, &pcb->remote_ip); +-#endif /* TCP_CHECKSUM_ON_COPY_SANITY_CHECK */ +- if ((seg->flags & TF_SEG_DATA_CHECKSUMMED) == 0) { +- LWIP_ASSERT("data included but not checksummed", +- seg->p->tot_len == TCPH_HDRLEN_BYTES(seg->tcphdr)); +- } +- +- /* rebuild TCP header checksum (TCP header changes for retransmissions!) 
*/ +- acc = ip_chksum_pseudo_partial(seg->p, IP_PROTO_TCP, +- seg->p->tot_len, TCPH_HDRLEN_BYTES(seg->tcphdr), &pcb->local_ip, &pcb->remote_ip); +- /* add payload checksum */ +- if (seg->chksum_swapped) { +- seg_chksum_was_swapped = 1; +- seg->chksum = SWAP_BYTES_IN_WORD(seg->chksum); +- seg->chksum_swapped = 0; +- } +- acc = (u16_t)~acc + seg->chksum; +- seg->tcphdr->chksum = (u16_t)~FOLD_U32T(acc); +-#if TCP_CHECKSUM_ON_COPY_SANITY_CHECK +- if (chksum_slow != seg->tcphdr->chksum) { +- TCP_CHECKSUM_ON_COPY_SANITY_CHECK_FAIL( +- ("tcp_output_segment: calculated checksum is %"X16_F" instead of %"X16_F"\n", +- seg->tcphdr->chksum, chksum_slow)); +- seg->tcphdr->chksum = chksum_slow; +- } +-#endif /* TCP_CHECKSUM_ON_COPY_SANITY_CHECK */ +-#else /* TCP_CHECKSUM_ON_COPY */ +- seg->tcphdr->chksum = ip_chksum_pseudo(seg->p, IP_PROTO_TCP, +- seg->p->tot_len, &pcb->local_ip, &pcb->remote_ip); +-#endif /* TCP_CHECKSUM_ON_COPY */ +-#endif /* OFFLOAD_CHECKSUM_GEN_TCP */ + } + #endif /* CHECKSUM_GEN_TCP */ +-#if !GAZELLE_ENABLE ++ ++#if GAZELLE_SAME_NODE ++ seg->p->pcb = pcb; ++#endif /* GAZELLE_SAME_NODE */ ++#if GAZELLE_ENABLE ++ TCP_STATS_INC(tcp.tx_out); ++#else + TCP_STATS_INC(tcp.xmit); + #endif + + NETIF_SET_HINTS(netif, &(pcb->netif_hints)); +- +- seg->p->pcb = pcb; + err = ip_output_if(seg->p, &pcb->local_ip, &pcb->remote_ip, pcb->ttl, + pcb->tos, IP_PROTO_TCP, netif); + NETIF_RESET_HINTS(netif); +@@ -2050,9 +2042,6 @@ tcp_output_segment(struct tcp_seg *seg, struct tcp_pcb *pcb, struct netif *netif + seg->chksum_swapped = 1; + } + #endif +-#if GAZELLE_ENABLE +- TCP_STATS_INC(tcp.tx_out); +-#endif + + return err; + } +@@ -2089,6 +2078,7 @@ tcp_rexmit_rto_prepare(struct tcp_pcb *pcb) + LWIP_DEBUGF(TCP_RTO_DEBUG, ("tcp_rexmit_rto: segment busy\n")); + return ERR_VAL; + } ++ + /* concatenate unsent queue after unacked queue */ + seg->next = pcb->unsent; + #if TCP_OVERSIZE_DBGCHECK +@@ -2097,14 +2087,16 @@ tcp_rexmit_rto_prepare(struct tcp_pcb *pcb) + pcb->unsent_oversize = 
seg->oversize_left; + } + #endif /* TCP_OVERSIZE_DBGCHECK */ +- /* unsent queue is the concatenated queue (of unacked, unsent) */ +- if (pcb->unsent == NULL) { ++ ++#if GAZELLE_TCP_LAST_SEG ++ if (pcb->unsent == NULL) + pcb->last_unsent = pcb->last_unacked; +- } ++ pcb->last_unacked = NULL; ++#endif /* GAZELLE_TCP_LAST_SEG */ ++ /* unsent queue is the concatenated queue (of unacked, unsent) */ + pcb->unsent = pcb->unacked; + /* unacked queue is now empty */ + pcb->unacked = NULL; +- pcb->last_unacked = NULL; + + /* Mark RTO in-progress */ + tcp_set_flags(pcb, TF_RTO); +@@ -2376,22 +2368,14 @@ tcp_output_fill_options(const struct tcp_pcb *pcb, struct pbuf *p, u8_t optflags + * header checksum and calling ip_output_if while handling netif hints and stats. + */ + static err_t +-tcp_output_control_segment(struct tcp_pcb *pcb, struct pbuf *p, ++tcp_output_control_segment(const struct tcp_pcb *pcb, struct pbuf *p, + const ip_addr_t *src, const ip_addr_t *dst) + { + struct netif *netif; + +- p->pcb = pcb; + LWIP_ASSERT("tcp_output_control_segment: invalid pbuf", p != NULL); + +- if (pcb == NULL || pcb->pcb_if == NULL) { +- netif = tcp_route(pcb, src, dst); +- if (pcb) { +- pcb->pcb_if = netif; +- } +- } else { +- netif = pcb->pcb_if; +- } ++ netif = tcp_route(pcb, src, dst); + if (netif == NULL) { + pbuf_free(p); + return ERR_RTE; +@@ -2440,6 +2424,9 @@ tcp_output_control_segment_netif(const struct tcp_pcb *pcb, struct pbuf *p, + ttl = TCP_TTL; + tos = 0; + } ++#if GAZELLE_SAME_NODE ++ p->pcb = (struct tcp_pcb *)pcb; ++#endif /* GAZELLE_SAME_NODE */ + #if GAZELLE_ENABLE + TCP_STATS_INC(tcp.tx_out); + #else +@@ -2516,7 +2503,7 @@ tcp_rst(const struct tcp_pcb *pcb, u32_t seqno, u32_t ackno, + + p = tcp_rst_common(pcb, seqno, ackno, local_ip, remote_ip, local_port, remote_port); + if (p != NULL) { +- tcp_output_control_segment((struct tcp_pcb *)pcb, p, local_ip, remote_ip); ++ tcp_output_control_segment(pcb, p, local_ip, remote_ip); + } + } + +diff --git 
a/src/include/dpdk_version.h b/src/include/dpdk_version.h +index bf03d98..b5b89cc 100644 +--- a/src/include/dpdk_version.h ++++ b/src/include/dpdk_version.h +@@ -52,18 +52,22 @@ + #define RTE_MBUF_F_TX_UDP_CKSUM PKT_TX_UDP_CKSUM + #define RTE_MBUF_F_TX_VLAN PKT_TX_VLAN_PKT + +-#define RTE_ETH_RX_OFFLOAD_TCP_CKSUM DEV_RX_OFFLOAD_TCP_CKSUM +-#define RTE_ETH_RX_OFFLOAD_UDP_CKSUM DEV_RX_OFFLOAD_UDP_CKSUM + #define RTE_ETH_RX_OFFLOAD_IPV4_CKSUM DEV_RX_OFFLOAD_IPV4_CKSUM +-#define RTE_ETH_RX_OFFLOAD_VLAN_STRIP DEV_RX_OFFLOAD_VLAN_STRIP +-#define RTE_ETH_RX_OFFLOAD_VLAN_FILTER DEV_RX_OFFLOAD_VLAN_FILTER +- + #define RTE_ETH_TX_OFFLOAD_IPV4_CKSUM DEV_TX_OFFLOAD_IPV4_CKSUM +-#define RTE_ETH_TX_OFFLOAD_VLAN_INSERT DEV_TX_OFFLOAD_VLAN_INSERT +-#define RTE_ETH_TX_OFFLOAD_TCP_TSO DEV_TX_OFFLOAD_TCP_TSO ++ ++#define RTE_ETH_RX_OFFLOAD_TCP_CKSUM DEV_RX_OFFLOAD_TCP_CKSUM + #define RTE_ETH_TX_OFFLOAD_TCP_CKSUM DEV_TX_OFFLOAD_TCP_CKSUM ++ ++#define RTE_ETH_RX_OFFLOAD_UDP_CKSUM DEV_RX_OFFLOAD_UDP_CKSUM + #define RTE_ETH_TX_OFFLOAD_UDP_CKSUM DEV_TX_OFFLOAD_UDP_CKSUM ++ + #define RTE_ETH_TX_OFFLOAD_MULTI_SEGS DEV_TX_OFFLOAD_MULTI_SEGS ++#define RTE_ETH_TX_OFFLOAD_TCP_TSO DEV_TX_OFFLOAD_TCP_TSO ++#define RTE_ETH_TX_OFFLOAD_UDP_TSO DEV_TX_OFFLOAD_UDP_TSO ++ ++#define RTE_ETH_RX_OFFLOAD_VLAN_STRIP DEV_RX_OFFLOAD_VLAN_STRIP ++#define RTE_ETH_RX_OFFLOAD_VLAN_FILTER DEV_RX_OFFLOAD_VLAN_FILTER ++#define RTE_ETH_TX_OFFLOAD_VLAN_INSERT DEV_TX_OFFLOAD_VLAN_INSERT + + #define RTE_ETH_LINK_SPEED_AUTONEG ETH_LINK_SPEED_AUTONEG + +diff --git a/src/include/lwip/ip.h b/src/include/lwip/ip.h +index d05de80..6257df7 100644 +--- a/src/include/lwip/ip.h ++++ b/src/include/lwip/ip.h +@@ -79,7 +79,6 @@ extern "C" { + ip_addr_t remote_ip; \ + /* Bound netif index */ \ + u8_t netif_idx; \ +- struct netif *pcb_if; \ + /* Socket options */ \ + u8_t so_options; \ + /* Type Of Service */ \ +diff --git a/src/include/lwip/opt.h b/src/include/lwip/opt.h +index b19ecd0..d20f9e4 100644 +--- a/src/include/lwip/opt.h 
++++ b/src/include/lwip/opt.h +@@ -1453,7 +1453,7 @@ + * 0xff is the maximum (u8_t). + */ + #if !defined TCP_DEFAULT_LISTEN_BACKLOG || defined __DOXYGEN__ +-#define TCP_DEFAULT_LISTEN_BACKLOG 0xffff ++#define TCP_DEFAULT_LISTEN_BACKLOG 0xff + #endif + + /** +diff --git a/src/include/lwip/pbuf.h b/src/include/lwip/pbuf.h +index 16fe999..dda66c3 100644 +--- a/src/include/lwip/pbuf.h ++++ b/src/include/lwip/pbuf.h +@@ -224,10 +224,12 @@ struct pbuf { + /** For incoming packets, this contains the input netif's index */ + u8_t if_idx; + ++#if GAZELLE_SAME_NODE ++ struct tcp_pcb *pcb; ++#endif /* GAZELLE_SAME_NODE */ + #if GAZELLE_ENABLE + volatile u8_t allow_append; + pthread_spinlock_t pbuf_lock; +- struct tcp_pcb *pcb; + #if GAZELLE_UDP_ENABLE + ip_addr_t addr; + u16_t port; +diff --git a/src/include/lwip/tcp.h b/src/include/lwip/tcp.h +index 4e41037..2ffd2ef 100644 +--- a/src/include/lwip/tcp.h ++++ b/src/include/lwip/tcp.h +@@ -248,9 +248,9 @@ struct tcp_pcb_listen { + u8_t master_lpcb; + #endif + +-#if GAZELLE_ENABLE ++#if GAZELLE_SAME_NODE + struct rte_ring *listen_rx_ring; +-#endif ++#endif /* GAZELLE_SAME_NODE */ + }; + + +@@ -414,16 +414,18 @@ struct tcp_pcb { + u8_t rcv_scale; + #endif + +-#if GAZELLE_ENABLE ++#if GAZELLE_TCP_LAST_SEG + struct tcp_seg *last_unsent; + struct tcp_seg *last_unacked; +- +- u8_t need_tso_send; +- ++#endif /* GAZELLE_TCP_LAST_SEG */ ++#if GAZELLE_SAME_NODE + #define SAME_NODE_RING_SIZE 512 + struct rte_ring *client_rx_ring; + struct rte_ring *client_tx_ring; + u8_t free_ring; ++#endif /* GAZELLE_SAME_NODE */ ++#if GAZELLE_ENABLE ++ u8_t need_tso_send; + #endif + }; + +diff --git a/src/include/lwipgz_list.h b/src/include/lwipgz_list.h +index 9729210..34f80c7 100644 +--- a/src/include/lwipgz_list.h ++++ b/src/include/lwipgz_list.h +@@ -33,6 +33,8 @@ + #ifndef __LWIPGZ_LIST_H__ + #define __LWIPGZ_LIST_H__ + ++#include ++ + /* double circular linked list */ + struct list_node { + struct list_node *prev; +diff --git 
a/src/include/lwipgz_sock.h b/src/include/lwipgz_sock.h +index 95c26df..ddf84dc 100644 +--- a/src/include/lwipgz_sock.h ++++ b/src/include/lwipgz_sock.h +@@ -93,6 +93,9 @@ extern void lstack_calculate_aggregate(int type, uint32_t len); + extern void time_stamp_transfer_pbuf(struct pbuf *pbuf_old, struct pbuf *pbuf_new); + extern void time_stamp_record(int fd, struct pbuf *pbuf); + ++#endif /* GAZELLE_ENABLE */ ++ ++#if GAZELLE_SAME_NODE + // 8M + #define SAME_NODE_RING_LEN (unsigned long long)(8388608) + #define SAME_NODE_RING_MASK (unsigned long long)(8388608 - 1) +@@ -106,8 +109,7 @@ extern err_t same_node_ring_create(struct rte_ring **ring, int size, int port, c + extern err_t create_same_node_ring(struct tcp_pcb *pcb); + extern err_t find_same_node_ring(struct tcp_pcb *pcb); + extern err_t find_same_node_memzone(struct tcp_pcb *pcb, struct lwip_sock *nsock); +- +-#endif /* GAZELLE_ENABLE */ ++#endif /* GAZELLE_SAME_NODE */ + + + /* move some definitions to the lwipgz_sock.h for libnet to use, and +@@ -187,11 +189,14 @@ struct lwip_sock { + struct rte_ring *recv_ring; + struct rte_ring *send_ring; + ++#if GAZELLE_SAME_NODE + /* same node send data ring */ + struct same_node_ring *same_node_rx_ring; + const struct rte_memzone *same_node_rx_ring_mz; + struct same_node_ring *same_node_tx_ring; + const struct rte_memzone *same_node_tx_ring_mz; ++#endif /* GAZELLE_SAME_NODE */ ++ + uint8_t already_bind_numa; + + struct sock_time_stamp stamp; +diff --git a/src/include/lwipopts.h b/src/include/lwipopts.h +index 2ac48f5..9338fcd 100644 +--- a/src/include/lwipopts.h ++++ b/src/include/lwipopts.h +@@ -58,6 +58,7 @@ + #define GAZELLE_UDP_ENABLE 1 + #define GAZELLE_UDP_NEW_PORT 1 + ++#define GAZELLE_SAME_NODE 1 + + /* + ---------------------------------- +@@ -119,6 +120,10 @@ + */ + #define LWIP_SUPPORT_CUSTOM_PBUF 1 + ++#define LWIP_CHECKSUM_ON_COPY 0 ++ ++#define PBUF_POOL_FREE_OOSEQ 1 ++ + #define MEMP_MEM_MALLOC 0 + #define MEM_LIBC_MALLOC 0 + #define MEM_USE_POOLS 0 
+@@ -239,6 +244,8 @@ + #define LWIP_WND_SCALE 1 + #define TCP_RCV_SCALE 6 + ++#define GAZELLE_TCP_LAST_SEG 1 ++ + #define GAZELLE_TCP_MAX_CONN_PER_THREAD 65535 + #define GAZELLE_TCP_REUSE_IPPORT 1 + #define GAZELLE_TCP_PINGPONG_MODE 1 +@@ -250,6 +257,7 @@ + */ + #define LWIP_SOCKET 1 + ++#define LWIP_SOCKET_SELECT 1 + #define LWIP_SOCKET_POLL 0 + + #define LWIP_SO_SNDTIMEO 1 +@@ -295,6 +303,7 @@ + + #define SOCKETS_DEBUG LWIP_DBG_OFF + #define SYS_DEBUG LWIP_DBG_OFF ++#define MEMP_DEBUG LWIP_DBG_OFF + + + /* +-- +2.33.0 + diff --git a/0183-socket-refactor-sock_event.patch b/0183-socket-refactor-sock_event.patch new file mode 100644 index 0000000..015782a --- /dev/null +++ b/0183-socket-refactor-sock_event.patch @@ -0,0 +1,916 @@ +From 88fcfd6bcdb31551fae5d51dcf23154be6f0f740 Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Sun, 9 Mar 2025 23:25:37 +0800 +Subject: [PATCH 02/11] socket: refactor sock_event + +Signed-off-by: Lemmy Huang +--- + src/api/api_msg.c | 21 ++- + src/api/sockets.c | 53 +++---- + src/api/sys_arch.c | 232 +++++++++++++++++-------------- + src/include/arch/sys_arch.h | 146 +++++++++++++++---- + src/include/lwip/priv/memp_std.h | 5 +- + src/include/lwipgz_event.h | 75 ---------- + src/include/lwipgz_posix_api.h | 2 +- + src/include/lwipgz_sock.h | 56 ++++++-- + src/include/lwipopts.h | 21 ++- + 9 files changed, 336 insertions(+), 275 deletions(-) + delete mode 100644 src/include/lwipgz_event.h + +diff --git a/src/api/api_msg.c b/src/api/api_msg.c +index acd4697..a979375 100644 +--- a/src/api/api_msg.c ++++ b/src/api/api_msg.c +@@ -514,13 +514,6 @@ err_tcp(void *arg, err_t err) + + SYS_ARCH_UNPROTECT(lev); + +- /* Notify the user layer about a connection error. Used to signal select. */ +- API_EVENT(conn, NETCONN_EVT_ERROR, 0); +- /* Try to release selects pending on 'read' or 'write', too. +- They will get an error if they actually try to read or write. 
*/ +- API_EVENT(conn, NETCONN_EVT_RCVPLUS, 0); +- API_EVENT(conn, NETCONN_EVT_SENDPLUS, 0); +- + mbox_msg = lwip_netconn_err_to_msg(err); + /* pass error message to recvmbox to wake up pending recv */ + if (NETCONN_MBOX_VALID(conn, &conn->recvmbox)) { +@@ -536,6 +529,13 @@ err_tcp(void *arg, err_t err) + sys_mbox_trypost(&conn->acceptmbox, mbox_msg); + } + ++ /* Notify the user layer about a connection error. Used to signal select. */ ++ API_EVENT(conn, NETCONN_EVT_ERROR, 0); ++ /* Try to release selects pending on 'read' or 'write', too. ++ They will get an error if they actually try to read or write. */ ++ API_EVENT(conn, NETCONN_EVT_RCVPLUS, 0); ++ API_EVENT(conn, NETCONN_EVT_SENDPLUS, 0); ++ + if ((old_state == NETCONN_WRITE) || (old_state == NETCONN_CLOSE) || + (old_state == NETCONN_CONNECT)) { + /* calling lwip_netconn_do_writemore/lwip_netconn_do_close_internal is not necessary +@@ -1377,10 +1377,6 @@ lwip_netconn_do_connected(void *arg, struct tcp_pcb *pcb, err_t err) + return ERR_VAL; + } + +-#if GAZELLE_ENABLE +- do_lwip_connected_callback(conn); +-#endif +- + LWIP_ASSERT("conn->state == NETCONN_CONNECT", conn->state == NETCONN_CONNECT); + LWIP_ASSERT("(conn->current_msg != NULL) || conn->in_non_blocking_connect", + (conn->current_msg != NULL) || IN_NONBLOCKING_CONNECT(conn)); +@@ -1399,6 +1395,9 @@ lwip_netconn_do_connected(void *arg, struct tcp_pcb *pcb, err_t err) + (!was_blocking && op_completed_sem == NULL)); + conn->current_msg = NULL; + conn->state = NETCONN_NONE; ++#if GAZELLE_ENABLE ++ do_lwip_connected_callback(conn->callback_arg.socket); ++#endif + API_EVENT(conn, NETCONN_EVT_SENDPLUS, 0); + + if (was_blocking) { +diff --git a/src/api/sockets.c b/src/api/sockets.c +index 9fafca3..b78b1f3 100644 +--- a/src/api/sockets.c ++++ b/src/api/sockets.c +@@ -2641,6 +2641,24 @@ event_callback(struct netconn *conn, enum netconn_evt evt, u16_t len) + return; + } + ++#if GAZELLE_ENABLE ++ switch (evt) { ++ case NETCONN_EVT_ERROR: ++ sock->errevent = 1; ++ 
case NETCONN_EVT_RCVPLUS: ++ case NETCONN_EVT_SENDPLUS: ++ sock_event_notify_pending(sock, evt, len); ++ break; ++ case NETCONN_EVT_RCVMINUS: ++ case NETCONN_EVT_SENDMINUS: ++ sock_event_remove_pending(sock, evt, len); ++ break; ++ default: ++ break; ++ } ++ return; ++#endif /* GAZELLE_ENABLE */ ++ + check_waiters = 1; + SYS_ARCH_PROTECT(lev); + /* Set event as required */ +@@ -2650,61 +2668,26 @@ event_callback(struct netconn *conn, enum netconn_evt evt, u16_t len) + if (sock->rcvevent > 1) { + check_waiters = 0; + } +-#if GAZELLE_ENABLE +- if (netif_is_rtc_mode(netif_default)) { +- if (sock->rcvevent == 1) { +- add_sock_event_nolock(sock, POLLIN); +- } +- } else { +- if (conn->acceptmbox != NULL && !sys_mbox_empty(conn->acceptmbox)) { +- add_sock_event(sock, POLLIN); +- } +- } +-#endif + break; + case NETCONN_EVT_RCVMINUS: + sock->rcvevent--; + check_waiters = 0; +-#if GAZELLE_ENABLE +- if (netif_is_rtc_mode(netif_default)) { +- if (sock->rcvevent == 0) { +- del_sock_event_nolock(sock, POLLIN); +- } +- } +-#endif + break; + case NETCONN_EVT_SENDPLUS: + if (sock->sendevent) { + check_waiters = 0; + } + sock->sendevent = 1; +-#if GAZELLE_ENABLE +- if (netif_is_rtc_mode(netif_default)) { +- add_sock_event_nolock(sock, POLLOUT); +- } +-#endif + break; + case NETCONN_EVT_SENDMINUS: + sock->sendevent = 0; + check_waiters = 0; +-#if GAZELLE_ENABLE +- if (netif_is_rtc_mode(netif_default)) { +- del_sock_event_nolock(sock, POLLOUT); +- } +-#endif + break; + case NETCONN_EVT_ERROR: + if ((conn->pending_err != ERR_OK) && (conn->pending_err != ERR_RST)) { + LWIP_DEBUGF(LWIPGZ_LOG_WARNING, ("event_callback: have errevent, err=%d, fd=%d\n", conn->pending_err, conn->callback_arg.socket)); + } + sock->errevent = 1; +-#if GAZELLE_ENABLE +- if (netif_is_rtc_mode(netif_default)) { +- add_sock_event_nolock(sock, EPOLLERR); +- } else { +- add_sock_event(sock, EPOLLERR); +- } +-#endif + break; + default: + LWIP_ASSERT("unknown event", 0); +diff --git a/src/api/sys_arch.c 
b/src/api/sys_arch.c +index 4413b0f..f242154 100644 +--- a/src/api/sys_arch.c ++++ b/src/api/sys_arch.c +@@ -34,16 +34,17 @@ + #include + #include + #include ++#include + + #include + #include + #include + +-#include "lwip/err.h" ++#include "arch/sys_arch.h" ++ + #include "lwip/opt.h" + #include "lwip/sys.h" + #include "lwip/timeouts.h" +-#include "arch/sys_arch.h" + #include "lwipgz_sock.h" + + #define MBOX_NAME_PREFIX "_mbox_0x" +@@ -76,6 +77,17 @@ u64_t sys_now_us(void) + return (rte_rdtsc() / g_sys_cycles_per_us); + } + ++void sys_ms2timespec(struct timespec *timespec, int timeout) ++{ ++ clock_gettime(CLOCK_REALTIME, timespec); ++ ++ timespec->tv_sec += timeout / MS_PER_S; ++ timespec->tv_nsec += (timeout % MS_PER_S) * (NS_PER_S / MS_PER_S); ++ ++ timespec->tv_sec += timespec->tv_nsec / NS_PER_S; ++ timespec->tv_nsec = timespec->tv_nsec % NS_PER_S; ++} ++ + u32_t sys_timer_run(void) + { + u32_t sleeptime; +@@ -138,6 +150,117 @@ sys_thread_t sys_thread_new(const char *name, lwip_thread_fn function, void *arg + } + + ++/* ++ * Semaphore ++ * */ ++err_t sys_sem_new(sys_sem_t *sem, u8_t count) ++{ ++ sys_sem_t s; ++ s = (sys_sem_t)memp_malloc(MEMP_SYS_SEM); ++ if (s == NULL) { ++ LWIP_DEBUGF(SYS_DEBUG | LWIPGZ_LOG_ERR, ("sys_sem_new: failed to malloc\n")); ++ return ERR_MEM; ++ } ++ ++ s->count = count; ++ *sem = s; ++ return ERR_OK; ++} ++ ++void sys_sem_free(sys_sem_t *sem) ++{ ++ LWIP_ASSERT("sys_sem_valid", sys_sem_valid(sem)); ++ sys_sem_t s = *sem; ++ ++ s->count = 0; ++ memp_free(MEMP_SYS_SEM, s); ++ sys_sem_set_invalid(sem); ++} ++ ++void sys_sem_signal(sys_sem_t *sem) ++{ ++ LWIP_ASSERT("sys_sem_valid", sys_sem_valid(sem)); ++ sys_sem_t s = *sem; ++ ++ ++(s->count); ++} ++ ++u32_t sys_arch_sem_wait(sys_sem_t *sem, u32_t timeout) ++{ ++ LWIP_ASSERT("sys_sem_valid", sys_sem_valid(sem)); ++ sys_sem_t s = *sem; ++ ++ u32_t start = sys_now(); ++ while (s->count <= 0) { ++ /* 0 = wait forever */ ++ if (timeout > 0 && timeout <= sys_now() - start) { ++ return 
SYS_ARCH_TIMEOUT; ++ } ++ } ++ ++ --(s->count); ++ return ERR_OK; ++} ++ ++/* ++ * Mutex ++ * */ ++err_t sys_mutex_new(sys_mutex_t *mutex) ++{ ++#if LWIP_NETCONN_FULLDUPLEX ++ sys_mutex_t m; ++ m = (sys_mutex_t)memp_malloc(MEMP_SYS_MUTEX); ++ if (m == NULL) { ++ LWIP_DEBUGF(SYS_DEBUG | LWIPGZ_LOG_ERR, ("sys_mutex_new: failed to malloc\n")); ++ return ERR_MEM; ++ } ++ ++ *mutex = m; ++ return sys_mutex_new_internal(m); ++#else ++ return ERR_OK; ++#endif ++} ++ ++void sys_mutex_free(sys_mutex_t *mutex) ++{ ++#if LWIP_NETCONN_FULLDUPLEX ++ LWIP_ASSERT("sys_mutex_valid", sys_mutex_valid(mutex)); ++ sys_mutex_t m = *mutex; ++ ++ sys_mutex_free_internal(m); ++ memp_free(MEMP_SYS_MUTEX, m); ++ sys_mutex_set_invalid(mutex); ++#endif ++} ++ ++void sys_mutex_unlock(sys_mutex_t *mutex) ++{ ++#if LWIP_NETCONN_FULLDUPLEX ++ LWIP_ASSERT("sys_mutex_valid", sys_mutex_valid(mutex)); ++ sys_mutex_unlock_internal(*mutex); ++#endif ++} ++ ++void sys_mutex_lock(sys_mutex_t *mutex) ++{ ++#if LWIP_NETCONN_FULLDUPLEX ++ LWIP_ASSERT("sys_mutex_valid", sys_mutex_valid(mutex)); ++ sys_mutex_timedlock_internal(*mutex, -1); ++#endif ++} ++ ++/* ++ * Critical section ++ * */ ++sys_prot_t sys_arch_protect(void) ++{ ++ return 0; ++} ++void sys_arch_unprotect(sys_prot_t pval) ++{ ++} ++ + extern int eth_dev_poll(void); + /* + * Mailbox +@@ -304,111 +427,6 @@ int sys_mbox_empty(struct sys_mbox *mb) + return rte_ring_count(mb->ring) == 0; + } + +-/* +- * Semaphore +- * */ +-err_t sys_sem_new(struct sys_sem **sem, uint8_t count) +-{ +- *sem = (struct sys_sem *)memp_malloc(MEMP_SYS_SEM); +- if ((*sem) == NULL) { +- return ERR_MEM; +- } +- (*sem)->c = 0; +- (*sem)->wait_fn = mbox_wait_func; +- return ERR_OK; +-} +- +-void sys_sem_signal(struct sys_sem **s) +-{ +- struct sys_sem *sem = NULL; +- LWIP_ASSERT("invalid sem", (s != NULL) && (*s != NULL)); +- sem = *s; +- ++(sem->c); +-} +- +-static uint32_t cond_wait(struct sys_sem *sem, uint32_t timeout) +-{ +- uint32_t used_ms = 0; +- uint32_t poll_ts; +- 
+- if (timeout == 0) { +- (void)sem->wait_fn(); +- return 0; +- } +- +- poll_ts = sys_now(); +- +- while (used_ms < timeout) { +- if (sem->c > 0) +- return timeout - used_ms; +- +- (void)sem->wait_fn(); +- used_ms = sys_now() - poll_ts; +- } +- +- return SYS_ARCH_TIMEOUT; +-} +- +-uint32_t sys_arch_sem_wait(struct sys_sem **s, uint32_t timeout) +-{ +- uint32_t time_needed = 0; +- struct sys_sem *sem = NULL; +- LWIP_ASSERT("invalid sem", (s != NULL) && (*s != NULL)); +- sem = *s; +- +- while (sem->c <= 0) { +- if (timeout > 0) { +- time_needed = cond_wait(sem, timeout); +- +- if (time_needed == SYS_ARCH_TIMEOUT) { +- return SYS_ARCH_TIMEOUT; +- } +- } else { +- cond_wait(sem, 0); +- } +- } +- +- sem->c--; +- return time_needed; +-} +- +-void sys_sem_free(struct sys_sem **s) +-{ +- if ((s != NULL) && (*s != NULL)) +- memp_free(MEMP_SYS_SEM, *s); +-} +- +-/* +- * Mutex +- * */ +-err_t sys_mutex_new(struct sys_mutex **mutex) +-{ +- return ERR_OK; +-} +- +-void sys_mutex_lock(struct sys_mutex **mutex) +-{ +-} +- +-void sys_mutex_unlock(struct sys_mutex **mutex) +-{ +-} +- +-void sys_mutex_free(struct sys_mutex **mutex) +-{ +-} +- +-/* +- * Critical section +- * */ +-sys_prot_t sys_arch_protect(void) +-{ +- return 0; +-} +- +-void sys_arch_unprotect(sys_prot_t pval) +-{ +-} + + /* + * Memory +diff --git a/src/include/arch/sys_arch.h b/src/include/arch/sys_arch.h +index edaab4f..515042e 100644 +--- a/src/include/arch/sys_arch.h ++++ b/src/include/arch/sys_arch.h +@@ -35,12 +35,38 @@ + + #include + #include ++#include ++#include + ++#include ++#include ++ ++#include "lwip/err.h" + #include "lwip/debug.h" + #include "lwip/memp.h" + +-#define SYS_NAME_LEN 64 + ++#define SYS_FORMAT_NAME(buf, size, fmt, ...) 
\ ++ do { \ ++ int ret = snprintf(buf, size, ""fmt"", ##__VA_ARGS__); \ ++ if (ret < 0) { \ ++ LWIP_DEBUGF(SYS_DEBUG | LWIPGZ_LOG_ERR, ("%s:%d: sprintf failed\n", __FUNCTION__, __LINE__)); \ ++ (void)memset((void *)buf, 0, size); \ ++ } \ ++ } while(0) ++ ++ ++u32_t sys_timer_run(void); ++u32_t sys_now(void); ++u64_t sys_now_us(void); ++void sys_ms2timespec(struct timespec *timespec, int timeout); ++ ++u8_t *sys_hugepage_malloc(const char *name, unsigned size); ++void sys_mempool_var_init(struct memp_desc *memp, char *desc, u16_t size, u16_t num, ++ u8_t *base, struct memp **tab, struct stats_mem *stats); ++ ++ ++#define SYS_NAME_LEN 64 + struct sys_thread { + struct sys_thread *next; + char name[SYS_NAME_LEN]; +@@ -53,22 +79,111 @@ typedef void *(*thread_fn)(void *arg); + int thread_create(const char *name, unsigned id, thread_fn func, void *arg); + + ++typedef u32_t sys_prot_t; ++ + struct sys_sem { +- volatile unsigned int c; +- int (*wait_fn)(void); ++ unsigned count; /* single thread polling notify */ ++ sem_t ksem; /* multi thread blocking notify */ + }; + typedef struct sys_sem *sys_sem_t; +-#define sys_sem_valid(sem) (((sem) != NULL) && (*(sem) != NULL)) +-#define sys_sem_set_invalid(sem) do { if ((sem) != NULL) { *(sem) = NULL; }} while(0) ++#define sys_sem_valid(sem) ((likely((sem) != NULL)) && (*(sem) != NULL)) ++#define sys_sem_set_invalid(sem) do { if (likely((sem) != NULL)) { *(sem) = NULL; }} while(0) ++ ++static inline ++err_t sys_sem_new_internal(sys_sem_t s, u8_t count) ++{ ++ sem_init(&s->ksem, 0, count); ++ return ERR_OK; ++} ++ ++static inline ++void sys_sem_free_internal(sys_sem_t s) ++{ ++ sem_destroy(&s->ksem); ++} ++ ++static inline ++void sys_sem_signal_internal(sys_sem_t s) ++{ ++ sem_post(&s->ksem); ++} ++ ++static inline ++int sys_sem_wait_internal(sys_sem_t s, int timeout) ++{ ++ if (unlikely(timeout == 0)) { ++ return 0; ++ } ++ ++ if (timeout < 0) { ++ sem_wait(&s->ksem); ++ } else { ++ u32_t start = sys_now(); ++ struct timespec ts; 
++ ++ sys_ms2timespec(&ts, timeout); ++ sem_timedwait(&s->ksem, &ts); ++ ++ if (timeout <= (int)(sys_now() - start)) { ++ timeout = 0; ++ } ++ } ++ ++ return timeout; ++} + + + struct sys_mutex { +- volatile unsigned int m; ++ pthread_mutex_t klock; + }; + typedef struct sys_mutex *sys_mutex_t; + #define sys_mutex_valid(mutex) sys_sem_valid(mutex) + #define sys_mutex_set_invalid(mutex) sys_sem_set_invalid(mutex) + ++static inline ++err_t sys_mutex_new_internal(sys_mutex_t m) ++{ ++ pthread_mutex_init(&m->klock, NULL); ++ pthread_mutex_trylock(&m->klock); ++ return ERR_OK; ++} ++ ++static inline ++void sys_mutex_free_internal(sys_mutex_t m) ++{ ++ pthread_mutex_destroy(&m->klock); ++} ++ ++static inline ++void sys_mutex_unlock_internal(sys_mutex_t m) ++{ ++ pthread_mutex_unlock(&m->klock); ++} ++ ++static inline ++int sys_mutex_timedlock_internal(sys_mutex_t m, int timeout) ++{ ++ if (unlikely(timeout == 0)) { ++ return 0; ++ } ++ ++ if (timeout < 0) { ++ pthread_mutex_lock(&m->klock); ++ } else { ++ u32_t start = sys_now(); ++ struct timespec ts; ++ ++ sys_ms2timespec(&ts, timeout); ++ pthread_mutex_timedlock(&m->klock, &ts); ++ ++ if (timeout <= (int)(sys_now() - start)) { ++ timeout = 0; ++ } ++ } ++ ++ return timeout; ++} ++ + + struct sys_mbox { + char name[SYS_NAME_LEN]; +@@ -83,27 +198,8 @@ typedef struct sys_mbox *sys_mbox_t; + #define sys_mbox_set_invalid(mbox) sys_sem_set_invalid(mbox) + int sys_mbox_empty(struct sys_mbox *); + +-typedef uint32_t sys_prot_t; +- +-u8_t *sys_hugepage_malloc(const char *name, unsigned size); +-void sys_mempool_var_init(struct memp_desc *memp, char *desc, u16_t size, u16_t num, +- u8_t *base, struct memp **tab, struct stats_mem *stats); +- +-u32_t sys_timer_run(void); +-u32_t sys_now(void); +-u64_t sys_now_us(void); +- +-#define SYS_FORMAT_NAME(buf, size, fmt, ...) 
\ +- do { \ +- int ret = snprintf(buf, size, ""fmt"", ##__VA_ARGS__); \ +- if (ret < 0) { \ +- LWIP_DEBUGF(SYS_DEBUG, ("%s:%d: sprintf failed\n", __FUNCTION__, __LINE__)); \ +- (void)memset((void *)buf, 0, size); \ +- } \ +- } while(0) + + #if GAZELLE_ENABLE +-#include + #include "dpdk_version.h" + + /* +diff --git a/src/include/lwip/priv/memp_std.h b/src/include/lwip/priv/memp_std.h +index 66d7e4e..cb75d15 100644 +--- a/src/include/lwip/priv/memp_std.h ++++ b/src/include/lwip/priv/memp_std.h +@@ -124,7 +124,10 @@ LWIP_MEMPOOL(MLD6_GROUP, MEMP_NUM_MLD6_GROUP, sizeof(struct mld_group), + + #if GAZELLE_ENABLE + #if !LWIP_NETCONN_SEM_PER_THREAD +-LWIP_MEMPOOL(SYS_SEM, MEMP_NUM_SYS_SEM, sizeof(struct sys_sem), "SYS_SEM") ++LWIP_MEMPOOL(SYS_SEM, MEMP_NUM_SYS_SEM, sizeof(struct sys_sem), "SYS_SEM") ++#endif ++#if LWIP_NETCONN_FULLDUPLEX ++LWIP_MEMPOOL(SYS_MUTEX, MEMP_NUM_SYS_MUTEX, sizeof(struct sys_sem), "SYS_MUTEX") + #endif + + LWIP_MEMPOOL(SYS_MBOX, MEMP_NUM_SYS_MBOX, sizeof(struct sys_mbox), "SYS_MBOX") +diff --git a/src/include/lwipgz_event.h b/src/include/lwipgz_event.h +deleted file mode 100644 +index d51b8f5..0000000 +--- a/src/include/lwipgz_event.h ++++ /dev/null +@@ -1,75 +0,0 @@ +-/* +- * Copyright (c) 2001-2004 Swedish Institute of Computer Science. +- * All rights reserved. +- * +- * Redistribution and use in source and binary forms, with or without modification, +- * are permitted provided that the following conditions are met: +- * +- * 1. Redistributions of source code must retain the above copyright notice, +- * this list of conditions and the following disclaimer. +- * 2. Redistributions in binary form must reproduce the above copyright notice, +- * this list of conditions and the following disclaimer in the documentation +- * and/or other materials provided with the distribution. +- * 3. The name of the author may not be used to endorse or promote products +- * derived from this software without specific prior written permission. 
+- * +- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED +- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT +- * SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +- * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +- * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY +- * OF SUCH DAMAGE. +- * +- * This file is part of the lwIP TCP/IP stack. +- * +- * Author: Huawei Technologies +- * +- */ +- +-#ifndef __LWIPGZ_EVENT_H__ +-#define __LWIPGZ_EVENT_H__ +- +-#include +- +-#include "arch/sys_arch.h" +-#include "lwip/api.h" +-#include "lwipgz_list.h" +- +-#define MAX_EPOLLFDS 32 +- +-#define LIBOS_EPOLLNONE (0x0) +-#define LIBOS_BADEP (NULL) +- +-struct event_queue { +- struct list_node events; +- /* total number of sockets have events */ +- int num_events; +-}; +- +-struct event_array { +- sys_mbox_t mbox; +- volatile int num_events; +- struct epoll_event events[0]; +-}; +- +-struct libos_epoll { +- struct event_queue *libos_queue; +- int num_hostfds; +- int hints; +- int fd; /* self fd */ +- int efd; /* eventfd */ +-}; +- +-struct lwip_sock; +-extern void add_sock_event(struct lwip_sock *sock, uint32_t event); +-extern void add_sock_event_nolock(struct lwip_sock *sock, uint32_t event); +-extern void del_sock_event(struct lwip_sock *sock, uint32_t event); +-extern void del_sock_event_nolock(struct lwip_sock *sock, uint32_t event); +- +-extern int32_t lstack_epoll_close(int32_t); +- +-#endif /* __LWIPGZ_EVENT_H__ */ +diff --git a/src/include/lwipgz_posix_api.h b/src/include/lwipgz_posix_api.h +index 
5474592..e02dc0f 100644 +--- a/src/include/lwipgz_posix_api.h ++++ b/src/include/lwipgz_posix_api.h +@@ -66,7 +66,7 @@ typedef struct { + ssize_t (*writev_fn)(int fd, const struct iovec *iov, int iovcnt); + ssize_t (*recv_fn)(int fd, void *buf, size_t len, int flags); + ssize_t (*send_fn)(int fd, const void *buf, size_t len, int flags); +- ssize_t (*recvmsg_fn)(int fd, const struct msghdr *msg, int flags); ++ ssize_t (*recvmsg_fn)(int fd, struct msghdr *msg, int flags); + ssize_t (*sendmsg_fn)(int fd, const struct msghdr *msg, int flags); + ssize_t (*recvfrom_fn)(int fd, void *buf, size_t len, int flags, struct sockaddr *src_addr, socklen_t *addrlen); + ssize_t (*sendto_fn)(int fd, const void *buf, size_t len, int flags, const struct sockaddr *dest_addr, socklen_t addrlen); +diff --git a/src/include/lwipgz_sock.h b/src/include/lwipgz_sock.h +index ddf84dc..36527cf 100644 +--- a/src/include/lwipgz_sock.h ++++ b/src/include/lwipgz_sock.h +@@ -41,10 +41,16 @@ + #endif /* GAZELLE_SO_SNDBUF */ + + #if GAZELLE_ENABLE ++#include + #include ++#include ++#include ++ + #include + #include +-#include "lwipgz_event.h" ++#include ++ ++#include "lwipgz_list.h" + + enum posix_type { + POSIX_KERNEL = 0x100, +@@ -64,22 +70,29 @@ enum posix_type { + #define POSIX_IS_TYPE(sock, posix_type) \ + (((sock)->type & POSIX_ALL) == (posix_type)) + +-/* CLOSED means not lwip sock-fd, such as kernel sock-fd or file-fd or unix-fd */ ++/* CLOSED means not lwip sock-fd, such as kernel sock-fd or file-fd or unix-fd. ++ * sock->conn means lwip sock-fd. ++ * sock->sk_wait, means lwip epoll-fd. 
++ */ + #define POSIX_IS_CLOSED(sock) \ +- ((sock) == NULL || (sock)->conn == NULL) ++ ((sock) == NULL || ((sock)->conn == NULL && (sock)->sk_wait == NULL)) + ++struct lwip_sock; + struct lwip_sock *lwip_get_socket(int fd); + int gazelle_alloc_socket(struct netconn *newconn, int accepted, int flags); + void gazelle_free_socket(struct lwip_sock *sock, int fd); + void lwip_sock_init(void); + void lwip_exit(void); + ++ ++extern void sock_event_notify_pending(struct lwip_sock *sock, enum netconn_evt evt, unsigned len); ++extern void sock_event_remove_pending(struct lwip_sock *sock, enum netconn_evt evt, unsigned len); ++ + extern int do_lwip_init_sock(int fd); + extern void do_lwip_clean_sock(int fd); ++extern void do_lwip_connected_callback(int fd); + + extern void do_lwip_add_recvlist(int32_t fd); +-extern void do_lwip_connected_callback(struct netconn *conn); +- + extern struct pbuf *do_lwip_udp_get_from_sendring(struct lwip_sock *sock, uint16_t remain_size); + extern struct pbuf *do_lwip_tcp_get_from_sendring(struct lwip_sock *sock, uint16_t remain_size); + extern void do_lwip_get_from_sendring_over(struct lwip_sock *sock); +@@ -165,9 +178,6 @@ struct lwip_sock { + /* app thread use */ + struct pbuf *recv_lastdata; /* unread data in one pbuf */ + uint16_t remain_len; +- uint32_t epoll_events; /* registered events, EPOLLONESHOT write frequently */ +- volatile uint32_t events; /* available events */ +- struct list_node event_list; + + char pad1 __rte_cache_aligned; + /* app and stack thread all use */ +@@ -181,11 +191,16 @@ struct lwip_sock { + char pad3 __rte_cache_aligned; + /* nerver change */ + enum posix_type type; ++ int stack_id; ++ bool affinity_numa; ++ + struct lwip_sock *listen_next; /* listenfd list */ +- struct protocol_stack *stack; +- struct wakeup_poll *wakeup; +- struct wakeup_poll *recv_block; +- epoll_data_t ep_data; ++ ++ /* Cannot support the same sock being waited by both epoll/poll/select or multiple epollfd. 
*/ ++ struct sock_wait *sk_wait; /* for epoll/poll/select and blocking recv/accept/connect */ ++ ++ struct sock_time_stamp stamp; ++ + struct rte_ring *recv_ring; + struct rte_ring *send_ring; + +@@ -197,11 +212,22 @@ struct lwip_sock { + const struct rte_memzone *same_node_tx_ring_mz; + #endif /* GAZELLE_SAME_NODE */ + +- uint8_t already_bind_numa; ++ struct sock_event { ++ epoll_data_t ep_data; /* User data variable */ ++ unsigned events; /* requested events, EPOLLONESHOT write frequently */ + +- struct sock_time_stamp stamp; ++#if SOCK_WAIT_BATCH_NOTIFY ++ unsigned stk_pending; ++ struct list_node stk_event_node; ++#endif /* SOCK_WAIT_BATCH_NOTIFY */ ++ ++ char pad0 __rte_cache_aligned; /* new cache line */ ++ ++ unsigned pending; /* returned events */ ++ struct list_node event_node; ++ } sk_event; + #endif /* GAZELLE_ENABLE */ +-}; ++} __rte_cache_aligned; + + #if GAZELLE_SO_SNDBUF + void netconn_set_sndbufsize(struct netconn *conn, tcpwnd_size_t sndbufsize); +diff --git a/src/include/lwipopts.h b/src/include/lwipopts.h +index 9338fcd..257dbb9 100644 +--- a/src/include/lwipopts.h ++++ b/src/include/lwipopts.h +@@ -93,16 +93,23 @@ + ---------- lwIP APIs options ---------- + --------------------------------------- + */ ++ ++/* Using gazelle-multiple-thread instead of tcpip-thread, ++ * And Avoid using sem or mutex in lwip. 
*/ + #define LWIP_TCPIP_CORE_LOCKING 1 ++#define LOCK_TCPIP_CORE() ++#define UNLOCK_TCPIP_CORE() + +-#define LWIP_TCPIP_TIMEOUT 0 ++#define LWIP_NETCONN_FULLDUPLEX 0 ++#define LWIP_NETCONN_SEM_PER_THREAD LWIP_NETCONN_FULLDUPLEX ++ ++#define SYS_LIGHTWEIGHT_PROT 0 + ++#define LWIP_TCPIP_TIMEOUT 0 + #define TCPIP_MBOX_SIZE (MEMP_NUM_TCPIP_MSG_API) + + #define LWIP_NETCONN 1 + +-#define LWIP_NETCONN_SEM_PER_THREAD 0 +- + #define LWIP_STATS 1 + + #define LWIP_STATS_DISPLAY 1 +@@ -137,7 +144,9 @@ + + #define MEMP_NUM_SYS_SEM (GAZELLE_MAX_CLIENTS + GAZELLE_RESERVED_CLIENTS) + +-#define MEMP_NUM_SYS_MBOX (GAZELLE_MAX_CLIENTS + GAZELLE_RESERVED_CLIENTS) ++#define MEMP_NUM_SYS_MUTEX (GAZELLE_MAX_CLIENTS + GAZELLE_RESERVED_CLIENTS) ++ ++#define MEMP_NUM_SYS_MBOX ((GAZELLE_MAX_CLIENTS + GAZELLE_RESERVED_CLIENTS) * 2) + + #define PBUF_POOL_SIZE (1024) + +@@ -244,6 +253,8 @@ + #define LWIP_WND_SCALE 1 + #define TCP_RCV_SCALE 6 + ++#define SOCK_WAIT_BATCH_NOTIFY 1 ++ + #define GAZELLE_TCP_LAST_SEG 1 + + #define GAZELLE_TCP_MAX_CONN_PER_THREAD 65535 +@@ -311,13 +322,13 @@ + ---------- Netif options ---------- + ------------------------------------ + */ ++#define LWIP_NETIF_LOOPBACK_MULTITHREADING 0 + #define LWIP_NETIF_LOOPBACK 1 + + #define ETHARP_SUPPORT_VLAN 1 + #define LWIP_VLAN_PCP 1 + #define VLAN_LEN 4 + +-#define LWIP_NETIF_LOOPBACK_MULTITHREADING 0 + + + /* +-- +2.33.0 + diff --git a/0184-socket-refactor-tcp-and-udp.patch b/0184-socket-refactor-tcp-and-udp.patch new file mode 100644 index 0000000..7682dc8 --- /dev/null +++ b/0184-socket-refactor-tcp-and-udp.patch @@ -0,0 +1,3394 @@ +From d6f2aa310698b288d16c4321fafca0c3646e6bfd Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Mon, 10 Mar 2025 00:12:24 +0800 +Subject: [PATCH 03/11] socket: refactor tcp and udp + +Signed-off-by: Lemmy Huang +--- + src/api/api_lib.c | 31 +- + src/api/api_msg.c | 64 +-- + src/api/netbuf.c | 37 ++ + src/api/sockets.c | 182 +++++++-- + src/api/sys_arch.c | 191 +++------ + src/core/init.c | 4 
+- + src/core/ipv4/etharp.c | 17 +- + src/core/ipv4/ip4.c | 14 +- + src/core/ipv4/ip4_frag.c | 63 ++- + src/core/ipv6/ip6.c | 14 +- + src/core/ipv6/ip6_frag.c | 48 ++- + src/core/netif.c | 34 +- + src/core/pbuf.c | 113 +++-- + src/core/tcp.c | 18 +- + src/core/tcp_out.c | 679 +++++++++++++++---------------- + src/core/udp.c | 27 +- + src/include/arch/sys_arch.h | 130 +++--- + src/include/lwip/api.h | 6 +- + src/include/lwip/netbuf.h | 8 + + src/include/lwip/netif.h | 16 +- + src/include/lwip/pbuf.h | 57 ++- + src/include/lwip/priv/tcp_priv.h | 10 + + src/include/lwip/sockets.h | 19 +- + src/include/lwip/tcp.h | 7 - + src/include/lwipgz_offload.h | 59 ++- + src/include/lwipgz_sock.h | 29 +- + src/include/lwipopts.h | 111 +++-- + 27 files changed, 1085 insertions(+), 903 deletions(-) + +diff --git a/src/api/api_lib.c b/src/api/api_lib.c +index 851c7cc..c17db5d 100644 +--- a/src/api/api_lib.c ++++ b/src/api/api_lib.c +@@ -75,6 +75,10 @@ + + #include + ++#if GAZELLE_ENABLE ++#include "lwipgz_sock.h" ++#endif ++ + #define API_MSG_VAR_REF(name) API_VAR_REF(name) + #define API_MSG_VAR_DECLARE(name) API_VAR_DECLARE(struct api_msg, name) + #define API_MSG_VAR_ALLOC(name) API_VAR_ALLOC(struct api_msg, MEMP_API_MSG, name, ERR_MEM) +@@ -655,19 +659,18 @@ netconn_recv_data(struct netconn *conn, void **new_buf, u8_t apiflags) + #if (LWIP_UDP || LWIP_RAW) + { + LWIP_ASSERT("buf != NULL", buf != NULL); +-#if GAZELLE_UDP_ENABLE +- len = ((struct pbuf *)buf)->tot_len; +-#else /* GAZELLE_UDP_ENABLE */ + len = netbuf_len((struct netbuf *)buf); +-#endif /* GAZELLE_UDP_ENABLE */ + } + #endif /* (LWIP_UDP || LWIP_RAW) */ + + #if LWIP_SO_RCVBUF + SYS_ARCH_DEC(conn->recv_avail, len); + #endif /* LWIP_SO_RCVBUF */ +- /* Register event with callback */ +- API_EVENT(conn, NETCONN_EVT_RCVMINUS, len); ++#if GAZELLE_ENABLE ++ if (!(apiflags & NETCONN_NOAUTORCVD)) ++#endif /* GAZELLE_ENABLE */ ++ /* Register event with callback */ ++ API_EVENT(conn, NETCONN_EVT_RCVMINUS, len); + + 
LWIP_DEBUGF(API_LIB_DEBUG, ("netconn_recv_data: received %p, len=%"U16_F"\n", buf, len)); + +@@ -763,8 +766,10 @@ handle_fin: + } + return ERR_RST; + } ++#if !GAZELLE_ENABLE + /* RX side is closed, so deallocate the recvmbox */ + netconn_close_shutdown(conn, NETCONN_SHUT_RD); ++#endif /* GAZELLE_ENABLE */ + /* Don' store ERR_CLSD as conn->err since we are only half-closed */ + return ERR_CLSD; + } +@@ -831,16 +836,6 @@ netconn_recv_udp_raw_netbuf(struct netconn *conn, struct netbuf **new_buf) + return netconn_recv_data(conn, (void **)new_buf, 0); + } + +-#if GAZELLE_UDP_ENABLE +-err_t +-netconn_recv_udp_raw_pbuf_flags(struct netconn *conn, struct pbuf **new_buf, u8_t apiflags) +-{ +- LWIP_ERROR("netconn_recv_udp_raw_pbuf: invalid conn", (conn != NULL) && +- NETCONNTYPE_GROUP(netconn_type(conn)) != NETCONN_TCP, return ERR_ARG;); +- return netconn_recv_data(conn, (void **)new_buf, apiflags); +-} +-#endif /* GAZELLE_UDP_ENABLE */ +- + /** + * Receive data (in form of a netbuf) from a UDP or RAW netconn + * +@@ -961,6 +956,10 @@ netconn_send(struct netconn *conn, struct netbuf *buf) + + LWIP_DEBUGF(API_LIB_DEBUG, ("netconn_send: sending %"U16_F" bytes\n", buf->p->tot_len)); + ++#if GAZELLE_ENABLE ++ lstack_calculate_aggregate(1, buf->p->tot_len); ++#endif ++ + API_MSG_VAR_ALLOC(msg); + API_MSG_VAR_REF(msg).conn = conn; + API_MSG_VAR_REF(msg).msg.b = buf; +diff --git a/src/api/api_msg.c b/src/api/api_msg.c +index a979375..bbde5a5 100644 +--- a/src/api/api_msg.c ++++ b/src/api/api_msg.c +@@ -251,6 +251,7 @@ recv_udp(void *arg, struct udp_pcb *pcb, struct pbuf *p, + struct netbuf *buf; + struct netconn *conn; + u16_t len; ++ err_t err; + #if LWIP_SO_RCVBUF + int recv_avail; + #endif /* LWIP_SO_RCVBUF */ +@@ -278,21 +279,16 @@ recv_udp(void *arg, struct udp_pcb *pcb, struct pbuf *p, + return; + } + +-#if GAZELLE_UDP_ENABLE +- LWIP_UNUSED_ARG(buf); +- ip_addr_set(&p->addr, addr); +- p->port = port; +- len = p->tot_len; +- if (sys_mbox_trypost(&conn->recvmbox, p) != ERR_OK) 
{ +- pbuf_free(p); +- return; +-#else /* GAZELLE_UDP_ENABLE */ +- err_t err; ++#if !GAZELLE_UDP_ENABLE + buf = (struct netbuf *)memp_malloc(MEMP_NETBUF); + if (buf == NULL) { + pbuf_free(p); + return; +- } else { ++ } else ++#else /* GAZELLE_UDP_ENABLE */ ++ buf = pbuf_to_netbuf(p); ++#endif /* GAZELLE_UDP_ENABLE */ ++ { + buf->p = p; + buf->ptr = p; + ip_addr_set(&buf->addr, addr); +@@ -323,14 +319,14 @@ recv_udp(void *arg, struct udp_pcb *pcb, struct pbuf *p, + netbuf_delete(buf); + LWIP_DEBUGF(API_MSG_DEBUG, ("recv_udp: sys_mbox_trypost failed, err=%d\n", err)); + return; +-#endif /* GAZELLE_UDP_ENABLE */ + } else { + #if LWIP_SO_RCVBUF + SYS_ARCH_INC(conn->recv_avail, len); + #endif /* LWIP_SO_RCVBUF */ ++ + #if GAZELLE_UDP_ENABLE ++ lstack_calculate_aggregate(0, len); + time_stamp_record(conn->callback_arg.socket, p); +- do_lwip_add_recvlist(conn->callback_arg.socket); + #endif /* GAZELLE_UDP_ENABLE */ + /* Register event with callback */ + API_EVENT(conn, NETCONN_EVT_RCVPLUS, len); +@@ -392,9 +388,12 @@ recv_tcp(void *arg, struct tcp_pcb *pcb, struct pbuf *p, err_t err) + SYS_ARCH_INC(conn->recv_avail, len); + #endif /* LWIP_SO_RCVBUF */ + #if GAZELLE_ENABLE ++ lstack_calculate_aggregate(0, len); + time_stamp_record(conn->callback_arg.socket, p); +- do_lwip_add_recvlist(conn->callback_arg.socket); +-#endif ++#if !TCP_RECV_AND_UPDATE ++ tcp_recved(conn->pcb.tcp, len); ++#endif /* TCP_RECV_AND_UPDATE */ ++#endif /* GAZELLE_ENABLE */ + /* Register event with callback */ + API_EVENT(conn, NETCONN_EVT_RCVPLUS, len); + } +@@ -519,9 +518,6 @@ err_tcp(void *arg, err_t err) + if (NETCONN_MBOX_VALID(conn, &conn->recvmbox)) { + /* use trypost to prevent deadlock */ + sys_mbox_trypost(&conn->recvmbox, mbox_msg); +-#if GAZELLE_ENABLE +- do_lwip_add_recvlist(conn->callback_arg.socket); +-#endif + } + /* pass error message to acceptmbox to wake up pending accept */ + if (NETCONN_MBOX_VALID(conn, &conn->acceptmbox)) { +@@ -809,9 +805,21 @@ netconn_alloc(enum netconn_type t, 
netconn_callback callback) + goto free_and_return; + } + ++#if !GAZELLE_ENABLE + if (sys_mbox_new(&conn->recvmbox, size) != ERR_OK) { + goto free_and_return; + } ++#else /* GAZELLE_ENABLE */ ++ int flags = NETCONNTYPE_GROUP(t) == NETCONN_TCP ? MBOX_FLAG_TCP : MBOX_FLAG_UDP; ++ if (sys_mbox_new_flags(&conn->recvmbox, size, flags | MBOX_FLAG_RECV) != ERR_OK) { ++ goto free_and_return; ++ } ++ if (sys_mbox_new_flags(&conn->sendmbox, DEFAULT_SENDMBOX_SIZE, flags | MBOX_FLAG_SEND) != ERR_OK) { ++ sys_mbox_free(&conn->recvmbox); ++ goto free_and_return; ++ } ++#endif /* GAZELLE_ENABLE */ ++ + #if !LWIP_NETCONN_SEM_PER_THREAD + if (sys_sem_new(&conn->op_completed, 0) != ERR_OK) { + sys_mbox_free(&conn->recvmbox); +@@ -1448,6 +1456,9 @@ lwip_netconn_do_connect(void *m) + msg->msg.bc.port, lwip_netconn_do_connected); + if (err == ERR_OK) { + u8_t non_blocking = netconn_is_nonblocking(msg->conn); ++#if GAZELLE_ENABLE ++ non_blocking = 1; /* Not blocking here! */ ++#endif /* GAZELLE_ENABLE */ + msg->conn->state = NETCONN_CONNECT; + SET_NONBLOCKING_CONNECT(msg->conn, non_blocking); + if (non_blocking) { +@@ -1558,7 +1569,11 @@ lwip_netconn_do_listen(void *m) + } + err = ERR_OK; + if (!sys_mbox_valid(&msg->conn->acceptmbox)) { ++#if !GAZELLE_ENABLE + err = sys_mbox_new(&msg->conn->acceptmbox, DEFAULT_ACCEPTMBOX_SIZE); ++#else /* GAZELLE_ENABLE */ ++ err = sys_mbox_new_flags(&msg->conn->acceptmbox, DEFAULT_ACCEPTMBOX_SIZE, MBOX_FLAG_TCP); ++#endif /* GAZELLE_ENABLE */ + } + if (err == ERR_OK) { + msg->conn->state = NETCONN_LISTEN; +@@ -1777,20 +1792,7 @@ lwip_netconn_do_writemore(struct netconn *conn WRITE_DELAYED_PARAM) + } else { + write_more = 0; + } +-#if GAZELLE_ENABLE +- if (netif_is_rtc_mode(netif_default)) { +- err = tcp_write(conn->pcb.tcp, dataptr, len, apiflags); +- } else { +- /* vector->ptr is private arg sock */ +- LWIP_UNUSED_ARG(dataptr); +- write_more = 0; +- err = tcp_write_from_stack(conn->pcb.tcp, conn->current_msg->msg.w.vector->ptr, len, apiflags); +- 
conn->current_msg->msg.w.len = len; +- } +- conn->pcb.tcp->need_tso_send = 1; +-#else + err = tcp_write(conn->pcb.tcp, dataptr, len, apiflags); +-#endif + if (err == ERR_OK) { + conn->current_msg->msg.w.offset += len; + conn->current_msg->msg.w.vector_off += len; +diff --git a/src/api/netbuf.c b/src/api/netbuf.c +index 8f5be9e..665cf6c 100644 +--- a/src/api/netbuf.c ++++ b/src/api/netbuf.c +@@ -81,11 +81,19 @@ void + netbuf_delete(struct netbuf *buf) + { + if (buf != NULL) { ++#if !GAZELLE_ENABLE + if (buf->p != NULL) { + pbuf_free(buf->p); + buf->p = buf->ptr = NULL; + } + memp_free(MEMP_NETBUF, buf); ++#else ++ struct pbuf *p = buf->p; ++ if (p != NULL) { ++ buf->p = buf->ptr = NULL; ++ pbuf_free(p); ++ } ++#endif /* GAZELLE_ENABLE */ + } + } + +@@ -127,15 +135,44 @@ void + netbuf_free(struct netbuf *buf) + { + LWIP_ERROR("netbuf_free: invalid buf", (buf != NULL), return;); ++#if LWIP_CHECKSUM_ON_COPY ++ buf->flags = 0; ++ buf->toport_chksum = 0; ++#endif /* LWIP_CHECKSUM_ON_COPY */ + if (buf->p != NULL) { + pbuf_free(buf->p); + } ++#if !GAZELLE_ENABLE + buf->p = buf->ptr = NULL; ++#endif /* GAZELLE_ENABLE */ ++} ++ ++#if GAZELLE_UDP_ENABLE ++struct netbuf * ++netbuf_create(struct pbuf *p) ++{ ++ RTE_BUILD_BUG_ON(sizeof(struct netbuf) > GAZELLE_SIZEOF_NETBUF); ++ struct netbuf *buf = pbuf_to_netbuf(p); ++ struct pbuf *t = pbuf_list_tail(p); ++ buf->tail = t; ++ ++ buf->p = buf->ptr = p; + #if LWIP_CHECKSUM_ON_COPY + buf->flags = 0; + buf->toport_chksum = 0; + #endif /* LWIP_CHECKSUM_ON_COPY */ ++ ++ return buf; ++} ++ ++void ++netbuf_chain_pbuf(struct netbuf *buf, struct pbuf *p) ++{ ++ struct pbuf *t = pbuf_list_tail(p); ++ buf->tail->next = p; ++ buf->tail = t; + } ++#endif /* GAZELLE_UDP_ENABLE */ + + /** + * @ingroup netbuf +diff --git a/src/api/sockets.c b/src/api/sockets.c +index b78b1f3..fa32476 100644 +--- a/src/api/sockets.c ++++ b/src/api/sockets.c +@@ -999,15 +999,6 @@ lwip_recv_tcp(struct lwip_sock *sock, void *mem, size_t len, int flags) + apiflags 
|= NETCONN_DONTBLOCK; + } + +-#if GAZELLE_ENABLE +- if (!netif_is_rtc_mode(netif_default)) { +- LWIP_UNUSED_ARG(recv_left); +- recvd = do_lwip_read_from_lwip(sock, flags, apiflags); +- if (recvd <= 0) { +- return recvd; +- } +- } else { +-#endif + do { + struct pbuf *p; + err_t err; +@@ -1086,15 +1077,17 @@ lwip_recv_tcp(struct lwip_sock *sock, void *mem, size_t len, int flags) + apiflags |= NETCONN_DONTBLOCK | NETCONN_NOFIN; + /* @todo: do we need to support peeking more than one pbuf? */ + } while ((recv_left > 0) && !(flags & MSG_PEEK)); +-#if GAZELLE_ENABLE +- } +-#endif /* GAZELLE_ENABLE */ ++ + lwip_recv_tcp_done: + if (apiflags & NETCONN_NOAUTORCVD) { ++#if !GAZELLE_ENABLE + if ((recvd > 0) && !(flags & MSG_PEEK)) { + /* ensure window update after copying all data */ + netconn_tcp_recvd(sock->conn, (size_t)recvd); + } ++#else /* GAZELLE_ENABLE */ ++ API_EVENT(sock->conn, NETCONN_EVT_RCVMINUS, recvd); ++#endif /* GAZELLE_ENABLE */ + } + set_errno(0); + return recvd; +@@ -1102,7 +1095,7 @@ lwip_recv_tcp_done: + #endif + + /* Convert a netbuf's address data to struct sockaddr */ +-int ++static int + lwip_sock_make_addr(struct netconn *conn, ip_addr_t *fromaddr, u16_t port, + struct sockaddr *from, socklen_t *fromlen) + { +@@ -1162,12 +1155,40 @@ lwip_recv_tcp_from(struct lwip_sock *sock, struct sockaddr *from, socklen_t *fro + } + return 0; + } ++ ++#if GAZELLE_ENABLE ++void lwip_tcp_recvd(struct netconn *conn, size_t recvd, int flags) ++{ ++ LWIP_DEBUGF(SOCKETS_DEBUG, ("lwip_tcp_recvd(conn=%p, recvd=%"SZT_F", flags=0x%x)\n", ++ conn, recvd, flags)); ++ ++ if ((recvd > 0) && !(flags & MSG_PEEK)) { ++ /* ensure window update after copying all data */ ++ netconn_tcp_recvd(conn, (size_t)recvd); ++ } ++} ++ ++int lwip_tcp_recv_from(struct netconn *conn, struct sockaddr *from, socklen_t *fromlen, int dbg_ret) ++{ ++ if (from && fromlen) { ++ /* get remote addr/port from tcp_pcb */ ++ u16_t port; ++ ip_addr_t tmpaddr; ++ netconn_getaddr(conn, &tmpaddr, &port, 0); ++ 
ip_addr_debug_print_val(SOCKETS_DEBUG, tmpaddr); ++ LWIP_DEBUGF(SOCKETS_DEBUG, (" port=%"U16_F" len=%d\n", port, dbg_ret)); ++ return lwip_sock_make_addr(conn, &tmpaddr, port, from, fromlen); ++ } ++ return 0; ++} ++#endif /* GAZELLE_ENABLE */ ++ + #endif + + /* Helper function to receive a netbuf from a udp or raw netconn. + * Keeps sock->lastdata for peeking. + */ +-static err_t ++err_t + lwip_recvfrom_udp_raw(struct lwip_sock *sock, int flags, struct msghdr *msg, u16_t *datagram_len, int dbg_s) + { + struct netbuf *buf; +@@ -1185,7 +1206,6 @@ lwip_recvfrom_udp_raw(struct lwip_sock *sock, int flags, struct msghdr *msg, u16 + apiflags = 0; + } + +-#if !GAZELLE_UDP_ENABLE + LWIP_DEBUGF(SOCKETS_DEBUG, ("lwip_recvfrom_udp_raw[UDP/RAW]: top sock->lastdata=%p\n", (void *)sock->lastdata.netbuf)); + /* Check if there is data left from the last recv operation. */ + buf = sock->lastdata.netbuf; +@@ -1273,18 +1293,6 @@ lwip_recvfrom_udp_raw(struct lwip_sock *sock, int flags, struct msghdr *msg, u16 + sock->lastdata.netbuf = NULL; + netbuf_delete(buf); + } +-#else /* GAZELLE_UDP_ENABLE */ +- LWIP_UNUSED_ARG(copylen); +- LWIP_UNUSED_ARG(buf); +- LWIP_UNUSED_ARG(err); +- LWIP_UNUSED_ARG(copied); +- LWIP_UNUSED_ARG(i); +- buflen = do_lwip_read_from_lwip(sock, flags, apiflags); +- if (buflen < 0) { +- return ERR_BUF; +- } +- +-#endif /* GAZELLE_UDP_ENABLE */ + if (datagram_len) { + *datagram_len = buflen; + } +@@ -1751,13 +1759,7 @@ lwip_sendto(int s, const void *data, size_t size, int flags, + /* make the buffer point to the data that should be sent */ + #if LWIP_NETIF_TX_SINGLE_PBUF + /* Allocate a new netbuf and copy the data into it. */ +-#if GAZELLE_ENABLE +- /* In the gazelle scenario, only use buf.p->payload and buf.p->tot_len, and +- buf.p->payload stores the sock pointer info. 
Don't need to alloc short_size */ +- if (netbuf_alloc(&buf, 0) == NULL) { +-#else + if (netbuf_alloc(&buf, short_size) == NULL) { +-#endif + err = ERR_MEM; + } else { + #if LWIP_CHECKSUM_ON_COPY +@@ -1767,14 +1769,7 @@ lwip_sendto(int s, const void *data, size_t size, int flags, + } else + #endif /* LWIP_CHECKSUM_ON_COPY */ + { +-#if GAZELLE_ENABLE +- /* In the gazelle scenario, the payload is stored in send_ring, +- and the payload stores the sock pointer information. */ +- buf.p->payload = (void *)sock; +- buf.p->tot_len = short_size; +-#else + MEMCPY(buf.p->payload, data, short_size); +-#endif + } + err = ERR_OK; + } +@@ -1802,6 +1797,113 @@ lwip_sendto(int s, const void *data, size_t size, int flags, + return (err == ERR_OK ? short_size : -1); + } + ++#if GAZELLE_ENABLE ++void lwip_sendto_netbuf(struct netconn *conn, struct netbuf *buf, ++ const struct sockaddr *to, socklen_t tolen) ++{ ++ u16_t remote_port; ++ ++ if (to) { ++ SOCKADDR_TO_IPADDR_PORT(to, &buf->addr, remote_port); ++ } else { ++ remote_port = 0; ++ ip_addr_set_any(NETCONNTYPE_ISIPV6(netconn_type(conn)), &buf->addr); ++ } ++ netbuf_fromport(buf) = remote_port; ++ ++ ip_addr_debug_print_val(SOCKETS_DEBUG, buf->addr); ++ LWIP_DEBUGF(SOCKETS_DEBUG, (" port=%"U16_F"\n", remote_port)); ++ ++#if LWIP_IPV4 && LWIP_IPV6 ++ /* Dual-stack: Unmap IPv4 mapped IPv6 addresses */ ++ if (IP_IS_V6_VAL(buf->addr) && ip6_addr_isipv4mappedipv6(ip_2_ip6(&buf->addr))) { ++ unmap_ipv4_mapped_ipv6(ip_2_ip4(&buf->addr), ip_2_ip6(&buf->addr)); ++ IP_SET_TYPE_VAL(buf->addr, IPADDR_TYPE_V4); ++ } ++#endif /* LWIP_IPV4 && LWIP_IPV6 */ ++} ++ ++ssize_t lwip_recvmsg_check(const struct lwip_sock *sock, const struct msghdr *message, int flags) ++{ ++ msg_iovlen_t i; ++ ssize_t buflen; ++ ++ LWIP_UNUSED_ARG(sock); ++ ++ LWIP_ERROR("lwip_recvmsg: invalid message pointer", message != NULL, return ERR_ARG;); ++ LWIP_ERROR("lwip_recvmsg: unsupported flags", (flags & ~(MSG_PEEK|MSG_DONTWAIT)) == 0, ++ set_errno(EOPNOTSUPP); return 
-1;); ++ ++ if ((message->msg_iovlen <= 0) || (message->msg_iovlen > IOV_MAX)) { ++ set_errno(EMSGSIZE); ++ return -1; ++ } ++ ++ /* check for valid vectors */ ++ buflen = 0; ++ for (i = 0; i < message->msg_iovlen; i++) { ++ /* msg_iov[i].iov_len == 0 dont return ERRVAL ++ * According to the Single Unix Specification we should return EINVAL if an elment length is < 0 ++ * when cast to ssize_t ++ */ ++ if ((message->msg_iov[i].iov_base == NULL) || ((ssize_t)message->msg_iov[i].iov_len < 0) || ++ ((size_t)(ssize_t)message->msg_iov[i].iov_len != message->msg_iov[i].iov_len) || ++ ((ssize_t)(buflen + (ssize_t)message->msg_iov[i].iov_len) < 0)) { ++ set_errno(err_to_errno(ERR_VAL)); ++ return -1; ++ } ++ buflen = (ssize_t)(buflen + (ssize_t)message->msg_iov[i].iov_len); ++ } ++ ++ return buflen; ++} ++ ++ssize_t lwip_sendmsg_check(const struct lwip_sock *sock, const struct msghdr *msg, int flags) ++{ ++ size_t size; ++ int i; ++ ++ LWIP_ERROR("lwip_sendmsg: invalid msghdr", msg != NULL, ++ set_errno(err_to_errno(ERR_ARG)); done_socket(sock); return -1;); ++ LWIP_ERROR("lwip_sendmsg: invalid msghdr iov", msg->msg_iov != NULL, ++ set_errno(err_to_errno(ERR_ARG)); done_socket(sock); return -1;); ++ LWIP_ERROR("lwip_sendmsg: maximum iovs exceeded", (msg->msg_iovlen > 0) && (msg->msg_iovlen <= IOV_MAX), ++ set_errno(EMSGSIZE); done_socket(sock); return -1;); ++ LWIP_ERROR("lwip_sendmsg: unsupported flags", (flags & ~(MSG_DONTWAIT | MSG_MORE)) == 0, ++ set_errno(EOPNOTSUPP); done_socket(sock); return -1;); ++ ++ LWIP_UNUSED_ARG(msg->msg_control); ++ LWIP_UNUSED_ARG(msg->msg_controllen); ++ LWIP_UNUSED_ARG(msg->msg_flags); ++ ++ /* sum up the total size */ ++ size = 0; ++ for (i = 0; i < msg->msg_iovlen; i++) { ++ size += msg->msg_iov[i].iov_len; ++ if ((msg->msg_iov[i].iov_len > INT_MAX) || (size < (int)msg->msg_iov[i].iov_len)) { ++ /* overflow */ ++ set_errno(EMSGSIZE); ++ return -1; ++ } ++ } ++ ++ if (NETCONNTYPE_GROUP(netconn_type(sock->conn)) == NETCONN_UDP) { ++ 
LWIP_ERROR("lwip_sendmsg: invalid msghdr name", (((msg->msg_name == NULL) && (msg->msg_namelen == 0)) || ++ IS_SOCK_ADDR_LEN_VALID(msg->msg_namelen)), ++ set_errno(err_to_errno(ERR_ARG)); done_socket(sock); return -1;); ++ ++ if (size > 0xFFFF) { ++ /* overflow */ ++ set_errno(EMSGSIZE); ++ return -1; ++ } ++ } ++ ++ return size; ++} ++#endif /* GAZELLE_ENABLE */ ++ ++ + int + lwip_socket(int domain, int type, int protocol) + { +diff --git a/src/api/sys_arch.c b/src/api/sys_arch.c +index f242154..b2d21f8 100644 +--- a/src/api/sys_arch.c ++++ b/src/api/sys_arch.c +@@ -47,12 +47,23 @@ + #include "lwip/timeouts.h" + #include "lwipgz_sock.h" + +-#define MBOX_NAME_PREFIX "_mbox_0x" +-#define MAX_MBOX_NAME_LEN (sizeof(MBOX_NAME_PREFIX) + 32) // log(UINT64_MAX) < 32 ++static struct sys_config g_sys_config_val = {0}; ++const struct sys_config * const g_sys_config = &g_sys_config_val; + + static u64_t g_sys_cycles_per_ms = 0; + static u64_t g_sys_cycles_per_us = 0; + ++struct mbox_ring_ops g_mbox_rtc_default_ops = {0}; ++struct mbox_ring_ops g_mbox_rtw_default_ops = {0}; ++struct mbox_ring_ops g_mbox_rtw_append_ops = {0}; ++struct mbox_ring_ops g_mbox_rtw_peek_ops = {0}; ++const struct mbox_ring_ops *g_mbox_default_ops = NULL; ++ ++void sys_config_init(const struct sys_config *conf) ++{ ++ g_sys_config_val = *conf; ++} ++ + /* + * Timer + * */ +@@ -117,7 +128,7 @@ int thread_create(const char *name, unsigned id, thread_fn func, void *arg) + return ret; + } + +- SYS_FORMAT_NAME(thread_name, sizeof(thread_name), "%s_%02u", name, id); ++ SYS_FORMAT_NAME(thread_name, sizeof(thread_name), "%s%02u", name, id); + ret = pthread_setname_np(tid, thread_name); + if (ret != 0) { + LWIP_DEBUGF(SYS_DEBUG | LWIPGZ_LOG_WARNING, ("thread_create: pthread_setname_np %s failed\n", thread_name)); +@@ -261,170 +272,94 @@ void sys_arch_unprotect(sys_prot_t pval) + { + } + +-extern int eth_dev_poll(void); + /* + * Mailbox + * */ +-static int mbox_wait_func(void) +-{ +-#if LWIP_TIMERS +- 
sys_timer_run(); +-#endif /* LWIP_TIMER */ +- return eth_dev_poll(); +-} +- +-struct rte_ring *gazelle_ring_create_fast(const char *name, uint32_t size, uint32_t flags) ++err_t sys_mbox_new_flags(sys_mbox_t *mbox, int size, int flags) + { +- ssize_t ring_size; +- char ring_name[RTE_MEMZONE_NAMESIZE] = {0}; +- struct rte_ring *ring; +- +- ring_size = rte_ring_get_memsize(size); +- if (ring_size < 0) { +- RTE_LOG(ERR, EAL, "rte_ring_get_memszie failed\n"); +- return NULL; +- } ++ sys_mbox_t mb; ++ err_t ret; + +- /* +- * rte_ring_create is not used because it calls memzone_lookup_thread_unsafe function +- * time consuming when there are many rings +- */ +- ring = rte_malloc_socket(NULL, ring_size, RTE_CACHE_LINE_SIZE, rte_socket_id()); +- if (ring == NULL) { +- RTE_LOG(ERR, EAL, "cannot create rte_ring for mbox\n"); +- return NULL; +- } +- +- if (snprintf(ring_name, sizeof(ring_name), "%s""%"PRIXPTR, name, (uintptr_t)ring) < 0) { +- rte_free(ring); +- RTE_LOG(ERR, EAL, "snprintf failed\n"); +- return NULL; ++ mb = (sys_mbox_t)memp_malloc(MEMP_SYS_MBOX); ++ if (mb == NULL) { ++ return ERR_MEM; + } ++ memset(mb, 0, sizeof(*mb)); + +- if (rte_ring_init(ring, ring_name, size, flags) != 0) { +- rte_free(ring); +- RTE_LOG(ERR, EAL, "cannot init rte_ring for mbox\n"); +- return NULL; ++ SYS_FORMAT_NAME(mb->name, sizeof(mb->name), "mbox_%p", mb); ++ mb->mring.flags = flags; ++ ret = g_mbox_default_ops->create(&mb->mring, mb->name, size); ++ if (ret != 0) { ++ sys_mbox_free(&mb); ++ return ERR_MEM; + } + +- return ring; ++ *mbox = mb; ++ return ERR_OK; + } + +-void gazelle_ring_free_fast(struct rte_ring *ring) ++err_t sys_mbox_new(sys_mbox_t *mbox, int size) + { +- rte_free(ring); ++ return sys_mbox_new_flags(mbox, size, 0); + } + +-err_t sys_mbox_new(struct sys_mbox **mb, int size) ++void sys_mbox_free(sys_mbox_t *mbox) + { +- struct sys_mbox *mbox; +- +- mbox = (struct sys_mbox *)memp_malloc(MEMP_SYS_MBOX); +- if (mbox == NULL) { +- return ERR_MEM; +- } +- +- mbox->flags = 
RING_F_SP_ENQ | RING_F_SC_DEQ; +- mbox->size = size; +- mbox->socket_id = rte_socket_id(); ++ LWIP_ASSERT("sys_mbox_valid", sys_mbox_valid(mbox)); ++ sys_mbox_t mb = *mbox; + +- mbox->ring = gazelle_ring_create_fast(MBOX_NAME_PREFIX, mbox->size, mbox->flags); +- if (mbox->ring == NULL) { +- sys_mbox_free(&mbox); +- return ERR_MEM; +- } +- +- mbox->wait_fn = mbox_wait_func; +- *mb = mbox; +- +- return ERR_OK; +-} ++ g_mbox_default_ops->destroy(&mb->mring); + +-void sys_mbox_free(struct sys_mbox **mb) +-{ +- struct sys_mbox *mbox = *mb; +- if (mbox->ring != NULL) { +- gazelle_ring_free_fast(mbox->ring); +- mbox->ring = NULL; +- } +- memp_free(MEMP_SYS_MBOX, mbox); +- sys_mbox_set_invalid(mb); ++ memp_free(MEMP_SYS_MBOX, mb); ++ sys_mbox_set_invalid(mbox); + } + +-err_t sys_mbox_trypost(struct sys_mbox **mb, void *msg) ++err_t sys_mbox_trypost(sys_mbox_t *mbox, void *msg) + { +- unsigned int n; +- struct sys_mbox *mbox = *mb; ++ LWIP_ASSERT("sys_mbox_valid", sys_mbox_valid(mbox)); ++ struct mbox_ring *mr = &(*mbox)->mring; ++ unsigned ret; + +- n = gazelle_st_ring_enqueue_busrt(mbox->ring, &msg, 1); +- if (!n) +- return ERR_BUF; ++ ret = mr->ops->enqueue_burst(mr, &msg, 1); ++ if (unlikely(ret == 0)) { ++ LWIP_DEBUGF(SYS_DEBUG | LWIPGZ_LOG_ERR, ("sys_mbox_trypost failed, mbox %s\n", (*mbox)->name)); ++ return ERR_MEM; ++ } + return ERR_OK; + } + +-void sys_mbox_post(struct sys_mbox **mb, void *msg) ++err_t sys_mbox_trypost_fromisr(sys_mbox_t *mbox, void *msg) + { +- struct sys_mbox *mbox = *mb; +- +- /* NOTE: sys_mbox_post is used on mbox defined in src/api/tcpip.c. +- * If the ring size of mbox is greater than MEMP_NUM_TCPIP_MSG_API, +- * enqueue failure will never happen. 
+- * */ +- if (!gazelle_st_ring_enqueue_busrt(mbox->ring, &msg, 1)) { +- LWIP_ASSERT("It is failed to post msg into mbox", 0); +- } ++ return sys_mbox_trypost(mbox, msg); + } + +-err_t sys_mbox_trypost_fromisr(sys_mbox_t *q, void *msg) ++void sys_mbox_post(sys_mbox_t *mbox, void *msg) + { +- return sys_mbox_trypost(q, msg); ++ LWIP_ASSERT("sys_mbox_valid", sys_mbox_valid(mbox)); ++ ++ while (unlikely(sys_mbox_trypost(mbox, msg) != ERR_OK)) { } + } + +-uint32_t sys_arch_mbox_tryfetch(struct sys_mbox **mb, void **msg) ++u32_t sys_arch_mbox_tryfetch(sys_mbox_t *mbox, void **msg) + { +- unsigned int n; +- struct sys_mbox *mbox = *mb; ++ LWIP_ASSERT("sys_mbox_valid", sys_mbox_valid(mbox)); ++ struct mbox_ring *mr = &(*mbox)->mring; ++ unsigned ret; + +- n = gazelle_st_ring_dequeue_burst(mbox->ring, msg, 1); +- if (!n) { ++ ret = mr->ops->dequeue_burst(mr, msg, 1); ++ if (unlikely(ret == 0)) { + *msg = NULL; + return SYS_MBOX_EMPTY; + } +- +- return 0; ++ return ERR_OK; + } + +-uint32_t sys_arch_mbox_fetch(struct sys_mbox **mb, void **msg, uint32_t timeout) ++u32_t sys_arch_mbox_fetch(sys_mbox_t *mbox, void **msg, u32_t timeout) + { +- unsigned int n; +- uint32_t poll_ts = 0; +- uint32_t time_needed = 0; +- struct sys_mbox *mbox = *mb; +- +- n = gazelle_st_ring_dequeue_burst(mbox->ring, msg, 1); +- +- if (timeout > 0) +- poll_ts = sys_now(); +- +- while (!n) { +- if (timeout > 0) { +- time_needed = sys_now() - poll_ts; +- if (time_needed >= timeout) { +- return SYS_ARCH_TIMEOUT; +- } +- } +- +- (void)mbox->wait_fn(); +- +- n = gazelle_st_ring_dequeue_burst(mbox->ring, msg, 1); ++ if (sys_arch_mbox_tryfetch(mbox, msg) != ERR_OK) { ++ return SYS_ARCH_TIMEOUT; + } +- +- return time_needed; +-} +- +-int sys_mbox_empty(struct sys_mbox *mb) +-{ +- return rte_ring_count(mb->ring) == 0; ++ return ERR_OK; + } + + +diff --git a/src/core/init.c b/src/core/init.c +index e6cd9b6..8d208d4 100644 +--- a/src/core/init.c ++++ b/src/core/init.c +@@ -297,7 +297,7 @@ PACK_STRUCT_END + /* 
TCP sanity checks */ + #if !LWIP_DISABLE_TCP_SANITY_CHECKS + #if LWIP_TCP +-#if !MEMP_MEM_MALLOC && (MEMP_NUM_TCP_SEG < TCP_SND_QUEUELEN) ++#if !MEMP_MEM_MALLOC && (MEMP_NUM_TCP_SEG < TCP_SND_QUEUELEN) && !GAZELLE_ENABLE + #error "lwip_sanity_check: WARNING: MEMP_NUM_TCP_SEG should be at least as big as TCP_SND_QUEUELEN. If you know what you are doing, define LWIP_DISABLE_TCP_SANITY_CHECKS to 1 to disable this error." + #endif + #if TCP_SND_BUF < (2 * TCP_MSS) +@@ -312,9 +312,11 @@ PACK_STRUCT_END + #if TCP_MSS >= ((16 * 1024) - 1) + #error "lwip_sanity_check: WARNING: TCP_MSS must be <= 16382 to prevent u16_t underflow in TCP_SNDLOWAT calculation!" + #endif ++#if !LWIP_WND_SCALE + #if TCP_SNDLOWAT >= (0xFFFF - (4 * TCP_MSS)) + #error "lwip_sanity_check: WARNING: TCP_SNDLOWAT must at least be 4*MSS below u16_t overflow!" + #endif ++#endif /* LWIP_WND_SCALE */ + #if TCP_SNDQUEUELOWAT >= TCP_SND_QUEUELEN + #error "lwip_sanity_check: WARNING: TCP_SNDQUEUELOWAT must be less than TCP_SND_QUEUELEN. If you know what you are doing, define LWIP_DISABLE_TCP_SANITY_CHECKS to 1 to disable this error." 
+ #endif +diff --git a/src/core/ipv4/etharp.c b/src/core/ipv4/etharp.c +index 579aa08..a472596 100644 +--- a/src/core/ipv4/etharp.c ++++ b/src/core/ipv4/etharp.c +@@ -483,13 +483,6 @@ etharp_update_arp_entry(struct netif *netif, const ip4_addr_t *ipaddr, struct et + struct pbuf *p = arp_table[i].q; + arp_table[i].q = NULL; + #endif /* ARP_QUEUEING */ +-#if GAZELLE_ENABLE +- struct pbuf *tmp = p->next; +- while (tmp != NULL) { +- tmp->ref--; +- tmp = tmp->next; +- } +-#endif + /* send the queued IP packet */ + ethernet_output(netif, p, (struct eth_addr *)(netif->hwaddr), ethaddr, ETHTYPE_IP); + /* free the queued IP packet */ +@@ -1048,15 +1041,7 @@ etharp_query(struct netif *netif, const ip4_addr_t *ipaddr, struct pbuf *q) + } else { + /* referencing the old pbuf is enough */ + p = q; +-#if GAZELLE_ENABLE +- struct pbuf *tmp = p; +- while (tmp != NULL) { +- pbuf_ref(tmp); +- tmp = tmp->next; +- } +-#else +- pbuf_ref(p); +-#endif ++ pbuf_ref(p); + } + /* packet could be taken over? */ + if (p != NULL) { +diff --git a/src/core/ipv4/ip4.c b/src/core/ipv4/ip4.c +index ad8638e..f43f3db 100644 +--- a/src/core/ipv4/ip4.c ++++ b/src/core/ipv4/ip4.c +@@ -1070,15 +1070,13 @@ ip4_output_if_opt_src(struct pbuf *p, const ip4_addr_t *src, const ip4_addr_t *d + #endif /* ENABLE_LOOPBACK */ + #if IP_FRAG + /* don't fragment if interface has mtu set to 0 [loopif] */ +-#if GAZELLE_ENABLE +- if ((!(netif_get_txol_flags(netif) & RTE_ETH_TX_OFFLOAD_TCP_TSO)) || !(IPH_PROTO(iphdr) == IP_PROTO_TCP)) { +-#endif +- if (netif->mtu && (p->tot_len > netif->mtu)) { +- return ip4_frag(p, netif, dest); +- } +-#if GAZELLE_ENABLE ++#if OFFLOAD_TX_TCP_TSO || OFFLOAD_TX_UDP_TSO ++ if ( !((IPH_PROTO(iphdr) == IP_PROTO_TCP) && (netif_get_txol_flags(netif) & RTE_ETH_TX_OFFLOAD_TCP_TSO)) && ++ !((IPH_PROTO(iphdr) == IP_PROTO_UDP) && (netif_get_txol_flags(netif) & RTE_ETH_TX_OFFLOAD_UDP_TSO)) ) ++#endif /* OFFLOAD_TX_TCP_TSO || OFFLOAD_TX_UDP_TSO */ ++ if (netif->mtu && (p->tot_len > netif->mtu)) { ++ 
return ip4_frag(p, netif, dest); + } +-#endif + #endif /* IP_FRAG */ + #if GAZELLE_ENABLE + IP_STATS_INC(ip.tx_out); +diff --git a/src/core/ipv4/ip4_frag.c b/src/core/ipv4/ip4_frag.c +index c24f710..01dd0f3 100644 +--- a/src/core/ipv4/ip4_frag.c ++++ b/src/core/ipv4/ip4_frag.c +@@ -730,7 +730,6 @@ ip_frag_free_pbuf_custom_ref(struct pbuf_custom_ref *p) + + /** Free-callback function to free a 'struct pbuf_custom_ref', called by + * pbuf_free. */ +-#if !GAZELLE_ENABLE + static void + ipfrag_free_pbuf_custom(struct pbuf *p) + { +@@ -742,7 +741,6 @@ ipfrag_free_pbuf_custom(struct pbuf *p) + } + ip_frag_free_pbuf_custom_ref(pcr); + } +-#endif + #endif /* !LWIP_NETIF_TX_SINGLE_PBUF */ + + /** +@@ -768,7 +766,11 @@ ip4_frag(struct pbuf *p, struct netif *netif, const ip4_addr_t *dest) + #endif + struct ip_hdr *original_iphdr; + struct ip_hdr *iphdr; ++#if !GAZELLE_UDP_ENABLE + const u16_t nfb = (u16_t)((netif->mtu - IP_HLEN) / 8); ++#else /* GAZELLE_UDP_ENABLE */ ++ u16_t nfb = (u16_t)((netif->mtu - IP_HLEN) / 8); ++#endif /* GAZELLE_UDP_ENABLE */ + u16_t left, fragsize; + u16_t ofo; + int last; +@@ -792,19 +794,47 @@ ip4_frag(struct pbuf *p, struct netif *netif, const ip4_addr_t *dest) + + left = (u16_t)(p->tot_len - IP_HLEN); + ++#if GAZELLE_UDP_ENABLE ++ struct pbuf *q = p; ++ struct pbuf *rest_q; ++ if (IPH_PROTO(original_iphdr) == IP_PROTO_UDP) { ++ /* socket sender would free this pbuf head, avoid to free in here. 
*/ ++ pbuf_ref(p); ++ } ++#endif /* GAZELLE_UDP_ENABLE */ ++ + while (left) { + /* Fill this fragment */ + fragsize = LWIP_MIN(left, (u16_t)(nfb * 8)); + ++#if GAZELLE_UDP_ENABLE ++ /* the assertion must hold for non-TCP pbufs; it fails only when a TCP pbuf is found */ LWIP_ASSERT("ip4_frag find a tcp pbuf!", (IPH_PROTO(original_iphdr) != IP_PROTO_TCP)); ++ if (IPH_PROTO(original_iphdr) == IP_PROTO_UDP) { ++ pbuf_split_one(q, &rest_q); ++ rambuf = q; ++ q = rest_q; ++ ++ /* first pbuf already added header */ ++ if (rambuf != p) { ++ pbuf_add_header(rambuf, IP_HLEN); ++ /* fill in the IP header */ ++ SMEMCPY(rambuf->payload, original_iphdr, IP_HLEN); ++ } ++ fragsize = rambuf->len - IP_HLEN; ++ poff += fragsize; ++ iphdr = (struct ip_hdr *)rambuf->payload; ++ nfb = fragsize / 8; ++ ++ LWIP_DEBUGF(IP_REASS_DEBUG, ("ip4_frag: UDP p=%p, tot_len %u, fragsize %u, nfb %u\n", ++ rambuf, rambuf->tot_len, fragsize, nfb)); ++ } else ++#endif /* GAZELLE_UDP_ENABLE */ ++ { + #if LWIP_NETIF_TX_SINGLE_PBUF + rambuf = pbuf_alloc(PBUF_IP, fragsize, PBUF_RAM); + if (rambuf == NULL) { + goto memerr; + } +-#if GAZELLE_ENABLE +- /* transfer time stamp to new pbuf */ +- time_stamp_transfer_pbuf(p, rambuf); +-#endif + LWIP_ASSERT("this needs a pbuf in one piece!", + (rambuf->len == rambuf->tot_len) && (rambuf->next == NULL)); + poff += pbuf_copy_partial(p, rambuf->payload, fragsize, poff); +@@ -858,9 +888,7 @@ ip4_frag(struct pbuf *p, struct netif *netif, const ip4_addr_t *dest) + } + pbuf_ref(p); + pcr->original = p; +-#if !GAZELLE_ENABLE + pcr->pc.custom_free_function = ipfrag_free_pbuf_custom; +-#endif + + /* Add it to end of rambuf's chain, but using pbuf_cat, not pbuf_chain + * so that it is removed when pbuf_dechain is later called on rambuf. 
+@@ -874,6 +902,7 @@ ip4_frag(struct pbuf *p, struct netif *netif, const ip4_addr_t *dest) + } + poff = (u16_t)(poff + newpbuflen); + #endif /* LWIP_NETIF_TX_SINGLE_PBUF */ ++ } + + /* Correct header */ + last = (left <= netif->mtu - IP_HLEN); +@@ -890,17 +919,15 @@ ip4_frag(struct pbuf *p, struct netif *netif, const ip4_addr_t *dest) + #if CHECKSUM_GEN_IP + IF__NETIF_CHECKSUM_ENABLED(netif, NETIF_CHECKSUM_GEN_IP) { + #if OFFLOAD_CHECKSUM_GEN_IP +- if (netif_get_txol_flags(netif) & RTE_ETH_TX_OFFLOAD_IPV4_CKSUM) { +- ol_chksum_gen_ip(p, IP_HLEN, 1); +- ol_chksum_gen_ip(rambuf, IP_HLEN, 1); +- } else { +- ol_chksum_gen_ip(p, IP_HLEN, 0); +- IPH_CHKSUM_SET(iphdr, inet_chksum(iphdr, IP_HLEN)); +- } +- +-#else ++ if (netif_get_txol_flags(netif) & RTE_ETH_TX_OFFLOAD_IPV4_CKSUM) { ++ ol_chksum_gen_ip(rambuf, IP_HLEN, 1); ++ } else { ++ ol_chksum_gen_ip(rambuf, IP_HLEN, 0); ++ IPH_CHKSUM_SET(iphdr, inet_chksum(iphdr, IP_HLEN)); ++ } ++#else /* OFFLOAD_CHECKSUM_GEN_IP */ + IPH_CHKSUM_SET(iphdr, inet_chksum(iphdr, IP_HLEN)); +-#endif ++#endif /* OFFLOAD_CHECKSUM_GEN_IP */ + } + #endif /* CHECKSUM_GEN_IP */ + +diff --git a/src/core/ipv6/ip6.c b/src/core/ipv6/ip6.c +index 4866f25..db7b491 100644 +--- a/src/core/ipv6/ip6.c ++++ b/src/core/ipv6/ip6.c +@@ -1268,15 +1268,13 @@ ip6_output_if_src(struct pbuf *p, const ip6_addr_t *src, const ip6_addr_t *dest, + #endif /* ENABLE_LOOPBACK */ + #if LWIP_IPV6_FRAG + /* don't fragment if interface has mtu set to 0 [loopif] */ +-#if GAZELLE_ENABLE +- if (!(netif_get_txol_flags(netif) & RTE_ETH_TX_OFFLOAD_TCP_TSO)) { +-#endif +- if (netif_mtu6(netif) && (p->tot_len > nd6_get_destination_mtu(dest, netif))) { +- return ip6_frag(p, netif, dest); +- } +-#if GAZELLE_ENABLE ++#if OFFLOAD_TX_TCP_TSO || OFFLOAD_TX_UDP_TSO ++ if ( !((IP6H_NEXTH(ip6hdr) == IP6_NEXTH_TCP) && (netif_get_txol_flags(netif) & RTE_ETH_TX_OFFLOAD_TCP_TSO)) && ++ !((IP6H_NEXTH(ip6hdr) == IP6_NEXTH_UDP) && (netif_get_txol_flags(netif) & RTE_ETH_TX_OFFLOAD_UDP_TSO)) ) 
++#endif /* OFFLOAD_TX_TCP_TSO || OFFLOAD_TX_UDP_TSO */ ++ if (netif_mtu6(netif) && (p->tot_len > nd6_get_destination_mtu(dest, netif))) { ++ return ip6_frag(p, netif, dest); + } +-#endif + #endif /* LWIP_IPV6_FRAG */ + + LWIP_DEBUGF(IP6_DEBUG, ("netif->output_ip6()\n")); +diff --git a/src/core/ipv6/ip6_frag.c b/src/core/ipv6/ip6_frag.c +index 78bcb2a..a5eb620 100644 +--- a/src/core/ipv6/ip6_frag.c ++++ b/src/core/ipv6/ip6_frag.c +@@ -689,7 +689,6 @@ ip6_frag_free_pbuf_custom_ref(struct pbuf_custom_ref* p) + memp_free(MEMP_FRAG_PBUF, p); + } + +-#if !GAZELLE_ENABLE + /** Free-callback function to free a 'struct pbuf_custom_ref', called by + * pbuf_free. */ + static void +@@ -703,7 +702,6 @@ ip6_frag_free_pbuf_custom(struct pbuf *p) + } + ip6_frag_free_pbuf_custom_ref(pcr); + } +-#endif /* !GAZELLE_ENABLE */ + #endif /* !LWIP_NETIF_TX_SINGLE_PBUF */ + + /** +@@ -733,7 +731,11 @@ ip6_frag(struct pbuf *p, struct netif *netif, const ip6_addr_t *dest) + static u32_t identification; + u16_t left, cop; + const u16_t mtu = nd6_get_destination_mtu(dest, netif); ++#if !GAZELLE_UDP_ENABLE + const u16_t nfb = (u16_t)((mtu - (IP6_HLEN + IP6_FRAG_HLEN)) & IP6_FRAG_OFFSET_MASK); ++#else /* GAZELLE_UDP_ENABLE */ ++ u16_t nfb = (u16_t)((mtu - (IP6_HLEN + IP6_FRAG_HLEN)) & IP6_FRAG_OFFSET_MASK); ++#endif /* GAZELLE_UDP_ENABLE */ + u16_t fragment_offset = 0; + u16_t last; + u16_t poff = IP6_HLEN; +@@ -746,12 +748,51 @@ ip6_frag(struct pbuf *p, struct netif *netif, const ip6_addr_t *dest) + LWIP_ASSERT("p->tot_len >= IP6_HLEN", p->tot_len >= IP6_HLEN); + left = (u16_t)(p->tot_len - IP6_HLEN); + ++#if GAZELLE_UDP_ENABLE ++ struct pbuf *q = p; ++ struct pbuf *rest_q; ++ struct ip6hdr local_original_ip6hdr; ++ if (IP6H_NEXTH(original_ip6hdr) == IP6_NEXTH_UDP) { ++ /* socket sender would free this pbuf head, avoid to free in here. 
*/ ++ pbuf_ref(p); ++ ++ /* save ip6hdr */ ++ SMEMCPY(&local_original_ip6hdr, original_ip6hdr, IP6_HLEN); ++ original_ip6hdr = &local_original_ip6hdr; ++ /* remove IP6_HLEN, then add IP6_HLEN + IP6_FRAG_HLEN */ ++ pbuf_remove_header(p, IP6_HLEN); ++ left = (u16_t)(p->tot_len); ++ } ++#endif /* GAZELLE_UDP_ENABLE */ ++ + while (left) { + last = (left <= nfb); + + /* Fill this fragment */ + cop = last ? left : nfb; + ++#if GAZELLE_UDP_ENABLE ++ /* the assertion must hold for non-TCP pbufs; it fails only when a TCP pbuf is found */ LWIP_ASSERT("ip6_frag find a tcp pbuf!", (IP6H_NEXTH(original_ip6hdr) != IP6_NEXTH_TCP)); ++ if (IP6H_NEXTH(original_ip6hdr) == IP6_NEXTH_UDP) { ++ pbuf_split_one(q, &rest_q); ++ rambuf = q; ++ q = rest_q; ++ ++ pbuf_add_header(rambuf, IP6_HLEN + IP6_FRAG_HLEN); ++ /* fill in the IP header */ ++ SMEMCPY(rambuf->payload, original_ip6hdr, IP6_HLEN); ++ ++ cop = rambuf->len - IP6_HLEN - IP6_FRAG_HLEN; ++ poff += cop; ++ ip6hdr = (struct ip6hdr *)rambuf->payload; ++ frag_hdr = (struct ip6_frag_hdr *)((u8_t*)rambuf->payload + IP6_HLEN); ++ nfb = cop / 8; ++ ++ LWIP_DEBUGF(IP_REASS_DEBUG, ("ip6_frag: UDP p=%p, tot_len %u, fragsize %u, nfb %u\n", ++ rambuf, rambuf->tot_len, cop, nfb)); ++ } else ++#endif /* GAZELLE_UDP_ENABLE */ ++ { + #if LWIP_NETIF_TX_SINGLE_PBUF + rambuf = pbuf_alloc(PBUF_IP, cop + IP6_FRAG_HLEN, PBUF_RAM); + if (rambuf == NULL) { +@@ -818,9 +859,7 @@ ip6_frag(struct pbuf *p, struct netif *netif, const ip6_addr_t *dest) + } + pbuf_ref(p); + pcr->original = p; +-#if !GAZELLE_ENABLE + pcr->pc.custom_free_function = ip6_frag_free_pbuf_custom; +-#endif /* !GAZELLE_ENABLE */ + + /* Add it to end of rambuf's chain, but using pbuf_cat, not pbuf_chain + * so that it is removed when pbuf_dechain is later called on rambuf. 
+@@ -833,6 +872,7 @@ ip6_frag(struct pbuf *p, struct netif *netif, const ip6_addr_t *dest) + } + poff = newpbuflen; + #endif /* LWIP_NETIF_TX_SINGLE_PBUF */ ++ } + + /* Set headers */ + frag_hdr->_nexth = original_ip6hdr->_nexth; +diff --git a/src/core/netif.c b/src/core/netif.c +index b79ab57..94cde99 100644 +--- a/src/core/netif.c ++++ b/src/core/netif.c +@@ -1101,13 +1101,6 @@ netif_set_link_down(struct netif *netif) + + #if GAZELLE_ENABLE + void +-netif_set_rtc_mode(struct netif *netif) +-{ +- if (!(netif->flags & NETIF_FLAG_RTC_MODE)) { +- netif_set_flags(netif, NETIF_FLAG_RTC_MODE); +- } +-} +-void + netif_set_rxol_flags(struct netif *netif, u64_t flags) + { + netif->rxol_flags |= flags; +@@ -1117,16 +1110,10 @@ netif_set_txol_flags(struct netif *netif, u64_t flags) + { + netif->txol_flags |= flags; + } +- + void netif_set_max_pbuf_frags(struct netif *netif, u8_t max_frags) + { + netif->max_pbuf_frags = max_frags; + } +- +-void netif_set_min_tso_seglen(struct netif *netif, u16_t min_tso_seglen) +-{ +- netif->min_tso_seglen = min_tso_seglen; +-} + #endif + + #if LWIP_NETIF_LINK_CALLBACK +@@ -1192,22 +1179,13 @@ udp_netif_loop_output(struct netif *netif, struct pbuf *p) + LWIP_ASSERT("netif_loop_output: invalid pbuf", p != NULL); + + /* Allocate a new pbuf */ +- u16_t p_clen = pbuf_clen(p); +- struct pbuf *temp[p_clen]; +- for (int i = 0; i < p_clen; i++) { +- temp[i] = pbuf_alloc(PBUF_LINK, p->len, PBUF_RAM); +- if (temp[i] == NULL) { +- LINK_STATS_INC(link.memerr); +- LINK_STATS_INC(link.drop); +- MIB2_STATS_NETIF_INC(stats_if, ifoutdiscards); +- pbuf_free(temp[0]); +- return ERR_MEM; +- } +- if (i > 0) { +- pbuf_cat(temp[0], temp[i]); +- } ++ r = pbuf_alloc(PBUF_LINK, p->tot_len, PBUF_RAM); ++ if (r == NULL) { ++ LINK_STATS_INC(link.memerr); ++ LINK_STATS_INC(link.drop); ++ MIB2_STATS_NETIF_INC(stats_if, ifoutdiscards); ++ return ERR_MEM; + } +- r = temp[0]; + #if LWIP_LOOPBACK_MAX_PBUFS + clen = pbuf_clen(r); + /* check for overflow or too many pbuf on 
queue */ +diff --git a/src/core/pbuf.c b/src/core/pbuf.c +index a5e8668..4cfe369 100644 +--- a/src/core/pbuf.c ++++ b/src/core/pbuf.c +@@ -69,7 +69,6 @@ + */ + + #include "lwip/opt.h" +-#include "lwipgz_sock.h" + + #include "lwip/pbuf.h" + #include "lwip/stats.h" +@@ -84,8 +83,9 @@ + #if LWIP_CHECKSUM_ON_COPY + #include "lwip/inet_chksum.h" + #endif ++ + #if GAZELLE_ENABLE +-#include ++#include "lwipgz_sock.h" + #include "lwipgz_offload.h" + #endif + +@@ -105,6 +105,7 @@ pbuf_skip_const(const struct pbuf *in, u16_t in_offset, u16_t *out_offset); + + #if !NO_SYS + #ifndef PBUF_POOL_FREE_OOSEQ_QUEUE_CALL ++#if !GAZELLE_ENABLE + #include "lwip/tcpip.h" + #define PBUF_POOL_FREE_OOSEQ_QUEUE_CALL() do { \ + if (tcpip_try_callback(pbuf_free_ooseq_callback, NULL) != ERR_OK) { \ +@@ -112,13 +113,15 @@ pbuf_skip_const(const struct pbuf *in, u16_t in_offset, u16_t *out_offset); + pbuf_free_ooseq_pending = 0; \ + SYS_ARCH_UNPROTECT(old_level); \ + } } while(0) ++#else /* GAZELLE_ENABLE */ ++#define PBUF_POOL_FREE_OOSEQ_QUEUE_CALL() pbuf_free_ooseq_callback(NULL) ++#endif /* GAZELLE_ENABLE */ + #endif /* PBUF_POOL_FREE_OOSEQ_QUEUE_CALL */ + #endif /* !NO_SYS */ + + volatile u8_t pbuf_free_ooseq_pending; + #define PBUF_POOL_IS_EMPTY() pbuf_pool_is_empty() + +-#if !GAZELLE_ENABLE + /** + * Attempt to reclaim some memory from queued out-of-sequence TCP segments + * if we run out of pool pbufs. 
It's better to give priority to new packets +@@ -178,7 +181,6 @@ pbuf_pool_is_empty(void) + } + #endif /* PBUF_POOL_FREE_OOSEQ_QUEUE_CALL */ + } +-#endif /* GAZELLE_ENABLE */ + #endif /* !LWIP_TCP || !TCP_QUEUE_OOSEQ || !PBUF_POOL_FREE_OOSEQ */ + + /* Initialize members of struct pbuf after allocation */ +@@ -197,6 +199,10 @@ pbuf_init_alloced_pbuf(struct pbuf *p, void *payload, u16_t tot_len, u16_t len, + #if GAZELLE_SAME_NODE + p->pcb = NULL; + #endif /* GAZELLE_SAME_NODE */ ++#if GAZELLE_ENABLE ++ p->mbuf_refcnt = 0; ++ p->tcp_psh = 0; ++#endif /* GAZELLE_ENABLE */ + } + + /** +@@ -238,16 +244,18 @@ pbuf_alloc(pbuf_layer layer, u16_t length, pbuf_type type) + u16_t offset = (u16_t)layer; + LWIP_DEBUGF(PBUF_DEBUG | LWIP_DBG_TRACE, ("pbuf_alloc(length=%"U16_F")\n", length)); + ++#if GAZELLE_ENABLE ++ /* alloc a pbuf list by PBUF_POOL. */ ++ if (type == PBUF_RAM) ++ type = PBUF_POOL; ++#endif /* GAZELLE_ENABLE */ ++ + switch (type) { + case PBUF_REF: /* fall through */ + case PBUF_ROM: + p = pbuf_alloc_reference(NULL, length, type); + break; + case PBUF_POOL: { +-#if GAZELLE_ENABLE +- // alloc from pktmbuf pool, one pbuf is enough +- p = do_lwip_alloc_pbuf(layer, length, type); +-#else + struct pbuf *q, *last; + u16_t rem_len; /* remaining length */ + p = NULL; +@@ -255,7 +263,11 @@ pbuf_alloc(pbuf_layer layer, u16_t length, pbuf_type type) + rem_len = length; + do { + u16_t qlen; ++#if GAZELLE_ENABLE ++ q = mem_get_pbuf(-1, false); ++#else /* GAZELLE_ENABLE */ + q = (struct pbuf *)memp_malloc(MEMP_PBUF_POOL); ++#endif /* GAZELLE_ENABLE */ + if (q == NULL) { + PBUF_POOL_IS_EMPTY(); + /* free chain so far allocated */ +@@ -265,9 +277,14 @@ pbuf_alloc(pbuf_layer layer, u16_t length, pbuf_type type) + /* bail out unsuccessfully */ + return NULL; + } ++#if GAZELLE_ENABLE ++ qlen = LWIP_MIN(rem_len, (u16_t)(TCP_MSS)); ++ mem_init_pbuf(q, layer, rem_len, qlen, type); ++#else /* GAZELLE_ENABLE */ + qlen = LWIP_MIN(rem_len, (u16_t)(PBUF_POOL_BUFSIZE_ALIGNED - 
LWIP_MEM_ALIGN_SIZE(offset))); + pbuf_init_alloced_pbuf(q, LWIP_MEM_ALIGN((void *)((u8_t *)q + SIZEOF_STRUCT_PBUF + offset)), + rem_len, qlen, type, 0); ++#endif /* GAZELLE_ENABLE */ + LWIP_ASSERT("pbuf_alloc: pbuf q->payload properly aligned", + ((mem_ptr_t)q->payload % MEM_ALIGNMENT) == 0); + LWIP_ASSERT("PBUF_POOL_BUFSIZE must be bigger than MEM_ALIGNMENT", +@@ -283,7 +300,6 @@ pbuf_alloc(pbuf_layer layer, u16_t length, pbuf_type type) + rem_len = (u16_t)(rem_len - qlen); + offset = 0; + } while (rem_len > 0); +-#endif /* GAZELLE_ENABLE */ + break; + } + case PBUF_RAM: { +@@ -297,10 +313,6 @@ pbuf_alloc(pbuf_layer layer, u16_t length, pbuf_type type) + } + + /* If pbuf is to be allocated in RAM, allocate memory for it. */ +-#if GAZELLE_ENABLE +- /* alloc mbuf avoid send copy */ +- p = do_lwip_alloc_pbuf(layer, length, type); +-#else + p = (struct pbuf *)mem_malloc(alloc_len); + if (p == NULL) { + return NULL; +@@ -309,7 +321,6 @@ pbuf_alloc(pbuf_layer layer, u16_t length, pbuf_type type) + length, length, type, 0); + LWIP_ASSERT("pbuf_alloc: pbuf->payload properly aligned", + ((mem_ptr_t)p->payload % MEM_ALIGNMENT) == 0); +-#endif + break; + } + default: +@@ -760,18 +771,12 @@ pbuf_free(struct pbuf *p) + } + LWIP_DEBUGF(PBUF_DEBUG | LWIP_DBG_TRACE, ("pbuf_free(%p)\n", (void *)p)); + +-#ifndef LWIP_PERF + PERF_START; +-#endif + + count = 0; + /* de-allocate all consecutive pbufs from the head of the chain that + * obtain a zero reference count after decrementing*/ + while (p != NULL) { +-#if GAZELLE_ENABLE +- if (p->next) +- rte_prefetch0(p->next); +-#endif + LWIP_PBUF_REF_T ref; + SYS_ARCH_DECL_PROTECT(old_level); + /* Since decrementing ref cannot be guaranteed to be a single machine operation +@@ -792,19 +797,19 @@ pbuf_free(struct pbuf *p) + #if LWIP_SUPPORT_CUSTOM_PBUF + /* is this a custom pbuf? 
*/ + if ((p->flags & PBUF_FLAG_IS_CUSTOM) != 0) { +-#if GAZELLE_ENABLE +- do_lwip_free_pbuf(p); +-#else + struct pbuf_custom *pc = (struct pbuf_custom *)p; + LWIP_ASSERT("pc->custom_free_function != NULL", pc->custom_free_function != NULL); + pc->custom_free_function(p); +-#endif + } else + #endif /* LWIP_SUPPORT_CUSTOM_PBUF */ + { + /* is this a pbuf from the pool? */ + if (alloc_src == PBUF_TYPE_ALLOC_SRC_MASK_STD_MEMP_PBUF_POOL) { ++#if GAZELLE_ENABLE ++ mem_put_pbuf(p); ++#else /* GAZELLE_ENABLE */ + memp_free(MEMP_PBUF_POOL, p); ++#endif /* GAZELLE_ENABLE */ + /* is this a ROM or RAM referencing pbuf? */ + } else if (alloc_src == PBUF_TYPE_ALLOC_SRC_MASK_STD_MEMP_PBUF) { + memp_free(MEMP_PBUF, p); +@@ -827,9 +832,7 @@ pbuf_free(struct pbuf *p) + p = NULL; + } + } +-#ifndef LWIP_PERF + PERF_STOP("pbuf_free"); +-#endif + /* return number of de-allocated pbufs */ + return count; + } +@@ -1041,7 +1044,7 @@ pbuf_copy_partial_pbuf(struct pbuf *p_to, const struct pbuf *p_from, u16_t copy_ + len = p_to->len - offset_to; + } + +-#if GAZELLE_ENABLE && (OFFLOAD_CHECKSUM_CHECK_IP || OFFLOAD_CHECKSUM_GEN_IP) ++#if (OFFLOAD_CHECKSUM_CHECK_IP || OFFLOAD_CHECKSUM_GEN_IP) + pbuf_offload_copy(p_to, p_from); + #endif + +@@ -1376,25 +1379,11 @@ pbuf_clone(pbuf_layer layer, pbuf_type type, struct pbuf *p) + { + struct pbuf *q; + err_t err; +- u16_t p_clen = pbuf_clen(p); +- struct pbuf *temp[p_clen]; +- for (int i = 0; i < p_clen; i++) { +- temp[i] = pbuf_alloc(PBUF_LINK, p->len, PBUF_RAM); +- if (temp[i] == NULL) { +- pbuf_free(temp[0]); +- return NULL; +- } +- if (i > 0) { +- pbuf_cat(temp[0], temp[i]); +- } ++ q = pbuf_alloc(layer, p->tot_len, type); ++ if (q == NULL) { ++ return NULL; + } +- q = temp[0]; +- + err = pbuf_copy(q, p); +- if (err != ERR_OK) { +- pbuf_free(q); +- return NULL; +- } + LWIP_UNUSED_ARG(err); /* in case of LWIP_NOASSERT */ + LWIP_ASSERT("pbuf_copy failed", err == ERR_OK); + return q; +@@ -1441,6 +1430,42 @@ pbuf_fill_chksum(struct pbuf *p, u16_t 
start_offset, const void *dataptr, + } + #endif /* LWIP_CHECKSUM_ON_COPY */ + ++#if GAZELLE_ENABLE ++void ++pbuf_append_take(struct pbuf *h, struct pbuf *t, const void *dataptr, u16_t len, u16_t *chksum) ++{ ++ struct pbuf *q; ++ u16_t offset = t->len; ++ ++#if LWIP_CHECKSUM_ON_COPY ++ if (chksum != NULL) { ++ pbuf_fill_chksum(t, offset, dataptr, len, chksum); ++ } else ++#endif /* LWIP_CHECKSUM_ON_COPY */ ++ { ++ MEMCPY(((u8_t *)t->payload) + offset, dataptr, len); ++ } ++ ++ t->len += len; ++ for (q = h; q != NULL; q = q->next) { ++ q->tot_len += len; ++ } ++} ++ ++void pbuf_split_one(struct pbuf *p, struct pbuf **rest) ++{ ++ struct pbuf *new_head; ++ ++ new_head = p->next; ++ if (new_head != NULL) { ++ p->tot_len = p->len; ++ p->next = NULL; ++ } ++ ++ *rest = new_head; ++} ++#endif /* GAZELLE_ENABLE */ ++ + /** + * @ingroup pbuf + * Get one byte from the specified position in a pbuf +diff --git a/src/core/tcp.c b/src/core/tcp.c +index 31bfd7a..97d2c4d 100644 +--- a/src/core/tcp.c ++++ b/src/core/tcp.c +@@ -1849,15 +1849,22 @@ void + tcp_seg_free(struct tcp_seg *seg) + { + if (seg != NULL) { ++#if !GAZELLE_ENABLE + if (seg->p != NULL) { + pbuf_free(seg->p); + #if TCP_DEBUG + seg->p = NULL; + #endif /* TCP_DEBUG */ + } +-#if !GAZELLE_ENABLE + memp_free(MEMP_TCP_SEG, seg); +-#endif ++ ++#else /* GAZELLE_ENABLE */ ++ struct pbuf *p = seg->p; ++ if (p != NULL) { ++ seg->p = NULL; ++ pbuf_free(p); ++ } ++#endif /* GAZELLE_ENABLE */ + } + } + +@@ -1894,13 +1901,13 @@ tcp_seg_copy(struct tcp_seg *seg) + LWIP_ASSERT("tcp_seg_copy: invalid seg", seg != NULL); + + #if GAZELLE_ENABLE +- cseg = (struct tcp_seg *)((uint8_t *)seg->p + sizeof(struct pbuf_custom)); +-#else ++ cseg = pbuf_to_tcp_seg(seg->p); ++#else /* GAZELLE_ENABLE */ + cseg = (struct tcp_seg *)memp_malloc(MEMP_TCP_SEG); + if (cseg == NULL) { + return NULL; + } +-#endif ++#endif /* GAZELLE_ENABLE */ + SMEMCPY((u8_t *)cseg, (const u8_t *)seg, sizeof(struct tcp_seg)); + pbuf_ref(cseg->p); + return cseg; +@@
-2417,6 +2424,7 @@ tcp_pcb_purge(struct tcp_pcb *pcb) + #if GAZELLE_TCP_LAST_SEG + pcb->last_unacked = pcb->last_unsent = NULL; + #endif /* GAZELLE_TCP_LAST_SEG */ ++ + #if TCP_OVERSIZE + pcb->unsent_oversize = 0; + #endif /* TCP_OVERSIZE */ +diff --git a/src/core/tcp_out.c b/src/core/tcp_out.c +index 3c19f1d..759cb22 100644 +--- a/src/core/tcp_out.c ++++ b/src/core/tcp_out.c +@@ -82,7 +82,6 @@ + + #if GAZELLE_ENABLE + #include "lwipgz_sock.h" +-#include + #if OFFLOAD_CHECKSUM_GEN_TCP + #include "lwipgz_offload.h" + #endif +@@ -171,40 +170,6 @@ tcp_route(const struct tcp_pcb *pcb, const ip_addr_t *src, const ip_addr_t *dst) + * The TCP header is filled in except ackno and wnd. + * p is freed on failure. + */ +-#if GAZELLE_ENABLE +-void tcp_init_segment(struct tcp_seg *seg, const struct tcp_pcb *pcb, struct pbuf *p, u8_t hdrflags, +- u32_t seqno, u8_t optflags) +-{ +- u8_t optlen = LWIP_TCP_OPT_LENGTH_SEGMENT(optflags, pcb); +- +- seg->flags = optflags; +- seg->next = NULL; +- seg->p = p; +- seg->len = p->tot_len - optlen; +- +- /* build TCP header */ +- pbuf_add_header(p, TCP_HLEN); +- seg->tcphdr = (struct tcp_hdr *)seg->p->payload; +- seg->tcphdr->src = lwip_htons(pcb->local_port); +- seg->tcphdr->dest = lwip_htons(pcb->remote_port); +- seg->tcphdr->seqno = lwip_htonl(seqno); +- +- TCPH_HDRLEN_FLAGS_SET(seg->tcphdr, (TCP_HLEN + optlen) / 4, hdrflags); +- seg->tcphdr->urgp = 0; +-} +- +-static struct tcp_seg * +-tcp_create_segment(const struct tcp_pcb *pcb, struct pbuf *p, u8_t hdrflags, u32_t seqno, u8_t optflags) +-{ +- struct tcp_seg *seg; +- +- seg = (struct tcp_seg *)((uint8_t *)p + sizeof(struct pbuf_custom)); +- +- tcp_init_segment(seg, pcb, p, hdrflags, seqno, optflags); +- +- return seg; +-} +-#else + static struct tcp_seg * + tcp_create_segment(const struct tcp_pcb *pcb, struct pbuf *p, u8_t hdrflags, u32_t seqno, u8_t optflags) + { +@@ -216,11 +181,21 @@ tcp_create_segment(const struct tcp_pcb *pcb, struct pbuf *p, u8_t hdrflags, u32 + + optlen = 
LWIP_TCP_OPT_LENGTH_SEGMENT(optflags, pcb); + ++#if GAZELLE_ENABLE ++ RTE_BUILD_BUG_ON(sizeof(struct tcp_seg) > GAZELLE_SIZEOF_TCP_SEG); ++ seg = pbuf_to_tcp_seg(p); ++ if (optlen > 0) { ++ // TODO ++ LWIP_DEBUGF(TCP_OUTPUT_DEBUG, ++ ("tcp_create_segment: not support optlen %u, tot_len %u.\n", optlen, p->tot_len)); ++ } ++#else + if ((seg = (struct tcp_seg *)memp_malloc(MEMP_TCP_SEG)) == NULL) { + LWIP_DEBUGF(TCP_OUTPUT_DEBUG | LWIP_DBG_LEVEL_SERIOUS, ("tcp_create_segment: no memory.\n")); + pbuf_free(p); + return NULL; + } ++#endif + seg->flags = optflags; + seg->next = NULL; + seg->p = p; +@@ -254,7 +229,6 @@ tcp_create_segment(const struct tcp_pcb *pcb, struct pbuf *p, u8_t hdrflags, u32 + seg->tcphdr->urgp = 0; + return seg; + } +-#endif + + /** + * Allocate a PBUF_RAM pbuf, perhaps with extra space at the end. +@@ -312,6 +286,7 @@ tcp_pbuf_prealloc(pbuf_layer layer, u16_t length, u16_t max_length, + } + } + #endif /* LWIP_NETIF_TX_SINGLE_PBUF */ ++#if !GAZELLE_ENABLE + p = pbuf_alloc(layer, alloc, PBUF_RAM); + if (p == NULL) { + return NULL; +@@ -320,10 +295,17 @@ tcp_pbuf_prealloc(pbuf_layer layer, u16_t length, u16_t max_length, + *oversize = p->len - length; + /* trim p->len to the currently used size */ + p->len = p->tot_len = length; ++#else /* GAZELLE_ENABLE */ ++ p = pbuf_alloc(layer, length, PBUF_POOL); ++ if (p == NULL) { ++ return NULL; ++ } ++ *oversize = alloc - length; ++#endif /* GAZELLE_ENABLE */ + return p; + } + #else /* TCP_OVERSIZE */ +-#define tcp_pbuf_prealloc(layer, length, mx, os, pcb, api, fst) pbuf_alloc((layer), (length), PBUF_POOL) ++#define tcp_pbuf_prealloc(layer, length, mx, os, pcb, api, fst) pbuf_alloc((layer), (length), PBUF_RAM) + #endif /* TCP_OVERSIZE */ + + #if TCP_CHECKSUM_ON_COPY +@@ -466,8 +448,12 @@ tcp_write(struct tcp_pcb *pcb, const void *arg, u16_t len, u8_t apiflags) + LWIP_ERROR("tcp_write: invalid pcb", pcb != NULL, return ERR_ARG); + + /* don't allocate segments bigger than half the maximum window we ever 
received */ ++#if !GAZELLE_ENABLE + mss_local = LWIP_MIN(pcb->mss, TCPWND_MIN16(pcb->snd_wnd_max / 2)); + mss_local = mss_local ? mss_local : pcb->mss; ++#else ++ mss_local = lwip_tcp_mss(pcb); ++#endif /* GAZELLE_ENABLE */ + + LWIP_ASSERT_CORE_LOCKED(); + +@@ -602,7 +588,12 @@ tcp_write(struct tcp_pcb *pcb, const void *arg, u16_t len, u8_t apiflags) + #if TCP_OVERSIZE_DBGCHECK + oversize_add = oversize; + #endif /* TCP_OVERSIZE_DBGCHECK */ ++#if !GAZELLE_ENABLE + TCP_DATA_COPY2(concat_p->payload, (const u8_t *)arg + pos, seglen, &concat_chksum, &concat_chksum_swapped); ++#else ++ /* copy to pbuf list. */ ++ pbuf_take(concat_p, (const u8_t *)arg + pos, seglen); ++#endif /* GAZELLE_ENABLE */ + #if TCP_CHECKSUM_ON_COPY + concat_chksummed += seglen; + #endif /* TCP_CHECKSUM_ON_COPY */ +@@ -669,7 +660,12 @@ tcp_write(struct tcp_pcb *pcb, const void *arg, u16_t len, u8_t apiflags) + } + LWIP_ASSERT("tcp_write: check that first pbuf can hold the complete seglen", + (p->len >= seglen)); ++#if !GAZELLE_ENABLE + TCP_DATA_COPY2((char *)p->payload + optlen, (const u8_t *)arg + pos, seglen, &chksum, &chksum_swapped); ++#else ++ /* copy to pbuf list. */ ++ pbuf_take_at(p, (const u8_t *)arg + pos, seglen, optlen); ++#endif /* GAZELLE_ENABLE */ + } else { + /* Copy is not set: First allocate a pbuf for holding the data. + * Since the referenced data is available at least until it is +@@ -696,7 +692,7 @@ tcp_write(struct tcp_pcb *pcb, const void *arg, u16_t len, u8_t apiflags) + ((struct pbuf_rom *)p2)->payload = (const u8_t *)arg + pos; + + /* Second, allocate a pbuf for the headers. */ +- if ((p = pbuf_alloc(PBUF_TRANSPORT, optlen, PBUF_POOL)) == NULL) { ++ if ((p = pbuf_alloc(PBUF_TRANSPORT, optlen, PBUF_RAM)) == NULL) { + /* If allocation fails, we have to deallocate the data pbuf as + * well. 
*/ + pbuf_free(p2); +@@ -719,10 +715,6 @@ tcp_write(struct tcp_pcb *pcb, const void *arg, u16_t len, u8_t apiflags) + goto memerr; + } + +-#if GAZELLE_ENABLE +- lstack_calculate_aggregate(2, p->tot_len); +-#endif +- + if ((seg = tcp_create_segment(pcb, p, 0, pcb->snd_lbb + pos, optflags)) == NULL) { + goto memerr; + } +@@ -876,136 +868,272 @@ memerr: + } + + #if GAZELLE_ENABLE +-err_t +-tcp_write_from_stack(struct tcp_pcb *pcb, const void *arg, u16_t len, u8_t apiflags) ++/* reference to tcp_write() */ ++u16_t lwip_tcp_mss(const struct tcp_pcb *pcb) + { +- struct tcp_seg *last_unsent = NULL, *seg = NULL, *prev_seg = NULL, *queue = NULL; +- u16_t pos = 0; /* position in 'arg' data */ +- u16_t queuelen; +- u8_t optlen; +- u8_t optflags = 0; +- err_t err; +- u16_t mss_local; ++ u16_t mss_local, mss; ++ u32_t wnd; + +- /* don't allocate segments bigger than half the maximum window we ever received */ +- mss_local = LWIP_MIN(pcb->mss, TCPWND_MIN16(pcb->snd_wnd_max / 2)); +- mss_local = mss_local ? mss_local : pcb->mss; ++#if OFFLOAD_TX_TCP_TSO ++ struct netif *netif; ++ netif = netif_get_by_index(pcb->netif_idx); ++ if (netif != NULL && (netif_get_txol_flags(netif) & RTE_ETH_TX_OFFLOAD_TCP_TSO)) { ++ mss = netif->max_pbuf_frags * TCP_MSS; ++ } else ++#endif /* OFFLOAD_TX_TCP_TSO */ ++ { ++ mss = pcb->mss; ++ } + +- err = tcp_write_checks(pcb, len); +- if (err != ERR_OK) { +- return err; ++ /* don't allocate segments bigger than half the maximum window we ever received */ ++ mss_local = LWIP_MIN(mss, TCPWND_MIN16(pcb->snd_wnd_max / 2)); ++ mss_local = mss_local ? mss_local : mss; ++#if LWIP_TCP_TIMESTAMPS ++ if ((pcb->flags & TF_TIMESTAMP)) { ++ /* ensure that segments can hold at least one data byte...
*/ ++ mss_local = LWIP_MAX(mss_local, LWIP_TCP_OPT_LEN_TS + 1); + } +- queuelen = pcb->snd_queuelen; ++#endif /* LWIP_TCP_TIMESTAMPS */ + +- optlen = LWIP_TCP_OPT_LENGTH_SEGMENT(0, pcb); ++ wnd = LWIP_MIN(pcb->snd_wnd, pcb->cwnd); ++ if (wnd < pcb->mss) ++ wnd = pcb->mss; + +- last_unsent = pcb->last_unsent; ++ return mss_local <= wnd ? mss_local : wnd; ++} + +- /* +- * get pbuf from sendring and create new segments. +- */ +- while (pos < len) { +- struct pbuf *p; +- u16_t left = len - pos; +- u16_t max_len = mss_local - optlen; +- u16_t seglen = LWIP_MIN(left, max_len); ++/* reference to tcp_write() */ ++u8_t lwip_tcp_optlen(const struct tcp_pcb *pcb) ++{ ++ u8_t optlen; + +- p = do_lwip_tcp_get_from_sendring((struct lwip_sock *)arg, len - pos); +- if (p == NULL) { +- apiflags &= ~TCP_WRITE_FLAG_MORE; +- break; +- } +- seglen = p->tot_len; ++#if LWIP_TCP_TIMESTAMPS ++ if ((pcb->flags & TF_TIMESTAMP)) { ++ /* Make sure the timestamp option is only included in data segments if we ++ agreed about it with the remote host. */ ++ optlen = LWIP_TCP_OPT_LENGTH_SEGMENT(TF_SEG_OPTS_TS, pcb); ++ } else ++#endif /* LWIP_TCP_TIMESTAMPS */ ++ { ++ optlen = LWIP_TCP_OPT_LENGTH_SEGMENT(0, pcb); ++ } + +- queuelen += pbuf_clen(p); ++ return optlen; ++} + +- /* Now that there are more segments queued, we check again if the +- * length of the queue exceeds the configured maximum or +- * overflows.
*/ +- if (queuelen > LWIP_MIN(TCP_SND_QUEUELEN, TCP_SNDQUEUELEN_OVERFLOW)) { +- LWIP_DEBUGF(TCP_OUTPUT_DEBUG | LWIP_DBG_LEVEL_SERIOUS, ("tcp_write: queue too long %"U16_F" (%d)\n", +- queuelen, (int)TCP_SND_QUEUELEN)); +- if (pos > 0) { +- queuelen -= pbuf_clen(p); +- break; +- } +- goto memerr; +- } ++/* reference to lwip_netconn_do_writemore() */ ++u8_t lwip_tcp_allow_send(const struct tcp_pcb *pcb) ++{ ++ if ((tcp_sndbuf(pcb) <= TCP_SNDLOWAT) || ++ (tcp_sndqueuelen(pcb) >= TCP_SNDQUEUELOWAT)) { ++ /* The queued byte- or pbuf-count exceeds the configured low-water limit, ++ let select mark this pcb as non-writable. */ ++ return false; ++ } ++ return true; ++} + +- lstack_calculate_aggregate(2, p->tot_len); ++/* reference to lwip_netconn_do_writemore() */ ++u16_t lwip_tcp_prepare_seg(struct tcp_pcb *pcb, struct pbuf *pbuf_pkts[], u16_t num) ++{ ++ size_t send_total = 0; ++ struct tcp_seg *seg; ++ struct tcp_seg *prev = NULL; + +- if ((seg = tcp_create_segment(pcb, p, 0, pcb->snd_lbb + pos, optflags)) == NULL) { +- if (pos > 0) { +- queuelen -= pbuf_clen(p); +- break; +- } +- goto memerr; ++ u8_t optflags = 0; ++#if LWIP_TCP_TIMESTAMPS ++ if ((pcb->flags & TF_TIMESTAMP)) { ++ optflags = TF_SEG_OPTS_TS; ++ } ++#endif /* LWIP_TCP_TIMESTAMPS */ ++ ++ for (u16_t i = 0; i < num; ++i) { ++ seg = tcp_create_segment(pcb, pbuf_pkts[i], 0, pcb->snd_lbb, optflags); ++ if (pbuf_pkts[i]->tcp_psh) { ++ TCPH_SET_FLAG(seg->tcphdr, TCP_PSH); + } + +- /* first segment of to-be-queued data?
*/ +- if (queue == NULL) { +- queue = seg; ++ /* | TCP_HLEN | optlen | datalen | ++ * \_ seg->tcphdr == p->payload ++ * seg->len == datalen ++ * p->tot_len == datalen + optlen + TCP_HLEN ++ */ ++ LWIP_DEBUGF(TCP_OUTPUT_DEBUG, ("lwip_tcp_output_seg(pcb=%p, segs[%d]=%p {.p=%p, .len=%u, .seqno=%u})\n", ++ pcb, i, seg, seg->p, seg->len, lwip_ntohl(seg->tcphdr->seqno))); ++ ++ pcb->snd_lbb += seg->len; ++ pcb->snd_queuelen += pbuf_clen(seg->p); ++ if (pcb->snd_buf >= seg->len) { ++ pcb->snd_buf -= seg->len; + } else { +- /* Attach the segment to the end of the queued segments */ +- LWIP_ASSERT("prev_seg != NULL", prev_seg != NULL); +- prev_seg->next = seg; ++ pcb->snd_buf = 0; ++ LWIP_DEBUGF(TCP_OUTPUT_DEBUG, ("lwip_tcp_output_seg: snd_buf not enough.\n")); + } +- /* remember last segment of to-be-queued data for next iteration */ +- prev_seg = seg; + +- LWIP_DEBUGF(TCP_OUTPUT_DEBUG | LWIP_DBG_TRACE, ("tcp_write: queueing %"U32_F":%"U32_F"\n", +- lwip_ntohl(seg->tcphdr->seqno), +- lwip_ntohl(seg->tcphdr->seqno) + TCP_TCPLEN(seg))); ++ send_total += seg->len; + +- pos += seglen; +- do_lwip_get_from_sendring_over((struct lwip_sock*)arg); ++ if (prev == NULL) { ++ prev = seg; ++ } else { ++ prev->next = seg; ++ prev = seg; ++ } + } + +- /* +- * Phase 3: Append queue to pcb->unsent. 
Queue may be NULL, but that +- * is harmless +- */ +- if (last_unsent == NULL) { +- pcb->unsent = queue; ++#if GAZELLE_TCP_LAST_SEG ++ if (pcb->unsent == NULL) { ++ pcb->unsent = pbuf_to_tcp_seg(pbuf_pkts[0]); + } else { +- last_unsent->next = queue; ++ pcb->last_unsent->next = pbuf_to_tcp_seg(pbuf_pkts[0]); ++ } ++ pcb->last_unsent = pbuf_to_tcp_seg(pbuf_pkts[num - 1]); ++#else /* GAZELLE_TCP_LAST_SEG */ ++ if (pcb->unsent == NULL) { ++ pcb->unsent = pbuf_to_tcp_seg(pbuf_pkts[0]); ++ } else { ++ struct tcp_seg *last_unsent; ++ for (last_unsent = pcb->unsent; last_unsent->next != NULL; last_unsent = last_unsent->next); ++ last_unsent->next = pbuf_to_tcp_seg(pbuf_pkts[0]); + } ++#endif /* GAZELLE_TCP_LAST_SEG */ + +- /* +- * Finally update the pcb state. +- */ +- if (queue) { +- pcb->last_unsent = prev_seg; ++ return send_total; ++} ++ ++void lwip_tcp_tso_merge_seg(struct tcp_pcb *pcb, struct pbuf *pbuf_pkts[], u16_t *p_num) ++{ ++#if !LWIP_NETIF_TX_SINGLE_PBUF ++ int l, r; /* left and right */ ++ u16_t mss_local; ++ u16_t seg_num; ++ ++ mss_local = lwip_tcp_mss(pcb); ++ if (mss_local <= pcb->mss) ++ return; ++ ++ seg_num = *p_num; ++ for (l = 0, r = 1; r < seg_num; ++r) { ++ LWIP_DEBUGF(TCP_OUTPUT_DEBUG, ("%s(pcb=%p, pbuf_pkts[%d]=%p tot_len=%u, pbuf_pkts[%d]=%p tot_len=%u)\n", ++ __FUNCTION__, pcb, l, pbuf_pkts[l], pbuf_pkts[l]->tot_len, ++ r, pbuf_pkts[r], pbuf_pkts[r]->tot_len)); ++ ++ if (pbuf_pkts[l]->tot_len < OFFLAOD_TX_TSO_MIN_SEGLEN || ++ pbuf_pkts[l]->tot_len + pbuf_pkts[r]->tot_len > mss_local) { ++ pbuf_pkts[++l] = pbuf_pkts[r]; ++ continue; ++ } ++ ++ pbuf_pkts[l]->tcp_psh |= pbuf_pkts[r]->tcp_psh; ++ pbuf_cat(pbuf_pkts[l], pbuf_pkts[r]); ++ pbuf_pkts[r] = NULL; + } +- pcb->snd_lbb += pos; +- pcb->snd_buf -= pos; +- pcb->snd_queuelen = queuelen; + +- LWIP_DEBUGF(TCP_QLEN_DEBUG, ("tcp_write: %"S16_F" (after enqueued)\n", +- pcb->snd_queuelen)); +- if (pcb->snd_queuelen != 0) { +- LWIP_ASSERT("tcp_write: valid queue length", +- pcb->unacked != NULL || 
pcb->unsent != NULL); ++ *p_num = l + 1; ++#endif /* LWIP_NETIF_TX_SINGLE_PBUF */ ++} ++ ++void lwip_tcp_tso_split_seg(struct tcp_pcb *pcb, struct tcp_seg **head_seg, struct tcp_seg **tail_seg) ++{ ++#if !LWIP_NETIF_TX_SINGLE_PBUF ++ struct pbuf *p, *n; ++ struct tcp_seg *seg, *next; ++ struct tcp_seg *prev = NULL; ++ struct tcp_seg *head = NULL; ++ ++ seg = *head_seg; ++ if (seg == NULL || seg->len <= pcb->mss) ++ return; ++ ++ u8_t optflags = 0; ++#if LWIP_TCP_TIMESTAMPS ++ if ((pcb->flags & TF_TIMESTAMP)) { ++ optflags = TF_SEG_OPTS_TS; + } ++#endif /* LWIP_TCP_TIMESTAMPS */ + +- /* Set the PSH flag in the last segment that we enqueued. */ +- if (seg != NULL && seg->tcphdr != NULL && ((apiflags & TCP_WRITE_FLAG_MORE) == 0)) { +- TCPH_SET_FLAG(seg->tcphdr, TCP_PSH); ++ u16_t offset = 0; ++ u32_t seqno = lwip_ntohl(seg->tcphdr->seqno); ++ ++ next = seg->next; ++ /* see tcp_output_segment() ++ * reset the first segment, already include IP header */ ++ p = seg->p; ++ p->len -= (u8_t *)seg->tcphdr - (u8_t *)seg->p->payload; ++ p->tot_len = p->len; ++ p->payload = seg->tcphdr; ++ pbuf_remove_header(p, TCP_HLEN); ++#if OFFLOAD_TX_TCP_TSO ++ ol_tx_tcp_tso_unset(seg->p); ++#endif /* OFFLOAD_TX_TCP_TSO */ ++ ++ while (p != NULL) { ++ p->tot_len = p->len; ++ n = p->next; ++ p->next = NULL; ++ ++ seg = tcp_create_segment(pcb, p, 0, seqno + offset, optflags); ++ offset += seg->len; ++ p = n; ++ ++ LWIP_DEBUGF(TCP_OUTPUT_DEBUG, ("%s(pcb=%p, seg=%p {.p=%p, .len=%u, .seqno=%u})\n", ++ __FUNCTION__, pcb, seg, seg->p, seg->len, lwip_ntohl(seg->tcphdr->seqno))); ++ ++ /* reference to tcp_free_acked_segments() */ ++ if (TCP_SEQ_LEQ(lwip_ntohl(seg->tcphdr->seqno) + TCP_TCPLEN(seg), pcb->lastack)) { ++ tcp_seg_free(seg); ++ continue; ++ } ++ ++ if (prev == NULL) { ++ prev = seg; ++ head = seg; ++ } else { ++ prev->next = seg; ++ prev = seg; ++ } + } ++ prev->next = next; + +- return ERR_OK; +-memerr: +- tcp_set_flags(pcb, TF_NAGLEMEMERR); +- TCP_STATS_INC(tcp.memerr); ++#if
GAZELLE_TCP_LAST_SEG ++ if (prev->next == NULL) ++ *tail_seg = prev; ++#endif /* GAZELLE_TCP_LAST_SEG */ ++ *head_seg = head; + +- if (pcb->snd_queuelen != 0) { +- LWIP_ASSERT("tcp_write: valid queue length", pcb->unacked != NULL || +- pcb->unsent != NULL); ++ tcp_seg_debug(pcb, __FUNCTION__); ++#endif /* LWIP_NETIF_TX_SINGLE_PBUF */ ++ return; ++} ++ ++void tcp_seg_debug(const struct tcp_pcb *pcb, const char *name) ++{ ++#if TCP_OUTPUT_DEBUG ++ const struct tcp_seg *seg; ++ ++ for (seg = pcb->unacked; seg != NULL; seg = seg->next) { ++ LWIP_DEBUGF(TCP_OUTPUT_DEBUG, ("%s(pcb=%p, unacked seg=%p {.len=%u, .seqno=%u})\n", ++ name, pcb, seg, seg->len, lwip_ntohl(seg->tcphdr->seqno))); + } +- LWIP_DEBUGF(TCP_QLEN_DEBUG | LWIP_DBG_STATE, ("tcp_write: %"S16_F" (with mem err)\n", pcb->snd_queuelen)); +- return ERR_MEM; ++ seg = pcb->last_unacked; ++ if (seg != NULL) { ++ LWIP_DEBUGF(TCP_OUTPUT_DEBUG, ("%s(pcb=%p, last_unacked seg=%p {.len=%u, .seqno=%u})\n", ++ name, pcb, seg, seg->len, lwip_ntohl(seg->tcphdr->seqno))); ++ } ++ ++ for (seg = pcb->unsent; seg != NULL; seg = seg->next) { ++ LWIP_DEBUGF(TCP_OUTPUT_DEBUG, ("%s(pcb=%p, unsent seg=%p {.len=%u, .seqno=%u})\n", ++ name, pcb, seg, seg->len, lwip_ntohl(seg->tcphdr->seqno))); ++ } ++ seg = pcb->last_unsent; ++ if (seg != NULL) { ++ LWIP_DEBUGF(TCP_OUTPUT_DEBUG, ("%s(pcb=%p, last_unsent seg=%p {.len=%u, .seqno=%u})\n", ++ name, pcb, seg, seg->len, lwip_ntohl(seg->tcphdr->seqno))); ++ } ++ ++ return; ++#endif /* TCP_OUTPUT_DEBUG */ + } +-#endif ++#endif /* GAZELLE_ENABLE */ + + /** + * Split segment on the head of the unsent queue. 
If return is not +@@ -1038,6 +1166,10 @@ tcp_split_unsent_seg(struct tcp_pcb *pcb, u16_t split) + + LWIP_ASSERT("tcp_split_unsent_seg: invalid pcb", pcb != NULL); + ++#if OFFLOAD_TX_TCP_TSO ++ lwip_tcp_tso_split_seg(pcb, &pcb->unsent, &pcb->last_unsent); ++#endif /* OFFLOAD_TX_TCP_TSO */ ++ + useg = pcb->unsent; + if (useg == NULL) { + return ERR_MEM; +@@ -1438,35 +1570,6 @@ tcp_build_wnd_scale_option(u32_t *opts) + } + #endif + +-#if GAZELLE_ENABLE +-u32_t start_seqno = 0; +-#define TCP_INIT_SEGMENT(tem_seg, _pcb, _p, _hdrflags, _seqno, _optflags) \ +-do { \ +- struct tcp_seg *_seg = tem_seg; \ +- u8_t _optlen; \ +- rte_prefetch2(_p); \ +- \ +- _optlen = LWIP_TCP_OPT_LENGTH_SEGMENT(_optflags, _pcb); \ +- _seg->flags = _optflags; \ +- _seg->next = NULL; \ +- _seg->p = _p; \ +- _seg->len = _p->tot_len - _optlen; \ +- /* build TCP header */ \ +- pbuf_add_header(_p, TCP_HLEN); \ +- _seg->tcphdr = (struct tcp_hdr *)_seg->p->payload; \ +- _seg->tcphdr->src = lwip_htons(_pcb->local_port); \ +- _seg->tcphdr->dest = lwip_htons(_pcb->remote_port); \ +- /* _seg->tcphdr->src = lwip_htons(_pcb->local_port); \ */ \ +- /* _seg->tcphdr->dest = lwip_htons(_pcb->remote_port); \ */ \ +- _seg->tcphdr->seqno = lwip_htonl(_seqno); \ +- \ +- if (start_seqno == 0) {\ +- start_seqno = _seqno; \ +- } \ +- TCPH_HDRLEN_FLAGS_SET(_seg->tcphdr, (5 + _optlen / 4), _hdrflags); \ +- _seg->tcphdr->urgp = 0; \ +-} while(0) +-#endif + /** + * @ingroup tcp_raw + * Find out what we can send and send it +@@ -1503,6 +1606,8 @@ tcp_output(struct tcp_pcb *pcb) + return ERR_OK; + } + ++ tcp_seg_debug(pcb, __FUNCTION__); ++ + wnd = LWIP_MIN(pcb->snd_wnd, pcb->cwnd); + + seg = pcb->unsent; +@@ -1526,6 +1631,12 @@ tcp_output(struct tcp_pcb *pcb) + /* nothing to send: shortcut out of here */ + goto output_done; + } else { ++#if OFFLOAD_TX_TCP_TSO ++ if (seg->len > wnd) { ++ lwip_tcp_tso_split_seg(pcb, &pcb->unsent, &pcb->last_unsent); ++ seg = pcb->unsent; ++ } ++#endif /* OFFLOAD_TX_TCP_TSO */ + 
LWIP_DEBUGF(TCP_CWND_DEBUG, + ("tcp_output: snd_wnd %"TCPWNDSIZE_F", cwnd %"TCPWNDSIZE_F", wnd %"U32_F + ", effwnd %"U32_F", seq %"U32_F", ack %"U32_F"\n", +@@ -1574,136 +1685,17 @@ tcp_output(struct tcp_pcb *pcb) + pcb->persist_backoff = 0; + + /* useg should point to last segment on unacked queue */ ++#if GAZELLE_TCP_LAST_SEG + useg = pcb->last_unacked; ++#else /* GAZELLE_TCP_LAST_SEG */ ++ useg = pcb->unacked; ++ if (useg != NULL) { ++ for (; useg->next != NULL; useg = useg->next); ++ } ++#endif /* GAZELLE_TCP_LAST_SEG */ + + /* data available and window allows it to be sent? */ +-#if GAZELLE_ENABLE +- if ((netif_get_txol_flags(netif) & RTE_ETH_TX_OFFLOAD_TCP_TSO) && pcb->need_tso_send) { +- uint16_t send_pkt = 0; +- +- do { +- struct tcp_seg * start_seg = seg; +- struct pbuf *new_pbuf = NULL; +- +- struct pbuf *tmp_pbuf = NULL; +- u32_t seg_seqno = lwip_ntohl(seg->tcphdr->seqno); +- u32_t last_seg_seqno = seg_seqno; +- +- struct tcp_seg *last_seg = NULL; +- u16_t last_seg_len = 0; +- u8_t pbuf_chain_len = 0; +- while (seg != NULL && seg_seqno - pcb->lastack + seg->len <= wnd && pbuf_chain_len < netif->max_pbuf_frags) { +- if (last_seg_len != 0 && (last_seg_len + seg->len < 1460) && seg->len < netif->min_tso_seglen) { +- break; +- } +- +- if ((tcp_do_output_nagle(pcb) == 0) && +- ((pcb->flags & (TF_NAGLEMEMERR | TF_FIN)) == 0)) { +- break; +- } +- if (last_seg_seqno + last_seg_len == seg_seqno) { +- pbuf_remove_header(seg->p, seg->p->tot_len - seg->len); +- if (new_pbuf == NULL) { +- new_pbuf = seg->p; +- tmp_pbuf = new_pbuf; +- } else { +- new_pbuf->tot_len += seg->p->len; +- tmp_pbuf->next = seg->p; +- tmp_pbuf = tmp_pbuf->next; +- } +- } else { +- break; +- } +- +- last_seg = seg; +- last_seg_len = seg->len; +- last_seg_seqno = seg_seqno; +- seg = seg->next; +- seg_seqno = (seg != NULL) ? 
lwip_ntohl(seg->tcphdr->seqno) : seg_seqno; +- pbuf_chain_len++; +- } +- +- // tcp_do_output_nagle, break +- if (new_pbuf == NULL) { +- goto end_loop; +- } +- +- struct tcp_seg new_seg; +- TCP_INIT_SEGMENT(&new_seg, pcb, new_pbuf, 0, lwip_ntohl(start_seg->tcphdr->seqno), 0); +- +- if (pcb->state != SYN_SENT) { +- TCPH_SET_FLAG(new_seg.tcphdr, TCP_ACK); +- } +- +- err = tcp_output_segment(&new_seg, pcb, netif); +- if (err != ERR_OK) { +- /* segment could not be sent, for whatever reason */ +- tcp_set_flags(pcb, TF_NAGLEMEMERR); +- return err; +- } +- +- if (pcb->last_unsent == pcb->unsent) { +- pcb->last_unsent = last_seg->next; +- } +- pcb->unsent = last_seg->next; +- +- if (pcb->state != SYN_SENT) { +- tcp_clear_flags(pcb, TF_ACK_DELAY | TF_ACK_NOW); +- } +- +- snd_nxt = last_seg_seqno + TCP_TCPLEN(last_seg); +- if (TCP_SEQ_LT(pcb->snd_nxt, snd_nxt)) { +- pcb->snd_nxt = snd_nxt; +- } +- +- pbuf_remove_header(new_seg.p, new_seg.p->tot_len - new_seg.len - TCP_HLEN); +- new_seg.p->tot_len = new_seg.p->len; +- +- struct tcp_seg **cur_seg = NULL; +- if (pcb->unacked != NULL) { +- cur_seg = &(pcb->unacked); +- } +- for (int start = pbuf_chain_len; start > 0; start--) { +- struct tcp_seg *tmp_seg = start_seg; +- start_seg = start_seg->next; +- tmp_seg->p->next = NULL; +- if (TCP_TCPLEN(tmp_seg) > 0) { +- tmp_seg->next = NULL; +- if (pcb->unacked == NULL) { +- pcb->last_unacked = tmp_seg; +- pcb->unacked = tmp_seg; +- useg = tmp_seg; +- cur_seg = &(pcb->unacked); +- } else { +- if (TCP_SEQ_LT(lwip_ntohl(tmp_seg->tcphdr->seqno), lwip_ntohl(useg->tcphdr->seqno))) { +- /* add segment to before tail of unacked list, keeping the list sorted */ +- while (*cur_seg && +- TCP_SEQ_LT(lwip_ntohl((*cur_seg)->tcphdr->seqno), lwip_ntohl(tmp_seg->tcphdr->seqno))) { +- cur_seg = &((*cur_seg)->next ); +- } +- tmp_seg->next = (*cur_seg); +- (*cur_seg) = tmp_seg; +- } else { +- /* add segment to tail of unacked list */ +- useg->next = tmp_seg; +- if (pcb->last_unacked == useg) { +- 
pcb->last_unacked = tmp_seg; +- } +- useg = useg->next; +- } +- } +- } else { +- tcp_seg_free(tmp_seg); +- } +- } +- } while(seg != NULL && lwip_ntohl(seg->tcphdr->seqno) - pcb->lastack + seg->len <= wnd && send_pkt++ < 10); +-end_loop: +- pcb->need_tso_send = 0; +- } else +-#endif +-{ +- uint16_t send_pkt = 0; +- while (seg != NULL && send_pkt++ < 10 && ++ while (seg != NULL && + lwip_ntohl(seg->tcphdr->seqno) - pcb->lastack + seg->len <= wnd) { + LWIP_ASSERT("RST not expected here!", + (TCPH_FLAGS(seg->tcphdr) & TCP_RST) == 0); +@@ -1714,6 +1706,10 @@ end_loop: + * either seg->next != NULL or pcb->unacked == NULL; + * RST is no sent using tcp_write/tcp_output. + */ ++#if GAZELLE_ENABLE ++ // if (SYS_CONFIG(rtc_mode)) ++ if (false) ++#endif /* GAZELLE_ENABLE */ + if ((tcp_do_output_nagle(pcb) == 0) && + ((pcb->flags & (TF_NAGLEMEMERR | TF_FIN)) == 0)) { + break; +@@ -1731,6 +1727,14 @@ end_loop: + TCPH_SET_FLAG(seg->tcphdr, TCP_ACK); + } + ++#if GAZELLE_ENABLE ++ lstack_calculate_aggregate(1, seg->len); ++#endif ++#if OFFLOAD_TX_TCP_TSO ++ if (seg->len > TCP_MSS) { ++ ol_tx_tcp_tso_set(seg->p, TCP_MSS); ++ } ++#endif /* OFFLOAD_TX_TCP_TSO */ + err = tcp_output_segment(seg, pcb, netif); + if (err != ERR_OK) { + /* segment could not be sent, for whatever reason */ +@@ -1795,7 +1799,6 @@ end_loop: + } + seg = pcb->unsent; + } +-} + #if TCP_OVERSIZE + if (pcb->unsent == NULL) { + /* last unsent has been removed, reset unsent_oversize */ +@@ -1804,7 +1807,6 @@ end_loop: + #endif /* TCP_OVERSIZE */ + + output_done: +- pcb->need_tso_send = 0; + #if GAZELLE_TCP_LAST_SEG + if (pcb->unsent == NULL) + pcb->last_unsent = NULL; +@@ -1830,7 +1832,11 @@ tcp_output_segment_busy(const struct tcp_seg *seg) + /* We only need to check the first pbuf here: + If a pbuf is queued for transmission, a driver calls pbuf_ref(), + which only changes the ref count of the first pbuf */ ++// #if !GAZELLE_ENABLE + if (seg->p->ref != 1) { ++// #else ++// if (pbuf_read_mbuf_ref(seg->p) > 1) { ++// 
#endif + /* other reference found */ + return 1; + } +@@ -1863,9 +1869,6 @@ tcp_output_segment(struct tcp_seg *seg, struct tcp_pcb *pcb, struct netif *netif + } + } + #endif +-#if GAZELLE_ENABLE +- lstack_calculate_aggregate(1, seg->len); +-#endif + + LWIP_ASSERT("tcp_output_segment: invalid seg", seg != NULL); + LWIP_ASSERT("tcp_output_segment: invalid pcb", pcb != NULL); +@@ -1935,11 +1938,6 @@ tcp_output_segment(struct tcp_seg *seg, struct tcp_pcb *pcb, struct netif *netif + } + #endif + +-#if GAZELLE_ENABLE +- /* pbuf into mbuf. ref dpdk_common.h */ +- rte_prefetch0((uint8_t *)(seg->p) - sizeof(struct rte_mbuf) - sizeof(uint64_t) * 2); +-#endif +- + /* Set retransmission timer running if it is not currently enabled + This must be set before checking the route. */ + if (pcb->rtime < 0) { +@@ -2166,48 +2164,39 @@ tcp_rexmit(struct tcp_pcb *pcb) + } + + seg = pcb->unacked; +-#if GAZELLE_ENABLE +- cur_seg = &(pcb->unsent); +- while (seg) { +-#endif +- /* Give up if the segment is still referenced by the netif driver +- due to deferred transmission. */ +- if (tcp_output_segment_busy(seg)) { +- LWIP_DEBUGF(TCP_RTO_DEBUG, ("tcp_rexmit busy\n")); +- if (seg == pcb->unacked) +- return ERR_VAL; +- else +- break; +- } + +- /* Move the first unacked segment to the unsent queue */ +- /* Keep the unsent queue sorted. */ +- if (pcb->last_unacked == pcb->unacked) +- pcb->last_unacked = pcb->unacked->next; +- pcb->unacked = pcb->unacked->next; ++ /* Give up if the segment is still referenced by the netif driver ++ due to deferred transmission. 
*/ ++ if (tcp_output_segment_busy(seg)) { ++ LWIP_DEBUGF(TCP_RTO_DEBUG, ("tcp_rexmit busy\n")); ++ return ERR_VAL; ++ } + +-#if !GAZELLE_ENABLE +- cur_seg = &(pcb->unsent); +-#endif +- while (*cur_seg && +- TCP_SEQ_LT(lwip_ntohl((*cur_seg)->tcphdr->seqno), lwip_ntohl(seg->tcphdr->seqno))) { +- cur_seg = &((*cur_seg)->next); +- } +- if (*cur_seg == NULL) +- pcb->last_unsent = seg; +- seg->next = *cur_seg; +- *cur_seg = seg; +-#if TCP_OVERSIZE +- if (seg->next == NULL) { +- /* the retransmitted segment is last in unsent, so reset unsent_oversize */ +- pcb->unsent_oversize = 0; +- } +-#endif /* TCP_OVERSIZE */ ++ /* Move the first unacked segment to the unsent queue */ ++ /* Keep the unsent queue sorted. */ ++ pcb->unacked = seg->next; ++#if GAZELLE_TCP_LAST_SEG ++ if (pcb->unacked == NULL) ++ pcb->last_unacked = NULL; ++#endif /* GAZELLE_TCP_LAST_SEG */ + +-#if GAZELLE_ENABLE +- seg = pcb->unacked; ++ cur_seg = &(pcb->unsent); ++ while (*cur_seg && ++ TCP_SEQ_LT(lwip_ntohl((*cur_seg)->tcphdr->seqno), lwip_ntohl(seg->tcphdr->seqno))) { ++ cur_seg = &((*cur_seg)->next ); + } +-#endif ++ seg->next = *cur_seg; ++ *cur_seg = seg; ++#if GAZELLE_TCP_LAST_SEG ++ if (seg->next == NULL) ++ pcb->last_unsent = seg; ++#endif /* GAZELLE_TCP_LAST_SEG */ ++#if TCP_OVERSIZE ++ if (seg->next == NULL) { ++ /* the retransmitted segment is last in unsent, so reset unsent_oversize */ ++ pcb->unsent_oversize = 0; ++ } ++#endif /* TCP_OVERSIZE */ + + if (pcb->nrtx < 0xFF) { + ++pcb->nrtx; +@@ -2215,7 +2204,6 @@ tcp_rexmit(struct tcp_pcb *pcb) + + /* Don't take any rtt measurements after retransmitting. */ + pcb->rttest = 0; +- pcb->need_tso_send = 1; + + /* Do the actual retransmission. 
*/ + MIB2_STATS_INC(mib2.tcpretranssegs); +@@ -2224,6 +2212,7 @@ tcp_rexmit(struct tcp_pcb *pcb) + return ERR_OK; + } + ++ + /** + * Handle retransmission after three dupacks received + * +diff --git a/src/core/udp.c b/src/core/udp.c +index a86f5ec..fa38d74 100644 +--- a/src/core/udp.c ++++ b/src/core/udp.c +@@ -687,31 +687,11 @@ udp_sendto_chksum(struct udp_pcb *pcb, struct pbuf *p, const ip_addr_t *dst_ip, + UDP_STATS_INC(udp.rterr); + return ERR_RTE; + } +-#if GAZELLE_UDP_ENABLE +- struct pbuf *udp_pbuf = do_lwip_udp_get_from_sendring((struct lwip_sock *)(p->payload), p->tot_len); +- do_lwip_get_from_sendring_over((struct lwip_sock *)(p->payload)); +- +- p = udp_pbuf; +- if (p == NULL) { +- return ERR_MEM; +- } +- +- if (p->port) { +-#if LWIP_CHECKSUM_ON_COPY && CHECKSUM_GEN_UDP +- return udp_sendto_if_chksum(pcb, p, &(p->addr), p->port, netif, have_chksum, chksum); +-#else /* LWIP_CHECKSUM_ON_COPY && CHECKSUM_GEN_UDP */ +- return udp_sendto_if(pcb, p, &(p->addr), p->port, netif); +-#endif /* LWIP_CHECKSUM_ON_COPY && CHECKSUM_GEN_UDP */ +- } else { +-#endif /* GAZELLE_UDP_ENABLE */ + #if LWIP_CHECKSUM_ON_COPY && CHECKSUM_GEN_UDP + return udp_sendto_if_chksum(pcb, p, dst_ip, dst_port, netif, have_chksum, chksum); + #else /* LWIP_CHECKSUM_ON_COPY && CHECKSUM_GEN_UDP */ + return udp_sendto_if(pcb, p, dst_ip, dst_port, netif); + #endif /* LWIP_CHECKSUM_ON_COPY && CHECKSUM_GEN_UDP */ +-#if GAZELLE_UDP_ENABLE +- } +-#endif /* GAZELLE_UDP_ENABLE */ + } + + /** +@@ -829,7 +809,7 @@ udp_sendto_if_src_chksum(struct udp_pcb *pcb, struct pbuf *p, const ip_addr_t *d + struct pbuf *q; /* q will be sent down the stack */ + u8_t ip_proto; + u8_t ttl; +- ++ + LWIP_ASSERT_CORE_LOCKED(); + #if GAZELLE_ENABLE + UDP_STATS_INC(udp.tx_in); +@@ -1031,19 +1011,14 @@ udp_sendto_if_src_chksum(struct udp_pcb *pcb, struct pbuf *p, const ip_addr_t *d + LWIP_DEBUGF(UDP_DEBUG, ("udp_send: ip_output_if (,,,,0x%02"X16_F",)\n", (u16_t)ip_proto)); + /* output to IP */ + NETIF_SET_HINTS(netif, 
&(pcb->netif_hints)); +-#if GAZELLE_UDP_ENABLE +- PBUF_TO_MBUF(q)->l4_len = UDP_HLEN; +-#endif /* GAZELLE_UDP_ENABLE */ + err = ip_output_if_src(q, src_ip, dst_ip, ttl, pcb->tos, ip_proto, netif); + NETIF_RESET_HINTS(netif); + + /* @todo: must this be increased even if error occurred? */ + MIB2_STATS_INC(mib2.udpoutdatagrams); + +-#if !GAZELLE_UDP_ENABLE + /* did we chain a separate header pbuf earlier? */ + if (q != p) +-#endif + { + /* free the header pbuf */ + pbuf_free(q); +diff --git a/src/include/arch/sys_arch.h b/src/include/arch/sys_arch.h +index 515042e..c36aa45 100644 +--- a/src/include/arch/sys_arch.h ++++ b/src/include/arch/sys_arch.h +@@ -44,6 +44,7 @@ + #include "lwip/err.h" + #include "lwip/debug.h" + #include "lwip/memp.h" ++#include "lwipgz_list.h" + + + #define SYS_FORMAT_NAME(buf, size, fmt, ...) \ +@@ -65,6 +66,12 @@ u8_t *sys_hugepage_malloc(const char *name, unsigned size); + void sys_mempool_var_init(struct memp_desc *memp, char *desc, u16_t size, u16_t num, + u8_t *base, struct memp **tab, struct stats_mem *stats); + ++struct sys_config { ++ bool rtc_mode; ++}; ++void sys_config_init(const struct sys_config *conf); ++extern const struct sys_config * const g_sys_config; ++#define SYS_CONFIG(conf) (g_sys_config->conf) + + #define SYS_NAME_LEN 64 + struct sys_thread { +@@ -185,69 +192,80 @@ int sys_mutex_timedlock_internal(sys_mutex_t m, int timeout) + } + + +-struct sys_mbox { +- char name[SYS_NAME_LEN]; +- int size; +- int socket_id; +- unsigned flags; +- struct rte_ring *ring; +- int (*wait_fn)(void); +-}; +-typedef struct sys_mbox *sys_mbox_t; +-#define sys_mbox_valid(mbox) sys_sem_valid(mbox) +-#define sys_mbox_set_invalid(mbox) sys_sem_set_invalid(mbox) +-int sys_mbox_empty(struct sys_mbox *); ++struct mbox_ring; ++struct mbox_ring_ops { ++ int (*create)(struct mbox_ring *mr, const char *name, unsigned count); ++ void (*destroy)(struct mbox_ring *mr); + ++ unsigned (*get_capacity)(const struct mbox_ring *mr); ++ unsigned (*count)(const 
struct mbox_ring *mr); ++ unsigned (*free_count)(const struct mbox_ring *mr); + +-#if GAZELLE_ENABLE +-#include "dpdk_version.h" ++ unsigned (*enqueue_burst)(struct mbox_ring *mr, void *const *obj_table, unsigned n); ++ unsigned (*dequeue_burst)(struct mbox_ring *mr, void **obj_table, unsigned n); + +-/* +- gazelle custom rte ring interface +- lightweight ring no atomic. +- only surpport in single thread. +- */ +-static __rte_always_inline uint32_t gazelle_st_ring_enqueue_busrt(struct rte_ring *r, void **obj_table, uint32_t n) +-{ +- uint32_t prod = r->prod.tail; +- uint32_t cons = r->cons.tail; +- uint32_t free_entries = r->capacity + cons - prod; +- +- if (n > free_entries) { +- return 0; +- } +- +- __rte_ring_enqueue_elems(r, prod, obj_table, sizeof(void *), n); +- +- r->prod.tail = prod + n; +- +- return n; +-} ++ unsigned (*recv_count)(const struct mbox_ring *mr); ++ unsigned (*recv_start_burst)(struct mbox_ring *mr, void **obj_table, unsigned n); ++ void (*recv_finish_burst)(struct mbox_ring *mr); + +-static __rte_always_inline uint32_t gazelle_st_ring_dequeue_burst(struct rte_ring *r, void **obj_table, uint32_t n) +-{ +- uint32_t cons = r->cons.tail; +- uint32_t prod = r->prod.tail; +- uint32_t entries = prod - cons; +- +- if (n > entries) { +- n = entries; +- } +- +- if (n == 0) { +- return 0; +- } +- +- __rte_ring_dequeue_elems(r, cons, obj_table, sizeof(void *), n); ++ void *(*read_tail)(const struct mbox_ring *mr); ++ void (*push_tail)(struct mbox_ring *mr, void *obj); ++ void* (*pop_tail)(struct mbox_ring *mr, void *expect); ++}; ++extern struct mbox_ring_ops g_mbox_rtc_default_ops; ++extern struct mbox_ring_ops g_mbox_rtw_default_ops; ++extern struct mbox_ring_ops g_mbox_rtw_append_ops; ++extern struct mbox_ring_ops g_mbox_rtw_peek_ops; ++extern const struct mbox_ring_ops * g_mbox_default_ops; ++ ++struct mbox_ring { ++#define MBOX_FLAG_TCP 0x10 /* same as NETCONN_TCP */ ++#define MBOX_FLAG_UDP 0x20 /* same as NETCONN_UDP */ ++#define MBOX_FLAG_RECV 
0x01 ++#define MBOX_FLAG_SEND 0x02 ++#define MBOX_FLAG_PEEK 0x04 ++ int flags; ++ ++ const struct mbox_ring_ops *ops; ++ ++ unsigned (*tail_count)(const struct mbox_ring *mr); ++ void (*obj_free_fn)(struct mbox_ring *mr, void *obj, bool is_tail); ++ void (*private_data_free_fn)(struct mbox_ring *mr); ++ void *private_data; + +- r->cons.tail = cons + n; ++ struct rte_ring *ring; + +- return n; +-} ++ /* only stack */ ++ char pad0 __rte_cache_aligned; /* empty cache line */ ++ ++ unsigned stk_queued_num; ++ ++ /* tcp: stack and app ++ * udp: only app */ ++ char pad1 __rte_cache_aligned; /* empty cache line */ ++ union { ++ void *st_obj; /* single thread */ ++ rte_atomic64_t mt_obj; /* multi thread */ ++ }; ++ ++ /* only app */ ++ char pad2 __rte_cache_aligned; /* empty cache line */ ++ unsigned app_free_count; /* send */ ++ unsigned app_queued_num; /* send or recv */ ++ union { ++ unsigned app_tail_left; /* send */ ++ unsigned app_recvd_len; /* recv */ ++ }; ++} __rte_cache_aligned; + +-void gazelle_ring_free_fast(struct rte_ring *ring); +-struct rte_ring *gazelle_ring_create_fast(const char *name, uint32_t size, uint32_t flags); ++struct sys_mbox { ++ char name[SYS_NAME_LEN]; ++ struct mbox_ring mring; ++}; ++typedef struct sys_mbox *sys_mbox_t; ++#define sys_mbox_valid(mbox) sys_sem_valid(mbox) ++#define sys_mbox_set_invalid(mbox) sys_sem_set_invalid(mbox) ++err_t sys_mbox_new_flags(sys_mbox_t *mbox, int size, int flags); + +-#endif /* GAZELLE_ENABLE */ + + #endif /* _LWIP_ARCH_SYS_ARCH_H_ */ +diff --git a/src/include/lwip/api.h b/src/include/lwip/api.h +index 0322a04..695b807 100644 +--- a/src/include/lwip/api.h ++++ b/src/include/lwip/api.h +@@ -239,6 +239,9 @@ struct netconn { + /** mbox where received packets are stored until they are fetched + by the netconn application thread (can grow quite big) */ + sys_mbox_t recvmbox; ++#if GAZELLE_ENABLE ++ sys_mbox_t sendmbox; ++#endif /* GAZELLE_ENABLE */ + #if LWIP_TCP + /** mbox where new connections are stored until 
processed + by the application thread */ +@@ -341,9 +344,6 @@ err_t netconn_accept(struct netconn *conn, struct netconn **new_conn); + err_t netconn_recv(struct netconn *conn, struct netbuf **new_buf); + err_t netconn_recv_udp_raw_netbuf(struct netconn *conn, struct netbuf **new_buf); + err_t netconn_recv_udp_raw_netbuf_flags(struct netconn *conn, struct netbuf **new_buf, u8_t apiflags); +-#if GAZELLE_UDP_ENABLE +-err_t netconn_recv_udp_raw_pbuf_flags(struct netconn *conn, struct pbuf **new_buf, u8_t apiflags); +-#endif /* GAZELLE_UDP_ENABLE */ + err_t netconn_recv_tcp_pbuf(struct netconn *conn, struct pbuf **new_buf); + err_t netconn_recv_tcp_pbuf_flags(struct netconn *conn, struct pbuf **new_buf, u8_t apiflags); + err_t netconn_tcp_recvd(struct netconn *conn, size_t len); +diff --git a/src/include/lwip/netbuf.h b/src/include/lwip/netbuf.h +index e7f0713..fccbb96 100644 +--- a/src/include/lwip/netbuf.h ++++ b/src/include/lwip/netbuf.h +@@ -73,7 +73,15 @@ struct netbuf { + #endif /* LWIP_IPV6 */ + #endif /* LWIP_NETBUF_RECVINFO */ + #endif /* LWIP_NETBUF_RECVINFO || LWIP_CHECKSUM_ON_COPY */ ++ ++#if GAZELLE_UDP_ENABLE ++ struct pbuf *tail; /* the tail of pbuf list */ ++#endif /* GAZELLE_UDP_ENABLE */ + }; ++#if GAZELLE_UDP_ENABLE ++struct netbuf *netbuf_create(struct pbuf *p); ++void netbuf_chain_pbuf(struct netbuf *buf, struct pbuf *p); ++#endif /* GAZELLE_UDP_ENABLE */ + + /* Network buffer functions: */ + struct netbuf * netbuf_new (void); +diff --git a/src/include/lwip/netif.h b/src/include/lwip/netif.h +index 75157de..89b8a40 100644 +--- a/src/include/lwip/netif.h ++++ b/src/include/lwip/netif.h +@@ -114,10 +114,6 @@ extern "C" { + * Set by the netif driver in its init function. 
*/ + #define NETIF_FLAG_MLD6 0x40U + +-#if GAZELLE_ENABLE +-/** If set, use run to completion mode */ +-#define NETIF_FLAG_RTC_MODE 0x80U +-#endif + + /** + * @} +@@ -368,14 +364,13 @@ struct netif { + #if GAZELLE_ENABLE + u64_t rxol_flags; + u64_t txol_flags; +- bool vlan_enable; ++ u8_t max_pbuf_frags; ++ u8_t vlan_enable; + /** vlan id is an attribute of NIC. The variable 'netif_hints' is not used because it is assigned by pcb, + * while non transport layers without pcb cannot be enabled */ + u16_t vlan_tci; ++#endif /* GAZELLE_ENABLE */ + +- u8_t max_pbuf_frags; +- u16_t min_tso_seglen; +-#endif + /** descriptive abbreviation */ + char name[2]; + /** number of this interface. Used for @ref if_api and @ref netifapi_netif, +@@ -503,17 +498,14 @@ void netif_set_down(struct netif *netif); + #define netif_is_up(netif) (((netif)->flags & NETIF_FLAG_UP) ? (u8_t)1 : (u8_t)0) + + #if GAZELLE_ENABLE +-#define netif_is_rtc_mode(netif) (((netif)->flags & NETIF_FLAG_RTC_MODE) ? (u8_t)1 : (u8_t)0) + #define netif_get_rxol_flags(netif) ((netif)->rxol_flags) + #define netif_get_txol_flags(netif) ((netif)->txol_flags) + + void netif_set_vlan_tci(struct netif *netif, u16_t vlan_tci); +-void netif_set_rtc_mode(struct netif *netif); + void netif_set_rxol_flags(struct netif *netif, u64_t flags); + void netif_set_txol_flags(struct netif *netif, u64_t flags); +-void netif_set_min_tso_seglen(struct netif *netif, u16_t min_tso_seglen); + void netif_set_max_pbuf_frags(struct netif *netif, u8_t max_frags); +-#endif ++#endif /* GAZELLE_ENABLE */ + + #if LWIP_NETIF_STATUS_CALLBACK + void netif_set_status_callback(struct netif *netif, netif_status_callback_fn status_callback); +diff --git a/src/include/lwip/pbuf.h b/src/include/lwip/pbuf.h +index dda66c3..88579d1 100644 +--- a/src/include/lwip/pbuf.h ++++ b/src/include/lwip/pbuf.h +@@ -40,10 +40,6 @@ + + #include "lwip/opt.h" + #include "lwip/err.h" +-#if GAZELLE_UDP_ENABLE +-#include "lwip/ip_addr.h" +-#include "lwip/ip6_addr.h" +-#endif 
/* GAZELLE_UDP_ENABLE */ + + #ifdef __cplusplus + extern "C" { +@@ -168,7 +164,10 @@ typedef enum { + the first payload byte can be calculated from struct pbuf). + Don't use this for TX, if the pool becomes empty e.g. because of TCP queuing, + you are unable to receive TCP acks! */ +- PBUF_POOL = (PBUF_ALLOC_FLAG_RX | PBUF_TYPE_FLAG_STRUCT_DATA_CONTIGUOUS | PBUF_TYPE_ALLOC_SRC_MASK_STD_MEMP_PBUF_POOL) ++ PBUF_POOL = (PBUF_ALLOC_FLAG_RX | PBUF_TYPE_FLAG_STRUCT_DATA_CONTIGUOUS | PBUF_TYPE_ALLOC_SRC_MASK_STD_MEMP_PBUF_POOL), ++#if GAZELLE_ENABLE ++ PBUF_POOL_PREINIT = 0xFF, ++#endif /* GAZELLE_ENABLE */ + } pbuf_type; + + +@@ -228,18 +227,18 @@ struct pbuf { + struct tcp_pcb *pcb; + #endif /* GAZELLE_SAME_NODE */ + #if GAZELLE_ENABLE +- volatile u8_t allow_append; +- pthread_spinlock_t pbuf_lock; +-#if GAZELLE_UDP_ENABLE +- ip_addr_t addr; +- u16_t port; +-#endif /* GAZELLE_UDP_ENABLE */ +-#endif /* GAZELLE_ENABLE CHECKSUM_OFFLOAD_SWITCH */ ++ /* set 1, free would skip rte_pktmbuf_prefree_seg() */ ++ u8_t mbuf_refcnt; ++ u8_t tcp_psh; ++#endif /* GAZELLE_ENABLE */ + + /** In case the user needs to store data custom data on a pbuf */ + LWIP_PBUF_CUSTOM_DATA + }; +- ++#if GAZELLE_ENABLE ++void pbuf_append_take(struct pbuf *h, struct pbuf *t, const void *dataptr, u16_t len, u16_t *chksum); ++void pbuf_split_one(struct pbuf *p, struct pbuf **rest); ++#endif /* GAZELLE_ENABLE */ + + /** Helper struct for const-correctness only. 
+ * The only meaning of this one is to provide a const payload pointer +@@ -262,10 +261,33 @@ struct pbuf_custom { + /** The actual pbuf */ + struct pbuf pbuf; + /** This function is called when pbuf_free deallocates this pbuf(_custom) */ +-#if !GAZELLE_ENABLE + pbuf_free_custom_fn custom_free_function; +-#endif ++#if GAZELLE_ENABLE ++ /* don't use `struct tcp_seg` directly to avoid conflicts by include lwip tcp header */ ++#define GAZELLE_SIZEOF_TCP_SEG 40 /* must be greater than sizeof(struct tcp_seg) */ ++#define GAZELLE_SIZEOF_NETBUF 56 /* must be greater than sizeof(struct netbuf) */ ++ union { ++ char tcp_seg[GAZELLE_SIZEOF_TCP_SEG]; ++ char netbuf[GAZELLE_SIZEOF_NETBUF]; ++ }; ++#endif /* GAZELLE_ENABLE */ + }; ++#if GAZELLE_ENABLE ++#include ++static inline struct pbuf *pbuf_list_tail(struct pbuf *p) ++{ ++ struct pbuf *t; ++ for (t = p; t->next != NULL; t = t->next) { /* get tail */ } ++ return t; ++} ++ ++#define pbuf_to_mbuf(p) ( (struct rte_mbuf *)RTE_PTR_SUB(p, sizeof(struct rte_mbuf)) ) ++#define pbuf_to_tcp_seg(p) ( (struct tcp_seg *)(((struct pbuf_custom *)(p))->tcp_seg) ) ++#define pbuf_to_netbuf(p) ( (struct netbuf *)(((struct pbuf_custom *)(p))->netbuf ) ) ++#define pbuf_read_mbuf_ref(p) rte_mbuf_refcnt_read(pbuf_to_mbuf(p)) ++// #define pbuf_set_mbuf_ref(p, v) rte_mbuf_refcnt_set(pbuf_to_mbuf(p), v) ++// #define pbuf_update_mbuf_ref(p, v) rte_mbuf_refcnt_update(pbuf_to_mbuf(p), v) ++#endif /* GAZELLE_ENABLE */ + #endif /* LWIP_SUPPORT_CUSTOM_PBUF */ + + /** Define this to 0 to prevent freeing ooseq pbufs when the PBUF_POOL is empty */ +@@ -289,10 +311,7 @@ void pbuf_free_ooseq(void); + + /* Initializes the pbuf module. This call is empty for now, but may not be in future. 
*/ + #define pbuf_init() +-#if GAZELLE_ENABLE +-extern struct pbuf *do_lwip_alloc_pbuf(pbuf_layer layer, uint16_t length, pbuf_type type); +-extern void do_lwip_free_pbuf(struct pbuf *pbuf); +-#endif ++ + struct pbuf *pbuf_alloc(pbuf_layer l, u16_t length, pbuf_type type); + struct pbuf *pbuf_alloc_reference(void *payload, u16_t length, pbuf_type type); + #if LWIP_SUPPORT_CUSTOM_PBUF +diff --git a/src/include/lwip/priv/tcp_priv.h b/src/include/lwip/priv/tcp_priv.h +index 02df1d0..ba3055a 100644 +--- a/src/include/lwip/priv/tcp_priv.h ++++ b/src/include/lwip/priv/tcp_priv.h +@@ -269,6 +269,16 @@ struct tcp_seg { + #define TF_SEG_OPTS_SACK_PERM (u8_t)0x10U /* Include SACK Permitted option (only used in SYN segments) */ + struct tcp_hdr *tcphdr; /* the TCP header */ + }; ++#if GAZELLE_ENABLE ++u16_t lwip_tcp_mss(const struct tcp_pcb *pcb); ++u8_t lwip_tcp_optlen(const struct tcp_pcb *pcb); ++u8_t lwip_tcp_allow_send(const struct tcp_pcb *pcb); ++ ++u16_t lwip_tcp_prepare_seg(struct tcp_pcb *pcb, struct pbuf *pbuf_pkts[], u16_t num); ++void lwip_tcp_tso_merge_seg(struct tcp_pcb *pcb, struct pbuf *pbuf_pkts[], u16_t *num); ++void lwip_tcp_tso_split_seg(struct tcp_pcb *pcb, struct tcp_seg **head_seg, struct tcp_seg **tail_seg); ++void tcp_seg_debug(const struct tcp_pcb *pcb, const char *name); ++#endif /* GAZELLE_ENABLE */ + + #define LWIP_TCP_OPT_EOL 0 + #define LWIP_TCP_OPT_NOP 1 +diff --git a/src/include/lwip/sockets.h b/src/include/lwip/sockets.h +index 205a9d3..4f5105b 100644 +--- a/src/include/lwip/sockets.h ++++ b/src/include/lwip/sockets.h +@@ -681,10 +681,23 @@ int fcntl(int s, int cmd, ...); + #endif /* LWIP_COMPAT_SOCKETS == 2 */ + + #if GAZELLE_ENABLE +-int lwip_sock_make_addr(struct netconn *conn, ip_addr_t *fromaddr, u16_t port, +- struct sockaddr *from, socklen_t *fromlen); ++#include "lwipgz_sock.h" ++ ++void lwip_tcp_recvd(struct netconn *conn, size_t recvd, int flags); ++int lwip_tcp_recv_from(struct netconn *conn, struct sockaddr *from, socklen_t 
*fromlen, int dbg_ret); ++ ++void lwip_sendto_netbuf(struct netconn *conn, struct netbuf *buf, ++ const struct sockaddr *to, socklen_t tolen); ++ ++ssize_t lwip_recvmsg_check(const struct lwip_sock *sock, const struct msghdr *message, int flags); ++ssize_t lwip_sendmsg_check(const struct lwip_sock *sock, const struct msghdr *msg, int flags); ++ ++err_t lwip_recvfrom_udp_raw(struct lwip_sock *sock, int flags, ++ struct msghdr *msg, u16_t *datagram_len, int dbg_s); ++ + int lwip_accept4(int s, struct sockaddr *addr, socklen_t *addrlen, int flags); +-#endif ++#endif /* GAZELLE_ENABLE */ ++ + int lwip_accept(int s, struct sockaddr *addr, socklen_t *addrlen); + int lwip_bind(int s, const struct sockaddr *name, socklen_t namelen); + int lwip_shutdown(int s, int how); +diff --git a/src/include/lwip/tcp.h b/src/include/lwip/tcp.h +index 2ffd2ef..e3add72 100644 +--- a/src/include/lwip/tcp.h ++++ b/src/include/lwip/tcp.h +@@ -424,9 +424,6 @@ struct tcp_pcb { + struct rte_ring *client_tx_ring; + u8_t free_ring; + #endif /* GAZELLE_SAME_NODE */ +-#if GAZELLE_ENABLE +- u8_t need_tso_send; +-#endif + }; + + #if LWIP_EVENT_API +@@ -513,10 +510,6 @@ err_t tcp_shutdown(struct tcp_pcb *pcb, int shut_rx, int shut_tx); + + err_t tcp_write (struct tcp_pcb *pcb, const void *dataptr, u16_t len, + u8_t apiflags); +-#if GAZELLE_ENABLE +-err_t tcp_write_from_stack (struct tcp_pcb *pcb, const void *dataptr, u16_t len, +- u8_t apiflags); +-#endif + + void tcp_setprio (struct tcp_pcb *pcb, u8_t prio); + +diff --git a/src/include/lwipgz_offload.h b/src/include/lwipgz_offload.h +index 19b0190..31d3a28 100644 +--- a/src/include/lwipgz_offload.h ++++ b/src/include/lwipgz_offload.h +@@ -38,50 +38,55 @@ + #if GAZELLE_ENABLE + #include + #include +-#include ++#include + #include + + #include "dpdk_version.h" + #include "lwip/pbuf.h" + +-#define PBUF_TO_MBUF(p) ((struct rte_mbuf *)RTE_PTR_SUB(p, sizeof(struct rte_mbuf))) + + static inline void pbuf_offload_copy(struct pbuf *to, const struct pbuf 
*from) + { +- PBUF_TO_MBUF(to)->l4_len = PBUF_TO_MBUF(from)->l4_len; +- PBUF_TO_MBUF(to)->l3_len = PBUF_TO_MBUF(from)->l3_len; +- PBUF_TO_MBUF(to)->l2_len = PBUF_TO_MBUF(from)->l2_len; +- PBUF_TO_MBUF(to)->ol_flags = PBUF_TO_MBUF(from)->ol_flags; ++ struct rte_mbuf *m_to = pbuf_to_mbuf(to); ++ struct rte_mbuf *m_from = pbuf_to_mbuf(from); ++ ++ m_to->l4_len = m_from->l4_len; ++ m_to->l3_len = m_from->l3_len; ++ m_to->l2_len = m_from->l2_len; ++ m_to->ol_flags = m_from->ol_flags; + } + + static inline void pbuf_set_vlan(struct pbuf *p, u16_t vlan_tci) + { +- PBUF_TO_MBUF(p)->ol_flags |= RTE_MBUF_F_TX_VLAN; +- PBUF_TO_MBUF(p)->vlan_tci = vlan_tci; ++ struct rte_mbuf *m = pbuf_to_mbuf(p); ++ m->ol_flags |= RTE_MBUF_F_TX_VLAN; ++ m->vlan_tci = vlan_tci; + } + + #if OFFLOAD_CHECKSUM_CHECK_IP + // replaces inet_chksum() for ip4_input + static inline u64_t ol_chksum_check_ip(struct pbuf *p) + { +- return PBUF_TO_MBUF(p)->ol_flags & (RTE_MBUF_F_RX_IP_CKSUM_BAD); ++ return pbuf_to_mbuf(p)->ol_flags & (RTE_MBUF_F_RX_IP_CKSUM_BAD); + } + #endif /* OFFLOAD_CHECKSUM_CHECK_IP */ + + #if OFFLOAD_CHECKSUM_GEN_IP + static inline void ol_chksum_gen_eth(struct pbuf *p, u16_t len) + { +- PBUF_TO_MBUF(p)->l2_len = len; ++ pbuf_to_mbuf(p)->l2_len = len; + } + + // replaces inet_chksum() for ip4_output + static inline void ol_chksum_gen_ip(struct pbuf *p, u16_t len, bool do_ipcksum) + { +- PBUF_TO_MBUF(p)->ol_flags |= ((len == IP_HLEN) ? RTE_MBUF_F_TX_IPV4 : RTE_MBUF_F_TX_IPV6); ++ struct rte_mbuf *m = pbuf_to_mbuf(p); ++ ++ m->ol_flags |= ((len == IP_HLEN) ? 
RTE_MBUF_F_TX_IPV4 : RTE_MBUF_F_TX_IPV6); + if (do_ipcksum) { +- PBUF_TO_MBUF(p)->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM; ++ m->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM; + } +- PBUF_TO_MBUF(p)->l3_len = len; ++ m->l3_len = len; + } + #endif /* OFFLOAD_CHECKSUM_GEN_IP */ + +@@ -89,7 +94,7 @@ static inline void ol_chksum_gen_ip(struct pbuf *p, u16_t len, bool do_ipcksum) + // replace ip_chksum_pseudo() for tcp_input + static inline u64_t ol_chksum_check_l4(struct pbuf *p) + { +- return PBUF_TO_MBUF(p)->ol_flags & (RTE_MBUF_F_RX_L4_CKSUM_BAD); ++ return pbuf_to_mbuf(p)->ol_flags & (RTE_MBUF_F_RX_L4_CKSUM_BAD); + } + #define ol_chksum_check_tcp ol_chksum_check_l4 + #define ol_chksum_check_udp ol_chksum_check_l4 +@@ -99,18 +104,36 @@ static inline u64_t ol_chksum_check_l4(struct pbuf *p) + // replace ip_chksum_pseudo() for tcp_output + static inline void ol_chksum_gen_tcp(struct pbuf *p, u16_t len) + { +- PBUF_TO_MBUF(p)->l4_len = len; +- PBUF_TO_MBUF(p)->ol_flags |= RTE_MBUF_F_TX_TCP_CKSUM; ++ struct rte_mbuf *m = pbuf_to_mbuf(p); ++ m->l4_len = len; ++ m->ol_flags |= RTE_MBUF_F_TX_TCP_CKSUM; + } + #endif /* OFFLOAD_CHECKSUM_GEN_TCP */ + + #if OFFLOAD_CHECKSUM_GEN_UDP + static inline void ol_chksum_gen_udp(struct pbuf *p, u16_t len) + { +- PBUF_TO_MBUF(p)->l4_len = len; +- PBUF_TO_MBUF(p)->ol_flags |= RTE_MBUF_F_TX_UDP_CKSUM; ++ struct rte_mbuf *m = pbuf_to_mbuf(p); ++ m->l4_len = len; ++ m->ol_flags |= RTE_MBUF_F_TX_UDP_CKSUM; + } + #endif /* OFFLOAD_CHECKSUM_GEN_UDP */ + ++#if OFFLOAD_TX_TCP_TSO ++static inline void ol_tx_tcp_tso_set(struct pbuf *p, u16_t len) ++{ ++ struct rte_mbuf *m = pbuf_to_mbuf(p); ++ m->ol_flags |= RTE_MBUF_F_TX_TCP_SEG; ++ m->tso_segsz = len; ++} ++ ++static inline void ol_tx_tcp_tso_unset(struct pbuf *p) ++{ ++ struct rte_mbuf *m = pbuf_to_mbuf(p); ++ m->ol_flags &= ~RTE_MBUF_F_TX_TCP_SEG; ++ m->tso_segsz = 0; ++} ++#endif /* OFFLOAD_TX_TCP_TSO */ ++ + #endif /* GAZELLE_ENABLE */ + #endif /* __LWIPGZ_OFFLOAD_H__ */ +diff --git 
a/src/include/lwipgz_sock.h b/src/include/lwipgz_sock.h +index 36527cf..57193e9 100644 +--- a/src/include/lwipgz_sock.h ++++ b/src/include/lwipgz_sock.h +@@ -85,6 +85,10 @@ void lwip_sock_init(void); + void lwip_exit(void); + + ++extern void mem_put_pbuf(struct pbuf *pbuf); ++extern struct pbuf *mem_get_pbuf(int stack_id, bool reserve); ++extern void mem_init_pbuf(struct pbuf *pbuf, pbuf_layer layer, u16_t tot_len, u16_t len, pbuf_type type); ++ + extern void sock_event_notify_pending(struct lwip_sock *sock, enum netconn_evt evt, unsigned len); + extern void sock_event_remove_pending(struct lwip_sock *sock, enum netconn_evt evt, unsigned len); + +@@ -92,18 +96,11 @@ extern int do_lwip_init_sock(int fd); + extern void do_lwip_clean_sock(int fd); + extern void do_lwip_connected_callback(int fd); + +-extern void do_lwip_add_recvlist(int32_t fd); +-extern struct pbuf *do_lwip_udp_get_from_sendring(struct lwip_sock *sock, uint16_t remain_size); +-extern struct pbuf *do_lwip_tcp_get_from_sendring(struct lwip_sock *sock, uint16_t remain_size); +-extern void do_lwip_get_from_sendring_over(struct lwip_sock *sock); +-extern ssize_t do_lwip_read_from_lwip(struct lwip_sock *sock, int32_t flags, u8_t apiflags); +- + struct sock_time_stamp { + uint64_t rpc_time_stamp; + uint64_t mbox_time_stamp; + }; + extern void lstack_calculate_aggregate(int type, uint32_t len); +-extern void time_stamp_transfer_pbuf(struct pbuf *pbuf_old, struct pbuf *pbuf_new); + extern void time_stamp_record(int fd, struct pbuf *pbuf); + + #endif /* GAZELLE_ENABLE */ +@@ -174,21 +171,6 @@ struct lwip_sock { + #endif + + #if GAZELLE_ENABLE +- char pad0 __rte_cache_aligned; +- /* app thread use */ +- struct pbuf *recv_lastdata; /* unread data in one pbuf */ +- uint16_t remain_len; +- +- char pad1 __rte_cache_aligned; +- /* app and stack thread all use */ +- uint32_t call_num; /* avoid sock too much send rpc msg*/ +- char pad2 __rte_cache_aligned; +- /* stack thread all use */ +- struct list_node recv_list; 
+- struct pbuf *send_pre_del; +- sem_t snd_ring_sem; +- +- char pad3 __rte_cache_aligned; + /* nerver change */ + enum posix_type type; + int stack_id; +@@ -201,9 +183,6 @@ struct lwip_sock { + + struct sock_time_stamp stamp; + +- struct rte_ring *recv_ring; +- struct rte_ring *send_ring; +- + #if GAZELLE_SAME_NODE + /* same node send data ring */ + struct same_node_ring *same_node_rx_ring; +diff --git a/src/include/lwipopts.h b/src/include/lwipopts.h +index 257dbb9..08ab34c 100644 +--- a/src/include/lwipopts.h ++++ b/src/include/lwipopts.h +@@ -43,9 +43,7 @@ + #define GAZELLE_USE_DPDK_LOG 1 + + #define GAZELLE_ENABLE 1 +-#define PER_THREAD __thread +- +-#define FRAME_MTU 1500 ++#define PER_THREAD __attribute__((tls_model("initial-exec"))) __thread + + #define GAZELLE_MAX_CLIENTS (20000) + #define GAZELLE_RESERVED_CLIENTS (2000) +@@ -87,6 +85,13 @@ + #define OFFLOAD_CHECKSUM_CHECK_UDP (1 && CHECKSUM_CHECK_UDP) + #define OFFLOAD_CHECKSUM_GEN_UDP (1 && CHECKSUM_GEN_UDP) + ++#define OFFLOAD_TX_TCP_TSO 1 ++#define OFFLOAD_TX_UDP_TSO 0 ++#define OFFLAOD_TX_TSO_MIN_SEGLEN 256 ++/* dpdk pmd support 40 max segs */ ++#define OFFLOAD_TX_TSO_MTU_FRAGS 40 /* 0xFFFF / 1460 */ ++/* kernel define MAX_SKB_FRAGS in skbuff.h */ ++#define OFFLOAD_TX_TSO_4K_FRAGS 16 /* 0xFFFF / 4096 */ + + /* + --------------------------------------- +@@ -125,6 +130,9 @@ + ---------- Internal Memory Pool Sizes ---------- + ------------------------------------------------ + */ ++//#define MEMCPY(dst,src,len) rte_memcpy(dst,src,len) ++//#define SMEMCPY(dst,src,len) rte_memcpy(dst,src,len) ++ + #define LWIP_SUPPORT_CUSTOM_PBUF 1 + + #define LWIP_CHECKSUM_ON_COPY 0 +@@ -136,6 +144,8 @@ + #define MEM_USE_POOLS 0 + #define MEMP_USE_CUSTOM_POOLS 0 + ++#define MEM_SIZE (GAZELLE_MAX_CLIENTS * 0xff) ++ + #define MEMP_NUM_TCP_PCB_LISTEN 3000 + + #define MEMP_NUM_TCP_PCB (GAZELLE_MAX_CLIENTS + GAZELLE_RESERVED_CLIENTS) +@@ -148,14 +158,10 @@ + + #define MEMP_NUM_SYS_MBOX ((GAZELLE_MAX_CLIENTS + 
GAZELLE_RESERVED_CLIENTS) * 2) + +-#define PBUF_POOL_SIZE (1024) +- +-/* we use PBUF_POOL instead of PBUF_RAM in tcp_write, so reduce PBUF_RAM size, +- * and do NOT let PBUF_POOL_BUFSIZE less then TCP_MSS +-*/ +-#define MEMP_NUM_TCP_SEG (128 * 128 * 2) +-#define PER_TCP_PCB_BUFFER (16 * 128) +-#define MEM_SIZE (((PER_TCP_PCB_BUFFER + 128) * MEMP_NUM_TCP_SEG) >> 2) ++#define MEMP_NUM_NETBUF 1 ++#define MEMP_NUM_TCP_SEG 1 ++#define PBUF_POOL_SIZE 1 ++#define MEMP_NUM_PBUF 1 + + /* + --------------------------------- +@@ -166,9 +172,9 @@ + + #define ARP_TABLE_SIZE 512 + +-#define ARP_QUEUEING 1 +- +-#define ARP_QUEUE_LEN 32 ++#define ARP_QUEUEING 1 ++#define ARP_QUEUE_LEN 2048 ++#define MEMP_NUM_ARP_QUEUE (ARP_QUEUE_LEN + 64) + + #define ETHARP_SUPPORT_STATIC_ENTRIES 1 + +@@ -185,8 +191,8 @@ + + #define IP_HLEN 20 + +-/* the max pbuf num of a udp pbuf chain is ((65535 + MBUF_MAX_DATA_LEN - 1) / MBUF_MAX_DATA_LEN) */ +-#define IP_REASS_MAX_PBUFS 46 ++/* NIC driver may split the MTU into two pbuf */ ++#define IP_REASS_MAX_PBUFS ((0xFFFF / TCP_MSS + 1) * 2) + + /* + ------------------------------------- +@@ -224,35 +230,38 @@ + + #define TCP_HLEN 20 + +-#define DEFAULT_ACCEPTMBOX_SIZE 4096 +-#define DEFAULT_TCP_RECVMBOX_SIZE 4096 +- +-#define TCP_LISTEN_BACKLOG 1 +-#define TCP_DEFAULT_LISTEN_BACKLOG 0xffff +- +-#define TCP_OVERSIZE TCP_MSS +-#define LWIP_NETIF_TX_SINGLE_PBUF 1 +- +-#define TCP_MSS (FRAME_MTU - IP6_HLEN - TCP_HLEN - VLAN_LEN) +- +-#define TCP_WND (2500 * TCP_MSS) +- +-#define TCP_SND_BUF (2500 * TCP_MSS) +- +-#define TCP_SND_QUEUELEN (8191) +- +-#define TCP_SND_BUF_MAX (TCP_SND_QUEUELEN * TCP_MSS) +- +-#define TCP_SNDLOWAT (32768) +- +-#define TCP_SNDQUEUELOWAT (TCP_SND_QUEUELEN / 5) +- + #define LWIP_TCP_KEEPALIVE 1 + + #define LWIP_TCP_SACK_OUT 1 + #define LWIP_WND_SCALE 1 + #define TCP_RCV_SCALE 6 + ++#define DEFAULT_ACCEPTMBOX_SIZE 4096 ++#define DEFAULT_TCP_RECVMBOX_SIZE 4096 ++#define DEFAULT_SENDMBOX_SIZE 256 ++ ++#define TCP_LISTEN_BACKLOG 1 
++#define TCP_DEFAULT_LISTEN_BACKLOG (DEFAULT_ACCEPTMBOX_SIZE << 1) ++ ++#define TCP_MSS (GAZELLE_ETH_MTU - PBUF_TRANSPORT) ++#define TCP_WND (2500 * TCP_MSS) ++// #define TCP_WND_UPDATE_THRESHOLD (16 * TCP_MSS) ++#define TCP_SND_BUF (2500 * TCP_MSS) ++#if LWIP_WND_SCALE ++#define TCP_SNDLOWAT LWIP_MIN(0xFFFF, (4 + OFFLOAD_TX_TSO_MTU_FRAGS) * TCP_MSS) ++#else /* LWIP_WND_SCALE */ ++#define TCP_SNDLOWAT (16 * TCP_MSS) ++#endif /* LWIP_WND_SCALE */ ++#define TCP_SND_QUEUELEN (5000) ++#define TCP_SNDQUEUELOWAT (TCP_SND_QUEUELEN - DEFAULT_SENDMBOX_SIZE - 32) ++#define TCP_SND_BUF_MAX (TCP_SND_QUEUELEN * TCP_MSS) ++#define TCP_OVERSIZE TCP_MSS ++/* round up to TCP_OVERSIZE */ ++#define TCP_OVERSIZE_CALC_LENGTH(length) \ ++ ((u16_t)(length) > 0xFFFF - TCP_OVERSIZE) ? (u16_t)(length) : \ ++ (((u16_t)(length) + TCP_OVERSIZE - 1) / TCP_OVERSIZE * TCP_OVERSIZE) ++ ++#define TCP_RECV_AND_UPDATE 0 + #define SOCK_WAIT_BATCH_NOTIFY 1 + + #define GAZELLE_TCP_LAST_SEG 1 +@@ -325,11 +334,29 @@ + #define LWIP_NETIF_LOOPBACK_MULTITHREADING 0 + #define LWIP_NETIF_LOOPBACK 1 + ++#define LWIP_NETIF_TX_SINGLE_PBUF 0 ++ + #define ETHARP_SUPPORT_VLAN 1 + #define LWIP_VLAN_PCP 1 +-#define VLAN_LEN 4 +- +- ++#ifndef SIZEOF_VLAN_HDR ++#define SIZEOF_VLAN_HDR 4 ++#endif ++ ++#define ETH_PAD_SIZE 0 ++#ifndef SIZEOF_ETH_HDR ++#define SIZEOF_ETH_HDR (14 + ETH_PAD_SIZE) ++#endif ++#define ETH_CRC_SIZE 4 ++ ++/* | MAC | IP | TCP/UDP | Payload | CRC | */ ++/* | TCP_MSS | */ ++/* | GAZELLE_IP_MTU | */ ++/* | GAZELLE_ETH_MTU | */ ++/* | PBUF_POOL_BUFSIZE | */ ++#define GAZELLE_IP_MTU 1500 ++/* PBUF_LINK_HLEN = SIZEOF_ETH_HDR + SIZEOF_VLAN_HDR + ETH_PAD_SIZE */ ++#define GAZELLE_ETH_MTU (PBUF_LINK_HLEN + GAZELLE_IP_MTU) ++#define PBUF_POOL_BUFSIZE LWIP_MEM_ALIGN_SIZE(GAZELLE_ETH_MTU + ETH_CRC_SIZE) + + /* + ------------------------------------ +-- +2.33.0 + diff --git a/0185-tcp-add-GAZELLE_TCP_ASYNC_RECVD.patch b/0185-tcp-add-GAZELLE_TCP_ASYNC_RECVD.patch new file mode 100644 index 0000000..5aa0cb1 
--- /dev/null +++ b/0185-tcp-add-GAZELLE_TCP_ASYNC_RECVD.patch @@ -0,0 +1,166 @@ +From 2f4a262b54eea1f21d00f25144ca0bb859f09c11 Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Fri, 18 Apr 2025 11:03:30 +0800 +Subject: [PATCH 04/11] tcp: add GAZELLE_TCP_ASYNC_RECVD + +Signed-off-by: Lemmy Huang +--- + src/api/api_msg.c | 53 +++++++++++++++++++++++++++++++-- + src/core/tcp.c | 11 ------- + src/include/lwip/priv/api_msg.h | 4 +++ + src/include/lwip/tcp.h | 5 ++++ + src/include/lwipopts.h | 2 +- + 5 files changed, 60 insertions(+), 15 deletions(-) + +diff --git a/src/api/api_msg.c b/src/api/api_msg.c +index bbde5a5..a8de131 100644 +--- a/src/api/api_msg.c ++++ b/src/api/api_msg.c +@@ -390,9 +390,14 @@ recv_tcp(void *arg, struct tcp_pcb *pcb, struct pbuf *p, err_t err) + #if GAZELLE_ENABLE + lstack_calculate_aggregate(0, len); + time_stamp_record(conn->callback_arg.socket, p); +-#if !TCP_RECV_AND_UPDATE +- tcp_recved(conn->pcb.tcp, len); +-#endif /* TCP_RECV_AND_UPDATE */ ++ if (!SYS_CONFIG(rtc_mode)) { ++#if GAZELLE_TCP_ASYNC_RECVD ++ u32_t recvd = lwip_netconn_get_recvd(conn, len, 1); ++ lwip_netconn_update_recvd(conn, recvd); ++#else /* GAZELLE_TCP_ASYNC_RECVD */ ++ tcp_recved(conn->pcb.tcp, len); ++#endif /* GAZELLE_TCP_ASYNC_RECVD */ ++ } + #endif /* GAZELLE_ENABLE */ + /* Register event with callback */ + API_EVENT(conn, NETCONN_EVT_RCVPLUS, len); +@@ -1025,6 +1030,13 @@ lwip_netconn_do_close_internal(struct netconn *conn WRITE_DELAYED_PARAM) + shut_close = 0; + } + ++#if GAZELLE_TCP_ASYNC_RECVD ++ if (!SYS_CONFIG(rtc_mode) && tpcb->state > LISTEN) { ++ lwip_netconn_update_recvd(conn, tpcb->unrcved_len); ++ tpcb->unrcved_len = tpcb->unrcved_cnt = 0; ++ } ++#endif /* GAZELLE_TCP_ASYNC_RECVD */ ++ + /* Set back some callback pointers */ + if (shut_close) { + tcp_arg(tpcb, NULL); +@@ -1687,6 +1699,41 @@ lwip_netconn_do_recv(void *m) + TCPIP_APIMSG_ACK(msg); + } + ++#if GAZELLE_TCP_ASYNC_RECVD ++u32_t lwip_netconn_get_recvd(struct netconn *conn, u16_t len, u16_t 
cnt) ++{ ++ struct mbox_ring *mr; ++ u16_t mboxcnt, percnt_len; ++ u32_t recvd; ++ ++ if (!sys_mbox_valid(&conn->recvmbox)) ++ return 0; ++ ++ mr = &conn->recvmbox->mring; ++ mboxcnt = mr->ops->recv_count(mr); ++ ++ conn->pcb.tcp->unrcved_len += len; ++ conn->pcb.tcp->unrcved_cnt += cnt; ++ percnt_len = conn->pcb.tcp->unrcved_len / conn->pcb.tcp->unrcved_cnt; ++ recvd = percnt_len * (conn->pcb.tcp->unrcved_cnt - mboxcnt); ++ ++ conn->pcb.tcp->unrcved_len -= recvd; ++ conn->pcb.tcp->unrcved_cnt = mboxcnt; ++ return recvd; ++} ++ ++/* see lwip_netconn_do_recv() */ ++void lwip_netconn_update_recvd(struct netconn *conn, u32_t remaining) ++{ ++ u16_t recved; ++ while (remaining > 0) { ++ recved = (u16_t)((remaining > 0xffff) ? 0xffff : remaining); ++ tcp_recved(conn->pcb.tcp, recved); ++ remaining -= recved; ++ }; ++} ++#endif /* GAZELLE_TCP_ASYNC_RECVD */ ++ + #if TCP_LISTEN_BACKLOG + /** Indicate that a TCP pcb has been accepted + * Called from netconn_accept +diff --git a/src/core/tcp.c b/src/core/tcp.c +index 97d2c4d..cf618b5 100644 +--- a/src/core/tcp.c ++++ b/src/core/tcp.c +@@ -2168,17 +2168,6 @@ tcp_alloc(u8_t prio) + pcb->keep_cnt = TCP_KEEPCNT_DEFAULT; + #endif /* LWIP_TCP_KEEPALIVE */ + +-#if GAZELLE_SAME_NODE +- pcb->client_rx_ring = NULL; +- pcb->client_tx_ring = NULL; +- pcb->free_ring = 0; +-#endif /* GAZELLE_SAME_NODE */ +-#if GAZELLE_TCP_PINGPONG_MODE +- pcb->lrcvtime = 0; +- pcb->lsndtime = 0; +- pcb->pingpong = 0; +-#endif +- + pcb_tci_init(pcb); + } + return pcb; +diff --git a/src/include/lwip/priv/api_msg.h b/src/include/lwip/priv/api_msg.h +index b36f00a..be980b2 100644 +--- a/src/include/lwip/priv/api_msg.h ++++ b/src/include/lwip/priv/api_msg.h +@@ -200,6 +200,10 @@ void lwip_netconn_do_disconnect (void *m); + void lwip_netconn_do_listen (void *m); + void lwip_netconn_do_send (void *m); + void lwip_netconn_do_recv (void *m); ++#if GAZELLE_TCP_ASYNC_RECVD ++u32_t lwip_netconn_get_recvd(struct netconn *conn, u16_t len, u16_t cnt); ++void 
lwip_netconn_update_recvd(struct netconn *conn, u32_t remaining); ++#endif /* GAZELLE_TCP_ASYNC_RECVD */ + #if TCP_LISTEN_BACKLOG + void lwip_netconn_do_accepted (void *m); + #endif /* TCP_LISTEN_BACKLOG */ +diff --git a/src/include/lwip/tcp.h b/src/include/lwip/tcp.h +index e3add72..a0da5a9 100644 +--- a/src/include/lwip/tcp.h ++++ b/src/include/lwip/tcp.h +@@ -414,10 +414,15 @@ struct tcp_pcb { + u8_t rcv_scale; + #endif + ++ /* tcp_alloc() would memset to 0. */ + #if GAZELLE_TCP_LAST_SEG + struct tcp_seg *last_unsent; + struct tcp_seg *last_unacked; + #endif /* GAZELLE_TCP_LAST_SEG */ ++#if GAZELLE_TCP_ASYNC_RECVD ++ u32_t unrcved_len; ++ u32_t unrcved_cnt; ++#endif /* GAZELLE_TCP_ASYNC_RECVD */ + #if GAZELLE_SAME_NODE + #define SAME_NODE_RING_SIZE 512 + struct rte_ring *client_rx_ring; +diff --git a/src/include/lwipopts.h b/src/include/lwipopts.h +index 08ab34c..ef34d1b 100644 +--- a/src/include/lwipopts.h ++++ b/src/include/lwipopts.h +@@ -261,8 +261,8 @@ + ((u16_t)(length) > 0xFFFF - TCP_OVERSIZE) ? 
(u16_t)(length) : \ + (((u16_t)(length) + TCP_OVERSIZE - 1) / TCP_OVERSIZE * TCP_OVERSIZE) + +-#define TCP_RECV_AND_UPDATE 0 + #define SOCK_WAIT_BATCH_NOTIFY 1 ++#define GAZELLE_TCP_ASYNC_RECVD 1 + + #define GAZELLE_TCP_LAST_SEG 1 + +-- +2.33.0 + diff --git a/0186-socket-fix-tcp-closed.patch b/0186-socket-fix-tcp-closed.patch new file mode 100644 index 0000000..b2238a7 --- /dev/null +++ b/0186-socket-fix-tcp-closed.patch @@ -0,0 +1,30 @@ +From 8f838bdaec28b67590ea082217ccef81ffdf0ff5 Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Sat, 19 Apr 2025 21:34:07 +0800 +Subject: [PATCH 05/11] socket: fix tcp closed + +Signed-off-by: Lemmy Huang +--- + src/api/sockets.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/src/api/sockets.c b/src/api/sockets.c +index fa32476..c42334a 100644 +--- a/src/api/sockets.c ++++ b/src/api/sockets.c +@@ -623,6 +623,12 @@ free_socket_locked(struct lwip_sock *sock, int is_tcp, struct netconn **conn, + sock->lastdata.pbuf = NULL; + *conn = sock->conn; + sock->conn = NULL; ++#if GAZELLE_ENABLE ++ err_t err; ++ int lwip_netconn_is_err_msg(void *msg, err_t *err); ++ if (lwip_netconn_is_err_msg(lastdata->pbuf, &err)) ++ lastdata->pbuf = NULL; ++#endif /* GAZELLE_ENABLE */ + return 1; + } + +-- +2.33.0 + diff --git a/0187-socket-fix-sk_wait-cannot-be-interrupted-by-signals.patch b/0187-socket-fix-sk_wait-cannot-be-interrupted-by-signals.patch new file mode 100644 index 0000000..f465e33 --- /dev/null +++ b/0187-socket-fix-sk_wait-cannot-be-interrupted-by-signals.patch @@ -0,0 +1,89 @@ +From 3a691f86427bbbec01aa31fc95f7e6cb45155ee9 Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Sun, 20 Apr 2025 12:11:06 +0800 +Subject: [PATCH 06/11] socket: fix sk_wait cannot be interrupted by signals + +Signed-off-by: Lemmy Huang +--- + src/include/arch/sys_arch.h | 23 +++++++++++++++-------- + 1 file changed, 15 insertions(+), 8 deletions(-) + +diff --git a/src/include/arch/sys_arch.h b/src/include/arch/sys_arch.h +index c36aa45..d70e776 
100644 +--- a/src/include/arch/sys_arch.h ++++ b/src/include/arch/sys_arch.h +@@ -37,6 +37,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -118,24 +119,27 @@ void sys_sem_signal_internal(sys_sem_t s) + static inline + int sys_sem_wait_internal(sys_sem_t s, int timeout) + { +- if (unlikely(timeout == 0)) { ++ int ret = 0; ++ if (unlikely(timeout == 0)) + return 0; +- } + + if (timeout < 0) { +- sem_wait(&s->ksem); ++ ret = sem_wait(&s->ksem); + } else { + u32_t start = sys_now(); + struct timespec ts; + + sys_ms2timespec(&ts, timeout); +- sem_timedwait(&s->ksem, &ts); ++ ret = sem_timedwait(&s->ksem, &ts); + + if (timeout <= (int)(sys_now() - start)) { + timeout = 0; + } + } + ++ if (unlikely(ret != 0)) ++ return ret; ++ errno = 0; + return timeout; + } + +@@ -170,24 +174,27 @@ void sys_mutex_unlock_internal(sys_mutex_t m) + static inline + int sys_mutex_timedlock_internal(sys_mutex_t m, int timeout) + { +- if (unlikely(timeout == 0)) { ++ int ret = 0; ++ if (unlikely(timeout == 0)) + return 0; +- } + + if (timeout < 0) { +- pthread_mutex_lock(&m->klock); ++ ret = pthread_mutex_lock(&m->klock); + } else { + u32_t start = sys_now(); + struct timespec ts; + + sys_ms2timespec(&ts, timeout); +- pthread_mutex_timedlock(&m->klock, &ts); ++ ret = pthread_mutex_timedlock(&m->klock, &ts); + + if (timeout <= (int)(sys_now() - start)) { + timeout = 0; + } + } + ++ if (unlikely(ret != 0)) ++ return ret; ++ errno = 0; + return timeout; + } + +-- +2.33.0 + diff --git a/0188-mempool-fix-sendmbox-not-free.patch b/0188-mempool-fix-sendmbox-not-free.patch new file mode 100644 index 0000000..ee1f72a --- /dev/null +++ b/0188-mempool-fix-sendmbox-not-free.patch @@ -0,0 +1,56 @@ +From 61d81c3c8c94231bf5f4cad9988c7b93a609f05a Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Mon, 21 Apr 2025 15:17:08 +0800 +Subject: [PATCH 07/11] mempool: fix sendmbox not free fix + mem_get_pbuf not reserve + +Signed-off-by: Lemmy Huang +--- + src/api/api_msg.c | 10 
++++++++++ + src/core/pbuf.c | 2 +- + 2 files changed, 11 insertions(+), 1 deletion(-) + +diff --git a/src/api/api_msg.c b/src/api/api_msg.c +index a8de131..676ca9a 100644 +--- a/src/api/api_msg.c ++++ b/src/api/api_msg.c +@@ -935,6 +935,12 @@ netconn_drain(struct netconn *conn) + sys_mbox_free(&conn->recvmbox); + sys_mbox_set_invalid(&conn->recvmbox); + } ++#if GAZELLE_ENABLE ++ if (sys_mbox_valid(&conn->sendmbox)) { ++ sys_mbox_free(&conn->sendmbox); ++ sys_mbox_set_invalid(&conn->sendmbox); ++ } ++#endif /* GAZELLE_ENABLE */ + + /* Delete and drain the acceptmbox. */ + #if LWIP_TCP +@@ -1585,6 +1591,10 @@ lwip_netconn_do_listen(void *m) + err = sys_mbox_new(&msg->conn->acceptmbox, DEFAULT_ACCEPTMBOX_SIZE); + #else /* GAZELLE_ENABLE */ + err = sys_mbox_new_flags(&msg->conn->acceptmbox, DEFAULT_ACCEPTMBOX_SIZE, MBOX_FLAG_TCP); ++ if (sys_mbox_valid(&msg->conn->sendmbox)) { ++ sys_mbox_free(&msg->conn->sendmbox); ++ sys_mbox_set_invalid(&msg->conn->sendmbox); ++ } + #endif /* GAZELLE_ENABLE */ + } + if (err == ERR_OK) { +diff --git a/src/core/pbuf.c b/src/core/pbuf.c +index 4cfe369..04a0c99 100644 +--- a/src/core/pbuf.c ++++ b/src/core/pbuf.c +@@ -264,7 +264,7 @@ pbuf_alloc(pbuf_layer layer, u16_t length, pbuf_type type) + do { + u16_t qlen; + #if GAZELLE_ENABLE +- q = mem_get_pbuf(-1, false); ++ q = mem_get_pbuf(-1, layer != PBUF_LINK); + #else /* GAZELLE_ENABLE */ + q = (struct pbuf *)memp_malloc(MEMP_PBUF_POOL); + #endif /* GAZELLE_ENABLE */ +-- +2.33.0 + diff --git a/0189-udp-fix-ip6_frag-nfb-and-last.patch b/0189-udp-fix-ip6_frag-nfb-and-last.patch new file mode 100644 index 0000000..a62d345 --- /dev/null +++ b/0189-udp-fix-ip6_frag-nfb-and-last.patch @@ -0,0 +1,111 @@ +From fd16c9099000df9cd42507477b943c7fbd19070f Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Tue, 3 Jun 2025 15:30:49 +0800 +Subject: [PATCH 08/11] udp: fix ip6_frag nfb and last + +Signed-off-by: Lemmy Huang +--- + src/core/ipv4/ip4_frag.c | 9 +++++---- + src/core/ipv6/ip6_frag.c | 16 
++++++---------- + 2 files changed, 11 insertions(+), 14 deletions(-) + +diff --git a/src/core/ipv4/ip4_frag.c b/src/core/ipv4/ip4_frag.c +index 01dd0f3..7c1aca5 100644 +--- a/src/core/ipv4/ip4_frag.c ++++ b/src/core/ipv4/ip4_frag.c +@@ -804,15 +804,13 @@ ip4_frag(struct pbuf *p, struct netif *netif, const ip4_addr_t *dest) + #endif /* GAZELLE_UDP_ENABLE */ + + while (left) { +- /* Fill this fragment */ +- fragsize = LWIP_MIN(left, (u16_t)(nfb * 8)); +- + #if GAZELLE_UDP_ENABLE + LWIP_ASSERT("ip4_frag find a tcp pbuf!", (IPH_PROTO(original_iphdr) == IP_PROTO_TCP)); + if (IPH_PROTO(original_iphdr) == IP_PROTO_UDP) { + pbuf_split_one(q, &rest_q); + rambuf = q; + q = rest_q; ++ last = (q == NULL); + + /* first pbuf aleady added header */ + if (rambuf != p) { +@@ -830,6 +828,9 @@ ip4_frag(struct pbuf *p, struct netif *netif, const ip4_addr_t *dest) + } else + #endif /* GAZELLE_UDP_ENABLE */ + { ++ /* Fill this fragment */ ++ fragsize = LWIP_MIN(left, (u16_t)(nfb * 8)); ++ + #if LWIP_NETIF_TX_SINGLE_PBUF + rambuf = pbuf_alloc(PBUF_IP, fragsize, PBUF_RAM); + if (rambuf == NULL) { +@@ -902,10 +903,10 @@ ip4_frag(struct pbuf *p, struct netif *netif, const ip4_addr_t *dest) + } + poff = (u16_t)(poff + newpbuflen); + #endif /* LWIP_NETIF_TX_SINGLE_PBUF */ +- } + + /* Correct header */ + last = (left <= netif->mtu - IP_HLEN); ++ } + + /* Set new offset and MF flag */ + tmp = (IP_OFFMASK & (ofo)); +diff --git a/src/core/ipv6/ip6_frag.c b/src/core/ipv6/ip6_frag.c +index a5eb620..c1d7e40 100644 +--- a/src/core/ipv6/ip6_frag.c ++++ b/src/core/ipv6/ip6_frag.c +@@ -731,11 +731,7 @@ ip6_frag(struct pbuf *p, struct netif *netif, const ip6_addr_t *dest) + static u32_t identification; + u16_t left, cop; + const u16_t mtu = nd6_get_destination_mtu(dest, netif); +-#if !GAZELLE_UDP_ENABLE + const u16_t nfb = (u16_t)((mtu - (IP6_HLEN + IP6_FRAG_HLEN)) & IP6_FRAG_OFFSET_MASK); +-#else /* GAZELLE_UDP_ENABLE */ +- u16_t nfb = (u16_t)((mtu - (IP6_HLEN + IP6_FRAG_HLEN)) & IP6_FRAG_OFFSET_MASK); 
+-#endif /* GAZELLE_UDP_ENABLE */ + u16_t fragment_offset = 0; + u16_t last; + u16_t poff = IP6_HLEN; +@@ -766,17 +762,13 @@ ip6_frag(struct pbuf *p, struct netif *netif, const ip6_addr_t *dest) + #endif /* GAZELLE_UDP_ENABLE */ + + while (left) { +- last = (left <= nfb); +- +- /* Fill this fragment */ +- cop = last ? left : nfb; +- + #if GAZELLE_UDP_ENABLE + LWIP_ASSERT("ip6_frag find a tcp pbuf!", (IP6H_NEXTH(original_ip6hdr) == IP6_NEXTH_TCP)); + if (IP6H_NEXTH(original_ip6hdr) == IP6_NEXTH_UDP) { + pbuf_split_one(q, &rest_q); + rambuf = q; + q = rest_q; ++ last = (q == NULL); + + pbuf_add_header(rambuf, IP6_HLEN + IP6_FRAG_HLEN); + /* fill in the IP header */ +@@ -786,13 +778,17 @@ ip6_frag(struct pbuf *p, struct netif *netif, const ip6_addr_t *dest) + poff += cop; + ip6hdr = (struct ip6hdr *)rambuf->payload; + frag_hdr = (struct ip6_frag_hdr *)((u8_t*)rambuf->payload + IP6_HLEN); +- nfb = cop / 8; + + LWIP_DEBUGF(IP_REASS_DEBUG, ("ip4_frag: UDP p=%p, tot_len %u, fragsize %u, nfb %u\n", + rambuf, rambuf->tot_len, cop, nfb)); + } else + #endif /* GAZELLE_UDP_ENABLE */ + { ++ last = (left <= nfb); ++ ++ /* Fill this fragment */ ++ cop = last ? 
left : nfb; ++ + #if LWIP_NETIF_TX_SINGLE_PBUF + rambuf = pbuf_alloc(PBUF_IP, cop + IP6_FRAG_HLEN, PBUF_RAM); + if (rambuf == NULL) { +-- +2.33.0 + diff --git a/0190-udp-fix-recv_udp-sys_mbox_trypost-failed.patch b/0190-udp-fix-recv_udp-sys_mbox_trypost-failed.patch new file mode 100644 index 0000000..a0c345f --- /dev/null +++ b/0190-udp-fix-recv_udp-sys_mbox_trypost-failed.patch @@ -0,0 +1,41 @@ +From 775fe48755010ca6097e2f8bb86e85422829c324 Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Tue, 24 Jun 2025 14:38:25 +0800 +Subject: [PATCH 09/11] udp: fix recv_udp sys_mbox_trypost failed + +Signed-off-by: Lemmy Huang +--- + src/api/api_msg.c | 3 +++ + src/api/sys_arch.c | 2 +- + 2 files changed, 4 insertions(+), 1 deletion(-) + +diff --git a/src/api/api_msg.c b/src/api/api_msg.c +index 676ca9a..8333221 100644 +--- a/src/api/api_msg.c ++++ b/src/api/api_msg.c +@@ -316,6 +316,9 @@ recv_udp(void *arg, struct udp_pcb *pcb, struct pbuf *p, + len = p->tot_len; + err = sys_mbox_trypost(&conn->recvmbox, buf); + if (err != ERR_OK) { ++#if GAZELLE_UDP_ENABLE ++ API_EVENT(conn, NETCONN_EVT_RCVPLUS, len); ++#endif /* GAZELLE_UDP_ENABLE */ + netbuf_delete(buf); + LWIP_DEBUGF(API_MSG_DEBUG, ("recv_udp: sys_mbox_trypost failed, err=%d\n", err)); + return; +diff --git a/src/api/sys_arch.c b/src/api/sys_arch.c +index b2d21f8..8e15650 100644 +--- a/src/api/sys_arch.c ++++ b/src/api/sys_arch.c +@@ -322,7 +322,7 @@ err_t sys_mbox_trypost(sys_mbox_t *mbox, void *msg) + + ret = mr->ops->enqueue_burst(mr, &msg, 1); + if (unlikely(ret == 0)) { +- LWIP_DEBUGF(SYS_DEBUG | LWIPGZ_LOG_ERR, ("sys_mbox_trypost failed, mbox %s\n", (*mbox)->name)); ++ LWIP_DEBUGF(SYS_DEBUG, ("sys_mbox_trypost failed, mbox %s\n", (*mbox)->name)); + return ERR_MEM; + } + return ERR_OK; +-- +2.33.0 + diff --git a/0191-sk_event-fix-rtw-epoll-wrong-event-notify-and-remove.patch b/0191-sk_event-fix-rtw-epoll-wrong-event-notify-and-remove.patch new file mode 100644 index 0000000..d6e7dc2 --- /dev/null +++ 
b/0191-sk_event-fix-rtw-epoll-wrong-event-notify-and-remove.patch @@ -0,0 +1,54 @@ +From 7ca59a28b86ddf71fc630ea573db8466133c4d53 Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Sat, 28 Jun 2025 15:02:32 +0800 +Subject: [PATCH 10/11] sk_event: fix rtw epoll wrong event notify and remove + +Signed-off-by: Lemmy Huang +--- + src/include/lwip/api.h | 10 +++++----- + src/include/lwipgz_sock.h | 4 +++- + 2 files changed, 8 insertions(+), 6 deletions(-) + +diff --git a/src/include/lwip/api.h b/src/include/lwip/api.h +index 695b807..5f15ac9 100644 +--- a/src/include/lwip/api.h ++++ b/src/include/lwip/api.h +@@ -182,11 +182,11 @@ enum netconn_state { + * A SENDMINUS event occurs when the next call to a netconn_send() would be blocking. + */ + enum netconn_evt { +- NETCONN_EVT_RCVPLUS, +- NETCONN_EVT_RCVMINUS, +- NETCONN_EVT_SENDPLUS, +- NETCONN_EVT_SENDMINUS, +- NETCONN_EVT_ERROR ++ NETCONN_EVT_RCVPLUS = 0x01, ++ NETCONN_EVT_RCVMINUS = 0x02, ++ NETCONN_EVT_SENDPLUS = 0x04, ++ NETCONN_EVT_SENDMINUS = 0x08, ++ NETCONN_EVT_ERROR = 0x10, + }; + + #if LWIP_IGMP || (LWIP_IPV6 && LWIP_IPV6_MLD) +diff --git a/src/include/lwipgz_sock.h b/src/include/lwipgz_sock.h +index 57193e9..b6d4efe 100644 +--- a/src/include/lwipgz_sock.h ++++ b/src/include/lwipgz_sock.h +@@ -192,11 +192,13 @@ struct lwip_sock { + #endif /* GAZELLE_SAME_NODE */ + + struct sock_event { ++ struct lwip_sock *sock; ++ + epoll_data_t ep_data; /* User data variable */ + unsigned events; /* requested events, EPOLLONESHOT write frequently */ + + #if SOCK_WAIT_BATCH_NOTIFY +- unsigned stk_pending; ++ enum netconn_evt stk_evts; + struct list_node stk_event_node; + #endif /* SOCK_WAIT_BATCH_NOTIFY */ + +-- +2.33.0 + diff --git a/0192-tcp-reduce-TCP_SNDQUEUELOWAT-to-1800.patch b/0192-tcp-reduce-TCP_SNDQUEUELOWAT-to-1800.patch new file mode 100644 index 0000000..af461bc --- /dev/null +++ b/0192-tcp-reduce-TCP_SNDQUEUELOWAT-to-1800.patch @@ -0,0 +1,62 @@ +From 2ffd85eef88dea7e167b744bad39ce26619936bd Mon Sep 17 00:00:00 
2001 +From: Lemmy Huang +Date: Fri, 4 Jul 2025 14:53:24 +0800 +Subject: [PATCH 11/11] tcp: reduce TCP_SNDQUEUELOWAT to 1800 fix SYN_SENT + state get EPOLLOUT + +Signed-off-by: Lemmy Huang +--- + src/api/api_msg.c | 1 + + src/core/tcp_out.c | 3 +++ + src/include/lwipopts.h | 6 +++--- + 3 files changed, 7 insertions(+), 3 deletions(-) + +diff --git a/src/api/api_msg.c b/src/api/api_msg.c +index 8333221..b600f26 100644 +--- a/src/api/api_msg.c ++++ b/src/api/api_msg.c +@@ -1426,6 +1426,7 @@ lwip_netconn_do_connected(void *arg, struct tcp_pcb *pcb, err_t err) + conn->state = NETCONN_NONE; + #if GAZELLE_ENABLE + do_lwip_connected_callback(conn->callback_arg.socket); ++ if (conn->pcb.tcp->state >= ESTABLISHED) + #endif + API_EVENT(conn, NETCONN_EVT_SENDPLUS, 0); + +diff --git a/src/core/tcp_out.c b/src/core/tcp_out.c +index 759cb22..9d0746c 100644 +--- a/src/core/tcp_out.c ++++ b/src/core/tcp_out.c +@@ -924,6 +924,9 @@ u8_t lwip_tcp_optlen(const struct tcp_pcb *pcb) + /* refence to lwip_netconn_do_writemore() */ + u8_t lwip_tcp_allow_send(const struct tcp_pcb *pcb) + { ++ if (pcb->state < ESTABLISHED) { ++ return false; ++ } + if ((tcp_sndbuf(pcb) <= TCP_SNDLOWAT) || + (tcp_sndqueuelen(pcb) >= TCP_SNDQUEUELOWAT)) { + /* The queued byte- or pbuf-count exceeds the configured low-water limit, +diff --git a/src/include/lwipopts.h b/src/include/lwipopts.h +index ef34d1b..258cbcc 100644 +--- a/src/include/lwipopts.h ++++ b/src/include/lwipopts.h +@@ -248,12 +248,12 @@ + // #define TCP_WND_UPDATE_THRESHOLD (16 * TCP_MSS) + #define TCP_SND_BUF (2500 * TCP_MSS) + #if LWIP_WND_SCALE +-#define TCP_SNDLOWAT LWIP_MIN(0xFFFF, (4 + OFFLOAD_TX_TSO_MTU_FRAGS) * TCP_MSS) ++#define TCP_SNDLOWAT LWIP_MIN(0xF000, (4 + OFFLOAD_TX_TSO_MTU_FRAGS) * TCP_MSS) + #else /* LWIP_WND_SCALE */ + #define TCP_SNDLOWAT (16 * TCP_MSS) + #endif /* LWIP_WND_SCALE */ +-#define TCP_SND_QUEUELEN (5000) +-#define TCP_SNDQUEUELOWAT (TCP_SND_QUEUELEN - DEFAULT_SENDMBOX_SIZE - 32) ++#define TCP_SNDQUEUELOWAT (1800) 
++#define TCP_SND_QUEUELEN (TCP_SNDQUEUELOWAT * 3) + #define TCP_SND_BUF_MAX (TCP_SND_QUEUELEN * TCP_MSS) + #define TCP_OVERSIZE TCP_MSS + /* round up to TCP_OVERSIZE */ +-- +2.33.0 + diff --git a/lwip.spec b/lwip.spec index 73023d3..be44ae1 100644 --- a/lwip.spec +++ b/lwip.spec @@ -4,7 +4,7 @@ Summary: lwip is a small independent implementation of the TCP/IP protocol suite Name: lwip Version: 2.2.0 -Release: 70 +Release: 71 License: BSD URL: http://savannah.nongnu.org/projects/lwip/ Source0: http://download.savannah.nongnu.org/releases/lwip/%{name}-%{version}.zip @@ -197,6 +197,18 @@ Patch9179: 0179-fix-pcb.tcp-null-pointer-error-when-netperf-recv-RST.patch Patch9180: 0180-tso-max-frags-is-configurable.patch Patch9181: 0181-TCP_IN-fix-infinite-loopping-in-func-min_cnts_lpcb_g.patch +Patch9182: 0182-cleancode-add-GAZELLE_SAME_NODE-and-GAZELLE_TCP_LAST.patch +Patch9183: 0183-socket-refactor-sock_event.patch +Patch9184: 0184-socket-refactor-tcp-and-udp.patch +Patch9185: 0185-tcp-add-GAZELLE_TCP_ASYNC_RECVD.patch +Patch9186: 0186-socket-fix-tcp-closed.patch +Patch9187: 0187-socket-fix-sk_wait-cannot-be-interrupted-by-signals.patch +Patch9188: 0188-mempool-fix-sendmbox-not-free.patch +Patch9189: 0189-udp-fix-ip6_frag-nfb-and-last.patch +Patch9190: 0190-udp-fix-recv_udp-sys_mbox_trypost-failed.patch +Patch9191: 0191-sk_event-fix-rtw-epoll-wrong-event-notify-and-remove.patch +Patch9192: 0192-tcp-reduce-TCP_SNDQUEUELOWAT-to-1800.patch + BuildRequires: gcc-c++ dos2unix dpdk-devel #Requires: @@ -225,6 +237,10 @@ cd %{_builddir}/%{name}-%{version}/src %{_libdir}/liblwip.a %changelog +* Sat Jul 05 2025 LemmyHuang - 2.2.0-71 +- tcp udp: merge RTW and RTC modes +- mempool: supports async and sync memory modes + * Thu Feb 13 2025 yinbin - 2.2.0-70 - TCP_IN: fix infinite loopping in func min_cnts_lpcb_get -- Gitee